xref: /openbmc/linux/net/core/sock.c (revision 3e26a691)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115 #include <linux/highmem.h>
116 #include <linux/user_namespace.h>
117 #include <linux/static_key.h>
118 #include <linux/memcontrol.h>
119 #include <linux/prefetch.h>
120 
121 #include <asm/uaccess.h>
122 
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 #include <linux/sock_diag.h>
135 
136 #include <linux/filter.h>
137 #include <net/sock_reuseport.h>
138 
139 #include <trace/events/sock.h>
140 
141 #ifdef CONFIG_INET
142 #include <net/tcp.h>
143 #endif
144 
145 #include <net/busy_poll.h>
146 
147 static DEFINE_MUTEX(proto_list_mutex);
148 static LIST_HEAD(proto_list);
149 
150 /**
151  * sk_ns_capable - General socket capability test
152  * @sk: Socket to use a capability on or through
153  * @user_ns: The user namespace of the capability to use
154  * @cap: The capability to use
155  *
156  * Test to see if the opener of the socket had the capability @cap when
157  * the socket was created and if the current process has the capability
158  * @cap in the user namespace @user_ns.
159  */
160 bool sk_ns_capable(const struct sock *sk,
161 		   struct user_namespace *user_ns, int cap)
162 {
163 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
164 		ns_capable(user_ns, cap);
165 }
166 EXPORT_SYMBOL(sk_ns_capable);
167 
168 /**
169  * sk_capable - Socket global capability test
170  * @sk: Socket to use a capability on or through
171  * @cap: The global capability to use
172  *
173  * Test to see if the opener of the socket had the capability @cap when
174  * the socket was created and if the current process has the capability
175  * @cap in all user namespaces.
176  */
177 bool sk_capable(const struct sock *sk, int cap)
178 {
179 	return sk_ns_capable(sk, &init_user_ns, cap);
180 }
181 EXPORT_SYMBOL(sk_capable);
182 
183 /**
184  * sk_net_capable - Network namespace socket capability test
185  * @sk: Socket to use a capability on or through
186  * @cap: The capability to use
187  *
188  * Test to see if the opener of the socket had the capability @cap when the
189  * socket was created and if the current process has the capability @cap over
190  * the network namespace the socket is a member of.
191  */
192 bool sk_net_capable(const struct sock *sk, int cap)
193 {
194 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
195 }
196 EXPORT_SYMBOL(sk_net_capable);
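
/*
 * Illustrative sketch (hypothetical option handler; names are made up):
 * a protocol would typically gate a privileged, namespace-scoped setting on
 * sk_net_capable(), so that both the socket's opener and the current task
 * need CAP_NET_ADMIN over the socket's network namespace.
 */
static int __maybe_unused example_set_privileged_opt(struct sock *sk, int val)
{
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	sk->sk_priority = val;
	return 0;
}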
197 
198 /*
199  * Each address family might have different locking rules, so we have
200  * one slock key per address family:
201  */
202 static struct lock_class_key af_family_keys[AF_MAX];
203 static struct lock_class_key af_family_slock_keys[AF_MAX];
204 
205 /*
206  * Make lock validator output more readable. (We pre-construct these
207  * strings at build time, so that runtime initialization of socket
208  * locks is fast):
209  */
210 static const char *const af_family_key_strings[AF_MAX+1] = {
211   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
212   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
213   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
214   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
215   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
216   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
217   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
218   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
219   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
220   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
221   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
222   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
223   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
224   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
225 };
226 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
227   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
228   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
229   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
230   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
231   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
232   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
233   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
234   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
235   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
236   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
237   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
238   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
239   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
240   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
241 };
242 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
243   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
244   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
245   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
246   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
247   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
248   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
249   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
250   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
251   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
252   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
253   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
254   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
255   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
256   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
257 };
258 
259 /*
260  * sk_callback_lock locking rules are per-address-family,
261  * so split the lock classes by using a per-AF key:
262  */
263 static struct lock_class_key af_callback_keys[AF_MAX];
264 
265 /* Take into consideration the size of the struct sk_buff overhead in the
266  * determination of these values, since that is non-constant across
267  * platforms.  This makes socket queueing behavior and performance
268  * not depend upon such differences.
269  */
270 #define _SK_MEM_PACKETS		256
271 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
272 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
273 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
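
/*
 * Rough worked example (exact figures depend on architecture and config):
 * SKB_TRUESIZE(256) is 256 bytes of payload plus the cache-aligned sizes of
 * struct sk_buff and struct skb_shared_info, i.e. on the order of 800-900
 * bytes on a typical 64-bit build.  Multiplied by _SK_MEM_PACKETS (256),
 * the default send and receive limits below come out at roughly 200 KB.
 */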
274 
275 /* Run time adjustable parameters. */
276 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
277 EXPORT_SYMBOL(sysctl_wmem_max);
278 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
279 EXPORT_SYMBOL(sysctl_rmem_max);
280 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
281 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
282 
283 /* Maximal space eaten by iovec or ancillary data plus some space */
284 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
285 EXPORT_SYMBOL(sysctl_optmem_max);
286 
287 int sysctl_tstamp_allow_data __read_mostly = 1;
288 
289 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
290 EXPORT_SYMBOL_GPL(memalloc_socks);
291 
292 /**
293  * sk_set_memalloc - sets %SOCK_MEMALLOC
294  * @sk: socket to set it on
295  *
296  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297  * It's the responsibility of the admin to adjust min_free_kbytes
298  * to meet the requirements.
299  */
300 void sk_set_memalloc(struct sock *sk)
301 {
302 	sock_set_flag(sk, SOCK_MEMALLOC);
303 	sk->sk_allocation |= __GFP_MEMALLOC;
304 	static_key_slow_inc(&memalloc_socks);
305 }
306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
307 
308 void sk_clear_memalloc(struct sock *sk)
309 {
310 	sock_reset_flag(sk, SOCK_MEMALLOC);
311 	sk->sk_allocation &= ~__GFP_MEMALLOC;
312 	static_key_slow_dec(&memalloc_socks);
313 
314 	/*
315 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
316 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 	 * it has rmem allocations due to the last swapfile being deactivated
318 	 * but there is a risk that the socket is unusable due to exceeding
319 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
320 	 */
321 	sk_mem_reclaim(sk);
322 }
323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
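
/*
 * Illustrative sketch (hypothetical caller): a transport that starts or
 * stops backing swap I/O, e.g. a network block device used as a swap
 * device, would flip SOCK_MEMALLOC around that window.
 */
static void __maybe_unused example_mark_swap_socket(struct sock *sk, bool backing_swap)
{
	if (backing_swap)
		sk_set_memalloc(sk);
	else
		sk_clear_memalloc(sk);
}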
324 
325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
326 {
327 	int ret;
328 	unsigned long pflags = current->flags;
329 
330 	/* these should have been dropped before queueing */
331 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
332 
333 	current->flags |= PF_MEMALLOC;
334 	ret = sk->sk_backlog_rcv(sk, skb);
335 	tsk_restore_flags(current, pflags, PF_MEMALLOC);
336 
337 	return ret;
338 }
339 EXPORT_SYMBOL(__sk_backlog_rcv);
340 
341 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
342 {
343 	struct timeval tv;
344 
345 	if (optlen < sizeof(tv))
346 		return -EINVAL;
347 	if (copy_from_user(&tv, optval, sizeof(tv)))
348 		return -EFAULT;
349 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
350 		return -EDOM;
351 
352 	if (tv.tv_sec < 0) {
353 		static int warned __read_mostly;
354 
355 		*timeo_p = 0;
356 		if (warned < 10 && net_ratelimit()) {
357 			warned++;
358 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
359 				__func__, current->comm, task_pid_nr(current));
360 		}
361 		return 0;
362 	}
363 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
364 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
365 		return 0;
366 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
367 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
368 	return 0;
369 }
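
/*
 * Userspace usage sketch: SO_RCVTIMEO/SO_SNDTIMEO take a struct timeval
 * which the helper above converts to jiffies; an all-zero timeval means
 * "wait forever" and tv_usec must stay below one second.
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt(SO_RCVTIMEO)");
 */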
370 
371 static void sock_warn_obsolete_bsdism(const char *name)
372 {
373 	static int warned;
374 	static char warncomm[TASK_COMM_LEN];
375 	if (strcmp(warncomm, current->comm) && warned < 5) {
376 		strcpy(warncomm,  current->comm);
377 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
378 			warncomm, name);
379 		warned++;
380 	}
381 }
382 
383 static bool sock_needs_netstamp(const struct sock *sk)
384 {
385 	switch (sk->sk_family) {
386 	case AF_UNSPEC:
387 	case AF_UNIX:
388 		return false;
389 	default:
390 		return true;
391 	}
392 }
393 
394 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
395 {
396 	if (sk->sk_flags & flags) {
397 		sk->sk_flags &= ~flags;
398 		if (sock_needs_netstamp(sk) &&
399 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
400 			net_disable_timestamp();
401 	}
402 }
403 
404 
405 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
406 {
407 	int err;
408 	unsigned long flags;
409 	struct sk_buff_head *list = &sk->sk_receive_queue;
410 
411 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
412 		atomic_inc(&sk->sk_drops);
413 		trace_sock_rcvqueue_full(sk, skb);
414 		return -ENOMEM;
415 	}
416 
417 	err = sk_filter(sk, skb);
418 	if (err)
419 		return err;
420 
421 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
422 		atomic_inc(&sk->sk_drops);
423 		return -ENOBUFS;
424 	}
425 
426 	skb->dev = NULL;
427 	skb_set_owner_r(skb, sk);
428 
429 	/* We escape from the RCU-protected region, make sure we don't leak
430 	 * a non-refcounted dst
431 	 */
432 	skb_dst_force(skb);
433 
434 	spin_lock_irqsave(&list->lock, flags);
435 	sock_skb_set_dropcount(sk, skb);
436 	__skb_queue_tail(list, skb);
437 	spin_unlock_irqrestore(&list->lock, flags);
438 
439 	if (!sock_flag(sk, SOCK_DEAD))
440 		sk->sk_data_ready(sk);
441 	return 0;
442 }
443 EXPORT_SYMBOL(sock_queue_rcv_skb);
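
/*
 * Illustrative sketch (hypothetical protocol receive path): callers of
 * sock_queue_rcv_skb() own the skb on failure, so they drop it themselves
 * when the filter, rcvbuf or memory-accounting checks above reject it.
 */
static int __maybe_unused example_deliver(struct sock *sk, struct sk_buff *skb)
{
	int err = sock_queue_rcv_skb(sk, skb);

	if (err < 0)
		kfree_skb(skb);
	return err;
}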
444 
445 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
446 {
447 	int rc = NET_RX_SUCCESS;
448 
449 	if (sk_filter(sk, skb))
450 		goto discard_and_relse;
451 
452 	skb->dev = NULL;
453 
454 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
455 		atomic_inc(&sk->sk_drops);
456 		goto discard_and_relse;
457 	}
458 	if (nested)
459 		bh_lock_sock_nested(sk);
460 	else
461 		bh_lock_sock(sk);
462 	if (!sock_owned_by_user(sk)) {
463 		/*
464 		 * trylock + unlock semantics:
465 		 */
466 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
467 
468 		rc = sk_backlog_rcv(sk, skb);
469 
470 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
471 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
472 		bh_unlock_sock(sk);
473 		atomic_inc(&sk->sk_drops);
474 		goto discard_and_relse;
475 	}
476 
477 	bh_unlock_sock(sk);
478 out:
479 	sock_put(sk);
480 	return rc;
481 discard_and_relse:
482 	kfree_skb(skb);
483 	goto out;
484 }
485 EXPORT_SYMBOL(sk_receive_skb);
486 
487 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
488 {
489 	struct dst_entry *dst = __sk_dst_get(sk);
490 
491 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
492 		sk_tx_queue_clear(sk);
493 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
494 		dst_release(dst);
495 		return NULL;
496 	}
497 
498 	return dst;
499 }
500 EXPORT_SYMBOL(__sk_dst_check);
501 
502 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
503 {
504 	struct dst_entry *dst = sk_dst_get(sk);
505 
506 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
507 		sk_dst_reset(sk);
508 		dst_release(dst);
509 		return NULL;
510 	}
511 
512 	return dst;
513 }
514 EXPORT_SYMBOL(sk_dst_check);
515 
516 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
517 				int optlen)
518 {
519 	int ret = -ENOPROTOOPT;
520 #ifdef CONFIG_NETDEVICES
521 	struct net *net = sock_net(sk);
522 	char devname[IFNAMSIZ];
523 	int index;
524 
525 	/* Sorry... */
526 	ret = -EPERM;
527 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
528 		goto out;
529 
530 	ret = -EINVAL;
531 	if (optlen < 0)
532 		goto out;
533 
534 	/* Bind this socket to a particular device like "eth0",
535 	 * as specified in the passed interface name. If the
536 	 * name is "" or the option length is zero the socket
537 	 * is not bound.
538 	 */
539 	if (optlen > IFNAMSIZ - 1)
540 		optlen = IFNAMSIZ - 1;
541 	memset(devname, 0, sizeof(devname));
542 
543 	ret = -EFAULT;
544 	if (copy_from_user(devname, optval, optlen))
545 		goto out;
546 
547 	index = 0;
548 	if (devname[0] != '\0') {
549 		struct net_device *dev;
550 
551 		rcu_read_lock();
552 		dev = dev_get_by_name_rcu(net, devname);
553 		if (dev)
554 			index = dev->ifindex;
555 		rcu_read_unlock();
556 		ret = -ENODEV;
557 		if (!dev)
558 			goto out;
559 	}
560 
561 	lock_sock(sk);
562 	sk->sk_bound_dev_if = index;
563 	sk_dst_reset(sk);
564 	release_sock(sk);
565 
566 	ret = 0;
567 
568 out:
569 #endif
570 
571 	return ret;
572 }
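
/*
 * Userspace usage sketch: SO_BINDTODEVICE takes an interface name and
 * requires CAP_NET_RAW; an empty name (or zero optlen) removes the binding.
 * "eth0" is only an example.
 *
 *	const char ifname[] = "eth0";
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, sizeof(ifname)) < 0)
 *		perror("setsockopt(SO_BINDTODEVICE)");
 */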
573 
574 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
575 				int __user *optlen, int len)
576 {
577 	int ret = -ENOPROTOOPT;
578 #ifdef CONFIG_NETDEVICES
579 	struct net *net = sock_net(sk);
580 	char devname[IFNAMSIZ];
581 
582 	if (sk->sk_bound_dev_if == 0) {
583 		len = 0;
584 		goto zero;
585 	}
586 
587 	ret = -EINVAL;
588 	if (len < IFNAMSIZ)
589 		goto out;
590 
591 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
592 	if (ret)
593 		goto out;
594 
595 	len = strlen(devname) + 1;
596 
597 	ret = -EFAULT;
598 	if (copy_to_user(optval, devname, len))
599 		goto out;
600 
601 zero:
602 	ret = -EFAULT;
603 	if (put_user(len, optlen))
604 		goto out;
605 
606 	ret = 0;
607 
608 out:
609 #endif
610 
611 	return ret;
612 }
613 
614 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
615 {
616 	if (valbool)
617 		sock_set_flag(sk, bit);
618 	else
619 		sock_reset_flag(sk, bit);
620 }
621 
622 bool sk_mc_loop(struct sock *sk)
623 {
624 	if (dev_recursion_level())
625 		return false;
626 	if (!sk)
627 		return true;
628 	switch (sk->sk_family) {
629 	case AF_INET:
630 		return inet_sk(sk)->mc_loop;
631 #if IS_ENABLED(CONFIG_IPV6)
632 	case AF_INET6:
633 		return inet6_sk(sk)->mc_loop;
634 #endif
635 	}
636 	WARN_ON(1);
637 	return true;
638 }
639 EXPORT_SYMBOL(sk_mc_loop);
640 
641 /*
642  *	This is meant for all protocols to use and covers goings on
643  *	at the socket level. Everything here is generic.
644  */
645 
646 int sock_setsockopt(struct socket *sock, int level, int optname,
647 		    char __user *optval, unsigned int optlen)
648 {
649 	struct sock *sk = sock->sk;
650 	int val;
651 	int valbool;
652 	struct linger ling;
653 	int ret = 0;
654 
655 	/*
656 	 *	Options without arguments
657 	 */
658 
659 	if (optname == SO_BINDTODEVICE)
660 		return sock_setbindtodevice(sk, optval, optlen);
661 
662 	if (optlen < sizeof(int))
663 		return -EINVAL;
664 
665 	if (get_user(val, (int __user *)optval))
666 		return -EFAULT;
667 
668 	valbool = val ? 1 : 0;
669 
670 	lock_sock(sk);
671 
672 	switch (optname) {
673 	case SO_DEBUG:
674 		if (val && !capable(CAP_NET_ADMIN))
675 			ret = -EACCES;
676 		else
677 			sock_valbool_flag(sk, SOCK_DBG, valbool);
678 		break;
679 	case SO_REUSEADDR:
680 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
681 		break;
682 	case SO_REUSEPORT:
683 		sk->sk_reuseport = valbool;
684 		break;
685 	case SO_TYPE:
686 	case SO_PROTOCOL:
687 	case SO_DOMAIN:
688 	case SO_ERROR:
689 		ret = -ENOPROTOOPT;
690 		break;
691 	case SO_DONTROUTE:
692 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
693 		break;
694 	case SO_BROADCAST:
695 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
696 		break;
697 	case SO_SNDBUF:
698 		/* Don't error on this; BSD doesn't, and if you think
699 		 * about it this is right. Otherwise apps have to
700 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
701 		 * are treated in BSD as hints
702 		 */
703 		val = min_t(u32, val, sysctl_wmem_max);
704 set_sndbuf:
705 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
706 		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
707 		/* Wake up sending tasks if we upped the value. */
708 		sk->sk_write_space(sk);
709 		break;
710 
711 	case SO_SNDBUFFORCE:
712 		if (!capable(CAP_NET_ADMIN)) {
713 			ret = -EPERM;
714 			break;
715 		}
716 		goto set_sndbuf;
717 
718 	case SO_RCVBUF:
719 		/* Don't error on this; BSD doesn't, and if you think
720 		 * about it this is right. Otherwise apps have to
721 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
722 		 * are treated in BSD as hints
723 		 */
724 		val = min_t(u32, val, sysctl_rmem_max);
725 set_rcvbuf:
726 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
727 		/*
728 		 * We double it on the way in to account for
729 		 * "struct sk_buff" etc. overhead.   Applications
730 		 * assume that the SO_RCVBUF setting they make will
731 		 * allow that much actual data to be received on that
732 		 * socket.
733 		 *
734 		 * Applications are unaware that "struct sk_buff" and
735 		 * other overheads allocate from the receive buffer
736 		 * during socket buffer allocation.
737 		 *
738 		 * And after considering the possible alternatives,
739 		 * returning the value we actually used in getsockopt
740 		 * is the most desirable behavior.
741 		 */
742 		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
743 		break;
744 
745 	case SO_RCVBUFFORCE:
746 		if (!capable(CAP_NET_ADMIN)) {
747 			ret = -EPERM;
748 			break;
749 		}
750 		goto set_rcvbuf;
751 
752 	case SO_KEEPALIVE:
753 #ifdef CONFIG_INET
754 		if (sk->sk_protocol == IPPROTO_TCP &&
755 		    sk->sk_type == SOCK_STREAM)
756 			tcp_set_keepalive(sk, valbool);
757 #endif
758 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
759 		break;
760 
761 	case SO_OOBINLINE:
762 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
763 		break;
764 
765 	case SO_NO_CHECK:
766 		sk->sk_no_check_tx = valbool;
767 		break;
768 
769 	case SO_PRIORITY:
770 		if ((val >= 0 && val <= 6) ||
771 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
772 			sk->sk_priority = val;
773 		else
774 			ret = -EPERM;
775 		break;
776 
777 	case SO_LINGER:
778 		if (optlen < sizeof(ling)) {
779 			ret = -EINVAL;	/* 1003.1g */
780 			break;
781 		}
782 		if (copy_from_user(&ling, optval, sizeof(ling))) {
783 			ret = -EFAULT;
784 			break;
785 		}
786 		if (!ling.l_onoff)
787 			sock_reset_flag(sk, SOCK_LINGER);
788 		else {
789 #if (BITS_PER_LONG == 32)
790 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
791 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
792 			else
793 #endif
794 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
795 			sock_set_flag(sk, SOCK_LINGER);
796 		}
797 		break;
798 
799 	case SO_BSDCOMPAT:
800 		sock_warn_obsolete_bsdism("setsockopt");
801 		break;
802 
803 	case SO_PASSCRED:
804 		if (valbool)
805 			set_bit(SOCK_PASSCRED, &sock->flags);
806 		else
807 			clear_bit(SOCK_PASSCRED, &sock->flags);
808 		break;
809 
810 	case SO_TIMESTAMP:
811 	case SO_TIMESTAMPNS:
812 		if (valbool)  {
813 			if (optname == SO_TIMESTAMP)
814 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
815 			else
816 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
817 			sock_set_flag(sk, SOCK_RCVTSTAMP);
818 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
819 		} else {
820 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
821 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
822 		}
823 		break;
824 
825 	case SO_TIMESTAMPING:
826 		if (val & ~SOF_TIMESTAMPING_MASK) {
827 			ret = -EINVAL;
828 			break;
829 		}
830 
831 		if (val & SOF_TIMESTAMPING_OPT_ID &&
832 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
833 			if (sk->sk_protocol == IPPROTO_TCP &&
834 			    sk->sk_type == SOCK_STREAM) {
835 				if (sk->sk_state != TCP_ESTABLISHED) {
836 					ret = -EINVAL;
837 					break;
838 				}
839 				sk->sk_tskey = tcp_sk(sk)->snd_una;
840 			} else {
841 				sk->sk_tskey = 0;
842 			}
843 		}
844 		sk->sk_tsflags = val;
845 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
846 			sock_enable_timestamp(sk,
847 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
848 		else
849 			sock_disable_timestamp(sk,
850 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
851 		break;
852 
853 	case SO_RCVLOWAT:
854 		if (val < 0)
855 			val = INT_MAX;
856 		sk->sk_rcvlowat = val ? : 1;
857 		break;
858 
859 	case SO_RCVTIMEO:
860 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
861 		break;
862 
863 	case SO_SNDTIMEO:
864 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
865 		break;
866 
867 	case SO_ATTACH_FILTER:
868 		ret = -EINVAL;
869 		if (optlen == sizeof(struct sock_fprog)) {
870 			struct sock_fprog fprog;
871 
872 			ret = -EFAULT;
873 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
874 				break;
875 
876 			ret = sk_attach_filter(&fprog, sk);
877 		}
878 		break;
879 
880 	case SO_ATTACH_BPF:
881 		ret = -EINVAL;
882 		if (optlen == sizeof(u32)) {
883 			u32 ufd;
884 
885 			ret = -EFAULT;
886 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
887 				break;
888 
889 			ret = sk_attach_bpf(ufd, sk);
890 		}
891 		break;
892 
893 	case SO_ATTACH_REUSEPORT_CBPF:
894 		ret = -EINVAL;
895 		if (optlen == sizeof(struct sock_fprog)) {
896 			struct sock_fprog fprog;
897 
898 			ret = -EFAULT;
899 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
900 				break;
901 
902 			ret = sk_reuseport_attach_filter(&fprog, sk);
903 		}
904 		break;
905 
906 	case SO_ATTACH_REUSEPORT_EBPF:
907 		ret = -EINVAL;
908 		if (optlen == sizeof(u32)) {
909 			u32 ufd;
910 
911 			ret = -EFAULT;
912 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
913 				break;
914 
915 			ret = sk_reuseport_attach_bpf(ufd, sk);
916 		}
917 		break;
918 
919 	case SO_DETACH_FILTER:
920 		ret = sk_detach_filter(sk);
921 		break;
922 
923 	case SO_LOCK_FILTER:
924 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
925 			ret = -EPERM;
926 		else
927 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
928 		break;
929 
930 	case SO_PASSSEC:
931 		if (valbool)
932 			set_bit(SOCK_PASSSEC, &sock->flags);
933 		else
934 			clear_bit(SOCK_PASSSEC, &sock->flags);
935 		break;
936 	case SO_MARK:
937 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
938 			ret = -EPERM;
939 		else
940 			sk->sk_mark = val;
941 		break;
942 
943 	case SO_RXQ_OVFL:
944 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
945 		break;
946 
947 	case SO_WIFI_STATUS:
948 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
949 		break;
950 
951 	case SO_PEEK_OFF:
952 		if (sock->ops->set_peek_off)
953 			ret = sock->ops->set_peek_off(sk, val);
954 		else
955 			ret = -EOPNOTSUPP;
956 		break;
957 
958 	case SO_NOFCS:
959 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
960 		break;
961 
962 	case SO_SELECT_ERR_QUEUE:
963 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
964 		break;
965 
966 #ifdef CONFIG_NET_RX_BUSY_POLL
967 	case SO_BUSY_POLL:
968 		/* allow unprivileged users to decrease the value */
969 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
970 			ret = -EPERM;
971 		else {
972 			if (val < 0)
973 				ret = -EINVAL;
974 			else
975 				sk->sk_ll_usec = val;
976 		}
977 		break;
978 #endif
979 
980 	case SO_MAX_PACING_RATE:
981 		sk->sk_max_pacing_rate = val;
982 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
983 					 sk->sk_max_pacing_rate);
984 		break;
985 
986 	case SO_INCOMING_CPU:
987 		sk->sk_incoming_cpu = val;
988 		break;
989 
990 	case SO_CNX_ADVICE:
991 		if (val == 1)
992 			dst_negative_advice(sk);
993 		break;
994 	default:
995 		ret = -ENOPROTOOPT;
996 		break;
997 	}
998 	release_sock(sk);
999 	return ret;
1000 }
1001 EXPORT_SYMBOL(sock_setsockopt);
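
/*
 * Userspace usage sketch: SO_SNDBUF/SO_RCVBUF requests are doubled on the
 * way in (see the comments above) and clamped to wmem_max/rmem_max, so
 * reading the option back typically returns twice the requested value.
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *	// out is usually 131072 here
 */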
1002 
1003 
1004 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1005 			  struct ucred *ucred)
1006 {
1007 	ucred->pid = pid_vnr(pid);
1008 	ucred->uid = ucred->gid = -1;
1009 	if (cred) {
1010 		struct user_namespace *current_ns = current_user_ns();
1011 
1012 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1013 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1014 	}
1015 }
1016 
1017 int sock_getsockopt(struct socket *sock, int level, int optname,
1018 		    char __user *optval, int __user *optlen)
1019 {
1020 	struct sock *sk = sock->sk;
1021 
1022 	union {
1023 		int val;
1024 		struct linger ling;
1025 		struct timeval tm;
1026 	} v;
1027 
1028 	int lv = sizeof(int);
1029 	int len;
1030 
1031 	if (get_user(len, optlen))
1032 		return -EFAULT;
1033 	if (len < 0)
1034 		return -EINVAL;
1035 
1036 	memset(&v, 0, sizeof(v));
1037 
1038 	switch (optname) {
1039 	case SO_DEBUG:
1040 		v.val = sock_flag(sk, SOCK_DBG);
1041 		break;
1042 
1043 	case SO_DONTROUTE:
1044 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1045 		break;
1046 
1047 	case SO_BROADCAST:
1048 		v.val = sock_flag(sk, SOCK_BROADCAST);
1049 		break;
1050 
1051 	case SO_SNDBUF:
1052 		v.val = sk->sk_sndbuf;
1053 		break;
1054 
1055 	case SO_RCVBUF:
1056 		v.val = sk->sk_rcvbuf;
1057 		break;
1058 
1059 	case SO_REUSEADDR:
1060 		v.val = sk->sk_reuse;
1061 		break;
1062 
1063 	case SO_REUSEPORT:
1064 		v.val = sk->sk_reuseport;
1065 		break;
1066 
1067 	case SO_KEEPALIVE:
1068 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1069 		break;
1070 
1071 	case SO_TYPE:
1072 		v.val = sk->sk_type;
1073 		break;
1074 
1075 	case SO_PROTOCOL:
1076 		v.val = sk->sk_protocol;
1077 		break;
1078 
1079 	case SO_DOMAIN:
1080 		v.val = sk->sk_family;
1081 		break;
1082 
1083 	case SO_ERROR:
1084 		v.val = -sock_error(sk);
1085 		if (v.val == 0)
1086 			v.val = xchg(&sk->sk_err_soft, 0);
1087 		break;
1088 
1089 	case SO_OOBINLINE:
1090 		v.val = sock_flag(sk, SOCK_URGINLINE);
1091 		break;
1092 
1093 	case SO_NO_CHECK:
1094 		v.val = sk->sk_no_check_tx;
1095 		break;
1096 
1097 	case SO_PRIORITY:
1098 		v.val = sk->sk_priority;
1099 		break;
1100 
1101 	case SO_LINGER:
1102 		lv		= sizeof(v.ling);
1103 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1104 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1105 		break;
1106 
1107 	case SO_BSDCOMPAT:
1108 		sock_warn_obsolete_bsdism("getsockopt");
1109 		break;
1110 
1111 	case SO_TIMESTAMP:
1112 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1113 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1114 		break;
1115 
1116 	case SO_TIMESTAMPNS:
1117 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1118 		break;
1119 
1120 	case SO_TIMESTAMPING:
1121 		v.val = sk->sk_tsflags;
1122 		break;
1123 
1124 	case SO_RCVTIMEO:
1125 		lv = sizeof(struct timeval);
1126 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1127 			v.tm.tv_sec = 0;
1128 			v.tm.tv_usec = 0;
1129 		} else {
1130 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1131 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1132 		}
1133 		break;
1134 
1135 	case SO_SNDTIMEO:
1136 		lv = sizeof(struct timeval);
1137 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1138 			v.tm.tv_sec = 0;
1139 			v.tm.tv_usec = 0;
1140 		} else {
1141 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1142 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1143 		}
1144 		break;
1145 
1146 	case SO_RCVLOWAT:
1147 		v.val = sk->sk_rcvlowat;
1148 		break;
1149 
1150 	case SO_SNDLOWAT:
1151 		v.val = 1;
1152 		break;
1153 
1154 	case SO_PASSCRED:
1155 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1156 		break;
1157 
1158 	case SO_PEERCRED:
1159 	{
1160 		struct ucred peercred;
1161 		if (len > sizeof(peercred))
1162 			len = sizeof(peercred);
1163 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1164 		if (copy_to_user(optval, &peercred, len))
1165 			return -EFAULT;
1166 		goto lenout;
1167 	}
1168 
1169 	case SO_PEERNAME:
1170 	{
1171 		char address[128];
1172 
1173 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1174 			return -ENOTCONN;
1175 		if (lv < len)
1176 			return -EINVAL;
1177 		if (copy_to_user(optval, address, len))
1178 			return -EFAULT;
1179 		goto lenout;
1180 	}
1181 
1182 	/* Dubious BSD thing... Probably nobody even uses it, but
1183 	 * the UNIX standard wants it for whatever reason... -DaveM
1184 	 */
1185 	case SO_ACCEPTCONN:
1186 		v.val = sk->sk_state == TCP_LISTEN;
1187 		break;
1188 
1189 	case SO_PASSSEC:
1190 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1191 		break;
1192 
1193 	case SO_PEERSEC:
1194 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1195 
1196 	case SO_MARK:
1197 		v.val = sk->sk_mark;
1198 		break;
1199 
1200 	case SO_RXQ_OVFL:
1201 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1202 		break;
1203 
1204 	case SO_WIFI_STATUS:
1205 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1206 		break;
1207 
1208 	case SO_PEEK_OFF:
1209 		if (!sock->ops->set_peek_off)
1210 			return -EOPNOTSUPP;
1211 
1212 		v.val = sk->sk_peek_off;
1213 		break;
1214 	case SO_NOFCS:
1215 		v.val = sock_flag(sk, SOCK_NOFCS);
1216 		break;
1217 
1218 	case SO_BINDTODEVICE:
1219 		return sock_getbindtodevice(sk, optval, optlen, len);
1220 
1221 	case SO_GET_FILTER:
1222 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1223 		if (len < 0)
1224 			return len;
1225 
1226 		goto lenout;
1227 
1228 	case SO_LOCK_FILTER:
1229 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1230 		break;
1231 
1232 	case SO_BPF_EXTENSIONS:
1233 		v.val = bpf_tell_extensions();
1234 		break;
1235 
1236 	case SO_SELECT_ERR_QUEUE:
1237 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1238 		break;
1239 
1240 #ifdef CONFIG_NET_RX_BUSY_POLL
1241 	case SO_BUSY_POLL:
1242 		v.val = sk->sk_ll_usec;
1243 		break;
1244 #endif
1245 
1246 	case SO_MAX_PACING_RATE:
1247 		v.val = sk->sk_max_pacing_rate;
1248 		break;
1249 
1250 	case SO_INCOMING_CPU:
1251 		v.val = sk->sk_incoming_cpu;
1252 		break;
1253 
1254 	default:
1255 		/* We implement SO_SNDLOWAT etc. to not be settable
1256 		 * (1003.1g 7).
1257 		 */
1258 		return -ENOPROTOOPT;
1259 	}
1260 
1261 	if (len > lv)
1262 		len = lv;
1263 	if (copy_to_user(optval, &v, len))
1264 		return -EFAULT;
1265 lenout:
1266 	if (put_user(len, optlen))
1267 		return -EFAULT;
1268 	return 0;
1269 }
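
/*
 * Userspace usage sketch: SO_PEERCRED on a connected AF_UNIX socket returns
 * the peer's pid/uid/gid, translated into the caller's namespaces by
 * cred_to_ucred() above.
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("peer pid=%d uid=%u\n", peer.pid, peer.uid);
 */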
1270 
1271 /*
1272  * Initialize an sk_lock.
1273  *
1274  * (We also register the sk_lock with the lock validator.)
1275  */
1276 static inline void sock_lock_init(struct sock *sk)
1277 {
1278 	sock_lock_init_class_and_name(sk,
1279 			af_family_slock_key_strings[sk->sk_family],
1280 			af_family_slock_keys + sk->sk_family,
1281 			af_family_key_strings[sk->sk_family],
1282 			af_family_keys + sk->sk_family);
1283 }
1284 
1285 /*
1286  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1287  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1288  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1289  */
1290 static void sock_copy(struct sock *nsk, const struct sock *osk)
1291 {
1292 #ifdef CONFIG_SECURITY_NETWORK
1293 	void *sptr = nsk->sk_security;
1294 #endif
1295 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1296 
1297 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1298 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1299 
1300 #ifdef CONFIG_SECURITY_NETWORK
1301 	nsk->sk_security = sptr;
1302 	security_sk_clone(osk, nsk);
1303 #endif
1304 }
1305 
1306 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1307 {
1308 	unsigned long nulls1, nulls2;
1309 
1310 	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1311 	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1312 	if (nulls1 > nulls2)
1313 		swap(nulls1, nulls2);
1314 
1315 	if (nulls1 != 0)
1316 		memset((char *)sk, 0, nulls1);
1317 	memset((char *)sk + nulls1 + sizeof(void *), 0,
1318 	       nulls2 - nulls1 - sizeof(void *));
1319 	memset((char *)sk + nulls2 + sizeof(void *), 0,
1320 	       size - nulls2 - sizeof(void *));
1321 }
1322 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1323 
1324 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1325 		int family)
1326 {
1327 	struct sock *sk;
1328 	struct kmem_cache *slab;
1329 
1330 	slab = prot->slab;
1331 	if (slab != NULL) {
1332 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1333 		if (!sk)
1334 			return sk;
1335 		if (priority & __GFP_ZERO) {
1336 			if (prot->clear_sk)
1337 				prot->clear_sk(sk, prot->obj_size);
1338 			else
1339 				sk_prot_clear_nulls(sk, prot->obj_size);
1340 		}
1341 	} else
1342 		sk = kmalloc(prot->obj_size, priority);
1343 
1344 	if (sk != NULL) {
1345 		kmemcheck_annotate_bitfield(sk, flags);
1346 
1347 		if (security_sk_alloc(sk, family, priority))
1348 			goto out_free;
1349 
1350 		if (!try_module_get(prot->owner))
1351 			goto out_free_sec;
1352 		sk_tx_queue_clear(sk);
1353 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1354 	}
1355 
1356 	return sk;
1357 
1358 out_free_sec:
1359 	security_sk_free(sk);
1360 out_free:
1361 	if (slab != NULL)
1362 		kmem_cache_free(slab, sk);
1363 	else
1364 		kfree(sk);
1365 	return NULL;
1366 }
1367 
1368 static void sk_prot_free(struct proto *prot, struct sock *sk)
1369 {
1370 	struct kmem_cache *slab;
1371 	struct module *owner;
1372 
1373 	owner = prot->owner;
1374 	slab = prot->slab;
1375 
1376 	cgroup_sk_free(&sk->sk_cgrp_data);
1377 	security_sk_free(sk);
1378 	if (slab != NULL)
1379 		kmem_cache_free(slab, sk);
1380 	else
1381 		kfree(sk);
1382 	module_put(owner);
1383 }
1384 
1385 /**
1386  *	sk_alloc - All socket objects are allocated here
1387  *	@net: the applicable net namespace
1388  *	@family: protocol family
1389  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1390  *	@prot: struct proto associated with this new sock instance
1391  *	@kern: is this to be a kernel socket?
1392  */
1393 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1394 		      struct proto *prot, int kern)
1395 {
1396 	struct sock *sk;
1397 
1398 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1399 	if (sk) {
1400 		sk->sk_family = family;
1401 		/*
1402 		 * See comment in struct sock definition to understand
1403 		 * why we need sk_prot_creator -acme
1404 		 */
1405 		sk->sk_prot = sk->sk_prot_creator = prot;
1406 		sock_lock_init(sk);
1407 		sk->sk_net_refcnt = kern ? 0 : 1;
1408 		if (likely(sk->sk_net_refcnt))
1409 			get_net(net);
1410 		sock_net_set(sk, net);
1411 		atomic_set(&sk->sk_wmem_alloc, 1);
1412 
1413 		sock_update_classid(&sk->sk_cgrp_data);
1414 		sock_update_netprioidx(&sk->sk_cgrp_data);
1415 	}
1416 
1417 	return sk;
1418 }
1419 EXPORT_SYMBOL(sk_alloc);
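
/*
 * Illustrative sketch (made-up protocol family): a ->create() handler
 * typically allocates its sock with sk_alloc() and finishes the generic
 * setup with sock_init_data().  "example_proto" and "example_create" are
 * hypothetical names.
 */
static struct proto example_proto = {
	.name	  = "EXAMPLE",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct sock),
};

static int __maybe_unused example_create(struct net *net, struct socket *sock,
					 int family, int protocol, int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, family, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);
	sk->sk_protocol = protocol;
	return 0;
}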
1420 
1421 void sk_destruct(struct sock *sk)
1422 {
1423 	struct sk_filter *filter;
1424 
1425 	if (sk->sk_destruct)
1426 		sk->sk_destruct(sk);
1427 
1428 	filter = rcu_dereference_check(sk->sk_filter,
1429 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1430 	if (filter) {
1431 		sk_filter_uncharge(sk, filter);
1432 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1433 	}
1434 	if (rcu_access_pointer(sk->sk_reuseport_cb))
1435 		reuseport_detach_sock(sk);
1436 
1437 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1438 
1439 	if (atomic_read(&sk->sk_omem_alloc))
1440 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1441 			 __func__, atomic_read(&sk->sk_omem_alloc));
1442 
1443 	if (sk->sk_peer_cred)
1444 		put_cred(sk->sk_peer_cred);
1445 	put_pid(sk->sk_peer_pid);
1446 	if (likely(sk->sk_net_refcnt))
1447 		put_net(sock_net(sk));
1448 	sk_prot_free(sk->sk_prot_creator, sk);
1449 }
1450 
1451 static void __sk_free(struct sock *sk)
1452 {
1453 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1454 		sock_diag_broadcast_destroy(sk);
1455 	else
1456 		sk_destruct(sk);
1457 }
1458 
1459 void sk_free(struct sock *sk)
1460 {
1461 	/*
1462 	 * We subtract one from sk_wmem_alloc to learn whether some
1463 	 * packets are still in some tx queue.
1464 	 * If the result is not zero, sock_wfree() will call __sk_free(sk) later
1465 	 */
1466 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1467 		__sk_free(sk);
1468 }
1469 EXPORT_SYMBOL(sk_free);
1470 
1471 /**
1472  *	sk_clone_lock - clone a socket, and lock its clone
1473  *	@sk: the socket to clone
1474  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1475  *
1476  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1477  */
1478 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1479 {
1480 	struct sock *newsk;
1481 	bool is_charged = true;
1482 
1483 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1484 	if (newsk != NULL) {
1485 		struct sk_filter *filter;
1486 
1487 		sock_copy(newsk, sk);
1488 
1489 		/* SANITY */
1490 		if (likely(newsk->sk_net_refcnt))
1491 			get_net(sock_net(newsk));
1492 		sk_node_init(&newsk->sk_node);
1493 		sock_lock_init(newsk);
1494 		bh_lock_sock(newsk);
1495 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1496 		newsk->sk_backlog.len = 0;
1497 
1498 		atomic_set(&newsk->sk_rmem_alloc, 0);
1499 		/*
1500 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1501 		 */
1502 		atomic_set(&newsk->sk_wmem_alloc, 1);
1503 		atomic_set(&newsk->sk_omem_alloc, 0);
1504 		skb_queue_head_init(&newsk->sk_receive_queue);
1505 		skb_queue_head_init(&newsk->sk_write_queue);
1506 
1507 		rwlock_init(&newsk->sk_callback_lock);
1508 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1509 				af_callback_keys + newsk->sk_family,
1510 				af_family_clock_key_strings[newsk->sk_family]);
1511 
1512 		newsk->sk_dst_cache	= NULL;
1513 		newsk->sk_wmem_queued	= 0;
1514 		newsk->sk_forward_alloc = 0;
1515 		newsk->sk_send_head	= NULL;
1516 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1517 
1518 		sock_reset_flag(newsk, SOCK_DONE);
1519 		skb_queue_head_init(&newsk->sk_error_queue);
1520 
1521 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1522 		if (filter != NULL)
1523 			/* Though it's an empty new sock, the charging may fail
1524 			 * if sysctl_optmem_max was changed between the creation of
1525 			 * the original socket and its cloning
1526 			 */
1527 			is_charged = sk_filter_charge(newsk, filter);
1528 
1529 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1530 			/* It is still a raw copy of the parent, so invalidate
1531 			 * the destructor and do a plain sk_free() */
1532 			newsk->sk_destruct = NULL;
1533 			bh_unlock_sock(newsk);
1534 			sk_free(newsk);
1535 			newsk = NULL;
1536 			goto out;
1537 		}
1538 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1539 
1540 		newsk->sk_err	   = 0;
1541 		newsk->sk_priority = 0;
1542 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1543 		atomic64_set(&newsk->sk_cookie, 0);
1544 		/*
1545 		 * Before updating sk_refcnt, we must commit prior changes to memory
1546 		 * (Documentation/RCU/rculist_nulls.txt for details)
1547 		 */
1548 		smp_wmb();
1549 		atomic_set(&newsk->sk_refcnt, 2);
1550 
1551 		/*
1552 		 * Increment the counter in the same struct proto as the master
1553 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1554 		 * is the same as sk->sk_prot->socks, as this field was copied
1555 		 * with memcpy).
1556 		 *
1557 		 * This _changes_ the previous behaviour, where
1558 		 * tcp_create_openreq_child was always incrementing the
1559 		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1560 		 * to be taken into account in all callers. -acme
1561 		 */
1562 		sk_refcnt_debug_inc(newsk);
1563 		sk_set_socket(newsk, NULL);
1564 		newsk->sk_wq = NULL;
1565 
1566 		if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1567 			sock_update_memcg(newsk);
1568 
1569 		if (newsk->sk_prot->sockets_allocated)
1570 			sk_sockets_allocated_inc(newsk);
1571 
1572 		if (sock_needs_netstamp(sk) &&
1573 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1574 			net_enable_timestamp();
1575 	}
1576 out:
1577 	return newsk;
1578 }
1579 EXPORT_SYMBOL_GPL(sk_clone_lock);
1580 
1581 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1582 {
1583 	u32 max_segs = 1;
1584 
1585 	sk_dst_set(sk, dst);
1586 	sk->sk_route_caps = dst->dev->features;
1587 	if (sk->sk_route_caps & NETIF_F_GSO)
1588 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1589 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1590 	if (sk_can_gso(sk)) {
1591 		if (dst->header_len) {
1592 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1593 		} else {
1594 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1595 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1596 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1597 		}
1598 	}
1599 	sk->sk_gso_max_segs = max_segs;
1600 }
1601 EXPORT_SYMBOL_GPL(sk_setup_caps);
1602 
1603 /*
1604  *	Simple resource managers for sockets.
1605  */
1606 
1607 
1608 /*
1609  * Write buffer destructor automatically called from kfree_skb.
1610  */
1611 void sock_wfree(struct sk_buff *skb)
1612 {
1613 	struct sock *sk = skb->sk;
1614 	unsigned int len = skb->truesize;
1615 
1616 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1617 		/*
1618 		 * Keep a reference on sk_wmem_alloc; it will be released
1619 		 * after the sk_write_space() call
1620 		 */
1621 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1622 		sk->sk_write_space(sk);
1623 		len = 1;
1624 	}
1625 	/*
1626 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1627 	 * could not do because of in-flight packets
1628 	 */
1629 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1630 		__sk_free(sk);
1631 }
1632 EXPORT_SYMBOL(sock_wfree);
1633 
1634 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1635 {
1636 	skb_orphan(skb);
1637 	skb->sk = sk;
1638 #ifdef CONFIG_INET
1639 	if (unlikely(!sk_fullsock(sk))) {
1640 		skb->destructor = sock_edemux;
1641 		sock_hold(sk);
1642 		return;
1643 	}
1644 #endif
1645 	skb->destructor = sock_wfree;
1646 	skb_set_hash_from_sk(skb, sk);
1647 	/*
1648 	 * We used to take a refcount on sk, but the following operation
1649 	 * is enough to guarantee sk_free() won't free this sock until
1650 	 * all in-flight packets are completed
1651 	 */
1652 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1653 }
1654 EXPORT_SYMBOL(skb_set_owner_w);
1655 
1656 void skb_orphan_partial(struct sk_buff *skb)
1657 {
1658 	/* The TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1659 	 * so we do not completely orphan the skb, but transfer all
1660 	 * accounted bytes but one, to avoid unexpected reorders.
1661 	 */
1662 	if (skb->destructor == sock_wfree
1663 #ifdef CONFIG_INET
1664 	    || skb->destructor == tcp_wfree
1665 #endif
1666 		) {
1667 		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1668 		skb->truesize = 1;
1669 	} else {
1670 		skb_orphan(skb);
1671 	}
1672 }
1673 EXPORT_SYMBOL(skb_orphan_partial);
1674 
1675 /*
1676  * Read buffer destructor automatically called from kfree_skb.
1677  */
1678 void sock_rfree(struct sk_buff *skb)
1679 {
1680 	struct sock *sk = skb->sk;
1681 	unsigned int len = skb->truesize;
1682 
1683 	atomic_sub(len, &sk->sk_rmem_alloc);
1684 	sk_mem_uncharge(sk, len);
1685 }
1686 EXPORT_SYMBOL(sock_rfree);
1687 
1688 /*
1689  * Buffer destructor for skbs that are not used directly in read or write
1690  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1691  */
1692 void sock_efree(struct sk_buff *skb)
1693 {
1694 	sock_put(skb->sk);
1695 }
1696 EXPORT_SYMBOL(sock_efree);
1697 
1698 kuid_t sock_i_uid(struct sock *sk)
1699 {
1700 	kuid_t uid;
1701 
1702 	read_lock_bh(&sk->sk_callback_lock);
1703 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1704 	read_unlock_bh(&sk->sk_callback_lock);
1705 	return uid;
1706 }
1707 EXPORT_SYMBOL(sock_i_uid);
1708 
1709 unsigned long sock_i_ino(struct sock *sk)
1710 {
1711 	unsigned long ino;
1712 
1713 	read_lock_bh(&sk->sk_callback_lock);
1714 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1715 	read_unlock_bh(&sk->sk_callback_lock);
1716 	return ino;
1717 }
1718 EXPORT_SYMBOL(sock_i_ino);
1719 
1720 /*
1721  * Allocate a skb from the socket's send buffer.
1722  */
1723 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1724 			     gfp_t priority)
1725 {
1726 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1727 		struct sk_buff *skb = alloc_skb(size, priority);
1728 		if (skb) {
1729 			skb_set_owner_w(skb, sk);
1730 			return skb;
1731 		}
1732 	}
1733 	return NULL;
1734 }
1735 EXPORT_SYMBOL(sock_wmalloc);
1736 
1737 /*
1738  * Allocate a memory block from the socket's option memory buffer.
1739  */
1740 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1741 {
1742 	if ((unsigned int)size <= sysctl_optmem_max &&
1743 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1744 		void *mem;
1745 		/* First do the add, to avoid the race if kmalloc
1746 		 * might sleep.
1747 		 */
1748 		atomic_add(size, &sk->sk_omem_alloc);
1749 		mem = kmalloc(size, priority);
1750 		if (mem)
1751 			return mem;
1752 		atomic_sub(size, &sk->sk_omem_alloc);
1753 	}
1754 	return NULL;
1755 }
1756 EXPORT_SYMBOL(sock_kmalloc);
1757 
1758 /* Free an option memory block. Note, we actually want the inline
1759  * here as this allows gcc to detect the nullify and fold away the
1760  * condition entirely.
1761  */
1762 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1763 				  const bool nullify)
1764 {
1765 	if (WARN_ON_ONCE(!mem))
1766 		return;
1767 	if (nullify)
1768 		kzfree(mem);
1769 	else
1770 		kfree(mem);
1771 	atomic_sub(size, &sk->sk_omem_alloc);
1772 }
1773 
1774 void sock_kfree_s(struct sock *sk, void *mem, int size)
1775 {
1776 	__sock_kfree_s(sk, mem, size, false);
1777 }
1778 EXPORT_SYMBOL(sock_kfree_s);
1779 
1780 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1781 {
1782 	__sock_kfree_s(sk, mem, size, true);
1783 }
1784 EXPORT_SYMBOL(sock_kzfree_s);
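
/*
 * Illustrative sketch (hypothetical option handler): option-sized buffers
 * are charged to sk_omem_alloc by sock_kmalloc() and must be released with
 * sock_kfree_s()/sock_kzfree_s() using the same size.
 */
static int __maybe_unused example_store_opt(struct sock *sk,
					    const void *data, int len)
{
	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);

	if (!buf)
		return -ENOBUFS;

	memcpy(buf, data, len);
	/* ... consume buf, e.g. hand it to protocol private state ... */
	sock_kfree_s(sk, buf, len);
	return 0;
}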
1785 
1786 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1787    I think these locks should be removed for datagram sockets.
1788  */
1789 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1790 {
1791 	DEFINE_WAIT(wait);
1792 
1793 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1794 	for (;;) {
1795 		if (!timeo)
1796 			break;
1797 		if (signal_pending(current))
1798 			break;
1799 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1800 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1801 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1802 			break;
1803 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1804 			break;
1805 		if (sk->sk_err)
1806 			break;
1807 		timeo = schedule_timeout(timeo);
1808 	}
1809 	finish_wait(sk_sleep(sk), &wait);
1810 	return timeo;
1811 }
1812 
1813 
1814 /*
1815  *	Generic send/receive buffer handlers
1816  */
1817 
1818 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1819 				     unsigned long data_len, int noblock,
1820 				     int *errcode, int max_page_order)
1821 {
1822 	struct sk_buff *skb;
1823 	long timeo;
1824 	int err;
1825 
1826 	timeo = sock_sndtimeo(sk, noblock);
1827 	for (;;) {
1828 		err = sock_error(sk);
1829 		if (err != 0)
1830 			goto failure;
1831 
1832 		err = -EPIPE;
1833 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1834 			goto failure;
1835 
1836 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1837 			break;
1838 
1839 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1840 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1841 		err = -EAGAIN;
1842 		if (!timeo)
1843 			goto failure;
1844 		if (signal_pending(current))
1845 			goto interrupted;
1846 		timeo = sock_wait_for_wmem(sk, timeo);
1847 	}
1848 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1849 				   errcode, sk->sk_allocation);
1850 	if (skb)
1851 		skb_set_owner_w(skb, sk);
1852 	return skb;
1853 
1854 interrupted:
1855 	err = sock_intr_errno(timeo);
1856 failure:
1857 	*errcode = err;
1858 	return NULL;
1859 }
1860 EXPORT_SYMBOL(sock_alloc_send_pskb);
1861 
1862 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1863 				    int noblock, int *errcode)
1864 {
1865 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1866 }
1867 EXPORT_SYMBOL(sock_alloc_send_skb);
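
/*
 * Illustrative sketch (hypothetical datagram sendmsg path): the caller
 * blocks in sock_alloc_send_skb() until sndbuf space is available (or the
 * send timeout expires), then copies the payload in.  Header/tailroom
 * reservation is omitted for brevity.
 */
static struct sk_buff *__maybe_unused example_alloc_and_copy(struct sock *sk,
							     struct msghdr *msg,
							     size_t len,
							     int *errcode)
{
	struct sk_buff *skb;

	skb = sock_alloc_send_skb(sk, len, msg->msg_flags & MSG_DONTWAIT,
				  errcode);
	if (!skb)
		return NULL;

	if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
		kfree_skb(skb);
		*errcode = -EFAULT;
		return NULL;
	}
	return skb;
}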
1868 
1869 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1870 		   struct sockcm_cookie *sockc)
1871 {
1872 	struct cmsghdr *cmsg;
1873 
1874 	for_each_cmsghdr(cmsg, msg) {
1875 		if (!CMSG_OK(msg, cmsg))
1876 			return -EINVAL;
1877 		if (cmsg->cmsg_level != SOL_SOCKET)
1878 			continue;
1879 		switch (cmsg->cmsg_type) {
1880 		case SO_MARK:
1881 			if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1882 				return -EPERM;
1883 			if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1884 				return -EINVAL;
1885 			sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1886 			break;
1887 		default:
1888 			return -EINVAL;
1889 		}
1890 	}
1891 	return 0;
1892 }
1893 EXPORT_SYMBOL(sock_cmsg_send);
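
/*
 * Userspace usage sketch: protocols that call sock_cmsg_send() accept a
 * per-packet mark as a SOL_SOCKET/SO_MARK control message; the sender needs
 * CAP_NET_ADMIN in the socket's network namespace and the payload must be
 * exactly a 32-bit value.
 *
 *	char cbuf[CMSG_SPACE(sizeof(uint32_t))] = { 0 };
 *	struct msghdr msg = { .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *	uint32_t mark = 42;
 *
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SO_MARK;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(mark));
 *	memcpy(CMSG_DATA(cmsg), &mark, sizeof(mark));
 *	// fill in msg.msg_name/msg_iov as usual, then sendmsg(fd, &msg, 0)
 */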
1894 
1895 /* On 32bit arches, an skb frag is limited to 2^15 */
1896 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
1897 
1898 /**
1899  * skb_page_frag_refill - check that a page_frag contains enough room
1900  * @sz: minimum size of the fragment we want to get
1901  * @pfrag: pointer to page_frag
1902  * @gfp: priority for memory allocation
1903  *
1904  * Note: While this allocator tries to use high order pages, there is
1905  * no guarantee that allocations succeed. Therefore, @sz MUST be
1906  * less than or equal to PAGE_SIZE.
1907  */
1908 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1909 {
1910 	if (pfrag->page) {
1911 		if (page_ref_count(pfrag->page) == 1) {
1912 			pfrag->offset = 0;
1913 			return true;
1914 		}
1915 		if (pfrag->offset + sz <= pfrag->size)
1916 			return true;
1917 		put_page(pfrag->page);
1918 	}
1919 
1920 	pfrag->offset = 0;
1921 	if (SKB_FRAG_PAGE_ORDER) {
1922 		/* Avoid direct reclaim but allow kswapd to wake */
1923 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1924 					  __GFP_COMP | __GFP_NOWARN |
1925 					  __GFP_NORETRY,
1926 					  SKB_FRAG_PAGE_ORDER);
1927 		if (likely(pfrag->page)) {
1928 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1929 			return true;
1930 		}
1931 	}
1932 	pfrag->page = alloc_page(gfp);
1933 	if (likely(pfrag->page)) {
1934 		pfrag->size = PAGE_SIZE;
1935 		return true;
1936 	}
1937 	return false;
1938 }
1939 EXPORT_SYMBOL(skb_page_frag_refill);
1940 
1941 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1942 {
1943 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1944 		return true;
1945 
1946 	sk_enter_memory_pressure(sk);
1947 	sk_stream_moderate_sndbuf(sk);
1948 	return false;
1949 }
1950 EXPORT_SYMBOL(sk_page_frag_refill);
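
/*
 * Hedged sketch: the pattern a stream protocol's sendmsg() path can use
 * with sk_page_frag_refill() to pack user data into the per-socket page
 * fragment, roughly as TCP does. The surrounding skb management and the
 * copy_page_from_iter() step are simplified assumptions.
 */
static int example_append_data(struct sock *sk, struct sk_buff *skb,
			       struct iov_iter *from, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;		/* caller should wait for memory */

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy, from) != copy)
		return -EFAULT;

	/* Attach the bytes as a new fragment; each frag holds a page ref */
	skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
			   pfrag->page, pfrag->offset, copy);
	get_page(pfrag->page);
	pfrag->offset += copy;
	return copy;
}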
1951 
1952 static void __lock_sock(struct sock *sk)
1953 	__releases(&sk->sk_lock.slock)
1954 	__acquires(&sk->sk_lock.slock)
1955 {
1956 	DEFINE_WAIT(wait);
1957 
1958 	for (;;) {
1959 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1960 					TASK_UNINTERRUPTIBLE);
1961 		spin_unlock_bh(&sk->sk_lock.slock);
1962 		schedule();
1963 		spin_lock_bh(&sk->sk_lock.slock);
1964 		if (!sock_owned_by_user(sk))
1965 			break;
1966 	}
1967 	finish_wait(&sk->sk_lock.wq, &wait);
1968 }
1969 
1970 static void __release_sock(struct sock *sk)
1971 	__releases(&sk->sk_lock.slock)
1972 	__acquires(&sk->sk_lock.slock)
1973 {
1974 	struct sk_buff *skb = sk->sk_backlog.head;
1975 
1976 	do {
1977 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1978 		bh_unlock_sock(sk);
1979 
1980 		do {
1981 			struct sk_buff *next = skb->next;
1982 
1983 			prefetch(next);
1984 			WARN_ON_ONCE(skb_dst_is_noref(skb));
1985 			skb->next = NULL;
1986 			sk_backlog_rcv(sk, skb);
1987 
1988 			/*
1989 			 * We are in process context here with softirqs
1990 			 * disabled; use cond_resched_softirq() to preempt.
1991 			 * This is safe to do because we've taken the backlog
1992 			 * queue private:
1993 			 */
1994 			cond_resched_softirq();
1995 
1996 			skb = next;
1997 		} while (skb != NULL);
1998 
1999 		bh_lock_sock(sk);
2000 	} while ((skb = sk->sk_backlog.head) != NULL);
2001 
2002 	/*
2003 	 * Doing the zeroing here guarantees that we cannot loop forever
2004 	 * while a wild producer attempts to flood us.
2005 	 */
2006 	sk->sk_backlog.len = 0;
2007 }
2008 
2009 /**
2010  * sk_wait_data - wait for data to arrive at sk_receive_queue
2011  * @sk:    sock to wait on
2012  * @timeo: for how long
2013  * @skb:   last skb seen on sk_receive_queue
2014  *
2015  * Now socket state including sk->sk_err is changed only under lock,
2016  * hence we may omit checks after joining the wait queue.
2017  * We check the receive queue before schedule() only as an optimization;
2018  * it is very likely that release_sock() added new data.
2019  */
2020 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2021 {
2022 	int rc;
2023 	DEFINE_WAIT(wait);
2024 
2025 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2026 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2027 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
2028 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2029 	finish_wait(sk_sleep(sk), &wait);
2030 	return rc;
2031 }
2032 EXPORT_SYMBOL(sk_wait_data);
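
/*
 * Hedged sketch: how a protocol's recvmsg() typically drives sk_wait_data()
 * while holding the socket lock. The dequeue/copy step is elided as an
 * assumption standing in for protocol specifics.
 */
static int example_wait_for_data(struct sock *sk, int noblock)
{
	long timeo = sock_rcvtimeo(sk, noblock);
	struct sk_buff *skb;

	lock_sock(sk);
	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!timeo) {
			release_sock(sk);
			return -EAGAIN;
		}
		/* Releases and retakes the socket lock around schedule();
		 * wakes up when the queue tail changes, on timeout or signal.
		 */
		sk_wait_data(sk, &timeo, NULL);
		if (signal_pending(current)) {
			release_sock(sk);
			return sock_intr_errno(timeo);
		}
	}
	/* ... dequeue skb and copy it to the caller here ... */
	release_sock(sk);
	return 0;
}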
2033 
2034 /**
2035  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2036  *	@sk: socket
2037  *	@size: memory size to allocate
2038  *	@kind: allocation type
2039  *
2040  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2041  *	rmem allocation. This function assumes that protocols which have
2042  *	memory_pressure use sk_wmem_queued for write buffer accounting.
2043  */
2044 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2045 {
2046 	struct proto *prot = sk->sk_prot;
2047 	int amt = sk_mem_pages(size);
2048 	long allocated;
2049 
2050 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2051 
2052 	allocated = sk_memory_allocated_add(sk, amt);
2053 
2054 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2055 	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2056 		goto suppress_allocation;
2057 
2058 	/* Under limit. */
2059 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2060 		sk_leave_memory_pressure(sk);
2061 		return 1;
2062 	}
2063 
2064 	/* Under pressure. */
2065 	if (allocated > sk_prot_mem_limits(sk, 1))
2066 		sk_enter_memory_pressure(sk);
2067 
2068 	/* Over hard limit. */
2069 	if (allocated > sk_prot_mem_limits(sk, 2))
2070 		goto suppress_allocation;
2071 
2072 	/* guarantee minimum buffer size under pressure */
2073 	if (kind == SK_MEM_RECV) {
2074 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2075 			return 1;
2076 
2077 	} else { /* SK_MEM_SEND */
2078 		if (sk->sk_type == SOCK_STREAM) {
2079 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2080 				return 1;
2081 		} else if (atomic_read(&sk->sk_wmem_alloc) <
2082 			   prot->sysctl_wmem[0])
2083 				return 1;
2084 	}
2085 
2086 	if (sk_has_memory_pressure(sk)) {
2087 		int alloc;
2088 
2089 		if (!sk_under_memory_pressure(sk))
2090 			return 1;
2091 		alloc = sk_sockets_allocated_read_positive(sk);
2092 		if (sk_prot_mem_limits(sk, 2) > alloc *
2093 		    sk_mem_pages(sk->sk_wmem_queued +
2094 				 atomic_read(&sk->sk_rmem_alloc) +
2095 				 sk->sk_forward_alloc))
2096 			return 1;
2097 	}
2098 
2099 suppress_allocation:
2100 
2101 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2102 		sk_stream_moderate_sndbuf(sk);
2103 
2104 		/* Fail only if the socket is _under_ its sndbuf.
2105 		 * In this case we cannot block, so we have to fail.
2106 		 */
2107 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2108 			return 1;
2109 	}
2110 
2111 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2112 
2113 	/* Alas. Undo changes. */
2114 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2115 
2116 	sk_memory_allocated_sub(sk, amt);
2117 
2118 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2119 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2120 
2121 	return 0;
2122 }
2123 EXPORT_SYMBOL(__sk_mem_schedule);
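
/*
 * Hedged sketch: charging receive memory before queueing an skb, the way
 * protocols usually reach __sk_mem_schedule() through the sk_rmem_schedule()
 * wrapper in include/net/sock.h. Drop statistics and locking around the
 * receive queue are omitted as assumptions.
 */
static int example_queue_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned int)sk->sk_rcvbuf)
		return -ENOMEM;
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;	/* __sk_mem_schedule() refused the charge */

	skb_set_owner_r(skb, sk);	/* charges skb->truesize to sk_rmem_alloc */
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk);
	return 0;
}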
2124 
2125 /**
2126  *	__sk_mem_reclaim - reclaim memory_allocated
2127  *	@sk: socket
2128  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2129  */
2130 void __sk_mem_reclaim(struct sock *sk, int amount)
2131 {
2132 	amount >>= SK_MEM_QUANTUM_SHIFT;
2133 	sk_memory_allocated_sub(sk, amount);
2134 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2135 
2136 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2137 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2138 
2139 	if (sk_under_memory_pressure(sk) &&
2140 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2141 		sk_leave_memory_pressure(sk);
2142 }
2143 EXPORT_SYMBOL(__sk_mem_reclaim);
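
/*
 * Hedged sketch: protocols normally reach __sk_mem_reclaim() through the
 * sk_mem_reclaim() wrapper in include/net/sock.h, e.g. after purging a
 * queue, so that forward-allocated quanta are returned to the global pool.
 */
static void example_purge_and_reclaim(struct sock *sk)
{
	__skb_queue_purge(&sk->sk_receive_queue);	/* uncharges via skb destructors */
	sk_mem_reclaim(sk);				/* gives back sk_forward_alloc */
}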
2144 
2145 
2146 /*
2147  * Set of default routines for initialising struct proto_ops when
2148  * the protocol does not support a particular function. In certain
2149  * cases where it makes no sense for a protocol to have a "do nothing"
2150  * function, some default processing is provided.
2151  */
2152 
2153 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2154 {
2155 	return -EOPNOTSUPP;
2156 }
2157 EXPORT_SYMBOL(sock_no_bind);
2158 
2159 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2160 		    int len, int flags)
2161 {
2162 	return -EOPNOTSUPP;
2163 }
2164 EXPORT_SYMBOL(sock_no_connect);
2165 
2166 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2167 {
2168 	return -EOPNOTSUPP;
2169 }
2170 EXPORT_SYMBOL(sock_no_socketpair);
2171 
2172 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2173 {
2174 	return -EOPNOTSUPP;
2175 }
2176 EXPORT_SYMBOL(sock_no_accept);
2177 
2178 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2179 		    int *len, int peer)
2180 {
2181 	return -EOPNOTSUPP;
2182 }
2183 EXPORT_SYMBOL(sock_no_getname);
2184 
2185 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2186 {
2187 	return 0;
2188 }
2189 EXPORT_SYMBOL(sock_no_poll);
2190 
2191 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2192 {
2193 	return -EOPNOTSUPP;
2194 }
2195 EXPORT_SYMBOL(sock_no_ioctl);
2196 
2197 int sock_no_listen(struct socket *sock, int backlog)
2198 {
2199 	return -EOPNOTSUPP;
2200 }
2201 EXPORT_SYMBOL(sock_no_listen);
2202 
2203 int sock_no_shutdown(struct socket *sock, int how)
2204 {
2205 	return -EOPNOTSUPP;
2206 }
2207 EXPORT_SYMBOL(sock_no_shutdown);
2208 
2209 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2210 		    char __user *optval, unsigned int optlen)
2211 {
2212 	return -EOPNOTSUPP;
2213 }
2214 EXPORT_SYMBOL(sock_no_setsockopt);
2215 
2216 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2217 		    char __user *optval, int __user *optlen)
2218 {
2219 	return -EOPNOTSUPP;
2220 }
2221 EXPORT_SYMBOL(sock_no_getsockopt);
2222 
2223 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2224 {
2225 	return -EOPNOTSUPP;
2226 }
2227 EXPORT_SYMBOL(sock_no_sendmsg);
2228 
2229 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2230 		    int flags)
2231 {
2232 	return -EOPNOTSUPP;
2233 }
2234 EXPORT_SYMBOL(sock_no_recvmsg);
2235 
2236 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2237 {
2238 	/* Mirror missing mmap method error code */
2239 	return -ENODEV;
2240 }
2241 EXPORT_SYMBOL(sock_no_mmap);
2242 
2243 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2244 {
2245 	ssize_t res;
2246 	struct msghdr msg = {.msg_flags = flags};
2247 	struct kvec iov;
2248 	char *kaddr = kmap(page);
2249 	iov.iov_base = kaddr + offset;
2250 	iov.iov_len = size;
2251 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2252 	kunmap(page);
2253 	return res;
2254 }
2255 EXPORT_SYMBOL(sock_no_sendpage);
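
/*
 * Hedged sketch: how a minimal datagram family might wire the sock_no_*()
 * stubs above into its struct proto_ops. PF_EXAMPLE and the example_*
 * handlers it does implement are illustrative assumptions.
 */
static const struct proto_ops example_dgram_ops = {
	.family		= PF_EXAMPLE,		/* assumed address family */
	.owner		= THIS_MODULE,
	.release	= example_release,
	.bind		= example_bind,
	.connect	= sock_no_connect,	/* unsupported: -EOPNOTSUPP */
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= example_getname,
	.poll		= datagram_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= example_sendmsg,
	.recvmsg	= example_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};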
2256 
2257 /*
2258  *	Default Socket Callbacks
2259  */
2260 
2261 static void sock_def_wakeup(struct sock *sk)
2262 {
2263 	struct socket_wq *wq;
2264 
2265 	rcu_read_lock();
2266 	wq = rcu_dereference(sk->sk_wq);
2267 	if (skwq_has_sleeper(wq))
2268 		wake_up_interruptible_all(&wq->wait);
2269 	rcu_read_unlock();
2270 }
2271 
2272 static void sock_def_error_report(struct sock *sk)
2273 {
2274 	struct socket_wq *wq;
2275 
2276 	rcu_read_lock();
2277 	wq = rcu_dereference(sk->sk_wq);
2278 	if (skwq_has_sleeper(wq))
2279 		wake_up_interruptible_poll(&wq->wait, POLLERR);
2280 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2281 	rcu_read_unlock();
2282 }
2283 
2284 static void sock_def_readable(struct sock *sk)
2285 {
2286 	struct socket_wq *wq;
2287 
2288 	rcu_read_lock();
2289 	wq = rcu_dereference(sk->sk_wq);
2290 	if (skwq_has_sleeper(wq))
2291 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2292 						POLLRDNORM | POLLRDBAND);
2293 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2294 	rcu_read_unlock();
2295 }
2296 
2297 static void sock_def_write_space(struct sock *sk)
2298 {
2299 	struct socket_wq *wq;
2300 
2301 	rcu_read_lock();
2302 
2303 	/* Do not wake up a writer until he can make "significant"
2304 	 * progress.  --DaveM
2305 	 */
2306 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2307 		wq = rcu_dereference(sk->sk_wq);
2308 		if (skwq_has_sleeper(wq))
2309 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2310 						POLLWRNORM | POLLWRBAND);
2311 
2312 		/* Should agree with poll, otherwise some programs break */
2313 		if (sock_writeable(sk))
2314 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2315 	}
2316 
2317 	rcu_read_unlock();
2318 }
2319 
2320 static void sock_def_destruct(struct sock *sk)
2321 {
2322 }
2323 
2324 void sk_send_sigurg(struct sock *sk)
2325 {
2326 	if (sk->sk_socket && sk->sk_socket->file)
2327 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2328 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2329 }
2330 EXPORT_SYMBOL(sk_send_sigurg);
2331 
2332 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2333 		    unsigned long expires)
2334 {
2335 	if (!mod_timer(timer, expires))
2336 		sock_hold(sk);
2337 }
2338 EXPORT_SYMBOL(sk_reset_timer);
2339 
2340 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2341 {
2342 	if (del_timer(timer))
2343 		__sock_put(sk);
2344 }
2345 EXPORT_SYMBOL(sk_stop_timer);
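
/*
 * Hedged sketch: the reference-counting contract behind sk_reset_timer()
 * and sk_stop_timer(). An armed timer holds one sock reference, which the
 * handler (or sk_stop_timer()) must drop. The handler is assumed to have
 * been installed with setup_timer(&sk->sk_timer, ..., (unsigned long)sk);
 * the one-second timeout is arbitrary.
 */
static void example_timer_handler(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	bh_lock_sock(sk);
	/* ... protocol timeout work ... */
	bh_unlock_sock(sk);
	sock_put(sk);	/* drop the reference taken by sk_reset_timer() */
}

static void example_arm_timer(struct sock *sk)
{
	/* Takes a sock reference unless the timer was already pending */
	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
}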
2346 
2347 void sock_init_data(struct socket *sock, struct sock *sk)
2348 {
2349 	skb_queue_head_init(&sk->sk_receive_queue);
2350 	skb_queue_head_init(&sk->sk_write_queue);
2351 	skb_queue_head_init(&sk->sk_error_queue);
2352 
2353 	sk->sk_send_head	=	NULL;
2354 
2355 	init_timer(&sk->sk_timer);
2356 
2357 	sk->sk_allocation	=	GFP_KERNEL;
2358 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2359 	sk->sk_sndbuf		=	sysctl_wmem_default;
2360 	sk->sk_state		=	TCP_CLOSE;
2361 	sk_set_socket(sk, sock);
2362 
2363 	sock_set_flag(sk, SOCK_ZAPPED);
2364 
2365 	if (sock) {
2366 		sk->sk_type	=	sock->type;
2367 		sk->sk_wq	=	sock->wq;
2368 		sock->sk	=	sk;
2369 	} else
2370 		sk->sk_wq	=	NULL;
2371 
2372 	rwlock_init(&sk->sk_callback_lock);
2373 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2374 			af_callback_keys + sk->sk_family,
2375 			af_family_clock_key_strings[sk->sk_family]);
2376 
2377 	sk->sk_state_change	=	sock_def_wakeup;
2378 	sk->sk_data_ready	=	sock_def_readable;
2379 	sk->sk_write_space	=	sock_def_write_space;
2380 	sk->sk_error_report	=	sock_def_error_report;
2381 	sk->sk_destruct		=	sock_def_destruct;
2382 
2383 	sk->sk_frag.page	=	NULL;
2384 	sk->sk_frag.offset	=	0;
2385 	sk->sk_peek_off		=	-1;
2386 
2387 	sk->sk_peer_pid 	=	NULL;
2388 	sk->sk_peer_cred	=	NULL;
2389 	sk->sk_write_pending	=	0;
2390 	sk->sk_rcvlowat		=	1;
2391 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2392 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2393 
2394 	sk->sk_stamp = ktime_set(-1L, 0);
2395 
2396 #ifdef CONFIG_NET_RX_BUSY_POLL
2397 	sk->sk_napi_id		=	0;
2398 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2399 #endif
2400 
2401 	sk->sk_max_pacing_rate = ~0U;
2402 	sk->sk_pacing_rate = ~0U;
2403 	sk->sk_incoming_cpu = -1;
2404 	/*
2405 	 * Before updating sk_refcnt, we must commit prior changes to memory
2406 	 * (Documentation/RCU/rculist_nulls.txt for details)
2407 	 */
2408 	smp_wmb();
2409 	atomic_set(&sk->sk_refcnt, 1);
2410 	atomic_set(&sk->sk_drops, 0);
2411 }
2412 EXPORT_SYMBOL(sock_init_data);
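
/*
 * Hedged sketch: a protocol family's create() handler typically calls
 * sock_init_data() right after sk_alloc() so the defaults above are in
 * place before first use. PF_EXAMPLE, example_proto and example_dgram_ops
 * are assumed to be defined elsewhere.
 */
static int example_create(struct net *net, struct socket *sock,
			  int protocol, int kern)
{
	struct sock *sk;

	if (sock->type != SOCK_DGRAM)
		return -ESOCKTNOSUPPORT;

	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock->ops = &example_dgram_ops;
	sock_init_data(sock, sk);	/* queues, timer, default callbacks */
	return 0;
}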
2413 
2414 void lock_sock_nested(struct sock *sk, int subclass)
2415 {
2416 	might_sleep();
2417 	spin_lock_bh(&sk->sk_lock.slock);
2418 	if (sk->sk_lock.owned)
2419 		__lock_sock(sk);
2420 	sk->sk_lock.owned = 1;
2421 	spin_unlock(&sk->sk_lock.slock);
2422 	/*
2423 	 * The sk_lock has mutex_lock() semantics here:
2424 	 */
2425 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2426 	local_bh_enable();
2427 }
2428 EXPORT_SYMBOL(lock_sock_nested);
2429 
2430 void release_sock(struct sock *sk)
2431 {
2432 	/*
2433 	 * The sk_lock has mutex_unlock() semantics:
2434 	 */
2435 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2436 
2437 	spin_lock_bh(&sk->sk_lock.slock);
2438 	if (sk->sk_backlog.tail)
2439 		__release_sock(sk);
2440 
2441 	/* Warning: release_cb() might need to release sk ownership,
2442 	 * i.e. call sock_release_ownership(sk) before us.
2443 	 */
2444 	if (sk->sk_prot->release_cb)
2445 		sk->sk_prot->release_cb(sk);
2446 
2447 	sock_release_ownership(sk);
2448 	if (waitqueue_active(&sk->sk_lock.wq))
2449 		wake_up(&sk->sk_lock.wq);
2450 	spin_unlock_bh(&sk->sk_lock.slock);
2451 }
2452 EXPORT_SYMBOL(release_sock);
2453 
2454 /**
2455  * lock_sock_fast - fast version of lock_sock
2456  * @sk: socket
2457  *
2458  * This version should be used for very small sections, where the process won't block.
2459  * Returns false if the fast path is taken:
2460  *   sk_lock.slock locked, owned = 0, BH disabled
2461  * Returns true if the slow path is taken:
2462  *   sk_lock.slock unlocked, owned = 1, BH enabled
2463  */
2464 bool lock_sock_fast(struct sock *sk)
2465 {
2466 	might_sleep();
2467 	spin_lock_bh(&sk->sk_lock.slock);
2468 
2469 	if (!sk->sk_lock.owned)
2470 		/*
2471 		 * Note: we return with BH disabled on the fast path
2472 		 */
2473 		return false;
2474 
2475 	__lock_sock(sk);
2476 	sk->sk_lock.owned = 1;
2477 	spin_unlock(&sk->sk_lock.slock);
2478 	/*
2479 	 * The sk_lock has mutex_lock() semantics here:
2480 	 */
2481 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2482 	local_bh_enable();
2483 	return true;
2484 }
2485 EXPORT_SYMBOL(lock_sock_fast);
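
/*
 * Hedged sketch: pairing lock_sock_fast() with unlock_sock_fast() (defined
 * in include/net/sock.h) around a short critical section, as described
 * above. Reading sk_rcvbuf is just a stand-in for the real work.
 */
static int example_peek_rcvbuf(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int val = sk->sk_rcvbuf;

	unlock_sock_fast(sk, slow);	/* spin_unlock_bh() or release_sock() */
	return val;
}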
2486 
2487 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2488 {
2489 	struct timeval tv;
2490 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2491 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2492 	tv = ktime_to_timeval(sk->sk_stamp);
2493 	if (tv.tv_sec == -1)
2494 		return -ENOENT;
2495 	if (tv.tv_sec == 0) {
2496 		sk->sk_stamp = ktime_get_real();
2497 		tv = ktime_to_timeval(sk->sk_stamp);
2498 	}
2499 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2500 }
2501 EXPORT_SYMBOL(sock_get_timestamp);
2502 
2503 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2504 {
2505 	struct timespec ts;
2506 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2507 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2508 	ts = ktime_to_timespec(sk->sk_stamp);
2509 	if (ts.tv_sec == -1)
2510 		return -ENOENT;
2511 	if (ts.tv_sec == 0) {
2512 		sk->sk_stamp = ktime_get_real();
2513 		ts = ktime_to_timespec(sk->sk_stamp);
2514 	}
2515 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2516 }
2517 EXPORT_SYMBOL(sock_get_timestampns);
2518 
2519 void sock_enable_timestamp(struct sock *sk, int flag)
2520 {
2521 	if (!sock_flag(sk, flag)) {
2522 		unsigned long previous_flags = sk->sk_flags;
2523 
2524 		sock_set_flag(sk, flag);
2525 		/*
2526 		 * we just set one of the two flags which require net
2527 		 * time stamping, but time stamping might have been on
2528 		 * already because of the other one
2529 		 */
2530 		if (sock_needs_netstamp(sk) &&
2531 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2532 			net_enable_timestamp();
2533 	}
2534 }
2535 
2536 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2537 		       int level, int type)
2538 {
2539 	struct sock_exterr_skb *serr;
2540 	struct sk_buff *skb;
2541 	int copied, err;
2542 
2543 	err = -EAGAIN;
2544 	skb = sock_dequeue_err_skb(sk);
2545 	if (skb == NULL)
2546 		goto out;
2547 
2548 	copied = skb->len;
2549 	if (copied > len) {
2550 		msg->msg_flags |= MSG_TRUNC;
2551 		copied = len;
2552 	}
2553 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2554 	if (err)
2555 		goto out_free_skb;
2556 
2557 	sock_recv_timestamp(msg, sk, skb);
2558 
2559 	serr = SKB_EXT_ERR(skb);
2560 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2561 
2562 	msg->msg_flags |= MSG_ERRQUEUE;
2563 	err = copied;
2564 
2565 out_free_skb:
2566 	kfree_skb(skb);
2567 out:
2568 	return err;
2569 }
2570 EXPORT_SYMBOL(sock_recv_errqueue);
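
/*
 * Hedged sketch: a protocol recvmsg() dispatching MSG_ERRQUEUE requests to
 * sock_recv_errqueue(). The cmsg level/type shown are placeholders; the
 * function simply hands whatever values the protocol chooses to put_cmsg().
 */
static int example_recvmsg_err(struct sock *sk, struct msghdr *msg, size_t len,
			       int noblock, int flags, int *addr_len)
{
	if (flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, len,
					  SOL_SOCKET, SCM_TIMESTAMP);
	/* ... normal data path ... */
	return -EAGAIN;
}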
2571 
2572 /*
2573  *	Get a socket option on a socket.
2574  *
2575  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2576  *	asynchronous errors should be reported by getsockopt. We assume
2577  *	this means if you specify SO_ERROR (otherwise what's the point of it).
2578  */
2579 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2580 			   char __user *optval, int __user *optlen)
2581 {
2582 	struct sock *sk = sock->sk;
2583 
2584 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2585 }
2586 EXPORT_SYMBOL(sock_common_getsockopt);
2587 
2588 #ifdef CONFIG_COMPAT
2589 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2590 				  char __user *optval, int __user *optlen)
2591 {
2592 	struct sock *sk = sock->sk;
2593 
2594 	if (sk->sk_prot->compat_getsockopt != NULL)
2595 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2596 						      optval, optlen);
2597 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2598 }
2599 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2600 #endif
2601 
2602 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2603 			int flags)
2604 {
2605 	struct sock *sk = sock->sk;
2606 	int addr_len = 0;
2607 	int err;
2608 
2609 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2610 				   flags & ~MSG_DONTWAIT, &addr_len);
2611 	if (err >= 0)
2612 		msg->msg_namelen = addr_len;
2613 	return err;
2614 }
2615 EXPORT_SYMBOL(sock_common_recvmsg);
2616 
2617 /*
2618  *	Set socket options on a socket.
2619  */
2620 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2621 			   char __user *optval, unsigned int optlen)
2622 {
2623 	struct sock *sk = sock->sk;
2624 
2625 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2626 }
2627 EXPORT_SYMBOL(sock_common_setsockopt);
2628 
2629 #ifdef CONFIG_COMPAT
2630 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2631 				  char __user *optval, unsigned int optlen)
2632 {
2633 	struct sock *sk = sock->sk;
2634 
2635 	if (sk->sk_prot->compat_setsockopt != NULL)
2636 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2637 						      optval, optlen);
2638 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2639 }
2640 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2641 #endif
2642 
2643 void sk_common_release(struct sock *sk)
2644 {
2645 	if (sk->sk_prot->destroy)
2646 		sk->sk_prot->destroy(sk);
2647 
2648 	/*
2649 	 * Observation: when sk_common_release() is called, processes have
2650 	 * no access to the socket, but the network stack still does.
2651 	 * Step one, detach it from networking:
2652 	 *
2653 	 * A. Remove from hash tables.
2654 	 */
2655 
2656 	sk->sk_prot->unhash(sk);
2657 
2658 	/*
2659 	 * At this point the socket cannot receive new packets, but it is possible
2660 	 * that some packets are still in flight because some CPU is running the
2661 	 * receiver and did the hash table lookup before we unhashed the socket.
2662 	 * They will reach the receive queue and be purged by the socket destructor.
2663 	 *
2664 	 * We may also still have packets pending on the receive queue and, probably,
2665 	 * our own packets waiting in device queues. sock_destroy will drain the
2666 	 * receive queue, but transmitted packets will delay socket destruction
2667 	 * until the last reference is released.
2668 	 */
2669 
2670 	sock_orphan(sk);
2671 
2672 	xfrm_sk_free_policy(sk);
2673 
2674 	sk_refcnt_debug_release(sk);
2675 
2676 	if (sk->sk_frag.page) {
2677 		put_page(sk->sk_frag.page);
2678 		sk->sk_frag.page = NULL;
2679 	}
2680 
2681 	sock_put(sk);
2682 }
2683 EXPORT_SYMBOL(sk_common_release);
2684 
2685 #ifdef CONFIG_PROC_FS
2686 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
2687 struct prot_inuse {
2688 	int val[PROTO_INUSE_NR];
2689 };
2690 
2691 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2692 
2693 #ifdef CONFIG_NET_NS
2694 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2695 {
2696 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2697 }
2698 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2699 
2700 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2701 {
2702 	int cpu, idx = prot->inuse_idx;
2703 	int res = 0;
2704 
2705 	for_each_possible_cpu(cpu)
2706 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2707 
2708 	return res >= 0 ? res : 0;
2709 }
2710 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2711 
2712 static int __net_init sock_inuse_init_net(struct net *net)
2713 {
2714 	net->core.inuse = alloc_percpu(struct prot_inuse);
2715 	return net->core.inuse ? 0 : -ENOMEM;
2716 }
2717 
2718 static void __net_exit sock_inuse_exit_net(struct net *net)
2719 {
2720 	free_percpu(net->core.inuse);
2721 }
2722 
2723 static struct pernet_operations net_inuse_ops = {
2724 	.init = sock_inuse_init_net,
2725 	.exit = sock_inuse_exit_net,
2726 };
2727 
2728 static __init int net_inuse_init(void)
2729 {
2730 	if (register_pernet_subsys(&net_inuse_ops))
2731 		panic("Cannot initialize net inuse counters");
2732 
2733 	return 0;
2734 }
2735 
2736 core_initcall(net_inuse_init);
2737 #else
2738 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2739 
2740 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2741 {
2742 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2743 }
2744 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2745 
2746 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2747 {
2748 	int cpu, idx = prot->inuse_idx;
2749 	int res = 0;
2750 
2751 	for_each_possible_cpu(cpu)
2752 		res += per_cpu(prot_inuse, cpu).val[idx];
2753 
2754 	return res >= 0 ? res : 0;
2755 }
2756 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2757 #endif
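
/*
 * Hedged sketch: protocols feed these counters from their hash()/unhash()
 * implementations, which is what /proc/net/protocols (below) reports per
 * protocol. The lookup-table manipulation itself is elided as an assumption.
 */
static int example_hash(struct sock *sk)
{
	/* ... insert sk into the protocol's lookup table ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	return 0;
}

static void example_unhash(struct sock *sk)
{
	/* ... remove sk from the lookup table ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}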
2758 
2759 static void assign_proto_idx(struct proto *prot)
2760 {
2761 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2762 
2763 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2764 		pr_err("PROTO_INUSE_NR exhausted\n");
2765 		return;
2766 	}
2767 
2768 	set_bit(prot->inuse_idx, proto_inuse_idx);
2769 }
2770 
2771 static void release_proto_idx(struct proto *prot)
2772 {
2773 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2774 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2775 }
2776 #else
2777 static inline void assign_proto_idx(struct proto *prot)
2778 {
2779 }
2780 
2781 static inline void release_proto_idx(struct proto *prot)
2782 {
2783 }
2784 #endif
2785 
2786 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2787 {
2788 	if (!rsk_prot)
2789 		return;
2790 	kfree(rsk_prot->slab_name);
2791 	rsk_prot->slab_name = NULL;
2792 	kmem_cache_destroy(rsk_prot->slab);
2793 	rsk_prot->slab = NULL;
2794 }
2795 
2796 static int req_prot_init(const struct proto *prot)
2797 {
2798 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
2799 
2800 	if (!rsk_prot)
2801 		return 0;
2802 
2803 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2804 					prot->name);
2805 	if (!rsk_prot->slab_name)
2806 		return -ENOMEM;
2807 
2808 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2809 					   rsk_prot->obj_size, 0,
2810 					   prot->slab_flags, NULL);
2811 
2812 	if (!rsk_prot->slab) {
2813 		pr_crit("%s: Can't create request sock SLAB cache!\n",
2814 			prot->name);
2815 		return -ENOMEM;
2816 	}
2817 	return 0;
2818 }
2819 
2820 int proto_register(struct proto *prot, int alloc_slab)
2821 {
2822 	if (alloc_slab) {
2823 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2824 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2825 					NULL);
2826 
2827 		if (prot->slab == NULL) {
2828 			pr_crit("%s: Can't create sock SLAB cache!\n",
2829 				prot->name);
2830 			goto out;
2831 		}
2832 
2833 		if (req_prot_init(prot))
2834 			goto out_free_request_sock_slab;
2835 
2836 		if (prot->twsk_prot != NULL) {
2837 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2838 
2839 			if (prot->twsk_prot->twsk_slab_name == NULL)
2840 				goto out_free_request_sock_slab;
2841 
2842 			prot->twsk_prot->twsk_slab =
2843 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2844 						  prot->twsk_prot->twsk_obj_size,
2845 						  0,
2846 						  prot->slab_flags,
2847 						  NULL);
2848 			if (prot->twsk_prot->twsk_slab == NULL)
2849 				goto out_free_timewait_sock_slab_name;
2850 		}
2851 	}
2852 
2853 	mutex_lock(&proto_list_mutex);
2854 	list_add(&prot->node, &proto_list);
2855 	assign_proto_idx(prot);
2856 	mutex_unlock(&proto_list_mutex);
2857 	return 0;
2858 
2859 out_free_timewait_sock_slab_name:
2860 	kfree(prot->twsk_prot->twsk_slab_name);
2861 out_free_request_sock_slab:
2862 	req_prot_cleanup(prot->rsk_prot);
2863 
2864 	kmem_cache_destroy(prot->slab);
2865 	prot->slab = NULL;
2866 out:
2867 	return -ENOBUFS;
2868 }
2869 EXPORT_SYMBOL(proto_register);
2870 
2871 void proto_unregister(struct proto *prot)
2872 {
2873 	mutex_lock(&proto_list_mutex);
2874 	release_proto_idx(prot);
2875 	list_del(&prot->node);
2876 	mutex_unlock(&proto_list_mutex);
2877 
2878 	kmem_cache_destroy(prot->slab);
2879 	prot->slab = NULL;
2880 
2881 	req_prot_cleanup(prot->rsk_prot);
2882 
2883 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2884 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2885 		kfree(prot->twsk_prot->twsk_slab_name);
2886 		prot->twsk_prot->twsk_slab = NULL;
2887 	}
2888 }
2889 EXPORT_SYMBOL(proto_unregister);
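
/*
 * Hedged sketch: registering and unregistering a struct proto from a module,
 * matching the proto_register()/proto_unregister() pair above. Most proto
 * methods are omitted for brevity; struct example_sock is an assumption.
 */
struct example_sock {
	struct sock sk;		/* must be first; per-protocol state follows */
};

static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct example_sock),
};

static int __init example_init(void)
{
	/* Second argument: also create the per-protocol slab cache */
	return proto_register(&example_proto, 1);
}

static void __exit example_exit(void)
{
	proto_unregister(&example_proto);
}

module_init(example_init);
module_exit(example_exit);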
2890 
2891 #ifdef CONFIG_PROC_FS
2892 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2893 	__acquires(proto_list_mutex)
2894 {
2895 	mutex_lock(&proto_list_mutex);
2896 	return seq_list_start_head(&proto_list, *pos);
2897 }
2898 
2899 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2900 {
2901 	return seq_list_next(v, &proto_list, pos);
2902 }
2903 
2904 static void proto_seq_stop(struct seq_file *seq, void *v)
2905 	__releases(proto_list_mutex)
2906 {
2907 	mutex_unlock(&proto_list_mutex);
2908 }
2909 
2910 static char proto_method_implemented(const void *method)
2911 {
2912 	return method == NULL ? 'n' : 'y';
2913 }
2914 static long sock_prot_memory_allocated(struct proto *proto)
2915 {
2916 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2917 }
2918 
2919 static char *sock_prot_memory_pressure(struct proto *proto)
2920 {
2921 	return proto->memory_pressure != NULL ?
2922 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2923 }
2924 
2925 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2926 {
2927 
2928 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2929 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2930 		   proto->name,
2931 		   proto->obj_size,
2932 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2933 		   sock_prot_memory_allocated(proto),
2934 		   sock_prot_memory_pressure(proto),
2935 		   proto->max_header,
2936 		   proto->slab == NULL ? "no" : "yes",
2937 		   module_name(proto->owner),
2938 		   proto_method_implemented(proto->close),
2939 		   proto_method_implemented(proto->connect),
2940 		   proto_method_implemented(proto->disconnect),
2941 		   proto_method_implemented(proto->accept),
2942 		   proto_method_implemented(proto->ioctl),
2943 		   proto_method_implemented(proto->init),
2944 		   proto_method_implemented(proto->destroy),
2945 		   proto_method_implemented(proto->shutdown),
2946 		   proto_method_implemented(proto->setsockopt),
2947 		   proto_method_implemented(proto->getsockopt),
2948 		   proto_method_implemented(proto->sendmsg),
2949 		   proto_method_implemented(proto->recvmsg),
2950 		   proto_method_implemented(proto->sendpage),
2951 		   proto_method_implemented(proto->bind),
2952 		   proto_method_implemented(proto->backlog_rcv),
2953 		   proto_method_implemented(proto->hash),
2954 		   proto_method_implemented(proto->unhash),
2955 		   proto_method_implemented(proto->get_port),
2956 		   proto_method_implemented(proto->enter_memory_pressure));
2957 }
2958 
2959 static int proto_seq_show(struct seq_file *seq, void *v)
2960 {
2961 	if (v == &proto_list)
2962 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2963 			   "protocol",
2964 			   "size",
2965 			   "sockets",
2966 			   "memory",
2967 			   "press",
2968 			   "maxhdr",
2969 			   "slab",
2970 			   "module",
2971 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2972 	else
2973 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2974 	return 0;
2975 }
2976 
2977 static const struct seq_operations proto_seq_ops = {
2978 	.start  = proto_seq_start,
2979 	.next   = proto_seq_next,
2980 	.stop   = proto_seq_stop,
2981 	.show   = proto_seq_show,
2982 };
2983 
2984 static int proto_seq_open(struct inode *inode, struct file *file)
2985 {
2986 	return seq_open_net(inode, file, &proto_seq_ops,
2987 			    sizeof(struct seq_net_private));
2988 }
2989 
2990 static const struct file_operations proto_seq_fops = {
2991 	.owner		= THIS_MODULE,
2992 	.open		= proto_seq_open,
2993 	.read		= seq_read,
2994 	.llseek		= seq_lseek,
2995 	.release	= seq_release_net,
2996 };
2997 
2998 static __net_init int proto_init_net(struct net *net)
2999 {
3000 	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3001 		return -ENOMEM;
3002 
3003 	return 0;
3004 }
3005 
3006 static __net_exit void proto_exit_net(struct net *net)
3007 {
3008 	remove_proc_entry("protocols", net->proc_net);
3009 }
3010 
3011 
3012 static __net_initdata struct pernet_operations proto_net_ops = {
3013 	.init = proto_init_net,
3014 	.exit = proto_exit_net,
3015 };
3016 
3017 static int __init proto_init(void)
3018 {
3019 	return register_pernet_subsys(&proto_net_ops);
3020 }
3021 
3022 subsys_initcall(proto_init);
3023 
3024 #endif /* CONFIG_PROC_FS */
3025