xref: /openbmc/linux/net/core/sock.c (revision 8fdff1dc)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/types.h>
97 #include <linux/socket.h>
98 #include <linux/in.h>
99 #include <linux/kernel.h>
100 #include <linux/module.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/sched.h>
104 #include <linux/timer.h>
105 #include <linux/string.h>
106 #include <linux/sockios.h>
107 #include <linux/net.h>
108 #include <linux/mm.h>
109 #include <linux/slab.h>
110 #include <linux/interrupt.h>
111 #include <linux/poll.h>
112 #include <linux/tcp.h>
113 #include <linux/init.h>
114 #include <linux/highmem.h>
115 #include <linux/user_namespace.h>
116 #include <linux/static_key.h>
117 #include <linux/memcontrol.h>
118 #include <linux/prefetch.h>
119 
120 #include <asm/uaccess.h>
121 
122 #include <linux/netdevice.h>
123 #include <net/protocol.h>
124 #include <linux/skbuff.h>
125 #include <net/net_namespace.h>
126 #include <net/request_sock.h>
127 #include <net/sock.h>
128 #include <linux/net_tstamp.h>
129 #include <net/xfrm.h>
130 #include <linux/ipsec.h>
131 #include <net/cls_cgroup.h>
132 #include <net/netprio_cgroup.h>
133 
134 #include <linux/filter.h>
135 
136 #include <trace/events/sock.h>
137 
138 #ifdef CONFIG_INET
139 #include <net/tcp.h>
140 #endif
141 
142 static DEFINE_MUTEX(proto_list_mutex);
143 static LIST_HEAD(proto_list);
144 
145 #ifdef CONFIG_MEMCG_KMEM
146 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
147 {
148 	struct proto *proto;
149 	int ret = 0;
150 
151 	mutex_lock(&proto_list_mutex);
152 	list_for_each_entry(proto, &proto_list, node) {
153 		if (proto->init_cgroup) {
154 			ret = proto->init_cgroup(memcg, ss);
155 			if (ret)
156 				goto out;
157 		}
158 	}
159 
160 	mutex_unlock(&proto_list_mutex);
161 	return ret;
162 out:
163 	list_for_each_entry_continue_reverse(proto, &proto_list, node)
164 		if (proto->destroy_cgroup)
165 			proto->destroy_cgroup(memcg);
166 	mutex_unlock(&proto_list_mutex);
167 	return ret;
168 }
169 
170 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
171 {
172 	struct proto *proto;
173 
174 	mutex_lock(&proto_list_mutex);
175 	list_for_each_entry_reverse(proto, &proto_list, node)
176 		if (proto->destroy_cgroup)
177 			proto->destroy_cgroup(memcg);
178 	mutex_unlock(&proto_list_mutex);
179 }
180 #endif
181 
182 /*
183  * Each address family might have different locking rules, so we have
184  * one slock key per address family:
185  */
186 static struct lock_class_key af_family_keys[AF_MAX];
187 static struct lock_class_key af_family_slock_keys[AF_MAX];
188 
189 struct static_key memcg_socket_limit_enabled;
190 EXPORT_SYMBOL(memcg_socket_limit_enabled);
191 
192 /*
193  * Make lock validator output more readable. (we pre-construct these
194  * strings at build time, so that runtime initialization of socket
195  * locks is fast):
196  */
197 static const char *const af_family_key_strings[AF_MAX+1] = {
198   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
199   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
200   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
201   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
202   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
203   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
204   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
205   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
206   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
207   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
208   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
209   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
210   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
211   "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
212 };
213 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
214   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
215   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
216   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
217   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
218   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
219   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
220   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
221   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
222   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
223   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
224   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
225   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
226   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
227   "slock-AF_NFC"   , "slock-AF_MAX"
228 };
229 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
230   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
231   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
232   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
233   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
234   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
235   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
236   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
237   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
238   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
239   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
240   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
241   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
242   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
243   "clock-AF_NFC"   , "clock-AF_MAX"
244 };
245 
246 /*
247  * sk_callback_lock locking rules are per-address-family,
248  * so split the lock classes by using a per-AF key:
249  */
250 static struct lock_class_key af_callback_keys[AF_MAX];
251 
252 /* Take into consideration the size of the struct sk_buff overhead in the
253  * determination of these values, since that is non-constant across
254  * platforms.  This keeps socket queueing behavior and performance
255  * from depending upon such differences.
256  */
257 #define _SK_MEM_PACKETS		256
258 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
259 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
260 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
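
/*
 * Rough worked example (the exact figure depends on the architecture and on
 * the struct sk_buff layout of a given build): if SKB_TRUESIZE(256) came to,
 * say, 768 bytes, the defaults above would be 768 * 256 = 196608 bytes
 * (192 KiB) for both SK_WMEM_MAX and SK_RMEM_MAX.
 */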
261 
262 /* Run time adjustable parameters. */
263 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
264 EXPORT_SYMBOL(sysctl_wmem_max);
265 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
266 EXPORT_SYMBOL(sysctl_rmem_max);
267 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
268 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
269 
270 /* Maximum space consumed by an iovec or ancillary data, plus some slack */
271 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
272 EXPORT_SYMBOL(sysctl_optmem_max);
273 
274 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
275 EXPORT_SYMBOL_GPL(memalloc_socks);
276 
277 /**
278  * sk_set_memalloc - sets %SOCK_MEMALLOC
279  * @sk: socket to set it on
280  *
281  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
282  * It's the responsibility of the admin to adjust min_free_kbytes
283  * to meet the requirements.
284  */
285 void sk_set_memalloc(struct sock *sk)
286 {
287 	sock_set_flag(sk, SOCK_MEMALLOC);
288 	sk->sk_allocation |= __GFP_MEMALLOC;
289 	static_key_slow_inc(&memalloc_socks);
290 }
291 EXPORT_SYMBOL_GPL(sk_set_memalloc);
292 
293 void sk_clear_memalloc(struct sock *sk)
294 {
295 	sock_reset_flag(sk, SOCK_MEMALLOC);
296 	sk->sk_allocation &= ~__GFP_MEMALLOC;
297 	static_key_slow_dec(&memalloc_socks);
298 
299 	/*
300 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
301 	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
302 	 * it has rmem allocations there is a risk that the user of the
303 	 * it has rmem allocations, there is a risk that the user of the
304 	 * limits. By rights, sk_clear_memalloc() should only be called
305 	 * on sockets being torn down but warn and reset the accounting if
306 	 * that assumption breaks.
307 	 */
308 	if (WARN_ON(sk->sk_forward_alloc))
309 		sk_mem_reclaim(sk);
310 }
311 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
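
/*
 * Illustrative sketch, not taken from this file (the function names are
 * hypothetical): a protocol carrying memory-reclaim traffic, e.g. a
 * swap-over-network transport, would typically bracket the lifetime of its
 * socket with the two helpers above so that transmissions may dip into the
 * emergency reserves.
 */
static void example_reclaim_transport_start(struct sock *sk)
{
	/* Mark the socket before any reclaim-driven traffic flows. */
	sk_set_memalloc(sk);
}

static void example_reclaim_transport_stop(struct sock *sk)
{
	/* Drop the marking as part of tearing the socket down. */
	sk_clear_memalloc(sk);
}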
312 
313 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
314 {
315 	int ret;
316 	unsigned long pflags = current->flags;
317 
318 	/* these should have been dropped before queueing */
319 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
320 
321 	current->flags |= PF_MEMALLOC;
322 	ret = sk->sk_backlog_rcv(sk, skb);
323 	tsk_restore_flags(current, pflags, PF_MEMALLOC);
324 
325 	return ret;
326 }
327 EXPORT_SYMBOL(__sk_backlog_rcv);
328 
329 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
330 {
331 	struct timeval tv;
332 
333 	if (optlen < sizeof(tv))
334 		return -EINVAL;
335 	if (copy_from_user(&tv, optval, sizeof(tv)))
336 		return -EFAULT;
337 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
338 		return -EDOM;
339 
340 	if (tv.tv_sec < 0) {
341 		static int warned __read_mostly;
342 
343 		*timeo_p = 0;
344 		if (warned < 10 && net_ratelimit()) {
345 			warned++;
346 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
347 				__func__, current->comm, task_pid_nr(current));
348 		}
349 		return 0;
350 	}
351 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
352 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
353 		return 0;
354 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
355 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
356 	return 0;
357 }
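
/*
 * Worked example of the conversion above, assuming HZ == 1000 (one jiffy per
 * millisecond): a timeval of { .tv_sec = 2, .tv_usec = 500000 } becomes
 * 2 * 1000 + (500000 + 999) / 1000 = 2500 jiffies; fractional jiffies are
 * rounded up.
 */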
358 
359 static void sock_warn_obsolete_bsdism(const char *name)
360 {
361 	static int warned;
362 	static char warncomm[TASK_COMM_LEN];
363 	if (strcmp(warncomm, current->comm) && warned < 5) {
364 		strcpy(warncomm,  current->comm);
365 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
366 			warncomm, name);
367 		warned++;
368 	}
369 }
370 
371 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
372 
373 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
374 {
375 	if (sk->sk_flags & flags) {
376 		sk->sk_flags &= ~flags;
377 		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
378 			net_disable_timestamp();
379 	}
380 }
381 
382 
383 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
384 {
385 	int err;
386 	int skb_len;
387 	unsigned long flags;
388 	struct sk_buff_head *list = &sk->sk_receive_queue;
389 
390 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
391 		atomic_inc(&sk->sk_drops);
392 		trace_sock_rcvqueue_full(sk, skb);
393 		return -ENOMEM;
394 	}
395 
396 	err = sk_filter(sk, skb);
397 	if (err)
398 		return err;
399 
400 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
401 		atomic_inc(&sk->sk_drops);
402 		return -ENOBUFS;
403 	}
404 
405 	skb->dev = NULL;
406 	skb_set_owner_r(skb, sk);
407 
408 	/* Cache the SKB length before we tack it onto the receive
409 	 * queue.  Once it is added it no longer belongs to us and
410 	 * may be freed by other threads of control pulling packets
411 	 * from the queue.
412 	 */
413 	skb_len = skb->len;
414 
415 	/* we escape from the RCU-protected region; make sure we don't leak
416 	 * a non-refcounted dst
417 	 */
418 	skb_dst_force(skb);
419 
420 	spin_lock_irqsave(&list->lock, flags);
421 	skb->dropcount = atomic_read(&sk->sk_drops);
422 	__skb_queue_tail(list, skb);
423 	spin_unlock_irqrestore(&list->lock, flags);
424 
425 	if (!sock_flag(sk, SOCK_DEAD))
426 		sk->sk_data_ready(sk, skb_len);
427 	return 0;
428 }
429 EXPORT_SYMBOL(sock_queue_rcv_skb);
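
/*
 * Illustrative sketch of how a protocol's delivery path typically uses the
 * helper above (the function name is hypothetical): on failure the skb has
 * not been queued and still belongs to the caller, which must free it.
 */
static int example_proto_deliver(struct sock *sk, struct sk_buff *skb)
{
	int err = sock_queue_rcv_skb(sk, skb);

	if (err < 0)
		kfree_skb(skb);
	return err;
}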
430 
431 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
432 {
433 	int rc = NET_RX_SUCCESS;
434 
435 	if (sk_filter(sk, skb))
436 		goto discard_and_relse;
437 
438 	skb->dev = NULL;
439 
440 	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
441 		atomic_inc(&sk->sk_drops);
442 		goto discard_and_relse;
443 	}
444 	if (nested)
445 		bh_lock_sock_nested(sk);
446 	else
447 		bh_lock_sock(sk);
448 	if (!sock_owned_by_user(sk)) {
449 		/*
450 		 * trylock + unlock semantics:
451 		 */
452 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
453 
454 		rc = sk_backlog_rcv(sk, skb);
455 
456 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
457 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
458 		bh_unlock_sock(sk);
459 		atomic_inc(&sk->sk_drops);
460 		goto discard_and_relse;
461 	}
462 
463 	bh_unlock_sock(sk);
464 out:
465 	sock_put(sk);
466 	return rc;
467 discard_and_relse:
468 	kfree_skb(skb);
469 	goto out;
470 }
471 EXPORT_SYMBOL(sk_receive_skb);
472 
473 void sk_reset_txq(struct sock *sk)
474 {
475 	sk_tx_queue_clear(sk);
476 }
477 EXPORT_SYMBOL(sk_reset_txq);
478 
479 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
480 {
481 	struct dst_entry *dst = __sk_dst_get(sk);
482 
483 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
484 		sk_tx_queue_clear(sk);
485 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
486 		dst_release(dst);
487 		return NULL;
488 	}
489 
490 	return dst;
491 }
492 EXPORT_SYMBOL(__sk_dst_check);
493 
494 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
495 {
496 	struct dst_entry *dst = sk_dst_get(sk);
497 
498 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
499 		sk_dst_reset(sk);
500 		dst_release(dst);
501 		return NULL;
502 	}
503 
504 	return dst;
505 }
506 EXPORT_SYMBOL(sk_dst_check);
507 
508 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
509 				int optlen)
510 {
511 	int ret = -ENOPROTOOPT;
512 #ifdef CONFIG_NETDEVICES
513 	struct net *net = sock_net(sk);
514 	char devname[IFNAMSIZ];
515 	int index;
516 
517 	/* Sorry... */
518 	ret = -EPERM;
519 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
520 		goto out;
521 
522 	ret = -EINVAL;
523 	if (optlen < 0)
524 		goto out;
525 
526 	/* Bind this socket to a particular device like "eth0",
527 	 * as specified in the passed interface name. If the
528 	 * name is "" or the option length is zero, the socket
529 	 * is not bound.
530 	 */
531 	if (optlen > IFNAMSIZ - 1)
532 		optlen = IFNAMSIZ - 1;
533 	memset(devname, 0, sizeof(devname));
534 
535 	ret = -EFAULT;
536 	if (copy_from_user(devname, optval, optlen))
537 		goto out;
538 
539 	index = 0;
540 	if (devname[0] != '\0') {
541 		struct net_device *dev;
542 
543 		rcu_read_lock();
544 		dev = dev_get_by_name_rcu(net, devname);
545 		if (dev)
546 			index = dev->ifindex;
547 		rcu_read_unlock();
548 		ret = -ENODEV;
549 		if (!dev)
550 			goto out;
551 	}
552 
553 	lock_sock(sk);
554 	sk->sk_bound_dev_if = index;
555 	sk_dst_reset(sk);
556 	release_sock(sk);
557 
558 	ret = 0;
559 
560 out:
561 #endif
562 
563 	return ret;
564 }
565 
566 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
567 				int __user *optlen, int len)
568 {
569 	int ret = -ENOPROTOOPT;
570 #ifdef CONFIG_NETDEVICES
571 	struct net *net = sock_net(sk);
572 	struct net_device *dev;
573 	char devname[IFNAMSIZ];
574 	unsigned seq;
575 
576 	if (sk->sk_bound_dev_if == 0) {
577 		len = 0;
578 		goto zero;
579 	}
580 
581 	ret = -EINVAL;
582 	if (len < IFNAMSIZ)
583 		goto out;
584 
585 retry:
586 	seq = read_seqcount_begin(&devnet_rename_seq);
587 	rcu_read_lock();
588 	dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
589 	ret = -ENODEV;
590 	if (!dev) {
591 		rcu_read_unlock();
592 		goto out;
593 	}
594 
595 	strcpy(devname, dev->name);
596 	rcu_read_unlock();
597 	if (read_seqcount_retry(&devnet_rename_seq, seq))
598 		goto retry;
599 
600 	len = strlen(devname) + 1;
601 
602 	ret = -EFAULT;
603 	if (copy_to_user(optval, devname, len))
604 		goto out;
605 
606 zero:
607 	ret = -EFAULT;
608 	if (put_user(len, optlen))
609 		goto out;
610 
611 	ret = 0;
612 
613 out:
614 #endif
615 
616 	return ret;
617 }
618 
619 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
620 {
621 	if (valbool)
622 		sock_set_flag(sk, bit);
623 	else
624 		sock_reset_flag(sk, bit);
625 }
626 
627 /*
628  *	This is meant for all protocols to use and covers goings on
629  *	at the socket level. Everything here is generic.
630  */
631 
632 int sock_setsockopt(struct socket *sock, int level, int optname,
633 		    char __user *optval, unsigned int optlen)
634 {
635 	struct sock *sk = sock->sk;
636 	int val;
637 	int valbool;
638 	struct linger ling;
639 	int ret = 0;
640 
641 	/*
642 	 *	Options without arguments
643 	 */
644 
645 	if (optname == SO_BINDTODEVICE)
646 		return sock_setbindtodevice(sk, optval, optlen);
647 
648 	if (optlen < sizeof(int))
649 		return -EINVAL;
650 
651 	if (get_user(val, (int __user *)optval))
652 		return -EFAULT;
653 
654 	valbool = val ? 1 : 0;
655 
656 	lock_sock(sk);
657 
658 	switch (optname) {
659 	case SO_DEBUG:
660 		if (val && !capable(CAP_NET_ADMIN))
661 			ret = -EACCES;
662 		else
663 			sock_valbool_flag(sk, SOCK_DBG, valbool);
664 		break;
665 	case SO_REUSEADDR:
666 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
667 		break;
668 	case SO_TYPE:
669 	case SO_PROTOCOL:
670 	case SO_DOMAIN:
671 	case SO_ERROR:
672 		ret = -ENOPROTOOPT;
673 		break;
674 	case SO_DONTROUTE:
675 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
676 		break;
677 	case SO_BROADCAST:
678 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
679 		break;
680 	case SO_SNDBUF:
681 		/* Don't error on this; BSD doesn't, and if you think
682 		 * about it, this is right. Otherwise apps have to
683 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
684 		 * are treated in BSD as hints.
685 		 */
686 		val = min_t(u32, val, sysctl_wmem_max);
687 set_sndbuf:
688 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
689 		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
690 		/* Wake up sending tasks if we upped the value. */
691 		sk->sk_write_space(sk);
692 		break;
693 
694 	case SO_SNDBUFFORCE:
695 		if (!capable(CAP_NET_ADMIN)) {
696 			ret = -EPERM;
697 			break;
698 		}
699 		goto set_sndbuf;
700 
701 	case SO_RCVBUF:
702 		/* Don't error on this; BSD doesn't, and if you think
703 		 * about it, this is right. Otherwise apps have to
704 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
705 		 * are treated in BSD as hints.
706 		 */
707 		val = min_t(u32, val, sysctl_rmem_max);
708 set_rcvbuf:
709 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
710 		/*
711 		 * We double it on the way in to account for
712 		 * "struct sk_buff" etc. overhead.   Applications
713 		 * assume that the SO_RCVBUF setting they make will
714 		 * allow that much actual data to be received on that
715 		 * socket.
716 		 *
717 		 * Applications are unaware that "struct sk_buff" and
718 		 * other overheads allocate from the receive buffer
719 		 * during socket buffer allocation.
720 		 *
721 		 * And after considering the possible alternatives,
722 		 * returning the value we actually used in getsockopt
723 		 * is the most desirable behavior.
724 		 */
725 		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
726 		break;
727 
728 	case SO_RCVBUFFORCE:
729 		if (!capable(CAP_NET_ADMIN)) {
730 			ret = -EPERM;
731 			break;
732 		}
733 		goto set_rcvbuf;
734 
735 	case SO_KEEPALIVE:
736 #ifdef CONFIG_INET
737 		if (sk->sk_protocol == IPPROTO_TCP &&
738 		    sk->sk_type == SOCK_STREAM)
739 			tcp_set_keepalive(sk, valbool);
740 #endif
741 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
742 		break;
743 
744 	case SO_OOBINLINE:
745 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
746 		break;
747 
748 	case SO_NO_CHECK:
749 		sk->sk_no_check = valbool;
750 		break;
751 
752 	case SO_PRIORITY:
753 		if ((val >= 0 && val <= 6) ||
754 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
755 			sk->sk_priority = val;
756 		else
757 			ret = -EPERM;
758 		break;
759 
760 	case SO_LINGER:
761 		if (optlen < sizeof(ling)) {
762 			ret = -EINVAL;	/* 1003.1g */
763 			break;
764 		}
765 		if (copy_from_user(&ling, optval, sizeof(ling))) {
766 			ret = -EFAULT;
767 			break;
768 		}
769 		if (!ling.l_onoff)
770 			sock_reset_flag(sk, SOCK_LINGER);
771 		else {
772 #if (BITS_PER_LONG == 32)
773 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
774 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
775 			else
776 #endif
777 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
778 			sock_set_flag(sk, SOCK_LINGER);
779 		}
780 		break;
781 
782 	case SO_BSDCOMPAT:
783 		sock_warn_obsolete_bsdism("setsockopt");
784 		break;
785 
786 	case SO_PASSCRED:
787 		if (valbool)
788 			set_bit(SOCK_PASSCRED, &sock->flags);
789 		else
790 			clear_bit(SOCK_PASSCRED, &sock->flags);
791 		break;
792 
793 	case SO_TIMESTAMP:
794 	case SO_TIMESTAMPNS:
795 		if (valbool)  {
796 			if (optname == SO_TIMESTAMP)
797 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
798 			else
799 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
800 			sock_set_flag(sk, SOCK_RCVTSTAMP);
801 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
802 		} else {
803 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
804 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
805 		}
806 		break;
807 
808 	case SO_TIMESTAMPING:
809 		if (val & ~SOF_TIMESTAMPING_MASK) {
810 			ret = -EINVAL;
811 			break;
812 		}
813 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
814 				  val & SOF_TIMESTAMPING_TX_HARDWARE);
815 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
816 				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
817 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
818 				  val & SOF_TIMESTAMPING_RX_HARDWARE);
819 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
820 			sock_enable_timestamp(sk,
821 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
822 		else
823 			sock_disable_timestamp(sk,
824 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
825 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
826 				  val & SOF_TIMESTAMPING_SOFTWARE);
827 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
828 				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
829 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
830 				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
831 		break;
832 
833 	case SO_RCVLOWAT:
834 		if (val < 0)
835 			val = INT_MAX;
836 		sk->sk_rcvlowat = val ? : 1;
837 		break;
838 
839 	case SO_RCVTIMEO:
840 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
841 		break;
842 
843 	case SO_SNDTIMEO:
844 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
845 		break;
846 
847 	case SO_ATTACH_FILTER:
848 		ret = -EINVAL;
849 		if (optlen == sizeof(struct sock_fprog)) {
850 			struct sock_fprog fprog;
851 
852 			ret = -EFAULT;
853 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
854 				break;
855 
856 			ret = sk_attach_filter(&fprog, sk);
857 		}
858 		break;
859 
860 	case SO_DETACH_FILTER:
861 		ret = sk_detach_filter(sk);
862 		break;
863 
864 	case SO_PASSSEC:
865 		if (valbool)
866 			set_bit(SOCK_PASSSEC, &sock->flags);
867 		else
868 			clear_bit(SOCK_PASSSEC, &sock->flags);
869 		break;
870 	case SO_MARK:
871 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
872 			ret = -EPERM;
873 		else
874 			sk->sk_mark = val;
875 		break;
876 
877 		/* We implement SO_SNDLOWAT etc. to
878 		   not be settable (1003.1g 5.3) */
879 	case SO_RXQ_OVFL:
880 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
881 		break;
882 
883 	case SO_WIFI_STATUS:
884 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
885 		break;
886 
887 	case SO_PEEK_OFF:
888 		if (sock->ops->set_peek_off)
889 			sock->ops->set_peek_off(sk, val);
890 		else
891 			ret = -EOPNOTSUPP;
892 		break;
893 
894 	case SO_NOFCS:
895 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
896 		break;
897 
898 	default:
899 		ret = -ENOPROTOOPT;
900 		break;
901 	}
902 	release_sock(sk);
903 	return ret;
904 }
905 EXPORT_SYMBOL(sock_setsockopt);
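
/*
 * Userspace sketch of the SO_RCVBUF/SO_SNDBUF doubling described above
 * (assumes the usual <sys/socket.h> declarations and an open socket 'fd'):
 * the value read back is roughly twice the requested one, clamped between
 * SOCK_MIN_RCVBUF and twice sysctl_rmem_max.
 *
 *	int req = 65536, got = 0;
 *	socklen_t len = sizeof(got);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
 *
 * After this, 'got' typically reads back as 2 * req, i.e. 131072.
 */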
906 
907 
908 void cred_to_ucred(struct pid *pid, const struct cred *cred,
909 		   struct ucred *ucred)
910 {
911 	ucred->pid = pid_vnr(pid);
912 	ucred->uid = ucred->gid = -1;
913 	if (cred) {
914 		struct user_namespace *current_ns = current_user_ns();
915 
916 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
917 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
918 	}
919 }
920 EXPORT_SYMBOL_GPL(cred_to_ucred);
921 
922 int sock_getsockopt(struct socket *sock, int level, int optname,
923 		    char __user *optval, int __user *optlen)
924 {
925 	struct sock *sk = sock->sk;
926 
927 	union {
928 		int val;
929 		struct linger ling;
930 		struct timeval tm;
931 	} v;
932 
933 	int lv = sizeof(int);
934 	int len;
935 
936 	if (get_user(len, optlen))
937 		return -EFAULT;
938 	if (len < 0)
939 		return -EINVAL;
940 
941 	memset(&v, 0, sizeof(v));
942 
943 	switch (optname) {
944 	case SO_DEBUG:
945 		v.val = sock_flag(sk, SOCK_DBG);
946 		break;
947 
948 	case SO_DONTROUTE:
949 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
950 		break;
951 
952 	case SO_BROADCAST:
953 		v.val = sock_flag(sk, SOCK_BROADCAST);
954 		break;
955 
956 	case SO_SNDBUF:
957 		v.val = sk->sk_sndbuf;
958 		break;
959 
960 	case SO_RCVBUF:
961 		v.val = sk->sk_rcvbuf;
962 		break;
963 
964 	case SO_REUSEADDR:
965 		v.val = sk->sk_reuse;
966 		break;
967 
968 	case SO_KEEPALIVE:
969 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
970 		break;
971 
972 	case SO_TYPE:
973 		v.val = sk->sk_type;
974 		break;
975 
976 	case SO_PROTOCOL:
977 		v.val = sk->sk_protocol;
978 		break;
979 
980 	case SO_DOMAIN:
981 		v.val = sk->sk_family;
982 		break;
983 
984 	case SO_ERROR:
985 		v.val = -sock_error(sk);
986 		if (v.val == 0)
987 			v.val = xchg(&sk->sk_err_soft, 0);
988 		break;
989 
990 	case SO_OOBINLINE:
991 		v.val = sock_flag(sk, SOCK_URGINLINE);
992 		break;
993 
994 	case SO_NO_CHECK:
995 		v.val = sk->sk_no_check;
996 		break;
997 
998 	case SO_PRIORITY:
999 		v.val = sk->sk_priority;
1000 		break;
1001 
1002 	case SO_LINGER:
1003 		lv		= sizeof(v.ling);
1004 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1005 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1006 		break;
1007 
1008 	case SO_BSDCOMPAT:
1009 		sock_warn_obsolete_bsdism("getsockopt");
1010 		break;
1011 
1012 	case SO_TIMESTAMP:
1013 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1014 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1015 		break;
1016 
1017 	case SO_TIMESTAMPNS:
1018 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1019 		break;
1020 
1021 	case SO_TIMESTAMPING:
1022 		v.val = 0;
1023 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1024 			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1025 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1026 			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1027 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1028 			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1029 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1030 			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1031 		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1032 			v.val |= SOF_TIMESTAMPING_SOFTWARE;
1033 		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
1034 			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
1035 		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1036 			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1037 		break;
1038 
1039 	case SO_RCVTIMEO:
1040 		lv = sizeof(struct timeval);
1041 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1042 			v.tm.tv_sec = 0;
1043 			v.tm.tv_usec = 0;
1044 		} else {
1045 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1046 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1047 		}
1048 		break;
1049 
1050 	case SO_SNDTIMEO:
1051 		lv = sizeof(struct timeval);
1052 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1053 			v.tm.tv_sec = 0;
1054 			v.tm.tv_usec = 0;
1055 		} else {
1056 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1057 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1058 		}
1059 		break;
1060 
1061 	case SO_RCVLOWAT:
1062 		v.val = sk->sk_rcvlowat;
1063 		break;
1064 
1065 	case SO_SNDLOWAT:
1066 		v.val = 1;
1067 		break;
1068 
1069 	case SO_PASSCRED:
1070 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1071 		break;
1072 
1073 	case SO_PEERCRED:
1074 	{
1075 		struct ucred peercred;
1076 		if (len > sizeof(peercred))
1077 			len = sizeof(peercred);
1078 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1079 		if (copy_to_user(optval, &peercred, len))
1080 			return -EFAULT;
1081 		goto lenout;
1082 	}
1083 
1084 	case SO_PEERNAME:
1085 	{
1086 		char address[128];
1087 
1088 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1089 			return -ENOTCONN;
1090 		if (lv < len)
1091 			return -EINVAL;
1092 		if (copy_to_user(optval, address, len))
1093 			return -EFAULT;
1094 		goto lenout;
1095 	}
1096 
1097 	/* Dubious BSD thing... Probably nobody even uses it, but
1098 	 * the UNIX standard wants it for whatever reason... -DaveM
1099 	 */
1100 	case SO_ACCEPTCONN:
1101 		v.val = sk->sk_state == TCP_LISTEN;
1102 		break;
1103 
1104 	case SO_PASSSEC:
1105 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1106 		break;
1107 
1108 	case SO_PEERSEC:
1109 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1110 
1111 	case SO_MARK:
1112 		v.val = sk->sk_mark;
1113 		break;
1114 
1115 	case SO_RXQ_OVFL:
1116 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1117 		break;
1118 
1119 	case SO_WIFI_STATUS:
1120 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1121 		break;
1122 
1123 	case SO_PEEK_OFF:
1124 		if (!sock->ops->set_peek_off)
1125 			return -EOPNOTSUPP;
1126 
1127 		v.val = sk->sk_peek_off;
1128 		break;
1129 	case SO_NOFCS:
1130 		v.val = sock_flag(sk, SOCK_NOFCS);
1131 		break;
1132 
1133 	case SO_BINDTODEVICE:
1134 		return sock_getbindtodevice(sk, optval, optlen, len);
1135 
1136 	case SO_GET_FILTER:
1137 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1138 		if (len < 0)
1139 			return len;
1140 
1141 		goto lenout;
1142 
1143 	default:
1144 		return -ENOPROTOOPT;
1145 	}
1146 
1147 	if (len > lv)
1148 		len = lv;
1149 	if (copy_to_user(optval, &v, len))
1150 		return -EFAULT;
1151 lenout:
1152 	if (put_user(len, optlen))
1153 		return -EFAULT;
1154 	return 0;
1155 }
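
/*
 * Userspace sketch of the SO_PEERCRED branch above (a connected AF_UNIX
 * socket; assumes <sys/socket.h> with struct ucred visible):
 *
 *	struct ucred cr;
 *	socklen_t len = sizeof(cr);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &len) == 0)
 *		printf("peer pid=%d uid=%d gid=%d\n", cr.pid, cr.uid, cr.gid);
 *
 * The pid, uid and gid are translated into the caller's namespaces by
 * cred_to_ucred() before being copied out.
 */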
1156 
1157 /*
1158  * Initialize an sk_lock.
1159  *
1160  * (We also register the sk_lock with the lock validator.)
1161  */
1162 static inline void sock_lock_init(struct sock *sk)
1163 {
1164 	sock_lock_init_class_and_name(sk,
1165 			af_family_slock_key_strings[sk->sk_family],
1166 			af_family_slock_keys + sk->sk_family,
1167 			af_family_key_strings[sk->sk_family],
1168 			af_family_keys + sk->sk_family);
1169 }
1170 
1171 /*
1172  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1173  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1174  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1175  */
1176 static void sock_copy(struct sock *nsk, const struct sock *osk)
1177 {
1178 #ifdef CONFIG_SECURITY_NETWORK
1179 	void *sptr = nsk->sk_security;
1180 #endif
1181 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1182 
1183 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1184 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1185 
1186 #ifdef CONFIG_SECURITY_NETWORK
1187 	nsk->sk_security = sptr;
1188 	security_sk_clone(osk, nsk);
1189 #endif
1190 }
1191 
1192 /*
1193  * caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls nodes
1194  * unmodified. Special care is taken when initializing the object to zero.
1195  */
1196 static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1197 {
1198 	if (offsetof(struct sock, sk_node.next) != 0)
1199 		memset(sk, 0, offsetof(struct sock, sk_node.next));
1200 	memset(&sk->sk_node.pprev, 0,
1201 	       size - offsetof(struct sock, sk_node.pprev));
1202 }
1203 
1204 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1205 {
1206 	unsigned long nulls1, nulls2;
1207 
1208 	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1209 	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1210 	if (nulls1 > nulls2)
1211 		swap(nulls1, nulls2);
1212 
1213 	if (nulls1 != 0)
1214 		memset((char *)sk, 0, nulls1);
1215 	memset((char *)sk + nulls1 + sizeof(void *), 0,
1216 	       nulls2 - nulls1 - sizeof(void *));
1217 	memset((char *)sk + nulls2 + sizeof(void *), 0,
1218 	       size - nulls2 - sizeof(void *));
1219 }
1220 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1221 
1222 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1223 		int family)
1224 {
1225 	struct sock *sk;
1226 	struct kmem_cache *slab;
1227 
1228 	slab = prot->slab;
1229 	if (slab != NULL) {
1230 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1231 		if (!sk)
1232 			return sk;
1233 		if (priority & __GFP_ZERO) {
1234 			if (prot->clear_sk)
1235 				prot->clear_sk(sk, prot->obj_size);
1236 			else
1237 				sk_prot_clear_nulls(sk, prot->obj_size);
1238 		}
1239 	} else
1240 		sk = kmalloc(prot->obj_size, priority);
1241 
1242 	if (sk != NULL) {
1243 		kmemcheck_annotate_bitfield(sk, flags);
1244 
1245 		if (security_sk_alloc(sk, family, priority))
1246 			goto out_free;
1247 
1248 		if (!try_module_get(prot->owner))
1249 			goto out_free_sec;
1250 		sk_tx_queue_clear(sk);
1251 	}
1252 
1253 	return sk;
1254 
1255 out_free_sec:
1256 	security_sk_free(sk);
1257 out_free:
1258 	if (slab != NULL)
1259 		kmem_cache_free(slab, sk);
1260 	else
1261 		kfree(sk);
1262 	return NULL;
1263 }
1264 
1265 static void sk_prot_free(struct proto *prot, struct sock *sk)
1266 {
1267 	struct kmem_cache *slab;
1268 	struct module *owner;
1269 
1270 	owner = prot->owner;
1271 	slab = prot->slab;
1272 
1273 	security_sk_free(sk);
1274 	if (slab != NULL)
1275 		kmem_cache_free(slab, sk);
1276 	else
1277 		kfree(sk);
1278 	module_put(owner);
1279 }
1280 
1281 #ifdef CONFIG_CGROUPS
1282 #if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1283 void sock_update_classid(struct sock *sk, struct task_struct *task)
1284 {
1285 	u32 classid;
1286 
1287 	classid = task_cls_classid(task);
1288 	if (classid != sk->sk_classid)
1289 		sk->sk_classid = classid;
1290 }
1291 EXPORT_SYMBOL(sock_update_classid);
1292 #endif
1293 
1294 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1295 void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
1296 {
1297 	if (in_interrupt())
1298 		return;
1299 
1300 	sk->sk_cgrp_prioidx = task_netprioidx(task);
1301 }
1302 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1303 #endif
1304 #endif
1305 
1306 /**
1307  *	sk_alloc - All socket objects are allocated here
1308  *	@net: the applicable net namespace
1309  *	@family: protocol family
1310  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1311  *	@prot: struct proto associated with this new sock instance
1312  */
1313 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1314 		      struct proto *prot)
1315 {
1316 	struct sock *sk;
1317 
1318 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1319 	if (sk) {
1320 		sk->sk_family = family;
1321 		/*
1322 		 * See comment in struct sock definition to understand
1323 		 * why we need sk_prot_creator -acme
1324 		 */
1325 		sk->sk_prot = sk->sk_prot_creator = prot;
1326 		sock_lock_init(sk);
1327 		sock_net_set(sk, get_net(net));
1328 		atomic_set(&sk->sk_wmem_alloc, 1);
1329 
1330 		sock_update_classid(sk, current);
1331 		sock_update_netprioidx(sk, current);
1332 	}
1333 
1334 	return sk;
1335 }
1336 EXPORT_SYMBOL(sk_alloc);
1337 
1338 static void __sk_free(struct sock *sk)
1339 {
1340 	struct sk_filter *filter;
1341 
1342 	if (sk->sk_destruct)
1343 		sk->sk_destruct(sk);
1344 
1345 	filter = rcu_dereference_check(sk->sk_filter,
1346 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1347 	if (filter) {
1348 		sk_filter_uncharge(sk, filter);
1349 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1350 	}
1351 
1352 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1353 
1354 	if (atomic_read(&sk->sk_omem_alloc))
1355 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1356 			 __func__, atomic_read(&sk->sk_omem_alloc));
1357 
1358 	if (sk->sk_peer_cred)
1359 		put_cred(sk->sk_peer_cred);
1360 	put_pid(sk->sk_peer_pid);
1361 	put_net(sock_net(sk));
1362 	sk_prot_free(sk->sk_prot_creator, sk);
1363 }
1364 
1365 void sk_free(struct sock *sk)
1366 {
1367 	/*
1368 	 * We subtract one from sk_wmem_alloc so that we can tell whether
1369 	 * some packets are still in a tx queue.
1370 	 * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
1371 	 */
1372 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1373 		__sk_free(sk);
1374 }
1375 EXPORT_SYMBOL(sk_free);
1376 
1377 /*
1378  * The last sock_put should drop the reference to sk->sk_net. It has already
1379  * been dropped in sk_change_net. Taking a reference to the stopping namespace
1380  * is not an option.
1381  * Take a reference to the socket to remove it from the hash while still _alive_,
1382  * and after that destroy it in the context of init_net.
1383  */
1384 void sk_release_kernel(struct sock *sk)
1385 {
1386 	if (sk == NULL || sk->sk_socket == NULL)
1387 		return;
1388 
1389 	sock_hold(sk);
1390 	sock_release(sk->sk_socket);
1391 	release_net(sock_net(sk));
1392 	sock_net_set(sk, get_net(&init_net));
1393 	sock_put(sk);
1394 }
1395 EXPORT_SYMBOL(sk_release_kernel);
1396 
1397 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1398 {
1399 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1400 		sock_update_memcg(newsk);
1401 }
1402 
1403 /**
1404  *	sk_clone_lock - clone a socket, and lock its clone
1405  *	@sk: the socket to clone
1406  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1407  *
1408  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1409  */
1410 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1411 {
1412 	struct sock *newsk;
1413 
1414 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1415 	if (newsk != NULL) {
1416 		struct sk_filter *filter;
1417 
1418 		sock_copy(newsk, sk);
1419 
1420 		/* SANITY */
1421 		get_net(sock_net(newsk));
1422 		sk_node_init(&newsk->sk_node);
1423 		sock_lock_init(newsk);
1424 		bh_lock_sock(newsk);
1425 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1426 		newsk->sk_backlog.len = 0;
1427 
1428 		atomic_set(&newsk->sk_rmem_alloc, 0);
1429 		/*
1430 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1431 		 */
1432 		atomic_set(&newsk->sk_wmem_alloc, 1);
1433 		atomic_set(&newsk->sk_omem_alloc, 0);
1434 		skb_queue_head_init(&newsk->sk_receive_queue);
1435 		skb_queue_head_init(&newsk->sk_write_queue);
1436 #ifdef CONFIG_NET_DMA
1437 		skb_queue_head_init(&newsk->sk_async_wait_queue);
1438 #endif
1439 
1440 		spin_lock_init(&newsk->sk_dst_lock);
1441 		rwlock_init(&newsk->sk_callback_lock);
1442 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1443 				af_callback_keys + newsk->sk_family,
1444 				af_family_clock_key_strings[newsk->sk_family]);
1445 
1446 		newsk->sk_dst_cache	= NULL;
1447 		newsk->sk_wmem_queued	= 0;
1448 		newsk->sk_forward_alloc = 0;
1449 		newsk->sk_send_head	= NULL;
1450 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1451 
1452 		sock_reset_flag(newsk, SOCK_DONE);
1453 		skb_queue_head_init(&newsk->sk_error_queue);
1454 
1455 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1456 		if (filter != NULL)
1457 			sk_filter_charge(newsk, filter);
1458 
1459 		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1460 			/* It is still a raw copy of the parent, so invalidate
1461 			 * the destructor and do a plain sk_free() */
1462 			newsk->sk_destruct = NULL;
1463 			bh_unlock_sock(newsk);
1464 			sk_free(newsk);
1465 			newsk = NULL;
1466 			goto out;
1467 		}
1468 
1469 		newsk->sk_err	   = 0;
1470 		newsk->sk_priority = 0;
1471 		/*
1472 		 * Before updating sk_refcnt, we must commit prior changes to memory
1473 		 * (Documentation/RCU/rculist_nulls.txt for details)
1474 		 */
1475 		smp_wmb();
1476 		atomic_set(&newsk->sk_refcnt, 2);
1477 
1478 		/*
1479 		 * Increment the counter in the same struct proto as the master
1480 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1481 		 * is the same as sk->sk_prot->socks, as this field was copied
1482 		 * with memcpy).
1483 		 *
1484 		 * This _changes_ the previous behaviour, where
1485 		 * tcp_create_openreq_child always was incrementing the
1486 		 * tcp_create_openreq_child was always incrementing the
1487 		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1488 		 */
1489 		sk_refcnt_debug_inc(newsk);
1490 		sk_set_socket(newsk, NULL);
1491 		newsk->sk_wq = NULL;
1492 
1493 		sk_update_clone(sk, newsk);
1494 
1495 		if (newsk->sk_prot->sockets_allocated)
1496 			sk_sockets_allocated_inc(newsk);
1497 
1498 		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1499 			net_enable_timestamp();
1500 	}
1501 out:
1502 	return newsk;
1503 }
1504 EXPORT_SYMBOL_GPL(sk_clone_lock);
1505 
1506 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1507 {
1508 	__sk_dst_set(sk, dst);
1509 	sk->sk_route_caps = dst->dev->features;
1510 	if (sk->sk_route_caps & NETIF_F_GSO)
1511 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1512 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1513 	if (sk_can_gso(sk)) {
1514 		if (dst->header_len) {
1515 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1516 		} else {
1517 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1518 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1519 			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1520 		}
1521 	}
1522 }
1523 EXPORT_SYMBOL_GPL(sk_setup_caps);
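
/*
 * Illustrative sketch (hypothetical function name): connect()/output paths
 * typically call sk_setup_caps() right after a successful route lookup, so
 * the socket caches the route and inherits the device's offload
 * capabilities (GSO, scatter-gather, checksumming).
 */
static void example_attach_route(struct sock *sk, struct dst_entry *dst)
{
	sk_setup_caps(sk, dst);
	/* From here on, sk->sk_route_caps reflects dst->dev->features. */
}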
1524 
1525 /*
1526  *	Simple resource managers for sockets.
1527  */
1528 
1529 
1530 /*
1531  * Write buffer destructor automatically called from kfree_skb.
1532  */
1533 void sock_wfree(struct sk_buff *skb)
1534 {
1535 	struct sock *sk = skb->sk;
1536 	unsigned int len = skb->truesize;
1537 
1538 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1539 		/*
1540 		 * Keep a reference on sk_wmem_alloc; it will be released
1541 		 * after the sk_write_space() call.
1542 		 */
1543 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1544 		sk->sk_write_space(sk);
1545 		len = 1;
1546 	}
1547 	/*
1548 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1549 	 * could not do because of in-flight packets
1550 	 */
1551 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1552 		__sk_free(sk);
1553 }
1554 EXPORT_SYMBOL(sock_wfree);
1555 
1556 /*
1557  * Read buffer destructor automatically called from kfree_skb.
1558  */
1559 void sock_rfree(struct sk_buff *skb)
1560 {
1561 	struct sock *sk = skb->sk;
1562 	unsigned int len = skb->truesize;
1563 
1564 	atomic_sub(len, &sk->sk_rmem_alloc);
1565 	sk_mem_uncharge(sk, len);
1566 }
1567 EXPORT_SYMBOL(sock_rfree);
1568 
1569 void sock_edemux(struct sk_buff *skb)
1570 {
1571 	struct sock *sk = skb->sk;
1572 
1573 #ifdef CONFIG_INET
1574 	if (sk->sk_state == TCP_TIME_WAIT)
1575 		inet_twsk_put(inet_twsk(sk));
1576 	else
1577 #endif
1578 		sock_put(sk);
1579 }
1580 EXPORT_SYMBOL(sock_edemux);
1581 
1582 kuid_t sock_i_uid(struct sock *sk)
1583 {
1584 	kuid_t uid;
1585 
1586 	read_lock_bh(&sk->sk_callback_lock);
1587 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1588 	read_unlock_bh(&sk->sk_callback_lock);
1589 	return uid;
1590 }
1591 EXPORT_SYMBOL(sock_i_uid);
1592 
1593 unsigned long sock_i_ino(struct sock *sk)
1594 {
1595 	unsigned long ino;
1596 
1597 	read_lock_bh(&sk->sk_callback_lock);
1598 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1599 	read_unlock_bh(&sk->sk_callback_lock);
1600 	return ino;
1601 }
1602 EXPORT_SYMBOL(sock_i_ino);
1603 
1604 /*
1605  * Allocate a skb from the socket's send buffer.
1606  */
1607 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1608 			     gfp_t priority)
1609 {
1610 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1611 		struct sk_buff *skb = alloc_skb(size, priority);
1612 		if (skb) {
1613 			skb_set_owner_w(skb, sk);
1614 			return skb;
1615 		}
1616 	}
1617 	return NULL;
1618 }
1619 EXPORT_SYMBOL(sock_wmalloc);
1620 
1621 /*
1622  * Allocate a skb from the socket's receive buffer.
1623  */
1624 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1625 			     gfp_t priority)
1626 {
1627 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1628 		struct sk_buff *skb = alloc_skb(size, priority);
1629 		if (skb) {
1630 			skb_set_owner_r(skb, sk);
1631 			return skb;
1632 		}
1633 	}
1634 	return NULL;
1635 }
1636 
1637 /*
1638  * Allocate a memory block from the socket's option memory buffer.
1639  */
1640 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1641 {
1642 	if ((unsigned int)size <= sysctl_optmem_max &&
1643 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1644 		void *mem;
1645 		/* First do the add, to avoid the race if kmalloc
1646 		 * might sleep.
1647 		 */
1648 		atomic_add(size, &sk->sk_omem_alloc);
1649 		mem = kmalloc(size, priority);
1650 		if (mem)
1651 			return mem;
1652 		atomic_sub(size, &sk->sk_omem_alloc);
1653 	}
1654 	return NULL;
1655 }
1656 EXPORT_SYMBOL(sock_kmalloc);
1657 
1658 /*
1659  * Free an option memory block.
1660  */
1661 void sock_kfree_s(struct sock *sk, void *mem, int size)
1662 {
1663 	kfree(mem);
1664 	atomic_sub(size, &sk->sk_omem_alloc);
1665 }
1666 EXPORT_SYMBOL(sock_kfree_s);
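
/*
 * Illustrative sketch of the pairing between the two helpers above (the
 * function name is hypothetical): option memory is charged to sk_omem_alloc
 * on allocation and must be released with sock_kfree_s(), passing the same
 * size back so the charge can be undone.
 */
static int example_stash_option(struct sock *sk, const void *src, int len)
{
	void *opt = sock_kmalloc(sk, len, GFP_KERNEL);

	if (!opt)
		return -ENOBUFS;
	memcpy(opt, src, len);
	/* ... use 'opt' ... then release it with the same length: */
	sock_kfree_s(sk, opt, len);
	return 0;
}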
1667 
1668 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1669    I think these locks should be removed for datagram sockets.
1670  */
1671 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1672 {
1673 	DEFINE_WAIT(wait);
1674 
1675 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1676 	for (;;) {
1677 		if (!timeo)
1678 			break;
1679 		if (signal_pending(current))
1680 			break;
1681 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1682 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1683 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1684 			break;
1685 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1686 			break;
1687 		if (sk->sk_err)
1688 			break;
1689 		timeo = schedule_timeout(timeo);
1690 	}
1691 	finish_wait(sk_sleep(sk), &wait);
1692 	return timeo;
1693 }
1694 
1695 
1696 /*
1697  *	Generic send/receive buffer handlers
1698  */
1699 
1700 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1701 				     unsigned long data_len, int noblock,
1702 				     int *errcode)
1703 {
1704 	struct sk_buff *skb;
1705 	gfp_t gfp_mask;
1706 	long timeo;
1707 	int err;
1708 	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1709 
1710 	err = -EMSGSIZE;
1711 	if (npages > MAX_SKB_FRAGS)
1712 		goto failure;
1713 
1714 	gfp_mask = sk->sk_allocation;
1715 	if (gfp_mask & __GFP_WAIT)
1716 		gfp_mask |= __GFP_REPEAT;
1717 
1718 	timeo = sock_sndtimeo(sk, noblock);
1719 	while (1) {
1720 		err = sock_error(sk);
1721 		if (err != 0)
1722 			goto failure;
1723 
1724 		err = -EPIPE;
1725 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1726 			goto failure;
1727 
1728 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1729 			skb = alloc_skb(header_len, gfp_mask);
1730 			if (skb) {
1731 				int i;
1732 
1733 				/* No pages, we're done... */
1734 				if (!data_len)
1735 					break;
1736 
1737 				skb->truesize += data_len;
1738 				skb_shinfo(skb)->nr_frags = npages;
1739 				for (i = 0; i < npages; i++) {
1740 					struct page *page;
1741 
1742 					page = alloc_pages(sk->sk_allocation, 0);
1743 					if (!page) {
1744 						err = -ENOBUFS;
1745 						skb_shinfo(skb)->nr_frags = i;
1746 						kfree_skb(skb);
1747 						goto failure;
1748 					}
1749 
1750 					__skb_fill_page_desc(skb, i,
1751 							page, 0,
1752 							(data_len >= PAGE_SIZE ?
1753 							 PAGE_SIZE :
1754 							 data_len));
1755 					data_len -= PAGE_SIZE;
1756 				}
1757 
1758 				/* Full success... */
1759 				break;
1760 			}
1761 			err = -ENOBUFS;
1762 			goto failure;
1763 		}
1764 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1765 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1766 		err = -EAGAIN;
1767 		if (!timeo)
1768 			goto failure;
1769 		if (signal_pending(current))
1770 			goto interrupted;
1771 		timeo = sock_wait_for_wmem(sk, timeo);
1772 	}
1773 
1774 	skb_set_owner_w(skb, sk);
1775 	return skb;
1776 
1777 interrupted:
1778 	err = sock_intr_errno(timeo);
1779 failure:
1780 	*errcode = err;
1781 	return NULL;
1782 }
1783 EXPORT_SYMBOL(sock_alloc_send_pskb);
1784 
1785 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1786 				    int noblock, int *errcode)
1787 {
1788 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1789 }
1790 EXPORT_SYMBOL(sock_alloc_send_skb);
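
/*
 * Illustrative sketch of a datagram send path built on the helper above
 * (hypothetical function name; the 128 bytes of headroom are an arbitrary
 * choice): the allocation blocks, subject to the send timeout, until the
 * write allowance permits it, and *err is set on failure.
 */
static struct sk_buff *example_dgram_alloc_skb(struct sock *sk, size_t len,
					       int noblock, int *err)
{
	struct sk_buff *skb = sock_alloc_send_skb(sk, len + 128, noblock, err);

	if (skb)
		skb_reserve(skb, 128);	/* leave room for protocol headers */
	return skb;
}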
1791 
1792 /* On 32bit arches, an skb frag is limited to 2^15 */
1793 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
1794 
1795 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1796 {
1797 	int order;
1798 
1799 	if (pfrag->page) {
1800 		if (atomic_read(&pfrag->page->_count) == 1) {
1801 			pfrag->offset = 0;
1802 			return true;
1803 		}
1804 		if (pfrag->offset < pfrag->size)
1805 			return true;
1806 		put_page(pfrag->page);
1807 	}
1808 
1809 	/* We restrict high order allocations to users that can afford to wait */
1810 	order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1811 
1812 	do {
1813 		gfp_t gfp = sk->sk_allocation;
1814 
1815 		if (order)
1816 			gfp |= __GFP_COMP | __GFP_NOWARN;
1817 		pfrag->page = alloc_pages(gfp, order);
1818 		if (likely(pfrag->page)) {
1819 			pfrag->offset = 0;
1820 			pfrag->size = PAGE_SIZE << order;
1821 			return true;
1822 		}
1823 	} while (--order >= 0);
1824 
1825 	sk_enter_memory_pressure(sk);
1826 	sk_stream_moderate_sndbuf(sk);
1827 	return false;
1828 }
1829 EXPORT_SYMBOL(sk_page_frag_refill);
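
/*
 * Illustrative sketch (hypothetical function name) of the refill-then-copy
 * pattern used by stream senders: refill the page fragment if needed, copy
 * as much as fits, and advance the offset. For brevity this assumes the
 * page is not in highmem.
 */
static int example_copy_to_page_frag(struct sock *sk, struct page_frag *pfrag,
				     const void *data, int len)
{
	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOBUFS;	/* under memory pressure */

	if (len > pfrag->size - pfrag->offset)
		len = pfrag->size - pfrag->offset;

	memcpy(page_address(pfrag->page) + pfrag->offset, data, len);
	pfrag->offset += len;
	return len;
}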
1830 
1831 static void __lock_sock(struct sock *sk)
1832 	__releases(&sk->sk_lock.slock)
1833 	__acquires(&sk->sk_lock.slock)
1834 {
1835 	DEFINE_WAIT(wait);
1836 
1837 	for (;;) {
1838 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1839 					TASK_UNINTERRUPTIBLE);
1840 		spin_unlock_bh(&sk->sk_lock.slock);
1841 		schedule();
1842 		spin_lock_bh(&sk->sk_lock.slock);
1843 		if (!sock_owned_by_user(sk))
1844 			break;
1845 	}
1846 	finish_wait(&sk->sk_lock.wq, &wait);
1847 }
1848 
1849 static void __release_sock(struct sock *sk)
1850 	__releases(&sk->sk_lock.slock)
1851 	__acquires(&sk->sk_lock.slock)
1852 {
1853 	struct sk_buff *skb = sk->sk_backlog.head;
1854 
1855 	do {
1856 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1857 		bh_unlock_sock(sk);
1858 
1859 		do {
1860 			struct sk_buff *next = skb->next;
1861 
1862 			prefetch(next);
1863 			WARN_ON_ONCE(skb_dst_is_noref(skb));
1864 			skb->next = NULL;
1865 			sk_backlog_rcv(sk, skb);
1866 
1867 			/*
1868 			 * We are in process context here with softirqs
1869 			 * disabled; use cond_resched_softirq() to preempt.
1870 			 * This is safe to do because we've taken the backlog
1871 			 * queue private:
1872 			 */
1873 			cond_resched_softirq();
1874 
1875 			skb = next;
1876 		} while (skb != NULL);
1877 
1878 		bh_lock_sock(sk);
1879 	} while ((skb = sk->sk_backlog.head) != NULL);
1880 
1881 	/*
1882 	 * Doing the zeroing here guarantees we cannot loop forever
1883 	 * while a wild producer attempts to flood us.
1884 	 */
1885 	sk->sk_backlog.len = 0;
1886 }
1887 
1888 /**
1889  * sk_wait_data - wait for data to arrive at sk_receive_queue
1890  * @sk:    sock to wait on
1891  * @timeo: for how long
1892  *
1893  * Now socket state including sk->sk_err is changed only under lock,
1894  * hence we may omit checks after joining wait queue.
1895  * We check the receive queue before schedule() only as an optimization;
1896  * it is very likely that release_sock() added new data.
1897  */
1898 int sk_wait_data(struct sock *sk, long *timeo)
1899 {
1900 	int rc;
1901 	DEFINE_WAIT(wait);
1902 
1903 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1904 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1905 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1906 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1907 	finish_wait(sk_sleep(sk), &wait);
1908 	return rc;
1909 }
1910 EXPORT_SYMBOL(sk_wait_data);
1911 
1912 /**
1913  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1914  *	@sk: socket
1915  *	@size: memory size to allocate
1916  *	@kind: allocation type
1917  *
1918  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1919  *	rmem allocation. This function assumes that protocols which have
1920  *	memory_pressure use sk_wmem_queued as write buffer accounting.
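 *
 *	Return: 1 if the charge was accepted (sk_forward_alloc stays increased),
 *	0 if it was not, in which case the accounting changes are undone.
 *
 *	Callers normally go through the sk_wmem_schedule()/sk_rmem_schedule()
 *	wrappers; an illustrative use is:
 *
 *		if (!sk_wmem_schedule(sk, skb->truesize))
 *			goto drop;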
1921  */
1922 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1923 {
1924 	struct proto *prot = sk->sk_prot;
1925 	int amt = sk_mem_pages(size);
1926 	long allocated;
1927 	int parent_status = UNDER_LIMIT;
1928 
1929 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1930 
1931 	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1932 
1933 	/* Under limit. */
1934 	if (parent_status == UNDER_LIMIT &&
1935 			allocated <= sk_prot_mem_limits(sk, 0)) {
1936 		sk_leave_memory_pressure(sk);
1937 		return 1;
1938 	}
1939 
1940 	/* Under pressure. (we or our parents) */
1941 	if ((parent_status > SOFT_LIMIT) ||
1942 			allocated > sk_prot_mem_limits(sk, 1))
1943 		sk_enter_memory_pressure(sk);
1944 
1945 	/* Over hard limit (we or our parents) */
1946 	if ((parent_status == OVER_LIMIT) ||
1947 			(allocated > sk_prot_mem_limits(sk, 2)))
1948 		goto suppress_allocation;
1949 
1950 	/* guarantee minimum buffer size under pressure */
1951 	if (kind == SK_MEM_RECV) {
1952 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1953 			return 1;
1954 
1955 	} else { /* SK_MEM_SEND */
1956 		if (sk->sk_type == SOCK_STREAM) {
1957 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1958 				return 1;
1959 		} else if (atomic_read(&sk->sk_wmem_alloc) <
1960 			   prot->sysctl_wmem[0])
1961 				return 1;
1962 	}
1963 
1964 	if (sk_has_memory_pressure(sk)) {
1965 		int alloc;
1966 
1967 		if (!sk_under_memory_pressure(sk))
1968 			return 1;
1969 		alloc = sk_sockets_allocated_read_positive(sk);
1970 		if (sk_prot_mem_limits(sk, 2) > alloc *
1971 		    sk_mem_pages(sk->sk_wmem_queued +
1972 				 atomic_read(&sk->sk_rmem_alloc) +
1973 				 sk->sk_forward_alloc))
1974 			return 1;
1975 	}
1976 
1977 suppress_allocation:
1978 
1979 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1980 		sk_stream_moderate_sndbuf(sk);
1981 
1982 		/* Fail only if socket is _under_ its sndbuf.
1983 		 * In this case we cannot block, so that we have to fail.
1984 		 */
1985 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1986 			return 1;
1987 	}
1988 
1989 	trace_sock_exceed_buf_limit(sk, prot, allocated);
1990 
1991 	/* Alas. Undo changes. */
1992 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1993 
1994 	sk_memory_allocated_sub(sk, amt);
1995 
1996 	return 0;
1997 }
1998 EXPORT_SYMBOL(__sk_mem_schedule);
1999 
2000 /**
2001  *	__sk_mem_reclaim - reclaim memory_allocated
2002  *	@sk: socket
2003  */
2004 void __sk_mem_reclaim(struct sock *sk)
2005 {
2006 	sk_memory_allocated_sub(sk,
2007 				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2008 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2009 
2010 	if (sk_under_memory_pressure(sk) &&
2011 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2012 		sk_leave_memory_pressure(sk);
2013 }
2014 EXPORT_SYMBOL(__sk_mem_reclaim);
2015 
2016 
2017 /*
2018  * Set of default routines for initialising struct proto_ops when
2019  * the protocol does not support a particular function. In certain
2020  * cases where it makes no sense for a protocol to have a "do nothing"
2021  * function, some default processing is provided.
2022  */
2023 
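/*
 * An illustrative (not from this file) proto_ops table showing how a
 * protocol plugs in these stubs for operations it does not implement:
 *
 *	static const struct proto_ops example_ops = {
 *		.family		= PF_UNSPEC,	/* placeholder family */
 *		.owner		= THIS_MODULE,
 *		.bind		= sock_no_bind,
 *		.connect	= sock_no_connect,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */
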
2024 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2025 {
2026 	return -EOPNOTSUPP;
2027 }
2028 EXPORT_SYMBOL(sock_no_bind);
2029 
2030 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2031 		    int len, int flags)
2032 {
2033 	return -EOPNOTSUPP;
2034 }
2035 EXPORT_SYMBOL(sock_no_connect);
2036 
2037 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2038 {
2039 	return -EOPNOTSUPP;
2040 }
2041 EXPORT_SYMBOL(sock_no_socketpair);
2042 
2043 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2044 {
2045 	return -EOPNOTSUPP;
2046 }
2047 EXPORT_SYMBOL(sock_no_accept);
2048 
2049 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2050 		    int *len, int peer)
2051 {
2052 	return -EOPNOTSUPP;
2053 }
2054 EXPORT_SYMBOL(sock_no_getname);
2055 
2056 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2057 {
2058 	return 0;
2059 }
2060 EXPORT_SYMBOL(sock_no_poll);
2061 
2062 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2063 {
2064 	return -EOPNOTSUPP;
2065 }
2066 EXPORT_SYMBOL(sock_no_ioctl);
2067 
2068 int sock_no_listen(struct socket *sock, int backlog)
2069 {
2070 	return -EOPNOTSUPP;
2071 }
2072 EXPORT_SYMBOL(sock_no_listen);
2073 
2074 int sock_no_shutdown(struct socket *sock, int how)
2075 {
2076 	return -EOPNOTSUPP;
2077 }
2078 EXPORT_SYMBOL(sock_no_shutdown);
2079 
2080 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2081 		    char __user *optval, unsigned int optlen)
2082 {
2083 	return -EOPNOTSUPP;
2084 }
2085 EXPORT_SYMBOL(sock_no_setsockopt);
2086 
2087 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2088 		    char __user *optval, int __user *optlen)
2089 {
2090 	return -EOPNOTSUPP;
2091 }
2092 EXPORT_SYMBOL(sock_no_getsockopt);
2093 
2094 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2095 		    size_t len)
2096 {
2097 	return -EOPNOTSUPP;
2098 }
2099 EXPORT_SYMBOL(sock_no_sendmsg);
2100 
2101 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2102 		    size_t len, int flags)
2103 {
2104 	return -EOPNOTSUPP;
2105 }
2106 EXPORT_SYMBOL(sock_no_recvmsg);
2107 
2108 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2109 {
2110 	/* Mirror missing mmap method error code */
2111 	return -ENODEV;
2112 }
2113 EXPORT_SYMBOL(sock_no_mmap);
2114 
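/*
 * Fallback for protocols without a real sendpage implementation: kmap the
 * page and push the data through kernel_sendmsg() as an ordinary kvec.
 */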
2115 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2116 {
2117 	ssize_t res;
2118 	struct msghdr msg = {.msg_flags = flags};
2119 	struct kvec iov;
2120 	char *kaddr = kmap(page);
2121 	iov.iov_base = kaddr + offset;
2122 	iov.iov_len = size;
2123 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2124 	kunmap(page);
2125 	return res;
2126 }
2127 EXPORT_SYMBOL(sock_no_sendpage);
2128 
2129 /*
2130  *	Default Socket Callbacks
2131  */
2132 
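/*
 * These defaults are installed by sock_init_data() below; protocols that
 * need different behaviour override the individual sk->sk_* callbacks
 * (for example sk->sk_data_ready) after the socket is initialised.
 */
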
2133 static void sock_def_wakeup(struct sock *sk)
2134 {
2135 	struct socket_wq *wq;
2136 
2137 	rcu_read_lock();
2138 	wq = rcu_dereference(sk->sk_wq);
2139 	if (wq_has_sleeper(wq))
2140 		wake_up_interruptible_all(&wq->wait);
2141 	rcu_read_unlock();
2142 }
2143 
2144 static void sock_def_error_report(struct sock *sk)
2145 {
2146 	struct socket_wq *wq;
2147 
2148 	rcu_read_lock();
2149 	wq = rcu_dereference(sk->sk_wq);
2150 	if (wq_has_sleeper(wq))
2151 		wake_up_interruptible_poll(&wq->wait, POLLERR);
2152 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2153 	rcu_read_unlock();
2154 }
2155 
2156 static void sock_def_readable(struct sock *sk, int len)
2157 {
2158 	struct socket_wq *wq;
2159 
2160 	rcu_read_lock();
2161 	wq = rcu_dereference(sk->sk_wq);
2162 	if (wq_has_sleeper(wq))
2163 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2164 						POLLRDNORM | POLLRDBAND);
2165 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2166 	rcu_read_unlock();
2167 }
2168 
2169 static void sock_def_write_space(struct sock *sk)
2170 {
2171 	struct socket_wq *wq;
2172 
2173 	rcu_read_lock();
2174 
2175 	/* Do not wake up a writer until he can make "significant"
2176 	 * progress.  --DaveM
2177 	 */
2178 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2179 		wq = rcu_dereference(sk->sk_wq);
2180 		if (wq_has_sleeper(wq))
2181 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2182 						POLLWRNORM | POLLWRBAND);
2183 
2184 		/* Should agree with poll, otherwise some programs break */
2185 		if (sock_writeable(sk))
2186 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2187 	}
2188 
2189 	rcu_read_unlock();
2190 }
2191 
2192 static void sock_def_destruct(struct sock *sk)
2193 {
2194 	kfree(sk->sk_protinfo);
2195 }
2196 
2197 void sk_send_sigurg(struct sock *sk)
2198 {
2199 	if (sk->sk_socket && sk->sk_socket->file)
2200 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2201 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2202 }
2203 EXPORT_SYMBOL(sk_send_sigurg);
2204 
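/*
 * Socket timer helpers: sk_reset_timer() takes a reference on the socket
 * when it arms a timer that was not already pending, and sk_stop_timer()
 * drops that reference when it cancels a pending timer, so the socket
 * cannot be freed while a timer against it is outstanding.
 */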
2205 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2206 		    unsigned long expires)
2207 {
2208 	if (!mod_timer(timer, expires))
2209 		sock_hold(sk);
2210 }
2211 EXPORT_SYMBOL(sk_reset_timer);
2212 
2213 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2214 {
2215 	if (timer_pending(timer) && del_timer(timer))
2216 		__sock_put(sk);
2217 }
2218 EXPORT_SYMBOL(sk_stop_timer);
2219 
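/*
 * Initialise a freshly allocated struct sock with empty queues, default
 * buffer sizes (sysctl_rmem_default/sysctl_wmem_default), "infinite"
 * timeouts and the default callbacks defined above.  Protocol families
 * call this after sk_alloc(), typically from their create() handler.
 */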
2220 void sock_init_data(struct socket *sock, struct sock *sk)
2221 {
2222 	skb_queue_head_init(&sk->sk_receive_queue);
2223 	skb_queue_head_init(&sk->sk_write_queue);
2224 	skb_queue_head_init(&sk->sk_error_queue);
2225 #ifdef CONFIG_NET_DMA
2226 	skb_queue_head_init(&sk->sk_async_wait_queue);
2227 #endif
2228 
2229 	sk->sk_send_head	=	NULL;
2230 
2231 	init_timer(&sk->sk_timer);
2232 
2233 	sk->sk_allocation	=	GFP_KERNEL;
2234 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2235 	sk->sk_sndbuf		=	sysctl_wmem_default;
2236 	sk->sk_state		=	TCP_CLOSE;
2237 	sk_set_socket(sk, sock);
2238 
2239 	sock_set_flag(sk, SOCK_ZAPPED);
2240 
2241 	if (sock) {
2242 		sk->sk_type	=	sock->type;
2243 		sk->sk_wq	=	sock->wq;
2244 		sock->sk	=	sk;
2245 	} else
2246 		sk->sk_wq	=	NULL;
2247 
2248 	spin_lock_init(&sk->sk_dst_lock);
2249 	rwlock_init(&sk->sk_callback_lock);
2250 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2251 			af_callback_keys + sk->sk_family,
2252 			af_family_clock_key_strings[sk->sk_family]);
2253 
2254 	sk->sk_state_change	=	sock_def_wakeup;
2255 	sk->sk_data_ready	=	sock_def_readable;
2256 	sk->sk_write_space	=	sock_def_write_space;
2257 	sk->sk_error_report	=	sock_def_error_report;
2258 	sk->sk_destruct		=	sock_def_destruct;
2259 
2260 	sk->sk_frag.page	=	NULL;
2261 	sk->sk_frag.offset	=	0;
2262 	sk->sk_peek_off		=	-1;
2263 
2264 	sk->sk_peer_pid 	=	NULL;
2265 	sk->sk_peer_cred	=	NULL;
2266 	sk->sk_write_pending	=	0;
2267 	sk->sk_rcvlowat		=	1;
2268 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2269 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2270 
2271 	sk->sk_stamp = ktime_set(-1L, 0);
2272 
2273 	/*
2274 	 * Before updating sk_refcnt, we must commit prior changes to memory
2275 	 * (Documentation/RCU/rculist_nulls.txt for details)
2276 	 */
2277 	smp_wmb();
2278 	atomic_set(&sk->sk_refcnt, 1);
2279 	atomic_set(&sk->sk_drops, 0);
2280 }
2281 EXPORT_SYMBOL(sock_init_data);
2282 
2283 void lock_sock_nested(struct sock *sk, int subclass)
2284 {
2285 	might_sleep();
2286 	spin_lock_bh(&sk->sk_lock.slock);
2287 	if (sk->sk_lock.owned)
2288 		__lock_sock(sk);
2289 	sk->sk_lock.owned = 1;
2290 	spin_unlock(&sk->sk_lock.slock);
2291 	/*
2292 	 * The sk_lock has mutex_lock() semantics here:
2293 	 */
2294 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2295 	local_bh_enable();
2296 }
2297 EXPORT_SYMBOL(lock_sock_nested);
2298 
2299 void release_sock(struct sock *sk)
2300 {
2301 	/*
2302 	 * The sk_lock has mutex_unlock() semantics:
2303 	 */
2304 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2305 
2306 	spin_lock_bh(&sk->sk_lock.slock);
2307 	if (sk->sk_backlog.tail)
2308 		__release_sock(sk);
2309 
2310 	if (sk->sk_prot->release_cb)
2311 		sk->sk_prot->release_cb(sk);
2312 
2313 	sk->sk_lock.owned = 0;
2314 	if (waitqueue_active(&sk->sk_lock.wq))
2315 		wake_up(&sk->sk_lock.wq);
2316 	spin_unlock_bh(&sk->sk_lock.slock);
2317 }
2318 EXPORT_SYMBOL(release_sock);
2319 
2320 /**
2321  * lock_sock_fast - fast version of lock_sock
2322  * @sk: socket
2323  *
2324  * This version should be used for very small sections, where the process won't block.
2325  * Returns false if the fast path is taken:
2326  *   sk_lock.slock locked, owned = 0, BH disabled
2327  * Returns true if the slow path is taken:
2328  *   sk_lock.slock unlocked, owned = 1, BH enabled
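 *
 * The fast path is undone with unlock_sock_fast(); an illustrative caller:
 *
 *	bool slow = lock_sock_fast(sk);
 *	...
 *	unlock_sock_fast(sk, slow);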
2329  */
2330 bool lock_sock_fast(struct sock *sk)
2331 {
2332 	might_sleep();
2333 	spin_lock_bh(&sk->sk_lock.slock);
2334 
2335 	if (!sk->sk_lock.owned)
2336 		/*
2337 		 * Note: fast path returns with slock held and BH disabled
2338 		 */
2339 		return false;
2340 
2341 	__lock_sock(sk);
2342 	sk->sk_lock.owned = 1;
2343 	spin_unlock(&sk->sk_lock.slock);
2344 	/*
2345 	 * The sk_lock has mutex_lock() semantics here:
2346 	 */
2347 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2348 	local_bh_enable();
2349 	return true;
2350 }
2351 EXPORT_SYMBOL(lock_sock_fast);
2352 
2353 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2354 {
2355 	struct timeval tv;
2356 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2357 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2358 	tv = ktime_to_timeval(sk->sk_stamp);
2359 	if (tv.tv_sec == -1)
2360 		return -ENOENT;
2361 	if (tv.tv_sec == 0) {
2362 		sk->sk_stamp = ktime_get_real();
2363 		tv = ktime_to_timeval(sk->sk_stamp);
2364 	}
2365 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2366 }
2367 EXPORT_SYMBOL(sock_get_timestamp);
2368 
2369 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2370 {
2371 	struct timespec ts;
2372 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2373 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2374 	ts = ktime_to_timespec(sk->sk_stamp);
2375 	if (ts.tv_sec == -1)
2376 		return -ENOENT;
2377 	if (ts.tv_sec == 0) {
2378 		sk->sk_stamp = ktime_get_real();
2379 		ts = ktime_to_timespec(sk->sk_stamp);
2380 	}
2381 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2382 }
2383 EXPORT_SYMBOL(sock_get_timestampns);
2384 
2385 void sock_enable_timestamp(struct sock *sk, int flag)
2386 {
2387 	if (!sock_flag(sk, flag)) {
2388 		unsigned long previous_flags = sk->sk_flags;
2389 
2390 		sock_set_flag(sk, flag);
2391 		/*
2392 		 * We just set one of the two flags that require net
2393 		 * time stamping, but time stamping might have been on
2394 		 * already because of the other one.
2395 		 */
2396 		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2397 			net_enable_timestamp();
2398 	}
2399 }
2400 
2401 /*
2402  *	Get a socket option on a socket.
2403  *
2404  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2405  *	asynchronous errors should be reported by getsockopt. We assume
2406  *	this means if you specify SO_ERROR (otherwise what's the point of it?).
2407  */
2408 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2409 			   char __user *optval, int __user *optlen)
2410 {
2411 	struct sock *sk = sock->sk;
2412 
2413 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2414 }
2415 EXPORT_SYMBOL(sock_common_getsockopt);
2416 
2417 #ifdef CONFIG_COMPAT
2418 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2419 				  char __user *optval, int __user *optlen)
2420 {
2421 	struct sock *sk = sock->sk;
2422 
2423 	if (sk->sk_prot->compat_getsockopt != NULL)
2424 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2425 						      optval, optlen);
2426 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2427 }
2428 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2429 #endif
2430 
2431 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2432 			struct msghdr *msg, size_t size, int flags)
2433 {
2434 	struct sock *sk = sock->sk;
2435 	int addr_len = 0;
2436 	int err;
2437 
2438 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2439 				   flags & ~MSG_DONTWAIT, &addr_len);
2440 	if (err >= 0)
2441 		msg->msg_namelen = addr_len;
2442 	return err;
2443 }
2444 EXPORT_SYMBOL(sock_common_recvmsg);
2445 
2446 /*
2447  *	Set socket options on an inet socket.
2448  */
2449 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2450 			   char __user *optval, unsigned int optlen)
2451 {
2452 	struct sock *sk = sock->sk;
2453 
2454 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2455 }
2456 EXPORT_SYMBOL(sock_common_setsockopt);
2457 
2458 #ifdef CONFIG_COMPAT
2459 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2460 				  char __user *optval, unsigned int optlen)
2461 {
2462 	struct sock *sk = sock->sk;
2463 
2464 	if (sk->sk_prot->compat_setsockopt != NULL)
2465 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2466 						      optval, optlen);
2467 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2468 }
2469 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2470 #endif
2471 
2472 void sk_common_release(struct sock *sk)
2473 {
2474 	if (sk->sk_prot->destroy)
2475 		sk->sk_prot->destroy(sk);
2476 
2477 	/*
2478 	 * Observation: when sk_common_release is called, processes have
2479 	 * no access to the socket, but the network stack still does.
2480 	 * Step one, detach it from networking:
2481 	 *
2482 	 * A. Remove from hash tables.
2483 	 */
2484 
2485 	sk->sk_prot->unhash(sk);
2486 
2487 	/*
2488 	 * At this point the socket cannot receive new packets, but it is possible
2489 	 * that some packets are still in flight because some CPU is running the
2490 	 * receiver and did the hash table lookup before we unhashed the socket.
2491 	 * They will reach the receive queue and will be purged by the socket destructor.
2492 	 *
2493 	 * Also, we still have packets pending on the receive queue and, probably,
2494 	 * our own packets waiting in device queues. sock_destroy will drain the
2495 	 * receive queue, but transmitted packets will delay socket destruction
2496 	 * until the last reference is released.
2497 	 */
2498 
2499 	sock_orphan(sk);
2500 
2501 	xfrm_sk_free_policy(sk);
2502 
2503 	sk_refcnt_debug_release(sk);
2504 
2505 	if (sk->sk_frag.page) {
2506 		put_page(sk->sk_frag.page);
2507 		sk->sk_frag.page = NULL;
2508 	}
2509 
2510 	sock_put(sk);
2511 }
2512 EXPORT_SYMBOL(sk_common_release);
2513 
2514 #ifdef CONFIG_PROC_FS
2515 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
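/*
 * Per-cpu counters of sockets in use for each registered protocol; they
 * back the "sockets" column of /proc/net/protocols (see proto_seq_printf()
 * below).
 */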
2516 struct prot_inuse {
2517 	int val[PROTO_INUSE_NR];
2518 };
2519 
2520 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2521 
2522 #ifdef CONFIG_NET_NS
2523 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2524 {
2525 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2526 }
2527 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2528 
2529 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2530 {
2531 	int cpu, idx = prot->inuse_idx;
2532 	int res = 0;
2533 
2534 	for_each_possible_cpu(cpu)
2535 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2536 
2537 	return res >= 0 ? res : 0;
2538 }
2539 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2540 
2541 static int __net_init sock_inuse_init_net(struct net *net)
2542 {
2543 	net->core.inuse = alloc_percpu(struct prot_inuse);
2544 	return net->core.inuse ? 0 : -ENOMEM;
2545 }
2546 
2547 static void __net_exit sock_inuse_exit_net(struct net *net)
2548 {
2549 	free_percpu(net->core.inuse);
2550 }
2551 
2552 static struct pernet_operations net_inuse_ops = {
2553 	.init = sock_inuse_init_net,
2554 	.exit = sock_inuse_exit_net,
2555 };
2556 
2557 static __init int net_inuse_init(void)
2558 {
2559 	if (register_pernet_subsys(&net_inuse_ops))
2560 		panic("Cannot initialize net inuse counters");
2561 
2562 	return 0;
2563 }
2564 
2565 core_initcall(net_inuse_init);
2566 #else
2567 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2568 
2569 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2570 {
2571 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2572 }
2573 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2574 
2575 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2576 {
2577 	int cpu, idx = prot->inuse_idx;
2578 	int res = 0;
2579 
2580 	for_each_possible_cpu(cpu)
2581 		res += per_cpu(prot_inuse, cpu).val[idx];
2582 
2583 	return res >= 0 ? res : 0;
2584 }
2585 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2586 #endif
2587 
2588 static void assign_proto_idx(struct proto *prot)
2589 {
2590 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2591 
2592 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2593 		pr_err("PROTO_INUSE_NR exhausted\n");
2594 		return;
2595 	}
2596 
2597 	set_bit(prot->inuse_idx, proto_inuse_idx);
2598 }
2599 
2600 static void release_proto_idx(struct proto *prot)
2601 {
2602 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2603 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2604 }
2605 #else
2606 static inline void assign_proto_idx(struct proto *prot)
2607 {
2608 }
2609 
2610 static inline void release_proto_idx(struct proto *prot)
2611 {
2612 }
2613 #endif
2614 
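/*
 * An illustrative (hypothetical) registration of a protocol, requesting a
 * dedicated slab cache for its sockets:
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);
 *	...
 *	proto_unregister(&example_proto);
 */
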
2615 int proto_register(struct proto *prot, int alloc_slab)
2616 {
2617 	if (alloc_slab) {
2618 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2619 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2620 					NULL);
2621 
2622 		if (prot->slab == NULL) {
2623 			pr_crit("%s: Can't create sock SLAB cache!\n",
2624 				prot->name);
2625 			goto out;
2626 		}
2627 
2628 		if (prot->rsk_prot != NULL) {
2629 			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2630 			if (prot->rsk_prot->slab_name == NULL)
2631 				goto out_free_sock_slab;
2632 
2633 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2634 								 prot->rsk_prot->obj_size, 0,
2635 								 SLAB_HWCACHE_ALIGN, NULL);
2636 
2637 			if (prot->rsk_prot->slab == NULL) {
2638 				pr_crit("%s: Can't create request sock SLAB cache!\n",
2639 					prot->name);
2640 				goto out_free_request_sock_slab_name;
2641 			}
2642 		}
2643 
2644 		if (prot->twsk_prot != NULL) {
2645 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2646 
2647 			if (prot->twsk_prot->twsk_slab_name == NULL)
2648 				goto out_free_request_sock_slab;
2649 
2650 			prot->twsk_prot->twsk_slab =
2651 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2652 						  prot->twsk_prot->twsk_obj_size,
2653 						  0,
2654 						  SLAB_HWCACHE_ALIGN |
2655 							prot->slab_flags,
2656 						  NULL);
2657 			if (prot->twsk_prot->twsk_slab == NULL)
2658 				goto out_free_timewait_sock_slab_name;
2659 		}
2660 	}
2661 
2662 	mutex_lock(&proto_list_mutex);
2663 	list_add(&prot->node, &proto_list);
2664 	assign_proto_idx(prot);
2665 	mutex_unlock(&proto_list_mutex);
2666 	return 0;
2667 
2668 out_free_timewait_sock_slab_name:
2669 	kfree(prot->twsk_prot->twsk_slab_name);
2670 out_free_request_sock_slab:
2671 	if (prot->rsk_prot && prot->rsk_prot->slab) {
2672 		kmem_cache_destroy(prot->rsk_prot->slab);
2673 		prot->rsk_prot->slab = NULL;
2674 	}
2675 out_free_request_sock_slab_name:
2676 	if (prot->rsk_prot)
2677 		kfree(prot->rsk_prot->slab_name);
2678 out_free_sock_slab:
2679 	kmem_cache_destroy(prot->slab);
2680 	prot->slab = NULL;
2681 out:
2682 	return -ENOBUFS;
2683 }
2684 EXPORT_SYMBOL(proto_register);
2685 
2686 void proto_unregister(struct proto *prot)
2687 {
2688 	mutex_lock(&proto_list_mutex);
2689 	release_proto_idx(prot);
2690 	list_del(&prot->node);
2691 	mutex_unlock(&proto_list_mutex);
2692 
2693 	if (prot->slab != NULL) {
2694 		kmem_cache_destroy(prot->slab);
2695 		prot->slab = NULL;
2696 	}
2697 
2698 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2699 		kmem_cache_destroy(prot->rsk_prot->slab);
2700 		kfree(prot->rsk_prot->slab_name);
2701 		prot->rsk_prot->slab = NULL;
2702 	}
2703 
2704 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2705 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2706 		kfree(prot->twsk_prot->twsk_slab_name);
2707 		prot->twsk_prot->twsk_slab = NULL;
2708 	}
2709 }
2710 EXPORT_SYMBOL(proto_unregister);
2711 
2712 #ifdef CONFIG_PROC_FS
2713 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2714 	__acquires(proto_list_mutex)
2715 {
2716 	mutex_lock(&proto_list_mutex);
2717 	return seq_list_start_head(&proto_list, *pos);
2718 }
2719 
2720 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2721 {
2722 	return seq_list_next(v, &proto_list, pos);
2723 }
2724 
2725 static void proto_seq_stop(struct seq_file *seq, void *v)
2726 	__releases(proto_list_mutex)
2727 {
2728 	mutex_unlock(&proto_list_mutex);
2729 }
2730 
2731 static char proto_method_implemented(const void *method)
2732 {
2733 	return method == NULL ? 'n' : 'y';
2734 }
2735 static long sock_prot_memory_allocated(struct proto *proto)
2736 {
2737 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2738 }
2739 
2740 static char *sock_prot_memory_pressure(struct proto *proto)
2741 {
2742 	return proto->memory_pressure != NULL ?
2743 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2744 }
2745 
2746 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2747 {
2748 
2749 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2750 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2751 		   proto->name,
2752 		   proto->obj_size,
2753 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2754 		   sock_prot_memory_allocated(proto),
2755 		   sock_prot_memory_pressure(proto),
2756 		   proto->max_header,
2757 		   proto->slab == NULL ? "no" : "yes",
2758 		   module_name(proto->owner),
2759 		   proto_method_implemented(proto->close),
2760 		   proto_method_implemented(proto->connect),
2761 		   proto_method_implemented(proto->disconnect),
2762 		   proto_method_implemented(proto->accept),
2763 		   proto_method_implemented(proto->ioctl),
2764 		   proto_method_implemented(proto->init),
2765 		   proto_method_implemented(proto->destroy),
2766 		   proto_method_implemented(proto->shutdown),
2767 		   proto_method_implemented(proto->setsockopt),
2768 		   proto_method_implemented(proto->getsockopt),
2769 		   proto_method_implemented(proto->sendmsg),
2770 		   proto_method_implemented(proto->recvmsg),
2771 		   proto_method_implemented(proto->sendpage),
2772 		   proto_method_implemented(proto->bind),
2773 		   proto_method_implemented(proto->backlog_rcv),
2774 		   proto_method_implemented(proto->hash),
2775 		   proto_method_implemented(proto->unhash),
2776 		   proto_method_implemented(proto->get_port),
2777 		   proto_method_implemented(proto->enter_memory_pressure));
2778 }
2779 
2780 static int proto_seq_show(struct seq_file *seq, void *v)
2781 {
2782 	if (v == &proto_list)
2783 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2784 			   "protocol",
2785 			   "size",
2786 			   "sockets",
2787 			   "memory",
2788 			   "press",
2789 			   "maxhdr",
2790 			   "slab",
2791 			   "module",
2792 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2793 	else
2794 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2795 	return 0;
2796 }
2797 
2798 static const struct seq_operations proto_seq_ops = {
2799 	.start  = proto_seq_start,
2800 	.next   = proto_seq_next,
2801 	.stop   = proto_seq_stop,
2802 	.show   = proto_seq_show,
2803 };
2804 
2805 static int proto_seq_open(struct inode *inode, struct file *file)
2806 {
2807 	return seq_open_net(inode, file, &proto_seq_ops,
2808 			    sizeof(struct seq_net_private));
2809 }
2810 
2811 static const struct file_operations proto_seq_fops = {
2812 	.owner		= THIS_MODULE,
2813 	.open		= proto_seq_open,
2814 	.read		= seq_read,
2815 	.llseek		= seq_lseek,
2816 	.release	= seq_release_net,
2817 };
2818 
2819 static __net_init int proto_init_net(struct net *net)
2820 {
2821 	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2822 		return -ENOMEM;
2823 
2824 	return 0;
2825 }
2826 
2827 static __net_exit void proto_exit_net(struct net *net)
2828 {
2829 	proc_net_remove(net, "protocols");
2830 }
2831 
2832 
2833 static __net_initdata struct pernet_operations proto_net_ops = {
2834 	.init = proto_init_net,
2835 	.exit = proto_exit_net,
2836 };
2837 
2838 static int __init proto_init(void)
2839 {
2840 	return register_pernet_subsys(&proto_net_ops);
2841 }
2842 
2843 subsys_initcall(proto_init);
2844 
2845 #endif /* PROC_FS */
2846