xref: /openbmc/linux/net/core/sock.c (revision 5104d265)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines: memory allocators, socket lock/release
7  *		handlers for protocols to use, and a generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/types.h>
97 #include <linux/socket.h>
98 #include <linux/in.h>
99 #include <linux/kernel.h>
100 #include <linux/module.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/sched.h>
104 #include <linux/timer.h>
105 #include <linux/string.h>
106 #include <linux/sockios.h>
107 #include <linux/net.h>
108 #include <linux/mm.h>
109 #include <linux/slab.h>
110 #include <linux/interrupt.h>
111 #include <linux/poll.h>
112 #include <linux/tcp.h>
113 #include <linux/init.h>
114 #include <linux/highmem.h>
115 #include <linux/user_namespace.h>
116 #include <linux/static_key.h>
117 #include <linux/memcontrol.h>
118 #include <linux/prefetch.h>
119 
120 #include <asm/uaccess.h>
121 
122 #include <linux/netdevice.h>
123 #include <net/protocol.h>
124 #include <linux/skbuff.h>
125 #include <net/net_namespace.h>
126 #include <net/request_sock.h>
127 #include <net/sock.h>
128 #include <linux/net_tstamp.h>
129 #include <net/xfrm.h>
130 #include <linux/ipsec.h>
131 #include <net/cls_cgroup.h>
132 #include <net/netprio_cgroup.h>
133 
134 #include <linux/filter.h>
135 
136 #include <trace/events/sock.h>
137 
138 #ifdef CONFIG_INET
139 #include <net/tcp.h>
140 #endif
141 
142 #include <net/busy_poll.h>
143 
144 static DEFINE_MUTEX(proto_list_mutex);
145 static LIST_HEAD(proto_list);
146 
147 #ifdef CONFIG_MEMCG_KMEM
148 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
149 {
150 	struct proto *proto;
151 	int ret = 0;
152 
153 	mutex_lock(&proto_list_mutex);
154 	list_for_each_entry(proto, &proto_list, node) {
155 		if (proto->init_cgroup) {
156 			ret = proto->init_cgroup(memcg, ss);
157 			if (ret)
158 				goto out;
159 		}
160 	}
161 
162 	mutex_unlock(&proto_list_mutex);
163 	return ret;
164 out:
165 	list_for_each_entry_continue_reverse(proto, &proto_list, node)
166 		if (proto->destroy_cgroup)
167 			proto->destroy_cgroup(memcg);
168 	mutex_unlock(&proto_list_mutex);
169 	return ret;
170 }
171 
172 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
173 {
174 	struct proto *proto;
175 
176 	mutex_lock(&proto_list_mutex);
177 	list_for_each_entry_reverse(proto, &proto_list, node)
178 		if (proto->destroy_cgroup)
179 			proto->destroy_cgroup(memcg);
180 	mutex_unlock(&proto_list_mutex);
181 }
182 #endif
183 
184 /*
185  * Each address family might have different locking rules, so we have
186  * one slock key per address family:
187  */
188 static struct lock_class_key af_family_keys[AF_MAX];
189 static struct lock_class_key af_family_slock_keys[AF_MAX];
190 
191 #if defined(CONFIG_MEMCG_KMEM)
192 struct static_key memcg_socket_limit_enabled;
193 EXPORT_SYMBOL(memcg_socket_limit_enabled);
194 #endif
195 
196 /*
197  * Make lock validator output more readable. (we pre-construct these
198  * strings build-time, so that runtime initialization of socket
199  * locks is fast):
200  */
201 static const char *const af_family_key_strings[AF_MAX+1] = {
202   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
203   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
204   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
205   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
206   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
207   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
208   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
209   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
210   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
211   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
212   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
213   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
214   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
215   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
216 };
217 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
218   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
219   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
220   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
221   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
222   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
223   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
224   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
225   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
226   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
227   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
228   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
229   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
230   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
231   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
232 };
233 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
234   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
235   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
236   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
237   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
238   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
239   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
240   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
241   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
242   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
243   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
244   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
245   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
246   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
247   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
248 };
249 
250 /*
251  * sk_callback_lock locking rules are per-address-family,
252  * so split the lock classes by using a per-AF key:
253  */
254 static struct lock_class_key af_callback_keys[AF_MAX];
255 
256 /* Take into consideration the size of the struct sk_buff overhead in the
257  * determination of these values, since that is non-constant across
258  * platforms.  This makes socket queueing behavior and performance
259  * not depend upon such differences.
260  */
261 #define _SK_MEM_PACKETS		256
262 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
263 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
264 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
265 
266 /* Run time adjustable parameters. */
267 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
268 EXPORT_SYMBOL(sysctl_wmem_max);
269 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
270 EXPORT_SYMBOL(sysctl_rmem_max);
271 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
272 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
273 
274 /* Maximal space eaten by iovec or ancillary data plus some space */
275 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
276 EXPORT_SYMBOL(sysctl_optmem_max);
277 
278 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
279 EXPORT_SYMBOL_GPL(memalloc_socks);
280 
281 /**
282  * sk_set_memalloc - sets %SOCK_MEMALLOC
283  * @sk: socket to set it on
284  *
285  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
286  * It's the responsibility of the admin to adjust min_free_kbytes
287  * to meet the requirements.
288  */
289 void sk_set_memalloc(struct sock *sk)
290 {
291 	sock_set_flag(sk, SOCK_MEMALLOC);
292 	sk->sk_allocation |= __GFP_MEMALLOC;
293 	static_key_slow_inc(&memalloc_socks);
294 }
295 EXPORT_SYMBOL_GPL(sk_set_memalloc);
296 
297 void sk_clear_memalloc(struct sock *sk)
298 {
299 	sock_reset_flag(sk, SOCK_MEMALLOC);
300 	sk->sk_allocation &= ~__GFP_MEMALLOC;
301 	static_key_slow_dec(&memalloc_socks);
302 
303 	/*
304 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
305 	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
306 	 * it has rmem allocations there is a risk that the user of the
307 	 * socket cannot make forward progress due to exceeding the rmem
308 	 * limits. By rights, sk_clear_memalloc() should only be called
309 	 * on sockets being torn down but warn and reset the accounting if
310 	 * that assumption breaks.
311 	 */
312 	if (WARN_ON(sk->sk_forward_alloc))
313 		sk_mem_reclaim(sk);
314 }
315 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
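
/*
 * Editor's illustrative sketch (not part of the original file): a minimal,
 * hypothetical user of the pair above, e.g. a swap-over-network transport
 * whose socket must keep making progress under memory pressure.  All names
 * below are assumptions for the example.
 *
 *	static void example_reserve_sock(struct sock *sk)
 *	{
 *		sk_set_memalloc(sk);	(socket may now dip into emergency reserves)
 *	}
 *
 *	static void example_unreserve_sock(struct sock *sk)
 *	{
 *		sk_clear_memalloc(sk);	(only on teardown, see the warning above)
 *	}
 */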
316 
317 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
318 {
319 	int ret;
320 	unsigned long pflags = current->flags;
321 
322 	/* these should have been dropped before queueing */
323 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
324 
325 	current->flags |= PF_MEMALLOC;
326 	ret = sk->sk_backlog_rcv(sk, skb);
327 	tsk_restore_flags(current, pflags, PF_MEMALLOC);
328 
329 	return ret;
330 }
331 EXPORT_SYMBOL(__sk_backlog_rcv);
332 
333 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
334 {
335 	struct timeval tv;
336 
337 	if (optlen < sizeof(tv))
338 		return -EINVAL;
339 	if (copy_from_user(&tv, optval, sizeof(tv)))
340 		return -EFAULT;
341 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
342 		return -EDOM;
343 
344 	if (tv.tv_sec < 0) {
345 		static int warned __read_mostly;
346 
347 		*timeo_p = 0;
348 		if (warned < 10 && net_ratelimit()) {
349 			warned++;
350 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
351 				__func__, current->comm, task_pid_nr(current));
352 		}
353 		return 0;
354 	}
355 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
356 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
357 		return 0;
358 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
359 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
360 	return 0;
361 }
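
/*
 * Worked example (editor's addition): with HZ == 1000, a user-supplied
 * timeval of { .tv_sec = 2, .tv_usec = 500000 } is converted above into
 *
 *	2 * HZ + (500000 + (1000000/HZ - 1)) / (1000000/HZ)
 *		= 2000 + 500999 / 1000 = 2500 jiffies,
 *
 * i.e. the microseconds are rounded up to whole jiffies.  A timeval of
 * { 0, 0 } leaves *timeo_p at MAX_SCHEDULE_TIMEOUT, meaning "wait forever".
 */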
362 
363 static void sock_warn_obsolete_bsdism(const char *name)
364 {
365 	static int warned;
366 	static char warncomm[TASK_COMM_LEN];
367 	if (strcmp(warncomm, current->comm) && warned < 5) {
368 		strcpy(warncomm,  current->comm);
369 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
370 			warncomm, name);
371 		warned++;
372 	}
373 }
374 
375 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
376 
377 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
378 {
379 	if (sk->sk_flags & flags) {
380 		sk->sk_flags &= ~flags;
381 		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
382 			net_disable_timestamp();
383 	}
384 }
385 
386 
387 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
388 {
389 	int err;
390 	int skb_len;
391 	unsigned long flags;
392 	struct sk_buff_head *list = &sk->sk_receive_queue;
393 
394 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
395 		atomic_inc(&sk->sk_drops);
396 		trace_sock_rcvqueue_full(sk, skb);
397 		return -ENOMEM;
398 	}
399 
400 	err = sk_filter(sk, skb);
401 	if (err)
402 		return err;
403 
404 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
405 		atomic_inc(&sk->sk_drops);
406 		return -ENOBUFS;
407 	}
408 
409 	skb->dev = NULL;
410 	skb_set_owner_r(skb, sk);
411 
412 	/* Cache the SKB length before we tack it onto the receive
413 	 * queue.  Once it is added it no longer belongs to us and
414 	 * may be freed by other threads of control pulling packets
415 	 * from the queue.
416 	 */
417 	skb_len = skb->len;
418 
419 	/* We escape from the RCU-protected region here, so make sure we
420 	 * don't leak a non-refcounted dst.
421 	 */
422 	skb_dst_force(skb);
423 
424 	spin_lock_irqsave(&list->lock, flags);
425 	skb->dropcount = atomic_read(&sk->sk_drops);
426 	__skb_queue_tail(list, skb);
427 	spin_unlock_irqrestore(&list->lock, flags);
428 
429 	if (!sock_flag(sk, SOCK_DEAD))
430 		sk->sk_data_ready(sk, skb_len);
431 	return 0;
432 }
433 EXPORT_SYMBOL(sock_queue_rcv_skb);
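
/*
 * Editor's illustrative sketch (not from the original file): how a simple
 * protocol receive handler might hand an skb to the generic queueing
 * helper above.  The handler name is an assumption for the example.
 *
 *	static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		int err = sock_queue_rcv_skb(sk, skb);
 *
 *		if (err < 0)	(rcvbuf full, filter drop, or no rmem left)
 *			kfree_skb(skb);
 *		return err;
 *	}
 */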
434 
435 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
436 {
437 	int rc = NET_RX_SUCCESS;
438 
439 	if (sk_filter(sk, skb))
440 		goto discard_and_relse;
441 
442 	skb->dev = NULL;
443 
444 	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
445 		atomic_inc(&sk->sk_drops);
446 		goto discard_and_relse;
447 	}
448 	if (nested)
449 		bh_lock_sock_nested(sk);
450 	else
451 		bh_lock_sock(sk);
452 	if (!sock_owned_by_user(sk)) {
453 		/*
454 		 * trylock + unlock semantics:
455 		 */
456 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
457 
458 		rc = sk_backlog_rcv(sk, skb);
459 
460 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
461 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
462 		bh_unlock_sock(sk);
463 		atomic_inc(&sk->sk_drops);
464 		goto discard_and_relse;
465 	}
466 
467 	bh_unlock_sock(sk);
468 out:
469 	sock_put(sk);
470 	return rc;
471 discard_and_relse:
472 	kfree_skb(skb);
473 	goto out;
474 }
475 EXPORT_SYMBOL(sk_receive_skb);
476 
477 void sk_reset_txq(struct sock *sk)
478 {
479 	sk_tx_queue_clear(sk);
480 }
481 EXPORT_SYMBOL(sk_reset_txq);
482 
483 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
484 {
485 	struct dst_entry *dst = __sk_dst_get(sk);
486 
487 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
488 		sk_tx_queue_clear(sk);
489 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
490 		dst_release(dst);
491 		return NULL;
492 	}
493 
494 	return dst;
495 }
496 EXPORT_SYMBOL(__sk_dst_check);
497 
498 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
499 {
500 	struct dst_entry *dst = sk_dst_get(sk);
501 
502 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
503 		sk_dst_reset(sk);
504 		dst_release(dst);
505 		return NULL;
506 	}
507 
508 	return dst;
509 }
510 EXPORT_SYMBOL(sk_dst_check);
511 
512 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
513 				int optlen)
514 {
515 	int ret = -ENOPROTOOPT;
516 #ifdef CONFIG_NETDEVICES
517 	struct net *net = sock_net(sk);
518 	char devname[IFNAMSIZ];
519 	int index;
520 
521 	/* Sorry... */
522 	ret = -EPERM;
523 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
524 		goto out;
525 
526 	ret = -EINVAL;
527 	if (optlen < 0)
528 		goto out;
529 
530 	/* Bind this socket to a particular device like "eth0",
531 	 * as specified in the passed interface name. If the
532 	 * name is "" or the option length is zero the socket
533 	 * is not bound.
534 	 */
535 	if (optlen > IFNAMSIZ - 1)
536 		optlen = IFNAMSIZ - 1;
537 	memset(devname, 0, sizeof(devname));
538 
539 	ret = -EFAULT;
540 	if (copy_from_user(devname, optval, optlen))
541 		goto out;
542 
543 	index = 0;
544 	if (devname[0] != '\0') {
545 		struct net_device *dev;
546 
547 		rcu_read_lock();
548 		dev = dev_get_by_name_rcu(net, devname);
549 		if (dev)
550 			index = dev->ifindex;
551 		rcu_read_unlock();
552 		ret = -ENODEV;
553 		if (!dev)
554 			goto out;
555 	}
556 
557 	lock_sock(sk);
558 	sk->sk_bound_dev_if = index;
559 	sk_dst_reset(sk);
560 	release_sock(sk);
561 
562 	ret = 0;
563 
564 out:
565 #endif
566 
567 	return ret;
568 }
569 
570 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
571 				int __user *optlen, int len)
572 {
573 	int ret = -ENOPROTOOPT;
574 #ifdef CONFIG_NETDEVICES
575 	struct net *net = sock_net(sk);
576 	char devname[IFNAMSIZ];
577 
578 	if (sk->sk_bound_dev_if == 0) {
579 		len = 0;
580 		goto zero;
581 	}
582 
583 	ret = -EINVAL;
584 	if (len < IFNAMSIZ)
585 		goto out;
586 
587 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
588 	if (ret)
589 		goto out;
590 
591 	len = strlen(devname) + 1;
592 
593 	ret = -EFAULT;
594 	if (copy_to_user(optval, devname, len))
595 		goto out;
596 
597 zero:
598 	ret = -EFAULT;
599 	if (put_user(len, optlen))
600 		goto out;
601 
602 	ret = 0;
603 
604 out:
605 #endif
606 
607 	return ret;
608 }
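
/*
 * Editor's illustrative sketch (not part of the original file): the
 * userspace view of the two helpers above.  Binding needs CAP_NET_RAW in
 * the socket's network namespace; "eth0" and fd are assumptions.
 *
 *	char name[IFNAMSIZ] = "eth0";
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, name, strlen(name) + 1);
 *
 *	socklen_t len = sizeof(name);
 *	getsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, name, &len);
 *	(len comes back as 0 if the socket is not bound to a device)
 */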
609 
610 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
611 {
612 	if (valbool)
613 		sock_set_flag(sk, bit);
614 	else
615 		sock_reset_flag(sk, bit);
616 }
617 
618 /*
619  *	This is meant for all protocols to use and covers goings on
620  *	at the socket level. Everything here is generic.
621  */
622 
623 int sock_setsockopt(struct socket *sock, int level, int optname,
624 		    char __user *optval, unsigned int optlen)
625 {
626 	struct sock *sk = sock->sk;
627 	int val;
628 	int valbool;
629 	struct linger ling;
630 	int ret = 0;
631 
632 	/*
633 	 *	Options that do not take a plain integer value are handled first.
634 	 */
635 
636 	if (optname == SO_BINDTODEVICE)
637 		return sock_setbindtodevice(sk, optval, optlen);
638 
639 	if (optlen < sizeof(int))
640 		return -EINVAL;
641 
642 	if (get_user(val, (int __user *)optval))
643 		return -EFAULT;
644 
645 	valbool = val ? 1 : 0;
646 
647 	lock_sock(sk);
648 
649 	switch (optname) {
650 	case SO_DEBUG:
651 		if (val && !capable(CAP_NET_ADMIN))
652 			ret = -EACCES;
653 		else
654 			sock_valbool_flag(sk, SOCK_DBG, valbool);
655 		break;
656 	case SO_REUSEADDR:
657 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
658 		break;
659 	case SO_REUSEPORT:
660 		sk->sk_reuseport = valbool;
661 		break;
662 	case SO_TYPE:
663 	case SO_PROTOCOL:
664 	case SO_DOMAIN:
665 	case SO_ERROR:
666 		ret = -ENOPROTOOPT;
667 		break;
668 	case SO_DONTROUTE:
669 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
670 		break;
671 	case SO_BROADCAST:
672 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
673 		break;
674 	case SO_SNDBUF:
675 		/* Don't return an error on this; BSD doesn't, and if you
676 		 * think about it, this is right. Otherwise apps would have
677 		 * to play 'guess the biggest size' games. RCVBUF/SNDBUF are
678 		 * treated as hints in BSD.
679 		 */
680 		val = min_t(u32, val, sysctl_wmem_max);
681 set_sndbuf:
682 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
683 		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
684 		/* Wake up sending tasks if we upped the value. */
685 		sk->sk_write_space(sk);
686 		break;
687 
688 	case SO_SNDBUFFORCE:
689 		if (!capable(CAP_NET_ADMIN)) {
690 			ret = -EPERM;
691 			break;
692 		}
693 		goto set_sndbuf;
694 
695 	case SO_RCVBUF:
696 		/* Don't return an error on this; BSD doesn't, and if you
697 		 * think about it, this is right. Otherwise apps would have
698 		 * to play 'guess the biggest size' games. RCVBUF/SNDBUF are
699 		 * treated as hints in BSD.
700 		 */
701 		val = min_t(u32, val, sysctl_rmem_max);
702 set_rcvbuf:
703 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
704 		/*
705 		 * We double it on the way in to account for
706 		 * "struct sk_buff" etc. overhead.   Applications
707 		 * assume that the SO_RCVBUF setting they make will
708 		 * allow that much actual data to be received on that
709 		 * socket.
710 		 *
711 		 * Applications are unaware that "struct sk_buff" and
712 		 * other overheads allocate from the receive buffer
713 		 * during socket buffer allocation.
714 		 *
715 		 * And after considering the possible alternatives,
716 		 * returning the value we actually used in getsockopt
717 		 * is the most desirable behavior.
718 		 */
719 		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
720 		break;
721 
722 	case SO_RCVBUFFORCE:
723 		if (!capable(CAP_NET_ADMIN)) {
724 			ret = -EPERM;
725 			break;
726 		}
727 		goto set_rcvbuf;
728 
729 	case SO_KEEPALIVE:
730 #ifdef CONFIG_INET
731 		if (sk->sk_protocol == IPPROTO_TCP &&
732 		    sk->sk_type == SOCK_STREAM)
733 			tcp_set_keepalive(sk, valbool);
734 #endif
735 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
736 		break;
737 
738 	case SO_OOBINLINE:
739 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
740 		break;
741 
742 	case SO_NO_CHECK:
743 		sk->sk_no_check = valbool;
744 		break;
745 
746 	case SO_PRIORITY:
747 		if ((val >= 0 && val <= 6) ||
748 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
749 			sk->sk_priority = val;
750 		else
751 			ret = -EPERM;
752 		break;
753 
754 	case SO_LINGER:
755 		if (optlen < sizeof(ling)) {
756 			ret = -EINVAL;	/* 1003.1g */
757 			break;
758 		}
759 		if (copy_from_user(&ling, optval, sizeof(ling))) {
760 			ret = -EFAULT;
761 			break;
762 		}
763 		if (!ling.l_onoff)
764 			sock_reset_flag(sk, SOCK_LINGER);
765 		else {
766 #if (BITS_PER_LONG == 32)
767 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
768 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
769 			else
770 #endif
771 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
772 			sock_set_flag(sk, SOCK_LINGER);
773 		}
774 		break;
775 
776 	case SO_BSDCOMPAT:
777 		sock_warn_obsolete_bsdism("setsockopt");
778 		break;
779 
780 	case SO_PASSCRED:
781 		if (valbool)
782 			set_bit(SOCK_PASSCRED, &sock->flags);
783 		else
784 			clear_bit(SOCK_PASSCRED, &sock->flags);
785 		break;
786 
787 	case SO_TIMESTAMP:
788 	case SO_TIMESTAMPNS:
789 		if (valbool)  {
790 			if (optname == SO_TIMESTAMP)
791 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
792 			else
793 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
794 			sock_set_flag(sk, SOCK_RCVTSTAMP);
795 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
796 		} else {
797 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
798 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
799 		}
800 		break;
801 
802 	case SO_TIMESTAMPING:
803 		if (val & ~SOF_TIMESTAMPING_MASK) {
804 			ret = -EINVAL;
805 			break;
806 		}
807 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
808 				  val & SOF_TIMESTAMPING_TX_HARDWARE);
809 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
810 				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
811 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
812 				  val & SOF_TIMESTAMPING_RX_HARDWARE);
813 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
814 			sock_enable_timestamp(sk,
815 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
816 		else
817 			sock_disable_timestamp(sk,
818 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
819 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
820 				  val & SOF_TIMESTAMPING_SOFTWARE);
821 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
822 				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
823 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
824 				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
825 		break;
826 
827 	case SO_RCVLOWAT:
828 		if (val < 0)
829 			val = INT_MAX;
830 		sk->sk_rcvlowat = val ? : 1;
831 		break;
832 
833 	case SO_RCVTIMEO:
834 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
835 		break;
836 
837 	case SO_SNDTIMEO:
838 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
839 		break;
840 
841 	case SO_ATTACH_FILTER:
842 		ret = -EINVAL;
843 		if (optlen == sizeof(struct sock_fprog)) {
844 			struct sock_fprog fprog;
845 
846 			ret = -EFAULT;
847 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
848 				break;
849 
850 			ret = sk_attach_filter(&fprog, sk);
851 		}
852 		break;
853 
854 	case SO_DETACH_FILTER:
855 		ret = sk_detach_filter(sk);
856 		break;
857 
858 	case SO_LOCK_FILTER:
859 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
860 			ret = -EPERM;
861 		else
862 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
863 		break;
864 
865 	case SO_PASSSEC:
866 		if (valbool)
867 			set_bit(SOCK_PASSSEC, &sock->flags);
868 		else
869 			clear_bit(SOCK_PASSSEC, &sock->flags);
870 		break;
871 	case SO_MARK:
872 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
873 			ret = -EPERM;
874 		else
875 			sk->sk_mark = val;
876 		break;
877 
878 		/* We implement SO_SNDLOWAT etc. as not settable
879 		 * (1003.1g 5.3). */
880 	case SO_RXQ_OVFL:
881 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
882 		break;
883 
884 	case SO_WIFI_STATUS:
885 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
886 		break;
887 
888 	case SO_PEEK_OFF:
889 		if (sock->ops->set_peek_off)
890 			sock->ops->set_peek_off(sk, val);
891 		else
892 			ret = -EOPNOTSUPP;
893 		break;
894 
895 	case SO_NOFCS:
896 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
897 		break;
898 
899 	case SO_SELECT_ERR_QUEUE:
900 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
901 		break;
902 
903 #ifdef CONFIG_NET_LL_RX_POLL
904 	case SO_BUSY_POLL:
905 		/* allow unprivileged users to decrease the value */
906 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
907 			ret = -EPERM;
908 		else {
909 			if (val < 0)
910 				ret = -EINVAL;
911 			else
912 				sk->sk_ll_usec = val;
913 		}
914 		break;
915 #endif
916 	default:
917 		ret = -ENOPROTOOPT;
918 		break;
919 	}
920 	release_sock(sk);
921 	return ret;
922 }
923 EXPORT_SYMBOL(sock_setsockopt);
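
/*
 * Editor's illustrative note (not part of the original file): as the
 * SO_SNDBUF/SO_RCVBUF comments above explain, the value a process sets is
 * doubled internally, so a later getsockopt() reports roughly twice the
 * request.  A hedged userspace sketch:
 *
 *	int req = 65536, eff;
 *	socklen_t len = sizeof(eff);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
 *	(eff is typically 2 * req; requests above net.core.rmem_max are clamped first)
 */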
924 
925 
926 void cred_to_ucred(struct pid *pid, const struct cred *cred,
927 		   struct ucred *ucred)
928 {
929 	ucred->pid = pid_vnr(pid);
930 	ucred->uid = ucred->gid = -1;
931 	if (cred) {
932 		struct user_namespace *current_ns = current_user_ns();
933 
934 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
935 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
936 	}
937 }
938 EXPORT_SYMBOL_GPL(cred_to_ucred);
939 
940 int sock_getsockopt(struct socket *sock, int level, int optname,
941 		    char __user *optval, int __user *optlen)
942 {
943 	struct sock *sk = sock->sk;
944 
945 	union {
946 		int val;
947 		struct linger ling;
948 		struct timeval tm;
949 	} v;
950 
951 	int lv = sizeof(int);
952 	int len;
953 
954 	if (get_user(len, optlen))
955 		return -EFAULT;
956 	if (len < 0)
957 		return -EINVAL;
958 
959 	memset(&v, 0, sizeof(v));
960 
961 	switch (optname) {
962 	case SO_DEBUG:
963 		v.val = sock_flag(sk, SOCK_DBG);
964 		break;
965 
966 	case SO_DONTROUTE:
967 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
968 		break;
969 
970 	case SO_BROADCAST:
971 		v.val = sock_flag(sk, SOCK_BROADCAST);
972 		break;
973 
974 	case SO_SNDBUF:
975 		v.val = sk->sk_sndbuf;
976 		break;
977 
978 	case SO_RCVBUF:
979 		v.val = sk->sk_rcvbuf;
980 		break;
981 
982 	case SO_REUSEADDR:
983 		v.val = sk->sk_reuse;
984 		break;
985 
986 	case SO_REUSEPORT:
987 		v.val = sk->sk_reuseport;
988 		break;
989 
990 	case SO_KEEPALIVE:
991 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
992 		break;
993 
994 	case SO_TYPE:
995 		v.val = sk->sk_type;
996 		break;
997 
998 	case SO_PROTOCOL:
999 		v.val = sk->sk_protocol;
1000 		break;
1001 
1002 	case SO_DOMAIN:
1003 		v.val = sk->sk_family;
1004 		break;
1005 
1006 	case SO_ERROR:
1007 		v.val = -sock_error(sk);
1008 		if (v.val == 0)
1009 			v.val = xchg(&sk->sk_err_soft, 0);
1010 		break;
1011 
1012 	case SO_OOBINLINE:
1013 		v.val = sock_flag(sk, SOCK_URGINLINE);
1014 		break;
1015 
1016 	case SO_NO_CHECK:
1017 		v.val = sk->sk_no_check;
1018 		break;
1019 
1020 	case SO_PRIORITY:
1021 		v.val = sk->sk_priority;
1022 		break;
1023 
1024 	case SO_LINGER:
1025 		lv		= sizeof(v.ling);
1026 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1027 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1028 		break;
1029 
1030 	case SO_BSDCOMPAT:
1031 		sock_warn_obsolete_bsdism("getsockopt");
1032 		break;
1033 
1034 	case SO_TIMESTAMP:
1035 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1036 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1037 		break;
1038 
1039 	case SO_TIMESTAMPNS:
1040 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1041 		break;
1042 
1043 	case SO_TIMESTAMPING:
1044 		v.val = 0;
1045 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1046 			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1047 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1048 			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1049 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1050 			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1051 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1052 			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1053 		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1054 			v.val |= SOF_TIMESTAMPING_SOFTWARE;
1055 		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
1056 			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
1057 		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1058 			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1059 		break;
1060 
1061 	case SO_RCVTIMEO:
1062 		lv = sizeof(struct timeval);
1063 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1064 			v.tm.tv_sec = 0;
1065 			v.tm.tv_usec = 0;
1066 		} else {
1067 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1068 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1069 		}
1070 		break;
1071 
1072 	case SO_SNDTIMEO:
1073 		lv = sizeof(struct timeval);
1074 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1075 			v.tm.tv_sec = 0;
1076 			v.tm.tv_usec = 0;
1077 		} else {
1078 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1079 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1080 		}
1081 		break;
1082 
1083 	case SO_RCVLOWAT:
1084 		v.val = sk->sk_rcvlowat;
1085 		break;
1086 
1087 	case SO_SNDLOWAT:
1088 		v.val = 1;
1089 		break;
1090 
1091 	case SO_PASSCRED:
1092 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1093 		break;
1094 
1095 	case SO_PEERCRED:
1096 	{
1097 		struct ucred peercred;
1098 		if (len > sizeof(peercred))
1099 			len = sizeof(peercred);
1100 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1101 		if (copy_to_user(optval, &peercred, len))
1102 			return -EFAULT;
1103 		goto lenout;
1104 	}
1105 
1106 	case SO_PEERNAME:
1107 	{
1108 		char address[128];
1109 
1110 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1111 			return -ENOTCONN;
1112 		if (lv < len)
1113 			return -EINVAL;
1114 		if (copy_to_user(optval, address, len))
1115 			return -EFAULT;
1116 		goto lenout;
1117 	}
1118 
1119 	/* Dubious BSD thing... Probably nobody even uses it, but
1120 	 * the UNIX standard wants it for whatever reason... -DaveM
1121 	 */
1122 	case SO_ACCEPTCONN:
1123 		v.val = sk->sk_state == TCP_LISTEN;
1124 		break;
1125 
1126 	case SO_PASSSEC:
1127 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1128 		break;
1129 
1130 	case SO_PEERSEC:
1131 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1132 
1133 	case SO_MARK:
1134 		v.val = sk->sk_mark;
1135 		break;
1136 
1137 	case SO_RXQ_OVFL:
1138 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1139 		break;
1140 
1141 	case SO_WIFI_STATUS:
1142 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1143 		break;
1144 
1145 	case SO_PEEK_OFF:
1146 		if (!sock->ops->set_peek_off)
1147 			return -EOPNOTSUPP;
1148 
1149 		v.val = sk->sk_peek_off;
1150 		break;
1151 	case SO_NOFCS:
1152 		v.val = sock_flag(sk, SOCK_NOFCS);
1153 		break;
1154 
1155 	case SO_BINDTODEVICE:
1156 		return sock_getbindtodevice(sk, optval, optlen, len);
1157 
1158 	case SO_GET_FILTER:
1159 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1160 		if (len < 0)
1161 			return len;
1162 
1163 		goto lenout;
1164 
1165 	case SO_LOCK_FILTER:
1166 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1167 		break;
1168 
1169 	case SO_SELECT_ERR_QUEUE:
1170 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1171 		break;
1172 
1173 #ifdef CONFIG_NET_LL_RX_POLL
1174 	case SO_BUSY_POLL:
1175 		v.val = sk->sk_ll_usec;
1176 		break;
1177 #endif
1178 
1179 	default:
1180 		return -ENOPROTOOPT;
1181 	}
1182 
1183 	if (len > lv)
1184 		len = lv;
1185 	if (copy_to_user(optval, &v, len))
1186 		return -EFAULT;
1187 lenout:
1188 	if (put_user(len, optlen))
1189 		return -EFAULT;
1190 	return 0;
1191 }
1192 
1193 /*
1194  * Initialize an sk_lock.
1195  *
1196  * (We also register the sk_lock with the lock validator.)
1197  */
1198 static inline void sock_lock_init(struct sock *sk)
1199 {
1200 	sock_lock_init_class_and_name(sk,
1201 			af_family_slock_key_strings[sk->sk_family],
1202 			af_family_slock_keys + sk->sk_family,
1203 			af_family_key_strings[sk->sk_family],
1204 			af_family_keys + sk->sk_family);
1205 }
1206 
1207 /*
1208  * Copy all fields from osk to nsk, but nsk->sk_refcnt must not change yet,
1209  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1210  * We must not copy the fields between sk_dontcopy_begin and sk_dontcopy_end.
1211  */
1212 static void sock_copy(struct sock *nsk, const struct sock *osk)
1213 {
1214 #ifdef CONFIG_SECURITY_NETWORK
1215 	void *sptr = nsk->sk_security;
1216 #endif
1217 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1218 
1219 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1220 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1221 
1222 #ifdef CONFIG_SECURITY_NETWORK
1223 	nsk->sk_security = sptr;
1224 	security_sk_clone(osk, nsk);
1225 #endif
1226 }
1227 
1228 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1229 {
1230 	unsigned long nulls1, nulls2;
1231 
1232 	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1233 	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1234 	if (nulls1 > nulls2)
1235 		swap(nulls1, nulls2);
1236 
1237 	if (nulls1 != 0)
1238 		memset((char *)sk, 0, nulls1);
1239 	memset((char *)sk + nulls1 + sizeof(void *), 0,
1240 	       nulls2 - nulls1 - sizeof(void *));
1241 	memset((char *)sk + nulls2 + sizeof(void *), 0,
1242 	       size - nulls2 - sizeof(void *));
1243 }
1244 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1245 
1246 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1247 		int family)
1248 {
1249 	struct sock *sk;
1250 	struct kmem_cache *slab;
1251 
1252 	slab = prot->slab;
1253 	if (slab != NULL) {
1254 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1255 		if (!sk)
1256 			return sk;
1257 		if (priority & __GFP_ZERO) {
1258 			if (prot->clear_sk)
1259 				prot->clear_sk(sk, prot->obj_size);
1260 			else
1261 				sk_prot_clear_nulls(sk, prot->obj_size);
1262 		}
1263 	} else
1264 		sk = kmalloc(prot->obj_size, priority);
1265 
1266 	if (sk != NULL) {
1267 		kmemcheck_annotate_bitfield(sk, flags);
1268 
1269 		if (security_sk_alloc(sk, family, priority))
1270 			goto out_free;
1271 
1272 		if (!try_module_get(prot->owner))
1273 			goto out_free_sec;
1274 		sk_tx_queue_clear(sk);
1275 	}
1276 
1277 	return sk;
1278 
1279 out_free_sec:
1280 	security_sk_free(sk);
1281 out_free:
1282 	if (slab != NULL)
1283 		kmem_cache_free(slab, sk);
1284 	else
1285 		kfree(sk);
1286 	return NULL;
1287 }
1288 
1289 static void sk_prot_free(struct proto *prot, struct sock *sk)
1290 {
1291 	struct kmem_cache *slab;
1292 	struct module *owner;
1293 
1294 	owner = prot->owner;
1295 	slab = prot->slab;
1296 
1297 	security_sk_free(sk);
1298 	if (slab != NULL)
1299 		kmem_cache_free(slab, sk);
1300 	else
1301 		kfree(sk);
1302 	module_put(owner);
1303 }
1304 
1305 #if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1306 void sock_update_classid(struct sock *sk)
1307 {
1308 	u32 classid;
1309 
1310 	classid = task_cls_classid(current);
1311 	if (classid != sk->sk_classid)
1312 		sk->sk_classid = classid;
1313 }
1314 EXPORT_SYMBOL(sock_update_classid);
1315 #endif
1316 
1317 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1318 void sock_update_netprioidx(struct sock *sk)
1319 {
1320 	if (in_interrupt())
1321 		return;
1322 
1323 	sk->sk_cgrp_prioidx = task_netprioidx(current);
1324 }
1325 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1326 #endif
1327 
1328 /**
1329  *	sk_alloc - All socket objects are allocated here
1330  *	@net: the applicable net namespace
1331  *	@family: protocol family
1332  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1333  *	@prot: struct proto associated with this new sock instance
1334  */
1335 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1336 		      struct proto *prot)
1337 {
1338 	struct sock *sk;
1339 
1340 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1341 	if (sk) {
1342 		sk->sk_family = family;
1343 		/*
1344 		 * See comment in struct sock definition to understand
1345 		 * why we need sk_prot_creator -acme
1346 		 */
1347 		sk->sk_prot = sk->sk_prot_creator = prot;
1348 		sock_lock_init(sk);
1349 		sock_net_set(sk, get_net(net));
1350 		atomic_set(&sk->sk_wmem_alloc, 1);
1351 
1352 		sock_update_classid(sk);
1353 		sock_update_netprioidx(sk);
1354 	}
1355 
1356 	return sk;
1357 }
1358 EXPORT_SYMBOL(sk_alloc);
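
/*
 * Editor's illustrative sketch (not from the original file): the usual
 * shape of a protocol family ->create() handler allocating its sock with
 * sk_alloc().  "example_proto" is an assumption; a real handler also wires
 * up the struct socket and protocol-private state afterwards.
 *
 *	static int example_create(struct net *net, struct socket *sock,
 *				  int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_INET, GFP_KERNEL, &example_proto);
 *		if (!sk)
 *			return -ENOBUFS;
 *		(initialize sock <-> sk linkage and private data here)
 *		return 0;
 *	}
 */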
1359 
1360 static void __sk_free(struct sock *sk)
1361 {
1362 	struct sk_filter *filter;
1363 
1364 	if (sk->sk_destruct)
1365 		sk->sk_destruct(sk);
1366 
1367 	filter = rcu_dereference_check(sk->sk_filter,
1368 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1369 	if (filter) {
1370 		sk_filter_uncharge(sk, filter);
1371 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1372 	}
1373 
1374 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1375 
1376 	if (atomic_read(&sk->sk_omem_alloc))
1377 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1378 			 __func__, atomic_read(&sk->sk_omem_alloc));
1379 
1380 	if (sk->sk_peer_cred)
1381 		put_cred(sk->sk_peer_cred);
1382 	put_pid(sk->sk_peer_pid);
1383 	put_net(sock_net(sk));
1384 	sk_prot_free(sk->sk_prot_creator, sk);
1385 }
1386 
1387 void sk_free(struct sock *sk)
1388 {
1389 	/*
1390 	 * We subtract one from sk_wmem_alloc; if the result is not zero,
1391 	 * some packets are still in a tx queue and sock_wfree() will
1392 	 * call __sk_free(sk) later.
1393 	 */
1394 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1395 		__sk_free(sk);
1396 }
1397 EXPORT_SYMBOL(sk_free);
1398 
1399 /*
1400  * The last sock_put should drop the reference to sk->sk_net. That reference
1401  * has already been dropped in sk_change_net, and taking a reference to the
1402  * stopping namespace is not an option.
1403  * Instead, take a reference to the socket to remove it from the hash while
1404  * still _alive_, and then destroy it in the context of init_net.
1405  */
1406 void sk_release_kernel(struct sock *sk)
1407 {
1408 	if (sk == NULL || sk->sk_socket == NULL)
1409 		return;
1410 
1411 	sock_hold(sk);
1412 	sock_release(sk->sk_socket);
1413 	release_net(sock_net(sk));
1414 	sock_net_set(sk, get_net(&init_net));
1415 	sock_put(sk);
1416 }
1417 EXPORT_SYMBOL(sk_release_kernel);
1418 
1419 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1420 {
1421 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1422 		sock_update_memcg(newsk);
1423 }
1424 
1425 /**
1426  *	sk_clone_lock - clone a socket, and lock its clone
1427  *	@sk: the socket to clone
1428  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1429  *
1430  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1431  */
1432 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1433 {
1434 	struct sock *newsk;
1435 
1436 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1437 	if (newsk != NULL) {
1438 		struct sk_filter *filter;
1439 
1440 		sock_copy(newsk, sk);
1441 
1442 		/* SANITY */
1443 		get_net(sock_net(newsk));
1444 		sk_node_init(&newsk->sk_node);
1445 		sock_lock_init(newsk);
1446 		bh_lock_sock(newsk);
1447 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1448 		newsk->sk_backlog.len = 0;
1449 
1450 		atomic_set(&newsk->sk_rmem_alloc, 0);
1451 		/*
1452 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1453 		 */
1454 		atomic_set(&newsk->sk_wmem_alloc, 1);
1455 		atomic_set(&newsk->sk_omem_alloc, 0);
1456 		skb_queue_head_init(&newsk->sk_receive_queue);
1457 		skb_queue_head_init(&newsk->sk_write_queue);
1458 #ifdef CONFIG_NET_DMA
1459 		skb_queue_head_init(&newsk->sk_async_wait_queue);
1460 #endif
1461 
1462 		spin_lock_init(&newsk->sk_dst_lock);
1463 		rwlock_init(&newsk->sk_callback_lock);
1464 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1465 				af_callback_keys + newsk->sk_family,
1466 				af_family_clock_key_strings[newsk->sk_family]);
1467 
1468 		newsk->sk_dst_cache	= NULL;
1469 		newsk->sk_wmem_queued	= 0;
1470 		newsk->sk_forward_alloc = 0;
1471 		newsk->sk_send_head	= NULL;
1472 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1473 
1474 		sock_reset_flag(newsk, SOCK_DONE);
1475 		skb_queue_head_init(&newsk->sk_error_queue);
1476 
1477 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1478 		if (filter != NULL)
1479 			sk_filter_charge(newsk, filter);
1480 
1481 		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1482 			/* It is still a raw copy of the parent, so invalidate
1483 			 * the destructor and do a plain sk_free() */
1484 			newsk->sk_destruct = NULL;
1485 			bh_unlock_sock(newsk);
1486 			sk_free(newsk);
1487 			newsk = NULL;
1488 			goto out;
1489 		}
1490 
1491 		newsk->sk_err	   = 0;
1492 		newsk->sk_priority = 0;
1493 		/*
1494 		 * Before updating sk_refcnt, we must commit prior changes to memory
1495 		 * (Documentation/RCU/rculist_nulls.txt for details)
1496 		 */
1497 		smp_wmb();
1498 		atomic_set(&newsk->sk_refcnt, 2);
1499 
1500 		/*
1501 		 * Increment the counter in the same struct proto as the master
1502 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, which
1503 		 * is the same as sk->sk_prot->socks, as this field was copied
1504 		 * with memcpy).
1505 		 *
1506 		 * This _changes_ the previous behaviour, where
1507 		 * tcp_create_openreq_child always incremented the
1508 		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1509 		 * to be taken into account in all callers. -acme
1510 		 */
1511 		sk_refcnt_debug_inc(newsk);
1512 		sk_set_socket(newsk, NULL);
1513 		newsk->sk_wq = NULL;
1514 
1515 		sk_update_clone(sk, newsk);
1516 
1517 		if (newsk->sk_prot->sockets_allocated)
1518 			sk_sockets_allocated_inc(newsk);
1519 
1520 		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1521 			net_enable_timestamp();
1522 	}
1523 out:
1524 	return newsk;
1525 }
1526 EXPORT_SYMBOL_GPL(sk_clone_lock);
1527 
1528 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1529 {
1530 	__sk_dst_set(sk, dst);
1531 	sk->sk_route_caps = dst->dev->features;
1532 	if (sk->sk_route_caps & NETIF_F_GSO)
1533 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1534 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1535 	if (sk_can_gso(sk)) {
1536 		if (dst->header_len) {
1537 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1538 		} else {
1539 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1540 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1541 			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1542 		}
1543 	}
1544 }
1545 EXPORT_SYMBOL_GPL(sk_setup_caps);
1546 
1547 /*
1548  *	Simple resource managers for sockets.
1549  */
1550 
1551 
1552 /*
1553  * Write buffer destructor automatically called from kfree_skb.
1554  */
1555 void sock_wfree(struct sk_buff *skb)
1556 {
1557 	struct sock *sk = skb->sk;
1558 	unsigned int len = skb->truesize;
1559 
1560 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1561 		/*
1562 		 * Keep a reference on sk_wmem_alloc; it will be released
1563 		 * after the sk_write_space() call.
1564 		 */
1565 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1566 		sk->sk_write_space(sk);
1567 		len = 1;
1568 	}
1569 	/*
1570 	 * If sk_wmem_alloc reaches 0, we must finish what sk_free()
1571 	 * could not do because of in-flight packets.
1572 	 */
1573 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1574 		__sk_free(sk);
1575 }
1576 EXPORT_SYMBOL(sock_wfree);
1577 
1578 /*
1579  * Read buffer destructor automatically called from kfree_skb.
1580  */
1581 void sock_rfree(struct sk_buff *skb)
1582 {
1583 	struct sock *sk = skb->sk;
1584 	unsigned int len = skb->truesize;
1585 
1586 	atomic_sub(len, &sk->sk_rmem_alloc);
1587 	sk_mem_uncharge(sk, len);
1588 }
1589 EXPORT_SYMBOL(sock_rfree);
1590 
1591 void sock_edemux(struct sk_buff *skb)
1592 {
1593 	struct sock *sk = skb->sk;
1594 
1595 #ifdef CONFIG_INET
1596 	if (sk->sk_state == TCP_TIME_WAIT)
1597 		inet_twsk_put(inet_twsk(sk));
1598 	else
1599 #endif
1600 		sock_put(sk);
1601 }
1602 EXPORT_SYMBOL(sock_edemux);
1603 
1604 kuid_t sock_i_uid(struct sock *sk)
1605 {
1606 	kuid_t uid;
1607 
1608 	read_lock_bh(&sk->sk_callback_lock);
1609 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1610 	read_unlock_bh(&sk->sk_callback_lock);
1611 	return uid;
1612 }
1613 EXPORT_SYMBOL(sock_i_uid);
1614 
1615 unsigned long sock_i_ino(struct sock *sk)
1616 {
1617 	unsigned long ino;
1618 
1619 	read_lock_bh(&sk->sk_callback_lock);
1620 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1621 	read_unlock_bh(&sk->sk_callback_lock);
1622 	return ino;
1623 }
1624 EXPORT_SYMBOL(sock_i_ino);
1625 
1626 /*
1627  * Allocate an skb from the socket's send buffer.
1628  */
1629 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1630 			     gfp_t priority)
1631 {
1632 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1633 		struct sk_buff *skb = alloc_skb(size, priority);
1634 		if (skb) {
1635 			skb_set_owner_w(skb, sk);
1636 			return skb;
1637 		}
1638 	}
1639 	return NULL;
1640 }
1641 EXPORT_SYMBOL(sock_wmalloc);
1642 
1643 /*
1644  * Allocate an skb from the socket's receive buffer.
1645  */
1646 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1647 			     gfp_t priority)
1648 {
1649 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1650 		struct sk_buff *skb = alloc_skb(size, priority);
1651 		if (skb) {
1652 			skb_set_owner_r(skb, sk);
1653 			return skb;
1654 		}
1655 	}
1656 	return NULL;
1657 }
1658 
1659 /*
1660  * Allocate a memory block from the socket's option memory buffer.
1661  */
1662 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1663 {
1664 	if ((unsigned int)size <= sysctl_optmem_max &&
1665 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1666 		void *mem;
1667 		/* First do the add, to avoid the race if kmalloc
1668 		 * might sleep.
1669 		 */
1670 		atomic_add(size, &sk->sk_omem_alloc);
1671 		mem = kmalloc(size, priority);
1672 		if (mem)
1673 			return mem;
1674 		atomic_sub(size, &sk->sk_omem_alloc);
1675 	}
1676 	return NULL;
1677 }
1678 EXPORT_SYMBOL(sock_kmalloc);
1679 
1680 /*
1681  * Free an option memory block.
1682  */
1683 void sock_kfree_s(struct sock *sk, void *mem, int size)
1684 {
1685 	kfree(mem);
1686 	atomic_sub(size, &sk->sk_omem_alloc);
1687 }
1688 EXPORT_SYMBOL(sock_kfree_s);
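
/*
 * Editor's illustrative sketch (not part of the original file): option
 * memory is charged against sk_omem_alloc, so every sock_kmalloc() must be
 * paired with a sock_kfree_s() of the same size.  Names are assumptions.
 *
 *	struct example_opt *opt;
 *
 *	opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;	(optmem limit hit or allocation failed)
 *	(... use opt ...)
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 */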
1689 
1690 /* This is almost wait_for_tcp_memory minus release_sock/lock_sock.
1691    I think these locks should be removed for datagram sockets.
1692  */
1693 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1694 {
1695 	DEFINE_WAIT(wait);
1696 
1697 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1698 	for (;;) {
1699 		if (!timeo)
1700 			break;
1701 		if (signal_pending(current))
1702 			break;
1703 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1704 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1705 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1706 			break;
1707 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1708 			break;
1709 		if (sk->sk_err)
1710 			break;
1711 		timeo = schedule_timeout(timeo);
1712 	}
1713 	finish_wait(sk_sleep(sk), &wait);
1714 	return timeo;
1715 }
1716 
1717 
1718 /*
1719  *	Generic send/receive buffer handlers
1720  */
1721 
1722 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1723 				     unsigned long data_len, int noblock,
1724 				     int *errcode)
1725 {
1726 	struct sk_buff *skb;
1727 	gfp_t gfp_mask;
1728 	long timeo;
1729 	int err;
1730 	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1731 
1732 	err = -EMSGSIZE;
1733 	if (npages > MAX_SKB_FRAGS)
1734 		goto failure;
1735 
1736 	gfp_mask = sk->sk_allocation;
1737 	if (gfp_mask & __GFP_WAIT)
1738 		gfp_mask |= __GFP_REPEAT;
1739 
1740 	timeo = sock_sndtimeo(sk, noblock);
1741 	while (1) {
1742 		err = sock_error(sk);
1743 		if (err != 0)
1744 			goto failure;
1745 
1746 		err = -EPIPE;
1747 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1748 			goto failure;
1749 
1750 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1751 			skb = alloc_skb(header_len, gfp_mask);
1752 			if (skb) {
1753 				int i;
1754 
1755 				/* No pages, we're done... */
1756 				if (!data_len)
1757 					break;
1758 
1759 				skb->truesize += data_len;
1760 				skb_shinfo(skb)->nr_frags = npages;
1761 				for (i = 0; i < npages; i++) {
1762 					struct page *page;
1763 
1764 					page = alloc_pages(sk->sk_allocation, 0);
1765 					if (!page) {
1766 						err = -ENOBUFS;
1767 						skb_shinfo(skb)->nr_frags = i;
1768 						kfree_skb(skb);
1769 						goto failure;
1770 					}
1771 
1772 					__skb_fill_page_desc(skb, i,
1773 							page, 0,
1774 							(data_len >= PAGE_SIZE ?
1775 							 PAGE_SIZE :
1776 							 data_len));
1777 					data_len -= PAGE_SIZE;
1778 				}
1779 
1780 				/* Full success... */
1781 				break;
1782 			}
1783 			err = -ENOBUFS;
1784 			goto failure;
1785 		}
1786 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1787 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1788 		err = -EAGAIN;
1789 		if (!timeo)
1790 			goto failure;
1791 		if (signal_pending(current))
1792 			goto interrupted;
1793 		timeo = sock_wait_for_wmem(sk, timeo);
1794 	}
1795 
1796 	skb_set_owner_w(skb, sk);
1797 	return skb;
1798 
1799 interrupted:
1800 	err = sock_intr_errno(timeo);
1801 failure:
1802 	*errcode = err;
1803 	return NULL;
1804 }
1805 EXPORT_SYMBOL(sock_alloc_send_pskb);
1806 
1807 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1808 				    int noblock, int *errcode)
1809 {
1810 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1811 }
1812 EXPORT_SYMBOL(sock_alloc_send_skb);
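
/*
 * Editor's illustrative sketch (not from the original file): a datagram
 * sendmsg() path typically obtains its buffer through the helper above,
 * which blocks (up to the socket's send timeout) until write memory is
 * available.  The header length and variable names are assumptions.
 *
 *	skb = sock_alloc_send_skb(sk, len + example_hdr_len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;	(-EAGAIN, -EPIPE, a signal error, ...)
 *	skb_reserve(skb, example_hdr_len);
 *	(copy the payload in and hand the skb to the transmit path)
 */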
1813 
1814 /* On 32-bit arches, an skb frag is limited to 2^15 bytes */
1815 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
1816 
1817 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1818 {
1819 	int order;
1820 
1821 	if (pfrag->page) {
1822 		if (atomic_read(&pfrag->page->_count) == 1) {
1823 			pfrag->offset = 0;
1824 			return true;
1825 		}
1826 		if (pfrag->offset < pfrag->size)
1827 			return true;
1828 		put_page(pfrag->page);
1829 	}
1830 
1831 	/* We restrict high-order allocations to users that can afford to wait */
1832 	order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1833 
1834 	do {
1835 		gfp_t gfp = sk->sk_allocation;
1836 
1837 		if (order)
1838 			gfp |= __GFP_COMP | __GFP_NOWARN;
1839 		pfrag->page = alloc_pages(gfp, order);
1840 		if (likely(pfrag->page)) {
1841 			pfrag->offset = 0;
1842 			pfrag->size = PAGE_SIZE << order;
1843 			return true;
1844 		}
1845 	} while (--order >= 0);
1846 
1847 	sk_enter_memory_pressure(sk);
1848 	sk_stream_moderate_sndbuf(sk);
1849 	return false;
1850 }
1851 EXPORT_SYMBOL(sk_page_frag_refill);
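
/*
 * Editor's illustrative sketch (not from the original file): the intended
 * use of the refill helper with the per-socket page fragment.  "copied"
 * stands in for however many bytes the caller placed into the fragment.
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		return -ENOBUFS;	(allocation failed under memory pressure)
 *	(copy up to pfrag->size - pfrag->offset bytes into pfrag->page at
 *	 pfrag->offset, attach the page to an skb frag, then advance:)
 *	pfrag->offset += copied;
 */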
1852 
1853 static void __lock_sock(struct sock *sk)
1854 	__releases(&sk->sk_lock.slock)
1855 	__acquires(&sk->sk_lock.slock)
1856 {
1857 	DEFINE_WAIT(wait);
1858 
1859 	for (;;) {
1860 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1861 					TASK_UNINTERRUPTIBLE);
1862 		spin_unlock_bh(&sk->sk_lock.slock);
1863 		schedule();
1864 		spin_lock_bh(&sk->sk_lock.slock);
1865 		if (!sock_owned_by_user(sk))
1866 			break;
1867 	}
1868 	finish_wait(&sk->sk_lock.wq, &wait);
1869 }
1870 
1871 static void __release_sock(struct sock *sk)
1872 	__releases(&sk->sk_lock.slock)
1873 	__acquires(&sk->sk_lock.slock)
1874 {
1875 	struct sk_buff *skb = sk->sk_backlog.head;
1876 
1877 	do {
1878 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1879 		bh_unlock_sock(sk);
1880 
1881 		do {
1882 			struct sk_buff *next = skb->next;
1883 
1884 			prefetch(next);
1885 			WARN_ON_ONCE(skb_dst_is_noref(skb));
1886 			skb->next = NULL;
1887 			sk_backlog_rcv(sk, skb);
1888 
1889 			/*
1890 			 * We are in process context here with softirqs
1891 			 * disabled, use cond_resched_softirq() to preempt.
1892 			 * This is safe to do because we've taken the backlog
1893 			 * queue private:
1894 			 */
1895 			cond_resched_softirq();
1896 
1897 			skb = next;
1898 		} while (skb != NULL);
1899 
1900 		bh_lock_sock(sk);
1901 	} while ((skb = sk->sk_backlog.head) != NULL);
1902 
1903 	/*
1904 	 * Doing the zeroing here guarantees we cannot loop forever
1905 	 * while a wild producer attempts to flood us.
1906 	 */
1907 	sk->sk_backlog.len = 0;
1908 }
1909 
1910 /**
1911  * sk_wait_data - wait for data to arrive at sk_receive_queue
1912  * @sk:    sock to wait on
1913  * @timeo: for how long
1914  *
1915  * Now the socket state, including sk->sk_err, is changed only under the lock,
1916  * hence we may omit checks after joining the wait queue.
1917  * We check the receive queue before schedule() only as an optimization;
1918  * it is very likely that release_sock() added new data.
1919  */
1920 int sk_wait_data(struct sock *sk, long *timeo)
1921 {
1922 	int rc;
1923 	DEFINE_WAIT(wait);
1924 
1925 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1926 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1927 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1928 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1929 	finish_wait(sk_sleep(sk), &wait);
1930 	return rc;
1931 }
1932 EXPORT_SYMBOL(sk_wait_data);
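
/*
 * Hedged usage sketch: a blocking recvmsg implementation typically calls
 * sk_wait_data() with the socket lock held, in a loop such as the one
 * below (simplified; the exit conditions are illustrative only).
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo)
 *			break;
 *		sk_wait_data(sk, &timeo);
 *		if (signal_pending(current))
 *			break;
 *	}
 */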
1933 
1934 /**
1935  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1936  *	@sk: socket
1937  *	@size: memory size to allocate
1938  *	@kind: allocation type
1939  *
1940  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1941  *	rmem allocation. This function assumes that protocols which have
1942  *	memory_pressure use sk_wmem_queued as write buffer accounting.
1943  */
1944 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1945 {
1946 	struct proto *prot = sk->sk_prot;
1947 	int amt = sk_mem_pages(size);
1948 	long allocated;
1949 	int parent_status = UNDER_LIMIT;
1950 
1951 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1952 
1953 	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1954 
1955 	/* Under limit. */
1956 	if (parent_status == UNDER_LIMIT &&
1957 			allocated <= sk_prot_mem_limits(sk, 0)) {
1958 		sk_leave_memory_pressure(sk);
1959 		return 1;
1960 	}
1961 
1962 	/* Under pressure. (we or our parents) */
1963 	if ((parent_status > SOFT_LIMIT) ||
1964 			allocated > sk_prot_mem_limits(sk, 1))
1965 		sk_enter_memory_pressure(sk);
1966 
1967 	/* Over hard limit (we or our parents) */
1968 	if ((parent_status == OVER_LIMIT) ||
1969 			(allocated > sk_prot_mem_limits(sk, 2)))
1970 		goto suppress_allocation;
1971 
1972 	/* guarantee minimum buffer size under pressure */
1973 	if (kind == SK_MEM_RECV) {
1974 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1975 			return 1;
1976 
1977 	} else { /* SK_MEM_SEND */
1978 		if (sk->sk_type == SOCK_STREAM) {
1979 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1980 				return 1;
1981 		} else if (atomic_read(&sk->sk_wmem_alloc) <
1982 			   prot->sysctl_wmem[0])
1983 				return 1;
1984 	}
1985 
1986 	if (sk_has_memory_pressure(sk)) {
1987 		int alloc;
1988 
1989 		if (!sk_under_memory_pressure(sk))
1990 			return 1;
1991 		alloc = sk_sockets_allocated_read_positive(sk);
1992 		if (sk_prot_mem_limits(sk, 2) > alloc *
1993 		    sk_mem_pages(sk->sk_wmem_queued +
1994 				 atomic_read(&sk->sk_rmem_alloc) +
1995 				 sk->sk_forward_alloc))
1996 			return 1;
1997 	}
1998 
1999 suppress_allocation:
2000 
2001 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2002 		sk_stream_moderate_sndbuf(sk);
2003 
2004 		/* Fail only if socket is _under_ its sndbuf.
2005 		 * In this case we cannot block, so we have to fail.
2006 		 */
2007 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2008 			return 1;
2009 	}
2010 
2011 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2012 
2013 	/* Alas. Undo changes. */
2014 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2015 
2016 	sk_memory_allocated_sub(sk, amt);
2017 
2018 	return 0;
2019 }
2020 EXPORT_SYMBOL(__sk_mem_schedule);
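
/*
 * Hedged usage sketch: protocols normally charge memory through the
 * sk_wmem_schedule()/sk_rmem_schedule() wrappers in include/net/sock.h,
 * which fall back to __sk_mem_schedule() only when sk_forward_alloc is
 * too small. A simplified send-side charge might look like:
 *
 *	if (!sk_wmem_schedule(sk, skb->truesize))
 *		return -ENOBUFS;			// assumed error path
 *	skb_set_owner_w(skb, sk);
 */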
2021 
2022 /**
2023  *	__sk_mem_reclaim - reclaim memory_allocated
2024  *	@sk: socket
2025  */
2026 void __sk_mem_reclaim(struct sock *sk)
2027 {
2028 	sk_memory_allocated_sub(sk,
2029 				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2030 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2031 
2032 	if (sk_under_memory_pressure(sk) &&
2033 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2034 		sk_leave_memory_pressure(sk);
2035 }
2036 EXPORT_SYMBOL(__sk_mem_reclaim);
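
/*
 * Hedged usage sketch: callers usually go through sk_mem_uncharge() and
 * sk_mem_reclaim() from include/net/sock.h; the latter only drops into
 * __sk_mem_reclaim() once sk_forward_alloc holds at least one quantum.
 * "freed_bytes" is a placeholder.
 *
 *	sk_mem_uncharge(sk, freed_bytes);
 *	sk_mem_reclaim(sk);
 */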
2037 
2038 
2039 /*
2040  * Set of default routines for initialising struct proto_ops when
2041  * the protocol does not support a particular function. In certain
2042  * cases where it makes no sense for a protocol to have a "do nothing"
2043  * function, some default processing is provided.
2044  */
2045 
2046 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2047 {
2048 	return -EOPNOTSUPP;
2049 }
2050 EXPORT_SYMBOL(sock_no_bind);
2051 
2052 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2053 		    int len, int flags)
2054 {
2055 	return -EOPNOTSUPP;
2056 }
2057 EXPORT_SYMBOL(sock_no_connect);
2058 
2059 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2060 {
2061 	return -EOPNOTSUPP;
2062 }
2063 EXPORT_SYMBOL(sock_no_socketpair);
2064 
2065 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2066 {
2067 	return -EOPNOTSUPP;
2068 }
2069 EXPORT_SYMBOL(sock_no_accept);
2070 
2071 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2072 		    int *len, int peer)
2073 {
2074 	return -EOPNOTSUPP;
2075 }
2076 EXPORT_SYMBOL(sock_no_getname);
2077 
2078 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2079 {
2080 	return 0;
2081 }
2082 EXPORT_SYMBOL(sock_no_poll);
2083 
2084 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2085 {
2086 	return -EOPNOTSUPP;
2087 }
2088 EXPORT_SYMBOL(sock_no_ioctl);
2089 
2090 int sock_no_listen(struct socket *sock, int backlog)
2091 {
2092 	return -EOPNOTSUPP;
2093 }
2094 EXPORT_SYMBOL(sock_no_listen);
2095 
2096 int sock_no_shutdown(struct socket *sock, int how)
2097 {
2098 	return -EOPNOTSUPP;
2099 }
2100 EXPORT_SYMBOL(sock_no_shutdown);
2101 
2102 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2103 		    char __user *optval, unsigned int optlen)
2104 {
2105 	return -EOPNOTSUPP;
2106 }
2107 EXPORT_SYMBOL(sock_no_setsockopt);
2108 
2109 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2110 		    char __user *optval, int __user *optlen)
2111 {
2112 	return -EOPNOTSUPP;
2113 }
2114 EXPORT_SYMBOL(sock_no_getsockopt);
2115 
2116 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2117 		    size_t len)
2118 {
2119 	return -EOPNOTSUPP;
2120 }
2121 EXPORT_SYMBOL(sock_no_sendmsg);
2122 
2123 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2124 		    size_t len, int flags)
2125 {
2126 	return -EOPNOTSUPP;
2127 }
2128 EXPORT_SYMBOL(sock_no_recvmsg);
2129 
2130 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2131 {
2132 	/* Mirror missing mmap method error code */
2133 	return -ENODEV;
2134 }
2135 EXPORT_SYMBOL(sock_no_mmap);
2136 
2137 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2138 {
2139 	ssize_t res;
2140 	struct msghdr msg = {.msg_flags = flags};
2141 	struct kvec iov;
2142 	char *kaddr = kmap(page);
2143 	iov.iov_base = kaddr + offset;
2144 	iov.iov_len = size;
2145 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2146 	kunmap(page);
2147 	return res;
2148 }
2149 EXPORT_SYMBOL(sock_no_sendpage);
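
/*
 * Hedged usage sketch: a protocol that does not implement some operations
 * can wire the corresponding proto_ops members to the stubs above.
 * PF_EXAMPLE and example_ops are placeholders, not symbols defined here.
 *
 *	static const struct proto_ops example_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *		... remaining members point at real handlers ...
 *	};
 */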
2150 
2151 /*
2152  *	Default Socket Callbacks
2153  */
2154 
2155 static void sock_def_wakeup(struct sock *sk)
2156 {
2157 	struct socket_wq *wq;
2158 
2159 	rcu_read_lock();
2160 	wq = rcu_dereference(sk->sk_wq);
2161 	if (wq_has_sleeper(wq))
2162 		wake_up_interruptible_all(&wq->wait);
2163 	rcu_read_unlock();
2164 }
2165 
2166 static void sock_def_error_report(struct sock *sk)
2167 {
2168 	struct socket_wq *wq;
2169 
2170 	rcu_read_lock();
2171 	wq = rcu_dereference(sk->sk_wq);
2172 	if (wq_has_sleeper(wq))
2173 		wake_up_interruptible_poll(&wq->wait, POLLERR);
2174 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2175 	rcu_read_unlock();
2176 }
2177 
2178 static void sock_def_readable(struct sock *sk, int len)
2179 {
2180 	struct socket_wq *wq;
2181 
2182 	rcu_read_lock();
2183 	wq = rcu_dereference(sk->sk_wq);
2184 	if (wq_has_sleeper(wq))
2185 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2186 						POLLRDNORM | POLLRDBAND);
2187 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2188 	rcu_read_unlock();
2189 }
2190 
2191 static void sock_def_write_space(struct sock *sk)
2192 {
2193 	struct socket_wq *wq;
2194 
2195 	rcu_read_lock();
2196 
2197 	/* Do not wake up a writer until he can make "significant"
2198 	 * progress.  --DaveM
2199 	 */
2200 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2201 		wq = rcu_dereference(sk->sk_wq);
2202 		if (wq_has_sleeper(wq))
2203 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2204 						POLLWRNORM | POLLWRBAND);
2205 
2206 		/* Should agree with poll, otherwise some programs break */
2207 		if (sock_writeable(sk))
2208 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2209 	}
2210 
2211 	rcu_read_unlock();
2212 }
2213 
2214 static void sock_def_destruct(struct sock *sk)
2215 {
2216 	kfree(sk->sk_protinfo);
2217 }
2218 
2219 void sk_send_sigurg(struct sock *sk)
2220 {
2221 	if (sk->sk_socket && sk->sk_socket->file)
2222 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2223 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2224 }
2225 EXPORT_SYMBOL(sk_send_sigurg);
2226 
2227 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2228 		    unsigned long expires)
2229 {
2230 	if (!mod_timer(timer, expires))
2231 		sock_hold(sk);
2232 }
2233 EXPORT_SYMBOL(sk_reset_timer);
2234 
2235 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2236 {
2237 	if (del_timer(timer))
2238 		__sock_put(sk);
2239 }
2240 EXPORT_SYMBOL(sk_stop_timer);
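
/*
 * Hedged usage sketch: sk_reset_timer() takes a reference on the socket
 * when it arms a previously idle timer; the timer handler or a later
 * sk_stop_timer() releases it again. "timeout" is a placeholder value.
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + timeout);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);
 */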
2241 
2242 void sock_init_data(struct socket *sock, struct sock *sk)
2243 {
2244 	skb_queue_head_init(&sk->sk_receive_queue);
2245 	skb_queue_head_init(&sk->sk_write_queue);
2246 	skb_queue_head_init(&sk->sk_error_queue);
2247 #ifdef CONFIG_NET_DMA
2248 	skb_queue_head_init(&sk->sk_async_wait_queue);
2249 #endif
2250 
2251 	sk->sk_send_head	=	NULL;
2252 
2253 	init_timer(&sk->sk_timer);
2254 
2255 	sk->sk_allocation	=	GFP_KERNEL;
2256 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2257 	sk->sk_sndbuf		=	sysctl_wmem_default;
2258 	sk->sk_state		=	TCP_CLOSE;
2259 	sk_set_socket(sk, sock);
2260 
2261 	sock_set_flag(sk, SOCK_ZAPPED);
2262 
2263 	if (sock) {
2264 		sk->sk_type	=	sock->type;
2265 		sk->sk_wq	=	sock->wq;
2266 		sock->sk	=	sk;
2267 	} else
2268 		sk->sk_wq	=	NULL;
2269 
2270 	spin_lock_init(&sk->sk_dst_lock);
2271 	rwlock_init(&sk->sk_callback_lock);
2272 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2273 			af_callback_keys + sk->sk_family,
2274 			af_family_clock_key_strings[sk->sk_family]);
2275 
2276 	sk->sk_state_change	=	sock_def_wakeup;
2277 	sk->sk_data_ready	=	sock_def_readable;
2278 	sk->sk_write_space	=	sock_def_write_space;
2279 	sk->sk_error_report	=	sock_def_error_report;
2280 	sk->sk_destruct		=	sock_def_destruct;
2281 
2282 	sk->sk_frag.page	=	NULL;
2283 	sk->sk_frag.offset	=	0;
2284 	sk->sk_peek_off		=	-1;
2285 
2286 	sk->sk_peer_pid 	=	NULL;
2287 	sk->sk_peer_cred	=	NULL;
2288 	sk->sk_write_pending	=	0;
2289 	sk->sk_rcvlowat		=	1;
2290 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2291 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2292 
2293 	sk->sk_stamp = ktime_set(-1L, 0);
2294 
2295 #ifdef CONFIG_NET_LL_RX_POLL
2296 	sk->sk_napi_id		=	0;
2297 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2298 #endif
2299 
2300 	/*
2301 	 * Before updating sk_refcnt, we must commit prior changes to memory
2302 	 * (Documentation/RCU/rculist_nulls.txt for details)
2303 	 */
2304 	smp_wmb();
2305 	atomic_set(&sk->sk_refcnt, 1);
2306 	atomic_set(&sk->sk_drops, 0);
2307 }
2308 EXPORT_SYMBOL(sock_init_data);
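
/*
 * Hedged usage sketch: an address family's create() handler typically
 * calls sock_init_data() right after allocating the sock and may then
 * override individual callbacks. PF_EXAMPLE, example_proto and
 * example_destruct are placeholders, not symbols defined here.
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
 *	if (!sk)
 *		return -ENOMEM;
 *	sock_init_data(sock, sk);
 *	sk->sk_destruct = example_destruct;
 */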
2309 
2310 void lock_sock_nested(struct sock *sk, int subclass)
2311 {
2312 	might_sleep();
2313 	spin_lock_bh(&sk->sk_lock.slock);
2314 	if (sk->sk_lock.owned)
2315 		__lock_sock(sk);
2316 	sk->sk_lock.owned = 1;
2317 	spin_unlock(&sk->sk_lock.slock);
2318 	/*
2319 	 * The sk_lock has mutex_lock() semantics here:
2320 	 */
2321 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2322 	local_bh_enable();
2323 }
2324 EXPORT_SYMBOL(lock_sock_nested);
2325 
2326 void release_sock(struct sock *sk)
2327 {
2328 	/*
2329 	 * The sk_lock has mutex_unlock() semantics:
2330 	 */
2331 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2332 
2333 	spin_lock_bh(&sk->sk_lock.slock);
2334 	if (sk->sk_backlog.tail)
2335 		__release_sock(sk);
2336 
2337 	if (sk->sk_prot->release_cb)
2338 		sk->sk_prot->release_cb(sk);
2339 
2340 	sk->sk_lock.owned = 0;
2341 	if (waitqueue_active(&sk->sk_lock.wq))
2342 		wake_up(&sk->sk_lock.wq);
2343 	spin_unlock_bh(&sk->sk_lock.slock);
2344 }
2345 EXPORT_SYMBOL(release_sock);
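
/*
 * Hedged usage sketch: the usual process-context pattern pairs these two
 * helpers; release_sock() also flushes the backlog via __release_sock().
 *
 *	lock_sock(sk);
 *	... modify socket state, walk its queues ...
 *	release_sock(sk);
 */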
2346 
2347 /**
2348  * lock_sock_fast - fast version of lock_sock
2349  * @sk: socket
2350  *
2351  * This version should be used for very small sections, where the process
2352  * won't block. Return false if the fast path is taken:
2353  *   sk_lock.slock locked, owned = 0, BH disabled
2354  * Return true if the slow path is taken:
2355  *   sk_lock.slock unlocked, owned = 1, BH enabled
2356  */
2357 bool lock_sock_fast(struct sock *sk)
2358 {
2359 	might_sleep();
2360 	spin_lock_bh(&sk->sk_lock.slock);
2361 
2362 	if (!sk->sk_lock.owned)
2363 		/*
2364 		 * Note: the fast path returns with BH still disabled
2365 		 */
2366 		return false;
2367 
2368 	__lock_sock(sk);
2369 	sk->sk_lock.owned = 1;
2370 	spin_unlock(&sk->sk_lock.slock);
2371 	/*
2372 	 * The sk_lock has mutex_lock() semantics here:
2373 	 */
2374 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2375 	local_bh_enable();
2376 	return true;
2377 }
2378 EXPORT_SYMBOL(lock_sock_fast);
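
/*
 * Hedged usage sketch: callers keep the critical section short and pass
 * the returned slow-path flag to unlock_sock_fast():
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short critical section ...
 *	unlock_sock_fast(sk, slow);
 */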
2379 
2380 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2381 {
2382 	struct timeval tv;
2383 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2384 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2385 	tv = ktime_to_timeval(sk->sk_stamp);
2386 	if (tv.tv_sec == -1)
2387 		return -ENOENT;
2388 	if (tv.tv_sec == 0) {
2389 		sk->sk_stamp = ktime_get_real();
2390 		tv = ktime_to_timeval(sk->sk_stamp);
2391 	}
2392 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2393 }
2394 EXPORT_SYMBOL(sock_get_timestamp);
2395 
2396 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2397 {
2398 	struct timespec ts;
2399 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2400 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2401 	ts = ktime_to_timespec(sk->sk_stamp);
2402 	if (ts.tv_sec == -1)
2403 		return -ENOENT;
2404 	if (ts.tv_sec == 0) {
2405 		sk->sk_stamp = ktime_get_real();
2406 		ts = ktime_to_timespec(sk->sk_stamp);
2407 	}
2408 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2409 }
2410 EXPORT_SYMBOL(sock_get_timestampns);
2411 
2412 void sock_enable_timestamp(struct sock *sk, int flag)
2413 {
2414 	if (!sock_flag(sk, flag)) {
2415 		unsigned long previous_flags = sk->sk_flags;
2416 
2417 		sock_set_flag(sk, flag);
2418 		/*
2419 		 * We just set one of the two flags that require net
2420 		 * timestamping, but timestamping might have been on
2421 		 * already because of the other one.
2422 		 */
2423 		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2424 			net_enable_timestamp();
2425 	}
2426 }
2427 
2428 /*
2429  *	Get a socket option on a socket.
2430  *
2431  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2432  *	asynchronous errors should be reported by getsockopt. We assume
2433  *	this means if you specify SO_ERROR (otherwise what's the point of it?).
2434  */
2435 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2436 			   char __user *optval, int __user *optlen)
2437 {
2438 	struct sock *sk = sock->sk;
2439 
2440 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2441 }
2442 EXPORT_SYMBOL(sock_common_getsockopt);
2443 
2444 #ifdef CONFIG_COMPAT
2445 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2446 				  char __user *optval, int __user *optlen)
2447 {
2448 	struct sock *sk = sock->sk;
2449 
2450 	if (sk->sk_prot->compat_getsockopt != NULL)
2451 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2452 						      optval, optlen);
2453 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2454 }
2455 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2456 #endif
2457 
2458 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2459 			struct msghdr *msg, size_t size, int flags)
2460 {
2461 	struct sock *sk = sock->sk;
2462 	int addr_len = 0;
2463 	int err;
2464 
2465 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2466 				   flags & ~MSG_DONTWAIT, &addr_len);
2467 	if (err >= 0)
2468 		msg->msg_namelen = addr_len;
2469 	return err;
2470 }
2471 EXPORT_SYMBOL(sock_common_recvmsg);
2472 
2473 /*
2474  *	Set socket options on an inet socket.
2475  */
2476 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2477 			   char __user *optval, unsigned int optlen)
2478 {
2479 	struct sock *sk = sock->sk;
2480 
2481 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2482 }
2483 EXPORT_SYMBOL(sock_common_setsockopt);
2484 
2485 #ifdef CONFIG_COMPAT
2486 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2487 				  char __user *optval, unsigned int optlen)
2488 {
2489 	struct sock *sk = sock->sk;
2490 
2491 	if (sk->sk_prot->compat_setsockopt != NULL)
2492 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2493 						      optval, optlen);
2494 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2495 }
2496 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2497 #endif
2498 
2499 void sk_common_release(struct sock *sk)
2500 {
2501 	if (sk->sk_prot->destroy)
2502 		sk->sk_prot->destroy(sk);
2503 
2504 	/*
2505 	 * Observation: when sk_common_release() is called, processes have
2506 	 * no access to the socket, but the network stack still does.
2507 	 * Step one, detach it from networking:
2508 	 *
2509 	 * A. Remove from hash tables.
2510 	 */
2511 
2512 	sk->sk_prot->unhash(sk);
2513 
2514 	/*
2515 	 * At this point the socket cannot receive new packets, but it is
2516 	 * possible that some packets are in flight because some CPU was running
2517 	 * the receiver and did a hash table lookup before we unhashed the socket.
2518 	 * They will reach the receive queue and be purged by the socket destructor.
2519 	 *
2520 	 * We also still have packets pending on the receive queue and, probably,
2521 	 * our own packets waiting in device queues. sock_destroy will drain the
2522 	 * receive queue, but transmitted packets will delay socket destruction
2523 	 * until the last reference is released.
2524 	 */
2525 
2526 	sock_orphan(sk);
2527 
2528 	xfrm_sk_free_policy(sk);
2529 
2530 	sk_refcnt_debug_release(sk);
2531 
2532 	if (sk->sk_frag.page) {
2533 		put_page(sk->sk_frag.page);
2534 		sk->sk_frag.page = NULL;
2535 	}
2536 
2537 	sock_put(sk);
2538 }
2539 EXPORT_SYMBOL(sk_common_release);
2540 
2541 #ifdef CONFIG_PROC_FS
2542 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
2543 struct prot_inuse {
2544 	int val[PROTO_INUSE_NR];
2545 };
2546 
2547 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2548 
2549 #ifdef CONFIG_NET_NS
2550 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2551 {
2552 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2553 }
2554 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2555 
2556 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2557 {
2558 	int cpu, idx = prot->inuse_idx;
2559 	int res = 0;
2560 
2561 	for_each_possible_cpu(cpu)
2562 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2563 
2564 	return res >= 0 ? res : 0;
2565 }
2566 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2567 
2568 static int __net_init sock_inuse_init_net(struct net *net)
2569 {
2570 	net->core.inuse = alloc_percpu(struct prot_inuse);
2571 	return net->core.inuse ? 0 : -ENOMEM;
2572 }
2573 
2574 static void __net_exit sock_inuse_exit_net(struct net *net)
2575 {
2576 	free_percpu(net->core.inuse);
2577 }
2578 
2579 static struct pernet_operations net_inuse_ops = {
2580 	.init = sock_inuse_init_net,
2581 	.exit = sock_inuse_exit_net,
2582 };
2583 
2584 static __init int net_inuse_init(void)
2585 {
2586 	if (register_pernet_subsys(&net_inuse_ops))
2587 		panic("Cannot initialize net inuse counters");
2588 
2589 	return 0;
2590 }
2591 
2592 core_initcall(net_inuse_init);
2593 #else
2594 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2595 
2596 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2597 {
2598 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2599 }
2600 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2601 
2602 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2603 {
2604 	int cpu, idx = prot->inuse_idx;
2605 	int res = 0;
2606 
2607 	for_each_possible_cpu(cpu)
2608 		res += per_cpu(prot_inuse, cpu).val[idx];
2609 
2610 	return res >= 0 ? res : 0;
2611 }
2612 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2613 #endif
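
/*
 * Hedged usage sketch (applies to either variant above): protocols adjust
 * the per-cpu counter from their hash()/unhash() callbacks, e.g.
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	// on hash
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	// on unhash
 */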
2614 
2615 static void assign_proto_idx(struct proto *prot)
2616 {
2617 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2618 
2619 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2620 		pr_err("PROTO_INUSE_NR exhausted\n");
2621 		return;
2622 	}
2623 
2624 	set_bit(prot->inuse_idx, proto_inuse_idx);
2625 }
2626 
2627 static void release_proto_idx(struct proto *prot)
2628 {
2629 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2630 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2631 }
2632 #else
2633 static inline void assign_proto_idx(struct proto *prot)
2634 {
2635 }
2636 
2637 static inline void release_proto_idx(struct proto *prot)
2638 {
2639 }
2640 #endif
2641 
2642 int proto_register(struct proto *prot, int alloc_slab)
2643 {
2644 	if (alloc_slab) {
2645 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2646 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2647 					NULL);
2648 
2649 		if (prot->slab == NULL) {
2650 			pr_crit("%s: Can't create sock SLAB cache!\n",
2651 				prot->name);
2652 			goto out;
2653 		}
2654 
2655 		if (prot->rsk_prot != NULL) {
2656 			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2657 			if (prot->rsk_prot->slab_name == NULL)
2658 				goto out_free_sock_slab;
2659 
2660 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2661 								 prot->rsk_prot->obj_size, 0,
2662 								 SLAB_HWCACHE_ALIGN, NULL);
2663 
2664 			if (prot->rsk_prot->slab == NULL) {
2665 				pr_crit("%s: Can't create request sock SLAB cache!\n",
2666 					prot->name);
2667 				goto out_free_request_sock_slab_name;
2668 			}
2669 		}
2670 
2671 		if (prot->twsk_prot != NULL) {
2672 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2673 
2674 			if (prot->twsk_prot->twsk_slab_name == NULL)
2675 				goto out_free_request_sock_slab;
2676 
2677 			prot->twsk_prot->twsk_slab =
2678 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2679 						  prot->twsk_prot->twsk_obj_size,
2680 						  0,
2681 						  SLAB_HWCACHE_ALIGN |
2682 							prot->slab_flags,
2683 						  NULL);
2684 			if (prot->twsk_prot->twsk_slab == NULL)
2685 				goto out_free_timewait_sock_slab_name;
2686 		}
2687 	}
2688 
2689 	mutex_lock(&proto_list_mutex);
2690 	list_add(&prot->node, &proto_list);
2691 	assign_proto_idx(prot);
2692 	mutex_unlock(&proto_list_mutex);
2693 	return 0;
2694 
2695 out_free_timewait_sock_slab_name:
2696 	kfree(prot->twsk_prot->twsk_slab_name);
2697 out_free_request_sock_slab:
2698 	if (prot->rsk_prot && prot->rsk_prot->slab) {
2699 		kmem_cache_destroy(prot->rsk_prot->slab);
2700 		prot->rsk_prot->slab = NULL;
2701 	}
2702 out_free_request_sock_slab_name:
2703 	if (prot->rsk_prot)
2704 		kfree(prot->rsk_prot->slab_name);
2705 out_free_sock_slab:
2706 	kmem_cache_destroy(prot->slab);
2707 	prot->slab = NULL;
2708 out:
2709 	return -ENOBUFS;
2710 }
2711 EXPORT_SYMBOL(proto_register);
2712 
2713 void proto_unregister(struct proto *prot)
2714 {
2715 	mutex_lock(&proto_list_mutex);
2716 	release_proto_idx(prot);
2717 	list_del(&prot->node);
2718 	mutex_unlock(&proto_list_mutex);
2719 
2720 	if (prot->slab != NULL) {
2721 		kmem_cache_destroy(prot->slab);
2722 		prot->slab = NULL;
2723 	}
2724 
2725 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2726 		kmem_cache_destroy(prot->rsk_prot->slab);
2727 		kfree(prot->rsk_prot->slab_name);
2728 		prot->rsk_prot->slab = NULL;
2729 	}
2730 
2731 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2732 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2733 		kfree(prot->twsk_prot->twsk_slab_name);
2734 		prot->twsk_prot->twsk_slab = NULL;
2735 	}
2736 }
2737 EXPORT_SYMBOL(proto_unregister);
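
/*
 * Hedged usage sketch: a protocol module typically registers its struct
 * proto from module init and unregisters it on exit. example_proto and
 * struct example_sock are placeholders, not symbols defined here.
 *
 *	static struct proto example_proto = {
 *		.name	  = "EXAMPLE",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);
 *	if (err)
 *		return err;
 *	...
 *	proto_unregister(&example_proto);
 */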
2738 
2739 #ifdef CONFIG_PROC_FS
2740 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2741 	__acquires(proto_list_mutex)
2742 {
2743 	mutex_lock(&proto_list_mutex);
2744 	return seq_list_start_head(&proto_list, *pos);
2745 }
2746 
2747 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2748 {
2749 	return seq_list_next(v, &proto_list, pos);
2750 }
2751 
2752 static void proto_seq_stop(struct seq_file *seq, void *v)
2753 	__releases(proto_list_mutex)
2754 {
2755 	mutex_unlock(&proto_list_mutex);
2756 }
2757 
2758 static char proto_method_implemented(const void *method)
2759 {
2760 	return method == NULL ? 'n' : 'y';
2761 }
2762 static long sock_prot_memory_allocated(struct proto *proto)
2763 {
2764 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2765 }
2766 
2767 static char *sock_prot_memory_pressure(struct proto *proto)
2768 {
2769 	return proto->memory_pressure != NULL ?
2770 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2771 }
2772 
2773 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2774 {
2775 
2776 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2777 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2778 		   proto->name,
2779 		   proto->obj_size,
2780 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2781 		   sock_prot_memory_allocated(proto),
2782 		   sock_prot_memory_pressure(proto),
2783 		   proto->max_header,
2784 		   proto->slab == NULL ? "no" : "yes",
2785 		   module_name(proto->owner),
2786 		   proto_method_implemented(proto->close),
2787 		   proto_method_implemented(proto->connect),
2788 		   proto_method_implemented(proto->disconnect),
2789 		   proto_method_implemented(proto->accept),
2790 		   proto_method_implemented(proto->ioctl),
2791 		   proto_method_implemented(proto->init),
2792 		   proto_method_implemented(proto->destroy),
2793 		   proto_method_implemented(proto->shutdown),
2794 		   proto_method_implemented(proto->setsockopt),
2795 		   proto_method_implemented(proto->getsockopt),
2796 		   proto_method_implemented(proto->sendmsg),
2797 		   proto_method_implemented(proto->recvmsg),
2798 		   proto_method_implemented(proto->sendpage),
2799 		   proto_method_implemented(proto->bind),
2800 		   proto_method_implemented(proto->backlog_rcv),
2801 		   proto_method_implemented(proto->hash),
2802 		   proto_method_implemented(proto->unhash),
2803 		   proto_method_implemented(proto->get_port),
2804 		   proto_method_implemented(proto->enter_memory_pressure));
2805 }
2806 
2807 static int proto_seq_show(struct seq_file *seq, void *v)
2808 {
2809 	if (v == &proto_list)
2810 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2811 			   "protocol",
2812 			   "size",
2813 			   "sockets",
2814 			   "memory",
2815 			   "press",
2816 			   "maxhdr",
2817 			   "slab",
2818 			   "module",
2819 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2820 	else
2821 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2822 	return 0;
2823 }
2824 
2825 static const struct seq_operations proto_seq_ops = {
2826 	.start  = proto_seq_start,
2827 	.next   = proto_seq_next,
2828 	.stop   = proto_seq_stop,
2829 	.show   = proto_seq_show,
2830 };
2831 
2832 static int proto_seq_open(struct inode *inode, struct file *file)
2833 {
2834 	return seq_open_net(inode, file, &proto_seq_ops,
2835 			    sizeof(struct seq_net_private));
2836 }
2837 
2838 static const struct file_operations proto_seq_fops = {
2839 	.owner		= THIS_MODULE,
2840 	.open		= proto_seq_open,
2841 	.read		= seq_read,
2842 	.llseek		= seq_lseek,
2843 	.release	= seq_release_net,
2844 };
2845 
2846 static __net_init int proto_init_net(struct net *net)
2847 {
2848 	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2849 		return -ENOMEM;
2850 
2851 	return 0;
2852 }
2853 
2854 static __net_exit void proto_exit_net(struct net *net)
2855 {
2856 	remove_proc_entry("protocols", net->proc_net);
2857 }
2858 
2859 
2860 static __net_initdata struct pernet_operations proto_net_ops = {
2861 	.init = proto_init_net,
2862 	.exit = proto_exit_net,
2863 };
2864 
2865 static int __init proto_init(void)
2866 {
2867 	return register_pernet_subsys(&proto_net_ops);
2868 }
2869 
2870 subsys_initcall(proto_init);
2871 
2872 #endif /* PROC_FS */
2873