xref: /openbmc/linux/net/core/sock.c (revision bc000245)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115 #include <linux/highmem.h>
116 #include <linux/user_namespace.h>
117 #include <linux/static_key.h>
118 #include <linux/memcontrol.h>
119 #include <linux/prefetch.h>
120 
121 #include <asm/uaccess.h>
122 
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 
135 #include <linux/filter.h>
136 
137 #include <trace/events/sock.h>
138 
139 #ifdef CONFIG_INET
140 #include <net/tcp.h>
141 #endif
142 
143 #include <net/busy_poll.h>
144 
145 static DEFINE_MUTEX(proto_list_mutex);
146 static LIST_HEAD(proto_list);
147 
148 #ifdef CONFIG_MEMCG_KMEM
149 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
150 {
151 	struct proto *proto;
152 	int ret = 0;
153 
154 	mutex_lock(&proto_list_mutex);
155 	list_for_each_entry(proto, &proto_list, node) {
156 		if (proto->init_cgroup) {
157 			ret = proto->init_cgroup(memcg, ss);
158 			if (ret)
159 				goto out;
160 		}
161 	}
162 
163 	mutex_unlock(&proto_list_mutex);
164 	return ret;
165 out:
166 	list_for_each_entry_continue_reverse(proto, &proto_list, node)
167 		if (proto->destroy_cgroup)
168 			proto->destroy_cgroup(memcg);
169 	mutex_unlock(&proto_list_mutex);
170 	return ret;
171 }
172 
173 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
174 {
175 	struct proto *proto;
176 
177 	mutex_lock(&proto_list_mutex);
178 	list_for_each_entry_reverse(proto, &proto_list, node)
179 		if (proto->destroy_cgroup)
180 			proto->destroy_cgroup(memcg);
181 	mutex_unlock(&proto_list_mutex);
182 }
183 #endif
184 
185 /*
186  * Each address family might have different locking rules, so we have
187  * one slock key per address family:
188  */
189 static struct lock_class_key af_family_keys[AF_MAX];
190 static struct lock_class_key af_family_slock_keys[AF_MAX];
191 
192 #if defined(CONFIG_MEMCG_KMEM)
193 struct static_key memcg_socket_limit_enabled;
194 EXPORT_SYMBOL(memcg_socket_limit_enabled);
195 #endif
196 
197 /*
198  * Make lock validator output more readable. (we pre-construct these
199  * strings build-time, so that runtime initialization of socket
200  * locks is fast):
201  */
202 static const char *const af_family_key_strings[AF_MAX+1] = {
203   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
204   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
205   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
206   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
207   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
208   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
209   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
210   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
211   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
212   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
213   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
214   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
215   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
216   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
217 };
218 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
219   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
220   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
221   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
222   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
223   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
224   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
225   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
226   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
227   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
228   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
229   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
230   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
231   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
232   "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
233 };
234 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
235   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
236   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
237   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
238   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
239   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
240   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
241   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
242   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
243   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
244   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
245   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
246   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
247   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
248   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
249 };
250 
251 /*
252  * sk_callback_lock locking rules are per-address-family,
253  * so split the lock classes by using a per-AF key:
254  */
255 static struct lock_class_key af_callback_keys[AF_MAX];
256 
257 /* Take into consideration the size of the struct sk_buff overhead in the
258  * determination of these values, since that is non-constant across
259  * platforms.  This makes socket queueing behavior and performance
260  * not depend upon such differences.
261  */
262 #define _SK_MEM_PACKETS		256
263 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
264 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
265 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
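/* Illustrative arithmetic (approximate, arch- and config-dependent):
 * SKB_TRUESIZE(256) is 256 bytes of payload plus the cache-aligned sizes
 * of struct sk_buff and struct skb_shared_info, several hundred bytes in
 * total on a 64-bit build.  Multiplied by _SK_MEM_PACKETS (256), the
 * resulting defaults come out a bit above 200 KB per socket, e.g. the
 * familiar 212992-byte rmem_default seen on many x86_64 systems.
 */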
266 
267 /* Run time adjustable parameters. */
268 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
269 EXPORT_SYMBOL(sysctl_wmem_max);
270 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
271 EXPORT_SYMBOL(sysctl_rmem_max);
272 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
273 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
274 
275 /* Maximal space eaten by iovec or ancillary data plus some space */
276 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
277 EXPORT_SYMBOL(sysctl_optmem_max);
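/* Illustrative arithmetic: with UIO_MAXIOV == 1024 and 8-byte longs this
 * evaluates to 8 * (2 * 1024 + 512) = 20480 bytes, the usual
 * /proc/sys/net/core/optmem_max default on 64-bit systems (10240 with
 * 4-byte longs).
 */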
278 
279 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
280 EXPORT_SYMBOL_GPL(memalloc_socks);
281 
282 /**
283  * sk_set_memalloc - sets %SOCK_MEMALLOC
284  * @sk: socket to set it on
285  *
286  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
287  * It's the responsibility of the admin to adjust min_free_kbytes
288  * to meet the requirements.
289  */
290 void sk_set_memalloc(struct sock *sk)
291 {
292 	sock_set_flag(sk, SOCK_MEMALLOC);
293 	sk->sk_allocation |= __GFP_MEMALLOC;
294 	static_key_slow_inc(&memalloc_socks);
295 }
296 EXPORT_SYMBOL_GPL(sk_set_memalloc);
297 
298 void sk_clear_memalloc(struct sock *sk)
299 {
300 	sock_reset_flag(sk, SOCK_MEMALLOC);
301 	sk->sk_allocation &= ~__GFP_MEMALLOC;
302 	static_key_slow_dec(&memalloc_socks);
303 
304 	/*
305 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
306 	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
307 	 * it has rmem allocations there is a risk that the user of the
308 	 * socket cannot make forward progress due to exceeding the rmem
309 	 * limits. By rights, sk_clear_memalloc() should only be called
310 	 * on sockets being torn down but warn and reset the accounting if
311 	 * that assumption breaks.
312 	 */
313 	if (WARN_ON(sk->sk_forward_alloc))
314 		sk_mem_reclaim(sk);
315 }
316 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
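/* Illustrative usage sketch (hypothetical caller): a transport that can sit
 * under swap, e.g. a network block device or swap-over-NFS setup, marks its
 * kernel socket so that traffic needed to complete page writeback may dip
 * into the emergency reserves, and clears the flag again on teardown:
 *
 *	sk_set_memalloc(sock->sk);
 *	... swap traffic flows; allocations may use __GFP_MEMALLOC ...
 *	sk_clear_memalloc(sock->sk);
 */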
317 
318 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
319 {
320 	int ret;
321 	unsigned long pflags = current->flags;
322 
323 	/* these should have been dropped before queueing */
324 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
325 
326 	current->flags |= PF_MEMALLOC;
327 	ret = sk->sk_backlog_rcv(sk, skb);
328 	tsk_restore_flags(current, pflags, PF_MEMALLOC);
329 
330 	return ret;
331 }
332 EXPORT_SYMBOL(__sk_backlog_rcv);
333 
334 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
335 {
336 	struct timeval tv;
337 
338 	if (optlen < sizeof(tv))
339 		return -EINVAL;
340 	if (copy_from_user(&tv, optval, sizeof(tv)))
341 		return -EFAULT;
342 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
343 		return -EDOM;
344 
345 	if (tv.tv_sec < 0) {
346 		static int warned __read_mostly;
347 
348 		*timeo_p = 0;
349 		if (warned < 10 && net_ratelimit()) {
350 			warned++;
351 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
352 				__func__, current->comm, task_pid_nr(current));
353 		}
354 		return 0;
355 	}
356 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
357 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
358 		return 0;
359 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
360 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
361 	return 0;
362 }
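/* Illustrative conversion (sketch): with HZ == 1000, a user-supplied
 * struct timeval of { .tv_sec = 2, .tv_usec = 500000 } becomes
 * 2 * HZ + 500000 / (1000000 / HZ) = 2500 jiffies, while an all-zero
 * timeval leaves *timeo_p at MAX_SCHEDULE_TIMEOUT, i.e. "wait forever".
 */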
363 
364 static void sock_warn_obsolete_bsdism(const char *name)
365 {
366 	static int warned;
367 	static char warncomm[TASK_COMM_LEN];
368 	if (strcmp(warncomm, current->comm) && warned < 5) {
369 		strcpy(warncomm,  current->comm);
370 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
371 			warncomm, name);
372 		warned++;
373 	}
374 }
375 
376 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
377 
378 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
379 {
380 	if (sk->sk_flags & flags) {
381 		sk->sk_flags &= ~flags;
382 		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
383 			net_disable_timestamp();
384 	}
385 }
386 
387 
388 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
389 {
390 	int err;
391 	int skb_len;
392 	unsigned long flags;
393 	struct sk_buff_head *list = &sk->sk_receive_queue;
394 
395 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
396 		atomic_inc(&sk->sk_drops);
397 		trace_sock_rcvqueue_full(sk, skb);
398 		return -ENOMEM;
399 	}
400 
401 	err = sk_filter(sk, skb);
402 	if (err)
403 		return err;
404 
405 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
406 		atomic_inc(&sk->sk_drops);
407 		return -ENOBUFS;
408 	}
409 
410 	skb->dev = NULL;
411 	skb_set_owner_r(skb, sk);
412 
413 	/* Cache the SKB length before we tack it onto the receive
414 	 * queue.  Once it is added it no longer belongs to us and
415 	 * may be freed by other threads of control pulling packets
416 	 * from the queue.
417 	 */
418 	skb_len = skb->len;
419 
420 	/* We escape from the RCU-protected region; make sure we don't leak
421 	 * a non-refcounted dst.
422 	 */
423 	skb_dst_force(skb);
424 
425 	spin_lock_irqsave(&list->lock, flags);
426 	skb->dropcount = atomic_read(&sk->sk_drops);
427 	__skb_queue_tail(list, skb);
428 	spin_unlock_irqrestore(&list->lock, flags);
429 
430 	if (!sock_flag(sk, SOCK_DEAD))
431 		sk->sk_data_ready(sk, skb_len);
432 	return 0;
433 }
434 EXPORT_SYMBOL(sock_queue_rcv_skb);
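/* Illustrative caller pattern (sketch, not tied to a particular protocol):
 * on failure the skb is not consumed, so the receive path frees it itself:
 *
 *	err = sock_queue_rcv_skb(sk, skb);
 *	if (err < 0)
 *		kfree_skb(skb);
 */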
435 
436 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
437 {
438 	int rc = NET_RX_SUCCESS;
439 
440 	if (sk_filter(sk, skb))
441 		goto discard_and_relse;
442 
443 	skb->dev = NULL;
444 
445 	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
446 		atomic_inc(&sk->sk_drops);
447 		goto discard_and_relse;
448 	}
449 	if (nested)
450 		bh_lock_sock_nested(sk);
451 	else
452 		bh_lock_sock(sk);
453 	if (!sock_owned_by_user(sk)) {
454 		/*
455 		 * trylock + unlock semantics:
456 		 */
457 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
458 
459 		rc = sk_backlog_rcv(sk, skb);
460 
461 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
462 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
463 		bh_unlock_sock(sk);
464 		atomic_inc(&sk->sk_drops);
465 		goto discard_and_relse;
466 	}
467 
468 	bh_unlock_sock(sk);
469 out:
470 	sock_put(sk);
471 	return rc;
472 discard_and_relse:
473 	kfree_skb(skb);
474 	goto out;
475 }
476 EXPORT_SYMBOL(sk_receive_skb);
477 
478 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
479 {
480 	struct dst_entry *dst = __sk_dst_get(sk);
481 
482 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
483 		sk_tx_queue_clear(sk);
484 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
485 		dst_release(dst);
486 		return NULL;
487 	}
488 
489 	return dst;
490 }
491 EXPORT_SYMBOL(__sk_dst_check);
492 
493 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
494 {
495 	struct dst_entry *dst = sk_dst_get(sk);
496 
497 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
498 		sk_dst_reset(sk);
499 		dst_release(dst);
500 		return NULL;
501 	}
502 
503 	return dst;
504 }
505 EXPORT_SYMBOL(sk_dst_check);
506 
507 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
508 				int optlen)
509 {
510 	int ret = -ENOPROTOOPT;
511 #ifdef CONFIG_NETDEVICES
512 	struct net *net = sock_net(sk);
513 	char devname[IFNAMSIZ];
514 	int index;
515 
516 	/* Sorry... */
517 	ret = -EPERM;
518 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
519 		goto out;
520 
521 	ret = -EINVAL;
522 	if (optlen < 0)
523 		goto out;
524 
525 	/* Bind this socket to a particular device like "eth0",
526 	 * as specified in the passed interface name. If the
527 	 * name is "" or the option length is zero the socket
528 	 * is not bound.
529 	 */
530 	if (optlen > IFNAMSIZ - 1)
531 		optlen = IFNAMSIZ - 1;
532 	memset(devname, 0, sizeof(devname));
533 
534 	ret = -EFAULT;
535 	if (copy_from_user(devname, optval, optlen))
536 		goto out;
537 
538 	index = 0;
539 	if (devname[0] != '\0') {
540 		struct net_device *dev;
541 
542 		rcu_read_lock();
543 		dev = dev_get_by_name_rcu(net, devname);
544 		if (dev)
545 			index = dev->ifindex;
546 		rcu_read_unlock();
547 		ret = -ENODEV;
548 		if (!dev)
549 			goto out;
550 	}
551 
552 	lock_sock(sk);
553 	sk->sk_bound_dev_if = index;
554 	sk_dst_reset(sk);
555 	release_sock(sk);
556 
557 	ret = 0;
558 
559 out:
560 #endif
561 
562 	return ret;
563 }
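/* Illustrative userspace view (sketch): binding to "eth0" requires
 * CAP_NET_RAW in the socket's network namespace; an empty name clears
 * the binding again:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 */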
564 
565 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
566 				int __user *optlen, int len)
567 {
568 	int ret = -ENOPROTOOPT;
569 #ifdef CONFIG_NETDEVICES
570 	struct net *net = sock_net(sk);
571 	char devname[IFNAMSIZ];
572 
573 	if (sk->sk_bound_dev_if == 0) {
574 		len = 0;
575 		goto zero;
576 	}
577 
578 	ret = -EINVAL;
579 	if (len < IFNAMSIZ)
580 		goto out;
581 
582 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
583 	if (ret)
584 		goto out;
585 
586 	len = strlen(devname) + 1;
587 
588 	ret = -EFAULT;
589 	if (copy_to_user(optval, devname, len))
590 		goto out;
591 
592 zero:
593 	ret = -EFAULT;
594 	if (put_user(len, optlen))
595 		goto out;
596 
597 	ret = 0;
598 
599 out:
600 #endif
601 
602 	return ret;
603 }
604 
605 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
606 {
607 	if (valbool)
608 		sock_set_flag(sk, bit);
609 	else
610 		sock_reset_flag(sk, bit);
611 }
612 
613 /*
614  *	This is meant for all protocols to use and covers goings on
615  *	at the socket level. Everything here is generic.
616  */
617 
618 int sock_setsockopt(struct socket *sock, int level, int optname,
619 		    char __user *optval, unsigned int optlen)
620 {
621 	struct sock *sk = sock->sk;
622 	int val;
623 	int valbool;
624 	struct linger ling;
625 	int ret = 0;
626 
627 	/*
628 	 *	Options without arguments
629 	 */
630 
631 	if (optname == SO_BINDTODEVICE)
632 		return sock_setbindtodevice(sk, optval, optlen);
633 
634 	if (optlen < sizeof(int))
635 		return -EINVAL;
636 
637 	if (get_user(val, (int __user *)optval))
638 		return -EFAULT;
639 
640 	valbool = val ? 1 : 0;
641 
642 	lock_sock(sk);
643 
644 	switch (optname) {
645 	case SO_DEBUG:
646 		if (val && !capable(CAP_NET_ADMIN))
647 			ret = -EACCES;
648 		else
649 			sock_valbool_flag(sk, SOCK_DBG, valbool);
650 		break;
651 	case SO_REUSEADDR:
652 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
653 		break;
654 	case SO_REUSEPORT:
655 		sk->sk_reuseport = valbool;
656 		break;
657 	case SO_TYPE:
658 	case SO_PROTOCOL:
659 	case SO_DOMAIN:
660 	case SO_ERROR:
661 		ret = -ENOPROTOOPT;
662 		break;
663 	case SO_DONTROUTE:
664 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
665 		break;
666 	case SO_BROADCAST:
667 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
668 		break;
669 	case SO_SNDBUF:
670 		/* Don't error on this; BSD doesn't, and if you think
671 		 * about it, this is right. Otherwise apps have to
672 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
673 		 * are treated in BSD as hints.
674 		 */
675 		val = min_t(u32, val, sysctl_wmem_max);
676 set_sndbuf:
677 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
678 		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
679 		/* Wake up sending tasks if we upped the value. */
680 		sk->sk_write_space(sk);
681 		break;
682 
683 	case SO_SNDBUFFORCE:
684 		if (!capable(CAP_NET_ADMIN)) {
685 			ret = -EPERM;
686 			break;
687 		}
688 		goto set_sndbuf;
689 
690 	case SO_RCVBUF:
691 		/* Don't error on this; BSD doesn't, and if you think
692 		 * about it, this is right. Otherwise apps have to
693 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
694 		 * are treated in BSD as hints.
695 		 */
696 		val = min_t(u32, val, sysctl_rmem_max);
697 set_rcvbuf:
698 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
699 		/*
700 		 * We double it on the way in to account for
701 		 * "struct sk_buff" etc. overhead.   Applications
702 		 * assume that the SO_RCVBUF setting they make will
703 		 * allow that much actual data to be received on that
704 		 * socket.
705 		 *
706 		 * Applications are unaware that "struct sk_buff" and
707 		 * other overheads allocate from the receive buffer
708 		 * during socket buffer allocation.
709 		 *
710 		 * And after considering the possible alternatives,
711 		 * returning the value we actually used in getsockopt
712 		 * is the most desirable behavior.
713 		 */
714 		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
715 		break;
716 
717 	case SO_RCVBUFFORCE:
718 		if (!capable(CAP_NET_ADMIN)) {
719 			ret = -EPERM;
720 			break;
721 		}
722 		goto set_rcvbuf;
723 
724 	case SO_KEEPALIVE:
725 #ifdef CONFIG_INET
726 		if (sk->sk_protocol == IPPROTO_TCP &&
727 		    sk->sk_type == SOCK_STREAM)
728 			tcp_set_keepalive(sk, valbool);
729 #endif
730 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
731 		break;
732 
733 	case SO_OOBINLINE:
734 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
735 		break;
736 
737 	case SO_NO_CHECK:
738 		sk->sk_no_check = valbool;
739 		break;
740 
741 	case SO_PRIORITY:
742 		if ((val >= 0 && val <= 6) ||
743 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
744 			sk->sk_priority = val;
745 		else
746 			ret = -EPERM;
747 		break;
748 
749 	case SO_LINGER:
750 		if (optlen < sizeof(ling)) {
751 			ret = -EINVAL;	/* 1003.1g */
752 			break;
753 		}
754 		if (copy_from_user(&ling, optval, sizeof(ling))) {
755 			ret = -EFAULT;
756 			break;
757 		}
758 		if (!ling.l_onoff)
759 			sock_reset_flag(sk, SOCK_LINGER);
760 		else {
761 #if (BITS_PER_LONG == 32)
762 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
763 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
764 			else
765 #endif
766 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
767 			sock_set_flag(sk, SOCK_LINGER);
768 		}
769 		break;
770 
771 	case SO_BSDCOMPAT:
772 		sock_warn_obsolete_bsdism("setsockopt");
773 		break;
774 
775 	case SO_PASSCRED:
776 		if (valbool)
777 			set_bit(SOCK_PASSCRED, &sock->flags);
778 		else
779 			clear_bit(SOCK_PASSCRED, &sock->flags);
780 		break;
781 
782 	case SO_TIMESTAMP:
783 	case SO_TIMESTAMPNS:
784 		if (valbool)  {
785 			if (optname == SO_TIMESTAMP)
786 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
787 			else
788 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
789 			sock_set_flag(sk, SOCK_RCVTSTAMP);
790 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
791 		} else {
792 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
793 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
794 		}
795 		break;
796 
797 	case SO_TIMESTAMPING:
798 		if (val & ~SOF_TIMESTAMPING_MASK) {
799 			ret = -EINVAL;
800 			break;
801 		}
802 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
803 				  val & SOF_TIMESTAMPING_TX_HARDWARE);
804 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
805 				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
806 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
807 				  val & SOF_TIMESTAMPING_RX_HARDWARE);
808 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
809 			sock_enable_timestamp(sk,
810 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
811 		else
812 			sock_disable_timestamp(sk,
813 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
814 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
815 				  val & SOF_TIMESTAMPING_SOFTWARE);
816 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
817 				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
818 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
819 				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
820 		break;
821 
822 	case SO_RCVLOWAT:
823 		if (val < 0)
824 			val = INT_MAX;
825 		sk->sk_rcvlowat = val ? : 1;
826 		break;
827 
828 	case SO_RCVTIMEO:
829 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
830 		break;
831 
832 	case SO_SNDTIMEO:
833 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
834 		break;
835 
836 	case SO_ATTACH_FILTER:
837 		ret = -EINVAL;
838 		if (optlen == sizeof(struct sock_fprog)) {
839 			struct sock_fprog fprog;
840 
841 			ret = -EFAULT;
842 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
843 				break;
844 
845 			ret = sk_attach_filter(&fprog, sk);
846 		}
847 		break;
848 
849 	case SO_DETACH_FILTER:
850 		ret = sk_detach_filter(sk);
851 		break;
852 
853 	case SO_LOCK_FILTER:
854 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
855 			ret = -EPERM;
856 		else
857 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
858 		break;
859 
860 	case SO_PASSSEC:
861 		if (valbool)
862 			set_bit(SOCK_PASSSEC, &sock->flags);
863 		else
864 			clear_bit(SOCK_PASSSEC, &sock->flags);
865 		break;
866 	case SO_MARK:
867 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
868 			ret = -EPERM;
869 		else
870 			sk->sk_mark = val;
871 		break;
872 
873 		/* We implement SO_SNDLOWAT etc. as not settable
874 		 * (1003.1g 5.3); they fall through to the default case. */
875 	case SO_RXQ_OVFL:
876 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
877 		break;
878 
879 	case SO_WIFI_STATUS:
880 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
881 		break;
882 
883 	case SO_PEEK_OFF:
884 		if (sock->ops->set_peek_off)
885 			ret = sock->ops->set_peek_off(sk, val);
886 		else
887 			ret = -EOPNOTSUPP;
888 		break;
889 
890 	case SO_NOFCS:
891 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
892 		break;
893 
894 	case SO_SELECT_ERR_QUEUE:
895 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
896 		break;
897 
898 #ifdef CONFIG_NET_RX_BUSY_POLL
899 	case SO_BUSY_POLL:
900 		/* allow unprivileged users to decrease the value */
901 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
902 			ret = -EPERM;
903 		else {
904 			if (val < 0)
905 				ret = -EINVAL;
906 			else
907 				sk->sk_ll_usec = val;
908 		}
909 		break;
910 #endif
911 
912 	case SO_MAX_PACING_RATE:
913 		sk->sk_max_pacing_rate = val;
914 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
915 					 sk->sk_max_pacing_rate);
916 		break;
917 
918 	default:
919 		ret = -ENOPROTOOPT;
920 		break;
921 	}
922 	release_sock(sk);
923 	return ret;
924 }
925 EXPORT_SYMBOL(sock_setsockopt);
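/* Illustrative effect of the SO_RCVBUF doubling above (sketch, assuming the
 * requested value is below sysctl_rmem_max): a userspace
 *
 *	int val = 65536;
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *
 * leaves sk->sk_rcvbuf at 131072, and a later getsockopt(SO_RCVBUF)
 * reports the doubled value; the extra headroom accounts for struct
 * sk_buff and other per-packet overhead.
 */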
926 
927 
928 void cred_to_ucred(struct pid *pid, const struct cred *cred,
929 		   struct ucred *ucred)
930 {
931 	ucred->pid = pid_vnr(pid);
932 	ucred->uid = ucred->gid = -1;
933 	if (cred) {
934 		struct user_namespace *current_ns = current_user_ns();
935 
936 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
937 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
938 	}
939 }
940 EXPORT_SYMBOL_GPL(cred_to_ucred);
941 
942 int sock_getsockopt(struct socket *sock, int level, int optname,
943 		    char __user *optval, int __user *optlen)
944 {
945 	struct sock *sk = sock->sk;
946 
947 	union {
948 		int val;
949 		struct linger ling;
950 		struct timeval tm;
951 	} v;
952 
953 	int lv = sizeof(int);
954 	int len;
955 
956 	if (get_user(len, optlen))
957 		return -EFAULT;
958 	if (len < 0)
959 		return -EINVAL;
960 
961 	memset(&v, 0, sizeof(v));
962 
963 	switch (optname) {
964 	case SO_DEBUG:
965 		v.val = sock_flag(sk, SOCK_DBG);
966 		break;
967 
968 	case SO_DONTROUTE:
969 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
970 		break;
971 
972 	case SO_BROADCAST:
973 		v.val = sock_flag(sk, SOCK_BROADCAST);
974 		break;
975 
976 	case SO_SNDBUF:
977 		v.val = sk->sk_sndbuf;
978 		break;
979 
980 	case SO_RCVBUF:
981 		v.val = sk->sk_rcvbuf;
982 		break;
983 
984 	case SO_REUSEADDR:
985 		v.val = sk->sk_reuse;
986 		break;
987 
988 	case SO_REUSEPORT:
989 		v.val = sk->sk_reuseport;
990 		break;
991 
992 	case SO_KEEPALIVE:
993 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
994 		break;
995 
996 	case SO_TYPE:
997 		v.val = sk->sk_type;
998 		break;
999 
1000 	case SO_PROTOCOL:
1001 		v.val = sk->sk_protocol;
1002 		break;
1003 
1004 	case SO_DOMAIN:
1005 		v.val = sk->sk_family;
1006 		break;
1007 
1008 	case SO_ERROR:
1009 		v.val = -sock_error(sk);
1010 		if (v.val == 0)
1011 			v.val = xchg(&sk->sk_err_soft, 0);
1012 		break;
1013 
1014 	case SO_OOBINLINE:
1015 		v.val = sock_flag(sk, SOCK_URGINLINE);
1016 		break;
1017 
1018 	case SO_NO_CHECK:
1019 		v.val = sk->sk_no_check;
1020 		break;
1021 
1022 	case SO_PRIORITY:
1023 		v.val = sk->sk_priority;
1024 		break;
1025 
1026 	case SO_LINGER:
1027 		lv		= sizeof(v.ling);
1028 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1029 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1030 		break;
1031 
1032 	case SO_BSDCOMPAT:
1033 		sock_warn_obsolete_bsdism("getsockopt");
1034 		break;
1035 
1036 	case SO_TIMESTAMP:
1037 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1038 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1039 		break;
1040 
1041 	case SO_TIMESTAMPNS:
1042 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1043 		break;
1044 
1045 	case SO_TIMESTAMPING:
1046 		v.val = 0;
1047 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1048 			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1049 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1050 			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1051 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1052 			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1053 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1054 			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1055 		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1056 			v.val |= SOF_TIMESTAMPING_SOFTWARE;
1057 		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
1058 			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
1059 		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1060 			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1061 		break;
1062 
1063 	case SO_RCVTIMEO:
1064 		lv = sizeof(struct timeval);
1065 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1066 			v.tm.tv_sec = 0;
1067 			v.tm.tv_usec = 0;
1068 		} else {
1069 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1070 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1071 		}
1072 		break;
1073 
1074 	case SO_SNDTIMEO:
1075 		lv = sizeof(struct timeval);
1076 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1077 			v.tm.tv_sec = 0;
1078 			v.tm.tv_usec = 0;
1079 		} else {
1080 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1081 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1082 		}
1083 		break;
1084 
1085 	case SO_RCVLOWAT:
1086 		v.val = sk->sk_rcvlowat;
1087 		break;
1088 
1089 	case SO_SNDLOWAT:
1090 		v.val = 1;
1091 		break;
1092 
1093 	case SO_PASSCRED:
1094 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1095 		break;
1096 
1097 	case SO_PEERCRED:
1098 	{
1099 		struct ucred peercred;
1100 		if (len > sizeof(peercred))
1101 			len = sizeof(peercred);
1102 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1103 		if (copy_to_user(optval, &peercred, len))
1104 			return -EFAULT;
1105 		goto lenout;
1106 	}
1107 
1108 	case SO_PEERNAME:
1109 	{
1110 		char address[128];
1111 
1112 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1113 			return -ENOTCONN;
1114 		if (lv < len)
1115 			return -EINVAL;
1116 		if (copy_to_user(optval, address, len))
1117 			return -EFAULT;
1118 		goto lenout;
1119 	}
1120 
1121 	/* Dubious BSD thing... Probably nobody even uses it, but
1122 	 * the UNIX standard wants it for whatever reason... -DaveM
1123 	 */
1124 	case SO_ACCEPTCONN:
1125 		v.val = sk->sk_state == TCP_LISTEN;
1126 		break;
1127 
1128 	case SO_PASSSEC:
1129 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1130 		break;
1131 
1132 	case SO_PEERSEC:
1133 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1134 
1135 	case SO_MARK:
1136 		v.val = sk->sk_mark;
1137 		break;
1138 
1139 	case SO_RXQ_OVFL:
1140 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1141 		break;
1142 
1143 	case SO_WIFI_STATUS:
1144 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1145 		break;
1146 
1147 	case SO_PEEK_OFF:
1148 		if (!sock->ops->set_peek_off)
1149 			return -EOPNOTSUPP;
1150 
1151 		v.val = sk->sk_peek_off;
1152 		break;
1153 	case SO_NOFCS:
1154 		v.val = sock_flag(sk, SOCK_NOFCS);
1155 		break;
1156 
1157 	case SO_BINDTODEVICE:
1158 		return sock_getbindtodevice(sk, optval, optlen, len);
1159 
1160 	case SO_GET_FILTER:
1161 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1162 		if (len < 0)
1163 			return len;
1164 
1165 		goto lenout;
1166 
1167 	case SO_LOCK_FILTER:
1168 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1169 		break;
1170 
1171 	case SO_SELECT_ERR_QUEUE:
1172 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1173 		break;
1174 
1175 #ifdef CONFIG_NET_RX_BUSY_POLL
1176 	case SO_BUSY_POLL:
1177 		v.val = sk->sk_ll_usec;
1178 		break;
1179 #endif
1180 
1181 	case SO_MAX_PACING_RATE:
1182 		v.val = sk->sk_max_pacing_rate;
1183 		break;
1184 
1185 	default:
1186 		return -ENOPROTOOPT;
1187 	}
1188 
1189 	if (len > lv)
1190 		len = lv;
1191 	if (copy_to_user(optval, &v, len))
1192 		return -EFAULT;
1193 lenout:
1194 	if (put_user(len, optlen))
1195 		return -EFAULT;
1196 	return 0;
1197 }
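/* Illustrative userspace view of the SO_PEERCRED branch above (sketch),
 * chiefly useful on connected AF_UNIX sockets:
 *
 *	struct ucred cr;
 *	socklen_t len = sizeof(cr);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &len) == 0)
 *		printf("peer pid=%d uid=%u gid=%u\n", cr.pid, cr.uid, cr.gid);
 */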
1198 
1199 /*
1200  * Initialize an sk_lock.
1201  *
1202  * (We also register the sk_lock with the lock validator.)
1203  */
1204 static inline void sock_lock_init(struct sock *sk)
1205 {
1206 	sock_lock_init_class_and_name(sk,
1207 			af_family_slock_key_strings[sk->sk_family],
1208 			af_family_slock_keys + sk->sk_family,
1209 			af_family_key_strings[sk->sk_family],
1210 			af_family_keys + sk->sk_family);
1211 }
1212 
1213 /*
1214  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1215  * even temporarily, because of RCU lookups. sk_node should also be left as-is.
1216  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1217  */
1218 static void sock_copy(struct sock *nsk, const struct sock *osk)
1219 {
1220 #ifdef CONFIG_SECURITY_NETWORK
1221 	void *sptr = nsk->sk_security;
1222 #endif
1223 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1224 
1225 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1226 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1227 
1228 #ifdef CONFIG_SECURITY_NETWORK
1229 	nsk->sk_security = sptr;
1230 	security_sk_clone(osk, nsk);
1231 #endif
1232 }
1233 
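/* Used as the ->clear_sk hook by protocols (e.g. UDP) that keep sockets on
 * two nulls hash lists.  Everything is zeroed except the two list 'next'
 * pointers; the intent, roughly, is that lockless lookups walking a
 * recycled SLAB_DESTROY_BY_RCU object still see a valid (possibly nulls)
 * pointer rather than NULL.  See also the __GFP_ZERO handling in
 * sk_prot_alloc() below.
 */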
1234 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1235 {
1236 	unsigned long nulls1, nulls2;
1237 
1238 	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1239 	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1240 	if (nulls1 > nulls2)
1241 		swap(nulls1, nulls2);
1242 
1243 	if (nulls1 != 0)
1244 		memset((char *)sk, 0, nulls1);
1245 	memset((char *)sk + nulls1 + sizeof(void *), 0,
1246 	       nulls2 - nulls1 - sizeof(void *));
1247 	memset((char *)sk + nulls2 + sizeof(void *), 0,
1248 	       size - nulls2 - sizeof(void *));
1249 }
1250 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1251 
1252 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1253 		int family)
1254 {
1255 	struct sock *sk;
1256 	struct kmem_cache *slab;
1257 
1258 	slab = prot->slab;
1259 	if (slab != NULL) {
1260 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1261 		if (!sk)
1262 			return sk;
1263 		if (priority & __GFP_ZERO) {
1264 			if (prot->clear_sk)
1265 				prot->clear_sk(sk, prot->obj_size);
1266 			else
1267 				sk_prot_clear_nulls(sk, prot->obj_size);
1268 		}
1269 	} else
1270 		sk = kmalloc(prot->obj_size, priority);
1271 
1272 	if (sk != NULL) {
1273 		kmemcheck_annotate_bitfield(sk, flags);
1274 
1275 		if (security_sk_alloc(sk, family, priority))
1276 			goto out_free;
1277 
1278 		if (!try_module_get(prot->owner))
1279 			goto out_free_sec;
1280 		sk_tx_queue_clear(sk);
1281 	}
1282 
1283 	return sk;
1284 
1285 out_free_sec:
1286 	security_sk_free(sk);
1287 out_free:
1288 	if (slab != NULL)
1289 		kmem_cache_free(slab, sk);
1290 	else
1291 		kfree(sk);
1292 	return NULL;
1293 }
1294 
1295 static void sk_prot_free(struct proto *prot, struct sock *sk)
1296 {
1297 	struct kmem_cache *slab;
1298 	struct module *owner;
1299 
1300 	owner = prot->owner;
1301 	slab = prot->slab;
1302 
1303 	security_sk_free(sk);
1304 	if (slab != NULL)
1305 		kmem_cache_free(slab, sk);
1306 	else
1307 		kfree(sk);
1308 	module_put(owner);
1309 }
1310 
1311 #if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1312 void sock_update_classid(struct sock *sk)
1313 {
1314 	u32 classid;
1315 
1316 	classid = task_cls_classid(current);
1317 	if (classid != sk->sk_classid)
1318 		sk->sk_classid = classid;
1319 }
1320 EXPORT_SYMBOL(sock_update_classid);
1321 #endif
1322 
1323 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1324 void sock_update_netprioidx(struct sock *sk)
1325 {
1326 	if (in_interrupt())
1327 		return;
1328 
1329 	sk->sk_cgrp_prioidx = task_netprioidx(current);
1330 }
1331 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1332 #endif
1333 
1334 /**
1335  *	sk_alloc - All socket objects are allocated here
1336  *	@net: the applicable net namespace
1337  *	@family: protocol family
1338  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1339  *	@prot: struct proto associated with this new sock instance
1340  */
1341 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1342 		      struct proto *prot)
1343 {
1344 	struct sock *sk;
1345 
1346 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1347 	if (sk) {
1348 		sk->sk_family = family;
1349 		/*
1350 		 * See comment in struct sock definition to understand
1351 		 * why we need sk_prot_creator -acme
1352 		 */
1353 		sk->sk_prot = sk->sk_prot_creator = prot;
1354 		sock_lock_init(sk);
1355 		sock_net_set(sk, get_net(net));
1356 		atomic_set(&sk->sk_wmem_alloc, 1);
1357 
1358 		sock_update_classid(sk);
1359 		sock_update_netprioidx(sk);
1360 	}
1361 
1362 	return sk;
1363 }
1364 EXPORT_SYMBOL(sk_alloc);
1365 
1366 static void __sk_free(struct sock *sk)
1367 {
1368 	struct sk_filter *filter;
1369 
1370 	if (sk->sk_destruct)
1371 		sk->sk_destruct(sk);
1372 
1373 	filter = rcu_dereference_check(sk->sk_filter,
1374 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1375 	if (filter) {
1376 		sk_filter_uncharge(sk, filter);
1377 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1378 	}
1379 
1380 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1381 
1382 	if (atomic_read(&sk->sk_omem_alloc))
1383 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1384 			 __func__, atomic_read(&sk->sk_omem_alloc));
1385 
1386 	if (sk->sk_peer_cred)
1387 		put_cred(sk->sk_peer_cred);
1388 	put_pid(sk->sk_peer_pid);
1389 	put_net(sock_net(sk));
1390 	sk_prot_free(sk->sk_prot_creator, sk);
1391 }
1392 
1393 void sk_free(struct sock *sk)
1394 {
1395 	/*
1396 	 * We subtract one from sk_wmem_alloc so we can tell whether
1397 	 * some packets are still in some tx queue.
1398 	 * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
1399 	 */
1400 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1401 		__sk_free(sk);
1402 }
1403 EXPORT_SYMBOL(sk_free);
1404 
1405 /*
1406  * The last sock_put should drop the reference to sk->sk_net. It has already
1407  * been dropped in sk_change_net. Taking a reference on the stopping namespace
1408  * is not an option.
1409  * Take a reference on the socket to remove it from the hash while still
1410  * _alive_, and after that destroy it in the context of init_net.
1411  */
1412 void sk_release_kernel(struct sock *sk)
1413 {
1414 	if (sk == NULL || sk->sk_socket == NULL)
1415 		return;
1416 
1417 	sock_hold(sk);
1418 	sock_release(sk->sk_socket);
1419 	release_net(sock_net(sk));
1420 	sock_net_set(sk, get_net(&init_net));
1421 	sock_put(sk);
1422 }
1423 EXPORT_SYMBOL(sk_release_kernel);
1424 
1425 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1426 {
1427 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1428 		sock_update_memcg(newsk);
1429 }
1430 
1431 /**
1432  *	sk_clone_lock - clone a socket, and lock its clone
1433  *	@sk: the socket to clone
1434  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1435  *
1436  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1437  */
1438 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1439 {
1440 	struct sock *newsk;
1441 
1442 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1443 	if (newsk != NULL) {
1444 		struct sk_filter *filter;
1445 
1446 		sock_copy(newsk, sk);
1447 
1448 		/* SANITY */
1449 		get_net(sock_net(newsk));
1450 		sk_node_init(&newsk->sk_node);
1451 		sock_lock_init(newsk);
1452 		bh_lock_sock(newsk);
1453 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1454 		newsk->sk_backlog.len = 0;
1455 
1456 		atomic_set(&newsk->sk_rmem_alloc, 0);
1457 		/*
1458 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1459 		 */
1460 		atomic_set(&newsk->sk_wmem_alloc, 1);
1461 		atomic_set(&newsk->sk_omem_alloc, 0);
1462 		skb_queue_head_init(&newsk->sk_receive_queue);
1463 		skb_queue_head_init(&newsk->sk_write_queue);
1464 #ifdef CONFIG_NET_DMA
1465 		skb_queue_head_init(&newsk->sk_async_wait_queue);
1466 #endif
1467 
1468 		spin_lock_init(&newsk->sk_dst_lock);
1469 		rwlock_init(&newsk->sk_callback_lock);
1470 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1471 				af_callback_keys + newsk->sk_family,
1472 				af_family_clock_key_strings[newsk->sk_family]);
1473 
1474 		newsk->sk_dst_cache	= NULL;
1475 		newsk->sk_wmem_queued	= 0;
1476 		newsk->sk_forward_alloc = 0;
1477 		newsk->sk_send_head	= NULL;
1478 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1479 
1480 		sock_reset_flag(newsk, SOCK_DONE);
1481 		skb_queue_head_init(&newsk->sk_error_queue);
1482 
1483 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1484 		if (filter != NULL)
1485 			sk_filter_charge(newsk, filter);
1486 
1487 		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1488 			/* It is still a raw copy of the parent, so invalidate
1489 			 * the destructor and do a plain sk_free(). */
1490 			newsk->sk_destruct = NULL;
1491 			bh_unlock_sock(newsk);
1492 			sk_free(newsk);
1493 			newsk = NULL;
1494 			goto out;
1495 		}
1496 
1497 		newsk->sk_err	   = 0;
1498 		newsk->sk_priority = 0;
1499 		/*
1500 		 * Before updating sk_refcnt, we must commit prior changes to memory
1501 		 * (see Documentation/RCU/rculist_nulls.txt for details)
1502 		 */
1503 		smp_wmb();
1504 		atomic_set(&newsk->sk_refcnt, 2);
1505 
1506 		/*
1507 		 * Increment the counter in the same struct proto as the master
1508 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1509 		 * is the same as sk->sk_prot->socks, as this field was copied
1510 		 * with memcpy).
1511 		 *
1512 		 * This _changes_ the previous behaviour, where
1513 		 * tcp_create_openreq_child was always incrementing the
1514 		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1515 		 * to be taken into account in all callers. -acme
1516 		 */
1517 		sk_refcnt_debug_inc(newsk);
1518 		sk_set_socket(newsk, NULL);
1519 		newsk->sk_wq = NULL;
1520 
1521 		sk_update_clone(sk, newsk);
1522 
1523 		if (newsk->sk_prot->sockets_allocated)
1524 			sk_sockets_allocated_inc(newsk);
1525 
1526 		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1527 			net_enable_timestamp();
1528 	}
1529 out:
1530 	return newsk;
1531 }
1532 EXPORT_SYMBOL_GPL(sk_clone_lock);
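/* Illustrative usage sketch (hypothetical caller, e.g. a protocol creating a
 * child socket from a listener):
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		... initialise protocol-private state on newsk ...
 *		bh_unlock_sock(newsk);
 *	}
 *
 * As the kernel-doc above notes, the clone is returned locked, so the
 * caller must bh_unlock_sock() it even on its own error paths.
 */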
1533 
1534 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1535 {
1536 	__sk_dst_set(sk, dst);
1537 	sk->sk_route_caps = dst->dev->features;
1538 	if (sk->sk_route_caps & NETIF_F_GSO)
1539 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1540 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1541 	if (sk_can_gso(sk)) {
1542 		if (dst->header_len) {
1543 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1544 		} else {
1545 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1546 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1547 			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1548 		}
1549 	}
1550 }
1551 EXPORT_SYMBOL_GPL(sk_setup_caps);
1552 
1553 /*
1554  *	Simple resource managers for sockets.
1555  */
1556 
1557 
1558 /*
1559  * Write buffer destructor automatically called from kfree_skb.
1560  */
1561 void sock_wfree(struct sk_buff *skb)
1562 {
1563 	struct sock *sk = skb->sk;
1564 	unsigned int len = skb->truesize;
1565 
1566 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1567 		/*
1568 		 * Keep a reference on sk_wmem_alloc; it will be released
1569 		 * after the sk_write_space() call.
1570 		 */
1571 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1572 		sk->sk_write_space(sk);
1573 		len = 1;
1574 	}
1575 	/*
1576 	 * If sk_wmem_alloc reaches 0, we must finish what sk_free()
1577 	 * could not do because of in-flight packets.
1578 	 */
1579 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1580 		__sk_free(sk);
1581 }
1582 EXPORT_SYMBOL(sock_wfree);
1583 
1584 void skb_orphan_partial(struct sk_buff *skb)
1585 {
1586 	/* The TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1587 	 * so we do not completely orphan the skb, but transfer all
1588 	 * accounted bytes but one, to avoid unexpected reorders.
1589 	 */
1590 	if (skb->destructor == sock_wfree
1591 #ifdef CONFIG_INET
1592 	    || skb->destructor == tcp_wfree
1593 #endif
1594 		) {
1595 		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1596 		skb->truesize = 1;
1597 	} else {
1598 		skb_orphan(skb);
1599 	}
1600 }
1601 EXPORT_SYMBOL(skb_orphan_partial);
1602 
1603 /*
1604  * Read buffer destructor automatically called from kfree_skb.
1605  */
1606 void sock_rfree(struct sk_buff *skb)
1607 {
1608 	struct sock *sk = skb->sk;
1609 	unsigned int len = skb->truesize;
1610 
1611 	atomic_sub(len, &sk->sk_rmem_alloc);
1612 	sk_mem_uncharge(sk, len);
1613 }
1614 EXPORT_SYMBOL(sock_rfree);
1615 
1616 void sock_edemux(struct sk_buff *skb)
1617 {
1618 	struct sock *sk = skb->sk;
1619 
1620 #ifdef CONFIG_INET
1621 	if (sk->sk_state == TCP_TIME_WAIT)
1622 		inet_twsk_put(inet_twsk(sk));
1623 	else
1624 #endif
1625 		sock_put(sk);
1626 }
1627 EXPORT_SYMBOL(sock_edemux);
1628 
1629 kuid_t sock_i_uid(struct sock *sk)
1630 {
1631 	kuid_t uid;
1632 
1633 	read_lock_bh(&sk->sk_callback_lock);
1634 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1635 	read_unlock_bh(&sk->sk_callback_lock);
1636 	return uid;
1637 }
1638 EXPORT_SYMBOL(sock_i_uid);
1639 
1640 unsigned long sock_i_ino(struct sock *sk)
1641 {
1642 	unsigned long ino;
1643 
1644 	read_lock_bh(&sk->sk_callback_lock);
1645 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1646 	read_unlock_bh(&sk->sk_callback_lock);
1647 	return ino;
1648 }
1649 EXPORT_SYMBOL(sock_i_ino);
1650 
1651 /*
1652  * Allocate a skb from the socket's send buffer.
1653  */
1654 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1655 			     gfp_t priority)
1656 {
1657 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1658 		struct sk_buff *skb = alloc_skb(size, priority);
1659 		if (skb) {
1660 			skb_set_owner_w(skb, sk);
1661 			return skb;
1662 		}
1663 	}
1664 	return NULL;
1665 }
1666 EXPORT_SYMBOL(sock_wmalloc);
1667 
1668 /*
1669  * Allocate a skb from the socket's receive buffer.
1670  */
1671 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1672 			     gfp_t priority)
1673 {
1674 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1675 		struct sk_buff *skb = alloc_skb(size, priority);
1676 		if (skb) {
1677 			skb_set_owner_r(skb, sk);
1678 			return skb;
1679 		}
1680 	}
1681 	return NULL;
1682 }
1683 
1684 /*
1685  * Allocate a memory block from the socket's option memory buffer.
1686  */
1687 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1688 {
1689 	if ((unsigned int)size <= sysctl_optmem_max &&
1690 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1691 		void *mem;
1692 		/* First do the add, to avoid the race if kmalloc
1693 		 * might sleep.
1694 		 */
1695 		atomic_add(size, &sk->sk_omem_alloc);
1696 		mem = kmalloc(size, priority);
1697 		if (mem)
1698 			return mem;
1699 		atomic_sub(size, &sk->sk_omem_alloc);
1700 	}
1701 	return NULL;
1702 }
1703 EXPORT_SYMBOL(sock_kmalloc);
1704 
1705 /*
1706  * Free an option memory block.
1707  */
1708 void sock_kfree_s(struct sock *sk, void *mem, int size)
1709 {
1710 	kfree(mem);
1711 	atomic_sub(size, &sk->sk_omem_alloc);
1712 }
1713 EXPORT_SYMBOL(sock_kfree_s);
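/* Illustrative paired usage (sketch; struct my_opt is hypothetical):
 *
 *	struct my_opt *opt;
 *
 *	opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 *
 * The size passed to sock_kfree_s() must match what was charged by
 * sock_kmalloc() so that sk_omem_alloc balances back to zero.
 */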
1714 
1715 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1716    I think these locks should be removed for datagram sockets.
1717  */
1718 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1719 {
1720 	DEFINE_WAIT(wait);
1721 
1722 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1723 	for (;;) {
1724 		if (!timeo)
1725 			break;
1726 		if (signal_pending(current))
1727 			break;
1728 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1729 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1730 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1731 			break;
1732 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1733 			break;
1734 		if (sk->sk_err)
1735 			break;
1736 		timeo = schedule_timeout(timeo);
1737 	}
1738 	finish_wait(sk_sleep(sk), &wait);
1739 	return timeo;
1740 }
1741 
1742 
1743 /*
1744  *	Generic send/receive buffer handlers
1745  */
1746 
1747 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1748 				     unsigned long data_len, int noblock,
1749 				     int *errcode, int max_page_order)
1750 {
1751 	struct sk_buff *skb = NULL;
1752 	unsigned long chunk;
1753 	gfp_t gfp_mask;
1754 	long timeo;
1755 	int err;
1756 	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1757 	struct page *page;
1758 	int i;
1759 
1760 	err = -EMSGSIZE;
1761 	if (npages > MAX_SKB_FRAGS)
1762 		goto failure;
1763 
1764 	timeo = sock_sndtimeo(sk, noblock);
1765 	while (!skb) {
1766 		err = sock_error(sk);
1767 		if (err != 0)
1768 			goto failure;
1769 
1770 		err = -EPIPE;
1771 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1772 			goto failure;
1773 
1774 		if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
1775 			set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1776 			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1777 			err = -EAGAIN;
1778 			if (!timeo)
1779 				goto failure;
1780 			if (signal_pending(current))
1781 				goto interrupted;
1782 			timeo = sock_wait_for_wmem(sk, timeo);
1783 			continue;
1784 		}
1785 
1786 		err = -ENOBUFS;
1787 		gfp_mask = sk->sk_allocation;
1788 		if (gfp_mask & __GFP_WAIT)
1789 			gfp_mask |= __GFP_REPEAT;
1790 
1791 		skb = alloc_skb(header_len, gfp_mask);
1792 		if (!skb)
1793 			goto failure;
1794 
1795 		skb->truesize += data_len;
1796 
1797 		for (i = 0; npages > 0; i++) {
1798 			int order = max_page_order;
1799 
1800 			while (order) {
1801 				if (npages >= 1 << order) {
1802 					page = alloc_pages(sk->sk_allocation |
1803 							   __GFP_COMP | __GFP_NOWARN,
1804 							   order);
1805 					if (page)
1806 						goto fill_page;
1807 				}
1808 				order--;
1809 			}
1810 			page = alloc_page(sk->sk_allocation);
1811 			if (!page)
1812 				goto failure;
1813 fill_page:
1814 			chunk = min_t(unsigned long, data_len,
1815 				      PAGE_SIZE << order);
1816 			skb_fill_page_desc(skb, i, page, 0, chunk);
1817 			data_len -= chunk;
1818 			npages -= 1 << order;
1819 		}
1820 	}
1821 
1822 	skb_set_owner_w(skb, sk);
1823 	return skb;
1824 
1825 interrupted:
1826 	err = sock_intr_errno(timeo);
1827 failure:
1828 	kfree_skb(skb);
1829 	*errcode = err;
1830 	return NULL;
1831 }
1832 EXPORT_SYMBOL(sock_alloc_send_pskb);
1833 
1834 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1835 				    int noblock, int *errcode)
1836 {
1837 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1838 }
1839 EXPORT_SYMBOL(sock_alloc_send_skb);
1840 
1841 /* On 32bit arches, an skb frag is limited to 2^15 */
1842 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
1843 
1844 /**
1845  * skb_page_frag_refill - check that a page_frag contains enough room
1846  * @sz: minimum size of the fragment we want to get
1847  * @pfrag: pointer to page_frag
1848  * @prio: priority for memory allocation
1849  *
1850  * Note: While this allocator tries to use high-order pages, there is
1851  * no guarantee that allocations succeed. Therefore, @sz MUST be
1852  * less than or equal to PAGE_SIZE.
1853  */
1854 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
1855 {
1856 	int order;
1857 
1858 	if (pfrag->page) {
1859 		if (atomic_read(&pfrag->page->_count) == 1) {
1860 			pfrag->offset = 0;
1861 			return true;
1862 		}
1863 		if (pfrag->offset + sz <= pfrag->size)
1864 			return true;
1865 		put_page(pfrag->page);
1866 	}
1867 
1868 	/* We restrict high order allocations to users that can afford to wait */
1869 	order = (prio & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1870 
1871 	do {
1872 		gfp_t gfp = prio;
1873 
1874 		if (order)
1875 			gfp |= __GFP_COMP | __GFP_NOWARN;
1876 		pfrag->page = alloc_pages(gfp, order);
1877 		if (likely(pfrag->page)) {
1878 			pfrag->offset = 0;
1879 			pfrag->size = PAGE_SIZE << order;
1880 			return true;
1881 		}
1882 	} while (--order >= 0);
1883 
1884 	return false;
1885 }
1886 EXPORT_SYMBOL(skb_page_frag_refill);
1887 
1888 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1889 {
1890 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1891 		return true;
1892 
1893 	sk_enter_memory_pressure(sk);
1894 	sk_stream_moderate_sndbuf(sk);
1895 	return false;
1896 }
1897 EXPORT_SYMBOL(sk_page_frag_refill);
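/* Illustrative sendmsg-style usage (sketch; the copy step is schematic and
 * 'wait_for_memory' is a hypothetical label):
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, bytes_left, pfrag->size - pfrag->offset);
 *	... copy 'copy' bytes from the iovec into pfrag->page at pfrag->offset ...
 *	pfrag->offset += copy;
 *
 * This mirrors how stream protocols fill socket page fragments; on failure
 * the socket has already been pushed into memory pressure and its send
 * buffer moderated, so callers typically wait for memory and retry.
 */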
1898 
1899 static void __lock_sock(struct sock *sk)
1900 	__releases(&sk->sk_lock.slock)
1901 	__acquires(&sk->sk_lock.slock)
1902 {
1903 	DEFINE_WAIT(wait);
1904 
1905 	for (;;) {
1906 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1907 					TASK_UNINTERRUPTIBLE);
1908 		spin_unlock_bh(&sk->sk_lock.slock);
1909 		schedule();
1910 		spin_lock_bh(&sk->sk_lock.slock);
1911 		if (!sock_owned_by_user(sk))
1912 			break;
1913 	}
1914 	finish_wait(&sk->sk_lock.wq, &wait);
1915 }
1916 
1917 static void __release_sock(struct sock *sk)
1918 	__releases(&sk->sk_lock.slock)
1919 	__acquires(&sk->sk_lock.slock)
1920 {
1921 	struct sk_buff *skb = sk->sk_backlog.head;
1922 
1923 	do {
1924 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1925 		bh_unlock_sock(sk);
1926 
1927 		do {
1928 			struct sk_buff *next = skb->next;
1929 
1930 			prefetch(next);
1931 			WARN_ON_ONCE(skb_dst_is_noref(skb));
1932 			skb->next = NULL;
1933 			sk_backlog_rcv(sk, skb);
1934 
1935 			/*
1936 			 * We are in process context here with softirqs
1937 			 * disabled, use cond_resched_softirq() to preempt.
1938 			 * This is safe to do because we've taken the backlog
1939 			 * queue private:
1940 			 */
1941 			cond_resched_softirq();
1942 
1943 			skb = next;
1944 		} while (skb != NULL);
1945 
1946 		bh_lock_sock(sk);
1947 	} while ((skb = sk->sk_backlog.head) != NULL);
1948 
1949 	/*
1950 	 * Doing the zeroing here guarantees that we cannot loop forever
1951 	 * while a wild producer attempts to flood us.
1952 	 */
1953 	sk->sk_backlog.len = 0;
1954 }
1955 
1956 /**
1957  * sk_wait_data - wait for data to arrive at sk_receive_queue
1958  * @sk:    sock to wait on
1959  * @timeo: maximum time to wait, in jiffies; updated with the time remaining
1960  *
1961  * Socket state, including sk->sk_err, is changed only under the socket
1962  * lock, hence we may omit checks after joining the wait queue.
1963  * We check the receive queue before schedule() only as an optimization;
1964  * it is very likely that release_sock() added new data.
1965  */
1966 int sk_wait_data(struct sock *sk, long *timeo)
1967 {
1968 	int rc;
1969 	DEFINE_WAIT(wait);
1970 
1971 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1972 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1973 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1974 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1975 	finish_wait(sk_sleep(sk), &wait);
1976 	return rc;
1977 }
1978 EXPORT_SYMBOL(sk_wait_data);
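
/*
 * Illustrative sketch (editorial note): sk_wait_data() is called with the
 * socket locked, typically from a blocking recvmsg() loop:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (sk->sk_err || !timeo)
 *			break;
 *		if (signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo);
 *	}
 *
 * The socket lock is dropped and re-taken inside sk_wait_event(), so the
 * backlog is processed while we sleep.
 */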
1979 
1980 /**
1981  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1982  *	@sk: socket
1983  *	@size: memory size to allocate
1984  *	@kind: allocation type
1985  *
1986  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1987  *	rmem allocation. This function assumes that protocols which have
1988  *	memory_pressure use sk_wmem_queued as write buffer accounting.
1989  */
1990 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1991 {
1992 	struct proto *prot = sk->sk_prot;
1993 	int amt = sk_mem_pages(size);
1994 	long allocated;
1995 	int parent_status = UNDER_LIMIT;
1996 
1997 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1998 
1999 	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
2000 
2001 	/* Under limit. */
2002 	if (parent_status == UNDER_LIMIT &&
2003 			allocated <= sk_prot_mem_limits(sk, 0)) {
2004 		sk_leave_memory_pressure(sk);
2005 		return 1;
2006 	}
2007 
2008 	/* Under pressure. (we or our parents) */
2009 	if ((parent_status > SOFT_LIMIT) ||
2010 			allocated > sk_prot_mem_limits(sk, 1))
2011 		sk_enter_memory_pressure(sk);
2012 
2013 	/* Over hard limit (we or our parents) */
2014 	if ((parent_status == OVER_LIMIT) ||
2015 			(allocated > sk_prot_mem_limits(sk, 2)))
2016 		goto suppress_allocation;
2017 
2018 	/* guarantee minimum buffer size under pressure */
2019 	if (kind == SK_MEM_RECV) {
2020 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2021 			return 1;
2022 
2023 	} else { /* SK_MEM_SEND */
2024 		if (sk->sk_type == SOCK_STREAM) {
2025 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2026 				return 1;
2027 		} else if (atomic_read(&sk->sk_wmem_alloc) <
2028 			   prot->sysctl_wmem[0])
2029 				return 1;
2030 	}
2031 
2032 	if (sk_has_memory_pressure(sk)) {
2033 		int alloc;
2034 
2035 		if (!sk_under_memory_pressure(sk))
2036 			return 1;
2037 		alloc = sk_sockets_allocated_read_positive(sk);
2038 		if (sk_prot_mem_limits(sk, 2) > alloc *
2039 		    sk_mem_pages(sk->sk_wmem_queued +
2040 				 atomic_read(&sk->sk_rmem_alloc) +
2041 				 sk->sk_forward_alloc))
2042 			return 1;
2043 	}
2044 
2045 suppress_allocation:
2046 
2047 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2048 		sk_stream_moderate_sndbuf(sk);
2049 
2050 		/* Fail only if the socket is _under_ its sndbuf.
2051 		 * In this case we cannot block, so we have to fail.
2052 		 */
2053 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2054 			return 1;
2055 	}
2056 
2057 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2058 
2059 	/* Alas. Undo changes. */
2060 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2061 
2062 	sk_memory_allocated_sub(sk, amt);
2063 
2064 	return 0;
2065 }
2066 EXPORT_SYMBOL(__sk_mem_schedule);
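
/*
 * Illustrative sketch (editorial note): protocols normally reach this slow
 * path through the sk_wmem_schedule()/sk_rmem_schedule() inlines in
 * include/net/sock.h, which first try to satisfy the request from the
 * per-socket sk_forward_alloc cache. Open-coded, the receive side looks
 * roughly like:
 *
 *	if (skb->truesize > sk->sk_forward_alloc &&
 *	    !__sk_mem_schedule(sk, skb->truesize, SK_MEM_RECV))
 *		goto drop;
 *	sk->sk_forward_alloc -= skb->truesize;	(i.e. sk_mem_charge())
 *	... queue the skb ...
 */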
2067 
2068 /**
2069  *	__sk_mem_reclaim - reclaim memory_allocated
2070  *	@sk: socket
2071  */
2072 void __sk_mem_reclaim(struct sock *sk)
2073 {
2074 	sk_memory_allocated_sub(sk,
2075 				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2076 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2077 
2078 	if (sk_under_memory_pressure(sk) &&
2079 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2080 		sk_leave_memory_pressure(sk);
2081 }
2082 EXPORT_SYMBOL(__sk_mem_reclaim);
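
/*
 * Illustrative sketch (editorial note): the usual entry point is the
 * sk_mem_reclaim() inline, which only takes this slow path once at least
 * one full SK_MEM_QUANTUM is cached in sk_forward_alloc, e.g. when a
 * receive buffer is freed:
 *
 *	sk->sk_forward_alloc += skb->truesize;	(i.e. sk_mem_uncharge())
 *	if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
 *		__sk_mem_reclaim(sk);		(i.e. sk_mem_reclaim())
 */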
2083 
2084 
2085 /*
2086  * Set of default routines for initialising struct proto_ops when
2087  * the protocol does not support a particular function. In certain
2088  * cases where it makes no sense for a protocol to have a "do nothing"
2089  * function, some default processing is provided.
2090  */
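
/*
 * Illustrative sketch (editorial note): a protocol that implements only a
 * subset of the proto_ops interface plugs these stubs into the remaining
 * slots; all "example_" names below are hypothetical:
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.release	= example_release,
 *		.bind		= example_bind,
 *		.sendmsg	= example_sendmsg,
 *		.recvmsg	= example_recvmsg,
 *		.poll		= datagram_poll,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.getname	= sock_no_getname,
 *		.ioctl		= sock_no_ioctl,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.setsockopt	= sock_no_setsockopt,
 *		.getsockopt	= sock_no_getsockopt,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */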
2091 
2092 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2093 {
2094 	return -EOPNOTSUPP;
2095 }
2096 EXPORT_SYMBOL(sock_no_bind);
2097 
2098 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2099 		    int len, int flags)
2100 {
2101 	return -EOPNOTSUPP;
2102 }
2103 EXPORT_SYMBOL(sock_no_connect);
2104 
2105 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2106 {
2107 	return -EOPNOTSUPP;
2108 }
2109 EXPORT_SYMBOL(sock_no_socketpair);
2110 
2111 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2112 {
2113 	return -EOPNOTSUPP;
2114 }
2115 EXPORT_SYMBOL(sock_no_accept);
2116 
2117 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2118 		    int *len, int peer)
2119 {
2120 	return -EOPNOTSUPP;
2121 }
2122 EXPORT_SYMBOL(sock_no_getname);
2123 
2124 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2125 {
2126 	return 0;
2127 }
2128 EXPORT_SYMBOL(sock_no_poll);
2129 
2130 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2131 {
2132 	return -EOPNOTSUPP;
2133 }
2134 EXPORT_SYMBOL(sock_no_ioctl);
2135 
2136 int sock_no_listen(struct socket *sock, int backlog)
2137 {
2138 	return -EOPNOTSUPP;
2139 }
2140 EXPORT_SYMBOL(sock_no_listen);
2141 
2142 int sock_no_shutdown(struct socket *sock, int how)
2143 {
2144 	return -EOPNOTSUPP;
2145 }
2146 EXPORT_SYMBOL(sock_no_shutdown);
2147 
2148 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2149 		    char __user *optval, unsigned int optlen)
2150 {
2151 	return -EOPNOTSUPP;
2152 }
2153 EXPORT_SYMBOL(sock_no_setsockopt);
2154 
2155 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2156 		    char __user *optval, int __user *optlen)
2157 {
2158 	return -EOPNOTSUPP;
2159 }
2160 EXPORT_SYMBOL(sock_no_getsockopt);
2161 
2162 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2163 		    size_t len)
2164 {
2165 	return -EOPNOTSUPP;
2166 }
2167 EXPORT_SYMBOL(sock_no_sendmsg);
2168 
2169 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2170 		    size_t len, int flags)
2171 {
2172 	return -EOPNOTSUPP;
2173 }
2174 EXPORT_SYMBOL(sock_no_recvmsg);
2175 
2176 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2177 {
2178 	/* Mirror missing mmap method error code */
2179 	return -ENODEV;
2180 }
2181 EXPORT_SYMBOL(sock_no_mmap);
2182 
2183 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2184 {
2185 	ssize_t res;
2186 	struct msghdr msg = {.msg_flags = flags};
2187 	struct kvec iov;
2188 	char *kaddr = kmap(page);
2189 	iov.iov_base = kaddr + offset;
2190 	iov.iov_len = size;
2191 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2192 	kunmap(page);
2193 	return res;
2194 }
2195 EXPORT_SYMBOL(sock_no_sendpage);
2196 
2197 /*
2198  *	Default Socket Callbacks
2199  */
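
/*
 * Illustrative sketch (editorial note): in-kernel socket users that need
 * their own notifications replace these callbacks under sk_callback_lock;
 * the "my_" names are hypothetical:
 *
 *	write_lock_bh(&sk->sk_callback_lock);
 *	sk->sk_user_data  = my_state;
 *	sk->sk_data_ready = my_data_ready;
 *	write_unlock_bh(&sk->sk_callback_lock);
 */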
2200 
2201 static void sock_def_wakeup(struct sock *sk)
2202 {
2203 	struct socket_wq *wq;
2204 
2205 	rcu_read_lock();
2206 	wq = rcu_dereference(sk->sk_wq);
2207 	if (wq_has_sleeper(wq))
2208 		wake_up_interruptible_all(&wq->wait);
2209 	rcu_read_unlock();
2210 }
2211 
2212 static void sock_def_error_report(struct sock *sk)
2213 {
2214 	struct socket_wq *wq;
2215 
2216 	rcu_read_lock();
2217 	wq = rcu_dereference(sk->sk_wq);
2218 	if (wq_has_sleeper(wq))
2219 		wake_up_interruptible_poll(&wq->wait, POLLERR);
2220 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2221 	rcu_read_unlock();
2222 }
2223 
2224 static void sock_def_readable(struct sock *sk, int len)
2225 {
2226 	struct socket_wq *wq;
2227 
2228 	rcu_read_lock();
2229 	wq = rcu_dereference(sk->sk_wq);
2230 	if (wq_has_sleeper(wq))
2231 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2232 						POLLRDNORM | POLLRDBAND);
2233 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2234 	rcu_read_unlock();
2235 }
2236 
2237 static void sock_def_write_space(struct sock *sk)
2238 {
2239 	struct socket_wq *wq;
2240 
2241 	rcu_read_lock();
2242 
2243 	/* Do not wake up a writer until he can make "significant"
2244 	 * progress.  --DaveM
2245 	 */
2246 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2247 		wq = rcu_dereference(sk->sk_wq);
2248 		if (wq_has_sleeper(wq))
2249 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2250 						POLLWRNORM | POLLWRBAND);
2251 
2252 		/* Should agree with poll, otherwise some programs break */
2253 		if (sock_writeable(sk))
2254 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2255 	}
2256 
2257 	rcu_read_unlock();
2258 }
2259 
2260 static void sock_def_destruct(struct sock *sk)
2261 {
2262 	kfree(sk->sk_protinfo);
2263 }
2264 
2265 void sk_send_sigurg(struct sock *sk)
2266 {
2267 	if (sk->sk_socket && sk->sk_socket->file)
2268 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2269 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2270 }
2271 EXPORT_SYMBOL(sk_send_sigurg);
2272 
2273 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2274 		    unsigned long expires)
2275 {
2276 	if (!mod_timer(timer, expires))
2277 		sock_hold(sk);
2278 }
2279 EXPORT_SYMBOL(sk_reset_timer);
2280 
2281 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2282 {
2283 	if (del_timer(timer))
2284 		__sock_put(sk);
2285 }
2286 EXPORT_SYMBOL(sk_stop_timer);
2287 
2288 void sock_init_data(struct socket *sock, struct sock *sk)
2289 {
2290 	skb_queue_head_init(&sk->sk_receive_queue);
2291 	skb_queue_head_init(&sk->sk_write_queue);
2292 	skb_queue_head_init(&sk->sk_error_queue);
2293 #ifdef CONFIG_NET_DMA
2294 	skb_queue_head_init(&sk->sk_async_wait_queue);
2295 #endif
2296 
2297 	sk->sk_send_head	=	NULL;
2298 
2299 	init_timer(&sk->sk_timer);
2300 
2301 	sk->sk_allocation	=	GFP_KERNEL;
2302 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2303 	sk->sk_sndbuf		=	sysctl_wmem_default;
2304 	sk->sk_state		=	TCP_CLOSE;
2305 	sk_set_socket(sk, sock);
2306 
2307 	sock_set_flag(sk, SOCK_ZAPPED);
2308 
2309 	if (sock) {
2310 		sk->sk_type	=	sock->type;
2311 		sk->sk_wq	=	sock->wq;
2312 		sock->sk	=	sk;
2313 	} else
2314 		sk->sk_wq	=	NULL;
2315 
2316 	spin_lock_init(&sk->sk_dst_lock);
2317 	rwlock_init(&sk->sk_callback_lock);
2318 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2319 			af_callback_keys + sk->sk_family,
2320 			af_family_clock_key_strings[sk->sk_family]);
2321 
2322 	sk->sk_state_change	=	sock_def_wakeup;
2323 	sk->sk_data_ready	=	sock_def_readable;
2324 	sk->sk_write_space	=	sock_def_write_space;
2325 	sk->sk_error_report	=	sock_def_error_report;
2326 	sk->sk_destruct		=	sock_def_destruct;
2327 
2328 	sk->sk_frag.page	=	NULL;
2329 	sk->sk_frag.offset	=	0;
2330 	sk->sk_peek_off		=	-1;
2331 
2332 	sk->sk_peer_pid 	=	NULL;
2333 	sk->sk_peer_cred	=	NULL;
2334 	sk->sk_write_pending	=	0;
2335 	sk->sk_rcvlowat		=	1;
2336 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2337 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2338 
2339 	sk->sk_stamp = ktime_set(-1L, 0);
2340 
2341 #ifdef CONFIG_NET_RX_BUSY_POLL
2342 	sk->sk_napi_id		=	0;
2343 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2344 #endif
2345 
2346 	sk->sk_max_pacing_rate = ~0U;
2347 	sk->sk_pacing_rate = ~0U;
2348 	/*
2349 	 * Before updating sk_refcnt, we must commit prior changes to memory
2350 	 * (Documentation/RCU/rculist_nulls.txt for details)
2351 	 */
2352 	smp_wmb();
2353 	atomic_set(&sk->sk_refcnt, 1);
2354 	atomic_set(&sk->sk_drops, 0);
2355 }
2356 EXPORT_SYMBOL(sock_init_data);
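
/*
 * Illustrative sketch (editorial note): sock_init_data() is normally called
 * right after sk_alloc() in a protocol family's ->create() handler; the
 * "example"-prefixed names are hypothetical:
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock->ops = &example_dgram_ops;
 *	sock_init_data(sock, sk);
 *	sk->sk_protocol = protocol;
 *	... protocol-private initialisation ...
 */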
2357 
2358 void lock_sock_nested(struct sock *sk, int subclass)
2359 {
2360 	might_sleep();
2361 	spin_lock_bh(&sk->sk_lock.slock);
2362 	if (sk->sk_lock.owned)
2363 		__lock_sock(sk);
2364 	sk->sk_lock.owned = 1;
2365 	spin_unlock(&sk->sk_lock.slock);
2366 	/*
2367 	 * The sk_lock has mutex_lock() semantics here:
2368 	 */
2369 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2370 	local_bh_enable();
2371 }
2372 EXPORT_SYMBOL(lock_sock_nested);
2373 
2374 void release_sock(struct sock *sk)
2375 {
2376 	/*
2377 	 * The sk_lock has mutex_unlock() semantics:
2378 	 */
2379 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2380 
2381 	spin_lock_bh(&sk->sk_lock.slock);
2382 	if (sk->sk_backlog.tail)
2383 		__release_sock(sk);
2384 
2385 	if (sk->sk_prot->release_cb)
2386 		sk->sk_prot->release_cb(sk);
2387 
2388 	sk->sk_lock.owned = 0;
2389 	if (waitqueue_active(&sk->sk_lock.wq))
2390 		wake_up(&sk->sk_lock.wq);
2391 	spin_unlock_bh(&sk->sk_lock.slock);
2392 }
2393 EXPORT_SYMBOL(release_sock);
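
/*
 * Illustrative sketch (editorial note): process-context code brackets its
 * socket accesses with lock_sock()/release_sock(); packets that arrived in
 * softirq context meanwhile sit on the backlog and are run by
 * __release_sock() above when the lock is dropped:
 *
 *	lock_sock(sk);
 *	... modify socket state, may sleep ...
 *	release_sock(sk);
 */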
2394 
2395 /**
2396  * lock_sock_fast - fast version of lock_sock
2397  * @sk: socket
2398  *
2399  * This version should be used for very small sections where the process
2400  * won't block. Returns false if the fast path is taken:
2401  *   sk_lock.slock locked, owned = 0, BH disabled
2402  * Returns true if the slow path is taken:
2403  *   sk_lock.slock unlocked, owned = 1, BH enabled
2404  */
2405 bool lock_sock_fast(struct sock *sk)
2406 {
2407 	might_sleep();
2408 	spin_lock_bh(&sk->sk_lock.slock);
2409 
2410 	if (!sk->sk_lock.owned)
2411 		/*
2412 		 * Note: BH stays disabled; unlock_sock_fast() re-enables it on this path.
2413 		 */
2414 		return false;
2415 
2416 	__lock_sock(sk);
2417 	sk->sk_lock.owned = 1;
2418 	spin_unlock(&sk->sk_lock.slock);
2419 	/*
2420 	 * The sk_lock has mutex_lock() semantics here:
2421 	 */
2422 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2423 	local_bh_enable();
2424 	return true;
2425 }
2426 EXPORT_SYMBOL(lock_sock_fast);
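
/*
 * Illustrative sketch (editorial note): the fast lock is paired with
 * unlock_sock_fast() and is meant for short, non-sleeping sections:
 *
 *	bool slow = lock_sock_fast(sk);
 *	... quick work on sk, no blocking ...
 *	unlock_sock_fast(sk, slow);
 */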
2427 
2428 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2429 {
2430 	struct timeval tv;
2431 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2432 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2433 	tv = ktime_to_timeval(sk->sk_stamp);
2434 	if (tv.tv_sec == -1)
2435 		return -ENOENT;
2436 	if (tv.tv_sec == 0) {
2437 		sk->sk_stamp = ktime_get_real();
2438 		tv = ktime_to_timeval(sk->sk_stamp);
2439 	}
2440 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2441 }
2442 EXPORT_SYMBOL(sock_get_timestamp);
2443 
2444 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2445 {
2446 	struct timespec ts;
2447 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2448 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2449 	ts = ktime_to_timespec(sk->sk_stamp);
2450 	if (ts.tv_sec == -1)
2451 		return -ENOENT;
2452 	if (ts.tv_sec == 0) {
2453 		sk->sk_stamp = ktime_get_real();
2454 		ts = ktime_to_timespec(sk->sk_stamp);
2455 	}
2456 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2457 }
2458 EXPORT_SYMBOL(sock_get_timestampns);
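
/*
 * Illustrative sketch (editorial note): these two helpers typically back the
 * SIOCGSTAMP/SIOCGSTAMPNS cases of a protocol's ->ioctl() handler:
 *
 *	case SIOCGSTAMP:
 *		return sock_get_timestamp(sk, (struct timeval __user *)arg);
 *	case SIOCGSTAMPNS:
 *		return sock_get_timestampns(sk, (struct timespec __user *)arg);
 */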
2459 
2460 void sock_enable_timestamp(struct sock *sk, int flag)
2461 {
2462 	if (!sock_flag(sk, flag)) {
2463 		unsigned long previous_flags = sk->sk_flags;
2464 
2465 		sock_set_flag(sk, flag);
2466 		/*
2467 		 * we just set one of the two flags which require net
2468 		 * time stamping, but time stamping might have been on
2469 		 * already because of the other one
2470 		 */
2471 		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2472 			net_enable_timestamp();
2473 	}
2474 }
2475 
2476 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2477 		       int level, int type)
2478 {
2479 	struct sock_exterr_skb *serr;
2480 	struct sk_buff *skb, *skb2;
2481 	int copied, err;
2482 
2483 	err = -EAGAIN;
2484 	skb = skb_dequeue(&sk->sk_error_queue);
2485 	if (skb == NULL)
2486 		goto out;
2487 
2488 	copied = skb->len;
2489 	if (copied > len) {
2490 		msg->msg_flags |= MSG_TRUNC;
2491 		copied = len;
2492 	}
2493 	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2494 	if (err)
2495 		goto out_free_skb;
2496 
2497 	sock_recv_timestamp(msg, sk, skb);
2498 
2499 	serr = SKB_EXT_ERR(skb);
2500 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2501 
2502 	msg->msg_flags |= MSG_ERRQUEUE;
2503 	err = copied;
2504 
2505 	/* Reset and regenerate socket error */
2506 	spin_lock_bh(&sk->sk_error_queue.lock);
2507 	sk->sk_err = 0;
2508 	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2509 		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2510 		spin_unlock_bh(&sk->sk_error_queue.lock);
2511 		sk->sk_error_report(sk);
2512 	} else
2513 		spin_unlock_bh(&sk->sk_error_queue.lock);
2514 
2515 out_free_skb:
2516 	kfree_skb(skb);
2517 out:
2518 	return err;
2519 }
2520 EXPORT_SYMBOL(sock_recv_errqueue);
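
/*
 * Illustrative sketch (editorial note): a protocol recvmsg() implementation
 * can service MSG_ERRQUEUE requests by delegating to this helper; the cmsg
 * level/type below are placeholders for whatever the protocol reports:
 *
 *	if (flags & MSG_ERRQUEUE)
 *		return sock_recv_errqueue(sk, msg, len,
 *					  SOL_EXAMPLE, EXAMPLE_RECVERR);
 */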
2521 
2522 /*
2523  *	Get a socket option on a socket.
2524  *
2525  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2526  *	asynchronous errors should be reported by getsockopt. We assume
2527  *	this means if you specify SO_ERROR (otherwise what's the point of it).
2528  */
2529 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2530 			   char __user *optval, int __user *optlen)
2531 {
2532 	struct sock *sk = sock->sk;
2533 
2534 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2535 }
2536 EXPORT_SYMBOL(sock_common_getsockopt);
2537 
2538 #ifdef CONFIG_COMPAT
2539 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2540 				  char __user *optval, int __user *optlen)
2541 {
2542 	struct sock *sk = sock->sk;
2543 
2544 	if (sk->sk_prot->compat_getsockopt != NULL)
2545 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2546 						      optval, optlen);
2547 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2548 }
2549 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2550 #endif
2551 
2552 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2553 			struct msghdr *msg, size_t size, int flags)
2554 {
2555 	struct sock *sk = sock->sk;
2556 	int addr_len = 0;
2557 	int err;
2558 
2559 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2560 				   flags & ~MSG_DONTWAIT, &addr_len);
2561 	if (err >= 0)
2562 		msg->msg_namelen = addr_len;
2563 	return err;
2564 }
2565 EXPORT_SYMBOL(sock_common_recvmsg);
2566 
2567 /*
2568  *	Set socket options on an inet socket.
2569  */
2570 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2571 			   char __user *optval, unsigned int optlen)
2572 {
2573 	struct sock *sk = sock->sk;
2574 
2575 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2576 }
2577 EXPORT_SYMBOL(sock_common_setsockopt);
2578 
2579 #ifdef CONFIG_COMPAT
2580 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2581 				  char __user *optval, unsigned int optlen)
2582 {
2583 	struct sock *sk = sock->sk;
2584 
2585 	if (sk->sk_prot->compat_setsockopt != NULL)
2586 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2587 						      optval, optlen);
2588 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2589 }
2590 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2591 #endif
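
/*
 * Illustrative sketch (editorial note): protocols whose struct proto already
 * implements setsockopt/getsockopt/recvmsg usually just forward the
 * corresponding proto_ops slots to these common helpers:
 *
 *	.setsockopt	= sock_common_setsockopt,
 *	.getsockopt	= sock_common_getsockopt,
 *	.recvmsg	= sock_common_recvmsg,
 */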
2592 
2593 void sk_common_release(struct sock *sk)
2594 {
2595 	if (sk->sk_prot->destroy)
2596 		sk->sk_prot->destroy(sk);
2597 
2598 	/*
2599 	 * Observation: when sk_common_release() is called, processes no longer
2600 	 * have access to the socket, but the network stack still does.
2601 	 * Step one, detach it from networking:
2602 	 *
2603 	 * A. Remove from hash tables.
2604 	 */
2605 
2606 	sk->sk_prot->unhash(sk);
2607 
2608 	/*
2609 	 * At this point the socket cannot receive new packets, but it is possible
2610 	 * that some packets are still in flight, because another CPU ran the receiver
2611 	 * and did the hash table lookup before we unhashed the socket. They will
2612 	 * reach the receive queue and be purged by the socket destructor.
2613 	 *
2614 	 * We may also still have packets pending on the receive queue and, probably,
2615 	 * our own packets waiting in device queues. sock_destroy will drain the
2616 	 * receive queue, but transmitted packets will delay socket destruction
2617 	 * until the last reference is released.
2618 	 */
2619 
2620 	sock_orphan(sk);
2621 
2622 	xfrm_sk_free_policy(sk);
2623 
2624 	sk_refcnt_debug_release(sk);
2625 
2626 	if (sk->sk_frag.page) {
2627 		put_page(sk->sk_frag.page);
2628 		sk->sk_frag.page = NULL;
2629 	}
2630 
2631 	sock_put(sk);
2632 }
2633 EXPORT_SYMBOL(sk_common_release);
2634 
2635 #ifdef CONFIG_PROC_FS
2636 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
2637 struct prot_inuse {
2638 	int val[PROTO_INUSE_NR];
2639 };
2640 
2641 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2642 
2643 #ifdef CONFIG_NET_NS
2644 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2645 {
2646 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2647 }
2648 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2649 
2650 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2651 {
2652 	int cpu, idx = prot->inuse_idx;
2653 	int res = 0;
2654 
2655 	for_each_possible_cpu(cpu)
2656 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2657 
2658 	return res >= 0 ? res : 0;
2659 }
2660 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2661 
2662 static int __net_init sock_inuse_init_net(struct net *net)
2663 {
2664 	net->core.inuse = alloc_percpu(struct prot_inuse);
2665 	return net->core.inuse ? 0 : -ENOMEM;
2666 }
2667 
2668 static void __net_exit sock_inuse_exit_net(struct net *net)
2669 {
2670 	free_percpu(net->core.inuse);
2671 }
2672 
2673 static struct pernet_operations net_inuse_ops = {
2674 	.init = sock_inuse_init_net,
2675 	.exit = sock_inuse_exit_net,
2676 };
2677 
2678 static __init int net_inuse_init(void)
2679 {
2680 	if (register_pernet_subsys(&net_inuse_ops))
2681 		panic("Cannot initialize net inuse counters");
2682 
2683 	return 0;
2684 }
2685 
2686 core_initcall(net_inuse_init);
2687 #else
2688 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2689 
2690 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2691 {
2692 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2693 }
2694 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2695 
2696 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2697 {
2698 	int cpu, idx = prot->inuse_idx;
2699 	int res = 0;
2700 
2701 	for_each_possible_cpu(cpu)
2702 		res += per_cpu(prot_inuse, cpu).val[idx];
2703 
2704 	return res >= 0 ? res : 0;
2705 }
2706 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2707 #endif
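
/*
 * Illustrative sketch (editorial note): protocols update the per-cpu "inuse"
 * counters from their ->hash()/->unhash() callbacks:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	(hash)
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	(unhash)
 *
 * The sums show up in the "sockets" column of /proc/net/protocols below.
 */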
2708 
2709 static void assign_proto_idx(struct proto *prot)
2710 {
2711 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2712 
2713 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2714 		pr_err("PROTO_INUSE_NR exhausted\n");
2715 		return;
2716 	}
2717 
2718 	set_bit(prot->inuse_idx, proto_inuse_idx);
2719 }
2720 
2721 static void release_proto_idx(struct proto *prot)
2722 {
2723 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2724 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2725 }
2726 #else
2727 static inline void assign_proto_idx(struct proto *prot)
2728 {
2729 }
2730 
2731 static inline void release_proto_idx(struct proto *prot)
2732 {
2733 }
2734 #endif
2735 
2736 int proto_register(struct proto *prot, int alloc_slab)
2737 {
2738 	if (alloc_slab) {
2739 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2740 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2741 					NULL);
2742 
2743 		if (prot->slab == NULL) {
2744 			pr_crit("%s: Can't create sock SLAB cache!\n",
2745 				prot->name);
2746 			goto out;
2747 		}
2748 
2749 		if (prot->rsk_prot != NULL) {
2750 			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2751 			if (prot->rsk_prot->slab_name == NULL)
2752 				goto out_free_sock_slab;
2753 
2754 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2755 								 prot->rsk_prot->obj_size, 0,
2756 								 SLAB_HWCACHE_ALIGN, NULL);
2757 
2758 			if (prot->rsk_prot->slab == NULL) {
2759 				pr_crit("%s: Can't create request sock SLAB cache!\n",
2760 					prot->name);
2761 				goto out_free_request_sock_slab_name;
2762 			}
2763 		}
2764 
2765 		if (prot->twsk_prot != NULL) {
2766 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2767 
2768 			if (prot->twsk_prot->twsk_slab_name == NULL)
2769 				goto out_free_request_sock_slab;
2770 
2771 			prot->twsk_prot->twsk_slab =
2772 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2773 						  prot->twsk_prot->twsk_obj_size,
2774 						  0,
2775 						  SLAB_HWCACHE_ALIGN |
2776 							prot->slab_flags,
2777 						  NULL);
2778 			if (prot->twsk_prot->twsk_slab == NULL)
2779 				goto out_free_timewait_sock_slab_name;
2780 		}
2781 	}
2782 
2783 	mutex_lock(&proto_list_mutex);
2784 	list_add(&prot->node, &proto_list);
2785 	assign_proto_idx(prot);
2786 	mutex_unlock(&proto_list_mutex);
2787 	return 0;
2788 
2789 out_free_timewait_sock_slab_name:
2790 	kfree(prot->twsk_prot->twsk_slab_name);
2791 out_free_request_sock_slab:
2792 	if (prot->rsk_prot && prot->rsk_prot->slab) {
2793 		kmem_cache_destroy(prot->rsk_prot->slab);
2794 		prot->rsk_prot->slab = NULL;
2795 	}
2796 out_free_request_sock_slab_name:
2797 	if (prot->rsk_prot)
2798 		kfree(prot->rsk_prot->slab_name);
2799 out_free_sock_slab:
2800 	kmem_cache_destroy(prot->slab);
2801 	prot->slab = NULL;
2802 out:
2803 	return -ENOBUFS;
2804 }
2805 EXPORT_SYMBOL(proto_register);
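
/*
 * Illustrative sketch (editorial note): a protocol module registers its
 * struct proto at init time and unregisters it on exit; the "example" names
 * are hypothetical:
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);	(1 => allocate a slab)
 *	...
 *	proto_unregister(&example_proto);
 */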
2806 
2807 void proto_unregister(struct proto *prot)
2808 {
2809 	mutex_lock(&proto_list_mutex);
2810 	release_proto_idx(prot);
2811 	list_del(&prot->node);
2812 	mutex_unlock(&proto_list_mutex);
2813 
2814 	if (prot->slab != NULL) {
2815 		kmem_cache_destroy(prot->slab);
2816 		prot->slab = NULL;
2817 	}
2818 
2819 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2820 		kmem_cache_destroy(prot->rsk_prot->slab);
2821 		kfree(prot->rsk_prot->slab_name);
2822 		prot->rsk_prot->slab = NULL;
2823 	}
2824 
2825 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2826 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2827 		kfree(prot->twsk_prot->twsk_slab_name);
2828 		prot->twsk_prot->twsk_slab = NULL;
2829 	}
2830 }
2831 EXPORT_SYMBOL(proto_unregister);
2832 
2833 #ifdef CONFIG_PROC_FS
2834 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2835 	__acquires(proto_list_mutex)
2836 {
2837 	mutex_lock(&proto_list_mutex);
2838 	return seq_list_start_head(&proto_list, *pos);
2839 }
2840 
2841 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2842 {
2843 	return seq_list_next(v, &proto_list, pos);
2844 }
2845 
2846 static void proto_seq_stop(struct seq_file *seq, void *v)
2847 	__releases(proto_list_mutex)
2848 {
2849 	mutex_unlock(&proto_list_mutex);
2850 }
2851 
2852 static char proto_method_implemented(const void *method)
2853 {
2854 	return method == NULL ? 'n' : 'y';
2855 }
2856 static long sock_prot_memory_allocated(struct proto *proto)
2857 {
2858 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2859 }
2860 
2861 static char *sock_prot_memory_pressure(struct proto *proto)
2862 {
2863 	return proto->memory_pressure != NULL ?
2864 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2865 }
2866 
2867 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2868 {
2869 
2870 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2871 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2872 		   proto->name,
2873 		   proto->obj_size,
2874 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2875 		   sock_prot_memory_allocated(proto),
2876 		   sock_prot_memory_pressure(proto),
2877 		   proto->max_header,
2878 		   proto->slab == NULL ? "no" : "yes",
2879 		   module_name(proto->owner),
2880 		   proto_method_implemented(proto->close),
2881 		   proto_method_implemented(proto->connect),
2882 		   proto_method_implemented(proto->disconnect),
2883 		   proto_method_implemented(proto->accept),
2884 		   proto_method_implemented(proto->ioctl),
2885 		   proto_method_implemented(proto->init),
2886 		   proto_method_implemented(proto->destroy),
2887 		   proto_method_implemented(proto->shutdown),
2888 		   proto_method_implemented(proto->setsockopt),
2889 		   proto_method_implemented(proto->getsockopt),
2890 		   proto_method_implemented(proto->sendmsg),
2891 		   proto_method_implemented(proto->recvmsg),
2892 		   proto_method_implemented(proto->sendpage),
2893 		   proto_method_implemented(proto->bind),
2894 		   proto_method_implemented(proto->backlog_rcv),
2895 		   proto_method_implemented(proto->hash),
2896 		   proto_method_implemented(proto->unhash),
2897 		   proto_method_implemented(proto->get_port),
2898 		   proto_method_implemented(proto->enter_memory_pressure));
2899 }
2900 
2901 static int proto_seq_show(struct seq_file *seq, void *v)
2902 {
2903 	if (v == &proto_list)
2904 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2905 			   "protocol",
2906 			   "size",
2907 			   "sockets",
2908 			   "memory",
2909 			   "press",
2910 			   "maxhdr",
2911 			   "slab",
2912 			   "module",
2913 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2914 	else
2915 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2916 	return 0;
2917 }
2918 
2919 static const struct seq_operations proto_seq_ops = {
2920 	.start  = proto_seq_start,
2921 	.next   = proto_seq_next,
2922 	.stop   = proto_seq_stop,
2923 	.show   = proto_seq_show,
2924 };
2925 
2926 static int proto_seq_open(struct inode *inode, struct file *file)
2927 {
2928 	return seq_open_net(inode, file, &proto_seq_ops,
2929 			    sizeof(struct seq_net_private));
2930 }
2931 
2932 static const struct file_operations proto_seq_fops = {
2933 	.owner		= THIS_MODULE,
2934 	.open		= proto_seq_open,
2935 	.read		= seq_read,
2936 	.llseek		= seq_lseek,
2937 	.release	= seq_release_net,
2938 };
2939 
2940 static __net_init int proto_init_net(struct net *net)
2941 {
2942 	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2943 		return -ENOMEM;
2944 
2945 	return 0;
2946 }
2947 
2948 static __net_exit void proto_exit_net(struct net *net)
2949 {
2950 	remove_proc_entry("protocols", net->proc_net);
2951 }
2952 
2953 
2954 static __net_initdata struct pernet_operations proto_net_ops = {
2955 	.init = proto_init_net,
2956 	.exit = proto_exit_net,
2957 };
2958 
2959 static int __init proto_init(void)
2960 {
2961 	return register_pernet_subsys(&proto_net_ops);
2962 }
2963 
2964 subsys_initcall(proto_init);
2965 
2966 #endif /* PROC_FS */
2967