xref: /openbmc/linux/net/core/sock.c (revision 7490ca1e)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handlers for protocols to use, and a generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #include <linux/capability.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
96 #include <linux/in.h>
97 #include <linux/kernel.h>
98 #include <linux/module.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/mm.h>
107 #include <linux/slab.h>
108 #include <linux/interrupt.h>
109 #include <linux/poll.h>
110 #include <linux/tcp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/jump_label.h>
115 #include <linux/memcontrol.h>
116 
117 #include <asm/uaccess.h>
118 #include <asm/system.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 
132 #include <linux/filter.h>
133 
134 #include <trace/events/sock.h>
135 
136 #ifdef CONFIG_INET
137 #include <net/tcp.h>
138 #endif
139 
140 static DEFINE_MUTEX(proto_list_mutex);
141 static LIST_HEAD(proto_list);
142 
143 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
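/*
 * Walk every registered protocol and call its init_cgroup hook for the new
 * cgroup.  If one of them fails, unwind by calling destroy_cgroup on the
 * protocols that were already initialized, in reverse order.
 */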
144 int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
145 {
146 	struct proto *proto;
147 	int ret = 0;
148 
149 	mutex_lock(&proto_list_mutex);
150 	list_for_each_entry(proto, &proto_list, node) {
151 		if (proto->init_cgroup) {
152 			ret = proto->init_cgroup(cgrp, ss);
153 			if (ret)
154 				goto out;
155 		}
156 	}
157 
158 	mutex_unlock(&proto_list_mutex);
159 	return ret;
160 out:
161 	list_for_each_entry_continue_reverse(proto, &proto_list, node)
162 		if (proto->destroy_cgroup)
163 			proto->destroy_cgroup(cgrp, ss);
164 	mutex_unlock(&proto_list_mutex);
165 	return ret;
166 }
167 
168 void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss)
169 {
170 	struct proto *proto;
171 
172 	mutex_lock(&proto_list_mutex);
173 	list_for_each_entry_reverse(proto, &proto_list, node)
174 		if (proto->destroy_cgroup)
175 			proto->destroy_cgroup(cgrp, ss);
176 	mutex_unlock(&proto_list_mutex);
177 }
178 #endif
179 
180 /*
181  * Each address family might have different locking rules, so we have
182  * one slock key per address family:
183  */
184 static struct lock_class_key af_family_keys[AF_MAX];
185 static struct lock_class_key af_family_slock_keys[AF_MAX];
186 
187 struct jump_label_key memcg_socket_limit_enabled;
188 EXPORT_SYMBOL(memcg_socket_limit_enabled);
189 
190 /*
191  * Make lock validator output more readable. (We pre-construct these
192  * strings at build time, so that runtime initialization of socket
193  * locks is fast):
194  */
195 static const char *const af_family_key_strings[AF_MAX+1] = {
196   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
197   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
198   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
199   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
200   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
201   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
202   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
203   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
204   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
205   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
206   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
207   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
208   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
209   "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
210 };
211 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
212   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
213   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
214   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
215   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
216   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
217   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
218   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
219   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
220   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
221   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
222   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
223   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
224   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
225   "slock-AF_NFC"   , "slock-AF_MAX"
226 };
227 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
228   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
229   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
230   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
231   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
232   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
233   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
234   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
235   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
236   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
237   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
238   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
239   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
240   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
241   "clock-AF_NFC"   , "clock-AF_MAX"
242 };
243 
244 /*
245  * sk_callback_lock locking rules are per-address-family,
246  * so split the lock classes by using a per-AF key:
247  */
248 static struct lock_class_key af_callback_keys[AF_MAX];
249 
250 /* Take into consideration the size of the struct sk_buff overhead in the
251  * determination of these values, since that is non-constant across
252  * platforms.  This makes socket queueing behavior and performance
253  * not depend upon such differences.
254  */
255 #define _SK_MEM_PACKETS		256
256 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
257 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
258 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
259 
260 /* Run time adjustable parameters. */
261 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
262 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
263 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
264 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
265 
266 /* Maximal space eaten by iovec or ancillary data plus some space */
267 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
268 EXPORT_SYMBOL(sysctl_optmem_max);
269 
270 #if defined(CONFIG_CGROUPS)
271 #if !defined(CONFIG_NET_CLS_CGROUP)
272 int net_cls_subsys_id = -1;
273 EXPORT_SYMBOL_GPL(net_cls_subsys_id);
274 #endif
275 #if !defined(CONFIG_NETPRIO_CGROUP)
276 int net_prio_subsys_id = -1;
277 EXPORT_SYMBOL_GPL(net_prio_subsys_id);
278 #endif
279 #endif
280 
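/*
 * Convert a user-supplied struct timeval (SO_RCVTIMEO/SO_SNDTIMEO) into a
 * timeout in jiffies.  A negative tv_sec is clamped to zero (with a
 * rate-limited warning), and an all-zero timeval means "no timeout"
 * (MAX_SCHEDULE_TIMEOUT).
 */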
281 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
282 {
283 	struct timeval tv;
284 
285 	if (optlen < sizeof(tv))
286 		return -EINVAL;
287 	if (copy_from_user(&tv, optval, sizeof(tv)))
288 		return -EFAULT;
289 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
290 		return -EDOM;
291 
292 	if (tv.tv_sec < 0) {
293 		static int warned __read_mostly;
294 
295 		*timeo_p = 0;
296 		if (warned < 10 && net_ratelimit()) {
297 			warned++;
298 			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
299 			       "tries to set negative timeout\n",
300 				current->comm, task_pid_nr(current));
301 		}
302 		return 0;
303 	}
304 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
305 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
306 		return 0;
307 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
308 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
309 	return 0;
310 }
311 
312 static void sock_warn_obsolete_bsdism(const char *name)
313 {
314 	static int warned;
315 	static char warncomm[TASK_COMM_LEN];
316 	if (strcmp(warncomm, current->comm) && warned < 5) {
317 		strcpy(warncomm,  current->comm);
318 		printk(KERN_WARNING "process `%s' is using obsolete "
319 		       "%s SO_BSDCOMPAT\n", warncomm, name);
320 		warned++;
321 	}
322 }
323 
324 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
325 
326 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
327 {
328 	if (sk->sk_flags & flags) {
329 		sk->sk_flags &= ~flags;
330 		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
331 			net_disable_timestamp();
332 	}
333 }
334 
335 
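/*
 * Charge an incoming skb to the socket and append it to sk_receive_queue:
 * the receive buffer limit is checked, the socket filter is run, receive
 * memory is scheduled, and sk_data_ready() is called unless the socket is
 * already dead.
 */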
336 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
337 {
338 	int err;
339 	int skb_len;
340 	unsigned long flags;
341 	struct sk_buff_head *list = &sk->sk_receive_queue;
342 
343 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
344 		atomic_inc(&sk->sk_drops);
345 		trace_sock_rcvqueue_full(sk, skb);
346 		return -ENOMEM;
347 	}
348 
349 	err = sk_filter(sk, skb);
350 	if (err)
351 		return err;
352 
353 	if (!sk_rmem_schedule(sk, skb->truesize)) {
354 		atomic_inc(&sk->sk_drops);
355 		return -ENOBUFS;
356 	}
357 
358 	skb->dev = NULL;
359 	skb_set_owner_r(skb, sk);
360 
361 	/* Cache the SKB length before we tack it onto the receive
362 	 * queue.  Once it is added it no longer belongs to us and
363 	 * may be freed by other threads of control pulling packets
364 	 * from the queue.
365 	 */
366 	skb_len = skb->len;
367 
368 	/* We escape from the RCU-protected region, so make sure we don't
369 	 * leak a non-refcounted dst.
370 	 */
371 	skb_dst_force(skb);
372 
373 	spin_lock_irqsave(&list->lock, flags);
374 	skb->dropcount = atomic_read(&sk->sk_drops);
375 	__skb_queue_tail(list, skb);
376 	spin_unlock_irqrestore(&list->lock, flags);
377 
378 	if (!sock_flag(sk, SOCK_DEAD))
379 		sk->sk_data_ready(sk, skb_len);
380 	return 0;
381 }
382 EXPORT_SYMBOL(sock_queue_rcv_skb);
383 
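/*
 * Deliver an skb to a socket from protocol/softirq context: run the socket
 * filter, then either process the skb immediately through sk_backlog_rcv()
 * when no user context owns the socket, or park it on the backlog queue.
 * The caller's reference on the socket is dropped before returning.
 */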
384 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
385 {
386 	int rc = NET_RX_SUCCESS;
387 
388 	if (sk_filter(sk, skb))
389 		goto discard_and_relse;
390 
391 	skb->dev = NULL;
392 
393 	if (sk_rcvqueues_full(sk, skb)) {
394 		atomic_inc(&sk->sk_drops);
395 		goto discard_and_relse;
396 	}
397 	if (nested)
398 		bh_lock_sock_nested(sk);
399 	else
400 		bh_lock_sock(sk);
401 	if (!sock_owned_by_user(sk)) {
402 		/*
403 		 * trylock + unlock semantics:
404 		 */
405 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
406 
407 		rc = sk_backlog_rcv(sk, skb);
408 
409 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
410 	} else if (sk_add_backlog(sk, skb)) {
411 		bh_unlock_sock(sk);
412 		atomic_inc(&sk->sk_drops);
413 		goto discard_and_relse;
414 	}
415 
416 	bh_unlock_sock(sk);
417 out:
418 	sock_put(sk);
419 	return rc;
420 discard_and_relse:
421 	kfree_skb(skb);
422 	goto out;
423 }
424 EXPORT_SYMBOL(sk_receive_skb);
425 
426 void sk_reset_txq(struct sock *sk)
427 {
428 	sk_tx_queue_clear(sk);
429 }
430 EXPORT_SYMBOL(sk_reset_txq);
431 
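/*
 * Validate the socket's cached route.  If the dst is marked obsolete and its
 * ->check() callback no longer accepts it, clear the cache and return NULL so
 * the caller can perform a fresh route lookup.
 */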
432 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
433 {
434 	struct dst_entry *dst = __sk_dst_get(sk);
435 
436 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
437 		sk_tx_queue_clear(sk);
438 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
439 		dst_release(dst);
440 		return NULL;
441 	}
442 
443 	return dst;
444 }
445 EXPORT_SYMBOL(__sk_dst_check);
446 
447 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
448 {
449 	struct dst_entry *dst = sk_dst_get(sk);
450 
451 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
452 		sk_dst_reset(sk);
453 		dst_release(dst);
454 		return NULL;
455 	}
456 
457 	return dst;
458 }
459 EXPORT_SYMBOL(sk_dst_check);
460 
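/*
 * SO_BINDTODEVICE: look up the named interface and bind the socket to its
 * ifindex.  An empty name (or a zero option length) unbinds the socket.
 * Requires CAP_NET_RAW.
 */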
461 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
462 {
463 	int ret = -ENOPROTOOPT;
464 #ifdef CONFIG_NETDEVICES
465 	struct net *net = sock_net(sk);
466 	char devname[IFNAMSIZ];
467 	int index;
468 
469 	/* Sorry... */
470 	ret = -EPERM;
471 	if (!capable(CAP_NET_RAW))
472 		goto out;
473 
474 	ret = -EINVAL;
475 	if (optlen < 0)
476 		goto out;
477 
478 	/* Bind this socket to a particular device like "eth0",
479 	 * as specified in the passed interface name. If the
480 	 * name is "" or the option length is zero the socket
481 	 * is not bound.
482 	 */
483 	if (optlen > IFNAMSIZ - 1)
484 		optlen = IFNAMSIZ - 1;
485 	memset(devname, 0, sizeof(devname));
486 
487 	ret = -EFAULT;
488 	if (copy_from_user(devname, optval, optlen))
489 		goto out;
490 
491 	index = 0;
492 	if (devname[0] != '\0') {
493 		struct net_device *dev;
494 
495 		rcu_read_lock();
496 		dev = dev_get_by_name_rcu(net, devname);
497 		if (dev)
498 			index = dev->ifindex;
499 		rcu_read_unlock();
500 		ret = -ENODEV;
501 		if (!dev)
502 			goto out;
503 	}
504 
505 	lock_sock(sk);
506 	sk->sk_bound_dev_if = index;
507 	sk_dst_reset(sk);
508 	release_sock(sk);
509 
510 	ret = 0;
511 
512 out:
513 #endif
514 
515 	return ret;
516 }
517 
518 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
519 {
520 	if (valbool)
521 		sock_set_flag(sk, bit);
522 	else
523 		sock_reset_flag(sk, bit);
524 }
525 
526 /*
527  *	This is meant for all protocols to use and covers goings on
528  *	at the socket level. Everything here is generic.
529  */
530 
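/*
 * Illustrative userspace view (not part of this file): a call such as
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 0 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * typically reaches this function for SOL_SOCKET options and, for the
 * timeout options, ends up in sock_set_timeout() above.
 */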
531 int sock_setsockopt(struct socket *sock, int level, int optname,
532 		    char __user *optval, unsigned int optlen)
533 {
534 	struct sock *sk = sock->sk;
535 	int val;
536 	int valbool;
537 	struct linger ling;
538 	int ret = 0;
539 
540 	/*
541 	 *	Options without arguments
542 	 */
543 
544 	if (optname == SO_BINDTODEVICE)
545 		return sock_bindtodevice(sk, optval, optlen);
546 
547 	if (optlen < sizeof(int))
548 		return -EINVAL;
549 
550 	if (get_user(val, (int __user *)optval))
551 		return -EFAULT;
552 
553 	valbool = val ? 1 : 0;
554 
555 	lock_sock(sk);
556 
557 	switch (optname) {
558 	case SO_DEBUG:
559 		if (val && !capable(CAP_NET_ADMIN))
560 			ret = -EACCES;
561 		else
562 			sock_valbool_flag(sk, SOCK_DBG, valbool);
563 		break;
564 	case SO_REUSEADDR:
565 		sk->sk_reuse = valbool;
566 		break;
567 	case SO_TYPE:
568 	case SO_PROTOCOL:
569 	case SO_DOMAIN:
570 	case SO_ERROR:
571 		ret = -ENOPROTOOPT;
572 		break;
573 	case SO_DONTROUTE:
574 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
575 		break;
576 	case SO_BROADCAST:
577 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
578 		break;
579 	case SO_SNDBUF:
580 		/* Don't error on this; BSD doesn't, and if you think
581 		   about it, this is right. Otherwise apps have to
582 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
583 		   are treated in BSD as hints. */
584 
585 		if (val > sysctl_wmem_max)
586 			val = sysctl_wmem_max;
587 set_sndbuf:
588 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
589 		if ((val * 2) < SOCK_MIN_SNDBUF)
590 			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
591 		else
592 			sk->sk_sndbuf = val * 2;
593 
594 		/*
595 		 *	Wake up sending tasks if we
596 		 *	upped the value.
597 		 */
598 		sk->sk_write_space(sk);
599 		break;
600 
601 	case SO_SNDBUFFORCE:
602 		if (!capable(CAP_NET_ADMIN)) {
603 			ret = -EPERM;
604 			break;
605 		}
606 		goto set_sndbuf;
607 
608 	case SO_RCVBUF:
609 		/* Don't error on this; BSD doesn't, and if you think
610 		   about it, this is right. Otherwise apps have to
611 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
612 		   are treated in BSD as hints. */
613 
614 		if (val > sysctl_rmem_max)
615 			val = sysctl_rmem_max;
616 set_rcvbuf:
617 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
618 		/*
619 		 * We double it on the way in to account for
620 		 * "struct sk_buff" etc. overhead.   Applications
621 		 * assume that the SO_RCVBUF setting they make will
622 		 * allow that much actual data to be received on that
623 		 * socket.
624 		 *
625 		 * Applications are unaware that "struct sk_buff" and
626 		 * other overheads allocate from the receive buffer
627 		 * during socket buffer allocation.
628 		 *
629 		 * And after considering the possible alternatives,
630 		 * returning the value we actually used in getsockopt
631 		 * is the most desirable behavior.
632 		 */
633 		if ((val * 2) < SOCK_MIN_RCVBUF)
634 			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
635 		else
636 			sk->sk_rcvbuf = val * 2;
637 		break;
638 
639 	case SO_RCVBUFFORCE:
640 		if (!capable(CAP_NET_ADMIN)) {
641 			ret = -EPERM;
642 			break;
643 		}
644 		goto set_rcvbuf;
645 
646 	case SO_KEEPALIVE:
647 #ifdef CONFIG_INET
648 		if (sk->sk_protocol == IPPROTO_TCP)
649 			tcp_set_keepalive(sk, valbool);
650 #endif
651 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
652 		break;
653 
654 	case SO_OOBINLINE:
655 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
656 		break;
657 
658 	case SO_NO_CHECK:
659 		sk->sk_no_check = valbool;
660 		break;
661 
662 	case SO_PRIORITY:
663 		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
664 			sk->sk_priority = val;
665 		else
666 			ret = -EPERM;
667 		break;
668 
669 	case SO_LINGER:
670 		if (optlen < sizeof(ling)) {
671 			ret = -EINVAL;	/* 1003.1g */
672 			break;
673 		}
674 		if (copy_from_user(&ling, optval, sizeof(ling))) {
675 			ret = -EFAULT;
676 			break;
677 		}
678 		if (!ling.l_onoff)
679 			sock_reset_flag(sk, SOCK_LINGER);
680 		else {
681 #if (BITS_PER_LONG == 32)
682 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
683 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
684 			else
685 #endif
686 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
687 			sock_set_flag(sk, SOCK_LINGER);
688 		}
689 		break;
690 
691 	case SO_BSDCOMPAT:
692 		sock_warn_obsolete_bsdism("setsockopt");
693 		break;
694 
695 	case SO_PASSCRED:
696 		if (valbool)
697 			set_bit(SOCK_PASSCRED, &sock->flags);
698 		else
699 			clear_bit(SOCK_PASSCRED, &sock->flags);
700 		break;
701 
702 	case SO_TIMESTAMP:
703 	case SO_TIMESTAMPNS:
704 		if (valbool)  {
705 			if (optname == SO_TIMESTAMP)
706 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
707 			else
708 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
709 			sock_set_flag(sk, SOCK_RCVTSTAMP);
710 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
711 		} else {
712 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
713 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
714 		}
715 		break;
716 
717 	case SO_TIMESTAMPING:
718 		if (val & ~SOF_TIMESTAMPING_MASK) {
719 			ret = -EINVAL;
720 			break;
721 		}
722 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
723 				  val & SOF_TIMESTAMPING_TX_HARDWARE);
724 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
725 				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
726 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
727 				  val & SOF_TIMESTAMPING_RX_HARDWARE);
728 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
729 			sock_enable_timestamp(sk,
730 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
731 		else
732 			sock_disable_timestamp(sk,
733 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
734 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
735 				  val & SOF_TIMESTAMPING_SOFTWARE);
736 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
737 				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
738 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
739 				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
740 		break;
741 
742 	case SO_RCVLOWAT:
743 		if (val < 0)
744 			val = INT_MAX;
745 		sk->sk_rcvlowat = val ? : 1;
746 		break;
747 
748 	case SO_RCVTIMEO:
749 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
750 		break;
751 
752 	case SO_SNDTIMEO:
753 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
754 		break;
755 
756 	case SO_ATTACH_FILTER:
757 		ret = -EINVAL;
758 		if (optlen == sizeof(struct sock_fprog)) {
759 			struct sock_fprog fprog;
760 
761 			ret = -EFAULT;
762 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
763 				break;
764 
765 			ret = sk_attach_filter(&fprog, sk);
766 		}
767 		break;
768 
769 	case SO_DETACH_FILTER:
770 		ret = sk_detach_filter(sk);
771 		break;
772 
773 	case SO_PASSSEC:
774 		if (valbool)
775 			set_bit(SOCK_PASSSEC, &sock->flags);
776 		else
777 			clear_bit(SOCK_PASSSEC, &sock->flags);
778 		break;
779 	case SO_MARK:
780 		if (!capable(CAP_NET_ADMIN))
781 			ret = -EPERM;
782 		else
783 			sk->sk_mark = val;
784 		break;
785 
786 		/* We implement SO_SNDLOWAT etc. as not
787 		   settable (1003.1g 5.3). */
788 	case SO_RXQ_OVFL:
789 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
790 		break;
791 
792 	case SO_WIFI_STATUS:
793 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
794 		break;
795 
796 	default:
797 		ret = -ENOPROTOOPT;
798 		break;
799 	}
800 	release_sock(sk);
801 	return ret;
802 }
803 EXPORT_SYMBOL(sock_setsockopt);
804 
805 
806 void cred_to_ucred(struct pid *pid, const struct cred *cred,
807 		   struct ucred *ucred)
808 {
809 	ucred->pid = pid_vnr(pid);
810 	ucred->uid = ucred->gid = -1;
811 	if (cred) {
812 		struct user_namespace *current_ns = current_user_ns();
813 
814 		ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
815 		ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
816 	}
817 }
818 EXPORT_SYMBOL_GPL(cred_to_ucred);
819 
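/*
 * Generic getsockopt() handler for SOL_SOCKET options.  Most options are
 * reported as an int; SO_LINGER and the timeouts use their own structures,
 * and the copied length is truncated to what the caller asked for.
 */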
820 int sock_getsockopt(struct socket *sock, int level, int optname,
821 		    char __user *optval, int __user *optlen)
822 {
823 	struct sock *sk = sock->sk;
824 
825 	union {
826 		int val;
827 		struct linger ling;
828 		struct timeval tm;
829 	} v;
830 
831 	int lv = sizeof(int);
832 	int len;
833 
834 	if (get_user(len, optlen))
835 		return -EFAULT;
836 	if (len < 0)
837 		return -EINVAL;
838 
839 	memset(&v, 0, sizeof(v));
840 
841 	switch (optname) {
842 	case SO_DEBUG:
843 		v.val = sock_flag(sk, SOCK_DBG);
844 		break;
845 
846 	case SO_DONTROUTE:
847 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
848 		break;
849 
850 	case SO_BROADCAST:
851 		v.val = !!sock_flag(sk, SOCK_BROADCAST);
852 		break;
853 
854 	case SO_SNDBUF:
855 		v.val = sk->sk_sndbuf;
856 		break;
857 
858 	case SO_RCVBUF:
859 		v.val = sk->sk_rcvbuf;
860 		break;
861 
862 	case SO_REUSEADDR:
863 		v.val = sk->sk_reuse;
864 		break;
865 
866 	case SO_KEEPALIVE:
867 		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
868 		break;
869 
870 	case SO_TYPE:
871 		v.val = sk->sk_type;
872 		break;
873 
874 	case SO_PROTOCOL:
875 		v.val = sk->sk_protocol;
876 		break;
877 
878 	case SO_DOMAIN:
879 		v.val = sk->sk_family;
880 		break;
881 
882 	case SO_ERROR:
883 		v.val = -sock_error(sk);
884 		if (v.val == 0)
885 			v.val = xchg(&sk->sk_err_soft, 0);
886 		break;
887 
888 	case SO_OOBINLINE:
889 		v.val = !!sock_flag(sk, SOCK_URGINLINE);
890 		break;
891 
892 	case SO_NO_CHECK:
893 		v.val = sk->sk_no_check;
894 		break;
895 
896 	case SO_PRIORITY:
897 		v.val = sk->sk_priority;
898 		break;
899 
900 	case SO_LINGER:
901 		lv		= sizeof(v.ling);
902 		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
903 		v.ling.l_linger	= sk->sk_lingertime / HZ;
904 		break;
905 
906 	case SO_BSDCOMPAT:
907 		sock_warn_obsolete_bsdism("getsockopt");
908 		break;
909 
910 	case SO_TIMESTAMP:
911 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
912 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
913 		break;
914 
915 	case SO_TIMESTAMPNS:
916 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
917 		break;
918 
919 	case SO_TIMESTAMPING:
920 		v.val = 0;
921 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
922 			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
923 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
924 			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
925 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
926 			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
927 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
928 			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
929 		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
930 			v.val |= SOF_TIMESTAMPING_SOFTWARE;
931 		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
932 			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
933 		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
934 			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
935 		break;
936 
937 	case SO_RCVTIMEO:
938 		lv = sizeof(struct timeval);
939 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
940 			v.tm.tv_sec = 0;
941 			v.tm.tv_usec = 0;
942 		} else {
943 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
944 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
945 		}
946 		break;
947 
948 	case SO_SNDTIMEO:
949 		lv = sizeof(struct timeval);
950 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
951 			v.tm.tv_sec = 0;
952 			v.tm.tv_usec = 0;
953 		} else {
954 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
955 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
956 		}
957 		break;
958 
959 	case SO_RCVLOWAT:
960 		v.val = sk->sk_rcvlowat;
961 		break;
962 
963 	case SO_SNDLOWAT:
964 		v.val = 1;
965 		break;
966 
967 	case SO_PASSCRED:
968 		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
969 		break;
970 
971 	case SO_PEERCRED:
972 	{
973 		struct ucred peercred;
974 		if (len > sizeof(peercred))
975 			len = sizeof(peercred);
976 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
977 		if (copy_to_user(optval, &peercred, len))
978 			return -EFAULT;
979 		goto lenout;
980 	}
981 
982 	case SO_PEERNAME:
983 	{
984 		char address[128];
985 
986 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
987 			return -ENOTCONN;
988 		if (lv < len)
989 			return -EINVAL;
990 		if (copy_to_user(optval, address, len))
991 			return -EFAULT;
992 		goto lenout;
993 	}
994 
995 	/* Dubious BSD thing... Probably nobody even uses it, but
996 	 * the UNIX standard wants it for whatever reason... -DaveM
997 	 */
998 	case SO_ACCEPTCONN:
999 		v.val = sk->sk_state == TCP_LISTEN;
1000 		break;
1001 
1002 	case SO_PASSSEC:
1003 		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
1004 		break;
1005 
1006 	case SO_PEERSEC:
1007 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1008 
1009 	case SO_MARK:
1010 		v.val = sk->sk_mark;
1011 		break;
1012 
1013 	case SO_RXQ_OVFL:
1014 		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
1015 		break;
1016 
1017 	case SO_WIFI_STATUS:
1018 		v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
1019 		break;
1020 
1021 	default:
1022 		return -ENOPROTOOPT;
1023 	}
1024 
1025 	if (len > lv)
1026 		len = lv;
1027 	if (copy_to_user(optval, &v, len))
1028 		return -EFAULT;
1029 lenout:
1030 	if (put_user(len, optlen))
1031 		return -EFAULT;
1032 	return 0;
1033 }
1034 
1035 /*
1036  * Initialize an sk_lock.
1037  *
1038  * (We also register the sk_lock with the lock validator.)
1039  */
1040 static inline void sock_lock_init(struct sock *sk)
1041 {
1042 	sock_lock_init_class_and_name(sk,
1043 			af_family_slock_key_strings[sk->sk_family],
1044 			af_family_slock_keys + sk->sk_family,
1045 			af_family_key_strings[sk->sk_family],
1046 			af_family_keys + sk->sk_family);
1047 }
1048 
1049 /*
1050  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1051  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1052  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1053  */
1054 static void sock_copy(struct sock *nsk, const struct sock *osk)
1055 {
1056 #ifdef CONFIG_SECURITY_NETWORK
1057 	void *sptr = nsk->sk_security;
1058 #endif
1059 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1060 
1061 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1062 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1063 
1064 #ifdef CONFIG_SECURITY_NETWORK
1065 	nsk->sk_security = sptr;
1066 	security_sk_clone(osk, nsk);
1067 #endif
1068 }
1069 
1070 /*
1071  * Caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
1072  * nodes unmodified. Special care is taken when initializing the object to zero.
1073  */
1074 static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1075 {
1076 	if (offsetof(struct sock, sk_node.next) != 0)
1077 		memset(sk, 0, offsetof(struct sock, sk_node.next));
1078 	memset(&sk->sk_node.pprev, 0,
1079 	       size - offsetof(struct sock, sk_node.pprev));
1080 }
1081 
1082 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1083 {
1084 	unsigned long nulls1, nulls2;
1085 
1086 	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1087 	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1088 	if (nulls1 > nulls2)
1089 		swap(nulls1, nulls2);
1090 
1091 	if (nulls1 != 0)
1092 		memset((char *)sk, 0, nulls1);
1093 	memset((char *)sk + nulls1 + sizeof(void *), 0,
1094 	       nulls2 - nulls1 - sizeof(void *));
1095 	memset((char *)sk + nulls2 + sizeof(void *), 0,
1096 	       size - nulls2 - sizeof(void *));
1097 }
1098 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1099 
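/*
 * Allocate a struct sock either from the protocol's slab cache or via
 * kmalloc(), set up its LSM security state and pin the owning module.
 * __GFP_ZERO is handled specially for the slab case so that the RCU
 * "nulls" list pointers survive while the rest of the object is cleared.
 */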
1100 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1101 		int family)
1102 {
1103 	struct sock *sk;
1104 	struct kmem_cache *slab;
1105 
1106 	slab = prot->slab;
1107 	if (slab != NULL) {
1108 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1109 		if (!sk)
1110 			return sk;
1111 		if (priority & __GFP_ZERO) {
1112 			if (prot->clear_sk)
1113 				prot->clear_sk(sk, prot->obj_size);
1114 			else
1115 				sk_prot_clear_nulls(sk, prot->obj_size);
1116 		}
1117 	} else
1118 		sk = kmalloc(prot->obj_size, priority);
1119 
1120 	if (sk != NULL) {
1121 		kmemcheck_annotate_bitfield(sk, flags);
1122 
1123 		if (security_sk_alloc(sk, family, priority))
1124 			goto out_free;
1125 
1126 		if (!try_module_get(prot->owner))
1127 			goto out_free_sec;
1128 		sk_tx_queue_clear(sk);
1129 	}
1130 
1131 	return sk;
1132 
1133 out_free_sec:
1134 	security_sk_free(sk);
1135 out_free:
1136 	if (slab != NULL)
1137 		kmem_cache_free(slab, sk);
1138 	else
1139 		kfree(sk);
1140 	return NULL;
1141 }
1142 
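/*
 * Counterpart of sk_prot_alloc(): release the LSM state, return the sock to
 * its slab cache (or kfree() it) and drop the module reference.
 */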
1143 static void sk_prot_free(struct proto *prot, struct sock *sk)
1144 {
1145 	struct kmem_cache *slab;
1146 	struct module *owner;
1147 
1148 	owner = prot->owner;
1149 	slab = prot->slab;
1150 
1151 	security_sk_free(sk);
1152 	if (slab != NULL)
1153 		kmem_cache_free(slab, sk);
1154 	else
1155 		kfree(sk);
1156 	module_put(owner);
1157 }
1158 
1159 #ifdef CONFIG_CGROUPS
1160 void sock_update_classid(struct sock *sk)
1161 {
1162 	u32 classid;
1163 
1164 	rcu_read_lock();  /* doing current task, which cannot vanish. */
1165 	classid = task_cls_classid(current);
1166 	rcu_read_unlock();
1167 	if (classid && classid != sk->sk_classid)
1168 		sk->sk_classid = classid;
1169 }
1170 EXPORT_SYMBOL(sock_update_classid);
1171 
1172 void sock_update_netprioidx(struct sock *sk)
1173 {
1174 	if (in_interrupt())
1175 		return;
1176 
1177 	sk->sk_cgrp_prioidx = task_netprioidx(current);
1178 }
1179 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1180 #endif
1181 
1182 /**
1183  *	sk_alloc - All socket objects are allocated here
1184  *	@net: the applicable net namespace
1185  *	@family: protocol family
1186  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1187  *	@prot: struct proto associated with this new sock instance
1188  */
1189 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1190 		      struct proto *prot)
1191 {
1192 	struct sock *sk;
1193 
1194 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1195 	if (sk) {
1196 		sk->sk_family = family;
1197 		/*
1198 		 * See comment in struct sock definition to understand
1199 		 * why we need sk_prot_creator -acme
1200 		 */
1201 		sk->sk_prot = sk->sk_prot_creator = prot;
1202 		sock_lock_init(sk);
1203 		sock_net_set(sk, get_net(net));
1204 		atomic_set(&sk->sk_wmem_alloc, 1);
1205 
1206 		sock_update_classid(sk);
1207 		sock_update_netprioidx(sk);
1208 	}
1209 
1210 	return sk;
1211 }
1212 EXPORT_SYMBOL(sk_alloc);
1213 
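/*
 * Final teardown of a socket, reached once no transmit data remains charged
 * to sk_wmem_alloc: run the destructor, drop the attached filter, disable
 * timestamping, release peer credentials/pid and the namespace reference,
 * then free the struct sock itself.
 */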
1214 static void __sk_free(struct sock *sk)
1215 {
1216 	struct sk_filter *filter;
1217 
1218 	if (sk->sk_destruct)
1219 		sk->sk_destruct(sk);
1220 
1221 	filter = rcu_dereference_check(sk->sk_filter,
1222 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1223 	if (filter) {
1224 		sk_filter_uncharge(sk, filter);
1225 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1226 	}
1227 
1228 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1229 
1230 	if (atomic_read(&sk->sk_omem_alloc))
1231 		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1232 		       __func__, atomic_read(&sk->sk_omem_alloc));
1233 
1234 	if (sk->sk_peer_cred)
1235 		put_cred(sk->sk_peer_cred);
1236 	put_pid(sk->sk_peer_pid);
1237 	put_net(sock_net(sk));
1238 	sk_prot_free(sk->sk_prot_creator, sk);
1239 }
1240 
1241 void sk_free(struct sock *sk)
1242 {
1243 	/*
1244 	 * We subtract one from sk_wmem_alloc so we can tell whether
1245 	 * some packets are still in some tx queue.
1246 	 * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
1247 	 */
1248 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1249 		__sk_free(sk);
1250 }
1251 EXPORT_SYMBOL(sk_free);
1252 
1253 /*
1254  * The last sock_put should drop the reference to sk->sk_net. It has already
1255  * been dropped in sk_change_net. Taking a reference to the stopping namespace
1256  * is not an option.
1257  * Take a reference to the socket to remove it from the hash while still
1258  * _alive_, and after that destroy it in the context of init_net.
1259  */
1260 void sk_release_kernel(struct sock *sk)
1261 {
1262 	if (sk == NULL || sk->sk_socket == NULL)
1263 		return;
1264 
1265 	sock_hold(sk);
1266 	sock_release(sk->sk_socket);
1267 	release_net(sock_net(sk));
1268 	sock_net_set(sk, get_net(&init_net));
1269 	sock_put(sk);
1270 }
1271 EXPORT_SYMBOL(sk_release_kernel);
1272 
1273 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1274 {
1275 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1276 		sock_update_memcg(newsk);
1277 }
1278 
1279 /**
1280  *	sk_clone_lock - clone a socket, and lock its clone
1281  *	@sk: the socket to clone
1282  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1283  *
1284  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1285  */
1286 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1287 {
1288 	struct sock *newsk;
1289 
1290 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1291 	if (newsk != NULL) {
1292 		struct sk_filter *filter;
1293 
1294 		sock_copy(newsk, sk);
1295 
1296 		/* SANITY */
1297 		get_net(sock_net(newsk));
1298 		sk_node_init(&newsk->sk_node);
1299 		sock_lock_init(newsk);
1300 		bh_lock_sock(newsk);
1301 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1302 		newsk->sk_backlog.len = 0;
1303 
1304 		atomic_set(&newsk->sk_rmem_alloc, 0);
1305 		/*
1306 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1307 		 */
1308 		atomic_set(&newsk->sk_wmem_alloc, 1);
1309 		atomic_set(&newsk->sk_omem_alloc, 0);
1310 		skb_queue_head_init(&newsk->sk_receive_queue);
1311 		skb_queue_head_init(&newsk->sk_write_queue);
1312 #ifdef CONFIG_NET_DMA
1313 		skb_queue_head_init(&newsk->sk_async_wait_queue);
1314 #endif
1315 
1316 		spin_lock_init(&newsk->sk_dst_lock);
1317 		rwlock_init(&newsk->sk_callback_lock);
1318 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1319 				af_callback_keys + newsk->sk_family,
1320 				af_family_clock_key_strings[newsk->sk_family]);
1321 
1322 		newsk->sk_dst_cache	= NULL;
1323 		newsk->sk_wmem_queued	= 0;
1324 		newsk->sk_forward_alloc = 0;
1325 		newsk->sk_send_head	= NULL;
1326 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1327 
1328 		sock_reset_flag(newsk, SOCK_DONE);
1329 		skb_queue_head_init(&newsk->sk_error_queue);
1330 
1331 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1332 		if (filter != NULL)
1333 			sk_filter_charge(newsk, filter);
1334 
1335 		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1336 			/* It is still a raw copy of the parent, so invalidate
1337 			 * the destructor and do a plain sk_free(). */
1338 			newsk->sk_destruct = NULL;
1339 			bh_unlock_sock(newsk);
1340 			sk_free(newsk);
1341 			newsk = NULL;
1342 			goto out;
1343 		}
1344 
1345 		newsk->sk_err	   = 0;
1346 		newsk->sk_priority = 0;
1347 		/*
1348 		 * Before updating sk_refcnt, we must commit prior changes to memory
1349 		 * (Documentation/RCU/rculist_nulls.txt for details)
1350 		 */
1351 		smp_wmb();
1352 		atomic_set(&newsk->sk_refcnt, 2);
1353 
1354 		/*
1355 		 * Increment the counter in the same struct proto as the master
1356 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1357 		 * is the same as sk->sk_prot->socks, as this field was copied
1358 		 * with memcpy).
1359 		 *
1360 		 * This _changes_ the previous behaviour, where
1361 		 * tcp_create_openreq_child always incremented the
1362 		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1363 		 * to be taken into account in all callers. -acme
1364 		 */
1365 		sk_refcnt_debug_inc(newsk);
1366 		sk_set_socket(newsk, NULL);
1367 		newsk->sk_wq = NULL;
1368 
1369 		sk_update_clone(sk, newsk);
1370 
1371 		if (newsk->sk_prot->sockets_allocated)
1372 			sk_sockets_allocated_inc(newsk);
1373 
1374 		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1375 			net_enable_timestamp();
1376 	}
1377 out:
1378 	return newsk;
1379 }
1380 EXPORT_SYMBOL_GPL(sk_clone_lock);
1381 
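/*
 * Install a new cached route on the socket and derive its route capabilities
 * (scatter-gather, checksum offload, GSO) from the output device's features.
 */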
1382 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1383 {
1384 	__sk_dst_set(sk, dst);
1385 	sk->sk_route_caps = dst->dev->features;
1386 	if (sk->sk_route_caps & NETIF_F_GSO)
1387 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1388 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1389 	if (sk_can_gso(sk)) {
1390 		if (dst->header_len) {
1391 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1392 		} else {
1393 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1394 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1395 		}
1396 	}
1397 }
1398 EXPORT_SYMBOL_GPL(sk_setup_caps);
1399 
1400 void __init sk_init(void)
1401 {
1402 	if (totalram_pages <= 4096) {
1403 		sysctl_wmem_max = 32767;
1404 		sysctl_rmem_max = 32767;
1405 		sysctl_wmem_default = 32767;
1406 		sysctl_rmem_default = 32767;
1407 	} else if (totalram_pages >= 131072) {
1408 		sysctl_wmem_max = 131071;
1409 		sysctl_rmem_max = 131071;
1410 	}
1411 }
1412 
1413 /*
1414  *	Simple resource managers for sockets.
1415  */
1416 
1417 
1418 /*
1419  * Write buffer destructor automatically called from kfree_skb.
1420  */
1421 void sock_wfree(struct sk_buff *skb)
1422 {
1423 	struct sock *sk = skb->sk;
1424 	unsigned int len = skb->truesize;
1425 
1426 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1427 		/*
1428 		 * Keep a reference on sk_wmem_alloc, this will be released
1429 		 * after sk_write_space() call
1430 		 */
1431 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1432 		sk->sk_write_space(sk);
1433 		len = 1;
1434 	}
1435 	/*
1436 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1437 	 * could not do because of in-flight packets
1438 	 */
1439 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1440 		__sk_free(sk);
1441 }
1442 EXPORT_SYMBOL(sock_wfree);
1443 
1444 /*
1445  * Read buffer destructor automatically called from kfree_skb.
1446  */
1447 void sock_rfree(struct sk_buff *skb)
1448 {
1449 	struct sock *sk = skb->sk;
1450 	unsigned int len = skb->truesize;
1451 
1452 	atomic_sub(len, &sk->sk_rmem_alloc);
1453 	sk_mem_uncharge(sk, len);
1454 }
1455 EXPORT_SYMBOL(sock_rfree);
1456 
1457 
1458 int sock_i_uid(struct sock *sk)
1459 {
1460 	int uid;
1461 
1462 	read_lock_bh(&sk->sk_callback_lock);
1463 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1464 	read_unlock_bh(&sk->sk_callback_lock);
1465 	return uid;
1466 }
1467 EXPORT_SYMBOL(sock_i_uid);
1468 
1469 unsigned long sock_i_ino(struct sock *sk)
1470 {
1471 	unsigned long ino;
1472 
1473 	read_lock_bh(&sk->sk_callback_lock);
1474 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1475 	read_unlock_bh(&sk->sk_callback_lock);
1476 	return ino;
1477 }
1478 EXPORT_SYMBOL(sock_i_ino);
1479 
1480 /*
1481  * Allocate a skb from the socket's send buffer.
1482  */
1483 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1484 			     gfp_t priority)
1485 {
1486 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1487 		struct sk_buff *skb = alloc_skb(size, priority);
1488 		if (skb) {
1489 			skb_set_owner_w(skb, sk);
1490 			return skb;
1491 		}
1492 	}
1493 	return NULL;
1494 }
1495 EXPORT_SYMBOL(sock_wmalloc);
1496 
1497 /*
1498  * Allocate a skb from the socket's receive buffer.
1499  */
1500 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1501 			     gfp_t priority)
1502 {
1503 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1504 		struct sk_buff *skb = alloc_skb(size, priority);
1505 		if (skb) {
1506 			skb_set_owner_r(skb, sk);
1507 			return skb;
1508 		}
1509 	}
1510 	return NULL;
1511 }
1512 
1513 /*
1514  * Allocate a memory block from the socket's option memory buffer.
1515  */
1516 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1517 {
1518 	if ((unsigned)size <= sysctl_optmem_max &&
1519 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1520 		void *mem;
1521 		/* First do the add, to avoid the race if kmalloc
1522 		 * might sleep.
1523 		 */
1524 		atomic_add(size, &sk->sk_omem_alloc);
1525 		mem = kmalloc(size, priority);
1526 		if (mem)
1527 			return mem;
1528 		atomic_sub(size, &sk->sk_omem_alloc);
1529 	}
1530 	return NULL;
1531 }
1532 EXPORT_SYMBOL(sock_kmalloc);
1533 
1534 /*
1535  * Free an option memory block.
1536  */
1537 void sock_kfree_s(struct sock *sk, void *mem, int size)
1538 {
1539 	kfree(mem);
1540 	atomic_sub(size, &sk->sk_omem_alloc);
1541 }
1542 EXPORT_SYMBOL(sock_kfree_s);
1543 
1544 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1545    I think these locks should be removed for datagram sockets.
1546  */
1547 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1548 {
1549 	DEFINE_WAIT(wait);
1550 
1551 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1552 	for (;;) {
1553 		if (!timeo)
1554 			break;
1555 		if (signal_pending(current))
1556 			break;
1557 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1558 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1559 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1560 			break;
1561 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1562 			break;
1563 		if (sk->sk_err)
1564 			break;
1565 		timeo = schedule_timeout(timeo);
1566 	}
1567 	finish_wait(sk_sleep(sk), &wait);
1568 	return timeo;
1569 }
1570 
1571 
1572 /*
1573  *	Generic send/receive buffer handlers
1574  */
1575 
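/*
 * Allocate a transmit skb with header_len bytes of linear space and data_len
 * bytes spread across page fragments, waiting (subject to the send timeout,
 * or failing with -EAGAIN when noblock is set) until the socket's send
 * buffer has room.
 */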
1576 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1577 				     unsigned long data_len, int noblock,
1578 				     int *errcode)
1579 {
1580 	struct sk_buff *skb;
1581 	gfp_t gfp_mask;
1582 	long timeo;
1583 	int err;
1584 
1585 	gfp_mask = sk->sk_allocation;
1586 	if (gfp_mask & __GFP_WAIT)
1587 		gfp_mask |= __GFP_REPEAT;
1588 
1589 	timeo = sock_sndtimeo(sk, noblock);
1590 	while (1) {
1591 		err = sock_error(sk);
1592 		if (err != 0)
1593 			goto failure;
1594 
1595 		err = -EPIPE;
1596 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1597 			goto failure;
1598 
1599 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1600 			skb = alloc_skb(header_len, gfp_mask);
1601 			if (skb) {
1602 				int npages;
1603 				int i;
1604 
1605 				/* No pages, we're done... */
1606 				if (!data_len)
1607 					break;
1608 
1609 				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1610 				skb->truesize += data_len;
1611 				skb_shinfo(skb)->nr_frags = npages;
1612 				for (i = 0; i < npages; i++) {
1613 					struct page *page;
1614 
1615 					page = alloc_pages(sk->sk_allocation, 0);
1616 					if (!page) {
1617 						err = -ENOBUFS;
1618 						skb_shinfo(skb)->nr_frags = i;
1619 						kfree_skb(skb);
1620 						goto failure;
1621 					}
1622 
1623 					__skb_fill_page_desc(skb, i,
1624 							page, 0,
1625 							(data_len >= PAGE_SIZE ?
1626 							 PAGE_SIZE :
1627 							 data_len));
1628 					data_len -= PAGE_SIZE;
1629 				}
1630 
1631 				/* Full success... */
1632 				break;
1633 			}
1634 			err = -ENOBUFS;
1635 			goto failure;
1636 		}
1637 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1638 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1639 		err = -EAGAIN;
1640 		if (!timeo)
1641 			goto failure;
1642 		if (signal_pending(current))
1643 			goto interrupted;
1644 		timeo = sock_wait_for_wmem(sk, timeo);
1645 	}
1646 
1647 	skb_set_owner_w(skb, sk);
1648 	return skb;
1649 
1650 interrupted:
1651 	err = sock_intr_errno(timeo);
1652 failure:
1653 	*errcode = err;
1654 	return NULL;
1655 }
1656 EXPORT_SYMBOL(sock_alloc_send_pskb);
1657 
1658 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1659 				    int noblock, int *errcode)
1660 {
1661 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1662 }
1663 EXPORT_SYMBOL(sock_alloc_send_skb);
1664 
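/*
 * Slow path of lock_sock(): sleep uninterruptibly until the socket is no
 * longer owned by a user context, re-acquiring sk_lock.slock before
 * returning.
 */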
1665 static void __lock_sock(struct sock *sk)
1666 	__releases(&sk->sk_lock.slock)
1667 	__acquires(&sk->sk_lock.slock)
1668 {
1669 	DEFINE_WAIT(wait);
1670 
1671 	for (;;) {
1672 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1673 					TASK_UNINTERRUPTIBLE);
1674 		spin_unlock_bh(&sk->sk_lock.slock);
1675 		schedule();
1676 		spin_lock_bh(&sk->sk_lock.slock);
1677 		if (!sock_owned_by_user(sk))
1678 			break;
1679 	}
1680 	finish_wait(&sk->sk_lock.wq, &wait);
1681 }
1682 
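/*
 * Process the backlog of skbs that accumulated while the socket was owned by
 * a user context.  The queue is detached in batches so that bottom halves can
 * keep appending while we run each skb through sk_backlog_rcv().
 */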
1683 static void __release_sock(struct sock *sk)
1684 	__releases(&sk->sk_lock.slock)
1685 	__acquires(&sk->sk_lock.slock)
1686 {
1687 	struct sk_buff *skb = sk->sk_backlog.head;
1688 
1689 	do {
1690 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1691 		bh_unlock_sock(sk);
1692 
1693 		do {
1694 			struct sk_buff *next = skb->next;
1695 
1696 			WARN_ON_ONCE(skb_dst_is_noref(skb));
1697 			skb->next = NULL;
1698 			sk_backlog_rcv(sk, skb);
1699 
1700 			/*
1701 			 * We are in process context here with softirqs
1702 			 * disabled, use cond_resched_softirq() to preempt.
1703 			 * This is safe to do because we've taken the backlog
1704 			 * queue private:
1705 			 */
1706 			cond_resched_softirq();
1707 
1708 			skb = next;
1709 		} while (skb != NULL);
1710 
1711 		bh_lock_sock(sk);
1712 	} while ((skb = sk->sk_backlog.head) != NULL);
1713 
1714 	/*
1715 	 * Doing the zeroing here guarantees we cannot loop forever
1716 	 * while a wild producer attempts to flood us.
1717 	 */
1718 	sk->sk_backlog.len = 0;
1719 }
1720 
1721 /**
1722  * sk_wait_data - wait for data to arrive at sk_receive_queue
1723  * @sk:    sock to wait on
1724  * @timeo: for how long
1725  *
1726  * Now socket state, including sk->sk_err, is changed only under the lock,
1727  * hence we may omit checks after joining the wait queue.
1728  * We check the receive queue before schedule() only as an optimization;
1729  * it is very likely that release_sock() added new data.
1730  */
1731 int sk_wait_data(struct sock *sk, long *timeo)
1732 {
1733 	int rc;
1734 	DEFINE_WAIT(wait);
1735 
1736 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1737 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1738 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1739 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1740 	finish_wait(sk_sleep(sk), &wait);
1741 	return rc;
1742 }
1743 EXPORT_SYMBOL(sk_wait_data);
1744 
1745 /**
1746  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1747  *	@sk: socket
1748  *	@size: memory size to allocate
1749  *	@kind: allocation type
1750  *
1751  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1752  *	rmem allocation. This function assumes that protocols which have
1753  *	memory_pressure use sk_wmem_queued as write buffer accounting.
1754  */
1755 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1756 {
1757 	struct proto *prot = sk->sk_prot;
1758 	int amt = sk_mem_pages(size);
1759 	long allocated;
1760 	int parent_status = UNDER_LIMIT;
1761 
1762 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1763 
1764 	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1765 
1766 	/* Under limit. */
1767 	if (parent_status == UNDER_LIMIT &&
1768 			allocated <= sk_prot_mem_limits(sk, 0)) {
1769 		sk_leave_memory_pressure(sk);
1770 		return 1;
1771 	}
1772 
1773 	/* Under pressure. (we or our parents) */
1774 	if ((parent_status > SOFT_LIMIT) ||
1775 			allocated > sk_prot_mem_limits(sk, 1))
1776 		sk_enter_memory_pressure(sk);
1777 
1778 	/* Over hard limit (we or our parents) */
1779 	if ((parent_status == OVER_LIMIT) ||
1780 			(allocated > sk_prot_mem_limits(sk, 2)))
1781 		goto suppress_allocation;
1782 
1783 	/* guarantee minimum buffer size under pressure */
1784 	if (kind == SK_MEM_RECV) {
1785 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1786 			return 1;
1787 
1788 	} else { /* SK_MEM_SEND */
1789 		if (sk->sk_type == SOCK_STREAM) {
1790 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1791 				return 1;
1792 		} else if (atomic_read(&sk->sk_wmem_alloc) <
1793 			   prot->sysctl_wmem[0])
1794 				return 1;
1795 	}
1796 
1797 	if (sk_has_memory_pressure(sk)) {
1798 		int alloc;
1799 
1800 		if (!sk_under_memory_pressure(sk))
1801 			return 1;
1802 		alloc = sk_sockets_allocated_read_positive(sk);
1803 		if (sk_prot_mem_limits(sk, 2) > alloc *
1804 		    sk_mem_pages(sk->sk_wmem_queued +
1805 				 atomic_read(&sk->sk_rmem_alloc) +
1806 				 sk->sk_forward_alloc))
1807 			return 1;
1808 	}
1809 
1810 suppress_allocation:
1811 
1812 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1813 		sk_stream_moderate_sndbuf(sk);
1814 
1815 		/* Fail only if socket is _under_ its sndbuf.
1816 		 * In this case we cannot block, so we have to fail.
1817 		 */
1818 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1819 			return 1;
1820 	}
1821 
1822 	trace_sock_exceed_buf_limit(sk, prot, allocated);
1823 
1824 	/* Alas. Undo changes. */
1825 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1826 
1827 	sk_memory_allocated_sub(sk, amt);
1828 
1829 	return 0;
1830 }
1831 EXPORT_SYMBOL(__sk_mem_schedule);
1832 
1833 /**
1834  *	__sk_mem_reclaim - reclaim memory_allocated
1835  *	@sk: socket
1836  */
1837 void __sk_mem_reclaim(struct sock *sk)
1838 {
1839 	sk_memory_allocated_sub(sk,
1840 				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1841 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1842 
1843 	if (sk_under_memory_pressure(sk) &&
1844 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1845 		sk_leave_memory_pressure(sk);
1846 }
1847 EXPORT_SYMBOL(__sk_mem_reclaim);
1848 
1849 
1850 /*
1851  * Set of default routines for initialising struct proto_ops when
1852  * the protocol does not support a particular function. In certain
1853  * cases where it makes no sense for a protocol to have a "do nothing"
1854  * function, some default processing is provided.
1855  */
1856 
1857 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1858 {
1859 	return -EOPNOTSUPP;
1860 }
1861 EXPORT_SYMBOL(sock_no_bind);
1862 
1863 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1864 		    int len, int flags)
1865 {
1866 	return -EOPNOTSUPP;
1867 }
1868 EXPORT_SYMBOL(sock_no_connect);
1869 
1870 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1871 {
1872 	return -EOPNOTSUPP;
1873 }
1874 EXPORT_SYMBOL(sock_no_socketpair);
1875 
1876 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1877 {
1878 	return -EOPNOTSUPP;
1879 }
1880 EXPORT_SYMBOL(sock_no_accept);
1881 
1882 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1883 		    int *len, int peer)
1884 {
1885 	return -EOPNOTSUPP;
1886 }
1887 EXPORT_SYMBOL(sock_no_getname);
1888 
1889 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1890 {
1891 	return 0;
1892 }
1893 EXPORT_SYMBOL(sock_no_poll);
1894 
1895 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1896 {
1897 	return -EOPNOTSUPP;
1898 }
1899 EXPORT_SYMBOL(sock_no_ioctl);
1900 
1901 int sock_no_listen(struct socket *sock, int backlog)
1902 {
1903 	return -EOPNOTSUPP;
1904 }
1905 EXPORT_SYMBOL(sock_no_listen);
1906 
1907 int sock_no_shutdown(struct socket *sock, int how)
1908 {
1909 	return -EOPNOTSUPP;
1910 }
1911 EXPORT_SYMBOL(sock_no_shutdown);
1912 
1913 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1914 		    char __user *optval, unsigned int optlen)
1915 {
1916 	return -EOPNOTSUPP;
1917 }
1918 EXPORT_SYMBOL(sock_no_setsockopt);
1919 
1920 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1921 		    char __user *optval, int __user *optlen)
1922 {
1923 	return -EOPNOTSUPP;
1924 }
1925 EXPORT_SYMBOL(sock_no_getsockopt);
1926 
1927 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1928 		    size_t len)
1929 {
1930 	return -EOPNOTSUPP;
1931 }
1932 EXPORT_SYMBOL(sock_no_sendmsg);
1933 
1934 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1935 		    size_t len, int flags)
1936 {
1937 	return -EOPNOTSUPP;
1938 }
1939 EXPORT_SYMBOL(sock_no_recvmsg);
1940 
1941 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1942 {
1943 	/* Mirror missing mmap method error code */
1944 	return -ENODEV;
1945 }
1946 EXPORT_SYMBOL(sock_no_mmap);
1947 
1948 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1949 {
1950 	ssize_t res;
1951 	struct msghdr msg = {.msg_flags = flags};
1952 	struct kvec iov;
1953 	char *kaddr = kmap(page);
1954 	iov.iov_base = kaddr + offset;
1955 	iov.iov_len = size;
1956 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1957 	kunmap(page);
1958 	return res;
1959 }
1960 EXPORT_SYMBOL(sock_no_sendpage);
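
/*
 * Illustrative sketch (not part of this file): a connectionless protocol
 * that lacks some of these operations can simply point its proto_ops at
 * the sock_no_*() defaults above.  PF_EXAMPLE and the example_*()
 * handlers below are hypothetical.
 */
static const struct proto_ops example_dgram_ops = {
	.family		= PF_EXAMPLE,		/* hypothetical address family */
	.owner		= THIS_MODULE,
	.release	= example_release,	/* hypothetical */
	.bind		= example_bind,		/* hypothetical */
	.connect	= sock_no_connect,	/* -EOPNOTSUPP */
	.socketpair	= sock_no_socketpair,	/* -EOPNOTSUPP */
	.accept		= sock_no_accept,	/* -EOPNOTSUPP */
	.getname	= example_getname,	/* hypothetical */
	.poll		= datagram_poll,
	.ioctl		= sock_no_ioctl,	/* -EOPNOTSUPP */
	.listen		= sock_no_listen,	/* -EOPNOTSUPP */
	.shutdown	= sock_no_shutdown,	/* -EOPNOTSUPP */
	.setsockopt	= sock_common_setsockopt,
	.getsockopt	= sock_common_getsockopt,
	.sendmsg	= example_sendmsg,	/* hypothetical */
	.recvmsg	= sock_common_recvmsg,
	.mmap		= sock_no_mmap,		/* -ENODEV, as if ->mmap were missing */
	.sendpage	= sock_no_sendpage,	/* falls back to kernel_sendmsg() */
};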
1961 
1962 /*
1963  *	Default Socket Callbacks
1964  */
1965 
1966 static void sock_def_wakeup(struct sock *sk)
1967 {
1968 	struct socket_wq *wq;
1969 
1970 	rcu_read_lock();
1971 	wq = rcu_dereference(sk->sk_wq);
1972 	if (wq_has_sleeper(wq))
1973 		wake_up_interruptible_all(&wq->wait);
1974 	rcu_read_unlock();
1975 }
1976 
1977 static void sock_def_error_report(struct sock *sk)
1978 {
1979 	struct socket_wq *wq;
1980 
1981 	rcu_read_lock();
1982 	wq = rcu_dereference(sk->sk_wq);
1983 	if (wq_has_sleeper(wq))
1984 		wake_up_interruptible_poll(&wq->wait, POLLERR);
1985 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1986 	rcu_read_unlock();
1987 }
1988 
1989 static void sock_def_readable(struct sock *sk, int len)
1990 {
1991 	struct socket_wq *wq;
1992 
1993 	rcu_read_lock();
1994 	wq = rcu_dereference(sk->sk_wq);
1995 	if (wq_has_sleeper(wq))
1996 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
1997 						POLLRDNORM | POLLRDBAND);
1998 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1999 	rcu_read_unlock();
2000 }
2001 
2002 static void sock_def_write_space(struct sock *sk)
2003 {
2004 	struct socket_wq *wq;
2005 
2006 	rcu_read_lock();
2007 
2008 	/* Do not wake up a writer until it can make "significant"
2009 	 * progress, i.e. until at least half of sk_sndbuf is free.  --DaveM
2010 	 */
2011 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2012 		wq = rcu_dereference(sk->sk_wq);
2013 		if (wq_has_sleeper(wq))
2014 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2015 						POLLWRNORM | POLLWRBAND);
2016 
2017 		/* Should agree with poll, otherwise some programs break */
2018 		if (sock_writeable(sk))
2019 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2020 	}
2021 
2022 	rcu_read_unlock();
2023 }
2024 
2025 static void sock_def_destruct(struct sock *sk)
2026 {
2027 	kfree(sk->sk_protinfo);
2028 }
2029 
2030 void sk_send_sigurg(struct sock *sk)
2031 {
2032 	if (sk->sk_socket && sk->sk_socket->file)
2033 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2034 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2035 }
2036 EXPORT_SYMBOL(sk_send_sigurg);
2037 
2038 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2039 		    unsigned long expires)
2040 {
2041 	if (!mod_timer(timer, expires))
2042 		sock_hold(sk);
2043 }
2044 EXPORT_SYMBOL(sk_reset_timer);
2045 
2046 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2047 {
2048 	if (timer_pending(timer) && del_timer(timer))
2049 		__sock_put(sk);
2050 }
2051 EXPORT_SYMBOL(sk_stop_timer);
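
/*
 * Illustrative sketch (not part of this file): a timer armed with
 * sk_reset_timer() holds one reference on the socket; the handler drops
 * it with sock_put() when it fires, and sk_stop_timer() drops it if the
 * timer is cancelled while still pending.  Names below are hypothetical.
 */
static void example_retransmit_handler(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk))
		example_retransmit(sk);		/* hypothetical */
	bh_unlock_sock(sk);
	sock_put(sk);	/* pairs with the sock_hold() in sk_reset_timer() */
}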
2052 
2053 void sock_init_data(struct socket *sock, struct sock *sk)
2054 {
2055 	skb_queue_head_init(&sk->sk_receive_queue);
2056 	skb_queue_head_init(&sk->sk_write_queue);
2057 	skb_queue_head_init(&sk->sk_error_queue);
2058 #ifdef CONFIG_NET_DMA
2059 	skb_queue_head_init(&sk->sk_async_wait_queue);
2060 #endif
2061 
2062 	sk->sk_send_head	=	NULL;
2063 
2064 	init_timer(&sk->sk_timer);
2065 
2066 	sk->sk_allocation	=	GFP_KERNEL;
2067 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2068 	sk->sk_sndbuf		=	sysctl_wmem_default;
2069 	sk->sk_state		=	TCP_CLOSE;
2070 	sk_set_socket(sk, sock);
2071 
2072 	sock_set_flag(sk, SOCK_ZAPPED);
2073 
2074 	if (sock) {
2075 		sk->sk_type	=	sock->type;
2076 		sk->sk_wq	=	sock->wq;
2077 		sock->sk	=	sk;
2078 	} else
2079 		sk->sk_wq	=	NULL;
2080 
2081 	spin_lock_init(&sk->sk_dst_lock);
2082 	rwlock_init(&sk->sk_callback_lock);
2083 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2084 			af_callback_keys + sk->sk_family,
2085 			af_family_clock_key_strings[sk->sk_family]);
2086 
2087 	sk->sk_state_change	=	sock_def_wakeup;
2088 	sk->sk_data_ready	=	sock_def_readable;
2089 	sk->sk_write_space	=	sock_def_write_space;
2090 	sk->sk_error_report	=	sock_def_error_report;
2091 	sk->sk_destruct		=	sock_def_destruct;
2092 
2093 	sk->sk_sndmsg_page	=	NULL;
2094 	sk->sk_sndmsg_off	=	0;
2095 
2096 	sk->sk_peer_pid 	=	NULL;
2097 	sk->sk_peer_cred	=	NULL;
2098 	sk->sk_write_pending	=	0;
2099 	sk->sk_rcvlowat		=	1;
2100 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2101 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2102 
2103 	sk->sk_stamp = ktime_set(-1L, 0);
2104 
2105 	/*
2106 	 * Before updating sk_refcnt, we must commit prior changes to memory
2107 	 * (Documentation/RCU/rculist_nulls.txt for details)
2108 	 */
2109 	smp_wmb();
2110 	atomic_set(&sk->sk_refcnt, 1);
2111 	atomic_set(&sk->sk_drops, 0);
2112 }
2113 EXPORT_SYMBOL(sock_init_data);
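
/*
 * Illustrative sketch (not part of this file): in-kernel socket users
 * (e.g. sunrpc, tunnels) commonly replace the default callbacks that
 * sock_init_data() installed above, saving the old ones under
 * sk_callback_lock.  The example_*() names and ctx fields are
 * hypothetical.
 */
static void example_install_callbacks(struct sock *sk, struct example_ctx *ctx)
{
	write_lock_bh(&sk->sk_callback_lock);
	ctx->saved_data_ready	= sk->sk_data_ready;	/* hypothetical field */
	sk->sk_user_data	= ctx;
	sk->sk_data_ready	= example_data_ready;	/* hypothetical callback */
	write_unlock_bh(&sk->sk_callback_lock);
}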
2114 
2115 void lock_sock_nested(struct sock *sk, int subclass)
2116 {
2117 	might_sleep();
2118 	spin_lock_bh(&sk->sk_lock.slock);
2119 	if (sk->sk_lock.owned)
2120 		__lock_sock(sk);
2121 	sk->sk_lock.owned = 1;
2122 	spin_unlock(&sk->sk_lock.slock);
2123 	/*
2124 	 * The sk_lock has mutex_lock() semantics here:
2125 	 */
2126 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2127 	local_bh_enable();
2128 }
2129 EXPORT_SYMBOL(lock_sock_nested);
2130 
2131 void release_sock(struct sock *sk)
2132 {
2133 	/*
2134 	 * The sk_lock has mutex_unlock() semantics:
2135 	 */
2136 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2137 
2138 	spin_lock_bh(&sk->sk_lock.slock);
2139 	if (sk->sk_backlog.tail)
2140 		__release_sock(sk);
2141 	sk->sk_lock.owned = 0;
2142 	if (waitqueue_active(&sk->sk_lock.wq))
2143 		wake_up(&sk->sk_lock.wq);
2144 	spin_unlock_bh(&sk->sk_lock.slock);
2145 }
2146 EXPORT_SYMBOL(release_sock);
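
/*
 * Illustrative sketch (not part of this file): process-context code that
 * may sleep protects socket state with lock_sock()/release_sock(); any
 * packets queued to the backlog in the meantime are processed when the
 * lock is released above.  The function name is hypothetical.
 */
static void example_set_sndbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
	sk->sk_sndbuf = max_t(u32, val, SOCK_MIN_SNDBUF);
	release_sock(sk);
}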
2147 
2148 /**
2149  * lock_sock_fast - fast version of lock_sock
2150  * @sk: socket
2151  *
2152  * This version should be used for very small sections, where the process won't block.
2153  * return false if fast path is taken
2154  *   sk_lock.slock locked, owned = 0, BH disabled
2155  * return true if slow path is taken
2156  *   sk_lock.slock unlocked, owned = 1, BH enabled
2157  */
2158 bool lock_sock_fast(struct sock *sk)
2159 {
2160 	might_sleep();
2161 	spin_lock_bh(&sk->sk_lock.slock);
2162 
2163 	if (!sk->sk_lock.owned)
2164 		/*
2165 		/*
2166 		 * Fast path: return with slock held and BH still disabled.
2167 		 */
2168 
2169 	__lock_sock(sk);
2170 	sk->sk_lock.owned = 1;
2171 	spin_unlock(&sk->sk_lock.slock);
2172 	/*
2173 	 * The sk_lock has mutex_lock() semantics here:
2174 	 */
2175 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2176 	local_bh_enable();
2177 	return true;
2178 }
2179 EXPORT_SYMBOL(lock_sock_fast);
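
/*
 * Illustrative sketch (not part of this file): the value returned by
 * lock_sock_fast() must be handed back to unlock_sock_fast() so the
 * matching unlock path (spin_unlock_bh() vs. release_sock()) is taken.
 * The function name is hypothetical.
 */
static void example_purge_queue(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	skb_queue_purge(&sk->sk_receive_queue);	/* short, non-blocking work */
	unlock_sock_fast(sk, slow);
}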
2180 
2181 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2182 {
2183 	struct timeval tv;
2184 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2185 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2186 	tv = ktime_to_timeval(sk->sk_stamp);
2187 	if (tv.tv_sec == -1)
2188 		return -ENOENT;
2189 	if (tv.tv_sec == 0) {
2190 		sk->sk_stamp = ktime_get_real();
2191 		tv = ktime_to_timeval(sk->sk_stamp);
2192 	}
2193 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2194 }
2195 EXPORT_SYMBOL(sock_get_timestamp);
2196 
2197 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2198 {
2199 	struct timespec ts;
2200 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2201 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2202 	ts = ktime_to_timespec(sk->sk_stamp);
2203 	if (ts.tv_sec == -1)
2204 		return -ENOENT;
2205 	if (ts.tv_sec == 0) {
2206 		sk->sk_stamp = ktime_get_real();
2207 		ts = ktime_to_timespec(sk->sk_stamp);
2208 	}
2209 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2210 }
2211 EXPORT_SYMBOL(sock_get_timestampns);
2212 
2213 void sock_enable_timestamp(struct sock *sk, int flag)
2214 {
2215 	if (!sock_flag(sk, flag)) {
2216 		unsigned long previous_flags = sk->sk_flags;
2217 
2218 		sock_set_flag(sk, flag);
2219 		/*
2220 		 * We just set one of the two flags that require net
2221 		 * time stamping, but time stamping might already have
2222 		 * been on because of the other one.
2223 		 */
2224 		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2225 			net_enable_timestamp();
2226 	}
2227 }
2228 
2229 /*
2230  *	Get a socket option on a socket.
2231  *
2232  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2233  *	asynchronous errors should be reported by getsockopt. We assume
2234  *	this means if you specify SO_ERROR (otherwise what's the point of it).
2235  */
2236 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2237 			   char __user *optval, int __user *optlen)
2238 {
2239 	struct sock *sk = sock->sk;
2240 
2241 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2242 }
2243 EXPORT_SYMBOL(sock_common_getsockopt);
2244 
2245 #ifdef CONFIG_COMPAT
2246 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2247 				  char __user *optval, int __user *optlen)
2248 {
2249 	struct sock *sk = sock->sk;
2250 
2251 	if (sk->sk_prot->compat_getsockopt != NULL)
2252 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2253 						      optval, optlen);
2254 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2255 }
2256 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2257 #endif
2258 
2259 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2260 			struct msghdr *msg, size_t size, int flags)
2261 {
2262 	struct sock *sk = sock->sk;
2263 	int addr_len = 0;
2264 	int err;
2265 
2266 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2267 				   flags & ~MSG_DONTWAIT, &addr_len);
2268 	if (err >= 0)
2269 		msg->msg_namelen = addr_len;
2270 	return err;
2271 }
2272 EXPORT_SYMBOL(sock_common_recvmsg);
2273 
2274 /*
2275  *	Set socket options on an inet socket.
2276  */
2277 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2278 			   char __user *optval, unsigned int optlen)
2279 {
2280 	struct sock *sk = sock->sk;
2281 
2282 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2283 }
2284 EXPORT_SYMBOL(sock_common_setsockopt);
2285 
2286 #ifdef CONFIG_COMPAT
2287 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2288 				  char __user *optval, unsigned int optlen)
2289 {
2290 	struct sock *sk = sock->sk;
2291 
2292 	if (sk->sk_prot->compat_setsockopt != NULL)
2293 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2294 						      optval, optlen);
2295 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2296 }
2297 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2298 #endif
2299 
2300 void sk_common_release(struct sock *sk)
2301 {
2302 	if (sk->sk_prot->destroy)
2303 		sk->sk_prot->destroy(sk);
2304 
2305 	/*
2306 	 * Observation: when sk_common_release() is called, processes have
2307 	 * no access to the socket, but the network stack still does.
2308 	 * Step one, detach it from networking:
2309 	 *
2310 	 * A. Remove from hash tables.
2311 	 */
2312 
2313 	sk->sk_prot->unhash(sk);
2314 
2315 	/*
2316 	 * At this point the socket cannot receive new packets, but it is possible
2317 	 * that some packets are in flight because some CPU ran the receive path and
2318 	 * did its hash table lookup before we unhashed the socket. Such packets
2319 	 * will reach the receive queue and be purged by the socket destructor.
2320 	 *
2321 	 * Also, we still have packets pending on the receive queue and probably
2322 	 * our own packets waiting in device queues. sock_destroy will drain the
2323 	 * receive queue, but transmitted packets will delay socket destruction
2324 	 * until the last reference is released.
2325 	 */
2326 
2327 	sock_orphan(sk);
2328 
2329 	xfrm_sk_free_policy(sk);
2330 
2331 	sk_refcnt_debug_release(sk);
2332 	sock_put(sk);
2333 }
2334 EXPORT_SYMBOL(sk_common_release);
2335 
2336 #ifdef CONFIG_PROC_FS
2337 #define PROTO_INUSE_NR	64	/* should be enough for now */
2338 struct prot_inuse {
2339 	int val[PROTO_INUSE_NR];
2340 };
2341 
2342 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2343 
2344 #ifdef CONFIG_NET_NS
2345 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2346 {
2347 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2348 }
2349 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2350 
2351 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2352 {
2353 	int cpu, idx = prot->inuse_idx;
2354 	int res = 0;
2355 
2356 	for_each_possible_cpu(cpu)
2357 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2358 
2359 	return res >= 0 ? res : 0;
2360 }
2361 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2362 
2363 static int __net_init sock_inuse_init_net(struct net *net)
2364 {
2365 	net->core.inuse = alloc_percpu(struct prot_inuse);
2366 	return net->core.inuse ? 0 : -ENOMEM;
2367 }
2368 
2369 static void __net_exit sock_inuse_exit_net(struct net *net)
2370 {
2371 	free_percpu(net->core.inuse);
2372 }
2373 
2374 static struct pernet_operations net_inuse_ops = {
2375 	.init = sock_inuse_init_net,
2376 	.exit = sock_inuse_exit_net,
2377 };
2378 
2379 static __init int net_inuse_init(void)
2380 {
2381 	if (register_pernet_subsys(&net_inuse_ops))
2382 		panic("Cannot initialize net inuse counters");
2383 
2384 	return 0;
2385 }
2386 
2387 core_initcall(net_inuse_init);
2388 #else
2389 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2390 
2391 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2392 {
2393 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2394 }
2395 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2396 
2397 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2398 {
2399 	int cpu, idx = prot->inuse_idx;
2400 	int res = 0;
2401 
2402 	for_each_possible_cpu(cpu)
2403 		res += per_cpu(prot_inuse, cpu).val[idx];
2404 
2405 	return res >= 0 ? res : 0;
2406 }
2407 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2408 #endif
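
/*
 * Illustrative sketch (not part of this file): protocols feed these
 * counters from their ->hash()/->unhash() callbacks so that the
 * "sockets" column of /proc/net/protocols stays accurate.  The
 * example_*() names are hypothetical.
 */
static void example_hash(struct sock *sk)
{
	/* ... insert sk into the protocol's lookup tables ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void example_unhash(struct sock *sk)
{
	/* ... remove sk from the lookup tables ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}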
2409 
2410 static void assign_proto_idx(struct proto *prot)
2411 {
2412 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2413 
2414 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2415 		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2416 		return;
2417 	}
2418 
2419 	set_bit(prot->inuse_idx, proto_inuse_idx);
2420 }
2421 
2422 static void release_proto_idx(struct proto *prot)
2423 {
2424 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2425 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2426 }
2427 #else
2428 static inline void assign_proto_idx(struct proto *prot)
2429 {
2430 }
2431 
2432 static inline void release_proto_idx(struct proto *prot)
2433 {
2434 }
2435 #endif
2436 
2437 int proto_register(struct proto *prot, int alloc_slab)
2438 {
2439 	if (alloc_slab) {
2440 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2441 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2442 					NULL);
2443 
2444 		if (prot->slab == NULL) {
2445 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2446 			       prot->name);
2447 			goto out;
2448 		}
2449 
2450 		if (prot->rsk_prot != NULL) {
2451 			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2452 			if (prot->rsk_prot->slab_name == NULL)
2453 				goto out_free_sock_slab;
2454 
2455 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2456 								 prot->rsk_prot->obj_size, 0,
2457 								 SLAB_HWCACHE_ALIGN, NULL);
2458 
2459 			if (prot->rsk_prot->slab == NULL) {
2460 				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2461 				       prot->name);
2462 				goto out_free_request_sock_slab_name;
2463 			}
2464 		}
2465 
2466 		if (prot->twsk_prot != NULL) {
2467 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2468 
2469 			if (prot->twsk_prot->twsk_slab_name == NULL)
2470 				goto out_free_request_sock_slab;
2471 
2472 			prot->twsk_prot->twsk_slab =
2473 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2474 						  prot->twsk_prot->twsk_obj_size,
2475 						  0,
2476 						  SLAB_HWCACHE_ALIGN |
2477 							prot->slab_flags,
2478 						  NULL);
2479 			if (prot->twsk_prot->twsk_slab == NULL)
2480 				goto out_free_timewait_sock_slab_name;
2481 		}
2482 	}
2483 
2484 	mutex_lock(&proto_list_mutex);
2485 	list_add(&prot->node, &proto_list);
2486 	assign_proto_idx(prot);
2487 	mutex_unlock(&proto_list_mutex);
2488 	return 0;
2489 
2490 out_free_timewait_sock_slab_name:
2491 	kfree(prot->twsk_prot->twsk_slab_name);
2492 out_free_request_sock_slab:
2493 	if (prot->rsk_prot && prot->rsk_prot->slab) {
2494 		kmem_cache_destroy(prot->rsk_prot->slab);
2495 		prot->rsk_prot->slab = NULL;
2496 	}
2497 out_free_request_sock_slab_name:
2498 	if (prot->rsk_prot)
2499 		kfree(prot->rsk_prot->slab_name);
2500 out_free_sock_slab:
2501 	kmem_cache_destroy(prot->slab);
2502 	prot->slab = NULL;
2503 out:
2504 	return -ENOBUFS;
2505 }
2506 EXPORT_SYMBOL(proto_register);
2507 
2508 void proto_unregister(struct proto *prot)
2509 {
2510 	mutex_lock(&proto_list_mutex);
2511 	release_proto_idx(prot);
2512 	list_del(&prot->node);
2513 	mutex_unlock(&proto_list_mutex);
2514 
2515 	if (prot->slab != NULL) {
2516 		kmem_cache_destroy(prot->slab);
2517 		prot->slab = NULL;
2518 	}
2519 
2520 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2521 		kmem_cache_destroy(prot->rsk_prot->slab);
2522 		kfree(prot->rsk_prot->slab_name);
2523 		prot->rsk_prot->slab = NULL;
2524 	}
2525 
2526 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2527 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2528 		kfree(prot->twsk_prot->twsk_slab_name);
2529 		prot->twsk_prot->twsk_slab = NULL;
2530 	}
2531 }
2532 EXPORT_SYMBOL(proto_unregister);
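
/*
 * Illustrative sketch (not part of this file): a protocol module pairs
 * proto_register() at load time with proto_unregister() at unload.
 * struct example_sock and the EXAMPLE names are hypothetical; passing
 * alloc_slab == 1 requests a dedicated kmem cache for the socks.
 */
static struct proto example_prot = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct example_sock),	/* hypothetical sock type */
};

static int __init example_init(void)
{
	return proto_register(&example_prot, 1);
}

static void __exit example_exit(void)
{
	proto_unregister(&example_prot);
}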
2533 
2534 #ifdef CONFIG_PROC_FS
2535 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2536 	__acquires(proto_list_mutex)
2537 {
2538 	mutex_lock(&proto_list_mutex);
2539 	return seq_list_start_head(&proto_list, *pos);
2540 }
2541 
2542 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2543 {
2544 	return seq_list_next(v, &proto_list, pos);
2545 }
2546 
2547 static void proto_seq_stop(struct seq_file *seq, void *v)
2548 	__releases(proto_list_mutex)
2549 {
2550 	mutex_unlock(&proto_list_mutex);
2551 }
2552 
2553 static char proto_method_implemented(const void *method)
2554 {
2555 	return method == NULL ? 'n' : 'y';
2556 }
2557 static long sock_prot_memory_allocated(struct proto *proto)
2558 {
2559 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto): -1L;
2560 }
2561 
2562 static char *sock_prot_memory_pressure(struct proto *proto)
2563 {
2564 	return proto->memory_pressure != NULL ?
2565 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2566 }
2567 
2568 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2569 {
2570 
2571 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2572 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2573 		   proto->name,
2574 		   proto->obj_size,
2575 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2576 		   sock_prot_memory_allocated(proto),
2577 		   sock_prot_memory_pressure(proto),
2578 		   proto->max_header,
2579 		   proto->slab == NULL ? "no" : "yes",
2580 		   module_name(proto->owner),
2581 		   proto_method_implemented(proto->close),
2582 		   proto_method_implemented(proto->connect),
2583 		   proto_method_implemented(proto->disconnect),
2584 		   proto_method_implemented(proto->accept),
2585 		   proto_method_implemented(proto->ioctl),
2586 		   proto_method_implemented(proto->init),
2587 		   proto_method_implemented(proto->destroy),
2588 		   proto_method_implemented(proto->shutdown),
2589 		   proto_method_implemented(proto->setsockopt),
2590 		   proto_method_implemented(proto->getsockopt),
2591 		   proto_method_implemented(proto->sendmsg),
2592 		   proto_method_implemented(proto->recvmsg),
2593 		   proto_method_implemented(proto->sendpage),
2594 		   proto_method_implemented(proto->bind),
2595 		   proto_method_implemented(proto->backlog_rcv),
2596 		   proto_method_implemented(proto->hash),
2597 		   proto_method_implemented(proto->unhash),
2598 		   proto_method_implemented(proto->get_port),
2599 		   proto_method_implemented(proto->enter_memory_pressure));
2600 }
2601 
2602 static int proto_seq_show(struct seq_file *seq, void *v)
2603 {
2604 	if (v == &proto_list)
2605 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2606 			   "protocol",
2607 			   "size",
2608 			   "sockets",
2609 			   "memory",
2610 			   "press",
2611 			   "maxhdr",
2612 			   "slab",
2613 			   "module",
2614 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2615 	else
2616 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2617 	return 0;
2618 }
2619 
2620 static const struct seq_operations proto_seq_ops = {
2621 	.start  = proto_seq_start,
2622 	.next   = proto_seq_next,
2623 	.stop   = proto_seq_stop,
2624 	.show   = proto_seq_show,
2625 };
2626 
2627 static int proto_seq_open(struct inode *inode, struct file *file)
2628 {
2629 	return seq_open_net(inode, file, &proto_seq_ops,
2630 			    sizeof(struct seq_net_private));
2631 }
2632 
2633 static const struct file_operations proto_seq_fops = {
2634 	.owner		= THIS_MODULE,
2635 	.open		= proto_seq_open,
2636 	.read		= seq_read,
2637 	.llseek		= seq_lseek,
2638 	.release	= seq_release_net,
2639 };
2640 
2641 static __net_init int proto_init_net(struct net *net)
2642 {
2643 	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2644 		return -ENOMEM;
2645 
2646 	return 0;
2647 }
2648 
2649 static __net_exit void proto_exit_net(struct net *net)
2650 {
2651 	proc_net_remove(net, "protocols");
2652 }
2653 
2654 
2655 static __net_initdata struct pernet_operations proto_net_ops = {
2656 	.init = proto_init_net,
2657 	.exit = proto_exit_net,
2658 };
2659 
2660 static int __init proto_init(void)
2661 {
2662 	return register_pernet_subsys(&proto_net_ops);
2663 }
2664 
2665 subsys_initcall(proto_init);
2666 
2667 #endif /* PROC_FS */
2668