xref: /openbmc/linux/net/core/sock.c (revision 5d4a2e29)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #include <linux/capability.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
96 #include <linux/in.h>
97 #include <linux/kernel.h>
98 #include <linux/module.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/mm.h>
107 #include <linux/slab.h>
108 #include <linux/interrupt.h>
109 #include <linux/poll.h>
110 #include <linux/tcp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 
114 #include <asm/uaccess.h>
115 #include <asm/system.h>
116 
117 #include <linux/netdevice.h>
118 #include <net/protocol.h>
119 #include <linux/skbuff.h>
120 #include <net/net_namespace.h>
121 #include <net/request_sock.h>
122 #include <net/sock.h>
123 #include <linux/net_tstamp.h>
124 #include <net/xfrm.h>
125 #include <linux/ipsec.h>
126 #include <net/cls_cgroup.h>
127 
128 #include <linux/filter.h>
129 
130 #ifdef CONFIG_INET
131 #include <net/tcp.h>
132 #endif
133 
134 /*
135  * Each address family might have different locking rules, so we have
136  * one slock key per address family:
137  */
138 static struct lock_class_key af_family_keys[AF_MAX];
139 static struct lock_class_key af_family_slock_keys[AF_MAX];
140 
141 /*
142  * Make lock validator output more readable. (we pre-construct these
143  * strings build-time, so that runtime initialization of socket
144  * locks is fast):
145  */
146 static const char *const af_family_key_strings[AF_MAX+1] = {
147   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
148   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
149   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
150   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
151   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
152   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
153   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
154   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
155   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
156   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
157   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
158   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
159   "sk_lock-AF_IEEE802154",
160   "sk_lock-AF_MAX"
161 };
162 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
163   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
164   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
165   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
166   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
167   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
168   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
169   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
170   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
171   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
172   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
173   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
174   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
175   "slock-AF_IEEE802154",
176   "slock-AF_MAX"
177 };
178 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
179   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
180   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
181   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
182   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
183   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
184   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
185   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
186   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
187   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
188   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
189   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
190   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
191   "clock-AF_IEEE802154",
192   "clock-AF_MAX"
193 };
194 
195 /*
196  * sk_callback_lock locking rules are per-address-family,
197  * so split the lock classes by using a per-AF key:
198  */
199 static struct lock_class_key af_callback_keys[AF_MAX];
200 
201 /* Take into consideration the size of the struct sk_buff overhead in the
202  * determination of these values, since that is non-constant across
203  * platforms.  This makes socket queueing behavior and performance
204  * not depend upon such differences.
205  */
206 #define _SK_MEM_PACKETS		256
207 #define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
208 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
209 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
210 
211 /* Run time adjustable parameters. */
212 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
213 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
214 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
215 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
216 
217 /* Maximal space eaten by iovec or ancilliary data plus some space */
218 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
219 EXPORT_SYMBOL(sysctl_optmem_max);
220 
221 #if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
222 int net_cls_subsys_id = -1;
223 EXPORT_SYMBOL_GPL(net_cls_subsys_id);
224 #endif
225 
226 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
227 {
228 	struct timeval tv;
229 
230 	if (optlen < sizeof(tv))
231 		return -EINVAL;
232 	if (copy_from_user(&tv, optval, sizeof(tv)))
233 		return -EFAULT;
234 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
235 		return -EDOM;
236 
237 	if (tv.tv_sec < 0) {
238 		static int warned __read_mostly;
239 
240 		*timeo_p = 0;
241 		if (warned < 10 && net_ratelimit()) {
242 			warned++;
243 			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
244 			       "tries to set negative timeout\n",
245 				current->comm, task_pid_nr(current));
246 		}
247 		return 0;
248 	}
249 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
250 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
251 		return 0;
252 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
253 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
254 	return 0;
255 }
256 
257 static void sock_warn_obsolete_bsdism(const char *name)
258 {
259 	static int warned;
260 	static char warncomm[TASK_COMM_LEN];
261 	if (strcmp(warncomm, current->comm) && warned < 5) {
262 		strcpy(warncomm,  current->comm);
263 		printk(KERN_WARNING "process `%s' is using obsolete "
264 		       "%s SO_BSDCOMPAT\n", warncomm, name);
265 		warned++;
266 	}
267 }
268 
269 static void sock_disable_timestamp(struct sock *sk, int flag)
270 {
271 	if (sock_flag(sk, flag)) {
272 		sock_reset_flag(sk, flag);
273 		if (!sock_flag(sk, SOCK_TIMESTAMP) &&
274 		    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
275 			net_disable_timestamp();
276 		}
277 	}
278 }
279 
280 
281 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
282 {
283 	int err;
284 	int skb_len;
285 	unsigned long flags;
286 	struct sk_buff_head *list = &sk->sk_receive_queue;
287 
288 	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
289 	   number of warnings when compiling with -W --ANK
290 	 */
291 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
292 	    (unsigned)sk->sk_rcvbuf) {
293 		atomic_inc(&sk->sk_drops);
294 		return -ENOMEM;
295 	}
296 
297 	err = sk_filter(sk, skb);
298 	if (err)
299 		return err;
300 
301 	if (!sk_rmem_schedule(sk, skb->truesize)) {
302 		atomic_inc(&sk->sk_drops);
303 		return -ENOBUFS;
304 	}
305 
306 	skb->dev = NULL;
307 	skb_set_owner_r(skb, sk);
308 
309 	/* Cache the SKB length before we tack it onto the receive
310 	 * queue.  Once it is added it no longer belongs to us and
311 	 * may be freed by other threads of control pulling packets
312 	 * from the queue.
313 	 */
314 	skb_len = skb->len;
315 
316 	/* we escape from rcu protected region, make sure we dont leak
317 	 * a norefcounted dst
318 	 */
319 	skb_dst_force(skb);
320 
321 	spin_lock_irqsave(&list->lock, flags);
322 	skb->dropcount = atomic_read(&sk->sk_drops);
323 	__skb_queue_tail(list, skb);
324 	spin_unlock_irqrestore(&list->lock, flags);
325 
326 	if (!sock_flag(sk, SOCK_DEAD))
327 		sk->sk_data_ready(sk, skb_len);
328 	return 0;
329 }
330 EXPORT_SYMBOL(sock_queue_rcv_skb);
331 
332 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
333 {
334 	int rc = NET_RX_SUCCESS;
335 
336 	if (sk_filter(sk, skb))
337 		goto discard_and_relse;
338 
339 	skb->dev = NULL;
340 
341 	if (sk_rcvqueues_full(sk, skb)) {
342 		atomic_inc(&sk->sk_drops);
343 		goto discard_and_relse;
344 	}
345 	if (nested)
346 		bh_lock_sock_nested(sk);
347 	else
348 		bh_lock_sock(sk);
349 	if (!sock_owned_by_user(sk)) {
350 		/*
351 		 * trylock + unlock semantics:
352 		 */
353 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
354 
355 		rc = sk_backlog_rcv(sk, skb);
356 
357 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
358 	} else if (sk_add_backlog(sk, skb)) {
359 		bh_unlock_sock(sk);
360 		atomic_inc(&sk->sk_drops);
361 		goto discard_and_relse;
362 	}
363 
364 	bh_unlock_sock(sk);
365 out:
366 	sock_put(sk);
367 	return rc;
368 discard_and_relse:
369 	kfree_skb(skb);
370 	goto out;
371 }
372 EXPORT_SYMBOL(sk_receive_skb);
373 
374 void sk_reset_txq(struct sock *sk)
375 {
376 	sk_tx_queue_clear(sk);
377 }
378 EXPORT_SYMBOL(sk_reset_txq);
379 
380 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
381 {
382 	struct dst_entry *dst = __sk_dst_get(sk);
383 
384 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
385 		sk_tx_queue_clear(sk);
386 		rcu_assign_pointer(sk->sk_dst_cache, NULL);
387 		dst_release(dst);
388 		return NULL;
389 	}
390 
391 	return dst;
392 }
393 EXPORT_SYMBOL(__sk_dst_check);
394 
395 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
396 {
397 	struct dst_entry *dst = sk_dst_get(sk);
398 
399 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
400 		sk_dst_reset(sk);
401 		dst_release(dst);
402 		return NULL;
403 	}
404 
405 	return dst;
406 }
407 EXPORT_SYMBOL(sk_dst_check);
408 
409 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
410 {
411 	int ret = -ENOPROTOOPT;
412 #ifdef CONFIG_NETDEVICES
413 	struct net *net = sock_net(sk);
414 	char devname[IFNAMSIZ];
415 	int index;
416 
417 	/* Sorry... */
418 	ret = -EPERM;
419 	if (!capable(CAP_NET_RAW))
420 		goto out;
421 
422 	ret = -EINVAL;
423 	if (optlen < 0)
424 		goto out;
425 
426 	/* Bind this socket to a particular device like "eth0",
427 	 * as specified in the passed interface name. If the
428 	 * name is "" or the option length is zero the socket
429 	 * is not bound.
430 	 */
431 	if (optlen > IFNAMSIZ - 1)
432 		optlen = IFNAMSIZ - 1;
433 	memset(devname, 0, sizeof(devname));
434 
435 	ret = -EFAULT;
436 	if (copy_from_user(devname, optval, optlen))
437 		goto out;
438 
439 	index = 0;
440 	if (devname[0] != '\0') {
441 		struct net_device *dev;
442 
443 		rcu_read_lock();
444 		dev = dev_get_by_name_rcu(net, devname);
445 		if (dev)
446 			index = dev->ifindex;
447 		rcu_read_unlock();
448 		ret = -ENODEV;
449 		if (!dev)
450 			goto out;
451 	}
452 
453 	lock_sock(sk);
454 	sk->sk_bound_dev_if = index;
455 	sk_dst_reset(sk);
456 	release_sock(sk);
457 
458 	ret = 0;
459 
460 out:
461 #endif
462 
463 	return ret;
464 }
465 
466 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
467 {
468 	if (valbool)
469 		sock_set_flag(sk, bit);
470 	else
471 		sock_reset_flag(sk, bit);
472 }
473 
474 /*
475  *	This is meant for all protocols to use and covers goings on
476  *	at the socket level. Everything here is generic.
477  */
478 
479 int sock_setsockopt(struct socket *sock, int level, int optname,
480 		    char __user *optval, unsigned int optlen)
481 {
482 	struct sock *sk = sock->sk;
483 	int val;
484 	int valbool;
485 	struct linger ling;
486 	int ret = 0;
487 
488 	/*
489 	 *	Options without arguments
490 	 */
491 
492 	if (optname == SO_BINDTODEVICE)
493 		return sock_bindtodevice(sk, optval, optlen);
494 
495 	if (optlen < sizeof(int))
496 		return -EINVAL;
497 
498 	if (get_user(val, (int __user *)optval))
499 		return -EFAULT;
500 
501 	valbool = val ? 1 : 0;
502 
503 	lock_sock(sk);
504 
505 	switch (optname) {
506 	case SO_DEBUG:
507 		if (val && !capable(CAP_NET_ADMIN))
508 			ret = -EACCES;
509 		else
510 			sock_valbool_flag(sk, SOCK_DBG, valbool);
511 		break;
512 	case SO_REUSEADDR:
513 		sk->sk_reuse = valbool;
514 		break;
515 	case SO_TYPE:
516 	case SO_PROTOCOL:
517 	case SO_DOMAIN:
518 	case SO_ERROR:
519 		ret = -ENOPROTOOPT;
520 		break;
521 	case SO_DONTROUTE:
522 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
523 		break;
524 	case SO_BROADCAST:
525 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
526 		break;
527 	case SO_SNDBUF:
528 		/* Don't error on this BSD doesn't and if you think
529 		   about it this is right. Otherwise apps have to
530 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
531 		   are treated in BSD as hints */
532 
533 		if (val > sysctl_wmem_max)
534 			val = sysctl_wmem_max;
535 set_sndbuf:
536 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
537 		if ((val * 2) < SOCK_MIN_SNDBUF)
538 			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
539 		else
540 			sk->sk_sndbuf = val * 2;
541 
542 		/*
543 		 *	Wake up sending tasks if we
544 		 *	upped the value.
545 		 */
546 		sk->sk_write_space(sk);
547 		break;
548 
549 	case SO_SNDBUFFORCE:
550 		if (!capable(CAP_NET_ADMIN)) {
551 			ret = -EPERM;
552 			break;
553 		}
554 		goto set_sndbuf;
555 
556 	case SO_RCVBUF:
557 		/* Don't error on this BSD doesn't and if you think
558 		   about it this is right. Otherwise apps have to
559 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
560 		   are treated in BSD as hints */
561 
562 		if (val > sysctl_rmem_max)
563 			val = sysctl_rmem_max;
564 set_rcvbuf:
565 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
566 		/*
567 		 * We double it on the way in to account for
568 		 * "struct sk_buff" etc. overhead.   Applications
569 		 * assume that the SO_RCVBUF setting they make will
570 		 * allow that much actual data to be received on that
571 		 * socket.
572 		 *
573 		 * Applications are unaware that "struct sk_buff" and
574 		 * other overheads allocate from the receive buffer
575 		 * during socket buffer allocation.
576 		 *
577 		 * And after considering the possible alternatives,
578 		 * returning the value we actually used in getsockopt
579 		 * is the most desirable behavior.
580 		 */
581 		if ((val * 2) < SOCK_MIN_RCVBUF)
582 			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
583 		else
584 			sk->sk_rcvbuf = val * 2;
585 		break;
586 
587 	case SO_RCVBUFFORCE:
588 		if (!capable(CAP_NET_ADMIN)) {
589 			ret = -EPERM;
590 			break;
591 		}
592 		goto set_rcvbuf;
593 
594 	case SO_KEEPALIVE:
595 #ifdef CONFIG_INET
596 		if (sk->sk_protocol == IPPROTO_TCP)
597 			tcp_set_keepalive(sk, valbool);
598 #endif
599 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
600 		break;
601 
602 	case SO_OOBINLINE:
603 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
604 		break;
605 
606 	case SO_NO_CHECK:
607 		sk->sk_no_check = valbool;
608 		break;
609 
610 	case SO_PRIORITY:
611 		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
612 			sk->sk_priority = val;
613 		else
614 			ret = -EPERM;
615 		break;
616 
617 	case SO_LINGER:
618 		if (optlen < sizeof(ling)) {
619 			ret = -EINVAL;	/* 1003.1g */
620 			break;
621 		}
622 		if (copy_from_user(&ling, optval, sizeof(ling))) {
623 			ret = -EFAULT;
624 			break;
625 		}
626 		if (!ling.l_onoff)
627 			sock_reset_flag(sk, SOCK_LINGER);
628 		else {
629 #if (BITS_PER_LONG == 32)
630 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
631 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
632 			else
633 #endif
634 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
635 			sock_set_flag(sk, SOCK_LINGER);
636 		}
637 		break;
638 
639 	case SO_BSDCOMPAT:
640 		sock_warn_obsolete_bsdism("setsockopt");
641 		break;
642 
643 	case SO_PASSCRED:
644 		if (valbool)
645 			set_bit(SOCK_PASSCRED, &sock->flags);
646 		else
647 			clear_bit(SOCK_PASSCRED, &sock->flags);
648 		break;
649 
650 	case SO_TIMESTAMP:
651 	case SO_TIMESTAMPNS:
652 		if (valbool)  {
653 			if (optname == SO_TIMESTAMP)
654 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
655 			else
656 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
657 			sock_set_flag(sk, SOCK_RCVTSTAMP);
658 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
659 		} else {
660 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
661 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
662 		}
663 		break;
664 
665 	case SO_TIMESTAMPING:
666 		if (val & ~SOF_TIMESTAMPING_MASK) {
667 			ret = -EINVAL;
668 			break;
669 		}
670 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
671 				  val & SOF_TIMESTAMPING_TX_HARDWARE);
672 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
673 				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
674 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
675 				  val & SOF_TIMESTAMPING_RX_HARDWARE);
676 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
677 			sock_enable_timestamp(sk,
678 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
679 		else
680 			sock_disable_timestamp(sk,
681 					       SOCK_TIMESTAMPING_RX_SOFTWARE);
682 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
683 				  val & SOF_TIMESTAMPING_SOFTWARE);
684 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
685 				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
686 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
687 				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
688 		break;
689 
690 	case SO_RCVLOWAT:
691 		if (val < 0)
692 			val = INT_MAX;
693 		sk->sk_rcvlowat = val ? : 1;
694 		break;
695 
696 	case SO_RCVTIMEO:
697 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
698 		break;
699 
700 	case SO_SNDTIMEO:
701 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
702 		break;
703 
704 	case SO_ATTACH_FILTER:
705 		ret = -EINVAL;
706 		if (optlen == sizeof(struct sock_fprog)) {
707 			struct sock_fprog fprog;
708 
709 			ret = -EFAULT;
710 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
711 				break;
712 
713 			ret = sk_attach_filter(&fprog, sk);
714 		}
715 		break;
716 
717 	case SO_DETACH_FILTER:
718 		ret = sk_detach_filter(sk);
719 		break;
720 
721 	case SO_PASSSEC:
722 		if (valbool)
723 			set_bit(SOCK_PASSSEC, &sock->flags);
724 		else
725 			clear_bit(SOCK_PASSSEC, &sock->flags);
726 		break;
727 	case SO_MARK:
728 		if (!capable(CAP_NET_ADMIN))
729 			ret = -EPERM;
730 		else
731 			sk->sk_mark = val;
732 		break;
733 
734 		/* We implement the SO_SNDLOWAT etc to
735 		   not be settable (1003.1g 5.3) */
736 	case SO_RXQ_OVFL:
737 		if (valbool)
738 			sock_set_flag(sk, SOCK_RXQ_OVFL);
739 		else
740 			sock_reset_flag(sk, SOCK_RXQ_OVFL);
741 		break;
742 	default:
743 		ret = -ENOPROTOOPT;
744 		break;
745 	}
746 	release_sock(sk);
747 	return ret;
748 }
749 EXPORT_SYMBOL(sock_setsockopt);
750 
751 
752 int sock_getsockopt(struct socket *sock, int level, int optname,
753 		    char __user *optval, int __user *optlen)
754 {
755 	struct sock *sk = sock->sk;
756 
757 	union {
758 		int val;
759 		struct linger ling;
760 		struct timeval tm;
761 	} v;
762 
763 	int lv = sizeof(int);
764 	int len;
765 
766 	if (get_user(len, optlen))
767 		return -EFAULT;
768 	if (len < 0)
769 		return -EINVAL;
770 
771 	memset(&v, 0, sizeof(v));
772 
773 	switch (optname) {
774 	case SO_DEBUG:
775 		v.val = sock_flag(sk, SOCK_DBG);
776 		break;
777 
778 	case SO_DONTROUTE:
779 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
780 		break;
781 
782 	case SO_BROADCAST:
783 		v.val = !!sock_flag(sk, SOCK_BROADCAST);
784 		break;
785 
786 	case SO_SNDBUF:
787 		v.val = sk->sk_sndbuf;
788 		break;
789 
790 	case SO_RCVBUF:
791 		v.val = sk->sk_rcvbuf;
792 		break;
793 
794 	case SO_REUSEADDR:
795 		v.val = sk->sk_reuse;
796 		break;
797 
798 	case SO_KEEPALIVE:
799 		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
800 		break;
801 
802 	case SO_TYPE:
803 		v.val = sk->sk_type;
804 		break;
805 
806 	case SO_PROTOCOL:
807 		v.val = sk->sk_protocol;
808 		break;
809 
810 	case SO_DOMAIN:
811 		v.val = sk->sk_family;
812 		break;
813 
814 	case SO_ERROR:
815 		v.val = -sock_error(sk);
816 		if (v.val == 0)
817 			v.val = xchg(&sk->sk_err_soft, 0);
818 		break;
819 
820 	case SO_OOBINLINE:
821 		v.val = !!sock_flag(sk, SOCK_URGINLINE);
822 		break;
823 
824 	case SO_NO_CHECK:
825 		v.val = sk->sk_no_check;
826 		break;
827 
828 	case SO_PRIORITY:
829 		v.val = sk->sk_priority;
830 		break;
831 
832 	case SO_LINGER:
833 		lv		= sizeof(v.ling);
834 		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
835 		v.ling.l_linger	= sk->sk_lingertime / HZ;
836 		break;
837 
838 	case SO_BSDCOMPAT:
839 		sock_warn_obsolete_bsdism("getsockopt");
840 		break;
841 
842 	case SO_TIMESTAMP:
843 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
844 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
845 		break;
846 
847 	case SO_TIMESTAMPNS:
848 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
849 		break;
850 
851 	case SO_TIMESTAMPING:
852 		v.val = 0;
853 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
854 			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
855 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
856 			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
857 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
858 			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
859 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
860 			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
861 		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
862 			v.val |= SOF_TIMESTAMPING_SOFTWARE;
863 		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
864 			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
865 		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
866 			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
867 		break;
868 
869 	case SO_RCVTIMEO:
870 		lv = sizeof(struct timeval);
871 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
872 			v.tm.tv_sec = 0;
873 			v.tm.tv_usec = 0;
874 		} else {
875 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
876 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
877 		}
878 		break;
879 
880 	case SO_SNDTIMEO:
881 		lv = sizeof(struct timeval);
882 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
883 			v.tm.tv_sec = 0;
884 			v.tm.tv_usec = 0;
885 		} else {
886 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
887 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
888 		}
889 		break;
890 
891 	case SO_RCVLOWAT:
892 		v.val = sk->sk_rcvlowat;
893 		break;
894 
895 	case SO_SNDLOWAT:
896 		v.val = 1;
897 		break;
898 
899 	case SO_PASSCRED:
900 		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
901 		break;
902 
903 	case SO_PEERCRED:
904 		if (len > sizeof(sk->sk_peercred))
905 			len = sizeof(sk->sk_peercred);
906 		if (copy_to_user(optval, &sk->sk_peercred, len))
907 			return -EFAULT;
908 		goto lenout;
909 
910 	case SO_PEERNAME:
911 	{
912 		char address[128];
913 
914 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
915 			return -ENOTCONN;
916 		if (lv < len)
917 			return -EINVAL;
918 		if (copy_to_user(optval, address, len))
919 			return -EFAULT;
920 		goto lenout;
921 	}
922 
923 	/* Dubious BSD thing... Probably nobody even uses it, but
924 	 * the UNIX standard wants it for whatever reason... -DaveM
925 	 */
926 	case SO_ACCEPTCONN:
927 		v.val = sk->sk_state == TCP_LISTEN;
928 		break;
929 
930 	case SO_PASSSEC:
931 		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
932 		break;
933 
934 	case SO_PEERSEC:
935 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
936 
937 	case SO_MARK:
938 		v.val = sk->sk_mark;
939 		break;
940 
941 	case SO_RXQ_OVFL:
942 		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
943 		break;
944 
945 	default:
946 		return -ENOPROTOOPT;
947 	}
948 
949 	if (len > lv)
950 		len = lv;
951 	if (copy_to_user(optval, &v, len))
952 		return -EFAULT;
953 lenout:
954 	if (put_user(len, optlen))
955 		return -EFAULT;
956 	return 0;
957 }
958 
959 /*
960  * Initialize an sk_lock.
961  *
962  * (We also register the sk_lock with the lock validator.)
963  */
964 static inline void sock_lock_init(struct sock *sk)
965 {
966 	sock_lock_init_class_and_name(sk,
967 			af_family_slock_key_strings[sk->sk_family],
968 			af_family_slock_keys + sk->sk_family,
969 			af_family_key_strings[sk->sk_family],
970 			af_family_keys + sk->sk_family);
971 }
972 
973 /*
974  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
975  * even temporarly, because of RCU lookups. sk_node should also be left as is.
976  */
977 static void sock_copy(struct sock *nsk, const struct sock *osk)
978 {
979 #ifdef CONFIG_SECURITY_NETWORK
980 	void *sptr = nsk->sk_security;
981 #endif
982 	BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
983 		     sizeof(osk->sk_node) + sizeof(osk->sk_refcnt) +
984 		     sizeof(osk->sk_tx_queue_mapping));
985 	memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
986 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
987 #ifdef CONFIG_SECURITY_NETWORK
988 	nsk->sk_security = sptr;
989 	security_sk_clone(osk, nsk);
990 #endif
991 }
992 
993 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
994 		int family)
995 {
996 	struct sock *sk;
997 	struct kmem_cache *slab;
998 
999 	slab = prot->slab;
1000 	if (slab != NULL) {
1001 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1002 		if (!sk)
1003 			return sk;
1004 		if (priority & __GFP_ZERO) {
1005 			/*
1006 			 * caches using SLAB_DESTROY_BY_RCU should let
1007 			 * sk_node.next un-modified. Special care is taken
1008 			 * when initializing object to zero.
1009 			 */
1010 			if (offsetof(struct sock, sk_node.next) != 0)
1011 				memset(sk, 0, offsetof(struct sock, sk_node.next));
1012 			memset(&sk->sk_node.pprev, 0,
1013 			       prot->obj_size - offsetof(struct sock,
1014 							 sk_node.pprev));
1015 		}
1016 	}
1017 	else
1018 		sk = kmalloc(prot->obj_size, priority);
1019 
1020 	if (sk != NULL) {
1021 		kmemcheck_annotate_bitfield(sk, flags);
1022 
1023 		if (security_sk_alloc(sk, family, priority))
1024 			goto out_free;
1025 
1026 		if (!try_module_get(prot->owner))
1027 			goto out_free_sec;
1028 		sk_tx_queue_clear(sk);
1029 	}
1030 
1031 	return sk;
1032 
1033 out_free_sec:
1034 	security_sk_free(sk);
1035 out_free:
1036 	if (slab != NULL)
1037 		kmem_cache_free(slab, sk);
1038 	else
1039 		kfree(sk);
1040 	return NULL;
1041 }
1042 
1043 static void sk_prot_free(struct proto *prot, struct sock *sk)
1044 {
1045 	struct kmem_cache *slab;
1046 	struct module *owner;
1047 
1048 	owner = prot->owner;
1049 	slab = prot->slab;
1050 
1051 	security_sk_free(sk);
1052 	if (slab != NULL)
1053 		kmem_cache_free(slab, sk);
1054 	else
1055 		kfree(sk);
1056 	module_put(owner);
1057 }
1058 
1059 #ifdef CONFIG_CGROUPS
1060 void sock_update_classid(struct sock *sk)
1061 {
1062 	u32 classid = task_cls_classid(current);
1063 
1064 	if (classid && classid != sk->sk_classid)
1065 		sk->sk_classid = classid;
1066 }
1067 EXPORT_SYMBOL(sock_update_classid);
1068 #endif
1069 
1070 /**
1071  *	sk_alloc - All socket objects are allocated here
1072  *	@net: the applicable net namespace
1073  *	@family: protocol family
1074  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1075  *	@prot: struct proto associated with this new sock instance
1076  */
1077 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1078 		      struct proto *prot)
1079 {
1080 	struct sock *sk;
1081 
1082 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1083 	if (sk) {
1084 		sk->sk_family = family;
1085 		/*
1086 		 * See comment in struct sock definition to understand
1087 		 * why we need sk_prot_creator -acme
1088 		 */
1089 		sk->sk_prot = sk->sk_prot_creator = prot;
1090 		sock_lock_init(sk);
1091 		sock_net_set(sk, get_net(net));
1092 		atomic_set(&sk->sk_wmem_alloc, 1);
1093 
1094 		sock_update_classid(sk);
1095 	}
1096 
1097 	return sk;
1098 }
1099 EXPORT_SYMBOL(sk_alloc);
1100 
1101 static void __sk_free(struct sock *sk)
1102 {
1103 	struct sk_filter *filter;
1104 
1105 	if (sk->sk_destruct)
1106 		sk->sk_destruct(sk);
1107 
1108 	filter = rcu_dereference_check(sk->sk_filter,
1109 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1110 	if (filter) {
1111 		sk_filter_uncharge(sk, filter);
1112 		rcu_assign_pointer(sk->sk_filter, NULL);
1113 	}
1114 
1115 	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1116 	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1117 
1118 	if (atomic_read(&sk->sk_omem_alloc))
1119 		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1120 		       __func__, atomic_read(&sk->sk_omem_alloc));
1121 
1122 	put_net(sock_net(sk));
1123 	sk_prot_free(sk->sk_prot_creator, sk);
1124 }
1125 
1126 void sk_free(struct sock *sk)
1127 {
1128 	/*
1129 	 * We substract one from sk_wmem_alloc and can know if
1130 	 * some packets are still in some tx queue.
1131 	 * If not null, sock_wfree() will call __sk_free(sk) later
1132 	 */
1133 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1134 		__sk_free(sk);
1135 }
1136 EXPORT_SYMBOL(sk_free);
1137 
1138 /*
1139  * Last sock_put should drop referrence to sk->sk_net. It has already
1140  * been dropped in sk_change_net. Taking referrence to stopping namespace
1141  * is not an option.
1142  * Take referrence to a socket to remove it from hash _alive_ and after that
1143  * destroy it in the context of init_net.
1144  */
1145 void sk_release_kernel(struct sock *sk)
1146 {
1147 	if (sk == NULL || sk->sk_socket == NULL)
1148 		return;
1149 
1150 	sock_hold(sk);
1151 	sock_release(sk->sk_socket);
1152 	release_net(sock_net(sk));
1153 	sock_net_set(sk, get_net(&init_net));
1154 	sock_put(sk);
1155 }
1156 EXPORT_SYMBOL(sk_release_kernel);
1157 
1158 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1159 {
1160 	struct sock *newsk;
1161 
1162 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1163 	if (newsk != NULL) {
1164 		struct sk_filter *filter;
1165 
1166 		sock_copy(newsk, sk);
1167 
1168 		/* SANITY */
1169 		get_net(sock_net(newsk));
1170 		sk_node_init(&newsk->sk_node);
1171 		sock_lock_init(newsk);
1172 		bh_lock_sock(newsk);
1173 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1174 		newsk->sk_backlog.len = 0;
1175 
1176 		atomic_set(&newsk->sk_rmem_alloc, 0);
1177 		/*
1178 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1179 		 */
1180 		atomic_set(&newsk->sk_wmem_alloc, 1);
1181 		atomic_set(&newsk->sk_omem_alloc, 0);
1182 		skb_queue_head_init(&newsk->sk_receive_queue);
1183 		skb_queue_head_init(&newsk->sk_write_queue);
1184 #ifdef CONFIG_NET_DMA
1185 		skb_queue_head_init(&newsk->sk_async_wait_queue);
1186 #endif
1187 
1188 		spin_lock_init(&newsk->sk_dst_lock);
1189 		rwlock_init(&newsk->sk_callback_lock);
1190 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1191 				af_callback_keys + newsk->sk_family,
1192 				af_family_clock_key_strings[newsk->sk_family]);
1193 
1194 		newsk->sk_dst_cache	= NULL;
1195 		newsk->sk_wmem_queued	= 0;
1196 		newsk->sk_forward_alloc = 0;
1197 		newsk->sk_send_head	= NULL;
1198 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1199 
1200 		sock_reset_flag(newsk, SOCK_DONE);
1201 		skb_queue_head_init(&newsk->sk_error_queue);
1202 
1203 		filter = newsk->sk_filter;
1204 		if (filter != NULL)
1205 			sk_filter_charge(newsk, filter);
1206 
1207 		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1208 			/* It is still raw copy of parent, so invalidate
1209 			 * destructor and make plain sk_free() */
1210 			newsk->sk_destruct = NULL;
1211 			sk_free(newsk);
1212 			newsk = NULL;
1213 			goto out;
1214 		}
1215 
1216 		newsk->sk_err	   = 0;
1217 		newsk->sk_priority = 0;
1218 		/*
1219 		 * Before updating sk_refcnt, we must commit prior changes to memory
1220 		 * (Documentation/RCU/rculist_nulls.txt for details)
1221 		 */
1222 		smp_wmb();
1223 		atomic_set(&newsk->sk_refcnt, 2);
1224 
1225 		/*
1226 		 * Increment the counter in the same struct proto as the master
1227 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1228 		 * is the same as sk->sk_prot->socks, as this field was copied
1229 		 * with memcpy).
1230 		 *
1231 		 * This _changes_ the previous behaviour, where
1232 		 * tcp_create_openreq_child always was incrementing the
1233 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1234 		 * to be taken into account in all callers. -acme
1235 		 */
1236 		sk_refcnt_debug_inc(newsk);
1237 		sk_set_socket(newsk, NULL);
1238 		newsk->sk_wq = NULL;
1239 
1240 		if (newsk->sk_prot->sockets_allocated)
1241 			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1242 
1243 		if (sock_flag(newsk, SOCK_TIMESTAMP) ||
1244 		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1245 			net_enable_timestamp();
1246 	}
1247 out:
1248 	return newsk;
1249 }
1250 EXPORT_SYMBOL_GPL(sk_clone);
1251 
1252 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1253 {
1254 	__sk_dst_set(sk, dst);
1255 	sk->sk_route_caps = dst->dev->features;
1256 	if (sk->sk_route_caps & NETIF_F_GSO)
1257 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1258 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1259 	if (sk_can_gso(sk)) {
1260 		if (dst->header_len) {
1261 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1262 		} else {
1263 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1264 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1265 		}
1266 	}
1267 }
1268 EXPORT_SYMBOL_GPL(sk_setup_caps);
1269 
1270 void __init sk_init(void)
1271 {
1272 	if (totalram_pages <= 4096) {
1273 		sysctl_wmem_max = 32767;
1274 		sysctl_rmem_max = 32767;
1275 		sysctl_wmem_default = 32767;
1276 		sysctl_rmem_default = 32767;
1277 	} else if (totalram_pages >= 131072) {
1278 		sysctl_wmem_max = 131071;
1279 		sysctl_rmem_max = 131071;
1280 	}
1281 }
1282 
1283 /*
1284  *	Simple resource managers for sockets.
1285  */
1286 
1287 
1288 /*
1289  * Write buffer destructor automatically called from kfree_skb.
1290  */
1291 void sock_wfree(struct sk_buff *skb)
1292 {
1293 	struct sock *sk = skb->sk;
1294 	unsigned int len = skb->truesize;
1295 
1296 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1297 		/*
1298 		 * Keep a reference on sk_wmem_alloc, this will be released
1299 		 * after sk_write_space() call
1300 		 */
1301 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1302 		sk->sk_write_space(sk);
1303 		len = 1;
1304 	}
1305 	/*
1306 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1307 	 * could not do because of in-flight packets
1308 	 */
1309 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1310 		__sk_free(sk);
1311 }
1312 EXPORT_SYMBOL(sock_wfree);
1313 
1314 /*
1315  * Read buffer destructor automatically called from kfree_skb.
1316  */
1317 void sock_rfree(struct sk_buff *skb)
1318 {
1319 	struct sock *sk = skb->sk;
1320 
1321 	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1322 	sk_mem_uncharge(skb->sk, skb->truesize);
1323 }
1324 EXPORT_SYMBOL(sock_rfree);
1325 
1326 
1327 int sock_i_uid(struct sock *sk)
1328 {
1329 	int uid;
1330 
1331 	read_lock(&sk->sk_callback_lock);
1332 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1333 	read_unlock(&sk->sk_callback_lock);
1334 	return uid;
1335 }
1336 EXPORT_SYMBOL(sock_i_uid);
1337 
1338 unsigned long sock_i_ino(struct sock *sk)
1339 {
1340 	unsigned long ino;
1341 
1342 	read_lock(&sk->sk_callback_lock);
1343 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1344 	read_unlock(&sk->sk_callback_lock);
1345 	return ino;
1346 }
1347 EXPORT_SYMBOL(sock_i_ino);
1348 
1349 /*
1350  * Allocate a skb from the socket's send buffer.
1351  */
1352 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1353 			     gfp_t priority)
1354 {
1355 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1356 		struct sk_buff *skb = alloc_skb(size, priority);
1357 		if (skb) {
1358 			skb_set_owner_w(skb, sk);
1359 			return skb;
1360 		}
1361 	}
1362 	return NULL;
1363 }
1364 EXPORT_SYMBOL(sock_wmalloc);
1365 
1366 /*
1367  * Allocate a skb from the socket's receive buffer.
1368  */
1369 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1370 			     gfp_t priority)
1371 {
1372 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1373 		struct sk_buff *skb = alloc_skb(size, priority);
1374 		if (skb) {
1375 			skb_set_owner_r(skb, sk);
1376 			return skb;
1377 		}
1378 	}
1379 	return NULL;
1380 }
1381 
1382 /*
1383  * Allocate a memory block from the socket's option memory buffer.
1384  */
1385 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1386 {
1387 	if ((unsigned)size <= sysctl_optmem_max &&
1388 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1389 		void *mem;
1390 		/* First do the add, to avoid the race if kmalloc
1391 		 * might sleep.
1392 		 */
1393 		atomic_add(size, &sk->sk_omem_alloc);
1394 		mem = kmalloc(size, priority);
1395 		if (mem)
1396 			return mem;
1397 		atomic_sub(size, &sk->sk_omem_alloc);
1398 	}
1399 	return NULL;
1400 }
1401 EXPORT_SYMBOL(sock_kmalloc);
1402 
1403 /*
1404  * Free an option memory block.
1405  */
1406 void sock_kfree_s(struct sock *sk, void *mem, int size)
1407 {
1408 	kfree(mem);
1409 	atomic_sub(size, &sk->sk_omem_alloc);
1410 }
1411 EXPORT_SYMBOL(sock_kfree_s);
1412 
1413 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1414    I think, these locks should be removed for datagram sockets.
1415  */
1416 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1417 {
1418 	DEFINE_WAIT(wait);
1419 
1420 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1421 	for (;;) {
1422 		if (!timeo)
1423 			break;
1424 		if (signal_pending(current))
1425 			break;
1426 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1427 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1428 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1429 			break;
1430 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1431 			break;
1432 		if (sk->sk_err)
1433 			break;
1434 		timeo = schedule_timeout(timeo);
1435 	}
1436 	finish_wait(sk_sleep(sk), &wait);
1437 	return timeo;
1438 }
1439 
1440 
1441 /*
1442  *	Generic send/receive buffer handlers
1443  */
1444 
1445 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1446 				     unsigned long data_len, int noblock,
1447 				     int *errcode)
1448 {
1449 	struct sk_buff *skb;
1450 	gfp_t gfp_mask;
1451 	long timeo;
1452 	int err;
1453 
1454 	gfp_mask = sk->sk_allocation;
1455 	if (gfp_mask & __GFP_WAIT)
1456 		gfp_mask |= __GFP_REPEAT;
1457 
1458 	timeo = sock_sndtimeo(sk, noblock);
1459 	while (1) {
1460 		err = sock_error(sk);
1461 		if (err != 0)
1462 			goto failure;
1463 
1464 		err = -EPIPE;
1465 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1466 			goto failure;
1467 
1468 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1469 			skb = alloc_skb(header_len, gfp_mask);
1470 			if (skb) {
1471 				int npages;
1472 				int i;
1473 
1474 				/* No pages, we're done... */
1475 				if (!data_len)
1476 					break;
1477 
1478 				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1479 				skb->truesize += data_len;
1480 				skb_shinfo(skb)->nr_frags = npages;
1481 				for (i = 0; i < npages; i++) {
1482 					struct page *page;
1483 					skb_frag_t *frag;
1484 
1485 					page = alloc_pages(sk->sk_allocation, 0);
1486 					if (!page) {
1487 						err = -ENOBUFS;
1488 						skb_shinfo(skb)->nr_frags = i;
1489 						kfree_skb(skb);
1490 						goto failure;
1491 					}
1492 
1493 					frag = &skb_shinfo(skb)->frags[i];
1494 					frag->page = page;
1495 					frag->page_offset = 0;
1496 					frag->size = (data_len >= PAGE_SIZE ?
1497 						      PAGE_SIZE :
1498 						      data_len);
1499 					data_len -= PAGE_SIZE;
1500 				}
1501 
1502 				/* Full success... */
1503 				break;
1504 			}
1505 			err = -ENOBUFS;
1506 			goto failure;
1507 		}
1508 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1509 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1510 		err = -EAGAIN;
1511 		if (!timeo)
1512 			goto failure;
1513 		if (signal_pending(current))
1514 			goto interrupted;
1515 		timeo = sock_wait_for_wmem(sk, timeo);
1516 	}
1517 
1518 	skb_set_owner_w(skb, sk);
1519 	return skb;
1520 
1521 interrupted:
1522 	err = sock_intr_errno(timeo);
1523 failure:
1524 	*errcode = err;
1525 	return NULL;
1526 }
1527 EXPORT_SYMBOL(sock_alloc_send_pskb);
1528 
1529 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1530 				    int noblock, int *errcode)
1531 {
1532 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1533 }
1534 EXPORT_SYMBOL(sock_alloc_send_skb);
1535 
1536 static void __lock_sock(struct sock *sk)
1537 {
1538 	DEFINE_WAIT(wait);
1539 
1540 	for (;;) {
1541 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1542 					TASK_UNINTERRUPTIBLE);
1543 		spin_unlock_bh(&sk->sk_lock.slock);
1544 		schedule();
1545 		spin_lock_bh(&sk->sk_lock.slock);
1546 		if (!sock_owned_by_user(sk))
1547 			break;
1548 	}
1549 	finish_wait(&sk->sk_lock.wq, &wait);
1550 }
1551 
1552 static void __release_sock(struct sock *sk)
1553 {
1554 	struct sk_buff *skb = sk->sk_backlog.head;
1555 
1556 	do {
1557 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1558 		bh_unlock_sock(sk);
1559 
1560 		do {
1561 			struct sk_buff *next = skb->next;
1562 
1563 			WARN_ON_ONCE(skb_dst_is_noref(skb));
1564 			skb->next = NULL;
1565 			sk_backlog_rcv(sk, skb);
1566 
1567 			/*
1568 			 * We are in process context here with softirqs
1569 			 * disabled, use cond_resched_softirq() to preempt.
1570 			 * This is safe to do because we've taken the backlog
1571 			 * queue private:
1572 			 */
1573 			cond_resched_softirq();
1574 
1575 			skb = next;
1576 		} while (skb != NULL);
1577 
1578 		bh_lock_sock(sk);
1579 	} while ((skb = sk->sk_backlog.head) != NULL);
1580 
1581 	/*
1582 	 * Doing the zeroing here guarantee we can not loop forever
1583 	 * while a wild producer attempts to flood us.
1584 	 */
1585 	sk->sk_backlog.len = 0;
1586 }
1587 
1588 /**
1589  * sk_wait_data - wait for data to arrive at sk_receive_queue
1590  * @sk:    sock to wait on
1591  * @timeo: for how long
1592  *
1593  * Now socket state including sk->sk_err is changed only under lock,
1594  * hence we may omit checks after joining wait queue.
1595  * We check receive queue before schedule() only as optimization;
1596  * it is very likely that release_sock() added new data.
1597  */
1598 int sk_wait_data(struct sock *sk, long *timeo)
1599 {
1600 	int rc;
1601 	DEFINE_WAIT(wait);
1602 
1603 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1604 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1605 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1606 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1607 	finish_wait(sk_sleep(sk), &wait);
1608 	return rc;
1609 }
1610 EXPORT_SYMBOL(sk_wait_data);
1611 
1612 /**
1613  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1614  *	@sk: socket
1615  *	@size: memory size to allocate
1616  *	@kind: allocation type
1617  *
1618  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1619  *	rmem allocation. This function assumes that protocols which have
1620  *	memory_pressure use sk_wmem_queued as write buffer accounting.
1621  */
1622 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1623 {
1624 	struct proto *prot = sk->sk_prot;
1625 	int amt = sk_mem_pages(size);
1626 	int allocated;
1627 
1628 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1629 	allocated = atomic_add_return(amt, prot->memory_allocated);
1630 
1631 	/* Under limit. */
1632 	if (allocated <= prot->sysctl_mem[0]) {
1633 		if (prot->memory_pressure && *prot->memory_pressure)
1634 			*prot->memory_pressure = 0;
1635 		return 1;
1636 	}
1637 
1638 	/* Under pressure. */
1639 	if (allocated > prot->sysctl_mem[1])
1640 		if (prot->enter_memory_pressure)
1641 			prot->enter_memory_pressure(sk);
1642 
1643 	/* Over hard limit. */
1644 	if (allocated > prot->sysctl_mem[2])
1645 		goto suppress_allocation;
1646 
1647 	/* guarantee minimum buffer size under pressure */
1648 	if (kind == SK_MEM_RECV) {
1649 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1650 			return 1;
1651 	} else { /* SK_MEM_SEND */
1652 		if (sk->sk_type == SOCK_STREAM) {
1653 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1654 				return 1;
1655 		} else if (atomic_read(&sk->sk_wmem_alloc) <
1656 			   prot->sysctl_wmem[0])
1657 				return 1;
1658 	}
1659 
1660 	if (prot->memory_pressure) {
1661 		int alloc;
1662 
1663 		if (!*prot->memory_pressure)
1664 			return 1;
1665 		alloc = percpu_counter_read_positive(prot->sockets_allocated);
1666 		if (prot->sysctl_mem[2] > alloc *
1667 		    sk_mem_pages(sk->sk_wmem_queued +
1668 				 atomic_read(&sk->sk_rmem_alloc) +
1669 				 sk->sk_forward_alloc))
1670 			return 1;
1671 	}
1672 
1673 suppress_allocation:
1674 
1675 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1676 		sk_stream_moderate_sndbuf(sk);
1677 
1678 		/* Fail only if socket is _under_ its sndbuf.
1679 		 * In this case we cannot block, so that we have to fail.
1680 		 */
1681 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1682 			return 1;
1683 	}
1684 
1685 	/* Alas. Undo changes. */
1686 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1687 	atomic_sub(amt, prot->memory_allocated);
1688 	return 0;
1689 }
1690 EXPORT_SYMBOL(__sk_mem_schedule);
1691 
1692 /**
1693  *	__sk_reclaim - reclaim memory_allocated
1694  *	@sk: socket
1695  */
1696 void __sk_mem_reclaim(struct sock *sk)
1697 {
1698 	struct proto *prot = sk->sk_prot;
1699 
1700 	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1701 		   prot->memory_allocated);
1702 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1703 
1704 	if (prot->memory_pressure && *prot->memory_pressure &&
1705 	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1706 		*prot->memory_pressure = 0;
1707 }
1708 EXPORT_SYMBOL(__sk_mem_reclaim);
1709 
1710 
1711 /*
1712  * Set of default routines for initialising struct proto_ops when
1713  * the protocol does not support a particular function. In certain
1714  * cases where it makes no sense for a protocol to have a "do nothing"
1715  * function, some default processing is provided.
1716  */
1717 
1718 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1719 {
1720 	return -EOPNOTSUPP;
1721 }
1722 EXPORT_SYMBOL(sock_no_bind);
1723 
1724 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1725 		    int len, int flags)
1726 {
1727 	return -EOPNOTSUPP;
1728 }
1729 EXPORT_SYMBOL(sock_no_connect);
1730 
1731 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1732 {
1733 	return -EOPNOTSUPP;
1734 }
1735 EXPORT_SYMBOL(sock_no_socketpair);
1736 
1737 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1738 {
1739 	return -EOPNOTSUPP;
1740 }
1741 EXPORT_SYMBOL(sock_no_accept);
1742 
1743 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1744 		    int *len, int peer)
1745 {
1746 	return -EOPNOTSUPP;
1747 }
1748 EXPORT_SYMBOL(sock_no_getname);
1749 
1750 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1751 {
1752 	return 0;
1753 }
1754 EXPORT_SYMBOL(sock_no_poll);
1755 
1756 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1757 {
1758 	return -EOPNOTSUPP;
1759 }
1760 EXPORT_SYMBOL(sock_no_ioctl);
1761 
1762 int sock_no_listen(struct socket *sock, int backlog)
1763 {
1764 	return -EOPNOTSUPP;
1765 }
1766 EXPORT_SYMBOL(sock_no_listen);
1767 
1768 int sock_no_shutdown(struct socket *sock, int how)
1769 {
1770 	return -EOPNOTSUPP;
1771 }
1772 EXPORT_SYMBOL(sock_no_shutdown);
1773 
1774 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1775 		    char __user *optval, unsigned int optlen)
1776 {
1777 	return -EOPNOTSUPP;
1778 }
1779 EXPORT_SYMBOL(sock_no_setsockopt);
1780 
1781 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1782 		    char __user *optval, int __user *optlen)
1783 {
1784 	return -EOPNOTSUPP;
1785 }
1786 EXPORT_SYMBOL(sock_no_getsockopt);
1787 
1788 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1789 		    size_t len)
1790 {
1791 	return -EOPNOTSUPP;
1792 }
1793 EXPORT_SYMBOL(sock_no_sendmsg);
1794 
1795 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1796 		    size_t len, int flags)
1797 {
1798 	return -EOPNOTSUPP;
1799 }
1800 EXPORT_SYMBOL(sock_no_recvmsg);
1801 
1802 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1803 {
1804 	/* Mirror missing mmap method error code */
1805 	return -ENODEV;
1806 }
1807 EXPORT_SYMBOL(sock_no_mmap);
1808 
1809 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1810 {
1811 	ssize_t res;
1812 	struct msghdr msg = {.msg_flags = flags};
1813 	struct kvec iov;
1814 	char *kaddr = kmap(page);
1815 	iov.iov_base = kaddr + offset;
1816 	iov.iov_len = size;
1817 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1818 	kunmap(page);
1819 	return res;
1820 }
1821 EXPORT_SYMBOL(sock_no_sendpage);
1822 
1823 /*
1824  *	Default Socket Callbacks
1825  */
1826 
1827 static void sock_def_wakeup(struct sock *sk)
1828 {
1829 	struct socket_wq *wq;
1830 
1831 	rcu_read_lock();
1832 	wq = rcu_dereference(sk->sk_wq);
1833 	if (wq_has_sleeper(wq))
1834 		wake_up_interruptible_all(&wq->wait);
1835 	rcu_read_unlock();
1836 }
1837 
1838 static void sock_def_error_report(struct sock *sk)
1839 {
1840 	struct socket_wq *wq;
1841 
1842 	rcu_read_lock();
1843 	wq = rcu_dereference(sk->sk_wq);
1844 	if (wq_has_sleeper(wq))
1845 		wake_up_interruptible_poll(&wq->wait, POLLERR);
1846 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1847 	rcu_read_unlock();
1848 }
1849 
1850 static void sock_def_readable(struct sock *sk, int len)
1851 {
1852 	struct socket_wq *wq;
1853 
1854 	rcu_read_lock();
1855 	wq = rcu_dereference(sk->sk_wq);
1856 	if (wq_has_sleeper(wq))
1857 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
1858 						POLLRDNORM | POLLRDBAND);
1859 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1860 	rcu_read_unlock();
1861 }
1862 
1863 static void sock_def_write_space(struct sock *sk)
1864 {
1865 	struct socket_wq *wq;
1866 
1867 	rcu_read_lock();
1868 
1869 	/* Do not wake up a writer until he can make "significant"
1870 	 * progress.  --DaveM
1871 	 */
1872 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1873 		wq = rcu_dereference(sk->sk_wq);
1874 		if (wq_has_sleeper(wq))
1875 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
1876 						POLLWRNORM | POLLWRBAND);
1877 
1878 		/* Should agree with poll, otherwise some programs break */
1879 		if (sock_writeable(sk))
1880 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1881 	}
1882 
1883 	rcu_read_unlock();
1884 }
1885 
1886 static void sock_def_destruct(struct sock *sk)
1887 {
1888 	kfree(sk->sk_protinfo);
1889 }
1890 
1891 void sk_send_sigurg(struct sock *sk)
1892 {
1893 	if (sk->sk_socket && sk->sk_socket->file)
1894 		if (send_sigurg(&sk->sk_socket->file->f_owner))
1895 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1896 }
1897 EXPORT_SYMBOL(sk_send_sigurg);
1898 
1899 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1900 		    unsigned long expires)
1901 {
1902 	if (!mod_timer(timer, expires))
1903 		sock_hold(sk);
1904 }
1905 EXPORT_SYMBOL(sk_reset_timer);
1906 
1907 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1908 {
1909 	if (timer_pending(timer) && del_timer(timer))
1910 		__sock_put(sk);
1911 }
1912 EXPORT_SYMBOL(sk_stop_timer);
1913 
1914 void sock_init_data(struct socket *sock, struct sock *sk)
1915 {
1916 	skb_queue_head_init(&sk->sk_receive_queue);
1917 	skb_queue_head_init(&sk->sk_write_queue);
1918 	skb_queue_head_init(&sk->sk_error_queue);
1919 #ifdef CONFIG_NET_DMA
1920 	skb_queue_head_init(&sk->sk_async_wait_queue);
1921 #endif
1922 
1923 	sk->sk_send_head	=	NULL;
1924 
1925 	init_timer(&sk->sk_timer);
1926 
1927 	sk->sk_allocation	=	GFP_KERNEL;
1928 	sk->sk_rcvbuf		=	sysctl_rmem_default;
1929 	sk->sk_sndbuf		=	sysctl_wmem_default;
1930 	sk->sk_state		=	TCP_CLOSE;
1931 	sk_set_socket(sk, sock);
1932 
1933 	sock_set_flag(sk, SOCK_ZAPPED);
1934 
1935 	if (sock) {
1936 		sk->sk_type	=	sock->type;
1937 		sk->sk_wq	=	sock->wq;
1938 		sock->sk	=	sk;
1939 	} else
1940 		sk->sk_wq	=	NULL;
1941 
1942 	spin_lock_init(&sk->sk_dst_lock);
1943 	rwlock_init(&sk->sk_callback_lock);
1944 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1945 			af_callback_keys + sk->sk_family,
1946 			af_family_clock_key_strings[sk->sk_family]);
1947 
1948 	sk->sk_state_change	=	sock_def_wakeup;
1949 	sk->sk_data_ready	=	sock_def_readable;
1950 	sk->sk_write_space	=	sock_def_write_space;
1951 	sk->sk_error_report	=	sock_def_error_report;
1952 	sk->sk_destruct		=	sock_def_destruct;
1953 
1954 	sk->sk_sndmsg_page	=	NULL;
1955 	sk->sk_sndmsg_off	=	0;
1956 
1957 	sk->sk_peercred.pid 	=	0;
1958 	sk->sk_peercred.uid	=	-1;
1959 	sk->sk_peercred.gid	=	-1;
1960 	sk->sk_write_pending	=	0;
1961 	sk->sk_rcvlowat		=	1;
1962 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1963 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1964 
1965 	sk->sk_stamp = ktime_set(-1L, 0);
1966 
1967 	/*
1968 	 * Before updating sk_refcnt, we must commit prior changes to memory
1969 	 * (Documentation/RCU/rculist_nulls.txt for details)
1970 	 */
1971 	smp_wmb();
1972 	atomic_set(&sk->sk_refcnt, 1);
1973 	atomic_set(&sk->sk_drops, 0);
1974 }
1975 EXPORT_SYMBOL(sock_init_data);
1976 
1977 void lock_sock_nested(struct sock *sk, int subclass)
1978 {
1979 	might_sleep();
1980 	spin_lock_bh(&sk->sk_lock.slock);
1981 	if (sk->sk_lock.owned)
1982 		__lock_sock(sk);
1983 	sk->sk_lock.owned = 1;
1984 	spin_unlock(&sk->sk_lock.slock);
1985 	/*
1986 	 * The sk_lock has mutex_lock() semantics here:
1987 	 */
1988 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1989 	local_bh_enable();
1990 }
1991 EXPORT_SYMBOL(lock_sock_nested);
1992 
1993 void release_sock(struct sock *sk)
1994 {
1995 	/*
1996 	 * The sk_lock has mutex_unlock() semantics:
1997 	 */
1998 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1999 
2000 	spin_lock_bh(&sk->sk_lock.slock);
2001 	if (sk->sk_backlog.tail)
2002 		__release_sock(sk);
2003 	sk->sk_lock.owned = 0;
2004 	if (waitqueue_active(&sk->sk_lock.wq))
2005 		wake_up(&sk->sk_lock.wq);
2006 	spin_unlock_bh(&sk->sk_lock.slock);
2007 }
2008 EXPORT_SYMBOL(release_sock);
2009 
2010 /**
2011  * lock_sock_fast - fast version of lock_sock
2012  * @sk: socket
2013  *
2014  * This version should be used for very small section, where process wont block
2015  * return false if fast path is taken
2016  *   sk_lock.slock locked, owned = 0, BH disabled
2017  * return true if slow path is taken
2018  *   sk_lock.slock unlocked, owned = 1, BH enabled
2019  */
2020 bool lock_sock_fast(struct sock *sk)
2021 {
2022 	might_sleep();
2023 	spin_lock_bh(&sk->sk_lock.slock);
2024 
2025 	if (!sk->sk_lock.owned)
2026 		/*
2027 		 * Note : We must disable BH
2028 		 */
2029 		return false;
2030 
2031 	__lock_sock(sk);
2032 	sk->sk_lock.owned = 1;
2033 	spin_unlock(&sk->sk_lock.slock);
2034 	/*
2035 	 * The sk_lock has mutex_lock() semantics here:
2036 	 */
2037 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2038 	local_bh_enable();
2039 	return true;
2040 }
2041 EXPORT_SYMBOL(lock_sock_fast);
2042 
2043 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2044 {
2045 	struct timeval tv;
2046 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2047 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2048 	tv = ktime_to_timeval(sk->sk_stamp);
2049 	if (tv.tv_sec == -1)
2050 		return -ENOENT;
2051 	if (tv.tv_sec == 0) {
2052 		sk->sk_stamp = ktime_get_real();
2053 		tv = ktime_to_timeval(sk->sk_stamp);
2054 	}
2055 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2056 }
2057 EXPORT_SYMBOL(sock_get_timestamp);
2058 
2059 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2060 {
2061 	struct timespec ts;
2062 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2063 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2064 	ts = ktime_to_timespec(sk->sk_stamp);
2065 	if (ts.tv_sec == -1)
2066 		return -ENOENT;
2067 	if (ts.tv_sec == 0) {
2068 		sk->sk_stamp = ktime_get_real();
2069 		ts = ktime_to_timespec(sk->sk_stamp);
2070 	}
2071 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2072 }
2073 EXPORT_SYMBOL(sock_get_timestampns);
2074 
2075 void sock_enable_timestamp(struct sock *sk, int flag)
2076 {
2077 	if (!sock_flag(sk, flag)) {
2078 		sock_set_flag(sk, flag);
2079 		/*
2080 		 * we just set one of the two flags which require net
2081 		 * time stamping, but time stamping might have been on
2082 		 * already because of the other one
2083 		 */
2084 		if (!sock_flag(sk,
2085 				flag == SOCK_TIMESTAMP ?
2086 				SOCK_TIMESTAMPING_RX_SOFTWARE :
2087 				SOCK_TIMESTAMP))
2088 			net_enable_timestamp();
2089 	}
2090 }
2091 
2092 /*
2093  *	Get a socket option on an socket.
2094  *
2095  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2096  *	asynchronous errors should be reported by getsockopt. We assume
2097  *	this means if you specify SO_ERROR (otherwise whats the point of it).
2098  */
2099 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2100 			   char __user *optval, int __user *optlen)
2101 {
2102 	struct sock *sk = sock->sk;
2103 
2104 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2105 }
2106 EXPORT_SYMBOL(sock_common_getsockopt);
2107 
2108 #ifdef CONFIG_COMPAT
2109 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2110 				  char __user *optval, int __user *optlen)
2111 {
2112 	struct sock *sk = sock->sk;
2113 
2114 	if (sk->sk_prot->compat_getsockopt != NULL)
2115 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2116 						      optval, optlen);
2117 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2118 }
2119 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2120 #endif
2121 
2122 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2123 			struct msghdr *msg, size_t size, int flags)
2124 {
2125 	struct sock *sk = sock->sk;
2126 	int addr_len = 0;
2127 	int err;
2128 
2129 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2130 				   flags & ~MSG_DONTWAIT, &addr_len);
2131 	if (err >= 0)
2132 		msg->msg_namelen = addr_len;
2133 	return err;
2134 }
2135 EXPORT_SYMBOL(sock_common_recvmsg);
2136 
2137 /*
2138  *	Set socket options on an inet socket.
2139  */
2140 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2141 			   char __user *optval, unsigned int optlen)
2142 {
2143 	struct sock *sk = sock->sk;
2144 
2145 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2146 }
2147 EXPORT_SYMBOL(sock_common_setsockopt);
2148 
2149 #ifdef CONFIG_COMPAT
2150 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2151 				  char __user *optval, unsigned int optlen)
2152 {
2153 	struct sock *sk = sock->sk;
2154 
2155 	if (sk->sk_prot->compat_setsockopt != NULL)
2156 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2157 						      optval, optlen);
2158 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2159 }
2160 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2161 #endif
2162 
2163 void sk_common_release(struct sock *sk)
2164 {
2165 	if (sk->sk_prot->destroy)
2166 		sk->sk_prot->destroy(sk);
2167 
2168 	/*
2169 	 * Observation: when sock_common_release is called, processes have
2170 	 * no access to socket. But net still has.
2171 	 * Step one, detach it from networking:
2172 	 *
2173 	 * A. Remove from hash tables.
2174 	 */
2175 
2176 	sk->sk_prot->unhash(sk);
2177 
2178 	/*
2179 	 * In this point socket cannot receive new packets, but it is possible
2180 	 * that some packets are in flight because some CPU runs receiver and
2181 	 * did hash table lookup before we unhashed socket. They will achieve
2182 	 * receive queue and will be purged by socket destructor.
2183 	 *
2184 	 * Also we still have packets pending on receive queue and probably,
2185 	 * our own packets waiting in device queues. sock_destroy will drain
2186 	 * receive queue, but transmitted packets will delay socket destruction
2187 	 * until the last reference will be released.
2188 	 */
2189 
2190 	sock_orphan(sk);
2191 
2192 	xfrm_sk_free_policy(sk);
2193 
2194 	sk_refcnt_debug_release(sk);
2195 	sock_put(sk);
2196 }
2197 EXPORT_SYMBOL(sk_common_release);
2198 
2199 static DEFINE_RWLOCK(proto_list_lock);
2200 static LIST_HEAD(proto_list);
2201 
2202 #ifdef CONFIG_PROC_FS
2203 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
2204 struct prot_inuse {
2205 	int val[PROTO_INUSE_NR];
2206 };
2207 
2208 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2209 
2210 #ifdef CONFIG_NET_NS
2211 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2212 {
2213 	int cpu = smp_processor_id();
2214 	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
2215 }
2216 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2217 
2218 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2219 {
2220 	int cpu, idx = prot->inuse_idx;
2221 	int res = 0;
2222 
2223 	for_each_possible_cpu(cpu)
2224 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2225 
2226 	return res >= 0 ? res : 0;
2227 }
2228 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2229 
2230 static int __net_init sock_inuse_init_net(struct net *net)
2231 {
2232 	net->core.inuse = alloc_percpu(struct prot_inuse);
2233 	return net->core.inuse ? 0 : -ENOMEM;
2234 }
2235 
2236 static void __net_exit sock_inuse_exit_net(struct net *net)
2237 {
2238 	free_percpu(net->core.inuse);
2239 }
2240 
2241 static struct pernet_operations net_inuse_ops = {
2242 	.init = sock_inuse_init_net,
2243 	.exit = sock_inuse_exit_net,
2244 };
2245 
2246 static __init int net_inuse_init(void)
2247 {
2248 	if (register_pernet_subsys(&net_inuse_ops))
2249 		panic("Cannot initialize net inuse counters");
2250 
2251 	return 0;
2252 }
2253 
2254 core_initcall(net_inuse_init);
2255 #else
2256 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2257 
2258 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2259 {
2260 	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2261 }
2262 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2263 
2264 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2265 {
2266 	int cpu, idx = prot->inuse_idx;
2267 	int res = 0;
2268 
2269 	for_each_possible_cpu(cpu)
2270 		res += per_cpu(prot_inuse, cpu).val[idx];
2271 
2272 	return res >= 0 ? res : 0;
2273 }
2274 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2275 #endif
2276 
2277 static void assign_proto_idx(struct proto *prot)
2278 {
2279 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2280 
2281 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2282 		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2283 		return;
2284 	}
2285 
2286 	set_bit(prot->inuse_idx, proto_inuse_idx);
2287 }
2288 
2289 static void release_proto_idx(struct proto *prot)
2290 {
2291 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2292 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2293 }
2294 #else
2295 static inline void assign_proto_idx(struct proto *prot)
2296 {
2297 }
2298 
2299 static inline void release_proto_idx(struct proto *prot)
2300 {
2301 }
2302 #endif
2303 
2304 int proto_register(struct proto *prot, int alloc_slab)
2305 {
2306 	if (alloc_slab) {
2307 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2308 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2309 					NULL);
2310 
2311 		if (prot->slab == NULL) {
2312 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2313 			       prot->name);
2314 			goto out;
2315 		}
2316 
2317 		if (prot->rsk_prot != NULL) {
2318 			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2319 			if (prot->rsk_prot->slab_name == NULL)
2320 				goto out_free_sock_slab;
2321 
2322 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2323 								 prot->rsk_prot->obj_size, 0,
2324 								 SLAB_HWCACHE_ALIGN, NULL);
2325 
2326 			if (prot->rsk_prot->slab == NULL) {
2327 				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2328 				       prot->name);
2329 				goto out_free_request_sock_slab_name;
2330 			}
2331 		}
2332 
2333 		if (prot->twsk_prot != NULL) {
2334 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2335 
2336 			if (prot->twsk_prot->twsk_slab_name == NULL)
2337 				goto out_free_request_sock_slab;
2338 
2339 			prot->twsk_prot->twsk_slab =
2340 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2341 						  prot->twsk_prot->twsk_obj_size,
2342 						  0,
2343 						  SLAB_HWCACHE_ALIGN |
2344 							prot->slab_flags,
2345 						  NULL);
2346 			if (prot->twsk_prot->twsk_slab == NULL)
2347 				goto out_free_timewait_sock_slab_name;
2348 		}
2349 	}
2350 
2351 	write_lock(&proto_list_lock);
2352 	list_add(&prot->node, &proto_list);
2353 	assign_proto_idx(prot);
2354 	write_unlock(&proto_list_lock);
2355 	return 0;
2356 
2357 out_free_timewait_sock_slab_name:
2358 	kfree(prot->twsk_prot->twsk_slab_name);
2359 out_free_request_sock_slab:
2360 	if (prot->rsk_prot && prot->rsk_prot->slab) {
2361 		kmem_cache_destroy(prot->rsk_prot->slab);
2362 		prot->rsk_prot->slab = NULL;
2363 	}
2364 out_free_request_sock_slab_name:
2365 	if (prot->rsk_prot)
2366 		kfree(prot->rsk_prot->slab_name);
2367 out_free_sock_slab:
2368 	kmem_cache_destroy(prot->slab);
2369 	prot->slab = NULL;
2370 out:
2371 	return -ENOBUFS;
2372 }
2373 EXPORT_SYMBOL(proto_register);
2374 
2375 void proto_unregister(struct proto *prot)
2376 {
2377 	write_lock(&proto_list_lock);
2378 	release_proto_idx(prot);
2379 	list_del(&prot->node);
2380 	write_unlock(&proto_list_lock);
2381 
2382 	if (prot->slab != NULL) {
2383 		kmem_cache_destroy(prot->slab);
2384 		prot->slab = NULL;
2385 	}
2386 
2387 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2388 		kmem_cache_destroy(prot->rsk_prot->slab);
2389 		kfree(prot->rsk_prot->slab_name);
2390 		prot->rsk_prot->slab = NULL;
2391 	}
2392 
2393 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2394 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2395 		kfree(prot->twsk_prot->twsk_slab_name);
2396 		prot->twsk_prot->twsk_slab = NULL;
2397 	}
2398 }
2399 EXPORT_SYMBOL(proto_unregister);
2400 
2401 #ifdef CONFIG_PROC_FS
2402 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2403 	__acquires(proto_list_lock)
2404 {
2405 	read_lock(&proto_list_lock);
2406 	return seq_list_start_head(&proto_list, *pos);
2407 }
2408 
2409 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2410 {
2411 	return seq_list_next(v, &proto_list, pos);
2412 }
2413 
2414 static void proto_seq_stop(struct seq_file *seq, void *v)
2415 	__releases(proto_list_lock)
2416 {
2417 	read_unlock(&proto_list_lock);
2418 }
2419 
2420 static char proto_method_implemented(const void *method)
2421 {
2422 	return method == NULL ? 'n' : 'y';
2423 }
2424 
2425 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2426 {
2427 	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2428 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2429 		   proto->name,
2430 		   proto->obj_size,
2431 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2432 		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2433 		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2434 		   proto->max_header,
2435 		   proto->slab == NULL ? "no" : "yes",
2436 		   module_name(proto->owner),
2437 		   proto_method_implemented(proto->close),
2438 		   proto_method_implemented(proto->connect),
2439 		   proto_method_implemented(proto->disconnect),
2440 		   proto_method_implemented(proto->accept),
2441 		   proto_method_implemented(proto->ioctl),
2442 		   proto_method_implemented(proto->init),
2443 		   proto_method_implemented(proto->destroy),
2444 		   proto_method_implemented(proto->shutdown),
2445 		   proto_method_implemented(proto->setsockopt),
2446 		   proto_method_implemented(proto->getsockopt),
2447 		   proto_method_implemented(proto->sendmsg),
2448 		   proto_method_implemented(proto->recvmsg),
2449 		   proto_method_implemented(proto->sendpage),
2450 		   proto_method_implemented(proto->bind),
2451 		   proto_method_implemented(proto->backlog_rcv),
2452 		   proto_method_implemented(proto->hash),
2453 		   proto_method_implemented(proto->unhash),
2454 		   proto_method_implemented(proto->get_port),
2455 		   proto_method_implemented(proto->enter_memory_pressure));
2456 }
2457 
2458 static int proto_seq_show(struct seq_file *seq, void *v)
2459 {
2460 	if (v == &proto_list)
2461 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2462 			   "protocol",
2463 			   "size",
2464 			   "sockets",
2465 			   "memory",
2466 			   "press",
2467 			   "maxhdr",
2468 			   "slab",
2469 			   "module",
2470 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2471 	else
2472 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2473 	return 0;
2474 }
2475 
2476 static const struct seq_operations proto_seq_ops = {
2477 	.start  = proto_seq_start,
2478 	.next   = proto_seq_next,
2479 	.stop   = proto_seq_stop,
2480 	.show   = proto_seq_show,
2481 };
2482 
2483 static int proto_seq_open(struct inode *inode, struct file *file)
2484 {
2485 	return seq_open_net(inode, file, &proto_seq_ops,
2486 			    sizeof(struct seq_net_private));
2487 }
2488 
2489 static const struct file_operations proto_seq_fops = {
2490 	.owner		= THIS_MODULE,
2491 	.open		= proto_seq_open,
2492 	.read		= seq_read,
2493 	.llseek		= seq_lseek,
2494 	.release	= seq_release_net,
2495 };
2496 
2497 static __net_init int proto_init_net(struct net *net)
2498 {
2499 	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2500 		return -ENOMEM;
2501 
2502 	return 0;
2503 }
2504 
2505 static __net_exit void proto_exit_net(struct net *net)
2506 {
2507 	proc_net_remove(net, "protocols");
2508 }
2509 
2510 
2511 static __net_initdata struct pernet_operations proto_net_ops = {
2512 	.init = proto_init_net,
2513 	.exit = proto_exit_net,
2514 };
2515 
2516 static int __init proto_init(void)
2517 {
2518 	return register_pernet_subsys(&proto_net_ops);
2519 }
2520 
2521 subsys_initcall(proto_init);
2522 
2523 #endif /* PROC_FS */
2524