xref: /openbmc/linux/net/core/sock.c (revision 664a722b)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115 #include <linux/highmem.h>
116 #include <linux/user_namespace.h>
117 #include <linux/static_key.h>
118 #include <linux/memcontrol.h>
119 #include <linux/prefetch.h>
120 
121 #include <linux/uaccess.h>
122 
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 #include <linux/sock_diag.h>
135 
136 #include <linux/filter.h>
137 #include <net/sock_reuseport.h>
138 
139 #include <trace/events/sock.h>
140 
141 #ifdef CONFIG_INET
142 #include <net/tcp.h>
143 #endif
144 
145 #include <net/busy_poll.h>
146 
147 static DEFINE_MUTEX(proto_list_mutex);
148 static LIST_HEAD(proto_list);
149 
150 /**
151  * sk_ns_capable - General socket capability test
152  * @sk: Socket to use a capability on or through
153  * @user_ns: The user namespace of the capability to use
154  * @cap: The capability to use
155  *
156  * Test to see if the opener of the socket had the capability @cap when
157  * the socket was created and if the current process has it in the user
158  * namespace @user_ns.
159  */
160 bool sk_ns_capable(const struct sock *sk,
161 		   struct user_namespace *user_ns, int cap)
162 {
163 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
164 		ns_capable(user_ns, cap);
165 }
166 EXPORT_SYMBOL(sk_ns_capable);
167 
168 /**
169  * sk_capable - Socket global capability test
170  * @sk: Socket to use a capability on or through
171  * @cap: The global capability to use
172  *
173  * Test to see if the opener of the socket had the capability @cap when
174  * the socket was created and if the current process has it in all user
175  * namespaces.
176  */
177 bool sk_capable(const struct sock *sk, int cap)
178 {
179 	return sk_ns_capable(sk, &init_user_ns, cap);
180 }
181 EXPORT_SYMBOL(sk_capable);
182 
183 /**
184  * sk_net_capable - Network namespace socket capability test
185  * @sk: Socket to use a capability on or through
186  * @cap: The capability to use
187  *
188  * Test to see if the opener of the socket had the capability @cap when
189  * the socket was created and if the current process has it over the
190  * network namespace the socket is a member of.
191  */
192 bool sk_net_capable(const struct sock *sk, int cap)
193 {
194 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
195 }
196 EXPORT_SYMBOL(sk_net_capable);
197 
198 /*
199  * Each address family might have different locking rules, so we have
200  * one slock key per address family:
201  */
202 static struct lock_class_key af_family_keys[AF_MAX];
203 static struct lock_class_key af_family_slock_keys[AF_MAX];
204 
205 /*
206  * Make lock validator output more readable. (we pre-construct these
207  * strings build-time, so that runtime initialization of socket
208  * locks is fast):
209  */
210 static const char *const af_family_key_strings[AF_MAX+1] = {
211   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
212   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
213   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
214   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
215   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
216   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
217   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
218   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
219   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
220   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
221   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
222   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
223   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
224   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_KCM"      ,
225   "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX"
226 };
227 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
228   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
229   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
230   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
231   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
232   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
233   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
234   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
235   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
236   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
237   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
238   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
239   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
240   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
241   "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_KCM"      ,
242   "slock-AF_QIPCRTR", "slock-AF_MAX"
243 };
244 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
245   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
246   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
247   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
248   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
249   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
250   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
251   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
252   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
253   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
254   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
255   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
256   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
257   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
258   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_KCM"      ,
259   "clock-AF_QIPCRTR", "clock-AF_MAX"
260 };
261 
262 /*
263  * sk_callback_lock locking rules are per-address-family,
264  * so split the lock classes by using a per-AF key:
265  */
266 static struct lock_class_key af_callback_keys[AF_MAX];
267 
268 /* Take into consideration the size of the struct sk_buff overhead in the
269  * determination of these values, since that is non-constant across
270  * platforms.  This makes socket queueing behavior and performance
271  * not depend upon such differences.
272  */
273 #define _SK_MEM_PACKETS		256
274 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
275 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
276 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
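
/* Rough orientation only (the exact figure depends on struct sk_buff layout
 * and cache line size): on a typical 64-bit build SKB_TRUESIZE(256) is about
 * 832 bytes, so SK_WMEM_MAX and SK_RMEM_MAX work out to roughly
 * 832 * 256 = 212992 bytes, the familiar net.core.{w,r}mem_default figure.
 */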
277 
278 /* Run time adjustable parameters. */
279 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
280 EXPORT_SYMBOL(sysctl_wmem_max);
281 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
282 EXPORT_SYMBOL(sysctl_rmem_max);
283 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
284 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
285 
286 /* Maximal space eaten by iovec or ancillary data plus some space */
287 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
288 EXPORT_SYMBOL(sysctl_optmem_max);
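
/* Worked example of the formula above: with an 8-byte unsigned long and
 * UIO_MAXIOV == 1024, sysctl_optmem_max defaults to
 * 8 * (2 * 1024 + 512) = 20480 bytes (10240 bytes on 32-bit).
 */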
289 
290 int sysctl_tstamp_allow_data __read_mostly = 1;
291 
292 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
293 EXPORT_SYMBOL_GPL(memalloc_socks);
294 
295 /**
296  * sk_set_memalloc - sets %SOCK_MEMALLOC
297  * @sk: socket to set it on
298  *
299  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
300  * It's the responsibility of the admin to adjust min_free_kbytes
301  * to meet the requirements.
302  */
303 void sk_set_memalloc(struct sock *sk)
304 {
305 	sock_set_flag(sk, SOCK_MEMALLOC);
306 	sk->sk_allocation |= __GFP_MEMALLOC;
307 	static_key_slow_inc(&memalloc_socks);
308 }
309 EXPORT_SYMBOL_GPL(sk_set_memalloc);
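
/* SOCK_MEMALLOC is intended for sockets that must make forward progress
 * under memory pressure (e.g. swap over NFS or NBD): it lets their packet
 * processing dip into the PF_MEMALLOC emergency reserves.
 */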
310 
311 void sk_clear_memalloc(struct sock *sk)
312 {
313 	sock_reset_flag(sk, SOCK_MEMALLOC);
314 	sk->sk_allocation &= ~__GFP_MEMALLOC;
315 	static_key_slow_dec(&memalloc_socks);
316 
317 	/*
318 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
319 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
320 	 * it has rmem allocations due to the last swapfile being deactivated
321 	 * but there is a risk that the socket is unusable due to exceeding
322 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
323 	 */
324 	sk_mem_reclaim(sk);
325 }
326 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
327 
328 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
329 {
330 	int ret;
331 	unsigned long pflags = current->flags;
332 
333 	/* these should have been dropped before queueing */
334 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
335 
336 	current->flags |= PF_MEMALLOC;
337 	ret = sk->sk_backlog_rcv(sk, skb);
338 	tsk_restore_flags(current, pflags, PF_MEMALLOC);
339 
340 	return ret;
341 }
342 EXPORT_SYMBOL(__sk_backlog_rcv);
343 
344 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
345 {
346 	struct timeval tv;
347 
348 	if (optlen < sizeof(tv))
349 		return -EINVAL;
350 	if (copy_from_user(&tv, optval, sizeof(tv)))
351 		return -EFAULT;
352 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
353 		return -EDOM;
354 
355 	if (tv.tv_sec < 0) {
356 		static int warned __read_mostly;
357 
358 		*timeo_p = 0;
359 		if (warned < 10 && net_ratelimit()) {
360 			warned++;
361 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
362 				__func__, current->comm, task_pid_nr(current));
363 		}
364 		return 0;
365 	}
366 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
367 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
368 		return 0;
369 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
370 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
371 	return 0;
372 }
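
/* Quick sanity check of the conversion above (illustrative, assuming
 * HZ == 1000): tv = { .tv_sec = 1, .tv_usec = 500000 } yields
 * 1 * 1000 + (500000 + 999) / 1000 = 1500 jiffies, i.e. 1.5 seconds,
 * with the microsecond part rounded up to the next tick.
 */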
373 
374 static void sock_warn_obsolete_bsdism(const char *name)
375 {
376 	static int warned;
377 	static char warncomm[TASK_COMM_LEN];
378 	if (strcmp(warncomm, current->comm) && warned < 5) {
379 		strcpy(warncomm,  current->comm);
380 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
381 			warncomm, name);
382 		warned++;
383 	}
384 }
385 
386 static bool sock_needs_netstamp(const struct sock *sk)
387 {
388 	switch (sk->sk_family) {
389 	case AF_UNSPEC:
390 	case AF_UNIX:
391 		return false;
392 	default:
393 		return true;
394 	}
395 }
396 
397 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
398 {
399 	if (sk->sk_flags & flags) {
400 		sk->sk_flags &= ~flags;
401 		if (sock_needs_netstamp(sk) &&
402 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
403 			net_disable_timestamp();
404 	}
405 }
406 
407 
408 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
409 {
410 	unsigned long flags;
411 	struct sk_buff_head *list = &sk->sk_receive_queue;
412 
413 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
414 		atomic_inc(&sk->sk_drops);
415 		trace_sock_rcvqueue_full(sk, skb);
416 		return -ENOMEM;
417 	}
418 
419 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
420 		atomic_inc(&sk->sk_drops);
421 		return -ENOBUFS;
422 	}
423 
424 	skb->dev = NULL;
425 	skb_set_owner_r(skb, sk);
426 
427 	/* We escape from the RCU-protected region, so make sure we
428 	 * don't leak a non-refcounted dst.
429 	 */
430 	skb_dst_force(skb);
431 
432 	spin_lock_irqsave(&list->lock, flags);
433 	sock_skb_set_dropcount(sk, skb);
434 	__skb_queue_tail(list, skb);
435 	spin_unlock_irqrestore(&list->lock, flags);
436 
437 	if (!sock_flag(sk, SOCK_DEAD))
438 		sk->sk_data_ready(sk);
439 	return 0;
440 }
441 EXPORT_SYMBOL(__sock_queue_rcv_skb);
442 
443 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
444 {
445 	int err;
446 
447 	err = sk_filter(sk, skb);
448 	if (err)
449 		return err;
450 
451 	return __sock_queue_rcv_skb(sk, skb);
452 }
453 EXPORT_SYMBOL(sock_queue_rcv_skb);
454 
455 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
456 		     const int nested, unsigned int trim_cap, bool refcounted)
457 {
458 	int rc = NET_RX_SUCCESS;
459 
460 	if (sk_filter_trim_cap(sk, skb, trim_cap))
461 		goto discard_and_relse;
462 
463 	skb->dev = NULL;
464 
465 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
466 		atomic_inc(&sk->sk_drops);
467 		goto discard_and_relse;
468 	}
469 	if (nested)
470 		bh_lock_sock_nested(sk);
471 	else
472 		bh_lock_sock(sk);
473 	if (!sock_owned_by_user(sk)) {
474 		/*
475 		 * trylock + unlock semantics:
476 		 */
477 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
478 
479 		rc = sk_backlog_rcv(sk, skb);
480 
481 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
482 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
483 		bh_unlock_sock(sk);
484 		atomic_inc(&sk->sk_drops);
485 		goto discard_and_relse;
486 	}
487 
488 	bh_unlock_sock(sk);
489 out:
490 	if (refcounted)
491 		sock_put(sk);
492 	return rc;
493 discard_and_relse:
494 	kfree_skb(skb);
495 	goto out;
496 }
497 EXPORT_SYMBOL(__sk_receive_skb);
498 
499 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
500 {
501 	struct dst_entry *dst = __sk_dst_get(sk);
502 
503 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
504 		sk_tx_queue_clear(sk);
505 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
506 		dst_release(dst);
507 		return NULL;
508 	}
509 
510 	return dst;
511 }
512 EXPORT_SYMBOL(__sk_dst_check);
513 
514 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
515 {
516 	struct dst_entry *dst = sk_dst_get(sk);
517 
518 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
519 		sk_dst_reset(sk);
520 		dst_release(dst);
521 		return NULL;
522 	}
523 
524 	return dst;
525 }
526 EXPORT_SYMBOL(sk_dst_check);
527 
528 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
529 				int optlen)
530 {
531 	int ret = -ENOPROTOOPT;
532 #ifdef CONFIG_NETDEVICES
533 	struct net *net = sock_net(sk);
534 	char devname[IFNAMSIZ];
535 	int index;
536 
537 	/* Sorry... */
538 	ret = -EPERM;
539 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
540 		goto out;
541 
542 	ret = -EINVAL;
543 	if (optlen < 0)
544 		goto out;
545 
546 	/* Bind this socket to a particular device like "eth0",
547 	 * as specified in the passed interface name. If the
548 	 * name is "" or the option length is zero the socket
549 	 * is not bound.
550 	 */
551 	if (optlen > IFNAMSIZ - 1)
552 		optlen = IFNAMSIZ - 1;
553 	memset(devname, 0, sizeof(devname));
554 
555 	ret = -EFAULT;
556 	if (copy_from_user(devname, optval, optlen))
557 		goto out;
558 
559 	index = 0;
560 	if (devname[0] != '\0') {
561 		struct net_device *dev;
562 
563 		rcu_read_lock();
564 		dev = dev_get_by_name_rcu(net, devname);
565 		if (dev)
566 			index = dev->ifindex;
567 		rcu_read_unlock();
568 		ret = -ENODEV;
569 		if (!dev)
570 			goto out;
571 	}
572 
573 	lock_sock(sk);
574 	sk->sk_bound_dev_if = index;
575 	sk_dst_reset(sk);
576 	release_sock(sk);
577 
578 	ret = 0;
579 
580 out:
581 #endif
582 
583 	return ret;
584 }
585 
586 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
587 				int __user *optlen, int len)
588 {
589 	int ret = -ENOPROTOOPT;
590 #ifdef CONFIG_NETDEVICES
591 	struct net *net = sock_net(sk);
592 	char devname[IFNAMSIZ];
593 
594 	if (sk->sk_bound_dev_if == 0) {
595 		len = 0;
596 		goto zero;
597 	}
598 
599 	ret = -EINVAL;
600 	if (len < IFNAMSIZ)
601 		goto out;
602 
603 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
604 	if (ret)
605 		goto out;
606 
607 	len = strlen(devname) + 1;
608 
609 	ret = -EFAULT;
610 	if (copy_to_user(optval, devname, len))
611 		goto out;
612 
613 zero:
614 	ret = -EFAULT;
615 	if (put_user(len, optlen))
616 		goto out;
617 
618 	ret = 0;
619 
620 out:
621 #endif
622 
623 	return ret;
624 }
625 
626 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
627 {
628 	if (valbool)
629 		sock_set_flag(sk, bit);
630 	else
631 		sock_reset_flag(sk, bit);
632 }
633 
634 bool sk_mc_loop(struct sock *sk)
635 {
636 	if (dev_recursion_level())
637 		return false;
638 	if (!sk)
639 		return true;
640 	switch (sk->sk_family) {
641 	case AF_INET:
642 		return inet_sk(sk)->mc_loop;
643 #if IS_ENABLED(CONFIG_IPV6)
644 	case AF_INET6:
645 		return inet6_sk(sk)->mc_loop;
646 #endif
647 	}
648 	WARN_ON(1);
649 	return true;
650 }
651 EXPORT_SYMBOL(sk_mc_loop);
652 
653 /*
654  *	This is meant for all protocols to use and covers goings on
655  *	at the socket level. Everything here is generic.
656  */
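
/* For reference, the handlers below are reached through the ordinary
 * setsockopt(2)/getsockopt(2) system calls. A minimal userspace sketch
 * (illustrative only; fd is assumed to be an open socket):
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *
 * val then reads back as twice the requested size (the request is first
 * capped at sysctl_rmem_max), because SO_RCVBUF doubles the value to account
 * for struct sk_buff overhead - see the SO_RCVBUF case below.
 */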
657 
658 int sock_setsockopt(struct socket *sock, int level, int optname,
659 		    char __user *optval, unsigned int optlen)
660 {
661 	struct sock *sk = sock->sk;
662 	int val;
663 	int valbool;
664 	struct linger ling;
665 	int ret = 0;
666 
667 	/*
668 	 *	Options without arguments
669 	 */
670 
671 	if (optname == SO_BINDTODEVICE)
672 		return sock_setbindtodevice(sk, optval, optlen);
673 
674 	if (optlen < sizeof(int))
675 		return -EINVAL;
676 
677 	if (get_user(val, (int __user *)optval))
678 		return -EFAULT;
679 
680 	valbool = val ? 1 : 0;
681 
682 	lock_sock(sk);
683 
684 	switch (optname) {
685 	case SO_DEBUG:
686 		if (val && !capable(CAP_NET_ADMIN))
687 			ret = -EACCES;
688 		else
689 			sock_valbool_flag(sk, SOCK_DBG, valbool);
690 		break;
691 	case SO_REUSEADDR:
692 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
693 		break;
694 	case SO_REUSEPORT:
695 		sk->sk_reuseport = valbool;
696 		break;
697 	case SO_TYPE:
698 	case SO_PROTOCOL:
699 	case SO_DOMAIN:
700 	case SO_ERROR:
701 		ret = -ENOPROTOOPT;
702 		break;
703 	case SO_DONTROUTE:
704 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
705 		break;
706 	case SO_BROADCAST:
707 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
708 		break;
709 	case SO_SNDBUF:
710 		/* Don't error on this - BSD doesn't, and if you think
711 		 * about it this is right. Otherwise apps have to
712 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
713 		 * are treated in BSD as hints.
714 		 */
715 		val = min_t(u32, val, sysctl_wmem_max);
716 set_sndbuf:
717 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
718 		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
719 		/* Wake up sending tasks if we upped the value. */
720 		sk->sk_write_space(sk);
721 		break;
722 
723 	case SO_SNDBUFFORCE:
724 		if (!capable(CAP_NET_ADMIN)) {
725 			ret = -EPERM;
726 			break;
727 		}
728 		goto set_sndbuf;
729 
730 	case SO_RCVBUF:
731 		/* Don't error on this - BSD doesn't, and if you think
732 		 * about it this is right. Otherwise apps have to
733 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
734 		 * are treated in BSD as hints.
735 		 */
736 		val = min_t(u32, val, sysctl_rmem_max);
737 set_rcvbuf:
738 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
739 		/*
740 		 * We double it on the way in to account for
741 		 * "struct sk_buff" etc. overhead.   Applications
742 		 * assume that the SO_RCVBUF setting they make will
743 		 * allow that much actual data to be received on that
744 		 * socket.
745 		 *
746 		 * Applications are unaware that "struct sk_buff" and
747 		 * other overheads allocate from the receive buffer
748 		 * during socket buffer allocation.
749 		 *
750 		 * And after considering the possible alternatives,
751 		 * returning the value we actually used in getsockopt
752 		 * is the most desirable behavior.
753 		 */
754 		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
755 		break;
756 
757 	case SO_RCVBUFFORCE:
758 		if (!capable(CAP_NET_ADMIN)) {
759 			ret = -EPERM;
760 			break;
761 		}
762 		goto set_rcvbuf;
763 
764 	case SO_KEEPALIVE:
765 #ifdef CONFIG_INET
766 		if (sk->sk_protocol == IPPROTO_TCP &&
767 		    sk->sk_type == SOCK_STREAM)
768 			tcp_set_keepalive(sk, valbool);
769 #endif
770 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
771 		break;
772 
773 	case SO_OOBINLINE:
774 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
775 		break;
776 
777 	case SO_NO_CHECK:
778 		sk->sk_no_check_tx = valbool;
779 		break;
780 
781 	case SO_PRIORITY:
782 		if ((val >= 0 && val <= 6) ||
783 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
784 			sk->sk_priority = val;
785 		else
786 			ret = -EPERM;
787 		break;
788 
789 	case SO_LINGER:
790 		if (optlen < sizeof(ling)) {
791 			ret = -EINVAL;	/* 1003.1g */
792 			break;
793 		}
794 		if (copy_from_user(&ling, optval, sizeof(ling))) {
795 			ret = -EFAULT;
796 			break;
797 		}
798 		if (!ling.l_onoff)
799 			sock_reset_flag(sk, SOCK_LINGER);
800 		else {
801 #if (BITS_PER_LONG == 32)
802 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
803 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
804 			else
805 #endif
806 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
807 			sock_set_flag(sk, SOCK_LINGER);
808 		}
809 		break;
810 
811 	case SO_BSDCOMPAT:
812 		sock_warn_obsolete_bsdism("setsockopt");
813 		break;
814 
815 	case SO_PASSCRED:
816 		if (valbool)
817 			set_bit(SOCK_PASSCRED, &sock->flags);
818 		else
819 			clear_bit(SOCK_PASSCRED, &sock->flags);
820 		break;
821 
822 	case SO_TIMESTAMP:
823 	case SO_TIMESTAMPNS:
824 		if (valbool)  {
825 			if (optname == SO_TIMESTAMP)
826 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
827 			else
828 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
829 			sock_set_flag(sk, SOCK_RCVTSTAMP);
830 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
831 		} else {
832 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
833 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
834 		}
835 		break;
836 
837 	case SO_TIMESTAMPING:
838 		if (val & ~SOF_TIMESTAMPING_MASK) {
839 			ret = -EINVAL;
840 			break;
841 		}
842 
843 		if (val & SOF_TIMESTAMPING_OPT_ID &&
844 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
845 			if (sk->sk_protocol == IPPROTO_TCP &&
846 			    sk->sk_type == SOCK_STREAM) {
847 				if ((1 << sk->sk_state) &
848 				    (TCPF_CLOSE | TCPF_LISTEN)) {
849 					ret = -EINVAL;
850 					break;
851 				}
852 				sk->sk_tskey = tcp_sk(sk)->snd_una;
853 			} else {
854 				sk->sk_tskey = 0;
855 			}
856 		}
857 
858 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
859 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
860 			ret = -EINVAL;
861 			break;
862 		}
863 
864 		sk->sk_tsflags = val;
865 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
866 			sock_enable_timestamp(sk,
867 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
868 		else
869 			sock_disable_timestamp(sk,
870 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
871 		break;
872 
873 	case SO_RCVLOWAT:
874 		if (val < 0)
875 			val = INT_MAX;
876 		sk->sk_rcvlowat = val ? : 1;
877 		break;
878 
879 	case SO_RCVTIMEO:
880 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
881 		break;
882 
883 	case SO_SNDTIMEO:
884 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
885 		break;
886 
887 	case SO_ATTACH_FILTER:
888 		ret = -EINVAL;
889 		if (optlen == sizeof(struct sock_fprog)) {
890 			struct sock_fprog fprog;
891 
892 			ret = -EFAULT;
893 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
894 				break;
895 
896 			ret = sk_attach_filter(&fprog, sk);
897 		}
898 		break;
899 
900 	case SO_ATTACH_BPF:
901 		ret = -EINVAL;
902 		if (optlen == sizeof(u32)) {
903 			u32 ufd;
904 
905 			ret = -EFAULT;
906 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
907 				break;
908 
909 			ret = sk_attach_bpf(ufd, sk);
910 		}
911 		break;
912 
913 	case SO_ATTACH_REUSEPORT_CBPF:
914 		ret = -EINVAL;
915 		if (optlen == sizeof(struct sock_fprog)) {
916 			struct sock_fprog fprog;
917 
918 			ret = -EFAULT;
919 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
920 				break;
921 
922 			ret = sk_reuseport_attach_filter(&fprog, sk);
923 		}
924 		break;
925 
926 	case SO_ATTACH_REUSEPORT_EBPF:
927 		ret = -EINVAL;
928 		if (optlen == sizeof(u32)) {
929 			u32 ufd;
930 
931 			ret = -EFAULT;
932 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
933 				break;
934 
935 			ret = sk_reuseport_attach_bpf(ufd, sk);
936 		}
937 		break;
938 
939 	case SO_DETACH_FILTER:
940 		ret = sk_detach_filter(sk);
941 		break;
942 
943 	case SO_LOCK_FILTER:
944 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
945 			ret = -EPERM;
946 		else
947 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
948 		break;
949 
950 	case SO_PASSSEC:
951 		if (valbool)
952 			set_bit(SOCK_PASSSEC, &sock->flags);
953 		else
954 			clear_bit(SOCK_PASSSEC, &sock->flags);
955 		break;
956 	case SO_MARK:
957 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
958 			ret = -EPERM;
959 		else
960 			sk->sk_mark = val;
961 		break;
962 
963 	case SO_RXQ_OVFL:
964 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
965 		break;
966 
967 	case SO_WIFI_STATUS:
968 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
969 		break;
970 
971 	case SO_PEEK_OFF:
972 		if (sock->ops->set_peek_off)
973 			ret = sock->ops->set_peek_off(sk, val);
974 		else
975 			ret = -EOPNOTSUPP;
976 		break;
977 
978 	case SO_NOFCS:
979 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
980 		break;
981 
982 	case SO_SELECT_ERR_QUEUE:
983 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
984 		break;
985 
986 #ifdef CONFIG_NET_RX_BUSY_POLL
987 	case SO_BUSY_POLL:
988 		/* allow unprivileged users to decrease the value */
989 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
990 			ret = -EPERM;
991 		else {
992 			if (val < 0)
993 				ret = -EINVAL;
994 			else
995 				sk->sk_ll_usec = val;
996 		}
997 		break;
998 #endif
999 
1000 	case SO_MAX_PACING_RATE:
1001 		sk->sk_max_pacing_rate = val;
1002 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1003 					 sk->sk_max_pacing_rate);
1004 		break;
1005 
1006 	case SO_INCOMING_CPU:
1007 		sk->sk_incoming_cpu = val;
1008 		break;
1009 
1010 	case SO_CNX_ADVICE:
1011 		if (val == 1)
1012 			dst_negative_advice(sk);
1013 		break;
1014 	default:
1015 		ret = -ENOPROTOOPT;
1016 		break;
1017 	}
1018 	release_sock(sk);
1019 	return ret;
1020 }
1021 EXPORT_SYMBOL(sock_setsockopt);
1022 
1023 
1024 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1025 			  struct ucred *ucred)
1026 {
1027 	ucred->pid = pid_vnr(pid);
1028 	ucred->uid = ucred->gid = -1;
1029 	if (cred) {
1030 		struct user_namespace *current_ns = current_user_ns();
1031 
1032 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1033 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1034 	}
1035 }
1036 
1037 int sock_getsockopt(struct socket *sock, int level, int optname,
1038 		    char __user *optval, int __user *optlen)
1039 {
1040 	struct sock *sk = sock->sk;
1041 
1042 	union {
1043 		int val;
1044 		struct linger ling;
1045 		struct timeval tm;
1046 	} v;
1047 
1048 	int lv = sizeof(int);
1049 	int len;
1050 
1051 	if (get_user(len, optlen))
1052 		return -EFAULT;
1053 	if (len < 0)
1054 		return -EINVAL;
1055 
1056 	memset(&v, 0, sizeof(v));
1057 
1058 	switch (optname) {
1059 	case SO_DEBUG:
1060 		v.val = sock_flag(sk, SOCK_DBG);
1061 		break;
1062 
1063 	case SO_DONTROUTE:
1064 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1065 		break;
1066 
1067 	case SO_BROADCAST:
1068 		v.val = sock_flag(sk, SOCK_BROADCAST);
1069 		break;
1070 
1071 	case SO_SNDBUF:
1072 		v.val = sk->sk_sndbuf;
1073 		break;
1074 
1075 	case SO_RCVBUF:
1076 		v.val = sk->sk_rcvbuf;
1077 		break;
1078 
1079 	case SO_REUSEADDR:
1080 		v.val = sk->sk_reuse;
1081 		break;
1082 
1083 	case SO_REUSEPORT:
1084 		v.val = sk->sk_reuseport;
1085 		break;
1086 
1087 	case SO_KEEPALIVE:
1088 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1089 		break;
1090 
1091 	case SO_TYPE:
1092 		v.val = sk->sk_type;
1093 		break;
1094 
1095 	case SO_PROTOCOL:
1096 		v.val = sk->sk_protocol;
1097 		break;
1098 
1099 	case SO_DOMAIN:
1100 		v.val = sk->sk_family;
1101 		break;
1102 
1103 	case SO_ERROR:
1104 		v.val = -sock_error(sk);
1105 		if (v.val == 0)
1106 			v.val = xchg(&sk->sk_err_soft, 0);
1107 		break;
1108 
1109 	case SO_OOBINLINE:
1110 		v.val = sock_flag(sk, SOCK_URGINLINE);
1111 		break;
1112 
1113 	case SO_NO_CHECK:
1114 		v.val = sk->sk_no_check_tx;
1115 		break;
1116 
1117 	case SO_PRIORITY:
1118 		v.val = sk->sk_priority;
1119 		break;
1120 
1121 	case SO_LINGER:
1122 		lv		= sizeof(v.ling);
1123 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1124 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1125 		break;
1126 
1127 	case SO_BSDCOMPAT:
1128 		sock_warn_obsolete_bsdism("getsockopt");
1129 		break;
1130 
1131 	case SO_TIMESTAMP:
1132 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1133 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1134 		break;
1135 
1136 	case SO_TIMESTAMPNS:
1137 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1138 		break;
1139 
1140 	case SO_TIMESTAMPING:
1141 		v.val = sk->sk_tsflags;
1142 		break;
1143 
1144 	case SO_RCVTIMEO:
1145 		lv = sizeof(struct timeval);
1146 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1147 			v.tm.tv_sec = 0;
1148 			v.tm.tv_usec = 0;
1149 		} else {
1150 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1151 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1152 		}
1153 		break;
1154 
1155 	case SO_SNDTIMEO:
1156 		lv = sizeof(struct timeval);
1157 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1158 			v.tm.tv_sec = 0;
1159 			v.tm.tv_usec = 0;
1160 		} else {
1161 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1162 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1163 		}
1164 		break;
1165 
1166 	case SO_RCVLOWAT:
1167 		v.val = sk->sk_rcvlowat;
1168 		break;
1169 
1170 	case SO_SNDLOWAT:
1171 		v.val = 1;
1172 		break;
1173 
1174 	case SO_PASSCRED:
1175 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1176 		break;
1177 
1178 	case SO_PEERCRED:
1179 	{
1180 		struct ucred peercred;
1181 		if (len > sizeof(peercred))
1182 			len = sizeof(peercred);
1183 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1184 		if (copy_to_user(optval, &peercred, len))
1185 			return -EFAULT;
1186 		goto lenout;
1187 	}
1188 
1189 	case SO_PEERNAME:
1190 	{
1191 		char address[128];
1192 
1193 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1194 			return -ENOTCONN;
1195 		if (lv < len)
1196 			return -EINVAL;
1197 		if (copy_to_user(optval, address, len))
1198 			return -EFAULT;
1199 		goto lenout;
1200 	}
1201 
1202 	/* Dubious BSD thing... Probably nobody even uses it, but
1203 	 * the UNIX standard wants it for whatever reason... -DaveM
1204 	 */
1205 	case SO_ACCEPTCONN:
1206 		v.val = sk->sk_state == TCP_LISTEN;
1207 		break;
1208 
1209 	case SO_PASSSEC:
1210 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1211 		break;
1212 
1213 	case SO_PEERSEC:
1214 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1215 
1216 	case SO_MARK:
1217 		v.val = sk->sk_mark;
1218 		break;
1219 
1220 	case SO_RXQ_OVFL:
1221 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1222 		break;
1223 
1224 	case SO_WIFI_STATUS:
1225 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1226 		break;
1227 
1228 	case SO_PEEK_OFF:
1229 		if (!sock->ops->set_peek_off)
1230 			return -EOPNOTSUPP;
1231 
1232 		v.val = sk->sk_peek_off;
1233 		break;
1234 	case SO_NOFCS:
1235 		v.val = sock_flag(sk, SOCK_NOFCS);
1236 		break;
1237 
1238 	case SO_BINDTODEVICE:
1239 		return sock_getbindtodevice(sk, optval, optlen, len);
1240 
1241 	case SO_GET_FILTER:
1242 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1243 		if (len < 0)
1244 			return len;
1245 
1246 		goto lenout;
1247 
1248 	case SO_LOCK_FILTER:
1249 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1250 		break;
1251 
1252 	case SO_BPF_EXTENSIONS:
1253 		v.val = bpf_tell_extensions();
1254 		break;
1255 
1256 	case SO_SELECT_ERR_QUEUE:
1257 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1258 		break;
1259 
1260 #ifdef CONFIG_NET_RX_BUSY_POLL
1261 	case SO_BUSY_POLL:
1262 		v.val = sk->sk_ll_usec;
1263 		break;
1264 #endif
1265 
1266 	case SO_MAX_PACING_RATE:
1267 		v.val = sk->sk_max_pacing_rate;
1268 		break;
1269 
1270 	case SO_INCOMING_CPU:
1271 		v.val = sk->sk_incoming_cpu;
1272 		break;
1273 
1274 	default:
1275 		/* We implement SO_SNDLOWAT etc. to not be settable
1276 		 * (1003.1g 7).
1277 		 */
1278 		return -ENOPROTOOPT;
1279 	}
1280 
1281 	if (len > lv)
1282 		len = lv;
1283 	if (copy_to_user(optval, &v, len))
1284 		return -EFAULT;
1285 lenout:
1286 	if (put_user(len, optlen))
1287 		return -EFAULT;
1288 	return 0;
1289 }
1290 
1291 /*
1292  * Initialize an sk_lock.
1293  *
1294  * (We also register the sk_lock with the lock validator.)
1295  */
1296 static inline void sock_lock_init(struct sock *sk)
1297 {
1298 	sock_lock_init_class_and_name(sk,
1299 			af_family_slock_key_strings[sk->sk_family],
1300 			af_family_slock_keys + sk->sk_family,
1301 			af_family_key_strings[sk->sk_family],
1302 			af_family_keys + sk->sk_family);
1303 }
1304 
1305 /*
1306  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1307  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1308  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1309  */
1310 static void sock_copy(struct sock *nsk, const struct sock *osk)
1311 {
1312 #ifdef CONFIG_SECURITY_NETWORK
1313 	void *sptr = nsk->sk_security;
1314 #endif
1315 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1316 
1317 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1318 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1319 
1320 #ifdef CONFIG_SECURITY_NETWORK
1321 	nsk->sk_security = sptr;
1322 	security_sk_clone(osk, nsk);
1323 #endif
1324 }
1325 
1326 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1327 		int family)
1328 {
1329 	struct sock *sk;
1330 	struct kmem_cache *slab;
1331 
1332 	slab = prot->slab;
1333 	if (slab != NULL) {
1334 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1335 		if (!sk)
1336 			return sk;
1337 		if (priority & __GFP_ZERO)
1338 			sk_prot_clear_nulls(sk, prot->obj_size);
1339 	} else
1340 		sk = kmalloc(prot->obj_size, priority);
1341 
1342 	if (sk != NULL) {
1343 		kmemcheck_annotate_bitfield(sk, flags);
1344 
1345 		if (security_sk_alloc(sk, family, priority))
1346 			goto out_free;
1347 
1348 		if (!try_module_get(prot->owner))
1349 			goto out_free_sec;
1350 		sk_tx_queue_clear(sk);
1351 	}
1352 
1353 	return sk;
1354 
1355 out_free_sec:
1356 	security_sk_free(sk);
1357 out_free:
1358 	if (slab != NULL)
1359 		kmem_cache_free(slab, sk);
1360 	else
1361 		kfree(sk);
1362 	return NULL;
1363 }
1364 
1365 static void sk_prot_free(struct proto *prot, struct sock *sk)
1366 {
1367 	struct kmem_cache *slab;
1368 	struct module *owner;
1369 
1370 	owner = prot->owner;
1371 	slab = prot->slab;
1372 
1373 	cgroup_sk_free(&sk->sk_cgrp_data);
1374 	mem_cgroup_sk_free(sk);
1375 	security_sk_free(sk);
1376 	if (slab != NULL)
1377 		kmem_cache_free(slab, sk);
1378 	else
1379 		kfree(sk);
1380 	module_put(owner);
1381 }
1382 
1383 /**
1384  *	sk_alloc - All socket objects are allocated here
1385  *	@net: the applicable net namespace
1386  *	@family: protocol family
1387  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1388  *	@prot: struct proto associated with this new sock instance
1389  *	@kern: is this to be a kernel socket?
1390  */
1391 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1392 		      struct proto *prot, int kern)
1393 {
1394 	struct sock *sk;
1395 
1396 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1397 	if (sk) {
1398 		sk->sk_family = family;
1399 		/*
1400 		 * See comment in struct sock definition to understand
1401 		 * why we need sk_prot_creator -acme
1402 		 */
1403 		sk->sk_prot = sk->sk_prot_creator = prot;
1404 		sock_lock_init(sk);
1405 		sk->sk_net_refcnt = kern ? 0 : 1;
1406 		if (likely(sk->sk_net_refcnt))
1407 			get_net(net);
1408 		sock_net_set(sk, net);
1409 		atomic_set(&sk->sk_wmem_alloc, 1);
1410 
1411 		mem_cgroup_sk_alloc(sk);
1412 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1413 		sock_update_classid(&sk->sk_cgrp_data);
1414 		sock_update_netprioidx(&sk->sk_cgrp_data);
1415 	}
1416 
1417 	return sk;
1418 }
1419 EXPORT_SYMBOL(sk_alloc);
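
/* Typical protocol-family usage, roughly what an inet_create()-style
 * ->create() handler does (a sketch, not a verbatim copy):
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	... protocol specific setup, then sk->sk_prot->init(sk) if present ...
 */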
1420 
1421 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1422  * grace period. This is the case for UDP sockets and TCP listeners.
1423  */
1424 static void __sk_destruct(struct rcu_head *head)
1425 {
1426 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1427 	struct sk_filter *filter;
1428 
1429 	if (sk->sk_destruct)
1430 		sk->sk_destruct(sk);
1431 
1432 	filter = rcu_dereference_check(sk->sk_filter,
1433 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1434 	if (filter) {
1435 		sk_filter_uncharge(sk, filter);
1436 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1437 	}
1438 	if (rcu_access_pointer(sk->sk_reuseport_cb))
1439 		reuseport_detach_sock(sk);
1440 
1441 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1442 
1443 	if (atomic_read(&sk->sk_omem_alloc))
1444 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1445 			 __func__, atomic_read(&sk->sk_omem_alloc));
1446 
1447 	if (sk->sk_peer_cred)
1448 		put_cred(sk->sk_peer_cred);
1449 	put_pid(sk->sk_peer_pid);
1450 	if (likely(sk->sk_net_refcnt))
1451 		put_net(sock_net(sk));
1452 	sk_prot_free(sk->sk_prot_creator, sk);
1453 }
1454 
1455 void sk_destruct(struct sock *sk)
1456 {
1457 	if (sock_flag(sk, SOCK_RCU_FREE))
1458 		call_rcu(&sk->sk_rcu, __sk_destruct);
1459 	else
1460 		__sk_destruct(&sk->sk_rcu);
1461 }
1462 
1463 static void __sk_free(struct sock *sk)
1464 {
1465 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1466 		sock_diag_broadcast_destroy(sk);
1467 	else
1468 		sk_destruct(sk);
1469 }
1470 
1471 void sk_free(struct sock *sk)
1472 {
1473 	/*
1474 	 * We subtract one from sk_wmem_alloc and can know if
1475 	 * some packets are still in some tx queue.
1476 	 * If not zero, sock_wfree() will call __sk_free(sk) later.
1477 	 */
1478 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1479 		__sk_free(sk);
1480 }
1481 EXPORT_SYMBOL(sk_free);
1482 
1483 /**
1484  *	sk_clone_lock - clone a socket, and lock its clone
1485  *	@sk: the socket to clone
1486  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1487  *
1488  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1489  */
1490 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1491 {
1492 	struct sock *newsk;
1493 	bool is_charged = true;
1494 
1495 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1496 	if (newsk != NULL) {
1497 		struct sk_filter *filter;
1498 
1499 		sock_copy(newsk, sk);
1500 
1501 		/* SANITY */
1502 		if (likely(newsk->sk_net_refcnt))
1503 			get_net(sock_net(newsk));
1504 		sk_node_init(&newsk->sk_node);
1505 		sock_lock_init(newsk);
1506 		bh_lock_sock(newsk);
1507 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1508 		newsk->sk_backlog.len = 0;
1509 
1510 		atomic_set(&newsk->sk_rmem_alloc, 0);
1511 		/*
1512 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1513 		 */
1514 		atomic_set(&newsk->sk_wmem_alloc, 1);
1515 		atomic_set(&newsk->sk_omem_alloc, 0);
1516 		skb_queue_head_init(&newsk->sk_receive_queue);
1517 		skb_queue_head_init(&newsk->sk_write_queue);
1518 
1519 		rwlock_init(&newsk->sk_callback_lock);
1520 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1521 				af_callback_keys + newsk->sk_family,
1522 				af_family_clock_key_strings[newsk->sk_family]);
1523 
1524 		newsk->sk_dst_cache	= NULL;
1525 		newsk->sk_wmem_queued	= 0;
1526 		newsk->sk_forward_alloc = 0;
1527 		atomic_set(&newsk->sk_drops, 0);
1528 		newsk->sk_send_head	= NULL;
1529 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1530 
1531 		sock_reset_flag(newsk, SOCK_DONE);
1532 		skb_queue_head_init(&newsk->sk_error_queue);
1533 
1534 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1535 		if (filter != NULL)
1536 			/* Though it's an empty new sock, the charging may fail
1537 			 * if sysctl_optmem_max was changed between the creation
1538 			 * of the original socket and the cloning.
1539 			 */
1540 			is_charged = sk_filter_charge(newsk, filter);
1541 
1542 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1543 			/* It is still a raw copy of the parent, so invalidate
1544 			 * its destructor and do a plain sk_free() */
1545 			newsk->sk_destruct = NULL;
1546 			bh_unlock_sock(newsk);
1547 			sk_free(newsk);
1548 			newsk = NULL;
1549 			goto out;
1550 		}
1551 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1552 
1553 		newsk->sk_err	   = 0;
1554 		newsk->sk_err_soft = 0;
1555 		newsk->sk_priority = 0;
1556 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1557 		atomic64_set(&newsk->sk_cookie, 0);
1558 
1559 		mem_cgroup_sk_alloc(newsk);
1560 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1561 
1562 		/*
1563 		 * Before updating sk_refcnt, we must commit prior changes to memory
1564 		 * (Documentation/RCU/rculist_nulls.txt for details)
1565 		 */
1566 		smp_wmb();
1567 		atomic_set(&newsk->sk_refcnt, 2);
1568 
1569 		/*
1570 		 * Increment the counter in the same struct proto as the master
1571 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1572 		 * is the same as sk->sk_prot->socks, as this field was copied
1573 		 * with memcpy).
1574 		 *
1575 		 * This _changes_ the previous behaviour, where
1576 		 * tcp_create_openreq_child always was incrementing the
1577 		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1578 		 * to be taken into account in all callers. -acme
1579 		 */
1580 		sk_refcnt_debug_inc(newsk);
1581 		sk_set_socket(newsk, NULL);
1582 		newsk->sk_wq = NULL;
1583 
1584 		if (newsk->sk_prot->sockets_allocated)
1585 			sk_sockets_allocated_inc(newsk);
1586 
1587 		if (sock_needs_netstamp(sk) &&
1588 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1589 			net_enable_timestamp();
1590 	}
1591 out:
1592 	return newsk;
1593 }
1594 EXPORT_SYMBOL_GPL(sk_clone_lock);
1595 
1596 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1597 {
1598 	u32 max_segs = 1;
1599 
1600 	sk_dst_set(sk, dst);
1601 	sk->sk_route_caps = dst->dev->features;
1602 	if (sk->sk_route_caps & NETIF_F_GSO)
1603 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1604 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1605 	if (sk_can_gso(sk)) {
1606 		if (dst->header_len) {
1607 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1608 		} else {
1609 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1610 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1611 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1612 		}
1613 	}
1614 	sk->sk_gso_max_segs = max_segs;
1615 }
1616 EXPORT_SYMBOL_GPL(sk_setup_caps);
1617 
1618 /*
1619  *	Simple resource managers for sockets.
1620  */
1621 
1622 
1623 /*
1624  * Write buffer destructor automatically called from kfree_skb.
1625  */
1626 void sock_wfree(struct sk_buff *skb)
1627 {
1628 	struct sock *sk = skb->sk;
1629 	unsigned int len = skb->truesize;
1630 
1631 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1632 		/*
1633 		 * Keep a reference on sk_wmem_alloc, this will be released
1634 		 * after sk_write_space() call
1635 		 */
1636 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1637 		sk->sk_write_space(sk);
1638 		len = 1;
1639 	}
1640 	/*
1641 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1642 	 * could not do because of in-flight packets
1643 	 */
1644 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1645 		__sk_free(sk);
1646 }
1647 EXPORT_SYMBOL(sock_wfree);
1648 
1649 /* This variant of sock_wfree() is used by TCP,
1650  * since it sets SOCK_USE_WRITE_QUEUE.
1651  */
1652 void __sock_wfree(struct sk_buff *skb)
1653 {
1654 	struct sock *sk = skb->sk;
1655 
1656 	if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1657 		__sk_free(sk);
1658 }
1659 
1660 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1661 {
1662 	skb_orphan(skb);
1663 	skb->sk = sk;
1664 #ifdef CONFIG_INET
1665 	if (unlikely(!sk_fullsock(sk))) {
1666 		skb->destructor = sock_edemux;
1667 		sock_hold(sk);
1668 		return;
1669 	}
1670 #endif
1671 	skb->destructor = sock_wfree;
1672 	skb_set_hash_from_sk(skb, sk);
1673 	/*
1674 	 * We used to take a refcount on sk, but the following operation
1675 	 * is enough to guarantee sk_free() won't free this sock until
1676 	 * all in-flight packets are completed.
1677 	 */
1678 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1679 }
1680 EXPORT_SYMBOL(skb_set_owner_w);
1681 
1682 /* This helper is used by netem, as it can hold packets in its
1683  * delay queue. We want to allow the owner socket to send more
1684  * packets, as if they were already TX completed by a typical driver.
1685  * But we also want to keep skb->sk set because some packet schedulers
1686  * rely on it (sch_fq for example). So we set skb->truesize to a small
1687  * amount (1) and decrease sk_wmem_alloc accordingly.
1688  */
1689 void skb_orphan_partial(struct sk_buff *skb)
1690 {
1691 	/* If this skb is a TCP pure ACK or already went here,
1692 	 * we have nothing to do. 2 is already a very small truesize.
1693 	 */
1694 	if (skb->truesize <= 2)
1695 		return;
1696 
1697 	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1698 	 * so we do not completely orphan the skb, but transfer all
1699 	 * accounted bytes but one, to avoid unexpected reorders.
1700 	 */
1701 	if (skb->destructor == sock_wfree
1702 #ifdef CONFIG_INET
1703 	    || skb->destructor == tcp_wfree
1704 #endif
1705 		) {
1706 		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1707 		skb->truesize = 1;
1708 	} else {
1709 		skb_orphan(skb);
1710 	}
1711 }
1712 EXPORT_SYMBOL(skb_orphan_partial);
1713 
1714 /*
1715  * Read buffer destructor automatically called from kfree_skb.
1716  */
1717 void sock_rfree(struct sk_buff *skb)
1718 {
1719 	struct sock *sk = skb->sk;
1720 	unsigned int len = skb->truesize;
1721 
1722 	atomic_sub(len, &sk->sk_rmem_alloc);
1723 	sk_mem_uncharge(sk, len);
1724 }
1725 EXPORT_SYMBOL(sock_rfree);
1726 
1727 /*
1728  * Buffer destructor for skbs that are not used directly in read or write
1729  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1730  */
1731 void sock_efree(struct sk_buff *skb)
1732 {
1733 	sock_put(skb->sk);
1734 }
1735 EXPORT_SYMBOL(sock_efree);
1736 
1737 kuid_t sock_i_uid(struct sock *sk)
1738 {
1739 	kuid_t uid;
1740 
1741 	read_lock_bh(&sk->sk_callback_lock);
1742 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1743 	read_unlock_bh(&sk->sk_callback_lock);
1744 	return uid;
1745 }
1746 EXPORT_SYMBOL(sock_i_uid);
1747 
1748 unsigned long sock_i_ino(struct sock *sk)
1749 {
1750 	unsigned long ino;
1751 
1752 	read_lock_bh(&sk->sk_callback_lock);
1753 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1754 	read_unlock_bh(&sk->sk_callback_lock);
1755 	return ino;
1756 }
1757 EXPORT_SYMBOL(sock_i_ino);
1758 
1759 /*
1760  * Allocate an skb from the socket's send buffer.
1761  */
1762 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1763 			     gfp_t priority)
1764 {
1765 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1766 		struct sk_buff *skb = alloc_skb(size, priority);
1767 		if (skb) {
1768 			skb_set_owner_w(skb, sk);
1769 			return skb;
1770 		}
1771 	}
1772 	return NULL;
1773 }
1774 EXPORT_SYMBOL(sock_wmalloc);
1775 
1776 /*
1777  * Allocate a memory block from the socket's option memory buffer.
1778  */
1779 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1780 {
1781 	if ((unsigned int)size <= sysctl_optmem_max &&
1782 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1783 		void *mem;
1784 		/* First do the add, to avoid the race if kmalloc
1785 		 * might sleep.
1786 		 */
1787 		atomic_add(size, &sk->sk_omem_alloc);
1788 		mem = kmalloc(size, priority);
1789 		if (mem)
1790 			return mem;
1791 		atomic_sub(size, &sk->sk_omem_alloc);
1792 	}
1793 	return NULL;
1794 }
1795 EXPORT_SYMBOL(sock_kmalloc);
1796 
1797 /* Free an option memory block. Note, we actually want the inline
1798  * here as this allows gcc to detect the nullify and fold away the
1799  * condition entirely.
1800  */
1801 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1802 				  const bool nullify)
1803 {
1804 	if (WARN_ON_ONCE(!mem))
1805 		return;
1806 	if (nullify)
1807 		kzfree(mem);
1808 	else
1809 		kfree(mem);
1810 	atomic_sub(size, &sk->sk_omem_alloc);
1811 }
1812 
1813 void sock_kfree_s(struct sock *sk, void *mem, int size)
1814 {
1815 	__sock_kfree_s(sk, mem, size, false);
1816 }
1817 EXPORT_SYMBOL(sock_kfree_s);
1818 
1819 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1820 {
1821 	__sock_kfree_s(sk, mem, size, true);
1822 }
1823 EXPORT_SYMBOL(sock_kzfree_s);
1824 
1825 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1826    I think, these locks should be removed for datagram sockets.
1827  */
1828 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1829 {
1830 	DEFINE_WAIT(wait);
1831 
1832 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1833 	for (;;) {
1834 		if (!timeo)
1835 			break;
1836 		if (signal_pending(current))
1837 			break;
1838 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1839 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1840 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1841 			break;
1842 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1843 			break;
1844 		if (sk->sk_err)
1845 			break;
1846 		timeo = schedule_timeout(timeo);
1847 	}
1848 	finish_wait(sk_sleep(sk), &wait);
1849 	return timeo;
1850 }
1851 
1852 
1853 /*
1854  *	Generic send/receive buffer handlers
1855  */
1856 
1857 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1858 				     unsigned long data_len, int noblock,
1859 				     int *errcode, int max_page_order)
1860 {
1861 	struct sk_buff *skb;
1862 	long timeo;
1863 	int err;
1864 
1865 	timeo = sock_sndtimeo(sk, noblock);
1866 	for (;;) {
1867 		err = sock_error(sk);
1868 		if (err != 0)
1869 			goto failure;
1870 
1871 		err = -EPIPE;
1872 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1873 			goto failure;
1874 
1875 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1876 			break;
1877 
1878 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1879 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1880 		err = -EAGAIN;
1881 		if (!timeo)
1882 			goto failure;
1883 		if (signal_pending(current))
1884 			goto interrupted;
1885 		timeo = sock_wait_for_wmem(sk, timeo);
1886 	}
1887 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1888 				   errcode, sk->sk_allocation);
1889 	if (skb)
1890 		skb_set_owner_w(skb, sk);
1891 	return skb;
1892 
1893 interrupted:
1894 	err = sock_intr_errno(timeo);
1895 failure:
1896 	*errcode = err;
1897 	return NULL;
1898 }
1899 EXPORT_SYMBOL(sock_alloc_send_pskb);
1900 
1901 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1902 				    int noblock, int *errcode)
1903 {
1904 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1905 }
1906 EXPORT_SYMBOL(sock_alloc_send_skb);
1907 
1908 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1909 		     struct sockcm_cookie *sockc)
1910 {
1911 	u32 tsflags;
1912 
1913 	switch (cmsg->cmsg_type) {
1914 	case SO_MARK:
1915 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1916 			return -EPERM;
1917 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1918 			return -EINVAL;
1919 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1920 		break;
1921 	case SO_TIMESTAMPING:
1922 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1923 			return -EINVAL;
1924 
1925 		tsflags = *(u32 *)CMSG_DATA(cmsg);
1926 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
1927 			return -EINVAL;
1928 
1929 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
1930 		sockc->tsflags |= tsflags;
1931 		break;
1932 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
1933 	case SCM_RIGHTS:
1934 	case SCM_CREDENTIALS:
1935 		break;
1936 	default:
1937 		return -EINVAL;
1938 	}
1939 	return 0;
1940 }
1941 EXPORT_SYMBOL(__sock_cmsg_send);
1942 
1943 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1944 		   struct sockcm_cookie *sockc)
1945 {
1946 	struct cmsghdr *cmsg;
1947 	int ret;
1948 
1949 	for_each_cmsghdr(cmsg, msg) {
1950 		if (!CMSG_OK(msg, cmsg))
1951 			return -EINVAL;
1952 		if (cmsg->cmsg_level != SOL_SOCKET)
1953 			continue;
1954 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
1955 		if (ret)
1956 			return ret;
1957 	}
1958 	return 0;
1959 }
1960 EXPORT_SYMBOL(sock_cmsg_send);
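
/*
 * Illustrative sketch: a sendmsg() implementation seeds a sockcm_cookie
 * from the socket defaults and then lets sock_cmsg_send() apply any
 * SOL_SOCKET control messages (SO_MARK, SO_TIMESTAMPING) before the
 * packet is built.  example_parse_cmsgs() is hypothetical.
 */
static int example_parse_cmsgs(struct sock *sk, struct msghdr *msg,
			       struct sockcm_cookie *sockc)
{
	sockc->mark = sk->sk_mark;
	sockc->tsflags = sk->sk_tsflags;

	if (msg->msg_controllen)
		return sock_cmsg_send(sk, msg, sockc);
	return 0;
}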
1961 
1962 /* On 32-bit arches, an skb frag is limited to 2^15 bytes */
1963 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
1964 
1965 /**
1966  * skb_page_frag_refill - check that a page_frag contains enough room
1967  * @sz: minimum size of the fragment we want to get
1968  * @pfrag: pointer to page_frag
1969  * @gfp: priority for memory allocation
1970  *
1971  * Note: While this allocator tries to use high order pages, there is
1972  * no guarantee that allocations succeed. Therefore, @sz MUST be
1973  * less than or equal to PAGE_SIZE.
1974  */
1975 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1976 {
1977 	if (pfrag->page) {
1978 		if (page_ref_count(pfrag->page) == 1) {
1979 			pfrag->offset = 0;
1980 			return true;
1981 		}
1982 		if (pfrag->offset + sz <= pfrag->size)
1983 			return true;
1984 		put_page(pfrag->page);
1985 	}
1986 
1987 	pfrag->offset = 0;
1988 	if (SKB_FRAG_PAGE_ORDER) {
1989 		/* Avoid direct reclaim but allow kswapd to wake */
1990 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1991 					  __GFP_COMP | __GFP_NOWARN |
1992 					  __GFP_NORETRY,
1993 					  SKB_FRAG_PAGE_ORDER);
1994 		if (likely(pfrag->page)) {
1995 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1996 			return true;
1997 		}
1998 	}
1999 	pfrag->page = alloc_page(gfp);
2000 	if (likely(pfrag->page)) {
2001 		pfrag->size = PAGE_SIZE;
2002 		return true;
2003 	}
2004 	return false;
2005 }
2006 EXPORT_SYMBOL(skb_page_frag_refill);
2007 
2008 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2009 {
2010 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2011 		return true;
2012 
2013 	sk_enter_memory_pressure(sk);
2014 	sk_stream_moderate_sndbuf(sk);
2015 	return false;
2016 }
2017 EXPORT_SYMBOL(sk_page_frag_refill);
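
/*
 * Illustrative sketch: the usual pattern on top of sk_page_frag_refill()
 * is to take the socket's page_frag, copy a bounded chunk from the iov
 * iterator, and advance the offset.  example_copy_to_frag() is
 * hypothetical and error handling is reduced to the essentials.
 */
static int example_copy_to_frag(struct sock *sk, struct iov_iter *from,
				int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy, from) != copy)
		return -EFAULT;

	pfrag->offset += copy;
	return copy;
}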
2018 
2019 static void __lock_sock(struct sock *sk)
2020 	__releases(&sk->sk_lock.slock)
2021 	__acquires(&sk->sk_lock.slock)
2022 {
2023 	DEFINE_WAIT(wait);
2024 
2025 	for (;;) {
2026 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2027 					TASK_UNINTERRUPTIBLE);
2028 		spin_unlock_bh(&sk->sk_lock.slock);
2029 		schedule();
2030 		spin_lock_bh(&sk->sk_lock.slock);
2031 		if (!sock_owned_by_user(sk))
2032 			break;
2033 	}
2034 	finish_wait(&sk->sk_lock.wq, &wait);
2035 }
2036 
2037 static void __release_sock(struct sock *sk)
2038 	__releases(&sk->sk_lock.slock)
2039 	__acquires(&sk->sk_lock.slock)
2040 {
2041 	struct sk_buff *skb, *next;
2042 
2043 	while ((skb = sk->sk_backlog.head) != NULL) {
2044 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2045 
2046 		spin_unlock_bh(&sk->sk_lock.slock);
2047 
2048 		do {
2049 			next = skb->next;
2050 			prefetch(next);
2051 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2052 			skb->next = NULL;
2053 			sk_backlog_rcv(sk, skb);
2054 
2055 			cond_resched();
2056 
2057 			skb = next;
2058 		} while (skb != NULL);
2059 
2060 		spin_lock_bh(&sk->sk_lock.slock);
2061 	}
2062 
2063 	/*
2064 	 * Doing the zeroing here guarantees we cannot loop forever
2065 	 * while a wild producer attempts to flood us.
2066 	 */
2067 	sk->sk_backlog.len = 0;
2068 }
2069 
2070 void __sk_flush_backlog(struct sock *sk)
2071 {
2072 	spin_lock_bh(&sk->sk_lock.slock);
2073 	__release_sock(sk);
2074 	spin_unlock_bh(&sk->sk_lock.slock);
2075 }
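
/*
 * Illustrative sketch: the producer side of the backlog drained by
 * __release_sock().  A protocol's packet input path processes the skb
 * directly when the socket is not owned by a process and queues to the
 * backlog otherwise.  example_do_rcv() is hypothetical.
 */
static int example_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	int rc = 0;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		rc = sk_backlog_rcv(sk, skb);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		kfree_skb(skb);		/* backlog limit hit: drop */
		rc = -ENOBUFS;
	}
	bh_unlock_sock(sk);
	return rc;
}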
2076 
2077 /**
2078  * sk_wait_data - wait for data to arrive at sk_receive_queue
2079  * @sk:    sock to wait on
2080  * @timeo: for how long
2081  * @skb:   last skb seen on sk_receive_queue
2082  *
2083  * Now socket state including sk->sk_err is changed only under lock,
2084  * Socket state, including sk->sk_err, is now changed only under the lock,
2085  * hence we may omit checks after joining the wait queue.
2086  * We check the receive queue before schedule() only as an optimization;
2087  * it is very likely that release_sock() added new data.
2088 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2089 {
2090 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2091 	int rc;
2092 
2093 	add_wait_queue(sk_sleep(sk), &wait);
2094 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2095 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2096 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2097 	remove_wait_queue(sk_sleep(sk), &wait);
2098 	return rc;
2099 }
2100 EXPORT_SYMBOL(sk_wait_data);
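
/*
 * Illustrative sketch: a blocking receive path, called with the socket
 * lock held, waits for the receive queue to become non-empty via
 * sk_wait_data() and handles timeout and signals in the usual way.
 * example_wait_for_skb() is hypothetical.
 */
static struct sk_buff *example_wait_for_skb(struct sock *sk, int flags,
					    int *err)
{
	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (sk->sk_err) {
			*err = sock_error(sk);
			return NULL;
		}
		if (!timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			return NULL;
		}
		sk_wait_data(sk, &timeo, NULL);
	}
	return skb;
}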
2101 
2102 /**
2103  *	__sk_mem_raise_allocated - increase memory_allocated
2104  *	@sk: socket
2105  *	@size: memory size to allocate
2106  *	@amt: pages to allocate
2107  *	@kind: allocation type
2108  *
2109  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2110  */
2111 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2112 {
2113 	struct proto *prot = sk->sk_prot;
2114 	long allocated = sk_memory_allocated_add(sk, amt);
2115 
2116 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2117 	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2118 		goto suppress_allocation;
2119 
2120 	/* Under limit. */
2121 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2122 		sk_leave_memory_pressure(sk);
2123 		return 1;
2124 	}
2125 
2126 	/* Under pressure. */
2127 	if (allocated > sk_prot_mem_limits(sk, 1))
2128 		sk_enter_memory_pressure(sk);
2129 
2130 	/* Over hard limit. */
2131 	if (allocated > sk_prot_mem_limits(sk, 2))
2132 		goto suppress_allocation;
2133 
2134 	/* guarantee minimum buffer size under pressure */
2135 	if (kind == SK_MEM_RECV) {
2136 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2137 			return 1;
2138 
2139 	} else { /* SK_MEM_SEND */
2140 		if (sk->sk_type == SOCK_STREAM) {
2141 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2142 				return 1;
2143 		} else if (atomic_read(&sk->sk_wmem_alloc) <
2144 			   prot->sysctl_wmem[0])
2145 				return 1;
2146 	}
2147 
2148 	if (sk_has_memory_pressure(sk)) {
2149 		int alloc;
2150 
2151 		if (!sk_under_memory_pressure(sk))
2152 			return 1;
2153 		alloc = sk_sockets_allocated_read_positive(sk);
2154 		if (sk_prot_mem_limits(sk, 2) > alloc *
2155 		    sk_mem_pages(sk->sk_wmem_queued +
2156 				 atomic_read(&sk->sk_rmem_alloc) +
2157 				 sk->sk_forward_alloc))
2158 			return 1;
2159 	}
2160 
2161 suppress_allocation:
2162 
2163 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2164 		sk_stream_moderate_sndbuf(sk);
2165 
2166 		/* Fail only if socket is _under_ its sndbuf.
2167 		 * In this case we cannot block, so we have to fail.
2168 		 */
2169 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2170 			return 1;
2171 	}
2172 
2173 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2174 
2175 	sk_memory_allocated_sub(sk, amt);
2176 
2177 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2178 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2179 
2180 	return 0;
2181 }
2182 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2183 
2184 /**
2185  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2186  *	@sk: socket
2187  *	@size: memory size to allocate
2188  *	@kind: allocation type
2189  *
2190  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2191  *	rmem allocation. This function assumes that protocols which have
2192  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2193  */
2194 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2195 {
2196 	int ret, amt = sk_mem_pages(size);
2197 
2198 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2199 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2200 	if (!ret)
2201 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2202 	return ret;
2203 }
2204 EXPORT_SYMBOL(__sk_mem_schedule);
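
/*
 * Illustrative sketch: receive paths normally use the sk_rmem_schedule()
 * wrapper, which only falls back to __sk_mem_schedule() when
 * sk_forward_alloc cannot cover the skb, and then account the skb to the
 * socket.  example_charge_rmem() is hypothetical.
 */
static int example_charge_rmem(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;

	skb_set_owner_r(skb, sk);	/* charges skb->truesize to sk_rmem_alloc */
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	return 0;
}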
2205 
2206 /**
2207  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2208  *	@sk: socket
2209  *	@amount: number of quanta
2210  *
2211  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2212  */
2213 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2214 {
2215 	sk_memory_allocated_sub(sk, amount);
2216 
2217 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2218 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2219 
2220 	if (sk_under_memory_pressure(sk) &&
2221 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2222 		sk_leave_memory_pressure(sk);
2223 }
2224 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2225 
2226 /**
2227  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2228  *	@sk: socket
2229  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2230  */
2231 void __sk_mem_reclaim(struct sock *sk, int amount)
2232 {
2233 	amount >>= SK_MEM_QUANTUM_SHIFT;
2234 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2235 	__sk_mem_reduce_allocated(sk, amount);
2236 }
2237 EXPORT_SYMBOL(__sk_mem_reclaim);
2238 
2239 int sk_set_peek_off(struct sock *sk, int val)
2240 {
2241 	if (val < 0)
2242 		return -EINVAL;
2243 
2244 	sk->sk_peek_off = val;
2245 	return 0;
2246 }
2247 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2248 
2249 /*
2250  * Set of default routines for initialising struct proto_ops when
2251  * the protocol does not support a particular function. In certain
2252  * cases where it makes no sense for a protocol to have a "do nothing"
2253  * function, some default processing is provided.
2254  */
2255 
2256 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2257 {
2258 	return -EOPNOTSUPP;
2259 }
2260 EXPORT_SYMBOL(sock_no_bind);
2261 
2262 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2263 		    int len, int flags)
2264 {
2265 	return -EOPNOTSUPP;
2266 }
2267 EXPORT_SYMBOL(sock_no_connect);
2268 
2269 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2270 {
2271 	return -EOPNOTSUPP;
2272 }
2273 EXPORT_SYMBOL(sock_no_socketpair);
2274 
2275 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2276 {
2277 	return -EOPNOTSUPP;
2278 }
2279 EXPORT_SYMBOL(sock_no_accept);
2280 
2281 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2282 		    int *len, int peer)
2283 {
2284 	return -EOPNOTSUPP;
2285 }
2286 EXPORT_SYMBOL(sock_no_getname);
2287 
2288 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2289 {
2290 	return 0;
2291 }
2292 EXPORT_SYMBOL(sock_no_poll);
2293 
2294 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2295 {
2296 	return -EOPNOTSUPP;
2297 }
2298 EXPORT_SYMBOL(sock_no_ioctl);
2299 
2300 int sock_no_listen(struct socket *sock, int backlog)
2301 {
2302 	return -EOPNOTSUPP;
2303 }
2304 EXPORT_SYMBOL(sock_no_listen);
2305 
2306 int sock_no_shutdown(struct socket *sock, int how)
2307 {
2308 	return -EOPNOTSUPP;
2309 }
2310 EXPORT_SYMBOL(sock_no_shutdown);
2311 
2312 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2313 		    char __user *optval, unsigned int optlen)
2314 {
2315 	return -EOPNOTSUPP;
2316 }
2317 EXPORT_SYMBOL(sock_no_setsockopt);
2318 
2319 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2320 		    char __user *optval, int __user *optlen)
2321 {
2322 	return -EOPNOTSUPP;
2323 }
2324 EXPORT_SYMBOL(sock_no_getsockopt);
2325 
2326 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2327 {
2328 	return -EOPNOTSUPP;
2329 }
2330 EXPORT_SYMBOL(sock_no_sendmsg);
2331 
2332 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2333 		    int flags)
2334 {
2335 	return -EOPNOTSUPP;
2336 }
2337 EXPORT_SYMBOL(sock_no_recvmsg);
2338 
2339 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2340 {
2341 	/* Mirror missing mmap method error code */
2342 	return -ENODEV;
2343 }
2344 EXPORT_SYMBOL(sock_no_mmap);
2345 
2346 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2347 {
2348 	ssize_t res;
2349 	struct msghdr msg = {.msg_flags = flags};
2350 	struct kvec iov;
2351 	char *kaddr = kmap(page);
2352 	iov.iov_base = kaddr + offset;
2353 	iov.iov_len = size;
2354 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2355 	kunmap(page);
2356 	return res;
2357 }
2358 EXPORT_SYMBOL(sock_no_sendpage);
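
/*
 * Illustrative sketch: an ops table for a datagram protocol simply
 * points unsupported slots at the sock_no_*() stubs above.  This
 * mirrors inet_dgram_ops in net/ipv4/af_inet.c; the inet_* and
 * udp_poll handlers come from the inet/UDP headers and are only used
 * here as an example.
 */
static const struct proto_ops example_dgram_ops = {
	.family		= PF_INET,
	.owner		= THIS_MODULE,
	.release	= inet_release,
	.bind		= inet_bind,
	.connect	= inet_dgram_connect,
	.socketpair	= sock_no_socketpair,	/* not supported */
	.accept		= sock_no_accept,	/* not supported */
	.getname	= inet_getname,
	.poll		= udp_poll,
	.ioctl		= inet_ioctl,
	.listen		= sock_no_listen,	/* not supported */
	.shutdown	= inet_shutdown,
	.setsockopt	= sock_common_setsockopt,
	.getsockopt	= sock_common_getsockopt,
	.sendmsg	= inet_sendmsg,
	.recvmsg	= inet_recvmsg,
	.mmap		= sock_no_mmap,		/* not supported */
	.sendpage	= inet_sendpage,
};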
2359 
2360 /*
2361  *	Default Socket Callbacks
2362  */
2363 
2364 static void sock_def_wakeup(struct sock *sk)
2365 {
2366 	struct socket_wq *wq;
2367 
2368 	rcu_read_lock();
2369 	wq = rcu_dereference(sk->sk_wq);
2370 	if (skwq_has_sleeper(wq))
2371 		wake_up_interruptible_all(&wq->wait);
2372 	rcu_read_unlock();
2373 }
2374 
2375 static void sock_def_error_report(struct sock *sk)
2376 {
2377 	struct socket_wq *wq;
2378 
2379 	rcu_read_lock();
2380 	wq = rcu_dereference(sk->sk_wq);
2381 	if (skwq_has_sleeper(wq))
2382 		wake_up_interruptible_poll(&wq->wait, POLLERR);
2383 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2384 	rcu_read_unlock();
2385 }
2386 
2387 static void sock_def_readable(struct sock *sk)
2388 {
2389 	struct socket_wq *wq;
2390 
2391 	rcu_read_lock();
2392 	wq = rcu_dereference(sk->sk_wq);
2393 	if (skwq_has_sleeper(wq))
2394 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2395 						POLLRDNORM | POLLRDBAND);
2396 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2397 	rcu_read_unlock();
2398 }
2399 
2400 static void sock_def_write_space(struct sock *sk)
2401 {
2402 	struct socket_wq *wq;
2403 
2404 	rcu_read_lock();
2405 
2406 	/* Do not wake up a writer until he can make "significant"
2407 	 * progress.  --DaveM
2408 	 */
2409 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2410 		wq = rcu_dereference(sk->sk_wq);
2411 		if (skwq_has_sleeper(wq))
2412 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2413 						POLLWRNORM | POLLWRBAND);
2414 
2415 		/* Should agree with poll, otherwise some programs break */
2416 		if (sock_writeable(sk))
2417 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2418 	}
2419 
2420 	rcu_read_unlock();
2421 }
2422 
2423 static void sock_def_destruct(struct sock *sk)
2424 {
2425 }
2426 
2427 void sk_send_sigurg(struct sock *sk)
2428 {
2429 	if (sk->sk_socket && sk->sk_socket->file)
2430 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2431 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2432 }
2433 EXPORT_SYMBOL(sk_send_sigurg);
2434 
2435 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2436 		    unsigned long expires)
2437 {
2438 	if (!mod_timer(timer, expires))
2439 		sock_hold(sk);
2440 }
2441 EXPORT_SYMBOL(sk_reset_timer);
2442 
2443 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2444 {
2445 	if (del_timer(timer))
2446 		__sock_put(sk);
2447 }
2448 EXPORT_SYMBOL(sk_stop_timer);
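
/*
 * Illustrative sketch: sk_reset_timer()/sk_stop_timer() take and drop a
 * sock reference so the socket cannot disappear while the timer is
 * pending.  A protocol retransmit timer would be armed and cancelled
 * roughly like this; the example_* names and the half-second timeout
 * are hypothetical.
 */
static void example_retransmit_handler(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	/* ... retransmit work would go here ... */
	sock_put(sk);	/* drop the reference taken by sk_reset_timer() */
}

static void example_arm_retransmit(struct sock *sk)
{
	setup_timer(&sk->sk_timer, example_retransmit_handler,
		    (unsigned long)sk);
	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 2);
}

static void example_cancel_retransmit(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}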
2449 
2450 void sock_init_data(struct socket *sock, struct sock *sk)
2451 {
2452 	skb_queue_head_init(&sk->sk_receive_queue);
2453 	skb_queue_head_init(&sk->sk_write_queue);
2454 	skb_queue_head_init(&sk->sk_error_queue);
2455 
2456 	sk->sk_send_head	=	NULL;
2457 
2458 	init_timer(&sk->sk_timer);
2459 
2460 	sk->sk_allocation	=	GFP_KERNEL;
2461 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2462 	sk->sk_sndbuf		=	sysctl_wmem_default;
2463 	sk->sk_state		=	TCP_CLOSE;
2464 	sk_set_socket(sk, sock);
2465 
2466 	sock_set_flag(sk, SOCK_ZAPPED);
2467 
2468 	if (sock) {
2469 		sk->sk_type	=	sock->type;
2470 		sk->sk_wq	=	sock->wq;
2471 		sock->sk	=	sk;
2472 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2473 	} else {
2474 		sk->sk_wq	=	NULL;
2475 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2476 	}
2477 
2478 	rwlock_init(&sk->sk_callback_lock);
2479 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2480 			af_callback_keys + sk->sk_family,
2481 			af_family_clock_key_strings[sk->sk_family]);
2482 
2483 	sk->sk_state_change	=	sock_def_wakeup;
2484 	sk->sk_data_ready	=	sock_def_readable;
2485 	sk->sk_write_space	=	sock_def_write_space;
2486 	sk->sk_error_report	=	sock_def_error_report;
2487 	sk->sk_destruct		=	sock_def_destruct;
2488 
2489 	sk->sk_frag.page	=	NULL;
2490 	sk->sk_frag.offset	=	0;
2491 	sk->sk_peek_off		=	-1;
2492 
2493 	sk->sk_peer_pid 	=	NULL;
2494 	sk->sk_peer_cred	=	NULL;
2495 	sk->sk_write_pending	=	0;
2496 	sk->sk_rcvlowat		=	1;
2497 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2498 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2499 
2500 	sk->sk_stamp = ktime_set(-1L, 0);
2501 
2502 #ifdef CONFIG_NET_RX_BUSY_POLL
2503 	sk->sk_napi_id		=	0;
2504 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2505 #endif
2506 
2507 	sk->sk_max_pacing_rate = ~0U;
2508 	sk->sk_pacing_rate = ~0U;
2509 	sk->sk_incoming_cpu = -1;
2510 	/*
2511 	 * Before updating sk_refcnt, we must commit prior changes to memory
2512 	 * (Documentation/RCU/rculist_nulls.txt for details)
2513 	 */
2514 	smp_wmb();
2515 	atomic_set(&sk->sk_refcnt, 1);
2516 	atomic_set(&sk->sk_drops, 0);
2517 }
2518 EXPORT_SYMBOL(sock_init_data);
2519 
2520 void lock_sock_nested(struct sock *sk, int subclass)
2521 {
2522 	might_sleep();
2523 	spin_lock_bh(&sk->sk_lock.slock);
2524 	if (sk->sk_lock.owned)
2525 		__lock_sock(sk);
2526 	sk->sk_lock.owned = 1;
2527 	spin_unlock(&sk->sk_lock.slock);
2528 	/*
2529 	 * The sk_lock has mutex_lock() semantics here:
2530 	 */
2531 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2532 	local_bh_enable();
2533 }
2534 EXPORT_SYMBOL(lock_sock_nested);
2535 
2536 void release_sock(struct sock *sk)
2537 {
2538 	spin_lock_bh(&sk->sk_lock.slock);
2539 	if (sk->sk_backlog.tail)
2540 		__release_sock(sk);
2541 
2542 	/* Warning : release_cb() might need to release sk ownership,
2543 	 * i.e. call sock_release_ownership(sk) before us.
2544 	 */
2545 	if (sk->sk_prot->release_cb)
2546 		sk->sk_prot->release_cb(sk);
2547 
2548 	sock_release_ownership(sk);
2549 	if (waitqueue_active(&sk->sk_lock.wq))
2550 		wake_up(&sk->sk_lock.wq);
2551 	spin_unlock_bh(&sk->sk_lock.slock);
2552 }
2553 EXPORT_SYMBOL(release_sock);
2554 
2555 /**
2556  * lock_sock_fast - fast version of lock_sock
2557  * @sk: socket
2558  *
2559  * This version should be used for very small sections, where the process won't block.
2560  * Returns false if the fast path is taken:
2561  *   sk_lock.slock locked, owned = 0, BH disabled
2562  * Returns true if the slow path is taken:
2563  *   sk_lock.slock unlocked, owned = 1, BH enabled
2564  */
2565 bool lock_sock_fast(struct sock *sk)
2566 {
2567 	might_sleep();
2568 	spin_lock_bh(&sk->sk_lock.slock);
2569 
2570 	if (!sk->sk_lock.owned)
2571 		/* Fast path: return with BH disabled and
2572 		 * sk_lock.slock still held; paired with unlock_sock_fast().
2573 		 */
2574 		return false;
2575 
2576 	__lock_sock(sk);
2577 	sk->sk_lock.owned = 1;
2578 	spin_unlock(&sk->sk_lock.slock);
2579 	/*
2580 	 * The sk_lock has mutex_lock() semantics here:
2581 	 */
2582 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2583 	local_bh_enable();
2584 	return true;
2585 }
2586 EXPORT_SYMBOL(lock_sock_fast);
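
/*
 * Illustrative sketch: the usual pairing with unlock_sock_fast(); the
 * returned "slow" flag tells the unlock side which of the two states
 * described above it has to undo.  example_purge_queue() is
 * hypothetical.
 */
static void example_purge_queue(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	__skb_queue_purge(&sk->sk_receive_queue);
	unlock_sock_fast(sk, slow);
}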
2587 
2588 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2589 {
2590 	struct timeval tv;
2591 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2592 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2593 	tv = ktime_to_timeval(sk->sk_stamp);
2594 	if (tv.tv_sec == -1)
2595 		return -ENOENT;
2596 	if (tv.tv_sec == 0) {
2597 		sk->sk_stamp = ktime_get_real();
2598 		tv = ktime_to_timeval(sk->sk_stamp);
2599 	}
2600 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2601 }
2602 EXPORT_SYMBOL(sock_get_timestamp);
2603 
2604 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2605 {
2606 	struct timespec ts;
2607 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2608 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2609 	ts = ktime_to_timespec(sk->sk_stamp);
2610 	if (ts.tv_sec == -1)
2611 		return -ENOENT;
2612 	if (ts.tv_sec == 0) {
2613 		sk->sk_stamp = ktime_get_real();
2614 		ts = ktime_to_timespec(sk->sk_stamp);
2615 	}
2616 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2617 }
2618 EXPORT_SYMBOL(sock_get_timestampns);
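
/*
 * Illustrative sketch: protocols typically wire the SIOCGSTAMP and
 * SIOCGSTAMPNS ioctls straight to the two helpers above.
 * example_ioctl() is hypothetical.
 */
static int example_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}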
2619 
2620 void sock_enable_timestamp(struct sock *sk, int flag)
2621 {
2622 	if (!sock_flag(sk, flag)) {
2623 		unsigned long previous_flags = sk->sk_flags;
2624 
2625 		sock_set_flag(sk, flag);
2626 		/*
2627 		 * we just set one of the two flags which require net
2628 		 * time stamping, but time stamping might have been on
2629 		 * already because of the other one
2630 		 */
2631 		if (sock_needs_netstamp(sk) &&
2632 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2633 			net_enable_timestamp();
2634 	}
2635 }
2636 
2637 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2638 		       int level, int type)
2639 {
2640 	struct sock_exterr_skb *serr;
2641 	struct sk_buff *skb;
2642 	int copied, err;
2643 
2644 	err = -EAGAIN;
2645 	skb = sock_dequeue_err_skb(sk);
2646 	if (skb == NULL)
2647 		goto out;
2648 
2649 	copied = skb->len;
2650 	if (copied > len) {
2651 		msg->msg_flags |= MSG_TRUNC;
2652 		copied = len;
2653 	}
2654 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2655 	if (err)
2656 		goto out_free_skb;
2657 
2658 	sock_recv_timestamp(msg, sk, skb);
2659 
2660 	serr = SKB_EXT_ERR(skb);
2661 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2662 
2663 	msg->msg_flags |= MSG_ERRQUEUE;
2664 	err = copied;
2665 
2666 out_free_skb:
2667 	kfree_skb(skb);
2668 out:
2669 	return err;
2670 }
2671 EXPORT_SYMBOL(sock_recv_errqueue);
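
/*
 * Illustrative sketch: a protocol recvmsg() handler can route
 * MSG_ERRQUEUE requests to sock_recv_errqueue() before touching its
 * normal receive path.  The SOL_IP/IP_RECVERR cmsg level and type are
 * an assumption about the (hypothetical) protocol; packet sockets, for
 * instance, pass SOL_PACKET values instead.
 */
static int example_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			   int flags)
{
	if (flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);

	/* ... normal receive path would go here ... */
	return -EOPNOTSUPP;
}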
2672 
2673 /*
2674  *	Get a socket option on a socket.
2675  *
2676  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2677  *	asynchronous errors should be reported by getsockopt. We assume
2678  *	this means if you specify SO_ERROR (otherwise what's the point of it).
2679  */
2680 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2681 			   char __user *optval, int __user *optlen)
2682 {
2683 	struct sock *sk = sock->sk;
2684 
2685 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2686 }
2687 EXPORT_SYMBOL(sock_common_getsockopt);
2688 
2689 #ifdef CONFIG_COMPAT
2690 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2691 				  char __user *optval, int __user *optlen)
2692 {
2693 	struct sock *sk = sock->sk;
2694 
2695 	if (sk->sk_prot->compat_getsockopt != NULL)
2696 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2697 						      optval, optlen);
2698 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2699 }
2700 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2701 #endif
2702 
2703 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2704 			int flags)
2705 {
2706 	struct sock *sk = sock->sk;
2707 	int addr_len = 0;
2708 	int err;
2709 
2710 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2711 				   flags & ~MSG_DONTWAIT, &addr_len);
2712 	if (err >= 0)
2713 		msg->msg_namelen = addr_len;
2714 	return err;
2715 }
2716 EXPORT_SYMBOL(sock_common_recvmsg);
2717 
2718 /*
2719  *	Set socket options on an inet socket.
2720  */
2721 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2722 			   char __user *optval, unsigned int optlen)
2723 {
2724 	struct sock *sk = sock->sk;
2725 
2726 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2727 }
2728 EXPORT_SYMBOL(sock_common_setsockopt);
2729 
2730 #ifdef CONFIG_COMPAT
2731 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2732 				  char __user *optval, unsigned int optlen)
2733 {
2734 	struct sock *sk = sock->sk;
2735 
2736 	if (sk->sk_prot->compat_setsockopt != NULL)
2737 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2738 						      optval, optlen);
2739 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2740 }
2741 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2742 #endif
2743 
2744 void sk_common_release(struct sock *sk)
2745 {
2746 	if (sk->sk_prot->destroy)
2747 		sk->sk_prot->destroy(sk);
2748 
2749 	/*
2750 	 * Observation: when sk_common_release() is called, processes have
2751 	 * no access to the socket, but the network stack still does.
2752 	 * Step one, detach it from networking:
2753 	 *
2754 	 * A. Remove from hash tables.
2755 	 */
2756 
2757 	sk->sk_prot->unhash(sk);
2758 
2759 	/*
2760 	 * At this point the socket cannot receive new packets, but it is possible
2761 	 * that some packets are still in flight because some CPU ran the receiver
2762 	 * and did the hash table lookup before we unhashed the socket. They will
2763 	 * reach the receive queue and be purged by the socket destructor.
2764 	 *
2765 	 * We also still have packets pending on the receive queue and, probably,
2766 	 * our own packets waiting in device queues. sock_destroy will drain the
2767 	 * receive queue, but transmitted packets will delay socket destruction
2768 	 * until the last reference is released.
2769 	 */
2770 
2771 	sock_orphan(sk);
2772 
2773 	xfrm_sk_free_policy(sk);
2774 
2775 	sk_refcnt_debug_release(sk);
2776 
2777 	if (sk->sk_frag.page) {
2778 		put_page(sk->sk_frag.page);
2779 		sk->sk_frag.page = NULL;
2780 	}
2781 
2782 	sock_put(sk);
2783 }
2784 EXPORT_SYMBOL(sk_common_release);
2785 
2786 #ifdef CONFIG_PROC_FS
2787 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
2788 struct prot_inuse {
2789 	int val[PROTO_INUSE_NR];
2790 };
2791 
2792 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2793 
2794 #ifdef CONFIG_NET_NS
2795 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2796 {
2797 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2798 }
2799 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2800 
2801 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2802 {
2803 	int cpu, idx = prot->inuse_idx;
2804 	int res = 0;
2805 
2806 	for_each_possible_cpu(cpu)
2807 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2808 
2809 	return res >= 0 ? res : 0;
2810 }
2811 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2812 
2813 static int __net_init sock_inuse_init_net(struct net *net)
2814 {
2815 	net->core.inuse = alloc_percpu(struct prot_inuse);
2816 	return net->core.inuse ? 0 : -ENOMEM;
2817 }
2818 
2819 static void __net_exit sock_inuse_exit_net(struct net *net)
2820 {
2821 	free_percpu(net->core.inuse);
2822 }
2823 
2824 static struct pernet_operations net_inuse_ops = {
2825 	.init = sock_inuse_init_net,
2826 	.exit = sock_inuse_exit_net,
2827 };
2828 
2829 static __init int net_inuse_init(void)
2830 {
2831 	if (register_pernet_subsys(&net_inuse_ops))
2832 		panic("Cannot initialize net inuse counters");
2833 
2834 	return 0;
2835 }
2836 
2837 core_initcall(net_inuse_init);
2838 #else
2839 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2840 
2841 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2842 {
2843 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2844 }
2845 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2846 
2847 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2848 {
2849 	int cpu, idx = prot->inuse_idx;
2850 	int res = 0;
2851 
2852 	for_each_possible_cpu(cpu)
2853 		res += per_cpu(prot_inuse, cpu).val[idx];
2854 
2855 	return res >= 0 ? res : 0;
2856 }
2857 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2858 #endif
2859 
2860 static void assign_proto_idx(struct proto *prot)
2861 {
2862 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2863 
2864 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2865 		pr_err("PROTO_INUSE_NR exhausted\n");
2866 		return;
2867 	}
2868 
2869 	set_bit(prot->inuse_idx, proto_inuse_idx);
2870 }
2871 
2872 static void release_proto_idx(struct proto *prot)
2873 {
2874 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2875 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2876 }
2877 #else
2878 static inline void assign_proto_idx(struct proto *prot)
2879 {
2880 }
2881 
2882 static inline void release_proto_idx(struct proto *prot)
2883 {
2884 }
2885 #endif
2886 
2887 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2888 {
2889 	if (!rsk_prot)
2890 		return;
2891 	kfree(rsk_prot->slab_name);
2892 	rsk_prot->slab_name = NULL;
2893 	kmem_cache_destroy(rsk_prot->slab);
2894 	rsk_prot->slab = NULL;
2895 }
2896 
2897 static int req_prot_init(const struct proto *prot)
2898 {
2899 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
2900 
2901 	if (!rsk_prot)
2902 		return 0;
2903 
2904 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2905 					prot->name);
2906 	if (!rsk_prot->slab_name)
2907 		return -ENOMEM;
2908 
2909 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2910 					   rsk_prot->obj_size, 0,
2911 					   prot->slab_flags, NULL);
2912 
2913 	if (!rsk_prot->slab) {
2914 		pr_crit("%s: Can't create request sock SLAB cache!\n",
2915 			prot->name);
2916 		return -ENOMEM;
2917 	}
2918 	return 0;
2919 }
2920 
2921 int proto_register(struct proto *prot, int alloc_slab)
2922 {
2923 	if (alloc_slab) {
2924 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2925 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2926 					NULL);
2927 
2928 		if (prot->slab == NULL) {
2929 			pr_crit("%s: Can't create sock SLAB cache!\n",
2930 				prot->name);
2931 			goto out;
2932 		}
2933 
2934 		if (req_prot_init(prot))
2935 			goto out_free_request_sock_slab;
2936 
2937 		if (prot->twsk_prot != NULL) {
2938 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2939 
2940 			if (prot->twsk_prot->twsk_slab_name == NULL)
2941 				goto out_free_request_sock_slab;
2942 
2943 			prot->twsk_prot->twsk_slab =
2944 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2945 						  prot->twsk_prot->twsk_obj_size,
2946 						  0,
2947 						  prot->slab_flags,
2948 						  NULL);
2949 			if (prot->twsk_prot->twsk_slab == NULL)
2950 				goto out_free_timewait_sock_slab_name;
2951 		}
2952 	}
2953 
2954 	mutex_lock(&proto_list_mutex);
2955 	list_add(&prot->node, &proto_list);
2956 	assign_proto_idx(prot);
2957 	mutex_unlock(&proto_list_mutex);
2958 	return 0;
2959 
2960 out_free_timewait_sock_slab_name:
2961 	kfree(prot->twsk_prot->twsk_slab_name);
2962 out_free_request_sock_slab:
2963 	req_prot_cleanup(prot->rsk_prot);
2964 
2965 	kmem_cache_destroy(prot->slab);
2966 	prot->slab = NULL;
2967 out:
2968 	return -ENOBUFS;
2969 }
2970 EXPORT_SYMBOL(proto_register);
2971 
2972 void proto_unregister(struct proto *prot)
2973 {
2974 	mutex_lock(&proto_list_mutex);
2975 	release_proto_idx(prot);
2976 	list_del(&prot->node);
2977 	mutex_unlock(&proto_list_mutex);
2978 
2979 	kmem_cache_destroy(prot->slab);
2980 	prot->slab = NULL;
2981 
2982 	req_prot_cleanup(prot->rsk_prot);
2983 
2984 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2985 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2986 		kfree(prot->twsk_prot->twsk_slab_name);
2987 		prot->twsk_prot->twsk_slab = NULL;
2988 	}
2989 }
2990 EXPORT_SYMBOL(proto_unregister);
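
/*
 * Illustrative sketch: the usual module init/exit pairing around
 * proto_register() and proto_unregister().  All example_* names are
 * hypothetical, and a real protocol would set .obj_size to the size of
 * its own sock structure.
 */
static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int __init example_proto_init(void)
{
	return proto_register(&example_proto, 1);	/* 1: allocate a slab */
}

static void __exit example_proto_exit(void)
{
	proto_unregister(&example_proto);
}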
2991 
2992 #ifdef CONFIG_PROC_FS
2993 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2994 	__acquires(proto_list_mutex)
2995 {
2996 	mutex_lock(&proto_list_mutex);
2997 	return seq_list_start_head(&proto_list, *pos);
2998 }
2999 
3000 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3001 {
3002 	return seq_list_next(v, &proto_list, pos);
3003 }
3004 
3005 static void proto_seq_stop(struct seq_file *seq, void *v)
3006 	__releases(proto_list_mutex)
3007 {
3008 	mutex_unlock(&proto_list_mutex);
3009 }
3010 
3011 static char proto_method_implemented(const void *method)
3012 {
3013 	return method == NULL ? 'n' : 'y';
3014 }
3015 static long sock_prot_memory_allocated(struct proto *proto)
3016 {
3017 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3018 }
3019 
3020 static char *sock_prot_memory_pressure(struct proto *proto)
3021 {
3022 	return proto->memory_pressure != NULL ?
3023 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3024 }
3025 
3026 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3027 {
3028 
3029 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3030 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3031 		   proto->name,
3032 		   proto->obj_size,
3033 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3034 		   sock_prot_memory_allocated(proto),
3035 		   sock_prot_memory_pressure(proto),
3036 		   proto->max_header,
3037 		   proto->slab == NULL ? "no" : "yes",
3038 		   module_name(proto->owner),
3039 		   proto_method_implemented(proto->close),
3040 		   proto_method_implemented(proto->connect),
3041 		   proto_method_implemented(proto->disconnect),
3042 		   proto_method_implemented(proto->accept),
3043 		   proto_method_implemented(proto->ioctl),
3044 		   proto_method_implemented(proto->init),
3045 		   proto_method_implemented(proto->destroy),
3046 		   proto_method_implemented(proto->shutdown),
3047 		   proto_method_implemented(proto->setsockopt),
3048 		   proto_method_implemented(proto->getsockopt),
3049 		   proto_method_implemented(proto->sendmsg),
3050 		   proto_method_implemented(proto->recvmsg),
3051 		   proto_method_implemented(proto->sendpage),
3052 		   proto_method_implemented(proto->bind),
3053 		   proto_method_implemented(proto->backlog_rcv),
3054 		   proto_method_implemented(proto->hash),
3055 		   proto_method_implemented(proto->unhash),
3056 		   proto_method_implemented(proto->get_port),
3057 		   proto_method_implemented(proto->enter_memory_pressure));
3058 }
3059 
3060 static int proto_seq_show(struct seq_file *seq, void *v)
3061 {
3062 	if (v == &proto_list)
3063 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3064 			   "protocol",
3065 			   "size",
3066 			   "sockets",
3067 			   "memory",
3068 			   "press",
3069 			   "maxhdr",
3070 			   "slab",
3071 			   "module",
3072 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3073 	else
3074 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3075 	return 0;
3076 }
3077 
3078 static const struct seq_operations proto_seq_ops = {
3079 	.start  = proto_seq_start,
3080 	.next   = proto_seq_next,
3081 	.stop   = proto_seq_stop,
3082 	.show   = proto_seq_show,
3083 };
3084 
3085 static int proto_seq_open(struct inode *inode, struct file *file)
3086 {
3087 	return seq_open_net(inode, file, &proto_seq_ops,
3088 			    sizeof(struct seq_net_private));
3089 }
3090 
3091 static const struct file_operations proto_seq_fops = {
3092 	.owner		= THIS_MODULE,
3093 	.open		= proto_seq_open,
3094 	.read		= seq_read,
3095 	.llseek		= seq_lseek,
3096 	.release	= seq_release_net,
3097 };
3098 
3099 static __net_init int proto_init_net(struct net *net)
3100 {
3101 	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3102 		return -ENOMEM;
3103 
3104 	return 0;
3105 }
3106 
3107 static __net_exit void proto_exit_net(struct net *net)
3108 {
3109 	remove_proc_entry("protocols", net->proc_net);
3110 }
3111 
3112 
3113 static __net_initdata struct pernet_operations proto_net_ops = {
3114 	.init = proto_init_net,
3115 	.exit = proto_exit_net,
3116 };
3117 
3118 static int __init proto_init(void)
3119 {
3120 	return register_pernet_subsys(&proto_net_ops);
3121 }
3122 
3123 subsys_initcall(proto_init);
3124 
3125 #endif /* PROC_FS */
3126