xref: /openbmc/linux/net/unix/af_unix.c (revision 9e6fd874c7bb47b6a4295abc4c81b2f41b97e970)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid huge amounts
34  *					of socks hashed (this for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/freezer.h>
116 #include <linux/file.h>
117 #include <linux/btf_ids.h>
118 
119 #include "scm.h"
120 
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124 
125 /* SMP locking strategy:
126  *    hash table is protected with spinlock.
127  *    each socket state is protected by separate spinlock.
128  */
129 
130 static unsigned int unix_unbound_hash(struct sock *sk)
131 {
132 	unsigned long hash = (unsigned long)sk;
133 
134 	hash ^= hash >> 16;
135 	hash ^= hash >> 8;
136 	hash ^= sk->sk_type;
137 
138 	return hash & UNIX_HASH_MOD;
139 }
140 
/* Hash a filesystem-bound (BSD) socket by the inode number of the path
 * it is bound to, masked into [0, UNIX_HASH_MOD].
 */
static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

/* Hash an abstract-namespace address.  The full address bytes are folded
 * through the checksum helpers and mixed with the socket type.  The result
 * is offset by UNIX_HASH_MOD + 1, so abstract sockets never share a bucket
 * range with unbound sockets (whose hashes are <= UNIX_HASH_MOD).
 */
static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}
158 
/* Lock two hash buckets at once (e.g. when re-hashing a socket from one
 * bucket to another).  Buckets are always taken in ascending index order
 * so that concurrent double-locks cannot ABBA-deadlock; a single lock is
 * taken when both hashes fall into the same bucket.
 */
static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	/* nested annotation: second lock of the same lock class */
	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
}

/* Release the bucket locks taken by unix_table_double_lock().
 * Unlock order does not matter for correctness, so no swap is needed.
 */
static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}
185 
#ifdef CONFIG_SECURITY_NETWORK
/* Stash the sender's LSM security id from the scm cookie into the skb
 * control block so it travels with the message.
 */
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

/* Restore the security id carried in the skb into the receiver's scm
 * cookie (the inverse of unix_get_secdata()).
 */
static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

/* True if the skb carries the same security id as the scm cookie; used to
 * decide whether consecutive skbs can be merged into one receive.
 */
static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
/* No-op stubs when security networking is compiled out; secdata always
 * compares equal.
 */
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */
213 
/* Connected peer of a unix socket (may be NULL when unconnected). */
#define unix_peer(sk) (unix_sk(sk)->peer)

/* True if @osk is connected back to @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

/* A dgram socket may send to @osk if @osk is unconnected, or if @osk is
 * connected to us (bidirectional pipe).
 */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

/* Receive queue considered full once its length exceeds the listen/dgram
 * backlog.  Caller serializes against backlog updates.
 */
static inline int unix_recvq_full(const struct sock *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

/* Lockless variant of unix_recvq_full() for use outside the state lock;
 * both the queue length and the backlog are read with lockless accessors.
 */
static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) >
		READ_ONCE(sk->sk_max_ack_backlog);
}
236 
/* Return the connected peer of @s with a reference held, or NULL.
 * The state lock makes the peer-read and refcount-grab atomic with
 * respect to disconnect.  Caller must sock_put() the result.
 */
struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);
249 
/* Allocate a refcounted unix_address holding a copy of @sunaddr.
 * The name bytes are stored in the structure's trailing flexible array.
 * Returns NULL on allocation failure; the new address starts with one
 * reference, dropped via unix_release_addr().
 */
static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

/* Drop one reference on @addr, freeing it when the last goes away. */
static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}
271 
272 /*
273  *	Check unix socket name:
274  *		- should be not zero length.
275  *	        - if started by not zero, should be NULL terminated (FS object)
276  *		- if started by zero, it is abstract name.
277  */
278 
/* Validate a user-supplied AF_UNIX address: the length must cover more
 * than just the header (so the name is non-empty) without exceeding the
 * structure, and the family must be AF_UNIX.  Returns 0 or -EINVAL.
 */
static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	const int hdr_len = offsetof(struct sockaddr_un, sun_path);

	if (addr_len <= hdr_len || addr_len > (int)sizeof(*sunaddr))
		return -EINVAL;

	return sunaddr->sun_family == AF_UNIX ? 0 : -EINVAL;
}
290 
/* NUL-terminate a BSD (filesystem) address in place so sun_path can be
 * handed to path lookup as a C string.
 *
 * Writing at index addr_len may look like an off-by-one: 108 is the
 * longest valid AF_UNIX path and sun_path[108] does not formally exist.
 * It is safe here because syscall entry always copies addresses into a
 * struct sockaddr_storage, whose buffer is larger than 108 bytes, so the
 * byte we touch is valid kernel memory.
 */
static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	char *bytes = (char *)sunaddr;

	bytes[addr_len] = '\0';
}
303 
/* Unhash @sk from its bucket; caller holds the bucket lock (see the
 * locked wrappers below).
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

/* Insert @sk into the bucket selected by sk->sk_hash; caller holds the
 * bucket lock.  The socket must not already be hashed.
 */
static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

/* Move @sk to the bucket for @hash and publish its new address.
 * smp_store_release() on u->addr pairs with lockless readers of the
 * address, ensuring they never see a partially initialized unix_address.
 * Caller holds the lock(s) covering both the old and new buckets.
 */
static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

/* Locked removal of @sk from the per-netns hash table. */
static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

/* Locked insertion of a freshly created (unbound) socket. */
static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

/* Add a filesystem-bound socket to the global bsd bind table, keyed by
 * sk->sk_hash, so it can later be found by inode.
 */
static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

/* Remove @sk from the bsd bind table if it is there; harmless no-op for
 * sockets that were never filesystem-bound.
 */
static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}
356 
/* Linear-scan bucket @hash for a socket whose bound address matches
 * @sunname exactly (same length, same bytes).  Caller holds the bucket
 * lock.  Note: sockets in an address-hashed bucket always have u->addr
 * set, so the unconditional dereference is safe here.
 */
static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

/* Locked lookup by name; returns the socket with a reference held, or
 * NULL.  Caller must sock_put() the result.
 */
static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

/* Look up a filesystem-bound socket by the inode of its bound path.
 * Scans the global bsd bind bucket for @i and returns the socket with a
 * reference held, or NULL.
 */
static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}
405 
406 /* Support code for asymmetrically connected dgram sockets
407  *
408  * If a datagram socket is connected to a socket not itself connected
409  * to the first socket (eg, /dev/log), clients may only enqueue more
410  * messages if the present receive queue of the server socket is not
411  * "too large". This means there's a second writeability condition
412  * poll and sendmsg need to test. The dgram recv code will do a wake
413  * up on the peer_wait wait queue of a socket upon reception of a
414  * datagram which needs to be propagated to sleeping would-be writers
415  * since these might not have sent anything so far. This can't be
416  * accomplished via poll_wait because the lifetime of the server
417  * socket might be less than that of its clients if these break their
418  * association with it or if the server socket is closed while clients
419  * are still connected to it and there's no way to inform "a polling
420  * implementation" that it should let go of a certain wait queue
421  *
422  * In order to propagate a wake up, a wait_queue_entry_t of the client
423  * socket is enqueued on the peer_wait queue of the server socket
424  * whose wake function does a wake_up on the ordinary client socket
425  * wait queue. This connection is established whenever a write (or
426  * poll for write) hit the flow control condition and broken when the
427  * association to the server socket is dissolved or after a wake up
428  * was relayed.
429  */
430 
/* Wake function installed on the server's peer_wait queue (see the long
 * comment above).  It is one-shot: it unlinks itself from the peer's
 * queue, clears the association, and relays the wakeup to the client
 * socket's ordinary wait queue so a blocked writer/poller re-evaluates
 * writability.
 */
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

/* Register @sk's relay entry on @other's peer_wait queue, at most once.
 * Returns 1 if the entry was newly added, 0 if it was already queued
 * (for this or some other peer).  Serialized by other's peer_wait lock.
 */
static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

/* Undo unix_dgram_peer_wake_connect(): remove @sk's relay entry from
 * @other's peer_wait queue if it is still associated with @other.
 */
static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

/* Disconnect the relay and immediately wake @sk's writers, since the
 * association to @other is being torn down and they must not keep
 * waiting for a relay that will never come.
 */
static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 *
 * Returns 1 if the caller should block/report not-writable: the relay is
 * armed and @other's receive queue is still full.  Returns 0 (after
 * disarming any relay we just added) when writing can proceed.
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and its full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}
522 
/* A unix socket is writable when it is not listening and its outstanding
 * write memory is at most a quarter of the send buffer.
 */
static int unix_writable(const struct sock *sk)
{
	return sk->sk_state != TCP_LISTEN &&
	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}

/* sk->sk_write_space callback: wake pollers/writers when write memory
 * drops below the writability threshold.  Runs under RCU because the
 * socket_wq may be freed concurrently with a socket release.
 */
static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}
543 
544 /* When dgram socket disconnects (or changes its peer), we clear its receive
545  * queue of packets arrived from previous peer. First, it allows to do
546  * flow control based only on wmem_alloc; second, sk connected to peer
547  * may receive messages only from that peer. */
/* Called when @sk disconnects from (or replaces) dgram peer @other.
 * Purges skbs received from the previous peer and wakes writers blocked
 * on our peer_wait queue.  See the comment above for why the queue must
 * be cleared.
 */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of bidirectional dgram pipe is disconnected,
		 * we signal error. Messages are lost. Do not make this,
		 * when peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			other->sk_err = ECONNRESET;
			sk_error_report(other);
		}
	}
	other->sk_state = TCP_CLOSE;
}
565 
/* sk->sk_destruct callback: final per-socket cleanup once the refcount
 * hits zero.  Drops any remaining queued skbs, releases the bound
 * address, and updates global accounting.  The DEBUG_NET warnings assert
 * the invariants expected at destruction time (no outstanding write
 * memory, unhashed, detached from struct socket).
 */
static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		/* Destructor invoked on a socket that was never orphaned:
		 * refuse to free and leak instead of corrupting state.
		 */
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}
590 
/* Tear down a unix socket on close (or on destruction of an unaccepted
 * embryo, in which case @embrion is non-zero): unhash it, orphan it,
 * mark it fully shut down, detach its filesystem path and peer, notify
 * the peer, flush the receive queue, and drop the table's reference.
 * Finally runs the fd garbage collector if any fds are still in flight.
 */
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	/* Detach the bound path under the lock; path_put() is deferred
	 * until after the lock is dropped.
	 */
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			/* Unread data, or an unaccepted embryo, means the
			 * peer sees a reset rather than a clean EOF.
			 */
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* A listening socket's queue holds embryo sockets, which
		 * must be released recursively.
		 */
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}
673 
/* Install the current task's pid and credentials as @sk's peer
 * credentials (SO_PEERCRED).  Old values are released outside the lock.
 */
static void init_peercred(struct sock *sk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
	spin_unlock(&sk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

/* Copy peer credentials from @peersk to @sk (e.g. from listener to the
 * accepted child).  Both peer locks are taken; address order decides
 * which is taken first so two concurrent copies cannot deadlock.
 */
static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	if (sk < peersk) {
		spin_lock(&sk->sk_peer_lock);
		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&peersk->sk_peer_lock);
		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	}
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);

	spin_unlock(&sk->sk_peer_lock);
	spin_unlock(&peersk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}
713 
/* listen(2) for unix sockets.  Only bound SOCK_STREAM/SOCK_SEQPACKET
 * sockets may listen.  Raising the backlog wakes writers parked on
 * peer_wait, since the queue-full condition may have just been relaxed.
 * Returns 0 or a negative errno.
 */
static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
out:
	return err;
}
742 
743 static int unix_release(struct socket *);
744 static int unix_bind(struct socket *, struct sockaddr *, int);
745 static int unix_stream_connect(struct socket *, struct sockaddr *,
746 			       int addr_len, int flags);
747 static int unix_socketpair(struct socket *, struct socket *);
748 static int unix_accept(struct socket *, struct socket *, int, bool);
749 static int unix_getname(struct socket *, struct sockaddr *, int);
750 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
751 static __poll_t unix_dgram_poll(struct file *, struct socket *,
752 				    poll_table *);
753 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
754 #ifdef CONFIG_COMPAT
755 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
756 #endif
757 static int unix_shutdown(struct socket *, int);
758 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
759 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
760 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
761 				    size_t size, int flags);
762 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
763 				       struct pipe_inode_info *, size_t size,
764 				       unsigned int flags);
765 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
766 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
767 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
768 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
769 static int unix_dgram_connect(struct socket *, struct sockaddr *,
770 			      int, int);
771 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
772 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
773 				  int);
774 
775 static int unix_set_peek_off(struct sock *sk, int val)
776 {
777 	struct unix_sock *u = unix_sk(sk);
778 
779 	if (mutex_lock_interruptible(&u->iolock))
780 		return -EINTR;
781 
782 	sk->sk_peek_off = val;
783 	mutex_unlock(&u->iolock);
784 
785 	return 0;
786 }
787 
#ifdef CONFIG_PROC_FS
/* /proc/<pid>/fdinfo hook: report how many file descriptors are
 * currently queued (in-flight via SCM_RIGHTS) on this socket.
 */
static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct unix_sock *u;

	if (sk) {
		u = unix_sk(sock->sk);
		seq_printf(m, "scm_fds: %u\n",
			   atomic_read(&u->scm_stat.nr_fds));
	}
}
#else
#define unix_show_fdinfo NULL
#endif
803 
/* proto_ops for SOCK_STREAM unix sockets: connection-oriented byte
 * stream with accept/listen, splice and sendpage support.
 */
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.sendpage =	unix_stream_sendpage,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

/* proto_ops for SOCK_DGRAM unix sockets: connectionless datagrams, so
 * accept/listen are rejected and poll uses the dgram-specific variant.
 */
static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

/* proto_ops for SOCK_SEQPACKET unix sockets: connection-oriented like
 * stream (same connect/accept/listen), but with record-preserving
 * seqpacket send/recv and dgram-style poll.
 */
static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
878 
static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}

/* struct proto used by SOCK_DGRAM and SOCK_SEQPACKET sockets (see
 * unix_create1()); BPF sockmap can swap in its own protocol ops.
 */
struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};

/* struct proto used by SOCK_STREAM sockets. */
struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.unhash			= unix_unhash,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};
913 
/* Allocate and initialize a new unix sock of @type in @net, hashed into
 * the table as unbound.  Enforces a global cap of 2 * get_max_files()
 * sockets (the inc-then-read check is an approximate limit, tolerant of
 * races).  Returns the sock or an ERR_PTR (-ENFILE / -ENOMEM).
 */
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /*dgram and  seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash		= unix_unbound_hash(sk);
	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	/* Dgram backlog defaults to the per-netns sysctl. */
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u	  = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	/* Undo the optimistic increment on any failure path. */
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}
964 
/* socket(2) handler for PF_UNIX: validate the protocol, select the
 * proto_ops table for the requested type (SOCK_RAW is silently treated
 * as SOCK_DGRAM for BSD compatibility), then create the sock.
 */
static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		fallthrough;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sk = unix_create1(net, sock, kern, sock->type);
	if (IS_ERR(sk))
		return PTR_ERR(sk);

	return 0;
}
1002 
/* release handler in proto_ops: run the protocol close hook (a no-op
 * except under sockmap), tear down the sock, and detach it from the
 * struct socket.  Safe to call on an already-detached socket.
 */
static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}
1016 
/* Resolve a filesystem (non-abstract) address to its bound socket:
 * NUL-terminate the path, walk it with kern_path(), require write
 * permission on the target, require it to be a socket inode, and look
 * the socket up by inode.  The socket type must match @type or the
 * lookup fails with -EPROTOTYPE.  On success the path's atime is
 * touched and the socket is returned with a reference held; otherwise
 * an ERR_PTR is returned (-ECONNREFUSED for a missing/non-socket
 * target, or the path-walk/permission error).
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}
1060 
/* Look up the bound socket behind an abstract (leading-NUL) address
 * in the per-netns hash table.  Returns a referenced sock or
 * ERR_PTR(-ECONNREFUSED) when no such name is bound.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	/* Abstract sockets normally have no filesystem presence, but
	 * mirror the original behaviour: touch atime if a path exists.
	 */
	if (unix_sk(sk)->path.dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}
1079 
1080 static struct sock *unix_find_other(struct net *net,
1081 				    struct sockaddr_un *sunaddr,
1082 				    int addr_len, int type)
1083 {
1084 	struct sock *sk;
1085 
1086 	if (sunaddr->sun_path[0])
1087 		sk = unix_find_bsd(sunaddr, addr_len, type);
1088 	else
1089 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1090 
1091 	return sk;
1092 }
1093 
/* Bind the socket to an autogenerated abstract name of the form
 * "\0XXXXX" (five hex digits).  Used when sending/connecting from an
 * unbound socket with SOCK_PASSCRED set, or on an explicit zero-length
 * bind.  Returns 0 on success (including the already-bound case),
 * -ENOSPC if the whole 20-bit namespace is exhausted, or another
 * negative errno.
 */
static int unix_autobind(struct sock *sk)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	/* Lost the race with a concurrent bind: nothing to do (err == 0). */
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	/* "\0" + five hex digits: six bytes of sun_path. */
	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	/* Start the scan at a random 20-bit value; lastnum marks where
	 * to give up after one full wrap of the namespace.
	 */
	ordernum = prandom_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seems to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	/* Publish the name while both hash buckets are locked. */
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
1154 
/* Bind a socket to a filesystem (pathname) address: create the socket
 * inode with mknod, then publish the address in the bound-socket hash.
 * Returns 0 on success, -EADDRINUSE if the path already exists, or
 * another negative errno; on any failure after mknod succeeded the
 * freshly created node is unlinked again.
 */
static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	/* Socket inode mode: S_IFSOCK plus the socket's mode masked by
	 * the caller's umask, same as a file created by this task.
	 */
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct user_namespace *ns; // barf...
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	/* Recompute the length from the now NUL-terminated path. */
	addr_len = strlen(sunaddr->sun_path) +
		offsetof(struct sockaddr_un, sun_path) + 1;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	ns = mnt_user_ns(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	/* Raced with another bind on the same socket. */
	if (u->addr)
		goto out_unlock;

	/* Publish path and address while both hash buckets are locked. */
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}
1225 
/* Bind a socket to an abstract (leading-NUL) address.  The name exists
 * only in the per-netns hash table, not in the filesystem.  Returns 0
 * on success, -EADDRINUSE if the name is taken, -EINVAL if the socket
 * is already bound, or another negative errno.
 */
static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	/* Uniqueness check and rehash must happen atomically under the
	 * double table lock, otherwise two binds could both succeed.
	 */
	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}
1268 
1269 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1270 {
1271 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1272 	struct sock *sk = sock->sk;
1273 	int err;
1274 
1275 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1276 	    sunaddr->sun_family == AF_UNIX)
1277 		return unix_autobind(sk);
1278 
1279 	err = unix_validate_addr(sunaddr, addr_len);
1280 	if (err)
1281 		return err;
1282 
1283 	if (sunaddr->sun_path[0])
1284 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1285 	else
1286 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1287 
1288 	return err;
1289 }
1290 
/* Take the state locks of two sockets in a stable (pointer address)
 * order to avoid ABBA deadlock with a concurrent caller locking the
 * same pair in the opposite order.  If sk2 is NULL or the two sockets
 * are the same, only sk1 is locked; unix_state_double_unlock() must
 * mirror this.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	/* Lower address first; _nested keeps lockdep happy about taking
	 * two locks of the same class.
	 */
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}
1305 
/* Release the lock(s) taken by unix_state_double_lock(): one lock when
 * sk2 is NULL or equals sk1, both otherwise (order is irrelevant on
 * unlock).
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (!sk2 || unlikely(sk1 == sk2)) {
		unix_state_unlock(sk1);
		return;
	}

	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}
1315 
/* connect(2) for SOCK_DGRAM sockets: set (or, with AF_UNSPEC, clear)
 * the default peer.  Both sockets' state locks are taken together so
 * the peer pointer and states change atomically; a previous peer, if
 * any, is disconnected and notified.  Returns 0 or a negative errno.
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		/* SOCK_PASSCRED requires a bound source address. */
		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		/* Swap the peer under both locks; the reference we got
		 * from unix_find_other() is transferred to unix_peer(sk).
		 */
		unix_peer(sk) = other;
		if (!other)
			sk->sk_state = TCP_CLOSE;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
1402 
/* Sleep on the peer's wait queue until its receive queue has room, it
 * dies or shuts down, or the timeout expires.  Called with other's
 * state lock held; the lock is dropped here (hence __releases) before
 * sleeping.  Returns the remaining timeout.
 */
static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	/* Decide whether to sleep while still holding the state lock,
	 * so the condition cannot change under us before we are queued.
	 */
	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
1424 
/* connect(2) for SOCK_STREAM/SOCK_SEQPACKET: allocate a fresh "server
 * side" sock plus a notification skb up front, find the listener, and
 * hand the new sock to it via its receive queue for accept(2) to pick
 * up.  Blocks (subject to sndtimeo) when the listener's backlog is
 * full.  Returns 0 or a negative errno.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;
	int st;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	/* SOCK_PASSCRED requires a bound source address. */
	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		/* Backlog full: fail immediately when non-blocking,
		 * otherwise wait and retry the whole lookup.
		 */
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	/* State changed while we were unlocked: start over. */
	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1617 
1618 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1619 {
1620 	struct sock *ska = socka->sk, *skb = sockb->sk;
1621 
1622 	/* Join our sockets back to back */
1623 	sock_hold(ska);
1624 	sock_hold(skb);
1625 	unix_peer(ska) = skb;
1626 	unix_peer(skb) = ska;
1627 	init_peercred(ska);
1628 	init_peercred(skb);
1629 
1630 	ska->sk_state = TCP_ESTABLISHED;
1631 	skb->sk_state = TCP_ESTABLISHED;
1632 	socka->state  = SS_CONNECTED;
1633 	sockb->state  = SS_CONNECTED;
1634 	return 0;
1635 }
1636 
/* Propagate the credential/security passing options from a listening
 * socket to a freshly accepted one.
 */
static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}
1645 
/* accept(2) for SOCK_STREAM/SOCK_SEQPACKET: dequeue the notification
 * skb queued by unix_stream_connect(), extract the embedded new sock,
 * and graft it onto the caller's struct socket.  Returns 0 or a
 * negative errno (-EAGAIN in non-blocking mode with an empty backlog).
 */
static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	/* The skb's owner is the new connection's sock. */
	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	/* Wake connectors blocked on a full backlog. */
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}
1690 
1691 
/* getsockname(2)/getpeername(2) for AF_UNIX.  With peer != 0 the
 * peer's address is reported (-ENOTCONN if there is none).  An unbound
 * socket yields a bare AF_UNIX family with empty path.  On success the
 * address length is returned in err (>= 0), mirroring the getname
 * proto_ops convention.
 */
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	/* Pairs with the smp_store_release() publishing the address in
	 * bind/connect: a non-NULL addr guarantees its contents are
	 * fully visible.
	 */
	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);
	}
	sock_put(sk);
out:
	return err;
}
1723 
/* Duplicate an skb's attached fd list into scm for MSG_PEEK delivery,
 * serializing against the garbage collector (see comment below).
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	/*
	 * Garbage collection of unix sockets starts by selecting a set of
	 * candidate sockets which have reference only from being in flight
	 * (total_refs == inflight_refs).  This condition is checked once during
	 * the candidate collection phase, and candidates are marked as such, so
	 * that non-candidates can later be ignored.  While inflight_refs is
	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
	 * is an instantaneous decision.
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operations that changes the file count through file descriptors
	 * (dup, close, sendmsg) does not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd.  The following lock/unlock
	 * pair is to ensure serialization with garbage collection.  It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate.  If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
	 */
	spin_lock(&unix_gc_lock);
	spin_unlock(&unix_gc_lock);
}
1770 
1771 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1772 {
1773 	int err = 0;
1774 
1775 	UNIXCB(skb).pid  = get_pid(scm->pid);
1776 	UNIXCB(skb).uid = scm->creds.uid;
1777 	UNIXCB(skb).gid = scm->creds.gid;
1778 	UNIXCB(skb).fp = NULL;
1779 	unix_get_secdata(scm, skb);
1780 	if (scm->fp && send_fds)
1781 		err = unix_attach_fds(scm, skb);
1782 
1783 	skb->destructor = unix_destruct_scm;
1784 	return err;
1785 }
1786 
1787 static bool unix_passcred_enabled(const struct socket *sock,
1788 				  const struct sock *other)
1789 {
1790 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1791 	       !other->sk_socket ||
1792 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1793 }
1794 
1795 /*
1796  * Some apps rely on write() giving SCM_CREDENTIALS
1797  * We include credentials if source or destination socket
1798  * asserted SOCK_PASSCRED.
1799  */
1800 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1801 			    const struct sock *other)
1802 {
1803 	if (UNIXCB(skb).pid)
1804 		return;
1805 	if (unix_passcred_enabled(sock, other)) {
1806 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1807 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1808 	}
1809 }
1810 
1811 static int maybe_init_creds(struct scm_cookie *scm,
1812 			    struct socket *socket,
1813 			    const struct sock *other)
1814 {
1815 	int err;
1816 	struct msghdr msg = { .msg_controllen = 0 };
1817 
1818 	err = scm_send(socket, &msg, scm, false);
1819 	if (err)
1820 		return err;
1821 
1822 	if (unix_passcred_enabled(socket, other)) {
1823 		scm->pid = get_pid(task_tgid(current));
1824 		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1825 	}
1826 	return err;
1827 }
1828 
1829 static bool unix_skb_scm_eq(struct sk_buff *skb,
1830 			    struct scm_cookie *scm)
1831 {
1832 	return UNIXCB(skb).pid == scm->pid &&
1833 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1834 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1835 	       unix_secdata_eq(scm, skb);
1836 }
1837 
1838 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1839 {
1840 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1841 	struct unix_sock *u = unix_sk(sk);
1842 
1843 	if (unlikely(fp && fp->count))
1844 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1845 }
1846 
1847 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1848 {
1849 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1850 	struct unix_sock *u = unix_sk(sk);
1851 
1852 	if (unlikely(fp && fp->count))
1853 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1854 }
1855 
1856 /*
1857  *	Send AF_UNIX data.
1858  */
1859 
/* sendmsg(2) for SOCK_DGRAM (and SOCK_SEQPACKET data) sockets.
 *
 * Builds one skb for the whole datagram (paged beyond SKB_MAX_ALLOC),
 * resolves the destination (msg_name or the connected peer), and
 * queues it on the receiver, blocking subject to sndtimeo when the
 * receiver's queue is full.  Detects and tears down a connection to a
 * dead peer.  Returns the byte count sent or a negative errno.
 */
static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;
	} else {
		/* No destination given: use the connected peer. */
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	/* SOCK_PASSCRED requires a bound source address. */
	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	/* Put anything beyond SKB_MAX_ALLOC into page fragments. */
	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (unix_peer(sk) == other) {
			/* The dead socket was our connected peer:
			 * disconnect and report ECONNREFUSED.
			 */
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			unix_state_unlock(sk);

			sk->sk_state = TCP_CLOSE;
			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		/* Non-blocking: re-take both locks in safe order, then
		 * re-verify the peer and arm the wakeup relay before
		 * failing with EAGAIN.
		 */
		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}
2063 
2064 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2065  * bytes, and a minimum of a full page.
2066  */
2067 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2068 
2069 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/* Send a single out-of-band byte (MSG_OOB) on a stream socket: queue a
 * one-byte skb on the peer and record it as the peer's current OOB skb
 * (holding an extra reference for that pointer), replacing any pending
 * one.  Signals SIGURG to the peer.  Returns 0 or a negative errno.
 */
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other)
{
	struct unix_sock *ousk = unix_sk(other);
	struct sk_buff *skb;
	int err = 0;

	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);

	if (!skb)
		return err;

	skb_put(skb, 1);
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);

	if (err) {
		kfree_skb(skb);
		return err;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    (other->sk_shutdown & RCV_SHUTDOWN)) {
		unix_state_unlock(other);
		kfree_skb(skb);
		return -EPIPE;
	}

	maybe_add_creds(skb, sock, other);
	/* Extra reference for the ousk->oob_skb pointer, alongside the
	 * one held by the receive queue.
	 */
	skb_get(skb);

	/* Only one OOB byte is kept; drop a previously pending one. */
	if (ousk->oob_skb)
		consume_skb(ousk->oob_skb);

	WRITE_ONCE(ousk->oob_skb, skb);

	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	sk_send_sigurg(other);
	unix_state_unlock(other);
	other->sk_data_ready(other);

	return err;
}
2114 #endif
2115 
/* sendmsg(2) for SOCK_STREAM/SOCK_SEQPACKET sockets.
 *
 * Splits the payload into skbs bounded by half the send buffer and
 * UNIX_SKB_FRAGS_SZ of page fragments, queueing each on the connected
 * peer.  With MSG_OOB (when CONFIG_AF_UNIX_OOB), the final byte goes
 * out via queue_oob().  Returns the number of bytes sent, or a
 * negative errno only if nothing was sent; EPIPE raises SIGPIPE unless
 * MSG_NOSIGNAL.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		/* Reserve the last byte for the OOB path below. */
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	if (msg->msg_namelen) {
		/* Stream sockets cannot take a destination address. */
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		/* Keep two messages in the pipe so it schedules better */
		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

		/* allow fallback to order-0 allocations */
		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
					   msg->msg_flags & MSG_DONTWAIT, &err,
					   get_order(UNIX_SKB_FRAGS_SZ));
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		skb_put(skb, size - data_len);
		skb->data_len = data_len;
		skb->len = size;
		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		scm_stat_add(other, skb);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		err = queue_oob(sock, msg, other);
		if (err)
			goto out_err;
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	/* Partial success still reports the bytes already queued. */
	return sent ? : err;
}
2230 
/* sendpage() for SOCK_STREAM sockets: append @page as a page fragment to
 * the last skb on the peer's receive queue when the scm credentials match
 * (coalescing), otherwise queue a freshly allocated skb.  Returns @size on
 * success or a negative errno.
 */
static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
				    int offset, size_t size, int flags)
{
	int err;
	bool send_sigpipe = false;
	bool init_scm = true;
	struct scm_cookie scm;
	struct sock *other, *sk = socket->sk;
	struct sk_buff *skb, *newskb = NULL, *tail = NULL;

	if (flags & MSG_OOB)
		return -EOPNOTSUPP;

	other = unix_peer(sk);
	if (!other || sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	/* Never entered on the first pass: alloc_skb is only reached via
	 * goto from below, with both locks held.  Allocation may sleep, so
	 * drop the locks first and then retry the whole locked section.
	 */
	if (false) {
alloc_skb:
		unix_state_unlock(other);
		mutex_unlock(&unix_sk(other)->iolock);
		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
					      &err, 0);
		if (!newskb)
			goto err;
	}

	/* we must acquire iolock as we modify already present
	 * skbs in the sk_receive_queue and mess with skb->len
	 */
	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
	if (err) {
		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
		goto err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN) {
		err = -EPIPE;
		send_sigpipe = true;
		goto err_unlock;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    other->sk_shutdown & RCV_SHUTDOWN) {
		err = -EPIPE;
		send_sigpipe = true;
		goto err_state_unlock;
	}

	/* Initialize credentials only once, surviving retries via alloc_skb. */
	if (init_scm) {
		err = maybe_init_creds(&scm, socket, other);
		if (err)
			goto err_state_unlock;
		init_scm = false;
	}

	skb = skb_peek_tail(&other->sk_receive_queue);
	if (tail && tail == skb) {
		/* Queue tail unchanged since we dropped the locks: use the
		 * skb we allocated for it.
		 */
		skb = newskb;
	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
		/* Empty queue or credential mismatch: cannot coalesce. */
		if (newskb) {
			skb = newskb;
		} else {
			tail = skb;
			goto alloc_skb;
		}
	} else if (newskb) {
		/* this is fast path, we don't necessarily need to
		 * call to kfree_skb even though with newskb == NULL
		 * this - does no harm
		 */
		consume_skb(newskb);
		newskb = NULL;
	}

	if (skb_append_pagefrags(skb, page, offset, size)) {
		/* No room for another fragment: retry with a fresh skb. */
		tail = skb;
		goto alloc_skb;
	}

	skb->len += size;
	skb->data_len += size;
	skb->truesize += size;
	refcount_add(size, &sk->sk_wmem_alloc);

	if (newskb) {
		err = unix_scm_to_skb(&scm, skb, false);
		if (err)
			goto err_state_unlock;
		spin_lock(&other->sk_receive_queue.lock);
		__skb_queue_tail(&other->sk_receive_queue, newskb);
		spin_unlock(&other->sk_receive_queue.lock);
	}

	unix_state_unlock(other);
	mutex_unlock(&unix_sk(other)->iolock);

	other->sk_data_ready(other);
	scm_destroy(&scm);
	return size;

err_state_unlock:
	unix_state_unlock(other);
err_unlock:
	mutex_unlock(&unix_sk(other)->iolock);
err:
	kfree_skb(newskb);
	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	if (!init_scm)
		scm_destroy(&scm);
	return err;
}
2346 
2347 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2348 				  size_t len)
2349 {
2350 	int err;
2351 	struct sock *sk = sock->sk;
2352 
2353 	err = sock_error(sk);
2354 	if (err)
2355 		return err;
2356 
2357 	if (sk->sk_state != TCP_ESTABLISHED)
2358 		return -ENOTCONN;
2359 
2360 	if (msg->msg_namelen)
2361 		msg->msg_namelen = 0;
2362 
2363 	return unix_dgram_sendmsg(sock, msg, len);
2364 }
2365 
2366 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2367 				  size_t size, int flags)
2368 {
2369 	struct sock *sk = sock->sk;
2370 
2371 	if (sk->sk_state != TCP_ESTABLISHED)
2372 		return -ENOTCONN;
2373 
2374 	return unix_dgram_recvmsg(sock, msg, size, flags);
2375 }
2376 
2377 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2378 {
2379 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2380 
2381 	if (addr) {
2382 		msg->msg_namelen = addr->len;
2383 		memcpy(msg->msg_name, addr->name, addr->len);
2384 	}
2385 }
2386 
/* Core datagram receive: dequeue (or peek) one skb, copy data, address,
 * credentials and any passed file descriptors to the caller.  Returns the
 * number of bytes delivered (skb length when MSG_TRUNC is set) or a
 * negative errno.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		/* iolock serializes with other receivers touching the queue. */
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	/* We consumed queue space: wake writers blocked on a full peer. */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
2493 
/* Datagram recvmsg() entry point.  When BPF sockmap has replaced the
 * socket's proto, defer to its recvmsg; otherwise use the native path.
 */
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock *sk = sock->sk;

#ifdef CONFIG_BPF_SYSCALL
	/* READ_ONCE pairs with the proto swap done by sockmap. */
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_dgram_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return __unix_dgram_recvmsg(sk, msg, size, flags);
}
2507 
2508 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2509 {
2510 	int copied = 0;
2511 
2512 	while (1) {
2513 		struct unix_sock *u = unix_sk(sk);
2514 		struct sk_buff *skb;
2515 		int used, err;
2516 
2517 		mutex_lock(&u->iolock);
2518 		skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2519 		mutex_unlock(&u->iolock);
2520 		if (!skb)
2521 			return err;
2522 
2523 		used = recv_actor(sk, skb);
2524 		if (used <= 0) {
2525 			if (!copied)
2526 				copied = used;
2527 			kfree_skb(skb);
2528 			break;
2529 		} else if (used <= skb->len) {
2530 			copied += used;
2531 		}
2532 
2533 		kfree_skb(skb);
2534 		break;
2535 	}
2536 
2537 	return copied;
2538 }
2539 
2540 /*
2541  *	Sleep until more data has arrived. But check for races..
2542  */
/* Sleep until more data has arrived on @sk's receive queue, the timeout
 * expires, or a signal/shutdown/error interrupts the wait.  @last/@last_len
 * identify the queue tail observed by the caller so newly appended data (or
 * growth of the tail skb) is detected.  Returns the remaining timeout.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		/* Stop waiting if the queue changed or anything else
		 * demands a return to the caller.
		 */
		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		if (freezable)
			timeo = freezable_schedule_timeout(timeo);
		else
			timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		/* Socket torn down while we slept: bail out. */
		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
2582 
/* Bytes of this skb not yet consumed by the stream reader. */
static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}
2587 
/* Shared parameter block for the generic stream read loop.  recv_actor
 * copies/splices one chunk to its destination (msghdr or pipe) and
 * returns the number of bytes taken or a negative error.
 */
struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;		/* socket being read */
	struct msghdr *msg;		/* destination for recvmsg() variants */
	struct pipe_inode_info *pipe;	/* destination for splice_read() */
	size_t size;			/* bytes requested by the caller */
	int flags;			/* MSG_* flags */
	unsigned int splice_flags;	/* SPLICE_F_* flags */
};
2598 
2599 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2600 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2601 {
2602 	struct socket *sock = state->socket;
2603 	struct sock *sk = sock->sk;
2604 	struct unix_sock *u = unix_sk(sk);
2605 	int chunk = 1;
2606 	struct sk_buff *oob_skb;
2607 
2608 	mutex_lock(&u->iolock);
2609 	unix_state_lock(sk);
2610 
2611 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2612 		unix_state_unlock(sk);
2613 		mutex_unlock(&u->iolock);
2614 		return -EINVAL;
2615 	}
2616 
2617 	oob_skb = u->oob_skb;
2618 
2619 	if (!(state->flags & MSG_PEEK))
2620 		WRITE_ONCE(u->oob_skb, NULL);
2621 
2622 	unix_state_unlock(sk);
2623 
2624 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2625 
2626 	if (!(state->flags & MSG_PEEK)) {
2627 		UNIXCB(oob_skb).consumed += 1;
2628 		kfree_skb(oob_skb);
2629 	}
2630 
2631 	mutex_unlock(&u->iolock);
2632 
2633 	if (chunk < 0)
2634 		return -EFAULT;
2635 
2636 	state->msg->msg_flags |= MSG_OOB;
2637 	return 1;
2638 }
2639 
/* Decide how the in-band read loop should treat @skb at the head of the
 * receive queue when OOB data may be present.  Called with the socket
 * state lock held.  Returns the skb the reader should process next, or
 * NULL when the reader must stop (or re-peek the queue).
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct unix_sock *u = unix_sk(sk);

	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
		/* Fully consumed skb left on the queue: drop it now. */
		skb_unlink(skb, &sk->sk_receive_queue);
		consume_skb(skb);
		skb = NULL;
	} else {
		if (skb == u->oob_skb) {
			if (copied) {
				/* Stop in-band delivery at the OOB mark. */
				skb = NULL;
			} else if (sock_flag(sk, SOCK_URGINLINE)) {
				/* OOB byte delivered inline with the data. */
				if (!(flags & MSG_PEEK)) {
					WRITE_ONCE(u->oob_skb, NULL);
					consume_skb(skb);
				}
			} else if (!(flags & MSG_PEEK)) {
				/* Skip over the OOB skb; it is only
				 * reachable via MSG_OOB from here on.
				 */
				skb_unlink(skb, &sk->sk_receive_queue);
				consume_skb(skb);
				skb = skb_peek(&sk->sk_receive_queue);
			}
		}
	}
	return skb;
}
2667 #endif
2668 
2669 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2670 {
2671 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2672 		return -ENOTCONN;
2673 
2674 	return unix_read_skb(sk, recv_actor);
2675 }
2676 
/* Generic SOCK_STREAM/SOCK_SEQPACKET receive loop shared by recvmsg()
 * and splice_read().  Walks the receive queue, handing each chunk to
 * state->recv_actor, honouring MSG_PEEK, peek offsets, SCM credentials,
 * fd passing and (optionally) OOB data.  Returns bytes delivered or a
 * negative errno.
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb) {
				unix_state_unlock(sk);
				if (copied)
					break;
				goto redo;
			}
		}
#endif
again:
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			/* Drop iolock while sleeping so writers can make
			 * progress; retake it and re-examine the queue.
			 */
			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		/* Advance past skbs fully covered by the peek offset. */
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		/* Hold a ref: the actor may sleep and another reader may
		 * consume and unlink this skb meanwhile.
		 */
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* Stop after a message carrying fds so they are
			 * delivered with exactly their own data.
			 */
			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
2890 
2891 static int unix_stream_read_actor(struct sk_buff *skb,
2892 				  int skip, int chunk,
2893 				  struct unix_stream_read_state *state)
2894 {
2895 	int ret;
2896 
2897 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2898 				    state->msg, chunk);
2899 	return ret ?: chunk;
2900 }
2901 
2902 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2903 			  size_t size, int flags)
2904 {
2905 	struct unix_stream_read_state state = {
2906 		.recv_actor = unix_stream_read_actor,
2907 		.socket = sk->sk_socket,
2908 		.msg = msg,
2909 		.size = size,
2910 		.flags = flags
2911 	};
2912 
2913 	return unix_stream_read_generic(&state, true);
2914 }
2915 
2916 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2917 			       size_t size, int flags)
2918 {
2919 	struct unix_stream_read_state state = {
2920 		.recv_actor = unix_stream_read_actor,
2921 		.socket = sock,
2922 		.msg = msg,
2923 		.size = size,
2924 		.flags = flags
2925 	};
2926 
2927 #ifdef CONFIG_BPF_SYSCALL
2928 	struct sock *sk = sock->sk;
2929 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2930 
2931 	if (prot != &unix_stream_proto)
2932 		return prot->recvmsg(sk, msg, size, flags, NULL);
2933 #endif
2934 	return unix_stream_read_generic(&state, true);
2935 }
2936 
2937 static int unix_stream_splice_actor(struct sk_buff *skb,
2938 				    int skip, int chunk,
2939 				    struct unix_stream_read_state *state)
2940 {
2941 	return skb_splice_bits(skb, state->socket->sk,
2942 			       UNIXCB(skb).consumed + skip,
2943 			       state->pipe, chunk, state->splice_flags);
2944 }
2945 
2946 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2947 				       struct pipe_inode_info *pipe,
2948 				       size_t size, unsigned int flags)
2949 {
2950 	struct unix_stream_read_state state = {
2951 		.recv_actor = unix_stream_splice_actor,
2952 		.socket = sock,
2953 		.pipe = pipe,
2954 		.size = size,
2955 		.splice_flags = flags,
2956 	};
2957 
2958 	if (unlikely(*ppos))
2959 		return -ESPIPE;
2960 
2961 	if (sock->file->f_flags & O_NONBLOCK ||
2962 	    flags & SPLICE_F_NONBLOCK)
2963 		state.flags = MSG_DONTWAIT;
2964 
2965 	return unix_stream_read_generic(&state, false);
2966 }
2967 
/* shutdown(2) handler: record the shutdown mode on @sk and, for
 * connection-oriented types, mirror the complementary mode onto the peer
 * and wake it.
 */
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	sk->sk_shutdown |= mode;
	other = unix_peer(sk);
	if (other)
		sock_hold(other);	/* keep the peer alive across unlock */
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;
		const struct proto *prot = READ_ONCE(other->sk_prot);

		/* Detach the peer from any BPF sockmap (psock) first. */
		if (prot->unhash)
			prot->unhash(other);
		/* Our RCV shutdown means the peer can no longer send, and
		 * vice versa.
		 */
		if (mode&RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode&SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		other->sk_shutdown |= peer_mode;
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
3016 
3017 long unix_inq_len(struct sock *sk)
3018 {
3019 	struct sk_buff *skb;
3020 	long amount = 0;
3021 
3022 	if (sk->sk_state == TCP_LISTEN)
3023 		return -EINVAL;
3024 
3025 	spin_lock(&sk->sk_receive_queue.lock);
3026 	if (sk->sk_type == SOCK_STREAM ||
3027 	    sk->sk_type == SOCK_SEQPACKET) {
3028 		skb_queue_walk(&sk->sk_receive_queue, skb)
3029 			amount += unix_skb_len(skb);
3030 	} else {
3031 		skb = skb_peek(&sk->sk_receive_queue);
3032 		if (skb)
3033 			amount = skb->len;
3034 	}
3035 	spin_unlock(&sk->sk_receive_queue.lock);
3036 
3037 	return amount;
3038 }
3039 EXPORT_SYMBOL_GPL(unix_inq_len);
3040 
/* Bytes queued for sending but not yet consumed by the peer (SIOCOUTQ). */
long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3046 
/* SIOCUNIXFILE: open an O_PATH fd referring to the filesystem object the
 * socket is bound to.  Requires CAP_NET_ADMIN; only meaningful for
 * pathname (non-abstract) sockets.  Returns the new fd or a negative errno.
 */
static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	/* Acquire pairs with the release store that publishes ->addr. */
	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	if (!path.dentry)
		return -ENOENT;	/* abstract socket: no filesystem object */

	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	path_put(&path);

	return fd;
}
3082 
/* ioctl(2) handler for AF_UNIX sockets: queue sizes (SIOCOUTQ/SIOCINQ),
 * bound-file access (SIOCUNIXFILE) and the OOB mark test (SIOCATMARK).
 */
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	case SIOCATMARK:
		{
			struct sk_buff *skb;
			int answ = 0;

			/* At the mark iff the next skb is the OOB skb. */
			skb = skb_peek(&sk->sk_receive_queue);
			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
				answ = 1;
			err = put_user(answ, (int __user *)arg);
		}
		break;
#endif
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}
3123 
#ifdef CONFIG_COMPAT
/* 32-bit compat entry: every unix ioctl takes a pointer argument, so
 * converting with compat_ptr() is sufficient.
 */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
#endif
3130 
/* poll() for stream/seqpacket sockets.  All state is read locklessly;
 * sock_poll_wait() registers us for wakeups that re-run this check.
 */
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err)
		mask |= EPOLLERR;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	/* Pending OOB byte is signalled as urgent data. */
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}
3171 
/* poll() for datagram/seqpacket sockets.  Unlike unix_poll(), writability
 * additionally depends on the connected peer's receive queue, so we may
 * need the state lock and the peer-wake mechanism.
 */
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				    poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk);
	if (writable) {
		unix_state_lock(sk);

		/* Not writable if the peer's queue is full; register on the
		 * peer's wait queue so we are woken when space frees up.
		 */
		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full_lockless(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
3231 
3232 #ifdef CONFIG_PROC_FS
3233 
/* /proc/net/unix iterator position encoding: the upper bits of *pos select
 * a hash bucket and the low BUCKET_SPACE bits are a 1-based offset within
 * that bucket (offset 0 means "whole bucket already consumed").
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3239 
3240 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3241 {
3242 	unsigned long offset = get_offset(*pos);
3243 	unsigned long bucket = get_bucket(*pos);
3244 	unsigned long count = 0;
3245 	struct sock *sk;
3246 
3247 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3248 	     sk; sk = sk_next(sk)) {
3249 		if (++count == offset)
3250 			break;
3251 	}
3252 
3253 	return sk;
3254 }
3255 
/* Find the first socket at or after the position in *pos, scanning across
 * buckets.  On success the matching bucket's spinlock is HELD on return;
 * it is released by unix_get_next()/unix_seq_stop().
 */
static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);
	struct net *net = seq_file_net(seq);
	struct sock *sk;

	while (bucket < UNIX_HASH_SIZE) {
		spin_lock(&net->unx.table.locks[bucket]);

		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;	/* bucket lock intentionally held */

		spin_unlock(&net->unx.table.locks[bucket]);

		/* Move to the start (offset 1) of the next bucket. */
		*pos = set_bucket_offset(++bucket, 1);
	}

	return NULL;
}
3276 
/* Advance to the next socket.  Called with the current bucket's lock held;
 * if the bucket is exhausted, drop its lock and continue the search in the
 * following buckets via unix_get_first() (which re-acquires as needed).
 */
static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
				  loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);

	sk = sk_next(sk);
	if (sk)
		return sk;	/* still in the same (locked) bucket */


	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);

	*pos = set_bucket_offset(++bucket, 1);

	return unix_get_first(seq, pos);
}
3293 
3294 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3295 {
3296 	if (!*pos)
3297 		return SEQ_START_TOKEN;
3298 
3299 	return unix_get_first(seq, pos);
3300 }
3301 
3302 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3303 {
3304 	++*pos;
3305 
3306 	if (v == SEQ_START_TOKEN)
3307 		return unix_get_first(seq, pos);
3308 
3309 	return unix_get_next(seq, v, pos);
3310 }
3311 
3312 static void unix_seq_stop(struct seq_file *seq, void *v)
3313 {
3314 	struct sock *sk = v;
3315 
3316 	if (sk)
3317 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3318 }
3319 
/* Emit one /proc/net/unix row: kernel pointer (hashed by %pK), refcount,
 * flags, type, pseudo-state, inode and, if bound, the socket path
 * ('@'-prefixed for abstract names, embedded NULs shown as '@').
 */
static int unix_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	/* stable: iterator holds the bucket lock */
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len -
				offsetof(struct sockaddr_un, sun_path);
			if (u->addr->name->sun_path[0]) {
				len--;	/* drop the trailing NUL of a pathname */
			} else {
				seq_putc(seq, '@');	/* abstract name */
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}
3365 
/* seq_file operations backing /proc/net/unix. */
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
3372 
3373 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/* Private per-seq_file state for the unix-socket bpf iterator. */
struct bpf_unix_iter_state {
	struct seq_net_private p;
	unsigned int cur_sk;		/* next batch index to show */
	unsigned int end_sk;		/* number of sockets in the batch */
	unsigned int max_sk;		/* capacity of @batch */
	struct sock **batch;		/* sockets pinned via sock_hold() */
	bool st_bucket_done;		/* current bucket fully batched? */
};
3382 
/* Context handed to bpf programs attached to the "unix" iterator target. */
struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	uid_t uid __aligned(8);
};
3388 
3389 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3390 			      struct unix_sock *unix_sk, uid_t uid)
3391 {
3392 	struct bpf_iter__unix ctx;
3393 
3394 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3395 	ctx.meta = meta;
3396 	ctx.unix_sk = unix_sk;
3397 	ctx.uid = uid;
3398 	return bpf_iter_run_prog(prog, &ctx);
3399 }
3400 
3401 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3402 
3403 {
3404 	struct bpf_unix_iter_state *iter = seq->private;
3405 	unsigned int expected = 1;
3406 	struct sock *sk;
3407 
3408 	sock_hold(start_sk);
3409 	iter->batch[iter->end_sk++] = start_sk;
3410 
3411 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3412 		if (iter->end_sk < iter->max_sk) {
3413 			sock_hold(sk);
3414 			iter->batch[iter->end_sk++] = sk;
3415 		}
3416 
3417 		expected++;
3418 	}
3419 
3420 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3421 
3422 	return expected;
3423 }
3424 
3425 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3426 {
3427 	while (iter->cur_sk < iter->end_sk)
3428 		sock_put(iter->batch[iter->cur_sk++]);
3429 }
3430 
3431 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3432 				       unsigned int new_batch_sz)
3433 {
3434 	struct sock **new_batch;
3435 
3436 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3437 			     GFP_USER | __GFP_NOWARN);
3438 	if (!new_batch)
3439 		return -ENOMEM;
3440 
3441 	bpf_iter_unix_put_batch(iter);
3442 	kvfree(iter->batch);
3443 	iter->batch = new_batch;
3444 	iter->max_sk = new_batch_sz;
3445 
3446 	return 0;
3447 }
3448 
/* Fill the iterator's batch with all sockets of the bucket at *pos,
 * holding a reference on each so iteration can proceed without the
 * bucket lock.  Returns the first socket of the batch, or NULL when
 * the table is exhausted.
 */
static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
					loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* The previous bucket was fully batched; advance *pos to the
	 * first slot of the next bucket.
	 */
	if (iter->st_bucket_done)
		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;

	sk = unix_get_first(seq, pos);
	if (!sk)
		return NULL; /* Done */

	/* Returns with the bucket lock released and the sockets pinned. */
	expected = bpf_iter_unix_hold_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	/* Batch was too small: grow it once (with headroom) and rescan.
	 * If the second pass still overflows because the bucket grew
	 * concurrently, iterate the partial batch instead of looping.
	 */
	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
3483 
3484 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3485 {
3486 	if (!*pos)
3487 		return SEQ_START_TOKEN;
3488 
3489 	/* bpf iter does not support lseek, so it always
3490 	 * continue from where it was stop()-ped.
3491 	 */
3492 	return bpf_iter_unix_batch(seq, pos);
3493 }
3494 
/* Advance to the next batched socket, refilling the batch from the next
 * bucket when the current batch is exhausted.
 */
static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);

	++*pos;

	/* Either serve the next pinned socket from the batch, or batch
	 * the next bucket.
	 */
	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_unix_batch(seq, pos);

	return sk;
}
3516 
/* Run the attached bpf program for one batched socket. */
static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	bool slow;
	int ret;

	/* The header token is never shown to bpf programs. */
	if (v == SEQ_START_TOKEN)
		return 0;

	slow = lock_sock_fast(sk);

	/* The socket may have been closed since it was batched; skip a
	 * dying socket rather than presenting it to the program.
	 */
	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = unix_prog_seq_show(prog, &meta, v, uid);
unlock:
	unlock_sock_fast(sk, slow);
	return ret;
}
3544 
static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	/* !v means iteration completed: give the program one final call
	 * with a NULL unix_sk (stop == true) so it can emit a summary.
	 */
	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)unix_prog_seq_show(prog, &meta, v, 0);
	}

	/* Drop references on any batched sockets that were not shown. */
	if (iter->cur_sk < iter->end_sk)
		bpf_iter_unix_put_batch(iter);
}
3561 
/* seq_file operations backing the bpf unix-socket iterator. */
static const struct seq_operations bpf_iter_unix_seq_ops = {
	.start	= bpf_iter_unix_seq_start,
	.next	= bpf_iter_unix_seq_next,
	.stop	= bpf_iter_unix_seq_stop,
	.show	= bpf_iter_unix_seq_show,
};
3568 #endif
3569 #endif
3570 
/* socket(AF_UNIX, ...) creation hook, registered via sock_register(). */
static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};
3576 
3577 
/* Per-netns init: sysctls, /proc/net/unix, and the per-netns socket
 * hash table (lock and bucket arrays).  Unwinds in reverse order on
 * failure; the labels interleave with CONFIG_PROC_FS on purpose so
 * err_proc falls through to sysctl cleanup when procfs is disabled.
 */
static int __net_init unix_net_init(struct net *net)
{
	int i;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			     sizeof(struct seq_net_private)))
		goto err_sysctl;
#endif

	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
					      sizeof(spinlock_t), GFP_KERNEL);
	if (!net->unx.table.locks)
		goto err_proc;

	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
						sizeof(struct hlist_head),
						GFP_KERNEL);
	if (!net->unx.table.buckets)
		goto free_locks;

	for (i = 0; i < UNIX_HASH_SIZE; i++) {
		spin_lock_init(&net->unx.table.locks[i]);
		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
	}

	return 0;

free_locks:
	kvfree(net->unx.table.locks);
err_proc:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("unix", net->proc_net);
err_sysctl:
#endif
	unix_sysctl_unregister(net);
out:
	/* Every failure path here is an allocation/registration failure. */
	return -ENOMEM;
}
3621 
/* Per-netns teardown: release everything unix_net_init() set up. */
static void __net_exit unix_net_exit(struct net *net)
{
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}
3629 
/* Per-network-namespace lifecycle hooks for AF_UNIX state. */
static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
3634 
3635 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declares the bpf_iter_unix() target function and its BTF signature. */
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

/* Initial batch capacity; grown on demand by bpf_iter_unix_batch(). */
#define INIT_BATCH_SZ 16
3640 
3641 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3642 {
3643 	struct bpf_unix_iter_state *iter = priv_data;
3644 	int err;
3645 
3646 	err = bpf_iter_init_seq_net(priv_data, aux);
3647 	if (err)
3648 		return err;
3649 
3650 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3651 	if (err) {
3652 		bpf_iter_fini_seq_net(priv_data);
3653 		return err;
3654 	}
3655 
3656 	return 0;
3657 }
3658 
/* Tear down the iterator's private state and free the socket batch. */
static void bpf_iter_fini_unix(void *priv_data)
{
	struct bpf_unix_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}
3666 
/* Binds the bpf iterator's seq_file ops to its private state. */
static const struct bpf_iter_seq_info unix_seq_info = {
	.seq_ops		= &bpf_iter_unix_seq_ops,
	.init_seq_private	= bpf_iter_init_unix,
	.fini_seq_private	= bpf_iter_fini_unix,
	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
};
3673 
3674 static const struct bpf_func_proto *
3675 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3676 			     const struct bpf_prog *prog)
3677 {
3678 	switch (func_id) {
3679 	case BPF_FUNC_setsockopt:
3680 		return &bpf_sk_setsockopt_proto;
3681 	case BPF_FUNC_getsockopt:
3682 		return &bpf_sk_getsockopt_proto;
3683 	default:
3684 		return NULL;
3685 	}
3686 }
3687 
/* Registration record for the "unix" bpf iterator target.  unix_sk is
 * PTR_TO_BTF_ID_OR_NULL because programs also run with a NULL socket
 * (the final stop() call).
 */
static struct bpf_iter_reg unix_reg_info = {
	.target			= "unix",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__unix, unix_sk),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto         = bpf_iter_unix_get_func_proto,
	.seq_info		= &unix_seq_info,
};
3698 
/* Register the "unix" iterator target at boot; failure only loses the
 * bpf iterator feature, so warn rather than fail the init.
 */
static void __init bpf_iter_register(void)
{
	/* The BTF id is only known at runtime, so patch it in here. */
	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
	if (bpf_iter_reg_target(&unix_reg_info))
		pr_warn("Warning: could not register bpf iterator unix\n");
}
3705 #endif
3706 
3707 static int __init af_unix_init(void)
3708 {
3709 	int i, rc = -1;
3710 
3711 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3712 
3713 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3714 		spin_lock_init(&bsd_socket_locks[i]);
3715 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3716 	}
3717 
3718 	rc = proto_register(&unix_dgram_proto, 1);
3719 	if (rc != 0) {
3720 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3721 		goto out;
3722 	}
3723 
3724 	rc = proto_register(&unix_stream_proto, 1);
3725 	if (rc != 0) {
3726 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3727 		goto out;
3728 	}
3729 
3730 	sock_register(&unix_family_ops);
3731 	register_pernet_subsys(&unix_net_ops);
3732 	unix_bpf_build_proto();
3733 
3734 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3735 	bpf_iter_register();
3736 #endif
3737 
3738 out:
3739 	return rc;
3740 }
3741 
/* Module unload: undo everything af_unix_init() registered. */
static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_dgram_proto);
	proto_unregister(&unix_stream_proto);
	unregister_pernet_subsys(&unix_net_ops);
}
3749 
/* Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket. But later than subsys_initcall() because
 * we depend on stuff initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);
3759