xref: /openbmc/linux/net/unix/af_unix.c (revision 61ae993c)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
 *					has been reached. This won't break
33  *					old apps and it will avoid huge amount
34  *					of socks hashed (this for unix_gc()
35  *					performances reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 
120 #include "scm.h"
121 
122 static atomic_long_t unix_nr_socks;
123 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
124 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
125 
126 /* SMP locking strategy:
127  *    hash table is protected with spinlock.
128  *    each socket state is protected by separate spinlock.
129  */
130 
131 static unsigned int unix_unbound_hash(struct sock *sk)
132 {
133 	unsigned long hash = (unsigned long)sk;
134 
135 	hash ^= hash >> 16;
136 	hash ^= hash >> 8;
137 	hash ^= sk->sk_type;
138 
139 	return hash & UNIX_HASH_MOD;
140 }
141 
142 static unsigned int unix_bsd_hash(struct inode *i)
143 {
144 	return i->i_ino & UNIX_HASH_MOD;
145 }
146 
147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
148 				       int addr_len, int type)
149 {
150 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
151 	unsigned int hash;
152 
153 	hash = (__force unsigned int)csum_fold(csum);
154 	hash ^= hash >> 8;
155 	hash ^= type;
156 
157 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
158 }
159 
160 static void unix_table_double_lock(struct net *net,
161 				   unsigned int hash1, unsigned int hash2)
162 {
163 	if (hash1 == hash2) {
164 		spin_lock(&net->unx.table.locks[hash1]);
165 		return;
166 	}
167 
168 	if (hash1 > hash2)
169 		swap(hash1, hash2);
170 
171 	spin_lock(&net->unx.table.locks[hash1]);
172 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
173 }
174 
175 static void unix_table_double_unlock(struct net *net,
176 				     unsigned int hash1, unsigned int hash2)
177 {
178 	if (hash1 == hash2) {
179 		spin_unlock(&net->unx.table.locks[hash1]);
180 		return;
181 	}
182 
183 	spin_unlock(&net->unx.table.locks[hash1]);
184 	spin_unlock(&net->unx.table.locks[hash2]);
185 }
186 
187 #ifdef CONFIG_SECURITY_NETWORK
188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
189 {
190 	UNIXCB(skb).secid = scm->secid;
191 }
192 
193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
194 {
195 	scm->secid = UNIXCB(skb).secid;
196 }
197 
198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
199 {
200 	return (scm->secid == UNIXCB(skb).secid);
201 }
202 #else
203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
204 { }
205 
206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
207 { }
208 
209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
210 {
211 	return true;
212 }
213 #endif /* CONFIG_SECURITY_NETWORK */
214 
215 #define unix_peer(sk) (unix_sk(sk)->peer)
216 
217 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
218 {
219 	return unix_peer(osk) == sk;
220 }
221 
222 static inline int unix_may_send(struct sock *sk, struct sock *osk)
223 {
224 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
225 }
226 
227 static inline int unix_recvq_full(const struct sock *sk)
228 {
229 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
230 }
231 
232 static inline int unix_recvq_full_lockless(const struct sock *sk)
233 {
234 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
235 		READ_ONCE(sk->sk_max_ack_backlog);
236 }
237 
238 struct sock *unix_peer_get(struct sock *s)
239 {
240 	struct sock *peer;
241 
242 	unix_state_lock(s);
243 	peer = unix_peer(s);
244 	if (peer)
245 		sock_hold(peer);
246 	unix_state_unlock(s);
247 	return peer;
248 }
249 EXPORT_SYMBOL_GPL(unix_peer_get);
250 
251 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
252 					     int addr_len)
253 {
254 	struct unix_address *addr;
255 
256 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
257 	if (!addr)
258 		return NULL;
259 
260 	refcount_set(&addr->refcnt, 1);
261 	addr->len = addr_len;
262 	memcpy(addr->name, sunaddr, addr_len);
263 
264 	return addr;
265 }
266 
267 static inline void unix_release_addr(struct unix_address *addr)
268 {
269 	if (refcount_dec_and_test(&addr->refcnt))
270 		kfree(addr);
271 }
272 
273 /*
274  *	Check unix socket name:
275  *		- should be not zero length.
276  *	        - if started by not zero, should be NULL terminated (FS object)
277  *		- if started by zero, it is abstract name.
278  */
279 
280 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
281 {
282 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
283 	    addr_len > sizeof(*sunaddr))
284 		return -EINVAL;
285 
286 	if (sunaddr->sun_family != AF_UNIX)
287 		return -EINVAL;
288 
289 	return 0;
290 }
291 
292 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
293 {
294 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
295 	short offset = offsetof(struct sockaddr_storage, __data);
296 
297 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
298 
299 	/* This may look like an off by one error but it is a bit more
300 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
301 	 * sun_path[108] doesn't as such exist.  However in kernel space
302 	 * we are guaranteed that it is a valid memory location in our
303 	 * kernel address buffer because syscall functions always pass
304 	 * a pointer of struct sockaddr_storage which has a bigger buffer
305 	 * than 108.  Also, we must terminate sun_path for strlen() in
306 	 * getname_kernel().
307 	 */
308 	addr->__data[addr_len - offset] = 0;
309 
310 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
311 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
312 	 * know the actual buffer.
313 	 */
314 	return strlen(addr->__data) + offset + 1;
315 }
316 
317 static void __unix_remove_socket(struct sock *sk)
318 {
319 	sk_del_node_init(sk);
320 }
321 
322 static void __unix_insert_socket(struct net *net, struct sock *sk)
323 {
324 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
325 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
326 }
327 
328 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
329 				 struct unix_address *addr, unsigned int hash)
330 {
331 	__unix_remove_socket(sk);
332 	smp_store_release(&unix_sk(sk)->addr, addr);
333 
334 	sk->sk_hash = hash;
335 	__unix_insert_socket(net, sk);
336 }
337 
338 static void unix_remove_socket(struct net *net, struct sock *sk)
339 {
340 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
341 	__unix_remove_socket(sk);
342 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
343 }
344 
345 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
346 {
347 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
348 	__unix_insert_socket(net, sk);
349 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
350 }
351 
352 static void unix_insert_bsd_socket(struct sock *sk)
353 {
354 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
355 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
356 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
357 }
358 
359 static void unix_remove_bsd_socket(struct sock *sk)
360 {
361 	if (!hlist_unhashed(&sk->sk_bind_node)) {
362 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
363 		__sk_del_bind_node(sk);
364 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
365 
366 		sk_node_init(&sk->sk_bind_node);
367 	}
368 }
369 
370 static struct sock *__unix_find_socket_byname(struct net *net,
371 					      struct sockaddr_un *sunname,
372 					      int len, unsigned int hash)
373 {
374 	struct sock *s;
375 
376 	sk_for_each(s, &net->unx.table.buckets[hash]) {
377 		struct unix_sock *u = unix_sk(s);
378 
379 		if (u->addr->len == len &&
380 		    !memcmp(u->addr->name, sunname, len))
381 			return s;
382 	}
383 	return NULL;
384 }
385 
386 static inline struct sock *unix_find_socket_byname(struct net *net,
387 						   struct sockaddr_un *sunname,
388 						   int len, unsigned int hash)
389 {
390 	struct sock *s;
391 
392 	spin_lock(&net->unx.table.locks[hash]);
393 	s = __unix_find_socket_byname(net, sunname, len, hash);
394 	if (s)
395 		sock_hold(s);
396 	spin_unlock(&net->unx.table.locks[hash]);
397 	return s;
398 }
399 
400 static struct sock *unix_find_socket_byinode(struct inode *i)
401 {
402 	unsigned int hash = unix_bsd_hash(i);
403 	struct sock *s;
404 
405 	spin_lock(&bsd_socket_locks[hash]);
406 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
407 		struct dentry *dentry = unix_sk(s)->path.dentry;
408 
409 		if (dentry && d_backing_inode(dentry) == i) {
410 			sock_hold(s);
411 			spin_unlock(&bsd_socket_locks[hash]);
412 			return s;
413 		}
414 	}
415 	spin_unlock(&bsd_socket_locks[hash]);
416 	return NULL;
417 }
418 
419 /* Support code for asymmetrically connected dgram sockets
420  *
421  * If a datagram socket is connected to a socket not itself connected
422  * to the first socket (eg, /dev/log), clients may only enqueue more
423  * messages if the present receive queue of the server socket is not
424  * "too large". This means there's a second writeability condition
425  * poll and sendmsg need to test. The dgram recv code will do a wake
426  * up on the peer_wait wait queue of a socket upon reception of a
427  * datagram which needs to be propagated to sleeping would-be writers
428  * since these might not have sent anything so far. This can't be
429  * accomplished via poll_wait because the lifetime of the server
430  * socket might be less than that of its clients if these break their
431  * association with it or if the server socket is closed while clients
432  * are still connected to it and there's no way to inform "a polling
433  * implementation" that it should let go of a certain wait queue
434  *
435  * In order to propagate a wake up, a wait_queue_entry_t of the client
436  * socket is enqueued on the peer_wait queue of the server socket
437  * whose wake function does a wake_up on the ordinary client socket
438  * wait queue. This connection is established whenever a write (or
439  * poll for write) hit the flow control condition and broken when the
440  * association to the server socket is dissolved or after a wake up
441  * was relayed.
442  */
443 
444 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
445 				      void *key)
446 {
447 	struct unix_sock *u;
448 	wait_queue_head_t *u_sleep;
449 
450 	u = container_of(q, struct unix_sock, peer_wake);
451 
452 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
453 			    q);
454 	u->peer_wake.private = NULL;
455 
456 	/* relaying can only happen while the wq still exists */
457 	u_sleep = sk_sleep(&u->sk);
458 	if (u_sleep)
459 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
460 
461 	return 0;
462 }
463 
464 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
465 {
466 	struct unix_sock *u, *u_other;
467 	int rc;
468 
469 	u = unix_sk(sk);
470 	u_other = unix_sk(other);
471 	rc = 0;
472 	spin_lock(&u_other->peer_wait.lock);
473 
474 	if (!u->peer_wake.private) {
475 		u->peer_wake.private = other;
476 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
477 
478 		rc = 1;
479 	}
480 
481 	spin_unlock(&u_other->peer_wait.lock);
482 	return rc;
483 }
484 
485 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
486 					    struct sock *other)
487 {
488 	struct unix_sock *u, *u_other;
489 
490 	u = unix_sk(sk);
491 	u_other = unix_sk(other);
492 	spin_lock(&u_other->peer_wait.lock);
493 
494 	if (u->peer_wake.private == other) {
495 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
496 		u->peer_wake.private = NULL;
497 	}
498 
499 	spin_unlock(&u_other->peer_wait.lock);
500 }
501 
502 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
503 						   struct sock *other)
504 {
505 	unix_dgram_peer_wake_disconnect(sk, other);
506 	wake_up_interruptible_poll(sk_sleep(sk),
507 				   EPOLLOUT |
508 				   EPOLLWRNORM |
509 				   EPOLLWRBAND);
510 }
511 
512 /* preconditions:
513  *	- unix_peer(sk) == other
514  *	- association is stable
515  */
516 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
517 {
518 	int connected;
519 
520 	connected = unix_dgram_peer_wake_connect(sk, other);
521 
522 	/* If other is SOCK_DEAD, we want to make sure we signal
523 	 * POLLOUT, such that a subsequent write() can get a
524 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
525 	 * to other and its full, we will hang waiting for POLLOUT.
526 	 */
527 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
528 		return 1;
529 
530 	if (connected)
531 		unix_dgram_peer_wake_disconnect(sk, other);
532 
533 	return 0;
534 }
535 
536 static int unix_writable(const struct sock *sk)
537 {
538 	return sk->sk_state != TCP_LISTEN &&
539 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
540 }
541 
542 static void unix_write_space(struct sock *sk)
543 {
544 	struct socket_wq *wq;
545 
546 	rcu_read_lock();
547 	if (unix_writable(sk)) {
548 		wq = rcu_dereference(sk->sk_wq);
549 		if (skwq_has_sleeper(wq))
550 			wake_up_interruptible_sync_poll(&wq->wait,
551 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
552 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
553 	}
554 	rcu_read_unlock();
555 }
556 
557 /* When dgram socket disconnects (or changes its peer), we clear its receive
558  * queue of packets arrived from previous peer. First, it allows to do
559  * flow control based only on wmem_alloc; second, sk connected to peer
560  * may receive messages only from that peer. */
561 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
562 {
563 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
564 		skb_queue_purge(&sk->sk_receive_queue);
565 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
566 
567 		/* If one link of bidirectional dgram pipe is disconnected,
568 		 * we signal error. Messages are lost. Do not make this,
569 		 * when peer was not connected to us.
570 		 */
571 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
572 			WRITE_ONCE(other->sk_err, ECONNRESET);
573 			sk_error_report(other);
574 		}
575 	}
576 	other->sk_state = TCP_CLOSE;
577 }
578 
579 static void unix_sock_destructor(struct sock *sk)
580 {
581 	struct unix_sock *u = unix_sk(sk);
582 
583 	skb_queue_purge(&sk->sk_receive_queue);
584 
585 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
586 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
587 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
588 	if (!sock_flag(sk, SOCK_DEAD)) {
589 		pr_info("Attempt to release alive unix socket: %p\n", sk);
590 		return;
591 	}
592 
593 	if (u->addr)
594 		unix_release_addr(u->addr);
595 
596 	atomic_long_dec(&unix_nr_socks);
597 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
598 #ifdef UNIX_REFCNT_DEBUG
599 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
600 		atomic_long_read(&unix_nr_socks));
601 #endif
602 }
603 
604 static void unix_release_sock(struct sock *sk, int embrion)
605 {
606 	struct unix_sock *u = unix_sk(sk);
607 	struct sock *skpair;
608 	struct sk_buff *skb;
609 	struct path path;
610 	int state;
611 
612 	unix_remove_socket(sock_net(sk), sk);
613 	unix_remove_bsd_socket(sk);
614 
615 	/* Clear state */
616 	unix_state_lock(sk);
617 	sock_orphan(sk);
618 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
619 	path	     = u->path;
620 	u->path.dentry = NULL;
621 	u->path.mnt = NULL;
622 	state = sk->sk_state;
623 	sk->sk_state = TCP_CLOSE;
624 
625 	skpair = unix_peer(sk);
626 	unix_peer(sk) = NULL;
627 
628 	unix_state_unlock(sk);
629 
630 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
631 	if (u->oob_skb) {
632 		kfree_skb(u->oob_skb);
633 		u->oob_skb = NULL;
634 	}
635 #endif
636 
637 	wake_up_interruptible_all(&u->peer_wait);
638 
639 	if (skpair != NULL) {
640 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
641 			unix_state_lock(skpair);
642 			/* No more writes */
643 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
644 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
645 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
646 			unix_state_unlock(skpair);
647 			skpair->sk_state_change(skpair);
648 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
649 		}
650 
651 		unix_dgram_peer_wake_disconnect(sk, skpair);
652 		sock_put(skpair); /* It may now die */
653 	}
654 
655 	/* Try to flush out this socket. Throw out buffers at least */
656 
657 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
658 		if (state == TCP_LISTEN)
659 			unix_release_sock(skb->sk, 1);
660 		/* passed fds are erased in the kfree_skb hook	      */
661 		UNIXCB(skb).consumed = skb->len;
662 		kfree_skb(skb);
663 	}
664 
665 	if (path.dentry)
666 		path_put(&path);
667 
668 	sock_put(sk);
669 
670 	/* ---- Socket is dead now and most probably destroyed ---- */
671 
672 	/*
673 	 * Fixme: BSD difference: In BSD all sockets connected to us get
674 	 *	  ECONNRESET and we die on the spot. In Linux we behave
675 	 *	  like files and pipes do and wait for the last
676 	 *	  dereference.
677 	 *
678 	 * Can't we simply set sock->err?
679 	 *
680 	 *	  What the above comment does talk about? --ANK(980817)
681 	 */
682 
683 	if (READ_ONCE(unix_tot_inflight))
684 		unix_gc();		/* Garbage collect fds */
685 }
686 
687 static void init_peercred(struct sock *sk)
688 {
689 	const struct cred *old_cred;
690 	struct pid *old_pid;
691 
692 	spin_lock(&sk->sk_peer_lock);
693 	old_pid = sk->sk_peer_pid;
694 	old_cred = sk->sk_peer_cred;
695 	sk->sk_peer_pid  = get_pid(task_tgid(current));
696 	sk->sk_peer_cred = get_current_cred();
697 	spin_unlock(&sk->sk_peer_lock);
698 
699 	put_pid(old_pid);
700 	put_cred(old_cred);
701 }
702 
703 static void copy_peercred(struct sock *sk, struct sock *peersk)
704 {
705 	const struct cred *old_cred;
706 	struct pid *old_pid;
707 
708 	if (sk < peersk) {
709 		spin_lock(&sk->sk_peer_lock);
710 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
711 	} else {
712 		spin_lock(&peersk->sk_peer_lock);
713 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
714 	}
715 	old_pid = sk->sk_peer_pid;
716 	old_cred = sk->sk_peer_cred;
717 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
718 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
719 
720 	spin_unlock(&sk->sk_peer_lock);
721 	spin_unlock(&peersk->sk_peer_lock);
722 
723 	put_pid(old_pid);
724 	put_cred(old_cred);
725 }
726 
727 static int unix_listen(struct socket *sock, int backlog)
728 {
729 	int err;
730 	struct sock *sk = sock->sk;
731 	struct unix_sock *u = unix_sk(sk);
732 
733 	err = -EOPNOTSUPP;
734 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
735 		goto out;	/* Only stream/seqpacket sockets accept */
736 	err = -EINVAL;
737 	if (!u->addr)
738 		goto out;	/* No listens on an unbound socket */
739 	unix_state_lock(sk);
740 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
741 		goto out_unlock;
742 	if (backlog > sk->sk_max_ack_backlog)
743 		wake_up_interruptible_all(&u->peer_wait);
744 	sk->sk_max_ack_backlog	= backlog;
745 	sk->sk_state		= TCP_LISTEN;
746 	/* set credentials so connect can copy them */
747 	init_peercred(sk);
748 	err = 0;
749 
750 out_unlock:
751 	unix_state_unlock(sk);
752 out:
753 	return err;
754 }
755 
756 static int unix_release(struct socket *);
757 static int unix_bind(struct socket *, struct sockaddr *, int);
758 static int unix_stream_connect(struct socket *, struct sockaddr *,
759 			       int addr_len, int flags);
760 static int unix_socketpair(struct socket *, struct socket *);
761 static int unix_accept(struct socket *, struct socket *, int, bool);
762 static int unix_getname(struct socket *, struct sockaddr *, int);
763 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
764 static __poll_t unix_dgram_poll(struct file *, struct socket *,
765 				    poll_table *);
766 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
767 #ifdef CONFIG_COMPAT
768 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
769 #endif
770 static int unix_shutdown(struct socket *, int);
771 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
772 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
773 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
774 				       struct pipe_inode_info *, size_t size,
775 				       unsigned int flags);
776 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
777 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
778 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
779 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
780 static int unix_dgram_connect(struct socket *, struct sockaddr *,
781 			      int, int);
782 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
783 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
784 				  int);
785 
786 static int unix_set_peek_off(struct sock *sk, int val)
787 {
788 	struct unix_sock *u = unix_sk(sk);
789 
790 	if (mutex_lock_interruptible(&u->iolock))
791 		return -EINTR;
792 
793 	WRITE_ONCE(sk->sk_peek_off, val);
794 	mutex_unlock(&u->iolock);
795 
796 	return 0;
797 }
798 
799 #ifdef CONFIG_PROC_FS
800 static int unix_count_nr_fds(struct sock *sk)
801 {
802 	struct sk_buff *skb;
803 	struct unix_sock *u;
804 	int nr_fds = 0;
805 
806 	spin_lock(&sk->sk_receive_queue.lock);
807 	skb = skb_peek(&sk->sk_receive_queue);
808 	while (skb) {
809 		u = unix_sk(skb->sk);
810 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
811 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
812 	}
813 	spin_unlock(&sk->sk_receive_queue.lock);
814 
815 	return nr_fds;
816 }
817 
818 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
819 {
820 	struct sock *sk = sock->sk;
821 	unsigned char s_state;
822 	struct unix_sock *u;
823 	int nr_fds = 0;
824 
825 	if (sk) {
826 		s_state = READ_ONCE(sk->sk_state);
827 		u = unix_sk(sk);
828 
829 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
830 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
831 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
832 		 */
833 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
834 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
835 		else if (s_state == TCP_LISTEN)
836 			nr_fds = unix_count_nr_fds(sk);
837 
838 		seq_printf(m, "scm_fds: %u\n", nr_fds);
839 	}
840 }
841 #else
842 #define unix_show_fdinfo NULL
843 #endif
844 
845 static const struct proto_ops unix_stream_ops = {
846 	.family =	PF_UNIX,
847 	.owner =	THIS_MODULE,
848 	.release =	unix_release,
849 	.bind =		unix_bind,
850 	.connect =	unix_stream_connect,
851 	.socketpair =	unix_socketpair,
852 	.accept =	unix_accept,
853 	.getname =	unix_getname,
854 	.poll =		unix_poll,
855 	.ioctl =	unix_ioctl,
856 #ifdef CONFIG_COMPAT
857 	.compat_ioctl =	unix_compat_ioctl,
858 #endif
859 	.listen =	unix_listen,
860 	.shutdown =	unix_shutdown,
861 	.sendmsg =	unix_stream_sendmsg,
862 	.recvmsg =	unix_stream_recvmsg,
863 	.read_skb =	unix_stream_read_skb,
864 	.mmap =		sock_no_mmap,
865 	.splice_read =	unix_stream_splice_read,
866 	.set_peek_off =	unix_set_peek_off,
867 	.show_fdinfo =	unix_show_fdinfo,
868 };
869 
870 static const struct proto_ops unix_dgram_ops = {
871 	.family =	PF_UNIX,
872 	.owner =	THIS_MODULE,
873 	.release =	unix_release,
874 	.bind =		unix_bind,
875 	.connect =	unix_dgram_connect,
876 	.socketpair =	unix_socketpair,
877 	.accept =	sock_no_accept,
878 	.getname =	unix_getname,
879 	.poll =		unix_dgram_poll,
880 	.ioctl =	unix_ioctl,
881 #ifdef CONFIG_COMPAT
882 	.compat_ioctl =	unix_compat_ioctl,
883 #endif
884 	.listen =	sock_no_listen,
885 	.shutdown =	unix_shutdown,
886 	.sendmsg =	unix_dgram_sendmsg,
887 	.read_skb =	unix_read_skb,
888 	.recvmsg =	unix_dgram_recvmsg,
889 	.mmap =		sock_no_mmap,
890 	.set_peek_off =	unix_set_peek_off,
891 	.show_fdinfo =	unix_show_fdinfo,
892 };
893 
894 static const struct proto_ops unix_seqpacket_ops = {
895 	.family =	PF_UNIX,
896 	.owner =	THIS_MODULE,
897 	.release =	unix_release,
898 	.bind =		unix_bind,
899 	.connect =	unix_stream_connect,
900 	.socketpair =	unix_socketpair,
901 	.accept =	unix_accept,
902 	.getname =	unix_getname,
903 	.poll =		unix_dgram_poll,
904 	.ioctl =	unix_ioctl,
905 #ifdef CONFIG_COMPAT
906 	.compat_ioctl =	unix_compat_ioctl,
907 #endif
908 	.listen =	unix_listen,
909 	.shutdown =	unix_shutdown,
910 	.sendmsg =	unix_seqpacket_sendmsg,
911 	.recvmsg =	unix_seqpacket_recvmsg,
912 	.mmap =		sock_no_mmap,
913 	.set_peek_off =	unix_set_peek_off,
914 	.show_fdinfo =	unix_show_fdinfo,
915 };
916 
917 static void unix_close(struct sock *sk, long timeout)
918 {
919 	/* Nothing to do here, unix socket does not need a ->close().
920 	 * This is merely for sockmap.
921 	 */
922 }
923 
924 static void unix_unhash(struct sock *sk)
925 {
926 	/* Nothing to do here, unix socket does not need a ->unhash().
927 	 * This is merely for sockmap.
928 	 */
929 }
930 
931 static bool unix_bpf_bypass_getsockopt(int level, int optname)
932 {
933 	if (level == SOL_SOCKET) {
934 		switch (optname) {
935 		case SO_PEERPIDFD:
936 			return true;
937 		default:
938 			return false;
939 		}
940 	}
941 
942 	return false;
943 }
944 
945 struct proto unix_dgram_proto = {
946 	.name			= "UNIX",
947 	.owner			= THIS_MODULE,
948 	.obj_size		= sizeof(struct unix_sock),
949 	.close			= unix_close,
950 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
951 #ifdef CONFIG_BPF_SYSCALL
952 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
953 #endif
954 };
955 
956 struct proto unix_stream_proto = {
957 	.name			= "UNIX-STREAM",
958 	.owner			= THIS_MODULE,
959 	.obj_size		= sizeof(struct unix_sock),
960 	.close			= unix_close,
961 	.unhash			= unix_unhash,
962 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
963 #ifdef CONFIG_BPF_SYSCALL
964 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
965 #endif
966 };
967 
/* Allocate and initialize a new AF_UNIX sock.
 *
 * Enforces a global cap of 2 * get_max_files() unix sockets.  SOCK_STREAM
 * uses unix_stream_proto; SOCK_DGRAM and SOCK_SEQPACKET share
 * unix_dgram_proto.  The new sock is hashed into the unbound table and
 * the per-proto inuse counter is bumped.
 *
 * Returns the new sock, or ERR_PTR(-ENFILE/-ENOMEM) on failure.  Note the
 * global counter is incremented first and rolled back on the error path.
 */
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /* SOCK_DGRAM and SOCK_SEQPACKET */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	/* Unbound sockets hash by sk pointer until bind()/autobind. */
	sk->sk_hash		= unix_unbound_hash(sk);
	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u	  = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}
1018 
1019 static int unix_create(struct net *net, struct socket *sock, int protocol,
1020 		       int kern)
1021 {
1022 	struct sock *sk;
1023 
1024 	if (protocol && protocol != PF_UNIX)
1025 		return -EPROTONOSUPPORT;
1026 
1027 	sock->state = SS_UNCONNECTED;
1028 
1029 	switch (sock->type) {
1030 	case SOCK_STREAM:
1031 		sock->ops = &unix_stream_ops;
1032 		break;
1033 		/*
1034 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1035 		 *	nothing uses it.
1036 		 */
1037 	case SOCK_RAW:
1038 		sock->type = SOCK_DGRAM;
1039 		fallthrough;
1040 	case SOCK_DGRAM:
1041 		sock->ops = &unix_dgram_ops;
1042 		break;
1043 	case SOCK_SEQPACKET:
1044 		sock->ops = &unix_seqpacket_ops;
1045 		break;
1046 	default:
1047 		return -ESOCKTNOSUPPORT;
1048 	}
1049 
1050 	sk = unix_create1(net, sock, kern, sock->type);
1051 	if (IS_ERR(sk))
1052 		return PTR_ERR(sk);
1053 
1054 	return 0;
1055 }
1056 
1057 static int unix_release(struct socket *sock)
1058 {
1059 	struct sock *sk = sock->sk;
1060 
1061 	if (!sk)
1062 		return 0;
1063 
1064 	sk->sk_prot->close(sk, 0);
1065 	unix_release_sock(sk, 0);
1066 	sock->sk = NULL;
1067 
1068 	return 0;
1069 }
1070 
/* Look up the target socket for a filesystem (pathname) address.
 *
 * Resolves sun_path through the VFS, requires write permission on the
 * resulting inode, checks it really is a socket inode, and finds the
 * bound sock by inode.  On success the file's atime is updated and the
 * caller owns a reference on the returned sock.
 *
 * Errors: path lookup / permission failures, -ECONNREFUSED when the path
 * is not a socket or nothing is bound to it, -EPROTOTYPE when the bound
 * socket's type does not match @type.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}
1114 
/* Look up the target socket for an abstract-namespace address.
 * Returns a referenced sock or ERR_PTR(-ECONNREFUSED).
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct sock *sk = unix_find_socket_byname(net, sunaddr, addr_len,
						  hash);

	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	/* Update atime if the socket also has a filesystem presence. */
	if (unix_sk(sk)->path.dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}
1133 
1134 static struct sock *unix_find_other(struct net *net,
1135 				    struct sockaddr_un *sunaddr,
1136 				    int addr_len, int type)
1137 {
1138 	struct sock *sk;
1139 
1140 	if (sunaddr->sun_path[0])
1141 		sk = unix_find_bsd(sunaddr, addr_len, type);
1142 	else
1143 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1144 
1145 	return sk;
1146 }
1147 
/* Bind @sk to a unique abstract name of the form "\0XXXXX" (five hex
 * digits, 20 bits of namespace).
 *
 * Starts from a random number and probes sequentially under the hash
 * table double lock until a free name is found; gives up with -ENOSPC
 * after a full 2^20 wrap.  u->bindlock serializes against concurrent
 * bind()/autobind; an already-bound socket returns 0 unchanged.
 */
static int unix_autobind(struct sock *sk)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	/* name is "\0" + 5 hex digits; sun_path[0] stays 0 from kzalloc */
	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seems to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	/* Publish the address and rehash while both buckets are locked. */
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
1208 
/* Bind @sk to a filesystem pathname: create the socket inode with
 * vfs_mknod(), then publish the address under u->bindlock and the hash
 * table double lock.
 *
 * Error unwinding is ordered: a failure after a successful mknod unlinks
 * the freshly created node.  -EEXIST from mknod is reported to userspace
 * as -EADDRINUSE.
 */
static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	/* New node inherits the socket inode's mode filtered by umask. */
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	/* Rehash by the new inode and record the path while locked. */
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}
1276 
/* Bind @sk to an abstract-namespace name.
 *
 * Serializes with other binders via u->bindlock, checks for a name
 * collision and publishes the address under the hash table double lock.
 * Returns -EINVAL if already bound, -EADDRINUSE if the name is taken.
 */
static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	/* Collision check and publish must happen in one locked section. */
	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}
1319 
1320 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1321 {
1322 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1323 	struct sock *sk = sock->sk;
1324 	int err;
1325 
1326 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1327 	    sunaddr->sun_family == AF_UNIX)
1328 		return unix_autobind(sk);
1329 
1330 	err = unix_validate_addr(sunaddr, addr_len);
1331 	if (err)
1332 		return err;
1333 
1334 	if (sunaddr->sun_path[0])
1335 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1336 	else
1337 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1338 
1339 	return err;
1340 }
1341 
/* Lock two socks' state locks without risking ABBA deadlock.
 *
 * The lock order is fixed by pointer comparison: the lower-addressed sock
 * is locked first, the second is taken with the nested lockdep class.
 * When sk2 is NULL or the same sock as sk1, only sk1 is locked —
 * unix_state_double_unlock() mirrors this.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}
1356 
/* Release the locks taken by unix_state_double_lock(); unlock order does
 * not matter, but sk2 is only unlocked when it is a distinct sock.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (sk2 && sk2 != sk1)
		unix_state_unlock(sk2);
}
1366 
/* connect(2) for SOCK_DGRAM/SOCK_SEQPACKET sockets.
 *
 * Connecting to a real address pins @other as the peer; connecting with
 * AF_UNSPEC (per 1003.1g) dissolves an existing association.  Both socks
 * are locked together with unix_state_double_lock() so the peer change
 * and state update are atomic with respect to senders.
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		/* Credential passing needs a bound local address. */
		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !unix_sk(sk)->addr) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			sk->sk_state = TCP_CLOSE;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		/* Notify the old peer its writer list changed. */
		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
1454 
/* Sleep until the peer's receive queue may have drained, the peer dies,
 * or @timeo expires.
 *
 * Entered with unix_state_lock(other) held; the lock is dropped before
 * sleeping (hence the __releases annotation) and NOT retaken.  The wait
 * entry is exclusive, so a wakeup from the peer rouses only one blocked
 * sender.  Returns the remaining timeout.
 */
static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	/* Only sleep if the queue is still full and the peer still alive;
	 * checked before unlocking so the decision is race-free.
	 */
	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
1476 
/* connect(2) for SOCK_STREAM/SOCK_SEQPACKET sockets.
 *
 * Allocates the server-side sock (@newsk) and an skb up front, finds the
 * listening peer, optionally waits for backlog space, then atomically
 * pairs sk<->newsk and queues the skb on the listener so accept() can
 * pick it up.  On any failure the pre-allocated resources are released.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;
	int st;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	/* Credential passing needs a bound local address. */
	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* Drops the peer lock; we must re-lookup afterwards. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	/* State changed while unlocked — restart from the lookup. */
	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1670 
1671 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1672 {
1673 	struct sock *ska = socka->sk, *skb = sockb->sk;
1674 
1675 	/* Join our sockets back to back */
1676 	sock_hold(ska);
1677 	sock_hold(skb);
1678 	unix_peer(ska) = skb;
1679 	unix_peer(skb) = ska;
1680 	init_peercred(ska);
1681 	init_peercred(skb);
1682 
1683 	ska->sk_state = TCP_ESTABLISHED;
1684 	skb->sk_state = TCP_ESTABLISHED;
1685 	socka->state  = SS_CONNECTED;
1686 	sockb->state  = SS_CONNECTED;
1687 	return 0;
1688 }
1689 
/* Copy the credential/security passing options (SO_PASSCRED,
 * SO_PASSPIDFD, SO_PASSSEC) from the listening socket to the socket
 * returned by accept().
 */
static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSPIDFD, &old->flags))
		set_bit(SOCK_PASSPIDFD, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}
1700 
/* accept(2) for SOCK_STREAM/SOCK_SEQPACKET listeners.
 *
 * Pending connections live as skbs on the listener's receive queue; the
 * skb's ->sk is the already-created server-side sock.  Dequeue one
 * (blocking unless O_NONBLOCK), wake any connectors waiting in
 * unix_wait_for_peer(), and graft the new sock onto @newsock.
 */
static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	/* A backlog slot freed up — wake one blocked connector. */
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}
1745 
1746 
/* getsockname(2)/getpeername(2) backend.
 *
 * With @peer set, reports the connected peer's address (-ENOTCONN when
 * there is none); otherwise the local address.  An unbound socket yields
 * just the AF_UNIX family with an empty path.  Returns the address
 * length on success.  smp_load_acquire() on ->addr pairs with the
 * smp_store_release() done when the address was published.
 */
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);
	}
	sock_put(sk);
out:
	return err;
}
1778 
/* Duplicate the skb's attached fd list into @scm for MSG_PEEK receives,
 * then synchronize with the unix garbage collector — see the long
 * comment below for why the empty lock/unlock pair is required.
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	/*
	 * Garbage collection of unix sockets starts by selecting a set of
	 * candidate sockets which have reference only from being in flight
	 * (total_refs == inflight_refs).  This condition is checked once during
	 * the candidate collection phase, and candidates are marked as such, so
	 * that non-candidates can later be ignored.  While inflight_refs is
	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
	 * is an instantaneous decision.
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operation that changes the file count through file descriptors
	 * (dup, close, sendmsg) does not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeuing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd.  The following lock/unlock
	 * pair is to ensure serialization with garbage collection.  It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate.  If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
	 */
	spin_lock(&unix_gc_lock);
	spin_unlock(&unix_gc_lock);
}
1825 
1826 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1827 {
1828 	int err = 0;
1829 
1830 	UNIXCB(skb).pid  = get_pid(scm->pid);
1831 	UNIXCB(skb).uid = scm->creds.uid;
1832 	UNIXCB(skb).gid = scm->creds.gid;
1833 	UNIXCB(skb).fp = NULL;
1834 	unix_get_secdata(scm, skb);
1835 	if (scm->fp && send_fds)
1836 		err = unix_attach_fds(scm, skb);
1837 
1838 	skb->destructor = unix_destruct_scm;
1839 	return err;
1840 }
1841 
1842 static bool unix_passcred_enabled(const struct socket *sock,
1843 				  const struct sock *other)
1844 {
1845 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1846 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1847 	       !other->sk_socket ||
1848 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1849 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1850 }
1851 
1852 /*
1853  * Some apps rely on write() giving SCM_CREDENTIALS
1854  * We include credentials if source or destination socket
1855  * asserted SOCK_PASSCRED.
1856  */
1857 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1858 			    const struct sock *other)
1859 {
1860 	if (UNIXCB(skb).pid)
1861 		return;
1862 	if (unix_passcred_enabled(sock, other)) {
1863 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1864 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1865 	}
1866 }
1867 
1868 static bool unix_skb_scm_eq(struct sk_buff *skb,
1869 			    struct scm_cookie *scm)
1870 {
1871 	return UNIXCB(skb).pid == scm->pid &&
1872 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1873 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1874 	       unix_secdata_eq(scm, skb);
1875 }
1876 
1877 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1878 {
1879 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1880 	struct unix_sock *u = unix_sk(sk);
1881 
1882 	if (unlikely(fp && fp->count))
1883 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1884 }
1885 
1886 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1887 {
1888 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1889 	struct unix_sock *u = unix_sk(sk);
1890 
1891 	if (unlikely(fp && fp->count))
1892 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1893 }
1894 
1895 /*
1896  *	Send AF_UNIX data.
1897  */
1898 
/* sendmsg(2) for SOCK_DGRAM (and, via the seqpacket ops, SOCK_SEQPACKET).
 *
 * Handles SCM credential/fd transfer, autobind on first send for
 * SO_PASSCRED/SO_PASSPIDFD, destination lookup by address or connected
 * peer, detection of a dying peer, and the receive-queue backpressure
 * protocol (block in unix_wait_for_peer() or, when non-blocking, arm
 * unix_dgram_peer_wake_me() under both socks' locks).  Returns @len on
 * success.
 */
static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;
	} else {
		/* No address given: must be connected. */
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	/* Large datagrams get a linear head of SKB_MAX_ALLOC bytes and
	 * the rest in page fragments.
	 */
	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
			 */
			unix_state_unlock(sk);
			err = -EPIPE;
		} else if (unix_peer(sk) == other) {
			/* Dead connected peer: dissolve the association. */
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			sk->sk_state = TCP_CLOSE;
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			/* Blocking send: sleep for queue space, then
			 * re-verify everything from the top.
			 */
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		/* Non-blocking: take both locks so arming the wakeup
		 * relay and rechecking the peer are atomic.
		 */
		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}
2110 
2111 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2112  * bytes, and a minimum of a full page.
2113  */
2114 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2115 
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/* Queue a single out-of-band byte on the peer's receive queue.
 *
 * The OOB skb is queued like normal data but an extra reference is taken
 * (skb_get()) and stored in the peer's ->oob_skb so the receiver can find
 * the urgent byte; a previously pending OOB skb's extra reference is
 * dropped.  SIGURG is delivered to the peer before it is woken.
 */
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
		     struct scm_cookie *scm, bool fds_sent)
{
	struct unix_sock *ousk = unix_sk(other);
	struct sk_buff *skb;
	int err = 0;

	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);

	if (!skb)
		return err;

	/* fds are attached at most once per sendmsg (!fds_sent). */
	err = unix_scm_to_skb(scm, skb, !fds_sent);
	if (err < 0) {
		kfree_skb(skb);
		return err;
	}
	skb_put(skb, 1);
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);

	if (err) {
		kfree_skb(skb);
		return err;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    (other->sk_shutdown & RCV_SHUTDOWN)) {
		unix_state_unlock(other);
		kfree_skb(skb);
		return -EPIPE;
	}

	maybe_add_creds(skb, sock, other);
	skb_get(skb);

	/* Drop the extra reference held for the previous OOB skb. */
	if (ousk->oob_skb)
		consume_skb(ousk->oob_skb);

	WRITE_ONCE(ousk->oob_skb, skb);

	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	sk_send_sigurg(other);
	unix_state_unlock(other);
	other->sk_data_ready(other);

	return err;
}
#endif
2168 
/* Stream (SOCK_STREAM) sendmsg: chop the payload into skbs sized to the
 * socket send buffer and queue them on the connected peer's receive
 * queue.  SCM rights/credentials are attached to the first skb only.
 * Returns the number of bytes sent, or a negative errno if nothing was
 * sent.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		/* The final byte goes out-of-band via queue_oob() below. */
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	if (msg->msg_namelen) {
		/* Connected stream sockets reject an explicit destination. */
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			skb = sock_alloc_send_pskb(sk, 0, 0,
						   msg->msg_flags & MSG_DONTWAIT,
						   &err, 0);
		} else {
			/* Keep two messages in the pipe so it schedules better */
			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

			/* allow fallback to order-0 allocations */
			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
						   msg->msg_flags & MSG_DONTWAIT, &err,
						   get_order(UNIX_SKB_FRAGS_SZ));
		}
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
						   sk->sk_allocation);
			if (err < 0) {
				kfree_skb(skb);
				goto out_err;
			}
			size = err;
			refcount_add(size, &sk->sk_wmem_alloc);
		} else {
			skb_put(skb, size - data_len);
			skb->data_len = data_len;
			skb->len = size;
			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
			if (err) {
				kfree_skb(skb);
				goto out_err;
			}
		}

		unix_state_lock(other);

		/* Peer died or shut down reading while we copied. */
		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		scm_stat_add(other, skb);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		err = queue_oob(sock, msg, other, &scm, fds_sent);
		if (err)
			goto out_err;
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	/* Raise SIGPIPE unless suppressed, but only if nothing went out. */
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	return sent ? : err;
}
2300 
2301 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2302 				  size_t len)
2303 {
2304 	int err;
2305 	struct sock *sk = sock->sk;
2306 
2307 	err = sock_error(sk);
2308 	if (err)
2309 		return err;
2310 
2311 	if (sk->sk_state != TCP_ESTABLISHED)
2312 		return -ENOTCONN;
2313 
2314 	if (msg->msg_namelen)
2315 		msg->msg_namelen = 0;
2316 
2317 	return unix_dgram_sendmsg(sock, msg, len);
2318 }
2319 
2320 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2321 				  size_t size, int flags)
2322 {
2323 	struct sock *sk = sock->sk;
2324 
2325 	if (sk->sk_state != TCP_ESTABLISHED)
2326 		return -ENOTCONN;
2327 
2328 	return unix_dgram_recvmsg(sock, msg, size, flags);
2329 }
2330 
2331 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2332 {
2333 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2334 
2335 	if (addr) {
2336 		msg->msg_namelen = addr->len;
2337 		memcpy(msg->msg_name, addr->name, addr->len);
2338 	}
2339 }
2340 
/* Receive one datagram on @sk.  Serialises readers via u->iolock,
 * supports MSG_PEEK with a peek offset, and transfers attached SCM
 * credentials/fds into @msg's control data.  Returns the number of
 * bytes copied (or the full message length when MSG_TRUNC is both
 * requested and set), or a negative errno.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			/* Keep iolock held until the skb is consumed below. */
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	/* We freed queue space: wake a peer blocked in dgram sendmsg. */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	/* Clamp the copy to the remaining message; flag truncation. */
	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
2447 
/* Datagram recvmsg entry point; if sk->sk_prot has been replaced
 * (e.g. by BPF/sockmap), defer to that proto's recvmsg instead.
 */
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock *sk = sock->sk;

#ifdef CONFIG_BPF_SYSCALL
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_dgram_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return __unix_dgram_recvmsg(sk, msg, size, flags);
}
2461 
2462 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2463 {
2464 	struct unix_sock *u = unix_sk(sk);
2465 	struct sk_buff *skb;
2466 	int err;
2467 
2468 	mutex_lock(&u->iolock);
2469 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2470 	mutex_unlock(&u->iolock);
2471 	if (!skb)
2472 		return err;
2473 
2474 	return recv_actor(sk, skb);
2475 }
2476 
2477 /*
2478  *	Sleep until more data has arrived. But check for races..
2479  */
2480 static long unix_stream_data_wait(struct sock *sk, long timeo,
2481 				  struct sk_buff *last, unsigned int last_len,
2482 				  bool freezable)
2483 {
2484 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2485 	struct sk_buff *tail;
2486 	DEFINE_WAIT(wait);
2487 
2488 	unix_state_lock(sk);
2489 
2490 	for (;;) {
2491 		prepare_to_wait(sk_sleep(sk), &wait, state);
2492 
2493 		tail = skb_peek_tail(&sk->sk_receive_queue);
2494 		if (tail != last ||
2495 		    (tail && tail->len != last_len) ||
2496 		    sk->sk_err ||
2497 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2498 		    signal_pending(current) ||
2499 		    !timeo)
2500 			break;
2501 
2502 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2503 		unix_state_unlock(sk);
2504 		timeo = schedule_timeout(timeo);
2505 		unix_state_lock(sk);
2506 
2507 		if (sock_flag(sk, SOCK_DEAD))
2508 			break;
2509 
2510 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2511 	}
2512 
2513 	finish_wait(sk_sleep(sk), &wait);
2514 	unix_state_unlock(sk);
2515 	return timeo;
2516 }
2517 
/* Number of not-yet-consumed payload bytes in a stream skb. */
static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}
2522 
/* Shared state for the stream receive paths (recvmsg, splice_read and
 * the OOB reader); recv_actor copies one chunk to its destination.
 */
struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;	/* recvmsg destination (NULL when splicing) */
	struct pipe_inode_info *pipe;	/* splice destination */
	size_t size;
	int flags;
	unsigned int splice_flags;
};
2533 
2534 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/* Read the pending out-of-band byte (recvmsg with MSG_OOB).  Fails
 * with -EINVAL if no OOB byte is pending or SO_OOBINLINE is set.  On a
 * non-peeking read the skb is detached from u->oob_skb and its byte is
 * marked consumed so the in-band reader will skip it.
 */
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int chunk = 1;
	struct sk_buff *oob_skb;

	mutex_lock(&u->iolock);
	unix_state_lock(sk);

	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
		unix_state_unlock(sk);
		mutex_unlock(&u->iolock);
		return -EINVAL;
	}

	oob_skb = u->oob_skb;

	if (!(state->flags & MSG_PEEK))
		WRITE_ONCE(u->oob_skb, NULL);
	else
		skb_get(oob_skb);	/* hold a ref across the peek copy */
	unix_state_unlock(sk);

	chunk = state->recv_actor(oob_skb, 0, chunk, state);

	if (!(state->flags & MSG_PEEK))
		UNIXCB(oob_skb).consumed += 1;

	/* Drops the peek ref taken above, or the u->oob_skb ref. */
	consume_skb(oob_skb);

	mutex_unlock(&u->iolock);

	if (chunk < 0)
		return -EFAULT;

	state->msg->msg_flags |= MSG_OOB;
	return 1;
}
2575 
/* Decide what the in-band stream reader should do when the skb it is
 * about to read may be (or contain) the OOB skb.  Returns the skb the
 * caller should read next, or NULL when it must re-peek the queue.
 * Called with unix_state_lock(sk) held.
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct unix_sock *u = unix_sk(sk);

	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
		/* Fully-consumed skb (e.g. an already-read OOB byte):
		 * unlink and drop it.
		 */
		skb_unlink(skb, &sk->sk_receive_queue);
		consume_skb(skb);
		skb = NULL;
	} else {
		if (skb == u->oob_skb) {
			if (copied) {
				/* Data already copied: stop at the OOB mark. */
				skb = NULL;
			} else if (sock_flag(sk, SOCK_URGINLINE)) {
				if (!(flags & MSG_PEEK)) {
					/* Inline read consumes OOB status. */
					WRITE_ONCE(u->oob_skb, NULL);
					consume_skb(skb);
				}
			} else if (!(flags & MSG_PEEK)) {
				/* Skip over the OOB skb for in-band reads. */
				skb_unlink(skb, &sk->sk_receive_queue);
				consume_skb(skb);
				skb = skb_peek(&sk->sk_receive_queue);
			}
		}
	}
	return skb;
}
2603 #endif
2604 
2605 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2606 {
2607 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2608 		return -ENOTCONN;
2609 
2610 	return unix_read_skb(sk, recv_actor);
2611 }
2612 
/* Core stream/seqpacket receive loop shared by recvmsg, splice and the
 * BPF path.  Walks the receive queue under u->iolock, feeding each
 * chunk to state->recv_actor and handling MSG_PEEK, peek offsets, SCM
 * credential boundaries and (optionally) OOB data.  Returns the number
 * of bytes copied, or a negative errno if nothing was copied.
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb) {
				unix_state_unlock(sk);
				if (copied)
					break;
				goto redo;
			}
		}
#endif
again:
		if (skb == NULL) {
			/* Queue empty: stop if we have enough, else wait. */
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		/* Burn the peek offset by skipping whole skbs. */
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* fds attached: deliver them with this chunk only. */
			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv_unix(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
2827 
2828 static int unix_stream_read_actor(struct sk_buff *skb,
2829 				  int skip, int chunk,
2830 				  struct unix_stream_read_state *state)
2831 {
2832 	int ret;
2833 
2834 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2835 				    state->msg, chunk);
2836 	return ret ?: chunk;
2837 }
2838 
2839 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2840 			  size_t size, int flags)
2841 {
2842 	struct unix_stream_read_state state = {
2843 		.recv_actor = unix_stream_read_actor,
2844 		.socket = sk->sk_socket,
2845 		.msg = msg,
2846 		.size = size,
2847 		.flags = flags
2848 	};
2849 
2850 	return unix_stream_read_generic(&state, true);
2851 }
2852 
/* Stream recvmsg entry point; if sk->sk_prot has been replaced
 * (e.g. by BPF/sockmap), defer to that proto's recvmsg instead.
 */
static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_stream_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return unix_stream_read_generic(&state, true);
}
2873 
2874 static int unix_stream_splice_actor(struct sk_buff *skb,
2875 				    int skip, int chunk,
2876 				    struct unix_stream_read_state *state)
2877 {
2878 	return skb_splice_bits(skb, state->socket->sk,
2879 			       UNIXCB(skb).consumed + skip,
2880 			       state->pipe, chunk, state->splice_flags);
2881 }
2882 
2883 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2884 				       struct pipe_inode_info *pipe,
2885 				       size_t size, unsigned int flags)
2886 {
2887 	struct unix_stream_read_state state = {
2888 		.recv_actor = unix_stream_splice_actor,
2889 		.socket = sock,
2890 		.pipe = pipe,
2891 		.size = size,
2892 		.splice_flags = flags,
2893 	};
2894 
2895 	if (unlikely(*ppos))
2896 		return -ESPIPE;
2897 
2898 	if (sock->file->f_flags & O_NONBLOCK ||
2899 	    flags & SPLICE_F_NONBLOCK)
2900 		state.flags = MSG_DONTWAIT;
2901 
2902 	return unix_stream_read_generic(&state, false);
2903 }
2904 
/* shutdown(2) for AF_UNIX.  Sets our own shutdown bits, then mirrors
 * them (swapped) onto the connected peer for connection-oriented
 * sockets so the peer observes EOF/EPIPE and gets woken.
 */
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;
		const struct proto *prot = READ_ONCE(other->sk_prot);

		/* Give a replaced sk_prot (e.g. via BPF) a chance to unhash. */
		if (prot->unhash)
			prot->unhash(other);
		/* Our RCV shutdown stops the peer sending, and vice versa. */
		if (mode&RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode&SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
2953 
2954 long unix_inq_len(struct sock *sk)
2955 {
2956 	struct sk_buff *skb;
2957 	long amount = 0;
2958 
2959 	if (sk->sk_state == TCP_LISTEN)
2960 		return -EINVAL;
2961 
2962 	spin_lock(&sk->sk_receive_queue.lock);
2963 	if (sk->sk_type == SOCK_STREAM ||
2964 	    sk->sk_type == SOCK_SEQPACKET) {
2965 		skb_queue_walk(&sk->sk_receive_queue, skb)
2966 			amount += unix_skb_len(skb);
2967 	} else {
2968 		skb = skb_peek(&sk->sk_receive_queue);
2969 		if (skb)
2970 			amount = skb->len;
2971 	}
2972 	spin_unlock(&sk->sk_receive_queue.lock);
2973 
2974 	return amount;
2975 }
2976 EXPORT_SYMBOL_GPL(unix_inq_len);
2977 
/* SIOCOUTQ: bytes we have sent that the receiver has not yet freed. */
long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2983 
2984 static int unix_open_file(struct sock *sk)
2985 {
2986 	struct path path;
2987 	struct file *f;
2988 	int fd;
2989 
2990 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2991 		return -EPERM;
2992 
2993 	if (!smp_load_acquire(&unix_sk(sk)->addr))
2994 		return -ENOENT;
2995 
2996 	path = unix_sk(sk)->path;
2997 	if (!path.dentry)
2998 		return -ENOENT;
2999 
3000 	path_get(&path);
3001 
3002 	fd = get_unused_fd_flags(O_CLOEXEC);
3003 	if (fd < 0)
3004 		goto out;
3005 
3006 	f = dentry_open(&path, O_PATH, current_cred());
3007 	if (IS_ERR(f)) {
3008 		put_unused_fd(fd);
3009 		fd = PTR_ERR(f);
3010 		goto out;
3011 	}
3012 
3013 	fd_install(fd, f);
3014 out:
3015 	path_put(&path);
3016 
3017 	return fd;
3018 }
3019 
3020 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3021 {
3022 	struct sock *sk = sock->sk;
3023 	long amount = 0;
3024 	int err;
3025 
3026 	switch (cmd) {
3027 	case SIOCOUTQ:
3028 		amount = unix_outq_len(sk);
3029 		err = put_user(amount, (int __user *)arg);
3030 		break;
3031 	case SIOCINQ:
3032 		amount = unix_inq_len(sk);
3033 		if (amount < 0)
3034 			err = amount;
3035 		else
3036 			err = put_user(amount, (int __user *)arg);
3037 		break;
3038 	case SIOCUNIXFILE:
3039 		err = unix_open_file(sk);
3040 		break;
3041 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3042 	case SIOCATMARK:
3043 		{
3044 			struct sk_buff *skb;
3045 			int answ = 0;
3046 
3047 			skb = skb_peek(&sk->sk_receive_queue);
3048 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3049 				answ = 1;
3050 			err = put_user(answ, (int __user *)arg);
3051 		}
3052 		break;
3053 #endif
3054 	default:
3055 		err = -ENOIOCTLCMD;
3056 		break;
3057 	}
3058 	return err;
3059 }
3060 
3061 #ifdef CONFIG_COMPAT
/* 32-bit compat ioctl entry: translate the user pointer and reuse the
 * native handler.
 */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
3066 #endif
3067 
/* poll() for stream/seqpacket sockets.  Runs lockless; fields written
 * under locks elsewhere are sampled with READ_ONCE.
 */
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	/* Sample once so all tests below see the same value. */
	shutdown = READ_ONCE(sk->sk_shutdown);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err))
		mask |= EPOLLERR;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	/* A pending out-of-band byte shows up as EPOLLPRI. */
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}
3110 
/* poll() for datagram/seqpacket sockets.  Unlike unix_poll(),
 * writability also depends on the connected peer: a full peer receive
 * queue makes us unwritable until the peer drains it.
 */
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				    poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		/* Peer queue full: unix_dgram_peer_wake_me() also hooks us
		 * onto the peer's wake list so we get notified on drain.
		 */
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full_lockless(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
3173 
3174 #ifdef CONFIG_PROC_FS
3175 
/* A /proc iterator position (*pos) packs a hash-bucket index in the
 * high bits and a 1-based offset within that bucket in the low
 * BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3181 
3182 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3183 {
3184 	unsigned long offset = get_offset(*pos);
3185 	unsigned long bucket = get_bucket(*pos);
3186 	unsigned long count = 0;
3187 	struct sock *sk;
3188 
3189 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3190 	     sk; sk = sk_next(sk)) {
3191 		if (++count == offset)
3192 			break;
3193 	}
3194 
3195 	return sk;
3196 }
3197 
/* Find the first socket at or after *pos.  On success the socket's
 * bucket spinlock is left held for the caller (released later by
 * unix_get_next() or unix_seq_stop()); on NULL no lock is held.
 */
static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);
	struct net *net = seq_file_net(seq);
	struct sock *sk;

	while (bucket < UNIX_HASH_SIZE) {
		spin_lock(&net->unx.table.locks[bucket]);

		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

		spin_unlock(&net->unx.table.locks[bucket]);

		/* Bucket exhausted: restart at offset 1 of the next one. */
		*pos = set_bucket_offset(++bucket, 1);
	}

	return NULL;
}
3218 
/* Advance within the current bucket; when it is exhausted, drop its
 * lock and continue from the first socket of the following bucket.
 */
static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
				  loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);

	sk = sk_next(sk);
	if (sk)
		return sk;


	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);

	*pos = set_bucket_offset(++bucket, 1);

	return unix_get_first(seq, pos);
}
3235 
3236 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3237 {
3238 	if (!*pos)
3239 		return SEQ_START_TOKEN;
3240 
3241 	return unix_get_first(seq, pos);
3242 }
3243 
3244 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3245 {
3246 	++*pos;
3247 
3248 	if (v == SEQ_START_TOKEN)
3249 		return unix_get_first(seq, pos);
3250 
3251 	return unix_get_next(seq, v, pos);
3252 }
3253 
/* seq_file stop: drop the bucket lock still held from start/next.
 * v is NULL when iteration already ran past the last socket.
 */
static void unix_seq_stop(struct seq_file *seq, void *v)
{
	struct sock *sk = v;

	if (sk)
		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
}
3261 
/* /proc/net/unix: emit the header row or one line per socket. */
static int unix_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	/* under a hash table lock here */
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len -
				offsetof(struct sockaddr_un, sun_path);
			/* Abstract addresses start with a NUL byte, printed
			 * as '@'; embedded NULs are printed as '@' too.
			 */
			if (u->addr->name->sun_path[0]) {
				len--;
			} else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}
3307 
/* seq_file hooks backing /proc/net/unix. */
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
3314 
3315 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/* Private state for the BPF unix-socket iterator: sockets are pinned
 * into @batch one hash bucket at a time so the BPF program can run
 * without the bucket lock held.
 */
struct bpf_unix_iter_state {
	struct seq_net_private p;
	unsigned int cur_sk;	/* next batch entry to hand out */
	unsigned int end_sk;	/* number of sockets in the batch */
	unsigned int max_sk;	/* capacity of @batch */
	struct sock **batch;	/* sockets held via sock_hold() */
	bool st_bucket_done;	/* current bucket fully batched */
};
3324 
/* Context handed to BPF iterator programs for each unix socket. */
struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	uid_t uid __aligned(8);
};
3330 
3331 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3332 			      struct unix_sock *unix_sk, uid_t uid)
3333 {
3334 	struct bpf_iter__unix ctx;
3335 
3336 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3337 	ctx.meta = meta;
3338 	ctx.unix_sk = unix_sk;
3339 	ctx.uid = uid;
3340 	return bpf_iter_run_prog(prog, &ctx);
3341 }
3342 
3343 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3344 
3345 {
3346 	struct bpf_unix_iter_state *iter = seq->private;
3347 	unsigned int expected = 1;
3348 	struct sock *sk;
3349 
3350 	sock_hold(start_sk);
3351 	iter->batch[iter->end_sk++] = start_sk;
3352 
3353 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3354 		if (iter->end_sk < iter->max_sk) {
3355 			sock_hold(sk);
3356 			iter->batch[iter->end_sk++] = sk;
3357 		}
3358 
3359 		expected++;
3360 	}
3361 
3362 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3363 
3364 	return expected;
3365 }
3366 
3367 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3368 {
3369 	while (iter->cur_sk < iter->end_sk)
3370 		sock_put(iter->batch[iter->cur_sk++]);
3371 }
3372 
3373 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3374 				       unsigned int new_batch_sz)
3375 {
3376 	struct sock **new_batch;
3377 
3378 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3379 			     GFP_USER | __GFP_NOWARN);
3380 	if (!new_batch)
3381 		return -ENOMEM;
3382 
3383 	bpf_iter_unix_put_batch(iter);
3384 	kvfree(iter->batch);
3385 	iter->batch = new_batch;
3386 	iter->max_sk = new_batch_sz;
3387 
3388 	return 0;
3389 }
3390 
/* Fill the iterator batch starting at *pos.
 *
 * If the previous bucket was fully consumed, advance *pos to the next
 * bucket first.  When a bucket holds more sockets than the batch can
 * take, the batch is resized once (to 3/2 of the observed count) and
 * the bucket is re-read; if it grew again meanwhile, a partial batch
 * is returned rather than looping indefinitely.
 *
 * Returns the first socket of the batch, or NULL when iteration is done.
 */
static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
					loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	if (iter->st_bucket_done)
		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;

	sk = unix_get_first(seq, pos);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_unix_hold_batch(seq, sk);

	/* Whole bucket captured: safe to skip to the next bucket once
	 * this batch is exhausted.
	 */
	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	/* Batch was too small: grow once and retry this bucket. */
	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
3425 
3426 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3427 {
3428 	if (!*pos)
3429 		return SEQ_START_TOKEN;
3430 
3431 	/* bpf iter does not support lseek, so it always
3432 	 * continue from where it was stop()-ped.
3433 	 */
3434 	return bpf_iter_unix_batch(seq, pos);
3435 }
3436 
3437 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3438 {
3439 	struct bpf_unix_iter_state *iter = seq->private;
3440 	struct sock *sk;
3441 
3442 	/* Whenever seq_next() is called, the iter->cur_sk is
3443 	 * done with seq_show(), so advance to the next sk in
3444 	 * the batch.
3445 	 */
3446 	if (iter->cur_sk < iter->end_sk)
3447 		sock_put(iter->batch[iter->cur_sk++]);
3448 
3449 	++*pos;
3450 
3451 	if (iter->cur_sk < iter->end_sk)
3452 		sk = iter->batch[iter->cur_sk];
3453 	else
3454 		sk = bpf_iter_unix_batch(seq, pos);
3455 
3456 	return sk;
3457 }
3458 
/* seq_file ->show(): run the attached BPF program on one socket.
 *
 * The fast socket lock keeps the socket stable while the program runs;
 * a socket that was unhashed after batching is skipped rather than
 * reported.
 */
static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	bool slow;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	slow = lock_sock_fast(sk);

	/* Raced with close()/unhash after the batch was built. */
	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	/* Map the owner to the reader's user namespace. */
	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = unix_prog_seq_show(prog, &meta, v, uid);
unlock:
	unlock_sock_fast(sk, slow);
	return ret;
}
3486 
3487 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3488 {
3489 	struct bpf_unix_iter_state *iter = seq->private;
3490 	struct bpf_iter_meta meta;
3491 	struct bpf_prog *prog;
3492 
3493 	if (!v) {
3494 		meta.seq = seq;
3495 		prog = bpf_iter_get_info(&meta, true);
3496 		if (prog)
3497 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3498 	}
3499 
3500 	if (iter->cur_sk < iter->end_sk)
3501 		bpf_iter_unix_put_batch(iter);
3502 }
3503 
/* seq_file operations used by the bpf_iter "unix" target. */
static const struct seq_operations bpf_iter_unix_seq_ops = {
	.start	= bpf_iter_unix_seq_start,
	.next	= bpf_iter_unix_seq_next,
	.stop	= bpf_iter_unix_seq_stop,
	.show	= bpf_iter_unix_seq_show,
};
3510 #endif
3511 #endif
3512 
/* socket(AF_UNIX, ...) dispatch: registered via sock_register(). */
static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};
3518 
3519 
/* Per-network-namespace setup: sysctl knobs, /proc/net/unix, and the
 * per-netns socket hash table (one spinlock per bucket).
 */
static int __net_init unix_net_init(struct net *net)
{
	int i;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			     sizeof(struct seq_net_private)))
		goto err_sysctl;
#endif

	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
					      sizeof(spinlock_t), GFP_KERNEL);
	if (!net->unx.table.locks)
		goto err_proc;

	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
						sizeof(struct hlist_head),
						GFP_KERNEL);
	if (!net->unx.table.buckets)
		goto free_locks;

	for (i = 0; i < UNIX_HASH_SIZE; i++) {
		spin_lock_init(&net->unx.table.locks[i]);
		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
	}

	return 0;

	/* Unwind in reverse order of setup.  The proc/sysctl labels sit
	 * inside #ifdef so both configurations fall through correctly.
	 */
free_locks:
	kvfree(net->unx.table.locks);
err_proc:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("unix", net->proc_net);
err_sysctl:
#endif
	unix_sysctl_unregister(net);
out:
	/* NOTE(review): every failure is reported as -ENOMEM, even a
	 * unix_sysctl_register() failure — presumably those are
	 * allocation failures too; confirm before relying on the errno.
	 */
	return -ENOMEM;
}
3563 
/* Per-network-namespace teardown: free the hash table and unregister
 * the sysctl and /proc entries created by unix_net_init().
 */
static void __net_exit unix_net_exit(struct net *net)
{
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}
3571 
/* Registered with register_pernet_subsys() in af_unix_init(). */
static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
3576 
3577 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declares the bpf_iter__unix target hook; the argument list mirrors
 * struct bpf_iter__unix above.
 */
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

/* Initial capacity of the iterator's socket batch; grown on demand. */
#define INIT_BATCH_SZ 16
3582 
3583 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3584 {
3585 	struct bpf_unix_iter_state *iter = priv_data;
3586 	int err;
3587 
3588 	err = bpf_iter_init_seq_net(priv_data, aux);
3589 	if (err)
3590 		return err;
3591 
3592 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3593 	if (err) {
3594 		bpf_iter_fini_seq_net(priv_data);
3595 		return err;
3596 	}
3597 
3598 	return 0;
3599 }
3600 
3601 static void bpf_iter_fini_unix(void *priv_data)
3602 {
3603 	struct bpf_unix_iter_state *iter = priv_data;
3604 
3605 	bpf_iter_fini_seq_net(priv_data);
3606 	kvfree(iter->batch);
3607 }
3608 
/* Glue between the bpf_iter core and the seq_file machinery above. */
static const struct bpf_iter_seq_info unix_seq_info = {
	.seq_ops		= &bpf_iter_unix_seq_ops,
	.init_seq_private	= bpf_iter_init_unix,
	.fini_seq_private	= bpf_iter_fini_unix,
	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
};
3615 
3616 static const struct bpf_func_proto *
3617 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3618 			     const struct bpf_prog *prog)
3619 {
3620 	switch (func_id) {
3621 	case BPF_FUNC_setsockopt:
3622 		return &bpf_sk_setsockopt_proto;
3623 	case BPF_FUNC_getsockopt:
3624 		return &bpf_sk_getsockopt_proto;
3625 	default:
3626 		return NULL;
3627 	}
3628 }
3629 
/* Registration descriptor for the "unix" bpf_iter target.  The unix_sk
 * ctx argument's BTF id is filled in at boot by bpf_iter_register().
 */
static struct bpf_iter_reg unix_reg_info = {
	.target			= "unix",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__unix, unix_sk),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto         = bpf_iter_unix_get_func_proto,
	.seq_info		= &unix_seq_info,
};
3640 
3641 static void __init bpf_iter_register(void)
3642 {
3643 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3644 	if (bpf_iter_reg_target(&unix_reg_info))
3645 		pr_warn("Warning: could not register bpf iterator unix\n");
3646 }
3647 #endif
3648 
3649 static int __init af_unix_init(void)
3650 {
3651 	int i, rc = -1;
3652 
3653 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3654 
3655 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3656 		spin_lock_init(&bsd_socket_locks[i]);
3657 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3658 	}
3659 
3660 	rc = proto_register(&unix_dgram_proto, 1);
3661 	if (rc != 0) {
3662 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3663 		goto out;
3664 	}
3665 
3666 	rc = proto_register(&unix_stream_proto, 1);
3667 	if (rc != 0) {
3668 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3669 		proto_unregister(&unix_dgram_proto);
3670 		goto out;
3671 	}
3672 
3673 	sock_register(&unix_family_ops);
3674 	register_pernet_subsys(&unix_net_ops);
3675 	unix_bpf_build_proto();
3676 
3677 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3678 	bpf_iter_register();
3679 #endif
3680 
3681 out:
3682 	return rc;
3683 }
3684 
/* Module unload: unregister the PF_UNIX family, both proto caches, and
 * the per-netns operations set up in af_unix_init().
 */
static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_dgram_proto);
	proto_unregister(&unix_stream_proto);
	unregister_pernet_subsys(&unix_net_ops);
}
3692 
3693 /* Earlier than device_initcall() so that other drivers invoking
3694    request_module() don't end up in a loop when modprobe tries
3695    to use a UNIX socket. But later than subsys_initcall() because
3696    we depend on stuff initialised there */
3697 fs_initcall(af_unix_init);
3698 module_exit(af_unix_exit);
3699 
3700 MODULE_LICENSE("GPL");
3701 MODULE_ALIAS_NETPROTO(PF_UNIX);
3702