xref: /openbmc/linux/net/unix/af_unix.c (revision da097dcc)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid a huge number
34  *					of socks hashed (this is for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 
120 #include "scm.h"
121 
/* Count of AF_UNIX socks: incremented in unix_create1(), decremented on its
 * error path and in unix_sock_destructor().
 */
static atomic_long_t unix_nr_socks;
/* Table of sockets that have a filesystem path, plus one spinlock per
 * bucket.  Buckets are selected by sk->sk_hash on insert/remove and by
 * unix_bsd_hash() of the inode on lookup.
 */
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
125 
126 /* SMP locking strategy:
127  *    hash table is protected with spinlock.
128  *    each socket state is protected by separate spinlock.
129  */
130 
131 static unsigned int unix_unbound_hash(struct sock *sk)
132 {
133 	unsigned long hash = (unsigned long)sk;
134 
135 	hash ^= hash >> 16;
136 	hash ^= hash >> 8;
137 	hash ^= sk->sk_type;
138 
139 	return hash & UNIX_HASH_MOD;
140 }
141 
142 static unsigned int unix_bsd_hash(struct inode *i)
143 {
144 	return i->i_ino & UNIX_HASH_MOD;
145 }
146 
147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
148 				       int addr_len, int type)
149 {
150 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
151 	unsigned int hash;
152 
153 	hash = (__force unsigned int)csum_fold(csum);
154 	hash ^= hash >> 8;
155 	hash ^= type;
156 
157 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
158 }
159 
160 static void unix_table_double_lock(struct net *net,
161 				   unsigned int hash1, unsigned int hash2)
162 {
163 	if (hash1 == hash2) {
164 		spin_lock(&net->unx.table.locks[hash1]);
165 		return;
166 	}
167 
168 	if (hash1 > hash2)
169 		swap(hash1, hash2);
170 
171 	spin_lock(&net->unx.table.locks[hash1]);
172 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
173 }
174 
175 static void unix_table_double_unlock(struct net *net,
176 				     unsigned int hash1, unsigned int hash2)
177 {
178 	if (hash1 == hash2) {
179 		spin_unlock(&net->unx.table.locks[hash1]);
180 		return;
181 	}
182 
183 	spin_unlock(&net->unx.table.locks[hash1]);
184 	spin_unlock(&net->unx.table.locks[hash2]);
185 }
186 
187 #ifdef CONFIG_SECURITY_NETWORK
188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
189 {
190 	UNIXCB(skb).secid = scm->secid;
191 }
192 
193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
194 {
195 	scm->secid = UNIXCB(skb).secid;
196 }
197 
198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
199 {
200 	return (scm->secid == UNIXCB(skb).secid);
201 }
202 #else
203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
204 { }
205 
206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
207 { }
208 
209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
210 {
211 	return true;
212 }
213 #endif /* CONFIG_SECURITY_NETWORK */
214 
/* Non-zero when the peer recorded on @osk is @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return sk == unix_peer(osk);
}
219 
/* Non-zero when @sk is allowed to send to @osk: either @osk has no peer
 * at all, or its peer is @sk.
 */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	if (!unix_peer(osk))
		return 1;

	return unix_our_peer(sk, osk);
}
224 
225 static inline int unix_recvq_full_lockless(const struct sock *sk)
226 {
227 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
228 }
229 
230 struct sock *unix_peer_get(struct sock *s)
231 {
232 	struct sock *peer;
233 
234 	unix_state_lock(s);
235 	peer = unix_peer(s);
236 	if (peer)
237 		sock_hold(peer);
238 	unix_state_unlock(s);
239 	return peer;
240 }
241 EXPORT_SYMBOL_GPL(unix_peer_get);
242 
243 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
244 					     int addr_len)
245 {
246 	struct unix_address *addr;
247 
248 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
249 	if (!addr)
250 		return NULL;
251 
252 	refcount_set(&addr->refcnt, 1);
253 	addr->len = addr_len;
254 	memcpy(addr->name, sunaddr, addr_len);
255 
256 	return addr;
257 }
258 
259 static inline void unix_release_addr(struct unix_address *addr)
260 {
261 	if (refcount_dec_and_test(&addr->refcnt))
262 		kfree(addr);
263 }
264 
/*
 *	Basic sanity check of a unix socket address:
 *		- the length must extend past the sun_path offset, i.e. the
 *		  name must not be empty;
 *		- the length must not exceed sizeof(struct sockaddr_un);
 *		- the family must be AF_UNIX.
 *
 *	Returns 0 when the address passes, -EINVAL otherwise.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path))
		return -EINVAL;

	if (addr_len > sizeof(*sunaddr))
		return -EINVAL;

	return sunaddr->sun_family == AF_UNIX ? 0 : -EINVAL;
}
283 
284 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
285 {
286 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
287 	short offset = offsetof(struct sockaddr_storage, __data);
288 
289 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
290 
291 	/* This may look like an off by one error but it is a bit more
292 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
293 	 * sun_path[108] doesn't as such exist.  However in kernel space
294 	 * we are guaranteed that it is a valid memory location in our
295 	 * kernel address buffer because syscall functions always pass
296 	 * a pointer of struct sockaddr_storage which has a bigger buffer
297 	 * than 108.  Also, we must terminate sun_path for strlen() in
298 	 * getname_kernel().
299 	 */
300 	addr->__data[addr_len - offset] = 0;
301 
302 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
303 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
304 	 * know the actual buffer.
305 	 */
306 	return strlen(addr->__data) + offset + 1;
307 }
308 
/* Unlink @sk from its hash chain via sk_del_node_init().  No lock is
 * taken here; the locked wrapper is unix_remove_socket().
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
313 
314 static void __unix_insert_socket(struct net *net, struct sock *sk)
315 {
316 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
317 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
318 }
319 
/* Move @sk to the bucket for @hash and publish @addr as its address.
 *
 * No lock is taken here.
 * NOTE(review): callers presumably hold the table locks for both the old
 * and the new bucket (see unix_table_double_lock()) - confirm.
 */
static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	/* Unlink from the old bucket while sk->sk_hash still names it. */
	__unix_remove_socket(sk);
	/* Release store of the address pointer; unix_listen() reads it with
	 * READ_ONCE().
	 */
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}
329 
330 static void unix_remove_socket(struct net *net, struct sock *sk)
331 {
332 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
333 	__unix_remove_socket(sk);
334 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
335 }
336 
337 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
338 {
339 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
340 	__unix_insert_socket(net, sk);
341 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
342 }
343 
344 static void unix_insert_bsd_socket(struct sock *sk)
345 {
346 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
347 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
348 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
349 }
350 
351 static void unix_remove_bsd_socket(struct sock *sk)
352 {
353 	if (!hlist_unhashed(&sk->sk_bind_node)) {
354 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
355 		__sk_del_bind_node(sk);
356 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
357 
358 		sk_node_init(&sk->sk_bind_node);
359 	}
360 }
361 
362 static struct sock *__unix_find_socket_byname(struct net *net,
363 					      struct sockaddr_un *sunname,
364 					      int len, unsigned int hash)
365 {
366 	struct sock *s;
367 
368 	sk_for_each(s, &net->unx.table.buckets[hash]) {
369 		struct unix_sock *u = unix_sk(s);
370 
371 		if (u->addr->len == len &&
372 		    !memcmp(u->addr->name, sunname, len))
373 			return s;
374 	}
375 	return NULL;
376 }
377 
378 static inline struct sock *unix_find_socket_byname(struct net *net,
379 						   struct sockaddr_un *sunname,
380 						   int len, unsigned int hash)
381 {
382 	struct sock *s;
383 
384 	spin_lock(&net->unx.table.locks[hash]);
385 	s = __unix_find_socket_byname(net, sunname, len, hash);
386 	if (s)
387 		sock_hold(s);
388 	spin_unlock(&net->unx.table.locks[hash]);
389 	return s;
390 }
391 
392 static struct sock *unix_find_socket_byinode(struct inode *i)
393 {
394 	unsigned int hash = unix_bsd_hash(i);
395 	struct sock *s;
396 
397 	spin_lock(&bsd_socket_locks[hash]);
398 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
399 		struct dentry *dentry = unix_sk(s)->path.dentry;
400 
401 		if (dentry && d_backing_inode(dentry) == i) {
402 			sock_hold(s);
403 			spin_unlock(&bsd_socket_locks[hash]);
404 			return s;
405 		}
406 	}
407 	spin_unlock(&bsd_socket_locks[hash]);
408 	return NULL;
409 }
410 
411 /* Support code for asymmetrically connected dgram sockets
412  *
413  * If a datagram socket is connected to a socket not itself connected
414  * to the first socket (eg, /dev/log), clients may only enqueue more
415  * messages if the present receive queue of the server socket is not
416  * "too large". This means there's a second writeability condition
417  * poll and sendmsg need to test. The dgram recv code will do a wake
418  * up on the peer_wait wait queue of a socket upon reception of a
419  * datagram which needs to be propagated to sleeping would-be writers
420  * since these might not have sent anything so far. This can't be
421  * accomplished via poll_wait because the lifetime of the server
422  * socket might be less than that of its clients if these break their
423  * association with it or if the server socket is closed while clients
424  * are still connected to it and there's no way to inform "a polling
425  * implementation" that it should let go of a certain wait queue
426  *
427  * In order to propagate a wake up, a wait_queue_entry_t of the client
428  * socket is enqueued on the peer_wait queue of the server socket
429  * whose wake function does a wake_up on the ordinary client socket
430  * wait queue. This connection is established whenever a write (or
431  * poll for write) hit the flow control condition and broken when the
432  * association to the server socket is dissolved or after a wake up
433  * was relayed.
434  */
435 
436 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
437 				      void *key)
438 {
439 	struct unix_sock *u;
440 	wait_queue_head_t *u_sleep;
441 
442 	u = container_of(q, struct unix_sock, peer_wake);
443 
444 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
445 			    q);
446 	u->peer_wake.private = NULL;
447 
448 	/* relaying can only happen while the wq still exists */
449 	u_sleep = sk_sleep(&u->sk);
450 	if (u_sleep)
451 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
452 
453 	return 0;
454 }
455 
456 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
457 {
458 	struct unix_sock *u, *u_other;
459 	int rc;
460 
461 	u = unix_sk(sk);
462 	u_other = unix_sk(other);
463 	rc = 0;
464 	spin_lock(&u_other->peer_wait.lock);
465 
466 	if (!u->peer_wake.private) {
467 		u->peer_wake.private = other;
468 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
469 
470 		rc = 1;
471 	}
472 
473 	spin_unlock(&u_other->peer_wait.lock);
474 	return rc;
475 }
476 
477 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
478 					    struct sock *other)
479 {
480 	struct unix_sock *u, *u_other;
481 
482 	u = unix_sk(sk);
483 	u_other = unix_sk(other);
484 	spin_lock(&u_other->peer_wait.lock);
485 
486 	if (u->peer_wake.private == other) {
487 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
488 		u->peer_wake.private = NULL;
489 	}
490 
491 	spin_unlock(&u_other->peer_wait.lock);
492 }
493 
494 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
495 						   struct sock *other)
496 {
497 	unix_dgram_peer_wake_disconnect(sk, other);
498 	wake_up_interruptible_poll(sk_sleep(sk),
499 				   EPOLLOUT |
500 				   EPOLLWRNORM |
501 				   EPOLLWRBAND);
502 }
503 
504 /* preconditions:
505  *	- unix_peer(sk) == other
506  *	- association is stable
507  */
508 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
509 {
510 	int connected;
511 
512 	connected = unix_dgram_peer_wake_connect(sk, other);
513 
514 	/* If other is SOCK_DEAD, we want to make sure we signal
515 	 * POLLOUT, such that a subsequent write() can get a
516 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
517 	 * to other and its full, we will hang waiting for POLLOUT.
518 	 */
519 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
520 		return 1;
521 
522 	if (connected)
523 		unix_dgram_peer_wake_disconnect(sk, other);
524 
525 	return 0;
526 }
527 
528 static int unix_writable(const struct sock *sk, unsigned char state)
529 {
530 	return state != TCP_LISTEN &&
531 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
532 }
533 
534 static void unix_write_space(struct sock *sk)
535 {
536 	struct socket_wq *wq;
537 
538 	rcu_read_lock();
539 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
540 		wq = rcu_dereference(sk->sk_wq);
541 		if (skwq_has_sleeper(wq))
542 			wake_up_interruptible_sync_poll(&wq->wait,
543 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
544 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
545 	}
546 	rcu_read_unlock();
547 }
548 
549 /* When dgram socket disconnects (or changes its peer), we clear its receive
550  * queue of packets arrived from previous peer. First, it allows to do
551  * flow control based only on wmem_alloc; second, sk connected to peer
552  * may receive messages only from that peer. */
553 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
554 {
555 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
556 		skb_queue_purge(&sk->sk_receive_queue);
557 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
558 
559 		/* If one link of bidirectional dgram pipe is disconnected,
560 		 * we signal error. Messages are lost. Do not make this,
561 		 * when peer was not connected to us.
562 		 */
563 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
564 			WRITE_ONCE(other->sk_err, ECONNRESET);
565 			sk_error_report(other);
566 		}
567 	}
568 }
569 
570 static void unix_sock_destructor(struct sock *sk)
571 {
572 	struct unix_sock *u = unix_sk(sk);
573 
574 	skb_queue_purge(&sk->sk_receive_queue);
575 
576 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
577 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
578 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
579 	if (!sock_flag(sk, SOCK_DEAD)) {
580 		pr_info("Attempt to release alive unix socket: %p\n", sk);
581 		return;
582 	}
583 
584 	if (u->addr)
585 		unix_release_addr(u->addr);
586 
587 	atomic_long_dec(&unix_nr_socks);
588 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
589 #ifdef UNIX_REFCNT_DEBUG
590 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
591 		atomic_long_read(&unix_nr_socks));
592 #endif
593 }
594 
/* Tear down a unix socket.
 *
 * @sk:      socket being released
 * @embrion: non-zero when called recursively for a socket that is still
 *           sitting in the receive queue of a listening socket being
 *           released; it forces ECONNRESET onto the peer
 *
 * The socket is unhashed, orphaned and moved to TCP_CLOSE, its peer (if
 * any) is notified, its receive queue is drained, and finally the
 * reference held on @sk is dropped.
 */
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	/* Make the socket unreachable through both hash tables first. */
	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	/* Take over the path; it is put further down, after the state lock
	 * has been released.
	 */
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	/* Remember the previous state: a listener needs extra work below. */
	state = sk->sk_state;
	WRITE_ONCE(sk->sk_state, TCP_CLOSE);

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			/* Unread data left behind, or an embryo socket, is
			 * reported to the peer as a connection reset.
			 */
			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* On a listener, the socket owning each queued skb is
		 * released as an embryo.
		 */
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	if (READ_ONCE(unix_tot_inflight))
		unix_gc();		/* Garbage collect fds */
}
677 
678 static void init_peercred(struct sock *sk)
679 {
680 	const struct cred *old_cred;
681 	struct pid *old_pid;
682 
683 	spin_lock(&sk->sk_peer_lock);
684 	old_pid = sk->sk_peer_pid;
685 	old_cred = sk->sk_peer_cred;
686 	sk->sk_peer_pid  = get_pid(task_tgid(current));
687 	sk->sk_peer_cred = get_current_cred();
688 	spin_unlock(&sk->sk_peer_lock);
689 
690 	put_pid(old_pid);
691 	put_cred(old_cred);
692 }
693 
694 static void copy_peercred(struct sock *sk, struct sock *peersk)
695 {
696 	const struct cred *old_cred;
697 	struct pid *old_pid;
698 
699 	if (sk < peersk) {
700 		spin_lock(&sk->sk_peer_lock);
701 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
702 	} else {
703 		spin_lock(&peersk->sk_peer_lock);
704 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
705 	}
706 	old_pid = sk->sk_peer_pid;
707 	old_cred = sk->sk_peer_cred;
708 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
709 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
710 
711 	spin_unlock(&sk->sk_peer_lock);
712 	spin_unlock(&peersk->sk_peer_lock);
713 
714 	put_pid(old_pid);
715 	put_cred(old_cred);
716 }
717 
718 static int unix_listen(struct socket *sock, int backlog)
719 {
720 	int err;
721 	struct sock *sk = sock->sk;
722 	struct unix_sock *u = unix_sk(sk);
723 
724 	err = -EOPNOTSUPP;
725 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
726 		goto out;	/* Only stream/seqpacket sockets accept */
727 	err = -EINVAL;
728 	if (!READ_ONCE(u->addr))
729 		goto out;	/* No listens on an unbound socket */
730 	unix_state_lock(sk);
731 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
732 		goto out_unlock;
733 	if (backlog > sk->sk_max_ack_backlog)
734 		wake_up_interruptible_all(&u->peer_wait);
735 	sk->sk_max_ack_backlog	= backlog;
736 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
737 
738 	/* set credentials so connect can copy them */
739 	init_peercred(sk);
740 	err = 0;
741 
742 out_unlock:
743 	unix_state_unlock(sk);
744 out:
745 	return err;
746 }
747 
748 static int unix_release(struct socket *);
749 static int unix_bind(struct socket *, struct sockaddr *, int);
750 static int unix_stream_connect(struct socket *, struct sockaddr *,
751 			       int addr_len, int flags);
752 static int unix_socketpair(struct socket *, struct socket *);
753 static int unix_accept(struct socket *, struct socket *, int, bool);
754 static int unix_getname(struct socket *, struct sockaddr *, int);
755 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
756 static __poll_t unix_dgram_poll(struct file *, struct socket *,
757 				    poll_table *);
758 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
759 #ifdef CONFIG_COMPAT
760 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
761 #endif
762 static int unix_shutdown(struct socket *, int);
763 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
764 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
765 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
766 				       struct pipe_inode_info *, size_t size,
767 				       unsigned int flags);
768 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
769 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
770 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
771 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
772 static int unix_dgram_connect(struct socket *, struct sockaddr *,
773 			      int, int);
774 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
775 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
776 				  int);
777 
778 static int unix_set_peek_off(struct sock *sk, int val)
779 {
780 	struct unix_sock *u = unix_sk(sk);
781 
782 	if (mutex_lock_interruptible(&u->iolock))
783 		return -EINTR;
784 
785 	WRITE_ONCE(sk->sk_peek_off, val);
786 	mutex_unlock(&u->iolock);
787 
788 	return 0;
789 }
790 
791 #ifdef CONFIG_PROC_FS
792 static int unix_count_nr_fds(struct sock *sk)
793 {
794 	struct sk_buff *skb;
795 	struct unix_sock *u;
796 	int nr_fds = 0;
797 
798 	spin_lock(&sk->sk_receive_queue.lock);
799 	skb = skb_peek(&sk->sk_receive_queue);
800 	while (skb) {
801 		u = unix_sk(skb->sk);
802 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
803 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
804 	}
805 	spin_unlock(&sk->sk_receive_queue.lock);
806 
807 	return nr_fds;
808 }
809 
810 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
811 {
812 	struct sock *sk = sock->sk;
813 	unsigned char s_state;
814 	struct unix_sock *u;
815 	int nr_fds = 0;
816 
817 	if (sk) {
818 		s_state = READ_ONCE(sk->sk_state);
819 		u = unix_sk(sk);
820 
821 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
822 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
823 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
824 		 */
825 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
826 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
827 		else if (s_state == TCP_LISTEN)
828 			nr_fds = unix_count_nr_fds(sk);
829 
830 		seq_printf(m, "scm_fds: %u\n", nr_fds);
831 	}
832 }
833 #else
834 #define unix_show_fdinfo NULL
835 #endif
836 
/* Socket operations installed by unix_create() for SOCK_STREAM sockets.
 * Of the three ops tables in this file, only this one provides
 * ->splice_read, and only this one uses unix_poll for ->poll.
 */
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
861 
/* Socket operations installed by unix_create() for SOCK_DGRAM sockets
 * (SOCK_RAW requests are turned into SOCK_DGRAM there).  accept() and
 * listen() are wired to the generic sock_no_* handlers.
 */
static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
885 
/* Socket operations installed by unix_create() for SOCK_SEQPACKET sockets.
 * Connection setup (connect/accept/listen) reuses the stream handlers,
 * polling reuses the dgram handler, and no ->read_skb is provided.
 */
static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
908 
/* Empty ->close() hook shared by unix_dgram_proto and unix_stream_proto;
 * unix_release() calls it through sk->sk_prot before releasing the sock.
 */
static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}
915 
/* Empty ->unhash() hook, referenced only by unix_stream_proto. */
static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}
922 
/* ->bpf_bypass_getsockopt hook: returns true only for the SOL_SOCKET
 * option SO_PEERPIDFD, false for every other level/option pair.
 */
static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	return level == SOL_SOCKET && optname == SO_PEERPIDFD;
}
936 
/* Protocol descriptor handed to sk_alloc() by unix_create1() for every
 * socket type other than SOCK_STREAM (dgram and seqpacket).
 */
struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};
947 
/* Protocol descriptor handed to sk_alloc() by unix_create1() for
 * SOCK_STREAM sockets.  Unlike unix_dgram_proto it also sets ->unhash.
 */
struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.unhash			= unix_unhash,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};
959 
/* Allocate and initialise a unix sock and hash it as an unbound socket.
 *
 * @net:  network namespace the socket belongs to
 * @sock: socket passed on to sock_init_data()
 * @kern: passed through to sk_alloc()
 * @type: SOCK_STREAM selects unix_stream_proto, anything else selects
 *        unix_dgram_proto
 *
 * Returns the new sock, ERR_PTR(-ENFILE) when the global socket count
 * exceeds twice get_max_files(), or ERR_PTR(-ENOMEM) when sk_alloc() fails.
 */
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	/* Count the socket first; both error paths below undo this. */
	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /* dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash		= unix_unbound_hash(sk);
	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	/* The backlog starts at the per-netns max_dgram_qlen sysctl value;
	 * unix_listen() overwrites it.
	 */
	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
	sk->sk_destruct		= unix_sock_destructor;
	u = unix_sk(sk);
	u->inflight = 0;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	/* Hash the socket only once it is fully initialised. */
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}
1010 
1011 static int unix_create(struct net *net, struct socket *sock, int protocol,
1012 		       int kern)
1013 {
1014 	struct sock *sk;
1015 
1016 	if (protocol && protocol != PF_UNIX)
1017 		return -EPROTONOSUPPORT;
1018 
1019 	sock->state = SS_UNCONNECTED;
1020 
1021 	switch (sock->type) {
1022 	case SOCK_STREAM:
1023 		sock->ops = &unix_stream_ops;
1024 		break;
1025 		/*
1026 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1027 		 *	nothing uses it.
1028 		 */
1029 	case SOCK_RAW:
1030 		sock->type = SOCK_DGRAM;
1031 		fallthrough;
1032 	case SOCK_DGRAM:
1033 		sock->ops = &unix_dgram_ops;
1034 		break;
1035 	case SOCK_SEQPACKET:
1036 		sock->ops = &unix_seqpacket_ops;
1037 		break;
1038 	default:
1039 		return -ESOCKTNOSUPPORT;
1040 	}
1041 
1042 	sk = unix_create1(net, sock, kern, sock->type);
1043 	if (IS_ERR(sk))
1044 		return PTR_ERR(sk);
1045 
1046 	return 0;
1047 }
1048 
1049 static int unix_release(struct socket *sock)
1050 {
1051 	struct sock *sk = sock->sk;
1052 
1053 	if (!sk)
1054 		return 0;
1055 
1056 	sk->sk_prot->close(sk, 0);
1057 	unix_release_sock(sk, 0);
1058 	sock->sk = NULL;
1059 
1060 	return 0;
1061 }
1062 
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * Returns the socket found via unix_find_socket_byinode() on success.
 * Otherwise returns an ERR_PTR: the error from kern_path() or
 * path_permission(), -ECONNREFUSED if the path is not a socket inode or
 * no socket is bound to it, or -EPROTOTYPE if the socket's type differs
 * from @type.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);

	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		return ERR_PTR(err);

	/* The caller needs write permission on the socket node. */
	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto put_path;

	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode)) {
		err = -ECONNREFUSED;
		goto put_path;
	}

	sk = unix_find_socket_byinode(inode);
	if (!sk) {
		err = -ECONNREFUSED;
		goto put_path;
	}

	if (sk->sk_type != type) {
		err = -EPROTOTYPE;
		sock_put(sk);
		goto put_path;
	}

	touch_atime(&path);
	path_put(&path);

	return sk;

put_path:
	path_put(&path);
	return ERR_PTR(err);
}
1106 
/*
 * Look up a socket by name in @net using the abstract-name hash.
 *
 * Returns the socket, or ERR_PTR(-ECONNREFUSED) if none matches.  If the
 * socket found has a dentry recorded in its path, its atime is touched.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	if (unix_sk(sk)->path.dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}
1125 
1126 static struct sock *unix_find_other(struct net *net,
1127 				    struct sockaddr_un *sunaddr,
1128 				    int addr_len, int type)
1129 {
1130 	struct sock *sk;
1131 
1132 	if (sunaddr->sun_path[0])
1133 		sk = unix_find_bsd(sunaddr, addr_len, type);
1134 	else
1135 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1136 
1137 	return sk;
1138 }
1139 
/*
 * Bind @sk to an automatically chosen abstract name: a leading NUL byte
 * (the address is zero-allocated) followed by five hex digits taken from a
 * 20-bit counter.
 *
 * Returns 0 on success or when @sk already has an address, the error from
 * mutex_lock_interruptible(), -ENOMEM if the address cannot be allocated,
 * or -ENOSPC once every candidate name has been tried and found in use.
 */
1140 static int unix_autobind(struct sock *sk)
1141 {
1142 	struct unix_sock *u = unix_sk(sk);
1143 	unsigned int new_hash, old_hash;
1144 	struct net *net = sock_net(sk);
1145 	struct unix_address *addr;
1146 	u32 lastnum, ordernum;
1147 	int err;
1148 
1149 	err = mutex_lock_interruptible(&u->bindlock);
1150 	if (err)
1151 		return err;
1152 
	/* Already bound: leave the address alone; err is still 0 here. */
1153 	if (u->addr)
1154 		goto out;
1155 
1156 	err = -ENOMEM;
1157 	addr = kzalloc(sizeof(*addr) +
1158 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1159 	if (!addr)
1160 		goto out;
1161 
	/* The name occupies 6 bytes of sun_path: one NUL plus five hex digits. */
1162 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1163 	addr->name->sun_family = AF_UNIX;
1164 	refcount_set(&addr->refcnt, 1);
1165 
1166 	old_hash = sk->sk_hash;
	/*
	 * Start at a random point of the 20-bit name space; lastnum marks the
	 * value at which the search has wrapped all the way around.
	 */
1167 	ordernum = get_random_u32();
1168 	lastnum = ordernum & 0xFFFFF;
1169 retry:
1170 	ordernum = (ordernum + 1) & 0xFFFFF;
1171 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1172 
1173 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1174 	unix_table_double_lock(net, old_hash, new_hash);
1175 
	/* Candidate name is taken: drop the table locks and try the next one. */
1176 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1177 		unix_table_double_unlock(net, old_hash, new_hash);
1178 
1179 		/* __unix_find_socket_byname() may take long time if many names
1180 		 * are already in use.
1181 		 */
1182 		cond_resched();
1183 
1184 		if (ordernum == lastnum) {
1185 			/* Give up if all names seems to be in use. */
1186 			err = -ENOSPC;
1187 			unix_release_addr(addr);
1188 			goto out;
1189 		}
1190 
1191 		goto retry;
1192 	}
1193 
	/* Name is free: install it while both hash table locks are still held. */
1194 	__unix_set_addr_hash(net, sk, addr, new_hash);
1195 	unix_table_double_unlock(net, old_hash, new_hash);
1196 	err = 0;
1197 
1198 out:	mutex_unlock(&u->bindlock);
1199 	return err;
1200 }
1201 
/*
 * Bind @sk to a filesystem pathname: create the socket node with
 * vfs_mknod(), then record the path and the address on the socket.
 *
 * Returns 0 on success, -ENOMEM if the address cannot be allocated,
 * -EINVAL if @sk already has an address, and otherwise the error from
 * kern_path_create(), security_path_mknod(), vfs_mknod() or
 * mutex_lock_interruptible().  -EEXIST is reported as -EADDRINUSE.
 */
1202 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1203 			 int addr_len)
1204 {
	/* Socket node mode: the socket inode's mode bits filtered by the umask. */
1205 	umode_t mode = S_IFSOCK |
1206 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1207 	struct unix_sock *u = unix_sk(sk);
1208 	unsigned int new_hash, old_hash;
1209 	struct net *net = sock_net(sk);
1210 	struct mnt_idmap *idmap;
1211 	struct unix_address *addr;
1212 	struct dentry *dentry;
1213 	struct path parent;
1214 	int err;
1215 
1216 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1217 	addr = unix_create_addr(sunaddr, addr_len);
1218 	if (!addr)
1219 		return -ENOMEM;
1220 
1221 	/*
1222 	 * Get the parent directory, calculate the hash for last
1223 	 * component.
1224 	 */
1225 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1226 	if (IS_ERR(dentry)) {
1227 		err = PTR_ERR(dentry);
1228 		goto out;
1229 	}
1230 
1231 	/*
1232 	 * All right, let's create it.
1233 	 */
1234 	idmap = mnt_idmap(parent.mnt);
1235 	err = security_path_mknod(&parent, dentry, mode, 0);
1236 	if (!err)
1237 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1238 	if (err)
1239 		goto out_path;
	/* From here on a failure must also unlink the node created above. */
1240 	err = mutex_lock_interruptible(&u->bindlock);
1241 	if (err)
1242 		goto out_unlink;
	/* Socket is already bound; out_unlock turns this into -EINVAL. */
1243 	if (u->addr)
1244 		goto out_unlock;
1245 
1246 	old_hash = sk->sk_hash;
1247 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1248 	unix_table_double_lock(net, old_hash, new_hash);
	/* The socket takes its own references on the mount and the dentry. */
1249 	u->path.mnt = mntget(parent.mnt);
1250 	u->path.dentry = dget(dentry);
1251 	__unix_set_addr_hash(net, sk, addr, new_hash);
1252 	unix_table_double_unlock(net, old_hash, new_hash);
1253 	unix_insert_bsd_socket(sk);
1254 	mutex_unlock(&u->bindlock);
1255 	done_path_create(&parent, dentry);
1256 	return 0;
1257 
1258 out_unlock:
1259 	mutex_unlock(&u->bindlock);
1260 	err = -EINVAL;
1261 out_unlink:
1262 	/* failed after successful mknod?  unlink what we'd created... */
1263 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1264 out_path:
1265 	done_path_create(&parent, dentry);
1266 out:
	/* On every failure path the address built above is released here. */
1267 	unix_release_addr(addr);
1268 	return err == -EEXIST ? -EADDRINUSE : err;
1269 }
1270 
1271 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1272 			      int addr_len)
1273 {
1274 	struct unix_sock *u = unix_sk(sk);
1275 	unsigned int new_hash, old_hash;
1276 	struct net *net = sock_net(sk);
1277 	struct unix_address *addr;
1278 	int err;
1279 
1280 	addr = unix_create_addr(sunaddr, addr_len);
1281 	if (!addr)
1282 		return -ENOMEM;
1283 
1284 	err = mutex_lock_interruptible(&u->bindlock);
1285 	if (err)
1286 		goto out;
1287 
1288 	if (u->addr) {
1289 		err = -EINVAL;
1290 		goto out_mutex;
1291 	}
1292 
1293 	old_hash = sk->sk_hash;
1294 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1295 	unix_table_double_lock(net, old_hash, new_hash);
1296 
1297 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1298 		goto out_spin;
1299 
1300 	__unix_set_addr_hash(net, sk, addr, new_hash);
1301 	unix_table_double_unlock(net, old_hash, new_hash);
1302 	mutex_unlock(&u->bindlock);
1303 	return 0;
1304 
1305 out_spin:
1306 	unix_table_double_unlock(net, old_hash, new_hash);
1307 	err = -EADDRINUSE;
1308 out_mutex:
1309 	mutex_unlock(&u->bindlock);
1310 out:
1311 	unix_release_addr(addr);
1312 	return err;
1313 }
1314 
1315 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1316 {
1317 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1318 	struct sock *sk = sock->sk;
1319 	int err;
1320 
1321 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1322 	    sunaddr->sun_family == AF_UNIX)
1323 		return unix_autobind(sk);
1324 
1325 	err = unix_validate_addr(sunaddr, addr_len);
1326 	if (err)
1327 		return err;
1328 
1329 	if (sunaddr->sun_path[0])
1330 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1331 	else
1332 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1333 
1334 	return err;
1335 }
1336 
1337 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1338 {
1339 	if (unlikely(sk1 == sk2) || !sk2) {
1340 		unix_state_lock(sk1);
1341 		return;
1342 	}
1343 	if (sk1 > sk2)
1344 		swap(sk1, sk2);
1345 
1346 	unix_state_lock(sk1);
1347 	unix_state_lock_nested(sk2, U_LOCK_SECOND);
1348 }
1349 
/*
 * Release the state locks of @sk1 and @sk2.  When @sk2 is NULL or the
 * same socket as @sk1, only @sk1 is unlocked.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (!sk2 || unlikely(sk1 == sk2))
		return;

	unix_state_unlock(sk2);
}
1359 
/*
 * Set, replace or (with an AF_UNSPEC address) clear the peer of @sock.
 *
 * Returns 0 on success; -EINVAL if @alen is too short to hold sa_family;
 * otherwise the error from unix_validate_addr(), unix_autobind() or
 * unix_find_other(), -EPERM when unix_may_send() refuses, or the error
 * from security_unix_may_send().
 */
1360 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1361 			      int alen, int flags)
1362 {
1363 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1364 	struct sock *sk = sock->sk;
1365 	struct sock *other;
1366 	int err;
1367 
1368 	err = -EINVAL;
1369 	if (alen < offsetofend(struct sockaddr, sa_family))
1370 		goto out;
1371 
1372 	if (addr->sa_family != AF_UNSPEC) {
1373 		err = unix_validate_addr(sunaddr, alen);
1374 		if (err)
1375 			goto out;
1376 
		/* Credential passing is enabled but no address is set: autobind. */
1377 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1378 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1379 		    !READ_ONCE(unix_sk(sk)->addr)) {
1380 			err = unix_autobind(sk);
1381 			if (err)
1382 				goto out;
1383 		}
1384 
1385 restart:
1386 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1387 		if (IS_ERR(other)) {
1388 			err = PTR_ERR(other);
1389 			goto out;
1390 		}
1391 
1392 		unix_state_double_lock(sk, other);
1393 
1394 		/* Apparently VFS overslept socket death. Retry. */
1395 		if (sock_flag(other, SOCK_DEAD)) {
1396 			unix_state_double_unlock(sk, other);
1397 			sock_put(other);
1398 			goto restart;
1399 		}
1400 
1401 		err = -EPERM;
1402 		if (!unix_may_send(sk, other))
1403 			goto out_unlock;
1404 
1405 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1406 		if (err)
1407 			goto out_unlock;
1408 
		/* Both ends are marked established while both locks are held. */
1409 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1410 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1411 	} else {
1412 		/*
1413 		 *	1003.1g breaking connected state with AF_UNSPEC
1414 		 */
1415 		other = NULL;
		/* With other == NULL this locks sk only. */
1416 		unix_state_double_lock(sk, other);
1417 	}
1418 
1419 	/*
1420 	 * If it was connected, reconnect.
1421 	 */
1422 	if (unix_peer(sk)) {
1423 		struct sock *old_peer = unix_peer(sk);
1424 
		/* Swap in the new peer (possibly NULL) before dropping the locks. */
1425 		unix_peer(sk) = other;
1426 		if (!other)
1427 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1428 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1429 
1430 		unix_state_double_unlock(sk, other);
1431 
1432 		if (other != old_peer) {
1433 			unix_dgram_disconnected(sk, old_peer);
1434 
			/* Old peer goes back to TCP_CLOSE unless it has a peer itself. */
1435 			unix_state_lock(old_peer);
1436 			if (!unix_peer(old_peer))
1437 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1438 			unix_state_unlock(old_peer);
1439 		}
1440 
		/* unix_peer(sk) no longer points at old_peer: drop a reference on it. */
1441 		sock_put(old_peer);
1442 	} else {
		/* The reference from unix_find_other() is not dropped on this path. */
1443 		unix_peer(sk) = other;
1444 		unix_state_double_unlock(sk, other);
1445 	}
1446 
1447 	return 0;
1448 
1449 out_unlock:
1450 	unix_state_double_unlock(sk, other);
1451 	sock_put(other);
1452 out:
1453 	return err;
1454 }
1455 
1456 static long unix_wait_for_peer(struct sock *other, long timeo)
1457 	__releases(&unix_sk(other)->lock)
1458 {
1459 	struct unix_sock *u = unix_sk(other);
1460 	int sched;
1461 	DEFINE_WAIT(wait);
1462 
1463 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1464 
1465 	sched = !sock_flag(other, SOCK_DEAD) &&
1466 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1467 		unix_recvq_full_lockless(other);
1468 
1469 	unix_state_unlock(other);
1470 
1471 	if (sched)
1472 		timeo = schedule_timeout(timeo);
1473 
1474 	finish_wait(&u->peer_wait, &wait);
1475 	return timeo;
1476 }
1477 
/*
 * Connect @sock to the listening socket named by @uaddr.
 *
 * A new sock (newsk) and a one-byte skb allocated against it are created
 * up front.  On success newsk becomes the peer of @sk and the skb is
 * queued on the listener's receive queue, where unix_accept() picks it up
 * and takes skb->sk as the accepted sock.
 *
 * Returns 0 on success, or a negative errno: the error from
 * unix_validate_addr(), unix_autobind(), unix_create1() or
 * unix_find_other(); -ENOMEM if the skb cannot be allocated;
 * -ECONNREFUSED if the target is not listening or has shut down
 * receiving; -EAGAIN if its queue is full and the send timeout is zero;
 * the sock_intr_errno() value if a signal is pending after waiting;
 * -EISCONN or -EINVAL depending on @sk's state; or the error from
 * security_unix_stream_connect().
 */
1478 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1479 			       int addr_len, int flags)
1480 {
1481 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1482 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1483 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1484 	struct net *net = sock_net(sk);
1485 	struct sk_buff *skb = NULL;
1486 	long timeo;
1487 	int err;
1488 
1489 	err = unix_validate_addr(sunaddr, addr_len);
1490 	if (err)
1491 		goto out;
1492 
	/* Credential passing is enabled but no address is set: autobind. */
1493 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1494 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1495 	    !READ_ONCE(u->addr)) {
1496 		err = unix_autobind(sk);
1497 		if (err)
1498 			goto out;
1499 	}
1500 
1501 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1502 
1503 	/* First of all allocate resources.
1504 	   If we will make it after state is locked,
1505 	   we will have to recheck all again in any case.
1506 	 */
1507 
1508 	/* create new sock for complete connection */
1509 	newsk = unix_create1(net, NULL, 0, sock->type);
1510 	if (IS_ERR(newsk)) {
1511 		err = PTR_ERR(newsk);
		/* Keep the cleanup at "out" from releasing an error pointer. */
1512 		newsk = NULL;
1513 		goto out;
1514 	}
1515 
1516 	err = -ENOMEM;
1517 
1518 	/* Allocate skb for sending to listening sock */
1519 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1520 	if (skb == NULL)
1521 		goto out;
1522 
1523 restart:
1524 	/*  Find listening sock. */
1525 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1526 	if (IS_ERR(other)) {
1527 		err = PTR_ERR(other);
1528 		other = NULL;
1529 		goto out;
1530 	}
1531 
1532 	/* Latch state of peer */
1533 	unix_state_lock(other);
1534 
1535 	/* Apparently VFS overslept socket death. Retry. */
1536 	if (sock_flag(other, SOCK_DEAD)) {
1537 		unix_state_unlock(other);
1538 		sock_put(other);
1539 		goto restart;
1540 	}
1541 
1542 	err = -ECONNREFUSED;
1543 	if (other->sk_state != TCP_LISTEN)
1544 		goto out_unlock;
1545 	if (other->sk_shutdown & RCV_SHUTDOWN)
1546 		goto out_unlock;
1547 
1548 	if (unix_recvq_full_lockless(other)) {
1549 		err = -EAGAIN;
1550 		if (!timeo)
1551 			goto out_unlock;
1552 
		/* unix_wait_for_peer() releases other's state lock. */
1553 		timeo = unix_wait_for_peer(other, timeo);
1554 
1555 		err = sock_intr_errno(timeo);
1556 		if (signal_pending(current))
1557 			goto out;
1558 		sock_put(other);
1559 		goto restart;
1560 	}
1561 
1562 	/* Latch our state.
1563 
1564 	   It is tricky place. We need to grab our state lock and cannot
1565 	   drop lock on peer. It is dangerous because deadlock is
1566 	   possible. Connect to self case and simultaneous
1567 	   attempt to connect are eliminated by checking socket
1568 	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1569 	   check this before attempt to grab lock.
1570 
1571 	   Well, and we have to recheck the state after socket locked.
1572 	 */
1573 	switch (READ_ONCE(sk->sk_state)) {
1574 	case TCP_CLOSE:
1575 		/* This is ok... continue with connect */
1576 		break;
1577 	case TCP_ESTABLISHED:
1578 		/* Socket is already connected */
1579 		err = -EISCONN;
1580 		goto out_unlock;
1581 	default:
1582 		err = -EINVAL;
1583 		goto out_unlock;
1584 	}
1585 
1586 	unix_state_lock_nested(sk, U_LOCK_SECOND);
1587 
	/* State changed between the unlocked check and the lock: start over. */
1588 	if (sk->sk_state != TCP_CLOSE) {
1589 		unix_state_unlock(sk);
1590 		unix_state_unlock(other);
1591 		sock_put(other);
1592 		goto restart;
1593 	}
1594 
1595 	err = security_unix_stream_connect(sk, other, newsk);
1596 	if (err) {
1597 		unix_state_unlock(sk);
1598 		goto out_unlock;
1599 	}
1600 
1601 	/* The way is open! Fastly set all the necessary fields... */
1602 
	/* newsk's peer pointer to sk is backed by this reference. */
1603 	sock_hold(sk);
1604 	unix_peer(newsk)	= sk;
1605 	newsk->sk_state		= TCP_ESTABLISHED;
1606 	newsk->sk_type		= sk->sk_type;
1607 	init_peercred(newsk);
1608 	newu = unix_sk(newsk);
1609 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1610 	otheru = unix_sk(other);
1611 
1612 	/* copy address information from listening to new sock
1613 	 *
1614 	 * The contents of *(otheru->addr) and otheru->path
1615 	 * are seen fully set up here, since we have found
1616 	 * otheru in hash under its lock.  Insertion into the
1617 	 * hash chain we'd found it in had been done in an
1618 	 * earlier critical area protected by the chain's lock,
1619 	 * the same one where we'd set *(otheru->addr) contents,
1620 	 * as well as otheru->path and otheru->addr itself.
1621 	 *
1622 	 * Using smp_store_release() here to set newu->addr
1623 	 * is enough to make those stores, as well as stores
1624 	 * to newu->path visible to anyone who gets newu->addr
1625 	 * by smp_load_acquire().  IOW, the same warranties
1626 	 * as for unix_sock instances bound in unix_bind() or
1627 	 * in unix_autobind().
1628 	 */
1629 	if (otheru->path.dentry) {
1630 		path_get(&otheru->path);
1631 		newu->path = otheru->path;
1632 	}
1633 	refcount_inc(&otheru->addr->refcnt);
1634 	smp_store_release(&newu->addr, otheru->addr);
1635 
1636 	/* Set credentials */
1637 	copy_peercred(sk, other);
1638 
1639 	sock->state	= SS_CONNECTED;
1640 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
	/* sk's peer pointer to newsk is backed by this reference. */
1641 	sock_hold(newsk);
1642 
1643 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1644 	unix_peer(sk)	= newsk;
1645 
1646 	unix_state_unlock(sk);
1647 
1648 	/* take ten and send info to listening sock */
1649 	spin_lock(&other->sk_receive_queue.lock);
1650 	__skb_queue_tail(&other->sk_receive_queue, skb);
1651 	spin_unlock(&other->sk_receive_queue.lock);
1652 	unix_state_unlock(other);
1653 	other->sk_data_ready(other);
1654 	sock_put(other);
1655 	return 0;
1656 
1657 out_unlock:
1658 	if (other)
1659 		unix_state_unlock(other);
1660 
1661 out:
	/* Common cleanup: free the skb, release the unused newsk, drop other. */
1662 	kfree_skb(skb);
1663 	if (newsk)
1664 		unix_release_sock(newsk, 0);
1665 	if (other)
1666 		sock_put(other);
1667 	return err;
1668 }
1669 
1670 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1671 {
1672 	struct sock *ska = socka->sk, *skb = sockb->sk;
1673 
1674 	/* Join our sockets back to back */
1675 	sock_hold(ska);
1676 	sock_hold(skb);
1677 	unix_peer(ska) = skb;
1678 	unix_peer(skb) = ska;
1679 	init_peercred(ska);
1680 	init_peercred(skb);
1681 
1682 	ska->sk_state = TCP_ESTABLISHED;
1683 	skb->sk_state = TCP_ESTABLISHED;
1684 	socka->state  = SS_CONNECTED;
1685 	sockb->state  = SS_CONNECTED;
1686 	return 0;
1687 }
1688 
1689 static void unix_sock_inherit_flags(const struct socket *old,
1690 				    struct socket *new)
1691 {
1692 	if (test_bit(SOCK_PASSCRED, &old->flags))
1693 		set_bit(SOCK_PASSCRED, &new->flags);
1694 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1695 		set_bit(SOCK_PASSPIDFD, &new->flags);
1696 	if (test_bit(SOCK_PASSSEC, &old->flags))
1697 		set_bit(SOCK_PASSSEC, &new->flags);
1698 }
1699 
1700 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1701 		       bool kern)
1702 {
1703 	struct sock *sk = sock->sk;
1704 	struct sock *tsk;
1705 	struct sk_buff *skb;
1706 	int err;
1707 
1708 	err = -EOPNOTSUPP;
1709 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1710 		goto out;
1711 
1712 	err = -EINVAL;
1713 	if (sk->sk_state != TCP_LISTEN)
1714 		goto out;
1715 
1716 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1717 	 * so that no locks are necessary.
1718 	 */
1719 
1720 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1721 				&err);
1722 	if (!skb) {
1723 		/* This means receive shutdown. */
1724 		if (err == 0)
1725 			err = -EINVAL;
1726 		goto out;
1727 	}
1728 
1729 	tsk = skb->sk;
1730 	skb_free_datagram(sk, skb);
1731 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1732 
1733 	/* attach accepted sock to socket */
1734 	unix_state_lock(tsk);
1735 	newsock->state = SS_CONNECTED;
1736 	unix_sock_inherit_flags(sock, newsock);
1737 	sock_graft(tsk, newsock);
1738 	unix_state_unlock(tsk);
1739 	return 0;
1740 
1741 out:
1742 	return err;
1743 }
1744 
1745 
1746 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1747 {
1748 	struct sock *sk = sock->sk;
1749 	struct unix_address *addr;
1750 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1751 	int err = 0;
1752 
1753 	if (peer) {
1754 		sk = unix_peer_get(sk);
1755 
1756 		err = -ENOTCONN;
1757 		if (!sk)
1758 			goto out;
1759 		err = 0;
1760 	} else {
1761 		sock_hold(sk);
1762 	}
1763 
1764 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1765 	if (!addr) {
1766 		sunaddr->sun_family = AF_UNIX;
1767 		sunaddr->sun_path[0] = 0;
1768 		err = offsetof(struct sockaddr_un, sun_path);
1769 	} else {
1770 		err = addr->len;
1771 		memcpy(sunaddr, addr->name, addr->len);
1772 	}
1773 	sock_put(sk);
1774 out:
1775 	return err;
1776 }
1777 
1778 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1779 {
1780 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1781 
1782 	/*
1783 	 * Garbage collection of unix sockets starts by selecting a set of
1784 	 * candidate sockets which have reference only from being in flight
1785 	 * (total_refs == inflight_refs).  This condition is checked once during
1786 	 * the candidate collection phase, and candidates are marked as such, so
1787 	 * that non-candidates can later be ignored.  While inflight_refs is
1788 	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1789 	 * is an instantaneous decision.
1790 	 *
1791 	 * Once a candidate, however, the socket must not be reinstalled into a
1792 	 * file descriptor while the garbage collection is in progress.
1793 	 *
1794 	 * If the above conditions are met, then the directed graph of
1795 	 * candidates (*) does not change while unix_gc_lock is held.
1796 	 *
1797 	 * Any operations that changes the file count through file descriptors
1798 	 * (dup, close, sendmsg) does not change the graph since candidates are
1799 	 * not installed in fds.
1800 	 *
1801 	 * Dequeing a candidate via recvmsg would install it into an fd, but
1802 	 * that takes unix_gc_lock to decrement the inflight count, so it's
1803 	 * serialized with garbage collection.
1804 	 *
1805 	 * MSG_PEEK is special in that it does not change the inflight count,
1806 	 * yet does install the socket into an fd.  The following lock/unlock
1807 	 * pair is to ensure serialization with garbage collection.  It must be
1808 	 * done between incrementing the file count and installing the file into
1809 	 * an fd.
1810 	 *
1811 	 * If garbage collection starts after the barrier provided by the
1812 	 * lock/unlock, then it will see the elevated refcount and not mark this
1813 	 * as a candidate.  If a garbage collection is already in progress
1814 	 * before the file count was incremented, then the lock/unlock pair will
1815 	 * ensure that garbage collection is finished before progressing to
1816 	 * installing the fd.
1817 	 *
1818 	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1819 	 * which is on the queue of listening socket A.
1820 	 */
1821 	spin_lock(&unix_gc_lock);
1822 	spin_unlock(&unix_gc_lock);
1823 }
1824 
1825 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1826 {
1827 	int err = 0;
1828 
1829 	UNIXCB(skb).pid  = get_pid(scm->pid);
1830 	UNIXCB(skb).uid = scm->creds.uid;
1831 	UNIXCB(skb).gid = scm->creds.gid;
1832 	UNIXCB(skb).fp = NULL;
1833 	unix_get_secdata(scm, skb);
1834 	if (scm->fp && send_fds)
1835 		err = unix_attach_fds(scm, skb);
1836 
1837 	skb->destructor = unix_destruct_scm;
1838 	return err;
1839 }
1840 
1841 static bool unix_passcred_enabled(const struct socket *sock,
1842 				  const struct sock *other)
1843 {
1844 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1845 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1846 	       !other->sk_socket ||
1847 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1848 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1849 }
1850 
1851 /*
1852  * Some apps rely on write() giving SCM_CREDENTIALS
1853  * We include credentials if source or destination socket
1854  * asserted SOCK_PASSCRED.
1855  */
1856 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1857 			    const struct sock *other)
1858 {
1859 	if (UNIXCB(skb).pid)
1860 		return;
1861 	if (unix_passcred_enabled(sock, other)) {
1862 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1863 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1864 	}
1865 }
1866 
1867 static bool unix_skb_scm_eq(struct sk_buff *skb,
1868 			    struct scm_cookie *scm)
1869 {
1870 	return UNIXCB(skb).pid == scm->pid &&
1871 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1872 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1873 	       unix_secdata_eq(scm, skb);
1874 }
1875 
1876 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1877 {
1878 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1879 	struct unix_sock *u = unix_sk(sk);
1880 
1881 	if (unlikely(fp && fp->count))
1882 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1883 }
1884 
1885 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1886 {
1887 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1888 	struct unix_sock *u = unix_sk(sk);
1889 
1890 	if (unlikely(fp && fp->count))
1891 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1892 }
1893 
1894 /*
1895  *	Send AF_UNIX data.
 *
 *	Sends one message of @len bytes from @msg.  The destination is the
 *	address in msg->msg_name when msg->msg_namelen is non-zero,
 *	otherwise the connected peer of @sock.
 *
 *	Returns @len on success, and also when the receiver's socket filter
 *	drops the message.  Negative returns set in this function:
 *	-EOPNOTSUPP for MSG_OOB; -ENOTCONN with no address and no peer;
 *	-EMSGSIZE if @len exceeds sk_sndbuf - 32; -ECONNRESET if the peer
 *	went away and no address was given; -EPERM if unix_may_send()
 *	refuses; -EPIPE if the receiver has shut down receiving, or a
 *	SOCK_SEQPACKET peer is dead; -ECONNREFUSED if the connected peer
 *	is dead; -EAGAIN if the receiver's queue is full and the send
 *	timeout is zero; the sock_intr_errno() value if a signal is
 *	pending after waiting.  Errors from the helpers called here are
 *	passed through.
1896  */
1897 
1898 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1899 			      size_t len)
1900 {
1901 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1902 	struct sock *sk = sock->sk, *other = NULL;
1903 	struct unix_sock *u = unix_sk(sk);
1904 	struct scm_cookie scm;
1905 	struct sk_buff *skb;
1906 	int data_len = 0;
1907 	int sk_locked;
1908 	long timeo;
1909 	int err;
1910 
1911 	wait_for_unix_gc();
1912 	err = scm_send(sock, msg, &scm, false);
1913 	if (err < 0)
1914 		return err;
1915 
1916 	err = -EOPNOTSUPP;
1917 	if (msg->msg_flags&MSG_OOB)
1918 		goto out;
1919 
1920 	if (msg->msg_namelen) {
1921 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1922 		if (err)
1923 			goto out;
1924 	} else {
		/* No address supplied: fall back to the connected peer. */
1925 		sunaddr = NULL;
1926 		err = -ENOTCONN;
1927 		other = unix_peer_get(sk);
1928 		if (!other)
1929 			goto out;
1930 	}
1931 
	/* Credential passing is enabled but no address is set: autobind. */
1932 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1933 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1934 	    !READ_ONCE(u->addr)) {
1935 		err = unix_autobind(sk);
1936 		if (err)
1937 			goto out;
1938 	}
1939 
1940 	err = -EMSGSIZE;
1941 	if (len > READ_ONCE(sk->sk_sndbuf) - 32)
1942 		goto out;
1943 
	/*
	 * Messages larger than SKB_MAX_ALLOC put the excess, rounded up to
	 * whole pages and capped at MAX_SKB_FRAGS pages, into paged data.
	 */
1944 	if (len > SKB_MAX_ALLOC) {
1945 		data_len = min_t(size_t,
1946 				 len - SKB_MAX_ALLOC,
1947 				 MAX_SKB_FRAGS * PAGE_SIZE);
1948 		data_len = PAGE_ALIGN(data_len);
1949 
1950 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1951 	}
1952 
1953 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1954 				   msg->msg_flags & MSG_DONTWAIT, &err,
1955 				   PAGE_ALLOC_COSTLY_ORDER);
1956 	if (skb == NULL)
1957 		goto out;
1958 
1959 	err = unix_scm_to_skb(&scm, skb, true);
1960 	if (err < 0)
1961 		goto out_free;
1962 
1963 	skb_put(skb, len - data_len);
1964 	skb->data_len = data_len;
1965 	skb->len = len;
1966 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1967 	if (err)
1968 		goto out_free;
1969 
1970 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1971 
1972 restart:
	/* No receiver held (first pass with an address, or it went away). */
1973 	if (!other) {
1974 		err = -ECONNRESET;
1975 		if (sunaddr == NULL)
1976 			goto out_free;
1977 
1978 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1979 					sk->sk_type);
1980 		if (IS_ERR(other)) {
1981 			err = PTR_ERR(other);
1982 			other = NULL;
1983 			goto out_free;
1984 		}
1985 	}
1986 
1987 	if (sk_filter(other, skb) < 0) {
1988 		/* Toss the packet but do not return any error to the sender */
1989 		err = len;
1990 		goto out_free;
1991 	}
1992 
	/* sk_locked records whether sk's state lock is held besides other's. */
1993 	sk_locked = 0;
1994 	unix_state_lock(other);
1995 restart_locked:
1996 	err = -EPERM;
1997 	if (!unix_may_send(sk, other))
1998 		goto out_unlock;
1999 
2000 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2001 		/*
2002 		 *	Check with 1003.1g - what should
2003 		 *	datagram error
2004 		 */
2005 		unix_state_unlock(other);
2006 		sock_put(other);
2007 
2008 		if (!sk_locked)
2009 			unix_state_lock(sk);
2010 
2011 		err = 0;
2012 		if (sk->sk_type == SOCK_SEQPACKET) {
2013 			/* We are here only when racing with unix_release_sock()
2014 			 * is clearing @other. Never change state to TCP_CLOSE
2015 			 * unlike SOCK_DGRAM wants.
2016 			 */
2017 			unix_state_unlock(sk);
2018 			err = -EPIPE;
2019 		} else if (unix_peer(sk) == other) {
			/* The dead socket was our connected peer: disconnect from it. */
2020 			unix_peer(sk) = NULL;
2021 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2022 
2023 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2024 			unix_state_unlock(sk);
2025 
2026 			unix_dgram_disconnected(sk, other);
			/* Second put on other, after the peer pointer was cleared. */
2027 			sock_put(other);
2028 			err = -ECONNREFUSED;
2029 		} else {
2030 			unix_state_unlock(sk);
2031 		}
2032 
		/* err still 0 means the dead socket was not our peer: look it up again. */
2033 		other = NULL;
2034 		if (err)
2035 			goto out_free;
2036 		goto restart;
2037 	}
2038 
2039 	err = -EPIPE;
2040 	if (other->sk_shutdown & RCV_SHUTDOWN)
2041 		goto out_unlock;
2042 
2043 	if (sk->sk_type != SOCK_SEQPACKET) {
2044 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2045 		if (err)
2046 			goto out_unlock;
2047 	}
2048 
2049 	/* other == sk && unix_peer(other) != sk if
2050 	 * - unix_peer(sk) == NULL, destination address bound to sk
2051 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
2052 	 */
2053 	if (other != sk &&
2054 	    unlikely(unix_peer(other) != sk &&
2055 	    unix_recvq_full_lockless(other))) {
2056 		if (timeo) {
			/* unix_wait_for_peer() releases other's state lock. */
2057 			timeo = unix_wait_for_peer(other, timeo);
2058 
2059 			err = sock_intr_errno(timeo);
2060 			if (signal_pending(current))
2061 				goto out_free;
2062 
2063 			goto restart;
2064 		}
2065 
		/* Non-blocking: retake the locks in double-lock order, sk included. */
2066 		if (!sk_locked) {
2067 			unix_state_unlock(other);
2068 			unix_state_double_lock(sk, other);
2069 		}
2070 
2071 		if (unix_peer(sk) != other ||
2072 		    unix_dgram_peer_wake_me(sk, other)) {
2073 			err = -EAGAIN;
2074 			sk_locked = 1;
2075 			goto out_unlock;
2076 		}
2077 
		/* Locks were dropped and retaken above: redo the checks with both held. */
2078 		if (!sk_locked) {
2079 			sk_locked = 1;
2080 			goto restart_locked;
2081 		}
2082 	}
2083 
2084 	if (unlikely(sk_locked))
2085 		unix_state_unlock(sk);
2086 
2087 	if (sock_flag(other, SOCK_RCVTSTAMP))
2088 		__net_timestamp(skb);
2089 	maybe_add_creds(skb, sock, other);
2090 	scm_stat_add(other, skb);
2091 	skb_queue_tail(&other->sk_receive_queue, skb);
2092 	unix_state_unlock(other);
2093 	other->sk_data_ready(other);
2094 	sock_put(other);
2095 	scm_destroy(&scm);
2096 	return len;
2097 
2098 out_unlock:
2099 	if (sk_locked)
2100 		unix_state_unlock(sk);
2101 	unix_state_unlock(other);
2102 out_free:
2103 	kfree_skb(skb);
2104 out:
2105 	if (other)
2106 		sock_put(other);
2107 	scm_destroy(&scm);
2108 	return err;
2109 }
2110 
2111 /* Stream sockets use paged skbs.  The paged part of one skb is limited to
2112  * 32768 bytes rounded up to a power-of-two number of pages, so it is never
2113  * smaller than one full page.
2114  */
2114 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2115 
2116 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Send one out-of-band byte from @msg to @other.
 *
 * A one-byte skb is allocated against the sending socket, given the
 * control data in @scm (files are attached only when @fds_sent is false)
 * and filled with one byte from msg->msg_iter.  It is then recorded as
 * @other's oob_skb, replacing any previous one, and appended to @other's
 * receive queue.
 *
 * Returns 0 on success, -EPIPE if @other is dead or has shut down
 * receiving, or the error from allocation, unix_scm_to_skb() or the copy.
 */
2117 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2118 		     struct scm_cookie *scm, bool fds_sent)
2119 {
2120 	struct unix_sock *ousk = unix_sk(other);
2121 	struct sk_buff *skb;
2122 	int err = 0;
2123 
2124 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2125 
2126 	if (!skb)
2127 		return err;
2128 
2129 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2130 	if (err < 0) {
2131 		kfree_skb(skb);
2132 		return err;
2133 	}
2134 	skb_put(skb, 1);
2135 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2136 
2137 	if (err) {
2138 		kfree_skb(skb);
2139 		return err;
2140 	}
2141 
2142 	unix_state_lock(other);
2143 
2144 	if (sock_flag(other, SOCK_DEAD) ||
2145 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2146 		unix_state_unlock(other);
2147 		kfree_skb(skb);
2148 		return -EPIPE;
2149 	}
2150 
2151 	maybe_add_creds(skb, sock, other);
	/* NOTE(review): presumably the extra reference backs the oob_skb pointer set below - confirm. */
2152 	skb_get(skb);
2153 
2154 	scm_stat_add(other, skb);
2155 
2156 	spin_lock(&other->sk_receive_queue.lock);
	/* Only one OOB skb is tracked; release the one being replaced. */
2157 	if (ousk->oob_skb)
2158 		consume_skb(ousk->oob_skb);
2159 	WRITE_ONCE(ousk->oob_skb, skb);
2160 	__skb_queue_tail(&other->sk_receive_queue, skb);
2161 	spin_unlock(&other->sk_receive_queue.lock);
2162 
2163 	sk_send_sigurg(other);
2164 	unix_state_unlock(other);
2165 	other->sk_data_ready(other);
2166 
2167 	return err;
2168 }
2169 #endif
2170 
/*
 * sendmsg() for SOCK_STREAM sockets.
 *
 * The payload is cut into skbs that are appended, one at a time, to the
 * peer's receive queue.  With MSG_OOB (and CONFIG_AF_UNIX_OOB) the last
 * byte of the message is held back and queued separately by queue_oob().
 *
 * Returns the number of bytes queued if that is non-zero, otherwise a
 * negative errno:
 *   -EOPNOTSUPP  MSG_OOB with no data or without OOB support, or an
 *                address was supplied on a non-established socket
 *   -EISCONN     an address was supplied on an established socket
 *   -ENOTCONN    the socket has no peer
 *   -EPIPE       sending side shut down, or peer dead / not receiving
 *                (SIGPIPE is raised if nothing was sent and MSG_NOSIGNAL
 *                is not set)
 * or whatever scm_send(), the skb allocation, unix_scm_to_skb() or the
 * data copy reported.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB) {
		/* With OOB support: reserve the final byte for queue_oob()
		 * and reject an empty message.  Without it: always reject.
		 */
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	if (msg->msg_namelen) {
		/* A destination address is never accepted here. */
		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		/* NOTE(review): no reference is taken on the peer in this
		 * function - presumably it stays valid while sk is
		 * connected; confirm against the connect/release paths.
		 */
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			/* Empty skb; pages are spliced in below. */
			skb = sock_alloc_send_pskb(sk, 0, 0,
						   msg->msg_flags & MSG_DONTWAIT,
						   &err, 0);
		} else {
			/* Keep two messages in the pipe so it schedules better */
			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);

			/* allow fallback to order-0 allocations */
			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

			/* Whatever does not fit in the linear head goes into
			 * page fragments, rounded up to whole pages but never
			 * more than the chunk itself.
			 */
			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
						   msg->msg_flags & MSG_DONTWAIT, &err,
						   get_order(UNIX_SKB_FRAGS_SZ));
		}
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
						   sk->sk_allocation);
			if (err < 0) {
				kfree_skb(skb);
				goto out_err;
			}
			/* The splice may take fewer bytes than requested;
			 * add what was taken to the sender's sk_wmem_alloc.
			 */
			size = err;
			refcount_add(size, &sk->sk_wmem_alloc);
		} else {
			skb_put(skb, size - data_len);
			skb->data_len = data_len;
			skb->len = size;
			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
			if (err) {
				kfree_skb(skb);
				goto out_err;
			}
		}

		unix_state_lock(other);

		/* Peer closed or stopped receiving while we were copying. */
		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		scm_stat_add(other, skb);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		/* Queue the byte that was held back above. */
		err = queue_oob(sock, msg, other, &scm, fds_sent);
		if (err)
			goto out_err;
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	/* Signal only when nothing at all was delivered. */
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	/* A partial send is reported as success with the short count. */
	return sent ? : err;
}
2302 
2303 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2304 				  size_t len)
2305 {
2306 	int err;
2307 	struct sock *sk = sock->sk;
2308 
2309 	err = sock_error(sk);
2310 	if (err)
2311 		return err;
2312 
2313 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2314 		return -ENOTCONN;
2315 
2316 	if (msg->msg_namelen)
2317 		msg->msg_namelen = 0;
2318 
2319 	return unix_dgram_sendmsg(sock, msg, len);
2320 }
2321 
2322 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2323 				  size_t size, int flags)
2324 {
2325 	struct sock *sk = sock->sk;
2326 
2327 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2328 		return -ENOTCONN;
2329 
2330 	return unix_dgram_recvmsg(sock, msg, size, flags);
2331 }
2332 
2333 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2334 {
2335 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2336 
2337 	if (addr) {
2338 		msg->msg_namelen = addr->len;
2339 		memcpy(msg->msg_name, addr->name, addr->len);
2340 	}
2341 }
2342 
/*
 * Receive one datagram from @sk's receive queue into @msg.
 *
 * @sk:    receiving socket
 * @msg:   destination; msg_name (if set) receives the sender's address,
 *         MSG_TRUNC is set in msg_flags when the datagram did not fit
 * @size:  number of bytes the caller can accept
 * @flags: MSG_* flags; MSG_OOB is rejected with -EOPNOTSUPP
 *
 * Returns the number of bytes copied, or - when MSG_TRUNC was passed in
 * @flags - the datagram length remaining after the peek offset.  Returns
 * 0 for a SOCK_SEQPACKET socket whose receive side is shut down and that
 * would otherwise have reported -EAGAIN.  Otherwise a negative errno.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	/* Try to dequeue under iolock; on success the loop is left with
	 * iolock still held, on failure it has been dropped again.
	 */
	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	/* Wake anyone sleeping on this socket's peer_wait queue for write
	 * space.
	 */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	/* Clamp to what is left of the datagram, or flag truncation. */
	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		/* Real read: the fds move from the skb to the receiver. */
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	/* MSG_TRUNC in the caller's flags asks for the real length. */
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
2449 
2450 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2451 			      int flags)
2452 {
2453 	struct sock *sk = sock->sk;
2454 
2455 #ifdef CONFIG_BPF_SYSCALL
2456 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2457 
2458 	if (prot != &unix_dgram_proto)
2459 		return prot->recvmsg(sk, msg, size, flags, NULL);
2460 #endif
2461 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2462 }
2463 
2464 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2465 {
2466 	struct unix_sock *u = unix_sk(sk);
2467 	struct sk_buff *skb;
2468 	int err;
2469 
2470 	mutex_lock(&u->iolock);
2471 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2472 	mutex_unlock(&u->iolock);
2473 	if (!skb)
2474 		return err;
2475 
2476 	return recv_actor(sk, skb);
2477 }
2478 
/*
 *	Sleep until more data has arrived. But check for races..
 *
 *	@sk:        socket to wait on
 *	@timeo:     remaining timeout in jiffies
 *	@last:      tail skb the caller observed before deciding to wait
 *	@last_len:  length of @last at that time
 *	@freezable: add TASK_FREEZABLE to the sleep state
 *
 *	Returns the timeout that is left.  The wait ends when the queue
 *	tail (or its length) differs from what the caller saw, the socket
 *	has an error or RCV_SHUTDOWN, a signal is pending, the timeout has
 *	run out, or the socket was marked dead while sleeping.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		/* Register on the wait queue before testing the conditions
		 * so that a wake-up between the test and the sleep is not
		 * lost.
		 */
		prepare_to_wait(sk_sleep(sk), &wait, state);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		/* Sleep without the state lock held. */
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		/* Leaves SOCKWQ_ASYNC_WAITDATA set on a dead socket. */
		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
2519 
2520 static unsigned int unix_skb_len(const struct sk_buff *skb)
2521 {
2522 	return skb->len - UNIXCB(skb).consumed;
2523 }
2524 
/*
 * Parameters of one stream read, shared between the generic read loop and
 * the actor that moves the data (copy to a msghdr or splice to a pipe).
 */
struct unix_stream_read_state {
	/* Moves up to (chunk) bytes starting (skip) bytes into the unread
	 * part of the skb; returns the byte count moved or a negative value.
	 */
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;		/* socket being read */
	struct msghdr *msg;		/* destination message; left NULL by the splice path */
	struct pipe_inode_info *pipe;	/* destination pipe for the splice actor */
	size_t size;			/* bytes requested */
	int flags;			/* MSG_* flags for this read */
	unsigned int splice_flags;	/* flags handed to skb_splice_bits() */
};
2535 
2536 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Handle recv(MSG_OOB): deliver the single out-of-band byte, if any.
 *
 * Returns 1 and sets MSG_OOB in state->msg->msg_flags on success,
 * -EINVAL if SOCK_URGINLINE is set or no OOB skb is recorded, and
 * -EFAULT if the actor failed.
 *
 * Locks are taken in the order iolock -> unix state lock -> receive
 * queue lock; iolock stays held across the copy.
 */
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int chunk = 1;
	struct sk_buff *oob_skb;

	mutex_lock(&u->iolock);
	unix_state_lock(sk);
	spin_lock(&sk->sk_receive_queue.lock);

	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
		spin_unlock(&sk->sk_receive_queue.lock);
		unix_state_unlock(sk);
		mutex_unlock(&u->iolock);
		return -EINVAL;
	}

	oob_skb = u->oob_skb;

	/* A real read takes over the reference that was held through
	 * u->oob_skb; a peek leaves the pointer in place and takes its own
	 * reference.  Either way consume_skb() below drops exactly one.
	 */
	if (!(state->flags & MSG_PEEK))
		WRITE_ONCE(u->oob_skb, NULL);
	else
		skb_get(oob_skb);

	spin_unlock(&sk->sk_receive_queue.lock);
	unix_state_unlock(sk);

	chunk = state->recv_actor(oob_skb, 0, chunk, state);

	/* Mark the byte as read so the in-band path sees an empty skb.
	 * Note this happens even when the actor returned an error.
	 */
	if (!(state->flags & MSG_PEEK))
		UNIXCB(oob_skb).consumed += 1;

	consume_skb(oob_skb);

	mutex_unlock(&u->iolock);

	if (chunk < 0)
		return -EFAULT;

	state->msg->msg_flags |= MSG_OOB;
	return 1;
}

2581 
/*
 * Decide what an in-band stream read should do with @skb with respect to
 * out-of-band data.
 *
 * @skb:    candidate skb from @sk's receive queue
 * @sk:     receiving socket
 * @flags:  MSG_* flags of the read
 * @copied: bytes already copied by this read
 *
 * Returns the skb the caller should read from, or NULL:
 *  - @skb has no unread bytes and this is not a peek: it is unlinked and
 *    released, NULL is returned;
 *  - @skb is the current OOB skb and data was already copied: NULL, so
 *    the read ends before the OOB byte;
 *  - @skb is the OOB skb and SOCK_URGINLINE is set: @skb is returned to
 *    be read inline (a non-peek read also clears u->oob_skb and drops
 *    the reference held for it);
 *  - @skb is the OOB skb and this is a peek: NULL;
 *  - @skb is the OOB skb otherwise: it is unlinked and dropped, and the
 *    new head of the queue (possibly NULL) is returned;
 *  - in every other case @skb is returned unchanged.
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct unix_sock *u = unix_sk(sk);

	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
		skb_unlink(skb, &sk->sk_receive_queue);
		consume_skb(skb);
		skb = NULL;
	} else {
		struct sk_buff *unlinked_skb = NULL;

		spin_lock(&sk->sk_receive_queue.lock);

		if (skb == u->oob_skb) {
			if (copied) {
				skb = NULL;
			} else if (sock_flag(sk, SOCK_URGINLINE)) {
				if (!(flags & MSG_PEEK)) {
					WRITE_ONCE(u->oob_skb, NULL);
					consume_skb(skb);
				}
			} else if (flags & MSG_PEEK) {
				skb = NULL;
			} else {
				__skb_unlink(skb, &sk->sk_receive_queue);
				WRITE_ONCE(u->oob_skb, NULL);
				unlinked_skb = skb;
				skb = skb_peek(&sk->sk_receive_queue);
			}
		}

		spin_unlock(&sk->sk_receive_queue.lock);

		/* Release outside the queue lock.  Two references existed
		 * (queue and oob_skb pointer): skb_unref() drops one and is
		 * expected to report that it was not the last, kfree_skb()
		 * drops the other.
		 */
		if (unlinked_skb) {
			WARN_ON_ONCE(skb_unref(unlinked_skb));
			kfree_skb(unlinked_skb);
		}
	}
	return skb;
}
2623 #endif
2624 
2625 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2626 {
2627 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2628 		return -ENOTCONN;
2629 
2630 	return unix_read_skb(sk, recv_actor);
2631 }
2632 
/*
 * Common read loop for stream sockets, used by recvmsg() and splice.
 *
 * @state:     read parameters and the actor that moves the data
 * @freezable: passed on to unix_stream_data_wait()
 *
 * Walks the receive queue under u->iolock, handing chunks of each skb to
 * state->recv_actor until state->size bytes were moved, the queue is
 * exhausted and the low-water target is met, or a boundary is hit
 * (different sender credentials, passed fds, OOB data, error).
 *
 * Returns the number of bytes moved if non-zero, otherwise err: 0 at EOF,
 * -EINVAL if the socket is not established, -ECONNRESET if it is dead,
 * -EAGAIN if nothing is queued and the timeout is zero, a signal errno,
 * a pending socket error, -EFAULT if the actor failed before any data
 * was moved, or the result of unix_stream_recv_urg() for MSG_OOB.
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		/* -EOPNOTSUPP stands unless OOB support overrides it. */
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		/* Remember the tail we looked at (and its length) so the
		 * wait below can detect newly arrived data.
		 */
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

again:
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			/* Stop in front of the OOB byte once data was read. */
			if (!skb && copied) {
				unix_state_unlock(sk);
				break;
			}
		}
#endif
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			/* Do not hold iolock while sleeping. */
			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				/* iolock is not held here, so leave via
				 * "out" and release the scm data ourselves.
				 */
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		/* Step over skbs that lie entirely before the peek offset. */
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			/* NOTE(review): this appears to clear only the local
			 * variable; msg_name itself is not reset, so the
			 * copy would repeat on later passes - confirm.
			 */
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		/* Hold the skb across the actor call. */
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			/* Partially read skb stays queued; we are done. */
			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* Do not read past a message that carried fds. */
			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			/* Peeking: move on to the next skb in the queue. */
			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	/* Without a msghdr (splice) there is nowhere to deliver scm data. */
	if (state->msg)
		scm_recv_unix(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
2845 
2846 static int unix_stream_read_actor(struct sk_buff *skb,
2847 				  int skip, int chunk,
2848 				  struct unix_stream_read_state *state)
2849 {
2850 	int ret;
2851 
2852 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2853 				    state->msg, chunk);
2854 	return ret ?: chunk;
2855 }
2856 
2857 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2858 			  size_t size, int flags)
2859 {
2860 	struct unix_stream_read_state state = {
2861 		.recv_actor = unix_stream_read_actor,
2862 		.socket = sk->sk_socket,
2863 		.msg = msg,
2864 		.size = size,
2865 		.flags = flags
2866 	};
2867 
2868 	return unix_stream_read_generic(&state, true);
2869 }
2870 
2871 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2872 			       size_t size, int flags)
2873 {
2874 	struct unix_stream_read_state state = {
2875 		.recv_actor = unix_stream_read_actor,
2876 		.socket = sock,
2877 		.msg = msg,
2878 		.size = size,
2879 		.flags = flags
2880 	};
2881 
2882 #ifdef CONFIG_BPF_SYSCALL
2883 	struct sock *sk = sock->sk;
2884 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2885 
2886 	if (prot != &unix_stream_proto)
2887 		return prot->recvmsg(sk, msg, size, flags, NULL);
2888 #endif
2889 	return unix_stream_read_generic(&state, true);
2890 }
2891 
2892 static int unix_stream_splice_actor(struct sk_buff *skb,
2893 				    int skip, int chunk,
2894 				    struct unix_stream_read_state *state)
2895 {
2896 	return skb_splice_bits(skb, state->socket->sk,
2897 			       UNIXCB(skb).consumed + skip,
2898 			       state->pipe, chunk, state->splice_flags);
2899 }
2900 
2901 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2902 				       struct pipe_inode_info *pipe,
2903 				       size_t size, unsigned int flags)
2904 {
2905 	struct unix_stream_read_state state = {
2906 		.recv_actor = unix_stream_splice_actor,
2907 		.socket = sock,
2908 		.pipe = pipe,
2909 		.size = size,
2910 		.splice_flags = flags,
2911 	};
2912 
2913 	if (unlikely(*ppos))
2914 		return -ESPIPE;
2915 
2916 	if (sock->file->f_flags & O_NONBLOCK ||
2917 	    flags & SPLICE_F_NONBLOCK)
2918 		state.flags = MSG_DONTWAIT;
2919 
2920 	return unix_stream_read_generic(&state, false);
2921 }
2922 
2923 static int unix_shutdown(struct socket *sock, int mode)
2924 {
2925 	struct sock *sk = sock->sk;
2926 	struct sock *other;
2927 
2928 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2929 		return -EINVAL;
2930 	/* This maps:
2931 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2932 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2933 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2934 	 */
2935 	++mode;
2936 
2937 	unix_state_lock(sk);
2938 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2939 	other = unix_peer(sk);
2940 	if (other)
2941 		sock_hold(other);
2942 	unix_state_unlock(sk);
2943 	sk->sk_state_change(sk);
2944 
2945 	if (other &&
2946 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2947 
2948 		int peer_mode = 0;
2949 		const struct proto *prot = READ_ONCE(other->sk_prot);
2950 
2951 		if (prot->unhash)
2952 			prot->unhash(other);
2953 		if (mode&RCV_SHUTDOWN)
2954 			peer_mode |= SEND_SHUTDOWN;
2955 		if (mode&SEND_SHUTDOWN)
2956 			peer_mode |= RCV_SHUTDOWN;
2957 		unix_state_lock(other);
2958 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2959 		unix_state_unlock(other);
2960 		other->sk_state_change(other);
2961 		if (peer_mode == SHUTDOWN_MASK)
2962 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2963 		else if (peer_mode & RCV_SHUTDOWN)
2964 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2965 	}
2966 	if (other)
2967 		sock_put(other);
2968 
2969 	return 0;
2970 }
2971 
2972 long unix_inq_len(struct sock *sk)
2973 {
2974 	struct sk_buff *skb;
2975 	long amount = 0;
2976 
2977 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
2978 		return -EINVAL;
2979 
2980 	spin_lock(&sk->sk_receive_queue.lock);
2981 	if (sk->sk_type == SOCK_STREAM ||
2982 	    sk->sk_type == SOCK_SEQPACKET) {
2983 		skb_queue_walk(&sk->sk_receive_queue, skb)
2984 			amount += unix_skb_len(skb);
2985 	} else {
2986 		skb = skb_peek(&sk->sk_receive_queue);
2987 		if (skb)
2988 			amount = skb->len;
2989 	}
2990 	spin_unlock(&sk->sk_receive_queue.lock);
2991 
2992 	return amount;
2993 }
2994 EXPORT_SYMBOL_GPL(unix_inq_len);
2995 
/* Value of sk_wmem_alloc_get() for @sk, as reported by SIOCOUTQ. */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3001 
3002 static int unix_open_file(struct sock *sk)
3003 {
3004 	struct path path;
3005 	struct file *f;
3006 	int fd;
3007 
3008 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3009 		return -EPERM;
3010 
3011 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3012 		return -ENOENT;
3013 
3014 	path = unix_sk(sk)->path;
3015 	if (!path.dentry)
3016 		return -ENOENT;
3017 
3018 	path_get(&path);
3019 
3020 	fd = get_unused_fd_flags(O_CLOEXEC);
3021 	if (fd < 0)
3022 		goto out;
3023 
3024 	f = dentry_open(&path, O_PATH, current_cred());
3025 	if (IS_ERR(f)) {
3026 		put_unused_fd(fd);
3027 		fd = PTR_ERR(f);
3028 		goto out;
3029 	}
3030 
3031 	fd_install(fd, f);
3032 out:
3033 	path_put(&path);
3034 
3035 	return fd;
3036 }
3037 
3038 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3039 {
3040 	struct sock *sk = sock->sk;
3041 	long amount = 0;
3042 	int err;
3043 
3044 	switch (cmd) {
3045 	case SIOCOUTQ:
3046 		amount = unix_outq_len(sk);
3047 		err = put_user(amount, (int __user *)arg);
3048 		break;
3049 	case SIOCINQ:
3050 		amount = unix_inq_len(sk);
3051 		if (amount < 0)
3052 			err = amount;
3053 		else
3054 			err = put_user(amount, (int __user *)arg);
3055 		break;
3056 	case SIOCUNIXFILE:
3057 		err = unix_open_file(sk);
3058 		break;
3059 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3060 	case SIOCATMARK:
3061 		{
3062 			struct sk_buff *skb;
3063 			int answ = 0;
3064 
3065 			skb = skb_peek(&sk->sk_receive_queue);
3066 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3067 				answ = 1;
3068 			err = put_user(answ, (int __user *)arg);
3069 		}
3070 		break;
3071 #endif
3072 	default:
3073 		err = -ENOIOCTLCMD;
3074 		break;
3075 	}
3076 	return err;
3077 }
3078 
3079 #ifdef CONFIG_COMPAT
/* Compat ioctl: convert @arg with compat_ptr() and reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3085 
3086 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3087 {
3088 	struct sock *sk = sock->sk;
3089 	unsigned char state;
3090 	__poll_t mask;
3091 	u8 shutdown;
3092 
3093 	sock_poll_wait(file, sock, wait);
3094 	mask = 0;
3095 	shutdown = READ_ONCE(sk->sk_shutdown);
3096 	state = READ_ONCE(sk->sk_state);
3097 
3098 	/* exceptional events? */
3099 	if (READ_ONCE(sk->sk_err))
3100 		mask |= EPOLLERR;
3101 	if (shutdown == SHUTDOWN_MASK)
3102 		mask |= EPOLLHUP;
3103 	if (shutdown & RCV_SHUTDOWN)
3104 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3105 
3106 	/* readable? */
3107 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3108 		mask |= EPOLLIN | EPOLLRDNORM;
3109 	if (sk_is_readable(sk))
3110 		mask |= EPOLLIN | EPOLLRDNORM;
3111 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3112 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3113 		mask |= EPOLLPRI;
3114 #endif
3115 
3116 	/* Connection-based need to check for termination and startup */
3117 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3118 	    state == TCP_CLOSE)
3119 		mask |= EPOLLHUP;
3120 
3121 	/*
3122 	 * we set writable also when the other side has shut down the
3123 	 * connection. This prevents stuck sockets.
3124 	 */
3125 	if (unix_writable(sk, state))
3126 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3127 
3128 	return mask;
3129 }
3130 
3131 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3132 				    poll_table *wait)
3133 {
3134 	struct sock *sk = sock->sk, *other;
3135 	unsigned int writable;
3136 	unsigned char state;
3137 	__poll_t mask;
3138 	u8 shutdown;
3139 
3140 	sock_poll_wait(file, sock, wait);
3141 	mask = 0;
3142 	shutdown = READ_ONCE(sk->sk_shutdown);
3143 	state = READ_ONCE(sk->sk_state);
3144 
3145 	/* exceptional events? */
3146 	if (READ_ONCE(sk->sk_err) ||
3147 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3148 		mask |= EPOLLERR |
3149 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3150 
3151 	if (shutdown & RCV_SHUTDOWN)
3152 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3153 	if (shutdown == SHUTDOWN_MASK)
3154 		mask |= EPOLLHUP;
3155 
3156 	/* readable? */
3157 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3158 		mask |= EPOLLIN | EPOLLRDNORM;
3159 	if (sk_is_readable(sk))
3160 		mask |= EPOLLIN | EPOLLRDNORM;
3161 
3162 	/* Connection-based need to check for termination and startup */
3163 	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3164 		mask |= EPOLLHUP;
3165 
3166 	/* No write status requested, avoid expensive OUT tests. */
3167 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3168 		return mask;
3169 
3170 	writable = unix_writable(sk, state);
3171 	if (writable) {
3172 		unix_state_lock(sk);
3173 
3174 		other = unix_peer(sk);
3175 		if (other && unix_peer(other) != sk &&
3176 		    unix_recvq_full_lockless(other) &&
3177 		    unix_dgram_peer_wake_me(sk, other))
3178 			writable = 0;
3179 
3180 		unix_state_unlock(sk);
3181 	}
3182 
3183 	if (writable)
3184 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3185 	else
3186 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3187 
3188 	return mask;
3189 }
3190 
3191 #ifdef CONFIG_PROC_FS
3192 
/*
 * The seq_file position packs a hash bucket index and an offset within
 * that bucket into one value: the bucket occupies the bits from
 * BUCKET_SPACE upwards, the offset the BUCKET_SPACE bits below.
 * The iterators below start offsets at 1, not 0.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Extract the bucket index / in-bucket offset from a position. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
/* Build a position from bucket index (b) and offset (o). */
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3198 
3199 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3200 {
3201 	unsigned long offset = get_offset(*pos);
3202 	unsigned long bucket = get_bucket(*pos);
3203 	unsigned long count = 0;
3204 	struct sock *sk;
3205 
3206 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3207 	     sk; sk = sk_next(sk)) {
3208 		if (++count == offset)
3209 			break;
3210 	}
3211 
3212 	return sk;
3213 }
3214 
3215 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3216 {
3217 	unsigned long bucket = get_bucket(*pos);
3218 	struct net *net = seq_file_net(seq);
3219 	struct sock *sk;
3220 
3221 	while (bucket < UNIX_HASH_SIZE) {
3222 		spin_lock(&net->unx.table.locks[bucket]);
3223 
3224 		sk = unix_from_bucket(seq, pos);
3225 		if (sk)
3226 			return sk;
3227 
3228 		spin_unlock(&net->unx.table.locks[bucket]);
3229 
3230 		*pos = set_bucket_offset(++bucket, 1);
3231 	}
3232 
3233 	return NULL;
3234 }
3235 
3236 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3237 				  loff_t *pos)
3238 {
3239 	unsigned long bucket = get_bucket(*pos);
3240 
3241 	sk = sk_next(sk);
3242 	if (sk)
3243 		return sk;
3244 
3245 
3246 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3247 
3248 	*pos = set_bucket_offset(++bucket, 1);
3249 
3250 	return unix_get_first(seq, pos);
3251 }
3252 
3253 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3254 {
3255 	if (!*pos)
3256 		return SEQ_START_TOKEN;
3257 
3258 	return unix_get_first(seq, pos);
3259 }
3260 
3261 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3262 {
3263 	++*pos;
3264 
3265 	if (v == SEQ_START_TOKEN)
3266 		return unix_get_first(seq, pos);
3267 
3268 	return unix_get_next(seq, v, pos);
3269 }
3270 
3271 static void unix_seq_stop(struct seq_file *seq, void *v)
3272 {
3273 	struct sock *sk = v;
3274 
3275 	if (sk)
3276 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3277 }
3278 
3279 static int unix_seq_show(struct seq_file *seq, void *v)
3280 {
3281 
3282 	if (v == SEQ_START_TOKEN)
3283 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3284 			 "Inode Path\n");
3285 	else {
3286 		struct sock *s = v;
3287 		struct unix_sock *u = unix_sk(s);
3288 		unix_state_lock(s);
3289 
3290 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3291 			s,
3292 			refcount_read(&s->sk_refcnt),
3293 			0,
3294 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3295 			s->sk_type,
3296 			s->sk_socket ?
3297 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3298 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3299 			sock_i_ino(s));
3300 
3301 		if (u->addr) {	// under a hash table lock here
3302 			int i, len;
3303 			seq_putc(seq, ' ');
3304 
3305 			i = 0;
3306 			len = u->addr->len -
3307 				offsetof(struct sockaddr_un, sun_path);
3308 			if (u->addr->name->sun_path[0]) {
3309 				len--;
3310 			} else {
3311 				seq_putc(seq, '@');
3312 				i++;
3313 			}
3314 			for ( ; i < len; i++)
3315 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3316 					 '@');
3317 		}
3318 		unix_state_unlock(s);
3319 		seq_putc(seq, '\n');
3320 	}
3321 
3322 	return 0;
3323 }
3324 
3325 static const struct seq_operations unix_seq_ops = {
3326 	.start  = unix_seq_start,
3327 	.next   = unix_seq_next,
3328 	.stop   = unix_seq_stop,
3329 	.show   = unix_seq_show,
3330 };
3331 
3332 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/* Per-open private state of the BPF "unix" socket iterator. */
struct bpf_unix_iter_state {
	/* NOTE(review): presumably must stay first so seq_file_net() can
	 * interpret seq->private - confirm.
	 */
	struct seq_net_private p;
	/* Index in @batch of the socket currently handed to ->show(). */
	unsigned int cur_sk;
	/* One past the last slot of @batch holding a referenced socket. */
	unsigned int end_sk;
	/* Number of slots allocated in @batch. */
	unsigned int max_sk;
	/* Array of sockets pinned with sock_hold() for the current bucket. */
	struct sock **batch;
	/* Set once a whole bucket fit in @batch; the next batch then starts
	 * at the following bucket.
	 */
	bool st_bucket_done;
};
3341 
/* Context object passed to the BPF program by unix_prog_seq_show(). */
struct bpf_iter__unix {
	/* Iterator metadata (seq_file and sequence number). */
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	/* Socket being visited; NULL on the final call made from ->stop(). */
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	/* Owner uid of the socket as computed by the caller (0 on the
	 * final call).
	 */
	uid_t uid __aligned(8);
};
3347 
3348 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3349 			      struct unix_sock *unix_sk, uid_t uid)
3350 {
3351 	struct bpf_iter__unix ctx;
3352 
3353 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3354 	ctx.meta = meta;
3355 	ctx.unix_sk = unix_sk;
3356 	ctx.uid = uid;
3357 	return bpf_iter_run_prog(prog, &ctx);
3358 }
3359 
3360 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3361 
3362 {
3363 	struct bpf_unix_iter_state *iter = seq->private;
3364 	unsigned int expected = 1;
3365 	struct sock *sk;
3366 
3367 	sock_hold(start_sk);
3368 	iter->batch[iter->end_sk++] = start_sk;
3369 
3370 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3371 		if (iter->end_sk < iter->max_sk) {
3372 			sock_hold(sk);
3373 			iter->batch[iter->end_sk++] = sk;
3374 		}
3375 
3376 		expected++;
3377 	}
3378 
3379 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3380 
3381 	return expected;
3382 }
3383 
3384 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3385 {
3386 	while (iter->cur_sk < iter->end_sk)
3387 		sock_put(iter->batch[iter->cur_sk++]);
3388 }
3389 
3390 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3391 				       unsigned int new_batch_sz)
3392 {
3393 	struct sock **new_batch;
3394 
3395 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3396 			     GFP_USER | __GFP_NOWARN);
3397 	if (!new_batch)
3398 		return -ENOMEM;
3399 
3400 	bpf_iter_unix_put_batch(iter);
3401 	kvfree(iter->batch);
3402 	iter->batch = new_batch;
3403 	iter->max_sk = new_batch_sz;
3404 
3405 	return 0;
3406 }
3407 
3408 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3409 					loff_t *pos)
3410 {
3411 	struct bpf_unix_iter_state *iter = seq->private;
3412 	unsigned int expected;
3413 	bool resized = false;
3414 	struct sock *sk;
3415 
3416 	if (iter->st_bucket_done)
3417 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3418 
3419 again:
3420 	/* Get a new batch */
3421 	iter->cur_sk = 0;
3422 	iter->end_sk = 0;
3423 
3424 	sk = unix_get_first(seq, pos);
3425 	if (!sk)
3426 		return NULL; /* Done */
3427 
3428 	expected = bpf_iter_unix_hold_batch(seq, sk);
3429 
3430 	if (iter->end_sk == expected) {
3431 		iter->st_bucket_done = true;
3432 		return sk;
3433 	}
3434 
3435 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3436 		resized = true;
3437 		goto again;
3438 	}
3439 
3440 	return sk;
3441 }
3442 
3443 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3444 {
3445 	if (!*pos)
3446 		return SEQ_START_TOKEN;
3447 
3448 	/* bpf iter does not support lseek, so it always
3449 	 * continue from where it was stop()-ped.
3450 	 */
3451 	return bpf_iter_unix_batch(seq, pos);
3452 }
3453 
3454 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3455 {
3456 	struct bpf_unix_iter_state *iter = seq->private;
3457 	struct sock *sk;
3458 
3459 	/* Whenever seq_next() is called, the iter->cur_sk is
3460 	 * done with seq_show(), so advance to the next sk in
3461 	 * the batch.
3462 	 */
3463 	if (iter->cur_sk < iter->end_sk)
3464 		sock_put(iter->batch[iter->cur_sk++]);
3465 
3466 	++*pos;
3467 
3468 	if (iter->cur_sk < iter->end_sk)
3469 		sk = iter->batch[iter->cur_sk];
3470 	else
3471 		sk = bpf_iter_unix_batch(seq, pos);
3472 
3473 	return sk;
3474 }
3475 
3476 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3477 {
3478 	struct bpf_iter_meta meta;
3479 	struct bpf_prog *prog;
3480 	struct sock *sk = v;
3481 	uid_t uid;
3482 	bool slow;
3483 	int ret;
3484 
3485 	if (v == SEQ_START_TOKEN)
3486 		return 0;
3487 
3488 	slow = lock_sock_fast(sk);
3489 
3490 	if (unlikely(sk_unhashed(sk))) {
3491 		ret = SEQ_SKIP;
3492 		goto unlock;
3493 	}
3494 
3495 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3496 	meta.seq = seq;
3497 	prog = bpf_iter_get_info(&meta, false);
3498 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3499 unlock:
3500 	unlock_sock_fast(sk, slow);
3501 	return ret;
3502 }
3503 
3504 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3505 {
3506 	struct bpf_unix_iter_state *iter = seq->private;
3507 	struct bpf_iter_meta meta;
3508 	struct bpf_prog *prog;
3509 
3510 	if (!v) {
3511 		meta.seq = seq;
3512 		prog = bpf_iter_get_info(&meta, true);
3513 		if (prog)
3514 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3515 	}
3516 
3517 	if (iter->cur_sk < iter->end_sk)
3518 		bpf_iter_unix_put_batch(iter);
3519 }
3520 
3521 static const struct seq_operations bpf_iter_unix_seq_ops = {
3522 	.start	= bpf_iter_unix_seq_start,
3523 	.next	= bpf_iter_unix_seq_next,
3524 	.stop	= bpf_iter_unix_seq_stop,
3525 	.show	= bpf_iter_unix_seq_show,
3526 };
3527 #endif
3528 #endif
3529 
3530 static const struct net_proto_family unix_family_ops = {
3531 	.family = PF_UNIX,
3532 	.create = unix_create,
3533 	.owner	= THIS_MODULE,
3534 };
3535 
3536 
3537 static int __net_init unix_net_init(struct net *net)
3538 {
3539 	int i;
3540 
3541 	net->unx.sysctl_max_dgram_qlen = 10;
3542 	if (unix_sysctl_register(net))
3543 		goto out;
3544 
3545 #ifdef CONFIG_PROC_FS
3546 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3547 			     sizeof(struct seq_net_private)))
3548 		goto err_sysctl;
3549 #endif
3550 
3551 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3552 					      sizeof(spinlock_t), GFP_KERNEL);
3553 	if (!net->unx.table.locks)
3554 		goto err_proc;
3555 
3556 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3557 						sizeof(struct hlist_head),
3558 						GFP_KERNEL);
3559 	if (!net->unx.table.buckets)
3560 		goto free_locks;
3561 
3562 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3563 		spin_lock_init(&net->unx.table.locks[i]);
3564 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3565 	}
3566 
3567 	return 0;
3568 
3569 free_locks:
3570 	kvfree(net->unx.table.locks);
3571 err_proc:
3572 #ifdef CONFIG_PROC_FS
3573 	remove_proc_entry("unix", net->proc_net);
3574 err_sysctl:
3575 #endif
3576 	unix_sysctl_unregister(net);
3577 out:
3578 	return -ENOMEM;
3579 }
3580 
/*
 * Per-network-namespace teardown: frees the hash table buckets and locks
 * allocated by unix_net_init(), then unregisters the sysctl table and
 * removes the "unix" proc entry.
 */
static void __net_exit unix_net_exit(struct net *net)
{
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}
3588 
3589 static struct pernet_operations unix_net_ops = {
3590 	.init = unix_net_init,
3591 	.exit = unix_net_exit,
3592 };
3593 
3594 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Entry point of the "unix" BPF iterator; the argument list mirrors the
 * members of struct bpf_iter__unix.
 */
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

/* Number of batch slots allocated when an iterator is initialised
 * (see bpf_iter_init_unix()).
 */
#define INIT_BATCH_SZ 16
3599 
3600 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3601 {
3602 	struct bpf_unix_iter_state *iter = priv_data;
3603 	int err;
3604 
3605 	err = bpf_iter_init_seq_net(priv_data, aux);
3606 	if (err)
3607 		return err;
3608 
3609 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3610 	if (err) {
3611 		bpf_iter_fini_seq_net(priv_data);
3612 		return err;
3613 	}
3614 
3615 	return 0;
3616 }
3617 
3618 static void bpf_iter_fini_unix(void *priv_data)
3619 {
3620 	struct bpf_unix_iter_state *iter = priv_data;
3621 
3622 	bpf_iter_fini_seq_net(priv_data);
3623 	kvfree(iter->batch);
3624 }
3625 
3626 static const struct bpf_iter_seq_info unix_seq_info = {
3627 	.seq_ops		= &bpf_iter_unix_seq_ops,
3628 	.init_seq_private	= bpf_iter_init_unix,
3629 	.fini_seq_private	= bpf_iter_fini_unix,
3630 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3631 };
3632 
3633 static const struct bpf_func_proto *
3634 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3635 			     const struct bpf_prog *prog)
3636 {
3637 	switch (func_id) {
3638 	case BPF_FUNC_setsockopt:
3639 		return &bpf_sk_setsockopt_proto;
3640 	case BPF_FUNC_getsockopt:
3641 		return &bpf_sk_getsockopt_proto;
3642 	default:
3643 		return NULL;
3644 	}
3645 }
3646 
3647 static struct bpf_iter_reg unix_reg_info = {
3648 	.target			= "unix",
3649 	.ctx_arg_info_size	= 1,
3650 	.ctx_arg_info		= {
3651 		{ offsetof(struct bpf_iter__unix, unix_sk),
3652 		  PTR_TO_BTF_ID_OR_NULL },
3653 	},
3654 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3655 	.seq_info		= &unix_seq_info,
3656 };
3657 
3658 static void __init bpf_iter_register(void)
3659 {
3660 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3661 	if (bpf_iter_reg_target(&unix_reg_info))
3662 		pr_warn("Warning: could not register bpf iterator unix\n");
3663 }
3664 #endif
3665 
3666 static int __init af_unix_init(void)
3667 {
3668 	int i, rc = -1;
3669 
3670 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3671 
3672 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3673 		spin_lock_init(&bsd_socket_locks[i]);
3674 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3675 	}
3676 
3677 	rc = proto_register(&unix_dgram_proto, 1);
3678 	if (rc != 0) {
3679 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3680 		goto out;
3681 	}
3682 
3683 	rc = proto_register(&unix_stream_proto, 1);
3684 	if (rc != 0) {
3685 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3686 		proto_unregister(&unix_dgram_proto);
3687 		goto out;
3688 	}
3689 
3690 	sock_register(&unix_family_ops);
3691 	register_pernet_subsys(&unix_net_ops);
3692 	unix_bpf_build_proto();
3693 
3694 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3695 	bpf_iter_register();
3696 #endif
3697 
3698 out:
3699 	return rc;
3700 }
3701 
/*
 * Module exit: unregister the PF_UNIX socket family, both protos and the
 * per-netns operations that af_unix_init() registered.
 */
static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_dgram_proto);
	proto_unregister(&unix_stream_proto);
	unregister_pernet_subsys(&unix_net_ops);
}
3709 
/*
 * Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket. But later than subsys_initcall() because
 * we depend on stuff initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);
3719