xref: /openbmc/linux/net/unix/af_unix.c (revision f7f84721)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid a huge amount
34  *					of socks hashed (this is for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lots of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  starting with a zero byte, so that this name space does not intersect
75  *		  with BSD names.
76  */
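/* Illustrative userspace sketch (not part of the original file): binding an
 * "abstract" socket as described above.  The name starts with a NUL byte,
 * is not NUL terminated, and its length is carried entirely in addr_len, so
 * it never touches the filesystem:
 *
 *	struct sockaddr_un addr = { .sun_family = AF_UNIX };
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *	addr.sun_path[0] = '\0';
 *	memcpy(addr.sun_path + 1, "example", 7);
 *	bind(fd, (struct sockaddr *)&addr,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 */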
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 
120 #include "scm.h"
121 
122 static atomic_long_t unix_nr_socks;
123 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
124 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
125 
126 /* SMP locking strategy:
127  *    the hash table is protected with a spinlock.
128  *    each socket's state is protected by a separate spinlock.
129  */
130 
131 static unsigned int unix_unbound_hash(struct sock *sk)
132 {
133 	unsigned long hash = (unsigned long)sk;
134 
135 	hash ^= hash >> 16;
136 	hash ^= hash >> 8;
137 	hash ^= sk->sk_type;
138 
139 	return hash & UNIX_HASH_MOD;
140 }
141 
142 static unsigned int unix_bsd_hash(struct inode *i)
143 {
144 	return i->i_ino & UNIX_HASH_MOD;
145 }
146 
147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
148 				       int addr_len, int type)
149 {
150 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
151 	unsigned int hash;
152 
153 	hash = (__force unsigned int)csum_fold(csum);
154 	hash ^= hash >> 8;
155 	hash ^= type;
156 
157 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
158 }
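/* Bucket layout sketch (illustrative; assumes the usual definitions of
 * UNIX_HASH_MOD and UNIX_HASH_SIZE in net/af_unix.h, where
 * UNIX_HASH_MOD == UNIX_HASH_SIZE / 2 - 1):
 *
 *	unix_unbound_hash()	-> [0, UNIX_HASH_MOD]		(per-netns table)
 *	unix_bsd_hash()		-> [0, UNIX_HASH_MOD]		(per-netns table,
 *								 plus bsd_socket_buckets[])
 *	unix_abstract_hash()	-> [UNIX_HASH_MOD + 1, 2 * UNIX_HASH_MOD + 1]
 *
 * Worked example: with UNIX_HASH_MOD == 255, an abstract address whose mixed
 * 16-bit hash value is 0x1a2b lands in bucket 256 + (0x1a2b & 255) == 299.
 */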
159 
160 static void unix_table_double_lock(struct net *net,
161 				   unsigned int hash1, unsigned int hash2)
162 {
163 	if (hash1 == hash2) {
164 		spin_lock(&net->unx.table.locks[hash1]);
165 		return;
166 	}
167 
168 	if (hash1 > hash2)
169 		swap(hash1, hash2);
170 
171 	spin_lock(&net->unx.table.locks[hash1]);
172 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
173 }
174 
175 static void unix_table_double_unlock(struct net *net,
176 				     unsigned int hash1, unsigned int hash2)
177 {
178 	if (hash1 == hash2) {
179 		spin_unlock(&net->unx.table.locks[hash1]);
180 		return;
181 	}
182 
183 	spin_unlock(&net->unx.table.locks[hash1]);
184 	spin_unlock(&net->unx.table.locks[hash2]);
185 }
186 
187 #ifdef CONFIG_SECURITY_NETWORK
188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
189 {
190 	UNIXCB(skb).secid = scm->secid;
191 }
192 
193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
194 {
195 	scm->secid = UNIXCB(skb).secid;
196 }
197 
198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
199 {
200 	return (scm->secid == UNIXCB(skb).secid);
201 }
202 #else
203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
204 { }
205 
206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
207 { }
208 
209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
210 {
211 	return true;
212 }
213 #endif /* CONFIG_SECURITY_NETWORK */
214 
215 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
216 {
217 	return unix_peer(osk) == sk;
218 }
219 
220 static inline int unix_may_send(struct sock *sk, struct sock *osk)
221 {
222 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
223 }
224 
225 static inline int unix_recvq_full(const struct sock *sk)
226 {
227 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
228 }
229 
230 static inline int unix_recvq_full_lockless(const struct sock *sk)
231 {
232 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
233 		READ_ONCE(sk->sk_max_ack_backlog);
234 }
235 
236 struct sock *unix_peer_get(struct sock *s)
237 {
238 	struct sock *peer;
239 
240 	unix_state_lock(s);
241 	peer = unix_peer(s);
242 	if (peer)
243 		sock_hold(peer);
244 	unix_state_unlock(s);
245 	return peer;
246 }
247 EXPORT_SYMBOL_GPL(unix_peer_get);
248 
249 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
250 					     int addr_len)
251 {
252 	struct unix_address *addr;
253 
254 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
255 	if (!addr)
256 		return NULL;
257 
258 	refcount_set(&addr->refcnt, 1);
259 	addr->len = addr_len;
260 	memcpy(addr->name, sunaddr, addr_len);
261 
262 	return addr;
263 }
264 
265 static inline void unix_release_addr(struct unix_address *addr)
266 {
267 	if (refcount_dec_and_test(&addr->refcnt))
268 		kfree(addr);
269 }
270 
271 /*
272  *	Check unix socket name:
273  *		- should not be zero length.
274  *	        - if it does not start with a zero byte, it should be NUL terminated (FS object)
275  *		- if it starts with a zero byte, it is an abstract name.
276  */
277 
278 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
279 {
280 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
281 	    addr_len > sizeof(*sunaddr))
282 		return -EINVAL;
283 
284 	if (sunaddr->sun_family != AF_UNIX)
285 		return -EINVAL;
286 
287 	return 0;
288 }
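/* Worked examples for the check above (illustrative, with
 * offsetof(struct sockaddr_un, sun_path) == 2 and sizeof(struct sockaddr_un) == 110):
 *
 *	sun_path		addr_len	meaning
 *	(none)			2		autobind request; rejected here and
 *						handled separately in unix_bind()
 *	"/tmp/sock" + NUL	12		filesystem (BSD) name
 *	"\0chat"		7		abstract name, length taken from addr_len
 *
 * Anything with addr_len <= 2, addr_len > 110 or sun_family != AF_UNIX is
 * rejected with -EINVAL.
 */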
289 
290 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
291 {
292 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
293 	short offset = offsetof(struct sockaddr_storage, __data);
294 
295 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
296 
297 	/* This may look like an off-by-one error, but it is a bit more
298 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
299 	 * sun_path[108] doesn't as such exist.  However in kernel space
300 	 * we are guaranteed that it is a valid memory location in our
301 	 * kernel address buffer because syscall functions always pass
302 	 * a pointer of struct sockaddr_storage which has a bigger buffer
303 	 * a pointer to struct sockaddr_storage, which has a bigger buffer
304 	 * getname_kernel().
305 	 */
306 	addr->__data[addr_len - offset] = 0;
307 
308 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
309 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
310 	 * know the actual buffer.
311 	 */
312 	return strlen(addr->__data) + offset + 1;
313 }
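/* Worked example (illustrative): a caller binds to "/run/foo" and passes
 * addr_len == offsetof(struct sockaddr_un, sun_path) + 8 == 10 with no
 * trailing NUL in sun_path.  The store above writes __data[8] = 0, and the
 * function returns strlen("/run/foo") + 2 + 1 == 11, i.e. the length of the
 * now NUL-terminated address.  Even for a maximal 108-byte path, __data[108]
 * is still within the caller's struct sockaddr_storage, so the store is safe.
 */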
314 
315 static void __unix_remove_socket(struct sock *sk)
316 {
317 	sk_del_node_init(sk);
318 }
319 
320 static void __unix_insert_socket(struct net *net, struct sock *sk)
321 {
322 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
323 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
324 }
325 
326 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
327 				 struct unix_address *addr, unsigned int hash)
328 {
329 	__unix_remove_socket(sk);
330 	smp_store_release(&unix_sk(sk)->addr, addr);
331 
332 	sk->sk_hash = hash;
333 	__unix_insert_socket(net, sk);
334 }
335 
336 static void unix_remove_socket(struct net *net, struct sock *sk)
337 {
338 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
339 	__unix_remove_socket(sk);
340 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
341 }
342 
343 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
344 {
345 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
346 	__unix_insert_socket(net, sk);
347 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
348 }
349 
350 static void unix_insert_bsd_socket(struct sock *sk)
351 {
352 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
353 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
354 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
355 }
356 
357 static void unix_remove_bsd_socket(struct sock *sk)
358 {
359 	if (!hlist_unhashed(&sk->sk_bind_node)) {
360 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
361 		__sk_del_bind_node(sk);
362 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
363 
364 		sk_node_init(&sk->sk_bind_node);
365 	}
366 }
367 
368 static struct sock *__unix_find_socket_byname(struct net *net,
369 					      struct sockaddr_un *sunname,
370 					      int len, unsigned int hash)
371 {
372 	struct sock *s;
373 
374 	sk_for_each(s, &net->unx.table.buckets[hash]) {
375 		struct unix_sock *u = unix_sk(s);
376 
377 		if (u->addr->len == len &&
378 		    !memcmp(u->addr->name, sunname, len))
379 			return s;
380 	}
381 	return NULL;
382 }
383 
384 static inline struct sock *unix_find_socket_byname(struct net *net,
385 						   struct sockaddr_un *sunname,
386 						   int len, unsigned int hash)
387 {
388 	struct sock *s;
389 
390 	spin_lock(&net->unx.table.locks[hash]);
391 	s = __unix_find_socket_byname(net, sunname, len, hash);
392 	if (s)
393 		sock_hold(s);
394 	spin_unlock(&net->unx.table.locks[hash]);
395 	return s;
396 }
397 
398 static struct sock *unix_find_socket_byinode(struct inode *i)
399 {
400 	unsigned int hash = unix_bsd_hash(i);
401 	struct sock *s;
402 
403 	spin_lock(&bsd_socket_locks[hash]);
404 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
405 		struct dentry *dentry = unix_sk(s)->path.dentry;
406 
407 		if (dentry && d_backing_inode(dentry) == i) {
408 			sock_hold(s);
409 			spin_unlock(&bsd_socket_locks[hash]);
410 			return s;
411 		}
412 	}
413 	spin_unlock(&bsd_socket_locks[hash]);
414 	return NULL;
415 }
416 
417 /* Support code for asymmetrically connected dgram sockets
418  *
419  * If a datagram socket is connected to a socket not itself connected
420  * to the first socket (e.g., /dev/log), clients may only enqueue more
421  * messages if the present receive queue of the server socket is not
422  * "too large". This means there's a second writability condition
423  * poll and sendmsg need to test. The dgram recv code will do a wake
424  * up on the peer_wait wait queue of a socket upon reception of a
425  * datagram which needs to be propagated to sleeping would-be writers
426  * since these might not have sent anything so far. This can't be
427  * accomplished via poll_wait because the lifetime of the server
428  * socket might be less than that of its clients if these break their
429  * association with it or if the server socket is closed while clients
430  * are still connected to it and there's no way to inform "a polling
431  * implementation" that it should let go of a certain wait queue.
432  *
433  * In order to propagate a wake up, a wait_queue_entry_t of the client
434  * socket is enqueued on the peer_wait queue of the server socket
435  * whose wake function does a wake_up on the ordinary client socket
436  * wait queue. This connection is established whenever a write (or
437  * poll for write) hits the flow control condition and is broken when the
438  * association to the server socket is dissolved or after a wake up
439  * was relayed.
440  */
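/* Userspace view of the above (illustrative sketch): a client connected to a
 * busy datagram "server" such as a syslog socket does not get EPOLLOUT merely
 * because its own send buffer has room; the server's receive queue must also
 * be below its limit.  The peer_wake machinery below is what turns "the
 * server dequeued a datagram" into a wake-up for this poll():
 *
 *	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *
 *	connect(fd, ...);		(to the server's address)
 *	poll(&pfd, 1, -1);		(may sleep until the server drains its queue)
 */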
441 
442 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
443 				      void *key)
444 {
445 	struct unix_sock *u;
446 	wait_queue_head_t *u_sleep;
447 
448 	u = container_of(q, struct unix_sock, peer_wake);
449 
450 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
451 			    q);
452 	u->peer_wake.private = NULL;
453 
454 	/* relaying can only happen while the wq still exists */
455 	u_sleep = sk_sleep(&u->sk);
456 	if (u_sleep)
457 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
458 
459 	return 0;
460 }
461 
462 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
463 {
464 	struct unix_sock *u, *u_other;
465 	int rc;
466 
467 	u = unix_sk(sk);
468 	u_other = unix_sk(other);
469 	rc = 0;
470 	spin_lock(&u_other->peer_wait.lock);
471 
472 	if (!u->peer_wake.private) {
473 		u->peer_wake.private = other;
474 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
475 
476 		rc = 1;
477 	}
478 
479 	spin_unlock(&u_other->peer_wait.lock);
480 	return rc;
481 }
482 
483 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
484 					    struct sock *other)
485 {
486 	struct unix_sock *u, *u_other;
487 
488 	u = unix_sk(sk);
489 	u_other = unix_sk(other);
490 	spin_lock(&u_other->peer_wait.lock);
491 
492 	if (u->peer_wake.private == other) {
493 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
494 		u->peer_wake.private = NULL;
495 	}
496 
497 	spin_unlock(&u_other->peer_wait.lock);
498 }
499 
500 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
501 						   struct sock *other)
502 {
503 	unix_dgram_peer_wake_disconnect(sk, other);
504 	wake_up_interruptible_poll(sk_sleep(sk),
505 				   EPOLLOUT |
506 				   EPOLLWRNORM |
507 				   EPOLLWRBAND);
508 }
509 
510 /* preconditions:
511  *	- unix_peer(sk) == other
512  *	- association is stable
513  */
514 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
515 {
516 	int connected;
517 
518 	connected = unix_dgram_peer_wake_connect(sk, other);
519 
520 	/* If other is SOCK_DEAD, we want to make sure we signal
521 	 * POLLOUT, such that a subsequent write() can get a
522 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
523	 * to other and it's full, we will hang waiting for POLLOUT.
524 	 */
525 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
526 		return 1;
527 
528 	if (connected)
529 		unix_dgram_peer_wake_disconnect(sk, other);
530 
531 	return 0;
532 }
533 
534 static int unix_writable(const struct sock *sk)
535 {
536 	return sk->sk_state != TCP_LISTEN &&
537 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
538 }
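/* Worked example (illustrative): the shift by 2 means the socket counts as
 * writable while at most a quarter of sk_sndbuf is in flight.  With a
 * hypothetical sk_sndbuf of 212992 bytes, unix_writable() stays true as long
 * as sk_wmem_alloc <= 53248.
 */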
539 
540 static void unix_write_space(struct sock *sk)
541 {
542 	struct socket_wq *wq;
543 
544 	rcu_read_lock();
545 	if (unix_writable(sk)) {
546 		wq = rcu_dereference(sk->sk_wq);
547 		if (skwq_has_sleeper(wq))
548 			wake_up_interruptible_sync_poll(&wq->wait,
549 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
550 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
551 	}
552 	rcu_read_unlock();
553 }
554 
555 /* When a dgram socket disconnects (or changes its peer), we clear its receive
556  * queue of packets that arrived from the previous peer. First, this allows
557  * flow control based only on wmem_alloc; second, an sk connected to a peer
558  * may receive messages only from that peer. */
559 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
560 {
561 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
562 		skb_queue_purge(&sk->sk_receive_queue);
563 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
564 
565 		/* If one link of a bidirectional dgram pipe is disconnected,
566 		 * we signal an error. Messages are lost. Do not do this
567 		 * when the peer was not connected to us.
568 		 */
569 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
570 			WRITE_ONCE(other->sk_err, ECONNRESET);
571 			sk_error_report(other);
572 		}
573 	}
574 	other->sk_state = TCP_CLOSE;
575 }
576 
577 static void unix_sock_destructor(struct sock *sk)
578 {
579 	struct unix_sock *u = unix_sk(sk);
580 
581 	skb_queue_purge(&sk->sk_receive_queue);
582 
583 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
584 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
585 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
586 	if (!sock_flag(sk, SOCK_DEAD)) {
587 		pr_info("Attempt to release alive unix socket: %p\n", sk);
588 		return;
589 	}
590 
591 	if (u->addr)
592 		unix_release_addr(u->addr);
593 
594 	atomic_long_dec(&unix_nr_socks);
595 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
596 #ifdef UNIX_REFCNT_DEBUG
597 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
598 		atomic_long_read(&unix_nr_socks));
599 #endif
600 }
601 
602 static void unix_release_sock(struct sock *sk, int embrion)
603 {
604 	struct unix_sock *u = unix_sk(sk);
605 	struct sock *skpair;
606 	struct sk_buff *skb;
607 	struct path path;
608 	int state;
609 
610 	unix_remove_socket(sock_net(sk), sk);
611 	unix_remove_bsd_socket(sk);
612 
613 	/* Clear state */
614 	unix_state_lock(sk);
615 	sock_orphan(sk);
616 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
617 	path	     = u->path;
618 	u->path.dentry = NULL;
619 	u->path.mnt = NULL;
620 	state = sk->sk_state;
621 	sk->sk_state = TCP_CLOSE;
622 
623 	skpair = unix_peer(sk);
624 	unix_peer(sk) = NULL;
625 
626 	unix_state_unlock(sk);
627 
628 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
629 	if (u->oob_skb) {
630 		kfree_skb(u->oob_skb);
631 		u->oob_skb = NULL;
632 	}
633 #endif
634 
635 	wake_up_interruptible_all(&u->peer_wait);
636 
637 	if (skpair != NULL) {
638 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
639 			unix_state_lock(skpair);
640 			/* No more writes */
641 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
642 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
643 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
644 			unix_state_unlock(skpair);
645 			skpair->sk_state_change(skpair);
646 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
647 		}
648 
649 		unix_dgram_peer_wake_disconnect(sk, skpair);
650 		sock_put(skpair); /* It may now die */
651 	}
652 
653 	/* Try to flush out this socket. Throw out buffers at least */
654 
655 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
656 		if (state == TCP_LISTEN)
657 			unix_release_sock(skb->sk, 1);
658 		/* passed fds are erased in the kfree_skb hook	      */
659 		UNIXCB(skb).consumed = skb->len;
660 		kfree_skb(skb);
661 	}
662 
663 	if (path.dentry)
664 		path_put(&path);
665 
666 	sock_put(sk);
667 
668 	/* ---- Socket is dead now and most probably destroyed ---- */
669 
670 	/*
671 	 * Fixme: BSD difference: In BSD all sockets connected to us get
672 	 *	  ECONNRESET and we die on the spot. In Linux we behave
673 	 *	  like files and pipes do and wait for the last
674 	 *	  dereference.
675 	 *
676 	 * Can't we simply set sock->err?
677 	 *
678 	 *	  What does the above comment talk about? --ANK(980817)
679 	 */
680 
681 	if (READ_ONCE(unix_tot_inflight))
682 		unix_gc();		/* Garbage collect fds */
683 }
684 
685 static void init_peercred(struct sock *sk)
686 {
687 	const struct cred *old_cred;
688 	struct pid *old_pid;
689 
690 	spin_lock(&sk->sk_peer_lock);
691 	old_pid = sk->sk_peer_pid;
692 	old_cred = sk->sk_peer_cred;
693 	sk->sk_peer_pid  = get_pid(task_tgid(current));
694 	sk->sk_peer_cred = get_current_cred();
695 	spin_unlock(&sk->sk_peer_lock);
696 
697 	put_pid(old_pid);
698 	put_cred(old_cred);
699 }
700 
701 static void copy_peercred(struct sock *sk, struct sock *peersk)
702 {
703 	const struct cred *old_cred;
704 	struct pid *old_pid;
705 
706 	if (sk < peersk) {
707 		spin_lock(&sk->sk_peer_lock);
708 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
709 	} else {
710 		spin_lock(&peersk->sk_peer_lock);
711 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
712 	}
713 	old_pid = sk->sk_peer_pid;
714 	old_cred = sk->sk_peer_cred;
715 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
716 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
717 
718 	spin_unlock(&sk->sk_peer_lock);
719 	spin_unlock(&peersk->sk_peer_lock);
720 
721 	put_pid(old_pid);
722 	put_cred(old_cred);
723 }
724 
725 static int unix_listen(struct socket *sock, int backlog)
726 {
727 	int err;
728 	struct sock *sk = sock->sk;
729 	struct unix_sock *u = unix_sk(sk);
730 
731 	err = -EOPNOTSUPP;
732 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
733 		goto out;	/* Only stream/seqpacket sockets accept */
734 	err = -EINVAL;
735 	if (!READ_ONCE(u->addr))
736 		goto out;	/* No listens on an unbound socket */
737 	unix_state_lock(sk);
738 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
739 		goto out_unlock;
740 	if (backlog > sk->sk_max_ack_backlog)
741 		wake_up_interruptible_all(&u->peer_wait);
742 	sk->sk_max_ack_backlog	= backlog;
743 	sk->sk_state		= TCP_LISTEN;
744 	/* set credentials so connect can copy them */
745 	init_peercred(sk);
746 	err = 0;
747 
748 out_unlock:
749 	unix_state_unlock(sk);
750 out:
751 	return err;
752 }
753 
754 static int unix_release(struct socket *);
755 static int unix_bind(struct socket *, struct sockaddr *, int);
756 static int unix_stream_connect(struct socket *, struct sockaddr *,
757 			       int addr_len, int flags);
758 static int unix_socketpair(struct socket *, struct socket *);
759 static int unix_accept(struct socket *, struct socket *, int, bool);
760 static int unix_getname(struct socket *, struct sockaddr *, int);
761 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
762 static __poll_t unix_dgram_poll(struct file *, struct socket *,
763 				    poll_table *);
764 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
765 #ifdef CONFIG_COMPAT
766 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
767 #endif
768 static int unix_shutdown(struct socket *, int);
769 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
770 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
771 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
772 				       struct pipe_inode_info *, size_t size,
773 				       unsigned int flags);
774 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
775 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
776 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
777 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
778 static int unix_dgram_connect(struct socket *, struct sockaddr *,
779 			      int, int);
780 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
781 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
782 				  int);
783 
784 static int unix_set_peek_off(struct sock *sk, int val)
785 {
786 	struct unix_sock *u = unix_sk(sk);
787 
788 	if (mutex_lock_interruptible(&u->iolock))
789 		return -EINTR;
790 
791 	WRITE_ONCE(sk->sk_peek_off, val);
792 	mutex_unlock(&u->iolock);
793 
794 	return 0;
795 }
796 
797 #ifdef CONFIG_PROC_FS
798 static int unix_count_nr_fds(struct sock *sk)
799 {
800 	struct sk_buff *skb;
801 	struct unix_sock *u;
802 	int nr_fds = 0;
803 
804 	spin_lock(&sk->sk_receive_queue.lock);
805 	skb = skb_peek(&sk->sk_receive_queue);
806 	while (skb) {
807 		u = unix_sk(skb->sk);
808 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
809 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
810 	}
811 	spin_unlock(&sk->sk_receive_queue.lock);
812 
813 	return nr_fds;
814 }
815 
816 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
817 {
818 	struct sock *sk = sock->sk;
819 	unsigned char s_state;
820 	struct unix_sock *u;
821 	int nr_fds = 0;
822 
823 	if (sk) {
824 		s_state = READ_ONCE(sk->sk_state);
825 		u = unix_sk(sk);
826 
827 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
828 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
829 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
830 		 */
831 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
832 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
833 		else if (s_state == TCP_LISTEN)
834 			nr_fds = unix_count_nr_fds(sk);
835 
836 		seq_printf(m, "scm_fds: %u\n", nr_fds);
837 	}
838 }
839 #else
840 #define unix_show_fdinfo NULL
841 #endif
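/* Example of the resulting fdinfo output (illustrative): with two SCM_RIGHTS
 * descriptors queued but not yet received on the socket, the procfs entry
 * ends with:
 *
 *	$ cat /proc/<pid>/fdinfo/<fd>
 *	...
 *	scm_fds: 2
 */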
842 
843 static const struct proto_ops unix_stream_ops = {
844 	.family =	PF_UNIX,
845 	.owner =	THIS_MODULE,
846 	.release =	unix_release,
847 	.bind =		unix_bind,
848 	.connect =	unix_stream_connect,
849 	.socketpair =	unix_socketpair,
850 	.accept =	unix_accept,
851 	.getname =	unix_getname,
852 	.poll =		unix_poll,
853 	.ioctl =	unix_ioctl,
854 #ifdef CONFIG_COMPAT
855 	.compat_ioctl =	unix_compat_ioctl,
856 #endif
857 	.listen =	unix_listen,
858 	.shutdown =	unix_shutdown,
859 	.sendmsg =	unix_stream_sendmsg,
860 	.recvmsg =	unix_stream_recvmsg,
861 	.read_skb =	unix_stream_read_skb,
862 	.mmap =		sock_no_mmap,
863 	.splice_read =	unix_stream_splice_read,
864 	.set_peek_off =	unix_set_peek_off,
865 	.show_fdinfo =	unix_show_fdinfo,
866 };
867 
868 static const struct proto_ops unix_dgram_ops = {
869 	.family =	PF_UNIX,
870 	.owner =	THIS_MODULE,
871 	.release =	unix_release,
872 	.bind =		unix_bind,
873 	.connect =	unix_dgram_connect,
874 	.socketpair =	unix_socketpair,
875 	.accept =	sock_no_accept,
876 	.getname =	unix_getname,
877 	.poll =		unix_dgram_poll,
878 	.ioctl =	unix_ioctl,
879 #ifdef CONFIG_COMPAT
880 	.compat_ioctl =	unix_compat_ioctl,
881 #endif
882 	.listen =	sock_no_listen,
883 	.shutdown =	unix_shutdown,
884 	.sendmsg =	unix_dgram_sendmsg,
885 	.read_skb =	unix_read_skb,
886 	.recvmsg =	unix_dgram_recvmsg,
887 	.mmap =		sock_no_mmap,
888 	.set_peek_off =	unix_set_peek_off,
889 	.show_fdinfo =	unix_show_fdinfo,
890 };
891 
892 static const struct proto_ops unix_seqpacket_ops = {
893 	.family =	PF_UNIX,
894 	.owner =	THIS_MODULE,
895 	.release =	unix_release,
896 	.bind =		unix_bind,
897 	.connect =	unix_stream_connect,
898 	.socketpair =	unix_socketpair,
899 	.accept =	unix_accept,
900 	.getname =	unix_getname,
901 	.poll =		unix_dgram_poll,
902 	.ioctl =	unix_ioctl,
903 #ifdef CONFIG_COMPAT
904 	.compat_ioctl =	unix_compat_ioctl,
905 #endif
906 	.listen =	unix_listen,
907 	.shutdown =	unix_shutdown,
908 	.sendmsg =	unix_seqpacket_sendmsg,
909 	.recvmsg =	unix_seqpacket_recvmsg,
910 	.mmap =		sock_no_mmap,
911 	.set_peek_off =	unix_set_peek_off,
912 	.show_fdinfo =	unix_show_fdinfo,
913 };
914 
915 static void unix_close(struct sock *sk, long timeout)
916 {
917 	/* Nothing to do here, unix socket does not need a ->close().
918 	 * This is merely for sockmap.
919 	 */
920 }
921 
922 static void unix_unhash(struct sock *sk)
923 {
924 	/* Nothing to do here, unix socket does not need a ->unhash().
925 	 * This is merely for sockmap.
926 	 */
927 }
928 
929 static bool unix_bpf_bypass_getsockopt(int level, int optname)
930 {
931 	if (level == SOL_SOCKET) {
932 		switch (optname) {
933 		case SO_PEERPIDFD:
934 			return true;
935 		default:
936 			return false;
937 		}
938 	}
939 
940 	return false;
941 }
942 
943 struct proto unix_dgram_proto = {
944 	.name			= "UNIX",
945 	.owner			= THIS_MODULE,
946 	.obj_size		= sizeof(struct unix_sock),
947 	.close			= unix_close,
948 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
949 #ifdef CONFIG_BPF_SYSCALL
950 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
951 #endif
952 };
953 
954 struct proto unix_stream_proto = {
955 	.name			= "UNIX-STREAM",
956 	.owner			= THIS_MODULE,
957 	.obj_size		= sizeof(struct unix_sock),
958 	.close			= unix_close,
959 	.unhash			= unix_unhash,
960 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
961 #ifdef CONFIG_BPF_SYSCALL
962 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
963 #endif
964 };
965 
966 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
967 {
968 	struct unix_sock *u;
969 	struct sock *sk;
970 	int err;
971 
972 	atomic_long_inc(&unix_nr_socks);
973 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
974 		err = -ENFILE;
975 		goto err;
976 	}
977 
978 	if (type == SOCK_STREAM)
979 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
980 	else /* dgram and seqpacket */
981 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
982 
983 	if (!sk) {
984 		err = -ENOMEM;
985 		goto err;
986 	}
987 
988 	sock_init_data(sock, sk);
989 
990 	sk->sk_hash		= unix_unbound_hash(sk);
991 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
992 	sk->sk_write_space	= unix_write_space;
993 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
994 	sk->sk_destruct		= unix_sock_destructor;
995 	u = unix_sk(sk);
996 	u->inflight = 0;
997 	u->path.dentry = NULL;
998 	u->path.mnt = NULL;
999 	spin_lock_init(&u->lock);
1000 	INIT_LIST_HEAD(&u->link);
1001 	mutex_init(&u->iolock); /* single task reading lock */
1002 	mutex_init(&u->bindlock); /* single task binding lock */
1003 	init_waitqueue_head(&u->peer_wait);
1004 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1005 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1006 	unix_insert_unbound_socket(net, sk);
1007 
1008 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1009 
1010 	return sk;
1011 
1012 err:
1013 	atomic_long_dec(&unix_nr_socks);
1014 	return ERR_PTR(err);
1015 }
1016 
1017 static int unix_create(struct net *net, struct socket *sock, int protocol,
1018 		       int kern)
1019 {
1020 	struct sock *sk;
1021 
1022 	if (protocol && protocol != PF_UNIX)
1023 		return -EPROTONOSUPPORT;
1024 
1025 	sock->state = SS_UNCONNECTED;
1026 
1027 	switch (sock->type) {
1028 	case SOCK_STREAM:
1029 		sock->ops = &unix_stream_ops;
1030 		break;
1031 		/*
1032 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1033 		 *	Believe it or not, BSD has AF_UNIX, SOCK_RAW though
1034 		 */
1035 	case SOCK_RAW:
1036 		sock->type = SOCK_DGRAM;
1037 		fallthrough;
1038 	case SOCK_DGRAM:
1039 		sock->ops = &unix_dgram_ops;
1040 		break;
1041 	case SOCK_SEQPACKET:
1042 		sock->ops = &unix_seqpacket_ops;
1043 		break;
1044 	default:
1045 		return -ESOCKTNOSUPPORT;
1046 	}
1047 
1048 	sk = unix_create1(net, sock, kern, sock->type);
1049 	if (IS_ERR(sk))
1050 		return PTR_ERR(sk);
1051 
1052 	return 0;
1053 }
1054 
1055 static int unix_release(struct socket *sock)
1056 {
1057 	struct sock *sk = sock->sk;
1058 
1059 	if (!sk)
1060 		return 0;
1061 
1062 	sk->sk_prot->close(sk, 0);
1063 	unix_release_sock(sk, 0);
1064 	sock->sk = NULL;
1065 
1066 	return 0;
1067 }
1068 
1069 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1070 				  int type)
1071 {
1072 	struct inode *inode;
1073 	struct path path;
1074 	struct sock *sk;
1075 	int err;
1076 
1077 	unix_mkname_bsd(sunaddr, addr_len);
1078 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1079 	if (err)
1080 		goto fail;
1081 
1082 	err = path_permission(&path, MAY_WRITE);
1083 	if (err)
1084 		goto path_put;
1085 
1086 	err = -ECONNREFUSED;
1087 	inode = d_backing_inode(path.dentry);
1088 	if (!S_ISSOCK(inode->i_mode))
1089 		goto path_put;
1090 
1091 	sk = unix_find_socket_byinode(inode);
1092 	if (!sk)
1093 		goto path_put;
1094 
1095 	err = -EPROTOTYPE;
1096 	if (sk->sk_type == type)
1097 		touch_atime(&path);
1098 	else
1099 		goto sock_put;
1100 
1101 	path_put(&path);
1102 
1103 	return sk;
1104 
1105 sock_put:
1106 	sock_put(sk);
1107 path_put:
1108 	path_put(&path);
1109 fail:
1110 	return ERR_PTR(err);
1111 }
1112 
1113 static struct sock *unix_find_abstract(struct net *net,
1114 				       struct sockaddr_un *sunaddr,
1115 				       int addr_len, int type)
1116 {
1117 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1118 	struct dentry *dentry;
1119 	struct sock *sk;
1120 
1121 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1122 	if (!sk)
1123 		return ERR_PTR(-ECONNREFUSED);
1124 
1125 	dentry = unix_sk(sk)->path.dentry;
1126 	if (dentry)
1127 		touch_atime(&unix_sk(sk)->path);
1128 
1129 	return sk;
1130 }
1131 
1132 static struct sock *unix_find_other(struct net *net,
1133 				    struct sockaddr_un *sunaddr,
1134 				    int addr_len, int type)
1135 {
1136 	struct sock *sk;
1137 
1138 	if (sunaddr->sun_path[0])
1139 		sk = unix_find_bsd(sunaddr, addr_len, type);
1140 	else
1141 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1142 
1143 	return sk;
1144 }
1145 
1146 static int unix_autobind(struct sock *sk)
1147 {
1148 	struct unix_sock *u = unix_sk(sk);
1149 	unsigned int new_hash, old_hash;
1150 	struct net *net = sock_net(sk);
1151 	struct unix_address *addr;
1152 	u32 lastnum, ordernum;
1153 	int err;
1154 
1155 	err = mutex_lock_interruptible(&u->bindlock);
1156 	if (err)
1157 		return err;
1158 
1159 	if (u->addr)
1160 		goto out;
1161 
1162 	err = -ENOMEM;
1163 	addr = kzalloc(sizeof(*addr) +
1164 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1165 	if (!addr)
1166 		goto out;
1167 
1168 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1169 	addr->name->sun_family = AF_UNIX;
1170 	refcount_set(&addr->refcnt, 1);
1171 
1172 	old_hash = sk->sk_hash;
1173 	ordernum = get_random_u32();
1174 	lastnum = ordernum & 0xFFFFF;
1175 retry:
1176 	ordernum = (ordernum + 1) & 0xFFFFF;
1177 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1178 
1179 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1180 	unix_table_double_lock(net, old_hash, new_hash);
1181 
1182 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1183 		unix_table_double_unlock(net, old_hash, new_hash);
1184 
1185 		/* __unix_find_socket_byname() may take a long time if many names
1186 		 * are already in use.
1187 		 */
1188 		cond_resched();
1189 
1190 		if (ordernum == lastnum) {
1191 			/* Give up if all names seem to be in use. */
1192 			err = -ENOSPC;
1193 			unix_release_addr(addr);
1194 			goto out;
1195 		}
1196 
1197 		goto retry;
1198 	}
1199 
1200 	__unix_set_addr_hash(net, sk, addr, new_hash);
1201 	unix_table_double_unlock(net, old_hash, new_hash);
1202 	err = 0;
1203 
1204 out:	mutex_unlock(&u->bindlock);
1205 	return err;
1206 }
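/* Illustrative result of autobind: after bind() with only sun_family set
 * (addr_len == sizeof(sa_family_t)), or implicitly when sending from an
 * unbound socket with SO_PASSCRED/SO_PASSPIDFD set, the socket gets an
 * abstract name of a NUL byte followed by five hex digits:
 *
 *	struct sockaddr_un addr;
 *	socklen_t len = sizeof(addr);
 *
 *	getsockname(fd, (struct sockaddr *)&addr, &len);
 *	(now len == 8, addr.sun_path[0] == '\0', name e.g. "\0" "a3f7b")
 */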
1207 
1208 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1209 			 int addr_len)
1210 {
1211 	umode_t mode = S_IFSOCK |
1212 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1213 	struct unix_sock *u = unix_sk(sk);
1214 	unsigned int new_hash, old_hash;
1215 	struct net *net = sock_net(sk);
1216 	struct mnt_idmap *idmap;
1217 	struct unix_address *addr;
1218 	struct dentry *dentry;
1219 	struct path parent;
1220 	int err;
1221 
1222 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1223 	addr = unix_create_addr(sunaddr, addr_len);
1224 	if (!addr)
1225 		return -ENOMEM;
1226 
1227 	/*
1228 	 * Get the parent directory, calculate the hash for last
1229 	 * component.
1230 	 */
1231 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1232 	if (IS_ERR(dentry)) {
1233 		err = PTR_ERR(dentry);
1234 		goto out;
1235 	}
1236 
1237 	/*
1238 	 * All right, let's create it.
1239 	 */
1240 	idmap = mnt_idmap(parent.mnt);
1241 	err = security_path_mknod(&parent, dentry, mode, 0);
1242 	if (!err)
1243 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1244 	if (err)
1245 		goto out_path;
1246 	err = mutex_lock_interruptible(&u->bindlock);
1247 	if (err)
1248 		goto out_unlink;
1249 	if (u->addr)
1250 		goto out_unlock;
1251 
1252 	old_hash = sk->sk_hash;
1253 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1254 	unix_table_double_lock(net, old_hash, new_hash);
1255 	u->path.mnt = mntget(parent.mnt);
1256 	u->path.dentry = dget(dentry);
1257 	__unix_set_addr_hash(net, sk, addr, new_hash);
1258 	unix_table_double_unlock(net, old_hash, new_hash);
1259 	unix_insert_bsd_socket(sk);
1260 	mutex_unlock(&u->bindlock);
1261 	done_path_create(&parent, dentry);
1262 	return 0;
1263 
1264 out_unlock:
1265 	mutex_unlock(&u->bindlock);
1266 	err = -EINVAL;
1267 out_unlink:
1268 	/* failed after successful mknod?  unlink what we'd created... */
1269 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1270 out_path:
1271 	done_path_create(&parent, dentry);
1272 out:
1273 	unix_release_addr(addr);
1274 	return err == -EEXIST ? -EADDRINUSE : err;
1275 }
1276 
1277 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1278 			      int addr_len)
1279 {
1280 	struct unix_sock *u = unix_sk(sk);
1281 	unsigned int new_hash, old_hash;
1282 	struct net *net = sock_net(sk);
1283 	struct unix_address *addr;
1284 	int err;
1285 
1286 	addr = unix_create_addr(sunaddr, addr_len);
1287 	if (!addr)
1288 		return -ENOMEM;
1289 
1290 	err = mutex_lock_interruptible(&u->bindlock);
1291 	if (err)
1292 		goto out;
1293 
1294 	if (u->addr) {
1295 		err = -EINVAL;
1296 		goto out_mutex;
1297 	}
1298 
1299 	old_hash = sk->sk_hash;
1300 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1301 	unix_table_double_lock(net, old_hash, new_hash);
1302 
1303 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1304 		goto out_spin;
1305 
1306 	__unix_set_addr_hash(net, sk, addr, new_hash);
1307 	unix_table_double_unlock(net, old_hash, new_hash);
1308 	mutex_unlock(&u->bindlock);
1309 	return 0;
1310 
1311 out_spin:
1312 	unix_table_double_unlock(net, old_hash, new_hash);
1313 	err = -EADDRINUSE;
1314 out_mutex:
1315 	mutex_unlock(&u->bindlock);
1316 out:
1317 	unix_release_addr(addr);
1318 	return err;
1319 }
1320 
1321 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1322 {
1323 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1324 	struct sock *sk = sock->sk;
1325 	int err;
1326 
1327 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1328 	    sunaddr->sun_family == AF_UNIX)
1329 		return unix_autobind(sk);
1330 
1331 	err = unix_validate_addr(sunaddr, addr_len);
1332 	if (err)
1333 		return err;
1334 
1335 	if (sunaddr->sun_path[0])
1336 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1337 	else
1338 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1339 
1340 	return err;
1341 }
1342 
1343 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1344 {
1345 	if (unlikely(sk1 == sk2) || !sk2) {
1346 		unix_state_lock(sk1);
1347 		return;
1348 	}
1349 	if (sk1 > sk2)
1350 		swap(sk1, sk2);
1351 
1352 	unix_state_lock(sk1);
1353 	unix_state_lock_nested(sk2, U_LOCK_SECOND);
1354 }
1355 
1356 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1357 {
1358 	if (unlikely(sk1 == sk2) || !sk2) {
1359 		unix_state_unlock(sk1);
1360 		return;
1361 	}
1362 	unix_state_unlock(sk1);
1363 	unix_state_unlock(sk2);
1364 }
1365 
1366 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1367 			      int alen, int flags)
1368 {
1369 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1370 	struct sock *sk = sock->sk;
1371 	struct sock *other;
1372 	int err;
1373 
1374 	err = -EINVAL;
1375 	if (alen < offsetofend(struct sockaddr, sa_family))
1376 		goto out;
1377 
1378 	if (addr->sa_family != AF_UNSPEC) {
1379 		err = unix_validate_addr(sunaddr, alen);
1380 		if (err)
1381 			goto out;
1382 
1383 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1384 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1385 		    !READ_ONCE(unix_sk(sk)->addr)) {
1386 			err = unix_autobind(sk);
1387 			if (err)
1388 				goto out;
1389 		}
1390 
1391 restart:
1392 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1393 		if (IS_ERR(other)) {
1394 			err = PTR_ERR(other);
1395 			goto out;
1396 		}
1397 
1398 		unix_state_double_lock(sk, other);
1399 
1400 		/* Apparently VFS overslept socket death. Retry. */
1401 		if (sock_flag(other, SOCK_DEAD)) {
1402 			unix_state_double_unlock(sk, other);
1403 			sock_put(other);
1404 			goto restart;
1405 		}
1406 
1407 		err = -EPERM;
1408 		if (!unix_may_send(sk, other))
1409 			goto out_unlock;
1410 
1411 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1412 		if (err)
1413 			goto out_unlock;
1414 
1415 		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1416 	} else {
1417 		/*
1418 		 *	1003.1g breaking connected state with AF_UNSPEC
1419 		 */
1420 		other = NULL;
1421 		unix_state_double_lock(sk, other);
1422 	}
1423 
1424 	/*
1425 	 * If it was connected, reconnect.
1426 	 */
1427 	if (unix_peer(sk)) {
1428 		struct sock *old_peer = unix_peer(sk);
1429 
1430 		unix_peer(sk) = other;
1431 		if (!other)
1432 			sk->sk_state = TCP_CLOSE;
1433 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1434 
1435 		unix_state_double_unlock(sk, other);
1436 
1437 		if (other != old_peer)
1438 			unix_dgram_disconnected(sk, old_peer);
1439 		sock_put(old_peer);
1440 	} else {
1441 		unix_peer(sk) = other;
1442 		unix_state_double_unlock(sk, other);
1443 	}
1444 
1445 	return 0;
1446 
1447 out_unlock:
1448 	unix_state_double_unlock(sk, other);
1449 	sock_put(other);
1450 out:
1451 	return err;
1452 }
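/* Illustrative userspace sketch of the AF_UNSPEC branch above: "connecting"
 * a datagram socket to an AF_UNSPEC address dissolves an existing association
 * (1003.1g semantics) rather than creating one.
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *
 *	connect(fd, &sa, sizeof(sa));
 *	(fd is now unconnected; sendto() with an explicit address works again)
 */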
1453 
1454 static long unix_wait_for_peer(struct sock *other, long timeo)
1455 	__releases(&unix_sk(other)->lock)
1456 {
1457 	struct unix_sock *u = unix_sk(other);
1458 	int sched;
1459 	DEFINE_WAIT(wait);
1460 
1461 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1462 
1463 	sched = !sock_flag(other, SOCK_DEAD) &&
1464 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1465 		unix_recvq_full_lockless(other);
1466 
1467 	unix_state_unlock(other);
1468 
1469 	if (sched)
1470 		timeo = schedule_timeout(timeo);
1471 
1472 	finish_wait(&u->peer_wait, &wait);
1473 	return timeo;
1474 }
1475 
1476 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1477 			       int addr_len, int flags)
1478 {
1479 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1480 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1481 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1482 	struct net *net = sock_net(sk);
1483 	struct sk_buff *skb = NULL;
1484 	long timeo;
1485 	int err;
1486 	int st;
1487 
1488 	err = unix_validate_addr(sunaddr, addr_len);
1489 	if (err)
1490 		goto out;
1491 
1492 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1493 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1494 	    !READ_ONCE(u->addr)) {
1495 		err = unix_autobind(sk);
1496 		if (err)
1497 			goto out;
1498 	}
1499 
1500 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1501 
1502 	/* First of all, allocate resources.
1503 	   If we do it after the state is locked,
1504 	   we will have to recheck everything again in any case.
1505 	 */
1506 
1507 	/* create new sock for complete connection */
1508 	newsk = unix_create1(net, NULL, 0, sock->type);
1509 	if (IS_ERR(newsk)) {
1510 		err = PTR_ERR(newsk);
1511 		newsk = NULL;
1512 		goto out;
1513 	}
1514 
1515 	err = -ENOMEM;
1516 
1517 	/* Allocate skb for sending to listening sock */
1518 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1519 	if (skb == NULL)
1520 		goto out;
1521 
1522 restart:
1523 	/*  Find listening sock. */
1524 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1525 	if (IS_ERR(other)) {
1526 		err = PTR_ERR(other);
1527 		other = NULL;
1528 		goto out;
1529 	}
1530 
1531 	/* Latch state of peer */
1532 	unix_state_lock(other);
1533 
1534 	/* Apparently VFS overslept socket death. Retry. */
1535 	if (sock_flag(other, SOCK_DEAD)) {
1536 		unix_state_unlock(other);
1537 		sock_put(other);
1538 		goto restart;
1539 	}
1540 
1541 	err = -ECONNREFUSED;
1542 	if (other->sk_state != TCP_LISTEN)
1543 		goto out_unlock;
1544 	if (other->sk_shutdown & RCV_SHUTDOWN)
1545 		goto out_unlock;
1546 
1547 	if (unix_recvq_full(other)) {
1548 		err = -EAGAIN;
1549 		if (!timeo)
1550 			goto out_unlock;
1551 
1552 		timeo = unix_wait_for_peer(other, timeo);
1553 
1554 		err = sock_intr_errno(timeo);
1555 		if (signal_pending(current))
1556 			goto out;
1557 		sock_put(other);
1558 		goto restart;
1559 	}
1560 
1561 	/* Latch our state.
1562 
1563 	   This is a tricky place. We need to grab our state lock and cannot
1564 	   drop the lock on the peer. It is dangerous because a deadlock is
1565 	   possible. The connect-to-self case and simultaneous
1566 	   attempts to connect are eliminated by checking the socket
1567 	   state. other is TCP_LISTEN; if sk is TCP_LISTEN we
1568 	   check this before attempting to grab the lock.
1569 
1570 	   Well, and we have to recheck the state after the socket is locked.
1571 	 */
1572 	st = sk->sk_state;
1573 
1574 	switch (st) {
1575 	case TCP_CLOSE:
1576 		/* This is ok... continue with connect */
1577 		break;
1578 	case TCP_ESTABLISHED:
1579 		/* Socket is already connected */
1580 		err = -EISCONN;
1581 		goto out_unlock;
1582 	default:
1583 		err = -EINVAL;
1584 		goto out_unlock;
1585 	}
1586 
1587 	unix_state_lock_nested(sk, U_LOCK_SECOND);
1588 
1589 	if (sk->sk_state != st) {
1590 		unix_state_unlock(sk);
1591 		unix_state_unlock(other);
1592 		sock_put(other);
1593 		goto restart;
1594 	}
1595 
1596 	err = security_unix_stream_connect(sk, other, newsk);
1597 	if (err) {
1598 		unix_state_unlock(sk);
1599 		goto out_unlock;
1600 	}
1601 
1602 	/* The way is open! Quickly set all the necessary fields... */
1603 
1604 	sock_hold(sk);
1605 	unix_peer(newsk)	= sk;
1606 	newsk->sk_state		= TCP_ESTABLISHED;
1607 	newsk->sk_type		= sk->sk_type;
1608 	init_peercred(newsk);
1609 	newu = unix_sk(newsk);
1610 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1611 	otheru = unix_sk(other);
1612 
1613 	/* copy address information from listening to new sock
1614 	 *
1615 	 * The contents of *(otheru->addr) and otheru->path
1616 	 * are seen fully set up here, since we have found
1617 	 * otheru in hash under its lock.  Insertion into the
1618 	 * hash chain we'd found it in had been done in an
1619 	 * earlier critical area protected by the chain's lock,
1620 	 * the same one where we'd set *(otheru->addr) contents,
1621 	 * as well as otheru->path and otheru->addr itself.
1622 	 *
1623 	 * Using smp_store_release() here to set newu->addr
1624 	 * is enough to make those stores, as well as stores
1625 	 * to newu->path visible to anyone who gets newu->addr
1626 	 * by smp_load_acquire().  IOW, the same warranties
1627 	 * as for unix_sock instances bound in unix_bind() or
1628 	 * in unix_autobind().
1629 	 */
1630 	if (otheru->path.dentry) {
1631 		path_get(&otheru->path);
1632 		newu->path = otheru->path;
1633 	}
1634 	refcount_inc(&otheru->addr->refcnt);
1635 	smp_store_release(&newu->addr, otheru->addr);
1636 
1637 	/* Set credentials */
1638 	copy_peercred(sk, other);
1639 
1640 	sock->state	= SS_CONNECTED;
1641 	sk->sk_state	= TCP_ESTABLISHED;
1642 	sock_hold(newsk);
1643 
1644 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1645 	unix_peer(sk)	= newsk;
1646 
1647 	unix_state_unlock(sk);
1648 
1649 	/* take ten and send info to listening sock */
1650 	spin_lock(&other->sk_receive_queue.lock);
1651 	__skb_queue_tail(&other->sk_receive_queue, skb);
1652 	spin_unlock(&other->sk_receive_queue.lock);
1653 	unix_state_unlock(other);
1654 	other->sk_data_ready(other);
1655 	sock_put(other);
1656 	return 0;
1657 
1658 out_unlock:
1659 	if (other)
1660 		unix_state_unlock(other);
1661 
1662 out:
1663 	kfree_skb(skb);
1664 	if (newsk)
1665 		unix_release_sock(newsk, 0);
1666 	if (other)
1667 		sock_put(other);
1668 	return err;
1669 }
1670 
1671 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1672 {
1673 	struct sock *ska = socka->sk, *skb = sockb->sk;
1674 
1675 	/* Join our sockets back to back */
1676 	sock_hold(ska);
1677 	sock_hold(skb);
1678 	unix_peer(ska) = skb;
1679 	unix_peer(skb) = ska;
1680 	init_peercred(ska);
1681 	init_peercred(skb);
1682 
1683 	ska->sk_state = TCP_ESTABLISHED;
1684 	skb->sk_state = TCP_ESTABLISHED;
1685 	socka->state  = SS_CONNECTED;
1686 	sockb->state  = SS_CONNECTED;
1687 	return 0;
1688 }
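/* Typical userspace use of the above (illustrative): socketpair() yields two
 * already-connected AF_UNIX sockets with peer credentials set, commonly split
 * across fork() to give parent and child a private bidirectional channel.
 *
 *	int sv[2];
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	if (fork() == 0)
 *		close(sv[0]);		(the child talks on sv[1])
 *	else
 *		close(sv[1]);		(the parent talks on sv[0])
 */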
1689 
1690 static void unix_sock_inherit_flags(const struct socket *old,
1691 				    struct socket *new)
1692 {
1693 	if (test_bit(SOCK_PASSCRED, &old->flags))
1694 		set_bit(SOCK_PASSCRED, &new->flags);
1695 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1696 		set_bit(SOCK_PASSPIDFD, &new->flags);
1697 	if (test_bit(SOCK_PASSSEC, &old->flags))
1698 		set_bit(SOCK_PASSSEC, &new->flags);
1699 }
1700 
1701 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1702 		       bool kern)
1703 {
1704 	struct sock *sk = sock->sk;
1705 	struct sock *tsk;
1706 	struct sk_buff *skb;
1707 	int err;
1708 
1709 	err = -EOPNOTSUPP;
1710 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1711 		goto out;
1712 
1713 	err = -EINVAL;
1714 	if (sk->sk_state != TCP_LISTEN)
1715 		goto out;
1716 
1717 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1718 	 * so no locks are necessary.
1719 	 */
1720 
1721 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1722 				&err);
1723 	if (!skb) {
1724 		/* This means receive shutdown. */
1725 		if (err == 0)
1726 			err = -EINVAL;
1727 		goto out;
1728 	}
1729 
1730 	tsk = skb->sk;
1731 	skb_free_datagram(sk, skb);
1732 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1733 
1734 	/* attach accepted sock to socket */
1735 	unix_state_lock(tsk);
1736 	newsock->state = SS_CONNECTED;
1737 	unix_sock_inherit_flags(sock, newsock);
1738 	sock_graft(tsk, newsock);
1739 	unix_state_unlock(tsk);
1740 	return 0;
1741 
1742 out:
1743 	return err;
1744 }
1745 
1746 
1747 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1748 {
1749 	struct sock *sk = sock->sk;
1750 	struct unix_address *addr;
1751 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1752 	int err = 0;
1753 
1754 	if (peer) {
1755 		sk = unix_peer_get(sk);
1756 
1757 		err = -ENOTCONN;
1758 		if (!sk)
1759 			goto out;
1760 		err = 0;
1761 	} else {
1762 		sock_hold(sk);
1763 	}
1764 
1765 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1766 	if (!addr) {
1767 		sunaddr->sun_family = AF_UNIX;
1768 		sunaddr->sun_path[0] = 0;
1769 		err = offsetof(struct sockaddr_un, sun_path);
1770 	} else {
1771 		err = addr->len;
1772 		memcpy(sunaddr, addr->name, addr->len);
1773 	}
1774 	sock_put(sk);
1775 out:
1776 	return err;
1777 }
1778 
1779 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1780 {
1781 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1782 
1783 	/*
1784 	 * Garbage collection of unix sockets starts by selecting a set of
1785 	 * candidate sockets which are referenced only by being in flight
1786 	 * (total_refs == inflight_refs).  This condition is checked once during
1787 	 * the candidate collection phase, and candidates are marked as such, so
1788 	 * that non-candidates can later be ignored.  While inflight_refs is
1789 	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1790 	 * is an instantaneous decision.
1791 	 *
1792 	 * Once a candidate, however, the socket must not be reinstalled into a
1793 	 * file descriptor while the garbage collection is in progress.
1794 	 *
1795 	 * If the above conditions are met, then the directed graph of
1796 	 * candidates (*) does not change while unix_gc_lock is held.
1797 	 *
1798 	 * Any operation that changes the file count through file descriptors
1799 	 * (dup, close, sendmsg) does not change the graph since candidates are
1800 	 * not installed in fds.
1801 	 *
1802 	 * Dequeuing a candidate via recvmsg would install it into an fd, but
1803 	 * that takes unix_gc_lock to decrement the inflight count, so it's
1804 	 * serialized with garbage collection.
1805 	 *
1806 	 * MSG_PEEK is special in that it does not change the inflight count,
1807 	 * yet does install the socket into an fd.  The following lock/unlock
1808 	 * pair is to ensure serialization with garbage collection.  It must be
1809 	 * done between incrementing the file count and installing the file into
1810 	 * an fd.
1811 	 *
1812 	 * If garbage collection starts after the barrier provided by the
1813 	 * lock/unlock, then it will see the elevated refcount and not mark this
1814 	 * as a candidate.  If a garbage collection is already in progress
1815 	 * before the file count was incremented, then the lock/unlock pair will
1816 	 * ensure that garbage collection is finished before progressing to
1817 	 * installing the fd.
1818 	 *
1819 	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1820 	 * which is on the queue of listening socket A.
1821 	 */
1822 	spin_lock(&unix_gc_lock);
1823 	spin_unlock(&unix_gc_lock);
1824 }
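/* Illustrative consequence for userspace: peeking at a message that carries
 * SCM_RIGHTS installs the passed descriptors into the receiving process even
 * though the message stays queued, so a later non-peek read installs fresh
 * duplicates of the same files:
 *
 *	recvmsg(fd, &msg, MSG_PEEK);	(fds installed, message still queued)
 *	recvmsg(fd, &msg, 0);		(fds installed a second time)
 */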
1825 
1826 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1827 {
1828 	int err = 0;
1829 
1830 	UNIXCB(skb).pid  = get_pid(scm->pid);
1831 	UNIXCB(skb).uid = scm->creds.uid;
1832 	UNIXCB(skb).gid = scm->creds.gid;
1833 	UNIXCB(skb).fp = NULL;
1834 	unix_get_secdata(scm, skb);
1835 	if (scm->fp && send_fds)
1836 		err = unix_attach_fds(scm, skb);
1837 
1838 	skb->destructor = unix_destruct_scm;
1839 	return err;
1840 }
1841 
1842 static bool unix_passcred_enabled(const struct socket *sock,
1843 				  const struct sock *other)
1844 {
1845 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1846 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1847 	       !other->sk_socket ||
1848 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1849 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1850 }
1851 
1852 /*
1853  * Some apps rely on write() giving SCM_CREDENTIALS.
1854  * We include credentials if source or destination socket
1855  * asserted SOCK_PASSCRED.
1856  */
1857 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1858 			    const struct sock *other)
1859 {
1860 	if (UNIXCB(skb).pid)
1861 		return;
1862 	if (unix_passcred_enabled(sock, other)) {
1863 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1864 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1865 	}
1866 }
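/* Illustrative receiver-side sketch: with SO_PASSCRED enabled, recvmsg()
 * carries an SCM_CREDENTIALS control message built from the pid/uid/gid
 * stored in UNIXCB() above, even when the sender used plain write():
 *
 *	int on = 1;
 *	struct cmsghdr *cmsg;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	recvmsg(fd, &msg, 0);
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	if (cmsg && cmsg->cmsg_level == SOL_SOCKET &&
 *	    cmsg->cmsg_type == SCM_CREDENTIALS) {
 *		struct ucred *uc = (struct ucred *)CMSG_DATA(cmsg);
 *		(uc->pid, uc->uid, uc->gid identify the sender)
 *	}
 */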
1867 
1868 static bool unix_skb_scm_eq(struct sk_buff *skb,
1869 			    struct scm_cookie *scm)
1870 {
1871 	return UNIXCB(skb).pid == scm->pid &&
1872 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1873 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1874 	       unix_secdata_eq(scm, skb);
1875 }
1876 
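/*
 * scm_stat_add()/scm_stat_del() track how many SCM_RIGHTS file descriptors
 * are currently queued on a socket's receive queue (u->scm_stat.nr_fds).
 */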
1877 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1878 {
1879 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1880 	struct unix_sock *u = unix_sk(sk);
1881 
1882 	if (unlikely(fp && fp->count))
1883 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1884 }
1885 
1886 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1887 {
1888 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1889 	struct unix_sock *u = unix_sk(sk);
1890 
1891 	if (unlikely(fp && fp->count))
1892 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1893 }
1894 
1895 /*
1896  *	Send AF_UNIX data.
1897  */
1898 
1899 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1900 			      size_t len)
1901 {
1902 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1903 	struct sock *sk = sock->sk, *other = NULL;
1904 	struct unix_sock *u = unix_sk(sk);
1905 	struct scm_cookie scm;
1906 	struct sk_buff *skb;
1907 	int data_len = 0;
1908 	int sk_locked;
1909 	long timeo;
1910 	int err;
1911 
1912 	wait_for_unix_gc();
1913 	err = scm_send(sock, msg, &scm, false);
1914 	if (err < 0)
1915 		return err;
1916 
1917 	err = -EOPNOTSUPP;
1918 	if (msg->msg_flags&MSG_OOB)
1919 		goto out;
1920 
1921 	if (msg->msg_namelen) {
1922 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1923 		if (err)
1924 			goto out;
1925 	} else {
1926 		sunaddr = NULL;
1927 		err = -ENOTCONN;
1928 		other = unix_peer_get(sk);
1929 		if (!other)
1930 			goto out;
1931 	}
1932 
1933 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1934 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1935 	    !READ_ONCE(u->addr)) {
1936 		err = unix_autobind(sk);
1937 		if (err)
1938 			goto out;
1939 	}
1940 
1941 	err = -EMSGSIZE;
1942 	if (len > sk->sk_sndbuf - 32)
1943 		goto out;
1944 
1945 	if (len > SKB_MAX_ALLOC) {
1946 		data_len = min_t(size_t,
1947 				 len - SKB_MAX_ALLOC,
1948 				 MAX_SKB_FRAGS * PAGE_SIZE);
1949 		data_len = PAGE_ALIGN(data_len);
1950 
1951 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1952 	}
1953 
1954 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1955 				   msg->msg_flags & MSG_DONTWAIT, &err,
1956 				   PAGE_ALLOC_COSTLY_ORDER);
1957 	if (skb == NULL)
1958 		goto out;
1959 
1960 	err = unix_scm_to_skb(&scm, skb, true);
1961 	if (err < 0)
1962 		goto out_free;
1963 
1964 	skb_put(skb, len - data_len);
1965 	skb->data_len = data_len;
1966 	skb->len = len;
1967 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1968 	if (err)
1969 		goto out_free;
1970 
1971 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1972 
1973 restart:
1974 	if (!other) {
1975 		err = -ECONNRESET;
1976 		if (sunaddr == NULL)
1977 			goto out_free;
1978 
1979 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1980 					sk->sk_type);
1981 		if (IS_ERR(other)) {
1982 			err = PTR_ERR(other);
1983 			other = NULL;
1984 			goto out_free;
1985 		}
1986 	}
1987 
1988 	if (sk_filter(other, skb) < 0) {
1989 		/* Toss the packet but do not return any error to the sender */
1990 		err = len;
1991 		goto out_free;
1992 	}
1993 
1994 	sk_locked = 0;
1995 	unix_state_lock(other);
1996 restart_locked:
1997 	err = -EPERM;
1998 	if (!unix_may_send(sk, other))
1999 		goto out_unlock;
2000 
2001 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2002 		/*
2003 		 *	Check with POSIX 1003.1g - what should
2004 		 *	a datagram error return here?
2005 		 */
2006 		unix_state_unlock(other);
2007 		sock_put(other);
2008 
2009 		if (!sk_locked)
2010 			unix_state_lock(sk);
2011 
2012 		err = 0;
2013 		if (sk->sk_type == SOCK_SEQPACKET) {
2014 			/* We are here only when racing with unix_release_sock(),
2015 			 * which is clearing @other. Never change the state to
2016 			 * TCP_CLOSE, unlike the SOCK_DGRAM case.
2017 			 */
2018 			unix_state_unlock(sk);
2019 			err = -EPIPE;
2020 		} else if (unix_peer(sk) == other) {
2021 			unix_peer(sk) = NULL;
2022 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2023 
2024 			sk->sk_state = TCP_CLOSE;
2025 			unix_state_unlock(sk);
2026 
2027 			unix_dgram_disconnected(sk, other);
2028 			sock_put(other);
2029 			err = -ECONNREFUSED;
2030 		} else {
2031 			unix_state_unlock(sk);
2032 		}
2033 
2034 		other = NULL;
2035 		if (err)
2036 			goto out_free;
2037 		goto restart;
2038 	}
2039 
2040 	err = -EPIPE;
2041 	if (other->sk_shutdown & RCV_SHUTDOWN)
2042 		goto out_unlock;
2043 
2044 	if (sk->sk_type != SOCK_SEQPACKET) {
2045 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2046 		if (err)
2047 			goto out_unlock;
2048 	}
2049 
2050 	/* other == sk && unix_peer(other) != sk is possible if
2051 	 * - unix_peer(sk) == NULL, destination address bound to sk
2052 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
2053 	 */
2054 	if (other != sk &&
2055 	    unlikely(unix_peer(other) != sk &&
2056 	    unix_recvq_full_lockless(other))) {
2057 		if (timeo) {
2058 			timeo = unix_wait_for_peer(other, timeo);
2059 
2060 			err = sock_intr_errno(timeo);
2061 			if (signal_pending(current))
2062 				goto out_free;
2063 
2064 			goto restart;
2065 		}
2066 
2067 		if (!sk_locked) {
2068 			unix_state_unlock(other);
2069 			unix_state_double_lock(sk, other);
2070 		}
2071 
2072 		if (unix_peer(sk) != other ||
2073 		    unix_dgram_peer_wake_me(sk, other)) {
2074 			err = -EAGAIN;
2075 			sk_locked = 1;
2076 			goto out_unlock;
2077 		}
2078 
2079 		if (!sk_locked) {
2080 			sk_locked = 1;
2081 			goto restart_locked;
2082 		}
2083 	}
2084 
2085 	if (unlikely(sk_locked))
2086 		unix_state_unlock(sk);
2087 
2088 	if (sock_flag(other, SOCK_RCVTSTAMP))
2089 		__net_timestamp(skb);
2090 	maybe_add_creds(skb, sock, other);
2091 	scm_stat_add(other, skb);
2092 	skb_queue_tail(&other->sk_receive_queue, skb);
2093 	unix_state_unlock(other);
2094 	other->sk_data_ready(other);
2095 	sock_put(other);
2096 	scm_destroy(&scm);
2097 	return len;
2098 
2099 out_unlock:
2100 	if (sk_locked)
2101 		unix_state_unlock(sk);
2102 	unix_state_unlock(other);
2103 out_free:
2104 	kfree_skb(skb);
2105 out:
2106 	if (other)
2107 		sock_put(other);
2108 	scm_destroy(&scm);
2109 	return err;
2110 }
2111 
2112 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2113  * bytes, with a minimum of a full page.
2114  */
2115 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2116 
2117 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
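/*
 * Queue a single out-of-band byte on @other's receive queue, replacing any
 * previously pending OOB skb, and raise SIGURG on the receiver.  From user
 * space this path is reached by something like send(fd, &c, 1, MSG_OOB) on
 * a connected SOCK_STREAM AF_UNIX socket (illustrative usage only).
 */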
2118 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2119 		     struct scm_cookie *scm, bool fds_sent)
2120 {
2121 	struct unix_sock *ousk = unix_sk(other);
2122 	struct sk_buff *skb;
2123 	int err = 0;
2124 
2125 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2126 
2127 	if (!skb)
2128 		return err;
2129 
2130 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2131 	if (err < 0) {
2132 		kfree_skb(skb);
2133 		return err;
2134 	}
2135 	skb_put(skb, 1);
2136 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2137 
2138 	if (err) {
2139 		kfree_skb(skb);
2140 		return err;
2141 	}
2142 
2143 	unix_state_lock(other);
2144 
2145 	if (sock_flag(other, SOCK_DEAD) ||
2146 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2147 		unix_state_unlock(other);
2148 		kfree_skb(skb);
2149 		return -EPIPE;
2150 	}
2151 
2152 	maybe_add_creds(skb, sock, other);
2153 	skb_get(skb);
2154 
2155 	scm_stat_add(other, skb);
2156 
2157 	spin_lock(&other->sk_receive_queue.lock);
2158 	if (ousk->oob_skb)
2159 		consume_skb(ousk->oob_skb);
2160 	WRITE_ONCE(ousk->oob_skb, skb);
2161 	__skb_queue_tail(&other->sk_receive_queue, skb);
2162 	spin_unlock(&other->sk_receive_queue.lock);
2163 
2164 	sk_send_sigurg(other);
2165 	unix_state_unlock(other);
2166 	other->sk_data_ready(other);
2167 
2168 	return err;
2169 }
2170 #endif
2171 
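/*
 * Send stream data: the payload is chopped into skbs (on the normal copy
 * path bounded by half the send buffer and UNIX_SKB_FRAGS_SZ) and queued
 * directly on the peer's receive queue.  File descriptors from the scm
 * cookie are attached to the first skb only; an optional MSG_OOB byte is
 * queued last via queue_oob().
 */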
2172 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2173 			       size_t len)
2174 {
2175 	struct sock *sk = sock->sk;
2176 	struct sock *other = NULL;
2177 	int err, size;
2178 	struct sk_buff *skb;
2179 	int sent = 0;
2180 	struct scm_cookie scm;
2181 	bool fds_sent = false;
2182 	int data_len;
2183 
2184 	wait_for_unix_gc();
2185 	err = scm_send(sock, msg, &scm, false);
2186 	if (err < 0)
2187 		return err;
2188 
2189 	err = -EOPNOTSUPP;
2190 	if (msg->msg_flags & MSG_OOB) {
2191 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2192 		if (len)
2193 			len--;
2194 		else
2195 #endif
2196 			goto out_err;
2197 	}
2198 
2199 	if (msg->msg_namelen) {
2200 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2201 		goto out_err;
2202 	} else {
2203 		err = -ENOTCONN;
2204 		other = unix_peer(sk);
2205 		if (!other)
2206 			goto out_err;
2207 	}
2208 
2209 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2210 		goto pipe_err;
2211 
2212 	while (sent < len) {
2213 		size = len - sent;
2214 
2215 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2216 			skb = sock_alloc_send_pskb(sk, 0, 0,
2217 						   msg->msg_flags & MSG_DONTWAIT,
2218 						   &err, 0);
2219 		} else {
2220 			/* Keep two messages in the pipe so it schedules better */
2221 			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2222 
2223 			/* allow fallback to order-0 allocations */
2224 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2225 
2226 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2227 
2228 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2229 
2230 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2231 						   msg->msg_flags & MSG_DONTWAIT, &err,
2232 						   get_order(UNIX_SKB_FRAGS_SZ));
2233 		}
2234 		if (!skb)
2235 			goto out_err;
2236 
2237 		/* Only send the fds in the first buffer */
2238 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2239 		if (err < 0) {
2240 			kfree_skb(skb);
2241 			goto out_err;
2242 		}
2243 		fds_sent = true;
2244 
2245 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2246 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2247 						   sk->sk_allocation);
2248 			if (err < 0) {
2249 				kfree_skb(skb);
2250 				goto out_err;
2251 			}
2252 			size = err;
2253 			refcount_add(size, &sk->sk_wmem_alloc);
2254 		} else {
2255 			skb_put(skb, size - data_len);
2256 			skb->data_len = data_len;
2257 			skb->len = size;
2258 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2259 			if (err) {
2260 				kfree_skb(skb);
2261 				goto out_err;
2262 			}
2263 		}
2264 
2265 		unix_state_lock(other);
2266 
2267 		if (sock_flag(other, SOCK_DEAD) ||
2268 		    (other->sk_shutdown & RCV_SHUTDOWN))
2269 			goto pipe_err_free;
2270 
2271 		maybe_add_creds(skb, sock, other);
2272 		scm_stat_add(other, skb);
2273 		skb_queue_tail(&other->sk_receive_queue, skb);
2274 		unix_state_unlock(other);
2275 		other->sk_data_ready(other);
2276 		sent += size;
2277 	}
2278 
2279 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2280 	if (msg->msg_flags & MSG_OOB) {
2281 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2282 		if (err)
2283 			goto out_err;
2284 		sent++;
2285 	}
2286 #endif
2287 
2288 	scm_destroy(&scm);
2289 
2290 	return sent;
2291 
2292 pipe_err_free:
2293 	unix_state_unlock(other);
2294 	kfree_skb(skb);
2295 pipe_err:
2296 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2297 		send_sig(SIGPIPE, current, 0);
2298 	err = -EPIPE;
2299 out_err:
2300 	scm_destroy(&scm);
2301 	return sent ? : err;
2302 }
2303 
2304 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2305 				  size_t len)
2306 {
2307 	int err;
2308 	struct sock *sk = sock->sk;
2309 
2310 	err = sock_error(sk);
2311 	if (err)
2312 		return err;
2313 
2314 	if (sk->sk_state != TCP_ESTABLISHED)
2315 		return -ENOTCONN;
2316 
2317 	if (msg->msg_namelen)
2318 		msg->msg_namelen = 0;
2319 
2320 	return unix_dgram_sendmsg(sock, msg, len);
2321 }
2322 
2323 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2324 				  size_t size, int flags)
2325 {
2326 	struct sock *sk = sock->sk;
2327 
2328 	if (sk->sk_state != TCP_ESTABLISHED)
2329 		return -ENOTCONN;
2330 
2331 	return unix_dgram_recvmsg(sock, msg, size, flags);
2332 }
2333 
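/* Copy the socket's bound address, if it has one, into msg_name. */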
2334 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2335 {
2336 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2337 
2338 	if (addr) {
2339 		msg->msg_namelen = addr->len;
2340 		memcpy(msg->msg_name, addr->name, addr->len);
2341 	}
2342 }
2343 
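/*
 * Receive one datagram: dequeue (or, with MSG_PEEK, peek at) the next skb
 * and copy its payload, sender address, timestamp, credentials and any
 * passed file descriptors to the caller, honouring MSG_TRUNC and the peek
 * offset.
 */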
2344 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2345 			 int flags)
2346 {
2347 	struct scm_cookie scm;
2348 	struct socket *sock = sk->sk_socket;
2349 	struct unix_sock *u = unix_sk(sk);
2350 	struct sk_buff *skb, *last;
2351 	long timeo;
2352 	int skip;
2353 	int err;
2354 
2355 	err = -EOPNOTSUPP;
2356 	if (flags&MSG_OOB)
2357 		goto out;
2358 
2359 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2360 
2361 	do {
2362 		mutex_lock(&u->iolock);
2363 
2364 		skip = sk_peek_offset(sk, flags);
2365 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2366 					      &skip, &err, &last);
2367 		if (skb) {
2368 			if (!(flags & MSG_PEEK))
2369 				scm_stat_del(sk, skb);
2370 			break;
2371 		}
2372 
2373 		mutex_unlock(&u->iolock);
2374 
2375 		if (err != -EAGAIN)
2376 			break;
2377 	} while (timeo &&
2378 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2379 					      &err, &timeo, last));
2380 
2381 	if (!skb) { /* implies iolock unlocked */
2382 		unix_state_lock(sk);
2383 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2384 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2385 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2386 			err = 0;
2387 		unix_state_unlock(sk);
2388 		goto out;
2389 	}
2390 
2391 	if (wq_has_sleeper(&u->peer_wait))
2392 		wake_up_interruptible_sync_poll(&u->peer_wait,
2393 						EPOLLOUT | EPOLLWRNORM |
2394 						EPOLLWRBAND);
2395 
2396 	if (msg->msg_name)
2397 		unix_copy_addr(msg, skb->sk);
2398 
2399 	if (size > skb->len - skip)
2400 		size = skb->len - skip;
2401 	else if (size < skb->len - skip)
2402 		msg->msg_flags |= MSG_TRUNC;
2403 
2404 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2405 	if (err)
2406 		goto out_free;
2407 
2408 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2409 		__sock_recv_timestamp(msg, sk, skb);
2410 
2411 	memset(&scm, 0, sizeof(scm));
2412 
2413 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2414 	unix_set_secdata(&scm, skb);
2415 
2416 	if (!(flags & MSG_PEEK)) {
2417 		if (UNIXCB(skb).fp)
2418 			unix_detach_fds(&scm, skb);
2419 
2420 		sk_peek_offset_bwd(sk, skb->len);
2421 	} else {
2422 		/* It is questionable: on PEEK we could:
2423 		   - not return fds - good, but too simple 8)
2424 		   - return fds, and not return them on read (old strategy,
2425 		     apparently wrong)
2426 		   - clone fds (chosen for now; it is the most universal
2427 		     solution)
2428 
2429 		   POSIX 1003.1g does not actually define this clearly
2430 		   at all. POSIX 1003.1g doesn't define a lot of things
2431 		   clearly however!
2432 
2433 		*/
2434 
2435 		sk_peek_offset_fwd(sk, size);
2436 
2437 		if (UNIXCB(skb).fp)
2438 			unix_peek_fds(&scm, skb);
2439 	}
2440 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2441 
2442 	scm_recv_unix(sock, msg, &scm, flags);
2443 
2444 out_free:
2445 	skb_free_datagram(sk, skb);
2446 	mutex_unlock(&u->iolock);
2447 out:
2448 	return err;
2449 }
2450 
2451 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2452 			      int flags)
2453 {
2454 	struct sock *sk = sock->sk;
2455 
2456 #ifdef CONFIG_BPF_SYSCALL
2457 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2458 
2459 	if (prot != &unix_dgram_proto)
2460 		return prot->recvmsg(sk, msg, size, flags, NULL);
2461 #endif
2462 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2463 }
2464 
2465 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2466 {
2467 	struct unix_sock *u = unix_sk(sk);
2468 	struct sk_buff *skb;
2469 	int err;
2470 
2471 	mutex_lock(&u->iolock);
2472 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2473 	mutex_unlock(&u->iolock);
2474 	if (!skb)
2475 		return err;
2476 
2477 	return recv_actor(sk, skb);
2478 }
2479 
2480 /*
2481  *	Sleep until more data has arrived. But check for races.
2482  */
2483 static long unix_stream_data_wait(struct sock *sk, long timeo,
2484 				  struct sk_buff *last, unsigned int last_len,
2485 				  bool freezable)
2486 {
2487 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2488 	struct sk_buff *tail;
2489 	DEFINE_WAIT(wait);
2490 
2491 	unix_state_lock(sk);
2492 
2493 	for (;;) {
2494 		prepare_to_wait(sk_sleep(sk), &wait, state);
2495 
2496 		tail = skb_peek_tail(&sk->sk_receive_queue);
2497 		if (tail != last ||
2498 		    (tail && tail->len != last_len) ||
2499 		    sk->sk_err ||
2500 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2501 		    signal_pending(current) ||
2502 		    !timeo)
2503 			break;
2504 
2505 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2506 		unix_state_unlock(sk);
2507 		timeo = schedule_timeout(timeo);
2508 		unix_state_lock(sk);
2509 
2510 		if (sock_flag(sk, SOCK_DEAD))
2511 			break;
2512 
2513 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2514 	}
2515 
2516 	finish_wait(sk_sleep(sk), &wait);
2517 	unix_state_unlock(sk);
2518 	return timeo;
2519 }
2520 
2521 static unsigned int unix_skb_len(const struct sk_buff *skb)
2522 {
2523 	return skb->len - UNIXCB(skb).consumed;
2524 }
2525 
2526 struct unix_stream_read_state {
2527 	int (*recv_actor)(struct sk_buff *, int, int,
2528 			  struct unix_stream_read_state *);
2529 	struct socket *socket;
2530 	struct msghdr *msg;
2531 	struct pipe_inode_info *pipe;
2532 	size_t size;
2533 	int flags;
2534 	unsigned int splice_flags;
2535 };
2536 
2537 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2538 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2539 {
2540 	struct socket *sock = state->socket;
2541 	struct sock *sk = sock->sk;
2542 	struct unix_sock *u = unix_sk(sk);
2543 	int chunk = 1;
2544 	struct sk_buff *oob_skb;
2545 
2546 	mutex_lock(&u->iolock);
2547 	unix_state_lock(sk);
2548 	spin_lock(&sk->sk_receive_queue.lock);
2549 
2550 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2551 		spin_unlock(&sk->sk_receive_queue.lock);
2552 		unix_state_unlock(sk);
2553 		mutex_unlock(&u->iolock);
2554 		return -EINVAL;
2555 	}
2556 
2557 	oob_skb = u->oob_skb;
2558 
2559 	if (!(state->flags & MSG_PEEK))
2560 		WRITE_ONCE(u->oob_skb, NULL);
2561 	else
2562 		skb_get(oob_skb);
2563 
2564 	spin_unlock(&sk->sk_receive_queue.lock);
2565 	unix_state_unlock(sk);
2566 
2567 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2568 
2569 	if (!(state->flags & MSG_PEEK))
2570 		UNIXCB(oob_skb).consumed += 1;
2571 
2572 	consume_skb(oob_skb);
2573 
2574 	mutex_unlock(&u->iolock);
2575 
2576 	if (chunk < 0)
2577 		return -EFAULT;
2578 
2579 	state->msg->msg_flags |= MSG_OOB;
2580 	return 1;
2581 }
2582 
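/*
 * Decide how the skb at the head of the receive queue interacts with a
 * pending OOB byte: consume an skb whose data has already been read, stop
 * the copy just before the OOB byte, deliver it inline for SOCK_URGINLINE,
 * or unlink and drop it and continue with the next skb.  Returns the skb
 * the stream read should proceed with, or NULL.
 */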
2583 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2584 				  int flags, int copied)
2585 {
2586 	struct unix_sock *u = unix_sk(sk);
2587 
2588 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2589 		skb_unlink(skb, &sk->sk_receive_queue);
2590 		consume_skb(skb);
2591 		skb = NULL;
2592 	} else {
2593 		struct sk_buff *unlinked_skb = NULL;
2594 
2595 		spin_lock(&sk->sk_receive_queue.lock);
2596 
2597 		if (skb == u->oob_skb) {
2598 			if (copied) {
2599 				skb = NULL;
2600 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2601 				if (!(flags & MSG_PEEK)) {
2602 					WRITE_ONCE(u->oob_skb, NULL);
2603 					consume_skb(skb);
2604 				}
2605 			} else if (flags & MSG_PEEK) {
2606 				skb = NULL;
2607 			} else {
2608 				__skb_unlink(skb, &sk->sk_receive_queue);
2609 				WRITE_ONCE(u->oob_skb, NULL);
2610 				unlinked_skb = skb;
2611 				skb = skb_peek(&sk->sk_receive_queue);
2612 			}
2613 		}
2614 
2615 		spin_unlock(&sk->sk_receive_queue.lock);
2616 
2617 		if (unlinked_skb) {
2618 			WARN_ON_ONCE(skb_unref(unlinked_skb));
2619 			kfree_skb(unlinked_skb);
2620 		}
2621 	}
2622 	return skb;
2623 }
2624 #endif
2625 
2626 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2627 {
2628 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2629 		return -ENOTCONN;
2630 
2631 	return unix_read_skb(sk, recv_actor);
2632 }
2633 
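/*
 * Core stream/seqpacket receive loop shared by recvmsg, splice_read and
 * __unix_stream_recvmsg.  Walks the receive queue under u->iolock, feeding
 * each skb (minus its already-consumed part) to state->recv_actor, and
 * stops at writer or SCM_RIGHTS boundaries so control messages from
 * different senders are never merged.
 */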
2634 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2635 				    bool freezable)
2636 {
2637 	struct scm_cookie scm;
2638 	struct socket *sock = state->socket;
2639 	struct sock *sk = sock->sk;
2640 	struct unix_sock *u = unix_sk(sk);
2641 	int copied = 0;
2642 	int flags = state->flags;
2643 	int noblock = flags & MSG_DONTWAIT;
2644 	bool check_creds = false;
2645 	int target;
2646 	int err = 0;
2647 	long timeo;
2648 	int skip;
2649 	size_t size = state->size;
2650 	unsigned int last_len;
2651 
2652 	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2653 		err = -EINVAL;
2654 		goto out;
2655 	}
2656 
2657 	if (unlikely(flags & MSG_OOB)) {
2658 		err = -EOPNOTSUPP;
2659 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2660 		err = unix_stream_recv_urg(state);
2661 #endif
2662 		goto out;
2663 	}
2664 
2665 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2666 	timeo = sock_rcvtimeo(sk, noblock);
2667 
2668 	memset(&scm, 0, sizeof(scm));
2669 
2670 	/* Lock the socket to prevent queue reordering
2671 	 * while we sleep copying data to the msg
2672 	 */
2673 	mutex_lock(&u->iolock);
2674 
2675 	skip = max(sk_peek_offset(sk, flags), 0);
2676 
2677 	do {
2678 		int chunk;
2679 		bool drop_skb;
2680 		struct sk_buff *skb, *last;
2681 
2682 redo:
2683 		unix_state_lock(sk);
2684 		if (sock_flag(sk, SOCK_DEAD)) {
2685 			err = -ECONNRESET;
2686 			goto unlock;
2687 		}
2688 		last = skb = skb_peek(&sk->sk_receive_queue);
2689 		last_len = last ? last->len : 0;
2690 
2691 again:
2692 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2693 		if (skb) {
2694 			skb = manage_oob(skb, sk, flags, copied);
2695 			if (!skb && copied) {
2696 				unix_state_unlock(sk);
2697 				break;
2698 			}
2699 		}
2700 #endif
2701 		if (skb == NULL) {
2702 			if (copied >= target)
2703 				goto unlock;
2704 
2705 			/*
2706 			 *	POSIX 1003.1g mandates this order.
2707 			 */
2708 
2709 			err = sock_error(sk);
2710 			if (err)
2711 				goto unlock;
2712 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2713 				goto unlock;
2714 
2715 			unix_state_unlock(sk);
2716 			if (!timeo) {
2717 				err = -EAGAIN;
2718 				break;
2719 			}
2720 
2721 			mutex_unlock(&u->iolock);
2722 
2723 			timeo = unix_stream_data_wait(sk, timeo, last,
2724 						      last_len, freezable);
2725 
2726 			if (signal_pending(current)) {
2727 				err = sock_intr_errno(timeo);
2728 				scm_destroy(&scm);
2729 				goto out;
2730 			}
2731 
2732 			mutex_lock(&u->iolock);
2733 			goto redo;
2734 unlock:
2735 			unix_state_unlock(sk);
2736 			break;
2737 		}
2738 
2739 		while (skip >= unix_skb_len(skb)) {
2740 			skip -= unix_skb_len(skb);
2741 			last = skb;
2742 			last_len = skb->len;
2743 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2744 			if (!skb)
2745 				goto again;
2746 		}
2747 
2748 		unix_state_unlock(sk);
2749 
2750 		if (check_creds) {
2751 			/* Never glue messages from different writers */
2752 			if (!unix_skb_scm_eq(skb, &scm))
2753 				break;
2754 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2755 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2756 			/* Copy credentials */
2757 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2758 			unix_set_secdata(&scm, skb);
2759 			check_creds = true;
2760 		}
2761 
2762 		/* Copy address just once */
2763 		if (state->msg && state->msg->msg_name) {
2764 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2765 					 state->msg->msg_name);
2766 			unix_copy_addr(state->msg, skb->sk);
2767 			sunaddr = NULL;
2768 		}
2769 
2770 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2771 		skb_get(skb);
2772 		chunk = state->recv_actor(skb, skip, chunk, state);
2773 		drop_skb = !unix_skb_len(skb);
2774 		/* skb is only safe to use if !drop_skb */
2775 		consume_skb(skb);
2776 		if (chunk < 0) {
2777 			if (copied == 0)
2778 				copied = -EFAULT;
2779 			break;
2780 		}
2781 		copied += chunk;
2782 		size -= chunk;
2783 
2784 		if (drop_skb) {
2785 			/* the skb was touched by a concurrent reader;
2786 			 * we should not expect anything from this skb
2787 			 * anymore and assume it invalid - we can be
2788 			 * sure it was dropped from the socket queue
2789 			 *
2790 			 * let's report a short read
2791 			 */
2792 			err = 0;
2793 			break;
2794 		}
2795 
2796 		/* Mark read part of skb as used */
2797 		if (!(flags & MSG_PEEK)) {
2798 			UNIXCB(skb).consumed += chunk;
2799 
2800 			sk_peek_offset_bwd(sk, chunk);
2801 
2802 			if (UNIXCB(skb).fp) {
2803 				scm_stat_del(sk, skb);
2804 				unix_detach_fds(&scm, skb);
2805 			}
2806 
2807 			if (unix_skb_len(skb))
2808 				break;
2809 
2810 			skb_unlink(skb, &sk->sk_receive_queue);
2811 			consume_skb(skb);
2812 
2813 			if (scm.fp)
2814 				break;
2815 		} else {
2816 			/* It is questionable, see note in unix_dgram_recvmsg.
2817 			 */
2818 			if (UNIXCB(skb).fp)
2819 				unix_peek_fds(&scm, skb);
2820 
2821 			sk_peek_offset_fwd(sk, chunk);
2822 
2823 			if (UNIXCB(skb).fp)
2824 				break;
2825 
2826 			skip = 0;
2827 			last = skb;
2828 			last_len = skb->len;
2829 			unix_state_lock(sk);
2830 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2831 			if (skb)
2832 				goto again;
2833 			unix_state_unlock(sk);
2834 			break;
2835 		}
2836 	} while (size);
2837 
2838 	mutex_unlock(&u->iolock);
2839 	if (state->msg)
2840 		scm_recv_unix(sock, state->msg, &scm, flags);
2841 	else
2842 		scm_destroy(&scm);
2843 out:
2844 	return copied ? : err;
2845 }
2846 
2847 static int unix_stream_read_actor(struct sk_buff *skb,
2848 				  int skip, int chunk,
2849 				  struct unix_stream_read_state *state)
2850 {
2851 	int ret;
2852 
2853 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2854 				    state->msg, chunk);
2855 	return ret ?: chunk;
2856 }
2857 
2858 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2859 			  size_t size, int flags)
2860 {
2861 	struct unix_stream_read_state state = {
2862 		.recv_actor = unix_stream_read_actor,
2863 		.socket = sk->sk_socket,
2864 		.msg = msg,
2865 		.size = size,
2866 		.flags = flags
2867 	};
2868 
2869 	return unix_stream_read_generic(&state, true);
2870 }
2871 
2872 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2873 			       size_t size, int flags)
2874 {
2875 	struct unix_stream_read_state state = {
2876 		.recv_actor = unix_stream_read_actor,
2877 		.socket = sock,
2878 		.msg = msg,
2879 		.size = size,
2880 		.flags = flags
2881 	};
2882 
2883 #ifdef CONFIG_BPF_SYSCALL
2884 	struct sock *sk = sock->sk;
2885 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2886 
2887 	if (prot != &unix_stream_proto)
2888 		return prot->recvmsg(sk, msg, size, flags, NULL);
2889 #endif
2890 	return unix_stream_read_generic(&state, true);
2891 }
2892 
2893 static int unix_stream_splice_actor(struct sk_buff *skb,
2894 				    int skip, int chunk,
2895 				    struct unix_stream_read_state *state)
2896 {
2897 	return skb_splice_bits(skb, state->socket->sk,
2898 			       UNIXCB(skb).consumed + skip,
2899 			       state->pipe, chunk, state->splice_flags);
2900 }
2901 
2902 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2903 				       struct pipe_inode_info *pipe,
2904 				       size_t size, unsigned int flags)
2905 {
2906 	struct unix_stream_read_state state = {
2907 		.recv_actor = unix_stream_splice_actor,
2908 		.socket = sock,
2909 		.pipe = pipe,
2910 		.size = size,
2911 		.splice_flags = flags,
2912 	};
2913 
2914 	if (unlikely(*ppos))
2915 		return -ESPIPE;
2916 
2917 	if (sock->file->f_flags & O_NONBLOCK ||
2918 	    flags & SPLICE_F_NONBLOCK)
2919 		state.flags = MSG_DONTWAIT;
2920 
2921 	return unix_stream_read_generic(&state, false);
2922 }
2923 
2924 static int unix_shutdown(struct socket *sock, int mode)
2925 {
2926 	struct sock *sk = sock->sk;
2927 	struct sock *other;
2928 
2929 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2930 		return -EINVAL;
2931 	/* This maps:
2932 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2933 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2934 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2935 	 */
2936 	++mode;
2937 
2938 	unix_state_lock(sk);
2939 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2940 	other = unix_peer(sk);
2941 	if (other)
2942 		sock_hold(other);
2943 	unix_state_unlock(sk);
2944 	sk->sk_state_change(sk);
2945 
2946 	if (other &&
2947 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2948 
2949 		int peer_mode = 0;
2950 		const struct proto *prot = READ_ONCE(other->sk_prot);
2951 
2952 		if (prot->unhash)
2953 			prot->unhash(other);
2954 		if (mode&RCV_SHUTDOWN)
2955 			peer_mode |= SEND_SHUTDOWN;
2956 		if (mode&SEND_SHUTDOWN)
2957 			peer_mode |= RCV_SHUTDOWN;
2958 		unix_state_lock(other);
2959 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2960 		unix_state_unlock(other);
2961 		other->sk_state_change(other);
2962 		if (peer_mode == SHUTDOWN_MASK)
2963 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2964 		else if (peer_mode & RCV_SHUTDOWN)
2965 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2966 	}
2967 	if (other)
2968 		sock_put(other);
2969 
2970 	return 0;
2971 }
2972 
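/*
 * Bytes readable without blocking (SIOCINQ): for stream and seqpacket
 * sockets this is the total unread payload across the receive queue, for
 * datagram sockets it is the size of the first queued packet.
 */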
2973 long unix_inq_len(struct sock *sk)
2974 {
2975 	struct sk_buff *skb;
2976 	long amount = 0;
2977 
2978 	if (sk->sk_state == TCP_LISTEN)
2979 		return -EINVAL;
2980 
2981 	spin_lock(&sk->sk_receive_queue.lock);
2982 	if (sk->sk_type == SOCK_STREAM ||
2983 	    sk->sk_type == SOCK_SEQPACKET) {
2984 		skb_queue_walk(&sk->sk_receive_queue, skb)
2985 			amount += unix_skb_len(skb);
2986 	} else {
2987 		skb = skb_peek(&sk->sk_receive_queue);
2988 		if (skb)
2989 			amount = skb->len;
2990 	}
2991 	spin_unlock(&sk->sk_receive_queue.lock);
2992 
2993 	return amount;
2994 }
2995 EXPORT_SYMBOL_GPL(unix_inq_len);
2996 
2997 long unix_outq_len(struct sock *sk)
2998 {
2999 	return sk_wmem_alloc_get(sk);
3000 }
3001 EXPORT_SYMBOL_GPL(unix_outq_len);
3002 
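/*
 * SIOCUNIXFILE: for a caller with CAP_NET_ADMIN in the socket's network
 * namespace, open the filesystem object the socket is bound to as an
 * O_PATH, O_CLOEXEC file descriptor and return it.
 */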
3003 static int unix_open_file(struct sock *sk)
3004 {
3005 	struct path path;
3006 	struct file *f;
3007 	int fd;
3008 
3009 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3010 		return -EPERM;
3011 
3012 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3013 		return -ENOENT;
3014 
3015 	path = unix_sk(sk)->path;
3016 	if (!path.dentry)
3017 		return -ENOENT;
3018 
3019 	path_get(&path);
3020 
3021 	fd = get_unused_fd_flags(O_CLOEXEC);
3022 	if (fd < 0)
3023 		goto out;
3024 
3025 	f = dentry_open(&path, O_PATH, current_cred());
3026 	if (IS_ERR(f)) {
3027 		put_unused_fd(fd);
3028 		fd = PTR_ERR(f);
3029 		goto out;
3030 	}
3031 
3032 	fd_install(fd, f);
3033 out:
3034 	path_put(&path);
3035 
3036 	return fd;
3037 }
3038 
3039 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3040 {
3041 	struct sock *sk = sock->sk;
3042 	long amount = 0;
3043 	int err;
3044 
3045 	switch (cmd) {
3046 	case SIOCOUTQ:
3047 		amount = unix_outq_len(sk);
3048 		err = put_user(amount, (int __user *)arg);
3049 		break;
3050 	case SIOCINQ:
3051 		amount = unix_inq_len(sk);
3052 		if (amount < 0)
3053 			err = amount;
3054 		else
3055 			err = put_user(amount, (int __user *)arg);
3056 		break;
3057 	case SIOCUNIXFILE:
3058 		err = unix_open_file(sk);
3059 		break;
3060 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3061 	case SIOCATMARK:
3062 		{
3063 			struct sk_buff *skb;
3064 			int answ = 0;
3065 
3066 			skb = skb_peek(&sk->sk_receive_queue);
3067 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3068 				answ = 1;
3069 			err = put_user(answ, (int __user *)arg);
3070 		}
3071 		break;
3072 #endif
3073 	default:
3074 		err = -ENOIOCTLCMD;
3075 		break;
3076 	}
3077 	return err;
3078 }
3079 
3080 #ifdef CONFIG_COMPAT
3081 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3082 {
3083 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3084 }
3085 #endif
3086 
3087 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3088 {
3089 	struct sock *sk = sock->sk;
3090 	__poll_t mask;
3091 	u8 shutdown;
3092 
3093 	sock_poll_wait(file, sock, wait);
3094 	mask = 0;
3095 	shutdown = READ_ONCE(sk->sk_shutdown);
3096 
3097 	/* exceptional events? */
3098 	if (READ_ONCE(sk->sk_err))
3099 		mask |= EPOLLERR;
3100 	if (shutdown == SHUTDOWN_MASK)
3101 		mask |= EPOLLHUP;
3102 	if (shutdown & RCV_SHUTDOWN)
3103 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3104 
3105 	/* readable? */
3106 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3107 		mask |= EPOLLIN | EPOLLRDNORM;
3108 	if (sk_is_readable(sk))
3109 		mask |= EPOLLIN | EPOLLRDNORM;
3110 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3111 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3112 		mask |= EPOLLPRI;
3113 #endif
3114 
3115 	/* Connection-based sockets need to check for termination and startup */
3116 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3117 	    sk->sk_state == TCP_CLOSE)
3118 		mask |= EPOLLHUP;
3119 
3120 	/*
3121 	 * We also report the socket as writable when the other side has
3122 	 * shut down the connection. This prevents stuck sockets.
3123 	 */
3124 	if (unix_writable(sk))
3125 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3126 
3127 	return mask;
3128 }
3129 
3130 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3131 				    poll_table *wait)
3132 {
3133 	struct sock *sk = sock->sk, *other;
3134 	unsigned int writable;
3135 	__poll_t mask;
3136 	u8 shutdown;
3137 
3138 	sock_poll_wait(file, sock, wait);
3139 	mask = 0;
3140 	shutdown = READ_ONCE(sk->sk_shutdown);
3141 
3142 	/* exceptional events? */
3143 	if (READ_ONCE(sk->sk_err) ||
3144 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3145 		mask |= EPOLLERR |
3146 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3147 
3148 	if (shutdown & RCV_SHUTDOWN)
3149 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3150 	if (shutdown == SHUTDOWN_MASK)
3151 		mask |= EPOLLHUP;
3152 
3153 	/* readable? */
3154 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3155 		mask |= EPOLLIN | EPOLLRDNORM;
3156 	if (sk_is_readable(sk))
3157 		mask |= EPOLLIN | EPOLLRDNORM;
3158 
3159 	/* Connection-based sockets need to check for termination and startup */
3160 	if (sk->sk_type == SOCK_SEQPACKET) {
3161 		if (sk->sk_state == TCP_CLOSE)
3162 			mask |= EPOLLHUP;
3163 		/* connection hasn't started yet? */
3164 		if (sk->sk_state == TCP_SYN_SENT)
3165 			return mask;
3166 	}
3167 
3168 	/* No write status requested, avoid expensive OUT tests. */
3169 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3170 		return mask;
3171 
3172 	writable = unix_writable(sk);
3173 	if (writable) {
3174 		unix_state_lock(sk);
3175 
3176 		other = unix_peer(sk);
3177 		if (other && unix_peer(other) != sk &&
3178 		    unix_recvq_full_lockless(other) &&
3179 		    unix_dgram_peer_wake_me(sk, other))
3180 			writable = 0;
3181 
3182 		unix_state_unlock(sk);
3183 	}
3184 
3185 	if (writable)
3186 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3187 	else
3188 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3189 
3190 	return mask;
3191 }
3192 
3193 #ifdef CONFIG_PROC_FS
3194 
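/*
 * The seq_file position encodes a (hash bucket, offset-in-bucket) pair:
 * the bucket index is stored in the bits above the low BUCKET_SPACE bits,
 * which hold the 1-based offset of the socket within that bucket.
 */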
3195 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3196 
3197 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3198 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3199 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3200 
3201 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3202 {
3203 	unsigned long offset = get_offset(*pos);
3204 	unsigned long bucket = get_bucket(*pos);
3205 	unsigned long count = 0;
3206 	struct sock *sk;
3207 
3208 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3209 	     sk; sk = sk_next(sk)) {
3210 		if (++count == offset)
3211 			break;
3212 	}
3213 
3214 	return sk;
3215 }
3216 
3217 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3218 {
3219 	unsigned long bucket = get_bucket(*pos);
3220 	struct net *net = seq_file_net(seq);
3221 	struct sock *sk;
3222 
3223 	while (bucket < UNIX_HASH_SIZE) {
3224 		spin_lock(&net->unx.table.locks[bucket]);
3225 
3226 		sk = unix_from_bucket(seq, pos);
3227 		if (sk)
3228 			return sk;
3229 
3230 		spin_unlock(&net->unx.table.locks[bucket]);
3231 
3232 		*pos = set_bucket_offset(++bucket, 1);
3233 	}
3234 
3235 	return NULL;
3236 }
3237 
3238 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3239 				  loff_t *pos)
3240 {
3241 	unsigned long bucket = get_bucket(*pos);
3242 
3243 	sk = sk_next(sk);
3244 	if (sk)
3245 		return sk;
3246 
3247 
3248 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3249 
3250 	*pos = set_bucket_offset(++bucket, 1);
3251 
3252 	return unix_get_first(seq, pos);
3253 }
3254 
3255 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3256 {
3257 	if (!*pos)
3258 		return SEQ_START_TOKEN;
3259 
3260 	return unix_get_first(seq, pos);
3261 }
3262 
3263 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3264 {
3265 	++*pos;
3266 
3267 	if (v == SEQ_START_TOKEN)
3268 		return unix_get_first(seq, pos);
3269 
3270 	return unix_get_next(seq, v, pos);
3271 }
3272 
3273 static void unix_seq_stop(struct seq_file *seq, void *v)
3274 {
3275 	struct sock *sk = v;
3276 
3277 	if (sk)
3278 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3279 }
3280 
3281 static int unix_seq_show(struct seq_file *seq, void *v)
3282 {
3283 
3284 	if (v == SEQ_START_TOKEN)
3285 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3286 			 "Inode Path\n");
3287 	else {
3288 		struct sock *s = v;
3289 		struct unix_sock *u = unix_sk(s);
3290 		unix_state_lock(s);
3291 
3292 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3293 			s,
3294 			refcount_read(&s->sk_refcnt),
3295 			0,
3296 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3297 			s->sk_type,
3298 			s->sk_socket ?
3299 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3300 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3301 			sock_i_ino(s));
3302 
3303 		if (u->addr) {	// under a hash table lock here
3304 			int i, len;
3305 			seq_putc(seq, ' ');
3306 
3307 			i = 0;
3308 			len = u->addr->len -
3309 				offsetof(struct sockaddr_un, sun_path);
3310 			if (u->addr->name->sun_path[0]) {
3311 				len--;
3312 			} else {
3313 				seq_putc(seq, '@');
3314 				i++;
3315 			}
3316 			for ( ; i < len; i++)
3317 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3318 					 '@');
3319 		}
3320 		unix_state_unlock(s);
3321 		seq_putc(seq, '\n');
3322 	}
3323 
3324 	return 0;
3325 }
3326 
3327 static const struct seq_operations unix_seq_ops = {
3328 	.start  = unix_seq_start,
3329 	.next   = unix_seq_next,
3330 	.stop   = unix_seq_stop,
3331 	.show   = unix_seq_show,
3332 };
3333 
3334 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
3335 struct bpf_unix_iter_state {
3336 	struct seq_net_private p;
3337 	unsigned int cur_sk;
3338 	unsigned int end_sk;
3339 	unsigned int max_sk;
3340 	struct sock **batch;
3341 	bool st_bucket_done;
3342 };
3343 
3344 struct bpf_iter__unix {
3345 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3346 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3347 	uid_t uid __aligned(8);
3348 };
3349 
3350 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3351 			      struct unix_sock *unix_sk, uid_t uid)
3352 {
3353 	struct bpf_iter__unix ctx;
3354 
3355 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3356 	ctx.meta = meta;
3357 	ctx.unix_sk = unix_sk;
3358 	ctx.uid = uid;
3359 	return bpf_iter_run_prog(prog, &ctx);
3360 }
3361 
3362 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3363 
3364 {
3365 	struct bpf_unix_iter_state *iter = seq->private;
3366 	unsigned int expected = 1;
3367 	struct sock *sk;
3368 
3369 	sock_hold(start_sk);
3370 	iter->batch[iter->end_sk++] = start_sk;
3371 
3372 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3373 		if (iter->end_sk < iter->max_sk) {
3374 			sock_hold(sk);
3375 			iter->batch[iter->end_sk++] = sk;
3376 		}
3377 
3378 		expected++;
3379 	}
3380 
3381 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3382 
3383 	return expected;
3384 }
3385 
3386 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3387 {
3388 	while (iter->cur_sk < iter->end_sk)
3389 		sock_put(iter->batch[iter->cur_sk++]);
3390 }
3391 
3392 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3393 				       unsigned int new_batch_sz)
3394 {
3395 	struct sock **new_batch;
3396 
3397 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3398 			     GFP_USER | __GFP_NOWARN);
3399 	if (!new_batch)
3400 		return -ENOMEM;
3401 
3402 	bpf_iter_unix_put_batch(iter);
3403 	kvfree(iter->batch);
3404 	iter->batch = new_batch;
3405 	iter->max_sk = new_batch_sz;
3406 
3407 	return 0;
3408 }
3409 
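/*
 * Take references on every socket in the current bucket (growing the batch
 * array if it turned out to be too small) so the bucket lock can be dropped
 * while the BPF program is run against each socket in turn.
 */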
3410 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3411 					loff_t *pos)
3412 {
3413 	struct bpf_unix_iter_state *iter = seq->private;
3414 	unsigned int expected;
3415 	bool resized = false;
3416 	struct sock *sk;
3417 
3418 	if (iter->st_bucket_done)
3419 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3420 
3421 again:
3422 	/* Get a new batch */
3423 	iter->cur_sk = 0;
3424 	iter->end_sk = 0;
3425 
3426 	sk = unix_get_first(seq, pos);
3427 	if (!sk)
3428 		return NULL; /* Done */
3429 
3430 	expected = bpf_iter_unix_hold_batch(seq, sk);
3431 
3432 	if (iter->end_sk == expected) {
3433 		iter->st_bucket_done = true;
3434 		return sk;
3435 	}
3436 
3437 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3438 		resized = true;
3439 		goto again;
3440 	}
3441 
3442 	return sk;
3443 }
3444 
3445 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3446 {
3447 	if (!*pos)
3448 		return SEQ_START_TOKEN;
3449 
3450 	/* bpf iter does not support lseek, so it always
3451 	 * continues from where it was stop()-ped.
3452 	 */
3453 	return bpf_iter_unix_batch(seq, pos);
3454 }
3455 
3456 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3457 {
3458 	struct bpf_unix_iter_state *iter = seq->private;
3459 	struct sock *sk;
3460 
3461 	/* Whenever seq_next() is called, the sk at iter->cur_sk is
3462 	 * done with seq_show(), so advance to the next sk in
3463 	 * the batch.
3464 	 */
3465 	if (iter->cur_sk < iter->end_sk)
3466 		sock_put(iter->batch[iter->cur_sk++]);
3467 
3468 	++*pos;
3469 
3470 	if (iter->cur_sk < iter->end_sk)
3471 		sk = iter->batch[iter->cur_sk];
3472 	else
3473 		sk = bpf_iter_unix_batch(seq, pos);
3474 
3475 	return sk;
3476 }
3477 
3478 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3479 {
3480 	struct bpf_iter_meta meta;
3481 	struct bpf_prog *prog;
3482 	struct sock *sk = v;
3483 	uid_t uid;
3484 	bool slow;
3485 	int ret;
3486 
3487 	if (v == SEQ_START_TOKEN)
3488 		return 0;
3489 
3490 	slow = lock_sock_fast(sk);
3491 
3492 	if (unlikely(sk_unhashed(sk))) {
3493 		ret = SEQ_SKIP;
3494 		goto unlock;
3495 	}
3496 
3497 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3498 	meta.seq = seq;
3499 	prog = bpf_iter_get_info(&meta, false);
3500 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3501 unlock:
3502 	unlock_sock_fast(sk, slow);
3503 	return ret;
3504 }
3505 
3506 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3507 {
3508 	struct bpf_unix_iter_state *iter = seq->private;
3509 	struct bpf_iter_meta meta;
3510 	struct bpf_prog *prog;
3511 
3512 	if (!v) {
3513 		meta.seq = seq;
3514 		prog = bpf_iter_get_info(&meta, true);
3515 		if (prog)
3516 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3517 	}
3518 
3519 	if (iter->cur_sk < iter->end_sk)
3520 		bpf_iter_unix_put_batch(iter);
3521 }
3522 
3523 static const struct seq_operations bpf_iter_unix_seq_ops = {
3524 	.start	= bpf_iter_unix_seq_start,
3525 	.next	= bpf_iter_unix_seq_next,
3526 	.stop	= bpf_iter_unix_seq_stop,
3527 	.show	= bpf_iter_unix_seq_show,
3528 };
3529 #endif
3530 #endif
3531 
3532 static const struct net_proto_family unix_family_ops = {
3533 	.family = PF_UNIX,
3534 	.create = unix_create,
3535 	.owner	= THIS_MODULE,
3536 };
3537 
3538 
3539 static int __net_init unix_net_init(struct net *net)
3540 {
3541 	int i;
3542 
3543 	net->unx.sysctl_max_dgram_qlen = 10;
3544 	if (unix_sysctl_register(net))
3545 		goto out;
3546 
3547 #ifdef CONFIG_PROC_FS
3548 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3549 			     sizeof(struct seq_net_private)))
3550 		goto err_sysctl;
3551 #endif
3552 
3553 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3554 					      sizeof(spinlock_t), GFP_KERNEL);
3555 	if (!net->unx.table.locks)
3556 		goto err_proc;
3557 
3558 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3559 						sizeof(struct hlist_head),
3560 						GFP_KERNEL);
3561 	if (!net->unx.table.buckets)
3562 		goto free_locks;
3563 
3564 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3565 		spin_lock_init(&net->unx.table.locks[i]);
3566 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3567 	}
3568 
3569 	return 0;
3570 
3571 free_locks:
3572 	kvfree(net->unx.table.locks);
3573 err_proc:
3574 #ifdef CONFIG_PROC_FS
3575 	remove_proc_entry("unix", net->proc_net);
3576 err_sysctl:
3577 #endif
3578 	unix_sysctl_unregister(net);
3579 out:
3580 	return -ENOMEM;
3581 }
3582 
3583 static void __net_exit unix_net_exit(struct net *net)
3584 {
3585 	kvfree(net->unx.table.buckets);
3586 	kvfree(net->unx.table.locks);
3587 	unix_sysctl_unregister(net);
3588 	remove_proc_entry("unix", net->proc_net);
3589 }
3590 
3591 static struct pernet_operations unix_net_ops = {
3592 	.init = unix_net_init,
3593 	.exit = unix_net_exit,
3594 };
3595 
3596 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3597 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3598 		     struct unix_sock *unix_sk, uid_t uid)
3599 
3600 #define INIT_BATCH_SZ 16
3601 
3602 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3603 {
3604 	struct bpf_unix_iter_state *iter = priv_data;
3605 	int err;
3606 
3607 	err = bpf_iter_init_seq_net(priv_data, aux);
3608 	if (err)
3609 		return err;
3610 
3611 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3612 	if (err) {
3613 		bpf_iter_fini_seq_net(priv_data);
3614 		return err;
3615 	}
3616 
3617 	return 0;
3618 }
3619 
3620 static void bpf_iter_fini_unix(void *priv_data)
3621 {
3622 	struct bpf_unix_iter_state *iter = priv_data;
3623 
3624 	bpf_iter_fini_seq_net(priv_data);
3625 	kvfree(iter->batch);
3626 }
3627 
3628 static const struct bpf_iter_seq_info unix_seq_info = {
3629 	.seq_ops		= &bpf_iter_unix_seq_ops,
3630 	.init_seq_private	= bpf_iter_init_unix,
3631 	.fini_seq_private	= bpf_iter_fini_unix,
3632 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3633 };
3634 
3635 static const struct bpf_func_proto *
3636 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3637 			     const struct bpf_prog *prog)
3638 {
3639 	switch (func_id) {
3640 	case BPF_FUNC_setsockopt:
3641 		return &bpf_sk_setsockopt_proto;
3642 	case BPF_FUNC_getsockopt:
3643 		return &bpf_sk_getsockopt_proto;
3644 	default:
3645 		return NULL;
3646 	}
3647 }
3648 
3649 static struct bpf_iter_reg unix_reg_info = {
3650 	.target			= "unix",
3651 	.ctx_arg_info_size	= 1,
3652 	.ctx_arg_info		= {
3653 		{ offsetof(struct bpf_iter__unix, unix_sk),
3654 		  PTR_TO_BTF_ID_OR_NULL },
3655 	},
3656 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3657 	.seq_info		= &unix_seq_info,
3658 };
3659 
3660 static void __init bpf_iter_register(void)
3661 {
3662 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3663 	if (bpf_iter_reg_target(&unix_reg_info))
3664 		pr_warn("Warning: could not register bpf iterator unix\n");
3665 }
3666 #endif
3667 
3668 static int __init af_unix_init(void)
3669 {
3670 	int i, rc = -1;
3671 
3672 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3673 
3674 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3675 		spin_lock_init(&bsd_socket_locks[i]);
3676 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3677 	}
3678 
3679 	rc = proto_register(&unix_dgram_proto, 1);
3680 	if (rc != 0) {
3681 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3682 		goto out;
3683 	}
3684 
3685 	rc = proto_register(&unix_stream_proto, 1);
3686 	if (rc != 0) {
3687 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3688 		proto_unregister(&unix_dgram_proto);
3689 		goto out;
3690 	}
3691 
3692 	sock_register(&unix_family_ops);
3693 	register_pernet_subsys(&unix_net_ops);
3694 	unix_bpf_build_proto();
3695 
3696 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3697 	bpf_iter_register();
3698 #endif
3699 
3700 out:
3701 	return rc;
3702 }
3703 
3704 static void __exit af_unix_exit(void)
3705 {
3706 	sock_unregister(PF_UNIX);
3707 	proto_unregister(&unix_dgram_proto);
3708 	proto_unregister(&unix_stream_proto);
3709 	unregister_pernet_subsys(&unix_net_ops);
3710 }
3711 
3712 /* Earlier than device_initcall() so that other drivers invoking
3713    request_module() don't end up in a loop when modprobe tries
3714    to use a UNIX socket. But later than subsys_initcall() because
3715    we depend on stuff initialised there */
3716 fs_initcall(af_unix_init);
3717 module_exit(af_unix_exit);
3718 
3719 MODULE_LICENSE("GPL");
3720 MODULE_ALIAS_NETPROTO(PF_UNIX);
3721