xref: /openbmc/linux/net/unix/af_unix.c (revision 766b0e80)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid a huge amount
34  *					of socks hashed (this is for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 
120 #include "scm.h"
121 
122 static atomic_long_t unix_nr_socks;
123 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
124 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
125 
126 /* SMP locking strategy:
127  *    hash table is protected with spinlock.
128  *    each socket state is protected by separate spinlock.
129  */
130 
131 static unsigned int unix_unbound_hash(struct sock *sk)
132 {
133 	unsigned long hash = (unsigned long)sk;
134 
135 	hash ^= hash >> 16;
136 	hash ^= hash >> 8;
137 	hash ^= sk->sk_type;
138 
139 	return hash & UNIX_HASH_MOD;
140 }
141 
142 static unsigned int unix_bsd_hash(struct inode *i)
143 {
144 	return i->i_ino & UNIX_HASH_MOD;
145 }
146 
147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
148 				       int addr_len, int type)
149 {
150 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
151 	unsigned int hash;
152 
153 	hash = (__force unsigned int)csum_fold(csum);
154 	hash ^= hash >> 8;
155 	hash ^= type;
156 
157 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
158 }
159 
160 static void unix_table_double_lock(struct net *net,
161 				   unsigned int hash1, unsigned int hash2)
162 {
163 	if (hash1 == hash2) {
164 		spin_lock(&net->unx.table.locks[hash1]);
165 		return;
166 	}
167 
168 	if (hash1 > hash2)
169 		swap(hash1, hash2);
170 
171 	spin_lock(&net->unx.table.locks[hash1]);
172 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
173 }
174 
175 static void unix_table_double_unlock(struct net *net,
176 				     unsigned int hash1, unsigned int hash2)
177 {
178 	if (hash1 == hash2) {
179 		spin_unlock(&net->unx.table.locks[hash1]);
180 		return;
181 	}
182 
183 	spin_unlock(&net->unx.table.locks[hash1]);
184 	spin_unlock(&net->unx.table.locks[hash2]);
185 }
186 
187 #ifdef CONFIG_SECURITY_NETWORK
188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
189 {
190 	UNIXCB(skb).secid = scm->secid;
191 }
192 
193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
194 {
195 	scm->secid = UNIXCB(skb).secid;
196 }
197 
198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
199 {
200 	return (scm->secid == UNIXCB(skb).secid);
201 }
202 #else
203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
204 { }
205 
206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
207 { }
208 
209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
210 {
211 	return true;
212 }
213 #endif /* CONFIG_SECURITY_NETWORK */
214 
/* Non-zero when the peer recorded in @osk is @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return sk == unix_peer(osk);
}
219 
/* @sk may send to @osk when @osk has no peer at all, or when its peer
 * is @sk.
 */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return !unix_peer(osk) || unix_our_peer(sk, osk);
}
224 
225 static inline int unix_recvq_full_lockless(const struct sock *sk)
226 {
227 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
228 }
229 
230 struct sock *unix_peer_get(struct sock *s)
231 {
232 	struct sock *peer;
233 
234 	unix_state_lock(s);
235 	peer = unix_peer(s);
236 	if (peer)
237 		sock_hold(peer);
238 	unix_state_unlock(s);
239 	return peer;
240 }
241 EXPORT_SYMBOL_GPL(unix_peer_get);
242 
243 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
244 					     int addr_len)
245 {
246 	struct unix_address *addr;
247 
248 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
249 	if (!addr)
250 		return NULL;
251 
252 	refcount_set(&addr->refcnt, 1);
253 	addr->len = addr_len;
254 	memcpy(addr->name, sunaddr, addr_len);
255 
256 	return addr;
257 }
258 
259 static inline void unix_release_addr(struct unix_address *addr)
260 {
261 	if (refcount_dec_and_test(&addr->refcnt))
262 		kfree(addr);
263 }
264 
/*
 *	Basic sanity check of a unix socket address:
 *		- the length must cover at least one byte of sun_path and
 *		  must not exceed sizeof(struct sockaddr_un);
 *		- the family must be AF_UNIX.
 *
 *	Returns 0 when the address passes, -EINVAL otherwise.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	/* Nothing beyond the family field: no name at all. */
	if (addr_len <= offsetof(struct sockaddr_un, sun_path))
		return -EINVAL;

	/* Longer than the structure itself. */
	if (addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}
283 
284 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
285 {
286 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
287 	short offset = offsetof(struct sockaddr_storage, __data);
288 
289 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
290 
291 	/* This may look like an off by one error but it is a bit more
292 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
293 	 * sun_path[108] doesn't as such exist.  However in kernel space
294 	 * we are guaranteed that it is a valid memory location in our
295 	 * kernel address buffer because syscall functions always pass
296 	 * a pointer of struct sockaddr_storage which has a bigger buffer
297 	 * than 108.  Also, we must terminate sun_path for strlen() in
298 	 * getname_kernel().
299 	 */
300 	addr->__data[addr_len - offset] = 0;
301 
302 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
303 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
304 	 * know the actual buffer.
305 	 */
306 	return strlen(addr->__data) + offset + 1;
307 }
308 
/* Unlink @sk from its hash chain; callers in this file hold the
 * matching table lock.
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
313 
314 static void __unix_insert_socket(struct net *net, struct sock *sk)
315 {
316 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
317 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
318 }
319 
320 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
321 				 struct unix_address *addr, unsigned int hash)
322 {
323 	__unix_remove_socket(sk);
324 	smp_store_release(&unix_sk(sk)->addr, addr);
325 
326 	sk->sk_hash = hash;
327 	__unix_insert_socket(net, sk);
328 }
329 
330 static void unix_remove_socket(struct net *net, struct sock *sk)
331 {
332 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
333 	__unix_remove_socket(sk);
334 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
335 }
336 
337 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
338 {
339 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
340 	__unix_insert_socket(net, sk);
341 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
342 }
343 
344 static void unix_insert_bsd_socket(struct sock *sk)
345 {
346 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
347 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
348 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
349 }
350 
351 static void unix_remove_bsd_socket(struct sock *sk)
352 {
353 	if (!hlist_unhashed(&sk->sk_bind_node)) {
354 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
355 		__sk_del_bind_node(sk);
356 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
357 
358 		sk_node_init(&sk->sk_bind_node);
359 	}
360 }
361 
362 static struct sock *__unix_find_socket_byname(struct net *net,
363 					      struct sockaddr_un *sunname,
364 					      int len, unsigned int hash)
365 {
366 	struct sock *s;
367 
368 	sk_for_each(s, &net->unx.table.buckets[hash]) {
369 		struct unix_sock *u = unix_sk(s);
370 
371 		if (u->addr->len == len &&
372 		    !memcmp(u->addr->name, sunname, len))
373 			return s;
374 	}
375 	return NULL;
376 }
377 
378 static inline struct sock *unix_find_socket_byname(struct net *net,
379 						   struct sockaddr_un *sunname,
380 						   int len, unsigned int hash)
381 {
382 	struct sock *s;
383 
384 	spin_lock(&net->unx.table.locks[hash]);
385 	s = __unix_find_socket_byname(net, sunname, len, hash);
386 	if (s)
387 		sock_hold(s);
388 	spin_unlock(&net->unx.table.locks[hash]);
389 	return s;
390 }
391 
392 static struct sock *unix_find_socket_byinode(struct inode *i)
393 {
394 	unsigned int hash = unix_bsd_hash(i);
395 	struct sock *s;
396 
397 	spin_lock(&bsd_socket_locks[hash]);
398 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
399 		struct dentry *dentry = unix_sk(s)->path.dentry;
400 
401 		if (dentry && d_backing_inode(dentry) == i) {
402 			sock_hold(s);
403 			spin_unlock(&bsd_socket_locks[hash]);
404 			return s;
405 		}
406 	}
407 	spin_unlock(&bsd_socket_locks[hash]);
408 	return NULL;
409 }
410 
411 /* Support code for asymmetrically connected dgram sockets
412  *
413  * If a datagram socket is connected to a socket not itself connected
414  * to the first socket (eg, /dev/log), clients may only enqueue more
415  * messages if the present receive queue of the server socket is not
416  * "too large". This means there's a second writeability condition
417  * poll and sendmsg need to test. The dgram recv code will do a wake
418  * up on the peer_wait wait queue of a socket upon reception of a
419  * datagram which needs to be propagated to sleeping would-be writers
420  * since these might not have sent anything so far. This can't be
421  * accomplished via poll_wait because the lifetime of the server
422  * socket might be less than that of its clients if these break their
423  * association with it or if the server socket is closed while clients
424  * are still connected to it and there's no way to inform "a polling
425  * implementation" that it should let go of a certain wait queue
426  *
427  * In order to propagate a wake up, a wait_queue_entry_t of the client
428  * socket is enqueued on the peer_wait queue of the server socket
429  * whose wake function does a wake_up on the ordinary client socket
430  * wait queue. This connection is established whenever a write (or
431  * poll for write) hit the flow control condition and broken when the
432  * association to the server socket is dissolved or after a wake up
433  * was relayed.
434  */
435 
436 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
437 				      void *key)
438 {
439 	struct unix_sock *u;
440 	wait_queue_head_t *u_sleep;
441 
442 	u = container_of(q, struct unix_sock, peer_wake);
443 
444 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
445 			    q);
446 	u->peer_wake.private = NULL;
447 
448 	/* relaying can only happen while the wq still exists */
449 	u_sleep = sk_sleep(&u->sk);
450 	if (u_sleep)
451 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
452 
453 	return 0;
454 }
455 
456 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
457 {
458 	struct unix_sock *u, *u_other;
459 	int rc;
460 
461 	u = unix_sk(sk);
462 	u_other = unix_sk(other);
463 	rc = 0;
464 	spin_lock(&u_other->peer_wait.lock);
465 
466 	if (!u->peer_wake.private) {
467 		u->peer_wake.private = other;
468 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
469 
470 		rc = 1;
471 	}
472 
473 	spin_unlock(&u_other->peer_wait.lock);
474 	return rc;
475 }
476 
477 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
478 					    struct sock *other)
479 {
480 	struct unix_sock *u, *u_other;
481 
482 	u = unix_sk(sk);
483 	u_other = unix_sk(other);
484 	spin_lock(&u_other->peer_wait.lock);
485 
486 	if (u->peer_wake.private == other) {
487 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
488 		u->peer_wake.private = NULL;
489 	}
490 
491 	spin_unlock(&u_other->peer_wait.lock);
492 }
493 
494 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
495 						   struct sock *other)
496 {
497 	unix_dgram_peer_wake_disconnect(sk, other);
498 	wake_up_interruptible_poll(sk_sleep(sk),
499 				   EPOLLOUT |
500 				   EPOLLWRNORM |
501 				   EPOLLWRBAND);
502 }
503 
504 /* preconditions:
505  *	- unix_peer(sk) == other
506  *	- association is stable
507  */
508 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
509 {
510 	int connected;
511 
512 	connected = unix_dgram_peer_wake_connect(sk, other);
513 
514 	/* If other is SOCK_DEAD, we want to make sure we signal
515 	 * POLLOUT, such that a subsequent write() can get a
516 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
517 	 * to other and its full, we will hang waiting for POLLOUT.
518 	 */
519 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
520 		return 1;
521 
522 	if (connected)
523 		unix_dgram_peer_wake_disconnect(sk, other);
524 
525 	return 0;
526 }
527 
528 static int unix_writable(const struct sock *sk, unsigned char state)
529 {
530 	return state != TCP_LISTEN &&
531 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
532 }
533 
534 static void unix_write_space(struct sock *sk)
535 {
536 	struct socket_wq *wq;
537 
538 	rcu_read_lock();
539 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
540 		wq = rcu_dereference(sk->sk_wq);
541 		if (skwq_has_sleeper(wq))
542 			wake_up_interruptible_sync_poll(&wq->wait,
543 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
544 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
545 	}
546 	rcu_read_unlock();
547 }
548 
549 /* When dgram socket disconnects (or changes its peer), we clear its receive
550  * queue of packets arrived from previous peer. First, it allows to do
551  * flow control based only on wmem_alloc; second, sk connected to peer
552  * may receive messages only from that peer. */
553 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
554 {
555 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
556 		skb_queue_purge(&sk->sk_receive_queue);
557 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
558 
559 		/* If one link of bidirectional dgram pipe is disconnected,
560 		 * we signal error. Messages are lost. Do not make this,
561 		 * when peer was not connected to us.
562 		 */
563 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
564 			WRITE_ONCE(other->sk_err, ECONNRESET);
565 			sk_error_report(other);
566 		}
567 	}
568 }
569 
570 static void unix_sock_destructor(struct sock *sk)
571 {
572 	struct unix_sock *u = unix_sk(sk);
573 
574 	skb_queue_purge(&sk->sk_receive_queue);
575 
576 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
577 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
578 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
579 	if (!sock_flag(sk, SOCK_DEAD)) {
580 		pr_info("Attempt to release alive unix socket: %p\n", sk);
581 		return;
582 	}
583 
584 	if (u->addr)
585 		unix_release_addr(u->addr);
586 
587 	atomic_long_dec(&unix_nr_socks);
588 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
589 #ifdef UNIX_REFCNT_DEBUG
590 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
591 		atomic_long_read(&unix_nr_socks));
592 #endif
593 }
594 
595 static void unix_release_sock(struct sock *sk, int embrion)
596 {
597 	struct unix_sock *u = unix_sk(sk);
598 	struct sock *skpair;
599 	struct sk_buff *skb;
600 	struct path path;
601 	int state;
602 
603 	unix_remove_socket(sock_net(sk), sk);
604 	unix_remove_bsd_socket(sk);
605 
606 	/* Clear state */
607 	unix_state_lock(sk);
608 	sock_orphan(sk);
609 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
610 	path	     = u->path;
611 	u->path.dentry = NULL;
612 	u->path.mnt = NULL;
613 	state = sk->sk_state;
614 	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
615 
616 	skpair = unix_peer(sk);
617 	unix_peer(sk) = NULL;
618 
619 	unix_state_unlock(sk);
620 
621 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
622 	if (u->oob_skb) {
623 		kfree_skb(u->oob_skb);
624 		u->oob_skb = NULL;
625 	}
626 #endif
627 
628 	wake_up_interruptible_all(&u->peer_wait);
629 
630 	if (skpair != NULL) {
631 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
632 			unix_state_lock(skpair);
633 			/* No more writes */
634 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
635 			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
636 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
637 			unix_state_unlock(skpair);
638 			skpair->sk_state_change(skpair);
639 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
640 		}
641 
642 		unix_dgram_peer_wake_disconnect(sk, skpair);
643 		sock_put(skpair); /* It may now die */
644 	}
645 
646 	/* Try to flush out this socket. Throw out buffers at least */
647 
648 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
649 		if (state == TCP_LISTEN)
650 			unix_release_sock(skb->sk, 1);
651 		/* passed fds are erased in the kfree_skb hook	      */
652 		UNIXCB(skb).consumed = skb->len;
653 		kfree_skb(skb);
654 	}
655 
656 	if (path.dentry)
657 		path_put(&path);
658 
659 	sock_put(sk);
660 
661 	/* ---- Socket is dead now and most probably destroyed ---- */
662 
663 	/*
664 	 * Fixme: BSD difference: In BSD all sockets connected to us get
665 	 *	  ECONNRESET and we die on the spot. In Linux we behave
666 	 *	  like files and pipes do and wait for the last
667 	 *	  dereference.
668 	 *
669 	 * Can't we simply set sock->err?
670 	 *
671 	 *	  What the above comment does talk about? --ANK(980817)
672 	 */
673 
674 	if (READ_ONCE(unix_tot_inflight))
675 		unix_gc();		/* Garbage collect fds */
676 }
677 
678 static void init_peercred(struct sock *sk)
679 {
680 	const struct cred *old_cred;
681 	struct pid *old_pid;
682 
683 	spin_lock(&sk->sk_peer_lock);
684 	old_pid = sk->sk_peer_pid;
685 	old_cred = sk->sk_peer_cred;
686 	sk->sk_peer_pid  = get_pid(task_tgid(current));
687 	sk->sk_peer_cred = get_current_cred();
688 	spin_unlock(&sk->sk_peer_lock);
689 
690 	put_pid(old_pid);
691 	put_cred(old_cred);
692 }
693 
694 static void copy_peercred(struct sock *sk, struct sock *peersk)
695 {
696 	const struct cred *old_cred;
697 	struct pid *old_pid;
698 
699 	if (sk < peersk) {
700 		spin_lock(&sk->sk_peer_lock);
701 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
702 	} else {
703 		spin_lock(&peersk->sk_peer_lock);
704 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
705 	}
706 	old_pid = sk->sk_peer_pid;
707 	old_cred = sk->sk_peer_cred;
708 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
709 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
710 
711 	spin_unlock(&sk->sk_peer_lock);
712 	spin_unlock(&peersk->sk_peer_lock);
713 
714 	put_pid(old_pid);
715 	put_cred(old_cred);
716 }
717 
718 static int unix_listen(struct socket *sock, int backlog)
719 {
720 	int err;
721 	struct sock *sk = sock->sk;
722 	struct unix_sock *u = unix_sk(sk);
723 
724 	err = -EOPNOTSUPP;
725 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
726 		goto out;	/* Only stream/seqpacket sockets accept */
727 	err = -EINVAL;
728 	if (!READ_ONCE(u->addr))
729 		goto out;	/* No listens on an unbound socket */
730 	unix_state_lock(sk);
731 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
732 		goto out_unlock;
733 	if (backlog > sk->sk_max_ack_backlog)
734 		wake_up_interruptible_all(&u->peer_wait);
735 	sk->sk_max_ack_backlog	= backlog;
736 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
737 
738 	/* set credentials so connect can copy them */
739 	init_peercred(sk);
740 	err = 0;
741 
742 out_unlock:
743 	unix_state_unlock(sk);
744 out:
745 	return err;
746 }
747 
748 static int unix_release(struct socket *);
749 static int unix_bind(struct socket *, struct sockaddr *, int);
750 static int unix_stream_connect(struct socket *, struct sockaddr *,
751 			       int addr_len, int flags);
752 static int unix_socketpair(struct socket *, struct socket *);
753 static int unix_accept(struct socket *, struct socket *, int, bool);
754 static int unix_getname(struct socket *, struct sockaddr *, int);
755 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
756 static __poll_t unix_dgram_poll(struct file *, struct socket *,
757 				    poll_table *);
758 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
759 #ifdef CONFIG_COMPAT
760 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
761 #endif
762 static int unix_shutdown(struct socket *, int);
763 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
764 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
765 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
766 				       struct pipe_inode_info *, size_t size,
767 				       unsigned int flags);
768 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
769 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
770 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
771 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
772 static int unix_dgram_connect(struct socket *, struct sockaddr *,
773 			      int, int);
774 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
775 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
776 				  int);
777 
778 static int unix_set_peek_off(struct sock *sk, int val)
779 {
780 	struct unix_sock *u = unix_sk(sk);
781 
782 	if (mutex_lock_interruptible(&u->iolock))
783 		return -EINTR;
784 
785 	WRITE_ONCE(sk->sk_peek_off, val);
786 	mutex_unlock(&u->iolock);
787 
788 	return 0;
789 }
790 
#ifdef CONFIG_PROC_FS
/* Sum scm_stat.nr_fds over the sockets behind every skb queued on
 * @sk's receive queue, with the queue lock held.
 */
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff_head *queue = &sk->sk_receive_queue;
	struct sk_buff *skb;
	int total = 0;

	spin_lock(&queue->lock);
	for (skb = skb_peek(queue); skb; skb = skb_peek_next(skb, queue))
		total += atomic_read(&unix_sk(skb->sk)->scm_stat.nr_fds);
	spin_unlock(&queue->lock);

	return total;
}

/* show_fdinfo handler: prints one "scm_fds" line for the socket. */
static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (!sk)
		return;

	s_state = READ_ONCE(sk->sk_state);
	u = unix_sk(sk);

	/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
	 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
	 * SOCK_DGRAM is ordinary. So, no lock is needed.
	 */
	if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
		nr_fds = atomic_read(&u->scm_stat.nr_fds);
	else if (s_state == TCP_LISTEN)
		nr_fds = unix_count_nr_fds(sk);

	seq_printf(m, "scm_fds: %u\n", nr_fds);
}
#else
#define unix_show_fdinfo NULL
#endif
836 
837 static const struct proto_ops unix_stream_ops = {
838 	.family =	PF_UNIX,
839 	.owner =	THIS_MODULE,
840 	.release =	unix_release,
841 	.bind =		unix_bind,
842 	.connect =	unix_stream_connect,
843 	.socketpair =	unix_socketpair,
844 	.accept =	unix_accept,
845 	.getname =	unix_getname,
846 	.poll =		unix_poll,
847 	.ioctl =	unix_ioctl,
848 #ifdef CONFIG_COMPAT
849 	.compat_ioctl =	unix_compat_ioctl,
850 #endif
851 	.listen =	unix_listen,
852 	.shutdown =	unix_shutdown,
853 	.sendmsg =	unix_stream_sendmsg,
854 	.recvmsg =	unix_stream_recvmsg,
855 	.read_skb =	unix_stream_read_skb,
856 	.mmap =		sock_no_mmap,
857 	.splice_read =	unix_stream_splice_read,
858 	.set_peek_off =	unix_set_peek_off,
859 	.show_fdinfo =	unix_show_fdinfo,
860 };
861 
862 static const struct proto_ops unix_dgram_ops = {
863 	.family =	PF_UNIX,
864 	.owner =	THIS_MODULE,
865 	.release =	unix_release,
866 	.bind =		unix_bind,
867 	.connect =	unix_dgram_connect,
868 	.socketpair =	unix_socketpair,
869 	.accept =	sock_no_accept,
870 	.getname =	unix_getname,
871 	.poll =		unix_dgram_poll,
872 	.ioctl =	unix_ioctl,
873 #ifdef CONFIG_COMPAT
874 	.compat_ioctl =	unix_compat_ioctl,
875 #endif
876 	.listen =	sock_no_listen,
877 	.shutdown =	unix_shutdown,
878 	.sendmsg =	unix_dgram_sendmsg,
879 	.read_skb =	unix_read_skb,
880 	.recvmsg =	unix_dgram_recvmsg,
881 	.mmap =		sock_no_mmap,
882 	.set_peek_off =	unix_set_peek_off,
883 	.show_fdinfo =	unix_show_fdinfo,
884 };
885 
886 static const struct proto_ops unix_seqpacket_ops = {
887 	.family =	PF_UNIX,
888 	.owner =	THIS_MODULE,
889 	.release =	unix_release,
890 	.bind =		unix_bind,
891 	.connect =	unix_stream_connect,
892 	.socketpair =	unix_socketpair,
893 	.accept =	unix_accept,
894 	.getname =	unix_getname,
895 	.poll =		unix_dgram_poll,
896 	.ioctl =	unix_ioctl,
897 #ifdef CONFIG_COMPAT
898 	.compat_ioctl =	unix_compat_ioctl,
899 #endif
900 	.listen =	unix_listen,
901 	.shutdown =	unix_shutdown,
902 	.sendmsg =	unix_seqpacket_sendmsg,
903 	.recvmsg =	unix_seqpacket_recvmsg,
904 	.mmap =		sock_no_mmap,
905 	.set_peek_off =	unix_set_peek_off,
906 	.show_fdinfo =	unix_show_fdinfo,
907 };
908 
/* Intentionally empty ->close() hook: unix sockets have nothing to do
 * here, the callback only exists for sockmap.
 */
static void unix_close(struct sock *sk, long timeout)
{
}
915 
/* Intentionally empty ->unhash() hook: unix sockets have nothing to do
 * here, the callback only exists for sockmap.
 */
static void unix_unhash(struct sock *sk)
{
}
922 
923 static bool unix_bpf_bypass_getsockopt(int level, int optname)
924 {
925 	if (level == SOL_SOCKET) {
926 		switch (optname) {
927 		case SO_PEERPIDFD:
928 			return true;
929 		default:
930 			return false;
931 		}
932 	}
933 
934 	return false;
935 }
936 
937 struct proto unix_dgram_proto = {
938 	.name			= "UNIX",
939 	.owner			= THIS_MODULE,
940 	.obj_size		= sizeof(struct unix_sock),
941 	.close			= unix_close,
942 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
943 #ifdef CONFIG_BPF_SYSCALL
944 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
945 #endif
946 };
947 
948 struct proto unix_stream_proto = {
949 	.name			= "UNIX-STREAM",
950 	.owner			= THIS_MODULE,
951 	.obj_size		= sizeof(struct unix_sock),
952 	.close			= unix_close,
953 	.unhash			= unix_unhash,
954 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
955 #ifdef CONFIG_BPF_SYSCALL
956 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
957 #endif
958 };
959 
960 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
961 {
962 	struct unix_sock *u;
963 	struct sock *sk;
964 	int err;
965 
966 	atomic_long_inc(&unix_nr_socks);
967 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
968 		err = -ENFILE;
969 		goto err;
970 	}
971 
972 	if (type == SOCK_STREAM)
973 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
974 	else /*dgram and  seqpacket */
975 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
976 
977 	if (!sk) {
978 		err = -ENOMEM;
979 		goto err;
980 	}
981 
982 	sock_init_data(sock, sk);
983 
984 	sk->sk_hash		= unix_unbound_hash(sk);
985 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
986 	sk->sk_write_space	= unix_write_space;
987 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
988 	sk->sk_destruct		= unix_sock_destructor;
989 	u = unix_sk(sk);
990 	u->inflight = 0;
991 	u->path.dentry = NULL;
992 	u->path.mnt = NULL;
993 	spin_lock_init(&u->lock);
994 	INIT_LIST_HEAD(&u->link);
995 	mutex_init(&u->iolock); /* single task reading lock */
996 	mutex_init(&u->bindlock); /* single task binding lock */
997 	init_waitqueue_head(&u->peer_wait);
998 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
999 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1000 	unix_insert_unbound_socket(net, sk);
1001 
1002 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1003 
1004 	return sk;
1005 
1006 err:
1007 	atomic_long_dec(&unix_nr_socks);
1008 	return ERR_PTR(err);
1009 }
1010 
1011 static int unix_create(struct net *net, struct socket *sock, int protocol,
1012 		       int kern)
1013 {
1014 	struct sock *sk;
1015 
1016 	if (protocol && protocol != PF_UNIX)
1017 		return -EPROTONOSUPPORT;
1018 
1019 	sock->state = SS_UNCONNECTED;
1020 
1021 	switch (sock->type) {
1022 	case SOCK_STREAM:
1023 		sock->ops = &unix_stream_ops;
1024 		break;
1025 		/*
1026 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1027 		 *	nothing uses it.
1028 		 */
1029 	case SOCK_RAW:
1030 		sock->type = SOCK_DGRAM;
1031 		fallthrough;
1032 	case SOCK_DGRAM:
1033 		sock->ops = &unix_dgram_ops;
1034 		break;
1035 	case SOCK_SEQPACKET:
1036 		sock->ops = &unix_seqpacket_ops;
1037 		break;
1038 	default:
1039 		return -ESOCKTNOSUPPORT;
1040 	}
1041 
1042 	sk = unix_create1(net, sock, kern, sock->type);
1043 	if (IS_ERR(sk))
1044 		return PTR_ERR(sk);
1045 
1046 	return 0;
1047 }
1048 
1049 static int unix_release(struct socket *sock)
1050 {
1051 	struct sock *sk = sock->sk;
1052 
1053 	if (!sk)
1054 		return 0;
1055 
1056 	sk->sk_prot->close(sk, 0);
1057 	unix_release_sock(sk, 0);
1058 	sock->sk = NULL;
1059 
1060 	return 0;
1061 }
1062 
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * The path is resolved following symlinks, must be writable by the
 * caller and must name a socket inode with a unix socket of the
 * requested @type bound to it.  On success the path's atime is touched
 * and the socket is returned with the reference obtained from
 * unix_find_socket_byinode() still held.
 *
 * On failure an ERR_PTR is returned: the kern_path() or
 * path_permission() error, -ECONNREFUSED when the path is not a socket
 * or nothing is bound to it, or -EPROTOTYPE on a type mismatch.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct sock *found;
	struct path path;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);

	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		return ERR_PTR(err);

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto drop_path;

	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode)) {
		err = -ECONNREFUSED;
		goto drop_path;
	}

	found = unix_find_socket_byinode(inode);
	if (!found) {
		err = -ECONNREFUSED;
		goto drop_path;
	}

	if (found->sk_type != type) {
		err = -EPROTOTYPE;
		sock_put(found);
		goto drop_path;
	}

	touch_atime(&path);
	path_put(&path);

	return found;

drop_path:
	path_put(&path);
	return ERR_PTR(err);
}
1106 
/*
 * Look up the socket bound to the abstract name in @sunaddr within @net.
 * @type only feeds the hash computation.  If the socket found also has a
 * filesystem path recorded, that path's atime is touched.
 *
 * Returns the socket, or ERR_PTR(-ECONNREFUSED) when no socket carries
 * that name.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct sock *found;

	found = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!found)
		return ERR_PTR(-ECONNREFUSED);

	if (unix_sk(found)->path.dentry)
		touch_atime(&unix_sk(found)->path);

	return found;
}
1125 
1126 static struct sock *unix_find_other(struct net *net,
1127 				    struct sockaddr_un *sunaddr,
1128 				    int addr_len, int type)
1129 {
1130 	struct sock *sk;
1131 
1132 	if (sunaddr->sun_path[0])
1133 		sk = unix_find_bsd(sunaddr, addr_len, type);
1134 	else
1135 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1136 
1137 	return sk;
1138 }
1139 
1140 static int unix_autobind(struct sock *sk)
1141 {
1142 	struct unix_sock *u = unix_sk(sk);
1143 	unsigned int new_hash, old_hash;
1144 	struct net *net = sock_net(sk);
1145 	struct unix_address *addr;
1146 	u32 lastnum, ordernum;
1147 	int err;
1148 
1149 	err = mutex_lock_interruptible(&u->bindlock);
1150 	if (err)
1151 		return err;
1152 
1153 	if (u->addr)
1154 		goto out;
1155 
1156 	err = -ENOMEM;
1157 	addr = kzalloc(sizeof(*addr) +
1158 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1159 	if (!addr)
1160 		goto out;
1161 
1162 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1163 	addr->name->sun_family = AF_UNIX;
1164 	refcount_set(&addr->refcnt, 1);
1165 
1166 	old_hash = sk->sk_hash;
1167 	ordernum = get_random_u32();
1168 	lastnum = ordernum & 0xFFFFF;
1169 retry:
1170 	ordernum = (ordernum + 1) & 0xFFFFF;
1171 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1172 
1173 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1174 	unix_table_double_lock(net, old_hash, new_hash);
1175 
1176 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1177 		unix_table_double_unlock(net, old_hash, new_hash);
1178 
1179 		/* __unix_find_socket_byname() may take long time if many names
1180 		 * are already in use.
1181 		 */
1182 		cond_resched();
1183 
1184 		if (ordernum == lastnum) {
1185 			/* Give up if all names seems to be in use. */
1186 			err = -ENOSPC;
1187 			unix_release_addr(addr);
1188 			goto out;
1189 		}
1190 
1191 		goto retry;
1192 	}
1193 
1194 	__unix_set_addr_hash(net, sk, addr, new_hash);
1195 	unix_table_double_unlock(net, old_hash, new_hash);
1196 	err = 0;
1197 
1198 out:	mutex_unlock(&u->bindlock);
1199 	return err;
1200 }
1201 
/*
 * Bind @sk to the filesystem path given in @sunaddr.
 *
 * A socket inode is created at the path with vfs_mknod().  Then, under
 * u->bindlock, the path is recorded in u->path and the socket is rehashed
 * by the new inode.
 *
 * Returns 0 on success.  Errors: -ENOMEM if the address cannot be
 * allocated, -EINVAL if the socket already has an address, the
 * mutex_lock_interruptible() error if interrupted, or the error from the
 * path lookup / security hook / mknod, with -EEXIST reported to the
 * caller as -EADDRINUSE.  If anything fails after a successful mknod,
 * the freshly created node is unlinked again.
 */
1202 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1203 			 int addr_len)
1204 {
	/* Socket inode's mode bits with the caller's umask applied. */
1205 	umode_t mode = S_IFSOCK |
1206 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1207 	struct unix_sock *u = unix_sk(sk);
1208 	unsigned int new_hash, old_hash;
1209 	struct net *net = sock_net(sk);
1210 	struct mnt_idmap *idmap;
1211 	struct unix_address *addr;
1212 	struct dentry *dentry;
1213 	struct path parent;
1214 	int err;
1215 
1216 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1217 	addr = unix_create_addr(sunaddr, addr_len);
1218 	if (!addr)
1219 		return -ENOMEM;
1220 
1221 	/*
1222 	 * Get the parent directory, calculate the hash for last
1223 	 * component.
1224 	 */
1225 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1226 	if (IS_ERR(dentry)) {
1227 		err = PTR_ERR(dentry);
1228 		goto out;
1229 	}
1230 
1231 	/*
1232 	 * All right, let's create it.
1233 	 */
1234 	idmap = mnt_idmap(parent.mnt);
1235 	err = security_path_mknod(&parent, dentry, mode, 0);
1236 	if (!err)
1237 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1238 	if (err)
1239 		goto out_path;
1240 	err = mutex_lock_interruptible(&u->bindlock);
1241 	if (err)
1242 		goto out_unlink;
	/* Already bound: out_unlock turns this into -EINVAL. */
1243 	if (u->addr)
1244 		goto out_unlock;
1245 
1246 	old_hash = sk->sk_hash;
1247 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1248 	unix_table_double_lock(net, old_hash, new_hash);
	/* The socket keeps its own references on the mount and dentry. */
1249 	u->path.mnt = mntget(parent.mnt);
1250 	u->path.dentry = dget(dentry);
1251 	__unix_set_addr_hash(net, sk, addr, new_hash);
1252 	unix_table_double_unlock(net, old_hash, new_hash);
1253 	unix_insert_bsd_socket(sk);
1254 	mutex_unlock(&u->bindlock);
1255 	done_path_create(&parent, dentry);
1256 	return 0;
1257 
	/* Error unwinding: each label falls through into the next one. */
1258 out_unlock:
1259 	mutex_unlock(&u->bindlock);
1260 	err = -EINVAL;
1261 out_unlink:
1262 	/* failed after successful mknod?  unlink what we'd created... */
1263 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1264 out_path:
1265 	done_path_create(&parent, dentry);
1266 out:
1267 	unix_release_addr(addr);
1268 	return err == -EEXIST ? -EADDRINUSE : err;
1269 }
1270 
1271 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1272 			      int addr_len)
1273 {
1274 	struct unix_sock *u = unix_sk(sk);
1275 	unsigned int new_hash, old_hash;
1276 	struct net *net = sock_net(sk);
1277 	struct unix_address *addr;
1278 	int err;
1279 
1280 	addr = unix_create_addr(sunaddr, addr_len);
1281 	if (!addr)
1282 		return -ENOMEM;
1283 
1284 	err = mutex_lock_interruptible(&u->bindlock);
1285 	if (err)
1286 		goto out;
1287 
1288 	if (u->addr) {
1289 		err = -EINVAL;
1290 		goto out_mutex;
1291 	}
1292 
1293 	old_hash = sk->sk_hash;
1294 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1295 	unix_table_double_lock(net, old_hash, new_hash);
1296 
1297 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1298 		goto out_spin;
1299 
1300 	__unix_set_addr_hash(net, sk, addr, new_hash);
1301 	unix_table_double_unlock(net, old_hash, new_hash);
1302 	mutex_unlock(&u->bindlock);
1303 	return 0;
1304 
1305 out_spin:
1306 	unix_table_double_unlock(net, old_hash, new_hash);
1307 	err = -EADDRINUSE;
1308 out_mutex:
1309 	mutex_unlock(&u->bindlock);
1310 out:
1311 	unix_release_addr(addr);
1312 	return err;
1313 }
1314 
1315 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1316 {
1317 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1318 	struct sock *sk = sock->sk;
1319 	int err;
1320 
1321 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1322 	    sunaddr->sun_family == AF_UNIX)
1323 		return unix_autobind(sk);
1324 
1325 	err = unix_validate_addr(sunaddr, addr_len);
1326 	if (err)
1327 		return err;
1328 
1329 	if (sunaddr->sun_path[0])
1330 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1331 	else
1332 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1333 
1334 	return err;
1335 }
1336 
1337 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1338 {
1339 	if (unlikely(sk1 == sk2) || !sk2) {
1340 		unix_state_lock(sk1);
1341 		return;
1342 	}
1343 	if (sk1 > sk2)
1344 		swap(sk1, sk2);
1345 
1346 	unix_state_lock(sk1);
1347 	unix_state_lock_nested(sk2, U_LOCK_SECOND);
1348 }
1349 
/*
 * Counterpart of unix_state_double_lock(): always unlock @sk1, and also
 * unlock @sk2 when it is non-NULL and a different socket.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (sk2 && likely(sk1 != sk2))
		unix_state_unlock(sk2);
}
1359 
/*
 * connect() for datagram sockets.
 *
 * With sa_family == AF_UNSPEC the current peer (if any) is dropped.
 * Otherwise the address is validated, the socket is autobound when it
 * has SOCK_PASSCRED or SOCK_PASSPIDFD set and no address yet, and the
 * target is looked up and installed as the new peer.  A previous peer is
 * disconnected and its reference dropped.
 *
 * Returns 0 on success, -EINVAL if @alen does not cover sa_family,
 * -EPERM if unix_may_send() refuses the target, or the error returned by
 * unix_validate_addr(), unix_autobind(), unix_find_other() or
 * security_unix_may_send().
 */
1360 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1361 			      int alen, int flags)
1362 {
1363 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1364 	struct sock *sk = sock->sk;
1365 	struct sock *other;
1366 	int err;
1367 
1368 	err = -EINVAL;
1369 	if (alen < offsetofend(struct sockaddr, sa_family))
1370 		goto out;
1371 
1372 	if (addr->sa_family != AF_UNSPEC) {
1373 		err = unix_validate_addr(sunaddr, alen);
1374 		if (err)
1375 			goto out;
1376 
1377 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1378 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1379 		    !READ_ONCE(unix_sk(sk)->addr)) {
1380 			err = unix_autobind(sk);
1381 			if (err)
1382 				goto out;
1383 		}
1384 
1385 restart:
1386 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1387 		if (IS_ERR(other)) {
1388 			err = PTR_ERR(other);
1389 			goto out;
1390 		}
1391 
1392 		unix_state_double_lock(sk, other);
1393 
1394 		/* Apparently VFS overslept socket death. Retry. */
1395 		if (sock_flag(other, SOCK_DEAD)) {
1396 			unix_state_double_unlock(sk, other);
1397 			sock_put(other);
1398 			goto restart;
1399 		}
1400 
1401 		err = -EPERM;
1402 		if (!unix_may_send(sk, other))
1403 			goto out_unlock;
1404 
1405 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1406 		if (err)
1407 			goto out_unlock;
1408 
		/* Both state locks are held while the states are updated. */
1409 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1410 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1411 	} else {
1412 		/*
1413 		 *	1003.1g breaking connected state with AF_UNSPEC
1414 		 */
1415 		other = NULL;
		/* other is NULL, so this takes sk's state lock only. */
1416 		unix_state_double_lock(sk, other);
1417 	}
1418 
1419 	/*
1420 	 * If it was connected, reconnect.
1421 	 */
1422 	if (unix_peer(sk)) {
1423 		struct sock *old_peer = unix_peer(sk);
1424 
		/*
		 * The reference obtained by the lookup above is kept and
		 * becomes the peer reference.
		 */
1425 		unix_peer(sk) = other;
1426 		if (!other)
1427 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1428 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1429 
1430 		unix_state_double_unlock(sk, other);
1431 
1432 		if (other != old_peer) {
1433 			unix_dgram_disconnected(sk, old_peer);
1434 
			/* Mark the old peer closed if it has no peer itself. */
1435 			unix_state_lock(old_peer);
1436 			if (!unix_peer(old_peer))
1437 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1438 			unix_state_unlock(old_peer);
1439 		}
1440 
1441 		sock_put(old_peer);
1442 	} else {
1443 		unix_peer(sk) = other;
1444 		unix_state_double_unlock(sk, other);
1445 	}
1446 
1447 	return 0;
1448 
1449 out_unlock:
1450 	unix_state_double_unlock(sk, other);
1451 	sock_put(other);
1452 out:
1453 	return err;
1454 }
1455 
/*
 * Wait for room on @other's receive queue.
 *
 * Entered with @other's state lock held; the lock is always released
 * before returning.  The task sleeps only if @other is not dead, is not
 * receive-shutdown and its receive queue is full.
 *
 * Returns the value of schedule_timeout() if it slept, otherwise @timeo
 * unchanged.
 */
1456 static long unix_wait_for_peer(struct sock *other, long timeo)
1457 	__releases(&unix_sk(other)->lock)
1458 {
1459 	struct unix_sock *u = unix_sk(other);
1460 	int sched;
1461 	DEFINE_WAIT(wait);
1462 
	/* Queue ourselves before testing the condition below. */
1463 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1464 
1465 	sched = !sock_flag(other, SOCK_DEAD) &&
1466 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1467 		unix_recvq_full_lockless(other);
1468 
1469 	unix_state_unlock(other);
1470 
1471 	if (sched)
1472 		timeo = schedule_timeout(timeo);
1473 
1474 	finish_wait(&u->peer_wait, &wait);
1475 	return timeo;
1476 }
1477 
/*
 * Connect @sock to the listening socket named by @uaddr.
 *
 * The sock for the server side of the connection (newsk) and a one-byte
 * skb are allocated up front.  Once the listener is found and both state
 * locks are held, sk and newsk are made each other's peers and the skb
 * is queued on the listener's receive queue.
 * NOTE(review): unix_accept() later retrieves the new sock through
 * skb->sk, presumably set by sock_wmalloc(newsk, ...) - confirm.
 *
 * Returns 0 on success or a negative errno, among them: -ENOMEM if the
 * skb cannot be allocated, -ECONNREFUSED if the target is not listening
 * or is receive-shutdown, -EAGAIN if the listener's queue is full and
 * the send timeout is zero, -EISCONN if sk is already established,
 * -EINVAL if sk is in any other state than TCP_CLOSE, or the error from
 * unix_validate_addr(), unix_autobind(), unix_create1(),
 * unix_find_other(), sock_intr_errno() or the security hook.
 */
1478 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1479 			       int addr_len, int flags)
1480 {
1481 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1482 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1483 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1484 	struct net *net = sock_net(sk);
1485 	struct sk_buff *skb = NULL;
1486 	long timeo;
1487 	int err;
1488 
1489 	err = unix_validate_addr(sunaddr, addr_len);
1490 	if (err)
1491 		goto out;
1492 
1493 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1494 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1495 	    !READ_ONCE(u->addr)) {
1496 		err = unix_autobind(sk);
1497 		if (err)
1498 			goto out;
1499 	}
1500 
1501 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1502 
1503 	/* First of all allocate resources.
1504 	   If we will make it after state is locked,
1505 	   we will have to recheck all again in any case.
1506 	 */
1507 
1508 	/* create new sock for complete connection */
1509 	newsk = unix_create1(net, NULL, 0, sock->type);
1510 	if (IS_ERR(newsk)) {
1511 		err = PTR_ERR(newsk);
		/* Keep the "out" path from releasing an ERR_PTR. */
1512 		newsk = NULL;
1513 		goto out;
1514 	}
1515 
1516 	err = -ENOMEM;
1517 
1518 	/* Allocate skb for sending to listening sock */
1519 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1520 	if (skb == NULL)
1521 		goto out;
1522 
1523 restart:
1524 	/*  Find listening sock. */
1525 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1526 	if (IS_ERR(other)) {
1527 		err = PTR_ERR(other);
		/* Keep the "out" path from putting an ERR_PTR. */
1528 		other = NULL;
1529 		goto out;
1530 	}
1531 
1532 	/* Latch state of peer */
1533 	unix_state_lock(other);
1534 
1535 	/* Apparently VFS overslept socket death. Retry. */
1536 	if (sock_flag(other, SOCK_DEAD)) {
1537 		unix_state_unlock(other);
1538 		sock_put(other);
1539 		goto restart;
1540 	}
1541 
1542 	err = -ECONNREFUSED;
1543 	if (other->sk_state != TCP_LISTEN)
1544 		goto out_unlock;
1545 	if (other->sk_shutdown & RCV_SHUTDOWN)
1546 		goto out_unlock;
1547 
1548 	if (unix_recvq_full_lockless(other)) {
1549 		err = -EAGAIN;
1550 		if (!timeo)
1551 			goto out_unlock;
1552 
		/* Releases other's state lock, so "out" is used below. */
1553 		timeo = unix_wait_for_peer(other, timeo);
1554 
1555 		err = sock_intr_errno(timeo);
1556 		if (signal_pending(current))
1557 			goto out;
1558 		sock_put(other);
1559 		goto restart;
1560 	}
1561 
1562 	/* Latch our state.
1563 
1564 	   It is tricky place. We need to grab our state lock and cannot
1565 	   drop lock on peer. It is dangerous because deadlock is
1566 	   possible. Connect to self case and simultaneous
1567 	   attempt to connect are eliminated by checking socket
1568 	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1569 	   check this before attempt to grab lock.
1570 
1571 	   Well, and we have to recheck the state after socket locked.
1572 	 */
1573 	switch (READ_ONCE(sk->sk_state)) {
1574 	case TCP_CLOSE:
1575 		/* This is ok... continue with connect */
1576 		break;
1577 	case TCP_ESTABLISHED:
1578 		/* Socket is already connected */
1579 		err = -EISCONN;
1580 		goto out_unlock;
1581 	default:
1582 		err = -EINVAL;
1583 		goto out_unlock;
1584 	}
1585 
1586 	unix_state_lock_nested(sk, U_LOCK_SECOND);
1587 
	/* State changed before we got the lock: drop everything and retry. */
1588 	if (sk->sk_state != TCP_CLOSE) {
1589 		unix_state_unlock(sk);
1590 		unix_state_unlock(other);
1591 		sock_put(other);
1592 		goto restart;
1593 	}
1594 
1595 	err = security_unix_stream_connect(sk, other, newsk);
1596 	if (err) {
1597 		unix_state_unlock(sk);
1598 		goto out_unlock;
1599 	}
1600 
1601 	/* The way is open! Fastly set all the necessary fields... */
1602 
1603 	sock_hold(sk);
1604 	unix_peer(newsk)	= sk;
1605 	newsk->sk_state		= TCP_ESTABLISHED;
1606 	newsk->sk_type		= sk->sk_type;
1607 	init_peercred(newsk);
1608 	newu = unix_sk(newsk);
1609 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1610 	otheru = unix_sk(other);
1611 
1612 	/* copy address information from listening to new sock
1613 	 *
1614 	 * The contents of *(otheru->addr) and otheru->path
1615 	 * are seen fully set up here, since we have found
1616 	 * otheru in hash under its lock.  Insertion into the
1617 	 * hash chain we'd found it in had been done in an
1618 	 * earlier critical area protected by the chain's lock,
1619 	 * the same one where we'd set *(otheru->addr) contents,
1620 	 * as well as otheru->path and otheru->addr itself.
1621 	 *
1622 	 * Using smp_store_release() here to set newu->addr
1623 	 * is enough to make those stores, as well as stores
1624 	 * to newu->path visible to anyone who gets newu->addr
1625 	 * by smp_load_acquire().  IOW, the same warranties
1626 	 * as for unix_sock instances bound in unix_bind() or
1627 	 * in unix_autobind().
1628 	 */
1629 	if (otheru->path.dentry) {
1630 		path_get(&otheru->path);
1631 		newu->path = otheru->path;
1632 	}
1633 	refcount_inc(&otheru->addr->refcnt);
1634 	smp_store_release(&newu->addr, otheru->addr);
1635 
1636 	/* Set credentials */
1637 	copy_peercred(sk, other);
1638 
1639 	sock->state	= SS_CONNECTED;
1640 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1641 	sock_hold(newsk);
1642 
1643 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1644 	unix_peer(sk)	= newsk;
1645 
1646 	unix_state_unlock(sk);
1647 
1648 	/* take ten and send info to listening sock */
1649 	spin_lock(&other->sk_receive_queue.lock);
1650 	__skb_queue_tail(&other->sk_receive_queue, skb);
1651 	spin_unlock(&other->sk_receive_queue.lock);
1652 	unix_state_unlock(other);
1653 	other->sk_data_ready(other);
1654 	sock_put(other);
1655 	return 0;
1656 
1657 out_unlock:
1658 	if (other)
1659 		unix_state_unlock(other);
1660 
	/* Common cleanup: free the skb, release newsk, put the listener. */
1661 out:
1662 	kfree_skb(skb);
1663 	if (newsk)
1664 		unix_release_sock(newsk, 0);
1665 	if (other)
1666 		sock_put(other);
1667 	return err;
1668 }
1669 
1670 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1671 {
1672 	struct sock *ska = socka->sk, *skb = sockb->sk;
1673 
1674 	/* Join our sockets back to back */
1675 	sock_hold(ska);
1676 	sock_hold(skb);
1677 	unix_peer(ska) = skb;
1678 	unix_peer(skb) = ska;
1679 	init_peercred(ska);
1680 	init_peercred(skb);
1681 
1682 	ska->sk_state = TCP_ESTABLISHED;
1683 	skb->sk_state = TCP_ESTABLISHED;
1684 	socka->state  = SS_CONNECTED;
1685 	sockb->state  = SS_CONNECTED;
1686 	return 0;
1687 }
1688 
1689 static void unix_sock_inherit_flags(const struct socket *old,
1690 				    struct socket *new)
1691 {
1692 	if (test_bit(SOCK_PASSCRED, &old->flags))
1693 		set_bit(SOCK_PASSCRED, &new->flags);
1694 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1695 		set_bit(SOCK_PASSPIDFD, &new->flags);
1696 	if (test_bit(SOCK_PASSSEC, &old->flags))
1697 		set_bit(SOCK_PASSSEC, &new->flags);
1698 }
1699 
1700 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1701 		       bool kern)
1702 {
1703 	struct sock *sk = sock->sk;
1704 	struct sock *tsk;
1705 	struct sk_buff *skb;
1706 	int err;
1707 
1708 	err = -EOPNOTSUPP;
1709 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1710 		goto out;
1711 
1712 	err = -EINVAL;
1713 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1714 		goto out;
1715 
1716 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1717 	 * so that no locks are necessary.
1718 	 */
1719 
1720 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1721 				&err);
1722 	if (!skb) {
1723 		/* This means receive shutdown. */
1724 		if (err == 0)
1725 			err = -EINVAL;
1726 		goto out;
1727 	}
1728 
1729 	tsk = skb->sk;
1730 	skb_free_datagram(sk, skb);
1731 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1732 
1733 	/* attach accepted sock to socket */
1734 	unix_state_lock(tsk);
1735 	newsock->state = SS_CONNECTED;
1736 	unix_sock_inherit_flags(sock, newsock);
1737 	sock_graft(tsk, newsock);
1738 	unix_state_unlock(tsk);
1739 	return 0;
1740 
1741 out:
1742 	return err;
1743 }
1744 
1745 
1746 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1747 {
1748 	struct sock *sk = sock->sk;
1749 	struct unix_address *addr;
1750 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1751 	int err = 0;
1752 
1753 	if (peer) {
1754 		sk = unix_peer_get(sk);
1755 
1756 		err = -ENOTCONN;
1757 		if (!sk)
1758 			goto out;
1759 		err = 0;
1760 	} else {
1761 		sock_hold(sk);
1762 	}
1763 
1764 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1765 	if (!addr) {
1766 		sunaddr->sun_family = AF_UNIX;
1767 		sunaddr->sun_path[0] = 0;
1768 		err = offsetof(struct sockaddr_un, sun_path);
1769 	} else {
1770 		err = addr->len;
1771 		memcpy(sunaddr, addr->name, addr->len);
1772 	}
1773 	sock_put(sk);
1774 out:
1775 	return err;
1776 }
1777 
/*
 * Give @scm its own duplicate (scm_fp_dup()) of the file list attached to
 * @skb, then synchronize with the unix garbage collector by taking and
 * immediately releasing unix_gc_lock, as explained in the body.
 */
1778 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1779 {
1780 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1781 
1782 	/*
1783 	 * Garbage collection of unix sockets starts by selecting a set of
1784 	 * candidate sockets which have reference only from being in flight
1785 	 * (total_refs == inflight_refs).  This condition is checked once during
1786 	 * the candidate collection phase, and candidates are marked as such, so
1787 	 * that non-candidates can later be ignored.  While inflight_refs is
1788 	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1789 	 * is an instantaneous decision.
1790 	 *
1791 	 * Once a candidate, however, the socket must not be reinstalled into a
1792 	 * file descriptor while the garbage collection is in progress.
1793 	 *
1794 	 * If the above conditions are met, then the directed graph of
1795 	 * candidates (*) does not change while unix_gc_lock is held.
1796 	 *
1797 	 * Any operation that changes the file count through file descriptors
1798 	 * (dup, close, sendmsg) does not change the graph since candidates are
1799 	 * not installed in fds.
1800 	 *
1801 	 * Dequeuing a candidate via recvmsg would install it into an fd, but
1802 	 * that takes unix_gc_lock to decrement the inflight count, so it's
1803 	 * serialized with garbage collection.
1804 	 *
1805 	 * MSG_PEEK is special in that it does not change the inflight count,
1806 	 * yet does install the socket into an fd.  The following lock/unlock
1807 	 * pair is to ensure serialization with garbage collection.  It must be
1808 	 * done between incrementing the file count and installing the file into
1809 	 * an fd.
1810 	 *
1811 	 * If garbage collection starts after the barrier provided by the
1812 	 * lock/unlock, then it will see the elevated refcount and not mark this
1813 	 * as a candidate.  If a garbage collection is already in progress
1814 	 * before the file count was incremented, then the lock/unlock pair will
1815 	 * ensure that garbage collection is finished before progressing to
1816 	 * installing the fd.
1817 	 *
1818 	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1819 	 * which is on the queue of listening socket A.
1820 	 */
1821 	spin_lock(&unix_gc_lock);
1822 	spin_unlock(&unix_gc_lock);
1823 }
1824 
1825 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1826 {
1827 	int err = 0;
1828 
1829 	UNIXCB(skb).pid  = get_pid(scm->pid);
1830 	UNIXCB(skb).uid = scm->creds.uid;
1831 	UNIXCB(skb).gid = scm->creds.gid;
1832 	UNIXCB(skb).fp = NULL;
1833 	unix_get_secdata(scm, skb);
1834 	if (scm->fp && send_fds)
1835 		err = unix_attach_fds(scm, skb);
1836 
1837 	skb->destructor = unix_destruct_scm;
1838 	return err;
1839 }
1840 
1841 static bool unix_passcred_enabled(const struct socket *sock,
1842 				  const struct sock *other)
1843 {
1844 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1845 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1846 	       !other->sk_socket ||
1847 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1848 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1849 }
1850 
1851 /*
1852  * Some apps rely on write() giving SCM_CREDENTIALS
1853  * We include credentials if source or destination socket
1854  * asserted SOCK_PASSCRED.
1855  */
1856 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1857 			    const struct sock *other)
1858 {
1859 	if (UNIXCB(skb).pid)
1860 		return;
1861 	if (unix_passcred_enabled(sock, other)) {
1862 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1863 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1864 	}
1865 }
1866 
1867 static bool unix_skb_scm_eq(struct sk_buff *skb,
1868 			    struct scm_cookie *scm)
1869 {
1870 	return UNIXCB(skb).pid == scm->pid &&
1871 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1872 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1873 	       unix_secdata_eq(scm, skb);
1874 }
1875 
1876 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1877 {
1878 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1879 	struct unix_sock *u = unix_sk(sk);
1880 
1881 	if (unlikely(fp && fp->count))
1882 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1883 }
1884 
1885 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1886 {
1887 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1888 	struct unix_sock *u = unix_sk(sk);
1889 
1890 	if (unlikely(fp && fp->count))
1891 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1892 }
1893 
1894 /*
1895  *	Send AF_UNIX data.
1896  */
1897 
/*
 * sendmsg() for datagram sockets.
 *
 * The destination is msg->msg_name when one is supplied, otherwise the
 * connected peer.  The whole message is copied into a single skb, which
 * is queued on the receiver.  If the receiver's queue is full the sender
 * waits up to the send timeout, or fails with -EAGAIN when the timeout
 * is zero.
 *
 * Returns @len on success (also when the receiver's socket filter drops
 * the packet) or a negative errno, among them: -EOPNOTSUPP for MSG_OOB,
 * -ENOTCONN with no address and no peer, -EMSGSIZE if @len exceeds
 * sk_sndbuf - 32, -ECONNRESET if the peer went away and no address was
 * supplied, -EPERM if unix_may_send() refuses, -EPIPE if the receiver is
 * receive-shutdown (or dead, for SOCK_SEQPACKET), -ECONNREFUSED if the
 * connected peer turned out dead, and -EAGAIN as described above.
 */
1898 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1899 			      size_t len)
1900 {
1901 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1902 	struct sock *sk = sock->sk, *other = NULL;
1903 	struct unix_sock *u = unix_sk(sk);
1904 	struct scm_cookie scm;
1905 	struct sk_buff *skb;
1906 	int data_len = 0;
	/* Non-zero while sk's state lock is held in addition to other's. */
1907 	int sk_locked;
1908 	long timeo;
1909 	int err;
1910 
1911 	wait_for_unix_gc();
1912 	err = scm_send(sock, msg, &scm, false);
1913 	if (err < 0)
1914 		return err;
1915 
1916 	err = -EOPNOTSUPP;
1917 	if (msg->msg_flags&MSG_OOB)
1918 		goto out;
1919 
1920 	if (msg->msg_namelen) {
1921 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1922 		if (err)
1923 			goto out;
1924 	} else {
1925 		sunaddr = NULL;
1926 		err = -ENOTCONN;
1927 		other = unix_peer_get(sk);
1928 		if (!other)
1929 			goto out;
1930 	}
1931 
1932 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1933 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1934 	    !READ_ONCE(u->addr)) {
1935 		err = unix_autobind(sk);
1936 		if (err)
1937 			goto out;
1938 	}
1939 
1940 	err = -EMSGSIZE;
1941 	if (len > READ_ONCE(sk->sk_sndbuf) - 32)
1942 		goto out;
1943 
	/* Anything beyond SKB_MAX_ALLOC goes into page fragments. */
1944 	if (len > SKB_MAX_ALLOC) {
1945 		data_len = min_t(size_t,
1946 				 len - SKB_MAX_ALLOC,
1947 				 MAX_SKB_FRAGS * PAGE_SIZE);
1948 		data_len = PAGE_ALIGN(data_len);
1949 
1950 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1951 	}
1952 
1953 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1954 				   msg->msg_flags & MSG_DONTWAIT, &err,
1955 				   PAGE_ALLOC_COSTLY_ORDER);
1956 	if (skb == NULL)
1957 		goto out;
1958 
1959 	err = unix_scm_to_skb(&scm, skb, true);
1960 	if (err < 0)
1961 		goto out_free;
1962 
1963 	skb_put(skb, len - data_len);
1964 	skb->data_len = data_len;
1965 	skb->len = len;
1966 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1967 	if (err)
1968 		goto out_free;
1969 
1970 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1971 
1972 restart:
1973 	if (!other) {
1974 		err = -ECONNRESET;
1975 		if (sunaddr == NULL)
1976 			goto out_free;
1977 
1978 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1979 					sk->sk_type);
1980 		if (IS_ERR(other)) {
1981 			err = PTR_ERR(other);
			/* Keep the "out" path from putting an ERR_PTR. */
1982 			other = NULL;
1983 			goto out_free;
1984 		}
1985 	}
1986 
1987 	if (sk_filter(other, skb) < 0) {
1988 		/* Toss the packet but do not return any error to the sender */
1989 		err = len;
1990 		goto out_free;
1991 	}
1992 
1993 	sk_locked = 0;
1994 	unix_state_lock(other);
1995 restart_locked:
1996 	err = -EPERM;
1997 	if (!unix_may_send(sk, other))
1998 		goto out_unlock;
1999 
2000 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2001 		/*
2002 		 *	Check with 1003.1g - what should
2003 		 *	datagram error
2004 		 */
2005 		unix_state_unlock(other);
2006 		sock_put(other);
2007 
2008 		if (!sk_locked)
2009 			unix_state_lock(sk);
2010 
2011 		err = 0;
2012 		if (sk->sk_type == SOCK_SEQPACKET) {
2013 			/* We are here only when racing with unix_release_sock()
2014 			 * is clearing @other. Never change state to TCP_CLOSE
2015 			 * unlike SOCK_DGRAM wants.
2016 			 */
2017 			unix_state_unlock(sk);
2018 			err = -EPIPE;
2019 		} else if (unix_peer(sk) == other) {
2020 			unix_peer(sk) = NULL;
2021 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2022 
2023 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2024 			unix_state_unlock(sk);
2025 
2026 			unix_dgram_disconnected(sk, other);
			/* Second put: the reference that unix_peer(sk) held. */
2027 			sock_put(other);
2028 			err = -ECONNREFUSED;
2029 		} else {
2030 			unix_state_unlock(sk);
2031 		}
2032 
		/* err == 0 here: look the destination up again by address. */
2033 		other = NULL;
2034 		if (err)
2035 			goto out_free;
2036 		goto restart;
2037 	}
2038 
2039 	err = -EPIPE;
2040 	if (other->sk_shutdown & RCV_SHUTDOWN)
2041 		goto out_unlock;
2042 
2043 	if (sk->sk_type != SOCK_SEQPACKET) {
2044 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2045 		if (err)
2046 			goto out_unlock;
2047 	}
2048 
2049 	/* other == sk && unix_peer(other) != sk if
2050 	 * - unix_peer(sk) == NULL, destination address bound to sk
2051 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
2052 	 */
2053 	if (other != sk &&
2054 	    unlikely(unix_peer(other) != sk &&
2055 	    unix_recvq_full_lockless(other))) {
2056 		if (timeo) {
			/* Releases other's state lock. */
2057 			timeo = unix_wait_for_peer(other, timeo);
2058 
2059 			err = sock_intr_errno(timeo);
2060 			if (signal_pending(current))
2061 				goto out_free;
2062 
2063 			goto restart;
2064 		}
2065 
		/* Non-blocking: retake both locks in double-lock order. */
2066 		if (!sk_locked) {
2067 			unix_state_unlock(other);
2068 			unix_state_double_lock(sk, other);
2069 		}
2070 
2071 		if (unix_peer(sk) != other ||
2072 		    unix_dgram_peer_wake_me(sk, other)) {
2073 			err = -EAGAIN;
2074 			sk_locked = 1;
2075 			goto out_unlock;
2076 		}
2077 
		/* Locks were dropped and retaken above: redo all the checks. */
2078 		if (!sk_locked) {
2079 			sk_locked = 1;
2080 			goto restart_locked;
2081 		}
2082 	}
2083 
2084 	if (unlikely(sk_locked))
2085 		unix_state_unlock(sk);
2086 
2087 	if (sock_flag(other, SOCK_RCVTSTAMP))
2088 		__net_timestamp(skb);
2089 	maybe_add_creds(skb, sock, other);
2090 	scm_stat_add(other, skb);
2091 	skb_queue_tail(&other->sk_receive_queue, skb);
2092 	unix_state_unlock(other);
2093 	other->sk_data_ready(other);
2094 	sock_put(other);
2095 	scm_destroy(&scm);
2096 	return len;
2097 
2098 out_unlock:
2099 	if (sk_locked)
2100 		unix_state_unlock(sk);
2101 	unix_state_unlock(other);
2102 out_free:
2103 	kfree_skb(skb);
2104 out:
2105 	if (other)
2106 		sock_put(other);
2107 	scm_destroy(&scm);
2108 	return err;
2109 }
2110 
2111 /* Stream sockets use paged skbs.  The paged occupancy of one skb is
2112  * limited to 32768 bytes, with a minimum of one full page.
2113  */
2114 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2115 
2116 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Queue a single out-of-band byte on @other.
 *
 * A one-byte skb is allocated on the sending socket, filled with one
 * byte taken from @msg and appended to @other's receive queue.  It also
 * becomes @other's oob_skb; a previously recorded oob_skb is released
 * with consume_skb().  Files from @scm are attached only when @fds_sent
 * is false.  The receiver is notified via sk_send_sigurg() and its
 * sk_data_ready callback.
 *
 * Returns 0 on success, -EPIPE if @other is dead or receive-shutdown, or
 * the error from skb allocation, unix_scm_to_skb() or the data copy.
 */
2117 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2118 		     struct scm_cookie *scm, bool fds_sent)
2119 {
2120 	struct unix_sock *ousk = unix_sk(other);
2121 	struct sk_buff *skb;
2122 	int err = 0;
2123 
2124 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2125 
2126 	if (!skb)
2127 		return err;
2128 
2129 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2130 	if (err < 0) {
2131 		kfree_skb(skb);
2132 		return err;
2133 	}
2134 	skb_put(skb, 1);
2135 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2136 
2137 	if (err) {
2138 		kfree_skb(skb);
2139 		return err;
2140 	}
2141 
2142 	unix_state_lock(other);
2143 
2144 	if (sock_flag(other, SOCK_DEAD) ||
2145 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2146 		unix_state_unlock(other);
2147 		kfree_skb(skb);
2148 		return -EPIPE;
2149 	}
2150 
2151 	maybe_add_creds(skb, sock, other);
	/*
	 * Extra reference on the skb.
	 * NOTE(review): presumably held on behalf of ousk->oob_skb, which
	 * points at the skb in addition to the receive queue - confirm.
	 */
2152 	skb_get(skb);
2153 
2154 	scm_stat_add(other, skb);
2155 
	/* Swap the OOB marker and queue the skb under the queue lock. */
2156 	spin_lock(&other->sk_receive_queue.lock);
2157 	if (ousk->oob_skb)
2158 		consume_skb(ousk->oob_skb);
2159 	WRITE_ONCE(ousk->oob_skb, skb);
2160 	__skb_queue_tail(&other->sk_receive_queue, skb);
2161 	spin_unlock(&other->sk_receive_queue.lock);
2162 
2163 	sk_send_sigurg(other);
2164 	unix_state_unlock(other);
2165 	other->sk_data_ready(other);
2166 
2167 	return err;
2168 }
2169 #endif
2170 
/*
 * Send @len bytes from @msg on a connected stream socket.
 *
 * The payload is cut into skbs that are appended, one at a time, to the
 * peer's receive queue.  Ancillary data prepared by scm_send() is attached
 * to every skb through unix_scm_to_skb(), with file descriptors going only
 * into the first one.  With MSG_OOB (and CONFIG_AF_UNIX_OOB) the last byte
 * is held back from the loop and queued through queue_oob() afterwards.
 *
 * Returns the number of bytes queued if that is non-zero, otherwise a
 * negative errno.  On the pipe_err path SIGPIPE is raised only when nothing
 * was sent and MSG_NOSIGNAL is not set.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB) {
		/* With OOB support and a non-empty message, reserve the final
		 * byte for queue_oob(); in every other case MSG_OOB fails
		 * with -EOPNOTSUPP.
		 */
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	/* An explicit destination address is always rejected here. */
	if (msg->msg_namelen) {
		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			/* Empty skb; the pages are attached by
			 * skb_splice_from_iter() below.
			 */
			skb = sock_alloc_send_pskb(sk, 0, 0,
						   msg->msg_flags & MSG_DONTWAIT,
						   &err, 0);
		} else {
			/* Keep two messages in the pipe so it schedules better */
			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);

			/* allow fallback to order-0 allocations */
			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

			/* Whatever does not fit the linear head goes into
			 * the paged part, rounded up to whole pages but
			 * never more than @size.
			 */
			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
						   msg->msg_flags & MSG_DONTWAIT, &err,
						   get_order(UNIX_SKB_FRAGS_SZ));
		}
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
						   sk->sk_allocation);
			if (err < 0) {
				kfree_skb(skb);
				goto out_err;
			}
			/* The splice may have taken fewer bytes than asked. */
			size = err;
			refcount_add(size, &sk->sk_wmem_alloc);
		} else {
			skb_put(skb, size - data_len);
			skb->data_len = data_len;
			skb->len = size;
			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
			if (err) {
				kfree_skb(skb);
				goto out_err;
			}
		}

		unix_state_lock(other);

		/* The peer may have gone away or shut down reception while
		 * the skb was being built.
		 */
		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		scm_stat_add(other, skb);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		/* Queue the byte that was held back above. */
		err = queue_oob(sock, msg, other, &scm, fds_sent);
		if (err)
			goto out_err;
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	/* Entered with other's state lock held and skb not yet queued. */
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	/* A partial send wins over the error. */
	return sent ? : err;
}
2302 
2303 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2304 				  size_t len)
2305 {
2306 	int err;
2307 	struct sock *sk = sock->sk;
2308 
2309 	err = sock_error(sk);
2310 	if (err)
2311 		return err;
2312 
2313 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2314 		return -ENOTCONN;
2315 
2316 	if (msg->msg_namelen)
2317 		msg->msg_namelen = 0;
2318 
2319 	return unix_dgram_sendmsg(sock, msg, len);
2320 }
2321 
2322 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2323 				  size_t size, int flags)
2324 {
2325 	struct sock *sk = sock->sk;
2326 
2327 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2328 		return -ENOTCONN;
2329 
2330 	return unix_dgram_recvmsg(sock, msg, size, flags);
2331 }
2332 
2333 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2334 {
2335 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2336 
2337 	if (addr) {
2338 		msg->msg_namelen = addr->len;
2339 		memcpy(msg->msg_name, addr->name, addr->len);
2340 	}
2341 }
2342 
/*
 * Receive one datagram from @sk into @msg.
 *
 * MSG_OOB is refused with -EOPNOTSUPP.  Otherwise the function tries to take
 * an skb from the receive queue under u->iolock, sleeping between attempts
 * for as long as the receive timeout allows.  When an skb is found the loop
 * is left with u->iolock still held; it is released at out_free.
 *
 * On success the return value is the number of bytes copied, or the full
 * remaining length of the datagram when MSG_TRUNC was passed in @flags.
 * msg->msg_flags gets MSG_TRUNC when the datagram did not fit.  Sender
 * credentials, security data and passed file descriptors are delivered
 * through scm_recv_unix().
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			/* leave the loop with iolock held */
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	/* An skb was obtained from the queue: wake anyone sleeping on
	 * peer_wait for write readiness.
	 */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	/* Clamp to what is left of the datagram after the peek offset and
	 * flag truncation when the caller's buffer is too small.
	 */
	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	/* With MSG_TRUNC in @flags report the real length, not the copied one. */
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
2449 
2450 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2451 			      int flags)
2452 {
2453 	struct sock *sk = sock->sk;
2454 
2455 #ifdef CONFIG_BPF_SYSCALL
2456 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2457 
2458 	if (prot != &unix_dgram_proto)
2459 		return prot->recvmsg(sk, msg, size, flags, NULL);
2460 #endif
2461 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2462 }
2463 
2464 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2465 {
2466 	struct unix_sock *u = unix_sk(sk);
2467 	struct sk_buff *skb;
2468 	int err;
2469 
2470 	mutex_lock(&u->iolock);
2471 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2472 	mutex_unlock(&u->iolock);
2473 	if (!skb)
2474 		return err;
2475 
2476 	return recv_actor(sk, skb);
2477 }
2478 
2479 /*
2480  *	Sleep until more data has arrived. But check for races..
2481  */
2482 static long unix_stream_data_wait(struct sock *sk, long timeo,
2483 				  struct sk_buff *last, unsigned int last_len,
2484 				  bool freezable)
2485 {
2486 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2487 	struct sk_buff *tail;
2488 	DEFINE_WAIT(wait);
2489 
2490 	unix_state_lock(sk);
2491 
2492 	for (;;) {
2493 		prepare_to_wait(sk_sleep(sk), &wait, state);
2494 
2495 		tail = skb_peek_tail(&sk->sk_receive_queue);
2496 		if (tail != last ||
2497 		    (tail && tail->len != last_len) ||
2498 		    sk->sk_err ||
2499 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2500 		    signal_pending(current) ||
2501 		    !timeo)
2502 			break;
2503 
2504 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2505 		unix_state_unlock(sk);
2506 		timeo = schedule_timeout(timeo);
2507 		unix_state_lock(sk);
2508 
2509 		if (sock_flag(sk, SOCK_DEAD))
2510 			break;
2511 
2512 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2513 	}
2514 
2515 	finish_wait(sk_sleep(sk), &wait);
2516 	unix_state_unlock(sk);
2517 	return timeo;
2518 }
2519 
2520 static unsigned int unix_skb_len(const struct sk_buff *skb)
2521 {
2522 	return skb->len - UNIXCB(skb).consumed;
2523 }
2524 
/*
 * Parameters of one stream read, shared by the recvmsg and splice front
 * ends of unix_stream_read_generic().
 */
struct unix_stream_read_state {
	/* Called as recv_actor(skb, skip, chunk, state) to move up to
	 * @chunk bytes out of @skb; a negative return is treated as failure
	 * by the callers in this file.
	 */
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;		/* socket being read */
	struct msghdr *msg;		/* destination for recvmsg; left unset by the splice path */
	struct pipe_inode_info *pipe;	/* destination for splice reads */
	size_t size;			/* number of bytes requested */
	int flags;			/* MSG_* flags of the read */
	unsigned int splice_flags;	/* flags handed to skb_splice_bits() */
};
2535 
2536 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2537 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2538 {
2539 	struct socket *sock = state->socket;
2540 	struct sock *sk = sock->sk;
2541 	struct unix_sock *u = unix_sk(sk);
2542 	int chunk = 1;
2543 	struct sk_buff *oob_skb;
2544 
2545 	mutex_lock(&u->iolock);
2546 	unix_state_lock(sk);
2547 	spin_lock(&sk->sk_receive_queue.lock);
2548 
2549 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2550 		spin_unlock(&sk->sk_receive_queue.lock);
2551 		unix_state_unlock(sk);
2552 		mutex_unlock(&u->iolock);
2553 		return -EINVAL;
2554 	}
2555 
2556 	oob_skb = u->oob_skb;
2557 
2558 	if (!(state->flags & MSG_PEEK))
2559 		WRITE_ONCE(u->oob_skb, NULL);
2560 	else
2561 		skb_get(oob_skb);
2562 
2563 	spin_unlock(&sk->sk_receive_queue.lock);
2564 	unix_state_unlock(sk);
2565 
2566 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2567 
2568 	if (!(state->flags & MSG_PEEK))
2569 		UNIXCB(oob_skb).consumed += 1;
2570 
2571 	consume_skb(oob_skb);
2572 
2573 	mutex_unlock(&u->iolock);
2574 
2575 	if (chunk < 0)
2576 		return -EFAULT;
2577 
2578 	state->msg->msg_flags |= MSG_OOB;
2579 	return 1;
2580 }
2581 
/*
 * Decide what a stream read should do with @skb with respect to out-of-band
 * data.  Called from unix_stream_read_generic() with the skb it is about to
 * read; @copied is the number of bytes already gathered by that read.
 *
 * Returns the skb the read should continue with, or NULL:
 *  - a fully consumed skb is unlinked and released (unless peeking) and NULL
 *    is returned;
 *  - if @skb is the pending OOB skb and data was already copied, NULL is
 *    returned so the read stops in front of it;
 *  - if @skb is the pending OOB skb, nothing was copied and this is not a
 *    peek: with SOCK_URGINLINE the OOB marker is cleared and @skb is
 *    returned for normal reading; otherwise @skb is removed from the queue
 *    and freed, and the new queue head is returned;
 *  - on a peek without SOCK_URGINLINE the OOB skb is skipped in favour of
 *    its successor.
 * Any other skb is returned unchanged.
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct unix_sock *u = unix_sk(sk);

	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
		skb_unlink(skb, &sk->sk_receive_queue);
		consume_skb(skb);
		skb = NULL;
	} else {
		struct sk_buff *unlinked_skb = NULL;

		spin_lock(&sk->sk_receive_queue.lock);

		if (skb == u->oob_skb) {
			if (copied) {
				skb = NULL;
			} else if (!(flags & MSG_PEEK)) {
				if (sock_flag(sk, SOCK_URGINLINE)) {
					/* Read it as ordinary data: give up
					 * the reference held via ->oob_skb.
					 */
					WRITE_ONCE(u->oob_skb, NULL);
					consume_skb(skb);
				} else {
					__skb_unlink(skb, &sk->sk_receive_queue);
					WRITE_ONCE(u->oob_skb, NULL);
					unlinked_skb = skb;
					skb = skb_peek(&sk->sk_receive_queue);
				}
			} else if (!sock_flag(sk, SOCK_URGINLINE)) {
				skb = skb_peek_next(skb, &sk->sk_receive_queue);
			}
		}

		spin_unlock(&sk->sk_receive_queue.lock);

		/* The unlinked skb is released only after the queue lock has
		 * been dropped: one reference goes through skb_unref(), the
		 * other through kfree_skb().
		 */
		if (unlinked_skb) {
			WARN_ON_ONCE(skb_unref(unlinked_skb));
			kfree_skb(unlinked_skb);
		}
	}
	return skb;
}
2623 #endif
2624 
2625 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2626 {
2627 	struct unix_sock *u = unix_sk(sk);
2628 	struct sk_buff *skb;
2629 	int err;
2630 
2631 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2632 		return -ENOTCONN;
2633 
2634 	mutex_lock(&u->iolock);
2635 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2636 	mutex_unlock(&u->iolock);
2637 	if (!skb)
2638 		return err;
2639 
2640 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2641 	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2642 		bool drop = false;
2643 
2644 		unix_state_lock(sk);
2645 
2646 		if (sock_flag(sk, SOCK_DEAD)) {
2647 			unix_state_unlock(sk);
2648 			kfree_skb(skb);
2649 			return -ECONNRESET;
2650 		}
2651 
2652 		spin_lock(&sk->sk_receive_queue.lock);
2653 		if (likely(skb == u->oob_skb)) {
2654 			WRITE_ONCE(u->oob_skb, NULL);
2655 			drop = true;
2656 		}
2657 		spin_unlock(&sk->sk_receive_queue.lock);
2658 
2659 		unix_state_unlock(sk);
2660 
2661 		if (drop) {
2662 			WARN_ON_ONCE(skb_unref(skb));
2663 			kfree_skb(skb);
2664 			return -EAGAIN;
2665 		}
2666 	}
2667 #endif
2668 
2669 	return recv_actor(sk, skb);
2670 }
2671 
/*
 * Common implementation of stream reads (recvmsg and splice).
 *
 * @state describes the request and supplies the actor that moves bytes out
 * of each skb; @freezable is passed through to unix_stream_data_wait().
 *
 * The socket must be established (-EINVAL otherwise).  MSG_OOB requests are
 * diverted to unix_stream_recv_urg() when OOB support is built in and fail
 * with -EOPNOTSUPP otherwise.  For everything else the receive queue is
 * walked under u->iolock, honouring the peek offset, until @state->size
 * bytes were moved or one of the stop conditions in the loop is met.
 *
 * Returns the number of bytes moved if that is non-zero, otherwise the
 * error code (0 when the read simply ended, e.g. on receive shutdown).
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		/* Remember the queue head and its length so that
		 * unix_stream_data_wait() can tell whether anything changed.
		 */
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

again:
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			/* Data was copied and the OOB skb is next: stop here. */
			if (!skb && copied) {
				unix_state_unlock(sk);
				break;
			}
		}
#endif
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			/* iolock is dropped for the duration of the sleep. */
			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				/* iolock is not held on this path */
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			/* Reached by goto only, with the state lock held. */
			unix_state_unlock(sk);
			break;
		}

		/* Step over skbs that lie entirely before the peek offset. */
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			/* NOTE(review): sunaddr appears to be declared by
			 * DECLARE_SOCKADDR() inside this block, so clearing
			 * it would not stop later iterations from copying
			 * the address again - confirm.
			 */
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		/* Hold a reference across the actor call. */
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			/* skb not exhausted: the request has been satisfied */
			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* Stop once file descriptors have been picked up. */
			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			/* Peeking: move on to the next skb without dequeuing. */
			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv_unix(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
2884 
2885 static int unix_stream_read_actor(struct sk_buff *skb,
2886 				  int skip, int chunk,
2887 				  struct unix_stream_read_state *state)
2888 {
2889 	int ret;
2890 
2891 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2892 				    state->msg, chunk);
2893 	return ret ?: chunk;
2894 }
2895 
2896 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2897 			  size_t size, int flags)
2898 {
2899 	struct unix_stream_read_state state = {
2900 		.recv_actor = unix_stream_read_actor,
2901 		.socket = sk->sk_socket,
2902 		.msg = msg,
2903 		.size = size,
2904 		.flags = flags
2905 	};
2906 
2907 	return unix_stream_read_generic(&state, true);
2908 }
2909 
2910 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2911 			       size_t size, int flags)
2912 {
2913 	struct unix_stream_read_state state = {
2914 		.recv_actor = unix_stream_read_actor,
2915 		.socket = sock,
2916 		.msg = msg,
2917 		.size = size,
2918 		.flags = flags
2919 	};
2920 
2921 #ifdef CONFIG_BPF_SYSCALL
2922 	struct sock *sk = sock->sk;
2923 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2924 
2925 	if (prot != &unix_stream_proto)
2926 		return prot->recvmsg(sk, msg, size, flags, NULL);
2927 #endif
2928 	return unix_stream_read_generic(&state, true);
2929 }
2930 
2931 static int unix_stream_splice_actor(struct sk_buff *skb,
2932 				    int skip, int chunk,
2933 				    struct unix_stream_read_state *state)
2934 {
2935 	return skb_splice_bits(skb, state->socket->sk,
2936 			       UNIXCB(skb).consumed + skip,
2937 			       state->pipe, chunk, state->splice_flags);
2938 }
2939 
2940 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2941 				       struct pipe_inode_info *pipe,
2942 				       size_t size, unsigned int flags)
2943 {
2944 	struct unix_stream_read_state state = {
2945 		.recv_actor = unix_stream_splice_actor,
2946 		.socket = sock,
2947 		.pipe = pipe,
2948 		.size = size,
2949 		.splice_flags = flags,
2950 	};
2951 
2952 	if (unlikely(*ppos))
2953 		return -ESPIPE;
2954 
2955 	if (sock->file->f_flags & O_NONBLOCK ||
2956 	    flags & SPLICE_F_NONBLOCK)
2957 		state.flags = MSG_DONTWAIT;
2958 
2959 	return unix_stream_read_generic(&state, false);
2960 }
2961 
2962 static int unix_shutdown(struct socket *sock, int mode)
2963 {
2964 	struct sock *sk = sock->sk;
2965 	struct sock *other;
2966 
2967 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2968 		return -EINVAL;
2969 	/* This maps:
2970 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2971 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2972 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2973 	 */
2974 	++mode;
2975 
2976 	unix_state_lock(sk);
2977 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2978 	other = unix_peer(sk);
2979 	if (other)
2980 		sock_hold(other);
2981 	unix_state_unlock(sk);
2982 	sk->sk_state_change(sk);
2983 
2984 	if (other &&
2985 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2986 
2987 		int peer_mode = 0;
2988 		const struct proto *prot = READ_ONCE(other->sk_prot);
2989 
2990 		if (prot->unhash)
2991 			prot->unhash(other);
2992 		if (mode&RCV_SHUTDOWN)
2993 			peer_mode |= SEND_SHUTDOWN;
2994 		if (mode&SEND_SHUTDOWN)
2995 			peer_mode |= RCV_SHUTDOWN;
2996 		unix_state_lock(other);
2997 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2998 		unix_state_unlock(other);
2999 		other->sk_state_change(other);
3000 		if (peer_mode == SHUTDOWN_MASK)
3001 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3002 		else if (peer_mode & RCV_SHUTDOWN)
3003 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3004 	}
3005 	if (other)
3006 		sock_put(other);
3007 
3008 	return 0;
3009 }
3010 
3011 long unix_inq_len(struct sock *sk)
3012 {
3013 	struct sk_buff *skb;
3014 	long amount = 0;
3015 
3016 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3017 		return -EINVAL;
3018 
3019 	spin_lock(&sk->sk_receive_queue.lock);
3020 	if (sk->sk_type == SOCK_STREAM ||
3021 	    sk->sk_type == SOCK_SEQPACKET) {
3022 		skb_queue_walk(&sk->sk_receive_queue, skb)
3023 			amount += unix_skb_len(skb);
3024 	} else {
3025 		skb = skb_peek(&sk->sk_receive_queue);
3026 		if (skb)
3027 			amount = skb->len;
3028 	}
3029 	spin_unlock(&sk->sk_receive_queue.lock);
3030 
3031 	return amount;
3032 }
3033 EXPORT_SYMBOL_GPL(unix_inq_len);
3034 
/* Amount of write memory currently charged to @sk, per sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3040 
3041 static int unix_open_file(struct sock *sk)
3042 {
3043 	struct path path;
3044 	struct file *f;
3045 	int fd;
3046 
3047 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3048 		return -EPERM;
3049 
3050 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3051 		return -ENOENT;
3052 
3053 	path = unix_sk(sk)->path;
3054 	if (!path.dentry)
3055 		return -ENOENT;
3056 
3057 	path_get(&path);
3058 
3059 	fd = get_unused_fd_flags(O_CLOEXEC);
3060 	if (fd < 0)
3061 		goto out;
3062 
3063 	f = dentry_open(&path, O_PATH, current_cred());
3064 	if (IS_ERR(f)) {
3065 		put_unused_fd(fd);
3066 		fd = PTR_ERR(f);
3067 		goto out;
3068 	}
3069 
3070 	fd_install(fd, f);
3071 out:
3072 	path_put(&path);
3073 
3074 	return fd;
3075 }
3076 
3077 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3078 {
3079 	struct sock *sk = sock->sk;
3080 	long amount = 0;
3081 	int err;
3082 
3083 	switch (cmd) {
3084 	case SIOCOUTQ:
3085 		amount = unix_outq_len(sk);
3086 		err = put_user(amount, (int __user *)arg);
3087 		break;
3088 	case SIOCINQ:
3089 		amount = unix_inq_len(sk);
3090 		if (amount < 0)
3091 			err = amount;
3092 		else
3093 			err = put_user(amount, (int __user *)arg);
3094 		break;
3095 	case SIOCUNIXFILE:
3096 		err = unix_open_file(sk);
3097 		break;
3098 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3099 	case SIOCATMARK:
3100 		{
3101 			struct sk_buff *skb;
3102 			int answ = 0;
3103 
3104 			skb = skb_peek(&sk->sk_receive_queue);
3105 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3106 				answ = 1;
3107 			err = put_user(answ, (int __user *)arg);
3108 		}
3109 		break;
3110 #endif
3111 	default:
3112 		err = -ENOIOCTLCMD;
3113 		break;
3114 	}
3115 	return err;
3116 }
3117 
3118 #ifdef CONFIG_COMPAT
/* Compat ioctl: convert @arg with compat_ptr() and reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3123 #endif
3124 
3125 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3126 {
3127 	struct sock *sk = sock->sk;
3128 	unsigned char state;
3129 	__poll_t mask;
3130 	u8 shutdown;
3131 
3132 	sock_poll_wait(file, sock, wait);
3133 	mask = 0;
3134 	shutdown = READ_ONCE(sk->sk_shutdown);
3135 	state = READ_ONCE(sk->sk_state);
3136 
3137 	/* exceptional events? */
3138 	if (READ_ONCE(sk->sk_err))
3139 		mask |= EPOLLERR;
3140 	if (shutdown == SHUTDOWN_MASK)
3141 		mask |= EPOLLHUP;
3142 	if (shutdown & RCV_SHUTDOWN)
3143 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3144 
3145 	/* readable? */
3146 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3147 		mask |= EPOLLIN | EPOLLRDNORM;
3148 	if (sk_is_readable(sk))
3149 		mask |= EPOLLIN | EPOLLRDNORM;
3150 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3151 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3152 		mask |= EPOLLPRI;
3153 #endif
3154 
3155 	/* Connection-based need to check for termination and startup */
3156 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3157 	    state == TCP_CLOSE)
3158 		mask |= EPOLLHUP;
3159 
3160 	/*
3161 	 * we set writable also when the other side has shut down the
3162 	 * connection. This prevents stuck sockets.
3163 	 */
3164 	if (unix_writable(sk, state))
3165 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3166 
3167 	return mask;
3168 }
3169 
3170 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3171 				    poll_table *wait)
3172 {
3173 	struct sock *sk = sock->sk, *other;
3174 	unsigned int writable;
3175 	unsigned char state;
3176 	__poll_t mask;
3177 	u8 shutdown;
3178 
3179 	sock_poll_wait(file, sock, wait);
3180 	mask = 0;
3181 	shutdown = READ_ONCE(sk->sk_shutdown);
3182 	state = READ_ONCE(sk->sk_state);
3183 
3184 	/* exceptional events? */
3185 	if (READ_ONCE(sk->sk_err) ||
3186 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3187 		mask |= EPOLLERR |
3188 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3189 
3190 	if (shutdown & RCV_SHUTDOWN)
3191 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3192 	if (shutdown == SHUTDOWN_MASK)
3193 		mask |= EPOLLHUP;
3194 
3195 	/* readable? */
3196 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3197 		mask |= EPOLLIN | EPOLLRDNORM;
3198 	if (sk_is_readable(sk))
3199 		mask |= EPOLLIN | EPOLLRDNORM;
3200 
3201 	/* Connection-based need to check for termination and startup */
3202 	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3203 		mask |= EPOLLHUP;
3204 
3205 	/* No write status requested, avoid expensive OUT tests. */
3206 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3207 		return mask;
3208 
3209 	writable = unix_writable(sk, state);
3210 	if (writable) {
3211 		unix_state_lock(sk);
3212 
3213 		other = unix_peer(sk);
3214 		if (other && unix_peer(other) != sk &&
3215 		    unix_recvq_full_lockless(other) &&
3216 		    unix_dgram_peer_wake_me(sk, other))
3217 			writable = 0;
3218 
3219 		unix_state_unlock(sk);
3220 	}
3221 
3222 	if (writable)
3223 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3224 	else
3225 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3226 
3227 	return mask;
3228 }
3229 
3230 #ifdef CONFIG_PROC_FS
3231 
/*
 * The seq_file position packs two values into one word: the hash bucket
 * index in the bits above BUCKET_SPACE and the offset within that bucket in
 * the low BUCKET_SPACE bits.  unix_from_bucket() counts sockets starting at
 * 1, and the iterator sets the offset to 1 whenever it moves to a new
 * bucket.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Extract the bucket index / in-bucket offset, or combine the two. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3237 
3238 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3239 {
3240 	unsigned long offset = get_offset(*pos);
3241 	unsigned long bucket = get_bucket(*pos);
3242 	unsigned long count = 0;
3243 	struct sock *sk;
3244 
3245 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3246 	     sk; sk = sk_next(sk)) {
3247 		if (++count == offset)
3248 			break;
3249 	}
3250 
3251 	return sk;
3252 }
3253 
3254 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3255 {
3256 	unsigned long bucket = get_bucket(*pos);
3257 	struct net *net = seq_file_net(seq);
3258 	struct sock *sk;
3259 
3260 	while (bucket < UNIX_HASH_SIZE) {
3261 		spin_lock(&net->unx.table.locks[bucket]);
3262 
3263 		sk = unix_from_bucket(seq, pos);
3264 		if (sk)
3265 			return sk;
3266 
3267 		spin_unlock(&net->unx.table.locks[bucket]);
3268 
3269 		*pos = set_bucket_offset(++bucket, 1);
3270 	}
3271 
3272 	return NULL;
3273 }
3274 
3275 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3276 				  loff_t *pos)
3277 {
3278 	unsigned long bucket = get_bucket(*pos);
3279 
3280 	sk = sk_next(sk);
3281 	if (sk)
3282 		return sk;
3283 
3284 
3285 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3286 
3287 	*pos = set_bucket_offset(++bucket, 1);
3288 
3289 	return unix_get_first(seq, pos);
3290 }
3291 
3292 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3293 {
3294 	if (!*pos)
3295 		return SEQ_START_TOKEN;
3296 
3297 	return unix_get_first(seq, pos);
3298 }
3299 
3300 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3301 {
3302 	++*pos;
3303 
3304 	if (v == SEQ_START_TOKEN)
3305 		return unix_get_first(seq, pos);
3306 
3307 	return unix_get_next(seq, v, pos);
3308 }
3309 
3310 static void unix_seq_stop(struct seq_file *seq, void *v)
3311 {
3312 	struct sock *sk = v;
3313 
3314 	if (sk)
3315 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3316 }
3317 
3318 static int unix_seq_show(struct seq_file *seq, void *v)
3319 {
3320 
3321 	if (v == SEQ_START_TOKEN)
3322 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3323 			 "Inode Path\n");
3324 	else {
3325 		struct sock *s = v;
3326 		struct unix_sock *u = unix_sk(s);
3327 		unix_state_lock(s);
3328 
3329 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3330 			s,
3331 			refcount_read(&s->sk_refcnt),
3332 			0,
3333 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3334 			s->sk_type,
3335 			s->sk_socket ?
3336 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3337 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3338 			sock_i_ino(s));
3339 
3340 		if (u->addr) {	// under a hash table lock here
3341 			int i, len;
3342 			seq_putc(seq, ' ');
3343 
3344 			i = 0;
3345 			len = u->addr->len -
3346 				offsetof(struct sockaddr_un, sun_path);
3347 			if (u->addr->name->sun_path[0]) {
3348 				len--;
3349 			} else {
3350 				seq_putc(seq, '@');
3351 				i++;
3352 			}
3353 			for ( ; i < len; i++)
3354 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3355 					 '@');
3356 		}
3357 		unix_state_unlock(s);
3358 		seq_putc(seq, '\n');
3359 	}
3360 
3361 	return 0;
3362 }
3363 
3364 static const struct seq_operations unix_seq_ops = {
3365 	.start  = unix_seq_start,
3366 	.next   = unix_seq_next,
3367 	.stop   = unix_seq_stop,
3368 	.show   = unix_seq_show,
3369 };
3370 
3371 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
3372 struct bpf_unix_iter_state {
3373 	struct seq_net_private p;
3374 	unsigned int cur_sk;
3375 	unsigned int end_sk;
3376 	unsigned int max_sk;
3377 	struct sock **batch;
3378 	bool st_bucket_done;
3379 };
3380 
3381 struct bpf_iter__unix {
3382 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3383 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3384 	uid_t uid __aligned(8);
3385 };
3386 
3387 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3388 			      struct unix_sock *unix_sk, uid_t uid)
3389 {
3390 	struct bpf_iter__unix ctx;
3391 
3392 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3393 	ctx.meta = meta;
3394 	ctx.unix_sk = unix_sk;
3395 	ctx.uid = uid;
3396 	return bpf_iter_run_prog(prog, &ctx);
3397 }
3398 
3399 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3400 
3401 {
3402 	struct bpf_unix_iter_state *iter = seq->private;
3403 	unsigned int expected = 1;
3404 	struct sock *sk;
3405 
3406 	sock_hold(start_sk);
3407 	iter->batch[iter->end_sk++] = start_sk;
3408 
3409 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3410 		if (iter->end_sk < iter->max_sk) {
3411 			sock_hold(sk);
3412 			iter->batch[iter->end_sk++] = sk;
3413 		}
3414 
3415 		expected++;
3416 	}
3417 
3418 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3419 
3420 	return expected;
3421 }
3422 
3423 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3424 {
3425 	while (iter->cur_sk < iter->end_sk)
3426 		sock_put(iter->batch[iter->cur_sk++]);
3427 }
3428 
3429 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3430 				       unsigned int new_batch_sz)
3431 {
3432 	struct sock **new_batch;
3433 
3434 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3435 			     GFP_USER | __GFP_NOWARN);
3436 	if (!new_batch)
3437 		return -ENOMEM;
3438 
3439 	bpf_iter_unix_put_batch(iter);
3440 	kvfree(iter->batch);
3441 	iter->batch = new_batch;
3442 	iter->max_sk = new_batch_sz;
3443 
3444 	return 0;
3445 }
3446 
3447 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3448 					loff_t *pos)
3449 {
3450 	struct bpf_unix_iter_state *iter = seq->private;
3451 	unsigned int expected;
3452 	bool resized = false;
3453 	struct sock *sk;
3454 
3455 	if (iter->st_bucket_done)
3456 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3457 
3458 again:
3459 	/* Get a new batch */
3460 	iter->cur_sk = 0;
3461 	iter->end_sk = 0;
3462 
3463 	sk = unix_get_first(seq, pos);
3464 	if (!sk)
3465 		return NULL; /* Done */
3466 
3467 	expected = bpf_iter_unix_hold_batch(seq, sk);
3468 
3469 	if (iter->end_sk == expected) {
3470 		iter->st_bucket_done = true;
3471 		return sk;
3472 	}
3473 
3474 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3475 		resized = true;
3476 		goto again;
3477 	}
3478 
3479 	return sk;
3480 }
3481 
3482 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3483 {
3484 	if (!*pos)
3485 		return SEQ_START_TOKEN;
3486 
3487 	/* bpf iter does not support lseek, so it always
3488 	 * continue from where it was stop()-ped.
3489 	 */
3490 	return bpf_iter_unix_batch(seq, pos);
3491 }
3492 
3493 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3494 {
3495 	struct bpf_unix_iter_state *iter = seq->private;
3496 	struct sock *sk;
3497 
3498 	/* Whenever seq_next() is called, the iter->cur_sk is
3499 	 * done with seq_show(), so advance to the next sk in
3500 	 * the batch.
3501 	 */
3502 	if (iter->cur_sk < iter->end_sk)
3503 		sock_put(iter->batch[iter->cur_sk++]);
3504 
3505 	++*pos;
3506 
3507 	if (iter->cur_sk < iter->end_sk)
3508 		sk = iter->batch[iter->cur_sk];
3509 	else
3510 		sk = bpf_iter_unix_batch(seq, pos);
3511 
3512 	return sk;
3513 }
3514 
3515 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3516 {
3517 	struct bpf_iter_meta meta;
3518 	struct bpf_prog *prog;
3519 	struct sock *sk = v;
3520 	uid_t uid;
3521 	bool slow;
3522 	int ret;
3523 
3524 	if (v == SEQ_START_TOKEN)
3525 		return 0;
3526 
3527 	slow = lock_sock_fast(sk);
3528 
3529 	if (unlikely(sk_unhashed(sk))) {
3530 		ret = SEQ_SKIP;
3531 		goto unlock;
3532 	}
3533 
3534 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3535 	meta.seq = seq;
3536 	prog = bpf_iter_get_info(&meta, false);
3537 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3538 unlock:
3539 	unlock_sock_fast(sk, slow);
3540 	return ret;
3541 }
3542 
3543 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3544 {
3545 	struct bpf_unix_iter_state *iter = seq->private;
3546 	struct bpf_iter_meta meta;
3547 	struct bpf_prog *prog;
3548 
3549 	if (!v) {
3550 		meta.seq = seq;
3551 		prog = bpf_iter_get_info(&meta, true);
3552 		if (prog)
3553 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3554 	}
3555 
3556 	if (iter->cur_sk < iter->end_sk)
3557 		bpf_iter_unix_put_batch(iter);
3558 }
3559 
3560 static const struct seq_operations bpf_iter_unix_seq_ops = {
3561 	.start	= bpf_iter_unix_seq_start,
3562 	.next	= bpf_iter_unix_seq_next,
3563 	.stop	= bpf_iter_unix_seq_stop,
3564 	.show	= bpf_iter_unix_seq_show,
3565 };
3566 #endif
3567 #endif
3568 
3569 static const struct net_proto_family unix_family_ops = {
3570 	.family = PF_UNIX,
3571 	.create = unix_create,
3572 	.owner	= THIS_MODULE,
3573 };
3574 
3575 
3576 static int __net_init unix_net_init(struct net *net)
3577 {
3578 	int i;
3579 
3580 	net->unx.sysctl_max_dgram_qlen = 10;
3581 	if (unix_sysctl_register(net))
3582 		goto out;
3583 
3584 #ifdef CONFIG_PROC_FS
3585 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3586 			     sizeof(struct seq_net_private)))
3587 		goto err_sysctl;
3588 #endif
3589 
3590 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3591 					      sizeof(spinlock_t), GFP_KERNEL);
3592 	if (!net->unx.table.locks)
3593 		goto err_proc;
3594 
3595 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3596 						sizeof(struct hlist_head),
3597 						GFP_KERNEL);
3598 	if (!net->unx.table.buckets)
3599 		goto free_locks;
3600 
3601 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3602 		spin_lock_init(&net->unx.table.locks[i]);
3603 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3604 	}
3605 
3606 	return 0;
3607 
3608 free_locks:
3609 	kvfree(net->unx.table.locks);
3610 err_proc:
3611 #ifdef CONFIG_PROC_FS
3612 	remove_proc_entry("unix", net->proc_net);
3613 err_sysctl:
3614 #endif
3615 	unix_sysctl_unregister(net);
3616 out:
3617 	return -ENOMEM;
3618 }
3619 
3620 static void __net_exit unix_net_exit(struct net *net)
3621 {
3622 	kvfree(net->unx.table.buckets);
3623 	kvfree(net->unx.table.locks);
3624 	unix_sysctl_unregister(net);
3625 	remove_proc_entry("unix", net->proc_net);
3626 }
3627 
3628 static struct pernet_operations unix_net_ops = {
3629 	.init = unix_net_init,
3630 	.exit = unix_net_exit,
3631 };
3632 
3633 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3634 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3635 		     struct unix_sock *unix_sk, uid_t uid)
3636 
/* Number of batch slots allocated when an iterator is initialised. */
#define INIT_BATCH_SZ	16
3638 
3639 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3640 {
3641 	struct bpf_unix_iter_state *iter = priv_data;
3642 	int err;
3643 
3644 	err = bpf_iter_init_seq_net(priv_data, aux);
3645 	if (err)
3646 		return err;
3647 
3648 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3649 	if (err) {
3650 		bpf_iter_fini_seq_net(priv_data);
3651 		return err;
3652 	}
3653 
3654 	return 0;
3655 }
3656 
3657 static void bpf_iter_fini_unix(void *priv_data)
3658 {
3659 	struct bpf_unix_iter_state *iter = priv_data;
3660 
3661 	bpf_iter_fini_seq_net(priv_data);
3662 	kvfree(iter->batch);
3663 }
3664 
3665 static const struct bpf_iter_seq_info unix_seq_info = {
3666 	.seq_ops		= &bpf_iter_unix_seq_ops,
3667 	.init_seq_private	= bpf_iter_init_unix,
3668 	.fini_seq_private	= bpf_iter_fini_unix,
3669 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3670 };
3671 
3672 static const struct bpf_func_proto *
3673 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3674 			     const struct bpf_prog *prog)
3675 {
3676 	switch (func_id) {
3677 	case BPF_FUNC_setsockopt:
3678 		return &bpf_sk_setsockopt_proto;
3679 	case BPF_FUNC_getsockopt:
3680 		return &bpf_sk_getsockopt_proto;
3681 	default:
3682 		return NULL;
3683 	}
3684 }
3685 
3686 static struct bpf_iter_reg unix_reg_info = {
3687 	.target			= "unix",
3688 	.ctx_arg_info_size	= 1,
3689 	.ctx_arg_info		= {
3690 		{ offsetof(struct bpf_iter__unix, unix_sk),
3691 		  PTR_TO_BTF_ID_OR_NULL },
3692 	},
3693 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3694 	.seq_info		= &unix_seq_info,
3695 };
3696 
3697 static void __init bpf_iter_register(void)
3698 {
3699 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3700 	if (bpf_iter_reg_target(&unix_reg_info))
3701 		pr_warn("Warning: could not register bpf iterator unix\n");
3702 }
3703 #endif
3704 
3705 static int __init af_unix_init(void)
3706 {
3707 	int i, rc = -1;
3708 
3709 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3710 
3711 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3712 		spin_lock_init(&bsd_socket_locks[i]);
3713 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3714 	}
3715 
3716 	rc = proto_register(&unix_dgram_proto, 1);
3717 	if (rc != 0) {
3718 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3719 		goto out;
3720 	}
3721 
3722 	rc = proto_register(&unix_stream_proto, 1);
3723 	if (rc != 0) {
3724 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3725 		proto_unregister(&unix_dgram_proto);
3726 		goto out;
3727 	}
3728 
3729 	sock_register(&unix_family_ops);
3730 	register_pernet_subsys(&unix_net_ops);
3731 	unix_bpf_build_proto();
3732 
3733 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3734 	bpf_iter_register();
3735 #endif
3736 
3737 out:
3738 	return rc;
3739 }
3740 
3741 static void __exit af_unix_exit(void)
3742 {
3743 	sock_unregister(PF_UNIX);
3744 	proto_unregister(&unix_dgram_proto);
3745 	proto_unregister(&unix_stream_proto);
3746 	unregister_pernet_subsys(&unix_net_ops);
3747 }
3748 
3749 /* Earlier than device_initcall() so that other drivers invoking
3750    request_module() don't end up in a loop when modprobe tries
3751    to use a UNIX socket. But later than subsys_initcall() because
3752    we depend on stuff initialised there */
3753 fs_initcall(af_unix_init);
3754 module_exit(af_unix_exit);
3755 
3756 MODULE_LICENSE("GPL");
3757 MODULE_ALIAS_NETPROTO(PF_UNIX);
3758