xref: /openbmc/linux/net/unix/af_unix.c (revision 21f9cb44)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge number
 *					of socks hashed (this is for unix_gc()
 *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 
120 #include "scm.h"
121 
/* Count of AF_UNIX sockets: incremented in unix_create1() and decremented
 * in unix_sock_destructor(); unix_create1() compares it against
 * 2 * get_max_files().
 */
static atomic_long_t unix_nr_socks;
/* Buckets of sockets linked through sk_bind_node; indexed by sk->sk_hash on
 * insertion and by unix_bsd_hash(inode) on lookup.
 */
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
/* One spinlock per bsd_socket_buckets[] slot. */
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
125 
126 /* SMP locking strategy:
127  *    hash table is protected with spinlock.
128  *    each socket state is protected by separate spinlock.
129  */
130 
131 static unsigned int unix_unbound_hash(struct sock *sk)
132 {
133 	unsigned long hash = (unsigned long)sk;
134 
135 	hash ^= hash >> 16;
136 	hash ^= hash >> 8;
137 	hash ^= sk->sk_type;
138 
139 	return hash & UNIX_HASH_MOD;
140 }
141 
142 static unsigned int unix_bsd_hash(struct inode *i)
143 {
144 	return i->i_ino & UNIX_HASH_MOD;
145 }
146 
147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
148 				       int addr_len, int type)
149 {
150 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
151 	unsigned int hash;
152 
153 	hash = (__force unsigned int)csum_fold(csum);
154 	hash ^= hash >> 8;
155 	hash ^= type;
156 
157 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
158 }
159 
160 static void unix_table_double_lock(struct net *net,
161 				   unsigned int hash1, unsigned int hash2)
162 {
163 	if (hash1 == hash2) {
164 		spin_lock(&net->unx.table.locks[hash1]);
165 		return;
166 	}
167 
168 	if (hash1 > hash2)
169 		swap(hash1, hash2);
170 
171 	spin_lock(&net->unx.table.locks[hash1]);
172 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
173 }
174 
175 static void unix_table_double_unlock(struct net *net,
176 				     unsigned int hash1, unsigned int hash2)
177 {
178 	if (hash1 == hash2) {
179 		spin_unlock(&net->unx.table.locks[hash1]);
180 		return;
181 	}
182 
183 	spin_unlock(&net->unx.table.locks[hash1]);
184 	spin_unlock(&net->unx.table.locks[hash2]);
185 }
186 
187 #ifdef CONFIG_SECURITY_NETWORK
188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
189 {
190 	UNIXCB(skb).secid = scm->secid;
191 }
192 
193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
194 {
195 	scm->secid = UNIXCB(skb).secid;
196 }
197 
198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
199 {
200 	return (scm->secid == UNIXCB(skb).secid);
201 }
202 #else
203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
204 { }
205 
206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
207 { }
208 
209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
210 {
211 	return true;
212 }
213 #endif /* CONFIG_SECURITY_NETWORK */
214 
/* Non-zero when the peer recorded on @osk is @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	struct sock *peer = unix_peer(osk);

	return peer == sk;
}
219 
/* Non-zero when @osk has no peer at all, or when its peer is @sk. */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	if (!unix_peer(osk))
		return 1;

	return unix_our_peer(sk, osk);
}
224 
225 static inline int unix_recvq_full_lockless(const struct sock *sk)
226 {
227 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
228 }
229 
230 struct sock *unix_peer_get(struct sock *s)
231 {
232 	struct sock *peer;
233 
234 	unix_state_lock(s);
235 	peer = unix_peer(s);
236 	if (peer)
237 		sock_hold(peer);
238 	unix_state_unlock(s);
239 	return peer;
240 }
241 EXPORT_SYMBOL_GPL(unix_peer_get);
242 
243 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
244 					     int addr_len)
245 {
246 	struct unix_address *addr;
247 
248 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
249 	if (!addr)
250 		return NULL;
251 
252 	refcount_set(&addr->refcnt, 1);
253 	addr->len = addr_len;
254 	memcpy(addr->name, sunaddr, addr_len);
255 
256 	return addr;
257 }
258 
259 static inline void unix_release_addr(struct unix_address *addr)
260 {
261 	if (refcount_dec_and_test(&addr->refcnt))
262 		kfree(addr);
263 }
264 
265 /*
266  *	Check unix socket name:
267  *		- should be not zero length.
268  *	        - if started by not zero, should be NULL terminated (FS object)
269  *		- if started by zero, it is abstract name.
270  */
271 
272 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
273 {
274 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
275 	    addr_len > sizeof(*sunaddr))
276 		return -EINVAL;
277 
278 	if (sunaddr->sun_family != AF_UNIX)
279 		return -EINVAL;
280 
281 	return 0;
282 }
283 
284 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
285 {
286 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
287 	short offset = offsetof(struct sockaddr_storage, __data);
288 
289 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
290 
291 	/* This may look like an off by one error but it is a bit more
292 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
293 	 * sun_path[108] doesn't as such exist.  However in kernel space
294 	 * we are guaranteed that it is a valid memory location in our
295 	 * kernel address buffer because syscall functions always pass
296 	 * a pointer of struct sockaddr_storage which has a bigger buffer
297 	 * than 108.  Also, we must terminate sun_path for strlen() in
298 	 * getname_kernel().
299 	 */
300 	addr->__data[addr_len - offset] = 0;
301 
302 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
303 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
304 	 * know the actual buffer.
305 	 */
306 	return strlen(addr->__data) + offset + 1;
307 }
308 
/* Unlink @sk from its hash chain; takes no lock itself. */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
313 
314 static void __unix_insert_socket(struct net *net, struct sock *sk)
315 {
316 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
317 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
318 }
319 
320 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
321 				 struct unix_address *addr, unsigned int hash)
322 {
323 	__unix_remove_socket(sk);
324 	smp_store_release(&unix_sk(sk)->addr, addr);
325 
326 	sk->sk_hash = hash;
327 	__unix_insert_socket(net, sk);
328 }
329 
330 static void unix_remove_socket(struct net *net, struct sock *sk)
331 {
332 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
333 	__unix_remove_socket(sk);
334 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
335 }
336 
337 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
338 {
339 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
340 	__unix_insert_socket(net, sk);
341 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
342 }
343 
344 static void unix_insert_bsd_socket(struct sock *sk)
345 {
346 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
347 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
348 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
349 }
350 
351 static void unix_remove_bsd_socket(struct sock *sk)
352 {
353 	if (!hlist_unhashed(&sk->sk_bind_node)) {
354 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
355 		__sk_del_bind_node(sk);
356 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
357 
358 		sk_node_init(&sk->sk_bind_node);
359 	}
360 }
361 
362 static struct sock *__unix_find_socket_byname(struct net *net,
363 					      struct sockaddr_un *sunname,
364 					      int len, unsigned int hash)
365 {
366 	struct sock *s;
367 
368 	sk_for_each(s, &net->unx.table.buckets[hash]) {
369 		struct unix_sock *u = unix_sk(s);
370 
371 		if (u->addr->len == len &&
372 		    !memcmp(u->addr->name, sunname, len))
373 			return s;
374 	}
375 	return NULL;
376 }
377 
378 static inline struct sock *unix_find_socket_byname(struct net *net,
379 						   struct sockaddr_un *sunname,
380 						   int len, unsigned int hash)
381 {
382 	struct sock *s;
383 
384 	spin_lock(&net->unx.table.locks[hash]);
385 	s = __unix_find_socket_byname(net, sunname, len, hash);
386 	if (s)
387 		sock_hold(s);
388 	spin_unlock(&net->unx.table.locks[hash]);
389 	return s;
390 }
391 
392 static struct sock *unix_find_socket_byinode(struct inode *i)
393 {
394 	unsigned int hash = unix_bsd_hash(i);
395 	struct sock *s;
396 
397 	spin_lock(&bsd_socket_locks[hash]);
398 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
399 		struct dentry *dentry = unix_sk(s)->path.dentry;
400 
401 		if (dentry && d_backing_inode(dentry) == i) {
402 			sock_hold(s);
403 			spin_unlock(&bsd_socket_locks[hash]);
404 			return s;
405 		}
406 	}
407 	spin_unlock(&bsd_socket_locks[hash]);
408 	return NULL;
409 }
410 
411 /* Support code for asymmetrically connected dgram sockets
412  *
413  * If a datagram socket is connected to a socket not itself connected
414  * to the first socket (eg, /dev/log), clients may only enqueue more
415  * messages if the present receive queue of the server socket is not
416  * "too large". This means there's a second writeability condition
417  * poll and sendmsg need to test. The dgram recv code will do a wake
418  * up on the peer_wait wait queue of a socket upon reception of a
419  * datagram which needs to be propagated to sleeping would-be writers
420  * since these might not have sent anything so far. This can't be
421  * accomplished via poll_wait because the lifetime of the server
422  * socket might be less than that of its clients if these break their
423  * association with it or if the server socket is closed while clients
424  * are still connected to it and there's no way to inform "a polling
425  * implementation" that it should let go of a certain wait queue
426  *
427  * In order to propagate a wake up, a wait_queue_entry_t of the client
428  * socket is enqueued on the peer_wait queue of the server socket
429  * whose wake function does a wake_up on the ordinary client socket
430  * wait queue. This connection is established whenever a write (or
431  * poll for write) hit the flow control condition and broken when the
432  * association to the server socket is dissolved or after a wake up
433  * was relayed.
434  */
435 
436 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
437 				      void *key)
438 {
439 	struct unix_sock *u;
440 	wait_queue_head_t *u_sleep;
441 
442 	u = container_of(q, struct unix_sock, peer_wake);
443 
444 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
445 			    q);
446 	u->peer_wake.private = NULL;
447 
448 	/* relaying can only happen while the wq still exists */
449 	u_sleep = sk_sleep(&u->sk);
450 	if (u_sleep)
451 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
452 
453 	return 0;
454 }
455 
456 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
457 {
458 	struct unix_sock *u, *u_other;
459 	int rc;
460 
461 	u = unix_sk(sk);
462 	u_other = unix_sk(other);
463 	rc = 0;
464 	spin_lock(&u_other->peer_wait.lock);
465 
466 	if (!u->peer_wake.private) {
467 		u->peer_wake.private = other;
468 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
469 
470 		rc = 1;
471 	}
472 
473 	spin_unlock(&u_other->peer_wait.lock);
474 	return rc;
475 }
476 
477 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
478 					    struct sock *other)
479 {
480 	struct unix_sock *u, *u_other;
481 
482 	u = unix_sk(sk);
483 	u_other = unix_sk(other);
484 	spin_lock(&u_other->peer_wait.lock);
485 
486 	if (u->peer_wake.private == other) {
487 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
488 		u->peer_wake.private = NULL;
489 	}
490 
491 	spin_unlock(&u_other->peer_wait.lock);
492 }
493 
494 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
495 						   struct sock *other)
496 {
497 	unix_dgram_peer_wake_disconnect(sk, other);
498 	wake_up_interruptible_poll(sk_sleep(sk),
499 				   EPOLLOUT |
500 				   EPOLLWRNORM |
501 				   EPOLLWRBAND);
502 }
503 
504 /* preconditions:
505  *	- unix_peer(sk) == other
506  *	- association is stable
507  */
508 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
509 {
510 	int connected;
511 
512 	connected = unix_dgram_peer_wake_connect(sk, other);
513 
514 	/* If other is SOCK_DEAD, we want to make sure we signal
515 	 * POLLOUT, such that a subsequent write() can get a
516 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
517 	 * to other and its full, we will hang waiting for POLLOUT.
518 	 */
519 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
520 		return 1;
521 
522 	if (connected)
523 		unix_dgram_peer_wake_disconnect(sk, other);
524 
525 	return 0;
526 }
527 
528 static int unix_writable(const struct sock *sk, unsigned char state)
529 {
530 	return state != TCP_LISTEN &&
531 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
532 }
533 
534 static void unix_write_space(struct sock *sk)
535 {
536 	struct socket_wq *wq;
537 
538 	rcu_read_lock();
539 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
540 		wq = rcu_dereference(sk->sk_wq);
541 		if (skwq_has_sleeper(wq))
542 			wake_up_interruptible_sync_poll(&wq->wait,
543 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
544 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
545 	}
546 	rcu_read_unlock();
547 }
548 
549 /* When dgram socket disconnects (or changes its peer), we clear its receive
550  * queue of packets arrived from previous peer. First, it allows to do
551  * flow control based only on wmem_alloc; second, sk connected to peer
552  * may receive messages only from that peer. */
553 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
554 {
555 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
556 		skb_queue_purge(&sk->sk_receive_queue);
557 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
558 
559 		/* If one link of bidirectional dgram pipe is disconnected,
560 		 * we signal error. Messages are lost. Do not make this,
561 		 * when peer was not connected to us.
562 		 */
563 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
564 			WRITE_ONCE(other->sk_err, ECONNRESET);
565 			sk_error_report(other);
566 		}
567 	}
568 }
569 
570 static void unix_sock_destructor(struct sock *sk)
571 {
572 	struct unix_sock *u = unix_sk(sk);
573 
574 	skb_queue_purge(&sk->sk_receive_queue);
575 
576 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
577 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
578 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
579 	if (!sock_flag(sk, SOCK_DEAD)) {
580 		pr_info("Attempt to release alive unix socket: %p\n", sk);
581 		return;
582 	}
583 
584 	if (u->addr)
585 		unix_release_addr(u->addr);
586 
587 	atomic_long_dec(&unix_nr_socks);
588 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
589 #ifdef UNIX_REFCNT_DEBUG
590 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
591 		atomic_long_read(&unix_nr_socks));
592 #endif
593 }
594 
/* Tear down a unix socket and drop the caller's reference to it.
 *
 * @sk:      socket being released.
 * @embrion: passed as 0 by unix_release() and as 1 by the recursive call
 *           below for sockets still sitting in a listener's receive queue;
 *           when non-zero the peer is always marked with ECONNRESET.
 *
 * The socket is unhashed, orphaned and moved to TCP_CLOSE, its peer is
 * notified, its receive queue is flushed, its path reference is dropped and
 * finally sock_put() is called.  unix_gc() runs afterwards if any fds are
 * reported in flight.
 */
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	/* Take the socket out of both lookup tables first. */
	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	/* Steal the path so it can be released after the lock is dropped. */
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	/* Remember the old state: the flush loop below needs to know
	 * whether this was a listener.
	 */
	state = sk->sk_state;
	WRITE_ONCE(sk->sk_state, TCP_CLOSE);

	/* Detach the peer; its reference is dropped further down. */
	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			/* Unread data (or an embryo being dropped) is reported
			 * to the peer as a connection reset.
			 */
			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* For a listener, each queued skb carries a socket that is
		 * released here as an embryo.
		 */
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	if (READ_ONCE(unix_tot_inflight))
		unix_gc();		/* Garbage collect fds */
}
677 
678 static void init_peercred(struct sock *sk)
679 {
680 	const struct cred *old_cred;
681 	struct pid *old_pid;
682 
683 	spin_lock(&sk->sk_peer_lock);
684 	old_pid = sk->sk_peer_pid;
685 	old_cred = sk->sk_peer_cred;
686 	sk->sk_peer_pid  = get_pid(task_tgid(current));
687 	sk->sk_peer_cred = get_current_cred();
688 	spin_unlock(&sk->sk_peer_lock);
689 
690 	put_pid(old_pid);
691 	put_cred(old_cred);
692 }
693 
694 static void copy_peercred(struct sock *sk, struct sock *peersk)
695 {
696 	const struct cred *old_cred;
697 	struct pid *old_pid;
698 
699 	if (sk < peersk) {
700 		spin_lock(&sk->sk_peer_lock);
701 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
702 	} else {
703 		spin_lock(&peersk->sk_peer_lock);
704 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
705 	}
706 	old_pid = sk->sk_peer_pid;
707 	old_cred = sk->sk_peer_cred;
708 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
709 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
710 
711 	spin_unlock(&sk->sk_peer_lock);
712 	spin_unlock(&peersk->sk_peer_lock);
713 
714 	put_pid(old_pid);
715 	put_cred(old_cred);
716 }
717 
718 static int unix_listen(struct socket *sock, int backlog)
719 {
720 	int err;
721 	struct sock *sk = sock->sk;
722 	struct unix_sock *u = unix_sk(sk);
723 
724 	err = -EOPNOTSUPP;
725 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
726 		goto out;	/* Only stream/seqpacket sockets accept */
727 	err = -EINVAL;
728 	if (!READ_ONCE(u->addr))
729 		goto out;	/* No listens on an unbound socket */
730 	unix_state_lock(sk);
731 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
732 		goto out_unlock;
733 	if (backlog > sk->sk_max_ack_backlog)
734 		wake_up_interruptible_all(&u->peer_wait);
735 	sk->sk_max_ack_backlog	= backlog;
736 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
737 
738 	/* set credentials so connect can copy them */
739 	init_peercred(sk);
740 	err = 0;
741 
742 out_unlock:
743 	unix_state_unlock(sk);
744 out:
745 	return err;
746 }
747 
748 static int unix_release(struct socket *);
749 static int unix_bind(struct socket *, struct sockaddr *, int);
750 static int unix_stream_connect(struct socket *, struct sockaddr *,
751 			       int addr_len, int flags);
752 static int unix_socketpair(struct socket *, struct socket *);
753 static int unix_accept(struct socket *, struct socket *, int, bool);
754 static int unix_getname(struct socket *, struct sockaddr *, int);
755 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
756 static __poll_t unix_dgram_poll(struct file *, struct socket *,
757 				    poll_table *);
758 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
759 #ifdef CONFIG_COMPAT
760 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
761 #endif
762 static int unix_shutdown(struct socket *, int);
763 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
764 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
765 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
766 				       struct pipe_inode_info *, size_t size,
767 				       unsigned int flags);
768 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
769 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
770 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
771 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
772 static int unix_dgram_connect(struct socket *, struct sockaddr *,
773 			      int, int);
774 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
775 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
776 				  int);
777 
778 static int unix_set_peek_off(struct sock *sk, int val)
779 {
780 	struct unix_sock *u = unix_sk(sk);
781 
782 	if (mutex_lock_interruptible(&u->iolock))
783 		return -EINTR;
784 
785 	WRITE_ONCE(sk->sk_peek_off, val);
786 	mutex_unlock(&u->iolock);
787 
788 	return 0;
789 }
790 
791 #ifdef CONFIG_PROC_FS
792 static int unix_count_nr_fds(struct sock *sk)
793 {
794 	struct sk_buff *skb;
795 	struct unix_sock *u;
796 	int nr_fds = 0;
797 
798 	spin_lock(&sk->sk_receive_queue.lock);
799 	skb = skb_peek(&sk->sk_receive_queue);
800 	while (skb) {
801 		u = unix_sk(skb->sk);
802 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
803 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
804 	}
805 	spin_unlock(&sk->sk_receive_queue.lock);
806 
807 	return nr_fds;
808 }
809 
810 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
811 {
812 	struct sock *sk = sock->sk;
813 	unsigned char s_state;
814 	struct unix_sock *u;
815 	int nr_fds = 0;
816 
817 	if (sk) {
818 		s_state = READ_ONCE(sk->sk_state);
819 		u = unix_sk(sk);
820 
821 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
822 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
823 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
824 		 */
825 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
826 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
827 		else if (s_state == TCP_LISTEN)
828 			nr_fds = unix_count_nr_fds(sk);
829 
830 		seq_printf(m, "scm_fds: %u\n", nr_fds);
831 	}
832 }
833 #else
834 #define unix_show_fdinfo NULL
835 #endif
836 
837 static const struct proto_ops unix_stream_ops = {
838 	.family =	PF_UNIX,
839 	.owner =	THIS_MODULE,
840 	.release =	unix_release,
841 	.bind =		unix_bind,
842 	.connect =	unix_stream_connect,
843 	.socketpair =	unix_socketpair,
844 	.accept =	unix_accept,
845 	.getname =	unix_getname,
846 	.poll =		unix_poll,
847 	.ioctl =	unix_ioctl,
848 #ifdef CONFIG_COMPAT
849 	.compat_ioctl =	unix_compat_ioctl,
850 #endif
851 	.listen =	unix_listen,
852 	.shutdown =	unix_shutdown,
853 	.sendmsg =	unix_stream_sendmsg,
854 	.recvmsg =	unix_stream_recvmsg,
855 	.read_skb =	unix_stream_read_skb,
856 	.mmap =		sock_no_mmap,
857 	.splice_read =	unix_stream_splice_read,
858 	.set_peek_off =	unix_set_peek_off,
859 	.show_fdinfo =	unix_show_fdinfo,
860 };
861 
862 static const struct proto_ops unix_dgram_ops = {
863 	.family =	PF_UNIX,
864 	.owner =	THIS_MODULE,
865 	.release =	unix_release,
866 	.bind =		unix_bind,
867 	.connect =	unix_dgram_connect,
868 	.socketpair =	unix_socketpair,
869 	.accept =	sock_no_accept,
870 	.getname =	unix_getname,
871 	.poll =		unix_dgram_poll,
872 	.ioctl =	unix_ioctl,
873 #ifdef CONFIG_COMPAT
874 	.compat_ioctl =	unix_compat_ioctl,
875 #endif
876 	.listen =	sock_no_listen,
877 	.shutdown =	unix_shutdown,
878 	.sendmsg =	unix_dgram_sendmsg,
879 	.read_skb =	unix_read_skb,
880 	.recvmsg =	unix_dgram_recvmsg,
881 	.mmap =		sock_no_mmap,
882 	.set_peek_off =	unix_set_peek_off,
883 	.show_fdinfo =	unix_show_fdinfo,
884 };
885 
886 static const struct proto_ops unix_seqpacket_ops = {
887 	.family =	PF_UNIX,
888 	.owner =	THIS_MODULE,
889 	.release =	unix_release,
890 	.bind =		unix_bind,
891 	.connect =	unix_stream_connect,
892 	.socketpair =	unix_socketpair,
893 	.accept =	unix_accept,
894 	.getname =	unix_getname,
895 	.poll =		unix_dgram_poll,
896 	.ioctl =	unix_ioctl,
897 #ifdef CONFIG_COMPAT
898 	.compat_ioctl =	unix_compat_ioctl,
899 #endif
900 	.listen =	unix_listen,
901 	.shutdown =	unix_shutdown,
902 	.sendmsg =	unix_seqpacket_sendmsg,
903 	.recvmsg =	unix_seqpacket_recvmsg,
904 	.mmap =		sock_no_mmap,
905 	.set_peek_off =	unix_set_peek_off,
906 	.show_fdinfo =	unix_show_fdinfo,
907 };
908 
/* Intentionally empty ->close() hook: unix sockets have nothing to do
 * here, the callback exists only for sockmap.
 */
static void unix_close(struct sock *sk, long timeout)
{
}
915 
/* Intentionally empty ->unhash() hook: unix sockets have nothing to do
 * here, the callback exists only for sockmap.
 */
static void unix_unhash(struct sock *sk)
{
}
922 
923 static bool unix_bpf_bypass_getsockopt(int level, int optname)
924 {
925 	if (level == SOL_SOCKET) {
926 		switch (optname) {
927 		case SO_PEERPIDFD:
928 			return true;
929 		default:
930 			return false;
931 		}
932 	}
933 
934 	return false;
935 }
936 
937 struct proto unix_dgram_proto = {
938 	.name			= "UNIX",
939 	.owner			= THIS_MODULE,
940 	.obj_size		= sizeof(struct unix_sock),
941 	.close			= unix_close,
942 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
943 #ifdef CONFIG_BPF_SYSCALL
944 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
945 #endif
946 };
947 
948 struct proto unix_stream_proto = {
949 	.name			= "UNIX-STREAM",
950 	.owner			= THIS_MODULE,
951 	.obj_size		= sizeof(struct unix_sock),
952 	.close			= unix_close,
953 	.unhash			= unix_unhash,
954 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
955 #ifdef CONFIG_BPF_SYSCALL
956 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
957 #endif
958 };
959 
960 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
961 {
962 	struct unix_sock *u;
963 	struct sock *sk;
964 	int err;
965 
966 	atomic_long_inc(&unix_nr_socks);
967 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
968 		err = -ENFILE;
969 		goto err;
970 	}
971 
972 	if (type == SOCK_STREAM)
973 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
974 	else /*dgram and  seqpacket */
975 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
976 
977 	if (!sk) {
978 		err = -ENOMEM;
979 		goto err;
980 	}
981 
982 	sock_init_data(sock, sk);
983 
984 	sk->sk_hash		= unix_unbound_hash(sk);
985 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
986 	sk->sk_write_space	= unix_write_space;
987 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
988 	sk->sk_destruct		= unix_sock_destructor;
989 	u = unix_sk(sk);
990 	u->inflight = 0;
991 	u->path.dentry = NULL;
992 	u->path.mnt = NULL;
993 	spin_lock_init(&u->lock);
994 	INIT_LIST_HEAD(&u->link);
995 	mutex_init(&u->iolock); /* single task reading lock */
996 	mutex_init(&u->bindlock); /* single task binding lock */
997 	init_waitqueue_head(&u->peer_wait);
998 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
999 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1000 	unix_insert_unbound_socket(net, sk);
1001 
1002 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1003 
1004 	return sk;
1005 
1006 err:
1007 	atomic_long_dec(&unix_nr_socks);
1008 	return ERR_PTR(err);
1009 }
1010 
1011 static int unix_create(struct net *net, struct socket *sock, int protocol,
1012 		       int kern)
1013 {
1014 	struct sock *sk;
1015 
1016 	if (protocol && protocol != PF_UNIX)
1017 		return -EPROTONOSUPPORT;
1018 
1019 	sock->state = SS_UNCONNECTED;
1020 
1021 	switch (sock->type) {
1022 	case SOCK_STREAM:
1023 		sock->ops = &unix_stream_ops;
1024 		break;
1025 		/*
1026 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1027 		 *	nothing uses it.
1028 		 */
1029 	case SOCK_RAW:
1030 		sock->type = SOCK_DGRAM;
1031 		fallthrough;
1032 	case SOCK_DGRAM:
1033 		sock->ops = &unix_dgram_ops;
1034 		break;
1035 	case SOCK_SEQPACKET:
1036 		sock->ops = &unix_seqpacket_ops;
1037 		break;
1038 	default:
1039 		return -ESOCKTNOSUPPORT;
1040 	}
1041 
1042 	sk = unix_create1(net, sock, kern, sock->type);
1043 	if (IS_ERR(sk))
1044 		return PTR_ERR(sk);
1045 
1046 	return 0;
1047 }
1048 
1049 static int unix_release(struct socket *sock)
1050 {
1051 	struct sock *sk = sock->sk;
1052 
1053 	if (!sk)
1054 		return 0;
1055 
1056 	sk->sk_prot->close(sk, 0);
1057 	unix_release_sock(sk, 0);
1058 	sock->sk = NULL;
1059 
1060 	return 0;
1061 }
1062 
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * The path is resolved following symlinks and must be writable by the
 * caller.  On success the access time of the path is updated and the sock
 * found by unix_find_socket_byinode() is returned; the error paths drop it
 * again with sock_put().
 *
 * Returns the sock, or an ERR_PTR():
 *   error from kern_path() / path_permission(),
 *   -ECONNREFUSED if the inode is not a socket or no sock is bound to it,
 *   -EPROTOTYPE   if the bound sock has a different sk_type than @type.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);

	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		return ERR_PTR(err);

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto out_path;

	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode)) {
		err = -ECONNREFUSED;
		goto out_path;
	}

	sk = unix_find_socket_byinode(inode);
	if (!sk) {
		err = -ECONNREFUSED;
		goto out_path;
	}

	if (sk->sk_type != type) {
		err = -EPROTOTYPE;
		sock_put(sk);
		goto out_path;
	}

	touch_atime(&path);
	path_put(&path);

	return sk;

out_path:
	path_put(&path);
	return ERR_PTR(err);
}
1106 
/*
 * Look up a socket by abstract name.
 *
 * Returns the sock found by unix_find_socket_byname(), or
 * ERR_PTR(-ECONNREFUSED) when no socket carries that name.  If the sock
 * found has a filesystem path attached, its access time is updated.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct unix_sock *u;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	u = unix_sk(sk);
	if (u->path.dentry)
		touch_atime(&u->path);

	return sk;
}
1125 
1126 static struct sock *unix_find_other(struct net *net,
1127 				    struct sockaddr_un *sunaddr,
1128 				    int addr_len, int type)
1129 {
1130 	struct sock *sk;
1131 
1132 	if (sunaddr->sun_path[0])
1133 		sk = unix_find_bsd(sunaddr, addr_len, type);
1134 	else
1135 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1136 
1137 	return sk;
1138 }
1139 
1140 static int unix_autobind(struct sock *sk)
1141 {
1142 	struct unix_sock *u = unix_sk(sk);
1143 	unsigned int new_hash, old_hash;
1144 	struct net *net = sock_net(sk);
1145 	struct unix_address *addr;
1146 	u32 lastnum, ordernum;
1147 	int err;
1148 
1149 	err = mutex_lock_interruptible(&u->bindlock);
1150 	if (err)
1151 		return err;
1152 
1153 	if (u->addr)
1154 		goto out;
1155 
1156 	err = -ENOMEM;
1157 	addr = kzalloc(sizeof(*addr) +
1158 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1159 	if (!addr)
1160 		goto out;
1161 
1162 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1163 	addr->name->sun_family = AF_UNIX;
1164 	refcount_set(&addr->refcnt, 1);
1165 
1166 	old_hash = sk->sk_hash;
1167 	ordernum = get_random_u32();
1168 	lastnum = ordernum & 0xFFFFF;
1169 retry:
1170 	ordernum = (ordernum + 1) & 0xFFFFF;
1171 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1172 
1173 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1174 	unix_table_double_lock(net, old_hash, new_hash);
1175 
1176 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1177 		unix_table_double_unlock(net, old_hash, new_hash);
1178 
1179 		/* __unix_find_socket_byname() may take long time if many names
1180 		 * are already in use.
1181 		 */
1182 		cond_resched();
1183 
1184 		if (ordernum == lastnum) {
1185 			/* Give up if all names seems to be in use. */
1186 			err = -ENOSPC;
1187 			unix_release_addr(addr);
1188 			goto out;
1189 		}
1190 
1191 		goto retry;
1192 	}
1193 
1194 	__unix_set_addr_hash(net, sk, addr, new_hash);
1195 	unix_table_double_unlock(net, old_hash, new_hash);
1196 	err = 0;
1197 
1198 out:	mutex_unlock(&u->bindlock);
1199 	return err;
1200 }
1201 
/*
 * Bind @sk to the filesystem path given in @sunaddr.
 *
 * A socket inode is created at the path with vfs_mknod().  Then, under
 * u->bindlock and the hash-table double lock, the path is stored in
 * u->path and the address is installed with __unix_set_addr_hash().
 * If anything fails after the inode was created, it is unlinked again.
 *
 * Returns 0 on success, -ENOMEM if the address cannot be allocated,
 * -EINVAL if the socket is already bound, -EADDRINUSE when the failure
 * was -EEXIST, or another negative errno from the path, mknod or locking
 * helpers.
 */
1202 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1203 			 int addr_len)
1204 {
	/* Socket file type plus the socket inode's mode bits, minus the umask. */
1205 	umode_t mode = S_IFSOCK |
1206 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1207 	struct unix_sock *u = unix_sk(sk);
1208 	unsigned int new_hash, old_hash;
1209 	struct net *net = sock_net(sk);
1210 	struct mnt_idmap *idmap;
1211 	struct unix_address *addr;
1212 	struct dentry *dentry;
1213 	struct path parent;
1214 	int err;
1215 
1216 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1217 	addr = unix_create_addr(sunaddr, addr_len);
1218 	if (!addr)
1219 		return -ENOMEM;
1220 
1221 	/*
1222 	 * Get the parent directory, calculate the hash for last
1223 	 * component.
1224 	 */
1225 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1226 	if (IS_ERR(dentry)) {
1227 		err = PTR_ERR(dentry);
1228 		goto out;
1229 	}
1230 
1231 	/*
1232 	 * All right, let's create it.
1233 	 */
1234 	idmap = mnt_idmap(parent.mnt);
1235 	err = security_path_mknod(&parent, dentry, mode, 0);
1236 	if (!err)
1237 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1238 	if (err)
1239 		goto out_path;
	/* The inode exists from here on: every failure below must unlink it. */
1240 	err = mutex_lock_interruptible(&u->bindlock);
1241 	if (err)
1242 		goto out_unlink;
	/* Already bound; out_unlock turns this into -EINVAL. */
1243 	if (u->addr)
1244 		goto out_unlock;
1245 
1246 	old_hash = sk->sk_hash;
1247 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1248 	unix_table_double_lock(net, old_hash, new_hash);
	/* u->path takes its own references on the mount and the dentry. */
1249 	u->path.mnt = mntget(parent.mnt);
1250 	u->path.dentry = dget(dentry);
1251 	__unix_set_addr_hash(net, sk, addr, new_hash);
1252 	unix_table_double_unlock(net, old_hash, new_hash);
1253 	unix_insert_bsd_socket(sk);
1254 	mutex_unlock(&u->bindlock);
1255 	done_path_create(&parent, dentry);
1256 	return 0;
1257 
	/* Error unwinding, in reverse order of acquisition. */
1258 out_unlock:
1259 	mutex_unlock(&u->bindlock);
1260 	err = -EINVAL;
1261 out_unlink:
1262 	/* failed after successful mknod?  unlink what we'd created... */
1263 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1264 out_path:
1265 	done_path_create(&parent, dentry);
1266 out:
1267 	unix_release_addr(addr);
	/* An existing path is reported as "address in use". */
1268 	return err == -EEXIST ? -EADDRINUSE : err;
1269 }
1270 
1271 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1272 			      int addr_len)
1273 {
1274 	struct unix_sock *u = unix_sk(sk);
1275 	unsigned int new_hash, old_hash;
1276 	struct net *net = sock_net(sk);
1277 	struct unix_address *addr;
1278 	int err;
1279 
1280 	addr = unix_create_addr(sunaddr, addr_len);
1281 	if (!addr)
1282 		return -ENOMEM;
1283 
1284 	err = mutex_lock_interruptible(&u->bindlock);
1285 	if (err)
1286 		goto out;
1287 
1288 	if (u->addr) {
1289 		err = -EINVAL;
1290 		goto out_mutex;
1291 	}
1292 
1293 	old_hash = sk->sk_hash;
1294 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1295 	unix_table_double_lock(net, old_hash, new_hash);
1296 
1297 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1298 		goto out_spin;
1299 
1300 	__unix_set_addr_hash(net, sk, addr, new_hash);
1301 	unix_table_double_unlock(net, old_hash, new_hash);
1302 	mutex_unlock(&u->bindlock);
1303 	return 0;
1304 
1305 out_spin:
1306 	unix_table_double_unlock(net, old_hash, new_hash);
1307 	err = -EADDRINUSE;
1308 out_mutex:
1309 	mutex_unlock(&u->bindlock);
1310 out:
1311 	unix_release_addr(addr);
1312 	return err;
1313 }
1314 
1315 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1316 {
1317 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1318 	struct sock *sk = sock->sk;
1319 	int err;
1320 
1321 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1322 	    sunaddr->sun_family == AF_UNIX)
1323 		return unix_autobind(sk);
1324 
1325 	err = unix_validate_addr(sunaddr, addr_len);
1326 	if (err)
1327 		return err;
1328 
1329 	if (sunaddr->sun_path[0])
1330 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1331 	else
1332 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1333 
1334 	return err;
1335 }
1336 
1337 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1338 {
1339 	if (unlikely(sk1 == sk2) || !sk2) {
1340 		unix_state_lock(sk1);
1341 		return;
1342 	}
1343 	if (sk1 > sk2)
1344 		swap(sk1, sk2);
1345 
1346 	unix_state_lock(sk1);
1347 	unix_state_lock_nested(sk2, U_LOCK_SECOND);
1348 }
1349 
/*
 * Counterpart of unix_state_double_lock(): release @sk1 and, if it is a
 * distinct non-NULL sock, @sk2 as well.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (unlikely(sk1 == sk2) || !sk2)
		return;

	unix_state_unlock(sk2);
}
1359 
/*
 * connect() for datagram sockets: set, replace or clear the default peer.
 *
 * With an address family other than AF_UNSPEC the target is looked up
 * with unix_find_other(), both socks are marked TCP_ESTABLISHED and the
 * target becomes unix_peer(sk).  The reference obtained by the lookup is
 * kept as the peer reference.  With AF_UNSPEC an existing association is
 * dissolved and sk goes back to TCP_CLOSE.
 *
 * Returns 0 on success, -EINVAL if @alen is too short to contain
 * sa_family, -EPERM if unix_may_send() refuses the target, or the error
 * from unix_validate_addr(), unix_autobind(), unix_find_other() or
 * security_unix_may_send().
 */
1360 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1361 			      int alen, int flags)
1362 {
1363 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1364 	struct sock *sk = sock->sk;
1365 	struct sock *other;
1366 	int err;
1367 
1368 	err = -EINVAL;
1369 	if (alen < offsetofend(struct sockaddr, sa_family))
1370 		goto out;
1371 
1372 	if (addr->sa_family != AF_UNSPEC) {
1373 		err = unix_validate_addr(sunaddr, alen);
1374 		if (err)
1375 			goto out;
1376 
		/* Credential passing is enabled but sk has no address: autobind first. */
1377 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1378 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1379 		    !READ_ONCE(unix_sk(sk)->addr)) {
1380 			err = unix_autobind(sk);
1381 			if (err)
1382 				goto out;
1383 		}
1384 
1385 restart:
1386 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1387 		if (IS_ERR(other)) {
1388 			err = PTR_ERR(other);
1389 			goto out;
1390 		}
1391 
1392 		unix_state_double_lock(sk, other);
1393 
1394 		/* Apparently VFS overslept socket death. Retry. */
1395 		if (sock_flag(other, SOCK_DEAD)) {
1396 			unix_state_double_unlock(sk, other);
1397 			sock_put(other);
1398 			goto restart;
1399 		}
1400 
1401 		err = -EPERM;
1402 		if (!unix_may_send(sk, other))
1403 			goto out_unlock;
1404 
1405 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1406 		if (err)
1407 			goto out_unlock;
1408 
1409 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1410 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1411 	} else {
1412 		/*
1413 		 *	1003.1g breaking connected state with AF_UNSPEC
1414 		 */
		/* With other == NULL the double lock takes sk's lock only. */
1415 		other = NULL;
1416 		unix_state_double_lock(sk, other);
1417 	}
1418 
1419 	/*
1420 	 * If it was connected, reconnect.
1421 	 */
1422 	if (unix_peer(sk)) {
1423 		struct sock *old_peer = unix_peer(sk);
1424 
1425 		unix_peer(sk) = other;
1426 		if (!other)
1427 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1428 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1429 
1430 		unix_state_double_unlock(sk, other);
1431 
		/* Peer actually changed: run the disconnect handling for the old one. */
1432 		if (other != old_peer) {
1433 			unix_dgram_disconnected(sk, old_peer);
1434 
			/* The old peer goes back to TCP_CLOSE unless it has a peer of its own. */
1435 			unix_state_lock(old_peer);
1436 			if (!unix_peer(old_peer))
1437 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1438 			unix_state_unlock(old_peer);
1439 		}
1440 
		/* Drop the reference that backed the old unix_peer(sk) pointer. */
1441 		sock_put(old_peer);
1442 	} else {
1443 		unix_peer(sk) = other;
1444 		unix_state_double_unlock(sk, other);
1445 	}
1446 
1447 	return 0;
1448 
1449 out_unlock:
1450 	unix_state_double_unlock(sk, other);
1451 	sock_put(other);
1452 out:
1453 	return err;
1454 }
1455 
1456 static long unix_wait_for_peer(struct sock *other, long timeo)
1457 	__releases(&unix_sk(other)->lock)
1458 {
1459 	struct unix_sock *u = unix_sk(other);
1460 	int sched;
1461 	DEFINE_WAIT(wait);
1462 
1463 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1464 
1465 	sched = !sock_flag(other, SOCK_DEAD) &&
1466 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1467 		unix_recvq_full_lockless(other);
1468 
1469 	unix_state_unlock(other);
1470 
1471 	if (sched)
1472 		timeo = schedule_timeout(timeo);
1473 
1474 	finish_wait(&u->peer_wait, &wait);
1475 	return timeo;
1476 }
1477 
/*
 * connect() for connection-oriented sockets.
 *
 * A new sock (newsk) that will become the accepted end, and a one-byte skb
 * allocated against it, are prepared before any lock is taken.  The
 * listener is then looked up and locked; if its receive queue is full the
 * caller waits (or fails with -EAGAIN when the timeout is zero).  Finally
 * sk and newsk are linked as peers and the skb is queued on the listener's
 * receive queue, where unix_accept() picks it up.
 *
 * Returns 0 on success or a negative errno: -ENOMEM if the skb cannot be
 * allocated, -ECONNREFUSED if the target is not listening or is shut down
 * for receive, -EAGAIN if the queue is full and the timeout is zero,
 * -EISCONN/-EINVAL if sk is not in TCP_CLOSE, a signal-related error from
 * sock_intr_errno(), or the error from unix_validate_addr(),
 * unix_autobind(), unix_create1(), unix_find_other() or
 * security_unix_stream_connect().
 */
1478 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1479 			       int addr_len, int flags)
1480 {
1481 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1482 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1483 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1484 	struct net *net = sock_net(sk);
1485 	struct sk_buff *skb = NULL;
1486 	unsigned char state;
1487 	long timeo;
1488 	int err;
1489 
1490 	err = unix_validate_addr(sunaddr, addr_len);
1491 	if (err)
1492 		goto out;
1493 
	/* Credential passing is enabled but sk has no address: autobind first. */
1494 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1495 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1496 	    !READ_ONCE(u->addr)) {
1497 		err = unix_autobind(sk);
1498 		if (err)
1499 			goto out;
1500 	}
1501 
1502 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1503 
1504 	/* First of all allocate resources.
1505 	   If we will make it after state is locked,
1506 	   we will have to recheck all again in any case.
1507 	 */
1508 
1509 	/* create new sock for complete connection */
1510 	newsk = unix_create1(net, NULL, 0, sock->type);
1511 	if (IS_ERR(newsk)) {
1512 		err = PTR_ERR(newsk);
		/* Reset so the common exit path does not release an ERR_PTR. */
1513 		newsk = NULL;
1514 		goto out;
1515 	}
1516 
1517 	err = -ENOMEM;
1518 
1519 	/* Allocate skb for sending to listening sock */
1520 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1521 	if (skb == NULL)
1522 		goto out;
1523 
1524 restart:
1525 	/*  Find listening sock. */
1526 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1527 	if (IS_ERR(other)) {
1528 		err = PTR_ERR(other);
		/* Reset so the exit path does not sock_put() an ERR_PTR. */
1529 		other = NULL;
1530 		goto out;
1531 	}
1532 
1533 	unix_state_lock(other);
1534 
1535 	/* Apparently VFS overslept socket death. Retry. */
1536 	if (sock_flag(other, SOCK_DEAD)) {
1537 		unix_state_unlock(other);
1538 		sock_put(other);
1539 		goto restart;
1540 	}
1541 
1542 	err = -ECONNREFUSED;
1543 	if (other->sk_state != TCP_LISTEN)
1544 		goto out_unlock;
1545 	if (other->sk_shutdown & RCV_SHUTDOWN)
1546 		goto out_unlock;
1547 
1548 	if (unix_recvq_full_lockless(other)) {
1549 		err = -EAGAIN;
1550 		if (!timeo)
1551 			goto out_unlock;
1552 
		/* unix_wait_for_peer() releases other's state lock. */
1553 		timeo = unix_wait_for_peer(other, timeo);
1554 
1555 		err = sock_intr_errno(timeo);
		/* Lock already dropped: leave through "out", not "out_unlock". */
1556 		if (signal_pending(current))
1557 			goto out;
1558 		sock_put(other);
1559 		goto restart;
1560 	}
1561 
1562 	/* self connect and simultaneous connect are eliminated
1563 	 * by rejecting TCP_LISTEN socket to avoid deadlock.
1564 	 */
	/* Lockless pre-check; repeated below once sk's lock is held. */
1565 	state = READ_ONCE(sk->sk_state);
1566 	if (unlikely(state != TCP_CLOSE)) {
1567 		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1568 		goto out_unlock;
1569 	}
1570 
	/* Lock order: the listener first, then the connecting sock. */
1571 	unix_state_lock_nested(sk, U_LOCK_SECOND);
1572 
1573 	if (unlikely(sk->sk_state != TCP_CLOSE)) {
1574 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1575 		unix_state_unlock(sk);
1576 		goto out_unlock;
1577 	}
1578 
1579 	err = security_unix_stream_connect(sk, other, newsk);
1580 	if (err) {
1581 		unix_state_unlock(sk);
1582 		goto out_unlock;
1583 	}
1584 
1585 	/* The way is open! Quickly set all the necessary fields... */
1586 
	/* newsk's peer pointer holds a reference on sk. */
1587 	sock_hold(sk);
1588 	unix_peer(newsk)	= sk;
1589 	newsk->sk_state		= TCP_ESTABLISHED;
1590 	newsk->sk_type		= sk->sk_type;
1591 	init_peercred(newsk);
1592 	newu = unix_sk(newsk);
1593 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1594 	otheru = unix_sk(other);
1595 
1596 	/* copy address information from listening to new sock
1597 	 *
1598 	 * The contents of *(otheru->addr) and otheru->path
1599 	 * are seen fully set up here, since we have found
1600 	 * otheru in hash under its lock.  Insertion into the
1601 	 * hash chain we'd found it in had been done in an
1602 	 * earlier critical area protected by the chain's lock,
1603 	 * the same one where we'd set *(otheru->addr) contents,
1604 	 * as well as otheru->path and otheru->addr itself.
1605 	 *
1606 	 * Using smp_store_release() here to set newu->addr
1607 	 * is enough to make those stores, as well as stores
1608 	 * to newu->path visible to anyone who gets newu->addr
1609 	 * by smp_load_acquire().  IOW, the same warranties
1610 	 * as for unix_sock instances bound in unix_bind() or
1611 	 * in unix_autobind().
1612 	 */
1613 	if (otheru->path.dentry) {
1614 		path_get(&otheru->path);
1615 		newu->path = otheru->path;
1616 	}
1617 	refcount_inc(&otheru->addr->refcnt);
1618 	smp_store_release(&newu->addr, otheru->addr);
1619 
1620 	/* Set credentials */
1621 	copy_peercred(sk, other);
1622 
1623 	sock->state	= SS_CONNECTED;
1624 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
	/* sk's peer pointer holds a reference on newsk. */
1625 	sock_hold(newsk);
1626 
1627 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1628 	unix_peer(sk)	= newsk;
1629 
1630 	unix_state_unlock(sk);
1631 
1632 	/* take ten and send info to listening sock */
1633 	spin_lock(&other->sk_receive_queue.lock);
1634 	__skb_queue_tail(&other->sk_receive_queue, skb);
1635 	spin_unlock(&other->sk_receive_queue.lock);
1636 	unix_state_unlock(other);
1637 	other->sk_data_ready(other);
	/* Drop the reference obtained by unix_find_other(). */
1638 	sock_put(other);
1639 	return 0;
1640 
1641 out_unlock:
1642 	if (other)
1643 		unix_state_unlock(other);
1644 
1645 out:
	/* skb and newsk may still be NULL when an early step failed. */
1646 	kfree_skb(skb);
1647 	if (newsk)
1648 		unix_release_sock(newsk, 0);
1649 	if (other)
1650 		sock_put(other);
1651 	return err;
1652 }
1653 
1654 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1655 {
1656 	struct sock *ska = socka->sk, *skb = sockb->sk;
1657 
1658 	/* Join our sockets back to back */
1659 	sock_hold(ska);
1660 	sock_hold(skb);
1661 	unix_peer(ska) = skb;
1662 	unix_peer(skb) = ska;
1663 	init_peercred(ska);
1664 	init_peercred(skb);
1665 
1666 	ska->sk_state = TCP_ESTABLISHED;
1667 	skb->sk_state = TCP_ESTABLISHED;
1668 	socka->state  = SS_CONNECTED;
1669 	sockb->state  = SS_CONNECTED;
1670 	return 0;
1671 }
1672 
1673 static void unix_sock_inherit_flags(const struct socket *old,
1674 				    struct socket *new)
1675 {
1676 	if (test_bit(SOCK_PASSCRED, &old->flags))
1677 		set_bit(SOCK_PASSCRED, &new->flags);
1678 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1679 		set_bit(SOCK_PASSPIDFD, &new->flags);
1680 	if (test_bit(SOCK_PASSSEC, &old->flags))
1681 		set_bit(SOCK_PASSSEC, &new->flags);
1682 }
1683 
1684 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1685 		       bool kern)
1686 {
1687 	struct sock *sk = sock->sk;
1688 	struct sock *tsk;
1689 	struct sk_buff *skb;
1690 	int err;
1691 
1692 	err = -EOPNOTSUPP;
1693 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1694 		goto out;
1695 
1696 	err = -EINVAL;
1697 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1698 		goto out;
1699 
1700 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1701 	 * so that no locks are necessary.
1702 	 */
1703 
1704 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1705 				&err);
1706 	if (!skb) {
1707 		/* This means receive shutdown. */
1708 		if (err == 0)
1709 			err = -EINVAL;
1710 		goto out;
1711 	}
1712 
1713 	tsk = skb->sk;
1714 	skb_free_datagram(sk, skb);
1715 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1716 
1717 	/* attach accepted sock to socket */
1718 	unix_state_lock(tsk);
1719 	newsock->state = SS_CONNECTED;
1720 	unix_sock_inherit_flags(sock, newsock);
1721 	sock_graft(tsk, newsock);
1722 	unix_state_unlock(tsk);
1723 	return 0;
1724 
1725 out:
1726 	return err;
1727 }
1728 
1729 
1730 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1731 {
1732 	struct sock *sk = sock->sk;
1733 	struct unix_address *addr;
1734 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1735 	int err = 0;
1736 
1737 	if (peer) {
1738 		sk = unix_peer_get(sk);
1739 
1740 		err = -ENOTCONN;
1741 		if (!sk)
1742 			goto out;
1743 		err = 0;
1744 	} else {
1745 		sock_hold(sk);
1746 	}
1747 
1748 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1749 	if (!addr) {
1750 		sunaddr->sun_family = AF_UNIX;
1751 		sunaddr->sun_path[0] = 0;
1752 		err = offsetof(struct sockaddr_un, sun_path);
1753 	} else {
1754 		err = addr->len;
1755 		memcpy(sunaddr, addr->name, addr->len);
1756 	}
1757 	sock_put(sk);
1758 out:
1759 	return err;
1760 }
1761 
1762 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1763 {
1764 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1765 
1766 	/*
1767 	 * Garbage collection of unix sockets starts by selecting a set of
1768 	 * candidate sockets which have reference only from being in flight
1769 	 * (total_refs == inflight_refs).  This condition is checked once during
1770 	 * the candidate collection phase, and candidates are marked as such, so
1771 	 * that non-candidates can later be ignored.  While inflight_refs is
1772 	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1773 	 * is an instantaneous decision.
1774 	 *
1775 	 * Once a candidate, however, the socket must not be reinstalled into a
1776 	 * file descriptor while the garbage collection is in progress.
1777 	 *
1778 	 * If the above conditions are met, then the directed graph of
1779 	 * candidates (*) does not change while unix_gc_lock is held.
1780 	 *
1781 	 * Any operations that changes the file count through file descriptors
1782 	 * (dup, close, sendmsg) does not change the graph since candidates are
1783 	 * not installed in fds.
1784 	 *
1785 	 * Dequeing a candidate via recvmsg would install it into an fd, but
1786 	 * that takes unix_gc_lock to decrement the inflight count, so it's
1787 	 * serialized with garbage collection.
1788 	 *
1789 	 * MSG_PEEK is special in that it does not change the inflight count,
1790 	 * yet does install the socket into an fd.  The following lock/unlock
1791 	 * pair is to ensure serialization with garbage collection.  It must be
1792 	 * done between incrementing the file count and installing the file into
1793 	 * an fd.
1794 	 *
1795 	 * If garbage collection starts after the barrier provided by the
1796 	 * lock/unlock, then it will see the elevated refcount and not mark this
1797 	 * as a candidate.  If a garbage collection is already in progress
1798 	 * before the file count was incremented, then the lock/unlock pair will
1799 	 * ensure that garbage collection is finished before progressing to
1800 	 * installing the fd.
1801 	 *
1802 	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1803 	 * which is on the queue of listening socket A.
1804 	 */
1805 	spin_lock(&unix_gc_lock);
1806 	spin_unlock(&unix_gc_lock);
1807 }
1808 
1809 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1810 {
1811 	int err = 0;
1812 
1813 	UNIXCB(skb).pid  = get_pid(scm->pid);
1814 	UNIXCB(skb).uid = scm->creds.uid;
1815 	UNIXCB(skb).gid = scm->creds.gid;
1816 	UNIXCB(skb).fp = NULL;
1817 	unix_get_secdata(scm, skb);
1818 	if (scm->fp && send_fds)
1819 		err = unix_attach_fds(scm, skb);
1820 
1821 	skb->destructor = unix_destruct_scm;
1822 	return err;
1823 }
1824 
1825 static bool unix_passcred_enabled(const struct socket *sock,
1826 				  const struct sock *other)
1827 {
1828 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1829 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1830 	       !other->sk_socket ||
1831 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1832 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1833 }
1834 
1835 /*
1836  * Some apps rely on write() giving SCM_CREDENTIALS
1837  * We include credentials if source or destination socket
1838  * asserted SOCK_PASSCRED.
1839  */
1840 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1841 			    const struct sock *other)
1842 {
1843 	if (UNIXCB(skb).pid)
1844 		return;
1845 	if (unix_passcred_enabled(sock, other)) {
1846 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1847 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1848 	}
1849 }
1850 
1851 static bool unix_skb_scm_eq(struct sk_buff *skb,
1852 			    struct scm_cookie *scm)
1853 {
1854 	return UNIXCB(skb).pid == scm->pid &&
1855 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1856 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1857 	       unix_secdata_eq(scm, skb);
1858 }
1859 
1860 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1861 {
1862 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1863 	struct unix_sock *u = unix_sk(sk);
1864 
1865 	if (unlikely(fp && fp->count))
1866 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1867 }
1868 
1869 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1870 {
1871 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1872 	struct unix_sock *u = unix_sk(sk);
1873 
1874 	if (unlikely(fp && fp->count))
1875 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1876 }
1877 
1878 /*
1879  *	Send AF_UNIX data.
1880  */
1881 
/*
 * sendmsg() for datagram sockets.
 *
 * The destination is either the address in msg->msg_name or, when no
 * address is given, the connected peer.  The whole message is copied into
 * one skb which is queued on the receiver's receive queue.
 *
 * Returns @len on success (also when the receiver's socket filter drops
 * the packet), or a negative errno: -EOPNOTSUPP for MSG_OOB, -ENOTCONN
 * without address and peer, -EMSGSIZE if @len exceeds sk_sndbuf - 32,
 * -EPERM if unix_may_send() refuses, -EPIPE if the receiver is shut down
 * for receive (or a SOCK_SEQPACKET peer died), -ECONNREFUSED if the
 * connected peer died, -ECONNRESET if a dead receiver was not the peer and
 * no address is available to look it up again, -EAGAIN if the receiver's
 * queue is full and the timeout is zero, or an error from the helpers.
 */
1882 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1883 			      size_t len)
1884 {
1885 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1886 	struct sock *sk = sock->sk, *other = NULL;
1887 	struct unix_sock *u = unix_sk(sk);
1888 	struct scm_cookie scm;
1889 	struct sk_buff *skb;
1890 	int data_len = 0;
	/* Non-zero while sk's state lock is held in addition to other's. */
1891 	int sk_locked;
1892 	long timeo;
1893 	int err;
1894 
1895 	wait_for_unix_gc();
1896 	err = scm_send(sock, msg, &scm, false);
1897 	if (err < 0)
1898 		return err;
1899 
1900 	err = -EOPNOTSUPP;
1901 	if (msg->msg_flags&MSG_OOB)
1902 		goto out;
1903 
1904 	if (msg->msg_namelen) {
1905 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1906 		if (err)
1907 			goto out;
1908 	} else {
		/* No address given: send to the connected peer, if there is one. */
1909 		sunaddr = NULL;
1910 		err = -ENOTCONN;
1911 		other = unix_peer_get(sk);
1912 		if (!other)
1913 			goto out;
1914 	}
1915 
	/* Credential passing is enabled but sk has no address: autobind first. */
1916 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1917 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1918 	    !READ_ONCE(u->addr)) {
1919 		err = unix_autobind(sk);
1920 		if (err)
1921 			goto out;
1922 	}
1923 
1924 	err = -EMSGSIZE;
1925 	if (len > READ_ONCE(sk->sk_sndbuf) - 32)
1926 		goto out;
1927 
	/* For large messages, data_len is the part passed to
	 * sock_alloc_send_pskb() as paged data instead of linear data.
	 */
1928 	if (len > SKB_MAX_ALLOC) {
1929 		data_len = min_t(size_t,
1930 				 len - SKB_MAX_ALLOC,
1931 				 MAX_SKB_FRAGS * PAGE_SIZE);
1932 		data_len = PAGE_ALIGN(data_len);
1933 
1934 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1935 	}
1936 
1937 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1938 				   msg->msg_flags & MSG_DONTWAIT, &err,
1939 				   PAGE_ALLOC_COSTLY_ORDER);
1940 	if (skb == NULL)
1941 		goto out;
1942 
1943 	err = unix_scm_to_skb(&scm, skb, true);
1944 	if (err < 0)
1945 		goto out_free;
1946 
1947 	skb_put(skb, len - data_len);
1948 	skb->data_len = data_len;
1949 	skb->len = len;
1950 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1951 	if (err)
1952 		goto out_free;
1953 
1954 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1955 
	/* Entered with no lock held; other is NULL if it must be looked up (again). */
1956 restart:
1957 	if (!other) {
1958 		err = -ECONNRESET;
1959 		if (sunaddr == NULL)
1960 			goto out_free;
1961 
1962 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1963 					sk->sk_type);
1964 		if (IS_ERR(other)) {
1965 			err = PTR_ERR(other);
			/* Reset so the exit path does not sock_put() an ERR_PTR. */
1966 			other = NULL;
1967 			goto out_free;
1968 		}
1969 	}
1970 
1971 	if (sk_filter(other, skb) < 0) {
1972 		/* Toss the packet but do not return any error to the sender */
1973 		err = len;
1974 		goto out_free;
1975 	}
1976 
1977 	sk_locked = 0;
1978 	unix_state_lock(other);
	/* Re-entered from below with both sk and other locked (sk_locked == 1). */
1979 restart_locked:
1980 	err = -EPERM;
1981 	if (!unix_may_send(sk, other))
1982 		goto out_unlock;
1983 
1984 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
1985 		/*
1986 		 *	Check with 1003.1g - what should
1987 		 *	datagram error
1988 		 */
1989 		unix_state_unlock(other);
1990 		sock_put(other);
1991 
1992 		if (!sk_locked)
1993 			unix_state_lock(sk);
1994 
1995 		err = 0;
1996 		if (sk->sk_type == SOCK_SEQPACKET) {
1997 			/* We are here only when racing with unix_release_sock()
1998 			 * is clearing @other. Never change state to TCP_CLOSE
1999 			 * unlike SOCK_DGRAM wants.
2000 			 */
2001 			unix_state_unlock(sk);
2002 			err = -EPIPE;
2003 		} else if (unix_peer(sk) == other) {
			/* The dead sock was our connected peer: disconnect from it. */
2004 			unix_peer(sk) = NULL;
2005 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2006 
2007 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2008 			unix_state_unlock(sk);
2009 
2010 			unix_dgram_disconnected(sk, other);
			/* Drop the reference that backed unix_peer(sk). */
2011 			sock_put(other);
2012 			err = -ECONNREFUSED;
2013 		} else {
2014 			unix_state_unlock(sk);
2015 		}
2016 
		/* err == 0: the dead sock was not our peer, so look the address up again. */
2017 		other = NULL;
2018 		if (err)
2019 			goto out_free;
2020 		goto restart;
2021 	}
2022 
2023 	err = -EPIPE;
2024 	if (other->sk_shutdown & RCV_SHUTDOWN)
2025 		goto out_unlock;
2026 
2027 	if (sk->sk_type != SOCK_SEQPACKET) {
2028 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2029 		if (err)
2030 			goto out_unlock;
2031 	}
2032 
2033 	/* other == sk && unix_peer(other) != sk if
2034 	 * - unix_peer(sk) == NULL, destination address bound to sk
2035 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
2036 	 */
2037 	if (other != sk &&
2038 	    unlikely(unix_peer(other) != sk &&
2039 	    unix_recvq_full_lockless(other))) {
2040 		if (timeo) {
			/* unix_wait_for_peer() releases other's state lock. */
2041 			timeo = unix_wait_for_peer(other, timeo);
2042 
2043 			err = sock_intr_errno(timeo);
2044 			if (signal_pending(current))
2045 				goto out_free;
2046 
2047 			goto restart;
2048 		}
2049 
		/* Timeout is zero: take both locks in the order fixed by
		 * unix_state_double_lock().
		 */
2050 		if (!sk_locked) {
2051 			unix_state_unlock(other);
2052 			unix_state_double_lock(sk, other);
2053 		}
2054 
2055 		if (unix_peer(sk) != other ||
2056 		    unix_dgram_peer_wake_me(sk, other)) {
2057 			err = -EAGAIN;
			/* Both locks are held here, so out_unlock must release sk as well. */
2058 			sk_locked = 1;
2059 			goto out_unlock;
2060 		}
2061 
		/* other's lock was dropped above: redo all checks with both locks held. */
2062 		if (!sk_locked) {
2063 			sk_locked = 1;
2064 			goto restart_locked;
2065 		}
2066 	}
2067 
2068 	if (unlikely(sk_locked))
2069 		unix_state_unlock(sk);
2070 
2071 	if (sock_flag(other, SOCK_RCVTSTAMP))
2072 		__net_timestamp(skb);
2073 	maybe_add_creds(skb, sock, other);
2074 	scm_stat_add(other, skb);
2075 	skb_queue_tail(&other->sk_receive_queue, skb);
2076 	unix_state_unlock(other);
2077 	other->sk_data_ready(other);
2078 	sock_put(other);
2079 	scm_destroy(&scm);
2080 	return len;
2081 
2082 out_unlock:
2083 	if (sk_locked)
2084 		unix_state_unlock(sk);
2085 	unix_state_unlock(other);
2086 out_free:
2087 	kfree_skb(skb);
2088 out:
2089 	if (other)
2090 		sock_put(other);
2091 	scm_destroy(&scm);
2092 	return err;
2093 }
2094 
2095 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2096  * bytes, and a minimum of a full page.
 *
 * The value is PAGE_SIZE shifted left by the allocation order that
 * get_order() returns for 32768 bytes.
2097  */
2098 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2099 
2100 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Queue one byte of out-of-band data from @msg on @other.
 *
 * A one-byte skb is allocated on the sending sock, filled from the message
 * iterator and appended to @other's receive queue.  It is also recorded in
 * ousk->oob_skb, replacing (and releasing) any previously recorded OOB skb.
 * File descriptors in @scm are attached only if @fds_sent is false.
 *
 * Returns 0 on success, -EPIPE if @other is dead or shut down for receive,
 * or the error from skb allocation, unix_scm_to_skb() or the copy.
 */
2101 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2102 		     struct scm_cookie *scm, bool fds_sent)
2103 {
2104 	struct unix_sock *ousk = unix_sk(other);
2105 	struct sk_buff *skb;
2106 	int err = 0;
2107 
2108 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2109 
2110 	if (!skb)
2111 		return err;
2112 
2113 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2114 	if (err < 0) {
2115 		kfree_skb(skb);
2116 		return err;
2117 	}
2118 	skb_put(skb, 1);
2119 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2120 
2121 	if (err) {
2122 		kfree_skb(skb);
2123 		return err;
2124 	}
2125 
2126 	unix_state_lock(other);
2127 
2128 	if (sock_flag(other, SOCK_DEAD) ||
2129 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2130 		unix_state_unlock(other);
2131 		kfree_skb(skb);
2132 		return -EPIPE;
2133 	}
2134 
2135 	maybe_add_creds(skb, sock, other);
	/* Extra reference for the ousk->oob_skb pointer set below. */
2136 	skb_get(skb);
2137 
2138 	scm_stat_add(other, skb);
2139 
2140 	spin_lock(&other->sk_receive_queue.lock);
	/* Release the reference held on behalf of the previous OOB skb. */
2141 	if (ousk->oob_skb)
2142 		consume_skb(ousk->oob_skb);
2143 	WRITE_ONCE(ousk->oob_skb, skb);
2144 	__skb_queue_tail(&other->sk_receive_queue, skb);
2145 	spin_unlock(&other->sk_receive_queue.lock);
2146 
2147 	sk_send_sigurg(other);
2148 	unix_state_unlock(other);
2149 	other->sk_data_ready(other);
2150 
	/* err is 0 here: every failure above returned early. */
2151 	return err;
2152 }
2153 #endif
2154 
/*
 * sendmsg() for SOCK_STREAM unix sockets.
 *
 * The payload is cut into skbs which are appended one by one to the peer's
 * receive queue.  With MSG_OOB (and CONFIG_AF_UNIX_OOB) the last byte of
 * the message is held back and queued separately through queue_oob().
 *
 * Returns the number of bytes queued.  If nothing at all was queued the
 * negative errno of the failing step is returned instead; a failure after
 * some data went out is reported as a short write.  SIGPIPE is raised only
 * when nothing was sent and MSG_NOSIGNAL is not set.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		/* Reserve the final byte for queue_oob(); an empty OOB send
		 * falls through to the error below.
		 */
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	/* A destination address is never accepted on a stream socket. */
	if (msg->msg_namelen) {
		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			/* Empty skb; pages are attached by skb_splice_from_iter() below. */
			skb = sock_alloc_send_pskb(sk, 0, 0,
						   msg->msg_flags & MSG_DONTWAIT,
						   &err, 0);
		} else {
			/* Keep two messages in the pipe so it schedules better */
			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);

			/* allow fallback to order-0 allocations */
			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

			/* Whatever does not fit in the linear head goes to pages. */
			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
						   msg->msg_flags & MSG_DONTWAIT, &err,
						   get_order(UNIX_SKB_FRAGS_SZ));
		}
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
						   sk->sk_allocation);
			if (err < 0) {
				kfree_skb(skb);
				goto out_err;
			}
			/* The splice may take fewer bytes than requested. */
			size = err;
			refcount_add(size, &sk->sk_wmem_alloc);
		} else {
			skb_put(skb, size - data_len);
			skb->data_len = data_len;
			skb->len = size;
			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
			if (err) {
				kfree_skb(skb);
				goto out_err;
			}
		}

		unix_state_lock(other);

		/* Peer state is re-checked under its lock for every skb. */
		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		scm_stat_add(other, skb);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		err = queue_oob(sock, msg, other, &scm, fds_sent);
		if (err)
			goto out_err;
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	/* Partial progress wins over the error code. */
	return sent ? : err;
}
2286 
2287 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2288 				  size_t len)
2289 {
2290 	int err;
2291 	struct sock *sk = sock->sk;
2292 
2293 	err = sock_error(sk);
2294 	if (err)
2295 		return err;
2296 
2297 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2298 		return -ENOTCONN;
2299 
2300 	if (msg->msg_namelen)
2301 		msg->msg_namelen = 0;
2302 
2303 	return unix_dgram_sendmsg(sock, msg, len);
2304 }
2305 
2306 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2307 				  size_t size, int flags)
2308 {
2309 	struct sock *sk = sock->sk;
2310 
2311 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2312 		return -ENOTCONN;
2313 
2314 	return unix_dgram_recvmsg(sock, msg, size, flags);
2315 }
2316 
2317 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2318 {
2319 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2320 
2321 	if (addr) {
2322 		msg->msg_namelen = addr->len;
2323 		memcpy(msg->msg_name, addr->name, addr->len);
2324 	}
2325 }
2326 
/*
 * Receive one datagram from @sk's receive queue into @msg.
 *
 * MSG_OOB is rejected with -EOPNOTSUPP.  The function waits (subject to
 * the receive timeout / MSG_DONTWAIT) until a datagram is available, then
 * copies at most @size bytes, setting MSG_TRUNC in msg->msg_flags when
 * the datagram is longer.  Sender address, timestamp, credentials and
 * passed file descriptors are delivered through @msg as well.
 *
 * Returns the number of bytes copied (or the remaining datagram length
 * when the caller passed MSG_TRUNC in @flags), 0 for a SOCK_SEQPACKET
 * socket whose receive side is shut down and has nothing queued, or a
 * negative errno.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			/* Leave the loop with iolock still held. */
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	/* Wake anybody sleeping on this socket's peer_wait queue for write space. */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	/* Clamp to what is left of the datagram past the peek offset. */
	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	/* With MSG_TRUNC the caller gets the full remaining length. */
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
2433 
2434 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2435 			      int flags)
2436 {
2437 	struct sock *sk = sock->sk;
2438 
2439 #ifdef CONFIG_BPF_SYSCALL
2440 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2441 
2442 	if (prot != &unix_dgram_proto)
2443 		return prot->recvmsg(sk, msg, size, flags, NULL);
2444 #endif
2445 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2446 }
2447 
2448 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2449 {
2450 	struct unix_sock *u = unix_sk(sk);
2451 	struct sk_buff *skb;
2452 	int err;
2453 
2454 	mutex_lock(&u->iolock);
2455 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2456 	mutex_unlock(&u->iolock);
2457 	if (!skb)
2458 		return err;
2459 
2460 	return recv_actor(sk, skb);
2461 }
2462 
/*
 *	Sleep until more data has arrived. But check for races..
 *
 *	@last and @last_len describe the tail of the receive queue as the
 *	caller last saw it.  The wait ends as soon as the tail skb or its
 *	length differs from that snapshot, the socket has an error or its
 *	receive side is shut down, a signal is pending, the timeout has
 *	run out, or the socket is found dead after waking up.
 *
 *	Returns the remaining timeout.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		/* Get on the wait queue before testing the conditions. */
		prepare_to_wait(sk_sleep(sk), &wait, state);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		/* The state lock is dropped for the duration of the sleep. */
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
2503 
2504 static unsigned int unix_skb_len(const struct sk_buff *skb)
2505 {
2506 	return skb->len - UNIXCB(skb).consumed;
2507 }
2508 
/*
 * Parameters of one stream read, shared between recvmsg() and
 * splice_read().  Fields that an entry point does not set are
 * zero-initialised by its designated initialiser.
 */
struct unix_stream_read_state {
	/* Consumes up to "chunk" bytes of the skb starting "skip" bytes past
	 * the already-consumed part; returns bytes taken or a negative value.
	 */
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;		/* socket being read */
	struct msghdr *msg;		/* destination for recvmsg(); unset for splice */
	struct pipe_inode_info *pipe;	/* destination for splice_read() */
	size_t size;			/* maximum number of bytes to read */
	int flags;			/* MSG_* receive flags */
	unsigned int splice_flags;	/* flags passed on to skb_splice_bits() */
};
2519 
2520 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Handle a MSG_OOB receive: deliver the byte held in u->oob_skb.
 *
 * Fails with -EINVAL when SOCK_URGINLINE is set or no OOB skb is pending.
 * Without MSG_PEEK the oob_skb pointer is cleared and the byte is marked
 * consumed; with MSG_PEEK a temporary reference is taken instead and the
 * pointer is left in place.
 *
 * Returns 1 on success (MSG_OOB is set in the msghdr flags) or -EFAULT
 * when the recv actor reports a failure.
 */
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int chunk = 1;
	struct sk_buff *oob_skb;

	mutex_lock(&u->iolock);
	unix_state_lock(sk);
	spin_lock(&sk->sk_receive_queue.lock);

	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
		spin_unlock(&sk->sk_receive_queue.lock);
		unix_state_unlock(sk);
		mutex_unlock(&u->iolock);
		return -EINVAL;
	}

	oob_skb = u->oob_skb;

	/* Either way this function owns one reference from here on; it is
	 * released by the consume_skb() below.
	 */
	if (!(state->flags & MSG_PEEK))
		WRITE_ONCE(u->oob_skb, NULL);
	else
		skb_get(oob_skb);

	spin_unlock(&sk->sk_receive_queue.lock);
	unix_state_unlock(sk);

	/* The copy runs with only iolock held. */
	chunk = state->recv_actor(oob_skb, 0, chunk, state);

	if (!(state->flags & MSG_PEEK))
		UNIXCB(oob_skb).consumed += 1;

	consume_skb(oob_skb);

	mutex_unlock(&u->iolock);

	if (chunk < 0)
		return -EFAULT;

	state->msg->msg_flags |= MSG_OOB;
	return 1;
}
2565 
/*
 * Decide which skb a normal (non-MSG_OOB) stream read should look at when
 * @skb is the next candidate from the receive queue.
 *
 * - A fully consumed skb is unlinked and released (unless peeking) and
 *   NULL is returned.
 * - If @skb is the pending OOB skb and data was already copied in this
 *   call (@copied != 0), NULL is returned so the read stops in front of
 *   the OOB byte.
 * - If @skb is the pending OOB skb and nothing was copied yet:
 *     without MSG_PEEK and with SOCK_URGINLINE, the OOB mark is cleared,
 *     one reference is dropped and @skb itself is returned;
 *     without MSG_PEEK and without SOCK_URGINLINE, the OOB skb is removed
 *     from the queue and released, and the new queue head is returned;
 *     with MSG_PEEK and without SOCK_URGINLINE, the skb following the OOB
 *     skb is returned.
 * - Otherwise @skb is returned unchanged.
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct unix_sock *u = unix_sk(sk);

	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
		skb_unlink(skb, &sk->sk_receive_queue);
		consume_skb(skb);
		skb = NULL;
	} else {
		struct sk_buff *unlinked_skb = NULL;

		spin_lock(&sk->sk_receive_queue.lock);

		if (skb == u->oob_skb) {
			if (copied) {
				skb = NULL;
			} else if (!(flags & MSG_PEEK)) {
				if (sock_flag(sk, SOCK_URGINLINE)) {
					WRITE_ONCE(u->oob_skb, NULL);
					consume_skb(skb);
				} else {
					__skb_unlink(skb, &sk->sk_receive_queue);
					WRITE_ONCE(u->oob_skb, NULL);
					unlinked_skb = skb;
					skb = skb_peek(&sk->sk_receive_queue);
				}
			} else if (!sock_flag(sk, SOCK_URGINLINE)) {
				skb = skb_peek_next(skb, &sk->sk_receive_queue);
			}
		}

		spin_unlock(&sk->sk_receive_queue.lock);

		/* Release the unlinked OOB skb only after the queue lock is
		 * dropped.  skb_unref() drops one reference and is expected
		 * to report that another one is still held; kfree_skb() then
		 * drops that one.
		 */
		if (unlinked_skb) {
			WARN_ON_ONCE(skb_unref(unlinked_skb));
			kfree_skb(unlinked_skb);
		}
	}
	return skb;
}
2607 #endif
2608 
/*
 * Dequeue one skb from a connected stream socket without blocking and
 * pass it to @recv_actor.
 *
 * Returns -ENOTCONN when the socket is not in TCP_ESTABLISHED, or the
 * skb_recv_datagram() error when nothing could be dequeued.  With
 * CONFIG_AF_UNIX_OOB, a dequeued skb that is the pending OOB skb is not
 * given to the actor: the OOB mark is cleared, the skb is dropped and
 * -EAGAIN is returned (-ECONNRESET if the socket is already dead).
 * Otherwise the actor's return value is passed through.
 */
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
		return -ENOTCONN;

	mutex_lock(&u->iolock);
	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
	mutex_unlock(&u->iolock);
	if (!skb)
		return err;

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	/* Lockless pre-check; repeated under the queue lock below. */
	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
		bool drop = false;

		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD)) {
			unix_state_unlock(sk);
			kfree_skb(skb);
			return -ECONNRESET;
		}

		spin_lock(&sk->sk_receive_queue.lock);
		if (likely(skb == u->oob_skb)) {
			WRITE_ONCE(u->oob_skb, NULL);
			drop = true;
		}
		spin_unlock(&sk->sk_receive_queue.lock);

		unix_state_unlock(sk);

		if (drop) {
			/* Two references are released here: one via
			 * skb_unref(), the remaining one via kfree_skb().
			 */
			WARN_ON_ONCE(skb_unref(skb));
			kfree_skb(skb);
			return -EAGAIN;
		}
	}
#endif

	return recv_actor(sk, skb);
}
2655 
/*
 * Common read path for stream sockets, used by recvmsg() and
 * splice_read().
 *
 * Walks the receive queue under u->iolock and hands each skb (or the part
 * of it after the peek offset) to state->recv_actor until state->size
 * bytes were taken, the queue runs dry and the low-water target is met,
 * or a boundary is hit (a different writer's credentials, an skb carrying
 * file descriptors, or pending OOB data).  MSG_OOB requests are diverted
 * to unix_stream_recv_urg() when CONFIG_AF_UNIX_OOB is enabled.
 *
 * @freezable is passed on to unix_stream_data_wait() when the function
 * has to sleep for more data.
 *
 * Returns the number of bytes read; if that is zero, the error code (0
 * at EOF).  A failure of the actor before any data was read yields
 * -EFAULT.
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		/* Remember the queue head and its length for the wait below. */
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

again:
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			/* Stop in front of OOB data once something was read. */
			if (!skb && copied) {
				unix_state_unlock(sk);
				break;
			}
		}
#endif
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			/* iolock is released while sleeping for more data. */
			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		/* Step over skbs that lie entirely below the peek offset. */
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		/* Hold a reference across the actor call. */
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			/* Partially read skb stays queued; the read ends here. */
			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* Do not read past an skb that carried fds. */
			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv_unix(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
2868 
2869 static int unix_stream_read_actor(struct sk_buff *skb,
2870 				  int skip, int chunk,
2871 				  struct unix_stream_read_state *state)
2872 {
2873 	int ret;
2874 
2875 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2876 				    state->msg, chunk);
2877 	return ret ?: chunk;
2878 }
2879 
2880 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2881 			  size_t size, int flags)
2882 {
2883 	struct unix_stream_read_state state = {
2884 		.recv_actor = unix_stream_read_actor,
2885 		.socket = sk->sk_socket,
2886 		.msg = msg,
2887 		.size = size,
2888 		.flags = flags
2889 	};
2890 
2891 	return unix_stream_read_generic(&state, true);
2892 }
2893 
2894 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2895 			       size_t size, int flags)
2896 {
2897 	struct unix_stream_read_state state = {
2898 		.recv_actor = unix_stream_read_actor,
2899 		.socket = sock,
2900 		.msg = msg,
2901 		.size = size,
2902 		.flags = flags
2903 	};
2904 
2905 #ifdef CONFIG_BPF_SYSCALL
2906 	struct sock *sk = sock->sk;
2907 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2908 
2909 	if (prot != &unix_stream_proto)
2910 		return prot->recvmsg(sk, msg, size, flags, NULL);
2911 #endif
2912 	return unix_stream_read_generic(&state, true);
2913 }
2914 
2915 static int unix_stream_splice_actor(struct sk_buff *skb,
2916 				    int skip, int chunk,
2917 				    struct unix_stream_read_state *state)
2918 {
2919 	return skb_splice_bits(skb, state->socket->sk,
2920 			       UNIXCB(skb).consumed + skip,
2921 			       state->pipe, chunk, state->splice_flags);
2922 }
2923 
2924 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2925 				       struct pipe_inode_info *pipe,
2926 				       size_t size, unsigned int flags)
2927 {
2928 	struct unix_stream_read_state state = {
2929 		.recv_actor = unix_stream_splice_actor,
2930 		.socket = sock,
2931 		.pipe = pipe,
2932 		.size = size,
2933 		.splice_flags = flags,
2934 	};
2935 
2936 	if (unlikely(*ppos))
2937 		return -ESPIPE;
2938 
2939 	if (sock->file->f_flags & O_NONBLOCK ||
2940 	    flags & SPLICE_F_NONBLOCK)
2941 		state.flags = MSG_DONTWAIT;
2942 
2943 	return unix_stream_read_generic(&state, false);
2944 }
2945 
2946 static int unix_shutdown(struct socket *sock, int mode)
2947 {
2948 	struct sock *sk = sock->sk;
2949 	struct sock *other;
2950 
2951 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2952 		return -EINVAL;
2953 	/* This maps:
2954 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2955 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2956 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2957 	 */
2958 	++mode;
2959 
2960 	unix_state_lock(sk);
2961 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2962 	other = unix_peer(sk);
2963 	if (other)
2964 		sock_hold(other);
2965 	unix_state_unlock(sk);
2966 	sk->sk_state_change(sk);
2967 
2968 	if (other &&
2969 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2970 
2971 		int peer_mode = 0;
2972 		const struct proto *prot = READ_ONCE(other->sk_prot);
2973 
2974 		if (prot->unhash)
2975 			prot->unhash(other);
2976 		if (mode&RCV_SHUTDOWN)
2977 			peer_mode |= SEND_SHUTDOWN;
2978 		if (mode&SEND_SHUTDOWN)
2979 			peer_mode |= RCV_SHUTDOWN;
2980 		unix_state_lock(other);
2981 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2982 		unix_state_unlock(other);
2983 		other->sk_state_change(other);
2984 		if (peer_mode == SHUTDOWN_MASK)
2985 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2986 		else if (peer_mode & RCV_SHUTDOWN)
2987 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2988 	}
2989 	if (other)
2990 		sock_put(other);
2991 
2992 	return 0;
2993 }
2994 
2995 long unix_inq_len(struct sock *sk)
2996 {
2997 	struct sk_buff *skb;
2998 	long amount = 0;
2999 
3000 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3001 		return -EINVAL;
3002 
3003 	spin_lock(&sk->sk_receive_queue.lock);
3004 	if (sk->sk_type == SOCK_STREAM ||
3005 	    sk->sk_type == SOCK_SEQPACKET) {
3006 		skb_queue_walk(&sk->sk_receive_queue, skb)
3007 			amount += unix_skb_len(skb);
3008 	} else {
3009 		skb = skb_peek(&sk->sk_receive_queue);
3010 		if (skb)
3011 			amount = skb->len;
3012 	}
3013 	spin_unlock(&sk->sk_receive_queue.lock);
3014 
3015 	return amount;
3016 }
3017 EXPORT_SYMBOL_GPL(unix_inq_len);
3018 
/* Amount of send memory currently accounted to @sk, per sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3024 
3025 static int unix_open_file(struct sock *sk)
3026 {
3027 	struct path path;
3028 	struct file *f;
3029 	int fd;
3030 
3031 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3032 		return -EPERM;
3033 
3034 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3035 		return -ENOENT;
3036 
3037 	path = unix_sk(sk)->path;
3038 	if (!path.dentry)
3039 		return -ENOENT;
3040 
3041 	path_get(&path);
3042 
3043 	fd = get_unused_fd_flags(O_CLOEXEC);
3044 	if (fd < 0)
3045 		goto out;
3046 
3047 	f = dentry_open(&path, O_PATH, current_cred());
3048 	if (IS_ERR(f)) {
3049 		put_unused_fd(fd);
3050 		fd = PTR_ERR(f);
3051 		goto out;
3052 	}
3053 
3054 	fd_install(fd, f);
3055 out:
3056 	path_put(&path);
3057 
3058 	return fd;
3059 }
3060 
3061 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3062 {
3063 	struct sock *sk = sock->sk;
3064 	long amount = 0;
3065 	int err;
3066 
3067 	switch (cmd) {
3068 	case SIOCOUTQ:
3069 		amount = unix_outq_len(sk);
3070 		err = put_user(amount, (int __user *)arg);
3071 		break;
3072 	case SIOCINQ:
3073 		amount = unix_inq_len(sk);
3074 		if (amount < 0)
3075 			err = amount;
3076 		else
3077 			err = put_user(amount, (int __user *)arg);
3078 		break;
3079 	case SIOCUNIXFILE:
3080 		err = unix_open_file(sk);
3081 		break;
3082 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3083 	case SIOCATMARK:
3084 		{
3085 			struct sk_buff *skb;
3086 			int answ = 0;
3087 
3088 			skb = skb_peek(&sk->sk_receive_queue);
3089 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3090 				answ = 1;
3091 			err = put_user(answ, (int __user *)arg);
3092 		}
3093 		break;
3094 #endif
3095 	default:
3096 		err = -ENOIOCTLCMD;
3097 		break;
3098 	}
3099 	return err;
3100 }
3101 
3102 #ifdef CONFIG_COMPAT
/* Compat ioctl(): convert @arg with compat_ptr() and reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3107 #endif
3108 
3109 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3110 {
3111 	struct sock *sk = sock->sk;
3112 	unsigned char state;
3113 	__poll_t mask;
3114 	u8 shutdown;
3115 
3116 	sock_poll_wait(file, sock, wait);
3117 	mask = 0;
3118 	shutdown = READ_ONCE(sk->sk_shutdown);
3119 	state = READ_ONCE(sk->sk_state);
3120 
3121 	/* exceptional events? */
3122 	if (READ_ONCE(sk->sk_err))
3123 		mask |= EPOLLERR;
3124 	if (shutdown == SHUTDOWN_MASK)
3125 		mask |= EPOLLHUP;
3126 	if (shutdown & RCV_SHUTDOWN)
3127 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3128 
3129 	/* readable? */
3130 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3131 		mask |= EPOLLIN | EPOLLRDNORM;
3132 	if (sk_is_readable(sk))
3133 		mask |= EPOLLIN | EPOLLRDNORM;
3134 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3135 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3136 		mask |= EPOLLPRI;
3137 #endif
3138 
3139 	/* Connection-based need to check for termination and startup */
3140 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3141 	    state == TCP_CLOSE)
3142 		mask |= EPOLLHUP;
3143 
3144 	/*
3145 	 * we set writable also when the other side has shut down the
3146 	 * connection. This prevents stuck sockets.
3147 	 */
3148 	if (unix_writable(sk, state))
3149 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3150 
3151 	return mask;
3152 }
3153 
/*
 * poll() for datagram-style unix sockets.
 *
 * Besides the usual error / hang-up / readable checks, writability also
 * depends on the peer: if the peer is not connected back to this socket
 * and its receive queue is full, the socket is reported as not writable.
 * The writability checks are skipped when the caller did not ask for any
 * write events.
 */
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				    poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk, state);
	if (writable) {
		unix_state_lock(sk);

		/* The && chain short-circuits, so unix_dgram_peer_wake_me()
		 * is only reached for a peer that is not connected back to
		 * us and whose receive queue is full.
		 */
		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full_lockless(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
3213 
3214 #ifdef CONFIG_PROC_FS
3215 
/*
 * The seq_file position packs two values into one number: the hash bucket
 * index in the high bits and a 1-based offset within that bucket in the
 * low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Extract the bucket index / in-bucket offset, or combine the two. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3221 
3222 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3223 {
3224 	unsigned long offset = get_offset(*pos);
3225 	unsigned long bucket = get_bucket(*pos);
3226 	unsigned long count = 0;
3227 	struct sock *sk;
3228 
3229 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3230 	     sk; sk = sk_next(sk)) {
3231 		if (++count == offset)
3232 			break;
3233 	}
3234 
3235 	return sk;
3236 }
3237 
3238 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3239 {
3240 	unsigned long bucket = get_bucket(*pos);
3241 	struct net *net = seq_file_net(seq);
3242 	struct sock *sk;
3243 
3244 	while (bucket < UNIX_HASH_SIZE) {
3245 		spin_lock(&net->unx.table.locks[bucket]);
3246 
3247 		sk = unix_from_bucket(seq, pos);
3248 		if (sk)
3249 			return sk;
3250 
3251 		spin_unlock(&net->unx.table.locks[bucket]);
3252 
3253 		*pos = set_bucket_offset(++bucket, 1);
3254 	}
3255 
3256 	return NULL;
3257 }
3258 
3259 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3260 				  loff_t *pos)
3261 {
3262 	unsigned long bucket = get_bucket(*pos);
3263 
3264 	sk = sk_next(sk);
3265 	if (sk)
3266 		return sk;
3267 
3268 
3269 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3270 
3271 	*pos = set_bucket_offset(++bucket, 1);
3272 
3273 	return unix_get_first(seq, pos);
3274 }
3275 
3276 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3277 {
3278 	if (!*pos)
3279 		return SEQ_START_TOKEN;
3280 
3281 	return unix_get_first(seq, pos);
3282 }
3283 
3284 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3285 {
3286 	++*pos;
3287 
3288 	if (v == SEQ_START_TOKEN)
3289 		return unix_get_first(seq, pos);
3290 
3291 	return unix_get_next(seq, v, pos);
3292 }
3293 
3294 static void unix_seq_stop(struct seq_file *seq, void *v)
3295 {
3296 	struct sock *sk = v;
3297 
3298 	if (sk)
3299 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3300 }
3301 
3302 static int unix_seq_show(struct seq_file *seq, void *v)
3303 {
3304 
3305 	if (v == SEQ_START_TOKEN)
3306 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3307 			 "Inode Path\n");
3308 	else {
3309 		struct sock *s = v;
3310 		struct unix_sock *u = unix_sk(s);
3311 		unix_state_lock(s);
3312 
3313 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3314 			s,
3315 			refcount_read(&s->sk_refcnt),
3316 			0,
3317 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3318 			s->sk_type,
3319 			s->sk_socket ?
3320 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3321 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3322 			sock_i_ino(s));
3323 
3324 		if (u->addr) {	// under a hash table lock here
3325 			int i, len;
3326 			seq_putc(seq, ' ');
3327 
3328 			i = 0;
3329 			len = u->addr->len -
3330 				offsetof(struct sockaddr_un, sun_path);
3331 			if (u->addr->name->sun_path[0]) {
3332 				len--;
3333 			} else {
3334 				seq_putc(seq, '@');
3335 				i++;
3336 			}
3337 			for ( ; i < len; i++)
3338 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3339 					 '@');
3340 		}
3341 		unix_state_unlock(s);
3342 		seq_putc(seq, '\n');
3343 	}
3344 
3345 	return 0;
3346 }
3347 
3348 static const struct seq_operations unix_seq_ops = {
3349 	.start  = unix_seq_start,
3350 	.next   = unix_seq_next,
3351 	.stop   = unix_seq_stop,
3352 	.show   = unix_seq_show,
3353 };
3354 
3355 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
3356 struct bpf_unix_iter_state {
3357 	struct seq_net_private p;
3358 	unsigned int cur_sk;
3359 	unsigned int end_sk;
3360 	unsigned int max_sk;
3361 	struct sock **batch;
3362 	bool st_bucket_done;
3363 };
3364 
3365 struct bpf_iter__unix {
3366 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3367 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3368 	uid_t uid __aligned(8);
3369 };
3370 
3371 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3372 			      struct unix_sock *unix_sk, uid_t uid)
3373 {
3374 	struct bpf_iter__unix ctx;
3375 
3376 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3377 	ctx.meta = meta;
3378 	ctx.unix_sk = unix_sk;
3379 	ctx.uid = uid;
3380 	return bpf_iter_run_prog(prog, &ctx);
3381 }
3382 
3383 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3384 
3385 {
3386 	struct bpf_unix_iter_state *iter = seq->private;
3387 	unsigned int expected = 1;
3388 	struct sock *sk;
3389 
3390 	sock_hold(start_sk);
3391 	iter->batch[iter->end_sk++] = start_sk;
3392 
3393 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3394 		if (iter->end_sk < iter->max_sk) {
3395 			sock_hold(sk);
3396 			iter->batch[iter->end_sk++] = sk;
3397 		}
3398 
3399 		expected++;
3400 	}
3401 
3402 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3403 
3404 	return expected;
3405 }
3406 
3407 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3408 {
3409 	while (iter->cur_sk < iter->end_sk)
3410 		sock_put(iter->batch[iter->cur_sk++]);
3411 }
3412 
3413 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3414 				       unsigned int new_batch_sz)
3415 {
3416 	struct sock **new_batch;
3417 
3418 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3419 			     GFP_USER | __GFP_NOWARN);
3420 	if (!new_batch)
3421 		return -ENOMEM;
3422 
3423 	bpf_iter_unix_put_batch(iter);
3424 	kvfree(iter->batch);
3425 	iter->batch = new_batch;
3426 	iter->max_sk = new_batch_sz;
3427 
3428 	return 0;
3429 }
3430 
3431 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3432 					loff_t *pos)
3433 {
3434 	struct bpf_unix_iter_state *iter = seq->private;
3435 	unsigned int expected;
3436 	bool resized = false;
3437 	struct sock *sk;
3438 
3439 	if (iter->st_bucket_done)
3440 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3441 
3442 again:
3443 	/* Get a new batch */
3444 	iter->cur_sk = 0;
3445 	iter->end_sk = 0;
3446 
3447 	sk = unix_get_first(seq, pos);
3448 	if (!sk)
3449 		return NULL; /* Done */
3450 
3451 	expected = bpf_iter_unix_hold_batch(seq, sk);
3452 
3453 	if (iter->end_sk == expected) {
3454 		iter->st_bucket_done = true;
3455 		return sk;
3456 	}
3457 
3458 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3459 		resized = true;
3460 		goto again;
3461 	}
3462 
3463 	return sk;
3464 }
3465 
3466 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3467 {
3468 	if (!*pos)
3469 		return SEQ_START_TOKEN;
3470 
3471 	/* bpf iter does not support lseek, so it always
3472 	 * continue from where it was stop()-ped.
3473 	 */
3474 	return bpf_iter_unix_batch(seq, pos);
3475 }
3476 
3477 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3478 {
3479 	struct bpf_unix_iter_state *iter = seq->private;
3480 	struct sock *sk;
3481 
3482 	/* Whenever seq_next() is called, the iter->cur_sk is
3483 	 * done with seq_show(), so advance to the next sk in
3484 	 * the batch.
3485 	 */
3486 	if (iter->cur_sk < iter->end_sk)
3487 		sock_put(iter->batch[iter->cur_sk++]);
3488 
3489 	++*pos;
3490 
3491 	if (iter->cur_sk < iter->end_sk)
3492 		sk = iter->batch[iter->cur_sk];
3493 	else
3494 		sk = bpf_iter_unix_batch(seq, pos);
3495 
3496 	return sk;
3497 }
3498 
3499 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3500 {
3501 	struct bpf_iter_meta meta;
3502 	struct bpf_prog *prog;
3503 	struct sock *sk = v;
3504 	uid_t uid;
3505 	bool slow;
3506 	int ret;
3507 
3508 	if (v == SEQ_START_TOKEN)
3509 		return 0;
3510 
3511 	slow = lock_sock_fast(sk);
3512 
3513 	if (unlikely(sk_unhashed(sk))) {
3514 		ret = SEQ_SKIP;
3515 		goto unlock;
3516 	}
3517 
3518 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3519 	meta.seq = seq;
3520 	prog = bpf_iter_get_info(&meta, false);
3521 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3522 unlock:
3523 	unlock_sock_fast(sk, slow);
3524 	return ret;
3525 }
3526 
3527 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3528 {
3529 	struct bpf_unix_iter_state *iter = seq->private;
3530 	struct bpf_iter_meta meta;
3531 	struct bpf_prog *prog;
3532 
3533 	if (!v) {
3534 		meta.seq = seq;
3535 		prog = bpf_iter_get_info(&meta, true);
3536 		if (prog)
3537 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3538 	}
3539 
3540 	if (iter->cur_sk < iter->end_sk)
3541 		bpf_iter_unix_put_batch(iter);
3542 }
3543 
3544 static const struct seq_operations bpf_iter_unix_seq_ops = {
3545 	.start	= bpf_iter_unix_seq_start,
3546 	.next	= bpf_iter_unix_seq_next,
3547 	.stop	= bpf_iter_unix_seq_stop,
3548 	.show	= bpf_iter_unix_seq_show,
3549 };
3550 #endif
3551 #endif
3552 
3553 static const struct net_proto_family unix_family_ops = {
3554 	.family = PF_UNIX,
3555 	.create = unix_create,
3556 	.owner	= THIS_MODULE,
3557 };
3558 
3559 
3560 static int __net_init unix_net_init(struct net *net)
3561 {
3562 	int i;
3563 
3564 	net->unx.sysctl_max_dgram_qlen = 10;
3565 	if (unix_sysctl_register(net))
3566 		goto out;
3567 
3568 #ifdef CONFIG_PROC_FS
3569 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3570 			     sizeof(struct seq_net_private)))
3571 		goto err_sysctl;
3572 #endif
3573 
3574 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3575 					      sizeof(spinlock_t), GFP_KERNEL);
3576 	if (!net->unx.table.locks)
3577 		goto err_proc;
3578 
3579 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3580 						sizeof(struct hlist_head),
3581 						GFP_KERNEL);
3582 	if (!net->unx.table.buckets)
3583 		goto free_locks;
3584 
3585 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3586 		spin_lock_init(&net->unx.table.locks[i]);
3587 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3588 	}
3589 
3590 	return 0;
3591 
3592 free_locks:
3593 	kvfree(net->unx.table.locks);
3594 err_proc:
3595 #ifdef CONFIG_PROC_FS
3596 	remove_proc_entry("unix", net->proc_net);
3597 err_sysctl:
3598 #endif
3599 	unix_sysctl_unregister(net);
3600 out:
3601 	return -ENOMEM;
3602 }
3603 
3604 static void __net_exit unix_net_exit(struct net *net)
3605 {
3606 	kvfree(net->unx.table.buckets);
3607 	kvfree(net->unx.table.locks);
3608 	unix_sysctl_unregister(net);
3609 	remove_proc_entry("unix", net->proc_net);
3610 }
3611 
3612 static struct pernet_operations unix_net_ops = {
3613 	.init = unix_net_init,
3614 	.exit = unix_net_exit,
3615 };
3616 
3617 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3618 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3619 		     struct unix_sock *unix_sk, uid_t uid)
3620 
3621 #define INIT_BATCH_SZ 16
3622 
3623 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3624 {
3625 	struct bpf_unix_iter_state *iter = priv_data;
3626 	int err;
3627 
3628 	err = bpf_iter_init_seq_net(priv_data, aux);
3629 	if (err)
3630 		return err;
3631 
3632 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3633 	if (err) {
3634 		bpf_iter_fini_seq_net(priv_data);
3635 		return err;
3636 	}
3637 
3638 	return 0;
3639 }
3640 
3641 static void bpf_iter_fini_unix(void *priv_data)
3642 {
3643 	struct bpf_unix_iter_state *iter = priv_data;
3644 
3645 	bpf_iter_fini_seq_net(priv_data);
3646 	kvfree(iter->batch);
3647 }
3648 
3649 static const struct bpf_iter_seq_info unix_seq_info = {
3650 	.seq_ops		= &bpf_iter_unix_seq_ops,
3651 	.init_seq_private	= bpf_iter_init_unix,
3652 	.fini_seq_private	= bpf_iter_fini_unix,
3653 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3654 };
3655 
3656 static const struct bpf_func_proto *
3657 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3658 			     const struct bpf_prog *prog)
3659 {
3660 	switch (func_id) {
3661 	case BPF_FUNC_setsockopt:
3662 		return &bpf_sk_setsockopt_proto;
3663 	case BPF_FUNC_getsockopt:
3664 		return &bpf_sk_getsockopt_proto;
3665 	default:
3666 		return NULL;
3667 	}
3668 }
3669 
3670 static struct bpf_iter_reg unix_reg_info = {
3671 	.target			= "unix",
3672 	.ctx_arg_info_size	= 1,
3673 	.ctx_arg_info		= {
3674 		{ offsetof(struct bpf_iter__unix, unix_sk),
3675 		  PTR_TO_BTF_ID_OR_NULL },
3676 	},
3677 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3678 	.seq_info		= &unix_seq_info,
3679 };
3680 
3681 static void __init bpf_iter_register(void)
3682 {
3683 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3684 	if (bpf_iter_reg_target(&unix_reg_info))
3685 		pr_warn("Warning: could not register bpf iterator unix\n");
3686 }
3687 #endif
3688 
3689 static int __init af_unix_init(void)
3690 {
3691 	int i, rc = -1;
3692 
3693 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3694 
3695 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3696 		spin_lock_init(&bsd_socket_locks[i]);
3697 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3698 	}
3699 
3700 	rc = proto_register(&unix_dgram_proto, 1);
3701 	if (rc != 0) {
3702 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3703 		goto out;
3704 	}
3705 
3706 	rc = proto_register(&unix_stream_proto, 1);
3707 	if (rc != 0) {
3708 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3709 		proto_unregister(&unix_dgram_proto);
3710 		goto out;
3711 	}
3712 
3713 	sock_register(&unix_family_ops);
3714 	register_pernet_subsys(&unix_net_ops);
3715 	unix_bpf_build_proto();
3716 
3717 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3718 	bpf_iter_register();
3719 #endif
3720 
3721 out:
3722 	return rc;
3723 }
3724 
3725 static void __exit af_unix_exit(void)
3726 {
3727 	sock_unregister(PF_UNIX);
3728 	proto_unregister(&unix_dgram_proto);
3729 	proto_unregister(&unix_stream_proto);
3730 	unregister_pernet_subsys(&unix_net_ops);
3731 }
3732 
3733 /* Earlier than device_initcall() so that other drivers invoking
3734    request_module() don't end up in a loop when modprobe tries
3735    to use a UNIX socket. But later than subsys_initcall() because
3736    we depend on stuff initialised there */
3737 fs_initcall(af_unix_init);
3738 module_exit(af_unix_exit);
3739 
3740 MODULE_LICENSE("GPL");
3741 MODULE_ALIAS_NETPROTO(PF_UNIX);
3742