xref: /openbmc/linux/net/unix/af_unix.c (revision 46290c6b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid huge amount
 *					of socks hashed (this for unix_gc()
 *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 
120 #include "scm.h"
121 
/* Count of AF_UNIX sockets; incremented and checked against a limit in
 * unix_create1() and decremented in unix_sock_destructor().
 */
static atomic_long_t unix_nr_socks;
/* Buckets, and one spinlock per bucket, for sockets that are looked up by
 * the inode behind their filesystem path (see unix_find_socket_byinode()).
 */
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
125 
126 /* SMP locking strategy:
127  *    hash table is protected with spinlock.
128  *    each socket state is protected by separate spinlock.
129  */
130 
131 static unsigned int unix_unbound_hash(struct sock *sk)
132 {
133 	unsigned long hash = (unsigned long)sk;
134 
135 	hash ^= hash >> 16;
136 	hash ^= hash >> 8;
137 	hash ^= sk->sk_type;
138 
139 	return hash & UNIX_HASH_MOD;
140 }
141 
142 static unsigned int unix_bsd_hash(struct inode *i)
143 {
144 	return i->i_ino & UNIX_HASH_MOD;
145 }
146 
147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
148 				       int addr_len, int type)
149 {
150 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
151 	unsigned int hash;
152 
153 	hash = (__force unsigned int)csum_fold(csum);
154 	hash ^= hash >> 8;
155 	hash ^= type;
156 
157 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
158 }
159 
160 static void unix_table_double_lock(struct net *net,
161 				   unsigned int hash1, unsigned int hash2)
162 {
163 	if (hash1 == hash2) {
164 		spin_lock(&net->unx.table.locks[hash1]);
165 		return;
166 	}
167 
168 	if (hash1 > hash2)
169 		swap(hash1, hash2);
170 
171 	spin_lock(&net->unx.table.locks[hash1]);
172 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
173 }
174 
175 static void unix_table_double_unlock(struct net *net,
176 				     unsigned int hash1, unsigned int hash2)
177 {
178 	if (hash1 == hash2) {
179 		spin_unlock(&net->unx.table.locks[hash1]);
180 		return;
181 	}
182 
183 	spin_unlock(&net->unx.table.locks[hash1]);
184 	spin_unlock(&net->unx.table.locks[hash2]);
185 }
186 
187 #ifdef CONFIG_SECURITY_NETWORK
188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
189 {
190 	UNIXCB(skb).secid = scm->secid;
191 }
192 
193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
194 {
195 	scm->secid = UNIXCB(skb).secid;
196 }
197 
198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
199 {
200 	return (scm->secid == UNIXCB(skb).secid);
201 }
202 #else
203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
204 { }
205 
206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
207 { }
208 
209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
210 {
211 	return true;
212 }
213 #endif /* CONFIG_SECURITY_NETWORK */
214 
/* Peer pointer of a unix socket; kept as a macro because callers also
 * assign through it.
 */
#define unix_peer(s) (unix_sk(s)->peer)
216 
/* Non-zero when @osk's peer is @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return sk == unix_peer(osk);
}
221 
/* Non-zero when @osk has no peer at all, or when its peer is @sk. */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	if (!unix_peer(osk))
		return 1;

	return unix_our_peer(sk, osk);
}
226 
227 static inline int unix_recvq_full(const struct sock *sk)
228 {
229 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
230 }
231 
232 static inline int unix_recvq_full_lockless(const struct sock *sk)
233 {
234 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
235 		READ_ONCE(sk->sk_max_ack_backlog);
236 }
237 
238 struct sock *unix_peer_get(struct sock *s)
239 {
240 	struct sock *peer;
241 
242 	unix_state_lock(s);
243 	peer = unix_peer(s);
244 	if (peer)
245 		sock_hold(peer);
246 	unix_state_unlock(s);
247 	return peer;
248 }
249 EXPORT_SYMBOL_GPL(unix_peer_get);
250 
251 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
252 					     int addr_len)
253 {
254 	struct unix_address *addr;
255 
256 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
257 	if (!addr)
258 		return NULL;
259 
260 	refcount_set(&addr->refcnt, 1);
261 	addr->len = addr_len;
262 	memcpy(addr->name, sunaddr, addr_len);
263 
264 	return addr;
265 }
266 
267 static inline void unix_release_addr(struct unix_address *addr)
268 {
269 	if (refcount_dec_and_test(&addr->refcnt))
270 		kfree(addr);
271 }
272 
/*
 *	Basic sanity check of a unix socket address:
 *		- the length must extend past the sun_path offset (i.e. the
 *		  name is not empty) and must not exceed the size of
 *		  struct sockaddr_un;
 *		- the address family must be AF_UNIX.
 *	Returns 0 when both hold, -EINVAL otherwise.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path))
		return -EINVAL;

	if (addr_len > sizeof(*sunaddr))
		return -EINVAL;

	return sunaddr->sun_family == AF_UNIX ? 0 : -EINVAL;
}
291 
292 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
293 {
294 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
295 	short offset = offsetof(struct sockaddr_storage, __data);
296 
297 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
298 
299 	/* This may look like an off by one error but it is a bit more
300 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
301 	 * sun_path[108] doesn't as such exist.  However in kernel space
302 	 * we are guaranteed that it is a valid memory location in our
303 	 * kernel address buffer because syscall functions always pass
304 	 * a pointer of struct sockaddr_storage which has a bigger buffer
305 	 * than 108.  Also, we must terminate sun_path for strlen() in
306 	 * getname_kernel().
307 	 */
308 	addr->__data[addr_len - offset] = 0;
309 
310 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
311 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
312 	 * know the actual buffer.
313 	 */
314 	return strlen(addr->__data) + offset + 1;
315 }
316 
/* Unlink @sk from its hash chain; this helper takes no lock itself. */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
321 
322 static void __unix_insert_socket(struct net *net, struct sock *sk)
323 {
324 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
325 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
326 }
327 
328 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
329 				 struct unix_address *addr, unsigned int hash)
330 {
331 	__unix_remove_socket(sk);
332 	smp_store_release(&unix_sk(sk)->addr, addr);
333 
334 	sk->sk_hash = hash;
335 	__unix_insert_socket(net, sk);
336 }
337 
338 static void unix_remove_socket(struct net *net, struct sock *sk)
339 {
340 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
341 	__unix_remove_socket(sk);
342 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
343 }
344 
345 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
346 {
347 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
348 	__unix_insert_socket(net, sk);
349 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
350 }
351 
352 static void unix_insert_bsd_socket(struct sock *sk)
353 {
354 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
355 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
356 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
357 }
358 
359 static void unix_remove_bsd_socket(struct sock *sk)
360 {
361 	if (!hlist_unhashed(&sk->sk_bind_node)) {
362 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
363 		__sk_del_bind_node(sk);
364 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
365 
366 		sk_node_init(&sk->sk_bind_node);
367 	}
368 }
369 
370 static struct sock *__unix_find_socket_byname(struct net *net,
371 					      struct sockaddr_un *sunname,
372 					      int len, unsigned int hash)
373 {
374 	struct sock *s;
375 
376 	sk_for_each(s, &net->unx.table.buckets[hash]) {
377 		struct unix_sock *u = unix_sk(s);
378 
379 		if (u->addr->len == len &&
380 		    !memcmp(u->addr->name, sunname, len))
381 			return s;
382 	}
383 	return NULL;
384 }
385 
386 static inline struct sock *unix_find_socket_byname(struct net *net,
387 						   struct sockaddr_un *sunname,
388 						   int len, unsigned int hash)
389 {
390 	struct sock *s;
391 
392 	spin_lock(&net->unx.table.locks[hash]);
393 	s = __unix_find_socket_byname(net, sunname, len, hash);
394 	if (s)
395 		sock_hold(s);
396 	spin_unlock(&net->unx.table.locks[hash]);
397 	return s;
398 }
399 
400 static struct sock *unix_find_socket_byinode(struct inode *i)
401 {
402 	unsigned int hash = unix_bsd_hash(i);
403 	struct sock *s;
404 
405 	spin_lock(&bsd_socket_locks[hash]);
406 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
407 		struct dentry *dentry = unix_sk(s)->path.dentry;
408 
409 		if (dentry && d_backing_inode(dentry) == i) {
410 			sock_hold(s);
411 			spin_unlock(&bsd_socket_locks[hash]);
412 			return s;
413 		}
414 	}
415 	spin_unlock(&bsd_socket_locks[hash]);
416 	return NULL;
417 }
418 
419 /* Support code for asymmetrically connected dgram sockets
420  *
421  * If a datagram socket is connected to a socket not itself connected
422  * to the first socket (eg, /dev/log), clients may only enqueue more
423  * messages if the present receive queue of the server socket is not
424  * "too large". This means there's a second writeability condition
425  * poll and sendmsg need to test. The dgram recv code will do a wake
426  * up on the peer_wait wait queue of a socket upon reception of a
427  * datagram which needs to be propagated to sleeping would-be writers
428  * since these might not have sent anything so far. This can't be
429  * accomplished via poll_wait because the lifetime of the server
430  * socket might be less than that of its clients if these break their
431  * association with it or if the server socket is closed while clients
432  * are still connected to it and there's no way to inform "a polling
433  * implementation" that it should let go of a certain wait queue
434  *
435  * In order to propagate a wake up, a wait_queue_entry_t of the client
436  * socket is enqueued on the peer_wait queue of the server socket
437  * whose wake function does a wake_up on the ordinary client socket
438  * wait queue. This connection is established whenever a write (or
439  * poll for write) hit the flow control condition and broken when the
440  * association to the server socket is dissolved or after a wake up
441  * was relayed.
442  */
443 
444 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
445 				      void *key)
446 {
447 	struct unix_sock *u;
448 	wait_queue_head_t *u_sleep;
449 
450 	u = container_of(q, struct unix_sock, peer_wake);
451 
452 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
453 			    q);
454 	u->peer_wake.private = NULL;
455 
456 	/* relaying can only happen while the wq still exists */
457 	u_sleep = sk_sleep(&u->sk);
458 	if (u_sleep)
459 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
460 
461 	return 0;
462 }
463 
464 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
465 {
466 	struct unix_sock *u, *u_other;
467 	int rc;
468 
469 	u = unix_sk(sk);
470 	u_other = unix_sk(other);
471 	rc = 0;
472 	spin_lock(&u_other->peer_wait.lock);
473 
474 	if (!u->peer_wake.private) {
475 		u->peer_wake.private = other;
476 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
477 
478 		rc = 1;
479 	}
480 
481 	spin_unlock(&u_other->peer_wait.lock);
482 	return rc;
483 }
484 
485 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
486 					    struct sock *other)
487 {
488 	struct unix_sock *u, *u_other;
489 
490 	u = unix_sk(sk);
491 	u_other = unix_sk(other);
492 	spin_lock(&u_other->peer_wait.lock);
493 
494 	if (u->peer_wake.private == other) {
495 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
496 		u->peer_wake.private = NULL;
497 	}
498 
499 	spin_unlock(&u_other->peer_wait.lock);
500 }
501 
502 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
503 						   struct sock *other)
504 {
505 	unix_dgram_peer_wake_disconnect(sk, other);
506 	wake_up_interruptible_poll(sk_sleep(sk),
507 				   EPOLLOUT |
508 				   EPOLLWRNORM |
509 				   EPOLLWRBAND);
510 }
511 
512 /* preconditions:
513  *	- unix_peer(sk) == other
514  *	- association is stable
515  */
516 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
517 {
518 	int connected;
519 
520 	connected = unix_dgram_peer_wake_connect(sk, other);
521 
522 	/* If other is SOCK_DEAD, we want to make sure we signal
523 	 * POLLOUT, such that a subsequent write() can get a
524 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
525 	 * to other and its full, we will hang waiting for POLLOUT.
526 	 */
527 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
528 		return 1;
529 
530 	if (connected)
531 		unix_dgram_peer_wake_disconnect(sk, other);
532 
533 	return 0;
534 }
535 
536 static int unix_writable(const struct sock *sk)
537 {
538 	return sk->sk_state != TCP_LISTEN &&
539 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
540 }
541 
542 static void unix_write_space(struct sock *sk)
543 {
544 	struct socket_wq *wq;
545 
546 	rcu_read_lock();
547 	if (unix_writable(sk)) {
548 		wq = rcu_dereference(sk->sk_wq);
549 		if (skwq_has_sleeper(wq))
550 			wake_up_interruptible_sync_poll(&wq->wait,
551 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
552 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
553 	}
554 	rcu_read_unlock();
555 }
556 
557 /* When dgram socket disconnects (or changes its peer), we clear its receive
558  * queue of packets arrived from previous peer. First, it allows to do
559  * flow control based only on wmem_alloc; second, sk connected to peer
560  * may receive messages only from that peer. */
561 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
562 {
563 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
564 		skb_queue_purge(&sk->sk_receive_queue);
565 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
566 
567 		/* If one link of bidirectional dgram pipe is disconnected,
568 		 * we signal error. Messages are lost. Do not make this,
569 		 * when peer was not connected to us.
570 		 */
571 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
572 			WRITE_ONCE(other->sk_err, ECONNRESET);
573 			sk_error_report(other);
574 		}
575 	}
576 	other->sk_state = TCP_CLOSE;
577 }
578 
579 static void unix_sock_destructor(struct sock *sk)
580 {
581 	struct unix_sock *u = unix_sk(sk);
582 
583 	skb_queue_purge(&sk->sk_receive_queue);
584 
585 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
586 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
587 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
588 	if (!sock_flag(sk, SOCK_DEAD)) {
589 		pr_info("Attempt to release alive unix socket: %p\n", sk);
590 		return;
591 	}
592 
593 	if (u->addr)
594 		unix_release_addr(u->addr);
595 
596 	atomic_long_dec(&unix_nr_socks);
597 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
598 #ifdef UNIX_REFCNT_DEBUG
599 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
600 		atomic_long_read(&unix_nr_socks));
601 #endif
602 }
603 
604 static void unix_release_sock(struct sock *sk, int embrion)
605 {
606 	struct unix_sock *u = unix_sk(sk);
607 	struct sock *skpair;
608 	struct sk_buff *skb;
609 	struct path path;
610 	int state;
611 
612 	unix_remove_socket(sock_net(sk), sk);
613 	unix_remove_bsd_socket(sk);
614 
615 	/* Clear state */
616 	unix_state_lock(sk);
617 	sock_orphan(sk);
618 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
619 	path	     = u->path;
620 	u->path.dentry = NULL;
621 	u->path.mnt = NULL;
622 	state = sk->sk_state;
623 	sk->sk_state = TCP_CLOSE;
624 
625 	skpair = unix_peer(sk);
626 	unix_peer(sk) = NULL;
627 
628 	unix_state_unlock(sk);
629 
630 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
631 	if (u->oob_skb) {
632 		kfree_skb(u->oob_skb);
633 		u->oob_skb = NULL;
634 	}
635 #endif
636 
637 	wake_up_interruptible_all(&u->peer_wait);
638 
639 	if (skpair != NULL) {
640 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
641 			unix_state_lock(skpair);
642 			/* No more writes */
643 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
644 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
645 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
646 			unix_state_unlock(skpair);
647 			skpair->sk_state_change(skpair);
648 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
649 		}
650 
651 		unix_dgram_peer_wake_disconnect(sk, skpair);
652 		sock_put(skpair); /* It may now die */
653 	}
654 
655 	/* Try to flush out this socket. Throw out buffers at least */
656 
657 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
658 		if (state == TCP_LISTEN)
659 			unix_release_sock(skb->sk, 1);
660 		/* passed fds are erased in the kfree_skb hook	      */
661 		UNIXCB(skb).consumed = skb->len;
662 		kfree_skb(skb);
663 	}
664 
665 	if (path.dentry)
666 		path_put(&path);
667 
668 	sock_put(sk);
669 
670 	/* ---- Socket is dead now and most probably destroyed ---- */
671 
672 	/*
673 	 * Fixme: BSD difference: In BSD all sockets connected to us get
674 	 *	  ECONNRESET and we die on the spot. In Linux we behave
675 	 *	  like files and pipes do and wait for the last
676 	 *	  dereference.
677 	 *
678 	 * Can't we simply set sock->err?
679 	 *
680 	 *	  What the above comment does talk about? --ANK(980817)
681 	 */
682 
683 	if (unix_tot_inflight)
684 		unix_gc();		/* Garbage collect fds */
685 }
686 
687 static void init_peercred(struct sock *sk)
688 {
689 	const struct cred *old_cred;
690 	struct pid *old_pid;
691 
692 	spin_lock(&sk->sk_peer_lock);
693 	old_pid = sk->sk_peer_pid;
694 	old_cred = sk->sk_peer_cred;
695 	sk->sk_peer_pid  = get_pid(task_tgid(current));
696 	sk->sk_peer_cred = get_current_cred();
697 	spin_unlock(&sk->sk_peer_lock);
698 
699 	put_pid(old_pid);
700 	put_cred(old_cred);
701 }
702 
703 static void copy_peercred(struct sock *sk, struct sock *peersk)
704 {
705 	const struct cred *old_cred;
706 	struct pid *old_pid;
707 
708 	if (sk < peersk) {
709 		spin_lock(&sk->sk_peer_lock);
710 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
711 	} else {
712 		spin_lock(&peersk->sk_peer_lock);
713 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
714 	}
715 	old_pid = sk->sk_peer_pid;
716 	old_cred = sk->sk_peer_cred;
717 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
718 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
719 
720 	spin_unlock(&sk->sk_peer_lock);
721 	spin_unlock(&peersk->sk_peer_lock);
722 
723 	put_pid(old_pid);
724 	put_cred(old_cred);
725 }
726 
727 static int unix_listen(struct socket *sock, int backlog)
728 {
729 	int err;
730 	struct sock *sk = sock->sk;
731 	struct unix_sock *u = unix_sk(sk);
732 
733 	err = -EOPNOTSUPP;
734 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
735 		goto out;	/* Only stream/seqpacket sockets accept */
736 	err = -EINVAL;
737 	if (!u->addr)
738 		goto out;	/* No listens on an unbound socket */
739 	unix_state_lock(sk);
740 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
741 		goto out_unlock;
742 	if (backlog > sk->sk_max_ack_backlog)
743 		wake_up_interruptible_all(&u->peer_wait);
744 	sk->sk_max_ack_backlog	= backlog;
745 	sk->sk_state		= TCP_LISTEN;
746 	/* set credentials so connect can copy them */
747 	init_peercred(sk);
748 	err = 0;
749 
750 out_unlock:
751 	unix_state_unlock(sk);
752 out:
753 	return err;
754 }
755 
756 static int unix_release(struct socket *);
757 static int unix_bind(struct socket *, struct sockaddr *, int);
758 static int unix_stream_connect(struct socket *, struct sockaddr *,
759 			       int addr_len, int flags);
760 static int unix_socketpair(struct socket *, struct socket *);
761 static int unix_accept(struct socket *, struct socket *, int, bool);
762 static int unix_getname(struct socket *, struct sockaddr *, int);
763 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
764 static __poll_t unix_dgram_poll(struct file *, struct socket *,
765 				    poll_table *);
766 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
767 #ifdef CONFIG_COMPAT
768 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
769 #endif
770 static int unix_shutdown(struct socket *, int);
771 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
772 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
773 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
774 				       struct pipe_inode_info *, size_t size,
775 				       unsigned int flags);
776 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
777 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
778 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
779 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
780 static int unix_dgram_connect(struct socket *, struct sockaddr *,
781 			      int, int);
782 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
783 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
784 				  int);
785 
786 static int unix_set_peek_off(struct sock *sk, int val)
787 {
788 	struct unix_sock *u = unix_sk(sk);
789 
790 	if (mutex_lock_interruptible(&u->iolock))
791 		return -EINTR;
792 
793 	sk->sk_peek_off = val;
794 	mutex_unlock(&u->iolock);
795 
796 	return 0;
797 }
798 
799 #ifdef CONFIG_PROC_FS
800 static int unix_count_nr_fds(struct sock *sk)
801 {
802 	struct sk_buff *skb;
803 	struct unix_sock *u;
804 	int nr_fds = 0;
805 
806 	spin_lock(&sk->sk_receive_queue.lock);
807 	skb = skb_peek(&sk->sk_receive_queue);
808 	while (skb) {
809 		u = unix_sk(skb->sk);
810 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
811 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
812 	}
813 	spin_unlock(&sk->sk_receive_queue.lock);
814 
815 	return nr_fds;
816 }
817 
818 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
819 {
820 	struct sock *sk = sock->sk;
821 	unsigned char s_state;
822 	struct unix_sock *u;
823 	int nr_fds = 0;
824 
825 	if (sk) {
826 		s_state = READ_ONCE(sk->sk_state);
827 		u = unix_sk(sk);
828 
829 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
830 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
831 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
832 		 */
833 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
834 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
835 		else if (s_state == TCP_LISTEN)
836 			nr_fds = unix_count_nr_fds(sk);
837 
838 		seq_printf(m, "scm_fds: %u\n", nr_fds);
839 	}
840 }
841 #else
842 #define unix_show_fdinfo NULL
843 #endif
844 
845 static const struct proto_ops unix_stream_ops = {
846 	.family =	PF_UNIX,
847 	.owner =	THIS_MODULE,
848 	.release =	unix_release,
849 	.bind =		unix_bind,
850 	.connect =	unix_stream_connect,
851 	.socketpair =	unix_socketpair,
852 	.accept =	unix_accept,
853 	.getname =	unix_getname,
854 	.poll =		unix_poll,
855 	.ioctl =	unix_ioctl,
856 #ifdef CONFIG_COMPAT
857 	.compat_ioctl =	unix_compat_ioctl,
858 #endif
859 	.listen =	unix_listen,
860 	.shutdown =	unix_shutdown,
861 	.sendmsg =	unix_stream_sendmsg,
862 	.recvmsg =	unix_stream_recvmsg,
863 	.read_skb =	unix_stream_read_skb,
864 	.mmap =		sock_no_mmap,
865 	.splice_read =	unix_stream_splice_read,
866 	.set_peek_off =	unix_set_peek_off,
867 	.show_fdinfo =	unix_show_fdinfo,
868 };
869 
870 static const struct proto_ops unix_dgram_ops = {
871 	.family =	PF_UNIX,
872 	.owner =	THIS_MODULE,
873 	.release =	unix_release,
874 	.bind =		unix_bind,
875 	.connect =	unix_dgram_connect,
876 	.socketpair =	unix_socketpair,
877 	.accept =	sock_no_accept,
878 	.getname =	unix_getname,
879 	.poll =		unix_dgram_poll,
880 	.ioctl =	unix_ioctl,
881 #ifdef CONFIG_COMPAT
882 	.compat_ioctl =	unix_compat_ioctl,
883 #endif
884 	.listen =	sock_no_listen,
885 	.shutdown =	unix_shutdown,
886 	.sendmsg =	unix_dgram_sendmsg,
887 	.read_skb =	unix_read_skb,
888 	.recvmsg =	unix_dgram_recvmsg,
889 	.mmap =		sock_no_mmap,
890 	.set_peek_off =	unix_set_peek_off,
891 	.show_fdinfo =	unix_show_fdinfo,
892 };
893 
894 static const struct proto_ops unix_seqpacket_ops = {
895 	.family =	PF_UNIX,
896 	.owner =	THIS_MODULE,
897 	.release =	unix_release,
898 	.bind =		unix_bind,
899 	.connect =	unix_stream_connect,
900 	.socketpair =	unix_socketpair,
901 	.accept =	unix_accept,
902 	.getname =	unix_getname,
903 	.poll =		unix_dgram_poll,
904 	.ioctl =	unix_ioctl,
905 #ifdef CONFIG_COMPAT
906 	.compat_ioctl =	unix_compat_ioctl,
907 #endif
908 	.listen =	unix_listen,
909 	.shutdown =	unix_shutdown,
910 	.sendmsg =	unix_seqpacket_sendmsg,
911 	.recvmsg =	unix_seqpacket_recvmsg,
912 	.mmap =		sock_no_mmap,
913 	.set_peek_off =	unix_set_peek_off,
914 	.show_fdinfo =	unix_show_fdinfo,
915 };
916 
/* Intentionally empty ->close(): unix sockets have no work to do here,
 * the hook is provided merely for sockmap.
 */
static void unix_close(struct sock *sk, long timeout)
{
}
923 
/* Intentionally empty ->unhash(): unix sockets have no work to do here,
 * the hook is provided merely for sockmap.
 */
static void unix_unhash(struct sock *sk)
{
}
930 
/* bpf_bypass_getsockopt callback: returns true only for the
 * SOL_SOCKET / SO_PEERPIDFD option, false for everything else.
 */
static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	return level == SOL_SOCKET && optname == SO_PEERPIDFD;
}
944 
945 struct proto unix_dgram_proto = {
946 	.name			= "UNIX",
947 	.owner			= THIS_MODULE,
948 	.obj_size		= sizeof(struct unix_sock),
949 	.close			= unix_close,
950 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
951 #ifdef CONFIG_BPF_SYSCALL
952 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
953 #endif
954 };
955 
956 struct proto unix_stream_proto = {
957 	.name			= "UNIX-STREAM",
958 	.owner			= THIS_MODULE,
959 	.obj_size		= sizeof(struct unix_sock),
960 	.close			= unix_close,
961 	.unhash			= unix_unhash,
962 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
963 #ifdef CONFIG_BPF_SYSCALL
964 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
965 #endif
966 };
967 
968 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
969 {
970 	struct unix_sock *u;
971 	struct sock *sk;
972 	int err;
973 
974 	atomic_long_inc(&unix_nr_socks);
975 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
976 		err = -ENFILE;
977 		goto err;
978 	}
979 
980 	if (type == SOCK_STREAM)
981 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
982 	else /*dgram and  seqpacket */
983 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
984 
985 	if (!sk) {
986 		err = -ENOMEM;
987 		goto err;
988 	}
989 
990 	sock_init_data(sock, sk);
991 
992 	sk->sk_hash		= unix_unbound_hash(sk);
993 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
994 	sk->sk_write_space	= unix_write_space;
995 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
996 	sk->sk_destruct		= unix_sock_destructor;
997 	u	  = unix_sk(sk);
998 	u->path.dentry = NULL;
999 	u->path.mnt = NULL;
1000 	spin_lock_init(&u->lock);
1001 	atomic_long_set(&u->inflight, 0);
1002 	INIT_LIST_HEAD(&u->link);
1003 	mutex_init(&u->iolock); /* single task reading lock */
1004 	mutex_init(&u->bindlock); /* single task binding lock */
1005 	init_waitqueue_head(&u->peer_wait);
1006 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1007 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1008 	unix_insert_unbound_socket(net, sk);
1009 
1010 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1011 
1012 	return sk;
1013 
1014 err:
1015 	atomic_long_dec(&unix_nr_socks);
1016 	return ERR_PTR(err);
1017 }
1018 
1019 static int unix_create(struct net *net, struct socket *sock, int protocol,
1020 		       int kern)
1021 {
1022 	struct sock *sk;
1023 
1024 	if (protocol && protocol != PF_UNIX)
1025 		return -EPROTONOSUPPORT;
1026 
1027 	sock->state = SS_UNCONNECTED;
1028 
1029 	switch (sock->type) {
1030 	case SOCK_STREAM:
1031 		sock->ops = &unix_stream_ops;
1032 		break;
1033 		/*
1034 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1035 		 *	nothing uses it.
1036 		 */
1037 	case SOCK_RAW:
1038 		sock->type = SOCK_DGRAM;
1039 		fallthrough;
1040 	case SOCK_DGRAM:
1041 		sock->ops = &unix_dgram_ops;
1042 		break;
1043 	case SOCK_SEQPACKET:
1044 		sock->ops = &unix_seqpacket_ops;
1045 		break;
1046 	default:
1047 		return -ESOCKTNOSUPPORT;
1048 	}
1049 
1050 	sk = unix_create1(net, sock, kern, sock->type);
1051 	if (IS_ERR(sk))
1052 		return PTR_ERR(sk);
1053 
1054 	return 0;
1055 }
1056 
1057 static int unix_release(struct socket *sock)
1058 {
1059 	struct sock *sk = sock->sk;
1060 
1061 	if (!sk)
1062 		return 0;
1063 
1064 	sk->sk_prot->close(sk, 0);
1065 	unix_release_sock(sk, 0);
1066 	sock->sk = NULL;
1067 
1068 	return 0;
1069 }
1070 
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * Returns the sock found by unix_find_socket_byinode() (released with
 * sock_put() on the failure path here), or an ERR_PTR():
 *   - the error from kern_path() or path_permission(MAY_WRITE),
 *   - -ECONNREFUSED when the path is not a socket inode or no sock is
 *     bound to it,
 *   - -EPROTOTYPE when the bound sock's type differs from @type.
 * On success the path's access time is touched.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct sock *found;
	struct path path;
	int ret;

	unix_mkname_bsd(sunaddr, addr_len);

	ret = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (ret)
		return ERR_PTR(ret);

	ret = path_permission(&path, MAY_WRITE);
	if (ret)
		goto drop_path;

	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode)) {
		ret = -ECONNREFUSED;
		goto drop_path;
	}

	found = unix_find_socket_byinode(inode);
	if (!found) {
		ret = -ECONNREFUSED;
		goto drop_path;
	}

	if (found->sk_type != type) {
		/* Wrong socket type: give the reference back. */
		sock_put(found);
		ret = -EPROTOTYPE;
		goto drop_path;
	}

	touch_atime(&path);
	path_put(&path);

	return found;

drop_path:
	path_put(&path);
	return ERR_PTR(ret);
}
1114 
/*
 * Look up a socket by name via unix_find_socket_byname(), using the hash
 * from unix_abstract_hash().
 *
 * Returns the sock, or ERR_PTR(-ECONNREFUSED) when nothing matches.  If the
 * sock found has a path dentry, its access time is touched.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct sock *match;

	match = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!match)
		return ERR_PTR(-ECONNREFUSED);

	if (unix_sk(match)->path.dentry)
		touch_atime(&unix_sk(match)->path);

	return match;
}
1133 
1134 static struct sock *unix_find_other(struct net *net,
1135 				    struct sockaddr_un *sunaddr,
1136 				    int addr_len, int type)
1137 {
1138 	struct sock *sk;
1139 
1140 	if (sunaddr->sun_path[0])
1141 		sk = unix_find_bsd(sunaddr, addr_len, type);
1142 	else
1143 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1144 
1145 	return sk;
1146 }
1147 
/*
 * unix_autobind - bind @sk to an automatically chosen name.
 *
 * The name is a leading NUL byte followed by five lowercase hex digits:
 * sun_path[0] stays zero from kzalloc() and the digits are printed at
 * sun_path + 1.  Candidates start just after a random 20-bit value and are
 * walked upwards, wrapping at 0xFFFFF, until an unused one is found or the
 * starting value has been tried as well.
 *
 * Returns 0 on success or when the socket already has an address, the error
 * from mutex_lock_interruptible(), -ENOMEM when the address cannot be
 * allocated, or -ENOSPC when every candidate name is in use.
 */
static int unix_autobind(struct sock *sk)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	/* Already bound: nothing to do, err is still 0 here. */
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	/* Family header plus six path bytes: the NUL and five hex digits. */
	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	ordernum = get_random_u32();
	/* Remember where we started so a full wrap-around can be detected. */
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seems to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	/* Name is free: install it while both hash chains are still locked. */
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
1208 
/*
 * unix_bind_bsd - bind @sk to a filesystem path.
 *
 * Creates a socket node at the path with vfs_mknod() and then, under
 * u->bindlock, records the path and address on the sock and rehashes it.
 * If the sock turns out to be bound already, or taking the bindlock is
 * interrupted, the freshly created node is unlinked again.
 *
 * Returns 0 on success, -ENOMEM when the address cannot be allocated,
 * -EINVAL when the socket already has an address, or the error from path
 * creation / the security hook / vfs_mknod() / mutex_lock_interruptible().
 * -EEXIST is reported to the caller as -EADDRINUSE.
 */
static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	/* Socket node mode: the socket inode's mode filtered by the umask. */
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	/* Already bound: out_unlock turns this into -EINVAL. */
	if (u->addr)
		goto out_unlock;

	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	/* The sock keeps its own references on the mount and dentry. */
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	/* Callers of bind() expect "address in use", not "file exists". */
	return err == -EEXIST ? -EADDRINUSE : err;
}
1276 
1277 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1278 			      int addr_len)
1279 {
1280 	unsigned int new_hash, old_hash = sk->sk_hash;
1281 	struct unix_sock *u = unix_sk(sk);
1282 	struct net *net = sock_net(sk);
1283 	struct unix_address *addr;
1284 	int err;
1285 
1286 	addr = unix_create_addr(sunaddr, addr_len);
1287 	if (!addr)
1288 		return -ENOMEM;
1289 
1290 	err = mutex_lock_interruptible(&u->bindlock);
1291 	if (err)
1292 		goto out;
1293 
1294 	if (u->addr) {
1295 		err = -EINVAL;
1296 		goto out_mutex;
1297 	}
1298 
1299 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1300 	unix_table_double_lock(net, old_hash, new_hash);
1301 
1302 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1303 		goto out_spin;
1304 
1305 	__unix_set_addr_hash(net, sk, addr, new_hash);
1306 	unix_table_double_unlock(net, old_hash, new_hash);
1307 	mutex_unlock(&u->bindlock);
1308 	return 0;
1309 
1310 out_spin:
1311 	unix_table_double_unlock(net, old_hash, new_hash);
1312 	err = -EADDRINUSE;
1313 out_mutex:
1314 	mutex_unlock(&u->bindlock);
1315 out:
1316 	unix_release_addr(addr);
1317 	return err;
1318 }
1319 
1320 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1321 {
1322 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1323 	struct sock *sk = sock->sk;
1324 	int err;
1325 
1326 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1327 	    sunaddr->sun_family == AF_UNIX)
1328 		return unix_autobind(sk);
1329 
1330 	err = unix_validate_addr(sunaddr, addr_len);
1331 	if (err)
1332 		return err;
1333 
1334 	if (sunaddr->sun_path[0])
1335 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1336 	else
1337 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1338 
1339 	return err;
1340 }
1341 
/*
 * Take the state locks of @sk1 and @sk2.
 *
 * When @sk2 is NULL or the same sock as @sk1, only @sk1 is locked.
 * Otherwise the sock at the lower address is locked first and the other one
 * with the nested variant, so two callers always agree on the order.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first, *second;

	if (!sk2 || unlikely(sk1 == sk2)) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 < sk2) {
		first = sk1;
		second = sk2;
	} else {
		first = sk2;
		second = sk1;
	}

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
1356 
/*
 * Counterpart of unix_state_double_lock(): @sk1 is always unlocked, @sk2
 * only when it is non-NULL and a different sock.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (sk2 && !unlikely(sk1 == sk2))
		unix_state_unlock(sk2);
}
1366 
/*
 * unix_dgram_connect - set, replace or clear the peer of @sock.
 *
 * With sa_family == AF_UNSPEC the current peer (if any) is dropped and the
 * state goes to TCP_CLOSE.  Otherwise the address is validated, the socket
 * is autobound first when SOCK_PASSCRED/SOCK_PASSPIDFD is set and it has no
 * address yet, and the target is looked up and installed as the new peer.
 *
 * Returns 0 on success, -EINVAL for a too short address, -EPERM when
 * unix_may_send() refuses, or the error from unix_validate_addr(),
 * unix_autobind(), unix_find_other() or security_unix_may_send().
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	/* The family field must be fully present before it is read. */
	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !unix_sk(sk)->addr) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		/* Both ends are marked established while both are locked. */
		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		/* With a NULL second sock this locks sk only. */
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			sk->sk_state = TCP_CLOSE;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		/* Reconnecting to the same peer is not a disconnect. */
		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		/* Drop the reference the old peer pointer was holding. */
		sock_put(old_peer);
	} else {
		/* The lookup reference on other now belongs to the peer pointer. */
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
1454 
1455 static long unix_wait_for_peer(struct sock *other, long timeo)
1456 	__releases(&unix_sk(other)->lock)
1457 {
1458 	struct unix_sock *u = unix_sk(other);
1459 	int sched;
1460 	DEFINE_WAIT(wait);
1461 
1462 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1463 
1464 	sched = !sock_flag(other, SOCK_DEAD) &&
1465 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1466 		unix_recvq_full_lockless(other);
1467 
1468 	unix_state_unlock(other);
1469 
1470 	if (sched)
1471 		timeo = schedule_timeout(timeo);
1472 
1473 	finish_wait(&u->peer_wait, &wait);
1474 	return timeo;
1475 }
1476 
/*
 * unix_stream_connect - connect @sock to the listening socket at @uaddr.
 *
 * A new sock (newsk) that will become the accepted end is allocated up
 * front, together with a one-byte skb charged to it.  Once the listener has
 * been found and both state locks are held, newsk and sk are made peers of
 * each other and the skb is queued on the listener's receive queue, where
 * unix_accept() picks it up.
 *
 * Returns 0 on success or a negative errno: the error from
 * unix_validate_addr(), unix_autobind(), unix_create1(), unix_find_other()
 * or security_unix_stream_connect(); -ENOMEM when the skb cannot be
 * allocated; -ECONNREFUSED when the target is not listening or has shut
 * down receiving; -EAGAIN when its queue is full and no timeout is allowed;
 * the sock_intr_errno() value when a signal interrupts the wait; -EISCONN
 * or -EINVAL depending on the state of @sock itself.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;
	int st;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		/* Keep the error path from releasing an ERR_PTR. */
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		/* Keep the error path from sock_put()-ing an ERR_PTR. */
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* Releases other's state lock before sleeping. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	/* State changed between the unlocked read and the lock: start over. */
	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	/* newsk's peer pointer holds a reference on sk. */
	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	/* sk's peer pointer holds a reference on newsk. */
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	/* Undo the up-front allocations and drop the lookup reference. */
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1670 
1671 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1672 {
1673 	struct sock *ska = socka->sk, *skb = sockb->sk;
1674 
1675 	/* Join our sockets back to back */
1676 	sock_hold(ska);
1677 	sock_hold(skb);
1678 	unix_peer(ska) = skb;
1679 	unix_peer(skb) = ska;
1680 	init_peercred(ska);
1681 	init_peercred(skb);
1682 
1683 	ska->sk_state = TCP_ESTABLISHED;
1684 	skb->sk_state = TCP_ESTABLISHED;
1685 	socka->state  = SS_CONNECTED;
1686 	sockb->state  = SS_CONNECTED;
1687 	return 0;
1688 }
1689 
1690 static void unix_sock_inherit_flags(const struct socket *old,
1691 				    struct socket *new)
1692 {
1693 	if (test_bit(SOCK_PASSCRED, &old->flags))
1694 		set_bit(SOCK_PASSCRED, &new->flags);
1695 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1696 		set_bit(SOCK_PASSPIDFD, &new->flags);
1697 	if (test_bit(SOCK_PASSSEC, &old->flags))
1698 		set_bit(SOCK_PASSSEC, &new->flags);
1699 }
1700 
1701 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1702 		       bool kern)
1703 {
1704 	struct sock *sk = sock->sk;
1705 	struct sock *tsk;
1706 	struct sk_buff *skb;
1707 	int err;
1708 
1709 	err = -EOPNOTSUPP;
1710 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1711 		goto out;
1712 
1713 	err = -EINVAL;
1714 	if (sk->sk_state != TCP_LISTEN)
1715 		goto out;
1716 
1717 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1718 	 * so that no locks are necessary.
1719 	 */
1720 
1721 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1722 				&err);
1723 	if (!skb) {
1724 		/* This means receive shutdown. */
1725 		if (err == 0)
1726 			err = -EINVAL;
1727 		goto out;
1728 	}
1729 
1730 	tsk = skb->sk;
1731 	skb_free_datagram(sk, skb);
1732 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1733 
1734 	/* attach accepted sock to socket */
1735 	unix_state_lock(tsk);
1736 	newsock->state = SS_CONNECTED;
1737 	unix_sock_inherit_flags(sock, newsock);
1738 	sock_graft(tsk, newsock);
1739 	unix_state_unlock(tsk);
1740 	return 0;
1741 
1742 out:
1743 	return err;
1744 }
1745 
1746 
1747 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1748 {
1749 	struct sock *sk = sock->sk;
1750 	struct unix_address *addr;
1751 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1752 	int err = 0;
1753 
1754 	if (peer) {
1755 		sk = unix_peer_get(sk);
1756 
1757 		err = -ENOTCONN;
1758 		if (!sk)
1759 			goto out;
1760 		err = 0;
1761 	} else {
1762 		sock_hold(sk);
1763 	}
1764 
1765 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1766 	if (!addr) {
1767 		sunaddr->sun_family = AF_UNIX;
1768 		sunaddr->sun_path[0] = 0;
1769 		err = offsetof(struct sockaddr_un, sun_path);
1770 	} else {
1771 		err = addr->len;
1772 		memcpy(sunaddr, addr->name, addr->len);
1773 	}
1774 	sock_put(sk);
1775 out:
1776 	return err;
1777 }
1778 
/*
 * Duplicate the file list attached to @skb into @scm->fp (used when
 * peeking, see the MSG_PEEK discussion below), then synchronise with the
 * garbage collector by taking and dropping unix_gc_lock.
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	/*
	 * Garbage collection of unix sockets starts by selecting a set of
	 * candidate sockets which have reference only from being in flight
	 * (total_refs == inflight_refs).  This condition is checked once during
	 * the candidate collection phase, and candidates are marked as such, so
	 * that non-candidates can later be ignored.  While inflight_refs is
	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
	 * is an instantaneous decision.
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operation that changes the file count through file descriptors
	 * (dup, close, sendmsg) does not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeuing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd.  The following lock/unlock
	 * pair is to ensure serialization with garbage collection.  It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate.  If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
	 */
	spin_lock(&unix_gc_lock);
	spin_unlock(&unix_gc_lock);
}
1825 
1826 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1827 {
1828 	int err = 0;
1829 
1830 	UNIXCB(skb).pid  = get_pid(scm->pid);
1831 	UNIXCB(skb).uid = scm->creds.uid;
1832 	UNIXCB(skb).gid = scm->creds.gid;
1833 	UNIXCB(skb).fp = NULL;
1834 	unix_get_secdata(scm, skb);
1835 	if (scm->fp && send_fds)
1836 		err = unix_attach_fds(scm, skb);
1837 
1838 	skb->destructor = unix_destruct_scm;
1839 	return err;
1840 }
1841 
1842 static bool unix_passcred_enabled(const struct socket *sock,
1843 				  const struct sock *other)
1844 {
1845 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1846 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1847 	       !other->sk_socket ||
1848 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1849 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1850 }
1851 
1852 /*
1853  * Some apps rely on write() giving SCM_CREDENTIALS
1854  * We include credentials if source or destination socket
1855  * asserted SOCK_PASSCRED.
1856  */
1857 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1858 			    const struct sock *other)
1859 {
1860 	if (UNIXCB(skb).pid)
1861 		return;
1862 	if (unix_passcred_enabled(sock, other)) {
1863 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1864 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1865 	}
1866 }
1867 
1868 static bool unix_skb_scm_eq(struct sk_buff *skb,
1869 			    struct scm_cookie *scm)
1870 {
1871 	return UNIXCB(skb).pid == scm->pid &&
1872 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1873 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1874 	       unix_secdata_eq(scm, skb);
1875 }
1876 
1877 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1878 {
1879 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1880 	struct unix_sock *u = unix_sk(sk);
1881 
1882 	if (unlikely(fp && fp->count))
1883 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1884 }
1885 
1886 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1887 {
1888 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1889 	struct unix_sock *u = unix_sk(sk);
1890 
1891 	if (unlikely(fp && fp->count))
1892 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1893 }
1894 
/*
 *	Send AF_UNIX data.
 *
 *	unix_dgram_sendmsg - queue one message of @len bytes on the receive
 *	queue of the destination, which is either the address in
 *	msg->msg_name or, when no address is given, the connected peer.
 *
 *	Returns @len on success (also when the receiver's socket filter
 *	drops the message), or a negative errno:
 *	the error from scm_send(), unix_validate_addr(), unix_autobind(),
 *	the skb allocation, unix_scm_to_skb(), the copy from the iterator,
 *	unix_find_other() or security_unix_may_send(); -EOPNOTSUPP for
 *	MSG_OOB; -ENOTCONN without address and peer; -EMSGSIZE when @len
 *	exceeds sk_sndbuf - 32; -ECONNRESET, -EPERM, -EPIPE, -ECONNREFUSED,
 *	-EAGAIN or the sock_intr_errno() value as set at the points below.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;
	} else {
		/* No address given: send to the connected peer. */
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	/* Anything beyond SKB_MAX_ALLOC goes into page fragments. */
	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		/* Peer went away and there is no address to look up again. */
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			/* Keep the exit path from sock_put()-ing an ERR_PTR. */
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	/* sk_locked records whether sk's state lock is held besides other's. */
	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		/* Drop the reference taken by the lookup / unix_peer_get(). */
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
			 */
			unix_state_unlock(sk);
			err = -EPIPE;
		} else if (unix_peer(sk) == other) {
			/* Our connected peer died: disconnect from it. */
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			sk->sk_state = TCP_CLOSE;
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			/* Drop the reference the peer pointer was holding. */
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		/* err == 0: the dead sock was not our peer, look it up again. */
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			/* Releases other's state lock before sleeping. */
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		/* Non-blocking: retake both locks in the double-lock order. */
		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		/* other's lock was dropped meanwhile: redo the checks once. */
		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}
2110 
/* We use paged skbs for stream sockets, and limit the paged part to 32768
 * bytes, with a minimum of one full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2115 
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * queue_oob - send one byte from @msg to @other as out-of-band data.
 *
 * A one-byte skb is allocated, stamped from @scm (file descriptors are
 * attached only when @fds_sent is false) and filled with the next byte of
 * the message iterator.  Under @other's state lock it becomes the
 * receiver's oob_skb, replacing and releasing any previous one, and is also
 * appended to the receive queue; SIGURG is raised for the receiver.
 *
 * Returns 0 on success, the error from the allocation, unix_scm_to_skb() or
 * the copy, or -EPIPE when @other is dead or has shut down receiving.
 */
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
		     struct scm_cookie *scm, bool fds_sent)
{
	struct unix_sock *ousk = unix_sk(other);
	struct sk_buff *skb;
	int err = 0;

	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);

	if (!skb)
		return err;

	err = unix_scm_to_skb(scm, skb, !fds_sent);
	if (err < 0) {
		kfree_skb(skb);
		return err;
	}
	skb_put(skb, 1);
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);

	if (err) {
		kfree_skb(skb);
		return err;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    (other->sk_shutdown & RCV_SHUTDOWN)) {
		unix_state_unlock(other);
		kfree_skb(skb);
		return -EPIPE;
	}

	maybe_add_creds(skb, sock, other);
	/* Extra reference: the skb is reachable both through ousk->oob_skb
	 * and through the receive queue.
	 */
	skb_get(skb);

	if (ousk->oob_skb)
		consume_skb(ousk->oob_skb);

	WRITE_ONCE(ousk->oob_skb, skb);

	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	sk_send_sigurg(other);
	unix_state_unlock(other);
	other->sk_data_ready(other);

	return err;
}
#endif
2168 
2169 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2170 			       size_t len)
2171 {
2172 	struct sock *sk = sock->sk;
2173 	struct sock *other = NULL;
2174 	int err, size;
2175 	struct sk_buff *skb;
2176 	int sent = 0;
2177 	struct scm_cookie scm;
2178 	bool fds_sent = false;
2179 	int data_len;
2180 
2181 	wait_for_unix_gc();
2182 	err = scm_send(sock, msg, &scm, false);
2183 	if (err < 0)
2184 		return err;
2185 
2186 	err = -EOPNOTSUPP;
2187 	if (msg->msg_flags & MSG_OOB) {
2188 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2189 		if (len)
2190 			len--;
2191 		else
2192 #endif
2193 			goto out_err;
2194 	}
2195 
2196 	if (msg->msg_namelen) {
2197 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2198 		goto out_err;
2199 	} else {
2200 		err = -ENOTCONN;
2201 		other = unix_peer(sk);
2202 		if (!other)
2203 			goto out_err;
2204 	}
2205 
2206 	if (sk->sk_shutdown & SEND_SHUTDOWN)
2207 		goto pipe_err;
2208 
2209 	while (sent < len) {
2210 		size = len - sent;
2211 
2212 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2213 			skb = sock_alloc_send_pskb(sk, 0, 0,
2214 						   msg->msg_flags & MSG_DONTWAIT,
2215 						   &err, 0);
2216 		} else {
2217 			/* Keep two messages in the pipe so it schedules better */
2218 			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2219 
2220 			/* allow fallback to order-0 allocations */
2221 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2222 
2223 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2224 
2225 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2226 
2227 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2228 						   msg->msg_flags & MSG_DONTWAIT, &err,
2229 						   get_order(UNIX_SKB_FRAGS_SZ));
2230 		}
2231 		if (!skb)
2232 			goto out_err;
2233 
2234 		/* Only send the fds in the first buffer */
2235 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2236 		if (err < 0) {
2237 			kfree_skb(skb);
2238 			goto out_err;
2239 		}
2240 		fds_sent = true;
2241 
2242 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2243 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2244 						   sk->sk_allocation);
2245 			if (err < 0) {
2246 				kfree_skb(skb);
2247 				goto out_err;
2248 			}
2249 			size = err;
2250 			refcount_add(size, &sk->sk_wmem_alloc);
2251 		} else {
2252 			skb_put(skb, size - data_len);
2253 			skb->data_len = data_len;
2254 			skb->len = size;
2255 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2256 			if (err) {
2257 				kfree_skb(skb);
2258 				goto out_err;
2259 			}
2260 		}
2261 
2262 		unix_state_lock(other);
2263 
2264 		if (sock_flag(other, SOCK_DEAD) ||
2265 		    (other->sk_shutdown & RCV_SHUTDOWN))
2266 			goto pipe_err_free;
2267 
2268 		maybe_add_creds(skb, sock, other);
2269 		scm_stat_add(other, skb);
2270 		skb_queue_tail(&other->sk_receive_queue, skb);
2271 		unix_state_unlock(other);
2272 		other->sk_data_ready(other);
2273 		sent += size;
2274 	}
2275 
2276 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2277 	if (msg->msg_flags & MSG_OOB) {
2278 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2279 		if (err)
2280 			goto out_err;
2281 		sent++;
2282 	}
2283 #endif
2284 
2285 	scm_destroy(&scm);
2286 
2287 	return sent;
2288 
2289 pipe_err_free:
2290 	unix_state_unlock(other);
2291 	kfree_skb(skb);
2292 pipe_err:
2293 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2294 		send_sig(SIGPIPE, current, 0);
2295 	err = -EPIPE;
2296 out_err:
2297 	scm_destroy(&scm);
2298 	return sent ? : err;
2299 }
2300 
2301 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2302 				  size_t len)
2303 {
2304 	int err;
2305 	struct sock *sk = sock->sk;
2306 
2307 	err = sock_error(sk);
2308 	if (err)
2309 		return err;
2310 
2311 	if (sk->sk_state != TCP_ESTABLISHED)
2312 		return -ENOTCONN;
2313 
2314 	if (msg->msg_namelen)
2315 		msg->msg_namelen = 0;
2316 
2317 	return unix_dgram_sendmsg(sock, msg, len);
2318 }
2319 
2320 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2321 				  size_t size, int flags)
2322 {
2323 	struct sock *sk = sock->sk;
2324 
2325 	if (sk->sk_state != TCP_ESTABLISHED)
2326 		return -ENOTCONN;
2327 
2328 	return unix_dgram_recvmsg(sock, msg, size, flags);
2329 }
2330 
2331 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2332 {
2333 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2334 
2335 	if (addr) {
2336 		msg->msg_namelen = addr->len;
2337 		memcpy(msg->msg_name, addr->name, addr->len);
2338 	}
2339 }
2340 
/* Receive one datagram from @sk into @msg.
 *
 * @size is the room available in @msg; a longer datagram is truncated and
 * MSG_TRUNC is set in msg->msg_flags.  MSG_OOB is rejected with
 * -EOPNOTSUPP.
 *
 * Returns the number of bytes copied (or, when MSG_TRUNC was passed in
 * @flags, the remaining length of the datagram), 0 for EOF on a shut-down
 * SEQPACKET socket that would otherwise report -EAGAIN, or a negative errno.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	/* Try to dequeue under iolock; sleep without it.  The loop is left
	 * with iolock still held only when an skb was obtained.
	 */
	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	/* Wake anyone sleeping on this socket's peer_wait queue. */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	/* Report the address of the socket recorded in skb->sk. */
	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	/* Clamp to what is left of the datagram past the peek offset. */
	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	/* With MSG_TRUNC the caller gets the remaining datagram length. */
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
2447 
2448 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2449 			      int flags)
2450 {
2451 	struct sock *sk = sock->sk;
2452 
2453 #ifdef CONFIG_BPF_SYSCALL
2454 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2455 
2456 	if (prot != &unix_dgram_proto)
2457 		return prot->recvmsg(sk, msg, size, flags, NULL);
2458 #endif
2459 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2460 }
2461 
2462 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2463 {
2464 	struct unix_sock *u = unix_sk(sk);
2465 	struct sk_buff *skb;
2466 	int err;
2467 
2468 	mutex_lock(&u->iolock);
2469 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2470 	mutex_unlock(&u->iolock);
2471 	if (!skb)
2472 		return err;
2473 
2474 	return recv_actor(sk, skb);
2475 }
2476 
/*
 *	Sleep until more data has arrived. But check for races..
 *
 *	@last and @last_len describe the tail of the receive queue as the
 *	caller last saw it (skb pointer and its length).  The wait ends as
 *	soon as the tail differs from that, or on a socket error, receive
 *	shutdown, pending signal or exhausted timeout.
 *
 *	@freezable adds TASK_FREEZABLE to the sleep state.
 *
 *	Returns the remaining timeout.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		/* Register on the wait queue before testing the conditions. */
		prepare_to_wait(sk_sleep(sk), &wait, state);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		/* The state lock is dropped for the duration of the sleep. */
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		/* Note: on this exit SOCKWQ_ASYNC_WAITDATA is left set. */
		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
2517 
2518 static unsigned int unix_skb_len(const struct sk_buff *skb)
2519 {
2520 	return skb->len - UNIXCB(skb).consumed;
2521 }
2522 
/* Parameters of one stream read, shared by the recvmsg and splice paths. */
struct unix_stream_read_state {
	/* Moves data out of an skb; called as (skb, skip, chunk, state).
	 * A negative return is treated as failure, anything else as the
	 * number of bytes transferred.
	 */
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;		/* socket being read */
	struct msghdr *msg;		/* destination for recvmsg; not set by the splice path */
	struct pipe_inode_info *pipe;	/* destination for splice */
	size_t size;			/* maximum number of bytes to read */
	int flags;			/* MSG_* receive flags */
	unsigned int splice_flags;	/* flags handed to skb_splice_bits() */
};
2533 
2534 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2535 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2536 {
2537 	struct socket *sock = state->socket;
2538 	struct sock *sk = sock->sk;
2539 	struct unix_sock *u = unix_sk(sk);
2540 	int chunk = 1;
2541 	struct sk_buff *oob_skb;
2542 
2543 	mutex_lock(&u->iolock);
2544 	unix_state_lock(sk);
2545 
2546 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2547 		unix_state_unlock(sk);
2548 		mutex_unlock(&u->iolock);
2549 		return -EINVAL;
2550 	}
2551 
2552 	oob_skb = u->oob_skb;
2553 
2554 	if (!(state->flags & MSG_PEEK))
2555 		WRITE_ONCE(u->oob_skb, NULL);
2556 
2557 	unix_state_unlock(sk);
2558 
2559 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2560 
2561 	if (!(state->flags & MSG_PEEK)) {
2562 		UNIXCB(oob_skb).consumed += 1;
2563 		kfree_skb(oob_skb);
2564 	}
2565 
2566 	mutex_unlock(&u->iolock);
2567 
2568 	if (chunk < 0)
2569 		return -EFAULT;
2570 
2571 	state->msg->msg_flags |= MSG_OOB;
2572 	return 1;
2573 }
2574 
2575 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2576 				  int flags, int copied)
2577 {
2578 	struct unix_sock *u = unix_sk(sk);
2579 
2580 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2581 		skb_unlink(skb, &sk->sk_receive_queue);
2582 		consume_skb(skb);
2583 		skb = NULL;
2584 	} else {
2585 		if (skb == u->oob_skb) {
2586 			if (copied) {
2587 				skb = NULL;
2588 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2589 				if (!(flags & MSG_PEEK)) {
2590 					WRITE_ONCE(u->oob_skb, NULL);
2591 					consume_skb(skb);
2592 				}
2593 			} else if (!(flags & MSG_PEEK)) {
2594 				skb_unlink(skb, &sk->sk_receive_queue);
2595 				consume_skb(skb);
2596 				skb = skb_peek(&sk->sk_receive_queue);
2597 			}
2598 		}
2599 	}
2600 	return skb;
2601 }
2602 #endif
2603 
2604 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2605 {
2606 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2607 		return -ENOTCONN;
2608 
2609 	return unix_read_skb(sk, recv_actor);
2610 }
2611 
/* Common receive engine for stream sockets, used by both the recvmsg and
 * the splice paths.
 *
 * @state carries the request (socket, destination, size, flags) and the
 * actor that actually moves bytes out of each skb.  @freezable is passed
 * through to unix_stream_data_wait().
 *
 * Returns the number of bytes transferred when that is non-zero, otherwise
 * the error code (which is 0 for EOF); -EFAULT when the actor fails before
 * anything was copied.
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		/* Stays -EOPNOTSUPP unless OOB support is compiled in. */
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		/* Remember the queue head and its length so that
		 * unix_stream_data_wait() can tell when the queue changed.
		 */
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb) {
				unix_state_unlock(sk);
				if (copied)
					break;
				goto redo;
			}
		}
#endif
again:
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			/* iolock is not held while sleeping. */
			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				/* iolock is not held here, so skip the common
				 * exit and release scm directly.
				 */
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			/* Reached only by goto, with the state lock held. */
			unix_state_unlock(sk);
			break;
		}

		/* Step over skbs lying entirely before the peek offset. */
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		/* Hold a reference across the actor call. */
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			/* skb not exhausted: the request has been satisfied. */
			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* Stop after an skb that carried file descriptors. */
			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			/* Peeking: advance to the next skb without dequeuing. */
			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv_unix(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
2826 
2827 static int unix_stream_read_actor(struct sk_buff *skb,
2828 				  int skip, int chunk,
2829 				  struct unix_stream_read_state *state)
2830 {
2831 	int ret;
2832 
2833 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2834 				    state->msg, chunk);
2835 	return ret ?: chunk;
2836 }
2837 
2838 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2839 			  size_t size, int flags)
2840 {
2841 	struct unix_stream_read_state state = {
2842 		.recv_actor = unix_stream_read_actor,
2843 		.socket = sk->sk_socket,
2844 		.msg = msg,
2845 		.size = size,
2846 		.flags = flags
2847 	};
2848 
2849 	return unix_stream_read_generic(&state, true);
2850 }
2851 
2852 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2853 			       size_t size, int flags)
2854 {
2855 	struct unix_stream_read_state state = {
2856 		.recv_actor = unix_stream_read_actor,
2857 		.socket = sock,
2858 		.msg = msg,
2859 		.size = size,
2860 		.flags = flags
2861 	};
2862 
2863 #ifdef CONFIG_BPF_SYSCALL
2864 	struct sock *sk = sock->sk;
2865 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2866 
2867 	if (prot != &unix_stream_proto)
2868 		return prot->recvmsg(sk, msg, size, flags, NULL);
2869 #endif
2870 	return unix_stream_read_generic(&state, true);
2871 }
2872 
2873 static int unix_stream_splice_actor(struct sk_buff *skb,
2874 				    int skip, int chunk,
2875 				    struct unix_stream_read_state *state)
2876 {
2877 	return skb_splice_bits(skb, state->socket->sk,
2878 			       UNIXCB(skb).consumed + skip,
2879 			       state->pipe, chunk, state->splice_flags);
2880 }
2881 
2882 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2883 				       struct pipe_inode_info *pipe,
2884 				       size_t size, unsigned int flags)
2885 {
2886 	struct unix_stream_read_state state = {
2887 		.recv_actor = unix_stream_splice_actor,
2888 		.socket = sock,
2889 		.pipe = pipe,
2890 		.size = size,
2891 		.splice_flags = flags,
2892 	};
2893 
2894 	if (unlikely(*ppos))
2895 		return -ESPIPE;
2896 
2897 	if (sock->file->f_flags & O_NONBLOCK ||
2898 	    flags & SPLICE_F_NONBLOCK)
2899 		state.flags = MSG_DONTWAIT;
2900 
2901 	return unix_stream_read_generic(&state, false);
2902 }
2903 
2904 static int unix_shutdown(struct socket *sock, int mode)
2905 {
2906 	struct sock *sk = sock->sk;
2907 	struct sock *other;
2908 
2909 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2910 		return -EINVAL;
2911 	/* This maps:
2912 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2913 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2914 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2915 	 */
2916 	++mode;
2917 
2918 	unix_state_lock(sk);
2919 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2920 	other = unix_peer(sk);
2921 	if (other)
2922 		sock_hold(other);
2923 	unix_state_unlock(sk);
2924 	sk->sk_state_change(sk);
2925 
2926 	if (other &&
2927 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2928 
2929 		int peer_mode = 0;
2930 		const struct proto *prot = READ_ONCE(other->sk_prot);
2931 
2932 		if (prot->unhash)
2933 			prot->unhash(other);
2934 		if (mode&RCV_SHUTDOWN)
2935 			peer_mode |= SEND_SHUTDOWN;
2936 		if (mode&SEND_SHUTDOWN)
2937 			peer_mode |= RCV_SHUTDOWN;
2938 		unix_state_lock(other);
2939 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2940 		unix_state_unlock(other);
2941 		other->sk_state_change(other);
2942 		if (peer_mode == SHUTDOWN_MASK)
2943 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2944 		else if (peer_mode & RCV_SHUTDOWN)
2945 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2946 	}
2947 	if (other)
2948 		sock_put(other);
2949 
2950 	return 0;
2951 }
2952 
2953 long unix_inq_len(struct sock *sk)
2954 {
2955 	struct sk_buff *skb;
2956 	long amount = 0;
2957 
2958 	if (sk->sk_state == TCP_LISTEN)
2959 		return -EINVAL;
2960 
2961 	spin_lock(&sk->sk_receive_queue.lock);
2962 	if (sk->sk_type == SOCK_STREAM ||
2963 	    sk->sk_type == SOCK_SEQPACKET) {
2964 		skb_queue_walk(&sk->sk_receive_queue, skb)
2965 			amount += unix_skb_len(skb);
2966 	} else {
2967 		skb = skb_peek(&sk->sk_receive_queue);
2968 		if (skb)
2969 			amount = skb->len;
2970 	}
2971 	spin_unlock(&sk->sk_receive_queue.lock);
2972 
2973 	return amount;
2974 }
2975 EXPORT_SYMBOL_GPL(unix_inq_len);
2976 
/* Send-queue length of @sk, as reported by sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
2981 EXPORT_SYMBOL_GPL(unix_outq_len);
2982 
2983 static int unix_open_file(struct sock *sk)
2984 {
2985 	struct path path;
2986 	struct file *f;
2987 	int fd;
2988 
2989 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2990 		return -EPERM;
2991 
2992 	if (!smp_load_acquire(&unix_sk(sk)->addr))
2993 		return -ENOENT;
2994 
2995 	path = unix_sk(sk)->path;
2996 	if (!path.dentry)
2997 		return -ENOENT;
2998 
2999 	path_get(&path);
3000 
3001 	fd = get_unused_fd_flags(O_CLOEXEC);
3002 	if (fd < 0)
3003 		goto out;
3004 
3005 	f = dentry_open(&path, O_PATH, current_cred());
3006 	if (IS_ERR(f)) {
3007 		put_unused_fd(fd);
3008 		fd = PTR_ERR(f);
3009 		goto out;
3010 	}
3011 
3012 	fd_install(fd, f);
3013 out:
3014 	path_put(&path);
3015 
3016 	return fd;
3017 }
3018 
3019 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3020 {
3021 	struct sock *sk = sock->sk;
3022 	long amount = 0;
3023 	int err;
3024 
3025 	switch (cmd) {
3026 	case SIOCOUTQ:
3027 		amount = unix_outq_len(sk);
3028 		err = put_user(amount, (int __user *)arg);
3029 		break;
3030 	case SIOCINQ:
3031 		amount = unix_inq_len(sk);
3032 		if (amount < 0)
3033 			err = amount;
3034 		else
3035 			err = put_user(amount, (int __user *)arg);
3036 		break;
3037 	case SIOCUNIXFILE:
3038 		err = unix_open_file(sk);
3039 		break;
3040 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3041 	case SIOCATMARK:
3042 		{
3043 			struct sk_buff *skb;
3044 			int answ = 0;
3045 
3046 			skb = skb_peek(&sk->sk_receive_queue);
3047 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3048 				answ = 1;
3049 			err = put_user(answ, (int __user *)arg);
3050 		}
3051 		break;
3052 #endif
3053 	default:
3054 		err = -ENOIOCTLCMD;
3055 		break;
3056 	}
3057 	return err;
3058 }
3059 
3060 #ifdef CONFIG_COMPAT
/* Compat ioctl: convert the user pointer with compat_ptr() and forward to
 * unix_ioctl().
 */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3065 #endif
3066 
3067 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3068 {
3069 	struct sock *sk = sock->sk;
3070 	__poll_t mask;
3071 	u8 shutdown;
3072 
3073 	sock_poll_wait(file, sock, wait);
3074 	mask = 0;
3075 	shutdown = READ_ONCE(sk->sk_shutdown);
3076 
3077 	/* exceptional events? */
3078 	if (READ_ONCE(sk->sk_err))
3079 		mask |= EPOLLERR;
3080 	if (shutdown == SHUTDOWN_MASK)
3081 		mask |= EPOLLHUP;
3082 	if (shutdown & RCV_SHUTDOWN)
3083 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3084 
3085 	/* readable? */
3086 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3087 		mask |= EPOLLIN | EPOLLRDNORM;
3088 	if (sk_is_readable(sk))
3089 		mask |= EPOLLIN | EPOLLRDNORM;
3090 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3091 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3092 		mask |= EPOLLPRI;
3093 #endif
3094 
3095 	/* Connection-based need to check for termination and startup */
3096 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3097 	    sk->sk_state == TCP_CLOSE)
3098 		mask |= EPOLLHUP;
3099 
3100 	/*
3101 	 * we set writable also when the other side has shut down the
3102 	 * connection. This prevents stuck sockets.
3103 	 */
3104 	if (unix_writable(sk))
3105 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3106 
3107 	return mask;
3108 }
3109 
3110 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3111 				    poll_table *wait)
3112 {
3113 	struct sock *sk = sock->sk, *other;
3114 	unsigned int writable;
3115 	__poll_t mask;
3116 	u8 shutdown;
3117 
3118 	sock_poll_wait(file, sock, wait);
3119 	mask = 0;
3120 	shutdown = READ_ONCE(sk->sk_shutdown);
3121 
3122 	/* exceptional events? */
3123 	if (READ_ONCE(sk->sk_err) ||
3124 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3125 		mask |= EPOLLERR |
3126 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3127 
3128 	if (shutdown & RCV_SHUTDOWN)
3129 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3130 	if (shutdown == SHUTDOWN_MASK)
3131 		mask |= EPOLLHUP;
3132 
3133 	/* readable? */
3134 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3135 		mask |= EPOLLIN | EPOLLRDNORM;
3136 	if (sk_is_readable(sk))
3137 		mask |= EPOLLIN | EPOLLRDNORM;
3138 
3139 	/* Connection-based need to check for termination and startup */
3140 	if (sk->sk_type == SOCK_SEQPACKET) {
3141 		if (sk->sk_state == TCP_CLOSE)
3142 			mask |= EPOLLHUP;
3143 		/* connection hasn't started yet? */
3144 		if (sk->sk_state == TCP_SYN_SENT)
3145 			return mask;
3146 	}
3147 
3148 	/* No write status requested, avoid expensive OUT tests. */
3149 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3150 		return mask;
3151 
3152 	writable = unix_writable(sk);
3153 	if (writable) {
3154 		unix_state_lock(sk);
3155 
3156 		other = unix_peer(sk);
3157 		if (other && unix_peer(other) != sk &&
3158 		    unix_recvq_full_lockless(other) &&
3159 		    unix_dgram_peer_wake_me(sk, other))
3160 			writable = 0;
3161 
3162 		unix_state_unlock(sk);
3163 	}
3164 
3165 	if (writable)
3166 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3167 	else
3168 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3169 
3170 	return mask;
3171 }
3172 
3173 #ifdef CONFIG_PROC_FS
3174 
/* A seq_file position packs two values into one unsigned long: the hash
 * bucket index in the high bits and the offset of the socket within that
 * bucket's chain in the low BUCKET_SPACE bits.  Offsets start at 1.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Extract the bucket index / in-bucket offset from a position, and build a
 * position from a (bucket, offset) pair.
 */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3180 
3181 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3182 {
3183 	unsigned long offset = get_offset(*pos);
3184 	unsigned long bucket = get_bucket(*pos);
3185 	unsigned long count = 0;
3186 	struct sock *sk;
3187 
3188 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3189 	     sk; sk = sk_next(sk)) {
3190 		if (++count == offset)
3191 			break;
3192 	}
3193 
3194 	return sk;
3195 }
3196 
3197 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3198 {
3199 	unsigned long bucket = get_bucket(*pos);
3200 	struct net *net = seq_file_net(seq);
3201 	struct sock *sk;
3202 
3203 	while (bucket < UNIX_HASH_SIZE) {
3204 		spin_lock(&net->unx.table.locks[bucket]);
3205 
3206 		sk = unix_from_bucket(seq, pos);
3207 		if (sk)
3208 			return sk;
3209 
3210 		spin_unlock(&net->unx.table.locks[bucket]);
3211 
3212 		*pos = set_bucket_offset(++bucket, 1);
3213 	}
3214 
3215 	return NULL;
3216 }
3217 
3218 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3219 				  loff_t *pos)
3220 {
3221 	unsigned long bucket = get_bucket(*pos);
3222 
3223 	sk = sk_next(sk);
3224 	if (sk)
3225 		return sk;
3226 
3227 
3228 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3229 
3230 	*pos = set_bucket_offset(++bucket, 1);
3231 
3232 	return unix_get_first(seq, pos);
3233 }
3234 
3235 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3236 {
3237 	if (!*pos)
3238 		return SEQ_START_TOKEN;
3239 
3240 	return unix_get_first(seq, pos);
3241 }
3242 
3243 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3244 {
3245 	++*pos;
3246 
3247 	if (v == SEQ_START_TOKEN)
3248 		return unix_get_first(seq, pos);
3249 
3250 	return unix_get_next(seq, v, pos);
3251 }
3252 
3253 static void unix_seq_stop(struct seq_file *seq, void *v)
3254 {
3255 	struct sock *sk = v;
3256 
3257 	if (sk)
3258 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3259 }
3260 
3261 static int unix_seq_show(struct seq_file *seq, void *v)
3262 {
3263 
3264 	if (v == SEQ_START_TOKEN)
3265 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3266 			 "Inode Path\n");
3267 	else {
3268 		struct sock *s = v;
3269 		struct unix_sock *u = unix_sk(s);
3270 		unix_state_lock(s);
3271 
3272 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3273 			s,
3274 			refcount_read(&s->sk_refcnt),
3275 			0,
3276 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3277 			s->sk_type,
3278 			s->sk_socket ?
3279 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3280 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3281 			sock_i_ino(s));
3282 
3283 		if (u->addr) {	// under a hash table lock here
3284 			int i, len;
3285 			seq_putc(seq, ' ');
3286 
3287 			i = 0;
3288 			len = u->addr->len -
3289 				offsetof(struct sockaddr_un, sun_path);
3290 			if (u->addr->name->sun_path[0]) {
3291 				len--;
3292 			} else {
3293 				seq_putc(seq, '@');
3294 				i++;
3295 			}
3296 			for ( ; i < len; i++)
3297 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3298 					 '@');
3299 		}
3300 		unix_state_unlock(s);
3301 		seq_putc(seq, '\n');
3302 	}
3303 
3304 	return 0;
3305 }
3306 
/* seq_file iterator over the per-netns unix socket hash table.  While a
 * socket is the current element, the lock of its hash bucket is held;
 * ->next and ->stop release it.
 */
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
3313 
3314 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/* Per-seq_file state of the BPF unix socket iterator.  Sockets of a bucket
 * are collected, with a reference each, into @batch.
 */
struct bpf_unix_iter_state {
	struct seq_net_private p;
	unsigned int cur_sk;	/* index of the next batch entry to release */
	unsigned int end_sk;	/* number of entries stored in batch */
	unsigned int max_sk;	/* capacity of batch, in entries */
	struct sock **batch;	/* held sockets of the current bucket */
	bool st_bucket_done;	/* when set, the next batch starts at the following bucket */
};
3323 
/* Context handed to the BPF program for each unix socket visited. */
struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);	/* iterator metadata */
	__bpf_md_ptr(struct unix_sock *, unix_sk);	/* socket being shown */
	uid_t uid __aligned(8);				/* uid supplied by the caller */
};
3329 
3330 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3331 			      struct unix_sock *unix_sk, uid_t uid)
3332 {
3333 	struct bpf_iter__unix ctx;
3334 
3335 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3336 	ctx.meta = meta;
3337 	ctx.unix_sk = unix_sk;
3338 	ctx.uid = uid;
3339 	return bpf_iter_run_prog(prog, &ctx);
3340 }
3341 
3342 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3343 
3344 {
3345 	struct bpf_unix_iter_state *iter = seq->private;
3346 	unsigned int expected = 1;
3347 	struct sock *sk;
3348 
3349 	sock_hold(start_sk);
3350 	iter->batch[iter->end_sk++] = start_sk;
3351 
3352 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3353 		if (iter->end_sk < iter->max_sk) {
3354 			sock_hold(sk);
3355 			iter->batch[iter->end_sk++] = sk;
3356 		}
3357 
3358 		expected++;
3359 	}
3360 
3361 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3362 
3363 	return expected;
3364 }
3365 
3366 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3367 {
3368 	while (iter->cur_sk < iter->end_sk)
3369 		sock_put(iter->batch[iter->cur_sk++]);
3370 }
3371 
3372 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3373 				       unsigned int new_batch_sz)
3374 {
3375 	struct sock **new_batch;
3376 
3377 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3378 			     GFP_USER | __GFP_NOWARN);
3379 	if (!new_batch)
3380 		return -ENOMEM;
3381 
3382 	bpf_iter_unix_put_batch(iter);
3383 	kvfree(iter->batch);
3384 	iter->batch = new_batch;
3385 	iter->max_sk = new_batch_sz;
3386 
3387 	return 0;
3388 }
3389 
3390 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3391 					loff_t *pos)
3392 {
3393 	struct bpf_unix_iter_state *iter = seq->private;
3394 	unsigned int expected;
3395 	bool resized = false;
3396 	struct sock *sk;
3397 
3398 	if (iter->st_bucket_done)
3399 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3400 
3401 again:
3402 	/* Get a new batch */
3403 	iter->cur_sk = 0;
3404 	iter->end_sk = 0;
3405 
3406 	sk = unix_get_first(seq, pos);
3407 	if (!sk)
3408 		return NULL; /* Done */
3409 
3410 	expected = bpf_iter_unix_hold_batch(seq, sk);
3411 
3412 	if (iter->end_sk == expected) {
3413 		iter->st_bucket_done = true;
3414 		return sk;
3415 	}
3416 
3417 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3418 		resized = true;
3419 		goto again;
3420 	}
3421 
3422 	return sk;
3423 }
3424 
3425 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3426 {
3427 	if (!*pos)
3428 		return SEQ_START_TOKEN;
3429 
3430 	/* bpf iter does not support lseek, so it always
3431 	 * continue from where it was stop()-ped.
3432 	 */
3433 	return bpf_iter_unix_batch(seq, pos);
3434 }
3435 
3436 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3437 {
3438 	struct bpf_unix_iter_state *iter = seq->private;
3439 	struct sock *sk;
3440 
3441 	/* Whenever seq_next() is called, the iter->cur_sk is
3442 	 * done with seq_show(), so advance to the next sk in
3443 	 * the batch.
3444 	 */
3445 	if (iter->cur_sk < iter->end_sk)
3446 		sock_put(iter->batch[iter->cur_sk++]);
3447 
3448 	++*pos;
3449 
3450 	if (iter->cur_sk < iter->end_sk)
3451 		sk = iter->batch[iter->cur_sk];
3452 	else
3453 		sk = bpf_iter_unix_batch(seq, pos);
3454 
3455 	return sk;
3456 }
3457 
3458 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3459 {
3460 	struct bpf_iter_meta meta;
3461 	struct bpf_prog *prog;
3462 	struct sock *sk = v;
3463 	uid_t uid;
3464 	bool slow;
3465 	int ret;
3466 
3467 	if (v == SEQ_START_TOKEN)
3468 		return 0;
3469 
3470 	slow = lock_sock_fast(sk);
3471 
3472 	if (unlikely(sk_unhashed(sk))) {
3473 		ret = SEQ_SKIP;
3474 		goto unlock;
3475 	}
3476 
3477 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3478 	meta.seq = seq;
3479 	prog = bpf_iter_get_info(&meta, false);
3480 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3481 unlock:
3482 	unlock_sock_fast(sk, slow);
3483 	return ret;
3484 }
3485 
3486 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3487 {
3488 	struct bpf_unix_iter_state *iter = seq->private;
3489 	struct bpf_iter_meta meta;
3490 	struct bpf_prog *prog;
3491 
3492 	if (!v) {
3493 		meta.seq = seq;
3494 		prog = bpf_iter_get_info(&meta, true);
3495 		if (prog)
3496 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3497 	}
3498 
3499 	if (iter->cur_sk < iter->end_sk)
3500 		bpf_iter_unix_put_batch(iter);
3501 }
3502 
3503 static const struct seq_operations bpf_iter_unix_seq_ops = {
3504 	.start	= bpf_iter_unix_seq_start,
3505 	.next	= bpf_iter_unix_seq_next,
3506 	.stop	= bpf_iter_unix_seq_stop,
3507 	.show	= bpf_iter_unix_seq_show,
3508 };
3509 #endif
3510 #endif
3511 
3512 static const struct net_proto_family unix_family_ops = {
3513 	.family = PF_UNIX,
3514 	.create = unix_create,
3515 	.owner	= THIS_MODULE,
3516 };
3517 
3518 
3519 static int __net_init unix_net_init(struct net *net)
3520 {
3521 	int i;
3522 
3523 	net->unx.sysctl_max_dgram_qlen = 10;
3524 	if (unix_sysctl_register(net))
3525 		goto out;
3526 
3527 #ifdef CONFIG_PROC_FS
3528 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3529 			     sizeof(struct seq_net_private)))
3530 		goto err_sysctl;
3531 #endif
3532 
3533 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3534 					      sizeof(spinlock_t), GFP_KERNEL);
3535 	if (!net->unx.table.locks)
3536 		goto err_proc;
3537 
3538 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3539 						sizeof(struct hlist_head),
3540 						GFP_KERNEL);
3541 	if (!net->unx.table.buckets)
3542 		goto free_locks;
3543 
3544 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3545 		spin_lock_init(&net->unx.table.locks[i]);
3546 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3547 	}
3548 
3549 	return 0;
3550 
3551 free_locks:
3552 	kvfree(net->unx.table.locks);
3553 err_proc:
3554 #ifdef CONFIG_PROC_FS
3555 	remove_proc_entry("unix", net->proc_net);
3556 err_sysctl:
3557 #endif
3558 	unix_sysctl_unregister(net);
3559 out:
3560 	return -ENOMEM;
3561 }
3562 
3563 static void __net_exit unix_net_exit(struct net *net)
3564 {
3565 	kvfree(net->unx.table.buckets);
3566 	kvfree(net->unx.table.locks);
3567 	unix_sysctl_unregister(net);
3568 	remove_proc_entry("unix", net->proc_net);
3569 }
3570 
3571 static struct pernet_operations unix_net_ops = {
3572 	.init = unix_net_init,
3573 	.exit = unix_net_exit,
3574 };
3575 
3576 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3577 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3578 		     struct unix_sock *unix_sk, uid_t uid)
3579 
/* Number of batch slots allocated when an iterator is first set up. */
#define INIT_BATCH_SZ 16

/*
 * init_seq_private hook of the bpf unix iterator: initialise the seq_net
 * part of @priv_data and allocate the initial socket batch.
 *
 * Returns 0 on success or the error of the step that failed; the seq_net
 * state is torn down again if the batch allocation fails.
 */
static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_unix_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
	if (err)
		bpf_iter_fini_seq_net(priv_data);

	return err;
}
3599 
3600 static void bpf_iter_fini_unix(void *priv_data)
3601 {
3602 	struct bpf_unix_iter_state *iter = priv_data;
3603 
3604 	bpf_iter_fini_seq_net(priv_data);
3605 	kvfree(iter->batch);
3606 }
3607 
3608 static const struct bpf_iter_seq_info unix_seq_info = {
3609 	.seq_ops		= &bpf_iter_unix_seq_ops,
3610 	.init_seq_private	= bpf_iter_init_unix,
3611 	.fini_seq_private	= bpf_iter_fini_unix,
3612 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3613 };
3614 
3615 static const struct bpf_func_proto *
3616 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3617 			     const struct bpf_prog *prog)
3618 {
3619 	switch (func_id) {
3620 	case BPF_FUNC_setsockopt:
3621 		return &bpf_sk_setsockopt_proto;
3622 	case BPF_FUNC_getsockopt:
3623 		return &bpf_sk_getsockopt_proto;
3624 	default:
3625 		return NULL;
3626 	}
3627 }
3628 
3629 static struct bpf_iter_reg unix_reg_info = {
3630 	.target			= "unix",
3631 	.ctx_arg_info_size	= 1,
3632 	.ctx_arg_info		= {
3633 		{ offsetof(struct bpf_iter__unix, unix_sk),
3634 		  PTR_TO_BTF_ID_OR_NULL },
3635 	},
3636 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3637 	.seq_info		= &unix_seq_info,
3638 };
3639 
3640 static void __init bpf_iter_register(void)
3641 {
3642 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3643 	if (bpf_iter_reg_target(&unix_reg_info))
3644 		pr_warn("Warning: could not register bpf iterator unix\n");
3645 }
3646 #endif
3647 
3648 static int __init af_unix_init(void)
3649 {
3650 	int i, rc = -1;
3651 
3652 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3653 
3654 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3655 		spin_lock_init(&bsd_socket_locks[i]);
3656 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3657 	}
3658 
3659 	rc = proto_register(&unix_dgram_proto, 1);
3660 	if (rc != 0) {
3661 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3662 		goto out;
3663 	}
3664 
3665 	rc = proto_register(&unix_stream_proto, 1);
3666 	if (rc != 0) {
3667 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3668 		proto_unregister(&unix_dgram_proto);
3669 		goto out;
3670 	}
3671 
3672 	sock_register(&unix_family_ops);
3673 	register_pernet_subsys(&unix_net_ops);
3674 	unix_bpf_build_proto();
3675 
3676 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3677 	bpf_iter_register();
3678 #endif
3679 
3680 out:
3681 	return rc;
3682 }
3683 
3684 static void __exit af_unix_exit(void)
3685 {
3686 	sock_unregister(PF_UNIX);
3687 	proto_unregister(&unix_dgram_proto);
3688 	proto_unregister(&unix_stream_proto);
3689 	unregister_pernet_subsys(&unix_net_ops);
3690 }
3691 
3692 /* Earlier than device_initcall() so that other drivers invoking
3693    request_module() don't end up in a loop when modprobe tries
3694    to use a UNIX socket. But later than subsys_initcall() because
3695    we depend on stuff initialised there */
3696 fs_initcall(af_unix_init);
3697 module_exit(af_unix_exit);
3698 
3699 MODULE_LICENSE("GPL");
3700 MODULE_ALIAS_NETPROTO(PF_UNIX);
3701