xref: /openbmc/linux/net/unix/af_unix.c (revision 4e042f02)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
 28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
 32  *					has been reached. This won't break
33  *					old apps and it will avoid huge amount
34  *					of socks hashed (this for unix_gc()
35  *					performances reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 
120 #include "scm.h"
121 
/* Global count of AF_UNIX sockets; capped at 2 * get_max_files() in
 * unix_create1().
 */
122 static atomic_long_t unix_nr_socks;
/* Separate hash table for pathname (filesystem) sockets, keyed by the
 * backing inode (see unix_bsd_hash() / unix_find_socket_byinode()).
 * Half the size of the main table, with one spinlock per bucket.
 */
123 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
124 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
125 
126 /* SMP locking strategy:
127  *    hash table is protected with spinlock.
128  *    each socket state is protected by separate spinlock.
129  */
130 
131 static unsigned int unix_unbound_hash(struct sock *sk)
132 {
133 	unsigned long hash = (unsigned long)sk;
134 
135 	hash ^= hash >> 16;
136 	hash ^= hash >> 8;
137 	hash ^= sk->sk_type;
138 
139 	return hash & UNIX_HASH_MOD;
140 }
141 
142 static unsigned int unix_bsd_hash(struct inode *i)
143 {
144 	return i->i_ino & UNIX_HASH_MOD;
145 }
146 
147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
148 				       int addr_len, int type)
149 {
150 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
151 	unsigned int hash;
152 
153 	hash = (__force unsigned int)csum_fold(csum);
154 	hash ^= hash >> 8;
155 	hash ^= type;
156 
157 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
158 }
159 
160 static void unix_table_double_lock(struct net *net,
161 				   unsigned int hash1, unsigned int hash2)
162 {
163 	if (hash1 == hash2) {
164 		spin_lock(&net->unx.table.locks[hash1]);
165 		return;
166 	}
167 
168 	if (hash1 > hash2)
169 		swap(hash1, hash2);
170 
171 	spin_lock(&net->unx.table.locks[hash1]);
172 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
173 }
174 
175 static void unix_table_double_unlock(struct net *net,
176 				     unsigned int hash1, unsigned int hash2)
177 {
178 	if (hash1 == hash2) {
179 		spin_unlock(&net->unx.table.locks[hash1]);
180 		return;
181 	}
182 
183 	spin_unlock(&net->unx.table.locks[hash1]);
184 	spin_unlock(&net->unx.table.locks[hash2]);
185 }
186 
187 #ifdef CONFIG_SECURITY_NETWORK
/* Capture the sender's LSM security ID from the scm cookie into the
 * skb control block so it travels with the queued message.
 */
188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
189 {
190 	UNIXCB(skb).secid = scm->secid;
191 }
192 
/* On receive: expose the security ID stored in the skb via the scm
 * cookie handed back to userspace.
 */
193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
194 {
195 	scm->secid = UNIXCB(skb).secid;
196 }
197 
/* Compare a queued skb's security ID against the current scm cookie
 * (callers are outside this chunk); trivially true without
 * CONFIG_SECURITY_NETWORK.
 */
198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
199 {
200 	return (scm->secid == UNIXCB(skb).secid);
201 }
202 #else
203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
204 { }
205 
206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
207 { }
208 
209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
210 {
211 	return true;
212 }
213 #endif /* CONFIG_SECURITY_NETWORK */
214 
/* True when @osk's peer pointer refers back to @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return sk == unix_peer(osk);
}
219 
/* Sending from @sk to @osk is permitted when @osk is unconnected or
 * connected back to @sk.
 */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return !unix_peer(osk) || unix_our_peer(sk, osk);
}
224 
225 static inline int unix_recvq_full(const struct sock *sk)
226 {
227 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
228 }
229 
230 static inline int unix_recvq_full_lockless(const struct sock *sk)
231 {
232 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
233 		READ_ONCE(sk->sk_max_ack_backlog);
234 }
235 
/* Fetch @s's peer under the state lock, taking a reference on it.
 * Returns NULL if @s has no peer; the caller must sock_put() a
 * non-NULL result.
 */
struct sock *unix_peer_get(struct sock *s)
{
	struct sock *p;

	unix_state_lock(s);
	p = unix_peer(s);
	if (p)
		sock_hold(p);
	unix_state_unlock(s);

	return p;
}
EXPORT_SYMBOL_GPL(unix_peer_get);
248 
249 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
250 					     int addr_len)
251 {
252 	struct unix_address *addr;
253 
254 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
255 	if (!addr)
256 		return NULL;
257 
258 	refcount_set(&addr->refcnt, 1);
259 	addr->len = addr_len;
260 	memcpy(addr->name, sunaddr, addr_len);
261 
262 	return addr;
263 }
264 
265 static inline void unix_release_addr(struct unix_address *addr)
266 {
267 	if (refcount_dec_and_test(&addr->refcnt))
268 		kfree(addr);
269 }
270 
271 /*
272  *	Check unix socket name:
273  *		- should be not zero length.
274  *	        - if started by not zero, should be NULL terminated (FS object)
275  *		- if started by zero, it is abstract name.
276  */
277 
278 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
279 {
280 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
281 	    addr_len > sizeof(*sunaddr))
282 		return -EINVAL;
283 
284 	if (sunaddr->sun_family != AF_UNIX)
285 		return -EINVAL;
286 
287 	return 0;
288 }
289 
/* NUL-terminate a pathname address in place and return the length the
 * kernel will use for it (strlen of the path + header + 1).  The
 * comments below explain why the one-past-sun_path write is safe.
 */
290 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
291 {
292 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
293 	short offset = offsetof(struct sockaddr_storage, __data);
294 
295 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
296 
297 	/* This may look like an off by one error but it is a bit more
298 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
299 	 * sun_path[108] doesn't as such exist.  However in kernel space
300 	 * we are guaranteed that it is a valid memory location in our
301 	 * kernel address buffer because syscall functions always pass
302 	 * a pointer of struct sockaddr_storage which has a bigger buffer
303 	 * than 108.  Also, we must terminate sun_path for strlen() in
304 	 * getname_kernel().
305 	 */
306 	addr->__data[addr_len - offset] = 0;
307 
308 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
309 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
310 	 * know the actual buffer.
311 	 */
312 	return strlen(addr->__data) + offset + 1;
313 }
314 
/* Unlink @sk from its hash bucket; the caller holds the bucket lock. */
315 static void __unix_remove_socket(struct sock *sk)
316 {
317 	sk_del_node_init(sk);
318 }
319 
/* Link @sk into the bucket selected by sk->sk_hash; the caller holds
 * the bucket lock and the socket must not already be hashed.
 */
320 static void __unix_insert_socket(struct net *net, struct sock *sk)
321 {
322 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
323 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
324 }
325 
/* Rebind @sk: unlink it from its current bucket, publish the new
 * address, and re-insert it into the bucket for @hash.  The release
 * barrier orders the fully-initialized @addr before the insertion
 * (lockless readers of u->addr presumably pair with an acquire load —
 * the read sites are outside this chunk; confirm there).
 */
326 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
327 				 struct unix_address *addr, unsigned int hash)
328 {
329 	__unix_remove_socket(sk);
330 	smp_store_release(&unix_sk(sk)->addr, addr);
331 
332 	sk->sk_hash = hash;
333 	__unix_insert_socket(net, sk);
334 }
335 
336 static void unix_remove_socket(struct net *net, struct sock *sk)
337 {
338 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
339 	__unix_remove_socket(sk);
340 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
341 }
342 
343 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
344 {
345 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
346 	__unix_insert_socket(net, sk);
347 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
348 }
349 
350 static void unix_insert_bsd_socket(struct sock *sk)
351 {
352 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
353 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
354 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
355 }
356 
357 static void unix_remove_bsd_socket(struct sock *sk)
358 {
359 	if (!hlist_unhashed(&sk->sk_bind_node)) {
360 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
361 		__sk_del_bind_node(sk);
362 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
363 
364 		sk_node_init(&sk->sk_bind_node);
365 	}
366 }
367 
368 static struct sock *__unix_find_socket_byname(struct net *net,
369 					      struct sockaddr_un *sunname,
370 					      int len, unsigned int hash)
371 {
372 	struct sock *s;
373 
374 	sk_for_each(s, &net->unx.table.buckets[hash]) {
375 		struct unix_sock *u = unix_sk(s);
376 
377 		if (u->addr->len == len &&
378 		    !memcmp(u->addr->name, sunname, len))
379 			return s;
380 	}
381 	return NULL;
382 }
383 
384 static inline struct sock *unix_find_socket_byname(struct net *net,
385 						   struct sockaddr_un *sunname,
386 						   int len, unsigned int hash)
387 {
388 	struct sock *s;
389 
390 	spin_lock(&net->unx.table.locks[hash]);
391 	s = __unix_find_socket_byname(net, sunname, len, hash);
392 	if (s)
393 		sock_hold(s);
394 	spin_unlock(&net->unx.table.locks[hash]);
395 	return s;
396 }
397 
398 static struct sock *unix_find_socket_byinode(struct inode *i)
399 {
400 	unsigned int hash = unix_bsd_hash(i);
401 	struct sock *s;
402 
403 	spin_lock(&bsd_socket_locks[hash]);
404 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
405 		struct dentry *dentry = unix_sk(s)->path.dentry;
406 
407 		if (dentry && d_backing_inode(dentry) == i) {
408 			sock_hold(s);
409 			spin_unlock(&bsd_socket_locks[hash]);
410 			return s;
411 		}
412 	}
413 	spin_unlock(&bsd_socket_locks[hash]);
414 	return NULL;
415 }
416 
417 /* Support code for asymmetrically connected dgram sockets
418  *
419  * If a datagram socket is connected to a socket not itself connected
420  * to the first socket (eg, /dev/log), clients may only enqueue more
421  * messages if the present receive queue of the server socket is not
422  * "too large". This means there's a second writeability condition
423  * poll and sendmsg need to test. The dgram recv code will do a wake
424  * up on the peer_wait wait queue of a socket upon reception of a
425  * datagram which needs to be propagated to sleeping would-be writers
426  * since these might not have sent anything so far. This can't be
427  * accomplished via poll_wait because the lifetime of the server
428  * socket might be less than that of its clients if these break their
429  * association with it or if the server socket is closed while clients
430  * are still connected to it and there's no way to inform "a polling
431  * implementation" that it should let go of a certain wait queue
432  *
433  * In order to propagate a wake up, a wait_queue_entry_t of the client
434  * socket is enqueued on the peer_wait queue of the server socket
435  * whose wake function does a wake_up on the ordinary client socket
436  * wait queue. This connection is established whenever a write (or
437  * poll for write) hit the flow control condition and broken when the
438  * association to the server socket is dissolved or after a wake up
439  * was relayed.
440  */
441 
/* Wake function installed on a peer's peer_wait queue (see the comment
 * block above).  One-shot: it detaches itself from the peer's queue,
 * then relays the wakeup to the waiters on the client socket's own
 * wait queue, if that queue still exists.
 */
442 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
443 				      void *key)
444 {
445 	struct unix_sock *u;
446 	wait_queue_head_t *u_sleep;
447 
448 	u = container_of(q, struct unix_sock, peer_wake);
449 
	/* peer_wake.private holds the peer sock we registered on. */
450 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
451 			    q);
452 	u->peer_wake.private = NULL;
453 
454 	/* relaying can only happen while the wq still exists */
455 	u_sleep = sk_sleep(&u->sk);
456 	if (u_sleep)
457 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
458 
459 	return 0;
460 }
461 
462 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
463 {
464 	struct unix_sock *u, *u_other;
465 	int rc;
466 
467 	u = unix_sk(sk);
468 	u_other = unix_sk(other);
469 	rc = 0;
470 	spin_lock(&u_other->peer_wait.lock);
471 
472 	if (!u->peer_wake.private) {
473 		u->peer_wake.private = other;
474 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
475 
476 		rc = 1;
477 	}
478 
479 	spin_unlock(&u_other->peer_wait.lock);
480 	return rc;
481 }
482 
483 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
484 					    struct sock *other)
485 {
486 	struct unix_sock *u, *u_other;
487 
488 	u = unix_sk(sk);
489 	u_other = unix_sk(other);
490 	spin_lock(&u_other->peer_wait.lock);
491 
492 	if (u->peer_wake.private == other) {
493 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
494 		u->peer_wake.private = NULL;
495 	}
496 
497 	spin_unlock(&u_other->peer_wait.lock);
498 }
499 
500 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
501 						   struct sock *other)
502 {
503 	unix_dgram_peer_wake_disconnect(sk, other);
504 	wake_up_interruptible_poll(sk_sleep(sk),
505 				   EPOLLOUT |
506 				   EPOLLWRNORM |
507 				   EPOLLWRBAND);
508 }
509 
510 /* preconditions:
511  *	- unix_peer(sk) == other
512  *	- association is stable
513  */
/* Decide whether a dgram sender must wait for @other to drain.
 * The relay is registered first and the queue re-checked afterwards so
 * a concurrent dequeue cannot be missed.  Returns 1 if the caller
 * should wait (relay armed), 0 if it may proceed (relay rolled back).
 */
514 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
515 {
516 	int connected;
517 
518 	connected = unix_dgram_peer_wake_connect(sk, other);
519 
520 	/* If other is SOCK_DEAD, we want to make sure we signal
521 	 * POLLOUT, such that a subsequent write() can get a
522 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
523 	 * to other and its full, we will hang waiting for POLLOUT.
524 	 */
525 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
526 		return 1;
527 
528 	if (connected)
529 		unix_dgram_peer_wake_disconnect(sk, other);
530 
531 	return 0;
532 }
533 
534 static int unix_writable(const struct sock *sk)
535 {
536 	return sk->sk_state != TCP_LISTEN &&
537 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
538 }
539 
/* sk->sk_write_space callback: once the socket becomes writable again,
 * wake pollers sleeping for writability and notify async (SIGIO)
 * subscribers.  RCU protects the wait-queue lookup.
 */
540 static void unix_write_space(struct sock *sk)
541 {
542 	struct socket_wq *wq;
543 
544 	rcu_read_lock();
545 	if (unix_writable(sk)) {
546 		wq = rcu_dereference(sk->sk_wq);
547 		if (skwq_has_sleeper(wq))
548 			wake_up_interruptible_sync_poll(&wq->wait,
549 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
550 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
551 	}
552 	rcu_read_unlock();
553 }
554 
555 /* When dgram socket disconnects (or changes its peer), we clear its receive
556  * queue of packets arrived from previous peer. First, it allows to do
557  * flow control based only on wmem_alloc; second, sk connected to peer
558  * may receive messages only from that peer. */
559 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
560 {
561 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
562 		skb_queue_purge(&sk->sk_receive_queue);
		/* Purging freed wmem on the senders' side; let them retry. */
563 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
564 
565 		/* If one link of bidirectional dgram pipe is disconnected,
566 		 * we signal error. Messages are lost. Do not make this,
567 		 * when peer was not connected to us.
568 		 */
569 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
570 			WRITE_ONCE(other->sk_err, ECONNRESET);
571 			sk_error_report(other);
572 		}
573 	}
	/* The old peer is no longer considered connected on its side. */
574 	other->sk_state = TCP_CLOSE;
575 }
576 
/* sk->sk_destruct hook, invoked when the last reference to @sk drops:
 * purge undelivered data, release the bound address, and update global
 * accounting.  Warns and bails out if the socket is not marked dead.
 */
577 static void unix_sock_destructor(struct sock *sk)
578 {
579 	struct unix_sock *u = unix_sk(sk);
580 
581 	skb_queue_purge(&sk->sk_receive_queue);
582 
583 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
584 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
585 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
586 	if (!sock_flag(sk, SOCK_DEAD)) {
587 		pr_info("Attempt to release alive unix socket: %p\n", sk);
588 		return;
589 	}
590 
591 	if (u->addr)
592 		unix_release_addr(u->addr);
593 
594 	atomic_long_dec(&unix_nr_socks);
595 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
596 #ifdef UNIX_REFCNT_DEBUG
597 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
598 		atomic_long_read(&unix_nr_socks));
599 #endif
600 }
601 
/* Core teardown for an AF_UNIX socket, used by close() and, with
 * @embrion != 0, for unaccepted embryo connections still queued on a
 * listener: unhash, orphan, reset the peer, flush queued data, and
 * release the filesystem path if one was bound.
 */
602 static void unix_release_sock(struct sock *sk, int embrion)
603 {
604 	struct unix_sock *u = unix_sk(sk);
605 	struct sock *skpair;
606 	struct sk_buff *skb;
607 	struct path path;
608 	int state;
609 
610 	unix_remove_socket(sock_net(sk), sk);
611 	unix_remove_bsd_socket(sk);
612 
613 	/* Clear state */
614 	unix_state_lock(sk);
615 	sock_orphan(sk);
616 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	/* Move the path out so it can be dropped after unlocking. */
617 	path	     = u->path;
618 	u->path.dentry = NULL;
619 	u->path.mnt = NULL;
620 	state = sk->sk_state;
621 	sk->sk_state = TCP_CLOSE;
622 
623 	skpair = unix_peer(sk);
624 	unix_peer(sk) = NULL;
625 
626 	unix_state_unlock(sk);
627 
628 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
629 	if (u->oob_skb) {
630 		kfree_skb(u->oob_skb);
631 		u->oob_skb = NULL;
632 	}
633 #endif
634 
635 	wake_up_interruptible_all(&u->peer_wait);
636 
637 	if (skpair != NULL) {
638 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
639 			unix_state_lock(skpair);
640 			/* No more writes */
641 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			/* Unread data (or an embryo) means an abortive close. */
642 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
643 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
644 			unix_state_unlock(skpair);
645 			skpair->sk_state_change(skpair);
646 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
647 		}
648 
649 		unix_dgram_peer_wake_disconnect(sk, skpair);
650 		sock_put(skpair); /* It may now die */
651 	}
652 
653 	/* Try to flush out this socket. Throw out buffers at least */
654 
655 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* A listener's queue holds embryo connections, not data. */
656 		if (state == TCP_LISTEN)
657 			unix_release_sock(skb->sk, 1);
658 		/* passed fds are erased in the kfree_skb hook	      */
659 		UNIXCB(skb).consumed = skb->len;
660 		kfree_skb(skb);
661 	}
662 
663 	if (path.dentry)
664 		path_put(&path);
665 
666 	sock_put(sk);
667 
668 	/* ---- Socket is dead now and most probably destroyed ---- */
669 
670 	/*
671 	 * Fixme: BSD difference: In BSD all sockets connected to us get
672 	 *	  ECONNRESET and we die on the spot. In Linux we behave
673 	 *	  like files and pipes do and wait for the last
674 	 *	  dereference.
675 	 *
676 	 * Can't we simply set sock->err?
677 	 *
678 	 *	  What the above comment does talk about? --ANK(980817)
679 	 */
680 
681 	if (READ_ONCE(unix_tot_inflight))
682 		unix_gc();		/* Garbage collect fds */
683 }
684 
/* Record the current task's tgid and credentials as @sk's peer
 * credentials (what SO_PEERCRED reports).  The old references are
 * dropped only after unlocking to keep the critical section short.
 */
685 static void init_peercred(struct sock *sk)
686 {
687 	const struct cred *old_cred;
688 	struct pid *old_pid;
689 
690 	spin_lock(&sk->sk_peer_lock);
691 	old_pid = sk->sk_peer_pid;
692 	old_cred = sk->sk_peer_cred;
693 	sk->sk_peer_pid  = get_pid(task_tgid(current));
694 	sk->sk_peer_cred = get_current_cred();
695 	spin_unlock(&sk->sk_peer_lock);
696 
697 	put_pid(old_pid);
698 	put_cred(old_cred);
699 }
700 
/* Copy the peer credentials from @peersk to @sk.  Both peer locks are
 * taken, ordered by socket address to avoid an ABBA deadlock when two
 * sockets copy from each other concurrently.
 */
701 static void copy_peercred(struct sock *sk, struct sock *peersk)
702 {
703 	const struct cred *old_cred;
704 	struct pid *old_pid;
705 
706 	if (sk < peersk) {
707 		spin_lock(&sk->sk_peer_lock);
708 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
709 	} else {
710 		spin_lock(&peersk->sk_peer_lock);
711 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
712 	}
713 	old_pid = sk->sk_peer_pid;
714 	old_cred = sk->sk_peer_cred;
715 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
716 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
717 
718 	spin_unlock(&sk->sk_peer_lock);
719 	spin_unlock(&peersk->sk_peer_lock);
720 
721 	put_pid(old_pid);
722 	put_cred(old_cred);
723 }
724 
/* listen(2) for AF_UNIX: only bound SOCK_STREAM/SOCK_SEQPACKET sockets
 * may listen.  Sets the backlog and moves the socket to TCP_LISTEN.
 */
725 static int unix_listen(struct socket *sock, int backlog)
726 {
727 	int err;
728 	struct sock *sk = sock->sk;
729 	struct unix_sock *u = unix_sk(sk);
730 
731 	err = -EOPNOTSUPP;
732 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
733 		goto out;	/* Only stream/seqpacket sockets accept */
734 	err = -EINVAL;
735 	if (!u->addr)
736 		goto out;	/* No listens on an unbound socket */
737 	unix_state_lock(sk);
738 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
739 		goto out_unlock;
	/* Raising the backlog may unblock connectors waiting for room. */
740 	if (backlog > sk->sk_max_ack_backlog)
741 		wake_up_interruptible_all(&u->peer_wait);
742 	sk->sk_max_ack_backlog	= backlog;
743 	sk->sk_state		= TCP_LISTEN;
744 	/* set credentials so connect can copy them */
745 	init_peercred(sk);
746 	err = 0;
747 
748 out_unlock:
749 	unix_state_unlock(sk);
750 out:
751 	return err;
752 }
753 
/* Forward declarations for the proto_ops tables defined below. */
754 static int unix_release(struct socket *);
755 static int unix_bind(struct socket *, struct sockaddr *, int);
756 static int unix_stream_connect(struct socket *, struct sockaddr *,
757 			       int addr_len, int flags);
758 static int unix_socketpair(struct socket *, struct socket *);
759 static int unix_accept(struct socket *, struct socket *, int, bool);
760 static int unix_getname(struct socket *, struct sockaddr *, int);
761 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
762 static __poll_t unix_dgram_poll(struct file *, struct socket *,
763 				    poll_table *);
764 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
765 #ifdef CONFIG_COMPAT
766 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
767 #endif
768 static int unix_shutdown(struct socket *, int);
769 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
770 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
771 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
772 				       struct pipe_inode_info *, size_t size,
773 				       unsigned int flags);
774 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
775 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
776 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
777 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
778 static int unix_dgram_connect(struct socket *, struct sockaddr *,
779 			      int, int);
780 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
781 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
782 				  int);
783 
784 static int unix_set_peek_off(struct sock *sk, int val)
785 {
786 	struct unix_sock *u = unix_sk(sk);
787 
788 	if (mutex_lock_interruptible(&u->iolock))
789 		return -EINTR;
790 
791 	WRITE_ONCE(sk->sk_peek_off, val);
792 	mutex_unlock(&u->iolock);
793 
794 	return 0;
795 }
796 
797 #ifdef CONFIG_PROC_FS
798 static int unix_count_nr_fds(struct sock *sk)
799 {
800 	struct sk_buff *skb;
801 	struct unix_sock *u;
802 	int nr_fds = 0;
803 
804 	spin_lock(&sk->sk_receive_queue.lock);
805 	skb = skb_peek(&sk->sk_receive_queue);
806 	while (skb) {
807 		u = unix_sk(skb->sk);
808 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
809 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
810 	}
811 	spin_unlock(&sk->sk_receive_queue.lock);
812 
813 	return nr_fds;
814 }
815 
816 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
817 {
818 	struct sock *sk = sock->sk;
819 	unsigned char s_state;
820 	struct unix_sock *u;
821 	int nr_fds = 0;
822 
823 	if (sk) {
824 		s_state = READ_ONCE(sk->sk_state);
825 		u = unix_sk(sk);
826 
827 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
828 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
829 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
830 		 */
831 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
832 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
833 		else if (s_state == TCP_LISTEN)
834 			nr_fds = unix_count_nr_fds(sk);
835 
836 		seq_printf(m, "scm_fds: %u\n", nr_fds);
837 	}
838 }
839 #else
840 #define unix_show_fdinfo NULL
841 #endif
842 
/* proto_ops for SOCK_STREAM AF_UNIX sockets. */
843 static const struct proto_ops unix_stream_ops = {
844 	.family =	PF_UNIX,
845 	.owner =	THIS_MODULE,
846 	.release =	unix_release,
847 	.bind =		unix_bind,
848 	.connect =	unix_stream_connect,
849 	.socketpair =	unix_socketpair,
850 	.accept =	unix_accept,
851 	.getname =	unix_getname,
852 	.poll =		unix_poll,
853 	.ioctl =	unix_ioctl,
854 #ifdef CONFIG_COMPAT
855 	.compat_ioctl =	unix_compat_ioctl,
856 #endif
857 	.listen =	unix_listen,
858 	.shutdown =	unix_shutdown,
859 	.sendmsg =	unix_stream_sendmsg,
860 	.recvmsg =	unix_stream_recvmsg,
861 	.read_skb =	unix_stream_read_skb,
862 	.mmap =		sock_no_mmap,
863 	.splice_read =	unix_stream_splice_read,
864 	.set_peek_off =	unix_set_peek_off,
865 	.show_fdinfo =	unix_show_fdinfo,
866 };
867 
/* proto_ops for SOCK_DGRAM AF_UNIX sockets: connectionless, so accept
 * and listen are rejected and the dgram-specific poll is used.
 */
868 static const struct proto_ops unix_dgram_ops = {
869 	.family =	PF_UNIX,
870 	.owner =	THIS_MODULE,
871 	.release =	unix_release,
872 	.bind =		unix_bind,
873 	.connect =	unix_dgram_connect,
874 	.socketpair =	unix_socketpair,
875 	.accept =	sock_no_accept,
876 	.getname =	unix_getname,
877 	.poll =		unix_dgram_poll,
878 	.ioctl =	unix_ioctl,
879 #ifdef CONFIG_COMPAT
880 	.compat_ioctl =	unix_compat_ioctl,
881 #endif
882 	.listen =	sock_no_listen,
883 	.shutdown =	unix_shutdown,
884 	.sendmsg =	unix_dgram_sendmsg,
885 	.read_skb =	unix_read_skb,
886 	.recvmsg =	unix_dgram_recvmsg,
887 	.mmap =		sock_no_mmap,
888 	.set_peek_off =	unix_set_peek_off,
889 	.show_fdinfo =	unix_show_fdinfo,
890 };
891 
/* proto_ops for SOCK_SEQPACKET AF_UNIX sockets: connection-oriented
 * like stream (stream connect/accept/listen) but with datagram-style
 * message boundaries and the dgram poll routine.
 */
892 static const struct proto_ops unix_seqpacket_ops = {
893 	.family =	PF_UNIX,
894 	.owner =	THIS_MODULE,
895 	.release =	unix_release,
896 	.bind =		unix_bind,
897 	.connect =	unix_stream_connect,
898 	.socketpair =	unix_socketpair,
899 	.accept =	unix_accept,
900 	.getname =	unix_getname,
901 	.poll =		unix_dgram_poll,
902 	.ioctl =	unix_ioctl,
903 #ifdef CONFIG_COMPAT
904 	.compat_ioctl =	unix_compat_ioctl,
905 #endif
906 	.listen =	unix_listen,
907 	.shutdown =	unix_shutdown,
908 	.sendmsg =	unix_seqpacket_sendmsg,
909 	.recvmsg =	unix_seqpacket_recvmsg,
910 	.mmap =		sock_no_mmap,
911 	.set_peek_off =	unix_set_peek_off,
912 	.show_fdinfo =	unix_show_fdinfo,
913 };
914 
/* struct proto .close hook (see unix_dgram_proto/unix_stream_proto). */
915 static void unix_close(struct sock *sk, long timeout)
916 {
917 	/* Nothing to do here, unix socket does not need a ->close().
918 	 * This is merely for sockmap.
919 	 */
920 }
921 
/* struct proto .unhash hook (see unix_stream_proto below). */
922 static void unix_unhash(struct sock *sk)
923 {
924 	/* Nothing to do here, unix socket does not need a ->unhash().
925 	 * This is merely for sockmap.
926 	 */
927 }
928 
929 static bool unix_bpf_bypass_getsockopt(int level, int optname)
930 {
931 	if (level == SOL_SOCKET) {
932 		switch (optname) {
933 		case SO_PEERPIDFD:
934 			return true;
935 		default:
936 			return false;
937 		}
938 	}
939 
940 	return false;
941 }
942 
/* struct proto shared by SOCK_DGRAM and SOCK_SEQPACKET sockets (see
 * unix_create1()).
 */
943 struct proto unix_dgram_proto = {
944 	.name			= "UNIX",
945 	.owner			= THIS_MODULE,
946 	.obj_size		= sizeof(struct unix_sock),
947 	.close			= unix_close,
948 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
949 #ifdef CONFIG_BPF_SYSCALL
950 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
951 #endif
952 };
953 
/* struct proto used by SOCK_STREAM sockets (see unix_create1()). */
954 struct proto unix_stream_proto = {
955 	.name			= "UNIX-STREAM",
956 	.owner			= THIS_MODULE,
957 	.obj_size		= sizeof(struct unix_sock),
958 	.close			= unix_close,
959 	.unhash			= unix_unhash,
960 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
961 #ifdef CONFIG_BPF_SYSCALL
962 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
963 #endif
964 };
965 
/* Allocate and initialise a new AF_UNIX sock, enforcing the global cap
 * of 2 * get_max_files() sockets.  Returns the sock or an ERR_PTR.
 */
966 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
967 {
968 	struct unix_sock *u;
969 	struct sock *sk;
970 	int err;
971 
972 	atomic_long_inc(&unix_nr_socks);
973 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
974 		err = -ENFILE;
975 		goto err;
976 	}
977 
	/* Stream sockets get their own proto so sockmap can override it. */
978 	if (type == SOCK_STREAM)
979 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
980 	else /*dgram and  seqpacket */
981 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
982 
983 	if (!sk) {
984 		err = -ENOMEM;
985 		goto err;
986 	}
987 
988 	sock_init_data(sock, sk);
989 
990 	sk->sk_hash		= unix_unbound_hash(sk);
991 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
992 	sk->sk_write_space	= unix_write_space;
993 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
994 	sk->sk_destruct		= unix_sock_destructor;
995 	u	  = unix_sk(sk);
996 	u->path.dentry = NULL;
997 	u->path.mnt = NULL;
998 	spin_lock_init(&u->lock);
999 	atomic_long_set(&u->inflight, 0);
1000 	INIT_LIST_HEAD(&u->link);
1001 	mutex_init(&u->iolock); /* single task reading lock */
1002 	mutex_init(&u->bindlock); /* single task binding lock */
1003 	init_waitqueue_head(&u->peer_wait);
1004 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1005 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1006 	unix_insert_unbound_socket(net, sk);
1007 
1008 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1009 
1010 	return sk;
1011 
1012 err:
1013 	atomic_long_dec(&unix_nr_socks);
1014 	return ERR_PTR(err);
1015 }
1016 
1017 static int unix_create(struct net *net, struct socket *sock, int protocol,
1018 		       int kern)
1019 {
1020 	struct sock *sk;
1021 
1022 	if (protocol && protocol != PF_UNIX)
1023 		return -EPROTONOSUPPORT;
1024 
1025 	sock->state = SS_UNCONNECTED;
1026 
1027 	switch (sock->type) {
1028 	case SOCK_STREAM:
1029 		sock->ops = &unix_stream_ops;
1030 		break;
1031 		/*
1032 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1033 		 *	nothing uses it.
1034 		 */
1035 	case SOCK_RAW:
1036 		sock->type = SOCK_DGRAM;
1037 		fallthrough;
1038 	case SOCK_DGRAM:
1039 		sock->ops = &unix_dgram_ops;
1040 		break;
1041 	case SOCK_SEQPACKET:
1042 		sock->ops = &unix_seqpacket_ops;
1043 		break;
1044 	default:
1045 		return -ESOCKTNOSUPPORT;
1046 	}
1047 
1048 	sk = unix_create1(net, sock, kern, sock->type);
1049 	if (IS_ERR(sk))
1050 		return PTR_ERR(sk);
1051 
1052 	return 0;
1053 }
1054 
1055 static int unix_release(struct socket *sock)
1056 {
1057 	struct sock *sk = sock->sk;
1058 
1059 	if (!sk)
1060 		return 0;
1061 
1062 	sk->sk_prot->close(sk, 0);
1063 	unix_release_sock(sk, 0);
1064 	sock->sk = NULL;
1065 
1066 	return 0;
1067 }
1068 
/* Look up the bound socket behind a pathname (filesystem) address.
 * Takes a reference on the returned sock; returns an ERR_PTR() on
 * lookup/permission failure, -ECONNREFUSED if the path is not a socket
 * inode, or -EPROTOTYPE on a socket-type mismatch.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		return ERR_PTR(err);

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto put_path;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto put_path;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto put_path;

	if (sk->sk_type != type) {
		err = -EPROTOTYPE;
		sock_put(sk);
		goto put_path;
	}

	/* Successful lookup counts as an access to the socket file. */
	touch_atime(&path);
	path_put(&path);
	return sk;

put_path:
	path_put(&path);
	return ERR_PTR(err);
}
1112 
/* Look up the bound socket behind an abstract (leading-NUL) address.
 * Takes a reference on the returned sock, or returns
 * ERR_PTR(-ECONNREFUSED) when nothing is bound to the name.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	/* An autobound/abstract sock normally has no dentry; touch the
	 * path only when one exists.
	 */
	if (unix_sk(sk)->path.dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}
1131 
1132 static struct sock *unix_find_other(struct net *net,
1133 				    struct sockaddr_un *sunaddr,
1134 				    int addr_len, int type)
1135 {
1136 	struct sock *sk;
1137 
1138 	if (sunaddr->sun_path[0])
1139 		sk = unix_find_bsd(sunaddr, addr_len, type);
1140 	else
1141 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1142 
1143 	return sk;
1144 }
1145 
/* Bind the socket to an autogenerated abstract address ('\0' followed by
 * five hex digits).  A no-op if the socket is already bound.  Returns 0
 * on success, -ENOSPC if all 2^20 candidate names are in use, or another
 * negative errno.
 */
static int unix_autobind(struct sock *sk)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	/* Already bound (err is 0 from the successful lock above). */
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	/* '\0' + "%05x" name: kzalloc left sun_path[0] zeroed (abstract). */
	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	/* Start from a random 20-bit number to spread the namespace. */
	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seems to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	/* Publish the address and move to the new hash bucket atomically
	 * with respect to the two bucket locks held above.
	 */
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
1206 
/* Bind the socket to a pathname address: create the S_IFSOCK inode with
 * mknod, then publish the address under the hash-table locks.  Returns 0
 * on success, -EADDRINUSE if the path already exists, or another
 * negative errno (the freshly created inode is unlinked on late failure).
 */
static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;	/* raced with another bind */

	/* Publish path + address and rehash under both bucket locks. */
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}
1274 
/* Bind the socket to an abstract (leading-NUL) address.  Returns 0 on
 * success, -EINVAL if already bound, -EADDRINUSE if the name is taken,
 * or another negative errno.
 */
static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	/* Uniqueness check and publication happen atomically under the
	 * two bucket locks.
	 */
	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}
1317 
1318 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1319 {
1320 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1321 	struct sock *sk = sock->sk;
1322 	int err;
1323 
1324 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1325 	    sunaddr->sun_family == AF_UNIX)
1326 		return unix_autobind(sk);
1327 
1328 	err = unix_validate_addr(sunaddr, addr_len);
1329 	if (err)
1330 		return err;
1331 
1332 	if (sunaddr->sun_path[0])
1333 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1334 	else
1335 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1336 
1337 	return err;
1338 }
1339 
/* Take the state locks of two socks in a globally consistent (pointer)
 * order so concurrent double-lockers cannot ABBA-deadlock.  With
 * sk2 == NULL or sk1 == sk2 only sk1 is locked.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}
1354 
/* Release the locks taken by unix_state_double_lock(); unlock order is
 * irrelevant, so no pointer comparison is needed here.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}
1364 
/* connect() for SOCK_DGRAM: set (or, with AF_UNSPEC, clear) the default
 * peer of the socket.  Reconnecting drops the reference on the old peer
 * and notifies it.  Returns 0 on success or a negative errno.
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		/* Credentials passing requires an address to report;
		 * autobind a still-unbound socket first.
		 */
		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !unix_sk(sk)->addr) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			sk->sk_state = TCP_CLOSE;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		/* Tell the old peer we are gone, then drop our ref on it. */
		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
1452 
/* Sleep for up to @timeo on the peer's wait queue until there may be
 * room in its receive queue.  Called with other's state lock held and
 * drops it (__releases) before sleeping; does not sleep at all if the
 * peer is already dead, shut down, or its queue no longer full.
 * Returns the remaining timeout.
 */
static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	/* Re-check the conditions after queueing ourselves so a wakeup
	 * between the checks and schedule_timeout() is not lost.
	 */
	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
1474 
/* Stream-style connect(): allocate the embryo peer sock and a
 * notification skb up front, find the listener, and queue the embryo on
 * the listener's receive queue for unix_accept() to pick up.  See the
 * in-body comments for the locking dance between sk and other.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;
	int st;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	/* Credentials passing requires an address; autobind if unbound. */
	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		/* Listener backlog is full: fail immediately when
		 * non-blocking, otherwise wait and retry the lookup.
		 */
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1668 
1669 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1670 {
1671 	struct sock *ska = socka->sk, *skb = sockb->sk;
1672 
1673 	/* Join our sockets back to back */
1674 	sock_hold(ska);
1675 	sock_hold(skb);
1676 	unix_peer(ska) = skb;
1677 	unix_peer(skb) = ska;
1678 	init_peercred(ska);
1679 	init_peercred(skb);
1680 
1681 	ska->sk_state = TCP_ESTABLISHED;
1682 	skb->sk_state = TCP_ESTABLISHED;
1683 	socka->state  = SS_CONNECTED;
1684 	sockb->state  = SS_CONNECTED;
1685 	return 0;
1686 }
1687 
1688 static void unix_sock_inherit_flags(const struct socket *old,
1689 				    struct socket *new)
1690 {
1691 	if (test_bit(SOCK_PASSCRED, &old->flags))
1692 		set_bit(SOCK_PASSCRED, &new->flags);
1693 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1694 		set_bit(SOCK_PASSPIDFD, &new->flags);
1695 	if (test_bit(SOCK_PASSSEC, &old->flags))
1696 		set_bit(SOCK_PASSSEC, &new->flags);
1697 }
1698 
/* accept(): dequeue one embryo sock (queued by unix_stream_connect())
 * from the listener's receive queue and graft it onto @newsock.
 * Returns 0 on success or a negative errno.
 */
static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	/* The embryo sock rides in the skb's ->sk field; keep it and free
	 * the carrier skb, then wake a connector blocked on the backlog.
	 */
	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}
1743 
1744 
/* getname()/getpeername(): copy the local (peer == 0) or peer
 * (peer != 0) address into @uaddr.  Returns the address length on
 * success (err doubles as the length), or -ENOTCONN when the peer is
 * requested on an unconnected socket.
 */
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		/* Hold sk so the unconditional sock_put() below is safe. */
		sock_hold(sk);
	}

	/* Pairs with smp_store_release() in bind/connect paths so the
	 * address contents are fully visible once the pointer is seen.
	 */
	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		/* Unbound socket: report an empty AF_UNIX address. */
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);
	}
	sock_put(sk);
out:
	return err;
}
1776 
/* Duplicate the skb's in-flight fd list into @scm for MSG_PEEK delivery,
 * then synchronize with the garbage collector (see the comment below for
 * why the empty lock/unlock pair is required).
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	/*
	 * Garbage collection of unix sockets starts by selecting a set of
	 * candidate sockets which have reference only from being in flight
	 * (total_refs == inflight_refs).  This condition is checked once during
	 * the candidate collection phase, and candidates are marked as such, so
	 * that non-candidates can later be ignored.  While inflight_refs is
	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
	 * is an instantaneous decision.
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operations that changes the file count through file descriptors
	 * (dup, close, sendmsg) does not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd.  The following lock/unlock
	 * pair is to ensure serialization with garbage collection.  It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate.  If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
	 */
	spin_lock(&unix_gc_lock);
	spin_unlock(&unix_gc_lock);
}
1823 
1824 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1825 {
1826 	int err = 0;
1827 
1828 	UNIXCB(skb).pid  = get_pid(scm->pid);
1829 	UNIXCB(skb).uid = scm->creds.uid;
1830 	UNIXCB(skb).gid = scm->creds.gid;
1831 	UNIXCB(skb).fp = NULL;
1832 	unix_get_secdata(scm, skb);
1833 	if (scm->fp && send_fds)
1834 		err = unix_attach_fds(scm, skb);
1835 
1836 	skb->destructor = unix_destruct_scm;
1837 	return err;
1838 }
1839 
1840 static bool unix_passcred_enabled(const struct socket *sock,
1841 				  const struct sock *other)
1842 {
1843 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1844 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1845 	       !other->sk_socket ||
1846 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1847 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1848 }
1849 
1850 /*
1851  * Some apps rely on write() giving SCM_CREDENTIALS
1852  * We include credentials if source or destination socket
1853  * asserted SOCK_PASSCRED.
1854  */
1855 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1856 			    const struct sock *other)
1857 {
1858 	if (UNIXCB(skb).pid)
1859 		return;
1860 	if (unix_passcred_enabled(sock, other)) {
1861 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1862 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1863 	}
1864 }
1865 
1866 static bool unix_skb_scm_eq(struct sk_buff *skb,
1867 			    struct scm_cookie *scm)
1868 {
1869 	return UNIXCB(skb).pid == scm->pid &&
1870 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1871 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1872 	       unix_secdata_eq(scm, skb);
1873 }
1874 
1875 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1876 {
1877 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1878 	struct unix_sock *u = unix_sk(sk);
1879 
1880 	if (unlikely(fp && fp->count))
1881 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1882 }
1883 
1884 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1885 {
1886 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1887 	struct unix_sock *u = unix_sk(sk);
1888 
1889 	if (unlikely(fp && fp->count))
1890 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1891 }
1892 
1893 /*
1894  *	Send AF_UNIX data.
1895  */
1896 
/* sendmsg() for SOCK_DGRAM (and SOCK_SEQPACKET data): build one skb
 * carrying the payload plus SCM credentials/fds and queue it on the
 * destination's receive queue, blocking (per the send timeout) when the
 * destination queue is full.  Returns the number of bytes sent or a
 * negative errno.
 */
static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;
	} else {
		/* No destination address: use the connected peer. */
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	/* Credentials passing requires an address; autobind if unbound. */
	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	/* Large messages keep SKB_MAX_ALLOC bytes in the linear head and
	 * put the remainder (page-aligned) into paged fragments.
	 */
	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
			 */
			unix_state_unlock(sk);
			err = -EPIPE;
		} else if (unix_peer(sk) == other) {
			/* Our default peer died: disconnect from it. */
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			sk->sk_state = TCP_CLOSE;
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		/* Non-blocking send to a full, non-reciprocal peer: take
		 * both state locks so the POLLOUT wakeup cannot be missed.
		 */
		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}
2108 
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 * (PAGE_SIZE << get_order(32768) evaluates to max(PAGE_SIZE, 32768).)
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2113 
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/* Queue a single out-of-band byte on @other: the skb is placed on the
 * receive queue like normal data and additionally referenced via
 * ousk->oob_skb (which is why skb_get() is taken before publishing it).
 * A previous, unread OOB byte is replaced.  Returns 0 or a negative
 * errno.
 */
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
		     struct scm_cookie *scm, bool fds_sent)
{
	struct unix_sock *ousk = unix_sk(other);
	struct sk_buff *skb;
	int err = 0;

	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);

	if (!skb)
		return err;

	/* Attach fds only if none were sent with the in-band data yet. */
	err = unix_scm_to_skb(scm, skb, !fds_sent);
	if (err < 0) {
		kfree_skb(skb);
		return err;
	}
	skb_put(skb, 1);
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);

	if (err) {
		kfree_skb(skb);
		return err;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    (other->sk_shutdown & RCV_SHUTDOWN)) {
		unix_state_unlock(other);
		kfree_skb(skb);
		return -EPIPE;
	}

	maybe_add_creds(skb, sock, other);
	/* Extra reference for the oob_skb pointer itself. */
	skb_get(skb);

	/* Drop the reference held via the previous, superseded OOB skb. */
	if (ousk->oob_skb)
		consume_skb(ousk->oob_skb);

	WRITE_ONCE(ousk->oob_skb, skb);

	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	sk_send_sigurg(other);
	unix_state_unlock(other);
	other->sk_data_ready(other);

	return err;
}
#endif
2166 
/* sendmsg() for SOCK_STREAM AF_UNIX sockets.
 *
 * Splits the payload into skbs sized against sk_sndbuf (or splices the
 * caller's pages directly with MSG_SPLICE_PAGES), attaches credentials
 * and fds to the first skb only, and queues each skb on the connected
 * peer's receive queue.  With MSG_OOB the final byte is sent separately
 * via queue_oob().
 *
 * Returns the number of bytes sent, or a negative errno if nothing was
 * sent.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	/* May block while the in-flight fd garbage collector catches up. */
	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		/* Reserve the last byte: it is sent out-of-band below. */
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	if (msg->msg_namelen) {
		/* Stream sockets take no destination address. */
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			/* Head-less skb; payload pages are spliced in below
			 * and wmem is charged manually.
			 */
			skb = sock_alloc_send_pskb(sk, 0, 0,
						   msg->msg_flags & MSG_DONTWAIT,
						   &err, 0);
		} else {
			/* Keep two messages in the pipe so it schedules better */
			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

			/* allow fallback to order-0 allocations */
			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
						   msg->msg_flags & MSG_DONTWAIT, &err,
						   get_order(UNIX_SKB_FRAGS_SZ));
		}
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
						   sk->sk_allocation);
			if (err < 0) {
				kfree_skb(skb);
				goto out_err;
			}
			size = err;
			/* Spliced pages bypass the allocator's accounting. */
			refcount_add(size, &sk->sk_wmem_alloc);
		} else {
			skb_put(skb, size - data_len);
			skb->data_len = data_len;
			skb->len = size;
			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
			if (err) {
				kfree_skb(skb);
				goto out_err;
			}
		}

		unix_state_lock(other);

		/* Re-check the peer under its lock before every queueing. */
		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		scm_stat_add(other, skb);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		err = queue_oob(sock, msg, other, &scm, fds_sent);
		if (err)
			goto out_err;
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	/* Writing to a closed peer raises SIGPIPE unless suppressed. */
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	/* A partial write reports the bytes already queued, not the error. */
	return sent ? : err;
}
2298 
2299 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2300 				  size_t len)
2301 {
2302 	int err;
2303 	struct sock *sk = sock->sk;
2304 
2305 	err = sock_error(sk);
2306 	if (err)
2307 		return err;
2308 
2309 	if (sk->sk_state != TCP_ESTABLISHED)
2310 		return -ENOTCONN;
2311 
2312 	if (msg->msg_namelen)
2313 		msg->msg_namelen = 0;
2314 
2315 	return unix_dgram_sendmsg(sock, msg, len);
2316 }
2317 
2318 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2319 				  size_t size, int flags)
2320 {
2321 	struct sock *sk = sock->sk;
2322 
2323 	if (sk->sk_state != TCP_ESTABLISHED)
2324 		return -ENOTCONN;
2325 
2326 	return unix_dgram_recvmsg(sock, msg, size, flags);
2327 }
2328 
2329 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2330 {
2331 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2332 
2333 	if (addr) {
2334 		msg->msg_namelen = addr->len;
2335 		memcpy(msg->msg_name, addr->name, addr->len);
2336 	}
2337 }
2338 
/* Core datagram receive, shared with the BPF sockmap path.
 *
 * Dequeues (or, with MSG_PEEK, peeks) one datagram, copies up to @size
 * bytes to the caller, and transfers any attached credentials/fds via
 * SCM.  Returns the number of bytes copied (the full skb length when
 * MSG_TRUNC is set), or a negative errno.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	/* Out-of-band data is a stream-only concept. */
	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	/* Retry the non-blocking dequeue, sleeping between attempts until
	 * the timeout expires or a datagram arrives.
	 */
	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	/* Taking a datagram freed queue space: wake writers blocked on us. */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	/* Clamp the copy to the remaining payload; flag truncation. */
	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	/* MSG_TRUNC asks for the real datagram length, not bytes copied. */
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
2445 
2446 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2447 			      int flags)
2448 {
2449 	struct sock *sk = sock->sk;
2450 
2451 #ifdef CONFIG_BPF_SYSCALL
2452 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2453 
2454 	if (prot != &unix_dgram_proto)
2455 		return prot->recvmsg(sk, msg, size, flags, NULL);
2456 #endif
2457 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2458 }
2459 
2460 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2461 {
2462 	struct unix_sock *u = unix_sk(sk);
2463 	struct sk_buff *skb;
2464 	int err;
2465 
2466 	mutex_lock(&u->iolock);
2467 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2468 	mutex_unlock(&u->iolock);
2469 	if (!skb)
2470 		return err;
2471 
2472 	return recv_actor(sk, skb);
2473 }
2474 
2475 /*
2476  *	Sleep until more data has arrived. But check for races..
2477  */
/* Sleep until the tail of the receive queue changes (a new skb arrived
 * or the last skb grew), or the socket errors/shuts down, a signal is
 * pending, or @timeo expires.  Called without the state lock held;
 * takes and releases it internally.  Returns the remaining timeout.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, state);

		/* Progress check: compare the queue tail (and its length,
		 * since stream skbs can be appended to) with what the
		 * caller last saw.
		 */
		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		/* Socket was released while we slept; stop waiting. */
		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
2515 
2516 static unsigned int unix_skb_len(const struct sk_buff *skb)
2517 {
2518 	return skb->len - UNIXCB(skb).consumed;
2519 }
2520 
/* Bundled arguments for unix_stream_read_generic().
 *
 * recv_actor copies one chunk from an skb either into @msg
 * (unix_stream_read_actor) or into @pipe (unix_stream_splice_actor)
 * and returns the number of bytes consumed or a negative error.
 */
struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;		/* destination for recvmsg() reads */
	struct pipe_inode_info *pipe;	/* destination for splice() reads */
	size_t size;			/* total bytes requested */
	int flags;			/* MSG_* flags */
	unsigned int splice_flags;	/* SPLICE_F_* flags */
};
2531 
2532 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/* Read the pending out-of-band byte (recvmsg with MSG_OOB).
 *
 * Fails with -EINVAL when there is no pending OOB byte or the socket
 * is in SOCK_URGINLINE mode (the byte is then part of the normal
 * stream).  Without MSG_PEEK the OOB marker is consumed.  Returns 1
 * (one byte copied) on success or -EFAULT on copy failure.
 */
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int chunk = 1;
	struct sk_buff *oob_skb;

	mutex_lock(&u->iolock);
	unix_state_lock(sk);

	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
		unix_state_unlock(sk);
		mutex_unlock(&u->iolock);
		return -EINVAL;
	}

	oob_skb = u->oob_skb;

	/* Consuming transfers oob_skb's reference to us; peeking takes an
	 * extra one so the consume_skb() below is balanced either way.
	 */
	if (!(state->flags & MSG_PEEK))
		WRITE_ONCE(u->oob_skb, NULL);
	else
		skb_get(oob_skb);
	unix_state_unlock(sk);

	chunk = state->recv_actor(oob_skb, 0, chunk, state);

	if (!(state->flags & MSG_PEEK))
		UNIXCB(oob_skb).consumed += 1;

	consume_skb(oob_skb);

	mutex_unlock(&u->iolock);

	if (chunk < 0)
		return -EFAULT;

	state->msg->msg_flags |= MSG_OOB;
	return 1;
}
2573 
/* Decide how the stream reader should treat @skb when it may be the
 * pending OOB skb.  Called with the socket state lock held.
 *
 * Returns the skb the caller should read from, or NULL when the caller
 * must stop (OOB byte reached with data already copied) or retry after
 * a fully-consumed skb was dropped.
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct unix_sock *u = unix_sk(sk);

	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
		/* Nothing left in this skb: drop it and let the caller
		 * re-peek the queue.
		 */
		skb_unlink(skb, &sk->sk_receive_queue);
		consume_skb(skb);
		skb = NULL;
	} else {
		if (skb == u->oob_skb) {
			if (copied) {
				/* Normal data was already copied: stop the
				 * read just before the OOB byte.
				 */
				skb = NULL;
			} else if (sock_flag(sk, SOCK_URGINLINE)) {
				/* Inline mode: deliver the OOB byte as part
				 * of the stream; consuming clears the mark.
				 */
				if (!(flags & MSG_PEEK)) {
					WRITE_ONCE(u->oob_skb, NULL);
					consume_skb(skb);
				}
			} else if (!(flags & MSG_PEEK)) {
				/* Not inline: skip past the OOB skb; it stays
				 * reachable via u->oob_skb for MSG_OOB reads.
				 */
				skb_unlink(skb, &sk->sk_receive_queue);
				consume_skb(skb);
				skb = skb_peek(&sk->sk_receive_queue);
			}
		}
	}
	return skb;
}
2601 #endif
2602 
2603 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2604 {
2605 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2606 		return -ENOTCONN;
2607 
2608 	return unix_read_skb(sk, recv_actor);
2609 }
2610 
/* Common engine behind stream recvmsg() and splice-read.
 *
 * Walks the receive queue, pushing data through state->recv_actor until
 * state->size bytes are copied, the SO_RCVLOWAT target is met, or an
 * error/shutdown/timeout intervenes.  Handles MSG_PEEK with peek
 * offsets, SCM credential/fd passing, and (when configured) the OOB
 * byte.  Returns bytes copied, or a negative errno if nothing was
 * copied.
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		/* MSG_OOB is only meaningful with CONFIG_AF_UNIX_OOB. */
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		/* manage_oob() may drop/skip the head skb or tell us to
		 * stop just before the OOB byte.
		 */
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb) {
				unix_state_unlock(sk);
				if (copied)
					break;
				goto redo;
			}
		}
#endif
again:
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			/* Drop the iolock while sleeping so writers and
			 * other readers can make progress.
			 */
			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		/* Advance past skbs entirely covered by the peek offset. */
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		/* Hold the skb: the actor may sleep and a concurrent reader
		 * could consume and unlink it meanwhile.
		 */
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* Stop after an skb that carried fds so they are
			 * returned with this call.
			 */
			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv_unix(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
2825 
2826 static int unix_stream_read_actor(struct sk_buff *skb,
2827 				  int skip, int chunk,
2828 				  struct unix_stream_read_state *state)
2829 {
2830 	int ret;
2831 
2832 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2833 				    state->msg, chunk);
2834 	return ret ?: chunk;
2835 }
2836 
2837 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2838 			  size_t size, int flags)
2839 {
2840 	struct unix_stream_read_state state = {
2841 		.recv_actor = unix_stream_read_actor,
2842 		.socket = sk->sk_socket,
2843 		.msg = msg,
2844 		.size = size,
2845 		.flags = flags
2846 	};
2847 
2848 	return unix_stream_read_generic(&state, true);
2849 }
2850 
/* recvmsg() entry for SOCK_STREAM sockets; diverts to a BPF-installed
 * proto (e.g. sockmap) when one has replaced sk_prot.
 */
static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_stream_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return unix_stream_read_generic(&state, true);
}
2871 
2872 static int unix_stream_splice_actor(struct sk_buff *skb,
2873 				    int skip, int chunk,
2874 				    struct unix_stream_read_state *state)
2875 {
2876 	return skb_splice_bits(skb, state->socket->sk,
2877 			       UNIXCB(skb).consumed + skip,
2878 			       state->pipe, chunk, state->splice_flags);
2879 }
2880 
2881 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2882 				       struct pipe_inode_info *pipe,
2883 				       size_t size, unsigned int flags)
2884 {
2885 	struct unix_stream_read_state state = {
2886 		.recv_actor = unix_stream_splice_actor,
2887 		.socket = sock,
2888 		.pipe = pipe,
2889 		.size = size,
2890 		.splice_flags = flags,
2891 	};
2892 
2893 	if (unlikely(*ppos))
2894 		return -ESPIPE;
2895 
2896 	if (sock->file->f_flags & O_NONBLOCK ||
2897 	    flags & SPLICE_F_NONBLOCK)
2898 		state.flags = MSG_DONTWAIT;
2899 
2900 	return unix_stream_read_generic(&state, false);
2901 }
2902 
/* shutdown() for all AF_UNIX socket types.
 *
 * Sets the local shutdown bits and, for connection-oriented sockets,
 * mirrors them onto the peer (RCV here implies SEND there and vice
 * versa) so the peer's waiters are woken with the right events.
 */
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
	other = unix_peer(sk);
	/* Hold the peer so it survives after the state lock is dropped. */
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;
		const struct proto *prot = READ_ONCE(other->sk_prot);

		/* NOTE(review): presumably lets a BPF-replaced proto (e.g.
		 * sockmap) detach the peer - confirm against sockmap code.
		 */
		if (prot->unhash)
			prot->unhash(other);
		if (mode&RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode&SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
2951 
2952 long unix_inq_len(struct sock *sk)
2953 {
2954 	struct sk_buff *skb;
2955 	long amount = 0;
2956 
2957 	if (sk->sk_state == TCP_LISTEN)
2958 		return -EINVAL;
2959 
2960 	spin_lock(&sk->sk_receive_queue.lock);
2961 	if (sk->sk_type == SOCK_STREAM ||
2962 	    sk->sk_type == SOCK_SEQPACKET) {
2963 		skb_queue_walk(&sk->sk_receive_queue, skb)
2964 			amount += unix_skb_len(skb);
2965 	} else {
2966 		skb = skb_peek(&sk->sk_receive_queue);
2967 		if (skb)
2968 			amount = skb->len;
2969 	}
2970 	spin_unlock(&sk->sk_receive_queue.lock);
2971 
2972 	return amount;
2973 }
2974 EXPORT_SYMBOL_GPL(unix_inq_len);
2975 
/* Bytes of skb memory currently charged to the send side (SIOCOUTQ). */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2981 
/* SIOCUNIXFILE: open the socket's bound filesystem inode as an O_PATH
 * fd in the calling process.
 *
 * Requires CAP_NET_ADMIN in the socket's network namespace and a
 * filesystem (non-abstract) bind address.  Returns the new fd or a
 * negative errno.
 */
static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	/* Pairs with the release store when the address is bound. */
	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	/* Abstract addresses have no dentry to open. */
	if (!path.dentry)
		return -ENOENT;

	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	path_put(&path);

	return fd;
}
3017 
/* ioctl() handler for AF_UNIX sockets.
 *
 * SIOCOUTQ/SIOCINQ report queued byte counts, SIOCUNIXFILE opens the
 * bound inode, and SIOCATMARK tells whether the next byte to read is
 * the out-of-band byte.
 */
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	case SIOCATMARK:
		{
			struct sk_buff *skb;
			int answ = 0;

			/* At the mark iff the queue head is the OOB skb. */
			skb = skb_peek(&sk->sk_receive_queue);
			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
				answ = 1;
			err = put_user(answ, (int __user *)arg);
		}
		break;
#endif
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}
3058 
3059 #ifdef CONFIG_COMPAT
/* Compat ioctl: the AF_UNIX ioctls all take an int pointer, so it is
 * enough to convert the compat user pointer and reuse the native
 * handler.
 */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
3064 #endif
3065 
/* poll() callback: readiness derived from error/shutdown state, the
 * receive queue, the pending OOB skb, and write space.
 */
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	/* Snapshot once; shutdown bits are written lockless elsewhere. */
	shutdown = READ_ONCE(sk->sk_shutdown);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err))
		mask |= EPOLLERR;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	/* Pending out-of-band byte shows up as urgent data. */
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}
3108 
/* poll() for datagram (and seqpacket) sockets.
 *
 * Like unix_poll(), but writability on a connected socket also depends
 * on the peer's receive queue: if it is full we register on the peer's
 * wake list (unix_dgram_peer_wake_me) and report not-writable.
 */
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				    poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk);
	if (writable) {
		unix_state_lock(sk);

		/* A one-way connected peer with a full queue blocks us;
		 * unix_dgram_peer_wake_me() also arranges a wakeup when
		 * space frees up.
		 */
		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full_lockless(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
3171 
3172 #ifdef CONFIG_PROC_FS
3173 
3174 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3175 
3176 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3177 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3178 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3179 
3180 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3181 {
3182 	unsigned long offset = get_offset(*pos);
3183 	unsigned long bucket = get_bucket(*pos);
3184 	unsigned long count = 0;
3185 	struct sock *sk;
3186 
3187 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3188 	     sk; sk = sk_next(sk)) {
3189 		if (++count == offset)
3190 			break;
3191 	}
3192 
3193 	return sk;
3194 }
3195 
/* Find the first socket at or after *pos, scanning buckets forward.
 *
 * On success the socket's bucket lock is left held (released later by
 * unix_get_next() or unix_seq_stop()).  Returns NULL with no lock held
 * once the table is exhausted.
 */
static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);
	struct net *net = seq_file_net(seq);
	struct sock *sk;

	while (bucket < UNIX_HASH_SIZE) {
		spin_lock(&net->unx.table.locks[bucket]);

		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

		spin_unlock(&net->unx.table.locks[bucket]);

		/* Empty bucket: advance *pos to offset 1 of the next one. */
		*pos = set_bucket_offset(++bucket, 1);
	}

	return NULL;
}
3216 
/* Advance to the next socket after @sk.
 *
 * Stays within the current (locked) bucket when possible; otherwise
 * drops that bucket's lock and continues from the next bucket via
 * unix_get_first(), which re-establishes the lock invariant.
 */
static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
				  loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);

	sk = sk_next(sk);
	if (sk)
		return sk;


	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);

	*pos = set_bucket_offset(++bucket, 1);

	return unix_get_first(seq, pos);
}
3233 
3234 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3235 {
3236 	if (!*pos)
3237 		return SEQ_START_TOKEN;
3238 
3239 	return unix_get_first(seq, pos);
3240 }
3241 
3242 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3243 {
3244 	++*pos;
3245 
3246 	if (v == SEQ_START_TOKEN)
3247 		return unix_get_first(seq, pos);
3248 
3249 	return unix_get_next(seq, v, pos);
3250 }
3251 
3252 static void unix_seq_stop(struct seq_file *seq, void *v)
3253 {
3254 	struct sock *sk = v;
3255 
3256 	if (sk)
3257 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3258 }
3259 
/* Emit one /proc/net/unix line:
 * "Num RefCount Protocol Flags Type St Inode Path".
 * Abstract addresses are shown with '\0' bytes rendered as '@'.
 */
static int unix_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,	/* Protocol column: always 0 here */
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	// under a hash table lock here
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len -
				offsetof(struct sockaddr_un, sun_path);
			/* Abstract names start with '\0'; print '@' instead
			 * and skip that byte.
			 */
			if (u->addr->name->sun_path[0]) {
				len--;
			} else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}
3305 
/* Iterator callbacks backing /proc/net/unix. */
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
3312 
3313 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/* Private state for the BPF unix-socket iterator: sockets are batched
 * per bucket (with references held) so BPF programs run without the
 * bucket lock.
 */
struct bpf_unix_iter_state {
	struct seq_net_private p;
	unsigned int cur_sk;	/* next batch entry to hand out */
	unsigned int end_sk;	/* number of sockets in the batch */
	unsigned int max_sk;	/* allocated capacity of @batch */
	struct sock **batch;	/* held references, one per entry */
	bool st_bucket_done;	/* current bucket fully batched */
};
3322 
/* Context handed to BPF programs attached to the unix iterator. */
struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	uid_t uid __aligned(8);
};
3328 
3329 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3330 			      struct unix_sock *unix_sk, uid_t uid)
3331 {
3332 	struct bpf_iter__unix ctx;
3333 
3334 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3335 	ctx.meta = meta;
3336 	ctx.unix_sk = unix_sk;
3337 	ctx.uid = uid;
3338 	return bpf_iter_run_prog(prog, &ctx);
3339 }
3340 
3341 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3342 
3343 {
3344 	struct bpf_unix_iter_state *iter = seq->private;
3345 	unsigned int expected = 1;
3346 	struct sock *sk;
3347 
3348 	sock_hold(start_sk);
3349 	iter->batch[iter->end_sk++] = start_sk;
3350 
3351 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3352 		if (iter->end_sk < iter->max_sk) {
3353 			sock_hold(sk);
3354 			iter->batch[iter->end_sk++] = sk;
3355 		}
3356 
3357 		expected++;
3358 	}
3359 
3360 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3361 
3362 	return expected;
3363 }
3364 
3365 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3366 {
3367 	while (iter->cur_sk < iter->end_sk)
3368 		sock_put(iter->batch[iter->cur_sk++]);
3369 }
3370 
3371 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3372 				       unsigned int new_batch_sz)
3373 {
3374 	struct sock **new_batch;
3375 
3376 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3377 			     GFP_USER | __GFP_NOWARN);
3378 	if (!new_batch)
3379 		return -ENOMEM;
3380 
3381 	bpf_iter_unix_put_batch(iter);
3382 	kvfree(iter->batch);
3383 	iter->batch = new_batch;
3384 	iter->max_sk = new_batch_sz;
3385 
3386 	return 0;
3387 }
3388 
/* Pin the next batch of sockets for the BPF iterator.
 *
 * If the previous bucket was fully batched (st_bucket_done), advance
 * *pos to the first slot of the next bucket before searching.
 * bpf_iter_unix_hold_batch() takes the references under the bucket lock
 * acquired by unix_get_first(), drops the lock, and reports how many
 * sockets the bucket actually contained.  If the batch array was too
 * small, grow it to 1.5x the required size and retry once; should the
 * bucket grow again in between, the partial batch is returned anyway.
 *
 * Returns the first socket of the new batch, or NULL when iteration is
 * complete.
 */
static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
					loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	if (iter->st_bucket_done)
		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;

	sk = unix_get_first(seq, pos);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_unix_hold_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
3423 
3424 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3425 {
3426 	if (!*pos)
3427 		return SEQ_START_TOKEN;
3428 
3429 	/* bpf iter does not support lseek, so it always
3430 	 * continue from where it was stop()-ped.
3431 	 */
3432 	return bpf_iter_unix_batch(seq, pos);
3433 }
3434 
3435 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3436 {
3437 	struct bpf_unix_iter_state *iter = seq->private;
3438 	struct sock *sk;
3439 
3440 	/* Whenever seq_next() is called, the iter->cur_sk is
3441 	 * done with seq_show(), so advance to the next sk in
3442 	 * the batch.
3443 	 */
3444 	if (iter->cur_sk < iter->end_sk)
3445 		sock_put(iter->batch[iter->cur_sk++]);
3446 
3447 	++*pos;
3448 
3449 	if (iter->cur_sk < iter->end_sk)
3450 		sk = iter->batch[iter->cur_sk];
3451 	else
3452 		sk = bpf_iter_unix_batch(seq, pos);
3453 
3454 	return sk;
3455 }
3456 
/* seq_file show: run the attached BPF program for one batched socket.
 *
 * The socket was pinned with sock_hold() when the batch was built, but
 * it may have been unhashed since; such sockets are skipped.  The
 * program runs under lock_sock_fast() so the socket state is stable
 * while it executes.
 */
static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	bool slow;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		/* Socket left the hash table after batching; don't show it. */
		ret = SEQ_SKIP;
		goto unlock;
	}

	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = unix_prog_seq_show(prog, &meta, v, uid);
unlock:
	unlock_sock_fast(sk, slow);
	return ret;
}
3484 
/* seq_file stop: notify the program of end-of-dump and drop held refs.
 *
 * A NULL @v means iteration finished (rather than a mid-dump stop), in
 * which case the program gets one final call with a NULL unix_sk.  Any
 * sockets still pinned in the batch are then released.
 */
static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)unix_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_unix_put_batch(iter);
}
3501 
/* seq_file callbacks used by the BPF unix-socket iterator. */
static const struct seq_operations bpf_iter_unix_seq_ops = {
	.start	= bpf_iter_unix_seq_start,
	.next	= bpf_iter_unix_seq_next,
	.stop	= bpf_iter_unix_seq_stop,
	.show	= bpf_iter_unix_seq_show,
};
3508 #endif
3509 #endif
3510 
/* socket(AF_UNIX, ...) dispatch: routes creation to unix_create(). */
static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};
3516 
3517 
/* Per-netns setup: sysctls, /proc/net/unix, and the per-net unix socket
 * hash table with one spinlock per bucket.
 *
 * On failure everything already set up is unwound in reverse order and
 * -ENOMEM is returned for every failure mode, including sysctl
 * registration failure.  Note the err_sysctl label lives inside the
 * CONFIG_PROC_FS block: without procfs the proc_create_net() step does
 * not exist, so err_proc falls straight through to the sysctl teardown.
 */
static int __net_init unix_net_init(struct net *net)
{
	int i;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			     sizeof(struct seq_net_private)))
		goto err_sysctl;
#endif

	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
					      sizeof(spinlock_t), GFP_KERNEL);
	if (!net->unx.table.locks)
		goto err_proc;

	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
						sizeof(struct hlist_head),
						GFP_KERNEL);
	if (!net->unx.table.buckets)
		goto free_locks;

	for (i = 0; i < UNIX_HASH_SIZE; i++) {
		spin_lock_init(&net->unx.table.locks[i]);
		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
	}

	return 0;

free_locks:
	kvfree(net->unx.table.locks);
err_proc:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("unix", net->proc_net);
err_sysctl:
#endif
	unix_sysctl_unregister(net);
out:
	return -ENOMEM;
}
3561 
/* Per-netns teardown: reverse of unix_net_init().  Runs when the netns
 * is going away, so no sockets remain hashed in the freed tables.
 */
static void __net_exit unix_net_exit(struct net *net)
{
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}
3569 
/* Registered via register_pernet_subsys() in af_unix_init(). */
static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
3574 
#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declares the "unix" bpf iterator target with the context arguments a
 * program attached to it receives (matching struct bpf_iter__unix).
 */
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

/* Initial capacity of the iterator's socket batch; grown on demand by
 * bpf_iter_unix_realloc_batch().
 */
#define INIT_BATCH_SZ 16
3580 
3581 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3582 {
3583 	struct bpf_unix_iter_state *iter = priv_data;
3584 	int err;
3585 
3586 	err = bpf_iter_init_seq_net(priv_data, aux);
3587 	if (err)
3588 		return err;
3589 
3590 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3591 	if (err) {
3592 		bpf_iter_fini_seq_net(priv_data);
3593 		return err;
3594 	}
3595 
3596 	return 0;
3597 }
3598 
3599 static void bpf_iter_fini_unix(void *priv_data)
3600 {
3601 	struct bpf_unix_iter_state *iter = priv_data;
3602 
3603 	bpf_iter_fini_seq_net(priv_data);
3604 	kvfree(iter->batch);
3605 }
3606 
/* Ties the bpf iterator's seq_file ops to its private batch state. */
static const struct bpf_iter_seq_info unix_seq_info = {
	.seq_ops		= &bpf_iter_unix_seq_ops,
	.init_seq_private	= bpf_iter_init_unix,
	.fini_seq_private	= bpf_iter_fini_unix,
	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
};
3613 
3614 static const struct bpf_func_proto *
3615 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3616 			     const struct bpf_prog *prog)
3617 {
3618 	switch (func_id) {
3619 	case BPF_FUNC_setsockopt:
3620 		return &bpf_sk_setsockopt_proto;
3621 	case BPF_FUNC_getsockopt:
3622 		return &bpf_sk_getsockopt_proto;
3623 	default:
3624 		return NULL;
3625 	}
3626 }
3627 
/* Registration record for the "unix" bpf iterator target.  unix_sk is
 * PTR_TO_BTF_ID_OR_NULL because the end-of-dump call from
 * bpf_iter_unix_seq_stop() passes a NULL socket.  The btf_id is filled
 * in at boot by bpf_iter_register().
 */
static struct bpf_iter_reg unix_reg_info = {
	.target			= "unix",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__unix, unix_sk),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto         = bpf_iter_unix_get_func_proto,
	.seq_info		= &unix_seq_info,
};
3638 
3639 static void __init bpf_iter_register(void)
3640 {
3641 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3642 	if (bpf_iter_reg_target(&unix_reg_info))
3643 		pr_warn("Warning: could not register bpf iterator unix\n");
3644 }
3645 #endif
3646 
/* Module/boot init: register the dgram and stream protos, the PF_UNIX
 * socket family, the pernet subsystem, and (when built in with BPF and
 * procfs) the bpf iterator.
 *
 * Only the proto_register() calls are treated as fatal; the
 * sock_register() and register_pernet_subsys() return values are not
 * checked here.
 */
static int __init af_unix_init(void)
{
	int i, rc = -1;

	/* unix_skb_parms lives in skb->cb; make sure it still fits. */
	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));

	/* The BSD (pathname) socket table uses half-sized hash arrays. */
	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
		spin_lock_init(&bsd_socket_locks[i]);
		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
	}

	rc = proto_register(&unix_dgram_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	rc = proto_register(&unix_stream_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		proto_unregister(&unix_dgram_proto);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
	unix_bpf_build_proto();

#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif

out:
	return rc;
}
3682 
/* Module unload: unwind af_unix_init() — family, protos, pernet ops. */
static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_dgram_proto);
	proto_unregister(&unix_stream_proto);
	unregister_pernet_subsys(&unix_net_ops);
}
3690 
/* Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket.  But later than subsys_initcall() because
 * we depend on stuff initialised there.
 */
3695 fs_initcall(af_unix_init);
3696 module_exit(af_unix_exit);
3697 
3698 MODULE_LICENSE("GPL");
3699 MODULE_ALIAS_NETPROTO(PF_UNIX);
3700