xref: /openbmc/linux/net/unix/af_unix.c (revision fd37b884)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetosv	:	Repaired (I hope) bugs introduces
29  *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					is been reached. This won't break
33  *					old apps and it will avoid huge amount
34  *					of socks hashed (this for unix_gc()
35  *					performances reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 
120 #include "scm.h"
121 
/* Count of AF_UNIX sockets: incremented in unix_create1() and decremented
 * again on its error path and in unix_sock_destructor().
 */
static atomic_long_t unix_nr_socks;
/* Buckets for path-bound sockets, chained through sk_bind_node
 * (see unix_insert_bsd_socket() / unix_find_socket_byinode()).
 */
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
/* One spinlock per bsd_socket_buckets[] slot, indexed the same way. */
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
125 
126 /* SMP locking strategy:
127  *    hash table is protected with spinlock.
128  *    each socket state is protected by separate spinlock.
129  */
130 
131 static unsigned int unix_unbound_hash(struct sock *sk)
132 {
133 	unsigned long hash = (unsigned long)sk;
134 
135 	hash ^= hash >> 16;
136 	hash ^= hash >> 8;
137 	hash ^= sk->sk_type;
138 
139 	return hash & UNIX_HASH_MOD;
140 }
141 
142 static unsigned int unix_bsd_hash(struct inode *i)
143 {
144 	return i->i_ino & UNIX_HASH_MOD;
145 }
146 
147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
148 				       int addr_len, int type)
149 {
150 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
151 	unsigned int hash;
152 
153 	hash = (__force unsigned int)csum_fold(csum);
154 	hash ^= hash >> 8;
155 	hash ^= type;
156 
157 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
158 }
159 
160 static void unix_table_double_lock(struct net *net,
161 				   unsigned int hash1, unsigned int hash2)
162 {
163 	if (hash1 == hash2) {
164 		spin_lock(&net->unx.table.locks[hash1]);
165 		return;
166 	}
167 
168 	if (hash1 > hash2)
169 		swap(hash1, hash2);
170 
171 	spin_lock(&net->unx.table.locks[hash1]);
172 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
173 }
174 
175 static void unix_table_double_unlock(struct net *net,
176 				     unsigned int hash1, unsigned int hash2)
177 {
178 	if (hash1 == hash2) {
179 		spin_unlock(&net->unx.table.locks[hash1]);
180 		return;
181 	}
182 
183 	spin_unlock(&net->unx.table.locks[hash1]);
184 	spin_unlock(&net->unx.table.locks[hash2]);
185 }
186 
#ifdef CONFIG_SECURITY_NETWORK
/* Store the security ID held in @scm into the control block of @skb. */
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

/* Load the security ID kept in the control block of @skb into @scm. */
static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

/* True when @scm and the control block of @skb carry the same security ID. */
static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
/* Without CONFIG_SECURITY_NETWORK the helpers below do nothing and the
 * comparison always succeeds.
 */
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */
214 
/* Peer pointer of @sk; expands to the struct member itself, so it can
 * also be assigned to (e.g. "unix_peer(sk) = NULL").
 */
#define unix_peer(sk) (unix_sk(sk)->peer)
216 
/* Non-zero when the peer recorded on @osk is @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	struct sock *peer_of_osk = unix_peer(osk);

	return peer_of_osk == sk;
}
221 
/* @sk may send to @osk when @osk has no peer at all, or when its peer
 * is @sk itself.
 */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	if (!unix_peer(osk))
		return 1;

	return unix_our_peer(sk, osk);
}
226 
227 static inline int unix_recvq_full(const struct sock *sk)
228 {
229 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
230 }
231 
232 static inline int unix_recvq_full_lockless(const struct sock *sk)
233 {
234 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
235 		READ_ONCE(sk->sk_max_ack_backlog);
236 }
237 
/* Return the peer of @s with a reference taken on it, or NULL when @s
 * has no peer.  The peer is sampled and pinned under the state lock of
 * @s.
 */
struct sock *unix_peer_get(struct sock *s)
{
	struct sock *other;

	unix_state_lock(s);
	other = unix_peer(s);
	if (other)
		sock_hold(other);
	unix_state_unlock(s);

	return other;
}
EXPORT_SYMBOL_GPL(unix_peer_get);
250 
251 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
252 					     int addr_len)
253 {
254 	struct unix_address *addr;
255 
256 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
257 	if (!addr)
258 		return NULL;
259 
260 	refcount_set(&addr->refcnt, 1);
261 	addr->len = addr_len;
262 	memcpy(addr->name, sunaddr, addr_len);
263 
264 	return addr;
265 }
266 
267 static inline void unix_release_addr(struct unix_address *addr)
268 {
269 	if (refcount_dec_and_test(&addr->refcnt))
270 		kfree(addr);
271 }
272 
/*
 *	Sanity-check a user supplied AF_UNIX address:
 *		- it must extend past the sun_path offset (non-empty name),
 *		- it must not be longer than struct sockaddr_un,
 *		- its family must be AF_UNIX.
 *	Returns 0 when acceptable, -EINVAL otherwise.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path))
		return -EINVAL;

	if (addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}
291 
/* NUL-terminate a filesystem socket name by writing a zero byte at
 * offset @addr_len of the address buffer.
 */
static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	char *bytes = (char *)sunaddr;

	/* Not an off-by-one: a 108 byte path puts the terminator just past
	 * sun_path, where struct sockaddr_un itself has no room.  In kernel
	 * space that byte is still valid memory, because syscall functions
	 * always hand us a pointer into a struct sockaddr_storage, which is
	 * larger than sockaddr_un.
	 */
	bytes[addr_len] = '\0';
}
304 
/* Unlink @sk from its hash chain without taking any lock here;
 * unix_remove_socket() is the variant that wraps this call in the
 * bucket spinlock.
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
309 
310 static void __unix_insert_socket(struct net *net, struct sock *sk)
311 {
312 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
313 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
314 }
315 
316 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
317 				 struct unix_address *addr, unsigned int hash)
318 {
319 	__unix_remove_socket(sk);
320 	smp_store_release(&unix_sk(sk)->addr, addr);
321 
322 	sk->sk_hash = hash;
323 	__unix_insert_socket(net, sk);
324 }
325 
326 static void unix_remove_socket(struct net *net, struct sock *sk)
327 {
328 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
329 	__unix_remove_socket(sk);
330 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
331 }
332 
333 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
334 {
335 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
336 	__unix_insert_socket(net, sk);
337 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
338 }
339 
340 static void unix_insert_bsd_socket(struct sock *sk)
341 {
342 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
343 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
344 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
345 }
346 
347 static void unix_remove_bsd_socket(struct sock *sk)
348 {
349 	if (!hlist_unhashed(&sk->sk_bind_node)) {
350 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
351 		__sk_del_bind_node(sk);
352 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
353 
354 		sk_node_init(&sk->sk_bind_node);
355 	}
356 }
357 
358 static struct sock *__unix_find_socket_byname(struct net *net,
359 					      struct sockaddr_un *sunname,
360 					      int len, unsigned int hash)
361 {
362 	struct sock *s;
363 
364 	sk_for_each(s, &net->unx.table.buckets[hash]) {
365 		struct unix_sock *u = unix_sk(s);
366 
367 		if (u->addr->len == len &&
368 		    !memcmp(u->addr->name, sunname, len))
369 			return s;
370 	}
371 	return NULL;
372 }
373 
374 static inline struct sock *unix_find_socket_byname(struct net *net,
375 						   struct sockaddr_un *sunname,
376 						   int len, unsigned int hash)
377 {
378 	struct sock *s;
379 
380 	spin_lock(&net->unx.table.locks[hash]);
381 	s = __unix_find_socket_byname(net, sunname, len, hash);
382 	if (s)
383 		sock_hold(s);
384 	spin_unlock(&net->unx.table.locks[hash]);
385 	return s;
386 }
387 
388 static struct sock *unix_find_socket_byinode(struct inode *i)
389 {
390 	unsigned int hash = unix_bsd_hash(i);
391 	struct sock *s;
392 
393 	spin_lock(&bsd_socket_locks[hash]);
394 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
395 		struct dentry *dentry = unix_sk(s)->path.dentry;
396 
397 		if (dentry && d_backing_inode(dentry) == i) {
398 			sock_hold(s);
399 			spin_unlock(&bsd_socket_locks[hash]);
400 			return s;
401 		}
402 	}
403 	spin_unlock(&bsd_socket_locks[hash]);
404 	return NULL;
405 }
406 
407 /* Support code for asymmetrically connected dgram sockets
408  *
409  * If a datagram socket is connected to a socket not itself connected
410  * to the first socket (eg, /dev/log), clients may only enqueue more
411  * messages if the present receive queue of the server socket is not
412  * "too large". This means there's a second writeability condition
413  * poll and sendmsg need to test. The dgram recv code will do a wake
414  * up on the peer_wait wait queue of a socket upon reception of a
415  * datagram which needs to be propagated to sleeping would-be writers
416  * since these might not have sent anything so far. This can't be
417  * accomplished via poll_wait because the lifetime of the server
418  * socket might be less than that of its clients if these break their
419  * association with it or if the server socket is closed while clients
420  * are still connected to it and there's no way to inform "a polling
421  * implementation" that it should let go of a certain wait queue
422  *
423  * In order to propagate a wake up, a wait_queue_entry_t of the client
424  * socket is enqueued on the peer_wait queue of the server socket
425  * whose wake function does a wake_up on the ordinary client socket
426  * wait queue. This connection is established whenever a write (or
427  * poll for write) hit the flow control condition and broken when the
428  * association to the server socket is dissolved or after a wake up
429  * was relayed.
430  */
431 
432 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
433 				      void *key)
434 {
435 	struct unix_sock *u;
436 	wait_queue_head_t *u_sleep;
437 
438 	u = container_of(q, struct unix_sock, peer_wake);
439 
440 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
441 			    q);
442 	u->peer_wake.private = NULL;
443 
444 	/* relaying can only happen while the wq still exists */
445 	u_sleep = sk_sleep(&u->sk);
446 	if (u_sleep)
447 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
448 
449 	return 0;
450 }
451 
452 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
453 {
454 	struct unix_sock *u, *u_other;
455 	int rc;
456 
457 	u = unix_sk(sk);
458 	u_other = unix_sk(other);
459 	rc = 0;
460 	spin_lock(&u_other->peer_wait.lock);
461 
462 	if (!u->peer_wake.private) {
463 		u->peer_wake.private = other;
464 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
465 
466 		rc = 1;
467 	}
468 
469 	spin_unlock(&u_other->peer_wait.lock);
470 	return rc;
471 }
472 
473 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
474 					    struct sock *other)
475 {
476 	struct unix_sock *u, *u_other;
477 
478 	u = unix_sk(sk);
479 	u_other = unix_sk(other);
480 	spin_lock(&u_other->peer_wait.lock);
481 
482 	if (u->peer_wake.private == other) {
483 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
484 		u->peer_wake.private = NULL;
485 	}
486 
487 	spin_unlock(&u_other->peer_wait.lock);
488 }
489 
490 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
491 						   struct sock *other)
492 {
493 	unix_dgram_peer_wake_disconnect(sk, other);
494 	wake_up_interruptible_poll(sk_sleep(sk),
495 				   EPOLLOUT |
496 				   EPOLLWRNORM |
497 				   EPOLLWRBAND);
498 }
499 
500 /* preconditions:
501  *	- unix_peer(sk) == other
502  *	- association is stable
503  */
504 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
505 {
506 	int connected;
507 
508 	connected = unix_dgram_peer_wake_connect(sk, other);
509 
510 	/* If other is SOCK_DEAD, we want to make sure we signal
511 	 * POLLOUT, such that a subsequent write() can get a
512 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
513 	 * to other and its full, we will hang waiting for POLLOUT.
514 	 */
515 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
516 		return 1;
517 
518 	if (connected)
519 		unix_dgram_peer_wake_disconnect(sk, other);
520 
521 	return 0;
522 }
523 
524 static int unix_writable(const struct sock *sk)
525 {
526 	return sk->sk_state != TCP_LISTEN &&
527 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
528 }
529 
530 static void unix_write_space(struct sock *sk)
531 {
532 	struct socket_wq *wq;
533 
534 	rcu_read_lock();
535 	if (unix_writable(sk)) {
536 		wq = rcu_dereference(sk->sk_wq);
537 		if (skwq_has_sleeper(wq))
538 			wake_up_interruptible_sync_poll(&wq->wait,
539 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
540 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
541 	}
542 	rcu_read_unlock();
543 }
544 
545 /* When dgram socket disconnects (or changes its peer), we clear its receive
546  * queue of packets arrived from previous peer. First, it allows to do
547  * flow control based only on wmem_alloc; second, sk connected to peer
548  * may receive messages only from that peer. */
549 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
550 {
551 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
552 		skb_queue_purge(&sk->sk_receive_queue);
553 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
554 
555 		/* If one link of bidirectional dgram pipe is disconnected,
556 		 * we signal error. Messages are lost. Do not make this,
557 		 * when peer was not connected to us.
558 		 */
559 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
560 			WRITE_ONCE(other->sk_err, ECONNRESET);
561 			sk_error_report(other);
562 		}
563 	}
564 	other->sk_state = TCP_CLOSE;
565 }
566 
567 static void unix_sock_destructor(struct sock *sk)
568 {
569 	struct unix_sock *u = unix_sk(sk);
570 
571 	skb_queue_purge(&sk->sk_receive_queue);
572 
573 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
574 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
575 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
576 	if (!sock_flag(sk, SOCK_DEAD)) {
577 		pr_info("Attempt to release alive unix socket: %p\n", sk);
578 		return;
579 	}
580 
581 	if (u->addr)
582 		unix_release_addr(u->addr);
583 
584 	atomic_long_dec(&unix_nr_socks);
585 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
586 #ifdef UNIX_REFCNT_DEBUG
587 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
588 		atomic_long_read(&unix_nr_socks));
589 #endif
590 }
591 
592 static void unix_release_sock(struct sock *sk, int embrion)
593 {
594 	struct unix_sock *u = unix_sk(sk);
595 	struct sock *skpair;
596 	struct sk_buff *skb;
597 	struct path path;
598 	int state;
599 
600 	unix_remove_socket(sock_net(sk), sk);
601 	unix_remove_bsd_socket(sk);
602 
603 	/* Clear state */
604 	unix_state_lock(sk);
605 	sock_orphan(sk);
606 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
607 	path	     = u->path;
608 	u->path.dentry = NULL;
609 	u->path.mnt = NULL;
610 	state = sk->sk_state;
611 	sk->sk_state = TCP_CLOSE;
612 
613 	skpair = unix_peer(sk);
614 	unix_peer(sk) = NULL;
615 
616 	unix_state_unlock(sk);
617 
618 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
619 	if (u->oob_skb) {
620 		kfree_skb(u->oob_skb);
621 		u->oob_skb = NULL;
622 	}
623 #endif
624 
625 	wake_up_interruptible_all(&u->peer_wait);
626 
627 	if (skpair != NULL) {
628 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
629 			unix_state_lock(skpair);
630 			/* No more writes */
631 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
632 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
633 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
634 			unix_state_unlock(skpair);
635 			skpair->sk_state_change(skpair);
636 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
637 		}
638 
639 		unix_dgram_peer_wake_disconnect(sk, skpair);
640 		sock_put(skpair); /* It may now die */
641 	}
642 
643 	/* Try to flush out this socket. Throw out buffers at least */
644 
645 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
646 		if (state == TCP_LISTEN)
647 			unix_release_sock(skb->sk, 1);
648 		/* passed fds are erased in the kfree_skb hook	      */
649 		UNIXCB(skb).consumed = skb->len;
650 		kfree_skb(skb);
651 	}
652 
653 	if (path.dentry)
654 		path_put(&path);
655 
656 	sock_put(sk);
657 
658 	/* ---- Socket is dead now and most probably destroyed ---- */
659 
660 	/*
661 	 * Fixme: BSD difference: In BSD all sockets connected to us get
662 	 *	  ECONNRESET and we die on the spot. In Linux we behave
663 	 *	  like files and pipes do and wait for the last
664 	 *	  dereference.
665 	 *
666 	 * Can't we simply set sock->err?
667 	 *
668 	 *	  What the above comment does talk about? --ANK(980817)
669 	 */
670 
671 	if (unix_tot_inflight)
672 		unix_gc();		/* Garbage collect fds */
673 }
674 
675 static void init_peercred(struct sock *sk)
676 {
677 	const struct cred *old_cred;
678 	struct pid *old_pid;
679 
680 	spin_lock(&sk->sk_peer_lock);
681 	old_pid = sk->sk_peer_pid;
682 	old_cred = sk->sk_peer_cred;
683 	sk->sk_peer_pid  = get_pid(task_tgid(current));
684 	sk->sk_peer_cred = get_current_cred();
685 	spin_unlock(&sk->sk_peer_lock);
686 
687 	put_pid(old_pid);
688 	put_cred(old_cred);
689 }
690 
691 static void copy_peercred(struct sock *sk, struct sock *peersk)
692 {
693 	const struct cred *old_cred;
694 	struct pid *old_pid;
695 
696 	if (sk < peersk) {
697 		spin_lock(&sk->sk_peer_lock);
698 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
699 	} else {
700 		spin_lock(&peersk->sk_peer_lock);
701 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
702 	}
703 	old_pid = sk->sk_peer_pid;
704 	old_cred = sk->sk_peer_cred;
705 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
706 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
707 
708 	spin_unlock(&sk->sk_peer_lock);
709 	spin_unlock(&peersk->sk_peer_lock);
710 
711 	put_pid(old_pid);
712 	put_cred(old_cred);
713 }
714 
715 static int unix_listen(struct socket *sock, int backlog)
716 {
717 	int err;
718 	struct sock *sk = sock->sk;
719 	struct unix_sock *u = unix_sk(sk);
720 
721 	err = -EOPNOTSUPP;
722 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
723 		goto out;	/* Only stream/seqpacket sockets accept */
724 	err = -EINVAL;
725 	if (!u->addr)
726 		goto out;	/* No listens on an unbound socket */
727 	unix_state_lock(sk);
728 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
729 		goto out_unlock;
730 	if (backlog > sk->sk_max_ack_backlog)
731 		wake_up_interruptible_all(&u->peer_wait);
732 	sk->sk_max_ack_backlog	= backlog;
733 	sk->sk_state		= TCP_LISTEN;
734 	/* set credentials so connect can copy them */
735 	init_peercred(sk);
736 	err = 0;
737 
738 out_unlock:
739 	unix_state_unlock(sk);
740 out:
741 	return err;
742 }
743 
/* Prototypes for the socket operations that the proto_ops tables
 * further down refer to.
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
				    size_t size, int flags);
static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);
775 
776 static int unix_set_peek_off(struct sock *sk, int val)
777 {
778 	struct unix_sock *u = unix_sk(sk);
779 
780 	if (mutex_lock_interruptible(&u->iolock))
781 		return -EINTR;
782 
783 	sk->sk_peek_off = val;
784 	mutex_unlock(&u->iolock);
785 
786 	return 0;
787 }
788 
789 #ifdef CONFIG_PROC_FS
790 static int unix_count_nr_fds(struct sock *sk)
791 {
792 	struct sk_buff *skb;
793 	struct unix_sock *u;
794 	int nr_fds = 0;
795 
796 	spin_lock(&sk->sk_receive_queue.lock);
797 	skb = skb_peek(&sk->sk_receive_queue);
798 	while (skb) {
799 		u = unix_sk(skb->sk);
800 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
801 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
802 	}
803 	spin_unlock(&sk->sk_receive_queue.lock);
804 
805 	return nr_fds;
806 }
807 
808 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
809 {
810 	struct sock *sk = sock->sk;
811 	unsigned char s_state;
812 	struct unix_sock *u;
813 	int nr_fds = 0;
814 
815 	if (sk) {
816 		s_state = READ_ONCE(sk->sk_state);
817 		u = unix_sk(sk);
818 
819 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
820 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
821 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
822 		 */
823 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
824 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
825 		else if (s_state == TCP_LISTEN)
826 			nr_fds = unix_count_nr_fds(sk);
827 
828 		seq_printf(m, "scm_fds: %u\n", nr_fds);
829 	}
830 }
831 #else
832 #define unix_show_fdinfo NULL
833 #endif
834 
/* Socket operations installed by unix_create() for SOCK_STREAM sockets.
 * This is the only one of the three tables that provides sendpage and
 * splice_read handlers of its own.
 */
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.sendpage =	unix_stream_sendpage,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
860 
/* Socket operations installed by unix_create() for SOCK_DGRAM sockets
 * (SOCK_RAW requests are turned into SOCK_DGRAM there).  accept, listen
 * and sendpage are wired to the generic sock_no_*() stubs.
 */
static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
885 
/* Socket operations installed by unix_create() for SOCK_SEQPACKET
 * sockets.  Connection handling (connect/accept/listen) is shared with
 * the stream table, poll is shared with the datagram table, and no
 * read_skb handler is set.
 */
static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
909 
/* ->close() hook of the unix protos.  Deliberately empty: unix sockets
 * have nothing to do here, the hook is provided merely for sockmap.
 */
static void unix_close(struct sock *sk, long timeout)
{
}
916 
/* ->unhash() hook of unix_stream_proto.  Deliberately empty: unix
 * sockets have nothing to do here, the hook is provided merely for
 * sockmap.
 */
static void unix_unhash(struct sock *sk)
{
}
923 
/* Protocol descriptor that unix_create1() passes to sk_alloc() for every
 * socket type other than SOCK_STREAM.
 */
struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};
933 
/* Protocol descriptor that unix_create1() passes to sk_alloc() for
 * SOCK_STREAM sockets; unlike unix_dgram_proto it also sets ->unhash.
 */
struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.unhash			= unix_unhash,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};
944 
945 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
946 {
947 	struct unix_sock *u;
948 	struct sock *sk;
949 	int err;
950 
951 	atomic_long_inc(&unix_nr_socks);
952 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
953 		err = -ENFILE;
954 		goto err;
955 	}
956 
957 	if (type == SOCK_STREAM)
958 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
959 	else /*dgram and  seqpacket */
960 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
961 
962 	if (!sk) {
963 		err = -ENOMEM;
964 		goto err;
965 	}
966 
967 	sock_init_data(sock, sk);
968 
969 	sk->sk_hash		= unix_unbound_hash(sk);
970 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
971 	sk->sk_write_space	= unix_write_space;
972 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
973 	sk->sk_destruct		= unix_sock_destructor;
974 	u	  = unix_sk(sk);
975 	u->path.dentry = NULL;
976 	u->path.mnt = NULL;
977 	spin_lock_init(&u->lock);
978 	atomic_long_set(&u->inflight, 0);
979 	INIT_LIST_HEAD(&u->link);
980 	mutex_init(&u->iolock); /* single task reading lock */
981 	mutex_init(&u->bindlock); /* single task binding lock */
982 	init_waitqueue_head(&u->peer_wait);
983 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
984 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
985 	unix_insert_unbound_socket(net, sk);
986 
987 	sock_prot_inuse_add(net, sk->sk_prot, 1);
988 
989 	return sk;
990 
991 err:
992 	atomic_long_dec(&unix_nr_socks);
993 	return ERR_PTR(err);
994 }
995 
996 static int unix_create(struct net *net, struct socket *sock, int protocol,
997 		       int kern)
998 {
999 	struct sock *sk;
1000 
1001 	if (protocol && protocol != PF_UNIX)
1002 		return -EPROTONOSUPPORT;
1003 
1004 	sock->state = SS_UNCONNECTED;
1005 
1006 	switch (sock->type) {
1007 	case SOCK_STREAM:
1008 		sock->ops = &unix_stream_ops;
1009 		break;
1010 		/*
1011 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1012 		 *	nothing uses it.
1013 		 */
1014 	case SOCK_RAW:
1015 		sock->type = SOCK_DGRAM;
1016 		fallthrough;
1017 	case SOCK_DGRAM:
1018 		sock->ops = &unix_dgram_ops;
1019 		break;
1020 	case SOCK_SEQPACKET:
1021 		sock->ops = &unix_seqpacket_ops;
1022 		break;
1023 	default:
1024 		return -ESOCKTNOSUPPORT;
1025 	}
1026 
1027 	sk = unix_create1(net, sock, kern, sock->type);
1028 	if (IS_ERR(sk))
1029 		return PTR_ERR(sk);
1030 
1031 	return 0;
1032 }
1033 
1034 static int unix_release(struct socket *sock)
1035 {
1036 	struct sock *sk = sock->sk;
1037 
1038 	if (!sk)
1039 		return 0;
1040 
1041 	sk->sk_prot->close(sk, 0);
1042 	unix_release_sock(sk, 0);
1043 	sock->sk = NULL;
1044 
1045 	return 0;
1046 }
1047 
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * The path is resolved following symlinks and must be writable by the
 * caller, refer to a socket inode, and have a bound socket whose sk_type
 * equals @type.  On success the path's atime is touched.
 *
 * Returns the sock found by unix_find_socket_byinode(), or an ERR_PTR:
 * the kern_path()/path_permission() error, -ECONNREFUSED if the inode is
 * not a socket or no sock is bound to it, -EPROTOTYPE on a type mismatch.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct path target;
	struct inode *ino;
	struct sock *found;
	int ret;

	unix_mkname_bsd(sunaddr, addr_len);

	ret = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &target);
	if (ret)
		return ERR_PTR(ret);

	ret = path_permission(&target, MAY_WRITE);
	if (ret)
		goto drop_path;

	ino = d_backing_inode(target.dentry);
	if (!S_ISSOCK(ino->i_mode)) {
		ret = -ECONNREFUSED;
		goto drop_path;
	}

	found = unix_find_socket_byinode(ino);
	if (!found) {
		ret = -ECONNREFUSED;
		goto drop_path;
	}

	if (found->sk_type != type) {
		/* Wrong kind of socket: give back the lookup's reference. */
		ret = -EPROTOTYPE;
		sock_put(found);
		goto drop_path;
	}

	touch_atime(&target);
	path_put(&target);

	return found;

drop_path:
	path_put(&target);
	return ERR_PTR(ret);
}
1091 
/*
 * Look up a socket by name in @net via the name hash.
 *
 * The hash is computed from the address, its length and @type.  If the
 * sock that is found has a path dentry, that path's atime is touched.
 *
 * Returns the sock, or ERR_PTR(-ECONNREFUSED) if the lookup finds nothing.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	unsigned int hash;

	hash = unix_abstract_hash(sunaddr, addr_len, type);

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	u = unix_sk(sk);
	if (u->path.dentry)
		touch_atime(&u->path);

	return sk;
}
1110 
1111 static struct sock *unix_find_other(struct net *net,
1112 				    struct sockaddr_un *sunaddr,
1113 				    int addr_len, int type)
1114 {
1115 	struct sock *sk;
1116 
1117 	if (sunaddr->sun_path[0])
1118 		sk = unix_find_bsd(sunaddr, addr_len, type);
1119 	else
1120 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1121 
1122 	return sk;
1123 }
1124 
/*
 * Bind @sk to an automatically chosen name if it has no address yet.
 *
 * The name is a zero byte (the buffer comes from kzalloc()) followed by
 * five hex digits.  Candidates are tried one after another, starting just
 * past a random 20-bit value and wrapping at 0xFFFFF, until
 * __unix_find_socket_byname() reports no socket for the candidate.
 *
 * Returns 0 on success or if the socket already has an address, the error
 * from mutex_lock_interruptible() if u->bindlock cannot be taken, -ENOMEM
 * if the address cannot be allocated, or -ENOSPC once every candidate has
 * been tried and found in use.
 */
1125 static int unix_autobind(struct sock *sk)
1126 {
1127 	unsigned int new_hash, old_hash = sk->sk_hash;
1128 	struct unix_sock *u = unix_sk(sk);
1129 	struct net *net = sock_net(sk);
1130 	struct unix_address *addr;
1131 	u32 lastnum, ordernum;
1132 	int err;
1133 
1134 	err = mutex_lock_interruptible(&u->bindlock);
1135 	if (err)
1136 		return err;
1137 
	/* Already bound: err is 0 here, so this path returns success. */
1138 	if (u->addr)
1139 		goto out;
1140 
1141 	err = -ENOMEM;
1142 	addr = kzalloc(sizeof(*addr) +
1143 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1144 	if (!addr)
1145 		goto out;
1146 
	/* Length covers sun_path[0] (left zero) plus the five hex digits. */
1147 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1148 	addr->name->sun_family = AF_UNIX;
1149 	refcount_set(&addr->refcnt, 1);
1150 
	/* Remember the starting point so a full wrap-around can be detected. */
1151 	ordernum = get_random_u32();
1152 	lastnum = ordernum & 0xFFFFF;
1153 retry:
1154 	ordernum = (ordernum + 1) & 0xFFFFF;
1155 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1156 
1157 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1158 	unix_table_double_lock(net, old_hash, new_hash);
1159 
1160 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1161 		unix_table_double_unlock(net, old_hash, new_hash);
1162 
1163 		/* __unix_find_socket_byname() may take long time if many names
1164 		 * are already in use.
1165 		 */
1166 		cond_resched();
1167 
1168 		if (ordernum == lastnum) {
1169 			/* Give up if all names seems to be in use. */
1170 			err = -ENOSPC;
1171 			unix_release_addr(addr);
1172 			goto out;
1173 		}
1174 
1175 		goto retry;
1176 	}
1177 
	/* Name is free: install it while the table locks are still held. */
1178 	__unix_set_addr_hash(net, sk, addr, new_hash);
1179 	unix_table_double_unlock(net, old_hash, new_hash);
1180 	err = 0;
1181 
1182 out:	mutex_unlock(&u->bindlock);
1183 	return err;
1184 }
1185 
/*
 * Bind @sk to the filesystem path in @sunaddr.
 *
 * A socket node is created at the path with vfs_mknod(), using a mode
 * derived from the socket inode's mode and the current umask.  Then, under
 * u->bindlock and the hash table locks, the path is recorded in u->path and
 * the address is installed.
 *
 * Returns 0 on success, -ENOMEM if the address cannot be allocated, -EINVAL
 * if the socket already has an address, or the error from
 * kern_path_create(), security_path_mknod(), vfs_mknod() or
 * mutex_lock_interruptible().  -EEXIST is reported as -EADDRINUSE.
 */
1186 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1187 			 int addr_len)
1188 {
1189 	umode_t mode = S_IFSOCK |
1190 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1191 	unsigned int new_hash, old_hash = sk->sk_hash;
1192 	struct unix_sock *u = unix_sk(sk);
1193 	struct net *net = sock_net(sk);
1194 	struct mnt_idmap *idmap;
1195 	struct unix_address *addr;
1196 	struct dentry *dentry;
1197 	struct path parent;
1198 	int err;
1199 
	/* Recompute the length from the string, terminator included. */
1200 	unix_mkname_bsd(sunaddr, addr_len);
1201 	addr_len = strlen(sunaddr->sun_path) +
1202 		offsetof(struct sockaddr_un, sun_path) + 1;
1203 
1204 	addr = unix_create_addr(sunaddr, addr_len);
1205 	if (!addr)
1206 		return -ENOMEM;
1207 
1208 	/*
1209 	 * Get the parent directory, calculate the hash for last
1210 	 * component.
1211 	 */
1212 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1213 	if (IS_ERR(dentry)) {
1214 		err = PTR_ERR(dentry);
1215 		goto out;
1216 	}
1217 
1218 	/*
1219 	 * All right, let's create it.
1220 	 */
1221 	idmap = mnt_idmap(parent.mnt);
1222 	err = security_path_mknod(&parent, dentry, mode, 0);
1223 	if (!err)
1224 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1225 	if (err)
1226 		goto out_path;
1227 	err = mutex_lock_interruptible(&u->bindlock);
1228 	if (err)
1229 		goto out_unlink;
	/* Socket already has an address; out_unlock turns this into -EINVAL. */
1230 	if (u->addr)
1231 		goto out_unlock;
1232 
1233 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1234 	unix_table_double_lock(net, old_hash, new_hash);
	/* Take our own references on the mount and dentry for u->path. */
1235 	u->path.mnt = mntget(parent.mnt);
1236 	u->path.dentry = dget(dentry);
1237 	__unix_set_addr_hash(net, sk, addr, new_hash);
1238 	unix_table_double_unlock(net, old_hash, new_hash);
1239 	unix_insert_bsd_socket(sk);
1240 	mutex_unlock(&u->bindlock);
1241 	done_path_create(&parent, dentry);
1242 	return 0;
1243 
1244 out_unlock:
1245 	mutex_unlock(&u->bindlock);
1246 	err = -EINVAL;
1247 out_unlink:
1248 	/* failed after successful mknod?  unlink what we'd created... */
1249 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1250 out_path:
1251 	done_path_create(&parent, dentry);
1252 out:
1253 	unix_release_addr(addr);
1254 	return err == -EEXIST ? -EADDRINUSE : err;
1255 }
1256 
1257 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1258 			      int addr_len)
1259 {
1260 	unsigned int new_hash, old_hash = sk->sk_hash;
1261 	struct unix_sock *u = unix_sk(sk);
1262 	struct net *net = sock_net(sk);
1263 	struct unix_address *addr;
1264 	int err;
1265 
1266 	addr = unix_create_addr(sunaddr, addr_len);
1267 	if (!addr)
1268 		return -ENOMEM;
1269 
1270 	err = mutex_lock_interruptible(&u->bindlock);
1271 	if (err)
1272 		goto out;
1273 
1274 	if (u->addr) {
1275 		err = -EINVAL;
1276 		goto out_mutex;
1277 	}
1278 
1279 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1280 	unix_table_double_lock(net, old_hash, new_hash);
1281 
1282 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1283 		goto out_spin;
1284 
1285 	__unix_set_addr_hash(net, sk, addr, new_hash);
1286 	unix_table_double_unlock(net, old_hash, new_hash);
1287 	mutex_unlock(&u->bindlock);
1288 	return 0;
1289 
1290 out_spin:
1291 	unix_table_double_unlock(net, old_hash, new_hash);
1292 	err = -EADDRINUSE;
1293 out_mutex:
1294 	mutex_unlock(&u->bindlock);
1295 out:
1296 	unix_release_addr(addr);
1297 	return err;
1298 }
1299 
1300 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1301 {
1302 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1303 	struct sock *sk = sock->sk;
1304 	int err;
1305 
1306 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1307 	    sunaddr->sun_family == AF_UNIX)
1308 		return unix_autobind(sk);
1309 
1310 	err = unix_validate_addr(sunaddr, addr_len);
1311 	if (err)
1312 		return err;
1313 
1314 	if (sunaddr->sun_path[0])
1315 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1316 	else
1317 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1318 
1319 	return err;
1320 }
1321 
/*
 * Take the state locks of @sk1 and @sk2.
 *
 * If @sk2 is NULL or the same sock as @sk1, only @sk1 is locked.
 * Otherwise the sock at the lower address is locked first and the other
 * one is taken with the nested variant.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first = sk1;
	struct sock *second = sk2;

	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}

	/* The two differ here, so order them by address. */
	if (sk2 < sk1) {
		first = sk2;
		second = sk1;
	}

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
1336 
/*
 * Release the state locks taken by unix_state_double_lock(): always @sk1,
 * and @sk2 as well unless it is NULL or the same sock as @sk1.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (unlikely(sk1 == sk2) || !sk2)
		return;

	unix_state_unlock(sk2);
}
1346 
/*
 * connect() for datagram-type sockets.
 *
 * With an address family other than AF_UNSPEC, the destination is looked
 * up, checked with unix_may_send() and the security hook, and made the peer
 * of @sk, replacing any previous peer.  With AF_UNSPEC the peer is cleared;
 * if there was one, sk->sk_state is set to TCP_CLOSE.
 *
 * Returns 0 on success, -EINVAL if @alen is too short to hold sa_family,
 * -EPERM if unix_may_send() refuses, or the error from address validation,
 * unix_autobind(), the lookup or the security hook.  @flags is not used.
 */
1347 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1348 			      int alen, int flags)
1349 {
1350 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1351 	struct sock *sk = sock->sk;
1352 	struct sock *other;
1353 	int err;
1354 
1355 	err = -EINVAL;
1356 	if (alen < offsetofend(struct sockaddr, sa_family))
1357 		goto out;
1358 
1359 	if (addr->sa_family != AF_UNSPEC) {
1360 		err = unix_validate_addr(sunaddr, alen);
1361 		if (err)
1362 			goto out;
1363 
		/* With SOCK_PASSCRED set, give an unbound socket an address first. */
1364 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1365 		    !unix_sk(sk)->addr) {
1366 			err = unix_autobind(sk);
1367 			if (err)
1368 				goto out;
1369 		}
1370 
1371 restart:
1372 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1373 		if (IS_ERR(other)) {
1374 			err = PTR_ERR(other);
1375 			goto out;
1376 		}
1377 
1378 		unix_state_double_lock(sk, other);
1379 
1380 		/* Apparently VFS overslept socket death. Retry. */
1381 		if (sock_flag(other, SOCK_DEAD)) {
1382 			unix_state_double_unlock(sk, other);
1383 			sock_put(other);
1384 			goto restart;
1385 		}
1386 
1387 		err = -EPERM;
1388 		if (!unix_may_send(sk, other))
1389 			goto out_unlock;
1390 
1391 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1392 		if (err)
1393 			goto out_unlock;
1394 
1395 		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1396 	} else {
1397 		/*
1398 		 *	1003.1g breaking connected state with AF_UNSPEC
1399 		 */
		/* With other == NULL the double lock helpers only touch sk. */
1400 		other = NULL;
1401 		unix_state_double_lock(sk, other);
1402 	}
1403 
1404 	/*
1405 	 * If it was connected, reconnect.
1406 	 */
1407 	if (unix_peer(sk)) {
1408 		struct sock *old_peer = unix_peer(sk);
1409 
1410 		unix_peer(sk) = other;
1411 		if (!other)
1412 			sk->sk_state = TCP_CLOSE;
1413 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1414 
1415 		unix_state_double_unlock(sk, other);
1416 
		/* Locks are dropped before the old peer is notified and released. */
1417 		if (other != old_peer)
1418 			unix_dgram_disconnected(sk, old_peer);
1419 		sock_put(old_peer);
1420 	} else {
1421 		unix_peer(sk) = other;
1422 		unix_state_double_unlock(sk, other);
1423 	}
1424 
1425 	return 0;
1426 
1427 out_unlock:
1428 	unix_state_double_unlock(sk, other);
1429 	sock_put(other);
1430 out:
1431 	return err;
1432 }
1433 
/*
 * Wait on @other's peer_wait queue for up to @timeo.
 *
 * @other's state lock is released in here (see the __releases annotation)
 * before any sleep.  schedule_timeout() is only called if, at the time of
 * the check, @other is not SOCK_DEAD, does not have RCV_SHUTDOWN set and
 * unix_recvq_full_lockless() is true.
 *
 * Returns the value from schedule_timeout() if it was called, otherwise
 * @timeo unchanged.
 */
1434 static long unix_wait_for_peer(struct sock *other, long timeo)
1435 	__releases(&unix_sk(other)->lock)
1436 {
1437 	struct unix_sock *u = unix_sk(other);
1438 	int sched;
1439 	DEFINE_WAIT(wait);
1440 
	/* Get on the wait queue before evaluating the condition and unlocking. */
1441 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1442 
1443 	sched = !sock_flag(other, SOCK_DEAD) &&
1444 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1445 		unix_recvq_full_lockless(other);
1446 
1447 	unix_state_unlock(other);
1448 
1449 	if (sched)
1450 		timeo = schedule_timeout(timeo);
1451 
1452 	finish_wait(&u->peer_wait, &wait);
1453 	return timeo;
1454 }
1455 
/*
 * Connect @sock to the listening socket named by @uaddr.
 *
 * A new sock (newsk) and a one-byte skb are allocated up front.  Once the
 * listener has been found and both state locks are held, newsk becomes the
 * peer of @sock's sk, inherits the listener's address and path, and the skb
 * is queued on the listener's receive queue.
 *
 * Returns 0 on success, or a negative errno, including:
 *   -ENOMEM        the skb could not be allocated
 *   -ECONNREFUSED  target is not in TCP_LISTEN or has RCV_SHUTDOWN set
 *   -EAGAIN        listener queue full and the send timeout is 0
 *   -EISCONN       this socket is already TCP_ESTABLISHED
 *   -EINVAL        this socket is in any state other than TCP_CLOSE
 * as well as errors from unix_validate_addr(), unix_autobind(),
 * unix_create1(), unix_find_other(), sock_intr_errno() after an interrupted
 * wait, and security_unix_stream_connect().
 */
1456 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1457 			       int addr_len, int flags)
1458 {
1459 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1460 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1461 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1462 	struct net *net = sock_net(sk);
1463 	struct sk_buff *skb = NULL;
1464 	long timeo;
1465 	int err;
1466 	int st;
1467 
1468 	err = unix_validate_addr(sunaddr, addr_len);
1469 	if (err)
1470 		goto out;
1471 
1472 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1473 		err = unix_autobind(sk);
1474 		if (err)
1475 			goto out;
1476 	}
1477 
1478 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1479 
1480 	/* First of all allocate resources.
1481 	   If we will make it after state is locked,
1482 	   we will have to recheck all again in any case.
1483 	 */
1484 
1485 	/* create new sock for complete connection */
1486 	newsk = unix_create1(net, NULL, 0, sock->type);
1487 	if (IS_ERR(newsk)) {
1488 		err = PTR_ERR(newsk);
		/* Keep the error path at 'out' from releasing an ERR_PTR. */
1489 		newsk = NULL;
1490 		goto out;
1491 	}
1492 
1493 	err = -ENOMEM;
1494 
1495 	/* Allocate skb for sending to listening sock */
1496 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1497 	if (skb == NULL)
1498 		goto out;
1499 
1500 restart:
1501 	/*  Find listening sock. */
1502 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1503 	if (IS_ERR(other)) {
1504 		err = PTR_ERR(other);
1505 		other = NULL;
1506 		goto out;
1507 	}
1508 
1509 	/* Latch state of peer */
1510 	unix_state_lock(other);
1511 
1512 	/* Apparently VFS overslept socket death. Retry. */
1513 	if (sock_flag(other, SOCK_DEAD)) {
1514 		unix_state_unlock(other);
1515 		sock_put(other);
1516 		goto restart;
1517 	}
1518 
1519 	err = -ECONNREFUSED;
1520 	if (other->sk_state != TCP_LISTEN)
1521 		goto out_unlock;
1522 	if (other->sk_shutdown & RCV_SHUTDOWN)
1523 		goto out_unlock;
1524 
1525 	if (unix_recvq_full(other)) {
1526 		err = -EAGAIN;
1527 		if (!timeo)
1528 			goto out_unlock;
1529 
		/*
		 * unix_wait_for_peer() releases other's state lock, which is
		 * why the signal case below jumps to 'out', not 'out_unlock'.
		 */
1530 		timeo = unix_wait_for_peer(other, timeo);
1531 
1532 		err = sock_intr_errno(timeo);
1533 		if (signal_pending(current))
1534 			goto out;
1535 		sock_put(other);
1536 		goto restart;
1537 	}
1538 
1539 	/* Latch our state.
1540 
1541 	   It is tricky place. We need to grab our state lock and cannot
1542 	   drop lock on peer. It is dangerous because deadlock is
1543 	   possible. Connect to self case and simultaneous
1544 	   attempt to connect are eliminated by checking socket
1545 	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1546 	   check this before attempt to grab lock.
1547 
1548 	   Well, and we have to recheck the state after socket locked.
1549 	 */
1550 	st = sk->sk_state;
1551 
1552 	switch (st) {
1553 	case TCP_CLOSE:
1554 		/* This is ok... continue with connect */
1555 		break;
1556 	case TCP_ESTABLISHED:
1557 		/* Socket is already connected */
1558 		err = -EISCONN;
1559 		goto out_unlock;
1560 	default:
1561 		err = -EINVAL;
1562 		goto out_unlock;
1563 	}
1564 
1565 	unix_state_lock_nested(sk);
1566 
	/* State changed between the unlocked read and taking the lock: retry. */
1567 	if (sk->sk_state != st) {
1568 		unix_state_unlock(sk);
1569 		unix_state_unlock(other);
1570 		sock_put(other);
1571 		goto restart;
1572 	}
1573 
1574 	err = security_unix_stream_connect(sk, other, newsk);
1575 	if (err) {
1576 		unix_state_unlock(sk);
1577 		goto out_unlock;
1578 	}
1579 
1580 	/* The way is open! Fastly set all the necessary fields... */
1581 
1582 	sock_hold(sk);
1583 	unix_peer(newsk)	= sk;
1584 	newsk->sk_state		= TCP_ESTABLISHED;
1585 	newsk->sk_type		= sk->sk_type;
1586 	init_peercred(newsk);
1587 	newu = unix_sk(newsk);
1588 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1589 	otheru = unix_sk(other);
1590 
1591 	/* copy address information from listening to new sock
1592 	 *
1593 	 * The contents of *(otheru->addr) and otheru->path
1594 	 * are seen fully set up here, since we have found
1595 	 * otheru in hash under its lock.  Insertion into the
1596 	 * hash chain we'd found it in had been done in an
1597 	 * earlier critical area protected by the chain's lock,
1598 	 * the same one where we'd set *(otheru->addr) contents,
1599 	 * as well as otheru->path and otheru->addr itself.
1600 	 *
1601 	 * Using smp_store_release() here to set newu->addr
1602 	 * is enough to make those stores, as well as stores
1603 	 * to newu->path visible to anyone who gets newu->addr
1604 	 * by smp_load_acquire().  IOW, the same warranties
1605 	 * as for unix_sock instances bound in unix_bind() or
1606 	 * in unix_autobind().
1607 	 */
1608 	if (otheru->path.dentry) {
1609 		path_get(&otheru->path);
1610 		newu->path = otheru->path;
1611 	}
1612 	refcount_inc(&otheru->addr->refcnt);
1613 	smp_store_release(&newu->addr, otheru->addr);
1614 
1615 	/* Set credentials */
1616 	copy_peercred(sk, other);
1617 
1618 	sock->state	= SS_CONNECTED;
1619 	sk->sk_state	= TCP_ESTABLISHED;
1620 	sock_hold(newsk);
1621 
1622 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1623 	unix_peer(sk)	= newsk;
1624 
1625 	unix_state_unlock(sk);
1626 
1627 	/* take ten and send info to listening sock */
	/*
	 * unix_accept() in this file dequeues this skb and grafts skb->sk
	 * onto the accepted socket (presumably newsk, via sock_wmalloc()
	 * above -- confirm against sock_wmalloc()).
	 */
1628 	spin_lock(&other->sk_receive_queue.lock);
1629 	__skb_queue_tail(&other->sk_receive_queue, skb);
1630 	spin_unlock(&other->sk_receive_queue.lock);
1631 	unix_state_unlock(other);
1632 	other->sk_data_ready(other);
1633 	sock_put(other);
1634 	return 0;
1635 
1636 out_unlock:
1637 	if (other)
1638 		unix_state_unlock(other);
1639 
1640 out:
1641 	kfree_skb(skb);
1642 	if (newsk)
1643 		unix_release_sock(newsk, 0);
1644 	if (other)
1645 		sock_put(other);
1646 	return err;
1647 }
1648 
1649 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1650 {
1651 	struct sock *ska = socka->sk, *skb = sockb->sk;
1652 
1653 	/* Join our sockets back to back */
1654 	sock_hold(ska);
1655 	sock_hold(skb);
1656 	unix_peer(ska) = skb;
1657 	unix_peer(skb) = ska;
1658 	init_peercred(ska);
1659 	init_peercred(skb);
1660 
1661 	ska->sk_state = TCP_ESTABLISHED;
1662 	skb->sk_state = TCP_ESTABLISHED;
1663 	socka->state  = SS_CONNECTED;
1664 	sockb->state  = SS_CONNECTED;
1665 	return 0;
1666 }
1667 
1668 static void unix_sock_inherit_flags(const struct socket *old,
1669 				    struct socket *new)
1670 {
1671 	if (test_bit(SOCK_PASSCRED, &old->flags))
1672 		set_bit(SOCK_PASSCRED, &new->flags);
1673 	if (test_bit(SOCK_PASSSEC, &old->flags))
1674 		set_bit(SOCK_PASSSEC, &new->flags);
1675 }
1676 
/*
 * accept() on a listening AF_UNIX socket.
 *
 * Dequeues one skb from the listener's receive queue (with MSG_DONTWAIT
 * when O_NONBLOCK is set in @flags), takes the sock from skb->sk and grafts
 * it onto @newsock, which also inherits SOCK_PASSCRED/SOCK_PASSSEC from
 * @sock.
 *
 * Returns 0 on success, -EOPNOTSUPP if @sock is neither SOCK_STREAM nor
 * SOCK_SEQPACKET, -EINVAL if it is not in TCP_LISTEN state or if
 * skb_recv_datagram() returned no skb without setting an error, otherwise
 * the error set by skb_recv_datagram().  @kern is not used here.
 */
1677 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1678 		       bool kern)
1679 {
1680 	struct sock *sk = sock->sk;
1681 	struct sock *tsk;
1682 	struct sk_buff *skb;
1683 	int err;
1684 
1685 	err = -EOPNOTSUPP;
1686 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1687 		goto out;
1688 
1689 	err = -EINVAL;
1690 	if (sk->sk_state != TCP_LISTEN)
1691 		goto out;
1692 
1693 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1694 	 * so that no locks are necessary.
1695 	 */
1696 
1697 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1698 				&err);
1699 	if (!skb) {
1700 		/* This means receive shutdown. */
1701 		if (err == 0)
1702 			err = -EINVAL;
1703 		goto out;
1704 	}
1705 
	/* Grab the sock before the skb is freed, then wake peer_wait waiters. */
1706 	tsk = skb->sk;
1707 	skb_free_datagram(sk, skb);
1708 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1709 
1710 	/* attach accepted sock to socket */
1711 	unix_state_lock(tsk);
1712 	newsock->state = SS_CONNECTED;
1713 	unix_sock_inherit_flags(sock, newsock);
1714 	sock_graft(tsk, newsock);
1715 	unix_state_unlock(tsk);
1716 	return 0;
1717 
1718 out:
1719 	return err;
1720 }
1721 
1722 
1723 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1724 {
1725 	struct sock *sk = sock->sk;
1726 	struct unix_address *addr;
1727 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1728 	int err = 0;
1729 
1730 	if (peer) {
1731 		sk = unix_peer_get(sk);
1732 
1733 		err = -ENOTCONN;
1734 		if (!sk)
1735 			goto out;
1736 		err = 0;
1737 	} else {
1738 		sock_hold(sk);
1739 	}
1740 
1741 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1742 	if (!addr) {
1743 		sunaddr->sun_family = AF_UNIX;
1744 		sunaddr->sun_path[0] = 0;
1745 		err = offsetof(struct sockaddr_un, sun_path);
1746 	} else {
1747 		err = addr->len;
1748 		memcpy(sunaddr, addr->name, addr->len);
1749 	}
1750 	sock_put(sk);
1751 out:
1752 	return err;
1753 }
1754 
/*
 * Duplicate the fd list attached to @skb into @scm->fp, then take and
 * immediately release unix_gc_lock.  The comment in the body explains why
 * that lock/unlock pair is needed for MSG_PEEK.
 */
1755 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1756 {
1757 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1758 
1759 	/*
1760 	 * Garbage collection of unix sockets starts by selecting a set of
1761 	 * candidate sockets which have reference only from being in flight
1762 	 * (total_refs == inflight_refs).  This condition is checked once during
1763 	 * the candidate collection phase, and candidates are marked as such, so
1764 	 * that non-candidates can later be ignored.  While inflight_refs is
1765 	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1766 	 * is an instantaneous decision.
1767 	 *
1768 	 * Once a candidate, however, the socket must not be reinstalled into a
1769 	 * file descriptor while the garbage collection is in progress.
1770 	 *
1771 	 * If the above conditions are met, then the directed graph of
1772 	 * candidates (*) does not change while unix_gc_lock is held.
1773 	 *
1774 	 * Any operation that changes the file count through file descriptors
1775 	 * (dup, close, sendmsg) does not change the graph since candidates are
1776 	 * not installed in fds.
1777 	 *
1778 	 * Dequeuing a candidate via recvmsg would install it into an fd, but
1779 	 * that takes unix_gc_lock to decrement the inflight count, so it's
1780 	 * serialized with garbage collection.
1781 	 *
1782 	 * MSG_PEEK is special in that it does not change the inflight count,
1783 	 * yet does install the socket into an fd.  The following lock/unlock
1784 	 * pair is to ensure serialization with garbage collection.  It must be
1785 	 * done between incrementing the file count and installing the file into
1786 	 * an fd.
1787 	 *
1788 	 * If garbage collection starts after the barrier provided by the
1789 	 * lock/unlock, then it will see the elevated refcount and not mark this
1790 	 * as a candidate.  If a garbage collection is already in progress
1791 	 * before the file count was incremented, then the lock/unlock pair will
1792 	 * ensure that garbage collection is finished before progressing to
1793 	 * installing the fd.
1794 	 *
1795 	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1796 	 * which is on the queue of listening socket A.
1797 	 */
1798 	spin_lock(&unix_gc_lock);
1799 	spin_unlock(&unix_gc_lock);
1800 }
1801 
1802 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1803 {
1804 	int err = 0;
1805 
1806 	UNIXCB(skb).pid  = get_pid(scm->pid);
1807 	UNIXCB(skb).uid = scm->creds.uid;
1808 	UNIXCB(skb).gid = scm->creds.gid;
1809 	UNIXCB(skb).fp = NULL;
1810 	unix_get_secdata(scm, skb);
1811 	if (scm->fp && send_fds)
1812 		err = unix_attach_fds(scm, skb);
1813 
1814 	skb->destructor = unix_destruct_scm;
1815 	return err;
1816 }
1817 
1818 static bool unix_passcred_enabled(const struct socket *sock,
1819 				  const struct sock *other)
1820 {
1821 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1822 	       !other->sk_socket ||
1823 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1824 }
1825 
1826 /*
1827  * Some apps rely on write() giving SCM_CREDENTIALS
1828  * We include credentials if source or destination socket
1829  * asserted SOCK_PASSCRED.
1830  */
1831 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1832 			    const struct sock *other)
1833 {
1834 	if (UNIXCB(skb).pid)
1835 		return;
1836 	if (unix_passcred_enabled(sock, other)) {
1837 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1838 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1839 	}
1840 }
1841 
1842 static int maybe_init_creds(struct scm_cookie *scm,
1843 			    struct socket *socket,
1844 			    const struct sock *other)
1845 {
1846 	int err;
1847 	struct msghdr msg = { .msg_controllen = 0 };
1848 
1849 	err = scm_send(socket, &msg, scm, false);
1850 	if (err)
1851 		return err;
1852 
1853 	if (unix_passcred_enabled(socket, other)) {
1854 		scm->pid = get_pid(task_tgid(current));
1855 		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1856 	}
1857 	return err;
1858 }
1859 
1860 static bool unix_skb_scm_eq(struct sk_buff *skb,
1861 			    struct scm_cookie *scm)
1862 {
1863 	return UNIXCB(skb).pid == scm->pid &&
1864 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1865 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1866 	       unix_secdata_eq(scm, skb);
1867 }
1868 
1869 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1870 {
1871 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1872 	struct unix_sock *u = unix_sk(sk);
1873 
1874 	if (unlikely(fp && fp->count))
1875 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1876 }
1877 
1878 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1879 {
1880 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1881 	struct unix_sock *u = unix_sk(sk);
1882 
1883 	if (unlikely(fp && fp->count))
1884 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1885 }
1886 
1887 /*
1888  *	Send AF_UNIX data.
1889  */
1890 
/*
 * sendmsg() for datagram-type AF_UNIX sockets.
 *
 * The destination is msg->msg_name when msg_namelen is non-zero, otherwise
 * the connected peer.  The payload is copied into a newly allocated skb,
 * which is appended to the receiver's sk_receive_queue.
 *
 * Returns @len on success (also when sk_filter() rejects the packet, so the
 * sender sees no error), or a negative errno, including:
 *   -EOPNOTSUPP    MSG_OOB was requested
 *   -ENOTCONN      no address given and no peer
 *   -EMSGSIZE      @len exceeds sk->sk_sndbuf - 32
 *   -ECONNRESET    no destination sock and no address to look one up
 *   -EPERM         unix_may_send() refused
 *   -EPIPE         receiver has RCV_SHUTDOWN set, or this is a
 *                  SOCK_SEQPACKET socket and the receiver is dead
 *   -ECONNREFUSED  the connected peer was found dead
 *   -EAGAIN        receiver queue full and the send timeout is 0
 * as well as errors from scm_send(), unix_validate_addr(), unix_autobind(),
 * skb allocation, unix_scm_to_skb(), the data copy, unix_find_other(),
 * security_unix_may_send() and sock_intr_errno() after an interrupted wait.
 */
1891 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1892 			      size_t len)
1893 {
1894 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1895 	struct sock *sk = sock->sk, *other = NULL;
1896 	struct unix_sock *u = unix_sk(sk);
1897 	struct scm_cookie scm;
1898 	struct sk_buff *skb;
1899 	int data_len = 0;
1900 	int sk_locked;
1901 	long timeo;
1902 	int err;
1903 
1904 	wait_for_unix_gc();
1905 	err = scm_send(sock, msg, &scm, false);
1906 	if (err < 0)
1907 		return err;
1908 
1909 	err = -EOPNOTSUPP;
1910 	if (msg->msg_flags&MSG_OOB)
1911 		goto out;
1912 
1913 	if (msg->msg_namelen) {
1914 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1915 		if (err)
1916 			goto out;
1917 	} else {
1918 		sunaddr = NULL;
1919 		err = -ENOTCONN;
1920 		other = unix_peer_get(sk);
1921 		if (!other)
1922 			goto out;
1923 	}
1924 
1925 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1926 		err = unix_autobind(sk);
1927 		if (err)
1928 			goto out;
1929 	}
1930 
1931 	err = -EMSGSIZE;
1932 	if (len > sk->sk_sndbuf - 32)
1933 		goto out;
1934 
	/*
	 * Anything beyond SKB_MAX_ALLOC goes into page fragments: data_len is
	 * capped at MAX_SKB_FRAGS pages and rounded up to a page multiple.
	 */
1935 	if (len > SKB_MAX_ALLOC) {
1936 		data_len = min_t(size_t,
1937 				 len - SKB_MAX_ALLOC,
1938 				 MAX_SKB_FRAGS * PAGE_SIZE);
1939 		data_len = PAGE_ALIGN(data_len);
1940 
1941 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1942 	}
1943 
1944 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1945 				   msg->msg_flags & MSG_DONTWAIT, &err,
1946 				   PAGE_ALLOC_COSTLY_ORDER);
1947 	if (skb == NULL)
1948 		goto out;
1949 
1950 	err = unix_scm_to_skb(&scm, skb, true);
1951 	if (err < 0)
1952 		goto out_free;
1953 
1954 	skb_put(skb, len - data_len);
1955 	skb->data_len = data_len;
1956 	skb->len = len;
1957 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1958 	if (err)
1959 		goto out_free;
1960 
1961 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1962 
1963 restart:
	/* No destination sock in hand: look it up by address, if there is one. */
1964 	if (!other) {
1965 		err = -ECONNRESET;
1966 		if (sunaddr == NULL)
1967 			goto out_free;
1968 
1969 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1970 					sk->sk_type);
1971 		if (IS_ERR(other)) {
1972 			err = PTR_ERR(other);
1973 			other = NULL;
1974 			goto out_free;
1975 		}
1976 	}
1977 
1978 	if (sk_filter(other, skb) < 0) {
1979 		/* Toss the packet but do not return any error to the sender */
1980 		err = len;
1981 		goto out_free;
1982 	}
1983 
	/* sk_locked records whether sk's own state lock is held as well. */
1984 	sk_locked = 0;
1985 	unix_state_lock(other);
1986 restart_locked:
1987 	err = -EPERM;
1988 	if (!unix_may_send(sk, other))
1989 		goto out_unlock;
1990 
1991 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
1992 		/*
1993 		 *	Check with 1003.1g - what should
1994 		 *	datagram error
1995 		 */
1996 		unix_state_unlock(other);
1997 		sock_put(other);
1998 
1999 		if (!sk_locked)
2000 			unix_state_lock(sk);
2001 
2002 		err = 0;
2003 		if (sk->sk_type == SOCK_SEQPACKET) {
2004 			/* We are here only when racing with unix_release_sock()
2005 			 * is clearing @other. Never change state to TCP_CLOSE
2006 			 * unlike SOCK_DGRAM wants.
2007 			 */
2008 			unix_state_unlock(sk);
2009 			err = -EPIPE;
2010 		} else if (unix_peer(sk) == other) {
2011 			unix_peer(sk) = NULL;
2012 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2013 
2014 			sk->sk_state = TCP_CLOSE;
2015 			unix_state_unlock(sk);
2016 
2017 			unix_dgram_disconnected(sk, other);
2018 			sock_put(other);
2019 			err = -ECONNREFUSED;
2020 		} else {
2021 			unix_state_unlock(sk);
2022 		}
2023 
		/* err still 0 means the dead sock was not our peer: look up again. */
2024 		other = NULL;
2025 		if (err)
2026 			goto out_free;
2027 		goto restart;
2028 	}
2029 
2030 	err = -EPIPE;
2031 	if (other->sk_shutdown & RCV_SHUTDOWN)
2032 		goto out_unlock;
2033 
2034 	if (sk->sk_type != SOCK_SEQPACKET) {
2035 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2036 		if (err)
2037 			goto out_unlock;
2038 	}
2039 
2040 	/* other == sk && unix_peer(other) != sk if
2041 	 * - unix_peer(sk) == NULL, destination address bound to sk
2042 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
2043 	 */
2044 	if (other != sk &&
2045 	    unlikely(unix_peer(other) != sk &&
2046 	    unix_recvq_full_lockless(other))) {
2047 		if (timeo) {
			/* unix_wait_for_peer() releases other's state lock. */
2048 			timeo = unix_wait_for_peer(other, timeo);
2049 
2050 			err = sock_intr_errno(timeo);
2051 			if (signal_pending(current))
2052 				goto out_free;
2053 
2054 			goto restart;
2055 		}
2056 
		/* timeo is 0 here: retake the locks so that sk is held too. */
2057 		if (!sk_locked) {
2058 			unix_state_unlock(other);
2059 			unix_state_double_lock(sk, other);
2060 		}
2061 
2062 		if (unix_peer(sk) != other ||
2063 		    unix_dgram_peer_wake_me(sk, other)) {
2064 			err = -EAGAIN;
2065 			sk_locked = 1;
2066 			goto out_unlock;
2067 		}
2068 
		/* Locks were dropped and retaken above, so redo the checks. */
2069 		if (!sk_locked) {
2070 			sk_locked = 1;
2071 			goto restart_locked;
2072 		}
2073 	}
2074 
2075 	if (unlikely(sk_locked))
2076 		unix_state_unlock(sk);
2077 
2078 	if (sock_flag(other, SOCK_RCVTSTAMP))
2079 		__net_timestamp(skb);
2080 	maybe_add_creds(skb, sock, other);
2081 	scm_stat_add(other, skb);
2082 	skb_queue_tail(&other->sk_receive_queue, skb);
2083 	unix_state_unlock(other);
2084 	other->sk_data_ready(other);
2085 	sock_put(other);
2086 	scm_destroy(&scm);
2087 	return len;
2088 
2089 out_unlock:
2090 	if (sk_locked)
2091 		unix_state_unlock(sk);
2092 	unix_state_unlock(other);
2093 out_free:
2094 	kfree_skb(skb);
2095 out:
2096 	if (other)
2097 		sock_put(other);
2098 	scm_destroy(&scm);
2099 	return err;
2100 }
2101 
2102 /* We use paged skbs for stream sockets, and limit the paged part of each
2103  * skb to 32768 bytes, with a minimum of one full page.
2104  */
2105 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2106 
2107 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Queue one byte from @msg on @other as out-of-band data.
 *
 * Allocates a 1-byte skb, attaches the scm data (fds only if !@fds_sent)
 * and copies one byte from msg->msg_iter.  Then, under @other's state lock,
 * the skb becomes ousk->oob_skb (consume_skb() is called on any previous
 * one), is appended to @other's receive queue, and sk_send_sigurg() is
 * called.
 *
 * Returns 0 on success, -EPIPE if @other is SOCK_DEAD or has RCV_SHUTDOWN
 * set, or the error from skb allocation, unix_scm_to_skb() or the copy.
 */
2108 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2109 		     struct scm_cookie *scm, bool fds_sent)
2110 {
2111 	struct unix_sock *ousk = unix_sk(other);
2112 	struct sk_buff *skb;
2113 	int err = 0;
2114 
2115 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2116 
2117 	if (!skb)
2118 		return err;
2119 
2120 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2121 	if (err < 0) {
2122 		kfree_skb(skb);
2123 		return err;
2124 	}
2125 	skb_put(skb, 1);
2126 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2127 
2128 	if (err) {
2129 		kfree_skb(skb);
2130 		return err;
2131 	}
2132 
2133 	unix_state_lock(other);
2134 
2135 	if (sock_flag(other, SOCK_DEAD) ||
2136 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2137 		unix_state_unlock(other);
2138 		kfree_skb(skb);
2139 		return -EPIPE;
2140 	}
2141 
2142 	maybe_add_creds(skb, sock, other);
	/* The skb is referenced from both oob_skb and the receive queue below. */
2143 	skb_get(skb);
2144 
2145 	if (ousk->oob_skb)
2146 		consume_skb(ousk->oob_skb);
2147 
2148 	WRITE_ONCE(ousk->oob_skb, skb);
2149 
2150 	scm_stat_add(other, skb);
2151 	skb_queue_tail(&other->sk_receive_queue, skb);
2152 	sk_send_sigurg(other);
2153 	unix_state_unlock(other);
2154 	other->sk_data_ready(other);
2155 
2156 	return err;
2157 }
2158 #endif
2159 
2160 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2161 			       size_t len)
2162 {
2163 	struct sock *sk = sock->sk;
2164 	struct sock *other = NULL;
2165 	int err, size;
2166 	struct sk_buff *skb;
2167 	int sent = 0;
2168 	struct scm_cookie scm;
2169 	bool fds_sent = false;
2170 	int data_len;
2171 
2172 	wait_for_unix_gc();
2173 	err = scm_send(sock, msg, &scm, false);
2174 	if (err < 0)
2175 		return err;
2176 
2177 	err = -EOPNOTSUPP;
2178 	if (msg->msg_flags & MSG_OOB) {
2179 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2180 		if (len)
2181 			len--;
2182 		else
2183 #endif
2184 			goto out_err;
2185 	}
2186 
2187 	if (msg->msg_namelen) {
2188 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2189 		goto out_err;
2190 	} else {
2191 		err = -ENOTCONN;
2192 		other = unix_peer(sk);
2193 		if (!other)
2194 			goto out_err;
2195 	}
2196 
2197 	if (sk->sk_shutdown & SEND_SHUTDOWN)
2198 		goto pipe_err;
2199 
2200 	while (sent < len) {
2201 		size = len - sent;
2202 
2203 		/* Keep two messages in the pipe so it schedules better */
2204 		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2205 
2206 		/* allow fallback to order-0 allocations */
2207 		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2208 
2209 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2210 
2211 		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2212 
2213 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2214 					   msg->msg_flags & MSG_DONTWAIT, &err,
2215 					   get_order(UNIX_SKB_FRAGS_SZ));
2216 		if (!skb)
2217 			goto out_err;
2218 
2219 		/* Only send the fds in the first buffer */
2220 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2221 		if (err < 0) {
2222 			kfree_skb(skb);
2223 			goto out_err;
2224 		}
2225 		fds_sent = true;
2226 
2227 		skb_put(skb, size - data_len);
2228 		skb->data_len = data_len;
2229 		skb->len = size;
2230 		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2231 		if (err) {
2232 			kfree_skb(skb);
2233 			goto out_err;
2234 		}
2235 
2236 		unix_state_lock(other);
2237 
2238 		if (sock_flag(other, SOCK_DEAD) ||
2239 		    (other->sk_shutdown & RCV_SHUTDOWN))
2240 			goto pipe_err_free;
2241 
2242 		maybe_add_creds(skb, sock, other);
2243 		scm_stat_add(other, skb);
2244 		skb_queue_tail(&other->sk_receive_queue, skb);
2245 		unix_state_unlock(other);
2246 		other->sk_data_ready(other);
2247 		sent += size;
2248 	}
2249 
2250 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2251 	if (msg->msg_flags & MSG_OOB) {
2252 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2253 		if (err)
2254 			goto out_err;
2255 		sent++;
2256 	}
2257 #endif
2258 
2259 	scm_destroy(&scm);
2260 
2261 	return sent;
2262 
2263 pipe_err_free:
2264 	unix_state_unlock(other);
2265 	kfree_skb(skb);
2266 pipe_err:
2267 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2268 		send_sig(SIGPIPE, current, 0);
2269 	err = -EPIPE;
2270 out_err:
2271 	scm_destroy(&scm);
2272 	return sent ? : err;
2273 }
2274 
2275 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
2276 				    int offset, size_t size, int flags)
2277 {
2278 	int err;
2279 	bool send_sigpipe = false;
2280 	bool init_scm = true;
2281 	struct scm_cookie scm;
2282 	struct sock *other, *sk = socket->sk;
2283 	struct sk_buff *skb, *newskb = NULL, *tail = NULL;
2284 
2285 	if (flags & MSG_OOB)
2286 		return -EOPNOTSUPP;
2287 
2288 	other = unix_peer(sk);
2289 	if (!other || sk->sk_state != TCP_ESTABLISHED)
2290 		return -ENOTCONN;
2291 
2292 	if (false) {
2293 alloc_skb:
2294 		unix_state_unlock(other);
2295 		mutex_unlock(&unix_sk(other)->iolock);
2296 		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
2297 					      &err, 0);
2298 		if (!newskb)
2299 			goto err;
2300 	}
2301 
2302 	/* we must acquire iolock as we modify already present
2303 	 * skbs in the sk_receive_queue and mess with skb->len
2304 	 */
2305 	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
2306 	if (err) {
2307 		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2308 		goto err;
2309 	}
2310 
2311 	if (sk->sk_shutdown & SEND_SHUTDOWN) {
2312 		err = -EPIPE;
2313 		send_sigpipe = true;
2314 		goto err_unlock;
2315 	}
2316 
2317 	unix_state_lock(other);
2318 
2319 	if (sock_flag(other, SOCK_DEAD) ||
2320 	    other->sk_shutdown & RCV_SHUTDOWN) {
2321 		err = -EPIPE;
2322 		send_sigpipe = true;
2323 		goto err_state_unlock;
2324 	}
2325 
2326 	if (init_scm) {
2327 		err = maybe_init_creds(&scm, socket, other);
2328 		if (err)
2329 			goto err_state_unlock;
2330 		init_scm = false;
2331 	}
2332 
2333 	skb = skb_peek_tail(&other->sk_receive_queue);
2334 	if (tail && tail == skb) {
2335 		skb = newskb;
2336 	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2337 		if (newskb) {
2338 			skb = newskb;
2339 		} else {
2340 			tail = skb;
2341 			goto alloc_skb;
2342 		}
2343 	} else if (newskb) {
2344 		/* this is fast path, we don't necessarily need to
2345 		 * call to kfree_skb even though with newskb == NULL
2346 		 * this - does no harm
2347 		 */
2348 		consume_skb(newskb);
2349 		newskb = NULL;
2350 	}
2351 
2352 	if (skb_append_pagefrags(skb, page, offset, size)) {
2353 		tail = skb;
2354 		goto alloc_skb;
2355 	}
2356 
2357 	skb->len += size;
2358 	skb->data_len += size;
2359 	skb->truesize += size;
2360 	refcount_add(size, &sk->sk_wmem_alloc);
2361 
2362 	if (newskb) {
2363 		err = unix_scm_to_skb(&scm, skb, false);
2364 		if (err)
2365 			goto err_state_unlock;
2366 		spin_lock(&other->sk_receive_queue.lock);
2367 		__skb_queue_tail(&other->sk_receive_queue, newskb);
2368 		spin_unlock(&other->sk_receive_queue.lock);
2369 	}
2370 
2371 	unix_state_unlock(other);
2372 	mutex_unlock(&unix_sk(other)->iolock);
2373 
2374 	other->sk_data_ready(other);
2375 	scm_destroy(&scm);
2376 	return size;
2377 
2378 err_state_unlock:
2379 	unix_state_unlock(other);
2380 err_unlock:
2381 	mutex_unlock(&unix_sk(other)->iolock);
2382 err:
2383 	kfree_skb(newskb);
2384 	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2385 		send_sig(SIGPIPE, current, 0);
2386 	if (!init_scm)
2387 		scm_destroy(&scm);
2388 	return err;
2389 }
2390 
2391 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2392 				  size_t len)
2393 {
2394 	int err;
2395 	struct sock *sk = sock->sk;
2396 
2397 	err = sock_error(sk);
2398 	if (err)
2399 		return err;
2400 
2401 	if (sk->sk_state != TCP_ESTABLISHED)
2402 		return -ENOTCONN;
2403 
2404 	if (msg->msg_namelen)
2405 		msg->msg_namelen = 0;
2406 
2407 	return unix_dgram_sendmsg(sock, msg, len);
2408 }
2409 
2410 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2411 				  size_t size, int flags)
2412 {
2413 	struct sock *sk = sock->sk;
2414 
2415 	if (sk->sk_state != TCP_ESTABLISHED)
2416 		return -ENOTCONN;
2417 
2418 	return unix_dgram_recvmsg(sock, msg, size, flags);
2419 }
2420 
2421 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2422 {
2423 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2424 
2425 	if (addr) {
2426 		msg->msg_namelen = addr->len;
2427 		memcpy(msg->msg_name, addr->name, addr->len);
2428 	}
2429 }
2430 
2431 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2432 			 int flags)
2433 {
2434 	struct scm_cookie scm;
2435 	struct socket *sock = sk->sk_socket;
2436 	struct unix_sock *u = unix_sk(sk);
2437 	struct sk_buff *skb, *last;
2438 	long timeo;
2439 	int skip;
2440 	int err;
2441 
2442 	err = -EOPNOTSUPP;
2443 	if (flags&MSG_OOB)
2444 		goto out;
2445 
2446 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2447 
2448 	do {
2449 		mutex_lock(&u->iolock);
2450 
2451 		skip = sk_peek_offset(sk, flags);
2452 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2453 					      &skip, &err, &last);
2454 		if (skb) {
2455 			if (!(flags & MSG_PEEK))
2456 				scm_stat_del(sk, skb);
2457 			break;
2458 		}
2459 
2460 		mutex_unlock(&u->iolock);
2461 
2462 		if (err != -EAGAIN)
2463 			break;
2464 	} while (timeo &&
2465 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2466 					      &err, &timeo, last));
2467 
2468 	if (!skb) { /* implies iolock unlocked */
2469 		unix_state_lock(sk);
2470 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2471 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2472 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2473 			err = 0;
2474 		unix_state_unlock(sk);
2475 		goto out;
2476 	}
2477 
2478 	if (wq_has_sleeper(&u->peer_wait))
2479 		wake_up_interruptible_sync_poll(&u->peer_wait,
2480 						EPOLLOUT | EPOLLWRNORM |
2481 						EPOLLWRBAND);
2482 
2483 	if (msg->msg_name)
2484 		unix_copy_addr(msg, skb->sk);
2485 
2486 	if (size > skb->len - skip)
2487 		size = skb->len - skip;
2488 	else if (size < skb->len - skip)
2489 		msg->msg_flags |= MSG_TRUNC;
2490 
2491 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2492 	if (err)
2493 		goto out_free;
2494 
2495 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2496 		__sock_recv_timestamp(msg, sk, skb);
2497 
2498 	memset(&scm, 0, sizeof(scm));
2499 
2500 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2501 	unix_set_secdata(&scm, skb);
2502 
2503 	if (!(flags & MSG_PEEK)) {
2504 		if (UNIXCB(skb).fp)
2505 			unix_detach_fds(&scm, skb);
2506 
2507 		sk_peek_offset_bwd(sk, skb->len);
2508 	} else {
2509 		/* It is questionable: on PEEK we could:
2510 		   - do not return fds - good, but too simple 8)
2511 		   - return fds, and do not return them on read (old strategy,
2512 		     apparently wrong)
2513 		   - clone fds (I chose it for now, it is the most universal
2514 		     solution)
2515 
2516 		   POSIX 1003.1g does not actually define this clearly
2517 		   at all. POSIX 1003.1g doesn't define a lot of things
2518 		   clearly however!
2519 
2520 		*/
2521 
2522 		sk_peek_offset_fwd(sk, size);
2523 
2524 		if (UNIXCB(skb).fp)
2525 			unix_peek_fds(&scm, skb);
2526 	}
2527 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2528 
2529 	scm_recv(sock, msg, &scm, flags);
2530 
2531 out_free:
2532 	skb_free_datagram(sk, skb);
2533 	mutex_unlock(&u->iolock);
2534 out:
2535 	return err;
2536 }
2537 
2538 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2539 			      int flags)
2540 {
2541 	struct sock *sk = sock->sk;
2542 
2543 #ifdef CONFIG_BPF_SYSCALL
2544 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2545 
2546 	if (prot != &unix_dgram_proto)
2547 		return prot->recvmsg(sk, msg, size, flags, NULL);
2548 #endif
2549 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2550 }
2551 
2552 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2553 {
2554 	struct unix_sock *u = unix_sk(sk);
2555 	struct sk_buff *skb;
2556 	int err;
2557 
2558 	mutex_lock(&u->iolock);
2559 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2560 	mutex_unlock(&u->iolock);
2561 	if (!skb)
2562 		return err;
2563 
2564 	return recv_actor(sk, skb);
2565 }
2566 
2567 /*
2568  *	Sleep until more data has arrived. But check for races..
2569  */
2570 static long unix_stream_data_wait(struct sock *sk, long timeo,
2571 				  struct sk_buff *last, unsigned int last_len,
2572 				  bool freezable)
2573 {
2574 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2575 	struct sk_buff *tail;
2576 	DEFINE_WAIT(wait);
2577 
2578 	unix_state_lock(sk);
2579 
2580 	for (;;) {
2581 		prepare_to_wait(sk_sleep(sk), &wait, state);
2582 
2583 		tail = skb_peek_tail(&sk->sk_receive_queue);
2584 		if (tail != last ||
2585 		    (tail && tail->len != last_len) ||
2586 		    sk->sk_err ||
2587 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2588 		    signal_pending(current) ||
2589 		    !timeo)
2590 			break;
2591 
2592 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2593 		unix_state_unlock(sk);
2594 		timeo = schedule_timeout(timeo);
2595 		unix_state_lock(sk);
2596 
2597 		if (sock_flag(sk, SOCK_DEAD))
2598 			break;
2599 
2600 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2601 	}
2602 
2603 	finish_wait(sk_sleep(sk), &wait);
2604 	unix_state_unlock(sk);
2605 	return timeo;
2606 }
2607 
2608 static unsigned int unix_skb_len(const struct sk_buff *skb)
2609 {
2610 	return skb->len - UNIXCB(skb).consumed;
2611 }
2612 
2613 struct unix_stream_read_state {
2614 	int (*recv_actor)(struct sk_buff *, int, int,
2615 			  struct unix_stream_read_state *);
2616 	struct socket *socket;
2617 	struct msghdr *msg;
2618 	struct pipe_inode_info *pipe;
2619 	size_t size;
2620 	int flags;
2621 	unsigned int splice_flags;
2622 };
2623 
2624 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2625 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2626 {
2627 	struct socket *sock = state->socket;
2628 	struct sock *sk = sock->sk;
2629 	struct unix_sock *u = unix_sk(sk);
2630 	int chunk = 1;
2631 	struct sk_buff *oob_skb;
2632 
2633 	mutex_lock(&u->iolock);
2634 	unix_state_lock(sk);
2635 
2636 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2637 		unix_state_unlock(sk);
2638 		mutex_unlock(&u->iolock);
2639 		return -EINVAL;
2640 	}
2641 
2642 	oob_skb = u->oob_skb;
2643 
2644 	if (!(state->flags & MSG_PEEK))
2645 		WRITE_ONCE(u->oob_skb, NULL);
2646 
2647 	unix_state_unlock(sk);
2648 
2649 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2650 
2651 	if (!(state->flags & MSG_PEEK)) {
2652 		UNIXCB(oob_skb).consumed += 1;
2653 		kfree_skb(oob_skb);
2654 	}
2655 
2656 	mutex_unlock(&u->iolock);
2657 
2658 	if (chunk < 0)
2659 		return -EFAULT;
2660 
2661 	state->msg->msg_flags |= MSG_OOB;
2662 	return 1;
2663 }
2664 
2665 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2666 				  int flags, int copied)
2667 {
2668 	struct unix_sock *u = unix_sk(sk);
2669 
2670 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2671 		skb_unlink(skb, &sk->sk_receive_queue);
2672 		consume_skb(skb);
2673 		skb = NULL;
2674 	} else {
2675 		if (skb == u->oob_skb) {
2676 			if (copied) {
2677 				skb = NULL;
2678 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2679 				if (!(flags & MSG_PEEK)) {
2680 					WRITE_ONCE(u->oob_skb, NULL);
2681 					consume_skb(skb);
2682 				}
2683 			} else if (!(flags & MSG_PEEK)) {
2684 				skb_unlink(skb, &sk->sk_receive_queue);
2685 				consume_skb(skb);
2686 				skb = skb_peek(&sk->sk_receive_queue);
2687 			}
2688 		}
2689 	}
2690 	return skb;
2691 }
2692 #endif
2693 
2694 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2695 {
2696 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2697 		return -ENOTCONN;
2698 
2699 	return unix_read_skb(sk, recv_actor);
2700 }
2701 
2702 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2703 				    bool freezable)
2704 {
2705 	struct scm_cookie scm;
2706 	struct socket *sock = state->socket;
2707 	struct sock *sk = sock->sk;
2708 	struct unix_sock *u = unix_sk(sk);
2709 	int copied = 0;
2710 	int flags = state->flags;
2711 	int noblock = flags & MSG_DONTWAIT;
2712 	bool check_creds = false;
2713 	int target;
2714 	int err = 0;
2715 	long timeo;
2716 	int skip;
2717 	size_t size = state->size;
2718 	unsigned int last_len;
2719 
2720 	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2721 		err = -EINVAL;
2722 		goto out;
2723 	}
2724 
2725 	if (unlikely(flags & MSG_OOB)) {
2726 		err = -EOPNOTSUPP;
2727 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2728 		err = unix_stream_recv_urg(state);
2729 #endif
2730 		goto out;
2731 	}
2732 
2733 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2734 	timeo = sock_rcvtimeo(sk, noblock);
2735 
2736 	memset(&scm, 0, sizeof(scm));
2737 
2738 	/* Lock the socket to prevent queue disordering
2739 	 * while sleeps in memcpy_tomsg
2740 	 */
2741 	mutex_lock(&u->iolock);
2742 
2743 	skip = max(sk_peek_offset(sk, flags), 0);
2744 
2745 	do {
2746 		int chunk;
2747 		bool drop_skb;
2748 		struct sk_buff *skb, *last;
2749 
2750 redo:
2751 		unix_state_lock(sk);
2752 		if (sock_flag(sk, SOCK_DEAD)) {
2753 			err = -ECONNRESET;
2754 			goto unlock;
2755 		}
2756 		last = skb = skb_peek(&sk->sk_receive_queue);
2757 		last_len = last ? last->len : 0;
2758 
2759 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2760 		if (skb) {
2761 			skb = manage_oob(skb, sk, flags, copied);
2762 			if (!skb) {
2763 				unix_state_unlock(sk);
2764 				if (copied)
2765 					break;
2766 				goto redo;
2767 			}
2768 		}
2769 #endif
2770 again:
2771 		if (skb == NULL) {
2772 			if (copied >= target)
2773 				goto unlock;
2774 
2775 			/*
2776 			 *	POSIX 1003.1g mandates this order.
2777 			 */
2778 
2779 			err = sock_error(sk);
2780 			if (err)
2781 				goto unlock;
2782 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2783 				goto unlock;
2784 
2785 			unix_state_unlock(sk);
2786 			if (!timeo) {
2787 				err = -EAGAIN;
2788 				break;
2789 			}
2790 
2791 			mutex_unlock(&u->iolock);
2792 
2793 			timeo = unix_stream_data_wait(sk, timeo, last,
2794 						      last_len, freezable);
2795 
2796 			if (signal_pending(current)) {
2797 				err = sock_intr_errno(timeo);
2798 				scm_destroy(&scm);
2799 				goto out;
2800 			}
2801 
2802 			mutex_lock(&u->iolock);
2803 			goto redo;
2804 unlock:
2805 			unix_state_unlock(sk);
2806 			break;
2807 		}
2808 
2809 		while (skip >= unix_skb_len(skb)) {
2810 			skip -= unix_skb_len(skb);
2811 			last = skb;
2812 			last_len = skb->len;
2813 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2814 			if (!skb)
2815 				goto again;
2816 		}
2817 
2818 		unix_state_unlock(sk);
2819 
2820 		if (check_creds) {
2821 			/* Never glue messages from different writers */
2822 			if (!unix_skb_scm_eq(skb, &scm))
2823 				break;
2824 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2825 			/* Copy credentials */
2826 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2827 			unix_set_secdata(&scm, skb);
2828 			check_creds = true;
2829 		}
2830 
2831 		/* Copy address just once */
2832 		if (state->msg && state->msg->msg_name) {
2833 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2834 					 state->msg->msg_name);
2835 			unix_copy_addr(state->msg, skb->sk);
2836 			sunaddr = NULL;
2837 		}
2838 
2839 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2840 		skb_get(skb);
2841 		chunk = state->recv_actor(skb, skip, chunk, state);
2842 		drop_skb = !unix_skb_len(skb);
2843 		/* skb is only safe to use if !drop_skb */
2844 		consume_skb(skb);
2845 		if (chunk < 0) {
2846 			if (copied == 0)
2847 				copied = -EFAULT;
2848 			break;
2849 		}
2850 		copied += chunk;
2851 		size -= chunk;
2852 
2853 		if (drop_skb) {
2854 			/* the skb was touched by a concurrent reader;
2855 			 * we should not expect anything from this skb
2856 			 * anymore and assume it invalid - we can be
2857 			 * sure it was dropped from the socket queue
2858 			 *
2859 			 * let's report a short read
2860 			 */
2861 			err = 0;
2862 			break;
2863 		}
2864 
2865 		/* Mark read part of skb as used */
2866 		if (!(flags & MSG_PEEK)) {
2867 			UNIXCB(skb).consumed += chunk;
2868 
2869 			sk_peek_offset_bwd(sk, chunk);
2870 
2871 			if (UNIXCB(skb).fp) {
2872 				scm_stat_del(sk, skb);
2873 				unix_detach_fds(&scm, skb);
2874 			}
2875 
2876 			if (unix_skb_len(skb))
2877 				break;
2878 
2879 			skb_unlink(skb, &sk->sk_receive_queue);
2880 			consume_skb(skb);
2881 
2882 			if (scm.fp)
2883 				break;
2884 		} else {
2885 			/* It is questionable, see note in unix_dgram_recvmsg.
2886 			 */
2887 			if (UNIXCB(skb).fp)
2888 				unix_peek_fds(&scm, skb);
2889 
2890 			sk_peek_offset_fwd(sk, chunk);
2891 
2892 			if (UNIXCB(skb).fp)
2893 				break;
2894 
2895 			skip = 0;
2896 			last = skb;
2897 			last_len = skb->len;
2898 			unix_state_lock(sk);
2899 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2900 			if (skb)
2901 				goto again;
2902 			unix_state_unlock(sk);
2903 			break;
2904 		}
2905 	} while (size);
2906 
2907 	mutex_unlock(&u->iolock);
2908 	if (state->msg)
2909 		scm_recv(sock, state->msg, &scm, flags);
2910 	else
2911 		scm_destroy(&scm);
2912 out:
2913 	return copied ? : err;
2914 }
2915 
2916 static int unix_stream_read_actor(struct sk_buff *skb,
2917 				  int skip, int chunk,
2918 				  struct unix_stream_read_state *state)
2919 {
2920 	int ret;
2921 
2922 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2923 				    state->msg, chunk);
2924 	return ret ?: chunk;
2925 }
2926 
2927 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2928 			  size_t size, int flags)
2929 {
2930 	struct unix_stream_read_state state = {
2931 		.recv_actor = unix_stream_read_actor,
2932 		.socket = sk->sk_socket,
2933 		.msg = msg,
2934 		.size = size,
2935 		.flags = flags
2936 	};
2937 
2938 	return unix_stream_read_generic(&state, true);
2939 }
2940 
2941 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2942 			       size_t size, int flags)
2943 {
2944 	struct unix_stream_read_state state = {
2945 		.recv_actor = unix_stream_read_actor,
2946 		.socket = sock,
2947 		.msg = msg,
2948 		.size = size,
2949 		.flags = flags
2950 	};
2951 
2952 #ifdef CONFIG_BPF_SYSCALL
2953 	struct sock *sk = sock->sk;
2954 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2955 
2956 	if (prot != &unix_stream_proto)
2957 		return prot->recvmsg(sk, msg, size, flags, NULL);
2958 #endif
2959 	return unix_stream_read_generic(&state, true);
2960 }
2961 
2962 static int unix_stream_splice_actor(struct sk_buff *skb,
2963 				    int skip, int chunk,
2964 				    struct unix_stream_read_state *state)
2965 {
2966 	return skb_splice_bits(skb, state->socket->sk,
2967 			       UNIXCB(skb).consumed + skip,
2968 			       state->pipe, chunk, state->splice_flags);
2969 }
2970 
2971 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2972 				       struct pipe_inode_info *pipe,
2973 				       size_t size, unsigned int flags)
2974 {
2975 	struct unix_stream_read_state state = {
2976 		.recv_actor = unix_stream_splice_actor,
2977 		.socket = sock,
2978 		.pipe = pipe,
2979 		.size = size,
2980 		.splice_flags = flags,
2981 	};
2982 
2983 	if (unlikely(*ppos))
2984 		return -ESPIPE;
2985 
2986 	if (sock->file->f_flags & O_NONBLOCK ||
2987 	    flags & SPLICE_F_NONBLOCK)
2988 		state.flags = MSG_DONTWAIT;
2989 
2990 	return unix_stream_read_generic(&state, false);
2991 }
2992 
2993 static int unix_shutdown(struct socket *sock, int mode)
2994 {
2995 	struct sock *sk = sock->sk;
2996 	struct sock *other;
2997 
2998 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2999 		return -EINVAL;
3000 	/* This maps:
3001 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3002 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3003 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3004 	 */
3005 	++mode;
3006 
3007 	unix_state_lock(sk);
3008 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3009 	other = unix_peer(sk);
3010 	if (other)
3011 		sock_hold(other);
3012 	unix_state_unlock(sk);
3013 	sk->sk_state_change(sk);
3014 
3015 	if (other &&
3016 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3017 
3018 		int peer_mode = 0;
3019 		const struct proto *prot = READ_ONCE(other->sk_prot);
3020 
3021 		if (prot->unhash)
3022 			prot->unhash(other);
3023 		if (mode&RCV_SHUTDOWN)
3024 			peer_mode |= SEND_SHUTDOWN;
3025 		if (mode&SEND_SHUTDOWN)
3026 			peer_mode |= RCV_SHUTDOWN;
3027 		unix_state_lock(other);
3028 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3029 		unix_state_unlock(other);
3030 		other->sk_state_change(other);
3031 		if (peer_mode == SHUTDOWN_MASK)
3032 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3033 		else if (peer_mode & RCV_SHUTDOWN)
3034 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3035 	}
3036 	if (other)
3037 		sock_put(other);
3038 
3039 	return 0;
3040 }
3041 
3042 long unix_inq_len(struct sock *sk)
3043 {
3044 	struct sk_buff *skb;
3045 	long amount = 0;
3046 
3047 	if (sk->sk_state == TCP_LISTEN)
3048 		return -EINVAL;
3049 
3050 	spin_lock(&sk->sk_receive_queue.lock);
3051 	if (sk->sk_type == SOCK_STREAM ||
3052 	    sk->sk_type == SOCK_SEQPACKET) {
3053 		skb_queue_walk(&sk->sk_receive_queue, skb)
3054 			amount += unix_skb_len(skb);
3055 	} else {
3056 		skb = skb_peek(&sk->sk_receive_queue);
3057 		if (skb)
3058 			amount = skb->len;
3059 	}
3060 	spin_unlock(&sk->sk_receive_queue.lock);
3061 
3062 	return amount;
3063 }
3064 EXPORT_SYMBOL_GPL(unix_inq_len);
3065 
/* Send-side memory currently charged to @sk, as given by sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
	long pending = sk_wmem_alloc_get(sk);

	return pending;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3071 
3072 static int unix_open_file(struct sock *sk)
3073 {
3074 	struct path path;
3075 	struct file *f;
3076 	int fd;
3077 
3078 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3079 		return -EPERM;
3080 
3081 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3082 		return -ENOENT;
3083 
3084 	path = unix_sk(sk)->path;
3085 	if (!path.dentry)
3086 		return -ENOENT;
3087 
3088 	path_get(&path);
3089 
3090 	fd = get_unused_fd_flags(O_CLOEXEC);
3091 	if (fd < 0)
3092 		goto out;
3093 
3094 	f = dentry_open(&path, O_PATH, current_cred());
3095 	if (IS_ERR(f)) {
3096 		put_unused_fd(fd);
3097 		fd = PTR_ERR(f);
3098 		goto out;
3099 	}
3100 
3101 	fd_install(fd, f);
3102 out:
3103 	path_put(&path);
3104 
3105 	return fd;
3106 }
3107 
3108 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3109 {
3110 	struct sock *sk = sock->sk;
3111 	long amount = 0;
3112 	int err;
3113 
3114 	switch (cmd) {
3115 	case SIOCOUTQ:
3116 		amount = unix_outq_len(sk);
3117 		err = put_user(amount, (int __user *)arg);
3118 		break;
3119 	case SIOCINQ:
3120 		amount = unix_inq_len(sk);
3121 		if (amount < 0)
3122 			err = amount;
3123 		else
3124 			err = put_user(amount, (int __user *)arg);
3125 		break;
3126 	case SIOCUNIXFILE:
3127 		err = unix_open_file(sk);
3128 		break;
3129 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3130 	case SIOCATMARK:
3131 		{
3132 			struct sk_buff *skb;
3133 			int answ = 0;
3134 
3135 			skb = skb_peek(&sk->sk_receive_queue);
3136 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3137 				answ = 1;
3138 			err = put_user(answ, (int __user *)arg);
3139 		}
3140 		break;
3141 #endif
3142 	default:
3143 		err = -ENOIOCTLCMD;
3144 		break;
3145 	}
3146 	return err;
3147 }
3148 
3149 #ifdef CONFIG_COMPAT
/* Compat ioctl: convert the compat user pointer and reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3154 #endif
3155 
3156 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3157 {
3158 	struct sock *sk = sock->sk;
3159 	__poll_t mask;
3160 	u8 shutdown;
3161 
3162 	sock_poll_wait(file, sock, wait);
3163 	mask = 0;
3164 	shutdown = READ_ONCE(sk->sk_shutdown);
3165 
3166 	/* exceptional events? */
3167 	if (READ_ONCE(sk->sk_err))
3168 		mask |= EPOLLERR;
3169 	if (shutdown == SHUTDOWN_MASK)
3170 		mask |= EPOLLHUP;
3171 	if (shutdown & RCV_SHUTDOWN)
3172 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3173 
3174 	/* readable? */
3175 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3176 		mask |= EPOLLIN | EPOLLRDNORM;
3177 	if (sk_is_readable(sk))
3178 		mask |= EPOLLIN | EPOLLRDNORM;
3179 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3180 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3181 		mask |= EPOLLPRI;
3182 #endif
3183 
3184 	/* Connection-based need to check for termination and startup */
3185 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3186 	    sk->sk_state == TCP_CLOSE)
3187 		mask |= EPOLLHUP;
3188 
3189 	/*
3190 	 * we set writable also when the other side has shut down the
3191 	 * connection. This prevents stuck sockets.
3192 	 */
3193 	if (unix_writable(sk))
3194 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3195 
3196 	return mask;
3197 }
3198 
3199 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3200 				    poll_table *wait)
3201 {
3202 	struct sock *sk = sock->sk, *other;
3203 	unsigned int writable;
3204 	__poll_t mask;
3205 	u8 shutdown;
3206 
3207 	sock_poll_wait(file, sock, wait);
3208 	mask = 0;
3209 	shutdown = READ_ONCE(sk->sk_shutdown);
3210 
3211 	/* exceptional events? */
3212 	if (READ_ONCE(sk->sk_err) ||
3213 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3214 		mask |= EPOLLERR |
3215 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3216 
3217 	if (shutdown & RCV_SHUTDOWN)
3218 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3219 	if (shutdown == SHUTDOWN_MASK)
3220 		mask |= EPOLLHUP;
3221 
3222 	/* readable? */
3223 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3224 		mask |= EPOLLIN | EPOLLRDNORM;
3225 	if (sk_is_readable(sk))
3226 		mask |= EPOLLIN | EPOLLRDNORM;
3227 
3228 	/* Connection-based need to check for termination and startup */
3229 	if (sk->sk_type == SOCK_SEQPACKET) {
3230 		if (sk->sk_state == TCP_CLOSE)
3231 			mask |= EPOLLHUP;
3232 		/* connection hasn't started yet? */
3233 		if (sk->sk_state == TCP_SYN_SENT)
3234 			return mask;
3235 	}
3236 
3237 	/* No write status requested, avoid expensive OUT tests. */
3238 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3239 		return mask;
3240 
3241 	writable = unix_writable(sk);
3242 	if (writable) {
3243 		unix_state_lock(sk);
3244 
3245 		other = unix_peer(sk);
3246 		if (other && unix_peer(other) != sk &&
3247 		    unix_recvq_full_lockless(other) &&
3248 		    unix_dgram_peer_wake_me(sk, other))
3249 			writable = 0;
3250 
3251 		unix_state_unlock(sk);
3252 	}
3253 
3254 	if (writable)
3255 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3256 	else
3257 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3258 
3259 	return mask;
3260 }
3261 
3262 #ifdef CONFIG_PROC_FS
3263 
/*
 * A seq_file position encodes the hash bucket in the bits above
 * BUCKET_SPACE and a 1-based offset within that bucket in the low
 * BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
3269 
3270 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3271 {
3272 	unsigned long offset = get_offset(*pos);
3273 	unsigned long bucket = get_bucket(*pos);
3274 	unsigned long count = 0;
3275 	struct sock *sk;
3276 
3277 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3278 	     sk; sk = sk_next(sk)) {
3279 		if (++count == offset)
3280 			break;
3281 	}
3282 
3283 	return sk;
3284 }
3285 
3286 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3287 {
3288 	unsigned long bucket = get_bucket(*pos);
3289 	struct net *net = seq_file_net(seq);
3290 	struct sock *sk;
3291 
3292 	while (bucket < UNIX_HASH_SIZE) {
3293 		spin_lock(&net->unx.table.locks[bucket]);
3294 
3295 		sk = unix_from_bucket(seq, pos);
3296 		if (sk)
3297 			return sk;
3298 
3299 		spin_unlock(&net->unx.table.locks[bucket]);
3300 
3301 		*pos = set_bucket_offset(++bucket, 1);
3302 	}
3303 
3304 	return NULL;
3305 }
3306 
3307 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3308 				  loff_t *pos)
3309 {
3310 	unsigned long bucket = get_bucket(*pos);
3311 
3312 	sk = sk_next(sk);
3313 	if (sk)
3314 		return sk;
3315 
3316 
3317 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3318 
3319 	*pos = set_bucket_offset(++bucket, 1);
3320 
3321 	return unix_get_first(seq, pos);
3322 }
3323 
3324 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3325 {
3326 	if (!*pos)
3327 		return SEQ_START_TOKEN;
3328 
3329 	return unix_get_first(seq, pos);
3330 }
3331 
3332 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3333 {
3334 	++*pos;
3335 
3336 	if (v == SEQ_START_TOKEN)
3337 		return unix_get_first(seq, pos);
3338 
3339 	return unix_get_next(seq, v, pos);
3340 }
3341 
3342 static void unix_seq_stop(struct seq_file *seq, void *v)
3343 {
3344 	struct sock *sk = v;
3345 
3346 	if (sk)
3347 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3348 }
3349 
3350 static int unix_seq_show(struct seq_file *seq, void *v)
3351 {
3352 
3353 	if (v == SEQ_START_TOKEN)
3354 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3355 			 "Inode Path\n");
3356 	else {
3357 		struct sock *s = v;
3358 		struct unix_sock *u = unix_sk(s);
3359 		unix_state_lock(s);
3360 
3361 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3362 			s,
3363 			refcount_read(&s->sk_refcnt),
3364 			0,
3365 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3366 			s->sk_type,
3367 			s->sk_socket ?
3368 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3369 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3370 			sock_i_ino(s));
3371 
3372 		if (u->addr) {	// under a hash table lock here
3373 			int i, len;
3374 			seq_putc(seq, ' ');
3375 
3376 			i = 0;
3377 			len = u->addr->len -
3378 				offsetof(struct sockaddr_un, sun_path);
3379 			if (u->addr->name->sun_path[0]) {
3380 				len--;
3381 			} else {
3382 				seq_putc(seq, '@');
3383 				i++;
3384 			}
3385 			for ( ; i < len; i++)
3386 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3387 					 '@');
3388 		}
3389 		unix_state_unlock(s);
3390 		seq_putc(seq, '\n');
3391 	}
3392 
3393 	return 0;
3394 }
3395 
3396 static const struct seq_operations unix_seq_ops = {
3397 	.start  = unix_seq_start,
3398 	.next   = unix_seq_next,
3399 	.stop   = unix_seq_stop,
3400 	.show   = unix_seq_show,
3401 };
3402 
3403 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
3404 struct bpf_unix_iter_state {
3405 	struct seq_net_private p;
3406 	unsigned int cur_sk;
3407 	unsigned int end_sk;
3408 	unsigned int max_sk;
3409 	struct sock **batch;
3410 	bool st_bucket_done;
3411 };
3412 
3413 struct bpf_iter__unix {
3414 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3415 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3416 	uid_t uid __aligned(8);
3417 };
3418 
3419 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3420 			      struct unix_sock *unix_sk, uid_t uid)
3421 {
3422 	struct bpf_iter__unix ctx;
3423 
3424 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3425 	ctx.meta = meta;
3426 	ctx.unix_sk = unix_sk;
3427 	ctx.uid = uid;
3428 	return bpf_iter_run_prog(prog, &ctx);
3429 }
3430 
3431 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3432 
3433 {
3434 	struct bpf_unix_iter_state *iter = seq->private;
3435 	unsigned int expected = 1;
3436 	struct sock *sk;
3437 
3438 	sock_hold(start_sk);
3439 	iter->batch[iter->end_sk++] = start_sk;
3440 
3441 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3442 		if (iter->end_sk < iter->max_sk) {
3443 			sock_hold(sk);
3444 			iter->batch[iter->end_sk++] = sk;
3445 		}
3446 
3447 		expected++;
3448 	}
3449 
3450 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3451 
3452 	return expected;
3453 }
3454 
3455 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3456 {
3457 	while (iter->cur_sk < iter->end_sk)
3458 		sock_put(iter->batch[iter->cur_sk++]);
3459 }
3460 
3461 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3462 				       unsigned int new_batch_sz)
3463 {
3464 	struct sock **new_batch;
3465 
3466 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3467 			     GFP_USER | __GFP_NOWARN);
3468 	if (!new_batch)
3469 		return -ENOMEM;
3470 
3471 	bpf_iter_unix_put_batch(iter);
3472 	kvfree(iter->batch);
3473 	iter->batch = new_batch;
3474 	iter->max_sk = new_batch_sz;
3475 
3476 	return 0;
3477 }
3478 
3479 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3480 					loff_t *pos)
3481 {
3482 	struct bpf_unix_iter_state *iter = seq->private;
3483 	unsigned int expected;
3484 	bool resized = false;
3485 	struct sock *sk;
3486 
3487 	if (iter->st_bucket_done)
3488 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3489 
3490 again:
3491 	/* Get a new batch */
3492 	iter->cur_sk = 0;
3493 	iter->end_sk = 0;
3494 
3495 	sk = unix_get_first(seq, pos);
3496 	if (!sk)
3497 		return NULL; /* Done */
3498 
3499 	expected = bpf_iter_unix_hold_batch(seq, sk);
3500 
3501 	if (iter->end_sk == expected) {
3502 		iter->st_bucket_done = true;
3503 		return sk;
3504 	}
3505 
3506 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3507 		resized = true;
3508 		goto again;
3509 	}
3510 
3511 	return sk;
3512 }
3513 
3514 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3515 {
3516 	if (!*pos)
3517 		return SEQ_START_TOKEN;
3518 
3519 	/* bpf iter does not support lseek, so it always
3520 	 * continue from where it was stop()-ped.
3521 	 */
3522 	return bpf_iter_unix_batch(seq, pos);
3523 }
3524 
3525 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3526 {
3527 	struct bpf_unix_iter_state *iter = seq->private;
3528 	struct sock *sk;
3529 
3530 	/* Whenever seq_next() is called, the iter->cur_sk is
3531 	 * done with seq_show(), so advance to the next sk in
3532 	 * the batch.
3533 	 */
3534 	if (iter->cur_sk < iter->end_sk)
3535 		sock_put(iter->batch[iter->cur_sk++]);
3536 
3537 	++*pos;
3538 
3539 	if (iter->cur_sk < iter->end_sk)
3540 		sk = iter->batch[iter->cur_sk];
3541 	else
3542 		sk = bpf_iter_unix_batch(seq, pos);
3543 
3544 	return sk;
3545 }
3546 
3547 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3548 {
3549 	struct bpf_iter_meta meta;
3550 	struct bpf_prog *prog;
3551 	struct sock *sk = v;
3552 	uid_t uid;
3553 	bool slow;
3554 	int ret;
3555 
3556 	if (v == SEQ_START_TOKEN)
3557 		return 0;
3558 
3559 	slow = lock_sock_fast(sk);
3560 
3561 	if (unlikely(sk_unhashed(sk))) {
3562 		ret = SEQ_SKIP;
3563 		goto unlock;
3564 	}
3565 
3566 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3567 	meta.seq = seq;
3568 	prog = bpf_iter_get_info(&meta, false);
3569 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3570 unlock:
3571 	unlock_sock_fast(sk, slow);
3572 	return ret;
3573 }
3574 
3575 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3576 {
3577 	struct bpf_unix_iter_state *iter = seq->private;
3578 	struct bpf_iter_meta meta;
3579 	struct bpf_prog *prog;
3580 
3581 	if (!v) {
3582 		meta.seq = seq;
3583 		prog = bpf_iter_get_info(&meta, true);
3584 		if (prog)
3585 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3586 	}
3587 
3588 	if (iter->cur_sk < iter->end_sk)
3589 		bpf_iter_unix_put_batch(iter);
3590 }
3591 
3592 static const struct seq_operations bpf_iter_unix_seq_ops = {
3593 	.start	= bpf_iter_unix_seq_start,
3594 	.next	= bpf_iter_unix_seq_next,
3595 	.stop	= bpf_iter_unix_seq_stop,
3596 	.show	= bpf_iter_unix_seq_show,
3597 };
3598 #endif
3599 #endif
3600 
3601 static const struct net_proto_family unix_family_ops = {
3602 	.family = PF_UNIX,
3603 	.create = unix_create,
3604 	.owner	= THIS_MODULE,
3605 };
3606 
3607 
3608 static int __net_init unix_net_init(struct net *net)
3609 {
3610 	int i;
3611 
3612 	net->unx.sysctl_max_dgram_qlen = 10;
3613 	if (unix_sysctl_register(net))
3614 		goto out;
3615 
3616 #ifdef CONFIG_PROC_FS
3617 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3618 			     sizeof(struct seq_net_private)))
3619 		goto err_sysctl;
3620 #endif
3621 
3622 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3623 					      sizeof(spinlock_t), GFP_KERNEL);
3624 	if (!net->unx.table.locks)
3625 		goto err_proc;
3626 
3627 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3628 						sizeof(struct hlist_head),
3629 						GFP_KERNEL);
3630 	if (!net->unx.table.buckets)
3631 		goto free_locks;
3632 
3633 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3634 		spin_lock_init(&net->unx.table.locks[i]);
3635 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3636 	}
3637 
3638 	return 0;
3639 
3640 free_locks:
3641 	kvfree(net->unx.table.locks);
3642 err_proc:
3643 #ifdef CONFIG_PROC_FS
3644 	remove_proc_entry("unix", net->proc_net);
3645 err_sysctl:
3646 #endif
3647 	unix_sysctl_unregister(net);
3648 out:
3649 	return -ENOMEM;
3650 }
3651 
3652 static void __net_exit unix_net_exit(struct net *net)
3653 {
3654 	kvfree(net->unx.table.buckets);
3655 	kvfree(net->unx.table.locks);
3656 	unix_sysctl_unregister(net);
3657 	remove_proc_entry("unix", net->proc_net);
3658 }
3659 
3660 static struct pernet_operations unix_net_ops = {
3661 	.init = unix_net_init,
3662 	.exit = unix_net_exit,
3663 };
3664 
3665 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3666 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3667 		     struct unix_sock *unix_sk, uid_t uid)
3668 
3669 #define INIT_BATCH_SZ 16
3670 
3671 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3672 {
3673 	struct bpf_unix_iter_state *iter = priv_data;
3674 	int err;
3675 
3676 	err = bpf_iter_init_seq_net(priv_data, aux);
3677 	if (err)
3678 		return err;
3679 
3680 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3681 	if (err) {
3682 		bpf_iter_fini_seq_net(priv_data);
3683 		return err;
3684 	}
3685 
3686 	return 0;
3687 }
3688 
3689 static void bpf_iter_fini_unix(void *priv_data)
3690 {
3691 	struct bpf_unix_iter_state *iter = priv_data;
3692 
3693 	bpf_iter_fini_seq_net(priv_data);
3694 	kvfree(iter->batch);
3695 }
3696 
3697 static const struct bpf_iter_seq_info unix_seq_info = {
3698 	.seq_ops		= &bpf_iter_unix_seq_ops,
3699 	.init_seq_private	= bpf_iter_init_unix,
3700 	.fini_seq_private	= bpf_iter_fini_unix,
3701 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3702 };
3703 
3704 static const struct bpf_func_proto *
3705 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3706 			     const struct bpf_prog *prog)
3707 {
3708 	switch (func_id) {
3709 	case BPF_FUNC_setsockopt:
3710 		return &bpf_sk_setsockopt_proto;
3711 	case BPF_FUNC_getsockopt:
3712 		return &bpf_sk_getsockopt_proto;
3713 	default:
3714 		return NULL;
3715 	}
3716 }
3717 
3718 static struct bpf_iter_reg unix_reg_info = {
3719 	.target			= "unix",
3720 	.ctx_arg_info_size	= 1,
3721 	.ctx_arg_info		= {
3722 		{ offsetof(struct bpf_iter__unix, unix_sk),
3723 		  PTR_TO_BTF_ID_OR_NULL },
3724 	},
3725 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3726 	.seq_info		= &unix_seq_info,
3727 };
3728 
3729 static void __init bpf_iter_register(void)
3730 {
3731 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3732 	if (bpf_iter_reg_target(&unix_reg_info))
3733 		pr_warn("Warning: could not register bpf iterator unix\n");
3734 }
3735 #endif
3736 
3737 static int __init af_unix_init(void)
3738 {
3739 	int i, rc = -1;
3740 
3741 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3742 
3743 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3744 		spin_lock_init(&bsd_socket_locks[i]);
3745 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3746 	}
3747 
3748 	rc = proto_register(&unix_dgram_proto, 1);
3749 	if (rc != 0) {
3750 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3751 		goto out;
3752 	}
3753 
3754 	rc = proto_register(&unix_stream_proto, 1);
3755 	if (rc != 0) {
3756 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3757 		proto_unregister(&unix_dgram_proto);
3758 		goto out;
3759 	}
3760 
3761 	sock_register(&unix_family_ops);
3762 	register_pernet_subsys(&unix_net_ops);
3763 	unix_bpf_build_proto();
3764 
3765 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3766 	bpf_iter_register();
3767 #endif
3768 
3769 out:
3770 	return rc;
3771 }
3772 
3773 static void __exit af_unix_exit(void)
3774 {
3775 	sock_unregister(PF_UNIX);
3776 	proto_unregister(&unix_dgram_proto);
3777 	proto_unregister(&unix_stream_proto);
3778 	unregister_pernet_subsys(&unix_net_ops);
3779 }
3780 
/* Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket, but later than subsys_initcall() because
 * we depend on things initialised there.
 */
3785 fs_initcall(af_unix_init);
3786 module_exit(af_unix_exit);
3787 
3788 MODULE_LICENSE("GPL");
3789 MODULE_ALIAS_NETPROTO(PF_UNIX);
3790