xref: /openbmc/linux/net/unix/af_unix.c (revision 0cb4228f)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge number
 *					of socks hashed (this is for unix_gc()
 *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/freezer.h>
116 #include <linux/file.h>
117 #include <linux/btf_ids.h>
118 
119 #include "scm.h"
120 
/* Running count of AF_UNIX sockets: incremented in unix_create1() and
 * decremented in unix_sock_destructor() (or on unix_create1() failure).
 */
static atomic_long_t unix_nr_socks;
/* Buckets and matching per-bucket locks for sockets looked up by inode
 * (see unix_insert_bsd_socket() and unix_find_socket_byinode()).
 */
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124 
125 /* SMP locking strategy:
126  *    hash table is protected with spinlock.
127  *    each socket state is protected by separate spinlock.
128  */
129 
130 static unsigned int unix_unbound_hash(struct sock *sk)
131 {
132 	unsigned long hash = (unsigned long)sk;
133 
134 	hash ^= hash >> 16;
135 	hash ^= hash >> 8;
136 	hash ^= sk->sk_type;
137 
138 	return hash & UNIX_HASH_MOD;
139 }
140 
141 static unsigned int unix_bsd_hash(struct inode *i)
142 {
143 	return i->i_ino & UNIX_HASH_MOD;
144 }
145 
146 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
147 				       int addr_len, int type)
148 {
149 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
150 	unsigned int hash;
151 
152 	hash = (__force unsigned int)csum_fold(csum);
153 	hash ^= hash >> 8;
154 	hash ^= type;
155 
156 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
157 }
158 
159 static void unix_table_double_lock(struct net *net,
160 				   unsigned int hash1, unsigned int hash2)
161 {
162 	if (hash1 == hash2) {
163 		spin_lock(&net->unx.table.locks[hash1]);
164 		return;
165 	}
166 
167 	if (hash1 > hash2)
168 		swap(hash1, hash2);
169 
170 	spin_lock(&net->unx.table.locks[hash1]);
171 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
172 }
173 
174 static void unix_table_double_unlock(struct net *net,
175 				     unsigned int hash1, unsigned int hash2)
176 {
177 	if (hash1 == hash2) {
178 		spin_unlock(&net->unx.table.locks[hash1]);
179 		return;
180 	}
181 
182 	spin_unlock(&net->unx.table.locks[hash1]);
183 	spin_unlock(&net->unx.table.locks[hash2]);
184 }
185 
186 #ifdef CONFIG_SECURITY_NETWORK
187 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
188 {
189 	UNIXCB(skb).secid = scm->secid;
190 }
191 
192 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
193 {
194 	scm->secid = UNIXCB(skb).secid;
195 }
196 
197 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
198 {
199 	return (scm->secid == UNIXCB(skb).secid);
200 }
201 #else
202 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
203 { }
204 
205 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
206 { }
207 
208 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
209 {
210 	return true;
211 }
212 #endif /* CONFIG_SECURITY_NETWORK */
213 
/* The peer pointer of the unix_sock behind @sk. Expands to an lvalue,
 * so it is used both for reading and for assignment in this file.
 */
#define unix_peer(sk) (unix_sk(sk)->peer)
215 
/* Non-zero when @osk's peer pointer refers to @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return sk == unix_peer(osk);
}
220 
/* Non-zero when @osk has no peer at all, or when its peer is @sk. */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	if (!unix_peer(osk))
		return 1;

	return unix_our_peer(sk, osk);
}
225 
226 static inline int unix_recvq_full(const struct sock *sk)
227 {
228 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
229 }
230 
231 static inline int unix_recvq_full_lockless(const struct sock *sk)
232 {
233 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
234 		READ_ONCE(sk->sk_max_ack_backlog);
235 }
236 
237 struct sock *unix_peer_get(struct sock *s)
238 {
239 	struct sock *peer;
240 
241 	unix_state_lock(s);
242 	peer = unix_peer(s);
243 	if (peer)
244 		sock_hold(peer);
245 	unix_state_unlock(s);
246 	return peer;
247 }
248 EXPORT_SYMBOL_GPL(unix_peer_get);
249 
250 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
251 					     int addr_len)
252 {
253 	struct unix_address *addr;
254 
255 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
256 	if (!addr)
257 		return NULL;
258 
259 	refcount_set(&addr->refcnt, 1);
260 	addr->len = addr_len;
261 	memcpy(addr->name, sunaddr, addr_len);
262 
263 	return addr;
264 }
265 
266 static inline void unix_release_addr(struct unix_address *addr)
267 {
268 	if (refcount_dec_and_test(&addr->refcnt))
269 		kfree(addr);
270 }
271 
272 /*
273  *	Check unix socket name:
274  *		- should be not zero length.
275  *	        - if started by not zero, should be NULL terminated (FS object)
276  *		- if started by zero, it is abstract name.
277  */
278 
279 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
280 {
281 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
282 	    addr_len > sizeof(*sunaddr))
283 		return -EINVAL;
284 
285 	if (sunaddr->sun_family != AF_UNIX)
286 		return -EINVAL;
287 
288 	return 0;
289 }
290 
/* Write a terminating NUL at byte offset @addr_len of the address. */
static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	char *raw = (char *)sunaddr;

	/* Not an off-by-one, although it may look like one. 108 is the
	 * longest valid AF_UNIX path for a binding, so sun_path[108] is
	 * outside struct sockaddr_un. The store is still in bounds in
	 * kernel space because syscall functions always pass a pointer to
	 * a struct sockaddr_storage, whose buffer is bigger than 108.
	 */
	raw[addr_len] = '\0';
}
303 
/* Unlink @sk from its hash chain; takes no lock itself (see
 * unix_remove_socket() for the locked variant).
 */
static void __unix_remove_socket(struct sock *sk)
{
	(void)sk_del_node_init(sk);
}
308 
309 static void __unix_insert_socket(struct net *net, struct sock *sk)
310 {
311 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
312 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
313 }
314 
315 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
316 				 struct unix_address *addr, unsigned int hash)
317 {
318 	__unix_remove_socket(sk);
319 	smp_store_release(&unix_sk(sk)->addr, addr);
320 
321 	sk->sk_hash = hash;
322 	__unix_insert_socket(net, sk);
323 }
324 
325 static void unix_remove_socket(struct net *net, struct sock *sk)
326 {
327 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
328 	__unix_remove_socket(sk);
329 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
330 }
331 
332 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
333 {
334 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
335 	__unix_insert_socket(net, sk);
336 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
337 }
338 
339 static void unix_insert_bsd_socket(struct sock *sk)
340 {
341 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
342 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
343 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
344 }
345 
346 static void unix_remove_bsd_socket(struct sock *sk)
347 {
348 	if (!hlist_unhashed(&sk->sk_bind_node)) {
349 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
350 		__sk_del_bind_node(sk);
351 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
352 
353 		sk_node_init(&sk->sk_bind_node);
354 	}
355 }
356 
357 static struct sock *__unix_find_socket_byname(struct net *net,
358 					      struct sockaddr_un *sunname,
359 					      int len, unsigned int hash)
360 {
361 	struct sock *s;
362 
363 	sk_for_each(s, &net->unx.table.buckets[hash]) {
364 		struct unix_sock *u = unix_sk(s);
365 
366 		if (u->addr->len == len &&
367 		    !memcmp(u->addr->name, sunname, len))
368 			return s;
369 	}
370 	return NULL;
371 }
372 
373 static inline struct sock *unix_find_socket_byname(struct net *net,
374 						   struct sockaddr_un *sunname,
375 						   int len, unsigned int hash)
376 {
377 	struct sock *s;
378 
379 	spin_lock(&net->unx.table.locks[hash]);
380 	s = __unix_find_socket_byname(net, sunname, len, hash);
381 	if (s)
382 		sock_hold(s);
383 	spin_unlock(&net->unx.table.locks[hash]);
384 	return s;
385 }
386 
387 static struct sock *unix_find_socket_byinode(struct inode *i)
388 {
389 	unsigned int hash = unix_bsd_hash(i);
390 	struct sock *s;
391 
392 	spin_lock(&bsd_socket_locks[hash]);
393 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
394 		struct dentry *dentry = unix_sk(s)->path.dentry;
395 
396 		if (dentry && d_backing_inode(dentry) == i) {
397 			sock_hold(s);
398 			spin_unlock(&bsd_socket_locks[hash]);
399 			return s;
400 		}
401 	}
402 	spin_unlock(&bsd_socket_locks[hash]);
403 	return NULL;
404 }
405 
406 /* Support code for asymmetrically connected dgram sockets
407  *
408  * If a datagram socket is connected to a socket not itself connected
409  * to the first socket (eg, /dev/log), clients may only enqueue more
410  * messages if the present receive queue of the server socket is not
411  * "too large". This means there's a second writeability condition
412  * poll and sendmsg need to test. The dgram recv code will do a wake
413  * up on the peer_wait wait queue of a socket upon reception of a
414  * datagram which needs to be propagated to sleeping would-be writers
415  * since these might not have sent anything so far. This can't be
416  * accomplished via poll_wait because the lifetime of the server
417  * socket might be less than that of its clients if these break their
418  * association with it or if the server socket is closed while clients
419  * are still connected to it and there's no way to inform "a polling
420  * implementation" that it should let go of a certain wait queue
421  *
422  * In order to propagate a wake up, a wait_queue_entry_t of the client
423  * socket is enqueued on the peer_wait queue of the server socket
424  * whose wake function does a wake_up on the ordinary client socket
425  * wait queue. This connection is established whenever a write (or
426  * poll for write) hit the flow control condition and broken when the
427  * association to the server socket is dissolved or after a wake up
428  * was relayed.
429  */
430 
431 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
432 				      void *key)
433 {
434 	struct unix_sock *u;
435 	wait_queue_head_t *u_sleep;
436 
437 	u = container_of(q, struct unix_sock, peer_wake);
438 
439 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
440 			    q);
441 	u->peer_wake.private = NULL;
442 
443 	/* relaying can only happen while the wq still exists */
444 	u_sleep = sk_sleep(&u->sk);
445 	if (u_sleep)
446 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
447 
448 	return 0;
449 }
450 
451 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
452 {
453 	struct unix_sock *u, *u_other;
454 	int rc;
455 
456 	u = unix_sk(sk);
457 	u_other = unix_sk(other);
458 	rc = 0;
459 	spin_lock(&u_other->peer_wait.lock);
460 
461 	if (!u->peer_wake.private) {
462 		u->peer_wake.private = other;
463 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
464 
465 		rc = 1;
466 	}
467 
468 	spin_unlock(&u_other->peer_wait.lock);
469 	return rc;
470 }
471 
472 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
473 					    struct sock *other)
474 {
475 	struct unix_sock *u, *u_other;
476 
477 	u = unix_sk(sk);
478 	u_other = unix_sk(other);
479 	spin_lock(&u_other->peer_wait.lock);
480 
481 	if (u->peer_wake.private == other) {
482 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
483 		u->peer_wake.private = NULL;
484 	}
485 
486 	spin_unlock(&u_other->peer_wait.lock);
487 }
488 
489 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
490 						   struct sock *other)
491 {
492 	unix_dgram_peer_wake_disconnect(sk, other);
493 	wake_up_interruptible_poll(sk_sleep(sk),
494 				   EPOLLOUT |
495 				   EPOLLWRNORM |
496 				   EPOLLWRBAND);
497 }
498 
499 /* preconditions:
500  *	- unix_peer(sk) == other
501  *	- association is stable
502  */
503 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
504 {
505 	int connected;
506 
507 	connected = unix_dgram_peer_wake_connect(sk, other);
508 
509 	/* If other is SOCK_DEAD, we want to make sure we signal
510 	 * POLLOUT, such that a subsequent write() can get a
511 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
512 	 * to other and its full, we will hang waiting for POLLOUT.
513 	 */
514 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
515 		return 1;
516 
517 	if (connected)
518 		unix_dgram_peer_wake_disconnect(sk, other);
519 
520 	return 0;
521 }
522 
523 static int unix_writable(const struct sock *sk)
524 {
525 	return sk->sk_state != TCP_LISTEN &&
526 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
527 }
528 
529 static void unix_write_space(struct sock *sk)
530 {
531 	struct socket_wq *wq;
532 
533 	rcu_read_lock();
534 	if (unix_writable(sk)) {
535 		wq = rcu_dereference(sk->sk_wq);
536 		if (skwq_has_sleeper(wq))
537 			wake_up_interruptible_sync_poll(&wq->wait,
538 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
539 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
540 	}
541 	rcu_read_unlock();
542 }
543 
544 /* When dgram socket disconnects (or changes its peer), we clear its receive
545  * queue of packets arrived from previous peer. First, it allows to do
546  * flow control based only on wmem_alloc; second, sk connected to peer
547  * may receive messages only from that peer. */
548 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
549 {
550 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
551 		skb_queue_purge(&sk->sk_receive_queue);
552 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
553 
554 		/* If one link of bidirectional dgram pipe is disconnected,
555 		 * we signal error. Messages are lost. Do not make this,
556 		 * when peer was not connected to us.
557 		 */
558 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
559 			other->sk_err = ECONNRESET;
560 			sk_error_report(other);
561 		}
562 	}
563 	other->sk_state = TCP_CLOSE;
564 }
565 
566 static void unix_sock_destructor(struct sock *sk)
567 {
568 	struct unix_sock *u = unix_sk(sk);
569 
570 	skb_queue_purge(&sk->sk_receive_queue);
571 
572 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
573 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
574 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
575 	if (!sock_flag(sk, SOCK_DEAD)) {
576 		pr_info("Attempt to release alive unix socket: %p\n", sk);
577 		return;
578 	}
579 
580 	if (u->addr)
581 		unix_release_addr(u->addr);
582 
583 	atomic_long_dec(&unix_nr_socks);
584 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
585 #ifdef UNIX_REFCNT_DEBUG
586 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
587 		atomic_long_read(&unix_nr_socks));
588 #endif
589 }
590 
/* Tear down @sk: unhash it, detach it from its peer, flush its receive
 * queue and drop the reference held on it.
 *
 * @sk:      socket being released.
 * @embrion: non-zero when called recursively for a socket still sitting
 *           on a listener's receive queue; in that case the peer of a
 *           stream/seqpacket socket always gets ECONNRESET.
 */
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	/* Take the socket out of both lookup tables first. */
	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	/* Keep a private copy of the path so it can be put after unlock. */
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	/* Remember the old state: a former listener needs extra cleanup. */
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	/* Free a pending out-of-band skb, if any. */
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			/* Unread data (or an embryo) means the peer lost
			 * something: report a reset rather than a clean EOF.
			 */
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* On a former listener each queued skb stands for a pending
		 * connection; release that socket too, as an embryo.
		 */
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}
673 
674 static void init_peercred(struct sock *sk)
675 {
676 	const struct cred *old_cred;
677 	struct pid *old_pid;
678 
679 	spin_lock(&sk->sk_peer_lock);
680 	old_pid = sk->sk_peer_pid;
681 	old_cred = sk->sk_peer_cred;
682 	sk->sk_peer_pid  = get_pid(task_tgid(current));
683 	sk->sk_peer_cred = get_current_cred();
684 	spin_unlock(&sk->sk_peer_lock);
685 
686 	put_pid(old_pid);
687 	put_cred(old_cred);
688 }
689 
690 static void copy_peercred(struct sock *sk, struct sock *peersk)
691 {
692 	const struct cred *old_cred;
693 	struct pid *old_pid;
694 
695 	if (sk < peersk) {
696 		spin_lock(&sk->sk_peer_lock);
697 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
698 	} else {
699 		spin_lock(&peersk->sk_peer_lock);
700 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
701 	}
702 	old_pid = sk->sk_peer_pid;
703 	old_cred = sk->sk_peer_cred;
704 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
705 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
706 
707 	spin_unlock(&sk->sk_peer_lock);
708 	spin_unlock(&peersk->sk_peer_lock);
709 
710 	put_pid(old_pid);
711 	put_cred(old_cred);
712 }
713 
714 static int unix_listen(struct socket *sock, int backlog)
715 {
716 	int err;
717 	struct sock *sk = sock->sk;
718 	struct unix_sock *u = unix_sk(sk);
719 
720 	err = -EOPNOTSUPP;
721 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
722 		goto out;	/* Only stream/seqpacket sockets accept */
723 	err = -EINVAL;
724 	if (!u->addr)
725 		goto out;	/* No listens on an unbound socket */
726 	unix_state_lock(sk);
727 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
728 		goto out_unlock;
729 	if (backlog > sk->sk_max_ack_backlog)
730 		wake_up_interruptible_all(&u->peer_wait);
731 	sk->sk_max_ack_backlog	= backlog;
732 	sk->sk_state		= TCP_LISTEN;
733 	/* set credentials so connect can copy them */
734 	init_peercred(sk);
735 	err = 0;
736 
737 out_unlock:
738 	unix_state_unlock(sk);
739 out:
740 	return err;
741 }
742 
/* Forward declarations for the handlers referenced by the proto_ops
 * tables (unix_stream_ops, unix_dgram_ops, unix_seqpacket_ops) that are
 * defined before the handlers themselves.
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
				    size_t size, int flags);
static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);
774 
775 static int unix_set_peek_off(struct sock *sk, int val)
776 {
777 	struct unix_sock *u = unix_sk(sk);
778 
779 	if (mutex_lock_interruptible(&u->iolock))
780 		return -EINTR;
781 
782 	sk->sk_peek_off = val;
783 	mutex_unlock(&u->iolock);
784 
785 	return 0;
786 }
787 
788 #ifdef CONFIG_PROC_FS
789 static int unix_count_nr_fds(struct sock *sk)
790 {
791 	struct sk_buff *skb;
792 	struct unix_sock *u;
793 	int nr_fds = 0;
794 
795 	spin_lock(&sk->sk_receive_queue.lock);
796 	skb = skb_peek(&sk->sk_receive_queue);
797 	while (skb) {
798 		u = unix_sk(skb->sk);
799 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
800 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
801 	}
802 	spin_unlock(&sk->sk_receive_queue.lock);
803 
804 	return nr_fds;
805 }
806 
807 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
808 {
809 	struct sock *sk = sock->sk;
810 	struct unix_sock *u;
811 	int nr_fds;
812 
813 	if (sk) {
814 		u = unix_sk(sk);
815 		if (sock->type == SOCK_DGRAM) {
816 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
817 			goto out_print;
818 		}
819 
820 		unix_state_lock(sk);
821 		if (sk->sk_state != TCP_LISTEN)
822 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
823 		else
824 			nr_fds = unix_count_nr_fds(sk);
825 		unix_state_unlock(sk);
826 out_print:
827 		seq_printf(m, "scm_fds: %u\n", nr_fds);
828 	}
829 }
830 #else
831 #define unix_show_fdinfo NULL
832 #endif
833 
/* Operations table that unix_create() installs on SOCK_STREAM sockets.
 * Of the three tables in this file it is the only one that provides
 * sendpage and splice_read handlers of its own.
 */
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.sendpage =	unix_stream_sendpage,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
859 
/* Operations table that unix_create() installs on SOCK_DGRAM sockets
 * (and on SOCK_RAW requests, which unix_create() turns into SOCK_DGRAM).
 * accept, listen and sendpage are wired to the sock_no_* stubs.
 */
static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
884 
/* Operations table that unix_create() installs on SOCK_SEQPACKET
 * sockets. It reuses the stream connect/accept/listen handlers and the
 * datagram poll handler, has its own sendmsg/recvmsg, and sets no
 * read_skb or splice_read entry.
 */
static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
908 
/* Intentionally empty ->close() hook: unix sockets have no work to do
 * here, the hook exists merely for sockmap. unix_release() invokes it
 * through sk->sk_prot->close before releasing the socket.
 */
static void unix_close(struct sock *sk, long timeout)
{
}
915 
/* Intentionally empty ->unhash() hook: unix sockets have no work to do
 * here, the hook exists merely for sockmap.
 */
static void unix_unhash(struct sock *sk)
{
}
922 
/* struct proto that unix_create1() passes to sk_alloc() for every type
 * other than SOCK_STREAM. Unlike unix_stream_proto it sets no .unhash
 * hook.
 */
struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};
932 
/* struct proto that unix_create1() passes to sk_alloc() for SOCK_STREAM
 * sockets. Both .close and .unhash point at the empty hooks defined
 * above.
 */
struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.unhash			= unix_unhash,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};
943 
944 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
945 {
946 	struct unix_sock *u;
947 	struct sock *sk;
948 	int err;
949 
950 	atomic_long_inc(&unix_nr_socks);
951 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
952 		err = -ENFILE;
953 		goto err;
954 	}
955 
956 	if (type == SOCK_STREAM)
957 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
958 	else /*dgram and  seqpacket */
959 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
960 
961 	if (!sk) {
962 		err = -ENOMEM;
963 		goto err;
964 	}
965 
966 	sock_init_data(sock, sk);
967 
968 	sk->sk_hash		= unix_unbound_hash(sk);
969 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
970 	sk->sk_write_space	= unix_write_space;
971 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
972 	sk->sk_destruct		= unix_sock_destructor;
973 	u	  = unix_sk(sk);
974 	u->path.dentry = NULL;
975 	u->path.mnt = NULL;
976 	spin_lock_init(&u->lock);
977 	atomic_long_set(&u->inflight, 0);
978 	INIT_LIST_HEAD(&u->link);
979 	mutex_init(&u->iolock); /* single task reading lock */
980 	mutex_init(&u->bindlock); /* single task binding lock */
981 	init_waitqueue_head(&u->peer_wait);
982 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
983 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
984 	unix_insert_unbound_socket(net, sk);
985 
986 	sock_prot_inuse_add(net, sk->sk_prot, 1);
987 
988 	return sk;
989 
990 err:
991 	atomic_long_dec(&unix_nr_socks);
992 	return ERR_PTR(err);
993 }
994 
995 static int unix_create(struct net *net, struct socket *sock, int protocol,
996 		       int kern)
997 {
998 	struct sock *sk;
999 
1000 	if (protocol && protocol != PF_UNIX)
1001 		return -EPROTONOSUPPORT;
1002 
1003 	sock->state = SS_UNCONNECTED;
1004 
1005 	switch (sock->type) {
1006 	case SOCK_STREAM:
1007 		sock->ops = &unix_stream_ops;
1008 		break;
1009 		/*
1010 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1011 		 *	nothing uses it.
1012 		 */
1013 	case SOCK_RAW:
1014 		sock->type = SOCK_DGRAM;
1015 		fallthrough;
1016 	case SOCK_DGRAM:
1017 		sock->ops = &unix_dgram_ops;
1018 		break;
1019 	case SOCK_SEQPACKET:
1020 		sock->ops = &unix_seqpacket_ops;
1021 		break;
1022 	default:
1023 		return -ESOCKTNOSUPPORT;
1024 	}
1025 
1026 	sk = unix_create1(net, sock, kern, sock->type);
1027 	if (IS_ERR(sk))
1028 		return PTR_ERR(sk);
1029 
1030 	return 0;
1031 }
1032 
1033 static int unix_release(struct socket *sock)
1034 {
1035 	struct sock *sk = sock->sk;
1036 
1037 	if (!sk)
1038 		return 0;
1039 
1040 	sk->sk_prot->close(sk, 0);
1041 	unix_release_sock(sk, 0);
1042 	sock->sk = NULL;
1043 
1044 	return 0;
1045 }
1046 
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * The caller needs write permission on the path and the path must name a
 * socket inode that has a sock of type @type attached.
 *
 * Returns the sock found by unix_find_socket_byinode() on success, or an
 * ERR_PTR(): the error from kern_path()/path_permission(), -ECONNREFUSED
 * if the path is not a socket or no sock is attached to its inode, or
 * -EPROTOTYPE if the socket type does not match.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);

	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		return ERR_PTR(err);

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto out_path;

	/* From here on, anything missing means nobody is listening. */
	err = -ECONNREFUSED;

	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto out_path;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto out_path;

	if (sk->sk_type != type) {
		/* Right inode, wrong kind of socket: drop the sock again. */
		err = -EPROTOTYPE;
		sock_put(sk);
		goto out_path;
	}

	touch_atime(&path);
	path_put(&path);

	return sk;

out_path:
	path_put(&path);
	return ERR_PTR(err);
}
1090 
/*
 * Look up a socket by its name in the abstract hash of @net.
 *
 * Returns the sock found by unix_find_socket_byname(), or
 * ERR_PTR(-ECONNREFUSED) when no socket carries that name.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct sock *found;

	found = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!found)
		return ERR_PTR(-ECONNREFUSED);

	/* Refresh the access time when the sock has a dentry recorded. */
	if (unix_sk(found)->path.dentry)
		touch_atime(&unix_sk(found)->path);

	return found;
}
1109 
1110 static struct sock *unix_find_other(struct net *net,
1111 				    struct sockaddr_un *sunaddr,
1112 				    int addr_len, int type)
1113 {
1114 	struct sock *sk;
1115 
1116 	if (sunaddr->sun_path[0])
1117 		sk = unix_find_bsd(sunaddr, addr_len, type);
1118 	else
1119 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1120 
1121 	return sk;
1122 }
1123 
/*
 * Bind @sk to an automatically chosen abstract name.
 *
 * The name is a leading NUL byte followed by five hex digits.  The search
 * starts right after a random 20-bit number and walks upwards, wrapping
 * around, until a name is found that __unix_find_socket_byname() does not
 * know yet.
 *
 * Returns 0 on success or when the socket already has an address, the
 * error from mutex_lock_interruptible() if the bind lock could not be
 * taken, -ENOMEM if the address cannot be allocated, or -ENOSPC once
 * every candidate name has been found in use.
 */
static int unix_autobind(struct sock *sk)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	/* Already bound: nothing to do, err is still 0 here. */
	if (u->addr)
		goto out;

	err = -ENOMEM;
	/* Zeroed allocation, so sun_path[0] stays NUL. */
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	/* Name length: one leading NUL byte plus five hex digits. */
	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	/*
	 * lastnum is the random starting point.  Each attempt increments
	 * first, so the starting value itself is the last one tried.
	 */
	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seems to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	/* The name is free: install it while both hash locks are held. */
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
1184 
/*
 * Bind @sk to the filesystem path given in @sunaddr.
 *
 * The socket node is created with vfs_mknod() first; only afterwards is
 * the bind lock taken and the socket rehashed.  If taking the bind lock
 * fails, or the socket turns out to be bound already (-EINVAL), the node
 * that was just created is unlinked again.
 *
 * Returns 0 on success or a negative errno.  An -EEXIST result is
 * reported to the caller as -EADDRINUSE.
 */
static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	/* Socket file type plus the socket inode's mode bits, minus umask. */
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct user_namespace *ns; // barf...
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	/*
	 * Recompute the length from the string itself: header, path and
	 * one terminating NUL byte.
	 * NOTE(review): relies on unix_mkname_bsd() leaving sun_path
	 * NUL-terminated so that strlen() is safe - confirm against its
	 * definition.
	 */
	unix_mkname_bsd(sunaddr, addr_len);
	addr_len = strlen(sunaddr->sun_path) +
		offsetof(struct sockaddr_un, sun_path) + 1;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	ns = mnt_user_ns(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	/* The node exists now; every failure below must unlink it again. */
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	/* Record the path and switch to the inode-based hash. */
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	/* Only reached when the socket already had an address. */
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}
1255 
1256 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1257 			      int addr_len)
1258 {
1259 	unsigned int new_hash, old_hash = sk->sk_hash;
1260 	struct unix_sock *u = unix_sk(sk);
1261 	struct net *net = sock_net(sk);
1262 	struct unix_address *addr;
1263 	int err;
1264 
1265 	addr = unix_create_addr(sunaddr, addr_len);
1266 	if (!addr)
1267 		return -ENOMEM;
1268 
1269 	err = mutex_lock_interruptible(&u->bindlock);
1270 	if (err)
1271 		goto out;
1272 
1273 	if (u->addr) {
1274 		err = -EINVAL;
1275 		goto out_mutex;
1276 	}
1277 
1278 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1279 	unix_table_double_lock(net, old_hash, new_hash);
1280 
1281 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1282 		goto out_spin;
1283 
1284 	__unix_set_addr_hash(net, sk, addr, new_hash);
1285 	unix_table_double_unlock(net, old_hash, new_hash);
1286 	mutex_unlock(&u->bindlock);
1287 	return 0;
1288 
1289 out_spin:
1290 	unix_table_double_unlock(net, old_hash, new_hash);
1291 	err = -EADDRINUSE;
1292 out_mutex:
1293 	mutex_unlock(&u->bindlock);
1294 out:
1295 	unix_release_addr(addr);
1296 	return err;
1297 }
1298 
1299 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1300 {
1301 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1302 	struct sock *sk = sock->sk;
1303 	int err;
1304 
1305 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1306 	    sunaddr->sun_family == AF_UNIX)
1307 		return unix_autobind(sk);
1308 
1309 	err = unix_validate_addr(sunaddr, addr_len);
1310 	if (err)
1311 		return err;
1312 
1313 	if (sunaddr->sun_path[0])
1314 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1315 	else
1316 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1317 
1318 	return err;
1319 }
1320 
/*
 * Take the state locks of @sk1 and @sk2.
 *
 * If @sk2 is NULL or the same sock as @sk1, only @sk1 is locked.
 * Otherwise the sock at the lower address is locked first and the other
 * one is taken with the nested variant.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first, *second;

	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 < sk2) {
		first = sk1;
		second = sk2;
	} else {
		first = sk2;
		second = sk1;
	}

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
1335 
/*
 * Release the state locks of @sk1 and @sk2.
 *
 * @sk1 is always unlocked; @sk2 only when it is a distinct, non-NULL sock.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (unlikely(sk1 == sk2) || !sk2)
		return;

	unix_state_unlock(sk2);
}
1345 
/*
 * connect() for datagram-style sockets: set, replace or clear the peer.
 *
 * With an address family other than AF_UNSPEC the target is looked up and
 * installed as the new peer.  With AF_UNSPEC the current peer, if there is
 * one, is dropped and the socket state goes back to TCP_CLOSE.
 *
 * Returns 0 on success, -EINVAL if @alen is too short to hold sa_family,
 * -EPERM if unix_may_send() refuses the target, or the error reported by
 * address validation, autobind, the lookup or the security hook.
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		/* With SOCK_PASSCRED set, an unbound socket is autobound. */
		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		/* With a NULL second sock this locks sk only. */
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			sk->sk_state = TCP_CLOSE;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		/* Reconnecting to the same peer is not a disconnect. */
		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		/* Drop the reference that the old peer pointer was holding. */
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
1432 
/*
 * Wait for room in @other's receive queue.
 *
 * Called with @other's state lock held.  The lock is released here and is
 * not taken again before returning.
 *
 * The task is put on @other's peer_wait queue first and the decision to
 * sleep is made while the state lock is still held.  It sleeps only if
 * @other is not dead, has not shut down receiving, and its receive queue
 * is full.
 *
 * Returns the value of schedule_timeout() if the task slept, otherwise
 * @timeo unchanged.
 */
static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
1454 
/*
 * Connect @sock to the listening socket named by @uaddr.
 *
 * A new sock (newsk) and a one byte skb are allocated up front.  Once the
 * listener is found and both state locks are held, newsk becomes the peer
 * of @sock and the skb is queued on the listener's receive queue, from
 * where unix_accept() takes it.
 *
 * Returns 0 on success or a negative errno:
 *   -ENOMEM        the skb could not be allocated
 *   -ECONNREFUSED  the target is not listening or has shut down receiving
 *   -EAGAIN        the listener's queue is full and the timeout is zero
 *   -EISCONN       @sock is already connected
 *   -EINVAL        @sock is in any state other than TCP_CLOSE
 * as well as errors from address validation, autobind, unix_create1(),
 * the lookup, the security hook, or sock_intr_errno() after a signal.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;
	int st;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	 * If we did it after the state is locked, we would have to
	 * recheck everything again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		/* Keep the cleanup at "out" from releasing an ERR_PTR. */
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* Drops other's state lock; it is not held afterwards. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		/* "out" still drops the reference on other. */
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.
	 *
	 * It is a tricky place. We need to grab our state lock and cannot
	 * drop the lock on the peer. It is dangerous because deadlock is
	 * possible. The connect-to-self case and simultaneous connect
	 * attempts are eliminated by checking the socket state: other is
	 * TCP_LISTEN, and if sk is TCP_LISTEN we find that out before
	 * attempting to grab the lock.
	 *
	 * Well, and we have to recheck the state after the socket is locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	/* The state changed while we were not holding our lock: start over. */
	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	/* Reference held by newsk's peer pointer. */
	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	/* Reference held by sk's peer pointer. */
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1647 
1648 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1649 {
1650 	struct sock *ska = socka->sk, *skb = sockb->sk;
1651 
1652 	/* Join our sockets back to back */
1653 	sock_hold(ska);
1654 	sock_hold(skb);
1655 	unix_peer(ska) = skb;
1656 	unix_peer(skb) = ska;
1657 	init_peercred(ska);
1658 	init_peercred(skb);
1659 
1660 	ska->sk_state = TCP_ESTABLISHED;
1661 	skb->sk_state = TCP_ESTABLISHED;
1662 	socka->state  = SS_CONNECTED;
1663 	sockb->state  = SS_CONNECTED;
1664 	return 0;
1665 }
1666 
1667 static void unix_sock_inherit_flags(const struct socket *old,
1668 				    struct socket *new)
1669 {
1670 	if (test_bit(SOCK_PASSCRED, &old->flags))
1671 		set_bit(SOCK_PASSCRED, &new->flags);
1672 	if (test_bit(SOCK_PASSSEC, &old->flags))
1673 		set_bit(SOCK_PASSSEC, &new->flags);
1674 }
1675 
1676 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1677 		       bool kern)
1678 {
1679 	struct sock *sk = sock->sk;
1680 	struct sock *tsk;
1681 	struct sk_buff *skb;
1682 	int err;
1683 
1684 	err = -EOPNOTSUPP;
1685 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1686 		goto out;
1687 
1688 	err = -EINVAL;
1689 	if (sk->sk_state != TCP_LISTEN)
1690 		goto out;
1691 
1692 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1693 	 * so that no locks are necessary.
1694 	 */
1695 
1696 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1697 				&err);
1698 	if (!skb) {
1699 		/* This means receive shutdown. */
1700 		if (err == 0)
1701 			err = -EINVAL;
1702 		goto out;
1703 	}
1704 
1705 	tsk = skb->sk;
1706 	skb_free_datagram(sk, skb);
1707 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1708 
1709 	/* attach accepted sock to socket */
1710 	unix_state_lock(tsk);
1711 	newsock->state = SS_CONNECTED;
1712 	unix_sock_inherit_flags(sock, newsock);
1713 	sock_graft(tsk, newsock);
1714 	unix_state_unlock(tsk);
1715 	return 0;
1716 
1717 out:
1718 	return err;
1719 }
1720 
1721 
1722 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1723 {
1724 	struct sock *sk = sock->sk;
1725 	struct unix_address *addr;
1726 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1727 	int err = 0;
1728 
1729 	if (peer) {
1730 		sk = unix_peer_get(sk);
1731 
1732 		err = -ENOTCONN;
1733 		if (!sk)
1734 			goto out;
1735 		err = 0;
1736 	} else {
1737 		sock_hold(sk);
1738 	}
1739 
1740 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1741 	if (!addr) {
1742 		sunaddr->sun_family = AF_UNIX;
1743 		sunaddr->sun_path[0] = 0;
1744 		err = offsetof(struct sockaddr_un, sun_path);
1745 	} else {
1746 		err = addr->len;
1747 		memcpy(sunaddr, addr->name, addr->len);
1748 	}
1749 	sock_put(sk);
1750 out:
1751 	return err;
1752 }
1753 
/*
 * MSG_PEEK helper: duplicate the fd list attached to @skb into @scm, then
 * synchronise with the garbage collector by taking and releasing
 * unix_gc_lock.
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	/*
	 * Garbage collection of unix sockets starts by selecting a set of
	 * candidate sockets which have reference only from being in flight
	 * (total_refs == inflight_refs).  This condition is checked once during
	 * the candidate collection phase, and candidates are marked as such, so
	 * that non-candidates can later be ignored.  While inflight_refs is
	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
	 * is an instantaneous decision.
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operation that changes the file count through file descriptors
	 * (dup, close, sendmsg) does not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeuing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd.  The following lock/unlock
	 * pair is to ensure serialization with garbage collection.  It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate.  If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
	 */
	spin_lock(&unix_gc_lock);
	spin_unlock(&unix_gc_lock);
}
1800 
1801 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1802 {
1803 	int err = 0;
1804 
1805 	UNIXCB(skb).pid  = get_pid(scm->pid);
1806 	UNIXCB(skb).uid = scm->creds.uid;
1807 	UNIXCB(skb).gid = scm->creds.gid;
1808 	UNIXCB(skb).fp = NULL;
1809 	unix_get_secdata(scm, skb);
1810 	if (scm->fp && send_fds)
1811 		err = unix_attach_fds(scm, skb);
1812 
1813 	skb->destructor = unix_destruct_scm;
1814 	return err;
1815 }
1816 
1817 static bool unix_passcred_enabled(const struct socket *sock,
1818 				  const struct sock *other)
1819 {
1820 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1821 	       !other->sk_socket ||
1822 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1823 }
1824 
1825 /*
1826  * Some apps rely on write() giving SCM_CREDENTIALS
1827  * We include credentials if source or destination socket
1828  * asserted SOCK_PASSCRED.
1829  */
1830 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1831 			    const struct sock *other)
1832 {
1833 	if (UNIXCB(skb).pid)
1834 		return;
1835 	if (unix_passcred_enabled(sock, other)) {
1836 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1837 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1838 	}
1839 }
1840 
1841 static int maybe_init_creds(struct scm_cookie *scm,
1842 			    struct socket *socket,
1843 			    const struct sock *other)
1844 {
1845 	int err;
1846 	struct msghdr msg = { .msg_controllen = 0 };
1847 
1848 	err = scm_send(socket, &msg, scm, false);
1849 	if (err)
1850 		return err;
1851 
1852 	if (unix_passcred_enabled(socket, other)) {
1853 		scm->pid = get_pid(task_tgid(current));
1854 		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1855 	}
1856 	return err;
1857 }
1858 
1859 static bool unix_skb_scm_eq(struct sk_buff *skb,
1860 			    struct scm_cookie *scm)
1861 {
1862 	return UNIXCB(skb).pid == scm->pid &&
1863 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1864 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1865 	       unix_secdata_eq(scm, skb);
1866 }
1867 
1868 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1869 {
1870 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1871 	struct unix_sock *u = unix_sk(sk);
1872 
1873 	if (unlikely(fp && fp->count))
1874 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1875 }
1876 
1877 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1878 {
1879 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1880 	struct unix_sock *u = unix_sk(sk);
1881 
1882 	if (unlikely(fp && fp->count))
1883 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1884 }
1885 
/*
 *	Send AF_UNIX data.
 *
 * Sends one datagram.  The destination is msg->msg_name when an address
 * is supplied, otherwise the connected peer.  The whole message is copied
 * into a single skb before the receiver is looked up and locked.
 *
 * Returns @len on success (also when the receiver's socket filter drops
 * the packet) or a negative errno:
 *   -EOPNOTSUPP    MSG_OOB was requested
 *   -ENOTCONN      no address given and no peer connected
 *   -EMSGSIZE      @len exceeds sk_sndbuf - 32
 *   -ECONNRESET    the peer went away and no address is available to retry
 *   -EPERM         unix_may_send() refused the receiver
 *   -EPIPE         the receiver shut down receiving, or a SOCK_SEQPACKET
 *                  peer is dead
 *   -ECONNREFUSED  the connected SOCK_DGRAM peer is dead
 *   -EAGAIN        the receive queue is full and the timeout is zero
 * as well as errors from scm_send(), address validation, autobind, skb
 * allocation, the data copy, the lookup, the security hook, or
 * sock_intr_errno() after a signal.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;
	} else {
		/* No address given: send to the connected peer. */
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	/* Anything beyond SKB_MAX_ALLOC goes into the paged part of the skb. */
	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	/* other is NULL on the first pass with an address, or after a dead
	 * peer has been dropped below.
	 */
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	/* sk_locked records whether sk's state lock is held besides other's. */
	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
			 */
			unix_state_unlock(sk);
			err = -EPIPE;
		} else if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			sk->sk_state = TCP_CLOSE;
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			/* Drop the reference the peer pointer was holding. */
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		/* err == 0 here: the dead sock was not our peer, look it up again. */
		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			/* Drops other's state lock; other itself stays set, so
			 * the restart skips the lookup.
			 */
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		/* Non-blocking: retake the locks so that sk is locked as well. */
		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		/* The locks were dropped above, so redo the checks. */
		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}
2100 
/* Stream sockets use paged skbs.  The paged part of one skb is limited to
 * 32768 bytes, but never to less than one full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2105 
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Queue one byte of out-of-band data on @other.
 *
 * One byte is copied from @msg into a fresh skb.  Under @other's state
 * lock the skb is recorded as the receiver's oob_skb, replacing any
 * previous one, and is appended to the receive queue; the receiver is
 * then signalled with sk_send_sigurg().
 *
 * Returns 0 on success, the error from the skb allocation or the data
 * copy, or -EPIPE if @other is dead or has shut down receiving.
 */
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other)
{
	struct unix_sock *ousk = unix_sk(other);
	struct sk_buff *skb;
	int err = 0;

	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);

	if (!skb)
		return err;

	skb_put(skb, 1);
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);

	if (err) {
		kfree_skb(skb);
		return err;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    (other->sk_shutdown & RCV_SHUTDOWN)) {
		unix_state_unlock(other);
		kfree_skb(skb);
		return -EPIPE;
	}

	maybe_add_creds(skb, sock, other);
	/* Extra reference: the skb is held by oob_skb and by the queue. */
	skb_get(skb);

	/* Drop the reference held for the previous OOB skb, if any. */
	if (ousk->oob_skb)
		consume_skb(ousk->oob_skb);

	WRITE_ONCE(ousk->oob_skb, skb);

	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	sk_send_sigurg(other);
	unix_state_unlock(other);
	other->sk_data_ready(other);

	return err;
}
#endif
2152 
2153 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2154 			       size_t len)
2155 {
2156 	struct sock *sk = sock->sk;
2157 	struct sock *other = NULL;
2158 	int err, size;
2159 	struct sk_buff *skb;
2160 	int sent = 0;
2161 	struct scm_cookie scm;
2162 	bool fds_sent = false;
2163 	int data_len;
2164 
2165 	wait_for_unix_gc();
2166 	err = scm_send(sock, msg, &scm, false);
2167 	if (err < 0)
2168 		return err;
2169 
2170 	err = -EOPNOTSUPP;
2171 	if (msg->msg_flags & MSG_OOB) {
2172 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2173 		if (len)
2174 			len--;
2175 		else
2176 #endif
2177 			goto out_err;
2178 	}
2179 
2180 	if (msg->msg_namelen) {
2181 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2182 		goto out_err;
2183 	} else {
2184 		err = -ENOTCONN;
2185 		other = unix_peer(sk);
2186 		if (!other)
2187 			goto out_err;
2188 	}
2189 
2190 	if (sk->sk_shutdown & SEND_SHUTDOWN)
2191 		goto pipe_err;
2192 
2193 	while (sent < len) {
2194 		size = len - sent;
2195 
2196 		/* Keep two messages in the pipe so it schedules better */
2197 		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2198 
2199 		/* allow fallback to order-0 allocations */
2200 		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2201 
2202 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2203 
2204 		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2205 
2206 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2207 					   msg->msg_flags & MSG_DONTWAIT, &err,
2208 					   get_order(UNIX_SKB_FRAGS_SZ));
2209 		if (!skb)
2210 			goto out_err;
2211 
2212 		/* Only send the fds in the first buffer */
2213 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2214 		if (err < 0) {
2215 			kfree_skb(skb);
2216 			goto out_err;
2217 		}
2218 		fds_sent = true;
2219 
2220 		skb_put(skb, size - data_len);
2221 		skb->data_len = data_len;
2222 		skb->len = size;
2223 		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2224 		if (err) {
2225 			kfree_skb(skb);
2226 			goto out_err;
2227 		}
2228 
2229 		unix_state_lock(other);
2230 
2231 		if (sock_flag(other, SOCK_DEAD) ||
2232 		    (other->sk_shutdown & RCV_SHUTDOWN))
2233 			goto pipe_err_free;
2234 
2235 		maybe_add_creds(skb, sock, other);
2236 		scm_stat_add(other, skb);
2237 		skb_queue_tail(&other->sk_receive_queue, skb);
2238 		unix_state_unlock(other);
2239 		other->sk_data_ready(other);
2240 		sent += size;
2241 	}
2242 
2243 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2244 	if (msg->msg_flags & MSG_OOB) {
2245 		err = queue_oob(sock, msg, other);
2246 		if (err)
2247 			goto out_err;
2248 		sent++;
2249 	}
2250 #endif
2251 
2252 	scm_destroy(&scm);
2253 
2254 	return sent;
2255 
2256 pipe_err_free:
2257 	unix_state_unlock(other);
2258 	kfree_skb(skb);
2259 pipe_err:
2260 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2261 		send_sig(SIGPIPE, current, 0);
2262 	err = -EPIPE;
2263 out_err:
2264 	scm_destroy(&scm);
2265 	return sent ? : err;
2266 }
2267 
2268 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
2269 				    int offset, size_t size, int flags)
2270 {
2271 	int err;
2272 	bool send_sigpipe = false;
2273 	bool init_scm = true;
2274 	struct scm_cookie scm;
2275 	struct sock *other, *sk = socket->sk;
2276 	struct sk_buff *skb, *newskb = NULL, *tail = NULL;
2277 
2278 	if (flags & MSG_OOB)
2279 		return -EOPNOTSUPP;
2280 
2281 	other = unix_peer(sk);
2282 	if (!other || sk->sk_state != TCP_ESTABLISHED)
2283 		return -ENOTCONN;
2284 
2285 	if (false) {
2286 alloc_skb:
2287 		unix_state_unlock(other);
2288 		mutex_unlock(&unix_sk(other)->iolock);
2289 		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
2290 					      &err, 0);
2291 		if (!newskb)
2292 			goto err;
2293 	}
2294 
2295 	/* we must acquire iolock as we modify already present
2296 	 * skbs in the sk_receive_queue and mess with skb->len
2297 	 */
2298 	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
2299 	if (err) {
2300 		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2301 		goto err;
2302 	}
2303 
2304 	if (sk->sk_shutdown & SEND_SHUTDOWN) {
2305 		err = -EPIPE;
2306 		send_sigpipe = true;
2307 		goto err_unlock;
2308 	}
2309 
2310 	unix_state_lock(other);
2311 
2312 	if (sock_flag(other, SOCK_DEAD) ||
2313 	    other->sk_shutdown & RCV_SHUTDOWN) {
2314 		err = -EPIPE;
2315 		send_sigpipe = true;
2316 		goto err_state_unlock;
2317 	}
2318 
2319 	if (init_scm) {
2320 		err = maybe_init_creds(&scm, socket, other);
2321 		if (err)
2322 			goto err_state_unlock;
2323 		init_scm = false;
2324 	}
2325 
2326 	skb = skb_peek_tail(&other->sk_receive_queue);
2327 	if (tail && tail == skb) {
2328 		skb = newskb;
2329 	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2330 		if (newskb) {
2331 			skb = newskb;
2332 		} else {
2333 			tail = skb;
2334 			goto alloc_skb;
2335 		}
2336 	} else if (newskb) {
2337 		/* this is fast path, we don't necessarily need to
2338 		 * call to kfree_skb even though with newskb == NULL
2339 		 * this - does no harm
2340 		 */
2341 		consume_skb(newskb);
2342 		newskb = NULL;
2343 	}
2344 
2345 	if (skb_append_pagefrags(skb, page, offset, size)) {
2346 		tail = skb;
2347 		goto alloc_skb;
2348 	}
2349 
2350 	skb->len += size;
2351 	skb->data_len += size;
2352 	skb->truesize += size;
2353 	refcount_add(size, &sk->sk_wmem_alloc);
2354 
2355 	if (newskb) {
2356 		err = unix_scm_to_skb(&scm, skb, false);
2357 		if (err)
2358 			goto err_state_unlock;
2359 		spin_lock(&other->sk_receive_queue.lock);
2360 		__skb_queue_tail(&other->sk_receive_queue, newskb);
2361 		spin_unlock(&other->sk_receive_queue.lock);
2362 	}
2363 
2364 	unix_state_unlock(other);
2365 	mutex_unlock(&unix_sk(other)->iolock);
2366 
2367 	other->sk_data_ready(other);
2368 	scm_destroy(&scm);
2369 	return size;
2370 
2371 err_state_unlock:
2372 	unix_state_unlock(other);
2373 err_unlock:
2374 	mutex_unlock(&unix_sk(other)->iolock);
2375 err:
2376 	kfree_skb(newskb);
2377 	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2378 		send_sig(SIGPIPE, current, 0);
2379 	if (!init_scm)
2380 		scm_destroy(&scm);
2381 	return err;
2382 }
2383 
2384 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2385 				  size_t len)
2386 {
2387 	int err;
2388 	struct sock *sk = sock->sk;
2389 
2390 	err = sock_error(sk);
2391 	if (err)
2392 		return err;
2393 
2394 	if (sk->sk_state != TCP_ESTABLISHED)
2395 		return -ENOTCONN;
2396 
2397 	if (msg->msg_namelen)
2398 		msg->msg_namelen = 0;
2399 
2400 	return unix_dgram_sendmsg(sock, msg, len);
2401 }
2402 
2403 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2404 				  size_t size, int flags)
2405 {
2406 	struct sock *sk = sock->sk;
2407 
2408 	if (sk->sk_state != TCP_ESTABLISHED)
2409 		return -ENOTCONN;
2410 
2411 	return unix_dgram_recvmsg(sock, msg, size, flags);
2412 }
2413 
2414 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2415 {
2416 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2417 
2418 	if (addr) {
2419 		msg->msg_namelen = addr->len;
2420 		memcpy(msg->msg_name, addr->name, addr->len);
2421 	}
2422 }
2423 
2424 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2425 			 int flags)
2426 {
2427 	struct scm_cookie scm;
2428 	struct socket *sock = sk->sk_socket;
2429 	struct unix_sock *u = unix_sk(sk);
2430 	struct sk_buff *skb, *last;
2431 	long timeo;
2432 	int skip;
2433 	int err;
2434 
2435 	err = -EOPNOTSUPP;
2436 	if (flags&MSG_OOB)
2437 		goto out;
2438 
2439 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2440 
2441 	do {
2442 		mutex_lock(&u->iolock);
2443 
2444 		skip = sk_peek_offset(sk, flags);
2445 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2446 					      &skip, &err, &last);
2447 		if (skb) {
2448 			if (!(flags & MSG_PEEK))
2449 				scm_stat_del(sk, skb);
2450 			break;
2451 		}
2452 
2453 		mutex_unlock(&u->iolock);
2454 
2455 		if (err != -EAGAIN)
2456 			break;
2457 	} while (timeo &&
2458 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2459 					      &err, &timeo, last));
2460 
2461 	if (!skb) { /* implies iolock unlocked */
2462 		unix_state_lock(sk);
2463 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2464 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2465 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2466 			err = 0;
2467 		unix_state_unlock(sk);
2468 		goto out;
2469 	}
2470 
2471 	if (wq_has_sleeper(&u->peer_wait))
2472 		wake_up_interruptible_sync_poll(&u->peer_wait,
2473 						EPOLLOUT | EPOLLWRNORM |
2474 						EPOLLWRBAND);
2475 
2476 	if (msg->msg_name)
2477 		unix_copy_addr(msg, skb->sk);
2478 
2479 	if (size > skb->len - skip)
2480 		size = skb->len - skip;
2481 	else if (size < skb->len - skip)
2482 		msg->msg_flags |= MSG_TRUNC;
2483 
2484 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2485 	if (err)
2486 		goto out_free;
2487 
2488 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2489 		__sock_recv_timestamp(msg, sk, skb);
2490 
2491 	memset(&scm, 0, sizeof(scm));
2492 
2493 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2494 	unix_set_secdata(&scm, skb);
2495 
2496 	if (!(flags & MSG_PEEK)) {
2497 		if (UNIXCB(skb).fp)
2498 			unix_detach_fds(&scm, skb);
2499 
2500 		sk_peek_offset_bwd(sk, skb->len);
2501 	} else {
2502 		/* It is questionable: on PEEK we could:
2503 		   - do not return fds - good, but too simple 8)
2504 		   - return fds, and do not return them on read (old strategy,
2505 		     apparently wrong)
2506 		   - clone fds (I chose it for now, it is the most universal
2507 		     solution)
2508 
2509 		   POSIX 1003.1g does not actually define this clearly
2510 		   at all. POSIX 1003.1g doesn't define a lot of things
2511 		   clearly however!
2512 
2513 		*/
2514 
2515 		sk_peek_offset_fwd(sk, size);
2516 
2517 		if (UNIXCB(skb).fp)
2518 			unix_peek_fds(&scm, skb);
2519 	}
2520 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2521 
2522 	scm_recv(sock, msg, &scm, flags);
2523 
2524 out_free:
2525 	skb_free_datagram(sk, skb);
2526 	mutex_unlock(&u->iolock);
2527 out:
2528 	return err;
2529 }
2530 
2531 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2532 			      int flags)
2533 {
2534 	struct sock *sk = sock->sk;
2535 
2536 #ifdef CONFIG_BPF_SYSCALL
2537 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2538 
2539 	if (prot != &unix_dgram_proto)
2540 		return prot->recvmsg(sk, msg, size, flags, NULL);
2541 #endif
2542 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2543 }
2544 
2545 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2546 {
2547 	struct unix_sock *u = unix_sk(sk);
2548 	struct sk_buff *skb;
2549 	int err, copied;
2550 
2551 	mutex_lock(&u->iolock);
2552 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2553 	mutex_unlock(&u->iolock);
2554 	if (!skb)
2555 		return err;
2556 
2557 	copied = recv_actor(sk, skb);
2558 	kfree_skb(skb);
2559 
2560 	return copied;
2561 }
2562 
2563 /*
2564  *	Sleep until more data has arrived. But check for races..
2565  */
2566 static long unix_stream_data_wait(struct sock *sk, long timeo,
2567 				  struct sk_buff *last, unsigned int last_len,
2568 				  bool freezable)
2569 {
2570 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2571 	struct sk_buff *tail;
2572 	DEFINE_WAIT(wait);
2573 
2574 	unix_state_lock(sk);
2575 
2576 	for (;;) {
2577 		prepare_to_wait(sk_sleep(sk), &wait, state);
2578 
2579 		tail = skb_peek_tail(&sk->sk_receive_queue);
2580 		if (tail != last ||
2581 		    (tail && tail->len != last_len) ||
2582 		    sk->sk_err ||
2583 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2584 		    signal_pending(current) ||
2585 		    !timeo)
2586 			break;
2587 
2588 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2589 		unix_state_unlock(sk);
2590 		timeo = schedule_timeout(timeo);
2591 		unix_state_lock(sk);
2592 
2593 		if (sock_flag(sk, SOCK_DEAD))
2594 			break;
2595 
2596 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2597 	}
2598 
2599 	finish_wait(sk_sleep(sk), &wait);
2600 	unix_state_unlock(sk);
2601 	return timeo;
2602 }
2603 
2604 static unsigned int unix_skb_len(const struct sk_buff *skb)
2605 {
2606 	return skb->len - UNIXCB(skb).consumed;
2607 }
2608 
2609 struct unix_stream_read_state {
2610 	int (*recv_actor)(struct sk_buff *, int, int,
2611 			  struct unix_stream_read_state *);
2612 	struct socket *socket;
2613 	struct msghdr *msg;
2614 	struct pipe_inode_info *pipe;
2615 	size_t size;
2616 	int flags;
2617 	unsigned int splice_flags;
2618 };
2619 
2620 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2621 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2622 {
2623 	struct socket *sock = state->socket;
2624 	struct sock *sk = sock->sk;
2625 	struct unix_sock *u = unix_sk(sk);
2626 	int chunk = 1;
2627 	struct sk_buff *oob_skb;
2628 
2629 	mutex_lock(&u->iolock);
2630 	unix_state_lock(sk);
2631 
2632 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2633 		unix_state_unlock(sk);
2634 		mutex_unlock(&u->iolock);
2635 		return -EINVAL;
2636 	}
2637 
2638 	oob_skb = u->oob_skb;
2639 
2640 	if (!(state->flags & MSG_PEEK))
2641 		WRITE_ONCE(u->oob_skb, NULL);
2642 
2643 	unix_state_unlock(sk);
2644 
2645 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2646 
2647 	if (!(state->flags & MSG_PEEK)) {
2648 		UNIXCB(oob_skb).consumed += 1;
2649 		kfree_skb(oob_skb);
2650 	}
2651 
2652 	mutex_unlock(&u->iolock);
2653 
2654 	if (chunk < 0)
2655 		return -EFAULT;
2656 
2657 	state->msg->msg_flags |= MSG_OOB;
2658 	return 1;
2659 }
2660 
2661 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2662 				  int flags, int copied)
2663 {
2664 	struct unix_sock *u = unix_sk(sk);
2665 
2666 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2667 		skb_unlink(skb, &sk->sk_receive_queue);
2668 		consume_skb(skb);
2669 		skb = NULL;
2670 	} else {
2671 		if (skb == u->oob_skb) {
2672 			if (copied) {
2673 				skb = NULL;
2674 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2675 				if (!(flags & MSG_PEEK)) {
2676 					WRITE_ONCE(u->oob_skb, NULL);
2677 					consume_skb(skb);
2678 				}
2679 			} else if (!(flags & MSG_PEEK)) {
2680 				skb_unlink(skb, &sk->sk_receive_queue);
2681 				consume_skb(skb);
2682 				skb = skb_peek(&sk->sk_receive_queue);
2683 			}
2684 		}
2685 	}
2686 	return skb;
2687 }
2688 #endif
2689 
2690 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2691 {
2692 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2693 		return -ENOTCONN;
2694 
2695 	return unix_read_skb(sk, recv_actor);
2696 }
2697 
2698 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2699 				    bool freezable)
2700 {
2701 	struct scm_cookie scm;
2702 	struct socket *sock = state->socket;
2703 	struct sock *sk = sock->sk;
2704 	struct unix_sock *u = unix_sk(sk);
2705 	int copied = 0;
2706 	int flags = state->flags;
2707 	int noblock = flags & MSG_DONTWAIT;
2708 	bool check_creds = false;
2709 	int target;
2710 	int err = 0;
2711 	long timeo;
2712 	int skip;
2713 	size_t size = state->size;
2714 	unsigned int last_len;
2715 
2716 	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2717 		err = -EINVAL;
2718 		goto out;
2719 	}
2720 
2721 	if (unlikely(flags & MSG_OOB)) {
2722 		err = -EOPNOTSUPP;
2723 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2724 		err = unix_stream_recv_urg(state);
2725 #endif
2726 		goto out;
2727 	}
2728 
2729 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2730 	timeo = sock_rcvtimeo(sk, noblock);
2731 
2732 	memset(&scm, 0, sizeof(scm));
2733 
2734 	/* Lock the socket to prevent queue disordering
2735 	 * while sleeps in memcpy_tomsg
2736 	 */
2737 	mutex_lock(&u->iolock);
2738 
2739 	skip = max(sk_peek_offset(sk, flags), 0);
2740 
2741 	do {
2742 		int chunk;
2743 		bool drop_skb;
2744 		struct sk_buff *skb, *last;
2745 
2746 redo:
2747 		unix_state_lock(sk);
2748 		if (sock_flag(sk, SOCK_DEAD)) {
2749 			err = -ECONNRESET;
2750 			goto unlock;
2751 		}
2752 		last = skb = skb_peek(&sk->sk_receive_queue);
2753 		last_len = last ? last->len : 0;
2754 
2755 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2756 		if (skb) {
2757 			skb = manage_oob(skb, sk, flags, copied);
2758 			if (!skb) {
2759 				unix_state_unlock(sk);
2760 				if (copied)
2761 					break;
2762 				goto redo;
2763 			}
2764 		}
2765 #endif
2766 again:
2767 		if (skb == NULL) {
2768 			if (copied >= target)
2769 				goto unlock;
2770 
2771 			/*
2772 			 *	POSIX 1003.1g mandates this order.
2773 			 */
2774 
2775 			err = sock_error(sk);
2776 			if (err)
2777 				goto unlock;
2778 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2779 				goto unlock;
2780 
2781 			unix_state_unlock(sk);
2782 			if (!timeo) {
2783 				err = -EAGAIN;
2784 				break;
2785 			}
2786 
2787 			mutex_unlock(&u->iolock);
2788 
2789 			timeo = unix_stream_data_wait(sk, timeo, last,
2790 						      last_len, freezable);
2791 
2792 			if (signal_pending(current)) {
2793 				err = sock_intr_errno(timeo);
2794 				scm_destroy(&scm);
2795 				goto out;
2796 			}
2797 
2798 			mutex_lock(&u->iolock);
2799 			goto redo;
2800 unlock:
2801 			unix_state_unlock(sk);
2802 			break;
2803 		}
2804 
2805 		while (skip >= unix_skb_len(skb)) {
2806 			skip -= unix_skb_len(skb);
2807 			last = skb;
2808 			last_len = skb->len;
2809 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2810 			if (!skb)
2811 				goto again;
2812 		}
2813 
2814 		unix_state_unlock(sk);
2815 
2816 		if (check_creds) {
2817 			/* Never glue messages from different writers */
2818 			if (!unix_skb_scm_eq(skb, &scm))
2819 				break;
2820 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2821 			/* Copy credentials */
2822 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2823 			unix_set_secdata(&scm, skb);
2824 			check_creds = true;
2825 		}
2826 
2827 		/* Copy address just once */
2828 		if (state->msg && state->msg->msg_name) {
2829 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2830 					 state->msg->msg_name);
2831 			unix_copy_addr(state->msg, skb->sk);
2832 			sunaddr = NULL;
2833 		}
2834 
2835 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2836 		skb_get(skb);
2837 		chunk = state->recv_actor(skb, skip, chunk, state);
2838 		drop_skb = !unix_skb_len(skb);
2839 		/* skb is only safe to use if !drop_skb */
2840 		consume_skb(skb);
2841 		if (chunk < 0) {
2842 			if (copied == 0)
2843 				copied = -EFAULT;
2844 			break;
2845 		}
2846 		copied += chunk;
2847 		size -= chunk;
2848 
2849 		if (drop_skb) {
2850 			/* the skb was touched by a concurrent reader;
2851 			 * we should not expect anything from this skb
2852 			 * anymore and assume it invalid - we can be
2853 			 * sure it was dropped from the socket queue
2854 			 *
2855 			 * let's report a short read
2856 			 */
2857 			err = 0;
2858 			break;
2859 		}
2860 
2861 		/* Mark read part of skb as used */
2862 		if (!(flags & MSG_PEEK)) {
2863 			UNIXCB(skb).consumed += chunk;
2864 
2865 			sk_peek_offset_bwd(sk, chunk);
2866 
2867 			if (UNIXCB(skb).fp) {
2868 				scm_stat_del(sk, skb);
2869 				unix_detach_fds(&scm, skb);
2870 			}
2871 
2872 			if (unix_skb_len(skb))
2873 				break;
2874 
2875 			skb_unlink(skb, &sk->sk_receive_queue);
2876 			consume_skb(skb);
2877 
2878 			if (scm.fp)
2879 				break;
2880 		} else {
2881 			/* It is questionable, see note in unix_dgram_recvmsg.
2882 			 */
2883 			if (UNIXCB(skb).fp)
2884 				unix_peek_fds(&scm, skb);
2885 
2886 			sk_peek_offset_fwd(sk, chunk);
2887 
2888 			if (UNIXCB(skb).fp)
2889 				break;
2890 
2891 			skip = 0;
2892 			last = skb;
2893 			last_len = skb->len;
2894 			unix_state_lock(sk);
2895 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2896 			if (skb)
2897 				goto again;
2898 			unix_state_unlock(sk);
2899 			break;
2900 		}
2901 	} while (size);
2902 
2903 	mutex_unlock(&u->iolock);
2904 	if (state->msg)
2905 		scm_recv(sock, state->msg, &scm, flags);
2906 	else
2907 		scm_destroy(&scm);
2908 out:
2909 	return copied ? : err;
2910 }
2911 
2912 static int unix_stream_read_actor(struct sk_buff *skb,
2913 				  int skip, int chunk,
2914 				  struct unix_stream_read_state *state)
2915 {
2916 	int ret;
2917 
2918 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2919 				    state->msg, chunk);
2920 	return ret ?: chunk;
2921 }
2922 
2923 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2924 			  size_t size, int flags)
2925 {
2926 	struct unix_stream_read_state state = {
2927 		.recv_actor = unix_stream_read_actor,
2928 		.socket = sk->sk_socket,
2929 		.msg = msg,
2930 		.size = size,
2931 		.flags = flags
2932 	};
2933 
2934 	return unix_stream_read_generic(&state, true);
2935 }
2936 
2937 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2938 			       size_t size, int flags)
2939 {
2940 	struct unix_stream_read_state state = {
2941 		.recv_actor = unix_stream_read_actor,
2942 		.socket = sock,
2943 		.msg = msg,
2944 		.size = size,
2945 		.flags = flags
2946 	};
2947 
2948 #ifdef CONFIG_BPF_SYSCALL
2949 	struct sock *sk = sock->sk;
2950 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2951 
2952 	if (prot != &unix_stream_proto)
2953 		return prot->recvmsg(sk, msg, size, flags, NULL);
2954 #endif
2955 	return unix_stream_read_generic(&state, true);
2956 }
2957 
2958 static int unix_stream_splice_actor(struct sk_buff *skb,
2959 				    int skip, int chunk,
2960 				    struct unix_stream_read_state *state)
2961 {
2962 	return skb_splice_bits(skb, state->socket->sk,
2963 			       UNIXCB(skb).consumed + skip,
2964 			       state->pipe, chunk, state->splice_flags);
2965 }
2966 
2967 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2968 				       struct pipe_inode_info *pipe,
2969 				       size_t size, unsigned int flags)
2970 {
2971 	struct unix_stream_read_state state = {
2972 		.recv_actor = unix_stream_splice_actor,
2973 		.socket = sock,
2974 		.pipe = pipe,
2975 		.size = size,
2976 		.splice_flags = flags,
2977 	};
2978 
2979 	if (unlikely(*ppos))
2980 		return -ESPIPE;
2981 
2982 	if (sock->file->f_flags & O_NONBLOCK ||
2983 	    flags & SPLICE_F_NONBLOCK)
2984 		state.flags = MSG_DONTWAIT;
2985 
2986 	return unix_stream_read_generic(&state, false);
2987 }
2988 
2989 static int unix_shutdown(struct socket *sock, int mode)
2990 {
2991 	struct sock *sk = sock->sk;
2992 	struct sock *other;
2993 
2994 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2995 		return -EINVAL;
2996 	/* This maps:
2997 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2998 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2999 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3000 	 */
3001 	++mode;
3002 
3003 	unix_state_lock(sk);
3004 	sk->sk_shutdown |= mode;
3005 	other = unix_peer(sk);
3006 	if (other)
3007 		sock_hold(other);
3008 	unix_state_unlock(sk);
3009 	sk->sk_state_change(sk);
3010 
3011 	if (other &&
3012 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3013 
3014 		int peer_mode = 0;
3015 		const struct proto *prot = READ_ONCE(other->sk_prot);
3016 
3017 		if (prot->unhash)
3018 			prot->unhash(other);
3019 		if (mode&RCV_SHUTDOWN)
3020 			peer_mode |= SEND_SHUTDOWN;
3021 		if (mode&SEND_SHUTDOWN)
3022 			peer_mode |= RCV_SHUTDOWN;
3023 		unix_state_lock(other);
3024 		other->sk_shutdown |= peer_mode;
3025 		unix_state_unlock(other);
3026 		other->sk_state_change(other);
3027 		if (peer_mode == SHUTDOWN_MASK)
3028 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3029 		else if (peer_mode & RCV_SHUTDOWN)
3030 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3031 	}
3032 	if (other)
3033 		sock_put(other);
3034 
3035 	return 0;
3036 }
3037 
3038 long unix_inq_len(struct sock *sk)
3039 {
3040 	struct sk_buff *skb;
3041 	long amount = 0;
3042 
3043 	if (sk->sk_state == TCP_LISTEN)
3044 		return -EINVAL;
3045 
3046 	spin_lock(&sk->sk_receive_queue.lock);
3047 	if (sk->sk_type == SOCK_STREAM ||
3048 	    sk->sk_type == SOCK_SEQPACKET) {
3049 		skb_queue_walk(&sk->sk_receive_queue, skb)
3050 			amount += unix_skb_len(skb);
3051 	} else {
3052 		skb = skb_peek(&sk->sk_receive_queue);
3053 		if (skb)
3054 			amount = skb->len;
3055 	}
3056 	spin_unlock(&sk->sk_receive_queue.lock);
3057 
3058 	return amount;
3059 }
3060 EXPORT_SYMBOL_GPL(unix_inq_len);
3061 
/* Amount of send memory currently charged to @sk, as reported by
 * sk_wmem_alloc_get().
 */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3067 
3068 static int unix_open_file(struct sock *sk)
3069 {
3070 	struct path path;
3071 	struct file *f;
3072 	int fd;
3073 
3074 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3075 		return -EPERM;
3076 
3077 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3078 		return -ENOENT;
3079 
3080 	path = unix_sk(sk)->path;
3081 	if (!path.dentry)
3082 		return -ENOENT;
3083 
3084 	path_get(&path);
3085 
3086 	fd = get_unused_fd_flags(O_CLOEXEC);
3087 	if (fd < 0)
3088 		goto out;
3089 
3090 	f = dentry_open(&path, O_PATH, current_cred());
3091 	if (IS_ERR(f)) {
3092 		put_unused_fd(fd);
3093 		fd = PTR_ERR(f);
3094 		goto out;
3095 	}
3096 
3097 	fd_install(fd, f);
3098 out:
3099 	path_put(&path);
3100 
3101 	return fd;
3102 }
3103 
3104 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3105 {
3106 	struct sock *sk = sock->sk;
3107 	long amount = 0;
3108 	int err;
3109 
3110 	switch (cmd) {
3111 	case SIOCOUTQ:
3112 		amount = unix_outq_len(sk);
3113 		err = put_user(amount, (int __user *)arg);
3114 		break;
3115 	case SIOCINQ:
3116 		amount = unix_inq_len(sk);
3117 		if (amount < 0)
3118 			err = amount;
3119 		else
3120 			err = put_user(amount, (int __user *)arg);
3121 		break;
3122 	case SIOCUNIXFILE:
3123 		err = unix_open_file(sk);
3124 		break;
3125 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3126 	case SIOCATMARK:
3127 		{
3128 			struct sk_buff *skb;
3129 			int answ = 0;
3130 
3131 			skb = skb_peek(&sk->sk_receive_queue);
3132 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3133 				answ = 1;
3134 			err = put_user(answ, (int __user *)arg);
3135 		}
3136 		break;
3137 #endif
3138 	default:
3139 		err = -ENOIOCTLCMD;
3140 		break;
3141 	}
3142 	return err;
3143 }
3144 
3145 #ifdef CONFIG_COMPAT
/* Compat ioctl(): convert the compat user pointer and reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3150 #endif
3151 
3152 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3153 {
3154 	struct sock *sk = sock->sk;
3155 	__poll_t mask;
3156 
3157 	sock_poll_wait(file, sock, wait);
3158 	mask = 0;
3159 
3160 	/* exceptional events? */
3161 	if (sk->sk_err)
3162 		mask |= EPOLLERR;
3163 	if (sk->sk_shutdown == SHUTDOWN_MASK)
3164 		mask |= EPOLLHUP;
3165 	if (sk->sk_shutdown & RCV_SHUTDOWN)
3166 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3167 
3168 	/* readable? */
3169 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3170 		mask |= EPOLLIN | EPOLLRDNORM;
3171 	if (sk_is_readable(sk))
3172 		mask |= EPOLLIN | EPOLLRDNORM;
3173 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3174 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3175 		mask |= EPOLLPRI;
3176 #endif
3177 
3178 	/* Connection-based need to check for termination and startup */
3179 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3180 	    sk->sk_state == TCP_CLOSE)
3181 		mask |= EPOLLHUP;
3182 
3183 	/*
3184 	 * we set writable also when the other side has shut down the
3185 	 * connection. This prevents stuck sockets.
3186 	 */
3187 	if (unix_writable(sk))
3188 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3189 
3190 	return mask;
3191 }
3192 
3193 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3194 				    poll_table *wait)
3195 {
3196 	struct sock *sk = sock->sk, *other;
3197 	unsigned int writable;
3198 	__poll_t mask;
3199 
3200 	sock_poll_wait(file, sock, wait);
3201 	mask = 0;
3202 
3203 	/* exceptional events? */
3204 	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
3205 		mask |= EPOLLERR |
3206 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3207 
3208 	if (sk->sk_shutdown & RCV_SHUTDOWN)
3209 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3210 	if (sk->sk_shutdown == SHUTDOWN_MASK)
3211 		mask |= EPOLLHUP;
3212 
3213 	/* readable? */
3214 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3215 		mask |= EPOLLIN | EPOLLRDNORM;
3216 	if (sk_is_readable(sk))
3217 		mask |= EPOLLIN | EPOLLRDNORM;
3218 
3219 	/* Connection-based need to check for termination and startup */
3220 	if (sk->sk_type == SOCK_SEQPACKET) {
3221 		if (sk->sk_state == TCP_CLOSE)
3222 			mask |= EPOLLHUP;
3223 		/* connection hasn't started yet? */
3224 		if (sk->sk_state == TCP_SYN_SENT)
3225 			return mask;
3226 	}
3227 
3228 	/* No write status requested, avoid expensive OUT tests. */
3229 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3230 		return mask;
3231 
3232 	writable = unix_writable(sk);
3233 	if (writable) {
3234 		unix_state_lock(sk);
3235 
3236 		other = unix_peer(sk);
3237 		if (other && unix_peer(other) != sk &&
3238 		    unix_recvq_full_lockless(other) &&
3239 		    unix_dgram_peer_wake_me(sk, other))
3240 			writable = 0;
3241 
3242 		unix_state_unlock(sk);
3243 	}
3244 
3245 	if (writable)
3246 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3247 	else
3248 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3249 
3250 	return mask;
3251 }
3252 
3253 #ifdef CONFIG_PROC_FS
3254 
/* A seq_file position packs a hash bucket index in its high bits and a
 * 1-based offset within that bucket in its low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE		(BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x)		((x) >> BUCKET_SPACE)
#define get_offset(x)		((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o)	(((b) << BUCKET_SPACE) | (o))
3260 
3261 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3262 {
3263 	unsigned long offset = get_offset(*pos);
3264 	unsigned long bucket = get_bucket(*pos);
3265 	unsigned long count = 0;
3266 	struct sock *sk;
3267 
3268 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3269 	     sk; sk = sk_next(sk)) {
3270 		if (++count == offset)
3271 			break;
3272 	}
3273 
3274 	return sk;
3275 }
3276 
3277 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3278 {
3279 	unsigned long bucket = get_bucket(*pos);
3280 	struct net *net = seq_file_net(seq);
3281 	struct sock *sk;
3282 
3283 	while (bucket < UNIX_HASH_SIZE) {
3284 		spin_lock(&net->unx.table.locks[bucket]);
3285 
3286 		sk = unix_from_bucket(seq, pos);
3287 		if (sk)
3288 			return sk;
3289 
3290 		spin_unlock(&net->unx.table.locks[bucket]);
3291 
3292 		*pos = set_bucket_offset(++bucket, 1);
3293 	}
3294 
3295 	return NULL;
3296 }
3297 
3298 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3299 				  loff_t *pos)
3300 {
3301 	unsigned long bucket = get_bucket(*pos);
3302 
3303 	sk = sk_next(sk);
3304 	if (sk)
3305 		return sk;
3306 
3307 
3308 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3309 
3310 	*pos = set_bucket_offset(++bucket, 1);
3311 
3312 	return unix_get_first(seq, pos);
3313 }
3314 
3315 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3316 {
3317 	if (!*pos)
3318 		return SEQ_START_TOKEN;
3319 
3320 	return unix_get_first(seq, pos);
3321 }
3322 
3323 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3324 {
3325 	++*pos;
3326 
3327 	if (v == SEQ_START_TOKEN)
3328 		return unix_get_first(seq, pos);
3329 
3330 	return unix_get_next(seq, v, pos);
3331 }
3332 
3333 static void unix_seq_stop(struct seq_file *seq, void *v)
3334 {
3335 	struct sock *sk = v;
3336 
3337 	if (sk)
3338 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3339 }
3340 
3341 static int unix_seq_show(struct seq_file *seq, void *v)
3342 {
3343 
3344 	if (v == SEQ_START_TOKEN)
3345 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3346 			 "Inode Path\n");
3347 	else {
3348 		struct sock *s = v;
3349 		struct unix_sock *u = unix_sk(s);
3350 		unix_state_lock(s);
3351 
3352 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3353 			s,
3354 			refcount_read(&s->sk_refcnt),
3355 			0,
3356 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3357 			s->sk_type,
3358 			s->sk_socket ?
3359 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3360 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3361 			sock_i_ino(s));
3362 
3363 		if (u->addr) {	// under a hash table lock here
3364 			int i, len;
3365 			seq_putc(seq, ' ');
3366 
3367 			i = 0;
3368 			len = u->addr->len -
3369 				offsetof(struct sockaddr_un, sun_path);
3370 			if (u->addr->name->sun_path[0]) {
3371 				len--;
3372 			} else {
3373 				seq_putc(seq, '@');
3374 				i++;
3375 			}
3376 			for ( ; i < len; i++)
3377 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3378 					 '@');
3379 		}
3380 		unix_state_unlock(s);
3381 		seq_putc(seq, '\n');
3382 	}
3383 
3384 	return 0;
3385 }
3386 
3387 static const struct seq_operations unix_seq_ops = {
3388 	.start  = unix_seq_start,
3389 	.next   = unix_seq_next,
3390 	.stop   = unix_seq_stop,
3391 	.show   = unix_seq_show,
3392 };
3393 
3394 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
3395 struct bpf_unix_iter_state {
3396 	struct seq_net_private p;
3397 	unsigned int cur_sk;
3398 	unsigned int end_sk;
3399 	unsigned int max_sk;
3400 	struct sock **batch;
3401 	bool st_bucket_done;
3402 };
3403 
3404 struct bpf_iter__unix {
3405 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3406 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3407 	uid_t uid __aligned(8);
3408 };
3409 
3410 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3411 			      struct unix_sock *unix_sk, uid_t uid)
3412 {
3413 	struct bpf_iter__unix ctx;
3414 
3415 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3416 	ctx.meta = meta;
3417 	ctx.unix_sk = unix_sk;
3418 	ctx.uid = uid;
3419 	return bpf_iter_run_prog(prog, &ctx);
3420 }
3421 
3422 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3423 
3424 {
3425 	struct bpf_unix_iter_state *iter = seq->private;
3426 	unsigned int expected = 1;
3427 	struct sock *sk;
3428 
3429 	sock_hold(start_sk);
3430 	iter->batch[iter->end_sk++] = start_sk;
3431 
3432 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3433 		if (iter->end_sk < iter->max_sk) {
3434 			sock_hold(sk);
3435 			iter->batch[iter->end_sk++] = sk;
3436 		}
3437 
3438 		expected++;
3439 	}
3440 
3441 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3442 
3443 	return expected;
3444 }
3445 
3446 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3447 {
3448 	while (iter->cur_sk < iter->end_sk)
3449 		sock_put(iter->batch[iter->cur_sk++]);
3450 }
3451 
3452 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3453 				       unsigned int new_batch_sz)
3454 {
3455 	struct sock **new_batch;
3456 
3457 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3458 			     GFP_USER | __GFP_NOWARN);
3459 	if (!new_batch)
3460 		return -ENOMEM;
3461 
3462 	bpf_iter_unix_put_batch(iter);
3463 	kvfree(iter->batch);
3464 	iter->batch = new_batch;
3465 	iter->max_sk = new_batch_sz;
3466 
3467 	return 0;
3468 }
3469 
3470 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3471 					loff_t *pos)
3472 {
3473 	struct bpf_unix_iter_state *iter = seq->private;
3474 	unsigned int expected;
3475 	bool resized = false;
3476 	struct sock *sk;
3477 
3478 	if (iter->st_bucket_done)
3479 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3480 
3481 again:
3482 	/* Get a new batch */
3483 	iter->cur_sk = 0;
3484 	iter->end_sk = 0;
3485 
3486 	sk = unix_get_first(seq, pos);
3487 	if (!sk)
3488 		return NULL; /* Done */
3489 
3490 	expected = bpf_iter_unix_hold_batch(seq, sk);
3491 
3492 	if (iter->end_sk == expected) {
3493 		iter->st_bucket_done = true;
3494 		return sk;
3495 	}
3496 
3497 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3498 		resized = true;
3499 		goto again;
3500 	}
3501 
3502 	return sk;
3503 }
3504 
3505 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3506 {
3507 	if (!*pos)
3508 		return SEQ_START_TOKEN;
3509 
3510 	/* bpf iter does not support lseek, so it always
3511 	 * continue from where it was stop()-ped.
3512 	 */
3513 	return bpf_iter_unix_batch(seq, pos);
3514 }
3515 
3516 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3517 {
3518 	struct bpf_unix_iter_state *iter = seq->private;
3519 	struct sock *sk;
3520 
3521 	/* Whenever seq_next() is called, the iter->cur_sk is
3522 	 * done with seq_show(), so advance to the next sk in
3523 	 * the batch.
3524 	 */
3525 	if (iter->cur_sk < iter->end_sk)
3526 		sock_put(iter->batch[iter->cur_sk++]);
3527 
3528 	++*pos;
3529 
3530 	if (iter->cur_sk < iter->end_sk)
3531 		sk = iter->batch[iter->cur_sk];
3532 	else
3533 		sk = bpf_iter_unix_batch(seq, pos);
3534 
3535 	return sk;
3536 }
3537 
3538 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3539 {
3540 	struct bpf_iter_meta meta;
3541 	struct bpf_prog *prog;
3542 	struct sock *sk = v;
3543 	uid_t uid;
3544 	bool slow;
3545 	int ret;
3546 
3547 	if (v == SEQ_START_TOKEN)
3548 		return 0;
3549 
3550 	slow = lock_sock_fast(sk);
3551 
3552 	if (unlikely(sk_unhashed(sk))) {
3553 		ret = SEQ_SKIP;
3554 		goto unlock;
3555 	}
3556 
3557 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3558 	meta.seq = seq;
3559 	prog = bpf_iter_get_info(&meta, false);
3560 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3561 unlock:
3562 	unlock_sock_fast(sk, slow);
3563 	return ret;
3564 }
3565 
3566 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3567 {
3568 	struct bpf_unix_iter_state *iter = seq->private;
3569 	struct bpf_iter_meta meta;
3570 	struct bpf_prog *prog;
3571 
3572 	if (!v) {
3573 		meta.seq = seq;
3574 		prog = bpf_iter_get_info(&meta, true);
3575 		if (prog)
3576 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3577 	}
3578 
3579 	if (iter->cur_sk < iter->end_sk)
3580 		bpf_iter_unix_put_batch(iter);
3581 }
3582 
3583 static const struct seq_operations bpf_iter_unix_seq_ops = {
3584 	.start	= bpf_iter_unix_seq_start,
3585 	.next	= bpf_iter_unix_seq_next,
3586 	.stop	= bpf_iter_unix_seq_stop,
3587 	.show	= bpf_iter_unix_seq_show,
3588 };
3589 #endif
3590 #endif
3591 
3592 static const struct net_proto_family unix_family_ops = {
3593 	.family = PF_UNIX,
3594 	.create = unix_create,
3595 	.owner	= THIS_MODULE,
3596 };
3597 
3598 
3599 static int __net_init unix_net_init(struct net *net)
3600 {
3601 	int i;
3602 
3603 	net->unx.sysctl_max_dgram_qlen = 10;
3604 	if (unix_sysctl_register(net))
3605 		goto out;
3606 
3607 #ifdef CONFIG_PROC_FS
3608 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3609 			     sizeof(struct seq_net_private)))
3610 		goto err_sysctl;
3611 #endif
3612 
3613 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3614 					      sizeof(spinlock_t), GFP_KERNEL);
3615 	if (!net->unx.table.locks)
3616 		goto err_proc;
3617 
3618 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3619 						sizeof(struct hlist_head),
3620 						GFP_KERNEL);
3621 	if (!net->unx.table.buckets)
3622 		goto free_locks;
3623 
3624 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3625 		spin_lock_init(&net->unx.table.locks[i]);
3626 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3627 	}
3628 
3629 	return 0;
3630 
3631 free_locks:
3632 	kvfree(net->unx.table.locks);
3633 err_proc:
3634 #ifdef CONFIG_PROC_FS
3635 	remove_proc_entry("unix", net->proc_net);
3636 err_sysctl:
3637 #endif
3638 	unix_sysctl_unregister(net);
3639 out:
3640 	return -ENOMEM;
3641 }
3642 
3643 static void __net_exit unix_net_exit(struct net *net)
3644 {
3645 	kvfree(net->unx.table.buckets);
3646 	kvfree(net->unx.table.locks);
3647 	unix_sysctl_unregister(net);
3648 	remove_proc_entry("unix", net->proc_net);
3649 }
3650 
3651 static struct pernet_operations unix_net_ops = {
3652 	.init = unix_net_init,
3653 	.exit = unix_net_exit,
3654 };
3655 
3656 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3657 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3658 		     struct unix_sock *unix_sk, uid_t uid)
3659 
3660 #define INIT_BATCH_SZ 16
3661 
3662 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3663 {
3664 	struct bpf_unix_iter_state *iter = priv_data;
3665 	int err;
3666 
3667 	err = bpf_iter_init_seq_net(priv_data, aux);
3668 	if (err)
3669 		return err;
3670 
3671 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3672 	if (err) {
3673 		bpf_iter_fini_seq_net(priv_data);
3674 		return err;
3675 	}
3676 
3677 	return 0;
3678 }
3679 
3680 static void bpf_iter_fini_unix(void *priv_data)
3681 {
3682 	struct bpf_unix_iter_state *iter = priv_data;
3683 
3684 	bpf_iter_fini_seq_net(priv_data);
3685 	kvfree(iter->batch);
3686 }
3687 
3688 static const struct bpf_iter_seq_info unix_seq_info = {
3689 	.seq_ops		= &bpf_iter_unix_seq_ops,
3690 	.init_seq_private	= bpf_iter_init_unix,
3691 	.fini_seq_private	= bpf_iter_fini_unix,
3692 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3693 };
3694 
3695 static const struct bpf_func_proto *
3696 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3697 			     const struct bpf_prog *prog)
3698 {
3699 	switch (func_id) {
3700 	case BPF_FUNC_setsockopt:
3701 		return &bpf_sk_setsockopt_proto;
3702 	case BPF_FUNC_getsockopt:
3703 		return &bpf_sk_getsockopt_proto;
3704 	default:
3705 		return NULL;
3706 	}
3707 }
3708 
3709 static struct bpf_iter_reg unix_reg_info = {
3710 	.target			= "unix",
3711 	.ctx_arg_info_size	= 1,
3712 	.ctx_arg_info		= {
3713 		{ offsetof(struct bpf_iter__unix, unix_sk),
3714 		  PTR_TO_BTF_ID_OR_NULL },
3715 	},
3716 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3717 	.seq_info		= &unix_seq_info,
3718 };
3719 
3720 static void __init bpf_iter_register(void)
3721 {
3722 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3723 	if (bpf_iter_reg_target(&unix_reg_info))
3724 		pr_warn("Warning: could not register bpf iterator unix\n");
3725 }
3726 #endif
3727 
3728 static int __init af_unix_init(void)
3729 {
3730 	int i, rc = -1;
3731 
3732 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3733 
3734 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3735 		spin_lock_init(&bsd_socket_locks[i]);
3736 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3737 	}
3738 
3739 	rc = proto_register(&unix_dgram_proto, 1);
3740 	if (rc != 0) {
3741 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3742 		goto out;
3743 	}
3744 
3745 	rc = proto_register(&unix_stream_proto, 1);
3746 	if (rc != 0) {
3747 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3748 		proto_unregister(&unix_dgram_proto);
3749 		goto out;
3750 	}
3751 
3752 	sock_register(&unix_family_ops);
3753 	register_pernet_subsys(&unix_net_ops);
3754 	unix_bpf_build_proto();
3755 
3756 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3757 	bpf_iter_register();
3758 #endif
3759 
3760 out:
3761 	return rc;
3762 }
3763 
3764 static void __exit af_unix_exit(void)
3765 {
3766 	sock_unregister(PF_UNIX);
3767 	proto_unregister(&unix_dgram_proto);
3768 	proto_unregister(&unix_stream_proto);
3769 	unregister_pernet_subsys(&unix_net_ops);
3770 }
3771 
3772 /* Earlier than device_initcall() so that other drivers invoking
3773    request_module() don't end up in a loop when modprobe tries
3774    to use a UNIX socket. But later than subsys_initcall() because
3775    we depend on stuff initialised there */
3776 fs_initcall(af_unix_init);
3777 module_exit(af_unix_exit);
3778 
3779 MODULE_LICENSE("GPL");
3780 MODULE_ALIAS_NETPROTO(PF_UNIX);
3781