xref: /openbmc/linux/net/unix/af_unix.c (revision 3184e07e12405a4d00ad7990f2a0c97278fdc636)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid a huge number
34  *					of socks hashed (this is for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 
120 #include "scm.h"
121 
122 static atomic_long_t unix_nr_socks;
123 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
124 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
125 
126 /* SMP locking strategy:
127  *    hash table is protected with spinlock.
128  *    each socket state is protected by separate spinlock.
129  */
130 
131 static unsigned int unix_unbound_hash(struct sock *sk)
132 {
133 	unsigned long hash = (unsigned long)sk;
134 
135 	hash ^= hash >> 16;
136 	hash ^= hash >> 8;
137 	hash ^= sk->sk_type;
138 
139 	return hash & UNIX_HASH_MOD;
140 }
141 
142 static unsigned int unix_bsd_hash(struct inode *i)
143 {
144 	return i->i_ino & UNIX_HASH_MOD;
145 }
146 
147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
148 				       int addr_len, int type)
149 {
150 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
151 	unsigned int hash;
152 
153 	hash = (__force unsigned int)csum_fold(csum);
154 	hash ^= hash >> 8;
155 	hash ^= type;
156 
157 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
158 }
159 
160 static void unix_table_double_lock(struct net *net,
161 				   unsigned int hash1, unsigned int hash2)
162 {
163 	if (hash1 == hash2) {
164 		spin_lock(&net->unx.table.locks[hash1]);
165 		return;
166 	}
167 
168 	if (hash1 > hash2)
169 		swap(hash1, hash2);
170 
171 	spin_lock(&net->unx.table.locks[hash1]);
172 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
173 }
174 
175 static void unix_table_double_unlock(struct net *net,
176 				     unsigned int hash1, unsigned int hash2)
177 {
178 	if (hash1 == hash2) {
179 		spin_unlock(&net->unx.table.locks[hash1]);
180 		return;
181 	}
182 
183 	spin_unlock(&net->unx.table.locks[hash1]);
184 	spin_unlock(&net->unx.table.locks[hash2]);
185 }
186 
187 #ifdef CONFIG_SECURITY_NETWORK
188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
189 {
190 	UNIXCB(skb).secid = scm->secid;
191 }
192 
193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
194 {
195 	scm->secid = UNIXCB(skb).secid;
196 }
197 
198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
199 {
200 	return (scm->secid == UNIXCB(skb).secid);
201 }
202 #else
203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
204 { }
205 
206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
207 { }
208 
209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
210 {
211 	return true;
212 }
213 #endif /* CONFIG_SECURITY_NETWORK */
214 
/* Non-zero when @osk's peer pointer refers to @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return sk == unix_peer(osk);
}
219 
/* Non-zero when @sk may send to @osk: either @osk has no peer at all, or
 * its peer is @sk.
 */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	if (!unix_peer(osk))
		return 1;

	return unix_our_peer(sk, osk);
}
224 
225 static inline int unix_recvq_full_lockless(const struct sock *sk)
226 {
227 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
228 }
229 
230 struct sock *unix_peer_get(struct sock *s)
231 {
232 	struct sock *peer;
233 
234 	unix_state_lock(s);
235 	peer = unix_peer(s);
236 	if (peer)
237 		sock_hold(peer);
238 	unix_state_unlock(s);
239 	return peer;
240 }
241 EXPORT_SYMBOL_GPL(unix_peer_get);
242 
243 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
244 					     int addr_len)
245 {
246 	struct unix_address *addr;
247 
248 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
249 	if (!addr)
250 		return NULL;
251 
252 	refcount_set(&addr->refcnt, 1);
253 	addr->len = addr_len;
254 	memcpy(addr->name, sunaddr, addr_len);
255 
256 	return addr;
257 }
258 
259 static inline void unix_release_addr(struct unix_address *addr)
260 {
261 	if (refcount_dec_and_test(&addr->refcnt))
262 		kfree(addr);
263 }
264 
265 /*
266  *	Check unix socket name:
267  *		- should be not zero length.
268  *	        - if started by not zero, should be NULL terminated (FS object)
269  *		- if started by zero, it is abstract name.
270  */
271 
272 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
273 {
274 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
275 	    addr_len > sizeof(*sunaddr))
276 		return -EINVAL;
277 
278 	if (sunaddr->sun_family != AF_UNIX)
279 		return -EINVAL;
280 
281 	return 0;
282 }
283 
284 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
285 {
286 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
287 	short offset = offsetof(struct sockaddr_storage, __data);
288 
289 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
290 
291 	/* This may look like an off by one error but it is a bit more
292 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
293 	 * sun_path[108] doesn't as such exist.  However in kernel space
294 	 * we are guaranteed that it is a valid memory location in our
295 	 * kernel address buffer because syscall functions always pass
296 	 * a pointer of struct sockaddr_storage which has a bigger buffer
297 	 * than 108.  Also, we must terminate sun_path for strlen() in
298 	 * getname_kernel().
299 	 */
300 	addr->__data[addr_len - offset] = 0;
301 
302 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
303 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
304 	 * know the actual buffer.
305 	 */
306 	return strlen(addr->__data) + offset + 1;
307 }
308 
/* Unlink @sk from whatever hash chain it is on (via sk_del_node_init()).
 * Takes no lock itself.
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
313 
314 static void __unix_insert_socket(struct net *net, struct sock *sk)
315 {
316 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
317 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
318 }
319 
320 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
321 				 struct unix_address *addr, unsigned int hash)
322 {
323 	__unix_remove_socket(sk);
324 	smp_store_release(&unix_sk(sk)->addr, addr);
325 
326 	sk->sk_hash = hash;
327 	__unix_insert_socket(net, sk);
328 }
329 
330 static void unix_remove_socket(struct net *net, struct sock *sk)
331 {
332 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
333 	__unix_remove_socket(sk);
334 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
335 }
336 
337 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
338 {
339 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
340 	__unix_insert_socket(net, sk);
341 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
342 }
343 
344 static void unix_insert_bsd_socket(struct sock *sk)
345 {
346 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
347 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
348 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
349 }
350 
351 static void unix_remove_bsd_socket(struct sock *sk)
352 {
353 	if (!hlist_unhashed(&sk->sk_bind_node)) {
354 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
355 		__sk_del_bind_node(sk);
356 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
357 
358 		sk_node_init(&sk->sk_bind_node);
359 	}
360 }
361 
362 static struct sock *__unix_find_socket_byname(struct net *net,
363 					      struct sockaddr_un *sunname,
364 					      int len, unsigned int hash)
365 {
366 	struct sock *s;
367 
368 	sk_for_each(s, &net->unx.table.buckets[hash]) {
369 		struct unix_sock *u = unix_sk(s);
370 
371 		if (u->addr->len == len &&
372 		    !memcmp(u->addr->name, sunname, len))
373 			return s;
374 	}
375 	return NULL;
376 }
377 
378 static inline struct sock *unix_find_socket_byname(struct net *net,
379 						   struct sockaddr_un *sunname,
380 						   int len, unsigned int hash)
381 {
382 	struct sock *s;
383 
384 	spin_lock(&net->unx.table.locks[hash]);
385 	s = __unix_find_socket_byname(net, sunname, len, hash);
386 	if (s)
387 		sock_hold(s);
388 	spin_unlock(&net->unx.table.locks[hash]);
389 	return s;
390 }
391 
392 static struct sock *unix_find_socket_byinode(struct inode *i)
393 {
394 	unsigned int hash = unix_bsd_hash(i);
395 	struct sock *s;
396 
397 	spin_lock(&bsd_socket_locks[hash]);
398 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
399 		struct dentry *dentry = unix_sk(s)->path.dentry;
400 
401 		if (dentry && d_backing_inode(dentry) == i) {
402 			sock_hold(s);
403 			spin_unlock(&bsd_socket_locks[hash]);
404 			return s;
405 		}
406 	}
407 	spin_unlock(&bsd_socket_locks[hash]);
408 	return NULL;
409 }
410 
411 /* Support code for asymmetrically connected dgram sockets
412  *
413  * If a datagram socket is connected to a socket not itself connected
414  * to the first socket (eg, /dev/log), clients may only enqueue more
415  * messages if the present receive queue of the server socket is not
416  * "too large". This means there's a second writeability condition
417  * poll and sendmsg need to test. The dgram recv code will do a wake
418  * up on the peer_wait wait queue of a socket upon reception of a
419  * datagram which needs to be propagated to sleeping would-be writers
420  * since these might not have sent anything so far. This can't be
421  * accomplished via poll_wait because the lifetime of the server
422  * socket might be less than that of its clients if these break their
423  * association with it or if the server socket is closed while clients
424  * are still connected to it and there's no way to inform "a polling
425  * implementation" that it should let go of a certain wait queue
426  *
427  * In order to propagate a wake up, a wait_queue_entry_t of the client
428  * socket is enqueued on the peer_wait queue of the server socket
429  * whose wake function does a wake_up on the ordinary client socket
430  * wait queue. This connection is established whenever a write (or
431  * poll for write) hit the flow control condition and broken when the
432  * association to the server socket is dissolved or after a wake up
433  * was relayed.
434  */
435 
436 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
437 				      void *key)
438 {
439 	struct unix_sock *u;
440 	wait_queue_head_t *u_sleep;
441 
442 	u = container_of(q, struct unix_sock, peer_wake);
443 
444 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
445 			    q);
446 	u->peer_wake.private = NULL;
447 
448 	/* relaying can only happen while the wq still exists */
449 	u_sleep = sk_sleep(&u->sk);
450 	if (u_sleep)
451 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
452 
453 	return 0;
454 }
455 
456 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
457 {
458 	struct unix_sock *u, *u_other;
459 	int rc;
460 
461 	u = unix_sk(sk);
462 	u_other = unix_sk(other);
463 	rc = 0;
464 	spin_lock(&u_other->peer_wait.lock);
465 
466 	if (!u->peer_wake.private) {
467 		u->peer_wake.private = other;
468 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
469 
470 		rc = 1;
471 	}
472 
473 	spin_unlock(&u_other->peer_wait.lock);
474 	return rc;
475 }
476 
477 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
478 					    struct sock *other)
479 {
480 	struct unix_sock *u, *u_other;
481 
482 	u = unix_sk(sk);
483 	u_other = unix_sk(other);
484 	spin_lock(&u_other->peer_wait.lock);
485 
486 	if (u->peer_wake.private == other) {
487 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
488 		u->peer_wake.private = NULL;
489 	}
490 
491 	spin_unlock(&u_other->peer_wait.lock);
492 }
493 
494 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
495 						   struct sock *other)
496 {
497 	unix_dgram_peer_wake_disconnect(sk, other);
498 	wake_up_interruptible_poll(sk_sleep(sk),
499 				   EPOLLOUT |
500 				   EPOLLWRNORM |
501 				   EPOLLWRBAND);
502 }
503 
504 /* preconditions:
505  *	- unix_peer(sk) == other
506  *	- association is stable
507  */
508 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
509 {
510 	int connected;
511 
512 	connected = unix_dgram_peer_wake_connect(sk, other);
513 
514 	/* If other is SOCK_DEAD, we want to make sure we signal
515 	 * POLLOUT, such that a subsequent write() can get a
516 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
517 	 * to other and its full, we will hang waiting for POLLOUT.
518 	 */
519 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
520 		return 1;
521 
522 	if (connected)
523 		unix_dgram_peer_wake_disconnect(sk, other);
524 
525 	return 0;
526 }
527 
528 static int unix_writable(const struct sock *sk, unsigned char state)
529 {
530 	return state != TCP_LISTEN &&
531 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
532 }
533 
534 static void unix_write_space(struct sock *sk)
535 {
536 	struct socket_wq *wq;
537 
538 	rcu_read_lock();
539 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
540 		wq = rcu_dereference(sk->sk_wq);
541 		if (skwq_has_sleeper(wq))
542 			wake_up_interruptible_sync_poll(&wq->wait,
543 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
544 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
545 	}
546 	rcu_read_unlock();
547 }
548 
549 /* When dgram socket disconnects (or changes its peer), we clear its receive
550  * queue of packets arrived from previous peer. First, it allows to do
551  * flow control based only on wmem_alloc; second, sk connected to peer
552  * may receive messages only from that peer. */
553 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
554 {
555 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
556 		skb_queue_purge(&sk->sk_receive_queue);
557 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
558 
559 		/* If one link of bidirectional dgram pipe is disconnected,
560 		 * we signal error. Messages are lost. Do not make this,
561 		 * when peer was not connected to us.
562 		 */
563 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
564 			WRITE_ONCE(other->sk_err, ECONNRESET);
565 			sk_error_report(other);
566 		}
567 	}
568 }
569 
570 static void unix_sock_destructor(struct sock *sk)
571 {
572 	struct unix_sock *u = unix_sk(sk);
573 
574 	skb_queue_purge(&sk->sk_receive_queue);
575 
576 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
577 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
578 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
579 	if (!sock_flag(sk, SOCK_DEAD)) {
580 		pr_info("Attempt to release alive unix socket: %p\n", sk);
581 		return;
582 	}
583 
584 	if (u->addr)
585 		unix_release_addr(u->addr);
586 
587 	atomic_long_dec(&unix_nr_socks);
588 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
589 #ifdef UNIX_REFCNT_DEBUG
590 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
591 		atomic_long_read(&unix_nr_socks));
592 #endif
593 }
594 
595 static void unix_release_sock(struct sock *sk, int embrion)
596 {
597 	struct unix_sock *u = unix_sk(sk);
598 	struct sock *skpair;
599 	struct sk_buff *skb;
600 	struct path path;
601 	int state;
602 
603 	unix_remove_socket(sock_net(sk), sk);
604 	unix_remove_bsd_socket(sk);
605 
606 	/* Clear state */
607 	unix_state_lock(sk);
608 	sock_orphan(sk);
609 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
610 	path	     = u->path;
611 	u->path.dentry = NULL;
612 	u->path.mnt = NULL;
613 	state = sk->sk_state;
614 	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
615 
616 	skpair = unix_peer(sk);
617 	unix_peer(sk) = NULL;
618 
619 	unix_state_unlock(sk);
620 
621 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
622 	if (u->oob_skb) {
623 		kfree_skb(u->oob_skb);
624 		u->oob_skb = NULL;
625 	}
626 #endif
627 
628 	wake_up_interruptible_all(&u->peer_wait);
629 
630 	if (skpair != NULL) {
631 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
632 			unix_state_lock(skpair);
633 			/* No more writes */
634 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
635 			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
636 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
637 			unix_state_unlock(skpair);
638 			skpair->sk_state_change(skpair);
639 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
640 		}
641 
642 		unix_dgram_peer_wake_disconnect(sk, skpair);
643 		sock_put(skpair); /* It may now die */
644 	}
645 
646 	/* Try to flush out this socket. Throw out buffers at least */
647 
648 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
649 		if (state == TCP_LISTEN)
650 			unix_release_sock(skb->sk, 1);
651 		/* passed fds are erased in the kfree_skb hook	      */
652 		UNIXCB(skb).consumed = skb->len;
653 		kfree_skb(skb);
654 	}
655 
656 	if (path.dentry)
657 		path_put(&path);
658 
659 	sock_put(sk);
660 
661 	/* ---- Socket is dead now and most probably destroyed ---- */
662 
663 	/*
664 	 * Fixme: BSD difference: In BSD all sockets connected to us get
665 	 *	  ECONNRESET and we die on the spot. In Linux we behave
666 	 *	  like files and pipes do and wait for the last
667 	 *	  dereference.
668 	 *
669 	 * Can't we simply set sock->err?
670 	 *
671 	 *	  What the above comment does talk about? --ANK(980817)
672 	 */
673 
674 	if (READ_ONCE(unix_tot_inflight))
675 		unix_gc();		/* Garbage collect fds */
676 }
677 
678 static void init_peercred(struct sock *sk)
679 {
680 	const struct cred *old_cred;
681 	struct pid *old_pid;
682 
683 	spin_lock(&sk->sk_peer_lock);
684 	old_pid = sk->sk_peer_pid;
685 	old_cred = sk->sk_peer_cred;
686 	sk->sk_peer_pid  = get_pid(task_tgid(current));
687 	sk->sk_peer_cred = get_current_cred();
688 	spin_unlock(&sk->sk_peer_lock);
689 
690 	put_pid(old_pid);
691 	put_cred(old_cred);
692 }
693 
694 static void copy_peercred(struct sock *sk, struct sock *peersk)
695 {
696 	if (sk < peersk) {
697 		spin_lock(&sk->sk_peer_lock);
698 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
699 	} else {
700 		spin_lock(&peersk->sk_peer_lock);
701 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
702 	}
703 
704 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
705 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
706 
707 	spin_unlock(&sk->sk_peer_lock);
708 	spin_unlock(&peersk->sk_peer_lock);
709 }
710 
711 static int unix_listen(struct socket *sock, int backlog)
712 {
713 	int err;
714 	struct sock *sk = sock->sk;
715 	struct unix_sock *u = unix_sk(sk);
716 
717 	err = -EOPNOTSUPP;
718 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
719 		goto out;	/* Only stream/seqpacket sockets accept */
720 	err = -EINVAL;
721 	if (!READ_ONCE(u->addr))
722 		goto out;	/* No listens on an unbound socket */
723 	unix_state_lock(sk);
724 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
725 		goto out_unlock;
726 	if (backlog > sk->sk_max_ack_backlog)
727 		wake_up_interruptible_all(&u->peer_wait);
728 	sk->sk_max_ack_backlog	= backlog;
729 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
730 
731 	/* set credentials so connect can copy them */
732 	init_peercred(sk);
733 	err = 0;
734 
735 out_unlock:
736 	unix_state_unlock(sk);
737 out:
738 	return err;
739 }
740 
741 static int unix_release(struct socket *);
742 static int unix_bind(struct socket *, struct sockaddr *, int);
743 static int unix_stream_connect(struct socket *, struct sockaddr *,
744 			       int addr_len, int flags);
745 static int unix_socketpair(struct socket *, struct socket *);
746 static int unix_accept(struct socket *, struct socket *, int, bool);
747 static int unix_getname(struct socket *, struct sockaddr *, int);
748 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
749 static __poll_t unix_dgram_poll(struct file *, struct socket *,
750 				    poll_table *);
751 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
752 #ifdef CONFIG_COMPAT
753 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
754 #endif
755 static int unix_shutdown(struct socket *, int);
756 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
757 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
758 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
759 				       struct pipe_inode_info *, size_t size,
760 				       unsigned int flags);
761 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
762 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
763 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
764 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
765 static int unix_dgram_connect(struct socket *, struct sockaddr *,
766 			      int, int);
767 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
768 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
769 				  int);
770 
771 static int unix_set_peek_off(struct sock *sk, int val)
772 {
773 	struct unix_sock *u = unix_sk(sk);
774 
775 	if (mutex_lock_interruptible(&u->iolock))
776 		return -EINTR;
777 
778 	WRITE_ONCE(sk->sk_peek_off, val);
779 	mutex_unlock(&u->iolock);
780 
781 	return 0;
782 }
783 
#ifdef CONFIG_PROC_FS
/* Sum scm_stat.nr_fds over the sockets that own the skbs queued on @sk's
 * receive queue.  The walk is done under the receive queue lock.
 */
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff_head *queue = &sk->sk_receive_queue;
	struct sk_buff *skb;
	int total = 0;

	spin_lock(&queue->lock);
	for (skb = skb_peek(queue); skb; skb = skb_peek_next(skb, queue))
		total += atomic_read(&unix_sk(skb->sk)->scm_stat.nr_fds);
	spin_unlock(&queue->lock);

	return total;
}

/* ->show_fdinfo(): print an "scm_fds:" line for the socket.  Nothing is
 * printed when the struct socket has no sock attached.
 */
static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (!sk)
		return;

	s_state = READ_ONCE(sk->sk_state);
	u = unix_sk(sk);

	/* No lock is needed: SOCK_STREAM and SOCK_SEQPACKET sockets never
	 * change sk_state again once they have become TCP_ESTABLISHED or
	 * TCP_LISTEN, and SOCK_DGRAM is ordinary.
	 */
	if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
		nr_fds = atomic_read(&u->scm_stat.nr_fds);
	else if (s_state == TCP_LISTEN)
		nr_fds = unix_count_nr_fds(sk);

	seq_printf(m, "scm_fds: %u\n", nr_fds);
}
#else
#define unix_show_fdinfo NULL
#endif
829 
830 static const struct proto_ops unix_stream_ops = {
831 	.family =	PF_UNIX,
832 	.owner =	THIS_MODULE,
833 	.release =	unix_release,
834 	.bind =		unix_bind,
835 	.connect =	unix_stream_connect,
836 	.socketpair =	unix_socketpair,
837 	.accept =	unix_accept,
838 	.getname =	unix_getname,
839 	.poll =		unix_poll,
840 	.ioctl =	unix_ioctl,
841 #ifdef CONFIG_COMPAT
842 	.compat_ioctl =	unix_compat_ioctl,
843 #endif
844 	.listen =	unix_listen,
845 	.shutdown =	unix_shutdown,
846 	.sendmsg =	unix_stream_sendmsg,
847 	.recvmsg =	unix_stream_recvmsg,
848 	.read_skb =	unix_stream_read_skb,
849 	.mmap =		sock_no_mmap,
850 	.splice_read =	unix_stream_splice_read,
851 	.set_peek_off =	unix_set_peek_off,
852 	.show_fdinfo =	unix_show_fdinfo,
853 };
854 
855 static const struct proto_ops unix_dgram_ops = {
856 	.family =	PF_UNIX,
857 	.owner =	THIS_MODULE,
858 	.release =	unix_release,
859 	.bind =		unix_bind,
860 	.connect =	unix_dgram_connect,
861 	.socketpair =	unix_socketpair,
862 	.accept =	sock_no_accept,
863 	.getname =	unix_getname,
864 	.poll =		unix_dgram_poll,
865 	.ioctl =	unix_ioctl,
866 #ifdef CONFIG_COMPAT
867 	.compat_ioctl =	unix_compat_ioctl,
868 #endif
869 	.listen =	sock_no_listen,
870 	.shutdown =	unix_shutdown,
871 	.sendmsg =	unix_dgram_sendmsg,
872 	.read_skb =	unix_read_skb,
873 	.recvmsg =	unix_dgram_recvmsg,
874 	.mmap =		sock_no_mmap,
875 	.set_peek_off =	unix_set_peek_off,
876 	.show_fdinfo =	unix_show_fdinfo,
877 };
878 
879 static const struct proto_ops unix_seqpacket_ops = {
880 	.family =	PF_UNIX,
881 	.owner =	THIS_MODULE,
882 	.release =	unix_release,
883 	.bind =		unix_bind,
884 	.connect =	unix_stream_connect,
885 	.socketpair =	unix_socketpair,
886 	.accept =	unix_accept,
887 	.getname =	unix_getname,
888 	.poll =		unix_dgram_poll,
889 	.ioctl =	unix_ioctl,
890 #ifdef CONFIG_COMPAT
891 	.compat_ioctl =	unix_compat_ioctl,
892 #endif
893 	.listen =	unix_listen,
894 	.shutdown =	unix_shutdown,
895 	.sendmsg =	unix_seqpacket_sendmsg,
896 	.recvmsg =	unix_seqpacket_recvmsg,
897 	.mmap =		sock_no_mmap,
898 	.set_peek_off =	unix_set_peek_off,
899 	.show_fdinfo =	unix_show_fdinfo,
900 };
901 
/* Intentionally empty ->close() hook: unix sockets have no use for one,
 * the callback exists solely for sockmap.
 */
static void unix_close(struct sock *sk, long timeout)
{
}
908 
/* Intentionally empty ->unhash() hook: unix sockets have no use for one,
 * the callback exists solely for sockmap.
 */
static void unix_unhash(struct sock *sk)
{
}
915 
916 static bool unix_bpf_bypass_getsockopt(int level, int optname)
917 {
918 	if (level == SOL_SOCKET) {
919 		switch (optname) {
920 		case SO_PEERPIDFD:
921 			return true;
922 		default:
923 			return false;
924 		}
925 	}
926 
927 	return false;
928 }
929 
930 struct proto unix_dgram_proto = {
931 	.name			= "UNIX",
932 	.owner			= THIS_MODULE,
933 	.obj_size		= sizeof(struct unix_sock),
934 	.close			= unix_close,
935 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
936 #ifdef CONFIG_BPF_SYSCALL
937 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
938 #endif
939 };
940 
941 struct proto unix_stream_proto = {
942 	.name			= "UNIX-STREAM",
943 	.owner			= THIS_MODULE,
944 	.obj_size		= sizeof(struct unix_sock),
945 	.close			= unix_close,
946 	.unhash			= unix_unhash,
947 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
948 #ifdef CONFIG_BPF_SYSCALL
949 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
950 #endif
951 };
952 
953 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
954 {
955 	struct unix_sock *u;
956 	struct sock *sk;
957 	int err;
958 
959 	atomic_long_inc(&unix_nr_socks);
960 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
961 		err = -ENFILE;
962 		goto err;
963 	}
964 
965 	if (type == SOCK_STREAM)
966 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
967 	else /*dgram and  seqpacket */
968 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
969 
970 	if (!sk) {
971 		err = -ENOMEM;
972 		goto err;
973 	}
974 
975 	sock_init_data(sock, sk);
976 
977 	sk->sk_hash		= unix_unbound_hash(sk);
978 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
979 	sk->sk_write_space	= unix_write_space;
980 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
981 	sk->sk_destruct		= unix_sock_destructor;
982 	u = unix_sk(sk);
983 	u->inflight = 0;
984 	u->path.dentry = NULL;
985 	u->path.mnt = NULL;
986 	spin_lock_init(&u->lock);
987 	INIT_LIST_HEAD(&u->link);
988 	mutex_init(&u->iolock); /* single task reading lock */
989 	mutex_init(&u->bindlock); /* single task binding lock */
990 	init_waitqueue_head(&u->peer_wait);
991 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
992 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
993 	unix_insert_unbound_socket(net, sk);
994 
995 	sock_prot_inuse_add(net, sk->sk_prot, 1);
996 
997 	return sk;
998 
999 err:
1000 	atomic_long_dec(&unix_nr_socks);
1001 	return ERR_PTR(err);
1002 }
1003 
1004 static int unix_create(struct net *net, struct socket *sock, int protocol,
1005 		       int kern)
1006 {
1007 	struct sock *sk;
1008 
1009 	if (protocol && protocol != PF_UNIX)
1010 		return -EPROTONOSUPPORT;
1011 
1012 	sock->state = SS_UNCONNECTED;
1013 
1014 	switch (sock->type) {
1015 	case SOCK_STREAM:
1016 		sock->ops = &unix_stream_ops;
1017 		break;
1018 		/*
1019 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1020 		 *	nothing uses it.
1021 		 */
1022 	case SOCK_RAW:
1023 		sock->type = SOCK_DGRAM;
1024 		fallthrough;
1025 	case SOCK_DGRAM:
1026 		sock->ops = &unix_dgram_ops;
1027 		break;
1028 	case SOCK_SEQPACKET:
1029 		sock->ops = &unix_seqpacket_ops;
1030 		break;
1031 	default:
1032 		return -ESOCKTNOSUPPORT;
1033 	}
1034 
1035 	sk = unix_create1(net, sock, kern, sock->type);
1036 	if (IS_ERR(sk))
1037 		return PTR_ERR(sk);
1038 
1039 	return 0;
1040 }
1041 
1042 static int unix_release(struct socket *sock)
1043 {
1044 	struct sock *sk = sock->sk;
1045 
1046 	if (!sk)
1047 		return 0;
1048 
1049 	sk->sk_prot->close(sk, 0);
1050 	unix_release_sock(sk, 0);
1051 	sock->sk = NULL;
1052 
1053 	return 0;
1054 }
1055 
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * Returns the socket with a reference held, or an ERR_PTR():
 * the kern_path()/path_permission() error, -ECONNREFUSED if the path is
 * not a socket inode or no socket is bound to it, -EPROTOTYPE if the
 * bound socket's type differs from @type.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct sock *found;
	struct inode *ino;
	struct path where;
	int rc;

	unix_mkname_bsd(sunaddr, addr_len);

	rc = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &where);
	if (rc)
		return ERR_PTR(rc);

	/* The caller needs write permission on the socket file. */
	rc = path_permission(&where, MAY_WRITE);
	if (rc)
		goto drop_path;

	ino = d_backing_inode(where.dentry);
	if (!S_ISSOCK(ino->i_mode)) {
		rc = -ECONNREFUSED;
		goto drop_path;
	}

	found = unix_find_socket_byinode(ino);
	if (!found) {
		rc = -ECONNREFUSED;
		goto drop_path;
	}

	if (found->sk_type != type) {
		rc = -EPROTOTYPE;
		sock_put(found);
		goto drop_path;
	}

	touch_atime(&where);
	path_put(&where);

	return found;

drop_path:
	path_put(&where);
	return ERR_PTR(rc);
}
1099 
/*
 * Look up the socket bound to the abstract name in @sunaddr within @net.
 *
 * Returns the socket found by unix_find_socket_byname(), or
 * ERR_PTR(-ECONNREFUSED) if there is none.  If the socket has a path
 * recorded, its access time is touched.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int bucket = unix_abstract_hash(sunaddr, addr_len, type);
	struct sock *match;

	match = unix_find_socket_byname(net, sunaddr, addr_len, bucket);
	if (!match)
		return ERR_PTR(-ECONNREFUSED);

	if (unix_sk(match)->path.dentry)
		touch_atime(&unix_sk(match)->path);

	return match;
}
1118 
1119 static struct sock *unix_find_other(struct net *net,
1120 				    struct sockaddr_un *sunaddr,
1121 				    int addr_len, int type)
1122 {
1123 	struct sock *sk;
1124 
1125 	if (sunaddr->sun_path[0])
1126 		sk = unix_find_bsd(sunaddr, addr_len, type);
1127 	else
1128 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1129 
1130 	return sk;
1131 }
1132 
/*
 * Bind @sk to an automatically chosen abstract name.
 *
 * The name is a leading NUL byte followed by five lower-case hex digits
 * taken from a 20-bit counter.  The counter starts at a random value and is
 * advanced until a name is found that __unix_find_socket_byname() does not
 * report as in use.
 *
 * Returns 0 on success (also when the socket already has an address), the
 * error from mutex_lock_interruptible() if taking the bindlock fails,
 * -ENOMEM if the address cannot be allocated, or -ENOSPC once the counter
 * has wrapped around to its starting value without finding a free name.
 */
static int unix_autobind(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	/* Already bound: nothing to do, err is still 0 here. */
	if (u->addr)
		goto out;

	err = -ENOMEM;
	/* Zeroed allocation, so sun_path[0] stays NUL (abstract name). */
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	/* Name length: the leading NUL byte plus five hex digits. */
	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	old_hash = sk->sk_hash;
	ordernum = get_random_u32();
	/* Remember the starting point so a full wrap-around can be detected. */
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seems to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	/* Name is free: install it while both hash buckets are still locked. */
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
1194 
/*
 * Bind @sk to a filesystem path: create a socket node at the path and, on
 * success, record the path and the address on the socket.
 *
 * Returns 0 on success, -ENOMEM if the address copy cannot be allocated,
 * -EINVAL if the socket already has an address.  Any -EEXIST is reported
 * as -EADDRINUSE.  Other errors come from kern_path_create(),
 * security_path_mknod(), vfs_mknod() or mutex_lock_interruptible().
 */
static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	/* Mode of the new node: the socket inode's mode filtered by the umask. */
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	/* Already bound: out_unlock sets -EINVAL and removes the new node. */
	if (u->addr)
		goto out_unlock;

	old_hash = sk->sk_hash;
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	/* Take our own references on the mount and dentry kept in u->path. */
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}
1263 
1264 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1265 			      int addr_len)
1266 {
1267 	struct unix_sock *u = unix_sk(sk);
1268 	unsigned int new_hash, old_hash;
1269 	struct net *net = sock_net(sk);
1270 	struct unix_address *addr;
1271 	int err;
1272 
1273 	addr = unix_create_addr(sunaddr, addr_len);
1274 	if (!addr)
1275 		return -ENOMEM;
1276 
1277 	err = mutex_lock_interruptible(&u->bindlock);
1278 	if (err)
1279 		goto out;
1280 
1281 	if (u->addr) {
1282 		err = -EINVAL;
1283 		goto out_mutex;
1284 	}
1285 
1286 	old_hash = sk->sk_hash;
1287 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1288 	unix_table_double_lock(net, old_hash, new_hash);
1289 
1290 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1291 		goto out_spin;
1292 
1293 	__unix_set_addr_hash(net, sk, addr, new_hash);
1294 	unix_table_double_unlock(net, old_hash, new_hash);
1295 	mutex_unlock(&u->bindlock);
1296 	return 0;
1297 
1298 out_spin:
1299 	unix_table_double_unlock(net, old_hash, new_hash);
1300 	err = -EADDRINUSE;
1301 out_mutex:
1302 	mutex_unlock(&u->bindlock);
1303 out:
1304 	unix_release_addr(addr);
1305 	return err;
1306 }
1307 
1308 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1309 {
1310 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1311 	struct sock *sk = sock->sk;
1312 	int err;
1313 
1314 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1315 	    sunaddr->sun_family == AF_UNIX)
1316 		return unix_autobind(sk);
1317 
1318 	err = unix_validate_addr(sunaddr, addr_len);
1319 	if (err)
1320 		return err;
1321 
1322 	if (sunaddr->sun_path[0])
1323 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1324 	else
1325 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1326 
1327 	return err;
1328 }
1329 
1330 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1331 {
1332 	if (unlikely(sk1 == sk2) || !sk2) {
1333 		unix_state_lock(sk1);
1334 		return;
1335 	}
1336 	if (sk1 > sk2)
1337 		swap(sk1, sk2);
1338 
1339 	unix_state_lock(sk1);
1340 	unix_state_lock_nested(sk2, U_LOCK_SECOND);
1341 }
1342 
1343 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1344 {
1345 	if (unlikely(sk1 == sk2) || !sk2) {
1346 		unix_state_unlock(sk1);
1347 		return;
1348 	}
1349 	unix_state_unlock(sk1);
1350 	unix_state_unlock(sk2);
1351 }
1352 
/*
 * connect() handler that sets or clears the default peer of @sock.
 *
 * With an address family other than AF_UNSPEC the target socket is looked
 * up and becomes the peer; with AF_UNSPEC the peer is cleared.  A previous
 * peer, if any, has its reference dropped, and is notified via
 * unix_dgram_disconnected() when it differs from the new peer.
 *
 * Returns 0 on success, -EINVAL if @alen is too short to hold sa_family,
 * -EPERM if unix_may_send() refuses the target, or the error from address
 * validation, autobind, the lookup or security_unix_may_send().
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		/* Credential passing is on but we have no address yet: autobind. */
		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !READ_ONCE(unix_sk(sk)->addr)) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		/* With other == NULL this locks sk only. */
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer) {
			unix_dgram_disconnected(sk, old_peer);

			/* Mark the old peer closed unless it has a peer of its own. */
			unix_state_lock(old_peer);
			if (!unix_peer(old_peer))
				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
			unix_state_unlock(old_peer);
		}

		/* Drop the reference that unix_peer(sk) held on the old peer. */
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
1448 
/*
 * Wait on @other's peer_wait queue for up to @timeo.
 *
 * Called with @other's state lock held; the lock is released here and is
 * not taken again.  The task only sleeps if @other is not dead, has not
 * shut down receiving, and its receive queue is still full.
 *
 * Returns the value from schedule_timeout(), or @timeo unchanged when no
 * sleep was needed.
 */
static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	/* Join the wait queue before evaluating the sleep condition below. */
	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
1470 
/*
 * connect() handler for connection-oriented sockets.
 *
 * Allocates the socket for the far end of the connection (newsk) and a
 * one-byte skb up front, then finds the listening socket.  Once the
 * listener and @sk are both locked and still in the expected state, sk and
 * newsk are linked as peers, newsk inherits the listener's address and
 * path, and the skb is queued on the listener's receive queue.
 *
 * Returns 0 on success.  Errors set in this function: -ENOMEM if the skb
 * cannot be allocated, -ECONNREFUSED if the target is not listening or has
 * shut down receiving, -EAGAIN if its queue is full and no timeout is
 * left, -EISCONN or -EINVAL if @sk is not in TCP_CLOSE.  Other errors come
 * from address validation, autobind, unix_create1(), the lookup,
 * sock_intr_errno() after an interrupted wait, or the security hook.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	unsigned char state;
	long timeo;
	int err;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	/* Credential passing is on but we have no address yet: autobind. */
	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	 * If we did this after the state is locked, we would have to
	 * recheck everything again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full_lockless(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* unix_wait_for_peer() drops other's state lock. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* self connect and simultaneous connect are eliminated
	 * by rejecting TCP_LISTEN socket to avoid deadlock.
	 */
	state = READ_ONCE(sk->sk_state);
	if (unlikely(state != TCP_CLOSE)) {
		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk, U_LOCK_SECOND);

	/* Re-check the state now that sk's lock is held. */
	if (unlikely(sk->sk_state != TCP_CLOSE)) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	/* skb, newsk and other may each still be NULL at this point. */
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1646 
1647 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1648 {
1649 	struct sock *ska = socka->sk, *skb = sockb->sk;
1650 
1651 	/* Join our sockets back to back */
1652 	sock_hold(ska);
1653 	sock_hold(skb);
1654 	unix_peer(ska) = skb;
1655 	unix_peer(skb) = ska;
1656 	init_peercred(ska);
1657 	init_peercred(skb);
1658 
1659 	ska->sk_state = TCP_ESTABLISHED;
1660 	skb->sk_state = TCP_ESTABLISHED;
1661 	socka->state  = SS_CONNECTED;
1662 	sockb->state  = SS_CONNECTED;
1663 	return 0;
1664 }
1665 
1666 static void unix_sock_inherit_flags(const struct socket *old,
1667 				    struct socket *new)
1668 {
1669 	if (test_bit(SOCK_PASSCRED, &old->flags))
1670 		set_bit(SOCK_PASSCRED, &new->flags);
1671 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1672 		set_bit(SOCK_PASSPIDFD, &new->flags);
1673 	if (test_bit(SOCK_PASSSEC, &old->flags))
1674 		set_bit(SOCK_PASSSEC, &new->flags);
1675 }
1676 
1677 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1678 		       bool kern)
1679 {
1680 	struct sock *sk = sock->sk;
1681 	struct sock *tsk;
1682 	struct sk_buff *skb;
1683 	int err;
1684 
1685 	err = -EOPNOTSUPP;
1686 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1687 		goto out;
1688 
1689 	err = -EINVAL;
1690 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1691 		goto out;
1692 
1693 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1694 	 * so that no locks are necessary.
1695 	 */
1696 
1697 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1698 				&err);
1699 	if (!skb) {
1700 		/* This means receive shutdown. */
1701 		if (err == 0)
1702 			err = -EINVAL;
1703 		goto out;
1704 	}
1705 
1706 	tsk = skb->sk;
1707 	skb_free_datagram(sk, skb);
1708 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1709 
1710 	/* attach accepted sock to socket */
1711 	unix_state_lock(tsk);
1712 	newsock->state = SS_CONNECTED;
1713 	unix_sock_inherit_flags(sock, newsock);
1714 	sock_graft(tsk, newsock);
1715 	unix_state_unlock(tsk);
1716 	return 0;
1717 
1718 out:
1719 	return err;
1720 }
1721 
1722 
1723 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1724 {
1725 	struct sock *sk = sock->sk;
1726 	struct unix_address *addr;
1727 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1728 	int err = 0;
1729 
1730 	if (peer) {
1731 		sk = unix_peer_get(sk);
1732 
1733 		err = -ENOTCONN;
1734 		if (!sk)
1735 			goto out;
1736 		err = 0;
1737 	} else {
1738 		sock_hold(sk);
1739 	}
1740 
1741 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1742 	if (!addr) {
1743 		sunaddr->sun_family = AF_UNIX;
1744 		sunaddr->sun_path[0] = 0;
1745 		err = offsetof(struct sockaddr_un, sun_path);
1746 	} else {
1747 		err = addr->len;
1748 		memcpy(sunaddr, addr->name, addr->len);
1749 	}
1750 	sock_put(sk);
1751 out:
1752 	return err;
1753 }
1754 
/*
 * Duplicate the file list attached to @skb into @scm for MSG_PEEK, then
 * synchronise with the garbage collector (see the comment below).
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	/*
	 * Garbage collection of unix sockets starts by selecting a set of
	 * candidate sockets which have reference only from being in flight
	 * (total_refs == inflight_refs).  This condition is checked once during
	 * the candidate collection phase, and candidates are marked as such, so
	 * that non-candidates can later be ignored.  While inflight_refs is
	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
	 * is an instantaneous decision.
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operation that changes the file count through file descriptors
	 * (dup, close, sendmsg) does not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeuing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd.  The following lock/unlock
	 * pair is to ensure serialization with garbage collection.  It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate.  If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
	 */
	spin_lock(&unix_gc_lock);
	spin_unlock(&unix_gc_lock);
}
1801 
1802 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1803 {
1804 	int err = 0;
1805 
1806 	UNIXCB(skb).pid  = get_pid(scm->pid);
1807 	UNIXCB(skb).uid = scm->creds.uid;
1808 	UNIXCB(skb).gid = scm->creds.gid;
1809 	UNIXCB(skb).fp = NULL;
1810 	unix_get_secdata(scm, skb);
1811 	if (scm->fp && send_fds)
1812 		err = unix_attach_fds(scm, skb);
1813 
1814 	skb->destructor = unix_destruct_scm;
1815 	return err;
1816 }
1817 
1818 static bool unix_passcred_enabled(const struct socket *sock,
1819 				  const struct sock *other)
1820 {
1821 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1822 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1823 	       !other->sk_socket ||
1824 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1825 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1826 }
1827 
1828 /*
1829  * Some apps rely on write() giving SCM_CREDENTIALS
1830  * We include credentials if source or destination socket
1831  * asserted SOCK_PASSCRED.
1832  */
1833 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1834 			    const struct sock *other)
1835 {
1836 	if (UNIXCB(skb).pid)
1837 		return;
1838 	if (unix_passcred_enabled(sock, other)) {
1839 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1840 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1841 	}
1842 }
1843 
1844 static bool unix_skb_scm_eq(struct sk_buff *skb,
1845 			    struct scm_cookie *scm)
1846 {
1847 	return UNIXCB(skb).pid == scm->pid &&
1848 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1849 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1850 	       unix_secdata_eq(scm, skb);
1851 }
1852 
1853 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1854 {
1855 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1856 	struct unix_sock *u = unix_sk(sk);
1857 
1858 	if (unlikely(fp && fp->count))
1859 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1860 }
1861 
1862 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1863 {
1864 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1865 	struct unix_sock *u = unix_sk(sk);
1866 
1867 	if (unlikely(fp && fp->count))
1868 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1869 }
1870 
/*
 *	Send AF_UNIX data.
 *
 *	sendmsg() handler that delivers one message as a single skb.  The
 *	destination is the address in msg->msg_name when one is given,
 *	otherwise the connected peer.
 *
 *	Returns @len on success, and also when sk_filter() rejects the skb
 *	(the packet is dropped without telling the sender).  Errors set in
 *	this function: -EOPNOTSUPP for MSG_OOB, -ENOTCONN with no address
 *	and no peer, -EMSGSIZE if @len exceeds sk_sndbuf - 32, -ECONNRESET
 *	if the peer went away and no address was given, -EPERM if
 *	unix_may_send() refuses, -EPIPE if the receiver has shut down (or
 *	is dead, for SOCK_SEQPACKET), -ECONNREFUSED if the connected peer
 *	is dead, -EAGAIN if the receiver's queue is full and no timeout is
 *	left.  Other errors come from the helpers called.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;
	} else {
		/* No explicit destination: use the connected peer. */
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	/* Credential passing is on but we have no address yet: autobind. */
	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > READ_ONCE(sk->sk_sndbuf) - 32)
		goto out;

	/* For large messages, the part beyond SKB_MAX_ALLOC (page aligned,
	 * capped at MAX_SKB_FRAGS pages) is requested as data_len.
	 */
	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	/* other is NULL on the first pass with an explicit address, or after
	 * a dead receiver was dropped below.
	 */
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	/* sk_locked records whether sk's state lock is held besides other's. */
	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
			 */
			unix_state_unlock(sk);
			err = -EPIPE;
		} else if (unix_peer(sk) == other) {
			/* The dead socket was our connected peer: disconnect. */
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			/* Drop the reference that unix_peer(sk) was holding. */
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		/* err == 0: the address may resolve to another socket, retry. */
		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			/* unix_wait_for_peer() drops other's state lock. */
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		/* Non-blocking from here on: take both state locks. */
		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		/* Both locks were just taken: redo the checks under them. */
		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}
2087 
/* We use paged skbs for stream sockets.  The paged part of one skb is
 * limited to 32768 bytes, rounded up to a whole page-order allocation,
 * so the limit is never smaller than one full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2092 
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Queue one byte of out-of-band data from @msg on @other.
 *
 * A one-byte skb is allocated, SCM data is attached (file descriptors only
 * when @fds_sent is false) and one byte is copied from the message
 * iterator.  Then, with @other's state lock held, the skb replaces any
 * previous oob_skb and is appended to the receive queue.  Finally SIGURG
 * is sent and sk_data_ready is called.
 *
 * Returns 0 on success, -EPIPE if @other is dead or has shut down
 * receiving, or the error from the allocation, unix_scm_to_skb() or the
 * copy.
 */
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
		     struct scm_cookie *scm, bool fds_sent)
{
	struct unix_sock *ousk = unix_sk(other);
	struct sk_buff *skb;
	int err = 0;

	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);

	if (!skb)
		return err;

	err = unix_scm_to_skb(scm, skb, !fds_sent);
	if (err < 0) {
		kfree_skb(skb);
		return err;
	}
	skb_put(skb, 1);
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);

	if (err) {
		kfree_skb(skb);
		return err;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    (other->sk_shutdown & RCV_SHUTDOWN)) {
		unix_state_unlock(other);
		kfree_skb(skb);
		return -EPIPE;
	}

	maybe_add_creds(skb, sock, other);
	/* Extra reference: presumably one for oob_skb, one for the queue. */
	skb_get(skb);

	scm_stat_add(other, skb);

	spin_lock(&other->sk_receive_queue.lock);
	/* Drop the reference held through the previous oob_skb pointer. */
	if (ousk->oob_skb)
		consume_skb(ousk->oob_skb);
	WRITE_ONCE(ousk->oob_skb, skb);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);

	sk_send_sigurg(other);
	unix_state_unlock(other);
	other->sk_data_ready(other);

	/* err is 0 here: every failure above returned early. */
	return err;
}
#endif
2147 
/*
 * sendmsg() handler for stream sockets.
 *
 * Cuts @len bytes of @msg into skbs and appends each one to the receive
 * queue of the connected peer. With MSG_OOB (only when CONFIG_AF_UNIX_OOB
 * is enabled) the last byte is excluded from the main loop and sent
 * separately through queue_oob().
 *
 * Returns the number of bytes queued if that is non-zero, otherwise a
 * negative errno. On the pipe_err paths SIGPIPE is sent to the caller only
 * when nothing was queued and MSG_NOSIGNAL is not set.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		/* Reserve the final byte for queue_oob(); a zero-length
		 * MSG_OOB send falls through to the error below.
		 */
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	/* A destination address is never accepted on a stream socket. */
	if (msg->msg_namelen) {
		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			/* Empty skb; the pages are spliced in below. */
			skb = sock_alloc_send_pskb(sk, 0, 0,
						   msg->msg_flags & MSG_DONTWAIT,
						   &err, 0);
		} else {
			/* Keep two messages in the pipe so it schedules better */
			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);

			/* allow fallback to order-0 allocations */
			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

			/* Whatever does not fit in the linear head goes to pages. */
			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
						   msg->msg_flags & MSG_DONTWAIT, &err,
						   get_order(UNIX_SKB_FRAGS_SZ));
		}
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
						   sk->sk_allocation);
			if (err < 0) {
				kfree_skb(skb);
				goto out_err;
			}
			/* The helper reports how many bytes it actually spliced. */
			size = err;
			refcount_add(size, &sk->sk_wmem_alloc);
		} else {
			skb_put(skb, size - data_len);
			skb->data_len = data_len;
			skb->len = size;
			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
			if (err) {
				kfree_skb(skb);
				goto out_err;
			}
		}

		unix_state_lock(other);

		/* Re-check the peer for every skb: it may have closed or shut
		 * down its receive side since the previous iteration.
		 */
		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		scm_stat_add(other, skb);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		err = queue_oob(sock, msg, other, &scm, fds_sent);
		if (err)
			goto out_err;
		/* Account for the byte that was held back from len above. */
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	/* A partial send is reported as success; the error is dropped. */
	return sent ? : err;
}
2280 
2281 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2282 				  size_t len)
2283 {
2284 	int err;
2285 	struct sock *sk = sock->sk;
2286 
2287 	err = sock_error(sk);
2288 	if (err)
2289 		return err;
2290 
2291 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2292 		return -ENOTCONN;
2293 
2294 	if (msg->msg_namelen)
2295 		msg->msg_namelen = 0;
2296 
2297 	return unix_dgram_sendmsg(sock, msg, len);
2298 }
2299 
2300 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2301 				  size_t size, int flags)
2302 {
2303 	struct sock *sk = sock->sk;
2304 
2305 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2306 		return -ENOTCONN;
2307 
2308 	return unix_dgram_recvmsg(sock, msg, size, flags);
2309 }
2310 
2311 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2312 {
2313 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2314 
2315 	if (addr) {
2316 		msg->msg_namelen = addr->len;
2317 		memcpy(msg->msg_name, addr->name, addr->len);
2318 	}
2319 }
2320 
/*
 * Receive one datagram from @sk into @msg.
 *
 * @sk:    receiving socket
 * @msg:   destination; msg_name (if set) receives the sender address and
 *         msg_flags gains MSG_TRUNC when the datagram did not fit
 * @size:  capacity of the caller's buffer
 * @flags: MSG_* flags; MSG_OOB is rejected with -EOPNOTSUPP
 *
 * Returns the number of bytes copied (or, with MSG_TRUNC in @flags, the
 * remaining length of the datagram), 0 for EOF on a shut-down SEQPACKET
 * socket, or a negative errno.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	/* Try to dequeue under iolock; on -EAGAIN drop the mutex and sleep
	 * until more packets arrive or the timeout expires.
	 */
	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			/* Leave the loop with iolock still held. */
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	/* A slot in the queue was freed (or peeked); wake senders waiting
	 * on peer_wait.
	 */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	/* Clamp to what is left of the datagram, or flag truncation. */
	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	/* With MSG_TRUNC in flags, report the real remaining length rather
	 * than the number of bytes copied.
	 */
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
2427 
2428 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2429 			      int flags)
2430 {
2431 	struct sock *sk = sock->sk;
2432 
2433 #ifdef CONFIG_BPF_SYSCALL
2434 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2435 
2436 	if (prot != &unix_dgram_proto)
2437 		return prot->recvmsg(sk, msg, size, flags, NULL);
2438 #endif
2439 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2440 }
2441 
2442 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2443 {
2444 	struct unix_sock *u = unix_sk(sk);
2445 	struct sk_buff *skb;
2446 	int err;
2447 
2448 	mutex_lock(&u->iolock);
2449 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2450 	mutex_unlock(&u->iolock);
2451 	if (!skb)
2452 		return err;
2453 
2454 	return recv_actor(sk, skb);
2455 }
2456 
/*
 *	Sleep until more data has arrived. But check for races..
 *
 *	@sk:        socket to wait on
 *	@timeo:     remaining timeout in jiffies
 *	@last:      tail skb the caller saw before deciding to wait
 *	@last_len:  length of @last at that time
 *	@freezable: add TASK_FREEZABLE to the sleep state
 *
 *	Returns the timeout that is left.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, state);

		/* Stop waiting as soon as the queue tail differs from what
		 * the caller saw, or an error, shutdown, signal or timeout
		 * makes further waiting pointless.
		 */
		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		/* The state lock is not held across the sleep. */
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
2497 
2498 static unsigned int unix_skb_len(const struct sk_buff *skb)
2499 {
2500 	return skb->len - UNIXCB(skb).consumed;
2501 }
2502 
/*
 * Parameters of one stream read, shared by the recvmsg and splice paths
 * and handed to the per-path actor.
 */
struct unix_stream_read_state {
	/* Copies or splices up to the requested number of bytes from the
	 * skb, starting at the given skip offset past the consumed part.
	 * Returns the number of bytes handled or a negative error.
	 */
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	/* Socket being read. */
	struct socket *socket;
	/* Destination message; not set by the splice path. */
	struct msghdr *msg;
	/* Destination pipe; only set by the splice path. */
	struct pipe_inode_info *pipe;
	/* Maximum number of bytes to read. */
	size_t size;
	/* MSG_* receive flags. */
	int flags;
	/* Flags forwarded to skb_splice_bits() by the splice actor. */
	unsigned int splice_flags;
};
2513 
2514 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Handle recvmsg(MSG_OOB): deliver the pending out-of-band byte.
 *
 * Returns 1 when the byte was delivered (MSG_OOB is then set in the
 * message flags), -EINVAL when SOCK_URGINLINE is set or no OOB skb is
 * pending, and -EFAULT when the actor failed.
 */
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int chunk = 1;
	struct sk_buff *oob_skb;

	mutex_lock(&u->iolock);
	unix_state_lock(sk);
	spin_lock(&sk->sk_receive_queue.lock);

	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
		spin_unlock(&sk->sk_receive_queue.lock);
		unix_state_unlock(sk);
		mutex_unlock(&u->iolock);
		return -EINVAL;
	}

	oob_skb = u->oob_skb;

	/* A real read clears the OOB marker; the local pointer now stands
	 * in for u->oob_skb. A peek leaves the marker alone and takes its
	 * own reference instead.
	 */
	if (!(state->flags & MSG_PEEK))
		WRITE_ONCE(u->oob_skb, NULL);
	else
		skb_get(oob_skb);

	spin_unlock(&sk->sk_receive_queue.lock);
	unix_state_unlock(sk);

	chunk = state->recv_actor(oob_skb, 0, chunk, state);

	/* Mark the byte as read; the skb is not unlinked from the receive
	 * queue here.
	 */
	if (!(state->flags & MSG_PEEK))
		UNIXCB(oob_skb).consumed += 1;

	consume_skb(oob_skb);

	mutex_unlock(&u->iolock);

	if (chunk < 0)
		return -EFAULT;

	state->msg->msg_flags |= MSG_OOB;
	return 1;
}
2559 
/*
 * Decide what the in-band reader should do with @skb with respect to the
 * out-of-band marker.
 *
 * @skb:    candidate skb from the receive queue
 * @sk:     receiving socket
 * @flags:  MSG_* flags of the read
 * @copied: bytes already copied to the caller in this read
 *
 * Returns the skb to read next, or NULL when there is nothing to read from
 * here (the caller stops if it has already copied data).
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct unix_sock *u = unix_sk(sk);

	/* A fully consumed skb is simply dropped from the queue on a real
	 * read.
	 */
	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
		skb_unlink(skb, &sk->sk_receive_queue);
		consume_skb(skb);
		skb = NULL;
	} else {
		struct sk_buff *unlinked_skb = NULL;

		spin_lock(&sk->sk_receive_queue.lock);

		if (skb == u->oob_skb) {
			if (copied) {
				/* Data was already returned: stop in front
				 * of the OOB byte.
				 */
				skb = NULL;
			} else if (!(flags & MSG_PEEK)) {
				if (sock_flag(sk, SOCK_URGINLINE)) {
					/* Read the OOB byte in-band: clear
					 * the marker and drop its reference,
					 * the skb stays queued.
					 */
					WRITE_ONCE(u->oob_skb, NULL);
					consume_skb(skb);
				} else {
					/* Discard the unread OOB byte and
					 * continue with the new queue head.
					 */
					__skb_unlink(skb, &sk->sk_receive_queue);
					WRITE_ONCE(u->oob_skb, NULL);
					unlinked_skb = skb;
					skb = skb_peek(&sk->sk_receive_queue);
				}
			} else if (!sock_flag(sk, SOCK_URGINLINE)) {
				/* Peeking: step over the OOB byte. */
				skb = skb_peek_next(skb, &sk->sk_receive_queue);
			}
		}

		spin_unlock(&sk->sk_receive_queue.lock);

		/* Release the unlinked skb outside the queue lock. The
		 * WARN fires if skb_unref() reports the last reference,
		 * i.e. a second one was expected to remain for kfree_skb().
		 */
		if (unlinked_skb) {
			WARN_ON_ONCE(skb_unref(unlinked_skb));
			kfree_skb(unlinked_skb);
		}
	}
	return skb;
}
2601 #endif
2602 
/*
 * Dequeue one skb from an established stream socket without blocking and
 * pass it to @recv_actor.
 *
 * Returns the actor's result, -ENOTCONN if the socket is not established,
 * the dequeue error if nothing was available, -ECONNRESET if the dequeued
 * skb was the OOB skb and the socket is dead, or -EAGAIN after dropping a
 * dequeued OOB skb.
 */
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
		return -ENOTCONN;

	mutex_lock(&u->iolock);
	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
	mutex_unlock(&u->iolock);
	if (!skb)
		return err;

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	/* Lockless pre-check; confirmed under the queue lock below. */
	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
		bool drop = false;

		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD)) {
			unix_state_unlock(sk);
			kfree_skb(skb);
			return -ECONNRESET;
		}

		spin_lock(&sk->sk_receive_queue.lock);
		if (likely(skb == u->oob_skb)) {
			WRITE_ONCE(u->oob_skb, NULL);
			drop = true;
		}
		spin_unlock(&sk->sk_receive_queue.lock);

		unix_state_unlock(sk);

		/* The OOB byte is not handed to the actor: release both
		 * the marker's reference and the dequeued one.
		 */
		if (drop) {
			WARN_ON_ONCE(skb_unref(skb));
			kfree_skb(skb);
			return -EAGAIN;
		}
	}
#endif

	return recv_actor(sk, skb);
}
2649 
/*
 * Common implementation of stream reads (recvmsg and splice).
 *
 * @state:     description of the read: socket, destination, size, flags
 *             and the actor that moves bytes out of each skb
 * @freezable: passed on to unix_stream_data_wait() when the read sleeps
 *
 * Walks the receive queue under u->iolock, handing each skb to
 * state->recv_actor until @state->size bytes were read, the queue ran dry
 * with the low-water mark met, or an error/shutdown/signal stopped the
 * read.
 *
 * Returns the number of bytes read if non-zero, otherwise 0 or a negative
 * errno.
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		/* Without CONFIG_AF_UNIX_OOB the default error stands. */
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while the actor sleeps copying data out
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		/* Remember the queue head and its length so the wait helper
		 * can tell whether anything changed.
		 */
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

again:
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			/* Stop in front of the OOB byte once data was read. */
			if (!skb && copied) {
				unix_state_unlock(sk);
				break;
			}
		}
#endif
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			/* Sleep without iolock held. */
			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				/* iolock is not held on this path. */
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		/* Skip over skbs that lie entirely before the peek offset. */
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		/* Hold a reference across the actor call. */
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			/* Caller's buffer is full; the rest stays queued. */
			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* Stop after an skb that carried fds. */
			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			/* Peek on: continue with the next queued skb. */
			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	/* Only the recvmsg path has a msghdr to deliver control data to. */
	if (state->msg)
		scm_recv_unix(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
2862 
2863 static int unix_stream_read_actor(struct sk_buff *skb,
2864 				  int skip, int chunk,
2865 				  struct unix_stream_read_state *state)
2866 {
2867 	int ret;
2868 
2869 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2870 				    state->msg, chunk);
2871 	return ret ?: chunk;
2872 }
2873 
2874 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2875 			  size_t size, int flags)
2876 {
2877 	struct unix_stream_read_state state = {
2878 		.recv_actor = unix_stream_read_actor,
2879 		.socket = sk->sk_socket,
2880 		.msg = msg,
2881 		.size = size,
2882 		.flags = flags
2883 	};
2884 
2885 	return unix_stream_read_generic(&state, true);
2886 }
2887 
2888 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2889 			       size_t size, int flags)
2890 {
2891 	struct unix_stream_read_state state = {
2892 		.recv_actor = unix_stream_read_actor,
2893 		.socket = sock,
2894 		.msg = msg,
2895 		.size = size,
2896 		.flags = flags
2897 	};
2898 
2899 #ifdef CONFIG_BPF_SYSCALL
2900 	struct sock *sk = sock->sk;
2901 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2902 
2903 	if (prot != &unix_stream_proto)
2904 		return prot->recvmsg(sk, msg, size, flags, NULL);
2905 #endif
2906 	return unix_stream_read_generic(&state, true);
2907 }
2908 
2909 static int unix_stream_splice_actor(struct sk_buff *skb,
2910 				    int skip, int chunk,
2911 				    struct unix_stream_read_state *state)
2912 {
2913 	return skb_splice_bits(skb, state->socket->sk,
2914 			       UNIXCB(skb).consumed + skip,
2915 			       state->pipe, chunk, state->splice_flags);
2916 }
2917 
2918 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2919 				       struct pipe_inode_info *pipe,
2920 				       size_t size, unsigned int flags)
2921 {
2922 	struct unix_stream_read_state state = {
2923 		.recv_actor = unix_stream_splice_actor,
2924 		.socket = sock,
2925 		.pipe = pipe,
2926 		.size = size,
2927 		.splice_flags = flags,
2928 	};
2929 
2930 	if (unlikely(*ppos))
2931 		return -ESPIPE;
2932 
2933 	if (sock->file->f_flags & O_NONBLOCK ||
2934 	    flags & SPLICE_F_NONBLOCK)
2935 		state.flags = MSG_DONTWAIT;
2936 
2937 	return unix_stream_read_generic(&state, false);
2938 }
2939 
2940 static int unix_shutdown(struct socket *sock, int mode)
2941 {
2942 	struct sock *sk = sock->sk;
2943 	struct sock *other;
2944 
2945 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2946 		return -EINVAL;
2947 	/* This maps:
2948 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2949 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2950 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2951 	 */
2952 	++mode;
2953 
2954 	unix_state_lock(sk);
2955 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2956 	other = unix_peer(sk);
2957 	if (other)
2958 		sock_hold(other);
2959 	unix_state_unlock(sk);
2960 	sk->sk_state_change(sk);
2961 
2962 	if (other &&
2963 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2964 
2965 		int peer_mode = 0;
2966 		const struct proto *prot = READ_ONCE(other->sk_prot);
2967 
2968 		if (prot->unhash)
2969 			prot->unhash(other);
2970 		if (mode&RCV_SHUTDOWN)
2971 			peer_mode |= SEND_SHUTDOWN;
2972 		if (mode&SEND_SHUTDOWN)
2973 			peer_mode |= RCV_SHUTDOWN;
2974 		unix_state_lock(other);
2975 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2976 		unix_state_unlock(other);
2977 		other->sk_state_change(other);
2978 		if (peer_mode == SHUTDOWN_MASK)
2979 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2980 		else if (peer_mode & RCV_SHUTDOWN)
2981 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2982 	}
2983 	if (other)
2984 		sock_put(other);
2985 
2986 	return 0;
2987 }
2988 
2989 long unix_inq_len(struct sock *sk)
2990 {
2991 	struct sk_buff *skb;
2992 	long amount = 0;
2993 
2994 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
2995 		return -EINVAL;
2996 
2997 	spin_lock(&sk->sk_receive_queue.lock);
2998 	if (sk->sk_type == SOCK_STREAM ||
2999 	    sk->sk_type == SOCK_SEQPACKET) {
3000 		skb_queue_walk(&sk->sk_receive_queue, skb)
3001 			amount += unix_skb_len(skb);
3002 	} else {
3003 		skb = skb_peek(&sk->sk_receive_queue);
3004 		if (skb)
3005 			amount = skb->len;
3006 	}
3007 	spin_unlock(&sk->sk_receive_queue.lock);
3008 
3009 	return amount;
3010 }
3011 EXPORT_SYMBOL_GPL(unix_inq_len);
3012 
/* Send-side memory currently charged to @sk, as reported by
 * sk_wmem_alloc_get().
 */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3018 
3019 static int unix_open_file(struct sock *sk)
3020 {
3021 	struct path path;
3022 	struct file *f;
3023 	int fd;
3024 
3025 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3026 		return -EPERM;
3027 
3028 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3029 		return -ENOENT;
3030 
3031 	path = unix_sk(sk)->path;
3032 	if (!path.dentry)
3033 		return -ENOENT;
3034 
3035 	path_get(&path);
3036 
3037 	fd = get_unused_fd_flags(O_CLOEXEC);
3038 	if (fd < 0)
3039 		goto out;
3040 
3041 	f = dentry_open(&path, O_PATH, current_cred());
3042 	if (IS_ERR(f)) {
3043 		put_unused_fd(fd);
3044 		fd = PTR_ERR(f);
3045 		goto out;
3046 	}
3047 
3048 	fd_install(fd, f);
3049 out:
3050 	path_put(&path);
3051 
3052 	return fd;
3053 }
3054 
3055 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3056 {
3057 	struct sock *sk = sock->sk;
3058 	long amount = 0;
3059 	int err;
3060 
3061 	switch (cmd) {
3062 	case SIOCOUTQ:
3063 		amount = unix_outq_len(sk);
3064 		err = put_user(amount, (int __user *)arg);
3065 		break;
3066 	case SIOCINQ:
3067 		amount = unix_inq_len(sk);
3068 		if (amount < 0)
3069 			err = amount;
3070 		else
3071 			err = put_user(amount, (int __user *)arg);
3072 		break;
3073 	case SIOCUNIXFILE:
3074 		err = unix_open_file(sk);
3075 		break;
3076 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3077 	case SIOCATMARK:
3078 		{
3079 			struct sk_buff *skb;
3080 			int answ = 0;
3081 
3082 			skb = skb_peek(&sk->sk_receive_queue);
3083 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3084 				answ = 1;
3085 			err = put_user(answ, (int __user *)arg);
3086 		}
3087 		break;
3088 #endif
3089 	default:
3090 		err = -ENOIOCTLCMD;
3091 		break;
3092 	}
3093 	return err;
3094 }
3095 
3096 #ifdef CONFIG_COMPAT
/* Compat ioctl: convert the compat user pointer and reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3101 #endif
3102 
3103 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3104 {
3105 	struct sock *sk = sock->sk;
3106 	unsigned char state;
3107 	__poll_t mask;
3108 	u8 shutdown;
3109 
3110 	sock_poll_wait(file, sock, wait);
3111 	mask = 0;
3112 	shutdown = READ_ONCE(sk->sk_shutdown);
3113 	state = READ_ONCE(sk->sk_state);
3114 
3115 	/* exceptional events? */
3116 	if (READ_ONCE(sk->sk_err))
3117 		mask |= EPOLLERR;
3118 	if (shutdown == SHUTDOWN_MASK)
3119 		mask |= EPOLLHUP;
3120 	if (shutdown & RCV_SHUTDOWN)
3121 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3122 
3123 	/* readable? */
3124 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3125 		mask |= EPOLLIN | EPOLLRDNORM;
3126 	if (sk_is_readable(sk))
3127 		mask |= EPOLLIN | EPOLLRDNORM;
3128 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3129 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3130 		mask |= EPOLLPRI;
3131 #endif
3132 
3133 	/* Connection-based need to check for termination and startup */
3134 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3135 	    state == TCP_CLOSE)
3136 		mask |= EPOLLHUP;
3137 
3138 	/*
3139 	 * we set writable also when the other side has shut down the
3140 	 * connection. This prevents stuck sockets.
3141 	 */
3142 	if (unix_writable(sk, state))
3143 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3144 
3145 	return mask;
3146 }
3147 
/*
 * poll() handler for datagram-style sockets.
 *
 * Builds the event mask from error, shutdown and receive-queue state. The
 * write side is only evaluated when the caller asked for write events,
 * because it needs the state lock and a look at the peer's queue.
 */
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				    poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk, state);
	if (writable) {
		unix_state_lock(sk);

		/* Not writable if the peer is not connected back to us, its
		 * receive queue is full, and unix_dgram_peer_wake_me()
		 * returns non-zero.
		 */
		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full_lockless(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
3207 
3208 #ifdef CONFIG_PROC_FS
3209 
/* A seq_file position packs a hash bucket index in its upper bits and a
 * 1-based offset within that bucket in its lower BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Extract the bucket index from a position. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
/* Extract the in-bucket offset from a position. */
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
/* Build a position from bucket index b and in-bucket offset o. */
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3215 
3216 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3217 {
3218 	unsigned long offset = get_offset(*pos);
3219 	unsigned long bucket = get_bucket(*pos);
3220 	unsigned long count = 0;
3221 	struct sock *sk;
3222 
3223 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3224 	     sk; sk = sk_next(sk)) {
3225 		if (++count == offset)
3226 			break;
3227 	}
3228 
3229 	return sk;
3230 }
3231 
3232 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3233 {
3234 	unsigned long bucket = get_bucket(*pos);
3235 	struct net *net = seq_file_net(seq);
3236 	struct sock *sk;
3237 
3238 	while (bucket < UNIX_HASH_SIZE) {
3239 		spin_lock(&net->unx.table.locks[bucket]);
3240 
3241 		sk = unix_from_bucket(seq, pos);
3242 		if (sk)
3243 			return sk;
3244 
3245 		spin_unlock(&net->unx.table.locks[bucket]);
3246 
3247 		*pos = set_bucket_offset(++bucket, 1);
3248 	}
3249 
3250 	return NULL;
3251 }
3252 
3253 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3254 				  loff_t *pos)
3255 {
3256 	unsigned long bucket = get_bucket(*pos);
3257 
3258 	sk = sk_next(sk);
3259 	if (sk)
3260 		return sk;
3261 
3262 
3263 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3264 
3265 	*pos = set_bucket_offset(++bucket, 1);
3266 
3267 	return unix_get_first(seq, pos);
3268 }
3269 
3270 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3271 {
3272 	if (!*pos)
3273 		return SEQ_START_TOKEN;
3274 
3275 	return unix_get_first(seq, pos);
3276 }
3277 
3278 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3279 {
3280 	++*pos;
3281 
3282 	if (v == SEQ_START_TOKEN)
3283 		return unix_get_first(seq, pos);
3284 
3285 	return unix_get_next(seq, v, pos);
3286 }
3287 
3288 static void unix_seq_stop(struct seq_file *seq, void *v)
3289 {
3290 	struct sock *sk = v;
3291 
3292 	if (sk)
3293 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3294 }
3295 
/*
 * seq_file ->show: print the column header for the start token, or one
 * line describing the socket @v. Always returns 0.
 */
static int unix_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		/* Columns: pointer, refcount, protocol (always 0), flags
		 * (__SO_ACCEPTCON when listening), type, socket state, inode.
		 */
		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	// under a hash table lock here
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len -
				offsetof(struct sockaddr_un, sun_path);
			if (u->addr->name->sun_path[0]) {
				/* Path starts with a non-NUL byte: print one
				 * byte less (presumably the trailing NUL).
				 */
				len--;
			} else {
				/* Leading NUL byte is shown as '@'. */
				seq_putc(seq, '@');
				i++;
			}
			/* Any remaining NUL byte is printed as '@' too. */
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}
3341 
3342 static const struct seq_operations unix_seq_ops = {
3343 	.start  = unix_seq_start,
3344 	.next   = unix_seq_next,
3345 	.stop   = unix_seq_stop,
3346 	.show   = unix_seq_show,
3347 };
3348 
3349 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/*
 * Private seq_file state of the BPF "unix" iterator.
 *
 * @p:              seq_net_private header (set up by bpf_iter_init_seq_net()
 *                  in bpf_iter_init_unix()).
 *                  NOTE(review): presumably must stay the first member so
 *                  seq_file_net() can find it - confirm.
 * @cur_sk:         index into @batch of the socket currently being shown.
 * @end_sk:         number of valid (referenced) entries in @batch.
 * @max_sk:         capacity of @batch, in entries.
 * @batch:          array of sockets, each held with sock_hold(), collected
 *                  from one hash bucket.
 * @st_bucket_done: set once a bucket was captured completely; makes the next
 *                  batch start at the following bucket.
 */
3350 struct bpf_unix_iter_state {
3351 	struct seq_net_private p;
3352 	unsigned int cur_sk;
3353 	unsigned int end_sk;
3354 	unsigned int max_sk;
3355 	struct sock **batch;
3356 	bool st_bucket_done;
3357 };
3358 
/*
 * Context handed to a BPF program by the "unix" iterator; filled in by
 * unix_prog_seq_show().
 *
 * @meta:    iterator metadata for the current invocation.
 * @unix_sk: socket being visited; NULL in the call made from the stop
 *           callback.
 * @uid:     uid passed to unix_prog_seq_show(); 0 in the stop call.
 */
3359 struct bpf_iter__unix {
3360 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3361 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3362 	uid_t uid __aligned(8);
3363 };
3364 
3365 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3366 			      struct unix_sock *unix_sk, uid_t uid)
3367 {
3368 	struct bpf_iter__unix ctx;
3369 
3370 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3371 	ctx.meta = meta;
3372 	ctx.unix_sk = unix_sk;
3373 	ctx.uid = uid;
3374 	return bpf_iter_run_prog(prog, &ctx);
3375 }
3376 
3377 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3378 
3379 {
3380 	struct bpf_unix_iter_state *iter = seq->private;
3381 	unsigned int expected = 1;
3382 	struct sock *sk;
3383 
3384 	sock_hold(start_sk);
3385 	iter->batch[iter->end_sk++] = start_sk;
3386 
3387 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3388 		if (iter->end_sk < iter->max_sk) {
3389 			sock_hold(sk);
3390 			iter->batch[iter->end_sk++] = sk;
3391 		}
3392 
3393 		expected++;
3394 	}
3395 
3396 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3397 
3398 	return expected;
3399 }
3400 
3401 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3402 {
3403 	while (iter->cur_sk < iter->end_sk)
3404 		sock_put(iter->batch[iter->cur_sk++]);
3405 }
3406 
3407 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3408 				       unsigned int new_batch_sz)
3409 {
3410 	struct sock **new_batch;
3411 
3412 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3413 			     GFP_USER | __GFP_NOWARN);
3414 	if (!new_batch)
3415 		return -ENOMEM;
3416 
3417 	bpf_iter_unix_put_batch(iter);
3418 	kvfree(iter->batch);
3419 	iter->batch = new_batch;
3420 	iter->max_sk = new_batch_sz;
3421 
3422 	return 0;
3423 }
3424 
3425 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3426 					loff_t *pos)
3427 {
3428 	struct bpf_unix_iter_state *iter = seq->private;
3429 	unsigned int expected;
3430 	bool resized = false;
3431 	struct sock *sk;
3432 
3433 	if (iter->st_bucket_done)
3434 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3435 
3436 again:
3437 	/* Get a new batch */
3438 	iter->cur_sk = 0;
3439 	iter->end_sk = 0;
3440 
3441 	sk = unix_get_first(seq, pos);
3442 	if (!sk)
3443 		return NULL; /* Done */
3444 
3445 	expected = bpf_iter_unix_hold_batch(seq, sk);
3446 
3447 	if (iter->end_sk == expected) {
3448 		iter->st_bucket_done = true;
3449 		return sk;
3450 	}
3451 
3452 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3453 		resized = true;
3454 		goto again;
3455 	}
3456 
3457 	return sk;
3458 }
3459 
3460 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3461 {
3462 	if (!*pos)
3463 		return SEQ_START_TOKEN;
3464 
3465 	/* bpf iter does not support lseek, so it always
3466 	 * continue from where it was stop()-ped.
3467 	 */
3468 	return bpf_iter_unix_batch(seq, pos);
3469 }
3470 
3471 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3472 {
3473 	struct bpf_unix_iter_state *iter = seq->private;
3474 	struct sock *sk;
3475 
3476 	/* Whenever seq_next() is called, the iter->cur_sk is
3477 	 * done with seq_show(), so advance to the next sk in
3478 	 * the batch.
3479 	 */
3480 	if (iter->cur_sk < iter->end_sk)
3481 		sock_put(iter->batch[iter->cur_sk++]);
3482 
3483 	++*pos;
3484 
3485 	if (iter->cur_sk < iter->end_sk)
3486 		sk = iter->batch[iter->cur_sk];
3487 	else
3488 		sk = bpf_iter_unix_batch(seq, pos);
3489 
3490 	return sk;
3491 }
3492 
3493 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3494 {
3495 	struct bpf_iter_meta meta;
3496 	struct bpf_prog *prog;
3497 	struct sock *sk = v;
3498 	uid_t uid;
3499 	bool slow;
3500 	int ret;
3501 
3502 	if (v == SEQ_START_TOKEN)
3503 		return 0;
3504 
3505 	slow = lock_sock_fast(sk);
3506 
3507 	if (unlikely(sk_unhashed(sk))) {
3508 		ret = SEQ_SKIP;
3509 		goto unlock;
3510 	}
3511 
3512 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3513 	meta.seq = seq;
3514 	prog = bpf_iter_get_info(&meta, false);
3515 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3516 unlock:
3517 	unlock_sock_fast(sk, slow);
3518 	return ret;
3519 }
3520 
3521 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3522 {
3523 	struct bpf_unix_iter_state *iter = seq->private;
3524 	struct bpf_iter_meta meta;
3525 	struct bpf_prog *prog;
3526 
3527 	if (!v) {
3528 		meta.seq = seq;
3529 		prog = bpf_iter_get_info(&meta, true);
3530 		if (prog)
3531 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3532 	}
3533 
3534 	if (iter->cur_sk < iter->end_sk)
3535 		bpf_iter_unix_put_batch(iter);
3536 }
3537 
3538 static const struct seq_operations bpf_iter_unix_seq_ops = {
3539 	.start	= bpf_iter_unix_seq_start,
3540 	.next	= bpf_iter_unix_seq_next,
3541 	.stop	= bpf_iter_unix_seq_stop,
3542 	.show	= bpf_iter_unix_seq_show,
3543 };
3544 #endif
3545 #endif
3546 
3547 static const struct net_proto_family unix_family_ops = {
3548 	.family = PF_UNIX,
3549 	.create = unix_create,
3550 	.owner	= THIS_MODULE,
3551 };
3552 
3553 
3554 static int __net_init unix_net_init(struct net *net)
3555 {
3556 	int i;
3557 
3558 	net->unx.sysctl_max_dgram_qlen = 10;
3559 	if (unix_sysctl_register(net))
3560 		goto out;
3561 
3562 #ifdef CONFIG_PROC_FS
3563 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3564 			     sizeof(struct seq_net_private)))
3565 		goto err_sysctl;
3566 #endif
3567 
3568 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3569 					      sizeof(spinlock_t), GFP_KERNEL);
3570 	if (!net->unx.table.locks)
3571 		goto err_proc;
3572 
3573 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3574 						sizeof(struct hlist_head),
3575 						GFP_KERNEL);
3576 	if (!net->unx.table.buckets)
3577 		goto free_locks;
3578 
3579 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3580 		spin_lock_init(&net->unx.table.locks[i]);
3581 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3582 	}
3583 
3584 	return 0;
3585 
3586 free_locks:
3587 	kvfree(net->unx.table.locks);
3588 err_proc:
3589 #ifdef CONFIG_PROC_FS
3590 	remove_proc_entry("unix", net->proc_net);
3591 err_sysctl:
3592 #endif
3593 	unix_sysctl_unregister(net);
3594 out:
3595 	return -ENOMEM;
3596 }
3597 
/*
 * Per-network-namespace teardown for AF_UNIX: frees the hash table arrays
 * allocated by unix_net_init(), then unregisters the sysctl table and
 * removes the "unix" proc entry.
 */
3598 static void __net_exit unix_net_exit(struct net *net)
3599 {
3600 	kvfree(net->unx.table.buckets);
3601 	kvfree(net->unx.table.locks);
3602 	unix_sysctl_unregister(net);
3603 	remove_proc_entry("unix", net->proc_net);
3604 }
3605 
3606 static struct pernet_operations unix_net_ops = {
3607 	.init = unix_net_init,
3608 	.exit = unix_net_exit,
3609 };
3610 
3611 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/*
 * Declares the BPF iterator target "unix" with the same argument list that
 * unix_prog_seq_show() packs into struct bpf_iter__unix.
 * NOTE(review): the exact expansion of DEFINE_BPF_ITER_FUNC is not visible
 * here - confirm against its definition.
 */
3612 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3613 		     struct unix_sock *unix_sk, uid_t uid)
3614 
/* Initial capacity (in sockets) of the batch allocated by bpf_iter_init_unix(). */
3615 #define INIT_BATCH_SZ 16
3616 
3617 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3618 {
3619 	struct bpf_unix_iter_state *iter = priv_data;
3620 	int err;
3621 
3622 	err = bpf_iter_init_seq_net(priv_data, aux);
3623 	if (err)
3624 		return err;
3625 
3626 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3627 	if (err) {
3628 		bpf_iter_fini_seq_net(priv_data);
3629 		return err;
3630 	}
3631 
3632 	return 0;
3633 }
3634 
3635 static void bpf_iter_fini_unix(void *priv_data)
3636 {
3637 	struct bpf_unix_iter_state *iter = priv_data;
3638 
3639 	bpf_iter_fini_seq_net(priv_data);
3640 	kvfree(iter->batch);
3641 }
3642 
3643 static const struct bpf_iter_seq_info unix_seq_info = {
3644 	.seq_ops		= &bpf_iter_unix_seq_ops,
3645 	.init_seq_private	= bpf_iter_init_unix,
3646 	.fini_seq_private	= bpf_iter_fini_unix,
3647 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3648 };
3649 
3650 static const struct bpf_func_proto *
3651 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3652 			     const struct bpf_prog *prog)
3653 {
3654 	switch (func_id) {
3655 	case BPF_FUNC_setsockopt:
3656 		return &bpf_sk_setsockopt_proto;
3657 	case BPF_FUNC_getsockopt:
3658 		return &bpf_sk_getsockopt_proto;
3659 	default:
3660 		return NULL;
3661 	}
3662 }
3663 
3664 static struct bpf_iter_reg unix_reg_info = {
3665 	.target			= "unix",
3666 	.ctx_arg_info_size	= 1,
3667 	.ctx_arg_info		= {
3668 		{ offsetof(struct bpf_iter__unix, unix_sk),
3669 		  PTR_TO_BTF_ID_OR_NULL },
3670 	},
3671 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3672 	.seq_info		= &unix_seq_info,
3673 };
3674 
3675 static void __init bpf_iter_register(void)
3676 {
3677 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3678 	if (bpf_iter_reg_target(&unix_reg_info))
3679 		pr_warn("Warning: could not register bpf iterator unix\n");
3680 }
3681 #endif
3682 
3683 static int __init af_unix_init(void)
3684 {
3685 	int i, rc = -1;
3686 
3687 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3688 
3689 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3690 		spin_lock_init(&bsd_socket_locks[i]);
3691 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3692 	}
3693 
3694 	rc = proto_register(&unix_dgram_proto, 1);
3695 	if (rc != 0) {
3696 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3697 		goto out;
3698 	}
3699 
3700 	rc = proto_register(&unix_stream_proto, 1);
3701 	if (rc != 0) {
3702 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3703 		proto_unregister(&unix_dgram_proto);
3704 		goto out;
3705 	}
3706 
3707 	sock_register(&unix_family_ops);
3708 	register_pernet_subsys(&unix_net_ops);
3709 	unix_bpf_build_proto();
3710 
3711 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3712 	bpf_iter_register();
3713 #endif
3714 
3715 out:
3716 	return rc;
3717 }
3718 
/*
 * Module exit: unregisters the PF_UNIX socket family, both protos and the
 * per-netns operations that af_unix_init() registered.
 */
3719 static void __exit af_unix_exit(void)
3720 {
3721 	sock_unregister(PF_UNIX);
3722 	proto_unregister(&unix_dgram_proto);
3723 	proto_unregister(&unix_stream_proto);
3724 	unregister_pernet_subsys(&unix_net_ops);
3725 }
3726 
3727 /* Earlier than device_initcall() so that other drivers invoking
3728    request_module() don't end up in a loop when modprobe tries
3729    to use a UNIX socket. But later than subsys_initcall() because
3730    we depend on stuff initialised there */
3731 fs_initcall(af_unix_init);
3732 module_exit(af_unix_exit);
3733 
3734 MODULE_LICENSE("GPL");
3735 MODULE_ALIAS_NETPROTO(PF_UNIX);
3736