xref: /openbmc/linux/net/unix/af_unix.c (revision 500e1340)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetosv	:	Repaired (I hope) bugs introduces
29  *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					is been reached. This won't break
33  *					old apps and it will avoid huge amount
34  *					of socks hashed (this for unix_gc()
35  *					performances reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 
120 #include "scm.h"
121 
/* Count of unix sockets; bumped in unix_create1() and dropped again in
 * unix_sock_destructor() (or on unix_create1() failure).
 */
static atomic_long_t unix_nr_socks;
/* Hash buckets for sockets bound to a filesystem object; lookups pick the
 * bucket with unix_bsd_hash() of the inode.
 */
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
/* One spinlock per entry of bsd_socket_buckets, indexed the same way. */
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
125 
126 /* SMP locking strategy:
127  *    hash table is protected with spinlock.
128  *    each socket state is protected by separate spinlock.
129  */
130 
131 static unsigned int unix_unbound_hash(struct sock *sk)
132 {
133 	unsigned long hash = (unsigned long)sk;
134 
135 	hash ^= hash >> 16;
136 	hash ^= hash >> 8;
137 	hash ^= sk->sk_type;
138 
139 	return hash & UNIX_HASH_MOD;
140 }
141 
142 static unsigned int unix_bsd_hash(struct inode *i)
143 {
144 	return i->i_ino & UNIX_HASH_MOD;
145 }
146 
147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
148 				       int addr_len, int type)
149 {
150 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
151 	unsigned int hash;
152 
153 	hash = (__force unsigned int)csum_fold(csum);
154 	hash ^= hash >> 8;
155 	hash ^= type;
156 
157 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
158 }
159 
160 static void unix_table_double_lock(struct net *net,
161 				   unsigned int hash1, unsigned int hash2)
162 {
163 	if (hash1 == hash2) {
164 		spin_lock(&net->unx.table.locks[hash1]);
165 		return;
166 	}
167 
168 	if (hash1 > hash2)
169 		swap(hash1, hash2);
170 
171 	spin_lock(&net->unx.table.locks[hash1]);
172 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
173 }
174 
175 static void unix_table_double_unlock(struct net *net,
176 				     unsigned int hash1, unsigned int hash2)
177 {
178 	if (hash1 == hash2) {
179 		spin_unlock(&net->unx.table.locks[hash1]);
180 		return;
181 	}
182 
183 	spin_unlock(&net->unx.table.locks[hash1]);
184 	spin_unlock(&net->unx.table.locks[hash2]);
185 }
186 
187 #ifdef CONFIG_SECURITY_NETWORK
188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
189 {
190 	UNIXCB(skb).secid = scm->secid;
191 }
192 
193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
194 {
195 	scm->secid = UNIXCB(skb).secid;
196 }
197 
198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
199 {
200 	return (scm->secid == UNIXCB(skb).secid);
201 }
202 #else
203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
204 { }
205 
206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
207 { }
208 
209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
210 {
211 	return true;
212 }
213 #endif /* CONFIG_SECURITY_NETWORK */
214 
/* Peer pointer of @sk.  Expands to an lvalue, so it is used both to read
 * and to assign the peer.
 */
#define unix_peer(sk) (unix_sk(sk)->peer)
216 
/* Non-zero when @osk has @sk recorded as its peer. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return sk == unix_peer(osk);
}
221 
/* Non-zero when @sk may send to @osk: either @osk has no peer at all, or
 * its peer is @sk.
 */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return !unix_peer(osk) || unix_our_peer(sk, osk);
}
226 
227 static inline int unix_recvq_full(const struct sock *sk)
228 {
229 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
230 }
231 
232 static inline int unix_recvq_full_lockless(const struct sock *sk)
233 {
234 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
235 		READ_ONCE(sk->sk_max_ack_backlog);
236 }
237 
238 struct sock *unix_peer_get(struct sock *s)
239 {
240 	struct sock *peer;
241 
242 	unix_state_lock(s);
243 	peer = unix_peer(s);
244 	if (peer)
245 		sock_hold(peer);
246 	unix_state_unlock(s);
247 	return peer;
248 }
249 EXPORT_SYMBOL_GPL(unix_peer_get);
250 
251 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
252 					     int addr_len)
253 {
254 	struct unix_address *addr;
255 
256 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
257 	if (!addr)
258 		return NULL;
259 
260 	refcount_set(&addr->refcnt, 1);
261 	addr->len = addr_len;
262 	memcpy(addr->name, sunaddr, addr_len);
263 
264 	return addr;
265 }
266 
267 static inline void unix_release_addr(struct unix_address *addr)
268 {
269 	if (refcount_dec_and_test(&addr->refcnt))
270 		kfree(addr);
271 }
272 
273 /*
274  *	Check unix socket name:
275  *		- should be not zero length.
276  *	        - if started by not zero, should be NULL terminated (FS object)
277  *		- if started by zero, it is abstract name.
278  */
279 
280 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
281 {
282 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
283 	    addr_len > sizeof(*sunaddr))
284 		return -EINVAL;
285 
286 	if (sunaddr->sun_family != AF_UNIX)
287 		return -EINVAL;
288 
289 	return 0;
290 }
291 
/* NUL-terminate a filesystem socket name in place by writing a zero byte
 * at offset @addr_len from the start of @sunaddr.
 */
static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	char *raw = (char *)sunaddr;

	/* This is not the off-by-one it appears to be.  108 is the longest
	 * valid AF_UNIX path for a binding, so sun_path[108] is not part of
	 * the structure.  In kernel space the location is nevertheless valid
	 * memory, because syscall functions always pass a pointer to a
	 * struct sockaddr_storage, whose buffer is bigger than 108 bytes.
	 */
	raw[addr_len] = 0;
}
304 
/* Unlink @sk from its hash chain.  Takes no lock itself; the locked wrapper
 * is unix_remove_socket().
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
309 
310 static void __unix_insert_socket(struct net *net, struct sock *sk)
311 {
312 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
313 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
314 }
315 
316 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
317 				 struct unix_address *addr, unsigned int hash)
318 {
319 	__unix_remove_socket(sk);
320 	smp_store_release(&unix_sk(sk)->addr, addr);
321 
322 	sk->sk_hash = hash;
323 	__unix_insert_socket(net, sk);
324 }
325 
326 static void unix_remove_socket(struct net *net, struct sock *sk)
327 {
328 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
329 	__unix_remove_socket(sk);
330 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
331 }
332 
333 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
334 {
335 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
336 	__unix_insert_socket(net, sk);
337 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
338 }
339 
340 static void unix_insert_bsd_socket(struct sock *sk)
341 {
342 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
343 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
344 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
345 }
346 
347 static void unix_remove_bsd_socket(struct sock *sk)
348 {
349 	if (!hlist_unhashed(&sk->sk_bind_node)) {
350 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
351 		__sk_del_bind_node(sk);
352 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
353 
354 		sk_node_init(&sk->sk_bind_node);
355 	}
356 }
357 
358 static struct sock *__unix_find_socket_byname(struct net *net,
359 					      struct sockaddr_un *sunname,
360 					      int len, unsigned int hash)
361 {
362 	struct sock *s;
363 
364 	sk_for_each(s, &net->unx.table.buckets[hash]) {
365 		struct unix_sock *u = unix_sk(s);
366 
367 		if (u->addr->len == len &&
368 		    !memcmp(u->addr->name, sunname, len))
369 			return s;
370 	}
371 	return NULL;
372 }
373 
374 static inline struct sock *unix_find_socket_byname(struct net *net,
375 						   struct sockaddr_un *sunname,
376 						   int len, unsigned int hash)
377 {
378 	struct sock *s;
379 
380 	spin_lock(&net->unx.table.locks[hash]);
381 	s = __unix_find_socket_byname(net, sunname, len, hash);
382 	if (s)
383 		sock_hold(s);
384 	spin_unlock(&net->unx.table.locks[hash]);
385 	return s;
386 }
387 
388 static struct sock *unix_find_socket_byinode(struct inode *i)
389 {
390 	unsigned int hash = unix_bsd_hash(i);
391 	struct sock *s;
392 
393 	spin_lock(&bsd_socket_locks[hash]);
394 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
395 		struct dentry *dentry = unix_sk(s)->path.dentry;
396 
397 		if (dentry && d_backing_inode(dentry) == i) {
398 			sock_hold(s);
399 			spin_unlock(&bsd_socket_locks[hash]);
400 			return s;
401 		}
402 	}
403 	spin_unlock(&bsd_socket_locks[hash]);
404 	return NULL;
405 }
406 
407 /* Support code for asymmetrically connected dgram sockets
408  *
409  * If a datagram socket is connected to a socket not itself connected
410  * to the first socket (eg, /dev/log), clients may only enqueue more
411  * messages if the present receive queue of the server socket is not
412  * "too large". This means there's a second writeability condition
413  * poll and sendmsg need to test. The dgram recv code will do a wake
414  * up on the peer_wait wait queue of a socket upon reception of a
415  * datagram which needs to be propagated to sleeping would-be writers
416  * since these might not have sent anything so far. This can't be
417  * accomplished via poll_wait because the lifetime of the server
418  * socket might be less than that of its clients if these break their
419  * association with it or if the server socket is closed while clients
420  * are still connected to it and there's no way to inform "a polling
421  * implementation" that it should let go of a certain wait queue
422  *
423  * In order to propagate a wake up, a wait_queue_entry_t of the client
424  * socket is enqueued on the peer_wait queue of the server socket
425  * whose wake function does a wake_up on the ordinary client socket
426  * wait queue. This connection is established whenever a write (or
427  * poll for write) hit the flow control condition and broken when the
428  * association to the server socket is dissolved or after a wake up
429  * was relayed.
430  */
431 
432 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
433 				      void *key)
434 {
435 	struct unix_sock *u;
436 	wait_queue_head_t *u_sleep;
437 
438 	u = container_of(q, struct unix_sock, peer_wake);
439 
440 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
441 			    q);
442 	u->peer_wake.private = NULL;
443 
444 	/* relaying can only happen while the wq still exists */
445 	u_sleep = sk_sleep(&u->sk);
446 	if (u_sleep)
447 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
448 
449 	return 0;
450 }
451 
452 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
453 {
454 	struct unix_sock *u, *u_other;
455 	int rc;
456 
457 	u = unix_sk(sk);
458 	u_other = unix_sk(other);
459 	rc = 0;
460 	spin_lock(&u_other->peer_wait.lock);
461 
462 	if (!u->peer_wake.private) {
463 		u->peer_wake.private = other;
464 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
465 
466 		rc = 1;
467 	}
468 
469 	spin_unlock(&u_other->peer_wait.lock);
470 	return rc;
471 }
472 
473 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
474 					    struct sock *other)
475 {
476 	struct unix_sock *u, *u_other;
477 
478 	u = unix_sk(sk);
479 	u_other = unix_sk(other);
480 	spin_lock(&u_other->peer_wait.lock);
481 
482 	if (u->peer_wake.private == other) {
483 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
484 		u->peer_wake.private = NULL;
485 	}
486 
487 	spin_unlock(&u_other->peer_wait.lock);
488 }
489 
490 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
491 						   struct sock *other)
492 {
493 	unix_dgram_peer_wake_disconnect(sk, other);
494 	wake_up_interruptible_poll(sk_sleep(sk),
495 				   EPOLLOUT |
496 				   EPOLLWRNORM |
497 				   EPOLLWRBAND);
498 }
499 
500 /* preconditions:
501  *	- unix_peer(sk) == other
502  *	- association is stable
503  */
504 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
505 {
506 	int connected;
507 
508 	connected = unix_dgram_peer_wake_connect(sk, other);
509 
510 	/* If other is SOCK_DEAD, we want to make sure we signal
511 	 * POLLOUT, such that a subsequent write() can get a
512 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
513 	 * to other and its full, we will hang waiting for POLLOUT.
514 	 */
515 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
516 		return 1;
517 
518 	if (connected)
519 		unix_dgram_peer_wake_disconnect(sk, other);
520 
521 	return 0;
522 }
523 
524 static int unix_writable(const struct sock *sk)
525 {
526 	return sk->sk_state != TCP_LISTEN &&
527 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
528 }
529 
530 static void unix_write_space(struct sock *sk)
531 {
532 	struct socket_wq *wq;
533 
534 	rcu_read_lock();
535 	if (unix_writable(sk)) {
536 		wq = rcu_dereference(sk->sk_wq);
537 		if (skwq_has_sleeper(wq))
538 			wake_up_interruptible_sync_poll(&wq->wait,
539 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
540 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
541 	}
542 	rcu_read_unlock();
543 }
544 
545 /* When dgram socket disconnects (or changes its peer), we clear its receive
546  * queue of packets arrived from previous peer. First, it allows to do
547  * flow control based only on wmem_alloc; second, sk connected to peer
548  * may receive messages only from that peer. */
549 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
550 {
551 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
552 		skb_queue_purge(&sk->sk_receive_queue);
553 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
554 
555 		/* If one link of bidirectional dgram pipe is disconnected,
556 		 * we signal error. Messages are lost. Do not make this,
557 		 * when peer was not connected to us.
558 		 */
559 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
560 			WRITE_ONCE(other->sk_err, ECONNRESET);
561 			sk_error_report(other);
562 		}
563 	}
564 	other->sk_state = TCP_CLOSE;
565 }
566 
567 static void unix_sock_destructor(struct sock *sk)
568 {
569 	struct unix_sock *u = unix_sk(sk);
570 
571 	skb_queue_purge(&sk->sk_receive_queue);
572 
573 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
574 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
575 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
576 	if (!sock_flag(sk, SOCK_DEAD)) {
577 		pr_info("Attempt to release alive unix socket: %p\n", sk);
578 		return;
579 	}
580 
581 	if (u->addr)
582 		unix_release_addr(u->addr);
583 
584 	atomic_long_dec(&unix_nr_socks);
585 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
586 #ifdef UNIX_REFCNT_DEBUG
587 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
588 		atomic_long_read(&unix_nr_socks));
589 #endif
590 }
591 
592 static void unix_release_sock(struct sock *sk, int embrion)
593 {
594 	struct unix_sock *u = unix_sk(sk);
595 	struct sock *skpair;
596 	struct sk_buff *skb;
597 	struct path path;
598 	int state;
599 
600 	unix_remove_socket(sock_net(sk), sk);
601 	unix_remove_bsd_socket(sk);
602 
603 	/* Clear state */
604 	unix_state_lock(sk);
605 	sock_orphan(sk);
606 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
607 	path	     = u->path;
608 	u->path.dentry = NULL;
609 	u->path.mnt = NULL;
610 	state = sk->sk_state;
611 	sk->sk_state = TCP_CLOSE;
612 
613 	skpair = unix_peer(sk);
614 	unix_peer(sk) = NULL;
615 
616 	unix_state_unlock(sk);
617 
618 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
619 	if (u->oob_skb) {
620 		kfree_skb(u->oob_skb);
621 		u->oob_skb = NULL;
622 	}
623 #endif
624 
625 	wake_up_interruptible_all(&u->peer_wait);
626 
627 	if (skpair != NULL) {
628 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
629 			unix_state_lock(skpair);
630 			/* No more writes */
631 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
632 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
633 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
634 			unix_state_unlock(skpair);
635 			skpair->sk_state_change(skpair);
636 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
637 		}
638 
639 		unix_dgram_peer_wake_disconnect(sk, skpair);
640 		sock_put(skpair); /* It may now die */
641 	}
642 
643 	/* Try to flush out this socket. Throw out buffers at least */
644 
645 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
646 		if (state == TCP_LISTEN)
647 			unix_release_sock(skb->sk, 1);
648 		/* passed fds are erased in the kfree_skb hook	      */
649 		UNIXCB(skb).consumed = skb->len;
650 		kfree_skb(skb);
651 	}
652 
653 	if (path.dentry)
654 		path_put(&path);
655 
656 	sock_put(sk);
657 
658 	/* ---- Socket is dead now and most probably destroyed ---- */
659 
660 	/*
661 	 * Fixme: BSD difference: In BSD all sockets connected to us get
662 	 *	  ECONNRESET and we die on the spot. In Linux we behave
663 	 *	  like files and pipes do and wait for the last
664 	 *	  dereference.
665 	 *
666 	 * Can't we simply set sock->err?
667 	 *
668 	 *	  What the above comment does talk about? --ANK(980817)
669 	 */
670 
671 	if (unix_tot_inflight)
672 		unix_gc();		/* Garbage collect fds */
673 }
674 
675 static void init_peercred(struct sock *sk)
676 {
677 	const struct cred *old_cred;
678 	struct pid *old_pid;
679 
680 	spin_lock(&sk->sk_peer_lock);
681 	old_pid = sk->sk_peer_pid;
682 	old_cred = sk->sk_peer_cred;
683 	sk->sk_peer_pid  = get_pid(task_tgid(current));
684 	sk->sk_peer_cred = get_current_cred();
685 	spin_unlock(&sk->sk_peer_lock);
686 
687 	put_pid(old_pid);
688 	put_cred(old_cred);
689 }
690 
691 static void copy_peercred(struct sock *sk, struct sock *peersk)
692 {
693 	const struct cred *old_cred;
694 	struct pid *old_pid;
695 
696 	if (sk < peersk) {
697 		spin_lock(&sk->sk_peer_lock);
698 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
699 	} else {
700 		spin_lock(&peersk->sk_peer_lock);
701 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
702 	}
703 	old_pid = sk->sk_peer_pid;
704 	old_cred = sk->sk_peer_cred;
705 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
706 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
707 
708 	spin_unlock(&sk->sk_peer_lock);
709 	spin_unlock(&peersk->sk_peer_lock);
710 
711 	put_pid(old_pid);
712 	put_cred(old_cred);
713 }
714 
715 static int unix_listen(struct socket *sock, int backlog)
716 {
717 	int err;
718 	struct sock *sk = sock->sk;
719 	struct unix_sock *u = unix_sk(sk);
720 
721 	err = -EOPNOTSUPP;
722 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
723 		goto out;	/* Only stream/seqpacket sockets accept */
724 	err = -EINVAL;
725 	if (!u->addr)
726 		goto out;	/* No listens on an unbound socket */
727 	unix_state_lock(sk);
728 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
729 		goto out_unlock;
730 	if (backlog > sk->sk_max_ack_backlog)
731 		wake_up_interruptible_all(&u->peer_wait);
732 	sk->sk_max_ack_backlog	= backlog;
733 	sk->sk_state		= TCP_LISTEN;
734 	/* set credentials so connect can copy them */
735 	init_peercred(sk);
736 	err = 0;
737 
738 out_unlock:
739 	unix_state_unlock(sk);
740 out:
741 	return err;
742 }
743 
/* Forward declarations of the handlers that the proto_ops tables further
 * down (unix_stream_ops, unix_dgram_ops, unix_seqpacket_ops) refer to.
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
				    size_t size, int flags);
static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);
775 
776 static int unix_set_peek_off(struct sock *sk, int val)
777 {
778 	struct unix_sock *u = unix_sk(sk);
779 
780 	if (mutex_lock_interruptible(&u->iolock))
781 		return -EINTR;
782 
783 	sk->sk_peek_off = val;
784 	mutex_unlock(&u->iolock);
785 
786 	return 0;
787 }
788 
789 #ifdef CONFIG_PROC_FS
790 static int unix_count_nr_fds(struct sock *sk)
791 {
792 	struct sk_buff *skb;
793 	struct unix_sock *u;
794 	int nr_fds = 0;
795 
796 	spin_lock(&sk->sk_receive_queue.lock);
797 	skb = skb_peek(&sk->sk_receive_queue);
798 	while (skb) {
799 		u = unix_sk(skb->sk);
800 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
801 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
802 	}
803 	spin_unlock(&sk->sk_receive_queue.lock);
804 
805 	return nr_fds;
806 }
807 
808 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
809 {
810 	struct sock *sk = sock->sk;
811 	unsigned char s_state;
812 	struct unix_sock *u;
813 	int nr_fds = 0;
814 
815 	if (sk) {
816 		s_state = READ_ONCE(sk->sk_state);
817 		u = unix_sk(sk);
818 
819 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
820 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
821 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
822 		 */
823 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
824 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
825 		else if (s_state == TCP_LISTEN)
826 			nr_fds = unix_count_nr_fds(sk);
827 
828 		seq_printf(m, "scm_fds: %u\n", nr_fds);
829 	}
830 }
831 #else
832 #define unix_show_fdinfo NULL
833 #endif
834 
/* Socket operations installed by unix_create() for SOCK_STREAM sockets. */
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	/* The only one of the three tables with its own sendpage and a
	 * splice_read handler.
	 */
	.sendpage =	unix_stream_sendpage,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
860 
/* Socket operations installed by unix_create() for SOCK_DGRAM sockets
 * (SOCK_RAW requests are converted to SOCK_DGRAM there and land here too).
 */
static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	/* accept and listen use the generic sock_no_* stubs here */
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
885 
/* Socket operations installed by unix_create() for SOCK_SEQPACKET sockets.
 * Connection setup (connect/accept/listen) is shared with the stream table,
 * polling is shared with the dgram table, and no read_skb handler is set.
 */
static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
909 
/* Deliberately empty ->close() hook.  Unix sockets have no use for it; it
 * exists merely for sockmap.  unix_release() invokes it through
 * sk->sk_prot->close before releasing the socket.
 */
static void unix_close(struct sock *sk, long timeout)
{
}
916 
/* Deliberately empty ->unhash() hook.  Unix sockets have no use for it; it
 * exists merely for sockmap.
 */
static void unix_unhash(struct sock *sk)
{
}
923 
924 static bool unix_bpf_bypass_getsockopt(int level, int optname)
925 {
926 	if (level == SOL_SOCKET) {
927 		switch (optname) {
928 		case SO_PEERPIDFD:
929 			return true;
930 		default:
931 			return false;
932 		}
933 	}
934 
935 	return false;
936 }
937 
/* Protocol descriptor that unix_create1() passes to sk_alloc() for every
 * socket type other than SOCK_STREAM.
 */
struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};
948 
/* Protocol descriptor that unix_create1() passes to sk_alloc() for
 * SOCK_STREAM sockets.  Unlike unix_dgram_proto it also sets ->unhash.
 */
struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.unhash			= unix_unhash,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};
960 
961 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
962 {
963 	struct unix_sock *u;
964 	struct sock *sk;
965 	int err;
966 
967 	atomic_long_inc(&unix_nr_socks);
968 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
969 		err = -ENFILE;
970 		goto err;
971 	}
972 
973 	if (type == SOCK_STREAM)
974 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
975 	else /*dgram and  seqpacket */
976 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
977 
978 	if (!sk) {
979 		err = -ENOMEM;
980 		goto err;
981 	}
982 
983 	sock_init_data(sock, sk);
984 
985 	sk->sk_hash		= unix_unbound_hash(sk);
986 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
987 	sk->sk_write_space	= unix_write_space;
988 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
989 	sk->sk_destruct		= unix_sock_destructor;
990 	u	  = unix_sk(sk);
991 	u->path.dentry = NULL;
992 	u->path.mnt = NULL;
993 	spin_lock_init(&u->lock);
994 	atomic_long_set(&u->inflight, 0);
995 	INIT_LIST_HEAD(&u->link);
996 	mutex_init(&u->iolock); /* single task reading lock */
997 	mutex_init(&u->bindlock); /* single task binding lock */
998 	init_waitqueue_head(&u->peer_wait);
999 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1000 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1001 	unix_insert_unbound_socket(net, sk);
1002 
1003 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1004 
1005 	return sk;
1006 
1007 err:
1008 	atomic_long_dec(&unix_nr_socks);
1009 	return ERR_PTR(err);
1010 }
1011 
1012 static int unix_create(struct net *net, struct socket *sock, int protocol,
1013 		       int kern)
1014 {
1015 	struct sock *sk;
1016 
1017 	if (protocol && protocol != PF_UNIX)
1018 		return -EPROTONOSUPPORT;
1019 
1020 	sock->state = SS_UNCONNECTED;
1021 
1022 	switch (sock->type) {
1023 	case SOCK_STREAM:
1024 		sock->ops = &unix_stream_ops;
1025 		break;
1026 		/*
1027 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1028 		 *	nothing uses it.
1029 		 */
1030 	case SOCK_RAW:
1031 		sock->type = SOCK_DGRAM;
1032 		fallthrough;
1033 	case SOCK_DGRAM:
1034 		sock->ops = &unix_dgram_ops;
1035 		break;
1036 	case SOCK_SEQPACKET:
1037 		sock->ops = &unix_seqpacket_ops;
1038 		break;
1039 	default:
1040 		return -ESOCKTNOSUPPORT;
1041 	}
1042 
1043 	sk = unix_create1(net, sock, kern, sock->type);
1044 	if (IS_ERR(sk))
1045 		return PTR_ERR(sk);
1046 
1047 	return 0;
1048 }
1049 
1050 static int unix_release(struct socket *sock)
1051 {
1052 	struct sock *sk = sock->sk;
1053 
1054 	if (!sk)
1055 		return 0;
1056 
1057 	sk->sk_prot->close(sk, 0);
1058 	unix_release_sock(sk, 0);
1059 	sock->sk = NULL;
1060 
1061 	return 0;
1062 }
1063 
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * The path is resolved (following symlinks) and must be writable by the
 * caller, refer to a socket inode, and have a socket of type @type bound to
 * it.  On a match the path's atime is touched and the sock returned by
 * unix_find_socket_byinode() is handed to the caller.
 *
 * Errors come back as ERR_PTR(): the kern_path()/path_permission() error,
 * -ECONNREFUSED when the inode is not a socket or nothing is bound to it,
 * -EPROTOTYPE on a socket type mismatch.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);

	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		return ERR_PTR(err);

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto out_path;

	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode)) {
		err = -ECONNREFUSED;
		goto out_path;
	}

	sk = unix_find_socket_byinode(inode);
	if (!sk) {
		err = -ECONNREFUSED;
		goto out_path;
	}

	if (sk->sk_type != type) {
		/* Wrong socket type: give the sock back before bailing out. */
		sock_put(sk);
		err = -EPROTOTYPE;
		goto out_path;
	}

	touch_atime(&path);
	path_put(&path);

	return sk;

out_path:
	path_put(&path);
	return ERR_PTR(err);
}
1107 
/*
 * Look up a socket by name through unix_find_socket_byname(), using the
 * hash computed by unix_abstract_hash() from the address and @type.
 *
 * Returns the sock found, or ERR_PTR(-ECONNREFUSED) when there is none.
 * If the sock found has a path dentry, that path's atime is touched.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct unix_sock *u;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	u = unix_sk(sk);
	if (u->path.dentry)
		touch_atime(&u->path);

	return sk;
}
1126 
1127 static struct sock *unix_find_other(struct net *net,
1128 				    struct sockaddr_un *sunaddr,
1129 				    int addr_len, int type)
1130 {
1131 	struct sock *sk;
1132 
1133 	if (sunaddr->sun_path[0])
1134 		sk = unix_find_bsd(sunaddr, addr_len, type);
1135 	else
1136 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1137 
1138 	return sk;
1139 }
1140 
1141 static int unix_autobind(struct sock *sk)
1142 {
1143 	unsigned int new_hash, old_hash = sk->sk_hash;
1144 	struct unix_sock *u = unix_sk(sk);
1145 	struct net *net = sock_net(sk);
1146 	struct unix_address *addr;
1147 	u32 lastnum, ordernum;
1148 	int err;
1149 
1150 	err = mutex_lock_interruptible(&u->bindlock);
1151 	if (err)
1152 		return err;
1153 
1154 	if (u->addr)
1155 		goto out;
1156 
1157 	err = -ENOMEM;
1158 	addr = kzalloc(sizeof(*addr) +
1159 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1160 	if (!addr)
1161 		goto out;
1162 
1163 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1164 	addr->name->sun_family = AF_UNIX;
1165 	refcount_set(&addr->refcnt, 1);
1166 
1167 	ordernum = get_random_u32();
1168 	lastnum = ordernum & 0xFFFFF;
1169 retry:
1170 	ordernum = (ordernum + 1) & 0xFFFFF;
1171 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1172 
1173 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1174 	unix_table_double_lock(net, old_hash, new_hash);
1175 
1176 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1177 		unix_table_double_unlock(net, old_hash, new_hash);
1178 
1179 		/* __unix_find_socket_byname() may take long time if many names
1180 		 * are already in use.
1181 		 */
1182 		cond_resched();
1183 
1184 		if (ordernum == lastnum) {
1185 			/* Give up if all names seems to be in use. */
1186 			err = -ENOSPC;
1187 			unix_release_addr(addr);
1188 			goto out;
1189 		}
1190 
1191 		goto retry;
1192 	}
1193 
1194 	__unix_set_addr_hash(net, sk, addr, new_hash);
1195 	unix_table_double_unlock(net, old_hash, new_hash);
1196 	err = 0;
1197 
1198 out:	mutex_unlock(&u->bindlock);
1199 	return err;
1200 }
1201 
/*
 * Bind @sk to a filesystem (pathname) address.
 *
 * The socket inode is created with vfs_mknod() below the parent directory
 * of the path; then, with u->bindlock and the hash table locks held, the
 * path and the address are recorded on the socket.
 *
 * Returns 0 on success, -ENOMEM if the address cannot be allocated,
 * -EINVAL if the socket already has an address, -EADDRINUSE in place of
 * -EEXIST, or otherwise the error from kern_path_create(),
 * security_path_mknod(), vfs_mknod() or mutex_lock_interruptible().
 */
1202 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1203 			 int addr_len)
1204 {
	/* S_IFSOCK plus the socket inode's mode bits with the umask cleared. */
1205 	umode_t mode = S_IFSOCK |
1206 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1207 	unsigned int new_hash, old_hash = sk->sk_hash;
1208 	struct unix_sock *u = unix_sk(sk);
1209 	struct net *net = sock_net(sk);
1210 	struct mnt_idmap *idmap;
1211 	struct unix_address *addr;
1212 	struct dentry *dentry;
1213 	struct path parent;
1214 	int err;
1215 
	/* Recompute the length from the path string, counting one extra byte. */
1216 	unix_mkname_bsd(sunaddr, addr_len);
1217 	addr_len = strlen(sunaddr->sun_path) +
1218 		offsetof(struct sockaddr_un, sun_path) + 1;
1219 
1220 	addr = unix_create_addr(sunaddr, addr_len);
1221 	if (!addr)
1222 		return -ENOMEM;
1223 
1224 	/*
1225 	 * Get the parent directory, calculate the hash for last
1226 	 * component.
1227 	 */
1228 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1229 	if (IS_ERR(dentry)) {
1230 		err = PTR_ERR(dentry);
1231 		goto out;
1232 	}
1233 
1234 	/*
1235 	 * All right, let's create it.
1236 	 */
1237 	idmap = mnt_idmap(parent.mnt);
1238 	err = security_path_mknod(&parent, dentry, mode, 0);
1239 	if (!err)
1240 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1241 	if (err)
1242 		goto out_path;
1243 	err = mutex_lock_interruptible(&u->bindlock);
1244 	if (err)
1245 		goto out_unlink;
	/* Already bound: out_unlock turns this into -EINVAL. */
1246 	if (u->addr)
1247 		goto out_unlock;
1248 
1249 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1250 	unix_table_double_lock(net, old_hash, new_hash);
	/* The socket keeps its own references on the mount and the dentry. */
1251 	u->path.mnt = mntget(parent.mnt);
1252 	u->path.dentry = dget(dentry);
1253 	__unix_set_addr_hash(net, sk, addr, new_hash);
1254 	unix_table_double_unlock(net, old_hash, new_hash);
1255 	unix_insert_bsd_socket(sk);
1256 	mutex_unlock(&u->bindlock);
1257 	done_path_create(&parent, dentry);
1258 	return 0;
1259 
1260 out_unlock:
1261 	mutex_unlock(&u->bindlock);
1262 	err = -EINVAL;
1263 out_unlink:
1264 	/* failed after successful mknod?  unlink what we'd created... */
1265 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1266 out_path:
1267 	done_path_create(&parent, dentry);
1268 out:
	/* Every failure path releases the address allocated above. */
1269 	unix_release_addr(addr);
1270 	return err == -EEXIST ? -EADDRINUSE : err;
1271 }
1272 
1273 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1274 			      int addr_len)
1275 {
1276 	unsigned int new_hash, old_hash = sk->sk_hash;
1277 	struct unix_sock *u = unix_sk(sk);
1278 	struct net *net = sock_net(sk);
1279 	struct unix_address *addr;
1280 	int err;
1281 
1282 	addr = unix_create_addr(sunaddr, addr_len);
1283 	if (!addr)
1284 		return -ENOMEM;
1285 
1286 	err = mutex_lock_interruptible(&u->bindlock);
1287 	if (err)
1288 		goto out;
1289 
1290 	if (u->addr) {
1291 		err = -EINVAL;
1292 		goto out_mutex;
1293 	}
1294 
1295 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1296 	unix_table_double_lock(net, old_hash, new_hash);
1297 
1298 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1299 		goto out_spin;
1300 
1301 	__unix_set_addr_hash(net, sk, addr, new_hash);
1302 	unix_table_double_unlock(net, old_hash, new_hash);
1303 	mutex_unlock(&u->bindlock);
1304 	return 0;
1305 
1306 out_spin:
1307 	unix_table_double_unlock(net, old_hash, new_hash);
1308 	err = -EADDRINUSE;
1309 out_mutex:
1310 	mutex_unlock(&u->bindlock);
1311 out:
1312 	unix_release_addr(addr);
1313 	return err;
1314 }
1315 
1316 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1317 {
1318 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1319 	struct sock *sk = sock->sk;
1320 	int err;
1321 
1322 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1323 	    sunaddr->sun_family == AF_UNIX)
1324 		return unix_autobind(sk);
1325 
1326 	err = unix_validate_addr(sunaddr, addr_len);
1327 	if (err)
1328 		return err;
1329 
1330 	if (sunaddr->sun_path[0])
1331 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1332 	else
1333 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1334 
1335 	return err;
1336 }
1337 
/*
 * Take the state lock of @sk1 and, when @sk2 is a distinct non-NULL sock,
 * of @sk2 as well.  With two socks the lower address is locked first and
 * the other one with the _nested variant.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first, *second;

	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 < sk2) {
		first = sk1;
		second = sk2;
	} else {
		first = sk2;
		second = sk1;
	}

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
1352 
/*
 * Counterpart of unix_state_double_lock(): release the state lock of @sk1
 * and, when @sk2 is a distinct non-NULL sock, the one of @sk2 too.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	/* Only one lock was taken in these cases. */
	if (unlikely(sk1 == sk2) || !sk2)
		return;

	unix_state_unlock(sk2);
}
1362 
/*
 * Connect a datagram socket to the peer named by @addr, or break an
 * existing association when @addr has family AF_UNSPEC.
 *
 * Returns 0 on success.  Failures: -EINVAL if @alen cannot hold sa_family,
 * -EPERM if unix_may_send() rejects the peer, otherwise the error from
 * unix_validate_addr(), unix_autobind(), unix_find_other() or
 * security_unix_may_send().
 */
1363 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1364 			      int alen, int flags)
1365 {
1366 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1367 	struct sock *sk = sock->sk;
1368 	struct sock *other;
1369 	int err;
1370 
1371 	err = -EINVAL;
1372 	if (alen < offsetofend(struct sockaddr, sa_family))
1373 		goto out;
1374 
1375 	if (addr->sa_family != AF_UNSPEC) {
1376 		err = unix_validate_addr(sunaddr, alen);
1377 		if (err)
1378 			goto out;
1379 
		/* SOCK_PASSCRED/SOCK_PASSPIDFD set on an unbound socket: autobind first. */
1380 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1381 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1382 		    !unix_sk(sk)->addr) {
1383 			err = unix_autobind(sk);
1384 			if (err)
1385 				goto out;
1386 		}
1387 
1388 restart:
1389 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1390 		if (IS_ERR(other)) {
1391 			err = PTR_ERR(other);
1392 			goto out;
1393 		}
1394 
1395 		unix_state_double_lock(sk, other);
1396 
1397 		/* Apparently VFS overslept socket death. Retry. */
1398 		if (sock_flag(other, SOCK_DEAD)) {
1399 			unix_state_double_unlock(sk, other);
1400 			sock_put(other);
1401 			goto restart;
1402 		}
1403 
1404 		err = -EPERM;
1405 		if (!unix_may_send(sk, other))
1406 			goto out_unlock;
1407 
1408 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1409 		if (err)
1410 			goto out_unlock;
1411 
		/* Both ends are marked established while both state locks are held. */
1412 		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1413 	} else {
1414 		/*
1415 		 *	1003.1g breaking connected state with AF_UNSPEC
1416 		 */
		/* With other == NULL only sk's state lock is taken. */
1417 		other = NULL;
1418 		unix_state_double_lock(sk, other);
1419 	}
1420 
1421 	/*
1422 	 * If it was connected, reconnect.
1423 	 */
1424 	if (unix_peer(sk)) {
1425 		struct sock *old_peer = unix_peer(sk);
1426 
1427 		unix_peer(sk) = other;
1428 		if (!other)
1429 			sk->sk_state = TCP_CLOSE;
1430 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1431 
1432 		unix_state_double_unlock(sk, other);
1433 
		/* Reconnecting to the same peer skips the disconnect handling. */
1434 		if (other != old_peer)
1435 			unix_dgram_disconnected(sk, old_peer);
1436 		sock_put(old_peer);
1437 	} else {
1438 		unix_peer(sk) = other;
1439 		unix_state_double_unlock(sk, other);
1440 	}
1441 
1442 	return 0;
1443 
	/* Error exit with both state locks held and a reference on other. */
1444 out_unlock:
1445 	unix_state_double_unlock(sk, other);
1446 	sock_put(other);
1447 out:
1448 	return err;
1449 }
1450 
/*
 * Wait on @other's peer_wait queue for up to @timeo.
 *
 * Entered with @other's state lock held (see the __releases annotation);
 * the lock is dropped before sleeping.  The task only sleeps if @other is
 * not dead, has not set RCV_SHUTDOWN and unix_recvq_full_lockless()
 * reports its receive queue as full.
 *
 * Returns the value of schedule_timeout() if the task slept, otherwise
 * @timeo unchanged.
 */
1451 static long unix_wait_for_peer(struct sock *other, long timeo)
1452 	__releases(&unix_sk(other)->lock)
1453 {
1454 	struct unix_sock *u = unix_sk(other);
1455 	int sched;
1456 	DEFINE_WAIT(wait);
1457 
	/* Queue ourselves first, then evaluate the sleep condition. */
1458 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1459 
1460 	sched = !sock_flag(other, SOCK_DEAD) &&
1461 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1462 		unix_recvq_full_lockless(other);
1463 
1464 	unix_state_unlock(other);
1465 
1466 	if (sched)
1467 		timeo = schedule_timeout(timeo);
1468 
1469 	finish_wait(&u->peer_wait, &wait);
1470 	return timeo;
1471 }
1472 
/*
 * Connect @sock to the listening socket named by @uaddr.
 *
 * A new sock (newsk) and a one-byte skb owned by it are allocated up front.
 * Once the listener is found and both state locks are held, newsk is set up
 * as the peer of @sock's sk and the skb is queued on the listener's receive
 * queue.
 *
 * Returns 0 on success.  Failures: -ENOMEM if the skb cannot be allocated,
 * -ECONNREFUSED if the target is not listening or has RCV_SHUTDOWN set,
 * -EAGAIN if its queue is full and no timeout is left, -EISCONN/-EINVAL
 * depending on our own state, the sock_intr_errno() value when interrupted
 * while waiting, or the error from unix_validate_addr(), unix_autobind(),
 * unix_create1(), unix_find_other() or security_unix_stream_connect().
 */
1473 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1474 			       int addr_len, int flags)
1475 {
1476 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1477 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1478 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1479 	struct net *net = sock_net(sk);
1480 	struct sk_buff *skb = NULL;
1481 	long timeo;
1482 	int err;
1483 	int st;
1484 
1485 	err = unix_validate_addr(sunaddr, addr_len);
1486 	if (err)
1487 		goto out;
1488 
	/* SOCK_PASSCRED/SOCK_PASSPIDFD set on an unbound socket: autobind first. */
1489 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1490 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1491 		err = unix_autobind(sk);
1492 		if (err)
1493 			goto out;
1494 	}
1495 
1496 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1497 
1498 	/* First of all allocate resources.
1499 	   If we will make it after state is locked,
1500 	   we will have to recheck all again in any case.
1501 	 */
1502 
1503 	/* create new sock for complete connection */
1504 	newsk = unix_create1(net, NULL, 0, sock->type);
1505 	if (IS_ERR(newsk)) {
1506 		err = PTR_ERR(newsk);
		/* Reset so that the out: path does not try to release it. */
1507 		newsk = NULL;
1508 		goto out;
1509 	}
1510 
1511 	err = -ENOMEM;
1512 
1513 	/* Allocate skb for sending to listening sock */
1514 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1515 	if (skb == NULL)
1516 		goto out;
1517 
1518 restart:
1519 	/*  Find listening sock. */
1520 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1521 	if (IS_ERR(other)) {
1522 		err = PTR_ERR(other);
		/* Reset so that the out: path does not sock_put() an error pointer. */
1523 		other = NULL;
1524 		goto out;
1525 	}
1526 
1527 	/* Latch state of peer */
1528 	unix_state_lock(other);
1529 
1530 	/* Apparently VFS overslept socket death. Retry. */
1531 	if (sock_flag(other, SOCK_DEAD)) {
1532 		unix_state_unlock(other);
1533 		sock_put(other);
1534 		goto restart;
1535 	}
1536 
1537 	err = -ECONNREFUSED;
1538 	if (other->sk_state != TCP_LISTEN)
1539 		goto out_unlock;
1540 	if (other->sk_shutdown & RCV_SHUTDOWN)
1541 		goto out_unlock;
1542 
1543 	if (unix_recvq_full(other)) {
1544 		err = -EAGAIN;
1545 		if (!timeo)
1546 			goto out_unlock;
1547 
		/* unix_wait_for_peer() drops other's state lock (see its __releases annotation). */
1548 		timeo = unix_wait_for_peer(other, timeo);
1549 
1550 		err = sock_intr_errno(timeo);
1551 		if (signal_pending(current))
1552 			goto out;
1553 		sock_put(other);
1554 		goto restart;
1555 	}
1556 
1557 	/* Latch our state.
1558 
1559 	   It is tricky place. We need to grab our state lock and cannot
1560 	   drop lock on peer. It is dangerous because deadlock is
1561 	   possible. Connect to self case and simultaneous
1562 	   attempt to connect are eliminated by checking socket
1563 	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1564 	   check this before attempt to grab lock.
1565 
1566 	   Well, and we have to recheck the state after socket locked.
1567 	 */
1568 	st = sk->sk_state;
1569 
1570 	switch (st) {
1571 	case TCP_CLOSE:
1572 		/* This is ok... continue with connect */
1573 		break;
1574 	case TCP_ESTABLISHED:
1575 		/* Socket is already connected */
1576 		err = -EISCONN;
1577 		goto out_unlock;
1578 	default:
1579 		err = -EINVAL;
1580 		goto out_unlock;
1581 	}
1582 
1583 	unix_state_lock_nested(sk);
1584 
	/* Our state changed between the unlocked read and taking the lock: start over. */
1585 	if (sk->sk_state != st) {
1586 		unix_state_unlock(sk);
1587 		unix_state_unlock(other);
1588 		sock_put(other);
1589 		goto restart;
1590 	}
1591 
1592 	err = security_unix_stream_connect(sk, other, newsk);
1593 	if (err) {
1594 		unix_state_unlock(sk);
1595 		goto out_unlock;
1596 	}
1597 
1598 	/* The way is open! Fastly set all the necessary fields... */
1599 
1600 	sock_hold(sk);
1601 	unix_peer(newsk)	= sk;
1602 	newsk->sk_state		= TCP_ESTABLISHED;
1603 	newsk->sk_type		= sk->sk_type;
1604 	init_peercred(newsk);
1605 	newu = unix_sk(newsk);
1606 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1607 	otheru = unix_sk(other);
1608 
1609 	/* copy address information from listening to new sock
1610 	 *
1611 	 * The contents of *(otheru->addr) and otheru->path
1612 	 * are seen fully set up here, since we have found
1613 	 * otheru in hash under its lock.  Insertion into the
1614 	 * hash chain we'd found it in had been done in an
1615 	 * earlier critical area protected by the chain's lock,
1616 	 * the same one where we'd set *(otheru->addr) contents,
1617 	 * as well as otheru->path and otheru->addr itself.
1618 	 *
1619 	 * Using smp_store_release() here to set newu->addr
1620 	 * is enough to make those stores, as well as stores
1621 	 * to newu->path visible to anyone who gets newu->addr
1622 	 * by smp_load_acquire().  IOW, the same warranties
1623 	 * as for unix_sock instances bound in unix_bind() or
1624 	 * in unix_autobind().
1625 	 */
1626 	if (otheru->path.dentry) {
1627 		path_get(&otheru->path);
1628 		newu->path = otheru->path;
1629 	}
1630 	refcount_inc(&otheru->addr->refcnt);
1631 	smp_store_release(&newu->addr, otheru->addr);
1632 
1633 	/* Set credentials */
1634 	copy_peercred(sk, other);
1635 
1636 	sock->state	= SS_CONNECTED;
1637 	sk->sk_state	= TCP_ESTABLISHED;
1638 	sock_hold(newsk);
1639 
1640 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1641 	unix_peer(sk)	= newsk;
1642 
1643 	unix_state_unlock(sk);
1644 
1645 	/* take ten and send info to listening sock */
1646 	spin_lock(&other->sk_receive_queue.lock);
1647 	__skb_queue_tail(&other->sk_receive_queue, skb);
1648 	spin_unlock(&other->sk_receive_queue.lock);
1649 	unix_state_unlock(other);
1650 	other->sk_data_ready(other);
1651 	sock_put(other);
1652 	return 0;
1653 
	/* Error exit with other's state lock still held. */
1654 out_unlock:
1655 	if (other)
1656 		unix_state_unlock(other);
1657 
	/* Common cleanup: skb, newsk and other may each still be NULL here. */
1658 out:
1659 	kfree_skb(skb);
1660 	if (newsk)
1661 		unix_release_sock(newsk, 0);
1662 	if (other)
1663 		sock_put(other);
1664 	return err;
1665 }
1666 
1667 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1668 {
1669 	struct sock *ska = socka->sk, *skb = sockb->sk;
1670 
1671 	/* Join our sockets back to back */
1672 	sock_hold(ska);
1673 	sock_hold(skb);
1674 	unix_peer(ska) = skb;
1675 	unix_peer(skb) = ska;
1676 	init_peercred(ska);
1677 	init_peercred(skb);
1678 
1679 	ska->sk_state = TCP_ESTABLISHED;
1680 	skb->sk_state = TCP_ESTABLISHED;
1681 	socka->state  = SS_CONNECTED;
1682 	sockb->state  = SS_CONNECTED;
1683 	return 0;
1684 }
1685 
1686 static void unix_sock_inherit_flags(const struct socket *old,
1687 				    struct socket *new)
1688 {
1689 	if (test_bit(SOCK_PASSCRED, &old->flags))
1690 		set_bit(SOCK_PASSCRED, &new->flags);
1691 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1692 		set_bit(SOCK_PASSPIDFD, &new->flags);
1693 	if (test_bit(SOCK_PASSSEC, &old->flags))
1694 		set_bit(SOCK_PASSSEC, &new->flags);
1695 }
1696 
1697 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1698 		       bool kern)
1699 {
1700 	struct sock *sk = sock->sk;
1701 	struct sock *tsk;
1702 	struct sk_buff *skb;
1703 	int err;
1704 
1705 	err = -EOPNOTSUPP;
1706 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1707 		goto out;
1708 
1709 	err = -EINVAL;
1710 	if (sk->sk_state != TCP_LISTEN)
1711 		goto out;
1712 
1713 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1714 	 * so that no locks are necessary.
1715 	 */
1716 
1717 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1718 				&err);
1719 	if (!skb) {
1720 		/* This means receive shutdown. */
1721 		if (err == 0)
1722 			err = -EINVAL;
1723 		goto out;
1724 	}
1725 
1726 	tsk = skb->sk;
1727 	skb_free_datagram(sk, skb);
1728 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1729 
1730 	/* attach accepted sock to socket */
1731 	unix_state_lock(tsk);
1732 	newsock->state = SS_CONNECTED;
1733 	unix_sock_inherit_flags(sock, newsock);
1734 	sock_graft(tsk, newsock);
1735 	unix_state_unlock(tsk);
1736 	return 0;
1737 
1738 out:
1739 	return err;
1740 }
1741 
1742 
1743 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1744 {
1745 	struct sock *sk = sock->sk;
1746 	struct unix_address *addr;
1747 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1748 	int err = 0;
1749 
1750 	if (peer) {
1751 		sk = unix_peer_get(sk);
1752 
1753 		err = -ENOTCONN;
1754 		if (!sk)
1755 			goto out;
1756 		err = 0;
1757 	} else {
1758 		sock_hold(sk);
1759 	}
1760 
1761 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1762 	if (!addr) {
1763 		sunaddr->sun_family = AF_UNIX;
1764 		sunaddr->sun_path[0] = 0;
1765 		err = offsetof(struct sockaddr_un, sun_path);
1766 	} else {
1767 		err = addr->len;
1768 		memcpy(sunaddr, addr->name, addr->len);
1769 	}
1770 	sock_put(sk);
1771 out:
1772 	return err;
1773 }
1774 
/*
 * Duplicate the fd list attached to @skb into @scm->fp, then take and
 * release unix_gc_lock once.  The comment in the body explains why that
 * empty critical section is needed for MSG_PEEK.
 */
1775 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1776 {
1777 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1778 
1779 	/*
1780 	 * Garbage collection of unix sockets starts by selecting a set of
1781 	 * candidate sockets which have reference only from being in flight
1782 	 * (total_refs == inflight_refs).  This condition is checked once during
1783 	 * the candidate collection phase, and candidates are marked as such, so
1784 	 * that non-candidates can later be ignored.  While inflight_refs is
1785 	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1786 	 * is an instantaneous decision.
1787 	 *
1788 	 * Once a candidate, however, the socket must not be reinstalled into a
1789 	 * file descriptor while the garbage collection is in progress.
1790 	 *
1791 	 * If the above conditions are met, then the directed graph of
1792 	 * candidates (*) does not change while unix_gc_lock is held.
1793 	 *
1794 	 * Any operation that changes the file count through file descriptors
1795 	 * (dup, close, sendmsg) does not change the graph since candidates are
1796 	 * not installed in fds.
1797 	 *
1798 	 * Dequeuing a candidate via recvmsg would install it into an fd, but
1799 	 * that takes unix_gc_lock to decrement the inflight count, so it's
1800 	 * serialized with garbage collection.
1801 	 *
1802 	 * MSG_PEEK is special in that it does not change the inflight count,
1803 	 * yet does install the socket into an fd.  The following lock/unlock
1804 	 * pair is to ensure serialization with garbage collection.  It must be
1805 	 * done between incrementing the file count and installing the file into
1806 	 * an fd.
1807 	 *
1808 	 * If garbage collection starts after the barrier provided by the
1809 	 * lock/unlock, then it will see the elevated refcount and not mark this
1810 	 * as a candidate.  If a garbage collection is already in progress
1811 	 * before the file count was incremented, then the lock/unlock pair will
1812 	 * ensure that garbage collection is finished before progressing to
1813 	 * installing the fd.
1814 	 *
1815 	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1816 	 * which is on the queue of listening socket A.
1817 	 */
1818 	spin_lock(&unix_gc_lock);
1819 	spin_unlock(&unix_gc_lock);
1820 }
1821 
1822 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1823 {
1824 	int err = 0;
1825 
1826 	UNIXCB(skb).pid  = get_pid(scm->pid);
1827 	UNIXCB(skb).uid = scm->creds.uid;
1828 	UNIXCB(skb).gid = scm->creds.gid;
1829 	UNIXCB(skb).fp = NULL;
1830 	unix_get_secdata(scm, skb);
1831 	if (scm->fp && send_fds)
1832 		err = unix_attach_fds(scm, skb);
1833 
1834 	skb->destructor = unix_destruct_scm;
1835 	return err;
1836 }
1837 
1838 static bool unix_passcred_enabled(const struct socket *sock,
1839 				  const struct sock *other)
1840 {
1841 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1842 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1843 	       !other->sk_socket ||
1844 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1845 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1846 }
1847 
1848 /*
1849  * Some apps rely on write() giving SCM_CREDENTIALS
1850  * We include credentials if source or destination socket
1851  * asserted SOCK_PASSCRED.
1852  */
1853 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1854 			    const struct sock *other)
1855 {
1856 	if (UNIXCB(skb).pid)
1857 		return;
1858 	if (unix_passcred_enabled(sock, other)) {
1859 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1860 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1861 	}
1862 }
1863 
1864 static bool unix_skb_scm_eq(struct sk_buff *skb,
1865 			    struct scm_cookie *scm)
1866 {
1867 	return UNIXCB(skb).pid == scm->pid &&
1868 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1869 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1870 	       unix_secdata_eq(scm, skb);
1871 }
1872 
1873 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1874 {
1875 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1876 	struct unix_sock *u = unix_sk(sk);
1877 
1878 	if (unlikely(fp && fp->count))
1879 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1880 }
1881 
1882 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1883 {
1884 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1885 	struct unix_sock *u = unix_sk(sk);
1886 
1887 	if (unlikely(fp && fp->count))
1888 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1889 }
1890 
1891 /*
1892  *	Send AF_UNIX data.
1893  *
 *	The destination is the address in msg->msg_name when one is given,
 *	otherwise the connected peer.  The whole message is copied into a
 *	single skb which is queued on the receiver.
 *
 *	Returns @len on success (also when sk_filter() drops the packet) or a
 *	negative errno, among them: -EOPNOTSUPP for MSG_OOB, -ENOTCONN when
 *	no address is given and there is no peer, -EMSGSIZE when @len exceeds
 *	sk_sndbuf - 32, -EPERM when unix_may_send() refuses, -EPIPE when the
 *	receiver has RCV_SHUTDOWN set, and -EAGAIN when the receiver's queue
 *	is full and no timeout is left.
1893  */
1894 
1895 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1896 			      size_t len)
1897 {
1898 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1899 	struct sock *sk = sock->sk, *other = NULL;
1900 	struct unix_sock *u = unix_sk(sk);
1901 	struct scm_cookie scm;
1902 	struct sk_buff *skb;
1903 	int data_len = 0;
1904 	int sk_locked;
1905 	long timeo;
1906 	int err;
1907 
1908 	wait_for_unix_gc();
1909 	err = scm_send(sock, msg, &scm, false);
1910 	if (err < 0)
1911 		return err;
1912 
1913 	err = -EOPNOTSUPP;
1914 	if (msg->msg_flags&MSG_OOB)
1915 		goto out;
1916 
1917 	if (msg->msg_namelen) {
1918 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1919 		if (err)
1920 			goto out;
1921 	} else {
1922 		sunaddr = NULL;
1923 		err = -ENOTCONN;
1924 		other = unix_peer_get(sk);
1925 		if (!other)
1926 			goto out;
1927 	}
1928 
	/* SOCK_PASSCRED/SOCK_PASSPIDFD set on an unbound socket: autobind first. */
1929 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1930 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1931 		err = unix_autobind(sk);
1932 		if (err)
1933 			goto out;
1934 	}
1935 
1936 	err = -EMSGSIZE;
1937 	if (len > sk->sk_sndbuf - 32)
1938 		goto out;
1939 
	/*
	 * The part of the message beyond SKB_MAX_ALLOC becomes paged data,
	 * capped at MAX_SKB_FRAGS pages and rounded up to a whole page.
	 */
1940 	if (len > SKB_MAX_ALLOC) {
1941 		data_len = min_t(size_t,
1942 				 len - SKB_MAX_ALLOC,
1943 				 MAX_SKB_FRAGS * PAGE_SIZE);
1944 		data_len = PAGE_ALIGN(data_len);
1945 
1946 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1947 	}
1948 
1949 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1950 				   msg->msg_flags & MSG_DONTWAIT, &err,
1951 				   PAGE_ALLOC_COSTLY_ORDER);
1952 	if (skb == NULL)
1953 		goto out;
1954 
1955 	err = unix_scm_to_skb(&scm, skb, true);
1956 	if (err < 0)
1957 		goto out_free;
1958 
1959 	skb_put(skb, len - data_len);
1960 	skb->data_len = data_len;
1961 	skb->len = len;
1962 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1963 	if (err)
1964 		goto out_free;
1965 
1966 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1967 
1968 restart:
	/* No receiver yet (or it went away): look it up by address. */
1969 	if (!other) {
1970 		err = -ECONNRESET;
1971 		if (sunaddr == NULL)
1972 			goto out_free;
1973 
1974 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1975 					sk->sk_type);
1976 		if (IS_ERR(other)) {
1977 			err = PTR_ERR(other);
			/* Reset so that the out: path does not sock_put() an error pointer. */
1978 			other = NULL;
1979 			goto out_free;
1980 		}
1981 	}
1982 
1983 	if (sk_filter(other, skb) < 0) {
1984 		/* Toss the packet but do not return any error to the sender */
1985 		err = len;
1986 		goto out_free;
1987 	}
1988 
	/* sk_locked is non-zero while sk's own state lock is held as well. */
1989 	sk_locked = 0;
1990 	unix_state_lock(other);
1991 restart_locked:
1992 	err = -EPERM;
1993 	if (!unix_may_send(sk, other))
1994 		goto out_unlock;
1995 
1996 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
1997 		/*
1998 		 *	Check with 1003.1g - what should
1999 		 *	datagram error
2000 		 */
2001 		unix_state_unlock(other);
2002 		sock_put(other);
2003 
2004 		if (!sk_locked)
2005 			unix_state_lock(sk);
2006 
2007 		err = 0;
2008 		if (sk->sk_type == SOCK_SEQPACKET) {
2009 			/* We are here only when racing with unix_release_sock()
2010 			 * is clearing @other. Never change state to TCP_CLOSE
2011 			 * unlike SOCK_DGRAM wants.
2012 			 */
2013 			unix_state_unlock(sk);
2014 			err = -EPIPE;
2015 		} else if (unix_peer(sk) == other) {
			/* The dead sock was our connected peer: disconnect from it. */
2016 			unix_peer(sk) = NULL;
2017 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2018 
2019 			sk->sk_state = TCP_CLOSE;
2020 			unix_state_unlock(sk);
2021 
2022 			unix_dgram_disconnected(sk, other);
2023 			sock_put(other);
2024 			err = -ECONNREFUSED;
2025 		} else {
2026 			unix_state_unlock(sk);
2027 		}
2028 
		/* err == 0 means the dead sock was not our peer: look the address up again. */
2029 		other = NULL;
2030 		if (err)
2031 			goto out_free;
2032 		goto restart;
2033 	}
2034 
2035 	err = -EPIPE;
2036 	if (other->sk_shutdown & RCV_SHUTDOWN)
2037 		goto out_unlock;
2038 
2039 	if (sk->sk_type != SOCK_SEQPACKET) {
2040 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2041 		if (err)
2042 			goto out_unlock;
2043 	}
2044 
2045 	/* other == sk && unix_peer(other) != sk if
2046 	 * - unix_peer(sk) == NULL, destination address bound to sk
2047 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
2048 	 */
2049 	if (other != sk &&
2050 	    unlikely(unix_peer(other) != sk &&
2051 	    unix_recvq_full_lockless(other))) {
2052 		if (timeo) {
			/* unix_wait_for_peer() drops other's state lock (see its __releases annotation). */
2053 			timeo = unix_wait_for_peer(other, timeo);
2054 
2055 			err = sock_intr_errno(timeo);
2056 			if (signal_pending(current))
2057 				goto out_free;
2058 
2059 			goto restart;
2060 		}
2061 
		/* No timeout left: switch from holding only other's lock to holding both. */
2062 		if (!sk_locked) {
2063 			unix_state_unlock(other);
2064 			unix_state_double_lock(sk, other);
2065 		}
2066 
2067 		if (unix_peer(sk) != other ||
2068 		    unix_dgram_peer_wake_me(sk, other)) {
2069 			err = -EAGAIN;
2070 			sk_locked = 1;
2071 			goto out_unlock;
2072 		}
2073 
		/* Both locks were just taken: redo the checks above under them. */
2074 		if (!sk_locked) {
2075 			sk_locked = 1;
2076 			goto restart_locked;
2077 		}
2078 	}
2079 
2080 	if (unlikely(sk_locked))
2081 		unix_state_unlock(sk);
2082 
2083 	if (sock_flag(other, SOCK_RCVTSTAMP))
2084 		__net_timestamp(skb);
2085 	maybe_add_creds(skb, sock, other);
2086 	scm_stat_add(other, skb);
2087 	skb_queue_tail(&other->sk_receive_queue, skb);
2088 	unix_state_unlock(other);
2089 	other->sk_data_ready(other);
2090 	sock_put(other);
2091 	scm_destroy(&scm);
2092 	return len;
2093 
	/* Error exit with other's state lock held, and sk's too if sk_locked. */
2094 out_unlock:
2095 	if (sk_locked)
2096 		unix_state_unlock(sk);
2097 	unix_state_unlock(other);
2098 out_free:
2099 	kfree_skb(skb);
2100 out:
2101 	if (other)
2102 		sock_put(other);
2103 	scm_destroy(&scm);
2104 	return err;
2105 }
2106 
2107 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2108  * bytes, and a minimum of a full page.
 *
 * get_order(32768) is the page order needed to cover 32768 bytes, so the
 * value is 32768 when PAGE_SIZE is at most 32768 and PAGE_SIZE otherwise.
2109  */
2110 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2111 
2112 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Queue one out-of-band byte taken from @msg on @other.
 *
 * The byte is copied into a fresh one-byte skb.  Under @other's state lock
 * that skb becomes @other's oob_skb (a previous one is released with
 * consume_skb()) and is also appended to @other's receive queue;
 * sk_send_sigurg() is called on @other.  SCM fds are attached to the skb
 * only when @fds_sent is false.
 *
 * Returns 0 on success, -EPIPE if @other is dead or has RCV_SHUTDOWN set,
 * or the error from sock_alloc_send_skb(), unix_scm_to_skb() or
 * skb_copy_datagram_from_iter().
 */
2113 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2114 		     struct scm_cookie *scm, bool fds_sent)
2115 {
2116 	struct unix_sock *ousk = unix_sk(other);
2117 	struct sk_buff *skb;
2118 	int err = 0;
2119 
2120 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2121 
2122 	if (!skb)
2123 		return err;
2124 
2125 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2126 	if (err < 0) {
2127 		kfree_skb(skb);
2128 		return err;
2129 	}
2130 	skb_put(skb, 1);
2131 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2132 
2133 	if (err) {
2134 		kfree_skb(skb);
2135 		return err;
2136 	}
2137 
2138 	unix_state_lock(other);
2139 
2140 	if (sock_flag(other, SOCK_DEAD) ||
2141 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2142 		unix_state_unlock(other);
2143 		kfree_skb(skb);
2144 		return -EPIPE;
2145 	}
2146 
2147 	maybe_add_creds(skb, sock, other);
	/* The skb is referenced from two places below: oob_skb and the receive queue. */
2148 	skb_get(skb);
2149 
2150 	if (ousk->oob_skb)
2151 		consume_skb(ousk->oob_skb);
2152 
2153 	WRITE_ONCE(ousk->oob_skb, skb);
2154 
2155 	scm_stat_add(other, skb);
2156 	skb_queue_tail(&other->sk_receive_queue, skb);
2157 	sk_send_sigurg(other);
2158 	unix_state_unlock(other);
2159 	other->sk_data_ready(other);
2160 
	/* err is 0 here: every failure above returned early. */
2161 	return err;
2162 }
2163 #endif
2164 
/*
 * Send @len bytes from @msg to the connected peer of a stream socket.
 *
 * The payload is cut into skbs that are appended one at a time to the peer's
 * receive queue. When MSG_OOB is set and CONFIG_AF_UNIX_OOB is enabled, the
 * last byte is excluded from the main loop and queued through queue_oob().
 *
 * Returns the number of bytes queued when at least one byte was sent,
 * otherwise a negative errno. On the pipe_err paths SIGPIPE is raised if
 * nothing was sent and MSG_NOSIGNAL is clear.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		/* Hold back the final byte for queue_oob(). An empty OOB send
		 * takes the goto below and fails with -EOPNOTSUPP, as does any
		 * OOB send when the option is compiled out.
		 */
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	/* A destination address is never accepted on a stream send. */
	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			/* Allocate an skb with no data area of its own; the
			 * payload is filled in by skb_splice_from_iter() below.
			 */
			skb = sock_alloc_send_pskb(sk, 0, 0,
						   msg->msg_flags & MSG_DONTWAIT,
						   &err, 0);
		} else {
			/* Keep two messages in the pipe so it schedules better */
			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

			/* allow fallback to order-0 allocations */
			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

			/* Whatever does not fit in the linear head goes to the
			 * paged part, rounded up to whole pages but never more
			 * than @size itself.
			 */
			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
						   msg->msg_flags & MSG_DONTWAIT, &err,
						   get_order(UNIX_SKB_FRAGS_SZ));
		}
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
						   sk->sk_allocation);
			if (err < 0) {
				kfree_skb(skb);
				goto out_err;
			}
			/* The splice may take fewer bytes than requested. */
			size = err;
			/* Add the spliced bytes to sk_wmem_alloc. */
			refcount_add(size, &sk->sk_wmem_alloc);
		} else {
			skb_put(skb, size - data_len);
			skb->data_len = data_len;
			skb->len = size;
			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
			if (err) {
				kfree_skb(skb);
				goto out_err;
			}
		}

		unix_state_lock(other);

		/* The peer may have gone away or shut down its receive side
		 * while this skb was being built.
		 */
		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		scm_stat_add(other, skb);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		/* Queue the byte that was held back at the top. */
		err = queue_oob(sock, msg, other, &scm, fds_sent);
		if (err)
			goto out_err;
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	/* A partial send reports the byte count rather than the error. */
	return sent ? : err;
}
2296 
2297 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
2298 				    int offset, size_t size, int flags)
2299 {
2300 	struct bio_vec bvec;
2301 	struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES };
2302 
2303 	if (flags & MSG_SENDPAGE_NOTLAST)
2304 		msg.msg_flags |= MSG_MORE;
2305 
2306 	bvec_set_page(&bvec, page, size, offset);
2307 	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
2308 	return unix_stream_sendmsg(socket, &msg, size);
2309 }
2310 
2311 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2312 				  size_t len)
2313 {
2314 	int err;
2315 	struct sock *sk = sock->sk;
2316 
2317 	err = sock_error(sk);
2318 	if (err)
2319 		return err;
2320 
2321 	if (sk->sk_state != TCP_ESTABLISHED)
2322 		return -ENOTCONN;
2323 
2324 	if (msg->msg_namelen)
2325 		msg->msg_namelen = 0;
2326 
2327 	return unix_dgram_sendmsg(sock, msg, len);
2328 }
2329 
2330 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2331 				  size_t size, int flags)
2332 {
2333 	struct sock *sk = sock->sk;
2334 
2335 	if (sk->sk_state != TCP_ESTABLISHED)
2336 		return -ENOTCONN;
2337 
2338 	return unix_dgram_recvmsg(sock, msg, size, flags);
2339 }
2340 
2341 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2342 {
2343 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2344 
2345 	if (addr) {
2346 		msg->msg_namelen = addr->len;
2347 		memcpy(msg->msg_name, addr->name, addr->len);
2348 	}
2349 }
2350 
/*
 * Receive one datagram from @sk into @msg.
 *
 * At most @size bytes are copied; when the datagram is longer, MSG_TRUNC is
 * set in msg->msg_flags. MSG_OOB is rejected with -EOPNOTSUPP. With
 * MSG_PEEK the datagram stays queued and the peek offset is advanced
 * instead.
 *
 * Returns the number of bytes copied (or, when the caller passed MSG_TRUNC
 * in @flags, the full remaining length of the datagram), or a negative
 * errno.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	/* Try to dequeue under iolock; on -EAGAIN drop the mutex and wait for
	 * more packets until the timeout is used up. The loop is left with
	 * iolock held only when an skb was obtained.
	 */
	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	/* A datagram was taken: wake anyone sleeping on peer_wait. */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	/* Clamp the copy to what is left of the datagram after @skip. */
	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	/* MSG_TRUNC from the caller asks for the real length, not the
	 * number of bytes copied.
	 */
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
2457 
2458 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2459 			      int flags)
2460 {
2461 	struct sock *sk = sock->sk;
2462 
2463 #ifdef CONFIG_BPF_SYSCALL
2464 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2465 
2466 	if (prot != &unix_dgram_proto)
2467 		return prot->recvmsg(sk, msg, size, flags, NULL);
2468 #endif
2469 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2470 }
2471 
2472 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2473 {
2474 	struct unix_sock *u = unix_sk(sk);
2475 	struct sk_buff *skb;
2476 	int err;
2477 
2478 	mutex_lock(&u->iolock);
2479 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2480 	mutex_unlock(&u->iolock);
2481 	if (!skb)
2482 		return err;
2483 
2484 	return recv_actor(sk, skb);
2485 }
2486 
/*
 *	Sleep until more data has arrived. But check for races..
 *
 *	@last and @last_len describe the receive-queue tail (and its length)
 *	as the caller last saw it; any difference means new data arrived.
 *	The sleep also ends on a socket error, RCV_SHUTDOWN, a pending signal,
 *	an exhausted timeout, or the socket being marked SOCK_DEAD.
 *	@freezable adds TASK_FREEZABLE to the sleep state.
 *
 *	Returns the remaining timeout.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		/* Get on the wait queue before testing the conditions. */
		prepare_to_wait(sk_sleep(sk), &wait, state);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		/* The state lock is dropped only for the sleep itself. */
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		/* Note: this exit leaves SOCKWQ_ASYNC_WAITDATA set. */
		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
2527 
2528 static unsigned int unix_skb_len(const struct sk_buff *skb)
2529 {
2530 	return skb->len - UNIXCB(skb).consumed;
2531 }
2532 
/* Parameters of one stream read, shared by the recvmsg and splice paths. */
struct unix_stream_read_state {
	/* Moves up to @chunk bytes out of the skb, starting @skip bytes past
	 * the already-consumed part; returns the byte count or a negative
	 * value on failure.
	 */
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;		/* socket being read */
	struct msghdr *msg;		/* destination message; left unset by the splice path */
	struct pipe_inode_info *pipe;	/* destination pipe, used by the splice actor */
	size_t size;			/* maximum number of bytes to read */
	int flags;			/* MSG_* receive flags */
	unsigned int splice_flags;	/* flags handed to skb_splice_bits() */
};
2543 
2544 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2545 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2546 {
2547 	struct socket *sock = state->socket;
2548 	struct sock *sk = sock->sk;
2549 	struct unix_sock *u = unix_sk(sk);
2550 	int chunk = 1;
2551 	struct sk_buff *oob_skb;
2552 
2553 	mutex_lock(&u->iolock);
2554 	unix_state_lock(sk);
2555 
2556 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2557 		unix_state_unlock(sk);
2558 		mutex_unlock(&u->iolock);
2559 		return -EINVAL;
2560 	}
2561 
2562 	oob_skb = u->oob_skb;
2563 
2564 	if (!(state->flags & MSG_PEEK))
2565 		WRITE_ONCE(u->oob_skb, NULL);
2566 
2567 	unix_state_unlock(sk);
2568 
2569 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2570 
2571 	if (!(state->flags & MSG_PEEK)) {
2572 		UNIXCB(oob_skb).consumed += 1;
2573 		kfree_skb(oob_skb);
2574 	}
2575 
2576 	mutex_unlock(&u->iolock);
2577 
2578 	if (chunk < 0)
2579 		return -EFAULT;
2580 
2581 	state->msg->msg_flags |= MSG_OOB;
2582 	return 1;
2583 }
2584 
2585 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2586 				  int flags, int copied)
2587 {
2588 	struct unix_sock *u = unix_sk(sk);
2589 
2590 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2591 		skb_unlink(skb, &sk->sk_receive_queue);
2592 		consume_skb(skb);
2593 		skb = NULL;
2594 	} else {
2595 		if (skb == u->oob_skb) {
2596 			if (copied) {
2597 				skb = NULL;
2598 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2599 				if (!(flags & MSG_PEEK)) {
2600 					WRITE_ONCE(u->oob_skb, NULL);
2601 					consume_skb(skb);
2602 				}
2603 			} else if (!(flags & MSG_PEEK)) {
2604 				skb_unlink(skb, &sk->sk_receive_queue);
2605 				consume_skb(skb);
2606 				skb = skb_peek(&sk->sk_receive_queue);
2607 			}
2608 		}
2609 	}
2610 	return skb;
2611 }
2612 #endif
2613 
2614 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2615 {
2616 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2617 		return -ENOTCONN;
2618 
2619 	return unix_read_skb(sk, recv_actor);
2620 }
2621 
/*
 * Common implementation of stream reads (recvmsg and splice).
 *
 * Walks the receive queue under u->iolock and hands data to
 * state->recv_actor until state->size bytes were read, the queue runs dry
 * with the low-water target met, or one of the stop conditions commented
 * below is hit. @freezable is passed on to unix_stream_data_wait().
 *
 * Returns the number of bytes read when that is non-zero, otherwise the
 * error code (0 or a negative errno).
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		/* -EOPNOTSUPP stands unless OOB support is compiled in. */
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		/* (Re)start from the head of the queue under the state lock. */
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb) {
				/* Either stop with what was copied so far or
				 * look at the queue again.
				 */
				unix_state_unlock(sk);
				if (copied)
					break;
				goto redo;
			}
		}
#endif
again:
		/* Entered with the state lock held. */
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			/* iolock is released for the duration of the sleep. */
			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				/* iolock is not held here, hence the jump
				 * straight to out.
				 */
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		/* Step over skbs that lie entirely before the peek offset. */
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			/* NOTE(review): sunaddr is local to this block and is
			 * not read again, so this assignment alone does not
			 * stop the address being copied on later iterations -
			 * confirm the intent.
			 */
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		/* Hold a reference across the actor call. */
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			/* skb only partly read: the request is satisfied. */
			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* Stop once file descriptors have been picked up. */
			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			/* Peeking leaves skbs queued, so move on to the next
			 * one by hand; "again" expects the state lock held.
			 */
			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
2836 
2837 static int unix_stream_read_actor(struct sk_buff *skb,
2838 				  int skip, int chunk,
2839 				  struct unix_stream_read_state *state)
2840 {
2841 	int ret;
2842 
2843 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2844 				    state->msg, chunk);
2845 	return ret ?: chunk;
2846 }
2847 
2848 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2849 			  size_t size, int flags)
2850 {
2851 	struct unix_stream_read_state state = {
2852 		.recv_actor = unix_stream_read_actor,
2853 		.socket = sk->sk_socket,
2854 		.msg = msg,
2855 		.size = size,
2856 		.flags = flags
2857 	};
2858 
2859 	return unix_stream_read_generic(&state, true);
2860 }
2861 
2862 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2863 			       size_t size, int flags)
2864 {
2865 	struct unix_stream_read_state state = {
2866 		.recv_actor = unix_stream_read_actor,
2867 		.socket = sock,
2868 		.msg = msg,
2869 		.size = size,
2870 		.flags = flags
2871 	};
2872 
2873 #ifdef CONFIG_BPF_SYSCALL
2874 	struct sock *sk = sock->sk;
2875 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2876 
2877 	if (prot != &unix_stream_proto)
2878 		return prot->recvmsg(sk, msg, size, flags, NULL);
2879 #endif
2880 	return unix_stream_read_generic(&state, true);
2881 }
2882 
2883 static int unix_stream_splice_actor(struct sk_buff *skb,
2884 				    int skip, int chunk,
2885 				    struct unix_stream_read_state *state)
2886 {
2887 	return skb_splice_bits(skb, state->socket->sk,
2888 			       UNIXCB(skb).consumed + skip,
2889 			       state->pipe, chunk, state->splice_flags);
2890 }
2891 
2892 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2893 				       struct pipe_inode_info *pipe,
2894 				       size_t size, unsigned int flags)
2895 {
2896 	struct unix_stream_read_state state = {
2897 		.recv_actor = unix_stream_splice_actor,
2898 		.socket = sock,
2899 		.pipe = pipe,
2900 		.size = size,
2901 		.splice_flags = flags,
2902 	};
2903 
2904 	if (unlikely(*ppos))
2905 		return -ESPIPE;
2906 
2907 	if (sock->file->f_flags & O_NONBLOCK ||
2908 	    flags & SPLICE_F_NONBLOCK)
2909 		state.flags = MSG_DONTWAIT;
2910 
2911 	return unix_stream_read_generic(&state, false);
2912 }
2913 
2914 static int unix_shutdown(struct socket *sock, int mode)
2915 {
2916 	struct sock *sk = sock->sk;
2917 	struct sock *other;
2918 
2919 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2920 		return -EINVAL;
2921 	/* This maps:
2922 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2923 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2924 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2925 	 */
2926 	++mode;
2927 
2928 	unix_state_lock(sk);
2929 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2930 	other = unix_peer(sk);
2931 	if (other)
2932 		sock_hold(other);
2933 	unix_state_unlock(sk);
2934 	sk->sk_state_change(sk);
2935 
2936 	if (other &&
2937 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2938 
2939 		int peer_mode = 0;
2940 		const struct proto *prot = READ_ONCE(other->sk_prot);
2941 
2942 		if (prot->unhash)
2943 			prot->unhash(other);
2944 		if (mode&RCV_SHUTDOWN)
2945 			peer_mode |= SEND_SHUTDOWN;
2946 		if (mode&SEND_SHUTDOWN)
2947 			peer_mode |= RCV_SHUTDOWN;
2948 		unix_state_lock(other);
2949 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2950 		unix_state_unlock(other);
2951 		other->sk_state_change(other);
2952 		if (peer_mode == SHUTDOWN_MASK)
2953 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2954 		else if (peer_mode & RCV_SHUTDOWN)
2955 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2956 	}
2957 	if (other)
2958 		sock_put(other);
2959 
2960 	return 0;
2961 }
2962 
2963 long unix_inq_len(struct sock *sk)
2964 {
2965 	struct sk_buff *skb;
2966 	long amount = 0;
2967 
2968 	if (sk->sk_state == TCP_LISTEN)
2969 		return -EINVAL;
2970 
2971 	spin_lock(&sk->sk_receive_queue.lock);
2972 	if (sk->sk_type == SOCK_STREAM ||
2973 	    sk->sk_type == SOCK_SEQPACKET) {
2974 		skb_queue_walk(&sk->sk_receive_queue, skb)
2975 			amount += unix_skb_len(skb);
2976 	} else {
2977 		skb = skb_peek(&sk->sk_receive_queue);
2978 		if (skb)
2979 			amount = skb->len;
2980 	}
2981 	spin_unlock(&sk->sk_receive_queue.lock);
2982 
2983 	return amount;
2984 }
2985 EXPORT_SYMBOL_GPL(unix_inq_len);
2986 
/* Amount of send-side memory currently accounted to @sk (sk_wmem_alloc). */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2992 
2993 static int unix_open_file(struct sock *sk)
2994 {
2995 	struct path path;
2996 	struct file *f;
2997 	int fd;
2998 
2999 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3000 		return -EPERM;
3001 
3002 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3003 		return -ENOENT;
3004 
3005 	path = unix_sk(sk)->path;
3006 	if (!path.dentry)
3007 		return -ENOENT;
3008 
3009 	path_get(&path);
3010 
3011 	fd = get_unused_fd_flags(O_CLOEXEC);
3012 	if (fd < 0)
3013 		goto out;
3014 
3015 	f = dentry_open(&path, O_PATH, current_cred());
3016 	if (IS_ERR(f)) {
3017 		put_unused_fd(fd);
3018 		fd = PTR_ERR(f);
3019 		goto out;
3020 	}
3021 
3022 	fd_install(fd, f);
3023 out:
3024 	path_put(&path);
3025 
3026 	return fd;
3027 }
3028 
3029 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3030 {
3031 	struct sock *sk = sock->sk;
3032 	long amount = 0;
3033 	int err;
3034 
3035 	switch (cmd) {
3036 	case SIOCOUTQ:
3037 		amount = unix_outq_len(sk);
3038 		err = put_user(amount, (int __user *)arg);
3039 		break;
3040 	case SIOCINQ:
3041 		amount = unix_inq_len(sk);
3042 		if (amount < 0)
3043 			err = amount;
3044 		else
3045 			err = put_user(amount, (int __user *)arg);
3046 		break;
3047 	case SIOCUNIXFILE:
3048 		err = unix_open_file(sk);
3049 		break;
3050 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3051 	case SIOCATMARK:
3052 		{
3053 			struct sk_buff *skb;
3054 			int answ = 0;
3055 
3056 			skb = skb_peek(&sk->sk_receive_queue);
3057 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3058 				answ = 1;
3059 			err = put_user(answ, (int __user *)arg);
3060 		}
3061 		break;
3062 #endif
3063 	default:
3064 		err = -ENOIOCTLCMD;
3065 		break;
3066 	}
3067 	return err;
3068 }
3069 
3070 #ifdef CONFIG_COMPAT
/* Compat ioctl(): convert the compat pointer in @arg and reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3076 
3077 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3078 {
3079 	struct sock *sk = sock->sk;
3080 	__poll_t mask;
3081 	u8 shutdown;
3082 
3083 	sock_poll_wait(file, sock, wait);
3084 	mask = 0;
3085 	shutdown = READ_ONCE(sk->sk_shutdown);
3086 
3087 	/* exceptional events? */
3088 	if (READ_ONCE(sk->sk_err))
3089 		mask |= EPOLLERR;
3090 	if (shutdown == SHUTDOWN_MASK)
3091 		mask |= EPOLLHUP;
3092 	if (shutdown & RCV_SHUTDOWN)
3093 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3094 
3095 	/* readable? */
3096 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3097 		mask |= EPOLLIN | EPOLLRDNORM;
3098 	if (sk_is_readable(sk))
3099 		mask |= EPOLLIN | EPOLLRDNORM;
3100 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3101 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3102 		mask |= EPOLLPRI;
3103 #endif
3104 
3105 	/* Connection-based need to check for termination and startup */
3106 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3107 	    sk->sk_state == TCP_CLOSE)
3108 		mask |= EPOLLHUP;
3109 
3110 	/*
3111 	 * we set writable also when the other side has shut down the
3112 	 * connection. This prevents stuck sockets.
3113 	 */
3114 	if (unix_writable(sk))
3115 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3116 
3117 	return mask;
3118 }
3119 
3120 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3121 				    poll_table *wait)
3122 {
3123 	struct sock *sk = sock->sk, *other;
3124 	unsigned int writable;
3125 	__poll_t mask;
3126 	u8 shutdown;
3127 
3128 	sock_poll_wait(file, sock, wait);
3129 	mask = 0;
3130 	shutdown = READ_ONCE(sk->sk_shutdown);
3131 
3132 	/* exceptional events? */
3133 	if (READ_ONCE(sk->sk_err) ||
3134 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3135 		mask |= EPOLLERR |
3136 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3137 
3138 	if (shutdown & RCV_SHUTDOWN)
3139 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3140 	if (shutdown == SHUTDOWN_MASK)
3141 		mask |= EPOLLHUP;
3142 
3143 	/* readable? */
3144 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3145 		mask |= EPOLLIN | EPOLLRDNORM;
3146 	if (sk_is_readable(sk))
3147 		mask |= EPOLLIN | EPOLLRDNORM;
3148 
3149 	/* Connection-based need to check for termination and startup */
3150 	if (sk->sk_type == SOCK_SEQPACKET) {
3151 		if (sk->sk_state == TCP_CLOSE)
3152 			mask |= EPOLLHUP;
3153 		/* connection hasn't started yet? */
3154 		if (sk->sk_state == TCP_SYN_SENT)
3155 			return mask;
3156 	}
3157 
3158 	/* No write status requested, avoid expensive OUT tests. */
3159 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3160 		return mask;
3161 
3162 	writable = unix_writable(sk);
3163 	if (writable) {
3164 		unix_state_lock(sk);
3165 
3166 		other = unix_peer(sk);
3167 		if (other && unix_peer(other) != sk &&
3168 		    unix_recvq_full_lockless(other) &&
3169 		    unix_dgram_peer_wake_me(sk, other))
3170 			writable = 0;
3171 
3172 		unix_state_unlock(sk);
3173 	}
3174 
3175 	if (writable)
3176 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3177 	else
3178 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3179 
3180 	return mask;
3181 }
3182 
3183 #ifdef CONFIG_PROC_FS
3184 
/* The seq_file position packs two values into one unsigned long: the hash
 * bucket index in the high bits and a 1-based offset within that bucket in
 * the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Extract the bucket index / in-bucket offset from a position, or build a
 * position from the two.
 */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3190 
3191 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3192 {
3193 	unsigned long offset = get_offset(*pos);
3194 	unsigned long bucket = get_bucket(*pos);
3195 	unsigned long count = 0;
3196 	struct sock *sk;
3197 
3198 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3199 	     sk; sk = sk_next(sk)) {
3200 		if (++count == offset)
3201 			break;
3202 	}
3203 
3204 	return sk;
3205 }
3206 
3207 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3208 {
3209 	unsigned long bucket = get_bucket(*pos);
3210 	struct net *net = seq_file_net(seq);
3211 	struct sock *sk;
3212 
3213 	while (bucket < UNIX_HASH_SIZE) {
3214 		spin_lock(&net->unx.table.locks[bucket]);
3215 
3216 		sk = unix_from_bucket(seq, pos);
3217 		if (sk)
3218 			return sk;
3219 
3220 		spin_unlock(&net->unx.table.locks[bucket]);
3221 
3222 		*pos = set_bucket_offset(++bucket, 1);
3223 	}
3224 
3225 	return NULL;
3226 }
3227 
3228 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3229 				  loff_t *pos)
3230 {
3231 	unsigned long bucket = get_bucket(*pos);
3232 
3233 	sk = sk_next(sk);
3234 	if (sk)
3235 		return sk;
3236 
3237 
3238 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3239 
3240 	*pos = set_bucket_offset(++bucket, 1);
3241 
3242 	return unix_get_first(seq, pos);
3243 }
3244 
3245 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3246 {
3247 	if (!*pos)
3248 		return SEQ_START_TOKEN;
3249 
3250 	return unix_get_first(seq, pos);
3251 }
3252 
3253 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3254 {
3255 	++*pos;
3256 
3257 	if (v == SEQ_START_TOKEN)
3258 		return unix_get_first(seq, pos);
3259 
3260 	return unix_get_next(seq, v, pos);
3261 }
3262 
3263 static void unix_seq_stop(struct seq_file *seq, void *v)
3264 {
3265 	struct sock *sk = v;
3266 
3267 	if (sk)
3268 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3269 }
3270 
3271 static int unix_seq_show(struct seq_file *seq, void *v)
3272 {
3273 
3274 	if (v == SEQ_START_TOKEN)
3275 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3276 			 "Inode Path\n");
3277 	else {
3278 		struct sock *s = v;
3279 		struct unix_sock *u = unix_sk(s);
3280 		unix_state_lock(s);
3281 
3282 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3283 			s,
3284 			refcount_read(&s->sk_refcnt),
3285 			0,
3286 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3287 			s->sk_type,
3288 			s->sk_socket ?
3289 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3290 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3291 			sock_i_ino(s));
3292 
3293 		if (u->addr) {	// under a hash table lock here
3294 			int i, len;
3295 			seq_putc(seq, ' ');
3296 
3297 			i = 0;
3298 			len = u->addr->len -
3299 				offsetof(struct sockaddr_un, sun_path);
3300 			if (u->addr->name->sun_path[0]) {
3301 				len--;
3302 			} else {
3303 				seq_putc(seq, '@');
3304 				i++;
3305 			}
3306 			for ( ; i < len; i++)
3307 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3308 					 '@');
3309 		}
3310 		unix_state_unlock(s);
3311 		seq_putc(seq, '\n');
3312 	}
3313 
3314 	return 0;
3315 }
3316 
/* seq_file iterator over the per-net AF_UNIX socket hash table; where it is
 * registered is not visible in this part of the file.
 */
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
3323 
3324 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/* Per-seq_file state of the BPF AF_UNIX socket iterator. Sockets of one
 * hash bucket are collected into @batch with a reference held on each.
 */
struct bpf_unix_iter_state {
	struct seq_net_private p;
	unsigned int cur_sk;	/* index of the next batch entry to put */
	unsigned int end_sk;	/* number of entries stored in batch[] */
	unsigned int max_sk;	/* capacity of batch[] */
	struct sock **batch;	/* held sockets, kvmalloc()ed */
	bool st_bucket_done;	/* when set, batching moves on to the next bucket */
};
3333 
/* Context handed to the BPF program for each socket; filled in by
 * unix_prog_seq_show().
 */
struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	uid_t uid __aligned(8);
};
3339 
3340 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3341 			      struct unix_sock *unix_sk, uid_t uid)
3342 {
3343 	struct bpf_iter__unix ctx;
3344 
3345 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3346 	ctx.meta = meta;
3347 	ctx.unix_sk = unix_sk;
3348 	ctx.uid = uid;
3349 	return bpf_iter_run_prog(prog, &ctx);
3350 }
3351 
3352 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3353 
3354 {
3355 	struct bpf_unix_iter_state *iter = seq->private;
3356 	unsigned int expected = 1;
3357 	struct sock *sk;
3358 
3359 	sock_hold(start_sk);
3360 	iter->batch[iter->end_sk++] = start_sk;
3361 
3362 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3363 		if (iter->end_sk < iter->max_sk) {
3364 			sock_hold(sk);
3365 			iter->batch[iter->end_sk++] = sk;
3366 		}
3367 
3368 		expected++;
3369 	}
3370 
3371 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3372 
3373 	return expected;
3374 }
3375 
3376 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3377 {
3378 	while (iter->cur_sk < iter->end_sk)
3379 		sock_put(iter->batch[iter->cur_sk++]);
3380 }
3381 
3382 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3383 				       unsigned int new_batch_sz)
3384 {
3385 	struct sock **new_batch;
3386 
3387 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3388 			     GFP_USER | __GFP_NOWARN);
3389 	if (!new_batch)
3390 		return -ENOMEM;
3391 
3392 	bpf_iter_unix_put_batch(iter);
3393 	kvfree(iter->batch);
3394 	iter->batch = new_batch;
3395 	iter->max_sk = new_batch_sz;
3396 
3397 	return 0;
3398 }
3399 
3400 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3401 					loff_t *pos)
3402 {
3403 	struct bpf_unix_iter_state *iter = seq->private;
3404 	unsigned int expected;
3405 	bool resized = false;
3406 	struct sock *sk;
3407 
3408 	if (iter->st_bucket_done)
3409 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3410 
3411 again:
3412 	/* Get a new batch */
3413 	iter->cur_sk = 0;
3414 	iter->end_sk = 0;
3415 
3416 	sk = unix_get_first(seq, pos);
3417 	if (!sk)
3418 		return NULL; /* Done */
3419 
3420 	expected = bpf_iter_unix_hold_batch(seq, sk);
3421 
3422 	if (iter->end_sk == expected) {
3423 		iter->st_bucket_done = true;
3424 		return sk;
3425 	}
3426 
3427 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3428 		resized = true;
3429 		goto again;
3430 	}
3431 
3432 	return sk;
3433 }
3434 
3435 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3436 {
3437 	if (!*pos)
3438 		return SEQ_START_TOKEN;
3439 
3440 	/* bpf iter does not support lseek, so it always
3441 	 * continue from where it was stop()-ped.
3442 	 */
3443 	return bpf_iter_unix_batch(seq, pos);
3444 }
3445 
3446 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3447 {
3448 	struct bpf_unix_iter_state *iter = seq->private;
3449 	struct sock *sk;
3450 
3451 	/* Whenever seq_next() is called, the iter->cur_sk is
3452 	 * done with seq_show(), so advance to the next sk in
3453 	 * the batch.
3454 	 */
3455 	if (iter->cur_sk < iter->end_sk)
3456 		sock_put(iter->batch[iter->cur_sk++]);
3457 
3458 	++*pos;
3459 
3460 	if (iter->cur_sk < iter->end_sk)
3461 		sk = iter->batch[iter->cur_sk];
3462 	else
3463 		sk = bpf_iter_unix_batch(seq, pos);
3464 
3465 	return sk;
3466 }
3467 
3468 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3469 {
3470 	struct bpf_iter_meta meta;
3471 	struct bpf_prog *prog;
3472 	struct sock *sk = v;
3473 	uid_t uid;
3474 	bool slow;
3475 	int ret;
3476 
3477 	if (v == SEQ_START_TOKEN)
3478 		return 0;
3479 
3480 	slow = lock_sock_fast(sk);
3481 
3482 	if (unlikely(sk_unhashed(sk))) {
3483 		ret = SEQ_SKIP;
3484 		goto unlock;
3485 	}
3486 
3487 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3488 	meta.seq = seq;
3489 	prog = bpf_iter_get_info(&meta, false);
3490 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3491 unlock:
3492 	unlock_sock_fast(sk, slow);
3493 	return ret;
3494 }
3495 
3496 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3497 {
3498 	struct bpf_unix_iter_state *iter = seq->private;
3499 	struct bpf_iter_meta meta;
3500 	struct bpf_prog *prog;
3501 
3502 	if (!v) {
3503 		meta.seq = seq;
3504 		prog = bpf_iter_get_info(&meta, true);
3505 		if (prog)
3506 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3507 	}
3508 
3509 	if (iter->cur_sk < iter->end_sk)
3510 		bpf_iter_unix_put_batch(iter);
3511 }
3512 
3513 static const struct seq_operations bpf_iter_unix_seq_ops = {
3514 	.start	= bpf_iter_unix_seq_start,
3515 	.next	= bpf_iter_unix_seq_next,
3516 	.stop	= bpf_iter_unix_seq_stop,
3517 	.show	= bpf_iter_unix_seq_show,
3518 };
3519 #endif
3520 #endif
3521 
3522 static const struct net_proto_family unix_family_ops = {
3523 	.family = PF_UNIX,
3524 	.create = unix_create,
3525 	.owner	= THIS_MODULE,
3526 };
3527 
3528 
3529 static int __net_init unix_net_init(struct net *net)
3530 {
3531 	int i;
3532 
3533 	net->unx.sysctl_max_dgram_qlen = 10;
3534 	if (unix_sysctl_register(net))
3535 		goto out;
3536 
3537 #ifdef CONFIG_PROC_FS
3538 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3539 			     sizeof(struct seq_net_private)))
3540 		goto err_sysctl;
3541 #endif
3542 
3543 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3544 					      sizeof(spinlock_t), GFP_KERNEL);
3545 	if (!net->unx.table.locks)
3546 		goto err_proc;
3547 
3548 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3549 						sizeof(struct hlist_head),
3550 						GFP_KERNEL);
3551 	if (!net->unx.table.buckets)
3552 		goto free_locks;
3553 
3554 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3555 		spin_lock_init(&net->unx.table.locks[i]);
3556 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3557 	}
3558 
3559 	return 0;
3560 
3561 free_locks:
3562 	kvfree(net->unx.table.locks);
3563 err_proc:
3564 #ifdef CONFIG_PROC_FS
3565 	remove_proc_entry("unix", net->proc_net);
3566 err_sysctl:
3567 #endif
3568 	unix_sysctl_unregister(net);
3569 out:
3570 	return -ENOMEM;
3571 }
3572 
/* Per-network-namespace teardown: frees the hash-table buckets and locks
 * allocated by unix_net_init(), then unregisters the sysctl table and
 * removes the "unix" proc entry.
 */
static void __net_exit unix_net_exit(struct net *net)
{
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}
3580 
3581 static struct pernet_operations unix_net_ops = {
3582 	.init = unix_net_init,
3583 	.exit = unix_net_exit,
3584 };
3585 
3586 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* "unix" iterator target; the argument list mirrors the members of
 * struct bpf_iter__unix (meta, unix_sk, uid).
 */
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

/* Batch capacity, in sockets, requested by bpf_iter_init_unix(). */
#define INIT_BATCH_SZ 16
3591 
3592 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3593 {
3594 	struct bpf_unix_iter_state *iter = priv_data;
3595 	int err;
3596 
3597 	err = bpf_iter_init_seq_net(priv_data, aux);
3598 	if (err)
3599 		return err;
3600 
3601 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3602 	if (err) {
3603 		bpf_iter_fini_seq_net(priv_data);
3604 		return err;
3605 	}
3606 
3607 	return 0;
3608 }
3609 
3610 static void bpf_iter_fini_unix(void *priv_data)
3611 {
3612 	struct bpf_unix_iter_state *iter = priv_data;
3613 
3614 	bpf_iter_fini_seq_net(priv_data);
3615 	kvfree(iter->batch);
3616 }
3617 
3618 static const struct bpf_iter_seq_info unix_seq_info = {
3619 	.seq_ops		= &bpf_iter_unix_seq_ops,
3620 	.init_seq_private	= bpf_iter_init_unix,
3621 	.fini_seq_private	= bpf_iter_fini_unix,
3622 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3623 };
3624 
3625 static const struct bpf_func_proto *
3626 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3627 			     const struct bpf_prog *prog)
3628 {
3629 	switch (func_id) {
3630 	case BPF_FUNC_setsockopt:
3631 		return &bpf_sk_setsockopt_proto;
3632 	case BPF_FUNC_getsockopt:
3633 		return &bpf_sk_getsockopt_proto;
3634 	default:
3635 		return NULL;
3636 	}
3637 }
3638 
3639 static struct bpf_iter_reg unix_reg_info = {
3640 	.target			= "unix",
3641 	.ctx_arg_info_size	= 1,
3642 	.ctx_arg_info		= {
3643 		{ offsetof(struct bpf_iter__unix, unix_sk),
3644 		  PTR_TO_BTF_ID_OR_NULL },
3645 	},
3646 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3647 	.seq_info		= &unix_seq_info,
3648 };
3649 
3650 static void __init bpf_iter_register(void)
3651 {
3652 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3653 	if (bpf_iter_reg_target(&unix_reg_info))
3654 		pr_warn("Warning: could not register bpf iterator unix\n");
3655 }
3656 #endif
3657 
3658 static int __init af_unix_init(void)
3659 {
3660 	int i, rc = -1;
3661 
3662 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3663 
3664 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3665 		spin_lock_init(&bsd_socket_locks[i]);
3666 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3667 	}
3668 
3669 	rc = proto_register(&unix_dgram_proto, 1);
3670 	if (rc != 0) {
3671 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3672 		goto out;
3673 	}
3674 
3675 	rc = proto_register(&unix_stream_proto, 1);
3676 	if (rc != 0) {
3677 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3678 		proto_unregister(&unix_dgram_proto);
3679 		goto out;
3680 	}
3681 
3682 	sock_register(&unix_family_ops);
3683 	register_pernet_subsys(&unix_net_ops);
3684 	unix_bpf_build_proto();
3685 
3686 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3687 	bpf_iter_register();
3688 #endif
3689 
3690 out:
3691 	return rc;
3692 }
3693 
/* Module exit: unregisters the PF_UNIX socket family, both protos and the
 * per-netns operations registered by af_unix_init().
 */
static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_dgram_proto);
	proto_unregister(&unix_stream_proto);
	unregister_pernet_subsys(&unix_net_ops);
}
3701 
/* Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket. But later than subsys_initcall() because
 * we depend on stuff initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);
3711