xref: /openbmc/linux/net/unix/af_unix.c (revision 7df45f35313c1ae083dac72c066b3aebfc7fc0cd)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetosv	:	Repaired (I hope) bugs introduces
29  *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					is been reached. This won't break
33  *					old apps and it will avoid huge amount
34  *					of socks hashed (this for unix_gc()
35  *					performances reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 
120 static atomic_long_t unix_nr_socks;
121 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
122 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
123 
124 /* SMP locking strategy:
125  *    hash table is protected with spinlock.
126  *    each socket state is protected by separate spinlock.
127  */
128 #ifdef CONFIG_PROVE_LOCKING
129 #define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))
130 
unix_table_lock_cmp_fn(const struct lockdep_map * a,const struct lockdep_map * b)131 static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
132 				  const struct lockdep_map *b)
133 {
134 	return cmp_ptr(a, b);
135 }
136 
unix_state_lock_cmp_fn(const struct lockdep_map * _a,const struct lockdep_map * _b)137 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
138 				  const struct lockdep_map *_b)
139 {
140 	const struct unix_sock *a, *b;
141 
142 	a = container_of(_a, struct unix_sock, lock.dep_map);
143 	b = container_of(_b, struct unix_sock, lock.dep_map);
144 
145 	/* unix_state_double_lock(): ascending address order. */
146 	return cmp_ptr(a, b);
147 }
148 
unix_recvq_lock_cmp_fn(const struct lockdep_map * _a,const struct lockdep_map * _b)149 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
150 				  const struct lockdep_map *_b)
151 {
152 	const struct sock *a, *b;
153 
154 	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
155 	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
156 
157 	/* unix_collect_skb(): listener -> embryo order. */
158 	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
159 		return -1;
160 
161 	/* Should never happen.  Just to be symmetric. */
162 	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
163 		return 1;
164 
165 	return 0;
166 }
167 #endif
168 
unix_unbound_hash(struct sock * sk)169 static unsigned int unix_unbound_hash(struct sock *sk)
170 {
171 	unsigned long hash = (unsigned long)sk;
172 
173 	hash ^= hash >> 16;
174 	hash ^= hash >> 8;
175 	hash ^= sk->sk_type;
176 
177 	return hash & UNIX_HASH_MOD;
178 }
179 
unix_bsd_hash(struct inode * i)180 static unsigned int unix_bsd_hash(struct inode *i)
181 {
182 	return i->i_ino & UNIX_HASH_MOD;
183 }
184 
unix_abstract_hash(struct sockaddr_un * sunaddr,int addr_len,int type)185 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
186 				       int addr_len, int type)
187 {
188 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
189 	unsigned int hash;
190 
191 	hash = (__force unsigned int)csum_fold(csum);
192 	hash ^= hash >> 8;
193 	hash ^= type;
194 
195 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
196 }
197 
unix_table_double_lock(struct net * net,unsigned int hash1,unsigned int hash2)198 static void unix_table_double_lock(struct net *net,
199 				   unsigned int hash1, unsigned int hash2)
200 {
201 	if (hash1 == hash2) {
202 		spin_lock(&net->unx.table.locks[hash1]);
203 		return;
204 	}
205 
206 	if (hash1 > hash2)
207 		swap(hash1, hash2);
208 
209 	spin_lock(&net->unx.table.locks[hash1]);
210 	spin_lock(&net->unx.table.locks[hash2]);
211 }
212 
unix_table_double_unlock(struct net * net,unsigned int hash1,unsigned int hash2)213 static void unix_table_double_unlock(struct net *net,
214 				     unsigned int hash1, unsigned int hash2)
215 {
216 	if (hash1 == hash2) {
217 		spin_unlock(&net->unx.table.locks[hash1]);
218 		return;
219 	}
220 
221 	spin_unlock(&net->unx.table.locks[hash1]);
222 	spin_unlock(&net->unx.table.locks[hash2]);
223 }
224 
225 #ifdef CONFIG_SECURITY_NETWORK
unix_get_secdata(struct scm_cookie * scm,struct sk_buff * skb)226 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
227 {
228 	UNIXCB(skb).secid = scm->secid;
229 }
230 
unix_set_secdata(struct scm_cookie * scm,struct sk_buff * skb)231 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
232 {
233 	scm->secid = UNIXCB(skb).secid;
234 }
235 
unix_secdata_eq(struct scm_cookie * scm,struct sk_buff * skb)236 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
237 {
238 	return (scm->secid == UNIXCB(skb).secid);
239 }
240 #else
unix_get_secdata(struct scm_cookie * scm,struct sk_buff * skb)241 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
242 { }
243 
unix_set_secdata(struct scm_cookie * scm,struct sk_buff * skb)244 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
245 { }
246 
unix_secdata_eq(struct scm_cookie * scm,struct sk_buff * skb)247 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
248 {
249 	return true;
250 }
251 #endif /* CONFIG_SECURITY_NETWORK */
252 
unix_our_peer(struct sock * sk,struct sock * osk)253 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
254 {
255 	return unix_peer(osk) == sk;
256 }
257 
unix_may_send(struct sock * sk,struct sock * osk)258 static inline int unix_may_send(struct sock *sk, struct sock *osk)
259 {
260 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
261 }
262 
unix_recvq_full_lockless(const struct sock * sk)263 static inline int unix_recvq_full_lockless(const struct sock *sk)
264 {
265 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
266 }
267 
unix_peer_get(struct sock * s)268 struct sock *unix_peer_get(struct sock *s)
269 {
270 	struct sock *peer;
271 
272 	unix_state_lock(s);
273 	peer = unix_peer(s);
274 	if (peer)
275 		sock_hold(peer);
276 	unix_state_unlock(s);
277 	return peer;
278 }
279 EXPORT_SYMBOL_GPL(unix_peer_get);
280 
unix_create_addr(struct sockaddr_un * sunaddr,int addr_len)281 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
282 					     int addr_len)
283 {
284 	struct unix_address *addr;
285 
286 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
287 	if (!addr)
288 		return NULL;
289 
290 	refcount_set(&addr->refcnt, 1);
291 	addr->len = addr_len;
292 	memcpy(addr->name, sunaddr, addr_len);
293 
294 	return addr;
295 }
296 
unix_release_addr(struct unix_address * addr)297 static inline void unix_release_addr(struct unix_address *addr)
298 {
299 	if (refcount_dec_and_test(&addr->refcnt))
300 		kfree(addr);
301 }
302 
303 /*
304  *	Check unix socket name:
305  *		- should be not zero length.
306  *	        - if started by not zero, should be NULL terminated (FS object)
307  *		- if started by zero, it is abstract name.
308  */
309 
unix_validate_addr(struct sockaddr_un * sunaddr,int addr_len)310 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
311 {
312 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
313 	    addr_len > sizeof(*sunaddr))
314 		return -EINVAL;
315 
316 	if (sunaddr->sun_family != AF_UNIX)
317 		return -EINVAL;
318 
319 	return 0;
320 }
321 
unix_mkname_bsd(struct sockaddr_un * sunaddr,int addr_len)322 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
323 {
324 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
325 	short offset = offsetof(struct sockaddr_storage, __data);
326 
327 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
328 
329 	/* This may look like an off by one error but it is a bit more
330 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
331 	 * sun_path[108] doesn't as such exist.  However in kernel space
332 	 * we are guaranteed that it is a valid memory location in our
333 	 * kernel address buffer because syscall functions always pass
334 	 * a pointer of struct sockaddr_storage which has a bigger buffer
335 	 * than 108.  Also, we must terminate sun_path for strlen() in
336 	 * getname_kernel().
337 	 */
338 	addr->__data[addr_len - offset] = 0;
339 
340 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
341 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
342 	 * know the actual buffer.
343 	 */
344 	return strlen(addr->__data) + offset + 1;
345 }
346 
__unix_remove_socket(struct sock * sk)347 static void __unix_remove_socket(struct sock *sk)
348 {
349 	sk_del_node_init(sk);
350 }
351 
__unix_insert_socket(struct net * net,struct sock * sk)352 static void __unix_insert_socket(struct net *net, struct sock *sk)
353 {
354 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
355 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
356 }
357 
__unix_set_addr_hash(struct net * net,struct sock * sk,struct unix_address * addr,unsigned int hash)358 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
359 				 struct unix_address *addr, unsigned int hash)
360 {
361 	__unix_remove_socket(sk);
362 	smp_store_release(&unix_sk(sk)->addr, addr);
363 
364 	sk->sk_hash = hash;
365 	__unix_insert_socket(net, sk);
366 }
367 
unix_remove_socket(struct net * net,struct sock * sk)368 static void unix_remove_socket(struct net *net, struct sock *sk)
369 {
370 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
371 	__unix_remove_socket(sk);
372 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
373 }
374 
unix_insert_unbound_socket(struct net * net,struct sock * sk)375 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
376 {
377 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
378 	__unix_insert_socket(net, sk);
379 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
380 }
381 
unix_insert_bsd_socket(struct sock * sk)382 static void unix_insert_bsd_socket(struct sock *sk)
383 {
384 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
385 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
386 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
387 }
388 
unix_remove_bsd_socket(struct sock * sk)389 static void unix_remove_bsd_socket(struct sock *sk)
390 {
391 	if (!hlist_unhashed(&sk->sk_bind_node)) {
392 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
393 		__sk_del_bind_node(sk);
394 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
395 
396 		sk_node_init(&sk->sk_bind_node);
397 	}
398 }
399 
__unix_find_socket_byname(struct net * net,struct sockaddr_un * sunname,int len,unsigned int hash)400 static struct sock *__unix_find_socket_byname(struct net *net,
401 					      struct sockaddr_un *sunname,
402 					      int len, unsigned int hash)
403 {
404 	struct sock *s;
405 
406 	sk_for_each(s, &net->unx.table.buckets[hash]) {
407 		struct unix_sock *u = unix_sk(s);
408 
409 		if (u->addr->len == len &&
410 		    !memcmp(u->addr->name, sunname, len))
411 			return s;
412 	}
413 	return NULL;
414 }
415 
unix_find_socket_byname(struct net * net,struct sockaddr_un * sunname,int len,unsigned int hash)416 static inline struct sock *unix_find_socket_byname(struct net *net,
417 						   struct sockaddr_un *sunname,
418 						   int len, unsigned int hash)
419 {
420 	struct sock *s;
421 
422 	spin_lock(&net->unx.table.locks[hash]);
423 	s = __unix_find_socket_byname(net, sunname, len, hash);
424 	if (s)
425 		sock_hold(s);
426 	spin_unlock(&net->unx.table.locks[hash]);
427 	return s;
428 }
429 
unix_find_socket_byinode(struct inode * i)430 static struct sock *unix_find_socket_byinode(struct inode *i)
431 {
432 	unsigned int hash = unix_bsd_hash(i);
433 	struct sock *s;
434 
435 	spin_lock(&bsd_socket_locks[hash]);
436 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
437 		struct dentry *dentry = unix_sk(s)->path.dentry;
438 
439 		if (dentry && d_backing_inode(dentry) == i) {
440 			sock_hold(s);
441 			spin_unlock(&bsd_socket_locks[hash]);
442 			return s;
443 		}
444 	}
445 	spin_unlock(&bsd_socket_locks[hash]);
446 	return NULL;
447 }
448 
449 /* Support code for asymmetrically connected dgram sockets
450  *
451  * If a datagram socket is connected to a socket not itself connected
452  * to the first socket (eg, /dev/log), clients may only enqueue more
453  * messages if the present receive queue of the server socket is not
454  * "too large". This means there's a second writeability condition
455  * poll and sendmsg need to test. The dgram recv code will do a wake
456  * up on the peer_wait wait queue of a socket upon reception of a
457  * datagram which needs to be propagated to sleeping would-be writers
458  * since these might not have sent anything so far. This can't be
459  * accomplished via poll_wait because the lifetime of the server
460  * socket might be less than that of its clients if these break their
461  * association with it or if the server socket is closed while clients
462  * are still connected to it and there's no way to inform "a polling
463  * implementation" that it should let go of a certain wait queue
464  *
465  * In order to propagate a wake up, a wait_queue_entry_t of the client
466  * socket is enqueued on the peer_wait queue of the server socket
467  * whose wake function does a wake_up on the ordinary client socket
468  * wait queue. This connection is established whenever a write (or
469  * poll for write) hit the flow control condition and broken when the
470  * association to the server socket is dissolved or after a wake up
471  * was relayed.
472  */
473 
unix_dgram_peer_wake_relay(wait_queue_entry_t * q,unsigned mode,int flags,void * key)474 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
475 				      void *key)
476 {
477 	struct unix_sock *u;
478 	wait_queue_head_t *u_sleep;
479 
480 	u = container_of(q, struct unix_sock, peer_wake);
481 
482 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
483 			    q);
484 	u->peer_wake.private = NULL;
485 
486 	/* relaying can only happen while the wq still exists */
487 	u_sleep = sk_sleep(&u->sk);
488 	if (u_sleep)
489 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
490 
491 	return 0;
492 }
493 
unix_dgram_peer_wake_connect(struct sock * sk,struct sock * other)494 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
495 {
496 	struct unix_sock *u, *u_other;
497 	int rc;
498 
499 	u = unix_sk(sk);
500 	u_other = unix_sk(other);
501 	rc = 0;
502 	spin_lock(&u_other->peer_wait.lock);
503 
504 	if (!u->peer_wake.private) {
505 		u->peer_wake.private = other;
506 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
507 
508 		rc = 1;
509 	}
510 
511 	spin_unlock(&u_other->peer_wait.lock);
512 	return rc;
513 }
514 
unix_dgram_peer_wake_disconnect(struct sock * sk,struct sock * other)515 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
516 					    struct sock *other)
517 {
518 	struct unix_sock *u, *u_other;
519 
520 	u = unix_sk(sk);
521 	u_other = unix_sk(other);
522 	spin_lock(&u_other->peer_wait.lock);
523 
524 	if (u->peer_wake.private == other) {
525 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
526 		u->peer_wake.private = NULL;
527 	}
528 
529 	spin_unlock(&u_other->peer_wait.lock);
530 }
531 
unix_dgram_peer_wake_disconnect_wakeup(struct sock * sk,struct sock * other)532 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
533 						   struct sock *other)
534 {
535 	unix_dgram_peer_wake_disconnect(sk, other);
536 	wake_up_interruptible_poll(sk_sleep(sk),
537 				   EPOLLOUT |
538 				   EPOLLWRNORM |
539 				   EPOLLWRBAND);
540 }
541 
542 /* preconditions:
543  *	- unix_peer(sk) == other
544  *	- association is stable
545  */
unix_dgram_peer_wake_me(struct sock * sk,struct sock * other)546 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
547 {
548 	int connected;
549 
550 	connected = unix_dgram_peer_wake_connect(sk, other);
551 
552 	/* If other is SOCK_DEAD, we want to make sure we signal
553 	 * POLLOUT, such that a subsequent write() can get a
554 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
555 	 * to other and its full, we will hang waiting for POLLOUT.
556 	 */
557 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
558 		return 1;
559 
560 	if (connected)
561 		unix_dgram_peer_wake_disconnect(sk, other);
562 
563 	return 0;
564 }
565 
unix_writable(const struct sock * sk,unsigned char state)566 static int unix_writable(const struct sock *sk, unsigned char state)
567 {
568 	return state != TCP_LISTEN &&
569 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
570 }
571 
unix_write_space(struct sock * sk)572 static void unix_write_space(struct sock *sk)
573 {
574 	struct socket_wq *wq;
575 
576 	rcu_read_lock();
577 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
578 		wq = rcu_dereference(sk->sk_wq);
579 		if (skwq_has_sleeper(wq))
580 			wake_up_interruptible_sync_poll(&wq->wait,
581 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
582 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
583 	}
584 	rcu_read_unlock();
585 }
586 
587 /* When dgram socket disconnects (or changes its peer), we clear its receive
588  * queue of packets arrived from previous peer. First, it allows to do
589  * flow control based only on wmem_alloc; second, sk connected to peer
590  * may receive messages only from that peer. */
unix_dgram_disconnected(struct sock * sk,struct sock * other)591 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
592 {
593 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
594 		skb_queue_purge(&sk->sk_receive_queue);
595 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
596 
597 		/* If one link of bidirectional dgram pipe is disconnected,
598 		 * we signal error. Messages are lost. Do not make this,
599 		 * when peer was not connected to us.
600 		 */
601 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
602 			WRITE_ONCE(other->sk_err, ECONNRESET);
603 			sk_error_report(other);
604 		}
605 	}
606 }
607 
unix_sock_destructor(struct sock * sk)608 static void unix_sock_destructor(struct sock *sk)
609 {
610 	struct unix_sock *u = unix_sk(sk);
611 
612 	skb_queue_purge(&sk->sk_receive_queue);
613 
614 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
615 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
616 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
617 	if (!sock_flag(sk, SOCK_DEAD)) {
618 		pr_info("Attempt to release alive unix socket: %p\n", sk);
619 		return;
620 	}
621 
622 	if (u->addr)
623 		unix_release_addr(u->addr);
624 
625 	atomic_long_dec(&unix_nr_socks);
626 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
627 #ifdef UNIX_REFCNT_DEBUG
628 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
629 		atomic_long_read(&unix_nr_socks));
630 #endif
631 }
632 
unix_skb_len(const struct sk_buff * skb)633 static unsigned int unix_skb_len(const struct sk_buff *skb)
634 {
635 	return skb->len - UNIXCB(skb).consumed;
636 }
637 
unix_release_sock(struct sock * sk,int embrion)638 static void unix_release_sock(struct sock *sk, int embrion)
639 {
640 	struct unix_sock *u = unix_sk(sk);
641 	struct sock *skpair;
642 	struct sk_buff *skb;
643 	struct path path;
644 	int state;
645 
646 	unix_remove_socket(sock_net(sk), sk);
647 	unix_remove_bsd_socket(sk);
648 
649 	/* Clear state */
650 	unix_state_lock(sk);
651 	sock_orphan(sk);
652 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
653 	path	     = u->path;
654 	u->path.dentry = NULL;
655 	u->path.mnt = NULL;
656 	state = sk->sk_state;
657 	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
658 
659 	skpair = unix_peer(sk);
660 	unix_peer(sk) = NULL;
661 
662 	unix_state_unlock(sk);
663 
664 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
665 	u->oob_skb = NULL;
666 #endif
667 
668 	wake_up_interruptible_all(&u->peer_wait);
669 
670 	if (skpair != NULL) {
671 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
672 			struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
673 
674 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
675 			if (skb && !unix_skb_len(skb))
676 				skb = skb_peek_next(skb, &sk->sk_receive_queue);
677 #endif
678 			unix_state_lock(skpair);
679 			/* No more writes */
680 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
681 			if (skb || embrion)
682 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
683 			unix_state_unlock(skpair);
684 			skpair->sk_state_change(skpair);
685 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
686 		}
687 
688 		unix_dgram_peer_wake_disconnect(sk, skpair);
689 		sock_put(skpair); /* It may now die */
690 	}
691 
692 	/* Try to flush out this socket. Throw out buffers at least */
693 
694 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
695 		if (state == TCP_LISTEN)
696 			unix_release_sock(skb->sk, 1);
697 		/* passed fds are erased in the kfree_skb hook	      */
698 		UNIXCB(skb).consumed = skb->len;
699 		kfree_skb(skb);
700 	}
701 
702 	if (path.dentry)
703 		path_put(&path);
704 
705 	sock_put(sk);
706 
707 	/* ---- Socket is dead now and most probably destroyed ---- */
708 
709 	/*
710 	 * Fixme: BSD difference: In BSD all sockets connected to us get
711 	 *	  ECONNRESET and we die on the spot. In Linux we behave
712 	 *	  like files and pipes do and wait for the last
713 	 *	  dereference.
714 	 *
715 	 * Can't we simply set sock->err?
716 	 *
717 	 *	  What the above comment does talk about? --ANK(980817)
718 	 */
719 
720 	if (READ_ONCE(unix_tot_inflight))
721 		unix_gc();		/* Garbage collect fds */
722 }
723 
init_peercred(struct sock * sk)724 static void init_peercred(struct sock *sk)
725 {
726 	const struct cred *old_cred;
727 	struct pid *old_pid;
728 
729 	spin_lock(&sk->sk_peer_lock);
730 	old_pid = sk->sk_peer_pid;
731 	old_cred = sk->sk_peer_cred;
732 	sk->sk_peer_pid  = get_pid(task_tgid(current));
733 	sk->sk_peer_cred = get_current_cred();
734 	spin_unlock(&sk->sk_peer_lock);
735 
736 	put_pid(old_pid);
737 	put_cred(old_cred);
738 }
739 
copy_peercred(struct sock * sk,struct sock * peersk)740 static void copy_peercred(struct sock *sk, struct sock *peersk)
741 {
742 	if (sk < peersk) {
743 		spin_lock(&sk->sk_peer_lock);
744 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
745 	} else {
746 		spin_lock(&peersk->sk_peer_lock);
747 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
748 	}
749 
750 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
751 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
752 
753 	spin_unlock(&sk->sk_peer_lock);
754 	spin_unlock(&peersk->sk_peer_lock);
755 }
756 
unix_listen(struct socket * sock,int backlog)757 static int unix_listen(struct socket *sock, int backlog)
758 {
759 	int err;
760 	struct sock *sk = sock->sk;
761 	struct unix_sock *u = unix_sk(sk);
762 
763 	err = -EOPNOTSUPP;
764 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
765 		goto out;	/* Only stream/seqpacket sockets accept */
766 	err = -EINVAL;
767 	if (!READ_ONCE(u->addr))
768 		goto out;	/* No listens on an unbound socket */
769 	unix_state_lock(sk);
770 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
771 		goto out_unlock;
772 	if (backlog > sk->sk_max_ack_backlog)
773 		wake_up_interruptible_all(&u->peer_wait);
774 	sk->sk_max_ack_backlog	= backlog;
775 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
776 
777 	/* set credentials so connect can copy them */
778 	init_peercred(sk);
779 	err = 0;
780 
781 out_unlock:
782 	unix_state_unlock(sk);
783 out:
784 	return err;
785 }
786 
787 static int unix_release(struct socket *);
788 static int unix_bind(struct socket *, struct sockaddr *, int);
789 static int unix_stream_connect(struct socket *, struct sockaddr *,
790 			       int addr_len, int flags);
791 static int unix_socketpair(struct socket *, struct socket *);
792 static int unix_accept(struct socket *, struct socket *, int, bool);
793 static int unix_getname(struct socket *, struct sockaddr *, int);
794 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
795 static __poll_t unix_dgram_poll(struct file *, struct socket *,
796 				    poll_table *);
797 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
798 #ifdef CONFIG_COMPAT
799 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
800 #endif
801 static int unix_shutdown(struct socket *, int);
802 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
803 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
804 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
805 				       struct pipe_inode_info *, size_t size,
806 				       unsigned int flags);
807 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
808 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
809 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
810 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
811 static int unix_dgram_connect(struct socket *, struct sockaddr *,
812 			      int, int);
813 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
814 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
815 				  int);
816 
unix_set_peek_off(struct sock * sk,int val)817 static int unix_set_peek_off(struct sock *sk, int val)
818 {
819 	struct unix_sock *u = unix_sk(sk);
820 
821 	if (mutex_lock_interruptible(&u->iolock))
822 		return -EINTR;
823 
824 	WRITE_ONCE(sk->sk_peek_off, val);
825 	mutex_unlock(&u->iolock);
826 
827 	return 0;
828 }
829 
830 #ifdef CONFIG_PROC_FS
unix_count_nr_fds(struct sock * sk)831 static int unix_count_nr_fds(struct sock *sk)
832 {
833 	struct sk_buff *skb;
834 	struct unix_sock *u;
835 	int nr_fds = 0;
836 
837 	spin_lock(&sk->sk_receive_queue.lock);
838 	skb = skb_peek(&sk->sk_receive_queue);
839 	while (skb) {
840 		u = unix_sk(skb->sk);
841 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
842 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
843 	}
844 	spin_unlock(&sk->sk_receive_queue.lock);
845 
846 	return nr_fds;
847 }
848 
unix_show_fdinfo(struct seq_file * m,struct socket * sock)849 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
850 {
851 	struct sock *sk = sock->sk;
852 	unsigned char s_state;
853 	struct unix_sock *u;
854 	int nr_fds = 0;
855 
856 	if (sk) {
857 		s_state = READ_ONCE(sk->sk_state);
858 		u = unix_sk(sk);
859 
860 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
861 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
862 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
863 		 */
864 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
865 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
866 		else if (s_state == TCP_LISTEN)
867 			nr_fds = unix_count_nr_fds(sk);
868 
869 		seq_printf(m, "scm_fds: %u\n", nr_fds);
870 	}
871 }
872 #else
873 #define unix_show_fdinfo NULL
874 #endif
875 
876 static const struct proto_ops unix_stream_ops = {
877 	.family =	PF_UNIX,
878 	.owner =	THIS_MODULE,
879 	.release =	unix_release,
880 	.bind =		unix_bind,
881 	.connect =	unix_stream_connect,
882 	.socketpair =	unix_socketpair,
883 	.accept =	unix_accept,
884 	.getname =	unix_getname,
885 	.poll =		unix_poll,
886 	.ioctl =	unix_ioctl,
887 #ifdef CONFIG_COMPAT
888 	.compat_ioctl =	unix_compat_ioctl,
889 #endif
890 	.listen =	unix_listen,
891 	.shutdown =	unix_shutdown,
892 	.sendmsg =	unix_stream_sendmsg,
893 	.recvmsg =	unix_stream_recvmsg,
894 	.read_skb =	unix_stream_read_skb,
895 	.mmap =		sock_no_mmap,
896 	.splice_read =	unix_stream_splice_read,
897 	.set_peek_off =	unix_set_peek_off,
898 	.show_fdinfo =	unix_show_fdinfo,
899 };
900 
901 static const struct proto_ops unix_dgram_ops = {
902 	.family =	PF_UNIX,
903 	.owner =	THIS_MODULE,
904 	.release =	unix_release,
905 	.bind =		unix_bind,
906 	.connect =	unix_dgram_connect,
907 	.socketpair =	unix_socketpair,
908 	.accept =	sock_no_accept,
909 	.getname =	unix_getname,
910 	.poll =		unix_dgram_poll,
911 	.ioctl =	unix_ioctl,
912 #ifdef CONFIG_COMPAT
913 	.compat_ioctl =	unix_compat_ioctl,
914 #endif
915 	.listen =	sock_no_listen,
916 	.shutdown =	unix_shutdown,
917 	.sendmsg =	unix_dgram_sendmsg,
918 	.read_skb =	unix_read_skb,
919 	.recvmsg =	unix_dgram_recvmsg,
920 	.mmap =		sock_no_mmap,
921 	.set_peek_off =	unix_set_peek_off,
922 	.show_fdinfo =	unix_show_fdinfo,
923 };
924 
925 static const struct proto_ops unix_seqpacket_ops = {
926 	.family =	PF_UNIX,
927 	.owner =	THIS_MODULE,
928 	.release =	unix_release,
929 	.bind =		unix_bind,
930 	.connect =	unix_stream_connect,
931 	.socketpair =	unix_socketpair,
932 	.accept =	unix_accept,
933 	.getname =	unix_getname,
934 	.poll =		unix_dgram_poll,
935 	.ioctl =	unix_ioctl,
936 #ifdef CONFIG_COMPAT
937 	.compat_ioctl =	unix_compat_ioctl,
938 #endif
939 	.listen =	unix_listen,
940 	.shutdown =	unix_shutdown,
941 	.sendmsg =	unix_seqpacket_sendmsg,
942 	.recvmsg =	unix_seqpacket_recvmsg,
943 	.mmap =		sock_no_mmap,
944 	.set_peek_off =	unix_set_peek_off,
945 	.show_fdinfo =	unix_show_fdinfo,
946 };
947 
unix_close(struct sock * sk,long timeout)948 static void unix_close(struct sock *sk, long timeout)
949 {
950 	/* Nothing to do here, unix socket does not need a ->close().
951 	 * This is merely for sockmap.
952 	 */
953 }
954 
unix_unhash(struct sock * sk)955 static void unix_unhash(struct sock *sk)
956 {
957 	/* Nothing to do here, unix socket does not need a ->unhash().
958 	 * This is merely for sockmap.
959 	 */
960 }
961 
unix_bpf_bypass_getsockopt(int level,int optname)962 static bool unix_bpf_bypass_getsockopt(int level, int optname)
963 {
964 	if (level == SOL_SOCKET) {
965 		switch (optname) {
966 		case SO_PEERPIDFD:
967 			return true;
968 		default:
969 			return false;
970 		}
971 	}
972 
973 	return false;
974 }
975 
976 struct proto unix_dgram_proto = {
977 	.name			= "UNIX",
978 	.owner			= THIS_MODULE,
979 	.obj_size		= sizeof(struct unix_sock),
980 	.close			= unix_close,
981 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
982 #ifdef CONFIG_BPF_SYSCALL
983 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
984 #endif
985 };
986 
987 struct proto unix_stream_proto = {
988 	.name			= "UNIX-STREAM",
989 	.owner			= THIS_MODULE,
990 	.obj_size		= sizeof(struct unix_sock),
991 	.close			= unix_close,
992 	.unhash			= unix_unhash,
993 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
994 #ifdef CONFIG_BPF_SYSCALL
995 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
996 #endif
997 };
998 
unix_create1(struct net * net,struct socket * sock,int kern,int type)999 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
1000 {
1001 	struct unix_sock *u;
1002 	struct sock *sk;
1003 	int err;
1004 
1005 	atomic_long_inc(&unix_nr_socks);
1006 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1007 		err = -ENFILE;
1008 		goto err;
1009 	}
1010 
1011 	if (type == SOCK_STREAM)
1012 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1013 	else /*dgram and  seqpacket */
1014 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1015 
1016 	if (!sk) {
1017 		err = -ENOMEM;
1018 		goto err;
1019 	}
1020 
1021 	sock_init_data(sock, sk);
1022 
1023 	sk->sk_hash		= unix_unbound_hash(sk);
1024 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
1025 	sk->sk_write_space	= unix_write_space;
1026 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1027 	sk->sk_destruct		= unix_sock_destructor;
1028 	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1029 
1030 	u = unix_sk(sk);
1031 	u->listener = NULL;
1032 	u->vertex = NULL;
1033 	u->path.dentry = NULL;
1034 	u->path.mnt = NULL;
1035 	spin_lock_init(&u->lock);
1036 	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1037 	mutex_init(&u->iolock); /* single task reading lock */
1038 	mutex_init(&u->bindlock); /* single task binding lock */
1039 	init_waitqueue_head(&u->peer_wait);
1040 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1041 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1042 	unix_insert_unbound_socket(net, sk);
1043 
1044 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1045 
1046 	return sk;
1047 
1048 err:
1049 	atomic_long_dec(&unix_nr_socks);
1050 	return ERR_PTR(err);
1051 }
1052 
unix_create(struct net * net,struct socket * sock,int protocol,int kern)1053 static int unix_create(struct net *net, struct socket *sock, int protocol,
1054 		       int kern)
1055 {
1056 	struct sock *sk;
1057 
1058 	if (protocol && protocol != PF_UNIX)
1059 		return -EPROTONOSUPPORT;
1060 
1061 	sock->state = SS_UNCONNECTED;
1062 
1063 	switch (sock->type) {
1064 	case SOCK_STREAM:
1065 		sock->ops = &unix_stream_ops;
1066 		break;
1067 		/*
1068 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1069 		 *	nothing uses it.
1070 		 */
1071 	case SOCK_RAW:
1072 		sock->type = SOCK_DGRAM;
1073 		fallthrough;
1074 	case SOCK_DGRAM:
1075 		sock->ops = &unix_dgram_ops;
1076 		break;
1077 	case SOCK_SEQPACKET:
1078 		sock->ops = &unix_seqpacket_ops;
1079 		break;
1080 	default:
1081 		return -ESOCKTNOSUPPORT;
1082 	}
1083 
1084 	sk = unix_create1(net, sock, kern, sock->type);
1085 	if (IS_ERR(sk))
1086 		return PTR_ERR(sk);
1087 
1088 	return 0;
1089 }
1090 
unix_release(struct socket * sock)1091 static int unix_release(struct socket *sock)
1092 {
1093 	struct sock *sk = sock->sk;
1094 
1095 	if (!sk)
1096 		return 0;
1097 
1098 	sk->sk_prot->close(sk, 0);
1099 	unix_release_sock(sk, 0);
1100 	sock->sk = NULL;
1101 
1102 	return 0;
1103 }
1104 
unix_find_bsd(struct sockaddr_un * sunaddr,int addr_len,int type)1105 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1106 				  int type)
1107 {
1108 	struct inode *inode;
1109 	struct path path;
1110 	struct sock *sk;
1111 	int err;
1112 
1113 	unix_mkname_bsd(sunaddr, addr_len);
1114 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1115 	if (err)
1116 		goto fail;
1117 
1118 	err = path_permission(&path, MAY_WRITE);
1119 	if (err)
1120 		goto path_put;
1121 
1122 	err = -ECONNREFUSED;
1123 	inode = d_backing_inode(path.dentry);
1124 	if (!S_ISSOCK(inode->i_mode))
1125 		goto path_put;
1126 
1127 	sk = unix_find_socket_byinode(inode);
1128 	if (!sk)
1129 		goto path_put;
1130 
1131 	err = -EPROTOTYPE;
1132 	if (sk->sk_type == type)
1133 		touch_atime(&path);
1134 	else
1135 		goto sock_put;
1136 
1137 	path_put(&path);
1138 
1139 	return sk;
1140 
1141 sock_put:
1142 	sock_put(sk);
1143 path_put:
1144 	path_put(&path);
1145 fail:
1146 	return ERR_PTR(err);
1147 }
1148 
unix_find_abstract(struct net * net,struct sockaddr_un * sunaddr,int addr_len,int type)1149 static struct sock *unix_find_abstract(struct net *net,
1150 				       struct sockaddr_un *sunaddr,
1151 				       int addr_len, int type)
1152 {
1153 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1154 	struct dentry *dentry;
1155 	struct sock *sk;
1156 
1157 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1158 	if (!sk)
1159 		return ERR_PTR(-ECONNREFUSED);
1160 
1161 	dentry = unix_sk(sk)->path.dentry;
1162 	if (dentry)
1163 		touch_atime(&unix_sk(sk)->path);
1164 
1165 	return sk;
1166 }
1167 
unix_find_other(struct net * net,struct sockaddr_un * sunaddr,int addr_len,int type)1168 static struct sock *unix_find_other(struct net *net,
1169 				    struct sockaddr_un *sunaddr,
1170 				    int addr_len, int type)
1171 {
1172 	struct sock *sk;
1173 
1174 	if (sunaddr->sun_path[0])
1175 		sk = unix_find_bsd(sunaddr, addr_len, type);
1176 	else
1177 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1178 
1179 	return sk;
1180 }
1181 
unix_autobind(struct sock * sk)1182 static int unix_autobind(struct sock *sk)
1183 {
1184 	struct unix_sock *u = unix_sk(sk);
1185 	unsigned int new_hash, old_hash;
1186 	struct net *net = sock_net(sk);
1187 	struct unix_address *addr;
1188 	u32 lastnum, ordernum;
1189 	int err;
1190 
1191 	err = mutex_lock_interruptible(&u->bindlock);
1192 	if (err)
1193 		return err;
1194 
1195 	if (u->addr)
1196 		goto out;
1197 
1198 	err = -ENOMEM;
1199 	addr = kzalloc(sizeof(*addr) +
1200 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1201 	if (!addr)
1202 		goto out;
1203 
1204 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1205 	addr->name->sun_family = AF_UNIX;
1206 	refcount_set(&addr->refcnt, 1);
1207 
1208 	old_hash = sk->sk_hash;
1209 	ordernum = get_random_u32();
1210 	lastnum = ordernum & 0xFFFFF;
1211 retry:
1212 	ordernum = (ordernum + 1) & 0xFFFFF;
1213 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1214 
1215 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1216 	unix_table_double_lock(net, old_hash, new_hash);
1217 
1218 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1219 		unix_table_double_unlock(net, old_hash, new_hash);
1220 
1221 		/* __unix_find_socket_byname() may take long time if many names
1222 		 * are already in use.
1223 		 */
1224 		cond_resched();
1225 
1226 		if (ordernum == lastnum) {
1227 			/* Give up if all names seems to be in use. */
1228 			err = -ENOSPC;
1229 			unix_release_addr(addr);
1230 			goto out;
1231 		}
1232 
1233 		goto retry;
1234 	}
1235 
1236 	__unix_set_addr_hash(net, sk, addr, new_hash);
1237 	unix_table_double_unlock(net, old_hash, new_hash);
1238 	err = 0;
1239 
1240 out:	mutex_unlock(&u->bindlock);
1241 	return err;
1242 }
1243 
unix_bind_bsd(struct sock * sk,struct sockaddr_un * sunaddr,int addr_len)1244 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1245 			 int addr_len)
1246 {
1247 	umode_t mode = S_IFSOCK |
1248 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1249 	struct unix_sock *u = unix_sk(sk);
1250 	unsigned int new_hash, old_hash;
1251 	struct net *net = sock_net(sk);
1252 	struct mnt_idmap *idmap;
1253 	struct unix_address *addr;
1254 	struct dentry *dentry;
1255 	struct path parent;
1256 	int err;
1257 
1258 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1259 	addr = unix_create_addr(sunaddr, addr_len);
1260 	if (!addr)
1261 		return -ENOMEM;
1262 
1263 	/*
1264 	 * Get the parent directory, calculate the hash for last
1265 	 * component.
1266 	 */
1267 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1268 	if (IS_ERR(dentry)) {
1269 		err = PTR_ERR(dentry);
1270 		goto out;
1271 	}
1272 
1273 	/*
1274 	 * All right, let's create it.
1275 	 */
1276 	idmap = mnt_idmap(parent.mnt);
1277 	err = security_path_mknod(&parent, dentry, mode, 0);
1278 	if (!err)
1279 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1280 	if (err)
1281 		goto out_path;
1282 	err = mutex_lock_interruptible(&u->bindlock);
1283 	if (err)
1284 		goto out_unlink;
1285 	if (u->addr)
1286 		goto out_unlock;
1287 
1288 	old_hash = sk->sk_hash;
1289 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1290 	unix_table_double_lock(net, old_hash, new_hash);
1291 	u->path.mnt = mntget(parent.mnt);
1292 	u->path.dentry = dget(dentry);
1293 	__unix_set_addr_hash(net, sk, addr, new_hash);
1294 	unix_table_double_unlock(net, old_hash, new_hash);
1295 	unix_insert_bsd_socket(sk);
1296 	mutex_unlock(&u->bindlock);
1297 	done_path_create(&parent, dentry);
1298 	return 0;
1299 
1300 out_unlock:
1301 	mutex_unlock(&u->bindlock);
1302 	err = -EINVAL;
1303 out_unlink:
1304 	/* failed after successful mknod?  unlink what we'd created... */
1305 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1306 out_path:
1307 	done_path_create(&parent, dentry);
1308 out:
1309 	unix_release_addr(addr);
1310 	return err == -EEXIST ? -EADDRINUSE : err;
1311 }
1312 
unix_bind_abstract(struct sock * sk,struct sockaddr_un * sunaddr,int addr_len)1313 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1314 			      int addr_len)
1315 {
1316 	struct unix_sock *u = unix_sk(sk);
1317 	unsigned int new_hash, old_hash;
1318 	struct net *net = sock_net(sk);
1319 	struct unix_address *addr;
1320 	int err;
1321 
1322 	addr = unix_create_addr(sunaddr, addr_len);
1323 	if (!addr)
1324 		return -ENOMEM;
1325 
1326 	err = mutex_lock_interruptible(&u->bindlock);
1327 	if (err)
1328 		goto out;
1329 
1330 	if (u->addr) {
1331 		err = -EINVAL;
1332 		goto out_mutex;
1333 	}
1334 
1335 	old_hash = sk->sk_hash;
1336 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1337 	unix_table_double_lock(net, old_hash, new_hash);
1338 
1339 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1340 		goto out_spin;
1341 
1342 	__unix_set_addr_hash(net, sk, addr, new_hash);
1343 	unix_table_double_unlock(net, old_hash, new_hash);
1344 	mutex_unlock(&u->bindlock);
1345 	return 0;
1346 
1347 out_spin:
1348 	unix_table_double_unlock(net, old_hash, new_hash);
1349 	err = -EADDRINUSE;
1350 out_mutex:
1351 	mutex_unlock(&u->bindlock);
1352 out:
1353 	unix_release_addr(addr);
1354 	return err;
1355 }
1356 
unix_bind(struct socket * sock,struct sockaddr * uaddr,int addr_len)1357 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1358 {
1359 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1360 	struct sock *sk = sock->sk;
1361 	int err;
1362 
1363 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1364 	    sunaddr->sun_family == AF_UNIX)
1365 		return unix_autobind(sk);
1366 
1367 	err = unix_validate_addr(sunaddr, addr_len);
1368 	if (err)
1369 		return err;
1370 
1371 	if (sunaddr->sun_path[0])
1372 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1373 	else
1374 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1375 
1376 	return err;
1377 }
1378 
unix_state_double_lock(struct sock * sk1,struct sock * sk2)1379 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1380 {
1381 	if (unlikely(sk1 == sk2) || !sk2) {
1382 		unix_state_lock(sk1);
1383 		return;
1384 	}
1385 
1386 	if (sk1 > sk2)
1387 		swap(sk1, sk2);
1388 
1389 	unix_state_lock(sk1);
1390 	unix_state_lock(sk2);
1391 }
1392 
unix_state_double_unlock(struct sock * sk1,struct sock * sk2)1393 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1394 {
1395 	if (unlikely(sk1 == sk2) || !sk2) {
1396 		unix_state_unlock(sk1);
1397 		return;
1398 	}
1399 	unix_state_unlock(sk1);
1400 	unix_state_unlock(sk2);
1401 }
1402 
unix_dgram_connect(struct socket * sock,struct sockaddr * addr,int alen,int flags)1403 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1404 			      int alen, int flags)
1405 {
1406 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1407 	struct sock *sk = sock->sk;
1408 	struct sock *other;
1409 	int err;
1410 
1411 	err = -EINVAL;
1412 	if (alen < offsetofend(struct sockaddr, sa_family))
1413 		goto out;
1414 
1415 	if (addr->sa_family != AF_UNSPEC) {
1416 		err = unix_validate_addr(sunaddr, alen);
1417 		if (err)
1418 			goto out;
1419 
1420 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1421 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1422 		    !READ_ONCE(unix_sk(sk)->addr)) {
1423 			err = unix_autobind(sk);
1424 			if (err)
1425 				goto out;
1426 		}
1427 
1428 restart:
1429 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1430 		if (IS_ERR(other)) {
1431 			err = PTR_ERR(other);
1432 			goto out;
1433 		}
1434 
1435 		unix_state_double_lock(sk, other);
1436 
1437 		/* Apparently VFS overslept socket death. Retry. */
1438 		if (sock_flag(other, SOCK_DEAD)) {
1439 			unix_state_double_unlock(sk, other);
1440 			sock_put(other);
1441 			goto restart;
1442 		}
1443 
1444 		err = -EPERM;
1445 		if (!unix_may_send(sk, other))
1446 			goto out_unlock;
1447 
1448 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1449 		if (err)
1450 			goto out_unlock;
1451 
1452 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1453 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1454 	} else {
1455 		/*
1456 		 *	1003.1g breaking connected state with AF_UNSPEC
1457 		 */
1458 		other = NULL;
1459 		unix_state_double_lock(sk, other);
1460 	}
1461 
1462 	/*
1463 	 * If it was connected, reconnect.
1464 	 */
1465 	if (unix_peer(sk)) {
1466 		struct sock *old_peer = unix_peer(sk);
1467 
1468 		unix_peer(sk) = other;
1469 		if (!other)
1470 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1471 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1472 
1473 		unix_state_double_unlock(sk, other);
1474 
1475 		if (other != old_peer) {
1476 			unix_dgram_disconnected(sk, old_peer);
1477 
1478 			unix_state_lock(old_peer);
1479 			if (!unix_peer(old_peer))
1480 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1481 			unix_state_unlock(old_peer);
1482 		}
1483 
1484 		sock_put(old_peer);
1485 	} else {
1486 		unix_peer(sk) = other;
1487 		unix_state_double_unlock(sk, other);
1488 	}
1489 
1490 	return 0;
1491 
1492 out_unlock:
1493 	unix_state_double_unlock(sk, other);
1494 	sock_put(other);
1495 out:
1496 	return err;
1497 }
1498 
unix_wait_for_peer(struct sock * other,long timeo)1499 static long unix_wait_for_peer(struct sock *other, long timeo)
1500 	__releases(&unix_sk(other)->lock)
1501 {
1502 	struct unix_sock *u = unix_sk(other);
1503 	int sched;
1504 	DEFINE_WAIT(wait);
1505 
1506 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1507 
1508 	sched = !sock_flag(other, SOCK_DEAD) &&
1509 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1510 		unix_recvq_full_lockless(other);
1511 
1512 	unix_state_unlock(other);
1513 
1514 	if (sched)
1515 		timeo = schedule_timeout(timeo);
1516 
1517 	finish_wait(&u->peer_wait, &wait);
1518 	return timeo;
1519 }
1520 
unix_stream_connect(struct socket * sock,struct sockaddr * uaddr,int addr_len,int flags)1521 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1522 			       int addr_len, int flags)
1523 {
1524 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1525 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1526 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1527 	struct net *net = sock_net(sk);
1528 	struct sk_buff *skb = NULL;
1529 	unsigned char state;
1530 	long timeo;
1531 	int err;
1532 
1533 	err = unix_validate_addr(sunaddr, addr_len);
1534 	if (err)
1535 		goto out;
1536 
1537 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1538 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1539 	    !READ_ONCE(u->addr)) {
1540 		err = unix_autobind(sk);
1541 		if (err)
1542 			goto out;
1543 	}
1544 
1545 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1546 
1547 	/* First of all allocate resources.
1548 	   If we will make it after state is locked,
1549 	   we will have to recheck all again in any case.
1550 	 */
1551 
1552 	/* create new sock for complete connection */
1553 	newsk = unix_create1(net, NULL, 0, sock->type);
1554 	if (IS_ERR(newsk)) {
1555 		err = PTR_ERR(newsk);
1556 		newsk = NULL;
1557 		goto out;
1558 	}
1559 
1560 	err = -ENOMEM;
1561 
1562 	/* Allocate skb for sending to listening sock */
1563 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1564 	if (skb == NULL)
1565 		goto out;
1566 
1567 restart:
1568 	/*  Find listening sock. */
1569 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1570 	if (IS_ERR(other)) {
1571 		err = PTR_ERR(other);
1572 		other = NULL;
1573 		goto out;
1574 	}
1575 
1576 	unix_state_lock(other);
1577 
1578 	/* Apparently VFS overslept socket death. Retry. */
1579 	if (sock_flag(other, SOCK_DEAD)) {
1580 		unix_state_unlock(other);
1581 		sock_put(other);
1582 		goto restart;
1583 	}
1584 
1585 	err = -ECONNREFUSED;
1586 	if (other->sk_state != TCP_LISTEN)
1587 		goto out_unlock;
1588 	if (other->sk_shutdown & RCV_SHUTDOWN)
1589 		goto out_unlock;
1590 
1591 	if (unix_recvq_full_lockless(other)) {
1592 		err = -EAGAIN;
1593 		if (!timeo)
1594 			goto out_unlock;
1595 
1596 		timeo = unix_wait_for_peer(other, timeo);
1597 
1598 		err = sock_intr_errno(timeo);
1599 		if (signal_pending(current))
1600 			goto out;
1601 		sock_put(other);
1602 		goto restart;
1603 	}
1604 
1605 	/* self connect and simultaneous connect are eliminated
1606 	 * by rejecting TCP_LISTEN socket to avoid deadlock.
1607 	 */
1608 	state = READ_ONCE(sk->sk_state);
1609 	if (unlikely(state != TCP_CLOSE)) {
1610 		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1611 		goto out_unlock;
1612 	}
1613 
1614 	unix_state_lock_nested(sk, U_LOCK_SECOND);
1615 
1616 	if (unlikely(sk->sk_state != TCP_CLOSE)) {
1617 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1618 		unix_state_unlock(sk);
1619 		goto out_unlock;
1620 	}
1621 
1622 	err = security_unix_stream_connect(sk, other, newsk);
1623 	if (err) {
1624 		unix_state_unlock(sk);
1625 		goto out_unlock;
1626 	}
1627 
1628 	/* The way is open! Fastly set all the necessary fields... */
1629 
1630 	sock_hold(sk);
1631 	unix_peer(newsk)	= sk;
1632 	newsk->sk_state		= TCP_ESTABLISHED;
1633 	newsk->sk_type		= sk->sk_type;
1634 	init_peercred(newsk);
1635 	newu = unix_sk(newsk);
1636 	newu->listener = other;
1637 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1638 	otheru = unix_sk(other);
1639 
1640 	/* copy address information from listening to new sock
1641 	 *
1642 	 * The contents of *(otheru->addr) and otheru->path
1643 	 * are seen fully set up here, since we have found
1644 	 * otheru in hash under its lock.  Insertion into the
1645 	 * hash chain we'd found it in had been done in an
1646 	 * earlier critical area protected by the chain's lock,
1647 	 * the same one where we'd set *(otheru->addr) contents,
1648 	 * as well as otheru->path and otheru->addr itself.
1649 	 *
1650 	 * Using smp_store_release() here to set newu->addr
1651 	 * is enough to make those stores, as well as stores
1652 	 * to newu->path visible to anyone who gets newu->addr
1653 	 * by smp_load_acquire().  IOW, the same warranties
1654 	 * as for unix_sock instances bound in unix_bind() or
1655 	 * in unix_autobind().
1656 	 */
1657 	if (otheru->path.dentry) {
1658 		path_get(&otheru->path);
1659 		newu->path = otheru->path;
1660 	}
1661 	refcount_inc(&otheru->addr->refcnt);
1662 	smp_store_release(&newu->addr, otheru->addr);
1663 
1664 	/* Set credentials */
1665 	copy_peercred(sk, other);
1666 
1667 	sock->state	= SS_CONNECTED;
1668 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1669 	sock_hold(newsk);
1670 
1671 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1672 	unix_peer(sk)	= newsk;
1673 
1674 	unix_state_unlock(sk);
1675 
1676 	/* take ten and send info to listening sock */
1677 	spin_lock(&other->sk_receive_queue.lock);
1678 	__skb_queue_tail(&other->sk_receive_queue, skb);
1679 	spin_unlock(&other->sk_receive_queue.lock);
1680 	unix_state_unlock(other);
1681 	other->sk_data_ready(other);
1682 	sock_put(other);
1683 	return 0;
1684 
1685 out_unlock:
1686 	if (other)
1687 		unix_state_unlock(other);
1688 
1689 out:
1690 	kfree_skb(skb);
1691 	if (newsk)
1692 		unix_release_sock(newsk, 0);
1693 	if (other)
1694 		sock_put(other);
1695 	return err;
1696 }
1697 
unix_socketpair(struct socket * socka,struct socket * sockb)1698 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1699 {
1700 	struct sock *ska = socka->sk, *skb = sockb->sk;
1701 
1702 	/* Join our sockets back to back */
1703 	sock_hold(ska);
1704 	sock_hold(skb);
1705 	unix_peer(ska) = skb;
1706 	unix_peer(skb) = ska;
1707 	init_peercred(ska);
1708 	init_peercred(skb);
1709 
1710 	ska->sk_state = TCP_ESTABLISHED;
1711 	skb->sk_state = TCP_ESTABLISHED;
1712 	socka->state  = SS_CONNECTED;
1713 	sockb->state  = SS_CONNECTED;
1714 	return 0;
1715 }
1716 
unix_sock_inherit_flags(const struct socket * old,struct socket * new)1717 static void unix_sock_inherit_flags(const struct socket *old,
1718 				    struct socket *new)
1719 {
1720 	if (test_bit(SOCK_PASSCRED, &old->flags))
1721 		set_bit(SOCK_PASSCRED, &new->flags);
1722 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1723 		set_bit(SOCK_PASSPIDFD, &new->flags);
1724 	if (test_bit(SOCK_PASSSEC, &old->flags))
1725 		set_bit(SOCK_PASSSEC, &new->flags);
1726 }
1727 
unix_accept(struct socket * sock,struct socket * newsock,int flags,bool kern)1728 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1729 		       bool kern)
1730 {
1731 	struct sock *sk = sock->sk;
1732 	struct sk_buff *skb;
1733 	struct sock *tsk;
1734 	int err;
1735 
1736 	err = -EOPNOTSUPP;
1737 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1738 		goto out;
1739 
1740 	err = -EINVAL;
1741 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1742 		goto out;
1743 
1744 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1745 	 * so that no locks are necessary.
1746 	 */
1747 
1748 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1749 				&err);
1750 	if (!skb) {
1751 		/* This means receive shutdown. */
1752 		if (err == 0)
1753 			err = -EINVAL;
1754 		goto out;
1755 	}
1756 
1757 	tsk = skb->sk;
1758 	skb_free_datagram(sk, skb);
1759 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1760 
1761 	/* attach accepted sock to socket */
1762 	unix_state_lock(tsk);
1763 	unix_update_edges(unix_sk(tsk));
1764 	newsock->state = SS_CONNECTED;
1765 	unix_sock_inherit_flags(sock, newsock);
1766 	sock_graft(tsk, newsock);
1767 	unix_state_unlock(tsk);
1768 	return 0;
1769 
1770 out:
1771 	return err;
1772 }
1773 
1774 
unix_getname(struct socket * sock,struct sockaddr * uaddr,int peer)1775 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1776 {
1777 	struct sock *sk = sock->sk;
1778 	struct unix_address *addr;
1779 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1780 	int err = 0;
1781 
1782 	if (peer) {
1783 		sk = unix_peer_get(sk);
1784 
1785 		err = -ENOTCONN;
1786 		if (!sk)
1787 			goto out;
1788 		err = 0;
1789 	} else {
1790 		sock_hold(sk);
1791 	}
1792 
1793 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1794 	if (!addr) {
1795 		sunaddr->sun_family = AF_UNIX;
1796 		sunaddr->sun_path[0] = 0;
1797 		err = offsetof(struct sockaddr_un, sun_path);
1798 	} else {
1799 		err = addr->len;
1800 		memcpy(sunaddr, addr->name, addr->len);
1801 	}
1802 	sock_put(sk);
1803 out:
1804 	return err;
1805 }
1806 
1807 /* The "user->unix_inflight" variable is protected by the garbage
1808  * collection lock, and we just read it locklessly here. If you go
1809  * over the limit, there might be a tiny race in actually noticing
1810  * it across threads. Tough.
1811  */
too_many_unix_fds(struct task_struct * p)1812 static inline bool too_many_unix_fds(struct task_struct *p)
1813 {
1814 	struct user_struct *user = current_user();
1815 
1816 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1817 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1818 	return false;
1819 }
1820 
unix_attach_fds(struct scm_cookie * scm,struct sk_buff * skb)1821 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1822 {
1823 	if (too_many_unix_fds(current))
1824 		return -ETOOMANYREFS;
1825 
1826 	/* Need to duplicate file references for the sake of garbage
1827 	 * collection.  Otherwise a socket in the fps might become a
1828 	 * candidate for GC while the skb is not yet queued.
1829 	 */
1830 	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1831 	if (!UNIXCB(skb).fp)
1832 		return -ENOMEM;
1833 
1834 	if (unix_prepare_fpl(UNIXCB(skb).fp))
1835 		return -ENOMEM;
1836 
1837 	return 0;
1838 }
1839 
unix_detach_fds(struct scm_cookie * scm,struct sk_buff * skb)1840 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1841 {
1842 	scm->fp = UNIXCB(skb).fp;
1843 	UNIXCB(skb).fp = NULL;
1844 
1845 	unix_destroy_fpl(scm->fp);
1846 }
1847 
unix_peek_fds(struct scm_cookie * scm,struct sk_buff * skb)1848 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1849 {
1850 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1851 }
1852 
unix_destruct_scm(struct sk_buff * skb)1853 static void unix_destruct_scm(struct sk_buff *skb)
1854 {
1855 	struct scm_cookie scm;
1856 
1857 	memset(&scm, 0, sizeof(scm));
1858 	scm.pid  = UNIXCB(skb).pid;
1859 	if (UNIXCB(skb).fp)
1860 		unix_detach_fds(&scm, skb);
1861 
1862 	/* Alas, it calls VFS */
1863 	/* So fscking what? fput() had been SMP-safe since the last Summer */
1864 	scm_destroy(&scm);
1865 	sock_wfree(skb);
1866 }
1867 
unix_scm_to_skb(struct scm_cookie * scm,struct sk_buff * skb,bool send_fds)1868 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1869 {
1870 	int err = 0;
1871 
1872 	UNIXCB(skb).pid  = get_pid(scm->pid);
1873 	UNIXCB(skb).uid = scm->creds.uid;
1874 	UNIXCB(skb).gid = scm->creds.gid;
1875 	UNIXCB(skb).fp = NULL;
1876 	unix_get_secdata(scm, skb);
1877 	if (scm->fp && send_fds)
1878 		err = unix_attach_fds(scm, skb);
1879 
1880 	skb->destructor = unix_destruct_scm;
1881 	return err;
1882 }
1883 
unix_passcred_enabled(const struct socket * sock,const struct sock * other)1884 static bool unix_passcred_enabled(const struct socket *sock,
1885 				  const struct sock *other)
1886 {
1887 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1888 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1889 	       !other->sk_socket ||
1890 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1891 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1892 }
1893 
1894 /*
1895  * Some apps rely on write() giving SCM_CREDENTIALS
1896  * We include credentials if source or destination socket
1897  * asserted SOCK_PASSCRED.
1898  */
maybe_add_creds(struct sk_buff * skb,const struct socket * sock,const struct sock * other)1899 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1900 			    const struct sock *other)
1901 {
1902 	if (UNIXCB(skb).pid)
1903 		return;
1904 	if (unix_passcred_enabled(sock, other)) {
1905 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1906 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1907 	}
1908 }
1909 
unix_skb_scm_eq(struct sk_buff * skb,struct scm_cookie * scm)1910 static bool unix_skb_scm_eq(struct sk_buff *skb,
1911 			    struct scm_cookie *scm)
1912 {
1913 	return UNIXCB(skb).pid == scm->pid &&
1914 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1915 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1916 	       unix_secdata_eq(scm, skb);
1917 }
1918 
scm_stat_add(struct sock * sk,struct sk_buff * skb)1919 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1920 {
1921 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1922 	struct unix_sock *u = unix_sk(sk);
1923 
1924 	if (unlikely(fp && fp->count)) {
1925 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1926 		unix_add_edges(fp, u);
1927 	}
1928 }
1929 
scm_stat_del(struct sock * sk,struct sk_buff * skb)1930 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1931 {
1932 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1933 	struct unix_sock *u = unix_sk(sk);
1934 
1935 	if (unlikely(fp && fp->count)) {
1936 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1937 		unix_del_edges(fp);
1938 	}
1939 }
1940 
1941 /*
1942  *	Send AF_UNIX data.
1943  */
1944 
unix_dgram_sendmsg(struct socket * sock,struct msghdr * msg,size_t len)1945 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1946 			      size_t len)
1947 {
1948 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1949 	struct sock *sk = sock->sk, *other = NULL;
1950 	struct unix_sock *u = unix_sk(sk);
1951 	struct scm_cookie scm;
1952 	struct sk_buff *skb;
1953 	int data_len = 0;
1954 	int sk_locked;
1955 	long timeo;
1956 	int err;
1957 
1958 	err = scm_send(sock, msg, &scm, false);
1959 	if (err < 0)
1960 		return err;
1961 
1962 	wait_for_unix_gc(scm.fp);
1963 
1964 	err = -EOPNOTSUPP;
1965 	if (msg->msg_flags&MSG_OOB)
1966 		goto out;
1967 
1968 	if (msg->msg_namelen) {
1969 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1970 		if (err)
1971 			goto out;
1972 	} else {
1973 		sunaddr = NULL;
1974 		err = -ENOTCONN;
1975 		other = unix_peer_get(sk);
1976 		if (!other)
1977 			goto out;
1978 	}
1979 
1980 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1981 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1982 	    !READ_ONCE(u->addr)) {
1983 		err = unix_autobind(sk);
1984 		if (err)
1985 			goto out;
1986 	}
1987 
1988 	err = -EMSGSIZE;
1989 	if (len > READ_ONCE(sk->sk_sndbuf) - 32)
1990 		goto out;
1991 
1992 	if (len > SKB_MAX_ALLOC) {
1993 		data_len = min_t(size_t,
1994 				 len - SKB_MAX_ALLOC,
1995 				 MAX_SKB_FRAGS * PAGE_SIZE);
1996 		data_len = PAGE_ALIGN(data_len);
1997 
1998 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1999 	}
2000 
2001 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2002 				   msg->msg_flags & MSG_DONTWAIT, &err,
2003 				   PAGE_ALLOC_COSTLY_ORDER);
2004 	if (skb == NULL)
2005 		goto out;
2006 
2007 	err = unix_scm_to_skb(&scm, skb, true);
2008 	if (err < 0)
2009 		goto out_free;
2010 
2011 	skb_put(skb, len - data_len);
2012 	skb->data_len = data_len;
2013 	skb->len = len;
2014 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2015 	if (err)
2016 		goto out_free;
2017 
2018 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2019 
2020 restart:
2021 	if (!other) {
2022 		err = -ECONNRESET;
2023 		if (sunaddr == NULL)
2024 			goto out_free;
2025 
2026 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2027 					sk->sk_type);
2028 		if (IS_ERR(other)) {
2029 			err = PTR_ERR(other);
2030 			other = NULL;
2031 			goto out_free;
2032 		}
2033 	}
2034 
2035 	if (sk_filter(other, skb) < 0) {
2036 		/* Toss the packet but do not return any error to the sender */
2037 		err = len;
2038 		goto out_free;
2039 	}
2040 
2041 	sk_locked = 0;
2042 	unix_state_lock(other);
2043 restart_locked:
2044 	err = -EPERM;
2045 	if (!unix_may_send(sk, other))
2046 		goto out_unlock;
2047 
2048 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2049 		/*
2050 		 *	Check with 1003.1g - what should
2051 		 *	datagram error
2052 		 */
2053 		unix_state_unlock(other);
2054 		sock_put(other);
2055 
2056 		if (!sk_locked)
2057 			unix_state_lock(sk);
2058 
2059 		err = 0;
2060 		if (sk->sk_type == SOCK_SEQPACKET) {
2061 			/* We are here only when racing with unix_release_sock()
2062 			 * is clearing @other. Never change state to TCP_CLOSE
2063 			 * unlike SOCK_DGRAM wants.
2064 			 */
2065 			unix_state_unlock(sk);
2066 			err = -EPIPE;
2067 		} else if (unix_peer(sk) == other) {
2068 			unix_peer(sk) = NULL;
2069 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2070 
2071 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2072 			unix_state_unlock(sk);
2073 
2074 			unix_dgram_disconnected(sk, other);
2075 			sock_put(other);
2076 			err = -ECONNREFUSED;
2077 		} else {
2078 			unix_state_unlock(sk);
2079 		}
2080 
2081 		other = NULL;
2082 		if (err)
2083 			goto out_free;
2084 		goto restart;
2085 	}
2086 
2087 	err = -EPIPE;
2088 	if (other->sk_shutdown & RCV_SHUTDOWN)
2089 		goto out_unlock;
2090 
2091 	if (sk->sk_type != SOCK_SEQPACKET) {
2092 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2093 		if (err)
2094 			goto out_unlock;
2095 	}
2096 
2097 	/* other == sk && unix_peer(other) != sk if
2098 	 * - unix_peer(sk) == NULL, destination address bound to sk
2099 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
2100 	 */
2101 	if (other != sk &&
2102 	    unlikely(unix_peer(other) != sk &&
2103 	    unix_recvq_full_lockless(other))) {
2104 		if (timeo) {
2105 			timeo = unix_wait_for_peer(other, timeo);
2106 
2107 			err = sock_intr_errno(timeo);
2108 			if (signal_pending(current))
2109 				goto out_free;
2110 
2111 			goto restart;
2112 		}
2113 
2114 		if (!sk_locked) {
2115 			unix_state_unlock(other);
2116 			unix_state_double_lock(sk, other);
2117 		}
2118 
2119 		if (unix_peer(sk) != other ||
2120 		    unix_dgram_peer_wake_me(sk, other)) {
2121 			err = -EAGAIN;
2122 			sk_locked = 1;
2123 			goto out_unlock;
2124 		}
2125 
2126 		if (!sk_locked) {
2127 			sk_locked = 1;
2128 			goto restart_locked;
2129 		}
2130 	}
2131 
2132 	if (unlikely(sk_locked))
2133 		unix_state_unlock(sk);
2134 
2135 	if (sock_flag(other, SOCK_RCVTSTAMP))
2136 		__net_timestamp(skb);
2137 	maybe_add_creds(skb, sock, other);
2138 	scm_stat_add(other, skb);
2139 	skb_queue_tail(&other->sk_receive_queue, skb);
2140 	unix_state_unlock(other);
2141 	other->sk_data_ready(other);
2142 	sock_put(other);
2143 	scm_destroy(&scm);
2144 	return len;
2145 
2146 out_unlock:
2147 	if (sk_locked)
2148 		unix_state_unlock(sk);
2149 	unix_state_unlock(other);
2150 out_free:
2151 	kfree_skb(skb);
2152 out:
2153 	if (other)
2154 		sock_put(other);
2155 	scm_destroy(&scm);
2156 	return err;
2157 }
2158 
2159 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2160  * bytes, and a minimum of a full page.
2161  */
2162 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2163 
2164 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
queue_oob(struct socket * sock,struct msghdr * msg,struct sock * other,struct scm_cookie * scm,bool fds_sent)2165 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2166 		     struct scm_cookie *scm, bool fds_sent)
2167 {
2168 	struct unix_sock *ousk = unix_sk(other);
2169 	struct sk_buff *skb;
2170 	int err = 0;
2171 
2172 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2173 
2174 	if (!skb)
2175 		return err;
2176 
2177 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2178 	if (err < 0) {
2179 		kfree_skb(skb);
2180 		return err;
2181 	}
2182 	skb_put(skb, 1);
2183 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2184 
2185 	if (err) {
2186 		kfree_skb(skb);
2187 		return err;
2188 	}
2189 
2190 	unix_state_lock(other);
2191 
2192 	if (sock_flag(other, SOCK_DEAD) ||
2193 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2194 		unix_state_unlock(other);
2195 		kfree_skb(skb);
2196 		return -EPIPE;
2197 	}
2198 
2199 	maybe_add_creds(skb, sock, other);
2200 	scm_stat_add(other, skb);
2201 
2202 	spin_lock(&other->sk_receive_queue.lock);
2203 	WRITE_ONCE(ousk->oob_skb, skb);
2204 	__skb_queue_tail(&other->sk_receive_queue, skb);
2205 	spin_unlock(&other->sk_receive_queue.lock);
2206 
2207 	sk_send_sigurg(other);
2208 	unix_state_unlock(other);
2209 	other->sk_data_ready(other);
2210 
2211 	return err;
2212 }
2213 #endif
2214 
unix_stream_sendmsg(struct socket * sock,struct msghdr * msg,size_t len)2215 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2216 			       size_t len)
2217 {
2218 	struct sock *sk = sock->sk;
2219 	struct sock *other = NULL;
2220 	int err, size;
2221 	struct sk_buff *skb;
2222 	int sent = 0;
2223 	struct scm_cookie scm;
2224 	bool fds_sent = false;
2225 	int data_len;
2226 
2227 	err = scm_send(sock, msg, &scm, false);
2228 	if (err < 0)
2229 		return err;
2230 
2231 	wait_for_unix_gc(scm.fp);
2232 
2233 	err = -EOPNOTSUPP;
2234 	if (msg->msg_flags & MSG_OOB) {
2235 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2236 		if (len)
2237 			len--;
2238 		else
2239 #endif
2240 			goto out_err;
2241 	}
2242 
2243 	if (msg->msg_namelen) {
2244 		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2245 		goto out_err;
2246 	} else {
2247 		err = -ENOTCONN;
2248 		other = unix_peer(sk);
2249 		if (!other)
2250 			goto out_err;
2251 	}
2252 
2253 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2254 		goto pipe_err;
2255 
2256 	while (sent < len) {
2257 		size = len - sent;
2258 
2259 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2260 			skb = sock_alloc_send_pskb(sk, 0, 0,
2261 						   msg->msg_flags & MSG_DONTWAIT,
2262 						   &err, 0);
2263 		} else {
2264 			/* Keep two messages in the pipe so it schedules better */
2265 			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2266 
2267 			/* allow fallback to order-0 allocations */
2268 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2269 
2270 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2271 
2272 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2273 
2274 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2275 						   msg->msg_flags & MSG_DONTWAIT, &err,
2276 						   get_order(UNIX_SKB_FRAGS_SZ));
2277 		}
2278 		if (!skb)
2279 			goto out_err;
2280 
2281 		/* Only send the fds in the first buffer */
2282 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2283 		if (err < 0) {
2284 			kfree_skb(skb);
2285 			goto out_err;
2286 		}
2287 		fds_sent = true;
2288 
2289 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2290 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2291 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2292 						   sk->sk_allocation);
2293 			if (err < 0) {
2294 				kfree_skb(skb);
2295 				goto out_err;
2296 			}
2297 			size = err;
2298 			refcount_add(size, &sk->sk_wmem_alloc);
2299 		} else {
2300 			skb_put(skb, size - data_len);
2301 			skb->data_len = data_len;
2302 			skb->len = size;
2303 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2304 			if (err) {
2305 				kfree_skb(skb);
2306 				goto out_err;
2307 			}
2308 		}
2309 
2310 		unix_state_lock(other);
2311 
2312 		if (sock_flag(other, SOCK_DEAD) ||
2313 		    (other->sk_shutdown & RCV_SHUTDOWN))
2314 			goto pipe_err_free;
2315 
2316 		maybe_add_creds(skb, sock, other);
2317 		scm_stat_add(other, skb);
2318 		skb_queue_tail(&other->sk_receive_queue, skb);
2319 		unix_state_unlock(other);
2320 		other->sk_data_ready(other);
2321 		sent += size;
2322 	}
2323 
2324 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2325 	if (msg->msg_flags & MSG_OOB) {
2326 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2327 		if (err)
2328 			goto out_err;
2329 		sent++;
2330 	}
2331 #endif
2332 
2333 	scm_destroy(&scm);
2334 
2335 	return sent;
2336 
2337 pipe_err_free:
2338 	unix_state_unlock(other);
2339 	kfree_skb(skb);
2340 pipe_err:
2341 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2342 		send_sig(SIGPIPE, current, 0);
2343 	err = -EPIPE;
2344 out_err:
2345 	scm_destroy(&scm);
2346 	return sent ? : err;
2347 }
2348 
unix_seqpacket_sendmsg(struct socket * sock,struct msghdr * msg,size_t len)2349 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2350 				  size_t len)
2351 {
2352 	int err;
2353 	struct sock *sk = sock->sk;
2354 
2355 	err = sock_error(sk);
2356 	if (err)
2357 		return err;
2358 
2359 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2360 		return -ENOTCONN;
2361 
2362 	if (msg->msg_namelen)
2363 		msg->msg_namelen = 0;
2364 
2365 	return unix_dgram_sendmsg(sock, msg, len);
2366 }
2367 
unix_seqpacket_recvmsg(struct socket * sock,struct msghdr * msg,size_t size,int flags)2368 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2369 				  size_t size, int flags)
2370 {
2371 	struct sock *sk = sock->sk;
2372 
2373 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2374 		return -ENOTCONN;
2375 
2376 	return unix_dgram_recvmsg(sock, msg, size, flags);
2377 }
2378 
unix_copy_addr(struct msghdr * msg,struct sock * sk)2379 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2380 {
2381 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2382 
2383 	if (addr) {
2384 		msg->msg_namelen = addr->len;
2385 		memcpy(msg->msg_name, addr->name, addr->len);
2386 	}
2387 }
2388 
__unix_dgram_recvmsg(struct sock * sk,struct msghdr * msg,size_t size,int flags)2389 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2390 			 int flags)
2391 {
2392 	struct scm_cookie scm;
2393 	struct socket *sock = sk->sk_socket;
2394 	struct unix_sock *u = unix_sk(sk);
2395 	struct sk_buff *skb, *last;
2396 	long timeo;
2397 	int skip;
2398 	int err;
2399 
2400 	err = -EOPNOTSUPP;
2401 	if (flags&MSG_OOB)
2402 		goto out;
2403 
2404 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2405 
2406 	do {
2407 		mutex_lock(&u->iolock);
2408 
2409 		skip = sk_peek_offset(sk, flags);
2410 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2411 					      &skip, &err, &last);
2412 		if (skb) {
2413 			if (!(flags & MSG_PEEK))
2414 				scm_stat_del(sk, skb);
2415 			break;
2416 		}
2417 
2418 		mutex_unlock(&u->iolock);
2419 
2420 		if (err != -EAGAIN)
2421 			break;
2422 	} while (timeo &&
2423 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2424 					      &err, &timeo, last));
2425 
2426 	if (!skb) { /* implies iolock unlocked */
2427 		unix_state_lock(sk);
2428 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2429 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2430 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2431 			err = 0;
2432 		unix_state_unlock(sk);
2433 		goto out;
2434 	}
2435 
2436 	if (wq_has_sleeper(&u->peer_wait))
2437 		wake_up_interruptible_sync_poll(&u->peer_wait,
2438 						EPOLLOUT | EPOLLWRNORM |
2439 						EPOLLWRBAND);
2440 
2441 	if (msg->msg_name)
2442 		unix_copy_addr(msg, skb->sk);
2443 
2444 	if (size > skb->len - skip)
2445 		size = skb->len - skip;
2446 	else if (size < skb->len - skip)
2447 		msg->msg_flags |= MSG_TRUNC;
2448 
2449 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2450 	if (err)
2451 		goto out_free;
2452 
2453 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2454 		__sock_recv_timestamp(msg, sk, skb);
2455 
2456 	memset(&scm, 0, sizeof(scm));
2457 
2458 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2459 	unix_set_secdata(&scm, skb);
2460 
2461 	if (!(flags & MSG_PEEK)) {
2462 		if (UNIXCB(skb).fp)
2463 			unix_detach_fds(&scm, skb);
2464 
2465 		sk_peek_offset_bwd(sk, skb->len);
2466 	} else {
2467 		/* It is questionable: on PEEK we could:
2468 		   - do not return fds - good, but too simple 8)
2469 		   - return fds, and do not return them on read (old strategy,
2470 		     apparently wrong)
2471 		   - clone fds (I chose it for now, it is the most universal
2472 		     solution)
2473 
2474 		   POSIX 1003.1g does not actually define this clearly
2475 		   at all. POSIX 1003.1g doesn't define a lot of things
2476 		   clearly however!
2477 
2478 		*/
2479 
2480 		sk_peek_offset_fwd(sk, size);
2481 
2482 		if (UNIXCB(skb).fp)
2483 			unix_peek_fds(&scm, skb);
2484 	}
2485 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2486 
2487 	scm_recv_unix(sock, msg, &scm, flags);
2488 
2489 out_free:
2490 	skb_free_datagram(sk, skb);
2491 	mutex_unlock(&u->iolock);
2492 out:
2493 	return err;
2494 }
2495 
unix_dgram_recvmsg(struct socket * sock,struct msghdr * msg,size_t size,int flags)2496 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2497 			      int flags)
2498 {
2499 	struct sock *sk = sock->sk;
2500 
2501 #ifdef CONFIG_BPF_SYSCALL
2502 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2503 
2504 	if (prot != &unix_dgram_proto)
2505 		return prot->recvmsg(sk, msg, size, flags, NULL);
2506 #endif
2507 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2508 }
2509 
unix_read_skb(struct sock * sk,skb_read_actor_t recv_actor)2510 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2511 {
2512 	struct unix_sock *u = unix_sk(sk);
2513 	struct sk_buff *skb;
2514 	int err;
2515 
2516 	mutex_lock(&u->iolock);
2517 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2518 	mutex_unlock(&u->iolock);
2519 	if (!skb)
2520 		return err;
2521 
2522 	return recv_actor(sk, skb);
2523 }
2524 
2525 /*
2526  *	Sleep until more data has arrived. But check for races..
2527  */
unix_stream_data_wait(struct sock * sk,long timeo,struct sk_buff * last,unsigned int last_len,bool freezable)2528 static long unix_stream_data_wait(struct sock *sk, long timeo,
2529 				  struct sk_buff *last, unsigned int last_len,
2530 				  bool freezable)
2531 {
2532 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2533 	struct sk_buff *tail;
2534 	DEFINE_WAIT(wait);
2535 
2536 	unix_state_lock(sk);
2537 
2538 	for (;;) {
2539 		prepare_to_wait(sk_sleep(sk), &wait, state);
2540 
2541 		tail = skb_peek_tail(&sk->sk_receive_queue);
2542 		if (tail != last ||
2543 		    (tail && tail->len != last_len) ||
2544 		    sk->sk_err ||
2545 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2546 		    signal_pending(current) ||
2547 		    !timeo)
2548 			break;
2549 
2550 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2551 		unix_state_unlock(sk);
2552 		timeo = schedule_timeout(timeo);
2553 		unix_state_lock(sk);
2554 
2555 		if (sock_flag(sk, SOCK_DEAD))
2556 			break;
2557 
2558 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2559 	}
2560 
2561 	finish_wait(sk_sleep(sk), &wait);
2562 	unix_state_unlock(sk);
2563 	return timeo;
2564 }
2565 
2566 struct unix_stream_read_state {
2567 	int (*recv_actor)(struct sk_buff *, int, int,
2568 			  struct unix_stream_read_state *);
2569 	struct socket *socket;
2570 	struct msghdr *msg;
2571 	struct pipe_inode_info *pipe;
2572 	size_t size;
2573 	int flags;
2574 	unsigned int splice_flags;
2575 };
2576 
2577 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
unix_stream_recv_urg(struct unix_stream_read_state * state)2578 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2579 {
2580 	struct sk_buff *oob_skb, *read_skb = NULL;
2581 	struct socket *sock = state->socket;
2582 	struct sock *sk = sock->sk;
2583 	struct unix_sock *u = unix_sk(sk);
2584 	int chunk = 1;
2585 
2586 	mutex_lock(&u->iolock);
2587 	unix_state_lock(sk);
2588 	spin_lock(&sk->sk_receive_queue.lock);
2589 
2590 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2591 		spin_unlock(&sk->sk_receive_queue.lock);
2592 		unix_state_unlock(sk);
2593 		mutex_unlock(&u->iolock);
2594 		return -EINVAL;
2595 	}
2596 
2597 	oob_skb = u->oob_skb;
2598 
2599 	if (!(state->flags & MSG_PEEK)) {
2600 		WRITE_ONCE(u->oob_skb, NULL);
2601 
2602 		if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue &&
2603 		    !unix_skb_len(oob_skb->prev)) {
2604 			read_skb = oob_skb->prev;
2605 			__skb_unlink(read_skb, &sk->sk_receive_queue);
2606 		}
2607 	}
2608 
2609 	spin_unlock(&sk->sk_receive_queue.lock);
2610 	unix_state_unlock(sk);
2611 
2612 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2613 
2614 	if (!(state->flags & MSG_PEEK))
2615 		UNIXCB(oob_skb).consumed += 1;
2616 
2617 	mutex_unlock(&u->iolock);
2618 
2619 	consume_skb(read_skb);
2620 
2621 	if (chunk < 0)
2622 		return -EFAULT;
2623 
2624 	state->msg->msg_flags |= MSG_OOB;
2625 	return 1;
2626 }
2627 
manage_oob(struct sk_buff * skb,struct sock * sk,int flags,int copied)2628 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2629 				  int flags, int copied)
2630 {
2631 	struct unix_sock *u = unix_sk(sk);
2632 
2633 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2634 		skb_unlink(skb, &sk->sk_receive_queue);
2635 		consume_skb(skb);
2636 		skb = NULL;
2637 	} else {
2638 		struct sk_buff *unlinked_skb = NULL;
2639 
2640 		spin_lock(&sk->sk_receive_queue.lock);
2641 
2642 		if (skb == u->oob_skb) {
2643 			if (copied) {
2644 				skb = NULL;
2645 			} else if (!(flags & MSG_PEEK)) {
2646 				WRITE_ONCE(u->oob_skb, NULL);
2647 
2648 				if (!sock_flag(sk, SOCK_URGINLINE)) {
2649 					__skb_unlink(skb, &sk->sk_receive_queue);
2650 					unlinked_skb = skb;
2651 					skb = skb_peek(&sk->sk_receive_queue);
2652 				}
2653 			} else if (!sock_flag(sk, SOCK_URGINLINE)) {
2654 				skb = skb_peek_next(skb, &sk->sk_receive_queue);
2655 			}
2656 		}
2657 
2658 		spin_unlock(&sk->sk_receive_queue.lock);
2659 
2660 		kfree_skb(unlinked_skb);
2661 	}
2662 	return skb;
2663 }
2664 #endif
2665 
unix_stream_read_skb(struct sock * sk,skb_read_actor_t recv_actor)2666 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2667 {
2668 	struct unix_sock *u = unix_sk(sk);
2669 	struct sk_buff *skb;
2670 	int err;
2671 
2672 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2673 		return -ENOTCONN;
2674 
2675 	mutex_lock(&u->iolock);
2676 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2677 	mutex_unlock(&u->iolock);
2678 	if (!skb)
2679 		return err;
2680 
2681 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2682 	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2683 		bool drop = false;
2684 
2685 		unix_state_lock(sk);
2686 
2687 		if (sock_flag(sk, SOCK_DEAD)) {
2688 			unix_state_unlock(sk);
2689 			kfree_skb(skb);
2690 			return -ECONNRESET;
2691 		}
2692 
2693 		spin_lock(&sk->sk_receive_queue.lock);
2694 		if (likely(skb == u->oob_skb)) {
2695 			WRITE_ONCE(u->oob_skb, NULL);
2696 			drop = true;
2697 		}
2698 		spin_unlock(&sk->sk_receive_queue.lock);
2699 
2700 		unix_state_unlock(sk);
2701 
2702 		if (drop) {
2703 			kfree_skb(skb);
2704 			return -EAGAIN;
2705 		}
2706 	}
2707 #endif
2708 
2709 	return recv_actor(sk, skb);
2710 }
2711 
unix_stream_read_generic(struct unix_stream_read_state * state,bool freezable)2712 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2713 				    bool freezable)
2714 {
2715 	struct scm_cookie scm;
2716 	struct socket *sock = state->socket;
2717 	struct sock *sk = sock->sk;
2718 	struct unix_sock *u = unix_sk(sk);
2719 	int copied = 0;
2720 	int flags = state->flags;
2721 	int noblock = flags & MSG_DONTWAIT;
2722 	bool check_creds = false;
2723 	int target;
2724 	int err = 0;
2725 	long timeo;
2726 	int skip;
2727 	size_t size = state->size;
2728 	unsigned int last_len;
2729 
2730 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2731 		err = -EINVAL;
2732 		goto out;
2733 	}
2734 
2735 	if (unlikely(flags & MSG_OOB)) {
2736 		err = -EOPNOTSUPP;
2737 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2738 		err = unix_stream_recv_urg(state);
2739 #endif
2740 		goto out;
2741 	}
2742 
2743 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2744 	timeo = sock_rcvtimeo(sk, noblock);
2745 
2746 	memset(&scm, 0, sizeof(scm));
2747 
2748 	/* Lock the socket to prevent queue disordering
2749 	 * while sleeps in memcpy_tomsg
2750 	 */
2751 	mutex_lock(&u->iolock);
2752 
2753 	skip = max(sk_peek_offset(sk, flags), 0);
2754 
2755 	do {
2756 		int chunk;
2757 		bool drop_skb;
2758 		struct sk_buff *skb, *last;
2759 
2760 redo:
2761 		unix_state_lock(sk);
2762 		if (sock_flag(sk, SOCK_DEAD)) {
2763 			err = -ECONNRESET;
2764 			goto unlock;
2765 		}
2766 		last = skb = skb_peek(&sk->sk_receive_queue);
2767 		last_len = last ? last->len : 0;
2768 
2769 again:
2770 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2771 		if (skb) {
2772 			skb = manage_oob(skb, sk, flags, copied);
2773 			if (!skb && copied) {
2774 				unix_state_unlock(sk);
2775 				break;
2776 			}
2777 		}
2778 #endif
2779 		if (skb == NULL) {
2780 			if (copied >= target)
2781 				goto unlock;
2782 
2783 			/*
2784 			 *	POSIX 1003.1g mandates this order.
2785 			 */
2786 
2787 			err = sock_error(sk);
2788 			if (err)
2789 				goto unlock;
2790 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2791 				goto unlock;
2792 
2793 			unix_state_unlock(sk);
2794 			if (!timeo) {
2795 				err = -EAGAIN;
2796 				break;
2797 			}
2798 
2799 			mutex_unlock(&u->iolock);
2800 
2801 			timeo = unix_stream_data_wait(sk, timeo, last,
2802 						      last_len, freezable);
2803 
2804 			if (signal_pending(current)) {
2805 				err = sock_intr_errno(timeo);
2806 				scm_destroy(&scm);
2807 				goto out;
2808 			}
2809 
2810 			mutex_lock(&u->iolock);
2811 			goto redo;
2812 unlock:
2813 			unix_state_unlock(sk);
2814 			break;
2815 		}
2816 
2817 		while (skip >= unix_skb_len(skb)) {
2818 			skip -= unix_skb_len(skb);
2819 			last = skb;
2820 			last_len = skb->len;
2821 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2822 			if (!skb)
2823 				goto again;
2824 		}
2825 
2826 		unix_state_unlock(sk);
2827 
2828 		if (check_creds) {
2829 			/* Never glue messages from different writers */
2830 			if (!unix_skb_scm_eq(skb, &scm))
2831 				break;
2832 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2833 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2834 			/* Copy credentials */
2835 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2836 			unix_set_secdata(&scm, skb);
2837 			check_creds = true;
2838 		}
2839 
2840 		/* Copy address just once */
2841 		if (state->msg && state->msg->msg_name) {
2842 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2843 					 state->msg->msg_name);
2844 			unix_copy_addr(state->msg, skb->sk);
2845 			sunaddr = NULL;
2846 		}
2847 
2848 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2849 		skb_get(skb);
2850 		chunk = state->recv_actor(skb, skip, chunk, state);
2851 		drop_skb = !unix_skb_len(skb);
2852 		/* skb is only safe to use if !drop_skb */
2853 		consume_skb(skb);
2854 		if (chunk < 0) {
2855 			if (copied == 0)
2856 				copied = -EFAULT;
2857 			break;
2858 		}
2859 		copied += chunk;
2860 		size -= chunk;
2861 
2862 		if (drop_skb) {
2863 			/* the skb was touched by a concurrent reader;
2864 			 * we should not expect anything from this skb
2865 			 * anymore and assume it invalid - we can be
2866 			 * sure it was dropped from the socket queue
2867 			 *
2868 			 * let's report a short read
2869 			 */
2870 			err = 0;
2871 			break;
2872 		}
2873 
2874 		/* Mark read part of skb as used */
2875 		if (!(flags & MSG_PEEK)) {
2876 			UNIXCB(skb).consumed += chunk;
2877 
2878 			sk_peek_offset_bwd(sk, chunk);
2879 
2880 			if (UNIXCB(skb).fp) {
2881 				scm_stat_del(sk, skb);
2882 				unix_detach_fds(&scm, skb);
2883 			}
2884 
2885 			if (unix_skb_len(skb))
2886 				break;
2887 
2888 			skb_unlink(skb, &sk->sk_receive_queue);
2889 			consume_skb(skb);
2890 
2891 			if (scm.fp)
2892 				break;
2893 		} else {
2894 			/* It is questionable, see note in unix_dgram_recvmsg.
2895 			 */
2896 			if (UNIXCB(skb).fp)
2897 				unix_peek_fds(&scm, skb);
2898 
2899 			sk_peek_offset_fwd(sk, chunk);
2900 
2901 			if (UNIXCB(skb).fp)
2902 				break;
2903 
2904 			skip = 0;
2905 			last = skb;
2906 			last_len = skb->len;
2907 			unix_state_lock(sk);
2908 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2909 			if (skb)
2910 				goto again;
2911 			unix_state_unlock(sk);
2912 			break;
2913 		}
2914 	} while (size);
2915 
2916 	mutex_unlock(&u->iolock);
2917 	if (state->msg)
2918 		scm_recv_unix(sock, state->msg, &scm, flags);
2919 	else
2920 		scm_destroy(&scm);
2921 out:
2922 	return copied ? : err;
2923 }
2924 
unix_stream_read_actor(struct sk_buff * skb,int skip,int chunk,struct unix_stream_read_state * state)2925 static int unix_stream_read_actor(struct sk_buff *skb,
2926 				  int skip, int chunk,
2927 				  struct unix_stream_read_state *state)
2928 {
2929 	int ret;
2930 
2931 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2932 				    state->msg, chunk);
2933 	return ret ?: chunk;
2934 }
2935 
__unix_stream_recvmsg(struct sock * sk,struct msghdr * msg,size_t size,int flags)2936 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2937 			  size_t size, int flags)
2938 {
2939 	struct unix_stream_read_state state = {
2940 		.recv_actor = unix_stream_read_actor,
2941 		.socket = sk->sk_socket,
2942 		.msg = msg,
2943 		.size = size,
2944 		.flags = flags
2945 	};
2946 
2947 	return unix_stream_read_generic(&state, true);
2948 }
2949 
unix_stream_recvmsg(struct socket * sock,struct msghdr * msg,size_t size,int flags)2950 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2951 			       size_t size, int flags)
2952 {
2953 	struct unix_stream_read_state state = {
2954 		.recv_actor = unix_stream_read_actor,
2955 		.socket = sock,
2956 		.msg = msg,
2957 		.size = size,
2958 		.flags = flags
2959 	};
2960 
2961 #ifdef CONFIG_BPF_SYSCALL
2962 	struct sock *sk = sock->sk;
2963 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2964 
2965 	if (prot != &unix_stream_proto)
2966 		return prot->recvmsg(sk, msg, size, flags, NULL);
2967 #endif
2968 	return unix_stream_read_generic(&state, true);
2969 }
2970 
unix_stream_splice_actor(struct sk_buff * skb,int skip,int chunk,struct unix_stream_read_state * state)2971 static int unix_stream_splice_actor(struct sk_buff *skb,
2972 				    int skip, int chunk,
2973 				    struct unix_stream_read_state *state)
2974 {
2975 	return skb_splice_bits(skb, state->socket->sk,
2976 			       UNIXCB(skb).consumed + skip,
2977 			       state->pipe, chunk, state->splice_flags);
2978 }
2979 
unix_stream_splice_read(struct socket * sock,loff_t * ppos,struct pipe_inode_info * pipe,size_t size,unsigned int flags)2980 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2981 				       struct pipe_inode_info *pipe,
2982 				       size_t size, unsigned int flags)
2983 {
2984 	struct unix_stream_read_state state = {
2985 		.recv_actor = unix_stream_splice_actor,
2986 		.socket = sock,
2987 		.pipe = pipe,
2988 		.size = size,
2989 		.splice_flags = flags,
2990 	};
2991 
2992 	if (unlikely(*ppos))
2993 		return -ESPIPE;
2994 
2995 	if (sock->file->f_flags & O_NONBLOCK ||
2996 	    flags & SPLICE_F_NONBLOCK)
2997 		state.flags = MSG_DONTWAIT;
2998 
2999 	return unix_stream_read_generic(&state, false);
3000 }
3001 
unix_shutdown(struct socket * sock,int mode)3002 static int unix_shutdown(struct socket *sock, int mode)
3003 {
3004 	struct sock *sk = sock->sk;
3005 	struct sock *other;
3006 
3007 	if (mode < SHUT_RD || mode > SHUT_RDWR)
3008 		return -EINVAL;
3009 	/* This maps:
3010 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3011 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3012 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3013 	 */
3014 	++mode;
3015 
3016 	unix_state_lock(sk);
3017 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3018 	other = unix_peer(sk);
3019 	if (other)
3020 		sock_hold(other);
3021 	unix_state_unlock(sk);
3022 	sk->sk_state_change(sk);
3023 
3024 	if (other &&
3025 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3026 
3027 		int peer_mode = 0;
3028 		const struct proto *prot = READ_ONCE(other->sk_prot);
3029 
3030 		if (prot->unhash)
3031 			prot->unhash(other);
3032 		if (mode&RCV_SHUTDOWN)
3033 			peer_mode |= SEND_SHUTDOWN;
3034 		if (mode&SEND_SHUTDOWN)
3035 			peer_mode |= RCV_SHUTDOWN;
3036 		unix_state_lock(other);
3037 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3038 		unix_state_unlock(other);
3039 		other->sk_state_change(other);
3040 		if (peer_mode == SHUTDOWN_MASK)
3041 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3042 		else if (peer_mode & RCV_SHUTDOWN)
3043 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3044 	}
3045 	if (other)
3046 		sock_put(other);
3047 
3048 	return 0;
3049 }
3050 
unix_inq_len(struct sock * sk)3051 long unix_inq_len(struct sock *sk)
3052 {
3053 	struct sk_buff *skb;
3054 	long amount = 0;
3055 
3056 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3057 		return -EINVAL;
3058 
3059 	spin_lock(&sk->sk_receive_queue.lock);
3060 	if (sk->sk_type == SOCK_STREAM ||
3061 	    sk->sk_type == SOCK_SEQPACKET) {
3062 		skb_queue_walk(&sk->sk_receive_queue, skb)
3063 			amount += unix_skb_len(skb);
3064 	} else {
3065 		skb = skb_peek(&sk->sk_receive_queue);
3066 		if (skb)
3067 			amount = skb->len;
3068 	}
3069 	spin_unlock(&sk->sk_receive_queue.lock);
3070 
3071 	return amount;
3072 }
3073 EXPORT_SYMBOL_GPL(unix_inq_len);
3074 
unix_outq_len(struct sock * sk)3075 long unix_outq_len(struct sock *sk)
3076 {
3077 	return sk_wmem_alloc_get(sk);
3078 }
3079 EXPORT_SYMBOL_GPL(unix_outq_len);
3080 
unix_open_file(struct sock * sk)3081 static int unix_open_file(struct sock *sk)
3082 {
3083 	struct path path;
3084 	struct file *f;
3085 	int fd;
3086 
3087 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3088 		return -EPERM;
3089 
3090 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3091 		return -ENOENT;
3092 
3093 	path = unix_sk(sk)->path;
3094 	if (!path.dentry)
3095 		return -ENOENT;
3096 
3097 	path_get(&path);
3098 
3099 	fd = get_unused_fd_flags(O_CLOEXEC);
3100 	if (fd < 0)
3101 		goto out;
3102 
3103 	f = dentry_open(&path, O_PATH, current_cred());
3104 	if (IS_ERR(f)) {
3105 		put_unused_fd(fd);
3106 		fd = PTR_ERR(f);
3107 		goto out;
3108 	}
3109 
3110 	fd_install(fd, f);
3111 out:
3112 	path_put(&path);
3113 
3114 	return fd;
3115 }
3116 
unix_ioctl(struct socket * sock,unsigned int cmd,unsigned long arg)3117 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3118 {
3119 	struct sock *sk = sock->sk;
3120 	long amount = 0;
3121 	int err;
3122 
3123 	switch (cmd) {
3124 	case SIOCOUTQ:
3125 		amount = unix_outq_len(sk);
3126 		err = put_user(amount, (int __user *)arg);
3127 		break;
3128 	case SIOCINQ:
3129 		amount = unix_inq_len(sk);
3130 		if (amount < 0)
3131 			err = amount;
3132 		else
3133 			err = put_user(amount, (int __user *)arg);
3134 		break;
3135 	case SIOCUNIXFILE:
3136 		err = unix_open_file(sk);
3137 		break;
3138 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3139 	case SIOCATMARK:
3140 		{
3141 			struct sk_buff *skb;
3142 			int answ = 0;
3143 
3144 			skb = skb_peek(&sk->sk_receive_queue);
3145 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3146 				answ = 1;
3147 			err = put_user(answ, (int __user *)arg);
3148 		}
3149 		break;
3150 #endif
3151 	default:
3152 		err = -ENOIOCTLCMD;
3153 		break;
3154 	}
3155 	return err;
3156 }
3157 
3158 #ifdef CONFIG_COMPAT
unix_compat_ioctl(struct socket * sock,unsigned int cmd,unsigned long arg)3159 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3160 {
3161 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3162 }
3163 #endif
3164 
unix_poll(struct file * file,struct socket * sock,poll_table * wait)3165 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3166 {
3167 	struct sock *sk = sock->sk;
3168 	unsigned char state;
3169 	__poll_t mask;
3170 	u8 shutdown;
3171 
3172 	sock_poll_wait(file, sock, wait);
3173 	mask = 0;
3174 	shutdown = READ_ONCE(sk->sk_shutdown);
3175 	state = READ_ONCE(sk->sk_state);
3176 
3177 	/* exceptional events? */
3178 	if (READ_ONCE(sk->sk_err))
3179 		mask |= EPOLLERR;
3180 	if (shutdown == SHUTDOWN_MASK)
3181 		mask |= EPOLLHUP;
3182 	if (shutdown & RCV_SHUTDOWN)
3183 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3184 
3185 	/* readable? */
3186 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3187 		mask |= EPOLLIN | EPOLLRDNORM;
3188 	if (sk_is_readable(sk))
3189 		mask |= EPOLLIN | EPOLLRDNORM;
3190 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3191 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3192 		mask |= EPOLLPRI;
3193 #endif
3194 
3195 	/* Connection-based need to check for termination and startup */
3196 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3197 	    state == TCP_CLOSE)
3198 		mask |= EPOLLHUP;
3199 
3200 	/*
3201 	 * we set writable also when the other side has shut down the
3202 	 * connection. This prevents stuck sockets.
3203 	 */
3204 	if (unix_writable(sk, state))
3205 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3206 
3207 	return mask;
3208 }
3209 
unix_dgram_poll(struct file * file,struct socket * sock,poll_table * wait)3210 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3211 				    poll_table *wait)
3212 {
3213 	struct sock *sk = sock->sk, *other;
3214 	unsigned int writable;
3215 	unsigned char state;
3216 	__poll_t mask;
3217 	u8 shutdown;
3218 
3219 	sock_poll_wait(file, sock, wait);
3220 	mask = 0;
3221 	shutdown = READ_ONCE(sk->sk_shutdown);
3222 	state = READ_ONCE(sk->sk_state);
3223 
3224 	/* exceptional events? */
3225 	if (READ_ONCE(sk->sk_err) ||
3226 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3227 		mask |= EPOLLERR |
3228 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3229 
3230 	if (shutdown & RCV_SHUTDOWN)
3231 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3232 	if (shutdown == SHUTDOWN_MASK)
3233 		mask |= EPOLLHUP;
3234 
3235 	/* readable? */
3236 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3237 		mask |= EPOLLIN | EPOLLRDNORM;
3238 	if (sk_is_readable(sk))
3239 		mask |= EPOLLIN | EPOLLRDNORM;
3240 
3241 	/* Connection-based need to check for termination and startup */
3242 	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3243 		mask |= EPOLLHUP;
3244 
3245 	/* No write status requested, avoid expensive OUT tests. */
3246 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3247 		return mask;
3248 
3249 	writable = unix_writable(sk, state);
3250 	if (writable) {
3251 		unix_state_lock(sk);
3252 
3253 		other = unix_peer(sk);
3254 		if (other && unix_peer(other) != sk &&
3255 		    unix_recvq_full_lockless(other) &&
3256 		    unix_dgram_peer_wake_me(sk, other))
3257 			writable = 0;
3258 
3259 		unix_state_unlock(sk);
3260 	}
3261 
3262 	if (writable)
3263 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3264 	else
3265 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3266 
3267 	return mask;
3268 }
3269 
3270 #ifdef CONFIG_PROC_FS
3271 
3272 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3273 
3274 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3275 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3276 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3277 
unix_from_bucket(struct seq_file * seq,loff_t * pos)3278 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3279 {
3280 	unsigned long offset = get_offset(*pos);
3281 	unsigned long bucket = get_bucket(*pos);
3282 	unsigned long count = 0;
3283 	struct sock *sk;
3284 
3285 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3286 	     sk; sk = sk_next(sk)) {
3287 		if (++count == offset)
3288 			break;
3289 	}
3290 
3291 	return sk;
3292 }
3293 
unix_get_first(struct seq_file * seq,loff_t * pos)3294 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3295 {
3296 	unsigned long bucket = get_bucket(*pos);
3297 	struct net *net = seq_file_net(seq);
3298 	struct sock *sk;
3299 
3300 	while (bucket < UNIX_HASH_SIZE) {
3301 		spin_lock(&net->unx.table.locks[bucket]);
3302 
3303 		sk = unix_from_bucket(seq, pos);
3304 		if (sk)
3305 			return sk;
3306 
3307 		spin_unlock(&net->unx.table.locks[bucket]);
3308 
3309 		*pos = set_bucket_offset(++bucket, 1);
3310 	}
3311 
3312 	return NULL;
3313 }
3314 
unix_get_next(struct seq_file * seq,struct sock * sk,loff_t * pos)3315 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3316 				  loff_t *pos)
3317 {
3318 	unsigned long bucket = get_bucket(*pos);
3319 
3320 	sk = sk_next(sk);
3321 	if (sk)
3322 		return sk;
3323 
3324 
3325 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3326 
3327 	*pos = set_bucket_offset(++bucket, 1);
3328 
3329 	return unix_get_first(seq, pos);
3330 }
3331 
unix_seq_start(struct seq_file * seq,loff_t * pos)3332 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3333 {
3334 	if (!*pos)
3335 		return SEQ_START_TOKEN;
3336 
3337 	return unix_get_first(seq, pos);
3338 }
3339 
unix_seq_next(struct seq_file * seq,void * v,loff_t * pos)3340 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3341 {
3342 	++*pos;
3343 
3344 	if (v == SEQ_START_TOKEN)
3345 		return unix_get_first(seq, pos);
3346 
3347 	return unix_get_next(seq, v, pos);
3348 }
3349 
unix_seq_stop(struct seq_file * seq,void * v)3350 static void unix_seq_stop(struct seq_file *seq, void *v)
3351 {
3352 	struct sock *sk = v;
3353 
3354 	if (sk)
3355 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3356 }
3357 
unix_seq_show(struct seq_file * seq,void * v)3358 static int unix_seq_show(struct seq_file *seq, void *v)
3359 {
3360 
3361 	if (v == SEQ_START_TOKEN)
3362 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3363 			 "Inode Path\n");
3364 	else {
3365 		struct sock *s = v;
3366 		struct unix_sock *u = unix_sk(s);
3367 		unix_state_lock(s);
3368 
3369 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3370 			s,
3371 			refcount_read(&s->sk_refcnt),
3372 			0,
3373 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3374 			s->sk_type,
3375 			s->sk_socket ?
3376 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3377 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3378 			sock_i_ino(s));
3379 
3380 		if (u->addr) {	// under a hash table lock here
3381 			int i, len;
3382 			seq_putc(seq, ' ');
3383 
3384 			i = 0;
3385 			len = u->addr->len -
3386 				offsetof(struct sockaddr_un, sun_path);
3387 			if (u->addr->name->sun_path[0]) {
3388 				len--;
3389 			} else {
3390 				seq_putc(seq, '@');
3391 				i++;
3392 			}
3393 			for ( ; i < len; i++)
3394 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3395 					 '@');
3396 		}
3397 		unix_state_unlock(s);
3398 		seq_putc(seq, '\n');
3399 	}
3400 
3401 	return 0;
3402 }
3403 
3404 static const struct seq_operations unix_seq_ops = {
3405 	.start  = unix_seq_start,
3406 	.next   = unix_seq_next,
3407 	.stop   = unix_seq_stop,
3408 	.show   = unix_seq_show,
3409 };
3410 
3411 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
3412 struct bpf_unix_iter_state {
3413 	struct seq_net_private p;
3414 	unsigned int cur_sk;
3415 	unsigned int end_sk;
3416 	unsigned int max_sk;
3417 	struct sock **batch;
3418 	bool st_bucket_done;
3419 };
3420 
3421 struct bpf_iter__unix {
3422 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3423 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3424 	uid_t uid __aligned(8);
3425 };
3426 
unix_prog_seq_show(struct bpf_prog * prog,struct bpf_iter_meta * meta,struct unix_sock * unix_sk,uid_t uid)3427 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3428 			      struct unix_sock *unix_sk, uid_t uid)
3429 {
3430 	struct bpf_iter__unix ctx;
3431 
3432 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3433 	ctx.meta = meta;
3434 	ctx.unix_sk = unix_sk;
3435 	ctx.uid = uid;
3436 	return bpf_iter_run_prog(prog, &ctx);
3437 }
3438 
bpf_iter_unix_hold_batch(struct seq_file * seq,struct sock * start_sk)3439 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3440 
3441 {
3442 	struct bpf_unix_iter_state *iter = seq->private;
3443 	unsigned int expected = 1;
3444 	struct sock *sk;
3445 
3446 	sock_hold(start_sk);
3447 	iter->batch[iter->end_sk++] = start_sk;
3448 
3449 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3450 		if (iter->end_sk < iter->max_sk) {
3451 			sock_hold(sk);
3452 			iter->batch[iter->end_sk++] = sk;
3453 		}
3454 
3455 		expected++;
3456 	}
3457 
3458 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3459 
3460 	return expected;
3461 }
3462 
bpf_iter_unix_put_batch(struct bpf_unix_iter_state * iter)3463 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3464 {
3465 	while (iter->cur_sk < iter->end_sk)
3466 		sock_put(iter->batch[iter->cur_sk++]);
3467 }
3468 
bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state * iter,unsigned int new_batch_sz)3469 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3470 				       unsigned int new_batch_sz)
3471 {
3472 	struct sock **new_batch;
3473 
3474 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3475 			     GFP_USER | __GFP_NOWARN);
3476 	if (!new_batch)
3477 		return -ENOMEM;
3478 
3479 	bpf_iter_unix_put_batch(iter);
3480 	kvfree(iter->batch);
3481 	iter->batch = new_batch;
3482 	iter->max_sk = new_batch_sz;
3483 
3484 	return 0;
3485 }
3486 
bpf_iter_unix_batch(struct seq_file * seq,loff_t * pos)3487 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3488 					loff_t *pos)
3489 {
3490 	struct bpf_unix_iter_state *iter = seq->private;
3491 	unsigned int expected;
3492 	bool resized = false;
3493 	struct sock *sk;
3494 
3495 	if (iter->st_bucket_done)
3496 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3497 
3498 again:
3499 	/* Get a new batch */
3500 	iter->cur_sk = 0;
3501 	iter->end_sk = 0;
3502 
3503 	sk = unix_get_first(seq, pos);
3504 	if (!sk)
3505 		return NULL; /* Done */
3506 
3507 	expected = bpf_iter_unix_hold_batch(seq, sk);
3508 
3509 	if (iter->end_sk == expected) {
3510 		iter->st_bucket_done = true;
3511 		return sk;
3512 	}
3513 
3514 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3515 		resized = true;
3516 		goto again;
3517 	}
3518 
3519 	return sk;
3520 }
3521 
bpf_iter_unix_seq_start(struct seq_file * seq,loff_t * pos)3522 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3523 {
3524 	if (!*pos)
3525 		return SEQ_START_TOKEN;
3526 
3527 	/* bpf iter does not support lseek, so it always
3528 	 * continue from where it was stop()-ped.
3529 	 */
3530 	return bpf_iter_unix_batch(seq, pos);
3531 }
3532 
bpf_iter_unix_seq_next(struct seq_file * seq,void * v,loff_t * pos)3533 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3534 {
3535 	struct bpf_unix_iter_state *iter = seq->private;
3536 	struct sock *sk;
3537 
3538 	/* Whenever seq_next() is called, the iter->cur_sk is
3539 	 * done with seq_show(), so advance to the next sk in
3540 	 * the batch.
3541 	 */
3542 	if (iter->cur_sk < iter->end_sk)
3543 		sock_put(iter->batch[iter->cur_sk++]);
3544 
3545 	++*pos;
3546 
3547 	if (iter->cur_sk < iter->end_sk)
3548 		sk = iter->batch[iter->cur_sk];
3549 	else
3550 		sk = bpf_iter_unix_batch(seq, pos);
3551 
3552 	return sk;
3553 }
3554 
bpf_iter_unix_seq_show(struct seq_file * seq,void * v)3555 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3556 {
3557 	struct bpf_iter_meta meta;
3558 	struct bpf_prog *prog;
3559 	struct sock *sk = v;
3560 	uid_t uid;
3561 	bool slow;
3562 	int ret;
3563 
3564 	if (v == SEQ_START_TOKEN)
3565 		return 0;
3566 
3567 	slow = lock_sock_fast(sk);
3568 
3569 	if (unlikely(sk_unhashed(sk))) {
3570 		ret = SEQ_SKIP;
3571 		goto unlock;
3572 	}
3573 
3574 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3575 	meta.seq = seq;
3576 	prog = bpf_iter_get_info(&meta, false);
3577 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3578 unlock:
3579 	unlock_sock_fast(sk, slow);
3580 	return ret;
3581 }
3582 
bpf_iter_unix_seq_stop(struct seq_file * seq,void * v)3583 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3584 {
3585 	struct bpf_unix_iter_state *iter = seq->private;
3586 	struct bpf_iter_meta meta;
3587 	struct bpf_prog *prog;
3588 
3589 	if (!v) {
3590 		meta.seq = seq;
3591 		prog = bpf_iter_get_info(&meta, true);
3592 		if (prog)
3593 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3594 	}
3595 
3596 	if (iter->cur_sk < iter->end_sk)
3597 		bpf_iter_unix_put_batch(iter);
3598 }
3599 
3600 static const struct seq_operations bpf_iter_unix_seq_ops = {
3601 	.start	= bpf_iter_unix_seq_start,
3602 	.next	= bpf_iter_unix_seq_next,
3603 	.stop	= bpf_iter_unix_seq_stop,
3604 	.show	= bpf_iter_unix_seq_show,
3605 };
3606 #endif
3607 #endif
3608 
3609 static const struct net_proto_family unix_family_ops = {
3610 	.family = PF_UNIX,
3611 	.create = unix_create,
3612 	.owner	= THIS_MODULE,
3613 };
3614 
3615 
unix_net_init(struct net * net)3616 static int __net_init unix_net_init(struct net *net)
3617 {
3618 	int i;
3619 
3620 	net->unx.sysctl_max_dgram_qlen = 10;
3621 	if (unix_sysctl_register(net))
3622 		goto out;
3623 
3624 #ifdef CONFIG_PROC_FS
3625 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3626 			     sizeof(struct seq_net_private)))
3627 		goto err_sysctl;
3628 #endif
3629 
3630 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3631 					      sizeof(spinlock_t), GFP_KERNEL);
3632 	if (!net->unx.table.locks)
3633 		goto err_proc;
3634 
3635 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3636 						sizeof(struct hlist_head),
3637 						GFP_KERNEL);
3638 	if (!net->unx.table.buckets)
3639 		goto free_locks;
3640 
3641 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3642 		spin_lock_init(&net->unx.table.locks[i]);
3643 		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3644 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3645 	}
3646 
3647 	return 0;
3648 
3649 free_locks:
3650 	kvfree(net->unx.table.locks);
3651 err_proc:
3652 #ifdef CONFIG_PROC_FS
3653 	remove_proc_entry("unix", net->proc_net);
3654 err_sysctl:
3655 #endif
3656 	unix_sysctl_unregister(net);
3657 out:
3658 	return -ENOMEM;
3659 }
3660 
unix_net_exit(struct net * net)3661 static void __net_exit unix_net_exit(struct net *net)
3662 {
3663 	kvfree(net->unx.table.buckets);
3664 	kvfree(net->unx.table.locks);
3665 	unix_sysctl_unregister(net);
3666 	remove_proc_entry("unix", net->proc_net);
3667 }
3668 
3669 static struct pernet_operations unix_net_ops = {
3670 	.init = unix_net_init,
3671 	.exit = unix_net_exit,
3672 };
3673 
3674 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(unix,struct bpf_iter_meta * meta,struct unix_sock * unix_sk,uid_t uid)3675 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3676 		     struct unix_sock *unix_sk, uid_t uid)
3677 
3678 #define INIT_BATCH_SZ 16
3679 
3680 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3681 {
3682 	struct bpf_unix_iter_state *iter = priv_data;
3683 	int err;
3684 
3685 	err = bpf_iter_init_seq_net(priv_data, aux);
3686 	if (err)
3687 		return err;
3688 
3689 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3690 	if (err) {
3691 		bpf_iter_fini_seq_net(priv_data);
3692 		return err;
3693 	}
3694 
3695 	return 0;
3696 }
3697 
bpf_iter_fini_unix(void * priv_data)3698 static void bpf_iter_fini_unix(void *priv_data)
3699 {
3700 	struct bpf_unix_iter_state *iter = priv_data;
3701 
3702 	bpf_iter_fini_seq_net(priv_data);
3703 	kvfree(iter->batch);
3704 }
3705 
3706 static const struct bpf_iter_seq_info unix_seq_info = {
3707 	.seq_ops		= &bpf_iter_unix_seq_ops,
3708 	.init_seq_private	= bpf_iter_init_unix,
3709 	.fini_seq_private	= bpf_iter_fini_unix,
3710 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3711 };
3712 
3713 static const struct bpf_func_proto *
bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,const struct bpf_prog * prog)3714 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3715 			     const struct bpf_prog *prog)
3716 {
3717 	switch (func_id) {
3718 	case BPF_FUNC_setsockopt:
3719 		return &bpf_sk_setsockopt_proto;
3720 	case BPF_FUNC_getsockopt:
3721 		return &bpf_sk_getsockopt_proto;
3722 	default:
3723 		return NULL;
3724 	}
3725 }
3726 
3727 static struct bpf_iter_reg unix_reg_info = {
3728 	.target			= "unix",
3729 	.ctx_arg_info_size	= 1,
3730 	.ctx_arg_info		= {
3731 		{ offsetof(struct bpf_iter__unix, unix_sk),
3732 		  PTR_TO_BTF_ID_OR_NULL },
3733 	},
3734 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3735 	.seq_info		= &unix_seq_info,
3736 };
3737 
bpf_iter_register(void)3738 static void __init bpf_iter_register(void)
3739 {
3740 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3741 	if (bpf_iter_reg_target(&unix_reg_info))
3742 		pr_warn("Warning: could not register bpf iterator unix\n");
3743 }
3744 #endif
3745 
af_unix_init(void)3746 static int __init af_unix_init(void)
3747 {
3748 	int i, rc = -1;
3749 
3750 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3751 
3752 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3753 		spin_lock_init(&bsd_socket_locks[i]);
3754 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3755 	}
3756 
3757 	rc = proto_register(&unix_dgram_proto, 1);
3758 	if (rc != 0) {
3759 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3760 		goto out;
3761 	}
3762 
3763 	rc = proto_register(&unix_stream_proto, 1);
3764 	if (rc != 0) {
3765 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3766 		proto_unregister(&unix_dgram_proto);
3767 		goto out;
3768 	}
3769 
3770 	sock_register(&unix_family_ops);
3771 	register_pernet_subsys(&unix_net_ops);
3772 	unix_bpf_build_proto();
3773 
3774 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3775 	bpf_iter_register();
3776 #endif
3777 
3778 out:
3779 	return rc;
3780 }
3781 
af_unix_exit(void)3782 static void __exit af_unix_exit(void)
3783 {
3784 	sock_unregister(PF_UNIX);
3785 	proto_unregister(&unix_dgram_proto);
3786 	proto_unregister(&unix_stream_proto);
3787 	unregister_pernet_subsys(&unix_net_ops);
3788 }
3789 
3790 /* Earlier than device_initcall() so that other drivers invoking
3791    request_module() don't end up in a loop when modprobe tries
3792    to use a UNIX socket. But later than subsys_initcall() because
3793    we depend on stuff initialised there */
3794 fs_initcall(af_unix_init);
3795 module_exit(af_unix_exit);
3796 
3797 MODULE_LICENSE("GPL");
3798 MODULE_ALIAS_NETPROTO(PF_UNIX);
3799