1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * NET4: Implementation of BSD Unix domain sockets.
4 *
5 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk>
6 *
7 * Fixes:
8 * Linus Torvalds : Assorted bug cures.
9 * Niibe Yutaka : async I/O support.
10 * Carsten Paeth : PF_UNIX check, address fixes.
11 * Alan Cox : Limit size of allocated blocks.
12 * Alan Cox : Fixed the stupid socketpair bug.
13 * Alan Cox : BSD compatibility fine tuning.
14 * Alan Cox : Fixed a bug in connect when interrupted.
15 * Alan Cox : Sorted out a proper draft version of
16 * file descriptor passing hacked up from
17 * Mike Shaver's work.
18 * Marty Leisner : Fixes to fd passing
19 * Nick Nevin : recvmsg bugfix.
20 * Alan Cox : Started proper garbage collector
21 * Heiko EiBfeldt : Missing verify_area check
22 * Alan Cox : Started POSIXisms
23 * Andreas Schwab : Replace inode by dentry for proper
24 * reference counting
25 * Kirk Petersen : Made this a module
26 * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
27 * Lots of bug fixes.
28 * Alexey Kuznetosv : Repaired (I hope) bugs introduces
29 * by above two patches.
30 * Andrea Arcangeli : If possible we block in connect(2)
31 * if the max backlog of the listen socket
32 * is been reached. This won't break
33 * old apps and it will avoid huge amount
34 * of socks hashed (this for unix_gc()
35 * performances reasons).
36 * Security fix that limits the max
37 * number of socks to 2*max_files and
38 * the number of skb queueable in the
39 * dgram receiver.
40 * Artur Skawina : Hash function optimizations
41 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
42 * Malcolm Beattie : Set peercred for socketpair
43 * Michal Ostrowski : Module initialization cleanup.
44 * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
45 * the core infrastructure is doing that
46 * for all net proto families now (2.5.69+)
47 *
48 * Known differences from reference BSD that was tested:
49 *
50 * [TO FIX]
51 * ECONNREFUSED is not returned from one end of a connected() socket to the
52 * other the moment one end closes.
53 * fstat() doesn't return st_dev=0, and give the blksize as high water mark
54 * and a fake inode identifier (nor the BSD first socket fstat twice bug).
55 * [NOT TO FIX]
56 * accept() returns a path name even if the connecting socket has closed
57 * in the meantime (BSD loses the path and gives up).
58 * accept() returns 0 length path for an unbound connector. BSD returns 16
59 * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
61 * BSD af_unix apparently has connect forgetting to block properly.
62 * (need to check this with the POSIX spec in detail)
63 *
64 * Differences from 2.0.0-11-... (ANK)
65 * Bug fixes and improvements.
66 * - client shutdown killed server socket.
67 * - removed all useless cli/sti pairs.
68 *
69 * Semantic changes/extensions.
70 * - generic control message passing.
71 * - SCM_CREDENTIALS control message.
72 * - "Abstract" (not FS based) socket bindings.
73 * Abstract names are sequences of bytes (not zero terminated)
74 * started by 0, so that this name space does not intersect
75 * with BSD names.
76 */
77
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119
120 #include "scm.h"
121
122 static atomic_long_t unix_nr_socks;
123 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
124 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
125
126 /* SMP locking strategy:
127 * hash table is protected with spinlock.
128 * each socket state is protected by separate spinlock.
129 */
130
unix_unbound_hash(struct sock * sk)131 static unsigned int unix_unbound_hash(struct sock *sk)
132 {
133 unsigned long hash = (unsigned long)sk;
134
135 hash ^= hash >> 16;
136 hash ^= hash >> 8;
137 hash ^= sk->sk_type;
138
139 return hash & UNIX_HASH_MOD;
140 }
141
unix_bsd_hash(struct inode * i)142 static unsigned int unix_bsd_hash(struct inode *i)
143 {
144 return i->i_ino & UNIX_HASH_MOD;
145 }
146
unix_abstract_hash(struct sockaddr_un * sunaddr,int addr_len,int type)147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
148 int addr_len, int type)
149 {
150 __wsum csum = csum_partial(sunaddr, addr_len, 0);
151 unsigned int hash;
152
153 hash = (__force unsigned int)csum_fold(csum);
154 hash ^= hash >> 8;
155 hash ^= type;
156
157 return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
158 }
159
unix_table_double_lock(struct net * net,unsigned int hash1,unsigned int hash2)160 static void unix_table_double_lock(struct net *net,
161 unsigned int hash1, unsigned int hash2)
162 {
163 if (hash1 == hash2) {
164 spin_lock(&net->unx.table.locks[hash1]);
165 return;
166 }
167
168 if (hash1 > hash2)
169 swap(hash1, hash2);
170
171 spin_lock(&net->unx.table.locks[hash1]);
172 spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
173 }
174
unix_table_double_unlock(struct net * net,unsigned int hash1,unsigned int hash2)175 static void unix_table_double_unlock(struct net *net,
176 unsigned int hash1, unsigned int hash2)
177 {
178 if (hash1 == hash2) {
179 spin_unlock(&net->unx.table.locks[hash1]);
180 return;
181 }
182
183 spin_unlock(&net->unx.table.locks[hash1]);
184 spin_unlock(&net->unx.table.locks[hash2]);
185 }
186
187 #ifdef CONFIG_SECURITY_NETWORK
unix_get_secdata(struct scm_cookie * scm,struct sk_buff * skb)188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
189 {
190 UNIXCB(skb).secid = scm->secid;
191 }
192
unix_set_secdata(struct scm_cookie * scm,struct sk_buff * skb)193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
194 {
195 scm->secid = UNIXCB(skb).secid;
196 }
197
unix_secdata_eq(struct scm_cookie * scm,struct sk_buff * skb)198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
199 {
200 return (scm->secid == UNIXCB(skb).secid);
201 }
202 #else
unix_get_secdata(struct scm_cookie * scm,struct sk_buff * skb)203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
204 { }
205
unix_set_secdata(struct scm_cookie * scm,struct sk_buff * skb)206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
207 { }
208
unix_secdata_eq(struct scm_cookie * scm,struct sk_buff * skb)209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
210 {
211 return true;
212 }
213 #endif /* CONFIG_SECURITY_NETWORK */
214
/* True if @osk's peer is @sk, i.e. @osk is connected back to us. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}
219
unix_may_send(struct sock * sk,struct sock * osk)220 static inline int unix_may_send(struct sock *sk, struct sock *osk)
221 {
222 return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
223 }
224
unix_recvq_full_lockless(const struct sock * sk)225 static inline int unix_recvq_full_lockless(const struct sock *sk)
226 {
227 return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
228 }
229
/* Return the peer of @s with a reference held, or NULL if unconnected.
 * The state lock makes the peer pointer read and the refcount bump atomic
 * with respect to disconnect.  Caller must sock_put() the result.
 */
struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);
242
unix_create_addr(struct sockaddr_un * sunaddr,int addr_len)243 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
244 int addr_len)
245 {
246 struct unix_address *addr;
247
248 addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
249 if (!addr)
250 return NULL;
251
252 refcount_set(&addr->refcnt, 1);
253 addr->len = addr_len;
254 memcpy(addr->name, sunaddr, addr_len);
255
256 return addr;
257 }
258
unix_release_addr(struct unix_address * addr)259 static inline void unix_release_addr(struct unix_address *addr)
260 {
261 if (refcount_dec_and_test(&addr->refcnt))
262 kfree(addr);
263 }
264
265 /*
266 * Check unix socket name:
267 * - should be not zero length.
268 * - if started by not zero, should be NULL terminated (FS object)
269 * - if started by zero, it is abstract name.
270 */
271
unix_validate_addr(struct sockaddr_un * sunaddr,int addr_len)272 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
273 {
274 if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
275 addr_len > sizeof(*sunaddr))
276 return -EINVAL;
277
278 if (sunaddr->sun_family != AF_UNIX)
279 return -EINVAL;
280
281 return 0;
282 }
283
unix_mkname_bsd(struct sockaddr_un * sunaddr,int addr_len)284 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
285 {
286 struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
287 short offset = offsetof(struct sockaddr_storage, __data);
288
289 BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
290
291 /* This may look like an off by one error but it is a bit more
292 * subtle. 108 is the longest valid AF_UNIX path for a binding.
293 * sun_path[108] doesn't as such exist. However in kernel space
294 * we are guaranteed that it is a valid memory location in our
295 * kernel address buffer because syscall functions always pass
296 * a pointer of struct sockaddr_storage which has a bigger buffer
297 * than 108. Also, we must terminate sun_path for strlen() in
298 * getname_kernel().
299 */
300 addr->__data[addr_len - offset] = 0;
301
302 /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will
303 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen()
304 * know the actual buffer.
305 */
306 return strlen(addr->__data) + offset + 1;
307 }
308
/* Unlink @sk from its hash bucket; caller holds the bucket lock. */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
313
__unix_insert_socket(struct net * net,struct sock * sk)314 static void __unix_insert_socket(struct net *net, struct sock *sk)
315 {
316 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
317 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
318 }
319
__unix_set_addr_hash(struct net * net,struct sock * sk,struct unix_address * addr,unsigned int hash)320 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
321 struct unix_address *addr, unsigned int hash)
322 {
323 __unix_remove_socket(sk);
324 smp_store_release(&unix_sk(sk)->addr, addr);
325
326 sk->sk_hash = hash;
327 __unix_insert_socket(net, sk);
328 }
329
unix_remove_socket(struct net * net,struct sock * sk)330 static void unix_remove_socket(struct net *net, struct sock *sk)
331 {
332 spin_lock(&net->unx.table.locks[sk->sk_hash]);
333 __unix_remove_socket(sk);
334 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
335 }
336
unix_insert_unbound_socket(struct net * net,struct sock * sk)337 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
338 {
339 spin_lock(&net->unx.table.locks[sk->sk_hash]);
340 __unix_insert_socket(net, sk);
341 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
342 }
343
unix_insert_bsd_socket(struct sock * sk)344 static void unix_insert_bsd_socket(struct sock *sk)
345 {
346 spin_lock(&bsd_socket_locks[sk->sk_hash]);
347 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
348 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
349 }
350
unix_remove_bsd_socket(struct sock * sk)351 static void unix_remove_bsd_socket(struct sock *sk)
352 {
353 if (!hlist_unhashed(&sk->sk_bind_node)) {
354 spin_lock(&bsd_socket_locks[sk->sk_hash]);
355 __sk_del_bind_node(sk);
356 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
357
358 sk_node_init(&sk->sk_bind_node);
359 }
360 }
361
__unix_find_socket_byname(struct net * net,struct sockaddr_un * sunname,int len,unsigned int hash)362 static struct sock *__unix_find_socket_byname(struct net *net,
363 struct sockaddr_un *sunname,
364 int len, unsigned int hash)
365 {
366 struct sock *s;
367
368 sk_for_each(s, &net->unx.table.buckets[hash]) {
369 struct unix_sock *u = unix_sk(s);
370
371 if (u->addr->len == len &&
372 !memcmp(u->addr->name, sunname, len))
373 return s;
374 }
375 return NULL;
376 }
377
unix_find_socket_byname(struct net * net,struct sockaddr_un * sunname,int len,unsigned int hash)378 static inline struct sock *unix_find_socket_byname(struct net *net,
379 struct sockaddr_un *sunname,
380 int len, unsigned int hash)
381 {
382 struct sock *s;
383
384 spin_lock(&net->unx.table.locks[hash]);
385 s = __unix_find_socket_byname(net, sunname, len, hash);
386 if (s)
387 sock_hold(s);
388 spin_unlock(&net->unx.table.locks[hash]);
389 return s;
390 }
391
unix_find_socket_byinode(struct inode * i)392 static struct sock *unix_find_socket_byinode(struct inode *i)
393 {
394 unsigned int hash = unix_bsd_hash(i);
395 struct sock *s;
396
397 spin_lock(&bsd_socket_locks[hash]);
398 sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
399 struct dentry *dentry = unix_sk(s)->path.dentry;
400
401 if (dentry && d_backing_inode(dentry) == i) {
402 sock_hold(s);
403 spin_unlock(&bsd_socket_locks[hash]);
404 return s;
405 }
406 }
407 spin_unlock(&bsd_socket_locks[hash]);
408 return NULL;
409 }
410
411 /* Support code for asymmetrically connected dgram sockets
412 *
413 * If a datagram socket is connected to a socket not itself connected
414 * to the first socket (eg, /dev/log), clients may only enqueue more
415 * messages if the present receive queue of the server socket is not
416 * "too large". This means there's a second writeability condition
417 * poll and sendmsg need to test. The dgram recv code will do a wake
418 * up on the peer_wait wait queue of a socket upon reception of a
419 * datagram which needs to be propagated to sleeping would-be writers
420 * since these might not have sent anything so far. This can't be
421 * accomplished via poll_wait because the lifetime of the server
422 * socket might be less than that of its clients if these break their
423 * association with it or if the server socket is closed while clients
424 * are still connected to it and there's no way to inform "a polling
425 * implementation" that it should let go of a certain wait queue
426 *
427 * In order to propagate a wake up, a wait_queue_entry_t of the client
428 * socket is enqueued on the peer_wait queue of the server socket
429 * whose wake function does a wake_up on the ordinary client socket
430 * wait queue. This connection is established whenever a write (or
431 * poll for write) hit the flow control condition and broken when the
432 * association to the server socket is dissolved or after a wake up
433 * was relayed.
434 */
435
unix_dgram_peer_wake_relay(wait_queue_entry_t * q,unsigned mode,int flags,void * key)436 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
437 void *key)
438 {
439 struct unix_sock *u;
440 wait_queue_head_t *u_sleep;
441
442 u = container_of(q, struct unix_sock, peer_wake);
443
444 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
445 q);
446 u->peer_wake.private = NULL;
447
448 /* relaying can only happen while the wq still exists */
449 u_sleep = sk_sleep(&u->sk);
450 if (u_sleep)
451 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
452
453 return 0;
454 }
455
unix_dgram_peer_wake_connect(struct sock * sk,struct sock * other)456 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
457 {
458 struct unix_sock *u, *u_other;
459 int rc;
460
461 u = unix_sk(sk);
462 u_other = unix_sk(other);
463 rc = 0;
464 spin_lock(&u_other->peer_wait.lock);
465
466 if (!u->peer_wake.private) {
467 u->peer_wake.private = other;
468 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
469
470 rc = 1;
471 }
472
473 spin_unlock(&u_other->peer_wait.lock);
474 return rc;
475 }
476
unix_dgram_peer_wake_disconnect(struct sock * sk,struct sock * other)477 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
478 struct sock *other)
479 {
480 struct unix_sock *u, *u_other;
481
482 u = unix_sk(sk);
483 u_other = unix_sk(other);
484 spin_lock(&u_other->peer_wait.lock);
485
486 if (u->peer_wake.private == other) {
487 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
488 u->peer_wake.private = NULL;
489 }
490
491 spin_unlock(&u_other->peer_wait.lock);
492 }
493
unix_dgram_peer_wake_disconnect_wakeup(struct sock * sk,struct sock * other)494 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
495 struct sock *other)
496 {
497 unix_dgram_peer_wake_disconnect(sk, other);
498 wake_up_interruptible_poll(sk_sleep(sk),
499 EPOLLOUT |
500 EPOLLWRNORM |
501 EPOLLWRBAND);
502 }
503
504 /* preconditions:
505 * - unix_peer(sk) == other
506 * - association is stable
507 */
unix_dgram_peer_wake_me(struct sock * sk,struct sock * other)508 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
509 {
510 int connected;
511
512 connected = unix_dgram_peer_wake_connect(sk, other);
513
514 /* If other is SOCK_DEAD, we want to make sure we signal
515 * POLLOUT, such that a subsequent write() can get a
516 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
517 * to other and its full, we will hang waiting for POLLOUT.
518 */
519 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
520 return 1;
521
522 if (connected)
523 unix_dgram_peer_wake_disconnect(sk, other);
524
525 return 0;
526 }
527
unix_writable(const struct sock * sk,unsigned char state)528 static int unix_writable(const struct sock *sk, unsigned char state)
529 {
530 return state != TCP_LISTEN &&
531 (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
532 }
533
unix_write_space(struct sock * sk)534 static void unix_write_space(struct sock *sk)
535 {
536 struct socket_wq *wq;
537
538 rcu_read_lock();
539 if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
540 wq = rcu_dereference(sk->sk_wq);
541 if (skwq_has_sleeper(wq))
542 wake_up_interruptible_sync_poll(&wq->wait,
543 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
544 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
545 }
546 rcu_read_unlock();
547 }
548
549 /* When dgram socket disconnects (or changes its peer), we clear its receive
550 * queue of packets arrived from previous peer. First, it allows to do
551 * flow control based only on wmem_alloc; second, sk connected to peer
552 * may receive messages only from that peer. */
unix_dgram_disconnected(struct sock * sk,struct sock * other)553 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
554 {
555 if (!skb_queue_empty(&sk->sk_receive_queue)) {
556 skb_queue_purge(&sk->sk_receive_queue);
557 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
558
559 /* If one link of bidirectional dgram pipe is disconnected,
560 * we signal error. Messages are lost. Do not make this,
561 * when peer was not connected to us.
562 */
563 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
564 WRITE_ONCE(other->sk_err, ECONNRESET);
565 sk_error_report(other);
566 }
567 }
568 }
569
unix_sock_destructor(struct sock * sk)570 static void unix_sock_destructor(struct sock *sk)
571 {
572 struct unix_sock *u = unix_sk(sk);
573
574 skb_queue_purge(&sk->sk_receive_queue);
575
576 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
577 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
578 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
579 if (!sock_flag(sk, SOCK_DEAD)) {
580 pr_info("Attempt to release alive unix socket: %p\n", sk);
581 return;
582 }
583
584 if (u->addr)
585 unix_release_addr(u->addr);
586
587 atomic_long_dec(&unix_nr_socks);
588 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
589 #ifdef UNIX_REFCNT_DEBUG
590 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
591 atomic_long_read(&unix_nr_socks));
592 #endif
593 }
594
unix_release_sock(struct sock * sk,int embrion)595 static void unix_release_sock(struct sock *sk, int embrion)
596 {
597 struct unix_sock *u = unix_sk(sk);
598 struct sock *skpair;
599 struct sk_buff *skb;
600 struct path path;
601 int state;
602
603 unix_remove_socket(sock_net(sk), sk);
604 unix_remove_bsd_socket(sk);
605
606 /* Clear state */
607 unix_state_lock(sk);
608 sock_orphan(sk);
609 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
610 path = u->path;
611 u->path.dentry = NULL;
612 u->path.mnt = NULL;
613 state = sk->sk_state;
614 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
615
616 skpair = unix_peer(sk);
617 unix_peer(sk) = NULL;
618
619 unix_state_unlock(sk);
620
621 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
622 if (u->oob_skb) {
623 kfree_skb(u->oob_skb);
624 u->oob_skb = NULL;
625 }
626 #endif
627
628 wake_up_interruptible_all(&u->peer_wait);
629
630 if (skpair != NULL) {
631 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
632 unix_state_lock(skpair);
633 /* No more writes */
634 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
635 if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
636 WRITE_ONCE(skpair->sk_err, ECONNRESET);
637 unix_state_unlock(skpair);
638 skpair->sk_state_change(skpair);
639 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
640 }
641
642 unix_dgram_peer_wake_disconnect(sk, skpair);
643 sock_put(skpair); /* It may now die */
644 }
645
646 /* Try to flush out this socket. Throw out buffers at least */
647
648 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
649 if (state == TCP_LISTEN)
650 unix_release_sock(skb->sk, 1);
651 /* passed fds are erased in the kfree_skb hook */
652 UNIXCB(skb).consumed = skb->len;
653 kfree_skb(skb);
654 }
655
656 if (path.dentry)
657 path_put(&path);
658
659 sock_put(sk);
660
661 /* ---- Socket is dead now and most probably destroyed ---- */
662
663 /*
664 * Fixme: BSD difference: In BSD all sockets connected to us get
665 * ECONNRESET and we die on the spot. In Linux we behave
666 * like files and pipes do and wait for the last
667 * dereference.
668 *
669 * Can't we simply set sock->err?
670 *
671 * What the above comment does talk about? --ANK(980817)
672 */
673
674 if (READ_ONCE(unix_tot_inflight))
675 unix_gc(); /* Garbage collect fds */
676 }
677
init_peercred(struct sock * sk)678 static void init_peercred(struct sock *sk)
679 {
680 const struct cred *old_cred;
681 struct pid *old_pid;
682
683 spin_lock(&sk->sk_peer_lock);
684 old_pid = sk->sk_peer_pid;
685 old_cred = sk->sk_peer_cred;
686 sk->sk_peer_pid = get_pid(task_tgid(current));
687 sk->sk_peer_cred = get_current_cred();
688 spin_unlock(&sk->sk_peer_lock);
689
690 put_pid(old_pid);
691 put_cred(old_cred);
692 }
693
copy_peercred(struct sock * sk,struct sock * peersk)694 static void copy_peercred(struct sock *sk, struct sock *peersk)
695 {
696 const struct cred *old_cred;
697 struct pid *old_pid;
698
699 if (sk < peersk) {
700 spin_lock(&sk->sk_peer_lock);
701 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
702 } else {
703 spin_lock(&peersk->sk_peer_lock);
704 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
705 }
706 old_pid = sk->sk_peer_pid;
707 old_cred = sk->sk_peer_cred;
708 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
709 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
710
711 spin_unlock(&sk->sk_peer_lock);
712 spin_unlock(&peersk->sk_peer_lock);
713
714 put_pid(old_pid);
715 put_cred(old_cred);
716 }
717
unix_listen(struct socket * sock,int backlog)718 static int unix_listen(struct socket *sock, int backlog)
719 {
720 int err;
721 struct sock *sk = sock->sk;
722 struct unix_sock *u = unix_sk(sk);
723
724 err = -EOPNOTSUPP;
725 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
726 goto out; /* Only stream/seqpacket sockets accept */
727 err = -EINVAL;
728 if (!READ_ONCE(u->addr))
729 goto out; /* No listens on an unbound socket */
730 unix_state_lock(sk);
731 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
732 goto out_unlock;
733 if (backlog > sk->sk_max_ack_backlog)
734 wake_up_interruptible_all(&u->peer_wait);
735 sk->sk_max_ack_backlog = backlog;
736 WRITE_ONCE(sk->sk_state, TCP_LISTEN);
737
738 /* set credentials so connect can copy them */
739 init_peercred(sk);
740 err = 0;
741
742 out_unlock:
743 unix_state_unlock(sk);
744 out:
745 return err;
746 }
747
748 static int unix_release(struct socket *);
749 static int unix_bind(struct socket *, struct sockaddr *, int);
750 static int unix_stream_connect(struct socket *, struct sockaddr *,
751 int addr_len, int flags);
752 static int unix_socketpair(struct socket *, struct socket *);
753 static int unix_accept(struct socket *, struct socket *, int, bool);
754 static int unix_getname(struct socket *, struct sockaddr *, int);
755 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
756 static __poll_t unix_dgram_poll(struct file *, struct socket *,
757 poll_table *);
758 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
759 #ifdef CONFIG_COMPAT
760 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
761 #endif
762 static int unix_shutdown(struct socket *, int);
763 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
764 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
765 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
766 struct pipe_inode_info *, size_t size,
767 unsigned int flags);
768 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
769 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
770 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
771 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
772 static int unix_dgram_connect(struct socket *, struct sockaddr *,
773 int, int);
774 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
775 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
776 int);
777
unix_set_peek_off(struct sock * sk,int val)778 static int unix_set_peek_off(struct sock *sk, int val)
779 {
780 struct unix_sock *u = unix_sk(sk);
781
782 if (mutex_lock_interruptible(&u->iolock))
783 return -EINTR;
784
785 WRITE_ONCE(sk->sk_peek_off, val);
786 mutex_unlock(&u->iolock);
787
788 return 0;
789 }
790
#ifdef CONFIG_PROC_FS
/* Sum the in-flight SCM_RIGHTS fd counts across every embryo socket
 * queued on listener @sk's receive queue.
 */
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

/* /proc/<pid>/fdinfo hook: report how many fds are queued in-flight
 * on this socket (scm_fds line).
 */
static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary. So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif
836
837 static const struct proto_ops unix_stream_ops = {
838 .family = PF_UNIX,
839 .owner = THIS_MODULE,
840 .release = unix_release,
841 .bind = unix_bind,
842 .connect = unix_stream_connect,
843 .socketpair = unix_socketpair,
844 .accept = unix_accept,
845 .getname = unix_getname,
846 .poll = unix_poll,
847 .ioctl = unix_ioctl,
848 #ifdef CONFIG_COMPAT
849 .compat_ioctl = unix_compat_ioctl,
850 #endif
851 .listen = unix_listen,
852 .shutdown = unix_shutdown,
853 .sendmsg = unix_stream_sendmsg,
854 .recvmsg = unix_stream_recvmsg,
855 .read_skb = unix_stream_read_skb,
856 .mmap = sock_no_mmap,
857 .splice_read = unix_stream_splice_read,
858 .set_peek_off = unix_set_peek_off,
859 .show_fdinfo = unix_show_fdinfo,
860 };
861
862 static const struct proto_ops unix_dgram_ops = {
863 .family = PF_UNIX,
864 .owner = THIS_MODULE,
865 .release = unix_release,
866 .bind = unix_bind,
867 .connect = unix_dgram_connect,
868 .socketpair = unix_socketpair,
869 .accept = sock_no_accept,
870 .getname = unix_getname,
871 .poll = unix_dgram_poll,
872 .ioctl = unix_ioctl,
873 #ifdef CONFIG_COMPAT
874 .compat_ioctl = unix_compat_ioctl,
875 #endif
876 .listen = sock_no_listen,
877 .shutdown = unix_shutdown,
878 .sendmsg = unix_dgram_sendmsg,
879 .read_skb = unix_read_skb,
880 .recvmsg = unix_dgram_recvmsg,
881 .mmap = sock_no_mmap,
882 .set_peek_off = unix_set_peek_off,
883 .show_fdinfo = unix_show_fdinfo,
884 };
885
886 static const struct proto_ops unix_seqpacket_ops = {
887 .family = PF_UNIX,
888 .owner = THIS_MODULE,
889 .release = unix_release,
890 .bind = unix_bind,
891 .connect = unix_stream_connect,
892 .socketpair = unix_socketpair,
893 .accept = unix_accept,
894 .getname = unix_getname,
895 .poll = unix_dgram_poll,
896 .ioctl = unix_ioctl,
897 #ifdef CONFIG_COMPAT
898 .compat_ioctl = unix_compat_ioctl,
899 #endif
900 .listen = unix_listen,
901 .shutdown = unix_shutdown,
902 .sendmsg = unix_seqpacket_sendmsg,
903 .recvmsg = unix_seqpacket_recvmsg,
904 .mmap = sock_no_mmap,
905 .set_peek_off = unix_set_peek_off,
906 .show_fdinfo = unix_show_fdinfo,
907 };
908
static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}
915
static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}
922
unix_bpf_bypass_getsockopt(int level,int optname)923 static bool unix_bpf_bypass_getsockopt(int level, int optname)
924 {
925 if (level == SOL_SOCKET) {
926 switch (optname) {
927 case SO_PEERPIDFD:
928 return true;
929 default:
930 return false;
931 }
932 }
933
934 return false;
935 }
936
937 struct proto unix_dgram_proto = {
938 .name = "UNIX",
939 .owner = THIS_MODULE,
940 .obj_size = sizeof(struct unix_sock),
941 .close = unix_close,
942 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
943 #ifdef CONFIG_BPF_SYSCALL
944 .psock_update_sk_prot = unix_dgram_bpf_update_proto,
945 #endif
946 };
947
948 struct proto unix_stream_proto = {
949 .name = "UNIX-STREAM",
950 .owner = THIS_MODULE,
951 .obj_size = sizeof(struct unix_sock),
952 .close = unix_close,
953 .unhash = unix_unhash,
954 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
955 #ifdef CONFIG_BPF_SYSCALL
956 .psock_update_sk_prot = unix_stream_bpf_update_proto,
957 #endif
958 };
959
/* Allocate and initialise a new AF_UNIX sock of the given @type.
 * @sock may be NULL (used for the server-side peer created during a
 * stream connect).  Returns the new sock, or an ERR_PTR() on failure.
 */
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	/* Cap the number of unix socks at twice the system file limit. */
	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /*dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash		= unix_unbound_hash(sk);
	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
	sk->sk_destruct		= unix_sock_destructor;
	u = unix_sk(sk);
	u->inflight = 0;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	/* Make the sock visible in the per-netns unbound hash table. */
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	/* Undo the optimistic increment taken above. */
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}
1010
unix_create(struct net * net,struct socket * sock,int protocol,int kern)1011 static int unix_create(struct net *net, struct socket *sock, int protocol,
1012 int kern)
1013 {
1014 struct sock *sk;
1015
1016 if (protocol && protocol != PF_UNIX)
1017 return -EPROTONOSUPPORT;
1018
1019 sock->state = SS_UNCONNECTED;
1020
1021 switch (sock->type) {
1022 case SOCK_STREAM:
1023 sock->ops = &unix_stream_ops;
1024 break;
1025 /*
1026 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
1027 * nothing uses it.
1028 */
1029 case SOCK_RAW:
1030 sock->type = SOCK_DGRAM;
1031 fallthrough;
1032 case SOCK_DGRAM:
1033 sock->ops = &unix_dgram_ops;
1034 break;
1035 case SOCK_SEQPACKET:
1036 sock->ops = &unix_seqpacket_ops;
1037 break;
1038 default:
1039 return -ESOCKTNOSUPPORT;
1040 }
1041
1042 sk = unix_create1(net, sock, kern, sock->type);
1043 if (IS_ERR(sk))
1044 return PTR_ERR(sk);
1045
1046 return 0;
1047 }
1048
unix_release(struct socket * sock)1049 static int unix_release(struct socket *sock)
1050 {
1051 struct sock *sk = sock->sk;
1052
1053 if (!sk)
1054 return 0;
1055
1056 sk->sk_prot->close(sk, 0);
1057 unix_release_sock(sk, 0);
1058 sock->sk = NULL;
1059
1060 return 0;
1061 }
1062
/* Resolve a pathname (filesystem) address to its bound unix sock:
 * walk the path, require write permission and a socket inode, then
 * look the sock up by inode.  Returns a referenced sock, or an
 * ERR_PTR() (-ECONNREFUSED for non-socket/unbound paths,
 * -EPROTOTYPE on socket type mismatch).
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);	/* successful lookup updates atime */
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}
1106
/* Resolve an abstract-namespace address (leading NUL byte) to its
 * bound unix sock via the per-netns name hash.  Returns a referenced
 * sock or ERR_PTR(-ECONNREFUSED).
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	/* Only sockets also bound to a filesystem path have a dentry. */
	if (unix_sk(sk)->path.dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}
1125
unix_find_other(struct net * net,struct sockaddr_un * sunaddr,int addr_len,int type)1126 static struct sock *unix_find_other(struct net *net,
1127 struct sockaddr_un *sunaddr,
1128 int addr_len, int type)
1129 {
1130 struct sock *sk;
1131
1132 if (sunaddr->sun_path[0])
1133 sk = unix_find_bsd(sunaddr, addr_len, type);
1134 else
1135 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1136
1137 return sk;
1138 }
1139
/* Bind @sk to an autogenerated abstract address of the form
 * "\0XXXXX" (five hex digits).  Returns 0 on success (or if another
 * thread bound the socket first) or a negative errno.
 */
static int unix_autobind(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	/* Already bound by a racing bind/autobind; nothing to do. */
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	/* "\0" plus five hex digits occupies 6 bytes of sun_path. */
	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	old_hash = sk->sk_hash;
	/* Probe 20-bit names starting from a random point. */
	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seems to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	/* Publish the address and rehash the sock atomically w.r.t.
	 * the two table buckets held above.
	 */
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
1201
/* Bind @sk to a filesystem (pathname) address: mknod the socket inode,
 * then hash the sock by that inode.  Returns 0 or a negative errno;
 * an already-existing path maps -EEXIST to -EADDRINUSE.
 */
static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;	/* raced with another bind */

	old_hash = sk->sk_hash;
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	/* Pin the mount and dentry for the lifetime of the binding. */
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}
1270
/* Bind @sk to an abstract-namespace address.  Fails with -EADDRINUSE
 * if the name is taken, -EINVAL if the socket is already bound.
 */
static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	old_hash = sk->sk_hash;
	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	/* Hold both the old and new buckets so the name check and the
	 * rehash below are atomic w.r.t. concurrent binds.
	 */
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}
1314
unix_bind(struct socket * sock,struct sockaddr * uaddr,int addr_len)1315 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1316 {
1317 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1318 struct sock *sk = sock->sk;
1319 int err;
1320
1321 if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1322 sunaddr->sun_family == AF_UNIX)
1323 return unix_autobind(sk);
1324
1325 err = unix_validate_addr(sunaddr, addr_len);
1326 if (err)
1327 return err;
1328
1329 if (sunaddr->sun_path[0])
1330 err = unix_bind_bsd(sk, sunaddr, addr_len);
1331 else
1332 err = unix_bind_abstract(sk, sunaddr, addr_len);
1333
1334 return err;
1335 }
1336
/* Take the state locks of two socks in a consistent (pointer) order
 * so concurrent double-locks cannot ABBA-deadlock.  With @sk2 NULL or
 * equal to @sk1, only @sk1 is locked.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	/* Always lock the lower-addressed sock first. */
	if (sk1 > sk2)
		swap(sk1, sk2);

	unix_state_lock(sk1);
	unix_state_lock_nested(sk2, U_LOCK_SECOND);
}
1349
/* Release the locks taken by unix_state_double_lock(); unlock order
 * does not matter, and the single-lock cases mirror the lock side.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}
1359
/* connect(2) for datagram-style sockets: associate @sk with the peer
 * named by @addr, or break an existing association when the family is
 * AF_UNSPEC (1003.1g).  Returns 0 or a negative errno.
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		/* Credential passing requires a bound local address. */
		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !READ_ONCE(unix_sk(sk)->addr)) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
	} else {
		/*
		 * 1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer) {
			unix_dgram_disconnected(sk, old_peer);

			/* Drop the old peer back to TCP_CLOSE if it no
			 * longer points at anyone either.
			 */
			unix_state_lock(old_peer);
			if (!unix_peer(old_peer))
				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
			unix_state_unlock(old_peer);
		}

		/* Release the reference the old association held. */
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
1455
/* Sleep for up to @timeo until @other's receive queue drains, @other
 * dies, or @other shuts down receiving.  Called with @other's state
 * lock held; the lock is dropped before sleeping and NOT re-taken.
 * Returns the remaining timeout.
 */
static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	/* Only actually sleep while the blocking condition still holds. */
	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
1477
/* connect(2) for SOCK_STREAM/SOCK_SEQPACKET: create an embryonic
 * server-side sock, link it to @sk, and queue it (as an skb) on the
 * listener's receive queue for unix_accept() to pick up.  Returns 0 or
 * a negative errno.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	/* Credential passing requires a bound local address. */
	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full_lockless(other)) {
		/* Listener backlog is full: fail now if non-blocking,
		 * otherwise wait for room and retry from the top.
		 */
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	switch (READ_ONCE(sk->sk_state)) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk, U_LOCK_SECOND);

	/* Recheck under the lock; a racing connect may have won. */
	if (sk->sk_state != TCP_CLOSE) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1669
unix_socketpair(struct socket * socka,struct socket * sockb)1670 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1671 {
1672 struct sock *ska = socka->sk, *skb = sockb->sk;
1673
1674 /* Join our sockets back to back */
1675 sock_hold(ska);
1676 sock_hold(skb);
1677 unix_peer(ska) = skb;
1678 unix_peer(skb) = ska;
1679 init_peercred(ska);
1680 init_peercred(skb);
1681
1682 ska->sk_state = TCP_ESTABLISHED;
1683 skb->sk_state = TCP_ESTABLISHED;
1684 socka->state = SS_CONNECTED;
1685 sockb->state = SS_CONNECTED;
1686 return 0;
1687 }
1688
unix_sock_inherit_flags(const struct socket * old,struct socket * new)1689 static void unix_sock_inherit_flags(const struct socket *old,
1690 struct socket *new)
1691 {
1692 if (test_bit(SOCK_PASSCRED, &old->flags))
1693 set_bit(SOCK_PASSCRED, &new->flags);
1694 if (test_bit(SOCK_PASSPIDFD, &old->flags))
1695 set_bit(SOCK_PASSPIDFD, &new->flags);
1696 if (test_bit(SOCK_PASSSEC, &old->flags))
1697 set_bit(SOCK_PASSSEC, &new->flags);
1698 }
1699
/* accept(2): dequeue an embryonic peer sock (queued as an skb by
 * unix_stream_connect()) from the listener's receive queue and graft
 * it onto @newsock.  Returns 0 or a negative errno.
 */
static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	/* The skb's owning sock is the embryonic connection. */
	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	/* Backlog drained by one slot: wake a blocked connector. */
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}
1744
1745
/* getsockname(2)/getpeername(2): report the local (@peer == 0) or peer
 * (@peer != 0) address.  Returns the address length on success, or
 * -ENOTCONN when the peer of an unconnected socket is requested.
 */
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	/* Pairs with smp_store_release() in the bind paths: once addr
	 * is seen non-NULL, its contents are fully initialised.
	 */
	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		/* Unbound socket: report an empty AF_UNIX address. */
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);
	}
	sock_put(sk);
out:
	return err;
}
1777
/* Duplicate the skb's in-flight fd list into @scm for MSG_PEEK,
 * synchronising with the unix garbage collector as explained below.
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	/*
	 * Garbage collection of unix sockets starts by selecting a set of
	 * candidate sockets which have reference only from being in flight
	 * (total_refs == inflight_refs).  This condition is checked once during
	 * the candidate collection phase, and candidates are marked as such, so
	 * that non-candidates can later be ignored.  While inflight_refs is
	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
	 * is an instantaneous decision.
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operations that changes the file count through file descriptors
	 * (dup, close, sendmsg) does not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd.  The following lock/unlock
	 * pair is to ensure serialization with garbage collection.  It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate.  If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
	 */
	spin_lock(&unix_gc_lock);
	spin_unlock(&unix_gc_lock);
}
1824
/* Stamp an outgoing skb with the sender's control-message state:
 * pid/uid/gid, security data, and (when @send_fds is set) any passed
 * file descriptors.  Returns 0 or a negative errno from fd attach.
 */
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int rc = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);

	if (send_fds && scm->fp)
		rc = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;

	return rc;
}
1840
unix_passcred_enabled(const struct socket * sock,const struct sock * other)1841 static bool unix_passcred_enabled(const struct socket *sock,
1842 const struct sock *other)
1843 {
1844 return test_bit(SOCK_PASSCRED, &sock->flags) ||
1845 test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1846 !other->sk_socket ||
1847 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1848 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1849 }
1850
1851 /*
1852 * Some apps rely on write() giving SCM_CREDENTIALS
1853 * We include credentials if source or destination socket
1854 * asserted SOCK_PASSCRED.
1855 */
maybe_add_creds(struct sk_buff * skb,const struct socket * sock,const struct sock * other)1856 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1857 const struct sock *other)
1858 {
1859 if (UNIXCB(skb).pid)
1860 return;
1861 if (unix_passcred_enabled(sock, other)) {
1862 UNIXCB(skb).pid = get_pid(task_tgid(current));
1863 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1864 }
1865 }
1866
unix_skb_scm_eq(struct sk_buff * skb,struct scm_cookie * scm)1867 static bool unix_skb_scm_eq(struct sk_buff *skb,
1868 struct scm_cookie *scm)
1869 {
1870 return UNIXCB(skb).pid == scm->pid &&
1871 uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1872 gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1873 unix_secdata_eq(scm, skb);
1874 }
1875
scm_stat_add(struct sock * sk,struct sk_buff * skb)1876 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1877 {
1878 struct scm_fp_list *fp = UNIXCB(skb).fp;
1879 struct unix_sock *u = unix_sk(sk);
1880
1881 if (unlikely(fp && fp->count))
1882 atomic_add(fp->count, &u->scm_stat.nr_fds);
1883 }
1884
scm_stat_del(struct sock * sk,struct sk_buff * skb)1885 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1886 {
1887 struct scm_fp_list *fp = UNIXCB(skb).fp;
1888 struct unix_sock *u = unix_sk(sk);
1889
1890 if (unlikely(fp && fp->count))
1891 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1892 }
1893
1894 /*
1895 * Send AF_UNIX data.
1896 */
1897
/* sendmsg(2) for SOCK_DGRAM (and the dgram side of SOCK_SEQPACKET):
 * build one skb, resolve the destination (explicit address or the
 * connected peer), and queue it on the receiver.  Returns the number
 * of bytes sent or a negative errno.
 */
static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;
	} else {
		/* No address supplied: must be connected. */
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	/* Credential passing requires a bound local address. */
	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > READ_ONCE(sk->sk_sndbuf) - 32)
		goto out;

	/* Put anything beyond SKB_MAX_ALLOC in page fragments. */
	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
			 */
			unix_state_unlock(sk);
			err = -EPIPE;
		} else if (unix_peer(sk) == other) {
			/* Our connected peer died: disconnect. */
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			/* Blocking send: wait for queue room, then retry. */
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		/* Non-blocking: take both state locks so the wake-up
		 * registration below cannot race with the receiver.
		 */
		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}
2110
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.  With 4 KiB pages this yields an
 * order-3 (32 KiB) page-fragment budget per skb; larger page sizes
 * round up to at least one page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2115
2116 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/* Queue the single out-of-band byte of an MSG_OOB stream sendmsg() on
 * @other's receive queue.  Called after all in-band data has been sent.
 * Returns 0 on success or a negative errno.
 */
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
		     struct scm_cookie *scm, bool fds_sent)
{
	struct unix_sock *ousk = unix_sk(other);
	struct sk_buff *skb;
	int err = 0;

	/* OOB data is exactly one byte; allocate a 1-byte skb for it. */
	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);

	if (!skb)
		return err;

	/* Attach creds; fds go out only if none were sent earlier in this
	 * sendmsg() call (!fds_sent).
	 */
	err = unix_scm_to_skb(scm, skb, !fds_sent);
	if (err < 0) {
		kfree_skb(skb);
		return err;
	}
	skb_put(skb, 1);
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);

	if (err) {
		kfree_skb(skb);
		return err;
	}

	unix_state_lock(other);

	/* Receiver gone or shut down for reading: drop the byte. */
	if (sock_flag(other, SOCK_DEAD) ||
	    (other->sk_shutdown & RCV_SHUTDOWN)) {
		unix_state_unlock(other);
		kfree_skb(skb);
		return -EPIPE;
	}

	maybe_add_creds(skb, sock, other);
	/* Take an extra reference: one for the receive queue, one for the
	 * oob_skb pointer published below.
	 */
	skb_get(skb);

	scm_stat_add(other, skb);

	spin_lock(&other->sk_receive_queue.lock);
	/* A previously pending, unread OOB byte loses its urgent status;
	 * drop the oob_skb pointer's reference to it.
	 */
	if (ousk->oob_skb)
		consume_skb(ousk->oob_skb);
	WRITE_ONCE(ousk->oob_skb, skb);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);

	/* Raise SIGURG before waking readers. */
	sk_send_sigurg(other);
	unix_state_unlock(other);
	other->sk_data_ready(other);

	return err;
}
2169 #endif
2170
/* Send data on a connected SOCK_STREAM AF_UNIX socket.  The payload is
 * split into skbs, each carrying the sender's credentials; file
 * descriptors in @msg's control data ride only on the first skb.
 * Returns bytes sent (possibly fewer than @len on a partial send), or a
 * negative errno when nothing was sent.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	/* Don't let fd-carrying sends race the unix fd garbage collector. */
	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		/* The final byte goes out-of-band via queue_oob() after the
		 * in-band loop; exclude it here.  MSG_OOB with an empty
		 * payload is rejected.
		 */
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	if (msg->msg_namelen) {
		/* Stream sends never take a destination address. */
		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			/* Zero-copy: the skb gets the caller's pages linked
			 * in below, so allocate no data area here.
			 */
			skb = sock_alloc_send_pskb(sk, 0, 0,
						   msg->msg_flags & MSG_DONTWAIT,
						   &err, 0);
		} else {
			/* Keep two messages in the pipe so it schedules better */
			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);

			/* allow fallback to order-0 allocations */
			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

			/* Bytes beyond the linear head go into page frags. */
			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
						   msg->msg_flags & MSG_DONTWAIT, &err,
						   get_order(UNIX_SKB_FRAGS_SZ));
		}
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
						   sk->sk_allocation);
			if (err < 0) {
				kfree_skb(skb);
				goto out_err;
			}
			size = err;
			/* Spliced pages weren't charged at alloc time. */
			refcount_add(size, &sk->sk_wmem_alloc);
		} else {
			skb_put(skb, size - data_len);
			skb->data_len = data_len;
			skb->len = size;
			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
			if (err) {
				kfree_skb(skb);
				goto out_err;
			}
		}

		unix_state_lock(other);

		/* Re-check the peer under its lock before queueing. */
		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		scm_stat_add(other, skb);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		/* Now send the byte reserved above as out-of-band data. */
		err = queue_oob(sock, msg, other, &scm, fds_sent);
		if (err)
			goto out_err;
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	/* Writing to a closed peer raises SIGPIPE unless suppressed, and
	 * only when nothing was sent yet.
	 */
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	/* A partial send still reports the bytes that made it out. */
	return sent ? : err;
}
2302
/* sendmsg() for SOCK_SEQPACKET sockets: require an established
 * connection, suppress any caller-supplied address, then reuse the
 * datagram send path.
 */
static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
				  size_t len)
{
	struct sock *sk = sock->sk;
	int err = sock_error(sk);

	if (err)
		return err;

	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
		return -ENOTCONN;

	/* SEQPACKET sends go to the connected peer only. */
	msg->msg_namelen = 0;

	return unix_dgram_sendmsg(sock, msg, len);
}
2321
/* recvmsg() for SOCK_SEQPACKET sockets: only valid once connected,
 * then identical to the datagram receive path.
 */
static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
				  size_t size, int flags)
{
	struct sock *sk = sock->sk;

	if (READ_ONCE(sk->sk_state) == TCP_ESTABLISHED)
		return unix_dgram_recvmsg(sock, msg, size, flags);

	return -ENOTCONN;
}
2332
unix_copy_addr(struct msghdr * msg,struct sock * sk)2333 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2334 {
2335 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2336
2337 if (addr) {
2338 msg->msg_namelen = addr->len;
2339 memcpy(msg->msg_name, addr->name, addr->len);
2340 }
2341 }
2342
/* Core datagram receive for SOCK_DGRAM and SOCK_SEQPACKET sockets:
 * dequeue (or peek) one skb and copy its data, source address,
 * timestamp, credentials and attached file descriptors into @msg.
 * Returns bytes copied (the full skb length when MSG_TRUNC is set in
 * @flags) or a negative errno.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		/* iolock serializes readers so peek offsets stay coherent. */
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			/* Got one; iolock stays held until out_free/out. */
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	/* Dequeuing freed receive-queue space; wake blocked senders. */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	/* Clamp to the datagram size; flag truncation otherwise. */
	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		/* Consuming read: hand over fds and rewind the peek offset
		 * by the whole datagram.
		 */
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
2449
/* recvmsg() entry point for SOCK_DGRAM sockets.  When sk_prot has been
 * replaced (e.g. by a BPF sockmap proto), defer to that proto's
 * recvmsg; otherwise use the common datagram receive path.
 */
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock *sk = sock->sk;

#ifdef CONFIG_BPF_SYSCALL
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_dgram_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return __unix_dgram_recvmsg(sk, msg, size, flags);
}
2463
/* Pop one queued skb (non-blocking) and hand it to @recv_actor.
 * Returns the actor's result, or the receive error when the queue is
 * empty.
 */
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	/* Serialize with other readers while dequeuing. */
	mutex_lock(&u->iolock);
	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
	mutex_unlock(&u->iolock);

	return skb ? recv_actor(sk, skb) : err;
}
2478
/*
 * Sleep until more data has arrived. But check for races..
 *
 * Wakes up when the receive-queue tail skb changes (new skb, or more
 * bytes appended to the old tail), or on socket error, RCV_SHUTDOWN,
 * socket death, a pending signal, or timeout expiry.  Returns the
 * remaining timeout.  Called unlocked; takes unix_state_lock(sk)
 * internally.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, state);

		/* Stop waiting as soon as the tail skb changed or grew. */
		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		/* Socket torn down while we slept; bail out. */
		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
2519
unix_skb_len(const struct sk_buff * skb)2520 static unsigned int unix_skb_len(const struct sk_buff *skb)
2521 {
2522 return skb->len - UNIXCB(skb).consumed;
2523 }
2524
/* Parameter bundle for the common stream receive loop
 * (unix_stream_read_generic()), shared by the recvmsg and splice paths.
 * recv_actor consumes up to @chunk bytes of one skb (starting @skip
 * bytes past the consumed mark) and returns bytes taken or < 0.
 */
struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;	/* destination for recvmsg-style reads */
	struct pipe_inode_info *pipe;	/* destination for splice reads */
	size_t size;		/* total bytes requested */
	int flags;		/* MSG_* flags */
	unsigned int splice_flags;	/* SPLICE_F_* flags */
};
2535
2536 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/* Read the pending out-of-band byte (MSG_OOB recv on a stream socket).
 * Fails with -EINVAL when SO_OOBINLINE is set or no OOB byte is
 * pending.  Returns 1 on success and sets MSG_OOB in the result flags.
 */
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int chunk = 1;
	struct sk_buff *oob_skb;

	/* iolock vs. other readers, state lock + queue lock to sample and
	 * update oob_skb consistently.
	 */
	mutex_lock(&u->iolock);
	unix_state_lock(sk);
	spin_lock(&sk->sk_receive_queue.lock);

	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
		spin_unlock(&sk->sk_receive_queue.lock);
		unix_state_unlock(sk);
		mutex_unlock(&u->iolock);
		return -EINVAL;
	}

	oob_skb = u->oob_skb;

	/* A consuming read clears the urgent mark; a peek keeps it and
	 * takes an extra reference for the copy below.
	 */
	if (!(state->flags & MSG_PEEK))
		WRITE_ONCE(u->oob_skb, NULL);
	else
		skb_get(oob_skb);

	spin_unlock(&sk->sk_receive_queue.lock);
	unix_state_unlock(sk);

	/* Copy the single byte out via the caller's actor. */
	chunk = state->recv_actor(oob_skb, 0, chunk, state);

	if (!(state->flags & MSG_PEEK))
		UNIXCB(oob_skb).consumed += 1;

	consume_skb(oob_skb);

	mutex_unlock(&u->iolock);

	if (chunk < 0)
		return -EFAULT;

	state->msg->msg_flags |= MSG_OOB;
	return 1;
}
2581
/* Decide how the stream reader should treat @skb relative to the
 * pending OOB byte.  Returns the skb to continue reading from, or NULL
 * when the caller should stop or re-peek.  Called with
 * unix_state_lock(sk) held.
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct unix_sock *u = unix_sk(sk);

	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
		/* Fully consumed skb (e.g. an already-read OOB byte):
		 * unlink and drop it.
		 */
		skb_unlink(skb, &sk->sk_receive_queue);
		consume_skb(skb);
		skb = NULL;
	} else {
		struct sk_buff *unlinked_skb = NULL;

		spin_lock(&sk->sk_receive_queue.lock);

		if (skb == u->oob_skb) {
			if (copied) {
				/* The OOB byte is a read barrier: return
				 * what was copied so far first.
				 */
				skb = NULL;
			} else if (!(flags & MSG_PEEK)) {
				if (sock_flag(sk, SOCK_URGINLINE)) {
					/* Inline mode: clear the urgent
					 * mark (dropping its reference) and
					 * read the byte in-band.
					 */
					WRITE_ONCE(u->oob_skb, NULL);
					consume_skb(skb);
				} else {
					/* Skip the OOB byte: unlink it and
					 * continue with the next skb; it is
					 * freed outside the queue lock.
					 */
					__skb_unlink(skb, &sk->sk_receive_queue);
					WRITE_ONCE(u->oob_skb, NULL);
					unlinked_skb = skb;
					skb = skb_peek(&sk->sk_receive_queue);
				}
			} else if (!sock_flag(sk, SOCK_URGINLINE)) {
				/* Peek past the OOB byte without consuming. */
				skb = skb_peek_next(skb, &sk->sk_receive_queue);
			}
		}

		spin_unlock(&sk->sk_receive_queue.lock);

		if (unlinked_skb) {
			/* Queue ref and oob_skb ref were both dropped above;
			 * this must be the last reference.
			 */
			WARN_ON_ONCE(skb_unref(unlinked_skb));
			kfree_skb(unlinked_skb);
		}
	}
	return skb;
}
2623 #endif
2624
/* Hand the next queued skb of a connected stream socket to @recv_actor;
 * -ENOTCONN when the socket is not established.
 */
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	if (likely(READ_ONCE(sk->sk_state) == TCP_ESTABLISHED))
		return unix_read_skb(sk, recv_actor);

	return -ENOTCONN;
}
2632
/* Common receive loop for stream sockets, driving @state->recv_actor
 * over queued skbs (used by both the recvmsg and splice paths).
 * Returns bytes delivered, or a negative errno when nothing was copied.
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		/* OOB reads take a dedicated path (or fail when disabled). */
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	/* target = minimum bytes before returning (SO_RCVLOWAT/MSG_WAITALL). */
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

again:
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb && copied) {
				/* Hit the OOB mark: return a short read. */
				unix_state_unlock(sk);
				break;
			}
		}
#endif
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			/* Drop iolock while sleeping so writers can queue. */
			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		/* Honour a leftover peek offset by walking past whole skbs. */
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		/* Hold a ref across the actor, which runs unlocked. */
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* Stop once fds were collected so they get delivered. */
			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv_unix(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
2845
unix_stream_read_actor(struct sk_buff * skb,int skip,int chunk,struct unix_stream_read_state * state)2846 static int unix_stream_read_actor(struct sk_buff *skb,
2847 int skip, int chunk,
2848 struct unix_stream_read_state *state)
2849 {
2850 int ret;
2851
2852 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2853 state->msg, chunk);
2854 return ret ?: chunk;
2855 }
2856
/* Receive on a stream socket into @msg, bypassing the socket layer's
 * struct socket wrapper (used by the BPF/sockmap path).
 */
int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
			  size_t size, int flags)
{
	struct unix_stream_read_state state;

	memset(&state, 0, sizeof(state));
	state.recv_actor = unix_stream_read_actor;
	state.socket = sk->sk_socket;
	state.msg = msg;
	state.size = size;
	state.flags = flags;

	return unix_stream_read_generic(&state, true);
}
2870
/* recvmsg() entry point for SOCK_STREAM sockets.  When sk_prot has been
 * replaced (e.g. by a BPF sockmap proto), defer to that proto's
 * recvmsg; otherwise run the common stream receive loop.
 */
static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_stream_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return unix_stream_read_generic(&state, true);
}
2891
unix_stream_splice_actor(struct sk_buff * skb,int skip,int chunk,struct unix_stream_read_state * state)2892 static int unix_stream_splice_actor(struct sk_buff *skb,
2893 int skip, int chunk,
2894 struct unix_stream_read_state *state)
2895 {
2896 return skb_splice_bits(skb, state->socket->sk,
2897 UNIXCB(skb).consumed + skip,
2898 state->pipe, chunk, state->splice_flags);
2899 }
2900
unix_stream_splice_read(struct socket * sock,loff_t * ppos,struct pipe_inode_info * pipe,size_t size,unsigned int flags)2901 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2902 struct pipe_inode_info *pipe,
2903 size_t size, unsigned int flags)
2904 {
2905 struct unix_stream_read_state state = {
2906 .recv_actor = unix_stream_splice_actor,
2907 .socket = sock,
2908 .pipe = pipe,
2909 .size = size,
2910 .splice_flags = flags,
2911 };
2912
2913 if (unlikely(*ppos))
2914 return -ESPIPE;
2915
2916 if (sock->file->f_flags & O_NONBLOCK ||
2917 flags & SPLICE_F_NONBLOCK)
2918 state.flags = MSG_DONTWAIT;
2919
2920 return unix_stream_read_generic(&state, false);
2921 }
2922
/* shutdown(2) for AF_UNIX sockets: record this end's shutdown state
 * and, for connection-oriented types, mirror the complementary state
 * onto the peer and wake it.
 */
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;
		const struct proto *prot = READ_ONCE(other->sk_prot);

		/* Let a replaced proto (if any) tear down its state. */
		if (prot->unhash)
			prot->unhash(other);
		/* Our read shutdown ends the peer's writes, and vice versa. */
		if (mode&RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode&SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
2971
/* SIOCINQ helper: readable bytes queued on @sk.  For stream and
 * seqpacket sockets this sums the unconsumed bytes of every queued skb;
 * for datagram sockets it is the length of the first datagram only.
 * Returns -EINVAL for listening sockets.
 */
long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);
2995
/* SIOCOUTQ helper: bytes still accounted to in-flight (unfreed) skbs
 * sent by @sk.
 */
long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3001
/* SIOCUNIXFILE: open the filesystem object this socket is bound to and
 * return a new O_PATH, O_CLOEXEC file descriptor for it.  Requires
 * CAP_NET_ADMIN in the socket's network namespace.
 */
static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	/* Pairs with the release store on bind; unbound sockets (and
	 * abstract-namespace ones with no dentry) have no file to open.
	 */
	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	if (!path.dentry)
		return -ENOENT;

	/* Hold our own path reference for the dentry_open() below. */
	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	path_put(&path);

	return fd;
}
3037
/* ioctl(2) handler: queue-occupancy queries (SIOCOUTQ/SIOCINQ),
 * SIOCUNIXFILE (open the bound path), and SIOCATMARK (is the next byte
 * the pending OOB byte?).
 */
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	case SIOCATMARK:
		{
			struct sk_buff *skb;
			int answ = 0;

			/* At the mark iff the queue head is the OOB skb. */
			skb = skb_peek(&sk->sk_receive_queue);
			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
				answ = 1;
			err = put_user(answ, (int __user *)arg);
		}
		break;
#endif
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}
3078
3079 #ifdef CONFIG_COMPAT
/* compat ioctl: translate the 32-bit userspace pointer argument, then
 * share the native handler.
 */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
3084 #endif
3085
/* poll(2) for stream/seqpacket sockets: lockless snapshot of readable,
 * writable, error, hangup and urgent-data state.
 */
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err))
		mask |= EPOLLERR;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	/* A pending out-of-band byte shows up as EPOLLPRI. */
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk, state))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}
3130
/* poll(2) for datagram sockets.  Like unix_poll(), but writability also
 * depends on room in the connected peer's receive queue.
 */
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk, state);
	if (writable) {
		unix_state_lock(sk);

		/* A full peer receive queue blocks us (unless the peer is
		 * connected back to us); register on the peer's wait queue
		 * so we're woken when space frees up.
		 */
		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full_lockless(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		/* Arm async space notification for when we become writable. */
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
3190
3191 #ifdef CONFIG_PROC_FS
3192
/* /proc/net/unix iteration encodes (bucket, in-bucket offset) into one
 * seq_file position: the bucket index in the high bits, a 1-based
 * offset within the bucket in the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3198
/* Return the socket at the (1-based) offset encoded in *pos within its
 * hash bucket, or NULL when the bucket has fewer entries.  The caller
 * holds the bucket's spin lock.
 */
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	unsigned long count = 0;
	struct sock *sk;

	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
	     sk; sk = sk_next(sk)) {
		if (++count == offset)
			break;
	}

	return sk;
}
3214
/* Find the first socket at or after *pos, skipping empty buckets.  On
 * success the socket's bucket spin lock is left HELD (released later by
 * unix_get_next() or unix_seq_stop()); *pos is advanced past the empty
 * buckets.
 */
static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);
	struct net *net = seq_file_net(seq);
	struct sock *sk;

	while (bucket < UNIX_HASH_SIZE) {
		spin_lock(&net->unx.table.locks[bucket]);

		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

		spin_unlock(&net->unx.table.locks[bucket]);

		/* Try the first entry of the next bucket. */
		*pos = set_bucket_offset(++bucket, 1);
	}

	return NULL;
}
3235
/* Advance to the next socket: first within the current (locked) bucket,
 * otherwise drop that bucket's lock and continue from the next bucket.
 */
static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
				  loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);

	sk = sk_next(sk);
	if (sk)
		return sk;


	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);

	*pos = set_bucket_offset(++bucket, 1);

	return unix_get_first(seq, pos);
}
3252
/* seq_file .start: position 0 emits the header line; anything else
 * resolves to a socket (with its bucket lock held on success).
 */
static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? unix_get_first(seq, pos) : SEQ_START_TOKEN;
}
3260
/* seq_file .next: step past the header token to the first socket, or
 * from one socket to the next.
 */
static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;

	return v == SEQ_START_TOKEN ? unix_get_first(seq, pos)
				    : unix_get_next(seq, v, pos);
}
3270
unix_seq_stop(struct seq_file * seq,void * v)3271 static void unix_seq_stop(struct seq_file *seq, void *v)
3272 {
3273 struct sock *sk = v;
3274
3275 if (sk)
3276 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3277 }
3278
unix_seq_show(struct seq_file * seq,void * v)3279 static int unix_seq_show(struct seq_file *seq, void *v)
3280 {
3281
3282 if (v == SEQ_START_TOKEN)
3283 seq_puts(seq, "Num RefCount Protocol Flags Type St "
3284 "Inode Path\n");
3285 else {
3286 struct sock *s = v;
3287 struct unix_sock *u = unix_sk(s);
3288 unix_state_lock(s);
3289
3290 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3291 s,
3292 refcount_read(&s->sk_refcnt),
3293 0,
3294 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3295 s->sk_type,
3296 s->sk_socket ?
3297 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3298 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3299 sock_i_ino(s));
3300
3301 if (u->addr) { // under a hash table lock here
3302 int i, len;
3303 seq_putc(seq, ' ');
3304
3305 i = 0;
3306 len = u->addr->len -
3307 offsetof(struct sockaddr_un, sun_path);
3308 if (u->addr->name->sun_path[0]) {
3309 len--;
3310 } else {
3311 seq_putc(seq, '@');
3312 i++;
3313 }
3314 for ( ; i < len; i++)
3315 seq_putc(seq, u->addr->name->sun_path[i] ?:
3316 '@');
3317 }
3318 unix_state_unlock(s);
3319 seq_putc(seq, '\n');
3320 }
3321
3322 return 0;
3323 }
3324
/* seq_file callbacks backing /proc/net/unix. */
static const struct seq_operations unix_seq_ops = {
	.start = unix_seq_start,
	.next = unix_seq_next,
	.stop = unix_seq_stop,
	.show = unix_seq_show,
};
3331
#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/* Private state of the bpf "unix" iterator: sockets are grabbed one
 * hash bucket at a time into a refcounted batch, so the bucket lock
 * does not have to be held while the bpf prog runs.
 */
struct bpf_unix_iter_state {
	struct seq_net_private p;
	unsigned int cur_sk;		/* next batch slot to show */
	unsigned int end_sk;		/* number of sockets batched */
	unsigned int max_sk;		/* capacity of @batch */
	struct sock **batch;		/* sockets pinned via sock_hold() */
	bool st_bucket_done;		/* whole bucket fit into @batch */
};

/* Context handed to a bpf iterator prog for each unix socket. */
struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	uid_t uid __aligned(8);
};
3347
/* Build the iterator context and invoke the bpf prog for one socket. */
static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			      struct unix_sock *unix_sk, uid_t uid)
{
	struct bpf_iter__unix ctx;

	/* seq_num 0 was consumed by SEQ_START_TOKEN; hide that from progs */
	meta->seq_num--;

	ctx.uid = uid;
	ctx.unix_sk = unix_sk;
	ctx.meta = meta;

	return bpf_iter_run_prog(prog, &ctx);
}
3359
/* Pin @start_sk and the rest of its bucket into iter->batch.
 *
 * Called with the bucket lock held; the lock is dropped before
 * returning.  Every batched socket holds a reference (sock_hold()),
 * released later via sock_put().  Returns the number of sockets that
 * *should* have been batched, so the caller can detect a too-small
 * batch array (iter->end_sk < return value) and retry with a larger
 * one.
 */
static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
		if (iter->end_sk < iter->max_sk) {
			sock_hold(sk);
			iter->batch[iter->end_sk++] = sk;
		}

		/* counted even when it no longer fits in the batch */
		expected++;
	}

	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);

	return expected;
}
3383
bpf_iter_unix_put_batch(struct bpf_unix_iter_state * iter)3384 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3385 {
3386 while (iter->cur_sk < iter->end_sk)
3387 sock_put(iter->batch[iter->cur_sk++]);
3388 }
3389
bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state * iter,unsigned int new_batch_sz)3390 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3391 unsigned int new_batch_sz)
3392 {
3393 struct sock **new_batch;
3394
3395 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3396 GFP_USER | __GFP_NOWARN);
3397 if (!new_batch)
3398 return -ENOMEM;
3399
3400 bpf_iter_unix_put_batch(iter);
3401 kvfree(iter->batch);
3402 iter->batch = new_batch;
3403 iter->max_sk = new_batch_sz;
3404
3405 return 0;
3406 }
3407
/* Fill iter->batch with the sockets of the bucket at *pos.
 *
 * If the previous bucket was fully consumed (st_bucket_done), *pos is
 * advanced to the next bucket first.  When the batch array turns out
 * to be too small for the bucket, it is grown once (to 3/2 of the
 * needed size) and the bucket is re-read; if allocation fails, a
 * partial batch is returned.
 *
 * NOTE(review): st_bucket_done is not cleared on the partial-batch
 * path — verify a leftover 'true' from the previous bucket cannot
 * skip the unbatched remainder.
 */
static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
					loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	if (iter->st_bucket_done)
		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;

	sk = unix_get_first(seq, pos);
	if (!sk)
		return NULL; /* Done */

	/* drops the bucket lock taken by unix_get_first() */
	expected = bpf_iter_unix_hold_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
3442
/* seq ->start for the bpf iterator.  bpf iterators cannot lseek, so a
 * non-zero *pos always resumes from where stop() left off; position
 * zero is the start token.
 */
static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? bpf_iter_unix_batch(seq, pos) : SEQ_START_TOKEN;
}
3453
/* seq ->next for the bpf iterator: release the socket just shown and
 * hand out the next batched one, refilling the batch from the next
 * bucket when the current batch is exhausted.
 */
static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;

	/* the sock at cur_sk is done with seq_show(); drop its ref */
	if (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);

	++*pos;

	if (iter->cur_sk >= iter->end_sk)
		return bpf_iter_unix_batch(seq, pos);

	return iter->batch[iter->cur_sk];
}
3475
/* seq ->show for the bpf iterator: run the prog for one batched
 * socket.  The socket is locked (fast path) so the prog sees a stable
 * view; a socket that was unhashed after batching is skipped instead
 * of shown.
 */
static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	bool slow;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		/* closed/unbound since the batch was grabbed */
		ret = SEQ_SKIP;
		goto unlock;
	}

	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = unix_prog_seq_show(prog, &meta, v, uid);
unlock:
	unlock_sock_fast(sk, slow);
	return ret;
}
3503
/* seq ->stop for the bpf iterator.  v == NULL means iteration ran to
 * completion: give the prog one final callback (unix_sk == NULL,
 * uid == 0) so it can emit a summary.  Then release any sockets still
 * pinned in the batch.
 */
static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)unix_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_unix_put_batch(iter);
}
3520
/* seq_file callbacks used by the bpf "unix" iterator target. */
static const struct seq_operations bpf_iter_unix_seq_ops = {
	.start = bpf_iter_unix_seq_start,
	.next = bpf_iter_unix_seq_next,
	.stop = bpf_iter_unix_seq_stop,
	.show = bpf_iter_unix_seq_show,
};
3527 #endif
3528 #endif
3529
/* PF_UNIX registration: routes socket(AF_UNIX, ...) to unix_create(). */
static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner = THIS_MODULE,
};
3535
3536
/* Per-netns setup: sysctl, /proc/net/unix, and the per-net socket hash
 * table (one spinlock and one hlist head per bucket).  Returns 0 or
 * -ENOMEM, unwinding already-registered pieces via the goto ladder.
 */
static int __net_init unix_net_init(struct net *net)
{
	int i;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			     sizeof(struct seq_net_private)))
		goto err_sysctl;
#endif

	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
					      sizeof(spinlock_t), GFP_KERNEL);
	if (!net->unx.table.locks)
		goto err_proc;

	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
						sizeof(struct hlist_head),
						GFP_KERNEL);
	if (!net->unx.table.buckets)
		goto free_locks;

	for (i = 0; i < UNIX_HASH_SIZE; i++) {
		spin_lock_init(&net->unx.table.locks[i]);
		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
	}

	return 0;

free_locks:
	kvfree(net->unx.table.locks);
err_proc:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("unix", net->proc_net);
err_sysctl:
	/* without CONFIG_PROC_FS, err_proc falls straight through here */
#endif
	unix_sysctl_unregister(net);
out:
	return -ENOMEM;
}
3580
/* Per-netns teardown: free the hash table and unregister the sysctl
 * and /proc entry created by unix_net_init().
 */
static void __net_exit unix_net_exit(struct net *net)
{
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}
3588
/* Hooks run on every network-namespace create/destroy. */
static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
3593
#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declares the bpf_iter__unix target metadata used by libbpf. */
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

/* Initial size of the socket batch; grown on demand (3/2 of need). */
#define INIT_BATCH_SZ 16
3599
3600 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3601 {
3602 struct bpf_unix_iter_state *iter = priv_data;
3603 int err;
3604
3605 err = bpf_iter_init_seq_net(priv_data, aux);
3606 if (err)
3607 return err;
3608
3609 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3610 if (err) {
3611 bpf_iter_fini_seq_net(priv_data);
3612 return err;
3613 }
3614
3615 return 0;
3616 }
3617
bpf_iter_fini_unix(void * priv_data)3618 static void bpf_iter_fini_unix(void *priv_data)
3619 {
3620 struct bpf_unix_iter_state *iter = priv_data;
3621
3622 bpf_iter_fini_seq_net(priv_data);
3623 kvfree(iter->batch);
3624 }
3625
/* Ties the bpf iterator's seq_ops to its private-state lifecycle. */
static const struct bpf_iter_seq_info unix_seq_info = {
	.seq_ops = &bpf_iter_unix_seq_ops,
	.init_seq_private = bpf_iter_init_unix,
	.fini_seq_private = bpf_iter_fini_unix,
	.seq_priv_size = sizeof(struct bpf_unix_iter_state),
};
3632
3633 static const struct bpf_func_proto *
bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,const struct bpf_prog * prog)3634 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3635 const struct bpf_prog *prog)
3636 {
3637 switch (func_id) {
3638 case BPF_FUNC_setsockopt:
3639 return &bpf_sk_setsockopt_proto;
3640 case BPF_FUNC_getsockopt:
3641 return &bpf_sk_getsockopt_proto;
3642 default:
3643 return NULL;
3644 }
3645 }
3646
/* Registration record for the "unix" bpf iterator target.  The
 * unix_sk context argument may be NULL (final stop() callback), hence
 * PTR_TO_BTF_ID_OR_NULL.  btf_id is filled in at init time by
 * bpf_iter_register().
 */
static struct bpf_iter_reg unix_reg_info = {
	.target = "unix",
	.ctx_arg_info_size = 1,
	.ctx_arg_info = {
		{ offsetof(struct bpf_iter__unix, unix_sk),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto = bpf_iter_unix_get_func_proto,
	.seq_info = &unix_seq_info,
};
3657
bpf_iter_register(void)3658 static void __init bpf_iter_register(void)
3659 {
3660 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3661 if (bpf_iter_reg_target(&unix_reg_info))
3662 pr_warn("Warning: could not register bpf iterator unix\n");
3663 }
3664 #endif
3665
/* Module init: set up the BSD pathname table, register the dgram and
 * stream protos, the PF_UNIX family and the pernet ops, then wire up
 * the sockmap proto and (builtin-only) the bpf iterator.
 */
static int __init af_unix_init(void)
{
	int i, rc = -1;

	/* unix_skb_parms must fit in skb->cb */
	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));

	/* bsd_socket_* tables use half as many buckets as the per-net table */
	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
		spin_lock_init(&bsd_socket_locks[i]);
		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
	}

	rc = proto_register(&unix_dgram_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	rc = proto_register(&unix_stream_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		proto_unregister(&unix_dgram_proto);
		goto out;
	}

	/* NOTE(review): sock_register()/register_pernet_subsys() return
	 * values are ignored here — confirm this matches upstream intent.
	 */
	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
	unix_bpf_build_proto();

#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif

out:
	return rc;
}
3701
/* Module exit: undo af_unix_init() — unregister the family, both
 * protos, and the pernet ops.
 */
static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_dgram_proto);
	proto_unregister(&unix_stream_proto);
	unregister_pernet_subsys(&unix_net_ops);
}
3709
/* Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket.  But later than subsys_initcall() because
 * we depend on stuff initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);
3719