1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * NET4: Implementation of BSD Unix domain sockets.
4 *
5 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk>
6 *
7 * Fixes:
8 * Linus Torvalds : Assorted bug cures.
9 * Niibe Yutaka : async I/O support.
10 * Carsten Paeth : PF_UNIX check, address fixes.
11 * Alan Cox : Limit size of allocated blocks.
12 * Alan Cox : Fixed the stupid socketpair bug.
13 * Alan Cox : BSD compatibility fine tuning.
14 * Alan Cox : Fixed a bug in connect when interrupted.
15 * Alan Cox : Sorted out a proper draft version of
16 * file descriptor passing hacked up from
17 * Mike Shaver's work.
18 * Marty Leisner : Fixes to fd passing
19 * Nick Nevin : recvmsg bugfix.
20 * Alan Cox : Started proper garbage collector
21 * Heiko EiBfeldt : Missing verify_area check
22 * Alan Cox : Started POSIXisms
23 * Andreas Schwab : Replace inode by dentry for proper
24 * reference counting
25 * Kirk Petersen : Made this a module
26 * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
27 * Lots of bug fixes.
28 * Alexey Kuznetsov : Repaired (I hope) bugs introduced
29 * by above two patches.
30 * Andrea Arcangeli : If possible we block in connect(2)
31 * if the max backlog of the listen socket
32 * has been reached. This won't break
33 * old apps and it will avoid a huge amount
34 * of hashed socks (this is for unix_gc()
35 * performance reasons).
36 * Security fix that limits the max
37 * number of socks to 2*max_files and
38 * the number of skb queueable in the
39 * dgram receiver.
40 * Artur Skawina : Hash function optimizations
41 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
42 * Malcolm Beattie : Set peercred for socketpair
43 * Michal Ostrowski : Module initialization cleanup.
44 * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
45 * the core infrastructure is doing that
46 * for all net proto families now (2.5.69+)
47 *
48 * Known differences from reference BSD that was tested:
49 *
50 * [TO FIX]
51 * ECONNREFUSED is not returned from one end of a connected() socket to the
52 * other the moment one end closes.
53 * fstat() doesn't return st_dev=0, and give the blksize as high water mark
54 * and a fake inode identifier (nor the BSD first socket fstat twice bug).
55 * [NOT TO FIX]
56 * accept() returns a path name even if the connecting socket has closed
57 * in the meantime (BSD loses the path and gives up).
58 * accept() returns 0 length path for an unbound connector. BSD returns 16
59 * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
61 * BSD af_unix apparently has connect forgetting to block properly.
62 * (need to check this with the POSIX spec in detail)
63 *
64 * Differences from 2.0.0-11-... (ANK)
65 * Bug fixes and improvements.
66 * - client shutdown killed server socket.
67 * - removed all useless cli/sti pairs.
68 *
69 * Semantic changes/extensions.
70 * - generic control message passing.
71 * - SCM_CREDENTIALS control message.
72 * - "Abstract" (not FS based) socket bindings.
73 * Abstract names are sequences of bytes (not zero terminated)
74 * starting with 0, so that this name space does not intersect
75 * with BSD names.
76 */
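/*
 * A minimal userspace-side sketch of the two address styles described above
 * (pathname vs. abstract); the path and name used here are purely
 * illustrative and not part of this file:
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *
 *	// pathname binding: sun_path is a NUL-terminated filesystem path
 *	strcpy(a.sun_path, "/tmp/example.sock");
 *	bind(fd, (struct sockaddr *)&a, sizeof(a));
 *
 *	// abstract binding: sun_path[0] == 0, the name is the bytes that
 *	// follow, and its length is given by addrlen, not by a terminator
 *	a.sun_path[0] = 0;
 *	memcpy(a.sun_path + 1, "example", 7);
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 */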
77
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119
120 static atomic_long_t unix_nr_socks;
121 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
122 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
123
124 /* SMP locking strategy:
125 * hash table is protected with spinlock.
126 * each socket state is protected by separate spinlock.
127 */
128
129 static unsigned int unix_unbound_hash(struct sock *sk)
130 {
131 unsigned long hash = (unsigned long)sk;
132
133 hash ^= hash >> 16;
134 hash ^= hash >> 8;
135 hash ^= sk->sk_type;
136
137 return hash & UNIX_HASH_MOD;
138 }
139
140 static unsigned int unix_bsd_hash(struct inode *i)
141 {
142 return i->i_ino & UNIX_HASH_MOD;
143 }
144
145 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
146 int addr_len, int type)
147 {
148 __wsum csum = csum_partial(sunaddr, addr_len, 0);
149 unsigned int hash;
150
151 hash = (__force unsigned int)csum_fold(csum);
152 hash ^= hash >> 8;
153 hash ^= type;
154
155 return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
156 }
157
158 static void unix_table_double_lock(struct net *net,
159 unsigned int hash1, unsigned int hash2)
160 {
161 if (hash1 == hash2) {
162 spin_lock(&net->unx.table.locks[hash1]);
163 return;
164 }
165
166 if (hash1 > hash2)
167 swap(hash1, hash2);
168
169 spin_lock(&net->unx.table.locks[hash1]);
170 spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
171 }
172
173 static void unix_table_double_unlock(struct net *net,
174 unsigned int hash1, unsigned int hash2)
175 {
176 if (hash1 == hash2) {
177 spin_unlock(&net->unx.table.locks[hash1]);
178 return;
179 }
180
181 spin_unlock(&net->unx.table.locks[hash1]);
182 spin_unlock(&net->unx.table.locks[hash2]);
183 }
184
185 #ifdef CONFIG_SECURITY_NETWORK
186 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
187 {
188 UNIXCB(skb).secid = scm->secid;
189 }
190
191 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
192 {
193 scm->secid = UNIXCB(skb).secid;
194 }
195
196 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
197 {
198 return (scm->secid == UNIXCB(skb).secid);
199 }
200 #else
201 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
202 { }
203
204 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
205 { }
206
207 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
208 {
209 return true;
210 }
211 #endif /* CONFIG_SECURITY_NETWORK */
212
213 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
214 {
215 return unix_peer(osk) == sk;
216 }
217
218 static inline int unix_may_send(struct sock *sk, struct sock *osk)
219 {
220 return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
221 }
222
223 static inline int unix_recvq_full_lockless(const struct sock *sk)
224 {
225 return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
226 }
227
228 struct sock *unix_peer_get(struct sock *s)
229 {
230 struct sock *peer;
231
232 unix_state_lock(s);
233 peer = unix_peer(s);
234 if (peer)
235 sock_hold(peer);
236 unix_state_unlock(s);
237 return peer;
238 }
239 EXPORT_SYMBOL_GPL(unix_peer_get);
240
241 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
242 int addr_len)
243 {
244 struct unix_address *addr;
245
246 addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
247 if (!addr)
248 return NULL;
249
250 refcount_set(&addr->refcnt, 1);
251 addr->len = addr_len;
252 memcpy(addr->name, sunaddr, addr_len);
253
254 return addr;
255 }
256
257 static inline void unix_release_addr(struct unix_address *addr)
258 {
259 if (refcount_dec_and_test(&addr->refcnt))
260 kfree(addr);
261 }
262
263 /*
264 * Check unix socket name:
265 * - should not be zero length.
266 * - if it starts with a non-zero byte, it should be NUL terminated (FS object)
267 * - if it starts with a zero byte, it is an abstract name.
268 */
269
270 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
271 {
272 if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
273 addr_len > sizeof(*sunaddr))
274 return -EINVAL;
275
276 if (sunaddr->sun_family != AF_UNIX)
277 return -EINVAL;
278
279 return 0;
280 }
281
282 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
283 {
284 struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
285 short offset = offsetof(struct sockaddr_storage, __data);
286
287 BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
288
289 /* This may look like an off by one error but it is a bit more
290 * subtle. 108 is the longest valid AF_UNIX path for a binding.
291 * sun_path[108] doesn't as such exist. However in kernel space
292 * we are guaranteed that it is a valid memory location in our
293 * kernel address buffer because syscall functions always pass
294 * a pointer of struct sockaddr_storage which has a bigger buffer
295 * than 108. Also, we must terminate sun_path for strlen() in
296 * getname_kernel().
297 */
298 addr->__data[addr_len - offset] = 0;
299
300 /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will
301 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen()
302 * know the actual buffer.
303 */
304 return strlen(addr->__data) + offset + 1;
305 }
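/*
 * Rough illustration of the termination logic above, assuming a hypothetical
 * 5-byte path "/tmp!" passed in without a trailing NUL:
 *
 *	addr_len        = offsetof(struct sockaddr_un, sun_path) + 5 = 7
 *	addr->__data[5] = 0	(writes the missing terminator)
 *	return value    = strlen("/tmp!") + 2 + 1 = 8
 *
 * i.e. family + path + NUL, which is what getname_kernel()/kern_path()
 * expect from the callers of this helper.
 */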
306
307 static void __unix_remove_socket(struct sock *sk)
308 {
309 sk_del_node_init(sk);
310 }
311
312 static void __unix_insert_socket(struct net *net, struct sock *sk)
313 {
314 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
315 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
316 }
317
318 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
319 struct unix_address *addr, unsigned int hash)
320 {
321 __unix_remove_socket(sk);
322 smp_store_release(&unix_sk(sk)->addr, addr);
323
324 sk->sk_hash = hash;
325 __unix_insert_socket(net, sk);
326 }
327
328 static void unix_remove_socket(struct net *net, struct sock *sk)
329 {
330 spin_lock(&net->unx.table.locks[sk->sk_hash]);
331 __unix_remove_socket(sk);
332 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
333 }
334
335 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
336 {
337 spin_lock(&net->unx.table.locks[sk->sk_hash]);
338 __unix_insert_socket(net, sk);
339 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
340 }
341
342 static void unix_insert_bsd_socket(struct sock *sk)
343 {
344 spin_lock(&bsd_socket_locks[sk->sk_hash]);
345 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
346 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
347 }
348
349 static void unix_remove_bsd_socket(struct sock *sk)
350 {
351 if (!hlist_unhashed(&sk->sk_bind_node)) {
352 spin_lock(&bsd_socket_locks[sk->sk_hash]);
353 __sk_del_bind_node(sk);
354 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
355
356 sk_node_init(&sk->sk_bind_node);
357 }
358 }
359
360 static struct sock *__unix_find_socket_byname(struct net *net,
361 struct sockaddr_un *sunname,
362 int len, unsigned int hash)
363 {
364 struct sock *s;
365
366 sk_for_each(s, &net->unx.table.buckets[hash]) {
367 struct unix_sock *u = unix_sk(s);
368
369 if (u->addr->len == len &&
370 !memcmp(u->addr->name, sunname, len))
371 return s;
372 }
373 return NULL;
374 }
375
376 static inline struct sock *unix_find_socket_byname(struct net *net,
377 struct sockaddr_un *sunname,
378 int len, unsigned int hash)
379 {
380 struct sock *s;
381
382 spin_lock(&net->unx.table.locks[hash]);
383 s = __unix_find_socket_byname(net, sunname, len, hash);
384 if (s)
385 sock_hold(s);
386 spin_unlock(&net->unx.table.locks[hash]);
387 return s;
388 }
389
390 static struct sock *unix_find_socket_byinode(struct inode *i)
391 {
392 unsigned int hash = unix_bsd_hash(i);
393 struct sock *s;
394
395 spin_lock(&bsd_socket_locks[hash]);
396 sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
397 struct dentry *dentry = unix_sk(s)->path.dentry;
398
399 if (dentry && d_backing_inode(dentry) == i) {
400 sock_hold(s);
401 spin_unlock(&bsd_socket_locks[hash]);
402 return s;
403 }
404 }
405 spin_unlock(&bsd_socket_locks[hash]);
406 return NULL;
407 }
408
409 /* Support code for asymmetrically connected dgram sockets
410 *
411 * If a datagram socket is connected to a socket not itself connected
412 * to the first socket (eg, /dev/log), clients may only enqueue more
413 * messages if the present receive queue of the server socket is not
414 * "too large". This means there's a second writeability condition
415 * poll and sendmsg need to test. The dgram recv code will do a wake
416 * up on the peer_wait wait queue of a socket upon reception of a
417 * datagram which needs to be propagated to sleeping would-be writers
418 * since these might not have sent anything so far. This can't be
419 * accomplished via poll_wait because the lifetime of the server
420 * socket might be less than that of its clients if these break their
421 * association with it or if the server socket is closed while clients
422 * are still connected to it and there's no way to inform "a polling
423 * implementation" that it should let go of a certain wait queue
424 *
425 * In order to propagate a wake up, a wait_queue_entry_t of the client
426 * socket is enqueued on the peer_wait queue of the server socket
427 * whose wake function does a wake_up on the ordinary client socket
428 * wait queue. This connection is established whenever a write (or
429 * poll for write) hit the flow control condition and broken when the
430 * association to the server socket is dissolved or after a wake up
431 * was relayed.
432 */
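/*
 * A rough walk-through of that relay, for hypothetical sockets A and B where
 * A is connected to B but B is not connected back to A:
 *
 *	A: poll(POLLOUT)/sendmsg() finds B's receive queue full
 *	   -> unix_dgram_peer_wake_me(A, B) enqueues A's peer_wake entry
 *	      on B's peer_wait queue
 *	B: recvmsg() dequeues a datagram and wakes B's peer_wait queue
 *	   -> unix_dgram_peer_wake_relay() drops the entry again and wakes
 *	      the sleepers on A's own socket wait queue
 */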
433
434 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
435 void *key)
436 {
437 struct unix_sock *u;
438 wait_queue_head_t *u_sleep;
439
440 u = container_of(q, struct unix_sock, peer_wake);
441
442 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
443 q);
444 u->peer_wake.private = NULL;
445
446 /* relaying can only happen while the wq still exists */
447 u_sleep = sk_sleep(&u->sk);
448 if (u_sleep)
449 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
450
451 return 0;
452 }
453
454 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
455 {
456 struct unix_sock *u, *u_other;
457 int rc;
458
459 u = unix_sk(sk);
460 u_other = unix_sk(other);
461 rc = 0;
462 spin_lock(&u_other->peer_wait.lock);
463
464 if (!u->peer_wake.private) {
465 u->peer_wake.private = other;
466 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
467
468 rc = 1;
469 }
470
471 spin_unlock(&u_other->peer_wait.lock);
472 return rc;
473 }
474
475 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
476 struct sock *other)
477 {
478 struct unix_sock *u, *u_other;
479
480 u = unix_sk(sk);
481 u_other = unix_sk(other);
482 spin_lock(&u_other->peer_wait.lock);
483
484 if (u->peer_wake.private == other) {
485 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
486 u->peer_wake.private = NULL;
487 }
488
489 spin_unlock(&u_other->peer_wait.lock);
490 }
491
492 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
493 struct sock *other)
494 {
495 unix_dgram_peer_wake_disconnect(sk, other);
496 wake_up_interruptible_poll(sk_sleep(sk),
497 EPOLLOUT |
498 EPOLLWRNORM |
499 EPOLLWRBAND);
500 }
501
502 /* preconditions:
503 * - unix_peer(sk) == other
504 * - association is stable
505 */
506 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
507 {
508 int connected;
509
510 connected = unix_dgram_peer_wake_connect(sk, other);
511
512 /* If other is SOCK_DEAD, we want to make sure we signal
513 * POLLOUT, such that a subsequent write() can get a
514 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
515 * to other and it's full, we will hang waiting for POLLOUT.
516 */
517 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
518 return 1;
519
520 if (connected)
521 unix_dgram_peer_wake_disconnect(sk, other);
522
523 return 0;
524 }
525
526 static int unix_writable(const struct sock *sk, unsigned char state)
527 {
528 return state != TCP_LISTEN &&
529 (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
530 }
531
532 static void unix_write_space(struct sock *sk)
533 {
534 struct socket_wq *wq;
535
536 rcu_read_lock();
537 if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
538 wq = rcu_dereference(sk->sk_wq);
539 if (skwq_has_sleeper(wq))
540 wake_up_interruptible_sync_poll(&wq->wait,
541 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
542 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
543 }
544 rcu_read_unlock();
545 }
546
547 /* When dgram socket disconnects (or changes its peer), we clear its receive
548 * queue of packets arrived from previous peer. First, it allows to do
549 * flow control based only on wmem_alloc; second, sk connected to peer
550 * may receive messages only from that peer. */
551 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
552 {
553 if (!skb_queue_empty(&sk->sk_receive_queue)) {
554 skb_queue_purge(&sk->sk_receive_queue);
555 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
556
557 /* If one link of bidirectional dgram pipe is disconnected,
558 * we signal error. Messages are lost. Do not do this
559 * when the peer was not connected to us.
560 */
561 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
562 WRITE_ONCE(other->sk_err, ECONNRESET);
563 sk_error_report(other);
564 }
565 }
566 }
567
568 static void unix_sock_destructor(struct sock *sk)
569 {
570 struct unix_sock *u = unix_sk(sk);
571
572 skb_queue_purge(&sk->sk_receive_queue);
573
574 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
575 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
576 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
577 if (!sock_flag(sk, SOCK_DEAD)) {
578 pr_info("Attempt to release alive unix socket: %p\n", sk);
579 return;
580 }
581
582 if (u->addr)
583 unix_release_addr(u->addr);
584
585 atomic_long_dec(&unix_nr_socks);
586 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
587 #ifdef UNIX_REFCNT_DEBUG
588 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
589 atomic_long_read(&unix_nr_socks));
590 #endif
591 }
592
593 static void unix_release_sock(struct sock *sk, int embrion)
594 {
595 struct unix_sock *u = unix_sk(sk);
596 struct sock *skpair;
597 struct sk_buff *skb;
598 struct path path;
599 int state;
600
601 unix_remove_socket(sock_net(sk), sk);
602 unix_remove_bsd_socket(sk);
603
604 /* Clear state */
605 unix_state_lock(sk);
606 sock_orphan(sk);
607 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
608 path = u->path;
609 u->path.dentry = NULL;
610 u->path.mnt = NULL;
611 state = sk->sk_state;
612 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
613
614 skpair = unix_peer(sk);
615 unix_peer(sk) = NULL;
616
617 unix_state_unlock(sk);
618
619 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
620 if (u->oob_skb) {
621 kfree_skb(u->oob_skb);
622 u->oob_skb = NULL;
623 }
624 #endif
625
626 wake_up_interruptible_all(&u->peer_wait);
627
628 if (skpair != NULL) {
629 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
630 unix_state_lock(skpair);
631 /* No more writes */
632 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
633 if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
634 WRITE_ONCE(skpair->sk_err, ECONNRESET);
635 unix_state_unlock(skpair);
636 skpair->sk_state_change(skpair);
637 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
638 }
639
640 unix_dgram_peer_wake_disconnect(sk, skpair);
641 sock_put(skpair); /* It may now die */
642 }
643
644 /* Try to flush out this socket. Throw out buffers at least */
645
646 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
647 if (state == TCP_LISTEN)
648 unix_release_sock(skb->sk, 1);
649 /* passed fds are erased in the kfree_skb hook */
650 UNIXCB(skb).consumed = skb->len;
651 kfree_skb(skb);
652 }
653
654 if (path.dentry)
655 path_put(&path);
656
657 sock_put(sk);
658
659 /* ---- Socket is dead now and most probably destroyed ---- */
660
661 /*
662 * Fixme: BSD difference: In BSD all sockets connected to us get
663 * ECONNRESET and we die on the spot. In Linux we behave
664 * like files and pipes do and wait for the last
665 * dereference.
666 *
667 * Can't we simply set sock->err?
668 *
669 * What the above comment does talk about? --ANK(980817)
670 */
671
672 if (READ_ONCE(unix_tot_inflight))
673 unix_gc(); /* Garbage collect fds */
674 }
675
676 static void init_peercred(struct sock *sk)
677 {
678 const struct cred *old_cred;
679 struct pid *old_pid;
680
681 spin_lock(&sk->sk_peer_lock);
682 old_pid = sk->sk_peer_pid;
683 old_cred = sk->sk_peer_cred;
684 sk->sk_peer_pid = get_pid(task_tgid(current));
685 sk->sk_peer_cred = get_current_cred();
686 spin_unlock(&sk->sk_peer_lock);
687
688 put_pid(old_pid);
689 put_cred(old_cred);
690 }
691
692 static void copy_peercred(struct sock *sk, struct sock *peersk)
693 {
694 if (sk < peersk) {
695 spin_lock(&sk->sk_peer_lock);
696 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
697 } else {
698 spin_lock(&peersk->sk_peer_lock);
699 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
700 }
701
702 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
703 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
704
705 spin_unlock(&sk->sk_peer_lock);
706 spin_unlock(&peersk->sk_peer_lock);
707 }
708
709 static int unix_listen(struct socket *sock, int backlog)
710 {
711 int err;
712 struct sock *sk = sock->sk;
713 struct unix_sock *u = unix_sk(sk);
714
715 err = -EOPNOTSUPP;
716 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
717 goto out; /* Only stream/seqpacket sockets accept */
718 err = -EINVAL;
719 if (!READ_ONCE(u->addr))
720 goto out; /* No listens on an unbound socket */
721 unix_state_lock(sk);
722 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
723 goto out_unlock;
724 if (backlog > sk->sk_max_ack_backlog)
725 wake_up_interruptible_all(&u->peer_wait);
726 sk->sk_max_ack_backlog = backlog;
727 WRITE_ONCE(sk->sk_state, TCP_LISTEN);
728
729 /* set credentials so connect can copy them */
730 init_peercred(sk);
731 err = 0;
732
733 out_unlock:
734 unix_state_unlock(sk);
735 out:
736 return err;
737 }
738
739 static int unix_release(struct socket *);
740 static int unix_bind(struct socket *, struct sockaddr *, int);
741 static int unix_stream_connect(struct socket *, struct sockaddr *,
742 int addr_len, int flags);
743 static int unix_socketpair(struct socket *, struct socket *);
744 static int unix_accept(struct socket *, struct socket *, int, bool);
745 static int unix_getname(struct socket *, struct sockaddr *, int);
746 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
747 static __poll_t unix_dgram_poll(struct file *, struct socket *,
748 poll_table *);
749 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
750 #ifdef CONFIG_COMPAT
751 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
752 #endif
753 static int unix_shutdown(struct socket *, int);
754 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
755 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
756 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
757 struct pipe_inode_info *, size_t size,
758 unsigned int flags);
759 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
760 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
761 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
762 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
763 static int unix_dgram_connect(struct socket *, struct sockaddr *,
764 int, int);
765 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
766 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
767 int);
768
769 static int unix_set_peek_off(struct sock *sk, int val)
770 {
771 struct unix_sock *u = unix_sk(sk);
772
773 if (mutex_lock_interruptible(&u->iolock))
774 return -EINTR;
775
776 WRITE_ONCE(sk->sk_peek_off, val);
777 mutex_unlock(&u->iolock);
778
779 return 0;
780 }
781
782 #ifdef CONFIG_PROC_FS
783 static int unix_count_nr_fds(struct sock *sk)
784 {
785 struct sk_buff *skb;
786 struct unix_sock *u;
787 int nr_fds = 0;
788
789 spin_lock(&sk->sk_receive_queue.lock);
790 skb = skb_peek(&sk->sk_receive_queue);
791 while (skb) {
792 u = unix_sk(skb->sk);
793 nr_fds += atomic_read(&u->scm_stat.nr_fds);
794 skb = skb_peek_next(skb, &sk->sk_receive_queue);
795 }
796 spin_unlock(&sk->sk_receive_queue.lock);
797
798 return nr_fds;
799 }
800
801 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
802 {
803 struct sock *sk = sock->sk;
804 unsigned char s_state;
805 struct unix_sock *u;
806 int nr_fds = 0;
807
808 if (sk) {
809 s_state = READ_ONCE(sk->sk_state);
810 u = unix_sk(sk);
811
812 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
813 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
814 * SOCK_DGRAM is ordinary. So, no lock is needed.
815 */
816 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
817 nr_fds = atomic_read(&u->scm_stat.nr_fds);
818 else if (s_state == TCP_LISTEN)
819 nr_fds = unix_count_nr_fds(sk);
820
821 seq_printf(m, "scm_fds: %u\n", nr_fds);
822 }
823 }
824 #else
825 #define unix_show_fdinfo NULL
826 #endif
827
828 static const struct proto_ops unix_stream_ops = {
829 .family = PF_UNIX,
830 .owner = THIS_MODULE,
831 .release = unix_release,
832 .bind = unix_bind,
833 .connect = unix_stream_connect,
834 .socketpair = unix_socketpair,
835 .accept = unix_accept,
836 .getname = unix_getname,
837 .poll = unix_poll,
838 .ioctl = unix_ioctl,
839 #ifdef CONFIG_COMPAT
840 .compat_ioctl = unix_compat_ioctl,
841 #endif
842 .listen = unix_listen,
843 .shutdown = unix_shutdown,
844 .sendmsg = unix_stream_sendmsg,
845 .recvmsg = unix_stream_recvmsg,
846 .read_skb = unix_stream_read_skb,
847 .mmap = sock_no_mmap,
848 .splice_read = unix_stream_splice_read,
849 .set_peek_off = unix_set_peek_off,
850 .show_fdinfo = unix_show_fdinfo,
851 };
852
853 static const struct proto_ops unix_dgram_ops = {
854 .family = PF_UNIX,
855 .owner = THIS_MODULE,
856 .release = unix_release,
857 .bind = unix_bind,
858 .connect = unix_dgram_connect,
859 .socketpair = unix_socketpair,
860 .accept = sock_no_accept,
861 .getname = unix_getname,
862 .poll = unix_dgram_poll,
863 .ioctl = unix_ioctl,
864 #ifdef CONFIG_COMPAT
865 .compat_ioctl = unix_compat_ioctl,
866 #endif
867 .listen = sock_no_listen,
868 .shutdown = unix_shutdown,
869 .sendmsg = unix_dgram_sendmsg,
870 .read_skb = unix_read_skb,
871 .recvmsg = unix_dgram_recvmsg,
872 .mmap = sock_no_mmap,
873 .set_peek_off = unix_set_peek_off,
874 .show_fdinfo = unix_show_fdinfo,
875 };
876
877 static const struct proto_ops unix_seqpacket_ops = {
878 .family = PF_UNIX,
879 .owner = THIS_MODULE,
880 .release = unix_release,
881 .bind = unix_bind,
882 .connect = unix_stream_connect,
883 .socketpair = unix_socketpair,
884 .accept = unix_accept,
885 .getname = unix_getname,
886 .poll = unix_dgram_poll,
887 .ioctl = unix_ioctl,
888 #ifdef CONFIG_COMPAT
889 .compat_ioctl = unix_compat_ioctl,
890 #endif
891 .listen = unix_listen,
892 .shutdown = unix_shutdown,
893 .sendmsg = unix_seqpacket_sendmsg,
894 .recvmsg = unix_seqpacket_recvmsg,
895 .mmap = sock_no_mmap,
896 .set_peek_off = unix_set_peek_off,
897 .show_fdinfo = unix_show_fdinfo,
898 };
899
900 static void unix_close(struct sock *sk, long timeout)
901 {
902 /* Nothing to do here, unix socket does not need a ->close().
903 * This is merely for sockmap.
904 */
905 }
906
907 static void unix_unhash(struct sock *sk)
908 {
909 /* Nothing to do here, unix socket does not need a ->unhash().
910 * This is merely for sockmap.
911 */
912 }
913
914 static bool unix_bpf_bypass_getsockopt(int level, int optname)
915 {
916 if (level == SOL_SOCKET) {
917 switch (optname) {
918 case SO_PEERPIDFD:
919 return true;
920 default:
921 return false;
922 }
923 }
924
925 return false;
926 }
927
928 struct proto unix_dgram_proto = {
929 .name = "UNIX",
930 .owner = THIS_MODULE,
931 .obj_size = sizeof(struct unix_sock),
932 .close = unix_close,
933 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
934 #ifdef CONFIG_BPF_SYSCALL
935 .psock_update_sk_prot = unix_dgram_bpf_update_proto,
936 #endif
937 };
938
939 struct proto unix_stream_proto = {
940 .name = "UNIX-STREAM",
941 .owner = THIS_MODULE,
942 .obj_size = sizeof(struct unix_sock),
943 .close = unix_close,
944 .unhash = unix_unhash,
945 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
946 #ifdef CONFIG_BPF_SYSCALL
947 .psock_update_sk_prot = unix_stream_bpf_update_proto,
948 #endif
949 };
950
951 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
952 {
953 struct unix_sock *u;
954 struct sock *sk;
955 int err;
956
957 atomic_long_inc(&unix_nr_socks);
958 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
959 err = -ENFILE;
960 goto err;
961 }
962
963 if (type == SOCK_STREAM)
964 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
965 else /*dgram and seqpacket */
966 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
967
968 if (!sk) {
969 err = -ENOMEM;
970 goto err;
971 }
972
973 sock_init_data(sock, sk);
974
975 sk->sk_hash = unix_unbound_hash(sk);
976 sk->sk_allocation = GFP_KERNEL_ACCOUNT;
977 sk->sk_write_space = unix_write_space;
978 sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
979 sk->sk_destruct = unix_sock_destructor;
980 u = unix_sk(sk);
981 u->listener = NULL;
982 u->vertex = NULL;
983 u->path.dentry = NULL;
984 u->path.mnt = NULL;
985 spin_lock_init(&u->lock);
986 mutex_init(&u->iolock); /* single task reading lock */
987 mutex_init(&u->bindlock); /* single task binding lock */
988 init_waitqueue_head(&u->peer_wait);
989 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
990 memset(&u->scm_stat, 0, sizeof(struct scm_stat));
991 unix_insert_unbound_socket(net, sk);
992
993 sock_prot_inuse_add(net, sk->sk_prot, 1);
994
995 return sk;
996
997 err:
998 atomic_long_dec(&unix_nr_socks);
999 return ERR_PTR(err);
1000 }
1001
1002 static int unix_create(struct net *net, struct socket *sock, int protocol,
1003 int kern)
1004 {
1005 struct sock *sk;
1006
1007 if (protocol && protocol != PF_UNIX)
1008 return -EPROTONOSUPPORT;
1009
1010 sock->state = SS_UNCONNECTED;
1011
1012 switch (sock->type) {
1013 case SOCK_STREAM:
1014 sock->ops = &unix_stream_ops;
1015 break;
1016 /*
1017 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
1018 * nothing uses it.
1019 */
1020 case SOCK_RAW:
1021 sock->type = SOCK_DGRAM;
1022 fallthrough;
1023 case SOCK_DGRAM:
1024 sock->ops = &unix_dgram_ops;
1025 break;
1026 case SOCK_SEQPACKET:
1027 sock->ops = &unix_seqpacket_ops;
1028 break;
1029 default:
1030 return -ESOCKTNOSUPPORT;
1031 }
1032
1033 sk = unix_create1(net, sock, kern, sock->type);
1034 if (IS_ERR(sk))
1035 return PTR_ERR(sk);
1036
1037 return 0;
1038 }
1039
1040 static int unix_release(struct socket *sock)
1041 {
1042 struct sock *sk = sock->sk;
1043
1044 if (!sk)
1045 return 0;
1046
1047 sk->sk_prot->close(sk, 0);
1048 unix_release_sock(sk, 0);
1049 sock->sk = NULL;
1050
1051 return 0;
1052 }
1053
1054 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1055 int type)
1056 {
1057 struct inode *inode;
1058 struct path path;
1059 struct sock *sk;
1060 int err;
1061
1062 unix_mkname_bsd(sunaddr, addr_len);
1063 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1064 if (err)
1065 goto fail;
1066
1067 err = path_permission(&path, MAY_WRITE);
1068 if (err)
1069 goto path_put;
1070
1071 err = -ECONNREFUSED;
1072 inode = d_backing_inode(path.dentry);
1073 if (!S_ISSOCK(inode->i_mode))
1074 goto path_put;
1075
1076 sk = unix_find_socket_byinode(inode);
1077 if (!sk)
1078 goto path_put;
1079
1080 err = -EPROTOTYPE;
1081 if (sk->sk_type == type)
1082 touch_atime(&path);
1083 else
1084 goto sock_put;
1085
1086 path_put(&path);
1087
1088 return sk;
1089
1090 sock_put:
1091 sock_put(sk);
1092 path_put:
1093 path_put(&path);
1094 fail:
1095 return ERR_PTR(err);
1096 }
1097
1098 static struct sock *unix_find_abstract(struct net *net,
1099 struct sockaddr_un *sunaddr,
1100 int addr_len, int type)
1101 {
1102 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1103 struct dentry *dentry;
1104 struct sock *sk;
1105
1106 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1107 if (!sk)
1108 return ERR_PTR(-ECONNREFUSED);
1109
1110 dentry = unix_sk(sk)->path.dentry;
1111 if (dentry)
1112 touch_atime(&unix_sk(sk)->path);
1113
1114 return sk;
1115 }
1116
1117 static struct sock *unix_find_other(struct net *net,
1118 struct sockaddr_un *sunaddr,
1119 int addr_len, int type)
1120 {
1121 struct sock *sk;
1122
1123 if (sunaddr->sun_path[0])
1124 sk = unix_find_bsd(sunaddr, addr_len, type);
1125 else
1126 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1127
1128 return sk;
1129 }
1130
1131 static int unix_autobind(struct sock *sk)
1132 {
1133 struct unix_sock *u = unix_sk(sk);
1134 unsigned int new_hash, old_hash;
1135 struct net *net = sock_net(sk);
1136 struct unix_address *addr;
1137 u32 lastnum, ordernum;
1138 int err;
1139
1140 err = mutex_lock_interruptible(&u->bindlock);
1141 if (err)
1142 return err;
1143
1144 if (u->addr)
1145 goto out;
1146
1147 err = -ENOMEM;
1148 addr = kzalloc(sizeof(*addr) +
1149 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1150 if (!addr)
1151 goto out;
1152
1153 addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1154 addr->name->sun_family = AF_UNIX;
1155 refcount_set(&addr->refcnt, 1);
1156
1157 old_hash = sk->sk_hash;
1158 ordernum = get_random_u32();
1159 lastnum = ordernum & 0xFFFFF;
1160 retry:
1161 ordernum = (ordernum + 1) & 0xFFFFF;
1162 sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1163
1164 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1165 unix_table_double_lock(net, old_hash, new_hash);
1166
1167 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1168 unix_table_double_unlock(net, old_hash, new_hash);
1169
1170 /* __unix_find_socket_byname() may take a long time if many names
1171 * are already in use.
1172 */
1173 cond_resched();
1174
1175 if (ordernum == lastnum) {
1176 /* Give up if all names seem to be in use. */
1177 err = -ENOSPC;
1178 unix_release_addr(addr);
1179 goto out;
1180 }
1181
1182 goto retry;
1183 }
1184
1185 __unix_set_addr_hash(net, sk, addr, new_hash);
1186 unix_table_double_unlock(net, old_hash, new_hash);
1187 err = 0;
1188
1189 out: mutex_unlock(&u->bindlock);
1190 return err;
1191 }
1192
1193 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1194 int addr_len)
1195 {
1196 umode_t mode = S_IFSOCK |
1197 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1198 struct unix_sock *u = unix_sk(sk);
1199 unsigned int new_hash, old_hash;
1200 struct net *net = sock_net(sk);
1201 struct mnt_idmap *idmap;
1202 struct unix_address *addr;
1203 struct dentry *dentry;
1204 struct path parent;
1205 int err;
1206
1207 addr_len = unix_mkname_bsd(sunaddr, addr_len);
1208 addr = unix_create_addr(sunaddr, addr_len);
1209 if (!addr)
1210 return -ENOMEM;
1211
1212 /*
1213 * Get the parent directory, calculate the hash for last
1214 * component.
1215 */
1216 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1217 if (IS_ERR(dentry)) {
1218 err = PTR_ERR(dentry);
1219 goto out;
1220 }
1221
1222 /*
1223 * All right, let's create it.
1224 */
1225 idmap = mnt_idmap(parent.mnt);
1226 err = security_path_mknod(&parent, dentry, mode, 0);
1227 if (!err)
1228 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1229 if (err)
1230 goto out_path;
1231 err = mutex_lock_interruptible(&u->bindlock);
1232 if (err)
1233 goto out_unlink;
1234 if (u->addr)
1235 goto out_unlock;
1236
1237 old_hash = sk->sk_hash;
1238 new_hash = unix_bsd_hash(d_backing_inode(dentry));
1239 unix_table_double_lock(net, old_hash, new_hash);
1240 u->path.mnt = mntget(parent.mnt);
1241 u->path.dentry = dget(dentry);
1242 __unix_set_addr_hash(net, sk, addr, new_hash);
1243 unix_table_double_unlock(net, old_hash, new_hash);
1244 unix_insert_bsd_socket(sk);
1245 mutex_unlock(&u->bindlock);
1246 done_path_create(&parent, dentry);
1247 return 0;
1248
1249 out_unlock:
1250 mutex_unlock(&u->bindlock);
1251 err = -EINVAL;
1252 out_unlink:
1253 /* failed after successful mknod? unlink what we'd created... */
1254 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1255 out_path:
1256 done_path_create(&parent, dentry);
1257 out:
1258 unix_release_addr(addr);
1259 return err == -EEXIST ? -EADDRINUSE : err;
1260 }
1261
1262 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1263 int addr_len)
1264 {
1265 struct unix_sock *u = unix_sk(sk);
1266 unsigned int new_hash, old_hash;
1267 struct net *net = sock_net(sk);
1268 struct unix_address *addr;
1269 int err;
1270
1271 addr = unix_create_addr(sunaddr, addr_len);
1272 if (!addr)
1273 return -ENOMEM;
1274
1275 err = mutex_lock_interruptible(&u->bindlock);
1276 if (err)
1277 goto out;
1278
1279 if (u->addr) {
1280 err = -EINVAL;
1281 goto out_mutex;
1282 }
1283
1284 old_hash = sk->sk_hash;
1285 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1286 unix_table_double_lock(net, old_hash, new_hash);
1287
1288 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1289 goto out_spin;
1290
1291 __unix_set_addr_hash(net, sk, addr, new_hash);
1292 unix_table_double_unlock(net, old_hash, new_hash);
1293 mutex_unlock(&u->bindlock);
1294 return 0;
1295
1296 out_spin:
1297 unix_table_double_unlock(net, old_hash, new_hash);
1298 err = -EADDRINUSE;
1299 out_mutex:
1300 mutex_unlock(&u->bindlock);
1301 out:
1302 unix_release_addr(addr);
1303 return err;
1304 }
1305
1306 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1307 {
1308 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1309 struct sock *sk = sock->sk;
1310 int err;
1311
1312 if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1313 sunaddr->sun_family == AF_UNIX)
1314 return unix_autobind(sk);
1315
1316 err = unix_validate_addr(sunaddr, addr_len);
1317 if (err)
1318 return err;
1319
1320 if (sunaddr->sun_path[0])
1321 err = unix_bind_bsd(sk, sunaddr, addr_len);
1322 else
1323 err = unix_bind_abstract(sk, sunaddr, addr_len);
1324
1325 return err;
1326 }
1327
1328 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1329 {
1330 if (unlikely(sk1 == sk2) || !sk2) {
1331 unix_state_lock(sk1);
1332 return;
1333 }
1334 if (sk1 > sk2)
1335 swap(sk1, sk2);
1336
1337 unix_state_lock(sk1);
1338 unix_state_lock_nested(sk2, U_LOCK_SECOND);
1339 }
1340
1341 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1342 {
1343 if (unlikely(sk1 == sk2) || !sk2) {
1344 unix_state_unlock(sk1);
1345 return;
1346 }
1347 unix_state_unlock(sk1);
1348 unix_state_unlock(sk2);
1349 }
1350
1351 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1352 int alen, int flags)
1353 {
1354 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1355 struct sock *sk = sock->sk;
1356 struct sock *other;
1357 int err;
1358
1359 err = -EINVAL;
1360 if (alen < offsetofend(struct sockaddr, sa_family))
1361 goto out;
1362
1363 if (addr->sa_family != AF_UNSPEC) {
1364 err = unix_validate_addr(sunaddr, alen);
1365 if (err)
1366 goto out;
1367
1368 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1369 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1370 !READ_ONCE(unix_sk(sk)->addr)) {
1371 err = unix_autobind(sk);
1372 if (err)
1373 goto out;
1374 }
1375
1376 restart:
1377 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1378 if (IS_ERR(other)) {
1379 err = PTR_ERR(other);
1380 goto out;
1381 }
1382
1383 unix_state_double_lock(sk, other);
1384
1385 /* Apparently VFS overslept socket death. Retry. */
1386 if (sock_flag(other, SOCK_DEAD)) {
1387 unix_state_double_unlock(sk, other);
1388 sock_put(other);
1389 goto restart;
1390 }
1391
1392 err = -EPERM;
1393 if (!unix_may_send(sk, other))
1394 goto out_unlock;
1395
1396 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1397 if (err)
1398 goto out_unlock;
1399
1400 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1401 WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1402 } else {
1403 /*
1404 * 1003.1g breaking connected state with AF_UNSPEC
1405 */
1406 other = NULL;
1407 unix_state_double_lock(sk, other);
1408 }
1409
1410 /*
1411 * If it was connected, reconnect.
1412 */
1413 if (unix_peer(sk)) {
1414 struct sock *old_peer = unix_peer(sk);
1415
1416 unix_peer(sk) = other;
1417 if (!other)
1418 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1419 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1420
1421 unix_state_double_unlock(sk, other);
1422
1423 if (other != old_peer) {
1424 unix_dgram_disconnected(sk, old_peer);
1425
1426 unix_state_lock(old_peer);
1427 if (!unix_peer(old_peer))
1428 WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1429 unix_state_unlock(old_peer);
1430 }
1431
1432 sock_put(old_peer);
1433 } else {
1434 unix_peer(sk) = other;
1435 unix_state_double_unlock(sk, other);
1436 }
1437
1438 return 0;
1439
1440 out_unlock:
1441 unix_state_double_unlock(sk, other);
1442 sock_put(other);
1443 out:
1444 return err;
1445 }
1446
1447 static long unix_wait_for_peer(struct sock *other, long timeo)
1448 __releases(&unix_sk(other)->lock)
1449 {
1450 struct unix_sock *u = unix_sk(other);
1451 int sched;
1452 DEFINE_WAIT(wait);
1453
1454 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1455
1456 sched = !sock_flag(other, SOCK_DEAD) &&
1457 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1458 unix_recvq_full_lockless(other);
1459
1460 unix_state_unlock(other);
1461
1462 if (sched)
1463 timeo = schedule_timeout(timeo);
1464
1465 finish_wait(&u->peer_wait, &wait);
1466 return timeo;
1467 }
1468
1469 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1470 int addr_len, int flags)
1471 {
1472 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1473 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1474 struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1475 struct net *net = sock_net(sk);
1476 struct sk_buff *skb = NULL;
1477 unsigned char state;
1478 long timeo;
1479 int err;
1480
1481 err = unix_validate_addr(sunaddr, addr_len);
1482 if (err)
1483 goto out;
1484
1485 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1486 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1487 !READ_ONCE(u->addr)) {
1488 err = unix_autobind(sk);
1489 if (err)
1490 goto out;
1491 }
1492
1493 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1494
1495 /* First of all allocate resources.
1496 If we will make it after state is locked,
1497 we will have to recheck all again in any case.
1498 */
1499
1500 /* create new sock for complete connection */
1501 newsk = unix_create1(net, NULL, 0, sock->type);
1502 if (IS_ERR(newsk)) {
1503 err = PTR_ERR(newsk);
1504 newsk = NULL;
1505 goto out;
1506 }
1507
1508 err = -ENOMEM;
1509
1510 /* Allocate skb for sending to listening sock */
1511 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1512 if (skb == NULL)
1513 goto out;
1514
1515 restart:
1516 /* Find listening sock. */
1517 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1518 if (IS_ERR(other)) {
1519 err = PTR_ERR(other);
1520 other = NULL;
1521 goto out;
1522 }
1523
1524 unix_state_lock(other);
1525
1526 /* Apparently VFS overslept socket death. Retry. */
1527 if (sock_flag(other, SOCK_DEAD)) {
1528 unix_state_unlock(other);
1529 sock_put(other);
1530 goto restart;
1531 }
1532
1533 err = -ECONNREFUSED;
1534 if (other->sk_state != TCP_LISTEN)
1535 goto out_unlock;
1536 if (other->sk_shutdown & RCV_SHUTDOWN)
1537 goto out_unlock;
1538
1539 if (unix_recvq_full_lockless(other)) {
1540 err = -EAGAIN;
1541 if (!timeo)
1542 goto out_unlock;
1543
1544 timeo = unix_wait_for_peer(other, timeo);
1545
1546 err = sock_intr_errno(timeo);
1547 if (signal_pending(current))
1548 goto out;
1549 sock_put(other);
1550 goto restart;
1551 }
1552
1553 /* self connect and simultaneous connect are eliminated
1554 * by rejecting TCP_LISTEN socket to avoid deadlock.
1555 */
1556 state = READ_ONCE(sk->sk_state);
1557 if (unlikely(state != TCP_CLOSE)) {
1558 err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1559 goto out_unlock;
1560 }
1561
1562 unix_state_lock_nested(sk, U_LOCK_SECOND);
1563
1564 if (unlikely(sk->sk_state != TCP_CLOSE)) {
1565 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1566 unix_state_unlock(sk);
1567 goto out_unlock;
1568 }
1569
1570 err = security_unix_stream_connect(sk, other, newsk);
1571 if (err) {
1572 unix_state_unlock(sk);
1573 goto out_unlock;
1574 }
1575
1576 /* The way is open! Fastly set all the necessary fields... */
1577
1578 sock_hold(sk);
1579 unix_peer(newsk) = sk;
1580 newsk->sk_state = TCP_ESTABLISHED;
1581 newsk->sk_type = sk->sk_type;
1582 init_peercred(newsk);
1583 newu = unix_sk(newsk);
1584 newu->listener = other;
1585 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1586 otheru = unix_sk(other);
1587
1588 /* copy address information from listening to new sock
1589 *
1590 * The contents of *(otheru->addr) and otheru->path
1591 * are seen fully set up here, since we have found
1592 * otheru in hash under its lock. Insertion into the
1593 * hash chain we'd found it in had been done in an
1594 * earlier critical area protected by the chain's lock,
1595 * the same one where we'd set *(otheru->addr) contents,
1596 * as well as otheru->path and otheru->addr itself.
1597 *
1598 * Using smp_store_release() here to set newu->addr
1599 * is enough to make those stores, as well as stores
1600 * to newu->path visible to anyone who gets newu->addr
1601 * by smp_load_acquire(). IOW, the same warranties
1602 * as for unix_sock instances bound in unix_bind() or
1603 * in unix_autobind().
1604 */
1605 if (otheru->path.dentry) {
1606 path_get(&otheru->path);
1607 newu->path = otheru->path;
1608 }
1609 refcount_inc(&otheru->addr->refcnt);
1610 smp_store_release(&newu->addr, otheru->addr);
1611
1612 /* Set credentials */
1613 copy_peercred(sk, other);
1614
1615 sock->state = SS_CONNECTED;
1616 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1617 sock_hold(newsk);
1618
1619 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1620 unix_peer(sk) = newsk;
1621
1622 unix_state_unlock(sk);
1623
1624 /* take ten and send info to listening sock */
1625 spin_lock(&other->sk_receive_queue.lock);
1626 __skb_queue_tail(&other->sk_receive_queue, skb);
1627 spin_unlock(&other->sk_receive_queue.lock);
1628 unix_state_unlock(other);
1629 other->sk_data_ready(other);
1630 sock_put(other);
1631 return 0;
1632
1633 out_unlock:
1634 if (other)
1635 unix_state_unlock(other);
1636
1637 out:
1638 kfree_skb(skb);
1639 if (newsk)
1640 unix_release_sock(newsk, 0);
1641 if (other)
1642 sock_put(other);
1643 return err;
1644 }
1645
1646 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1647 {
1648 struct sock *ska = socka->sk, *skb = sockb->sk;
1649
1650 /* Join our sockets back to back */
1651 sock_hold(ska);
1652 sock_hold(skb);
1653 unix_peer(ska) = skb;
1654 unix_peer(skb) = ska;
1655 init_peercred(ska);
1656 init_peercred(skb);
1657
1658 ska->sk_state = TCP_ESTABLISHED;
1659 skb->sk_state = TCP_ESTABLISHED;
1660 socka->state = SS_CONNECTED;
1661 sockb->state = SS_CONNECTED;
1662 return 0;
1663 }
1664
1665 static void unix_sock_inherit_flags(const struct socket *old,
1666 struct socket *new)
1667 {
1668 if (test_bit(SOCK_PASSCRED, &old->flags))
1669 set_bit(SOCK_PASSCRED, &new->flags);
1670 if (test_bit(SOCK_PASSPIDFD, &old->flags))
1671 set_bit(SOCK_PASSPIDFD, &new->flags);
1672 if (test_bit(SOCK_PASSSEC, &old->flags))
1673 set_bit(SOCK_PASSSEC, &new->flags);
1674 }
1675
1676 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1677 bool kern)
1678 {
1679 struct sock *sk = sock->sk;
1680 struct sk_buff *skb;
1681 struct sock *tsk;
1682 int err;
1683
1684 err = -EOPNOTSUPP;
1685 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1686 goto out;
1687
1688 err = -EINVAL;
1689 if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1690 goto out;
1691
1692 /* If socket state is TCP_LISTEN it cannot change (for now...),
1693 * so that no locks are necessary.
1694 */
1695
1696 skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1697 &err);
1698 if (!skb) {
1699 /* This means receive shutdown. */
1700 if (err == 0)
1701 err = -EINVAL;
1702 goto out;
1703 }
1704
1705 tsk = skb->sk;
1706 skb_free_datagram(sk, skb);
1707 wake_up_interruptible(&unix_sk(sk)->peer_wait);
1708
1709 /* attach accepted sock to socket */
1710 unix_state_lock(tsk);
1711 unix_update_edges(unix_sk(tsk));
1712 newsock->state = SS_CONNECTED;
1713 unix_sock_inherit_flags(sock, newsock);
1714 sock_graft(tsk, newsock);
1715 unix_state_unlock(tsk);
1716 return 0;
1717
1718 out:
1719 return err;
1720 }
1721
1722
1723 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1724 {
1725 struct sock *sk = sock->sk;
1726 struct unix_address *addr;
1727 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1728 int err = 0;
1729
1730 if (peer) {
1731 sk = unix_peer_get(sk);
1732
1733 err = -ENOTCONN;
1734 if (!sk)
1735 goto out;
1736 err = 0;
1737 } else {
1738 sock_hold(sk);
1739 }
1740
1741 addr = smp_load_acquire(&unix_sk(sk)->addr);
1742 if (!addr) {
1743 sunaddr->sun_family = AF_UNIX;
1744 sunaddr->sun_path[0] = 0;
1745 err = offsetof(struct sockaddr_un, sun_path);
1746 } else {
1747 err = addr->len;
1748 memcpy(sunaddr, addr->name, addr->len);
1749 }
1750 sock_put(sk);
1751 out:
1752 return err;
1753 }
1754
1755 /* The "user->unix_inflight" variable is protected by the garbage
1756 * collection lock, and we just read it locklessly here. If you go
1757 * over the limit, there might be a tiny race in actually noticing
1758 * it across threads. Tough.
1759 */
1760 static inline bool too_many_unix_fds(struct task_struct *p)
1761 {
1762 struct user_struct *user = current_user();
1763
1764 if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1765 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1766 return false;
1767 }
1768
1769 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1770 {
1771 if (too_many_unix_fds(current))
1772 return -ETOOMANYREFS;
1773
1774 /* Need to duplicate file references for the sake of garbage
1775 * collection. Otherwise a socket in the fps might become a
1776 * candidate for GC while the skb is not yet queued.
1777 */
1778 UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1779 if (!UNIXCB(skb).fp)
1780 return -ENOMEM;
1781
1782 if (unix_prepare_fpl(UNIXCB(skb).fp))
1783 return -ENOMEM;
1784
1785 return 0;
1786 }
1787
1788 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1789 {
1790 scm->fp = UNIXCB(skb).fp;
1791 UNIXCB(skb).fp = NULL;
1792
1793 unix_destroy_fpl(scm->fp);
1794 }
1795
1796 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1797 {
1798 scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1799 }
1800
1801 static void unix_destruct_scm(struct sk_buff *skb)
1802 {
1803 struct scm_cookie scm;
1804
1805 memset(&scm, 0, sizeof(scm));
1806 scm.pid = UNIXCB(skb).pid;
1807 if (UNIXCB(skb).fp)
1808 unix_detach_fds(&scm, skb);
1809
1810 /* Alas, it calls VFS */
1811 /* So fscking what? fput() had been SMP-safe since the last Summer */
1812 scm_destroy(&scm);
1813 sock_wfree(skb);
1814 }
1815
1816 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1817 {
1818 int err = 0;
1819
1820 UNIXCB(skb).pid = get_pid(scm->pid);
1821 UNIXCB(skb).uid = scm->creds.uid;
1822 UNIXCB(skb).gid = scm->creds.gid;
1823 UNIXCB(skb).fp = NULL;
1824 unix_get_secdata(scm, skb);
1825 if (scm->fp && send_fds)
1826 err = unix_attach_fds(scm, skb);
1827
1828 skb->destructor = unix_destruct_scm;
1829 return err;
1830 }
1831
1832 static bool unix_passcred_enabled(const struct socket *sock,
1833 const struct sock *other)
1834 {
1835 return test_bit(SOCK_PASSCRED, &sock->flags) ||
1836 test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1837 !other->sk_socket ||
1838 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1839 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1840 }
1841
1842 /*
1843 * Some apps rely on write() giving SCM_CREDENTIALS
1844 * We include credentials if source or destination socket
1845 * asserted SOCK_PASSCRED.
1846 */
1847 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1848 const struct sock *other)
1849 {
1850 if (UNIXCB(skb).pid)
1851 return;
1852 if (unix_passcred_enabled(sock, other)) {
1853 UNIXCB(skb).pid = get_pid(task_tgid(current));
1854 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1855 }
1856 }
1857
1858 static bool unix_skb_scm_eq(struct sk_buff *skb,
1859 struct scm_cookie *scm)
1860 {
1861 return UNIXCB(skb).pid == scm->pid &&
1862 uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1863 gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1864 unix_secdata_eq(scm, skb);
1865 }
1866
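/* Track the number of passed fds sitting in @sk's receive queue and
* register/unregister them with the garbage collector's edge tracking.
*/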
1867 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1868 {
1869 struct scm_fp_list *fp = UNIXCB(skb).fp;
1870 struct unix_sock *u = unix_sk(sk);
1871
1872 if (unlikely(fp && fp->count)) {
1873 atomic_add(fp->count, &u->scm_stat.nr_fds);
1874 unix_add_edges(fp, u);
1875 }
1876 }
1877
1878 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1879 {
1880 struct scm_fp_list *fp = UNIXCB(skb).fp;
1881 struct unix_sock *u = unix_sk(sk);
1882
1883 if (unlikely(fp && fp->count)) {
1884 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1885 unix_del_edges(fp);
1886 }
1887 }
1888
1889 /*
1890 * Send AF_UNIX data.
1891 */
1892
1893 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1894 size_t len)
1895 {
1896 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1897 struct sock *sk = sock->sk, *other = NULL;
1898 struct unix_sock *u = unix_sk(sk);
1899 struct scm_cookie scm;
1900 struct sk_buff *skb;
1901 int data_len = 0;
1902 int sk_locked;
1903 long timeo;
1904 int err;
1905
1906 err = scm_send(sock, msg, &scm, false);
1907 if (err < 0)
1908 return err;
1909
1910 wait_for_unix_gc(scm.fp);
1911
1912 err = -EOPNOTSUPP;
1913 if (msg->msg_flags&MSG_OOB)
1914 goto out;
1915
1916 if (msg->msg_namelen) {
1917 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1918 if (err)
1919 goto out;
1920 } else {
1921 sunaddr = NULL;
1922 err = -ENOTCONN;
1923 other = unix_peer_get(sk);
1924 if (!other)
1925 goto out;
1926 }
1927
1928 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1929 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1930 !READ_ONCE(u->addr)) {
1931 err = unix_autobind(sk);
1932 if (err)
1933 goto out;
1934 }
1935
1936 err = -EMSGSIZE;
1937 if (len > READ_ONCE(sk->sk_sndbuf) - 32)
1938 goto out;
1939
1940 if (len > SKB_MAX_ALLOC) {
1941 data_len = min_t(size_t,
1942 len - SKB_MAX_ALLOC,
1943 MAX_SKB_FRAGS * PAGE_SIZE);
1944 data_len = PAGE_ALIGN(data_len);
1945
1946 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1947 }
1948
1949 skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1950 msg->msg_flags & MSG_DONTWAIT, &err,
1951 PAGE_ALLOC_COSTLY_ORDER);
1952 if (skb == NULL)
1953 goto out;
1954
1955 err = unix_scm_to_skb(&scm, skb, true);
1956 if (err < 0)
1957 goto out_free;
1958
1959 skb_put(skb, len - data_len);
1960 skb->data_len = data_len;
1961 skb->len = len;
1962 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1963 if (err)
1964 goto out_free;
1965
1966 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1967
1968 restart:
1969 if (!other) {
1970 err = -ECONNRESET;
1971 if (sunaddr == NULL)
1972 goto out_free;
1973
1974 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1975 sk->sk_type);
1976 if (IS_ERR(other)) {
1977 err = PTR_ERR(other);
1978 other = NULL;
1979 goto out_free;
1980 }
1981 }
1982
1983 if (sk_filter(other, skb) < 0) {
1984 /* Toss the packet but do not return any error to the sender */
1985 err = len;
1986 goto out_free;
1987 }
1988
1989 sk_locked = 0;
1990 unix_state_lock(other);
1991 restart_locked:
1992 err = -EPERM;
1993 if (!unix_may_send(sk, other))
1994 goto out_unlock;
1995
1996 if (unlikely(sock_flag(other, SOCK_DEAD))) {
1997 /*
1998 * Check with 1003.1g - what should
1999 * a datagram error be here?
2000 */
2001 unix_state_unlock(other);
2002 sock_put(other);
2003
2004 if (!sk_locked)
2005 unix_state_lock(sk);
2006
2007 err = 0;
2008 if (sk->sk_type == SOCK_SEQPACKET) {
2009 /* We are here only when racing with unix_release_sock()
2010 * while it is clearing @other. Unlike SOCK_DGRAM, never
2011 * change the state to TCP_CLOSE.
2012 */
2013 unix_state_unlock(sk);
2014 err = -EPIPE;
2015 } else if (unix_peer(sk) == other) {
2016 unix_peer(sk) = NULL;
2017 unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2018
2019 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2020 unix_state_unlock(sk);
2021
2022 unix_dgram_disconnected(sk, other);
2023 sock_put(other);
2024 err = -ECONNREFUSED;
2025 } else {
2026 unix_state_unlock(sk);
2027 }
2028
2029 other = NULL;
2030 if (err)
2031 goto out_free;
2032 goto restart;
2033 }
2034
2035 err = -EPIPE;
2036 if (other->sk_shutdown & RCV_SHUTDOWN)
2037 goto out_unlock;
2038
2039 if (sk->sk_type != SOCK_SEQPACKET) {
2040 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2041 if (err)
2042 goto out_unlock;
2043 }
2044
2045 /* other == sk && unix_peer(other) != sk if
2046 * - unix_peer(sk) == NULL, destination address bound to sk
2047 * - unix_peer(sk) == sk by time of get but disconnected before lock
2048 */
2049 if (other != sk &&
2050 unlikely(unix_peer(other) != sk &&
2051 unix_recvq_full_lockless(other))) {
2052 if (timeo) {
2053 timeo = unix_wait_for_peer(other, timeo);
2054
2055 err = sock_intr_errno(timeo);
2056 if (signal_pending(current))
2057 goto out_free;
2058
2059 goto restart;
2060 }
2061
2062 if (!sk_locked) {
2063 unix_state_unlock(other);
2064 unix_state_double_lock(sk, other);
2065 }
2066
2067 if (unix_peer(sk) != other ||
2068 unix_dgram_peer_wake_me(sk, other)) {
2069 err = -EAGAIN;
2070 sk_locked = 1;
2071 goto out_unlock;
2072 }
2073
2074 if (!sk_locked) {
2075 sk_locked = 1;
2076 goto restart_locked;
2077 }
2078 }
2079
2080 if (unlikely(sk_locked))
2081 unix_state_unlock(sk);
2082
2083 if (sock_flag(other, SOCK_RCVTSTAMP))
2084 __net_timestamp(skb);
2085 maybe_add_creds(skb, sock, other);
2086 scm_stat_add(other, skb);
2087 skb_queue_tail(&other->sk_receive_queue, skb);
2088 unix_state_unlock(other);
2089 other->sk_data_ready(other);
2090 sock_put(other);
2091 scm_destroy(&scm);
2092 return len;
2093
2094 out_unlock:
2095 if (sk_locked)
2096 unix_state_unlock(sk);
2097 unix_state_unlock(other);
2098 out_free:
2099 kfree_skb(skb);
2100 out:
2101 if (other)
2102 sock_put(other);
2103 scm_destroy(&scm);
2104 return err;
2105 }
2106
2107 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2108 * bytes, with a minimum of a full page.
2109 */
2110 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2111
2112 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
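/* Queue a single out-of-band byte to @other and record it as the
* receiver's ->oob_skb so that MSG_OOB reads and SIOCATMARK can find it;
* any previously recorded OOB mark is replaced.
*/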
2113 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2114 struct scm_cookie *scm, bool fds_sent)
2115 {
2116 struct unix_sock *ousk = unix_sk(other);
2117 struct sk_buff *skb;
2118 int err = 0;
2119
2120 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2121
2122 if (!skb)
2123 return err;
2124
2125 err = unix_scm_to_skb(scm, skb, !fds_sent);
2126 if (err < 0) {
2127 kfree_skb(skb);
2128 return err;
2129 }
2130 skb_put(skb, 1);
2131 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2132
2133 if (err) {
2134 kfree_skb(skb);
2135 return err;
2136 }
2137
2138 unix_state_lock(other);
2139
2140 if (sock_flag(other, SOCK_DEAD) ||
2141 (other->sk_shutdown & RCV_SHUTDOWN)) {
2142 unix_state_unlock(other);
2143 kfree_skb(skb);
2144 return -EPIPE;
2145 }
2146
2147 maybe_add_creds(skb, sock, other);
2148 skb_get(skb);
2149
2150 scm_stat_add(other, skb);
2151
2152 spin_lock(&other->sk_receive_queue.lock);
2153 if (ousk->oob_skb)
2154 consume_skb(ousk->oob_skb);
2155 WRITE_ONCE(ousk->oob_skb, skb);
2156 __skb_queue_tail(&other->sk_receive_queue, skb);
2157 spin_unlock(&other->sk_receive_queue.lock);
2158
2159 sk_send_sigurg(other);
2160 unix_state_unlock(other);
2161 other->sk_data_ready(other);
2162
2163 return err;
2164 }
2165 #endif
2166
2167 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2168 size_t len)
2169 {
2170 struct sock *sk = sock->sk;
2171 struct sock *other = NULL;
2172 int err, size;
2173 struct sk_buff *skb;
2174 int sent = 0;
2175 struct scm_cookie scm;
2176 bool fds_sent = false;
2177 int data_len;
2178
2179 err = scm_send(sock, msg, &scm, false);
2180 if (err < 0)
2181 return err;
2182
2183 wait_for_unix_gc(scm.fp);
2184
2185 err = -EOPNOTSUPP;
2186 if (msg->msg_flags & MSG_OOB) {
2187 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2188 if (len)
2189 len--;
2190 else
2191 #endif
2192 goto out_err;
2193 }
2194
2195 if (msg->msg_namelen) {
2196 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2197 goto out_err;
2198 } else {
2199 err = -ENOTCONN;
2200 other = unix_peer(sk);
2201 if (!other)
2202 goto out_err;
2203 }
2204
2205 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2206 goto pipe_err;
2207
2208 while (sent < len) {
2209 size = len - sent;
2210
2211 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2212 skb = sock_alloc_send_pskb(sk, 0, 0,
2213 msg->msg_flags & MSG_DONTWAIT,
2214 &err, 0);
2215 } else {
2216 /* Keep two messages in the pipe so it schedules better */
2217 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2218
2219 /* allow fallback to order-0 allocations */
2220 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2221
2222 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2223
2224 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2225
2226 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2227 msg->msg_flags & MSG_DONTWAIT, &err,
2228 get_order(UNIX_SKB_FRAGS_SZ));
2229 }
2230 if (!skb)
2231 goto out_err;
2232
2233 /* Only send the fds in the first buffer */
2234 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2235 if (err < 0) {
2236 kfree_skb(skb);
2237 goto out_err;
2238 }
2239 fds_sent = true;
2240
2241 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2242 skb->ip_summed = CHECKSUM_UNNECESSARY;
2243 err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2244 sk->sk_allocation);
2245 if (err < 0) {
2246 kfree_skb(skb);
2247 goto out_err;
2248 }
2249 size = err;
2250 refcount_add(size, &sk->sk_wmem_alloc);
2251 } else {
2252 skb_put(skb, size - data_len);
2253 skb->data_len = data_len;
2254 skb->len = size;
2255 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2256 if (err) {
2257 kfree_skb(skb);
2258 goto out_err;
2259 }
2260 }
2261
2262 unix_state_lock(other);
2263
2264 if (sock_flag(other, SOCK_DEAD) ||
2265 (other->sk_shutdown & RCV_SHUTDOWN))
2266 goto pipe_err_free;
2267
2268 maybe_add_creds(skb, sock, other);
2269 scm_stat_add(other, skb);
2270 skb_queue_tail(&other->sk_receive_queue, skb);
2271 unix_state_unlock(other);
2272 other->sk_data_ready(other);
2273 sent += size;
2274 }
2275
2276 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2277 if (msg->msg_flags & MSG_OOB) {
2278 err = queue_oob(sock, msg, other, &scm, fds_sent);
2279 if (err)
2280 goto out_err;
2281 sent++;
2282 }
2283 #endif
2284
2285 scm_destroy(&scm);
2286
2287 return sent;
2288
2289 pipe_err_free:
2290 unix_state_unlock(other);
2291 kfree_skb(skb);
2292 pipe_err:
2293 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2294 send_sig(SIGPIPE, current, 0);
2295 err = -EPIPE;
2296 out_err:
2297 scm_destroy(&scm);
2298 return sent ? : err;
2299 }
2300
2301 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2302 size_t len)
2303 {
2304 int err;
2305 struct sock *sk = sock->sk;
2306
2307 err = sock_error(sk);
2308 if (err)
2309 return err;
2310
2311 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2312 return -ENOTCONN;
2313
2314 if (msg->msg_namelen)
2315 msg->msg_namelen = 0;
2316
2317 return unix_dgram_sendmsg(sock, msg, len);
2318 }
2319
2320 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2321 size_t size, int flags)
2322 {
2323 struct sock *sk = sock->sk;
2324
2325 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2326 return -ENOTCONN;
2327
2328 return unix_dgram_recvmsg(sock, msg, size, flags);
2329 }
2330
2331 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2332 {
2333 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2334
2335 if (addr) {
2336 msg->msg_namelen = addr->len;
2337 memcpy(msg->msg_name, addr->name, addr->len);
2338 }
2339 }
2340
2341 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2342 int flags)
2343 {
2344 struct scm_cookie scm;
2345 struct socket *sock = sk->sk_socket;
2346 struct unix_sock *u = unix_sk(sk);
2347 struct sk_buff *skb, *last;
2348 long timeo;
2349 int skip;
2350 int err;
2351
2352 err = -EOPNOTSUPP;
2353 if (flags&MSG_OOB)
2354 goto out;
2355
2356 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2357
2358 do {
2359 mutex_lock(&u->iolock);
2360
2361 skip = sk_peek_offset(sk, flags);
2362 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2363 &skip, &err, &last);
2364 if (skb) {
2365 if (!(flags & MSG_PEEK))
2366 scm_stat_del(sk, skb);
2367 break;
2368 }
2369
2370 mutex_unlock(&u->iolock);
2371
2372 if (err != -EAGAIN)
2373 break;
2374 } while (timeo &&
2375 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2376 &err, &timeo, last));
2377
2378 if (!skb) { /* implies iolock unlocked */
2379 unix_state_lock(sk);
2380 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2381 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2382 (sk->sk_shutdown & RCV_SHUTDOWN))
2383 err = 0;
2384 unix_state_unlock(sk);
2385 goto out;
2386 }
2387
2388 if (wq_has_sleeper(&u->peer_wait))
2389 wake_up_interruptible_sync_poll(&u->peer_wait,
2390 EPOLLOUT | EPOLLWRNORM |
2391 EPOLLWRBAND);
2392
2393 if (msg->msg_name)
2394 unix_copy_addr(msg, skb->sk);
2395
2396 if (size > skb->len - skip)
2397 size = skb->len - skip;
2398 else if (size < skb->len - skip)
2399 msg->msg_flags |= MSG_TRUNC;
2400
2401 err = skb_copy_datagram_msg(skb, skip, msg, size);
2402 if (err)
2403 goto out_free;
2404
2405 if (sock_flag(sk, SOCK_RCVTSTAMP))
2406 __sock_recv_timestamp(msg, sk, skb);
2407
2408 memset(&scm, 0, sizeof(scm));
2409
2410 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2411 unix_set_secdata(&scm, skb);
2412
2413 if (!(flags & MSG_PEEK)) {
2414 if (UNIXCB(skb).fp)
2415 unix_detach_fds(&scm, skb);
2416
2417 sk_peek_offset_bwd(sk, skb->len);
2418 } else {
2419 /* It is questionable: on PEEK we could:
2420 - not return fds - good, but too simple 8)
2421 - return fds, and not return them on read (old strategy,
2422 apparently wrong)
2423 - clone fds (I chose it for now, it is the most universal
2424 solution)
2425
2426 POSIX 1003.1g does not actually define this clearly
2427 at all. POSIX 1003.1g doesn't define a lot of things
2428 clearly however!
2429
2430 */
2431
2432 sk_peek_offset_fwd(sk, size);
2433
2434 if (UNIXCB(skb).fp)
2435 unix_peek_fds(&scm, skb);
2436 }
2437 err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2438
2439 scm_recv_unix(sock, msg, &scm, flags);
2440
2441 out_free:
2442 skb_free_datagram(sk, skb);
2443 mutex_unlock(&u->iolock);
2444 out:
2445 return err;
2446 }
2447
2448 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2449 int flags)
2450 {
2451 struct sock *sk = sock->sk;
2452
2453 #ifdef CONFIG_BPF_SYSCALL
2454 const struct proto *prot = READ_ONCE(sk->sk_prot);
2455
2456 if (prot != &unix_dgram_proto)
2457 return prot->recvmsg(sk, msg, size, flags, NULL);
2458 #endif
2459 return __unix_dgram_recvmsg(sk, msg, size, flags);
2460 }
2461
2462 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2463 {
2464 struct unix_sock *u = unix_sk(sk);
2465 struct sk_buff *skb;
2466 int err;
2467
2468 mutex_lock(&u->iolock);
2469 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2470 mutex_unlock(&u->iolock);
2471 if (!skb)
2472 return err;
2473
2474 return recv_actor(sk, skb);
2475 }
2476
2477 /*
2478 * Sleep until more data has arrived. But check for races.
2479 */
2480 static long unix_stream_data_wait(struct sock *sk, long timeo,
2481 struct sk_buff *last, unsigned int last_len,
2482 bool freezable)
2483 {
2484 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2485 struct sk_buff *tail;
2486 DEFINE_WAIT(wait);
2487
2488 unix_state_lock(sk);
2489
2490 for (;;) {
2491 prepare_to_wait(sk_sleep(sk), &wait, state);
2492
2493 tail = skb_peek_tail(&sk->sk_receive_queue);
2494 if (tail != last ||
2495 (tail && tail->len != last_len) ||
2496 sk->sk_err ||
2497 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2498 signal_pending(current) ||
2499 !timeo)
2500 break;
2501
2502 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2503 unix_state_unlock(sk);
2504 timeo = schedule_timeout(timeo);
2505 unix_state_lock(sk);
2506
2507 if (sock_flag(sk, SOCK_DEAD))
2508 break;
2509
2510 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2511 }
2512
2513 finish_wait(sk_sleep(sk), &wait);
2514 unix_state_unlock(sk);
2515 return timeo;
2516 }
2517
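/* Bytes of @skb that a stream reader has not consumed yet. */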
2518 static unsigned int unix_skb_len(const struct sk_buff *skb)
2519 {
2520 return skb->len - UNIXCB(skb).consumed;
2521 }
2522
2523 struct unix_stream_read_state {
2524 int (*recv_actor)(struct sk_buff *, int, int,
2525 struct unix_stream_read_state *);
2526 struct socket *socket;
2527 struct msghdr *msg;
2528 struct pipe_inode_info *pipe;
2529 size_t size;
2530 int flags;
2531 unsigned int splice_flags;
2532 };
2533
2534 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2535 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2536 {
2537 struct socket *sock = state->socket;
2538 struct sock *sk = sock->sk;
2539 struct unix_sock *u = unix_sk(sk);
2540 int chunk = 1;
2541 struct sk_buff *oob_skb;
2542
2543 mutex_lock(&u->iolock);
2544 unix_state_lock(sk);
2545 spin_lock(&sk->sk_receive_queue.lock);
2546
2547 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2548 spin_unlock(&sk->sk_receive_queue.lock);
2549 unix_state_unlock(sk);
2550 mutex_unlock(&u->iolock);
2551 return -EINVAL;
2552 }
2553
2554 oob_skb = u->oob_skb;
2555
2556 if (!(state->flags & MSG_PEEK))
2557 WRITE_ONCE(u->oob_skb, NULL);
2558 else
2559 skb_get(oob_skb);
2560
2561 spin_unlock(&sk->sk_receive_queue.lock);
2562 unix_state_unlock(sk);
2563
2564 chunk = state->recv_actor(oob_skb, 0, chunk, state);
2565
2566 if (!(state->flags & MSG_PEEK))
2567 UNIXCB(oob_skb).consumed += 1;
2568
2569 consume_skb(oob_skb);
2570
2571 mutex_unlock(&u->iolock);
2572
2573 if (chunk < 0)
2574 return -EFAULT;
2575
2576 state->msg->msg_flags |= MSG_OOB;
2577 return 1;
2578 }
2579
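/* Helper for the MSG_OOB-aware stream receive path: drops fully consumed
* skbs and decides whether the reader should stop before, skip, return
* inline or discard the OOB mark at the head of the queue.
*/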
2580 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2581 int flags, int copied)
2582 {
2583 struct unix_sock *u = unix_sk(sk);
2584
2585 if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2586 skb_unlink(skb, &sk->sk_receive_queue);
2587 consume_skb(skb);
2588 skb = NULL;
2589 } else {
2590 struct sk_buff *unlinked_skb = NULL;
2591
2592 spin_lock(&sk->sk_receive_queue.lock);
2593
2594 if (skb == u->oob_skb) {
2595 if (copied) {
2596 skb = NULL;
2597 } else if (!(flags & MSG_PEEK)) {
2598 if (sock_flag(sk, SOCK_URGINLINE)) {
2599 WRITE_ONCE(u->oob_skb, NULL);
2600 consume_skb(skb);
2601 } else {
2602 __skb_unlink(skb, &sk->sk_receive_queue);
2603 WRITE_ONCE(u->oob_skb, NULL);
2604 unlinked_skb = skb;
2605 skb = skb_peek(&sk->sk_receive_queue);
2606 }
2607 } else if (!sock_flag(sk, SOCK_URGINLINE)) {
2608 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2609 }
2610 }
2611
2612 spin_unlock(&sk->sk_receive_queue.lock);
2613
2614 if (unlinked_skb) {
2615 WARN_ON_ONCE(skb_unref(unlinked_skb));
2616 kfree_skb(unlinked_skb);
2617 }
2618 }
2619 return skb;
2620 }
2621 #endif
2622
2623 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2624 {
2625 struct unix_sock *u = unix_sk(sk);
2626 struct sk_buff *skb;
2627 int err;
2628
2629 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2630 return -ENOTCONN;
2631
2632 mutex_lock(&u->iolock);
2633 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2634 mutex_unlock(&u->iolock);
2635 if (!skb)
2636 return err;
2637
2638 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
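/* If the dequeued skb is the pending OOB mark, clear the mark and drop
* the skb instead of handing it to the recv_actor.
*/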
2639 if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2640 bool drop = false;
2641
2642 unix_state_lock(sk);
2643
2644 if (sock_flag(sk, SOCK_DEAD)) {
2645 unix_state_unlock(sk);
2646 kfree_skb(skb);
2647 return -ECONNRESET;
2648 }
2649
2650 spin_lock(&sk->sk_receive_queue.lock);
2651 if (likely(skb == u->oob_skb)) {
2652 WRITE_ONCE(u->oob_skb, NULL);
2653 drop = true;
2654 }
2655 spin_unlock(&sk->sk_receive_queue.lock);
2656
2657 unix_state_unlock(sk);
2658
2659 if (drop) {
2660 WARN_ON_ONCE(skb_unref(skb));
2661 kfree_skb(skb);
2662 return -EAGAIN;
2663 }
2664 }
2665 #endif
2666
2667 return recv_actor(sk, skb);
2668 }
2669
2670 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2671 bool freezable)
2672 {
2673 struct scm_cookie scm;
2674 struct socket *sock = state->socket;
2675 struct sock *sk = sock->sk;
2676 struct unix_sock *u = unix_sk(sk);
2677 int copied = 0;
2678 int flags = state->flags;
2679 int noblock = flags & MSG_DONTWAIT;
2680 bool check_creds = false;
2681 int target;
2682 int err = 0;
2683 long timeo;
2684 int skip;
2685 size_t size = state->size;
2686 unsigned int last_len;
2687
2688 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2689 err = -EINVAL;
2690 goto out;
2691 }
2692
2693 if (unlikely(flags & MSG_OOB)) {
2694 err = -EOPNOTSUPP;
2695 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2696 err = unix_stream_recv_urg(state);
2697 #endif
2698 goto out;
2699 }
2700
2701 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2702 timeo = sock_rcvtimeo(sk, noblock);
2703
2704 memset(&scm, 0, sizeof(scm));
2705
2706 /* Lock the socket to prevent queue disordering
2707 * while we sleep in memcpy_tomsg
2708 */
2709 mutex_lock(&u->iolock);
2710
2711 skip = max(sk_peek_offset(sk, flags), 0);
2712
2713 do {
2714 int chunk;
2715 bool drop_skb;
2716 struct sk_buff *skb, *last;
2717
2718 redo:
2719 unix_state_lock(sk);
2720 if (sock_flag(sk, SOCK_DEAD)) {
2721 err = -ECONNRESET;
2722 goto unlock;
2723 }
2724 last = skb = skb_peek(&sk->sk_receive_queue);
2725 last_len = last ? last->len : 0;
2726
2727 again:
2728 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2729 if (skb) {
2730 skb = manage_oob(skb, sk, flags, copied);
2731 if (!skb && copied) {
2732 unix_state_unlock(sk);
2733 break;
2734 }
2735 }
2736 #endif
2737 if (skb == NULL) {
2738 if (copied >= target)
2739 goto unlock;
2740
2741 /*
2742 * POSIX 1003.1g mandates this order.
2743 */
2744
2745 err = sock_error(sk);
2746 if (err)
2747 goto unlock;
2748 if (sk->sk_shutdown & RCV_SHUTDOWN)
2749 goto unlock;
2750
2751 unix_state_unlock(sk);
2752 if (!timeo) {
2753 err = -EAGAIN;
2754 break;
2755 }
2756
2757 mutex_unlock(&u->iolock);
2758
2759 timeo = unix_stream_data_wait(sk, timeo, last,
2760 last_len, freezable);
2761
2762 if (signal_pending(current)) {
2763 err = sock_intr_errno(timeo);
2764 scm_destroy(&scm);
2765 goto out;
2766 }
2767
2768 mutex_lock(&u->iolock);
2769 goto redo;
2770 unlock:
2771 unix_state_unlock(sk);
2772 break;
2773 }
2774
2775 while (skip >= unix_skb_len(skb)) {
2776 skip -= unix_skb_len(skb);
2777 last = skb;
2778 last_len = skb->len;
2779 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2780 if (!skb)
2781 goto again;
2782 }
2783
2784 unix_state_unlock(sk);
2785
2786 if (check_creds) {
2787 /* Never glue messages from different writers */
2788 if (!unix_skb_scm_eq(skb, &scm))
2789 break;
2790 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2791 test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2792 /* Copy credentials */
2793 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2794 unix_set_secdata(&scm, skb);
2795 check_creds = true;
2796 }
2797
2798 /* Copy address just once */
2799 if (state->msg && state->msg->msg_name) {
2800 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2801 state->msg->msg_name);
2802 unix_copy_addr(state->msg, skb->sk);
2803 sunaddr = NULL;
2804 }
2805
2806 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2807 skb_get(skb);
2808 chunk = state->recv_actor(skb, skip, chunk, state);
2809 drop_skb = !unix_skb_len(skb);
2810 /* skb is only safe to use if !drop_skb */
2811 consume_skb(skb);
2812 if (chunk < 0) {
2813 if (copied == 0)
2814 copied = -EFAULT;
2815 break;
2816 }
2817 copied += chunk;
2818 size -= chunk;
2819
2820 if (drop_skb) {
2821 /* the skb was touched by a concurrent reader;
2822 * we should not expect anything from this skb
2823 * anymore and assume it invalid - we can be
2824 * sure it was dropped from the socket queue
2825 *
2826 * let's report a short read
2827 */
2828 err = 0;
2829 break;
2830 }
2831
2832 /* Mark read part of skb as used */
2833 if (!(flags & MSG_PEEK)) {
2834 UNIXCB(skb).consumed += chunk;
2835
2836 sk_peek_offset_bwd(sk, chunk);
2837
2838 if (UNIXCB(skb).fp) {
2839 scm_stat_del(sk, skb);
2840 unix_detach_fds(&scm, skb);
2841 }
2842
2843 if (unix_skb_len(skb))
2844 break;
2845
2846 skb_unlink(skb, &sk->sk_receive_queue);
2847 consume_skb(skb);
2848
2849 if (scm.fp)
2850 break;
2851 } else {
2852 /* It is questionable, see note in unix_dgram_recvmsg.
2853 */
2854 if (UNIXCB(skb).fp)
2855 unix_peek_fds(&scm, skb);
2856
2857 sk_peek_offset_fwd(sk, chunk);
2858
2859 if (UNIXCB(skb).fp)
2860 break;
2861
2862 skip = 0;
2863 last = skb;
2864 last_len = skb->len;
2865 unix_state_lock(sk);
2866 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2867 if (skb)
2868 goto again;
2869 unix_state_unlock(sk);
2870 break;
2871 }
2872 } while (size);
2873
2874 mutex_unlock(&u->iolock);
2875 if (state->msg)
2876 scm_recv_unix(sock, state->msg, &scm, flags);
2877 else
2878 scm_destroy(&scm);
2879 out:
2880 return copied ? : err;
2881 }
2882
2883 static int unix_stream_read_actor(struct sk_buff *skb,
2884 int skip, int chunk,
2885 struct unix_stream_read_state *state)
2886 {
2887 int ret;
2888
2889 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2890 state->msg, chunk);
2891 return ret ?: chunk;
2892 }
2893
2894 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2895 size_t size, int flags)
2896 {
2897 struct unix_stream_read_state state = {
2898 .recv_actor = unix_stream_read_actor,
2899 .socket = sk->sk_socket,
2900 .msg = msg,
2901 .size = size,
2902 .flags = flags
2903 };
2904
2905 return unix_stream_read_generic(&state, true);
2906 }
2907
2908 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2909 size_t size, int flags)
2910 {
2911 struct unix_stream_read_state state = {
2912 .recv_actor = unix_stream_read_actor,
2913 .socket = sock,
2914 .msg = msg,
2915 .size = size,
2916 .flags = flags
2917 };
2918
2919 #ifdef CONFIG_BPF_SYSCALL
2920 struct sock *sk = sock->sk;
2921 const struct proto *prot = READ_ONCE(sk->sk_prot);
2922
2923 if (prot != &unix_stream_proto)
2924 return prot->recvmsg(sk, msg, size, flags, NULL);
2925 #endif
2926 return unix_stream_read_generic(&state, true);
2927 }
2928
2929 static int unix_stream_splice_actor(struct sk_buff *skb,
2930 int skip, int chunk,
2931 struct unix_stream_read_state *state)
2932 {
2933 return skb_splice_bits(skb, state->socket->sk,
2934 UNIXCB(skb).consumed + skip,
2935 state->pipe, chunk, state->splice_flags);
2936 }
2937
2938 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2939 struct pipe_inode_info *pipe,
2940 size_t size, unsigned int flags)
2941 {
2942 struct unix_stream_read_state state = {
2943 .recv_actor = unix_stream_splice_actor,
2944 .socket = sock,
2945 .pipe = pipe,
2946 .size = size,
2947 .splice_flags = flags,
2948 };
2949
2950 if (unlikely(*ppos))
2951 return -ESPIPE;
2952
2953 if (sock->file->f_flags & O_NONBLOCK ||
2954 flags & SPLICE_F_NONBLOCK)
2955 state.flags = MSG_DONTWAIT;
2956
2957 return unix_stream_read_generic(&state, false);
2958 }
2959
2960 static int unix_shutdown(struct socket *sock, int mode)
2961 {
2962 struct sock *sk = sock->sk;
2963 struct sock *other;
2964
2965 if (mode < SHUT_RD || mode > SHUT_RDWR)
2966 return -EINVAL;
2967 /* This maps:
2968 * SHUT_RD (0) -> RCV_SHUTDOWN (1)
2969 * SHUT_WR (1) -> SEND_SHUTDOWN (2)
2970 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2971 */
2972 ++mode;
2973
2974 unix_state_lock(sk);
2975 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2976 other = unix_peer(sk);
2977 if (other)
2978 sock_hold(other);
2979 unix_state_unlock(sk);
2980 sk->sk_state_change(sk);
2981
2982 if (other &&
2983 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2984
2985 int peer_mode = 0;
2986 const struct proto *prot = READ_ONCE(other->sk_prot);
2987
2988 if (prot->unhash)
2989 prot->unhash(other);
2990 if (mode&RCV_SHUTDOWN)
2991 peer_mode |= SEND_SHUTDOWN;
2992 if (mode&SEND_SHUTDOWN)
2993 peer_mode |= RCV_SHUTDOWN;
2994 unix_state_lock(other);
2995 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2996 unix_state_unlock(other);
2997 other->sk_state_change(other);
2998 if (peer_mode == SHUTDOWN_MASK)
2999 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3000 else if (peer_mode & RCV_SHUTDOWN)
3001 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3002 }
3003 if (other)
3004 sock_put(other);
3005
3006 return 0;
3007 }
3008
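/* Bytes available for reading: the sum of unread bytes for stream and
* seqpacket sockets, or the size of the next datagram otherwise.
*/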
3009 long unix_inq_len(struct sock *sk)
3010 {
3011 struct sk_buff *skb;
3012 long amount = 0;
3013
3014 if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3015 return -EINVAL;
3016
3017 spin_lock(&sk->sk_receive_queue.lock);
3018 if (sk->sk_type == SOCK_STREAM ||
3019 sk->sk_type == SOCK_SEQPACKET) {
3020 skb_queue_walk(&sk->sk_receive_queue, skb)
3021 amount += unix_skb_len(skb);
3022 } else {
3023 skb = skb_peek(&sk->sk_receive_queue);
3024 if (skb)
3025 amount = skb->len;
3026 }
3027 spin_unlock(&sk->sk_receive_queue.lock);
3028
3029 return amount;
3030 }
3031 EXPORT_SYMBOL_GPL(unix_inq_len);
3032
3033 long unix_outq_len(struct sock *sk)
3034 {
3035 return sk_wmem_alloc_get(sk);
3036 }
3037 EXPORT_SYMBOL_GPL(unix_outq_len);
3038
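/* SIOCUNIXFILE: open the filesystem object this socket is bound to with
* O_PATH and hand the resulting fd back to the caller.
*/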
3039 static int unix_open_file(struct sock *sk)
3040 {
3041 struct path path;
3042 struct file *f;
3043 int fd;
3044
3045 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3046 return -EPERM;
3047
3048 if (!smp_load_acquire(&unix_sk(sk)->addr))
3049 return -ENOENT;
3050
3051 path = unix_sk(sk)->path;
3052 if (!path.dentry)
3053 return -ENOENT;
3054
3055 path_get(&path);
3056
3057 fd = get_unused_fd_flags(O_CLOEXEC);
3058 if (fd < 0)
3059 goto out;
3060
3061 f = dentry_open(&path, O_PATH, current_cred());
3062 if (IS_ERR(f)) {
3063 put_unused_fd(fd);
3064 fd = PTR_ERR(f);
3065 goto out;
3066 }
3067
3068 fd_install(fd, f);
3069 out:
3070 path_put(&path);
3071
3072 return fd;
3073 }
3074
3075 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3076 {
3077 struct sock *sk = sock->sk;
3078 long amount = 0;
3079 int err;
3080
3081 switch (cmd) {
3082 case SIOCOUTQ:
3083 amount = unix_outq_len(sk);
3084 err = put_user(amount, (int __user *)arg);
3085 break;
3086 case SIOCINQ:
3087 amount = unix_inq_len(sk);
3088 if (amount < 0)
3089 err = amount;
3090 else
3091 err = put_user(amount, (int __user *)arg);
3092 break;
3093 case SIOCUNIXFILE:
3094 err = unix_open_file(sk);
3095 break;
3096 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3097 case SIOCATMARK:
3098 {
3099 struct sk_buff *skb;
3100 int answ = 0;
3101
3102 skb = skb_peek(&sk->sk_receive_queue);
3103 if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3104 answ = 1;
3105 err = put_user(answ, (int __user *)arg);
3106 }
3107 break;
3108 #endif
3109 default:
3110 err = -ENOIOCTLCMD;
3111 break;
3112 }
3113 return err;
3114 }
3115
3116 #ifdef CONFIG_COMPAT
3117 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3118 {
3119 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3120 }
3121 #endif
3122
3123 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3124 {
3125 struct sock *sk = sock->sk;
3126 unsigned char state;
3127 __poll_t mask;
3128 u8 shutdown;
3129
3130 sock_poll_wait(file, sock, wait);
3131 mask = 0;
3132 shutdown = READ_ONCE(sk->sk_shutdown);
3133 state = READ_ONCE(sk->sk_state);
3134
3135 /* exceptional events? */
3136 if (READ_ONCE(sk->sk_err))
3137 mask |= EPOLLERR;
3138 if (shutdown == SHUTDOWN_MASK)
3139 mask |= EPOLLHUP;
3140 if (shutdown & RCV_SHUTDOWN)
3141 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3142
3143 /* readable? */
3144 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3145 mask |= EPOLLIN | EPOLLRDNORM;
3146 if (sk_is_readable(sk))
3147 mask |= EPOLLIN | EPOLLRDNORM;
3148 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3149 if (READ_ONCE(unix_sk(sk)->oob_skb))
3150 mask |= EPOLLPRI;
3151 #endif
3152
3153 /* Connection-based sockets need to check for termination and startup */
3154 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3155 state == TCP_CLOSE)
3156 mask |= EPOLLHUP;
3157
3158 /*
3159 * we set writable also when the other side has shut down the
3160 * connection. This prevents stuck sockets.
3161 */
3162 if (unix_writable(sk, state))
3163 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3164
3165 return mask;
3166 }
3167
3168 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3169 poll_table *wait)
3170 {
3171 struct sock *sk = sock->sk, *other;
3172 unsigned int writable;
3173 unsigned char state;
3174 __poll_t mask;
3175 u8 shutdown;
3176
3177 sock_poll_wait(file, sock, wait);
3178 mask = 0;
3179 shutdown = READ_ONCE(sk->sk_shutdown);
3180 state = READ_ONCE(sk->sk_state);
3181
3182 /* exceptional events? */
3183 if (READ_ONCE(sk->sk_err) ||
3184 !skb_queue_empty_lockless(&sk->sk_error_queue))
3185 mask |= EPOLLERR |
3186 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3187
3188 if (shutdown & RCV_SHUTDOWN)
3189 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3190 if (shutdown == SHUTDOWN_MASK)
3191 mask |= EPOLLHUP;
3192
3193 /* readable? */
3194 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3195 mask |= EPOLLIN | EPOLLRDNORM;
3196 if (sk_is_readable(sk))
3197 mask |= EPOLLIN | EPOLLRDNORM;
3198
3199 /* Connection-based sockets need to check for termination and startup */
3200 if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3201 mask |= EPOLLHUP;
3202
3203 /* No write status requested, avoid expensive OUT tests. */
3204 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3205 return mask;
3206
3207 writable = unix_writable(sk, state);
3208 if (writable) {
3209 unix_state_lock(sk);
3210
3211 other = unix_peer(sk);
3212 if (other && unix_peer(other) != sk &&
3213 unix_recvq_full_lockless(other) &&
3214 unix_dgram_peer_wake_me(sk, other))
3215 writable = 0;
3216
3217 unix_state_unlock(sk);
3218 }
3219
3220 if (writable)
3221 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3222 else
3223 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3224
3225 return mask;
3226 }
3227
3228 #ifdef CONFIG_PROC_FS
3229
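/* The seq_file position encodes a hash bucket in its upper bits and a
* 1-based offset within that bucket in the lower BUCKET_SPACE bits.
*/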
3230 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3231
3232 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3233 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3234 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3235
3236 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3237 {
3238 unsigned long offset = get_offset(*pos);
3239 unsigned long bucket = get_bucket(*pos);
3240 unsigned long count = 0;
3241 struct sock *sk;
3242
3243 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3244 sk; sk = sk_next(sk)) {
3245 if (++count == offset)
3246 break;
3247 }
3248
3249 return sk;
3250 }
3251
3252 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3253 {
3254 unsigned long bucket = get_bucket(*pos);
3255 struct net *net = seq_file_net(seq);
3256 struct sock *sk;
3257
3258 while (bucket < UNIX_HASH_SIZE) {
3259 spin_lock(&net->unx.table.locks[bucket]);
3260
3261 sk = unix_from_bucket(seq, pos);
3262 if (sk)
3263 return sk;
3264
3265 spin_unlock(&net->unx.table.locks[bucket]);
3266
3267 *pos = set_bucket_offset(++bucket, 1);
3268 }
3269
3270 return NULL;
3271 }
3272
3273 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3274 loff_t *pos)
3275 {
3276 unsigned long bucket = get_bucket(*pos);
3277
3278 sk = sk_next(sk);
3279 if (sk)
3280 return sk;
3281
3282
3283 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3284
3285 *pos = set_bucket_offset(++bucket, 1);
3286
3287 return unix_get_first(seq, pos);
3288 }
3289
3290 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3291 {
3292 if (!*pos)
3293 return SEQ_START_TOKEN;
3294
3295 return unix_get_first(seq, pos);
3296 }
3297
3298 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3299 {
3300 ++*pos;
3301
3302 if (v == SEQ_START_TOKEN)
3303 return unix_get_first(seq, pos);
3304
3305 return unix_get_next(seq, v, pos);
3306 }
3307
3308 static void unix_seq_stop(struct seq_file *seq, void *v)
3309 {
3310 struct sock *sk = v;
3311
3312 if (sk)
3313 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3314 }
3315
3316 static int unix_seq_show(struct seq_file *seq, void *v)
3317 {
3318
3319 if (v == SEQ_START_TOKEN)
3320 seq_puts(seq, "Num RefCount Protocol Flags Type St "
3321 "Inode Path\n");
3322 else {
3323 struct sock *s = v;
3324 struct unix_sock *u = unix_sk(s);
3325 unix_state_lock(s);
3326
3327 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3328 s,
3329 refcount_read(&s->sk_refcnt),
3330 0,
3331 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3332 s->sk_type,
3333 s->sk_socket ?
3334 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3335 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3336 sock_i_ino(s));
3337
3338 if (u->addr) { // under a hash table lock here
3339 int i, len;
3340 seq_putc(seq, ' ');
3341
3342 i = 0;
3343 len = u->addr->len -
3344 offsetof(struct sockaddr_un, sun_path);
3345 if (u->addr->name->sun_path[0]) {
3346 len--;
3347 } else {
3348 seq_putc(seq, '@');
3349 i++;
3350 }
3351 for ( ; i < len; i++)
3352 seq_putc(seq, u->addr->name->sun_path[i] ?:
3353 '@');
3354 }
3355 unix_state_unlock(s);
3356 seq_putc(seq, '\n');
3357 }
3358
3359 return 0;
3360 }
3361
3362 static const struct seq_operations unix_seq_ops = {
3363 .start = unix_seq_start,
3364 .next = unix_seq_next,
3365 .stop = unix_seq_stop,
3366 .show = unix_seq_show,
3367 };
3368
3369 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
3370 struct bpf_unix_iter_state {
3371 struct seq_net_private p;
3372 unsigned int cur_sk;
3373 unsigned int end_sk;
3374 unsigned int max_sk;
3375 struct sock **batch;
3376 bool st_bucket_done;
3377 };
3378
3379 struct bpf_iter__unix {
3380 __bpf_md_ptr(struct bpf_iter_meta *, meta);
3381 __bpf_md_ptr(struct unix_sock *, unix_sk);
3382 uid_t uid __aligned(8);
3383 };
3384
3385 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3386 struct unix_sock *unix_sk, uid_t uid)
3387 {
3388 struct bpf_iter__unix ctx;
3389
3390 meta->seq_num--; /* skip SEQ_START_TOKEN */
3391 ctx.meta = meta;
3392 ctx.unix_sk = unix_sk;
3393 ctx.uid = uid;
3394 return bpf_iter_run_prog(prog, &ctx);
3395 }
3396
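/* Grab references on up to iter->max_sk sockets starting at start_sk and
* return how many sockets follow start_sk in its bucket (including
* start_sk), so the caller can grow the batch and retry if it did not fit.
*/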
3397 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3398
3399 {
3400 struct bpf_unix_iter_state *iter = seq->private;
3401 unsigned int expected = 1;
3402 struct sock *sk;
3403
3404 sock_hold(start_sk);
3405 iter->batch[iter->end_sk++] = start_sk;
3406
3407 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3408 if (iter->end_sk < iter->max_sk) {
3409 sock_hold(sk);
3410 iter->batch[iter->end_sk++] = sk;
3411 }
3412
3413 expected++;
3414 }
3415
3416 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3417
3418 return expected;
3419 }
3420
3421 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3422 {
3423 while (iter->cur_sk < iter->end_sk)
3424 sock_put(iter->batch[iter->cur_sk++]);
3425 }
3426
3427 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3428 unsigned int new_batch_sz)
3429 {
3430 struct sock **new_batch;
3431
3432 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3433 GFP_USER | __GFP_NOWARN);
3434 if (!new_batch)
3435 return -ENOMEM;
3436
3437 bpf_iter_unix_put_batch(iter);
3438 kvfree(iter->batch);
3439 iter->batch = new_batch;
3440 iter->max_sk = new_batch_sz;
3441
3442 return 0;
3443 }
3444
3445 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3446 loff_t *pos)
3447 {
3448 struct bpf_unix_iter_state *iter = seq->private;
3449 unsigned int expected;
3450 bool resized = false;
3451 struct sock *sk;
3452
3453 if (iter->st_bucket_done)
3454 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3455
3456 again:
3457 /* Get a new batch */
3458 iter->cur_sk = 0;
3459 iter->end_sk = 0;
3460
3461 sk = unix_get_first(seq, pos);
3462 if (!sk)
3463 return NULL; /* Done */
3464
3465 expected = bpf_iter_unix_hold_batch(seq, sk);
3466
3467 if (iter->end_sk == expected) {
3468 iter->st_bucket_done = true;
3469 return sk;
3470 }
3471
3472 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3473 resized = true;
3474 goto again;
3475 }
3476
3477 return sk;
3478 }
3479
3480 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3481 {
3482 if (!*pos)
3483 return SEQ_START_TOKEN;
3484
3485 /* bpf iter does not support lseek, so it always
3486 * continues from where it was stop()-ped.
3487 */
3488 return bpf_iter_unix_batch(seq, pos);
3489 }
3490
3491 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3492 {
3493 struct bpf_unix_iter_state *iter = seq->private;
3494 struct sock *sk;
3495
3496 /* Whenever seq_next() is called, the iter->cur_sk is
3497 * done with seq_show(), so advance to the next sk in
3498 * the batch.
3499 */
3500 if (iter->cur_sk < iter->end_sk)
3501 sock_put(iter->batch[iter->cur_sk++]);
3502
3503 ++*pos;
3504
3505 if (iter->cur_sk < iter->end_sk)
3506 sk = iter->batch[iter->cur_sk];
3507 else
3508 sk = bpf_iter_unix_batch(seq, pos);
3509
3510 return sk;
3511 }
3512
3513 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3514 {
3515 struct bpf_iter_meta meta;
3516 struct bpf_prog *prog;
3517 struct sock *sk = v;
3518 uid_t uid;
3519 bool slow;
3520 int ret;
3521
3522 if (v == SEQ_START_TOKEN)
3523 return 0;
3524
3525 slow = lock_sock_fast(sk);
3526
3527 if (unlikely(sk_unhashed(sk))) {
3528 ret = SEQ_SKIP;
3529 goto unlock;
3530 }
3531
3532 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3533 meta.seq = seq;
3534 prog = bpf_iter_get_info(&meta, false);
3535 ret = unix_prog_seq_show(prog, &meta, v, uid);
3536 unlock:
3537 unlock_sock_fast(sk, slow);
3538 return ret;
3539 }
3540
3541 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3542 {
3543 struct bpf_unix_iter_state *iter = seq->private;
3544 struct bpf_iter_meta meta;
3545 struct bpf_prog *prog;
3546
3547 if (!v) {
3548 meta.seq = seq;
3549 prog = bpf_iter_get_info(&meta, true);
3550 if (prog)
3551 (void)unix_prog_seq_show(prog, &meta, v, 0);
3552 }
3553
3554 if (iter->cur_sk < iter->end_sk)
3555 bpf_iter_unix_put_batch(iter);
3556 }
3557
3558 static const struct seq_operations bpf_iter_unix_seq_ops = {
3559 .start = bpf_iter_unix_seq_start,
3560 .next = bpf_iter_unix_seq_next,
3561 .stop = bpf_iter_unix_seq_stop,
3562 .show = bpf_iter_unix_seq_show,
3563 };
3564 #endif
3565 #endif
3566
3567 static const struct net_proto_family unix_family_ops = {
3568 .family = PF_UNIX,
3569 .create = unix_create,
3570 .owner = THIS_MODULE,
3571 };
3572
3573
3574 static int __net_init unix_net_init(struct net *net)
3575 {
3576 int i;
3577
3578 net->unx.sysctl_max_dgram_qlen = 10;
3579 if (unix_sysctl_register(net))
3580 goto out;
3581
3582 #ifdef CONFIG_PROC_FS
3583 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3584 sizeof(struct seq_net_private)))
3585 goto err_sysctl;
3586 #endif
3587
3588 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3589 sizeof(spinlock_t), GFP_KERNEL);
3590 if (!net->unx.table.locks)
3591 goto err_proc;
3592
3593 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3594 sizeof(struct hlist_head),
3595 GFP_KERNEL);
3596 if (!net->unx.table.buckets)
3597 goto free_locks;
3598
3599 for (i = 0; i < UNIX_HASH_SIZE; i++) {
3600 spin_lock_init(&net->unx.table.locks[i]);
3601 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3602 }
3603
3604 return 0;
3605
3606 free_locks:
3607 kvfree(net->unx.table.locks);
3608 err_proc:
3609 #ifdef CONFIG_PROC_FS
3610 remove_proc_entry("unix", net->proc_net);
3611 err_sysctl:
3612 #endif
3613 unix_sysctl_unregister(net);
3614 out:
3615 return -ENOMEM;
3616 }
3617
3618 static void __net_exit unix_net_exit(struct net *net)
3619 {
3620 kvfree(net->unx.table.buckets);
3621 kvfree(net->unx.table.locks);
3622 unix_sysctl_unregister(net);
3623 remove_proc_entry("unix", net->proc_net);
3624 }
3625
3626 static struct pernet_operations unix_net_ops = {
3627 .init = unix_net_init,
3628 .exit = unix_net_exit,
3629 };
3630
3631 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3632 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3633 struct unix_sock *unix_sk, uid_t uid)
3634
3635 #define INIT_BATCH_SZ 16
3636
3637 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3638 {
3639 struct bpf_unix_iter_state *iter = priv_data;
3640 int err;
3641
3642 err = bpf_iter_init_seq_net(priv_data, aux);
3643 if (err)
3644 return err;
3645
3646 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3647 if (err) {
3648 bpf_iter_fini_seq_net(priv_data);
3649 return err;
3650 }
3651
3652 return 0;
3653 }
3654
3655 static void bpf_iter_fini_unix(void *priv_data)
3656 {
3657 struct bpf_unix_iter_state *iter = priv_data;
3658
3659 bpf_iter_fini_seq_net(priv_data);
3660 kvfree(iter->batch);
3661 }
3662
3663 static const struct bpf_iter_seq_info unix_seq_info = {
3664 .seq_ops = &bpf_iter_unix_seq_ops,
3665 .init_seq_private = bpf_iter_init_unix,
3666 .fini_seq_private = bpf_iter_fini_unix,
3667 .seq_priv_size = sizeof(struct bpf_unix_iter_state),
3668 };
3669
3670 static const struct bpf_func_proto *
3671 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3672 const struct bpf_prog *prog)
3673 {
3674 switch (func_id) {
3675 case BPF_FUNC_setsockopt:
3676 return &bpf_sk_setsockopt_proto;
3677 case BPF_FUNC_getsockopt:
3678 return &bpf_sk_getsockopt_proto;
3679 default:
3680 return NULL;
3681 }
3682 }
3683
3684 static struct bpf_iter_reg unix_reg_info = {
3685 .target = "unix",
3686 .ctx_arg_info_size = 1,
3687 .ctx_arg_info = {
3688 { offsetof(struct bpf_iter__unix, unix_sk),
3689 PTR_TO_BTF_ID_OR_NULL },
3690 },
3691 .get_func_proto = bpf_iter_unix_get_func_proto,
3692 .seq_info = &unix_seq_info,
3693 };
3694
3695 static void __init bpf_iter_register(void)
3696 {
3697 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3698 if (bpf_iter_reg_target(&unix_reg_info))
3699 pr_warn("Warning: could not register bpf iterator unix\n");
3700 }
3701 #endif
3702
3703 static int __init af_unix_init(void)
3704 {
3705 int i, rc = -1;
3706
3707 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3708
3709 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3710 spin_lock_init(&bsd_socket_locks[i]);
3711 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3712 }
3713
3714 rc = proto_register(&unix_dgram_proto, 1);
3715 if (rc != 0) {
3716 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3717 goto out;
3718 }
3719
3720 rc = proto_register(&unix_stream_proto, 1);
3721 if (rc != 0) {
3722 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3723 proto_unregister(&unix_dgram_proto);
3724 goto out;
3725 }
3726
3727 sock_register(&unix_family_ops);
3728 register_pernet_subsys(&unix_net_ops);
3729 unix_bpf_build_proto();
3730
3731 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3732 bpf_iter_register();
3733 #endif
3734
3735 out:
3736 return rc;
3737 }
3738
3739 static void __exit af_unix_exit(void)
3740 {
3741 sock_unregister(PF_UNIX);
3742 proto_unregister(&unix_dgram_proto);
3743 proto_unregister(&unix_stream_proto);
3744 unregister_pernet_subsys(&unix_net_ops);
3745 }
3746
3747 /* Earlier than device_initcall() so that other drivers invoking
3748 request_module() don't end up in a loop when modprobe tries
3749 to use a UNIX socket. But later than subsys_initcall() because
3750 we depend on stuff initialised there */
3751 fs_initcall(af_unix_init);
3752 module_exit(af_unix_exit);
3753
3754 MODULE_LICENSE("GPL");
3755 MODULE_ALIAS_NETPROTO(PF_UNIX);
3756