// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko Eißfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *		Christoph Rohland:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *		Alexey Kuznetsov:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *		Andrea Arcangeli:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge number
 *					of socks hashed (this for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations
 *		Alexey Kuznetsov:	Full scale SMP. Lots of bugs are introduced 8)
 *		Malcolm Beattie	:	Set peercred for socketpair
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+)
 *
 * Known differences from reference BSD that was tested:
 *
 * [TO FIX]
 * ECONNREFUSED is not returned from one end of a connected() socket to the
 *	other the moment one end closes.
 * fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *	and a fake inode identifier (nor the BSD first socket fstat twice bug).
 * [NOT TO FIX]
 * accept() returns a path name even if the connecting socket has closed
 *	in the meantime (BSD loses the path and gives up).
 * accept() returns 0 length path for an unbound connector. BSD returns 16
 *	and a null first byte in the path (but not for gethost/peername - BSD bug??)
 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
 * BSD af_unix apparently has connect forgetting to block properly.
 *	(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/splice.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>

#include "scm.h"

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}
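
/* Bucket layout, as implied by the hash helpers above and the insert/lookup
 * helpers below: unbound and abstract sockets live in the per-netns
 * net->unx.table buckets, with unbound hashes in [0, UNIX_HASH_MOD] and
 * abstract hashes in [UNIX_HASH_MOD + 1, UNIX_HASH_MOD * 2 + 1].  Pathname
 * (filesystem) sockets reuse the lower range, keyed by inode number, and are
 * additionally chained on the global bsd_socket_buckets so that
 * unix_find_socket_byinode() can find them by inode.
 */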
static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 * Check unix socket name:
 *	- it should not be zero length.
 *	- if it does not start with a zero byte, it should be NUL terminated
 *	  (a filesystem object)
 *	- if it starts with a zero byte, it is an abstract name.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist.  However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108.  Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large".  This means there's a second writeability condition
 * poll and sendmsg need to test.  The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far.
 * This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue.  This connection is established whenever a write (or
 * poll for write) hits the flow control condition and is broken when
 * the association to the server socket is dissolved or after a wake up
 * was relayed.
 */

static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED.  Otherwise, if we haven't queued any skbs
	 * to other and it's full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}

static int unix_writable(const struct sock *sk, unsigned char state)
{
	return state != TCP_LISTEN &&
		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from previous peer.  First, it allows to do
 * flow control based only on wmem_alloc; second, sk connected to peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of bidirectional dgram pipe is disconnected,
		 * we signal error.  Messages are lost.  Do not make this,
		 * when peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			WRITE_ONCE(other->sk_err, ECONNRESET);
			sk_error_report(other);
		}
	}
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		 atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	WRITE_ONCE(sk->sk_state, TCP_CLOSE);

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);

			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 * What the above comment does talk about? --ANK(980817)
	 */

	if (READ_ONCE(unix_tot_inflight))
		unix_gc();	/* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
	spin_unlock(&sk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	if (sk < peersk) {
		spin_lock(&sk->sk_peer_lock);
		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&peersk->sk_peer_lock);
		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	}
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);

	spin_unlock(&sk->sk_peer_lock);
	spin_unlock(&peersk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!READ_ONCE(u->addr))
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog = backlog;
	WRITE_ONCE(sk->sk_state, TCP_LISTEN);

	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t
unix_dgram_poll(struct file *, struct socket *,
		poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

static int unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	if (mutex_lock_interruptible(&u->iolock))
		return -EINTR;

	WRITE_ONCE(sk->sk_peek_off, val);
	mutex_unlock(&u->iolock);

	return 0;
}

#ifdef CONFIG_PROC_FS
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary. So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif

static const struct proto_ops unix_stream_ops = {
	.family = PF_UNIX,
	.owner = THIS_MODULE,
	.release = unix_release,
	.bind = unix_bind,
	.connect = unix_stream_connect,
	.socketpair = unix_socketpair,
	.accept = unix_accept,
	.getname = unix_getname,
	.poll = unix_poll,
	.ioctl = unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = unix_compat_ioctl,
#endif
	.listen = unix_listen,
	.shutdown = unix_shutdown,
	.sendmsg = unix_stream_sendmsg,
	.recvmsg = unix_stream_recvmsg,
	.read_skb = unix_stream_read_skb,
	.mmap = sock_no_mmap,
	.splice_read = unix_stream_splice_read,
	.set_peek_off = unix_set_peek_off,
	.show_fdinfo = unix_show_fdinfo,
};

static const struct proto_ops unix_dgram_ops = {
	.family = PF_UNIX,
	.owner = THIS_MODULE,
	.release = unix_release,
	.bind = unix_bind,
	.connect = unix_dgram_connect,
	.socketpair = unix_socketpair,
	.accept = sock_no_accept,
	.getname = unix_getname,
	.poll = unix_dgram_poll,
	.ioctl = unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = unix_compat_ioctl,
#endif
	.listen = sock_no_listen,
	.shutdown = unix_shutdown,
	.sendmsg = unix_dgram_sendmsg,
	.read_skb = unix_read_skb,
	.recvmsg = unix_dgram_recvmsg,
	.mmap = sock_no_mmap,
	.set_peek_off = unix_set_peek_off,
	.show_fdinfo = unix_show_fdinfo,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family = PF_UNIX,
	.owner = THIS_MODULE,
	.release = unix_release,
	.bind = unix_bind,
	.connect = unix_stream_connect,
	.socketpair = unix_socketpair,
	.accept = unix_accept,
	.getname = unix_getname,
	.poll = unix_dgram_poll,
	.ioctl = unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = unix_compat_ioctl,
#endif
	.listen = unix_listen,
	.shutdown = unix_shutdown,
	.sendmsg = unix_seqpacket_sendmsg,
	.recvmsg = unix_seqpacket_recvmsg,
	.mmap = sock_no_mmap,
	.set_peek_off = unix_set_peek_off,
	.show_fdinfo = unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}

static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}

struct proto unix_dgram_proto = {
	.name = "UNIX",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct unix_sock),
	.close = unix_close,
	.bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot = unix_dgram_bpf_update_proto,
#endif
};

struct proto unix_stream_proto = {
	.name = "UNIX-STREAM",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct unix_sock),
	.close = unix_close,
	.unhash = unix_unhash,
	.bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot = unix_stream_bpf_update_proto,
#endif
};

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /* dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash = unix_unbound_hash(sk);
	sk->sk_allocation = GFP_KERNEL_ACCOUNT;
	sk->sk_write_space = unix_write_space;
	sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
	sk->sk_destruct = unix_sock_destructor;
	u = unix_sk(sk);
	u->inflight = 0;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
	/*
	 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
	 *	nothing uses it.
	 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		fallthrough;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sk = unix_create1(net, sock, kern, sock->type);
	if (IS_ERR(sk))
		return PTR_ERR(sk);

	return 0;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}

static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct dentry *dentry;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	dentry = unix_sk(sk)->path.dentry;
	if (dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunaddr,
				    int addr_len, int type)
{
	struct sock *sk;

	if (sunaddr->sun_path[0])
		sk = unix_find_bsd(sunaddr, addr_len, type);
	else
		sk = unix_find_abstract(net, sunaddr, addr_len, type);

	return sk;
}

static int unix_autobind(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	old_hash = sk->sk_hash;
	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seem to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}

static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	old_hash = sk->sk_hash;
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}

static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	old_hash = sk->sk_hash;
	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	int err;

	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
	    sunaddr->sun_family == AF_UNIX)
		return unix_autobind(sk);

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		return err;

	if (sunaddr->sun_path[0])
		err = unix_bind_bsd(sk, sunaddr, addr_len);
	else
		err = unix_bind_abstract(sk, sunaddr, addr_len);

	return err;
}

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 > sk2)
		swap(sk1, sk2);

	unix_state_lock(sk1);
	unix_state_lock_nested(sk2, U_LOCK_SECOND);
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}
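
/* Illustrative note (userspace view): a connected datagram socket can drop
 * its peer association again by calling connect() with an AF_UNSPEC address,
 * which takes the "breaking connected state" branch below, e.g.:
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *	connect(fd, &sa, sizeof(sa));
 */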
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !READ_ONCE(unix_sk(sk)->addr)) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer) {
			unix_dgram_disconnected(sk, old_peer);

			unix_state_lock(old_peer);
			if (!unix_peer(old_peer))
				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
			unix_state_unlock(old_peer);
		}

		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}

static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/* Find listening sock.
	 */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full_lockless(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   This is a tricky place.  We need to grab our state lock and cannot
	   drop the lock on the peer.  It is dangerous because deadlock is
	   possible.  Connect to self and simultaneous
	   attempts to connect are eliminated by checking the socket
	   state.  other is TCP_LISTEN; if sk is TCP_LISTEN we
	   check this before attempting to grab the lock.

	   Well, and we have to recheck the state after the socket is locked.
	 */
	switch (READ_ONCE(sk->sk_state)) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk, U_LOCK_SECOND);

	if (sk->sk_state != TCP_CLOSE) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open!  Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk) = sk;
	newsk->sk_state = TCP_ESTABLISHED;
	newsk->sk_type = sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same guarantees
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state = SS_CONNECTED;
	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk) = newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	ska->sk_state = TCP_ESTABLISHED;
	skb->sk_state = TCP_ESTABLISHED;
	socka->state = SS_CONNECTED;
	sockb->state = SS_CONNECTED;
	return 0;
}

static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSPIDFD, &old->flags))
		set_bit(SOCK_PASSPIDFD, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}
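
/* Note on accept(): the child sock is not created here.  It was already
 * allocated by the connecting side in unix_stream_connect() and queued on
 * the listener's receive queue as a small skb whose skb->sk points at it;
 * unix_accept() below only dequeues that skb and grafts the embryo sock
 * onto the new socket.
 */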
static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);
	}
	sock_put(sk);
out:
	return err;
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	/*
	 * Garbage collection of unix sockets starts by selecting a set of
	 * candidate sockets which have reference only from being in flight
	 * (total_refs == inflight_refs).  This condition is checked once during
	 * the candidate collection phase, and candidates are marked as such, so
	 * that non-candidates can later be ignored.  While inflight_refs is
	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
	 * is an instantaneous decision.
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operation that changes the file count through file descriptors
	 * (dup, close, sendmsg) does not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeuing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd.  The following lock/unlock
	 * pair is to ensure serialization with garbage collection.  It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate.  If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
	 */
	spin_lock(&unix_gc_lock);
	spin_unlock(&unix_gc_lock);
}

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (unix_passcred_enabled(sock, other)) {
		UNIXCB(skb).pid = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	return UNIXCB(skb).pid == scm->pid &&
	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}

static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count))
		atomic_add(fp->count, &u->scm_stat.nr_fds);
}

static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count))
		atomic_sub(fp->count, &u->scm_stat.nr_fds);
}

/*
 * Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > READ_ONCE(sk->sk_sndbuf) - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
2016 */ 2017 unix_state_unlock(sk); 2018 err = -EPIPE; 2019 } else if (unix_peer(sk) == other) { 2020 unix_peer(sk) = NULL; 2021 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2022 2023 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 2024 unix_state_unlock(sk); 2025 2026 unix_dgram_disconnected(sk, other); 2027 sock_put(other); 2028 err = -ECONNREFUSED; 2029 } else { 2030 unix_state_unlock(sk); 2031 } 2032 2033 other = NULL; 2034 if (err) 2035 goto out_free; 2036 goto restart; 2037 } 2038 2039 err = -EPIPE; 2040 if (other->sk_shutdown & RCV_SHUTDOWN) 2041 goto out_unlock; 2042 2043 if (sk->sk_type != SOCK_SEQPACKET) { 2044 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2045 if (err) 2046 goto out_unlock; 2047 } 2048 2049 /* other == sk && unix_peer(other) != sk if 2050 * - unix_peer(sk) == NULL, destination address bound to sk 2051 * - unix_peer(sk) == sk by time of get but disconnected before lock 2052 */ 2053 if (other != sk && 2054 unlikely(unix_peer(other) != sk && 2055 unix_recvq_full_lockless(other))) { 2056 if (timeo) { 2057 timeo = unix_wait_for_peer(other, timeo); 2058 2059 err = sock_intr_errno(timeo); 2060 if (signal_pending(current)) 2061 goto out_free; 2062 2063 goto restart; 2064 } 2065 2066 if (!sk_locked) { 2067 unix_state_unlock(other); 2068 unix_state_double_lock(sk, other); 2069 } 2070 2071 if (unix_peer(sk) != other || 2072 unix_dgram_peer_wake_me(sk, other)) { 2073 err = -EAGAIN; 2074 sk_locked = 1; 2075 goto out_unlock; 2076 } 2077 2078 if (!sk_locked) { 2079 sk_locked = 1; 2080 goto restart_locked; 2081 } 2082 } 2083 2084 if (unlikely(sk_locked)) 2085 unix_state_unlock(sk); 2086 2087 if (sock_flag(other, SOCK_RCVTSTAMP)) 2088 __net_timestamp(skb); 2089 maybe_add_creds(skb, sock, other); 2090 scm_stat_add(other, skb); 2091 skb_queue_tail(&other->sk_receive_queue, skb); 2092 unix_state_unlock(other); 2093 other->sk_data_ready(other); 2094 sock_put(other); 2095 scm_destroy(&scm); 2096 return len; 2097 2098 out_unlock: 2099 if (sk_locked) 2100 unix_state_unlock(sk); 2101 unix_state_unlock(other); 2102 out_free: 2103 kfree_skb(skb); 2104 out: 2105 if (other) 2106 sock_put(other); 2107 scm_destroy(&scm); 2108 return err; 2109 } 2110 2111 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2112 * bytes, and a minimum of a full page. 
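 *
 * Worked example for the macro below: with 4 KiB pages get_order(32768)
 * is 3, so UNIX_SKB_FRAGS_SZ is 4096 << 3 == 32768; with 64 KiB pages
 * get_order(32768) is 0 and the limit becomes one full page (65536),
 * which is where the "minimum of a full page" above comes from.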
2113 */ 2114 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2115 2116 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2117 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, 2118 struct scm_cookie *scm, bool fds_sent) 2119 { 2120 struct unix_sock *ousk = unix_sk(other); 2121 struct sk_buff *skb; 2122 int err = 0; 2123 2124 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2125 2126 if (!skb) 2127 return err; 2128 2129 err = unix_scm_to_skb(scm, skb, !fds_sent); 2130 if (err < 0) { 2131 kfree_skb(skb); 2132 return err; 2133 } 2134 skb_put(skb, 1); 2135 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2136 2137 if (err) { 2138 kfree_skb(skb); 2139 return err; 2140 } 2141 2142 unix_state_lock(other); 2143 2144 if (sock_flag(other, SOCK_DEAD) || 2145 (other->sk_shutdown & RCV_SHUTDOWN)) { 2146 unix_state_unlock(other); 2147 kfree_skb(skb); 2148 return -EPIPE; 2149 } 2150 2151 maybe_add_creds(skb, sock, other); 2152 skb_get(skb); 2153 2154 scm_stat_add(other, skb); 2155 2156 spin_lock(&other->sk_receive_queue.lock); 2157 if (ousk->oob_skb) 2158 consume_skb(ousk->oob_skb); 2159 WRITE_ONCE(ousk->oob_skb, skb); 2160 __skb_queue_tail(&other->sk_receive_queue, skb); 2161 spin_unlock(&other->sk_receive_queue.lock); 2162 2163 sk_send_sigurg(other); 2164 unix_state_unlock(other); 2165 other->sk_data_ready(other); 2166 2167 return err; 2168 } 2169 #endif 2170 2171 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2172 size_t len) 2173 { 2174 struct sock *sk = sock->sk; 2175 struct sock *other = NULL; 2176 int err, size; 2177 struct sk_buff *skb; 2178 int sent = 0; 2179 struct scm_cookie scm; 2180 bool fds_sent = false; 2181 int data_len; 2182 2183 wait_for_unix_gc(); 2184 err = scm_send(sock, msg, &scm, false); 2185 if (err < 0) 2186 return err; 2187 2188 err = -EOPNOTSUPP; 2189 if (msg->msg_flags & MSG_OOB) { 2190 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2191 if (len) 2192 len--; 2193 else 2194 #endif 2195 goto out_err; 2196 } 2197 2198 if (msg->msg_namelen) { 2199 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; 2200 goto out_err; 2201 } else { 2202 err = -ENOTCONN; 2203 other = unix_peer(sk); 2204 if (!other) 2205 goto out_err; 2206 } 2207 2208 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2209 goto pipe_err; 2210 2211 while (sent < len) { 2212 size = len - sent; 2213 2214 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2215 skb = sock_alloc_send_pskb(sk, 0, 0, 2216 msg->msg_flags & MSG_DONTWAIT, 2217 &err, 0); 2218 } else { 2219 /* Keep two messages in the pipe so it schedules better */ 2220 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); 2221 2222 /* allow fallback to order-0 allocations */ 2223 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2224 2225 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2226 2227 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2228 2229 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2230 msg->msg_flags & MSG_DONTWAIT, &err, 2231 get_order(UNIX_SKB_FRAGS_SZ)); 2232 } 2233 if (!skb) 2234 goto out_err; 2235 2236 /* Only send the fds in the first buffer */ 2237 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2238 if (err < 0) { 2239 kfree_skb(skb); 2240 goto out_err; 2241 } 2242 fds_sent = true; 2243 2244 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2245 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2246 sk->sk_allocation); 2247 if (err < 0) { 2248 kfree_skb(skb); 2249 goto out_err; 2250 } 2251 size = err; 2252 refcount_add(size, &sk->sk_wmem_alloc); 2253 } else { 2254 skb_put(skb, size - data_len); 2255 skb->data_len = data_len; 2256 skb->len = size; 2257 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2258 if (err) { 2259 kfree_skb(skb); 2260 goto out_err; 2261 } 2262 } 2263 2264 unix_state_lock(other); 2265 2266 if (sock_flag(other, SOCK_DEAD) || 2267 (other->sk_shutdown & RCV_SHUTDOWN)) 2268 goto pipe_err_free; 2269 2270 maybe_add_creds(skb, sock, other); 2271 scm_stat_add(other, skb); 2272 skb_queue_tail(&other->sk_receive_queue, skb); 2273 unix_state_unlock(other); 2274 other->sk_data_ready(other); 2275 sent += size; 2276 } 2277 2278 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2279 if (msg->msg_flags & MSG_OOB) { 2280 err = queue_oob(sock, msg, other, &scm, fds_sent); 2281 if (err) 2282 goto out_err; 2283 sent++; 2284 } 2285 #endif 2286 2287 scm_destroy(&scm); 2288 2289 return sent; 2290 2291 pipe_err_free: 2292 unix_state_unlock(other); 2293 kfree_skb(skb); 2294 pipe_err: 2295 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) 2296 send_sig(SIGPIPE, current, 0); 2297 err = -EPIPE; 2298 out_err: 2299 scm_destroy(&scm); 2300 return sent ? 
: err; 2301 } 2302 2303 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2304 size_t len) 2305 { 2306 int err; 2307 struct sock *sk = sock->sk; 2308 2309 err = sock_error(sk); 2310 if (err) 2311 return err; 2312 2313 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2314 return -ENOTCONN; 2315 2316 if (msg->msg_namelen) 2317 msg->msg_namelen = 0; 2318 2319 return unix_dgram_sendmsg(sock, msg, len); 2320 } 2321 2322 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2323 size_t size, int flags) 2324 { 2325 struct sock *sk = sock->sk; 2326 2327 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2328 return -ENOTCONN; 2329 2330 return unix_dgram_recvmsg(sock, msg, size, flags); 2331 } 2332 2333 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2334 { 2335 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2336 2337 if (addr) { 2338 msg->msg_namelen = addr->len; 2339 memcpy(msg->msg_name, addr->name, addr->len); 2340 } 2341 } 2342 2343 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2344 int flags) 2345 { 2346 struct scm_cookie scm; 2347 struct socket *sock = sk->sk_socket; 2348 struct unix_sock *u = unix_sk(sk); 2349 struct sk_buff *skb, *last; 2350 long timeo; 2351 int skip; 2352 int err; 2353 2354 err = -EOPNOTSUPP; 2355 if (flags&MSG_OOB) 2356 goto out; 2357 2358 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2359 2360 do { 2361 mutex_lock(&u->iolock); 2362 2363 skip = sk_peek_offset(sk, flags); 2364 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2365 &skip, &err, &last); 2366 if (skb) { 2367 if (!(flags & MSG_PEEK)) 2368 scm_stat_del(sk, skb); 2369 break; 2370 } 2371 2372 mutex_unlock(&u->iolock); 2373 2374 if (err != -EAGAIN) 2375 break; 2376 } while (timeo && 2377 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2378 &err, &timeo, last)); 2379 2380 if (!skb) { /* implies iolock unlocked */ 2381 unix_state_lock(sk); 2382 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2383 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2384 (sk->sk_shutdown & RCV_SHUTDOWN)) 2385 err = 0; 2386 unix_state_unlock(sk); 2387 goto out; 2388 } 2389 2390 if (wq_has_sleeper(&u->peer_wait)) 2391 wake_up_interruptible_sync_poll(&u->peer_wait, 2392 EPOLLOUT | EPOLLWRNORM | 2393 EPOLLWRBAND); 2394 2395 if (msg->msg_name) 2396 unix_copy_addr(msg, skb->sk); 2397 2398 if (size > skb->len - skip) 2399 size = skb->len - skip; 2400 else if (size < skb->len - skip) 2401 msg->msg_flags |= MSG_TRUNC; 2402 2403 err = skb_copy_datagram_msg(skb, skip, msg, size); 2404 if (err) 2405 goto out_free; 2406 2407 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2408 __sock_recv_timestamp(msg, sk, skb); 2409 2410 memset(&scm, 0, sizeof(scm)); 2411 2412 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2413 unix_set_secdata(&scm, skb); 2414 2415 if (!(flags & MSG_PEEK)) { 2416 if (UNIXCB(skb).fp) 2417 unix_detach_fds(&scm, skb); 2418 2419 sk_peek_offset_bwd(sk, skb->len); 2420 } else { 2421 /* It is questionable: on PEEK we could: 2422 - do not return fds - good, but too simple 8) 2423 - return fds, and do not return them on read (old strategy, 2424 apparently wrong) 2425 - clone fds (I chose it for now, it is the most universal 2426 solution) 2427 2428 POSIX 1003.1g does not actually define this clearly 2429 at all. POSIX 1003.1g doesn't define a lot of things 2430 clearly however! 
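
		   In practice the clone-fds strategy means a MSG_PEEK receive
		   installs duplicates of any SCM_RIGHTS descriptors in the
		   caller, and a later plain recvmsg() of the same message
		   installs fresh duplicates again; a peeking reader therefore
		   has to close what it peeked or it leaks descriptors.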
2431 2432 */ 2433 2434 sk_peek_offset_fwd(sk, size); 2435 2436 if (UNIXCB(skb).fp) 2437 unix_peek_fds(&scm, skb); 2438 } 2439 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2440 2441 scm_recv_unix(sock, msg, &scm, flags); 2442 2443 out_free: 2444 skb_free_datagram(sk, skb); 2445 mutex_unlock(&u->iolock); 2446 out: 2447 return err; 2448 } 2449 2450 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2451 int flags) 2452 { 2453 struct sock *sk = sock->sk; 2454 2455 #ifdef CONFIG_BPF_SYSCALL 2456 const struct proto *prot = READ_ONCE(sk->sk_prot); 2457 2458 if (prot != &unix_dgram_proto) 2459 return prot->recvmsg(sk, msg, size, flags, NULL); 2460 #endif 2461 return __unix_dgram_recvmsg(sk, msg, size, flags); 2462 } 2463 2464 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2465 { 2466 struct unix_sock *u = unix_sk(sk); 2467 struct sk_buff *skb; 2468 int err; 2469 2470 mutex_lock(&u->iolock); 2471 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2472 mutex_unlock(&u->iolock); 2473 if (!skb) 2474 return err; 2475 2476 return recv_actor(sk, skb); 2477 } 2478 2479 /* 2480 * Sleep until more data has arrived. But check for races.. 2481 */ 2482 static long unix_stream_data_wait(struct sock *sk, long timeo, 2483 struct sk_buff *last, unsigned int last_len, 2484 bool freezable) 2485 { 2486 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2487 struct sk_buff *tail; 2488 DEFINE_WAIT(wait); 2489 2490 unix_state_lock(sk); 2491 2492 for (;;) { 2493 prepare_to_wait(sk_sleep(sk), &wait, state); 2494 2495 tail = skb_peek_tail(&sk->sk_receive_queue); 2496 if (tail != last || 2497 (tail && tail->len != last_len) || 2498 sk->sk_err || 2499 (sk->sk_shutdown & RCV_SHUTDOWN) || 2500 signal_pending(current) || 2501 !timeo) 2502 break; 2503 2504 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2505 unix_state_unlock(sk); 2506 timeo = schedule_timeout(timeo); 2507 unix_state_lock(sk); 2508 2509 if (sock_flag(sk, SOCK_DEAD)) 2510 break; 2511 2512 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2513 } 2514 2515 finish_wait(sk_sleep(sk), &wait); 2516 unix_state_unlock(sk); 2517 return timeo; 2518 } 2519 2520 static unsigned int unix_skb_len(const struct sk_buff *skb) 2521 { 2522 return skb->len - UNIXCB(skb).consumed; 2523 } 2524 2525 struct unix_stream_read_state { 2526 int (*recv_actor)(struct sk_buff *, int, int, 2527 struct unix_stream_read_state *); 2528 struct socket *socket; 2529 struct msghdr *msg; 2530 struct pipe_inode_info *pipe; 2531 size_t size; 2532 int flags; 2533 unsigned int splice_flags; 2534 }; 2535 2536 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2537 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2538 { 2539 struct socket *sock = state->socket; 2540 struct sock *sk = sock->sk; 2541 struct unix_sock *u = unix_sk(sk); 2542 int chunk = 1; 2543 struct sk_buff *oob_skb; 2544 2545 mutex_lock(&u->iolock); 2546 unix_state_lock(sk); 2547 spin_lock(&sk->sk_receive_queue.lock); 2548 2549 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2550 spin_unlock(&sk->sk_receive_queue.lock); 2551 unix_state_unlock(sk); 2552 mutex_unlock(&u->iolock); 2553 return -EINVAL; 2554 } 2555 2556 oob_skb = u->oob_skb; 2557 2558 if (!(state->flags & MSG_PEEK)) 2559 WRITE_ONCE(u->oob_skb, NULL); 2560 else 2561 skb_get(oob_skb); 2562 2563 spin_unlock(&sk->sk_receive_queue.lock); 2564 unix_state_unlock(sk); 2565 2566 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2567 2568 if (!(state->flags & MSG_PEEK)) 2569 UNIXCB(oob_skb).consumed += 1; 2570 2571 
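	/* Drop either the extra reference taken with skb_get() above
	 * (MSG_PEEK) or the one that u->oob_skb held before it was
	 * cleared; the skb itself stays on the receive queue until the
	 * stream reader unlinks it.
	 */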
consume_skb(oob_skb); 2572 2573 mutex_unlock(&u->iolock); 2574 2575 if (chunk < 0) 2576 return -EFAULT; 2577 2578 state->msg->msg_flags |= MSG_OOB; 2579 return 1; 2580 } 2581 2582 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2583 int flags, int copied) 2584 { 2585 struct unix_sock *u = unix_sk(sk); 2586 2587 if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) { 2588 skb_unlink(skb, &sk->sk_receive_queue); 2589 consume_skb(skb); 2590 skb = NULL; 2591 } else { 2592 struct sk_buff *unlinked_skb = NULL; 2593 2594 spin_lock(&sk->sk_receive_queue.lock); 2595 2596 if (skb == u->oob_skb) { 2597 if (copied) { 2598 skb = NULL; 2599 } else if (!(flags & MSG_PEEK)) { 2600 if (sock_flag(sk, SOCK_URGINLINE)) { 2601 WRITE_ONCE(u->oob_skb, NULL); 2602 consume_skb(skb); 2603 } else { 2604 __skb_unlink(skb, &sk->sk_receive_queue); 2605 WRITE_ONCE(u->oob_skb, NULL); 2606 unlinked_skb = skb; 2607 skb = skb_peek(&sk->sk_receive_queue); 2608 } 2609 } else if (!sock_flag(sk, SOCK_URGINLINE)) { 2610 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2611 } 2612 } 2613 2614 spin_unlock(&sk->sk_receive_queue.lock); 2615 2616 if (unlinked_skb) { 2617 WARN_ON_ONCE(skb_unref(unlinked_skb)); 2618 kfree_skb(unlinked_skb); 2619 } 2620 } 2621 return skb; 2622 } 2623 #endif 2624 2625 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2626 { 2627 struct unix_sock *u = unix_sk(sk); 2628 struct sk_buff *skb; 2629 int err; 2630 2631 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) 2632 return -ENOTCONN; 2633 2634 mutex_lock(&u->iolock); 2635 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2636 mutex_unlock(&u->iolock); 2637 if (!skb) 2638 return err; 2639 2640 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2641 if (unlikely(skb == READ_ONCE(u->oob_skb))) { 2642 bool drop = false; 2643 2644 unix_state_lock(sk); 2645 2646 if (sock_flag(sk, SOCK_DEAD)) { 2647 unix_state_unlock(sk); 2648 kfree_skb(skb); 2649 return -ECONNRESET; 2650 } 2651 2652 spin_lock(&sk->sk_receive_queue.lock); 2653 if (likely(skb == u->oob_skb)) { 2654 WRITE_ONCE(u->oob_skb, NULL); 2655 drop = true; 2656 } 2657 spin_unlock(&sk->sk_receive_queue.lock); 2658 2659 unix_state_unlock(sk); 2660 2661 if (drop) { 2662 WARN_ON_ONCE(skb_unref(skb)); 2663 kfree_skb(skb); 2664 return -EAGAIN; 2665 } 2666 } 2667 #endif 2668 2669 return recv_actor(sk, skb); 2670 } 2671 2672 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2673 bool freezable) 2674 { 2675 struct scm_cookie scm; 2676 struct socket *sock = state->socket; 2677 struct sock *sk = sock->sk; 2678 struct unix_sock *u = unix_sk(sk); 2679 int copied = 0; 2680 int flags = state->flags; 2681 int noblock = flags & MSG_DONTWAIT; 2682 bool check_creds = false; 2683 int target; 2684 int err = 0; 2685 long timeo; 2686 int skip; 2687 size_t size = state->size; 2688 unsigned int last_len; 2689 2690 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { 2691 err = -EINVAL; 2692 goto out; 2693 } 2694 2695 if (unlikely(flags & MSG_OOB)) { 2696 err = -EOPNOTSUPP; 2697 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2698 err = unix_stream_recv_urg(state); 2699 #endif 2700 goto out; 2701 } 2702 2703 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); 2704 timeo = sock_rcvtimeo(sk, noblock); 2705 2706 memset(&scm, 0, sizeof(scm)); 2707 2708 /* Lock the socket to prevent queue disordering 2709 * while sleeps in memcpy_tomsg 2710 */ 2711 mutex_lock(&u->iolock); 2712 2713 skip = max(sk_peek_offset(sk, flags), 0); 2714 2715 do { 2716 int chunk; 2717 bool drop_skb; 2718 
struct sk_buff *skb, *last; 2719 2720 redo: 2721 unix_state_lock(sk); 2722 if (sock_flag(sk, SOCK_DEAD)) { 2723 err = -ECONNRESET; 2724 goto unlock; 2725 } 2726 last = skb = skb_peek(&sk->sk_receive_queue); 2727 last_len = last ? last->len : 0; 2728 2729 again: 2730 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2731 if (skb) { 2732 skb = manage_oob(skb, sk, flags, copied); 2733 if (!skb && copied) { 2734 unix_state_unlock(sk); 2735 break; 2736 } 2737 } 2738 #endif 2739 if (skb == NULL) { 2740 if (copied >= target) 2741 goto unlock; 2742 2743 /* 2744 * POSIX 1003.1g mandates this order. 2745 */ 2746 2747 err = sock_error(sk); 2748 if (err) 2749 goto unlock; 2750 if (sk->sk_shutdown & RCV_SHUTDOWN) 2751 goto unlock; 2752 2753 unix_state_unlock(sk); 2754 if (!timeo) { 2755 err = -EAGAIN; 2756 break; 2757 } 2758 2759 mutex_unlock(&u->iolock); 2760 2761 timeo = unix_stream_data_wait(sk, timeo, last, 2762 last_len, freezable); 2763 2764 if (signal_pending(current)) { 2765 err = sock_intr_errno(timeo); 2766 scm_destroy(&scm); 2767 goto out; 2768 } 2769 2770 mutex_lock(&u->iolock); 2771 goto redo; 2772 unlock: 2773 unix_state_unlock(sk); 2774 break; 2775 } 2776 2777 while (skip >= unix_skb_len(skb)) { 2778 skip -= unix_skb_len(skb); 2779 last = skb; 2780 last_len = skb->len; 2781 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2782 if (!skb) 2783 goto again; 2784 } 2785 2786 unix_state_unlock(sk); 2787 2788 if (check_creds) { 2789 /* Never glue messages from different writers */ 2790 if (!unix_skb_scm_eq(skb, &scm)) 2791 break; 2792 } else if (test_bit(SOCK_PASSCRED, &sock->flags) || 2793 test_bit(SOCK_PASSPIDFD, &sock->flags)) { 2794 /* Copy credentials */ 2795 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2796 unix_set_secdata(&scm, skb); 2797 check_creds = true; 2798 } 2799 2800 /* Copy address just once */ 2801 if (state->msg && state->msg->msg_name) { 2802 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, 2803 state->msg->msg_name); 2804 unix_copy_addr(state->msg, skb->sk); 2805 sunaddr = NULL; 2806 } 2807 2808 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 2809 skb_get(skb); 2810 chunk = state->recv_actor(skb, skip, chunk, state); 2811 drop_skb = !unix_skb_len(skb); 2812 /* skb is only safe to use if !drop_skb */ 2813 consume_skb(skb); 2814 if (chunk < 0) { 2815 if (copied == 0) 2816 copied = -EFAULT; 2817 break; 2818 } 2819 copied += chunk; 2820 size -= chunk; 2821 2822 if (drop_skb) { 2823 /* the skb was touched by a concurrent reader; 2824 * we should not expect anything from this skb 2825 * anymore and assume it invalid - we can be 2826 * sure it was dropped from the socket queue 2827 * 2828 * let's report a short read 2829 */ 2830 err = 0; 2831 break; 2832 } 2833 2834 /* Mark read part of skb as used */ 2835 if (!(flags & MSG_PEEK)) { 2836 UNIXCB(skb).consumed += chunk; 2837 2838 sk_peek_offset_bwd(sk, chunk); 2839 2840 if (UNIXCB(skb).fp) { 2841 scm_stat_del(sk, skb); 2842 unix_detach_fds(&scm, skb); 2843 } 2844 2845 if (unix_skb_len(skb)) 2846 break; 2847 2848 skb_unlink(skb, &sk->sk_receive_queue); 2849 consume_skb(skb); 2850 2851 if (scm.fp) 2852 break; 2853 } else { 2854 /* It is questionable, see note in unix_dgram_recvmsg. 
2855 */ 2856 if (UNIXCB(skb).fp) 2857 unix_peek_fds(&scm, skb); 2858 2859 sk_peek_offset_fwd(sk, chunk); 2860 2861 if (UNIXCB(skb).fp) 2862 break; 2863 2864 skip = 0; 2865 last = skb; 2866 last_len = skb->len; 2867 unix_state_lock(sk); 2868 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2869 if (skb) 2870 goto again; 2871 unix_state_unlock(sk); 2872 break; 2873 } 2874 } while (size); 2875 2876 mutex_unlock(&u->iolock); 2877 if (state->msg) 2878 scm_recv_unix(sock, state->msg, &scm, flags); 2879 else 2880 scm_destroy(&scm); 2881 out: 2882 return copied ? : err; 2883 } 2884 2885 static int unix_stream_read_actor(struct sk_buff *skb, 2886 int skip, int chunk, 2887 struct unix_stream_read_state *state) 2888 { 2889 int ret; 2890 2891 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2892 state->msg, chunk); 2893 return ret ?: chunk; 2894 } 2895 2896 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2897 size_t size, int flags) 2898 { 2899 struct unix_stream_read_state state = { 2900 .recv_actor = unix_stream_read_actor, 2901 .socket = sk->sk_socket, 2902 .msg = msg, 2903 .size = size, 2904 .flags = flags 2905 }; 2906 2907 return unix_stream_read_generic(&state, true); 2908 } 2909 2910 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2911 size_t size, int flags) 2912 { 2913 struct unix_stream_read_state state = { 2914 .recv_actor = unix_stream_read_actor, 2915 .socket = sock, 2916 .msg = msg, 2917 .size = size, 2918 .flags = flags 2919 }; 2920 2921 #ifdef CONFIG_BPF_SYSCALL 2922 struct sock *sk = sock->sk; 2923 const struct proto *prot = READ_ONCE(sk->sk_prot); 2924 2925 if (prot != &unix_stream_proto) 2926 return prot->recvmsg(sk, msg, size, flags, NULL); 2927 #endif 2928 return unix_stream_read_generic(&state, true); 2929 } 2930 2931 static int unix_stream_splice_actor(struct sk_buff *skb, 2932 int skip, int chunk, 2933 struct unix_stream_read_state *state) 2934 { 2935 return skb_splice_bits(skb, state->socket->sk, 2936 UNIXCB(skb).consumed + skip, 2937 state->pipe, chunk, state->splice_flags); 2938 } 2939 2940 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 2941 struct pipe_inode_info *pipe, 2942 size_t size, unsigned int flags) 2943 { 2944 struct unix_stream_read_state state = { 2945 .recv_actor = unix_stream_splice_actor, 2946 .socket = sock, 2947 .pipe = pipe, 2948 .size = size, 2949 .splice_flags = flags, 2950 }; 2951 2952 if (unlikely(*ppos)) 2953 return -ESPIPE; 2954 2955 if (sock->file->f_flags & O_NONBLOCK || 2956 flags & SPLICE_F_NONBLOCK) 2957 state.flags = MSG_DONTWAIT; 2958 2959 return unix_stream_read_generic(&state, false); 2960 } 2961 2962 static int unix_shutdown(struct socket *sock, int mode) 2963 { 2964 struct sock *sk = sock->sk; 2965 struct sock *other; 2966 2967 if (mode < SHUT_RD || mode > SHUT_RDWR) 2968 return -EINVAL; 2969 /* This maps: 2970 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 2971 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 2972 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 2973 */ 2974 ++mode; 2975 2976 unix_state_lock(sk); 2977 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 2978 other = unix_peer(sk); 2979 if (other) 2980 sock_hold(other); 2981 unix_state_unlock(sk); 2982 sk->sk_state_change(sk); 2983 2984 if (other && 2985 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 2986 2987 int peer_mode = 0; 2988 const struct proto *prot = READ_ONCE(other->sk_prot); 2989 2990 if (prot->unhash) 2991 prot->unhash(other); 2992 if (mode&RCV_SHUTDOWN) 2993 peer_mode |= SEND_SHUTDOWN; 2994 if 
(mode&SEND_SHUTDOWN) 2995 peer_mode |= RCV_SHUTDOWN; 2996 unix_state_lock(other); 2997 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 2998 unix_state_unlock(other); 2999 other->sk_state_change(other); 3000 if (peer_mode == SHUTDOWN_MASK) 3001 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 3002 else if (peer_mode & RCV_SHUTDOWN) 3003 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 3004 } 3005 if (other) 3006 sock_put(other); 3007 3008 return 0; 3009 } 3010 3011 long unix_inq_len(struct sock *sk) 3012 { 3013 struct sk_buff *skb; 3014 long amount = 0; 3015 3016 if (READ_ONCE(sk->sk_state) == TCP_LISTEN) 3017 return -EINVAL; 3018 3019 spin_lock(&sk->sk_receive_queue.lock); 3020 if (sk->sk_type == SOCK_STREAM || 3021 sk->sk_type == SOCK_SEQPACKET) { 3022 skb_queue_walk(&sk->sk_receive_queue, skb) 3023 amount += unix_skb_len(skb); 3024 } else { 3025 skb = skb_peek(&sk->sk_receive_queue); 3026 if (skb) 3027 amount = skb->len; 3028 } 3029 spin_unlock(&sk->sk_receive_queue.lock); 3030 3031 return amount; 3032 } 3033 EXPORT_SYMBOL_GPL(unix_inq_len); 3034 3035 long unix_outq_len(struct sock *sk) 3036 { 3037 return sk_wmem_alloc_get(sk); 3038 } 3039 EXPORT_SYMBOL_GPL(unix_outq_len); 3040 3041 static int unix_open_file(struct sock *sk) 3042 { 3043 struct path path; 3044 struct file *f; 3045 int fd; 3046 3047 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3048 return -EPERM; 3049 3050 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3051 return -ENOENT; 3052 3053 path = unix_sk(sk)->path; 3054 if (!path.dentry) 3055 return -ENOENT; 3056 3057 path_get(&path); 3058 3059 fd = get_unused_fd_flags(O_CLOEXEC); 3060 if (fd < 0) 3061 goto out; 3062 3063 f = dentry_open(&path, O_PATH, current_cred()); 3064 if (IS_ERR(f)) { 3065 put_unused_fd(fd); 3066 fd = PTR_ERR(f); 3067 goto out; 3068 } 3069 3070 fd_install(fd, f); 3071 out: 3072 path_put(&path); 3073 3074 return fd; 3075 } 3076 3077 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3078 { 3079 struct sock *sk = sock->sk; 3080 long amount = 0; 3081 int err; 3082 3083 switch (cmd) { 3084 case SIOCOUTQ: 3085 amount = unix_outq_len(sk); 3086 err = put_user(amount, (int __user *)arg); 3087 break; 3088 case SIOCINQ: 3089 amount = unix_inq_len(sk); 3090 if (amount < 0) 3091 err = amount; 3092 else 3093 err = put_user(amount, (int __user *)arg); 3094 break; 3095 case SIOCUNIXFILE: 3096 err = unix_open_file(sk); 3097 break; 3098 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3099 case SIOCATMARK: 3100 { 3101 struct sk_buff *skb; 3102 int answ = 0; 3103 3104 skb = skb_peek(&sk->sk_receive_queue); 3105 if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb)) 3106 answ = 1; 3107 err = put_user(answ, (int __user *)arg); 3108 } 3109 break; 3110 #endif 3111 default: 3112 err = -ENOIOCTLCMD; 3113 break; 3114 } 3115 return err; 3116 } 3117 3118 #ifdef CONFIG_COMPAT 3119 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3120 { 3121 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3122 } 3123 #endif 3124 3125 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3126 { 3127 struct sock *sk = sock->sk; 3128 unsigned char state; 3129 __poll_t mask; 3130 u8 shutdown; 3131 3132 sock_poll_wait(file, sock, wait); 3133 mask = 0; 3134 shutdown = READ_ONCE(sk->sk_shutdown); 3135 state = READ_ONCE(sk->sk_state); 3136 3137 /* exceptional events? 
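	 * (a pending sk_err maps to EPOLLERR, a fully shut down socket to
	 * EPOLLHUP, and a receive-side shutdown to EPOLLRDHUP)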
*/ 3138 if (READ_ONCE(sk->sk_err)) 3139 mask |= EPOLLERR; 3140 if (shutdown == SHUTDOWN_MASK) 3141 mask |= EPOLLHUP; 3142 if (shutdown & RCV_SHUTDOWN) 3143 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3144 3145 /* readable? */ 3146 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3147 mask |= EPOLLIN | EPOLLRDNORM; 3148 if (sk_is_readable(sk)) 3149 mask |= EPOLLIN | EPOLLRDNORM; 3150 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3151 if (READ_ONCE(unix_sk(sk)->oob_skb)) 3152 mask |= EPOLLPRI; 3153 #endif 3154 3155 /* Connection-based need to check for termination and startup */ 3156 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && 3157 state == TCP_CLOSE) 3158 mask |= EPOLLHUP; 3159 3160 /* 3161 * we set writable also when the other side has shut down the 3162 * connection. This prevents stuck sockets. 3163 */ 3164 if (unix_writable(sk, state)) 3165 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3166 3167 return mask; 3168 } 3169 3170 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, 3171 poll_table *wait) 3172 { 3173 struct sock *sk = sock->sk, *other; 3174 unsigned int writable; 3175 unsigned char state; 3176 __poll_t mask; 3177 u8 shutdown; 3178 3179 sock_poll_wait(file, sock, wait); 3180 mask = 0; 3181 shutdown = READ_ONCE(sk->sk_shutdown); 3182 state = READ_ONCE(sk->sk_state); 3183 3184 /* exceptional events? */ 3185 if (READ_ONCE(sk->sk_err) || 3186 !skb_queue_empty_lockless(&sk->sk_error_queue)) 3187 mask |= EPOLLERR | 3188 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); 3189 3190 if (shutdown & RCV_SHUTDOWN) 3191 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3192 if (shutdown == SHUTDOWN_MASK) 3193 mask |= EPOLLHUP; 3194 3195 /* readable? */ 3196 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3197 mask |= EPOLLIN | EPOLLRDNORM; 3198 if (sk_is_readable(sk)) 3199 mask |= EPOLLIN | EPOLLRDNORM; 3200 3201 /* Connection-based need to check for termination and startup */ 3202 if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE) 3203 mask |= EPOLLHUP; 3204 3205 /* No write status requested, avoid expensive OUT tests. 
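	 * (the tests below may take the unix state lock and inspect the
	 * peer's receive queue, so skip them when the caller only polled
	 * for readability)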
*/ 3206 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3207 return mask; 3208 3209 writable = unix_writable(sk, state); 3210 if (writable) { 3211 unix_state_lock(sk); 3212 3213 other = unix_peer(sk); 3214 if (other && unix_peer(other) != sk && 3215 unix_recvq_full_lockless(other) && 3216 unix_dgram_peer_wake_me(sk, other)) 3217 writable = 0; 3218 3219 unix_state_unlock(sk); 3220 } 3221 3222 if (writable) 3223 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3224 else 3225 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3226 3227 return mask; 3228 } 3229 3230 #ifdef CONFIG_PROC_FS 3231 3232 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3233 3234 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3235 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3236 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3237 3238 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3239 { 3240 unsigned long offset = get_offset(*pos); 3241 unsigned long bucket = get_bucket(*pos); 3242 unsigned long count = 0; 3243 struct sock *sk; 3244 3245 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3246 sk; sk = sk_next(sk)) { 3247 if (++count == offset) 3248 break; 3249 } 3250 3251 return sk; 3252 } 3253 3254 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3255 { 3256 unsigned long bucket = get_bucket(*pos); 3257 struct net *net = seq_file_net(seq); 3258 struct sock *sk; 3259 3260 while (bucket < UNIX_HASH_SIZE) { 3261 spin_lock(&net->unx.table.locks[bucket]); 3262 3263 sk = unix_from_bucket(seq, pos); 3264 if (sk) 3265 return sk; 3266 3267 spin_unlock(&net->unx.table.locks[bucket]); 3268 3269 *pos = set_bucket_offset(++bucket, 1); 3270 } 3271 3272 return NULL; 3273 } 3274 3275 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3276 loff_t *pos) 3277 { 3278 unsigned long bucket = get_bucket(*pos); 3279 3280 sk = sk_next(sk); 3281 if (sk) 3282 return sk; 3283 3284 3285 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3286 3287 *pos = set_bucket_offset(++bucket, 1); 3288 3289 return unix_get_first(seq, pos); 3290 } 3291 3292 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3293 { 3294 if (!*pos) 3295 return SEQ_START_TOKEN; 3296 3297 return unix_get_first(seq, pos); 3298 } 3299 3300 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3301 { 3302 ++*pos; 3303 3304 if (v == SEQ_START_TOKEN) 3305 return unix_get_first(seq, pos); 3306 3307 return unix_get_next(seq, v, pos); 3308 } 3309 3310 static void unix_seq_stop(struct seq_file *seq, void *v) 3311 { 3312 struct sock *sk = v; 3313 3314 if (sk) 3315 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3316 } 3317 3318 static int unix_seq_show(struct seq_file *seq, void *v) 3319 { 3320 3321 if (v == SEQ_START_TOKEN) 3322 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3323 "Inode Path\n"); 3324 else { 3325 struct sock *s = v; 3326 struct unix_sock *u = unix_sk(s); 3327 unix_state_lock(s); 3328 3329 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3330 s, 3331 refcount_read(&s->sk_refcnt), 3332 0, 3333 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3334 s->sk_type, 3335 s->sk_socket ? 3336 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3337 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3338 sock_i_ino(s)); 3339 3340 if (u->addr) { // under a hash table lock here 3341 int i, len; 3342 seq_putc(seq, ' '); 3343 3344 i = 0; 3345 len = u->addr->len - 3346 offsetof(struct sockaddr_un, sun_path); 3347 if (u->addr->name->sun_path[0]) { 3348 len--; 3349 } else { 3350 seq_putc(seq, '@'); 3351 i++; 3352 } 3353 for ( ; i < len; i++) 3354 seq_putc(seq, u->addr->name->sun_path[i] ?: 3355 '@'); 3356 } 3357 unix_state_unlock(s); 3358 seq_putc(seq, '\n'); 3359 } 3360 3361 return 0; 3362 } 3363 3364 static const struct seq_operations unix_seq_ops = { 3365 .start = unix_seq_start, 3366 .next = unix_seq_next, 3367 .stop = unix_seq_stop, 3368 .show = unix_seq_show, 3369 }; 3370 3371 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) 3372 struct bpf_unix_iter_state { 3373 struct seq_net_private p; 3374 unsigned int cur_sk; 3375 unsigned int end_sk; 3376 unsigned int max_sk; 3377 struct sock **batch; 3378 bool st_bucket_done; 3379 }; 3380 3381 struct bpf_iter__unix { 3382 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3383 __bpf_md_ptr(struct unix_sock *, unix_sk); 3384 uid_t uid __aligned(8); 3385 }; 3386 3387 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3388 struct unix_sock *unix_sk, uid_t uid) 3389 { 3390 struct bpf_iter__unix ctx; 3391 3392 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3393 ctx.meta = meta; 3394 ctx.unix_sk = unix_sk; 3395 ctx.uid = uid; 3396 return bpf_iter_run_prog(prog, &ctx); 3397 } 3398 3399 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3400 3401 { 3402 struct bpf_unix_iter_state *iter = seq->private; 3403 unsigned int expected = 1; 3404 struct sock *sk; 3405 3406 sock_hold(start_sk); 3407 iter->batch[iter->end_sk++] = start_sk; 3408 3409 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3410 if (iter->end_sk < iter->max_sk) { 3411 sock_hold(sk); 3412 iter->batch[iter->end_sk++] = sk; 3413 } 3414 3415 expected++; 3416 } 3417 3418 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3419 3420 return expected; 3421 } 3422 3423 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3424 { 3425 while (iter->cur_sk < iter->end_sk) 3426 sock_put(iter->batch[iter->cur_sk++]); 3427 } 3428 3429 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3430 unsigned int new_batch_sz) 3431 { 3432 struct sock **new_batch; 3433 3434 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3435 GFP_USER | __GFP_NOWARN); 3436 if (!new_batch) 3437 return -ENOMEM; 3438 3439 bpf_iter_unix_put_batch(iter); 3440 kvfree(iter->batch); 3441 iter->batch = new_batch; 3442 iter->max_sk = new_batch_sz; 3443 3444 return 0; 3445 } 3446 3447 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3448 loff_t *pos) 3449 { 3450 struct bpf_unix_iter_state *iter = seq->private; 3451 unsigned int expected; 3452 bool resized = false; 3453 struct sock *sk; 3454 3455 if (iter->st_bucket_done) 3456 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3457 3458 again: 3459 /* Get a new batch */ 3460 iter->cur_sk = 0; 3461 iter->end_sk = 0; 3462 3463 sk = unix_get_first(seq, pos); 3464 if (!sk) 3465 return NULL; /* Done */ 3466 3467 expected = bpf_iter_unix_hold_batch(seq, sk); 3468 3469 if (iter->end_sk == expected) { 3470 iter->st_bucket_done = true; 3471 return sk; 3472 } 3473 3474 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3475 resized = true; 3476 goto again; 3477 } 3478 3479 return sk; 3480 } 3481 3482 static void 
*bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos) 3483 { 3484 if (!*pos) 3485 return SEQ_START_TOKEN; 3486 3487 /* bpf iter does not support lseek, so it always 3488 * continue from where it was stop()-ped. 3489 */ 3490 return bpf_iter_unix_batch(seq, pos); 3491 } 3492 3493 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3494 { 3495 struct bpf_unix_iter_state *iter = seq->private; 3496 struct sock *sk; 3497 3498 /* Whenever seq_next() is called, the iter->cur_sk is 3499 * done with seq_show(), so advance to the next sk in 3500 * the batch. 3501 */ 3502 if (iter->cur_sk < iter->end_sk) 3503 sock_put(iter->batch[iter->cur_sk++]); 3504 3505 ++*pos; 3506 3507 if (iter->cur_sk < iter->end_sk) 3508 sk = iter->batch[iter->cur_sk]; 3509 else 3510 sk = bpf_iter_unix_batch(seq, pos); 3511 3512 return sk; 3513 } 3514 3515 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3516 { 3517 struct bpf_iter_meta meta; 3518 struct bpf_prog *prog; 3519 struct sock *sk = v; 3520 uid_t uid; 3521 bool slow; 3522 int ret; 3523 3524 if (v == SEQ_START_TOKEN) 3525 return 0; 3526 3527 slow = lock_sock_fast(sk); 3528 3529 if (unlikely(sk_unhashed(sk))) { 3530 ret = SEQ_SKIP; 3531 goto unlock; 3532 } 3533 3534 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3535 meta.seq = seq; 3536 prog = bpf_iter_get_info(&meta, false); 3537 ret = unix_prog_seq_show(prog, &meta, v, uid); 3538 unlock: 3539 unlock_sock_fast(sk, slow); 3540 return ret; 3541 } 3542 3543 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3544 { 3545 struct bpf_unix_iter_state *iter = seq->private; 3546 struct bpf_iter_meta meta; 3547 struct bpf_prog *prog; 3548 3549 if (!v) { 3550 meta.seq = seq; 3551 prog = bpf_iter_get_info(&meta, true); 3552 if (prog) 3553 (void)unix_prog_seq_show(prog, &meta, v, 0); 3554 } 3555 3556 if (iter->cur_sk < iter->end_sk) 3557 bpf_iter_unix_put_batch(iter); 3558 } 3559 3560 static const struct seq_operations bpf_iter_unix_seq_ops = { 3561 .start = bpf_iter_unix_seq_start, 3562 .next = bpf_iter_unix_seq_next, 3563 .stop = bpf_iter_unix_seq_stop, 3564 .show = bpf_iter_unix_seq_show, 3565 }; 3566 #endif 3567 #endif 3568 3569 static const struct net_proto_family unix_family_ops = { 3570 .family = PF_UNIX, 3571 .create = unix_create, 3572 .owner = THIS_MODULE, 3573 }; 3574 3575 3576 static int __net_init unix_net_init(struct net *net) 3577 { 3578 int i; 3579 3580 net->unx.sysctl_max_dgram_qlen = 10; 3581 if (unix_sysctl_register(net)) 3582 goto out; 3583 3584 #ifdef CONFIG_PROC_FS 3585 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3586 sizeof(struct seq_net_private))) 3587 goto err_sysctl; 3588 #endif 3589 3590 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3591 sizeof(spinlock_t), GFP_KERNEL); 3592 if (!net->unx.table.locks) 3593 goto err_proc; 3594 3595 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3596 sizeof(struct hlist_head), 3597 GFP_KERNEL); 3598 if (!net->unx.table.buckets) 3599 goto free_locks; 3600 3601 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3602 spin_lock_init(&net->unx.table.locks[i]); 3603 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3604 } 3605 3606 return 0; 3607 3608 free_locks: 3609 kvfree(net->unx.table.locks); 3610 err_proc: 3611 #ifdef CONFIG_PROC_FS 3612 remove_proc_entry("unix", net->proc_net); 3613 err_sysctl: 3614 #endif 3615 unix_sysctl_unregister(net); 3616 out: 3617 return -ENOMEM; 3618 } 3619 3620 static void __net_exit unix_net_exit(struct net *net) 3621 { 3622 
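	/* Tear down what unix_net_init() set up, roughly in reverse:
	 * hash table memory first, then the sysctl and procfs entries.
	 */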
kvfree(net->unx.table.buckets); 3623 kvfree(net->unx.table.locks); 3624 unix_sysctl_unregister(net); 3625 remove_proc_entry("unix", net->proc_net); 3626 } 3627 3628 static struct pernet_operations unix_net_ops = { 3629 .init = unix_net_init, 3630 .exit = unix_net_exit, 3631 }; 3632 3633 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3634 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3635 struct unix_sock *unix_sk, uid_t uid) 3636 3637 #define INIT_BATCH_SZ 16 3638 3639 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3640 { 3641 struct bpf_unix_iter_state *iter = priv_data; 3642 int err; 3643 3644 err = bpf_iter_init_seq_net(priv_data, aux); 3645 if (err) 3646 return err; 3647 3648 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3649 if (err) { 3650 bpf_iter_fini_seq_net(priv_data); 3651 return err; 3652 } 3653 3654 return 0; 3655 } 3656 3657 static void bpf_iter_fini_unix(void *priv_data) 3658 { 3659 struct bpf_unix_iter_state *iter = priv_data; 3660 3661 bpf_iter_fini_seq_net(priv_data); 3662 kvfree(iter->batch); 3663 } 3664 3665 static const struct bpf_iter_seq_info unix_seq_info = { 3666 .seq_ops = &bpf_iter_unix_seq_ops, 3667 .init_seq_private = bpf_iter_init_unix, 3668 .fini_seq_private = bpf_iter_fini_unix, 3669 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3670 }; 3671 3672 static const struct bpf_func_proto * 3673 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3674 const struct bpf_prog *prog) 3675 { 3676 switch (func_id) { 3677 case BPF_FUNC_setsockopt: 3678 return &bpf_sk_setsockopt_proto; 3679 case BPF_FUNC_getsockopt: 3680 return &bpf_sk_getsockopt_proto; 3681 default: 3682 return NULL; 3683 } 3684 } 3685 3686 static struct bpf_iter_reg unix_reg_info = { 3687 .target = "unix", 3688 .ctx_arg_info_size = 1, 3689 .ctx_arg_info = { 3690 { offsetof(struct bpf_iter__unix, unix_sk), 3691 PTR_TO_BTF_ID_OR_NULL }, 3692 }, 3693 .get_func_proto = bpf_iter_unix_get_func_proto, 3694 .seq_info = &unix_seq_info, 3695 }; 3696 3697 static void __init bpf_iter_register(void) 3698 { 3699 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3700 if (bpf_iter_reg_target(&unix_reg_info)) 3701 pr_warn("Warning: could not register bpf iterator unix\n"); 3702 } 3703 #endif 3704 3705 static int __init af_unix_init(void) 3706 { 3707 int i, rc = -1; 3708 3709 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3710 3711 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3712 spin_lock_init(&bsd_socket_locks[i]); 3713 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3714 } 3715 3716 rc = proto_register(&unix_dgram_proto, 1); 3717 if (rc != 0) { 3718 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3719 goto out; 3720 } 3721 3722 rc = proto_register(&unix_stream_proto, 1); 3723 if (rc != 0) { 3724 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3725 proto_unregister(&unix_dgram_proto); 3726 goto out; 3727 } 3728 3729 sock_register(&unix_family_ops); 3730 register_pernet_subsys(&unix_net_ops); 3731 unix_bpf_build_proto(); 3732 3733 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3734 bpf_iter_register(); 3735 #endif 3736 3737 out: 3738 return rc; 3739 } 3740 3741 static void __exit af_unix_exit(void) 3742 { 3743 sock_unregister(PF_UNIX); 3744 proto_unregister(&unix_dgram_proto); 3745 proto_unregister(&unix_stream_proto); 3746 unregister_pernet_subsys(&unix_net_ops); 3747 } 3748 3749 /* Earlier than 
device_initcall() so that other drivers invoking
   request_module() don't end up in a loop when modprobe tries
   to use a UNIX socket. But later than subsys_initcall() because
   we depend on stuff initialised there */

fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);