// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko Eißfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge amount
 *					of hashed socks (this is for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations
 *	     Alexey Kuznetsov	:	Full scale SMP. Lot of bugs are introduced 8)
 *	      Malcolm Beattie	:	Set peercred for socketpair
 *	     Michal Ostrowski	:	Module initialization cleanup.
 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+)
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
 */
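/* A minimal user-space sketch of the two name spaces above (illustrative
 * only, not kernel code): a filesystem binding is a NUL-terminated path,
 * while an abstract binding starts with a 0 byte and is delimited purely
 * by the address length passed to bind(2):
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *
 *	strcpy(a.sun_path, "/tmp/sock");		// FS object
 *	bind(fd, (struct sockaddr *)&a, sizeof(a));
 *
 *	a.sun_path[0] = 0;				// abstract name
 *	memcpy(a.sun_path + 1, "name", 4);
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 4);
 */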
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/splice.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>

#include "scm.h"

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}
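/* Why the swap() above makes unix_table_double_lock() deadlock-free, as
 * a sketch: both locks are always taken in ascending bucket order, so
 * two CPUs double-locking buckets (i, j) and (j, i) each acquire
 * min(i, j) first and the classic AB-BA inversion cannot occur.
 * Without the ordering, the bad interleaving would be:
 *
 *	CPU A: lock(i); lock(j);	CPU B: lock(j); lock(i);
 *
 * with A holding i and B holding j, each waiting on the other forever.
 */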
#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- should not be zero length.
 *		- if it starts with a non-zero byte, it must be NUL terminated (FS object)
 *		- if it starts with a zero byte, it is an abstract name.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist.  However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108.  Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
	 * cause a panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}
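/* A worked example of the normalization above (values illustrative):
 * for bind(fd, {AF_UNIX, "/tmp/x"}, 110), addr_len = 110 and offset = 2,
 * so a NUL is stored at __data[108] (one past the longest valid path)
 * and the function returns strlen("/tmp/x") + 2 + 1 = 9, i.e. the
 * address length trimmed to the path actually supplied, including its
 * terminating NUL.
 */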
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large".  This means there's a second writeability condition
 * poll and sendmsg need to test.  The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far.  This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue.  This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
 */
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED.  Otherwise, if we haven't queued any skbs
	 * to other and it's full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}
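/* How the pieces above fit together, sketched from the caller's side
 * (the real call sites are in unix_dgram_sendmsg() and unix_dgram_poll()
 * further down):
 *
 *	if (unix_peer(sk) != other ||
 *	    unix_dgram_peer_wake_me(sk, other)) {
 *		err = -EAGAIN;
 *		// receive queue full: the writer is now registered on
 *		// other's peer_wait and sleeps on its own wait queue;
 *		// when the reader dequeues a datagram and wakes peer_wait,
 *		// unix_dgram_peer_wake_relay() forwards the wake-up.
 *	}
 */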
static int unix_writable(const struct sock *sk, unsigned char state)
{
	return state != TCP_LISTEN &&
		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets that arrived from the previous peer.  First, this allows us
 * to do flow control based only on wmem_alloc; second, an sk connected to a
 * peer may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of a bidirectional dgram pipe is disconnected,
		 * we signal an error.  Messages are lost.  Do not do this
		 * when the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			WRITE_ONCE(other->sk_err, ECONNRESET);
			sk_error_report(other);
		}
	}
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	WRITE_ONCE(sk->sk_state, TCP_CLOSE);

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket.  Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 * What does the above comment talk about? --ANK(980817)
	 */

	if (READ_ONCE(unix_tot_inflight))
		unix_gc();		/* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
	spin_unlock(&sk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	if (sk < peersk) {
		spin_lock(&sk->sk_peer_lock);
		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&peersk->sk_peer_lock);
		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	}
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);

	spin_unlock(&sk->sk_peer_lock);
	spin_unlock(&peersk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!READ_ONCE(u->addr))
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	WRITE_ONCE(sk->sk_state, TCP_LISTEN);

	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

static int unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	if (mutex_lock_interruptible(&u->iolock))
		return -EINTR;

	WRITE_ONCE(sk->sk_peek_off, val);
	mutex_unlock(&u->iolock);

	return 0;
}

#ifdef CONFIG_PROC_FS
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary.  So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif
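/* The counter above is visible from user space via procfs; for
 * illustration (fd number and surrounding fields are hypothetical):
 *
 *	$ cat /proc/$PID/fdinfo/4
 *	...
 *	scm_fds: 2
 *
 * i.e. two file descriptors are currently in flight (sent via
 * SCM_RIGHTS) on that socket's receive path.
 */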
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}
static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}

struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};

struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.unhash			= unix_unhash,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /* dgram and seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash		= unix_unbound_hash(sk);
	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
	sk->sk_destruct		= unix_sock_destructor;
	u = unix_sk(sk);
	u->inflight = 0;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
	/*
	 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
	 *	nothing uses it.
	 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		fallthrough;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sk = unix_create1(net, sock, kern, sock->type);
	if (IS_ERR(sk))
		return PTR_ERR(sk);

	return 0;
}
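/* For illustration: seen from user space, the SOCK_RAW fallthrough in
 * unix_create() above means that (error handling omitted)
 *
 *	int fd = socket(AF_UNIX, SOCK_RAW, 0);
 *
 * succeeds and yields a socket that behaves exactly like one created
 * with SOCK_DGRAM, matching the BSD quirk the comment mentions.
 */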
static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}

static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct dentry *dentry;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	dentry = unix_sk(sk)->path.dentry;
	if (dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunaddr,
				    int addr_len, int type)
{
	struct sock *sk;

	if (sunaddr->sun_path[0])
		sk = unix_find_bsd(sunaddr, addr_len, type);
	else
		sk = unix_find_abstract(net, sunaddr, addr_len, type);

	return sk;
}

static int unix_autobind(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	old_hash = sk->sk_hash;
	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seem to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
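/* The autobind loop above assigns an abstract name consisting of a
 * leading 0 byte followed by five lowercase hex digits (hence
 * addr->len = sun_path offset + 6), cycling through at most 2^20
 * candidates.  Such sockets show up in tools like ss(8) with an '@'
 * prefix, e.g. "@00f3a" (example value; the real one is seeded by
 * get_random_u32()).
 */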
static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	old_hash = sk->sk_hash;
	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}
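/* Note the -EEXIST -> -EADDRINUSE mapping above: binding to a path whose
 * filesystem object already exists fails with EADDRINUSE even when no
 * live socket is bound there anymore, because the inode is not removed
 * on close.  User space conventionally unlink(2)s a stale path first,
 * e.g. unlink("/tmp/sock") followed by bind(fd, ...) (illustrative
 * pattern, not mandated by this code).
 */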
static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	struct unix_sock *u = unix_sk(sk);
	unsigned int new_hash, old_hash;
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	old_hash = sk->sk_hash;
	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	int err;

	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
	    sunaddr->sun_family == AF_UNIX)
		return unix_autobind(sk);

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		return err;

	if (sunaddr->sun_path[0])
		err = unix_bind_bsd(sk, sunaddr, addr_len);
	else
		err = unix_bind_abstract(sk, sunaddr, addr_len);

	return err;
}

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 > sk2)
		swap(sk1, sk2);

	unix_state_lock(sk1);
	unix_state_lock_nested(sk2, U_LOCK_SECOND);
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !READ_ONCE(unix_sk(sk)->addr)) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer) {
			unix_dgram_disconnected(sk, old_peer);

			unix_state_lock(old_peer);
			if (!unix_peer(old_peer))
				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
			unix_state_unlock(old_peer);
		}

		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
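/* A user-space sketch of the AF_UNSPEC branch above (illustrative only):
 * connecting a datagram socket to sa_family = AF_UNSPEC dissolves the
 * current association instead of creating one, per 1003.1g:
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *
 *	connect(fd, &sa, sizeof(sa));	// fd is no longer connected;
 *					// sendmsg() now needs an address
 */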
static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	unsigned char state;
	long timeo;
	int err;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	 * If we do it after the state is locked,
	 * we will have to recheck everything again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full_lockless(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* self connect and simultaneous connect are eliminated
	 * by rejecting TCP_LISTEN socket to avoid deadlock.
	 */
	state = READ_ONCE(sk->sk_state);
	if (unlikely(state != TCP_CLOSE)) {
		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk, U_LOCK_SECOND);

	if (unlikely(sk->sk_state != TCP_CLOSE)) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same guarantees
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	ska->sk_state = TCP_ESTABLISHED;
	skb->sk_state = TCP_ESTABLISHED;
	socka->state  = SS_CONNECTED;
	sockb->state  = SS_CONNECTED;
	return 0;
}

static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSPIDFD, &old->flags))
		set_bit(SOCK_PASSPIDFD, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}

static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}
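/* To summarize the connect/accept handshake implemented above, as a
 * sketch: unix_stream_connect() creates the "embryo" socket (newsk) on
 * the client side and queues a 1-byte skb owned by it on the listener's
 * receive queue; unix_accept() simply dequeues that skb and grafts
 * skb->sk onto the new struct socket.  There is no separate accept
 * queue: the listener's sk_receive_queue is the backlog, which is why
 * unix_recvq_full_lockless() doubles as the "backlog full" test for
 * connect(2).
 */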
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);
	}
	sock_put(sk);
out:
	return err;
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	/*
	 * Garbage collection of unix sockets starts by selecting a set of
	 * candidate sockets which have reference only from being in flight
	 * (total_refs == inflight_refs).  This condition is checked once during
	 * the candidate collection phase, and candidates are marked as such, so
	 * that non-candidates can later be ignored.  While inflight_refs is
	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
	 * is an instantaneous decision.
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operations that change the file count through file descriptors
	 * (dup, close, sendmsg) do not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeueing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd.  The following lock/unlock
	 * pair is to ensure serialization with garbage collection.  It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate.  If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
	 */
	spin_lock(&unix_gc_lock);
	spin_unlock(&unix_gc_lock);
}
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid  = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (unix_passcred_enabled(sock, other)) {
		UNIXCB(skb).pid  = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	return UNIXCB(skb).pid == scm->pid &&
	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}

static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count))
		atomic_add(fp->count, &u->scm_stat.nr_fds);
}

static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count))
		atomic_sub(fp->count, &u->scm_stat.nr_fds);
}

/*
 *	Send AF_UNIX data.
 */
static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
	    !READ_ONCE(u->addr)) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > READ_ONCE(sk->sk_sndbuf) - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other.  Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
			 */
			unix_state_unlock(sk);
			err = -EPIPE;
		} else if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}
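/*
 * Editor's note: the send_fds path above (unix_scm_to_skb() ->
 * unix_attach_fds()) is what SCM_RIGHTS rides on.  A minimal userspace
 * sketch, not part of this file; "sock" and "fd_to_pass" are assumed:
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	int send_fd(int sock, int fd_to_pass)
 *	{
 *		char dummy = 'x', cbuf[CMSG_SPACE(sizeof(int))];
 *		struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *		};
 *		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *		memset(cbuf, 0, sizeof(cbuf));
 *		cmsg->cmsg_level = SOL_SOCKET;
 *		cmsg->cmsg_type = SCM_RIGHTS;
 *		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *		memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
 *
 *		return sendmsg(sock, &msg, 0);
 *	}
 *
 * The receiver gets a fresh descriptor referring to the same open file.
 */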
/* We use paged skbs for stream sockets, limiting occupancy to 32768
 * bytes, with a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
		     struct scm_cookie *scm, bool fds_sent)
{
	struct unix_sock *ousk = unix_sk(other);
	struct sk_buff *skb;
	int err = 0;

	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;

	err = unix_scm_to_skb(scm, skb, !fds_sent);
	if (err < 0) {
		kfree_skb(skb);
		return err;
	}

	skb_put(skb, 1);
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    (other->sk_shutdown & RCV_SHUTDOWN)) {
		unix_state_unlock(other);
		kfree_skb(skb);
		return -EPIPE;
	}

	maybe_add_creds(skb, sock, other);
	skb_get(skb);

	scm_stat_add(other, skb);

	spin_lock(&other->sk_receive_queue.lock);
	if (ousk->oob_skb)
		consume_skb(ousk->oob_skb);
	WRITE_ONCE(ousk->oob_skb, skb);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);

	sk_send_sigurg(other);
	unix_state_unlock(other);
	other->sk_data_ready(other);

	return err;
}
#endif
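/*
 * Editor's note: a userspace sketch of the out-of-band semantics queue_oob()
 * implements; assumes a connected SOCK_STREAM pair fd[0]/fd[1]:
 *
 *	char c;
 *
 *	send(fd[0], "ab", 2, 0);
 *	send(fd[0], "c", 1, MSG_OOB);	// the last byte sent becomes OOB
 *
 *	recv(fd[1], &c, 1, MSG_OOB);	// fetches 'c' ahead of "ab";
 *					// fails if SO_OOBINLINE is set
 *
 * Only one OOB byte is remembered: a newer one replaces an unread older one
 * (the consume_skb() of ousk->oob_skb above).
 */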
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	if (msg->msg_namelen) {
		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			skb = sock_alloc_send_pskb(sk, 0, 0,
						   msg->msg_flags & MSG_DONTWAIT,
						   &err, 0);
		} else {
			/* Keep two messages in the pipe so it schedules better */
			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);

			/* allow fallback to order-0 allocations */
			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
						   msg->msg_flags & MSG_DONTWAIT, &err,
						   get_order(UNIX_SKB_FRAGS_SZ));
		}
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
						   sk->sk_allocation);
			if (err < 0) {
				kfree_skb(skb);
				goto out_err;
			}
			size = err;
			refcount_add(size, &sk->sk_wmem_alloc);
		} else {
			skb_put(skb, size - data_len);
			skb->data_len = data_len;
			skb->len = size;
			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
			if (err) {
				kfree_skb(skb);
				goto out_err;
			}
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		scm_stat_add(other, skb);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		err = queue_oob(sock, msg, other, &scm, fds_sent);
		if (err)
			goto out_err;
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags & MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	return sent ? : err;
}
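/*
 * Editor's note: the pipe_err path above gives write()-like semantics; a
 * hedged userspace sketch ("fd" assumed connected, peer already gone):
 *
 *	#include <errno.h>
 *	#include <sys/socket.h>
 *
 *	ssize_t n = send(fd, buf, len, MSG_NOSIGNAL);
 *	if (n < 0 && errno == EPIPE)
 *		;	// peer can't receive; without MSG_NOSIGNAL (and with
 *			// SIGPIPE not ignored) we'd get the signal instead
 *
 * Note the "return sent ? : err" tail: if some bytes were already queued
 * before the failure, the partial count is returned and the error is only
 * seen on the next call.
 */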
static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
				  size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
				  size_t size, int flags)
{
	struct sock *sk = sock->sk;

	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(sock, msg, size, flags);
}

static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);

	if (addr) {
		msg->msg_namelen = addr->len;
		memcpy(msg->msg_name, addr->name, addr->len);
	}
}
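/*
 * Editor's note: as the two wrappers above show, SOCK_SEQPACKET reuses the
 * datagram paths over a connected socket, so record boundaries survive.  A
 * minimal userspace sketch (error handling omitted):
 *
 *	#include <sys/socket.h>
 *
 *	int sv[2];
 *	char buf[16];
 *
 *	socketpair(AF_UNIX, SOCK_SEQPACKET, 0, sv);
 *	send(sv[0], "one", 3, 0);
 *	send(sv[0], "two", 3, 0);
 *	recv(sv[1], buf, sizeof(buf), 0);	// returns 3 ("one"), never 6
 */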
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags & MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		 * - not return fds - good, but too simple 8)
		 * - return fds, and not return them on read (old strategy,
		 *   apparently wrong)
		 * - clone fds (I chose this for now, it is the most universal
		 *   solution)
		 *
		 * POSIX 1003.1g does not actually define this clearly
		 * at all.  POSIX 1003.1g doesn't define a lot of things
		 * clearly, however!
		 */

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
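/*
 * Editor's note: the MSG_TRUNC return value above enables the classic
 * "probe the datagram size" idiom.  Userspace sketch, "fd" assumed:
 *
 *	char tiny;
 *	ssize_t full = recv(fd, &tiny, 1, MSG_PEEK | MSG_TRUNC);
 *
 * full is the complete length of the queued datagram, and because of
 * MSG_PEEK it is still queued; a second recv() with a buffer of that
 * size fetches it without truncation.
 */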
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock *sk = sock->sk;

#ifdef CONFIG_BPF_SYSCALL
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_dgram_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return __unix_dgram_recvmsg(sk, msg, size, flags);
}

static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	mutex_lock(&u->iolock);
	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
	mutex_unlock(&u->iolock);
	if (!skb)
		return err;

	return recv_actor(sk, skb);
}

/*
 *	Sleep until more data has arrived.  But check for races...
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, state);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}

static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}

struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;
	struct pipe_inode_info *pipe;
	size_t size;
	int flags;
	unsigned int splice_flags;
};

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int chunk = 1;
	struct sk_buff *oob_skb;

	mutex_lock(&u->iolock);
	unix_state_lock(sk);
	spin_lock(&sk->sk_receive_queue.lock);

	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
		spin_unlock(&sk->sk_receive_queue.lock);
		unix_state_unlock(sk);
		mutex_unlock(&u->iolock);
		return -EINVAL;
	}

	oob_skb = u->oob_skb;

	if (!(state->flags & MSG_PEEK))
		WRITE_ONCE(u->oob_skb, NULL);
	else
		skb_get(oob_skb);

	spin_unlock(&sk->sk_receive_queue.lock);
	unix_state_unlock(sk);

	chunk = state->recv_actor(oob_skb, 0, chunk, state);

	if (!(state->flags & MSG_PEEK))
		UNIXCB(oob_skb).consumed += 1;

	consume_skb(oob_skb);

	mutex_unlock(&u->iolock);

	if (chunk < 0)
		return -EFAULT;

	state->msg->msg_flags |= MSG_OOB;
	return 1;
}

static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct unix_sock *u = unix_sk(sk);

	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
		skb_unlink(skb, &sk->sk_receive_queue);
		consume_skb(skb);
		skb = NULL;
	} else {
		struct sk_buff *unlinked_skb = NULL;

		spin_lock(&sk->sk_receive_queue.lock);

		if (skb == u->oob_skb) {
			if (copied) {
				skb = NULL;
			} else if (!(flags & MSG_PEEK)) {
				if (sock_flag(sk, SOCK_URGINLINE)) {
					WRITE_ONCE(u->oob_skb, NULL);
					consume_skb(skb);
				} else {
					__skb_unlink(skb, &sk->sk_receive_queue);
					WRITE_ONCE(u->oob_skb, NULL);
					unlinked_skb = skb;
					skb = skb_peek(&sk->sk_receive_queue);
				}
			} else if (!sock_flag(sk, SOCK_URGINLINE)) {
				skb = skb_peek_next(skb, &sk->sk_receive_queue);
			}
		}

		spin_unlock(&sk->sk_receive_queue.lock);

		if (unlinked_skb) {
			WARN_ON_ONCE(skb_unref(unlinked_skb));
			kfree_skb(unlinked_skb);
		}
	}
	return skb;
}
#endif

static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
		return -ENOTCONN;

	mutex_lock(&u->iolock);
	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
	mutex_unlock(&u->iolock);
	if (!skb)
		return err;

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
		bool drop = false;

		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD)) {
			unix_state_unlock(sk);
			kfree_skb(skb);
			return -ECONNRESET;
		}

		spin_lock(&sk->sk_receive_queue.lock);
		if (likely(skb == u->oob_skb)) {
			WRITE_ONCE(u->oob_skb, NULL);
			drop = true;
		}
		spin_unlock(&sk->sk_receive_queue.lock);

		unix_state_unlock(sk);

		if (drop) {
			WARN_ON_ONCE(skb_unref(skb));
			kfree_skb(skb);
			return -EAGAIN;
		}
	}
#endif

	return recv_actor(sk, skb);
}
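/*
 * Editor's note: the sk_peek_offset()/sk_peek_offset_{fwd,bwd}() calls in the
 * receive paths back the SO_PEEK_OFF socket option.  Userspace sketch, "fd"
 * assumed connected with at least 8 bytes queued:
 *
 *	char a[4], b[4];
 *	int off = 0;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, a, 4, MSG_PEEK);	// peeks bytes 0..3, offset -> 4
 *	recv(fd, b, 4, MSG_PEEK);	// peeks bytes 4..7, not 0..3 again
 *
 * A normal (non-PEEK) read rewinds the offset by the amount it consumes,
 * which is the sk_peek_offset_bwd() above.
 */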
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent the queue from being reordered
	 * while we sleep in memcpy_to_msg().
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

again:
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb && copied) {
				unix_state_unlock(sk);
				break;
			}
		}
#endif
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* The skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue.
			 *
			 * Let's report a short read.
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv_unix(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
static int unix_stream_read_actor(struct sk_buff *skb,
				  int skip, int chunk,
				  struct unix_stream_read_state *state)
{
	int ret;

	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
				    state->msg, chunk);
	return ret ?: chunk;
}

int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
			  size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sk->sk_socket,
		.msg = msg,
		.size = size,
		.flags = flags
	};

	return unix_stream_read_generic(&state, true);
}

static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_stream_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return unix_stream_read_generic(&state, true);
}

static int unix_stream_splice_actor(struct sk_buff *skb,
				    int skip, int chunk,
				    struct unix_stream_read_state *state)
{
	return skb_splice_bits(skb, state->socket->sk,
			       UNIXCB(skb).consumed + skip,
			       state->pipe, chunk, state->splice_flags);
}

static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
				       struct pipe_inode_info *pipe,
				       size_t size, unsigned int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_splice_actor,
		.socket = sock,
		.pipe = pipe,
		.size = size,
		.splice_flags = flags,
	};

	if (unlikely(*ppos))
		return -ESPIPE;

	if (sock->file->f_flags & O_NONBLOCK ||
	    flags & SPLICE_F_NONBLOCK)
		state.flags = MSG_DONTWAIT;

	return unix_stream_read_generic(&state, false);
}

static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
	    (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
		int peer_mode = 0;
		const struct proto *prot = READ_ONCE(other->sk_prot);

		if (prot->unhash)
			prot->unhash(other);
		if (mode & RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode & SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
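/*
 * Editor's note: the mode mapping above gives the usual half-close
 * semantics.  Userspace sketch (connected stream socket "fd" assumed):
 *
 *	shutdown(fd, SHUT_WR);	// we stop sending; the peer's read()
 *				// returns 0 once its queue drains
 *
 * Our end may still read() whatever the peer keeps sending; only
 * SHUT_RDWR (or close()) takes down both directions.
 */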
long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);

long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);

static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	if (!path.dentry)
		return -ENOENT;

	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	path_put(&path);

	return fd;
}

static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	case SIOCATMARK:
		{
			struct sk_buff *skb;
			int answ = 0;

			skb = skb_peek(&sk->sk_receive_queue);
			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
				answ = 1;
			err = put_user(answ, (int __user *)arg);
		}
		break;
#endif
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}

#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
#endif
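/*
 * Editor's note: userspace sketch of the queue-length ioctls handled above
 * ("fd" assumed; <sys/ioctl.h> and <linux/sockios.h> for the constants):
 *
 *	int unread, unsent;
 *
 *	ioctl(fd, SIOCINQ, &unread);	// bytes queued but not yet read
 *	ioctl(fd, SIOCOUTQ, &unsent);	// bytes we sent, not yet consumed
 *
 * For stream and seqpacket sockets SIOCINQ sums unix_skb_len() over the
 * whole queue; for datagram sockets it reports only the first datagram.
 */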
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err))
		mask |= EPOLLERR;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based sockets need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * We also set the socket writable when the other side has shut
	 * down the connection.  This prevents stuck sockets.
	 */
	if (unix_writable(sk, state))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}

static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based sockets need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT)))
		return mask;

	writable = unix_writable(sk, state);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full_lockless(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
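/*
 * Editor's note: the EPOLLPRI bit set above for a pending oob_skb is what an
 * epoll user sees.  A hedged sketch ("fd" assumed, error handling omitted):
 *
 *	#include <sys/epoll.h>
 *
 *	struct epoll_event ev = { .events = EPOLLIN | EPOLLPRI }, out;
 *	int ep = epoll_create1(0);
 *
 *	ev.data.fd = fd;
 *	epoll_ctl(ep, EPOLL_CTL_ADD, fd, &ev);
 *	if (epoll_wait(ep, &out, 1, -1) == 1 && (out.events & EPOLLPRI))
 *		;	// an OOB byte is pending, fetch with recv(..., MSG_OOB)
 */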
#ifdef CONFIG_PROC_FS

#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))

static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	unsigned long count = 0;
	struct sock *sk;

	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
	     sk; sk = sk_next(sk)) {
		if (++count == offset)
			break;
	}

	return sk;
}

static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);
	struct net *net = seq_file_net(seq);
	struct sock *sk;

	while (bucket < UNIX_HASH_SIZE) {
		spin_lock(&net->unx.table.locks[bucket]);

		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

		spin_unlock(&net->unx.table.locks[bucket]);

		*pos = set_bucket_offset(++bucket, 1);
	}

	return NULL;
}

static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
				  loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);

	sk = sk_next(sk);
	if (sk)
		return sk;

	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);

	*pos = set_bucket_offset(++bucket, 1);

	return unix_get_first(seq, pos);
}

static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;

	return unix_get_first(seq, pos);
}

static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;

	if (v == SEQ_START_TOKEN)
		return unix_get_first(seq, pos);

	return unix_get_next(seq, v, pos);
}

static void unix_seq_stop(struct seq_file *seq, void *v)
{
	struct sock *sk = v;

	if (sk)
		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
}
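/*
 * Editor's note: with the format string used in unix_seq_show() below, a
 * listening pathname socket comes out of /proc/net/unix roughly like this
 * (all values hypothetical):
 *
 *	Num       RefCount Protocol Flags    Type St Inode Path
 *	00000000abcd1234: 00000002 00000000 00010000 0001 01 23456 /run/my.sock
 *
 * Flags 00010000 is __SO_ACCEPTCON (listening), Type 0001 is SOCK_STREAM,
 * and abstract names are printed with '@' in place of the leading NUL.
 */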
static int unix_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);

		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	/* under a hash table lock here */
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len -
				offsetof(struct sockaddr_un, sun_path);
			if (u->addr->name->sun_path[0]) {
				len--;
			} else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}

static const struct seq_operations unix_seq_ops = {
	.start = unix_seq_start,
	.next  = unix_seq_next,
	.stop  = unix_seq_stop,
	.show  = unix_seq_show,
};

#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
struct bpf_unix_iter_state {
	struct seq_net_private p;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	struct sock **batch;
	bool st_bucket_done;
};

struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	uid_t uid __aligned(8);
};

static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			      struct unix_sock *unix_sk, uid_t uid)
{
	struct bpf_iter__unix ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.unix_sk = unix_sk;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}

static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
		if (iter->end_sk < iter->max_sk) {
			sock_hold(sk);
			iter->batch[iter->end_sk++] = sk;
		}

		expected++;
	}

	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);

	return expected;
}

static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}

static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
				       unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_unix_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}

static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
					loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	if (iter->st_bucket_done)
		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;

	sk = unix_get_first(seq, pos);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_unix_hold_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;

	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
	 */
	return bpf_iter_unix_batch(seq, pos);
}

static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);

	++*pos;

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_unix_batch(seq, pos);

	return sk;
}

static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	bool slow;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = unix_prog_seq_show(prog, &meta, v, uid);
unlock:
	unlock_sock_fast(sk, slow);
	return ret;
}

static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)unix_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_unix_put_batch(iter);
}

static const struct seq_operations bpf_iter_unix_seq_ops = {
	.start = bpf_iter_unix_seq_start,
	.next  = bpf_iter_unix_seq_next,
	.stop  = bpf_iter_unix_seq_stop,
	.show  = bpf_iter_unix_seq_show,
};
#endif
#endif

static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};

static int __net_init unix_net_init(struct net *net)
{
	int i;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			     sizeof(struct seq_net_private)))
		goto err_sysctl;
#endif

	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
					      sizeof(spinlock_t), GFP_KERNEL);
	if (!net->unx.table.locks)
		goto err_proc;

	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
						sizeof(struct hlist_head),
						GFP_KERNEL);
	if (!net->unx.table.buckets)
		goto free_locks;

	for (i = 0; i < UNIX_HASH_SIZE; i++) {
		spin_lock_init(&net->unx.table.locks[i]);
		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
	}

	return 0;

free_locks:
	kvfree(net->unx.table.locks);
err_proc:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("unix", net->proc_net);
err_sysctl:
#endif
	unix_sysctl_unregister(net);
out:
	return -ENOMEM;
}
static void __net_exit unix_net_exit(struct net *net)
{
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}

static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};

#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

#define INIT_BATCH_SZ 16

static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_unix_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
	if (err) {
		bpf_iter_fini_seq_net(priv_data);
		return err;
	}

	return 0;
}

static void bpf_iter_fini_unix(void *priv_data)
{
	struct bpf_unix_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}

static const struct bpf_iter_seq_info unix_seq_info = {
	.seq_ops		= &bpf_iter_unix_seq_ops,
	.init_seq_private	= bpf_iter_init_unix,
	.fini_seq_private	= bpf_iter_fini_unix,
	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
};

static const struct bpf_func_proto *
bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
			     const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}

static struct bpf_iter_reg unix_reg_info = {
	.target			= "unix",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__unix, unix_sk),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto		= bpf_iter_unix_get_func_proto,
	.seq_info		= &unix_seq_info,
};

static void __init bpf_iter_register(void)
{
	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
	if (bpf_iter_reg_target(&unix_reg_info))
		pr_warn("Warning: could not register bpf iterator unix\n");
}
#endif
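/*
 * Editor's note: a hedged sketch of what consumes the iterator registered
 * above (a libbpf-style program; names are illustrative, not from this file):
 *
 *	#include "vmlinux.h"
 *	#include <bpf/bpf_helpers.h>
 *
 *	char _license[] SEC("license") = "GPL";
 *
 *	SEC("iter/unix")
 *	int dump_unix(struct bpf_iter__unix *ctx)
 *	{
 *		struct unix_sock *u = ctx->unix_sk;
 *
 *		if (!u)		// NULL on the final stop() invocation
 *			return 0;
 *
 *		bpf_seq_write(ctx->meta->seq, &ctx->uid, sizeof(ctx->uid));
 *		return 0;
 *	}
 *
 * Pinning the loaded iterator and read()-ing it walks every unix socket,
 * with unix_sk marked PTR_TO_BTF_ID_OR_NULL exactly as declared above.
 */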
static int __init af_unix_init(void)
{
	int i, rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));

	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
		spin_lock_init(&bsd_socket_locks[i]);
		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
	}

	rc = proto_register(&unix_dgram_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	rc = proto_register(&unix_stream_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		proto_unregister(&unix_dgram_proto);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
	unix_bpf_build_proto();

#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif

out:
	return rc;
}

static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_dgram_proto);
	proto_unregister(&unix_stream_proto);
	unregister_pernet_subsys(&unix_net_ops);
}

/* Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket.  But later than subsys_initcall() because
 * we depend on stuff initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);