// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing.
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector.
 *		Heiko EiBfeldt	:	Missing verify_area check.
 *		Alan Cox	:	Started POSIXisms.
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting.
 *		Kirk Petersen	:	Made this a module.
 *		Christoph Rohland:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *		Alexey Kuznetsov:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *		Andrea Arcangeli:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge number
 *					of hashed sockets (for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations.
 *		Alexey Kuznetsov:	Full scale SMP. Lots of bugs are introduced 8)
 *		Malcolm Beattie	:	Set peercred for socketpair.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+).
 *
 * Known differences from the reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as the high water
 *		mark and a fake inode identifier (nor the BSD first-socket
 *		fstat-twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername -
 *		BSD bug??).
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS-based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  that start with a 0 byte, so that this name space does not
 *		  intersect with BSD names.
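 *
 *		  As a minimal user-space sketch (illustrative only; the name
 *		  "example" is chosen arbitrarily), binding to such an
 *		  abstract name looks like:
 *
 *			struct sockaddr_un a = { .sun_family = AF_UNIX };
 *			int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *			a.sun_path[0] = '\0';		   leading 0 byte => abstract
 *			memcpy(a.sun_path + 1, "example", 7);
 *			bind(fd, (struct sockaddr *)&a,
 *			     offsetof(struct sockaddr_un, sun_path) + 1 + 7);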
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/splice.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>

#include "scm.h"

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 * Check a unix socket name:
 *	- it must not be zero length.
 *	- if it does not start with a zero byte, it must be NUL-terminated
 *	  (a filesystem object).
 *	- if it starts with a zero byte, it is an abstract name.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off-by-one error but it is a bit more
	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist.  However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer to a struct sockaddr_storage, which has a bigger
	 * buffer than 108.  Also, we must terminate sun_path for strlen()
	 * in getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
	 * cause a panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
	 * know the actual buffer.
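	 *
	 * For example, with sun_path "/tmp/foo" (8 bytes), passed either
	 * with or without its trailing NUL, the store above writes a NUL
	 * just past the copied bytes and the function returns
	 * offsetof(struct sockaddr_un, sun_path) + 8 + 1.  For a maximal
	 * 108-byte path the terminator lands at __data[108], i.e. in the
	 * sockaddr_storage bytes beyond sun_path, as explained above.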
	 */
	return strlen(addr->__data) + offset + 1;
}

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &net->unx.table.buckets[hash]) {
		struct unix_sock *u = unix_sk(s);

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, unsigned int hash)
{
	struct sock *s;

	spin_lock(&net->unx.table.locks[hash]);
	s = __unix_find_socket_byname(net, sunname, len, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far.
 * This can't be accomplished via poll_wait because the lifetime of the
 * server socket might be less than that of its clients if these break
 * their association with it or if the server socket is closed while
 * clients are still connected to it and there's no way to inform "a
 * polling implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition, and is broken when
 * the association to the server socket is dissolved or after a wake up
 * was relayed.
 */

static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and it's full, we will hang waiting for POLLOUT.
518 */ 519 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) 520 return 1; 521 522 if (connected) 523 unix_dgram_peer_wake_disconnect(sk, other); 524 525 return 0; 526 } 527 528 static int unix_writable(const struct sock *sk, unsigned char state) 529 { 530 return state != TCP_LISTEN && 531 (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf); 532 } 533 534 static void unix_write_space(struct sock *sk) 535 { 536 struct socket_wq *wq; 537 538 rcu_read_lock(); 539 if (unix_writable(sk, READ_ONCE(sk->sk_state))) { 540 wq = rcu_dereference(sk->sk_wq); 541 if (skwq_has_sleeper(wq)) 542 wake_up_interruptible_sync_poll(&wq->wait, 543 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); 544 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 545 } 546 rcu_read_unlock(); 547 } 548 549 /* When dgram socket disconnects (or changes its peer), we clear its receive 550 * queue of packets arrived from previous peer. First, it allows to do 551 * flow control based only on wmem_alloc; second, sk connected to peer 552 * may receive messages only from that peer. */ 553 static void unix_dgram_disconnected(struct sock *sk, struct sock *other) 554 { 555 if (!skb_queue_empty(&sk->sk_receive_queue)) { 556 skb_queue_purge(&sk->sk_receive_queue); 557 wake_up_interruptible_all(&unix_sk(sk)->peer_wait); 558 559 /* If one link of bidirectional dgram pipe is disconnected, 560 * we signal error. Messages are lost. Do not make this, 561 * when peer was not connected to us. 562 */ 563 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { 564 WRITE_ONCE(other->sk_err, ECONNRESET); 565 sk_error_report(other); 566 } 567 } 568 } 569 570 static void unix_sock_destructor(struct sock *sk) 571 { 572 struct unix_sock *u = unix_sk(sk); 573 574 skb_queue_purge(&sk->sk_receive_queue); 575 576 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); 577 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 578 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); 579 if (!sock_flag(sk, SOCK_DEAD)) { 580 pr_info("Attempt to release alive unix socket: %p\n", sk); 581 return; 582 } 583 584 if (u->addr) 585 unix_release_addr(u->addr); 586 587 atomic_long_dec(&unix_nr_socks); 588 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 589 #ifdef UNIX_REFCNT_DEBUG 590 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk, 591 atomic_long_read(&unix_nr_socks)); 592 #endif 593 } 594 595 static void unix_release_sock(struct sock *sk, int embrion) 596 { 597 struct unix_sock *u = unix_sk(sk); 598 struct sock *skpair; 599 struct sk_buff *skb; 600 struct path path; 601 int state; 602 603 unix_remove_socket(sock_net(sk), sk); 604 unix_remove_bsd_socket(sk); 605 606 /* Clear state */ 607 unix_state_lock(sk); 608 sock_orphan(sk); 609 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); 610 path = u->path; 611 u->path.dentry = NULL; 612 u->path.mnt = NULL; 613 state = sk->sk_state; 614 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 615 616 skpair = unix_peer(sk); 617 unix_peer(sk) = NULL; 618 619 unix_state_unlock(sk); 620 621 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 622 if (u->oob_skb) { 623 kfree_skb(u->oob_skb); 624 u->oob_skb = NULL; 625 } 626 #endif 627 628 wake_up_interruptible_all(&u->peer_wait); 629 630 if (skpair != NULL) { 631 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { 632 unix_state_lock(skpair); 633 /* No more writes */ 634 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); 635 if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion) 636 WRITE_ONCE(skpair->sk_err, ECONNRESET); 637 unix_state_unlock(skpair); 638 
skpair->sk_state_change(skpair); 639 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); 640 } 641 642 unix_dgram_peer_wake_disconnect(sk, skpair); 643 sock_put(skpair); /* It may now die */ 644 } 645 646 /* Try to flush out this socket. Throw out buffers at least */ 647 648 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { 649 if (state == TCP_LISTEN) 650 unix_release_sock(skb->sk, 1); 651 /* passed fds are erased in the kfree_skb hook */ 652 UNIXCB(skb).consumed = skb->len; 653 kfree_skb(skb); 654 } 655 656 if (path.dentry) 657 path_put(&path); 658 659 sock_put(sk); 660 661 /* ---- Socket is dead now and most probably destroyed ---- */ 662 663 /* 664 * Fixme: BSD difference: In BSD all sockets connected to us get 665 * ECONNRESET and we die on the spot. In Linux we behave 666 * like files and pipes do and wait for the last 667 * dereference. 668 * 669 * Can't we simply set sock->err? 670 * 671 * What the above comment does talk about? --ANK(980817) 672 */ 673 674 if (READ_ONCE(unix_tot_inflight)) 675 unix_gc(); /* Garbage collect fds */ 676 } 677 678 static void init_peercred(struct sock *sk) 679 { 680 const struct cred *old_cred; 681 struct pid *old_pid; 682 683 spin_lock(&sk->sk_peer_lock); 684 old_pid = sk->sk_peer_pid; 685 old_cred = sk->sk_peer_cred; 686 sk->sk_peer_pid = get_pid(task_tgid(current)); 687 sk->sk_peer_cred = get_current_cred(); 688 spin_unlock(&sk->sk_peer_lock); 689 690 put_pid(old_pid); 691 put_cred(old_cred); 692 } 693 694 static void copy_peercred(struct sock *sk, struct sock *peersk) 695 { 696 const struct cred *old_cred; 697 struct pid *old_pid; 698 699 if (sk < peersk) { 700 spin_lock(&sk->sk_peer_lock); 701 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); 702 } else { 703 spin_lock(&peersk->sk_peer_lock); 704 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); 705 } 706 old_pid = sk->sk_peer_pid; 707 old_cred = sk->sk_peer_cred; 708 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); 709 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); 710 711 spin_unlock(&sk->sk_peer_lock); 712 spin_unlock(&peersk->sk_peer_lock); 713 714 put_pid(old_pid); 715 put_cred(old_cred); 716 } 717 718 static int unix_listen(struct socket *sock, int backlog) 719 { 720 int err; 721 struct sock *sk = sock->sk; 722 struct unix_sock *u = unix_sk(sk); 723 724 err = -EOPNOTSUPP; 725 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 726 goto out; /* Only stream/seqpacket sockets accept */ 727 err = -EINVAL; 728 if (!READ_ONCE(u->addr)) 729 goto out; /* No listens on an unbound socket */ 730 unix_state_lock(sk); 731 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) 732 goto out_unlock; 733 if (backlog > sk->sk_max_ack_backlog) 734 wake_up_interruptible_all(&u->peer_wait); 735 sk->sk_max_ack_backlog = backlog; 736 WRITE_ONCE(sk->sk_state, TCP_LISTEN); 737 738 /* set credentials so connect can copy them */ 739 init_peercred(sk); 740 err = 0; 741 742 out_unlock: 743 unix_state_unlock(sk); 744 out: 745 return err; 746 } 747 748 static int unix_release(struct socket *); 749 static int unix_bind(struct socket *, struct sockaddr *, int); 750 static int unix_stream_connect(struct socket *, struct sockaddr *, 751 int addr_len, int flags); 752 static int unix_socketpair(struct socket *, struct socket *); 753 static int unix_accept(struct socket *, struct socket *, int, bool); 754 static int unix_getname(struct socket *, struct sockaddr *, int); 755 static __poll_t unix_poll(struct file *, struct socket *, poll_table *); 756 static __poll_t 
unix_dgram_poll(struct file *, struct socket *, 757 poll_table *); 758 static int unix_ioctl(struct socket *, unsigned int, unsigned long); 759 #ifdef CONFIG_COMPAT 760 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); 761 #endif 762 static int unix_shutdown(struct socket *, int); 763 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); 764 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); 765 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, 766 struct pipe_inode_info *, size_t size, 767 unsigned int flags); 768 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); 769 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); 770 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 771 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 772 static int unix_dgram_connect(struct socket *, struct sockaddr *, 773 int, int); 774 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); 775 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, 776 int); 777 778 static int unix_set_peek_off(struct sock *sk, int val) 779 { 780 struct unix_sock *u = unix_sk(sk); 781 782 if (mutex_lock_interruptible(&u->iolock)) 783 return -EINTR; 784 785 WRITE_ONCE(sk->sk_peek_off, val); 786 mutex_unlock(&u->iolock); 787 788 return 0; 789 } 790 791 #ifdef CONFIG_PROC_FS 792 static int unix_count_nr_fds(struct sock *sk) 793 { 794 struct sk_buff *skb; 795 struct unix_sock *u; 796 int nr_fds = 0; 797 798 spin_lock(&sk->sk_receive_queue.lock); 799 skb = skb_peek(&sk->sk_receive_queue); 800 while (skb) { 801 u = unix_sk(skb->sk); 802 nr_fds += atomic_read(&u->scm_stat.nr_fds); 803 skb = skb_peek_next(skb, &sk->sk_receive_queue); 804 } 805 spin_unlock(&sk->sk_receive_queue.lock); 806 807 return nr_fds; 808 } 809 810 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) 811 { 812 struct sock *sk = sock->sk; 813 unsigned char s_state; 814 struct unix_sock *u; 815 int nr_fds = 0; 816 817 if (sk) { 818 s_state = READ_ONCE(sk->sk_state); 819 u = unix_sk(sk); 820 821 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their 822 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. 823 * SOCK_DGRAM is ordinary. So, no lock is needed. 
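		 *
		 * The count computed here is what appears as the
		 * "scm_fds: <n>" line in the socket's procfs fdinfo
		 * output: the number of SCM_RIGHTS file descriptors
		 * still sitting in receive queues, i.e. sent to this
		 * socket but not yet received.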
824 */ 825 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) 826 nr_fds = atomic_read(&u->scm_stat.nr_fds); 827 else if (s_state == TCP_LISTEN) 828 nr_fds = unix_count_nr_fds(sk); 829 830 seq_printf(m, "scm_fds: %u\n", nr_fds); 831 } 832 } 833 #else 834 #define unix_show_fdinfo NULL 835 #endif 836 837 static const struct proto_ops unix_stream_ops = { 838 .family = PF_UNIX, 839 .owner = THIS_MODULE, 840 .release = unix_release, 841 .bind = unix_bind, 842 .connect = unix_stream_connect, 843 .socketpair = unix_socketpair, 844 .accept = unix_accept, 845 .getname = unix_getname, 846 .poll = unix_poll, 847 .ioctl = unix_ioctl, 848 #ifdef CONFIG_COMPAT 849 .compat_ioctl = unix_compat_ioctl, 850 #endif 851 .listen = unix_listen, 852 .shutdown = unix_shutdown, 853 .sendmsg = unix_stream_sendmsg, 854 .recvmsg = unix_stream_recvmsg, 855 .read_skb = unix_stream_read_skb, 856 .mmap = sock_no_mmap, 857 .splice_read = unix_stream_splice_read, 858 .set_peek_off = unix_set_peek_off, 859 .show_fdinfo = unix_show_fdinfo, 860 }; 861 862 static const struct proto_ops unix_dgram_ops = { 863 .family = PF_UNIX, 864 .owner = THIS_MODULE, 865 .release = unix_release, 866 .bind = unix_bind, 867 .connect = unix_dgram_connect, 868 .socketpair = unix_socketpair, 869 .accept = sock_no_accept, 870 .getname = unix_getname, 871 .poll = unix_dgram_poll, 872 .ioctl = unix_ioctl, 873 #ifdef CONFIG_COMPAT 874 .compat_ioctl = unix_compat_ioctl, 875 #endif 876 .listen = sock_no_listen, 877 .shutdown = unix_shutdown, 878 .sendmsg = unix_dgram_sendmsg, 879 .read_skb = unix_read_skb, 880 .recvmsg = unix_dgram_recvmsg, 881 .mmap = sock_no_mmap, 882 .set_peek_off = unix_set_peek_off, 883 .show_fdinfo = unix_show_fdinfo, 884 }; 885 886 static const struct proto_ops unix_seqpacket_ops = { 887 .family = PF_UNIX, 888 .owner = THIS_MODULE, 889 .release = unix_release, 890 .bind = unix_bind, 891 .connect = unix_stream_connect, 892 .socketpair = unix_socketpair, 893 .accept = unix_accept, 894 .getname = unix_getname, 895 .poll = unix_dgram_poll, 896 .ioctl = unix_ioctl, 897 #ifdef CONFIG_COMPAT 898 .compat_ioctl = unix_compat_ioctl, 899 #endif 900 .listen = unix_listen, 901 .shutdown = unix_shutdown, 902 .sendmsg = unix_seqpacket_sendmsg, 903 .recvmsg = unix_seqpacket_recvmsg, 904 .mmap = sock_no_mmap, 905 .set_peek_off = unix_set_peek_off, 906 .show_fdinfo = unix_show_fdinfo, 907 }; 908 909 static void unix_close(struct sock *sk, long timeout) 910 { 911 /* Nothing to do here, unix socket does not need a ->close(). 912 * This is merely for sockmap. 913 */ 914 } 915 916 static void unix_unhash(struct sock *sk) 917 { 918 /* Nothing to do here, unix socket does not need a ->unhash(). 919 * This is merely for sockmap. 
920 */ 921 } 922 923 static bool unix_bpf_bypass_getsockopt(int level, int optname) 924 { 925 if (level == SOL_SOCKET) { 926 switch (optname) { 927 case SO_PEERPIDFD: 928 return true; 929 default: 930 return false; 931 } 932 } 933 934 return false; 935 } 936 937 struct proto unix_dgram_proto = { 938 .name = "UNIX", 939 .owner = THIS_MODULE, 940 .obj_size = sizeof(struct unix_sock), 941 .close = unix_close, 942 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 943 #ifdef CONFIG_BPF_SYSCALL 944 .psock_update_sk_prot = unix_dgram_bpf_update_proto, 945 #endif 946 }; 947 948 struct proto unix_stream_proto = { 949 .name = "UNIX-STREAM", 950 .owner = THIS_MODULE, 951 .obj_size = sizeof(struct unix_sock), 952 .close = unix_close, 953 .unhash = unix_unhash, 954 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 955 #ifdef CONFIG_BPF_SYSCALL 956 .psock_update_sk_prot = unix_stream_bpf_update_proto, 957 #endif 958 }; 959 960 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) 961 { 962 struct unix_sock *u; 963 struct sock *sk; 964 int err; 965 966 atomic_long_inc(&unix_nr_socks); 967 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { 968 err = -ENFILE; 969 goto err; 970 } 971 972 if (type == SOCK_STREAM) 973 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); 974 else /*dgram and seqpacket */ 975 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); 976 977 if (!sk) { 978 err = -ENOMEM; 979 goto err; 980 } 981 982 sock_init_data(sock, sk); 983 984 sk->sk_hash = unix_unbound_hash(sk); 985 sk->sk_allocation = GFP_KERNEL_ACCOUNT; 986 sk->sk_write_space = unix_write_space; 987 sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); 988 sk->sk_destruct = unix_sock_destructor; 989 u = unix_sk(sk); 990 u->inflight = 0; 991 u->path.dentry = NULL; 992 u->path.mnt = NULL; 993 spin_lock_init(&u->lock); 994 INIT_LIST_HEAD(&u->link); 995 mutex_init(&u->iolock); /* single task reading lock */ 996 mutex_init(&u->bindlock); /* single task binding lock */ 997 init_waitqueue_head(&u->peer_wait); 998 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); 999 memset(&u->scm_stat, 0, sizeof(struct scm_stat)); 1000 unix_insert_unbound_socket(net, sk); 1001 1002 sock_prot_inuse_add(net, sk->sk_prot, 1); 1003 1004 return sk; 1005 1006 err: 1007 atomic_long_dec(&unix_nr_socks); 1008 return ERR_PTR(err); 1009 } 1010 1011 static int unix_create(struct net *net, struct socket *sock, int protocol, 1012 int kern) 1013 { 1014 struct sock *sk; 1015 1016 if (protocol && protocol != PF_UNIX) 1017 return -EPROTONOSUPPORT; 1018 1019 sock->state = SS_UNCONNECTED; 1020 1021 switch (sock->type) { 1022 case SOCK_STREAM: 1023 sock->ops = &unix_stream_ops; 1024 break; 1025 /* 1026 * Believe it or not BSD has AF_UNIX, SOCK_RAW though 1027 * nothing uses it. 
1028 */ 1029 case SOCK_RAW: 1030 sock->type = SOCK_DGRAM; 1031 fallthrough; 1032 case SOCK_DGRAM: 1033 sock->ops = &unix_dgram_ops; 1034 break; 1035 case SOCK_SEQPACKET: 1036 sock->ops = &unix_seqpacket_ops; 1037 break; 1038 default: 1039 return -ESOCKTNOSUPPORT; 1040 } 1041 1042 sk = unix_create1(net, sock, kern, sock->type); 1043 if (IS_ERR(sk)) 1044 return PTR_ERR(sk); 1045 1046 return 0; 1047 } 1048 1049 static int unix_release(struct socket *sock) 1050 { 1051 struct sock *sk = sock->sk; 1052 1053 if (!sk) 1054 return 0; 1055 1056 sk->sk_prot->close(sk, 0); 1057 unix_release_sock(sk, 0); 1058 sock->sk = NULL; 1059 1060 return 0; 1061 } 1062 1063 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, 1064 int type) 1065 { 1066 struct inode *inode; 1067 struct path path; 1068 struct sock *sk; 1069 int err; 1070 1071 unix_mkname_bsd(sunaddr, addr_len); 1072 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); 1073 if (err) 1074 goto fail; 1075 1076 err = path_permission(&path, MAY_WRITE); 1077 if (err) 1078 goto path_put; 1079 1080 err = -ECONNREFUSED; 1081 inode = d_backing_inode(path.dentry); 1082 if (!S_ISSOCK(inode->i_mode)) 1083 goto path_put; 1084 1085 sk = unix_find_socket_byinode(inode); 1086 if (!sk) 1087 goto path_put; 1088 1089 err = -EPROTOTYPE; 1090 if (sk->sk_type == type) 1091 touch_atime(&path); 1092 else 1093 goto sock_put; 1094 1095 path_put(&path); 1096 1097 return sk; 1098 1099 sock_put: 1100 sock_put(sk); 1101 path_put: 1102 path_put(&path); 1103 fail: 1104 return ERR_PTR(err); 1105 } 1106 1107 static struct sock *unix_find_abstract(struct net *net, 1108 struct sockaddr_un *sunaddr, 1109 int addr_len, int type) 1110 { 1111 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); 1112 struct dentry *dentry; 1113 struct sock *sk; 1114 1115 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); 1116 if (!sk) 1117 return ERR_PTR(-ECONNREFUSED); 1118 1119 dentry = unix_sk(sk)->path.dentry; 1120 if (dentry) 1121 touch_atime(&unix_sk(sk)->path); 1122 1123 return sk; 1124 } 1125 1126 static struct sock *unix_find_other(struct net *net, 1127 struct sockaddr_un *sunaddr, 1128 int addr_len, int type) 1129 { 1130 struct sock *sk; 1131 1132 if (sunaddr->sun_path[0]) 1133 sk = unix_find_bsd(sunaddr, addr_len, type); 1134 else 1135 sk = unix_find_abstract(net, sunaddr, addr_len, type); 1136 1137 return sk; 1138 } 1139 1140 static int unix_autobind(struct sock *sk) 1141 { 1142 struct unix_sock *u = unix_sk(sk); 1143 unsigned int new_hash, old_hash; 1144 struct net *net = sock_net(sk); 1145 struct unix_address *addr; 1146 u32 lastnum, ordernum; 1147 int err; 1148 1149 err = mutex_lock_interruptible(&u->bindlock); 1150 if (err) 1151 return err; 1152 1153 if (u->addr) 1154 goto out; 1155 1156 err = -ENOMEM; 1157 addr = kzalloc(sizeof(*addr) + 1158 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); 1159 if (!addr) 1160 goto out; 1161 1162 addr->len = offsetof(struct sockaddr_un, sun_path) + 6; 1163 addr->name->sun_family = AF_UNIX; 1164 refcount_set(&addr->refcnt, 1); 1165 1166 old_hash = sk->sk_hash; 1167 ordernum = get_random_u32(); 1168 lastnum = ordernum & 0xFFFFF; 1169 retry: 1170 ordernum = (ordernum + 1) & 0xFFFFF; 1171 sprintf(addr->name->sun_path + 1, "%05x", ordernum); 1172 1173 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1174 unix_table_double_lock(net, old_hash, new_hash); 1175 1176 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { 1177 unix_table_double_unlock(net, old_hash, new_hash); 
1178 1179 /* __unix_find_socket_byname() may take long time if many names 1180 * are already in use. 1181 */ 1182 cond_resched(); 1183 1184 if (ordernum == lastnum) { 1185 /* Give up if all names seems to be in use. */ 1186 err = -ENOSPC; 1187 unix_release_addr(addr); 1188 goto out; 1189 } 1190 1191 goto retry; 1192 } 1193 1194 __unix_set_addr_hash(net, sk, addr, new_hash); 1195 unix_table_double_unlock(net, old_hash, new_hash); 1196 err = 0; 1197 1198 out: mutex_unlock(&u->bindlock); 1199 return err; 1200 } 1201 1202 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, 1203 int addr_len) 1204 { 1205 umode_t mode = S_IFSOCK | 1206 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); 1207 struct unix_sock *u = unix_sk(sk); 1208 unsigned int new_hash, old_hash; 1209 struct net *net = sock_net(sk); 1210 struct mnt_idmap *idmap; 1211 struct unix_address *addr; 1212 struct dentry *dentry; 1213 struct path parent; 1214 int err; 1215 1216 addr_len = unix_mkname_bsd(sunaddr, addr_len); 1217 addr = unix_create_addr(sunaddr, addr_len); 1218 if (!addr) 1219 return -ENOMEM; 1220 1221 /* 1222 * Get the parent directory, calculate the hash for last 1223 * component. 1224 */ 1225 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); 1226 if (IS_ERR(dentry)) { 1227 err = PTR_ERR(dentry); 1228 goto out; 1229 } 1230 1231 /* 1232 * All right, let's create it. 1233 */ 1234 idmap = mnt_idmap(parent.mnt); 1235 err = security_path_mknod(&parent, dentry, mode, 0); 1236 if (!err) 1237 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); 1238 if (err) 1239 goto out_path; 1240 err = mutex_lock_interruptible(&u->bindlock); 1241 if (err) 1242 goto out_unlink; 1243 if (u->addr) 1244 goto out_unlock; 1245 1246 old_hash = sk->sk_hash; 1247 new_hash = unix_bsd_hash(d_backing_inode(dentry)); 1248 unix_table_double_lock(net, old_hash, new_hash); 1249 u->path.mnt = mntget(parent.mnt); 1250 u->path.dentry = dget(dentry); 1251 __unix_set_addr_hash(net, sk, addr, new_hash); 1252 unix_table_double_unlock(net, old_hash, new_hash); 1253 unix_insert_bsd_socket(sk); 1254 mutex_unlock(&u->bindlock); 1255 done_path_create(&parent, dentry); 1256 return 0; 1257 1258 out_unlock: 1259 mutex_unlock(&u->bindlock); 1260 err = -EINVAL; 1261 out_unlink: 1262 /* failed after successful mknod? unlink what we'd created... */ 1263 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); 1264 out_path: 1265 done_path_create(&parent, dentry); 1266 out: 1267 unix_release_addr(addr); 1268 return err == -EEXIST ? 
-EADDRINUSE : err; 1269 } 1270 1271 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, 1272 int addr_len) 1273 { 1274 struct unix_sock *u = unix_sk(sk); 1275 unsigned int new_hash, old_hash; 1276 struct net *net = sock_net(sk); 1277 struct unix_address *addr; 1278 int err; 1279 1280 addr = unix_create_addr(sunaddr, addr_len); 1281 if (!addr) 1282 return -ENOMEM; 1283 1284 err = mutex_lock_interruptible(&u->bindlock); 1285 if (err) 1286 goto out; 1287 1288 if (u->addr) { 1289 err = -EINVAL; 1290 goto out_mutex; 1291 } 1292 1293 old_hash = sk->sk_hash; 1294 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1295 unix_table_double_lock(net, old_hash, new_hash); 1296 1297 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) 1298 goto out_spin; 1299 1300 __unix_set_addr_hash(net, sk, addr, new_hash); 1301 unix_table_double_unlock(net, old_hash, new_hash); 1302 mutex_unlock(&u->bindlock); 1303 return 0; 1304 1305 out_spin: 1306 unix_table_double_unlock(net, old_hash, new_hash); 1307 err = -EADDRINUSE; 1308 out_mutex: 1309 mutex_unlock(&u->bindlock); 1310 out: 1311 unix_release_addr(addr); 1312 return err; 1313 } 1314 1315 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 1316 { 1317 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1318 struct sock *sk = sock->sk; 1319 int err; 1320 1321 if (addr_len == offsetof(struct sockaddr_un, sun_path) && 1322 sunaddr->sun_family == AF_UNIX) 1323 return unix_autobind(sk); 1324 1325 err = unix_validate_addr(sunaddr, addr_len); 1326 if (err) 1327 return err; 1328 1329 if (sunaddr->sun_path[0]) 1330 err = unix_bind_bsd(sk, sunaddr, addr_len); 1331 else 1332 err = unix_bind_abstract(sk, sunaddr, addr_len); 1333 1334 return err; 1335 } 1336 1337 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) 1338 { 1339 if (unlikely(sk1 == sk2) || !sk2) { 1340 unix_state_lock(sk1); 1341 return; 1342 } 1343 if (sk1 > sk2) 1344 swap(sk1, sk2); 1345 1346 unix_state_lock(sk1); 1347 unix_state_lock_nested(sk2, U_LOCK_SECOND); 1348 } 1349 1350 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) 1351 { 1352 if (unlikely(sk1 == sk2) || !sk2) { 1353 unix_state_unlock(sk1); 1354 return; 1355 } 1356 unix_state_unlock(sk1); 1357 unix_state_unlock(sk2); 1358 } 1359 1360 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, 1361 int alen, int flags) 1362 { 1363 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; 1364 struct sock *sk = sock->sk; 1365 struct sock *other; 1366 int err; 1367 1368 err = -EINVAL; 1369 if (alen < offsetofend(struct sockaddr, sa_family)) 1370 goto out; 1371 1372 if (addr->sa_family != AF_UNSPEC) { 1373 err = unix_validate_addr(sunaddr, alen); 1374 if (err) 1375 goto out; 1376 1377 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1378 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1379 !READ_ONCE(unix_sk(sk)->addr)) { 1380 err = unix_autobind(sk); 1381 if (err) 1382 goto out; 1383 } 1384 1385 restart: 1386 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); 1387 if (IS_ERR(other)) { 1388 err = PTR_ERR(other); 1389 goto out; 1390 } 1391 1392 unix_state_double_lock(sk, other); 1393 1394 /* Apparently VFS overslept socket death. Retry. 
*/ 1395 if (sock_flag(other, SOCK_DEAD)) { 1396 unix_state_double_unlock(sk, other); 1397 sock_put(other); 1398 goto restart; 1399 } 1400 1401 err = -EPERM; 1402 if (!unix_may_send(sk, other)) 1403 goto out_unlock; 1404 1405 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 1406 if (err) 1407 goto out_unlock; 1408 1409 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); 1410 WRITE_ONCE(other->sk_state, TCP_ESTABLISHED); 1411 } else { 1412 /* 1413 * 1003.1g breaking connected state with AF_UNSPEC 1414 */ 1415 other = NULL; 1416 unix_state_double_lock(sk, other); 1417 } 1418 1419 /* 1420 * If it was connected, reconnect. 1421 */ 1422 if (unix_peer(sk)) { 1423 struct sock *old_peer = unix_peer(sk); 1424 1425 unix_peer(sk) = other; 1426 if (!other) 1427 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 1428 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); 1429 1430 unix_state_double_unlock(sk, other); 1431 1432 if (other != old_peer) { 1433 unix_dgram_disconnected(sk, old_peer); 1434 1435 unix_state_lock(old_peer); 1436 if (!unix_peer(old_peer)) 1437 WRITE_ONCE(old_peer->sk_state, TCP_CLOSE); 1438 unix_state_unlock(old_peer); 1439 } 1440 1441 sock_put(old_peer); 1442 } else { 1443 unix_peer(sk) = other; 1444 unix_state_double_unlock(sk, other); 1445 } 1446 1447 return 0; 1448 1449 out_unlock: 1450 unix_state_double_unlock(sk, other); 1451 sock_put(other); 1452 out: 1453 return err; 1454 } 1455 1456 static long unix_wait_for_peer(struct sock *other, long timeo) 1457 __releases(&unix_sk(other)->lock) 1458 { 1459 struct unix_sock *u = unix_sk(other); 1460 int sched; 1461 DEFINE_WAIT(wait); 1462 1463 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); 1464 1465 sched = !sock_flag(other, SOCK_DEAD) && 1466 !(other->sk_shutdown & RCV_SHUTDOWN) && 1467 unix_recvq_full_lockless(other); 1468 1469 unix_state_unlock(other); 1470 1471 if (sched) 1472 timeo = schedule_timeout(timeo); 1473 1474 finish_wait(&u->peer_wait, &wait); 1475 return timeo; 1476 } 1477 1478 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, 1479 int addr_len, int flags) 1480 { 1481 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1482 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; 1483 struct unix_sock *u = unix_sk(sk), *newu, *otheru; 1484 struct net *net = sock_net(sk); 1485 struct sk_buff *skb = NULL; 1486 long timeo; 1487 int err; 1488 1489 err = unix_validate_addr(sunaddr, addr_len); 1490 if (err) 1491 goto out; 1492 1493 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1494 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1495 !READ_ONCE(u->addr)) { 1496 err = unix_autobind(sk); 1497 if (err) 1498 goto out; 1499 } 1500 1501 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); 1502 1503 /* First of all allocate resources. 1504 If we will make it after state is locked, 1505 we will have to recheck all again in any case. 1506 */ 1507 1508 /* create new sock for complete connection */ 1509 newsk = unix_create1(net, NULL, 0, sock->type); 1510 if (IS_ERR(newsk)) { 1511 err = PTR_ERR(newsk); 1512 newsk = NULL; 1513 goto out; 1514 } 1515 1516 err = -ENOMEM; 1517 1518 /* Allocate skb for sending to listening sock */ 1519 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); 1520 if (skb == NULL) 1521 goto out; 1522 1523 restart: 1524 /* Find listening sock. 
*/ 1525 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); 1526 if (IS_ERR(other)) { 1527 err = PTR_ERR(other); 1528 other = NULL; 1529 goto out; 1530 } 1531 1532 /* Latch state of peer */ 1533 unix_state_lock(other); 1534 1535 /* Apparently VFS overslept socket death. Retry. */ 1536 if (sock_flag(other, SOCK_DEAD)) { 1537 unix_state_unlock(other); 1538 sock_put(other); 1539 goto restart; 1540 } 1541 1542 err = -ECONNREFUSED; 1543 if (other->sk_state != TCP_LISTEN) 1544 goto out_unlock; 1545 if (other->sk_shutdown & RCV_SHUTDOWN) 1546 goto out_unlock; 1547 1548 if (unix_recvq_full_lockless(other)) { 1549 err = -EAGAIN; 1550 if (!timeo) 1551 goto out_unlock; 1552 1553 timeo = unix_wait_for_peer(other, timeo); 1554 1555 err = sock_intr_errno(timeo); 1556 if (signal_pending(current)) 1557 goto out; 1558 sock_put(other); 1559 goto restart; 1560 } 1561 1562 /* Latch our state. 1563 1564 It is tricky place. We need to grab our state lock and cannot 1565 drop lock on peer. It is dangerous because deadlock is 1566 possible. Connect to self case and simultaneous 1567 attempt to connect are eliminated by checking socket 1568 state. other is TCP_LISTEN, if sk is TCP_LISTEN we 1569 check this before attempt to grab lock. 1570 1571 Well, and we have to recheck the state after socket locked. 1572 */ 1573 switch (READ_ONCE(sk->sk_state)) { 1574 case TCP_CLOSE: 1575 /* This is ok... continue with connect */ 1576 break; 1577 case TCP_ESTABLISHED: 1578 /* Socket is already connected */ 1579 err = -EISCONN; 1580 goto out_unlock; 1581 default: 1582 err = -EINVAL; 1583 goto out_unlock; 1584 } 1585 1586 unix_state_lock_nested(sk, U_LOCK_SECOND); 1587 1588 if (sk->sk_state != TCP_CLOSE) { 1589 unix_state_unlock(sk); 1590 unix_state_unlock(other); 1591 sock_put(other); 1592 goto restart; 1593 } 1594 1595 err = security_unix_stream_connect(sk, other, newsk); 1596 if (err) { 1597 unix_state_unlock(sk); 1598 goto out_unlock; 1599 } 1600 1601 /* The way is open! Fastly set all the necessary fields... */ 1602 1603 sock_hold(sk); 1604 unix_peer(newsk) = sk; 1605 newsk->sk_state = TCP_ESTABLISHED; 1606 newsk->sk_type = sk->sk_type; 1607 init_peercred(newsk); 1608 newu = unix_sk(newsk); 1609 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); 1610 otheru = unix_sk(other); 1611 1612 /* copy address information from listening to new sock 1613 * 1614 * The contents of *(otheru->addr) and otheru->path 1615 * are seen fully set up here, since we have found 1616 * otheru in hash under its lock. Insertion into the 1617 * hash chain we'd found it in had been done in an 1618 * earlier critical area protected by the chain's lock, 1619 * the same one where we'd set *(otheru->addr) contents, 1620 * as well as otheru->path and otheru->addr itself. 1621 * 1622 * Using smp_store_release() here to set newu->addr 1623 * is enough to make those stores, as well as stores 1624 * to newu->path visible to anyone who gets newu->addr 1625 * by smp_load_acquire(). IOW, the same warranties 1626 * as for unix_sock instances bound in unix_bind() or 1627 * in unix_autobind(). 
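	 *
	 * The reader side of that pairing can be seen in unix_getname()
	 * below: it does addr = smp_load_acquire(&unix_sk(sk)->addr) and
	 * only then looks at addr->len / addr->name, so it observes either
	 * NULL or a fully initialised address (and, for sockets accepted
	 * from this path, a fully set up path as well).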
1628 */ 1629 if (otheru->path.dentry) { 1630 path_get(&otheru->path); 1631 newu->path = otheru->path; 1632 } 1633 refcount_inc(&otheru->addr->refcnt); 1634 smp_store_release(&newu->addr, otheru->addr); 1635 1636 /* Set credentials */ 1637 copy_peercred(sk, other); 1638 1639 sock->state = SS_CONNECTED; 1640 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); 1641 sock_hold(newsk); 1642 1643 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ 1644 unix_peer(sk) = newsk; 1645 1646 unix_state_unlock(sk); 1647 1648 /* take ten and send info to listening sock */ 1649 spin_lock(&other->sk_receive_queue.lock); 1650 __skb_queue_tail(&other->sk_receive_queue, skb); 1651 spin_unlock(&other->sk_receive_queue.lock); 1652 unix_state_unlock(other); 1653 other->sk_data_ready(other); 1654 sock_put(other); 1655 return 0; 1656 1657 out_unlock: 1658 if (other) 1659 unix_state_unlock(other); 1660 1661 out: 1662 kfree_skb(skb); 1663 if (newsk) 1664 unix_release_sock(newsk, 0); 1665 if (other) 1666 sock_put(other); 1667 return err; 1668 } 1669 1670 static int unix_socketpair(struct socket *socka, struct socket *sockb) 1671 { 1672 struct sock *ska = socka->sk, *skb = sockb->sk; 1673 1674 /* Join our sockets back to back */ 1675 sock_hold(ska); 1676 sock_hold(skb); 1677 unix_peer(ska) = skb; 1678 unix_peer(skb) = ska; 1679 init_peercred(ska); 1680 init_peercred(skb); 1681 1682 ska->sk_state = TCP_ESTABLISHED; 1683 skb->sk_state = TCP_ESTABLISHED; 1684 socka->state = SS_CONNECTED; 1685 sockb->state = SS_CONNECTED; 1686 return 0; 1687 } 1688 1689 static void unix_sock_inherit_flags(const struct socket *old, 1690 struct socket *new) 1691 { 1692 if (test_bit(SOCK_PASSCRED, &old->flags)) 1693 set_bit(SOCK_PASSCRED, &new->flags); 1694 if (test_bit(SOCK_PASSPIDFD, &old->flags)) 1695 set_bit(SOCK_PASSPIDFD, &new->flags); 1696 if (test_bit(SOCK_PASSSEC, &old->flags)) 1697 set_bit(SOCK_PASSSEC, &new->flags); 1698 } 1699 1700 static int unix_accept(struct socket *sock, struct socket *newsock, int flags, 1701 bool kern) 1702 { 1703 struct sock *sk = sock->sk; 1704 struct sock *tsk; 1705 struct sk_buff *skb; 1706 int err; 1707 1708 err = -EOPNOTSUPP; 1709 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 1710 goto out; 1711 1712 err = -EINVAL; 1713 if (sk->sk_state != TCP_LISTEN) 1714 goto out; 1715 1716 /* If socket state is TCP_LISTEN it cannot change (for now...), 1717 * so that no locks are necessary. 1718 */ 1719 1720 skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, 1721 &err); 1722 if (!skb) { 1723 /* This means receive shutdown. 
		 */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);
	}
	sock_put(sk);
out:
	return err;
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	/*
	 * Garbage collection of unix sockets starts by selecting a set of
	 * candidate sockets which have reference only from being in flight
	 * (total_refs == inflight_refs).  This condition is checked once during
	 * the candidate collection phase, and candidates are marked as such, so
	 * that non-candidates can later be ignored.  While inflight_refs is
	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
	 * is an instantaneous decision.
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operation that changes the file count through file descriptors
	 * (dup, close, sendmsg) does not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeuing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd.  The following lock/unlock
	 * pair is to ensure serialization with garbage collection.  It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate.  If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
1820 */ 1821 spin_lock(&unix_gc_lock); 1822 spin_unlock(&unix_gc_lock); 1823 } 1824 1825 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) 1826 { 1827 int err = 0; 1828 1829 UNIXCB(skb).pid = get_pid(scm->pid); 1830 UNIXCB(skb).uid = scm->creds.uid; 1831 UNIXCB(skb).gid = scm->creds.gid; 1832 UNIXCB(skb).fp = NULL; 1833 unix_get_secdata(scm, skb); 1834 if (scm->fp && send_fds) 1835 err = unix_attach_fds(scm, skb); 1836 1837 skb->destructor = unix_destruct_scm; 1838 return err; 1839 } 1840 1841 static bool unix_passcred_enabled(const struct socket *sock, 1842 const struct sock *other) 1843 { 1844 return test_bit(SOCK_PASSCRED, &sock->flags) || 1845 test_bit(SOCK_PASSPIDFD, &sock->flags) || 1846 !other->sk_socket || 1847 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || 1848 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); 1849 } 1850 1851 /* 1852 * Some apps rely on write() giving SCM_CREDENTIALS 1853 * We include credentials if source or destination socket 1854 * asserted SOCK_PASSCRED. 1855 */ 1856 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, 1857 const struct sock *other) 1858 { 1859 if (UNIXCB(skb).pid) 1860 return; 1861 if (unix_passcred_enabled(sock, other)) { 1862 UNIXCB(skb).pid = get_pid(task_tgid(current)); 1863 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); 1864 } 1865 } 1866 1867 static bool unix_skb_scm_eq(struct sk_buff *skb, 1868 struct scm_cookie *scm) 1869 { 1870 return UNIXCB(skb).pid == scm->pid && 1871 uid_eq(UNIXCB(skb).uid, scm->creds.uid) && 1872 gid_eq(UNIXCB(skb).gid, scm->creds.gid) && 1873 unix_secdata_eq(scm, skb); 1874 } 1875 1876 static void scm_stat_add(struct sock *sk, struct sk_buff *skb) 1877 { 1878 struct scm_fp_list *fp = UNIXCB(skb).fp; 1879 struct unix_sock *u = unix_sk(sk); 1880 1881 if (unlikely(fp && fp->count)) 1882 atomic_add(fp->count, &u->scm_stat.nr_fds); 1883 } 1884 1885 static void scm_stat_del(struct sock *sk, struct sk_buff *skb) 1886 { 1887 struct scm_fp_list *fp = UNIXCB(skb).fp; 1888 struct unix_sock *u = unix_sk(sk); 1889 1890 if (unlikely(fp && fp->count)) 1891 atomic_sub(fp->count, &u->scm_stat.nr_fds); 1892 } 1893 1894 /* 1895 * Send AF_UNIX data. 
1896 */ 1897 1898 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, 1899 size_t len) 1900 { 1901 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); 1902 struct sock *sk = sock->sk, *other = NULL; 1903 struct unix_sock *u = unix_sk(sk); 1904 struct scm_cookie scm; 1905 struct sk_buff *skb; 1906 int data_len = 0; 1907 int sk_locked; 1908 long timeo; 1909 int err; 1910 1911 wait_for_unix_gc(); 1912 err = scm_send(sock, msg, &scm, false); 1913 if (err < 0) 1914 return err; 1915 1916 err = -EOPNOTSUPP; 1917 if (msg->msg_flags&MSG_OOB) 1918 goto out; 1919 1920 if (msg->msg_namelen) { 1921 err = unix_validate_addr(sunaddr, msg->msg_namelen); 1922 if (err) 1923 goto out; 1924 } else { 1925 sunaddr = NULL; 1926 err = -ENOTCONN; 1927 other = unix_peer_get(sk); 1928 if (!other) 1929 goto out; 1930 } 1931 1932 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1933 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1934 !READ_ONCE(u->addr)) { 1935 err = unix_autobind(sk); 1936 if (err) 1937 goto out; 1938 } 1939 1940 err = -EMSGSIZE; 1941 if (len > READ_ONCE(sk->sk_sndbuf) - 32) 1942 goto out; 1943 1944 if (len > SKB_MAX_ALLOC) { 1945 data_len = min_t(size_t, 1946 len - SKB_MAX_ALLOC, 1947 MAX_SKB_FRAGS * PAGE_SIZE); 1948 data_len = PAGE_ALIGN(data_len); 1949 1950 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); 1951 } 1952 1953 skb = sock_alloc_send_pskb(sk, len - data_len, data_len, 1954 msg->msg_flags & MSG_DONTWAIT, &err, 1955 PAGE_ALLOC_COSTLY_ORDER); 1956 if (skb == NULL) 1957 goto out; 1958 1959 err = unix_scm_to_skb(&scm, skb, true); 1960 if (err < 0) 1961 goto out_free; 1962 1963 skb_put(skb, len - data_len); 1964 skb->data_len = data_len; 1965 skb->len = len; 1966 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); 1967 if (err) 1968 goto out_free; 1969 1970 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 1971 1972 restart: 1973 if (!other) { 1974 err = -ECONNRESET; 1975 if (sunaddr == NULL) 1976 goto out_free; 1977 1978 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen, 1979 sk->sk_type); 1980 if (IS_ERR(other)) { 1981 err = PTR_ERR(other); 1982 other = NULL; 1983 goto out_free; 1984 } 1985 } 1986 1987 if (sk_filter(other, skb) < 0) { 1988 /* Toss the packet but do not return any error to the sender */ 1989 err = len; 1990 goto out_free; 1991 } 1992 1993 sk_locked = 0; 1994 unix_state_lock(other); 1995 restart_locked: 1996 err = -EPERM; 1997 if (!unix_may_send(sk, other)) 1998 goto out_unlock; 1999 2000 if (unlikely(sock_flag(other, SOCK_DEAD))) { 2001 /* 2002 * Check with 1003.1g - what should 2003 * datagram error 2004 */ 2005 unix_state_unlock(other); 2006 sock_put(other); 2007 2008 if (!sk_locked) 2009 unix_state_lock(sk); 2010 2011 err = 0; 2012 if (sk->sk_type == SOCK_SEQPACKET) { 2013 /* We are here only when racing with unix_release_sock() 2014 * is clearing @other. Never change state to TCP_CLOSE 2015 * unlike SOCK_DGRAM wants. 
			 */
			unix_state_unlock(sk);
			err = -EPIPE;
		} else if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
		     unix_recvq_full_lockless(other))) {
		if (timeo) {
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}

/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
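 * (UNIX_SKB_FRAGS_SZ below evaluates to 32768 rounded up to a whole
 * power-of-two number of pages.)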
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
		     struct scm_cookie *scm, bool fds_sent)
{
	struct unix_sock *ousk = unix_sk(other);
	struct sk_buff *skb;
	int err = 0;

	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);

	if (!skb)
		return err;

	err = unix_scm_to_skb(scm, skb, !fds_sent);
	if (err < 0) {
		kfree_skb(skb);
		return err;
	}
	skb_put(skb, 1);
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);

	if (err) {
		kfree_skb(skb);
		return err;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    (other->sk_shutdown & RCV_SHUTDOWN)) {
		unix_state_unlock(other);
		kfree_skb(skb);
		return -EPIPE;
	}

	maybe_add_creds(skb, sock, other);
	skb_get(skb);

	scm_stat_add(other, skb);

	spin_lock(&other->sk_receive_queue.lock);
	if (ousk->oob_skb)
		consume_skb(ousk->oob_skb);
	WRITE_ONCE(ousk->oob_skb, skb);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);

	sk_send_sigurg(other);
	unix_state_unlock(other);
	other->sk_data_ready(other);

	return err;
}
#endif

static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	if (msg->msg_namelen) {
		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ?
		      -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			skb = sock_alloc_send_pskb(sk, 0, 0,
						   msg->msg_flags & MSG_DONTWAIT,
						   &err, 0);
		} else {
			/* Keep two messages in the pipe so it schedules better */
			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);

			/* allow fallback to order-0 allocations */
			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
						   msg->msg_flags & MSG_DONTWAIT, &err,
						   get_order(UNIX_SKB_FRAGS_SZ));
		}
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
						   sk->sk_allocation);
			if (err < 0) {
				kfree_skb(skb);
				goto out_err;
			}
			size = err;
			refcount_add(size, &sk->sk_wmem_alloc);
		} else {
			skb_put(skb, size - data_len);
			skb->data_len = data_len;
			skb->len = size;
			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
			if (err) {
				kfree_skb(skb);
				goto out_err;
			}
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		scm_stat_add(other, skb);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		err = queue_oob(sock, msg, other, &scm, fds_sent);
		if (err)
			goto out_err;
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	return sent ? : err;
}

static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
				  size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
				  size_t size, int flags)
{
	struct sock *sk = sock->sk;

	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(sock, msg, size, flags);
}

static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);

	if (addr) {
		msg->msg_namelen = addr->len;
		memcpy(msg->msg_name, addr->name, addr->len);
	}
}

int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}

static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock *sk = sock->sk;

#ifdef CONFIG_BPF_SYSCALL
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_dgram_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return __unix_dgram_recvmsg(sk, msg, size, flags);
}

static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	mutex_lock(&u->iolock);
	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
	mutex_unlock(&u->iolock);
	if (!skb)
		return err;

	return recv_actor(sk, skb);
}

/*
 * Sleep until more data has arrived. But check for races..
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, state);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}

static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}

struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;
	struct pipe_inode_info *pipe;
	size_t size;
	int flags;
	unsigned int splice_flags;
};

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int chunk = 1;
	struct sk_buff *oob_skb;

	mutex_lock(&u->iolock);
	unix_state_lock(sk);
	spin_lock(&sk->sk_receive_queue.lock);

	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
		spin_unlock(&sk->sk_receive_queue.lock);
		unix_state_unlock(sk);
		mutex_unlock(&u->iolock);
		return -EINVAL;
	}

	oob_skb = u->oob_skb;

	if (!(state->flags & MSG_PEEK))
		WRITE_ONCE(u->oob_skb, NULL);
	else
		skb_get(oob_skb);

	spin_unlock(&sk->sk_receive_queue.lock);
	unix_state_unlock(sk);

	chunk = state->recv_actor(oob_skb, 0, chunk, state);

	if (!(state->flags & MSG_PEEK))
		UNIXCB(oob_skb).consumed += 1;

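	/* Drop the reference that u->oob_skb held (or the extra one taken
	 * above for MSG_PEEK); the skb itself stays on the receive queue.
	 */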
	consume_skb(oob_skb);

	mutex_unlock(&u->iolock);

	if (chunk < 0)
		return -EFAULT;

	state->msg->msg_flags |= MSG_OOB;
	return 1;
}

static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct unix_sock *u = unix_sk(sk);

	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
		skb_unlink(skb, &sk->sk_receive_queue);
		consume_skb(skb);
		skb = NULL;
	} else {
		struct sk_buff *unlinked_skb = NULL;

		spin_lock(&sk->sk_receive_queue.lock);

		if (skb == u->oob_skb) {
			if (copied) {
				skb = NULL;
			} else if (sock_flag(sk, SOCK_URGINLINE)) {
				if (!(flags & MSG_PEEK)) {
					WRITE_ONCE(u->oob_skb, NULL);
					consume_skb(skb);
				}
			} else if (flags & MSG_PEEK) {
				skb = NULL;
			} else {
				__skb_unlink(skb, &sk->sk_receive_queue);
				WRITE_ONCE(u->oob_skb, NULL);
				unlinked_skb = skb;
				skb = skb_peek(&sk->sk_receive_queue);
			}
		}

		spin_unlock(&sk->sk_receive_queue.lock);

		if (unlinked_skb) {
			WARN_ON_ONCE(skb_unref(unlinked_skb));
			kfree_skb(unlinked_skb);
		}
	}
	return skb;
}
#endif

static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
		return -ENOTCONN;

	return unix_read_skb(sk, recv_actor);
}

static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

again:
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb && copied) {
				unix_state_unlock(sk);
				break;
			}
		}
#endif
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 * POSIX 1003.1g mandates this order.
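			 * (a pending socket error is reported before the EOF
			 * implied by RCV_SHUTDOWN)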
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv_unix(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}

static int unix_stream_read_actor(struct sk_buff *skb,
				  int skip, int chunk,
				  struct unix_stream_read_state *state)
{
	int ret;

	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
				    state->msg, chunk);
	return ret ?: chunk;
}

int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
			  size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sk->sk_socket,
		.msg = msg,
		.size = size,
		.flags = flags
	};

	return unix_stream_read_generic(&state, true);
}

static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_stream_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return unix_stream_read_generic(&state, true);
}

static int unix_stream_splice_actor(struct sk_buff *skb,
				    int skip, int chunk,
				    struct unix_stream_read_state *state)
{
	return skb_splice_bits(skb, state->socket->sk,
			       UNIXCB(skb).consumed + skip,
			       state->pipe, chunk, state->splice_flags);
}

static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
				       struct pipe_inode_info *pipe,
				       size_t size, unsigned int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_splice_actor,
		.socket = sock,
		.pipe = pipe,
		.size = size,
		.splice_flags = flags,
	};

	if (unlikely(*ppos))
		return -ESPIPE;

	if (sock->file->f_flags & O_NONBLOCK ||
	    flags & SPLICE_F_NONBLOCK)
		state.flags = MSG_DONTWAIT;

	return unix_stream_read_generic(&state, false);
}

static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;
		const struct proto *prot = READ_ONCE(other->sk_prot);

		if (prot->unhash)
			prot->unhash(other);
		if (mode&RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode&SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}

long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);

long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);

static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	if (!path.dentry)
		return -ENOENT;

	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	path_put(&path);

	return fd;
}

static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	case SIOCATMARK:
		{
			struct sk_buff *skb;
			int answ = 0;

			skb = skb_peek(&sk->sk_receive_queue);
			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
				answ = 1;
			err = put_user(answ, (int __user *)arg);
		}
		break;
#endif
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}

#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
#endif

static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err))
		mask |= EPOLLERR;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable?
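	 * Data in the receive queue, or data readable through sk_is_readable()
	 * (e.g. a psock backlog when the socket is in a sockmap), counts.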
	 */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk, state))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}

static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/* No write status requested, avoid expensive OUT tests.
	 */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk, state);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full_lockless(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}

#ifdef CONFIG_PROC_FS

#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))

static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	unsigned long count = 0;
	struct sock *sk;

	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
	     sk; sk = sk_next(sk)) {
		if (++count == offset)
			break;
	}

	return sk;
}

static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);
	struct net *net = seq_file_net(seq);
	struct sock *sk;

	while (bucket < UNIX_HASH_SIZE) {
		spin_lock(&net->unx.table.locks[bucket]);

		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

		spin_unlock(&net->unx.table.locks[bucket]);

		*pos = set_bucket_offset(++bucket, 1);
	}

	return NULL;
}

static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
				  loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);

	sk = sk_next(sk);
	if (sk)
		return sk;


	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);

	*pos = set_bucket_offset(++bucket, 1);

	return unix_get_first(seq, pos);
}

static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;

	return unix_get_first(seq, pos);
}

static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;

	if (v == SEQ_START_TOKEN)
		return unix_get_first(seq, pos);

	return unix_get_next(seq, v, pos);
}

static void unix_seq_stop(struct seq_file *seq, void *v)
{
	struct sock *sk = v;

	if (sk)
		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
}

static int unix_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	/* under a hash table lock here */
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len -
				offsetof(struct sockaddr_un, sun_path);
			if (u->addr->name->sun_path[0]) {
				len--;
			} else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}

static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};

#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
struct bpf_unix_iter_state {
	struct seq_net_private p;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	struct sock **batch;
	bool st_bucket_done;
};

struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	uid_t uid __aligned(8);
};

static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			      struct unix_sock *unix_sk, uid_t uid)
{
	struct bpf_iter__unix ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.unix_sk = unix_sk;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}

static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)

{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
		if (iter->end_sk < iter->max_sk) {
			sock_hold(sk);
			iter->batch[iter->end_sk++] = sk;
		}

		expected++;
	}

	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);

	return expected;
}

static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}

static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
				       unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_unix_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}

static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
					loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	if (iter->st_bucket_done)
		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;

	sk = unix_get_first(seq, pos);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_unix_hold_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}

static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;

	/* bpf iter does not support lseek, so it always
	 * continue from where it was stop()-ped.
	 */
	return bpf_iter_unix_batch(seq, pos);
}

static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);

	++*pos;

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_unix_batch(seq, pos);

	return sk;
}

static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	bool slow;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = unix_prog_seq_show(prog, &meta, v, uid);
unlock:
	unlock_sock_fast(sk, slow);
	return ret;
}

static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)unix_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_unix_put_batch(iter);
}

static const struct seq_operations bpf_iter_unix_seq_ops = {
	.start	= bpf_iter_unix_seq_start,
	.next	= bpf_iter_unix_seq_next,
	.stop	= bpf_iter_unix_seq_stop,
	.show	= bpf_iter_unix_seq_show,
};
#endif
#endif

static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};


static int __net_init unix_net_init(struct net *net)
{
	int i;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			     sizeof(struct seq_net_private)))
		goto err_sysctl;
#endif

	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
					      sizeof(spinlock_t), GFP_KERNEL);
	if (!net->unx.table.locks)
		goto err_proc;

	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
						sizeof(struct hlist_head),
						GFP_KERNEL);
	if (!net->unx.table.buckets)
		goto free_locks;

	for (i = 0; i < UNIX_HASH_SIZE; i++) {
		spin_lock_init(&net->unx.table.locks[i]);
		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
	}

	return 0;

free_locks:
	kvfree(net->unx.table.locks);
err_proc:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("unix", net->proc_net);
err_sysctl:
#endif
	unix_sysctl_unregister(net);
out:
	return -ENOMEM;
}

static void __net_exit unix_net_exit(struct net *net)
{
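	/* Release the per-netns state allocated in unix_net_init(). */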
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}

static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};

#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

#define INIT_BATCH_SZ 16

static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_unix_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
	if (err) {
		bpf_iter_fini_seq_net(priv_data);
		return err;
	}

	return 0;
}

static void bpf_iter_fini_unix(void *priv_data)
{
	struct bpf_unix_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}

static const struct bpf_iter_seq_info unix_seq_info = {
	.seq_ops		= &bpf_iter_unix_seq_ops,
	.init_seq_private	= bpf_iter_init_unix,
	.fini_seq_private	= bpf_iter_fini_unix,
	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
};

static const struct bpf_func_proto *
bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
			     const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}

static struct bpf_iter_reg unix_reg_info = {
	.target			= "unix",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__unix, unix_sk),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto		= bpf_iter_unix_get_func_proto,
	.seq_info		= &unix_seq_info,
};

static void __init bpf_iter_register(void)
{
	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
	if (bpf_iter_reg_target(&unix_reg_info))
		pr_warn("Warning: could not register bpf iterator unix\n");
}
#endif

static int __init af_unix_init(void)
{
	int i, rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));

	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
		spin_lock_init(&bsd_socket_locks[i]);
		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
	}

	rc = proto_register(&unix_dgram_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	rc = proto_register(&unix_stream_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		proto_unregister(&unix_dgram_proto);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
	unix_bpf_build_proto();

#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif

out:
	return rc;
}

static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_dgram_proto);
	proto_unregister(&unix_stream_proto);
	unregister_pernet_subsys(&unix_net_ops);
}

/* Earlier than device_initcall() so that other drivers invoking
   request_module() don't end up in a loop when modprobe tries
   to use a UNIX socket. But later than subsys_initcall() because
   we depend on stuff initialised there */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);