// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing.
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector.
 *		Heiko Eißfeldt	:	Missing verify_area check.
 *		Alan Cox	:	Started POSIXisms.
 *		Andreas Schwab	:	Replaced inode by dentry for proper
 *					reference counting.
 *		Kirk Petersen	:	Made this a module.
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect
 *					algorithm. Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listening
 *					socket has been reached. This won't
 *					break old apps and it avoids a huge
 *					number of hashed socks (for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations.
 *	     Alexey Kuznetsov	:	Full scale SMP. Lots of bugs are introduced 8)
 *	      Malcolm Beattie	:	Set peercred for socketpair.
 *	     Michal Ostrowski	:	Module initialization cleanup.
 *	     Arnaldo C. Melo	:	Removed MOD_{INC,DEC}_USE_COUNT;
 *					the core infrastructure does that
 *					for all net proto families now (2.5.69+).
 *
 * Known differences from the reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as the high water
 *		mark and a fake inode identifier (nor the BSD first-socket
 *		fstat-twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns a 0-length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername -
 *		BSD bug??).
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this against the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed the server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS-based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero-terminated)
 *		  starting with 0, so that this namespace does not intersect
 *		  with BSD names.
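 *		  (For example, a name whose first byte is '\0' followed by
 *		  "foo" is the abstract name that ss(1) and /proc/net/unix
 *		  display as "@foo"; it creates no filesystem node and
 *		  disappears when the last socket bound to it is closed.)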
76 */ 77 78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 79 80 #include <linux/module.h> 81 #include <linux/kernel.h> 82 #include <linux/signal.h> 83 #include <linux/sched/signal.h> 84 #include <linux/errno.h> 85 #include <linux/string.h> 86 #include <linux/stat.h> 87 #include <linux/dcache.h> 88 #include <linux/namei.h> 89 #include <linux/socket.h> 90 #include <linux/un.h> 91 #include <linux/fcntl.h> 92 #include <linux/filter.h> 93 #include <linux/termios.h> 94 #include <linux/sockios.h> 95 #include <linux/net.h> 96 #include <linux/in.h> 97 #include <linux/fs.h> 98 #include <linux/slab.h> 99 #include <linux/uaccess.h> 100 #include <linux/skbuff.h> 101 #include <linux/netdevice.h> 102 #include <net/net_namespace.h> 103 #include <net/sock.h> 104 #include <net/tcp_states.h> 105 #include <net/af_unix.h> 106 #include <linux/proc_fs.h> 107 #include <linux/seq_file.h> 108 #include <net/scm.h> 109 #include <linux/init.h> 110 #include <linux/poll.h> 111 #include <linux/rtnetlink.h> 112 #include <linux/mount.h> 113 #include <net/checksum.h> 114 #include <linux/security.h> 115 #include <linux/splice.h> 116 #include <linux/freezer.h> 117 #include <linux/file.h> 118 #include <linux/btf_ids.h> 119 120 #include "scm.h" 121 122 static atomic_long_t unix_nr_socks; 123 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; 124 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; 125 126 /* SMP locking strategy: 127 * hash table is protected with spinlock. 128 * each socket state is protected by separate spinlock. 129 */ 130 131 static unsigned int unix_unbound_hash(struct sock *sk) 132 { 133 unsigned long hash = (unsigned long)sk; 134 135 hash ^= hash >> 16; 136 hash ^= hash >> 8; 137 hash ^= sk->sk_type; 138 139 return hash & UNIX_HASH_MOD; 140 } 141 142 static unsigned int unix_bsd_hash(struct inode *i) 143 { 144 return i->i_ino & UNIX_HASH_MOD; 145 } 146 147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, 148 int addr_len, int type) 149 { 150 __wsum csum = csum_partial(sunaddr, addr_len, 0); 151 unsigned int hash; 152 153 hash = (__force unsigned int)csum_fold(csum); 154 hash ^= hash >> 8; 155 hash ^= type; 156 157 return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD); 158 } 159 160 static void unix_table_double_lock(struct net *net, 161 unsigned int hash1, unsigned int hash2) 162 { 163 if (hash1 == hash2) { 164 spin_lock(&net->unx.table.locks[hash1]); 165 return; 166 } 167 168 if (hash1 > hash2) 169 swap(hash1, hash2); 170 171 spin_lock(&net->unx.table.locks[hash1]); 172 spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING); 173 } 174 175 static void unix_table_double_unlock(struct net *net, 176 unsigned int hash1, unsigned int hash2) 177 { 178 if (hash1 == hash2) { 179 spin_unlock(&net->unx.table.locks[hash1]); 180 return; 181 } 182 183 spin_unlock(&net->unx.table.locks[hash1]); 184 spin_unlock(&net->unx.table.locks[hash2]); 185 } 186 187 #ifdef CONFIG_SECURITY_NETWORK 188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) 189 { 190 UNIXCB(skb).secid = scm->secid; 191 } 192 193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) 194 { 195 scm->secid = UNIXCB(skb).secid; 196 } 197 198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) 199 { 200 return (scm->secid == UNIXCB(skb).secid); 201 } 202 #else 203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) 204 { } 205 206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) 
207 { } 208 209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) 210 { 211 return true; 212 } 213 #endif /* CONFIG_SECURITY_NETWORK */ 214 215 #define unix_peer(sk) (unix_sk(sk)->peer) 216 217 static inline int unix_our_peer(struct sock *sk, struct sock *osk) 218 { 219 return unix_peer(osk) == sk; 220 } 221 222 static inline int unix_may_send(struct sock *sk, struct sock *osk) 223 { 224 return unix_peer(osk) == NULL || unix_our_peer(sk, osk); 225 } 226 227 static inline int unix_recvq_full(const struct sock *sk) 228 { 229 return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; 230 } 231 232 static inline int unix_recvq_full_lockless(const struct sock *sk) 233 { 234 return skb_queue_len_lockless(&sk->sk_receive_queue) > 235 READ_ONCE(sk->sk_max_ack_backlog); 236 } 237 238 struct sock *unix_peer_get(struct sock *s) 239 { 240 struct sock *peer; 241 242 unix_state_lock(s); 243 peer = unix_peer(s); 244 if (peer) 245 sock_hold(peer); 246 unix_state_unlock(s); 247 return peer; 248 } 249 EXPORT_SYMBOL_GPL(unix_peer_get); 250 251 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, 252 int addr_len) 253 { 254 struct unix_address *addr; 255 256 addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL); 257 if (!addr) 258 return NULL; 259 260 refcount_set(&addr->refcnt, 1); 261 addr->len = addr_len; 262 memcpy(addr->name, sunaddr, addr_len); 263 264 return addr; 265 } 266 267 static inline void unix_release_addr(struct unix_address *addr) 268 { 269 if (refcount_dec_and_test(&addr->refcnt)) 270 kfree(addr); 271 } 272 273 /* 274 * Check unix socket name: 275 * - should be not zero length. 276 * - if started by not zero, should be NULL terminated (FS object) 277 * - if started by zero, it is abstract name. 278 */ 279 280 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len) 281 { 282 if (addr_len <= offsetof(struct sockaddr_un, sun_path) || 283 addr_len > sizeof(*sunaddr)) 284 return -EINVAL; 285 286 if (sunaddr->sun_family != AF_UNIX) 287 return -EINVAL; 288 289 return 0; 290 } 291 292 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len) 293 { 294 struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr; 295 short offset = offsetof(struct sockaddr_storage, __data); 296 297 BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path)); 298 299 /* This may look like an off by one error but it is a bit more 300 * subtle. 108 is the longest valid AF_UNIX path for a binding. 301 * sun_path[108] doesn't as such exist. However in kernel space 302 * we are guaranteed that it is a valid memory location in our 303 * kernel address buffer because syscall functions always pass 304 * a pointer of struct sockaddr_storage which has a bigger buffer 305 * than 108. Also, we must terminate sun_path for strlen() in 306 * getname_kernel(). 307 */ 308 addr->__data[addr_len - offset] = 0; 309 310 /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will 311 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen() 312 * know the actual buffer. 
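 *
 * Illustrative example (not part of the original comment): for a bind()
 * to "/tmp/sock" (9 bytes), the value returned below is
 * strlen("/tmp/sock") + offsetof(struct sockaddr_un, sun_path) + 1;
 * the path length is recomputed from the freshly terminated buffer, so
 * the result no longer depends on whether the caller included the
 * trailing NUL in addr_len.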
313 */ 314 return strlen(addr->__data) + offset + 1; 315 } 316 317 static void __unix_remove_socket(struct sock *sk) 318 { 319 sk_del_node_init(sk); 320 } 321 322 static void __unix_insert_socket(struct net *net, struct sock *sk) 323 { 324 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 325 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]); 326 } 327 328 static void __unix_set_addr_hash(struct net *net, struct sock *sk, 329 struct unix_address *addr, unsigned int hash) 330 { 331 __unix_remove_socket(sk); 332 smp_store_release(&unix_sk(sk)->addr, addr); 333 334 sk->sk_hash = hash; 335 __unix_insert_socket(net, sk); 336 } 337 338 static void unix_remove_socket(struct net *net, struct sock *sk) 339 { 340 spin_lock(&net->unx.table.locks[sk->sk_hash]); 341 __unix_remove_socket(sk); 342 spin_unlock(&net->unx.table.locks[sk->sk_hash]); 343 } 344 345 static void unix_insert_unbound_socket(struct net *net, struct sock *sk) 346 { 347 spin_lock(&net->unx.table.locks[sk->sk_hash]); 348 __unix_insert_socket(net, sk); 349 spin_unlock(&net->unx.table.locks[sk->sk_hash]); 350 } 351 352 static void unix_insert_bsd_socket(struct sock *sk) 353 { 354 spin_lock(&bsd_socket_locks[sk->sk_hash]); 355 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]); 356 spin_unlock(&bsd_socket_locks[sk->sk_hash]); 357 } 358 359 static void unix_remove_bsd_socket(struct sock *sk) 360 { 361 if (!hlist_unhashed(&sk->sk_bind_node)) { 362 spin_lock(&bsd_socket_locks[sk->sk_hash]); 363 __sk_del_bind_node(sk); 364 spin_unlock(&bsd_socket_locks[sk->sk_hash]); 365 366 sk_node_init(&sk->sk_bind_node); 367 } 368 } 369 370 static struct sock *__unix_find_socket_byname(struct net *net, 371 struct sockaddr_un *sunname, 372 int len, unsigned int hash) 373 { 374 struct sock *s; 375 376 sk_for_each(s, &net->unx.table.buckets[hash]) { 377 struct unix_sock *u = unix_sk(s); 378 379 if (u->addr->len == len && 380 !memcmp(u->addr->name, sunname, len)) 381 return s; 382 } 383 return NULL; 384 } 385 386 static inline struct sock *unix_find_socket_byname(struct net *net, 387 struct sockaddr_un *sunname, 388 int len, unsigned int hash) 389 { 390 struct sock *s; 391 392 spin_lock(&net->unx.table.locks[hash]); 393 s = __unix_find_socket_byname(net, sunname, len, hash); 394 if (s) 395 sock_hold(s); 396 spin_unlock(&net->unx.table.locks[hash]); 397 return s; 398 } 399 400 static struct sock *unix_find_socket_byinode(struct inode *i) 401 { 402 unsigned int hash = unix_bsd_hash(i); 403 struct sock *s; 404 405 spin_lock(&bsd_socket_locks[hash]); 406 sk_for_each_bound(s, &bsd_socket_buckets[hash]) { 407 struct dentry *dentry = unix_sk(s)->path.dentry; 408 409 if (dentry && d_backing_inode(dentry) == i) { 410 sock_hold(s); 411 spin_unlock(&bsd_socket_locks[hash]); 412 return s; 413 } 414 } 415 spin_unlock(&bsd_socket_locks[hash]); 416 return NULL; 417 } 418 419 /* Support code for asymmetrically connected dgram sockets 420 * 421 * If a datagram socket is connected to a socket not itself connected 422 * to the first socket (eg, /dev/log), clients may only enqueue more 423 * messages if the present receive queue of the server socket is not 424 * "too large". This means there's a second writeability condition 425 * poll and sendmsg need to test. The dgram recv code will do a wake 426 * up on the peer_wait wait queue of a socket upon reception of a 427 * datagram which needs to be propagated to sleeping would-be writers 428 * since these might not have sent anything so far. 
This can't be 429 * accomplished via poll_wait because the lifetime of the server 430 * socket might be less than that of its clients if these break their 431 * association with it or if the server socket is closed while clients 432 * are still connected to it and there's no way to inform "a polling 433 * implementation" that it should let go of a certain wait queue 434 * 435 * In order to propagate a wake up, a wait_queue_entry_t of the client 436 * socket is enqueued on the peer_wait queue of the server socket 437 * whose wake function does a wake_up on the ordinary client socket 438 * wait queue. This connection is established whenever a write (or 439 * poll for write) hit the flow control condition and broken when the 440 * association to the server socket is dissolved or after a wake up 441 * was relayed. 442 */ 443 444 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags, 445 void *key) 446 { 447 struct unix_sock *u; 448 wait_queue_head_t *u_sleep; 449 450 u = container_of(q, struct unix_sock, peer_wake); 451 452 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, 453 q); 454 u->peer_wake.private = NULL; 455 456 /* relaying can only happen while the wq still exists */ 457 u_sleep = sk_sleep(&u->sk); 458 if (u_sleep) 459 wake_up_interruptible_poll(u_sleep, key_to_poll(key)); 460 461 return 0; 462 } 463 464 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) 465 { 466 struct unix_sock *u, *u_other; 467 int rc; 468 469 u = unix_sk(sk); 470 u_other = unix_sk(other); 471 rc = 0; 472 spin_lock(&u_other->peer_wait.lock); 473 474 if (!u->peer_wake.private) { 475 u->peer_wake.private = other; 476 __add_wait_queue(&u_other->peer_wait, &u->peer_wake); 477 478 rc = 1; 479 } 480 481 spin_unlock(&u_other->peer_wait.lock); 482 return rc; 483 } 484 485 static void unix_dgram_peer_wake_disconnect(struct sock *sk, 486 struct sock *other) 487 { 488 struct unix_sock *u, *u_other; 489 490 u = unix_sk(sk); 491 u_other = unix_sk(other); 492 spin_lock(&u_other->peer_wait.lock); 493 494 if (u->peer_wake.private == other) { 495 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); 496 u->peer_wake.private = NULL; 497 } 498 499 spin_unlock(&u_other->peer_wait.lock); 500 } 501 502 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, 503 struct sock *other) 504 { 505 unix_dgram_peer_wake_disconnect(sk, other); 506 wake_up_interruptible_poll(sk_sleep(sk), 507 EPOLLOUT | 508 EPOLLWRNORM | 509 EPOLLWRBAND); 510 } 511 512 /* preconditions: 513 * - unix_peer(sk) == other 514 * - association is stable 515 */ 516 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) 517 { 518 int connected; 519 520 connected = unix_dgram_peer_wake_connect(sk, other); 521 522 /* If other is SOCK_DEAD, we want to make sure we signal 523 * POLLOUT, such that a subsequent write() can get a 524 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs 525 * to other and its full, we will hang waiting for POLLOUT. 
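 *
 * Put differently: this function returns 1 when the caller should
 * sleep, i.e. the peer is alive and its receive queue is over the
 * backlog limit, in which case the relay entry registered above will
 * wake us later.  It returns 0 when the caller may proceed (the queue
 * has room, or the peer is dead so the write will fail immediately),
 * and then the peer_wait association made above, if this call made it,
 * is torn down again.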
526 */ 527 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) 528 return 1; 529 530 if (connected) 531 unix_dgram_peer_wake_disconnect(sk, other); 532 533 return 0; 534 } 535 536 static int unix_writable(const struct sock *sk) 537 { 538 return sk->sk_state != TCP_LISTEN && 539 (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; 540 } 541 542 static void unix_write_space(struct sock *sk) 543 { 544 struct socket_wq *wq; 545 546 rcu_read_lock(); 547 if (unix_writable(sk)) { 548 wq = rcu_dereference(sk->sk_wq); 549 if (skwq_has_sleeper(wq)) 550 wake_up_interruptible_sync_poll(&wq->wait, 551 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); 552 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 553 } 554 rcu_read_unlock(); 555 } 556 557 /* When dgram socket disconnects (or changes its peer), we clear its receive 558 * queue of packets arrived from previous peer. First, it allows to do 559 * flow control based only on wmem_alloc; second, sk connected to peer 560 * may receive messages only from that peer. */ 561 static void unix_dgram_disconnected(struct sock *sk, struct sock *other) 562 { 563 if (!skb_queue_empty(&sk->sk_receive_queue)) { 564 skb_queue_purge(&sk->sk_receive_queue); 565 wake_up_interruptible_all(&unix_sk(sk)->peer_wait); 566 567 /* If one link of bidirectional dgram pipe is disconnected, 568 * we signal error. Messages are lost. Do not make this, 569 * when peer was not connected to us. 570 */ 571 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { 572 WRITE_ONCE(other->sk_err, ECONNRESET); 573 sk_error_report(other); 574 } 575 } 576 other->sk_state = TCP_CLOSE; 577 } 578 579 static void unix_sock_destructor(struct sock *sk) 580 { 581 struct unix_sock *u = unix_sk(sk); 582 583 skb_queue_purge(&sk->sk_receive_queue); 584 585 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); 586 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 587 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); 588 if (!sock_flag(sk, SOCK_DEAD)) { 589 pr_info("Attempt to release alive unix socket: %p\n", sk); 590 return; 591 } 592 593 if (u->addr) 594 unix_release_addr(u->addr); 595 596 atomic_long_dec(&unix_nr_socks); 597 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 598 #ifdef UNIX_REFCNT_DEBUG 599 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk, 600 atomic_long_read(&unix_nr_socks)); 601 #endif 602 } 603 604 static void unix_release_sock(struct sock *sk, int embrion) 605 { 606 struct unix_sock *u = unix_sk(sk); 607 struct sock *skpair; 608 struct sk_buff *skb; 609 struct path path; 610 int state; 611 612 unix_remove_socket(sock_net(sk), sk); 613 unix_remove_bsd_socket(sk); 614 615 /* Clear state */ 616 unix_state_lock(sk); 617 sock_orphan(sk); 618 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); 619 path = u->path; 620 u->path.dentry = NULL; 621 u->path.mnt = NULL; 622 state = sk->sk_state; 623 sk->sk_state = TCP_CLOSE; 624 625 skpair = unix_peer(sk); 626 unix_peer(sk) = NULL; 627 628 unix_state_unlock(sk); 629 630 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 631 if (u->oob_skb) { 632 kfree_skb(u->oob_skb); 633 u->oob_skb = NULL; 634 } 635 #endif 636 637 wake_up_interruptible_all(&u->peer_wait); 638 639 if (skpair != NULL) { 640 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { 641 unix_state_lock(skpair); 642 /* No more writes */ 643 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); 644 if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) 645 WRITE_ONCE(skpair->sk_err, ECONNRESET); 646 unix_state_unlock(skpair); 647 skpair->sk_state_change(skpair); 648 sk_wake_async(skpair, 
SOCK_WAKE_WAITD, POLL_HUP); 649 } 650 651 unix_dgram_peer_wake_disconnect(sk, skpair); 652 sock_put(skpair); /* It may now die */ 653 } 654 655 /* Try to flush out this socket. Throw out buffers at least */ 656 657 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { 658 if (state == TCP_LISTEN) 659 unix_release_sock(skb->sk, 1); 660 /* passed fds are erased in the kfree_skb hook */ 661 UNIXCB(skb).consumed = skb->len; 662 kfree_skb(skb); 663 } 664 665 if (path.dentry) 666 path_put(&path); 667 668 sock_put(sk); 669 670 /* ---- Socket is dead now and most probably destroyed ---- */ 671 672 /* 673 * Fixme: BSD difference: In BSD all sockets connected to us get 674 * ECONNRESET and we die on the spot. In Linux we behave 675 * like files and pipes do and wait for the last 676 * dereference. 677 * 678 * Can't we simply set sock->err? 679 * 680 * What the above comment does talk about? --ANK(980817) 681 */ 682 683 if (READ_ONCE(unix_tot_inflight)) 684 unix_gc(); /* Garbage collect fds */ 685 } 686 687 static void init_peercred(struct sock *sk) 688 { 689 const struct cred *old_cred; 690 struct pid *old_pid; 691 692 spin_lock(&sk->sk_peer_lock); 693 old_pid = sk->sk_peer_pid; 694 old_cred = sk->sk_peer_cred; 695 sk->sk_peer_pid = get_pid(task_tgid(current)); 696 sk->sk_peer_cred = get_current_cred(); 697 spin_unlock(&sk->sk_peer_lock); 698 699 put_pid(old_pid); 700 put_cred(old_cred); 701 } 702 703 static void copy_peercred(struct sock *sk, struct sock *peersk) 704 { 705 const struct cred *old_cred; 706 struct pid *old_pid; 707 708 if (sk < peersk) { 709 spin_lock(&sk->sk_peer_lock); 710 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); 711 } else { 712 spin_lock(&peersk->sk_peer_lock); 713 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); 714 } 715 old_pid = sk->sk_peer_pid; 716 old_cred = sk->sk_peer_cred; 717 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); 718 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); 719 720 spin_unlock(&sk->sk_peer_lock); 721 spin_unlock(&peersk->sk_peer_lock); 722 723 put_pid(old_pid); 724 put_cred(old_cred); 725 } 726 727 static int unix_listen(struct socket *sock, int backlog) 728 { 729 int err; 730 struct sock *sk = sock->sk; 731 struct unix_sock *u = unix_sk(sk); 732 733 err = -EOPNOTSUPP; 734 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 735 goto out; /* Only stream/seqpacket sockets accept */ 736 err = -EINVAL; 737 if (!u->addr) 738 goto out; /* No listens on an unbound socket */ 739 unix_state_lock(sk); 740 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) 741 goto out_unlock; 742 if (backlog > sk->sk_max_ack_backlog) 743 wake_up_interruptible_all(&u->peer_wait); 744 sk->sk_max_ack_backlog = backlog; 745 sk->sk_state = TCP_LISTEN; 746 /* set credentials so connect can copy them */ 747 init_peercred(sk); 748 err = 0; 749 750 out_unlock: 751 unix_state_unlock(sk); 752 out: 753 return err; 754 } 755 756 static int unix_release(struct socket *); 757 static int unix_bind(struct socket *, struct sockaddr *, int); 758 static int unix_stream_connect(struct socket *, struct sockaddr *, 759 int addr_len, int flags); 760 static int unix_socketpair(struct socket *, struct socket *); 761 static int unix_accept(struct socket *, struct socket *, int, bool); 762 static int unix_getname(struct socket *, struct sockaddr *, int); 763 static __poll_t unix_poll(struct file *, struct socket *, poll_table *); 764 static __poll_t unix_dgram_poll(struct file *, struct socket *, 765 poll_table *); 766 static int 
unix_ioctl(struct socket *, unsigned int, unsigned long); 767 #ifdef CONFIG_COMPAT 768 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); 769 #endif 770 static int unix_shutdown(struct socket *, int); 771 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); 772 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); 773 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, 774 struct pipe_inode_info *, size_t size, 775 unsigned int flags); 776 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); 777 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); 778 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 779 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 780 static int unix_dgram_connect(struct socket *, struct sockaddr *, 781 int, int); 782 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); 783 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, 784 int); 785 786 static int unix_set_peek_off(struct sock *sk, int val) 787 { 788 struct unix_sock *u = unix_sk(sk); 789 790 if (mutex_lock_interruptible(&u->iolock)) 791 return -EINTR; 792 793 WRITE_ONCE(sk->sk_peek_off, val); 794 mutex_unlock(&u->iolock); 795 796 return 0; 797 } 798 799 #ifdef CONFIG_PROC_FS 800 static int unix_count_nr_fds(struct sock *sk) 801 { 802 struct sk_buff *skb; 803 struct unix_sock *u; 804 int nr_fds = 0; 805 806 spin_lock(&sk->sk_receive_queue.lock); 807 skb = skb_peek(&sk->sk_receive_queue); 808 while (skb) { 809 u = unix_sk(skb->sk); 810 nr_fds += atomic_read(&u->scm_stat.nr_fds); 811 skb = skb_peek_next(skb, &sk->sk_receive_queue); 812 } 813 spin_unlock(&sk->sk_receive_queue.lock); 814 815 return nr_fds; 816 } 817 818 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) 819 { 820 struct sock *sk = sock->sk; 821 unsigned char s_state; 822 struct unix_sock *u; 823 int nr_fds = 0; 824 825 if (sk) { 826 s_state = READ_ONCE(sk->sk_state); 827 u = unix_sk(sk); 828 829 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their 830 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. 831 * SOCK_DGRAM is ordinary. So, no lock is needed. 
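 *
 * The value printed below appears as a "scm_fds:" line in
 * /proc/<pid>/fdinfo/<fd> for the socket and counts SCM_RIGHTS file
 * descriptors sitting in the receive queue that have not been received
 * yet; for a listener it sums over all embryo sockets queued for
 * accept().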
832 */ 833 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) 834 nr_fds = atomic_read(&u->scm_stat.nr_fds); 835 else if (s_state == TCP_LISTEN) 836 nr_fds = unix_count_nr_fds(sk); 837 838 seq_printf(m, "scm_fds: %u\n", nr_fds); 839 } 840 } 841 #else 842 #define unix_show_fdinfo NULL 843 #endif 844 845 static const struct proto_ops unix_stream_ops = { 846 .family = PF_UNIX, 847 .owner = THIS_MODULE, 848 .release = unix_release, 849 .bind = unix_bind, 850 .connect = unix_stream_connect, 851 .socketpair = unix_socketpair, 852 .accept = unix_accept, 853 .getname = unix_getname, 854 .poll = unix_poll, 855 .ioctl = unix_ioctl, 856 #ifdef CONFIG_COMPAT 857 .compat_ioctl = unix_compat_ioctl, 858 #endif 859 .listen = unix_listen, 860 .shutdown = unix_shutdown, 861 .sendmsg = unix_stream_sendmsg, 862 .recvmsg = unix_stream_recvmsg, 863 .read_skb = unix_stream_read_skb, 864 .mmap = sock_no_mmap, 865 .splice_read = unix_stream_splice_read, 866 .set_peek_off = unix_set_peek_off, 867 .show_fdinfo = unix_show_fdinfo, 868 }; 869 870 static const struct proto_ops unix_dgram_ops = { 871 .family = PF_UNIX, 872 .owner = THIS_MODULE, 873 .release = unix_release, 874 .bind = unix_bind, 875 .connect = unix_dgram_connect, 876 .socketpair = unix_socketpair, 877 .accept = sock_no_accept, 878 .getname = unix_getname, 879 .poll = unix_dgram_poll, 880 .ioctl = unix_ioctl, 881 #ifdef CONFIG_COMPAT 882 .compat_ioctl = unix_compat_ioctl, 883 #endif 884 .listen = sock_no_listen, 885 .shutdown = unix_shutdown, 886 .sendmsg = unix_dgram_sendmsg, 887 .read_skb = unix_read_skb, 888 .recvmsg = unix_dgram_recvmsg, 889 .mmap = sock_no_mmap, 890 .set_peek_off = unix_set_peek_off, 891 .show_fdinfo = unix_show_fdinfo, 892 }; 893 894 static const struct proto_ops unix_seqpacket_ops = { 895 .family = PF_UNIX, 896 .owner = THIS_MODULE, 897 .release = unix_release, 898 .bind = unix_bind, 899 .connect = unix_stream_connect, 900 .socketpair = unix_socketpair, 901 .accept = unix_accept, 902 .getname = unix_getname, 903 .poll = unix_dgram_poll, 904 .ioctl = unix_ioctl, 905 #ifdef CONFIG_COMPAT 906 .compat_ioctl = unix_compat_ioctl, 907 #endif 908 .listen = unix_listen, 909 .shutdown = unix_shutdown, 910 .sendmsg = unix_seqpacket_sendmsg, 911 .recvmsg = unix_seqpacket_recvmsg, 912 .mmap = sock_no_mmap, 913 .set_peek_off = unix_set_peek_off, 914 .show_fdinfo = unix_show_fdinfo, 915 }; 916 917 static void unix_close(struct sock *sk, long timeout) 918 { 919 /* Nothing to do here, unix socket does not need a ->close(). 920 * This is merely for sockmap. 921 */ 922 } 923 924 static void unix_unhash(struct sock *sk) 925 { 926 /* Nothing to do here, unix socket does not need a ->unhash(). 927 * This is merely for sockmap. 
928 */ 929 } 930 931 static bool unix_bpf_bypass_getsockopt(int level, int optname) 932 { 933 if (level == SOL_SOCKET) { 934 switch (optname) { 935 case SO_PEERPIDFD: 936 return true; 937 default: 938 return false; 939 } 940 } 941 942 return false; 943 } 944 945 struct proto unix_dgram_proto = { 946 .name = "UNIX", 947 .owner = THIS_MODULE, 948 .obj_size = sizeof(struct unix_sock), 949 .close = unix_close, 950 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 951 #ifdef CONFIG_BPF_SYSCALL 952 .psock_update_sk_prot = unix_dgram_bpf_update_proto, 953 #endif 954 }; 955 956 struct proto unix_stream_proto = { 957 .name = "UNIX-STREAM", 958 .owner = THIS_MODULE, 959 .obj_size = sizeof(struct unix_sock), 960 .close = unix_close, 961 .unhash = unix_unhash, 962 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 963 #ifdef CONFIG_BPF_SYSCALL 964 .psock_update_sk_prot = unix_stream_bpf_update_proto, 965 #endif 966 }; 967 968 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) 969 { 970 struct unix_sock *u; 971 struct sock *sk; 972 int err; 973 974 atomic_long_inc(&unix_nr_socks); 975 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { 976 err = -ENFILE; 977 goto err; 978 } 979 980 if (type == SOCK_STREAM) 981 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); 982 else /*dgram and seqpacket */ 983 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); 984 985 if (!sk) { 986 err = -ENOMEM; 987 goto err; 988 } 989 990 sock_init_data(sock, sk); 991 992 sk->sk_hash = unix_unbound_hash(sk); 993 sk->sk_allocation = GFP_KERNEL_ACCOUNT; 994 sk->sk_write_space = unix_write_space; 995 sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; 996 sk->sk_destruct = unix_sock_destructor; 997 u = unix_sk(sk); 998 u->path.dentry = NULL; 999 u->path.mnt = NULL; 1000 spin_lock_init(&u->lock); 1001 atomic_long_set(&u->inflight, 0); 1002 INIT_LIST_HEAD(&u->link); 1003 mutex_init(&u->iolock); /* single task reading lock */ 1004 mutex_init(&u->bindlock); /* single task binding lock */ 1005 init_waitqueue_head(&u->peer_wait); 1006 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); 1007 memset(&u->scm_stat, 0, sizeof(struct scm_stat)); 1008 unix_insert_unbound_socket(net, sk); 1009 1010 sock_prot_inuse_add(net, sk->sk_prot, 1); 1011 1012 return sk; 1013 1014 err: 1015 atomic_long_dec(&unix_nr_socks); 1016 return ERR_PTR(err); 1017 } 1018 1019 static int unix_create(struct net *net, struct socket *sock, int protocol, 1020 int kern) 1021 { 1022 struct sock *sk; 1023 1024 if (protocol && protocol != PF_UNIX) 1025 return -EPROTONOSUPPORT; 1026 1027 sock->state = SS_UNCONNECTED; 1028 1029 switch (sock->type) { 1030 case SOCK_STREAM: 1031 sock->ops = &unix_stream_ops; 1032 break; 1033 /* 1034 * Believe it or not BSD has AF_UNIX, SOCK_RAW though 1035 * nothing uses it. 
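 *
 * Linux quietly accepts it as well: the SOCK_RAW case below rewrites
 * sock->type to SOCK_DGRAM and falls through, so
 * socket(AF_UNIX, SOCK_RAW, 0) behaves exactly like a datagram socket.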
1036 */ 1037 case SOCK_RAW: 1038 sock->type = SOCK_DGRAM; 1039 fallthrough; 1040 case SOCK_DGRAM: 1041 sock->ops = &unix_dgram_ops; 1042 break; 1043 case SOCK_SEQPACKET: 1044 sock->ops = &unix_seqpacket_ops; 1045 break; 1046 default: 1047 return -ESOCKTNOSUPPORT; 1048 } 1049 1050 sk = unix_create1(net, sock, kern, sock->type); 1051 if (IS_ERR(sk)) 1052 return PTR_ERR(sk); 1053 1054 return 0; 1055 } 1056 1057 static int unix_release(struct socket *sock) 1058 { 1059 struct sock *sk = sock->sk; 1060 1061 if (!sk) 1062 return 0; 1063 1064 sk->sk_prot->close(sk, 0); 1065 unix_release_sock(sk, 0); 1066 sock->sk = NULL; 1067 1068 return 0; 1069 } 1070 1071 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, 1072 int type) 1073 { 1074 struct inode *inode; 1075 struct path path; 1076 struct sock *sk; 1077 int err; 1078 1079 unix_mkname_bsd(sunaddr, addr_len); 1080 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); 1081 if (err) 1082 goto fail; 1083 1084 err = path_permission(&path, MAY_WRITE); 1085 if (err) 1086 goto path_put; 1087 1088 err = -ECONNREFUSED; 1089 inode = d_backing_inode(path.dentry); 1090 if (!S_ISSOCK(inode->i_mode)) 1091 goto path_put; 1092 1093 sk = unix_find_socket_byinode(inode); 1094 if (!sk) 1095 goto path_put; 1096 1097 err = -EPROTOTYPE; 1098 if (sk->sk_type == type) 1099 touch_atime(&path); 1100 else 1101 goto sock_put; 1102 1103 path_put(&path); 1104 1105 return sk; 1106 1107 sock_put: 1108 sock_put(sk); 1109 path_put: 1110 path_put(&path); 1111 fail: 1112 return ERR_PTR(err); 1113 } 1114 1115 static struct sock *unix_find_abstract(struct net *net, 1116 struct sockaddr_un *sunaddr, 1117 int addr_len, int type) 1118 { 1119 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); 1120 struct dentry *dentry; 1121 struct sock *sk; 1122 1123 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); 1124 if (!sk) 1125 return ERR_PTR(-ECONNREFUSED); 1126 1127 dentry = unix_sk(sk)->path.dentry; 1128 if (dentry) 1129 touch_atime(&unix_sk(sk)->path); 1130 1131 return sk; 1132 } 1133 1134 static struct sock *unix_find_other(struct net *net, 1135 struct sockaddr_un *sunaddr, 1136 int addr_len, int type) 1137 { 1138 struct sock *sk; 1139 1140 if (sunaddr->sun_path[0]) 1141 sk = unix_find_bsd(sunaddr, addr_len, type); 1142 else 1143 sk = unix_find_abstract(net, sunaddr, addr_len, type); 1144 1145 return sk; 1146 } 1147 1148 static int unix_autobind(struct sock *sk) 1149 { 1150 unsigned int new_hash, old_hash = sk->sk_hash; 1151 struct unix_sock *u = unix_sk(sk); 1152 struct net *net = sock_net(sk); 1153 struct unix_address *addr; 1154 u32 lastnum, ordernum; 1155 int err; 1156 1157 err = mutex_lock_interruptible(&u->bindlock); 1158 if (err) 1159 return err; 1160 1161 if (u->addr) 1162 goto out; 1163 1164 err = -ENOMEM; 1165 addr = kzalloc(sizeof(*addr) + 1166 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); 1167 if (!addr) 1168 goto out; 1169 1170 addr->len = offsetof(struct sockaddr_un, sun_path) + 6; 1171 addr->name->sun_family = AF_UNIX; 1172 refcount_set(&addr->refcnt, 1); 1173 1174 ordernum = get_random_u32(); 1175 lastnum = ordernum & 0xFFFFF; 1176 retry: 1177 ordernum = (ordernum + 1) & 0xFFFFF; 1178 sprintf(addr->name->sun_path + 1, "%05x", ordernum); 1179 1180 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1181 unix_table_double_lock(net, old_hash, new_hash); 1182 1183 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { 1184 unix_table_double_unlock(net, old_hash, new_hash); 1185 1186 /* 
__unix_find_socket_byname() may take long time if many names 1187 * are already in use. 1188 */ 1189 cond_resched(); 1190 1191 if (ordernum == lastnum) { 1192 /* Give up if all names seems to be in use. */ 1193 err = -ENOSPC; 1194 unix_release_addr(addr); 1195 goto out; 1196 } 1197 1198 goto retry; 1199 } 1200 1201 __unix_set_addr_hash(net, sk, addr, new_hash); 1202 unix_table_double_unlock(net, old_hash, new_hash); 1203 err = 0; 1204 1205 out: mutex_unlock(&u->bindlock); 1206 return err; 1207 } 1208 1209 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, 1210 int addr_len) 1211 { 1212 umode_t mode = S_IFSOCK | 1213 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); 1214 unsigned int new_hash, old_hash = sk->sk_hash; 1215 struct unix_sock *u = unix_sk(sk); 1216 struct net *net = sock_net(sk); 1217 struct mnt_idmap *idmap; 1218 struct unix_address *addr; 1219 struct dentry *dentry; 1220 struct path parent; 1221 int err; 1222 1223 addr_len = unix_mkname_bsd(sunaddr, addr_len); 1224 addr = unix_create_addr(sunaddr, addr_len); 1225 if (!addr) 1226 return -ENOMEM; 1227 1228 /* 1229 * Get the parent directory, calculate the hash for last 1230 * component. 1231 */ 1232 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); 1233 if (IS_ERR(dentry)) { 1234 err = PTR_ERR(dentry); 1235 goto out; 1236 } 1237 1238 /* 1239 * All right, let's create it. 1240 */ 1241 idmap = mnt_idmap(parent.mnt); 1242 err = security_path_mknod(&parent, dentry, mode, 0); 1243 if (!err) 1244 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); 1245 if (err) 1246 goto out_path; 1247 err = mutex_lock_interruptible(&u->bindlock); 1248 if (err) 1249 goto out_unlink; 1250 if (u->addr) 1251 goto out_unlock; 1252 1253 new_hash = unix_bsd_hash(d_backing_inode(dentry)); 1254 unix_table_double_lock(net, old_hash, new_hash); 1255 u->path.mnt = mntget(parent.mnt); 1256 u->path.dentry = dget(dentry); 1257 __unix_set_addr_hash(net, sk, addr, new_hash); 1258 unix_table_double_unlock(net, old_hash, new_hash); 1259 unix_insert_bsd_socket(sk); 1260 mutex_unlock(&u->bindlock); 1261 done_path_create(&parent, dentry); 1262 return 0; 1263 1264 out_unlock: 1265 mutex_unlock(&u->bindlock); 1266 err = -EINVAL; 1267 out_unlink: 1268 /* failed after successful mknod? unlink what we'd created... */ 1269 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); 1270 out_path: 1271 done_path_create(&parent, dentry); 1272 out: 1273 unix_release_addr(addr); 1274 return err == -EEXIST ? 
-EADDRINUSE : err; 1275 } 1276 1277 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, 1278 int addr_len) 1279 { 1280 unsigned int new_hash, old_hash = sk->sk_hash; 1281 struct unix_sock *u = unix_sk(sk); 1282 struct net *net = sock_net(sk); 1283 struct unix_address *addr; 1284 int err; 1285 1286 addr = unix_create_addr(sunaddr, addr_len); 1287 if (!addr) 1288 return -ENOMEM; 1289 1290 err = mutex_lock_interruptible(&u->bindlock); 1291 if (err) 1292 goto out; 1293 1294 if (u->addr) { 1295 err = -EINVAL; 1296 goto out_mutex; 1297 } 1298 1299 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1300 unix_table_double_lock(net, old_hash, new_hash); 1301 1302 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) 1303 goto out_spin; 1304 1305 __unix_set_addr_hash(net, sk, addr, new_hash); 1306 unix_table_double_unlock(net, old_hash, new_hash); 1307 mutex_unlock(&u->bindlock); 1308 return 0; 1309 1310 out_spin: 1311 unix_table_double_unlock(net, old_hash, new_hash); 1312 err = -EADDRINUSE; 1313 out_mutex: 1314 mutex_unlock(&u->bindlock); 1315 out: 1316 unix_release_addr(addr); 1317 return err; 1318 } 1319 1320 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 1321 { 1322 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1323 struct sock *sk = sock->sk; 1324 int err; 1325 1326 if (addr_len == offsetof(struct sockaddr_un, sun_path) && 1327 sunaddr->sun_family == AF_UNIX) 1328 return unix_autobind(sk); 1329 1330 err = unix_validate_addr(sunaddr, addr_len); 1331 if (err) 1332 return err; 1333 1334 if (sunaddr->sun_path[0]) 1335 err = unix_bind_bsd(sk, sunaddr, addr_len); 1336 else 1337 err = unix_bind_abstract(sk, sunaddr, addr_len); 1338 1339 return err; 1340 } 1341 1342 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) 1343 { 1344 if (unlikely(sk1 == sk2) || !sk2) { 1345 unix_state_lock(sk1); 1346 return; 1347 } 1348 if (sk1 < sk2) { 1349 unix_state_lock(sk1); 1350 unix_state_lock_nested(sk2); 1351 } else { 1352 unix_state_lock(sk2); 1353 unix_state_lock_nested(sk1); 1354 } 1355 } 1356 1357 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) 1358 { 1359 if (unlikely(sk1 == sk2) || !sk2) { 1360 unix_state_unlock(sk1); 1361 return; 1362 } 1363 unix_state_unlock(sk1); 1364 unix_state_unlock(sk2); 1365 } 1366 1367 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, 1368 int alen, int flags) 1369 { 1370 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; 1371 struct sock *sk = sock->sk; 1372 struct sock *other; 1373 int err; 1374 1375 err = -EINVAL; 1376 if (alen < offsetofend(struct sockaddr, sa_family)) 1377 goto out; 1378 1379 if (addr->sa_family != AF_UNSPEC) { 1380 err = unix_validate_addr(sunaddr, alen); 1381 if (err) 1382 goto out; 1383 1384 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1385 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1386 !unix_sk(sk)->addr) { 1387 err = unix_autobind(sk); 1388 if (err) 1389 goto out; 1390 } 1391 1392 restart: 1393 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); 1394 if (IS_ERR(other)) { 1395 err = PTR_ERR(other); 1396 goto out; 1397 } 1398 1399 unix_state_double_lock(sk, other); 1400 1401 /* Apparently VFS overslept socket death. Retry. 
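 * That is, unix_find_other() may have returned a socket whose owner
 * closed it between the name lookup and taking the state locks; drop
 * the reference and redo the lookup.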
*/ 1402 if (sock_flag(other, SOCK_DEAD)) { 1403 unix_state_double_unlock(sk, other); 1404 sock_put(other); 1405 goto restart; 1406 } 1407 1408 err = -EPERM; 1409 if (!unix_may_send(sk, other)) 1410 goto out_unlock; 1411 1412 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 1413 if (err) 1414 goto out_unlock; 1415 1416 sk->sk_state = other->sk_state = TCP_ESTABLISHED; 1417 } else { 1418 /* 1419 * 1003.1g breaking connected state with AF_UNSPEC 1420 */ 1421 other = NULL; 1422 unix_state_double_lock(sk, other); 1423 } 1424 1425 /* 1426 * If it was connected, reconnect. 1427 */ 1428 if (unix_peer(sk)) { 1429 struct sock *old_peer = unix_peer(sk); 1430 1431 unix_peer(sk) = other; 1432 if (!other) 1433 sk->sk_state = TCP_CLOSE; 1434 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); 1435 1436 unix_state_double_unlock(sk, other); 1437 1438 if (other != old_peer) 1439 unix_dgram_disconnected(sk, old_peer); 1440 sock_put(old_peer); 1441 } else { 1442 unix_peer(sk) = other; 1443 unix_state_double_unlock(sk, other); 1444 } 1445 1446 return 0; 1447 1448 out_unlock: 1449 unix_state_double_unlock(sk, other); 1450 sock_put(other); 1451 out: 1452 return err; 1453 } 1454 1455 static long unix_wait_for_peer(struct sock *other, long timeo) 1456 __releases(&unix_sk(other)->lock) 1457 { 1458 struct unix_sock *u = unix_sk(other); 1459 int sched; 1460 DEFINE_WAIT(wait); 1461 1462 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); 1463 1464 sched = !sock_flag(other, SOCK_DEAD) && 1465 !(other->sk_shutdown & RCV_SHUTDOWN) && 1466 unix_recvq_full_lockless(other); 1467 1468 unix_state_unlock(other); 1469 1470 if (sched) 1471 timeo = schedule_timeout(timeo); 1472 1473 finish_wait(&u->peer_wait, &wait); 1474 return timeo; 1475 } 1476 1477 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, 1478 int addr_len, int flags) 1479 { 1480 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1481 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; 1482 struct unix_sock *u = unix_sk(sk), *newu, *otheru; 1483 struct net *net = sock_net(sk); 1484 struct sk_buff *skb = NULL; 1485 long timeo; 1486 int err; 1487 int st; 1488 1489 err = unix_validate_addr(sunaddr, addr_len); 1490 if (err) 1491 goto out; 1492 1493 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1494 test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { 1495 err = unix_autobind(sk); 1496 if (err) 1497 goto out; 1498 } 1499 1500 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); 1501 1502 /* First of all allocate resources. 1503 If we will make it after state is locked, 1504 we will have to recheck all again in any case. 1505 */ 1506 1507 /* create new sock for complete connection */ 1508 newsk = unix_create1(net, NULL, 0, sock->type); 1509 if (IS_ERR(newsk)) { 1510 err = PTR_ERR(newsk); 1511 newsk = NULL; 1512 goto out; 1513 } 1514 1515 err = -ENOMEM; 1516 1517 /* Allocate skb for sending to listening sock */ 1518 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); 1519 if (skb == NULL) 1520 goto out; 1521 1522 restart: 1523 /* Find listening sock. */ 1524 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); 1525 if (IS_ERR(other)) { 1526 err = PTR_ERR(other); 1527 other = NULL; 1528 goto out; 1529 } 1530 1531 /* Latch state of peer */ 1532 unix_state_lock(other); 1533 1534 /* Apparently VFS overslept socket death. Retry. 
*/ 1535 if (sock_flag(other, SOCK_DEAD)) { 1536 unix_state_unlock(other); 1537 sock_put(other); 1538 goto restart; 1539 } 1540 1541 err = -ECONNREFUSED; 1542 if (other->sk_state != TCP_LISTEN) 1543 goto out_unlock; 1544 if (other->sk_shutdown & RCV_SHUTDOWN) 1545 goto out_unlock; 1546 1547 if (unix_recvq_full(other)) { 1548 err = -EAGAIN; 1549 if (!timeo) 1550 goto out_unlock; 1551 1552 timeo = unix_wait_for_peer(other, timeo); 1553 1554 err = sock_intr_errno(timeo); 1555 if (signal_pending(current)) 1556 goto out; 1557 sock_put(other); 1558 goto restart; 1559 } 1560 1561 /* Latch our state. 1562 1563 It is tricky place. We need to grab our state lock and cannot 1564 drop lock on peer. It is dangerous because deadlock is 1565 possible. Connect to self case and simultaneous 1566 attempt to connect are eliminated by checking socket 1567 state. other is TCP_LISTEN, if sk is TCP_LISTEN we 1568 check this before attempt to grab lock. 1569 1570 Well, and we have to recheck the state after socket locked. 1571 */ 1572 st = sk->sk_state; 1573 1574 switch (st) { 1575 case TCP_CLOSE: 1576 /* This is ok... continue with connect */ 1577 break; 1578 case TCP_ESTABLISHED: 1579 /* Socket is already connected */ 1580 err = -EISCONN; 1581 goto out_unlock; 1582 default: 1583 err = -EINVAL; 1584 goto out_unlock; 1585 } 1586 1587 unix_state_lock_nested(sk); 1588 1589 if (sk->sk_state != st) { 1590 unix_state_unlock(sk); 1591 unix_state_unlock(other); 1592 sock_put(other); 1593 goto restart; 1594 } 1595 1596 err = security_unix_stream_connect(sk, other, newsk); 1597 if (err) { 1598 unix_state_unlock(sk); 1599 goto out_unlock; 1600 } 1601 1602 /* The way is open! Fastly set all the necessary fields... */ 1603 1604 sock_hold(sk); 1605 unix_peer(newsk) = sk; 1606 newsk->sk_state = TCP_ESTABLISHED; 1607 newsk->sk_type = sk->sk_type; 1608 init_peercred(newsk); 1609 newu = unix_sk(newsk); 1610 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); 1611 otheru = unix_sk(other); 1612 1613 /* copy address information from listening to new sock 1614 * 1615 * The contents of *(otheru->addr) and otheru->path 1616 * are seen fully set up here, since we have found 1617 * otheru in hash under its lock. Insertion into the 1618 * hash chain we'd found it in had been done in an 1619 * earlier critical area protected by the chain's lock, 1620 * the same one where we'd set *(otheru->addr) contents, 1621 * as well as otheru->path and otheru->addr itself. 1622 * 1623 * Using smp_store_release() here to set newu->addr 1624 * is enough to make those stores, as well as stores 1625 * to newu->path visible to anyone who gets newu->addr 1626 * by smp_load_acquire(). IOW, the same warranties 1627 * as for unix_sock instances bound in unix_bind() or 1628 * in unix_autobind(). 
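 *
 * The matching acquire side is unix_getname() (and the other readers
 * of unix_sk(sk)->addr), which uses smp_load_acquire(); a reader that
 * observes the new addr pointer stored below is therefore also
 * guaranteed to see the fully initialised address bytes and newu->path.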
1629 */ 1630 if (otheru->path.dentry) { 1631 path_get(&otheru->path); 1632 newu->path = otheru->path; 1633 } 1634 refcount_inc(&otheru->addr->refcnt); 1635 smp_store_release(&newu->addr, otheru->addr); 1636 1637 /* Set credentials */ 1638 copy_peercred(sk, other); 1639 1640 sock->state = SS_CONNECTED; 1641 sk->sk_state = TCP_ESTABLISHED; 1642 sock_hold(newsk); 1643 1644 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ 1645 unix_peer(sk) = newsk; 1646 1647 unix_state_unlock(sk); 1648 1649 /* take ten and send info to listening sock */ 1650 spin_lock(&other->sk_receive_queue.lock); 1651 __skb_queue_tail(&other->sk_receive_queue, skb); 1652 spin_unlock(&other->sk_receive_queue.lock); 1653 unix_state_unlock(other); 1654 other->sk_data_ready(other); 1655 sock_put(other); 1656 return 0; 1657 1658 out_unlock: 1659 if (other) 1660 unix_state_unlock(other); 1661 1662 out: 1663 kfree_skb(skb); 1664 if (newsk) 1665 unix_release_sock(newsk, 0); 1666 if (other) 1667 sock_put(other); 1668 return err; 1669 } 1670 1671 static int unix_socketpair(struct socket *socka, struct socket *sockb) 1672 { 1673 struct sock *ska = socka->sk, *skb = sockb->sk; 1674 1675 /* Join our sockets back to back */ 1676 sock_hold(ska); 1677 sock_hold(skb); 1678 unix_peer(ska) = skb; 1679 unix_peer(skb) = ska; 1680 init_peercred(ska); 1681 init_peercred(skb); 1682 1683 ska->sk_state = TCP_ESTABLISHED; 1684 skb->sk_state = TCP_ESTABLISHED; 1685 socka->state = SS_CONNECTED; 1686 sockb->state = SS_CONNECTED; 1687 return 0; 1688 } 1689 1690 static void unix_sock_inherit_flags(const struct socket *old, 1691 struct socket *new) 1692 { 1693 if (test_bit(SOCK_PASSCRED, &old->flags)) 1694 set_bit(SOCK_PASSCRED, &new->flags); 1695 if (test_bit(SOCK_PASSPIDFD, &old->flags)) 1696 set_bit(SOCK_PASSPIDFD, &new->flags); 1697 if (test_bit(SOCK_PASSSEC, &old->flags)) 1698 set_bit(SOCK_PASSSEC, &new->flags); 1699 } 1700 1701 static int unix_accept(struct socket *sock, struct socket *newsock, int flags, 1702 bool kern) 1703 { 1704 struct sock *sk = sock->sk; 1705 struct sock *tsk; 1706 struct sk_buff *skb; 1707 int err; 1708 1709 err = -EOPNOTSUPP; 1710 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 1711 goto out; 1712 1713 err = -EINVAL; 1714 if (sk->sk_state != TCP_LISTEN) 1715 goto out; 1716 1717 /* If socket state is TCP_LISTEN it cannot change (for now...), 1718 * so that no locks are necessary. 1719 */ 1720 1721 skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, 1722 &err); 1723 if (!skb) { 1724 /* This means receive shutdown. 
*/ 1725 if (err == 0) 1726 err = -EINVAL; 1727 goto out; 1728 } 1729 1730 tsk = skb->sk; 1731 skb_free_datagram(sk, skb); 1732 wake_up_interruptible(&unix_sk(sk)->peer_wait); 1733 1734 /* attach accepted sock to socket */ 1735 unix_state_lock(tsk); 1736 newsock->state = SS_CONNECTED; 1737 unix_sock_inherit_flags(sock, newsock); 1738 sock_graft(tsk, newsock); 1739 unix_state_unlock(tsk); 1740 return 0; 1741 1742 out: 1743 return err; 1744 } 1745 1746 1747 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) 1748 { 1749 struct sock *sk = sock->sk; 1750 struct unix_address *addr; 1751 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); 1752 int err = 0; 1753 1754 if (peer) { 1755 sk = unix_peer_get(sk); 1756 1757 err = -ENOTCONN; 1758 if (!sk) 1759 goto out; 1760 err = 0; 1761 } else { 1762 sock_hold(sk); 1763 } 1764 1765 addr = smp_load_acquire(&unix_sk(sk)->addr); 1766 if (!addr) { 1767 sunaddr->sun_family = AF_UNIX; 1768 sunaddr->sun_path[0] = 0; 1769 err = offsetof(struct sockaddr_un, sun_path); 1770 } else { 1771 err = addr->len; 1772 memcpy(sunaddr, addr->name, addr->len); 1773 } 1774 sock_put(sk); 1775 out: 1776 return err; 1777 } 1778 1779 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) 1780 { 1781 scm->fp = scm_fp_dup(UNIXCB(skb).fp); 1782 1783 /* 1784 * Garbage collection of unix sockets starts by selecting a set of 1785 * candidate sockets which have reference only from being in flight 1786 * (total_refs == inflight_refs). This condition is checked once during 1787 * the candidate collection phase, and candidates are marked as such, so 1788 * that non-candidates can later be ignored. While inflight_refs is 1789 * protected by unix_gc_lock, total_refs (file count) is not, hence this 1790 * is an instantaneous decision. 1791 * 1792 * Once a candidate, however, the socket must not be reinstalled into a 1793 * file descriptor while the garbage collection is in progress. 1794 * 1795 * If the above conditions are met, then the directed graph of 1796 * candidates (*) does not change while unix_gc_lock is held. 1797 * 1798 * Any operations that changes the file count through file descriptors 1799 * (dup, close, sendmsg) does not change the graph since candidates are 1800 * not installed in fds. 1801 * 1802 * Dequeing a candidate via recvmsg would install it into an fd, but 1803 * that takes unix_gc_lock to decrement the inflight count, so it's 1804 * serialized with garbage collection. 1805 * 1806 * MSG_PEEK is special in that it does not change the inflight count, 1807 * yet does install the socket into an fd. The following lock/unlock 1808 * pair is to ensure serialization with garbage collection. It must be 1809 * done between incrementing the file count and installing the file into 1810 * an fd. 1811 * 1812 * If garbage collection starts after the barrier provided by the 1813 * lock/unlock, then it will see the elevated refcount and not mark this 1814 * as a candidate. If a garbage collection is already in progress 1815 * before the file count was incremented, then the lock/unlock pair will 1816 * ensure that garbage collection is finished before progressing to 1817 * installing the fd. 1818 * 1819 * (*) A -> B where B is on the queue of A or B is on the queue of C 1820 * which is on the queue of listening socket A. 
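 *
 * Concretely, for a recvmsg(..., MSG_PEEK) carrying SCM_RIGHTS:
 * scm_fp_dup() above takes the extra file references, the lock/unlock
 * pair below synchronizes with unix_gc(), and only afterwards does the
 * generic scm code (scm_detach_fds()) install the descriptors into the
 * receiving process's fd table.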
1821 */ 1822 spin_lock(&unix_gc_lock); 1823 spin_unlock(&unix_gc_lock); 1824 } 1825 1826 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) 1827 { 1828 int err = 0; 1829 1830 UNIXCB(skb).pid = get_pid(scm->pid); 1831 UNIXCB(skb).uid = scm->creds.uid; 1832 UNIXCB(skb).gid = scm->creds.gid; 1833 UNIXCB(skb).fp = NULL; 1834 unix_get_secdata(scm, skb); 1835 if (scm->fp && send_fds) 1836 err = unix_attach_fds(scm, skb); 1837 1838 skb->destructor = unix_destruct_scm; 1839 return err; 1840 } 1841 1842 static bool unix_passcred_enabled(const struct socket *sock, 1843 const struct sock *other) 1844 { 1845 return test_bit(SOCK_PASSCRED, &sock->flags) || 1846 test_bit(SOCK_PASSPIDFD, &sock->flags) || 1847 !other->sk_socket || 1848 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || 1849 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); 1850 } 1851 1852 /* 1853 * Some apps rely on write() giving SCM_CREDENTIALS 1854 * We include credentials if source or destination socket 1855 * asserted SOCK_PASSCRED. 1856 */ 1857 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, 1858 const struct sock *other) 1859 { 1860 if (UNIXCB(skb).pid) 1861 return; 1862 if (unix_passcred_enabled(sock, other)) { 1863 UNIXCB(skb).pid = get_pid(task_tgid(current)); 1864 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); 1865 } 1866 } 1867 1868 static bool unix_skb_scm_eq(struct sk_buff *skb, 1869 struct scm_cookie *scm) 1870 { 1871 return UNIXCB(skb).pid == scm->pid && 1872 uid_eq(UNIXCB(skb).uid, scm->creds.uid) && 1873 gid_eq(UNIXCB(skb).gid, scm->creds.gid) && 1874 unix_secdata_eq(scm, skb); 1875 } 1876 1877 static void scm_stat_add(struct sock *sk, struct sk_buff *skb) 1878 { 1879 struct scm_fp_list *fp = UNIXCB(skb).fp; 1880 struct unix_sock *u = unix_sk(sk); 1881 1882 if (unlikely(fp && fp->count)) 1883 atomic_add(fp->count, &u->scm_stat.nr_fds); 1884 } 1885 1886 static void scm_stat_del(struct sock *sk, struct sk_buff *skb) 1887 { 1888 struct scm_fp_list *fp = UNIXCB(skb).fp; 1889 struct unix_sock *u = unix_sk(sk); 1890 1891 if (unlikely(fp && fp->count)) 1892 atomic_sub(fp->count, &u->scm_stat.nr_fds); 1893 } 1894 1895 /* 1896 * Send AF_UNIX data. 
1897 */ 1898 1899 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, 1900 size_t len) 1901 { 1902 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); 1903 struct sock *sk = sock->sk, *other = NULL; 1904 struct unix_sock *u = unix_sk(sk); 1905 struct scm_cookie scm; 1906 struct sk_buff *skb; 1907 int data_len = 0; 1908 int sk_locked; 1909 long timeo; 1910 int err; 1911 1912 wait_for_unix_gc(); 1913 err = scm_send(sock, msg, &scm, false); 1914 if (err < 0) 1915 return err; 1916 1917 err = -EOPNOTSUPP; 1918 if (msg->msg_flags&MSG_OOB) 1919 goto out; 1920 1921 if (msg->msg_namelen) { 1922 err = unix_validate_addr(sunaddr, msg->msg_namelen); 1923 if (err) 1924 goto out; 1925 } else { 1926 sunaddr = NULL; 1927 err = -ENOTCONN; 1928 other = unix_peer_get(sk); 1929 if (!other) 1930 goto out; 1931 } 1932 1933 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1934 test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { 1935 err = unix_autobind(sk); 1936 if (err) 1937 goto out; 1938 } 1939 1940 err = -EMSGSIZE; 1941 if (len > sk->sk_sndbuf - 32) 1942 goto out; 1943 1944 if (len > SKB_MAX_ALLOC) { 1945 data_len = min_t(size_t, 1946 len - SKB_MAX_ALLOC, 1947 MAX_SKB_FRAGS * PAGE_SIZE); 1948 data_len = PAGE_ALIGN(data_len); 1949 1950 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); 1951 } 1952 1953 skb = sock_alloc_send_pskb(sk, len - data_len, data_len, 1954 msg->msg_flags & MSG_DONTWAIT, &err, 1955 PAGE_ALLOC_COSTLY_ORDER); 1956 if (skb == NULL) 1957 goto out; 1958 1959 err = unix_scm_to_skb(&scm, skb, true); 1960 if (err < 0) 1961 goto out_free; 1962 1963 skb_put(skb, len - data_len); 1964 skb->data_len = data_len; 1965 skb->len = len; 1966 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); 1967 if (err) 1968 goto out_free; 1969 1970 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 1971 1972 restart: 1973 if (!other) { 1974 err = -ECONNRESET; 1975 if (sunaddr == NULL) 1976 goto out_free; 1977 1978 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen, 1979 sk->sk_type); 1980 if (IS_ERR(other)) { 1981 err = PTR_ERR(other); 1982 other = NULL; 1983 goto out_free; 1984 } 1985 } 1986 1987 if (sk_filter(other, skb) < 0) { 1988 /* Toss the packet but do not return any error to the sender */ 1989 err = len; 1990 goto out_free; 1991 } 1992 1993 sk_locked = 0; 1994 unix_state_lock(other); 1995 restart_locked: 1996 err = -EPERM; 1997 if (!unix_may_send(sk, other)) 1998 goto out_unlock; 1999 2000 if (unlikely(sock_flag(other, SOCK_DEAD))) { 2001 /* 2002 * Check with 1003.1g - what should 2003 * datagram error 2004 */ 2005 unix_state_unlock(other); 2006 sock_put(other); 2007 2008 if (!sk_locked) 2009 unix_state_lock(sk); 2010 2011 err = 0; 2012 if (sk->sk_type == SOCK_SEQPACKET) { 2013 /* We are here only when racing with unix_release_sock() 2014 * is clearing @other. Never change state to TCP_CLOSE 2015 * unlike SOCK_DGRAM wants. 
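 *
 * So the outcome differs by type: a SOCK_SEQPACKET sender just gets
 * -EPIPE and keeps its state, while a connected SOCK_DGRAM sender is
 * disconnected (peer cleared, state set to TCP_CLOSE) and gets
 * -ECONNREFUSED, as handled in the branches below.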
2016 */ 2017 unix_state_unlock(sk); 2018 err = -EPIPE; 2019 } else if (unix_peer(sk) == other) { 2020 unix_peer(sk) = NULL; 2021 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2022 2023 sk->sk_state = TCP_CLOSE; 2024 unix_state_unlock(sk); 2025 2026 unix_dgram_disconnected(sk, other); 2027 sock_put(other); 2028 err = -ECONNREFUSED; 2029 } else { 2030 unix_state_unlock(sk); 2031 } 2032 2033 other = NULL; 2034 if (err) 2035 goto out_free; 2036 goto restart; 2037 } 2038 2039 err = -EPIPE; 2040 if (other->sk_shutdown & RCV_SHUTDOWN) 2041 goto out_unlock; 2042 2043 if (sk->sk_type != SOCK_SEQPACKET) { 2044 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2045 if (err) 2046 goto out_unlock; 2047 } 2048 2049 /* other == sk && unix_peer(other) != sk if 2050 * - unix_peer(sk) == NULL, destination address bound to sk 2051 * - unix_peer(sk) == sk by time of get but disconnected before lock 2052 */ 2053 if (other != sk && 2054 unlikely(unix_peer(other) != sk && 2055 unix_recvq_full_lockless(other))) { 2056 if (timeo) { 2057 timeo = unix_wait_for_peer(other, timeo); 2058 2059 err = sock_intr_errno(timeo); 2060 if (signal_pending(current)) 2061 goto out_free; 2062 2063 goto restart; 2064 } 2065 2066 if (!sk_locked) { 2067 unix_state_unlock(other); 2068 unix_state_double_lock(sk, other); 2069 } 2070 2071 if (unix_peer(sk) != other || 2072 unix_dgram_peer_wake_me(sk, other)) { 2073 err = -EAGAIN; 2074 sk_locked = 1; 2075 goto out_unlock; 2076 } 2077 2078 if (!sk_locked) { 2079 sk_locked = 1; 2080 goto restart_locked; 2081 } 2082 } 2083 2084 if (unlikely(sk_locked)) 2085 unix_state_unlock(sk); 2086 2087 if (sock_flag(other, SOCK_RCVTSTAMP)) 2088 __net_timestamp(skb); 2089 maybe_add_creds(skb, sock, other); 2090 scm_stat_add(other, skb); 2091 skb_queue_tail(&other->sk_receive_queue, skb); 2092 unix_state_unlock(other); 2093 other->sk_data_ready(other); 2094 sock_put(other); 2095 scm_destroy(&scm); 2096 return len; 2097 2098 out_unlock: 2099 if (sk_locked) 2100 unix_state_unlock(sk); 2101 unix_state_unlock(other); 2102 out_free: 2103 kfree_skb(skb); 2104 out: 2105 if (other) 2106 sock_put(other); 2107 scm_destroy(&scm); 2108 return err; 2109 } 2110 2111 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2112 * bytes, and a minimum of a full page. 
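 * (With 4 KiB pages, UNIX_SKB_FRAGS_SZ below evaluates to exactly
 * 32768 bytes; with larger page sizes get_order(32768) is 0 and the
 * budget rounds up to a single page.)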
2113 */ 2114 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2115 2116 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2117 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, 2118 struct scm_cookie *scm, bool fds_sent) 2119 { 2120 struct unix_sock *ousk = unix_sk(other); 2121 struct sk_buff *skb; 2122 int err = 0; 2123 2124 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2125 2126 if (!skb) 2127 return err; 2128 2129 err = unix_scm_to_skb(scm, skb, !fds_sent); 2130 if (err < 0) { 2131 kfree_skb(skb); 2132 return err; 2133 } 2134 skb_put(skb, 1); 2135 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2136 2137 if (err) { 2138 kfree_skb(skb); 2139 return err; 2140 } 2141 2142 unix_state_lock(other); 2143 2144 if (sock_flag(other, SOCK_DEAD) || 2145 (other->sk_shutdown & RCV_SHUTDOWN)) { 2146 unix_state_unlock(other); 2147 kfree_skb(skb); 2148 return -EPIPE; 2149 } 2150 2151 maybe_add_creds(skb, sock, other); 2152 skb_get(skb); 2153 2154 if (ousk->oob_skb) 2155 consume_skb(ousk->oob_skb); 2156 2157 WRITE_ONCE(ousk->oob_skb, skb); 2158 2159 scm_stat_add(other, skb); 2160 skb_queue_tail(&other->sk_receive_queue, skb); 2161 sk_send_sigurg(other); 2162 unix_state_unlock(other); 2163 other->sk_data_ready(other); 2164 2165 return err; 2166 } 2167 #endif 2168 2169 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2170 size_t len) 2171 { 2172 struct sock *sk = sock->sk; 2173 struct sock *other = NULL; 2174 int err, size; 2175 struct sk_buff *skb; 2176 int sent = 0; 2177 struct scm_cookie scm; 2178 bool fds_sent = false; 2179 int data_len; 2180 2181 wait_for_unix_gc(); 2182 err = scm_send(sock, msg, &scm, false); 2183 if (err < 0) 2184 return err; 2185 2186 err = -EOPNOTSUPP; 2187 if (msg->msg_flags & MSG_OOB) { 2188 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2189 if (len) 2190 len--; 2191 else 2192 #endif 2193 goto out_err; 2194 } 2195 2196 if (msg->msg_namelen) { 2197 err = sk->sk_state == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; 2198 goto out_err; 2199 } else { 2200 err = -ENOTCONN; 2201 other = unix_peer(sk); 2202 if (!other) 2203 goto out_err; 2204 } 2205 2206 if (sk->sk_shutdown & SEND_SHUTDOWN) 2207 goto pipe_err; 2208 2209 while (sent < len) { 2210 size = len - sent; 2211 2212 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2213 skb = sock_alloc_send_pskb(sk, 0, 0, 2214 msg->msg_flags & MSG_DONTWAIT, 2215 &err, 0); 2216 } else { 2217 /* Keep two messages in the pipe so it schedules better */ 2218 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); 2219 2220 /* allow fallback to order-0 allocations */ 2221 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2222 2223 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2224 2225 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2226 2227 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2228 msg->msg_flags & MSG_DONTWAIT, &err, 2229 get_order(UNIX_SKB_FRAGS_SZ)); 2230 } 2231 if (!skb) 2232 goto out_err; 2233 2234 /* Only send the fds in the first buffer */ 2235 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2236 if (err < 0) { 2237 kfree_skb(skb); 2238 goto out_err; 2239 } 2240 fds_sent = true; 2241 2242 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2243 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2244 sk->sk_allocation); 2245 if (err < 0) { 2246 kfree_skb(skb); 2247 goto out_err; 2248 } 2249 size = err; 2250 refcount_add(size, &sk->sk_wmem_alloc); 2251 } else { 2252 skb_put(skb, size - data_len); 2253 skb->data_len = data_len; 2254 skb->len = size; 2255 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2256 if (err) { 2257 kfree_skb(skb); 2258 goto out_err; 2259 } 2260 } 2261 2262 unix_state_lock(other); 2263 2264 if (sock_flag(other, SOCK_DEAD) || 2265 (other->sk_shutdown & RCV_SHUTDOWN)) 2266 goto pipe_err_free; 2267 2268 maybe_add_creds(skb, sock, other); 2269 scm_stat_add(other, skb); 2270 skb_queue_tail(&other->sk_receive_queue, skb); 2271 unix_state_unlock(other); 2272 other->sk_data_ready(other); 2273 sent += size; 2274 } 2275 2276 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2277 if (msg->msg_flags & MSG_OOB) { 2278 err = queue_oob(sock, msg, other, &scm, fds_sent); 2279 if (err) 2280 goto out_err; 2281 sent++; 2282 } 2283 #endif 2284 2285 scm_destroy(&scm); 2286 2287 return sent; 2288 2289 pipe_err_free: 2290 unix_state_unlock(other); 2291 kfree_skb(skb); 2292 pipe_err: 2293 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) 2294 send_sig(SIGPIPE, current, 0); 2295 err = -EPIPE; 2296 out_err: 2297 scm_destroy(&scm); 2298 return sent ? 
: err; 2299 } 2300 2301 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2302 size_t len) 2303 { 2304 int err; 2305 struct sock *sk = sock->sk; 2306 2307 err = sock_error(sk); 2308 if (err) 2309 return err; 2310 2311 if (sk->sk_state != TCP_ESTABLISHED) 2312 return -ENOTCONN; 2313 2314 if (msg->msg_namelen) 2315 msg->msg_namelen = 0; 2316 2317 return unix_dgram_sendmsg(sock, msg, len); 2318 } 2319 2320 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2321 size_t size, int flags) 2322 { 2323 struct sock *sk = sock->sk; 2324 2325 if (sk->sk_state != TCP_ESTABLISHED) 2326 return -ENOTCONN; 2327 2328 return unix_dgram_recvmsg(sock, msg, size, flags); 2329 } 2330 2331 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2332 { 2333 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2334 2335 if (addr) { 2336 msg->msg_namelen = addr->len; 2337 memcpy(msg->msg_name, addr->name, addr->len); 2338 } 2339 } 2340 2341 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2342 int flags) 2343 { 2344 struct scm_cookie scm; 2345 struct socket *sock = sk->sk_socket; 2346 struct unix_sock *u = unix_sk(sk); 2347 struct sk_buff *skb, *last; 2348 long timeo; 2349 int skip; 2350 int err; 2351 2352 err = -EOPNOTSUPP; 2353 if (flags&MSG_OOB) 2354 goto out; 2355 2356 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2357 2358 do { 2359 mutex_lock(&u->iolock); 2360 2361 skip = sk_peek_offset(sk, flags); 2362 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2363 &skip, &err, &last); 2364 if (skb) { 2365 if (!(flags & MSG_PEEK)) 2366 scm_stat_del(sk, skb); 2367 break; 2368 } 2369 2370 mutex_unlock(&u->iolock); 2371 2372 if (err != -EAGAIN) 2373 break; 2374 } while (timeo && 2375 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2376 &err, &timeo, last)); 2377 2378 if (!skb) { /* implies iolock unlocked */ 2379 unix_state_lock(sk); 2380 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2381 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2382 (sk->sk_shutdown & RCV_SHUTDOWN)) 2383 err = 0; 2384 unix_state_unlock(sk); 2385 goto out; 2386 } 2387 2388 if (wq_has_sleeper(&u->peer_wait)) 2389 wake_up_interruptible_sync_poll(&u->peer_wait, 2390 EPOLLOUT | EPOLLWRNORM | 2391 EPOLLWRBAND); 2392 2393 if (msg->msg_name) 2394 unix_copy_addr(msg, skb->sk); 2395 2396 if (size > skb->len - skip) 2397 size = skb->len - skip; 2398 else if (size < skb->len - skip) 2399 msg->msg_flags |= MSG_TRUNC; 2400 2401 err = skb_copy_datagram_msg(skb, skip, msg, size); 2402 if (err) 2403 goto out_free; 2404 2405 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2406 __sock_recv_timestamp(msg, sk, skb); 2407 2408 memset(&scm, 0, sizeof(scm)); 2409 2410 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2411 unix_set_secdata(&scm, skb); 2412 2413 if (!(flags & MSG_PEEK)) { 2414 if (UNIXCB(skb).fp) 2415 unix_detach_fds(&scm, skb); 2416 2417 sk_peek_offset_bwd(sk, skb->len); 2418 } else { 2419 /* It is questionable: on PEEK we could: 2420 - do not return fds - good, but too simple 8) 2421 - return fds, and do not return them on read (old strategy, 2422 apparently wrong) 2423 - clone fds (I chose it for now, it is the most universal 2424 solution) 2425 2426 POSIX 1003.1g does not actually define this clearly 2427 at all. POSIX 1003.1g doesn't define a lot of things 2428 clearly however! 
2429 2430 */ 2431 2432 sk_peek_offset_fwd(sk, size); 2433 2434 if (UNIXCB(skb).fp) 2435 unix_peek_fds(&scm, skb); 2436 } 2437 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2438 2439 scm_recv_unix(sock, msg, &scm, flags); 2440 2441 out_free: 2442 skb_free_datagram(sk, skb); 2443 mutex_unlock(&u->iolock); 2444 out: 2445 return err; 2446 } 2447 2448 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2449 int flags) 2450 { 2451 struct sock *sk = sock->sk; 2452 2453 #ifdef CONFIG_BPF_SYSCALL 2454 const struct proto *prot = READ_ONCE(sk->sk_prot); 2455 2456 if (prot != &unix_dgram_proto) 2457 return prot->recvmsg(sk, msg, size, flags, NULL); 2458 #endif 2459 return __unix_dgram_recvmsg(sk, msg, size, flags); 2460 } 2461 2462 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2463 { 2464 struct unix_sock *u = unix_sk(sk); 2465 struct sk_buff *skb; 2466 int err; 2467 2468 mutex_lock(&u->iolock); 2469 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2470 mutex_unlock(&u->iolock); 2471 if (!skb) 2472 return err; 2473 2474 return recv_actor(sk, skb); 2475 } 2476 2477 /* 2478 * Sleep until more data has arrived. But check for races.. 2479 */ 2480 static long unix_stream_data_wait(struct sock *sk, long timeo, 2481 struct sk_buff *last, unsigned int last_len, 2482 bool freezable) 2483 { 2484 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2485 struct sk_buff *tail; 2486 DEFINE_WAIT(wait); 2487 2488 unix_state_lock(sk); 2489 2490 for (;;) { 2491 prepare_to_wait(sk_sleep(sk), &wait, state); 2492 2493 tail = skb_peek_tail(&sk->sk_receive_queue); 2494 if (tail != last || 2495 (tail && tail->len != last_len) || 2496 sk->sk_err || 2497 (sk->sk_shutdown & RCV_SHUTDOWN) || 2498 signal_pending(current) || 2499 !timeo) 2500 break; 2501 2502 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2503 unix_state_unlock(sk); 2504 timeo = schedule_timeout(timeo); 2505 unix_state_lock(sk); 2506 2507 if (sock_flag(sk, SOCK_DEAD)) 2508 break; 2509 2510 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2511 } 2512 2513 finish_wait(sk_sleep(sk), &wait); 2514 unix_state_unlock(sk); 2515 return timeo; 2516 } 2517 2518 static unsigned int unix_skb_len(const struct sk_buff *skb) 2519 { 2520 return skb->len - UNIXCB(skb).consumed; 2521 } 2522 2523 struct unix_stream_read_state { 2524 int (*recv_actor)(struct sk_buff *, int, int, 2525 struct unix_stream_read_state *); 2526 struct socket *socket; 2527 struct msghdr *msg; 2528 struct pipe_inode_info *pipe; 2529 size_t size; 2530 int flags; 2531 unsigned int splice_flags; 2532 }; 2533 2534 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2535 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2536 { 2537 struct socket *sock = state->socket; 2538 struct sock *sk = sock->sk; 2539 struct unix_sock *u = unix_sk(sk); 2540 int chunk = 1; 2541 struct sk_buff *oob_skb; 2542 2543 mutex_lock(&u->iolock); 2544 unix_state_lock(sk); 2545 2546 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2547 unix_state_unlock(sk); 2548 mutex_unlock(&u->iolock); 2549 return -EINVAL; 2550 } 2551 2552 oob_skb = u->oob_skb; 2553 2554 if (!(state->flags & MSG_PEEK)) 2555 WRITE_ONCE(u->oob_skb, NULL); 2556 else 2557 skb_get(oob_skb); 2558 unix_state_unlock(sk); 2559 2560 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2561 2562 if (!(state->flags & MSG_PEEK)) 2563 UNIXCB(oob_skb).consumed += 1; 2564 2565 consume_skb(oob_skb); 2566 2567 mutex_unlock(&u->iolock); 2568 2569 if (chunk < 0) 2570 return -EFAULT; 2571 2572 state->msg->msg_flags |= 
MSG_OOB; 2573 return 1; 2574 } 2575 2576 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2577 int flags, int copied) 2578 { 2579 struct unix_sock *u = unix_sk(sk); 2580 2581 if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) { 2582 skb_unlink(skb, &sk->sk_receive_queue); 2583 consume_skb(skb); 2584 skb = NULL; 2585 } else { 2586 if (skb == u->oob_skb) { 2587 if (copied) { 2588 skb = NULL; 2589 } else if (sock_flag(sk, SOCK_URGINLINE)) { 2590 if (!(flags & MSG_PEEK)) { 2591 WRITE_ONCE(u->oob_skb, NULL); 2592 consume_skb(skb); 2593 } 2594 } else if (!(flags & MSG_PEEK)) { 2595 skb_unlink(skb, &sk->sk_receive_queue); 2596 consume_skb(skb); 2597 skb = skb_peek(&sk->sk_receive_queue); 2598 } 2599 } 2600 } 2601 return skb; 2602 } 2603 #endif 2604 2605 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2606 { 2607 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) 2608 return -ENOTCONN; 2609 2610 return unix_read_skb(sk, recv_actor); 2611 } 2612 2613 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2614 bool freezable) 2615 { 2616 struct scm_cookie scm; 2617 struct socket *sock = state->socket; 2618 struct sock *sk = sock->sk; 2619 struct unix_sock *u = unix_sk(sk); 2620 int copied = 0; 2621 int flags = state->flags; 2622 int noblock = flags & MSG_DONTWAIT; 2623 bool check_creds = false; 2624 int target; 2625 int err = 0; 2626 long timeo; 2627 int skip; 2628 size_t size = state->size; 2629 unsigned int last_len; 2630 2631 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { 2632 err = -EINVAL; 2633 goto out; 2634 } 2635 2636 if (unlikely(flags & MSG_OOB)) { 2637 err = -EOPNOTSUPP; 2638 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2639 err = unix_stream_recv_urg(state); 2640 #endif 2641 goto out; 2642 } 2643 2644 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); 2645 timeo = sock_rcvtimeo(sk, noblock); 2646 2647 memset(&scm, 0, sizeof(scm)); 2648 2649 /* Lock the socket to prevent queue disordering 2650 * while sleeps in memcpy_tomsg 2651 */ 2652 mutex_lock(&u->iolock); 2653 2654 skip = max(sk_peek_offset(sk, flags), 0); 2655 2656 do { 2657 int chunk; 2658 bool drop_skb; 2659 struct sk_buff *skb, *last; 2660 2661 redo: 2662 unix_state_lock(sk); 2663 if (sock_flag(sk, SOCK_DEAD)) { 2664 err = -ECONNRESET; 2665 goto unlock; 2666 } 2667 last = skb = skb_peek(&sk->sk_receive_queue); 2668 last_len = last ? last->len : 0; 2669 2670 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2671 if (skb) { 2672 skb = manage_oob(skb, sk, flags, copied); 2673 if (!skb) { 2674 unix_state_unlock(sk); 2675 if (copied) 2676 break; 2677 goto redo; 2678 } 2679 } 2680 #endif 2681 again: 2682 if (skb == NULL) { 2683 if (copied >= target) 2684 goto unlock; 2685 2686 /* 2687 * POSIX 1003.1g mandates this order. 
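 * (A pending socket error is reported before RCV_SHUTDOWN is
 * treated as end of stream, so the reader sees the error rather
 * than a silent EOF.)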
2688 */ 2689 2690 err = sock_error(sk); 2691 if (err) 2692 goto unlock; 2693 if (sk->sk_shutdown & RCV_SHUTDOWN) 2694 goto unlock; 2695 2696 unix_state_unlock(sk); 2697 if (!timeo) { 2698 err = -EAGAIN; 2699 break; 2700 } 2701 2702 mutex_unlock(&u->iolock); 2703 2704 timeo = unix_stream_data_wait(sk, timeo, last, 2705 last_len, freezable); 2706 2707 if (signal_pending(current)) { 2708 err = sock_intr_errno(timeo); 2709 scm_destroy(&scm); 2710 goto out; 2711 } 2712 2713 mutex_lock(&u->iolock); 2714 goto redo; 2715 unlock: 2716 unix_state_unlock(sk); 2717 break; 2718 } 2719 2720 while (skip >= unix_skb_len(skb)) { 2721 skip -= unix_skb_len(skb); 2722 last = skb; 2723 last_len = skb->len; 2724 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2725 if (!skb) 2726 goto again; 2727 } 2728 2729 unix_state_unlock(sk); 2730 2731 if (check_creds) { 2732 /* Never glue messages from different writers */ 2733 if (!unix_skb_scm_eq(skb, &scm)) 2734 break; 2735 } else if (test_bit(SOCK_PASSCRED, &sock->flags) || 2736 test_bit(SOCK_PASSPIDFD, &sock->flags)) { 2737 /* Copy credentials */ 2738 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2739 unix_set_secdata(&scm, skb); 2740 check_creds = true; 2741 } 2742 2743 /* Copy address just once */ 2744 if (state->msg && state->msg->msg_name) { 2745 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, 2746 state->msg->msg_name); 2747 unix_copy_addr(state->msg, skb->sk); 2748 sunaddr = NULL; 2749 } 2750 2751 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 2752 skb_get(skb); 2753 chunk = state->recv_actor(skb, skip, chunk, state); 2754 drop_skb = !unix_skb_len(skb); 2755 /* skb is only safe to use if !drop_skb */ 2756 consume_skb(skb); 2757 if (chunk < 0) { 2758 if (copied == 0) 2759 copied = -EFAULT; 2760 break; 2761 } 2762 copied += chunk; 2763 size -= chunk; 2764 2765 if (drop_skb) { 2766 /* the skb was touched by a concurrent reader; 2767 * we should not expect anything from this skb 2768 * anymore and assume it invalid - we can be 2769 * sure it was dropped from the socket queue 2770 * 2771 * let's report a short read 2772 */ 2773 err = 0; 2774 break; 2775 } 2776 2777 /* Mark read part of skb as used */ 2778 if (!(flags & MSG_PEEK)) { 2779 UNIXCB(skb).consumed += chunk; 2780 2781 sk_peek_offset_bwd(sk, chunk); 2782 2783 if (UNIXCB(skb).fp) { 2784 scm_stat_del(sk, skb); 2785 unix_detach_fds(&scm, skb); 2786 } 2787 2788 if (unix_skb_len(skb)) 2789 break; 2790 2791 skb_unlink(skb, &sk->sk_receive_queue); 2792 consume_skb(skb); 2793 2794 if (scm.fp) 2795 break; 2796 } else { 2797 /* It is questionable, see note in unix_dgram_recvmsg. 2798 */ 2799 if (UNIXCB(skb).fp) 2800 unix_peek_fds(&scm, skb); 2801 2802 sk_peek_offset_fwd(sk, chunk); 2803 2804 if (UNIXCB(skb).fp) 2805 break; 2806 2807 skip = 0; 2808 last = skb; 2809 last_len = skb->len; 2810 unix_state_lock(sk); 2811 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2812 if (skb) 2813 goto again; 2814 unix_state_unlock(sk); 2815 break; 2816 } 2817 } while (size); 2818 2819 mutex_unlock(&u->iolock); 2820 if (state->msg) 2821 scm_recv_unix(sock, state->msg, &scm, flags); 2822 else 2823 scm_destroy(&scm); 2824 out: 2825 return copied ? 
: err; 2826 } 2827 2828 static int unix_stream_read_actor(struct sk_buff *skb, 2829 int skip, int chunk, 2830 struct unix_stream_read_state *state) 2831 { 2832 int ret; 2833 2834 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2835 state->msg, chunk); 2836 return ret ?: chunk; 2837 } 2838 2839 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2840 size_t size, int flags) 2841 { 2842 struct unix_stream_read_state state = { 2843 .recv_actor = unix_stream_read_actor, 2844 .socket = sk->sk_socket, 2845 .msg = msg, 2846 .size = size, 2847 .flags = flags 2848 }; 2849 2850 return unix_stream_read_generic(&state, true); 2851 } 2852 2853 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2854 size_t size, int flags) 2855 { 2856 struct unix_stream_read_state state = { 2857 .recv_actor = unix_stream_read_actor, 2858 .socket = sock, 2859 .msg = msg, 2860 .size = size, 2861 .flags = flags 2862 }; 2863 2864 #ifdef CONFIG_BPF_SYSCALL 2865 struct sock *sk = sock->sk; 2866 const struct proto *prot = READ_ONCE(sk->sk_prot); 2867 2868 if (prot != &unix_stream_proto) 2869 return prot->recvmsg(sk, msg, size, flags, NULL); 2870 #endif 2871 return unix_stream_read_generic(&state, true); 2872 } 2873 2874 static int unix_stream_splice_actor(struct sk_buff *skb, 2875 int skip, int chunk, 2876 struct unix_stream_read_state *state) 2877 { 2878 return skb_splice_bits(skb, state->socket->sk, 2879 UNIXCB(skb).consumed + skip, 2880 state->pipe, chunk, state->splice_flags); 2881 } 2882 2883 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 2884 struct pipe_inode_info *pipe, 2885 size_t size, unsigned int flags) 2886 { 2887 struct unix_stream_read_state state = { 2888 .recv_actor = unix_stream_splice_actor, 2889 .socket = sock, 2890 .pipe = pipe, 2891 .size = size, 2892 .splice_flags = flags, 2893 }; 2894 2895 if (unlikely(*ppos)) 2896 return -ESPIPE; 2897 2898 if (sock->file->f_flags & O_NONBLOCK || 2899 flags & SPLICE_F_NONBLOCK) 2900 state.flags = MSG_DONTWAIT; 2901 2902 return unix_stream_read_generic(&state, false); 2903 } 2904 2905 static int unix_shutdown(struct socket *sock, int mode) 2906 { 2907 struct sock *sk = sock->sk; 2908 struct sock *other; 2909 2910 if (mode < SHUT_RD || mode > SHUT_RDWR) 2911 return -EINVAL; 2912 /* This maps: 2913 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 2914 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 2915 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 2916 */ 2917 ++mode; 2918 2919 unix_state_lock(sk); 2920 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 2921 other = unix_peer(sk); 2922 if (other) 2923 sock_hold(other); 2924 unix_state_unlock(sk); 2925 sk->sk_state_change(sk); 2926 2927 if (other && 2928 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 2929 2930 int peer_mode = 0; 2931 const struct proto *prot = READ_ONCE(other->sk_prot); 2932 2933 if (prot->unhash) 2934 prot->unhash(other); 2935 if (mode&RCV_SHUTDOWN) 2936 peer_mode |= SEND_SHUTDOWN; 2937 if (mode&SEND_SHUTDOWN) 2938 peer_mode |= RCV_SHUTDOWN; 2939 unix_state_lock(other); 2940 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 2941 unix_state_unlock(other); 2942 other->sk_state_change(other); 2943 if (peer_mode == SHUTDOWN_MASK) 2944 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 2945 else if (peer_mode & RCV_SHUTDOWN) 2946 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 2947 } 2948 if (other) 2949 sock_put(other); 2950 2951 return 0; 2952 } 2953 2954 long unix_inq_len(struct sock *sk) 2955 { 2956 struct sk_buff *skb; 2957 long amount = 
0; 2958 2959 if (sk->sk_state == TCP_LISTEN) 2960 return -EINVAL; 2961 2962 spin_lock(&sk->sk_receive_queue.lock); 2963 if (sk->sk_type == SOCK_STREAM || 2964 sk->sk_type == SOCK_SEQPACKET) { 2965 skb_queue_walk(&sk->sk_receive_queue, skb) 2966 amount += unix_skb_len(skb); 2967 } else { 2968 skb = skb_peek(&sk->sk_receive_queue); 2969 if (skb) 2970 amount = skb->len; 2971 } 2972 spin_unlock(&sk->sk_receive_queue.lock); 2973 2974 return amount; 2975 } 2976 EXPORT_SYMBOL_GPL(unix_inq_len); 2977 2978 long unix_outq_len(struct sock *sk) 2979 { 2980 return sk_wmem_alloc_get(sk); 2981 } 2982 EXPORT_SYMBOL_GPL(unix_outq_len); 2983 2984 static int unix_open_file(struct sock *sk) 2985 { 2986 struct path path; 2987 struct file *f; 2988 int fd; 2989 2990 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2991 return -EPERM; 2992 2993 if (!smp_load_acquire(&unix_sk(sk)->addr)) 2994 return -ENOENT; 2995 2996 path = unix_sk(sk)->path; 2997 if (!path.dentry) 2998 return -ENOENT; 2999 3000 path_get(&path); 3001 3002 fd = get_unused_fd_flags(O_CLOEXEC); 3003 if (fd < 0) 3004 goto out; 3005 3006 f = dentry_open(&path, O_PATH, current_cred()); 3007 if (IS_ERR(f)) { 3008 put_unused_fd(fd); 3009 fd = PTR_ERR(f); 3010 goto out; 3011 } 3012 3013 fd_install(fd, f); 3014 out: 3015 path_put(&path); 3016 3017 return fd; 3018 } 3019 3020 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3021 { 3022 struct sock *sk = sock->sk; 3023 long amount = 0; 3024 int err; 3025 3026 switch (cmd) { 3027 case SIOCOUTQ: 3028 amount = unix_outq_len(sk); 3029 err = put_user(amount, (int __user *)arg); 3030 break; 3031 case SIOCINQ: 3032 amount = unix_inq_len(sk); 3033 if (amount < 0) 3034 err = amount; 3035 else 3036 err = put_user(amount, (int __user *)arg); 3037 break; 3038 case SIOCUNIXFILE: 3039 err = unix_open_file(sk); 3040 break; 3041 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3042 case SIOCATMARK: 3043 { 3044 struct sk_buff *skb; 3045 int answ = 0; 3046 3047 skb = skb_peek(&sk->sk_receive_queue); 3048 if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb)) 3049 answ = 1; 3050 err = put_user(answ, (int __user *)arg); 3051 } 3052 break; 3053 #endif 3054 default: 3055 err = -ENOIOCTLCMD; 3056 break; 3057 } 3058 return err; 3059 } 3060 3061 #ifdef CONFIG_COMPAT 3062 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3063 { 3064 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3065 } 3066 #endif 3067 3068 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3069 { 3070 struct sock *sk = sock->sk; 3071 __poll_t mask; 3072 u8 shutdown; 3073 3074 sock_poll_wait(file, sock, wait); 3075 mask = 0; 3076 shutdown = READ_ONCE(sk->sk_shutdown); 3077 3078 /* exceptional events? */ 3079 if (READ_ONCE(sk->sk_err)) 3080 mask |= EPOLLERR; 3081 if (shutdown == SHUTDOWN_MASK) 3082 mask |= EPOLLHUP; 3083 if (shutdown & RCV_SHUTDOWN) 3084 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3085 3086 /* readable? 
*/ 3087 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3088 mask |= EPOLLIN | EPOLLRDNORM; 3089 if (sk_is_readable(sk)) 3090 mask |= EPOLLIN | EPOLLRDNORM; 3091 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3092 if (READ_ONCE(unix_sk(sk)->oob_skb)) 3093 mask |= EPOLLPRI; 3094 #endif 3095 3096 /* Connection-based need to check for termination and startup */ 3097 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && 3098 sk->sk_state == TCP_CLOSE) 3099 mask |= EPOLLHUP; 3100 3101 /* 3102 * we set writable also when the other side has shut down the 3103 * connection. This prevents stuck sockets. 3104 */ 3105 if (unix_writable(sk)) 3106 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3107 3108 return mask; 3109 } 3110 3111 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, 3112 poll_table *wait) 3113 { 3114 struct sock *sk = sock->sk, *other; 3115 unsigned int writable; 3116 __poll_t mask; 3117 u8 shutdown; 3118 3119 sock_poll_wait(file, sock, wait); 3120 mask = 0; 3121 shutdown = READ_ONCE(sk->sk_shutdown); 3122 3123 /* exceptional events? */ 3124 if (READ_ONCE(sk->sk_err) || 3125 !skb_queue_empty_lockless(&sk->sk_error_queue)) 3126 mask |= EPOLLERR | 3127 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); 3128 3129 if (shutdown & RCV_SHUTDOWN) 3130 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3131 if (shutdown == SHUTDOWN_MASK) 3132 mask |= EPOLLHUP; 3133 3134 /* readable? */ 3135 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3136 mask |= EPOLLIN | EPOLLRDNORM; 3137 if (sk_is_readable(sk)) 3138 mask |= EPOLLIN | EPOLLRDNORM; 3139 3140 /* Connection-based need to check for termination and startup */ 3141 if (sk->sk_type == SOCK_SEQPACKET) { 3142 if (sk->sk_state == TCP_CLOSE) 3143 mask |= EPOLLHUP; 3144 /* connection hasn't started yet? */ 3145 if (sk->sk_state == TCP_SYN_SENT) 3146 return mask; 3147 } 3148 3149 /* No write status requested, avoid expensive OUT tests. 
*/ 3150 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3151 return mask; 3152 3153 writable = unix_writable(sk); 3154 if (writable) { 3155 unix_state_lock(sk); 3156 3157 other = unix_peer(sk); 3158 if (other && unix_peer(other) != sk && 3159 unix_recvq_full_lockless(other) && 3160 unix_dgram_peer_wake_me(sk, other)) 3161 writable = 0; 3162 3163 unix_state_unlock(sk); 3164 } 3165 3166 if (writable) 3167 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3168 else 3169 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3170 3171 return mask; 3172 } 3173 3174 #ifdef CONFIG_PROC_FS 3175 3176 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3177 3178 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3179 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3180 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3181 3182 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3183 { 3184 unsigned long offset = get_offset(*pos); 3185 unsigned long bucket = get_bucket(*pos); 3186 unsigned long count = 0; 3187 struct sock *sk; 3188 3189 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3190 sk; sk = sk_next(sk)) { 3191 if (++count == offset) 3192 break; 3193 } 3194 3195 return sk; 3196 } 3197 3198 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3199 { 3200 unsigned long bucket = get_bucket(*pos); 3201 struct net *net = seq_file_net(seq); 3202 struct sock *sk; 3203 3204 while (bucket < UNIX_HASH_SIZE) { 3205 spin_lock(&net->unx.table.locks[bucket]); 3206 3207 sk = unix_from_bucket(seq, pos); 3208 if (sk) 3209 return sk; 3210 3211 spin_unlock(&net->unx.table.locks[bucket]); 3212 3213 *pos = set_bucket_offset(++bucket, 1); 3214 } 3215 3216 return NULL; 3217 } 3218 3219 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3220 loff_t *pos) 3221 { 3222 unsigned long bucket = get_bucket(*pos); 3223 3224 sk = sk_next(sk); 3225 if (sk) 3226 return sk; 3227 3228 3229 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3230 3231 *pos = set_bucket_offset(++bucket, 1); 3232 3233 return unix_get_first(seq, pos); 3234 } 3235 3236 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3237 { 3238 if (!*pos) 3239 return SEQ_START_TOKEN; 3240 3241 return unix_get_first(seq, pos); 3242 } 3243 3244 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3245 { 3246 ++*pos; 3247 3248 if (v == SEQ_START_TOKEN) 3249 return unix_get_first(seq, pos); 3250 3251 return unix_get_next(seq, v, pos); 3252 } 3253 3254 static void unix_seq_stop(struct seq_file *seq, void *v) 3255 { 3256 struct sock *sk = v; 3257 3258 if (sk) 3259 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3260 } 3261 3262 static int unix_seq_show(struct seq_file *seq, void *v) 3263 { 3264 3265 if (v == SEQ_START_TOKEN) 3266 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3267 "Inode Path\n"); 3268 else { 3269 struct sock *s = v; 3270 struct unix_sock *u = unix_sk(s); 3271 unix_state_lock(s); 3272 3273 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3274 s, 3275 refcount_read(&s->sk_refcnt), 3276 0, 3277 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3278 s->sk_type, 3279 s->sk_socket ? 3280 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3281 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3282 sock_i_ino(s)); 3283 3284 if (u->addr) { // under a hash table lock here 3285 int i, len; 3286 seq_putc(seq, ' '); 3287 3288 i = 0; 3289 len = u->addr->len - 3290 offsetof(struct sockaddr_un, sun_path); 3291 if (u->addr->name->sun_path[0]) { 3292 len--; 3293 } else { 3294 seq_putc(seq, '@'); 3295 i++; 3296 } 3297 for ( ; i < len; i++) 3298 seq_putc(seq, u->addr->name->sun_path[i] ?: 3299 '@'); 3300 } 3301 unix_state_unlock(s); 3302 seq_putc(seq, '\n'); 3303 } 3304 3305 return 0; 3306 } 3307 3308 static const struct seq_operations unix_seq_ops = { 3309 .start = unix_seq_start, 3310 .next = unix_seq_next, 3311 .stop = unix_seq_stop, 3312 .show = unix_seq_show, 3313 }; 3314 3315 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) 3316 struct bpf_unix_iter_state { 3317 struct seq_net_private p; 3318 unsigned int cur_sk; 3319 unsigned int end_sk; 3320 unsigned int max_sk; 3321 struct sock **batch; 3322 bool st_bucket_done; 3323 }; 3324 3325 struct bpf_iter__unix { 3326 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3327 __bpf_md_ptr(struct unix_sock *, unix_sk); 3328 uid_t uid __aligned(8); 3329 }; 3330 3331 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3332 struct unix_sock *unix_sk, uid_t uid) 3333 { 3334 struct bpf_iter__unix ctx; 3335 3336 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3337 ctx.meta = meta; 3338 ctx.unix_sk = unix_sk; 3339 ctx.uid = uid; 3340 return bpf_iter_run_prog(prog, &ctx); 3341 } 3342 3343 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3344 3345 { 3346 struct bpf_unix_iter_state *iter = seq->private; 3347 unsigned int expected = 1; 3348 struct sock *sk; 3349 3350 sock_hold(start_sk); 3351 iter->batch[iter->end_sk++] = start_sk; 3352 3353 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3354 if (iter->end_sk < iter->max_sk) { 3355 sock_hold(sk); 3356 iter->batch[iter->end_sk++] = sk; 3357 } 3358 3359 expected++; 3360 } 3361 3362 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3363 3364 return expected; 3365 } 3366 3367 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3368 { 3369 while (iter->cur_sk < iter->end_sk) 3370 sock_put(iter->batch[iter->cur_sk++]); 3371 } 3372 3373 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3374 unsigned int new_batch_sz) 3375 { 3376 struct sock **new_batch; 3377 3378 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3379 GFP_USER | __GFP_NOWARN); 3380 if (!new_batch) 3381 return -ENOMEM; 3382 3383 bpf_iter_unix_put_batch(iter); 3384 kvfree(iter->batch); 3385 iter->batch = new_batch; 3386 iter->max_sk = new_batch_sz; 3387 3388 return 0; 3389 } 3390 3391 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3392 loff_t *pos) 3393 { 3394 struct bpf_unix_iter_state *iter = seq->private; 3395 unsigned int expected; 3396 bool resized = false; 3397 struct sock *sk; 3398 3399 if (iter->st_bucket_done) 3400 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3401 3402 again: 3403 /* Get a new batch */ 3404 iter->cur_sk = 0; 3405 iter->end_sk = 0; 3406 3407 sk = unix_get_first(seq, pos); 3408 if (!sk) 3409 return NULL; /* Done */ 3410 3411 expected = bpf_iter_unix_hold_batch(seq, sk); 3412 3413 if (iter->end_sk == expected) { 3414 iter->st_bucket_done = true; 3415 return sk; 3416 } 3417 3418 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3419 resized = true; 3420 goto again; 3421 } 3422 3423 return sk; 3424 } 3425 3426 static void 
*bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos) 3427 { 3428 if (!*pos) 3429 return SEQ_START_TOKEN; 3430 3431 /* bpf iter does not support lseek, so it always 3432 * continue from where it was stop()-ped. 3433 */ 3434 return bpf_iter_unix_batch(seq, pos); 3435 } 3436 3437 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3438 { 3439 struct bpf_unix_iter_state *iter = seq->private; 3440 struct sock *sk; 3441 3442 /* Whenever seq_next() is called, the iter->cur_sk is 3443 * done with seq_show(), so advance to the next sk in 3444 * the batch. 3445 */ 3446 if (iter->cur_sk < iter->end_sk) 3447 sock_put(iter->batch[iter->cur_sk++]); 3448 3449 ++*pos; 3450 3451 if (iter->cur_sk < iter->end_sk) 3452 sk = iter->batch[iter->cur_sk]; 3453 else 3454 sk = bpf_iter_unix_batch(seq, pos); 3455 3456 return sk; 3457 } 3458 3459 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3460 { 3461 struct bpf_iter_meta meta; 3462 struct bpf_prog *prog; 3463 struct sock *sk = v; 3464 uid_t uid; 3465 bool slow; 3466 int ret; 3467 3468 if (v == SEQ_START_TOKEN) 3469 return 0; 3470 3471 slow = lock_sock_fast(sk); 3472 3473 if (unlikely(sk_unhashed(sk))) { 3474 ret = SEQ_SKIP; 3475 goto unlock; 3476 } 3477 3478 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3479 meta.seq = seq; 3480 prog = bpf_iter_get_info(&meta, false); 3481 ret = unix_prog_seq_show(prog, &meta, v, uid); 3482 unlock: 3483 unlock_sock_fast(sk, slow); 3484 return ret; 3485 } 3486 3487 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3488 { 3489 struct bpf_unix_iter_state *iter = seq->private; 3490 struct bpf_iter_meta meta; 3491 struct bpf_prog *prog; 3492 3493 if (!v) { 3494 meta.seq = seq; 3495 prog = bpf_iter_get_info(&meta, true); 3496 if (prog) 3497 (void)unix_prog_seq_show(prog, &meta, v, 0); 3498 } 3499 3500 if (iter->cur_sk < iter->end_sk) 3501 bpf_iter_unix_put_batch(iter); 3502 } 3503 3504 static const struct seq_operations bpf_iter_unix_seq_ops = { 3505 .start = bpf_iter_unix_seq_start, 3506 .next = bpf_iter_unix_seq_next, 3507 .stop = bpf_iter_unix_seq_stop, 3508 .show = bpf_iter_unix_seq_show, 3509 }; 3510 #endif 3511 #endif 3512 3513 static const struct net_proto_family unix_family_ops = { 3514 .family = PF_UNIX, 3515 .create = unix_create, 3516 .owner = THIS_MODULE, 3517 }; 3518 3519 3520 static int __net_init unix_net_init(struct net *net) 3521 { 3522 int i; 3523 3524 net->unx.sysctl_max_dgram_qlen = 10; 3525 if (unix_sysctl_register(net)) 3526 goto out; 3527 3528 #ifdef CONFIG_PROC_FS 3529 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3530 sizeof(struct seq_net_private))) 3531 goto err_sysctl; 3532 #endif 3533 3534 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3535 sizeof(spinlock_t), GFP_KERNEL); 3536 if (!net->unx.table.locks) 3537 goto err_proc; 3538 3539 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3540 sizeof(struct hlist_head), 3541 GFP_KERNEL); 3542 if (!net->unx.table.buckets) 3543 goto free_locks; 3544 3545 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3546 spin_lock_init(&net->unx.table.locks[i]); 3547 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3548 } 3549 3550 return 0; 3551 3552 free_locks: 3553 kvfree(net->unx.table.locks); 3554 err_proc: 3555 #ifdef CONFIG_PROC_FS 3556 remove_proc_entry("unix", net->proc_net); 3557 err_sysctl: 3558 #endif 3559 unix_sysctl_unregister(net); 3560 out: 3561 return -ENOMEM; 3562 } 3563 3564 static void __net_exit unix_net_exit(struct net *net) 3565 { 3566 
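	/* Tear down the per-netns state set up in unix_net_init():
	 * the hash table buckets and locks, the sysctl entries and
	 * the /proc/net/unix entry.
	 */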
kvfree(net->unx.table.buckets); 3567 kvfree(net->unx.table.locks); 3568 unix_sysctl_unregister(net); 3569 remove_proc_entry("unix", net->proc_net); 3570 } 3571 3572 static struct pernet_operations unix_net_ops = { 3573 .init = unix_net_init, 3574 .exit = unix_net_exit, 3575 }; 3576 3577 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3578 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3579 struct unix_sock *unix_sk, uid_t uid) 3580 3581 #define INIT_BATCH_SZ 16 3582 3583 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3584 { 3585 struct bpf_unix_iter_state *iter = priv_data; 3586 int err; 3587 3588 err = bpf_iter_init_seq_net(priv_data, aux); 3589 if (err) 3590 return err; 3591 3592 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3593 if (err) { 3594 bpf_iter_fini_seq_net(priv_data); 3595 return err; 3596 } 3597 3598 return 0; 3599 } 3600 3601 static void bpf_iter_fini_unix(void *priv_data) 3602 { 3603 struct bpf_unix_iter_state *iter = priv_data; 3604 3605 bpf_iter_fini_seq_net(priv_data); 3606 kvfree(iter->batch); 3607 } 3608 3609 static const struct bpf_iter_seq_info unix_seq_info = { 3610 .seq_ops = &bpf_iter_unix_seq_ops, 3611 .init_seq_private = bpf_iter_init_unix, 3612 .fini_seq_private = bpf_iter_fini_unix, 3613 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3614 }; 3615 3616 static const struct bpf_func_proto * 3617 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3618 const struct bpf_prog *prog) 3619 { 3620 switch (func_id) { 3621 case BPF_FUNC_setsockopt: 3622 return &bpf_sk_setsockopt_proto; 3623 case BPF_FUNC_getsockopt: 3624 return &bpf_sk_getsockopt_proto; 3625 default: 3626 return NULL; 3627 } 3628 } 3629 3630 static struct bpf_iter_reg unix_reg_info = { 3631 .target = "unix", 3632 .ctx_arg_info_size = 1, 3633 .ctx_arg_info = { 3634 { offsetof(struct bpf_iter__unix, unix_sk), 3635 PTR_TO_BTF_ID_OR_NULL }, 3636 }, 3637 .get_func_proto = bpf_iter_unix_get_func_proto, 3638 .seq_info = &unix_seq_info, 3639 }; 3640 3641 static void __init bpf_iter_register(void) 3642 { 3643 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3644 if (bpf_iter_reg_target(&unix_reg_info)) 3645 pr_warn("Warning: could not register bpf iterator unix\n"); 3646 } 3647 #endif 3648 3649 static int __init af_unix_init(void) 3650 { 3651 int i, rc = -1; 3652 3653 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3654 3655 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3656 spin_lock_init(&bsd_socket_locks[i]); 3657 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3658 } 3659 3660 rc = proto_register(&unix_dgram_proto, 1); 3661 if (rc != 0) { 3662 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3663 goto out; 3664 } 3665 3666 rc = proto_register(&unix_stream_proto, 1); 3667 if (rc != 0) { 3668 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3669 proto_unregister(&unix_dgram_proto); 3670 goto out; 3671 } 3672 3673 sock_register(&unix_family_ops); 3674 register_pernet_subsys(&unix_net_ops); 3675 unix_bpf_build_proto(); 3676 3677 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3678 bpf_iter_register(); 3679 #endif 3680 3681 out: 3682 return rc; 3683 } 3684 3685 static void __exit af_unix_exit(void) 3686 { 3687 sock_unregister(PF_UNIX); 3688 proto_unregister(&unix_dgram_proto); 3689 proto_unregister(&unix_stream_proto); 3690 unregister_pernet_subsys(&unix_net_ops); 3691 } 3692 3693 /* Earlier than 
device_initcall() so that other drivers invoking 3694 request_module() don't end up in a loop when modprobe tries 3695 to use a UNIX socket. But later than subsys_initcall() because 3696 we depend on the infrastructure initialised there. */ 3697 fs_initcall(af_unix_init); 3698 module_exit(af_unix_exit); 3699 3700 MODULE_LICENSE("GPL"); 3701 MODULE_ALIAS_NETPROTO(PF_UNIX); 3702