// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko EiBfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *		Christoph Rohland :	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *		Alexey Kuznetsov :	Repaired (I hope) bugs introduced
 *					by above two patches.
 *		Andrea Arcangeli :	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid huge amount
 *					of socks hashed (this is for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skb queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations
 *		Alexey Kuznetsov :	Full scale SMP. Lot of bugs are introduced 8)
 *		Malcolm Beattie	:	Set peercred for socketpair
 *		Michal Ostrowski :	Module initialization cleanup.
 *		Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+)
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
*/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/splice.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>

#include "scm.h"

/* Count of AF_UNIX socks: bumped in unix_create1() and dropped again in
 * unix_sock_destructor(); unix_create1() compares it against
 * 2 * get_max_files() to refuse new sockets.
 */
static atomic_long_t unix_nr_socks;
/* Table of filesystem-bound sockets, linked through sk_bind_node and looked
 * up by inode in unix_find_socket_byinode(). Each bucket has its own
 * spinlock in bsd_socket_locks[] at the same index.
 */
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
129 */ 130 131 static unsigned int unix_unbound_hash(struct sock *sk) 132 { 133 unsigned long hash = (unsigned long)sk; 134 135 hash ^= hash >> 16; 136 hash ^= hash >> 8; 137 hash ^= sk->sk_type; 138 139 return hash & UNIX_HASH_MOD; 140 } 141 142 static unsigned int unix_bsd_hash(struct inode *i) 143 { 144 return i->i_ino & UNIX_HASH_MOD; 145 } 146 147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, 148 int addr_len, int type) 149 { 150 __wsum csum = csum_partial(sunaddr, addr_len, 0); 151 unsigned int hash; 152 153 hash = (__force unsigned int)csum_fold(csum); 154 hash ^= hash >> 8; 155 hash ^= type; 156 157 return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD); 158 } 159 160 static void unix_table_double_lock(struct net *net, 161 unsigned int hash1, unsigned int hash2) 162 { 163 if (hash1 == hash2) { 164 spin_lock(&net->unx.table.locks[hash1]); 165 return; 166 } 167 168 if (hash1 > hash2) 169 swap(hash1, hash2); 170 171 spin_lock(&net->unx.table.locks[hash1]); 172 spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING); 173 } 174 175 static void unix_table_double_unlock(struct net *net, 176 unsigned int hash1, unsigned int hash2) 177 { 178 if (hash1 == hash2) { 179 spin_unlock(&net->unx.table.locks[hash1]); 180 return; 181 } 182 183 spin_unlock(&net->unx.table.locks[hash1]); 184 spin_unlock(&net->unx.table.locks[hash2]); 185 } 186 187 #ifdef CONFIG_SECURITY_NETWORK 188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) 189 { 190 UNIXCB(skb).secid = scm->secid; 191 } 192 193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) 194 { 195 scm->secid = UNIXCB(skb).secid; 196 } 197 198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) 199 { 200 return (scm->secid == UNIXCB(skb).secid); 201 } 202 #else 203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) 204 { } 205 206 static inline void unix_set_secdata(struct scm_cookie *scm, 
struct sk_buff *skb) 207 { } 208 209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) 210 { 211 return true; 212 } 213 #endif /* CONFIG_SECURITY_NETWORK */ 214 215 #define unix_peer(sk) (unix_sk(sk)->peer) 216 217 static inline int unix_our_peer(struct sock *sk, struct sock *osk) 218 { 219 return unix_peer(osk) == sk; 220 } 221 222 static inline int unix_may_send(struct sock *sk, struct sock *osk) 223 { 224 return unix_peer(osk) == NULL || unix_our_peer(sk, osk); 225 } 226 227 static inline int unix_recvq_full(const struct sock *sk) 228 { 229 return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; 230 } 231 232 static inline int unix_recvq_full_lockless(const struct sock *sk) 233 { 234 return skb_queue_len_lockless(&sk->sk_receive_queue) > 235 READ_ONCE(sk->sk_max_ack_backlog); 236 } 237 238 struct sock *unix_peer_get(struct sock *s) 239 { 240 struct sock *peer; 241 242 unix_state_lock(s); 243 peer = unix_peer(s); 244 if (peer) 245 sock_hold(peer); 246 unix_state_unlock(s); 247 return peer; 248 } 249 EXPORT_SYMBOL_GPL(unix_peer_get); 250 251 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, 252 int addr_len) 253 { 254 struct unix_address *addr; 255 256 addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL); 257 if (!addr) 258 return NULL; 259 260 refcount_set(&addr->refcnt, 1); 261 addr->len = addr_len; 262 memcpy(addr->name, sunaddr, addr_len); 263 264 return addr; 265 } 266 267 static inline void unix_release_addr(struct unix_address *addr) 268 { 269 if (refcount_dec_and_test(&addr->refcnt)) 270 kfree(addr); 271 } 272 273 /* 274 * Check unix socket name: 275 * - should be not zero length. 276 * - if started by not zero, should be NULL terminated (FS object) 277 * - if started by zero, it is abstract name. 
278 */ 279 280 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len) 281 { 282 if (addr_len <= offsetof(struct sockaddr_un, sun_path) || 283 addr_len > sizeof(*sunaddr)) 284 return -EINVAL; 285 286 if (sunaddr->sun_family != AF_UNIX) 287 return -EINVAL; 288 289 return 0; 290 } 291 292 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len) 293 { 294 struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr; 295 short offset = offsetof(struct sockaddr_storage, __data); 296 297 BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path)); 298 299 /* This may look like an off by one error but it is a bit more 300 * subtle. 108 is the longest valid AF_UNIX path for a binding. 301 * sun_path[108] doesn't as such exist. However in kernel space 302 * we are guaranteed that it is a valid memory location in our 303 * kernel address buffer because syscall functions always pass 304 * a pointer of struct sockaddr_storage which has a bigger buffer 305 * than 108. Also, we must terminate sun_path for strlen() in 306 * getname_kernel(). 307 */ 308 addr->__data[addr_len - offset] = 0; 309 310 /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will 311 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen() 312 * know the actual buffer. 
313 */ 314 return strlen(addr->__data) + offset + 1; 315 } 316 317 static void __unix_remove_socket(struct sock *sk) 318 { 319 sk_del_node_init(sk); 320 } 321 322 static void __unix_insert_socket(struct net *net, struct sock *sk) 323 { 324 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 325 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]); 326 } 327 328 static void __unix_set_addr_hash(struct net *net, struct sock *sk, 329 struct unix_address *addr, unsigned int hash) 330 { 331 __unix_remove_socket(sk); 332 smp_store_release(&unix_sk(sk)->addr, addr); 333 334 sk->sk_hash = hash; 335 __unix_insert_socket(net, sk); 336 } 337 338 static void unix_remove_socket(struct net *net, struct sock *sk) 339 { 340 spin_lock(&net->unx.table.locks[sk->sk_hash]); 341 __unix_remove_socket(sk); 342 spin_unlock(&net->unx.table.locks[sk->sk_hash]); 343 } 344 345 static void unix_insert_unbound_socket(struct net *net, struct sock *sk) 346 { 347 spin_lock(&net->unx.table.locks[sk->sk_hash]); 348 __unix_insert_socket(net, sk); 349 spin_unlock(&net->unx.table.locks[sk->sk_hash]); 350 } 351 352 static void unix_insert_bsd_socket(struct sock *sk) 353 { 354 spin_lock(&bsd_socket_locks[sk->sk_hash]); 355 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]); 356 spin_unlock(&bsd_socket_locks[sk->sk_hash]); 357 } 358 359 static void unix_remove_bsd_socket(struct sock *sk) 360 { 361 if (!hlist_unhashed(&sk->sk_bind_node)) { 362 spin_lock(&bsd_socket_locks[sk->sk_hash]); 363 __sk_del_bind_node(sk); 364 spin_unlock(&bsd_socket_locks[sk->sk_hash]); 365 366 sk_node_init(&sk->sk_bind_node); 367 } 368 } 369 370 static struct sock *__unix_find_socket_byname(struct net *net, 371 struct sockaddr_un *sunname, 372 int len, unsigned int hash) 373 { 374 struct sock *s; 375 376 sk_for_each(s, &net->unx.table.buckets[hash]) { 377 struct unix_sock *u = unix_sk(s); 378 379 if (u->addr->len == len && 380 !memcmp(u->addr->name, sunname, len)) 381 return s; 382 } 383 return NULL; 384 } 385 386 static inline struct 
sock *unix_find_socket_byname(struct net *net, 387 struct sockaddr_un *sunname, 388 int len, unsigned int hash) 389 { 390 struct sock *s; 391 392 spin_lock(&net->unx.table.locks[hash]); 393 s = __unix_find_socket_byname(net, sunname, len, hash); 394 if (s) 395 sock_hold(s); 396 spin_unlock(&net->unx.table.locks[hash]); 397 return s; 398 } 399 400 static struct sock *unix_find_socket_byinode(struct inode *i) 401 { 402 unsigned int hash = unix_bsd_hash(i); 403 struct sock *s; 404 405 spin_lock(&bsd_socket_locks[hash]); 406 sk_for_each_bound(s, &bsd_socket_buckets[hash]) { 407 struct dentry *dentry = unix_sk(s)->path.dentry; 408 409 if (dentry && d_backing_inode(dentry) == i) { 410 sock_hold(s); 411 spin_unlock(&bsd_socket_locks[hash]); 412 return s; 413 } 414 } 415 spin_unlock(&bsd_socket_locks[hash]); 416 return NULL; 417 } 418 419 /* Support code for asymmetrically connected dgram sockets 420 * 421 * If a datagram socket is connected to a socket not itself connected 422 * to the first socket (eg, /dev/log), clients may only enqueue more 423 * messages if the present receive queue of the server socket is not 424 * "too large". This means there's a second writeability condition 425 * poll and sendmsg need to test. The dgram recv code will do a wake 426 * up on the peer_wait wait queue of a socket upon reception of a 427 * datagram which needs to be propagated to sleeping would-be writers 428 * since these might not have sent anything so far. 
This can't be 429 * accomplished via poll_wait because the lifetime of the server 430 * socket might be less than that of its clients if these break their 431 * association with it or if the server socket is closed while clients 432 * are still connected to it and there's no way to inform "a polling 433 * implementation" that it should let go of a certain wait queue 434 * 435 * In order to propagate a wake up, a wait_queue_entry_t of the client 436 * socket is enqueued on the peer_wait queue of the server socket 437 * whose wake function does a wake_up on the ordinary client socket 438 * wait queue. This connection is established whenever a write (or 439 * poll for write) hit the flow control condition and broken when the 440 * association to the server socket is dissolved or after a wake up 441 * was relayed. 442 */ 443 444 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags, 445 void *key) 446 { 447 struct unix_sock *u; 448 wait_queue_head_t *u_sleep; 449 450 u = container_of(q, struct unix_sock, peer_wake); 451 452 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, 453 q); 454 u->peer_wake.private = NULL; 455 456 /* relaying can only happen while the wq still exists */ 457 u_sleep = sk_sleep(&u->sk); 458 if (u_sleep) 459 wake_up_interruptible_poll(u_sleep, key_to_poll(key)); 460 461 return 0; 462 } 463 464 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) 465 { 466 struct unix_sock *u, *u_other; 467 int rc; 468 469 u = unix_sk(sk); 470 u_other = unix_sk(other); 471 rc = 0; 472 spin_lock(&u_other->peer_wait.lock); 473 474 if (!u->peer_wake.private) { 475 u->peer_wake.private = other; 476 __add_wait_queue(&u_other->peer_wait, &u->peer_wake); 477 478 rc = 1; 479 } 480 481 spin_unlock(&u_other->peer_wait.lock); 482 return rc; 483 } 484 485 static void unix_dgram_peer_wake_disconnect(struct sock *sk, 486 struct sock *other) 487 { 488 struct unix_sock *u, *u_other; 489 490 u = unix_sk(sk); 491 
u_other = unix_sk(other); 492 spin_lock(&u_other->peer_wait.lock); 493 494 if (u->peer_wake.private == other) { 495 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); 496 u->peer_wake.private = NULL; 497 } 498 499 spin_unlock(&u_other->peer_wait.lock); 500 } 501 502 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, 503 struct sock *other) 504 { 505 unix_dgram_peer_wake_disconnect(sk, other); 506 wake_up_interruptible_poll(sk_sleep(sk), 507 EPOLLOUT | 508 EPOLLWRNORM | 509 EPOLLWRBAND); 510 } 511 512 /* preconditions: 513 * - unix_peer(sk) == other 514 * - association is stable 515 */ 516 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) 517 { 518 int connected; 519 520 connected = unix_dgram_peer_wake_connect(sk, other); 521 522 /* If other is SOCK_DEAD, we want to make sure we signal 523 * POLLOUT, such that a subsequent write() can get a 524 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs 525 * to other and its full, we will hang waiting for POLLOUT. 526 */ 527 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) 528 return 1; 529 530 if (connected) 531 unix_dgram_peer_wake_disconnect(sk, other); 532 533 return 0; 534 } 535 536 static int unix_writable(const struct sock *sk) 537 { 538 return sk->sk_state != TCP_LISTEN && 539 (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; 540 } 541 542 static void unix_write_space(struct sock *sk) 543 { 544 struct socket_wq *wq; 545 546 rcu_read_lock(); 547 if (unix_writable(sk)) { 548 wq = rcu_dereference(sk->sk_wq); 549 if (skwq_has_sleeper(wq)) 550 wake_up_interruptible_sync_poll(&wq->wait, 551 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); 552 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 553 } 554 rcu_read_unlock(); 555 } 556 557 /* When dgram socket disconnects (or changes its peer), we clear its receive 558 * queue of packets arrived from previous peer. 
First, it allows to do 559 * flow control based only on wmem_alloc; second, sk connected to peer 560 * may receive messages only from that peer. */ 561 static void unix_dgram_disconnected(struct sock *sk, struct sock *other) 562 { 563 if (!skb_queue_empty(&sk->sk_receive_queue)) { 564 skb_queue_purge(&sk->sk_receive_queue); 565 wake_up_interruptible_all(&unix_sk(sk)->peer_wait); 566 567 /* If one link of bidirectional dgram pipe is disconnected, 568 * we signal error. Messages are lost. Do not make this, 569 * when peer was not connected to us. 570 */ 571 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { 572 WRITE_ONCE(other->sk_err, ECONNRESET); 573 sk_error_report(other); 574 } 575 } 576 other->sk_state = TCP_CLOSE; 577 } 578 579 static void unix_sock_destructor(struct sock *sk) 580 { 581 struct unix_sock *u = unix_sk(sk); 582 583 skb_queue_purge(&sk->sk_receive_queue); 584 585 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); 586 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 587 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); 588 if (!sock_flag(sk, SOCK_DEAD)) { 589 pr_info("Attempt to release alive unix socket: %p\n", sk); 590 return; 591 } 592 593 if (u->addr) 594 unix_release_addr(u->addr); 595 596 atomic_long_dec(&unix_nr_socks); 597 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 598 #ifdef UNIX_REFCNT_DEBUG 599 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk, 600 atomic_long_read(&unix_nr_socks)); 601 #endif 602 } 603 604 static void unix_release_sock(struct sock *sk, int embrion) 605 { 606 struct unix_sock *u = unix_sk(sk); 607 struct sock *skpair; 608 struct sk_buff *skb; 609 struct path path; 610 int state; 611 612 unix_remove_socket(sock_net(sk), sk); 613 unix_remove_bsd_socket(sk); 614 615 /* Clear state */ 616 unix_state_lock(sk); 617 sock_orphan(sk); 618 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); 619 path = u->path; 620 u->path.dentry = NULL; 621 u->path.mnt = NULL; 622 state = sk->sk_state; 623 sk->sk_state = TCP_CLOSE; 624 
625 skpair = unix_peer(sk); 626 unix_peer(sk) = NULL; 627 628 unix_state_unlock(sk); 629 630 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 631 if (u->oob_skb) { 632 kfree_skb(u->oob_skb); 633 u->oob_skb = NULL; 634 } 635 #endif 636 637 wake_up_interruptible_all(&u->peer_wait); 638 639 if (skpair != NULL) { 640 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { 641 unix_state_lock(skpair); 642 /* No more writes */ 643 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); 644 if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) 645 WRITE_ONCE(skpair->sk_err, ECONNRESET); 646 unix_state_unlock(skpair); 647 skpair->sk_state_change(skpair); 648 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); 649 } 650 651 unix_dgram_peer_wake_disconnect(sk, skpair); 652 sock_put(skpair); /* It may now die */ 653 } 654 655 /* Try to flush out this socket. Throw out buffers at least */ 656 657 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { 658 if (state == TCP_LISTEN) 659 unix_release_sock(skb->sk, 1); 660 /* passed fds are erased in the kfree_skb hook */ 661 UNIXCB(skb).consumed = skb->len; 662 kfree_skb(skb); 663 } 664 665 if (path.dentry) 666 path_put(&path); 667 668 sock_put(sk); 669 670 /* ---- Socket is dead now and most probably destroyed ---- */ 671 672 /* 673 * Fixme: BSD difference: In BSD all sockets connected to us get 674 * ECONNRESET and we die on the spot. In Linux we behave 675 * like files and pipes do and wait for the last 676 * dereference. 677 * 678 * Can't we simply set sock->err? 679 * 680 * What the above comment does talk about? 
--ANK(980817) 681 */ 682 683 if (READ_ONCE(unix_tot_inflight)) 684 unix_gc(); /* Garbage collect fds */ 685 } 686 687 static void init_peercred(struct sock *sk) 688 { 689 const struct cred *old_cred; 690 struct pid *old_pid; 691 692 spin_lock(&sk->sk_peer_lock); 693 old_pid = sk->sk_peer_pid; 694 old_cred = sk->sk_peer_cred; 695 sk->sk_peer_pid = get_pid(task_tgid(current)); 696 sk->sk_peer_cred = get_current_cred(); 697 spin_unlock(&sk->sk_peer_lock); 698 699 put_pid(old_pid); 700 put_cred(old_cred); 701 } 702 703 static void copy_peercred(struct sock *sk, struct sock *peersk) 704 { 705 const struct cred *old_cred; 706 struct pid *old_pid; 707 708 if (sk < peersk) { 709 spin_lock(&sk->sk_peer_lock); 710 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); 711 } else { 712 spin_lock(&peersk->sk_peer_lock); 713 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); 714 } 715 old_pid = sk->sk_peer_pid; 716 old_cred = sk->sk_peer_cred; 717 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); 718 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); 719 720 spin_unlock(&sk->sk_peer_lock); 721 spin_unlock(&peersk->sk_peer_lock); 722 723 put_pid(old_pid); 724 put_cred(old_cred); 725 } 726 727 static int unix_listen(struct socket *sock, int backlog) 728 { 729 int err; 730 struct sock *sk = sock->sk; 731 struct unix_sock *u = unix_sk(sk); 732 733 err = -EOPNOTSUPP; 734 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 735 goto out; /* Only stream/seqpacket sockets accept */ 736 err = -EINVAL; 737 if (!u->addr) 738 goto out; /* No listens on an unbound socket */ 739 unix_state_lock(sk); 740 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) 741 goto out_unlock; 742 if (backlog > sk->sk_max_ack_backlog) 743 wake_up_interruptible_all(&u->peer_wait); 744 sk->sk_max_ack_backlog = backlog; 745 sk->sk_state = TCP_LISTEN; 746 /* set credentials so connect can copy them */ 747 init_peercred(sk); 748 err = 0; 749 750 out_unlock: 751 
unix_state_unlock(sk); 752 out: 753 return err; 754 } 755 756 static int unix_release(struct socket *); 757 static int unix_bind(struct socket *, struct sockaddr *, int); 758 static int unix_stream_connect(struct socket *, struct sockaddr *, 759 int addr_len, int flags); 760 static int unix_socketpair(struct socket *, struct socket *); 761 static int unix_accept(struct socket *, struct socket *, int, bool); 762 static int unix_getname(struct socket *, struct sockaddr *, int); 763 static __poll_t unix_poll(struct file *, struct socket *, poll_table *); 764 static __poll_t unix_dgram_poll(struct file *, struct socket *, 765 poll_table *); 766 static int unix_ioctl(struct socket *, unsigned int, unsigned long); 767 #ifdef CONFIG_COMPAT 768 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); 769 #endif 770 static int unix_shutdown(struct socket *, int); 771 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); 772 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); 773 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, 774 struct pipe_inode_info *, size_t size, 775 unsigned int flags); 776 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); 777 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); 778 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 779 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 780 static int unix_dgram_connect(struct socket *, struct sockaddr *, 781 int, int); 782 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); 783 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, 784 int); 785 786 static int unix_set_peek_off(struct sock *sk, int val) 787 { 788 struct unix_sock *u = unix_sk(sk); 789 790 if (mutex_lock_interruptible(&u->iolock)) 791 return -EINTR; 792 793 WRITE_ONCE(sk->sk_peek_off, val); 794 
mutex_unlock(&u->iolock); 795 796 return 0; 797 } 798 799 #ifdef CONFIG_PROC_FS 800 static int unix_count_nr_fds(struct sock *sk) 801 { 802 struct sk_buff *skb; 803 struct unix_sock *u; 804 int nr_fds = 0; 805 806 spin_lock(&sk->sk_receive_queue.lock); 807 skb = skb_peek(&sk->sk_receive_queue); 808 while (skb) { 809 u = unix_sk(skb->sk); 810 nr_fds += atomic_read(&u->scm_stat.nr_fds); 811 skb = skb_peek_next(skb, &sk->sk_receive_queue); 812 } 813 spin_unlock(&sk->sk_receive_queue.lock); 814 815 return nr_fds; 816 } 817 818 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) 819 { 820 struct sock *sk = sock->sk; 821 unsigned char s_state; 822 struct unix_sock *u; 823 int nr_fds = 0; 824 825 if (sk) { 826 s_state = READ_ONCE(sk->sk_state); 827 u = unix_sk(sk); 828 829 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their 830 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. 831 * SOCK_DGRAM is ordinary. So, no lock is needed. 832 */ 833 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) 834 nr_fds = atomic_read(&u->scm_stat.nr_fds); 835 else if (s_state == TCP_LISTEN) 836 nr_fds = unix_count_nr_fds(sk); 837 838 seq_printf(m, "scm_fds: %u\n", nr_fds); 839 } 840 } 841 #else 842 #define unix_show_fdinfo NULL 843 #endif 844 845 static const struct proto_ops unix_stream_ops = { 846 .family = PF_UNIX, 847 .owner = THIS_MODULE, 848 .release = unix_release, 849 .bind = unix_bind, 850 .connect = unix_stream_connect, 851 .socketpair = unix_socketpair, 852 .accept = unix_accept, 853 .getname = unix_getname, 854 .poll = unix_poll, 855 .ioctl = unix_ioctl, 856 #ifdef CONFIG_COMPAT 857 .compat_ioctl = unix_compat_ioctl, 858 #endif 859 .listen = unix_listen, 860 .shutdown = unix_shutdown, 861 .sendmsg = unix_stream_sendmsg, 862 .recvmsg = unix_stream_recvmsg, 863 .read_skb = unix_stream_read_skb, 864 .mmap = sock_no_mmap, 865 .splice_read = unix_stream_splice_read, 866 .set_peek_off = unix_set_peek_off, 867 .show_fdinfo = 
unix_show_fdinfo, 868 }; 869 870 static const struct proto_ops unix_dgram_ops = { 871 .family = PF_UNIX, 872 .owner = THIS_MODULE, 873 .release = unix_release, 874 .bind = unix_bind, 875 .connect = unix_dgram_connect, 876 .socketpair = unix_socketpair, 877 .accept = sock_no_accept, 878 .getname = unix_getname, 879 .poll = unix_dgram_poll, 880 .ioctl = unix_ioctl, 881 #ifdef CONFIG_COMPAT 882 .compat_ioctl = unix_compat_ioctl, 883 #endif 884 .listen = sock_no_listen, 885 .shutdown = unix_shutdown, 886 .sendmsg = unix_dgram_sendmsg, 887 .read_skb = unix_read_skb, 888 .recvmsg = unix_dgram_recvmsg, 889 .mmap = sock_no_mmap, 890 .set_peek_off = unix_set_peek_off, 891 .show_fdinfo = unix_show_fdinfo, 892 }; 893 894 static const struct proto_ops unix_seqpacket_ops = { 895 .family = PF_UNIX, 896 .owner = THIS_MODULE, 897 .release = unix_release, 898 .bind = unix_bind, 899 .connect = unix_stream_connect, 900 .socketpair = unix_socketpair, 901 .accept = unix_accept, 902 .getname = unix_getname, 903 .poll = unix_dgram_poll, 904 .ioctl = unix_ioctl, 905 #ifdef CONFIG_COMPAT 906 .compat_ioctl = unix_compat_ioctl, 907 #endif 908 .listen = unix_listen, 909 .shutdown = unix_shutdown, 910 .sendmsg = unix_seqpacket_sendmsg, 911 .recvmsg = unix_seqpacket_recvmsg, 912 .mmap = sock_no_mmap, 913 .set_peek_off = unix_set_peek_off, 914 .show_fdinfo = unix_show_fdinfo, 915 }; 916 917 static void unix_close(struct sock *sk, long timeout) 918 { 919 /* Nothing to do here, unix socket does not need a ->close(). 920 * This is merely for sockmap. 921 */ 922 } 923 924 static void unix_unhash(struct sock *sk) 925 { 926 /* Nothing to do here, unix socket does not need a ->unhash(). 927 * This is merely for sockmap. 
	 */
}

/*
 * Callback installed as ->bpf_bypass_getsockopt in both proto tables below.
 * Returns true only for the SOL_SOCKET / SO_PEERPIDFD pair, false for every
 * other (level, optname) combination.
 */
static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}

/* Protocol descriptor used by unix_create1() for every type but SOCK_STREAM. */
struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};

/* Protocol descriptor used by unix_create1() for SOCK_STREAM sockets. */
struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.unhash			= unix_unhash,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};

/*
 * Allocate and initialise a new AF_UNIX sock.
 *
 * @net:  network namespace the sock belongs to
 * @sock: struct socket to attach to (unix_stream_connect() passes NULL)
 * @kern: forwarded unchanged to sk_alloc()
 * @type: socket type; SOCK_STREAM selects unix_stream_proto, anything else
 *        unix_dgram_proto
 *
 * The global unix_nr_socks counter is incremented up front and rolled back
 * on failure.
 *
 * Returns the new sock, or ERR_PTR(-ENFILE) when the counter exceeds
 * 2 * get_max_files(), or ERR_PTR(-ENOMEM) when sk_alloc() fails.
 */
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /*dgram and  seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash		= unix_unbound_hash(sk);
	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	/* Undo the optimistic increment done at function entry. */
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}

/*
 * Set up a freshly allocated struct socket as an AF_UNIX socket.
 *
 * Rejects any non-zero protocol other than PF_UNIX (-EPROTONOSUPPORT),
 * selects the proto_ops table from sock->type (-ESOCKTNOSUPPORT for unknown
 * types) and creates the backing sock via unix_create1().
 *
 * Returns 0 on success or a negative errno.
 */
static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;

	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		/* SOCK_RAW is silently rewritten to SOCK_DGRAM. */
		sock->type = SOCK_DGRAM;
		fallthrough;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sk = unix_create1(net, sock, kern, sock->type);
	if (IS_ERR(sk))
		return PTR_ERR(sk);

	return 0;
}

/*
 * Release the sock attached to @sock, if any: invoke the protocol's close
 * hook, then unix_release_sock(), then detach the sock from @sock.
 * Always returns 0.
 */
static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

/*
 * Resolve a filesystem-path address to the socket bound to it.
 *
 * The path is looked up with LOOKUP_FOLLOW and must be writable by the
 * caller (path_permission(MAY_WRITE)). On success the path's atime is
 * touched and the sock is returned.
 *
 * Returns the sock, or ERR_PTR() of:
 *   the kern_path()/path_permission() error,
 *   -ECONNREFUSED if the inode is not a socket or no sock is bound to it,
 *   -EPROTOTYPE   if the bound sock's type differs from @type.
 *
 * NOTE(review): the sock_put() on the type-mismatch path indicates
 * unix_find_socket_byinode() returns a held reference that is handed to the
 * caller on success - confirm against that helper.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}

/*
 * Resolve an abstract-namespace address to the socket bound to it.
 *
 * Returns the sock found by name, or ERR_PTR(-ECONNREFUSED) if there is
 * none. If the sock found has a dentry recorded in its path, that path's
 * atime is touched.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct dentry *dentry;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	dentry = unix_sk(sk)->path.dentry;
	if (dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}

/*
 * Look up the destination socket for @sunaddr. A non-zero first byte in
 * sun_path selects the filesystem lookup, a zero first byte the abstract
 * lookup. Returns the sock or an ERR_PTR() from the chosen helper.
 */
static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunaddr,
				    int addr_len, int type)
{
	struct sock *sk;

	if (sunaddr->sun_path[0])
		sk = unix_find_bsd(sunaddr, addr_len, type);
	else
		sk = unix_find_abstract(net, sunaddr, addr_len, type);

	return sk;
}

/*
 * Bind @sk to an automatically chosen abstract name.
 *
 * The name is a leading NUL byte followed by five lower-case hex digits
 * (address length = offsetof(sun_path) + 6). Candidates are drawn from a
 * 20-bit space, starting just after a random value and wrapping around.
 *
 * Returns 0 on success or if the socket already has an address, the
 * mutex_lock_interruptible() error if interrupted, -ENOMEM on allocation
 * failure, or -ENOSPC once every candidate has been tried.
 */
static int unix_autobind(struct sock *sk)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	/* Already bound: err is still 0 from the successful lock above. */
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	/* lastnum remembers the starting point so a full wrap is detected. */
	ordernum = get_random_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	/* sun_path[0] stays NUL (kzalloc), marking the name as abstract. */
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seems to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}

/*
 * Bind @sk to a filesystem path.
 *
 * Creates the socket inode with vfs_mknod() (mode S_IFSOCK plus the socket
 * inode's permission bits masked by the current umask), then, under
 * u->bindlock, records the path and address on the sock and rehashes it.
 *
 * Returns 0 on success. Errors: -ENOMEM if the address cannot be allocated;
 * the kern_path_create()/security_path_mknod()/vfs_mknod() error, with
 * -EEXIST reported as -EADDRINUSE; the mutex_lock_interruptible() error;
 * -EINVAL if the socket is already bound. In the last two cases the inode
 * that was just created is unlinked again.
 */
static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	return err == -EEXIST ? -EADDRINUSE : err;
}

/*
 * Bind @sk to an abstract-namespace name.
 *
 * Returns 0 on success, -ENOMEM if the address cannot be allocated, the
 * mutex_lock_interruptible() error if interrupted, -EINVAL if the socket is
 * already bound, or -EADDRINUSE if another socket already owns the name.
 * The freshly allocated address is released on every failure path.
 */
static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}

/*
 * bind() entry point.
 *
 * An address consisting of only the family field (addr_len equal to
 * offsetof(sun_path)) with family AF_UNIX requests an autobind. Otherwise
 * the address is validated and dispatched on the first byte of sun_path:
 * non-zero binds to a filesystem path, zero binds to an abstract name.
 */
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	int err;

	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
	    sunaddr->sun_family == AF_UNIX)
		return unix_autobind(sk);

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		return err;

	if (sunaddr->sun_path[0])
		err = unix_bind_bsd(sk, sunaddr, addr_len);
	else
		err = unix_bind_abstract(sk, sunaddr, addr_len);

	return err;
}

/*
 * Take the state locks of @sk1 and @sk2. When they are the same sock, or
 * @sk2 is NULL, only @sk1 is locked. Otherwise the lock at the lower
 * address is taken first and the other one with the _nested variant, so
 * every caller locking the same pair uses the same order.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}

/*
 * Counterpart of unix_state_double_lock(): releases only @sk1 in the
 * same-sock / NULL case, both locks otherwise.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

/*
 * connect() for datagram sockets.
 *
 * With a family other than AF_UNSPEC the destination is looked up, checked
 * with unix_may_send() and the security hook, and installed as the peer;
 * both ends are marked TCP_ESTABLISHED. With AF_UNSPEC the current peer,
 * if any, is dropped and the socket goes back to TCP_CLOSE.
 *
 * Returns 0 on success, -EINVAL if @alen does not cover sa_family, -EPERM
 * if unix_may_send() refuses, or the error from address validation,
 * autobind, lookup or the security hook.
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		/* Credential passing on an unbound socket triggers autobind. */
		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !unix_sk(sk)->addr) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			sk->sk_state = TCP_CLOSE;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		/* Drop the reference that the old peer pointer was holding. */
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}

/*
 * Wait for room in @other's receive queue.
 *
 * Must be entered with @other's state lock held; the lock is released
 * before sleeping and is NOT re-taken (see the __releases annotation).
 * The task sleeps for up to @timeo only if @other is not dead, has not shut
 * down receiving, and its receive queue is full.
 *
 * Returns the remaining timeout.
 */
static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

/*
 * connect() for SOCK_STREAM / SOCK_SEQPACKET sockets.
 *
 * A new sock (newsk) that will become the server-side end and a 1-byte skb
 * are allocated up front. The listener is then looked up and locked; once
 * both states are verified newsk is wired to @sk, the listener's address
 * and path are copied onto newsk, and the skb is queued on the listener's
 * receive queue where unix_accept() picks it up.
 *
 * Returns 0 on success. Errors visible here:
 *   -ECONNREFUSED listener not in TCP_LISTEN or has RCV_SHUTDOWN set
 *   -EAGAIN       listener backlog full and the send timeout is zero
 *   -EISCONN      @sk already TCP_ESTABLISHED
 *   -EINVAL       @sk in any state other than TCP_CLOSE / TCP_ESTABLISHED
 *   -ENOMEM       skb allocation failed
 *   plus errors from address validation, autobind, unix_create1(), lookup,
 *   sock_intr_errno() on signal, and the security hook.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;
	int st;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* Drops other's state lock; see unix_wait_for_peer(). */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	/* State changed between the unlocked read and taking the lock. */
	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	/* Common cleanup: skb and newsk may still be NULL at this point. */
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}

/*
 * socketpair(): connect two freshly created sockets to each other.
 * Each end takes a reference on the other, gets peer credentials
 * initialised via init_peercred(), and is marked established/connected.
 * Always returns 0.
 */
static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	ska->sk_state = TCP_ESTABLISHED;
	skb->sk_state = TCP_ESTABLISHED;
	socka->state  = SS_CONNECTED;
	sockb->state  = SS_CONNECTED;
	return 0;
}

/*
 * Propagate the SOCK_PASSCRED, SOCK_PASSPIDFD and SOCK_PASSSEC flags from
 * @old to @new. Flags are only ever set on @new, never cleared.
 */
static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSPIDFD, &old->flags))
		set_bit(SOCK_PASSPIDFD, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}

/*
 * accept(): dequeue one pending connection from listening socket @sock and
 * graft its sock onto @newsock.
 *
 * Returns 0 on success, -EOPNOTSUPP for types other than SOCK_STREAM /
 * SOCK_SEQPACKET, -EINVAL if @sock is not listening or the dequeue failed
 * without setting an error, otherwise the skb_recv_datagram() error.
 */
static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	/*
	 * NOTE(review): skb->sk is presumably the newsk that
	 * unix_stream_connect() passed to sock_wmalloc() - confirm.
	 */
	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	/* A backlog slot was freed: wake connectors sleeping on peer_wait. */
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}


/*
 * getsockname()/getpeername() backend.
 *
 * With @peer set the peer's address is reported, and -ENOTCONN is returned
 * if there is no peer. An unbound sock yields family AF_UNIX with an empty
 * path. On success the return value is the number of address bytes written
 * to @uaddr.
 */
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		/* Balance the unconditional sock_put() below. */
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = offsetof(struct sockaddr_un, sun_path);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);
	}
	sock_put(sk);
out:
	return err;
}

/*
 * MSG_PEEK variant of fd reception: duplicate the fd list carried by @skb
 * into @scm without detaching it from the skb, then synchronise with the
 * garbage collector by taking and dropping unix_gc_lock (rationale below).
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	/*
	 * Garbage collection of unix sockets starts by selecting a set of
	 * candidate sockets which have reference only from being in flight
	 * (total_refs == inflight_refs).  This condition is checked once during
	 * the candidate collection phase, and candidates are marked as such, so
	 * that non-candidates can later be ignored.  While inflight_refs is
	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
	 * is an instantaneous decision.
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operation that changes the file count through file descriptors
	 * (dup, close, sendmsg) does not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeuing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd.  The following lock/unlock
	 * pair is to ensure serialization with garbage collection.  It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate.  If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
	 */
	spin_lock(&unix_gc_lock);
	spin_unlock(&unix_gc_lock);
}

/*
 * Copy the sender's credentials from @scm into the skb control block and,
 * when @send_fds is true and @scm carries an fd list, attach those fds.
 * The skb destructor is set to unix_destruct_scm in every case.
 *
 * Returns 0, or the unix_attach_fds() result.
 */
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid  = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

/*
 * True when credentials should accompany a message: the sending socket has
 * SOCK_PASSCRED or SOCK_PASSPIDFD set, or the receiving sock has no struct
 * socket attached, or the receiving socket has either flag set.
 */
static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
1856 */ 1857 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, 1858 const struct sock *other) 1859 { 1860 if (UNIXCB(skb).pid) 1861 return; 1862 if (unix_passcred_enabled(sock, other)) { 1863 UNIXCB(skb).pid = get_pid(task_tgid(current)); 1864 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); 1865 } 1866 } 1867 1868 static bool unix_skb_scm_eq(struct sk_buff *skb, 1869 struct scm_cookie *scm) 1870 { 1871 return UNIXCB(skb).pid == scm->pid && 1872 uid_eq(UNIXCB(skb).uid, scm->creds.uid) && 1873 gid_eq(UNIXCB(skb).gid, scm->creds.gid) && 1874 unix_secdata_eq(scm, skb); 1875 } 1876 1877 static void scm_stat_add(struct sock *sk, struct sk_buff *skb) 1878 { 1879 struct scm_fp_list *fp = UNIXCB(skb).fp; 1880 struct unix_sock *u = unix_sk(sk); 1881 1882 if (unlikely(fp && fp->count)) 1883 atomic_add(fp->count, &u->scm_stat.nr_fds); 1884 } 1885 1886 static void scm_stat_del(struct sock *sk, struct sk_buff *skb) 1887 { 1888 struct scm_fp_list *fp = UNIXCB(skb).fp; 1889 struct unix_sock *u = unix_sk(sk); 1890 1891 if (unlikely(fp && fp->count)) 1892 atomic_sub(fp->count, &u->scm_stat.nr_fds); 1893 } 1894 1895 /* 1896 * Send AF_UNIX data. 
 */

/*
 * sendmsg() for datagram sockets; unix_seqpacket_sendmsg() also ends up
 * here.
 *
 * The destination is msg->msg_name when msg_namelen is non-zero, otherwise
 * the connected peer. The whole message goes into one skb; anything beyond
 * SKB_MAX_ALLOC is placed in page fragments.
 *
 * Returns @len on success - including when the receiver's socket filter
 * drops the packet - or a negative errno. Errors visible here:
 *   -EOPNOTSUPP   MSG_OOB requested
 *   -ENOTCONN     no address given and no peer
 *   -EMSGSIZE     @len larger than sk_sndbuf - 32
 *   -ECONNRESET   peer vanished and no address to look it up again
 *   -EPERM        unix_may_send() refused
 *   -EPIPE        receiver has RCV_SHUTDOWN, or SEQPACKET peer is dead
 *   -ECONNREFUSED connected DGRAM peer is dead
 *   -EAGAIN       receiver queue full and the send timeout is zero
 *   plus errors from scm_send(), address validation, autobind, skb
 *   allocation, fd attachment, data copy, lookup, sock_intr_errno() on
 *   signal, and the security hook.
 */
static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	/* Bytes beyond SKB_MAX_ALLOC go into page fragments (data_len). */
	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	/* No receiver in hand: resolve it from the supplied address. */
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	/* sk_locked records whether sk's own state lock is held too. */
	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
			 */
			unix_state_unlock(sk);
			err = -EPIPE;
		} else if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			sk->sk_state = TCP_CLOSE;
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			/* Drop the reference the peer pointer was holding. */
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		/* err still 0: dead sock was not our peer, look it up again. */
		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			/* Drops other's state lock; see unix_wait_for_peer(). */
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		/* Non-blocking: re-take the locks as a pair, sk included. */
		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		/* Locks were dropped above, so redo the checks under both. */
		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}

/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Queue a single out-of-band byte, taken from msg->msg_iter, on @other.
 *
 * The new skb replaces any OOB skb @other already had (the old one is
 * released with consume_skb()). An extra reference is taken with skb_get()
 * because the skb is referenced both from ousk->oob_skb and from the
 * receive queue. The receiver gets SIGURG via sk_send_sigurg().
 *
 * Returns 0 on success, -EPIPE if @other is dead or has RCV_SHUTDOWN set,
 * or the error from skb allocation, fd attachment or the data copy.
 */
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
		     struct scm_cookie *scm, bool fds_sent)
{
	struct unix_sock *ousk = unix_sk(other);
	struct sk_buff *skb;
	int err = 0;

	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);

	if (!skb)
		return err;

	/* Attach fds only if the preceding in-band data did not carry them. */
	err = unix_scm_to_skb(scm, skb, !fds_sent);
	if (err < 0) {
		kfree_skb(skb);
		return err;
	}
	skb_put(skb, 1);
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);

	if (err) {
		kfree_skb(skb);
		return err;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    (other->sk_shutdown & RCV_SHUTDOWN)) {
		unix_state_unlock(other);
		kfree_skb(skb);
		return -EPIPE;
	}

	maybe_add_creds(skb, sock, other);
	skb_get(skb);

	if (ousk->oob_skb)
		consume_skb(ousk->oob_skb);

	WRITE_ONCE(ousk->oob_skb, skb);

	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	sk_send_sigurg(other);
	unix_state_unlock(other);
	other->sk_data_ready(other);

	return err;
}
#endif

/*
 * sendmsg() for stream sockets.
 *
 * The payload is split into skbs of at most half the send buffer (minus 64
 * bytes) and at most SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ each; every skb is
 * queued on the peer as soon as it is filled. File descriptors in @scm are
 * attached to the first skb only. With MSG_OOB (when CONFIG_AF_UNIX_OOB is
 * enabled) the last byte is held back and sent through queue_oob().
 *
 * Returns the number of bytes sent if that is non-zero, otherwise a
 * negative errno. Errors visible here:
 *   -EOPNOTSUPP MSG_OOB without OOB support or with len == 0; or a
 *               destination address on a non-established socket
 *   -EISCONN    destination address given on an established socket
 *   -ENOTCONN   no peer
 *   -EPIPE      send side shut down, or peer dead / RCV_SHUTDOWN
 *   plus errors from scm_send(), skb allocation, fd attachment, data
 *   copy/splice and queue_oob().
 * SIGPIPE is raised on the -EPIPE paths if nothing was sent yet and
 * MSG_NOSIGNAL is not set.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		/* Hold back the final byte; queue_oob() sends it below. */
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			skb = sock_alloc_send_pskb(sk, 0, 0,
						   msg->msg_flags & MSG_DONTWAIT,
						   &err, 0);
		} else {
			/* Keep two messages in the pipe so it schedules better */
			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

			/* allow fallback to order-0 allocations */
			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
						   msg->msg_flags & MSG_DONTWAIT, &err,
						   get_order(UNIX_SKB_FRAGS_SZ));
		}
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
						   sk->sk_allocation);
			if (err < 0) {
				kfree_skb(skb);
				goto out_err;
			}
			/* The splice helper reports how many bytes it took. */
			size = err;
			refcount_add(size, &sk->sk_wmem_alloc);
		} else {
			skb_put(skb, size - data_len);
			skb->data_len = data_len;
			skb->len = size;
			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
			if (err) {
				kfree_skb(skb);
				goto out_err;
			}
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		scm_stat_add(other, skb);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		err = queue_oob(sock, msg, other, &scm, fds_sent);
		if (err)
			goto out_err;
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	/* A partial write reports the byte count instead of the error. */
	return sent ? : err;
}

/*
 * sendmsg() for SOCK_SEQPACKET: report any pending socket error, require
 * an established connection (-ENOTCONN otherwise), then hand off to
 * unix_dgram_sendmsg() with msg_namelen cleared so that the connected peer
 * is used as the destination.
 */
static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
				  size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(sock, msg, len);
}

/*
 * recvmsg() for SOCK_SEQPACKET: require an established connection
 * (-ENOTCONN otherwise), then defer to unix_dgram_recvmsg().
 */
static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
				  size_t size, int flags)
{
	struct sock *sk = sock->sk;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(sock, msg, size, flags);
}

/*
 * Copy @sk's bound address, if it has one, into msg->msg_name and set
 * msg->msg_namelen to its length. @msg is left untouched when @sk is
 * unbound. The caller must ensure msg->msg_name is non-NULL.
 */
static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);

	if (addr) {
		msg->msg_namelen = addr->len;
		memcpy(msg->msg_name, addr->name, addr->len);
	}
}

/*
 * Receive one datagram from @sk's receive queue.
 *
 * Honours the peek offset (sk_peek_offset()) and MSG_PEEK. Without
 * MSG_PEEK attached fds are detached into the scm cookie, with MSG_PEEK
 * they are duplicated via unix_peek_fds(). u->iolock is held from the
 * successful dequeue until the skb has been released.
 *
 * Returns the number of bytes copied - or the full remaining datagram
 * length when MSG_TRUNC is set in @flags - or a negative errno:
 * -EOPNOTSUPP for MSG_OOB, otherwise the dequeue/wait/copy error. On a
 * SOCK_SEQPACKET socket with RCV_SHUTDOWN set, -EAGAIN is converted to 0.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	/* On a successful dequeue the loop exits with iolock still held. */
	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	/* Wake any task sleeping on this sock's peer_wait queue. */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	/* Clamp to what is left of the datagram; flag truncation. */
	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}

/*
 * recvmsg() entry point for datagram sockets. When CONFIG_BPF_SYSCALL is
 * enabled and the sock's proto is no longer unix_dgram_proto, the call is
 * forwarded to that proto's ->recvmsg(); otherwise __unix_dgram_recvmsg()
 * handles it.
 */
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock *sk = sock->sk;

#ifdef CONFIG_BPF_SYSCALL
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_dgram_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return __unix_dgram_recvmsg(sk, msg, size, flags);
}

/*
 * Dequeue one skb without blocking (MSG_DONTWAIT), holding u->iolock only
 * around the dequeue, and pass it to @recv_actor.
 *
 * Returns the skb_recv_datagram() error when nothing could be dequeued,
 * otherwise whatever @recv_actor returns.
 */
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	mutex_lock(&u->iolock);
	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
	mutex_unlock(&u->iolock);
	if (!skb)
		return err;

	return recv_actor(sk, skb);
}

/*
 *	Sleep until more data has arrived. But check for races..
2479 */ 2480 static long unix_stream_data_wait(struct sock *sk, long timeo, 2481 struct sk_buff *last, unsigned int last_len, 2482 bool freezable) 2483 { 2484 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2485 struct sk_buff *tail; 2486 DEFINE_WAIT(wait); 2487 2488 unix_state_lock(sk); 2489 2490 for (;;) { 2491 prepare_to_wait(sk_sleep(sk), &wait, state); 2492 2493 tail = skb_peek_tail(&sk->sk_receive_queue); 2494 if (tail != last || 2495 (tail && tail->len != last_len) || 2496 sk->sk_err || 2497 (sk->sk_shutdown & RCV_SHUTDOWN) || 2498 signal_pending(current) || 2499 !timeo) 2500 break; 2501 2502 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2503 unix_state_unlock(sk); 2504 timeo = schedule_timeout(timeo); 2505 unix_state_lock(sk); 2506 2507 if (sock_flag(sk, SOCK_DEAD)) 2508 break; 2509 2510 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2511 } 2512 2513 finish_wait(sk_sleep(sk), &wait); 2514 unix_state_unlock(sk); 2515 return timeo; 2516 } 2517 2518 static unsigned int unix_skb_len(const struct sk_buff *skb) 2519 { 2520 return skb->len - UNIXCB(skb).consumed; 2521 } 2522 2523 struct unix_stream_read_state { 2524 int (*recv_actor)(struct sk_buff *, int, int, 2525 struct unix_stream_read_state *); 2526 struct socket *socket; 2527 struct msghdr *msg; 2528 struct pipe_inode_info *pipe; 2529 size_t size; 2530 int flags; 2531 unsigned int splice_flags; 2532 }; 2533 2534 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2535 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2536 { 2537 struct socket *sock = state->socket; 2538 struct sock *sk = sock->sk; 2539 struct unix_sock *u = unix_sk(sk); 2540 int chunk = 1; 2541 struct sk_buff *oob_skb; 2542 2543 mutex_lock(&u->iolock); 2544 unix_state_lock(sk); 2545 2546 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2547 unix_state_unlock(sk); 2548 mutex_unlock(&u->iolock); 2549 return -EINVAL; 2550 } 2551 2552 oob_skb = u->oob_skb; 2553 2554 if (!(state->flags & MSG_PEEK)) 2555 WRITE_ONCE(u->oob_skb, 
NULL); 2556 2557 unix_state_unlock(sk); 2558 2559 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2560 2561 if (!(state->flags & MSG_PEEK)) { 2562 UNIXCB(oob_skb).consumed += 1; 2563 kfree_skb(oob_skb); 2564 } 2565 2566 mutex_unlock(&u->iolock); 2567 2568 if (chunk < 0) 2569 return -EFAULT; 2570 2571 state->msg->msg_flags |= MSG_OOB; 2572 return 1; 2573 } 2574 2575 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2576 int flags, int copied) 2577 { 2578 struct unix_sock *u = unix_sk(sk); 2579 2580 if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) { 2581 skb_unlink(skb, &sk->sk_receive_queue); 2582 consume_skb(skb); 2583 skb = NULL; 2584 } else { 2585 if (skb == u->oob_skb) { 2586 if (copied) { 2587 skb = NULL; 2588 } else if (sock_flag(sk, SOCK_URGINLINE)) { 2589 if (!(flags & MSG_PEEK)) { 2590 WRITE_ONCE(u->oob_skb, NULL); 2591 consume_skb(skb); 2592 } 2593 } else if (!(flags & MSG_PEEK)) { 2594 skb_unlink(skb, &sk->sk_receive_queue); 2595 consume_skb(skb); 2596 skb = skb_peek(&sk->sk_receive_queue); 2597 } 2598 } 2599 } 2600 return skb; 2601 } 2602 #endif 2603 2604 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2605 { 2606 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) 2607 return -ENOTCONN; 2608 2609 return unix_read_skb(sk, recv_actor); 2610 } 2611 2612 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2613 bool freezable) 2614 { 2615 struct scm_cookie scm; 2616 struct socket *sock = state->socket; 2617 struct sock *sk = sock->sk; 2618 struct unix_sock *u = unix_sk(sk); 2619 int copied = 0; 2620 int flags = state->flags; 2621 int noblock = flags & MSG_DONTWAIT; 2622 bool check_creds = false; 2623 int target; 2624 int err = 0; 2625 long timeo; 2626 int skip; 2627 size_t size = state->size; 2628 unsigned int last_len; 2629 2630 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { 2631 err = -EINVAL; 2632 goto out; 2633 } 2634 2635 if (unlikely(flags & MSG_OOB)) { 2636 err = -EOPNOTSUPP; 
2637 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2638 err = unix_stream_recv_urg(state); 2639 #endif 2640 goto out; 2641 } 2642 2643 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); 2644 timeo = sock_rcvtimeo(sk, noblock); 2645 2646 memset(&scm, 0, sizeof(scm)); 2647 2648 /* Lock the socket to prevent queue disordering 2649 * while sleeps in memcpy_tomsg 2650 */ 2651 mutex_lock(&u->iolock); 2652 2653 skip = max(sk_peek_offset(sk, flags), 0); 2654 2655 do { 2656 int chunk; 2657 bool drop_skb; 2658 struct sk_buff *skb, *last; 2659 2660 redo: 2661 unix_state_lock(sk); 2662 if (sock_flag(sk, SOCK_DEAD)) { 2663 err = -ECONNRESET; 2664 goto unlock; 2665 } 2666 last = skb = skb_peek(&sk->sk_receive_queue); 2667 last_len = last ? last->len : 0; 2668 2669 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2670 if (skb) { 2671 skb = manage_oob(skb, sk, flags, copied); 2672 if (!skb) { 2673 unix_state_unlock(sk); 2674 if (copied) 2675 break; 2676 goto redo; 2677 } 2678 } 2679 #endif 2680 again: 2681 if (skb == NULL) { 2682 if (copied >= target) 2683 goto unlock; 2684 2685 /* 2686 * POSIX 1003.1g mandates this order. 
2687 */ 2688 2689 err = sock_error(sk); 2690 if (err) 2691 goto unlock; 2692 if (sk->sk_shutdown & RCV_SHUTDOWN) 2693 goto unlock; 2694 2695 unix_state_unlock(sk); 2696 if (!timeo) { 2697 err = -EAGAIN; 2698 break; 2699 } 2700 2701 mutex_unlock(&u->iolock); 2702 2703 timeo = unix_stream_data_wait(sk, timeo, last, 2704 last_len, freezable); 2705 2706 if (signal_pending(current)) { 2707 err = sock_intr_errno(timeo); 2708 scm_destroy(&scm); 2709 goto out; 2710 } 2711 2712 mutex_lock(&u->iolock); 2713 goto redo; 2714 unlock: 2715 unix_state_unlock(sk); 2716 break; 2717 } 2718 2719 while (skip >= unix_skb_len(skb)) { 2720 skip -= unix_skb_len(skb); 2721 last = skb; 2722 last_len = skb->len; 2723 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2724 if (!skb) 2725 goto again; 2726 } 2727 2728 unix_state_unlock(sk); 2729 2730 if (check_creds) { 2731 /* Never glue messages from different writers */ 2732 if (!unix_skb_scm_eq(skb, &scm)) 2733 break; 2734 } else if (test_bit(SOCK_PASSCRED, &sock->flags) || 2735 test_bit(SOCK_PASSPIDFD, &sock->flags)) { 2736 /* Copy credentials */ 2737 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2738 unix_set_secdata(&scm, skb); 2739 check_creds = true; 2740 } 2741 2742 /* Copy address just once */ 2743 if (state->msg && state->msg->msg_name) { 2744 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, 2745 state->msg->msg_name); 2746 unix_copy_addr(state->msg, skb->sk); 2747 sunaddr = NULL; 2748 } 2749 2750 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 2751 skb_get(skb); 2752 chunk = state->recv_actor(skb, skip, chunk, state); 2753 drop_skb = !unix_skb_len(skb); 2754 /* skb is only safe to use if !drop_skb */ 2755 consume_skb(skb); 2756 if (chunk < 0) { 2757 if (copied == 0) 2758 copied = -EFAULT; 2759 break; 2760 } 2761 copied += chunk; 2762 size -= chunk; 2763 2764 if (drop_skb) { 2765 /* the skb was touched by a concurrent reader; 2766 * we should not expect anything from this skb 2767 * anymore and 
assume it invalid - we can be 2768 * sure it was dropped from the socket queue 2769 * 2770 * let's report a short read 2771 */ 2772 err = 0; 2773 break; 2774 } 2775 2776 /* Mark read part of skb as used */ 2777 if (!(flags & MSG_PEEK)) { 2778 UNIXCB(skb).consumed += chunk; 2779 2780 sk_peek_offset_bwd(sk, chunk); 2781 2782 if (UNIXCB(skb).fp) { 2783 scm_stat_del(sk, skb); 2784 unix_detach_fds(&scm, skb); 2785 } 2786 2787 if (unix_skb_len(skb)) 2788 break; 2789 2790 skb_unlink(skb, &sk->sk_receive_queue); 2791 consume_skb(skb); 2792 2793 if (scm.fp) 2794 break; 2795 } else { 2796 /* It is questionable, see note in unix_dgram_recvmsg. 2797 */ 2798 if (UNIXCB(skb).fp) 2799 unix_peek_fds(&scm, skb); 2800 2801 sk_peek_offset_fwd(sk, chunk); 2802 2803 if (UNIXCB(skb).fp) 2804 break; 2805 2806 skip = 0; 2807 last = skb; 2808 last_len = skb->len; 2809 unix_state_lock(sk); 2810 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2811 if (skb) 2812 goto again; 2813 unix_state_unlock(sk); 2814 break; 2815 } 2816 } while (size); 2817 2818 mutex_unlock(&u->iolock); 2819 if (state->msg) 2820 scm_recv_unix(sock, state->msg, &scm, flags); 2821 else 2822 scm_destroy(&scm); 2823 out: 2824 return copied ? 
: err; 2825 } 2826 2827 static int unix_stream_read_actor(struct sk_buff *skb, 2828 int skip, int chunk, 2829 struct unix_stream_read_state *state) 2830 { 2831 int ret; 2832 2833 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2834 state->msg, chunk); 2835 return ret ?: chunk; 2836 } 2837 2838 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2839 size_t size, int flags) 2840 { 2841 struct unix_stream_read_state state = { 2842 .recv_actor = unix_stream_read_actor, 2843 .socket = sk->sk_socket, 2844 .msg = msg, 2845 .size = size, 2846 .flags = flags 2847 }; 2848 2849 return unix_stream_read_generic(&state, true); 2850 } 2851 2852 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2853 size_t size, int flags) 2854 { 2855 struct unix_stream_read_state state = { 2856 .recv_actor = unix_stream_read_actor, 2857 .socket = sock, 2858 .msg = msg, 2859 .size = size, 2860 .flags = flags 2861 }; 2862 2863 #ifdef CONFIG_BPF_SYSCALL 2864 struct sock *sk = sock->sk; 2865 const struct proto *prot = READ_ONCE(sk->sk_prot); 2866 2867 if (prot != &unix_stream_proto) 2868 return prot->recvmsg(sk, msg, size, flags, NULL); 2869 #endif 2870 return unix_stream_read_generic(&state, true); 2871 } 2872 2873 static int unix_stream_splice_actor(struct sk_buff *skb, 2874 int skip, int chunk, 2875 struct unix_stream_read_state *state) 2876 { 2877 return skb_splice_bits(skb, state->socket->sk, 2878 UNIXCB(skb).consumed + skip, 2879 state->pipe, chunk, state->splice_flags); 2880 } 2881 2882 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 2883 struct pipe_inode_info *pipe, 2884 size_t size, unsigned int flags) 2885 { 2886 struct unix_stream_read_state state = { 2887 .recv_actor = unix_stream_splice_actor, 2888 .socket = sock, 2889 .pipe = pipe, 2890 .size = size, 2891 .splice_flags = flags, 2892 }; 2893 2894 if (unlikely(*ppos)) 2895 return -ESPIPE; 2896 2897 if (sock->file->f_flags & O_NONBLOCK || 2898 flags & 
SPLICE_F_NONBLOCK) 2899 state.flags = MSG_DONTWAIT; 2900 2901 return unix_stream_read_generic(&state, false); 2902 } 2903 2904 static int unix_shutdown(struct socket *sock, int mode) 2905 { 2906 struct sock *sk = sock->sk; 2907 struct sock *other; 2908 2909 if (mode < SHUT_RD || mode > SHUT_RDWR) 2910 return -EINVAL; 2911 /* This maps: 2912 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 2913 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 2914 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 2915 */ 2916 ++mode; 2917 2918 unix_state_lock(sk); 2919 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 2920 other = unix_peer(sk); 2921 if (other) 2922 sock_hold(other); 2923 unix_state_unlock(sk); 2924 sk->sk_state_change(sk); 2925 2926 if (other && 2927 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 2928 2929 int peer_mode = 0; 2930 const struct proto *prot = READ_ONCE(other->sk_prot); 2931 2932 if (prot->unhash) 2933 prot->unhash(other); 2934 if (mode&RCV_SHUTDOWN) 2935 peer_mode |= SEND_SHUTDOWN; 2936 if (mode&SEND_SHUTDOWN) 2937 peer_mode |= RCV_SHUTDOWN; 2938 unix_state_lock(other); 2939 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 2940 unix_state_unlock(other); 2941 other->sk_state_change(other); 2942 if (peer_mode == SHUTDOWN_MASK) 2943 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 2944 else if (peer_mode & RCV_SHUTDOWN) 2945 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 2946 } 2947 if (other) 2948 sock_put(other); 2949 2950 return 0; 2951 } 2952 2953 long unix_inq_len(struct sock *sk) 2954 { 2955 struct sk_buff *skb; 2956 long amount = 0; 2957 2958 if (sk->sk_state == TCP_LISTEN) 2959 return -EINVAL; 2960 2961 spin_lock(&sk->sk_receive_queue.lock); 2962 if (sk->sk_type == SOCK_STREAM || 2963 sk->sk_type == SOCK_SEQPACKET) { 2964 skb_queue_walk(&sk->sk_receive_queue, skb) 2965 amount += unix_skb_len(skb); 2966 } else { 2967 skb = skb_peek(&sk->sk_receive_queue); 2968 if (skb) 2969 amount = skb->len; 2970 } 2971 spin_unlock(&sk->sk_receive_queue.lock); 2972 
2973 return amount; 2974 } 2975 EXPORT_SYMBOL_GPL(unix_inq_len); 2976 2977 long unix_outq_len(struct sock *sk) 2978 { 2979 return sk_wmem_alloc_get(sk); 2980 } 2981 EXPORT_SYMBOL_GPL(unix_outq_len); 2982 2983 static int unix_open_file(struct sock *sk) 2984 { 2985 struct path path; 2986 struct file *f; 2987 int fd; 2988 2989 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2990 return -EPERM; 2991 2992 if (!smp_load_acquire(&unix_sk(sk)->addr)) 2993 return -ENOENT; 2994 2995 path = unix_sk(sk)->path; 2996 if (!path.dentry) 2997 return -ENOENT; 2998 2999 path_get(&path); 3000 3001 fd = get_unused_fd_flags(O_CLOEXEC); 3002 if (fd < 0) 3003 goto out; 3004 3005 f = dentry_open(&path, O_PATH, current_cred()); 3006 if (IS_ERR(f)) { 3007 put_unused_fd(fd); 3008 fd = PTR_ERR(f); 3009 goto out; 3010 } 3011 3012 fd_install(fd, f); 3013 out: 3014 path_put(&path); 3015 3016 return fd; 3017 } 3018 3019 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3020 { 3021 struct sock *sk = sock->sk; 3022 long amount = 0; 3023 int err; 3024 3025 switch (cmd) { 3026 case SIOCOUTQ: 3027 amount = unix_outq_len(sk); 3028 err = put_user(amount, (int __user *)arg); 3029 break; 3030 case SIOCINQ: 3031 amount = unix_inq_len(sk); 3032 if (amount < 0) 3033 err = amount; 3034 else 3035 err = put_user(amount, (int __user *)arg); 3036 break; 3037 case SIOCUNIXFILE: 3038 err = unix_open_file(sk); 3039 break; 3040 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3041 case SIOCATMARK: 3042 { 3043 struct sk_buff *skb; 3044 int answ = 0; 3045 3046 skb = skb_peek(&sk->sk_receive_queue); 3047 if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb)) 3048 answ = 1; 3049 err = put_user(answ, (int __user *)arg); 3050 } 3051 break; 3052 #endif 3053 default: 3054 err = -ENOIOCTLCMD; 3055 break; 3056 } 3057 return err; 3058 } 3059 3060 #ifdef CONFIG_COMPAT 3061 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3062 { 3063 return unix_ioctl(sock, cmd, (unsigned 
long)compat_ptr(arg)); 3064 } 3065 #endif 3066 3067 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3068 { 3069 struct sock *sk = sock->sk; 3070 __poll_t mask; 3071 u8 shutdown; 3072 3073 sock_poll_wait(file, sock, wait); 3074 mask = 0; 3075 shutdown = READ_ONCE(sk->sk_shutdown); 3076 3077 /* exceptional events? */ 3078 if (READ_ONCE(sk->sk_err)) 3079 mask |= EPOLLERR; 3080 if (shutdown == SHUTDOWN_MASK) 3081 mask |= EPOLLHUP; 3082 if (shutdown & RCV_SHUTDOWN) 3083 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3084 3085 /* readable? */ 3086 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3087 mask |= EPOLLIN | EPOLLRDNORM; 3088 if (sk_is_readable(sk)) 3089 mask |= EPOLLIN | EPOLLRDNORM; 3090 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3091 if (READ_ONCE(unix_sk(sk)->oob_skb)) 3092 mask |= EPOLLPRI; 3093 #endif 3094 3095 /* Connection-based need to check for termination and startup */ 3096 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && 3097 sk->sk_state == TCP_CLOSE) 3098 mask |= EPOLLHUP; 3099 3100 /* 3101 * we set writable also when the other side has shut down the 3102 * connection. This prevents stuck sockets. 3103 */ 3104 if (unix_writable(sk)) 3105 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3106 3107 return mask; 3108 } 3109 3110 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, 3111 poll_table *wait) 3112 { 3113 struct sock *sk = sock->sk, *other; 3114 unsigned int writable; 3115 __poll_t mask; 3116 u8 shutdown; 3117 3118 sock_poll_wait(file, sock, wait); 3119 mask = 0; 3120 shutdown = READ_ONCE(sk->sk_shutdown); 3121 3122 /* exceptional events? */ 3123 if (READ_ONCE(sk->sk_err) || 3124 !skb_queue_empty_lockless(&sk->sk_error_queue)) 3125 mask |= EPOLLERR | 3126 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); 3127 3128 if (shutdown & RCV_SHUTDOWN) 3129 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3130 if (shutdown == SHUTDOWN_MASK) 3131 mask |= EPOLLHUP; 3132 3133 /* readable? 
*/ 3134 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3135 mask |= EPOLLIN | EPOLLRDNORM; 3136 if (sk_is_readable(sk)) 3137 mask |= EPOLLIN | EPOLLRDNORM; 3138 3139 /* Connection-based need to check for termination and startup */ 3140 if (sk->sk_type == SOCK_SEQPACKET) { 3141 if (sk->sk_state == TCP_CLOSE) 3142 mask |= EPOLLHUP; 3143 /* connection hasn't started yet? */ 3144 if (sk->sk_state == TCP_SYN_SENT) 3145 return mask; 3146 } 3147 3148 /* No write status requested, avoid expensive OUT tests. */ 3149 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3150 return mask; 3151 3152 writable = unix_writable(sk); 3153 if (writable) { 3154 unix_state_lock(sk); 3155 3156 other = unix_peer(sk); 3157 if (other && unix_peer(other) != sk && 3158 unix_recvq_full_lockless(other) && 3159 unix_dgram_peer_wake_me(sk, other)) 3160 writable = 0; 3161 3162 unix_state_unlock(sk); 3163 } 3164 3165 if (writable) 3166 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3167 else 3168 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3169 3170 return mask; 3171 } 3172 3173 #ifdef CONFIG_PROC_FS 3174 3175 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3176 3177 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3178 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3179 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3180 3181 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3182 { 3183 unsigned long offset = get_offset(*pos); 3184 unsigned long bucket = get_bucket(*pos); 3185 unsigned long count = 0; 3186 struct sock *sk; 3187 3188 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3189 sk; sk = sk_next(sk)) { 3190 if (++count == offset) 3191 break; 3192 } 3193 3194 return sk; 3195 } 3196 3197 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3198 { 3199 unsigned long bucket = get_bucket(*pos); 3200 struct net *net = seq_file_net(seq); 3201 struct sock *sk; 3202 3203 while (bucket < 
UNIX_HASH_SIZE) { 3204 spin_lock(&net->unx.table.locks[bucket]); 3205 3206 sk = unix_from_bucket(seq, pos); 3207 if (sk) 3208 return sk; 3209 3210 spin_unlock(&net->unx.table.locks[bucket]); 3211 3212 *pos = set_bucket_offset(++bucket, 1); 3213 } 3214 3215 return NULL; 3216 } 3217 3218 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3219 loff_t *pos) 3220 { 3221 unsigned long bucket = get_bucket(*pos); 3222 3223 sk = sk_next(sk); 3224 if (sk) 3225 return sk; 3226 3227 3228 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3229 3230 *pos = set_bucket_offset(++bucket, 1); 3231 3232 return unix_get_first(seq, pos); 3233 } 3234 3235 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3236 { 3237 if (!*pos) 3238 return SEQ_START_TOKEN; 3239 3240 return unix_get_first(seq, pos); 3241 } 3242 3243 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3244 { 3245 ++*pos; 3246 3247 if (v == SEQ_START_TOKEN) 3248 return unix_get_first(seq, pos); 3249 3250 return unix_get_next(seq, v, pos); 3251 } 3252 3253 static void unix_seq_stop(struct seq_file *seq, void *v) 3254 { 3255 struct sock *sk = v; 3256 3257 if (sk) 3258 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3259 } 3260 3261 static int unix_seq_show(struct seq_file *seq, void *v) 3262 { 3263 3264 if (v == SEQ_START_TOKEN) 3265 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3266 "Inode Path\n"); 3267 else { 3268 struct sock *s = v; 3269 struct unix_sock *u = unix_sk(s); 3270 unix_state_lock(s); 3271 3272 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3273 s, 3274 refcount_read(&s->sk_refcnt), 3275 0, 3276 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3277 s->sk_type, 3278 s->sk_socket ? 3279 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3280 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3281 sock_i_ino(s)); 3282 3283 if (u->addr) { // under a hash table lock here 3284 int i, len; 3285 seq_putc(seq, ' '); 3286 3287 i = 0; 3288 len = u->addr->len - 3289 offsetof(struct sockaddr_un, sun_path); 3290 if (u->addr->name->sun_path[0]) { 3291 len--; 3292 } else { 3293 seq_putc(seq, '@'); 3294 i++; 3295 } 3296 for ( ; i < len; i++) 3297 seq_putc(seq, u->addr->name->sun_path[i] ?: 3298 '@'); 3299 } 3300 unix_state_unlock(s); 3301 seq_putc(seq, '\n'); 3302 } 3303 3304 return 0; 3305 } 3306 3307 static const struct seq_operations unix_seq_ops = { 3308 .start = unix_seq_start, 3309 .next = unix_seq_next, 3310 .stop = unix_seq_stop, 3311 .show = unix_seq_show, 3312 }; 3313 3314 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) 3315 struct bpf_unix_iter_state { 3316 struct seq_net_private p; 3317 unsigned int cur_sk; 3318 unsigned int end_sk; 3319 unsigned int max_sk; 3320 struct sock **batch; 3321 bool st_bucket_done; 3322 }; 3323 3324 struct bpf_iter__unix { 3325 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3326 __bpf_md_ptr(struct unix_sock *, unix_sk); 3327 uid_t uid __aligned(8); 3328 }; 3329 3330 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3331 struct unix_sock *unix_sk, uid_t uid) 3332 { 3333 struct bpf_iter__unix ctx; 3334 3335 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3336 ctx.meta = meta; 3337 ctx.unix_sk = unix_sk; 3338 ctx.uid = uid; 3339 return bpf_iter_run_prog(prog, &ctx); 3340 } 3341 3342 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3343 3344 { 3345 struct bpf_unix_iter_state *iter = seq->private; 3346 unsigned int expected = 1; 3347 struct sock *sk; 3348 3349 sock_hold(start_sk); 3350 iter->batch[iter->end_sk++] = start_sk; 3351 3352 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3353 if (iter->end_sk < iter->max_sk) { 3354 sock_hold(sk); 3355 iter->batch[iter->end_sk++] = sk; 3356 } 3357 3358 expected++; 3359 } 3360 3361 
spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3362 3363 return expected; 3364 } 3365 3366 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3367 { 3368 while (iter->cur_sk < iter->end_sk) 3369 sock_put(iter->batch[iter->cur_sk++]); 3370 } 3371 3372 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3373 unsigned int new_batch_sz) 3374 { 3375 struct sock **new_batch; 3376 3377 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3378 GFP_USER | __GFP_NOWARN); 3379 if (!new_batch) 3380 return -ENOMEM; 3381 3382 bpf_iter_unix_put_batch(iter); 3383 kvfree(iter->batch); 3384 iter->batch = new_batch; 3385 iter->max_sk = new_batch_sz; 3386 3387 return 0; 3388 } 3389 3390 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3391 loff_t *pos) 3392 { 3393 struct bpf_unix_iter_state *iter = seq->private; 3394 unsigned int expected; 3395 bool resized = false; 3396 struct sock *sk; 3397 3398 if (iter->st_bucket_done) 3399 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3400 3401 again: 3402 /* Get a new batch */ 3403 iter->cur_sk = 0; 3404 iter->end_sk = 0; 3405 3406 sk = unix_get_first(seq, pos); 3407 if (!sk) 3408 return NULL; /* Done */ 3409 3410 expected = bpf_iter_unix_hold_batch(seq, sk); 3411 3412 if (iter->end_sk == expected) { 3413 iter->st_bucket_done = true; 3414 return sk; 3415 } 3416 3417 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3418 resized = true; 3419 goto again; 3420 } 3421 3422 return sk; 3423 } 3424 3425 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos) 3426 { 3427 if (!*pos) 3428 return SEQ_START_TOKEN; 3429 3430 /* bpf iter does not support lseek, so it always 3431 * continue from where it was stop()-ped. 
3432 */ 3433 return bpf_iter_unix_batch(seq, pos); 3434 } 3435 3436 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3437 { 3438 struct bpf_unix_iter_state *iter = seq->private; 3439 struct sock *sk; 3440 3441 /* Whenever seq_next() is called, the iter->cur_sk is 3442 * done with seq_show(), so advance to the next sk in 3443 * the batch. 3444 */ 3445 if (iter->cur_sk < iter->end_sk) 3446 sock_put(iter->batch[iter->cur_sk++]); 3447 3448 ++*pos; 3449 3450 if (iter->cur_sk < iter->end_sk) 3451 sk = iter->batch[iter->cur_sk]; 3452 else 3453 sk = bpf_iter_unix_batch(seq, pos); 3454 3455 return sk; 3456 } 3457 3458 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3459 { 3460 struct bpf_iter_meta meta; 3461 struct bpf_prog *prog; 3462 struct sock *sk = v; 3463 uid_t uid; 3464 bool slow; 3465 int ret; 3466 3467 if (v == SEQ_START_TOKEN) 3468 return 0; 3469 3470 slow = lock_sock_fast(sk); 3471 3472 if (unlikely(sk_unhashed(sk))) { 3473 ret = SEQ_SKIP; 3474 goto unlock; 3475 } 3476 3477 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3478 meta.seq = seq; 3479 prog = bpf_iter_get_info(&meta, false); 3480 ret = unix_prog_seq_show(prog, &meta, v, uid); 3481 unlock: 3482 unlock_sock_fast(sk, slow); 3483 return ret; 3484 } 3485 3486 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3487 { 3488 struct bpf_unix_iter_state *iter = seq->private; 3489 struct bpf_iter_meta meta; 3490 struct bpf_prog *prog; 3491 3492 if (!v) { 3493 meta.seq = seq; 3494 prog = bpf_iter_get_info(&meta, true); 3495 if (prog) 3496 (void)unix_prog_seq_show(prog, &meta, v, 0); 3497 } 3498 3499 if (iter->cur_sk < iter->end_sk) 3500 bpf_iter_unix_put_batch(iter); 3501 } 3502 3503 static const struct seq_operations bpf_iter_unix_seq_ops = { 3504 .start = bpf_iter_unix_seq_start, 3505 .next = bpf_iter_unix_seq_next, 3506 .stop = bpf_iter_unix_seq_stop, 3507 .show = bpf_iter_unix_seq_show, 3508 }; 3509 #endif 3510 #endif 3511 3512 static 
const struct net_proto_family unix_family_ops = { 3513 .family = PF_UNIX, 3514 .create = unix_create, 3515 .owner = THIS_MODULE, 3516 }; 3517 3518 3519 static int __net_init unix_net_init(struct net *net) 3520 { 3521 int i; 3522 3523 net->unx.sysctl_max_dgram_qlen = 10; 3524 if (unix_sysctl_register(net)) 3525 goto out; 3526 3527 #ifdef CONFIG_PROC_FS 3528 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3529 sizeof(struct seq_net_private))) 3530 goto err_sysctl; 3531 #endif 3532 3533 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3534 sizeof(spinlock_t), GFP_KERNEL); 3535 if (!net->unx.table.locks) 3536 goto err_proc; 3537 3538 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3539 sizeof(struct hlist_head), 3540 GFP_KERNEL); 3541 if (!net->unx.table.buckets) 3542 goto free_locks; 3543 3544 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3545 spin_lock_init(&net->unx.table.locks[i]); 3546 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3547 } 3548 3549 return 0; 3550 3551 free_locks: 3552 kvfree(net->unx.table.locks); 3553 err_proc: 3554 #ifdef CONFIG_PROC_FS 3555 remove_proc_entry("unix", net->proc_net); 3556 err_sysctl: 3557 #endif 3558 unix_sysctl_unregister(net); 3559 out: 3560 return -ENOMEM; 3561 } 3562 3563 static void __net_exit unix_net_exit(struct net *net) 3564 { 3565 kvfree(net->unx.table.buckets); 3566 kvfree(net->unx.table.locks); 3567 unix_sysctl_unregister(net); 3568 remove_proc_entry("unix", net->proc_net); 3569 } 3570 3571 static struct pernet_operations unix_net_ops = { 3572 .init = unix_net_init, 3573 .exit = unix_net_exit, 3574 }; 3575 3576 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3577 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3578 struct unix_sock *unix_sk, uid_t uid) 3579 3580 #define INIT_BATCH_SZ 16 3581 3582 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3583 { 3584 struct bpf_unix_iter_state *iter = priv_data; 3585 int err; 3586 3587 
err = bpf_iter_init_seq_net(priv_data, aux); 3588 if (err) 3589 return err; 3590 3591 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3592 if (err) { 3593 bpf_iter_fini_seq_net(priv_data); 3594 return err; 3595 } 3596 3597 return 0; 3598 } 3599 3600 static void bpf_iter_fini_unix(void *priv_data) 3601 { 3602 struct bpf_unix_iter_state *iter = priv_data; 3603 3604 bpf_iter_fini_seq_net(priv_data); 3605 kvfree(iter->batch); 3606 } 3607 3608 static const struct bpf_iter_seq_info unix_seq_info = { 3609 .seq_ops = &bpf_iter_unix_seq_ops, 3610 .init_seq_private = bpf_iter_init_unix, 3611 .fini_seq_private = bpf_iter_fini_unix, 3612 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3613 }; 3614 3615 static const struct bpf_func_proto * 3616 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3617 const struct bpf_prog *prog) 3618 { 3619 switch (func_id) { 3620 case BPF_FUNC_setsockopt: 3621 return &bpf_sk_setsockopt_proto; 3622 case BPF_FUNC_getsockopt: 3623 return &bpf_sk_getsockopt_proto; 3624 default: 3625 return NULL; 3626 } 3627 } 3628 3629 static struct bpf_iter_reg unix_reg_info = { 3630 .target = "unix", 3631 .ctx_arg_info_size = 1, 3632 .ctx_arg_info = { 3633 { offsetof(struct bpf_iter__unix, unix_sk), 3634 PTR_TO_BTF_ID_OR_NULL }, 3635 }, 3636 .get_func_proto = bpf_iter_unix_get_func_proto, 3637 .seq_info = &unix_seq_info, 3638 }; 3639 3640 static void __init bpf_iter_register(void) 3641 { 3642 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3643 if (bpf_iter_reg_target(&unix_reg_info)) 3644 pr_warn("Warning: could not register bpf iterator unix\n"); 3645 } 3646 #endif 3647 3648 static int __init af_unix_init(void) 3649 { 3650 int i, rc = -1; 3651 3652 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3653 3654 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3655 spin_lock_init(&bsd_socket_locks[i]); 3656 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3657 } 3658 3659 rc = 
proto_register(&unix_dgram_proto, 1); 3660 if (rc != 0) { 3661 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3662 goto out; 3663 } 3664 3665 rc = proto_register(&unix_stream_proto, 1); 3666 if (rc != 0) { 3667 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3668 proto_unregister(&unix_dgram_proto); 3669 goto out; 3670 } 3671 3672 sock_register(&unix_family_ops); 3673 register_pernet_subsys(&unix_net_ops); 3674 unix_bpf_build_proto(); 3675 3676 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3677 bpf_iter_register(); 3678 #endif 3679 3680 out: 3681 return rc; 3682 } 3683 3684 static void __exit af_unix_exit(void) 3685 { 3686 sock_unregister(PF_UNIX); 3687 proto_unregister(&unix_dgram_proto); 3688 proto_unregister(&unix_stream_proto); 3689 unregister_pernet_subsys(&unix_net_ops); 3690 } 3691 3692 /* Earlier than device_initcall() so that other drivers invoking 3693 request_module() don't end up in a loop when modprobe tries 3694 to use a UNIX socket. But later than subsys_initcall() because 3695 we depend on stuff initialised there */ 3696 fs_initcall(af_unix_init); 3697 module_exit(af_unix_exit); 3698 3699 MODULE_LICENSE("GPL"); 3700 MODULE_ALIAS_NETPROTO(PF_UNIX); 3701