// SPDX-License-Identifier: GPL-2.0-only
/*
 * VMware vSockets Driver
 *
 * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
 */

/* Implementation notes:
 *
 * - There are two kinds of sockets: those created by user action (such as
 * calling socket(2)) and those created by incoming connection request packets.
 *
 * - There are two "global" tables, one for bound sockets (sockets that have
 * specified an address that they are responsible for) and one for connected
 * sockets (sockets that have established a connection with another socket).
 * These tables are "global" in that all sockets on the system are placed
 * within them. - Note, though, that the bound table contains an extra entry
 * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in
 * that list. The bound table is used solely for lookup of sockets when packets
 * are received and that's not necessary for SOCK_DGRAM sockets since we create
 * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM
 * sockets out of the bound hash buckets will reduce the chance of collisions
 * when looking for SOCK_STREAM sockets and prevents us from having to check
 * the socket type in the hash table lookups.
 *
 * - Sockets created by user action will either be "client" sockets that
 * initiate a connection or "server" sockets that listen for connections; we do
 * not support simultaneous connects (two "client" sockets connecting).
 *
 * - "Server" sockets are referred to as listener sockets throughout this
 * implementation because they are in the TCP_LISTEN state. When a
 * connection request is received (the second kind of socket mentioned above),
 * we create a new socket and refer to it as a pending socket. These pending
 * sockets are placed on the pending connection list of the listener socket.
 * When future packets are received for the address the listener socket is
 * bound to, we check if the source of the packet is from one that has an
 * existing pending connection. If it does, we process the packet for the
 * pending socket. When that socket reaches the connected state, it is removed
 * from the listener socket's pending list and enqueued in the listener
 * socket's accept queue. Callers of accept(2) will accept connected sockets
 * from the listener socket's accept queue. If the socket cannot be accepted
 * for some reason then it is marked rejected. Once the connection is
 * accepted, it is owned by the user process and the responsibility for cleanup
 * falls with that user process.
 *
 * - It is possible that these pending sockets will never reach the connected
 * state; in fact, we may never receive another packet after the connection
 * request. Because of this, we must schedule a cleanup function to run in the
 * future, after some amount of time passes where a connection should have been
 * established. This function ensures that the socket is off all lists so it
 * cannot be retrieved, then drops all references to the socket so it is
 * cleaned up (sock_put() -> sk_free() -> our sk_destruct implementation). Note
 * this function will also clean up rejected sockets, those that reach the
 * connected state but leave it before they have been accepted.
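 *
 * - For illustration only (not part of this driver's code): the lifecycle
 * above maps onto the usual userspace socket calls. A minimal sketch,
 * assuming a hypothetical service port 1234 and a hypothetical peer CID 3:
 *
 *     // listener ("server") side
 *     int s = socket(AF_VSOCK, SOCK_STREAM, 0);
 *     struct sockaddr_vm addr = { .svm_family = AF_VSOCK,
 *                                 .svm_cid = VMADDR_CID_ANY,
 *                                 .svm_port = 1234 };
 *     bind(s, (struct sockaddr *)&addr, sizeof(addr));
 *     listen(s, 8);
 *     int c = accept(s, NULL, NULL);   // dequeued from the accept queue
 *
 *     // connecting ("client") side
 *     int s = socket(AF_VSOCK, SOCK_STREAM, 0);
 *     struct sockaddr_vm addr = { .svm_family = AF_VSOCK,
 *                                 .svm_cid = 3, .svm_port = 1234 };
 *     connect(s, (struct sockaddr *)&addr, sizeof(addr));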
 *
 * - Lock ordering for pending or accept queue sockets is:
 *
 *     lock_sock(listener);
 *     lock_sock_nested(pending, SINGLE_DEPTH_NESTING);
 *
 * Using explicit nested locking keeps lockdep happy since normally only one
 * lock of a given class may be taken at a time.
 *
 * - Sockets created by user action will be cleaned up when the user process
 * calls close(2), causing our release implementation to be called. Our release
 * implementation will perform some cleanup then drop the last reference so our
 * sk_destruct implementation is invoked. Our sk_destruct implementation will
 * perform additional cleanup that's common for both types of sockets.
 *
 * - A socket's reference count is what ensures that the structure won't be
 * freed. Each entry in a list (such as the "global" bound and connected tables
 * and the listener socket's pending list and connected queue) ensures a
 * reference. When we defer work until process context and pass a socket as our
 * argument, we must ensure the reference count is increased to ensure the
 * socket isn't freed before the function is run; the deferred function will
 * then drop the reference.
 *
 * - sk->sk_state uses the TCP state constants because they are widely used by
 * other address families and exposed to userspace tools like ss(8):
 *
 *   TCP_CLOSE - unconnected
 *   TCP_SYN_SENT - connecting
 *   TCP_ESTABLISHED - connected
 *   TCP_CLOSING - disconnecting
 *   TCP_LISTEN - listening
 */

#include <linux/compat.h>
#include <linux/types.h>
#include <linux/bitops.h>
#include <linux/cred.h>
#include <linux/errqueue.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/net.h>
#include <linux/poll.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/smp.h>
#include <linux/socket.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <net/sock.h>
#include <net/af_vsock.h>
#include <uapi/linux/vm_sockets.h>

static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
static void vsock_sk_destruct(struct sock *sk);
static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
static void vsock_close(struct sock *sk, long timeout);

/* Protocol family. */
struct proto vsock_proto = {
	.name = "AF_VSOCK",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct vsock_sock),
	.close = vsock_close,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot = vsock_bpf_update_proto,
#endif
};

/* The default peer timeout indicates how long we will wait for a peer response
 * to a control message.
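 *
 * (HZ is the number of jiffies per second, so 2 * HZ corresponds to two
 * seconds. The value can be overridden per socket with the
 * SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW/_OLD options handled in
 * vsock_connectible_setsockopt() below.)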
 */
#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)

#define VSOCK_DEFAULT_BUFFER_SIZE     (1024 * 256)
#define VSOCK_DEFAULT_BUFFER_MAX_SIZE (1024 * 256)
#define VSOCK_DEFAULT_BUFFER_MIN_SIZE 128

/* Transport used for host->guest communication */
static const struct vsock_transport *transport_h2g;
/* Transport used for guest->host communication */
static const struct vsock_transport *transport_g2h;
/* Transport used for DGRAM communication */
static const struct vsock_transport *transport_dgram;
/* Transport used for local communication */
static const struct vsock_transport *transport_local;
static DEFINE_MUTEX(vsock_register_mutex);

/**** UTILS ****/

/* Each bound VSocket is stored in the bind hash table and each connected
 * VSocket is stored in the connected hash table.
 *
 * Unbound sockets are all put on the same list attached to the end of the hash
 * table (vsock_unbound_sockets). Bound sockets are added to the hash table in
 * the bucket that their local address hashes to (vsock_bound_sockets(addr)
 * represents the list that addr hashes to).
 *
 * Specifically, we initialize the vsock_bind_table array to a size of
 * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through
 * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and
 * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function
 * mods with VSOCK_HASH_SIZE to ensure this.
 */
#define MAX_PORT_RETRIES	24

#define VSOCK_HASH(addr)	((addr)->svm_port % VSOCK_HASH_SIZE)
#define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)])
#define vsock_unbound_sockets	  (&vsock_bind_table[VSOCK_HASH_SIZE])

/* XXX This can probably be implemented in a better way. */
#define VSOCK_CONN_HASH(src, dst) \
	(((src)->svm_cid ^ (dst)->svm_port) % VSOCK_HASH_SIZE)
#define vsock_connected_sockets(src, dst) \
	(&vsock_connected_table[VSOCK_CONN_HASH(src, dst)])
#define vsock_connected_sockets_vsk(vsk) \
	vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr)

struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1];
EXPORT_SYMBOL_GPL(vsock_bind_table);
struct list_head vsock_connected_table[VSOCK_HASH_SIZE];
EXPORT_SYMBOL_GPL(vsock_connected_table);
DEFINE_SPINLOCK(vsock_table_lock);
EXPORT_SYMBOL_GPL(vsock_table_lock);

/* Autobind this socket to the local address if necessary. */
static int vsock_auto_bind(struct vsock_sock *vsk)
{
	struct sock *sk = sk_vsock(vsk);
	struct sockaddr_vm local_addr;

	if (vsock_addr_bound(&vsk->local_addr))
		return 0;
	vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
	return __vsock_bind(sk, &local_addr);
}

static void vsock_init_tables(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++)
		INIT_LIST_HEAD(&vsock_bind_table[i]);

	for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++)
		INIT_LIST_HEAD(&vsock_connected_table[i]);
}

static void __vsock_insert_bound(struct list_head *list,
				 struct vsock_sock *vsk)
{
	sock_hold(&vsk->sk);
	list_add(&vsk->bound_table, list);
}

static void __vsock_insert_connected(struct list_head *list,
				     struct vsock_sock *vsk)
{
	sock_hold(&vsk->sk);
	list_add(&vsk->connected_table, list);
}

static void __vsock_remove_bound(struct vsock_sock *vsk)
{
	list_del_init(&vsk->bound_table);
	sock_put(&vsk->sk);
}

static void __vsock_remove_connected(struct vsock_sock *vsk)
{
	list_del_init(&vsk->connected_table);
	sock_put(&vsk->sk);
}

static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
{
	struct vsock_sock *vsk;

	list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) {
		if (vsock_addr_equals_addr(addr, &vsk->local_addr))
			return sk_vsock(vsk);

		if (addr->svm_port == vsk->local_addr.svm_port &&
		    (vsk->local_addr.svm_cid == VMADDR_CID_ANY ||
		     addr->svm_cid == VMADDR_CID_ANY))
			return sk_vsock(vsk);
	}

	return NULL;
}

static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src,
						  struct sockaddr_vm *dst)
{
	struct vsock_sock *vsk;

	list_for_each_entry(vsk, vsock_connected_sockets(src, dst),
			    connected_table) {
		if (vsock_addr_equals_addr(src, &vsk->remote_addr) &&
		    dst->svm_port == vsk->local_addr.svm_port) {
			return sk_vsock(vsk);
		}
	}

	return NULL;
}

static void vsock_insert_unbound(struct vsock_sock *vsk)
{
	spin_lock_bh(&vsock_table_lock);
	__vsock_insert_bound(vsock_unbound_sockets, vsk);
	spin_unlock_bh(&vsock_table_lock);
}

void vsock_insert_connected(struct vsock_sock *vsk)
{
	struct list_head *list = vsock_connected_sockets(
		&vsk->remote_addr, &vsk->local_addr);

	spin_lock_bh(&vsock_table_lock);
	__vsock_insert_connected(list, vsk);
	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_insert_connected);

void vsock_remove_bound(struct vsock_sock *vsk)
{
	spin_lock_bh(&vsock_table_lock);
	if (__vsock_in_bound_table(vsk))
		__vsock_remove_bound(vsk);
	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_remove_bound);

void vsock_remove_connected(struct vsock_sock *vsk)
{
	spin_lock_bh(&vsock_table_lock);
	if (__vsock_in_connected_table(vsk))
		__vsock_remove_connected(vsk);
	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_remove_connected);

struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
{
	struct sock *sk;

	spin_lock_bh(&vsock_table_lock);
	sk = __vsock_find_bound_socket(addr);
	if (sk)
		sock_hold(sk);

	spin_unlock_bh(&vsock_table_lock);

	return sk;
}
EXPORT_SYMBOL_GPL(vsock_find_bound_socket);

struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
					 struct sockaddr_vm *dst)
{
	struct sock *sk;

	spin_lock_bh(&vsock_table_lock);
	sk = __vsock_find_connected_socket(src, dst);
	if (sk)
		sock_hold(sk);

	spin_unlock_bh(&vsock_table_lock);

	return sk;
}
EXPORT_SYMBOL_GPL(vsock_find_connected_socket);

void vsock_remove_sock(struct vsock_sock *vsk)
{
	vsock_remove_bound(vsk);
	vsock_remove_connected(vsk);
}
EXPORT_SYMBOL_GPL(vsock_remove_sock);

void vsock_for_each_connected_socket(struct vsock_transport *transport,
				     void (*fn)(struct sock *sk))
{
	int i;

	spin_lock_bh(&vsock_table_lock);

	for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) {
		struct vsock_sock *vsk;
		list_for_each_entry(vsk, &vsock_connected_table[i],
				    connected_table) {
			if (vsk->transport != transport)
				continue;

			fn(sk_vsock(vsk));
		}
	}

	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket);

void vsock_add_pending(struct sock *listener, struct sock *pending)
{
	struct vsock_sock *vlistener;
	struct vsock_sock *vpending;

	vlistener = vsock_sk(listener);
	vpending = vsock_sk(pending);

	sock_hold(pending);
	sock_hold(listener);
	list_add_tail(&vpending->pending_links, &vlistener->pending_links);
}
EXPORT_SYMBOL_GPL(vsock_add_pending);

void vsock_remove_pending(struct sock *listener, struct sock *pending)
{
	struct vsock_sock *vpending = vsock_sk(pending);

	list_del_init(&vpending->pending_links);
	sock_put(listener);
	sock_put(pending);
}
EXPORT_SYMBOL_GPL(vsock_remove_pending);

void vsock_enqueue_accept(struct sock *listener, struct sock *connected)
{
	struct vsock_sock *vlistener;
	struct vsock_sock *vconnected;

	vlistener = vsock_sk(listener);
	vconnected = vsock_sk(connected);

	sock_hold(connected);
	sock_hold(listener);
	list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue);
}
EXPORT_SYMBOL_GPL(vsock_enqueue_accept);

static bool vsock_use_local_transport(unsigned int remote_cid)
{
	if (!transport_local)
		return false;

	if (remote_cid == VMADDR_CID_LOCAL)
		return true;

	if (transport_g2h) {
		return remote_cid == transport_g2h->get_local_cid();
	} else {
		return remote_cid == VMADDR_CID_HOST;
	}
}

static void vsock_deassign_transport(struct vsock_sock *vsk)
{
	if (!vsk->transport)
		return;

	vsk->transport->destruct(vsk);
	module_put(vsk->transport->module);
	vsk->transport = NULL;
}

/* Assign a transport to a socket and call the .init transport callback.
 *
 * Note: for connection oriented socket this must be called when vsk->remote_addr
 * is set (e.g. during the connect() or when a connection request on a listener
 * socket is received).
 * The vsk->remote_addr is used to decide which transport to use:
 *  - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if
 *    g2h is not loaded, will use local transport;
 *  - remote CID <= VMADDR_CID_HOST or h2g is not loaded or remote flags field
 *    includes VMADDR_FLAG_TO_HOST flag value, will use guest->host transport;
 *  - remote CID > VMADDR_CID_HOST will use host->guest transport;
 */
int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
{
	const struct vsock_transport *new_transport;
	struct sock *sk = sk_vsock(vsk);
	unsigned int remote_cid = vsk->remote_addr.svm_cid;
	__u8 remote_flags;
	int ret;

	/* If the packet is coming with the source and destination CIDs higher
	 * than VMADDR_CID_HOST, then a vsock channel where all the packets are
	 * forwarded to the host should be established. Then the host will
	 * need to forward the packets to the guest.
	 *
	 * The flag is set on the (listen) receive path (psk is not NULL). On
	 * the connect path the flag can be set by the user space application.
	 */
	if (psk && vsk->local_addr.svm_cid > VMADDR_CID_HOST &&
	    vsk->remote_addr.svm_cid > VMADDR_CID_HOST)
		vsk->remote_addr.svm_flags |= VMADDR_FLAG_TO_HOST;

	remote_flags = vsk->remote_addr.svm_flags;

	switch (sk->sk_type) {
	case SOCK_DGRAM:
		new_transport = transport_dgram;
		break;
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		if (vsock_use_local_transport(remote_cid))
			new_transport = transport_local;
		else if (remote_cid <= VMADDR_CID_HOST || !transport_h2g ||
			 (remote_flags & VMADDR_FLAG_TO_HOST))
			new_transport = transport_g2h;
		else
			new_transport = transport_h2g;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	if (vsk->transport) {
		if (vsk->transport == new_transport)
			return 0;

		/* transport->release() must be called with sock lock acquired.
		 * This path can only be taken during vsock_connect(), where we
		 * have already held the sock lock. In the other cases, this
		 * function is called on a new socket which is not assigned to
		 * any transport.
		 */
		vsk->transport->release(vsk);
		vsock_deassign_transport(vsk);

		/* transport's release() and destruct() can touch some socket
		 * state, since we are reassigning the socket to a new transport
		 * during vsock_connect(), let's reset these fields to have a
		 * clean state.
		 */
		sock_reset_flag(sk, SOCK_DONE);
		sk->sk_state = TCP_CLOSE;
		vsk->peer_shutdown = 0;
	}

	/* We increase the module refcnt to prevent the transport unloading
	 * while there are open sockets assigned to it.
	 */
	if (!new_transport || !try_module_get(new_transport->module))
		return -ENODEV;

	if (sk->sk_type == SOCK_SEQPACKET) {
		if (!new_transport->seqpacket_allow ||
		    !new_transport->seqpacket_allow(remote_cid)) {
			module_put(new_transport->module);
			return -ESOCKTNOSUPPORT;
		}
	}

	ret = new_transport->init(vsk, psk);
	if (ret) {
		module_put(new_transport->module);
		return ret;
	}

	vsk->transport = new_transport;

	return 0;
}
EXPORT_SYMBOL_GPL(vsock_assign_transport);

bool vsock_find_cid(unsigned int cid)
{
	if (transport_g2h && cid == transport_g2h->get_local_cid())
		return true;

	if (transport_h2g && cid == VMADDR_CID_HOST)
		return true;

	if (transport_local && cid == VMADDR_CID_LOCAL)
		return true;

	return false;
}
EXPORT_SYMBOL_GPL(vsock_find_cid);

static struct sock *vsock_dequeue_accept(struct sock *listener)
{
	struct vsock_sock *vlistener;
	struct vsock_sock *vconnected;

	vlistener = vsock_sk(listener);

	if (list_empty(&vlistener->accept_queue))
		return NULL;

	vconnected = list_entry(vlistener->accept_queue.next,
				struct vsock_sock, accept_queue);

	list_del_init(&vconnected->accept_queue);
	sock_put(listener);
	/* The caller will need a reference on the connected socket so we let
	 * it call sock_put().
	 */

	return sk_vsock(vconnected);
}

static bool vsock_is_accept_queue_empty(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	return list_empty(&vsk->accept_queue);
}

static bool vsock_is_pending(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	return !list_empty(&vsk->pending_links);
}

static int vsock_send_shutdown(struct sock *sk, int mode)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (!vsk->transport)
		return -ENODEV;

	return vsk->transport->shutdown(vsk, mode);
}

static void vsock_pending_work(struct work_struct *work)
{
	struct sock *sk;
	struct sock *listener;
	struct vsock_sock *vsk;
	bool cleanup;

	vsk = container_of(work, struct vsock_sock, pending_work.work);
	sk = sk_vsock(vsk);
	listener = vsk->listener;
	cleanup = true;

	lock_sock(listener);
	lock_sock_nested(sk, SINGLE_DEPTH_NESTING);

	if (vsock_is_pending(sk)) {
		vsock_remove_pending(listener, sk);

		sk_acceptq_removed(listener);
	} else if (!vsk->rejected) {
		/* We are not on the pending list and accept() did not reject
		 * us, so we must have been accepted by our user process.  We
		 * just need to drop our references to the sockets and be on
		 * our way.
		 */
		cleanup = false;
		goto out;
	}

	/* We need to remove ourself from the global connected sockets list so
	 * incoming packets can't find this socket, and to reduce the reference
	 * count.
	 */
	vsock_remove_connected(vsk);

	sk->sk_state = TCP_CLOSE;

out:
	release_sock(sk);
	release_sock(listener);
	if (cleanup)
		sock_put(sk);

	sock_put(sk);
	sock_put(listener);
}

/**** SOCKET OPERATIONS ****/

static int __vsock_bind_connectible(struct vsock_sock *vsk,
				    struct sockaddr_vm *addr)
{
	static u32 port;
	struct sockaddr_vm new_addr;

	if (!port)
		port = get_random_u32_above(LAST_RESERVED_PORT);

	vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port);

	if (addr->svm_port == VMADDR_PORT_ANY) {
		bool found = false;
		unsigned int i;

		for (i = 0; i < MAX_PORT_RETRIES; i++) {
			if (port <= LAST_RESERVED_PORT)
				port = LAST_RESERVED_PORT + 1;

			new_addr.svm_port = port++;

			if (!__vsock_find_bound_socket(&new_addr)) {
				found = true;
				break;
			}
		}

		if (!found)
			return -EADDRNOTAVAIL;
	} else {
		/* If port is in reserved range, ensure caller
		 * has necessary privileges.
		 */
		if (addr->svm_port <= LAST_RESERVED_PORT &&
		    !capable(CAP_NET_BIND_SERVICE)) {
			return -EACCES;
		}

		if (__vsock_find_bound_socket(&new_addr))
			return -EADDRINUSE;
	}

	vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port);

	/* Remove connection oriented sockets from the unbound list and add them
	 * to the hash table for easy lookup by its address.  The unbound list
	 * is simply an extra entry at the end of the hash table, a trick used
	 * by AF_UNIX.
	 */
	__vsock_remove_bound(vsk);
	__vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk);

	return 0;
}

static int __vsock_bind_dgram(struct vsock_sock *vsk,
			      struct sockaddr_vm *addr)
{
	return vsk->transport->dgram_bind(vsk, addr);
}

static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	int retval;

	/* First ensure this socket isn't already bound. */
	if (vsock_addr_bound(&vsk->local_addr))
		return -EINVAL;

	/* Now bind to the provided address or select appropriate values if
	 * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY).  Note that
	 * like AF_INET prevents binding to a non-local IP address (in most
	 * cases), we only allow binding to a local CID.
	 */
	if (addr->svm_cid != VMADDR_CID_ANY && !vsock_find_cid(addr->svm_cid))
		return -EADDRNOTAVAIL;

	switch (sk->sk_socket->type) {
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		spin_lock_bh(&vsock_table_lock);
		retval = __vsock_bind_connectible(vsk, addr);
		spin_unlock_bh(&vsock_table_lock);
		break;

	case SOCK_DGRAM:
		retval = __vsock_bind_dgram(vsk, addr);
		break;

	default:
		retval = -EINVAL;
		break;
	}

	return retval;
}

static void vsock_connect_timeout(struct work_struct *work);

static struct sock *__vsock_create(struct net *net,
				   struct socket *sock,
				   struct sock *parent,
				   gfp_t priority,
				   unsigned short type,
				   int kern)
{
	struct sock *sk;
	struct vsock_sock *psk;
	struct vsock_sock *vsk;

	sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk);

	/* sk->sk_type is normally set in sock_init_data, but only if sock is
	 * non-NULL. We make sure that our sockets always have a type by
	 * setting it here if needed.
	 */
	if (!sock)
		sk->sk_type = type;

	vsk = vsock_sk(sk);
	vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
	vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);

	sk->sk_destruct = vsock_sk_destruct;
	sk->sk_backlog_rcv = vsock_queue_rcv_skb;
	sock_reset_flag(sk, SOCK_DONE);

	INIT_LIST_HEAD(&vsk->bound_table);
	INIT_LIST_HEAD(&vsk->connected_table);
	vsk->listener = NULL;
	INIT_LIST_HEAD(&vsk->pending_links);
	INIT_LIST_HEAD(&vsk->accept_queue);
	vsk->rejected = false;
	vsk->sent_request = false;
	vsk->ignore_connecting_rst = false;
	vsk->peer_shutdown = 0;
	INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout);
	INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work);

	psk = parent ? vsock_sk(parent) : NULL;
	if (parent) {
		vsk->trusted = psk->trusted;
		vsk->owner = get_cred(psk->owner);
		vsk->connect_timeout = psk->connect_timeout;
		vsk->buffer_size = psk->buffer_size;
		vsk->buffer_min_size = psk->buffer_min_size;
		vsk->buffer_max_size = psk->buffer_max_size;
		security_sk_clone(parent, sk);
	} else {
		vsk->trusted = ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN);
		vsk->owner = get_current_cred();
		vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT;
		vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE;
		vsk->buffer_min_size = VSOCK_DEFAULT_BUFFER_MIN_SIZE;
		vsk->buffer_max_size = VSOCK_DEFAULT_BUFFER_MAX_SIZE;
	}

	return sk;
}

static bool sock_type_connectible(u16 type)
{
	return (type == SOCK_STREAM) || (type == SOCK_SEQPACKET);
}

static void __vsock_release(struct sock *sk, int level)
{
	struct vsock_sock *vsk;
	struct sock *pending;

	vsk = vsock_sk(sk);
	pending = NULL;	/* Compiler warning. */

	/* When "level" is SINGLE_DEPTH_NESTING, use the nested
	 * version to avoid the warning "possible recursive locking
	 * detected". When "level" is 0, lock_sock_nested(sk, level)
	 * is the same as lock_sock(sk).
	 */
	lock_sock_nested(sk, level);

	if (vsk->transport)
		vsk->transport->release(vsk);
	else if (sock_type_connectible(sk->sk_type))
		vsock_remove_sock(vsk);

	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	skb_queue_purge(&sk->sk_receive_queue);

	/* Clean up any sockets that never were accepted. */
	while ((pending = vsock_dequeue_accept(sk)) != NULL) {
		__vsock_release(pending, SINGLE_DEPTH_NESTING);
		sock_put(pending);
	}

	release_sock(sk);
	sock_put(sk);
}

static void vsock_sk_destruct(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	vsock_deassign_transport(vsk);

	/* When clearing these addresses, there's no need to set the family and
	 * possibly register the address family with the kernel.
	 */
	vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
	vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);

	put_cred(vsk->owner);
}

static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sock_queue_rcv_skb(sk, skb);
	if (err)
		kfree_skb(skb);

	return err;
}

struct sock *vsock_create_connected(struct sock *parent)
{
	return __vsock_create(sock_net(parent), NULL, parent, GFP_KERNEL,
			      parent->sk_type, 0);
}
EXPORT_SYMBOL_GPL(vsock_create_connected);

s64 vsock_stream_has_data(struct vsock_sock *vsk)
{
	if (WARN_ON(!vsk->transport))
		return 0;

	return vsk->transport->stream_has_data(vsk);
}
EXPORT_SYMBOL_GPL(vsock_stream_has_data);

s64 vsock_connectible_has_data(struct vsock_sock *vsk)
{
	struct sock *sk = sk_vsock(vsk);

	if (WARN_ON(!vsk->transport))
		return 0;

	if (sk->sk_type == SOCK_SEQPACKET)
		return vsk->transport->seqpacket_has_data(vsk);
	else
		return vsock_stream_has_data(vsk);
}
EXPORT_SYMBOL_GPL(vsock_connectible_has_data);

s64 vsock_stream_has_space(struct vsock_sock *vsk)
{
	if (WARN_ON(!vsk->transport))
		return 0;

	return vsk->transport->stream_has_space(vsk);
}
EXPORT_SYMBOL_GPL(vsock_stream_has_space);

void vsock_data_ready(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (vsock_stream_has_data(vsk) >= sk->sk_rcvlowat ||
	    sock_flag(sk, SOCK_DONE))
		sk->sk_data_ready(sk);
}
EXPORT_SYMBOL_GPL(vsock_data_ready);

/* Dummy callback required by sockmap.
 * See unconditional call of saved_close() in sock_map_close().
 */
static void vsock_close(struct sock *sk, long timeout)
{
}

static int vsock_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	__vsock_release(sk, 0);
	sock->sk = NULL;
	sock->state = SS_FREE;

	return 0;
}

static int
vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	int err;
	struct sock *sk;
	struct sockaddr_vm *vm_addr;

	sk = sock->sk;

	if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0)
		return -EINVAL;

	lock_sock(sk);
	err = __vsock_bind(sk, vm_addr);
	release_sock(sk);

	return err;
}

static int vsock_getname(struct socket *sock,
			 struct sockaddr *addr, int peer)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *vm_addr;

	sk = sock->sk;
	vsk = vsock_sk(sk);
	err = 0;

	lock_sock(sk);

	if (peer) {
		if (sock->state != SS_CONNECTED) {
			err = -ENOTCONN;
			goto out;
		}
		vm_addr = &vsk->remote_addr;
	} else {
		vm_addr = &vsk->local_addr;
	}

	if (!vm_addr) {
		err = -EINVAL;
		goto out;
	}

	/* sys_getsockname() and sys_getpeername() pass us a
	 * MAX_SOCK_ADDR-sized buffer and don't set addr_len.  Unfortunately
	 * that macro is defined in socket.c instead of .h, so we hardcode its
	 * value here.
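	 *
	 * (For reference: struct sockaddr_vm is padded with svm_zero so that
	 * its size matches struct sockaddr, i.e. 16 bytes, so it fits well
	 * within that limit.)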
	 */
	BUILD_BUG_ON(sizeof(*vm_addr) > 128);
	memcpy(addr, vm_addr, sizeof(*vm_addr));
	err = sizeof(*vm_addr);

out:
	release_sock(sk);
	return err;
}

static int vsock_shutdown(struct socket *sock, int mode)
{
	int err;
	struct sock *sk;

	/* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses
	 * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode
	 * here like the other address families do.  Note also that the
	 * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3),
	 * which is what we want.
	 */
	mode++;

	if ((mode & ~SHUTDOWN_MASK) || !mode)
		return -EINVAL;

	/* If this is a connection oriented socket and it is not connected then
	 * bail out immediately.  If it is a DGRAM socket then we must first
	 * kick the socket so that it wakes up from any sleeping calls, for
	 * example recv(), and then afterwards return the error.
	 */

	sk = sock->sk;

	lock_sock(sk);
	if (sock->state == SS_UNCONNECTED) {
		err = -ENOTCONN;
		if (sock_type_connectible(sk->sk_type))
			goto out;
	} else {
		sock->state = SS_DISCONNECTING;
		err = 0;
	}

	/* Receive and send shutdowns are treated alike. */
	mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN);
	if (mode) {
		sk->sk_shutdown |= mode;
		sk->sk_state_change(sk);

		if (sock_type_connectible(sk->sk_type)) {
			sock_reset_flag(sk, SOCK_DONE);
			vsock_send_shutdown(sk, mode);
		}
	}

out:
	release_sock(sk);
	return err;
}

static __poll_t vsock_poll(struct file *file, struct socket *sock,
			   poll_table *wait)
{
	struct sock *sk;
	__poll_t mask;
	struct vsock_sock *vsk;

	sk = sock->sk;
	vsk = vsock_sk(sk);

	poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	if (sk->sk_err)
		/* Signify that there has been an error on this socket. */
		mask |= EPOLLERR;

	/* INET sockets treat local write shutdown and peer write shutdown as a
	 * case of EPOLLHUP set.
	 */
	if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
	    ((sk->sk_shutdown & SEND_SHUTDOWN) &&
	     (vsk->peer_shutdown & SEND_SHUTDOWN))) {
		mask |= EPOLLHUP;
	}

	if (sk->sk_shutdown & RCV_SHUTDOWN ||
	    vsk->peer_shutdown & SEND_SHUTDOWN) {
		mask |= EPOLLRDHUP;
	}

	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	if (sock->type == SOCK_DGRAM) {
		/* For datagram sockets we can read if there is something in
		 * the queue and write as long as the socket isn't shutdown for
		 * sending.
		 */
		if (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
		    (sk->sk_shutdown & RCV_SHUTDOWN)) {
			mask |= EPOLLIN | EPOLLRDNORM;
		}

		if (!(sk->sk_shutdown & SEND_SHUTDOWN))
			mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	} else if (sock_type_connectible(sk->sk_type)) {
		const struct vsock_transport *transport;

		lock_sock(sk);

		transport = vsk->transport;

		/* Listening sockets that have connections in their accept
		 * queue can be read.
		 */
		if (sk->sk_state == TCP_LISTEN
		    && !vsock_is_accept_queue_empty(sk))
			mask |= EPOLLIN | EPOLLRDNORM;

		/* If there is something in the queue then we can read. */
		if (transport && transport->stream_is_active(vsk) &&
		    !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			bool data_ready_now = false;
			int target = sock_rcvlowat(sk, 0, INT_MAX);
			int ret = transport->notify_poll_in(
					vsk, target, &data_ready_now);
			if (ret < 0) {
				mask |= EPOLLERR;
			} else {
				if (data_ready_now)
					mask |= EPOLLIN | EPOLLRDNORM;

			}
		}

		/* Sockets whose connections have been closed, reset, or
		 * terminated should also be considered read, and we check the
		 * shutdown flag for that.
		 */
		if (sk->sk_shutdown & RCV_SHUTDOWN ||
		    vsk->peer_shutdown & SEND_SHUTDOWN) {
			mask |= EPOLLIN | EPOLLRDNORM;
		}

		/* Connected sockets that can produce data can be written. */
		if (transport && sk->sk_state == TCP_ESTABLISHED) {
			if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
				bool space_avail_now = false;
				int ret = transport->notify_poll_out(
						vsk, 1, &space_avail_now);
				if (ret < 0) {
					mask |= EPOLLERR;
				} else {
					if (space_avail_now)
						/* Remove EPOLLWRBAND since INET
						 * sockets are not setting it.
						 */
						mask |= EPOLLOUT | EPOLLWRNORM;

				}
			}
		}

		/* Simulate INET socket poll behaviors, which sets
		 * EPOLLOUT|EPOLLWRNORM when peer is closed and nothing to read,
		 * but local send is not shutdown.
		 */
		if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) {
			if (!(sk->sk_shutdown & SEND_SHUTDOWN))
				mask |= EPOLLOUT | EPOLLWRNORM;

		}

		release_sock(sk);
	}

	return mask;
}

static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	return vsk->transport->read_skb(vsk, read_actor);
}

static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *remote_addr;
	const struct vsock_transport *transport;

	if (msg->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	/* For now, MSG_DONTWAIT is always assumed... */
	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	lock_sock(sk);

	transport = vsk->transport;

	err = vsock_auto_bind(vsk);
	if (err)
		goto out;


	/* If the provided message contains an address, use that.  Otherwise
	 * fall back on the socket's remote handle (if it has been connected).
	 */
	if (msg->msg_name &&
	    vsock_addr_cast(msg->msg_name, msg->msg_namelen,
			    &remote_addr) == 0) {
		/* Ensure this address is of the right type and is a valid
		 * destination.
		 */

		if (remote_addr->svm_cid == VMADDR_CID_ANY)
			remote_addr->svm_cid = transport->get_local_cid();

		if (!vsock_addr_bound(remote_addr)) {
			err = -EINVAL;
			goto out;
		}
	} else if (sock->state == SS_CONNECTED) {
		remote_addr = &vsk->remote_addr;

		if (remote_addr->svm_cid == VMADDR_CID_ANY)
			remote_addr->svm_cid = transport->get_local_cid();

		/* XXX Should connect() or this function ensure remote_addr is
		 * bound?
		 */
		if (!vsock_addr_bound(&vsk->remote_addr)) {
			err = -EINVAL;
			goto out;
		}
	} else {
		err = -EINVAL;
		goto out;
	}

	if (!transport->dgram_allow(remote_addr->svm_cid,
				    remote_addr->svm_port)) {
		err = -EINVAL;
		goto out;
	}

	err = transport->dgram_enqueue(vsk, remote_addr, msg, len);

out:
	release_sock(sk);
	return err;
}

static int vsock_dgram_connect(struct socket *sock,
			       struct sockaddr *addr, int addr_len, int flags)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *remote_addr;

	sk = sock->sk;
	vsk = vsock_sk(sk);

	err = vsock_addr_cast(addr, addr_len, &remote_addr);
	if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) {
		lock_sock(sk);
		vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY,
				VMADDR_PORT_ANY);
		sock->state = SS_UNCONNECTED;
		release_sock(sk);
		return 0;
	} else if (err != 0)
		return -EINVAL;

	lock_sock(sk);

	err = vsock_auto_bind(vsk);
	if (err)
		goto out;

	if (!vsk->transport->dgram_allow(remote_addr->svm_cid,
					 remote_addr->svm_port)) {
		err = -EINVAL;
		goto out;
	}

	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
	sock->state = SS_CONNECTED;

	/* sock map disallows redirection of non-TCP sockets with sk_state !=
	 * TCP_ESTABLISHED (see sock_map_redirect_allowed()), so we set
	 * TCP_ESTABLISHED here to allow redirection of connected vsock dgrams.
	 *
	 * This doesn't seem to be abnormal state for datagram sockets, as the
	 * same approach can be seen in other datagram socket types as well
	 * (such as unix sockets).
	 */
	sk->sk_state = TCP_ESTABLISHED;

out:
	release_sock(sk);
	return err;
}

int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			  size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct vsock_sock *vsk = vsock_sk(sk);

	return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
}

int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			size_t len, int flags)
{
#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot;

	prot = READ_ONCE(sk->sk_prot);
	if (prot != &vsock_proto)
		return prot->recvmsg(sk, msg, len, flags, NULL);
#endif

	return __vsock_dgram_recvmsg(sock, msg, len, flags);
}
EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);

static const struct proto_ops vsock_dgram_ops = {
	.family = PF_VSOCK,
	.owner = THIS_MODULE,
	.release = vsock_release,
	.bind = vsock_bind,
	.connect = vsock_dgram_connect,
	.socketpair = sock_no_socketpair,
	.accept = sock_no_accept,
	.getname = vsock_getname,
	.poll = vsock_poll,
	.ioctl = sock_no_ioctl,
	.listen = sock_no_listen,
	.shutdown = vsock_shutdown,
	.sendmsg = vsock_dgram_sendmsg,
	.recvmsg = vsock_dgram_recvmsg,
	.mmap = sock_no_mmap,
	.read_skb = vsock_read_skb,
};

static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
{
	const struct vsock_transport *transport = vsk->transport;

	if (!transport || !transport->cancel_pkt)
		return -EOPNOTSUPP;

	return transport->cancel_pkt(vsk);
}

static void vsock_connect_timeout(struct work_struct *work)
{
	struct sock *sk;
	struct vsock_sock *vsk;

	vsk = container_of(work, struct vsock_sock, connect_work.work);
	sk = sk_vsock(vsk);

	lock_sock(sk);
	if (sk->sk_state == TCP_SYN_SENT &&
	    (sk->sk_shutdown != SHUTDOWN_MASK)) {
		sk->sk_state = TCP_CLOSE;
		sk->sk_socket->state = SS_UNCONNECTED;
		sk->sk_err = ETIMEDOUT;
		sk_error_report(sk);
		vsock_transport_cancel_pkt(vsk);
	}
	release_sock(sk);

	sock_put(sk);
}

static int vsock_connect(struct socket *sock, struct sockaddr *addr,
			 int addr_len, int flags)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	const struct vsock_transport *transport;
	struct sockaddr_vm *remote_addr;
	long timeout;
	DEFINE_WAIT(wait);

	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	lock_sock(sk);

	/* XXX AF_UNSPEC should make us disconnect like AF_INET. */
	switch (sock->state) {
	case SS_CONNECTED:
		err = -EISCONN;
		goto out;
	case SS_DISCONNECTING:
		err = -EINVAL;
		goto out;
	case SS_CONNECTING:
		/* This continues on so we can move sock into the SS_CONNECTED
		 * state once the connection has completed (at which point err
		 * will be set to zero also).  Otherwise, we will either wait
		 * for the connection or return -EALREADY should this be a
		 * non-blocking call.
		 */
		err = -EALREADY;
		if (flags & O_NONBLOCK)
			goto out;
		break;
	default:
		if ((sk->sk_state == TCP_LISTEN) ||
		    vsock_addr_cast(addr, addr_len, &remote_addr) != 0) {
			err = -EINVAL;
			goto out;
		}

		/* Set the remote address that we are connecting to. */
		memcpy(&vsk->remote_addr, remote_addr,
		       sizeof(vsk->remote_addr));

		err = vsock_assign_transport(vsk, NULL);
		if (err)
			goto out;

		transport = vsk->transport;

		/* The hypervisor and well-known contexts do not have socket
		 * endpoints.
		 */
		if (!transport ||
		    !transport->stream_allow(remote_addr->svm_cid,
					     remote_addr->svm_port)) {
			err = -ENETUNREACH;
			goto out;
		}

		err = vsock_auto_bind(vsk);
		if (err)
			goto out;

		sk->sk_state = TCP_SYN_SENT;

		err = transport->connect(vsk);
		if (err < 0)
			goto out;

		/* Mark sock as connecting and set the error code to in
		 * progress in case this is a non-blocking connect.
		 */
		sock->state = SS_CONNECTING;
		err = -EINPROGRESS;
	}

	/* The receive path will handle all communication until we are able to
	 * enter the connected state.  Here we wait for the connection to be
	 * completed or a notification of an error.
	 */
	timeout = vsk->connect_timeout;
	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

	while (sk->sk_state != TCP_ESTABLISHED && sk->sk_err == 0) {
		if (flags & O_NONBLOCK) {
			/* If we're not going to block, we schedule a timeout
			 * function to generate a timeout on the connection
			 * attempt, in case the peer doesn't respond in a
			 * timely manner. We hold on to the socket until the
			 * timeout fires.
			 */
			sock_hold(sk);

			/* If the timeout function is already scheduled,
			 * reschedule it, then ungrab the socket refcount to
			 * keep it balanced.
			 */
			if (mod_delayed_work(system_wq, &vsk->connect_work,
					     timeout))
				sock_put(sk);

			/* Skip ahead to preserve error code set above. */
			goto out_wait;
		}

		release_sock(sk);
		timeout = schedule_timeout(timeout);
		lock_sock(sk);

		if (signal_pending(current)) {
			err = sock_intr_errno(timeout);
			sk->sk_state = sk->sk_state == TCP_ESTABLISHED ? TCP_CLOSING : TCP_CLOSE;
			sock->state = SS_UNCONNECTED;
			vsock_transport_cancel_pkt(vsk);
			vsock_remove_connected(vsk);
			goto out_wait;
		} else if ((sk->sk_state != TCP_ESTABLISHED) && (timeout == 0)) {
			err = -ETIMEDOUT;
			sk->sk_state = TCP_CLOSE;
			sock->state = SS_UNCONNECTED;
			vsock_transport_cancel_pkt(vsk);
			goto out_wait;
		}

		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	}

	if (sk->sk_err) {
		err = -sk->sk_err;
		sk->sk_state = TCP_CLOSE;
		sock->state = SS_UNCONNECTED;
	} else {
		err = 0;
	}

out_wait:
	finish_wait(sk_sleep(sk), &wait);
out:
	release_sock(sk);
	return err;
}

static int vsock_accept(struct socket *sock, struct socket *newsock, int flags,
			bool kern)
{
	struct sock *listener;
	int err;
	struct sock *connected;
	struct vsock_sock *vconnected;
	long timeout;
	DEFINE_WAIT(wait);

	err = 0;
	listener = sock->sk;

	lock_sock(listener);

	if (!sock_type_connectible(sock->type)) {
		err = -EOPNOTSUPP;
		goto out;
	}

	if (listener->sk_state != TCP_LISTEN) {
		err = -EINVAL;
		goto out;
	}

	/* Wait for children sockets to appear; these are the new sockets
	 * created upon connection establishment.
	 */
	timeout = sock_rcvtimeo(listener, flags & O_NONBLOCK);
	prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);

	while ((connected = vsock_dequeue_accept(listener)) == NULL &&
	       listener->sk_err == 0) {
		release_sock(listener);
		timeout = schedule_timeout(timeout);
		finish_wait(sk_sleep(listener), &wait);
		lock_sock(listener);

		if (signal_pending(current)) {
			err = sock_intr_errno(timeout);
			goto out;
		} else if (timeout == 0) {
			err = -EAGAIN;
			goto out;
		}

		prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
	}
	finish_wait(sk_sleep(listener), &wait);

	if (listener->sk_err)
		err = -listener->sk_err;

	if (connected) {
		sk_acceptq_removed(listener);

		lock_sock_nested(connected, SINGLE_DEPTH_NESTING);
		vconnected = vsock_sk(connected);

		/* If the listener socket has received an error, then we should
		 * reject this socket and return.  Note that we simply mark the
		 * socket rejected, drop our reference, and let the cleanup
		 * function handle the cleanup; the fact that we found it in
		 * the listener's accept queue guarantees that the cleanup
		 * function hasn't run yet.
		 */
		if (err) {
			vconnected->rejected = true;
		} else {
			newsock->state = SS_CONNECTED;
			sock_graft(connected, newsock);
		}

		release_sock(connected);
		sock_put(connected);
	}

out:
	release_sock(listener);
	return err;
}

static int vsock_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;

	sk = sock->sk;

	lock_sock(sk);

	if (!sock_type_connectible(sk->sk_type)) {
		err = -EOPNOTSUPP;
		goto out;
	}

	if (sock->state != SS_UNCONNECTED) {
		err = -EINVAL;
		goto out;
	}

	vsk = vsock_sk(sk);

	if (!vsock_addr_bound(&vsk->local_addr)) {
		err = -EINVAL;
		goto out;
	}

	sk->sk_max_ack_backlog = backlog;
	sk->sk_state = TCP_LISTEN;

	err = 0;

out:
	release_sock(sk);
	return err;
}

static void vsock_update_buffer_size(struct vsock_sock *vsk,
				     const struct vsock_transport *transport,
				     u64 val)
{
	if (val > vsk->buffer_max_size)
		val = vsk->buffer_max_size;

	if (val < vsk->buffer_min_size)
		val = vsk->buffer_min_size;

	if (val != vsk->buffer_size &&
	    transport && transport->notify_buffer_size)
		transport->notify_buffer_size(vsk, &val);

	vsk->buffer_size = val;
}

static int vsock_connectible_setsockopt(struct socket *sock,
					int level,
					int optname,
					sockptr_t optval,
					unsigned int optlen)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	const struct vsock_transport *transport;
	u64 val;

	if (level != AF_VSOCK)
		return -ENOPROTOOPT;

#define COPY_IN(_v)							\
	do {								\
		if (optlen < sizeof(_v)) {				\
			err = -EINVAL;					\
			goto exit;					\
		}							\
		if (copy_from_sockptr(&_v, optval, sizeof(_v)) != 0) {	\
			err = -EFAULT;					\
			goto exit;					\
		}							\
	} while (0)

	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	lock_sock(sk);

	transport = vsk->transport;

	switch (optname) {
	case SO_VM_SOCKETS_BUFFER_SIZE:
		COPY_IN(val);
		vsock_update_buffer_size(vsk, transport, val);
		break;

	case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
		COPY_IN(val);
		vsk->buffer_max_size = val;
		vsock_update_buffer_size(vsk, transport, vsk->buffer_size);
		break;

	case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
		COPY_IN(val);
		vsk->buffer_min_size = val;
		vsock_update_buffer_size(vsk, transport, vsk->buffer_size);
		break;

	case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW:
	case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD: {
		struct __kernel_sock_timeval tv;

		err = sock_copy_user_timeval(&tv, optval, optlen,
					     optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD);
		if (err)
			break;
		if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC &&
		    tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) {
			vsk->connect_timeout = tv.tv_sec * HZ +
				DIV_ROUND_UP((unsigned long)tv.tv_usec, (USEC_PER_SEC / HZ));
			if (vsk->connect_timeout == 0)
				vsk->connect_timeout =
				    VSOCK_DEFAULT_CONNECT_TIMEOUT;

		} else {
			err = -ERANGE;
		}
		break;
	}

	default:
		err = -ENOPROTOOPT;
		break;
	}

#undef COPY_IN

exit:
	release_sock(sk);
	return err;
}

static int vsock_connectible_getsockopt(struct socket *sock,
					int level, int optname,
					char __user *optval,
					int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct vsock_sock *vsk = vsock_sk(sk);

	union {
		u64 val64;
		struct old_timeval32 tm32;
		struct __kernel_old_timeval tm;
		struct __kernel_sock_timeval stm;
	} v;

	int lv = sizeof(v.val64);
	int len;

	if (level != AF_VSOCK)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_VM_SOCKETS_BUFFER_SIZE:
		v.val64 = vsk->buffer_size;
		break;

	case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
		v.val64 = vsk->buffer_max_size;
		break;

	case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
		v.val64 = vsk->buffer_min_size;
		break;

	case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW:
	case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD:
		lv = sock_get_timeout(vsk->connect_timeout, &v,
				      optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD);
		break;

	default:
		return -ENOPROTOOPT;
	}

	if (len < lv)
		return -EINVAL;
	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;

	if (put_user(len, optlen))
		return -EFAULT;

	return 0;
}

static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg,
				     size_t len)
{
	struct sock *sk;
	struct vsock_sock *vsk;
	const struct vsock_transport *transport;
	ssize_t total_written;
	long timeout;
	int err;
	struct vsock_transport_send_notify_data send_data;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	sk = sock->sk;
	vsk = vsock_sk(sk);
	total_written = 0;
	err = 0;

	if (msg->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	lock_sock(sk);

	transport = vsk->transport;

	/* Callers should not provide a destination with connection oriented
	 * sockets.
	 */
	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out;
	}

	/* Send data only if both sides are not shutdown in the direction. */
	if (sk->sk_shutdown & SEND_SHUTDOWN ||
	    vsk->peer_shutdown & RCV_SHUTDOWN) {
		err = -EPIPE;
		goto out;
	}

	if (!transport || sk->sk_state != TCP_ESTABLISHED ||
	    !vsock_addr_bound(&vsk->local_addr)) {
		err = -ENOTCONN;
		goto out;
	}

	if (!vsock_addr_bound(&vsk->remote_addr)) {
		err = -EDESTADDRREQ;
		goto out;
	}

	/* Wait for room in the produce queue to enqueue our user's data. */
	timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	err = transport->notify_send_init(vsk, &send_data);
	if (err < 0)
		goto out;

	while (total_written < len) {
		ssize_t written;

		add_wait_queue(sk_sleep(sk), &wait);
		while (vsock_stream_has_space(vsk) == 0 &&
		       sk->sk_err == 0 &&
		       !(sk->sk_shutdown & SEND_SHUTDOWN) &&
		       !(vsk->peer_shutdown & RCV_SHUTDOWN)) {

			/* Don't wait for non-blocking sockets. */
			if (timeout == 0) {
				err = -EAGAIN;
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			}

			err = transport->notify_send_pre_block(vsk, &send_data);
			if (err < 0) {
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			}

			release_sock(sk);
			timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
			lock_sock(sk);
			if (signal_pending(current)) {
				err = sock_intr_errno(timeout);
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			} else if (timeout == 0) {
				err = -EAGAIN;
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			}
		}
		remove_wait_queue(sk_sleep(sk), &wait);

		/* These checks occur both as part of and after the loop
		 * conditional since we need to check before and after
		 * sleeping.
		 */
		if (sk->sk_err) {
			err = -sk->sk_err;
			goto out_err;
		} else if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
			   (vsk->peer_shutdown & RCV_SHUTDOWN)) {
			err = -EPIPE;
			goto out_err;
		}

		err = transport->notify_send_pre_enqueue(vsk, &send_data);
		if (err < 0)
			goto out_err;

		/* Note that enqueue will only write as many bytes as are free
		 * in the produce queue, so we don't need to ensure len is
		 * smaller than the queue size.  It is the caller's
		 * responsibility to check how many bytes we were able to send.
		 */

		if (sk->sk_type == SOCK_SEQPACKET) {
			written = transport->seqpacket_enqueue(vsk,
						msg, len - total_written);
		} else {
			written = transport->stream_enqueue(vsk,
						msg, len - total_written);
		}

		if (written < 0) {
			err = written;
			goto out_err;
		}

		total_written += written;

		err = transport->notify_send_post_enqueue(
				vsk, written, &send_data);
		if (err < 0)
			goto out_err;

	}

out_err:
	if (total_written > 0) {
		/* Return number of written bytes only if:
		 * 1) SOCK_STREAM socket.
		 * 2) SOCK_SEQPACKET socket when whole buffer is sent.
		 */
		if (sk->sk_type == SOCK_STREAM || total_written == len)
			err = total_written;
	}
out:
	release_sock(sk);
	return err;
}

static int vsock_connectible_wait_data(struct sock *sk,
				       struct wait_queue_entry *wait,
				       long timeout,
				       struct vsock_transport_recv_notify_data *recv_data,
				       size_t target)
{
	const struct vsock_transport *transport;
	struct vsock_sock *vsk;
	s64 data;
	int err;

	vsk = vsock_sk(sk);
	err = 0;
	transport = vsk->transport;

	while (1) {
		prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE);
		data = vsock_connectible_has_data(vsk);
		if (data != 0)
			break;

		if (sk->sk_err != 0 ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    (vsk->peer_shutdown & SEND_SHUTDOWN)) {
			break;
		}

		/* Don't wait for non-blocking sockets. */
		if (timeout == 0) {
			err = -EAGAIN;
			break;
		}

		if (recv_data) {
			err = transport->notify_recv_pre_block(vsk, target, recv_data);
			if (err < 0)
				break;
		}

		release_sock(sk);
		timeout = schedule_timeout(timeout);
		lock_sock(sk);

		if (signal_pending(current)) {
			err = sock_intr_errno(timeout);
			break;
		} else if (timeout == 0) {
			err = -EAGAIN;
			break;
		}
	}

	finish_wait(sk_sleep(sk), wait);

	if (err)
		return err;

	/* Internal transport error when checking for available
	 * data. XXX This should be changed to a connection
	 * reset in a later change.
	 */
	if (data < 0)
		return -ENOMEM;

	return data;
}

static int __vsock_stream_recvmsg(struct sock *sk, struct msghdr *msg,
				  size_t len, int flags)
{
	struct vsock_transport_recv_notify_data recv_data;
	const struct vsock_transport *transport;
	struct vsock_sock *vsk;
	ssize_t copied;
	size_t target;
	long timeout;
	int err;

	DEFINE_WAIT(wait);

	vsk = vsock_sk(sk);
	transport = vsk->transport;

	/* We must not copy less than target bytes into the user's buffer
	 * before returning successfully, so we wait for the consume queue to
	 * have that much data to consume before dequeueing.  Note that this
	 * makes it impossible to handle cases where target is greater than the
	 * queue size.
static int __vsock_stream_recvmsg(struct sock *sk, struct msghdr *msg,
				  size_t len, int flags)
{
	struct vsock_transport_recv_notify_data recv_data;
	const struct vsock_transport *transport;
	struct vsock_sock *vsk;
	ssize_t copied;
	size_t target;
	long timeout;
	int err;

	DEFINE_WAIT(wait);

	vsk = vsock_sk(sk);
	transport = vsk->transport;

	/* We must not copy less than target bytes into the user's buffer
	 * before returning successfully, so we wait for the consume queue to
	 * have that much data to consume before dequeueing. Note that this
	 * makes it impossible to handle cases where target is greater than the
	 * queue size.
	 */
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
	if (target >= transport->stream_rcvhiwat(vsk)) {
		err = -ENOMEM;
		goto out;
	}
	timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	copied = 0;

	err = transport->notify_recv_init(vsk, target, &recv_data);
	if (err < 0)
		goto out;

	while (1) {
		ssize_t read;

		err = vsock_connectible_wait_data(sk, &wait, timeout,
						  &recv_data, target);
		if (err <= 0)
			break;

		err = transport->notify_recv_pre_dequeue(vsk, target,
							 &recv_data);
		if (err < 0)
			break;

		read = transport->stream_dequeue(vsk, msg, len - copied, flags);
		if (read < 0) {
			err = read;
			break;
		}

		copied += read;

		err = transport->notify_recv_post_dequeue(vsk, target, read,
							  !(flags & MSG_PEEK), &recv_data);
		if (err < 0)
			goto out;

		if (read >= target || flags & MSG_PEEK)
			break;

		target -= read;
	}

	if (sk->sk_err)
		err = -sk->sk_err;
	else if (sk->sk_shutdown & RCV_SHUTDOWN)
		err = 0;

	if (copied > 0)
		err = copied;

out:
	return err;
}

static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
				     size_t len, int flags)
{
	const struct vsock_transport *transport;
	struct vsock_sock *vsk;
	ssize_t msg_len;
	long timeout;
	int err = 0;
	DEFINE_WAIT(wait);

	vsk = vsock_sk(sk);
	transport = vsk->transport;

	timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	err = vsock_connectible_wait_data(sk, &wait, timeout, NULL, 0);
	if (err <= 0)
		goto out;

	msg_len = transport->seqpacket_dequeue(vsk, msg, flags);

	if (msg_len < 0) {
		err = msg_len;
		goto out;
	}

	if (sk->sk_err) {
		err = -sk->sk_err;
	} else if (sk->sk_shutdown & RCV_SHUTDOWN) {
		err = 0;
	} else {
		/* User sets MSG_TRUNC, so return real length of
		 * packet.
		 */
		if (flags & MSG_TRUNC)
			err = msg_len;
		else
			err = len - msg_data_left(msg);

		/* Always set MSG_TRUNC if real length of packet is
		 * bigger than user's buffer.
		 */
		if (msg_len > len)
			msg->msg_flags |= MSG_TRUNC;
	}

out:
	return err;
}

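/* Common receive entry point for connection-oriented (SOCK_STREAM and
 * SOCK_SEQPACKET) sockets: validate the socket state under the socket lock,
 * then hand off to the type-specific helper above.
 */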
int
__vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
			    int flags)
{
	struct sock *sk;
	struct vsock_sock *vsk;
	const struct vsock_transport *transport;
	int err;

	sk = sock->sk;

	if (unlikely(flags & MSG_ERRQUEUE))
		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, VSOCK_RECVERR);

	vsk = vsock_sk(sk);
	err = 0;

	lock_sock(sk);

	transport = vsk->transport;

	if (!transport || sk->sk_state != TCP_ESTABLISHED) {
		/* Recvmsg is supposed to return 0 if a peer performs an
		 * orderly shutdown. Differentiate between that case and when a
		 * peer has not connected or a local shutdown occurred with the
		 * SOCK_DONE flag.
		 */
		if (sock_flag(sk, SOCK_DONE))
			err = 0;
		else
			err = -ENOTCONN;

		goto out;
	}

	if (flags & MSG_OOB) {
		err = -EOPNOTSUPP;
		goto out;
	}

	/* We don't check the peer_shutdown flag here since the peer may have
	 * shut down while data it already sent is still queued for the local
	 * socket to receive.
	 */
	if (sk->sk_shutdown & RCV_SHUTDOWN) {
		err = 0;
		goto out;
	}

	/* It is valid on Linux to pass in a zero-length receive buffer. This
	 * is not an error. We may as well bail out now.
	 */
	if (!len) {
		err = 0;
		goto out;
	}

	if (sk->sk_type == SOCK_STREAM)
		err = __vsock_stream_recvmsg(sk, msg, len, flags);
	else
		err = __vsock_seqpacket_recvmsg(sk, msg, len, flags);

out:
	release_sock(sk);
	return err;
}

int
vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
			  int flags)
{
#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot;

	prot = READ_ONCE(sk->sk_prot);
	if (prot != &vsock_proto)
		return prot->recvmsg(sk, msg, len, flags, NULL);
#endif

	return __vsock_connectible_recvmsg(sock, msg, len, flags);
}
EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg);

static int vsock_set_rcvlowat(struct sock *sk, int val)
{
	const struct vsock_transport *transport;
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);

	if (val > vsk->buffer_size)
		return -EINVAL;

	transport = vsk->transport;

	if (transport && transport->notify_set_rcvlowat) {
		int err;

		err = transport->notify_set_rcvlowat(vsk, val);
		if (err)
			return err;
	}

	WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
	return 0;
}

static const struct proto_ops vsock_stream_ops = {
	.family = PF_VSOCK,
	.owner = THIS_MODULE,
	.release = vsock_release,
	.bind = vsock_bind,
	.connect = vsock_connect,
	.socketpair = sock_no_socketpair,
	.accept = vsock_accept,
	.getname = vsock_getname,
	.poll = vsock_poll,
	.ioctl = sock_no_ioctl,
	.listen = vsock_listen,
	.shutdown = vsock_shutdown,
	.setsockopt = vsock_connectible_setsockopt,
	.getsockopt = vsock_connectible_getsockopt,
	.sendmsg = vsock_connectible_sendmsg,
	.recvmsg = vsock_connectible_recvmsg,
	.mmap = sock_no_mmap,
	.set_rcvlowat = vsock_set_rcvlowat,
	.read_skb = vsock_read_skb,
};

static const struct proto_ops vsock_seqpacket_ops = {
	.family = PF_VSOCK,
	.owner = THIS_MODULE,
	.release = vsock_release,
	.bind = vsock_bind,
	.connect = vsock_connect,
	.socketpair = sock_no_socketpair,
	.accept = vsock_accept,
	.getname = vsock_getname,
	.poll = vsock_poll,
	.ioctl = sock_no_ioctl,
	.listen = vsock_listen,
	.shutdown = vsock_shutdown,
	.setsockopt = vsock_connectible_setsockopt,
	.getsockopt = vsock_connectible_getsockopt,
	.sendmsg = vsock_connectible_sendmsg,
	.recvmsg = vsock_connectible_recvmsg,
	.mmap = sock_no_mmap,
	.read_skb = vsock_read_skb,
};

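/* Creation callback for the address family, reached from userspace via e.g.
 * socket(AF_VSOCK, SOCK_STREAM, 0).  Select the proto_ops matching the
 * requested socket type, allocate the sock, and place it on the unbound
 * list.  SOCK_DGRAM sockets are assigned their transport right away;
 * connectible sockets pick one later based on the peer they talk to.
 */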
static int vsock_create(struct net *net, struct socket *sock,
			int protocol, int kern)
{
	struct vsock_sock *vsk;
	struct sock *sk;
	int ret;

	if (!sock)
		return -EINVAL;

	if (protocol && protocol != PF_VSOCK)
		return -EPROTONOSUPPORT;

	switch (sock->type) {
	case SOCK_DGRAM:
		sock->ops = &vsock_dgram_ops;
		break;
	case SOCK_STREAM:
		sock->ops = &vsock_stream_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &vsock_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sock->state = SS_UNCONNECTED;

	sk = __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern);
	if (!sk)
		return -ENOMEM;

	vsk = vsock_sk(sk);

	if (sock->type == SOCK_DGRAM) {
		ret = vsock_assign_transport(vsk, NULL);
		if (ret < 0) {
			sock_put(sk);
			return ret;
		}
	}

	vsock_insert_unbound(vsk);

	return 0;
}

static const struct net_proto_family vsock_family_ops = {
	.family = AF_VSOCK,
	.create = vsock_create,
	.owner = THIS_MODULE,
};

static long vsock_dev_do_ioctl(struct file *filp,
			       unsigned int cmd, void __user *ptr)
{
	u32 __user *p = ptr;
	u32 cid = VMADDR_CID_ANY;
	int retval = 0;

	switch (cmd) {
	case IOCTL_VM_SOCKETS_GET_LOCAL_CID:
		/* To be compatible with the VMCI behavior, we prioritize the
		 * guest CID instead of the well-known host CID
		 * (VMADDR_CID_HOST).
		 */
		if (transport_g2h)
			cid = transport_g2h->get_local_cid();
		else if (transport_h2g)
			cid = transport_h2g->get_local_cid();

		if (put_user(cid, p) != 0)
			retval = -EFAULT;
		break;

	default:
		retval = -ENOIOCTLCMD;
	}

	return retval;
}

static long vsock_dev_ioctl(struct file *filp,
			    unsigned int cmd, unsigned long arg)
{
	return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg);
}

#ifdef CONFIG_COMPAT
static long vsock_dev_compat_ioctl(struct file *filp,
				   unsigned int cmd, unsigned long arg)
{
	return vsock_dev_do_ioctl(filp, cmd, compat_ptr(arg));
}
#endif

static const struct file_operations vsock_device_ops = {
	.owner = THIS_MODULE,
	.unlocked_ioctl = vsock_dev_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = vsock_dev_compat_ioctl,
#endif
	.open = nonseekable_open,
};

static struct miscdevice vsock_device = {
	.name = "vsock",
	.fops = &vsock_device_ops,
};

static int __init vsock_init(void)
{
	int err = 0;

	vsock_init_tables();

	vsock_proto.owner = THIS_MODULE;
	vsock_device.minor = MISC_DYNAMIC_MINOR;
	err = misc_register(&vsock_device);
	if (err) {
		pr_err("Failed to register misc device\n");
		goto err_reset_transport;
	}

	err = proto_register(&vsock_proto, 1);	/* we want our slab */
	if (err) {
		pr_err("Cannot register vsock protocol\n");
		goto err_deregister_misc;
	}

	err = sock_register(&vsock_family_ops);
	if (err) {
		pr_err("could not register af_vsock (%d) address family: %d\n",
		       AF_VSOCK, err);
		goto err_unregister_proto;
	}

	vsock_bpf_build_proto();

	return 0;

err_unregister_proto:
	proto_unregister(&vsock_proto);
err_deregister_misc:
	misc_deregister(&vsock_device);
err_reset_transport:
	return err;
}

static void __exit vsock_exit(void)
{
	misc_deregister(&vsock_device);
	sock_unregister(AF_VSOCK);
	proto_unregister(&vsock_proto);
}

const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk)
{
	return vsk->transport;
}
EXPORT_SYMBOL_GPL(vsock_core_get_transport);

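/* Register a transport for the roles requested in 'features' (host-to-guest,
 * guest-to-host, datagram, local).  Registration is all-or-nothing: if any
 * requested role is already claimed by another transport, -EBUSY is returned
 * and no role is assigned.
 */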
int vsock_core_register(const struct vsock_transport *t, int features)
{
	const struct vsock_transport *t_h2g, *t_g2h, *t_dgram, *t_local;
	int err = mutex_lock_interruptible(&vsock_register_mutex);

	if (err)
		return err;

	t_h2g = transport_h2g;
	t_g2h = transport_g2h;
	t_dgram = transport_dgram;
	t_local = transport_local;

	if (features & VSOCK_TRANSPORT_F_H2G) {
		if (t_h2g) {
			err = -EBUSY;
			goto err_busy;
		}
		t_h2g = t;
	}

	if (features & VSOCK_TRANSPORT_F_G2H) {
		if (t_g2h) {
			err = -EBUSY;
			goto err_busy;
		}
		t_g2h = t;
	}

	if (features & VSOCK_TRANSPORT_F_DGRAM) {
		if (t_dgram) {
			err = -EBUSY;
			goto err_busy;
		}
		t_dgram = t;
	}

	if (features & VSOCK_TRANSPORT_F_LOCAL) {
		if (t_local) {
			err = -EBUSY;
			goto err_busy;
		}
		t_local = t;
	}

	transport_h2g = t_h2g;
	transport_g2h = t_g2h;
	transport_dgram = t_dgram;
	transport_local = t_local;

err_busy:
	mutex_unlock(&vsock_register_mutex);
	return err;
}
EXPORT_SYMBOL_GPL(vsock_core_register);

void vsock_core_unregister(const struct vsock_transport *t)
{
	mutex_lock(&vsock_register_mutex);

	if (transport_h2g == t)
		transport_h2g = NULL;

	if (transport_g2h == t)
		transport_g2h = NULL;

	if (transport_dgram == t)
		transport_dgram = NULL;

	if (transport_local == t)
		transport_local = NULL;

	mutex_unlock(&vsock_register_mutex);
}
EXPORT_SYMBOL_GPL(vsock_core_unregister);

module_init(vsock_init);
module_exit(vsock_exit);

MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Virtual Socket Family");
MODULE_VERSION("1.0.2.0-k");
MODULE_LICENSE("GPL v2");