// SPDX-License-Identifier: GPL-2.0-only
/*
 * VMware vSockets Driver
 *
 * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
 */

/* Implementation notes:
 *
 * - There are two kinds of sockets: those created by user action (such as
 * calling socket(2)) and those created by incoming connection request packets.
 *
 * - There are two "global" tables, one for bound sockets (sockets that have
 * specified an address that they are responsible for) and one for connected
 * sockets (sockets that have established a connection with another socket).
 * These tables are "global" in that all sockets on the system are placed
 * within them. - Note, though, that the bound table contains an extra entry
 * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in
 * that list. The bound table is used solely for lookup of sockets when packets
 * are received and that's not necessary for SOCK_DGRAM sockets since we create
 * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM
 * sockets out of the bound hash buckets will reduce the chance of collisions
 * when looking for SOCK_STREAM sockets and prevents us from having to check the
 * socket type in the hash table lookups.
 *
 * - Sockets created by user action will either be "client" sockets that
 * initiate a connection or "server" sockets that listen for connections; we do
 * not support simultaneous connects (two "client" sockets connecting).
 *
 * - "Server" sockets are referred to as listener sockets throughout this
 * implementation because they are in the TCP_LISTEN state. When a
 * connection request is received (the second kind of socket mentioned above),
 * we create a new socket and refer to it as a pending socket. These pending
 * sockets are placed on the pending connection list of the listener socket.
 * When future packets are received for the address the listener socket is
 * bound to, we check if the source of the packet is from one that has an
 * existing pending connection. If it does, we process the packet for the
 * pending socket. When that socket reaches the connected state, it is removed
 * from the listener socket's pending list and enqueued in the listener
 * socket's accept queue. Callers of accept(2) will accept connected sockets
 * from the listener socket's accept queue. If the socket cannot be accepted
 * for some reason then it is marked rejected. Once the connection is
 * accepted, it is owned by the user process and the responsibility for cleanup
 * falls with that user process.
 *
 * - It is possible that these pending sockets will never reach the connected
 * state; in fact, we may never receive another packet after the connection
 * request. Because of this, we must schedule a cleanup function to run in the
 * future, after some amount of time passes where a connection should have been
 * established. This function ensures that the socket is off all lists so it
 * cannot be retrieved, then drops all references to the socket so it is cleaned
 * up (sock_put() -> sk_free() -> our sk_destruct implementation). Note this
 * function will also cleanup rejected sockets, those that reach the connected
 * state but leave it before they have been accepted.
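 *
 * - For orientation, a minimal user-space client of this address family
 * looks like the following sketch (error handling omitted):
 *
 *     int fd = socket(AF_VSOCK, SOCK_STREAM, 0);
 *     struct sockaddr_vm addr = {
 *             .svm_family = AF_VSOCK,
 *             .svm_cid = VMADDR_CID_HOST,
 *             .svm_port = 1234,
 *     };
 *
 *     connect(fd, (struct sockaddr *)&addr, sizeof(addr));
 *
 * Such a socket is a "client" socket in the sense used above.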
 *
 * - Lock ordering for pending or accept queue sockets is:
 *
 *     lock_sock(listener);
 *     lock_sock_nested(pending, SINGLE_DEPTH_NESTING);
 *
 * Using explicit nested locking keeps lockdep happy since normally only one
 * lock of a given class may be taken at a time.
 *
 * - Sockets created by user action will be cleaned up when the user process
 * calls close(2), causing our release implementation to be called. Our release
 * implementation will perform some cleanup then drop the last reference so our
 * sk_destruct implementation is invoked. Our sk_destruct implementation will
 * perform additional cleanup that's common for both types of sockets.
 *
 * - A socket's reference count is what ensures that the structure won't be
 * freed. Each entry in a list (such as the "global" bound and connected tables
 * and the listener socket's pending list and connected queue) ensures a
 * reference. When we defer work until process context and pass a socket as our
 * argument, we must ensure the reference count is increased to ensure the
 * socket isn't freed before the function is run; the deferred function will
 * then drop the reference.
 *
 * - sk->sk_state uses the TCP state constants because they are widely used by
 * other address families and exposed to userspace tools like ss(8):
 *
 *   TCP_CLOSE - unconnected
 *   TCP_SYN_SENT - connecting
 *   TCP_ESTABLISHED - connected
 *   TCP_CLOSING - disconnecting
 *   TCP_LISTEN - listening
 */

#include <linux/compat.h>
#include <linux/types.h>
#include <linux/bitops.h>
#include <linux/cred.h>
#include <linux/errqueue.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/net.h>
#include <linux/poll.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/smp.h>
#include <linux/socket.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <net/sock.h>
#include <net/af_vsock.h>
#include <uapi/linux/vm_sockets.h>

static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
static void vsock_sk_destruct(struct sock *sk);
static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
static void vsock_close(struct sock *sk, long timeout);

/* Protocol family. */
struct proto vsock_proto = {
	.name = "AF_VSOCK",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct vsock_sock),
	.close = vsock_close,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot = vsock_bpf_update_proto,
#endif
};

/* The default peer timeout indicates how long we will wait for a peer response
 * to a control message.
 */
#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)

#define VSOCK_DEFAULT_BUFFER_SIZE     (1024 * 256)
#define VSOCK_DEFAULT_BUFFER_MAX_SIZE (1024 * 256)
#define VSOCK_DEFAULT_BUFFER_MIN_SIZE 128

/* Transport used for host->guest communication */
static const struct vsock_transport *transport_h2g;
/* Transport used for guest->host communication */
static const struct vsock_transport *transport_g2h;
/* Transport used for DGRAM communication */
static const struct vsock_transport *transport_dgram;
/* Transport used for local communication */
static const struct vsock_transport *transport_local;
static DEFINE_MUTEX(vsock_register_mutex);

/**** UTILS ****/

/* Each bound VSocket is stored in the bind hash table and each connected
 * VSocket is stored in the connected hash table.
 *
 * Unbound sockets are all put on the same list attached to the end of the hash
 * table (vsock_unbound_sockets). Bound sockets are added to the hash table in
 * the bucket that their local address hashes to (vsock_bound_sockets(addr)
 * represents the list that addr hashes to).
 *
 * Specifically, we initialize the vsock_bind_table array to a size of
 * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through
 * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and
 * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function
 * mods with VSOCK_HASH_SIZE to ensure this.
 */
#define MAX_PORT_RETRIES 24

#define VSOCK_HASH(addr) ((addr)->svm_port % VSOCK_HASH_SIZE)
#define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)])
#define vsock_unbound_sockets (&vsock_bind_table[VSOCK_HASH_SIZE])

/* XXX This can probably be implemented in a better way. */
#define VSOCK_CONN_HASH(src, dst) \
	(((src)->svm_cid ^ (dst)->svm_port) % VSOCK_HASH_SIZE)
#define vsock_connected_sockets(src, dst) \
	(&vsock_connected_table[VSOCK_CONN_HASH(src, dst)])
#define vsock_connected_sockets_vsk(vsk) \
	vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr)

struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1];
EXPORT_SYMBOL_GPL(vsock_bind_table);
struct list_head vsock_connected_table[VSOCK_HASH_SIZE];
EXPORT_SYMBOL_GPL(vsock_connected_table);
DEFINE_SPINLOCK(vsock_table_lock);
EXPORT_SYMBOL_GPL(vsock_table_lock);
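
/* For illustration: with the layout above, a socket bound to port 1024
 * lands in bucket 1024 % VSOCK_HASH_SIZE of vsock_bind_table, while a
 * socket that has not been bound yet sits on
 * vsock_bind_table[VSOCK_HASH_SIZE].  A lookup is then simply (sketch;
 * vsock_find_bound_socket() takes vsock_table_lock and a socket
 * reference internally):
 *
 *	struct sockaddr_vm addr;
 *	struct sock *sk;
 *
 *	vsock_addr_init(&addr, VMADDR_CID_ANY, 1024);
 *	sk = vsock_find_bound_socket(&addr);
 */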

/* Autobind this socket to the local address if necessary. */
static int vsock_auto_bind(struct vsock_sock *vsk)
{
	struct sock *sk = sk_vsock(vsk);
	struct sockaddr_vm local_addr;

	if (vsock_addr_bound(&vsk->local_addr))
		return 0;
	vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
	return __vsock_bind(sk, &local_addr);
}

static void vsock_init_tables(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++)
		INIT_LIST_HEAD(&vsock_bind_table[i]);

	for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++)
		INIT_LIST_HEAD(&vsock_connected_table[i]);
}

static void __vsock_insert_bound(struct list_head *list,
				 struct vsock_sock *vsk)
{
	sock_hold(&vsk->sk);
	list_add(&vsk->bound_table, list);
}

static void __vsock_insert_connected(struct list_head *list,
				     struct vsock_sock *vsk)
{
	sock_hold(&vsk->sk);
	list_add(&vsk->connected_table, list);
}

static void __vsock_remove_bound(struct vsock_sock *vsk)
{
	list_del_init(&vsk->bound_table);
	sock_put(&vsk->sk);
}

static void __vsock_remove_connected(struct vsock_sock *vsk)
{
	list_del_init(&vsk->connected_table);
	sock_put(&vsk->sk);
}

static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
{
	struct vsock_sock *vsk;

	list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) {
		if (vsock_addr_equals_addr(addr, &vsk->local_addr))
			return sk_vsock(vsk);

		if (addr->svm_port == vsk->local_addr.svm_port &&
		    (vsk->local_addr.svm_cid == VMADDR_CID_ANY ||
		     addr->svm_cid == VMADDR_CID_ANY))
			return sk_vsock(vsk);
	}

	return NULL;
}

static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src,
						  struct sockaddr_vm *dst)
{
	struct vsock_sock *vsk;

	list_for_each_entry(vsk, vsock_connected_sockets(src, dst),
			    connected_table) {
		if (vsock_addr_equals_addr(src, &vsk->remote_addr) &&
		    dst->svm_port == vsk->local_addr.svm_port) {
			return sk_vsock(vsk);
		}
	}

	return NULL;
}

static void vsock_insert_unbound(struct vsock_sock *vsk)
{
	spin_lock_bh(&vsock_table_lock);
	__vsock_insert_bound(vsock_unbound_sockets, vsk);
	spin_unlock_bh(&vsock_table_lock);
}

void vsock_insert_connected(struct vsock_sock *vsk)
{
	struct list_head *list = vsock_connected_sockets(
		&vsk->remote_addr, &vsk->local_addr);

	spin_lock_bh(&vsock_table_lock);
	__vsock_insert_connected(list, vsk);
	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_insert_connected);

void vsock_remove_bound(struct vsock_sock *vsk)
{
	spin_lock_bh(&vsock_table_lock);
	if (__vsock_in_bound_table(vsk))
		__vsock_remove_bound(vsk);
	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_remove_bound);

void vsock_remove_connected(struct vsock_sock *vsk)
{
	spin_lock_bh(&vsock_table_lock);
	if (__vsock_in_connected_table(vsk))
		__vsock_remove_connected(vsk);
	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_remove_connected);

struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
{
	struct sock *sk;

	spin_lock_bh(&vsock_table_lock);
	sk = __vsock_find_bound_socket(addr);
	if (sk)
		sock_hold(sk);

	spin_unlock_bh(&vsock_table_lock);

	return sk;
}
EXPORT_SYMBOL_GPL(vsock_find_bound_socket);
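
/* Note for callers: vsock_find_bound_socket() and
 * vsock_find_connected_socket() return the socket with an extra
 * reference held (sock_hold()), so the caller must drop it with
 * sock_put() when done, e.g.:
 *
 *	sk = vsock_find_bound_socket(&dst);
 *	if (sk) {
 *		(process the packet)
 *		sock_put(sk);
 *	}
 */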

struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
					 struct sockaddr_vm *dst)
{
	struct sock *sk;

	spin_lock_bh(&vsock_table_lock);
	sk = __vsock_find_connected_socket(src, dst);
	if (sk)
		sock_hold(sk);

	spin_unlock_bh(&vsock_table_lock);

	return sk;
}
EXPORT_SYMBOL_GPL(vsock_find_connected_socket);

void vsock_remove_sock(struct vsock_sock *vsk)
{
	vsock_remove_bound(vsk);
	vsock_remove_connected(vsk);
}
EXPORT_SYMBOL_GPL(vsock_remove_sock);

void vsock_for_each_connected_socket(struct vsock_transport *transport,
				     void (*fn)(struct sock *sk))
{
	int i;

	spin_lock_bh(&vsock_table_lock);

	for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) {
		struct vsock_sock *vsk;
		list_for_each_entry(vsk, &vsock_connected_table[i],
				    connected_table) {
			if (vsk->transport != transport)
				continue;

			fn(sk_vsock(vsk));
		}
	}

	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket);

void vsock_add_pending(struct sock *listener, struct sock *pending)
{
	struct vsock_sock *vlistener;
	struct vsock_sock *vpending;

	vlistener = vsock_sk(listener);
	vpending = vsock_sk(pending);

	sock_hold(pending);
	sock_hold(listener);
	list_add_tail(&vpending->pending_links, &vlistener->pending_links);
}
EXPORT_SYMBOL_GPL(vsock_add_pending);

void vsock_remove_pending(struct sock *listener, struct sock *pending)
{
	struct vsock_sock *vpending = vsock_sk(pending);

	list_del_init(&vpending->pending_links);
	sock_put(listener);
	sock_put(pending);
}
EXPORT_SYMBOL_GPL(vsock_remove_pending);

void vsock_enqueue_accept(struct sock *listener, struct sock *connected)
{
	struct vsock_sock *vlistener;
	struct vsock_sock *vconnected;

	vlistener = vsock_sk(listener);
	vconnected = vsock_sk(connected);

	sock_hold(connected);
	sock_hold(listener);
	list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue);
}
EXPORT_SYMBOL_GPL(vsock_enqueue_accept);

static bool vsock_use_local_transport(unsigned int remote_cid)
{
	if (!transport_local)
		return false;

	if (remote_cid == VMADDR_CID_LOCAL)
		return true;

	if (transport_g2h) {
		return remote_cid == transport_g2h->get_local_cid();
	} else {
		return remote_cid == VMADDR_CID_HOST;
	}
}

static void vsock_deassign_transport(struct vsock_sock *vsk)
{
	if (!vsk->transport)
		return;

	vsk->transport->destruct(vsk);
	module_put(vsk->transport->module);
	vsk->transport = NULL;
}
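
/* A worked example of the selection rules documented below, assuming a
 * guest with CID 42 and all transports loaded:
 *
 *  - remote CID 1 (VMADDR_CID_LOCAL): local transport;
 *  - remote CID 2 (VMADDR_CID_HOST): guest->host transport;
 *  - remote CID 42 (our own CID): local transport;
 *  - remote CID 77: host->guest transport, unless VMADDR_FLAG_TO_HOST
 *    is set in svm_flags, in which case the guest->host transport is
 *    used so the host can forward the packets.
 */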

/* Assign a transport to a socket and call the .init transport callback.
 *
 * Note: for connection oriented socket this must be called when vsk->remote_addr
 * is set (e.g. during the connect() or when a connection request on a listener
 * socket is received).
 * The vsk->remote_addr is used to decide which transport to use:
 *  - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if
 *    g2h is not loaded, will use local transport;
 *  - remote CID <= VMADDR_CID_HOST or h2g is not loaded or remote flags field
 *    includes VMADDR_FLAG_TO_HOST flag value, will use guest->host transport;
 *  - remote CID > VMADDR_CID_HOST will use host->guest transport;
 */
int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
{
	const struct vsock_transport *new_transport;
	struct sock *sk = sk_vsock(vsk);
	unsigned int remote_cid = vsk->remote_addr.svm_cid;
	__u8 remote_flags;
	int ret;

	/* If the packet is coming with the source and destination CIDs higher
	 * than VMADDR_CID_HOST, then a vsock channel where all the packets are
	 * forwarded to the host should be established. Then the host will
	 * need to forward the packets to the guest.
	 *
	 * The flag is set on the (listen) receive path (psk is not NULL). On
	 * the connect path the flag can be set by the user space application.
	 */
	if (psk && vsk->local_addr.svm_cid > VMADDR_CID_HOST &&
	    vsk->remote_addr.svm_cid > VMADDR_CID_HOST)
		vsk->remote_addr.svm_flags |= VMADDR_FLAG_TO_HOST;

	remote_flags = vsk->remote_addr.svm_flags;

	switch (sk->sk_type) {
	case SOCK_DGRAM:
		new_transport = transport_dgram;
		break;
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		if (vsock_use_local_transport(remote_cid))
			new_transport = transport_local;
		else if (remote_cid <= VMADDR_CID_HOST || !transport_h2g ||
			 (remote_flags & VMADDR_FLAG_TO_HOST))
			new_transport = transport_g2h;
		else
			new_transport = transport_h2g;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	if (vsk->transport) {
		if (vsk->transport == new_transport)
			return 0;

		/* transport->release() must be called with sock lock acquired.
		 * This path can only be taken during vsock_connect(), where we
		 * have already held the sock lock. In the other cases, this
		 * function is called on a new socket which is not assigned to
		 * any transport.
		 */
		vsk->transport->release(vsk);
		vsock_deassign_transport(vsk);
	}

	/* We increase the module refcnt to prevent the transport unloading
	 * while there are open sockets assigned to it.
	 */
	if (!new_transport || !try_module_get(new_transport->module))
		return -ENODEV;

	if (sk->sk_type == SOCK_SEQPACKET) {
		if (!new_transport->seqpacket_allow ||
		    !new_transport->seqpacket_allow(remote_cid)) {
			module_put(new_transport->module);
			return -ESOCKTNOSUPPORT;
		}
	}

	ret = new_transport->init(vsk, psk);
	if (ret) {
		module_put(new_transport->module);
		return ret;
	}

	vsk->transport = new_transport;

	return 0;
}
EXPORT_SYMBOL_GPL(vsock_assign_transport);

bool vsock_find_cid(unsigned int cid)
{
	if (transport_g2h && cid == transport_g2h->get_local_cid())
		return true;

	if (transport_h2g && cid == VMADDR_CID_HOST)
		return true;

	if (transport_local && cid == VMADDR_CID_LOCAL)
		return true;

	return false;
}
EXPORT_SYMBOL_GPL(vsock_find_cid);

static struct sock *vsock_dequeue_accept(struct sock *listener)
{
	struct vsock_sock *vlistener;
	struct vsock_sock *vconnected;

	vlistener = vsock_sk(listener);

	if (list_empty(&vlistener->accept_queue))
		return NULL;

	vconnected = list_entry(vlistener->accept_queue.next,
				struct vsock_sock, accept_queue);

	list_del_init(&vconnected->accept_queue);
	sock_put(listener);
	/* The caller will need a reference on the connected socket so we let
	 * it call sock_put().
	 */

	return sk_vsock(vconnected);
}

static bool vsock_is_accept_queue_empty(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	return list_empty(&vsk->accept_queue);
}

static bool vsock_is_pending(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	return !list_empty(&vsk->pending_links);
}

static int vsock_send_shutdown(struct sock *sk, int mode)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (!vsk->transport)
		return -ENODEV;

	return vsk->transport->shutdown(vsk, mode);
}
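
/* Clean up a pending connection that never completed the handshake
 * (see the implementation notes at the top of this file).  The delayed
 * work is initialized in __vsock_create():
 *
 *	INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work);
 *
 * and transports that use the pending list are expected to schedule it,
 * holding a socket reference for the deferred call.
 */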
static void vsock_pending_work(struct work_struct *work)
{
	struct sock *sk;
	struct sock *listener;
	struct vsock_sock *vsk;
	bool cleanup;

	vsk = container_of(work, struct vsock_sock, pending_work.work);
	sk = sk_vsock(vsk);
	listener = vsk->listener;
	cleanup = true;

	lock_sock(listener);
	lock_sock_nested(sk, SINGLE_DEPTH_NESTING);

	if (vsock_is_pending(sk)) {
		vsock_remove_pending(listener, sk);

		sk_acceptq_removed(listener);
	} else if (!vsk->rejected) {
		/* We are not on the pending list and accept() did not reject
		 * us, so we must have been accepted by our user process.  We
		 * just need to drop our references to the sockets and be on
		 * our way.
		 */
		cleanup = false;
		goto out;
	}

	/* We need to remove ourself from the global connected sockets list so
	 * incoming packets can't find this socket, and to reduce the reference
	 * count.
	 */
	vsock_remove_connected(vsk);

	sk->sk_state = TCP_CLOSE;

out:
	release_sock(sk);
	release_sock(listener);
	if (cleanup)
		sock_put(sk);

	sock_put(sk);
	sock_put(listener);
}

/**** SOCKET OPERATIONS ****/

static int __vsock_bind_connectible(struct vsock_sock *vsk,
				    struct sockaddr_vm *addr)
{
	static u32 port;
	struct sockaddr_vm new_addr;

	if (!port)
		port = get_random_u32_above(LAST_RESERVED_PORT);

	vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port);

	if (addr->svm_port == VMADDR_PORT_ANY) {
		bool found = false;
		unsigned int i;

		for (i = 0; i < MAX_PORT_RETRIES; i++) {
			if (port <= LAST_RESERVED_PORT)
				port = LAST_RESERVED_PORT + 1;

			new_addr.svm_port = port++;

			if (!__vsock_find_bound_socket(&new_addr)) {
				found = true;
				break;
			}
		}

		if (!found)
			return -EADDRNOTAVAIL;
	} else {
		/* If port is in reserved range, ensure caller
		 * has necessary privileges.
		 */
		if (addr->svm_port <= LAST_RESERVED_PORT &&
		    !capable(CAP_NET_BIND_SERVICE)) {
			return -EACCES;
		}

		if (__vsock_find_bound_socket(&new_addr))
			return -EADDRINUSE;
	}

	vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port);

	/* Remove connection oriented sockets from the unbound list and add them
	 * to the hash table for easy lookup by its address.  The unbound list
	 * is simply an extra entry at the end of the hash table, a trick used
	 * by AF_UNIX.
	 */
	__vsock_remove_bound(vsk);
	__vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk);

	return 0;
}

static int __vsock_bind_dgram(struct vsock_sock *vsk,
			      struct sockaddr_vm *addr)
{
	return vsk->transport->dgram_bind(vsk, addr);
}

static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	int retval;

	/* First ensure this socket isn't already bound. */
	if (vsock_addr_bound(&vsk->local_addr))
		return -EINVAL;

	/* Now bind to the provided address or select appropriate values if
	 * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY).  Note that
	 * like AF_INET prevents binding to a non-local IP address (in most
	 * cases), we only allow binding to a local CID.
	 */
	if (addr->svm_cid != VMADDR_CID_ANY && !vsock_find_cid(addr->svm_cid))
		return -EADDRNOTAVAIL;

	switch (sk->sk_socket->type) {
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		spin_lock_bh(&vsock_table_lock);
		retval = __vsock_bind_connectible(vsk, addr);
		spin_unlock_bh(&vsock_table_lock);
		break;

	case SOCK_DGRAM:
		retval = __vsock_bind_dgram(vsk, addr);
		break;

	default:
		retval = -EINVAL;
		break;
	}

	return retval;
}

static void vsock_connect_timeout(struct work_struct *work);
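
/* Allocate and initialize a new vsock socket.  Children created for
 * incoming connection requests inherit their settings (buffer sizes,
 * connect timeout, ownership, trust) from the listening parent, while
 * sockets created by socket(2) start from the VSOCK_DEFAULT_* values
 * defined above.
 */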
static struct sock *__vsock_create(struct net *net,
				   struct socket *sock,
				   struct sock *parent,
				   gfp_t priority,
				   unsigned short type,
				   int kern)
{
	struct sock *sk;
	struct vsock_sock *psk;
	struct vsock_sock *vsk;

	sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk);

	/* sk->sk_type is normally set in sock_init_data, but only if sock is
	 * non-NULL. We make sure that our sockets always have a type by
	 * setting it here if needed.
	 */
	if (!sock)
		sk->sk_type = type;

	vsk = vsock_sk(sk);
	vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
	vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);

	sk->sk_destruct = vsock_sk_destruct;
	sk->sk_backlog_rcv = vsock_queue_rcv_skb;
	sock_reset_flag(sk, SOCK_DONE);

	INIT_LIST_HEAD(&vsk->bound_table);
	INIT_LIST_HEAD(&vsk->connected_table);
	vsk->listener = NULL;
	INIT_LIST_HEAD(&vsk->pending_links);
	INIT_LIST_HEAD(&vsk->accept_queue);
	vsk->rejected = false;
	vsk->sent_request = false;
	vsk->ignore_connecting_rst = false;
	vsk->peer_shutdown = 0;
	INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout);
	INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work);

	psk = parent ? vsock_sk(parent) : NULL;
	if (parent) {
		vsk->trusted = psk->trusted;
		vsk->owner = get_cred(psk->owner);
		vsk->connect_timeout = psk->connect_timeout;
		vsk->buffer_size = psk->buffer_size;
		vsk->buffer_min_size = psk->buffer_min_size;
		vsk->buffer_max_size = psk->buffer_max_size;
		security_sk_clone(parent, sk);
	} else {
		vsk->trusted = ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN);
		vsk->owner = get_current_cred();
		vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT;
		vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE;
		vsk->buffer_min_size = VSOCK_DEFAULT_BUFFER_MIN_SIZE;
		vsk->buffer_max_size = VSOCK_DEFAULT_BUFFER_MAX_SIZE;
	}

	return sk;
}

static bool sock_type_connectible(u16 type)
{
	return (type == SOCK_STREAM) || (type == SOCK_SEQPACKET);
}

static void __vsock_release(struct sock *sk, int level)
{
	struct vsock_sock *vsk;
	struct sock *pending;

	vsk = vsock_sk(sk);
	pending = NULL;	/* Compiler warning. */

	/* When "level" is SINGLE_DEPTH_NESTING, use the nested
	 * version to avoid the warning "possible recursive locking
	 * detected". When "level" is 0, lock_sock_nested(sk, level)
	 * is the same as lock_sock(sk).
	 */
	lock_sock_nested(sk, level);

	if (vsk->transport)
		vsk->transport->release(vsk);
	else if (sock_type_connectible(sk->sk_type))
		vsock_remove_sock(vsk);

	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	skb_queue_purge(&sk->sk_receive_queue);

	/* Clean up any sockets that never were accepted. */
	while ((pending = vsock_dequeue_accept(sk)) != NULL) {
		__vsock_release(pending, SINGLE_DEPTH_NESTING);
		sock_put(pending);
	}

	release_sock(sk);
	sock_put(sk);
}
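
/* Final cleanup, common to both kinds of sockets, invoked via
 * sock_put() -> sk_free() -> sk->sk_destruct once the last reference to
 * the socket is dropped (see the implementation notes at the top of
 * this file).
 */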
static void vsock_sk_destruct(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	vsock_deassign_transport(vsk);

	/* When clearing these addresses, there's no need to set the family and
	 * possibly register the address family with the kernel.
	 */
	vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
	vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);

	put_cred(vsk->owner);
}

static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sock_queue_rcv_skb(sk, skb);
	if (err)
		kfree_skb(skb);

	return err;
}

struct sock *vsock_create_connected(struct sock *parent)
{
	return __vsock_create(sock_net(parent), NULL, parent, GFP_KERNEL,
			      parent->sk_type, 0);
}
EXPORT_SYMBOL_GPL(vsock_create_connected);

s64 vsock_stream_has_data(struct vsock_sock *vsk)
{
	return vsk->transport->stream_has_data(vsk);
}
EXPORT_SYMBOL_GPL(vsock_stream_has_data);

s64 vsock_connectible_has_data(struct vsock_sock *vsk)
{
	struct sock *sk = sk_vsock(vsk);

	if (sk->sk_type == SOCK_SEQPACKET)
		return vsk->transport->seqpacket_has_data(vsk);
	else
		return vsock_stream_has_data(vsk);
}
EXPORT_SYMBOL_GPL(vsock_connectible_has_data);

s64 vsock_stream_has_space(struct vsock_sock *vsk)
{
	return vsk->transport->stream_has_space(vsk);
}
EXPORT_SYMBOL_GPL(vsock_stream_has_space);

void vsock_data_ready(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (vsock_stream_has_data(vsk) >= sk->sk_rcvlowat ||
	    sock_flag(sk, SOCK_DONE))
		sk->sk_data_ready(sk);
}
EXPORT_SYMBOL_GPL(vsock_data_ready);

/* Dummy callback required by sockmap.
 * See unconditional call of saved_close() in sock_map_close().
 */
static void vsock_close(struct sock *sk, long timeout)
{
}

static int vsock_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	sk->sk_prot->close(sk, 0);
	__vsock_release(sk, 0);
	sock->sk = NULL;
	sock->state = SS_FREE;

	return 0;
}

static int
vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	int err;
	struct sock *sk;
	struct sockaddr_vm *vm_addr;

	sk = sock->sk;

	if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0)
		return -EINVAL;

	lock_sock(sk);
	err = __vsock_bind(sk, vm_addr);
	release_sock(sk);

	return err;
}

static int vsock_getname(struct socket *sock,
			 struct sockaddr *addr, int peer)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *vm_addr;

	sk = sock->sk;
	vsk = vsock_sk(sk);
	err = 0;

	lock_sock(sk);

	if (peer) {
		if (sock->state != SS_CONNECTED) {
			err = -ENOTCONN;
			goto out;
		}
		vm_addr = &vsk->remote_addr;
	} else {
		vm_addr = &vsk->local_addr;
	}

	if (!vm_addr) {
		err = -EINVAL;
		goto out;
	}

	/* sys_getsockname() and sys_getpeername() pass us a
	 * MAX_SOCK_ADDR-sized buffer and don't set addr_len.  Unfortunately
	 * that macro is defined in socket.c instead of .h, so we hardcode its
	 * value here.
	 */
	BUILD_BUG_ON(sizeof(*vm_addr) > 128);
	memcpy(addr, vm_addr, sizeof(*vm_addr));
	err = sizeof(*vm_addr);

out:
	release_sock(sk);
	return err;
}

static int vsock_shutdown(struct socket *sock, int mode)
{
	int err;
	struct sock *sk;

	/* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses
	 * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode
	 * here like the other address families do.  Note also that the
	 * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3),
	 * which is what we want.
	 */
	mode++;

	if ((mode & ~SHUTDOWN_MASK) || !mode)
		return -EINVAL;

	/* If this is a connection oriented socket and it is not connected then
	 * bail out immediately.  If it is a DGRAM socket then we must first
	 * kick the socket so that it wakes up from any sleeping calls, for
	 * example recv(), and then afterwards return the error.
	 */

	sk = sock->sk;

	lock_sock(sk);
	if (sock->state == SS_UNCONNECTED) {
		err = -ENOTCONN;
		if (sock_type_connectible(sk->sk_type))
			goto out;
	} else {
		sock->state = SS_DISCONNECTING;
		err = 0;
	}

	/* Receive and send shutdowns are treated alike. */
	mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN);
	if (mode) {
		sk->sk_shutdown |= mode;
		sk->sk_state_change(sk);

		if (sock_type_connectible(sk->sk_type)) {
			sock_reset_flag(sk, SOCK_DONE);
			vsock_send_shutdown(sk, mode);
		}
	}

out:
	release_sock(sk);
	return err;
}

static __poll_t vsock_poll(struct file *file, struct socket *sock,
			   poll_table *wait)
{
	struct sock *sk;
	__poll_t mask;
	struct vsock_sock *vsk;

	sk = sock->sk;
	vsk = vsock_sk(sk);

	poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	if (sk->sk_err)
		/* Signify that there has been an error on this socket. */
		mask |= EPOLLERR;

	/* INET sockets treat local write shutdown and peer write shutdown as a
	 * case of EPOLLHUP set.
	 */
	if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
	    ((sk->sk_shutdown & SEND_SHUTDOWN) &&
	     (vsk->peer_shutdown & SEND_SHUTDOWN))) {
		mask |= EPOLLHUP;
	}

	if (sk->sk_shutdown & RCV_SHUTDOWN ||
	    vsk->peer_shutdown & SEND_SHUTDOWN) {
		mask |= EPOLLRDHUP;
	}

	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	if (sock->type == SOCK_DGRAM) {
		/* For datagram sockets we can read if there is something in
		 * the queue and write as long as the socket isn't shutdown for
		 * sending.
		 */
		if (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
		    (sk->sk_shutdown & RCV_SHUTDOWN)) {
			mask |= EPOLLIN | EPOLLRDNORM;
		}

		if (!(sk->sk_shutdown & SEND_SHUTDOWN))
			mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	} else if (sock_type_connectible(sk->sk_type)) {
		const struct vsock_transport *transport;

		lock_sock(sk);

		transport = vsk->transport;

		/* Listening sockets that have connections in their accept
		 * queue can be read.
		 */
		if (sk->sk_state == TCP_LISTEN
		    && !vsock_is_accept_queue_empty(sk))
			mask |= EPOLLIN | EPOLLRDNORM;

		/* If there is something in the queue then we can read. */
		if (transport && transport->stream_is_active(vsk) &&
		    !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			bool data_ready_now = false;
			int target = sock_rcvlowat(sk, 0, INT_MAX);
			int ret = transport->notify_poll_in(
					vsk, target, &data_ready_now);
			if (ret < 0) {
				mask |= EPOLLERR;
			} else {
				if (data_ready_now)
					mask |= EPOLLIN | EPOLLRDNORM;

			}
		}

		/* Sockets whose connections have been closed, reset, or
		 * terminated should also be considered read, and we check the
		 * shutdown flag for that.
		 */
		if (sk->sk_shutdown & RCV_SHUTDOWN ||
		    vsk->peer_shutdown & SEND_SHUTDOWN) {
			mask |= EPOLLIN | EPOLLRDNORM;
		}

		/* Connected sockets that can produce data can be written. */
		if (transport && sk->sk_state == TCP_ESTABLISHED) {
			if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
				bool space_avail_now = false;
				int ret = transport->notify_poll_out(
						vsk, 1, &space_avail_now);
				if (ret < 0) {
					mask |= EPOLLERR;
				} else {
					if (space_avail_now)
						/* Remove EPOLLWRBAND since INET
						 * sockets are not setting it.
						 */
						mask |= EPOLLOUT | EPOLLWRNORM;

				}
			}
		}

		/* Simulate INET socket poll behaviors, which sets
		 * EPOLLOUT|EPOLLWRNORM when peer is closed and nothing to read,
		 * but local send is not shutdown.
		 */
		if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) {
			if (!(sk->sk_shutdown & SEND_SHUTDOWN))
				mask |= EPOLLOUT | EPOLLWRNORM;

		}

		release_sock(sk);
	}

	return mask;
}

static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	return vsk->transport->read_skb(vsk, read_actor);
}

static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *remote_addr;
	const struct vsock_transport *transport;

	if (msg->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	/* For now, MSG_DONTWAIT is always assumed... */
	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	lock_sock(sk);

	transport = vsk->transport;

	err = vsock_auto_bind(vsk);
	if (err)
		goto out;


	/* If the provided message contains an address, use that.  Otherwise
	 * fall back on the socket's remote handle (if it has been connected).
	 */
	if (msg->msg_name &&
	    vsock_addr_cast(msg->msg_name, msg->msg_namelen,
			    &remote_addr) == 0) {
		/* Ensure this address is of the right type and is a valid
		 * destination.
		 */

		if (remote_addr->svm_cid == VMADDR_CID_ANY)
			remote_addr->svm_cid = transport->get_local_cid();

		if (!vsock_addr_bound(remote_addr)) {
			err = -EINVAL;
			goto out;
		}
	} else if (sock->state == SS_CONNECTED) {
		remote_addr = &vsk->remote_addr;

		if (remote_addr->svm_cid == VMADDR_CID_ANY)
			remote_addr->svm_cid = transport->get_local_cid();

		/* XXX Should connect() or this function ensure remote_addr is
		 * bound?
		 */
		if (!vsock_addr_bound(&vsk->remote_addr)) {
			err = -EINVAL;
			goto out;
		}
	} else {
		err = -EINVAL;
		goto out;
	}

	if (!transport->dgram_allow(remote_addr->svm_cid,
				    remote_addr->svm_port)) {
		err = -EINVAL;
		goto out;
	}

	err = transport->dgram_enqueue(vsk, remote_addr, msg, len);

out:
	release_sock(sk);
	return err;
}

static int vsock_dgram_connect(struct socket *sock,
			       struct sockaddr *addr, int addr_len, int flags)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *remote_addr;

	sk = sock->sk;
	vsk = vsock_sk(sk);

	err = vsock_addr_cast(addr, addr_len, &remote_addr);
	if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) {
		lock_sock(sk);
		vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY,
				VMADDR_PORT_ANY);
		sock->state = SS_UNCONNECTED;
		release_sock(sk);
		return 0;
	} else if (err != 0)
		return -EINVAL;

	lock_sock(sk);

	err = vsock_auto_bind(vsk);
	if (err)
		goto out;

	if (!vsk->transport->dgram_allow(remote_addr->svm_cid,
					 remote_addr->svm_port)) {
		err = -EINVAL;
		goto out;
	}

	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
	sock->state = SS_CONNECTED;

	/* sock map disallows redirection of non-TCP sockets with sk_state !=
	 * TCP_ESTABLISHED (see sock_map_redirect_allowed()), so we set
	 * TCP_ESTABLISHED here to allow redirection of connected vsock dgrams.
	 *
	 * This doesn't seem to be abnormal state for datagram sockets, as the
	 * same approach can be seen in other datagram socket types as well
	 * (such as unix sockets).
	 */
	sk->sk_state = TCP_ESTABLISHED;

out:
	release_sock(sk);
	return err;
}

int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			  size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct vsock_sock *vsk = vsock_sk(sk);

	return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
}

int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			size_t len, int flags)
{
#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot;

	prot = READ_ONCE(sk->sk_prot);
	if (prot != &vsock_proto)
		return prot->recvmsg(sk, msg, len, flags, NULL);
#endif

	return __vsock_dgram_recvmsg(sock, msg, len, flags);
}
EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);

static const struct proto_ops vsock_dgram_ops = {
	.family = PF_VSOCK,
	.owner = THIS_MODULE,
	.release = vsock_release,
	.bind = vsock_bind,
	.connect = vsock_dgram_connect,
	.socketpair = sock_no_socketpair,
	.accept = sock_no_accept,
	.getname = vsock_getname,
	.poll = vsock_poll,
	.ioctl = sock_no_ioctl,
	.listen = sock_no_listen,
	.shutdown = vsock_shutdown,
	.sendmsg = vsock_dgram_sendmsg,
	.recvmsg = vsock_dgram_recvmsg,
	.mmap = sock_no_mmap,
	.read_skb = vsock_read_skb,
};

static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
{
	const struct vsock_transport *transport = vsk->transport;

	if (!transport || !transport->cancel_pkt)
		return -EOPNOTSUPP;

	return transport->cancel_pkt(vsk);
}

static void vsock_connect_timeout(struct work_struct *work)
{
	struct sock *sk;
	struct vsock_sock *vsk;

	vsk = container_of(work, struct vsock_sock, connect_work.work);
	sk = sk_vsock(vsk);

	lock_sock(sk);
	if (sk->sk_state == TCP_SYN_SENT &&
	    (sk->sk_shutdown != SHUTDOWN_MASK)) {
		sk->sk_state = TCP_CLOSE;
		sk->sk_socket->state = SS_UNCONNECTED;
		sk->sk_err = ETIMEDOUT;
		sk_error_report(sk);
		vsock_transport_cancel_pkt(vsk);
	}
	release_sock(sk);

	sock_put(sk);
}
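
/* Connect a socket to a remote address.  For non-blocking connects the
 * timeout is enforced by vsock_connect_timeout() above: an extra socket
 * reference is taken for the delayed work, and if mod_delayed_work()
 * reports that a timer was already pending, the extra reference is
 * dropped again to keep the count balanced.
 */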
static int vsock_connect(struct socket *sock, struct sockaddr *addr,
			 int addr_len, int flags)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	const struct vsock_transport *transport;
	struct sockaddr_vm *remote_addr;
	long timeout;
	DEFINE_WAIT(wait);

	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	lock_sock(sk);

	/* XXX AF_UNSPEC should make us disconnect like AF_INET. */
	switch (sock->state) {
	case SS_CONNECTED:
		err = -EISCONN;
		goto out;
	case SS_DISCONNECTING:
		err = -EINVAL;
		goto out;
	case SS_CONNECTING:
		/* This continues on so we can move sock into the SS_CONNECTED
		 * state once the connection has completed (at which point err
		 * will be set to zero also).  Otherwise, we will either wait
		 * for the connection or return -EALREADY should this be a
		 * non-blocking call.
		 */
		err = -EALREADY;
		if (flags & O_NONBLOCK)
			goto out;
		break;
	default:
		if ((sk->sk_state == TCP_LISTEN) ||
		    vsock_addr_cast(addr, addr_len, &remote_addr) != 0) {
			err = -EINVAL;
			goto out;
		}

		/* Set the remote address that we are connecting to. */
		memcpy(&vsk->remote_addr, remote_addr,
		       sizeof(vsk->remote_addr));

		err = vsock_assign_transport(vsk, NULL);
		if (err)
			goto out;

		transport = vsk->transport;

		/* The hypervisor and well-known contexts do not have socket
		 * endpoints.
		 */
		if (!transport ||
		    !transport->stream_allow(remote_addr->svm_cid,
					     remote_addr->svm_port)) {
			err = -ENETUNREACH;
			goto out;
		}

		err = vsock_auto_bind(vsk);
		if (err)
			goto out;

		sk->sk_state = TCP_SYN_SENT;

		err = transport->connect(vsk);
		if (err < 0)
			goto out;

		/* Mark sock as connecting and set the error code to in
		 * progress in case this is a non-blocking connect.
		 */
		sock->state = SS_CONNECTING;
		err = -EINPROGRESS;
	}

	/* The receive path will handle all communication until we are able to
	 * enter the connected state.  Here we wait for the connection to be
	 * completed or a notification of an error.
	 */
	timeout = vsk->connect_timeout;
	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

	while (sk->sk_state != TCP_ESTABLISHED && sk->sk_err == 0) {
		if (flags & O_NONBLOCK) {
			/* If we're not going to block, we schedule a timeout
			 * function to generate a timeout on the connection
			 * attempt, in case the peer doesn't respond in a
			 * timely manner. We hold on to the socket until the
			 * timeout fires.
			 */
			sock_hold(sk);

			/* If the timeout function is already scheduled,
			 * reschedule it, then ungrab the socket refcount to
			 * keep it balanced.
			 */
			if (mod_delayed_work(system_wq, &vsk->connect_work,
					     timeout))
				sock_put(sk);

			/* Skip ahead to preserve error code set above. */
			goto out_wait;
		}

		release_sock(sk);
		timeout = schedule_timeout(timeout);
		lock_sock(sk);

		if (signal_pending(current)) {
			err = sock_intr_errno(timeout);
			sk->sk_state = sk->sk_state == TCP_ESTABLISHED ? TCP_CLOSING : TCP_CLOSE;
			sock->state = SS_UNCONNECTED;
			vsock_transport_cancel_pkt(vsk);
			vsock_remove_connected(vsk);
			goto out_wait;
		} else if ((sk->sk_state != TCP_ESTABLISHED) && (timeout == 0)) {
			err = -ETIMEDOUT;
			sk->sk_state = TCP_CLOSE;
			sock->state = SS_UNCONNECTED;
			vsock_transport_cancel_pkt(vsk);
			goto out_wait;
		}

		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	}

	if (sk->sk_err) {
		err = -sk->sk_err;
		sk->sk_state = TCP_CLOSE;
		sock->state = SS_UNCONNECTED;
	} else {
		err = 0;
	}

out_wait:
	finish_wait(sk_sleep(sk), &wait);
out:
	release_sock(sk);
	return err;
}
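
/* For reference, the listener side of this address family mirrors
 * AF_INET (sketch only, error handling omitted):
 *
 *	int fd = socket(AF_VSOCK, SOCK_STREAM, 0);
 *	struct sockaddr_vm addr = {
 *		.svm_family = AF_VSOCK,
 *		.svm_cid = VMADDR_CID_ANY,
 *		.svm_port = 1234,
 *	};
 *
 *	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
 *	listen(fd, 32);
 *	client = accept(fd, NULL, NULL);
 */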
static int vsock_accept(struct socket *sock, struct socket *newsock, int flags,
			bool kern)
{
	struct sock *listener;
	int err;
	struct sock *connected;
	struct vsock_sock *vconnected;
	long timeout;
	DEFINE_WAIT(wait);

	err = 0;
	listener = sock->sk;

	lock_sock(listener);

	if (!sock_type_connectible(sock->type)) {
		err = -EOPNOTSUPP;
		goto out;
	}

	if (listener->sk_state != TCP_LISTEN) {
		err = -EINVAL;
		goto out;
	}

	/* Wait for children sockets to appear; these are the new sockets
	 * created upon connection establishment.
	 */
	timeout = sock_rcvtimeo(listener, flags & O_NONBLOCK);
	prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);

	while ((connected = vsock_dequeue_accept(listener)) == NULL &&
	       listener->sk_err == 0) {
		release_sock(listener);
		timeout = schedule_timeout(timeout);
		finish_wait(sk_sleep(listener), &wait);
		lock_sock(listener);

		if (signal_pending(current)) {
			err = sock_intr_errno(timeout);
			goto out;
		} else if (timeout == 0) {
			err = -EAGAIN;
			goto out;
		}

		prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
	}
	finish_wait(sk_sleep(listener), &wait);

	if (listener->sk_err)
		err = -listener->sk_err;

	if (connected) {
		sk_acceptq_removed(listener);

		lock_sock_nested(connected, SINGLE_DEPTH_NESTING);
		vconnected = vsock_sk(connected);

		/* If the listener socket has received an error, then we should
		 * reject this socket and return.  Note that we simply mark the
		 * socket rejected, drop our reference, and let the cleanup
		 * function handle the cleanup; the fact that we found it in
		 * the listener's accept queue guarantees that the cleanup
		 * function hasn't run yet.
		 */
		if (err) {
			vconnected->rejected = true;
		} else {
			newsock->state = SS_CONNECTED;
			sock_graft(connected, newsock);
		}

		release_sock(connected);
		sock_put(connected);
	}

out:
	release_sock(listener);
	return err;
}

static int vsock_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;

	sk = sock->sk;

	lock_sock(sk);

	if (!sock_type_connectible(sk->sk_type)) {
		err = -EOPNOTSUPP;
		goto out;
	}

	if (sock->state != SS_UNCONNECTED) {
		err = -EINVAL;
		goto out;
	}

	vsk = vsock_sk(sk);

	if (!vsock_addr_bound(&vsk->local_addr)) {
		err = -EINVAL;
		goto out;
	}

	sk->sk_max_ack_backlog = backlog;
	sk->sk_state = TCP_LISTEN;

	err = 0;

out:
	release_sock(sk);
	return err;
}

static void vsock_update_buffer_size(struct vsock_sock *vsk,
				     const struct vsock_transport *transport,
				     u64 val)
{
	if (val > vsk->buffer_max_size)
		val = vsk->buffer_max_size;

	if (val < vsk->buffer_min_size)
		val = vsk->buffer_min_size;

	if (val != vsk->buffer_size &&
	    transport && transport->notify_buffer_size)
		transport->notify_buffer_size(vsk, &val);

	vsk->buffer_size = val;
}
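
/* The buffer size knobs above are exposed to user space as AF_VSOCK
 * level socket options taking a u64, e.g. (sketch):
 *
 *	uint64_t val = 256 * 1024;
 *
 *	setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
 *		   &val, sizeof(val));
 */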
static int vsock_connectible_setsockopt(struct socket *sock,
					int level,
					int optname,
					sockptr_t optval,
					unsigned int optlen)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	const struct vsock_transport *transport;
	u64 val;

	if (level != AF_VSOCK)
		return -ENOPROTOOPT;

#define COPY_IN(_v)							\
	do {								\
		if (optlen < sizeof(_v)) {				\
			err = -EINVAL;					\
			goto exit;					\
		}							\
		if (copy_from_sockptr(&_v, optval, sizeof(_v)) != 0) {	\
			err = -EFAULT;					\
			goto exit;					\
		}							\
	} while (0)

	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	lock_sock(sk);

	transport = vsk->transport;

	switch (optname) {
	case SO_VM_SOCKETS_BUFFER_SIZE:
		COPY_IN(val);
		vsock_update_buffer_size(vsk, transport, val);
		break;

	case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
		COPY_IN(val);
		vsk->buffer_max_size = val;
		vsock_update_buffer_size(vsk, transport, vsk->buffer_size);
		break;

	case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
		COPY_IN(val);
		vsk->buffer_min_size = val;
		vsock_update_buffer_size(vsk, transport, vsk->buffer_size);
		break;

	case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW:
	case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD: {
		struct __kernel_sock_timeval tv;

		err = sock_copy_user_timeval(&tv, optval, optlen,
					     optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD);
		if (err)
			break;
		if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC &&
		    tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) {
			vsk->connect_timeout = tv.tv_sec * HZ +
				DIV_ROUND_UP((unsigned long)tv.tv_usec, (USEC_PER_SEC / HZ));
			if (vsk->connect_timeout == 0)
				vsk->connect_timeout =
					VSOCK_DEFAULT_CONNECT_TIMEOUT;

		} else {
			err = -ERANGE;
		}
		break;
	}

	default:
		err = -ENOPROTOOPT;
		break;
	}

#undef COPY_IN

exit:
	release_sock(sk);
	return err;
}

static int vsock_connectible_getsockopt(struct socket *sock,
					int level, int optname,
					char __user *optval,
					int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct vsock_sock *vsk = vsock_sk(sk);

	union {
		u64 val64;
		struct old_timeval32 tm32;
		struct __kernel_old_timeval tm;
		struct __kernel_sock_timeval stm;
	} v;

	int lv = sizeof(v.val64);
	int len;

	if (level != AF_VSOCK)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_VM_SOCKETS_BUFFER_SIZE:
		v.val64 = vsk->buffer_size;
		break;

	case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
		v.val64 = vsk->buffer_max_size;
		break;

	case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
		v.val64 = vsk->buffer_min_size;
		break;

	case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW:
	case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD:
		lv = sock_get_timeout(vsk->connect_timeout, &v,
				      optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD);
		break;

	default:
		return -ENOPROTOOPT;
	}

	if (len < lv)
		return -EINVAL;
	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;

	if (put_user(len, optlen))
		return -EFAULT;

	return 0;
}
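
/* Send path shared by SOCK_STREAM and SOCK_SEQPACKET sockets.  Writes
 * as much as the transport's produce queue allows, sleeping for space
 * between partial writes unless the socket is non-blocking; the
 * notify_send_* transport hooks bracket each step.
 */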
static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg,
				     size_t len)
{
	struct sock *sk;
	struct vsock_sock *vsk;
	const struct vsock_transport *transport;
	ssize_t total_written;
	long timeout;
	int err;
	struct vsock_transport_send_notify_data send_data;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	sk = sock->sk;
	vsk = vsock_sk(sk);
	total_written = 0;
	err = 0;

	if (msg->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	lock_sock(sk);

	transport = vsk->transport;

	/* Callers should not provide a destination with connection oriented
	 * sockets.
	 */
	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out;
	}

	/* Send data only if both sides are not shutdown in the direction. */
	if (sk->sk_shutdown & SEND_SHUTDOWN ||
	    vsk->peer_shutdown & RCV_SHUTDOWN) {
		err = -EPIPE;
		goto out;
	}

	if (!transport || sk->sk_state != TCP_ESTABLISHED ||
	    !vsock_addr_bound(&vsk->local_addr)) {
		err = -ENOTCONN;
		goto out;
	}

	if (!vsock_addr_bound(&vsk->remote_addr)) {
		err = -EDESTADDRREQ;
		goto out;
	}

	/* Wait for room in the produce queue to enqueue our user's data. */
	timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	err = transport->notify_send_init(vsk, &send_data);
	if (err < 0)
		goto out;

	while (total_written < len) {
		ssize_t written;

		add_wait_queue(sk_sleep(sk), &wait);
		while (vsock_stream_has_space(vsk) == 0 &&
		       sk->sk_err == 0 &&
		       !(sk->sk_shutdown & SEND_SHUTDOWN) &&
		       !(vsk->peer_shutdown & RCV_SHUTDOWN)) {

			/* Don't wait for non-blocking sockets. */
			if (timeout == 0) {
				err = -EAGAIN;
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			}

			err = transport->notify_send_pre_block(vsk, &send_data);
			if (err < 0) {
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			}

			release_sock(sk);
			timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
			lock_sock(sk);
			if (signal_pending(current)) {
				err = sock_intr_errno(timeout);
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			} else if (timeout == 0) {
				err = -EAGAIN;
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			}
		}
		remove_wait_queue(sk_sleep(sk), &wait);

		/* These checks occur both as part of and after the loop
		 * conditional since we need to check before and after
		 * sleeping.
		 */
		if (sk->sk_err) {
			err = -sk->sk_err;
			goto out_err;
		} else if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
			   (vsk->peer_shutdown & RCV_SHUTDOWN)) {
			err = -EPIPE;
			goto out_err;
		}

		err = transport->notify_send_pre_enqueue(vsk, &send_data);
		if (err < 0)
			goto out_err;

		/* Note that enqueue will only write as many bytes as are free
		 * in the produce queue, so we don't need to ensure len is
		 * smaller than the queue size.  It is the caller's
		 * responsibility to check how many bytes we were able to send.
		 */

		if (sk->sk_type == SOCK_SEQPACKET) {
			written = transport->seqpacket_enqueue(vsk,
						msg, len - total_written);
		} else {
			written = transport->stream_enqueue(vsk,
						msg, len - total_written);
		}

		if (written < 0) {
			err = written;
			goto out_err;
		}

		total_written += written;

		err = transport->notify_send_post_enqueue(
				vsk, written, &send_data);
		if (err < 0)
			goto out_err;

	}

out_err:
	if (total_written > 0) {
		/* Return number of written bytes only if:
		 * 1) SOCK_STREAM socket.
		 * 2) SOCK_SEQPACKET socket when whole buffer is sent.
		 */
		if (sk->sk_type == SOCK_STREAM || total_written == len)
			err = total_written;
	}
out:
	release_sock(sk);
	return err;
}

static int vsock_connectible_wait_data(struct sock *sk,
				       struct wait_queue_entry *wait,
				       long timeout,
				       struct vsock_transport_recv_notify_data *recv_data,
				       size_t target)
{
	const struct vsock_transport *transport;
	struct vsock_sock *vsk;
	s64 data;
	int err;

	vsk = vsock_sk(sk);
	err = 0;
	transport = vsk->transport;

	while (1) {
		prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE);
		data = vsock_connectible_has_data(vsk);
		if (data != 0)
			break;

		if (sk->sk_err != 0 ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    (vsk->peer_shutdown & SEND_SHUTDOWN)) {
			break;
		}

		/* Don't wait for non-blocking sockets. */
		if (timeout == 0) {
			err = -EAGAIN;
			break;
		}

		if (recv_data) {
			err = transport->notify_recv_pre_block(vsk, target, recv_data);
			if (err < 0)
				break;
		}

		release_sock(sk);
		timeout = schedule_timeout(timeout);
		lock_sock(sk);

		if (signal_pending(current)) {
			err = sock_intr_errno(timeout);
			break;
		} else if (timeout == 0) {
			err = -EAGAIN;
			break;
		}
	}

	finish_wait(sk_sleep(sk), wait);

	if (err)
		return err;

	/* Internal transport error when checking for available
	 * data. XXX This should be changed to a connection
	 * reset in a later change.
	 */
	if (data < 0)
		return -ENOMEM;

	return data;
}
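
/* Stream receive: block until at least "target" bytes are available,
 * where "target" is the SO_RCVLOWAT value capped by len (or the full
 * len if MSG_WAITALL is set), then dequeue up to len bytes.  MSG_PEEK
 * returns data without consuming it.
 */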
static int __vsock_stream_recvmsg(struct sock *sk, struct msghdr *msg,
				  size_t len, int flags)
{
	struct vsock_transport_recv_notify_data recv_data;
	const struct vsock_transport *transport;
	struct vsock_sock *vsk;
	ssize_t copied;
	size_t target;
	long timeout;
	int err;

	DEFINE_WAIT(wait);

	vsk = vsock_sk(sk);
	transport = vsk->transport;

	/* We must not copy less than target bytes into the user's buffer
	 * before returning successfully, so we wait for the consume queue to
	 * have that much data to consume before dequeueing. Note that this
	 * makes it impossible to handle cases where target is greater than the
	 * queue size.
	 */
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
	if (target >= transport->stream_rcvhiwat(vsk)) {
		err = -ENOMEM;
		goto out;
	}
	timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	copied = 0;

	err = transport->notify_recv_init(vsk, target, &recv_data);
	if (err < 0)
		goto out;

	while (1) {
		ssize_t read;

		err = vsock_connectible_wait_data(sk, &wait, timeout,
						  &recv_data, target);
		if (err <= 0)
			break;

		err = transport->notify_recv_pre_dequeue(vsk, target,
							 &recv_data);
		if (err < 0)
			break;

		read = transport->stream_dequeue(vsk, msg, len - copied, flags);
		if (read < 0) {
			err = read;
			break;
		}

		copied += read;

		err = transport->notify_recv_post_dequeue(vsk, target, read,
						!(flags & MSG_PEEK), &recv_data);
		if (err < 0)
			goto out;

		if (read >= target || flags & MSG_PEEK)
			break;

		target -= read;
	}

	if (sk->sk_err)
		err = -sk->sk_err;
	else if (sk->sk_shutdown & RCV_SHUTDOWN)
		err = 0;

	if (copied > 0)
		err = copied;

out:
	return err;
}

static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
				     size_t len, int flags)
{
	const struct vsock_transport *transport;
	struct vsock_sock *vsk;
	ssize_t msg_len;
	long timeout;
	int err = 0;
	DEFINE_WAIT(wait);

	vsk = vsock_sk(sk);
	transport = vsk->transport;

	timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	err = vsock_connectible_wait_data(sk, &wait, timeout, NULL, 0);
	if (err <= 0)
		goto out;

	msg_len = transport->seqpacket_dequeue(vsk, msg, flags);
	if (msg_len < 0) {
		err = msg_len;
		goto out;
	}

	if (sk->sk_err) {
		err = -sk->sk_err;
	} else if (sk->sk_shutdown & RCV_SHUTDOWN) {
		err = 0;
	} else {
		/* If the user set MSG_TRUNC, return the real length of the
		 * packet.
		 */
		if (flags & MSG_TRUNC)
			err = msg_len;
		else
			err = len - msg_data_left(msg);

		/* Always set MSG_TRUNC if the real length of the packet is
		 * bigger than the user's buffer (see the userspace sketch
		 * after this function).
		 */
		if (msg_len > len)
			msg->msg_flags |= MSG_TRUNC;
	}

out:
	return err;
}

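/* Usage sketch (userspace, not part of this driver): detecting a truncated
 * SOCK_SEQPACKET record via msg_flags, matching the MSG_TRUNC handling
 * above. Assumes "fd" is a connected AF_VSOCK SOCK_SEQPACKET socket:
 *
 *        char buf[64];
 *        struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *        struct msghdr mh = { .msg_iov = &iov, .msg_iovlen = 1 };
 *        ssize_t n = recvmsg(fd, &mh, 0);
 *
 *        if (n >= 0 && (mh.msg_flags & MSG_TRUNC))
 *                fprintf(stderr, "record truncated to %zu bytes\n",
 *                        sizeof(buf));
 *
 * Passing MSG_TRUNC in the recvmsg() flags instead makes the call return
 * the full record length rather than the number of bytes copied.
 */
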
int
__vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
			    int flags)
{
	struct sock *sk;
	struct vsock_sock *vsk;
	const struct vsock_transport *transport;
	int err;

	sk = sock->sk;

	if (unlikely(flags & MSG_ERRQUEUE))
		return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, VSOCK_RECVERR);

	vsk = vsock_sk(sk);
	err = 0;

	lock_sock(sk);

	transport = vsk->transport;

	if (!transport || sk->sk_state != TCP_ESTABLISHED) {
		/* Recvmsg is supposed to return 0 if a peer performs an
		 * orderly shutdown. Differentiate between that case and when a
		 * peer has not connected or a local shutdown occurred with the
		 * SOCK_DONE flag.
		 */
		if (sock_flag(sk, SOCK_DONE))
			err = 0;
		else
			err = -ENOTCONN;

		goto out;
	}

	if (flags & MSG_OOB) {
		err = -EOPNOTSUPP;
		goto out;
	}

	/* We don't check the peer_shutdown flag here: even if the peer has
	 * shut down, there may still be data in the queue that the local
	 * socket can receive.
	 */
	if (sk->sk_shutdown & RCV_SHUTDOWN) {
		err = 0;
		goto out;
	}

	/* It is valid on Linux to pass in a zero-length receive buffer. This
	 * is not an error. We may as well bail out now.
	 */
	if (!len) {
		err = 0;
		goto out;
	}

	if (sk->sk_type == SOCK_STREAM)
		err = __vsock_stream_recvmsg(sk, msg, len, flags);
	else
		err = __vsock_seqpacket_recvmsg(sk, msg, len, flags);

out:
	release_sock(sk);
	return err;
}

int
vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
			  int flags)
{
#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot;

	prot = READ_ONCE(sk->sk_prot);
	if (prot != &vsock_proto)
		return prot->recvmsg(sk, msg, len, flags, NULL);
#endif

	return __vsock_connectible_recvmsg(sock, msg, len, flags);
}
EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg);

static int vsock_set_rcvlowat(struct sock *sk, int val)
{
	const struct vsock_transport *transport;
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);

	if (val > vsk->buffer_size)
		return -EINVAL;

	transport = vsk->transport;

	if (transport && transport->notify_set_rcvlowat) {
		int err;

		err = transport->notify_set_rcvlowat(vsk, val);
		if (err)
			return err;
	}

	WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
	return 0;
}

static const struct proto_ops vsock_stream_ops = {
	.family = PF_VSOCK,
	.owner = THIS_MODULE,
	.release = vsock_release,
	.bind = vsock_bind,
	.connect = vsock_connect,
	.socketpair = sock_no_socketpair,
	.accept = vsock_accept,
	.getname = vsock_getname,
	.poll = vsock_poll,
	.ioctl = sock_no_ioctl,
	.listen = vsock_listen,
	.shutdown = vsock_shutdown,
	.setsockopt = vsock_connectible_setsockopt,
	.getsockopt = vsock_connectible_getsockopt,
	.sendmsg = vsock_connectible_sendmsg,
	.recvmsg = vsock_connectible_recvmsg,
	.mmap = sock_no_mmap,
	.set_rcvlowat = vsock_set_rcvlowat,
	.read_skb = vsock_read_skb,
};

static const struct proto_ops vsock_seqpacket_ops = {
	.family = PF_VSOCK,
	.owner = THIS_MODULE,
	.release = vsock_release,
	.bind = vsock_bind,
	.connect = vsock_connect,
	.socketpair = sock_no_socketpair,
	.accept = vsock_accept,
	.getname = vsock_getname,
	.poll = vsock_poll,
	.ioctl = sock_no_ioctl,
	.listen = vsock_listen,
	.shutdown = vsock_shutdown,
	.setsockopt = vsock_connectible_setsockopt,
	.getsockopt = vsock_connectible_getsockopt,
	.sendmsg = vsock_connectible_sendmsg,
	.recvmsg = vsock_connectible_recvmsg,
	.mmap = sock_no_mmap,
	.read_skb = vsock_read_skb,
};

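/* Usage sketch (userspace, not part of this driver): the ops tables above
 * are selected in vsock_create() below based on the requested socket type.
 * Creating and connecting a stream socket; the CID and port here are
 * placeholder values:
 *
 *        #include <sys/socket.h>
 *        #include <linux/vm_sockets.h>
 *
 *        int fd = socket(AF_VSOCK, SOCK_STREAM, 0);
 *        struct sockaddr_vm addr = {
 *                .svm_family = AF_VSOCK,
 *                .svm_cid = 3,         // assumed peer guest CID
 *                .svm_port = 1234,     // assumed service port
 *        };
 *
 *        if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
 *                perror("connect");
 */
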
static int vsock_create(struct net *net, struct socket *sock,
			int protocol, int kern)
{
	struct vsock_sock *vsk;
	struct sock *sk;
	int ret;

	if (!sock)
		return -EINVAL;

	if (protocol && protocol != PF_VSOCK)
		return -EPROTONOSUPPORT;

	switch (sock->type) {
	case SOCK_DGRAM:
		sock->ops = &vsock_dgram_ops;
		break;
	case SOCK_STREAM:
		sock->ops = &vsock_stream_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &vsock_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sock->state = SS_UNCONNECTED;

	sk = __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern);
	if (!sk)
		return -ENOMEM;

	vsk = vsock_sk(sk);

	if (sock->type == SOCK_DGRAM) {
		ret = vsock_assign_transport(vsk, NULL);
		if (ret < 0) {
			sock_put(sk);
			return ret;
		}
	}

	vsock_insert_unbound(vsk);

	return 0;
}

static const struct net_proto_family vsock_family_ops = {
	.family = AF_VSOCK,
	.create = vsock_create,
	.owner = THIS_MODULE,
};

static long vsock_dev_do_ioctl(struct file *filp,
			       unsigned int cmd, void __user *ptr)
{
	u32 __user *p = ptr;
	u32 cid = VMADDR_CID_ANY;
	int retval = 0;

	switch (cmd) {
	case IOCTL_VM_SOCKETS_GET_LOCAL_CID:
		/* To be compatible with the VMCI behavior, we prioritize the
		 * guest CID instead of the well-known host CID
		 * (VMADDR_CID_HOST). See the userspace sketch at the end of
		 * this file.
		 */
		if (transport_g2h)
			cid = transport_g2h->get_local_cid();
		else if (transport_h2g)
			cid = transport_h2g->get_local_cid();

		if (put_user(cid, p) != 0)
			retval = -EFAULT;
		break;

	default:
		retval = -ENOIOCTLCMD;
	}

	return retval;
}

static long vsock_dev_ioctl(struct file *filp,
			    unsigned int cmd, unsigned long arg)
{
	return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg);
}

#ifdef CONFIG_COMPAT
static long vsock_dev_compat_ioctl(struct file *filp,
				   unsigned int cmd, unsigned long arg)
{
	return vsock_dev_do_ioctl(filp, cmd, compat_ptr(arg));
}
#endif

static const struct file_operations vsock_device_ops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vsock_dev_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vsock_dev_compat_ioctl,
#endif
	.open		= nonseekable_open,
};

static struct miscdevice vsock_device = {
	.name		= "vsock",
	.fops		= &vsock_device_ops,
};

static int __init vsock_init(void)
{
	int err = 0;

	vsock_init_tables();

	vsock_proto.owner = THIS_MODULE;
	vsock_device.minor = MISC_DYNAMIC_MINOR;
	err = misc_register(&vsock_device);
	if (err) {
		pr_err("Failed to register misc device\n");
		goto err_reset_transport;
	}

	err = proto_register(&vsock_proto, 1);	/* we want our slab */
	if (err) {
		pr_err("Cannot register vsock protocol\n");
		goto err_deregister_misc;
	}

	err = sock_register(&vsock_family_ops);
	if (err) {
		pr_err("could not register af_vsock (%d) address family: %d\n",
		       AF_VSOCK, err);
		goto err_unregister_proto;
	}

	vsock_bpf_build_proto();

	return 0;

err_unregister_proto:
	proto_unregister(&vsock_proto);
err_deregister_misc:
	misc_deregister(&vsock_device);
err_reset_transport:
	return err;
}

static void __exit vsock_exit(void)
{
	misc_deregister(&vsock_device);
	sock_unregister(AF_VSOCK);
	proto_unregister(&vsock_proto);
}

const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk)
{
	return vsk->transport;
}
EXPORT_SYMBOL_GPL(vsock_core_get_transport);

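/* Registration sketch (kernel side): a transport module claims one or more
 * roles with vsock_core_register() below and gives them up on exit. The
 * "my_transport" object is a hypothetical, fully populated
 * struct vsock_transport; registration fails with -EBUSY if the requested
 * role is already taken.
 *
 *        static int __init my_transport_init(void)
 *        {
 *                return vsock_core_register(&my_transport,
 *                                           VSOCK_TRANSPORT_F_H2G);
 *        }
 *
 *        static void __exit my_transport_exit(void)
 *        {
 *                vsock_core_unregister(&my_transport);
 *        }
 */
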
int vsock_core_register(const struct vsock_transport *t, int features)
{
	const struct vsock_transport *t_h2g, *t_g2h, *t_dgram, *t_local;
	int err = mutex_lock_interruptible(&vsock_register_mutex);

	if (err)
		return err;

	t_h2g = transport_h2g;
	t_g2h = transport_g2h;
	t_dgram = transport_dgram;
	t_local = transport_local;

	if (features & VSOCK_TRANSPORT_F_H2G) {
		if (t_h2g) {
			err = -EBUSY;
			goto err_busy;
		}
		t_h2g = t;
	}

	if (features & VSOCK_TRANSPORT_F_G2H) {
		if (t_g2h) {
			err = -EBUSY;
			goto err_busy;
		}
		t_g2h = t;
	}

	if (features & VSOCK_TRANSPORT_F_DGRAM) {
		if (t_dgram) {
			err = -EBUSY;
			goto err_busy;
		}
		t_dgram = t;
	}

	if (features & VSOCK_TRANSPORT_F_LOCAL) {
		if (t_local) {
			err = -EBUSY;
			goto err_busy;
		}
		t_local = t;
	}

	transport_h2g = t_h2g;
	transport_g2h = t_g2h;
	transport_dgram = t_dgram;
	transport_local = t_local;

err_busy:
	mutex_unlock(&vsock_register_mutex);
	return err;
}
EXPORT_SYMBOL_GPL(vsock_core_register);

void vsock_core_unregister(const struct vsock_transport *t)
{
	mutex_lock(&vsock_register_mutex);

	if (transport_h2g == t)
		transport_h2g = NULL;

	if (transport_g2h == t)
		transport_g2h = NULL;

	if (transport_dgram == t)
		transport_dgram = NULL;

	if (transport_local == t)
		transport_local = NULL;

	mutex_unlock(&vsock_register_mutex);
}
EXPORT_SYMBOL_GPL(vsock_core_unregister);

module_init(vsock_init);
module_exit(vsock_exit);

MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Virtual Socket Family");
MODULE_VERSION("1.0.2.0-k");
MODULE_LICENSE("GPL v2");
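
/* Usage sketch (userspace, not part of this driver): querying the local CID
 * through the misc device registered above. Assumes the standard device node
 * at /dev/vsock:
 *
 *        #include <fcntl.h>
 *        #include <sys/ioctl.h>
 *        #include <linux/vm_sockets.h>
 *
 *        int fd = open("/dev/vsock", O_RDONLY);
 *        unsigned int cid;
 *
 *        if (fd >= 0 && ioctl(fd, IOCTL_VM_SOCKETS_GET_LOCAL_CID, &cid) == 0)
 *                printf("local CID: %u\n", cid);
 */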