// SPDX-License-Identifier: GPL-2.0-only
/*
 * VMware vSockets Driver
 *
 * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
 */

/* Implementation notes:
 *
 * - There are two kinds of sockets: those created by user action (such as
 * calling socket(2)) and those created by incoming connection request packets.
 *
 * - There are two "global" tables, one for bound sockets (sockets that have
 * specified an address that they are responsible for) and one for connected
 * sockets (sockets that have established a connection with another socket).
 * These tables are "global" in that all sockets on the system are placed
 * within them. Note, though, that the bound table contains an extra entry
 * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in
 * that list. The bound table is used solely for lookup of sockets when packets
 * are received and that's not necessary for SOCK_DGRAM sockets since we create
 * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM
 * sockets out of the bound hash buckets will reduce the chance of collisions
 * when looking for SOCK_STREAM sockets and prevents us from having to check the
 * socket type in the hash table lookups.
 *
 * - Sockets created by user action will either be "client" sockets that
 * initiate a connection or "server" sockets that listen for connections; we do
 * not support simultaneous connects (two "client" sockets connecting).
 *
 * - "Server" sockets are referred to as listener sockets throughout this
 * implementation because they are in the TCP_LISTEN state. When a
 * connection request is received (the second kind of socket mentioned above),
 * we create a new socket and refer to it as a pending socket. These pending
 * sockets are placed on the pending connection list of the listener socket.
 * When future packets are received for the address the listener socket is
 * bound to, we check if the source of the packet is from one that has an
 * existing pending connection. If it does, we process the packet for the
 * pending socket. When that socket reaches the connected state, it is removed
 * from the listener socket's pending list and enqueued in the listener
 * socket's accept queue. Callers of accept(2) will accept connected sockets
 * from the listener socket's accept queue. If the socket cannot be accepted
 * for some reason then it is marked rejected. Once the connection is
 * accepted, it is owned by the user process and the responsibility for cleanup
 * falls with that user process.
 *
 * - It is possible that these pending sockets will never reach the connected
 * state; in fact, we may never receive another packet after the connection
 * request. Because of this, we must schedule a cleanup function to run in the
 * future, after some amount of time passes where a connection should have been
 * established. This function ensures that the socket is off all lists so it
 * cannot be retrieved, then drops all references to the socket so it is cleaned
 * up (sock_put() -> sk_free() -> our sk_destruct implementation). Note this
 * function will also clean up rejected sockets, those that reach the connected
 * state but leave it before they have been accepted.
 *
 * - Lock ordering for pending or accept queue sockets is:
 *
 *     lock_sock(listener);
 *     lock_sock_nested(pending, SINGLE_DEPTH_NESTING);
 *
 * Using explicit nested locking keeps lockdep happy since normally only one
 * lock of a given class may be taken at a time.
 *
 * - Sockets created by user action will be cleaned up when the user process
 * calls close(2), causing our release implementation to be called. Our release
 * implementation will perform some cleanup then drop the last reference so our
 * sk_destruct implementation is invoked. Our sk_destruct implementation will
 * perform additional cleanup that's common for both types of sockets.
 *
 * - A socket's reference count is what ensures that the structure won't be
 * freed. Each entry in a list (such as the "global" bound and connected tables
 * and the listener socket's pending list and connected queue) holds a
 * reference. When we defer work until process context and pass a socket as our
 * argument, we must ensure the reference count is increased to ensure the
 * socket isn't freed before the function is run; the deferred function will
 * then drop the reference.
 *
 * - sk->sk_state uses the TCP state constants because they are widely used by
 * other address families and exposed to userspace tools like ss(8):
 *
 *   TCP_CLOSE - unconnected
 *   TCP_SYN_SENT - connecting
 *   TCP_ESTABLISHED - connected
 *   TCP_CLOSING - disconnecting
 *   TCP_LISTEN - listening
 */

#include <linux/types.h>
#include <linux/bitops.h>
#include <linux/cred.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/net.h>
#include <linux/poll.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/smp.h>
#include <linux/socket.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <net/sock.h>
#include <net/af_vsock.h>

static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
static void vsock_sk_destruct(struct sock *sk);
static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);

/* Protocol family. */
static struct proto vsock_proto = {
	.name = "AF_VSOCK",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct vsock_sock),
};

/* The default peer timeout indicates how long we will wait for a peer response
 * to a control message.
 */
#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)

static const struct vsock_transport *transport;
static DEFINE_MUTEX(vsock_register_mutex);

/**** EXPORTS ****/

/* Get the ID of the local context. This is transport dependent. */

int vm_sockets_get_local_cid(void)
{
	return transport->get_local_cid();
}
EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid);
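
/* For reference, userspace typically retrieves the local CID through the
 * IOCTL_VM_SOCKETS_GET_LOCAL_CID ioctl on /dev/vsock, served by
 * vsock_dev_do_ioctl() near the bottom of this file. An illustrative sketch
 * only, with error handling omitted:
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/vm_sockets.h>
 *
 *	unsigned int local_cid(void)
 *	{
 *		unsigned int cid;
 *		int fd = open("/dev/vsock", O_RDONLY);
 *
 *		ioctl(fd, IOCTL_VM_SOCKETS_GET_LOCAL_CID, &cid);
 *		close(fd);
 *		return cid;
 *	}
 */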

/**** UTILS ****/

/* Each bound VSocket is stored in the bind hash table and each connected
 * VSocket is stored in the connected hash table.
 *
 * Unbound sockets are all put on the same list attached to the end of the hash
 * table (vsock_unbound_sockets). Bound sockets are added to the hash table in
 * the bucket that their local address hashes to (vsock_bound_sockets(addr)
 * represents the list that addr hashes to).
 *
 * Specifically, we initialize the vsock_bind_table array to a size of
 * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through
 * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and
 * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function
 * mods with VSOCK_HASH_SIZE to ensure this.
 */
#define MAX_PORT_RETRIES 24

#define VSOCK_HASH(addr) ((addr)->svm_port % VSOCK_HASH_SIZE)
#define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)])
#define vsock_unbound_sockets (&vsock_bind_table[VSOCK_HASH_SIZE])

/* XXX This can probably be implemented in a better way. */
#define VSOCK_CONN_HASH(src, dst) \
	(((src)->svm_cid ^ (dst)->svm_port) % VSOCK_HASH_SIZE)
#define vsock_connected_sockets(src, dst) \
	(&vsock_connected_table[VSOCK_CONN_HASH(src, dst)])
#define vsock_connected_sockets_vsk(vsk) \
	vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr)

struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1];
EXPORT_SYMBOL_GPL(vsock_bind_table);
struct list_head vsock_connected_table[VSOCK_HASH_SIZE];
EXPORT_SYMBOL_GPL(vsock_connected_table);
DEFINE_SPINLOCK(vsock_table_lock);
EXPORT_SYMBOL_GPL(vsock_table_lock);
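
/* Worked example (illustrative; VSOCK_HASH_SIZE is defined in
 * <net/af_vsock.h>, 251 at the time of writing): a socket bound to port 5000
 * sits in vsock_bind_table[5000 % 251], i.e. bucket 231, while a connection
 * from peer CID 3 to local port 5000 sits in
 * vsock_connected_table[(3 ^ 5000) % 251], i.e. bucket 234. Only addresses
 * feed the hash, never the socket type, which is why SOCK_DGRAM sockets are
 * kept off these buckets entirely.
 */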

/* Autobind this socket to the local address if necessary. */
static int vsock_auto_bind(struct vsock_sock *vsk)
{
	struct sock *sk = sk_vsock(vsk);
	struct sockaddr_vm local_addr;

	if (vsock_addr_bound(&vsk->local_addr))
		return 0;
	vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
	return __vsock_bind(sk, &local_addr);
}

static int __init vsock_init_tables(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++)
		INIT_LIST_HEAD(&vsock_bind_table[i]);

	for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++)
		INIT_LIST_HEAD(&vsock_connected_table[i]);
	return 0;
}

static void __vsock_insert_bound(struct list_head *list,
				 struct vsock_sock *vsk)
{
	sock_hold(&vsk->sk);
	list_add(&vsk->bound_table, list);
}

static void __vsock_insert_connected(struct list_head *list,
				     struct vsock_sock *vsk)
{
	sock_hold(&vsk->sk);
	list_add(&vsk->connected_table, list);
}

static void __vsock_remove_bound(struct vsock_sock *vsk)
{
	list_del_init(&vsk->bound_table);
	sock_put(&vsk->sk);
}

static void __vsock_remove_connected(struct vsock_sock *vsk)
{
	list_del_init(&vsk->connected_table);
	sock_put(&vsk->sk);
}

static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
{
	struct vsock_sock *vsk;

	list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table)
		if (addr->svm_port == vsk->local_addr.svm_port)
			return sk_vsock(vsk);

	return NULL;
}

static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src,
						  struct sockaddr_vm *dst)
{
	struct vsock_sock *vsk;

	list_for_each_entry(vsk, vsock_connected_sockets(src, dst),
			    connected_table) {
		if (vsock_addr_equals_addr(src, &vsk->remote_addr) &&
		    dst->svm_port == vsk->local_addr.svm_port) {
			return sk_vsock(vsk);
		}
	}

	return NULL;
}

static void vsock_insert_unbound(struct vsock_sock *vsk)
{
	spin_lock_bh(&vsock_table_lock);
	__vsock_insert_bound(vsock_unbound_sockets, vsk);
	spin_unlock_bh(&vsock_table_lock);
}

void vsock_insert_connected(struct vsock_sock *vsk)
{
	struct list_head *list = vsock_connected_sockets(
		&vsk->remote_addr, &vsk->local_addr);

	spin_lock_bh(&vsock_table_lock);
	__vsock_insert_connected(list, vsk);
	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_insert_connected);

void vsock_remove_bound(struct vsock_sock *vsk)
{
	spin_lock_bh(&vsock_table_lock);
	__vsock_remove_bound(vsk);
	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_remove_bound);

void vsock_remove_connected(struct vsock_sock *vsk)
{
	spin_lock_bh(&vsock_table_lock);
	__vsock_remove_connected(vsk);
	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_remove_connected);

struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
{
	struct sock *sk;

	spin_lock_bh(&vsock_table_lock);
	sk = __vsock_find_bound_socket(addr);
	if (sk)
		sock_hold(sk);

	spin_unlock_bh(&vsock_table_lock);

	return sk;
}
EXPORT_SYMBOL_GPL(vsock_find_bound_socket);

struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
					 struct sockaddr_vm *dst)
{
	struct sock *sk;

	spin_lock_bh(&vsock_table_lock);
	sk = __vsock_find_connected_socket(src, dst);
	if (sk)
		sock_hold(sk);

	spin_unlock_bh(&vsock_table_lock);

	return sk;
}
EXPORT_SYMBOL_GPL(vsock_find_connected_socket);

static bool vsock_in_bound_table(struct vsock_sock *vsk)
{
	bool ret;

	spin_lock_bh(&vsock_table_lock);
	ret = __vsock_in_bound_table(vsk);
	spin_unlock_bh(&vsock_table_lock);

	return ret;
}

static bool vsock_in_connected_table(struct vsock_sock *vsk)
{
	bool ret;

	spin_lock_bh(&vsock_table_lock);
	ret = __vsock_in_connected_table(vsk);
	spin_unlock_bh(&vsock_table_lock);

	return ret;
}

void vsock_remove_sock(struct vsock_sock *vsk)
{
	if (vsock_in_bound_table(vsk))
		vsock_remove_bound(vsk);

	if (vsock_in_connected_table(vsk))
		vsock_remove_connected(vsk);
}
EXPORT_SYMBOL_GPL(vsock_remove_sock);

void vsock_for_each_connected_socket(void (*fn)(struct sock *sk))
{
	int i;

	spin_lock_bh(&vsock_table_lock);

	for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) {
		struct vsock_sock *vsk;
		list_for_each_entry(vsk, &vsock_connected_table[i],
				    connected_table)
			fn(sk_vsock(vsk));
	}

	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket);

void vsock_add_pending(struct sock *listener, struct sock *pending)
{
	struct vsock_sock *vlistener;
	struct vsock_sock *vpending;

	vlistener = vsock_sk(listener);
	vpending = vsock_sk(pending);

	sock_hold(pending);
	sock_hold(listener);
	list_add_tail(&vpending->pending_links, &vlistener->pending_links);
}
EXPORT_SYMBOL_GPL(vsock_add_pending);

void vsock_remove_pending(struct sock *listener, struct sock *pending)
{
	struct vsock_sock *vpending = vsock_sk(pending);

	list_del_init(&vpending->pending_links);
	sock_put(listener);
	sock_put(pending);
}
EXPORT_SYMBOL_GPL(vsock_remove_pending);
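
/* A transport that receives a connection request is expected to pair these
 * helpers with the pending-socket timer roughly as follows (an illustrative
 * sketch only, not a definitive calling convention; see a real transport such
 * as the VMCI one for the authoritative sequence):
 *
 *	vsock_sk(pending)->listener = listener;
 *	sock_hold(listener);
 *	sock_hold(pending);
 *
 *	vsock_add_pending(listener, pending);
 *	listener->sk_ack_backlog++;
 *	pending->sk_state = TCP_SYN_SENT;
 *
 *	schedule_delayed_work(&vsock_sk(pending)->pending_work, HZ);
 *
 * vsock_pending_work() below then either finds the socket still pending
 * (the handshake never completed) and tears it down, or finds it already
 * accepted or rejected and simply drops the work's references.
 */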

void vsock_enqueue_accept(struct sock *listener, struct sock *connected)
{
	struct vsock_sock *vlistener;
	struct vsock_sock *vconnected;

	vlistener = vsock_sk(listener);
	vconnected = vsock_sk(connected);

	sock_hold(connected);
	sock_hold(listener);
	list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue);
}
EXPORT_SYMBOL_GPL(vsock_enqueue_accept);

static struct sock *vsock_dequeue_accept(struct sock *listener)
{
	struct vsock_sock *vlistener;
	struct vsock_sock *vconnected;

	vlistener = vsock_sk(listener);

	if (list_empty(&vlistener->accept_queue))
		return NULL;

	vconnected = list_entry(vlistener->accept_queue.next,
				struct vsock_sock, accept_queue);

	list_del_init(&vconnected->accept_queue);
	sock_put(listener);
	/* The caller will need a reference on the connected socket so we let
	 * it call sock_put().
	 */

	return sk_vsock(vconnected);
}

static bool vsock_is_accept_queue_empty(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	return list_empty(&vsk->accept_queue);
}

static bool vsock_is_pending(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	return !list_empty(&vsk->pending_links);
}

static int vsock_send_shutdown(struct sock *sk, int mode)
{
	return transport->shutdown(vsock_sk(sk), mode);
}

static void vsock_pending_work(struct work_struct *work)
{
	struct sock *sk;
	struct sock *listener;
	struct vsock_sock *vsk;
	bool cleanup;

	vsk = container_of(work, struct vsock_sock, pending_work.work);
	sk = sk_vsock(vsk);
	listener = vsk->listener;
	cleanup = true;

	lock_sock(listener);
	lock_sock_nested(sk, SINGLE_DEPTH_NESTING);

	if (vsock_is_pending(sk)) {
		vsock_remove_pending(listener, sk);

		listener->sk_ack_backlog--;
	} else if (!vsk->rejected) {
		/* We are not on the pending list and accept() did not reject
		 * us, so we must have been accepted by our user process. We
		 * just need to drop our references to the sockets and be on
		 * our way.
		 */
		cleanup = false;
		goto out;
	}

	/* We need to remove ourselves from the global connected sockets list
	 * so incoming packets can't find this socket, and to reduce the
	 * reference count.
	 */
	if (vsock_in_connected_table(vsk))
		vsock_remove_connected(vsk);

	sk->sk_state = TCP_CLOSE;

out:
	release_sock(sk);
	release_sock(listener);
	if (cleanup)
		sock_put(sk);

	sock_put(sk);
	sock_put(listener);
}

/**** SOCKET OPERATIONS ****/

static int __vsock_bind_stream(struct vsock_sock *vsk,
			       struct sockaddr_vm *addr)
{
	static u32 port;
	struct sockaddr_vm new_addr;

	if (!port)
		port = LAST_RESERVED_PORT + 1 +
			prandom_u32_max(U32_MAX - LAST_RESERVED_PORT);

	vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port);

	if (addr->svm_port == VMADDR_PORT_ANY) {
		bool found = false;
		unsigned int i;

		for (i = 0; i < MAX_PORT_RETRIES; i++) {
			if (port <= LAST_RESERVED_PORT)
				port = LAST_RESERVED_PORT + 1;

			new_addr.svm_port = port++;

			if (!__vsock_find_bound_socket(&new_addr)) {
				found = true;
				break;
			}
		}

		if (!found)
			return -EADDRNOTAVAIL;
	} else {
		/* If port is in reserved range, ensure caller
		 * has necessary privileges.
		 */
		if (addr->svm_port <= LAST_RESERVED_PORT &&
		    !capable(CAP_NET_BIND_SERVICE)) {
			return -EACCES;
		}

		if (__vsock_find_bound_socket(&new_addr))
			return -EADDRINUSE;
	}

	vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port);

	/* Remove stream sockets from the unbound list and add them to the hash
	 * table for easy lookup by their address. The unbound list is simply an
	 * extra entry at the end of the hash table, a trick used by AF_UNIX.
	 */
	__vsock_remove_bound(vsk);
	__vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk);

	return 0;
}

static int __vsock_bind_dgram(struct vsock_sock *vsk,
			      struct sockaddr_vm *addr)
{
	return transport->dgram_bind(vsk, addr);
}

static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	u32 cid;
	int retval;

	/* First ensure this socket isn't already bound. */
	if (vsock_addr_bound(&vsk->local_addr))
		return -EINVAL;

	/* Now bind to the provided address or select appropriate values if
	 * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that
	 * like AF_INET prevents binding to a non-local IP address (in most
	 * cases), we only allow binding to the local CID.
	 */
	cid = transport->get_local_cid();
	if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY)
		return -EADDRNOTAVAIL;

	switch (sk->sk_socket->type) {
	case SOCK_STREAM:
		spin_lock_bh(&vsock_table_lock);
		retval = __vsock_bind_stream(vsk, addr);
		spin_unlock_bh(&vsock_table_lock);
		break;

	case SOCK_DGRAM:
		retval = __vsock_bind_dgram(vsk, addr);
		break;

	default:
		retval = -EINVAL;
		break;
	}

	return retval;
}
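
/* Userspace view of the above (an illustrative sketch, error handling
 * omitted): binding to VMADDR_PORT_ANY picks an ephemeral port above
 * LAST_RESERVED_PORT, while binding to a reserved port (<= 1023) requires
 * CAP_NET_BIND_SERVICE.
 *
 *	#include <sys/socket.h>
 *	#include <linux/vm_sockets.h>
 *
 *	int bound_vsock(unsigned int port)
 *	{
 *		struct sockaddr_vm addr = {
 *			.svm_family = AF_VSOCK,
 *			.svm_cid = VMADDR_CID_ANY,
 *			.svm_port = port,	// or VMADDR_PORT_ANY
 *		};
 *		int fd = socket(AF_VSOCK, SOCK_STREAM, 0);
 *
 *		bind(fd, (struct sockaddr *)&addr, sizeof(addr));
 *		return fd;
 *	}
 */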

static void vsock_connect_timeout(struct work_struct *work);

struct sock *__vsock_create(struct net *net,
			    struct socket *sock,
			    struct sock *parent,
			    gfp_t priority,
			    unsigned short type,
			    int kern)
{
	struct sock *sk;
	struct vsock_sock *psk;
	struct vsock_sock *vsk;

	sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk);

	/* sk->sk_type is normally set in sock_init_data, but only if sock is
	 * non-NULL. We make sure that our sockets always have a type by
	 * setting it here if needed.
	 */
	if (!sock)
		sk->sk_type = type;

	vsk = vsock_sk(sk);
	vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
	vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);

	sk->sk_destruct = vsock_sk_destruct;
	sk->sk_backlog_rcv = vsock_queue_rcv_skb;
	sock_reset_flag(sk, SOCK_DONE);

	INIT_LIST_HEAD(&vsk->bound_table);
	INIT_LIST_HEAD(&vsk->connected_table);
	vsk->listener = NULL;
	INIT_LIST_HEAD(&vsk->pending_links);
	INIT_LIST_HEAD(&vsk->accept_queue);
	vsk->rejected = false;
	vsk->sent_request = false;
	vsk->ignore_connecting_rst = false;
	vsk->peer_shutdown = 0;
	INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout);
	INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work);

	psk = parent ? vsock_sk(parent) : NULL;
	if (parent) {
		vsk->trusted = psk->trusted;
		vsk->owner = get_cred(psk->owner);
		vsk->connect_timeout = psk->connect_timeout;
	} else {
		vsk->trusted = capable(CAP_NET_ADMIN);
		vsk->owner = get_current_cred();
		vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT;
	}

	if (transport->init(vsk, psk) < 0) {
		sk_free(sk);
		return NULL;
	}

	if (sock)
		vsock_insert_unbound(vsk);

	return sk;
}
EXPORT_SYMBOL_GPL(__vsock_create);
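
/* The second kind of socket from the implementation notes is created by a
 * transport when a connection request arrives for a listener, roughly like
 * this (an illustrative sketch, not a definitive calling convention):
 *
 *	struct sock *pending;
 *
 *	pending = __vsock_create(sock_net(listener), NULL, listener,
 *				 GFP_KERNEL, listener->sk_type, 0);
 *
 * Passing a NULL socket and a non-NULL parent gives the child the parent's
 * credentials, trust and connect timeout, and keeps it off the unbound list,
 * since it is about to be placed on the listener's pending list instead.
 */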

static void __vsock_release(struct sock *sk)
{
	if (sk) {
		struct sk_buff *skb;
		struct sock *pending;
		struct vsock_sock *vsk;

		vsk = vsock_sk(sk);
		pending = NULL;	/* Compiler warning. */

		transport->release(vsk);

		lock_sock(sk);
		sock_orphan(sk);
		sk->sk_shutdown = SHUTDOWN_MASK;

		while ((skb = skb_dequeue(&sk->sk_receive_queue)))
			kfree_skb(skb);

		/* Clean up any sockets that never were accepted. */
		while ((pending = vsock_dequeue_accept(sk)) != NULL) {
			__vsock_release(pending);
			sock_put(pending);
		}

		release_sock(sk);
		sock_put(sk);
	}
}

static void vsock_sk_destruct(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	transport->destruct(vsk);

	/* When clearing these addresses, there's no need to set the family and
	 * possibly register the address family with the kernel.
	 */
	vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
	vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);

	put_cred(vsk->owner);
}

static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sock_queue_rcv_skb(sk, skb);
	if (err)
		kfree_skb(skb);

	return err;
}

s64 vsock_stream_has_data(struct vsock_sock *vsk)
{
	return transport->stream_has_data(vsk);
}
EXPORT_SYMBOL_GPL(vsock_stream_has_data);

s64 vsock_stream_has_space(struct vsock_sock *vsk)
{
	return transport->stream_has_space(vsk);
}
EXPORT_SYMBOL_GPL(vsock_stream_has_space);

static int vsock_release(struct socket *sock)
{
	__vsock_release(sock->sk);
	sock->sk = NULL;
	sock->state = SS_FREE;

	return 0;
}

static int
vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	int err;
	struct sock *sk;
	struct sockaddr_vm *vm_addr;

	sk = sock->sk;

	if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0)
		return -EINVAL;

	lock_sock(sk);
	err = __vsock_bind(sk, vm_addr);
	release_sock(sk);

	return err;
}

static int vsock_getname(struct socket *sock,
			 struct sockaddr *addr, int peer)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *vm_addr;

	sk = sock->sk;
	vsk = vsock_sk(sk);
	err = 0;

	lock_sock(sk);

	if (peer) {
		if (sock->state != SS_CONNECTED) {
			err = -ENOTCONN;
			goto out;
		}
		vm_addr = &vsk->remote_addr;
	} else {
		vm_addr = &vsk->local_addr;
	}

	if (!vm_addr) {
		err = -EINVAL;
		goto out;
	}

	/* sys_getsockname() and sys_getpeername() pass us a
	 * MAX_SOCK_ADDR-sized buffer and don't set addr_len. Unfortunately
	 * that macro is defined in socket.c instead of .h, so we hardcode its
	 * value here.
	 */
	BUILD_BUG_ON(sizeof(*vm_addr) > 128);
	memcpy(addr, vm_addr, sizeof(*vm_addr));
	err = sizeof(*vm_addr);

out:
	release_sock(sk);
	return err;
}

static int vsock_shutdown(struct socket *sock, int mode)
{
	int err;
	struct sock *sk;

	/* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses
	 * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode
	 * here like the other address families do. Note also that the
	 * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3),
	 * which is what we want.
	 */
	mode++;

	if ((mode & ~SHUTDOWN_MASK) || !mode)
		return -EINVAL;

	/* If this is a STREAM socket and it is not connected then bail out
	 * immediately. If it is a DGRAM socket then we must first kick the
	 * socket so that it wakes up from any sleeping calls, for example
	 * recv(), and then afterwards return the error.
	 */

	sk = sock->sk;
	if (sock->state == SS_UNCONNECTED) {
		err = -ENOTCONN;
		if (sk->sk_type == SOCK_STREAM)
			return err;
	} else {
		sock->state = SS_DISCONNECTING;
		err = 0;
	}

	/* Receive and send shutdowns are treated alike. */
	mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN);
	if (mode) {
		lock_sock(sk);
		sk->sk_shutdown |= mode;
		sk->sk_state_change(sk);
		release_sock(sk);

		if (sk->sk_type == SOCK_STREAM) {
			sock_reset_flag(sk, SOCK_DONE);
			vsock_send_shutdown(sk, mode);
		}
	}

	return err;
}
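
/* Worked example of the mode mapping above: shutdown(fd, SHUT_WR) arrives
 * here with mode == SHUT_WR (1); the increment turns it into SEND_SHUTDOWN
 * (2), so only the send side is shut down:
 *
 *	SHUT_RD   (0) + 1 == RCV_SHUTDOWN  (1)
 *	SHUT_WR   (1) + 1 == SEND_SHUTDOWN (2)
 *	SHUT_RDWR (2) + 1 == RCV_SHUTDOWN | SEND_SHUTDOWN (3)
 */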

static __poll_t vsock_poll(struct file *file, struct socket *sock,
			   poll_table *wait)
{
	struct sock *sk;
	__poll_t mask;
	struct vsock_sock *vsk;

	sk = sock->sk;
	vsk = vsock_sk(sk);

	poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	if (sk->sk_err)
		/* Signify that there has been an error on this socket. */
		mask |= EPOLLERR;

	/* INET sockets treat local write shutdown and peer write shutdown as a
	 * case of EPOLLHUP set.
	 */
	if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
	    ((sk->sk_shutdown & SEND_SHUTDOWN) &&
	     (vsk->peer_shutdown & SEND_SHUTDOWN))) {
		mask |= EPOLLHUP;
	}

	if (sk->sk_shutdown & RCV_SHUTDOWN ||
	    vsk->peer_shutdown & SEND_SHUTDOWN) {
		mask |= EPOLLRDHUP;
	}

	if (sock->type == SOCK_DGRAM) {
		/* For datagram sockets we can read if there is something in
		 * the queue and write as long as the socket isn't shutdown for
		 * sending.
		 */
		if (!skb_queue_empty(&sk->sk_receive_queue) ||
		    (sk->sk_shutdown & RCV_SHUTDOWN)) {
			mask |= EPOLLIN | EPOLLRDNORM;
		}

		if (!(sk->sk_shutdown & SEND_SHUTDOWN))
			mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	} else if (sock->type == SOCK_STREAM) {
		lock_sock(sk);

		/* Listening sockets that have connections in their accept
		 * queue can be read.
		 */
		if (sk->sk_state == TCP_LISTEN &&
		    !vsock_is_accept_queue_empty(sk))
			mask |= EPOLLIN | EPOLLRDNORM;

		/* If there is something in the queue then we can read. */
		if (transport->stream_is_active(vsk) &&
		    !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			bool data_ready_now = false;
			int ret = transport->notify_poll_in(
					vsk, 1, &data_ready_now);
			if (ret < 0) {
				mask |= EPOLLERR;
			} else {
				if (data_ready_now)
					mask |= EPOLLIN | EPOLLRDNORM;
			}
		}

		/* Sockets whose connections have been closed, reset, or
		 * terminated should also be considered read, and we check the
		 * shutdown flag for that.
		 */
		if (sk->sk_shutdown & RCV_SHUTDOWN ||
		    vsk->peer_shutdown & SEND_SHUTDOWN) {
			mask |= EPOLLIN | EPOLLRDNORM;
		}

		/* Connected sockets that can produce data can be written. */
		if (sk->sk_state == TCP_ESTABLISHED) {
			if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
				bool space_avail_now = false;
				int ret = transport->notify_poll_out(
						vsk, 1, &space_avail_now);
				if (ret < 0) {
					mask |= EPOLLERR;
				} else {
					if (space_avail_now)
						/* Remove EPOLLWRBAND since INET
						 * sockets are not setting it.
						 */
						mask |= EPOLLOUT | EPOLLWRNORM;
				}
			}
		}

		/* Simulate INET socket poll behaviors, which sets
		 * EPOLLOUT|EPOLLWRNORM when peer is closed and nothing to read,
		 * but local send is not shutdown.
		 */
		if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) {
			if (!(sk->sk_shutdown & SEND_SHUTDOWN))
				mask |= EPOLLOUT | EPOLLWRNORM;
		}

		release_sock(sk);
	}

	return mask;
}
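
/* Illustrative userspace counterpart (error handling omitted): waiting for a
 * stream socket to become readable maps onto the EPOLLIN | EPOLLRDNORM bits
 * computed above, and POLLRDHUP (which needs _GNU_SOURCE) reports that the
 * peer shut down its send side.
 *
 *	#include <poll.h>
 *
 *	int wait_readable(int fd)
 *	{
 *		struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDHUP };
 *
 *		poll(&pfd, 1, -1);
 *		return pfd.revents;
 *	}
 */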

static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *remote_addr;

	if (msg->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	/* For now, MSG_DONTWAIT is always assumed... */
	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	lock_sock(sk);

	err = vsock_auto_bind(vsk);
	if (err)
		goto out;

	/* If the provided message contains an address, use that. Otherwise
	 * fall back on the socket's remote handle (if it has been connected).
	 */
	if (msg->msg_name &&
	    vsock_addr_cast(msg->msg_name, msg->msg_namelen,
			    &remote_addr) == 0) {
		/* Ensure this address is of the right type and is a valid
		 * destination.
		 */

		if (remote_addr->svm_cid == VMADDR_CID_ANY)
			remote_addr->svm_cid = transport->get_local_cid();

		if (!vsock_addr_bound(remote_addr)) {
			err = -EINVAL;
			goto out;
		}
	} else if (sock->state == SS_CONNECTED) {
		remote_addr = &vsk->remote_addr;

		if (remote_addr->svm_cid == VMADDR_CID_ANY)
			remote_addr->svm_cid = transport->get_local_cid();

		/* XXX Should connect() or this function ensure remote_addr is
		 * bound?
		 */
		if (!vsock_addr_bound(&vsk->remote_addr)) {
			err = -EINVAL;
			goto out;
		}
	} else {
		err = -EINVAL;
		goto out;
	}

	if (!transport->dgram_allow(remote_addr->svm_cid,
				    remote_addr->svm_port)) {
		err = -EINVAL;
		goto out;
	}

	err = transport->dgram_enqueue(vsk, remote_addr, msg, len);

out:
	release_sock(sk);
	return err;
}

static int vsock_dgram_connect(struct socket *sock,
			       struct sockaddr *addr, int addr_len, int flags)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *remote_addr;

	sk = sock->sk;
	vsk = vsock_sk(sk);

	err = vsock_addr_cast(addr, addr_len, &remote_addr);
	if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) {
		lock_sock(sk);
		vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY,
				VMADDR_PORT_ANY);
		sock->state = SS_UNCONNECTED;
		release_sock(sk);
		return 0;
	} else if (err != 0)
		return -EINVAL;

	lock_sock(sk);

	err = vsock_auto_bind(vsk);
	if (err)
		goto out;

	if (!transport->dgram_allow(remote_addr->svm_cid,
				    remote_addr->svm_port)) {
		err = -EINVAL;
		goto out;
	}

	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
	sock->state = SS_CONNECTED;

out:
	release_sock(sk);
	return err;
}

static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t len, int flags)
{
	return transport->dgram_dequeue(vsock_sk(sock->sk), msg, len, flags);
}

static const struct proto_ops vsock_dgram_ops = {
	.family = PF_VSOCK,
	.owner = THIS_MODULE,
	.release = vsock_release,
	.bind = vsock_bind,
	.connect = vsock_dgram_connect,
	.socketpair = sock_no_socketpair,
	.accept = sock_no_accept,
	.getname = vsock_getname,
	.poll = vsock_poll,
	.ioctl = sock_no_ioctl,
	.listen = sock_no_listen,
	.shutdown = vsock_shutdown,
	.setsockopt = sock_no_setsockopt,
	.getsockopt = sock_no_getsockopt,
	.sendmsg = vsock_dgram_sendmsg,
	.recvmsg = vsock_dgram_recvmsg,
	.mmap = sock_no_mmap,
	.sendpage = sock_no_sendpage,
};
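
/* Illustrative userspace use of the datagram path above, on a transport that
 * supports datagrams (error handling omitted): an explicit destination in
 * sendto() takes precedence over any address set with connect(), exactly as
 * vsock_dgram_sendmsg() implements.
 *
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <sys/socket.h>
 *	#include <linux/vm_sockets.h>
 *
 *	void dgram_ping(unsigned int cid, unsigned int port)
 *	{
 *		struct sockaddr_vm dst;
 *		int fd = socket(AF_VSOCK, SOCK_DGRAM, 0);
 *
 *		memset(&dst, 0, sizeof(dst));
 *		dst.svm_family = AF_VSOCK;
 *		dst.svm_cid = cid;
 *		dst.svm_port = port;
 *		sendto(fd, "ping", 4, 0, (struct sockaddr *)&dst, sizeof(dst));
 *		close(fd);
 *	}
 */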

static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
{
	if (!transport->cancel_pkt)
		return -EOPNOTSUPP;

	return transport->cancel_pkt(vsk);
}

static void vsock_connect_timeout(struct work_struct *work)
{
	struct sock *sk;
	struct vsock_sock *vsk;
	int cancel = 0;

	vsk = container_of(work, struct vsock_sock, connect_work.work);
	sk = sk_vsock(vsk);

	lock_sock(sk);
	if (sk->sk_state == TCP_SYN_SENT &&
	    (sk->sk_shutdown != SHUTDOWN_MASK)) {
		sk->sk_state = TCP_CLOSE;
		sk->sk_err = ETIMEDOUT;
		sk->sk_error_report(sk);
		cancel = 1;
	}
	release_sock(sk);
	if (cancel)
		vsock_transport_cancel_pkt(vsk);

	sock_put(sk);
}

static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
				int addr_len, int flags)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *remote_addr;
	long timeout;
	DEFINE_WAIT(wait);

	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	lock_sock(sk);

	/* XXX AF_UNSPEC should make us disconnect like AF_INET. */
	switch (sock->state) {
	case SS_CONNECTED:
		err = -EISCONN;
		goto out;
	case SS_DISCONNECTING:
		err = -EINVAL;
		goto out;
	case SS_CONNECTING:
		/* This continues on so we can move sock into the SS_CONNECTED
		 * state once the connection has completed (at which point err
		 * will be set to zero also). Otherwise, we will either wait
		 * for the connection or return -EALREADY should this be a
		 * non-blocking call.
		 */
		err = -EALREADY;
		break;
	default:
		if ((sk->sk_state == TCP_LISTEN) ||
		    vsock_addr_cast(addr, addr_len, &remote_addr) != 0) {
			err = -EINVAL;
			goto out;
		}

		/* The hypervisor and well-known contexts do not have socket
		 * endpoints.
		 */
		if (!transport->stream_allow(remote_addr->svm_cid,
					     remote_addr->svm_port)) {
			err = -ENETUNREACH;
			goto out;
		}

		/* Set the remote address that we are connecting to. */
		memcpy(&vsk->remote_addr, remote_addr,
		       sizeof(vsk->remote_addr));

		err = vsock_auto_bind(vsk);
		if (err)
			goto out;

		sk->sk_state = TCP_SYN_SENT;

		err = transport->connect(vsk);
		if (err < 0)
			goto out;

		/* Mark sock as connecting and set the error code to in
		 * progress in case this is a non-blocking connect.
		 */
		sock->state = SS_CONNECTING;
		err = -EINPROGRESS;
	}

	/* The receive path will handle all communication until we are able to
	 * enter the connected state. Here we wait for the connection to be
	 * completed or a notification of an error.
	 */
	timeout = vsk->connect_timeout;
	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

	while (sk->sk_state != TCP_ESTABLISHED && sk->sk_err == 0) {
		if (flags & O_NONBLOCK) {
			/* If we're not going to block, we schedule a timeout
			 * function to generate a timeout on the connection
			 * attempt, in case the peer doesn't respond in a
			 * timely manner. We hold on to the socket until the
			 * timeout fires.
			 */
			sock_hold(sk);
			schedule_delayed_work(&vsk->connect_work, timeout);

			/* Skip ahead to preserve error code set above.
			 */
			goto out_wait;
		}

		release_sock(sk);
		timeout = schedule_timeout(timeout);
		lock_sock(sk);

		if (signal_pending(current)) {
			err = sock_intr_errno(timeout);
			sk->sk_state = TCP_CLOSE;
			sock->state = SS_UNCONNECTED;
			vsock_transport_cancel_pkt(vsk);
			goto out_wait;
		} else if (timeout == 0) {
			err = -ETIMEDOUT;
			sk->sk_state = TCP_CLOSE;
			sock->state = SS_UNCONNECTED;
			vsock_transport_cancel_pkt(vsk);
			goto out_wait;
		}

		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	}

	if (sk->sk_err) {
		err = -sk->sk_err;
		sk->sk_state = TCP_CLOSE;
		sock->state = SS_UNCONNECTED;
	} else {
		err = 0;
	}

out_wait:
	finish_wait(sk_sleep(sk), &wait);
out:
	release_sock(sk);
	return err;
}
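
/* Illustrative non-blocking connect from userspace (error handling omitted):
 * the -EINPROGRESS path above means the caller polls for writability and then
 * reads SO_ERROR, just as with AF_INET.
 *
 *	#include <fcntl.h>
 *	#include <poll.h>
 *	#include <sys/socket.h>
 *	#include <linux/vm_sockets.h>
 *
 *	int connect_nb(int fd, const struct sockaddr_vm *dst)
 *	{
 *		int err;
 *		socklen_t len = sizeof(err);
 *		struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *
 *		fcntl(fd, F_SETFL, O_NONBLOCK);
 *		if (connect(fd, (const struct sockaddr *)dst,
 *			    sizeof(*dst)) == 0)
 *			return 0;		// connected immediately
 *		poll(&pfd, 1, -1);		// wait for completion
 *		getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *		return err;			// 0 on success
 *	}
 */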

static int vsock_accept(struct socket *sock, struct socket *newsock, int flags,
			bool kern)
{
	struct sock *listener;
	int err;
	struct sock *connected;
	struct vsock_sock *vconnected;
	long timeout;
	DEFINE_WAIT(wait);

	err = 0;
	listener = sock->sk;

	lock_sock(listener);

	if (sock->type != SOCK_STREAM) {
		err = -EOPNOTSUPP;
		goto out;
	}

	if (listener->sk_state != TCP_LISTEN) {
		err = -EINVAL;
		goto out;
	}

	/* Wait for children sockets to appear; these are the new sockets
	 * created upon connection establishment.
	 */
	timeout = sock_sndtimeo(listener, flags & O_NONBLOCK);
	prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);

	while ((connected = vsock_dequeue_accept(listener)) == NULL &&
	       listener->sk_err == 0) {
		release_sock(listener);
		timeout = schedule_timeout(timeout);
		finish_wait(sk_sleep(listener), &wait);
		lock_sock(listener);

		if (signal_pending(current)) {
			err = sock_intr_errno(timeout);
			goto out;
		} else if (timeout == 0) {
			err = -EAGAIN;
			goto out;
		}

		prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
	}
	finish_wait(sk_sleep(listener), &wait);

	if (listener->sk_err)
		err = -listener->sk_err;

	if (connected) {
		listener->sk_ack_backlog--;

		lock_sock_nested(connected, SINGLE_DEPTH_NESTING);
		vconnected = vsock_sk(connected);

		/* If the listener socket has received an error, then we should
		 * reject this socket and return. Note that we simply mark the
		 * socket rejected, drop our reference, and let the cleanup
		 * function handle the cleanup; the fact that we found it in
		 * the listener's accept queue guarantees that the cleanup
		 * function hasn't run yet.
		 */
		if (err) {
			vconnected->rejected = true;
		} else {
			newsock->state = SS_CONNECTED;
			sock_graft(connected, newsock);
		}

		release_sock(connected);
		sock_put(connected);
	}

out:
	release_sock(listener);
	return err;
}

static int vsock_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;

	sk = sock->sk;

	lock_sock(sk);

	if (sock->type != SOCK_STREAM) {
		err = -EOPNOTSUPP;
		goto out;
	}

	if (sock->state != SS_UNCONNECTED) {
		err = -EINVAL;
		goto out;
	}

	vsk = vsock_sk(sk);

	if (!vsock_addr_bound(&vsk->local_addr)) {
		err = -EINVAL;
		goto out;
	}

	sk->sk_max_ack_backlog = backlog;
	sk->sk_state = TCP_LISTEN;

	err = 0;

out:
	release_sock(sk);
	return err;
}
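
/* Illustrative userspace listener built on the above (error handling
 * omitted): bind() must precede listen(), since vsock_listen() rejects
 * unbound sockets with -EINVAL.
 *
 *	#include <sys/socket.h>
 *	#include <linux/vm_sockets.h>
 *
 *	int serve_once(unsigned int port)
 *	{
 *		struct sockaddr_vm addr = {
 *			.svm_family = AF_VSOCK,
 *			.svm_cid = VMADDR_CID_ANY,
 *			.svm_port = port,
 *		};
 *		int fd = socket(AF_VSOCK, SOCK_STREAM, 0);
 *
 *		bind(fd, (struct sockaddr *)&addr, sizeof(addr));
 *		listen(fd, 1);
 *		return accept(fd, NULL, NULL);
 *	}
 */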

static int vsock_stream_setsockopt(struct socket *sock,
				   int level,
				   int optname,
				   char __user *optval,
				   unsigned int optlen)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	u64 val;

	if (level != AF_VSOCK)
		return -ENOPROTOOPT;

#define COPY_IN(_v)						\
	do {							\
		if (optlen < sizeof(_v)) {			\
			err = -EINVAL;				\
			goto exit;				\
		}						\
		if (copy_from_user(&_v, optval, sizeof(_v)) != 0) {	\
			err = -EFAULT;				\
			goto exit;				\
		}						\
	} while (0)

	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	lock_sock(sk);

	switch (optname) {
	case SO_VM_SOCKETS_BUFFER_SIZE:
		COPY_IN(val);
		transport->set_buffer_size(vsk, val);
		break;

	case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
		COPY_IN(val);
		transport->set_max_buffer_size(vsk, val);
		break;

	case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
		COPY_IN(val);
		transport->set_min_buffer_size(vsk, val);
		break;

	case SO_VM_SOCKETS_CONNECT_TIMEOUT: {
		struct __kernel_old_timeval tv;
		COPY_IN(tv);
		if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC &&
		    tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) {
			vsk->connect_timeout = tv.tv_sec * HZ +
				DIV_ROUND_UP(tv.tv_usec, (1000000 / HZ));
			if (vsk->connect_timeout == 0)
				vsk->connect_timeout =
					VSOCK_DEFAULT_CONNECT_TIMEOUT;

		} else {
			err = -ERANGE;
		}
		break;
	}

	default:
		err = -ENOPROTOOPT;
		break;
	}

#undef COPY_IN

exit:
	release_sock(sk);
	return err;
}

static int vsock_stream_getsockopt(struct socket *sock,
				   int level, int optname,
				   char __user *optval,
				   int __user *optlen)
{
	int err;
	int len;
	struct sock *sk;
	struct vsock_sock *vsk;
	u64 val;

	if (level != AF_VSOCK)
		return -ENOPROTOOPT;

	err = get_user(len, optlen);
	if (err != 0)
		return err;

#define COPY_OUT(_v)					\
	do {						\
		if (len < sizeof(_v))			\
			return -EINVAL;			\
							\
		len = sizeof(_v);			\
		if (copy_to_user(optval, &_v, len) != 0)	\
			return -EFAULT;			\
							\
	} while (0)

	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	switch (optname) {
	case SO_VM_SOCKETS_BUFFER_SIZE:
		val = transport->get_buffer_size(vsk);
		COPY_OUT(val);
		break;

	case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
		val = transport->get_max_buffer_size(vsk);
		COPY_OUT(val);
		break;

	case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
		val = transport->get_min_buffer_size(vsk);
		COPY_OUT(val);
		break;

	case SO_VM_SOCKETS_CONNECT_TIMEOUT: {
		struct __kernel_old_timeval tv;
		tv.tv_sec = vsk->connect_timeout / HZ;
		tv.tv_usec =
		    (vsk->connect_timeout -
		     tv.tv_sec * HZ) * (1000000 / HZ);
		COPY_OUT(tv);
		break;
	}
	default:
		return -ENOPROTOOPT;
	}

	err = put_user(len, optlen);
	if (err != 0)
		return -EFAULT;

#undef COPY_OUT

	return 0;
}
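
/* Illustrative userspace use of SO_VM_SOCKETS_CONNECT_TIMEOUT (error handling
 * omitted): the timeval is converted to jiffies above, so sub-tick values are
 * rounded up and a zero timeout falls back to the 2 second default. Note the
 * option level is AF_VSOCK, not SOL_SOCKET.
 *
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *	#include <linux/vm_sockets.h>
 *
 *	void set_connect_timeout(int fd, time_t sec)
 *	{
 *		struct timeval tv = { .tv_sec = sec, .tv_usec = 0 };
 *
 *		setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_CONNECT_TIMEOUT,
 *			   &tv, sizeof(tv));
 *	}
 */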

static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
				size_t len)
{
	struct sock *sk;
	struct vsock_sock *vsk;
	ssize_t total_written;
	long timeout;
	int err;
	struct vsock_transport_send_notify_data send_data;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	sk = sock->sk;
	vsk = vsock_sk(sk);
	total_written = 0;
	err = 0;

	if (msg->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	lock_sock(sk);

	/* Callers should not provide a destination with stream sockets. */
	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out;
	}

	/* Send data only if both sides are not shutdown in the direction. */
	if (sk->sk_shutdown & SEND_SHUTDOWN ||
	    vsk->peer_shutdown & RCV_SHUTDOWN) {
		err = -EPIPE;
		goto out;
	}

	if (sk->sk_state != TCP_ESTABLISHED ||
	    !vsock_addr_bound(&vsk->local_addr)) {
		err = -ENOTCONN;
		goto out;
	}

	if (!vsock_addr_bound(&vsk->remote_addr)) {
		err = -EDESTADDRREQ;
		goto out;
	}

	/* Wait for room in the produce queue to enqueue our user's data. */
	timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	err = transport->notify_send_init(vsk, &send_data);
	if (err < 0)
		goto out;

	while (total_written < len) {
		ssize_t written;

		add_wait_queue(sk_sleep(sk), &wait);
		while (vsock_stream_has_space(vsk) == 0 &&
		       sk->sk_err == 0 &&
		       !(sk->sk_shutdown & SEND_SHUTDOWN) &&
		       !(vsk->peer_shutdown & RCV_SHUTDOWN)) {

			/* Don't wait for non-blocking sockets. */
			if (timeout == 0) {
				err = -EAGAIN;
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			}

			err = transport->notify_send_pre_block(vsk, &send_data);
			if (err < 0) {
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			}

			release_sock(sk);
			timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
			lock_sock(sk);
			if (signal_pending(current)) {
				err = sock_intr_errno(timeout);
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			} else if (timeout == 0) {
				err = -EAGAIN;
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			}
		}
		remove_wait_queue(sk_sleep(sk), &wait);

		/* These checks occur both as part of and after the loop
		 * conditional since we need to check before and after
		 * sleeping.
		 */
		if (sk->sk_err) {
			err = -sk->sk_err;
			goto out_err;
		} else if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
			   (vsk->peer_shutdown & RCV_SHUTDOWN)) {
			err = -EPIPE;
			goto out_err;
		}

		err = transport->notify_send_pre_enqueue(vsk, &send_data);
		if (err < 0)
			goto out_err;

		/* Note that enqueue will only write as many bytes as are free
		 * in the produce queue, so we don't need to ensure len is
		 * smaller than the queue size. It is the caller's
		 * responsibility to check how many bytes we were able to send.
		 */

		written = transport->stream_enqueue(
				vsk, msg,
				len - total_written);
		if (written < 0) {
			err = -ENOMEM;
			goto out_err;
		}

		total_written += written;

		err = transport->notify_send_post_enqueue(
				vsk, written, &send_data);
		if (err < 0)
			goto out_err;
	}

out_err:
	if (total_written > 0)
		err = total_written;
out:
	release_sock(sk);
	return err;
}
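
/* As the comment above the stream_enqueue call notes, a successful return may
 * be shorter than the requested length (e.g. on a signal or a non-blocking
 * socket). An illustrative userspace pattern for handling such partial writes
 * (error handling beyond the short-circuit omitted):
 *
 *	#include <sys/socket.h>
 *
 *	ssize_t send_all(int fd, const char *buf, size_t len)
 *	{
 *		size_t off = 0;
 *
 *		while (off < len) {
 *			ssize_t n = send(fd, buf + off, len - off, 0);
 *
 *			if (n <= 0)
 *				return n;
 *			off += n;
 *		}
 *		return off;
 *	}
 */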

static int
vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		     int flags)
{
	struct sock *sk;
	struct vsock_sock *vsk;
	int err;
	size_t target;
	ssize_t copied;
	long timeout;
	struct vsock_transport_recv_notify_data recv_data;

	DEFINE_WAIT(wait);

	sk = sock->sk;
	vsk = vsock_sk(sk);
	err = 0;

	lock_sock(sk);

	if (sk->sk_state != TCP_ESTABLISHED) {
		/* Recvmsg is supposed to return 0 if a peer performs an
		 * orderly shutdown. Differentiate between that case and when a
		 * peer has not connected or a local shutdown occurred with the
		 * SOCK_DONE flag.
		 */
		if (sock_flag(sk, SOCK_DONE))
			err = 0;
		else
			err = -ENOTCONN;

		goto out;
	}

	if (flags & MSG_OOB) {
		err = -EOPNOTSUPP;
		goto out;
	}

	/* We don't check peer_shutdown flag here since peer may actually shut
	 * down, but there can be data in the queue that a local socket can
	 * receive.
	 */
	if (sk->sk_shutdown & RCV_SHUTDOWN) {
		err = 0;
		goto out;
	}

	/* It is valid on Linux to pass in a zero-length receive buffer. This
	 * is not an error. We may as well bail out now.
	 */
	if (!len) {
		err = 0;
		goto out;
	}

	/* We must not copy less than target bytes into the user's buffer
	 * before returning successfully, so we wait for the consume queue to
	 * have that much data to consume before dequeueing. Note that this
	 * makes it impossible to handle cases where target is greater than the
	 * queue size.
	 */
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
	if (target >= transport->stream_rcvhiwat(vsk)) {
		err = -ENOMEM;
		goto out;
	}
	timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	copied = 0;

	err = transport->notify_recv_init(vsk, target, &recv_data);
	if (err < 0)
		goto out;

	while (1) {
		s64 ready;

		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		ready = vsock_stream_has_data(vsk);

		if (ready == 0) {
			if (sk->sk_err != 0 ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    (vsk->peer_shutdown & SEND_SHUTDOWN)) {
				finish_wait(sk_sleep(sk), &wait);
				break;
			}
			/* Don't wait for non-blocking sockets. */
			if (timeout == 0) {
				err = -EAGAIN;
				finish_wait(sk_sleep(sk), &wait);
				break;
			}

			err = transport->notify_recv_pre_block(
					vsk, target, &recv_data);
			if (err < 0) {
				finish_wait(sk_sleep(sk), &wait);
				break;
			}
			release_sock(sk);
			timeout = schedule_timeout(timeout);
			lock_sock(sk);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeout);
				finish_wait(sk_sleep(sk), &wait);
				break;
			} else if (timeout == 0) {
				err = -EAGAIN;
				finish_wait(sk_sleep(sk), &wait);
				break;
			}
		} else {
			ssize_t read;

			finish_wait(sk_sleep(sk), &wait);

			if (ready < 0) {
				/* Invalid queue pair content. XXX This should
				 * be changed to a connection reset in a later
				 * change.
				 */

				err = -ENOMEM;
				goto out;
			}

			err = transport->notify_recv_pre_dequeue(
					vsk, target, &recv_data);
			if (err < 0)
				break;

			read = transport->stream_dequeue(
					vsk, msg,
					len - copied, flags);
			if (read < 0) {
				err = -ENOMEM;
				break;
			}

			copied += read;

			err = transport->notify_recv_post_dequeue(
					vsk, target, read,
					!(flags & MSG_PEEK), &recv_data);
			if (err < 0)
				goto out;

			if (read >= target || flags & MSG_PEEK)
				break;

			target -= read;
		}
	}

	if (sk->sk_err)
		err = -sk->sk_err;
	else if (sk->sk_shutdown & RCV_SHUTDOWN)
		err = 0;

	if (copied > 0)
		err = copied;

out:
	release_sock(sk);
	return err;
}

static const struct proto_ops vsock_stream_ops = {
	.family = PF_VSOCK,
	.owner = THIS_MODULE,
	.release = vsock_release,
	.bind = vsock_bind,
	.connect = vsock_stream_connect,
	.socketpair = sock_no_socketpair,
	.accept = vsock_accept,
	.getname = vsock_getname,
	.poll = vsock_poll,
	.ioctl = sock_no_ioctl,
	.listen = vsock_listen,
	.shutdown = vsock_shutdown,
	.setsockopt = vsock_stream_setsockopt,
	.getsockopt = vsock_stream_getsockopt,
	.sendmsg = vsock_stream_sendmsg,
	.recvmsg = vsock_stream_recvmsg,
	.mmap = sock_no_mmap,
	.sendpage = sock_no_sendpage,
};

static int vsock_create(struct net *net, struct socket *sock,
			int protocol, int kern)
{
	if (!sock)
		return -EINVAL;

	if (protocol && protocol != PF_VSOCK)
		return -EPROTONOSUPPORT;

	switch (sock->type) {
	case SOCK_DGRAM:
		sock->ops = &vsock_dgram_ops;
		break;
	case SOCK_STREAM:
		sock->ops = &vsock_stream_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sock->state = SS_UNCONNECTED;

	return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ?
		0 : -ENOMEM;
}

static const struct net_proto_family vsock_family_ops = {
	.family = AF_VSOCK,
	.create = vsock_create,
	.owner = THIS_MODULE,
};

static long vsock_dev_do_ioctl(struct file *filp,
			       unsigned int cmd, void __user *ptr)
{
	u32 __user *p = ptr;
	int retval = 0;

	switch (cmd) {
	case IOCTL_VM_SOCKETS_GET_LOCAL_CID:
		if (put_user(transport->get_local_cid(), p) != 0)
			retval = -EFAULT;
		break;

	default:
		pr_err("Unknown ioctl %d\n", cmd);
		retval = -EINVAL;
	}

	return retval;
}

static long vsock_dev_ioctl(struct file *filp,
			    unsigned int cmd, unsigned long arg)
{
	return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg);
}

#ifdef CONFIG_COMPAT
static long vsock_dev_compat_ioctl(struct file *filp,
				   unsigned int cmd, unsigned long arg)
{
	return vsock_dev_do_ioctl(filp, cmd, compat_ptr(arg));
}
#endif

static const struct file_operations vsock_device_ops = {
	.owner = THIS_MODULE,
	.unlocked_ioctl = vsock_dev_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = vsock_dev_compat_ioctl,
#endif
	.open = nonseekable_open,
};

static struct miscdevice vsock_device = {
	.name = "vsock",
	.fops = &vsock_device_ops,
};

int __vsock_core_init(const struct vsock_transport *t, struct module *owner)
{
	int err = mutex_lock_interruptible(&vsock_register_mutex);

	if (err)
		return err;

	if (transport) {
		err = -EBUSY;
		goto err_busy;
	}

	/* Transport must be the owner of the protocol so that it can't
	 * unload while there are open sockets.
	 */
	vsock_proto.owner = owner;
	transport = t;

	vsock_device.minor = MISC_DYNAMIC_MINOR;
	err = misc_register(&vsock_device);
	if (err) {
		pr_err("Failed to register misc device\n");
		goto err_reset_transport;
	}

	err = proto_register(&vsock_proto, 1);	/* we want our slab */
	if (err) {
		pr_err("Cannot register vsock protocol\n");
		goto err_deregister_misc;
	}

	err = sock_register(&vsock_family_ops);
	if (err) {
		pr_err("could not register af_vsock (%d) address family: %d\n",
		       AF_VSOCK, err);
		goto err_unregister_proto;
	}

	mutex_unlock(&vsock_register_mutex);
	return 0;

err_unregister_proto:
	proto_unregister(&vsock_proto);
err_deregister_misc:
	misc_deregister(&vsock_device);
err_reset_transport:
	transport = NULL;
err_busy:
	mutex_unlock(&vsock_register_mutex);
	return err;
}
EXPORT_SYMBOL_GPL(__vsock_core_init);
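
/* A transport module registers itself at init time roughly like this (an
 * illustrative sketch; af_vsock.h provides a vsock_core_init() wrapper that
 * passes THIS_MODULE as the owner):
 *
 *	static const struct vsock_transport my_transport = {
 *		.get_local_cid	= my_get_local_cid,
 *		.init		= my_sock_init,
 *		...
 *	};
 *
 *	static int __init my_transport_init(void)
 *	{
 *		return vsock_core_init(&my_transport);
 *	}
 *
 * Only one transport can be registered at a time; a second caller gets
 * -EBUSY until the first calls vsock_core_exit().
 */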

void vsock_core_exit(void)
{
	mutex_lock(&vsock_register_mutex);

	misc_deregister(&vsock_device);
	sock_unregister(AF_VSOCK);
	proto_unregister(&vsock_proto);

	/* We do not want the assignment below re-ordered. */
	mb();
	transport = NULL;

	mutex_unlock(&vsock_register_mutex);
}
EXPORT_SYMBOL_GPL(vsock_core_exit);

const struct vsock_transport *vsock_core_get_transport(void)
{
	/* vsock_register_mutex not taken since only the transport uses this
	 * function and only while registered.
	 */
	return transport;
}
EXPORT_SYMBOL_GPL(vsock_core_get_transport);

static void __exit vsock_exit(void)
{
	/* Do nothing. This function makes this module removable. */
}

module_init(vsock_init_tables);
module_exit(vsock_exit);

MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Virtual Socket Family");
MODULE_VERSION("1.0.2.0-k");
MODULE_LICENSE("GPL v2");