1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * VMware vSockets Driver 4 * 5 * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. 6 */ 7 8 /* Implementation notes: 9 * 10 * - There are two kinds of sockets: those created by user action (such as 11 * calling socket(2)) and those created by incoming connection request packets. 12 * 13 * - There are two "global" tables, one for bound sockets (sockets that have 14 * specified an address that they are responsible for) and one for connected 15 * sockets (sockets that have established a connection with another socket). 16 * These tables are "global" in that all sockets on the system are placed 17 * within them. - Note, though, that the bound table contains an extra entry 18 * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in 19 * that list. The bound table is used solely for lookup of sockets when packets 20 * are received and that's not necessary for SOCK_DGRAM sockets since we create 21 * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM 22 * sockets out of the bound hash buckets will reduce the chance of collisions 23 * when looking for SOCK_STREAM sockets and prevents us from having to check the 24 * socket type in the hash table lookups. 25 * 26 * - Sockets created by user action will either be "client" sockets that 27 * initiate a connection or "server" sockets that listen for connections; we do 28 * not support simultaneous connects (two "client" sockets connecting). 29 * 30 * - "Server" sockets are referred to as listener sockets throughout this 31 * implementation because they are in the TCP_LISTEN state. When a 32 * connection request is received (the second kind of socket mentioned above), 33 * we create a new socket and refer to it as a pending socket. These pending 34 * sockets are placed on the pending connection list of the listener socket. 35 * When future packets are received for the address the listener socket is 36 * bound to, we check if the source of the packet is from one that has an 37 * existing pending connection. If it does, we process the packet for the 38 * pending socket. When that socket reaches the connected state, it is removed 39 * from the listener socket's pending list and enqueued in the listener 40 * socket's accept queue. Callers of accept(2) will accept connected sockets 41 * from the listener socket's accept queue. If the socket cannot be accepted 42 * for some reason then it is marked rejected. Once the connection is 43 * accepted, it is owned by the user process and the responsibility for cleanup 44 * falls with that user process. 45 * 46 * - It is possible that these pending sockets will never reach the connected 47 * state; in fact, we may never receive another packet after the connection 48 * request. Because of this, we must schedule a cleanup function to run in the 49 * future, after some amount of time passes where a connection should have been 50 * established. This function ensures that the socket is off all lists so it 51 * cannot be retrieved, then drops all references to the socket so it is cleaned 52 * up (sock_put() -> sk_free() -> our sk_destruct implementation). Note this 53 * function will also cleanup rejected sockets, those that reach the connected 54 * state but leave it before they have been accepted. 55 * 56 * - Lock ordering for pending or accept queue sockets is: 57 * 58 * lock_sock(listener); 59 * lock_sock_nested(pending, SINGLE_DEPTH_NESTING); 60 * 61 * Using explicit nested locking keeps lockdep happy since normally only one 62 * lock of a given class may be taken at a time. 63 * 64 * - Sockets created by user action will be cleaned up when the user process 65 * calls close(2), causing our release implementation to be called. Our release 66 * implementation will perform some cleanup then drop the last reference so our 67 * sk_destruct implementation is invoked. Our sk_destruct implementation will 68 * perform additional cleanup that's common for both types of sockets. 69 * 70 * - A socket's reference count is what ensures that the structure won't be 71 * freed. Each entry in a list (such as the "global" bound and connected tables 72 * and the listener socket's pending list and connected queue) ensures a 73 * reference. When we defer work until process context and pass a socket as our 74 * argument, we must ensure the reference count is increased to ensure the 75 * socket isn't freed before the function is run; the deferred function will 76 * then drop the reference. 77 * 78 * - sk->sk_state uses the TCP state constants because they are widely used by 79 * other address families and exposed to userspace tools like ss(8): 80 * 81 * TCP_CLOSE - unconnected 82 * TCP_SYN_SENT - connecting 83 * TCP_ESTABLISHED - connected 84 * TCP_CLOSING - disconnecting 85 * TCP_LISTEN - listening 86 */ 87 88 #include <linux/types.h> 89 #include <linux/bitops.h> 90 #include <linux/cred.h> 91 #include <linux/init.h> 92 #include <linux/io.h> 93 #include <linux/kernel.h> 94 #include <linux/sched/signal.h> 95 #include <linux/kmod.h> 96 #include <linux/list.h> 97 #include <linux/miscdevice.h> 98 #include <linux/module.h> 99 #include <linux/mutex.h> 100 #include <linux/net.h> 101 #include <linux/poll.h> 102 #include <linux/random.h> 103 #include <linux/skbuff.h> 104 #include <linux/smp.h> 105 #include <linux/socket.h> 106 #include <linux/stddef.h> 107 #include <linux/unistd.h> 108 #include <linux/wait.h> 109 #include <linux/workqueue.h> 110 #include <net/sock.h> 111 #include <net/af_vsock.h> 112 113 static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr); 114 static void vsock_sk_destruct(struct sock *sk); 115 static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); 116 117 /* Protocol family. */ 118 static struct proto vsock_proto = { 119 .name = "AF_VSOCK", 120 .owner = THIS_MODULE, 121 .obj_size = sizeof(struct vsock_sock), 122 }; 123 124 /* The default peer timeout indicates how long we will wait for a peer response 125 * to a control message. 126 */ 127 #define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) 128 129 #define VSOCK_DEFAULT_BUFFER_SIZE (1024 * 256) 130 #define VSOCK_DEFAULT_BUFFER_MAX_SIZE (1024 * 256) 131 #define VSOCK_DEFAULT_BUFFER_MIN_SIZE 128 132 133 static const struct vsock_transport *transport_single; 134 static DEFINE_MUTEX(vsock_register_mutex); 135 136 /**** UTILS ****/ 137 138 /* Each bound VSocket is stored in the bind hash table and each connected 139 * VSocket is stored in the connected hash table. 140 * 141 * Unbound sockets are all put on the same list attached to the end of the hash 142 * table (vsock_unbound_sockets). Bound sockets are added to the hash table in 143 * the bucket that their local address hashes to (vsock_bound_sockets(addr) 144 * represents the list that addr hashes to). 145 * 146 * Specifically, we initialize the vsock_bind_table array to a size of 147 * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through 148 * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and 149 * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function 150 * mods with VSOCK_HASH_SIZE to ensure this. 151 */ 152 #define MAX_PORT_RETRIES 24 153 154 #define VSOCK_HASH(addr) ((addr)->svm_port % VSOCK_HASH_SIZE) 155 #define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)]) 156 #define vsock_unbound_sockets (&vsock_bind_table[VSOCK_HASH_SIZE]) 157 158 /* XXX This can probably be implemented in a better way. */ 159 #define VSOCK_CONN_HASH(src, dst) \ 160 (((src)->svm_cid ^ (dst)->svm_port) % VSOCK_HASH_SIZE) 161 #define vsock_connected_sockets(src, dst) \ 162 (&vsock_connected_table[VSOCK_CONN_HASH(src, dst)]) 163 #define vsock_connected_sockets_vsk(vsk) \ 164 vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr) 165 166 struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; 167 EXPORT_SYMBOL_GPL(vsock_bind_table); 168 struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; 169 EXPORT_SYMBOL_GPL(vsock_connected_table); 170 DEFINE_SPINLOCK(vsock_table_lock); 171 EXPORT_SYMBOL_GPL(vsock_table_lock); 172 173 /* Autobind this socket to the local address if necessary. */ 174 static int vsock_auto_bind(struct vsock_sock *vsk) 175 { 176 struct sock *sk = sk_vsock(vsk); 177 struct sockaddr_vm local_addr; 178 179 if (vsock_addr_bound(&vsk->local_addr)) 180 return 0; 181 vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 182 return __vsock_bind(sk, &local_addr); 183 } 184 185 static int __init vsock_init_tables(void) 186 { 187 int i; 188 189 for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++) 190 INIT_LIST_HEAD(&vsock_bind_table[i]); 191 192 for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) 193 INIT_LIST_HEAD(&vsock_connected_table[i]); 194 return 0; 195 } 196 197 static void __vsock_insert_bound(struct list_head *list, 198 struct vsock_sock *vsk) 199 { 200 sock_hold(&vsk->sk); 201 list_add(&vsk->bound_table, list); 202 } 203 204 static void __vsock_insert_connected(struct list_head *list, 205 struct vsock_sock *vsk) 206 { 207 sock_hold(&vsk->sk); 208 list_add(&vsk->connected_table, list); 209 } 210 211 static void __vsock_remove_bound(struct vsock_sock *vsk) 212 { 213 list_del_init(&vsk->bound_table); 214 sock_put(&vsk->sk); 215 } 216 217 static void __vsock_remove_connected(struct vsock_sock *vsk) 218 { 219 list_del_init(&vsk->connected_table); 220 sock_put(&vsk->sk); 221 } 222 223 static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr) 224 { 225 struct vsock_sock *vsk; 226 227 list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) 228 if (addr->svm_port == vsk->local_addr.svm_port) 229 return sk_vsock(vsk); 230 231 return NULL; 232 } 233 234 static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src, 235 struct sockaddr_vm *dst) 236 { 237 struct vsock_sock *vsk; 238 239 list_for_each_entry(vsk, vsock_connected_sockets(src, dst), 240 connected_table) { 241 if (vsock_addr_equals_addr(src, &vsk->remote_addr) && 242 dst->svm_port == vsk->local_addr.svm_port) { 243 return sk_vsock(vsk); 244 } 245 } 246 247 return NULL; 248 } 249 250 static void vsock_insert_unbound(struct vsock_sock *vsk) 251 { 252 spin_lock_bh(&vsock_table_lock); 253 __vsock_insert_bound(vsock_unbound_sockets, vsk); 254 spin_unlock_bh(&vsock_table_lock); 255 } 256 257 void vsock_insert_connected(struct vsock_sock *vsk) 258 { 259 struct list_head *list = vsock_connected_sockets( 260 &vsk->remote_addr, &vsk->local_addr); 261 262 spin_lock_bh(&vsock_table_lock); 263 __vsock_insert_connected(list, vsk); 264 spin_unlock_bh(&vsock_table_lock); 265 } 266 EXPORT_SYMBOL_GPL(vsock_insert_connected); 267 268 void vsock_remove_bound(struct vsock_sock *vsk) 269 { 270 spin_lock_bh(&vsock_table_lock); 271 if (__vsock_in_bound_table(vsk)) 272 __vsock_remove_bound(vsk); 273 spin_unlock_bh(&vsock_table_lock); 274 } 275 EXPORT_SYMBOL_GPL(vsock_remove_bound); 276 277 void vsock_remove_connected(struct vsock_sock *vsk) 278 { 279 spin_lock_bh(&vsock_table_lock); 280 if (__vsock_in_connected_table(vsk)) 281 __vsock_remove_connected(vsk); 282 spin_unlock_bh(&vsock_table_lock); 283 } 284 EXPORT_SYMBOL_GPL(vsock_remove_connected); 285 286 struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) 287 { 288 struct sock *sk; 289 290 spin_lock_bh(&vsock_table_lock); 291 sk = __vsock_find_bound_socket(addr); 292 if (sk) 293 sock_hold(sk); 294 295 spin_unlock_bh(&vsock_table_lock); 296 297 return sk; 298 } 299 EXPORT_SYMBOL_GPL(vsock_find_bound_socket); 300 301 struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, 302 struct sockaddr_vm *dst) 303 { 304 struct sock *sk; 305 306 spin_lock_bh(&vsock_table_lock); 307 sk = __vsock_find_connected_socket(src, dst); 308 if (sk) 309 sock_hold(sk); 310 311 spin_unlock_bh(&vsock_table_lock); 312 313 return sk; 314 } 315 EXPORT_SYMBOL_GPL(vsock_find_connected_socket); 316 317 void vsock_remove_sock(struct vsock_sock *vsk) 318 { 319 vsock_remove_bound(vsk); 320 vsock_remove_connected(vsk); 321 } 322 EXPORT_SYMBOL_GPL(vsock_remove_sock); 323 324 void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)) 325 { 326 int i; 327 328 spin_lock_bh(&vsock_table_lock); 329 330 for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) { 331 struct vsock_sock *vsk; 332 list_for_each_entry(vsk, &vsock_connected_table[i], 333 connected_table) 334 fn(sk_vsock(vsk)); 335 } 336 337 spin_unlock_bh(&vsock_table_lock); 338 } 339 EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket); 340 341 void vsock_add_pending(struct sock *listener, struct sock *pending) 342 { 343 struct vsock_sock *vlistener; 344 struct vsock_sock *vpending; 345 346 vlistener = vsock_sk(listener); 347 vpending = vsock_sk(pending); 348 349 sock_hold(pending); 350 sock_hold(listener); 351 list_add_tail(&vpending->pending_links, &vlistener->pending_links); 352 } 353 EXPORT_SYMBOL_GPL(vsock_add_pending); 354 355 void vsock_remove_pending(struct sock *listener, struct sock *pending) 356 { 357 struct vsock_sock *vpending = vsock_sk(pending); 358 359 list_del_init(&vpending->pending_links); 360 sock_put(listener); 361 sock_put(pending); 362 } 363 EXPORT_SYMBOL_GPL(vsock_remove_pending); 364 365 void vsock_enqueue_accept(struct sock *listener, struct sock *connected) 366 { 367 struct vsock_sock *vlistener; 368 struct vsock_sock *vconnected; 369 370 vlistener = vsock_sk(listener); 371 vconnected = vsock_sk(connected); 372 373 sock_hold(connected); 374 sock_hold(listener); 375 list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue); 376 } 377 EXPORT_SYMBOL_GPL(vsock_enqueue_accept); 378 379 static struct sock *vsock_dequeue_accept(struct sock *listener) 380 { 381 struct vsock_sock *vlistener; 382 struct vsock_sock *vconnected; 383 384 vlistener = vsock_sk(listener); 385 386 if (list_empty(&vlistener->accept_queue)) 387 return NULL; 388 389 vconnected = list_entry(vlistener->accept_queue.next, 390 struct vsock_sock, accept_queue); 391 392 list_del_init(&vconnected->accept_queue); 393 sock_put(listener); 394 /* The caller will need a reference on the connected socket so we let 395 * it call sock_put(). 396 */ 397 398 return sk_vsock(vconnected); 399 } 400 401 static bool vsock_is_accept_queue_empty(struct sock *sk) 402 { 403 struct vsock_sock *vsk = vsock_sk(sk); 404 return list_empty(&vsk->accept_queue); 405 } 406 407 static bool vsock_is_pending(struct sock *sk) 408 { 409 struct vsock_sock *vsk = vsock_sk(sk); 410 return !list_empty(&vsk->pending_links); 411 } 412 413 static int vsock_send_shutdown(struct sock *sk, int mode) 414 { 415 struct vsock_sock *vsk = vsock_sk(sk); 416 417 return vsk->transport->shutdown(vsk, mode); 418 } 419 420 static void vsock_pending_work(struct work_struct *work) 421 { 422 struct sock *sk; 423 struct sock *listener; 424 struct vsock_sock *vsk; 425 bool cleanup; 426 427 vsk = container_of(work, struct vsock_sock, pending_work.work); 428 sk = sk_vsock(vsk); 429 listener = vsk->listener; 430 cleanup = true; 431 432 lock_sock(listener); 433 lock_sock_nested(sk, SINGLE_DEPTH_NESTING); 434 435 if (vsock_is_pending(sk)) { 436 vsock_remove_pending(listener, sk); 437 438 sk_acceptq_removed(listener); 439 } else if (!vsk->rejected) { 440 /* We are not on the pending list and accept() did not reject 441 * us, so we must have been accepted by our user process. We 442 * just need to drop our references to the sockets and be on 443 * our way. 444 */ 445 cleanup = false; 446 goto out; 447 } 448 449 /* We need to remove ourself from the global connected sockets list so 450 * incoming packets can't find this socket, and to reduce the reference 451 * count. 452 */ 453 vsock_remove_connected(vsk); 454 455 sk->sk_state = TCP_CLOSE; 456 457 out: 458 release_sock(sk); 459 release_sock(listener); 460 if (cleanup) 461 sock_put(sk); 462 463 sock_put(sk); 464 sock_put(listener); 465 } 466 467 /**** SOCKET OPERATIONS ****/ 468 469 static int __vsock_bind_stream(struct vsock_sock *vsk, 470 struct sockaddr_vm *addr) 471 { 472 static u32 port; 473 struct sockaddr_vm new_addr; 474 475 if (!port) 476 port = LAST_RESERVED_PORT + 1 + 477 prandom_u32_max(U32_MAX - LAST_RESERVED_PORT); 478 479 vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port); 480 481 if (addr->svm_port == VMADDR_PORT_ANY) { 482 bool found = false; 483 unsigned int i; 484 485 for (i = 0; i < MAX_PORT_RETRIES; i++) { 486 if (port <= LAST_RESERVED_PORT) 487 port = LAST_RESERVED_PORT + 1; 488 489 new_addr.svm_port = port++; 490 491 if (!__vsock_find_bound_socket(&new_addr)) { 492 found = true; 493 break; 494 } 495 } 496 497 if (!found) 498 return -EADDRNOTAVAIL; 499 } else { 500 /* If port is in reserved range, ensure caller 501 * has necessary privileges. 502 */ 503 if (addr->svm_port <= LAST_RESERVED_PORT && 504 !capable(CAP_NET_BIND_SERVICE)) { 505 return -EACCES; 506 } 507 508 if (__vsock_find_bound_socket(&new_addr)) 509 return -EADDRINUSE; 510 } 511 512 vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port); 513 514 /* Remove stream sockets from the unbound list and add them to the hash 515 * table for easy lookup by its address. The unbound list is simply an 516 * extra entry at the end of the hash table, a trick used by AF_UNIX. 517 */ 518 __vsock_remove_bound(vsk); 519 __vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk); 520 521 return 0; 522 } 523 524 static int __vsock_bind_dgram(struct vsock_sock *vsk, 525 struct sockaddr_vm *addr) 526 { 527 return vsk->transport->dgram_bind(vsk, addr); 528 } 529 530 static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) 531 { 532 struct vsock_sock *vsk = vsock_sk(sk); 533 u32 cid; 534 int retval; 535 536 /* First ensure this socket isn't already bound. */ 537 if (vsock_addr_bound(&vsk->local_addr)) 538 return -EINVAL; 539 540 /* Now bind to the provided address or select appropriate values if 541 * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that 542 * like AF_INET prevents binding to a non-local IP address (in most 543 * cases), we only allow binding to the local CID. 544 */ 545 cid = vsk->transport->get_local_cid(); 546 if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY) 547 return -EADDRNOTAVAIL; 548 549 switch (sk->sk_socket->type) { 550 case SOCK_STREAM: 551 spin_lock_bh(&vsock_table_lock); 552 retval = __vsock_bind_stream(vsk, addr); 553 spin_unlock_bh(&vsock_table_lock); 554 break; 555 556 case SOCK_DGRAM: 557 retval = __vsock_bind_dgram(vsk, addr); 558 break; 559 560 default: 561 retval = -EINVAL; 562 break; 563 } 564 565 return retval; 566 } 567 568 static void vsock_connect_timeout(struct work_struct *work); 569 570 static struct sock *__vsock_create(struct net *net, 571 struct socket *sock, 572 struct sock *parent, 573 gfp_t priority, 574 unsigned short type, 575 int kern) 576 { 577 struct sock *sk; 578 struct vsock_sock *psk; 579 struct vsock_sock *vsk; 580 581 sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern); 582 if (!sk) 583 return NULL; 584 585 sock_init_data(sock, sk); 586 587 /* sk->sk_type is normally set in sock_init_data, but only if sock is 588 * non-NULL. We make sure that our sockets always have a type by 589 * setting it here if needed. 590 */ 591 if (!sock) 592 sk->sk_type = type; 593 594 vsk = vsock_sk(sk); 595 vsk->transport = transport_single; 596 vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 597 vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 598 599 sk->sk_destruct = vsock_sk_destruct; 600 sk->sk_backlog_rcv = vsock_queue_rcv_skb; 601 sock_reset_flag(sk, SOCK_DONE); 602 603 INIT_LIST_HEAD(&vsk->bound_table); 604 INIT_LIST_HEAD(&vsk->connected_table); 605 vsk->listener = NULL; 606 INIT_LIST_HEAD(&vsk->pending_links); 607 INIT_LIST_HEAD(&vsk->accept_queue); 608 vsk->rejected = false; 609 vsk->sent_request = false; 610 vsk->ignore_connecting_rst = false; 611 vsk->peer_shutdown = 0; 612 INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout); 613 INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work); 614 615 psk = parent ? vsock_sk(parent) : NULL; 616 if (parent) { 617 vsk->trusted = psk->trusted; 618 vsk->owner = get_cred(psk->owner); 619 vsk->connect_timeout = psk->connect_timeout; 620 vsk->buffer_size = psk->buffer_size; 621 vsk->buffer_min_size = psk->buffer_min_size; 622 vsk->buffer_max_size = psk->buffer_max_size; 623 } else { 624 vsk->trusted = capable(CAP_NET_ADMIN); 625 vsk->owner = get_current_cred(); 626 vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; 627 vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE; 628 vsk->buffer_min_size = VSOCK_DEFAULT_BUFFER_MIN_SIZE; 629 vsk->buffer_max_size = VSOCK_DEFAULT_BUFFER_MAX_SIZE; 630 } 631 632 if (vsk->transport->init(vsk, psk) < 0) { 633 sk_free(sk); 634 return NULL; 635 } 636 637 if (sock) 638 vsock_insert_unbound(vsk); 639 640 return sk; 641 } 642 643 static void __vsock_release(struct sock *sk, int level) 644 { 645 if (sk) { 646 struct sock *pending; 647 struct vsock_sock *vsk; 648 649 vsk = vsock_sk(sk); 650 pending = NULL; /* Compiler warning. */ 651 652 /* The release call is supposed to use lock_sock_nested() 653 * rather than lock_sock(), if a sock lock should be acquired. 654 */ 655 vsk->transport->release(vsk); 656 657 /* When "level" is SINGLE_DEPTH_NESTING, use the nested 658 * version to avoid the warning "possible recursive locking 659 * detected". When "level" is 0, lock_sock_nested(sk, level) 660 * is the same as lock_sock(sk). 661 */ 662 lock_sock_nested(sk, level); 663 sock_orphan(sk); 664 sk->sk_shutdown = SHUTDOWN_MASK; 665 666 skb_queue_purge(&sk->sk_receive_queue); 667 668 /* Clean up any sockets that never were accepted. */ 669 while ((pending = vsock_dequeue_accept(sk)) != NULL) { 670 __vsock_release(pending, SINGLE_DEPTH_NESTING); 671 sock_put(pending); 672 } 673 674 release_sock(sk); 675 sock_put(sk); 676 } 677 } 678 679 static void vsock_sk_destruct(struct sock *sk) 680 { 681 struct vsock_sock *vsk = vsock_sk(sk); 682 683 vsk->transport->destruct(vsk); 684 685 /* When clearing these addresses, there's no need to set the family and 686 * possibly register the address family with the kernel. 687 */ 688 vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 689 vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 690 691 put_cred(vsk->owner); 692 } 693 694 static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 695 { 696 int err; 697 698 err = sock_queue_rcv_skb(sk, skb); 699 if (err) 700 kfree_skb(skb); 701 702 return err; 703 } 704 705 struct sock *vsock_create_connected(struct sock *parent) 706 { 707 return __vsock_create(sock_net(parent), NULL, parent, GFP_KERNEL, 708 parent->sk_type, 0); 709 } 710 EXPORT_SYMBOL_GPL(vsock_create_connected); 711 712 s64 vsock_stream_has_data(struct vsock_sock *vsk) 713 { 714 return vsk->transport->stream_has_data(vsk); 715 } 716 EXPORT_SYMBOL_GPL(vsock_stream_has_data); 717 718 s64 vsock_stream_has_space(struct vsock_sock *vsk) 719 { 720 return vsk->transport->stream_has_space(vsk); 721 } 722 EXPORT_SYMBOL_GPL(vsock_stream_has_space); 723 724 static int vsock_release(struct socket *sock) 725 { 726 __vsock_release(sock->sk, 0); 727 sock->sk = NULL; 728 sock->state = SS_FREE; 729 730 return 0; 731 } 732 733 static int 734 vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) 735 { 736 int err; 737 struct sock *sk; 738 struct sockaddr_vm *vm_addr; 739 740 sk = sock->sk; 741 742 if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0) 743 return -EINVAL; 744 745 lock_sock(sk); 746 err = __vsock_bind(sk, vm_addr); 747 release_sock(sk); 748 749 return err; 750 } 751 752 static int vsock_getname(struct socket *sock, 753 struct sockaddr *addr, int peer) 754 { 755 int err; 756 struct sock *sk; 757 struct vsock_sock *vsk; 758 struct sockaddr_vm *vm_addr; 759 760 sk = sock->sk; 761 vsk = vsock_sk(sk); 762 err = 0; 763 764 lock_sock(sk); 765 766 if (peer) { 767 if (sock->state != SS_CONNECTED) { 768 err = -ENOTCONN; 769 goto out; 770 } 771 vm_addr = &vsk->remote_addr; 772 } else { 773 vm_addr = &vsk->local_addr; 774 } 775 776 if (!vm_addr) { 777 err = -EINVAL; 778 goto out; 779 } 780 781 /* sys_getsockname() and sys_getpeername() pass us a 782 * MAX_SOCK_ADDR-sized buffer and don't set addr_len. Unfortunately 783 * that macro is defined in socket.c instead of .h, so we hardcode its 784 * value here. 785 */ 786 BUILD_BUG_ON(sizeof(*vm_addr) > 128); 787 memcpy(addr, vm_addr, sizeof(*vm_addr)); 788 err = sizeof(*vm_addr); 789 790 out: 791 release_sock(sk); 792 return err; 793 } 794 795 static int vsock_shutdown(struct socket *sock, int mode) 796 { 797 int err; 798 struct sock *sk; 799 800 /* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses 801 * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode 802 * here like the other address families do. Note also that the 803 * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3), 804 * which is what we want. 805 */ 806 mode++; 807 808 if ((mode & ~SHUTDOWN_MASK) || !mode) 809 return -EINVAL; 810 811 /* If this is a STREAM socket and it is not connected then bail out 812 * immediately. If it is a DGRAM socket then we must first kick the 813 * socket so that it wakes up from any sleeping calls, for example 814 * recv(), and then afterwards return the error. 815 */ 816 817 sk = sock->sk; 818 if (sock->state == SS_UNCONNECTED) { 819 err = -ENOTCONN; 820 if (sk->sk_type == SOCK_STREAM) 821 return err; 822 } else { 823 sock->state = SS_DISCONNECTING; 824 err = 0; 825 } 826 827 /* Receive and send shutdowns are treated alike. */ 828 mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN); 829 if (mode) { 830 lock_sock(sk); 831 sk->sk_shutdown |= mode; 832 sk->sk_state_change(sk); 833 release_sock(sk); 834 835 if (sk->sk_type == SOCK_STREAM) { 836 sock_reset_flag(sk, SOCK_DONE); 837 vsock_send_shutdown(sk, mode); 838 } 839 } 840 841 return err; 842 } 843 844 static __poll_t vsock_poll(struct file *file, struct socket *sock, 845 poll_table *wait) 846 { 847 struct sock *sk; 848 __poll_t mask; 849 struct vsock_sock *vsk; 850 851 sk = sock->sk; 852 vsk = vsock_sk(sk); 853 854 poll_wait(file, sk_sleep(sk), wait); 855 mask = 0; 856 857 if (sk->sk_err) 858 /* Signify that there has been an error on this socket. */ 859 mask |= EPOLLERR; 860 861 /* INET sockets treat local write shutdown and peer write shutdown as a 862 * case of EPOLLHUP set. 863 */ 864 if ((sk->sk_shutdown == SHUTDOWN_MASK) || 865 ((sk->sk_shutdown & SEND_SHUTDOWN) && 866 (vsk->peer_shutdown & SEND_SHUTDOWN))) { 867 mask |= EPOLLHUP; 868 } 869 870 if (sk->sk_shutdown & RCV_SHUTDOWN || 871 vsk->peer_shutdown & SEND_SHUTDOWN) { 872 mask |= EPOLLRDHUP; 873 } 874 875 if (sock->type == SOCK_DGRAM) { 876 /* For datagram sockets we can read if there is something in 877 * the queue and write as long as the socket isn't shutdown for 878 * sending. 879 */ 880 if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || 881 (sk->sk_shutdown & RCV_SHUTDOWN)) { 882 mask |= EPOLLIN | EPOLLRDNORM; 883 } 884 885 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) 886 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 887 888 } else if (sock->type == SOCK_STREAM) { 889 const struct vsock_transport *transport = vsk->transport; 890 lock_sock(sk); 891 892 /* Listening sockets that have connections in their accept 893 * queue can be read. 894 */ 895 if (sk->sk_state == TCP_LISTEN 896 && !vsock_is_accept_queue_empty(sk)) 897 mask |= EPOLLIN | EPOLLRDNORM; 898 899 /* If there is something in the queue then we can read. */ 900 if (transport->stream_is_active(vsk) && 901 !(sk->sk_shutdown & RCV_SHUTDOWN)) { 902 bool data_ready_now = false; 903 int ret = transport->notify_poll_in( 904 vsk, 1, &data_ready_now); 905 if (ret < 0) { 906 mask |= EPOLLERR; 907 } else { 908 if (data_ready_now) 909 mask |= EPOLLIN | EPOLLRDNORM; 910 911 } 912 } 913 914 /* Sockets whose connections have been closed, reset, or 915 * terminated should also be considered read, and we check the 916 * shutdown flag for that. 917 */ 918 if (sk->sk_shutdown & RCV_SHUTDOWN || 919 vsk->peer_shutdown & SEND_SHUTDOWN) { 920 mask |= EPOLLIN | EPOLLRDNORM; 921 } 922 923 /* Connected sockets that can produce data can be written. */ 924 if (sk->sk_state == TCP_ESTABLISHED) { 925 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { 926 bool space_avail_now = false; 927 int ret = transport->notify_poll_out( 928 vsk, 1, &space_avail_now); 929 if (ret < 0) { 930 mask |= EPOLLERR; 931 } else { 932 if (space_avail_now) 933 /* Remove EPOLLWRBAND since INET 934 * sockets are not setting it. 935 */ 936 mask |= EPOLLOUT | EPOLLWRNORM; 937 938 } 939 } 940 } 941 942 /* Simulate INET socket poll behaviors, which sets 943 * EPOLLOUT|EPOLLWRNORM when peer is closed and nothing to read, 944 * but local send is not shutdown. 945 */ 946 if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) { 947 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) 948 mask |= EPOLLOUT | EPOLLWRNORM; 949 950 } 951 952 release_sock(sk); 953 } 954 955 return mask; 956 } 957 958 static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, 959 size_t len) 960 { 961 int err; 962 struct sock *sk; 963 struct vsock_sock *vsk; 964 struct sockaddr_vm *remote_addr; 965 const struct vsock_transport *transport; 966 967 if (msg->msg_flags & MSG_OOB) 968 return -EOPNOTSUPP; 969 970 /* For now, MSG_DONTWAIT is always assumed... */ 971 err = 0; 972 sk = sock->sk; 973 vsk = vsock_sk(sk); 974 transport = vsk->transport; 975 976 lock_sock(sk); 977 978 err = vsock_auto_bind(vsk); 979 if (err) 980 goto out; 981 982 983 /* If the provided message contains an address, use that. Otherwise 984 * fall back on the socket's remote handle (if it has been connected). 985 */ 986 if (msg->msg_name && 987 vsock_addr_cast(msg->msg_name, msg->msg_namelen, 988 &remote_addr) == 0) { 989 /* Ensure this address is of the right type and is a valid 990 * destination. 991 */ 992 993 if (remote_addr->svm_cid == VMADDR_CID_ANY) 994 remote_addr->svm_cid = transport->get_local_cid(); 995 996 if (!vsock_addr_bound(remote_addr)) { 997 err = -EINVAL; 998 goto out; 999 } 1000 } else if (sock->state == SS_CONNECTED) { 1001 remote_addr = &vsk->remote_addr; 1002 1003 if (remote_addr->svm_cid == VMADDR_CID_ANY) 1004 remote_addr->svm_cid = transport->get_local_cid(); 1005 1006 /* XXX Should connect() or this function ensure remote_addr is 1007 * bound? 1008 */ 1009 if (!vsock_addr_bound(&vsk->remote_addr)) { 1010 err = -EINVAL; 1011 goto out; 1012 } 1013 } else { 1014 err = -EINVAL; 1015 goto out; 1016 } 1017 1018 if (!transport->dgram_allow(remote_addr->svm_cid, 1019 remote_addr->svm_port)) { 1020 err = -EINVAL; 1021 goto out; 1022 } 1023 1024 err = transport->dgram_enqueue(vsk, remote_addr, msg, len); 1025 1026 out: 1027 release_sock(sk); 1028 return err; 1029 } 1030 1031 static int vsock_dgram_connect(struct socket *sock, 1032 struct sockaddr *addr, int addr_len, int flags) 1033 { 1034 int err; 1035 struct sock *sk; 1036 struct vsock_sock *vsk; 1037 struct sockaddr_vm *remote_addr; 1038 1039 sk = sock->sk; 1040 vsk = vsock_sk(sk); 1041 1042 err = vsock_addr_cast(addr, addr_len, &remote_addr); 1043 if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) { 1044 lock_sock(sk); 1045 vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, 1046 VMADDR_PORT_ANY); 1047 sock->state = SS_UNCONNECTED; 1048 release_sock(sk); 1049 return 0; 1050 } else if (err != 0) 1051 return -EINVAL; 1052 1053 lock_sock(sk); 1054 1055 err = vsock_auto_bind(vsk); 1056 if (err) 1057 goto out; 1058 1059 if (!vsk->transport->dgram_allow(remote_addr->svm_cid, 1060 remote_addr->svm_port)) { 1061 err = -EINVAL; 1062 goto out; 1063 } 1064 1065 memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr)); 1066 sock->state = SS_CONNECTED; 1067 1068 out: 1069 release_sock(sk); 1070 return err; 1071 } 1072 1073 static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, 1074 size_t len, int flags) 1075 { 1076 struct vsock_sock *vsk = vsock_sk(sock->sk); 1077 1078 return vsk->transport->dgram_dequeue(vsk, msg, len, flags); 1079 } 1080 1081 static const struct proto_ops vsock_dgram_ops = { 1082 .family = PF_VSOCK, 1083 .owner = THIS_MODULE, 1084 .release = vsock_release, 1085 .bind = vsock_bind, 1086 .connect = vsock_dgram_connect, 1087 .socketpair = sock_no_socketpair, 1088 .accept = sock_no_accept, 1089 .getname = vsock_getname, 1090 .poll = vsock_poll, 1091 .ioctl = sock_no_ioctl, 1092 .listen = sock_no_listen, 1093 .shutdown = vsock_shutdown, 1094 .setsockopt = sock_no_setsockopt, 1095 .getsockopt = sock_no_getsockopt, 1096 .sendmsg = vsock_dgram_sendmsg, 1097 .recvmsg = vsock_dgram_recvmsg, 1098 .mmap = sock_no_mmap, 1099 .sendpage = sock_no_sendpage, 1100 }; 1101 1102 static int vsock_transport_cancel_pkt(struct vsock_sock *vsk) 1103 { 1104 const struct vsock_transport *transport = vsk->transport; 1105 1106 if (!transport->cancel_pkt) 1107 return -EOPNOTSUPP; 1108 1109 return transport->cancel_pkt(vsk); 1110 } 1111 1112 static void vsock_connect_timeout(struct work_struct *work) 1113 { 1114 struct sock *sk; 1115 struct vsock_sock *vsk; 1116 int cancel = 0; 1117 1118 vsk = container_of(work, struct vsock_sock, connect_work.work); 1119 sk = sk_vsock(vsk); 1120 1121 lock_sock(sk); 1122 if (sk->sk_state == TCP_SYN_SENT && 1123 (sk->sk_shutdown != SHUTDOWN_MASK)) { 1124 sk->sk_state = TCP_CLOSE; 1125 sk->sk_err = ETIMEDOUT; 1126 sk->sk_error_report(sk); 1127 cancel = 1; 1128 } 1129 release_sock(sk); 1130 if (cancel) 1131 vsock_transport_cancel_pkt(vsk); 1132 1133 sock_put(sk); 1134 } 1135 1136 static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, 1137 int addr_len, int flags) 1138 { 1139 int err; 1140 struct sock *sk; 1141 struct vsock_sock *vsk; 1142 const struct vsock_transport *transport; 1143 struct sockaddr_vm *remote_addr; 1144 long timeout; 1145 DEFINE_WAIT(wait); 1146 1147 err = 0; 1148 sk = sock->sk; 1149 vsk = vsock_sk(sk); 1150 transport = vsk->transport; 1151 1152 lock_sock(sk); 1153 1154 /* XXX AF_UNSPEC should make us disconnect like AF_INET. */ 1155 switch (sock->state) { 1156 case SS_CONNECTED: 1157 err = -EISCONN; 1158 goto out; 1159 case SS_DISCONNECTING: 1160 err = -EINVAL; 1161 goto out; 1162 case SS_CONNECTING: 1163 /* This continues on so we can move sock into the SS_CONNECTED 1164 * state once the connection has completed (at which point err 1165 * will be set to zero also). Otherwise, we will either wait 1166 * for the connection or return -EALREADY should this be a 1167 * non-blocking call. 1168 */ 1169 err = -EALREADY; 1170 break; 1171 default: 1172 if ((sk->sk_state == TCP_LISTEN) || 1173 vsock_addr_cast(addr, addr_len, &remote_addr) != 0) { 1174 err = -EINVAL; 1175 goto out; 1176 } 1177 1178 /* The hypervisor and well-known contexts do not have socket 1179 * endpoints. 1180 */ 1181 if (!transport->stream_allow(remote_addr->svm_cid, 1182 remote_addr->svm_port)) { 1183 err = -ENETUNREACH; 1184 goto out; 1185 } 1186 1187 /* Set the remote address that we are connecting to. */ 1188 memcpy(&vsk->remote_addr, remote_addr, 1189 sizeof(vsk->remote_addr)); 1190 1191 err = vsock_auto_bind(vsk); 1192 if (err) 1193 goto out; 1194 1195 sk->sk_state = TCP_SYN_SENT; 1196 1197 err = transport->connect(vsk); 1198 if (err < 0) 1199 goto out; 1200 1201 /* Mark sock as connecting and set the error code to in 1202 * progress in case this is a non-blocking connect. 1203 */ 1204 sock->state = SS_CONNECTING; 1205 err = -EINPROGRESS; 1206 } 1207 1208 /* The receive path will handle all communication until we are able to 1209 * enter the connected state. Here we wait for the connection to be 1210 * completed or a notification of an error. 1211 */ 1212 timeout = vsk->connect_timeout; 1213 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 1214 1215 while (sk->sk_state != TCP_ESTABLISHED && sk->sk_err == 0) { 1216 if (flags & O_NONBLOCK) { 1217 /* If we're not going to block, we schedule a timeout 1218 * function to generate a timeout on the connection 1219 * attempt, in case the peer doesn't respond in a 1220 * timely manner. We hold on to the socket until the 1221 * timeout fires. 1222 */ 1223 sock_hold(sk); 1224 schedule_delayed_work(&vsk->connect_work, timeout); 1225 1226 /* Skip ahead to preserve error code set above. */ 1227 goto out_wait; 1228 } 1229 1230 release_sock(sk); 1231 timeout = schedule_timeout(timeout); 1232 lock_sock(sk); 1233 1234 if (signal_pending(current)) { 1235 err = sock_intr_errno(timeout); 1236 sk->sk_state = TCP_CLOSE; 1237 sock->state = SS_UNCONNECTED; 1238 vsock_transport_cancel_pkt(vsk); 1239 goto out_wait; 1240 } else if (timeout == 0) { 1241 err = -ETIMEDOUT; 1242 sk->sk_state = TCP_CLOSE; 1243 sock->state = SS_UNCONNECTED; 1244 vsock_transport_cancel_pkt(vsk); 1245 goto out_wait; 1246 } 1247 1248 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 1249 } 1250 1251 if (sk->sk_err) { 1252 err = -sk->sk_err; 1253 sk->sk_state = TCP_CLOSE; 1254 sock->state = SS_UNCONNECTED; 1255 } else { 1256 err = 0; 1257 } 1258 1259 out_wait: 1260 finish_wait(sk_sleep(sk), &wait); 1261 out: 1262 release_sock(sk); 1263 return err; 1264 } 1265 1266 static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, 1267 bool kern) 1268 { 1269 struct sock *listener; 1270 int err; 1271 struct sock *connected; 1272 struct vsock_sock *vconnected; 1273 long timeout; 1274 DEFINE_WAIT(wait); 1275 1276 err = 0; 1277 listener = sock->sk; 1278 1279 lock_sock(listener); 1280 1281 if (sock->type != SOCK_STREAM) { 1282 err = -EOPNOTSUPP; 1283 goto out; 1284 } 1285 1286 if (listener->sk_state != TCP_LISTEN) { 1287 err = -EINVAL; 1288 goto out; 1289 } 1290 1291 /* Wait for children sockets to appear; these are the new sockets 1292 * created upon connection establishment. 1293 */ 1294 timeout = sock_sndtimeo(listener, flags & O_NONBLOCK); 1295 prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); 1296 1297 while ((connected = vsock_dequeue_accept(listener)) == NULL && 1298 listener->sk_err == 0) { 1299 release_sock(listener); 1300 timeout = schedule_timeout(timeout); 1301 finish_wait(sk_sleep(listener), &wait); 1302 lock_sock(listener); 1303 1304 if (signal_pending(current)) { 1305 err = sock_intr_errno(timeout); 1306 goto out; 1307 } else if (timeout == 0) { 1308 err = -EAGAIN; 1309 goto out; 1310 } 1311 1312 prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); 1313 } 1314 finish_wait(sk_sleep(listener), &wait); 1315 1316 if (listener->sk_err) 1317 err = -listener->sk_err; 1318 1319 if (connected) { 1320 sk_acceptq_removed(listener); 1321 1322 lock_sock_nested(connected, SINGLE_DEPTH_NESTING); 1323 vconnected = vsock_sk(connected); 1324 1325 /* If the listener socket has received an error, then we should 1326 * reject this socket and return. Note that we simply mark the 1327 * socket rejected, drop our reference, and let the cleanup 1328 * function handle the cleanup; the fact that we found it in 1329 * the listener's accept queue guarantees that the cleanup 1330 * function hasn't run yet. 1331 */ 1332 if (err) { 1333 vconnected->rejected = true; 1334 } else { 1335 newsock->state = SS_CONNECTED; 1336 sock_graft(connected, newsock); 1337 } 1338 1339 release_sock(connected); 1340 sock_put(connected); 1341 } 1342 1343 out: 1344 release_sock(listener); 1345 return err; 1346 } 1347 1348 static int vsock_listen(struct socket *sock, int backlog) 1349 { 1350 int err; 1351 struct sock *sk; 1352 struct vsock_sock *vsk; 1353 1354 sk = sock->sk; 1355 1356 lock_sock(sk); 1357 1358 if (sock->type != SOCK_STREAM) { 1359 err = -EOPNOTSUPP; 1360 goto out; 1361 } 1362 1363 if (sock->state != SS_UNCONNECTED) { 1364 err = -EINVAL; 1365 goto out; 1366 } 1367 1368 vsk = vsock_sk(sk); 1369 1370 if (!vsock_addr_bound(&vsk->local_addr)) { 1371 err = -EINVAL; 1372 goto out; 1373 } 1374 1375 sk->sk_max_ack_backlog = backlog; 1376 sk->sk_state = TCP_LISTEN; 1377 1378 err = 0; 1379 1380 out: 1381 release_sock(sk); 1382 return err; 1383 } 1384 1385 static void vsock_update_buffer_size(struct vsock_sock *vsk, 1386 const struct vsock_transport *transport, 1387 u64 val) 1388 { 1389 if (val > vsk->buffer_max_size) 1390 val = vsk->buffer_max_size; 1391 1392 if (val < vsk->buffer_min_size) 1393 val = vsk->buffer_min_size; 1394 1395 if (val != vsk->buffer_size && 1396 transport && transport->notify_buffer_size) 1397 transport->notify_buffer_size(vsk, &val); 1398 1399 vsk->buffer_size = val; 1400 } 1401 1402 static int vsock_stream_setsockopt(struct socket *sock, 1403 int level, 1404 int optname, 1405 char __user *optval, 1406 unsigned int optlen) 1407 { 1408 int err; 1409 struct sock *sk; 1410 struct vsock_sock *vsk; 1411 const struct vsock_transport *transport; 1412 u64 val; 1413 1414 if (level != AF_VSOCK) 1415 return -ENOPROTOOPT; 1416 1417 #define COPY_IN(_v) \ 1418 do { \ 1419 if (optlen < sizeof(_v)) { \ 1420 err = -EINVAL; \ 1421 goto exit; \ 1422 } \ 1423 if (copy_from_user(&_v, optval, sizeof(_v)) != 0) { \ 1424 err = -EFAULT; \ 1425 goto exit; \ 1426 } \ 1427 } while (0) 1428 1429 err = 0; 1430 sk = sock->sk; 1431 vsk = vsock_sk(sk); 1432 transport = vsk->transport; 1433 1434 lock_sock(sk); 1435 1436 switch (optname) { 1437 case SO_VM_SOCKETS_BUFFER_SIZE: 1438 COPY_IN(val); 1439 vsock_update_buffer_size(vsk, transport, val); 1440 break; 1441 1442 case SO_VM_SOCKETS_BUFFER_MAX_SIZE: 1443 COPY_IN(val); 1444 vsk->buffer_max_size = val; 1445 vsock_update_buffer_size(vsk, transport, vsk->buffer_size); 1446 break; 1447 1448 case SO_VM_SOCKETS_BUFFER_MIN_SIZE: 1449 COPY_IN(val); 1450 vsk->buffer_min_size = val; 1451 vsock_update_buffer_size(vsk, transport, vsk->buffer_size); 1452 break; 1453 1454 case SO_VM_SOCKETS_CONNECT_TIMEOUT: { 1455 struct __kernel_old_timeval tv; 1456 COPY_IN(tv); 1457 if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC && 1458 tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) { 1459 vsk->connect_timeout = tv.tv_sec * HZ + 1460 DIV_ROUND_UP(tv.tv_usec, (1000000 / HZ)); 1461 if (vsk->connect_timeout == 0) 1462 vsk->connect_timeout = 1463 VSOCK_DEFAULT_CONNECT_TIMEOUT; 1464 1465 } else { 1466 err = -ERANGE; 1467 } 1468 break; 1469 } 1470 1471 default: 1472 err = -ENOPROTOOPT; 1473 break; 1474 } 1475 1476 #undef COPY_IN 1477 1478 exit: 1479 release_sock(sk); 1480 return err; 1481 } 1482 1483 static int vsock_stream_getsockopt(struct socket *sock, 1484 int level, int optname, 1485 char __user *optval, 1486 int __user *optlen) 1487 { 1488 int err; 1489 int len; 1490 struct sock *sk; 1491 struct vsock_sock *vsk; 1492 u64 val; 1493 1494 if (level != AF_VSOCK) 1495 return -ENOPROTOOPT; 1496 1497 err = get_user(len, optlen); 1498 if (err != 0) 1499 return err; 1500 1501 #define COPY_OUT(_v) \ 1502 do { \ 1503 if (len < sizeof(_v)) \ 1504 return -EINVAL; \ 1505 \ 1506 len = sizeof(_v); \ 1507 if (copy_to_user(optval, &_v, len) != 0) \ 1508 return -EFAULT; \ 1509 \ 1510 } while (0) 1511 1512 err = 0; 1513 sk = sock->sk; 1514 vsk = vsock_sk(sk); 1515 1516 switch (optname) { 1517 case SO_VM_SOCKETS_BUFFER_SIZE: 1518 val = vsk->buffer_size; 1519 COPY_OUT(val); 1520 break; 1521 1522 case SO_VM_SOCKETS_BUFFER_MAX_SIZE: 1523 val = vsk->buffer_max_size; 1524 COPY_OUT(val); 1525 break; 1526 1527 case SO_VM_SOCKETS_BUFFER_MIN_SIZE: 1528 val = vsk->buffer_min_size; 1529 COPY_OUT(val); 1530 break; 1531 1532 case SO_VM_SOCKETS_CONNECT_TIMEOUT: { 1533 struct __kernel_old_timeval tv; 1534 tv.tv_sec = vsk->connect_timeout / HZ; 1535 tv.tv_usec = 1536 (vsk->connect_timeout - 1537 tv.tv_sec * HZ) * (1000000 / HZ); 1538 COPY_OUT(tv); 1539 break; 1540 } 1541 default: 1542 return -ENOPROTOOPT; 1543 } 1544 1545 err = put_user(len, optlen); 1546 if (err != 0) 1547 return -EFAULT; 1548 1549 #undef COPY_OUT 1550 1551 return 0; 1552 } 1553 1554 static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, 1555 size_t len) 1556 { 1557 struct sock *sk; 1558 struct vsock_sock *vsk; 1559 const struct vsock_transport *transport; 1560 ssize_t total_written; 1561 long timeout; 1562 int err; 1563 struct vsock_transport_send_notify_data send_data; 1564 DEFINE_WAIT_FUNC(wait, woken_wake_function); 1565 1566 sk = sock->sk; 1567 vsk = vsock_sk(sk); 1568 transport = vsk->transport; 1569 total_written = 0; 1570 err = 0; 1571 1572 if (msg->msg_flags & MSG_OOB) 1573 return -EOPNOTSUPP; 1574 1575 lock_sock(sk); 1576 1577 /* Callers should not provide a destination with stream sockets. */ 1578 if (msg->msg_namelen) { 1579 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; 1580 goto out; 1581 } 1582 1583 /* Send data only if both sides are not shutdown in the direction. */ 1584 if (sk->sk_shutdown & SEND_SHUTDOWN || 1585 vsk->peer_shutdown & RCV_SHUTDOWN) { 1586 err = -EPIPE; 1587 goto out; 1588 } 1589 1590 if (sk->sk_state != TCP_ESTABLISHED || 1591 !vsock_addr_bound(&vsk->local_addr)) { 1592 err = -ENOTCONN; 1593 goto out; 1594 } 1595 1596 if (!vsock_addr_bound(&vsk->remote_addr)) { 1597 err = -EDESTADDRREQ; 1598 goto out; 1599 } 1600 1601 /* Wait for room in the produce queue to enqueue our user's data. */ 1602 timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 1603 1604 err = transport->notify_send_init(vsk, &send_data); 1605 if (err < 0) 1606 goto out; 1607 1608 while (total_written < len) { 1609 ssize_t written; 1610 1611 add_wait_queue(sk_sleep(sk), &wait); 1612 while (vsock_stream_has_space(vsk) == 0 && 1613 sk->sk_err == 0 && 1614 !(sk->sk_shutdown & SEND_SHUTDOWN) && 1615 !(vsk->peer_shutdown & RCV_SHUTDOWN)) { 1616 1617 /* Don't wait for non-blocking sockets. */ 1618 if (timeout == 0) { 1619 err = -EAGAIN; 1620 remove_wait_queue(sk_sleep(sk), &wait); 1621 goto out_err; 1622 } 1623 1624 err = transport->notify_send_pre_block(vsk, &send_data); 1625 if (err < 0) { 1626 remove_wait_queue(sk_sleep(sk), &wait); 1627 goto out_err; 1628 } 1629 1630 release_sock(sk); 1631 timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout); 1632 lock_sock(sk); 1633 if (signal_pending(current)) { 1634 err = sock_intr_errno(timeout); 1635 remove_wait_queue(sk_sleep(sk), &wait); 1636 goto out_err; 1637 } else if (timeout == 0) { 1638 err = -EAGAIN; 1639 remove_wait_queue(sk_sleep(sk), &wait); 1640 goto out_err; 1641 } 1642 } 1643 remove_wait_queue(sk_sleep(sk), &wait); 1644 1645 /* These checks occur both as part of and after the loop 1646 * conditional since we need to check before and after 1647 * sleeping. 1648 */ 1649 if (sk->sk_err) { 1650 err = -sk->sk_err; 1651 goto out_err; 1652 } else if ((sk->sk_shutdown & SEND_SHUTDOWN) || 1653 (vsk->peer_shutdown & RCV_SHUTDOWN)) { 1654 err = -EPIPE; 1655 goto out_err; 1656 } 1657 1658 err = transport->notify_send_pre_enqueue(vsk, &send_data); 1659 if (err < 0) 1660 goto out_err; 1661 1662 /* Note that enqueue will only write as many bytes as are free 1663 * in the produce queue, so we don't need to ensure len is 1664 * smaller than the queue size. It is the caller's 1665 * responsibility to check how many bytes we were able to send. 1666 */ 1667 1668 written = transport->stream_enqueue( 1669 vsk, msg, 1670 len - total_written); 1671 if (written < 0) { 1672 err = -ENOMEM; 1673 goto out_err; 1674 } 1675 1676 total_written += written; 1677 1678 err = transport->notify_send_post_enqueue( 1679 vsk, written, &send_data); 1680 if (err < 0) 1681 goto out_err; 1682 1683 } 1684 1685 out_err: 1686 if (total_written > 0) 1687 err = total_written; 1688 out: 1689 release_sock(sk); 1690 return err; 1691 } 1692 1693 1694 static int 1695 vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, 1696 int flags) 1697 { 1698 struct sock *sk; 1699 struct vsock_sock *vsk; 1700 const struct vsock_transport *transport; 1701 int err; 1702 size_t target; 1703 ssize_t copied; 1704 long timeout; 1705 struct vsock_transport_recv_notify_data recv_data; 1706 1707 DEFINE_WAIT(wait); 1708 1709 sk = sock->sk; 1710 vsk = vsock_sk(sk); 1711 transport = vsk->transport; 1712 err = 0; 1713 1714 lock_sock(sk); 1715 1716 if (sk->sk_state != TCP_ESTABLISHED) { 1717 /* Recvmsg is supposed to return 0 if a peer performs an 1718 * orderly shutdown. Differentiate between that case and when a 1719 * peer has not connected or a local shutdown occured with the 1720 * SOCK_DONE flag. 1721 */ 1722 if (sock_flag(sk, SOCK_DONE)) 1723 err = 0; 1724 else 1725 err = -ENOTCONN; 1726 1727 goto out; 1728 } 1729 1730 if (flags & MSG_OOB) { 1731 err = -EOPNOTSUPP; 1732 goto out; 1733 } 1734 1735 /* We don't check peer_shutdown flag here since peer may actually shut 1736 * down, but there can be data in the queue that a local socket can 1737 * receive. 1738 */ 1739 if (sk->sk_shutdown & RCV_SHUTDOWN) { 1740 err = 0; 1741 goto out; 1742 } 1743 1744 /* It is valid on Linux to pass in a zero-length receive buffer. This 1745 * is not an error. We may as well bail out now. 1746 */ 1747 if (!len) { 1748 err = 0; 1749 goto out; 1750 } 1751 1752 /* We must not copy less than target bytes into the user's buffer 1753 * before returning successfully, so we wait for the consume queue to 1754 * have that much data to consume before dequeueing. Note that this 1755 * makes it impossible to handle cases where target is greater than the 1756 * queue size. 1757 */ 1758 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); 1759 if (target >= transport->stream_rcvhiwat(vsk)) { 1760 err = -ENOMEM; 1761 goto out; 1762 } 1763 timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 1764 copied = 0; 1765 1766 err = transport->notify_recv_init(vsk, target, &recv_data); 1767 if (err < 0) 1768 goto out; 1769 1770 1771 while (1) { 1772 s64 ready; 1773 1774 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 1775 ready = vsock_stream_has_data(vsk); 1776 1777 if (ready == 0) { 1778 if (sk->sk_err != 0 || 1779 (sk->sk_shutdown & RCV_SHUTDOWN) || 1780 (vsk->peer_shutdown & SEND_SHUTDOWN)) { 1781 finish_wait(sk_sleep(sk), &wait); 1782 break; 1783 } 1784 /* Don't wait for non-blocking sockets. */ 1785 if (timeout == 0) { 1786 err = -EAGAIN; 1787 finish_wait(sk_sleep(sk), &wait); 1788 break; 1789 } 1790 1791 err = transport->notify_recv_pre_block( 1792 vsk, target, &recv_data); 1793 if (err < 0) { 1794 finish_wait(sk_sleep(sk), &wait); 1795 break; 1796 } 1797 release_sock(sk); 1798 timeout = schedule_timeout(timeout); 1799 lock_sock(sk); 1800 1801 if (signal_pending(current)) { 1802 err = sock_intr_errno(timeout); 1803 finish_wait(sk_sleep(sk), &wait); 1804 break; 1805 } else if (timeout == 0) { 1806 err = -EAGAIN; 1807 finish_wait(sk_sleep(sk), &wait); 1808 break; 1809 } 1810 } else { 1811 ssize_t read; 1812 1813 finish_wait(sk_sleep(sk), &wait); 1814 1815 if (ready < 0) { 1816 /* Invalid queue pair content. XXX This should 1817 * be changed to a connection reset in a later 1818 * change. 1819 */ 1820 1821 err = -ENOMEM; 1822 goto out; 1823 } 1824 1825 err = transport->notify_recv_pre_dequeue( 1826 vsk, target, &recv_data); 1827 if (err < 0) 1828 break; 1829 1830 read = transport->stream_dequeue( 1831 vsk, msg, 1832 len - copied, flags); 1833 if (read < 0) { 1834 err = -ENOMEM; 1835 break; 1836 } 1837 1838 copied += read; 1839 1840 err = transport->notify_recv_post_dequeue( 1841 vsk, target, read, 1842 !(flags & MSG_PEEK), &recv_data); 1843 if (err < 0) 1844 goto out; 1845 1846 if (read >= target || flags & MSG_PEEK) 1847 break; 1848 1849 target -= read; 1850 } 1851 } 1852 1853 if (sk->sk_err) 1854 err = -sk->sk_err; 1855 else if (sk->sk_shutdown & RCV_SHUTDOWN) 1856 err = 0; 1857 1858 if (copied > 0) 1859 err = copied; 1860 1861 out: 1862 release_sock(sk); 1863 return err; 1864 } 1865 1866 static const struct proto_ops vsock_stream_ops = { 1867 .family = PF_VSOCK, 1868 .owner = THIS_MODULE, 1869 .release = vsock_release, 1870 .bind = vsock_bind, 1871 .connect = vsock_stream_connect, 1872 .socketpair = sock_no_socketpair, 1873 .accept = vsock_accept, 1874 .getname = vsock_getname, 1875 .poll = vsock_poll, 1876 .ioctl = sock_no_ioctl, 1877 .listen = vsock_listen, 1878 .shutdown = vsock_shutdown, 1879 .setsockopt = vsock_stream_setsockopt, 1880 .getsockopt = vsock_stream_getsockopt, 1881 .sendmsg = vsock_stream_sendmsg, 1882 .recvmsg = vsock_stream_recvmsg, 1883 .mmap = sock_no_mmap, 1884 .sendpage = sock_no_sendpage, 1885 }; 1886 1887 static int vsock_create(struct net *net, struct socket *sock, 1888 int protocol, int kern) 1889 { 1890 if (!sock) 1891 return -EINVAL; 1892 1893 if (protocol && protocol != PF_VSOCK) 1894 return -EPROTONOSUPPORT; 1895 1896 switch (sock->type) { 1897 case SOCK_DGRAM: 1898 sock->ops = &vsock_dgram_ops; 1899 break; 1900 case SOCK_STREAM: 1901 sock->ops = &vsock_stream_ops; 1902 break; 1903 default: 1904 return -ESOCKTNOSUPPORT; 1905 } 1906 1907 sock->state = SS_UNCONNECTED; 1908 1909 return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ? 0 : -ENOMEM; 1910 } 1911 1912 static const struct net_proto_family vsock_family_ops = { 1913 .family = AF_VSOCK, 1914 .create = vsock_create, 1915 .owner = THIS_MODULE, 1916 }; 1917 1918 static long vsock_dev_do_ioctl(struct file *filp, 1919 unsigned int cmd, void __user *ptr) 1920 { 1921 u32 __user *p = ptr; 1922 int retval = 0; 1923 1924 switch (cmd) { 1925 case IOCTL_VM_SOCKETS_GET_LOCAL_CID: 1926 if (put_user(transport_single->get_local_cid(), p) != 0) 1927 retval = -EFAULT; 1928 break; 1929 1930 default: 1931 pr_err("Unknown ioctl %d\n", cmd); 1932 retval = -EINVAL; 1933 } 1934 1935 return retval; 1936 } 1937 1938 static long vsock_dev_ioctl(struct file *filp, 1939 unsigned int cmd, unsigned long arg) 1940 { 1941 return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg); 1942 } 1943 1944 #ifdef CONFIG_COMPAT 1945 static long vsock_dev_compat_ioctl(struct file *filp, 1946 unsigned int cmd, unsigned long arg) 1947 { 1948 return vsock_dev_do_ioctl(filp, cmd, compat_ptr(arg)); 1949 } 1950 #endif 1951 1952 static const struct file_operations vsock_device_ops = { 1953 .owner = THIS_MODULE, 1954 .unlocked_ioctl = vsock_dev_ioctl, 1955 #ifdef CONFIG_COMPAT 1956 .compat_ioctl = vsock_dev_compat_ioctl, 1957 #endif 1958 .open = nonseekable_open, 1959 }; 1960 1961 static struct miscdevice vsock_device = { 1962 .name = "vsock", 1963 .fops = &vsock_device_ops, 1964 }; 1965 1966 int __vsock_core_init(const struct vsock_transport *t, struct module *owner) 1967 { 1968 int err = mutex_lock_interruptible(&vsock_register_mutex); 1969 1970 if (err) 1971 return err; 1972 1973 if (transport_single) { 1974 err = -EBUSY; 1975 goto err_busy; 1976 } 1977 1978 /* Transport must be the owner of the protocol so that it can't 1979 * unload while there are open sockets. 1980 */ 1981 vsock_proto.owner = owner; 1982 transport_single = t; 1983 1984 vsock_device.minor = MISC_DYNAMIC_MINOR; 1985 err = misc_register(&vsock_device); 1986 if (err) { 1987 pr_err("Failed to register misc device\n"); 1988 goto err_reset_transport; 1989 } 1990 1991 err = proto_register(&vsock_proto, 1); /* we want our slab */ 1992 if (err) { 1993 pr_err("Cannot register vsock protocol\n"); 1994 goto err_deregister_misc; 1995 } 1996 1997 err = sock_register(&vsock_family_ops); 1998 if (err) { 1999 pr_err("could not register af_vsock (%d) address family: %d\n", 2000 AF_VSOCK, err); 2001 goto err_unregister_proto; 2002 } 2003 2004 mutex_unlock(&vsock_register_mutex); 2005 return 0; 2006 2007 err_unregister_proto: 2008 proto_unregister(&vsock_proto); 2009 err_deregister_misc: 2010 misc_deregister(&vsock_device); 2011 err_reset_transport: 2012 transport_single = NULL; 2013 err_busy: 2014 mutex_unlock(&vsock_register_mutex); 2015 return err; 2016 } 2017 EXPORT_SYMBOL_GPL(__vsock_core_init); 2018 2019 void vsock_core_exit(void) 2020 { 2021 mutex_lock(&vsock_register_mutex); 2022 2023 misc_deregister(&vsock_device); 2024 sock_unregister(AF_VSOCK); 2025 proto_unregister(&vsock_proto); 2026 2027 /* We do not want the assignment below re-ordered. */ 2028 mb(); 2029 transport_single = NULL; 2030 2031 mutex_unlock(&vsock_register_mutex); 2032 } 2033 EXPORT_SYMBOL_GPL(vsock_core_exit); 2034 2035 const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk) 2036 { 2037 return vsk->transport; 2038 } 2039 EXPORT_SYMBOL_GPL(vsock_core_get_transport); 2040 2041 static void __exit vsock_exit(void) 2042 { 2043 /* Do nothing. This function makes this module removable. */ 2044 } 2045 2046 module_init(vsock_init_tables); 2047 module_exit(vsock_exit); 2048 2049 MODULE_AUTHOR("VMware, Inc."); 2050 MODULE_DESCRIPTION("VMware Virtual Socket Family"); 2051 MODULE_VERSION("1.0.2.0-k"); 2052 MODULE_LICENSE("GPL v2"); 2053