1 /* 2 * VMware vSockets Driver 3 * 4 * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. 5 * 6 * This program is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License as published by the Free 8 * Software Foundation version 2 and no later version. 9 * 10 * This program is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 * more details. 14 */ 15 16 /* Implementation notes: 17 * 18 * - There are two kinds of sockets: those created by user action (such as 19 * calling socket(2)) and those created by incoming connection request packets. 20 * 21 * - There are two "global" tables, one for bound sockets (sockets that have 22 * specified an address that they are responsible for) and one for connected 23 * sockets (sockets that have established a connection with another socket). 24 * These tables are "global" in that all sockets on the system are placed 25 * within them. - Note, though, that the bound table contains an extra entry 26 * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in 27 * that list. The bound table is used solely for lookup of sockets when packets 28 * are received and that's not necessary for SOCK_DGRAM sockets since we create 29 * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM 30 * sockets out of the bound hash buckets will reduce the chance of collisions 31 * when looking for SOCK_STREAM sockets and prevents us from having to check the 32 * socket type in the hash table lookups. 33 * 34 * - Sockets created by user action will either be "client" sockets that 35 * initiate a connection or "server" sockets that listen for connections; we do 36 * not support simultaneous connects (two "client" sockets connecting). 37 * 38 * - "Server" sockets are referred to as listener sockets throughout this 39 * implementation because they are in the TCP_LISTEN state. When a 40 * connection request is received (the second kind of socket mentioned above), 41 * we create a new socket and refer to it as a pending socket. These pending 42 * sockets are placed on the pending connection list of the listener socket. 43 * When future packets are received for the address the listener socket is 44 * bound to, we check if the source of the packet is from one that has an 45 * existing pending connection. If it does, we process the packet for the 46 * pending socket. When that socket reaches the connected state, it is removed 47 * from the listener socket's pending list and enqueued in the listener 48 * socket's accept queue. Callers of accept(2) will accept connected sockets 49 * from the listener socket's accept queue. If the socket cannot be accepted 50 * for some reason then it is marked rejected. Once the connection is 51 * accepted, it is owned by the user process and the responsibility for cleanup 52 * falls with that user process. 53 * 54 * - It is possible that these pending sockets will never reach the connected 55 * state; in fact, we may never receive another packet after the connection 56 * request. Because of this, we must schedule a cleanup function to run in the 57 * future, after some amount of time passes where a connection should have been 58 * established. This function ensures that the socket is off all lists so it 59 * cannot be retrieved, then drops all references to the socket so it is cleaned 60 * up (sock_put() -> sk_free() -> our sk_destruct implementation). Note this 61 * function will also cleanup rejected sockets, those that reach the connected 62 * state but leave it before they have been accepted. 63 * 64 * - Lock ordering for pending or accept queue sockets is: 65 * 66 * lock_sock(listener); 67 * lock_sock_nested(pending, SINGLE_DEPTH_NESTING); 68 * 69 * Using explicit nested locking keeps lockdep happy since normally only one 70 * lock of a given class may be taken at a time. 71 * 72 * - Sockets created by user action will be cleaned up when the user process 73 * calls close(2), causing our release implementation to be called. Our release 74 * implementation will perform some cleanup then drop the last reference so our 75 * sk_destruct implementation is invoked. Our sk_destruct implementation will 76 * perform additional cleanup that's common for both types of sockets. 77 * 78 * - A socket's reference count is what ensures that the structure won't be 79 * freed. Each entry in a list (such as the "global" bound and connected tables 80 * and the listener socket's pending list and connected queue) ensures a 81 * reference. When we defer work until process context and pass a socket as our 82 * argument, we must ensure the reference count is increased to ensure the 83 * socket isn't freed before the function is run; the deferred function will 84 * then drop the reference. 85 * 86 * - sk->sk_state uses the TCP state constants because they are widely used by 87 * other address families and exposed to userspace tools like ss(8): 88 * 89 * TCP_CLOSE - unconnected 90 * TCP_SYN_SENT - connecting 91 * TCP_ESTABLISHED - connected 92 * TCP_CLOSING - disconnecting 93 * TCP_LISTEN - listening 94 */ 95 96 #include <linux/types.h> 97 #include <linux/bitops.h> 98 #include <linux/cred.h> 99 #include <linux/init.h> 100 #include <linux/io.h> 101 #include <linux/kernel.h> 102 #include <linux/sched/signal.h> 103 #include <linux/kmod.h> 104 #include <linux/list.h> 105 #include <linux/miscdevice.h> 106 #include <linux/module.h> 107 #include <linux/mutex.h> 108 #include <linux/net.h> 109 #include <linux/poll.h> 110 #include <linux/random.h> 111 #include <linux/skbuff.h> 112 #include <linux/smp.h> 113 #include <linux/socket.h> 114 #include <linux/stddef.h> 115 #include <linux/unistd.h> 116 #include <linux/wait.h> 117 #include <linux/workqueue.h> 118 #include <net/sock.h> 119 #include <net/af_vsock.h> 120 121 static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr); 122 static void vsock_sk_destruct(struct sock *sk); 123 static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); 124 125 /* Protocol family. */ 126 static struct proto vsock_proto = { 127 .name = "AF_VSOCK", 128 .owner = THIS_MODULE, 129 .obj_size = sizeof(struct vsock_sock), 130 }; 131 132 /* The default peer timeout indicates how long we will wait for a peer response 133 * to a control message. 134 */ 135 #define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) 136 137 static const struct vsock_transport *transport; 138 static DEFINE_MUTEX(vsock_register_mutex); 139 140 /**** EXPORTS ****/ 141 142 /* Get the ID of the local context. This is transport dependent. */ 143 144 int vm_sockets_get_local_cid(void) 145 { 146 return transport->get_local_cid(); 147 } 148 EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid); 149 150 /**** UTILS ****/ 151 152 /* Each bound VSocket is stored in the bind hash table and each connected 153 * VSocket is stored in the connected hash table. 154 * 155 * Unbound sockets are all put on the same list attached to the end of the hash 156 * table (vsock_unbound_sockets). Bound sockets are added to the hash table in 157 * the bucket that their local address hashes to (vsock_bound_sockets(addr) 158 * represents the list that addr hashes to). 159 * 160 * Specifically, we initialize the vsock_bind_table array to a size of 161 * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through 162 * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and 163 * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function 164 * mods with VSOCK_HASH_SIZE to ensure this. 165 */ 166 #define MAX_PORT_RETRIES 24 167 168 #define VSOCK_HASH(addr) ((addr)->svm_port % VSOCK_HASH_SIZE) 169 #define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)]) 170 #define vsock_unbound_sockets (&vsock_bind_table[VSOCK_HASH_SIZE]) 171 172 /* XXX This can probably be implemented in a better way. */ 173 #define VSOCK_CONN_HASH(src, dst) \ 174 (((src)->svm_cid ^ (dst)->svm_port) % VSOCK_HASH_SIZE) 175 #define vsock_connected_sockets(src, dst) \ 176 (&vsock_connected_table[VSOCK_CONN_HASH(src, dst)]) 177 #define vsock_connected_sockets_vsk(vsk) \ 178 vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr) 179 180 struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; 181 EXPORT_SYMBOL_GPL(vsock_bind_table); 182 struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; 183 EXPORT_SYMBOL_GPL(vsock_connected_table); 184 DEFINE_SPINLOCK(vsock_table_lock); 185 EXPORT_SYMBOL_GPL(vsock_table_lock); 186 187 /* Autobind this socket to the local address if necessary. */ 188 static int vsock_auto_bind(struct vsock_sock *vsk) 189 { 190 struct sock *sk = sk_vsock(vsk); 191 struct sockaddr_vm local_addr; 192 193 if (vsock_addr_bound(&vsk->local_addr)) 194 return 0; 195 vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 196 return __vsock_bind(sk, &local_addr); 197 } 198 199 static int __init vsock_init_tables(void) 200 { 201 int i; 202 203 for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++) 204 INIT_LIST_HEAD(&vsock_bind_table[i]); 205 206 for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) 207 INIT_LIST_HEAD(&vsock_connected_table[i]); 208 return 0; 209 } 210 211 static void __vsock_insert_bound(struct list_head *list, 212 struct vsock_sock *vsk) 213 { 214 sock_hold(&vsk->sk); 215 list_add(&vsk->bound_table, list); 216 } 217 218 static void __vsock_insert_connected(struct list_head *list, 219 struct vsock_sock *vsk) 220 { 221 sock_hold(&vsk->sk); 222 list_add(&vsk->connected_table, list); 223 } 224 225 static void __vsock_remove_bound(struct vsock_sock *vsk) 226 { 227 list_del_init(&vsk->bound_table); 228 sock_put(&vsk->sk); 229 } 230 231 static void __vsock_remove_connected(struct vsock_sock *vsk) 232 { 233 list_del_init(&vsk->connected_table); 234 sock_put(&vsk->sk); 235 } 236 237 static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr) 238 { 239 struct vsock_sock *vsk; 240 241 list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) 242 if (addr->svm_port == vsk->local_addr.svm_port) 243 return sk_vsock(vsk); 244 245 return NULL; 246 } 247 248 static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src, 249 struct sockaddr_vm *dst) 250 { 251 struct vsock_sock *vsk; 252 253 list_for_each_entry(vsk, vsock_connected_sockets(src, dst), 254 connected_table) { 255 if (vsock_addr_equals_addr(src, &vsk->remote_addr) && 256 dst->svm_port == vsk->local_addr.svm_port) { 257 return sk_vsock(vsk); 258 } 259 } 260 261 return NULL; 262 } 263 264 static void vsock_insert_unbound(struct vsock_sock *vsk) 265 { 266 spin_lock_bh(&vsock_table_lock); 267 __vsock_insert_bound(vsock_unbound_sockets, vsk); 268 spin_unlock_bh(&vsock_table_lock); 269 } 270 271 void vsock_insert_connected(struct vsock_sock *vsk) 272 { 273 struct list_head *list = vsock_connected_sockets( 274 &vsk->remote_addr, &vsk->local_addr); 275 276 spin_lock_bh(&vsock_table_lock); 277 __vsock_insert_connected(list, vsk); 278 spin_unlock_bh(&vsock_table_lock); 279 } 280 EXPORT_SYMBOL_GPL(vsock_insert_connected); 281 282 void vsock_remove_bound(struct vsock_sock *vsk) 283 { 284 spin_lock_bh(&vsock_table_lock); 285 __vsock_remove_bound(vsk); 286 spin_unlock_bh(&vsock_table_lock); 287 } 288 EXPORT_SYMBOL_GPL(vsock_remove_bound); 289 290 void vsock_remove_connected(struct vsock_sock *vsk) 291 { 292 spin_lock_bh(&vsock_table_lock); 293 __vsock_remove_connected(vsk); 294 spin_unlock_bh(&vsock_table_lock); 295 } 296 EXPORT_SYMBOL_GPL(vsock_remove_connected); 297 298 struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) 299 { 300 struct sock *sk; 301 302 spin_lock_bh(&vsock_table_lock); 303 sk = __vsock_find_bound_socket(addr); 304 if (sk) 305 sock_hold(sk); 306 307 spin_unlock_bh(&vsock_table_lock); 308 309 return sk; 310 } 311 EXPORT_SYMBOL_GPL(vsock_find_bound_socket); 312 313 struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, 314 struct sockaddr_vm *dst) 315 { 316 struct sock *sk; 317 318 spin_lock_bh(&vsock_table_lock); 319 sk = __vsock_find_connected_socket(src, dst); 320 if (sk) 321 sock_hold(sk); 322 323 spin_unlock_bh(&vsock_table_lock); 324 325 return sk; 326 } 327 EXPORT_SYMBOL_GPL(vsock_find_connected_socket); 328 329 static bool vsock_in_bound_table(struct vsock_sock *vsk) 330 { 331 bool ret; 332 333 spin_lock_bh(&vsock_table_lock); 334 ret = __vsock_in_bound_table(vsk); 335 spin_unlock_bh(&vsock_table_lock); 336 337 return ret; 338 } 339 340 static bool vsock_in_connected_table(struct vsock_sock *vsk) 341 { 342 bool ret; 343 344 spin_lock_bh(&vsock_table_lock); 345 ret = __vsock_in_connected_table(vsk); 346 spin_unlock_bh(&vsock_table_lock); 347 348 return ret; 349 } 350 351 void vsock_remove_sock(struct vsock_sock *vsk) 352 { 353 if (vsock_in_bound_table(vsk)) 354 vsock_remove_bound(vsk); 355 356 if (vsock_in_connected_table(vsk)) 357 vsock_remove_connected(vsk); 358 } 359 EXPORT_SYMBOL_GPL(vsock_remove_sock); 360 361 void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)) 362 { 363 int i; 364 365 spin_lock_bh(&vsock_table_lock); 366 367 for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) { 368 struct vsock_sock *vsk; 369 list_for_each_entry(vsk, &vsock_connected_table[i], 370 connected_table) 371 fn(sk_vsock(vsk)); 372 } 373 374 spin_unlock_bh(&vsock_table_lock); 375 } 376 EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket); 377 378 void vsock_add_pending(struct sock *listener, struct sock *pending) 379 { 380 struct vsock_sock *vlistener; 381 struct vsock_sock *vpending; 382 383 vlistener = vsock_sk(listener); 384 vpending = vsock_sk(pending); 385 386 sock_hold(pending); 387 sock_hold(listener); 388 list_add_tail(&vpending->pending_links, &vlistener->pending_links); 389 } 390 EXPORT_SYMBOL_GPL(vsock_add_pending); 391 392 void vsock_remove_pending(struct sock *listener, struct sock *pending) 393 { 394 struct vsock_sock *vpending = vsock_sk(pending); 395 396 list_del_init(&vpending->pending_links); 397 sock_put(listener); 398 sock_put(pending); 399 } 400 EXPORT_SYMBOL_GPL(vsock_remove_pending); 401 402 void vsock_enqueue_accept(struct sock *listener, struct sock *connected) 403 { 404 struct vsock_sock *vlistener; 405 struct vsock_sock *vconnected; 406 407 vlistener = vsock_sk(listener); 408 vconnected = vsock_sk(connected); 409 410 sock_hold(connected); 411 sock_hold(listener); 412 list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue); 413 } 414 EXPORT_SYMBOL_GPL(vsock_enqueue_accept); 415 416 static struct sock *vsock_dequeue_accept(struct sock *listener) 417 { 418 struct vsock_sock *vlistener; 419 struct vsock_sock *vconnected; 420 421 vlistener = vsock_sk(listener); 422 423 if (list_empty(&vlistener->accept_queue)) 424 return NULL; 425 426 vconnected = list_entry(vlistener->accept_queue.next, 427 struct vsock_sock, accept_queue); 428 429 list_del_init(&vconnected->accept_queue); 430 sock_put(listener); 431 /* The caller will need a reference on the connected socket so we let 432 * it call sock_put(). 433 */ 434 435 return sk_vsock(vconnected); 436 } 437 438 static bool vsock_is_accept_queue_empty(struct sock *sk) 439 { 440 struct vsock_sock *vsk = vsock_sk(sk); 441 return list_empty(&vsk->accept_queue); 442 } 443 444 static bool vsock_is_pending(struct sock *sk) 445 { 446 struct vsock_sock *vsk = vsock_sk(sk); 447 return !list_empty(&vsk->pending_links); 448 } 449 450 static int vsock_send_shutdown(struct sock *sk, int mode) 451 { 452 return transport->shutdown(vsock_sk(sk), mode); 453 } 454 455 static void vsock_pending_work(struct work_struct *work) 456 { 457 struct sock *sk; 458 struct sock *listener; 459 struct vsock_sock *vsk; 460 bool cleanup; 461 462 vsk = container_of(work, struct vsock_sock, pending_work.work); 463 sk = sk_vsock(vsk); 464 listener = vsk->listener; 465 cleanup = true; 466 467 lock_sock(listener); 468 lock_sock_nested(sk, SINGLE_DEPTH_NESTING); 469 470 if (vsock_is_pending(sk)) { 471 vsock_remove_pending(listener, sk); 472 473 listener->sk_ack_backlog--; 474 } else if (!vsk->rejected) { 475 /* We are not on the pending list and accept() did not reject 476 * us, so we must have been accepted by our user process. We 477 * just need to drop our references to the sockets and be on 478 * our way. 479 */ 480 cleanup = false; 481 goto out; 482 } 483 484 /* We need to remove ourself from the global connected sockets list so 485 * incoming packets can't find this socket, and to reduce the reference 486 * count. 487 */ 488 if (vsock_in_connected_table(vsk)) 489 vsock_remove_connected(vsk); 490 491 sk->sk_state = TCP_CLOSE; 492 493 out: 494 release_sock(sk); 495 release_sock(listener); 496 if (cleanup) 497 sock_put(sk); 498 499 sock_put(sk); 500 sock_put(listener); 501 } 502 503 /**** SOCKET OPERATIONS ****/ 504 505 static int __vsock_bind_stream(struct vsock_sock *vsk, 506 struct sockaddr_vm *addr) 507 { 508 static u32 port = 0; 509 struct sockaddr_vm new_addr; 510 511 if (!port) 512 port = LAST_RESERVED_PORT + 1 + 513 prandom_u32_max(U32_MAX - LAST_RESERVED_PORT); 514 515 vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port); 516 517 if (addr->svm_port == VMADDR_PORT_ANY) { 518 bool found = false; 519 unsigned int i; 520 521 for (i = 0; i < MAX_PORT_RETRIES; i++) { 522 if (port <= LAST_RESERVED_PORT) 523 port = LAST_RESERVED_PORT + 1; 524 525 new_addr.svm_port = port++; 526 527 if (!__vsock_find_bound_socket(&new_addr)) { 528 found = true; 529 break; 530 } 531 } 532 533 if (!found) 534 return -EADDRNOTAVAIL; 535 } else { 536 /* If port is in reserved range, ensure caller 537 * has necessary privileges. 538 */ 539 if (addr->svm_port <= LAST_RESERVED_PORT && 540 !capable(CAP_NET_BIND_SERVICE)) { 541 return -EACCES; 542 } 543 544 if (__vsock_find_bound_socket(&new_addr)) 545 return -EADDRINUSE; 546 } 547 548 vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port); 549 550 /* Remove stream sockets from the unbound list and add them to the hash 551 * table for easy lookup by its address. The unbound list is simply an 552 * extra entry at the end of the hash table, a trick used by AF_UNIX. 553 */ 554 __vsock_remove_bound(vsk); 555 __vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk); 556 557 return 0; 558 } 559 560 static int __vsock_bind_dgram(struct vsock_sock *vsk, 561 struct sockaddr_vm *addr) 562 { 563 return transport->dgram_bind(vsk, addr); 564 } 565 566 static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) 567 { 568 struct vsock_sock *vsk = vsock_sk(sk); 569 u32 cid; 570 int retval; 571 572 /* First ensure this socket isn't already bound. */ 573 if (vsock_addr_bound(&vsk->local_addr)) 574 return -EINVAL; 575 576 /* Now bind to the provided address or select appropriate values if 577 * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that 578 * like AF_INET prevents binding to a non-local IP address (in most 579 * cases), we only allow binding to the local CID. 580 */ 581 cid = transport->get_local_cid(); 582 if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY) 583 return -EADDRNOTAVAIL; 584 585 switch (sk->sk_socket->type) { 586 case SOCK_STREAM: 587 spin_lock_bh(&vsock_table_lock); 588 retval = __vsock_bind_stream(vsk, addr); 589 spin_unlock_bh(&vsock_table_lock); 590 break; 591 592 case SOCK_DGRAM: 593 retval = __vsock_bind_dgram(vsk, addr); 594 break; 595 596 default: 597 retval = -EINVAL; 598 break; 599 } 600 601 return retval; 602 } 603 604 static void vsock_connect_timeout(struct work_struct *work); 605 606 struct sock *__vsock_create(struct net *net, 607 struct socket *sock, 608 struct sock *parent, 609 gfp_t priority, 610 unsigned short type, 611 int kern) 612 { 613 struct sock *sk; 614 struct vsock_sock *psk; 615 struct vsock_sock *vsk; 616 617 sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern); 618 if (!sk) 619 return NULL; 620 621 sock_init_data(sock, sk); 622 623 /* sk->sk_type is normally set in sock_init_data, but only if sock is 624 * non-NULL. We make sure that our sockets always have a type by 625 * setting it here if needed. 626 */ 627 if (!sock) 628 sk->sk_type = type; 629 630 vsk = vsock_sk(sk); 631 vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 632 vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 633 634 sk->sk_destruct = vsock_sk_destruct; 635 sk->sk_backlog_rcv = vsock_queue_rcv_skb; 636 sock_reset_flag(sk, SOCK_DONE); 637 638 INIT_LIST_HEAD(&vsk->bound_table); 639 INIT_LIST_HEAD(&vsk->connected_table); 640 vsk->listener = NULL; 641 INIT_LIST_HEAD(&vsk->pending_links); 642 INIT_LIST_HEAD(&vsk->accept_queue); 643 vsk->rejected = false; 644 vsk->sent_request = false; 645 vsk->ignore_connecting_rst = false; 646 vsk->peer_shutdown = 0; 647 INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout); 648 INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work); 649 650 psk = parent ? vsock_sk(parent) : NULL; 651 if (parent) { 652 vsk->trusted = psk->trusted; 653 vsk->owner = get_cred(psk->owner); 654 vsk->connect_timeout = psk->connect_timeout; 655 } else { 656 vsk->trusted = capable(CAP_NET_ADMIN); 657 vsk->owner = get_current_cred(); 658 vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; 659 } 660 661 if (transport->init(vsk, psk) < 0) { 662 sk_free(sk); 663 return NULL; 664 } 665 666 if (sock) 667 vsock_insert_unbound(vsk); 668 669 return sk; 670 } 671 EXPORT_SYMBOL_GPL(__vsock_create); 672 673 static void __vsock_release(struct sock *sk) 674 { 675 if (sk) { 676 struct sk_buff *skb; 677 struct sock *pending; 678 struct vsock_sock *vsk; 679 680 vsk = vsock_sk(sk); 681 pending = NULL; /* Compiler warning. */ 682 683 transport->release(vsk); 684 685 lock_sock(sk); 686 sock_orphan(sk); 687 sk->sk_shutdown = SHUTDOWN_MASK; 688 689 while ((skb = skb_dequeue(&sk->sk_receive_queue))) 690 kfree_skb(skb); 691 692 /* Clean up any sockets that never were accepted. */ 693 while ((pending = vsock_dequeue_accept(sk)) != NULL) { 694 __vsock_release(pending); 695 sock_put(pending); 696 } 697 698 release_sock(sk); 699 sock_put(sk); 700 } 701 } 702 703 static void vsock_sk_destruct(struct sock *sk) 704 { 705 struct vsock_sock *vsk = vsock_sk(sk); 706 707 transport->destruct(vsk); 708 709 /* When clearing these addresses, there's no need to set the family and 710 * possibly register the address family with the kernel. 711 */ 712 vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 713 vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 714 715 put_cred(vsk->owner); 716 } 717 718 static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 719 { 720 int err; 721 722 err = sock_queue_rcv_skb(sk, skb); 723 if (err) 724 kfree_skb(skb); 725 726 return err; 727 } 728 729 s64 vsock_stream_has_data(struct vsock_sock *vsk) 730 { 731 return transport->stream_has_data(vsk); 732 } 733 EXPORT_SYMBOL_GPL(vsock_stream_has_data); 734 735 s64 vsock_stream_has_space(struct vsock_sock *vsk) 736 { 737 return transport->stream_has_space(vsk); 738 } 739 EXPORT_SYMBOL_GPL(vsock_stream_has_space); 740 741 static int vsock_release(struct socket *sock) 742 { 743 __vsock_release(sock->sk); 744 sock->sk = NULL; 745 sock->state = SS_FREE; 746 747 return 0; 748 } 749 750 static int 751 vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) 752 { 753 int err; 754 struct sock *sk; 755 struct sockaddr_vm *vm_addr; 756 757 sk = sock->sk; 758 759 if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0) 760 return -EINVAL; 761 762 lock_sock(sk); 763 err = __vsock_bind(sk, vm_addr); 764 release_sock(sk); 765 766 return err; 767 } 768 769 static int vsock_getname(struct socket *sock, 770 struct sockaddr *addr, int peer) 771 { 772 int err; 773 struct sock *sk; 774 struct vsock_sock *vsk; 775 struct sockaddr_vm *vm_addr; 776 777 sk = sock->sk; 778 vsk = vsock_sk(sk); 779 err = 0; 780 781 lock_sock(sk); 782 783 if (peer) { 784 if (sock->state != SS_CONNECTED) { 785 err = -ENOTCONN; 786 goto out; 787 } 788 vm_addr = &vsk->remote_addr; 789 } else { 790 vm_addr = &vsk->local_addr; 791 } 792 793 if (!vm_addr) { 794 err = -EINVAL; 795 goto out; 796 } 797 798 /* sys_getsockname() and sys_getpeername() pass us a 799 * MAX_SOCK_ADDR-sized buffer and don't set addr_len. Unfortunately 800 * that macro is defined in socket.c instead of .h, so we hardcode its 801 * value here. 802 */ 803 BUILD_BUG_ON(sizeof(*vm_addr) > 128); 804 memcpy(addr, vm_addr, sizeof(*vm_addr)); 805 err = sizeof(*vm_addr); 806 807 out: 808 release_sock(sk); 809 return err; 810 } 811 812 static int vsock_shutdown(struct socket *sock, int mode) 813 { 814 int err; 815 struct sock *sk; 816 817 /* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses 818 * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode 819 * here like the other address families do. Note also that the 820 * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3), 821 * which is what we want. 822 */ 823 mode++; 824 825 if ((mode & ~SHUTDOWN_MASK) || !mode) 826 return -EINVAL; 827 828 /* If this is a STREAM socket and it is not connected then bail out 829 * immediately. If it is a DGRAM socket then we must first kick the 830 * socket so that it wakes up from any sleeping calls, for example 831 * recv(), and then afterwards return the error. 832 */ 833 834 sk = sock->sk; 835 if (sock->state == SS_UNCONNECTED) { 836 err = -ENOTCONN; 837 if (sk->sk_type == SOCK_STREAM) 838 return err; 839 } else { 840 sock->state = SS_DISCONNECTING; 841 err = 0; 842 } 843 844 /* Receive and send shutdowns are treated alike. */ 845 mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN); 846 if (mode) { 847 lock_sock(sk); 848 sk->sk_shutdown |= mode; 849 sk->sk_state_change(sk); 850 release_sock(sk); 851 852 if (sk->sk_type == SOCK_STREAM) { 853 sock_reset_flag(sk, SOCK_DONE); 854 vsock_send_shutdown(sk, mode); 855 } 856 } 857 858 return err; 859 } 860 861 static __poll_t vsock_poll(struct file *file, struct socket *sock, 862 poll_table *wait) 863 { 864 struct sock *sk; 865 __poll_t mask; 866 struct vsock_sock *vsk; 867 868 sk = sock->sk; 869 vsk = vsock_sk(sk); 870 871 poll_wait(file, sk_sleep(sk), wait); 872 mask = 0; 873 874 if (sk->sk_err) 875 /* Signify that there has been an error on this socket. */ 876 mask |= EPOLLERR; 877 878 /* INET sockets treat local write shutdown and peer write shutdown as a 879 * case of EPOLLHUP set. 880 */ 881 if ((sk->sk_shutdown == SHUTDOWN_MASK) || 882 ((sk->sk_shutdown & SEND_SHUTDOWN) && 883 (vsk->peer_shutdown & SEND_SHUTDOWN))) { 884 mask |= EPOLLHUP; 885 } 886 887 if (sk->sk_shutdown & RCV_SHUTDOWN || 888 vsk->peer_shutdown & SEND_SHUTDOWN) { 889 mask |= EPOLLRDHUP; 890 } 891 892 if (sock->type == SOCK_DGRAM) { 893 /* For datagram sockets we can read if there is something in 894 * the queue and write as long as the socket isn't shutdown for 895 * sending. 896 */ 897 if (!skb_queue_empty(&sk->sk_receive_queue) || 898 (sk->sk_shutdown & RCV_SHUTDOWN)) { 899 mask |= EPOLLIN | EPOLLRDNORM; 900 } 901 902 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) 903 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 904 905 } else if (sock->type == SOCK_STREAM) { 906 lock_sock(sk); 907 908 /* Listening sockets that have connections in their accept 909 * queue can be read. 910 */ 911 if (sk->sk_state == TCP_LISTEN 912 && !vsock_is_accept_queue_empty(sk)) 913 mask |= EPOLLIN | EPOLLRDNORM; 914 915 /* If there is something in the queue then we can read. */ 916 if (transport->stream_is_active(vsk) && 917 !(sk->sk_shutdown & RCV_SHUTDOWN)) { 918 bool data_ready_now = false; 919 int ret = transport->notify_poll_in( 920 vsk, 1, &data_ready_now); 921 if (ret < 0) { 922 mask |= EPOLLERR; 923 } else { 924 if (data_ready_now) 925 mask |= EPOLLIN | EPOLLRDNORM; 926 927 } 928 } 929 930 /* Sockets whose connections have been closed, reset, or 931 * terminated should also be considered read, and we check the 932 * shutdown flag for that. 933 */ 934 if (sk->sk_shutdown & RCV_SHUTDOWN || 935 vsk->peer_shutdown & SEND_SHUTDOWN) { 936 mask |= EPOLLIN | EPOLLRDNORM; 937 } 938 939 /* Connected sockets that can produce data can be written. */ 940 if (sk->sk_state == TCP_ESTABLISHED) { 941 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { 942 bool space_avail_now = false; 943 int ret = transport->notify_poll_out( 944 vsk, 1, &space_avail_now); 945 if (ret < 0) { 946 mask |= EPOLLERR; 947 } else { 948 if (space_avail_now) 949 /* Remove EPOLLWRBAND since INET 950 * sockets are not setting it. 951 */ 952 mask |= EPOLLOUT | EPOLLWRNORM; 953 954 } 955 } 956 } 957 958 /* Simulate INET socket poll behaviors, which sets 959 * EPOLLOUT|EPOLLWRNORM when peer is closed and nothing to read, 960 * but local send is not shutdown. 961 */ 962 if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) { 963 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) 964 mask |= EPOLLOUT | EPOLLWRNORM; 965 966 } 967 968 release_sock(sk); 969 } 970 971 return mask; 972 } 973 974 static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, 975 size_t len) 976 { 977 int err; 978 struct sock *sk; 979 struct vsock_sock *vsk; 980 struct sockaddr_vm *remote_addr; 981 982 if (msg->msg_flags & MSG_OOB) 983 return -EOPNOTSUPP; 984 985 /* For now, MSG_DONTWAIT is always assumed... */ 986 err = 0; 987 sk = sock->sk; 988 vsk = vsock_sk(sk); 989 990 lock_sock(sk); 991 992 err = vsock_auto_bind(vsk); 993 if (err) 994 goto out; 995 996 997 /* If the provided message contains an address, use that. Otherwise 998 * fall back on the socket's remote handle (if it has been connected). 999 */ 1000 if (msg->msg_name && 1001 vsock_addr_cast(msg->msg_name, msg->msg_namelen, 1002 &remote_addr) == 0) { 1003 /* Ensure this address is of the right type and is a valid 1004 * destination. 1005 */ 1006 1007 if (remote_addr->svm_cid == VMADDR_CID_ANY) 1008 remote_addr->svm_cid = transport->get_local_cid(); 1009 1010 if (!vsock_addr_bound(remote_addr)) { 1011 err = -EINVAL; 1012 goto out; 1013 } 1014 } else if (sock->state == SS_CONNECTED) { 1015 remote_addr = &vsk->remote_addr; 1016 1017 if (remote_addr->svm_cid == VMADDR_CID_ANY) 1018 remote_addr->svm_cid = transport->get_local_cid(); 1019 1020 /* XXX Should connect() or this function ensure remote_addr is 1021 * bound? 1022 */ 1023 if (!vsock_addr_bound(&vsk->remote_addr)) { 1024 err = -EINVAL; 1025 goto out; 1026 } 1027 } else { 1028 err = -EINVAL; 1029 goto out; 1030 } 1031 1032 if (!transport->dgram_allow(remote_addr->svm_cid, 1033 remote_addr->svm_port)) { 1034 err = -EINVAL; 1035 goto out; 1036 } 1037 1038 err = transport->dgram_enqueue(vsk, remote_addr, msg, len); 1039 1040 out: 1041 release_sock(sk); 1042 return err; 1043 } 1044 1045 static int vsock_dgram_connect(struct socket *sock, 1046 struct sockaddr *addr, int addr_len, int flags) 1047 { 1048 int err; 1049 struct sock *sk; 1050 struct vsock_sock *vsk; 1051 struct sockaddr_vm *remote_addr; 1052 1053 sk = sock->sk; 1054 vsk = vsock_sk(sk); 1055 1056 err = vsock_addr_cast(addr, addr_len, &remote_addr); 1057 if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) { 1058 lock_sock(sk); 1059 vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, 1060 VMADDR_PORT_ANY); 1061 sock->state = SS_UNCONNECTED; 1062 release_sock(sk); 1063 return 0; 1064 } else if (err != 0) 1065 return -EINVAL; 1066 1067 lock_sock(sk); 1068 1069 err = vsock_auto_bind(vsk); 1070 if (err) 1071 goto out; 1072 1073 if (!transport->dgram_allow(remote_addr->svm_cid, 1074 remote_addr->svm_port)) { 1075 err = -EINVAL; 1076 goto out; 1077 } 1078 1079 memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr)); 1080 sock->state = SS_CONNECTED; 1081 1082 out: 1083 release_sock(sk); 1084 return err; 1085 } 1086 1087 static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, 1088 size_t len, int flags) 1089 { 1090 return transport->dgram_dequeue(vsock_sk(sock->sk), msg, len, flags); 1091 } 1092 1093 static const struct proto_ops vsock_dgram_ops = { 1094 .family = PF_VSOCK, 1095 .owner = THIS_MODULE, 1096 .release = vsock_release, 1097 .bind = vsock_bind, 1098 .connect = vsock_dgram_connect, 1099 .socketpair = sock_no_socketpair, 1100 .accept = sock_no_accept, 1101 .getname = vsock_getname, 1102 .poll = vsock_poll, 1103 .ioctl = sock_no_ioctl, 1104 .listen = sock_no_listen, 1105 .shutdown = vsock_shutdown, 1106 .setsockopt = sock_no_setsockopt, 1107 .getsockopt = sock_no_getsockopt, 1108 .sendmsg = vsock_dgram_sendmsg, 1109 .recvmsg = vsock_dgram_recvmsg, 1110 .mmap = sock_no_mmap, 1111 .sendpage = sock_no_sendpage, 1112 }; 1113 1114 static int vsock_transport_cancel_pkt(struct vsock_sock *vsk) 1115 { 1116 if (!transport->cancel_pkt) 1117 return -EOPNOTSUPP; 1118 1119 return transport->cancel_pkt(vsk); 1120 } 1121 1122 static void vsock_connect_timeout(struct work_struct *work) 1123 { 1124 struct sock *sk; 1125 struct vsock_sock *vsk; 1126 int cancel = 0; 1127 1128 vsk = container_of(work, struct vsock_sock, connect_work.work); 1129 sk = sk_vsock(vsk); 1130 1131 lock_sock(sk); 1132 if (sk->sk_state == TCP_SYN_SENT && 1133 (sk->sk_shutdown != SHUTDOWN_MASK)) { 1134 sk->sk_state = TCP_CLOSE; 1135 sk->sk_err = ETIMEDOUT; 1136 sk->sk_error_report(sk); 1137 cancel = 1; 1138 } 1139 release_sock(sk); 1140 if (cancel) 1141 vsock_transport_cancel_pkt(vsk); 1142 1143 sock_put(sk); 1144 } 1145 1146 static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, 1147 int addr_len, int flags) 1148 { 1149 int err; 1150 struct sock *sk; 1151 struct vsock_sock *vsk; 1152 struct sockaddr_vm *remote_addr; 1153 long timeout; 1154 DEFINE_WAIT(wait); 1155 1156 err = 0; 1157 sk = sock->sk; 1158 vsk = vsock_sk(sk); 1159 1160 lock_sock(sk); 1161 1162 /* XXX AF_UNSPEC should make us disconnect like AF_INET. */ 1163 switch (sock->state) { 1164 case SS_CONNECTED: 1165 err = -EISCONN; 1166 goto out; 1167 case SS_DISCONNECTING: 1168 err = -EINVAL; 1169 goto out; 1170 case SS_CONNECTING: 1171 /* This continues on so we can move sock into the SS_CONNECTED 1172 * state once the connection has completed (at which point err 1173 * will be set to zero also). Otherwise, we will either wait 1174 * for the connection or return -EALREADY should this be a 1175 * non-blocking call. 1176 */ 1177 err = -EALREADY; 1178 break; 1179 default: 1180 if ((sk->sk_state == TCP_LISTEN) || 1181 vsock_addr_cast(addr, addr_len, &remote_addr) != 0) { 1182 err = -EINVAL; 1183 goto out; 1184 } 1185 1186 /* The hypervisor and well-known contexts do not have socket 1187 * endpoints. 1188 */ 1189 if (!transport->stream_allow(remote_addr->svm_cid, 1190 remote_addr->svm_port)) { 1191 err = -ENETUNREACH; 1192 goto out; 1193 } 1194 1195 /* Set the remote address that we are connecting to. */ 1196 memcpy(&vsk->remote_addr, remote_addr, 1197 sizeof(vsk->remote_addr)); 1198 1199 err = vsock_auto_bind(vsk); 1200 if (err) 1201 goto out; 1202 1203 sk->sk_state = TCP_SYN_SENT; 1204 1205 err = transport->connect(vsk); 1206 if (err < 0) 1207 goto out; 1208 1209 /* Mark sock as connecting and set the error code to in 1210 * progress in case this is a non-blocking connect. 1211 */ 1212 sock->state = SS_CONNECTING; 1213 err = -EINPROGRESS; 1214 } 1215 1216 /* The receive path will handle all communication until we are able to 1217 * enter the connected state. Here we wait for the connection to be 1218 * completed or a notification of an error. 1219 */ 1220 timeout = vsk->connect_timeout; 1221 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 1222 1223 while (sk->sk_state != TCP_ESTABLISHED && sk->sk_err == 0) { 1224 if (flags & O_NONBLOCK) { 1225 /* If we're not going to block, we schedule a timeout 1226 * function to generate a timeout on the connection 1227 * attempt, in case the peer doesn't respond in a 1228 * timely manner. We hold on to the socket until the 1229 * timeout fires. 1230 */ 1231 sock_hold(sk); 1232 schedule_delayed_work(&vsk->connect_work, timeout); 1233 1234 /* Skip ahead to preserve error code set above. */ 1235 goto out_wait; 1236 } 1237 1238 release_sock(sk); 1239 timeout = schedule_timeout(timeout); 1240 lock_sock(sk); 1241 1242 if (signal_pending(current)) { 1243 err = sock_intr_errno(timeout); 1244 sk->sk_state = TCP_CLOSE; 1245 sock->state = SS_UNCONNECTED; 1246 vsock_transport_cancel_pkt(vsk); 1247 goto out_wait; 1248 } else if (timeout == 0) { 1249 err = -ETIMEDOUT; 1250 sk->sk_state = TCP_CLOSE; 1251 sock->state = SS_UNCONNECTED; 1252 vsock_transport_cancel_pkt(vsk); 1253 goto out_wait; 1254 } 1255 1256 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 1257 } 1258 1259 if (sk->sk_err) { 1260 err = -sk->sk_err; 1261 sk->sk_state = TCP_CLOSE; 1262 sock->state = SS_UNCONNECTED; 1263 } else { 1264 err = 0; 1265 } 1266 1267 out_wait: 1268 finish_wait(sk_sleep(sk), &wait); 1269 out: 1270 release_sock(sk); 1271 return err; 1272 } 1273 1274 static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, 1275 bool kern) 1276 { 1277 struct sock *listener; 1278 int err; 1279 struct sock *connected; 1280 struct vsock_sock *vconnected; 1281 long timeout; 1282 DEFINE_WAIT(wait); 1283 1284 err = 0; 1285 listener = sock->sk; 1286 1287 lock_sock(listener); 1288 1289 if (sock->type != SOCK_STREAM) { 1290 err = -EOPNOTSUPP; 1291 goto out; 1292 } 1293 1294 if (listener->sk_state != TCP_LISTEN) { 1295 err = -EINVAL; 1296 goto out; 1297 } 1298 1299 /* Wait for children sockets to appear; these are the new sockets 1300 * created upon connection establishment. 1301 */ 1302 timeout = sock_sndtimeo(listener, flags & O_NONBLOCK); 1303 prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); 1304 1305 while ((connected = vsock_dequeue_accept(listener)) == NULL && 1306 listener->sk_err == 0) { 1307 release_sock(listener); 1308 timeout = schedule_timeout(timeout); 1309 finish_wait(sk_sleep(listener), &wait); 1310 lock_sock(listener); 1311 1312 if (signal_pending(current)) { 1313 err = sock_intr_errno(timeout); 1314 goto out; 1315 } else if (timeout == 0) { 1316 err = -EAGAIN; 1317 goto out; 1318 } 1319 1320 prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); 1321 } 1322 finish_wait(sk_sleep(listener), &wait); 1323 1324 if (listener->sk_err) 1325 err = -listener->sk_err; 1326 1327 if (connected) { 1328 listener->sk_ack_backlog--; 1329 1330 lock_sock_nested(connected, SINGLE_DEPTH_NESTING); 1331 vconnected = vsock_sk(connected); 1332 1333 /* If the listener socket has received an error, then we should 1334 * reject this socket and return. Note that we simply mark the 1335 * socket rejected, drop our reference, and let the cleanup 1336 * function handle the cleanup; the fact that we found it in 1337 * the listener's accept queue guarantees that the cleanup 1338 * function hasn't run yet. 1339 */ 1340 if (err) { 1341 vconnected->rejected = true; 1342 } else { 1343 newsock->state = SS_CONNECTED; 1344 sock_graft(connected, newsock); 1345 } 1346 1347 release_sock(connected); 1348 sock_put(connected); 1349 } 1350 1351 out: 1352 release_sock(listener); 1353 return err; 1354 } 1355 1356 static int vsock_listen(struct socket *sock, int backlog) 1357 { 1358 int err; 1359 struct sock *sk; 1360 struct vsock_sock *vsk; 1361 1362 sk = sock->sk; 1363 1364 lock_sock(sk); 1365 1366 if (sock->type != SOCK_STREAM) { 1367 err = -EOPNOTSUPP; 1368 goto out; 1369 } 1370 1371 if (sock->state != SS_UNCONNECTED) { 1372 err = -EINVAL; 1373 goto out; 1374 } 1375 1376 vsk = vsock_sk(sk); 1377 1378 if (!vsock_addr_bound(&vsk->local_addr)) { 1379 err = -EINVAL; 1380 goto out; 1381 } 1382 1383 sk->sk_max_ack_backlog = backlog; 1384 sk->sk_state = TCP_LISTEN; 1385 1386 err = 0; 1387 1388 out: 1389 release_sock(sk); 1390 return err; 1391 } 1392 1393 static int vsock_stream_setsockopt(struct socket *sock, 1394 int level, 1395 int optname, 1396 char __user *optval, 1397 unsigned int optlen) 1398 { 1399 int err; 1400 struct sock *sk; 1401 struct vsock_sock *vsk; 1402 u64 val; 1403 1404 if (level != AF_VSOCK) 1405 return -ENOPROTOOPT; 1406 1407 #define COPY_IN(_v) \ 1408 do { \ 1409 if (optlen < sizeof(_v)) { \ 1410 err = -EINVAL; \ 1411 goto exit; \ 1412 } \ 1413 if (copy_from_user(&_v, optval, sizeof(_v)) != 0) { \ 1414 err = -EFAULT; \ 1415 goto exit; \ 1416 } \ 1417 } while (0) 1418 1419 err = 0; 1420 sk = sock->sk; 1421 vsk = vsock_sk(sk); 1422 1423 lock_sock(sk); 1424 1425 switch (optname) { 1426 case SO_VM_SOCKETS_BUFFER_SIZE: 1427 COPY_IN(val); 1428 transport->set_buffer_size(vsk, val); 1429 break; 1430 1431 case SO_VM_SOCKETS_BUFFER_MAX_SIZE: 1432 COPY_IN(val); 1433 transport->set_max_buffer_size(vsk, val); 1434 break; 1435 1436 case SO_VM_SOCKETS_BUFFER_MIN_SIZE: 1437 COPY_IN(val); 1438 transport->set_min_buffer_size(vsk, val); 1439 break; 1440 1441 case SO_VM_SOCKETS_CONNECT_TIMEOUT: { 1442 struct timeval tv; 1443 COPY_IN(tv); 1444 if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC && 1445 tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) { 1446 vsk->connect_timeout = tv.tv_sec * HZ + 1447 DIV_ROUND_UP(tv.tv_usec, (1000000 / HZ)); 1448 if (vsk->connect_timeout == 0) 1449 vsk->connect_timeout = 1450 VSOCK_DEFAULT_CONNECT_TIMEOUT; 1451 1452 } else { 1453 err = -ERANGE; 1454 } 1455 break; 1456 } 1457 1458 default: 1459 err = -ENOPROTOOPT; 1460 break; 1461 } 1462 1463 #undef COPY_IN 1464 1465 exit: 1466 release_sock(sk); 1467 return err; 1468 } 1469 1470 static int vsock_stream_getsockopt(struct socket *sock, 1471 int level, int optname, 1472 char __user *optval, 1473 int __user *optlen) 1474 { 1475 int err; 1476 int len; 1477 struct sock *sk; 1478 struct vsock_sock *vsk; 1479 u64 val; 1480 1481 if (level != AF_VSOCK) 1482 return -ENOPROTOOPT; 1483 1484 err = get_user(len, optlen); 1485 if (err != 0) 1486 return err; 1487 1488 #define COPY_OUT(_v) \ 1489 do { \ 1490 if (len < sizeof(_v)) \ 1491 return -EINVAL; \ 1492 \ 1493 len = sizeof(_v); \ 1494 if (copy_to_user(optval, &_v, len) != 0) \ 1495 return -EFAULT; \ 1496 \ 1497 } while (0) 1498 1499 err = 0; 1500 sk = sock->sk; 1501 vsk = vsock_sk(sk); 1502 1503 switch (optname) { 1504 case SO_VM_SOCKETS_BUFFER_SIZE: 1505 val = transport->get_buffer_size(vsk); 1506 COPY_OUT(val); 1507 break; 1508 1509 case SO_VM_SOCKETS_BUFFER_MAX_SIZE: 1510 val = transport->get_max_buffer_size(vsk); 1511 COPY_OUT(val); 1512 break; 1513 1514 case SO_VM_SOCKETS_BUFFER_MIN_SIZE: 1515 val = transport->get_min_buffer_size(vsk); 1516 COPY_OUT(val); 1517 break; 1518 1519 case SO_VM_SOCKETS_CONNECT_TIMEOUT: { 1520 struct timeval tv; 1521 tv.tv_sec = vsk->connect_timeout / HZ; 1522 tv.tv_usec = 1523 (vsk->connect_timeout - 1524 tv.tv_sec * HZ) * (1000000 / HZ); 1525 COPY_OUT(tv); 1526 break; 1527 } 1528 default: 1529 return -ENOPROTOOPT; 1530 } 1531 1532 err = put_user(len, optlen); 1533 if (err != 0) 1534 return -EFAULT; 1535 1536 #undef COPY_OUT 1537 1538 return 0; 1539 } 1540 1541 static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, 1542 size_t len) 1543 { 1544 struct sock *sk; 1545 struct vsock_sock *vsk; 1546 ssize_t total_written; 1547 long timeout; 1548 int err; 1549 struct vsock_transport_send_notify_data send_data; 1550 DEFINE_WAIT_FUNC(wait, woken_wake_function); 1551 1552 sk = sock->sk; 1553 vsk = vsock_sk(sk); 1554 total_written = 0; 1555 err = 0; 1556 1557 if (msg->msg_flags & MSG_OOB) 1558 return -EOPNOTSUPP; 1559 1560 lock_sock(sk); 1561 1562 /* Callers should not provide a destination with stream sockets. */ 1563 if (msg->msg_namelen) { 1564 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; 1565 goto out; 1566 } 1567 1568 /* Send data only if both sides are not shutdown in the direction. */ 1569 if (sk->sk_shutdown & SEND_SHUTDOWN || 1570 vsk->peer_shutdown & RCV_SHUTDOWN) { 1571 err = -EPIPE; 1572 goto out; 1573 } 1574 1575 if (sk->sk_state != TCP_ESTABLISHED || 1576 !vsock_addr_bound(&vsk->local_addr)) { 1577 err = -ENOTCONN; 1578 goto out; 1579 } 1580 1581 if (!vsock_addr_bound(&vsk->remote_addr)) { 1582 err = -EDESTADDRREQ; 1583 goto out; 1584 } 1585 1586 /* Wait for room in the produce queue to enqueue our user's data. */ 1587 timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 1588 1589 err = transport->notify_send_init(vsk, &send_data); 1590 if (err < 0) 1591 goto out; 1592 1593 while (total_written < len) { 1594 ssize_t written; 1595 1596 add_wait_queue(sk_sleep(sk), &wait); 1597 while (vsock_stream_has_space(vsk) == 0 && 1598 sk->sk_err == 0 && 1599 !(sk->sk_shutdown & SEND_SHUTDOWN) && 1600 !(vsk->peer_shutdown & RCV_SHUTDOWN)) { 1601 1602 /* Don't wait for non-blocking sockets. */ 1603 if (timeout == 0) { 1604 err = -EAGAIN; 1605 remove_wait_queue(sk_sleep(sk), &wait); 1606 goto out_err; 1607 } 1608 1609 err = transport->notify_send_pre_block(vsk, &send_data); 1610 if (err < 0) { 1611 remove_wait_queue(sk_sleep(sk), &wait); 1612 goto out_err; 1613 } 1614 1615 release_sock(sk); 1616 timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout); 1617 lock_sock(sk); 1618 if (signal_pending(current)) { 1619 err = sock_intr_errno(timeout); 1620 remove_wait_queue(sk_sleep(sk), &wait); 1621 goto out_err; 1622 } else if (timeout == 0) { 1623 err = -EAGAIN; 1624 remove_wait_queue(sk_sleep(sk), &wait); 1625 goto out_err; 1626 } 1627 } 1628 remove_wait_queue(sk_sleep(sk), &wait); 1629 1630 /* These checks occur both as part of and after the loop 1631 * conditional since we need to check before and after 1632 * sleeping. 1633 */ 1634 if (sk->sk_err) { 1635 err = -sk->sk_err; 1636 goto out_err; 1637 } else if ((sk->sk_shutdown & SEND_SHUTDOWN) || 1638 (vsk->peer_shutdown & RCV_SHUTDOWN)) { 1639 err = -EPIPE; 1640 goto out_err; 1641 } 1642 1643 err = transport->notify_send_pre_enqueue(vsk, &send_data); 1644 if (err < 0) 1645 goto out_err; 1646 1647 /* Note that enqueue will only write as many bytes as are free 1648 * in the produce queue, so we don't need to ensure len is 1649 * smaller than the queue size. It is the caller's 1650 * responsibility to check how many bytes we were able to send. 1651 */ 1652 1653 written = transport->stream_enqueue( 1654 vsk, msg, 1655 len - total_written); 1656 if (written < 0) { 1657 err = -ENOMEM; 1658 goto out_err; 1659 } 1660 1661 total_written += written; 1662 1663 err = transport->notify_send_post_enqueue( 1664 vsk, written, &send_data); 1665 if (err < 0) 1666 goto out_err; 1667 1668 } 1669 1670 out_err: 1671 if (total_written > 0) 1672 err = total_written; 1673 out: 1674 release_sock(sk); 1675 return err; 1676 } 1677 1678 1679 static int 1680 vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, 1681 int flags) 1682 { 1683 struct sock *sk; 1684 struct vsock_sock *vsk; 1685 int err; 1686 size_t target; 1687 ssize_t copied; 1688 long timeout; 1689 struct vsock_transport_recv_notify_data recv_data; 1690 1691 DEFINE_WAIT(wait); 1692 1693 sk = sock->sk; 1694 vsk = vsock_sk(sk); 1695 err = 0; 1696 1697 lock_sock(sk); 1698 1699 if (sk->sk_state != TCP_ESTABLISHED) { 1700 /* Recvmsg is supposed to return 0 if a peer performs an 1701 * orderly shutdown. Differentiate between that case and when a 1702 * peer has not connected or a local shutdown occured with the 1703 * SOCK_DONE flag. 1704 */ 1705 if (sock_flag(sk, SOCK_DONE)) 1706 err = 0; 1707 else 1708 err = -ENOTCONN; 1709 1710 goto out; 1711 } 1712 1713 if (flags & MSG_OOB) { 1714 err = -EOPNOTSUPP; 1715 goto out; 1716 } 1717 1718 /* We don't check peer_shutdown flag here since peer may actually shut 1719 * down, but there can be data in the queue that a local socket can 1720 * receive. 1721 */ 1722 if (sk->sk_shutdown & RCV_SHUTDOWN) { 1723 err = 0; 1724 goto out; 1725 } 1726 1727 /* It is valid on Linux to pass in a zero-length receive buffer. This 1728 * is not an error. We may as well bail out now. 1729 */ 1730 if (!len) { 1731 err = 0; 1732 goto out; 1733 } 1734 1735 /* We must not copy less than target bytes into the user's buffer 1736 * before returning successfully, so we wait for the consume queue to 1737 * have that much data to consume before dequeueing. Note that this 1738 * makes it impossible to handle cases where target is greater than the 1739 * queue size. 1740 */ 1741 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); 1742 if (target >= transport->stream_rcvhiwat(vsk)) { 1743 err = -ENOMEM; 1744 goto out; 1745 } 1746 timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 1747 copied = 0; 1748 1749 err = transport->notify_recv_init(vsk, target, &recv_data); 1750 if (err < 0) 1751 goto out; 1752 1753 1754 while (1) { 1755 s64 ready; 1756 1757 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 1758 ready = vsock_stream_has_data(vsk); 1759 1760 if (ready == 0) { 1761 if (sk->sk_err != 0 || 1762 (sk->sk_shutdown & RCV_SHUTDOWN) || 1763 (vsk->peer_shutdown & SEND_SHUTDOWN)) { 1764 finish_wait(sk_sleep(sk), &wait); 1765 break; 1766 } 1767 /* Don't wait for non-blocking sockets. */ 1768 if (timeout == 0) { 1769 err = -EAGAIN; 1770 finish_wait(sk_sleep(sk), &wait); 1771 break; 1772 } 1773 1774 err = transport->notify_recv_pre_block( 1775 vsk, target, &recv_data); 1776 if (err < 0) { 1777 finish_wait(sk_sleep(sk), &wait); 1778 break; 1779 } 1780 release_sock(sk); 1781 timeout = schedule_timeout(timeout); 1782 lock_sock(sk); 1783 1784 if (signal_pending(current)) { 1785 err = sock_intr_errno(timeout); 1786 finish_wait(sk_sleep(sk), &wait); 1787 break; 1788 } else if (timeout == 0) { 1789 err = -EAGAIN; 1790 finish_wait(sk_sleep(sk), &wait); 1791 break; 1792 } 1793 } else { 1794 ssize_t read; 1795 1796 finish_wait(sk_sleep(sk), &wait); 1797 1798 if (ready < 0) { 1799 /* Invalid queue pair content. XXX This should 1800 * be changed to a connection reset in a later 1801 * change. 1802 */ 1803 1804 err = -ENOMEM; 1805 goto out; 1806 } 1807 1808 err = transport->notify_recv_pre_dequeue( 1809 vsk, target, &recv_data); 1810 if (err < 0) 1811 break; 1812 1813 read = transport->stream_dequeue( 1814 vsk, msg, 1815 len - copied, flags); 1816 if (read < 0) { 1817 err = -ENOMEM; 1818 break; 1819 } 1820 1821 copied += read; 1822 1823 err = transport->notify_recv_post_dequeue( 1824 vsk, target, read, 1825 !(flags & MSG_PEEK), &recv_data); 1826 if (err < 0) 1827 goto out; 1828 1829 if (read >= target || flags & MSG_PEEK) 1830 break; 1831 1832 target -= read; 1833 } 1834 } 1835 1836 if (sk->sk_err) 1837 err = -sk->sk_err; 1838 else if (sk->sk_shutdown & RCV_SHUTDOWN) 1839 err = 0; 1840 1841 if (copied > 0) 1842 err = copied; 1843 1844 out: 1845 release_sock(sk); 1846 return err; 1847 } 1848 1849 static const struct proto_ops vsock_stream_ops = { 1850 .family = PF_VSOCK, 1851 .owner = THIS_MODULE, 1852 .release = vsock_release, 1853 .bind = vsock_bind, 1854 .connect = vsock_stream_connect, 1855 .socketpair = sock_no_socketpair, 1856 .accept = vsock_accept, 1857 .getname = vsock_getname, 1858 .poll = vsock_poll, 1859 .ioctl = sock_no_ioctl, 1860 .listen = vsock_listen, 1861 .shutdown = vsock_shutdown, 1862 .setsockopt = vsock_stream_setsockopt, 1863 .getsockopt = vsock_stream_getsockopt, 1864 .sendmsg = vsock_stream_sendmsg, 1865 .recvmsg = vsock_stream_recvmsg, 1866 .mmap = sock_no_mmap, 1867 .sendpage = sock_no_sendpage, 1868 }; 1869 1870 static int vsock_create(struct net *net, struct socket *sock, 1871 int protocol, int kern) 1872 { 1873 if (!sock) 1874 return -EINVAL; 1875 1876 if (protocol && protocol != PF_VSOCK) 1877 return -EPROTONOSUPPORT; 1878 1879 switch (sock->type) { 1880 case SOCK_DGRAM: 1881 sock->ops = &vsock_dgram_ops; 1882 break; 1883 case SOCK_STREAM: 1884 sock->ops = &vsock_stream_ops; 1885 break; 1886 default: 1887 return -ESOCKTNOSUPPORT; 1888 } 1889 1890 sock->state = SS_UNCONNECTED; 1891 1892 return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ? 0 : -ENOMEM; 1893 } 1894 1895 static const struct net_proto_family vsock_family_ops = { 1896 .family = AF_VSOCK, 1897 .create = vsock_create, 1898 .owner = THIS_MODULE, 1899 }; 1900 1901 static long vsock_dev_do_ioctl(struct file *filp, 1902 unsigned int cmd, void __user *ptr) 1903 { 1904 u32 __user *p = ptr; 1905 int retval = 0; 1906 1907 switch (cmd) { 1908 case IOCTL_VM_SOCKETS_GET_LOCAL_CID: 1909 if (put_user(transport->get_local_cid(), p) != 0) 1910 retval = -EFAULT; 1911 break; 1912 1913 default: 1914 pr_err("Unknown ioctl %d\n", cmd); 1915 retval = -EINVAL; 1916 } 1917 1918 return retval; 1919 } 1920 1921 static long vsock_dev_ioctl(struct file *filp, 1922 unsigned int cmd, unsigned long arg) 1923 { 1924 return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg); 1925 } 1926 1927 #ifdef CONFIG_COMPAT 1928 static long vsock_dev_compat_ioctl(struct file *filp, 1929 unsigned int cmd, unsigned long arg) 1930 { 1931 return vsock_dev_do_ioctl(filp, cmd, compat_ptr(arg)); 1932 } 1933 #endif 1934 1935 static const struct file_operations vsock_device_ops = { 1936 .owner = THIS_MODULE, 1937 .unlocked_ioctl = vsock_dev_ioctl, 1938 #ifdef CONFIG_COMPAT 1939 .compat_ioctl = vsock_dev_compat_ioctl, 1940 #endif 1941 .open = nonseekable_open, 1942 }; 1943 1944 static struct miscdevice vsock_device = { 1945 .name = "vsock", 1946 .fops = &vsock_device_ops, 1947 }; 1948 1949 int __vsock_core_init(const struct vsock_transport *t, struct module *owner) 1950 { 1951 int err = mutex_lock_interruptible(&vsock_register_mutex); 1952 1953 if (err) 1954 return err; 1955 1956 if (transport) { 1957 err = -EBUSY; 1958 goto err_busy; 1959 } 1960 1961 /* Transport must be the owner of the protocol so that it can't 1962 * unload while there are open sockets. 1963 */ 1964 vsock_proto.owner = owner; 1965 transport = t; 1966 1967 vsock_device.minor = MISC_DYNAMIC_MINOR; 1968 err = misc_register(&vsock_device); 1969 if (err) { 1970 pr_err("Failed to register misc device\n"); 1971 goto err_reset_transport; 1972 } 1973 1974 err = proto_register(&vsock_proto, 1); /* we want our slab */ 1975 if (err) { 1976 pr_err("Cannot register vsock protocol\n"); 1977 goto err_deregister_misc; 1978 } 1979 1980 err = sock_register(&vsock_family_ops); 1981 if (err) { 1982 pr_err("could not register af_vsock (%d) address family: %d\n", 1983 AF_VSOCK, err); 1984 goto err_unregister_proto; 1985 } 1986 1987 mutex_unlock(&vsock_register_mutex); 1988 return 0; 1989 1990 err_unregister_proto: 1991 proto_unregister(&vsock_proto); 1992 err_deregister_misc: 1993 misc_deregister(&vsock_device); 1994 err_reset_transport: 1995 transport = NULL; 1996 err_busy: 1997 mutex_unlock(&vsock_register_mutex); 1998 return err; 1999 } 2000 EXPORT_SYMBOL_GPL(__vsock_core_init); 2001 2002 void vsock_core_exit(void) 2003 { 2004 mutex_lock(&vsock_register_mutex); 2005 2006 misc_deregister(&vsock_device); 2007 sock_unregister(AF_VSOCK); 2008 proto_unregister(&vsock_proto); 2009 2010 /* We do not want the assignment below re-ordered. */ 2011 mb(); 2012 transport = NULL; 2013 2014 mutex_unlock(&vsock_register_mutex); 2015 } 2016 EXPORT_SYMBOL_GPL(vsock_core_exit); 2017 2018 const struct vsock_transport *vsock_core_get_transport(void) 2019 { 2020 /* vsock_register_mutex not taken since only the transport uses this 2021 * function and only while registered. 2022 */ 2023 return transport; 2024 } 2025 EXPORT_SYMBOL_GPL(vsock_core_get_transport); 2026 2027 static void __exit vsock_exit(void) 2028 { 2029 /* Do nothing. This function makes this module removable. */ 2030 } 2031 2032 module_init(vsock_init_tables); 2033 module_exit(vsock_exit); 2034 2035 MODULE_AUTHOR("VMware, Inc."); 2036 MODULE_DESCRIPTION("VMware Virtual Socket Family"); 2037 MODULE_VERSION("1.0.2.0-k"); 2038 MODULE_LICENSE("GPL v2"); 2039