1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2012-2014 Cisco Systems 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a copy 8 * of this software and associated documentation files (the "Software"), to deal 9 * in the Software without restriction, including without limitation the rights 10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 * copies of the Software, and to permit persons to whom the Software is 12 * furnished to do so, subject to the following conditions: 13 * 14 * The above copyright notice and this permission notice shall be included in 15 * all copies or substantial portions of the Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 * THE SOFTWARE. 24 */ 25 26 #include "qemu/osdep.h" 27 #include <linux/ip.h> 28 #include <netdb.h> 29 #include "net/net.h" 30 #include "clients.h" 31 #include "qapi/error.h" 32 #include "qemu/error-report.h" 33 #include "qemu/option.h" 34 #include "qemu/sockets.h" 35 #include "qemu/iov.h" 36 #include "qemu/main-loop.h" 37 38 39 /* The buffer size needs to be investigated for optimum numbers and 40 * optimum means of paging in on different systems. This size is 41 * chosen to be sufficient to accommodate one packet with some headers 42 */ 43 44 #define BUFFER_ALIGN sysconf(_SC_PAGESIZE) 45 #define BUFFER_SIZE 2048 46 #define IOVSIZE 2 47 #define MAX_L2TPV3_MSGCNT 64 48 #define MAX_L2TPV3_IOVCNT (MAX_L2TPV3_MSGCNT * IOVSIZE) 49 50 /* Header set to 0x30000 signifies a data packet */ 51 52 #define L2TPV3_DATA_PACKET 0x30000 53 54 /* IANA-assigned IP protocol ID for L2TPv3 */ 55 56 #ifndef IPPROTO_L2TP 57 #define IPPROTO_L2TP 0x73 58 #endif 59 60 typedef struct NetL2TPV3State { 61 NetClientState nc; 62 int fd; 63 64 /* 65 * these are used for xmit - that happens packet a time 66 * and for first sign of life packet (easier to parse that once) 67 */ 68 69 uint8_t *header_buf; 70 struct iovec *vec; 71 72 /* 73 * these are used for receive - try to "eat" up to 32 packets at a time 74 */ 75 76 struct mmsghdr *msgvec; 77 78 /* 79 * peer address 80 */ 81 82 struct sockaddr_storage *dgram_dst; 83 uint32_t dst_size; 84 85 /* 86 * L2TPv3 parameters 87 */ 88 89 uint64_t rx_cookie; 90 uint64_t tx_cookie; 91 uint32_t rx_session; 92 uint32_t tx_session; 93 uint32_t header_size; 94 uint32_t counter; 95 96 /* 97 * DOS avoidance in error handling 98 */ 99 100 bool header_mismatch; 101 102 /* 103 * Ring buffer handling 104 */ 105 106 int queue_head; 107 int queue_tail; 108 int queue_depth; 109 110 /* 111 * Precomputed offsets 112 */ 113 114 uint32_t offset; 115 uint32_t cookie_offset; 116 uint32_t counter_offset; 117 uint32_t session_offset; 118 119 /* Poll Control */ 120 121 bool read_poll; 122 bool write_poll; 123 124 /* Flags */ 125 126 bool ipv6; 127 bool udp; 128 bool has_counter; 129 bool pin_counter; 130 bool cookie; 131 bool cookie_is_64; 132 133 } NetL2TPV3State; 134 135 static void net_l2tpv3_send(void *opaque); 136 static void l2tpv3_writable(void *opaque); 137 138 static void l2tpv3_update_fd_handler(NetL2TPV3State *s) 139 { 140 qemu_set_fd_handler(s->fd, 141 s->read_poll ? net_l2tpv3_send : NULL, 142 s->write_poll ? l2tpv3_writable : NULL, 143 s); 144 } 145 146 static void l2tpv3_read_poll(NetL2TPV3State *s, bool enable) 147 { 148 if (s->read_poll != enable) { 149 s->read_poll = enable; 150 l2tpv3_update_fd_handler(s); 151 } 152 } 153 154 static void l2tpv3_write_poll(NetL2TPV3State *s, bool enable) 155 { 156 if (s->write_poll != enable) { 157 s->write_poll = enable; 158 l2tpv3_update_fd_handler(s); 159 } 160 } 161 162 static void l2tpv3_writable(void *opaque) 163 { 164 NetL2TPV3State *s = opaque; 165 l2tpv3_write_poll(s, false); 166 qemu_flush_queued_packets(&s->nc); 167 } 168 169 static void l2tpv3_send_completed(NetClientState *nc, ssize_t len) 170 { 171 NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc); 172 l2tpv3_read_poll(s, true); 173 } 174 175 static void l2tpv3_poll(NetClientState *nc, bool enable) 176 { 177 NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc); 178 l2tpv3_write_poll(s, enable); 179 l2tpv3_read_poll(s, enable); 180 } 181 182 static void l2tpv3_form_header(NetL2TPV3State *s) 183 { 184 uint32_t *counter; 185 186 if (s->udp) { 187 stl_be_p((uint32_t *) s->header_buf, L2TPV3_DATA_PACKET); 188 } 189 stl_be_p( 190 (uint32_t *) (s->header_buf + s->session_offset), 191 s->tx_session 192 ); 193 if (s->cookie) { 194 if (s->cookie_is_64) { 195 stq_be_p( 196 (uint64_t *)(s->header_buf + s->cookie_offset), 197 s->tx_cookie 198 ); 199 } else { 200 stl_be_p( 201 (uint32_t *) (s->header_buf + s->cookie_offset), 202 s->tx_cookie 203 ); 204 } 205 } 206 if (s->has_counter) { 207 counter = (uint32_t *)(s->header_buf + s->counter_offset); 208 if (s->pin_counter) { 209 *counter = 0; 210 } else { 211 stl_be_p(counter, ++s->counter); 212 } 213 } 214 } 215 216 static ssize_t net_l2tpv3_receive_dgram_iov(NetClientState *nc, 217 const struct iovec *iov, 218 int iovcnt) 219 { 220 NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc); 221 222 struct msghdr message; 223 int ret; 224 225 if (iovcnt > MAX_L2TPV3_IOVCNT - 1) { 226 error_report( 227 "iovec too long %d > %d, change l2tpv3.h", 228 iovcnt, MAX_L2TPV3_IOVCNT 229 ); 230 return -1; 231 } 232 l2tpv3_form_header(s); 233 memcpy(s->vec + 1, iov, iovcnt * sizeof(struct iovec)); 234 s->vec->iov_base = s->header_buf; 235 s->vec->iov_len = s->offset; 236 message.msg_name = s->dgram_dst; 237 message.msg_namelen = s->dst_size; 238 message.msg_iov = s->vec; 239 message.msg_iovlen = iovcnt + 1; 240 message.msg_control = NULL; 241 message.msg_controllen = 0; 242 message.msg_flags = 0; 243 do { 244 ret = sendmsg(s->fd, &message, 0); 245 } while ((ret == -1) && (errno == EINTR)); 246 if (ret > 0) { 247 ret -= s->offset; 248 } else if (ret == 0) { 249 /* belt and braces - should not occur on DGRAM 250 * we should get an error and never a 0 send 251 */ 252 ret = iov_size(iov, iovcnt); 253 } else { 254 /* signal upper layer that socket buffer is full */ 255 ret = -errno; 256 if (ret == -EAGAIN || ret == -ENOBUFS) { 257 l2tpv3_write_poll(s, true); 258 ret = 0; 259 } 260 } 261 return ret; 262 } 263 264 static ssize_t net_l2tpv3_receive_dgram(NetClientState *nc, 265 const uint8_t *buf, 266 size_t size) 267 { 268 NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc); 269 270 struct iovec *vec; 271 struct msghdr message; 272 ssize_t ret = 0; 273 274 l2tpv3_form_header(s); 275 vec = s->vec; 276 vec->iov_base = s->header_buf; 277 vec->iov_len = s->offset; 278 vec++; 279 vec->iov_base = (void *) buf; 280 vec->iov_len = size; 281 message.msg_name = s->dgram_dst; 282 message.msg_namelen = s->dst_size; 283 message.msg_iov = s->vec; 284 message.msg_iovlen = 2; 285 message.msg_control = NULL; 286 message.msg_controllen = 0; 287 message.msg_flags = 0; 288 do { 289 ret = sendmsg(s->fd, &message, 0); 290 } while ((ret == -1) && (errno == EINTR)); 291 if (ret > 0) { 292 ret -= s->offset; 293 } else if (ret == 0) { 294 /* belt and braces - should not occur on DGRAM 295 * we should get an error and never a 0 send 296 */ 297 ret = size; 298 } else { 299 ret = -errno; 300 if (ret == -EAGAIN || ret == -ENOBUFS) { 301 /* signal upper layer that socket buffer is full */ 302 l2tpv3_write_poll(s, true); 303 ret = 0; 304 } 305 } 306 return ret; 307 } 308 309 static int l2tpv3_verify_header(NetL2TPV3State *s, uint8_t *buf) 310 { 311 312 uint32_t *session; 313 uint64_t cookie; 314 315 if ((!s->udp) && (!s->ipv6)) { 316 buf += sizeof(struct iphdr) /* fix for ipv4 raw */; 317 } 318 319 /* we do not do a strict check for "data" packets as per 320 * the RFC spec because the pure IP spec does not have 321 * that anyway. 322 */ 323 324 if (s->cookie) { 325 if (s->cookie_is_64) { 326 cookie = ldq_be_p(buf + s->cookie_offset); 327 } else { 328 cookie = ldl_be_p(buf + s->cookie_offset) & 0xffffffffULL; 329 } 330 if (cookie != s->rx_cookie) { 331 if (!s->header_mismatch) { 332 error_report("unknown cookie id"); 333 } 334 return -1; 335 } 336 } 337 session = (uint32_t *) (buf + s->session_offset); 338 if (ldl_be_p(session) != s->rx_session) { 339 if (!s->header_mismatch) { 340 error_report("session mismatch"); 341 } 342 return -1; 343 } 344 return 0; 345 } 346 347 static void net_l2tpv3_process_queue(NetL2TPV3State *s) 348 { 349 int size = 0; 350 struct iovec *vec; 351 bool bad_read; 352 int data_size; 353 struct mmsghdr *msgvec; 354 355 /* go into ring mode only if there is a "pending" tail */ 356 if (s->queue_depth > 0) { 357 do { 358 msgvec = s->msgvec + s->queue_tail; 359 if (msgvec->msg_len > 0) { 360 data_size = msgvec->msg_len - s->header_size; 361 vec = msgvec->msg_hdr.msg_iov; 362 if ((data_size > 0) && 363 (l2tpv3_verify_header(s, vec->iov_base) == 0)) { 364 vec++; 365 /* Use the legacy delivery for now, we will 366 * switch to using our own ring as a queueing mechanism 367 * at a later date 368 */ 369 size = qemu_send_packet_async( 370 &s->nc, 371 vec->iov_base, 372 data_size, 373 l2tpv3_send_completed 374 ); 375 if (size == 0) { 376 l2tpv3_read_poll(s, false); 377 } 378 bad_read = false; 379 } else { 380 bad_read = true; 381 if (!s->header_mismatch) { 382 /* report error only once */ 383 error_report("l2tpv3 header verification failed"); 384 s->header_mismatch = true; 385 } 386 } 387 } else { 388 bad_read = true; 389 } 390 s->queue_tail = (s->queue_tail + 1) % MAX_L2TPV3_MSGCNT; 391 s->queue_depth--; 392 } while ( 393 (s->queue_depth > 0) && 394 qemu_can_send_packet(&s->nc) && 395 ((size > 0) || bad_read) 396 ); 397 } 398 } 399 400 static void net_l2tpv3_send(void *opaque) 401 { 402 NetL2TPV3State *s = opaque; 403 int target_count, count; 404 struct mmsghdr *msgvec; 405 406 /* go into ring mode only if there is a "pending" tail */ 407 408 if (s->queue_depth) { 409 410 /* The ring buffer we use has variable intake 411 * count of how much we can read varies - adjust accordingly 412 */ 413 414 target_count = MAX_L2TPV3_MSGCNT - s->queue_depth; 415 416 /* Ensure we do not overrun the ring when we have 417 * a lot of enqueued packets 418 */ 419 420 if (s->queue_head + target_count > MAX_L2TPV3_MSGCNT) { 421 target_count = MAX_L2TPV3_MSGCNT - s->queue_head; 422 } 423 } else { 424 425 /* we do not have any pending packets - we can use 426 * the whole message vector linearly instead of using 427 * it as a ring 428 */ 429 430 s->queue_head = 0; 431 s->queue_tail = 0; 432 target_count = MAX_L2TPV3_MSGCNT; 433 } 434 435 msgvec = s->msgvec + s->queue_head; 436 if (target_count > 0) { 437 do { 438 count = recvmmsg( 439 s->fd, 440 msgvec, 441 target_count, MSG_DONTWAIT, NULL); 442 } while ((count == -1) && (errno == EINTR)); 443 if (count < 0) { 444 /* Recv error - we still need to flush packets here, 445 * (re)set queue head to current position 446 */ 447 count = 0; 448 } 449 s->queue_head = (s->queue_head + count) % MAX_L2TPV3_MSGCNT; 450 s->queue_depth += count; 451 } 452 net_l2tpv3_process_queue(s); 453 } 454 455 static void destroy_vector(struct mmsghdr *msgvec, int count, int iovcount) 456 { 457 int i, j; 458 struct iovec *iov; 459 struct mmsghdr *cleanup = msgvec; 460 if (cleanup) { 461 for (i = 0; i < count; i++) { 462 if (cleanup->msg_hdr.msg_iov) { 463 iov = cleanup->msg_hdr.msg_iov; 464 for (j = 0; j < iovcount; j++) { 465 g_free(iov->iov_base); 466 iov++; 467 } 468 g_free(cleanup->msg_hdr.msg_iov); 469 } 470 cleanup++; 471 } 472 g_free(msgvec); 473 } 474 } 475 476 static struct mmsghdr *build_l2tpv3_vector(NetL2TPV3State *s, int count) 477 { 478 int i; 479 struct iovec *iov; 480 struct mmsghdr *msgvec, *result; 481 482 msgvec = g_new(struct mmsghdr, count); 483 result = msgvec; 484 for (i = 0; i < count ; i++) { 485 msgvec->msg_hdr.msg_name = NULL; 486 msgvec->msg_hdr.msg_namelen = 0; 487 iov = g_new(struct iovec, IOVSIZE); 488 msgvec->msg_hdr.msg_iov = iov; 489 iov->iov_base = g_malloc(s->header_size); 490 iov->iov_len = s->header_size; 491 iov++ ; 492 iov->iov_base = qemu_memalign(BUFFER_ALIGN, BUFFER_SIZE); 493 iov->iov_len = BUFFER_SIZE; 494 msgvec->msg_hdr.msg_iovlen = 2; 495 msgvec->msg_hdr.msg_control = NULL; 496 msgvec->msg_hdr.msg_controllen = 0; 497 msgvec->msg_hdr.msg_flags = 0; 498 msgvec++; 499 } 500 return result; 501 } 502 503 static void net_l2tpv3_cleanup(NetClientState *nc) 504 { 505 NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc); 506 qemu_purge_queued_packets(nc); 507 l2tpv3_read_poll(s, false); 508 l2tpv3_write_poll(s, false); 509 if (s->fd >= 0) { 510 close(s->fd); 511 } 512 destroy_vector(s->msgvec, MAX_L2TPV3_MSGCNT, IOVSIZE); 513 g_free(s->vec); 514 g_free(s->header_buf); 515 g_free(s->dgram_dst); 516 } 517 518 static NetClientInfo net_l2tpv3_info = { 519 .type = NET_CLIENT_DRIVER_L2TPV3, 520 .size = sizeof(NetL2TPV3State), 521 .receive = net_l2tpv3_receive_dgram, 522 .receive_iov = net_l2tpv3_receive_dgram_iov, 523 .poll = l2tpv3_poll, 524 .cleanup = net_l2tpv3_cleanup, 525 }; 526 527 int net_init_l2tpv3(const Netdev *netdev, 528 const char *name, 529 NetClientState *peer, Error **errp) 530 { 531 const NetdevL2TPv3Options *l2tpv3; 532 NetL2TPV3State *s; 533 NetClientState *nc; 534 int fd = -1, gairet; 535 struct addrinfo hints; 536 struct addrinfo *result = NULL; 537 char *srcport, *dstport; 538 539 nc = qemu_new_net_client(&net_l2tpv3_info, peer, "l2tpv3", name); 540 541 s = DO_UPCAST(NetL2TPV3State, nc, nc); 542 543 s->queue_head = 0; 544 s->queue_tail = 0; 545 s->header_mismatch = false; 546 547 assert(netdev->type == NET_CLIENT_DRIVER_L2TPV3); 548 l2tpv3 = &netdev->u.l2tpv3; 549 550 if (l2tpv3->has_ipv6 && l2tpv3->ipv6) { 551 s->ipv6 = l2tpv3->ipv6; 552 } else { 553 s->ipv6 = false; 554 } 555 556 if ((l2tpv3->has_offset) && (l2tpv3->offset > 256)) { 557 error_setg(errp, "offset must be less than 256 bytes"); 558 goto outerr; 559 } 560 561 if (l2tpv3->has_rxcookie || l2tpv3->has_txcookie) { 562 if (l2tpv3->has_rxcookie && l2tpv3->has_txcookie) { 563 s->cookie = true; 564 } else { 565 error_setg(errp, 566 "require both 'rxcookie' and 'txcookie' or neither"); 567 goto outerr; 568 } 569 } else { 570 s->cookie = false; 571 } 572 573 if (l2tpv3->has_cookie64 || l2tpv3->cookie64) { 574 s->cookie_is_64 = true; 575 } else { 576 s->cookie_is_64 = false; 577 } 578 579 if (l2tpv3->has_udp && l2tpv3->udp) { 580 s->udp = true; 581 if (!(l2tpv3->has_srcport && l2tpv3->has_dstport)) { 582 error_setg(errp, "need both src and dst port for udp"); 583 goto outerr; 584 } else { 585 srcport = l2tpv3->srcport; 586 dstport = l2tpv3->dstport; 587 } 588 } else { 589 s->udp = false; 590 srcport = NULL; 591 dstport = NULL; 592 } 593 594 595 s->offset = 4; 596 s->session_offset = 0; 597 s->cookie_offset = 4; 598 s->counter_offset = 4; 599 600 s->tx_session = l2tpv3->txsession; 601 if (l2tpv3->has_rxsession) { 602 s->rx_session = l2tpv3->rxsession; 603 } else { 604 s->rx_session = s->tx_session; 605 } 606 607 if (s->cookie) { 608 s->rx_cookie = l2tpv3->rxcookie; 609 s->tx_cookie = l2tpv3->txcookie; 610 if (s->cookie_is_64 == true) { 611 /* 64 bit cookie */ 612 s->offset += 8; 613 s->counter_offset += 8; 614 } else { 615 /* 32 bit cookie */ 616 s->offset += 4; 617 s->counter_offset += 4; 618 } 619 } 620 621 memset(&hints, 0, sizeof(hints)); 622 623 if (s->ipv6) { 624 hints.ai_family = AF_INET6; 625 } else { 626 hints.ai_family = AF_INET; 627 } 628 if (s->udp) { 629 hints.ai_socktype = SOCK_DGRAM; 630 hints.ai_protocol = 0; 631 s->offset += 4; 632 s->counter_offset += 4; 633 s->session_offset += 4; 634 s->cookie_offset += 4; 635 } else { 636 hints.ai_socktype = SOCK_RAW; 637 hints.ai_protocol = IPPROTO_L2TP; 638 } 639 640 gairet = getaddrinfo(l2tpv3->src, srcport, &hints, &result); 641 642 if ((gairet != 0) || (result == NULL)) { 643 error_setg(errp, "could not resolve src, errno = %s", 644 gai_strerror(gairet)); 645 goto outerr; 646 } 647 fd = socket(result->ai_family, result->ai_socktype, result->ai_protocol); 648 if (fd == -1) { 649 fd = -errno; 650 error_setg(errp, "socket creation failed, errno = %d", 651 -fd); 652 goto outerr; 653 } 654 if (bind(fd, (struct sockaddr *) result->ai_addr, result->ai_addrlen)) { 655 error_setg(errp, "could not bind socket err=%i", errno); 656 goto outerr; 657 } 658 659 freeaddrinfo(result); 660 661 memset(&hints, 0, sizeof(hints)); 662 663 if (s->ipv6) { 664 hints.ai_family = AF_INET6; 665 } else { 666 hints.ai_family = AF_INET; 667 } 668 if (s->udp) { 669 hints.ai_socktype = SOCK_DGRAM; 670 hints.ai_protocol = 0; 671 } else { 672 hints.ai_socktype = SOCK_RAW; 673 hints.ai_protocol = IPPROTO_L2TP; 674 } 675 676 result = NULL; 677 gairet = getaddrinfo(l2tpv3->dst, dstport, &hints, &result); 678 if ((gairet != 0) || (result == NULL)) { 679 error_setg(errp, "could not resolve dst, error = %s", 680 gai_strerror(gairet)); 681 goto outerr; 682 } 683 684 s->dgram_dst = g_new0(struct sockaddr_storage, 1); 685 memcpy(s->dgram_dst, result->ai_addr, result->ai_addrlen); 686 s->dst_size = result->ai_addrlen; 687 688 freeaddrinfo(result); 689 690 if (l2tpv3->has_counter && l2tpv3->counter) { 691 s->has_counter = true; 692 s->offset += 4; 693 } else { 694 s->has_counter = false; 695 } 696 697 if (l2tpv3->has_pincounter && l2tpv3->pincounter) { 698 s->has_counter = true; /* pin counter implies that there is counter */ 699 s->pin_counter = true; 700 } else { 701 s->pin_counter = false; 702 } 703 704 if (l2tpv3->has_offset) { 705 /* extra offset */ 706 s->offset += l2tpv3->offset; 707 } 708 709 if ((s->ipv6) || (s->udp)) { 710 s->header_size = s->offset; 711 } else { 712 s->header_size = s->offset + sizeof(struct iphdr); 713 } 714 715 s->msgvec = build_l2tpv3_vector(s, MAX_L2TPV3_MSGCNT); 716 s->vec = g_new(struct iovec, MAX_L2TPV3_IOVCNT); 717 s->header_buf = g_malloc(s->header_size); 718 719 qemu_set_nonblock(fd); 720 721 s->fd = fd; 722 s->counter = 0; 723 724 l2tpv3_read_poll(s, true); 725 726 snprintf(s->nc.info_str, sizeof(s->nc.info_str), 727 "l2tpv3: connected"); 728 return 0; 729 outerr: 730 qemu_del_net_client(nc); 731 if (fd >= 0) { 732 close(fd); 733 } 734 if (result) { 735 freeaddrinfo(result); 736 } 737 return -1; 738 } 739 740