1 /* 2 * Copyright (C) 2016 Red Hat, Inc. 3 * Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws> 4 * 5 * Network Block Device Server Side 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; under version 2 of the License. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "qapi/error.h" 22 #include "nbd-internal.h" 23 24 static int system_errno_to_nbd_errno(int err) 25 { 26 switch (err) { 27 case 0: 28 return NBD_SUCCESS; 29 case EPERM: 30 case EROFS: 31 return NBD_EPERM; 32 case EIO: 33 return NBD_EIO; 34 case ENOMEM: 35 return NBD_ENOMEM; 36 #ifdef EDQUOT 37 case EDQUOT: 38 #endif 39 case EFBIG: 40 case ENOSPC: 41 return NBD_ENOSPC; 42 case ESHUTDOWN: 43 return NBD_ESHUTDOWN; 44 case EINVAL: 45 default: 46 return NBD_EINVAL; 47 } 48 } 49 50 /* Definitions for opaque data types */ 51 52 typedef struct NBDRequestData NBDRequestData; 53 54 struct NBDRequestData { 55 QSIMPLEQ_ENTRY(NBDRequestData) entry; 56 NBDClient *client; 57 uint8_t *data; 58 bool complete; 59 }; 60 61 struct NBDExport { 62 int refcount; 63 void (*close)(NBDExport *exp); 64 65 BlockBackend *blk; 66 char *name; 67 char *description; 68 off_t dev_offset; 69 off_t size; 70 uint16_t nbdflags; 71 QTAILQ_HEAD(, NBDClient) clients; 72 QTAILQ_ENTRY(NBDExport) next; 73 74 AioContext *ctx; 75 76 BlockBackend *eject_notifier_blk; 77 Notifier eject_notifier; 78 }; 79 80 static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports); 81 82 struct NBDClient { 83 int refcount; 84 void (*close)(NBDClient *client); 85 86 bool no_zeroes; 87 NBDExport *exp; 88 QCryptoTLSCreds *tlscreds; 89 char *tlsaclname; 90 QIOChannelSocket *sioc; /* The underlying data channel */ 91 QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */ 92 93 Coroutine *recv_coroutine; 94 95 CoMutex send_lock; 96 Coroutine *send_coroutine; 97 98 bool can_read; 99 100 QTAILQ_ENTRY(NBDClient) next; 101 int nb_requests; 102 bool closing; 103 }; 104 105 /* That's all folks */ 106 107 static void nbd_set_handlers(NBDClient *client); 108 static void nbd_unset_handlers(NBDClient *client); 109 static void nbd_update_can_read(NBDClient *client); 110 111 static gboolean nbd_negotiate_continue(QIOChannel *ioc, 112 GIOCondition condition, 113 void *opaque) 114 { 115 qemu_coroutine_enter(opaque); 116 return TRUE; 117 } 118 119 static ssize_t nbd_negotiate_read(QIOChannel *ioc, void *buffer, size_t size) 120 { 121 ssize_t ret; 122 guint watch; 123 124 assert(qemu_in_coroutine()); 125 /* Negotiation are always in main loop. */ 126 watch = qio_channel_add_watch(ioc, 127 G_IO_IN, 128 nbd_negotiate_continue, 129 qemu_coroutine_self(), 130 NULL); 131 ret = read_sync(ioc, buffer, size); 132 g_source_remove(watch); 133 return ret; 134 135 } 136 137 static ssize_t nbd_negotiate_write(QIOChannel *ioc, const void *buffer, 138 size_t size) 139 { 140 ssize_t ret; 141 guint watch; 142 143 assert(qemu_in_coroutine()); 144 /* Negotiation are always in main loop. */ 145 watch = qio_channel_add_watch(ioc, 146 G_IO_OUT, 147 nbd_negotiate_continue, 148 qemu_coroutine_self(), 149 NULL); 150 ret = write_sync(ioc, buffer, size); 151 g_source_remove(watch); 152 return ret; 153 } 154 155 static ssize_t nbd_negotiate_drop_sync(QIOChannel *ioc, size_t size) 156 { 157 ssize_t ret, dropped = size; 158 uint8_t *buffer = g_malloc(MIN(65536, size)); 159 160 while (size > 0) { 161 ret = nbd_negotiate_read(ioc, buffer, MIN(65536, size)); 162 if (ret < 0) { 163 g_free(buffer); 164 return ret; 165 } 166 167 assert(ret <= size); 168 size -= ret; 169 } 170 171 g_free(buffer); 172 return dropped; 173 } 174 175 /* Basic flow for negotiation 176 177 Server Client 178 Negotiate 179 180 or 181 182 Server Client 183 Negotiate #1 184 Option 185 Negotiate #2 186 187 ---- 188 189 followed by 190 191 Server Client 192 Request 193 Response 194 Request 195 Response 196 ... 197 ... 198 Request (type == 2) 199 200 */ 201 202 /* Send a reply header, including length, but no payload. 203 * Return -errno on error, 0 on success. */ 204 static int nbd_negotiate_send_rep_len(QIOChannel *ioc, uint32_t type, 205 uint32_t opt, uint32_t len) 206 { 207 uint64_t magic; 208 209 TRACE("Reply opt=%" PRIx32 " type=%" PRIx32 " len=%" PRIu32, 210 type, opt, len); 211 212 magic = cpu_to_be64(NBD_REP_MAGIC); 213 if (nbd_negotiate_write(ioc, &magic, sizeof(magic)) != sizeof(magic)) { 214 LOG("write failed (rep magic)"); 215 return -EINVAL; 216 } 217 opt = cpu_to_be32(opt); 218 if (nbd_negotiate_write(ioc, &opt, sizeof(opt)) != sizeof(opt)) { 219 LOG("write failed (rep opt)"); 220 return -EINVAL; 221 } 222 type = cpu_to_be32(type); 223 if (nbd_negotiate_write(ioc, &type, sizeof(type)) != sizeof(type)) { 224 LOG("write failed (rep type)"); 225 return -EINVAL; 226 } 227 len = cpu_to_be32(len); 228 if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) { 229 LOG("write failed (rep data length)"); 230 return -EINVAL; 231 } 232 return 0; 233 } 234 235 /* Send a reply header with default 0 length. 236 * Return -errno on error, 0 on success. */ 237 static int nbd_negotiate_send_rep(QIOChannel *ioc, uint32_t type, uint32_t opt) 238 { 239 return nbd_negotiate_send_rep_len(ioc, type, opt, 0); 240 } 241 242 /* Send an error reply. 243 * Return -errno on error, 0 on success. */ 244 static int GCC_FMT_ATTR(4, 5) 245 nbd_negotiate_send_rep_err(QIOChannel *ioc, uint32_t type, 246 uint32_t opt, const char *fmt, ...) 247 { 248 va_list va; 249 char *msg; 250 int ret; 251 size_t len; 252 253 va_start(va, fmt); 254 msg = g_strdup_vprintf(fmt, va); 255 va_end(va); 256 len = strlen(msg); 257 assert(len < 4096); 258 TRACE("sending error message \"%s\"", msg); 259 ret = nbd_negotiate_send_rep_len(ioc, type, opt, len); 260 if (ret < 0) { 261 goto out; 262 } 263 if (nbd_negotiate_write(ioc, msg, len) != len) { 264 LOG("write failed (error message)"); 265 ret = -EIO; 266 } else { 267 ret = 0; 268 } 269 out: 270 g_free(msg); 271 return ret; 272 } 273 274 /* Send a single NBD_REP_SERVER reply to NBD_OPT_LIST, including payload. 275 * Return -errno on error, 0 on success. */ 276 static int nbd_negotiate_send_rep_list(QIOChannel *ioc, NBDExport *exp) 277 { 278 size_t name_len, desc_len; 279 uint32_t len; 280 const char *name = exp->name ? exp->name : ""; 281 const char *desc = exp->description ? exp->description : ""; 282 int rc; 283 284 TRACE("Advertising export name '%s' description '%s'", name, desc); 285 name_len = strlen(name); 286 desc_len = strlen(desc); 287 len = name_len + desc_len + sizeof(len); 288 rc = nbd_negotiate_send_rep_len(ioc, NBD_REP_SERVER, NBD_OPT_LIST, len); 289 if (rc < 0) { 290 return rc; 291 } 292 293 len = cpu_to_be32(name_len); 294 if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) { 295 LOG("write failed (name length)"); 296 return -EINVAL; 297 } 298 if (nbd_negotiate_write(ioc, name, name_len) != name_len) { 299 LOG("write failed (name buffer)"); 300 return -EINVAL; 301 } 302 if (nbd_negotiate_write(ioc, desc, desc_len) != desc_len) { 303 LOG("write failed (description buffer)"); 304 return -EINVAL; 305 } 306 return 0; 307 } 308 309 /* Process the NBD_OPT_LIST command, with a potential series of replies. 310 * Return -errno on error, 0 on success. */ 311 static int nbd_negotiate_handle_list(NBDClient *client, uint32_t length) 312 { 313 NBDExport *exp; 314 315 if (length) { 316 if (nbd_negotiate_drop_sync(client->ioc, length) != length) { 317 return -EIO; 318 } 319 return nbd_negotiate_send_rep_err(client->ioc, 320 NBD_REP_ERR_INVALID, NBD_OPT_LIST, 321 "OPT_LIST should not have length"); 322 } 323 324 /* For each export, send a NBD_REP_SERVER reply. */ 325 QTAILQ_FOREACH(exp, &exports, next) { 326 if (nbd_negotiate_send_rep_list(client->ioc, exp)) { 327 return -EINVAL; 328 } 329 } 330 /* Finish with a NBD_REP_ACK. */ 331 return nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK, NBD_OPT_LIST); 332 } 333 334 static int nbd_negotiate_handle_export_name(NBDClient *client, uint32_t length) 335 { 336 int rc = -EINVAL; 337 char name[NBD_MAX_NAME_SIZE + 1]; 338 339 /* Client sends: 340 [20 .. xx] export name (length bytes) 341 */ 342 TRACE("Checking length"); 343 if (length >= sizeof(name)) { 344 LOG("Bad length received"); 345 goto fail; 346 } 347 if (nbd_negotiate_read(client->ioc, name, length) != length) { 348 LOG("read failed"); 349 goto fail; 350 } 351 name[length] = '\0'; 352 353 TRACE("Client requested export '%s'", name); 354 355 client->exp = nbd_export_find(name); 356 if (!client->exp) { 357 LOG("export not found"); 358 goto fail; 359 } 360 361 QTAILQ_INSERT_TAIL(&client->exp->clients, client, next); 362 nbd_export_get(client->exp); 363 rc = 0; 364 fail: 365 return rc; 366 } 367 368 /* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the 369 * new channel for all further (now-encrypted) communication. */ 370 static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client, 371 uint32_t length) 372 { 373 QIOChannel *ioc; 374 QIOChannelTLS *tioc; 375 struct NBDTLSHandshakeData data = { 0 }; 376 377 TRACE("Setting up TLS"); 378 ioc = client->ioc; 379 if (length) { 380 if (nbd_negotiate_drop_sync(ioc, length) != length) { 381 return NULL; 382 } 383 nbd_negotiate_send_rep_err(ioc, NBD_REP_ERR_INVALID, NBD_OPT_STARTTLS, 384 "OPT_STARTTLS should not have length"); 385 return NULL; 386 } 387 388 if (nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK, 389 NBD_OPT_STARTTLS) < 0) { 390 return NULL; 391 } 392 393 tioc = qio_channel_tls_new_server(ioc, 394 client->tlscreds, 395 client->tlsaclname, 396 NULL); 397 if (!tioc) { 398 return NULL; 399 } 400 401 qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-server-tls"); 402 TRACE("Starting TLS handshake"); 403 data.loop = g_main_loop_new(g_main_context_default(), FALSE); 404 qio_channel_tls_handshake(tioc, 405 nbd_tls_handshake, 406 &data, 407 NULL); 408 409 if (!data.complete) { 410 g_main_loop_run(data.loop); 411 } 412 g_main_loop_unref(data.loop); 413 if (data.error) { 414 object_unref(OBJECT(tioc)); 415 error_free(data.error); 416 return NULL; 417 } 418 419 return QIO_CHANNEL(tioc); 420 } 421 422 423 /* Process all NBD_OPT_* client option commands. 424 * Return -errno on error, 0 on success. */ 425 static int nbd_negotiate_options(NBDClient *client) 426 { 427 uint32_t flags; 428 bool fixedNewstyle = false; 429 430 /* Client sends: 431 [ 0 .. 3] client flags 432 433 [ 0 .. 7] NBD_OPTS_MAGIC 434 [ 8 .. 11] NBD option 435 [12 .. 15] Data length 436 ... Rest of request 437 438 [ 0 .. 7] NBD_OPTS_MAGIC 439 [ 8 .. 11] Second NBD option 440 [12 .. 15] Data length 441 ... Rest of request 442 */ 443 444 if (nbd_negotiate_read(client->ioc, &flags, sizeof(flags)) != 445 sizeof(flags)) { 446 LOG("read failed"); 447 return -EIO; 448 } 449 TRACE("Checking client flags"); 450 be32_to_cpus(&flags); 451 if (flags & NBD_FLAG_C_FIXED_NEWSTYLE) { 452 TRACE("Client supports fixed newstyle handshake"); 453 fixedNewstyle = true; 454 flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE; 455 } 456 if (flags & NBD_FLAG_C_NO_ZEROES) { 457 TRACE("Client supports no zeroes at handshake end"); 458 client->no_zeroes = true; 459 flags &= ~NBD_FLAG_C_NO_ZEROES; 460 } 461 if (flags != 0) { 462 TRACE("Unknown client flags 0x%" PRIx32 " received", flags); 463 return -EIO; 464 } 465 466 while (1) { 467 int ret; 468 uint32_t clientflags, length; 469 uint64_t magic; 470 471 if (nbd_negotiate_read(client->ioc, &magic, sizeof(magic)) != 472 sizeof(magic)) { 473 LOG("read failed"); 474 return -EINVAL; 475 } 476 TRACE("Checking opts magic"); 477 if (magic != be64_to_cpu(NBD_OPTS_MAGIC)) { 478 LOG("Bad magic received"); 479 return -EINVAL; 480 } 481 482 if (nbd_negotiate_read(client->ioc, &clientflags, 483 sizeof(clientflags)) != sizeof(clientflags)) { 484 LOG("read failed"); 485 return -EINVAL; 486 } 487 clientflags = be32_to_cpu(clientflags); 488 489 if (nbd_negotiate_read(client->ioc, &length, sizeof(length)) != 490 sizeof(length)) { 491 LOG("read failed"); 492 return -EINVAL; 493 } 494 length = be32_to_cpu(length); 495 496 TRACE("Checking option 0x%" PRIx32, clientflags); 497 if (client->tlscreds && 498 client->ioc == (QIOChannel *)client->sioc) { 499 QIOChannel *tioc; 500 if (!fixedNewstyle) { 501 TRACE("Unsupported option 0x%" PRIx32, clientflags); 502 return -EINVAL; 503 } 504 switch (clientflags) { 505 case NBD_OPT_STARTTLS: 506 tioc = nbd_negotiate_handle_starttls(client, length); 507 if (!tioc) { 508 return -EIO; 509 } 510 object_unref(OBJECT(client->ioc)); 511 client->ioc = QIO_CHANNEL(tioc); 512 break; 513 514 case NBD_OPT_EXPORT_NAME: 515 /* No way to return an error to client, so drop connection */ 516 TRACE("Option 0x%x not permitted before TLS", clientflags); 517 return -EINVAL; 518 519 default: 520 if (nbd_negotiate_drop_sync(client->ioc, length) != length) { 521 return -EIO; 522 } 523 ret = nbd_negotiate_send_rep_err(client->ioc, 524 NBD_REP_ERR_TLS_REQD, 525 clientflags, 526 "Option 0x%" PRIx32 527 "not permitted before TLS", 528 clientflags); 529 if (ret < 0) { 530 return ret; 531 } 532 /* Let the client keep trying, unless they asked to quit */ 533 if (clientflags == NBD_OPT_ABORT) { 534 return -EINVAL; 535 } 536 break; 537 } 538 } else if (fixedNewstyle) { 539 switch (clientflags) { 540 case NBD_OPT_LIST: 541 ret = nbd_negotiate_handle_list(client, length); 542 if (ret < 0) { 543 return ret; 544 } 545 break; 546 547 case NBD_OPT_ABORT: 548 /* NBD spec says we must try to reply before 549 * disconnecting, but that we must also tolerate 550 * guests that don't wait for our reply. */ 551 nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK, clientflags); 552 return -EINVAL; 553 554 case NBD_OPT_EXPORT_NAME: 555 return nbd_negotiate_handle_export_name(client, length); 556 557 case NBD_OPT_STARTTLS: 558 if (nbd_negotiate_drop_sync(client->ioc, length) != length) { 559 return -EIO; 560 } 561 if (client->tlscreds) { 562 ret = nbd_negotiate_send_rep_err(client->ioc, 563 NBD_REP_ERR_INVALID, 564 clientflags, 565 "TLS already enabled"); 566 } else { 567 ret = nbd_negotiate_send_rep_err(client->ioc, 568 NBD_REP_ERR_POLICY, 569 clientflags, 570 "TLS not configured"); 571 } 572 if (ret < 0) { 573 return ret; 574 } 575 break; 576 default: 577 if (nbd_negotiate_drop_sync(client->ioc, length) != length) { 578 return -EIO; 579 } 580 ret = nbd_negotiate_send_rep_err(client->ioc, 581 NBD_REP_ERR_UNSUP, 582 clientflags, 583 "Unsupported option 0x%" 584 PRIx32, 585 clientflags); 586 if (ret < 0) { 587 return ret; 588 } 589 break; 590 } 591 } else { 592 /* 593 * If broken new-style we should drop the connection 594 * for anything except NBD_OPT_EXPORT_NAME 595 */ 596 switch (clientflags) { 597 case NBD_OPT_EXPORT_NAME: 598 return nbd_negotiate_handle_export_name(client, length); 599 600 default: 601 TRACE("Unsupported option 0x%" PRIx32, clientflags); 602 return -EINVAL; 603 } 604 } 605 } 606 } 607 608 typedef struct { 609 NBDClient *client; 610 Coroutine *co; 611 } NBDClientNewData; 612 613 static coroutine_fn int nbd_negotiate(NBDClientNewData *data) 614 { 615 NBDClient *client = data->client; 616 char buf[8 + 8 + 8 + 128]; 617 int rc; 618 const uint16_t myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM | 619 NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA | 620 NBD_FLAG_SEND_WRITE_ZEROES); 621 bool oldStyle; 622 size_t len; 623 624 /* Old style negotiation header without options 625 [ 0 .. 7] passwd ("NBDMAGIC") 626 [ 8 .. 15] magic (NBD_CLIENT_MAGIC) 627 [16 .. 23] size 628 [24 .. 25] server flags (0) 629 [26 .. 27] export flags 630 [28 .. 151] reserved (0) 631 632 New style negotiation header with options 633 [ 0 .. 7] passwd ("NBDMAGIC") 634 [ 8 .. 15] magic (NBD_OPTS_MAGIC) 635 [16 .. 17] server flags (0) 636 ....options sent.... 637 [18 .. 25] size 638 [26 .. 27] export flags 639 [28 .. 151] reserved (0, omit if no_zeroes) 640 */ 641 642 qio_channel_set_blocking(client->ioc, false, NULL); 643 rc = -EINVAL; 644 645 TRACE("Beginning negotiation."); 646 memset(buf, 0, sizeof(buf)); 647 memcpy(buf, "NBDMAGIC", 8); 648 649 oldStyle = client->exp != NULL && !client->tlscreds; 650 if (oldStyle) { 651 TRACE("advertising size %" PRIu64 " and flags %x", 652 client->exp->size, client->exp->nbdflags | myflags); 653 stq_be_p(buf + 8, NBD_CLIENT_MAGIC); 654 stq_be_p(buf + 16, client->exp->size); 655 stw_be_p(buf + 26, client->exp->nbdflags | myflags); 656 } else { 657 stq_be_p(buf + 8, NBD_OPTS_MAGIC); 658 stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES); 659 } 660 661 if (oldStyle) { 662 if (client->tlscreds) { 663 TRACE("TLS cannot be enabled with oldstyle protocol"); 664 goto fail; 665 } 666 if (nbd_negotiate_write(client->ioc, buf, sizeof(buf)) != sizeof(buf)) { 667 LOG("write failed"); 668 goto fail; 669 } 670 } else { 671 if (nbd_negotiate_write(client->ioc, buf, 18) != 18) { 672 LOG("write failed"); 673 goto fail; 674 } 675 rc = nbd_negotiate_options(client); 676 if (rc != 0) { 677 LOG("option negotiation failed"); 678 goto fail; 679 } 680 681 TRACE("advertising size %" PRIu64 " and flags %x", 682 client->exp->size, client->exp->nbdflags | myflags); 683 stq_be_p(buf + 18, client->exp->size); 684 stw_be_p(buf + 26, client->exp->nbdflags | myflags); 685 len = client->no_zeroes ? 10 : sizeof(buf) - 18; 686 if (nbd_negotiate_write(client->ioc, buf + 18, len) != len) { 687 LOG("write failed"); 688 goto fail; 689 } 690 } 691 692 TRACE("Negotiation succeeded."); 693 rc = 0; 694 fail: 695 return rc; 696 } 697 698 static ssize_t nbd_receive_request(QIOChannel *ioc, NBDRequest *request) 699 { 700 uint8_t buf[NBD_REQUEST_SIZE]; 701 uint32_t magic; 702 ssize_t ret; 703 704 ret = read_sync(ioc, buf, sizeof(buf)); 705 if (ret < 0) { 706 return ret; 707 } 708 709 if (ret != sizeof(buf)) { 710 LOG("read failed"); 711 return -EINVAL; 712 } 713 714 /* Request 715 [ 0 .. 3] magic (NBD_REQUEST_MAGIC) 716 [ 4 .. 5] flags (NBD_CMD_FLAG_FUA, ...) 717 [ 6 .. 7] type (NBD_CMD_READ, ...) 718 [ 8 .. 15] handle 719 [16 .. 23] from 720 [24 .. 27] len 721 */ 722 723 magic = ldl_be_p(buf); 724 request->flags = lduw_be_p(buf + 4); 725 request->type = lduw_be_p(buf + 6); 726 request->handle = ldq_be_p(buf + 8); 727 request->from = ldq_be_p(buf + 16); 728 request->len = ldl_be_p(buf + 24); 729 730 TRACE("Got request: { magic = 0x%" PRIx32 ", .flags = %" PRIx16 731 ", .type = %" PRIx16 ", from = %" PRIu64 ", len = %" PRIu32 " }", 732 magic, request->flags, request->type, request->from, request->len); 733 734 if (magic != NBD_REQUEST_MAGIC) { 735 LOG("invalid magic (got 0x%" PRIx32 ")", magic); 736 return -EINVAL; 737 } 738 return 0; 739 } 740 741 static ssize_t nbd_send_reply(QIOChannel *ioc, NBDReply *reply) 742 { 743 uint8_t buf[NBD_REPLY_SIZE]; 744 ssize_t ret; 745 746 reply->error = system_errno_to_nbd_errno(reply->error); 747 748 TRACE("Sending response to client: { .error = %" PRId32 749 ", handle = %" PRIu64 " }", 750 reply->error, reply->handle); 751 752 /* Reply 753 [ 0 .. 3] magic (NBD_REPLY_MAGIC) 754 [ 4 .. 7] error (0 == no error) 755 [ 7 .. 15] handle 756 */ 757 stl_be_p(buf, NBD_REPLY_MAGIC); 758 stl_be_p(buf + 4, reply->error); 759 stq_be_p(buf + 8, reply->handle); 760 761 ret = write_sync(ioc, buf, sizeof(buf)); 762 if (ret < 0) { 763 return ret; 764 } 765 766 if (ret != sizeof(buf)) { 767 LOG("writing to socket failed"); 768 return -EINVAL; 769 } 770 return 0; 771 } 772 773 #define MAX_NBD_REQUESTS 16 774 775 void nbd_client_get(NBDClient *client) 776 { 777 client->refcount++; 778 } 779 780 void nbd_client_put(NBDClient *client) 781 { 782 if (--client->refcount == 0) { 783 /* The last reference should be dropped by client->close, 784 * which is called by client_close. 785 */ 786 assert(client->closing); 787 788 nbd_unset_handlers(client); 789 object_unref(OBJECT(client->sioc)); 790 object_unref(OBJECT(client->ioc)); 791 if (client->tlscreds) { 792 object_unref(OBJECT(client->tlscreds)); 793 } 794 g_free(client->tlsaclname); 795 if (client->exp) { 796 QTAILQ_REMOVE(&client->exp->clients, client, next); 797 nbd_export_put(client->exp); 798 } 799 g_free(client); 800 } 801 } 802 803 static void client_close(NBDClient *client) 804 { 805 if (client->closing) { 806 return; 807 } 808 809 client->closing = true; 810 811 /* Force requests to finish. They will drop their own references, 812 * then we'll close the socket and free the NBDClient. 813 */ 814 qio_channel_shutdown(client->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, 815 NULL); 816 817 /* Also tell the client, so that they release their reference. */ 818 if (client->close) { 819 client->close(client); 820 } 821 } 822 823 static NBDRequestData *nbd_request_get(NBDClient *client) 824 { 825 NBDRequestData *req; 826 827 assert(client->nb_requests <= MAX_NBD_REQUESTS - 1); 828 client->nb_requests++; 829 nbd_update_can_read(client); 830 831 req = g_new0(NBDRequestData, 1); 832 nbd_client_get(client); 833 req->client = client; 834 return req; 835 } 836 837 static void nbd_request_put(NBDRequestData *req) 838 { 839 NBDClient *client = req->client; 840 841 if (req->data) { 842 qemu_vfree(req->data); 843 } 844 g_free(req); 845 846 client->nb_requests--; 847 nbd_update_can_read(client); 848 nbd_client_put(client); 849 } 850 851 static void blk_aio_attached(AioContext *ctx, void *opaque) 852 { 853 NBDExport *exp = opaque; 854 NBDClient *client; 855 856 TRACE("Export %s: Attaching clients to AIO context %p\n", exp->name, ctx); 857 858 exp->ctx = ctx; 859 860 QTAILQ_FOREACH(client, &exp->clients, next) { 861 nbd_set_handlers(client); 862 } 863 } 864 865 static void blk_aio_detach(void *opaque) 866 { 867 NBDExport *exp = opaque; 868 NBDClient *client; 869 870 TRACE("Export %s: Detaching clients from AIO context %p\n", exp->name, exp->ctx); 871 872 QTAILQ_FOREACH(client, &exp->clients, next) { 873 nbd_unset_handlers(client); 874 } 875 876 exp->ctx = NULL; 877 } 878 879 static void nbd_eject_notifier(Notifier *n, void *data) 880 { 881 NBDExport *exp = container_of(n, NBDExport, eject_notifier); 882 nbd_export_close(exp); 883 } 884 885 NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset, off_t size, 886 uint16_t nbdflags, void (*close)(NBDExport *), 887 bool writethrough, BlockBackend *on_eject_blk, 888 Error **errp) 889 { 890 BlockBackend *blk; 891 NBDExport *exp = g_malloc0(sizeof(NBDExport)); 892 893 blk = blk_new(); 894 blk_insert_bs(blk, bs); 895 blk_set_enable_write_cache(blk, !writethrough); 896 897 exp->refcount = 1; 898 QTAILQ_INIT(&exp->clients); 899 exp->blk = blk; 900 exp->dev_offset = dev_offset; 901 exp->nbdflags = nbdflags; 902 exp->size = size < 0 ? blk_getlength(blk) : size; 903 if (exp->size < 0) { 904 error_setg_errno(errp, -exp->size, 905 "Failed to determine the NBD export's length"); 906 goto fail; 907 } 908 exp->size -= exp->size % BDRV_SECTOR_SIZE; 909 910 exp->close = close; 911 exp->ctx = blk_get_aio_context(blk); 912 blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp); 913 914 if (on_eject_blk) { 915 blk_ref(on_eject_blk); 916 exp->eject_notifier_blk = on_eject_blk; 917 exp->eject_notifier.notify = nbd_eject_notifier; 918 blk_add_remove_bs_notifier(on_eject_blk, &exp->eject_notifier); 919 } 920 921 /* 922 * NBD exports are used for non-shared storage migration. Make sure 923 * that BDRV_O_INACTIVE is cleared and the image is ready for write 924 * access since the export could be available before migration handover. 925 */ 926 aio_context_acquire(exp->ctx); 927 blk_invalidate_cache(blk, NULL); 928 aio_context_release(exp->ctx); 929 return exp; 930 931 fail: 932 blk_unref(blk); 933 g_free(exp); 934 return NULL; 935 } 936 937 NBDExport *nbd_export_find(const char *name) 938 { 939 NBDExport *exp; 940 QTAILQ_FOREACH(exp, &exports, next) { 941 if (strcmp(name, exp->name) == 0) { 942 return exp; 943 } 944 } 945 946 return NULL; 947 } 948 949 void nbd_export_set_name(NBDExport *exp, const char *name) 950 { 951 if (exp->name == name) { 952 return; 953 } 954 955 nbd_export_get(exp); 956 if (exp->name != NULL) { 957 g_free(exp->name); 958 exp->name = NULL; 959 QTAILQ_REMOVE(&exports, exp, next); 960 nbd_export_put(exp); 961 } 962 if (name != NULL) { 963 nbd_export_get(exp); 964 exp->name = g_strdup(name); 965 QTAILQ_INSERT_TAIL(&exports, exp, next); 966 } 967 nbd_export_put(exp); 968 } 969 970 void nbd_export_set_description(NBDExport *exp, const char *description) 971 { 972 g_free(exp->description); 973 exp->description = g_strdup(description); 974 } 975 976 void nbd_export_close(NBDExport *exp) 977 { 978 NBDClient *client, *next; 979 980 nbd_export_get(exp); 981 QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) { 982 client_close(client); 983 } 984 nbd_export_set_name(exp, NULL); 985 nbd_export_set_description(exp, NULL); 986 nbd_export_put(exp); 987 } 988 989 void nbd_export_get(NBDExport *exp) 990 { 991 assert(exp->refcount > 0); 992 exp->refcount++; 993 } 994 995 void nbd_export_put(NBDExport *exp) 996 { 997 assert(exp->refcount > 0); 998 if (exp->refcount == 1) { 999 nbd_export_close(exp); 1000 } 1001 1002 if (--exp->refcount == 0) { 1003 assert(exp->name == NULL); 1004 assert(exp->description == NULL); 1005 1006 if (exp->close) { 1007 exp->close(exp); 1008 } 1009 1010 if (exp->blk) { 1011 if (exp->eject_notifier_blk) { 1012 notifier_remove(&exp->eject_notifier); 1013 blk_unref(exp->eject_notifier_blk); 1014 } 1015 blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, 1016 blk_aio_detach, exp); 1017 blk_unref(exp->blk); 1018 exp->blk = NULL; 1019 } 1020 1021 g_free(exp); 1022 } 1023 } 1024 1025 BlockBackend *nbd_export_get_blockdev(NBDExport *exp) 1026 { 1027 return exp->blk; 1028 } 1029 1030 void nbd_export_close_all(void) 1031 { 1032 NBDExport *exp, *next; 1033 1034 QTAILQ_FOREACH_SAFE(exp, &exports, next, next) { 1035 nbd_export_close(exp); 1036 } 1037 } 1038 1039 static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply, 1040 int len) 1041 { 1042 NBDClient *client = req->client; 1043 ssize_t rc, ret; 1044 1045 g_assert(qemu_in_coroutine()); 1046 qemu_co_mutex_lock(&client->send_lock); 1047 client->send_coroutine = qemu_coroutine_self(); 1048 nbd_set_handlers(client); 1049 1050 if (!len) { 1051 rc = nbd_send_reply(client->ioc, reply); 1052 } else { 1053 qio_channel_set_cork(client->ioc, true); 1054 rc = nbd_send_reply(client->ioc, reply); 1055 if (rc >= 0) { 1056 ret = write_sync(client->ioc, req->data, len); 1057 if (ret != len) { 1058 rc = -EIO; 1059 } 1060 } 1061 qio_channel_set_cork(client->ioc, false); 1062 } 1063 1064 client->send_coroutine = NULL; 1065 nbd_set_handlers(client); 1066 qemu_co_mutex_unlock(&client->send_lock); 1067 return rc; 1068 } 1069 1070 /* Collect a client request. Return 0 if request looks valid, -EAGAIN 1071 * to keep trying the collection, -EIO to drop connection right away, 1072 * and any other negative value to report an error to the client 1073 * (although the caller may still need to disconnect after reporting 1074 * the error). */ 1075 static ssize_t nbd_co_receive_request(NBDRequestData *req, 1076 NBDRequest *request) 1077 { 1078 NBDClient *client = req->client; 1079 ssize_t rc; 1080 1081 g_assert(qemu_in_coroutine()); 1082 client->recv_coroutine = qemu_coroutine_self(); 1083 nbd_update_can_read(client); 1084 1085 rc = nbd_receive_request(client->ioc, request); 1086 if (rc < 0) { 1087 if (rc != -EAGAIN) { 1088 rc = -EIO; 1089 } 1090 goto out; 1091 } 1092 1093 TRACE("Decoding type"); 1094 1095 if (request->type != NBD_CMD_WRITE) { 1096 /* No payload, we are ready to read the next request. */ 1097 req->complete = true; 1098 } 1099 1100 if (request->type == NBD_CMD_DISC) { 1101 /* Special case: we're going to disconnect without a reply, 1102 * whether or not flags, from, or len are bogus */ 1103 TRACE("Request type is DISCONNECT"); 1104 rc = -EIO; 1105 goto out; 1106 } 1107 1108 /* Check for sanity in the parameters, part 1. Defer as many 1109 * checks as possible until after reading any NBD_CMD_WRITE 1110 * payload, so we can try and keep the connection alive. */ 1111 if ((request->from + request->len) < request->from) { 1112 LOG("integer overflow detected, you're probably being attacked"); 1113 rc = -EINVAL; 1114 goto out; 1115 } 1116 1117 if (request->type == NBD_CMD_READ || request->type == NBD_CMD_WRITE) { 1118 if (request->len > NBD_MAX_BUFFER_SIZE) { 1119 LOG("len (%" PRIu32" ) is larger than max len (%u)", 1120 request->len, NBD_MAX_BUFFER_SIZE); 1121 rc = -EINVAL; 1122 goto out; 1123 } 1124 1125 req->data = blk_try_blockalign(client->exp->blk, request->len); 1126 if (req->data == NULL) { 1127 rc = -ENOMEM; 1128 goto out; 1129 } 1130 } 1131 if (request->type == NBD_CMD_WRITE) { 1132 TRACE("Reading %" PRIu32 " byte(s)", request->len); 1133 1134 if (read_sync(client->ioc, req->data, request->len) != request->len) { 1135 LOG("reading from socket failed"); 1136 rc = -EIO; 1137 goto out; 1138 } 1139 req->complete = true; 1140 } 1141 1142 /* Sanity checks, part 2. */ 1143 if (request->from + request->len > client->exp->size) { 1144 LOG("operation past EOF; From: %" PRIu64 ", Len: %" PRIu32 1145 ", Size: %" PRIu64, request->from, request->len, 1146 (uint64_t)client->exp->size); 1147 rc = request->type == NBD_CMD_WRITE ? -ENOSPC : -EINVAL; 1148 goto out; 1149 } 1150 if (request->flags & ~(NBD_CMD_FLAG_FUA | NBD_CMD_FLAG_NO_HOLE)) { 1151 LOG("unsupported flags (got 0x%x)", request->flags); 1152 rc = -EINVAL; 1153 goto out; 1154 } 1155 if (request->type != NBD_CMD_WRITE_ZEROES && 1156 (request->flags & NBD_CMD_FLAG_NO_HOLE)) { 1157 LOG("unexpected flags (got 0x%x)", request->flags); 1158 rc = -EINVAL; 1159 goto out; 1160 } 1161 1162 rc = 0; 1163 1164 out: 1165 client->recv_coroutine = NULL; 1166 nbd_update_can_read(client); 1167 1168 return rc; 1169 } 1170 1171 static void nbd_trip(void *opaque) 1172 { 1173 NBDClient *client = opaque; 1174 NBDExport *exp = client->exp; 1175 NBDRequestData *req; 1176 NBDRequest request; 1177 NBDReply reply; 1178 ssize_t ret; 1179 int flags; 1180 1181 TRACE("Reading request."); 1182 if (client->closing) { 1183 return; 1184 } 1185 1186 req = nbd_request_get(client); 1187 ret = nbd_co_receive_request(req, &request); 1188 if (ret == -EAGAIN) { 1189 goto done; 1190 } 1191 if (ret == -EIO) { 1192 goto out; 1193 } 1194 1195 reply.handle = request.handle; 1196 reply.error = 0; 1197 1198 if (ret < 0) { 1199 reply.error = -ret; 1200 goto error_reply; 1201 } 1202 1203 if (client->closing) { 1204 /* 1205 * The client may be closed when we are blocked in 1206 * nbd_co_receive_request() 1207 */ 1208 goto done; 1209 } 1210 1211 switch (request.type) { 1212 case NBD_CMD_READ: 1213 TRACE("Request type is READ"); 1214 1215 /* XXX: NBD Protocol only documents use of FUA with WRITE */ 1216 if (request.flags & NBD_CMD_FLAG_FUA) { 1217 ret = blk_co_flush(exp->blk); 1218 if (ret < 0) { 1219 LOG("flush failed"); 1220 reply.error = -ret; 1221 goto error_reply; 1222 } 1223 } 1224 1225 ret = blk_pread(exp->blk, request.from + exp->dev_offset, 1226 req->data, request.len); 1227 if (ret < 0) { 1228 LOG("reading from file failed"); 1229 reply.error = -ret; 1230 goto error_reply; 1231 } 1232 1233 TRACE("Read %" PRIu32" byte(s)", request.len); 1234 if (nbd_co_send_reply(req, &reply, request.len) < 0) 1235 goto out; 1236 break; 1237 case NBD_CMD_WRITE: 1238 TRACE("Request type is WRITE"); 1239 1240 if (exp->nbdflags & NBD_FLAG_READ_ONLY) { 1241 TRACE("Server is read-only, return error"); 1242 reply.error = EROFS; 1243 goto error_reply; 1244 } 1245 1246 TRACE("Writing to device"); 1247 1248 flags = 0; 1249 if (request.flags & NBD_CMD_FLAG_FUA) { 1250 flags |= BDRV_REQ_FUA; 1251 } 1252 ret = blk_pwrite(exp->blk, request.from + exp->dev_offset, 1253 req->data, request.len, flags); 1254 if (ret < 0) { 1255 LOG("writing to file failed"); 1256 reply.error = -ret; 1257 goto error_reply; 1258 } 1259 1260 if (nbd_co_send_reply(req, &reply, 0) < 0) { 1261 goto out; 1262 } 1263 break; 1264 1265 case NBD_CMD_WRITE_ZEROES: 1266 TRACE("Request type is WRITE_ZEROES"); 1267 1268 if (exp->nbdflags & NBD_FLAG_READ_ONLY) { 1269 TRACE("Server is read-only, return error"); 1270 reply.error = EROFS; 1271 goto error_reply; 1272 } 1273 1274 TRACE("Writing to device"); 1275 1276 flags = 0; 1277 if (request.flags & NBD_CMD_FLAG_FUA) { 1278 flags |= BDRV_REQ_FUA; 1279 } 1280 if (!(request.flags & NBD_CMD_FLAG_NO_HOLE)) { 1281 flags |= BDRV_REQ_MAY_UNMAP; 1282 } 1283 ret = blk_pwrite_zeroes(exp->blk, request.from + exp->dev_offset, 1284 request.len, flags); 1285 if (ret < 0) { 1286 LOG("writing to file failed"); 1287 reply.error = -ret; 1288 goto error_reply; 1289 } 1290 1291 if (nbd_co_send_reply(req, &reply, 0) < 0) { 1292 goto out; 1293 } 1294 break; 1295 1296 case NBD_CMD_DISC: 1297 /* unreachable, thanks to special case in nbd_co_receive_request() */ 1298 abort(); 1299 1300 case NBD_CMD_FLUSH: 1301 TRACE("Request type is FLUSH"); 1302 1303 ret = blk_co_flush(exp->blk); 1304 if (ret < 0) { 1305 LOG("flush failed"); 1306 reply.error = -ret; 1307 } 1308 if (nbd_co_send_reply(req, &reply, 0) < 0) { 1309 goto out; 1310 } 1311 break; 1312 case NBD_CMD_TRIM: 1313 TRACE("Request type is TRIM"); 1314 ret = blk_co_pdiscard(exp->blk, request.from + exp->dev_offset, 1315 request.len); 1316 if (ret < 0) { 1317 LOG("discard failed"); 1318 reply.error = -ret; 1319 } 1320 if (nbd_co_send_reply(req, &reply, 0) < 0) { 1321 goto out; 1322 } 1323 break; 1324 default: 1325 LOG("invalid request type (%" PRIu32 ") received", request.type); 1326 reply.error = EINVAL; 1327 error_reply: 1328 /* We must disconnect after NBD_CMD_WRITE if we did not 1329 * read the payload. 1330 */ 1331 if (nbd_co_send_reply(req, &reply, 0) < 0 || !req->complete) { 1332 goto out; 1333 } 1334 break; 1335 } 1336 1337 TRACE("Request/Reply complete"); 1338 1339 done: 1340 nbd_request_put(req); 1341 return; 1342 1343 out: 1344 nbd_request_put(req); 1345 client_close(client); 1346 } 1347 1348 static void nbd_read(void *opaque) 1349 { 1350 NBDClient *client = opaque; 1351 1352 if (client->recv_coroutine) { 1353 qemu_coroutine_enter(client->recv_coroutine); 1354 } else { 1355 qemu_coroutine_enter(qemu_coroutine_create(nbd_trip, client)); 1356 } 1357 } 1358 1359 static void nbd_restart_write(void *opaque) 1360 { 1361 NBDClient *client = opaque; 1362 1363 qemu_coroutine_enter(client->send_coroutine); 1364 } 1365 1366 static void nbd_set_handlers(NBDClient *client) 1367 { 1368 if (client->exp && client->exp->ctx) { 1369 aio_set_fd_handler(client->exp->ctx, client->sioc->fd, 1370 true, 1371 client->can_read ? nbd_read : NULL, 1372 client->send_coroutine ? nbd_restart_write : NULL, 1373 client); 1374 } 1375 } 1376 1377 static void nbd_unset_handlers(NBDClient *client) 1378 { 1379 if (client->exp && client->exp->ctx) { 1380 aio_set_fd_handler(client->exp->ctx, client->sioc->fd, 1381 true, NULL, NULL, NULL); 1382 } 1383 } 1384 1385 static void nbd_update_can_read(NBDClient *client) 1386 { 1387 bool can_read = client->recv_coroutine || 1388 client->nb_requests < MAX_NBD_REQUESTS; 1389 1390 if (can_read != client->can_read) { 1391 client->can_read = can_read; 1392 nbd_set_handlers(client); 1393 1394 /* There is no need to invoke aio_notify(), since aio_set_fd_handler() 1395 * in nbd_set_handlers() will have taken care of that */ 1396 } 1397 } 1398 1399 static coroutine_fn void nbd_co_client_start(void *opaque) 1400 { 1401 NBDClientNewData *data = opaque; 1402 NBDClient *client = data->client; 1403 NBDExport *exp = client->exp; 1404 1405 if (exp) { 1406 nbd_export_get(exp); 1407 } 1408 if (nbd_negotiate(data)) { 1409 client_close(client); 1410 goto out; 1411 } 1412 qemu_co_mutex_init(&client->send_lock); 1413 nbd_set_handlers(client); 1414 1415 if (exp) { 1416 QTAILQ_INSERT_TAIL(&exp->clients, client, next); 1417 } 1418 out: 1419 g_free(data); 1420 } 1421 1422 void nbd_client_new(NBDExport *exp, 1423 QIOChannelSocket *sioc, 1424 QCryptoTLSCreds *tlscreds, 1425 const char *tlsaclname, 1426 void (*close_fn)(NBDClient *)) 1427 { 1428 NBDClient *client; 1429 NBDClientNewData *data = g_new(NBDClientNewData, 1); 1430 1431 client = g_malloc0(sizeof(NBDClient)); 1432 client->refcount = 1; 1433 client->exp = exp; 1434 client->tlscreds = tlscreds; 1435 if (tlscreds) { 1436 object_ref(OBJECT(client->tlscreds)); 1437 } 1438 client->tlsaclname = g_strdup(tlsaclname); 1439 client->sioc = sioc; 1440 object_ref(OBJECT(client->sioc)); 1441 client->ioc = QIO_CHANNEL(sioc); 1442 object_ref(OBJECT(client->ioc)); 1443 client->can_read = true; 1444 client->close = close_fn; 1445 1446 data->client = client; 1447 data->co = qemu_coroutine_create(nbd_co_client_start, data); 1448 qemu_coroutine_enter(data->co); 1449 } 1450