1 /* 2 * Copyright (C) 2016-2018 Red Hat, Inc. 3 * Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws> 4 * 5 * Network Block Device Server Side 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; under version 2 of the License. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "qapi/error.h" 22 #include "qemu/queue.h" 23 #include "trace.h" 24 #include "nbd-internal.h" 25 #include "qemu/units.h" 26 27 #define NBD_META_ID_BASE_ALLOCATION 0 28 #define NBD_META_ID_DIRTY_BITMAP 1 29 30 /* 31 * NBD_MAX_BLOCK_STATUS_EXTENTS: 1 MiB of extents data. An empirical 32 * constant. If an increase is needed, note that the NBD protocol 33 * recommends no larger than 32 mb, so that the client won't consider 34 * the reply as a denial of service attack. 
35 */ 36 #define NBD_MAX_BLOCK_STATUS_EXTENTS (1 * MiB / 8) 37 38 static int system_errno_to_nbd_errno(int err) 39 { 40 switch (err) { 41 case 0: 42 return NBD_SUCCESS; 43 case EPERM: 44 case EROFS: 45 return NBD_EPERM; 46 case EIO: 47 return NBD_EIO; 48 case ENOMEM: 49 return NBD_ENOMEM; 50 #ifdef EDQUOT 51 case EDQUOT: 52 #endif 53 case EFBIG: 54 case ENOSPC: 55 return NBD_ENOSPC; 56 case EOVERFLOW: 57 return NBD_EOVERFLOW; 58 case ESHUTDOWN: 59 return NBD_ESHUTDOWN; 60 case EINVAL: 61 default: 62 return NBD_EINVAL; 63 } 64 } 65 66 /* Definitions for opaque data types */ 67 68 typedef struct NBDRequestData NBDRequestData; 69 70 struct NBDRequestData { 71 QSIMPLEQ_ENTRY(NBDRequestData) entry; 72 NBDClient *client; 73 uint8_t *data; 74 bool complete; 75 }; 76 77 struct NBDExport { 78 int refcount; 79 void (*close)(NBDExport *exp); 80 81 BlockBackend *blk; 82 char *name; 83 char *description; 84 uint64_t dev_offset; 85 uint64_t size; 86 uint16_t nbdflags; 87 QTAILQ_HEAD(, NBDClient) clients; 88 QTAILQ_ENTRY(NBDExport) next; 89 90 AioContext *ctx; 91 92 BlockBackend *eject_notifier_blk; 93 Notifier eject_notifier; 94 95 BdrvDirtyBitmap *export_bitmap; 96 char *export_bitmap_context; 97 }; 98 99 static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports); 100 101 /* NBDExportMetaContexts represents a list of contexts to be exported, 102 * as selected by NBD_OPT_SET_META_CONTEXT. Also used for 103 * NBD_OPT_LIST_META_CONTEXT. 
*/ 104 typedef struct NBDExportMetaContexts { 105 NBDExport *exp; 106 bool valid; /* means that negotiation of the option finished without 107 errors */ 108 bool base_allocation; /* export base:allocation context (block status) */ 109 bool bitmap; /* export qemu:dirty-bitmap:<export bitmap name> */ 110 } NBDExportMetaContexts; 111 112 struct NBDClient { 113 int refcount; 114 void (*close_fn)(NBDClient *client, bool negotiated); 115 116 NBDExport *exp; 117 QCryptoTLSCreds *tlscreds; 118 char *tlsauthz; 119 QIOChannelSocket *sioc; /* The underlying data channel */ 120 QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */ 121 122 Coroutine *recv_coroutine; 123 124 CoMutex send_lock; 125 Coroutine *send_coroutine; 126 127 QTAILQ_ENTRY(NBDClient) next; 128 int nb_requests; 129 bool closing; 130 131 uint32_t check_align; /* If non-zero, check for aligned client requests */ 132 133 bool structured_reply; 134 NBDExportMetaContexts export_meta; 135 136 uint32_t opt; /* Current option being negotiated */ 137 uint32_t optlen; /* remaining length of data in ioc for the option being 138 negotiated now */ 139 }; 140 141 static void nbd_client_receive_next_request(NBDClient *client); 142 143 /* Basic flow for negotiation 144 145 Server Client 146 Negotiate 147 148 or 149 150 Server Client 151 Negotiate #1 152 Option 153 Negotiate #2 154 155 ---- 156 157 followed by 158 159 Server Client 160 Request 161 Response 162 Request 163 Response 164 ... 165 ... 166 Request (type == 2) 167 168 */ 169 170 static inline void set_be_option_rep(NBDOptionReply *rep, uint32_t option, 171 uint32_t type, uint32_t length) 172 { 173 stq_be_p(&rep->magic, NBD_REP_MAGIC); 174 stl_be_p(&rep->option, option); 175 stl_be_p(&rep->type, type); 176 stl_be_p(&rep->length, length); 177 } 178 179 /* Send a reply header, including length, but no payload. 180 * Return -errno on error, 0 on success. 
*/ 181 static int nbd_negotiate_send_rep_len(NBDClient *client, uint32_t type, 182 uint32_t len, Error **errp) 183 { 184 NBDOptionReply rep; 185 186 trace_nbd_negotiate_send_rep_len(client->opt, nbd_opt_lookup(client->opt), 187 type, nbd_rep_lookup(type), len); 188 189 assert(len < NBD_MAX_BUFFER_SIZE); 190 191 set_be_option_rep(&rep, client->opt, type, len); 192 return nbd_write(client->ioc, &rep, sizeof(rep), errp); 193 } 194 195 /* Send a reply header with default 0 length. 196 * Return -errno on error, 0 on success. */ 197 static int nbd_negotiate_send_rep(NBDClient *client, uint32_t type, 198 Error **errp) 199 { 200 return nbd_negotiate_send_rep_len(client, type, 0, errp); 201 } 202 203 /* Send an error reply. 204 * Return -errno on error, 0 on success. */ 205 static int GCC_FMT_ATTR(4, 0) 206 nbd_negotiate_send_rep_verr(NBDClient *client, uint32_t type, 207 Error **errp, const char *fmt, va_list va) 208 { 209 char *msg; 210 int ret; 211 size_t len; 212 213 msg = g_strdup_vprintf(fmt, va); 214 len = strlen(msg); 215 assert(len < 4096); 216 trace_nbd_negotiate_send_rep_err(msg); 217 ret = nbd_negotiate_send_rep_len(client, type, len, errp); 218 if (ret < 0) { 219 goto out; 220 } 221 if (nbd_write(client->ioc, msg, len, errp) < 0) { 222 error_prepend(errp, "write failed (error message): "); 223 ret = -EIO; 224 } else { 225 ret = 0; 226 } 227 228 out: 229 g_free(msg); 230 return ret; 231 } 232 233 /* Send an error reply. 234 * Return -errno on error, 0 on success. */ 235 static int GCC_FMT_ATTR(4, 5) 236 nbd_negotiate_send_rep_err(NBDClient *client, uint32_t type, 237 Error **errp, const char *fmt, ...) 238 { 239 va_list va; 240 int ret; 241 242 va_start(va, fmt); 243 ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va); 244 va_end(va); 245 return ret; 246 } 247 248 /* Drop remainder of the current option, and send a reply with the 249 * given error type and message. Return -errno on read or write 250 * failure; or 0 if connection is still live. 
*/ 251 static int GCC_FMT_ATTR(4, 0) 252 nbd_opt_vdrop(NBDClient *client, uint32_t type, Error **errp, 253 const char *fmt, va_list va) 254 { 255 int ret = nbd_drop(client->ioc, client->optlen, errp); 256 257 client->optlen = 0; 258 if (!ret) { 259 ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va); 260 } 261 return ret; 262 } 263 264 static int GCC_FMT_ATTR(4, 5) 265 nbd_opt_drop(NBDClient *client, uint32_t type, Error **errp, 266 const char *fmt, ...) 267 { 268 int ret; 269 va_list va; 270 271 va_start(va, fmt); 272 ret = nbd_opt_vdrop(client, type, errp, fmt, va); 273 va_end(va); 274 275 return ret; 276 } 277 278 static int GCC_FMT_ATTR(3, 4) 279 nbd_opt_invalid(NBDClient *client, Error **errp, const char *fmt, ...) 280 { 281 int ret; 282 va_list va; 283 284 va_start(va, fmt); 285 ret = nbd_opt_vdrop(client, NBD_REP_ERR_INVALID, errp, fmt, va); 286 va_end(va); 287 288 return ret; 289 } 290 291 /* Read size bytes from the unparsed payload of the current option. 292 * Return -errno on I/O error, 0 if option was completely handled by 293 * sending a reply about inconsistent lengths, or 1 on success. */ 294 static int nbd_opt_read(NBDClient *client, void *buffer, size_t size, 295 Error **errp) 296 { 297 if (size > client->optlen) { 298 return nbd_opt_invalid(client, errp, 299 "Inconsistent lengths in option %s", 300 nbd_opt_lookup(client->opt)); 301 } 302 client->optlen -= size; 303 return qio_channel_read_all(client->ioc, buffer, size, errp) < 0 ? -EIO : 1; 304 } 305 306 /* Drop size bytes from the unparsed payload of the current option. 307 * Return -errno on I/O error, 0 if option was completely handled by 308 * sending a reply about inconsistent lengths, or 1 on success. 
*/ 309 static int nbd_opt_skip(NBDClient *client, size_t size, Error **errp) 310 { 311 if (size > client->optlen) { 312 return nbd_opt_invalid(client, errp, 313 "Inconsistent lengths in option %s", 314 nbd_opt_lookup(client->opt)); 315 } 316 client->optlen -= size; 317 return nbd_drop(client->ioc, size, errp) < 0 ? -EIO : 1; 318 } 319 320 /* nbd_opt_read_name 321 * 322 * Read a string with the format: 323 * uint32_t len (<= NBD_MAX_NAME_SIZE) 324 * len bytes string (not 0-terminated) 325 * 326 * @name should be enough to store NBD_MAX_NAME_SIZE+1. 327 * If @length is non-null, it will be set to the actual string length. 328 * 329 * Return -errno on I/O error, 0 if option was completely handled by 330 * sending a reply about inconsistent lengths, or 1 on success. 331 */ 332 static int nbd_opt_read_name(NBDClient *client, char *name, uint32_t *length, 333 Error **errp) 334 { 335 int ret; 336 uint32_t len; 337 338 ret = nbd_opt_read(client, &len, sizeof(len), errp); 339 if (ret <= 0) { 340 return ret; 341 } 342 len = cpu_to_be32(len); 343 344 if (len > NBD_MAX_NAME_SIZE) { 345 return nbd_opt_invalid(client, errp, 346 "Invalid name length: %" PRIu32, len); 347 } 348 349 ret = nbd_opt_read(client, name, len, errp); 350 if (ret <= 0) { 351 return ret; 352 } 353 name[len] = '\0'; 354 355 if (length) { 356 *length = len; 357 } 358 359 return 1; 360 } 361 362 /* Send a single NBD_REP_SERVER reply to NBD_OPT_LIST, including payload. 363 * Return -errno on error, 0 on success. */ 364 static int nbd_negotiate_send_rep_list(NBDClient *client, NBDExport *exp, 365 Error **errp) 366 { 367 size_t name_len, desc_len; 368 uint32_t len; 369 const char *name = exp->name ? exp->name : ""; 370 const char *desc = exp->description ? 
exp->description : ""; 371 QIOChannel *ioc = client->ioc; 372 int ret; 373 374 trace_nbd_negotiate_send_rep_list(name, desc); 375 name_len = strlen(name); 376 desc_len = strlen(desc); 377 len = name_len + desc_len + sizeof(len); 378 ret = nbd_negotiate_send_rep_len(client, NBD_REP_SERVER, len, errp); 379 if (ret < 0) { 380 return ret; 381 } 382 383 len = cpu_to_be32(name_len); 384 if (nbd_write(ioc, &len, sizeof(len), errp) < 0) { 385 error_prepend(errp, "write failed (name length): "); 386 return -EINVAL; 387 } 388 389 if (nbd_write(ioc, name, name_len, errp) < 0) { 390 error_prepend(errp, "write failed (name buffer): "); 391 return -EINVAL; 392 } 393 394 if (nbd_write(ioc, desc, desc_len, errp) < 0) { 395 error_prepend(errp, "write failed (description buffer): "); 396 return -EINVAL; 397 } 398 399 return 0; 400 } 401 402 /* Process the NBD_OPT_LIST command, with a potential series of replies. 403 * Return -errno on error, 0 on success. */ 404 static int nbd_negotiate_handle_list(NBDClient *client, Error **errp) 405 { 406 NBDExport *exp; 407 assert(client->opt == NBD_OPT_LIST); 408 409 /* For each export, send a NBD_REP_SERVER reply. */ 410 QTAILQ_FOREACH(exp, &exports, next) { 411 if (nbd_negotiate_send_rep_list(client, exp, errp)) { 412 return -EINVAL; 413 } 414 } 415 /* Finish with a NBD_REP_ACK. */ 416 return nbd_negotiate_send_rep(client, NBD_REP_ACK, errp); 417 } 418 419 static void nbd_check_meta_export(NBDClient *client) 420 { 421 client->export_meta.valid &= client->exp == client->export_meta.exp; 422 } 423 424 /* Send a reply to NBD_OPT_EXPORT_NAME. 425 * Return -errno on error, 0 on success. */ 426 static int nbd_negotiate_handle_export_name(NBDClient *client, 427 uint16_t myflags, bool no_zeroes, 428 Error **errp) 429 { 430 char name[NBD_MAX_NAME_SIZE + 1]; 431 char buf[NBD_REPLY_EXPORT_NAME_SIZE] = ""; 432 size_t len; 433 int ret; 434 435 /* Client sends: 436 [20 .. xx] export name (length bytes) 437 Server replies: 438 [ 0 .. 7] size 439 [ 8 .. 
9] export flags 440 [10 .. 133] reserved (0) [unless no_zeroes] 441 */ 442 trace_nbd_negotiate_handle_export_name(); 443 if (client->optlen >= sizeof(name)) { 444 error_setg(errp, "Bad length received"); 445 return -EINVAL; 446 } 447 if (nbd_read(client->ioc, name, client->optlen, "export name", errp) < 0) { 448 return -EIO; 449 } 450 name[client->optlen] = '\0'; 451 client->optlen = 0; 452 453 trace_nbd_negotiate_handle_export_name_request(name); 454 455 client->exp = nbd_export_find(name); 456 if (!client->exp) { 457 error_setg(errp, "export not found"); 458 return -EINVAL; 459 } 460 461 trace_nbd_negotiate_new_style_size_flags(client->exp->size, 462 client->exp->nbdflags | myflags); 463 stq_be_p(buf, client->exp->size); 464 stw_be_p(buf + 8, client->exp->nbdflags | myflags); 465 len = no_zeroes ? 10 : sizeof(buf); 466 ret = nbd_write(client->ioc, buf, len, errp); 467 if (ret < 0) { 468 error_prepend(errp, "write failed: "); 469 return ret; 470 } 471 472 QTAILQ_INSERT_TAIL(&client->exp->clients, client, next); 473 nbd_export_get(client->exp); 474 nbd_check_meta_export(client); 475 476 return 0; 477 } 478 479 /* Send a single NBD_REP_INFO, with a buffer @buf of @length bytes. 480 * The buffer does NOT include the info type prefix. 481 * Return -errno on error, 0 if ready to send more. */ 482 static int nbd_negotiate_send_info(NBDClient *client, 483 uint16_t info, uint32_t length, void *buf, 484 Error **errp) 485 { 486 int rc; 487 488 trace_nbd_negotiate_send_info(info, nbd_info_lookup(info), length); 489 rc = nbd_negotiate_send_rep_len(client, NBD_REP_INFO, 490 sizeof(info) + length, errp); 491 if (rc < 0) { 492 return rc; 493 } 494 info = cpu_to_be16(info); 495 if (nbd_write(client->ioc, &info, sizeof(info), errp) < 0) { 496 return -EIO; 497 } 498 if (nbd_write(client->ioc, buf, length, errp) < 0) { 499 return -EIO; 500 } 501 return 0; 502 } 503 504 /* nbd_reject_length: Handle any unexpected payload. 
505 * @fatal requests that we quit talking to the client, even if we are able 506 * to successfully send an error reply. 507 * Return: 508 * -errno transmission error occurred or @fatal was requested, errp is set 509 * 0 error message successfully sent to client, errp is not set 510 */ 511 static int nbd_reject_length(NBDClient *client, bool fatal, Error **errp) 512 { 513 int ret; 514 515 assert(client->optlen); 516 ret = nbd_opt_invalid(client, errp, "option '%s' has unexpected length", 517 nbd_opt_lookup(client->opt)); 518 if (fatal && !ret) { 519 error_setg(errp, "option '%s' has unexpected length", 520 nbd_opt_lookup(client->opt)); 521 return -EINVAL; 522 } 523 return ret; 524 } 525 526 /* Handle NBD_OPT_INFO and NBD_OPT_GO. 527 * Return -errno on error, 0 if ready for next option, and 1 to move 528 * into transmission phase. */ 529 static int nbd_negotiate_handle_info(NBDClient *client, uint16_t myflags, 530 Error **errp) 531 { 532 int rc; 533 char name[NBD_MAX_NAME_SIZE + 1]; 534 NBDExport *exp; 535 uint16_t requests; 536 uint16_t request; 537 uint32_t namelen; 538 bool sendname = false; 539 bool blocksize = false; 540 uint32_t sizes[3]; 541 char buf[sizeof(uint64_t) + sizeof(uint16_t)]; 542 uint32_t check_align = 0; 543 544 /* Client sends: 545 4 bytes: L, name length (can be 0) 546 L bytes: export name 547 2 bytes: N, number of requests (can be 0) 548 N * 2 bytes: N requests 549 */ 550 rc = nbd_opt_read_name(client, name, &namelen, errp); 551 if (rc <= 0) { 552 return rc; 553 } 554 trace_nbd_negotiate_handle_export_name_request(name); 555 556 rc = nbd_opt_read(client, &requests, sizeof(requests), errp); 557 if (rc <= 0) { 558 return rc; 559 } 560 requests = be16_to_cpu(requests); 561 trace_nbd_negotiate_handle_info_requests(requests); 562 while (requests--) { 563 rc = nbd_opt_read(client, &request, sizeof(request), errp); 564 if (rc <= 0) { 565 return rc; 566 } 567 request = be16_to_cpu(request); 568 trace_nbd_negotiate_handle_info_request(request, 569 
nbd_info_lookup(request)); 570 /* We care about NBD_INFO_NAME and NBD_INFO_BLOCK_SIZE; 571 * everything else is either a request we don't know or 572 * something we send regardless of request */ 573 switch (request) { 574 case NBD_INFO_NAME: 575 sendname = true; 576 break; 577 case NBD_INFO_BLOCK_SIZE: 578 blocksize = true; 579 break; 580 } 581 } 582 if (client->optlen) { 583 return nbd_reject_length(client, false, errp); 584 } 585 586 exp = nbd_export_find(name); 587 if (!exp) { 588 return nbd_negotiate_send_rep_err(client, NBD_REP_ERR_UNKNOWN, 589 errp, "export '%s' not present", 590 name); 591 } 592 593 /* Don't bother sending NBD_INFO_NAME unless client requested it */ 594 if (sendname) { 595 rc = nbd_negotiate_send_info(client, NBD_INFO_NAME, namelen, name, 596 errp); 597 if (rc < 0) { 598 return rc; 599 } 600 } 601 602 /* Send NBD_INFO_DESCRIPTION only if available, regardless of 603 * client request */ 604 if (exp->description) { 605 size_t len = strlen(exp->description); 606 607 rc = nbd_negotiate_send_info(client, NBD_INFO_DESCRIPTION, 608 len, exp->description, errp); 609 if (rc < 0) { 610 return rc; 611 } 612 } 613 614 /* Send NBD_INFO_BLOCK_SIZE always, but tweak the minimum size 615 * according to whether the client requested it, and according to 616 * whether this is OPT_INFO or OPT_GO. */ 617 /* minimum - 1 for back-compat, or actual if client will obey it. */ 618 if (client->opt == NBD_OPT_INFO || blocksize) { 619 check_align = sizes[0] = blk_get_request_alignment(exp->blk); 620 } else { 621 sizes[0] = 1; 622 } 623 assert(sizes[0] <= NBD_MAX_BUFFER_SIZE); 624 /* preferred - Hard-code to 4096 for now. 625 * TODO: is blk_bs(blk)->bl.opt_transfer appropriate? */ 626 sizes[1] = MAX(4096, sizes[0]); 627 /* maximum - At most 32M, but smaller as appropriate. 
*/ 628 sizes[2] = MIN(blk_get_max_transfer(exp->blk), NBD_MAX_BUFFER_SIZE); 629 trace_nbd_negotiate_handle_info_block_size(sizes[0], sizes[1], sizes[2]); 630 sizes[0] = cpu_to_be32(sizes[0]); 631 sizes[1] = cpu_to_be32(sizes[1]); 632 sizes[2] = cpu_to_be32(sizes[2]); 633 rc = nbd_negotiate_send_info(client, NBD_INFO_BLOCK_SIZE, 634 sizeof(sizes), sizes, errp); 635 if (rc < 0) { 636 return rc; 637 } 638 639 /* Send NBD_INFO_EXPORT always */ 640 trace_nbd_negotiate_new_style_size_flags(exp->size, 641 exp->nbdflags | myflags); 642 stq_be_p(buf, exp->size); 643 stw_be_p(buf + 8, exp->nbdflags | myflags); 644 rc = nbd_negotiate_send_info(client, NBD_INFO_EXPORT, 645 sizeof(buf), buf, errp); 646 if (rc < 0) { 647 return rc; 648 } 649 650 /* 651 * If the client is just asking for NBD_OPT_INFO, but forgot to 652 * request block sizes in a situation that would impact 653 * performance, then return an error. But for NBD_OPT_GO, we 654 * tolerate all clients, regardless of alignments. 655 */ 656 if (client->opt == NBD_OPT_INFO && !blocksize && 657 blk_get_request_alignment(exp->blk) > 1) { 658 return nbd_negotiate_send_rep_err(client, 659 NBD_REP_ERR_BLOCK_SIZE_REQD, 660 errp, 661 "request NBD_INFO_BLOCK_SIZE to " 662 "use this export"); 663 } 664 665 /* Final reply */ 666 rc = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp); 667 if (rc < 0) { 668 return rc; 669 } 670 671 if (client->opt == NBD_OPT_GO) { 672 client->exp = exp; 673 client->check_align = check_align; 674 QTAILQ_INSERT_TAIL(&client->exp->clients, client, next); 675 nbd_export_get(client->exp); 676 nbd_check_meta_export(client); 677 rc = 1; 678 } 679 return rc; 680 } 681 682 683 /* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the 684 * new channel for all further (now-encrypted) communication. 
*/ 685 static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client, 686 Error **errp) 687 { 688 QIOChannel *ioc; 689 QIOChannelTLS *tioc; 690 struct NBDTLSHandshakeData data = { 0 }; 691 692 assert(client->opt == NBD_OPT_STARTTLS); 693 694 trace_nbd_negotiate_handle_starttls(); 695 ioc = client->ioc; 696 697 if (nbd_negotiate_send_rep(client, NBD_REP_ACK, errp) < 0) { 698 return NULL; 699 } 700 701 tioc = qio_channel_tls_new_server(ioc, 702 client->tlscreds, 703 client->tlsauthz, 704 errp); 705 if (!tioc) { 706 return NULL; 707 } 708 709 qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-server-tls"); 710 trace_nbd_negotiate_handle_starttls_handshake(); 711 data.loop = g_main_loop_new(g_main_context_default(), FALSE); 712 qio_channel_tls_handshake(tioc, 713 nbd_tls_handshake, 714 &data, 715 NULL, 716 NULL); 717 718 if (!data.complete) { 719 g_main_loop_run(data.loop); 720 } 721 g_main_loop_unref(data.loop); 722 if (data.error) { 723 object_unref(OBJECT(tioc)); 724 error_propagate(errp, data.error); 725 return NULL; 726 } 727 728 return QIO_CHANNEL(tioc); 729 } 730 731 /* nbd_negotiate_send_meta_context 732 * 733 * Send one chunk of reply to NBD_OPT_{LIST,SET}_META_CONTEXT 734 * 735 * For NBD_OPT_LIST_META_CONTEXT @context_id is ignored, 0 is used instead. 736 */ 737 static int nbd_negotiate_send_meta_context(NBDClient *client, 738 const char *context, 739 uint32_t context_id, 740 Error **errp) 741 { 742 NBDOptionReplyMetaContext opt; 743 struct iovec iov[] = { 744 {.iov_base = &opt, .iov_len = sizeof(opt)}, 745 {.iov_base = (void *)context, .iov_len = strlen(context)} 746 }; 747 748 if (client->opt == NBD_OPT_LIST_META_CONTEXT) { 749 context_id = 0; 750 } 751 752 trace_nbd_negotiate_meta_query_reply(context, context_id); 753 set_be_option_rep(&opt.h, client->opt, NBD_REP_META_CONTEXT, 754 sizeof(opt) - sizeof(opt.h) + iov[1].iov_len); 755 stl_be_p(&opt.context_id, context_id); 756 757 return qio_channel_writev_all(client->ioc, iov, 2, errp) < 0 ? 
-EIO : 0; 758 } 759 760 /* Read strlen(@pattern) bytes, and set @match to true if they match @pattern. 761 * @match is never set to false. 762 * 763 * Return -errno on I/O error, 0 if option was completely handled by 764 * sending a reply about inconsistent lengths, or 1 on success. 765 * 766 * Note: return code = 1 doesn't mean that we've read exactly @pattern. 767 * It only means that there are no errors. 768 */ 769 static int nbd_meta_pattern(NBDClient *client, const char *pattern, bool *match, 770 Error **errp) 771 { 772 int ret; 773 char *query; 774 size_t len = strlen(pattern); 775 776 assert(len); 777 778 query = g_malloc(len); 779 ret = nbd_opt_read(client, query, len, errp); 780 if (ret <= 0) { 781 g_free(query); 782 return ret; 783 } 784 785 if (strncmp(query, pattern, len) == 0) { 786 trace_nbd_negotiate_meta_query_parse(pattern); 787 *match = true; 788 } else { 789 trace_nbd_negotiate_meta_query_skip("pattern not matched"); 790 } 791 g_free(query); 792 793 return 1; 794 } 795 796 /* 797 * Read @len bytes, and set @match to true if they match @pattern, or if @len 798 * is 0 and the client is performing _LIST_. @match is never set to false. 799 * 800 * Return -errno on I/O error, 0 if option was completely handled by 801 * sending a reply about inconsistent lengths, or 1 on success. 802 * 803 * Note: return code = 1 doesn't mean that we've read exactly @pattern. 804 * It only means that there are no errors. 
805 */ 806 static int nbd_meta_empty_or_pattern(NBDClient *client, const char *pattern, 807 uint32_t len, bool *match, Error **errp) 808 { 809 if (len == 0) { 810 if (client->opt == NBD_OPT_LIST_META_CONTEXT) { 811 *match = true; 812 } 813 trace_nbd_negotiate_meta_query_parse("empty"); 814 return 1; 815 } 816 817 if (len != strlen(pattern)) { 818 trace_nbd_negotiate_meta_query_skip("different lengths"); 819 return nbd_opt_skip(client, len, errp); 820 } 821 822 return nbd_meta_pattern(client, pattern, match, errp); 823 } 824 825 /* nbd_meta_base_query 826 * 827 * Handle queries to 'base' namespace. For now, only the base:allocation 828 * context is available. 'len' is the amount of text remaining to be read from 829 * the current name, after the 'base:' portion has been stripped. 830 * 831 * Return -errno on I/O error, 0 if option was completely handled by 832 * sending a reply about inconsistent lengths, or 1 on success. 833 */ 834 static int nbd_meta_base_query(NBDClient *client, NBDExportMetaContexts *meta, 835 uint32_t len, Error **errp) 836 { 837 return nbd_meta_empty_or_pattern(client, "allocation", len, 838 &meta->base_allocation, errp); 839 } 840 841 /* nbd_meta_bitmap_query 842 * 843 * Handle query to 'qemu:' namespace. 844 * @len is the amount of text remaining to be read from the current name, after 845 * the 'qemu:' portion has been stripped. 846 * 847 * Return -errno on I/O error, 0 if option was completely handled by 848 * sending a reply about inconsistent lengths, or 1 on success. 
*/ 849 static int nbd_meta_qemu_query(NBDClient *client, NBDExportMetaContexts *meta, 850 uint32_t len, Error **errp) 851 { 852 bool dirty_bitmap = false; 853 size_t dirty_bitmap_len = strlen("dirty-bitmap:"); 854 int ret; 855 856 if (!meta->exp->export_bitmap) { 857 trace_nbd_negotiate_meta_query_skip("no dirty-bitmap exported"); 858 return nbd_opt_skip(client, len, errp); 859 } 860 861 if (len == 0) { 862 if (client->opt == NBD_OPT_LIST_META_CONTEXT) { 863 meta->bitmap = true; 864 } 865 trace_nbd_negotiate_meta_query_parse("empty"); 866 return 1; 867 } 868 869 if (len < dirty_bitmap_len) { 870 trace_nbd_negotiate_meta_query_skip("not dirty-bitmap:"); 871 return nbd_opt_skip(client, len, errp); 872 } 873 874 len -= dirty_bitmap_len; 875 ret = nbd_meta_pattern(client, "dirty-bitmap:", &dirty_bitmap, errp); 876 if (ret <= 0) { 877 return ret; 878 } 879 if (!dirty_bitmap) { 880 trace_nbd_negotiate_meta_query_skip("not dirty-bitmap:"); 881 return nbd_opt_skip(client, len, errp); 882 } 883 884 trace_nbd_negotiate_meta_query_parse("dirty-bitmap:"); 885 886 return nbd_meta_empty_or_pattern( 887 client, meta->exp->export_bitmap_context + 888 strlen("qemu:dirty_bitmap:"), len, &meta->bitmap, errp); 889 } 890 891 /* nbd_negotiate_meta_query 892 * 893 * Parse namespace name and call corresponding function to parse body of the 894 * query. 895 * 896 * The only supported namespace now is 'base'. 897 * 898 * The function aims not wasting time and memory to read long unknown namespace 899 * names. 900 * 901 * Return -errno on I/O error, 0 if option was completely handled by 902 * sending a reply about inconsistent lengths, or 1 on success. */ 903 static int nbd_negotiate_meta_query(NBDClient *client, 904 NBDExportMetaContexts *meta, Error **errp) 905 { 906 /* 907 * Both 'qemu' and 'base' namespaces have length = 5 including a 908 * colon. If another length namespace is later introduced, this 909 * should certainly be refactored. 
910 */ 911 int ret; 912 size_t ns_len = 5; 913 char ns[5]; 914 uint32_t len; 915 916 ret = nbd_opt_read(client, &len, sizeof(len), errp); 917 if (ret <= 0) { 918 return ret; 919 } 920 len = cpu_to_be32(len); 921 922 if (len < ns_len) { 923 trace_nbd_negotiate_meta_query_skip("length too short"); 924 return nbd_opt_skip(client, len, errp); 925 } 926 927 len -= ns_len; 928 ret = nbd_opt_read(client, ns, ns_len, errp); 929 if (ret <= 0) { 930 return ret; 931 } 932 933 if (!strncmp(ns, "base:", ns_len)) { 934 trace_nbd_negotiate_meta_query_parse("base:"); 935 return nbd_meta_base_query(client, meta, len, errp); 936 } else if (!strncmp(ns, "qemu:", ns_len)) { 937 trace_nbd_negotiate_meta_query_parse("qemu:"); 938 return nbd_meta_qemu_query(client, meta, len, errp); 939 } 940 941 trace_nbd_negotiate_meta_query_skip("unknown namespace"); 942 return nbd_opt_skip(client, len, errp); 943 } 944 945 /* nbd_negotiate_meta_queries 946 * Handle NBD_OPT_LIST_META_CONTEXT and NBD_OPT_SET_META_CONTEXT 947 * 948 * Return -errno on I/O error, or 0 if option was completely handled. */ 949 static int nbd_negotiate_meta_queries(NBDClient *client, 950 NBDExportMetaContexts *meta, Error **errp) 951 { 952 int ret; 953 char export_name[NBD_MAX_NAME_SIZE + 1]; 954 NBDExportMetaContexts local_meta; 955 uint32_t nb_queries; 956 int i; 957 958 if (!client->structured_reply) { 959 return nbd_opt_invalid(client, errp, 960 "request option '%s' when structured reply " 961 "is not negotiated", 962 nbd_opt_lookup(client->opt)); 963 } 964 965 if (client->opt == NBD_OPT_LIST_META_CONTEXT) { 966 /* Only change the caller's meta on SET. 
*/ 967 meta = &local_meta; 968 } 969 970 memset(meta, 0, sizeof(*meta)); 971 972 ret = nbd_opt_read_name(client, export_name, NULL, errp); 973 if (ret <= 0) { 974 return ret; 975 } 976 977 meta->exp = nbd_export_find(export_name); 978 if (meta->exp == NULL) { 979 return nbd_opt_drop(client, NBD_REP_ERR_UNKNOWN, errp, 980 "export '%s' not present", export_name); 981 } 982 983 ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), errp); 984 if (ret <= 0) { 985 return ret; 986 } 987 nb_queries = cpu_to_be32(nb_queries); 988 trace_nbd_negotiate_meta_context(nbd_opt_lookup(client->opt), 989 export_name, nb_queries); 990 991 if (client->opt == NBD_OPT_LIST_META_CONTEXT && !nb_queries) { 992 /* enable all known contexts */ 993 meta->base_allocation = true; 994 meta->bitmap = !!meta->exp->export_bitmap; 995 } else { 996 for (i = 0; i < nb_queries; ++i) { 997 ret = nbd_negotiate_meta_query(client, meta, errp); 998 if (ret <= 0) { 999 return ret; 1000 } 1001 } 1002 } 1003 1004 if (meta->base_allocation) { 1005 ret = nbd_negotiate_send_meta_context(client, "base:allocation", 1006 NBD_META_ID_BASE_ALLOCATION, 1007 errp); 1008 if (ret < 0) { 1009 return ret; 1010 } 1011 } 1012 1013 if (meta->bitmap) { 1014 ret = nbd_negotiate_send_meta_context(client, 1015 meta->exp->export_bitmap_context, 1016 NBD_META_ID_DIRTY_BITMAP, 1017 errp); 1018 if (ret < 0) { 1019 return ret; 1020 } 1021 } 1022 1023 ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp); 1024 if (ret == 0) { 1025 meta->valid = true; 1026 } 1027 1028 return ret; 1029 } 1030 1031 /* nbd_negotiate_options 1032 * Process all NBD_OPT_* client option commands, during fixed newstyle 1033 * negotiation. 1034 * Return: 1035 * -errno on error, errp is set 1036 * 0 on successful negotiation, errp is not set 1037 * 1 if client sent NBD_OPT_ABORT, i.e. 
on valid disconnect, 1038 * errp is not set 1039 */ 1040 static int nbd_negotiate_options(NBDClient *client, uint16_t myflags, 1041 Error **errp) 1042 { 1043 uint32_t flags; 1044 bool fixedNewstyle = false; 1045 bool no_zeroes = false; 1046 1047 /* Client sends: 1048 [ 0 .. 3] client flags 1049 1050 Then we loop until NBD_OPT_EXPORT_NAME or NBD_OPT_GO: 1051 [ 0 .. 7] NBD_OPTS_MAGIC 1052 [ 8 .. 11] NBD option 1053 [12 .. 15] Data length 1054 ... Rest of request 1055 1056 [ 0 .. 7] NBD_OPTS_MAGIC 1057 [ 8 .. 11] Second NBD option 1058 [12 .. 15] Data length 1059 ... Rest of request 1060 */ 1061 1062 if (nbd_read32(client->ioc, &flags, "flags", errp) < 0) { 1063 return -EIO; 1064 } 1065 trace_nbd_negotiate_options_flags(flags); 1066 if (flags & NBD_FLAG_C_FIXED_NEWSTYLE) { 1067 fixedNewstyle = true; 1068 flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE; 1069 } 1070 if (flags & NBD_FLAG_C_NO_ZEROES) { 1071 no_zeroes = true; 1072 flags &= ~NBD_FLAG_C_NO_ZEROES; 1073 } 1074 if (flags != 0) { 1075 error_setg(errp, "Unknown client flags 0x%" PRIx32 " received", flags); 1076 return -EINVAL; 1077 } 1078 1079 while (1) { 1080 int ret; 1081 uint32_t option, length; 1082 uint64_t magic; 1083 1084 if (nbd_read64(client->ioc, &magic, "opts magic", errp) < 0) { 1085 return -EINVAL; 1086 } 1087 trace_nbd_negotiate_options_check_magic(magic); 1088 if (magic != NBD_OPTS_MAGIC) { 1089 error_setg(errp, "Bad magic received"); 1090 return -EINVAL; 1091 } 1092 1093 if (nbd_read32(client->ioc, &option, "option", errp) < 0) { 1094 return -EINVAL; 1095 } 1096 client->opt = option; 1097 1098 if (nbd_read32(client->ioc, &length, "option length", errp) < 0) { 1099 return -EINVAL; 1100 } 1101 assert(!client->optlen); 1102 client->optlen = length; 1103 1104 if (length > NBD_MAX_BUFFER_SIZE) { 1105 error_setg(errp, "len (%" PRIu32" ) is larger than max len (%u)", 1106 length, NBD_MAX_BUFFER_SIZE); 1107 return -EINVAL; 1108 } 1109 1110 trace_nbd_negotiate_options_check_option(option, 1111 
nbd_opt_lookup(option)); 1112 if (client->tlscreds && 1113 client->ioc == (QIOChannel *)client->sioc) { 1114 QIOChannel *tioc; 1115 if (!fixedNewstyle) { 1116 error_setg(errp, "Unsupported option 0x%" PRIx32, option); 1117 return -EINVAL; 1118 } 1119 switch (option) { 1120 case NBD_OPT_STARTTLS: 1121 if (length) { 1122 /* Unconditionally drop the connection if the client 1123 * can't start a TLS negotiation correctly */ 1124 return nbd_reject_length(client, true, errp); 1125 } 1126 tioc = nbd_negotiate_handle_starttls(client, errp); 1127 if (!tioc) { 1128 return -EIO; 1129 } 1130 ret = 0; 1131 object_unref(OBJECT(client->ioc)); 1132 client->ioc = QIO_CHANNEL(tioc); 1133 break; 1134 1135 case NBD_OPT_EXPORT_NAME: 1136 /* No way to return an error to client, so drop connection */ 1137 error_setg(errp, "Option 0x%x not permitted before TLS", 1138 option); 1139 return -EINVAL; 1140 1141 default: 1142 /* Let the client keep trying, unless they asked to 1143 * quit. Always try to give an error back to the 1144 * client; but when replying to OPT_ABORT, be aware 1145 * that the client may hang up before receiving the 1146 * error, in which case we are fine ignoring the 1147 * resulting EPIPE. */ 1148 ret = nbd_opt_drop(client, NBD_REP_ERR_TLS_REQD, 1149 option == NBD_OPT_ABORT ? NULL : errp, 1150 "Option 0x%" PRIx32 1151 " not permitted before TLS", option); 1152 if (option == NBD_OPT_ABORT) { 1153 return 1; 1154 } 1155 break; 1156 } 1157 } else if (fixedNewstyle) { 1158 switch (option) { 1159 case NBD_OPT_LIST: 1160 if (length) { 1161 ret = nbd_reject_length(client, false, errp); 1162 } else { 1163 ret = nbd_negotiate_handle_list(client, errp); 1164 } 1165 break; 1166 1167 case NBD_OPT_ABORT: 1168 /* NBD spec says we must try to reply before 1169 * disconnecting, but that we must also tolerate 1170 * guests that don't wait for our reply. 
                 */
                nbd_negotiate_send_rep(client, NBD_REP_ACK, NULL);
                return 1;

            case NBD_OPT_EXPORT_NAME:
                return nbd_negotiate_handle_export_name(client,
                                                        myflags, no_zeroes,
                                                        errp);

            case NBD_OPT_INFO:
            case NBD_OPT_GO:
                ret = nbd_negotiate_handle_info(client, myflags, errp);
                if (ret == 1) {
                    /* ret == 1 means the transmission phase begins; only
                     * NBD_OPT_GO may end negotiation this way */
                    assert(option == NBD_OPT_GO);
                    return 0;
                }
                break;

            case NBD_OPT_STARTTLS:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else if (client->tlscreds) {
                    ret = nbd_negotiate_send_rep_err(client,
                                                     NBD_REP_ERR_INVALID, errp,
                                                     "TLS already enabled");
                } else {
                    ret = nbd_negotiate_send_rep_err(client,
                                                     NBD_REP_ERR_POLICY, errp,
                                                     "TLS not configured");
                }
                break;

            case NBD_OPT_STRUCTURED_REPLY:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else if (client->structured_reply) {
                    ret = nbd_negotiate_send_rep_err(
                        client, NBD_REP_ERR_INVALID, errp,
                        "structured reply already negotiated");
                } else {
                    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
                    client->structured_reply = true;
                    /* NBD_FLAG_SEND_DF is only advertised once structured
                     * replies have been negotiated */
                    myflags |= NBD_FLAG_SEND_DF;
                }
                break;

            case NBD_OPT_LIST_META_CONTEXT:
            case NBD_OPT_SET_META_CONTEXT:
                ret = nbd_negotiate_meta_queries(client, &client->export_meta,
                                                 errp);
                break;

            default:
                ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
                                   "Unsupported option %" PRIu32 " (%s)",
                                   option, nbd_opt_lookup(option));
                break;
            }
        } else {
            /*
             * If broken new-style we should drop the connection
             * for anything except NBD_OPT_EXPORT_NAME
             */
            switch (option) {
            case NBD_OPT_EXPORT_NAME:
                return nbd_negotiate_handle_export_name(client,
                                                        myflags, no_zeroes,
                                                        errp);

            default:
                error_setg(errp, "Unsupported option %" PRIu32 " (%s)",
                           option, nbd_opt_lookup(option));
                return -EINVAL;
            }
        }
        if (ret < 0) {
            return ret;
        }
    }
}

/* nbd_negotiate
 * Return:
 * -errno  on error, errp is set
 * 0       on successful negotiation, errp is not set
 * 1       if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
 *         errp is not set
 */
static coroutine_fn int nbd_negotiate(NBDClient *client, Error **errp)
{
    char buf[NBD_OLDSTYLE_NEGOTIATE_SIZE] = "";
    int ret;
    const uint16_t myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
                              NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA |
                              NBD_FLAG_SEND_WRITE_ZEROES | NBD_FLAG_SEND_CACHE);

    /* Old style negotiation header, no room for options
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
        [16 ..  23]   size
        [24 ..  27]   export flags (zero-extended)
        [28 .. 151]   reserved     (0)

       New style negotiation header, client can send options
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
        [16 ..  17]   server flags (0)
        ....options sent, ending in NBD_OPT_EXPORT_NAME or NBD_OPT_GO....
     */

    qio_channel_set_blocking(client->ioc, false, NULL);

    trace_nbd_negotiate_begin();
    memcpy(buf, "NBDMAGIC", 8);

    stq_be_p(buf + 8, NBD_OPTS_MAGIC);
    stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES);

    /* 18 bytes: 8-byte passwd + 8-byte magic + 2-byte server flags */
    if (nbd_write(client->ioc, buf, 18, errp) < 0) {
        error_prepend(errp, "write failed: ");
        return -EINVAL;
    }
    ret = nbd_negotiate_options(client, myflags, errp);
    if (ret != 0) {
        if (ret < 0) {
            error_prepend(errp, "option negotiation failed: ");
        }
        return ret;
    }

    /* All option payload must have been consumed during negotiation */
    assert(!client->optlen);
    trace_nbd_negotiate_success();

    return 0;
}

/* Read one fixed-size transmission-phase request header from @ioc and
 * decode it into @request. Returns 0 on success, negative errno on
 * read failure or bad magic. */
static int nbd_receive_request(QIOChannel *ioc, NBDRequest *request,
                               Error **errp)
{
    uint8_t buf[NBD_REQUEST_SIZE];
    uint32_t magic;
    int ret;

    ret = nbd_read(ioc, buf, sizeof(buf), "request", errp);
    if (ret < 0) {
        return ret;
    }

    /* Request
       [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
       [ 4 ..  5]   flags   (NBD_CMD_FLAG_FUA, ...)
       [ 6 ..  7]   type    (NBD_CMD_READ, ...)
       [ 8 .. 15]   handle
       [16 .. 23]   from
       [24 ..
 27]   len
     */

    magic = ldl_be_p(buf);
    request->flags  = lduw_be_p(buf + 4);
    request->type   = lduw_be_p(buf + 6);
    request->handle = ldq_be_p(buf + 8);
    request->from   = ldq_be_p(buf + 16);
    request->len    = ldl_be_p(buf + 24);

    trace_nbd_receive_request(magic, request->flags, request->type,
                              request->from, request->len);

    if (magic != NBD_REQUEST_MAGIC) {
        error_setg(errp, "invalid magic (got 0x%" PRIx32 ")", magic);
        return -EINVAL;
    }
    return 0;
}

/* Maximum number of in-flight requests accepted per client */
#define MAX_NBD_REQUESTS 16

/* Take a reference on @client */
void nbd_client_get(NBDClient *client)
{
    client->refcount++;
}

/* Drop a reference on @client; the last reference frees it */
void nbd_client_put(NBDClient *client)
{
    if (--client->refcount == 0) {
        /* The last reference should be dropped by client->close,
         * which is called by client_close.
         */
        assert(client->closing);

        qio_channel_detach_aio_context(client->ioc);
        object_unref(OBJECT(client->sioc));
        object_unref(OBJECT(client->ioc));
        if (client->tlscreds) {
            object_unref(OBJECT(client->tlscreds));
        }
        g_free(client->tlsauthz);
        if (client->exp) {
            QTAILQ_REMOVE(&client->exp->clients, client, next);
            nbd_export_put(client->exp);
        }
        g_free(client);
    }
}

/* Begin shutting down @client; idempotent thanks to the closing flag.
 * @negotiated is forwarded to the owner's close_fn callback. */
static void client_close(NBDClient *client, bool negotiated)
{
    if (client->closing) {
        return;
    }

    client->closing = true;

    /* Force requests to finish. They will drop their own references,
     * then we'll close the socket and free the NBDClient.
     */
    qio_channel_shutdown(client->ioc, QIO_CHANNEL_SHUTDOWN_BOTH,
                         NULL);

    /* Also tell the client, so that they release their reference.
     */
    if (client->close_fn) {
        client->close_fn(client, negotiated);
    }
}

/* Allocate a request-tracking struct; takes a client reference and
 * counts against MAX_NBD_REQUESTS */
static NBDRequestData *nbd_request_get(NBDClient *client)
{
    NBDRequestData *req;

    assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
    client->nb_requests++;

    req = g_new0(NBDRequestData, 1);
    nbd_client_get(client);
    req->client = client;
    return req;
}

/* Release a request; may kick off reception of the next request, and
 * drops the client reference taken in nbd_request_get() */
static void nbd_request_put(NBDRequestData *req)
{
    NBDClient *client = req->client;

    if (req->data) {
        qemu_vfree(req->data);
    }
    g_free(req);

    client->nb_requests--;
    nbd_client_receive_next_request(client);

    nbd_client_put(client);
}

/* AioContext notifier: re-attach all of the export's clients (and any
 * suspended coroutines) to the new context */
static void blk_aio_attached(AioContext *ctx, void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    trace_nbd_blk_aio_attached(exp->name, ctx);

    exp->ctx = ctx;

    QTAILQ_FOREACH(client, &exp->clients, next) {
        qio_channel_attach_aio_context(client->ioc, ctx);
        if (client->recv_coroutine) {
            aio_co_schedule(ctx, client->recv_coroutine);
        }
        if (client->send_coroutine) {
            aio_co_schedule(ctx, client->send_coroutine);
        }
    }
}

/* AioContext notifier: detach all of the export's clients */
static void blk_aio_detach(void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    trace_nbd_blk_aio_detach(exp->name, exp->ctx);

    QTAILQ_FOREACH(client, &exp->clients, next) {
        qio_channel_detach_aio_context(client->ioc);
    }

    exp->ctx = NULL;
}

/* Close the export when its medium is ejected */
static void nbd_eject_notifier(Notifier *n, void *data)
{
    NBDExport *exp = container_of(n, NBDExport, eject_notifier);
    nbd_export_close(exp);
}

NBDExport *nbd_export_new(BlockDriverState *bs, uint64_t dev_offset,
                          uint64_t size, const char *name, const char *desc,
                          const char *bitmap, uint16_t nbdflags,
                          void (*close)(NBDExport *), bool writethrough,
                          BlockBackend *on_eject_blk, Error **errp)
{
    AioContext *ctx;
    BlockBackend *blk;
    NBDExport *exp = g_new0(NBDExport, 1);
    uint64_t perm;
    int ret;

    /*
     * NBD exports are used for non-shared storage migration.  Make sure
     * that BDRV_O_INACTIVE is cleared and the image is ready for write
     * access since the export could be available before migration handover.
     */
    assert(name);
    ctx = bdrv_get_aio_context(bs);
    aio_context_acquire(ctx);
    bdrv_invalidate_cache(bs, NULL);
    aio_context_release(ctx);

    /* Don't allow resize while the NBD server is running, otherwise we don't
     * care what happens with the node. */
    perm = BLK_PERM_CONSISTENT_READ;
    if ((nbdflags & NBD_FLAG_READ_ONLY) == 0) {
        perm |= BLK_PERM_WRITE;
    }
    blk = blk_new(bdrv_get_aio_context(bs), perm,
                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
                  BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
    ret = blk_insert_bs(blk, bs, errp);
    if (ret < 0) {
        goto fail;
    }
    blk_set_enable_write_cache(blk, !writethrough);
    blk_set_allow_aio_context_change(blk, true);

    exp->refcount = 1;
    QTAILQ_INIT(&exp->clients);
    exp->blk = blk;
    assert(dev_offset <= INT64_MAX);
    exp->dev_offset = dev_offset;
    exp->name = g_strdup(name);
    exp->description = g_strdup(desc);
    exp->nbdflags = nbdflags;
    assert(size <= INT64_MAX - dev_offset);
    exp->size = QEMU_ALIGN_DOWN(size, BDRV_SECTOR_SIZE);

    if (bitmap) {
        BdrvDirtyBitmap *bm = NULL;

        /* Search @bs and then its backing chain for the named bitmap */
        while (true) {
            bm = bdrv_find_dirty_bitmap(bs, bitmap);
            if (bm != NULL || bs->backing == NULL) {
                break;
            }

            bs = bs->backing->bs;
        }

        if (bm == NULL) {
            error_setg(errp, "Bitmap '%s' is not found", bitmap);
            goto fail;
        }

        if (bdrv_dirty_bitmap_check(bm, BDRV_BITMAP_ALLOW_RO, errp)) {
            goto fail;
        }

        if ((nbdflags & NBD_FLAG_READ_ONLY) && bdrv_is_writable(bs) &&
            bdrv_dirty_bitmap_enabled(bm)) {
            error_setg(errp,
                       "Enabled bitmap '%s' incompatible with readonly export",
                       bitmap);
            goto fail;
        }

        /* Mark the bitmap busy so nothing else mutates it while exported */
        bdrv_dirty_bitmap_set_busy(bm, true);
        exp->export_bitmap = bm;
        exp->export_bitmap_context = g_strdup_printf("qemu:dirty-bitmap:%s",
                                                     bitmap);
    }

    exp->close = close;
    exp->ctx = blk_get_aio_context(blk);
    blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);

    if (on_eject_blk) {
        blk_ref(on_eject_blk);
        exp->eject_notifier_blk = on_eject_blk;
        exp->eject_notifier.notify = nbd_eject_notifier;
        blk_add_remove_bs_notifier(on_eject_blk, &exp->eject_notifier);
    }
    QTAILQ_INSERT_TAIL(&exports, exp, next);
    /* Extra reference held by presence on the exports list */
    nbd_export_get(exp);
    return exp;

fail:
    blk_unref(blk);
    g_free(exp->name);
    g_free(exp->description);
    g_free(exp);
    return NULL;
}

/* Look up an export by name; returns NULL if not found */
NBDExport *nbd_export_find(const char *name)
{
    NBDExport *exp;
    QTAILQ_FOREACH(exp, &exports, next) {
        if (strcmp(name, exp->name) == 0) {
            return exp;
        }
    }

    return NULL;
}

void nbd_export_close(NBDExport *exp)
{
    NBDClient *client, *next;

    nbd_export_get(exp);
    /*
     * TODO: Should we expand QMP NbdServerRemoveNode enum to allow a
     * close mode that stops advertising the export to new clients but
     * still permits existing clients to run to completion? Because of
     * that possibility, nbd_export_close() can be called more than
     * once on an export.
     */
    QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
        client_close(client, true);
    }
    if (exp->name) {
        /* Drop the list reference and stop advertising the export */
        nbd_export_put(exp);
        g_free(exp->name);
        exp->name = NULL;
        QTAILQ_REMOVE(&exports, exp, next);
    }
    g_free(exp->description);
    exp->description = NULL;
    nbd_export_put(exp);
}

/* Remove an export per QMP semantics: SAFE mode refuses while clients
 * are connected; HARD mode force-disconnects them */
void nbd_export_remove(NBDExport *exp, NbdServerRemoveMode mode, Error **errp)
{
    if (mode == NBD_SERVER_REMOVE_MODE_HARD || QTAILQ_EMPTY(&exp->clients)) {
        nbd_export_close(exp);
        return;
    }

    assert(mode == NBD_SERVER_REMOVE_MODE_SAFE);

    error_setg(errp, "export '%s' still in use", exp->name);
    error_append_hint(errp, "Use mode='hard' to force client disconnect\n");
}

/* Take a reference on @exp */
void nbd_export_get(NBDExport *exp)
{
    assert(exp->refcount > 0);
    exp->refcount++;
}

/* Drop a reference on @exp; the last reference releases all resources */
void nbd_export_put(NBDExport *exp)
{
    assert(exp->refcount > 0);
    if (exp->refcount == 1) {
        nbd_export_close(exp);
    }

    /* nbd_export_close() may theoretically reduce refcount to 0. It may happen
     * if someone calls nbd_export_put() on named export not through
     * nbd_export_set_name() when refcount is 1. So, let's assert that
     * it is > 0.
     */
    assert(exp->refcount > 0);
    if (--exp->refcount == 0) {
        assert(exp->name == NULL);
        assert(exp->description == NULL);

        if (exp->close) {
            exp->close(exp);
        }

        if (exp->blk) {
            if (exp->eject_notifier_blk) {
                notifier_remove(&exp->eject_notifier);
                blk_unref(exp->eject_notifier_blk);
            }
            blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
                                            blk_aio_detach, exp);
            blk_unref(exp->blk);
            exp->blk = NULL;
        }

        if (exp->export_bitmap) {
            /* Release the busy claim taken in nbd_export_new() */
            bdrv_dirty_bitmap_set_busy(exp->export_bitmap, false);
            g_free(exp->export_bitmap_context);
        }

        g_free(exp);
    }
}

BlockBackend *nbd_export_get_blockdev(NBDExport *exp)
{
    return exp->blk;
}

/* Close every export on the exports list */
void nbd_export_close_all(void)
{
    NBDExport *exp, *next;

    QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
        nbd_export_close(exp);
    }
}

/* Serialize a vectored write to the client under send_lock; returns 0
 * on success, -EIO if the channel write fails */
static int coroutine_fn nbd_co_send_iov(NBDClient *client, struct iovec *iov,
                                        unsigned niov, Error **errp)
{
    int ret;

    g_assert(qemu_in_coroutine());
    qemu_co_mutex_lock(&client->send_lock);
    client->send_coroutine = qemu_coroutine_self();

    ret = qio_channel_writev_all(client->ioc, iov, niov, errp) < 0 ?
-EIO : 0; 1691 1692 client->send_coroutine = NULL; 1693 qemu_co_mutex_unlock(&client->send_lock); 1694 1695 return ret; 1696 } 1697 1698 static inline void set_be_simple_reply(NBDSimpleReply *reply, uint64_t error, 1699 uint64_t handle) 1700 { 1701 stl_be_p(&reply->magic, NBD_SIMPLE_REPLY_MAGIC); 1702 stl_be_p(&reply->error, error); 1703 stq_be_p(&reply->handle, handle); 1704 } 1705 1706 static int nbd_co_send_simple_reply(NBDClient *client, 1707 uint64_t handle, 1708 uint32_t error, 1709 void *data, 1710 size_t len, 1711 Error **errp) 1712 { 1713 NBDSimpleReply reply; 1714 int nbd_err = system_errno_to_nbd_errno(error); 1715 struct iovec iov[] = { 1716 {.iov_base = &reply, .iov_len = sizeof(reply)}, 1717 {.iov_base = data, .iov_len = len} 1718 }; 1719 1720 trace_nbd_co_send_simple_reply(handle, nbd_err, nbd_err_lookup(nbd_err), 1721 len); 1722 set_be_simple_reply(&reply, nbd_err, handle); 1723 1724 return nbd_co_send_iov(client, iov, len ? 2 : 1, errp); 1725 } 1726 1727 static inline void set_be_chunk(NBDStructuredReplyChunk *chunk, uint16_t flags, 1728 uint16_t type, uint64_t handle, uint32_t length) 1729 { 1730 stl_be_p(&chunk->magic, NBD_STRUCTURED_REPLY_MAGIC); 1731 stw_be_p(&chunk->flags, flags); 1732 stw_be_p(&chunk->type, type); 1733 stq_be_p(&chunk->handle, handle); 1734 stl_be_p(&chunk->length, length); 1735 } 1736 1737 static int coroutine_fn nbd_co_send_structured_done(NBDClient *client, 1738 uint64_t handle, 1739 Error **errp) 1740 { 1741 NBDStructuredReplyChunk chunk; 1742 struct iovec iov[] = { 1743 {.iov_base = &chunk, .iov_len = sizeof(chunk)}, 1744 }; 1745 1746 trace_nbd_co_send_structured_done(handle); 1747 set_be_chunk(&chunk, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_NONE, handle, 0); 1748 1749 return nbd_co_send_iov(client, iov, 1, errp); 1750 } 1751 1752 static int coroutine_fn nbd_co_send_structured_read(NBDClient *client, 1753 uint64_t handle, 1754 uint64_t offset, 1755 void *data, 1756 size_t size, 1757 bool final, 1758 Error **errp) 1759 { 1760 
NBDStructuredReadData chunk; 1761 struct iovec iov[] = { 1762 {.iov_base = &chunk, .iov_len = sizeof(chunk)}, 1763 {.iov_base = data, .iov_len = size} 1764 }; 1765 1766 assert(size); 1767 trace_nbd_co_send_structured_read(handle, offset, data, size); 1768 set_be_chunk(&chunk.h, final ? NBD_REPLY_FLAG_DONE : 0, 1769 NBD_REPLY_TYPE_OFFSET_DATA, handle, 1770 sizeof(chunk) - sizeof(chunk.h) + size); 1771 stq_be_p(&chunk.offset, offset); 1772 1773 return nbd_co_send_iov(client, iov, 2, errp); 1774 } 1775 1776 static int coroutine_fn nbd_co_send_structured_error(NBDClient *client, 1777 uint64_t handle, 1778 uint32_t error, 1779 const char *msg, 1780 Error **errp) 1781 { 1782 NBDStructuredError chunk; 1783 int nbd_err = system_errno_to_nbd_errno(error); 1784 struct iovec iov[] = { 1785 {.iov_base = &chunk, .iov_len = sizeof(chunk)}, 1786 {.iov_base = (char *)msg, .iov_len = msg ? strlen(msg) : 0}, 1787 }; 1788 1789 assert(nbd_err); 1790 trace_nbd_co_send_structured_error(handle, nbd_err, 1791 nbd_err_lookup(nbd_err), msg ? msg : ""); 1792 set_be_chunk(&chunk.h, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_ERROR, handle, 1793 sizeof(chunk) - sizeof(chunk.h) + iov[1].iov_len); 1794 stl_be_p(&chunk.error, nbd_err); 1795 stw_be_p(&chunk.message_length, iov[1].iov_len); 1796 1797 return nbd_co_send_iov(client, iov, 1 + !!iov[1].iov_len, errp); 1798 } 1799 1800 /* Do a sparse read and send the structured reply to the client. 1801 * Returns -errno if sending fails. bdrv_block_status_above() failure is 1802 * reported to the client, at which point this function succeeds. 
 */
static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client,
                                                uint64_t handle,
                                                uint64_t offset,
                                                uint8_t *data,
                                                size_t size,
                                                Error **errp)
{
    int ret = 0;
    NBDExport *exp = client->exp;
    size_t progress = 0;  /* bytes of the request handled so far */

    while (progress < size) {
        int64_t pnum;
        int status = bdrv_block_status_above(blk_bs(exp->blk), NULL,
                                             offset + progress,
                                             size - progress, &pnum, NULL,
                                             NULL);
        bool final;

        if (status < 0) {
            char *msg = g_strdup_printf("unable to check for holes: %s",
                                        strerror(-status));

            ret = nbd_co_send_structured_error(client, handle, -status, msg,
                                               errp);
            g_free(msg);
            return ret;
        }
        assert(pnum && pnum <= size - progress);
        /* Only the chunk that completes the request carries FLAG_DONE */
        final = progress + pnum == size;
        if (status & BDRV_BLOCK_ZERO) {
            /* Known-zero region: send a hole chunk instead of data */
            NBDStructuredReadHole chunk;
            struct iovec iov[] = {
                {.iov_base = &chunk, .iov_len = sizeof(chunk)},
            };

            trace_nbd_co_send_structured_read_hole(handle, offset + progress,
                                                   pnum);
            set_be_chunk(&chunk.h, final ? NBD_REPLY_FLAG_DONE : 0,
                         NBD_REPLY_TYPE_OFFSET_HOLE,
                         handle, sizeof(chunk) - sizeof(chunk.h));
            stq_be_p(&chunk.offset, offset + progress);
            stl_be_p(&chunk.length, pnum);
            ret = nbd_co_send_iov(client, iov, 1, errp);
        } else {
            ret = blk_pread(exp->blk, offset + progress + exp->dev_offset,
                            data + progress, pnum);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "reading from file failed");
                break;
            }
            ret = nbd_co_send_structured_read(client, handle, offset + progress,
                                              data + progress, pnum, final,
                                              errp);
        }

        if (ret < 0) {
            break;
        }
        progress += pnum;
    }
    return ret;
}

/*
 * Populate @extents from block status. Update @bytes to be the actual
 * length encoded (which may be smaller than the original), and update
 * @nb_extents to the number of extents used.
 *
 * Returns zero on success and -errno on bdrv_block_status_above failure.
 */
static int blockstatus_to_extents(BlockDriverState *bs, uint64_t offset,
                                  uint64_t *bytes, NBDExtent *extents,
                                  unsigned int *nb_extents)
{
    uint64_t remaining_bytes = *bytes;
    NBDExtent *extent = extents, *extents_end = extents + *nb_extents;
    bool first_extent = true;

    assert(*nb_extents);
    while (remaining_bytes) {
        uint32_t flags;
        int64_t num;
        int ret = bdrv_block_status_above(bs, NULL, offset, remaining_bytes,
                                          &num, NULL, NULL);

        if (ret < 0) {
            return ret;
        }

        flags = (ret & BDRV_BLOCK_ALLOCATED ? 0 : NBD_STATE_HOLE) |
                (ret & BDRV_BLOCK_ZERO      ? NBD_STATE_ZERO : 0);

        if (first_extent) {
            extent->flags = flags;
            extent->length = num;
            first_extent = false;
        } else if (flags == extent->flags) {
            /* extend current extent */
            extent->length += num;
        } else {
            /* No room for another extent: stop, reporting a short encode */
            if (extent + 1 == extents_end) {
                break;
            }

            /* start new extent */
            extent++;
            extent->flags = flags;
            extent->length = num;
        }
        offset += num;
        remaining_bytes -= num;
    }

    extents_end = extent + 1;

    /* Convert everything we accumulated to wire (big-endian) format */
    for (extent = extents; extent < extents_end; extent++) {
        extent->flags = cpu_to_be32(extent->flags);
        extent->length = cpu_to_be32(extent->length);
    }

    *bytes -= remaining_bytes;
    *nb_extents = extents_end - extents;

    return 0;
}

/* nbd_co_send_extents
 *
 * @length is only for tracing purposes (and may be smaller or larger
 * than the client's original request). @last controls whether
 * NBD_REPLY_FLAG_DONE is sent. @extents should already be in
 * big-endian format.
 */
static int nbd_co_send_extents(NBDClient *client, uint64_t handle,
                               NBDExtent *extents, unsigned int nb_extents,
                               uint64_t length, bool last,
                               uint32_t context_id, Error **errp)
{
    NBDStructuredMeta chunk;

    struct iovec iov[] = {
        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
        {.iov_base = extents, .iov_len = nb_extents * sizeof(extents[0])}
    };

    trace_nbd_co_send_extents(handle, nb_extents, context_id, length, last);
    set_be_chunk(&chunk.h, last ? NBD_REPLY_FLAG_DONE : 0,
                 NBD_REPLY_TYPE_BLOCK_STATUS,
                 handle, sizeof(chunk) - sizeof(chunk.h) + iov[1].iov_len);
    stl_be_p(&chunk.context_id, context_id);

    return nbd_co_send_iov(client, iov, 2, errp);
}

/* Get block status from the exported device and send it to the client */
static int nbd_co_send_block_status(NBDClient *client, uint64_t handle,
                                    BlockDriverState *bs, uint64_t offset,
                                    uint32_t length, bool dont_fragment,
                                    bool last, uint32_t context_id,
                                    Error **errp)
{
    int ret;
    /* REQ_ONE limits the reply to a single extent */
    unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
    NBDExtent *extents = g_new(NBDExtent, nb_extents);
    uint64_t final_length = length;

    ret = blockstatus_to_extents(bs, offset, &final_length, extents,
                                 &nb_extents);
    if (ret < 0) {
        g_free(extents);
        return nbd_co_send_structured_error(
                client, handle, -ret, "can't get block status", errp);
    }

    ret = nbd_co_send_extents(client, handle, extents, nb_extents,
                              final_length, last, context_id, errp);

    g_free(extents);

    return ret;
}

/*
 * Populate @extents from a dirty bitmap. Unless @dont_fragment, the
 * final extent may exceed the original @length. Store in @length the
 * byte length encoded (which may be smaller or larger than the
 * original), and return the number of extents used.
 */
static unsigned int bitmap_to_extents(BdrvDirtyBitmap *bitmap, uint64_t offset,
                                      uint64_t *length, NBDExtent *extents,
                                      unsigned int nb_extents,
                                      bool dont_fragment)
{
    uint64_t begin = offset, end = offset;
    uint64_t overall_end = offset + *length;
    unsigned int i = 0;
    BdrvDirtyBitmapIter *it;
    bool dirty;

    bdrv_dirty_bitmap_lock(bitmap);

    it = bdrv_dirty_iter_new(bitmap);
    dirty = bdrv_get_dirty_locked(NULL, bitmap, offset);

    assert(begin < overall_end && nb_extents);
    while (begin < overall_end && i < nb_extents) {
        bool next_dirty = !dirty;

        /* Find where the current run of (non-)dirty bits ends */
        if (dirty) {
            end = bdrv_dirty_bitmap_next_zero(bitmap, begin, UINT64_MAX);
        } else {
            bdrv_set_dirty_iter(it, begin);
            end = bdrv_dirty_iter_next(it);
        }
        /* NOTE(review): end == -1 appears to mean "no further transition
         * found" — confirm against the dirty-iter API */
        if (end == -1 || end - begin > UINT32_MAX) {
            /* Cap to an aligned value < 4G beyond begin. */
            end = MIN(bdrv_dirty_bitmap_size(bitmap),
                      begin + UINT32_MAX + 1 -
                          bdrv_dirty_bitmap_granularity(bitmap));
            next_dirty = dirty;
        }
        if (dont_fragment && end > overall_end) {
            end = overall_end;
        }

        extents[i].length = cpu_to_be32(end - begin);
        extents[i].flags = cpu_to_be32(dirty ? NBD_STATE_DIRTY : 0);
        i++;
        begin = end;
        dirty = next_dirty;
    }

    bdrv_dirty_iter_free(it);

    bdrv_dirty_bitmap_unlock(bitmap);

    assert(offset < end);
    *length = end - offset;
    return i;
}

/* Encode the dirty bitmap as extents and send them to the client */
static int nbd_co_send_bitmap(NBDClient *client, uint64_t handle,
                              BdrvDirtyBitmap *bitmap, uint64_t offset,
                              uint32_t length, bool dont_fragment, bool last,
                              uint32_t context_id, Error **errp)
{
    int ret;
    unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
    NBDExtent *extents = g_new(NBDExtent, nb_extents);
    uint64_t final_length = length;

    nb_extents = bitmap_to_extents(bitmap, offset, &final_length, extents,
                                   nb_extents, dont_fragment);

    ret = nbd_co_send_extents(client, handle, extents, nb_extents,
                              final_length, last, context_id, errp);

    g_free(extents);

    return ret;
}

/* nbd_co_receive_request
 * Collect a client request. Return 0 if request looks valid, -EIO to drop
 * connection right away, and any other negative value to report an error to
 * the client (although the caller may still need to disconnect after reporting
 * the error).
 */
static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
                                  Error **errp)
{
    NBDClient *client = req->client;
    int valid_flags;

    g_assert(qemu_in_coroutine());
    assert(client->recv_coroutine == qemu_coroutine_self());
    if (nbd_receive_request(client->ioc, request, errp) < 0) {
        return -EIO;
    }

    trace_nbd_co_receive_request_decode_type(request->handle, request->type,
                                             nbd_cmd_lookup(request->type));

    if (request->type != NBD_CMD_WRITE) {
        /* No payload, we are ready to read the next request.
         */
        req->complete = true;
    }

    if (request->type == NBD_CMD_DISC) {
        /* Special case: we're going to disconnect without a reply,
         * whether or not flags, from, or len are bogus */
        return -EIO;
    }

    if (request->type == NBD_CMD_READ || request->type == NBD_CMD_WRITE ||
        request->type == NBD_CMD_CACHE)
    {
        if (request->len > NBD_MAX_BUFFER_SIZE) {
            error_setg(errp, "len (%" PRIu32" ) is larger than max len (%u)",
                       request->len, NBD_MAX_BUFFER_SIZE);
            return -EINVAL;
        }

        /* CACHE needs no buffer of its own; READ/WRITE do */
        if (request->type != NBD_CMD_CACHE) {
            req->data = blk_try_blockalign(client->exp->blk, request->len);
            if (req->data == NULL) {
                error_setg(errp, "No memory");
                return -ENOMEM;
            }
        }
    }

    if (request->type == NBD_CMD_WRITE) {
        /* The write payload must be drained off the socket even before
         * the request is validated below */
        if (nbd_read(client->ioc, req->data, request->len, "CMD_WRITE data",
                     errp) < 0)
        {
            return -EIO;
        }
        req->complete = true;

        trace_nbd_co_receive_request_payload_received(request->handle,
                                                      request->len);
    }

    /* Sanity checks. */
    if (client->exp->nbdflags & NBD_FLAG_READ_ONLY &&
        (request->type == NBD_CMD_WRITE ||
         request->type == NBD_CMD_WRITE_ZEROES ||
         request->type == NBD_CMD_TRIM)) {
        error_setg(errp, "Export is read-only");
        return -EROFS;
    }
    if (request->from > client->exp->size ||
        request->len > client->exp->size - request->from) {
        error_setg(errp, "operation past EOF; From: %" PRIu64 ", Len: %" PRIu32
                   ", Size: %" PRIu64, request->from, request->len,
                   client->exp->size);
        return (request->type == NBD_CMD_WRITE ||
                request->type == NBD_CMD_WRITE_ZEROES) ? -ENOSPC : -EINVAL;
    }
    if (client->check_align && !QEMU_IS_ALIGNED(request->from | request->len,
                                                client->check_align)) {
        /*
         * The block layer gracefully handles unaligned requests, but
         * it's still worth tracing client non-compliance
         */
        trace_nbd_co_receive_align_compliance(nbd_cmd_lookup(request->type),
                                              request->from,
                                              request->len,
                                              client->check_align);
    }
    /* Reject command flags the negotiated feature set does not allow */
    valid_flags = NBD_CMD_FLAG_FUA;
    if (request->type == NBD_CMD_READ && client->structured_reply) {
        valid_flags |= NBD_CMD_FLAG_DF;
    } else if (request->type == NBD_CMD_WRITE_ZEROES) {
        valid_flags |= NBD_CMD_FLAG_NO_HOLE;
    } else if (request->type == NBD_CMD_BLOCK_STATUS) {
        valid_flags |= NBD_CMD_FLAG_REQ_ONE;
    }
    if (request->flags & ~valid_flags) {
        error_setg(errp, "unsupported flags for command %s (got 0x%x)",
                   nbd_cmd_lookup(request->type), request->flags);
        return -EINVAL;
    }

    return 0;
}

/* Send simple reply without a payload, or a structured error
 * @error_msg is ignored if @ret >= 0
 * Returns 0 if connection is still live, -errno on failure to talk to client
 */
static coroutine_fn int nbd_send_generic_reply(NBDClient *client,
                                               uint64_t handle,
                                               int ret,
                                               const char *error_msg,
                                               Error **errp)
{
    if (client->structured_reply && ret < 0) {
        return nbd_co_send_structured_error(client, handle, -ret, error_msg,
                                            errp);
    } else {
        return nbd_co_send_simple_reply(client, handle, ret < 0 ? -ret : 0,
                                        NULL, 0, errp);
    }
}

/* Handle NBD_CMD_READ request.
 * Return -errno if sending fails. Other errors are reported directly to the
 * client as an error reply.
 */
static coroutine_fn int nbd_do_cmd_read(NBDClient *client, NBDRequest *request,
                                        uint8_t *data, Error **errp)
{
    int ret;
    NBDExport *exp = client->exp;

    assert(request->type == NBD_CMD_READ);

    /* XXX: NBD Protocol only documents use of FUA with WRITE */
    if (request->flags & NBD_CMD_FLAG_FUA) {
        ret = blk_co_flush(exp->blk);
        if (ret < 0) {
            return nbd_send_generic_reply(client, request->handle, ret,
                                          "flush failed", errp);
        }
    }

    /* Sparse replies are used when the client negotiated structured
     * replies and did not demand an unfragmented (DF) answer */
    if (client->structured_reply && !(request->flags & NBD_CMD_FLAG_DF) &&
        request->len)
    {
        return nbd_co_send_sparse_read(client, request->handle, request->from,
                                       data, request->len, errp);
    }

    ret = blk_pread(exp->blk, request->from + exp->dev_offset, data,
                    request->len);
    if (ret < 0) {
        return nbd_send_generic_reply(client, request->handle, ret,
                                      "reading from file failed", errp);
    }

    if (client->structured_reply) {
        if (request->len) {
            return nbd_co_send_structured_read(client, request->handle,
                                               request->from, data,
                                               request->len, true, errp);
        } else {
            /* Zero-length read: a lone NONE chunk completes the reply */
            return nbd_co_send_structured_done(client, request->handle, errp);
        }
    } else {
        return nbd_co_send_simple_reply(client, request->handle, 0,
                                        data, request->len, errp);
    }
}

/*
 * nbd_do_cmd_cache
 *
 * Handle NBD_CMD_CACHE request.
 * Return -errno if sending fails. Other errors are reported directly to the
 * client as an error reply.
 */
static coroutine_fn int nbd_do_cmd_cache(NBDClient *client, NBDRequest *request,
                                         Error **errp)
{
    int ret;
    NBDExport *exp = client->exp;

    assert(request->type == NBD_CMD_CACHE);

    /* NULL qiov + PREFETCH: populate the cache without copying data out */
    ret = blk_co_preadv(exp->blk, request->from + exp->dev_offset, request->len,
                        NULL, BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH);

    return nbd_send_generic_reply(client, request->handle, ret,
                                  "caching data failed", errp);
}

/* Handle NBD request.
 * Return -errno if sending fails. Other errors are reported directly to the
 * client as an error reply. */
static coroutine_fn int nbd_handle_request(NBDClient *client,
                                           NBDRequest *request,
                                           uint8_t *data, Error **errp)
{
    int ret;
    int flags;
    NBDExport *exp = client->exp;
    char *msg;

    switch (request->type) {
    case NBD_CMD_CACHE:
        return nbd_do_cmd_cache(client, request, errp);

    case NBD_CMD_READ:
        return nbd_do_cmd_read(client, request, data, errp);

    case NBD_CMD_WRITE:
        flags = 0;
        if (request->flags & NBD_CMD_FLAG_FUA) {
            flags |= BDRV_REQ_FUA;
        }
        ret = blk_pwrite(exp->blk, request->from + exp->dev_offset,
                         data, request->len, flags);
        return nbd_send_generic_reply(client, request->handle, ret,
                                      "writing to file failed", errp);

    case NBD_CMD_WRITE_ZEROES:
        flags = 0;
        if (request->flags & NBD_CMD_FLAG_FUA) {
            flags |= BDRV_REQ_FUA;
        }
        /* Unless the client forbids holes, let the block layer punch them */
        if (!(request->flags & NBD_CMD_FLAG_NO_HOLE)) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
        ret = blk_pwrite_zeroes(exp->blk, request->from + exp->dev_offset,
                                request->len, flags);
        return nbd_send_generic_reply(client, request->handle, ret,
                                      "writing to file failed", errp);

    case NBD_CMD_DISC:
        /* unreachable, thanks to special case in nbd_co_receive_request() */
        abort();

    case NBD_CMD_FLUSH:
        ret = blk_co_flush(exp->blk);
        return nbd_send_generic_reply(client, request->handle, ret,
                                      "flush failed", errp);

    case NBD_CMD_TRIM:
        ret = blk_co_pdiscard(exp->blk, request->from + exp->dev_offset,
                              request->len);
        if (ret == 0 && request->flags & NBD_CMD_FLAG_FUA) {
            ret = blk_co_flush(exp->blk);
        }
        return nbd_send_generic_reply(client, request->handle, ret,
                                      "discard failed", errp);

    case NBD_CMD_BLOCK_STATUS:
        if (!request->len) {
            return nbd_send_generic_reply(client, request->handle, -EINVAL,
                                          "need non-zero length", errp);
        }
        if (client->export_meta.valid &&
            (client->export_meta.base_allocation ||
             client->export_meta.bitmap))
        {
            bool dont_fragment = request->flags & NBD_CMD_FLAG_REQ_ONE;

            if (client->export_meta.base_allocation) {
                /* base:allocation is only the last context when no
                 * bitmap context follows */
                ret = nbd_co_send_block_status(client, request->handle,
                                               blk_bs(exp->blk), request->from,
                                               request->len, dont_fragment,
                                               !client->export_meta.bitmap,
                                               NBD_META_ID_BASE_ALLOCATION,
                                               errp);
                if (ret < 0) {
                    return ret;
                }
            }

            if (client->export_meta.bitmap) {
                ret = nbd_co_send_bitmap(client, request->handle,
                                         client->exp->export_bitmap,
                                         request->from, request->len,
                                         dont_fragment,
                                         true, NBD_META_ID_DIRTY_BITMAP, errp);
                if (ret < 0) {
                    return ret;
                }
            }

            return ret;
        } else {
            return nbd_send_generic_reply(client, request->handle, -EINVAL,
                                          "CMD_BLOCK_STATUS not negotiated",
                                          errp);
        }

    default:
        msg = g_strdup_printf("invalid request type (%" PRIu32 ") received",
                              request->type);
        ret = nbd_send_generic_reply(client, request->handle, -EINVAL, msg,
                                     errp);
        g_free(msg);
        return ret;
    }
}

/* Owns a reference to the NBDClient passed as opaque.
 */
static coroutine_fn void nbd_trip(void *opaque)
{
    NBDClient *client = opaque;
    NBDRequestData *req;
    NBDRequest request = { 0 };    /* GCC thinks it can be used uninitialized */
    int ret;
    Error *local_err = NULL;

    trace_nbd_trip();
    if (client->closing) {
        /* Drop the reference taken when this coroutine was scheduled. */
        nbd_client_put(client);
        return;
    }

    req = nbd_request_get(client);
    ret = nbd_co_receive_request(req, &request, &local_err);
    /* This coroutine has consumed the request; a new receive coroutine may
     * now be scheduled (see nbd_client_receive_next_request() below). */
    client->recv_coroutine = NULL;

    if (client->closing) {
        /*
         * The client may be closed when we are blocked in
         * nbd_co_receive_request()
         */
        goto done;
    }

    /* Start reading the next request in parallel with handling this one. */
    nbd_client_receive_next_request(client);
    if (ret == -EIO) {
        goto disconnect;
    }

    if (ret < 0) {
        /* It wasn't -EIO, so, according to nbd_co_receive_request()
         * semantics, we should return the error to the client. */
        Error *export_err = local_err;

        local_err = NULL;
        ret = nbd_send_generic_reply(client, request.handle, -EINVAL,
                                     error_get_pretty(export_err), &local_err);
        error_free(export_err);
    } else {
        ret = nbd_handle_request(client, &request, req->data, &local_err);
    }
    /* At this point ret < 0 means the reply could not be sent. */
    if (ret < 0) {
        error_prepend(&local_err, "Failed to send reply: ");
        goto disconnect;
    }

    /* We must disconnect after NBD_CMD_WRITE if we did not
     * read the payload.
     */
    if (!req->complete) {
        error_setg(&local_err, "Request handling failed in intermediate state");
        goto disconnect;
    }

done:
    nbd_request_put(req);
    nbd_client_put(client);
    return;

disconnect:
    if (local_err) {
        error_reportf_err(local_err, "Disconnect client, due to: ");
    }
    nbd_request_put(req);
    client_close(client, true);
    nbd_client_put(client);
}

/* Schedule a coroutine to receive the next client request, unless one is
 * already pending or the per-client in-flight limit (MAX_NBD_REQUESTS) has
 * been reached.  Takes a client reference, released by nbd_trip(). */
static void nbd_client_receive_next_request(NBDClient *client)
{
    if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS) {
        nbd_client_get(client);
        client->recv_coroutine = qemu_coroutine_create(nbd_trip, client);
        aio_co_schedule(client->exp->ctx, client->recv_coroutine);
    }
}

/* Entry coroutine for a fresh connection: run negotiation, then start the
 * request-servicing loop.  On negotiation failure, close the client with
 * 'negotiated' == false. */
static coroutine_fn void nbd_co_client_start(void *opaque)
{
    NBDClient *client = opaque;
    Error *local_err = NULL;

    qemu_co_mutex_init(&client->send_lock);

    if (nbd_negotiate(client, &local_err)) {
        if (local_err) {
            error_report_err(local_err);
        }
        client_close(client, false);
        return;
    }

    nbd_client_receive_next_request(client);
}

/*
 * Create a new client listener using the given channel @sioc.
 * Begin servicing it in a coroutine. When the connection closes, call
 * @close_fn with an indication of whether the client completed negotiation.
2475 */ 2476 void nbd_client_new(QIOChannelSocket *sioc, 2477 QCryptoTLSCreds *tlscreds, 2478 const char *tlsauthz, 2479 void (*close_fn)(NBDClient *, bool)) 2480 { 2481 NBDClient *client; 2482 Coroutine *co; 2483 2484 client = g_new0(NBDClient, 1); 2485 client->refcount = 1; 2486 client->tlscreds = tlscreds; 2487 if (tlscreds) { 2488 object_ref(OBJECT(client->tlscreds)); 2489 } 2490 client->tlsauthz = g_strdup(tlsauthz); 2491 client->sioc = sioc; 2492 object_ref(OBJECT(client->sioc)); 2493 client->ioc = QIO_CHANNEL(sioc); 2494 object_ref(OBJECT(client->ioc)); 2495 client->close_fn = close_fn; 2496 2497 co = qemu_coroutine_create(nbd_co_client_start, client); 2498 qemu_coroutine_enter(co); 2499 } 2500