/*
 * QEMU Block driver for NBD
 *
 * Copyright (C) 2016 Red Hat, Inc.
 * Copyright (C) 2008 Bull S.A.S.
 *     Author: Laurent Vivier <Laurent.Vivier@bull.net>
 *
 * Some parts:
 *    Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qemu/uri.h"
#include "qemu/option.h"
#include "qemu/cutils.h"

#include "qapi/qapi-visit-sockets.h"
#include "qapi/qmp/qstring.h"

#include "block/qdict.h"
#include "block/nbd.h"
#include "block/block_int.h"

#define EN_OPTSTR ":exportname="
#define MAX_NBD_REQUESTS    16

#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ (uint64_t)(intptr_t)(bs))
#define INDEX_TO_HANDLE(bs, index)  ((index)  ^ (uint64_t)(intptr_t)(bs))
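
/*
 * The handle <-> index mapping above is a plain XOR with the BDRVNBDState
 * pointer, so it is its own inverse.  A sketch of the round trip:
 *
 *   uint64_t handle = INDEX_TO_HANDLE(s, i);    ... i ^ (uint64_t)(intptr_t)s
 *   assert(HANDLE_TO_INDEX(s, handle) == i);    ... (i ^ s) ^ s == i
 *
 * Folding in the pointer also makes it unlikely that a stray reply handle
 * meant for some other state object maps back to a small, valid slot index.
 */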

typedef struct {
    Coroutine *coroutine;
    uint64_t offset;        /* original offset of the request */
    bool receiving;         /* waiting for connection_co? */
} NBDClientRequest;

typedef struct BDRVNBDState {
    QIOChannelSocket *sioc; /* The master data channel */
    QIOChannel *ioc; /* The current I/O channel which may differ (e.g. TLS) */
    NBDExportInfo info;

    CoMutex send_mutex;
    CoQueue free_sema;
    Coroutine *connection_co;
    int in_flight;

    NBDClientRequest requests[MAX_NBD_REQUESTS];
    NBDReply reply;
    BlockDriverState *bs;
    bool quit;

    /* For nbd_refresh_filename() */
    SocketAddress *saddr;
    char *export, *tlscredsid;
} BDRVNBDState;

static void nbd_recv_coroutines_wake_all(BDRVNBDState *s)
{
    int i;

    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        NBDClientRequest *req = &s->requests[i];

        if (req->coroutine && req->receiving) {
            aio_co_wake(req->coroutine);
        }
    }
}

static void nbd_client_detach_aio_context(BlockDriverState *bs)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc));
}

static void nbd_client_attach_aio_context_bh(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    /*
     * The node is still drained, so we know the coroutine has yielded in
     * nbd_read_eof(), the only place where bs->in_flight can reach 0, or it is
     * entered for the first time. Both places are safe for entering the
     * coroutine.
     */
    qemu_aio_coroutine_enter(bs->aio_context, s->connection_co);
    bdrv_dec_in_flight(bs);
}

static void nbd_client_attach_aio_context(BlockDriverState *bs,
                                          AioContext *new_context)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    qio_channel_attach_aio_context(QIO_CHANNEL(s->ioc), new_context);

    bdrv_inc_in_flight(bs);

    /*
     * Need to wait here for the BH to run because the BH must run while the
     * node is still drained.
     */
    aio_wait_bh_oneshot(new_context, nbd_client_attach_aio_context_bh, bs);
}


static void nbd_teardown_connection(BlockDriverState *bs)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    assert(s->ioc);

    /* finish any pending coroutines */
    qio_channel_shutdown(s->ioc,
                         QIO_CHANNEL_SHUTDOWN_BOTH,
                         NULL);
    BDRV_POLL_WHILE(bs, s->connection_co);

    nbd_client_detach_aio_context(bs);
    object_unref(OBJECT(s->sioc));
    s->sioc = NULL;
    object_unref(OBJECT(s->ioc));
    s->ioc = NULL;
}

static coroutine_fn void nbd_connection_entry(void *opaque)
{
    BDRVNBDState *s = opaque;
    uint64_t i;
    int ret = 0;
    Error *local_err = NULL;

    while (!s->quit) {
        /*
         * The NBD client can only really be considered idle when it has
         * yielded from qio_channel_readv_all_eof(), waiting for data. This is
         * the point where the additional scheduled coroutine entry happens
         * after nbd_client_attach_aio_context().
         *
         * Therefore we keep an additional in_flight reference all the time and
         * only drop it temporarily here.
         */
        assert(s->reply.handle == 0);
        ret = nbd_receive_reply(s->bs, s->ioc, &s->reply, &local_err);

        if (local_err) {
            trace_nbd_read_reply_entry_fail(ret, error_get_pretty(local_err));
            error_free(local_err);
        }
        if (ret <= 0) {
            break;
        }

        /*
         * There's no need for a mutex on the receive side, because the
         * handler acts as a synchronization point and ensures that only
         * one coroutine is called until the reply finishes.
         */
        i = HANDLE_TO_INDEX(s, s->reply.handle);
        if (i >= MAX_NBD_REQUESTS ||
            !s->requests[i].coroutine ||
            !s->requests[i].receiving ||
            (nbd_reply_is_structured(&s->reply) && !s->info.structured_reply))
        {
            break;
        }

        /*
         * We're woken up again by the request itself.  Note that there
         * is no race between yielding and reentering connection_co.  This
         * is because:
         *
         * - if the request runs on the same AioContext, it is only
         *   entered after we yield
         *
         * - if the request runs on a different AioContext, reentering
         *   connection_co happens through a bottom half, which can only
         *   run after we yield.
         */
        aio_co_wake(s->requests[i].coroutine);
        qemu_coroutine_yield();
    }

    s->quit = true;
    nbd_recv_coroutines_wake_all(s);
    bdrv_dec_in_flight(s->bs);

    s->connection_co = NULL;
    aio_wait_kick();
}

static int nbd_co_send_request(BlockDriverState *bs,
                               NBDRequest *request,
                               QEMUIOVector *qiov)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    int rc, i;

    qemu_co_mutex_lock(&s->send_mutex);
    while (s->in_flight == MAX_NBD_REQUESTS) {
        qemu_co_queue_wait(&s->free_sema, &s->send_mutex);
    }
    s->in_flight++;

    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        if (s->requests[i].coroutine == NULL) {
            break;
        }
    }

    g_assert(qemu_in_coroutine());
    assert(i < MAX_NBD_REQUESTS);

    s->requests[i].coroutine = qemu_coroutine_self();
    s->requests[i].offset = request->from;
    s->requests[i].receiving = false;

    request->handle = INDEX_TO_HANDLE(s, i);

    if (s->quit) {
        rc = -EIO;
        goto err;
    }
    assert(s->ioc);

    if (qiov) {
        qio_channel_set_cork(s->ioc, true);
        rc = nbd_send_request(s->ioc, request);
        if (rc >= 0 && !s->quit) {
            if (qio_channel_writev_all(s->ioc, qiov->iov, qiov->niov,
                                       NULL) < 0) {
                rc = -EIO;
            }
        } else if (rc >= 0) {
            rc = -EIO;
        }
        qio_channel_set_cork(s->ioc, false);
    } else {
        rc = nbd_send_request(s->ioc, request);
    }

err:
    if (rc < 0) {
        s->quit = true;
        s->requests[i].coroutine = NULL;
        s->in_flight--;
        qemu_co_queue_next(&s->free_sema);
    }
    qemu_co_mutex_unlock(&s->send_mutex);
    return rc;
}

static inline uint16_t payload_advance16(uint8_t **payload)
{
    *payload += 2;
    return lduw_be_p(*payload - 2);
}

static inline uint32_t payload_advance32(uint8_t **payload)
{
    *payload += 4;
    return ldl_be_p(*payload - 4);
}

static inline uint64_t payload_advance64(uint8_t **payload)
{
    *payload += 8;
    return ldq_be_p(*payload - 8);
}
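
/*
 * The payload_advance* helpers treat @payload as a cursor into a big-endian
 * buffer: each call returns the next field and advances the cursor past it.
 * As a sketch, pulling apart an OFFSET_HOLE payload (a 64-bit offset
 * followed by a 32-bit hole size, exactly as done below) looks like:
 *
 *   uint64_t offset    = payload_advance64(&payload);
 *   uint32_t hole_size = payload_advance32(&payload);
 */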

static int nbd_parse_offset_hole_payload(BDRVNBDState *s,
                                         NBDStructuredReplyChunk *chunk,
                                         uint8_t *payload, uint64_t orig_offset,
                                         QEMUIOVector *qiov, Error **errp)
{
    uint64_t offset;
    uint32_t hole_size;

    if (chunk->length != sizeof(offset) + sizeof(hole_size)) {
        error_setg(errp, "Protocol error: invalid payload for "
                         "NBD_REPLY_TYPE_OFFSET_HOLE");
        return -EINVAL;
    }

    offset = payload_advance64(&payload);
    hole_size = payload_advance32(&payload);

    if (!hole_size || offset < orig_offset || hole_size > qiov->size ||
        offset > orig_offset + qiov->size - hole_size) {
        error_setg(errp, "Protocol error: server sent chunk exceeding requested"
                         " region");
        return -EINVAL;
    }
    if (s->info.min_block &&
        !QEMU_IS_ALIGNED(hole_size, s->info.min_block)) {
        trace_nbd_structured_read_compliance("hole");
    }

    qemu_iovec_memset(qiov, offset - orig_offset, 0, hole_size);

    return 0;
}

/*
 * nbd_parse_blockstatus_payload
 * Based on our request, we expect only one extent in reply, for the
 * base:allocation context.
 */
static int nbd_parse_blockstatus_payload(BDRVNBDState *s,
                                         NBDStructuredReplyChunk *chunk,
                                         uint8_t *payload, uint64_t orig_length,
                                         NBDExtent *extent, Error **errp)
{
    uint32_t context_id;

    /* The server succeeded, so it must have sent [at least] one extent */
    if (chunk->length < sizeof(context_id) + sizeof(*extent)) {
        error_setg(errp, "Protocol error: invalid payload for "
                         "NBD_REPLY_TYPE_BLOCK_STATUS");
        return -EINVAL;
    }

    context_id = payload_advance32(&payload);
    if (s->info.context_id != context_id) {
        error_setg(errp, "Protocol error: unexpected context id %d for "
                         "NBD_REPLY_TYPE_BLOCK_STATUS, when negotiated context "
                         "id is %d", context_id,
                   s->info.context_id);
        return -EINVAL;
    }

    extent->length = payload_advance32(&payload);
    extent->flags = payload_advance32(&payload);

    if (extent->length == 0) {
        error_setg(errp, "Protocol error: server sent status chunk with "
                   "zero length");
        return -EINVAL;
    }

    /*
     * A server sending unaligned block status is in violation of the
     * protocol, but as qemu-nbd 3.1 is such a server (at least for
     * POSIX files that are not a multiple of 512 bytes, since qemu
     * rounds files up to 512-byte multiples but lseek(SEEK_HOLE)
     * still sees an implicit hole beyond the real EOF), it's nicer to
     * work around the misbehaving server. If the request included
     * more than the final unaligned block, truncate it back to an
     * aligned result; if the request was only the final block, round
     * up to the full block and change the status to fully-allocated
     * (always a safe status, even if it loses information).
     */
    if (s->info.min_block && !QEMU_IS_ALIGNED(extent->length,
                                              s->info.min_block)) {
        trace_nbd_parse_blockstatus_compliance("extent length is unaligned");
        if (extent->length > s->info.min_block) {
            extent->length = QEMU_ALIGN_DOWN(extent->length,
                                             s->info.min_block);
        } else {
            extent->length = s->info.min_block;
            extent->flags = 0;
        }
    }

    /*
     * We used NBD_CMD_FLAG_REQ_ONE, so the server should not have
     * sent us any more than one extent, nor should it have included
     * status beyond our request in that extent. However, it's easy
     * enough to ignore the server's noncompliance without killing the
     * connection; just ignore trailing extents, and clamp things to
     * the length of our request.
     */
    if (chunk->length > sizeof(context_id) + sizeof(*extent)) {
        trace_nbd_parse_blockstatus_compliance("more than one extent");
    }
    if (extent->length > orig_length) {
        extent->length = orig_length;
        trace_nbd_parse_blockstatus_compliance("extent length too large");
    }

    return 0;
}

/*
 * nbd_parse_error_payload
 * On success, @request_ret is set to the negative errno corresponding to
 * the server's error reply; @errp is set only if the payload itself is
 * malformed (a protocol error).
 */
static int nbd_parse_error_payload(NBDStructuredReplyChunk *chunk,
                                   uint8_t *payload, int *request_ret,
                                   Error **errp)
{
    uint32_t error;
    uint16_t message_size;

    assert(chunk->type & (1 << 15));

    if (chunk->length < sizeof(error) + sizeof(message_size)) {
        error_setg(errp,
                   "Protocol error: invalid payload for structured error");
        return -EINVAL;
    }

    error = nbd_errno_to_system_errno(payload_advance32(&payload));
    if (error == 0) {
        error_setg(errp, "Protocol error: server sent structured error chunk "
                         "with error = 0");
        return -EINVAL;
    }

    *request_ret = -error;
    message_size = payload_advance16(&payload);

    if (message_size > chunk->length - sizeof(error) - sizeof(message_size)) {
        error_setg(errp, "Protocol error: server sent structured error chunk "
                         "with incorrect message size");
        return -EINVAL;
    }

    /* TODO: Add a trace point to mention the server complaint */

    /* TODO handle ERROR_OFFSET */

    return 0;
}
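
/*
 * For reference: error chunk types have bit 15 set (hence the assert
 * above), and their payload is laid out, per the structured-reply spec,
 * as a 32-bit error value, a 16-bit message length, and then the
 * human-readable message itself (NBD_REPLY_TYPE_ERROR_OFFSET appends a
 * 64-bit offset after the message, which is still TODO here).
 */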

static int nbd_co_receive_offset_data_payload(BDRVNBDState *s,
                                              uint64_t orig_offset,
                                              QEMUIOVector *qiov, Error **errp)
{
    QEMUIOVector sub_qiov;
    uint64_t offset;
    size_t data_size;
    int ret;
    NBDStructuredReplyChunk *chunk = &s->reply.structured;

    assert(nbd_reply_is_structured(&s->reply));

    /* The NBD spec requires at least one byte of payload */
    if (chunk->length <= sizeof(offset)) {
        error_setg(errp, "Protocol error: invalid payload for "
                         "NBD_REPLY_TYPE_OFFSET_DATA");
        return -EINVAL;
    }

    if (nbd_read64(s->ioc, &offset, "OFFSET_DATA offset", errp) < 0) {
        return -EIO;
    }

    data_size = chunk->length - sizeof(offset);
    assert(data_size);
    if (offset < orig_offset || data_size > qiov->size ||
        offset > orig_offset + qiov->size - data_size) {
        error_setg(errp, "Protocol error: server sent chunk exceeding requested"
                         " region");
        return -EINVAL;
    }
    if (s->info.min_block && !QEMU_IS_ALIGNED(data_size, s->info.min_block)) {
        trace_nbd_structured_read_compliance("data");
    }

    qemu_iovec_init(&sub_qiov, qiov->niov);
    qemu_iovec_concat(&sub_qiov, qiov, offset - orig_offset, data_size);
    ret = qio_channel_readv_all(s->ioc, sub_qiov.iov, sub_qiov.niov, errp);
    qemu_iovec_destroy(&sub_qiov);

    return ret < 0 ? -EIO : 0;
}
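
/*
 * Per the structured-reply spec, the OFFSET_DATA payload handled above is
 * a big-endian 64-bit offset followed by (chunk->length - 8) bytes of
 * data; unlike the other chunk types, it is streamed straight into the
 * request's QEMUIOVector instead of being buffered through
 * nbd_co_receive_structured_payload() below.
 */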

#define NBD_MAX_MALLOC_PAYLOAD 1000
static coroutine_fn int nbd_co_receive_structured_payload(
        BDRVNBDState *s, void **payload, Error **errp)
{
    int ret;
    uint32_t len;

    assert(nbd_reply_is_structured(&s->reply));

    len = s->reply.structured.length;

    if (len == 0) {
        return 0;
    }

    if (payload == NULL) {
        error_setg(errp, "Unexpected structured payload");
        return -EINVAL;
    }

    if (len > NBD_MAX_MALLOC_PAYLOAD) {
        error_setg(errp, "Payload too large");
        return -EINVAL;
    }

    *payload = g_new(char, len);
    ret = nbd_read(s->ioc, *payload, len, "structured payload", errp);
    if (ret < 0) {
        g_free(*payload);
        *payload = NULL;
        return ret;
    }

    return 0;
}

/*
 * nbd_co_do_receive_one_chunk
 * for simple reply:
 *   set request_ret to received reply error
 *   if qiov is not NULL: read payload to @qiov
 * for structured reply chunk:
 *   if error chunk: read payload, set @request_ret, do not set @payload
 *   else if offset_data chunk: read payload data to @qiov, do not set @payload
 *   else: read payload to @payload
 *
 * If function fails, @errp contains corresponding error message, and the
 * connection with the server is suspect.  If it returns 0, then the
 * transaction succeeded (although @request_ret may be a negative errno
 * corresponding to the server's error reply), and errp is unchanged.
 */
static coroutine_fn int nbd_co_do_receive_one_chunk(
        BDRVNBDState *s, uint64_t handle, bool only_structured,
        int *request_ret, QEMUIOVector *qiov, void **payload, Error **errp)
{
    int ret;
    int i = HANDLE_TO_INDEX(s, handle);
    void *local_payload = NULL;
    NBDStructuredReplyChunk *chunk;

    if (payload) {
        *payload = NULL;
    }
    *request_ret = 0;

    /* Wait until we're woken up by nbd_connection_entry.  */
    s->requests[i].receiving = true;
    qemu_coroutine_yield();
    s->requests[i].receiving = false;
    if (s->quit) {
        error_setg(errp, "Connection closed");
        return -EIO;
    }
    assert(s->ioc);

    assert(s->reply.handle == handle);

    if (nbd_reply_is_simple(&s->reply)) {
        if (only_structured) {
            error_setg(errp, "Protocol error: simple reply when structured "
                             "reply chunk was expected");
            return -EINVAL;
        }

        *request_ret = -nbd_errno_to_system_errno(s->reply.simple.error);
        if (*request_ret < 0 || !qiov) {
            return 0;
        }

        return qio_channel_readv_all(s->ioc, qiov->iov, qiov->niov,
                                     errp) < 0 ? -EIO : 0;
    }

    /* handle structured reply chunk */
    assert(s->info.structured_reply);
    chunk = &s->reply.structured;

    if (chunk->type == NBD_REPLY_TYPE_NONE) {
        if (!(chunk->flags & NBD_REPLY_FLAG_DONE)) {
            error_setg(errp, "Protocol error: NBD_REPLY_TYPE_NONE chunk without"
                             " NBD_REPLY_FLAG_DONE flag set");
            return -EINVAL;
        }
        if (chunk->length) {
            error_setg(errp, "Protocol error: NBD_REPLY_TYPE_NONE chunk with"
                             " nonzero length");
            return -EINVAL;
        }
        return 0;
    }

    if (chunk->type == NBD_REPLY_TYPE_OFFSET_DATA) {
        if (!qiov) {
            error_setg(errp, "Unexpected NBD_REPLY_TYPE_OFFSET_DATA chunk");
            return -EINVAL;
        }

        return nbd_co_receive_offset_data_payload(s, s->requests[i].offset,
                                                  qiov, errp);
    }

    if (nbd_reply_type_is_error(chunk->type)) {
        payload = &local_payload;
    }

    ret = nbd_co_receive_structured_payload(s, payload, errp);
    if (ret < 0) {
        return ret;
    }

    if (nbd_reply_type_is_error(chunk->type)) {
        ret = nbd_parse_error_payload(chunk, local_payload, request_ret, errp);
        g_free(local_payload);
        return ret;
    }

    return 0;
}

/*
 * nbd_co_receive_one_chunk
 * Read reply, wake up connection_co and set s->quit if needed.
 * Return value is a fatal error code or normal nbd reply error code
 */
static coroutine_fn int nbd_co_receive_one_chunk(
        BDRVNBDState *s, uint64_t handle, bool only_structured,
        int *request_ret, QEMUIOVector *qiov, NBDReply *reply, void **payload,
        Error **errp)
{
    int ret = nbd_co_do_receive_one_chunk(s, handle, only_structured,
                                          request_ret, qiov, payload, errp);

    if (ret < 0) {
        s->quit = true;
    } else {
        /* For assert at loop start in nbd_connection_entry */
        if (reply) {
            *reply = s->reply;
        }
        s->reply.handle = 0;
    }

    if (s->connection_co) {
        aio_co_wake(s->connection_co);
    }

    return ret;
}

typedef struct NBDReplyChunkIter {
    int ret;
    int request_ret;
    Error *err;
    bool done, only_structured;
} NBDReplyChunkIter;

static void nbd_iter_channel_error(NBDReplyChunkIter *iter,
                                   int ret, Error **local_err)
{
    assert(ret < 0);

    if (!iter->ret) {
        iter->ret = ret;
        error_propagate(&iter->err, *local_err);
    } else {
        error_free(*local_err);
    }

    *local_err = NULL;
}

static void nbd_iter_request_error(NBDReplyChunkIter *iter, int ret)
{
    assert(ret < 0);

    if (!iter->request_ret) {
        iter->request_ret = ret;
    }
}

/*
 * NBD_FOREACH_REPLY_CHUNK
 * The pointer stored in @payload requires g_free() to free it.
 */
#define NBD_FOREACH_REPLY_CHUNK(s, iter, handle, structured, \
                                qiov, reply, payload) \
    for (iter = (NBDReplyChunkIter) { .only_structured = structured }; \
         nbd_reply_chunk_iter_receive(s, &iter, handle, qiov, reply, payload);)
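
/*
 * A typical use of the iterator, mirroring the callers further below: the
 * loop body runs once per structured reply chunk (and not at all for a
 * simple reply), with errors accumulated in the iterator rather than
 * aborting the loop:
 *
 *   NBDReplyChunkIter iter;
 *
 *   NBD_FOREACH_REPLY_CHUNK(s, iter, handle, false, NULL, &reply, &payload) {
 *       ... inspect reply.structured and payload, then g_free(payload) ...
 *   }
 *   ... iter.ret holds the fatal error, iter.request_ret the server error ...
 */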

/*
 * nbd_reply_chunk_iter_receive
 * The pointer stored in @payload requires g_free() to free it.
 */
static bool nbd_reply_chunk_iter_receive(BDRVNBDState *s,
                                         NBDReplyChunkIter *iter,
                                         uint64_t handle,
                                         QEMUIOVector *qiov, NBDReply *reply,
                                         void **payload)
{
    int ret, request_ret;
    NBDReply local_reply;
    NBDStructuredReplyChunk *chunk;
    Error *local_err = NULL;
    if (s->quit) {
        error_setg(&local_err, "Connection closed");
        nbd_iter_channel_error(iter, -EIO, &local_err);
        goto break_loop;
    }

    if (iter->done) {
        /* Previous iteration was last. */
        goto break_loop;
    }

    if (reply == NULL) {
        reply = &local_reply;
    }

    ret = nbd_co_receive_one_chunk(s, handle, iter->only_structured,
                                   &request_ret, qiov, reply, payload,
                                   &local_err);
    if (ret < 0) {
        nbd_iter_channel_error(iter, ret, &local_err);
    } else if (request_ret < 0) {
        nbd_iter_request_error(iter, request_ret);
    }

    /* Do not execute the body of NBD_FOREACH_REPLY_CHUNK for simple reply. */
    if (nbd_reply_is_simple(reply) || s->quit) {
        goto break_loop;
    }

    chunk = &reply->structured;
    iter->only_structured = true;

    if (chunk->type == NBD_REPLY_TYPE_NONE) {
        /* NBD_REPLY_FLAG_DONE is already checked in nbd_co_receive_one_chunk */
        assert(chunk->flags & NBD_REPLY_FLAG_DONE);
        goto break_loop;
    }

    if (chunk->flags & NBD_REPLY_FLAG_DONE) {
        /* This iteration is last. */
        iter->done = true;
    }

    /* Execute the loop body */
    return true;

break_loop:
    s->requests[HANDLE_TO_INDEX(s, handle)].coroutine = NULL;

    qemu_co_mutex_lock(&s->send_mutex);
    s->in_flight--;
    qemu_co_queue_next(&s->free_sema);
    qemu_co_mutex_unlock(&s->send_mutex);

    return false;
}

static int nbd_co_receive_return_code(BDRVNBDState *s, uint64_t handle,
                                      int *request_ret, Error **errp)
{
    NBDReplyChunkIter iter;

    NBD_FOREACH_REPLY_CHUNK(s, iter, handle, false, NULL, NULL, NULL) {
        /* nbd_reply_chunk_iter_receive does all the work */
    }

    error_propagate(errp, iter.err);
    *request_ret = iter.request_ret;
    return iter.ret;
}

static int nbd_co_receive_cmdread_reply(BDRVNBDState *s, uint64_t handle,
                                        uint64_t offset, QEMUIOVector *qiov,
                                        int *request_ret, Error **errp)
{
    NBDReplyChunkIter iter;
    NBDReply reply;
    void *payload = NULL;
    Error *local_err = NULL;

    NBD_FOREACH_REPLY_CHUNK(s, iter, handle, s->info.structured_reply,
                            qiov, &reply, &payload)
    {
        int ret;
        NBDStructuredReplyChunk *chunk = &reply.structured;

        assert(nbd_reply_is_structured(&reply));

        switch (chunk->type) {
        case NBD_REPLY_TYPE_OFFSET_DATA:
            /*
             * special cased in nbd_co_receive_one_chunk, data is already
             * in qiov
             */
            break;
        case NBD_REPLY_TYPE_OFFSET_HOLE:
            ret = nbd_parse_offset_hole_payload(s, &reply.structured, payload,
                                                offset, qiov, &local_err);
            if (ret < 0) {
                s->quit = true;
                nbd_iter_channel_error(&iter, ret, &local_err);
            }
            break;
        default:
            if (!nbd_reply_type_is_error(chunk->type)) {
                /* not allowed reply type */
                s->quit = true;
                error_setg(&local_err,
                           "Unexpected reply type: %d (%s) for CMD_READ",
                           chunk->type, nbd_reply_type_lookup(chunk->type));
                nbd_iter_channel_error(&iter, -EINVAL, &local_err);
            }
        }

        g_free(payload);
        payload = NULL;
    }

    error_propagate(errp, iter.err);
    *request_ret = iter.request_ret;
    return iter.ret;
}

static int nbd_co_receive_blockstatus_reply(BDRVNBDState *s,
                                            uint64_t handle, uint64_t length,
                                            NBDExtent *extent,
                                            int *request_ret, Error **errp)
{
    NBDReplyChunkIter iter;
    NBDReply reply;
    void *payload = NULL;
    Error *local_err = NULL;
    bool received = false;

    assert(!extent->length);
    NBD_FOREACH_REPLY_CHUNK(s, iter, handle, false, NULL, &reply, &payload) {
        int ret;
        NBDStructuredReplyChunk *chunk = &reply.structured;

        assert(nbd_reply_is_structured(&reply));

        switch (chunk->type) {
        case NBD_REPLY_TYPE_BLOCK_STATUS:
            if (received) {
                s->quit = true;
                error_setg(&local_err, "Several BLOCK_STATUS chunks in reply");
                nbd_iter_channel_error(&iter, -EINVAL, &local_err);
            }
            received = true;

            ret = nbd_parse_blockstatus_payload(s, &reply.structured,
                                                payload, length, extent,
                                                &local_err);
            if (ret < 0) {
                s->quit = true;
                nbd_iter_channel_error(&iter, ret, &local_err);
            }
            break;
        default:
            if (!nbd_reply_type_is_error(chunk->type)) {
                s->quit = true;
                error_setg(&local_err,
                           "Unexpected reply type: %d (%s) "
                           "for CMD_BLOCK_STATUS",
                           chunk->type, nbd_reply_type_lookup(chunk->type));
                nbd_iter_channel_error(&iter, -EINVAL, &local_err);
            }
        }

        g_free(payload);
        payload = NULL;
    }

    if (!extent->length && !iter.request_ret) {
        error_setg(&local_err, "Server did not reply with any status extents");
        nbd_iter_channel_error(&iter, -EIO, &local_err);
    }

    error_propagate(errp, iter.err);
    *request_ret = iter.request_ret;
    return iter.ret;
}

static int nbd_co_request(BlockDriverState *bs, NBDRequest *request,
                          QEMUIOVector *write_qiov)
{
    int ret, request_ret;
    Error *local_err = NULL;
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    assert(request->type != NBD_CMD_READ);
    if (write_qiov) {
        assert(request->type == NBD_CMD_WRITE);
        assert(request->len == iov_size(write_qiov->iov, write_qiov->niov));
    } else {
        assert(request->type != NBD_CMD_WRITE);
    }
    ret = nbd_co_send_request(bs, request, write_qiov);
    if (ret < 0) {
        return ret;
    }

    ret = nbd_co_receive_return_code(s, request->handle,
                                     &request_ret, &local_err);
    if (local_err) {
        trace_nbd_co_request_fail(request->from, request->len, request->handle,
                                  request->flags, request->type,
                                  nbd_cmd_lookup(request->type),
                                  ret, error_get_pretty(local_err));
        error_free(local_err);
    }
    return ret ? ret : request_ret;
}
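
/*
 * The request lifecycle above, in short: nbd_co_send_request() claims one
 * of the MAX_NBD_REQUESTS slots and transmits the command under send_mutex;
 * the caller then yields in nbd_co_do_receive_one_chunk() until
 * nbd_connection_entry() reads a reply whose handle maps back to that slot
 * and wakes it.  A write-style command therefore reduces to a sketch like:
 *
 *   NBDRequest request = { .type = NBD_CMD_FLUSH };
 *
 *   ret = nbd_co_request(bs, &request, NULL);
 *
 * which is exactly the shape of the nbd_client_co_* helpers below.
 */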

static int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
                                uint64_t bytes, QEMUIOVector *qiov, int flags)
{
    int ret, request_ret;
    Error *local_err = NULL;
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    NBDRequest request = {
        .type = NBD_CMD_READ,
        .from = offset,
        .len = bytes,
    };

    assert(bytes <= NBD_MAX_BUFFER_SIZE);
    assert(!flags);

    if (!bytes) {
        return 0;
    }
    /*
     * Work around the fact that the block layer doesn't do
     * byte-accurate sizing yet - if the read exceeds the server's
     * advertised size because the block layer rounded size up, then
     * truncate the request to the server and tail-pad with zero.
     */
    if (offset >= s->info.size) {
        assert(bytes < BDRV_SECTOR_SIZE);
        qemu_iovec_memset(qiov, 0, 0, bytes);
        return 0;
    }
    if (offset + bytes > s->info.size) {
        uint64_t slop = offset + bytes - s->info.size;

        assert(slop < BDRV_SECTOR_SIZE);
        qemu_iovec_memset(qiov, bytes - slop, 0, slop);
        request.len -= slop;
    }

    ret = nbd_co_send_request(bs, &request, NULL);
    if (ret < 0) {
        return ret;
    }

    ret = nbd_co_receive_cmdread_reply(s, request.handle, offset, qiov,
                                       &request_ret, &local_err);
    if (local_err) {
        trace_nbd_co_request_fail(request.from, request.len, request.handle,
                                  request.flags, request.type,
                                  nbd_cmd_lookup(request.type),
                                  ret, error_get_pretty(local_err));
        error_free(local_err);
    }
    return ret ? ret : request_ret;
}

static int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
                                 uint64_t bytes, QEMUIOVector *qiov, int flags)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    NBDRequest request = {
        .type = NBD_CMD_WRITE,
        .from = offset,
        .len = bytes,
    };

    assert(!(s->info.flags & NBD_FLAG_READ_ONLY));
    if (flags & BDRV_REQ_FUA) {
        assert(s->info.flags & NBD_FLAG_SEND_FUA);
        request.flags |= NBD_CMD_FLAG_FUA;
    }

    assert(bytes <= NBD_MAX_BUFFER_SIZE);

    if (!bytes) {
        return 0;
    }
    return nbd_co_request(bs, &request, qiov);
}

static int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
                                       int bytes, BdrvRequestFlags flags)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    NBDRequest request = {
        .type = NBD_CMD_WRITE_ZEROES,
        .from = offset,
        .len = bytes,
    };

    assert(!(s->info.flags & NBD_FLAG_READ_ONLY));
    if (!(s->info.flags & NBD_FLAG_SEND_WRITE_ZEROES)) {
        return -ENOTSUP;
    }

    if (flags & BDRV_REQ_FUA) {
        assert(s->info.flags & NBD_FLAG_SEND_FUA);
        request.flags |= NBD_CMD_FLAG_FUA;
    }
    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
        request.flags |= NBD_CMD_FLAG_NO_HOLE;
    }

    if (!bytes) {
        return 0;
    }
    return nbd_co_request(bs, &request, NULL);
}

static int nbd_client_co_flush(BlockDriverState *bs)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    NBDRequest request = { .type = NBD_CMD_FLUSH };

    if (!(s->info.flags & NBD_FLAG_SEND_FLUSH)) {
        return 0;
    }

    request.from = 0;
    request.len = 0;

    return nbd_co_request(bs, &request, NULL);
}

static int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset,
                                  int bytes)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    NBDRequest request = {
        .type = NBD_CMD_TRIM,
        .from = offset,
        .len = bytes,
    };

    assert(!(s->info.flags & NBD_FLAG_READ_ONLY));
    if (!(s->info.flags & NBD_FLAG_SEND_TRIM) || !bytes) {
        return 0;
    }

    return nbd_co_request(bs, &request, NULL);
}
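
/*
 * NBD_CMD_BLOCK_STATUS below is sent with NBD_CMD_FLAG_REQ_ONE, so a
 * compliant server answers with exactly one extent for the base:allocation
 * context, describing only the start of the queried range.  As an
 * illustrative (made-up) example, querying 1M at an offset that begins with
 * 64k of unallocated zeroes may legitimately yield just
 *
 *   NBDExtent { .length = 0x10000, .flags = NBD_STATE_HOLE | NBD_STATE_ZERO }
 *
 * in which case we report *pnum = 64k and leave the rest to a later query.
 */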

static int coroutine_fn nbd_client_co_block_status(
        BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
        int64_t *pnum, int64_t *map, BlockDriverState **file)
{
    int ret, request_ret;
    NBDExtent extent = { 0 };
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    Error *local_err = NULL;

    NBDRequest request = {
        .type = NBD_CMD_BLOCK_STATUS,
        .from = offset,
        .len = MIN(MIN_NON_ZERO(QEMU_ALIGN_DOWN(INT_MAX,
                                                bs->bl.request_alignment),
                                s->info.max_block),
                   MIN(bytes, s->info.size - offset)),
        .flags = NBD_CMD_FLAG_REQ_ONE,
    };

    if (!s->info.base_allocation) {
        *pnum = bytes;
        *map = offset;
        *file = bs;
        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
    }

    /*
     * Work around the fact that the block layer doesn't do
     * byte-accurate sizing yet - if the status request exceeds the
     * server's advertised size because the block layer rounded size
     * up, we truncated the request to the server (above), or are
     * called on just the hole.
     */
    if (offset >= s->info.size) {
        *pnum = bytes;
        assert(bytes < BDRV_SECTOR_SIZE);
        /* Intentionally don't report offset_valid for the hole */
        return BDRV_BLOCK_ZERO;
    }

    if (s->info.min_block) {
        assert(QEMU_IS_ALIGNED(request.len, s->info.min_block));
    }
    ret = nbd_co_send_request(bs, &request, NULL);
    if (ret < 0) {
        return ret;
    }

    ret = nbd_co_receive_blockstatus_reply(s, request.handle, bytes,
                                           &extent, &request_ret, &local_err);
    if (local_err) {
        trace_nbd_co_request_fail(request.from, request.len, request.handle,
                                  request.flags, request.type,
                                  nbd_cmd_lookup(request.type),
                                  ret, error_get_pretty(local_err));
        error_free(local_err);
    }
    if (ret < 0 || request_ret < 0) {
        return ret ? ret : request_ret;
    }

    assert(extent.length);
    *pnum = extent.length;
    *map = offset;
    *file = bs;
    return (extent.flags & NBD_STATE_HOLE ? 0 : BDRV_BLOCK_DATA) |
           (extent.flags & NBD_STATE_ZERO ? BDRV_BLOCK_ZERO : 0) |
           BDRV_BLOCK_OFFSET_VALID;
}

static void nbd_client_close(BlockDriverState *bs)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    NBDRequest request = { .type = NBD_CMD_DISC };

    assert(s->ioc);

    nbd_send_request(s->ioc, &request);

    nbd_teardown_connection(bs);
}

static QIOChannelSocket *nbd_establish_connection(SocketAddress *saddr,
                                                  Error **errp)
{
    QIOChannelSocket *sioc;
    Error *local_err = NULL;

    sioc = qio_channel_socket_new();
    qio_channel_set_name(QIO_CHANNEL(sioc), "nbd-client");

    qio_channel_socket_connect_sync(sioc, saddr, &local_err);
    if (local_err) {
        object_unref(OBJECT(sioc));
        error_propagate(errp, local_err);
        return NULL;
    }

    qio_channel_set_delay(QIO_CHANNEL(sioc), false);

    return sioc;
}

static int nbd_client_connect(BlockDriverState *bs,
                              SocketAddress *saddr,
                              const char *export,
                              QCryptoTLSCreds *tlscreds,
                              const char *hostname,
                              const char *x_dirty_bitmap,
                              Error **errp)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    int ret;

    /*
     * establish TCP connection, return error if it fails
     * TODO: Configurable retry-until-timeout behaviour.
     */
    QIOChannelSocket *sioc = nbd_establish_connection(saddr, errp);

    if (!sioc) {
        return -ECONNREFUSED;
    }

    /* NBD handshake */
    trace_nbd_client_connect(export);
    qio_channel_set_blocking(QIO_CHANNEL(sioc), true, NULL);

    s->info.request_sizes = true;
    s->info.structured_reply = true;
    s->info.base_allocation = true;
    s->info.x_dirty_bitmap = g_strdup(x_dirty_bitmap);
    s->info.name = g_strdup(export ?: "");
    ret = nbd_receive_negotiate(QIO_CHANNEL(sioc), tlscreds, hostname,
                                &s->ioc, &s->info, errp);
    g_free(s->info.x_dirty_bitmap);
    g_free(s->info.name);
    if (ret < 0) {
        object_unref(OBJECT(sioc));
        return ret;
    }
    if (x_dirty_bitmap && !s->info.base_allocation) {
        error_setg(errp, "requested x-dirty-bitmap %s not found",
                   x_dirty_bitmap);
        ret = -EINVAL;
        goto fail;
    }
    if (s->info.flags & NBD_FLAG_READ_ONLY) {
        ret = bdrv_apply_auto_read_only(bs, "NBD export is read-only", errp);
        if (ret < 0) {
            goto fail;
        }
    }
    if (s->info.flags & NBD_FLAG_SEND_FUA) {
        bs->supported_write_flags = BDRV_REQ_FUA;
        bs->supported_zero_flags |= BDRV_REQ_FUA;
    }
    if (s->info.flags & NBD_FLAG_SEND_WRITE_ZEROES) {
        bs->supported_zero_flags |= BDRV_REQ_MAY_UNMAP;
    }

    s->sioc = sioc;

    if (!s->ioc) {
        s->ioc = QIO_CHANNEL(sioc);
        object_ref(OBJECT(s->ioc));
    }

    /*
     * Now that we're connected, set the socket to be non-blocking and
     * kick the reply mechanism.
     */
    qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL);
    s->connection_co = qemu_coroutine_create(nbd_connection_entry, s);
    bdrv_inc_in_flight(bs);
    nbd_client_attach_aio_context(bs, bdrv_get_aio_context(bs));

    trace_nbd_client_connect_success(export);

    return 0;

fail:
    /*
     * We have connected, but must fail for other reasons. The
     * connection is still blocking; send NBD_CMD_DISC as a courtesy
     * to the server.
     */
    {
        NBDRequest request = { .type = NBD_CMD_DISC };

        nbd_send_request(s->ioc ?: QIO_CHANNEL(sioc), &request);

        object_unref(OBJECT(sioc));

        return ret;
    }
}

static int nbd_client_init(BlockDriverState *bs,
                           SocketAddress *saddr,
                           const char *export,
                           QCryptoTLSCreds *tlscreds,
                           const char *hostname,
                           const char *x_dirty_bitmap,
                           Error **errp)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;

    s->bs = bs;
    qemu_co_mutex_init(&s->send_mutex);
    qemu_co_queue_init(&s->free_sema);

    return nbd_client_connect(bs, saddr, export, tlscreds, hostname,
                              x_dirty_bitmap, errp);
}
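
/*
 * Filename parsing.  The URI forms accepted below are:
 *
 *   nbd://host[:port]/[export]          (TCP; "nbd" and "nbd+tcp" schemes)
 *   nbd+unix:///[export]?socket=path    (Unix domain socket)
 *
 * For example, nbd://localhost:10809/backup ends up in @options as
 * server.type=inet, server.host=localhost, server.port=10809,
 * export=backup.
 */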

static int nbd_parse_uri(const char *filename, QDict *options)
{
    URI *uri;
    const char *p;
    QueryParams *qp = NULL;
    int ret = 0;
    bool is_unix;

    uri = uri_parse(filename);
    if (!uri) {
        return -EINVAL;
    }

    /* transport */
    if (!g_strcmp0(uri->scheme, "nbd")) {
        is_unix = false;
    } else if (!g_strcmp0(uri->scheme, "nbd+tcp")) {
        is_unix = false;
    } else if (!g_strcmp0(uri->scheme, "nbd+unix")) {
        is_unix = true;
    } else {
        ret = -EINVAL;
        goto out;
    }

    p = uri->path ? uri->path : "/";
    p += strspn(p, "/");
    if (p[0]) {
        qdict_put_str(options, "export", p);
    }

    qp = query_params_parse(uri->query);
    if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) {
        ret = -EINVAL;
        goto out;
    }

    if (is_unix) {
        /* nbd+unix:///export?socket=path */
        if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) {
            ret = -EINVAL;
            goto out;
        }
        qdict_put_str(options, "server.type", "unix");
        qdict_put_str(options, "server.path", qp->p[0].value);
    } else {
        QString *host;
        char *port_str;

        /* nbd[+tcp]://host[:port]/export */
        if (!uri->server) {
            ret = -EINVAL;
            goto out;
        }

        /* strip braces from literal IPv6 address */
        if (uri->server[0] == '[') {
            host = qstring_from_substr(uri->server, 1,
                                       strlen(uri->server) - 1);
        } else {
            host = qstring_from_str(uri->server);
        }

        qdict_put_str(options, "server.type", "inet");
        qdict_put(options, "server.host", host);

        port_str = g_strdup_printf("%d", uri->port ?: NBD_DEFAULT_PORT);
        qdict_put_str(options, "server.port", port_str);
        g_free(port_str);
    }

out:
    if (qp) {
        query_params_free(qp);
    }
    uri_free(uri);
    return ret;
}

static bool nbd_has_filename_options_conflict(QDict *options, Error **errp)
{
    const QDictEntry *e;

    for (e = qdict_first(options); e; e = qdict_next(options, e)) {
        if (!strcmp(e->key, "host") ||
            !strcmp(e->key, "port") ||
            !strcmp(e->key, "path") ||
            !strcmp(e->key, "export") ||
            strstart(e->key, "server.", NULL))
        {
            error_setg(errp, "Option '%s' cannot be used with a file name",
                       e->key);
            return true;
        }
    }

    return false;
}
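
/*
 * Besides URIs, the legacy colon-separated syntax handled below is still
 * accepted, for example:
 *
 *   nbd:host[:port][:exportname=name]
 *   nbd:unix:/path/to/socket[:exportname=name]
 *
 * EN_OPTSTR (":exportname=") must come last, since export names may
 * themselves contain colons.
 */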

static void nbd_parse_filename(const char *filename, QDict *options,
                               Error **errp)
{
    char *file;
    char *export_name;
    const char *host_spec;
    const char *unixpath;

    if (nbd_has_filename_options_conflict(options, errp)) {
        return;
    }

    if (strstr(filename, "://")) {
        int ret = nbd_parse_uri(filename, options);
        if (ret < 0) {
            error_setg(errp, "No valid URL specified");
        }
        return;
    }

    file = g_strdup(filename);

    export_name = strstr(file, EN_OPTSTR);
    if (export_name) {
        if (export_name[strlen(EN_OPTSTR)] == 0) {
            goto out;
        }
        export_name[0] = 0; /* truncate 'file' */
        export_name += strlen(EN_OPTSTR);

        qdict_put_str(options, "export", export_name);
    }

    /* extract the host_spec - fail if it's not nbd:... */
    if (!strstart(file, "nbd:", &host_spec)) {
        error_setg(errp, "File name string for NBD must start with 'nbd:'");
        goto out;
    }

    if (!*host_spec) {
        goto out;
    }

    /* are we a UNIX or TCP socket? */
    if (strstart(host_spec, "unix:", &unixpath)) {
        qdict_put_str(options, "server.type", "unix");
        qdict_put_str(options, "server.path", unixpath);
    } else {
        InetSocketAddress *addr = g_new(InetSocketAddress, 1);

        if (inet_parse(addr, host_spec, errp)) {
            goto out_inet;
        }

        qdict_put_str(options, "server.type", "inet");
        qdict_put_str(options, "server.host", addr->host);
        qdict_put_str(options, "server.port", addr->port);
    out_inet:
        qapi_free_InetSocketAddress(addr);
    }

out:
    g_free(file);
}

static bool nbd_process_legacy_socket_options(QDict *output_options,
                                              QemuOpts *legacy_opts,
                                              Error **errp)
{
    const char *path = qemu_opt_get(legacy_opts, "path");
    const char *host = qemu_opt_get(legacy_opts, "host");
    const char *port = qemu_opt_get(legacy_opts, "port");
    const QDictEntry *e;

    if (!path && !host && !port) {
        return true;
    }

    for (e = qdict_first(output_options); e; e = qdict_next(output_options, e))
    {
        if (strstart(e->key, "server.", NULL)) {
            error_setg(errp, "Cannot use 'server' and path/host/port at the "
                       "same time");
            return false;
        }
    }

    if (path && host) {
        error_setg(errp, "path and host may not be used at the same time");
        return false;
    } else if (path) {
        if (port) {
            error_setg(errp, "port may not be used without host");
            return false;
        }

        qdict_put_str(output_options, "server.type", "unix");
        qdict_put_str(output_options, "server.path", path);
    } else if (host) {
        qdict_put_str(output_options, "server.type", "inet");
        qdict_put_str(output_options, "server.host", host);
        qdict_put_str(output_options, "server.port",
                      port ?: stringify(NBD_DEFAULT_PORT));
    }

    return true;
}

static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options,
                                 Error **errp)
{
    SocketAddress *saddr = NULL;
    QDict *addr = NULL;
    Visitor *iv = NULL;
    Error *local_err = NULL;

    qdict_extract_subqdict(options, &addr, "server.");
    if (!qdict_size(addr)) {
        error_setg(errp, "NBD server address missing");
        goto done;
    }

    iv = qobject_input_visitor_new_flat_confused(addr, errp);
    if (!iv) {
        goto done;
    }

    visit_type_SocketAddress(iv, NULL, &saddr, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto done;
    }

done:
    qobject_unref(addr);
    visit_free(iv);
    return saddr;
}

static QCryptoTLSCreds *nbd_get_tls_creds(const char *id, Error **errp)
{
    Object *obj;
    QCryptoTLSCreds *creds;

    obj = object_resolve_path_component(
        object_get_objects_root(), id);
    if (!obj) {
        error_setg(errp, "No TLS credentials with id '%s'",
                   id);
        return NULL;
    }
    creds = (QCryptoTLSCreds *)
        object_dynamic_cast(obj, TYPE_QCRYPTO_TLS_CREDS);
    if (!creds) {
        error_setg(errp, "Object with id '%s' is not TLS credentials",
                   id);
        return NULL;
    }

    if (creds->endpoint != QCRYPTO_TLS_CREDS_ENDPOINT_CLIENT) {
        error_setg(errp,
                   "Expecting TLS credentials with a client endpoint");
        return NULL;
    }
    object_ref(obj);
    return creds;
}


static QemuOptsList nbd_runtime_opts = {
    .name = "nbd",
    .head = QTAILQ_HEAD_INITIALIZER(nbd_runtime_opts.head),
    .desc = {
        {
            .name = "host",
            .type = QEMU_OPT_STRING,
            .help = "TCP host to connect to",
        },
        {
            .name = "port",
            .type = QEMU_OPT_STRING,
            .help = "TCP port to connect to",
        },
        {
            .name = "path",
            .type = QEMU_OPT_STRING,
            .help = "Unix socket path to connect to",
        },
        {
            .name = "export",
            .type = QEMU_OPT_STRING,
            .help = "Name of the NBD export to open",
        },
        {
            .name = "tls-creds",
            .type = QEMU_OPT_STRING,
            .help = "ID of the TLS credentials to use",
        },
        {
            .name = "x-dirty-bitmap",
            .type = QEMU_OPT_STRING,
            .help = "experimental: expose named dirty bitmap in place of "
                    "block status",
        },
        { /* end of list */ }
    },
};

static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
                    Error **errp)
{
    BDRVNBDState *s = bs->opaque;
    QemuOpts *opts = NULL;
    Error *local_err = NULL;
    QCryptoTLSCreds *tlscreds = NULL;
    const char *hostname = NULL;
    int ret = -EINVAL;

    opts = qemu_opts_create(&nbd_runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto error;
    }

    /* Translate @host, @port, and @path to a SocketAddress */
    if (!nbd_process_legacy_socket_options(options, opts, errp)) {
        goto error;
    }

    /* Pop the config into our state object. Exit if invalid. */
    s->saddr = nbd_config(s, options, errp);
    if (!s->saddr) {
        goto error;
    }

    s->export = g_strdup(qemu_opt_get(opts, "export"));

    s->tlscredsid = g_strdup(qemu_opt_get(opts, "tls-creds"));
    if (s->tlscredsid) {
        tlscreds = nbd_get_tls_creds(s->tlscredsid, errp);
        if (!tlscreds) {
            goto error;
        }

        /* TODO SOCKET_ADDRESS_KIND_FD where fd has AF_INET or AF_INET6 */
        if (s->saddr->type != SOCKET_ADDRESS_TYPE_INET) {
            error_setg(errp, "TLS only supported over IP sockets");
            goto error;
        }
        hostname = s->saddr->u.inet.host;
    }

    /* NBD handshake */
    ret = nbd_client_init(bs, s->saddr, s->export, tlscreds, hostname,
                          qemu_opt_get(opts, "x-dirty-bitmap"), errp);

error:
    if (tlscreds) {
        object_unref(OBJECT(tlscreds));
    }
    if (ret < 0) {
        qapi_free_SocketAddress(s->saddr);
        g_free(s->export);
        g_free(s->tlscredsid);
    }
    qemu_opts_del(opts);
    return ret;
}

static int nbd_co_flush(BlockDriverState *bs)
{
    return nbd_client_co_flush(bs);
}

static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
    uint32_t min = s->info.min_block;
    uint32_t max = MIN_NON_ZERO(NBD_MAX_BUFFER_SIZE, s->info.max_block);

    /*
     * If the server did not advertise an alignment:
     * - a size that is not sector-aligned implies that an alignment
     *   of 1 can be used to access those tail bytes
     * - advertisement of block status requires an alignment of 1, so
     *   that we don't violate block layer constraints that block
     *   status is always aligned (as we can't control whether the
     *   server will report sub-sector extents, such as a hole at EOF
     *   on an unaligned POSIX file)
     * - otherwise, assume the server is so old that we are safer avoiding
     *   sub-sector requests
     */
    if (!min) {
        min = (!QEMU_IS_ALIGNED(s->info.size, BDRV_SECTOR_SIZE) ||
               s->info.base_allocation) ? 1 : BDRV_SECTOR_SIZE;
    }

    bs->bl.request_alignment = min;
    bs->bl.max_pdiscard = max;
    bs->bl.max_pwrite_zeroes = max;
    bs->bl.max_transfer = max;

    if (s->info.opt_block &&
        s->info.opt_block > bs->bl.opt_transfer) {
        bs->bl.opt_transfer = s->info.opt_block;
    }
}

static void nbd_close(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;

    nbd_client_close(bs);

    qapi_free_SocketAddress(s->saddr);
    g_free(s->export);
    g_free(s->tlscredsid);
}

static int64_t nbd_getlength(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;

    return s->info.size;
}

static void nbd_refresh_filename(BlockDriverState *bs)
{
    BDRVNBDState *s = bs->opaque;
    const char *host = NULL, *port = NULL, *path = NULL;

    if (s->saddr->type == SOCKET_ADDRESS_TYPE_INET) {
        const InetSocketAddress *inet = &s->saddr->u.inet;
        if (!inet->has_ipv4 && !inet->has_ipv6 && !inet->has_to) {
            host = inet->host;
            port = inet->port;
        }
    } else if (s->saddr->type == SOCKET_ADDRESS_TYPE_UNIX) {
        path = s->saddr->u.q_unix.path;
    } /* else can't represent as pseudo-filename */

    if (path && s->export) {
        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
                 "nbd+unix:///%s?socket=%s", s->export, path);
    } else if (path && !s->export) {
        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
                 "nbd+unix://?socket=%s", path);
    } else if (host && s->export) {
        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
                 "nbd://%s:%s/%s", host, port, s->export);
    } else if (host && !s->export) {
        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
                 "nbd://%s:%s", host, port);
    }
}
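
/*
 * For instance, a connection over Unix socket /tmp/nbd.sock with export
 * "backup" is rendered above as nbd+unix:///backup?socket=/tmp/nbd.sock,
 * and a TCP connection to localhost:10809 without an export name as
 * nbd://localhost:10809 - the same forms nbd_parse_uri() accepts.
 */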

static char *nbd_dirname(BlockDriverState *bs, Error **errp)
{
    /* The generic bdrv_dirname() implementation is able to work out some
     * directory name for NBD nodes, but that would be wrong. So far there is
     * no specification for how "export paths" would work, so NBD does not
     * have directory names. */
    error_setg(errp, "Cannot generate a base directory for NBD nodes");
    return NULL;
}

static const char *const nbd_strong_runtime_opts[] = {
    "path",
    "host",
    "port",
    "export",
    "tls-creds",
    "server.",

    NULL
};

static BlockDriver bdrv_nbd = {
    .format_name                = "nbd",
    .protocol_name              = "nbd",
    .instance_size              = sizeof(BDRVNBDState),
    .bdrv_parse_filename        = nbd_parse_filename,
    .bdrv_file_open             = nbd_open,
    .bdrv_co_preadv             = nbd_client_co_preadv,
    .bdrv_co_pwritev            = nbd_client_co_pwritev,
    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
    .bdrv_refresh_limits        = nbd_refresh_limits,
    .bdrv_getlength             = nbd_getlength,
    .bdrv_detach_aio_context    = nbd_client_detach_aio_context,
    .bdrv_attach_aio_context    = nbd_client_attach_aio_context,
    .bdrv_refresh_filename      = nbd_refresh_filename,
    .bdrv_co_block_status       = nbd_client_co_block_status,
    .bdrv_dirname               = nbd_dirname,
    .strong_runtime_opts        = nbd_strong_runtime_opts,
};

static BlockDriver bdrv_nbd_tcp = {
    .format_name                = "nbd",
    .protocol_name              = "nbd+tcp",
    .instance_size              = sizeof(BDRVNBDState),
    .bdrv_parse_filename        = nbd_parse_filename,
    .bdrv_file_open             = nbd_open,
    .bdrv_co_preadv             = nbd_client_co_preadv,
    .bdrv_co_pwritev            = nbd_client_co_pwritev,
    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
    .bdrv_refresh_limits        = nbd_refresh_limits,
    .bdrv_getlength             = nbd_getlength,
    .bdrv_detach_aio_context    = nbd_client_detach_aio_context,
    .bdrv_attach_aio_context    = nbd_client_attach_aio_context,
    .bdrv_refresh_filename      = nbd_refresh_filename,
    .bdrv_co_block_status       = nbd_client_co_block_status,
    .bdrv_dirname               = nbd_dirname,
    .strong_runtime_opts        = nbd_strong_runtime_opts,
};

static BlockDriver bdrv_nbd_unix = {
    .format_name                = "nbd",
    .protocol_name              = "nbd+unix",
    .instance_size              = sizeof(BDRVNBDState),
    .bdrv_parse_filename        = nbd_parse_filename,
    .bdrv_file_open             = nbd_open,
    .bdrv_co_preadv             = nbd_client_co_preadv,
    .bdrv_co_pwritev            = nbd_client_co_pwritev,
    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
    .bdrv_close                 = nbd_close,
    .bdrv_co_flush_to_os        = nbd_co_flush,
    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
    .bdrv_refresh_limits        = nbd_refresh_limits,
    .bdrv_getlength             = nbd_getlength,
    .bdrv_detach_aio_context    = nbd_client_detach_aio_context,
    .bdrv_attach_aio_context    = nbd_client_attach_aio_context,
    .bdrv_refresh_filename      = nbd_refresh_filename,
    .bdrv_co_block_status       = nbd_client_co_block_status,
    .bdrv_dirname               = nbd_dirname,
    .strong_runtime_opts        = nbd_strong_runtime_opts,
};

static void bdrv_nbd_init(void)
{
    bdrv_register(&bdrv_nbd);
    bdrv_register(&bdrv_nbd_tcp);
    bdrv_register(&bdrv_nbd_unix);
}

block_init(bdrv_nbd_init);