/*
 * GlusterFS backend for QEMU
 *
 * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */
#include "qemu/osdep.h"
#include <glusterfs/api/glfs.h>
#include "block/block_int.h"
#include "qapi/error.h"
#include "qemu/uri.h"

typedef struct GlusterAIOCB {
    int64_t size;
    int ret;
    QEMUBH *bh;
    Coroutine *coroutine;
    AioContext *aio_context;
} GlusterAIOCB;

typedef struct BDRVGlusterState {
    struct glfs *glfs;
    struct glfs_fd *fd;
    bool supports_seek_data;
    int debug_level;
} BDRVGlusterState;

typedef struct GlusterConf {
    char *server;
    int port;
    char *volname;
    char *image;
    char *transport;
    int debug_level;
} GlusterConf;

static void qemu_gluster_gconf_free(GlusterConf *gconf)
{
    if (gconf) {
        g_free(gconf->server);
        g_free(gconf->volname);
        g_free(gconf->image);
        g_free(gconf->transport);
        g_free(gconf);
    }
}

static int parse_volume_options(GlusterConf *gconf, char *path)
{
    char *p, *q;

    if (!path) {
        return -EINVAL;
    }

    /* volume */
    p = q = path + strspn(path, "/");
    p += strcspn(p, "/");
    if (*p == '\0') {
        return -EINVAL;
    }
    gconf->volname = g_strndup(q, p - q);

    /* image */
    p += strspn(p, "/");
    if (*p == '\0') {
        return -EINVAL;
    }
    gconf->image = g_strdup(p);
    return 0;
}
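/*
 * Worked example (for illustration): given the path "/testvol/dir/a.img",
 * parse_volume_options() sets gconf->volname = "testvol" and
 * gconf->image = "dir/a.img".  A path without an image component, such as
 * "/testvol" or "/testvol/", fails with -EINVAL.
 */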
/*
 * file=gluster[+transport]://[server[:port]]/volname/image[?socket=...]
 *
 * 'gluster' is the protocol.
 *
 * 'transport' specifies the transport type used to connect to the gluster
 * management daemon (glusterd). Valid transport types are
 * tcp, unix and rdma. If a transport type isn't specified, then tcp
 * type is assumed.
 *
 * 'server' specifies the server where the volume file specification for
 * the given volume resides. This can be a hostname, an IPv4 address or
 * an IPv6 address. An IPv6 address must be enclosed in square brackets [ ].
 * If the transport type is 'unix', then the 'server' field should not be
 * specified; instead, the 'socket' field must be populated with the path
 * to the unix domain socket.
 *
 * 'port' is the port number on which glusterd is listening. It is optional;
 * if not specified, QEMU sends 0, which makes gluster use the default port.
 * If the transport type is unix, then 'port' should not be specified.
 *
 * 'volname' is the name of the gluster volume which contains the VM image.
 *
 * 'image' is the path to the actual VM image that resides on the gluster
 * volume.
 *
 * Examples:
 *
 * file=gluster://1.2.3.4/testvol/a.img
 * file=gluster+tcp://1.2.3.4/testvol/a.img
 * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img
 * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
 * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img
 * file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img
 * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
 * file=gluster+rdma://1.2.3.4:24007/testvol/a.img
 */
static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename)
{
    URI *uri;
    QueryParams *qp = NULL;
    bool is_unix = false;
    int ret = 0;

    uri = uri_parse(filename);
    if (!uri) {
        return -EINVAL;
    }

    /* transport */
    if (!uri->scheme || !strcmp(uri->scheme, "gluster")) {
        gconf->transport = g_strdup("tcp");
    } else if (!strcmp(uri->scheme, "gluster+tcp")) {
        gconf->transport = g_strdup("tcp");
    } else if (!strcmp(uri->scheme, "gluster+unix")) {
        gconf->transport = g_strdup("unix");
        is_unix = true;
    } else if (!strcmp(uri->scheme, "gluster+rdma")) {
        gconf->transport = g_strdup("rdma");
    } else {
        ret = -EINVAL;
        goto out;
    }

    ret = parse_volume_options(gconf, uri->path);
    if (ret < 0) {
        goto out;
    }

    qp = query_params_parse(uri->query);
    if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) {
        ret = -EINVAL;
        goto out;
    }

    if (is_unix) {
        if (uri->server || uri->port) {
            ret = -EINVAL;
            goto out;
        }
        if (strcmp(qp->p[0].name, "socket")) {
            ret = -EINVAL;
            goto out;
        }
        gconf->server = g_strdup(qp->p[0].value);
    } else {
        gconf->server = g_strdup(uri->server ? uri->server : "localhost");
        gconf->port = uri->port;
    }

out:
    if (qp) {
        query_params_free(qp);
    }
    uri_free(uri);
    return ret;
}

static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename,
                                      Error **errp)
{
    struct glfs *glfs = NULL;
    int ret;
    int old_errno;

    ret = qemu_gluster_parseuri(gconf, filename);
    if (ret < 0) {
        error_setg(errp, "Usage: file=gluster[+transport]://[server[:port]]/"
                         "volname/image[?socket=...]");
        errno = -ret;
        goto out;
    }

    glfs = glfs_new(gconf->volname);
    if (!glfs) {
        goto out;
    }

    ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->server,
                                  gconf->port);
    if (ret < 0) {
        goto out;
    }

    ret = glfs_set_logging(glfs, "-", gconf->debug_level);
    if (ret < 0) {
        goto out;
    }

    ret = glfs_init(glfs);
    if (ret) {
        error_setg_errno(errp, errno,
                         "Gluster connection failed for server=%s port=%d "
                         "volume=%s image=%s transport=%s", gconf->server,
                         gconf->port, gconf->volname, gconf->image,
                         gconf->transport);

        /* glfs_init sometimes doesn't set errno although the docs suggest
         * it should */
        if (errno == 0) {
            errno = EINVAL;
        }

        goto out;
    }
    return glfs;

out:
    if (glfs) {
        old_errno = errno;
        glfs_fini(glfs);
        errno = old_errno;
    }
    return NULL;
}
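/*
 * For illustration, opening "gluster://1.2.3.4/testvol/a.img" boils down to
 * roughly this gfapi call sequence (error handling omitted):
 *
 *   struct glfs *glfs = glfs_new("testvol");
 *   glfs_set_volfile_server(glfs, "tcp", "1.2.3.4", 0);
 *   glfs_set_logging(glfs, "-", gconf->debug_level);
 *   glfs_init(glfs);
 */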
static void qemu_gluster_complete_aio(void *opaque)
{
    GlusterAIOCB *acb = (GlusterAIOCB *)opaque;

    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_coroutine_enter(acb->coroutine, NULL);
}

/*
 * AIO callback routine called from a GlusterFS thread.
 */
static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
{
    GlusterAIOCB *acb = (GlusterAIOCB *)arg;

    if (!ret || ret == acb->size) {
        acb->ret = 0; /* Success */
    } else if (ret < 0) {
        acb->ret = -errno; /* Read/Write failed */
    } else {
        acb->ret = -EIO; /* Partial read/write - fail it */
    }

    acb->bh = aio_bh_new(acb->aio_context, qemu_gluster_complete_aio, acb);
    qemu_bh_schedule(acb->bh);
}

#define GLUSTER_OPT_FILENAME "filename"
#define GLUSTER_OPT_DEBUG "debug"
#define GLUSTER_DEBUG_DEFAULT 4
#define GLUSTER_DEBUG_MAX 9

/* TODO Convert to fine-grained options */
static QemuOptsList runtime_opts = {
    .name = "gluster",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = GLUSTER_OPT_FILENAME,
            .type = QEMU_OPT_STRING,
            .help = "URL to the gluster image",
        },
        {
            .name = GLUSTER_OPT_DEBUG,
            .type = QEMU_OPT_NUMBER,
            .help = "Gluster log level, valid range is 0-9",
        },
        { /* end of list */ }
    },
};

static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
{
    assert(open_flags != NULL);

    *open_flags |= O_BINARY;

    if (bdrv_flags & BDRV_O_RDWR) {
        *open_flags |= O_RDWR;
    } else {
        *open_flags |= O_RDONLY;
    }

    if (bdrv_flags & BDRV_O_NOCACHE) {
        *open_flags |= O_DIRECT;
    }
}

/*
 * Do SEEK_DATA/HOLE to detect if it is functional. Older broken versions of
 * gfapi incorrectly return the current offset when SEEK_DATA/HOLE is used.
 * - Corrected versions return -1 and set errno to EINVAL.
 * - Versions that support SEEK_DATA/HOLE correctly will return -1 and set
 *   errno to ENXIO when SEEK_DATA is called with a position of EOF.
 */
static bool qemu_gluster_test_seek(struct glfs_fd *fd)
{
    off_t ret, eof;

    eof = glfs_lseek(fd, 0, SEEK_END);
    if (eof < 0) {
        /* this should never occur */
        return false;
    }

    /* this should always fail with ENXIO if SEEK_DATA is supported */
    ret = glfs_lseek(fd, eof, SEEK_DATA);
    return (ret < 0) && (errno == ENXIO);
}
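/*
 * Probe outcomes, for illustration:
 *   - gfapi with working SEEK_DATA: glfs_lseek(fd, eof, SEEK_DATA)
 *     returns -1 with errno == ENXIO
 *   - gfapi without SEEK_DATA support: returns -1 with errno == EINVAL
 *   - older broken gfapi: returns eof itself (the current offset)
 * Only the first case sets supports_seek_data, which enables
 * find_allocation() further below.
 */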
static int qemu_gluster_open(BlockDriverState *bs, QDict *options,
                             int bdrv_flags, Error **errp)
{
    BDRVGlusterState *s = bs->opaque;
    int open_flags = 0;
    int ret = 0;
    GlusterConf *gconf = g_new0(GlusterConf, 1);
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *filename;

    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto out;
    }

    filename = qemu_opt_get(opts, GLUSTER_OPT_FILENAME);

    s->debug_level = qemu_opt_get_number(opts, GLUSTER_OPT_DEBUG,
                                         GLUSTER_DEBUG_DEFAULT);
    if (s->debug_level < 0) {
        s->debug_level = 0;
    } else if (s->debug_level > GLUSTER_DEBUG_MAX) {
        s->debug_level = GLUSTER_DEBUG_MAX;
    }

    gconf->debug_level = s->debug_level;
    s->glfs = qemu_gluster_init(gconf, filename, errp);
    if (!s->glfs) {
        ret = -errno;
        goto out;
    }

#ifdef CONFIG_GLUSTERFS_XLATOR_OPT
    /* Without this, if fsync fails for a recoverable reason (for instance,
     * ENOSPC), gluster will dump its cache, preventing retries. This means
     * almost certain data loss. Not all gluster versions support the
     * 'resync-failed-syncs-after-fsync' key value, but there is no way to
     * discover at runtime whether it is supported (this API returns success
     * for unknown key/value pairs). */
    ret = glfs_set_xlator_option(s->glfs, "*-write-behind",
                                 "resync-failed-syncs-after-fsync",
                                 "on");
    if (ret < 0) {
        error_setg_errno(errp, errno, "Unable to set xlator key/value pair");
        ret = -errno;
        goto out;
    }
#endif

    qemu_gluster_parse_flags(bdrv_flags, &open_flags);

    s->fd = glfs_open(s->glfs, gconf->image, open_flags);
    if (!s->fd) {
        /* don't probe SEEK_DATA on a NULL fd; clean up below */
        ret = -errno;
        goto out;
    }

    s->supports_seek_data = qemu_gluster_test_seek(s->fd);

out:
    qemu_opts_del(opts);
    qemu_gluster_gconf_free(gconf);
    if (!ret) {
        return ret;
    }
    if (s->fd) {
        glfs_close(s->fd);
    }
    if (s->glfs) {
        glfs_fini(s->glfs);
    }
    return ret;
}

typedef struct BDRVGlusterReopenState {
    struct glfs *glfs;
    struct glfs_fd *fd;
} BDRVGlusterReopenState;


static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
                                       BlockReopenQueue *queue, Error **errp)
{
    int ret = 0;
    BDRVGlusterState *s;
    BDRVGlusterReopenState *reop_s;
    GlusterConf *gconf = NULL;
    int open_flags = 0;

    assert(state != NULL);
    assert(state->bs != NULL);

    s = state->bs->opaque;

    state->opaque = g_new0(BDRVGlusterReopenState, 1);
    reop_s = state->opaque;

    qemu_gluster_parse_flags(state->flags, &open_flags);

    gconf = g_new0(GlusterConf, 1);

    gconf->debug_level = s->debug_level;
    reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp);
    if (reop_s->glfs == NULL) {
        ret = -errno;
        goto exit;
    }

#ifdef CONFIG_GLUSTERFS_XLATOR_OPT
    ret = glfs_set_xlator_option(reop_s->glfs, "*-write-behind",
                                 "resync-failed-syncs-after-fsync", "on");
    if (ret < 0) {
        error_setg_errno(errp, errno, "Unable to set xlator key/value pair");
        ret = -errno;
        goto exit;
    }
#endif

    reop_s->fd = glfs_open(reop_s->glfs, gconf->image, open_flags);
    if (reop_s->fd == NULL) {
        /* reop_s->glfs will be cleaned up in _abort */
        ret = -errno;
        goto exit;
    }

exit:
    /* state->opaque will be freed in either the _abort or _commit */
    qemu_gluster_gconf_free(gconf);
    return ret;
}

static void qemu_gluster_reopen_commit(BDRVReopenState *state)
{
    BDRVGlusterReopenState *reop_s = state->opaque;
    BDRVGlusterState *s = state->bs->opaque;

    /* close the old */
    if (s->fd) {
        glfs_close(s->fd);
    }
    if (s->glfs) {
        glfs_fini(s->glfs);
    }

    /* use the newly opened image / connection */
    s->fd = reop_s->fd;
    s->glfs = reop_s->glfs;

    g_free(state->opaque);
    state->opaque = NULL;
}


static void qemu_gluster_reopen_abort(BDRVReopenState *state)
{
    BDRVGlusterReopenState *reop_s = state->opaque;

    if (reop_s == NULL) {
        return;
    }

    if (reop_s->fd) {
        glfs_close(reop_s->fd);
    }

    if (reop_s->glfs) {
        glfs_fini(reop_s->glfs);
    }

    g_free(state->opaque);
    state->opaque = NULL;
}
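/*
 * Reopen lifecycle, for illustration: the block layer first calls
 * qemu_gluster_reopen_prepare(), which opens a second glfs connection and
 * fd with the new flags while the old pair stays live.  On success,
 * qemu_gluster_reopen_commit() closes the old pair and adopts the new one;
 * on failure, qemu_gluster_reopen_abort() closes the new pair and the old
 * connection is left untouched.
 */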
#ifdef CONFIG_GLUSTERFS_ZEROFILL
static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
        int64_t offset, int size, BdrvRequestFlags flags)
{
    int ret;
    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;

    acb.size = size;
    acb.ret = 0;
    acb.coroutine = qemu_coroutine_self();
    acb.aio_context = bdrv_get_aio_context(bs);

    ret = glfs_zerofill_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
    if (ret < 0) {
        return -errno;
    }

    qemu_coroutine_yield();
    return acb.ret;
}

static inline bool gluster_supports_zerofill(void)
{
    return true;
}

static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
                                        int64_t size)
{
    return glfs_zerofill(fd, offset, size);
}

#else
static inline bool gluster_supports_zerofill(void)
{
    return false;
}

static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
                                        int64_t size)
{
    return 0;
}
#endif

static int qemu_gluster_create(const char *filename,
                               QemuOpts *opts, Error **errp)
{
    struct glfs *glfs;
    struct glfs_fd *fd;
    int ret = 0;
    int prealloc = 0;
    int64_t total_size = 0;
    char *tmp = NULL;
    GlusterConf *gconf = g_new0(GlusterConf, 1);

    gconf->debug_level = qemu_opt_get_number_del(opts, GLUSTER_OPT_DEBUG,
                                                 GLUSTER_DEBUG_DEFAULT);
    if (gconf->debug_level < 0) {
        gconf->debug_level = 0;
    } else if (gconf->debug_level > GLUSTER_DEBUG_MAX) {
        gconf->debug_level = GLUSTER_DEBUG_MAX;
    }

    glfs = qemu_gluster_init(gconf, filename, errp);
    if (!glfs) {
        ret = -errno;
        goto out;
    }

    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);

    tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    if (!tmp || !strcmp(tmp, "off")) {
        prealloc = 0;
    } else if (!strcmp(tmp, "full") && gluster_supports_zerofill()) {
        prealloc = 1;
    } else {
        error_setg(errp, "Invalid preallocation mode '%s', or GlusterFS "
                         "doesn't support the zerofill API", tmp);
        ret = -EINVAL;
        goto out;
    }

    fd = glfs_creat(glfs, gconf->image,
                    O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
    if (!fd) {
        ret = -errno;
    } else {
        if (glfs_ftruncate(fd, total_size) == 0) {
            if (prealloc && qemu_gluster_zerofill(fd, 0, total_size)) {
                ret = -errno;
            }
        } else {
            ret = -errno;
        }

        if (glfs_close(fd) != 0) {
            ret = -errno;
        }
    }
out:
    g_free(tmp);
    qemu_gluster_gconf_free(gconf);
    if (glfs) {
        glfs_fini(glfs);
    }
    return ret;
}

static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write)
{
    int ret;
    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;
    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
    off_t offset = sector_num * BDRV_SECTOR_SIZE;

    acb.size = size;
    acb.ret = 0;
    acb.coroutine = qemu_coroutine_self();
    acb.aio_context = bdrv_get_aio_context(bs);

    if (write) {
        ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
                                 gluster_finish_aiocb, &acb);
    } else {
        ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
                                gluster_finish_aiocb, &acb);
    }

    if (ret < 0) {
        return -errno;
    }

    qemu_coroutine_yield();
    return acb.ret;
}

static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
{
    int ret;
    BDRVGlusterState *s = bs->opaque;

    ret = glfs_ftruncate(s->fd, offset);
    if (ret < 0) {
        return -errno;
    }

    return 0;
}
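/*
 * I/O pattern, for illustration: each asynchronous request
 * (qemu_gluster_co_pwrite_zeroes(), qemu_gluster_co_rw(), and the flush
 * and discard paths below) fills a stack-allocated GlusterAIOCB, issues a
 * glfs_*_async() call with gluster_finish_aiocb() as the completion
 * callback, and yields.  The callback runs in a gfapi thread, stores the
 * result in acb->ret, and schedules a bottom half in acb->aio_context,
 * which re-enters the coroutine via qemu_gluster_complete_aio().
 */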
static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0);
}

static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1);
}

static void qemu_gluster_close(BlockDriverState *bs)
{
    BDRVGlusterState *s = bs->opaque;

    if (s->fd) {
        glfs_close(s->fd);
        s->fd = NULL;
    }
    glfs_fini(s->glfs);
}

static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
{
    int ret;
    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;

    acb.size = 0;
    acb.ret = 0;
    acb.coroutine = qemu_coroutine_self();
    acb.aio_context = bdrv_get_aio_context(bs);

    ret = glfs_fsync_async(s->fd, gluster_finish_aiocb, &acb);
    if (ret < 0) {
        ret = -errno;
        goto error;
    }

    qemu_coroutine_yield();
    if (acb.ret < 0) {
        ret = acb.ret;
        goto error;
    }

    return acb.ret;

error:
    /* Some versions of Gluster (3.5.6 -> 3.5.8?) will not retain their cache
     * after an fsync failure, so we have no way of allowing the guest to
     * safely continue.  Gluster versions prior to 3.5.6 don't retain the
     * cache either, but will invalidate the fd on error, so this is again
     * our only option.
     *
     * The 'resync-failed-syncs-after-fsync' xlator option for the
     * write-behind cache will cause later gluster versions to retain the
     * cache after error, so long as the fd remains open.  However, we
     * currently have no way of knowing if this option is supported.
     *
     * TODO: Once gluster provides a way for us to determine if the option
     * is supported, bypass the closure and setting drv to NULL. */
    qemu_gluster_close(bs);
    bs->drv = NULL;
    return ret;
}

#ifdef CONFIG_GLUSTERFS_DISCARD
static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    int ret;
    GlusterAIOCB acb;
    BDRVGlusterState *s = bs->opaque;
    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
    off_t offset = sector_num * BDRV_SECTOR_SIZE;

    acb.size = 0;
    acb.ret = 0;
    acb.coroutine = qemu_coroutine_self();
    acb.aio_context = bdrv_get_aio_context(bs);

    ret = glfs_discard_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
    if (ret < 0) {
        return -errno;
    }

    qemu_coroutine_yield();
    return acb.ret;
}
#endif

static int64_t qemu_gluster_getlength(BlockDriverState *bs)
{
    BDRVGlusterState *s = bs->opaque;
    int64_t ret;

    ret = glfs_lseek(s->fd, 0, SEEK_END);
    if (ret < 0) {
        return -errno;
    } else {
        return ret;
    }
}

static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs)
{
    BDRVGlusterState *s = bs->opaque;
    struct stat st;
    int ret;

    ret = glfs_fstat(s->fd, &st);
    if (ret < 0) {
        return -errno;
    } else {
        return st.st_blocks * 512;
    }
}

static int qemu_gluster_has_zero_init(BlockDriverState *bs)
{
    /* GlusterFS volume could be backed by a block device */
    return 0;
}
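/*
 * For illustration: a sparsely written 10 GiB image with only 1 GiB of
 * blocks allocated reports qemu_gluster_getlength() == 10 GiB (the offset
 * returned by SEEK_END), while qemu_gluster_allocated_file_size() reports
 * roughly 1 GiB (st_blocks * 512).
 */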
/*
 * Find allocation range in @bs around offset @start.
 * May change underlying file descriptor's file offset.
 * If @start is not in a hole, store @start in @data, and the
 * beginning of the next hole in @hole, and return 0.
 * If @start is in a non-trailing hole, store @start in @hole and the
 * beginning of the next non-hole in @data, and return 0.
 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
 * If we can't find out, return a negative errno other than -ENXIO.
 *
 * (Shamefully copied from raw-posix.c, with only minuscule adaptations.)
 */
static int find_allocation(BlockDriverState *bs, off_t start,
                           off_t *data, off_t *hole)
{
    BDRVGlusterState *s = bs->opaque;
    off_t offs;

    if (!s->supports_seek_data) {
        return -ENOTSUP;
    }

    /*
     * SEEK_DATA cases:
     * D1. offs == start: start is in data
     * D2. offs > start: start is in a hole, next data at offs
     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
     *     or start is beyond EOF
     *     If the latter happens, the file has been truncated behind
     *     our back since we opened it.  All bets are off then.
     *     Treating it like a trailing hole is simplest.
     * D4. offs < 0, errno != ENXIO: we learned nothing
     */
    offs = glfs_lseek(s->fd, start, SEEK_DATA);
    if (offs < 0) {
        return -errno;          /* D3 or D4 */
    }
    assert(offs >= start);

    if (offs > start) {
        /* D2: in hole, next data at offs */
        *hole = start;
        *data = offs;
        return 0;
    }

    /* D1: in data, end not yet known */

    /*
     * SEEK_HOLE cases:
     * H1. offs == start: start is in a hole
     *     If this happens here, a hole has been dug behind our back
     *     since the previous lseek().
     * H2. offs > start: either start is in data, next hole at offs,
     *     or start is in a trailing hole, EOF at offs
     *     Linux treats trailing holes like any other hole: offs ==
     *     start.  Solaris seeks to EOF instead: offs > start (blech).
     *     If that happens here, a hole has been dug behind our back
     *     since the previous lseek().
     * H3. offs < 0, errno = ENXIO: start is beyond EOF
     *     If this happens, the file has been truncated behind our
     *     back since we opened it.  Treat it like a trailing hole.
     * H4. offs < 0, errno != ENXIO: we learned nothing
     *     Pretend we know nothing at all, i.e. "forget" about D1.
     */
    offs = glfs_lseek(s->fd, start, SEEK_HOLE);
    if (offs < 0) {
        return -errno;          /* D1 and (H3 or H4) */
    }
    assert(offs >= start);

    if (offs > start) {
        /*
         * D1 and H2: either in data, next hole at offs, or it was in
         * data but is now in a trailing hole.  In the latter case,
         * all bets are off.  Treating it as if there was data all
         * the way to EOF is safe, so simply do that.
         */
        *data = start;
        *hole = offs;
        return 0;
    }

    /* D1 and H1 */
    return -EBUSY;
}
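/*
 * Worked example (for illustration, assuming a file with data in
 * [0, 64K) followed by a hole up to EOF): find_allocation(bs, 0, &data,
 * &hole) stores data = 0 and hole = 65536 and returns 0 (cases D1 + H2);
 * find_allocation(bs, 65536, &data, &hole) returns -ENXIO because the
 * hole is a trailing one (case D3).
 */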
/*
 * Returns the allocation status of the specified sectors.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 *
 * (Based on raw_co_get_block_status() from raw-posix.c.)
 */
static int64_t coroutine_fn qemu_gluster_co_get_block_status(
        BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
        BlockDriverState **file)
{
    BDRVGlusterState *s = bs->opaque;
    off_t start, data = 0, hole = 0;
    int64_t total_size;
    int ret = -EINVAL;

    if (!s->fd) {
        return ret;
    }

    start = sector_num * BDRV_SECTOR_SIZE;
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        return total_size;
    } else if (start >= total_size) {
        *pnum = 0;
        return 0;
    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
    }

    ret = find_allocation(bs, start, &data, &hole);
    if (ret == -ENXIO) {
        /* Trailing hole */
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_ZERO;
    } else if (ret < 0) {
        /* No info available, so pretend there are no holes */
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA;
    } else if (data == start) {
        /* On a data extent, compute sectors to the end of the extent,
         * possibly including a partial sector at EOF. */
        *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
        ret = BDRV_BLOCK_DATA;
    } else {
        /* On a hole, compute sectors to the beginning of the next extent. */
        assert(hole == start);
        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
        ret = BDRV_BLOCK_ZERO;
    }

    *file = bs;

    return ret | BDRV_BLOCK_OFFSET_VALID | start;
}


static QemuOptsList qemu_gluster_create_opts = {
    .name = "qemu-gluster-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off, full)"
        },
        {
            .name = GLUSTER_OPT_DEBUG,
            .type = QEMU_OPT_NUMBER,
            .help = "Gluster log level, valid range is 0-9",
        },
        { /* end of list */ }
    }
};

static BlockDriver bdrv_gluster = {
    .format_name = "gluster",
    .protocol_name = "gluster",
    .instance_size = sizeof(BDRVGlusterState),
    .bdrv_needs_filename = true,
    .bdrv_file_open = qemu_gluster_open,
    .bdrv_reopen_prepare = qemu_gluster_reopen_prepare,
    .bdrv_reopen_commit = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort = qemu_gluster_reopen_abort,
    .bdrv_close = qemu_gluster_close,
    .bdrv_create = qemu_gluster_create,
    .bdrv_getlength = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate = qemu_gluster_truncate,
    .bdrv_co_readv = qemu_gluster_co_readv,
    .bdrv_co_writev = qemu_gluster_co_writev,
    .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init = qemu_gluster_has_zero_init,
#ifdef CONFIG_GLUSTERFS_DISCARD
    .bdrv_co_discard = qemu_gluster_co_discard,
#endif
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes,
#endif
    .bdrv_co_get_block_status = qemu_gluster_co_get_block_status,
    .create_opts = &qemu_gluster_create_opts,
};
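/*
 * The three driver definitions below are identical to bdrv_gluster except
 * for .protocol_name, so that gluster+tcp://, gluster+unix:// and
 * gluster+rdma:// URIs resolve to this same backend.
 */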
static BlockDriver bdrv_gluster_tcp = {
    .format_name = "gluster",
    .protocol_name = "gluster+tcp",
    .instance_size = sizeof(BDRVGlusterState),
    .bdrv_needs_filename = true,
    .bdrv_file_open = qemu_gluster_open,
    .bdrv_reopen_prepare = qemu_gluster_reopen_prepare,
    .bdrv_reopen_commit = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort = qemu_gluster_reopen_abort,
    .bdrv_close = qemu_gluster_close,
    .bdrv_create = qemu_gluster_create,
    .bdrv_getlength = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate = qemu_gluster_truncate,
    .bdrv_co_readv = qemu_gluster_co_readv,
    .bdrv_co_writev = qemu_gluster_co_writev,
    .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init = qemu_gluster_has_zero_init,
#ifdef CONFIG_GLUSTERFS_DISCARD
    .bdrv_co_discard = qemu_gluster_co_discard,
#endif
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes,
#endif
    .bdrv_co_get_block_status = qemu_gluster_co_get_block_status,
    .create_opts = &qemu_gluster_create_opts,
};

static BlockDriver bdrv_gluster_unix = {
    .format_name = "gluster",
    .protocol_name = "gluster+unix",
    .instance_size = sizeof(BDRVGlusterState),
    .bdrv_needs_filename = true,
    .bdrv_file_open = qemu_gluster_open,
    .bdrv_reopen_prepare = qemu_gluster_reopen_prepare,
    .bdrv_reopen_commit = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort = qemu_gluster_reopen_abort,
    .bdrv_close = qemu_gluster_close,
    .bdrv_create = qemu_gluster_create,
    .bdrv_getlength = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate = qemu_gluster_truncate,
    .bdrv_co_readv = qemu_gluster_co_readv,
    .bdrv_co_writev = qemu_gluster_co_writev,
    .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init = qemu_gluster_has_zero_init,
#ifdef CONFIG_GLUSTERFS_DISCARD
    .bdrv_co_discard = qemu_gluster_co_discard,
#endif
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes,
#endif
    .bdrv_co_get_block_status = qemu_gluster_co_get_block_status,
    .create_opts = &qemu_gluster_create_opts,
};

static BlockDriver bdrv_gluster_rdma = {
    .format_name = "gluster",
    .protocol_name = "gluster+rdma",
    .instance_size = sizeof(BDRVGlusterState),
    .bdrv_needs_filename = true,
    .bdrv_file_open = qemu_gluster_open,
    .bdrv_reopen_prepare = qemu_gluster_reopen_prepare,
    .bdrv_reopen_commit = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort = qemu_gluster_reopen_abort,
    .bdrv_close = qemu_gluster_close,
    .bdrv_create = qemu_gluster_create,
    .bdrv_getlength = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate = qemu_gluster_truncate,
    .bdrv_co_readv = qemu_gluster_co_readv,
    .bdrv_co_writev = qemu_gluster_co_writev,
    .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk,
    .bdrv_has_zero_init = qemu_gluster_has_zero_init,
#ifdef CONFIG_GLUSTERFS_DISCARD
    .bdrv_co_discard = qemu_gluster_co_discard,
#endif
#ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes = qemu_gluster_co_pwrite_zeroes,
#endif
    .bdrv_co_get_block_status = qemu_gluster_co_get_block_status,
    .create_opts = &qemu_gluster_create_opts,
};

static void bdrv_gluster_init(void)
{
    bdrv_register(&bdrv_gluster_rdma);
    bdrv_register(&bdrv_gluster_unix);
    bdrv_register(&bdrv_gluster_tcp);
    bdrv_register(&bdrv_gluster);
}

block_init(bdrv_gluster_init);
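/*
 * Example invocations (illustrative, following the URI format documented
 * above):
 *
 *   qemu-img create gluster://1.2.3.4/testvol/a.img 10G
 *   qemu-system-x86_64 -drive file=gluster://1.2.3.4/testvol/a.img
 */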