xref: /openbmc/qemu/block/gluster.c (revision b957a1b0)
/*
 * GlusterFS backend for QEMU
 *
 * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
 *
 * Pipe handling mechanism in AIO implementation is derived from
 * block/rbd.c. Hence,
 *
 * Copyright (C) 2010-2011 Christian Brunner <chb@muc.de>,
 *                         Josh Durgin <josh.durgin@dreamhost.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */
#include <glusterfs/api/glfs.h>
#include "block/block_int.h"
#include "qemu/sockets.h"
#include "qemu/uri.h"

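/*
 * Per-request state for an in-flight AIO request. 'size' is the requested
 * transfer length in bytes and 'ret' the value returned by gluster;
 * 'finished', when non-NULL, is used by qemu_gluster_aio_cancel() to wait
 * for the request to complete.
 */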
typedef struct GlusterAIOCB {
    BlockDriverAIOCB common;
    int64_t size;
    int ret;
    bool *finished;
    QEMUBH *bh;
} GlusterAIOCB;

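/*
 * Per-BlockDriverState state: the gluster connection ('glfs'), the open
 * image ('fd'), a pipe ('fds') used by gluster callback threads to notify
 * the QEMU thread about completed requests, and the number of requests
 * currently in flight ('qemu_aio_count'). 'event_reader_pos' and
 * 'event_acb' hold a partially read acb pointer in the pipe reader.
 */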
typedef struct BDRVGlusterState {
    struct glfs *glfs;
    int fds[2];
    struct glfs_fd *fd;
    int qemu_aio_count;
    int event_reader_pos;
    GlusterAIOCB *event_acb;
} BDRVGlusterState;

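/* Indices of the read and write ends of the notification pipe in fds[] */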
#define GLUSTER_FD_READ  0
#define GLUSTER_FD_WRITE 1

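/* Connection parameters parsed out of a gluster[+transport]:// URI */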
typedef struct GlusterConf {
    char *server;
    int port;
    char *volname;
    char *image;
    char *transport;
} GlusterConf;

static void qemu_gluster_gconf_free(GlusterConf *gconf)
{
    g_free(gconf->server);
    g_free(gconf->volname);
    g_free(gconf->image);
    g_free(gconf->transport);
    g_free(gconf);
}

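/*
 * Split the path component of the URI into volume name and image path.
 * For example, "/testvol/dir/a.img" yields volname="testvol" and
 * image="dir/a.img". Returns -EINVAL if either component is missing.
 */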
static int parse_volume_options(GlusterConf *gconf, char *path)
{
    char *p, *q;

    if (!path) {
        return -EINVAL;
    }

    /* volume */
    p = q = path + strspn(path, "/");
    p += strcspn(p, "/");
    if (*p == '\0') {
        return -EINVAL;
    }
    gconf->volname = g_strndup(q, p - q);

    /* image */
    p += strspn(p, "/");
    if (*p == '\0') {
        return -EINVAL;
    }
    gconf->image = g_strdup(p);
    return 0;
}

/*
 * file=gluster[+transport]://[server[:port]]/volname/image[?socket=...]
 *
 * 'gluster' is the protocol.
 *
 * 'transport' specifies the transport type used to connect to the gluster
 * management daemon (glusterd). Valid transport types are tcp, unix and
 * rdma. If a transport type isn't specified, tcp is assumed.
 *
 * 'server' specifies the server where the volume file specification for
 * the given volume resides. This can be a hostname, an IPv4 address or
 * an IPv6 address; an IPv6 address must be enclosed in square brackets [ ].
 * If the transport type is 'unix', then 'server' should not be specified;
 * instead, the 'socket' parameter must be set to the path of the unix
 * domain socket.
 *
 * 'port' is the port number on which glusterd is listening. It is optional;
 * if not specified, QEMU passes 0, which makes gluster use its default port.
 * If the transport type is unix, then 'port' should not be specified.
 *
 * 'volname' is the name of the gluster volume which contains the VM image.
 *
 * 'image' is the path to the actual VM image that resides on the gluster
 * volume.
 *
 * Examples:
 *
 * file=gluster://1.2.3.4/testvol/a.img
 * file=gluster+tcp://1.2.3.4/testvol/a.img
 * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img
 * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
 * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img
 * file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img
 * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
 * file=gluster+rdma://1.2.3.4:24007/testvol/a.img
 */
static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename)
{
    URI *uri;
    QueryParams *qp = NULL;
    bool is_unix = false;
    int ret = 0;

    uri = uri_parse(filename);
    if (!uri) {
        return -EINVAL;
    }

    /* transport */
    if (!strcmp(uri->scheme, "gluster")) {
        gconf->transport = g_strdup("tcp");
    } else if (!strcmp(uri->scheme, "gluster+tcp")) {
        gconf->transport = g_strdup("tcp");
    } else if (!strcmp(uri->scheme, "gluster+unix")) {
        gconf->transport = g_strdup("unix");
        is_unix = true;
    } else if (!strcmp(uri->scheme, "gluster+rdma")) {
        gconf->transport = g_strdup("rdma");
    } else {
        ret = -EINVAL;
        goto out;
    }

    ret = parse_volume_options(gconf, uri->path);
    if (ret < 0) {
        goto out;
    }

    qp = query_params_parse(uri->query);
    if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) {
        ret = -EINVAL;
        goto out;
    }

    if (is_unix) {
        if (uri->server || uri->port) {
            ret = -EINVAL;
            goto out;
        }
        if (strcmp(qp->p[0].name, "socket")) {
            ret = -EINVAL;
            goto out;
        }
        gconf->server = g_strdup(qp->p[0].value);
    } else {
        gconf->server = g_strdup(uri->server);
        gconf->port = uri->port;
    }

out:
    if (qp) {
        query_params_free(qp);
    }
    uri_free(uri);
    return ret;
}

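/*
 * Parse 'filename' into 'gconf', create a glfs instance for the volume,
 * point it at the volfile server and connect with glfs_init().
 * Returns the connected glfs object, or NULL with errno set on failure.
 */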
static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename)
{
    struct glfs *glfs = NULL;
    int ret;
    int old_errno;

    ret = qemu_gluster_parseuri(gconf, filename);
    if (ret < 0) {
        error_report("Usage: file=gluster[+transport]://[server[:port]]/"
            "volname/image[?socket=...]");
        errno = -ret;
        goto out;
    }

    glfs = glfs_new(gconf->volname);
    if (!glfs) {
        goto out;
    }

    ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->server,
            gconf->port);
    if (ret < 0) {
        goto out;
    }

    /*
     * TODO: Use GF_LOG_ERROR instead of the hard-coded value 4 here when
     * GlusterFS makes the GF_LOG_* macros available to libgfapi users.
     */
    ret = glfs_set_logging(glfs, "-", 4);
    if (ret < 0) {
        goto out;
    }

    ret = glfs_init(glfs);
    if (ret) {
        error_report("Gluster connection failed for server=%s port=%d "
             "volume=%s image=%s transport=%s", gconf->server, gconf->port,
             gconf->volname, gconf->image, gconf->transport);
        goto out;
    }
    return glfs;

out:
    if (glfs) {
        old_errno = errno;
        glfs_fini(glfs);
        errno = old_errno;
    }
    return NULL;
}

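/*
 * Complete a single request in QEMU thread context: a gluster return value
 * equal to the requested size (or 0) means success, a negative value is
 * passed through as the error, and a short transfer is reported as -EIO.
 */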
static void qemu_gluster_complete_aio(GlusterAIOCB *acb, BDRVGlusterState *s)
{
    int ret;
    bool *finished = acb->finished;
    BlockDriverCompletionFunc *cb = acb->common.cb;
    void *opaque = acb->common.opaque;

    if (!acb->ret || acb->ret == acb->size) {
        ret = 0; /* Success */
    } else if (acb->ret < 0) {
        ret = acb->ret; /* Read/Write failed */
    } else {
        ret = -EIO; /* Partial read/write - fail it */
    }

    s->qemu_aio_count--;
    qemu_aio_release(acb);
    cb(opaque, ret);
    if (finished) {
        *finished = true;
    }
}

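/*
 * fd handler for the read end of the notification pipe. Gluster callback
 * threads write GlusterAIOCB pointers into the pipe; reassemble them here
 * (a read may return only part of a pointer) and complete the requests.
 */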
static void qemu_gluster_aio_event_reader(void *opaque)
{
    BDRVGlusterState *s = opaque;
    ssize_t ret;

    do {
        char *p = (char *)&s->event_acb;

        ret = read(s->fds[GLUSTER_FD_READ], p + s->event_reader_pos,
                   sizeof(s->event_acb) - s->event_reader_pos);
        if (ret > 0) {
            s->event_reader_pos += ret;
            if (s->event_reader_pos == sizeof(s->event_acb)) {
                s->event_reader_pos = 0;
                qemu_gluster_complete_aio(s->event_acb, s);
            }
        }
    } while (ret < 0 && errno == EINTR);
}

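/* Tell the AIO layer whether this driver still has requests in flight */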
static int qemu_gluster_aio_flush_cb(void *opaque)
{
    BDRVGlusterState *s = opaque;

    return (s->qemu_aio_count > 0);
}

/* TODO Convert to fine grained options */
static QemuOptsList runtime_opts = {
    .name = "gluster",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = "filename",
            .type = QEMU_OPT_STRING,
            .help = "URL to the gluster image",
        },
        { /* end of list */ }
    },
};

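/*
 * Open the image: absorb the "filename" option, establish the gluster
 * connection via qemu_gluster_init(), open the image with flags derived
 * from bdrv_flags and set up the completion notification pipe.
 */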
static int qemu_gluster_open(BlockDriverState *bs, QDict *options,
                             int bdrv_flags)
{
    BDRVGlusterState *s = bs->opaque;
    int open_flags = O_BINARY;
    int ret = 0;
    GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *filename;

    opts = qemu_opts_create_nofail(&runtime_opts);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (error_is_set(&local_err)) {
        qerror_report_err(local_err);
        error_free(local_err);
        ret = -EINVAL;
        goto out;
    }

    filename = qemu_opt_get(opts, "filename");

    s->glfs = qemu_gluster_init(gconf, filename);
    if (!s->glfs) {
        ret = -errno;
        goto out;
    }

    if (bdrv_flags & BDRV_O_RDWR) {
        open_flags |= O_RDWR;
    } else {
        open_flags |= O_RDONLY;
    }

    if ((bdrv_flags & BDRV_O_NOCACHE)) {
        open_flags |= O_DIRECT;
    }

    s->fd = glfs_open(s->glfs, gconf->image, open_flags);
    if (!s->fd) {
        ret = -errno;
        goto out;
    }

    ret = qemu_pipe(s->fds);
    if (ret < 0) {
        ret = -errno;
        goto out;
    }
    fcntl(s->fds[GLUSTER_FD_READ], F_SETFL, O_NONBLOCK);
    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ],
        qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s);

out:
    qemu_opts_del(opts);
    qemu_gluster_gconf_free(gconf);
    if (!ret) {
        return ret;
    }
    if (s->fd) {
        glfs_close(s->fd);
    }
    if (s->glfs) {
        glfs_fini(s->glfs);
    }
    return ret;
}

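/*
 * bdrv_create implementation: connect to the volume named in 'filename',
 * create the image with glfs_creat() and truncate it to the requested
 * virtual size.
 */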
static int qemu_gluster_create(const char *filename,
        QEMUOptionParameter *options)
{
    struct glfs *glfs;
    struct glfs_fd *fd;
    int ret = 0;
    int64_t total_size = 0;
    GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));

    glfs = qemu_gluster_init(gconf, filename);
    if (!glfs) {
        ret = -errno;
        goto out;
    }

    while (options && options->name) {
        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
            total_size = options->value.n / BDRV_SECTOR_SIZE;
        }
        options++;
    }

    fd = glfs_creat(glfs, gconf->image,
        O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
    if (!fd) {
        ret = -errno;
    } else {
        if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
            ret = -errno;
        }
        if (glfs_close(fd) != 0) {
            ret = -errno;
        }
    }
out:
    qemu_gluster_gconf_free(gconf);
    if (glfs) {
        glfs_fini(glfs);
    }
    return ret;
}

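/*
 * Cancellation simply waits for the in-flight request to complete before
 * returning; the request itself is not aborted.
 */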
static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb)
{
    GlusterAIOCB *acb = (GlusterAIOCB *)blockacb;
    bool finished = false;

    acb->finished = &finished;
    while (!finished) {
        qemu_aio_wait();
    }
}

static const AIOCBInfo gluster_aiocb_info = {
    .aiocb_size = sizeof(GlusterAIOCB),
    .cancel = qemu_gluster_aio_cancel,
};

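/*
 * Completion callback invoked from a gluster thread. Record the return
 * value and hand the request over to the QEMU thread by writing the acb
 * pointer to the notification pipe. If that write fails, fail the request
 * here and make the disk inaccessible.
 */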
static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
{
    GlusterAIOCB *acb = (GlusterAIOCB *)arg;
    BlockDriverState *bs = acb->common.bs;
    BDRVGlusterState *s = bs->opaque;
    int retval;

    acb->ret = ret;
    retval = qemu_write_full(s->fds[GLUSTER_FD_WRITE], &acb, sizeof(acb));
    if (retval != sizeof(acb)) {
        /*
         * Gluster AIO callback thread failed to notify the waiting
         * QEMU thread about IO completion.
         *
         * Complete this IO request and make the disk inaccessible for
         * subsequent reads and writes.
         */
        error_report("Gluster failed to notify QEMU about IO completion");

        qemu_mutex_lock_iothread(); /* We are in gluster thread context */
        acb->common.cb(acb->common.opaque, -EIO);
        qemu_aio_release(acb);
        s->qemu_aio_count--;
        close(s->fds[GLUSTER_FD_READ]);
        close(s->fds[GLUSTER_FD_WRITE]);
        qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL,
            NULL);
        bs->drv = NULL; /* Make the disk inaccessible */
        qemu_mutex_unlock_iothread();
    }
}

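/*
 * Common helper for aio_readv/aio_writev: convert the sector-based request
 * into a byte offset and length and submit it with glfs_preadv_async() or
 * glfs_pwritev_async(); gluster_finish_aiocb() is called on completion.
 */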
static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque, int write)
{
    int ret;
    GlusterAIOCB *acb;
    BDRVGlusterState *s = bs->opaque;
    size_t size;
    off_t offset;

    offset = sector_num * BDRV_SECTOR_SIZE;
    size = nb_sectors * BDRV_SECTOR_SIZE;
    s->qemu_aio_count++;

    acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
    acb->size = size;
    acb->ret = 0;
    acb->finished = NULL;

    if (write) {
        ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
            &gluster_finish_aiocb, acb);
    } else {
        ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
            &gluster_finish_aiocb, acb);
    }

    if (ret < 0) {
        goto out;
    }
    return &acb->common;

out:
    s->qemu_aio_count--;
    qemu_aio_release(acb);
    return NULL;
}

static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}

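/* Issue an asynchronous flush of the image via glfs_fsync_async() */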
static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    int ret;
    GlusterAIOCB *acb;
    BDRVGlusterState *s = bs->opaque;

    acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
    acb->size = 0;
    acb->ret = 0;
    acb->finished = NULL;
    s->qemu_aio_count++;

    ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb);
    if (ret < 0) {
        goto out;
    }
    return &acb->common;

out:
    s->qemu_aio_count--;
    qemu_aio_release(acb);
    return NULL;
}

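/* Report the image size by seeking to the end of the file */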
static int64_t qemu_gluster_getlength(BlockDriverState *bs)
{
    BDRVGlusterState *s = bs->opaque;
    int64_t ret;

    ret = glfs_lseek(s->fd, 0, SEEK_END);
    if (ret < 0) {
        return -errno;
    } else {
        return ret;
    }
}

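/* Report the allocated size; st_blocks counts 512-byte units */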
static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs)
{
    BDRVGlusterState *s = bs->opaque;
    struct stat st;
    int ret;

    ret = glfs_fstat(s->fd, &st);
    if (ret < 0) {
        return -errno;
    } else {
        return st.st_blocks * 512;
    }
}

static void qemu_gluster_close(BlockDriverState *bs)
{
    BDRVGlusterState *s = bs->opaque;

    close(s->fds[GLUSTER_FD_READ]);
    close(s->fds[GLUSTER_FD_WRITE]);
    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL, NULL);

    if (s->fd) {
        glfs_close(s->fd);
        s->fd = NULL;
    }
    glfs_fini(s->glfs);
}

static int qemu_gluster_has_zero_init(BlockDriverState *bs)
{
    /* GlusterFS volume could be backed by a block device */
    return 0;
}

static QEMUOptionParameter qemu_gluster_create_options[] = {
    {
        .name = BLOCK_OPT_SIZE,
        .type = OPT_SIZE,
        .help = "Virtual disk size"
    },
    { NULL }
};

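/*
 * One BlockDriver per supported URI scheme; the four definitions below
 * differ only in .protocol_name.
 */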
static BlockDriver bdrv_gluster = {
    .format_name                  = "gluster",
    .protocol_name                = "gluster",
    .instance_size                = sizeof(BDRVGlusterState),
    .bdrv_file_open               = qemu_gluster_open,
    .bdrv_close                   = qemu_gluster_close,
    .bdrv_create                  = qemu_gluster_create,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_aio_readv               = qemu_gluster_aio_readv,
    .bdrv_aio_writev              = qemu_gluster_aio_writev,
    .bdrv_aio_flush               = qemu_gluster_aio_flush,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
    .create_options               = qemu_gluster_create_options,
};

static BlockDriver bdrv_gluster_tcp = {
    .format_name                  = "gluster",
    .protocol_name                = "gluster+tcp",
    .instance_size                = sizeof(BDRVGlusterState),
    .bdrv_file_open               = qemu_gluster_open,
    .bdrv_close                   = qemu_gluster_close,
    .bdrv_create                  = qemu_gluster_create,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_aio_readv               = qemu_gluster_aio_readv,
    .bdrv_aio_writev              = qemu_gluster_aio_writev,
    .bdrv_aio_flush               = qemu_gluster_aio_flush,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
    .create_options               = qemu_gluster_create_options,
};

static BlockDriver bdrv_gluster_unix = {
    .format_name                  = "gluster",
    .protocol_name                = "gluster+unix",
    .instance_size                = sizeof(BDRVGlusterState),
    .bdrv_file_open               = qemu_gluster_open,
    .bdrv_close                   = qemu_gluster_close,
    .bdrv_create                  = qemu_gluster_create,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_aio_readv               = qemu_gluster_aio_readv,
    .bdrv_aio_writev              = qemu_gluster_aio_writev,
    .bdrv_aio_flush               = qemu_gluster_aio_flush,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
    .create_options               = qemu_gluster_create_options,
};

static BlockDriver bdrv_gluster_rdma = {
    .format_name                  = "gluster",
    .protocol_name                = "gluster+rdma",
    .instance_size                = sizeof(BDRVGlusterState),
    .bdrv_file_open               = qemu_gluster_open,
    .bdrv_close                   = qemu_gluster_close,
    .bdrv_create                  = qemu_gluster_create,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_aio_readv               = qemu_gluster_aio_readv,
    .bdrv_aio_writev              = qemu_gluster_aio_writev,
    .bdrv_aio_flush               = qemu_gluster_aio_flush,
    .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
    .create_options               = qemu_gluster_create_options,
};

static void bdrv_gluster_init(void)
{
    bdrv_register(&bdrv_gluster_rdma);
    bdrv_register(&bdrv_gluster_unix);
    bdrv_register(&bdrv_gluster_tcp);
    bdrv_register(&bdrv_gluster);
}

block_init(bdrv_gluster_init);