/*
 * vhost-user-blk sample application
 *
 * Copyright (c) 2017 Intel Corporation. All rights reserved.
 *
 * Author:
 *  Changpeng Liu <changpeng.liu@intel.com>
 *
 * This work is based on the "vhost-user-scsi" sample and "virtio-blk" driver
 * implementation by:
 *  Felipe Franciosi <felipe@nutanix.com>
 *  Anthony Liguori <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 only.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "standard-headers/linux/virtio_blk.h"
#include "contrib/libvhost-user/libvhost-user-glib.h"
#include "contrib/libvhost-user/libvhost-user.h"

#if defined(__linux__)
#include <linux/fs.h>
#include <sys/ioctl.h>
#endif

struct virtio_blk_inhdr {
    unsigned char status;
};

/* vhost user block device */
typedef struct VubDev {
    VugDev parent;
    int blk_fd;
    struct virtio_blk_config blkcfg;
    bool enable_ro;
    char *blk_name;
    GMainLoop *loop;
} VubDev;

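/* State of a single in-flight virtio-blk request */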
typedef struct VubReq {
    VuVirtqElement *elem;
    int64_t sector_num;
    size_t size;
    struct virtio_blk_inhdr *in;
    struct virtio_blk_outhdr *out;
    VubDev *vdev_blk;
    struct VuVirtq *vq;
} VubReq;

/* refer to util/iov.c */
static size_t vub_iov_size(const struct iovec *iov,
                           const unsigned int iov_cnt)
{
    size_t len;
    unsigned int i;

    len = 0;
    for (i = 0; i < iov_cnt; i++) {
        len += iov[i].iov_len;
    }
    return len;
}

static size_t vub_iov_to_buf(const struct iovec *iov,
                             const unsigned int iov_cnt, void *buf)
{
    size_t len;
    unsigned int i;

    len = 0;
    for (i = 0; i < iov_cnt; i++) {
        memcpy(buf + len, iov[i].iov_base, iov[i].iov_len);
        len += iov[i].iov_len;
    }
    return len;
}

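/*
 * Panic callback registered with libvhost-user (also used directly for
 * invalid queue indexes): log the message and quit the main loop.
 */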
static void vub_panic_cb(VuDev *vu_dev, const char *buf)
{
    VugDev *gdev;
    VubDev *vdev_blk;

    assert(vu_dev);

    gdev = container_of(vu_dev, VugDev, parent);
    vdev_blk = container_of(gdev, VubDev, parent);
    if (buf) {
        g_warning("vu_panic: %s", buf);
    }

    g_main_loop_quit(vdev_blk->loop);
}

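/*
 * Complete a request: push the used element back onto the virtqueue
 * (payload size plus the one-byte status) and notify the guest.
 */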
static void vub_req_complete(VubReq *req)
{
    VugDev *gdev = &req->vdev_blk->parent;
    VuDev *vu_dev = &gdev->parent;

    /* IO size with 1 extra status byte */
    vu_queue_push(vu_dev, req->vq, req->elem,
                  req->size + 1);
    vu_queue_notify(vu_dev, req->vq);

    if (req->elem) {
        free(req->elem);
    }

    g_free(req);
}

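/*
 * Open the backing file or block device; when the write cache is disabled
 * O_DIRECT is used so writes bypass the host page cache.
 */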
static int vub_open(const char *file_name, bool wce)
{
    int fd;
    int flags = O_RDWR;

    if (!wce) {
        flags |= O_DIRECT;
    }

    fd = open(file_name, flags);
    if (fd < 0) {
        fprintf(stderr, "Cannot open file %s, %s\n", file_name,
                strerror(errno));
        return -1;
    }

    return fd;
}

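/* Read from the backing file into the request's scatter-gather list. */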
static ssize_t
vub_readv(VubReq *req, struct iovec *iov, uint32_t iovcnt)
{
    VubDev *vdev_blk = req->vdev_blk;
    ssize_t rc;

    if (!iovcnt) {
        fprintf(stderr, "Invalid Read IOV count\n");
        return -1;
    }

    req->size = vub_iov_size(iov, iovcnt);
    rc = preadv(vdev_blk->blk_fd, iov, iovcnt, req->sector_num * 512);
    if (rc < 0) {
        fprintf(stderr, "%s, Sector %"PRId64", Size %zu failed with %s\n",
                vdev_blk->blk_name, req->sector_num, req->size,
                strerror(errno));
        return -1;
    }

    return rc;
}

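/* Write the request's scatter-gather list out to the backing file. */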
static ssize_t
vub_writev(VubReq *req, struct iovec *iov, uint32_t iovcnt)
{
    VubDev *vdev_blk = req->vdev_blk;
    ssize_t rc;

    if (!iovcnt) {
        fprintf(stderr, "Invalid Write IOV count\n");
        return -1;
    }

    req->size = vub_iov_size(iov, iovcnt);
    rc = pwritev(vdev_blk->blk_fd, iov, iovcnt, req->sector_num * 512);
    if (rc < 0) {
        fprintf(stderr, "%s, Sector %"PRId64", Size %zu failed with %s\n",
                vdev_blk->blk_name, req->sector_num, req->size,
                strerror(errno));
        return -1;
    }

    return rc;
}

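/*
 * Handle DISCARD and WRITE ZEROES requests.  The payload is a single
 * virtio_blk_discard_write_zeroes descriptor; on Linux it is translated
 * into a BLKDISCARD or BLKZEROOUT ioctl on the backing device.
 */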
static int
vub_discard_write_zeroes(VubReq *req, struct iovec *iov, uint32_t iovcnt,
                         uint32_t type)
{
    struct virtio_blk_discard_write_zeroes *desc;
    ssize_t size;
    void *buf;

    size = vub_iov_size(iov, iovcnt);
    if (size != sizeof(*desc)) {
        fprintf(stderr, "Invalid size %zd, expect %zu\n", size, sizeof(*desc));
        return -1;
    }
    buf = g_new0(char, size);
    vub_iov_to_buf(iov, iovcnt, buf);

#if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
    VubDev *vdev_blk = req->vdev_blk;
    desc = (struct virtio_blk_discard_write_zeroes *)buf;
    uint64_t range[2] = { le64toh(desc->sector) << 9,
                          le32toh(desc->num_sectors) << 9 };
    if (type == VIRTIO_BLK_T_DISCARD) {
        if (ioctl(vdev_blk->blk_fd, BLKDISCARD, range) == 0) {
            g_free(buf);
            return 0;
        }
    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
        if (ioctl(vdev_blk->blk_fd, BLKZEROOUT, range) == 0) {
            g_free(buf);
            return 0;
        }
    }
#endif

    g_free(buf);
    return -1;
}

static void
vub_flush(VubReq *req)
{
    VubDev *vdev_blk = req->vdev_blk;

    fdatasync(vdev_blk->blk_fd);
}

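/*
 * Pop one element from the virtqueue, parse the request and status headers
 * and dispatch on the request type.  Returns 0 when a request was processed,
 * -1 when the queue is empty or the request is malformed.
 */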
static int vub_virtio_process_req(VubDev *vdev_blk,
                                  VuVirtq *vq)
{
    VugDev *gdev = &vdev_blk->parent;
    VuDev *vu_dev = &gdev->parent;
    VuVirtqElement *elem;
    uint32_t type;
    unsigned in_num;
    unsigned out_num;
    VubReq *req;

    elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) + sizeof(VubReq));
    if (!elem) {
        return -1;
    }

    /* refer to hw/block/virtio-blk.c */
    if (elem->out_num < 1 || elem->in_num < 1) {
        fprintf(stderr, "virtio-blk request missing headers\n");
        free(elem);
        return -1;
    }

    req = g_new0(VubReq, 1);
    req->vdev_blk = vdev_blk;
    req->vq = vq;
    req->elem = elem;

    in_num = elem->in_num;
    out_num = elem->out_num;

    /*
     * VIRTIO_F_ANY_LAYOUT is not supported (virtio 1.0 only), so the request
     * header is the first out descriptor and the status byte is the last in
     * descriptor.
     */
    if (elem->out_sg[0].iov_len < sizeof(struct virtio_blk_outhdr)) {
        fprintf(stderr, "Invalid outhdr size\n");
        goto err;
    }
    req->out = (struct virtio_blk_outhdr *)elem->out_sg[0].iov_base;
    out_num--;

    if (elem->in_sg[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
        fprintf(stderr, "Invalid inhdr size\n");
        goto err;
    }
    req->in = (struct virtio_blk_inhdr *)elem->in_sg[in_num - 1].iov_base;
    in_num--;

    type = le32toh(req->out->type);
    switch (type & ~VIRTIO_BLK_T_BARRIER) {
    case VIRTIO_BLK_T_IN:
    case VIRTIO_BLK_T_OUT: {
        ssize_t ret = 0;
        bool is_write = type & VIRTIO_BLK_T_OUT;
        req->sector_num = le64toh(req->out->sector);
        if (is_write) {
            ret = vub_writev(req, &elem->out_sg[1], out_num);
        } else {
            ret = vub_readv(req, &elem->in_sg[0], in_num);
        }
        if (ret >= 0) {
            req->in->status = VIRTIO_BLK_S_OK;
        } else {
            req->in->status = VIRTIO_BLK_S_IOERR;
        }
        vub_req_complete(req);
        break;
    }
    case VIRTIO_BLK_T_FLUSH:
        vub_flush(req);
        req->in->status = VIRTIO_BLK_S_OK;
        vub_req_complete(req);
        break;
    case VIRTIO_BLK_T_GET_ID: {
        size_t size = MIN(vub_iov_size(&elem->in_sg[0], in_num),
                          VIRTIO_BLK_ID_BYTES);
        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
        req->in->status = VIRTIO_BLK_S_OK;
        req->size = elem->in_sg[0].iov_len;
        vub_req_complete(req);
        break;
    }
    case VIRTIO_BLK_T_DISCARD:
    case VIRTIO_BLK_T_WRITE_ZEROES: {
        int rc;
        rc = vub_discard_write_zeroes(req, &elem->out_sg[1], out_num, type);
        if (rc == 0) {
            req->in->status = VIRTIO_BLK_S_OK;
        } else {
            req->in->status = VIRTIO_BLK_S_IOERR;
        }
        vub_req_complete(req);
        break;
    }
    default:
        req->in->status = VIRTIO_BLK_S_UNSUPP;
        vub_req_complete(req);
        break;
    }

    return 0;

err:
    free(elem);
    g_free(req);
    return -1;
}

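/* Queue handler: keep processing requests until the virtqueue is drained. */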
static void vub_process_vq(VuDev *vu_dev, int idx)
{
    VugDev *gdev;
    VubDev *vdev_blk;
    VuVirtq *vq;
    int ret;

    if ((idx < 0) || (idx >= VHOST_MAX_NR_VIRTQUEUE)) {
        fprintf(stderr, "VQ Index out of range: %d\n", idx);
        vub_panic_cb(vu_dev, NULL);
        return;
    }

    gdev = container_of(vu_dev, VugDev, parent);
    vdev_blk = container_of(gdev, VubDev, parent);
    assert(vdev_blk);

    vq = vu_get_queue(vu_dev, idx);
    assert(vq);

    while (1) {
        ret = vub_virtio_process_req(vdev_blk, vq);
        if (ret) {
            break;
        }
    }
}

static void vub_queue_set_started(VuDev *vu_dev, int idx, bool started)
{
    VuVirtq *vq;

    assert(vu_dev);

    vq = vu_get_queue(vu_dev, idx);
    vu_set_queue_handler(vu_dev, vq, started ? vub_process_vq : NULL);
}

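/* Advertise the virtio-blk features this backend implements. */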
static uint64_t
vub_get_features(VuDev *dev)
{
    uint64_t features;
    VugDev *gdev;
    VubDev *vdev_blk;

    gdev = container_of(dev, VugDev, parent);
    vdev_blk = container_of(gdev, VubDev, parent);

    features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
               1ull << VIRTIO_BLK_F_SEG_MAX |
               1ull << VIRTIO_BLK_F_TOPOLOGY |
               1ull << VIRTIO_BLK_F_BLK_SIZE |
               1ull << VIRTIO_BLK_F_FLUSH |
               #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
               1ull << VIRTIO_BLK_F_DISCARD |
               1ull << VIRTIO_BLK_F_WRITE_ZEROES |
               #endif
               1ull << VIRTIO_BLK_F_CONFIG_WCE |
               1ull << VIRTIO_F_VERSION_1 |
               1ull << VHOST_USER_F_PROTOCOL_FEATURES;

    if (vdev_blk->enable_ro) {
        features |= 1ull << VIRTIO_BLK_F_RO;
    }

    return features;
}

static uint64_t
vub_get_protocol_features(VuDev *dev)
{
    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG |
           1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
}

static int
vub_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
{
    VugDev *gdev;
    VubDev *vdev_blk;

    /* refuse requests that would read past our config space */
    if (len > sizeof(struct virtio_blk_config)) {
        return -1;
    }

    gdev = container_of(vu_dev, VugDev, parent);
    vdev_blk = container_of(gdev, VubDev, parent);
    memcpy(config, &vdev_blk->blkcfg, len);

    return 0;
}

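/*
 * Only the writeback (wce) byte of the config space is writable; toggling it
 * reopens the backing file with or without O_DIRECT accordingly.
 */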
static int
vub_set_config(VuDev *vu_dev, const uint8_t *data,
               uint32_t offset, uint32_t size, uint32_t flags)
{
    VugDev *gdev;
    VubDev *vdev_blk;
    uint8_t wce;
    int fd;

    /* don't support live migration */
    if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
        return -1;
    }

    gdev = container_of(vu_dev, VugDev, parent);
    vdev_blk = container_of(gdev, VubDev, parent);

    if (offset != offsetof(struct virtio_blk_config, wce) ||
        size != 1) {
        return -1;
    }

    wce = *data;
    if (wce == vdev_blk->blkcfg.wce) {
        /* nothing to do, the configuration is unchanged */
        return 0;
    }

    vdev_blk->blkcfg.wce = wce;
    fprintf(stdout, "Write Cache Policy Changed\n");
    if (vdev_blk->blk_fd >= 0) {
        close(vdev_blk->blk_fd);
        vdev_blk->blk_fd = -1;
    }

    fd = vub_open(vdev_blk->blk_name, wce);
    if (fd < 0) {
        fprintf(stderr, "Failed to open block device %s\n",
                vdev_blk->blk_name);
        vdev_blk->blk_fd = -1;
        return -1;
    }
    vdev_blk->blk_fd = fd;

    return 0;
}

static const VuDevIface vub_iface = {
    .get_features = vub_get_features,
    .queue_set_started = vub_queue_set_started,
    .get_protocol_features = vub_get_protocol_features,
    .get_config = vub_get_config,
    .set_config = vub_set_config,
};

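/* Create a listening UNIX domain socket at the given path. */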
static int unix_sock_new(char *unix_fn)
{
    int sock;
    struct sockaddr_un un;
    size_t len;

    assert(unix_fn);

    sock = socket(AF_UNIX, SOCK_STREAM, 0);
    if (sock < 0) {
        perror("socket");
        return -1;
    }

    un.sun_family = AF_UNIX;
    (void)snprintf(un.sun_path, sizeof(un.sun_path), "%s", unix_fn);
    len = sizeof(un.sun_family) + strlen(un.sun_path);

    (void)unlink(unix_fn);
    if (bind(sock, (struct sockaddr *)&un, len) < 0) {
        perror("bind");
        goto fail;
    }

    if (listen(sock, 1) < 0) {
        perror("listen");
        goto fail;
    }

    return sock;

fail:
    (void)close(sock);

    return -1;
}

static void vub_free(struct VubDev *vdev_blk)
{
    if (!vdev_blk) {
        return;
    }

    g_main_loop_unref(vdev_blk->loop);
    if (vdev_blk->blk_fd >= 0) {
        close(vdev_blk->blk_fd);
    }
    g_free(vdev_blk);
}

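/* Query the logical block size of the backing device, defaulting to 512. */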
static uint32_t
vub_get_blocksize(int fd)
{
    uint32_t blocksize = 512;

#if defined(__linux__) && defined(BLKSSZGET)
    if (ioctl(fd, BLKSSZGET, &blocksize) == 0) {
        return blocksize;
    }
#endif

    return blocksize;
}

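/* Fill virtio_blk_config with the backing file's geometry and our limits. */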
static void
vub_initialize_config(int fd, struct virtio_blk_config *config)
{
    off64_t capacity;

    capacity = lseek64(fd, 0, SEEK_END);
    config->capacity = capacity >> 9;
    config->blk_size = vub_get_blocksize(fd);
    config->size_max = 65536;
    config->seg_max = 128 - 2;
    config->min_io_size = 1;
    config->opt_io_size = 1;
    config->num_queues = 1;
#if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
    config->max_discard_sectors = 32768;
    config->max_discard_seg = 1;
    config->discard_sector_alignment = config->blk_size >> 9;
    config->max_write_zeroes_sectors = 32768;
    config->max_write_zeroes_seg = 1;
#endif
}

static VubDev *
vub_new(char *blk_file)
{
    VubDev *vdev_blk;

    vdev_blk = g_new0(VubDev, 1);
    vdev_blk->loop = g_main_loop_new(NULL, FALSE);
    vdev_blk->blk_fd = vub_open(blk_file, false);
    if (vdev_blk->blk_fd < 0) {
        fprintf(stderr, "Failed to open block device %s\n", blk_file);
        vub_free(vdev_blk);
        return NULL;
    }
    vdev_blk->enable_ro = false;
    vdev_blk->blkcfg.wce = 0;
    vdev_blk->blk_name = blk_file;

    /* fill virtio_blk_config with block parameters */
    vub_initialize_config(vdev_blk->blk_fd, &vdev_blk->blkcfg);

    return vdev_blk;
}

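/*
 * Example invocation (the paths are illustrative):
 *   vhost-user-blk -b /path/to/disk.img -s /tmp/vhost-user-blk.sock
 * QEMU can then be pointed at the same socket with a vhost-user-blk-pci
 * device and a matching chardev.
 */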
int main(int argc, char **argv)
{
    int opt;
    char *unix_socket = NULL;
    char *blk_file = NULL;
    bool enable_ro = false;
    int lsock = -1, csock = -1;
    VubDev *vdev_blk = NULL;

    while ((opt = getopt(argc, argv, "b:rs:h")) != -1) {
        switch (opt) {
        case 'b':
            blk_file = g_strdup(optarg);
            break;
        case 's':
            unix_socket = g_strdup(optarg);
            break;
        case 'r':
            enable_ro = true;
            break;
        case 'h':
        default:
            printf("Usage: %s -b <block device or file> "
                   "-s <UNIX domain socket> [-r (read-only)] [-h]\n", argv[0]);
            return 0;
        }
    }

    if (!unix_socket || !blk_file) {
        printf("Usage: %s -b <block device or file> "
               "-s <UNIX domain socket> [-r (read-only)] [-h]\n", argv[0]);
        return -1;
    }

    lsock = unix_sock_new(unix_socket);
    if (lsock < 0) {
        goto err;
    }

    csock = accept(lsock, (void *)0, (void *)0);
    if (csock < 0) {
        fprintf(stderr, "Accept error %s\n", strerror(errno));
        goto err;
    }

    vdev_blk = vub_new(blk_file);
    if (!vdev_blk) {
        goto err;
    }
    if (enable_ro) {
        vdev_blk->enable_ro = true;
    }

    vug_init(&vdev_blk->parent, csock, vub_panic_cb, &vub_iface);

    g_main_loop_run(vdev_blk->loop);

    vug_deinit(&vdev_blk->parent);

err:
    vub_free(vdev_blk);
    if (csock >= 0) {
        close(csock);
    }
    if (lsock >= 0) {
        close(lsock);
    }
    g_free(unix_socket);
    g_free(blk_file);

    return 0;
}