1 /*
2 * vhost-user-blk sample application
3 *
4 * Copyright (c) 2017 Intel Corporation. All rights reserved.
5 *
6 * Author:
7 * Changpeng Liu <changpeng.liu@intel.com>
8 *
9 * This work is based on the "vhost-user-scsi" sample and "virtio-blk" driver
10 * implementation by:
11 * Felipe Franciosi <felipe@nutanix.com>
12 * Anthony Liguori <aliguori@us.ibm.com>
13 *
14 * This work is licensed under the terms of the GNU GPL, version 2 only.
15 * See the COPYING file in the top-level directory.
16 */
17
18 #include "qemu/osdep.h"
19 #include "qemu/bswap.h"
20 #include "standard-headers/linux/virtio_blk.h"
21 #include "libvhost-user-glib.h"
22
23 #if defined(__linux__)
24 #include <linux/fs.h>
25 #include <sys/ioctl.h>
26 #endif
27
/* Upper bound on the number of virtqueues, as passed to vug_init(). */
enum { VHOST_USER_BLK_MAX_QUEUES = 8 };
31
/*
 * Device-written trailer of a request: one status byte, which this file
 * fills with one of the VIRTIO_BLK_S_* codes before completing a request.
 */
struct virtio_blk_inhdr {
    unsigned char status;   /* VIRTIO_BLK_S_OK / _IOERR / _UNSUPP */
};
35
36 /* vhost user block device */
37 typedef struct VubDev {
38 VugDev parent;
39 int blk_fd;
40 struct virtio_blk_config blkcfg;
41 bool enable_ro;
42 char *blk_name;
43 GMainLoop *loop;
44 } VubDev;
45
46 typedef struct VubReq {
47 VuVirtqElement *elem;
48 int64_t sector_num;
49 size_t size;
50 struct virtio_blk_inhdr *in;
51 struct virtio_blk_outhdr *out;
52 VubDev *vdev_blk;
53 struct VuVirtq *vq;
54 } VubReq;
55
/*
 * Sum the lengths of the first @iov_cnt entries of @iov.
 * Modelled on the helper of the same purpose in util/iov.c.
 */
static size_t vub_iov_size(const struct iovec *iov,
                           const unsigned int iov_cnt)
{
    size_t total = 0;
    unsigned int remaining = iov_cnt;

    /* Walk from the last entry down to the first; the sum is the same. */
    while (remaining > 0) {
        remaining--;
        total += iov[remaining].iov_len;
    }

    return total;
}
69
/*
 * Gather the first @iov_cnt segments of @iov into the contiguous buffer @buf.
 *
 * @buf must be able to hold vub_iov_size(iov, iov_cnt) bytes; no bound is
 * checked here.  Returns the number of bytes copied.
 */
static size_t vub_iov_to_buf(const struct iovec *iov,
                             const unsigned int iov_cnt, void *buf)
{
    /* Byte-typed cursor: pointer arithmetic on void * is a GNU extension. */
    unsigned char *dst = buf;
    size_t len = 0;
    unsigned int i;

    for (i = 0; i < iov_cnt; i++) {
        if (iov[i].iov_len == 0) {
            /* Nothing to copy; also avoids memcpy() from a NULL base. */
            continue;
        }
        memcpy(dst + len, iov[i].iov_base, iov[i].iov_len);
        len += iov[i].iov_len;
    }

    return len;
}
83
/*
 * Panic callback registered with vug_init(): log the message, if any, and
 * ask the device's main loop to quit.
 */
static void vub_panic_cb(VuDev *vu_dev, const char *buf)
{
    VugDev *glib_dev;
    VubDev *blk;

    assert(vu_dev);

    if (buf != NULL) {
        g_warning("vu_panic: %s", buf);
    }

    /* VuDev is embedded in VugDev, which is embedded in VubDev. */
    glib_dev = container_of(vu_dev, VugDev, parent);
    blk = container_of(glib_dev, VubDev, parent);

    g_main_loop_quit(blk->loop);
}
99
/*
 * Return a finished request to the guest and release it.
 *
 * The element is pushed with a length of req->size plus one byte for the
 * status, the queue is notified, and both the element and @req are freed;
 * @req must not be used afterwards.
 */
static void vub_req_complete(VubReq *req)
{
    VuDev *vu_dev = &req->vdev_blk->parent.parent;
    /* IO size with 1 extra status byte */
    const size_t used_len = req->size + 1;

    vu_queue_push(vu_dev, req->vq, req->elem, used_len);
    vu_queue_notify(vu_dev, req->vq);

    g_free(req->elem);
    g_free(req);
}
113
vub_open(const char * file_name,bool wce)114 static int vub_open(const char *file_name, bool wce)
115 {
116 int fd;
117 int flags = O_RDWR;
118
119 if (!wce) {
120 flags |= O_DIRECT;
121 }
122
123 fd = open(file_name, flags);
124 if (fd < 0) {
125 fprintf(stderr, "Cannot open file %s, %s\n", file_name,
126 strerror(errno));
127 return -1;
128 }
129
130 return fd;
131 }
132
133 static ssize_t
vub_readv(VubReq * req,struct iovec * iov,uint32_t iovcnt)134 vub_readv(VubReq *req, struct iovec *iov, uint32_t iovcnt)
135 {
136 VubDev *vdev_blk = req->vdev_blk;
137 ssize_t rc;
138
139 if (!iovcnt) {
140 fprintf(stderr, "Invalid Read IOV count\n");
141 return -1;
142 }
143
144 req->size = vub_iov_size(iov, iovcnt);
145 rc = preadv(vdev_blk->blk_fd, iov, iovcnt, req->sector_num * 512);
146 if (rc < 0) {
147 fprintf(stderr, "%s, Sector %"PRIu64", Size %zu failed with %s\n",
148 vdev_blk->blk_name, req->sector_num, req->size,
149 strerror(errno));
150 return -1;
151 }
152
153 return rc;
154 }
155
156 static ssize_t
vub_writev(VubReq * req,struct iovec * iov,uint32_t iovcnt)157 vub_writev(VubReq *req, struct iovec *iov, uint32_t iovcnt)
158 {
159 VubDev *vdev_blk = req->vdev_blk;
160 ssize_t rc;
161
162 if (!iovcnt) {
163 fprintf(stderr, "Invalid Write IOV count\n");
164 return -1;
165 }
166
167 req->size = vub_iov_size(iov, iovcnt);
168 rc = pwritev(vdev_blk->blk_fd, iov, iovcnt, req->sector_num * 512);
169 if (rc < 0) {
170 fprintf(stderr, "%s, Sector %"PRIu64", Size %zu failed with %s\n",
171 vdev_blk->blk_name, req->sector_num, req->size,
172 strerror(errno));
173 return -1;
174 }
175
176 return rc;
177 }
178
179 static int
vub_discard_write_zeroes(VubReq * req,struct iovec * iov,uint32_t iovcnt,uint32_t type)180 vub_discard_write_zeroes(VubReq *req, struct iovec *iov, uint32_t iovcnt,
181 uint32_t type)
182 {
183 struct virtio_blk_discard_write_zeroes *desc;
184 ssize_t size;
185 void *buf;
186
187 size = vub_iov_size(iov, iovcnt);
188 if (size != sizeof(*desc)) {
189 fprintf(stderr, "Invalid size %zd, expect %zd\n", size, sizeof(*desc));
190 return -1;
191 }
192 buf = g_new0(char, size);
193 vub_iov_to_buf(iov, iovcnt, buf);
194
195 #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
196 VubDev *vdev_blk = req->vdev_blk;
197 desc = buf;
198 uint64_t range[2] = { le64_to_cpu(desc->sector) << 9,
199 (uint64_t)le32_to_cpu(desc->num_sectors) << 9 };
200 if (type == VIRTIO_BLK_T_DISCARD) {
201 if (ioctl(vdev_blk->blk_fd, BLKDISCARD, range) == 0) {
202 g_free(buf);
203 return 0;
204 }
205 } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
206 if (ioctl(vdev_blk->blk_fd, BLKZEROOUT, range) == 0) {
207 g_free(buf);
208 return 0;
209 }
210 }
211 #endif
212
213 g_free(buf);
214 return -1;
215 }
216
217 static void
vub_flush(VubReq * req)218 vub_flush(VubReq *req)
219 {
220 VubDev *vdev_blk = req->vdev_blk;
221
222 fdatasync(vdev_blk->blk_fd);
223 }
224
vub_virtio_process_req(VubDev * vdev_blk,VuVirtq * vq)225 static int vub_virtio_process_req(VubDev *vdev_blk,
226 VuVirtq *vq)
227 {
228 VugDev *gdev = &vdev_blk->parent;
229 VuDev *vu_dev = &gdev->parent;
230 VuVirtqElement *elem;
231 uint32_t type;
232 unsigned in_num;
233 unsigned out_num;
234 VubReq *req;
235
236 elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) + sizeof(VubReq));
237 if (!elem) {
238 return -1;
239 }
240
241 /* refer to hw/block/virtio_blk.c */
242 if (elem->out_num < 1 || elem->in_num < 1) {
243 fprintf(stderr, "virtio-blk request missing headers\n");
244 g_free(elem);
245 return -1;
246 }
247
248 req = g_new0(VubReq, 1);
249 req->vdev_blk = vdev_blk;
250 req->vq = vq;
251 req->elem = elem;
252
253 in_num = elem->in_num;
254 out_num = elem->out_num;
255
256 /* don't support VIRTIO_F_ANY_LAYOUT and virtio 1.0 only */
257 if (elem->out_sg[0].iov_len < sizeof(struct virtio_blk_outhdr)) {
258 fprintf(stderr, "Invalid outhdr size\n");
259 goto err;
260 }
261 req->out = (struct virtio_blk_outhdr *)elem->out_sg[0].iov_base;
262 out_num--;
263
264 if (elem->in_sg[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
265 fprintf(stderr, "Invalid inhdr size\n");
266 goto err;
267 }
268 req->in = (struct virtio_blk_inhdr *)elem->in_sg[in_num - 1].iov_base;
269 in_num--;
270
271 type = le32_to_cpu(req->out->type);
272 switch (type & ~VIRTIO_BLK_T_BARRIER) {
273 case VIRTIO_BLK_T_IN:
274 case VIRTIO_BLK_T_OUT: {
275 ssize_t ret = 0;
276 bool is_write = type & VIRTIO_BLK_T_OUT;
277 req->sector_num = le64_to_cpu(req->out->sector);
278 if (is_write) {
279 ret = vub_writev(req, &elem->out_sg[1], out_num);
280 } else {
281 ret = vub_readv(req, &elem->in_sg[0], in_num);
282 }
283 if (ret >= 0) {
284 req->in->status = VIRTIO_BLK_S_OK;
285 } else {
286 req->in->status = VIRTIO_BLK_S_IOERR;
287 }
288 vub_req_complete(req);
289 break;
290 }
291 case VIRTIO_BLK_T_FLUSH:
292 vub_flush(req);
293 req->in->status = VIRTIO_BLK_S_OK;
294 vub_req_complete(req);
295 break;
296 case VIRTIO_BLK_T_GET_ID: {
297 size_t size = MIN(vub_iov_size(&elem->in_sg[0], in_num),
298 VIRTIO_BLK_ID_BYTES);
299 snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
300 req->in->status = VIRTIO_BLK_S_OK;
301 req->size = elem->in_sg[0].iov_len;
302 vub_req_complete(req);
303 break;
304 }
305 case VIRTIO_BLK_T_DISCARD:
306 case VIRTIO_BLK_T_WRITE_ZEROES: {
307 int rc;
308 rc = vub_discard_write_zeroes(req, &elem->out_sg[1], out_num, type);
309 if (rc == 0) {
310 req->in->status = VIRTIO_BLK_S_OK;
311 } else {
312 req->in->status = VIRTIO_BLK_S_IOERR;
313 }
314 vub_req_complete(req);
315 break;
316 }
317 default:
318 req->in->status = VIRTIO_BLK_S_UNSUPP;
319 vub_req_complete(req);
320 break;
321 }
322
323 return 0;
324
325 err:
326 g_free(elem);
327 g_free(req);
328 return -1;
329 }
330
/*
 * Queue handler: process requests from queue @idx until
 * vub_virtio_process_req() returns non-zero.
 */
static void vub_process_vq(VuDev *vu_dev, int idx)
{
    VugDev *glib_dev = container_of(vu_dev, VugDev, parent);
    VubDev *blk = container_of(glib_dev, VubDev, parent);
    VuVirtq *queue;

    assert(blk);

    queue = vu_get_queue(vu_dev, idx);
    assert(queue);

    while (vub_virtio_process_req(blk, queue) == 0) {
        /* one request handled; try the next */
    }
}
352
/*
 * queue_set_started callback: install vub_process_vq() as the handler of
 * queue @idx when it starts, and remove the handler when it stops.
 */
static void vub_queue_set_started(VuDev *vu_dev, int idx, bool started)
{
    VuVirtq *queue;

    assert(vu_dev);

    queue = vu_get_queue(vu_dev, idx);
    if (started) {
        vu_set_queue_handler(vu_dev, queue, vub_process_vq);
    } else {
        vu_set_queue_handler(vu_dev, queue, NULL);
    }
}
362
363 static uint64_t
vub_get_features(VuDev * dev)364 vub_get_features(VuDev *dev)
365 {
366 uint64_t features;
367 VugDev *gdev;
368 VubDev *vdev_blk;
369
370 gdev = container_of(dev, VugDev, parent);
371 vdev_blk = container_of(gdev, VubDev, parent);
372
373 features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
374 1ull << VIRTIO_BLK_F_SEG_MAX |
375 1ull << VIRTIO_BLK_F_TOPOLOGY |
376 1ull << VIRTIO_BLK_F_BLK_SIZE |
377 1ull << VIRTIO_BLK_F_FLUSH |
378 #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
379 1ull << VIRTIO_BLK_F_DISCARD |
380 1ull << VIRTIO_BLK_F_WRITE_ZEROES |
381 #endif
382 1ull << VIRTIO_BLK_F_CONFIG_WCE;
383
384 if (vdev_blk->enable_ro) {
385 features |= 1ull << VIRTIO_BLK_F_RO;
386 }
387
388 return features;
389 }
390
391 static uint64_t
vub_get_protocol_features(VuDev * dev)392 vub_get_protocol_features(VuDev *dev)
393 {
394 return 1ull << VHOST_USER_PROTOCOL_F_CONFIG |
395 1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
396 }
397
398 static int
vub_get_config(VuDev * vu_dev,uint8_t * config,uint32_t len)399 vub_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
400 {
401 VugDev *gdev;
402 VubDev *vdev_blk;
403
404 if (len > sizeof(struct virtio_blk_config)) {
405 return -1;
406 }
407
408 gdev = container_of(vu_dev, VugDev, parent);
409 vdev_blk = container_of(gdev, VubDev, parent);
410 memcpy(config, &vdev_blk->blkcfg, len);
411
412 return 0;
413 }
414
415 static int
vub_set_config(VuDev * vu_dev,const uint8_t * data,uint32_t offset,uint32_t size,uint32_t flags)416 vub_set_config(VuDev *vu_dev, const uint8_t *data,
417 uint32_t offset, uint32_t size, uint32_t flags)
418 {
419 VugDev *gdev;
420 VubDev *vdev_blk;
421 uint8_t wce;
422 int fd;
423
424 /* don't support live migration */
425 if (flags != VHOST_SET_CONFIG_TYPE_FRONTEND) {
426 return -1;
427 }
428
429 gdev = container_of(vu_dev, VugDev, parent);
430 vdev_blk = container_of(gdev, VubDev, parent);
431
432 if (offset != offsetof(struct virtio_blk_config, wce) ||
433 size != 1) {
434 return -1;
435 }
436
437 wce = *data;
438 if (wce == vdev_blk->blkcfg.wce) {
439 /* Do nothing as same with old configuration */
440 return 0;
441 }
442
443 vdev_blk->blkcfg.wce = wce;
444 fprintf(stdout, "Write Cache Policy Changed\n");
445 if (vdev_blk->blk_fd >= 0) {
446 close(vdev_blk->blk_fd);
447 vdev_blk->blk_fd = -1;
448 }
449
450 fd = vub_open(vdev_blk->blk_name, wce);
451 if (fd < 0) {
452 fprintf(stderr, "Error to open block device %s\n", vdev_blk->blk_name);
453 vdev_blk->blk_fd = -1;
454 return -1;
455 }
456 vdev_blk->blk_fd = fd;
457
458 return 0;
459 }
460
461 static const VuDevIface vub_iface = {
462 .get_features = vub_get_features,
463 .queue_set_started = vub_queue_set_started,
464 .get_protocol_features = vub_get_protocol_features,
465 .get_config = vub_get_config,
466 .set_config = vub_set_config,
467 };
468
/*
 * Create a listening UNIX stream socket bound to @unix_fn.
 *
 * An existing file at @unix_fn is unlinked first.  A path that does not
 * fit in sockaddr_un.sun_path is rejected instead of being silently
 * truncated, which would bind the socket to a different path.
 *
 * Returns the listening descriptor, or -1 on error.
 */
static int unix_sock_new(char *unix_fn)
{
    /* Zero the whole address so that no stack garbage reaches bind(). */
    struct sockaddr_un un = { .sun_family = AF_UNIX };
    int sock;

    assert(unix_fn);

    if (strlen(unix_fn) >= sizeof(un.sun_path)) {
        fprintf(stderr, "Socket path %s is too long\n", unix_fn);
        return -1;
    }
    (void)snprintf(un.sun_path, sizeof(un.sun_path), "%s", unix_fn);

    sock = socket(AF_UNIX, SOCK_STREAM, 0);
    if (sock < 0) {
        perror("socket");
        return -1;
    }

    (void)unlink(unix_fn);
    if (bind(sock, (struct sockaddr *)&un, sizeof(un)) < 0) {
        perror("bind");
        goto fail;
    }

    if (listen(sock, 1) < 0) {
        perror("listen");
        goto fail;
    }

    return sock;

fail:
    (void)close(sock);

    return -1;
}
503
vub_free(struct VubDev * vdev_blk)504 static void vub_free(struct VubDev *vdev_blk)
505 {
506 if (!vdev_blk) {
507 return;
508 }
509
510 g_main_loop_unref(vdev_blk->loop);
511 if (vdev_blk->blk_fd >= 0) {
512 close(vdev_blk->blk_fd);
513 }
514 g_free(vdev_blk);
515 }
516
/*
 * Return the sector size reported by the BLKSSZGET ioctl on @fd, or 512
 * when the ioctl is unavailable or fails.
 */
static uint32_t
vub_get_blocksize(int fd)
{
    const uint32_t fallback = 512;
    uint32_t bytes = fallback;

#if defined(__linux__) && defined(BLKSSZGET)
    if (ioctl(fd, BLKSSZGET, &bytes) != 0) {
        /* Keep the default when the ioctl fails. */
        bytes = fallback;
    }
#endif

    return bytes;
}
530
531 static void
vub_initialize_config(int fd,struct virtio_blk_config * config)532 vub_initialize_config(int fd, struct virtio_blk_config *config)
533 {
534 off_t capacity;
535
536 capacity = lseek(fd, 0, SEEK_END);
537 config->capacity = capacity >> 9;
538 config->blk_size = vub_get_blocksize(fd);
539 config->size_max = 65536;
540 config->seg_max = 128 - 2;
541 config->min_io_size = 1;
542 config->opt_io_size = 1;
543 config->num_queues = 1;
544 #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
545 config->max_discard_sectors = 32768;
546 config->max_discard_seg = 1;
547 config->discard_sector_alignment = config->blk_size >> 9;
548 config->max_write_zeroes_sectors = 32768;
549 config->max_write_zeroes_seg = 1;
550 #endif
551 }
552
553 static VubDev *
vub_new(char * blk_file)554 vub_new(char *blk_file)
555 {
556 VubDev *vdev_blk;
557
558 vdev_blk = g_new0(VubDev, 1);
559 vdev_blk->loop = g_main_loop_new(NULL, FALSE);
560 vdev_blk->blk_fd = vub_open(blk_file, 0);
561 if (vdev_blk->blk_fd < 0) {
562 fprintf(stderr, "Error to open block device %s\n", blk_file);
563 vub_free(vdev_blk);
564 return NULL;
565 }
566 vdev_blk->enable_ro = false;
567 vdev_blk->blkcfg.wce = 0;
568 vdev_blk->blk_name = blk_file;
569
570 /* fill virtio_blk_config with block parameters */
571 vub_initialize_config(vdev_blk->blk_fd, &vdev_blk->blkcfg);
572
573 return vdev_blk;
574 }
575
576 static int opt_fdnum = -1;
577 static char *opt_socket_path;
578 static char *opt_blk_file;
579 static gboolean opt_print_caps;
580 static gboolean opt_read_only;
581
582 static GOptionEntry entries[] = {
583 { "print-capabilities", 'c', 0, G_OPTION_ARG_NONE, &opt_print_caps,
584 "Print capabilities", NULL },
585 { "fd", 'f', 0, G_OPTION_ARG_INT, &opt_fdnum,
586 "Use inherited fd socket", "FDNUM" },
587 { "socket-path", 's', 0, G_OPTION_ARG_FILENAME, &opt_socket_path,
588 "Use UNIX socket path", "PATH" },
589 {"blk-file", 'b', 0, G_OPTION_ARG_FILENAME, &opt_blk_file,
590 "block device or file path", "PATH"},
591 { "read-only", 'r', 0, G_OPTION_ARG_NONE, &opt_read_only,
592 "Enable read-only", NULL },
593 { NULL, },
594 };
595
main(int argc,char ** argv)596 int main(int argc, char **argv)
597 {
598 int lsock = -1, csock = -1;
599 VubDev *vdev_blk = NULL;
600 GError *error = NULL;
601 GOptionContext *context;
602
603 context = g_option_context_new(NULL);
604 g_option_context_add_main_entries(context, entries, NULL);
605 if (!g_option_context_parse(context, &argc, &argv, &error)) {
606 g_printerr("Option parsing failed: %s\n", error->message);
607 exit(EXIT_FAILURE);
608 }
609 if (opt_print_caps) {
610 g_print("{\n");
611 g_print(" \"type\": \"block\",\n");
612 g_print(" \"features\": [\n");
613 g_print(" \"read-only\",\n");
614 g_print(" \"blk-file\"\n");
615 g_print(" ]\n");
616 g_print("}\n");
617 exit(EXIT_SUCCESS);
618 }
619
620 if (!opt_blk_file) {
621 g_print("%s\n", g_option_context_get_help(context, true, NULL));
622 exit(EXIT_FAILURE);
623 }
624
625 if (opt_socket_path) {
626 lsock = unix_sock_new(opt_socket_path);
627 if (lsock < 0) {
628 exit(EXIT_FAILURE);
629 }
630 } else if (opt_fdnum < 0) {
631 g_print("%s\n", g_option_context_get_help(context, true, NULL));
632 exit(EXIT_FAILURE);
633 } else {
634 lsock = opt_fdnum;
635 }
636
637 csock = accept(lsock, NULL, NULL);
638 if (csock < 0) {
639 g_printerr("Accept error %s\n", strerror(errno));
640 exit(EXIT_FAILURE);
641 }
642
643 vdev_blk = vub_new(opt_blk_file);
644 if (!vdev_blk) {
645 exit(EXIT_FAILURE);
646 }
647 if (opt_read_only) {
648 vdev_blk->enable_ro = true;
649 }
650
651 if (!vug_init(&vdev_blk->parent, VHOST_USER_BLK_MAX_QUEUES, csock,
652 vub_panic_cb, &vub_iface)) {
653 g_printerr("Failed to initialize libvhost-user-glib\n");
654 exit(EXIT_FAILURE);
655 }
656
657 g_main_loop_run(vdev_blk->loop);
658 g_main_loop_unref(vdev_blk->loop);
659 g_option_context_free(context);
660 vug_deinit(&vdev_blk->parent);
661 vub_free(vdev_blk);
662 if (csock >= 0) {
663 close(csock);
664 }
665 if (lsock >= 0) {
666 close(lsock);
667 }
668 g_free(opt_socket_path);
669 g_free(opt_blk_file);
670
671 return 0;
672 }
673