/* Virtio ring implementation.
 *
 *  Copyright 2007 Rusty Russell IBM Corporation
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <linux/virtio.h>
#include <linux/virtio_ring.h>
#include <linux/virtio_config.h>
#include <linux/device.h>
#include <linux/slab.h>

/* virtio guest is communicating with a virtual "device" that actually runs on
 * a host processor.  Memory barriers are used to control SMP effects. */
#ifdef CONFIG_SMP
/* Where possible, use SMP barriers which are more lightweight than mandatory
 * barriers, because mandatory barriers control MMIO effects on accesses
 * through relaxed memory I/O windows (which virtio does not use). */
#define virtio_mb() smp_mb()
#define virtio_rmb() smp_rmb()
#define virtio_wmb() smp_wmb()
#else
/* We must force memory ordering even if guest is UP since host could be
 * running on another CPU, but SMP barriers are defined to barrier() in that
 * configuration. So fall back to mandatory barriers instead. */
#define virtio_mb() mb()
#define virtio_rmb() rmb()
#define virtio_wmb() wmb()
#endif

#ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */
#define BAD_RING(_vq, fmt, args...)				\
	do {							\
		dev_err(&(_vq)->vq.vdev->dev,			\
			"%s:"fmt, (_vq)->vq.name, ##args);	\
		BUG();						\
	} while (0)
/* Caller is supposed to guarantee no reentry. */
#define START_USE(_vq)						\
	do {							\
		if ((_vq)->in_use)				\
			panic("%s:in_use = %i\n",		\
			      (_vq)->vq.name, (_vq)->in_use);	\
		(_vq)->in_use = __LINE__;			\
	} while (0)
#define END_USE(_vq) \
	do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; } while(0)
#else
#define BAD_RING(_vq, fmt, args...)				\
	do {							\
		dev_err(&_vq->vq.vdev->dev,			\
			"%s:"fmt, (_vq)->vq.name, ##args);	\
		(_vq)->broken = true;				\
	} while (0)
#define START_USE(vq)
#define END_USE(vq)
#endif

struct vring_virtqueue
{
	struct virtqueue vq;

	/* Actual memory layout for this queue */
	struct vring vring;

	/* Other side has made a mess, don't try any more. */
	bool broken;

	/* Host supports indirect buffers */
	bool indirect;

	/* Host publishes avail event idx */
	bool event;

	/* Number of free buffers */
	unsigned int num_free;
	/* Head of free buffer list. */
	unsigned int free_head;
	/* Number we've added since last sync. */
	unsigned int num_added;

	/* Last used index we've seen. */
	u16 last_used_idx;

	/* How to notify other side. FIXME: commonalize hcalls! */
	void (*notify)(struct virtqueue *vq);

#ifdef DEBUG
	/* They're supposed to lock for us. */
	unsigned int in_use;
#endif

	/* Tokens for callbacks. */
	void *data[];
};
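
/*
 * Rough sketch (editor's note, not authoritative; see <linux/virtio_ring.h>
 * and the virtio spec for the real definition) of the layout that
 * vring_init() imposes on the pages handed to vring_new_virtqueue():
 *
 *	struct vring_desc desc[num];	// 16-byte descriptors, chained via .next
 *	struct vring_avail avail;	// flags, idx, ring[num]; when
 *					// VIRTIO_RING_F_EVENT_IDX is negotiated,
 *					// a trailing __u16 holds used_event
 *	...padding up to vring_align...
 *	struct vring_used used;		// flags, idx, ring[num] of {id, len};
 *					// with EVENT_IDX a trailing __u16 holds
 *					// avail_event
 */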

#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)

/* Set up an indirect table of descriptors and add it to the queue. */
static int vring_add_indirect(struct vring_virtqueue *vq,
			      struct scatterlist sg[],
			      unsigned int out,
			      unsigned int in,
			      gfp_t gfp)
{
	struct vring_desc *desc;
	unsigned head;
	int i;

	desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
	if (!desc)
		return -ENOMEM;

	/* Transfer entries from the sg list into the indirect page */
	for (i = 0; i < out; i++) {
		desc[i].flags = VRING_DESC_F_NEXT;
		desc[i].addr = sg_phys(sg);
		desc[i].len = sg->length;
		desc[i].next = i+1;
		sg++;
	}
	for (; i < (out + in); i++) {
		desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
		desc[i].addr = sg_phys(sg);
		desc[i].len = sg->length;
		desc[i].next = i+1;
		sg++;
	}

	/* Last one doesn't continue. */
	desc[i-1].flags &= ~VRING_DESC_F_NEXT;
	desc[i-1].next = 0;

	/* We're about to use a buffer */
	vq->num_free--;

	/* Use a single buffer which doesn't continue */
	head = vq->free_head;
	vq->vring.desc[head].flags = VRING_DESC_F_INDIRECT;
	vq->vring.desc[head].addr = virt_to_phys(desc);
	vq->vring.desc[head].len = i * sizeof(struct vring_desc);

	/* Update free pointer */
	vq->free_head = vq->vring.desc[head].next;

	return head;
}

int virtqueue_add_buf_gfp(struct virtqueue *_vq,
			  struct scatterlist sg[],
			  unsigned int out,
			  unsigned int in,
			  void *data,
			  gfp_t gfp)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	unsigned int i, avail, uninitialized_var(prev);
	int head;

	START_USE(vq);

	BUG_ON(data == NULL);

	/* If the host supports indirect descriptor tables, and we have multiple
	 * buffers, then go indirect. FIXME: tune this threshold */
	if (vq->indirect && (out + in) > 1 && vq->num_free) {
		head = vring_add_indirect(vq, sg, out, in, gfp);
		if (likely(head >= 0))
			goto add_head;
	}

	BUG_ON(out + in > vq->vring.num);
	BUG_ON(out + in == 0);

	if (vq->num_free < out + in) {
		pr_debug("Can't add buf len %i - avail = %i\n",
			 out + in, vq->num_free);
		/* FIXME: for historical reasons, we force a notify here if
		 * there are outgoing parts to the buffer.  Presumably the
		 * host should service the ring ASAP. */
		if (out)
			vq->notify(&vq->vq);
		END_USE(vq);
		return -ENOSPC;
	}

	/* We're about to use some buffers from the free list. */
	vq->num_free -= out + in;

	head = vq->free_head;
	for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) {
		vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
		vq->vring.desc[i].addr = sg_phys(sg);
		vq->vring.desc[i].len = sg->length;
		prev = i;
		sg++;
	}
	for (; in; i = vq->vring.desc[i].next, in--) {
		vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
		vq->vring.desc[i].addr = sg_phys(sg);
		vq->vring.desc[i].len = sg->length;
		prev = i;
		sg++;
	}
	/* Last one doesn't continue. */
	vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;

	/* Update free pointer */
	vq->free_head = i;

add_head:
	/* Set token. */
	vq->data[head] = data;

	/* Put entry in available array (but don't update avail->idx until they
	 * do sync).  FIXME: avoid modulus here? */
	avail = (vq->vring.avail->idx + vq->num_added++) % vq->vring.num;
	vq->vring.avail->ring[avail] = head;

	pr_debug("Added buffer head %i to %p\n", head, vq);
	END_USE(vq);

	return vq->num_free;
}
EXPORT_SYMBOL_GPL(virtqueue_add_buf_gfp);
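
/*
 * Illustrative sketch (editor's note, not part of this file) of how a driver
 * typically queues one request with an outgoing and an incoming buffer and
 * then kicks the host.  The names req, out_buf, in_buf and the lengths are
 * made up for the example:
 *
 *	struct scatterlist sg[2];
 *
 *	sg_init_table(sg, 2);
 *	sg_set_buf(&sg[0], out_buf, out_len);	// read by the host
 *	sg_set_buf(&sg[1], in_buf, in_len);	// written by the host
 *	if (virtqueue_add_buf_gfp(vq, sg, 1, 1, req, GFP_ATOMIC) < 0)
 *		return -ENOSPC;			// ring full, retry later
 *	virtqueue_kick(vq);			// may notify the host
 */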

void virtqueue_kick(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	u16 new, old;
	START_USE(vq);
	/* Descriptors and available array need to be set before we expose the
	 * new available array entries. */
	virtio_wmb();

	old = vq->vring.avail->idx;
	new = vq->vring.avail->idx = old + vq->num_added;
	vq->num_added = 0;

	/* Need to update avail index before checking if we should notify */
	virtio_mb();

	if (vq->event ?
	    vring_need_event(vring_avail_event(&vq->vring), new, old) :
	    !(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY))
		/* Prod other side to tell it about changes. */
		vq->notify(&vq->vq);

	END_USE(vq);
}
EXPORT_SYMBOL_GPL(virtqueue_kick);

static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
{
	unsigned int i;

	/* Clear data ptr. */
	vq->data[head] = NULL;

	/* Put back on free list: find end */
	i = head;

	/* Free the indirect table */
	if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT)
		kfree(phys_to_virt(vq->vring.desc[i].addr));

	while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
		i = vq->vring.desc[i].next;
		vq->num_free++;
	}

	vq->vring.desc[i].next = vq->free_head;
	vq->free_head = head;
	/* Plus final descriptor */
	vq->num_free++;
}

static inline bool more_used(const struct vring_virtqueue *vq)
{
	return vq->last_used_idx != vq->vring.used->idx;
}

void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	void *ret;
	unsigned int i;

	START_USE(vq);

	if (unlikely(vq->broken)) {
		END_USE(vq);
		return NULL;
	}

	if (!more_used(vq)) {
		pr_debug("No more buffers in queue\n");
		END_USE(vq);
		return NULL;
	}

	/* Only get used array entries after they have been exposed by host. */
	virtio_rmb();

	i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id;
	*len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len;

	if (unlikely(i >= vq->vring.num)) {
		BAD_RING(vq, "id %u out of range\n", i);
		return NULL;
	}
	if (unlikely(!vq->data[i])) {
		BAD_RING(vq, "id %u is not a head!\n", i);
		return NULL;
	}

	/* detach_buf clears data, so grab it now. */
	ret = vq->data[i];
	detach_buf(vq, i);
	vq->last_used_idx++;
	/* If we expect an interrupt for the next entry, tell host
	 * by writing event index and flush out the write before
	 * the read in the next get_buf call. */
	if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
		vring_used_event(&vq->vring) = vq->last_used_idx;
		virtio_mb();
	}

	END_USE(vq);
	return ret;
}
EXPORT_SYMBOL_GPL(virtqueue_get_buf);
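
/*
 * Illustrative sketch (editor's note, not part of this file): a driver's
 * virtqueue callback usually drains every completed buffer before returning,
 * since one interrupt may cover several used entries.  my_vq_callback() and
 * complete_request() are made-up names for the example:
 *
 *	static void my_vq_callback(struct virtqueue *vq)
 *	{
 *		unsigned int len;
 *		void *req;
 *
 *		while ((req = virtqueue_get_buf(vq, &len)) != NULL)
 *			complete_request(req, len); // len = bytes host wrote
 *	}
 */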

void virtqueue_disable_cb(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);

	vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
}
EXPORT_SYMBOL_GPL(virtqueue_disable_cb);

bool virtqueue_enable_cb(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);

	START_USE(vq);

	/* We optimistically turn back on interrupts, then check if there was
	 * more to do. */
	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
	 * either clear the flags bit or point the event index at the next
	 * entry. Always do both to keep code simple. */
	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
	vring_used_event(&vq->vring) = vq->last_used_idx;
	virtio_mb();
	if (unlikely(more_used(vq))) {
		END_USE(vq);
		return false;
	}

	END_USE(vq);
	return true;
}
EXPORT_SYMBOL_GPL(virtqueue_enable_cb);

bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	u16 bufs;

	START_USE(vq);

	/* We optimistically turn back on interrupts, then check if there was
	 * more to do. */
	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
	 * either clear the flags bit or point the event index at the next
	 * entry. Always do both to keep code simple. */
	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
	/* TODO: tune this threshold */
	bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4;
	vring_used_event(&vq->vring) = vq->last_used_idx + bufs;
	virtio_mb();
	if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) {
		END_USE(vq);
		return false;
	}

	END_USE(vq);
	return true;
}
EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);

void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	unsigned int i;
	void *buf;

	START_USE(vq);

	for (i = 0; i < vq->vring.num; i++) {
		if (!vq->data[i])
			continue;
		/* detach_buf clears data, so grab it now. */
		buf = vq->data[i];
		detach_buf(vq, i);
		vq->vring.avail->idx--;
		END_USE(vq);
		return buf;
	}
	/* That should have freed everything. */
	BUG_ON(vq->num_free != vq->vring.num);

	END_USE(vq);
	return NULL;
}
EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf);

irqreturn_t vring_interrupt(int irq, void *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);

	if (!more_used(vq)) {
		pr_debug("virtqueue interrupt with no work for %p\n", vq);
		return IRQ_NONE;
	}

	if (unlikely(vq->broken))
		return IRQ_HANDLED;

	pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
	if (vq->vq.callback)
		vq->vq.callback(&vq->vq);

	return IRQ_HANDLED;
}
EXPORT_SYMBOL_GPL(vring_interrupt);
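
/*
 * Illustrative sketch (editor's note, not part of this file) of the usual
 * interrupt mitigation pattern built from virtqueue_disable_cb() and
 * virtqueue_enable_cb(): the callback suppresses further callbacks, the
 * driver polls from another context, and callbacks are re-enabled only once
 * the queue looks empty, re-checking for buffers that raced with the enable.
 * process_queue() is a made-up name for the example:
 *
 *	virtqueue_disable_cb(vq);		// in the vq callback
 *	...
 *	do {					// later, in polling context
 *		process_queue(vq);		// drains via virtqueue_get_buf()
 *	} while (!virtqueue_enable_cb(vq));	// false: more work appeared
 */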

struct virtqueue *vring_new_virtqueue(unsigned int num,
				      unsigned int vring_align,
				      struct virtio_device *vdev,
				      void *pages,
				      void (*notify)(struct virtqueue *),
				      void (*callback)(struct virtqueue *),
				      const char *name)
{
	struct vring_virtqueue *vq;
	unsigned int i;

	/* We assume num is a power of 2. */
	if (num & (num - 1)) {
		dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num);
		return NULL;
	}

	vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
	if (!vq)
		return NULL;

	vring_init(&vq->vring, num, pages, vring_align);
	vq->vq.callback = callback;
	vq->vq.vdev = vdev;
	vq->vq.name = name;
	vq->notify = notify;
	vq->broken = false;
	vq->last_used_idx = 0;
	vq->num_added = 0;
	list_add_tail(&vq->vq.list, &vdev->vqs);
#ifdef DEBUG
	vq->in_use = false;
#endif

	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);
	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);

	/* No callback?  Tell other side not to bother us. */
	if (!callback)
		vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;

	/* Put everything in free lists. */
	vq->num_free = num;
	vq->free_head = 0;
	for (i = 0; i < num-1; i++) {
		vq->vring.desc[i].next = i+1;
		vq->data[i] = NULL;
	}
	vq->data[i] = NULL;

	return &vq->vq;
}
EXPORT_SYMBOL_GPL(vring_new_virtqueue);

void vring_del_virtqueue(struct virtqueue *vq)
{
	list_del(&vq->list);
	kfree(to_vvq(vq));
}
EXPORT_SYMBOL_GPL(vring_del_virtqueue);

/* Manipulates transport-specific feature bits. */
void vring_transport_features(struct virtio_device *vdev)
{
	unsigned int i;

	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
		switch (i) {
		case VIRTIO_RING_F_INDIRECT_DESC:
			break;
		case VIRTIO_RING_F_EVENT_IDX:
			break;
		default:
			/* We don't understand this bit. */
			clear_bit(i, vdev->features);
		}
	}
}
EXPORT_SYMBOL_GPL(vring_transport_features);

MODULE_LICENSE("GPL");
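
/*
 * Illustrative sketch (editor's note, not part of this file) of how a
 * transport such as virtio_pci wires a ring up.  read_queue_size(),
 * tell_device_ring_address(), my_notify and MY_VRING_ALIGN are hypothetical
 * names standing in for transport-specific pieces:
 *
 *	num = read_queue_size(vdev, index);
 *	pages = alloc_pages_exact(vring_size(num, MY_VRING_ALIGN),
 *				  GFP_KERNEL | __GFP_ZERO);
 *	vq = vring_new_virtqueue(num, MY_VRING_ALIGN, vdev, pages,
 *				 my_notify, callback, name);
 *	tell_device_ring_address(vdev, index, virt_to_phys(pages));
 *	...
 *	vring_del_virtqueue(vq);	// on teardown, then free the pages
 */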