1 /* 2 * MSI-X device support 3 * 4 * This module includes support for MSI-X in pci devices. 5 * 6 * Author: Michael S. Tsirkin <mst@redhat.com> 7 * 8 * Copyright (c) 2009, Red Hat Inc, Michael S. Tsirkin (mst@redhat.com) 9 * 10 * This work is licensed under the terms of the GNU GPL, version 2. See 11 * the COPYING file in the top-level directory. 12 * 13 * Contributions after 2012-01-13 are licensed under the terms of the 14 * GNU GPL, version 2 or (at your option) any later version. 15 */ 16 17 #include "qemu/osdep.h" 18 #include "hw/hw.h" 19 #include "hw/pci/msi.h" 20 #include "hw/pci/msix.h" 21 #include "hw/pci/pci.h" 22 #include "hw/xen/xen.h" 23 #include "qemu/range.h" 24 25 #define MSIX_CAP_LENGTH 12 26 27 /* MSI enable bit and maskall bit are in byte 1 in FLAGS register */ 28 #define MSIX_CONTROL_OFFSET (PCI_MSIX_FLAGS + 1) 29 #define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8) 30 #define MSIX_MASKALL_MASK (PCI_MSIX_FLAGS_MASKALL >> 8) 31 32 MSIMessage msix_get_message(PCIDevice *dev, unsigned vector) 33 { 34 uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE; 35 MSIMessage msg; 36 37 msg.address = pci_get_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR); 38 msg.data = pci_get_long(table_entry + PCI_MSIX_ENTRY_DATA); 39 return msg; 40 } 41 42 /* 43 * Special API for POWER to configure the vectors through 44 * a side channel. Should never be used by devices. 45 */ 46 void msix_set_message(PCIDevice *dev, int vector, struct MSIMessage msg) 47 { 48 uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE; 49 50 pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR, msg.address); 51 pci_set_long(table_entry + PCI_MSIX_ENTRY_DATA, msg.data); 52 table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; 53 } 54 55 static uint8_t msix_pending_mask(int vector) 56 { 57 return 1 << (vector % 8); 58 } 59 60 static uint8_t *msix_pending_byte(PCIDevice *dev, int vector) 61 { 62 return dev->msix_pba + vector / 8; 63 } 64 65 static int msix_is_pending(PCIDevice *dev, int vector) 66 { 67 return *msix_pending_byte(dev, vector) & msix_pending_mask(vector); 68 } 69 70 void msix_set_pending(PCIDevice *dev, unsigned int vector) 71 { 72 *msix_pending_byte(dev, vector) |= msix_pending_mask(vector); 73 } 74 75 static void msix_clr_pending(PCIDevice *dev, int vector) 76 { 77 *msix_pending_byte(dev, vector) &= ~msix_pending_mask(vector); 78 } 79 80 static bool msix_vector_masked(PCIDevice *dev, unsigned int vector, bool fmask) 81 { 82 unsigned offset = vector * PCI_MSIX_ENTRY_SIZE; 83 uint8_t *data = &dev->msix_table[offset + PCI_MSIX_ENTRY_DATA]; 84 /* MSIs on Xen can be remapped into pirqs. In those cases, masking 85 * and unmasking go through the PV evtchn path. */ 86 if (xen_enabled() && xen_is_pirq_msi(pci_get_long(data))) { 87 return false; 88 } 89 return fmask || dev->msix_table[offset + PCI_MSIX_ENTRY_VECTOR_CTRL] & 90 PCI_MSIX_ENTRY_CTRL_MASKBIT; 91 } 92 93 bool msix_is_masked(PCIDevice *dev, unsigned int vector) 94 { 95 return msix_vector_masked(dev, vector, dev->msix_function_masked); 96 } 97 98 static void msix_fire_vector_notifier(PCIDevice *dev, 99 unsigned int vector, bool is_masked) 100 { 101 MSIMessage msg; 102 int ret; 103 104 if (!dev->msix_vector_use_notifier) { 105 return; 106 } 107 if (is_masked) { 108 dev->msix_vector_release_notifier(dev, vector); 109 } else { 110 msg = msix_get_message(dev, vector); 111 ret = dev->msix_vector_use_notifier(dev, vector, msg); 112 assert(ret >= 0); 113 } 114 } 115 116 static void msix_handle_mask_update(PCIDevice *dev, int vector, bool was_masked) 117 { 118 bool is_masked = msix_is_masked(dev, vector); 119 120 if (is_masked == was_masked) { 121 return; 122 } 123 124 msix_fire_vector_notifier(dev, vector, is_masked); 125 126 if (!is_masked && msix_is_pending(dev, vector)) { 127 msix_clr_pending(dev, vector); 128 msix_notify(dev, vector); 129 } 130 } 131 132 static void msix_update_function_masked(PCIDevice *dev) 133 { 134 dev->msix_function_masked = !msix_enabled(dev) || 135 (dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & MSIX_MASKALL_MASK); 136 } 137 138 /* Handle MSI-X capability config write. */ 139 void msix_write_config(PCIDevice *dev, uint32_t addr, 140 uint32_t val, int len) 141 { 142 unsigned enable_pos = dev->msix_cap + MSIX_CONTROL_OFFSET; 143 int vector; 144 bool was_masked; 145 146 if (!msix_present(dev) || !range_covers_byte(addr, len, enable_pos)) { 147 return; 148 } 149 150 was_masked = dev->msix_function_masked; 151 msix_update_function_masked(dev); 152 153 if (!msix_enabled(dev)) { 154 return; 155 } 156 157 pci_device_deassert_intx(dev); 158 159 if (dev->msix_function_masked == was_masked) { 160 return; 161 } 162 163 for (vector = 0; vector < dev->msix_entries_nr; ++vector) { 164 msix_handle_mask_update(dev, vector, 165 msix_vector_masked(dev, vector, was_masked)); 166 } 167 } 168 169 static uint64_t msix_table_mmio_read(void *opaque, hwaddr addr, 170 unsigned size) 171 { 172 PCIDevice *dev = opaque; 173 174 return pci_get_long(dev->msix_table + addr); 175 } 176 177 static void msix_table_mmio_write(void *opaque, hwaddr addr, 178 uint64_t val, unsigned size) 179 { 180 PCIDevice *dev = opaque; 181 int vector = addr / PCI_MSIX_ENTRY_SIZE; 182 bool was_masked; 183 184 was_masked = msix_is_masked(dev, vector); 185 pci_set_long(dev->msix_table + addr, val); 186 msix_handle_mask_update(dev, vector, was_masked); 187 } 188 189 static const MemoryRegionOps msix_table_mmio_ops = { 190 .read = msix_table_mmio_read, 191 .write = msix_table_mmio_write, 192 .endianness = DEVICE_LITTLE_ENDIAN, 193 .valid = { 194 .min_access_size = 4, 195 .max_access_size = 4, 196 }, 197 }; 198 199 static uint64_t msix_pba_mmio_read(void *opaque, hwaddr addr, 200 unsigned size) 201 { 202 PCIDevice *dev = opaque; 203 if (dev->msix_vector_poll_notifier) { 204 unsigned vector_start = addr * 8; 205 unsigned vector_end = MIN(addr + size * 8, dev->msix_entries_nr); 206 dev->msix_vector_poll_notifier(dev, vector_start, vector_end); 207 } 208 209 return pci_get_long(dev->msix_pba + addr); 210 } 211 212 static void msix_pba_mmio_write(void *opaque, hwaddr addr, 213 uint64_t val, unsigned size) 214 { 215 } 216 217 static const MemoryRegionOps msix_pba_mmio_ops = { 218 .read = msix_pba_mmio_read, 219 .write = msix_pba_mmio_write, 220 .endianness = DEVICE_LITTLE_ENDIAN, 221 .valid = { 222 .min_access_size = 4, 223 .max_access_size = 4, 224 }, 225 }; 226 227 static void msix_mask_all(struct PCIDevice *dev, unsigned nentries) 228 { 229 int vector; 230 231 for (vector = 0; vector < nentries; ++vector) { 232 unsigned offset = 233 vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL; 234 bool was_masked = msix_is_masked(dev, vector); 235 236 dev->msix_table[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT; 237 msix_handle_mask_update(dev, vector, was_masked); 238 } 239 } 240 241 /* Initialize the MSI-X structures */ 242 int msix_init(struct PCIDevice *dev, unsigned short nentries, 243 MemoryRegion *table_bar, uint8_t table_bar_nr, 244 unsigned table_offset, MemoryRegion *pba_bar, 245 uint8_t pba_bar_nr, unsigned pba_offset, uint8_t cap_pos) 246 { 247 int cap; 248 unsigned table_size, pba_size; 249 uint8_t *config; 250 251 /* Nothing to do if MSI is not supported by interrupt controller */ 252 if (!msi_nonbroken) { 253 return -ENOTSUP; 254 } 255 256 if (nentries < 1 || nentries > PCI_MSIX_FLAGS_QSIZE + 1) { 257 return -EINVAL; 258 } 259 260 table_size = nentries * PCI_MSIX_ENTRY_SIZE; 261 pba_size = QEMU_ALIGN_UP(nentries, 64) / 8; 262 263 /* Sanity test: table & pba don't overlap, fit within BARs, min aligned */ 264 if ((table_bar_nr == pba_bar_nr && 265 ranges_overlap(table_offset, table_size, pba_offset, pba_size)) || 266 table_offset + table_size > memory_region_size(table_bar) || 267 pba_offset + pba_size > memory_region_size(pba_bar) || 268 (table_offset | pba_offset) & PCI_MSIX_FLAGS_BIRMASK) { 269 return -EINVAL; 270 } 271 272 cap = pci_add_capability(dev, PCI_CAP_ID_MSIX, cap_pos, MSIX_CAP_LENGTH); 273 if (cap < 0) { 274 return cap; 275 } 276 277 dev->msix_cap = cap; 278 dev->cap_present |= QEMU_PCI_CAP_MSIX; 279 config = dev->config + cap; 280 281 pci_set_word(config + PCI_MSIX_FLAGS, nentries - 1); 282 dev->msix_entries_nr = nentries; 283 dev->msix_function_masked = true; 284 285 pci_set_long(config + PCI_MSIX_TABLE, table_offset | table_bar_nr); 286 pci_set_long(config + PCI_MSIX_PBA, pba_offset | pba_bar_nr); 287 288 /* Make flags bit writable. */ 289 dev->wmask[cap + MSIX_CONTROL_OFFSET] |= MSIX_ENABLE_MASK | 290 MSIX_MASKALL_MASK; 291 292 dev->msix_table = g_malloc0(table_size); 293 dev->msix_pba = g_malloc0(pba_size); 294 dev->msix_entry_used = g_malloc0(nentries * sizeof *dev->msix_entry_used); 295 296 msix_mask_all(dev, nentries); 297 298 memory_region_init_io(&dev->msix_table_mmio, OBJECT(dev), &msix_table_mmio_ops, dev, 299 "msix-table", table_size); 300 memory_region_add_subregion(table_bar, table_offset, &dev->msix_table_mmio); 301 memory_region_init_io(&dev->msix_pba_mmio, OBJECT(dev), &msix_pba_mmio_ops, dev, 302 "msix-pba", pba_size); 303 memory_region_add_subregion(pba_bar, pba_offset, &dev->msix_pba_mmio); 304 305 return 0; 306 } 307 308 int msix_init_exclusive_bar(PCIDevice *dev, unsigned short nentries, 309 uint8_t bar_nr) 310 { 311 int ret; 312 char *name; 313 uint32_t bar_size = 4096; 314 uint32_t bar_pba_offset = bar_size / 2; 315 uint32_t bar_pba_size = (nentries / 8 + 1) * 8; 316 317 /* 318 * Migration compatibility dictates that this remains a 4k 319 * BAR with the vector table in the lower half and PBA in 320 * the upper half for nentries which is lower or equal to 128. 321 * No need to care about using more than 65 entries for legacy 322 * machine types who has at most 64 queues. 323 */ 324 if (nentries * PCI_MSIX_ENTRY_SIZE > bar_pba_offset) { 325 bar_pba_offset = nentries * PCI_MSIX_ENTRY_SIZE; 326 } 327 328 if (bar_pba_offset + bar_pba_size > 4096) { 329 bar_size = bar_pba_offset + bar_pba_size; 330 } 331 332 bar_size = pow2ceil(bar_size); 333 334 name = g_strdup_printf("%s-msix", dev->name); 335 memory_region_init(&dev->msix_exclusive_bar, OBJECT(dev), name, bar_size); 336 g_free(name); 337 338 ret = msix_init(dev, nentries, &dev->msix_exclusive_bar, bar_nr, 339 0, &dev->msix_exclusive_bar, 340 bar_nr, bar_pba_offset, 341 0); 342 if (ret) { 343 return ret; 344 } 345 346 pci_register_bar(dev, bar_nr, PCI_BASE_ADDRESS_SPACE_MEMORY, 347 &dev->msix_exclusive_bar); 348 349 return 0; 350 } 351 352 static void msix_free_irq_entries(PCIDevice *dev) 353 { 354 int vector; 355 356 for (vector = 0; vector < dev->msix_entries_nr; ++vector) { 357 dev->msix_entry_used[vector] = 0; 358 msix_clr_pending(dev, vector); 359 } 360 } 361 362 static void msix_clear_all_vectors(PCIDevice *dev) 363 { 364 int vector; 365 366 for (vector = 0; vector < dev->msix_entries_nr; ++vector) { 367 msix_clr_pending(dev, vector); 368 } 369 } 370 371 /* Clean up resources for the device. */ 372 void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar) 373 { 374 if (!msix_present(dev)) { 375 return; 376 } 377 pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH); 378 dev->msix_cap = 0; 379 msix_free_irq_entries(dev); 380 dev->msix_entries_nr = 0; 381 memory_region_del_subregion(pba_bar, &dev->msix_pba_mmio); 382 g_free(dev->msix_pba); 383 dev->msix_pba = NULL; 384 memory_region_del_subregion(table_bar, &dev->msix_table_mmio); 385 g_free(dev->msix_table); 386 dev->msix_table = NULL; 387 g_free(dev->msix_entry_used); 388 dev->msix_entry_used = NULL; 389 dev->cap_present &= ~QEMU_PCI_CAP_MSIX; 390 } 391 392 void msix_uninit_exclusive_bar(PCIDevice *dev) 393 { 394 if (msix_present(dev)) { 395 msix_uninit(dev, &dev->msix_exclusive_bar, &dev->msix_exclusive_bar); 396 } 397 } 398 399 void msix_save(PCIDevice *dev, QEMUFile *f) 400 { 401 unsigned n = dev->msix_entries_nr; 402 403 if (!msix_present(dev)) { 404 return; 405 } 406 407 qemu_put_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE); 408 qemu_put_buffer(f, dev->msix_pba, (n + 7) / 8); 409 } 410 411 /* Should be called after restoring the config space. */ 412 void msix_load(PCIDevice *dev, QEMUFile *f) 413 { 414 unsigned n = dev->msix_entries_nr; 415 unsigned int vector; 416 417 if (!msix_present(dev)) { 418 return; 419 } 420 421 msix_clear_all_vectors(dev); 422 qemu_get_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE); 423 qemu_get_buffer(f, dev->msix_pba, (n + 7) / 8); 424 msix_update_function_masked(dev); 425 426 for (vector = 0; vector < n; vector++) { 427 msix_handle_mask_update(dev, vector, true); 428 } 429 } 430 431 /* Does device support MSI-X? */ 432 int msix_present(PCIDevice *dev) 433 { 434 return dev->cap_present & QEMU_PCI_CAP_MSIX; 435 } 436 437 /* Is MSI-X enabled? */ 438 int msix_enabled(PCIDevice *dev) 439 { 440 return (dev->cap_present & QEMU_PCI_CAP_MSIX) && 441 (dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & 442 MSIX_ENABLE_MASK); 443 } 444 445 /* Send an MSI-X message */ 446 void msix_notify(PCIDevice *dev, unsigned vector) 447 { 448 MSIMessage msg; 449 450 if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) 451 return; 452 if (msix_is_masked(dev, vector)) { 453 msix_set_pending(dev, vector); 454 return; 455 } 456 457 msg = msix_get_message(dev, vector); 458 459 msi_send_message(dev, msg); 460 } 461 462 void msix_reset(PCIDevice *dev) 463 { 464 if (!msix_present(dev)) { 465 return; 466 } 467 msix_clear_all_vectors(dev); 468 dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &= 469 ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET]; 470 memset(dev->msix_table, 0, dev->msix_entries_nr * PCI_MSIX_ENTRY_SIZE); 471 memset(dev->msix_pba, 0, QEMU_ALIGN_UP(dev->msix_entries_nr, 64) / 8); 472 msix_mask_all(dev, dev->msix_entries_nr); 473 } 474 475 /* PCI spec suggests that devices make it possible for software to configure 476 * less vectors than supported by the device, but does not specify a standard 477 * mechanism for devices to do so. 478 * 479 * We support this by asking devices to declare vectors software is going to 480 * actually use, and checking this on the notification path. Devices that 481 * don't want to follow the spec suggestion can declare all vectors as used. */ 482 483 /* Mark vector as used. */ 484 int msix_vector_use(PCIDevice *dev, unsigned vector) 485 { 486 if (vector >= dev->msix_entries_nr) 487 return -EINVAL; 488 dev->msix_entry_used[vector]++; 489 return 0; 490 } 491 492 /* Mark vector as unused. */ 493 void msix_vector_unuse(PCIDevice *dev, unsigned vector) 494 { 495 if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) { 496 return; 497 } 498 if (--dev->msix_entry_used[vector]) { 499 return; 500 } 501 msix_clr_pending(dev, vector); 502 } 503 504 void msix_unuse_all_vectors(PCIDevice *dev) 505 { 506 if (!msix_present(dev)) { 507 return; 508 } 509 msix_free_irq_entries(dev); 510 } 511 512 unsigned int msix_nr_vectors_allocated(const PCIDevice *dev) 513 { 514 return dev->msix_entries_nr; 515 } 516 517 static int msix_set_notifier_for_vector(PCIDevice *dev, unsigned int vector) 518 { 519 MSIMessage msg; 520 521 if (msix_is_masked(dev, vector)) { 522 return 0; 523 } 524 msg = msix_get_message(dev, vector); 525 return dev->msix_vector_use_notifier(dev, vector, msg); 526 } 527 528 static void msix_unset_notifier_for_vector(PCIDevice *dev, unsigned int vector) 529 { 530 if (msix_is_masked(dev, vector)) { 531 return; 532 } 533 dev->msix_vector_release_notifier(dev, vector); 534 } 535 536 int msix_set_vector_notifiers(PCIDevice *dev, 537 MSIVectorUseNotifier use_notifier, 538 MSIVectorReleaseNotifier release_notifier, 539 MSIVectorPollNotifier poll_notifier) 540 { 541 int vector, ret; 542 543 assert(use_notifier && release_notifier); 544 545 dev->msix_vector_use_notifier = use_notifier; 546 dev->msix_vector_release_notifier = release_notifier; 547 dev->msix_vector_poll_notifier = poll_notifier; 548 549 if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & 550 (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) { 551 for (vector = 0; vector < dev->msix_entries_nr; vector++) { 552 ret = msix_set_notifier_for_vector(dev, vector); 553 if (ret < 0) { 554 goto undo; 555 } 556 } 557 } 558 if (dev->msix_vector_poll_notifier) { 559 dev->msix_vector_poll_notifier(dev, 0, dev->msix_entries_nr); 560 } 561 return 0; 562 563 undo: 564 while (--vector >= 0) { 565 msix_unset_notifier_for_vector(dev, vector); 566 } 567 dev->msix_vector_use_notifier = NULL; 568 dev->msix_vector_release_notifier = NULL; 569 return ret; 570 } 571 572 void msix_unset_vector_notifiers(PCIDevice *dev) 573 { 574 int vector; 575 576 assert(dev->msix_vector_use_notifier && 577 dev->msix_vector_release_notifier); 578 579 if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & 580 (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) { 581 for (vector = 0; vector < dev->msix_entries_nr; vector++) { 582 msix_unset_notifier_for_vector(dev, vector); 583 } 584 } 585 dev->msix_vector_use_notifier = NULL; 586 dev->msix_vector_release_notifier = NULL; 587 dev->msix_vector_poll_notifier = NULL; 588 } 589 590 static void put_msix_state(QEMUFile *f, void *pv, size_t size) 591 { 592 msix_save(pv, f); 593 } 594 595 static int get_msix_state(QEMUFile *f, void *pv, size_t size) 596 { 597 msix_load(pv, f); 598 return 0; 599 } 600 601 static VMStateInfo vmstate_info_msix = { 602 .name = "msix state", 603 .get = get_msix_state, 604 .put = put_msix_state, 605 }; 606 607 const VMStateDescription vmstate_msix = { 608 .name = "msix", 609 .fields = (VMStateField[]) { 610 { 611 .name = "msix", 612 .version_id = 0, 613 .field_exists = NULL, 614 .size = 0, /* ouch */ 615 .info = &vmstate_info_msix, 616 .flags = VMS_SINGLE, 617 .offset = 0, 618 }, 619 VMSTATE_END_OF_LIST() 620 } 621 }; 622