/*
 * MSI-X device support
 *
 * This module includes support for MSI-X in PCI devices.
 *
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * Copyright (c) 2009, Red Hat Inc, Michael S. Tsirkin (mst@redhat.com)
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "hw/hw.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci.h"
#include "hw/xen/xen.h"
#include "qemu/range.h"
#include "qapi/error.h"

#define MSIX_CAP_LENGTH 12

/* MSI enable bit and maskall bit are in byte 1 of the FLAGS register */
#define MSIX_CONTROL_OFFSET (PCI_MSIX_FLAGS + 1)
#define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8)
#define MSIX_MASKALL_MASK (PCI_MSIX_FLAGS_MASKALL >> 8)

MSIMessage msix_get_message(PCIDevice *dev, unsigned vector)
{
    uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE;
    MSIMessage msg;

    msg.address = pci_get_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR);
    msg.data = pci_get_long(table_entry + PCI_MSIX_ENTRY_DATA);
    return msg;
}

/*
 * Special API for POWER to configure the vectors through
 * a side channel. Should never be used by devices.
 */
void msix_set_message(PCIDevice *dev, int vector, struct MSIMessage msg)
{
    uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE;

    pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR, msg.address);
    pci_set_long(table_entry + PCI_MSIX_ENTRY_DATA, msg.data);
    table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
}

static uint8_t msix_pending_mask(int vector)
{
    return 1 << (vector % 8);
}

static uint8_t *msix_pending_byte(PCIDevice *dev, int vector)
{
    return dev->msix_pba + vector / 8;
}

static int msix_is_pending(PCIDevice *dev, int vector)
{
    return *msix_pending_byte(dev, vector) & msix_pending_mask(vector);
}

void msix_set_pending(PCIDevice *dev, unsigned int vector)
{
    *msix_pending_byte(dev, vector) |= msix_pending_mask(vector);
}

void msix_clr_pending(PCIDevice *dev, int vector)
{
    *msix_pending_byte(dev, vector) &= ~msix_pending_mask(vector);
}
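
/*
 * Worked example of the pending-bit layout above (illustrative only):
 * vector 10 lives in PBA byte 10 / 8 = 1, at bit 10 % 8 = 2, so
 * msix_set_pending(dev, 10) ORs 0x04 into dev->msix_pba[1] and
 * msix_clr_pending(dev, 10) clears that same bit.
 */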
static bool msix_vector_masked(PCIDevice *dev, unsigned int vector, bool fmask)
{
    unsigned offset = vector * PCI_MSIX_ENTRY_SIZE;
    uint8_t *data = &dev->msix_table[offset + PCI_MSIX_ENTRY_DATA];
    /* MSIs on Xen can be remapped into pirqs. In those cases, masking
     * and unmasking go through the PV evtchn path. */
    if (xen_enabled() && xen_is_pirq_msi(pci_get_long(data))) {
        return false;
    }
    return fmask || dev->msix_table[offset + PCI_MSIX_ENTRY_VECTOR_CTRL] &
        PCI_MSIX_ENTRY_CTRL_MASKBIT;
}

bool msix_is_masked(PCIDevice *dev, unsigned int vector)
{
    return msix_vector_masked(dev, vector, dev->msix_function_masked);
}

static void msix_fire_vector_notifier(PCIDevice *dev,
                                      unsigned int vector, bool is_masked)
{
    MSIMessage msg;
    int ret;

    if (!dev->msix_vector_use_notifier) {
        return;
    }
    if (is_masked) {
        dev->msix_vector_release_notifier(dev, vector);
    } else {
        msg = msix_get_message(dev, vector);
        ret = dev->msix_vector_use_notifier(dev, vector, msg);
        assert(ret >= 0);
    }
}

static void msix_handle_mask_update(PCIDevice *dev, int vector, bool was_masked)
{
    bool is_masked = msix_is_masked(dev, vector);

    if (is_masked == was_masked) {
        return;
    }

    msix_fire_vector_notifier(dev, vector, is_masked);

    if (!is_masked && msix_is_pending(dev, vector)) {
        msix_clr_pending(dev, vector);
        msix_notify(dev, vector);
    }
}

static void msix_update_function_masked(PCIDevice *dev)
{
    dev->msix_function_masked = !msix_enabled(dev) ||
        (dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & MSIX_MASKALL_MASK);
}

/* Handle MSI-X capability config write. */
void msix_write_config(PCIDevice *dev, uint32_t addr,
                       uint32_t val, int len)
{
    unsigned enable_pos = dev->msix_cap + MSIX_CONTROL_OFFSET;
    int vector;
    bool was_masked;

    if (!msix_present(dev) || !range_covers_byte(addr, len, enable_pos)) {
        return;
    }

    was_masked = dev->msix_function_masked;
    msix_update_function_masked(dev);

    if (!msix_enabled(dev)) {
        return;
    }

    pci_device_deassert_intx(dev);

    if (dev->msix_function_masked == was_masked) {
        return;
    }

    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
        msix_handle_mask_update(dev, vector,
                                msix_vector_masked(dev, vector, was_masked));
    }
}

static uint64_t msix_table_mmio_read(void *opaque, hwaddr addr,
                                     unsigned size)
{
    PCIDevice *dev = opaque;

    return pci_get_long(dev->msix_table + addr);
}

static void msix_table_mmio_write(void *opaque, hwaddr addr,
                                  uint64_t val, unsigned size)
{
    PCIDevice *dev = opaque;
    int vector = addr / PCI_MSIX_ENTRY_SIZE;
    bool was_masked;

    was_masked = msix_is_masked(dev, vector);
    pci_set_long(dev->msix_table + addr, val);
    msix_handle_mask_update(dev, vector, was_masked);
}

static const MemoryRegionOps msix_table_mmio_ops = {
    .read = msix_table_mmio_read,
    .write = msix_table_mmio_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
    },
};
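
/*
 * A guest masks or unmasks a vector by writing the Vector Control dword
 * of its table entry, which lands in msix_table_mmio_write() above.  A
 * minimal sketch of the equivalent table update for vector 3, assuming
 * 'dev' is an initialized MSI-X capable device:
 *
 *     hwaddr ctrl = 3 * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL;
 *     uint32_t val = pci_get_long(dev->msix_table + ctrl) |
 *                    PCI_MSIX_ENTRY_CTRL_MASKBIT;
 *     msix_table_mmio_write(dev, ctrl, val, 4);
 */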
static uint64_t msix_pba_mmio_read(void *opaque, hwaddr addr,
                                   unsigned size)
{
    PCIDevice *dev = opaque;
    if (dev->msix_vector_poll_notifier) {
        unsigned vector_start = addr * 8;
        /* Each PBA byte covers 8 vectors, so a read of 'size' bytes
         * starting at 'addr' polls vectors up to (addr + size) * 8. */
        unsigned vector_end = MIN((addr + size) * 8, dev->msix_entries_nr);
        dev->msix_vector_poll_notifier(dev, vector_start, vector_end);
    }

    return pci_get_long(dev->msix_pba + addr);
}

static void msix_pba_mmio_write(void *opaque, hwaddr addr,
                                uint64_t val, unsigned size)
{
}

static const MemoryRegionOps msix_pba_mmio_ops = {
    .read = msix_pba_mmio_read,
    .write = msix_pba_mmio_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
    },
};

static void msix_mask_all(struct PCIDevice *dev, unsigned nentries)
{
    int vector;

    for (vector = 0; vector < nentries; ++vector) {
        unsigned offset =
            vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL;
        bool was_masked = msix_is_masked(dev, vector);

        dev->msix_table[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
        msix_handle_mask_update(dev, vector, was_masked);
    }
}
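
/*
 * Worked example for the sanity checks in msix_init() below (numbers are
 * illustrative only): with nentries = 48, table_size is
 * 48 * PCI_MSIX_ENTRY_SIZE = 768 bytes and pba_size is
 * QEMU_ALIGN_UP(48, 64) / 8 = 8 bytes, so both structures fit in one 4K
 * BAR, e.g. the table at offset 0 and the PBA at offset 0x800, without
 * overlapping and with the low PCI_MSIX_FLAGS_BIRMASK bits of each
 * offset clear.
 */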
/*
 * Make PCI device @dev MSI-X capable
 * @nentries is the max number of MSI-X vectors that the device supports.
 * @table_bar is the MemoryRegion in which the MSI-X table structure resides.
 * @table_bar_nr is the number of the base address register corresponding to
 * @table_bar.
 * @table_offset indicates the offset at which the MSI-X table structure
 * starts in @table_bar.
 * @pba_bar is the MemoryRegion in which the Pending Bit Array structure
 * resides.
 * @pba_bar_nr is the number of the base address register corresponding to
 * @pba_bar.
 * @pba_offset indicates the offset at which the Pending Bit Array structure
 * starts in @pba_bar.
 * A non-zero @cap_pos puts the MSI-X capability at that offset in PCI config
 * space.
 * @errp is for returning errors.
 *
 * Return 0 on success; set @errp and return -errno on error:
 * -ENOTSUP means the platform's interrupt controller lacks MSI support.
 * -EINVAL means the capability overlaps an existing one (possible only when
 * @cap_pos is non-zero). This indicates a programming error in all cases
 * except device assignment, which can use it to check whether real hardware
 * is broken.
 */
int msix_init(struct PCIDevice *dev, unsigned short nentries,
              MemoryRegion *table_bar, uint8_t table_bar_nr,
              unsigned table_offset, MemoryRegion *pba_bar,
              uint8_t pba_bar_nr, unsigned pba_offset, uint8_t cap_pos,
              Error **errp)
{
    int cap;
    unsigned table_size, pba_size;
    uint8_t *config;

    /* Nothing to do if MSI is not supported by interrupt controller */
    if (!msi_nonbroken) {
        error_setg(errp, "MSI-X is not supported by interrupt controller");
        return -ENOTSUP;
    }

    if (nentries < 1 || nentries > PCI_MSIX_FLAGS_QSIZE + 1) {
        error_setg(errp, "The number of MSI-X vectors is invalid");
        return -EINVAL;
    }

    table_size = nentries * PCI_MSIX_ENTRY_SIZE;
    pba_size = QEMU_ALIGN_UP(nentries, 64) / 8;

    /* Sanity test: table & pba don't overlap, fit within BARs, min aligned */
    if ((table_bar_nr == pba_bar_nr &&
         ranges_overlap(table_offset, table_size, pba_offset, pba_size)) ||
        table_offset + table_size > memory_region_size(table_bar) ||
        pba_offset + pba_size > memory_region_size(pba_bar) ||
        (table_offset | pba_offset) & PCI_MSIX_FLAGS_BIRMASK) {
        error_setg(errp, "table & pba overlap, or they don't fit in BARs,"
                   " or don't align");
        return -EINVAL;
    }

    cap = pci_add_capability2(dev, PCI_CAP_ID_MSIX,
                              cap_pos, MSIX_CAP_LENGTH, errp);
    if (cap < 0) {
        return cap;
    }

    dev->msix_cap = cap;
    dev->cap_present |= QEMU_PCI_CAP_MSIX;
    config = dev->config + cap;

    pci_set_word(config + PCI_MSIX_FLAGS, nentries - 1);
    dev->msix_entries_nr = nentries;
    dev->msix_function_masked = true;

    pci_set_long(config + PCI_MSIX_TABLE, table_offset | table_bar_nr);
    pci_set_long(config + PCI_MSIX_PBA, pba_offset | pba_bar_nr);

    /* Make flags bits writable. */
    dev->wmask[cap + MSIX_CONTROL_OFFSET] |= MSIX_ENABLE_MASK |
                                             MSIX_MASKALL_MASK;

    dev->msix_table = g_malloc0(table_size);
    dev->msix_pba = g_malloc0(pba_size);
    dev->msix_entry_used = g_malloc0(nentries * sizeof *dev->msix_entry_used);

    msix_mask_all(dev, nentries);

    memory_region_init_io(&dev->msix_table_mmio, OBJECT(dev),
                          &msix_table_mmio_ops, dev,
                          "msix-table", table_size);
    memory_region_add_subregion(table_bar, table_offset,
                                &dev->msix_table_mmio);
    memory_region_init_io(&dev->msix_pba_mmio, OBJECT(dev),
                          &msix_pba_mmio_ops, dev,
                          "msix-pba", pba_size);
    memory_region_add_subregion(pba_bar, pba_offset, &dev->msix_pba_mmio);

    return 0;
}

int msix_init_exclusive_bar(PCIDevice *dev, unsigned short nentries,
                            uint8_t bar_nr, Error **errp)
{
    int ret;
    char *name;
    uint32_t bar_size = 4096;
    uint32_t bar_pba_offset = bar_size / 2;
    uint32_t bar_pba_size = (nentries / 8 + 1) * 8;

    /*
     * Migration compatibility dictates that this remains a 4k
     * BAR with the vector table in the lower half and PBA in
     * the upper half for nentries less than or equal to 128.
     * No need to care about using more than 65 entries for legacy
     * machine types, which have at most 64 queues.
     */
    if (nentries * PCI_MSIX_ENTRY_SIZE > bar_pba_offset) {
        bar_pba_offset = nentries * PCI_MSIX_ENTRY_SIZE;
    }

    if (bar_pba_offset + bar_pba_size > 4096) {
        bar_size = bar_pba_offset + bar_pba_size;
    }

    bar_size = pow2ceil(bar_size);

    name = g_strdup_printf("%s-msix", dev->name);
    memory_region_init(&dev->msix_exclusive_bar, OBJECT(dev), name, bar_size);
    g_free(name);

    ret = msix_init(dev, nentries, &dev->msix_exclusive_bar, bar_nr,
                    0, &dev->msix_exclusive_bar,
                    bar_nr, bar_pba_offset,
                    0, errp);
    if (ret) {
        return ret;
    }

    pci_register_bar(dev, bar_nr, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &dev->msix_exclusive_bar);

    return 0;
}
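
/*
 * Typical use of msix_init_exclusive_bar() from a device's realize
 * callback; a minimal sketch (MYDEV_MSIX_VECTORS and the BAR number are
 * hypothetical, not defined in this file):
 *
 *     Error *err = NULL;
 *
 *     if (msix_init_exclusive_bar(pci_dev, MYDEV_MSIX_VECTORS, 1, &err)) {
 *         error_propagate(errp, err);
 *         return;
 *     }
 */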
static void msix_free_irq_entries(PCIDevice *dev)
{
    int vector;

    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
        dev->msix_entry_used[vector] = 0;
        msix_clr_pending(dev, vector);
    }
}

static void msix_clear_all_vectors(PCIDevice *dev)
{
    int vector;

    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
        msix_clr_pending(dev, vector);
    }
}

/* Clean up resources for the device. */
void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar)
{
    if (!msix_present(dev)) {
        return;
    }
    pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH);
    dev->msix_cap = 0;
    msix_free_irq_entries(dev);
    dev->msix_entries_nr = 0;
    memory_region_del_subregion(pba_bar, &dev->msix_pba_mmio);
    g_free(dev->msix_pba);
    dev->msix_pba = NULL;
    memory_region_del_subregion(table_bar, &dev->msix_table_mmio);
    g_free(dev->msix_table);
    dev->msix_table = NULL;
    g_free(dev->msix_entry_used);
    dev->msix_entry_used = NULL;
    dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
}

void msix_uninit_exclusive_bar(PCIDevice *dev)
{
    if (msix_present(dev)) {
        msix_uninit(dev, &dev->msix_exclusive_bar, &dev->msix_exclusive_bar);
    }
}

void msix_save(PCIDevice *dev, QEMUFile *f)
{
    unsigned n = dev->msix_entries_nr;

    if (!msix_present(dev)) {
        return;
    }

    qemu_put_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
    qemu_put_buffer(f, dev->msix_pba, (n + 7) / 8);
}

/* Should be called after restoring the config space. */
void msix_load(PCIDevice *dev, QEMUFile *f)
{
    unsigned n = dev->msix_entries_nr;
    unsigned int vector;

    if (!msix_present(dev)) {
        return;
    }

    msix_clear_all_vectors(dev);
    qemu_get_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE);
    qemu_get_buffer(f, dev->msix_pba, (n + 7) / 8);
    msix_update_function_masked(dev);

    for (vector = 0; vector < n; vector++) {
        msix_handle_mask_update(dev, vector, true);
    }
}

/* Does device support MSI-X? */
int msix_present(PCIDevice *dev)
{
    return dev->cap_present & QEMU_PCI_CAP_MSIX;
}

/* Is MSI-X enabled? */
int msix_enabled(PCIDevice *dev)
{
    return (dev->cap_present & QEMU_PCI_CAP_MSIX) &&
        (dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
         MSIX_ENABLE_MASK);
}

/* Send an MSI-X message */
void msix_notify(PCIDevice *dev, unsigned vector)
{
    MSIMessage msg;

    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) {
        return;
    }

    if (msix_is_masked(dev, vector)) {
        msix_set_pending(dev, vector);
        return;
    }

    msg = msix_get_message(dev, vector);

    msi_send_message(dev, msg);
}
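
/*
 * On the data path a device raises an interrupt with msix_notify(); if
 * the vector is masked, the call above simply latches the pending bit.
 * A minimal sketch, assuming a hypothetical queue-to-vector mapping:
 *
 *     msix_vector_use(pci_dev, queue_vector);    // once, at setup time
 *     ...
 *     msix_notify(pci_dev, queue_vector);        // per completed event
 */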
void msix_reset(PCIDevice *dev)
{
    if (!msix_present(dev)) {
        return;
    }
    msix_clear_all_vectors(dev);
    dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &=
        ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET];
    memset(dev->msix_table, 0, dev->msix_entries_nr * PCI_MSIX_ENTRY_SIZE);
    memset(dev->msix_pba, 0, QEMU_ALIGN_UP(dev->msix_entries_nr, 64) / 8);
    msix_mask_all(dev, dev->msix_entries_nr);
}

/* The PCI spec suggests that devices make it possible for software to
 * configure fewer vectors than the device supports, but does not specify a
 * standard mechanism for devices to do so.
 *
 * We support this by asking devices to declare the vectors software is
 * actually going to use, and checking this on the notification path. Devices
 * that don't want to follow the spec suggestion can declare all vectors as
 * used. */

/* Mark vector as used. */
int msix_vector_use(PCIDevice *dev, unsigned vector)
{
    if (vector >= dev->msix_entries_nr) {
        return -EINVAL;
    }

    dev->msix_entry_used[vector]++;
    return 0;
}

/* Mark vector as unused. */
void msix_vector_unuse(PCIDevice *dev, unsigned vector)
{
    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) {
        return;
    }
    if (--dev->msix_entry_used[vector]) {
        return;
    }
    msix_clr_pending(dev, vector);
}

void msix_unuse_all_vectors(PCIDevice *dev)
{
    if (!msix_present(dev)) {
        return;
    }
    msix_free_irq_entries(dev);
}

unsigned int msix_nr_vectors_allocated(const PCIDevice *dev)
{
    return dev->msix_entries_nr;
}

static int msix_set_notifier_for_vector(PCIDevice *dev, unsigned int vector)
{
    MSIMessage msg;

    if (msix_is_masked(dev, vector)) {
        return 0;
    }
    msg = msix_get_message(dev, vector);
    return dev->msix_vector_use_notifier(dev, vector, msg);
}

static void msix_unset_notifier_for_vector(PCIDevice *dev, unsigned int vector)
{
    if (msix_is_masked(dev, vector)) {
        return;
    }
    dev->msix_vector_release_notifier(dev, vector);
}

int msix_set_vector_notifiers(PCIDevice *dev,
                              MSIVectorUseNotifier use_notifier,
                              MSIVectorReleaseNotifier release_notifier,
                              MSIVectorPollNotifier poll_notifier)
{
    int vector, ret;

    assert(use_notifier && release_notifier);

    dev->msix_vector_use_notifier = use_notifier;
    dev->msix_vector_release_notifier = release_notifier;
    dev->msix_vector_poll_notifier = poll_notifier;

    if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
        (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) {
        for (vector = 0; vector < dev->msix_entries_nr; vector++) {
            ret = msix_set_notifier_for_vector(dev, vector);
            if (ret < 0) {
                goto undo;
            }
        }
    }
    if (dev->msix_vector_poll_notifier) {
        dev->msix_vector_poll_notifier(dev, 0, dev->msix_entries_nr);
    }
    return 0;

undo:
    while (--vector >= 0) {
        msix_unset_notifier_for_vector(dev, vector);
    }
    dev->msix_vector_use_notifier = NULL;
    dev->msix_vector_release_notifier = NULL;
    return ret;
}

void msix_unset_vector_notifiers(PCIDevice *dev)
{
    int vector;

    assert(dev->msix_vector_use_notifier &&
           dev->msix_vector_release_notifier);

    if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
        (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) {
        for (vector = 0; vector < dev->msix_entries_nr; vector++) {
            msix_unset_notifier_for_vector(dev, vector);
        }
    }
    dev->msix_vector_use_notifier = NULL;
    dev->msix_vector_release_notifier = NULL;
    dev->msix_vector_poll_notifier = NULL;
}

static int put_msix_state(QEMUFile *f, void *pv, size_t size,
                          VMStateField *field, QJSON *vmdesc)
{
    msix_save(pv, f);

    return 0;
}

static int get_msix_state(QEMUFile *f, void *pv, size_t size,
                          VMStateField *field)
{
    msix_load(pv, f);
    return 0;
}

static VMStateInfo vmstate_info_msix = {
    .name = "msix state",
    .get = get_msix_state,
    .put = put_msix_state,
};

const VMStateDescription vmstate_msix = {
    .name = "msix",
    .fields = (VMStateField[]) {
        {
            .name = "msix",
            .version_id = 0,
            .field_exists = NULL,
            .size = 0,   /* ouch */
            .info = &vmstate_info_msix,
            .flags = VMS_SINGLE,
            .offset = 0,
        },
        VMSTATE_END_OF_LIST()
    }
};
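
/*
 * Devices embed this via their own VMStateDescription, typically through
 * the VMSTATE_MSIX() helper from "hw/pci/msix.h"; a minimal sketch
 * (MyPCIDev is hypothetical):
 *
 *     static const VMStateDescription vmstate_mydev = {
 *         .name = "mydev",
 *         .fields = (VMStateField[]) {
 *             VMSTATE_PCI_DEVICE(parent_obj, MyPCIDev),
 *             VMSTATE_MSIX(parent_obj, MyPCIDev),
 *             VMSTATE_END_OF_LIST()
 *         }
 *     };
 */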