1 /* 2 * MSI-X device support 3 * 4 * This module includes support for MSI-X in pci devices. 5 * 6 * Author: Michael S. Tsirkin <mst@redhat.com> 7 * 8 * Copyright (c) 2009, Red Hat Inc, Michael S. Tsirkin (mst@redhat.com) 9 * 10 * This work is licensed under the terms of the GNU GPL, version 2. See 11 * the COPYING file in the top-level directory. 12 * 13 * Contributions after 2012-01-13 are licensed under the terms of the 14 * GNU GPL, version 2 or (at your option) any later version. 15 */ 16 17 #include "qemu/osdep.h" 18 #include "hw/hw.h" 19 #include "hw/pci/msi.h" 20 #include "hw/pci/msix.h" 21 #include "hw/pci/pci.h" 22 #include "hw/xen/xen.h" 23 #include "qemu/range.h" 24 #include "qapi/error.h" 25 #include "trace.h" 26 27 /* MSI enable bit and maskall bit are in byte 1 in FLAGS register */ 28 #define MSIX_CONTROL_OFFSET (PCI_MSIX_FLAGS + 1) 29 #define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8) 30 #define MSIX_MASKALL_MASK (PCI_MSIX_FLAGS_MASKALL >> 8) 31 32 MSIMessage msix_get_message(PCIDevice *dev, unsigned vector) 33 { 34 uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE; 35 MSIMessage msg; 36 37 msg.address = pci_get_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR); 38 msg.data = pci_get_long(table_entry + PCI_MSIX_ENTRY_DATA); 39 return msg; 40 } 41 42 /* 43 * Special API for POWER to configure the vectors through 44 * a side channel. Should never be used by devices. 45 */ 46 void msix_set_message(PCIDevice *dev, int vector, struct MSIMessage msg) 47 { 48 uint8_t *table_entry = dev->msix_table + vector * PCI_MSIX_ENTRY_SIZE; 49 50 pci_set_quad(table_entry + PCI_MSIX_ENTRY_LOWER_ADDR, msg.address); 51 pci_set_long(table_entry + PCI_MSIX_ENTRY_DATA, msg.data); 52 table_entry[PCI_MSIX_ENTRY_VECTOR_CTRL] &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; 53 } 54 55 static uint8_t msix_pending_mask(int vector) 56 { 57 return 1 << (vector % 8); 58 } 59 60 static uint8_t *msix_pending_byte(PCIDevice *dev, int vector) 61 { 62 return dev->msix_pba + vector / 8; 63 } 64 65 static int msix_is_pending(PCIDevice *dev, int vector) 66 { 67 return *msix_pending_byte(dev, vector) & msix_pending_mask(vector); 68 } 69 70 void msix_set_pending(PCIDevice *dev, unsigned int vector) 71 { 72 *msix_pending_byte(dev, vector) |= msix_pending_mask(vector); 73 } 74 75 void msix_clr_pending(PCIDevice *dev, int vector) 76 { 77 *msix_pending_byte(dev, vector) &= ~msix_pending_mask(vector); 78 } 79 80 static bool msix_vector_masked(PCIDevice *dev, unsigned int vector, bool fmask) 81 { 82 unsigned offset = vector * PCI_MSIX_ENTRY_SIZE; 83 uint8_t *data = &dev->msix_table[offset + PCI_MSIX_ENTRY_DATA]; 84 /* MSIs on Xen can be remapped into pirqs. In those cases, masking 85 * and unmasking go through the PV evtchn path. */ 86 if (xen_enabled() && xen_is_pirq_msi(pci_get_long(data))) { 87 return false; 88 } 89 return fmask || dev->msix_table[offset + PCI_MSIX_ENTRY_VECTOR_CTRL] & 90 PCI_MSIX_ENTRY_CTRL_MASKBIT; 91 } 92 93 bool msix_is_masked(PCIDevice *dev, unsigned int vector) 94 { 95 return msix_vector_masked(dev, vector, dev->msix_function_masked); 96 } 97 98 static void msix_fire_vector_notifier(PCIDevice *dev, 99 unsigned int vector, bool is_masked) 100 { 101 MSIMessage msg; 102 int ret; 103 104 if (!dev->msix_vector_use_notifier) { 105 return; 106 } 107 if (is_masked) { 108 dev->msix_vector_release_notifier(dev, vector); 109 } else { 110 msg = msix_get_message(dev, vector); 111 ret = dev->msix_vector_use_notifier(dev, vector, msg); 112 assert(ret >= 0); 113 } 114 } 115 116 static void msix_handle_mask_update(PCIDevice *dev, int vector, bool was_masked) 117 { 118 bool is_masked = msix_is_masked(dev, vector); 119 120 if (is_masked == was_masked) { 121 return; 122 } 123 124 msix_fire_vector_notifier(dev, vector, is_masked); 125 126 if (!is_masked && msix_is_pending(dev, vector)) { 127 msix_clr_pending(dev, vector); 128 msix_notify(dev, vector); 129 } 130 } 131 132 static bool msix_masked(PCIDevice *dev) 133 { 134 return dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & MSIX_MASKALL_MASK; 135 } 136 137 static void msix_update_function_masked(PCIDevice *dev) 138 { 139 dev->msix_function_masked = !msix_enabled(dev) || msix_masked(dev); 140 } 141 142 /* Handle MSI-X capability config write. */ 143 void msix_write_config(PCIDevice *dev, uint32_t addr, 144 uint32_t val, int len) 145 { 146 unsigned enable_pos = dev->msix_cap + MSIX_CONTROL_OFFSET; 147 int vector; 148 bool was_masked; 149 150 if (!msix_present(dev) || !range_covers_byte(addr, len, enable_pos)) { 151 return; 152 } 153 154 trace_msix_write_config(dev->name, msix_enabled(dev), msix_masked(dev)); 155 156 was_masked = dev->msix_function_masked; 157 msix_update_function_masked(dev); 158 159 if (!msix_enabled(dev)) { 160 return; 161 } 162 163 pci_device_deassert_intx(dev); 164 165 if (dev->msix_function_masked == was_masked) { 166 return; 167 } 168 169 for (vector = 0; vector < dev->msix_entries_nr; ++vector) { 170 msix_handle_mask_update(dev, vector, 171 msix_vector_masked(dev, vector, was_masked)); 172 } 173 } 174 175 static uint64_t msix_table_mmio_read(void *opaque, hwaddr addr, 176 unsigned size) 177 { 178 PCIDevice *dev = opaque; 179 180 return pci_get_long(dev->msix_table + addr); 181 } 182 183 static void msix_table_mmio_write(void *opaque, hwaddr addr, 184 uint64_t val, unsigned size) 185 { 186 PCIDevice *dev = opaque; 187 int vector = addr / PCI_MSIX_ENTRY_SIZE; 188 bool was_masked; 189 190 was_masked = msix_is_masked(dev, vector); 191 pci_set_long(dev->msix_table + addr, val); 192 msix_handle_mask_update(dev, vector, was_masked); 193 } 194 195 static const MemoryRegionOps msix_table_mmio_ops = { 196 .read = msix_table_mmio_read, 197 .write = msix_table_mmio_write, 198 .endianness = DEVICE_LITTLE_ENDIAN, 199 .valid = { 200 .min_access_size = 4, 201 .max_access_size = 4, 202 }, 203 }; 204 205 static uint64_t msix_pba_mmio_read(void *opaque, hwaddr addr, 206 unsigned size) 207 { 208 PCIDevice *dev = opaque; 209 if (dev->msix_vector_poll_notifier) { 210 unsigned vector_start = addr * 8; 211 unsigned vector_end = MIN(addr + size * 8, dev->msix_entries_nr); 212 dev->msix_vector_poll_notifier(dev, vector_start, vector_end); 213 } 214 215 return pci_get_long(dev->msix_pba + addr); 216 } 217 218 static void msix_pba_mmio_write(void *opaque, hwaddr addr, 219 uint64_t val, unsigned size) 220 { 221 } 222 223 static const MemoryRegionOps msix_pba_mmio_ops = { 224 .read = msix_pba_mmio_read, 225 .write = msix_pba_mmio_write, 226 .endianness = DEVICE_LITTLE_ENDIAN, 227 .valid = { 228 .min_access_size = 4, 229 .max_access_size = 4, 230 }, 231 }; 232 233 static void msix_mask_all(struct PCIDevice *dev, unsigned nentries) 234 { 235 int vector; 236 237 for (vector = 0; vector < nentries; ++vector) { 238 unsigned offset = 239 vector * PCI_MSIX_ENTRY_SIZE + PCI_MSIX_ENTRY_VECTOR_CTRL; 240 bool was_masked = msix_is_masked(dev, vector); 241 242 dev->msix_table[offset] |= PCI_MSIX_ENTRY_CTRL_MASKBIT; 243 msix_handle_mask_update(dev, vector, was_masked); 244 } 245 } 246 247 /* 248 * Make PCI device @dev MSI-X capable 249 * @nentries is the max number of MSI-X vectors that the device support. 250 * @table_bar is the MemoryRegion that MSI-X table structure resides. 251 * @table_bar_nr is number of base address register corresponding to @table_bar. 252 * @table_offset indicates the offset that the MSI-X table structure starts with 253 * in @table_bar. 254 * @pba_bar is the MemoryRegion that the Pending Bit Array structure resides. 255 * @pba_bar_nr is number of base address register corresponding to @pba_bar. 256 * @pba_offset indicates the offset that the Pending Bit Array structure 257 * starts with in @pba_bar. 258 * Non-zero @cap_pos puts capability MSI-X at that offset in PCI config space. 259 * @errp is for returning errors. 260 * 261 * Return 0 on success; set @errp and return -errno on error: 262 * -ENOTSUP means lacking msi support for a msi-capable platform. 263 * -EINVAL means capability overlap, happens when @cap_pos is non-zero, 264 * also means a programming error, except device assignment, which can check 265 * if a real HW is broken. 266 */ 267 int msix_init(struct PCIDevice *dev, unsigned short nentries, 268 MemoryRegion *table_bar, uint8_t table_bar_nr, 269 unsigned table_offset, MemoryRegion *pba_bar, 270 uint8_t pba_bar_nr, unsigned pba_offset, uint8_t cap_pos, 271 Error **errp) 272 { 273 int cap; 274 unsigned table_size, pba_size; 275 uint8_t *config; 276 277 /* Nothing to do if MSI is not supported by interrupt controller */ 278 if (!msi_nonbroken) { 279 error_setg(errp, "MSI-X is not supported by interrupt controller"); 280 return -ENOTSUP; 281 } 282 283 if (nentries < 1 || nentries > PCI_MSIX_FLAGS_QSIZE + 1) { 284 error_setg(errp, "The number of MSI-X vectors is invalid"); 285 return -EINVAL; 286 } 287 288 table_size = nentries * PCI_MSIX_ENTRY_SIZE; 289 pba_size = QEMU_ALIGN_UP(nentries, 64) / 8; 290 291 /* Sanity test: table & pba don't overlap, fit within BARs, min aligned */ 292 if ((table_bar_nr == pba_bar_nr && 293 ranges_overlap(table_offset, table_size, pba_offset, pba_size)) || 294 table_offset + table_size > memory_region_size(table_bar) || 295 pba_offset + pba_size > memory_region_size(pba_bar) || 296 (table_offset | pba_offset) & PCI_MSIX_FLAGS_BIRMASK) { 297 error_setg(errp, "table & pba overlap, or they don't fit in BARs," 298 " or don't align"); 299 return -EINVAL; 300 } 301 302 cap = pci_add_capability(dev, PCI_CAP_ID_MSIX, 303 cap_pos, MSIX_CAP_LENGTH, errp); 304 if (cap < 0) { 305 return cap; 306 } 307 308 dev->msix_cap = cap; 309 dev->cap_present |= QEMU_PCI_CAP_MSIX; 310 config = dev->config + cap; 311 312 pci_set_word(config + PCI_MSIX_FLAGS, nentries - 1); 313 dev->msix_entries_nr = nentries; 314 dev->msix_function_masked = true; 315 316 pci_set_long(config + PCI_MSIX_TABLE, table_offset | table_bar_nr); 317 pci_set_long(config + PCI_MSIX_PBA, pba_offset | pba_bar_nr); 318 319 /* Make flags bit writable. */ 320 dev->wmask[cap + MSIX_CONTROL_OFFSET] |= MSIX_ENABLE_MASK | 321 MSIX_MASKALL_MASK; 322 323 dev->msix_table = g_malloc0(table_size); 324 dev->msix_pba = g_malloc0(pba_size); 325 dev->msix_entry_used = g_malloc0(nentries * sizeof *dev->msix_entry_used); 326 327 msix_mask_all(dev, nentries); 328 329 memory_region_init_io(&dev->msix_table_mmio, OBJECT(dev), &msix_table_mmio_ops, dev, 330 "msix-table", table_size); 331 memory_region_add_subregion(table_bar, table_offset, &dev->msix_table_mmio); 332 memory_region_init_io(&dev->msix_pba_mmio, OBJECT(dev), &msix_pba_mmio_ops, dev, 333 "msix-pba", pba_size); 334 memory_region_add_subregion(pba_bar, pba_offset, &dev->msix_pba_mmio); 335 336 return 0; 337 } 338 339 int msix_init_exclusive_bar(PCIDevice *dev, unsigned short nentries, 340 uint8_t bar_nr, Error **errp) 341 { 342 int ret; 343 char *name; 344 uint32_t bar_size = 4096; 345 uint32_t bar_pba_offset = bar_size / 2; 346 uint32_t bar_pba_size = QEMU_ALIGN_UP(nentries, 64) / 8; 347 348 /* 349 * Migration compatibility dictates that this remains a 4k 350 * BAR with the vector table in the lower half and PBA in 351 * the upper half for nentries which is lower or equal to 128. 352 * No need to care about using more than 65 entries for legacy 353 * machine types who has at most 64 queues. 354 */ 355 if (nentries * PCI_MSIX_ENTRY_SIZE > bar_pba_offset) { 356 bar_pba_offset = nentries * PCI_MSIX_ENTRY_SIZE; 357 } 358 359 if (bar_pba_offset + bar_pba_size > 4096) { 360 bar_size = bar_pba_offset + bar_pba_size; 361 } 362 363 bar_size = pow2ceil(bar_size); 364 365 name = g_strdup_printf("%s-msix", dev->name); 366 memory_region_init(&dev->msix_exclusive_bar, OBJECT(dev), name, bar_size); 367 g_free(name); 368 369 ret = msix_init(dev, nentries, &dev->msix_exclusive_bar, bar_nr, 370 0, &dev->msix_exclusive_bar, 371 bar_nr, bar_pba_offset, 372 0, errp); 373 if (ret) { 374 return ret; 375 } 376 377 pci_register_bar(dev, bar_nr, PCI_BASE_ADDRESS_SPACE_MEMORY, 378 &dev->msix_exclusive_bar); 379 380 return 0; 381 } 382 383 static void msix_free_irq_entries(PCIDevice *dev) 384 { 385 int vector; 386 387 for (vector = 0; vector < dev->msix_entries_nr; ++vector) { 388 dev->msix_entry_used[vector] = 0; 389 msix_clr_pending(dev, vector); 390 } 391 } 392 393 static void msix_clear_all_vectors(PCIDevice *dev) 394 { 395 int vector; 396 397 for (vector = 0; vector < dev->msix_entries_nr; ++vector) { 398 msix_clr_pending(dev, vector); 399 } 400 } 401 402 /* Clean up resources for the device. */ 403 void msix_uninit(PCIDevice *dev, MemoryRegion *table_bar, MemoryRegion *pba_bar) 404 { 405 if (!msix_present(dev)) { 406 return; 407 } 408 pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH); 409 dev->msix_cap = 0; 410 msix_free_irq_entries(dev); 411 dev->msix_entries_nr = 0; 412 memory_region_del_subregion(pba_bar, &dev->msix_pba_mmio); 413 g_free(dev->msix_pba); 414 dev->msix_pba = NULL; 415 memory_region_del_subregion(table_bar, &dev->msix_table_mmio); 416 g_free(dev->msix_table); 417 dev->msix_table = NULL; 418 g_free(dev->msix_entry_used); 419 dev->msix_entry_used = NULL; 420 dev->cap_present &= ~QEMU_PCI_CAP_MSIX; 421 } 422 423 void msix_uninit_exclusive_bar(PCIDevice *dev) 424 { 425 if (msix_present(dev)) { 426 msix_uninit(dev, &dev->msix_exclusive_bar, &dev->msix_exclusive_bar); 427 } 428 } 429 430 void msix_save(PCIDevice *dev, QEMUFile *f) 431 { 432 unsigned n = dev->msix_entries_nr; 433 434 if (!msix_present(dev)) { 435 return; 436 } 437 438 qemu_put_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE); 439 qemu_put_buffer(f, dev->msix_pba, DIV_ROUND_UP(n, 8)); 440 } 441 442 /* Should be called after restoring the config space. */ 443 void msix_load(PCIDevice *dev, QEMUFile *f) 444 { 445 unsigned n = dev->msix_entries_nr; 446 unsigned int vector; 447 448 if (!msix_present(dev)) { 449 return; 450 } 451 452 msix_clear_all_vectors(dev); 453 qemu_get_buffer(f, dev->msix_table, n * PCI_MSIX_ENTRY_SIZE); 454 qemu_get_buffer(f, dev->msix_pba, DIV_ROUND_UP(n, 8)); 455 msix_update_function_masked(dev); 456 457 for (vector = 0; vector < n; vector++) { 458 msix_handle_mask_update(dev, vector, true); 459 } 460 } 461 462 /* Does device support MSI-X? */ 463 int msix_present(PCIDevice *dev) 464 { 465 return dev->cap_present & QEMU_PCI_CAP_MSIX; 466 } 467 468 /* Is MSI-X enabled? */ 469 int msix_enabled(PCIDevice *dev) 470 { 471 return (dev->cap_present & QEMU_PCI_CAP_MSIX) && 472 (dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & 473 MSIX_ENABLE_MASK); 474 } 475 476 /* Send an MSI-X message */ 477 void msix_notify(PCIDevice *dev, unsigned vector) 478 { 479 MSIMessage msg; 480 481 if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) { 482 return; 483 } 484 485 if (msix_is_masked(dev, vector)) { 486 msix_set_pending(dev, vector); 487 return; 488 } 489 490 msg = msix_get_message(dev, vector); 491 492 msi_send_message(dev, msg); 493 } 494 495 void msix_reset(PCIDevice *dev) 496 { 497 if (!msix_present(dev)) { 498 return; 499 } 500 msix_clear_all_vectors(dev); 501 dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &= 502 ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET]; 503 memset(dev->msix_table, 0, dev->msix_entries_nr * PCI_MSIX_ENTRY_SIZE); 504 memset(dev->msix_pba, 0, QEMU_ALIGN_UP(dev->msix_entries_nr, 64) / 8); 505 msix_mask_all(dev, dev->msix_entries_nr); 506 } 507 508 /* PCI spec suggests that devices make it possible for software to configure 509 * less vectors than supported by the device, but does not specify a standard 510 * mechanism for devices to do so. 511 * 512 * We support this by asking devices to declare vectors software is going to 513 * actually use, and checking this on the notification path. Devices that 514 * don't want to follow the spec suggestion can declare all vectors as used. */ 515 516 /* Mark vector as used. */ 517 int msix_vector_use(PCIDevice *dev, unsigned vector) 518 { 519 if (vector >= dev->msix_entries_nr) { 520 return -EINVAL; 521 } 522 523 dev->msix_entry_used[vector]++; 524 return 0; 525 } 526 527 /* Mark vector as unused. */ 528 void msix_vector_unuse(PCIDevice *dev, unsigned vector) 529 { 530 if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) { 531 return; 532 } 533 if (--dev->msix_entry_used[vector]) { 534 return; 535 } 536 msix_clr_pending(dev, vector); 537 } 538 539 void msix_unuse_all_vectors(PCIDevice *dev) 540 { 541 if (!msix_present(dev)) { 542 return; 543 } 544 msix_free_irq_entries(dev); 545 } 546 547 unsigned int msix_nr_vectors_allocated(const PCIDevice *dev) 548 { 549 return dev->msix_entries_nr; 550 } 551 552 static int msix_set_notifier_for_vector(PCIDevice *dev, unsigned int vector) 553 { 554 MSIMessage msg; 555 556 if (msix_is_masked(dev, vector)) { 557 return 0; 558 } 559 msg = msix_get_message(dev, vector); 560 return dev->msix_vector_use_notifier(dev, vector, msg); 561 } 562 563 static void msix_unset_notifier_for_vector(PCIDevice *dev, unsigned int vector) 564 { 565 if (msix_is_masked(dev, vector)) { 566 return; 567 } 568 dev->msix_vector_release_notifier(dev, vector); 569 } 570 571 int msix_set_vector_notifiers(PCIDevice *dev, 572 MSIVectorUseNotifier use_notifier, 573 MSIVectorReleaseNotifier release_notifier, 574 MSIVectorPollNotifier poll_notifier) 575 { 576 int vector, ret; 577 578 assert(use_notifier && release_notifier); 579 580 dev->msix_vector_use_notifier = use_notifier; 581 dev->msix_vector_release_notifier = release_notifier; 582 dev->msix_vector_poll_notifier = poll_notifier; 583 584 if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & 585 (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) { 586 for (vector = 0; vector < dev->msix_entries_nr; vector++) { 587 ret = msix_set_notifier_for_vector(dev, vector); 588 if (ret < 0) { 589 goto undo; 590 } 591 } 592 } 593 if (dev->msix_vector_poll_notifier) { 594 dev->msix_vector_poll_notifier(dev, 0, dev->msix_entries_nr); 595 } 596 return 0; 597 598 undo: 599 while (--vector >= 0) { 600 msix_unset_notifier_for_vector(dev, vector); 601 } 602 dev->msix_vector_use_notifier = NULL; 603 dev->msix_vector_release_notifier = NULL; 604 return ret; 605 } 606 607 void msix_unset_vector_notifiers(PCIDevice *dev) 608 { 609 int vector; 610 611 assert(dev->msix_vector_use_notifier && 612 dev->msix_vector_release_notifier); 613 614 if ((dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & 615 (MSIX_ENABLE_MASK | MSIX_MASKALL_MASK)) == MSIX_ENABLE_MASK) { 616 for (vector = 0; vector < dev->msix_entries_nr; vector++) { 617 msix_unset_notifier_for_vector(dev, vector); 618 } 619 } 620 dev->msix_vector_use_notifier = NULL; 621 dev->msix_vector_release_notifier = NULL; 622 dev->msix_vector_poll_notifier = NULL; 623 } 624 625 static int put_msix_state(QEMUFile *f, void *pv, size_t size, 626 const VMStateField *field, QJSON *vmdesc) 627 { 628 msix_save(pv, f); 629 630 return 0; 631 } 632 633 static int get_msix_state(QEMUFile *f, void *pv, size_t size, 634 const VMStateField *field) 635 { 636 msix_load(pv, f); 637 return 0; 638 } 639 640 static VMStateInfo vmstate_info_msix = { 641 .name = "msix state", 642 .get = get_msix_state, 643 .put = put_msix_state, 644 }; 645 646 const VMStateDescription vmstate_msix = { 647 .name = "msix", 648 .fields = (VMStateField[]) { 649 { 650 .name = "msix", 651 .version_id = 0, 652 .field_exists = NULL, 653 .size = 0, /* ouch */ 654 .info = &vmstate_info_msix, 655 .flags = VMS_SINGLE, 656 .offset = 0, 657 }, 658 VMSTATE_END_OF_LIST() 659 } 660 }; 661