// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/*
 * Copyright(c) 2015 - 2020 Intel Corporation.
 */

#include <linux/topology.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/numa.h>

#include "hfi.h"
#include "affinity.h"
#include "sdma.h"
#include "trace.h"

struct hfi1_affinity_node_list node_affinity = {
	.list = LIST_HEAD_INIT(node_affinity.list),
	.lock = __MUTEX_INITIALIZER(node_affinity.lock)
};

/* Names of IRQ types, indexed by enum irq_type */
static const char * const irq_type_names[] = {
	"SDMA",
	"RCVCTXT",
	"NETDEVCTXT",
	"GENERAL",
	"OTHER",
};

/* Per NUMA node count of HFI devices */
static unsigned int *hfi1_per_node_cntr;

static inline void init_cpu_mask_set(struct cpu_mask_set *set)
{
	cpumask_clear(&set->mask);
	cpumask_clear(&set->used);
	set->gen = 0;
}

/* Increment generation of CPU set if needed */
static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set)
{
	if (cpumask_equal(&set->mask, &set->used)) {
		/*
		 * We've used up all the CPUs, bump up the generation
		 * and reset the 'used' map
		 */
		set->gen++;
		cpumask_clear(&set->used);
	}
}

static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set)
{
	if (cpumask_empty(&set->used) && set->gen) {
		set->gen--;
		cpumask_copy(&set->used, &set->mask);
	}
}

/* Get the first CPU from the list of unused CPUs in a CPU set data structure */
static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff)
{
	int cpu;

	if (!diff || !set)
		return -EINVAL;

	_cpu_mask_set_gen_inc(set);

	/* Find out which CPUs are left in the CPU mask */
	cpumask_andnot(diff, &set->mask, &set->used);

	cpu = cpumask_first(diff);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -EINVAL;
	else
		cpumask_set_cpu(cpu, &set->used);

	return cpu;
}

static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu)
{
	if (!set)
		return;

	cpumask_clear_cpu(cpu, &set->used);
	_cpu_mask_set_gen_dec(set);
}

/* Initialize non-HT cpu cores mask */
void init_real_cpu_mask(void)
{
	int possible, curr_cpu, i, ht;

	cpumask_clear(&node_affinity.real_cpu_mask);

	/* Start with cpu online mask as the real cpu mask */
	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);

	/*
	 * Remove HT cores from the real cpu mask. Do this in two steps below.
	 */
	possible = cpumask_weight(&node_affinity.real_cpu_mask);
	ht = cpumask_weight(topology_sibling_cpumask(
				cpumask_first(&node_affinity.real_cpu_mask)));
	/*
	 * Step 1. Skip over the first N HT siblings and use them as the
	 * "real" cores. Assumes that HT cores are not enumerated in
	 * succession (except in the single core case).
	 */
	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
	for (i = 0; i < possible / ht; i++)
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	/*
	 * Step 2. Remove the remaining HT siblings. Use cpumask_next() to
	 * skip any gaps.
	 */
	for (; i < possible; i++) {
		cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	}
}
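/*
 * Illustrative example for init_real_cpu_mask() above (hypothetical
 * topology, not taken from any specific platform): with 8 online CPUs,
 * 2 HT threads per core, and siblings enumerated as 0-3 / 4-7,
 * possible = 8 and ht = 2. Step 1 walks past CPUs 0-3 and keeps them as
 * the "real" cores; step 2 clears CPUs 4-7, leaving real_cpu_mask = 0-3.
 */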
int node_affinity_init(void)
{
	int node;
	struct pci_dev *dev = NULL;
	const struct pci_device_id *ids = hfi1_pci_tbl;

	cpumask_clear(&node_affinity.proc.used);
	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);

	node_affinity.proc.gen = 0;
	node_affinity.num_core_siblings =
		cpumask_weight(topology_sibling_cpumask(
			cpumask_first(&node_affinity.proc.mask)
			));
	node_affinity.num_possible_nodes = num_possible_nodes();
	node_affinity.num_online_nodes = num_online_nodes();
	node_affinity.num_online_cpus = num_online_cpus();

	/*
	 * The real cpu mask is part of the affinity struct but it has to be
	 * initialized early. It is needed to calculate the number of user
	 * contexts in set_up_context_variables().
	 */
	init_real_cpu_mask();

	hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes,
				     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
	if (!hfi1_per_node_cntr)
		return -ENOMEM;

	while (ids->vendor) {
		dev = NULL;
		while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
			node = pcibus_to_node(dev->bus);
			if (node < 0)
				goto out;

			hfi1_per_node_cntr[node]++;
		}
		ids++;
	}

	return 0;

out:
	/*
	 * Invalid PCI NUMA node information found, note it, and populate
	 * our database 1:1.
	 */
	pr_err("HFI: Invalid PCI NUMA node. Performance may be affected\n");
	pr_err("HFI: System BIOS may need to be upgraded\n");
	for (node = 0; node < node_affinity.num_possible_nodes; node++)
		hfi1_per_node_cntr[node] = 1;

	return 0;
}

static void node_affinity_destroy(struct hfi1_affinity_node *entry)
{
	free_percpu(entry->comp_vect_affinity);
	kfree(entry);
}

void node_affinity_destroy_all(void)
{
	struct list_head *pos, *q;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	list_for_each_safe(pos, q, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node,
				   list);
		list_del(pos);
		node_affinity_destroy(entry);
	}
	mutex_unlock(&node_affinity.lock);
	kfree(hfi1_per_node_cntr);
}

static struct hfi1_affinity_node *node_affinity_allocate(int node)
{
	struct hfi1_affinity_node *entry;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return NULL;
	entry->node = node;
	entry->comp_vect_affinity = alloc_percpu(u16);
	INIT_LIST_HEAD(&entry->list);

	return entry;
}
/*
 * It appends an entry to the list.
 * It *must* be called with node_affinity.lock held.
 */
static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
{
	list_add_tail(&entry->list, &node_affinity.list);
}

/* It must be called with node_affinity.lock held */
static struct hfi1_affinity_node *node_affinity_lookup(int node)
{
	struct list_head *pos;
	struct hfi1_affinity_node *entry;

	list_for_each(pos, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node, list);
		if (entry->node == node)
			return entry;
	}

	return NULL;
}

static int per_cpu_affinity_get(cpumask_var_t possible_cpumask,
				u16 __percpu *comp_vect_affinity)
{
	int curr_cpu;
	u16 cntr;
	u16 prev_cntr;
	int ret_cpu;

	if (!possible_cpumask) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	if (!comp_vect_affinity) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	ret_cpu = cpumask_first(possible_cpumask);
	if (ret_cpu >= nr_cpu_ids) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu);
	for_each_cpu(curr_cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);

		if (cntr < prev_cntr) {
			ret_cpu = curr_cpu;
			prev_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1;

fail:
	return ret_cpu;
}

static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask,
				    u16 __percpu *comp_vect_affinity)
{
	int curr_cpu;
	int max_cpu;
	u16 cntr;
	u16 prev_cntr;

	if (!possible_cpumask)
		return -EINVAL;

	if (!comp_vect_affinity)
		return -EINVAL;

	max_cpu = cpumask_first(possible_cpumask);
	if (max_cpu >= nr_cpu_ids)
		return -EINVAL;

	prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu);
	for_each_cpu(curr_cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);

		if (cntr > prev_cntr) {
			max_cpu = curr_cpu;
			prev_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1;

	return max_cpu;
}
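/*
 * Note (illustrative, added for clarity): per_cpu_affinity_get() and
 * per_cpu_affinity_put_max() above implement a simple least-loaded
 * counter scheme over comp_vect_affinity. get() returns the CPU in
 * possible_cpumask with the smallest reference count and increments it;
 * put_max() decrements the CPU with the largest count. For example, if
 * CPUs 2 and 3 currently hold counts of 1 and 0, get() picks CPU 3 and
 * bumps its count to 1.
 */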
/*
 * Non-interrupt CPUs are used first, then interrupt CPUs.
 * Two already allocated cpu masks must be passed.
 */
static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd,
				  struct hfi1_affinity_node *entry,
				  cpumask_var_t non_intr_cpus,
				  cpumask_var_t available_cpus)
	__must_hold(&node_affinity.lock)
{
	int cpu;
	struct cpu_mask_set *set = dd->comp_vect;

	lockdep_assert_held(&node_affinity.lock);
	if (!non_intr_cpus) {
		cpu = -1;
		goto fail;
	}

	if (!available_cpus) {
		cpu = -1;
		goto fail;
	}

	/* Available CPUs for pinning completion vectors */
	_cpu_mask_set_gen_inc(set);
	cpumask_andnot(available_cpus, &set->mask, &set->used);

	/* Available CPUs without SDMA engine interrupts */
	cpumask_andnot(non_intr_cpus, available_cpus,
		       &entry->def_intr.used);

	/* If there are non-interrupt CPUs available, use them first */
	if (!cpumask_empty(non_intr_cpus))
		cpu = cpumask_first(non_intr_cpus);
	else /* Otherwise, use interrupt CPUs */
		cpu = cpumask_first(available_cpus);

	if (cpu >= nr_cpu_ids) { /* empty */
		cpu = -1;
		goto fail;
	}
	cpumask_set_cpu(cpu, &set->used);

fail:
	return cpu;
}

static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu)
{
	struct cpu_mask_set *set = dd->comp_vect;

	if (cpu < 0)
		return;

	cpu_mask_set_put(set, cpu);
}

/* _dev_comp_vect_mappings_destroy() is reentrant */
static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd)
{
	int i, cpu;

	if (!dd->comp_vect_mappings)
		return;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = dd->comp_vect_mappings[i];
		_dev_comp_vect_cpu_put(dd, cpu);
		dd->comp_vect_mappings[i] = -1;
		hfi1_cdbg(AFFINITY,
			  "[%s] Release CPU %d from completion vector %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i);
	}

	kfree(dd->comp_vect_mappings);
	dd->comp_vect_mappings = NULL;
}

/*
 * This function creates the table for looking up CPUs for completion vectors.
 * num_comp_vectors needs to have been initialized before calling this function.
 */
static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd,
					  struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu, ret;
	cpumask_var_t non_intr_cpus;
	cpumask_var_t available_cpus;

	lockdep_assert_held(&node_affinity.lock);

	if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) {
		free_cpumask_var(non_intr_cpus);
		return -ENOMEM;
	}

	dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus,
					 sizeof(*dd->comp_vect_mappings),
					 GFP_KERNEL);
	if (!dd->comp_vect_mappings) {
		ret = -ENOMEM;
		goto fail;
	}
	for (i = 0; i < dd->comp_vect_possible_cpus; i++)
		dd->comp_vect_mappings[i] = -1;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus,
					     available_cpus);
		if (cpu < 0) {
			ret = -EINVAL;
			goto fail;
		}

		dd->comp_vect_mappings[i] = cpu;
		hfi1_cdbg(AFFINITY,
			  "[%s] Completion Vector %d -> CPU %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu);
	}

	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	return 0;

fail:
	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	_dev_comp_vect_mappings_destroy(dd);

	return ret;
}

int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd)
{
	int ret;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry) {
		ret = -EINVAL;
		goto unlock;
	}
	ret = _dev_comp_vect_mappings_create(dd, entry);
unlock:
	mutex_unlock(&node_affinity.lock);

	return ret;
}

void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd)
{
	_dev_comp_vect_mappings_destroy(dd);
}

int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect)
{
	struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
	struct hfi1_devdata *dd = dd_from_dev(verbs_dev);

	if (!dd->comp_vect_mappings)
		return -EINVAL;
	if (comp_vect >= dd->comp_vect_possible_cpus)
		return -EINVAL;

	return dd->comp_vect_mappings[comp_vect];
}
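/*
 * Worked example for _dev_comp_vect_cpu_mask_init() below (hypothetical
 * numbers, for illustration only): if a NUMA node has 17 CPUs left in
 * entry->comp_vect_mask and hosts 2 HFI devices, each device gets
 * 17 / 2 = 8 completion vector CPUs, and the first device to be
 * initialized also picks up the remainder for a total of 9.
 */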
/*
 * It assumes dd->comp_vect_possible_cpus is available.
 */
static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd,
					struct hfi1_affinity_node *entry,
					bool first_dev_init)
	__must_hold(&node_affinity.lock)
{
	int i, j, curr_cpu;
	int possible_cpus_comp_vect = 0;
	struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask;

	lockdep_assert_held(&node_affinity.lock);
	/*
	 * If there's only one CPU available for completion vectors, then
	 * there will only be one completion vector available. Otherwise,
	 * the number of completion vectors available will be the number of
	 * available CPUs divided by the number of devices in the
	 * local NUMA node.
	 */
	if (cpumask_weight(&entry->comp_vect_mask) == 1) {
		possible_cpus_comp_vect = 1;
		dd_dev_warn(dd,
			    "Number of kernel receive queues is too large for completion vector affinity to be effective\n");
	} else {
		possible_cpus_comp_vect +=
			cpumask_weight(&entry->comp_vect_mask) /
			hfi1_per_node_cntr[dd->node];

		/*
		 * If the available completion vector CPUs don't divide
		 * evenly among devices, then the first device to be
		 * initialized gets an extra CPU.
		 */
		if (first_dev_init &&
		    cpumask_weight(&entry->comp_vect_mask) %
		    hfi1_per_node_cntr[dd->node] != 0)
			possible_cpus_comp_vect++;
	}

	dd->comp_vect_possible_cpus = possible_cpus_comp_vect;

	/* Reserving CPUs for device completion vector */
	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask,
						entry->comp_vect_affinity);
		if (curr_cpu < 0)
			goto fail;

		cpumask_set_cpu(curr_cpu, dev_comp_vect_mask);
	}

	hfi1_cdbg(AFFINITY,
		  "[%s] Completion vector affinity CPU set(s) %*pbl",
		  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi),
		  cpumask_pr_args(dev_comp_vect_mask));

	return 0;

fail:
	for (j = 0; j < i; j++)
		per_cpu_affinity_put_max(&entry->comp_vect_mask,
					 entry->comp_vect_affinity);

	return curr_cpu;
}

/*
 * It assumes dd->comp_vect_possible_cpus is available.
 */
static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd,
					     struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu;

	lockdep_assert_held(&node_affinity.lock);
	if (!dd->comp_vect_possible_cpus)
		return;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask,
					       entry->comp_vect_affinity);
		/* Clearing CPU in device completion vector cpu mask */
		if (cpu >= 0)
			cpumask_clear_cpu(cpu, &dd->comp_vect->mask);
	}

	dd->comp_vect_possible_cpus = 0;
}
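/*
 * Illustrative example of the partitioning done in hfi1_dev_affinity_init()
 * below (hypothetical topology, for clarity only): on a node whose real
 * (non-HT) CPUs are 0-7, with one device and n_krcv_queues = 3, CPU 0 is
 * moved to the general/control interrupt mask, CPUs 1-2 are moved to the
 * receive interrupt mask, and CPUs 3-7 remain in the default (SDMA)
 * interrupt mask.
 */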
/*
 * Interrupt affinity.
 *
 * The default (non-receive, i.e. SDMA) interrupts get a mask that starts
 * as the node's real CPUs (HT siblings removed) with the CPUs used for
 * receive interrupts cleared.
 *
 * Receive interrupts get node-relative CPUs starting at node-relative
 * CPU 1 (CPU 0 of the node serves the general/control context), wrapping
 * back as necessary.
 */
int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;
	const struct cpumask *local_mask;
	int curr_cpu, possible, i, ret;
	bool new_entry = false;

	local_mask = cpumask_of_node(dd->node);
	if (cpumask_first(local_mask) >= nr_cpu_ids)
		local_mask = topology_core_cpumask(0);

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	/*
	 * If this is the first time this NUMA node's affinity is used,
	 * create an entry in the global affinity structure and initialize it.
	 */
	if (!entry) {
		entry = node_affinity_allocate(dd->node);
		if (!entry) {
			dd_dev_err(dd,
				   "Unable to allocate global affinity node\n");
			ret = -ENOMEM;
			goto fail;
		}
		new_entry = true;

		init_cpu_mask_set(&entry->def_intr);
		init_cpu_mask_set(&entry->rcv_intr);
		cpumask_clear(&entry->comp_vect_mask);
		cpumask_clear(&entry->general_intr_mask);
		/* Use the "real" cpu mask of this node as the default */
		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
			    local_mask);

		/* fill in the receive list */
		possible = cpumask_weight(&entry->def_intr.mask);
		curr_cpu = cpumask_first(&entry->def_intr.mask);

		if (possible == 1) {
			/* only one CPU, everyone will use it */
			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
		} else {
			/*
			 * The general/control context will be the first CPU in
			 * the default list, so it is removed from the default
			 * list and added to the general interrupt list.
			 */
			cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
			curr_cpu = cpumask_next(curr_cpu,
						&entry->def_intr.mask);

			/*
			 * Remove the remaining kernel receive queues from
			 * the default list and add them to the receive list.
			 */
			for (i = 0;
			     i < (dd->n_krcv_queues - 1) *
				  hfi1_per_node_cntr[dd->node];
			     i++) {
				cpumask_clear_cpu(curr_cpu,
						  &entry->def_intr.mask);
				cpumask_set_cpu(curr_cpu,
						&entry->rcv_intr.mask);
				curr_cpu = cpumask_next(curr_cpu,
							&entry->def_intr.mask);
				if (curr_cpu >= nr_cpu_ids)
					break;
			}

			/*
			 * If there ends up being 0 CPU cores leftover for SDMA
			 * engines, use the same CPU cores as general/control
			 * context.
			 */
			if (cpumask_weight(&entry->def_intr.mask) == 0)
				cpumask_copy(&entry->def_intr.mask,
					     &entry->general_intr_mask);
		}

		/* Determine completion vector CPUs for the entire node */
		cpumask_and(&entry->comp_vect_mask,
			    &node_affinity.real_cpu_mask, local_mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->rcv_intr.mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->general_intr_mask);

		/*
		 * If there ends up being 0 CPU cores leftover for completion
		 * vectors, use the same CPU core as the general/control
		 * context.
		 */
		if (cpumask_weight(&entry->comp_vect_mask) == 0)
			cpumask_copy(&entry->comp_vect_mask,
				     &entry->general_intr_mask);
	}

	ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry);
	if (ret < 0)
		goto fail;

	if (new_entry)
		node_affinity_add_tail(entry);

	dd->affinity_entry = entry;
	mutex_unlock(&node_affinity.lock);

	return 0;

fail:
	if (new_entry)
		node_affinity_destroy(entry);
	mutex_unlock(&node_affinity.lock);
	return ret;
}

void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	if (!dd->affinity_entry)
		goto unlock;
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	/*
	 * Free device completion vector CPUs to be used by future
	 * completion vectors
	 */
	_dev_comp_vect_cpu_mask_clean_up(dd, entry);
unlock:
	dd->affinity_entry = NULL;
	mutex_unlock(&node_affinity.lock);
}
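/*
 * Illustrative note: the SDMA affinity notifier below fires when the IRQ's
 * affinity is changed from user space, e.g. via
 * "echo 4 > /proc/irq/123/smp_affinity_list" (IRQ number 123 and CPU 4 are
 * hypothetical). Only the first CPU of the new mask is honored, and the
 * per-node def_intr accounting is updated to track the move.
 */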
/*
 * Function updates the irq affinity hint for msix after it has been changed
 * by the user using the /proc/irq interface. This function only accepts
 * one cpu in the mask.
 */
static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
{
	struct sdma_engine *sde = msix->arg;
	struct hfi1_devdata *dd = sde->dd;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set;
	int i, old_cpu;

	if (cpu > num_online_cpus() || cpu == sde->cpu)
		return;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	old_cpu = sde->cpu;
	sde->cpu = cpu;
	cpumask_clear(&msix->mask);
	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n",
		   msix->irq, irq_type_names[msix->type],
		   sde->this_idx, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	/*
	 * Set the new cpu in the hfi1_affinity_node and clean
	 * the old cpu if it is not used by any other IRQ
	 */
	set = &entry->def_intr;
	cpumask_set_cpu(cpu, &set->mask);
	cpumask_set_cpu(cpu, &set->used);
	for (i = 0; i < dd->msix_info.max_requested; i++) {
		struct hfi1_msix_entry *other_msix;

		other_msix = &dd->msix_info.msix_entries[i];
		if (other_msix->type != IRQ_SDMA || other_msix == msix)
			continue;

		if (cpumask_test_cpu(old_cpu, &other_msix->mask))
			goto unlock;
	}
	cpumask_clear_cpu(old_cpu, &set->mask);
	cpumask_clear_cpu(old_cpu, &set->used);
unlock:
	mutex_unlock(&node_affinity.lock);
}

static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify,
				     const cpumask_t *mask)
{
	int cpu = cpumask_first(mask);
	struct hfi1_msix_entry *msix = container_of(notify,
						    struct hfi1_msix_entry,
						    notify);

	/* Only one CPU configuration supported currently */
	hfi1_update_sdma_affinity(msix, cpu);
}

static void hfi1_irq_notifier_release(struct kref *ref)
{
	/*
	 * This is required by affinity notifier. We don't have anything to
	 * free here.
	 */
}

static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	notify->irq = msix->irq;
	notify->notify = hfi1_irq_notifier_notify;
	notify->release = hfi1_irq_notifier_release;

	if (irq_set_affinity_notifier(notify->irq, notify))
		pr_err("Failed to register sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}

static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	if (irq_set_affinity_notifier(notify->irq, NULL))
		pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}
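/*
 * Summary note (added for clarity): get_irq_affinity() below draws CPUs
 * from different per-node sets depending on the IRQ type: SDMA and netdev
 * context IRQs come from def_intr, kernel receive context IRQs come from
 * rcv_intr (except the control context, which shares the general interrupt
 * CPU), and general IRQs always use general_intr_mask.
 */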
/*
 * Function sets the irq affinity for msix.
 * It *must* be called with node_affinity.lock held.
 */
static int get_irq_affinity(struct hfi1_devdata *dd,
			    struct hfi1_msix_entry *msix)
{
	cpumask_var_t diff;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set = NULL;
	struct sdma_engine *sde = NULL;
	struct hfi1_ctxtdata *rcd = NULL;
	char extra[64];
	int cpu = -1;

	extra[0] = '\0';
	cpumask_clear(&msix->mask);

	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		sde = (struct sdma_engine *)msix->arg;
		scnprintf(extra, 64, "engine %u", sde->this_idx);
		set = &entry->def_intr;
		break;
	case IRQ_GENERAL:
		cpu = cpumask_first(&entry->general_intr_mask);
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		if (rcd->ctxt == HFI1_CTRL_CTXT)
			cpu = cpumask_first(&entry->general_intr_mask);
		else
			set = &entry->rcv_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	case IRQ_NETDEVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		set = &entry->def_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	default:
		dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
		return -EINVAL;
	}

	/*
	 * The general and control contexts are placed on a particular
	 * CPU, which is set above. Skip accounting for it. Everything else
	 * finds its CPU here.
	 */
	if (cpu == -1 && set) {
		if (!zalloc_cpumask_var(&diff, GFP_KERNEL))
			return -ENOMEM;

		cpu = cpu_mask_set_get_first(set, diff);
		if (cpu < 0) {
			free_cpumask_var(diff);
			dd_dev_err(dd, "Failure to obtain CPU for IRQ\n");
			return cpu;
		}

		free_cpumask_var(diff);
	}

	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n",
		    msix->irq, irq_type_names[msix->type],
		    extra, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	if (msix->type == IRQ_SDMA) {
		sde->cpu = cpu;
		hfi1_setup_sdma_notifier(msix);
	}

	return 0;
}

int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
{
	int ret;

	mutex_lock(&node_affinity.lock);
	ret = get_irq_affinity(dd, msix);
	mutex_unlock(&node_affinity.lock);
	return ret;
}

void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
			   struct hfi1_msix_entry *msix)
{
	struct cpu_mask_set *set = NULL;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		set = &entry->def_intr;
		hfi1_cleanup_sdma_notifier(msix);
		break;
	case IRQ_GENERAL:
		/* Don't do accounting for general contexts */
		break;
	case IRQ_RCVCTXT: {
		struct hfi1_ctxtdata *rcd = msix->arg;

		/* Don't do accounting for control contexts */
		if (rcd->ctxt != HFI1_CTRL_CTXT)
			set = &entry->rcv_intr;
		break;
	}
	case IRQ_NETDEVCTXT:
		set = &entry->def_intr;
		break;
	default:
		mutex_unlock(&node_affinity.lock);
		return;
	}

	if (set) {
		cpumask_andnot(&set->used, &set->used, &msix->mask);
		_cpu_mask_set_gen_dec(set);
	}

	irq_set_affinity_hint(msix->irq, NULL);
	cpumask_clear(&msix->mask);
	mutex_unlock(&node_affinity.lock);
}
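/*
 * Worked example for find_hw_thread_mask() below (hypothetical topology,
 * for illustration only): with 64 online CPUs, 2 HT siblings per core and
 * 2 online NUMA nodes, num_cores_per_socket = 64 / 2 / 2 = 16. The mask is
 * first trimmed to the first 32 CPUs (one HW thread on every physical core
 * of both sockets); for hw_thread_no = 1 it is then shifted left by 32 to
 * select the second HW thread on those same cores.
 */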
/* This should be called with node_affinity.lock held */
static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
				struct hfi1_affinity_node_list *affinity)
{
	int possible, curr_cpu, i;
	uint num_cores_per_socket = node_affinity.num_online_cpus /
					affinity->num_core_siblings /
						node_affinity.num_online_nodes;

	cpumask_copy(hw_thread_mask, &affinity->proc.mask);
	if (affinity->num_core_siblings > 0) {
		/* Removing other siblings not needed for now */
		possible = cpumask_weight(hw_thread_mask);
		curr_cpu = cpumask_first(hw_thread_mask);
		for (i = 0;
		     i < num_cores_per_socket * node_affinity.num_online_nodes;
		     i++)
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);

		for (; i < possible; i++) {
			cpumask_clear_cpu(curr_cpu, hw_thread_mask);
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
		}

		/* Identifying correct HW threads within physical cores */
		cpumask_shift_left(hw_thread_mask, hw_thread_mask,
				   num_cores_per_socket *
				   node_affinity.num_online_nodes *
				   hw_thread_no);
	}
}

int hfi1_get_proc_affinity(int node)
{
	int cpu = -1, ret, i;
	struct hfi1_affinity_node *entry;
	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
	const struct cpumask *node_mask,
		*proc_mask = current->cpus_ptr;
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	/*
	 * check whether process/context affinity has already
	 * been set
	 */
	if (current->nr_cpus_allowed == 1) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		/*
		 * Mark the pre-set CPU as used. This is atomic so we don't
		 * need the lock
		 */
		cpu = cpumask_first(proc_mask);
		cpumask_set_cpu(cpu, &set->used);
		goto done;
	} else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		goto done;
	}

	/*
	 * The process does not have a preset CPU affinity so find one to
	 * recommend using the following algorithm:
	 *
	 * For each user process that is opening a context on HFI Y:
	 *  a) If all cores are filled, reinitialize the bitmask
	 *  b) Fill real cores first, then HT cores (First set of HT
	 *     cores on all physical cores, then second set of HT core,
	 *     and, so on) in the following order:
	 *
	 *     1. Same NUMA node as HFI Y and not running an IRQ
	 *        handler
	 *     2. Same NUMA node as HFI Y and running an IRQ handler
	 *     3. Different NUMA node to HFI Y and not running an IRQ
	 *        handler
	 *     4. Different NUMA node to HFI Y and running an IRQ
	 *        handler
	 *  c) Mark core as filled in the bitmask. As user processes are
	 *     done, clear cores from the bitmask.
	 */

	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		goto done;
	ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
	if (!ret)
		goto free_diff;
	ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
	if (!ret)
		goto free_hw_thread_mask;
	ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
	if (!ret)
		goto free_available_mask;

	mutex_lock(&affinity->lock);
	/*
	 * If we've used all available HW threads, clear the mask and start
	 * overloading.
	 */
	_cpu_mask_set_gen_inc(set);

	/*
	 * If the NUMA node has CPUs used by interrupt handlers, include them
	 * in the interrupt handler mask.
	 */
	entry = node_affinity_lookup(node);
	if (entry) {
		cpumask_copy(intrs_mask, (entry->def_intr.gen ?
					  &entry->def_intr.mask :
					  &entry->def_intr.used));
		cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
						    &entry->rcv_intr.mask :
						    &entry->rcv_intr.used));
		cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
	}
	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
		  cpumask_pr_args(intrs_mask));

	cpumask_copy(hw_thread_mask, &set->mask);

	/*
	 * If HT cores are enabled, identify which HW threads within the
	 * physical cores should be used.
	 */
	if (affinity->num_core_siblings > 0) {
		for (i = 0; i < affinity->num_core_siblings; i++) {
			find_hw_thread_mask(i, hw_thread_mask, affinity);

			/*
			 * If there's at least one available core for this HW
			 * thread number, stop looking for a core.
			 *
			 * diff will be non-empty at least once in this loop
			 * as the used mask gets reset when
			 * (set->mask == set->used) before this loop.
			 */
			cpumask_andnot(diff, hw_thread_mask, &set->used);
			if (!cpumask_empty(diff))
				break;
		}
	}
	hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
		  cpumask_pr_args(hw_thread_mask));

	node_mask = cpumask_of_node(node);
	hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
		  cpumask_pr_args(node_mask));

	/* Get cpumask of available CPUs on preferred NUMA */
	cpumask_and(available_mask, hw_thread_mask, node_mask);
	cpumask_andnot(available_mask, available_mask, &set->used);
	hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
		  cpumask_pr_args(available_mask));

	/*
	 * At first, we don't want to place processes on the same
	 * CPUs as interrupt handlers. Then, CPUs running interrupt
	 * handlers are used.
	 *
	 * 1) If diff is not empty, then there are CPUs not running
	 *    interrupt handlers available, so diff gets copied
	 *    over to available_mask.
	 * 2) If diff is empty, then all CPUs not running interrupt
	 *    handlers are taken, so available_mask contains all
	 *    available CPUs running interrupt handlers.
	 * 3) If available_mask is empty, then all CPUs on the
	 *    preferred NUMA node are taken, so other NUMA nodes are
	 *    used for process assignments using the same method as
	 *    the preferred NUMA node.
	 */
	cpumask_andnot(diff, available_mask, intrs_mask);
	if (!cpumask_empty(diff))
		cpumask_copy(available_mask, diff);

	/* If we don't have CPUs on the preferred node, use other NUMA nodes */
	if (cpumask_empty(available_mask)) {
		cpumask_andnot(available_mask, hw_thread_mask, &set->used);
		/* Excluding preferred NUMA cores */
		cpumask_andnot(available_mask, available_mask, node_mask);
		hfi1_cdbg(PROC,
			  "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
			  cpumask_pr_args(available_mask));

		/*
		 * At first, we don't want to place processes on the same
		 * CPUs as interrupt handlers.
		 */
		cpumask_andnot(diff, available_mask, intrs_mask);
		if (!cpumask_empty(diff))
			cpumask_copy(available_mask, diff);
	}
	hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
		  cpumask_pr_args(available_mask));

	cpu = cpumask_first(available_mask);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -1;
	else
		cpumask_set_cpu(cpu, &set->used);

	mutex_unlock(&affinity->lock);
	hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);

	free_cpumask_var(intrs_mask);
free_available_mask:
	free_cpumask_var(available_mask);
free_hw_thread_mask:
	free_cpumask_var(hw_thread_mask);
free_diff:
	free_cpumask_var(diff);
done:
	return cpu;
}

void hfi1_put_proc_affinity(int cpu)
{
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	if (cpu < 0)
		return;

	mutex_lock(&affinity->lock);
	cpu_mask_set_put(set, cpu);
	hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
	mutex_unlock(&affinity->lock);
}