/*
 * Copyright(c) 2015, 2016 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/topology.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/interrupt.h>

#include "hfi.h"
#include "affinity.h"
#include "sdma.h"
#include "trace.h"

struct hfi1_affinity_node_list node_affinity = {
	.list = LIST_HEAD_INIT(node_affinity.list),
	.lock = __MUTEX_INITIALIZER(node_affinity.lock)
};

/* Name of IRQ types, indexed by enum irq_type */
static const char * const irq_type_names[] = {
	"SDMA",
	"RCVCTXT",
	"GENERAL",
	"OTHER",
};

/* Per NUMA node count of HFI devices */
static unsigned int *hfi1_per_node_cntr;

static inline void init_cpu_mask_set(struct cpu_mask_set *set)
{
	cpumask_clear(&set->mask);
	cpumask_clear(&set->used);
	set->gen = 0;
}
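/*
 * Accounting model for struct cpu_mask_set: 'mask' is the pool of
 * candidate CPUs, 'used' marks the CPUs currently handed out, and 'gen'
 * counts how many times the whole pool has been consumed.  When every
 * CPU in 'mask' is also in 'used', the allocators below bump 'gen' and
 * clear 'used' so CPUs start being shared (overloaded); when a CPU is
 * returned and 'used' drains while 'gen' is non-zero, 'gen' is
 * decremented and 'used' is refilled from 'mask' to undo one level of
 * that sharing.
 */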
/* Initialize non-HT cpu cores mask */
void init_real_cpu_mask(void)
{
	int possible, curr_cpu, i, ht;

	cpumask_clear(&node_affinity.real_cpu_mask);

	/* Start with cpu online mask as the real cpu mask */
	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);

	/*
	 * Remove HT cores from the real cpu mask. Do this in two steps below.
	 */
	possible = cpumask_weight(&node_affinity.real_cpu_mask);
	ht = cpumask_weight(topology_sibling_cpumask(
				cpumask_first(&node_affinity.real_cpu_mask)));
	/*
	 * Step 1. Skip over the first N HT siblings and use them as the
	 * "real" cores. Assumes that HT cores are not enumerated in
	 * succession (except in the single core case).
	 */
	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
	for (i = 0; i < possible / ht; i++)
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	/*
	 * Step 2. Remove the remaining HT siblings. Use cpumask_next() to
	 * skip any gaps.
	 */
	for (; i < possible; i++) {
		cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	}
}

int node_affinity_init(void)
{
	int node;
	struct pci_dev *dev = NULL;
	const struct pci_device_id *ids = hfi1_pci_tbl;

	cpumask_clear(&node_affinity.proc.used);
	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);

	node_affinity.proc.gen = 0;
	node_affinity.num_core_siblings =
				cpumask_weight(topology_sibling_cpumask(
					cpumask_first(&node_affinity.proc.mask)
					));
	node_affinity.num_online_nodes = num_online_nodes();
	node_affinity.num_online_cpus = num_online_cpus();

	/*
	 * The real cpu mask is part of the affinity struct but it has to be
	 * initialized early. It is needed to calculate the number of user
	 * contexts in set_up_context_variables().
	 */
	init_real_cpu_mask();

	hfi1_per_node_cntr = kcalloc(num_possible_nodes(),
				     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
	if (!hfi1_per_node_cntr)
		return -ENOMEM;

	while (ids->vendor) {
		dev = NULL;
		while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
			node = pcibus_to_node(dev->bus);
			if (node < 0)
				node = numa_node_id();

			hfi1_per_node_cntr[node]++;
		}
		ids++;
	}

	return 0;
}

void node_affinity_destroy(void)
{
	struct list_head *pos, *q;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	list_for_each_safe(pos, q, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node,
				   list);
		list_del(pos);
		kfree(entry);
	}
	mutex_unlock(&node_affinity.lock);
	kfree(hfi1_per_node_cntr);
}

static struct hfi1_affinity_node *node_affinity_allocate(int node)
{
	struct hfi1_affinity_node *entry;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return NULL;
	entry->node = node;
	INIT_LIST_HEAD(&entry->list);

	return entry;
}

/*
 * It appends an entry to the list.
 * It *must* be called with node_affinity.lock held.
 */
static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
{
	list_add_tail(&entry->list, &node_affinity.list);
}

/* It must be called with node_affinity.lock held */
static struct hfi1_affinity_node *node_affinity_lookup(int node)
{
	struct list_head *pos;
	struct hfi1_affinity_node *entry;

	list_for_each(pos, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node, list);
		if (entry->node == node)
			return entry;
	}

	return NULL;
}
/*
 * Interrupt affinity.
 *
 * Non-receive interrupts get a default mask that starts as the set of
 * possible CPUs with hyperthread siblings removed and with the CPUs
 * assigned to receive interrupts removed.
 *
 * Receive interrupts get node-relative CPUs starting at 1, wrapping
 * back to node-relative CPU 1 as necessary.
 */
int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
{
	int node = pcibus_to_node(dd->pcidev->bus);
	struct hfi1_affinity_node *entry;
	const struct cpumask *local_mask;
	int curr_cpu, possible, i;

	if (node < 0)
		node = numa_node_id();
	dd->node = node;

	local_mask = cpumask_of_node(dd->node);
	if (cpumask_first(local_mask) >= nr_cpu_ids)
		local_mask = topology_core_cpumask(0);

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	/*
	 * If this is the first time this NUMA node's affinity is used,
	 * create an entry in the global affinity structure and initialize it.
	 */
	if (!entry) {
		entry = node_affinity_allocate(node);
		if (!entry) {
			dd_dev_err(dd,
				   "Unable to allocate global affinity node\n");
			mutex_unlock(&node_affinity.lock);
			return -ENOMEM;
		}
		init_cpu_mask_set(&entry->def_intr);
		init_cpu_mask_set(&entry->rcv_intr);
		cpumask_clear(&entry->general_intr_mask);
		/* Use the "real" cpu mask of this node as the default */
		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
			    local_mask);

		/* fill in the receive list */
		possible = cpumask_weight(&entry->def_intr.mask);
		curr_cpu = cpumask_first(&entry->def_intr.mask);

		if (possible == 1) {
			/* only one CPU, everyone will use it */
			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
		} else {
			/*
			 * The general/control context will be the first CPU in
			 * the default list, so it is removed from the default
			 * list and added to the general interrupt list.
			 */
			cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
			curr_cpu = cpumask_next(curr_cpu,
						&entry->def_intr.mask);

			/*
			 * Remove the remaining kernel receive queues from
			 * the default list and add them to the receive list.
			 */
			for (i = 0;
			     i < (dd->n_krcv_queues - 1) *
				  hfi1_per_node_cntr[dd->node];
			     i++) {
				cpumask_clear_cpu(curr_cpu,
						  &entry->def_intr.mask);
				cpumask_set_cpu(curr_cpu,
						&entry->rcv_intr.mask);
				curr_cpu = cpumask_next(curr_cpu,
							&entry->def_intr.mask);
				if (curr_cpu >= nr_cpu_ids)
					break;
			}

			/*
			 * If there ends up being 0 CPU cores leftover for SDMA
			 * engines, use the same CPU cores as general/control
			 * context.
			 */
			if (cpumask_weight(&entry->def_intr.mask) == 0)
				cpumask_copy(&entry->def_intr.mask,
					     &entry->general_intr_mask);
		}

		node_affinity_add_tail(entry);
	}
	mutex_unlock(&node_affinity.lock);
	return 0;
}
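/*
 * Example of the resulting partition (hypothetical numbers, assuming a
 * single HFI on the node, dd->n_krcv_queues == 9, and a node "real" CPU
 * mask of 0-13): CPU 0 serves the general/control context, CPUs 1-8 the
 * remaining kernel receive contexts, and CPUs 9-13 stay in the default
 * mask for SDMA and other interrupts.
 */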
/*
 * Function updates the irq affinity hint for msix after it has been changed
 * by the user using the /proc/irq interface. This function only accepts
 * one cpu in the mask.
 */
static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
{
	struct sdma_engine *sde = msix->arg;
	struct hfi1_devdata *dd = sde->dd;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set;
	int i, old_cpu;

	if (cpu > num_online_cpus() || cpu == sde->cpu)
		return;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	old_cpu = sde->cpu;
	sde->cpu = cpu;
	cpumask_clear(&msix->mask);
	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_dbg(dd, "IRQ vector: %u, type %s engine %u -> cpu: %d\n",
		   msix->msix.vector, irq_type_names[msix->type],
		   sde->this_idx, cpu);
	irq_set_affinity_hint(msix->msix.vector, &msix->mask);

	/*
	 * Set the new cpu in the hfi1_affinity_node and clean
	 * the old cpu if it is not used by any other IRQ
	 */
	set = &entry->def_intr;
	cpumask_set_cpu(cpu, &set->mask);
	cpumask_set_cpu(cpu, &set->used);
	for (i = 0; i < dd->num_msix_entries; i++) {
		struct hfi1_msix_entry *other_msix;

		other_msix = &dd->msix_entries[i];
		if (other_msix->type != IRQ_SDMA || other_msix == msix)
			continue;

		if (cpumask_test_cpu(old_cpu, &other_msix->mask))
			goto unlock;
	}
	cpumask_clear_cpu(old_cpu, &set->mask);
	cpumask_clear_cpu(old_cpu, &set->used);
unlock:
	mutex_unlock(&node_affinity.lock);
}

static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify,
				     const cpumask_t *mask)
{
	int cpu = cpumask_first(mask);
	struct hfi1_msix_entry *msix = container_of(notify,
						    struct hfi1_msix_entry,
						    notify);

	/* Only one CPU configuration supported currently */
	hfi1_update_sdma_affinity(msix, cpu);
}

static void hfi1_irq_notifier_release(struct kref *ref)
{
	/*
	 * This is required by affinity notifier. We don't have anything to
	 * free here.
	 */
}

static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	notify->irq = msix->msix.vector;
	notify->notify = hfi1_irq_notifier_notify;
	notify->release = hfi1_irq_notifier_release;

	if (irq_set_affinity_notifier(notify->irq, notify))
		pr_err("Failed to register sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}

static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	if (irq_set_affinity_notifier(notify->irq, NULL))
		pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}
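/*
 * The notifiers above keep the driver's per-node SDMA accounting
 * (entry->def_intr.mask/used and sde->cpu) in sync when an administrator
 * changes an SDMA vector's affinity through /proc/irq/<irq>/smp_affinity.
 */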
/*
 * Function sets the irq affinity for msix.
 * It *must* be called with node_affinity.lock held.
 */
static int get_irq_affinity(struct hfi1_devdata *dd,
			    struct hfi1_msix_entry *msix)
{
	int ret;
	cpumask_var_t diff;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set = NULL;
	struct sdma_engine *sde = NULL;
	struct hfi1_ctxtdata *rcd = NULL;
	char extra[64];
	int cpu = -1;

	extra[0] = '\0';
	cpumask_clear(&msix->mask);

	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		return -ENOMEM;

	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		sde = (struct sdma_engine *)msix->arg;
		scnprintf(extra, 64, "engine %u", sde->this_idx);
		set = &entry->def_intr;
		break;
	case IRQ_GENERAL:
		cpu = cpumask_first(&entry->general_intr_mask);
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		if (rcd->ctxt == HFI1_CTRL_CTXT)
			cpu = cpumask_first(&entry->general_intr_mask);
		else
			set = &entry->rcv_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	default:
		dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
		/* Don't leak the scratch cpumask on the error path */
		free_cpumask_var(diff);
		return -EINVAL;
	}

	/*
	 * The general and control contexts are placed on a particular
	 * CPU, which is set above. Skip accounting for it. Everything else
	 * finds its CPU here.
	 */
	if (cpu == -1 && set) {
		if (cpumask_equal(&set->mask, &set->used)) {
			/*
			 * We've used up all the CPUs, bump up the generation
			 * and reset the 'used' map
			 */
			set->gen++;
			cpumask_clear(&set->used);
		}
		cpumask_andnot(diff, &set->mask, &set->used);
		cpu = cpumask_first(diff);
		cpumask_set_cpu(cpu, &set->used);
	}

	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_info(dd, "IRQ vector: %u, type %s %s -> cpu: %d\n",
		    msix->msix.vector, irq_type_names[msix->type],
		    extra, cpu);
	irq_set_affinity_hint(msix->msix.vector, &msix->mask);

	if (msix->type == IRQ_SDMA) {
		sde->cpu = cpu;
		hfi1_setup_sdma_notifier(msix);
	}

	free_cpumask_var(diff);
	return 0;
}

int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
{
	int ret;

	mutex_lock(&node_affinity.lock);
	ret = get_irq_affinity(dd, msix);
	mutex_unlock(&node_affinity.lock);
	return ret;
}

void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
			   struct hfi1_msix_entry *msix)
{
	struct cpu_mask_set *set = NULL;
	struct hfi1_ctxtdata *rcd;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		set = &entry->def_intr;
		hfi1_cleanup_sdma_notifier(msix);
		break;
	case IRQ_GENERAL:
		/* Don't do accounting for general contexts */
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		/* Don't do accounting for control contexts */
		if (rcd->ctxt != HFI1_CTRL_CTXT)
			set = &entry->rcv_intr;
		break;
	default:
		mutex_unlock(&node_affinity.lock);
		return;
	}

	if (set) {
		cpumask_andnot(&set->used, &set->used, &msix->mask);
		if (cpumask_empty(&set->used) && set->gen) {
			set->gen--;
			cpumask_copy(&set->used, &set->mask);
		}
	}

	irq_set_affinity_hint(msix->msix.vector, NULL);
	cpumask_clear(&msix->mask);
	mutex_unlock(&node_affinity.lock);
}
/* This should be called with node_affinity.lock held */
static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
				struct hfi1_affinity_node_list *affinity)
{
	int possible, curr_cpu, i;
	uint num_cores_per_socket = node_affinity.num_online_cpus /
					affinity->num_core_siblings /
						node_affinity.num_online_nodes;

	cpumask_copy(hw_thread_mask, &affinity->proc.mask);
	if (affinity->num_core_siblings > 0) {
		/* Removing other siblings not needed for now */
		possible = cpumask_weight(hw_thread_mask);
		curr_cpu = cpumask_first(hw_thread_mask);
		for (i = 0;
		     i < num_cores_per_socket * node_affinity.num_online_nodes;
		     i++)
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);

		for (; i < possible; i++) {
			cpumask_clear_cpu(curr_cpu, hw_thread_mask);
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
		}

		/* Identifying correct HW threads within physical cores */
		cpumask_shift_left(hw_thread_mask, hw_thread_mask,
				   num_cores_per_socket *
				   node_affinity.num_online_nodes *
				   hw_thread_no);
	}
}
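/*
 * Worked example for find_hw_thread_mask() (hypothetical topology,
 * assuming sibling threads occupy the upper half of the CPU ID space):
 * with 56 online CPUs, 2 siblings per core, and 2 online nodes,
 * num_cores_per_socket is 14.  For hw_thread_no == 0 the mask is trimmed
 * to CPUs 0-27 (the first HW thread of every core); for hw_thread_no == 1
 * that mask is shifted left by 28 to select CPUs 28-55, the second HW
 * thread of every core.
 */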
int hfi1_get_proc_affinity(int node)
{
	int cpu = -1, ret, i;
	struct hfi1_affinity_node *entry;
	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
	const struct cpumask *node_mask,
		*proc_mask = tsk_cpus_allowed(current);
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	/*
	 * check whether process/context affinity has already
	 * been set
	 */
	if (cpumask_weight(proc_mask) == 1) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		/*
		 * Mark the pre-set CPU as used. This is atomic so we don't
		 * need the lock
		 */
		cpu = cpumask_first(proc_mask);
		cpumask_set_cpu(cpu, &set->used);
		goto done;
	} else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		goto done;
	}

	/*
	 * The process does not have a preset CPU affinity so find one to
	 * recommend using the following algorithm:
	 *
	 * For each user process that is opening a context on HFI Y:
	 *  a) If all cores are filled, reinitialize the bitmask
	 *  b) Fill real cores first, then HT cores (First set of HT
	 *     cores on all physical cores, then second set of HT core,
	 *     and, so on) in the following order:
	 *
	 *     1. Same NUMA node as HFI Y and not running an IRQ
	 *        handler
	 *     2. Same NUMA node as HFI Y and running an IRQ handler
	 *     3. Different NUMA node to HFI Y and not running an IRQ
	 *        handler
	 *     4. Different NUMA node to HFI Y and running an IRQ
	 *        handler
	 *  c) Mark core as filled in the bitmask. As user processes are
	 *     done, clear cores from the bitmask.
	 */

	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		goto done;
	ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
	if (!ret)
		goto free_diff;
	ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
	if (!ret)
		goto free_hw_thread_mask;
	ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
	if (!ret)
		goto free_available_mask;

	mutex_lock(&affinity->lock);
	/*
	 * If we've used all available HW threads, clear the mask and start
	 * overloading.
	 */
	if (cpumask_equal(&set->mask, &set->used)) {
		set->gen++;
		cpumask_clear(&set->used);
	}

	/*
	 * If NUMA node has CPUs used by interrupt handlers, include them in the
	 * interrupt handler mask.
	 */
	entry = node_affinity_lookup(node);
	if (entry) {
		cpumask_copy(intrs_mask, (entry->def_intr.gen ?
					  &entry->def_intr.mask :
					  &entry->def_intr.used));
		cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
						    &entry->rcv_intr.mask :
						    &entry->rcv_intr.used));
		cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
	}
	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
		  cpumask_pr_args(intrs_mask));

	cpumask_copy(hw_thread_mask, &set->mask);

	/*
	 * If HT cores are enabled, identify which HW threads within the
	 * physical cores should be used.
	 */
	if (affinity->num_core_siblings > 0) {
		for (i = 0; i < affinity->num_core_siblings; i++) {
			find_hw_thread_mask(i, hw_thread_mask, affinity);

			/*
			 * If there's at least one available core for this HW
			 * thread number, stop looking for a core.
			 *
			 * diff will always be not empty at least once in this
			 * loop as the used mask gets reset when
			 * (set->mask == set->used) before this loop.
			 */
			cpumask_andnot(diff, hw_thread_mask, &set->used);
			if (!cpumask_empty(diff))
				break;
		}
	}
	hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
		  cpumask_pr_args(hw_thread_mask));

	node_mask = cpumask_of_node(node);
	hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
		  cpumask_pr_args(node_mask));

	/* Get cpumask of available CPUs on preferred NUMA */
	cpumask_and(available_mask, hw_thread_mask, node_mask);
	cpumask_andnot(available_mask, available_mask, &set->used);
	hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
		  cpumask_pr_args(available_mask));

	/*
	 * At first, we don't want to place processes on the same
	 * CPUs as interrupt handlers. Then, CPUs running interrupt
	 * handlers are used.
	 *
	 * 1) If diff is not empty, then there are CPUs not running
	 *    interrupt handlers available, so diff gets copied
	 *    over to available_mask.
	 * 2) If diff is empty, then all CPUs not running interrupt
	 *    handlers are taken, so available_mask contains all
	 *    available CPUs running interrupt handlers.
	 * 3) If available_mask is empty, then all CPUs on the
	 *    preferred NUMA node are taken, so other NUMA nodes are
	 *    used for process assignments using the same method as
	 *    the preferred NUMA node.
	 */
	cpumask_andnot(diff, available_mask, intrs_mask);
	if (!cpumask_empty(diff))
		cpumask_copy(available_mask, diff);

	/* If we don't have CPUs on the preferred node, use other NUMA nodes */
	if (cpumask_empty(available_mask)) {
		cpumask_andnot(available_mask, hw_thread_mask, &set->used);
		/* Excluding preferred NUMA cores */
		cpumask_andnot(available_mask, available_mask, node_mask);
		hfi1_cdbg(PROC,
			  "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
			  cpumask_pr_args(available_mask));

		/*
		 * At first, we don't want to place processes on the same
		 * CPUs as interrupt handlers.
		 */
		cpumask_andnot(diff, available_mask, intrs_mask);
		if (!cpumask_empty(diff))
			cpumask_copy(available_mask, diff);
	}
	hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
		  cpumask_pr_args(available_mask));

	cpu = cpumask_first(available_mask);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -1;
	else
		cpumask_set_cpu(cpu, &set->used);

	mutex_unlock(&affinity->lock);
	hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);

	free_cpumask_var(intrs_mask);
free_available_mask:
	free_cpumask_var(available_mask);
free_hw_thread_mask:
	free_cpumask_var(hw_thread_mask);
free_diff:
	free_cpumask_var(diff);
done:
	return cpu;
}
void hfi1_put_proc_affinity(int cpu)
{
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	if (cpu < 0)
		return;

	mutex_lock(&affinity->lock);
	cpumask_clear_cpu(cpu, &set->used);
	hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
	if (cpumask_empty(&set->used) && set->gen) {
		set->gen--;
		cpumask_copy(&set->used, &set->mask);
	}
	mutex_unlock(&affinity->lock);
}

int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf,
			   size_t count)
{
	struct hfi1_affinity_node *entry;
	cpumask_var_t mask;
	int ret, i;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	if (!entry) {
		ret = -EINVAL;
		goto unlock;
	}

	ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
	if (!ret) {
		ret = -ENOMEM;
		goto unlock;
	}

	ret = cpulist_parse(buf, mask);
	if (ret)
		goto out;

	if (!cpumask_subset(mask, cpu_online_mask) || cpumask_empty(mask)) {
		dd_dev_warn(dd, "Invalid CPU mask\n");
		ret = -EINVAL;
		goto out;
	}

	/* reset the SDMA interrupt affinity details */
	init_cpu_mask_set(&entry->def_intr);
	cpumask_copy(&entry->def_intr.mask, mask);

	/* Reassign the affinity for each SDMA interrupt. */
	for (i = 0; i < dd->num_msix_entries; i++) {
		struct hfi1_msix_entry *msix;

		msix = &dd->msix_entries[i];
		if (msix->type != IRQ_SDMA)
			continue;

		ret = get_irq_affinity(dd, msix);

		if (ret)
			break;
	}
out:
	free_cpumask_var(mask);
unlock:
	mutex_unlock(&node_affinity.lock);
	return ret ? ret : strnlen(buf, PAGE_SIZE);
}

int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf)
{
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	if (!entry) {
		mutex_unlock(&node_affinity.lock);
		return -EINVAL;
	}

	cpumap_print_to_pagebuf(true, buf, &entry->def_intr.mask);
	mutex_unlock(&node_affinity.lock);
	return strnlen(buf, PAGE_SIZE);
}