1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause 2 /* 3 * Copyright(c) 2015 - 2020 Intel Corporation. 4 * Copyright(c) 2021 Cornelis Networks. 5 */ 6 7 #include <linux/pci.h> 8 #include <linux/netdevice.h> 9 #include <linux/vmalloc.h> 10 #include <linux/delay.h> 11 #include <linux/xarray.h> 12 #include <linux/module.h> 13 #include <linux/printk.h> 14 #include <linux/hrtimer.h> 15 #include <linux/bitmap.h> 16 #include <linux/numa.h> 17 #include <rdma/rdma_vt.h> 18 19 #include "hfi.h" 20 #include "device.h" 21 #include "common.h" 22 #include "trace.h" 23 #include "mad.h" 24 #include "sdma.h" 25 #include "debugfs.h" 26 #include "verbs.h" 27 #include "aspm.h" 28 #include "affinity.h" 29 #include "vnic.h" 30 #include "exp_rcv.h" 31 #include "netdev.h" 32 33 #undef pr_fmt 34 #define pr_fmt(fmt) DRIVER_NAME ": " fmt 35 36 /* 37 * min buffers we want to have per context, after driver 38 */ 39 #define HFI1_MIN_USER_CTXT_BUFCNT 7 40 41 #define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */ 42 #define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */ 43 44 #define NUM_IB_PORTS 1 45 46 /* 47 * Number of user receive contexts we are configured to use (to allow for more 48 * pio buffers per ctxt, etc.) Zero means use one user context per CPU. 49 */ 50 int num_user_contexts = -1; 51 module_param_named(num_user_contexts, num_user_contexts, int, 0444); 52 MODULE_PARM_DESC( 53 num_user_contexts, "Set max number of user contexts to use (default: -1 will use the real (non-HT) CPU count)"); 54 55 uint krcvqs[RXE_NUM_DATA_VL]; 56 int krcvqsset; 57 module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO); 58 MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL"); 59 60 /* computed based on above array */ 61 unsigned long n_krcvqs; 62 63 static unsigned hfi1_rcvarr_split = 25; 64 module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO); 65 MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers"); 66 67 static uint eager_buffer_size = (8 << 20); /* 8MB */ 68 module_param(eager_buffer_size, uint, S_IRUGO); 69 MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 8MB"); 70 71 static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */ 72 module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO); 73 MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)"); 74 75 static uint hfi1_hdrq_entsize = 32; 76 module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, 0444); 77 MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)"); 78 79 unsigned int user_credit_return_threshold = 33; /* default is 33% */ 80 module_param(user_credit_return_threshold, uint, S_IRUGO); 81 MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)"); 82 83 DEFINE_XARRAY_FLAGS(hfi1_dev_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); 84 85 static int hfi1_create_kctxt(struct hfi1_devdata *dd, 86 struct hfi1_pportdata *ppd) 87 { 88 struct hfi1_ctxtdata *rcd; 89 int ret; 90 91 /* Control context has to be always 0 */ 92 BUILD_BUG_ON(HFI1_CTRL_CTXT != 0); 93 94 ret = hfi1_create_ctxtdata(ppd, dd->node, &rcd); 95 if (ret < 0) { 96 dd_dev_err(dd, "Kernel receive context allocation failed\n"); 97 return ret; 98 } 99 100 /* 101 * Set up the kernel context flags here and now because they use 102 * default values for all receive side memories. 
User contexts will 103 * be handled as they are created. 104 */ 105 rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) | 106 HFI1_CAP_KGET(NODROP_RHQ_FULL) | 107 HFI1_CAP_KGET(NODROP_EGR_FULL) | 108 HFI1_CAP_KGET(DMA_RTAIL); 109 110 /* Control context must use DMA_RTAIL */ 111 if (rcd->ctxt == HFI1_CTRL_CTXT) 112 rcd->flags |= HFI1_CAP_DMA_RTAIL; 113 rcd->fast_handler = get_dma_rtail_setting(rcd) ? 114 handle_receive_interrupt_dma_rtail : 115 handle_receive_interrupt_nodma_rtail; 116 rcd->slow_handler = handle_receive_interrupt; 117 118 hfi1_set_seq_cnt(rcd, 1); 119 120 rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node); 121 if (!rcd->sc) { 122 dd_dev_err(dd, "Kernel send context allocation failed\n"); 123 return -ENOMEM; 124 } 125 hfi1_init_ctxt(rcd->sc); 126 127 return 0; 128 } 129 130 /* 131 * Create the receive context array and one or more kernel contexts 132 */ 133 int hfi1_create_kctxts(struct hfi1_devdata *dd) 134 { 135 u16 i; 136 int ret; 137 138 dd->rcd = kcalloc_node(dd->num_rcv_contexts, sizeof(*dd->rcd), 139 GFP_KERNEL, dd->node); 140 if (!dd->rcd) 141 return -ENOMEM; 142 143 for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) { 144 ret = hfi1_create_kctxt(dd, dd->pport); 145 if (ret) 146 goto bail; 147 } 148 149 return 0; 150 bail: 151 for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) 152 hfi1_free_ctxt(dd->rcd[i]); 153 154 /* All the contexts should be freed, free the array */ 155 kfree(dd->rcd); 156 dd->rcd = NULL; 157 return ret; 158 } 159 160 /* 161 * Helper routines for the receive context reference count (rcd and uctxt). 162 */ 163 static void hfi1_rcd_init(struct hfi1_ctxtdata *rcd) 164 { 165 kref_init(&rcd->kref); 166 } 167 168 /** 169 * hfi1_rcd_free - When reference is zero clean up. 170 * @kref: pointer to an initialized rcd data structure 171 * 172 */ 173 static void hfi1_rcd_free(struct kref *kref) 174 { 175 unsigned long flags; 176 struct hfi1_ctxtdata *rcd = 177 container_of(kref, struct hfi1_ctxtdata, kref); 178 179 spin_lock_irqsave(&rcd->dd->uctxt_lock, flags); 180 rcd->dd->rcd[rcd->ctxt] = NULL; 181 spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags); 182 183 hfi1_free_ctxtdata(rcd->dd, rcd); 184 185 kfree(rcd); 186 } 187 188 /** 189 * hfi1_rcd_put - decrement reference for rcd 190 * @rcd: pointer to an initialized rcd data structure 191 * 192 * Use this to put a reference after the init. 193 */ 194 int hfi1_rcd_put(struct hfi1_ctxtdata *rcd) 195 { 196 if (rcd) 197 return kref_put(&rcd->kref, hfi1_rcd_free); 198 199 return 0; 200 } 201 202 /** 203 * hfi1_rcd_get - increment reference for rcd 204 * @rcd: pointer to an initialized rcd data structure 205 * 206 * Use this to get a reference after the init. 207 * 208 * Return : reflect kref_get_unless_zero(), which returns non-zero on 209 * increment, otherwise 0. 210 */ 211 int hfi1_rcd_get(struct hfi1_ctxtdata *rcd) 212 { 213 return kref_get_unless_zero(&rcd->kref); 214 } 215 216 /** 217 * allocate_rcd_index - allocate an rcd index from the rcd array 218 * @dd: pointer to a valid devdata structure 219 * @rcd: rcd data structure to assign 220 * @index: pointer to index that is allocated 221 * 222 * Find an empty index in the rcd array, and assign the given rcd to it. 223 * If the array is full, we are EBUSY. 
 *
 */
static int allocate_rcd_index(struct hfi1_devdata *dd,
                              struct hfi1_ctxtdata *rcd, u16 *index)
{
        unsigned long flags;
        u16 ctxt;

        spin_lock_irqsave(&dd->uctxt_lock, flags);
        for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++)
                if (!dd->rcd[ctxt])
                        break;

        if (ctxt < dd->num_rcv_contexts) {
                rcd->ctxt = ctxt;
                dd->rcd[ctxt] = rcd;
                hfi1_rcd_init(rcd);
        }
        spin_unlock_irqrestore(&dd->uctxt_lock, flags);

        if (ctxt >= dd->num_rcv_contexts)
                return -EBUSY;

        *index = ctxt;

        return 0;
}

/**
 * hfi1_rcd_get_by_index_safe - validate the ctxt index before accessing the
 * array
 * @dd: pointer to a valid devdata structure
 * @ctxt: the index of a possible rcd
 *
 * This is a wrapper for hfi1_rcd_get_by_index() to validate that the given
 * ctxt index is valid.
 *
 * The caller is responsible for making the _put().
 *
 */
struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd,
                                                 u16 ctxt)
{
        if (ctxt < dd->num_rcv_contexts)
                return hfi1_rcd_get_by_index(dd, ctxt);

        return NULL;
}

/**
 * hfi1_rcd_get_by_index - get by index
 * @dd: pointer to a valid devdata structure
 * @ctxt: the index of a possible rcd
 *
 * We need to protect access to the rcd array. If access is needed to
 * one or more indices, get the protecting spinlock and then increment the
 * kref.
 *
 * The caller is responsible for making the _put().
 *
 */
struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt)
{
        unsigned long flags;
        struct hfi1_ctxtdata *rcd = NULL;

        spin_lock_irqsave(&dd->uctxt_lock, flags);
        if (dd->rcd[ctxt]) {
                rcd = dd->rcd[ctxt];
                if (!hfi1_rcd_get(rcd))
                        rcd = NULL;
        }
        spin_unlock_irqrestore(&dd->uctxt_lock, flags);

        return rcd;
}

/*
 * Common code for user and kernel context create and setup.
 * NOTE: the initial kref is done here (hfi1_rcd_init()).
 */
int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
                         struct hfi1_ctxtdata **context)
{
        struct hfi1_devdata *dd = ppd->dd;
        struct hfi1_ctxtdata *rcd;
        unsigned kctxt_ngroups = 0;
        u32 base;

        if (dd->rcv_entries.nctxt_extra >
            dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt)
                kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
                                 (dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt));
        rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, numa);
        if (rcd) {
                u32 rcvtids, max_entries;
                u16 ctxt;
                int ret;

                ret = allocate_rcd_index(dd, rcd, &ctxt);
                if (ret) {
                        *context = NULL;
                        kfree(rcd);
                        return ret;
                }

                INIT_LIST_HEAD(&rcd->qp_wait_list);
                hfi1_exp_tid_group_init(rcd);
                rcd->ppd = ppd;
                rcd->dd = dd;
                rcd->numa_id = numa;
                rcd->rcv_array_groups = dd->rcv_entries.ngroups;
                rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
                rcd->msix_intr = CCE_NUM_MSIX_VECTORS;

                mutex_init(&rcd->exp_mutex);
                spin_lock_init(&rcd->exp_lock);
                INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
                INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);

                hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);

                /*
                 * Calculate the context's RcvArray entry starting point.
348 * We do this here because we have to take into account all 349 * the RcvArray entries that previous context would have 350 * taken and we have to account for any extra groups assigned 351 * to the static (kernel) or dynamic (vnic/user) contexts. 352 */ 353 if (ctxt < dd->first_dyn_alloc_ctxt) { 354 if (ctxt < kctxt_ngroups) { 355 base = ctxt * (dd->rcv_entries.ngroups + 1); 356 rcd->rcv_array_groups++; 357 } else { 358 base = kctxt_ngroups + 359 (ctxt * dd->rcv_entries.ngroups); 360 } 361 } else { 362 u16 ct = ctxt - dd->first_dyn_alloc_ctxt; 363 364 base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) + 365 kctxt_ngroups); 366 if (ct < dd->rcv_entries.nctxt_extra) { 367 base += ct * (dd->rcv_entries.ngroups + 1); 368 rcd->rcv_array_groups++; 369 } else { 370 base += dd->rcv_entries.nctxt_extra + 371 (ct * dd->rcv_entries.ngroups); 372 } 373 } 374 rcd->eager_base = base * dd->rcv_entries.group_size; 375 376 rcd->rcvhdrq_cnt = rcvhdrcnt; 377 rcd->rcvhdrqentsize = hfi1_hdrq_entsize; 378 rcd->rhf_offset = 379 rcd->rcvhdrqentsize - sizeof(u64) / sizeof(u32); 380 /* 381 * Simple Eager buffer allocation: we have already pre-allocated 382 * the number of RcvArray entry groups. Each ctxtdata structure 383 * holds the number of groups for that context. 384 * 385 * To follow CSR requirements and maintain cacheline alignment, 386 * make sure all sizes and bases are multiples of group_size. 387 * 388 * The expected entry count is what is left after assigning 389 * eager. 390 */ 391 max_entries = rcd->rcv_array_groups * 392 dd->rcv_entries.group_size; 393 rcvtids = ((max_entries * hfi1_rcvarr_split) / 100); 394 rcd->egrbufs.count = round_down(rcvtids, 395 dd->rcv_entries.group_size); 396 if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) { 397 dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n", 398 rcd->ctxt); 399 rcd->egrbufs.count = MAX_EAGER_ENTRIES; 400 } 401 hfi1_cdbg(PROC, 402 "ctxt%u: max Eager buffer RcvArray entries: %u\n", 403 rcd->ctxt, rcd->egrbufs.count); 404 405 /* 406 * Allocate array that will hold the eager buffer accounting 407 * data. 408 * This will allocate the maximum possible buffer count based 409 * on the value of the RcvArray split parameter. 410 * The resulting value will be rounded down to the closest 411 * multiple of dd->rcv_entries.group_size. 412 */ 413 rcd->egrbufs.buffers = 414 kcalloc_node(rcd->egrbufs.count, 415 sizeof(*rcd->egrbufs.buffers), 416 GFP_KERNEL, numa); 417 if (!rcd->egrbufs.buffers) 418 goto bail; 419 rcd->egrbufs.rcvtids = 420 kcalloc_node(rcd->egrbufs.count, 421 sizeof(*rcd->egrbufs.rcvtids), 422 GFP_KERNEL, numa); 423 if (!rcd->egrbufs.rcvtids) 424 goto bail; 425 rcd->egrbufs.size = eager_buffer_size; 426 /* 427 * The size of the buffers programmed into the RcvArray 428 * entries needs to be big enough to handle the highest 429 * MTU supported. 430 */ 431 if (rcd->egrbufs.size < hfi1_max_mtu) { 432 rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu); 433 hfi1_cdbg(PROC, 434 "ctxt%u: eager bufs size too small. 
Adjusting to %u\n",
                                  rcd->ctxt, rcd->egrbufs.size);
                }
                rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;

                /* Applicable only for statically created kernel contexts */
                if (ctxt < dd->first_dyn_alloc_ctxt) {
                        rcd->opstats = kzalloc_node(sizeof(*rcd->opstats),
                                                    GFP_KERNEL, numa);
                        if (!rcd->opstats)
                                goto bail;

                        /* Initialize TID flow generations for the context */
                        hfi1_kern_init_ctxt_generations(rcd);
                }

                *context = rcd;
                return 0;
        }

bail:
        *context = NULL;
        hfi1_free_ctxt(rcd);
        return -ENOMEM;
}

/**
 * hfi1_free_ctxt - free context
 * @rcd: pointer to an initialized rcd data structure
 *
 * This wrapper is the free function that matches hfi1_create_ctxtdata().
 * When a context is done being used (kernel or user), this function is called
 * for the "final" put to match the kref init from hfi1_create_ctxtdata().
 * Other users of the context do a get/put sequence to make sure that the
 * structure isn't removed while in use.
 */
void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd)
{
        hfi1_rcd_put(rcd);
}

/*
 * Select the largest ccti value over all SLs to determine the intra-
 * packet gap for the link.
 *
 * Called with cca_timer_lock held (to protect access to the cca_timer
 * array), and rcu_read_lock() (to protect access to cc_state).
 */
void set_link_ipg(struct hfi1_pportdata *ppd)
{
        struct hfi1_devdata *dd = ppd->dd;
        struct cc_state *cc_state;
        int i;
        u16 cce, ccti_limit, max_ccti = 0;
        u16 shift, mult;
        u64 src;
        u32 current_egress_rate; /* Mbits/sec */
        u32 max_pkt_time;
        /*
         * max_pkt_time is the maximum packet egress time in units
         * of the fabric clock period 1/(805 MHz).
         */

        cc_state = get_cc_state(ppd);

        if (!cc_state)
                /*
                 * This should _never_ happen - rcu_read_lock() is held,
                 * and set_link_ipg() should not be called if cc_state
                 * is NULL.
504 */ 505 return; 506 507 for (i = 0; i < OPA_MAX_SLS; i++) { 508 u16 ccti = ppd->cca_timer[i].ccti; 509 510 if (ccti > max_ccti) 511 max_ccti = ccti; 512 } 513 514 ccti_limit = cc_state->cct.ccti_limit; 515 if (max_ccti > ccti_limit) 516 max_ccti = ccti_limit; 517 518 cce = cc_state->cct.entries[max_ccti].entry; 519 shift = (cce & 0xc000) >> 14; 520 mult = (cce & 0x3fff); 521 522 current_egress_rate = active_egress_rate(ppd); 523 524 max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate); 525 526 src = (max_pkt_time >> shift) * mult; 527 528 src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK; 529 src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT; 530 531 write_csr(dd, SEND_STATIC_RATE_CONTROL, src); 532 } 533 534 static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) 535 { 536 struct cca_timer *cca_timer; 537 struct hfi1_pportdata *ppd; 538 int sl; 539 u16 ccti_timer, ccti_min; 540 struct cc_state *cc_state; 541 unsigned long flags; 542 enum hrtimer_restart ret = HRTIMER_NORESTART; 543 544 cca_timer = container_of(t, struct cca_timer, hrtimer); 545 ppd = cca_timer->ppd; 546 sl = cca_timer->sl; 547 548 rcu_read_lock(); 549 550 cc_state = get_cc_state(ppd); 551 552 if (!cc_state) { 553 rcu_read_unlock(); 554 return HRTIMER_NORESTART; 555 } 556 557 /* 558 * 1) decrement ccti for SL 559 * 2) calculate IPG for link (set_link_ipg()) 560 * 3) restart timer, unless ccti is at min value 561 */ 562 563 ccti_min = cc_state->cong_setting.entries[sl].ccti_min; 564 ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer; 565 566 spin_lock_irqsave(&ppd->cca_timer_lock, flags); 567 568 if (cca_timer->ccti > ccti_min) { 569 cca_timer->ccti--; 570 set_link_ipg(ppd); 571 } 572 573 if (cca_timer->ccti > ccti_min) { 574 unsigned long nsec = 1024 * ccti_timer; 575 /* ccti_timer is in units of 1.024 usec */ 576 hrtimer_forward_now(t, ns_to_ktime(nsec)); 577 ret = HRTIMER_RESTART; 578 } 579 580 spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); 581 rcu_read_unlock(); 582 return ret; 583 } 584 585 /* 586 * Common code for initializing the physical port structure. 587 */ 588 void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, 589 struct hfi1_devdata *dd, u8 hw_pidx, u32 port) 590 { 591 int i; 592 uint default_pkey_idx; 593 struct cc_state *cc_state; 594 595 ppd->dd = dd; 596 ppd->hw_pidx = hw_pidx; 597 ppd->port = port; /* IB port number, not index */ 598 ppd->prev_link_width = LINK_WIDTH_DEFAULT; 599 /* 600 * There are C_VL_COUNT number of PortVLXmitWait counters. 601 * Adding 1 to C_VL_COUNT to include the PortXmitWait counter. 
602 */ 603 for (i = 0; i < C_VL_COUNT + 1; i++) { 604 ppd->port_vl_xmit_wait_last[i] = 0; 605 ppd->vl_xmit_flit_cnt[i] = 0; 606 } 607 608 default_pkey_idx = 1; 609 610 ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY; 611 ppd->part_enforce |= HFI1_PART_ENFORCE_IN; 612 ppd->pkeys[0] = 0x8001; 613 614 INIT_WORK(&ppd->link_vc_work, handle_verify_cap); 615 INIT_WORK(&ppd->link_up_work, handle_link_up); 616 INIT_WORK(&ppd->link_down_work, handle_link_down); 617 INIT_WORK(&ppd->freeze_work, handle_freeze); 618 INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade); 619 INIT_WORK(&ppd->sma_message_work, handle_sma_message); 620 INIT_WORK(&ppd->link_bounce_work, handle_link_bounce); 621 INIT_DELAYED_WORK(&ppd->start_link_work, handle_start_link); 622 INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work); 623 INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event); 624 625 mutex_init(&ppd->hls_lock); 626 spin_lock_init(&ppd->qsfp_info.qsfp_lock); 627 628 ppd->qsfp_info.ppd = ppd; 629 ppd->sm_trap_qp = 0x0; 630 ppd->sa_qp = 0x1; 631 632 ppd->hfi1_wq = NULL; 633 634 spin_lock_init(&ppd->cca_timer_lock); 635 636 for (i = 0; i < OPA_MAX_SLS; i++) { 637 hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC, 638 HRTIMER_MODE_REL); 639 ppd->cca_timer[i].ppd = ppd; 640 ppd->cca_timer[i].sl = i; 641 ppd->cca_timer[i].ccti = 0; 642 ppd->cca_timer[i].hrtimer.function = cca_timer_fn; 643 } 644 645 ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT; 646 647 spin_lock_init(&ppd->cc_state_lock); 648 spin_lock_init(&ppd->cc_log_lock); 649 cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL); 650 RCU_INIT_POINTER(ppd->cc_state, cc_state); 651 if (!cc_state) 652 goto bail; 653 return; 654 655 bail: 656 dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port); 657 } 658 659 /* 660 * Do initialization for device that is only needed on 661 * first detect, not on resets. 662 */ 663 static int loadtime_init(struct hfi1_devdata *dd) 664 { 665 return 0; 666 } 667 668 /** 669 * init_after_reset - re-initialize after a reset 670 * @dd: the hfi1_ib device 671 * 672 * sanity check at least some of the values after reset, and 673 * ensure no receive or transmit (explicitly, in case reset 674 * failed 675 */ 676 static int init_after_reset(struct hfi1_devdata *dd) 677 { 678 int i; 679 struct hfi1_ctxtdata *rcd; 680 /* 681 * Ensure chip does no sends or receives, tail updates, or 682 * pioavail updates while we re-initialize. This is mostly 683 * for the driver data structures, not chip registers. 684 */ 685 for (i = 0; i < dd->num_rcv_contexts; i++) { 686 rcd = hfi1_rcd_get_by_index(dd, i); 687 hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | 688 HFI1_RCVCTRL_INTRAVAIL_DIS | 689 HFI1_RCVCTRL_TAILUPD_DIS, rcd); 690 hfi1_rcd_put(rcd); 691 } 692 pio_send_control(dd, PSC_GLOBAL_DISABLE); 693 for (i = 0; i < dd->num_send_contexts; i++) 694 sc_disable(dd->send_contexts[i].sc); 695 696 return 0; 697 } 698 699 static void enable_chip(struct hfi1_devdata *dd) 700 { 701 struct hfi1_ctxtdata *rcd; 702 u32 rcvmask; 703 u16 i; 704 705 /* enable PIO send */ 706 pio_send_control(dd, PSC_GLOBAL_ENABLE); 707 708 /* 709 * Enable kernel ctxts' receive and receive interrupt. 710 * Other ctxts done as user opens and initializes them. 711 */ 712 for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) { 713 rcd = hfi1_rcd_get_by_index(dd, i); 714 if (!rcd) 715 continue; 716 rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB; 717 rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ? 
718 HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; 719 if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) 720 rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; 721 if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_RHQ_FULL)) 722 rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; 723 if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL)) 724 rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; 725 if (HFI1_CAP_IS_KSET(TID_RDMA)) 726 rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB; 727 hfi1_rcvctrl(dd, rcvmask, rcd); 728 sc_enable(rcd->sc); 729 hfi1_rcd_put(rcd); 730 } 731 } 732 733 /** 734 * create_workqueues - create per port workqueues 735 * @dd: the hfi1_ib device 736 */ 737 static int create_workqueues(struct hfi1_devdata *dd) 738 { 739 int pidx; 740 struct hfi1_pportdata *ppd; 741 742 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 743 ppd = dd->pport + pidx; 744 if (!ppd->hfi1_wq) { 745 ppd->hfi1_wq = 746 alloc_workqueue( 747 "hfi%d_%d", 748 WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | 749 WQ_MEM_RECLAIM, 750 HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES, 751 dd->unit, pidx); 752 if (!ppd->hfi1_wq) 753 goto wq_error; 754 } 755 if (!ppd->link_wq) { 756 /* 757 * Make the link workqueue single-threaded to enforce 758 * serialization. 759 */ 760 ppd->link_wq = 761 alloc_workqueue( 762 "hfi_link_%d_%d", 763 WQ_SYSFS | WQ_MEM_RECLAIM | WQ_UNBOUND, 764 1, /* max_active */ 765 dd->unit, pidx); 766 if (!ppd->link_wq) 767 goto wq_error; 768 } 769 } 770 return 0; 771 wq_error: 772 pr_err("alloc_workqueue failed for port %d\n", pidx + 1); 773 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 774 ppd = dd->pport + pidx; 775 if (ppd->hfi1_wq) { 776 destroy_workqueue(ppd->hfi1_wq); 777 ppd->hfi1_wq = NULL; 778 } 779 if (ppd->link_wq) { 780 destroy_workqueue(ppd->link_wq); 781 ppd->link_wq = NULL; 782 } 783 } 784 return -ENOMEM; 785 } 786 787 /** 788 * destroy_workqueues - destroy per port workqueues 789 * @dd: the hfi1_ib device 790 */ 791 static void destroy_workqueues(struct hfi1_devdata *dd) 792 { 793 int pidx; 794 struct hfi1_pportdata *ppd; 795 796 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 797 ppd = dd->pport + pidx; 798 799 if (ppd->hfi1_wq) { 800 destroy_workqueue(ppd->hfi1_wq); 801 ppd->hfi1_wq = NULL; 802 } 803 if (ppd->link_wq) { 804 destroy_workqueue(ppd->link_wq); 805 ppd->link_wq = NULL; 806 } 807 } 808 } 809 810 /** 811 * enable_general_intr() - Enable the IRQs that will be handled by the 812 * general interrupt handler. 813 * @dd: valid devdata 814 * 815 */ 816 static void enable_general_intr(struct hfi1_devdata *dd) 817 { 818 set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true); 819 set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true); 820 set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true); 821 set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true); 822 set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true); 823 set_intr_bits(dd, IS_DC_START, IS_DC_END, true); 824 set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true); 825 } 826 827 /** 828 * hfi1_init - do the actual initialization sequence on the chip 829 * @dd: the hfi1_ib device 830 * @reinit: re-initializing, so don't allocate new memory 831 * 832 * Do the actual initialization sequence on the chip. This is done 833 * both from the init routine called from the PCI infrastructure, and 834 * when we reset the chip, or detect that it was reset internally, 835 * or it's administratively re-enabled. 836 * 837 * Memory allocation here and in called routines is only done in 838 * the first case (reinit == 0). 
We have to be careful, because even 839 * without memory allocation, we need to re-write all the chip registers 840 * TIDs, etc. after the reset or enable has completed. 841 */ 842 int hfi1_init(struct hfi1_devdata *dd, int reinit) 843 { 844 int ret = 0, pidx, lastfail = 0; 845 unsigned long len; 846 u16 i; 847 struct hfi1_ctxtdata *rcd; 848 struct hfi1_pportdata *ppd; 849 850 /* Set up send low level handlers */ 851 dd->process_pio_send = hfi1_verbs_send_pio; 852 dd->process_dma_send = hfi1_verbs_send_dma; 853 dd->pio_inline_send = pio_copy; 854 dd->process_vnic_dma_send = hfi1_vnic_send_dma; 855 856 if (is_ax(dd)) { 857 atomic_set(&dd->drop_packet, DROP_PACKET_ON); 858 dd->do_drop = true; 859 } else { 860 atomic_set(&dd->drop_packet, DROP_PACKET_OFF); 861 dd->do_drop = false; 862 } 863 864 /* make sure the link is not "up" */ 865 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 866 ppd = dd->pport + pidx; 867 ppd->linkup = 0; 868 } 869 870 if (reinit) 871 ret = init_after_reset(dd); 872 else 873 ret = loadtime_init(dd); 874 if (ret) 875 goto done; 876 877 /* allocate dummy tail memory for all receive contexts */ 878 dd->rcvhdrtail_dummy_kvaddr = dma_alloc_coherent(&dd->pcidev->dev, 879 sizeof(u64), 880 &dd->rcvhdrtail_dummy_dma, 881 GFP_KERNEL); 882 883 if (!dd->rcvhdrtail_dummy_kvaddr) { 884 dd_dev_err(dd, "cannot allocate dummy tail memory\n"); 885 ret = -ENOMEM; 886 goto done; 887 } 888 889 /* dd->rcd can be NULL if early initialization failed */ 890 for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) { 891 /* 892 * Set up the (kernel) rcvhdr queue and egr TIDs. If doing 893 * re-init, the simplest way to handle this is to free 894 * existing, and re-allocate. 895 * Need to re-create rest of ctxt 0 ctxtdata as well. 896 */ 897 rcd = hfi1_rcd_get_by_index(dd, i); 898 if (!rcd) 899 continue; 900 901 rcd->do_interrupt = &handle_receive_interrupt; 902 903 lastfail = hfi1_create_rcvhdrq(dd, rcd); 904 if (!lastfail) 905 lastfail = hfi1_setup_eagerbufs(rcd); 906 if (!lastfail) 907 lastfail = hfi1_kern_exp_rcv_init(rcd, reinit); 908 if (lastfail) { 909 dd_dev_err(dd, 910 "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); 911 ret = lastfail; 912 } 913 /* enable IRQ */ 914 hfi1_rcd_put(rcd); 915 } 916 917 /* Allocate enough memory for user event notification. */ 918 len = PAGE_ALIGN(chip_rcv_contexts(dd) * HFI1_MAX_SHARED_CTXTS * 919 sizeof(*dd->events)); 920 dd->events = vmalloc_user(len); 921 if (!dd->events) 922 dd_dev_err(dd, "Failed to allocate user events page\n"); 923 /* 924 * Allocate a page for device and port status. 925 * Page will be shared amongst all user processes. 926 */ 927 dd->status = vmalloc_user(PAGE_SIZE); 928 if (!dd->status) 929 dd_dev_err(dd, "Failed to allocate dev status page\n"); 930 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 931 ppd = dd->pport + pidx; 932 if (dd->status) 933 /* Currently, we only have one port */ 934 ppd->statusp = &dd->status->port; 935 936 set_mtu(ppd); 937 } 938 939 /* enable chip even if we have an error, so we can debug cause */ 940 enable_chip(dd); 941 942 done: 943 /* 944 * Set status even if port serdes is not initialized 945 * so that diags will work. 
946 */ 947 if (dd->status) 948 dd->status->dev |= HFI1_STATUS_CHIP_PRESENT | 949 HFI1_STATUS_INITTED; 950 if (!ret) { 951 /* enable all interrupts from the chip */ 952 enable_general_intr(dd); 953 init_qsfp_int(dd); 954 955 /* chip is OK for user apps; mark it as initialized */ 956 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 957 ppd = dd->pport + pidx; 958 959 /* 960 * start the serdes - must be after interrupts are 961 * enabled so we are notified when the link goes up 962 */ 963 lastfail = bringup_serdes(ppd); 964 if (lastfail) 965 dd_dev_info(dd, 966 "Failed to bring up port %u\n", 967 ppd->port); 968 969 /* 970 * Set status even if port serdes is not initialized 971 * so that diags will work. 972 */ 973 if (ppd->statusp) 974 *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT | 975 HFI1_STATUS_INITTED; 976 if (!ppd->link_speed_enabled) 977 continue; 978 } 979 } 980 981 /* if ret is non-zero, we probably should do some cleanup here... */ 982 return ret; 983 } 984 985 struct hfi1_devdata *hfi1_lookup(int unit) 986 { 987 return xa_load(&hfi1_dev_table, unit); 988 } 989 990 /* 991 * Stop the timers during unit shutdown, or after an error late 992 * in initialization. 993 */ 994 static void stop_timers(struct hfi1_devdata *dd) 995 { 996 struct hfi1_pportdata *ppd; 997 int pidx; 998 999 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1000 ppd = dd->pport + pidx; 1001 if (ppd->led_override_timer.function) { 1002 del_timer_sync(&ppd->led_override_timer); 1003 atomic_set(&ppd->led_override_timer_active, 0); 1004 } 1005 } 1006 } 1007 1008 /** 1009 * shutdown_device - shut down a device 1010 * @dd: the hfi1_ib device 1011 * 1012 * This is called to make the device quiet when we are about to 1013 * unload the driver, and also when the device is administratively 1014 * disabled. It does not free any data structures. 1015 * Everything it does has to be setup again by hfi1_init(dd, 1) 1016 */ 1017 static void shutdown_device(struct hfi1_devdata *dd) 1018 { 1019 struct hfi1_pportdata *ppd; 1020 struct hfi1_ctxtdata *rcd; 1021 unsigned pidx; 1022 int i; 1023 1024 if (dd->flags & HFI1_SHUTDOWN) 1025 return; 1026 dd->flags |= HFI1_SHUTDOWN; 1027 1028 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1029 ppd = dd->pport + pidx; 1030 1031 ppd->linkup = 0; 1032 if (ppd->statusp) 1033 *ppd->statusp &= ~(HFI1_STATUS_IB_CONF | 1034 HFI1_STATUS_IB_READY); 1035 } 1036 dd->flags &= ~HFI1_INITTED; 1037 1038 /* mask and clean up interrupts */ 1039 set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false); 1040 msix_clean_up_interrupts(dd); 1041 1042 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1043 ppd = dd->pport + pidx; 1044 for (i = 0; i < dd->num_rcv_contexts; i++) { 1045 rcd = hfi1_rcd_get_by_index(dd, i); 1046 hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS | 1047 HFI1_RCVCTRL_CTXT_DIS | 1048 HFI1_RCVCTRL_INTRAVAIL_DIS | 1049 HFI1_RCVCTRL_PKEY_DIS | 1050 HFI1_RCVCTRL_ONE_PKT_EGR_DIS, rcd); 1051 hfi1_rcd_put(rcd); 1052 } 1053 /* 1054 * Gracefully stop all sends allowing any in progress to 1055 * trickle out first. 1056 */ 1057 for (i = 0; i < dd->num_send_contexts; i++) 1058 sc_flush(dd->send_contexts[i].sc); 1059 } 1060 1061 /* 1062 * Enough for anything that's going to trickle out to have actually 1063 * done so. 
         */
        udelay(20);

        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;

                /* disable all contexts */
                for (i = 0; i < dd->num_send_contexts; i++)
                        sc_disable(dd->send_contexts[i].sc);
                /* disable the send device */
                pio_send_control(dd, PSC_GLOBAL_DISABLE);

                shutdown_led_override(ppd);

                /*
                 * Clear SerdesEnable.
                 * We can't count on interrupts since we are stopping.
                 */
                hfi1_quiet_serdes(ppd);
                if (ppd->hfi1_wq)
                        flush_workqueue(ppd->hfi1_wq);
                if (ppd->link_wq)
                        flush_workqueue(ppd->link_wq);
        }
        sdma_exit(dd);
}

/**
 * hfi1_free_ctxtdata - free a context's allocated data
 * @dd: the hfi1_ib device
 * @rcd: the ctxtdata structure
 *
 * Free up any allocated data for a context.
 * It should never change any chip state, or global driver state.
 */
void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
{
        u32 e;

        if (!rcd)
                return;

        if (rcd->rcvhdrq) {
                dma_free_coherent(&dd->pcidev->dev, rcvhdrq_size(rcd),
                                  rcd->rcvhdrq, rcd->rcvhdrq_dma);
                rcd->rcvhdrq = NULL;
                if (hfi1_rcvhdrtail_kvaddr(rcd)) {
                        dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
                                          (void *)hfi1_rcvhdrtail_kvaddr(rcd),
                                          rcd->rcvhdrqtailaddr_dma);
                        rcd->rcvhdrtail_kvaddr = NULL;
                }
        }

        /* all the RcvArray entries should have been cleared by now */
        kfree(rcd->egrbufs.rcvtids);
        rcd->egrbufs.rcvtids = NULL;

        for (e = 0; e < rcd->egrbufs.alloced; e++) {
                if (rcd->egrbufs.buffers[e].dma)
                        dma_free_coherent(&dd->pcidev->dev,
                                          rcd->egrbufs.buffers[e].len,
                                          rcd->egrbufs.buffers[e].addr,
                                          rcd->egrbufs.buffers[e].dma);
        }
        kfree(rcd->egrbufs.buffers);
        rcd->egrbufs.alloced = 0;
        rcd->egrbufs.buffers = NULL;

        sc_free(rcd->sc);
        rcd->sc = NULL;

        vfree(rcd->subctxt_uregbase);
        vfree(rcd->subctxt_rcvegrbuf);
        vfree(rcd->subctxt_rcvhdr_base);
        kfree(rcd->opstats);

        rcd->subctxt_uregbase = NULL;
        rcd->subctxt_rcvegrbuf = NULL;
        rcd->subctxt_rcvhdr_base = NULL;
        rcd->opstats = NULL;
}

/*
 * Release our hold on the shared asic data. If we are the last one,
 * return the structure to be finalized outside the lock. Must be
 * holding hfi1_dev_table lock.
 */
static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd)
{
        struct hfi1_asic_data *ad;
        int other;

        if (!dd->asic_data)
                return NULL;
        dd->asic_data->dds[dd->hfi1_id] = NULL;
        other = dd->hfi1_id ? 0 : 1;
        ad = dd->asic_data;
        dd->asic_data = NULL;
        /* return NULL if the other dd still has a link */
        return ad->dds[other] ? NULL : ad;
}

static void finalize_asic_data(struct hfi1_devdata *dd,
                               struct hfi1_asic_data *ad)
{
        clean_up_i2c(dd, ad);
        kfree(ad);
}

/**
 * hfi1_free_devdata - cleans up and frees per-unit data structure
 * @dd: pointer to a valid devdata structure
 *
 * It cleans up and frees all data structures set up by
 * hfi1_alloc_devdata().
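 *
 * Typical pairing (illustrative sketch only, matching the calls made in
 * init_one() and postinit_cleanup() in this file):
 *
 *      dd = hfi1_alloc_devdata(pdev,
 *                              NUM_IB_PORTS * sizeof(struct hfi1_pportdata));
 *      if (IS_ERR(dd))
 *              return PTR_ERR(dd);
 *      ...
 *      hfi1_free_devdata(dd);          // teardown / error unwind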
1180 */ 1181 void hfi1_free_devdata(struct hfi1_devdata *dd) 1182 { 1183 struct hfi1_asic_data *ad; 1184 unsigned long flags; 1185 1186 xa_lock_irqsave(&hfi1_dev_table, flags); 1187 __xa_erase(&hfi1_dev_table, dd->unit); 1188 ad = release_asic_data(dd); 1189 xa_unlock_irqrestore(&hfi1_dev_table, flags); 1190 1191 finalize_asic_data(dd, ad); 1192 free_platform_config(dd); 1193 rcu_barrier(); /* wait for rcu callbacks to complete */ 1194 free_percpu(dd->int_counter); 1195 free_percpu(dd->rcv_limit); 1196 free_percpu(dd->send_schedule); 1197 free_percpu(dd->tx_opstats); 1198 dd->int_counter = NULL; 1199 dd->rcv_limit = NULL; 1200 dd->send_schedule = NULL; 1201 dd->tx_opstats = NULL; 1202 kfree(dd->comp_vect); 1203 dd->comp_vect = NULL; 1204 sdma_clean(dd, dd->num_sdma); 1205 rvt_dealloc_device(&dd->verbs_dev.rdi); 1206 } 1207 1208 /** 1209 * hfi1_alloc_devdata - Allocate our primary per-unit data structure. 1210 * @pdev: Valid PCI device 1211 * @extra: How many bytes to alloc past the default 1212 * 1213 * Must be done via verbs allocator, because the verbs cleanup process 1214 * both does cleanup and free of the data structure. 1215 * "extra" is for chip-specific data. 1216 */ 1217 static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, 1218 size_t extra) 1219 { 1220 struct hfi1_devdata *dd; 1221 int ret, nports; 1222 1223 /* extra is * number of ports */ 1224 nports = extra / sizeof(struct hfi1_pportdata); 1225 1226 dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra, 1227 nports); 1228 if (!dd) 1229 return ERR_PTR(-ENOMEM); 1230 dd->num_pports = nports; 1231 dd->pport = (struct hfi1_pportdata *)(dd + 1); 1232 dd->pcidev = pdev; 1233 pci_set_drvdata(pdev, dd); 1234 1235 ret = xa_alloc_irq(&hfi1_dev_table, &dd->unit, dd, xa_limit_32b, 1236 GFP_KERNEL); 1237 if (ret < 0) { 1238 dev_err(&pdev->dev, 1239 "Could not allocate unit ID: error %d\n", -ret); 1240 goto bail; 1241 } 1242 rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit); 1243 /* 1244 * If the BIOS does not have the NUMA node information set, select 1245 * NUMA 0 so we get consistent performance. 1246 */ 1247 dd->node = pcibus_to_node(pdev->bus); 1248 if (dd->node == NUMA_NO_NODE) { 1249 dd_dev_err(dd, "Invalid PCI NUMA node. Performance may be affected\n"); 1250 dd->node = 0; 1251 } 1252 1253 /* 1254 * Initialize all locks for the device. This needs to be as early as 1255 * possible so locks are usable. 
1256 */ 1257 spin_lock_init(&dd->sc_lock); 1258 spin_lock_init(&dd->sendctrl_lock); 1259 spin_lock_init(&dd->rcvctrl_lock); 1260 spin_lock_init(&dd->uctxt_lock); 1261 spin_lock_init(&dd->hfi1_diag_trans_lock); 1262 spin_lock_init(&dd->sc_init_lock); 1263 spin_lock_init(&dd->dc8051_memlock); 1264 seqlock_init(&dd->sc2vl_lock); 1265 spin_lock_init(&dd->sde_map_lock); 1266 spin_lock_init(&dd->pio_map_lock); 1267 mutex_init(&dd->dc8051_lock); 1268 init_waitqueue_head(&dd->event_queue); 1269 spin_lock_init(&dd->irq_src_lock); 1270 1271 dd->int_counter = alloc_percpu(u64); 1272 if (!dd->int_counter) { 1273 ret = -ENOMEM; 1274 goto bail; 1275 } 1276 1277 dd->rcv_limit = alloc_percpu(u64); 1278 if (!dd->rcv_limit) { 1279 ret = -ENOMEM; 1280 goto bail; 1281 } 1282 1283 dd->send_schedule = alloc_percpu(u64); 1284 if (!dd->send_schedule) { 1285 ret = -ENOMEM; 1286 goto bail; 1287 } 1288 1289 dd->tx_opstats = alloc_percpu(struct hfi1_opcode_stats_perctx); 1290 if (!dd->tx_opstats) { 1291 ret = -ENOMEM; 1292 goto bail; 1293 } 1294 1295 dd->comp_vect = kzalloc(sizeof(*dd->comp_vect), GFP_KERNEL); 1296 if (!dd->comp_vect) { 1297 ret = -ENOMEM; 1298 goto bail; 1299 } 1300 1301 atomic_set(&dd->ipoib_rsm_usr_num, 0); 1302 return dd; 1303 1304 bail: 1305 hfi1_free_devdata(dd); 1306 return ERR_PTR(ret); 1307 } 1308 1309 /* 1310 * Called from freeze mode handlers, and from PCI error 1311 * reporting code. Should be paranoid about state of 1312 * system and data structures. 1313 */ 1314 void hfi1_disable_after_error(struct hfi1_devdata *dd) 1315 { 1316 if (dd->flags & HFI1_INITTED) { 1317 u32 pidx; 1318 1319 dd->flags &= ~HFI1_INITTED; 1320 if (dd->pport) 1321 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1322 struct hfi1_pportdata *ppd; 1323 1324 ppd = dd->pport + pidx; 1325 if (dd->flags & HFI1_PRESENT) 1326 set_link_state(ppd, HLS_DN_DISABLE); 1327 1328 if (ppd->statusp) 1329 *ppd->statusp &= ~HFI1_STATUS_IB_READY; 1330 } 1331 } 1332 1333 /* 1334 * Mark as having had an error for driver, and also 1335 * for /sys and status word mapped to user programs. 1336 * This marks unit as not usable, until reset. 1337 */ 1338 if (dd->status) 1339 dd->status->dev |= HFI1_STATUS_HWERROR; 1340 } 1341 1342 static void remove_one(struct pci_dev *); 1343 static int init_one(struct pci_dev *, const struct pci_device_id *); 1344 static void shutdown_one(struct pci_dev *); 1345 1346 #define DRIVER_LOAD_MSG "Cornelis " DRIVER_NAME " loaded: " 1347 #define PFX DRIVER_NAME ": " 1348 1349 const struct pci_device_id hfi1_pci_tbl[] = { 1350 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) }, 1351 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) }, 1352 { 0, } 1353 }; 1354 1355 MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl); 1356 1357 static struct pci_driver hfi1_pci_driver = { 1358 .name = DRIVER_NAME, 1359 .probe = init_one, 1360 .remove = remove_one, 1361 .shutdown = shutdown_one, 1362 .id_table = hfi1_pci_tbl, 1363 .err_handler = &hfi1_pci_err_handler, 1364 }; 1365 1366 static void __init compute_krcvqs(void) 1367 { 1368 int i; 1369 1370 for (i = 0; i < krcvqsset; i++) 1371 n_krcvqs += krcvqs[i]; 1372 } 1373 1374 /* 1375 * Do all the generic driver unit- and chip-independent memory 1376 * allocation and initialization. 
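 *
 * Illustrative module load example (parameter names are the module
 * parameters defined earlier in this file; the values are arbitrary):
 *
 *      modprobe hfi1 num_user_contexts=16 krcvqs=2,2 eager_buffer_size=8388608
 *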
1377 */ 1378 static int __init hfi1_mod_init(void) 1379 { 1380 int ret; 1381 1382 ret = dev_init(); 1383 if (ret) 1384 goto bail; 1385 1386 ret = node_affinity_init(); 1387 if (ret) 1388 goto bail; 1389 1390 /* validate max MTU before any devices start */ 1391 if (!valid_opa_max_mtu(hfi1_max_mtu)) { 1392 pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n", 1393 hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU); 1394 hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU; 1395 } 1396 /* valid CUs run from 1-128 in powers of 2 */ 1397 if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu)) 1398 hfi1_cu = 1; 1399 /* valid credit return threshold is 0-100, variable is unsigned */ 1400 if (user_credit_return_threshold > 100) 1401 user_credit_return_threshold = 100; 1402 1403 compute_krcvqs(); 1404 /* 1405 * sanitize receive interrupt count, time must wait until after 1406 * the hardware type is known 1407 */ 1408 if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK) 1409 rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK; 1410 /* reject invalid combinations */ 1411 if (rcv_intr_count == 0 && rcv_intr_timeout == 0) { 1412 pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n"); 1413 rcv_intr_count = 1; 1414 } 1415 if (rcv_intr_count > 1 && rcv_intr_timeout == 0) { 1416 /* 1417 * Avoid indefinite packet delivery by requiring a timeout 1418 * if count is > 1. 1419 */ 1420 pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n"); 1421 rcv_intr_timeout = 1; 1422 } 1423 if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) { 1424 /* 1425 * The dynamic algorithm expects a non-zero timeout 1426 * and a count > 1. 1427 */ 1428 pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n"); 1429 rcv_intr_dynamic = 0; 1430 } 1431 1432 /* sanitize link CRC options */ 1433 link_crc_mask &= SUPPORTED_CRCS; 1434 1435 ret = opfn_init(); 1436 if (ret < 0) { 1437 pr_err("Failed to allocate opfn_wq"); 1438 goto bail_dev; 1439 } 1440 1441 /* 1442 * These must be called before the driver is registered with 1443 * the PCI subsystem. 1444 */ 1445 hfi1_dbg_init(); 1446 ret = pci_register_driver(&hfi1_pci_driver); 1447 if (ret < 0) { 1448 pr_err("Unable to register driver: error %d\n", -ret); 1449 goto bail_dev; 1450 } 1451 goto bail; /* all OK */ 1452 1453 bail_dev: 1454 hfi1_dbg_exit(); 1455 dev_cleanup(); 1456 bail: 1457 return ret; 1458 } 1459 1460 module_init(hfi1_mod_init); 1461 1462 /* 1463 * Do the non-unit driver cleanup, memory free, etc. at unload. 
1464 */ 1465 static void __exit hfi1_mod_cleanup(void) 1466 { 1467 pci_unregister_driver(&hfi1_pci_driver); 1468 opfn_exit(); 1469 node_affinity_destroy_all(); 1470 hfi1_dbg_exit(); 1471 1472 WARN_ON(!xa_empty(&hfi1_dev_table)); 1473 dispose_firmware(); /* asymmetric with obtain_firmware() */ 1474 dev_cleanup(); 1475 } 1476 1477 module_exit(hfi1_mod_cleanup); 1478 1479 /* this can only be called after a successful initialization */ 1480 static void cleanup_device_data(struct hfi1_devdata *dd) 1481 { 1482 int ctxt; 1483 int pidx; 1484 1485 /* users can't do anything more with chip */ 1486 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1487 struct hfi1_pportdata *ppd = &dd->pport[pidx]; 1488 struct cc_state *cc_state; 1489 int i; 1490 1491 if (ppd->statusp) 1492 *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT; 1493 1494 for (i = 0; i < OPA_MAX_SLS; i++) 1495 hrtimer_cancel(&ppd->cca_timer[i].hrtimer); 1496 1497 spin_lock(&ppd->cc_state_lock); 1498 cc_state = get_cc_state_protected(ppd); 1499 RCU_INIT_POINTER(ppd->cc_state, NULL); 1500 spin_unlock(&ppd->cc_state_lock); 1501 1502 if (cc_state) 1503 kfree_rcu(cc_state, rcu); 1504 } 1505 1506 free_credit_return(dd); 1507 1508 if (dd->rcvhdrtail_dummy_kvaddr) { 1509 dma_free_coherent(&dd->pcidev->dev, sizeof(u64), 1510 (void *)dd->rcvhdrtail_dummy_kvaddr, 1511 dd->rcvhdrtail_dummy_dma); 1512 dd->rcvhdrtail_dummy_kvaddr = NULL; 1513 } 1514 1515 /* 1516 * Free any resources still in use (usually just kernel contexts) 1517 * at unload; we do for ctxtcnt, because that's what we allocate. 1518 */ 1519 for (ctxt = 0; dd->rcd && ctxt < dd->num_rcv_contexts; ctxt++) { 1520 struct hfi1_ctxtdata *rcd = dd->rcd[ctxt]; 1521 1522 if (rcd) { 1523 hfi1_free_ctxt_rcv_groups(rcd); 1524 hfi1_free_ctxt(rcd); 1525 } 1526 } 1527 1528 kfree(dd->rcd); 1529 dd->rcd = NULL; 1530 1531 free_pio_map(dd); 1532 /* must follow rcv context free - need to remove rcv's hooks */ 1533 for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++) 1534 sc_free(dd->send_contexts[ctxt].sc); 1535 dd->num_send_contexts = 0; 1536 kfree(dd->send_contexts); 1537 dd->send_contexts = NULL; 1538 kfree(dd->hw_to_sw); 1539 dd->hw_to_sw = NULL; 1540 kfree(dd->boardname); 1541 vfree(dd->events); 1542 vfree(dd->status); 1543 } 1544 1545 /* 1546 * Clean up on unit shutdown, or error during unit load after 1547 * successful initialization. 
1548 */ 1549 static void postinit_cleanup(struct hfi1_devdata *dd) 1550 { 1551 hfi1_start_cleanup(dd); 1552 hfi1_comp_vectors_clean_up(dd); 1553 hfi1_dev_affinity_clean_up(dd); 1554 1555 hfi1_pcie_ddcleanup(dd); 1556 hfi1_pcie_cleanup(dd->pcidev); 1557 1558 cleanup_device_data(dd); 1559 1560 hfi1_free_devdata(dd); 1561 } 1562 1563 static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) 1564 { 1565 int ret = 0, j, pidx, initfail; 1566 struct hfi1_devdata *dd; 1567 struct hfi1_pportdata *ppd; 1568 1569 /* First, lock the non-writable module parameters */ 1570 HFI1_CAP_LOCK(); 1571 1572 /* Validate dev ids */ 1573 if (!(ent->device == PCI_DEVICE_ID_INTEL0 || 1574 ent->device == PCI_DEVICE_ID_INTEL1)) { 1575 dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n", 1576 ent->device); 1577 ret = -ENODEV; 1578 goto bail; 1579 } 1580 1581 /* Allocate the dd so we can get to work */ 1582 dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS * 1583 sizeof(struct hfi1_pportdata)); 1584 if (IS_ERR(dd)) { 1585 ret = PTR_ERR(dd); 1586 goto bail; 1587 } 1588 1589 /* Validate some global module parameters */ 1590 ret = hfi1_validate_rcvhdrcnt(dd, rcvhdrcnt); 1591 if (ret) 1592 goto bail; 1593 1594 /* use the encoding function as a sanitization check */ 1595 if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) { 1596 dd_dev_err(dd, "Invalid HdrQ Entry size %u\n", 1597 hfi1_hdrq_entsize); 1598 ret = -EINVAL; 1599 goto bail; 1600 } 1601 1602 /* The receive eager buffer size must be set before the receive 1603 * contexts are created. 1604 * 1605 * Set the eager buffer size. Validate that it falls in a range 1606 * allowed by the hardware - all powers of 2 between the min and 1607 * max. The maximum valid MTU is within the eager buffer range 1608 * so we do not need to cap the max_mtu by an eager buffer size 1609 * setting. 1610 */ 1611 if (eager_buffer_size) { 1612 if (!is_power_of_2(eager_buffer_size)) 1613 eager_buffer_size = 1614 roundup_pow_of_two(eager_buffer_size); 1615 eager_buffer_size = 1616 clamp_val(eager_buffer_size, 1617 MIN_EAGER_BUFFER * 8, 1618 MAX_EAGER_BUFFER_TOTAL); 1619 dd_dev_info(dd, "Eager buffer size %u\n", 1620 eager_buffer_size); 1621 } else { 1622 dd_dev_err(dd, "Invalid Eager buffer size of 0\n"); 1623 ret = -EINVAL; 1624 goto bail; 1625 } 1626 1627 /* restrict value of hfi1_rcvarr_split */ 1628 hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100); 1629 1630 ret = hfi1_pcie_init(dd); 1631 if (ret) 1632 goto bail; 1633 1634 /* 1635 * Do device-specific initialization, function table setup, dd 1636 * allocation, etc. 1637 */ 1638 ret = hfi1_init_dd(dd); 1639 if (ret) 1640 goto clean_bail; /* error already printed */ 1641 1642 ret = create_workqueues(dd); 1643 if (ret) 1644 goto clean_bail; 1645 1646 /* do the generic initialization */ 1647 initfail = hfi1_init(dd, 0); 1648 1649 ret = hfi1_register_ib_device(dd); 1650 1651 /* 1652 * Now ready for use. this should be cleared whenever we 1653 * detect a reset, or initiate one. If earlier failure, 1654 * we still create devices, so diags, etc. can be used 1655 * to determine cause of problem. 
         */
        if (!initfail && !ret) {
                dd->flags |= HFI1_INITTED;
                /* create debugfs files after init and ib register */
                hfi1_dbg_ibdev_init(&dd->verbs_dev);
        }

        j = hfi1_device_create(dd);
        if (j)
                dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);

        if (initfail || ret) {
                msix_clean_up_interrupts(dd);
                stop_timers(dd);
                flush_workqueue(ib_wq);
                for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                        hfi1_quiet_serdes(dd->pport + pidx);
                        ppd = dd->pport + pidx;
                        if (ppd->hfi1_wq) {
                                destroy_workqueue(ppd->hfi1_wq);
                                ppd->hfi1_wq = NULL;
                        }
                        if (ppd->link_wq) {
                                destroy_workqueue(ppd->link_wq);
                                ppd->link_wq = NULL;
                        }
                }
                if (!j)
                        hfi1_device_remove(dd);
                if (!ret)
                        hfi1_unregister_ib_device(dd);
                postinit_cleanup(dd);
                if (initfail)
                        ret = initfail;
                goto bail;      /* everything already cleaned */
        }

        sdma_start(dd);

        return 0;

clean_bail:
        hfi1_pcie_cleanup(pdev);
bail:
        return ret;
}

static void wait_for_clients(struct hfi1_devdata *dd)
{
        /*
         * Remove the device init reference and complete the device if there
         * are no clients, or wait for active clients to finish.
         */
        if (refcount_dec_and_test(&dd->user_refcount))
                complete(&dd->user_comp);

        wait_for_completion(&dd->user_comp);
}
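
/*
 * wait_for_clients() pairs with the user-facing open/close path (in the
 * driver's file operations code, which is not in this file).  A minimal,
 * illustrative sketch of that pairing, with assumed call sites and error
 * code, looks roughly like:
 *
 *      open:
 *              if (!refcount_inc_not_zero(&dd->user_refcount))
 *                      return -ENXIO;          // device is being removed
 *      close:
 *              if (refcount_dec_and_test(&dd->user_refcount))
 *                      complete(&dd->user_comp);
 *
 * remove_one() below drops the init reference via wait_for_clients() and
 * then blocks on user_comp, so the last client to close releases the device.
 */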
static void remove_one(struct pci_dev *pdev)
{
        struct hfi1_devdata *dd = pci_get_drvdata(pdev);

        /* close debugfs files before ib unregister */
        hfi1_dbg_ibdev_exit(&dd->verbs_dev);

        /* remove the /dev hfi1 interface */
        hfi1_device_remove(dd);

        /* wait for existing user space clients to finish */
        wait_for_clients(dd);

        /* unregister from IB core */
        hfi1_unregister_ib_device(dd);

        /* free netdev data */
        hfi1_free_rx(dd);

        /*
         * Disable the IB link, disable interrupts on the device,
         * clear dma engines, etc.
         */
        shutdown_device(dd);
        destroy_workqueues(dd);

        stop_timers(dd);

        /* wait until all of our (qsfp) queue_work() calls complete */
        flush_workqueue(ib_wq);

        postinit_cleanup(dd);
}

static void shutdown_one(struct pci_dev *pdev)
{
        struct hfi1_devdata *dd = pci_get_drvdata(pdev);

        shutdown_device(dd);
}

/**
 * hfi1_create_rcvhdrq - create a receive header queue
 * @dd: the hfi1_ib device
 * @rcd: the context data
 *
 * This must be contiguous memory (from an i/o perspective), and must be
 * DMA'able (which means for some systems, it will go through an IOMMU,
 * or be forced into a low address range).
 */
int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
{
        unsigned amt;

        if (!rcd->rcvhdrq) {
                gfp_t gfp_flags;

                amt = rcvhdrq_size(rcd);

                if (rcd->ctxt < dd->first_dyn_alloc_ctxt || rcd->is_vnic)
                        gfp_flags = GFP_KERNEL;
                else
                        gfp_flags = GFP_USER;
                rcd->rcvhdrq = dma_alloc_coherent(&dd->pcidev->dev, amt,
                                                  &rcd->rcvhdrq_dma,
                                                  gfp_flags | __GFP_COMP);

                if (!rcd->rcvhdrq) {
                        dd_dev_err(dd,
                                   "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
                                   amt, rcd->ctxt);
                        goto bail;
                }

                if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
                    HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) {
                        rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(&dd->pcidev->dev,
                                                                    PAGE_SIZE,
                                                                    &rcd->rcvhdrqtailaddr_dma,
                                                                    gfp_flags);
                        if (!rcd->rcvhdrtail_kvaddr)
                                goto bail_free;
                }
        }

        set_hdrq_regs(rcd->dd, rcd->ctxt, rcd->rcvhdrqentsize,
                      rcd->rcvhdrq_cnt);

        return 0;

bail_free:
        dd_dev_err(dd,
                   "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
                   rcd->ctxt);
        dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
                          rcd->rcvhdrq_dma);
        rcd->rcvhdrq = NULL;
bail:
        return -ENOMEM;
}

/**
 * hfi1_setup_eagerbufs - allocate eager buffers, both kernel and user
 * contexts.
 * @rcd: the context we are setting up.
 *
 * Allocate the eager TID buffers and program them into the chip.
 * They are no longer completely contiguous; we do multiple allocation
 * calls. Otherwise we get the OOM code involved, by asking for too
 * much per call, with disastrous results on some kernels.
 */
int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
{
        struct hfi1_devdata *dd = rcd->dd;
        u32 max_entries, egrtop, alloced_bytes = 0;
        gfp_t gfp_flags;
        u16 order, idx = 0;
        int ret = 0;
        u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);

        /*
         * GFP_USER, but without GFP_FS, so buffer cache can be
         * coalesced (we hope); otherwise, even at order 4,
         * heavy filesystem activity makes these fail, and we can
         * use compound pages.
         */
        gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;

        /*
         * The minimum size of the eager buffers is a group of MTU-sized
         * buffers.
         * The global eager_buffer_size parameter is checked against the
         * theoretical lower limit of the value. Here, we check against the
         * MTU.
         */
        if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
                rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
        /*
         * If using one-pkt-per-egr-buffer, lower the eager buffer
         * size to the max MTU (page-aligned).
         */
        if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
                rcd->egrbufs.rcvtid_size = round_mtu;

        /*
         * Eager buffer sizes of 1MB or less require smaller TID sizes
         * to satisfy the "multiple of 8 RcvArray entries" requirement.
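 *
 * Worked example (illustrative numbers only): if round_mtu were 16KB and
 * egrbufs.size were 1MB, rounddown_pow_of_two(1MB / 8) = 128KB, so
 * rcvtid_size below becomes max(16KB, 128KB) = 128KB.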
1862 */ 1863 if (rcd->egrbufs.size <= (1 << 20)) 1864 rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu, 1865 rounddown_pow_of_two(rcd->egrbufs.size / 8)); 1866 1867 while (alloced_bytes < rcd->egrbufs.size && 1868 rcd->egrbufs.alloced < rcd->egrbufs.count) { 1869 rcd->egrbufs.buffers[idx].addr = 1870 dma_alloc_coherent(&dd->pcidev->dev, 1871 rcd->egrbufs.rcvtid_size, 1872 &rcd->egrbufs.buffers[idx].dma, 1873 gfp_flags); 1874 if (rcd->egrbufs.buffers[idx].addr) { 1875 rcd->egrbufs.buffers[idx].len = 1876 rcd->egrbufs.rcvtid_size; 1877 rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr = 1878 rcd->egrbufs.buffers[idx].addr; 1879 rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].dma = 1880 rcd->egrbufs.buffers[idx].dma; 1881 rcd->egrbufs.alloced++; 1882 alloced_bytes += rcd->egrbufs.rcvtid_size; 1883 idx++; 1884 } else { 1885 u32 new_size, i, j; 1886 u64 offset = 0; 1887 1888 /* 1889 * Fail the eager buffer allocation if: 1890 * - we are already using the lowest acceptable size 1891 * - we are using one-pkt-per-egr-buffer (this implies 1892 * that we are accepting only one size) 1893 */ 1894 if (rcd->egrbufs.rcvtid_size == round_mtu || 1895 !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) { 1896 dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n", 1897 rcd->ctxt); 1898 ret = -ENOMEM; 1899 goto bail_rcvegrbuf_phys; 1900 } 1901 1902 new_size = rcd->egrbufs.rcvtid_size / 2; 1903 1904 /* 1905 * If the first attempt to allocate memory failed, don't 1906 * fail everything but continue with the next lower 1907 * size. 1908 */ 1909 if (idx == 0) { 1910 rcd->egrbufs.rcvtid_size = new_size; 1911 continue; 1912 } 1913 1914 /* 1915 * Re-partition already allocated buffers to a smaller 1916 * size. 1917 */ 1918 rcd->egrbufs.alloced = 0; 1919 for (i = 0, j = 0, offset = 0; j < idx; i++) { 1920 if (i >= rcd->egrbufs.count) 1921 break; 1922 rcd->egrbufs.rcvtids[i].dma = 1923 rcd->egrbufs.buffers[j].dma + offset; 1924 rcd->egrbufs.rcvtids[i].addr = 1925 rcd->egrbufs.buffers[j].addr + offset; 1926 rcd->egrbufs.alloced++; 1927 if ((rcd->egrbufs.buffers[j].dma + offset + 1928 new_size) == 1929 (rcd->egrbufs.buffers[j].dma + 1930 rcd->egrbufs.buffers[j].len)) { 1931 j++; 1932 offset = 0; 1933 } else { 1934 offset += new_size; 1935 } 1936 } 1937 rcd->egrbufs.rcvtid_size = new_size; 1938 } 1939 } 1940 rcd->egrbufs.numbufs = idx; 1941 rcd->egrbufs.size = alloced_bytes; 1942 1943 hfi1_cdbg(PROC, 1944 "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %uKB\n", 1945 rcd->ctxt, rcd->egrbufs.alloced, 1946 rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024); 1947 1948 /* 1949 * Set the contexts rcv array head update threshold to the closest 1950 * power of 2 (so we can use a mask instead of modulo) below half 1951 * the allocated entries. 1952 */ 1953 rcd->egrbufs.threshold = 1954 rounddown_pow_of_two(rcd->egrbufs.alloced / 2); 1955 /* 1956 * Compute the expected RcvArray entry base. This is done after 1957 * allocating the eager buffers in order to maximize the 1958 * expected RcvArray entries for the context. 
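 *
 * Worked example (illustrative numbers only): with group_size = 8,
 * rcv_array_groups = 64 and egrbufs.alloced = 300, max_entries = 512 and
 * egrtop = roundup(300, 8) = 304, giving expected_count = 208 and
 * expected_base = eager_base + 304.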
         */
        max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
        egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
        rcd->expected_count = max_entries - egrtop;
        if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
                rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;

        rcd->expected_base = rcd->eager_base + egrtop;
        hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n",
                  rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
                  rcd->eager_base, rcd->expected_base);

        if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
                hfi1_cdbg(PROC,
                          "ctxt%u: current Eager buffer size is invalid %u\n",
                          rcd->ctxt, rcd->egrbufs.rcvtid_size);
                ret = -EINVAL;
                goto bail_rcvegrbuf_phys;
        }

        for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
                hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
                             rcd->egrbufs.rcvtids[idx].dma, order);
                cond_resched();
        }

        return 0;

bail_rcvegrbuf_phys:
        for (idx = 0; idx < rcd->egrbufs.alloced &&
             rcd->egrbufs.buffers[idx].addr;
             idx++) {
                dma_free_coherent(&dd->pcidev->dev,
                                  rcd->egrbufs.buffers[idx].len,
                                  rcd->egrbufs.buffers[idx].addr,
                                  rcd->egrbufs.buffers[idx].dma);
                rcd->egrbufs.buffers[idx].addr = NULL;
                rcd->egrbufs.buffers[idx].dma = 0;
                rcd->egrbufs.buffers[idx].len = 0;
        }

        return ret;
}
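
/*
 * Usage note (illustrative only, mirroring the pattern already used above,
 * e.g. in enable_chip() and init_after_reset()): callers needing temporary
 * access to a receive context bracket it with a get/put pair so the context
 * cannot be freed underneath them:
 *
 *      rcd = hfi1_rcd_get_by_index(dd, i);
 *      if (rcd) {
 *              ... use rcd ...
 *              hfi1_rcd_put(rcd);
 *      }
 */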