1 /* 2 * Copyright(c) 2015 - 2018 Intel Corporation. 3 * 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 * redistributing this file, you may do so under either license. 6 * 7 * GPL LICENSE SUMMARY 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of version 2 of the GNU General Public License as 11 * published by the Free Software Foundation. 12 * 13 * This program is distributed in the hope that it will be useful, but 14 * WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * General Public License for more details. 17 * 18 * BSD LICENSE 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 24 * - Redistributions of source code must retain the above copyright 25 * notice, this list of conditions and the following disclaimer. 26 * - Redistributions in binary form must reproduce the above copyright 27 * notice, this list of conditions and the following disclaimer in 28 * the documentation and/or other materials provided with the 29 * distribution. 30 * - Neither the name of Intel Corporation nor the names of its 31 * contributors may be used to endorse or promote products derived 32 * from this software without specific prior written permission. 33 * 34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 45 * 46 */ 47 48 #include <linux/pci.h> 49 #include <linux/netdevice.h> 50 #include <linux/vmalloc.h> 51 #include <linux/delay.h> 52 #include <linux/idr.h> 53 #include <linux/module.h> 54 #include <linux/printk.h> 55 #include <linux/hrtimer.h> 56 #include <linux/bitmap.h> 57 #include <rdma/rdma_vt.h> 58 59 #include "hfi.h" 60 #include "device.h" 61 #include "common.h" 62 #include "trace.h" 63 #include "mad.h" 64 #include "sdma.h" 65 #include "debugfs.h" 66 #include "verbs.h" 67 #include "aspm.h" 68 #include "affinity.h" 69 #include "vnic.h" 70 #include "exp_rcv.h" 71 72 #undef pr_fmt 73 #define pr_fmt(fmt) DRIVER_NAME ": " fmt 74 75 #define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5 76 /* 77 * min buffers we want to have per context, after driver 78 */ 79 #define HFI1_MIN_USER_CTXT_BUFCNT 7 80 81 #define HFI1_MIN_HDRQ_EGRBUF_CNT 2 82 #define HFI1_MAX_HDRQ_EGRBUF_CNT 16352 83 #define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */ 84 #define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */ 85 86 #define NUM_IB_PORTS 1 87 88 /* 89 * Number of user receive contexts we are configured to use (to allow for more 90 * pio buffers per ctxt, etc.) Zero means use one user context per CPU. 
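 *
 * For example (load-time only; the parameter is read-only once the module
 * is loaded):
 *
 *   modprobe hfi1 num_user_contexts=4
 *
 * caps the driver at four user contexts instead of the default of one per
 * real (non-HT) core.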
91 */ 92 int num_user_contexts = -1; 93 module_param_named(num_user_contexts, num_user_contexts, int, 0444); 94 MODULE_PARM_DESC( 95 num_user_contexts, "Set max number of user contexts to use (default: -1 will use the real (non-HT) CPU count)"); 96 97 uint krcvqs[RXE_NUM_DATA_VL]; 98 int krcvqsset; 99 module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO); 100 MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL"); 101 102 /* computed based on above array */ 103 unsigned long n_krcvqs; 104 105 static unsigned hfi1_rcvarr_split = 25; 106 module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO); 107 MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers"); 108 109 static uint eager_buffer_size = (8 << 20); /* 8MB */ 110 module_param(eager_buffer_size, uint, S_IRUGO); 111 MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 8MB"); 112 113 static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */ 114 module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO); 115 MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)"); 116 117 static uint hfi1_hdrq_entsize = 32; 118 module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, 0444); 119 MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)"); 120 121 unsigned int user_credit_return_threshold = 33; /* default is 33% */ 122 module_param(user_credit_return_threshold, uint, S_IRUGO); 123 MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)"); 124 125 static inline u64 encode_rcv_header_entry_size(u16 size); 126 127 static struct idr hfi1_unit_table; 128 129 static int hfi1_create_kctxt(struct hfi1_devdata *dd, 130 struct hfi1_pportdata *ppd) 131 { 132 struct hfi1_ctxtdata *rcd; 133 int ret; 134 135 /* Control context has to be always 0 */ 136 BUILD_BUG_ON(HFI1_CTRL_CTXT != 0); 137 138 ret = hfi1_create_ctxtdata(ppd, dd->node, &rcd); 139 if (ret < 0) { 140 dd_dev_err(dd, "Kernel receive context allocation failed\n"); 141 return ret; 142 } 143 144 /* 145 * Set up the kernel context flags here and now because they use 146 * default values for all receive side memories. User contexts will 147 * be handled as they are created. 
148 */ 149 rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) | 150 HFI1_CAP_KGET(NODROP_RHQ_FULL) | 151 HFI1_CAP_KGET(NODROP_EGR_FULL) | 152 HFI1_CAP_KGET(DMA_RTAIL); 153 154 /* Control context must use DMA_RTAIL */ 155 if (rcd->ctxt == HFI1_CTRL_CTXT) 156 rcd->flags |= HFI1_CAP_DMA_RTAIL; 157 rcd->seq_cnt = 1; 158 159 rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node); 160 if (!rcd->sc) { 161 dd_dev_err(dd, "Kernel send context allocation failed\n"); 162 return -ENOMEM; 163 } 164 hfi1_init_ctxt(rcd->sc); 165 166 return 0; 167 } 168 169 /* 170 * Create the receive context array and one or more kernel contexts 171 */ 172 int hfi1_create_kctxts(struct hfi1_devdata *dd) 173 { 174 u16 i; 175 int ret; 176 177 dd->rcd = kcalloc_node(dd->num_rcv_contexts, sizeof(*dd->rcd), 178 GFP_KERNEL, dd->node); 179 if (!dd->rcd) 180 return -ENOMEM; 181 182 for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) { 183 ret = hfi1_create_kctxt(dd, dd->pport); 184 if (ret) 185 goto bail; 186 } 187 188 return 0; 189 bail: 190 for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) 191 hfi1_free_ctxt(dd->rcd[i]); 192 193 /* All the contexts should be freed, free the array */ 194 kfree(dd->rcd); 195 dd->rcd = NULL; 196 return ret; 197 } 198 199 /* 200 * Helper routines for the receive context reference count (rcd and uctxt). 201 */ 202 static void hfi1_rcd_init(struct hfi1_ctxtdata *rcd) 203 { 204 kref_init(&rcd->kref); 205 } 206 207 /** 208 * hfi1_rcd_free - When reference is zero clean up. 209 * @kref: pointer to an initialized rcd data structure 210 * 211 */ 212 static void hfi1_rcd_free(struct kref *kref) 213 { 214 unsigned long flags; 215 struct hfi1_ctxtdata *rcd = 216 container_of(kref, struct hfi1_ctxtdata, kref); 217 218 hfi1_free_ctxtdata(rcd->dd, rcd); 219 220 spin_lock_irqsave(&rcd->dd->uctxt_lock, flags); 221 rcd->dd->rcd[rcd->ctxt] = NULL; 222 spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags); 223 224 kfree(rcd); 225 } 226 227 /** 228 * hfi1_rcd_put - decrement reference for rcd 229 * @rcd: pointer to an initialized rcd data structure 230 * 231 * Use this to put a reference after the init. 232 */ 233 int hfi1_rcd_put(struct hfi1_ctxtdata *rcd) 234 { 235 if (rcd) 236 return kref_put(&rcd->kref, hfi1_rcd_free); 237 238 return 0; 239 } 240 241 /** 242 * hfi1_rcd_get - increment reference for rcd 243 * @rcd: pointer to an initialized rcd data structure 244 * 245 * Use this to get a reference after the init. 246 */ 247 void hfi1_rcd_get(struct hfi1_ctxtdata *rcd) 248 { 249 kref_get(&rcd->kref); 250 } 251 252 /** 253 * allocate_rcd_index - allocate an rcd index from the rcd array 254 * @dd: pointer to a valid devdata structure 255 * @rcd: rcd data structure to assign 256 * @index: pointer to index that is allocated 257 * 258 * Find an empty index in the rcd array, and assign the given rcd to it. 259 * If the array is full, we are EBUSY. 
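 *
 * Return: 0 on success (with *index set); -EBUSY if every slot is in use.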
 *
 */
static int allocate_rcd_index(struct hfi1_devdata *dd,
			      struct hfi1_ctxtdata *rcd, u16 *index)
{
	unsigned long flags;
	u16 ctxt;

	spin_lock_irqsave(&dd->uctxt_lock, flags);
	for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++)
		if (!dd->rcd[ctxt])
			break;

	if (ctxt < dd->num_rcv_contexts) {
		rcd->ctxt = ctxt;
		dd->rcd[ctxt] = rcd;
		hfi1_rcd_init(rcd);
	}
	spin_unlock_irqrestore(&dd->uctxt_lock, flags);

	if (ctxt >= dd->num_rcv_contexts)
		return -EBUSY;

	*index = ctxt;

	return 0;
}

/**
 * hfi1_rcd_get_by_index_safe - validate the ctxt index before accessing the
 * array
 * @dd: pointer to a valid devdata structure
 * @ctxt: the index of a possible rcd
 *
 * This is a wrapper for hfi1_rcd_get_by_index() to validate that the given
 * ctxt index is valid.
 *
 * The caller is responsible for making the _put().
 *
 */
struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd,
						 u16 ctxt)
{
	if (ctxt < dd->num_rcv_contexts)
		return hfi1_rcd_get_by_index(dd, ctxt);

	return NULL;
}

/**
 * hfi1_rcd_get_by_index - take a reference on the rcd at the given index
 * @dd: pointer to a valid devdata structure
 * @ctxt: the index of a possible rcd
 *
 * We need to protect access to the rcd array.  If access is needed to
 * one or more index, get the protecting spinlock and then increment the
 * kref.
 *
 * The caller is responsible for making the _put().
 *
 */
struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt)
{
	unsigned long flags;
	struct hfi1_ctxtdata *rcd = NULL;

	spin_lock_irqsave(&dd->uctxt_lock, flags);
	if (dd->rcd[ctxt]) {
		rcd = dd->rcd[ctxt];
		hfi1_rcd_get(rcd);
	}
	spin_unlock_irqrestore(&dd->uctxt_lock, flags);

	return rcd;
}

/*
 * Common code for user and kernel context create and setup.
 * NOTE: the initial kref is done here (hfi1_rcd_init()).
 */
int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
			 struct hfi1_ctxtdata **context)
{
	struct hfi1_devdata *dd = ppd->dd;
	struct hfi1_ctxtdata *rcd;
	unsigned kctxt_ngroups = 0;
	u32 base;

	if (dd->rcv_entries.nctxt_extra >
	    dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt)
		kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
				 (dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt));
	rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, numa);
	if (rcd) {
		u32 rcvtids, max_entries;
		u16 ctxt;
		int ret;

		ret = allocate_rcd_index(dd, rcd, &ctxt);
		if (ret) {
			*context = NULL;
			kfree(rcd);
			return ret;
		}

		INIT_LIST_HEAD(&rcd->qp_wait_list);
		hfi1_exp_tid_group_init(rcd);
		rcd->ppd = ppd;
		rcd->dd = dd;
		rcd->numa_id = numa;
		rcd->rcv_array_groups = dd->rcv_entries.ngroups;
		rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;

		mutex_init(&rcd->exp_mutex);

		hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);

		/*
		 * Calculate the context's RcvArray entry starting point.
		 * We do this here because we have to take into account all
		 * the RcvArray entries that previous context would have
		 * taken and we have to account for any extra groups assigned
		 * to the static (kernel) or dynamic (vnic/user) contexts.
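		 *
		 * Worked example (illustrative numbers only): with
		 * rcv_entries.ngroups = 8, rcv_entries.group_size = 8 and
		 * kctxt_ngroups = 2, kernel context 3 (which gets no extra
		 * group) starts at base = 2 + 3 * 8 = 26 groups, i.e.
		 * eager_base = 26 * 8 = 208 RcvArray entries.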
383 */ 384 if (ctxt < dd->first_dyn_alloc_ctxt) { 385 if (ctxt < kctxt_ngroups) { 386 base = ctxt * (dd->rcv_entries.ngroups + 1); 387 rcd->rcv_array_groups++; 388 } else { 389 base = kctxt_ngroups + 390 (ctxt * dd->rcv_entries.ngroups); 391 } 392 } else { 393 u16 ct = ctxt - dd->first_dyn_alloc_ctxt; 394 395 base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) + 396 kctxt_ngroups); 397 if (ct < dd->rcv_entries.nctxt_extra) { 398 base += ct * (dd->rcv_entries.ngroups + 1); 399 rcd->rcv_array_groups++; 400 } else { 401 base += dd->rcv_entries.nctxt_extra + 402 (ct * dd->rcv_entries.ngroups); 403 } 404 } 405 rcd->eager_base = base * dd->rcv_entries.group_size; 406 407 rcd->rcvhdrq_cnt = rcvhdrcnt; 408 rcd->rcvhdrqentsize = hfi1_hdrq_entsize; 409 rcd->rhf_offset = 410 rcd->rcvhdrqentsize - sizeof(u64) / sizeof(u32); 411 /* 412 * Simple Eager buffer allocation: we have already pre-allocated 413 * the number of RcvArray entry groups. Each ctxtdata structure 414 * holds the number of groups for that context. 415 * 416 * To follow CSR requirements and maintain cacheline alignment, 417 * make sure all sizes and bases are multiples of group_size. 418 * 419 * The expected entry count is what is left after assigning 420 * eager. 421 */ 422 max_entries = rcd->rcv_array_groups * 423 dd->rcv_entries.group_size; 424 rcvtids = ((max_entries * hfi1_rcvarr_split) / 100); 425 rcd->egrbufs.count = round_down(rcvtids, 426 dd->rcv_entries.group_size); 427 if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) { 428 dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n", 429 rcd->ctxt); 430 rcd->egrbufs.count = MAX_EAGER_ENTRIES; 431 } 432 hfi1_cdbg(PROC, 433 "ctxt%u: max Eager buffer RcvArray entries: %u\n", 434 rcd->ctxt, rcd->egrbufs.count); 435 436 /* 437 * Allocate array that will hold the eager buffer accounting 438 * data. 439 * This will allocate the maximum possible buffer count based 440 * on the value of the RcvArray split parameter. 441 * The resulting value will be rounded down to the closest 442 * multiple of dd->rcv_entries.group_size. 443 */ 444 rcd->egrbufs.buffers = 445 kcalloc_node(rcd->egrbufs.count, 446 sizeof(*rcd->egrbufs.buffers), 447 GFP_KERNEL, numa); 448 if (!rcd->egrbufs.buffers) 449 goto bail; 450 rcd->egrbufs.rcvtids = 451 kcalloc_node(rcd->egrbufs.count, 452 sizeof(*rcd->egrbufs.rcvtids), 453 GFP_KERNEL, numa); 454 if (!rcd->egrbufs.rcvtids) 455 goto bail; 456 rcd->egrbufs.size = eager_buffer_size; 457 /* 458 * The size of the buffers programmed into the RcvArray 459 * entries needs to be big enough to handle the highest 460 * MTU supported. 461 */ 462 if (rcd->egrbufs.size < hfi1_max_mtu) { 463 rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu); 464 hfi1_cdbg(PROC, 465 "ctxt%u: eager bufs size too small. Adjusting to %zu\n", 466 rcd->ctxt, rcd->egrbufs.size); 467 } 468 rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE; 469 470 /* Applicable only for statically created kernel contexts */ 471 if (ctxt < dd->first_dyn_alloc_ctxt) { 472 rcd->opstats = kzalloc_node(sizeof(*rcd->opstats), 473 GFP_KERNEL, numa); 474 if (!rcd->opstats) 475 goto bail; 476 } 477 478 *context = rcd; 479 return 0; 480 } 481 482 bail: 483 *context = NULL; 484 hfi1_free_ctxt(rcd); 485 return -ENOMEM; 486 } 487 488 /** 489 * hfi1_free_ctxt 490 * @rcd: pointer to an initialized rcd data structure 491 * 492 * This wrapper is the free function that matches hfi1_create_ctxtdata(). 
493 * When a context is done being used (kernel or user), this function is called 494 * for the "final" put to match the kref init from hf1i_create_ctxtdata(). 495 * Other users of the context do a get/put sequence to make sure that the 496 * structure isn't removed while in use. 497 */ 498 void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd) 499 { 500 hfi1_rcd_put(rcd); 501 } 502 503 /* 504 * Convert a receive header entry size that to the encoding used in the CSR. 505 * 506 * Return a zero if the given size is invalid. 507 */ 508 static inline u64 encode_rcv_header_entry_size(u16 size) 509 { 510 /* there are only 3 valid receive header entry sizes */ 511 if (size == 2) 512 return 1; 513 if (size == 16) 514 return 2; 515 else if (size == 32) 516 return 4; 517 return 0; /* invalid */ 518 } 519 520 /* 521 * Select the largest ccti value over all SLs to determine the intra- 522 * packet gap for the link. 523 * 524 * called with cca_timer_lock held (to protect access to cca_timer 525 * array), and rcu_read_lock() (to protect access to cc_state). 526 */ 527 void set_link_ipg(struct hfi1_pportdata *ppd) 528 { 529 struct hfi1_devdata *dd = ppd->dd; 530 struct cc_state *cc_state; 531 int i; 532 u16 cce, ccti_limit, max_ccti = 0; 533 u16 shift, mult; 534 u64 src; 535 u32 current_egress_rate; /* Mbits /sec */ 536 u32 max_pkt_time; 537 /* 538 * max_pkt_time is the maximum packet egress time in units 539 * of the fabric clock period 1/(805 MHz). 540 */ 541 542 cc_state = get_cc_state(ppd); 543 544 if (!cc_state) 545 /* 546 * This should _never_ happen - rcu_read_lock() is held, 547 * and set_link_ipg() should not be called if cc_state 548 * is NULL. 549 */ 550 return; 551 552 for (i = 0; i < OPA_MAX_SLS; i++) { 553 u16 ccti = ppd->cca_timer[i].ccti; 554 555 if (ccti > max_ccti) 556 max_ccti = ccti; 557 } 558 559 ccti_limit = cc_state->cct.ccti_limit; 560 if (max_ccti > ccti_limit) 561 max_ccti = ccti_limit; 562 563 cce = cc_state->cct.entries[max_ccti].entry; 564 shift = (cce & 0xc000) >> 14; 565 mult = (cce & 0x3fff); 566 567 current_egress_rate = active_egress_rate(ppd); 568 569 max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate); 570 571 src = (max_pkt_time >> shift) * mult; 572 573 src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK; 574 src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT; 575 576 write_csr(dd, SEND_STATIC_RATE_CONTROL, src); 577 } 578 579 static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) 580 { 581 struct cca_timer *cca_timer; 582 struct hfi1_pportdata *ppd; 583 int sl; 584 u16 ccti_timer, ccti_min; 585 struct cc_state *cc_state; 586 unsigned long flags; 587 enum hrtimer_restart ret = HRTIMER_NORESTART; 588 589 cca_timer = container_of(t, struct cca_timer, hrtimer); 590 ppd = cca_timer->ppd; 591 sl = cca_timer->sl; 592 593 rcu_read_lock(); 594 595 cc_state = get_cc_state(ppd); 596 597 if (!cc_state) { 598 rcu_read_unlock(); 599 return HRTIMER_NORESTART; 600 } 601 602 /* 603 * 1) decrement ccti for SL 604 * 2) calculate IPG for link (set_link_ipg()) 605 * 3) restart timer, unless ccti is at min value 606 */ 607 608 ccti_min = cc_state->cong_setting.entries[sl].ccti_min; 609 ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer; 610 611 spin_lock_irqsave(&ppd->cca_timer_lock, flags); 612 613 if (cca_timer->ccti > ccti_min) { 614 cca_timer->ccti--; 615 set_link_ipg(ppd); 616 } 617 618 if (cca_timer->ccti > ccti_min) { 619 unsigned long nsec = 1024 * ccti_timer; 620 /* ccti_timer is in units of 1.024 usec */ 621 hrtimer_forward_now(t, ns_to_ktime(nsec)); 622 
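		/*
		 * Illustration: ccti_timer == 500 re-arms the timer
		 * 500 * 1024 ns = 512 usec from now.
		 */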
ret = HRTIMER_RESTART; 623 } 624 625 spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); 626 rcu_read_unlock(); 627 return ret; 628 } 629 630 /* 631 * Common code for initializing the physical port structure. 632 */ 633 void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, 634 struct hfi1_devdata *dd, u8 hw_pidx, u8 port) 635 { 636 int i; 637 uint default_pkey_idx; 638 struct cc_state *cc_state; 639 640 ppd->dd = dd; 641 ppd->hw_pidx = hw_pidx; 642 ppd->port = port; /* IB port number, not index */ 643 ppd->prev_link_width = LINK_WIDTH_DEFAULT; 644 /* 645 * There are C_VL_COUNT number of PortVLXmitWait counters. 646 * Adding 1 to C_VL_COUNT to include the PortXmitWait counter. 647 */ 648 for (i = 0; i < C_VL_COUNT + 1; i++) { 649 ppd->port_vl_xmit_wait_last[i] = 0; 650 ppd->vl_xmit_flit_cnt[i] = 0; 651 } 652 653 default_pkey_idx = 1; 654 655 ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY; 656 ppd->part_enforce |= HFI1_PART_ENFORCE_IN; 657 658 if (loopback) { 659 dd_dev_err(dd, "Faking data partition 0x8001 in idx %u\n", 660 !default_pkey_idx); 661 ppd->pkeys[!default_pkey_idx] = 0x8001; 662 } 663 664 INIT_WORK(&ppd->link_vc_work, handle_verify_cap); 665 INIT_WORK(&ppd->link_up_work, handle_link_up); 666 INIT_WORK(&ppd->link_down_work, handle_link_down); 667 INIT_WORK(&ppd->freeze_work, handle_freeze); 668 INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade); 669 INIT_WORK(&ppd->sma_message_work, handle_sma_message); 670 INIT_WORK(&ppd->link_bounce_work, handle_link_bounce); 671 INIT_DELAYED_WORK(&ppd->start_link_work, handle_start_link); 672 INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work); 673 INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event); 674 675 mutex_init(&ppd->hls_lock); 676 spin_lock_init(&ppd->qsfp_info.qsfp_lock); 677 678 ppd->qsfp_info.ppd = ppd; 679 ppd->sm_trap_qp = 0x0; 680 ppd->sa_qp = 0x1; 681 682 ppd->hfi1_wq = NULL; 683 684 spin_lock_init(&ppd->cca_timer_lock); 685 686 for (i = 0; i < OPA_MAX_SLS; i++) { 687 hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC, 688 HRTIMER_MODE_REL); 689 ppd->cca_timer[i].ppd = ppd; 690 ppd->cca_timer[i].sl = i; 691 ppd->cca_timer[i].ccti = 0; 692 ppd->cca_timer[i].hrtimer.function = cca_timer_fn; 693 } 694 695 ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT; 696 697 spin_lock_init(&ppd->cc_state_lock); 698 spin_lock_init(&ppd->cc_log_lock); 699 cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL); 700 RCU_INIT_POINTER(ppd->cc_state, cc_state); 701 if (!cc_state) 702 goto bail; 703 return; 704 705 bail: 706 dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port); 707 } 708 709 /* 710 * Do initialization for device that is only needed on 711 * first detect, not on resets. 712 */ 713 static int loadtime_init(struct hfi1_devdata *dd) 714 { 715 return 0; 716 } 717 718 /** 719 * init_after_reset - re-initialize after a reset 720 * @dd: the hfi1_ib device 721 * 722 * sanity check at least some of the values after reset, and 723 * ensure no receive or transmit (explicitly, in case reset 724 * failed 725 */ 726 static int init_after_reset(struct hfi1_devdata *dd) 727 { 728 int i; 729 struct hfi1_ctxtdata *rcd; 730 /* 731 * Ensure chip does no sends or receives, tail updates, or 732 * pioavail updates while we re-initialize. This is mostly 733 * for the driver data structures, not chip registers. 
734 */ 735 for (i = 0; i < dd->num_rcv_contexts; i++) { 736 rcd = hfi1_rcd_get_by_index(dd, i); 737 hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | 738 HFI1_RCVCTRL_INTRAVAIL_DIS | 739 HFI1_RCVCTRL_TAILUPD_DIS, rcd); 740 hfi1_rcd_put(rcd); 741 } 742 pio_send_control(dd, PSC_GLOBAL_DISABLE); 743 for (i = 0; i < dd->num_send_contexts; i++) 744 sc_disable(dd->send_contexts[i].sc); 745 746 return 0; 747 } 748 749 static void enable_chip(struct hfi1_devdata *dd) 750 { 751 struct hfi1_ctxtdata *rcd; 752 u32 rcvmask; 753 u16 i; 754 755 /* enable PIO send */ 756 pio_send_control(dd, PSC_GLOBAL_ENABLE); 757 758 /* 759 * Enable kernel ctxts' receive and receive interrupt. 760 * Other ctxts done as user opens and initializes them. 761 */ 762 for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) { 763 rcd = hfi1_rcd_get_by_index(dd, i); 764 if (!rcd) 765 continue; 766 rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB; 767 rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ? 768 HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; 769 if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) 770 rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; 771 if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_RHQ_FULL)) 772 rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; 773 if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL)) 774 rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; 775 hfi1_rcvctrl(dd, rcvmask, rcd); 776 sc_enable(rcd->sc); 777 hfi1_rcd_put(rcd); 778 } 779 } 780 781 /** 782 * create_workqueues - create per port workqueues 783 * @dd: the hfi1_ib device 784 */ 785 static int create_workqueues(struct hfi1_devdata *dd) 786 { 787 int pidx; 788 struct hfi1_pportdata *ppd; 789 790 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 791 ppd = dd->pport + pidx; 792 if (!ppd->hfi1_wq) { 793 ppd->hfi1_wq = 794 alloc_workqueue( 795 "hfi%d_%d", 796 WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE, 797 HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES, 798 dd->unit, pidx); 799 if (!ppd->hfi1_wq) 800 goto wq_error; 801 } 802 if (!ppd->link_wq) { 803 /* 804 * Make the link workqueue single-threaded to enforce 805 * serialization. 806 */ 807 ppd->link_wq = 808 alloc_workqueue( 809 "hfi_link_%d_%d", 810 WQ_SYSFS | WQ_MEM_RECLAIM | WQ_UNBOUND, 811 1, /* max_active */ 812 dd->unit, pidx); 813 if (!ppd->link_wq) 814 goto wq_error; 815 } 816 } 817 return 0; 818 wq_error: 819 pr_err("alloc_workqueue failed for port %d\n", pidx + 1); 820 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 821 ppd = dd->pport + pidx; 822 if (ppd->hfi1_wq) { 823 destroy_workqueue(ppd->hfi1_wq); 824 ppd->hfi1_wq = NULL; 825 } 826 if (ppd->link_wq) { 827 destroy_workqueue(ppd->link_wq); 828 ppd->link_wq = NULL; 829 } 830 } 831 return -ENOMEM; 832 } 833 834 /** 835 * enable_general_intr() - Enable the IRQs that will be handled by the 836 * general interrupt handler. 837 * @dd: valid devdata 838 * 839 */ 840 static void enable_general_intr(struct hfi1_devdata *dd) 841 { 842 set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true); 843 set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true); 844 set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true); 845 set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true); 846 set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true); 847 set_intr_bits(dd, IS_DC_START, IS_DC_END, true); 848 set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true); 849 } 850 851 /** 852 * hfi1_init - do the actual initialization sequence on the chip 853 * @dd: the hfi1_ib device 854 * @reinit: re-initializing, so don't allocate new memory 855 * 856 * Do the actual initialization sequence on the chip. 
This is done 857 * both from the init routine called from the PCI infrastructure, and 858 * when we reset the chip, or detect that it was reset internally, 859 * or it's administratively re-enabled. 860 * 861 * Memory allocation here and in called routines is only done in 862 * the first case (reinit == 0). We have to be careful, because even 863 * without memory allocation, we need to re-write all the chip registers 864 * TIDs, etc. after the reset or enable has completed. 865 */ 866 int hfi1_init(struct hfi1_devdata *dd, int reinit) 867 { 868 int ret = 0, pidx, lastfail = 0; 869 unsigned long len; 870 u16 i; 871 struct hfi1_ctxtdata *rcd; 872 struct hfi1_pportdata *ppd; 873 874 /* Set up send low level handlers */ 875 dd->process_pio_send = hfi1_verbs_send_pio; 876 dd->process_dma_send = hfi1_verbs_send_dma; 877 dd->pio_inline_send = pio_copy; 878 dd->process_vnic_dma_send = hfi1_vnic_send_dma; 879 880 if (is_ax(dd)) { 881 atomic_set(&dd->drop_packet, DROP_PACKET_ON); 882 dd->do_drop = 1; 883 } else { 884 atomic_set(&dd->drop_packet, DROP_PACKET_OFF); 885 dd->do_drop = 0; 886 } 887 888 /* make sure the link is not "up" */ 889 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 890 ppd = dd->pport + pidx; 891 ppd->linkup = 0; 892 } 893 894 if (reinit) 895 ret = init_after_reset(dd); 896 else 897 ret = loadtime_init(dd); 898 if (ret) 899 goto done; 900 901 /* allocate dummy tail memory for all receive contexts */ 902 dd->rcvhdrtail_dummy_kvaddr = dma_zalloc_coherent( 903 &dd->pcidev->dev, sizeof(u64), 904 &dd->rcvhdrtail_dummy_dma, 905 GFP_KERNEL); 906 907 if (!dd->rcvhdrtail_dummy_kvaddr) { 908 dd_dev_err(dd, "cannot allocate dummy tail memory\n"); 909 ret = -ENOMEM; 910 goto done; 911 } 912 913 /* dd->rcd can be NULL if early initialization failed */ 914 for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) { 915 /* 916 * Set up the (kernel) rcvhdr queue and egr TIDs. If doing 917 * re-init, the simplest way to handle this is to free 918 * existing, and re-allocate. 919 * Need to re-create rest of ctxt 0 ctxtdata as well. 920 */ 921 rcd = hfi1_rcd_get_by_index(dd, i); 922 if (!rcd) 923 continue; 924 925 rcd->do_interrupt = &handle_receive_interrupt; 926 927 lastfail = hfi1_create_rcvhdrq(dd, rcd); 928 if (!lastfail) 929 lastfail = hfi1_setup_eagerbufs(rcd); 930 if (lastfail) { 931 dd_dev_err(dd, 932 "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); 933 ret = lastfail; 934 } 935 /* enable IRQ */ 936 hfi1_rcd_put(rcd); 937 } 938 939 /* Allocate enough memory for user event notification. */ 940 len = PAGE_ALIGN(chip_rcv_contexts(dd) * HFI1_MAX_SHARED_CTXTS * 941 sizeof(*dd->events)); 942 dd->events = vmalloc_user(len); 943 if (!dd->events) 944 dd_dev_err(dd, "Failed to allocate user events page\n"); 945 /* 946 * Allocate a page for device and port status. 947 * Page will be shared amongst all user processes. 948 */ 949 dd->status = vmalloc_user(PAGE_SIZE); 950 if (!dd->status) 951 dd_dev_err(dd, "Failed to allocate dev status page\n"); 952 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 953 ppd = dd->pport + pidx; 954 if (dd->status) 955 /* Currently, we only have one port */ 956 ppd->statusp = &dd->status->port; 957 958 set_mtu(ppd); 959 } 960 961 /* enable chip even if we have an error, so we can debug cause */ 962 enable_chip(dd); 963 964 done: 965 /* 966 * Set status even if port serdes is not initialized 967 * so that diags will work. 
968 */ 969 if (dd->status) 970 dd->status->dev |= HFI1_STATUS_CHIP_PRESENT | 971 HFI1_STATUS_INITTED; 972 if (!ret) { 973 /* enable all interrupts from the chip */ 974 enable_general_intr(dd); 975 init_qsfp_int(dd); 976 977 /* chip is OK for user apps; mark it as initialized */ 978 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 979 ppd = dd->pport + pidx; 980 981 /* 982 * start the serdes - must be after interrupts are 983 * enabled so we are notified when the link goes up 984 */ 985 lastfail = bringup_serdes(ppd); 986 if (lastfail) 987 dd_dev_info(dd, 988 "Failed to bring up port %u\n", 989 ppd->port); 990 991 /* 992 * Set status even if port serdes is not initialized 993 * so that diags will work. 994 */ 995 if (ppd->statusp) 996 *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT | 997 HFI1_STATUS_INITTED; 998 if (!ppd->link_speed_enabled) 999 continue; 1000 } 1001 } 1002 1003 /* if ret is non-zero, we probably should do some cleanup here... */ 1004 return ret; 1005 } 1006 1007 static inline struct hfi1_devdata *__hfi1_lookup(int unit) 1008 { 1009 return idr_find(&hfi1_unit_table, unit); 1010 } 1011 1012 struct hfi1_devdata *hfi1_lookup(int unit) 1013 { 1014 struct hfi1_devdata *dd; 1015 unsigned long flags; 1016 1017 spin_lock_irqsave(&hfi1_devs_lock, flags); 1018 dd = __hfi1_lookup(unit); 1019 spin_unlock_irqrestore(&hfi1_devs_lock, flags); 1020 1021 return dd; 1022 } 1023 1024 /* 1025 * Stop the timers during unit shutdown, or after an error late 1026 * in initialization. 1027 */ 1028 static void stop_timers(struct hfi1_devdata *dd) 1029 { 1030 struct hfi1_pportdata *ppd; 1031 int pidx; 1032 1033 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1034 ppd = dd->pport + pidx; 1035 if (ppd->led_override_timer.function) { 1036 del_timer_sync(&ppd->led_override_timer); 1037 atomic_set(&ppd->led_override_timer_active, 0); 1038 } 1039 } 1040 } 1041 1042 /** 1043 * shutdown_device - shut down a device 1044 * @dd: the hfi1_ib device 1045 * 1046 * This is called to make the device quiet when we are about to 1047 * unload the driver, and also when the device is administratively 1048 * disabled. It does not free any data structures. 1049 * Everything it does has to be setup again by hfi1_init(dd, 1) 1050 */ 1051 static void shutdown_device(struct hfi1_devdata *dd) 1052 { 1053 struct hfi1_pportdata *ppd; 1054 struct hfi1_ctxtdata *rcd; 1055 unsigned pidx; 1056 int i; 1057 1058 if (dd->flags & HFI1_SHUTDOWN) 1059 return; 1060 dd->flags |= HFI1_SHUTDOWN; 1061 1062 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1063 ppd = dd->pport + pidx; 1064 1065 ppd->linkup = 0; 1066 if (ppd->statusp) 1067 *ppd->statusp &= ~(HFI1_STATUS_IB_CONF | 1068 HFI1_STATUS_IB_READY); 1069 } 1070 dd->flags &= ~HFI1_INITTED; 1071 1072 /* mask and clean up interrupts */ 1073 set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false); 1074 msix_clean_up_interrupts(dd); 1075 1076 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1077 ppd = dd->pport + pidx; 1078 for (i = 0; i < dd->num_rcv_contexts; i++) { 1079 rcd = hfi1_rcd_get_by_index(dd, i); 1080 hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS | 1081 HFI1_RCVCTRL_CTXT_DIS | 1082 HFI1_RCVCTRL_INTRAVAIL_DIS | 1083 HFI1_RCVCTRL_PKEY_DIS | 1084 HFI1_RCVCTRL_ONE_PKT_EGR_DIS, rcd); 1085 hfi1_rcd_put(rcd); 1086 } 1087 /* 1088 * Gracefully stop all sends allowing any in progress to 1089 * trickle out first. 
1090 */ 1091 for (i = 0; i < dd->num_send_contexts; i++) 1092 sc_flush(dd->send_contexts[i].sc); 1093 } 1094 1095 /* 1096 * Enough for anything that's going to trickle out to have actually 1097 * done so. 1098 */ 1099 udelay(20); 1100 1101 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1102 ppd = dd->pport + pidx; 1103 1104 /* disable all contexts */ 1105 for (i = 0; i < dd->num_send_contexts; i++) 1106 sc_disable(dd->send_contexts[i].sc); 1107 /* disable the send device */ 1108 pio_send_control(dd, PSC_GLOBAL_DISABLE); 1109 1110 shutdown_led_override(ppd); 1111 1112 /* 1113 * Clear SerdesEnable. 1114 * We can't count on interrupts since we are stopping. 1115 */ 1116 hfi1_quiet_serdes(ppd); 1117 1118 if (ppd->hfi1_wq) { 1119 destroy_workqueue(ppd->hfi1_wq); 1120 ppd->hfi1_wq = NULL; 1121 } 1122 if (ppd->link_wq) { 1123 destroy_workqueue(ppd->link_wq); 1124 ppd->link_wq = NULL; 1125 } 1126 } 1127 sdma_exit(dd); 1128 } 1129 1130 /** 1131 * hfi1_free_ctxtdata - free a context's allocated data 1132 * @dd: the hfi1_ib device 1133 * @rcd: the ctxtdata structure 1134 * 1135 * free up any allocated data for a context 1136 * It should never change any chip state, or global driver state. 1137 */ 1138 void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) 1139 { 1140 u32 e; 1141 1142 if (!rcd) 1143 return; 1144 1145 if (rcd->rcvhdrq) { 1146 dma_free_coherent(&dd->pcidev->dev, rcvhdrq_size(rcd), 1147 rcd->rcvhdrq, rcd->rcvhdrq_dma); 1148 rcd->rcvhdrq = NULL; 1149 if (rcd->rcvhdrtail_kvaddr) { 1150 dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, 1151 (void *)rcd->rcvhdrtail_kvaddr, 1152 rcd->rcvhdrqtailaddr_dma); 1153 rcd->rcvhdrtail_kvaddr = NULL; 1154 } 1155 } 1156 1157 /* all the RcvArray entries should have been cleared by now */ 1158 kfree(rcd->egrbufs.rcvtids); 1159 rcd->egrbufs.rcvtids = NULL; 1160 1161 for (e = 0; e < rcd->egrbufs.alloced; e++) { 1162 if (rcd->egrbufs.buffers[e].dma) 1163 dma_free_coherent(&dd->pcidev->dev, 1164 rcd->egrbufs.buffers[e].len, 1165 rcd->egrbufs.buffers[e].addr, 1166 rcd->egrbufs.buffers[e].dma); 1167 } 1168 kfree(rcd->egrbufs.buffers); 1169 rcd->egrbufs.alloced = 0; 1170 rcd->egrbufs.buffers = NULL; 1171 1172 sc_free(rcd->sc); 1173 rcd->sc = NULL; 1174 1175 vfree(rcd->subctxt_uregbase); 1176 vfree(rcd->subctxt_rcvegrbuf); 1177 vfree(rcd->subctxt_rcvhdr_base); 1178 kfree(rcd->opstats); 1179 1180 rcd->subctxt_uregbase = NULL; 1181 rcd->subctxt_rcvegrbuf = NULL; 1182 rcd->subctxt_rcvhdr_base = NULL; 1183 rcd->opstats = NULL; 1184 } 1185 1186 /* 1187 * Release our hold on the shared asic data. If we are the last one, 1188 * return the structure to be finalized outside the lock. Must be 1189 * holding hfi1_devs_lock. 1190 */ 1191 static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd) 1192 { 1193 struct hfi1_asic_data *ad; 1194 int other; 1195 1196 if (!dd->asic_data) 1197 return NULL; 1198 dd->asic_data->dds[dd->hfi1_id] = NULL; 1199 other = dd->hfi1_id ? 0 : 1; 1200 ad = dd->asic_data; 1201 dd->asic_data = NULL; 1202 /* return NULL if the other dd still has a link */ 1203 return ad->dds[other] ? NULL : ad; 1204 } 1205 1206 static void finalize_asic_data(struct hfi1_devdata *dd, 1207 struct hfi1_asic_data *ad) 1208 { 1209 clean_up_i2c(dd, ad); 1210 kfree(ad); 1211 } 1212 1213 /** 1214 * hfi1_clean_devdata - cleans up per-unit data structure 1215 * @dd: pointer to a valid devdata structure 1216 * 1217 * It cleans up all data structures set up by 1218 * by hfi1_alloc_devdata(). 
1219 */ 1220 static void hfi1_clean_devdata(struct hfi1_devdata *dd) 1221 { 1222 struct hfi1_asic_data *ad; 1223 unsigned long flags; 1224 1225 spin_lock_irqsave(&hfi1_devs_lock, flags); 1226 if (!list_empty(&dd->list)) { 1227 idr_remove(&hfi1_unit_table, dd->unit); 1228 list_del_init(&dd->list); 1229 } 1230 ad = release_asic_data(dd); 1231 spin_unlock_irqrestore(&hfi1_devs_lock, flags); 1232 1233 finalize_asic_data(dd, ad); 1234 free_platform_config(dd); 1235 rcu_barrier(); /* wait for rcu callbacks to complete */ 1236 free_percpu(dd->int_counter); 1237 free_percpu(dd->rcv_limit); 1238 free_percpu(dd->send_schedule); 1239 free_percpu(dd->tx_opstats); 1240 dd->int_counter = NULL; 1241 dd->rcv_limit = NULL; 1242 dd->send_schedule = NULL; 1243 dd->tx_opstats = NULL; 1244 kfree(dd->comp_vect); 1245 dd->comp_vect = NULL; 1246 sdma_clean(dd, dd->num_sdma); 1247 rvt_dealloc_device(&dd->verbs_dev.rdi); 1248 } 1249 1250 static void __hfi1_free_devdata(struct kobject *kobj) 1251 { 1252 struct hfi1_devdata *dd = 1253 container_of(kobj, struct hfi1_devdata, kobj); 1254 1255 hfi1_clean_devdata(dd); 1256 } 1257 1258 static struct kobj_type hfi1_devdata_type = { 1259 .release = __hfi1_free_devdata, 1260 }; 1261 1262 void hfi1_free_devdata(struct hfi1_devdata *dd) 1263 { 1264 kobject_put(&dd->kobj); 1265 } 1266 1267 /** 1268 * hfi1_alloc_devdata - Allocate our primary per-unit data structure. 1269 * @pdev: Valid PCI device 1270 * @extra: How many bytes to alloc past the default 1271 * 1272 * Must be done via verbs allocator, because the verbs cleanup process 1273 * both does cleanup and free of the data structure. 1274 * "extra" is for chip-specific data. 1275 * 1276 * Use the idr mechanism to get a unit number for this unit. 1277 */ 1278 static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, 1279 size_t extra) 1280 { 1281 unsigned long flags; 1282 struct hfi1_devdata *dd; 1283 int ret, nports; 1284 1285 /* extra is * number of ports */ 1286 nports = extra / sizeof(struct hfi1_pportdata); 1287 1288 dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra, 1289 nports); 1290 if (!dd) 1291 return ERR_PTR(-ENOMEM); 1292 dd->num_pports = nports; 1293 dd->pport = (struct hfi1_pportdata *)(dd + 1); 1294 dd->pcidev = pdev; 1295 pci_set_drvdata(pdev, dd); 1296 1297 INIT_LIST_HEAD(&dd->list); 1298 idr_preload(GFP_KERNEL); 1299 spin_lock_irqsave(&hfi1_devs_lock, flags); 1300 1301 ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT); 1302 if (ret >= 0) { 1303 dd->unit = ret; 1304 list_add(&dd->list, &hfi1_dev_list); 1305 } 1306 dd->node = -1; 1307 1308 spin_unlock_irqrestore(&hfi1_devs_lock, flags); 1309 idr_preload_end(); 1310 1311 if (ret < 0) { 1312 dev_err(&pdev->dev, 1313 "Could not allocate unit ID: error %d\n", -ret); 1314 goto bail; 1315 } 1316 rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit); 1317 1318 /* 1319 * Initialize all locks for the device. This needs to be as early as 1320 * possible so locks are usable. 
1321 */ 1322 spin_lock_init(&dd->sc_lock); 1323 spin_lock_init(&dd->sendctrl_lock); 1324 spin_lock_init(&dd->rcvctrl_lock); 1325 spin_lock_init(&dd->uctxt_lock); 1326 spin_lock_init(&dd->hfi1_diag_trans_lock); 1327 spin_lock_init(&dd->sc_init_lock); 1328 spin_lock_init(&dd->dc8051_memlock); 1329 seqlock_init(&dd->sc2vl_lock); 1330 spin_lock_init(&dd->sde_map_lock); 1331 spin_lock_init(&dd->pio_map_lock); 1332 mutex_init(&dd->dc8051_lock); 1333 init_waitqueue_head(&dd->event_queue); 1334 spin_lock_init(&dd->irq_src_lock); 1335 1336 dd->int_counter = alloc_percpu(u64); 1337 if (!dd->int_counter) { 1338 ret = -ENOMEM; 1339 goto bail; 1340 } 1341 1342 dd->rcv_limit = alloc_percpu(u64); 1343 if (!dd->rcv_limit) { 1344 ret = -ENOMEM; 1345 goto bail; 1346 } 1347 1348 dd->send_schedule = alloc_percpu(u64); 1349 if (!dd->send_schedule) { 1350 ret = -ENOMEM; 1351 goto bail; 1352 } 1353 1354 dd->tx_opstats = alloc_percpu(struct hfi1_opcode_stats_perctx); 1355 if (!dd->tx_opstats) { 1356 ret = -ENOMEM; 1357 goto bail; 1358 } 1359 1360 dd->comp_vect = kzalloc(sizeof(*dd->comp_vect), GFP_KERNEL); 1361 if (!dd->comp_vect) { 1362 ret = -ENOMEM; 1363 goto bail; 1364 } 1365 1366 kobject_init(&dd->kobj, &hfi1_devdata_type); 1367 return dd; 1368 1369 bail: 1370 hfi1_clean_devdata(dd); 1371 return ERR_PTR(ret); 1372 } 1373 1374 /* 1375 * Called from freeze mode handlers, and from PCI error 1376 * reporting code. Should be paranoid about state of 1377 * system and data structures. 1378 */ 1379 void hfi1_disable_after_error(struct hfi1_devdata *dd) 1380 { 1381 if (dd->flags & HFI1_INITTED) { 1382 u32 pidx; 1383 1384 dd->flags &= ~HFI1_INITTED; 1385 if (dd->pport) 1386 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1387 struct hfi1_pportdata *ppd; 1388 1389 ppd = dd->pport + pidx; 1390 if (dd->flags & HFI1_PRESENT) 1391 set_link_state(ppd, HLS_DN_DISABLE); 1392 1393 if (ppd->statusp) 1394 *ppd->statusp &= ~HFI1_STATUS_IB_READY; 1395 } 1396 } 1397 1398 /* 1399 * Mark as having had an error for driver, and also 1400 * for /sys and status word mapped to user programs. 1401 * This marks unit as not usable, until reset. 1402 */ 1403 if (dd->status) 1404 dd->status->dev |= HFI1_STATUS_HWERROR; 1405 } 1406 1407 static void remove_one(struct pci_dev *); 1408 static int init_one(struct pci_dev *, const struct pci_device_id *); 1409 static void shutdown_one(struct pci_dev *); 1410 1411 #define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: " 1412 #define PFX DRIVER_NAME ": " 1413 1414 const struct pci_device_id hfi1_pci_tbl[] = { 1415 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) }, 1416 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) }, 1417 { 0, } 1418 }; 1419 1420 MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl); 1421 1422 static struct pci_driver hfi1_pci_driver = { 1423 .name = DRIVER_NAME, 1424 .probe = init_one, 1425 .remove = remove_one, 1426 .shutdown = shutdown_one, 1427 .id_table = hfi1_pci_tbl, 1428 .err_handler = &hfi1_pci_err_handler, 1429 }; 1430 1431 static void __init compute_krcvqs(void) 1432 { 1433 int i; 1434 1435 for (i = 0; i < krcvqsset; i++) 1436 n_krcvqs += krcvqs[i]; 1437 } 1438 1439 /* 1440 * Do all the generic driver unit- and chip-independent memory 1441 * allocation and initialization. 
1442 */ 1443 static int __init hfi1_mod_init(void) 1444 { 1445 int ret; 1446 1447 ret = dev_init(); 1448 if (ret) 1449 goto bail; 1450 1451 ret = node_affinity_init(); 1452 if (ret) 1453 goto bail; 1454 1455 /* validate max MTU before any devices start */ 1456 if (!valid_opa_max_mtu(hfi1_max_mtu)) { 1457 pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n", 1458 hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU); 1459 hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU; 1460 } 1461 /* valid CUs run from 1-128 in powers of 2 */ 1462 if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu)) 1463 hfi1_cu = 1; 1464 /* valid credit return threshold is 0-100, variable is unsigned */ 1465 if (user_credit_return_threshold > 100) 1466 user_credit_return_threshold = 100; 1467 1468 compute_krcvqs(); 1469 /* 1470 * sanitize receive interrupt count, time must wait until after 1471 * the hardware type is known 1472 */ 1473 if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK) 1474 rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK; 1475 /* reject invalid combinations */ 1476 if (rcv_intr_count == 0 && rcv_intr_timeout == 0) { 1477 pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n"); 1478 rcv_intr_count = 1; 1479 } 1480 if (rcv_intr_count > 1 && rcv_intr_timeout == 0) { 1481 /* 1482 * Avoid indefinite packet delivery by requiring a timeout 1483 * if count is > 1. 1484 */ 1485 pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n"); 1486 rcv_intr_timeout = 1; 1487 } 1488 if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) { 1489 /* 1490 * The dynamic algorithm expects a non-zero timeout 1491 * and a count > 1. 1492 */ 1493 pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n"); 1494 rcv_intr_dynamic = 0; 1495 } 1496 1497 /* sanitize link CRC options */ 1498 link_crc_mask &= SUPPORTED_CRCS; 1499 1500 /* 1501 * These must be called before the driver is registered with 1502 * the PCI subsystem. 1503 */ 1504 idr_init(&hfi1_unit_table); 1505 1506 hfi1_dbg_init(); 1507 ret = pci_register_driver(&hfi1_pci_driver); 1508 if (ret < 0) { 1509 pr_err("Unable to register driver: error %d\n", -ret); 1510 goto bail_dev; 1511 } 1512 goto bail; /* all OK */ 1513 1514 bail_dev: 1515 hfi1_dbg_exit(); 1516 idr_destroy(&hfi1_unit_table); 1517 dev_cleanup(); 1518 bail: 1519 return ret; 1520 } 1521 1522 module_init(hfi1_mod_init); 1523 1524 /* 1525 * Do the non-unit driver cleanup, memory free, etc. at unload. 
1526 */ 1527 static void __exit hfi1_mod_cleanup(void) 1528 { 1529 pci_unregister_driver(&hfi1_pci_driver); 1530 node_affinity_destroy_all(); 1531 hfi1_dbg_exit(); 1532 1533 idr_destroy(&hfi1_unit_table); 1534 dispose_firmware(); /* asymmetric with obtain_firmware() */ 1535 dev_cleanup(); 1536 } 1537 1538 module_exit(hfi1_mod_cleanup); 1539 1540 /* this can only be called after a successful initialization */ 1541 static void cleanup_device_data(struct hfi1_devdata *dd) 1542 { 1543 int ctxt; 1544 int pidx; 1545 1546 /* users can't do anything more with chip */ 1547 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1548 struct hfi1_pportdata *ppd = &dd->pport[pidx]; 1549 struct cc_state *cc_state; 1550 int i; 1551 1552 if (ppd->statusp) 1553 *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT; 1554 1555 for (i = 0; i < OPA_MAX_SLS; i++) 1556 hrtimer_cancel(&ppd->cca_timer[i].hrtimer); 1557 1558 spin_lock(&ppd->cc_state_lock); 1559 cc_state = get_cc_state_protected(ppd); 1560 RCU_INIT_POINTER(ppd->cc_state, NULL); 1561 spin_unlock(&ppd->cc_state_lock); 1562 1563 if (cc_state) 1564 kfree_rcu(cc_state, rcu); 1565 } 1566 1567 free_credit_return(dd); 1568 1569 if (dd->rcvhdrtail_dummy_kvaddr) { 1570 dma_free_coherent(&dd->pcidev->dev, sizeof(u64), 1571 (void *)dd->rcvhdrtail_dummy_kvaddr, 1572 dd->rcvhdrtail_dummy_dma); 1573 dd->rcvhdrtail_dummy_kvaddr = NULL; 1574 } 1575 1576 /* 1577 * Free any resources still in use (usually just kernel contexts) 1578 * at unload; we do for ctxtcnt, because that's what we allocate. 1579 */ 1580 for (ctxt = 0; dd->rcd && ctxt < dd->num_rcv_contexts; ctxt++) { 1581 struct hfi1_ctxtdata *rcd = dd->rcd[ctxt]; 1582 1583 if (rcd) { 1584 hfi1_clear_tids(rcd); 1585 hfi1_free_ctxt(rcd); 1586 } 1587 } 1588 1589 kfree(dd->rcd); 1590 dd->rcd = NULL; 1591 1592 free_pio_map(dd); 1593 /* must follow rcv context free - need to remove rcv's hooks */ 1594 for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++) 1595 sc_free(dd->send_contexts[ctxt].sc); 1596 dd->num_send_contexts = 0; 1597 kfree(dd->send_contexts); 1598 dd->send_contexts = NULL; 1599 kfree(dd->hw_to_sw); 1600 dd->hw_to_sw = NULL; 1601 kfree(dd->boardname); 1602 vfree(dd->events); 1603 vfree(dd->status); 1604 } 1605 1606 /* 1607 * Clean up on unit shutdown, or error during unit load after 1608 * successful initialization. 
1609 */ 1610 static void postinit_cleanup(struct hfi1_devdata *dd) 1611 { 1612 hfi1_start_cleanup(dd); 1613 hfi1_comp_vectors_clean_up(dd); 1614 hfi1_dev_affinity_clean_up(dd); 1615 1616 hfi1_pcie_ddcleanup(dd); 1617 hfi1_pcie_cleanup(dd->pcidev); 1618 1619 cleanup_device_data(dd); 1620 1621 hfi1_free_devdata(dd); 1622 } 1623 1624 static int init_validate_rcvhdrcnt(struct hfi1_devdata *dd, uint thecnt) 1625 { 1626 if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) { 1627 dd_dev_err(dd, "Receive header queue count too small\n"); 1628 return -EINVAL; 1629 } 1630 1631 if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) { 1632 dd_dev_err(dd, 1633 "Receive header queue count cannot be greater than %u\n", 1634 HFI1_MAX_HDRQ_EGRBUF_CNT); 1635 return -EINVAL; 1636 } 1637 1638 if (thecnt % HDRQ_INCREMENT) { 1639 dd_dev_err(dd, "Receive header queue count %d must be divisible by %lu\n", 1640 thecnt, HDRQ_INCREMENT); 1641 return -EINVAL; 1642 } 1643 1644 return 0; 1645 } 1646 1647 static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) 1648 { 1649 int ret = 0, j, pidx, initfail; 1650 struct hfi1_devdata *dd; 1651 struct hfi1_pportdata *ppd; 1652 1653 /* First, lock the non-writable module parameters */ 1654 HFI1_CAP_LOCK(); 1655 1656 /* Validate dev ids */ 1657 if (!(ent->device == PCI_DEVICE_ID_INTEL0 || 1658 ent->device == PCI_DEVICE_ID_INTEL1)) { 1659 dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n", 1660 ent->device); 1661 ret = -ENODEV; 1662 goto bail; 1663 } 1664 1665 /* Allocate the dd so we can get to work */ 1666 dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS * 1667 sizeof(struct hfi1_pportdata)); 1668 if (IS_ERR(dd)) { 1669 ret = PTR_ERR(dd); 1670 goto bail; 1671 } 1672 1673 /* Validate some global module parameters */ 1674 ret = init_validate_rcvhdrcnt(dd, rcvhdrcnt); 1675 if (ret) 1676 goto bail; 1677 1678 /* use the encoding function as a sanitization check */ 1679 if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) { 1680 dd_dev_err(dd, "Invalid HdrQ Entry size %u\n", 1681 hfi1_hdrq_entsize); 1682 ret = -EINVAL; 1683 goto bail; 1684 } 1685 1686 /* The receive eager buffer size must be set before the receive 1687 * contexts are created. 1688 * 1689 * Set the eager buffer size. Validate that it falls in a range 1690 * allowed by the hardware - all powers of 2 between the min and 1691 * max. The maximum valid MTU is within the eager buffer range 1692 * so we do not need to cap the max_mtu by an eager buffer size 1693 * setting. 1694 */ 1695 if (eager_buffer_size) { 1696 if (!is_power_of_2(eager_buffer_size)) 1697 eager_buffer_size = 1698 roundup_pow_of_two(eager_buffer_size); 1699 eager_buffer_size = 1700 clamp_val(eager_buffer_size, 1701 MIN_EAGER_BUFFER * 8, 1702 MAX_EAGER_BUFFER_TOTAL); 1703 dd_dev_info(dd, "Eager buffer size %u\n", 1704 eager_buffer_size); 1705 } else { 1706 dd_dev_err(dd, "Invalid Eager buffer size of 0\n"); 1707 ret = -EINVAL; 1708 goto bail; 1709 } 1710 1711 /* restrict value of hfi1_rcvarr_split */ 1712 hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100); 1713 1714 ret = hfi1_pcie_init(dd); 1715 if (ret) 1716 goto bail; 1717 1718 /* 1719 * Do device-specific initialization, function table setup, dd 1720 * allocation, etc. 
1721 */ 1722 ret = hfi1_init_dd(dd); 1723 if (ret) 1724 goto clean_bail; /* error already printed */ 1725 1726 ret = create_workqueues(dd); 1727 if (ret) 1728 goto clean_bail; 1729 1730 /* do the generic initialization */ 1731 initfail = hfi1_init(dd, 0); 1732 1733 /* setup vnic */ 1734 hfi1_vnic_setup(dd); 1735 1736 ret = hfi1_register_ib_device(dd); 1737 1738 /* 1739 * Now ready for use. this should be cleared whenever we 1740 * detect a reset, or initiate one. If earlier failure, 1741 * we still create devices, so diags, etc. can be used 1742 * to determine cause of problem. 1743 */ 1744 if (!initfail && !ret) { 1745 dd->flags |= HFI1_INITTED; 1746 /* create debufs files after init and ib register */ 1747 hfi1_dbg_ibdev_init(&dd->verbs_dev); 1748 } 1749 1750 j = hfi1_device_create(dd); 1751 if (j) 1752 dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j); 1753 1754 if (initfail || ret) { 1755 msix_clean_up_interrupts(dd); 1756 stop_timers(dd); 1757 flush_workqueue(ib_wq); 1758 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1759 hfi1_quiet_serdes(dd->pport + pidx); 1760 ppd = dd->pport + pidx; 1761 if (ppd->hfi1_wq) { 1762 destroy_workqueue(ppd->hfi1_wq); 1763 ppd->hfi1_wq = NULL; 1764 } 1765 if (ppd->link_wq) { 1766 destroy_workqueue(ppd->link_wq); 1767 ppd->link_wq = NULL; 1768 } 1769 } 1770 if (!j) 1771 hfi1_device_remove(dd); 1772 if (!ret) 1773 hfi1_unregister_ib_device(dd); 1774 hfi1_vnic_cleanup(dd); 1775 postinit_cleanup(dd); 1776 if (initfail) 1777 ret = initfail; 1778 goto bail; /* everything already cleaned */ 1779 } 1780 1781 sdma_start(dd); 1782 1783 return 0; 1784 1785 clean_bail: 1786 hfi1_pcie_cleanup(pdev); 1787 bail: 1788 return ret; 1789 } 1790 1791 static void wait_for_clients(struct hfi1_devdata *dd) 1792 { 1793 /* 1794 * Remove the device init value and complete the device if there is 1795 * no clients or wait for active clients to finish. 1796 */ 1797 if (atomic_dec_and_test(&dd->user_refcount)) 1798 complete(&dd->user_comp); 1799 1800 wait_for_completion(&dd->user_comp); 1801 } 1802 1803 static void remove_one(struct pci_dev *pdev) 1804 { 1805 struct hfi1_devdata *dd = pci_get_drvdata(pdev); 1806 1807 /* close debugfs files before ib unregister */ 1808 hfi1_dbg_ibdev_exit(&dd->verbs_dev); 1809 1810 /* remove the /dev hfi1 interface */ 1811 hfi1_device_remove(dd); 1812 1813 /* wait for existing user space clients to finish */ 1814 wait_for_clients(dd); 1815 1816 /* unregister from IB core */ 1817 hfi1_unregister_ib_device(dd); 1818 1819 /* cleanup vnic */ 1820 hfi1_vnic_cleanup(dd); 1821 1822 /* 1823 * Disable the IB link, disable interrupts on the device, 1824 * clear dma engines, etc. 1825 */ 1826 shutdown_device(dd); 1827 1828 stop_timers(dd); 1829 1830 /* wait until all of our (qsfp) queue_work() calls complete */ 1831 flush_workqueue(ib_wq); 1832 1833 postinit_cleanup(dd); 1834 } 1835 1836 static void shutdown_one(struct pci_dev *pdev) 1837 { 1838 struct hfi1_devdata *dd = pci_get_drvdata(pdev); 1839 1840 shutdown_device(dd); 1841 } 1842 1843 /** 1844 * hfi1_create_rcvhdrq - create a receive header queue 1845 * @dd: the hfi1_ib device 1846 * @rcd: the context data 1847 * 1848 * This must be contiguous memory (from an i/o perspective), and must be 1849 * DMA'able (which means for some systems, it will go through an IOMMU, 1850 * or be forced into a low address range). 
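 *
 * Rough sizing illustration: the allocation length comes from
 * rcvhdrq_size(rcd); with the module defaults of rcvhdrcnt = 2048 and
 * hdrq_entsize = 32 (128 byte entries), that is about 2048 * 128 B =
 * 256 KB of coherent DMA memory per context.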
 */
int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
{
	unsigned amt;
	u64 reg;

	if (!rcd->rcvhdrq) {
		gfp_t gfp_flags;

		amt = rcvhdrq_size(rcd);

		if (rcd->ctxt < dd->first_dyn_alloc_ctxt || rcd->is_vnic)
			gfp_flags = GFP_KERNEL;
		else
			gfp_flags = GFP_USER;
		rcd->rcvhdrq = dma_zalloc_coherent(
			&dd->pcidev->dev, amt, &rcd->rcvhdrq_dma,
			gfp_flags | __GFP_COMP);

		if (!rcd->rcvhdrq) {
			dd_dev_err(dd,
				   "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
				   amt, rcd->ctxt);
			goto bail;
		}

		if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
		    HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) {
			rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent(
				&dd->pcidev->dev, PAGE_SIZE,
				&rcd->rcvhdrqtailaddr_dma, gfp_flags);
			if (!rcd->rcvhdrtail_kvaddr)
				goto bail_free;
		}
	}
	/*
	 * These values are per-context:
	 *	RcvHdrCnt
	 *	RcvHdrEntSize
	 *	RcvHdrSize
	 */
	reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT)
			& RCV_HDR_CNT_CNT_MASK)
		<< RCV_HDR_CNT_CNT_SHIFT;
	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg);
	reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize)
			& RCV_HDR_ENT_SIZE_ENT_SIZE_MASK)
		<< RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT;
	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg);
	reg = ((u64)DEFAULT_RCVHDRSIZE & RCV_HDR_SIZE_HDR_SIZE_MASK)
		<< RCV_HDR_SIZE_HDR_SIZE_SHIFT;
	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg);

	/*
	 * Program dummy tail address for every receive context
	 * before enabling any receive context
	 */
	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_TAIL_ADDR,
			dd->rcvhdrtail_dummy_dma);

	return 0;

bail_free:
	dd_dev_err(dd,
		   "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
		   rcd->ctxt);
	dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
			  rcd->rcvhdrq_dma);
	rcd->rcvhdrq = NULL;
bail:
	return -ENOMEM;
}

/**
 * hfi1_setup_eagerbufs - allocate eager buffers, both kernel and user
 *			  contexts.
 * @rcd: the context we are setting up.
 *
 * Allocate the eager TID buffers and program them into the chip.
 * They are no longer completely contiguous, we do multiple allocation
 * calls.  Otherwise we get the OOM code involved, by asking for too
 * much per call, with disastrous results on some kernels.
 */
int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
{
	struct hfi1_devdata *dd = rcd->dd;
	u32 max_entries, egrtop, alloced_bytes = 0;
	gfp_t gfp_flags;
	u16 order, idx = 0;
	int ret = 0;
	u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);

	/*
	 * GFP_USER, but without GFP_FS, so buffer cache can be
	 * coalesced (we hope); otherwise, even at order 4,
	 * heavy filesystem activity makes these fail, and we can
	 * use compound pages.
	 */
	gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;

	/*
	 * The minimum size of the eager buffers is a group of MTU-sized
	 * buffers.
	 * The global eager_buffer_size parameter is checked against the
	 * theoretical lower limit of the value.  Here, we check against the
1956 */ 1957 if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size)) 1958 rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size; 1959 /* 1960 * If using one-pkt-per-egr-buffer, lower the eager buffer 1961 * size to the max MTU (page-aligned). 1962 */ 1963 if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) 1964 rcd->egrbufs.rcvtid_size = round_mtu; 1965 1966 /* 1967 * Eager buffers sizes of 1MB or less require smaller TID sizes 1968 * to satisfy the "multiple of 8 RcvArray entries" requirement. 1969 */ 1970 if (rcd->egrbufs.size <= (1 << 20)) 1971 rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu, 1972 rounddown_pow_of_two(rcd->egrbufs.size / 8)); 1973 1974 while (alloced_bytes < rcd->egrbufs.size && 1975 rcd->egrbufs.alloced < rcd->egrbufs.count) { 1976 rcd->egrbufs.buffers[idx].addr = 1977 dma_zalloc_coherent(&dd->pcidev->dev, 1978 rcd->egrbufs.rcvtid_size, 1979 &rcd->egrbufs.buffers[idx].dma, 1980 gfp_flags); 1981 if (rcd->egrbufs.buffers[idx].addr) { 1982 rcd->egrbufs.buffers[idx].len = 1983 rcd->egrbufs.rcvtid_size; 1984 rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr = 1985 rcd->egrbufs.buffers[idx].addr; 1986 rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].dma = 1987 rcd->egrbufs.buffers[idx].dma; 1988 rcd->egrbufs.alloced++; 1989 alloced_bytes += rcd->egrbufs.rcvtid_size; 1990 idx++; 1991 } else { 1992 u32 new_size, i, j; 1993 u64 offset = 0; 1994 1995 /* 1996 * Fail the eager buffer allocation if: 1997 * - we are already using the lowest acceptable size 1998 * - we are using one-pkt-per-egr-buffer (this implies 1999 * that we are accepting only one size) 2000 */ 2001 if (rcd->egrbufs.rcvtid_size == round_mtu || 2002 !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) { 2003 dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n", 2004 rcd->ctxt); 2005 ret = -ENOMEM; 2006 goto bail_rcvegrbuf_phys; 2007 } 2008 2009 new_size = rcd->egrbufs.rcvtid_size / 2; 2010 2011 /* 2012 * If the first attempt to allocate memory failed, don't 2013 * fail everything but continue with the next lower 2014 * size. 2015 */ 2016 if (idx == 0) { 2017 rcd->egrbufs.rcvtid_size = new_size; 2018 continue; 2019 } 2020 2021 /* 2022 * Re-partition already allocated buffers to a smaller 2023 * size. 2024 */ 2025 rcd->egrbufs.alloced = 0; 2026 for (i = 0, j = 0, offset = 0; j < idx; i++) { 2027 if (i >= rcd->egrbufs.count) 2028 break; 2029 rcd->egrbufs.rcvtids[i].dma = 2030 rcd->egrbufs.buffers[j].dma + offset; 2031 rcd->egrbufs.rcvtids[i].addr = 2032 rcd->egrbufs.buffers[j].addr + offset; 2033 rcd->egrbufs.alloced++; 2034 if ((rcd->egrbufs.buffers[j].dma + offset + 2035 new_size) == 2036 (rcd->egrbufs.buffers[j].dma + 2037 rcd->egrbufs.buffers[j].len)) { 2038 j++; 2039 offset = 0; 2040 } else { 2041 offset += new_size; 2042 } 2043 } 2044 rcd->egrbufs.rcvtid_size = new_size; 2045 } 2046 } 2047 rcd->egrbufs.numbufs = idx; 2048 rcd->egrbufs.size = alloced_bytes; 2049 2050 hfi1_cdbg(PROC, 2051 "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n", 2052 rcd->ctxt, rcd->egrbufs.alloced, 2053 rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024); 2054 2055 /* 2056 * Set the contexts rcv array head update threshold to the closest 2057 * power of 2 (so we can use a mask instead of modulo) below half 2058 * the allocated entries. 2059 */ 2060 rcd->egrbufs.threshold = 2061 rounddown_pow_of_two(rcd->egrbufs.alloced / 2); 2062 /* 2063 * Compute the expected RcvArray entry base. 
This is done after 2064 * allocating the eager buffers in order to maximize the 2065 * expected RcvArray entries for the context. 2066 */ 2067 max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size; 2068 egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size); 2069 rcd->expected_count = max_entries - egrtop; 2070 if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2) 2071 rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2; 2072 2073 rcd->expected_base = rcd->eager_base + egrtop; 2074 hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n", 2075 rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count, 2076 rcd->eager_base, rcd->expected_base); 2077 2078 if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) { 2079 hfi1_cdbg(PROC, 2080 "ctxt%u: current Eager buffer size is invalid %u\n", 2081 rcd->ctxt, rcd->egrbufs.rcvtid_size); 2082 ret = -EINVAL; 2083 goto bail_rcvegrbuf_phys; 2084 } 2085 2086 for (idx = 0; idx < rcd->egrbufs.alloced; idx++) { 2087 hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER, 2088 rcd->egrbufs.rcvtids[idx].dma, order); 2089 cond_resched(); 2090 } 2091 2092 return 0; 2093 2094 bail_rcvegrbuf_phys: 2095 for (idx = 0; idx < rcd->egrbufs.alloced && 2096 rcd->egrbufs.buffers[idx].addr; 2097 idx++) { 2098 dma_free_coherent(&dd->pcidev->dev, 2099 rcd->egrbufs.buffers[idx].len, 2100 rcd->egrbufs.buffers[idx].addr, 2101 rcd->egrbufs.buffers[idx].dma); 2102 rcd->egrbufs.buffers[idx].addr = NULL; 2103 rcd->egrbufs.buffers[idx].dma = 0; 2104 rcd->egrbufs.buffers[idx].len = 0; 2105 } 2106 2107 return ret; 2108 } 2109