1 /* 2 * Copyright(c) 2015 - 2017 Intel Corporation. 3 * 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 * redistributing this file, you may do so under either license. 6 * 7 * GPL LICENSE SUMMARY 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of version 2 of the GNU General Public License as 11 * published by the Free Software Foundation. 12 * 13 * This program is distributed in the hope that it will be useful, but 14 * WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * General Public License for more details. 17 * 18 * BSD LICENSE 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 24 * - Redistributions of source code must retain the above copyright 25 * notice, this list of conditions and the following disclaimer. 26 * - Redistributions in binary form must reproduce the above copyright 27 * notice, this list of conditions and the following disclaimer in 28 * the documentation and/or other materials provided with the 29 * distribution. 30 * - Neither the name of Intel Corporation nor the names of its 31 * contributors may be used to endorse or promote products derived 32 * from this software without specific prior written permission. 33 * 34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 45 * 46 */ 47 48 #include <rdma/ib_mad.h> 49 #include <rdma/ib_user_verbs.h> 50 #include <linux/io.h> 51 #include <linux/module.h> 52 #include <linux/utsname.h> 53 #include <linux/rculist.h> 54 #include <linux/mm.h> 55 #include <linux/vmalloc.h> 56 #include <rdma/opa_addr.h> 57 58 #include "hfi.h" 59 #include "common.h" 60 #include "device.h" 61 #include "trace.h" 62 #include "qp.h" 63 #include "verbs_txreq.h" 64 #include "debugfs.h" 65 #include "vnic.h" 66 67 static unsigned int hfi1_lkey_table_size = 16; 68 module_param_named(lkey_table_size, hfi1_lkey_table_size, uint, 69 S_IRUGO); 70 MODULE_PARM_DESC(lkey_table_size, 71 "LKEY table size in bits (2^n, 1 <= n <= 23)"); 72 73 static unsigned int hfi1_max_pds = 0xFFFF; 74 module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO); 75 MODULE_PARM_DESC(max_pds, 76 "Maximum number of protection domains to support"); 77 78 static unsigned int hfi1_max_ahs = 0xFFFF; 79 module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO); 80 MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support"); 81 82 unsigned int hfi1_max_cqes = 0x2FFFFF; 83 module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO); 84 MODULE_PARM_DESC(max_cqes, 85 "Maximum number of completion queue entries to support"); 86 87 unsigned int hfi1_max_cqs = 0x1FFFF; 88 module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO); 89 MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support"); 90 91 unsigned int hfi1_max_qp_wrs = 0x3FFF; 92 module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO); 93 MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support"); 94 95 unsigned int hfi1_max_qps = 32768; 96 module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO); 97 MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support"); 98 99 unsigned int hfi1_max_sges = 0x60; 100 module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO); 101 MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support"); 102 103 unsigned int hfi1_max_mcast_grps = 16384; 104 module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO); 105 MODULE_PARM_DESC(max_mcast_grps, 106 "Maximum number of multicast groups to support"); 107 108 unsigned int hfi1_max_mcast_qp_attached = 16; 109 module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached, 110 uint, S_IRUGO); 111 MODULE_PARM_DESC(max_mcast_qp_attached, 112 "Maximum number of attached QPs to support"); 113 114 unsigned int hfi1_max_srqs = 1024; 115 module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO); 116 MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support"); 117 118 unsigned int hfi1_max_srq_sges = 128; 119 module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO); 120 MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support"); 121 122 unsigned int hfi1_max_srq_wrs = 0x1FFFF; 123 module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO); 124 MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support"); 125 126 unsigned short piothreshold = 256; 127 module_param(piothreshold, ushort, S_IRUGO); 128 MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio"); 129 130 #define COPY_CACHELESS 1 131 #define COPY_ADAPTIVE 2 132 static unsigned int sge_copy_mode; 133 module_param(sge_copy_mode, uint, S_IRUGO); 134 MODULE_PARM_DESC(sge_copy_mode, 135 "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS"); 136 137 static void verbs_sdma_complete( 138 struct sdma_txreq *cookie, 139 int status); 140 141 static int pio_wait(struct rvt_qp *qp, 142 struct send_context *sc, 143 struct hfi1_pkt_state *ps, 144 u32 flag); 145 146 /* Length of buffer to create verbs txreq cache name */ 147 #define TXREQ_NAME_LEN 24 148 149 /* 16B trailing buffer */ 150 static const u8 trail_buf[MAX_16B_PADDING]; 151 152 static uint wss_threshold; 153 module_param(wss_threshold, uint, S_IRUGO); 154 MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy"); 155 static uint wss_clean_period = 256; 156 module_param(wss_clean_period, uint, S_IRUGO); 157 MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned"); 158 159 /* memory working set size */ 160 struct hfi1_wss { 161 unsigned long *entries; 162 atomic_t total_count; 163 atomic_t clean_counter; 164 atomic_t clean_entry; 165 166 int threshold; 167 int num_entries; 168 long pages_mask; 169 }; 170 171 static struct hfi1_wss wss; 172 173 int hfi1_wss_init(void) 174 { 175 long llc_size; 176 long llc_bits; 177 long table_size; 178 long table_bits; 179 180 /* check for a valid percent range - default to 80 if none or invalid */ 181 if (wss_threshold < 1 || wss_threshold > 100) 182 wss_threshold = 80; 183 /* reject a wildly large period */ 184 if (wss_clean_period > 1000000) 185 wss_clean_period = 256; 186 /* reject a zero period */ 187 if (wss_clean_period == 0) 188 wss_clean_period = 1; 189 190 /* 191 * Calculate the table size - the next power of 2 larger than the 192 * LLC size. LLC size is in KiB. 193 */ 194 llc_size = wss_llc_size() * 1024; 195 table_size = roundup_pow_of_two(llc_size); 196 197 /* one bit per page in rounded up table */ 198 llc_bits = llc_size / PAGE_SIZE; 199 table_bits = table_size / PAGE_SIZE; 200 wss.pages_mask = table_bits - 1; 201 wss.num_entries = table_bits / BITS_PER_LONG; 202 203 wss.threshold = (llc_bits * wss_threshold) / 100; 204 if (wss.threshold == 0) 205 wss.threshold = 1; 206 207 atomic_set(&wss.clean_counter, wss_clean_period); 208 209 wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries), 210 GFP_KERNEL); 211 if (!wss.entries) { 212 hfi1_wss_exit(); 213 return -ENOMEM; 214 } 215 216 return 0; 217 } 218 219 void hfi1_wss_exit(void) 220 { 221 /* coded to handle partially initialized and repeat callers */ 222 kfree(wss.entries); 223 wss.entries = NULL; 224 } 225 226 /* 227 * Advance the clean counter. When the clean period has expired, 228 * clean an entry. 229 * 230 * This is implemented in atomics to avoid locking. Because multiple 231 * variables are involved, it can be racy which can lead to slightly 232 * inaccurate information. Since this is only a heuristic, this is 233 * OK. Any innaccuracies will clean themselves out as the counter 234 * advances. That said, it is unlikely the entry clean operation will 235 * race - the next possible racer will not start until the next clean 236 * period. 237 * 238 * The clean counter is implemented as a decrement to zero. When zero 239 * is reached an entry is cleaned. 240 */ 241 static void wss_advance_clean_counter(void) 242 { 243 int entry; 244 int weight; 245 unsigned long bits; 246 247 /* become the cleaner if we decrement the counter to zero */ 248 if (atomic_dec_and_test(&wss.clean_counter)) { 249 /* 250 * Set, not add, the clean period. This avoids an issue 251 * where the counter could decrement below the clean period. 252 * Doing a set can result in lost decrements, slowing the 253 * clean advance. Since this a heuristic, this possible 254 * slowdown is OK. 255 * 256 * An alternative is to loop, advancing the counter by a 257 * clean period until the result is > 0. However, this could 258 * lead to several threads keeping another in the clean loop. 259 * This could be mitigated by limiting the number of times 260 * we stay in the loop. 261 */ 262 atomic_set(&wss.clean_counter, wss_clean_period); 263 264 /* 265 * Uniquely grab the entry to clean and move to next. 266 * The current entry is always the lower bits of 267 * wss.clean_entry. The table size, wss.num_entries, 268 * is always a power-of-2. 269 */ 270 entry = (atomic_inc_return(&wss.clean_entry) - 1) 271 & (wss.num_entries - 1); 272 273 /* clear the entry and count the bits */ 274 bits = xchg(&wss.entries[entry], 0); 275 weight = hweight64((u64)bits); 276 /* only adjust the contended total count if needed */ 277 if (weight) 278 atomic_sub(weight, &wss.total_count); 279 } 280 } 281 282 /* 283 * Insert the given address into the working set array. 284 */ 285 static void wss_insert(void *address) 286 { 287 u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask; 288 u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ 289 u32 nr = page & (BITS_PER_LONG - 1); 290 291 if (!test_and_set_bit(nr, &wss.entries[entry])) 292 atomic_inc(&wss.total_count); 293 294 wss_advance_clean_counter(); 295 } 296 297 /* 298 * Is the working set larger than the threshold? 299 */ 300 static inline bool wss_exceeds_threshold(void) 301 { 302 return atomic_read(&wss.total_count) >= wss.threshold; 303 } 304 305 /* 306 * Translate ib_wr_opcode into ib_wc_opcode. 307 */ 308 const enum ib_wc_opcode ib_hfi1_wc_opcode[] = { 309 [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, 310 [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, 311 [IB_WR_SEND] = IB_WC_SEND, 312 [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, 313 [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, 314 [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, 315 [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD, 316 [IB_WR_SEND_WITH_INV] = IB_WC_SEND, 317 [IB_WR_LOCAL_INV] = IB_WC_LOCAL_INV, 318 [IB_WR_REG_MR] = IB_WC_REG_MR 319 }; 320 321 /* 322 * Length of header by opcode, 0 --> not supported 323 */ 324 const u8 hdr_len_by_opcode[256] = { 325 /* RC */ 326 [IB_OPCODE_RC_SEND_FIRST] = 12 + 8, 327 [IB_OPCODE_RC_SEND_MIDDLE] = 12 + 8, 328 [IB_OPCODE_RC_SEND_LAST] = 12 + 8, 329 [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, 330 [IB_OPCODE_RC_SEND_ONLY] = 12 + 8, 331 [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4, 332 [IB_OPCODE_RC_RDMA_WRITE_FIRST] = 12 + 8 + 16, 333 [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = 12 + 8, 334 [IB_OPCODE_RC_RDMA_WRITE_LAST] = 12 + 8, 335 [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, 336 [IB_OPCODE_RC_RDMA_WRITE_ONLY] = 12 + 8 + 16, 337 [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20, 338 [IB_OPCODE_RC_RDMA_READ_REQUEST] = 12 + 8 + 16, 339 [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = 12 + 8 + 4, 340 [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = 12 + 8, 341 [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = 12 + 8 + 4, 342 [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = 12 + 8 + 4, 343 [IB_OPCODE_RC_ACKNOWLEDGE] = 12 + 8 + 4, 344 [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = 12 + 8 + 4 + 8, 345 [IB_OPCODE_RC_COMPARE_SWAP] = 12 + 8 + 28, 346 [IB_OPCODE_RC_FETCH_ADD] = 12 + 8 + 28, 347 [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = 12 + 8 + 4, 348 [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4, 349 /* UC */ 350 [IB_OPCODE_UC_SEND_FIRST] = 12 + 8, 351 [IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8, 352 [IB_OPCODE_UC_SEND_LAST] = 12 + 8, 353 [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, 354 [IB_OPCODE_UC_SEND_ONLY] = 12 + 8, 355 [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4, 356 [IB_OPCODE_UC_RDMA_WRITE_FIRST] = 12 + 8 + 16, 357 [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = 12 + 8, 358 [IB_OPCODE_UC_RDMA_WRITE_LAST] = 12 + 8, 359 [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, 360 [IB_OPCODE_UC_RDMA_WRITE_ONLY] = 12 + 8 + 16, 361 [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20, 362 /* UD */ 363 [IB_OPCODE_UD_SEND_ONLY] = 12 + 8 + 8, 364 [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 12 365 }; 366 367 static const opcode_handler opcode_handler_tbl[256] = { 368 /* RC */ 369 [IB_OPCODE_RC_SEND_FIRST] = &hfi1_rc_rcv, 370 [IB_OPCODE_RC_SEND_MIDDLE] = &hfi1_rc_rcv, 371 [IB_OPCODE_RC_SEND_LAST] = &hfi1_rc_rcv, 372 [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv, 373 [IB_OPCODE_RC_SEND_ONLY] = &hfi1_rc_rcv, 374 [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv, 375 [IB_OPCODE_RC_RDMA_WRITE_FIRST] = &hfi1_rc_rcv, 376 [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = &hfi1_rc_rcv, 377 [IB_OPCODE_RC_RDMA_WRITE_LAST] = &hfi1_rc_rcv, 378 [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv, 379 [IB_OPCODE_RC_RDMA_WRITE_ONLY] = &hfi1_rc_rcv, 380 [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv, 381 [IB_OPCODE_RC_RDMA_READ_REQUEST] = &hfi1_rc_rcv, 382 [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = &hfi1_rc_rcv, 383 [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = &hfi1_rc_rcv, 384 [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = &hfi1_rc_rcv, 385 [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = &hfi1_rc_rcv, 386 [IB_OPCODE_RC_ACKNOWLEDGE] = &hfi1_rc_rcv, 387 [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = &hfi1_rc_rcv, 388 [IB_OPCODE_RC_COMPARE_SWAP] = &hfi1_rc_rcv, 389 [IB_OPCODE_RC_FETCH_ADD] = &hfi1_rc_rcv, 390 [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = &hfi1_rc_rcv, 391 [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv, 392 /* UC */ 393 [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv, 394 [IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv, 395 [IB_OPCODE_UC_SEND_LAST] = &hfi1_uc_rcv, 396 [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv, 397 [IB_OPCODE_UC_SEND_ONLY] = &hfi1_uc_rcv, 398 [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv, 399 [IB_OPCODE_UC_RDMA_WRITE_FIRST] = &hfi1_uc_rcv, 400 [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = &hfi1_uc_rcv, 401 [IB_OPCODE_UC_RDMA_WRITE_LAST] = &hfi1_uc_rcv, 402 [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv, 403 [IB_OPCODE_UC_RDMA_WRITE_ONLY] = &hfi1_uc_rcv, 404 [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv, 405 /* UD */ 406 [IB_OPCODE_UD_SEND_ONLY] = &hfi1_ud_rcv, 407 [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_ud_rcv, 408 /* CNP */ 409 [IB_OPCODE_CNP] = &hfi1_cnp_rcv 410 }; 411 412 #define OPMASK 0x1f 413 414 static const u32 pio_opmask[BIT(3)] = { 415 /* RC */ 416 [IB_OPCODE_RC >> 5] = 417 BIT(RC_OP(SEND_ONLY) & OPMASK) | 418 BIT(RC_OP(SEND_ONLY_WITH_IMMEDIATE) & OPMASK) | 419 BIT(RC_OP(RDMA_WRITE_ONLY) & OPMASK) | 420 BIT(RC_OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & OPMASK) | 421 BIT(RC_OP(RDMA_READ_REQUEST) & OPMASK) | 422 BIT(RC_OP(ACKNOWLEDGE) & OPMASK) | 423 BIT(RC_OP(ATOMIC_ACKNOWLEDGE) & OPMASK) | 424 BIT(RC_OP(COMPARE_SWAP) & OPMASK) | 425 BIT(RC_OP(FETCH_ADD) & OPMASK), 426 /* UC */ 427 [IB_OPCODE_UC >> 5] = 428 BIT(UC_OP(SEND_ONLY) & OPMASK) | 429 BIT(UC_OP(SEND_ONLY_WITH_IMMEDIATE) & OPMASK) | 430 BIT(UC_OP(RDMA_WRITE_ONLY) & OPMASK) | 431 BIT(UC_OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & OPMASK), 432 }; 433 434 /* 435 * System image GUID. 436 */ 437 __be64 ib_hfi1_sys_image_guid; 438 439 /** 440 * hfi1_copy_sge - copy data to SGE memory 441 * @ss: the SGE state 442 * @data: the data to copy 443 * @length: the length of the data 444 * @release: boolean to release MR 445 * @copy_last: do a separate copy of the last 8 bytes 446 */ 447 void hfi1_copy_sge( 448 struct rvt_sge_state *ss, 449 void *data, u32 length, 450 bool release, 451 bool copy_last) 452 { 453 struct rvt_sge *sge = &ss->sge; 454 int i; 455 bool in_last = false; 456 bool cacheless_copy = false; 457 458 if (sge_copy_mode == COPY_CACHELESS) { 459 cacheless_copy = length >= PAGE_SIZE; 460 } else if (sge_copy_mode == COPY_ADAPTIVE) { 461 if (length >= PAGE_SIZE) { 462 /* 463 * NOTE: this *assumes*: 464 * o The first vaddr is the dest. 465 * o If multiple pages, then vaddr is sequential. 466 */ 467 wss_insert(sge->vaddr); 468 if (length >= (2 * PAGE_SIZE)) 469 wss_insert(sge->vaddr + PAGE_SIZE); 470 471 cacheless_copy = wss_exceeds_threshold(); 472 } else { 473 wss_advance_clean_counter(); 474 } 475 } 476 if (copy_last) { 477 if (length > 8) { 478 length -= 8; 479 } else { 480 copy_last = false; 481 in_last = true; 482 } 483 } 484 485 again: 486 while (length) { 487 u32 len = rvt_get_sge_length(sge, length); 488 489 WARN_ON_ONCE(len == 0); 490 if (unlikely(in_last)) { 491 /* enforce byte transfer ordering */ 492 for (i = 0; i < len; i++) 493 ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; 494 } else if (cacheless_copy) { 495 cacheless_memcpy(sge->vaddr, data, len); 496 } else { 497 memcpy(sge->vaddr, data, len); 498 } 499 rvt_update_sge(ss, len, release); 500 data += len; 501 length -= len; 502 } 503 504 if (copy_last) { 505 copy_last = false; 506 in_last = true; 507 length = 8; 508 goto again; 509 } 510 } 511 512 /* 513 * Make sure the QP is ready and able to accept the given opcode. 514 */ 515 static inline opcode_handler qp_ok(struct hfi1_packet *packet) 516 { 517 if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK)) 518 return NULL; 519 if (((packet->opcode & RVT_OPCODE_QP_MASK) == 520 packet->qp->allowed_ops) || 521 (packet->opcode == IB_OPCODE_CNP)) 522 return opcode_handler_tbl[packet->opcode]; 523 524 return NULL; 525 } 526 527 static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc) 528 { 529 #ifdef CONFIG_FAULT_INJECTION 530 if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) 531 /* 532 * In order to drop non-IB traffic we 533 * set PbcInsertHrc to NONE (0x2). 534 * The packet will still be delivered 535 * to the receiving node but a 536 * KHdrHCRCErr (KDETH packet with a bad 537 * HCRC) will be triggered and the 538 * packet will not be delivered to the 539 * correct context. 540 */ 541 pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT; 542 else 543 /* 544 * In order to drop regular verbs 545 * traffic we set the PbcTestEbp 546 * flag. The packet will still be 547 * delivered to the receiving node but 548 * a 'late ebp error' will be 549 * triggered and will be dropped. 550 */ 551 pbc |= PBC_TEST_EBP; 552 #endif 553 return pbc; 554 } 555 556 static int hfi1_do_pkey_check(struct hfi1_packet *packet) 557 { 558 struct hfi1_ctxtdata *rcd = packet->rcd; 559 struct hfi1_pportdata *ppd = rcd->ppd; 560 struct hfi1_16b_header *hdr = packet->hdr; 561 u16 pkey; 562 563 /* Pkey check needed only for bypass packets */ 564 if (packet->etype != RHF_RCV_TYPE_BYPASS) 565 return 0; 566 567 /* Perform pkey check */ 568 pkey = hfi1_16B_get_pkey(hdr); 569 return ingress_pkey_check(ppd, pkey, packet->sc, 570 packet->qp->s_pkey_index, 571 packet->slid, true); 572 } 573 574 static inline void hfi1_handle_packet(struct hfi1_packet *packet, 575 bool is_mcast) 576 { 577 u32 qp_num; 578 struct hfi1_ctxtdata *rcd = packet->rcd; 579 struct hfi1_pportdata *ppd = rcd->ppd; 580 struct hfi1_ibport *ibp = rcd_to_iport(rcd); 581 struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi; 582 opcode_handler packet_handler; 583 unsigned long flags; 584 585 inc_opstats(packet->tlen, &rcd->opstats->stats[packet->opcode]); 586 587 if (unlikely(is_mcast)) { 588 struct rvt_mcast *mcast; 589 struct rvt_mcast_qp *p; 590 591 if (!packet->grh) 592 goto drop; 593 mcast = rvt_mcast_find(&ibp->rvp, 594 &packet->grh->dgid, 595 opa_get_lid(packet->dlid, 9B)); 596 if (!mcast) 597 goto drop; 598 list_for_each_entry_rcu(p, &mcast->qp_list, list) { 599 packet->qp = p->qp; 600 if (hfi1_do_pkey_check(packet)) 601 goto drop; 602 spin_lock_irqsave(&packet->qp->r_lock, flags); 603 packet_handler = qp_ok(packet); 604 if (likely(packet_handler)) 605 packet_handler(packet); 606 else 607 ibp->rvp.n_pkt_drops++; 608 spin_unlock_irqrestore(&packet->qp->r_lock, flags); 609 } 610 /* 611 * Notify rvt_multicast_detach() if it is waiting for us 612 * to finish. 613 */ 614 if (atomic_dec_return(&mcast->refcount) <= 1) 615 wake_up(&mcast->wait); 616 } else { 617 /* Get the destination QP number. */ 618 qp_num = ib_bth_get_qpn(packet->ohdr); 619 rcu_read_lock(); 620 packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); 621 if (!packet->qp) 622 goto unlock_drop; 623 624 if (hfi1_do_pkey_check(packet)) 625 goto unlock_drop; 626 627 if (unlikely(hfi1_dbg_fault_opcode(packet->qp, packet->opcode, 628 true))) 629 goto unlock_drop; 630 631 spin_lock_irqsave(&packet->qp->r_lock, flags); 632 packet_handler = qp_ok(packet); 633 if (likely(packet_handler)) 634 packet_handler(packet); 635 else 636 ibp->rvp.n_pkt_drops++; 637 spin_unlock_irqrestore(&packet->qp->r_lock, flags); 638 rcu_read_unlock(); 639 } 640 return; 641 unlock_drop: 642 rcu_read_unlock(); 643 drop: 644 ibp->rvp.n_pkt_drops++; 645 } 646 647 /** 648 * hfi1_ib_rcv - process an incoming packet 649 * @packet: data packet information 650 * 651 * This is called to process an incoming packet at interrupt level. 652 */ 653 void hfi1_ib_rcv(struct hfi1_packet *packet) 654 { 655 struct hfi1_ctxtdata *rcd = packet->rcd; 656 657 trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); 658 hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid)); 659 } 660 661 void hfi1_16B_rcv(struct hfi1_packet *packet) 662 { 663 struct hfi1_ctxtdata *rcd = packet->rcd; 664 665 trace_input_ibhdr(rcd->dd, packet, false); 666 hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid)); 667 } 668 669 /* 670 * This is called from a timer to check for QPs 671 * which need kernel memory in order to send a packet. 672 */ 673 static void mem_timer(struct timer_list *t) 674 { 675 struct hfi1_ibdev *dev = from_timer(dev, t, mem_timer); 676 struct list_head *list = &dev->memwait; 677 struct rvt_qp *qp = NULL; 678 struct iowait *wait; 679 unsigned long flags; 680 struct hfi1_qp_priv *priv; 681 682 write_seqlock_irqsave(&dev->iowait_lock, flags); 683 if (!list_empty(list)) { 684 wait = list_first_entry(list, struct iowait, list); 685 qp = iowait_to_qp(wait); 686 priv = qp->priv; 687 list_del_init(&priv->s_iowait.list); 688 priv->s_iowait.lock = NULL; 689 /* refcount held until actual wake up */ 690 if (!list_empty(list)) 691 mod_timer(&dev->mem_timer, jiffies + 1); 692 } 693 write_sequnlock_irqrestore(&dev->iowait_lock, flags); 694 695 if (qp) 696 hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM); 697 } 698 699 /* 700 * This is called with progress side lock held. 701 */ 702 /* New API */ 703 static void verbs_sdma_complete( 704 struct sdma_txreq *cookie, 705 int status) 706 { 707 struct verbs_txreq *tx = 708 container_of(cookie, struct verbs_txreq, txreq); 709 struct rvt_qp *qp = tx->qp; 710 711 spin_lock(&qp->s_lock); 712 if (tx->wqe) { 713 hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS); 714 } else if (qp->ibqp.qp_type == IB_QPT_RC) { 715 struct hfi1_opa_header *hdr; 716 717 hdr = &tx->phdr.hdr; 718 hfi1_rc_send_complete(qp, hdr); 719 } 720 spin_unlock(&qp->s_lock); 721 722 hfi1_put_txreq(tx); 723 } 724 725 static int wait_kmem(struct hfi1_ibdev *dev, 726 struct rvt_qp *qp, 727 struct hfi1_pkt_state *ps) 728 { 729 struct hfi1_qp_priv *priv = qp->priv; 730 unsigned long flags; 731 int ret = 0; 732 733 spin_lock_irqsave(&qp->s_lock, flags); 734 if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { 735 write_seqlock(&dev->iowait_lock); 736 list_add_tail(&ps->s_txreq->txreq.list, 737 &priv->s_iowait.tx_head); 738 if (list_empty(&priv->s_iowait.list)) { 739 if (list_empty(&dev->memwait)) 740 mod_timer(&dev->mem_timer, jiffies + 1); 741 qp->s_flags |= RVT_S_WAIT_KMEM; 742 list_add_tail(&priv->s_iowait.list, &dev->memwait); 743 priv->s_iowait.lock = &dev->iowait_lock; 744 trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM); 745 rvt_get_qp(qp); 746 } 747 write_sequnlock(&dev->iowait_lock); 748 qp->s_flags &= ~RVT_S_BUSY; 749 ret = -EBUSY; 750 } 751 spin_unlock_irqrestore(&qp->s_lock, flags); 752 753 return ret; 754 } 755 756 /* 757 * This routine calls txadds for each sg entry. 758 * 759 * Add failures will revert the sge cursor 760 */ 761 static noinline int build_verbs_ulp_payload( 762 struct sdma_engine *sde, 763 u32 length, 764 struct verbs_txreq *tx) 765 { 766 struct rvt_sge_state *ss = tx->ss; 767 struct rvt_sge *sg_list = ss->sg_list; 768 struct rvt_sge sge = ss->sge; 769 u8 num_sge = ss->num_sge; 770 u32 len; 771 int ret = 0; 772 773 while (length) { 774 len = ss->sge.length; 775 if (len > length) 776 len = length; 777 if (len > ss->sge.sge_length) 778 len = ss->sge.sge_length; 779 WARN_ON_ONCE(len == 0); 780 ret = sdma_txadd_kvaddr( 781 sde->dd, 782 &tx->txreq, 783 ss->sge.vaddr, 784 len); 785 if (ret) 786 goto bail_txadd; 787 rvt_update_sge(ss, len, false); 788 length -= len; 789 } 790 return ret; 791 bail_txadd: 792 /* unwind cursor */ 793 ss->sge = sge; 794 ss->num_sge = num_sge; 795 ss->sg_list = sg_list; 796 return ret; 797 } 798 799 /** 800 * update_tx_opstats - record stats by opcode 801 * @qp; the qp 802 * @ps: transmit packet state 803 * @plen: the plen in dwords 804 * 805 * This is a routine to record the tx opstats after a 806 * packet has been presented to the egress mechanism. 807 */ 808 static void update_tx_opstats(struct rvt_qp *qp, struct hfi1_pkt_state *ps, 809 u32 plen) 810 { 811 #ifdef CONFIG_DEBUG_FS 812 struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); 813 struct hfi1_opcode_stats_perctx *s = get_cpu_ptr(dd->tx_opstats); 814 815 inc_opstats(plen * 4, &s->stats[ps->opcode]); 816 put_cpu_ptr(s); 817 #endif 818 } 819 820 /* 821 * Build the number of DMA descriptors needed to send length bytes of data. 822 * 823 * NOTE: DMA mapping is held in the tx until completed in the ring or 824 * the tx desc is freed without having been submitted to the ring 825 * 826 * This routine ensures all the helper routine calls succeed. 827 */ 828 /* New API */ 829 static int build_verbs_tx_desc( 830 struct sdma_engine *sde, 831 u32 length, 832 struct verbs_txreq *tx, 833 struct hfi1_ahg_info *ahg_info, 834 u64 pbc) 835 { 836 int ret = 0; 837 struct hfi1_sdma_header *phdr = &tx->phdr; 838 u16 hdrbytes = (tx->hdr_dwords + sizeof(pbc) / 4) << 2; 839 u8 extra_bytes = 0; 840 841 if (tx->phdr.hdr.hdr_type) { 842 /* 843 * hdrbytes accounts for PBC. Need to subtract 8 bytes 844 * before calculating padding. 845 */ 846 extra_bytes = hfi1_get_16b_padding(hdrbytes - 8, length) + 847 (SIZE_OF_CRC << 2) + SIZE_OF_LT; 848 } 849 if (!ahg_info->ahgcount) { 850 ret = sdma_txinit_ahg( 851 &tx->txreq, 852 ahg_info->tx_flags, 853 hdrbytes + length + 854 extra_bytes, 855 ahg_info->ahgidx, 856 0, 857 NULL, 858 0, 859 verbs_sdma_complete); 860 if (ret) 861 goto bail_txadd; 862 phdr->pbc = cpu_to_le64(pbc); 863 ret = sdma_txadd_kvaddr( 864 sde->dd, 865 &tx->txreq, 866 phdr, 867 hdrbytes); 868 if (ret) 869 goto bail_txadd; 870 } else { 871 ret = sdma_txinit_ahg( 872 &tx->txreq, 873 ahg_info->tx_flags, 874 length, 875 ahg_info->ahgidx, 876 ahg_info->ahgcount, 877 ahg_info->ahgdesc, 878 hdrbytes, 879 verbs_sdma_complete); 880 if (ret) 881 goto bail_txadd; 882 } 883 /* add the ulp payload - if any. tx->ss can be NULL for acks */ 884 if (tx->ss) { 885 ret = build_verbs_ulp_payload(sde, length, tx); 886 if (ret) 887 goto bail_txadd; 888 } 889 890 /* add icrc, lt byte, and padding to flit */ 891 if (extra_bytes) 892 ret = sdma_txadd_kvaddr(sde->dd, &tx->txreq, 893 (void *)trail_buf, extra_bytes); 894 895 bail_txadd: 896 return ret; 897 } 898 899 int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, 900 u64 pbc) 901 { 902 struct hfi1_qp_priv *priv = qp->priv; 903 struct hfi1_ahg_info *ahg_info = priv->s_ahg; 904 u32 hdrwords = ps->s_txreq->hdr_dwords; 905 u32 len = ps->s_txreq->s_cur_size; 906 u32 plen; 907 struct hfi1_ibdev *dev = ps->dev; 908 struct hfi1_pportdata *ppd = ps->ppd; 909 struct verbs_txreq *tx; 910 u8 sc5 = priv->s_sc; 911 int ret; 912 u32 dwords; 913 914 if (ps->s_txreq->phdr.hdr.hdr_type) { 915 u8 extra_bytes = hfi1_get_16b_padding((hdrwords << 2), len); 916 917 dwords = (len + extra_bytes + (SIZE_OF_CRC << 2) + 918 SIZE_OF_LT) >> 2; 919 } else { 920 dwords = (len + 3) >> 2; 921 } 922 plen = hdrwords + dwords + sizeof(pbc) / 4; 923 924 tx = ps->s_txreq; 925 if (!sdma_txreq_built(&tx->txreq)) { 926 if (likely(pbc == 0)) { 927 u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); 928 929 /* No vl15 here */ 930 /* set PBC_DC_INFO bit (aka SC[4]) in pbc */ 931 if (ps->s_txreq->phdr.hdr.hdr_type) 932 pbc |= PBC_PACKET_BYPASS | 933 PBC_INSERT_BYPASS_ICRC; 934 else 935 pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); 936 937 if (unlikely(hfi1_dbg_fault_opcode(qp, ps->opcode, 938 false))) 939 pbc = hfi1_fault_tx(qp, ps->opcode, pbc); 940 pbc = create_pbc(ppd, 941 pbc, 942 qp->srate_mbps, 943 vl, 944 plen); 945 } 946 tx->wqe = qp->s_wqe; 947 ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc); 948 if (unlikely(ret)) 949 goto bail_build; 950 } 951 ret = sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq, 952 ps->pkts_sent); 953 if (unlikely(ret < 0)) { 954 if (ret == -ECOMM) 955 goto bail_ecomm; 956 return ret; 957 } 958 959 update_tx_opstats(qp, ps, plen); 960 trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device), 961 &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5)); 962 return ret; 963 964 bail_ecomm: 965 /* The current one got "sent" */ 966 return 0; 967 bail_build: 968 ret = wait_kmem(dev, qp, ps); 969 if (!ret) { 970 /* free txreq - bad state */ 971 hfi1_put_txreq(ps->s_txreq); 972 ps->s_txreq = NULL; 973 } 974 return ret; 975 } 976 977 /* 978 * If we are now in the error state, return zero to flush the 979 * send work request. 980 */ 981 static int pio_wait(struct rvt_qp *qp, 982 struct send_context *sc, 983 struct hfi1_pkt_state *ps, 984 u32 flag) 985 { 986 struct hfi1_qp_priv *priv = qp->priv; 987 struct hfi1_devdata *dd = sc->dd; 988 struct hfi1_ibdev *dev = &dd->verbs_dev; 989 unsigned long flags; 990 int ret = 0; 991 992 /* 993 * Note that as soon as want_buffer() is called and 994 * possibly before it returns, sc_piobufavail() 995 * could be called. Therefore, put QP on the I/O wait list before 996 * enabling the PIO avail interrupt. 997 */ 998 spin_lock_irqsave(&qp->s_lock, flags); 999 if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { 1000 write_seqlock(&dev->iowait_lock); 1001 list_add_tail(&ps->s_txreq->txreq.list, 1002 &priv->s_iowait.tx_head); 1003 if (list_empty(&priv->s_iowait.list)) { 1004 struct hfi1_ibdev *dev = &dd->verbs_dev; 1005 int was_empty; 1006 1007 dev->n_piowait += !!(flag & RVT_S_WAIT_PIO); 1008 dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN); 1009 qp->s_flags |= flag; 1010 was_empty = list_empty(&sc->piowait); 1011 iowait_queue(ps->pkts_sent, &priv->s_iowait, 1012 &sc->piowait); 1013 priv->s_iowait.lock = &dev->iowait_lock; 1014 trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO); 1015 rvt_get_qp(qp); 1016 /* counting: only call wantpiobuf_intr if first user */ 1017 if (was_empty) 1018 hfi1_sc_wantpiobuf_intr(sc, 1); 1019 } 1020 write_sequnlock(&dev->iowait_lock); 1021 qp->s_flags &= ~RVT_S_BUSY; 1022 ret = -EBUSY; 1023 } 1024 spin_unlock_irqrestore(&qp->s_lock, flags); 1025 return ret; 1026 } 1027 1028 static void verbs_pio_complete(void *arg, int code) 1029 { 1030 struct rvt_qp *qp = (struct rvt_qp *)arg; 1031 struct hfi1_qp_priv *priv = qp->priv; 1032 1033 if (iowait_pio_dec(&priv->s_iowait)) 1034 iowait_drain_wakeup(&priv->s_iowait); 1035 } 1036 1037 int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, 1038 u64 pbc) 1039 { 1040 struct hfi1_qp_priv *priv = qp->priv; 1041 u32 hdrwords = ps->s_txreq->hdr_dwords; 1042 struct rvt_sge_state *ss = ps->s_txreq->ss; 1043 u32 len = ps->s_txreq->s_cur_size; 1044 u32 dwords; 1045 u32 plen; 1046 struct hfi1_pportdata *ppd = ps->ppd; 1047 u32 *hdr; 1048 u8 sc5; 1049 unsigned long flags = 0; 1050 struct send_context *sc; 1051 struct pio_buf *pbuf; 1052 int wc_status = IB_WC_SUCCESS; 1053 int ret = 0; 1054 pio_release_cb cb = NULL; 1055 u8 extra_bytes = 0; 1056 1057 if (ps->s_txreq->phdr.hdr.hdr_type) { 1058 u8 pad_size = hfi1_get_16b_padding((hdrwords << 2), len); 1059 1060 extra_bytes = pad_size + (SIZE_OF_CRC << 2) + SIZE_OF_LT; 1061 dwords = (len + extra_bytes) >> 2; 1062 hdr = (u32 *)&ps->s_txreq->phdr.hdr.opah; 1063 } else { 1064 dwords = (len + 3) >> 2; 1065 hdr = (u32 *)&ps->s_txreq->phdr.hdr.ibh; 1066 } 1067 plen = hdrwords + dwords + sizeof(pbc) / 4; 1068 1069 /* only RC/UC use complete */ 1070 switch (qp->ibqp.qp_type) { 1071 case IB_QPT_RC: 1072 case IB_QPT_UC: 1073 cb = verbs_pio_complete; 1074 break; 1075 default: 1076 break; 1077 } 1078 1079 /* vl15 special case taken care of in ud.c */ 1080 sc5 = priv->s_sc; 1081 sc = ps->s_txreq->psc; 1082 1083 if (likely(pbc == 0)) { 1084 u8 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); 1085 1086 /* set PBC_DC_INFO bit (aka SC[4]) in pbc */ 1087 if (ps->s_txreq->phdr.hdr.hdr_type) 1088 pbc |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; 1089 else 1090 pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); 1091 if (unlikely(hfi1_dbg_fault_opcode(qp, ps->opcode, false))) 1092 pbc = hfi1_fault_tx(qp, ps->opcode, pbc); 1093 pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen); 1094 } 1095 if (cb) 1096 iowait_pio_inc(&priv->s_iowait); 1097 pbuf = sc_buffer_alloc(sc, plen, cb, qp); 1098 if (unlikely(!pbuf)) { 1099 if (cb) 1100 verbs_pio_complete(qp, 0); 1101 if (ppd->host_link_state != HLS_UP_ACTIVE) { 1102 /* 1103 * If we have filled the PIO buffers to capacity and are 1104 * not in an active state this request is not going to 1105 * go out to so just complete it with an error or else a 1106 * ULP or the core may be stuck waiting. 1107 */ 1108 hfi1_cdbg( 1109 PIO, 1110 "alloc failed. state not active, completing"); 1111 wc_status = IB_WC_GENERAL_ERR; 1112 goto pio_bail; 1113 } else { 1114 /* 1115 * This is a normal occurrence. The PIO buffs are full 1116 * up but we are still happily sending, well we could be 1117 * so lets continue to queue the request. 1118 */ 1119 hfi1_cdbg(PIO, "alloc failed. state active, queuing"); 1120 ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO); 1121 if (!ret) 1122 /* txreq not queued - free */ 1123 goto bail; 1124 /* tx consumed in wait */ 1125 return ret; 1126 } 1127 } 1128 1129 if (dwords == 0) { 1130 pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords); 1131 } else { 1132 seg_pio_copy_start(pbuf, pbc, 1133 hdr, hdrwords * 4); 1134 if (ss) { 1135 while (len) { 1136 void *addr = ss->sge.vaddr; 1137 u32 slen = ss->sge.length; 1138 1139 if (slen > len) 1140 slen = len; 1141 rvt_update_sge(ss, slen, false); 1142 seg_pio_copy_mid(pbuf, addr, slen); 1143 len -= slen; 1144 } 1145 } 1146 /* add icrc, lt byte, and padding to flit */ 1147 if (extra_bytes) 1148 seg_pio_copy_mid(pbuf, trail_buf, extra_bytes); 1149 1150 seg_pio_copy_end(pbuf); 1151 } 1152 1153 update_tx_opstats(qp, ps, plen); 1154 trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device), 1155 &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5)); 1156 1157 pio_bail: 1158 if (qp->s_wqe) { 1159 spin_lock_irqsave(&qp->s_lock, flags); 1160 hfi1_send_complete(qp, qp->s_wqe, wc_status); 1161 spin_unlock_irqrestore(&qp->s_lock, flags); 1162 } else if (qp->ibqp.qp_type == IB_QPT_RC) { 1163 spin_lock_irqsave(&qp->s_lock, flags); 1164 hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr); 1165 spin_unlock_irqrestore(&qp->s_lock, flags); 1166 } 1167 1168 ret = 0; 1169 1170 bail: 1171 hfi1_put_txreq(ps->s_txreq); 1172 return ret; 1173 } 1174 1175 /* 1176 * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent 1177 * being an entry from the partition key table), return 0 1178 * otherwise. Use the matching criteria for egress partition keys 1179 * specified in the OPAv1 spec., section 9.1l.7. 1180 */ 1181 static inline int egress_pkey_matches_entry(u16 pkey, u16 ent) 1182 { 1183 u16 mkey = pkey & PKEY_LOW_15_MASK; 1184 u16 mentry = ent & PKEY_LOW_15_MASK; 1185 1186 if (mkey == mentry) { 1187 /* 1188 * If pkey[15] is set (full partition member), 1189 * is bit 15 in the corresponding table element 1190 * clear (limited member)? 1191 */ 1192 if (pkey & PKEY_MEMBER_MASK) 1193 return !!(ent & PKEY_MEMBER_MASK); 1194 return 1; 1195 } 1196 return 0; 1197 } 1198 1199 /** 1200 * egress_pkey_check - check P_KEY of a packet 1201 * @ppd: Physical IB port data 1202 * @slid: SLID for packet 1203 * @bkey: PKEY for header 1204 * @sc5: SC for packet 1205 * @s_pkey_index: It will be used for look up optimization for kernel contexts 1206 * only. If it is negative value, then it means user contexts is calling this 1207 * function. 1208 * 1209 * It checks if hdr's pkey is valid. 1210 * 1211 * Return: 0 on success, otherwise, 1 1212 */ 1213 int egress_pkey_check(struct hfi1_pportdata *ppd, u32 slid, u16 pkey, 1214 u8 sc5, int8_t s_pkey_index) 1215 { 1216 struct hfi1_devdata *dd; 1217 int i; 1218 int is_user_ctxt_mechanism = (s_pkey_index < 0); 1219 1220 if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT)) 1221 return 0; 1222 1223 /* If SC15, pkey[0:14] must be 0x7fff */ 1224 if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) 1225 goto bad; 1226 1227 /* Is the pkey = 0x0, or 0x8000? */ 1228 if ((pkey & PKEY_LOW_15_MASK) == 0) 1229 goto bad; 1230 1231 /* 1232 * For the kernel contexts only, if a qp is passed into the function, 1233 * the most likely matching pkey has index qp->s_pkey_index 1234 */ 1235 if (!is_user_ctxt_mechanism && 1236 egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) { 1237 return 0; 1238 } 1239 1240 for (i = 0; i < MAX_PKEY_VALUES; i++) { 1241 if (egress_pkey_matches_entry(pkey, ppd->pkeys[i])) 1242 return 0; 1243 } 1244 bad: 1245 /* 1246 * For the user-context mechanism, the P_KEY check would only happen 1247 * once per SDMA request, not once per packet. Therefore, there's no 1248 * need to increment the counter for the user-context mechanism. 1249 */ 1250 if (!is_user_ctxt_mechanism) { 1251 incr_cntr64(&ppd->port_xmit_constraint_errors); 1252 dd = ppd->dd; 1253 if (!(dd->err_info_xmit_constraint.status & 1254 OPA_EI_STATUS_SMASK)) { 1255 dd->err_info_xmit_constraint.status |= 1256 OPA_EI_STATUS_SMASK; 1257 dd->err_info_xmit_constraint.slid = slid; 1258 dd->err_info_xmit_constraint.pkey = pkey; 1259 } 1260 } 1261 return 1; 1262 } 1263 1264 /** 1265 * get_send_routine - choose an egress routine 1266 * 1267 * Choose an egress routine based on QP type 1268 * and size 1269 */ 1270 static inline send_routine get_send_routine(struct rvt_qp *qp, 1271 struct hfi1_pkt_state *ps) 1272 { 1273 struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); 1274 struct hfi1_qp_priv *priv = qp->priv; 1275 struct verbs_txreq *tx = ps->s_txreq; 1276 1277 if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA))) 1278 return dd->process_pio_send; 1279 switch (qp->ibqp.qp_type) { 1280 case IB_QPT_SMI: 1281 return dd->process_pio_send; 1282 case IB_QPT_GSI: 1283 case IB_QPT_UD: 1284 break; 1285 case IB_QPT_UC: 1286 case IB_QPT_RC: { 1287 if (piothreshold && 1288 tx->s_cur_size <= min(piothreshold, qp->pmtu) && 1289 (BIT(ps->opcode & OPMASK) & pio_opmask[ps->opcode >> 5]) && 1290 iowait_sdma_pending(&priv->s_iowait) == 0 && 1291 !sdma_txreq_built(&tx->txreq)) 1292 return dd->process_pio_send; 1293 break; 1294 } 1295 default: 1296 break; 1297 } 1298 return dd->process_dma_send; 1299 } 1300 1301 /** 1302 * hfi1_verbs_send - send a packet 1303 * @qp: the QP to send on 1304 * @ps: the state of the packet to send 1305 * 1306 * Return zero if packet is sent or queued OK. 1307 * Return non-zero and clear qp->s_flags RVT_S_BUSY otherwise. 1308 */ 1309 int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) 1310 { 1311 struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); 1312 struct hfi1_qp_priv *priv = qp->priv; 1313 struct ib_other_headers *ohdr; 1314 send_routine sr; 1315 int ret; 1316 u16 pkey; 1317 u32 slid; 1318 1319 /* locate the pkey within the headers */ 1320 if (ps->s_txreq->phdr.hdr.hdr_type) { 1321 struct hfi1_16b_header *hdr = &ps->s_txreq->phdr.hdr.opah; 1322 u8 l4 = hfi1_16B_get_l4(hdr); 1323 1324 if (l4 == OPA_16B_L4_IB_GLOBAL) 1325 ohdr = &hdr->u.l.oth; 1326 else 1327 ohdr = &hdr->u.oth; 1328 slid = hfi1_16B_get_slid(hdr); 1329 pkey = hfi1_16B_get_pkey(hdr); 1330 } else { 1331 struct ib_header *hdr = &ps->s_txreq->phdr.hdr.ibh; 1332 u8 lnh = ib_get_lnh(hdr); 1333 1334 if (lnh == HFI1_LRH_GRH) 1335 ohdr = &hdr->u.l.oth; 1336 else 1337 ohdr = &hdr->u.oth; 1338 slid = ib_get_slid(hdr); 1339 pkey = ib_bth_get_pkey(ohdr); 1340 } 1341 1342 ps->opcode = ib_bth_get_opcode(ohdr); 1343 sr = get_send_routine(qp, ps); 1344 ret = egress_pkey_check(dd->pport, slid, pkey, 1345 priv->s_sc, qp->s_pkey_index); 1346 if (unlikely(ret)) { 1347 /* 1348 * The value we are returning here does not get propagated to 1349 * the verbs caller. Thus we need to complete the request with 1350 * error otherwise the caller could be sitting waiting on the 1351 * completion event. Only do this for PIO. SDMA has its own 1352 * mechanism for handling the errors. So for SDMA we can just 1353 * return. 1354 */ 1355 if (sr == dd->process_pio_send) { 1356 unsigned long flags; 1357 1358 hfi1_cdbg(PIO, "%s() Failed. Completing with err", 1359 __func__); 1360 spin_lock_irqsave(&qp->s_lock, flags); 1361 hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); 1362 spin_unlock_irqrestore(&qp->s_lock, flags); 1363 } 1364 return -EINVAL; 1365 } 1366 if (sr == dd->process_dma_send && iowait_pio_pending(&priv->s_iowait)) 1367 return pio_wait(qp, 1368 ps->s_txreq->psc, 1369 ps, 1370 RVT_S_WAIT_PIO_DRAIN); 1371 return sr(qp, ps, 0); 1372 } 1373 1374 /** 1375 * hfi1_fill_device_attr - Fill in rvt dev info device attributes. 1376 * @dd: the device data structure 1377 */ 1378 static void hfi1_fill_device_attr(struct hfi1_devdata *dd) 1379 { 1380 struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; 1381 u32 ver = dd->dc8051_ver; 1382 1383 memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props)); 1384 1385 rdi->dparms.props.fw_ver = ((u64)(dc8051_ver_maj(ver)) << 32) | 1386 ((u64)(dc8051_ver_min(ver)) << 16) | 1387 (u64)dc8051_ver_patch(ver); 1388 1389 rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | 1390 IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | 1391 IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | 1392 IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE | 1393 IB_DEVICE_MEM_MGT_EXTENSIONS | 1394 IB_DEVICE_RDMA_NETDEV_OPA_VNIC; 1395 rdi->dparms.props.page_size_cap = PAGE_SIZE; 1396 rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3; 1397 rdi->dparms.props.vendor_part_id = dd->pcidev->device; 1398 rdi->dparms.props.hw_ver = dd->minrev; 1399 rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid; 1400 rdi->dparms.props.max_mr_size = U64_MAX; 1401 rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX; 1402 rdi->dparms.props.max_qp = hfi1_max_qps; 1403 rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs; 1404 rdi->dparms.props.max_sge = hfi1_max_sges; 1405 rdi->dparms.props.max_sge_rd = hfi1_max_sges; 1406 rdi->dparms.props.max_cq = hfi1_max_cqs; 1407 rdi->dparms.props.max_ah = hfi1_max_ahs; 1408 rdi->dparms.props.max_cqe = hfi1_max_cqes; 1409 rdi->dparms.props.max_mr = rdi->lkey_table.max; 1410 rdi->dparms.props.max_fmr = rdi->lkey_table.max; 1411 rdi->dparms.props.max_map_per_fmr = 32767; 1412 rdi->dparms.props.max_pd = hfi1_max_pds; 1413 rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC; 1414 rdi->dparms.props.max_qp_init_rd_atom = 255; 1415 rdi->dparms.props.max_srq = hfi1_max_srqs; 1416 rdi->dparms.props.max_srq_wr = hfi1_max_srq_wrs; 1417 rdi->dparms.props.max_srq_sge = hfi1_max_srq_sges; 1418 rdi->dparms.props.atomic_cap = IB_ATOMIC_GLOB; 1419 rdi->dparms.props.max_pkeys = hfi1_get_npkeys(dd); 1420 rdi->dparms.props.max_mcast_grp = hfi1_max_mcast_grps; 1421 rdi->dparms.props.max_mcast_qp_attach = hfi1_max_mcast_qp_attached; 1422 rdi->dparms.props.max_total_mcast_qp_attach = 1423 rdi->dparms.props.max_mcast_qp_attach * 1424 rdi->dparms.props.max_mcast_grp; 1425 } 1426 1427 static inline u16 opa_speed_to_ib(u16 in) 1428 { 1429 u16 out = 0; 1430 1431 if (in & OPA_LINK_SPEED_25G) 1432 out |= IB_SPEED_EDR; 1433 if (in & OPA_LINK_SPEED_12_5G) 1434 out |= IB_SPEED_FDR; 1435 1436 return out; 1437 } 1438 1439 /* 1440 * Convert a single OPA link width (no multiple flags) to an IB value. 1441 * A zero OPA link width means link down, which means the IB width value 1442 * is a don't care. 1443 */ 1444 static inline u16 opa_width_to_ib(u16 in) 1445 { 1446 switch (in) { 1447 case OPA_LINK_WIDTH_1X: 1448 /* map 2x and 3x to 1x as they don't exist in IB */ 1449 case OPA_LINK_WIDTH_2X: 1450 case OPA_LINK_WIDTH_3X: 1451 return IB_WIDTH_1X; 1452 default: /* link down or unknown, return our largest width */ 1453 case OPA_LINK_WIDTH_4X: 1454 return IB_WIDTH_4X; 1455 } 1456 } 1457 1458 static int query_port(struct rvt_dev_info *rdi, u8 port_num, 1459 struct ib_port_attr *props) 1460 { 1461 struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); 1462 struct hfi1_devdata *dd = dd_from_dev(verbs_dev); 1463 struct hfi1_pportdata *ppd = &dd->pport[port_num - 1]; 1464 u32 lid = ppd->lid; 1465 1466 /* props being zeroed by the caller, avoid zeroing it here */ 1467 props->lid = lid ? lid : 0; 1468 props->lmc = ppd->lmc; 1469 /* OPA logical states match IB logical states */ 1470 props->state = driver_lstate(ppd); 1471 props->phys_state = driver_pstate(ppd); 1472 props->gid_tbl_len = HFI1_GUIDS_PER_PORT; 1473 props->active_width = (u8)opa_width_to_ib(ppd->link_width_active); 1474 /* see rate_show() in ib core/sysfs.c */ 1475 props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active); 1476 props->max_vl_num = ppd->vls_supported; 1477 1478 /* Once we are a "first class" citizen and have added the OPA MTUs to 1479 * the core we can advertise the larger MTU enum to the ULPs, for now 1480 * advertise only 4K. 1481 * 1482 * Those applications which are either OPA aware or pass the MTU enum 1483 * from the Path Records to us will get the new 8k MTU. Those that 1484 * attempt to process the MTU enum may fail in various ways. 1485 */ 1486 props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ? 1487 4096 : hfi1_max_mtu), IB_MTU_4096); 1488 props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu : 1489 mtu_to_enum(ppd->ibmtu, IB_MTU_4096); 1490 1491 /* 1492 * sm_lid of 0xFFFF needs special handling so that it can 1493 * be differentiated from a permissve LID of 0xFFFF. 1494 * We set the grh_required flag here so the SA can program 1495 * the DGID in the address handle appropriately 1496 */ 1497 if (props->sm_lid == be16_to_cpu(IB_LID_PERMISSIVE)) 1498 props->grh_required = true; 1499 1500 return 0; 1501 } 1502 1503 static int modify_device(struct ib_device *device, 1504 int device_modify_mask, 1505 struct ib_device_modify *device_modify) 1506 { 1507 struct hfi1_devdata *dd = dd_from_ibdev(device); 1508 unsigned i; 1509 int ret; 1510 1511 if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | 1512 IB_DEVICE_MODIFY_NODE_DESC)) { 1513 ret = -EOPNOTSUPP; 1514 goto bail; 1515 } 1516 1517 if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) { 1518 memcpy(device->node_desc, device_modify->node_desc, 1519 IB_DEVICE_NODE_DESC_MAX); 1520 for (i = 0; i < dd->num_pports; i++) { 1521 struct hfi1_ibport *ibp = &dd->pport[i].ibport_data; 1522 1523 hfi1_node_desc_chg(ibp); 1524 } 1525 } 1526 1527 if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) { 1528 ib_hfi1_sys_image_guid = 1529 cpu_to_be64(device_modify->sys_image_guid); 1530 for (i = 0; i < dd->num_pports; i++) { 1531 struct hfi1_ibport *ibp = &dd->pport[i].ibport_data; 1532 1533 hfi1_sys_guid_chg(ibp); 1534 } 1535 } 1536 1537 ret = 0; 1538 1539 bail: 1540 return ret; 1541 } 1542 1543 static int shut_down_port(struct rvt_dev_info *rdi, u8 port_num) 1544 { 1545 struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); 1546 struct hfi1_devdata *dd = dd_from_dev(verbs_dev); 1547 struct hfi1_pportdata *ppd = &dd->pport[port_num - 1]; 1548 int ret; 1549 1550 set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0, 1551 OPA_LINKDOWN_REASON_UNKNOWN); 1552 ret = set_link_state(ppd, HLS_DN_DOWNDEF); 1553 return ret; 1554 } 1555 1556 static int hfi1_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp, 1557 int guid_index, __be64 *guid) 1558 { 1559 struct hfi1_ibport *ibp = container_of(rvp, struct hfi1_ibport, rvp); 1560 1561 if (guid_index >= HFI1_GUIDS_PER_PORT) 1562 return -EINVAL; 1563 1564 *guid = get_sguid(ibp, guid_index); 1565 return 0; 1566 } 1567 1568 /* 1569 * convert ah port,sl to sc 1570 */ 1571 u8 ah_to_sc(struct ib_device *ibdev, struct rdma_ah_attr *ah) 1572 { 1573 struct hfi1_ibport *ibp = to_iport(ibdev, rdma_ah_get_port_num(ah)); 1574 1575 return ibp->sl_to_sc[rdma_ah_get_sl(ah)]; 1576 } 1577 1578 static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr) 1579 { 1580 struct hfi1_ibport *ibp; 1581 struct hfi1_pportdata *ppd; 1582 struct hfi1_devdata *dd; 1583 u8 sc5; 1584 1585 if (hfi1_check_mcast(rdma_ah_get_dlid(ah_attr)) && 1586 !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) 1587 return -EINVAL; 1588 1589 /* test the mapping for validity */ 1590 ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr)); 1591 ppd = ppd_from_ibp(ibp); 1592 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; 1593 dd = dd_from_ppd(ppd); 1594 if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf) 1595 return -EINVAL; 1596 return 0; 1597 } 1598 1599 static void hfi1_notify_new_ah(struct ib_device *ibdev, 1600 struct rdma_ah_attr *ah_attr, 1601 struct rvt_ah *ah) 1602 { 1603 struct hfi1_ibport *ibp; 1604 struct hfi1_pportdata *ppd; 1605 struct hfi1_devdata *dd; 1606 u8 sc5; 1607 struct rdma_ah_attr *attr = &ah->attr; 1608 1609 /* 1610 * Do not trust reading anything from rvt_ah at this point as it is not 1611 * done being setup. We can however modify things which we need to set. 1612 */ 1613 1614 ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr)); 1615 ppd = ppd_from_ibp(ibp); 1616 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)]; 1617 hfi1_update_ah_attr(ibdev, attr); 1618 hfi1_make_opa_lid(attr); 1619 dd = dd_from_ppd(ppd); 1620 ah->vl = sc_to_vlt(dd, sc5); 1621 if (ah->vl < num_vls || ah->vl == 15) 1622 ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu); 1623 } 1624 1625 /** 1626 * hfi1_get_npkeys - return the size of the PKEY table for context 0 1627 * @dd: the hfi1_ib device 1628 */ 1629 unsigned hfi1_get_npkeys(struct hfi1_devdata *dd) 1630 { 1631 return ARRAY_SIZE(dd->pport[0].pkeys); 1632 } 1633 1634 static void init_ibport(struct hfi1_pportdata *ppd) 1635 { 1636 struct hfi1_ibport *ibp = &ppd->ibport_data; 1637 size_t sz = ARRAY_SIZE(ibp->sl_to_sc); 1638 int i; 1639 1640 for (i = 0; i < sz; i++) { 1641 ibp->sl_to_sc[i] = i; 1642 ibp->sc_to_sl[i] = i; 1643 } 1644 1645 for (i = 0; i < RVT_MAX_TRAP_LISTS ; i++) 1646 INIT_LIST_HEAD(&ibp->rvp.trap_lists[i].list); 1647 timer_setup(&ibp->rvp.trap_timer, hfi1_handle_trap_timer, 0); 1648 1649 spin_lock_init(&ibp->rvp.lock); 1650 /* Set the prefix to the default value (see ch. 4.1.1) */ 1651 ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX; 1652 ibp->rvp.sm_lid = 0; 1653 /* 1654 * Below should only set bits defined in OPA PortInfo.CapabilityMask 1655 * and PortInfo.CapabilityMask3 1656 */ 1657 ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP | 1658 IB_PORT_CAP_MASK_NOTICE_SUP; 1659 ibp->rvp.port_cap3_flags = OPA_CAP_MASK3_IsSharedSpaceSupported; 1660 ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; 1661 ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; 1662 ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; 1663 ibp->rvp.pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; 1664 ibp->rvp.pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT; 1665 1666 RCU_INIT_POINTER(ibp->rvp.qp[0], NULL); 1667 RCU_INIT_POINTER(ibp->rvp.qp[1], NULL); 1668 } 1669 1670 static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str) 1671 { 1672 struct rvt_dev_info *rdi = ib_to_rvt(ibdev); 1673 struct hfi1_ibdev *dev = dev_from_rdi(rdi); 1674 u32 ver = dd_from_dev(dev)->dc8051_ver; 1675 1676 snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%u", dc8051_ver_maj(ver), 1677 dc8051_ver_min(ver), dc8051_ver_patch(ver)); 1678 } 1679 1680 static const char * const driver_cntr_names[] = { 1681 /* must be element 0*/ 1682 "DRIVER_KernIntr", 1683 "DRIVER_ErrorIntr", 1684 "DRIVER_Tx_Errs", 1685 "DRIVER_Rcv_Errs", 1686 "DRIVER_HW_Errs", 1687 "DRIVER_NoPIOBufs", 1688 "DRIVER_CtxtsOpen", 1689 "DRIVER_RcvLen_Errs", 1690 "DRIVER_EgrBufFull", 1691 "DRIVER_EgrHdrFull" 1692 }; 1693 1694 static DEFINE_MUTEX(cntr_names_lock); /* protects the *_cntr_names bufers */ 1695 static const char **dev_cntr_names; 1696 static const char **port_cntr_names; 1697 static int num_driver_cntrs = ARRAY_SIZE(driver_cntr_names); 1698 static int num_dev_cntrs; 1699 static int num_port_cntrs; 1700 static int cntr_names_initialized; 1701 1702 /* 1703 * Convert a list of names separated by '\n' into an array of NULL terminated 1704 * strings. Optionally some entries can be reserved in the array to hold extra 1705 * external strings. 1706 */ 1707 static int init_cntr_names(const char *names_in, 1708 const size_t names_len, 1709 int num_extra_names, 1710 int *num_cntrs, 1711 const char ***cntr_names) 1712 { 1713 char *names_out, *p, **q; 1714 int i, n; 1715 1716 n = 0; 1717 for (i = 0; i < names_len; i++) 1718 if (names_in[i] == '\n') 1719 n++; 1720 1721 names_out = kmalloc((n + num_extra_names) * sizeof(char *) + names_len, 1722 GFP_KERNEL); 1723 if (!names_out) { 1724 *num_cntrs = 0; 1725 *cntr_names = NULL; 1726 return -ENOMEM; 1727 } 1728 1729 p = names_out + (n + num_extra_names) * sizeof(char *); 1730 memcpy(p, names_in, names_len); 1731 1732 q = (char **)names_out; 1733 for (i = 0; i < n; i++) { 1734 q[i] = p; 1735 p = strchr(p, '\n'); 1736 *p++ = '\0'; 1737 } 1738 1739 *num_cntrs = n; 1740 *cntr_names = (const char **)names_out; 1741 return 0; 1742 } 1743 1744 static struct rdma_hw_stats *alloc_hw_stats(struct ib_device *ibdev, 1745 u8 port_num) 1746 { 1747 int i, err; 1748 1749 mutex_lock(&cntr_names_lock); 1750 if (!cntr_names_initialized) { 1751 struct hfi1_devdata *dd = dd_from_ibdev(ibdev); 1752 1753 err = init_cntr_names(dd->cntrnames, 1754 dd->cntrnameslen, 1755 num_driver_cntrs, 1756 &num_dev_cntrs, 1757 &dev_cntr_names); 1758 if (err) { 1759 mutex_unlock(&cntr_names_lock); 1760 return NULL; 1761 } 1762 1763 for (i = 0; i < num_driver_cntrs; i++) 1764 dev_cntr_names[num_dev_cntrs + i] = 1765 driver_cntr_names[i]; 1766 1767 err = init_cntr_names(dd->portcntrnames, 1768 dd->portcntrnameslen, 1769 0, 1770 &num_port_cntrs, 1771 &port_cntr_names); 1772 if (err) { 1773 kfree(dev_cntr_names); 1774 dev_cntr_names = NULL; 1775 mutex_unlock(&cntr_names_lock); 1776 return NULL; 1777 } 1778 cntr_names_initialized = 1; 1779 } 1780 mutex_unlock(&cntr_names_lock); 1781 1782 if (!port_num) 1783 return rdma_alloc_hw_stats_struct( 1784 dev_cntr_names, 1785 num_dev_cntrs + num_driver_cntrs, 1786 RDMA_HW_STATS_DEFAULT_LIFESPAN); 1787 else 1788 return rdma_alloc_hw_stats_struct( 1789 port_cntr_names, 1790 num_port_cntrs, 1791 RDMA_HW_STATS_DEFAULT_LIFESPAN); 1792 } 1793 1794 static u64 hfi1_sps_ints(void) 1795 { 1796 unsigned long flags; 1797 struct hfi1_devdata *dd; 1798 u64 sps_ints = 0; 1799 1800 spin_lock_irqsave(&hfi1_devs_lock, flags); 1801 list_for_each_entry(dd, &hfi1_dev_list, list) { 1802 sps_ints += get_all_cpu_total(dd->int_counter); 1803 } 1804 spin_unlock_irqrestore(&hfi1_devs_lock, flags); 1805 return sps_ints; 1806 } 1807 1808 static int get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, 1809 u8 port, int index) 1810 { 1811 u64 *values; 1812 int count; 1813 1814 if (!port) { 1815 u64 *stats = (u64 *)&hfi1_stats; 1816 int i; 1817 1818 hfi1_read_cntrs(dd_from_ibdev(ibdev), NULL, &values); 1819 values[num_dev_cntrs] = hfi1_sps_ints(); 1820 for (i = 1; i < num_driver_cntrs; i++) 1821 values[num_dev_cntrs + i] = stats[i]; 1822 count = num_dev_cntrs + num_driver_cntrs; 1823 } else { 1824 struct hfi1_ibport *ibp = to_iport(ibdev, port); 1825 1826 hfi1_read_portcntrs(ppd_from_ibp(ibp), NULL, &values); 1827 count = num_port_cntrs; 1828 } 1829 1830 memcpy(stats->value, values, count * sizeof(u64)); 1831 return count; 1832 } 1833 1834 /** 1835 * hfi1_register_ib_device - register our device with the infiniband core 1836 * @dd: the device data structure 1837 * Return 0 if successful, errno if unsuccessful. 1838 */ 1839 int hfi1_register_ib_device(struct hfi1_devdata *dd) 1840 { 1841 struct hfi1_ibdev *dev = &dd->verbs_dev; 1842 struct ib_device *ibdev = &dev->rdi.ibdev; 1843 struct hfi1_pportdata *ppd = dd->pport; 1844 struct hfi1_ibport *ibp = &ppd->ibport_data; 1845 unsigned i; 1846 int ret; 1847 1848 for (i = 0; i < dd->num_pports; i++) 1849 init_ibport(ppd + i); 1850 1851 /* Only need to initialize non-zero fields. */ 1852 1853 timer_setup(&dev->mem_timer, mem_timer, 0); 1854 1855 seqlock_init(&dev->iowait_lock); 1856 seqlock_init(&dev->txwait_lock); 1857 INIT_LIST_HEAD(&dev->txwait); 1858 INIT_LIST_HEAD(&dev->memwait); 1859 1860 ret = verbs_txreq_init(dev); 1861 if (ret) 1862 goto err_verbs_txreq; 1863 1864 /* Use first-port GUID as node guid */ 1865 ibdev->node_guid = get_sguid(ibp, HFI1_PORT_GUID_INDEX); 1866 1867 /* 1868 * The system image GUID is supposed to be the same for all 1869 * HFIs in a single system but since there can be other 1870 * device types in the system, we can't be sure this is unique. 1871 */ 1872 if (!ib_hfi1_sys_image_guid) 1873 ib_hfi1_sys_image_guid = ibdev->node_guid; 1874 ibdev->owner = THIS_MODULE; 1875 ibdev->phys_port_cnt = dd->num_pports; 1876 ibdev->dev.parent = &dd->pcidev->dev; 1877 ibdev->modify_device = modify_device; 1878 ibdev->alloc_hw_stats = alloc_hw_stats; 1879 ibdev->get_hw_stats = get_hw_stats; 1880 ibdev->alloc_rdma_netdev = hfi1_vnic_alloc_rn; 1881 1882 /* keep process mad in the driver */ 1883 ibdev->process_mad = hfi1_process_mad; 1884 ibdev->get_dev_fw_str = hfi1_get_dev_fw_str; 1885 1886 strncpy(ibdev->node_desc, init_utsname()->nodename, 1887 sizeof(ibdev->node_desc)); 1888 1889 /* 1890 * Fill in rvt info object. 1891 */ 1892 dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files; 1893 dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev; 1894 dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah; 1895 dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah; 1896 dd->verbs_dev.rdi.driver_f.get_guid_be = hfi1_get_guid_be; 1897 dd->verbs_dev.rdi.driver_f.query_port_state = query_port; 1898 dd->verbs_dev.rdi.driver_f.shut_down_port = shut_down_port; 1899 dd->verbs_dev.rdi.driver_f.cap_mask_chg = hfi1_cap_mask_chg; 1900 /* 1901 * Fill in rvt info device attributes. 1902 */ 1903 hfi1_fill_device_attr(dd); 1904 1905 /* queue pair */ 1906 dd->verbs_dev.rdi.dparms.qp_table_size = hfi1_qp_table_size; 1907 dd->verbs_dev.rdi.dparms.qpn_start = 0; 1908 dd->verbs_dev.rdi.dparms.qpn_inc = 1; 1909 dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift; 1910 dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16; 1911 dd->verbs_dev.rdi.dparms.qpn_res_end = 1912 dd->verbs_dev.rdi.dparms.qpn_res_start + 65535; 1913 dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC; 1914 dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK; 1915 dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT; 1916 dd->verbs_dev.rdi.dparms.psn_modify_mask = PSN_MODIFY_MASK; 1917 dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_INTEL_OPA | 1918 RDMA_CORE_CAP_OPA_AH; 1919 dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE; 1920 1921 dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc; 1922 dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free; 1923 dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps; 1924 dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset; 1925 dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send_from_rvt; 1926 dd->verbs_dev.rdi.driver_f.schedule_send = hfi1_schedule_send; 1927 dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _hfi1_schedule_send; 1928 dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = get_pmtu_from_attr; 1929 dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp; 1930 dd->verbs_dev.rdi.driver_f.flush_qp_waiters = flush_qp_waiters; 1931 dd->verbs_dev.rdi.driver_f.stop_send_queue = stop_send_queue; 1932 dd->verbs_dev.rdi.driver_f.quiesce_qp = quiesce_qp; 1933 dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp; 1934 dd->verbs_dev.rdi.driver_f.mtu_from_qp = mtu_from_qp; 1935 dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = mtu_to_path_mtu; 1936 dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp; 1937 dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp; 1938 dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc; 1939 dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe; 1940 1941 /* completeion queue */ 1942 snprintf(dd->verbs_dev.rdi.dparms.cq_name, 1943 sizeof(dd->verbs_dev.rdi.dparms.cq_name), 1944 "hfi1_cq%d", dd->unit); 1945 dd->verbs_dev.rdi.dparms.node = dd->node; 1946 1947 /* misc settings */ 1948 dd->verbs_dev.rdi.flags = 0; /* Let rdmavt handle it all */ 1949 dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size; 1950 dd->verbs_dev.rdi.dparms.nports = dd->num_pports; 1951 dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd); 1952 1953 /* post send table */ 1954 dd->verbs_dev.rdi.post_parms = hfi1_post_parms; 1955 1956 ppd = dd->pport; 1957 for (i = 0; i < dd->num_pports; i++, ppd++) 1958 rvt_init_port(&dd->verbs_dev.rdi, 1959 &ppd->ibport_data.rvp, 1960 i, 1961 ppd->pkeys); 1962 1963 ret = rvt_register_device(&dd->verbs_dev.rdi); 1964 if (ret) 1965 goto err_verbs_txreq; 1966 1967 ret = hfi1_verbs_register_sysfs(dd); 1968 if (ret) 1969 goto err_class; 1970 1971 return ret; 1972 1973 err_class: 1974 rvt_unregister_device(&dd->verbs_dev.rdi); 1975 err_verbs_txreq: 1976 verbs_txreq_exit(dev); 1977 dd_dev_err(dd, "cannot register verbs: %d!\n", -ret); 1978 return ret; 1979 } 1980 1981 void hfi1_unregister_ib_device(struct hfi1_devdata *dd) 1982 { 1983 struct hfi1_ibdev *dev = &dd->verbs_dev; 1984 1985 hfi1_verbs_unregister_sysfs(dd); 1986 1987 rvt_unregister_device(&dd->verbs_dev.rdi); 1988 1989 if (!list_empty(&dev->txwait)) 1990 dd_dev_err(dd, "txwait list not empty!\n"); 1991 if (!list_empty(&dev->memwait)) 1992 dd_dev_err(dd, "memwait list not empty!\n"); 1993 1994 del_timer_sync(&dev->mem_timer); 1995 verbs_txreq_exit(dev); 1996 1997 mutex_lock(&cntr_names_lock); 1998 kfree(dev_cntr_names); 1999 kfree(port_cntr_names); 2000 dev_cntr_names = NULL; 2001 port_cntr_names = NULL; 2002 cntr_names_initialized = 0; 2003 mutex_unlock(&cntr_names_lock); 2004 } 2005 2006 void hfi1_cnp_rcv(struct hfi1_packet *packet) 2007 { 2008 struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); 2009 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); 2010 struct ib_header *hdr = packet->hdr; 2011 struct rvt_qp *qp = packet->qp; 2012 u32 lqpn, rqpn = 0; 2013 u16 rlid = 0; 2014 u8 sl, sc5, svc_type; 2015 2016 switch (packet->qp->ibqp.qp_type) { 2017 case IB_QPT_UC: 2018 rlid = rdma_ah_get_dlid(&qp->remote_ah_attr); 2019 rqpn = qp->remote_qpn; 2020 svc_type = IB_CC_SVCTYPE_UC; 2021 break; 2022 case IB_QPT_RC: 2023 rlid = rdma_ah_get_dlid(&qp->remote_ah_attr); 2024 rqpn = qp->remote_qpn; 2025 svc_type = IB_CC_SVCTYPE_RC; 2026 break; 2027 case IB_QPT_SMI: 2028 case IB_QPT_GSI: 2029 case IB_QPT_UD: 2030 svc_type = IB_CC_SVCTYPE_UD; 2031 break; 2032 default: 2033 ibp->rvp.n_pkt_drops++; 2034 return; 2035 } 2036 2037 sc5 = hfi1_9B_get_sc5(hdr, packet->rhf); 2038 sl = ibp->sc_to_sl[sc5]; 2039 lqpn = qp->ibqp.qp_num; 2040 2041 process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type); 2042 } 2043