/*
 * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/log2.h>

#include <rdma/ib_cache.h>
#include <rdma/ib_pack.h>

#include <linux/mlx4/qp.h>

#include "mlx4_ib.h"
#include "user.h"

enum {
	MLX4_IB_ACK_REQ_FREQ	= 8,
};

enum {
	MLX4_IB_DEFAULT_SCHED_QUEUE	= 0x83,
	MLX4_IB_DEFAULT_QP0_SCHED_QUEUE	= 0x3f
};

enum {
	/*
	 * Largest possible UD header: send with GRH and immediate data.
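	 * (LRH 8 bytes + GRH 40 + BTH 12 + DETH 8 + 4 bytes of immediate
	 * data add up to the 72 bytes below.)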
	 */
	MLX4_IB_UD_HEADER_SIZE		= 72,
	MLX4_IB_LSO_HEADER_SPARE	= 128,
};

struct mlx4_ib_sqp {
	struct mlx4_ib_qp	qp;
	int			pkey_index;
	u32			qkey;
	u32			send_psn;
	struct ib_ud_header	ud_header;
	u8			header_buf[MLX4_IB_UD_HEADER_SIZE];
};

enum {
	MLX4_IB_MIN_SQ_STRIDE	= 6,
	MLX4_IB_CACHE_LINE_SIZE	= 64,
};

static const __be32 mlx4_ib_opcode[] = {
	[IB_WR_SEND]			= cpu_to_be32(MLX4_OPCODE_SEND),
	[IB_WR_LSO]			= cpu_to_be32(MLX4_OPCODE_LSO),
	[IB_WR_SEND_WITH_IMM]		= cpu_to_be32(MLX4_OPCODE_SEND_IMM),
	[IB_WR_RDMA_WRITE]		= cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
	[IB_WR_RDMA_WRITE_WITH_IMM]	= cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
	[IB_WR_RDMA_READ]		= cpu_to_be32(MLX4_OPCODE_RDMA_READ),
	[IB_WR_ATOMIC_CMP_AND_SWP]	= cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
	[IB_WR_ATOMIC_FETCH_AND_ADD]	= cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
	[IB_WR_SEND_WITH_INV]		= cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
	[IB_WR_LOCAL_INV]		= cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
	[IB_WR_FAST_REG_MR]		= cpu_to_be32(MLX4_OPCODE_FMR),
};

static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
{
	return container_of(mqp, struct mlx4_ib_sqp, qp);
}

static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
{
	return qp->mqp.qpn >= dev->dev->caps.sqp_start &&
		qp->mqp.qpn <= dev->dev->caps.sqp_start + 3;
}

static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
{
	return qp->mqp.qpn >= dev->dev->caps.sqp_start &&
		qp->mqp.qpn <= dev->dev->caps.sqp_start + 1;
}

static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
{
	return mlx4_buf_offset(&qp->buf, offset);
}

static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
{
	return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
}

static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
{
	return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift));
}

/*
 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 * first four bytes of every 64 byte chunk with
 * 0x7FFFFFFF | (invalid_ownership_value << 31).
 *
 * When the max work request size is less than or equal to the WQE
 * basic block size, as an optimization, we can stamp all WQEs with
 * 0xffffffff, and skip the very first chunk of each WQE.
 */
static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
{
	__be32 *wqe;
	int i;
	int s;
	int ind;
	void *buf;
	__be32 stamp;
	struct mlx4_wqe_ctrl_seg *ctrl;

	if (qp->sq_max_wqes_per_wr > 1) {
		s = roundup(size, 1U << qp->sq.wqe_shift);
		for (i = 0; i < s; i += 64) {
			ind = (i >> qp->sq.wqe_shift) + n;
			stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
						       cpu_to_be32(0xffffffff);
			buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
			wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
			*wqe = stamp;
		}
	} else {
		ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
		s = (ctrl->fence_size & 0x3f) << 4;
		for (i = 64; i < s; i += 64) {
			wqe = buf + i;
			*wqe = cpu_to_be32(0xffffffff);
		}
	}
}

static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
{
	struct mlx4_wqe_ctrl_seg *ctrl;
	struct mlx4_wqe_inline_seg *inl;
	void *wqe;
	int s;

	ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
	s = sizeof(struct mlx4_wqe_ctrl_seg);

	if (qp->ibqp.qp_type == IB_QPT_UD) {
		struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
		struct mlx4_av *av = (struct mlx4_av *)dgram->av;
		memset(dgram, 0, sizeof *dgram);
		av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
		s += sizeof(struct mlx4_wqe_datagram_seg);
	}

	/* Pad the remainder of the WQE with an inline data segment. */
	if (size > s) {
		inl = wqe + s;
		inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
	}
	ctrl->srcrb_flags = 0;
	ctrl->fence_size = size / 16;
	/*
	 * Make sure descriptor is fully written before setting ownership bit
	 * (because HW can start executing as soon as we do).
	 */
	wmb();

	ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
		(n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);

	stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
}

/* Post NOP WQE to prevent wrap-around in the middle of WR */
static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
{
	unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
	if (unlikely(s < qp->sq_max_wqes_per_wr)) {
		post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
		ind += s;
	}
	return ind;
}

static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
{
	struct ib_event event;
	struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;

	if (type == MLX4_EVENT_TYPE_PATH_MIG)
		to_mibqp(qp)->port = to_mibqp(qp)->alt_port;

	if (ibqp->event_handler) {
		event.device     = ibqp->device;
		event.element.qp = ibqp;
		switch (type) {
		case MLX4_EVENT_TYPE_PATH_MIG:
			event.event = IB_EVENT_PATH_MIG;
			break;
		case MLX4_EVENT_TYPE_COMM_EST:
			event.event = IB_EVENT_COMM_EST;
			break;
		case MLX4_EVENT_TYPE_SQ_DRAINED:
			event.event = IB_EVENT_SQ_DRAINED;
			break;
		case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
			event.event = IB_EVENT_QP_LAST_WQE_REACHED;
			break;
		case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
			event.event = IB_EVENT_QP_FATAL;
			break;
		case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
			event.event = IB_EVENT_PATH_MIG_ERR;
			break;
		case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
			event.event = IB_EVENT_QP_REQ_ERR;
			break;
		case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
			event.event = IB_EVENT_QP_ACCESS_ERR;
			break;
		default:
			printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
			       "on QP %06x\n", type, qp->qpn);
			return;
		}

		ibqp->event_handler(&event, ibqp->qp_context);
	}
}

static int send_wqe_overhead(enum ib_qp_type type, u32 flags)
{
	/*
	 * UD WQEs must have a datagram segment.
	 * RC and UC WQEs might have a remote address segment.
	 * MLX WQEs need two extra inline data segments (for the UD
	 * header and space for the ICRC).
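	 * Since an inline segment may not cross a 64 byte boundary, the
	 * UD header may itself be split over several inline segments;
	 * the DIV_ROUND_UP() below sizes one segment header per chunk.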
	 */
	switch (type) {
	case IB_QPT_UD:
		return sizeof (struct mlx4_wqe_ctrl_seg) +
			sizeof (struct mlx4_wqe_datagram_seg) +
			((flags & MLX4_IB_QP_LSO) ? MLX4_IB_LSO_HEADER_SPARE : 0);
	case IB_QPT_UC:
		return sizeof (struct mlx4_wqe_ctrl_seg) +
			sizeof (struct mlx4_wqe_raddr_seg);
	case IB_QPT_RC:
		return sizeof (struct mlx4_wqe_ctrl_seg) +
			sizeof (struct mlx4_wqe_atomic_seg) +
			sizeof (struct mlx4_wqe_raddr_seg);
	case IB_QPT_SMI:
	case IB_QPT_GSI:
		return sizeof (struct mlx4_wqe_ctrl_seg) +
			ALIGN(MLX4_IB_UD_HEADER_SIZE +
			      DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE,
					   MLX4_INLINE_ALIGN) *
			      sizeof (struct mlx4_wqe_inline_seg),
			      sizeof (struct mlx4_wqe_data_seg)) +
			ALIGN(4 +
			      sizeof (struct mlx4_wqe_inline_seg),
			      sizeof (struct mlx4_wqe_data_seg));
	default:
		return sizeof (struct mlx4_wqe_ctrl_seg);
	}
}

static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
		       int is_user, int has_srq, struct mlx4_ib_qp *qp)
{
	/* Sanity check RQ size before proceeding */
	if (cap->max_recv_wr > dev->dev->caps.max_wqes ||
	    cap->max_recv_sge > dev->dev->caps.max_rq_sg)
		return -EINVAL;

	if (has_srq) {
		/* QPs attached to an SRQ should have no RQ */
		if (cap->max_recv_wr)
			return -EINVAL;

		qp->rq.wqe_cnt = qp->rq.max_gs = 0;
	} else {
		/* HW requires >= 1 RQ entry with >= 1 gather entry */
		if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge))
			return -EINVAL;

		qp->rq.wqe_cnt	 = roundup_pow_of_two(max(1U, cap->max_recv_wr));
		qp->rq.max_gs	 = roundup_pow_of_two(max(1U, cap->max_recv_sge));
		qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg));
	}

	cap->max_recv_wr  = qp->rq.max_post = qp->rq.wqe_cnt;
	cap->max_recv_sge = qp->rq.max_gs;

	return 0;
}

static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
			      enum ib_qp_type type, struct mlx4_ib_qp *qp)
{
	int s;

	/* Sanity check SQ size before proceeding */
	if (cap->max_send_wr > dev->dev->caps.max_wqes ||
	    cap->max_send_sge > dev->dev->caps.max_sq_sg ||
	    cap->max_inline_data + send_wqe_overhead(type, qp->flags) +
	    sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
		return -EINVAL;

	/*
	 * For MLX transport we need 2 extra S/G entries:
	 * one for the header and one for the checksum at the end
	 */
	if ((type == IB_QPT_SMI || type == IB_QPT_GSI) &&
	    cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
		return -EINVAL;

	s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
		cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
		send_wqe_overhead(type, qp->flags);

	if (s > dev->dev->caps.max_sq_desc_sz)
		return -EINVAL;

	/*
	 * Hermon supports shrinking WQEs, such that a single work
	 * request can include multiple units of 1 << wqe_shift.  This
	 * way, work requests can differ in size, and do not have to
	 * be a power of 2 in size, saving memory and speeding up send
	 * WR posting.  Unfortunately, if we do this then the
	 * wqe_index field in CQEs can't be used to look up the WR ID
	 * anymore, so we do this only if selective signaling is off.
	 *
	 * Further, on 32-bit platforms, we can't use vmap() to make
	 * the QP buffer virtually contiguous.  Thus we have to use
	 * constant-sized WRs to make sure a WR is always fully within
	 * a single page-sized chunk.
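	 * (Otherwise a shrunken, variable-length WR could straddle two
	 * pages that are not adjacent in the kernel's view of the
	 * queue buffer.)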
	 *
	 * Finally, we use NOP work requests to pad the end of the
	 * work queue, to avoid wrap-around in the middle of WR.  We
	 * set NEC bit to avoid getting completions with error for
	 * these NOP WRs, but since NEC is only supported starting
	 * with firmware 2.2.232, we use constant-sized WRs for older
	 * firmware.
	 *
	 * And, since MLX QPs only support SEND, we use constant-sized
	 * WRs in this case.
	 *
	 * We look for the smallest value of wqe_shift such that the
	 * resulting number of wqes does not exceed device
	 * capabilities.
	 *
	 * We set WQE size to at least 64 bytes, this way stamping
	 * invalidates each WQE.
	 */
	if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
	    qp->sq_signal_bits && BITS_PER_LONG == 64 &&
	    type != IB_QPT_SMI && type != IB_QPT_GSI)
		qp->sq.wqe_shift = ilog2(64);
	else
		qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));

	for (;;) {
		qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);

		/*
		 * We need to leave 2 KB + 1 WR of headroom in the SQ to
		 * allow HW to prefetch.
		 */
		qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
		qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
						    qp->sq_max_wqes_per_wr +
						    qp->sq_spare_wqes);

		if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
			break;

		if (qp->sq_max_wqes_per_wr <= 1)
			return -EINVAL;

		++qp->sq.wqe_shift;
	}

	qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz,
			     (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) -
			 send_wqe_overhead(type, qp->flags)) /
		sizeof (struct mlx4_wqe_data_seg);

	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
		qp->rq.offset = 0;
		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
	} else {
		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
		qp->sq.offset = 0;
	}

	cap->max_send_wr = qp->sq.max_post =
		(qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
	cap->max_send_sge = min(qp->sq.max_gs,
				min(dev->dev->caps.max_sq_sg,
				    dev->dev->caps.max_rq_sg));
	/* We don't support inline sends for kernel QPs (yet) */
	cap->max_inline_data = 0;

	return 0;
}

static int set_user_sq_size(struct mlx4_ib_dev *dev,
			    struct mlx4_ib_qp *qp,
			    struct mlx4_ib_create_qp *ucmd)
{
	/* Sanity check SQ size before proceeding */
	if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes ||
	    ucmd->log_sq_stride >
		ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) ||
	    ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE)
		return -EINVAL;

	qp->sq.wqe_cnt   = 1 << ucmd->log_sq_bb_count;
	qp->sq.wqe_shift = ucmd->log_sq_stride;

	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
		(qp->sq.wqe_cnt << qp->sq.wqe_shift);

	return 0;
}

static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
			    struct ib_qp_init_attr *init_attr,
			    struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
{
	int qpn;
	int err;

	mutex_init(&qp->mutex);
	spin_lock_init(&qp->sq.lock);
	spin_lock_init(&qp->rq.lock);

	qp->state = IB_QPS_RESET;
	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);

	err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
	if (err)
		goto err;

	if (pd->uobject) {
		struct mlx4_ib_create_qp ucmd;

		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
			err = -EFAULT;
			goto err;
		}

		qp->sq_no_prefetch = ucmd.sq_no_prefetch;

		err = set_user_sq_size(dev, qp, &ucmd);
		if (err)
			goto err;

		qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
				       qp->buf_size, 0, 0);
		if (IS_ERR(qp->umem)) {
			err = PTR_ERR(qp->umem);
			goto err;
		}

		err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem),
				    ilog2(qp->umem->page_size), &qp->mtt);
		if (err)
			goto err_buf;

		err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
		if (err)
			goto err_mtt;

		if (!init_attr->srq) {
			err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
						  ucmd.db_addr, &qp->db);
			if (err)
				goto err_mtt;
		}
	} else {
		qp->sq_no_prefetch = 0;

		if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
			qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;

		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
			qp->flags |= MLX4_IB_QP_LSO;

		err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp);
		if (err)
			goto err;

		if (!init_attr->srq) {
			err = mlx4_db_alloc(dev->dev, &qp->db, 0);
			if (err)
				goto err;

			*qp->db.db = 0;
		}

		if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) {
			err = -ENOMEM;
			goto err_db;
		}

		err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
				    &qp->mtt);
		if (err)
			goto err_buf;

		err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf);
		if (err)
			goto err_mtt;

		qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL);
		qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL);

		if (!qp->sq.wrid || !qp->rq.wrid) {
			err = -ENOMEM;
			goto err_wrid;
		}
	}

	if (sqpn) {
		qpn = sqpn;
	} else {
		err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn);
		if (err)
			goto err_wrid;
	}

	err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
	if (err)
		goto err_qpn;

	/*
	 * Hardware wants QPN written in big-endian order (after
	 * shifting) for send doorbell.  Precompute this value to save
	 * a little bit when posting sends.
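	 * (doorbell_qpn is later written as-is with writel() in
	 * mlx4_ib_post_send(), so doing the swab32() once here keeps
	 * the byte swap off the send fast path.)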
	 */
	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);

	qp->mqp.event = mlx4_ib_qp_event;

	return 0;

err_qpn:
	if (!sqpn)
		mlx4_qp_release_range(dev->dev, qpn, 1);

err_wrid:
	if (pd->uobject) {
		if (!init_attr->srq)
			mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context),
					      &qp->db);
	} else {
		kfree(qp->sq.wrid);
		kfree(qp->rq.wrid);
	}

err_mtt:
	mlx4_mtt_cleanup(dev->dev, &qp->mtt);

err_buf:
	if (pd->uobject)
		ib_umem_release(qp->umem);
	else
		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);

err_db:
	if (!pd->uobject && !init_attr->srq)
		mlx4_db_free(dev->dev, &qp->db);

err:
	return err;
}

static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state)
{
	switch (state) {
	case IB_QPS_RESET:	return MLX4_QP_STATE_RST;
	case IB_QPS_INIT:	return MLX4_QP_STATE_INIT;
	case IB_QPS_RTR:	return MLX4_QP_STATE_RTR;
	case IB_QPS_RTS:	return MLX4_QP_STATE_RTS;
	case IB_QPS_SQD:	return MLX4_QP_STATE_SQD;
	case IB_QPS_SQE:	return MLX4_QP_STATE_SQER;
	case IB_QPS_ERR:	return MLX4_QP_STATE_ERR;
	default:		return -1;
	}
}

static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
	__acquires(&send_cq->lock) __acquires(&recv_cq->lock)
{
	if (send_cq == recv_cq) {
		spin_lock_irq(&send_cq->lock);
		__acquire(&recv_cq->lock);
	} else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
		spin_lock_irq(&send_cq->lock);
		spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock_irq(&recv_cq->lock);
		spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
	}
}

static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
	__releases(&send_cq->lock) __releases(&recv_cq->lock)
{
	if (send_cq == recv_cq) {
		__release(&recv_cq->lock);
		spin_unlock_irq(&send_cq->lock);
	} else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
		spin_unlock(&recv_cq->lock);
		spin_unlock_irq(&send_cq->lock);
	} else {
		spin_unlock(&send_cq->lock);
		spin_unlock_irq(&recv_cq->lock);
	}
}

static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
			      int is_user)
{
	struct mlx4_ib_cq *send_cq, *recv_cq;

	if (qp->state != IB_QPS_RESET)
		if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
				   MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
			printk(KERN_WARNING "mlx4_ib: modify QP %06x to RESET failed.\n",
			       qp->mqp.qpn);

	send_cq = to_mcq(qp->ibqp.send_cq);
	recv_cq = to_mcq(qp->ibqp.recv_cq);

	mlx4_ib_lock_cqs(send_cq, recv_cq);

	if (!is_user) {
		__mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
				   qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL);
		if (send_cq != recv_cq)
			__mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
	}

	mlx4_qp_remove(dev->dev, &qp->mqp);

	mlx4_ib_unlock_cqs(send_cq, recv_cq);

	mlx4_qp_free(dev->dev, &qp->mqp);

	if (!is_sqp(dev, qp))
		mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);

	mlx4_mtt_cleanup(dev->dev, &qp->mtt);

	if (is_user) {
		if (!qp->ibqp.srq)
			mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context),
					      &qp->db);
		ib_umem_release(qp->umem);
	} else {
		kfree(qp->sq.wrid);
		kfree(qp->rq.wrid);
		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
		if (!qp->ibqp.srq)
			mlx4_db_free(dev->dev, &qp->db);
	}
}

struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
				struct ib_qp_init_attr *init_attr,
				struct ib_udata *udata)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_sqp *sqp;
	struct mlx4_ib_qp *qp;
	int err;

	/*
	 * We only support LSO and multicast loopback blocking, and
	 * only for kernel UD QPs.
	 */
	if (init_attr->create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO |
					IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK))
		return ERR_PTR(-EINVAL);

	if (init_attr->create_flags &&
	    (pd->uobject || init_attr->qp_type != IB_QPT_UD))
		return ERR_PTR(-EINVAL);

	switch (init_attr->qp_type) {
	case IB_QPT_RC:
	case IB_QPT_UC:
	case IB_QPT_UD:
	{
		qp = kzalloc(sizeof *qp, GFP_KERNEL);
		if (!qp)
			return ERR_PTR(-ENOMEM);

		err = create_qp_common(dev, pd, init_attr, udata, 0, qp);
		if (err) {
			kfree(qp);
			return ERR_PTR(err);
		}

		qp->ibqp.qp_num = qp->mqp.qpn;

		break;
	}
	case IB_QPT_SMI:
	case IB_QPT_GSI:
	{
		/* Userspace is not allowed to create special QPs: */
		if (pd->uobject)
			return ERR_PTR(-EINVAL);

		sqp = kzalloc(sizeof *sqp, GFP_KERNEL);
		if (!sqp)
			return ERR_PTR(-ENOMEM);

		qp = &sqp->qp;

		err = create_qp_common(dev, pd, init_attr, udata,
				       dev->dev->caps.sqp_start +
				       (init_attr->qp_type == IB_QPT_SMI ? 0 : 2) +
				       init_attr->port_num - 1,
				       qp);
		if (err) {
			kfree(sqp);
			return ERR_PTR(err);
		}

		qp->port	= init_attr->port_num;
		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;

		break;
	}
	default:
		/* Don't support raw QPs */
		return ERR_PTR(-EINVAL);
	}

	return &qp->ibqp;
}

int mlx4_ib_destroy_qp(struct ib_qp *qp)
{
	struct mlx4_ib_dev *dev = to_mdev(qp->device);
	struct mlx4_ib_qp *mqp = to_mqp(qp);

	if (is_qp0(dev, mqp))
		mlx4_CLOSE_PORT(dev->dev, mqp->port);

	destroy_qp_common(dev, mqp, !!qp->pd->uobject);

	if (is_sqp(dev, mqp))
		kfree(to_msqp(mqp));
	else
		kfree(mqp);

	return 0;
}

static int to_mlx4_st(enum ib_qp_type type)
{
	switch (type) {
	case IB_QPT_RC:		return MLX4_QP_ST_RC;
	case IB_QPT_UC:		return MLX4_QP_ST_UC;
	case IB_QPT_UD:		return MLX4_QP_ST_UD;
	case IB_QPT_SMI:
	case IB_QPT_GSI:	return MLX4_QP_ST_MLX;
	default:		return -1;
	}
}

static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
				   int attr_mask)
{
	u8 dest_rd_atomic;
	u32 access_flags;
	u32 hw_access_flags = 0;

	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
		dest_rd_atomic = attr->max_dest_rd_atomic;
	else
		dest_rd_atomic = qp->resp_depth;

	if (attr_mask & IB_QP_ACCESS_FLAGS)
		access_flags = attr->qp_access_flags;
	else
		access_flags = qp->atomic_rd_en;

	if (!dest_rd_atomic)
		access_flags &= IB_ACCESS_REMOTE_WRITE;

	if (access_flags & IB_ACCESS_REMOTE_READ)
		hw_access_flags |= MLX4_QP_BIT_RRE;
	if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
		hw_access_flags |= MLX4_QP_BIT_RAE;
	if (access_flags & IB_ACCESS_REMOTE_WRITE)
		hw_access_flags |= MLX4_QP_BIT_RWE;

	return cpu_to_be32(hw_access_flags);
}

static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr,
			    int attr_mask)
{
	if (attr_mask & IB_QP_PKEY_INDEX)
		sqp->pkey_index = attr->pkey_index;
	if (attr_mask & IB_QP_QKEY)
		sqp->qkey = attr->qkey;
	if (attr_mask & IB_QP_SQ_PSN)
		sqp->send_psn = attr->sq_psn;
}

static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
{
	path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
}

static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
			 struct mlx4_qp_path *path, u8 port)
{
	path->grh_mylmc = ah->src_path_bits & 0x7f;
	path->rlid	= cpu_to_be16(ah->dlid);
	if (ah->static_rate) {
		path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET;
		while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
		       !(1 << path->static_rate & dev->dev->caps.stat_rate_support))
			--path->static_rate;
	} else
		path->static_rate = 0;
	path->counter_index = 0xff;

	if (ah->ah_flags & IB_AH_GRH) {
		if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) {
			printk(KERN_ERR "sgid_index (%u) too large. max is %d\n",
			       ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1);
			return -1;
		}

		path->grh_mylmc |= 1 << 7;
		path->mgid_index = ah->grh.sgid_index;
		path->hop_limit  = ah->grh.hop_limit;
		path->tclass_flowlabel =
			cpu_to_be32((ah->grh.traffic_class << 20) |
				    (ah->grh.flow_label));
		memcpy(path->rgid, ah->grh.dgid.raw, 16);
	}

	path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
		((port - 1) << 6) | ((ah->sl & 0xf) << 2);

	return 0;
}

static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
			       const struct ib_qp_attr *attr, int attr_mask,
			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
{
	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
	struct mlx4_ib_qp *qp = to_mqp(ibqp);
	struct mlx4_qp_context *context;
	enum mlx4_qp_optpar optpar = 0;
	int sqd_event;
	int err = -EINVAL;

	context = kzalloc(sizeof *context, GFP_KERNEL);
	if (!context)
		return -ENOMEM;

	context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
				     (to_mlx4_st(ibqp->qp_type) << 16));

	if (!(attr_mask & IB_QP_PATH_MIG_STATE))
		context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
	else {
		optpar |= MLX4_QP_OPTPAR_PM_STATE;
		switch (attr->path_mig_state) {
		case IB_MIG_MIGRATED:
			context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
			break;
		case IB_MIG_REARM:
			context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11);
			break;
		case IB_MIG_ARMED:
			context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11);
			break;
		}
	}

	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI)
		context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
	else if (ibqp->qp_type == IB_QPT_UD) {
		if (qp->flags & MLX4_IB_QP_LSO)
			context->mtu_msgmax = (IB_MTU_4096 << 5) |
					      ilog2(dev->dev->caps.max_gso_sz);
		else
			context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
	} else if (attr_mask & IB_QP_PATH_MTU) {
		if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
			printk(KERN_ERR "path MTU (%u) is invalid\n",
			       attr->path_mtu);
			goto out;
		}
		context->mtu_msgmax = (attr->path_mtu << 5) |
			ilog2(dev->dev->caps.max_msg_sz);
	}

	if (qp->rq.wqe_cnt)
		context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3;
	context->rq_size_stride |= qp->rq.wqe_shift - 4;

	if (qp->sq.wqe_cnt)
		context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3;
	context->sq_size_stride |= qp->sq.wqe_shift - 4;

	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
		context->sq_size_stride |= !!qp->sq_no_prefetch << 7;

	if (qp->ibqp.uobject)
		context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index);
	else
		context->usr_page = cpu_to_be32(dev->priv_uar.index);

	if (attr_mask & IB_QP_DEST_QPN)
		context->remote_qpn = cpu_to_be32(attr->dest_qp_num);

	if (attr_mask & IB_QP_PORT) {
		if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD &&
		    !(attr_mask & IB_QP_AV)) {
			mlx4_set_sched(&context->pri_path, attr->port_num);
			optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE;
		}
	}

	if (attr_mask & IB_QP_PKEY_INDEX) {
		context->pri_path.pkey_index = attr->pkey_index;
		optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
	}

	if (attr_mask & IB_QP_AV) {
		if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path,
				  attr_mask & IB_QP_PORT ? attr->port_num : qp->port))
			goto out;

		optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
			   MLX4_QP_OPTPAR_SCHED_QUEUE);
	}

	if (attr_mask & IB_QP_TIMEOUT) {
		context->pri_path.ackto = attr->timeout << 3;
		optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
	}

	if (attr_mask & IB_QP_ALT_PATH) {
		if (attr->alt_port_num == 0 ||
		    attr->alt_port_num > dev->dev->caps.num_ports)
			goto out;

		if (attr->alt_pkey_index >=
		    dev->dev->caps.pkey_table_len[attr->alt_port_num])
			goto out;

		if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path,
				  attr->alt_port_num))
			goto out;

		context->alt_path.pkey_index = attr->alt_pkey_index;
		context->alt_path.ackto = attr->alt_timeout << 3;
		optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
	}

	context->pd	 = cpu_to_be32(to_mpd(ibqp->pd)->pdn);
	context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);

	/* Set "fast registration enabled" for all kernel QPs */
	if (!qp->ibqp.uobject)
		context->params1 |= cpu_to_be32(1 << 11);

	if (attr_mask & IB_QP_RNR_RETRY) {
		context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
		optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
	}

	if (attr_mask & IB_QP_RETRY_CNT) {
		context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
		optpar |= MLX4_QP_OPTPAR_RETRY_COUNT;
	}

	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
		if (attr->max_rd_atomic)
			context->params1 |=
				cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
		optpar |= MLX4_QP_OPTPAR_SRA_MAX;
	}

	if (attr_mask & IB_QP_SQ_PSN)
		context->next_send_psn = cpu_to_be32(attr->sq_psn);

	context->cqn_send = cpu_to_be32(to_mcq(ibqp->send_cq)->mcq.cqn);

	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
		if (attr->max_dest_rd_atomic)
			context->params2 |=
				cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
		optpar |= MLX4_QP_OPTPAR_RRA_MAX;
	}

	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
		context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask);
		optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
	}

	if (ibqp->srq)
		context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);

	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
		context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
		optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT;
	}
	if (attr_mask & IB_QP_RQ_PSN)
		context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);

	context->cqn_recv = cpu_to_be32(to_mcq(ibqp->recv_cq)->mcq.cqn);

	if (attr_mask & IB_QP_QKEY) {
		context->qkey = cpu_to_be32(attr->qkey);
		optpar |= MLX4_QP_OPTPAR_Q_KEY;
	}

	if (ibqp->srq)
		context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn);

	if (!ibqp->srq && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
		context->db_rec_addr = cpu_to_be64(qp->db.dma);

	if (cur_state == IB_QPS_INIT &&
	    new_state == IB_QPS_RTR &&
	    (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
	     ibqp->qp_type == IB_QPT_UD)) {
		context->pri_path.sched_queue = (qp->port - 1) << 6;
		if (is_qp0(dev, qp))
			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
		else
			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
	}

	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD &&
	    attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
		sqd_event = 1;
	else
		sqd_event = 0;

	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
		context->rlkey |= (1 << 4);

	/*
	 * Before passing a kernel QP to the HW, make sure that the
	 * ownership bits of the send queue are set and the SQ
	 * headroom is stamped so that the hardware doesn't start
	 * processing stale work requests.
	 */
	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
		struct mlx4_wqe_ctrl_seg *ctrl;
		int i;

		for (i = 0; i < qp->sq.wqe_cnt; ++i) {
			ctrl = get_send_wqe(qp, i);
			ctrl->owner_opcode = cpu_to_be32(1 << 31);
			if (qp->sq_max_wqes_per_wr == 1)
				ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);

			stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
		}
	}

	err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state),
			     to_mlx4_state(new_state), context, optpar,
			     sqd_event, &qp->mqp);
	if (err)
		goto out;

	qp->state = new_state;

	if (attr_mask & IB_QP_ACCESS_FLAGS)
		qp->atomic_rd_en = attr->qp_access_flags;
	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
		qp->resp_depth = attr->max_dest_rd_atomic;
	if (attr_mask & IB_QP_PORT)
		qp->port = attr->port_num;
	if (attr_mask & IB_QP_ALT_PATH)
		qp->alt_port = attr->alt_port_num;

	if (is_sqp(dev, qp))
		store_sqp_attrs(to_msqp(qp), attr, attr_mask);

	/*
	 * If we moved QP0 to RTR, bring the IB link up; if we moved
	 * QP0 to RESET or ERROR, bring the link back down.
	 */
	if (is_qp0(dev, qp)) {
		if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR)
			if (mlx4_INIT_PORT(dev->dev, qp->port))
				printk(KERN_WARNING "INIT_PORT failed for port %d\n",
				       qp->port);

		if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
		    (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
			mlx4_CLOSE_PORT(dev->dev, qp->port);
	}

	/*
	 * If we moved a kernel QP to RESET, clean up all old CQ
	 * entries and reinitialize the QP.
	 */
	if (new_state == IB_QPS_RESET && !ibqp->uobject) {
		mlx4_ib_cq_clean(to_mcq(ibqp->recv_cq), qp->mqp.qpn,
				 ibqp->srq ? to_msrq(ibqp->srq): NULL);
		if (ibqp->send_cq != ibqp->recv_cq)
			mlx4_ib_cq_clean(to_mcq(ibqp->send_cq), qp->mqp.qpn, NULL);

		qp->rq.head = 0;
		qp->rq.tail = 0;
		qp->sq.head = 0;
		qp->sq.tail = 0;
		qp->sq_next_wqe = 0;
		if (!ibqp->srq)
			*qp->db.db = 0;
	}

out:
	kfree(context);
	return err;
}

int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
		      int attr_mask, struct ib_udata *udata)
{
	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
	struct mlx4_ib_qp *qp = to_mqp(ibqp);
	enum ib_qp_state cur_state, new_state;
	int err = -EINVAL;

	mutex_lock(&qp->mutex);

	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;

	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask))
		goto out;

	if ((attr_mask & IB_QP_PORT) &&
	    (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) {
		goto out;
	}

	if (attr_mask & IB_QP_PKEY_INDEX) {
		int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
		if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p])
			goto out;
	}

	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
	    attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
		goto out;
	}

	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
	    attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) {
		goto out;
	}

	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
		err = 0;
		goto out;
	}

	err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);

out:
	mutex_unlock(&qp->mutex);
	return err;
}

static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
			    void *wqe, unsigned *mlx_seg_len)
{
	struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev;
	struct mlx4_wqe_mlx_seg *mlx = wqe;
	struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
	struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
	u16 pkey;
	int send_size;
	int header_size;
	int spc;
	int i;

	send_size = 0;
	for (i = 0; i < wr->num_sge; ++i)
		send_size += wr->sg_list[i].length;

	ib_ud_header_init(send_size, mlx4_ib_ah_grh_present(ah), &sqp->ud_header);

	sqp->ud_header.lrh.service_level   =
		be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28;
	sqp->ud_header.lrh.destination_lid = ah->av.dlid;
	sqp->ud_header.lrh.source_lid      = cpu_to_be16(ah->av.g_slid & 0x7f);
	if (mlx4_ib_ah_grh_present(ah)) {
		sqp->ud_header.grh.traffic_class =
			(be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20) & 0xff;
		sqp->ud_header.grh.flow_label    =
			ah->av.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
		sqp->ud_header.grh.hop_limit     = ah->av.hop_limit;
		ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.port_pd) >> 24,
				  ah->av.gid_index, &sqp->ud_header.grh.source_gid);
		memcpy(sqp->ud_header.grh.destination_gid.raw,
		       ah->av.dgid, 16);
	}

	mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
	mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
				  (sqp->ud_header.lrh.destination_lid ==
				   IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
				  (sqp->ud_header.lrh.service_level << 8));
	mlx->rlid = sqp->ud_header.lrh.destination_lid;

	switch (wr->opcode) {
	case IB_WR_SEND:
		sqp->ud_header.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY;
		sqp->ud_header.immediate_present = 0;
		break;
	case IB_WR_SEND_WITH_IMM:
		sqp->ud_header.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
		sqp->ud_header.immediate_present = 1;
		sqp->ud_header.immediate_data    = wr->ex.imm_data;
		break;
	default:
		return -EINVAL;
	}

	sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0;
	if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
		sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
	sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
	if (!sqp->qp.ibqp.qp_num)
		ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
	else
		ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);
	sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
	sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
	sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
					       sqp->qkey : wr->wr.ud.remote_qkey);
	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);

	header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);

	if (0) {
		printk(KERN_ERR "built UD header of size %d:\n", header_size);
		for (i = 0; i < header_size / 4; ++i) {
			if (i % 8 == 0)
				printk(" [%02x] ", i * 4);
			printk(" %08x",
			       be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
			if ((i + 1) % 8 == 0)
				printk("\n");
		}
		printk("\n");
	}

	/*
	 * Inline data segments may not cross a 64 byte boundary.  If
	 * our UD header is bigger than the space available up to the
	 * next 64 byte boundary in the WQE, use two inline data
	 * segments to hold the UD header.
	 */
	spc = MLX4_INLINE_ALIGN -
		((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
	if (header_size <= spc) {
		inl->byte_count = cpu_to_be32(1 << 31 | header_size);
		memcpy(inl + 1, sqp->header_buf, header_size);
		i = 1;
	} else {
		inl->byte_count = cpu_to_be32(1 << 31 | spc);
		memcpy(inl + 1, sqp->header_buf, spc);

		inl = (void *) (inl + 1) + spc;
		memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
		/*
		 * Need a barrier here to make sure all the data is
		 * visible before the byte_count field is set.
		 * Otherwise the HCA prefetcher could grab the 64-byte
		 * chunk with this inline segment and get a valid (!=
		 * 0xffffffff) byte count but stale data, and end up
		 * generating a packet with bad headers.
		 *
		 * The first inline segment's byte_count field doesn't
		 * need a barrier, because it comes after a
		 * control/MLX segment and therefore is at an offset
		 * of 16 mod 64.
		 */
		wmb();
		inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
		i = 2;
	}

	*mlx_seg_len =
		ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
	return 0;
}

static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
{
	unsigned cur;
	struct mlx4_ib_cq *cq;

	cur = wq->head - wq->tail;
	if (likely(cur + nreq < wq->max_post))
		return 0;

	cq = to_mcq(ib_cq);
	spin_lock(&cq->lock);
	cur = wq->head - wq->tail;
	spin_unlock(&cq->lock);

	return cur + nreq >= wq->max_post;
}

static __be32 convert_access(int acc)
{
	return (acc & IB_ACCESS_REMOTE_ATOMIC ? cpu_to_be32(MLX4_WQE_FMR_PERM_ATOMIC)       : 0) |
	       (acc & IB_ACCESS_REMOTE_WRITE  ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_WRITE) : 0) |
	       (acc & IB_ACCESS_REMOTE_READ   ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_READ)  : 0) |
	       (acc & IB_ACCESS_LOCAL_WRITE   ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE)  : 0) |
		cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ);
}

static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr)
{
	struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list);
	int i;

	for (i = 0; i < wr->wr.fast_reg.page_list_len; ++i)
		mfrpl->mapped_page_list[i] =
			cpu_to_be64(wr->wr.fast_reg.page_list->page_list[i] |
				    MLX4_MTT_FLAG_PRESENT);

	fseg->flags		= convert_access(wr->wr.fast_reg.access_flags);
	fseg->mem_key		= cpu_to_be32(wr->wr.fast_reg.rkey);
	fseg->buf_list		= cpu_to_be64(mfrpl->map);
	fseg->start_addr	= cpu_to_be64(wr->wr.fast_reg.iova_start);
	fseg->reg_len		= cpu_to_be64(wr->wr.fast_reg.length);
	fseg->offset		= 0; /* XXX -- is this just for ZBVA? */
	fseg->page_size		= cpu_to_be32(wr->wr.fast_reg.page_shift);
	fseg->reserved[0]	= 0;
	fseg->reserved[1]	= 0;
}

static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
{
	iseg->flags	= 0;
	iseg->mem_key	= cpu_to_be32(rkey);
	iseg->guest_id	= 0;
	iseg->pa	= 0;
}

static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
					  u64 remote_addr, u32 rkey)
{
	rseg->raddr    = cpu_to_be64(remote_addr);
	rseg->rkey     = cpu_to_be32(rkey);
	rseg->reserved = 0;
}

static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *wr)
{
	if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
		aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
		aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add);
	} else {
		aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
		aseg->compare  = 0;
	}

}

static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
			     struct ib_send_wr *wr)
{
	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
	dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
	dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
}

static void set_mlx_icrc_seg(void *dseg)
{
	u32 *t = dseg;
	struct mlx4_wqe_inline_seg *iseg = dseg;

	t[1] = 0;

	/*
	 * Need a barrier here before writing the byte_count field to
	 * make sure that all the data is visible before the
	 * byte_count field is set.  Otherwise, if the segment begins
	 * a new cacheline, the HCA prefetcher could grab the 64-byte
	 * chunk and get a valid (!= 0xffffffff) byte count but
	 * stale data, and end up sending the wrong data.
	 */
	wmb();

	iseg->byte_count = cpu_to_be32((1 << 31) | 4);
}

static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
{
	dseg->lkey = cpu_to_be32(sg->lkey);
	dseg->addr = cpu_to_be64(sg->addr);

	/*
	 * Need a barrier here before writing the byte_count field to
	 * make sure that all the data is visible before the
	 * byte_count field is set.  Otherwise, if the segment begins
	 * a new cacheline, the HCA prefetcher could grab the 64-byte
	 * chunk and get a valid (!= 0xffffffff) byte count but
	 * stale data, and end up sending the wrong data.
	 */
	wmb();

	dseg->byte_count = cpu_to_be32(sg->length);
}

static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
{
	dseg->byte_count = cpu_to_be32(sg->length);
	dseg->lkey       = cpu_to_be32(sg->lkey);
	dseg->addr       = cpu_to_be64(sg->addr);
}

static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
			 struct mlx4_ib_qp *qp, unsigned *lso_seg_len,
			 __be32 *lso_hdr_sz, __be32 *blh)
{
	unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16);

	if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE))
		*blh = cpu_to_be32(1 << 6);

	if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) &&
		     wr->num_sge > qp->sq.max_gs - (halign >> 4)))
		return -EINVAL;

	memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen);

	*lso_hdr_sz  = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 |
				   wr->wr.ud.hlen);
	*lso_seg_len = halign;
	return 0;
}

static __be32 send_ieth(struct ib_send_wr *wr)
{
	switch (wr->opcode) {
	case IB_WR_SEND_WITH_IMM:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		return wr->ex.imm_data;

	case IB_WR_SEND_WITH_INV:
		return cpu_to_be32(wr->ex.invalidate_rkey);

	default:
		return 0;
	}
}

int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
		      struct ib_send_wr **bad_wr)
{
	struct mlx4_ib_qp *qp = to_mqp(ibqp);
	void *wqe;
	struct mlx4_wqe_ctrl_seg *ctrl;
	struct mlx4_wqe_data_seg *dseg;
	unsigned long flags;
	int nreq;
	int err = 0;
	unsigned ind;
	int uninitialized_var(stamp);
	int uninitialized_var(size);
	unsigned uninitialized_var(seglen);
	__be32 dummy;
	__be32 *lso_wqe;
	__be32 uninitialized_var(lso_hdr_sz);
	__be32 blh;
	int i;

	spin_lock_irqsave(&qp->sq.lock, flags);

	ind = qp->sq_next_wqe;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		lso_wqe = &dummy;
		blh = 0;

		if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
			err = -ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (unlikely(wr->num_sge > qp->sq.max_gs)) {
			err = -EINVAL;
			*bad_wr = wr;
			goto out;
		}

		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

		ctrl->srcrb_flags =
			(wr->send_flags & IB_SEND_SIGNALED ?
			 cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
			(wr->send_flags & IB_SEND_SOLICITED ?
			 cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
			((wr->send_flags & IB_SEND_IP_CSUM) ?
			 cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
				     MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) |
			qp->sq_signal_bits;

		ctrl->imm = send_ieth(wr);

		wqe += sizeof *ctrl;
		size = sizeof *ctrl / 16;

		switch (ibqp->qp_type) {
		case IB_QPT_RC:
		case IB_QPT_UC:
			switch (wr->opcode) {
			case IB_WR_ATOMIC_CMP_AND_SWP:
			case IB_WR_ATOMIC_FETCH_AND_ADD:
				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
					      wr->wr.atomic.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);

				set_atomic_seg(wqe, wr);
				wqe  += sizeof (struct mlx4_wqe_atomic_seg);

				size += (sizeof (struct mlx4_wqe_raddr_seg) +
					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;

				break;

			case IB_WR_RDMA_READ:
			case IB_WR_RDMA_WRITE:
			case IB_WR_RDMA_WRITE_WITH_IMM:
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
				break;

			case IB_WR_LOCAL_INV:
				ctrl->srcrb_flags |=
					cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
				set_local_inv_seg(wqe, wr->ex.invalidate_rkey);
				wqe  += sizeof (struct mlx4_wqe_local_inval_seg);
				size += sizeof (struct mlx4_wqe_local_inval_seg) / 16;
				break;

			case IB_WR_FAST_REG_MR:
				ctrl->srcrb_flags |=
					cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
				set_fmr_seg(wqe, wr);
				wqe  += sizeof (struct mlx4_wqe_fmr_seg);
				size += sizeof (struct mlx4_wqe_fmr_seg) / 16;
				break;

			default:
				/* No extra segments required for sends */
				break;
			}
			break;

		case IB_QPT_UD:
			set_datagram_seg(wqe, wr);
			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;

			if (wr->opcode == IB_WR_LSO) {
				err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh);
				if (unlikely(err)) {
					*bad_wr = wr;
					goto out;
				}
				lso_wqe = (__be32 *) wqe;
				wqe  += seglen;
				size += seglen / 16;
			}
			break;

		case IB_QPT_SMI:
		case IB_QPT_GSI:
			err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen);
			if (unlikely(err)) {
				*bad_wr = wr;
				goto out;
			}
			wqe  += seglen;
			size += seglen / 16;
			break;

		default:
			break;
		}

		/*
		 * Write data segments in reverse order, so as to
		 * overwrite cacheline stamp last within each
		 * cacheline.  This avoids issues with WQE
		 * prefetching.
		 */

		dseg = wqe;
		dseg += wr->num_sge - 1;
		size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);

		/* Add one more inline data segment for ICRC for MLX sends */
		if (unlikely(qp->ibqp.qp_type == IB_QPT_SMI ||
			     qp->ibqp.qp_type == IB_QPT_GSI)) {
			set_mlx_icrc_seg(dseg + 1);
			size += sizeof (struct mlx4_wqe_data_seg) / 16;
		}

		for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
			set_data_seg(dseg, wr->sg_list + i);

		/*
		 * Possibly overwrite stamping in cacheline with LSO
		 * segment only after making sure all data segments
		 * are written.
		 */
		wmb();
		*lso_wqe = lso_hdr_sz;

		ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
				    MLX4_WQE_CTRL_FENCE : 0) | size;

		/*
		 * Make sure descriptor is fully written before
		 * setting ownership bit (because HW can start
		 * executing as soon as we do).
		 */
		wmb();

		if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
			err = -EINVAL;
			goto out;
		}

		ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
			(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh;

		stamp = ind + qp->sq_spare_wqes;
		ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);

		/*
		 * We can improve latency by not stamping the last
		 * send queue WQE until after ringing the doorbell, so
		 * only stamp here if there are still more WQEs to post.
		 *
		 * Same optimization applies to padding with NOP wqe
		 * in case of WQE shrinking (used to prevent wrap-around
		 * in the middle of WR).
		 */
		if (wr->next) {
			stamp_send_wqe(qp, stamp, size * 16);
			ind = pad_wraparound(qp, ind);
		}
	}

out:
	if (likely(nreq)) {
		qp->sq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		writel(qp->doorbell_qpn,
		       to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL);

		/*
		 * Make sure doorbells don't leak out of SQ spinlock
		 * and reach the HCA out of order.
		 */
		mmiowb();

		stamp_send_wqe(qp, stamp, size * 16);

		ind = pad_wraparound(qp, ind);
		qp->sq_next_wqe = ind;
	}

	spin_unlock_irqrestore(&qp->sq.lock, flags);

	return err;
}

int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
		      struct ib_recv_wr **bad_wr)
{
	struct mlx4_ib_qp *qp = to_mqp(ibqp);
	struct mlx4_wqe_data_seg *scat;
	unsigned long flags;
	int err = 0;
	int nreq;
	int ind;
	int i;

	spin_lock_irqsave(&qp->rq.lock, flags);

	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
			err = -ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
			err = -EINVAL;
			*bad_wr = wr;
			goto out;
		}

		scat = get_recv_wqe(qp, ind);

		for (i = 0; i < wr->num_sge; ++i)
			__set_data_seg(scat + i, wr->sg_list + i);

		if (i < qp->rq.max_gs) {
			scat[i].byte_count = 0;
			scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY);
			scat[i].addr = 0;
		}

		qp->rq.wrid[ind] = wr->wr_id;

		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
	}

out:
	if (likely(nreq)) {
		qp->rq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		*qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
	}

	spin_unlock_irqrestore(&qp->rq.lock, flags);

	return err;
}

static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state)
{
	switch (mlx4_state) {
	case MLX4_QP_STATE_RST:      return IB_QPS_RESET;
	case MLX4_QP_STATE_INIT:     return IB_QPS_INIT;
	case MLX4_QP_STATE_RTR:      return IB_QPS_RTR;
	case MLX4_QP_STATE_RTS:      return IB_QPS_RTS;
	case MLX4_QP_STATE_SQ_DRAINING:
	case MLX4_QP_STATE_SQD:      return IB_QPS_SQD;
	case MLX4_QP_STATE_SQER:     return IB_QPS_SQE;
	case MLX4_QP_STATE_ERR:      return IB_QPS_ERR;
	default:		     return -1;
	}
}

static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state)
{
	switch (mlx4_mig_state) {
	case MLX4_QP_PM_ARMED:		return IB_MIG_ARMED;
	case MLX4_QP_PM_REARM:		return IB_MIG_REARM;
	case MLX4_QP_PM_MIGRATED:	return IB_MIG_MIGRATED;
	default:			return -1;
	}
}

static int to_ib_qp_access_flags(int mlx4_flags)
{
	int ib_flags = 0;

	if (mlx4_flags & MLX4_QP_BIT_RRE)
		ib_flags |= IB_ACCESS_REMOTE_READ;
	if (mlx4_flags & MLX4_QP_BIT_RWE)
		ib_flags |= IB_ACCESS_REMOTE_WRITE;
	if (mlx4_flags & MLX4_QP_BIT_RAE)
		ib_flags |= IB_ACCESS_REMOTE_ATOMIC;

	return ib_flags;
}

static void to_ib_ah_attr(struct mlx4_dev *dev, struct ib_ah_attr *ib_ah_attr,
			  struct mlx4_qp_path *path)
{
	memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
	ib_ah_attr->port_num	  = path->sched_queue & 0x40 ? 2 : 1;

	if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports)
		return;

	ib_ah_attr->dlid	  = be16_to_cpu(path->rlid);
	ib_ah_attr->sl		  = (path->sched_queue >> 2) & 0xf;
	ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f;
	ib_ah_attr->static_rate   = path->static_rate ? path->static_rate - 5 : 0;
	ib_ah_attr->ah_flags      = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0;
	if (ib_ah_attr->ah_flags) {
		ib_ah_attr->grh.sgid_index = path->mgid_index;
		ib_ah_attr->grh.hop_limit  = path->hop_limit;
		ib_ah_attr->grh.traffic_class =
			(be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff;
		ib_ah_attr->grh.flow_label =
			be32_to_cpu(path->tclass_flowlabel) & 0xfffff;
		memcpy(ib_ah_attr->grh.dgid.raw,
			path->rgid, sizeof ib_ah_attr->grh.dgid.raw);
	}
}

int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
		     struct ib_qp_init_attr *qp_init_attr)
{
	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
	struct mlx4_ib_qp *qp = to_mqp(ibqp);
	struct mlx4_qp_context context;
	int mlx4_state;
	int err = 0;

	mutex_lock(&qp->mutex);

	if (qp->state == IB_QPS_RESET) {
		qp_attr->qp_state = IB_QPS_RESET;
		goto done;
	}

	err = mlx4_qp_query(dev->dev, &qp->mqp, &context);
	if (err) {
		err = -EINVAL;
		goto out;
	}

	mlx4_state = be32_to_cpu(context.flags) >> 28;

	qp->state		 = to_ib_qp_state(mlx4_state);
	qp_attr->qp_state	 = qp->state;
	qp_attr->path_mtu	 = context.mtu_msgmax >> 5;
	qp_attr->path_mig_state	 =
		to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
	qp_attr->qkey		 = be32_to_cpu(context.qkey);
	qp_attr->rq_psn		 = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
	qp_attr->sq_psn		 = be32_to_cpu(context.next_send_psn) & 0xffffff;
	qp_attr->dest_qp_num	 = be32_to_cpu(context.remote_qpn) & 0xffffff;
	qp_attr->qp_access_flags =
		to_ib_qp_access_flags(be32_to_cpu(context.params2));

	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
		to_ib_ah_attr(dev->dev, &qp_attr->ah_attr, &context.pri_path);
		to_ib_ah_attr(dev->dev, &qp_attr->alt_ah_attr, &context.alt_path);
		qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
		qp_attr->alt_port_num	= qp_attr->alt_ah_attr.port_num;
	}

	qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
	if (qp_attr->qp_state == IB_QPS_INIT)
		qp_attr->port_num = qp->port;
	else
		qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1;

	/* qp_attr->en_sqd_async_notify is only applicable in modify qp */
	qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;

	qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);

	qp_attr->max_dest_rd_atomic =
		1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
	qp_attr->min_rnr_timer	    =
		(be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
	qp_attr->timeout	    = context.pri_path.ackto >> 3;
	qp_attr->retry_cnt	    = (be32_to_cpu(context.params1) >> 16) & 0x7;
	qp_attr->rnr_retry	    = (be32_to_cpu(context.params1) >> 13) & 0x7;
	qp_attr->alt_timeout	    = context.alt_path.ackto >> 3;

done:
	qp_attr->cur_qp_state	  = qp_attr->qp_state;
	qp_attr->cap.max_recv_wr  = qp->rq.wqe_cnt;
	qp_attr->cap.max_recv_sge = qp->rq.max_gs;

	if (!ibqp->uobject) {
		qp_attr->cap.max_send_wr  = qp->sq.wqe_cnt;
		qp_attr->cap.max_send_sge = qp->sq.max_gs;
	} else {
		qp_attr->cap.max_send_wr  = 0;
		qp_attr->cap.max_send_sge = 0;
	}

	/*
	 * We don't support inline sends for kernel QPs (yet), and we
	 * don't know what userspace's value should be.
	 */
	qp_attr->cap.max_inline_data = 0;

	qp_init_attr->cap = qp_attr->cap;

	qp_init_attr->create_flags = 0;
	if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
		qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;

	if (qp->flags & MLX4_IB_QP_LSO)
		qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;

out:
	mutex_unlock(&qp->mutex);
	return err;
}