/*
 * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials
 *   provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/ib_cache.h>
#include <rdma/ib_pack.h>

#include <linux/mlx4/qp.h>

#include "mlx4_ib.h"
#include "user.h"

enum {
	MLX4_IB_ACK_REQ_FREQ	= 8,
};

enum {
	MLX4_IB_DEFAULT_SCHED_QUEUE	= 0x83,
	MLX4_IB_DEFAULT_QP0_SCHED_QUEUE	= 0x3f
};

enum {
	/*
	 * Largest possible UD header: send with GRH and immediate data.
	 */
	MLX4_IB_UD_HEADER_SIZE	= 72
};

struct mlx4_ib_sqp {
	struct mlx4_ib_qp	qp;
	int			pkey_index;
	u32			qkey;
	u32			send_psn;
	struct ib_ud_header	ud_header;
	u8			header_buf[MLX4_IB_UD_HEADER_SIZE];
};

static const __be32 mlx4_ib_opcode[] = {
	[IB_WR_SEND]			= __constant_cpu_to_be32(MLX4_OPCODE_SEND),
	[IB_WR_SEND_WITH_IMM]		= __constant_cpu_to_be32(MLX4_OPCODE_SEND_IMM),
	[IB_WR_RDMA_WRITE]		= __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
	[IB_WR_RDMA_WRITE_WITH_IMM]	= __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
	[IB_WR_RDMA_READ]		= __constant_cpu_to_be32(MLX4_OPCODE_RDMA_READ),
	[IB_WR_ATOMIC_CMP_AND_SWP]	= __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
	[IB_WR_ATOMIC_FETCH_AND_ADD]	= __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
};

static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
{
	return container_of(mqp, struct mlx4_ib_sqp, qp);
}

static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
{
	return qp->mqp.qpn >= dev->dev->caps.sqp_start &&
		qp->mqp.qpn <= dev->dev->caps.sqp_start + 3;
}

static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
{
	return qp->mqp.qpn >= dev->dev->caps.sqp_start &&
		qp->mqp.qpn <= dev->dev->caps.sqp_start + 1;
}

static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
{
	if (qp->buf.nbufs == 1)
		return qp->buf.u.direct.buf + offset;
	else
		return qp->buf.u.page_list[offset >> PAGE_SHIFT].buf +
			(offset & (PAGE_SIZE - 1));
}

static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
{
	return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
}

static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
{
	return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift));
}

/*
 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 * first four bytes of every 64 byte chunk with 0xffffffff, except for
 * the very first chunk of the WQE.
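 * The first chunk is left untouched since it begins with the control
 * segment, whose ownership bit already marks whether the WQE is valid.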
 */
static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
{
	u32 *wqe = get_send_wqe(qp, n);
	int i;

	for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16)
		wqe[i] = 0xffffffff;
}

static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
{
	struct ib_event event;
	struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;

	if (type == MLX4_EVENT_TYPE_PATH_MIG)
		to_mibqp(qp)->port = to_mibqp(qp)->alt_port;

	if (ibqp->event_handler) {
		event.device     = ibqp->device;
		event.element.qp = ibqp;
		switch (type) {
		case MLX4_EVENT_TYPE_PATH_MIG:
			event.event = IB_EVENT_PATH_MIG;
			break;
		case MLX4_EVENT_TYPE_COMM_EST:
			event.event = IB_EVENT_COMM_EST;
			break;
		case MLX4_EVENT_TYPE_SQ_DRAINED:
			event.event = IB_EVENT_SQ_DRAINED;
			break;
		case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
			event.event = IB_EVENT_QP_LAST_WQE_REACHED;
			break;
		case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
			event.event = IB_EVENT_QP_FATAL;
			break;
		case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
			event.event = IB_EVENT_PATH_MIG_ERR;
			break;
		case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
			event.event = IB_EVENT_QP_REQ_ERR;
			break;
		case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
			event.event = IB_EVENT_QP_ACCESS_ERR;
			break;
		default:
			printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
			       "on QP %06x\n", type, qp->qpn);
			return;
		}

		ibqp->event_handler(&event, ibqp->qp_context);
	}
}

static int send_wqe_overhead(enum ib_qp_type type)
{
	/*
	 * UD WQEs must have a datagram segment.
	 * RC and UC WQEs might have a remote address segment.
	 * MLX WQEs need two extra inline data segments (for the UD
	 * header and space for the ICRC).
	 */
	switch (type) {
	case IB_QPT_UD:
		return sizeof (struct mlx4_wqe_ctrl_seg) +
			sizeof (struct mlx4_wqe_datagram_seg);
	case IB_QPT_UC:
		return sizeof (struct mlx4_wqe_ctrl_seg) +
			sizeof (struct mlx4_wqe_raddr_seg);
	case IB_QPT_RC:
		return sizeof (struct mlx4_wqe_ctrl_seg) +
			sizeof (struct mlx4_wqe_atomic_seg) +
			sizeof (struct mlx4_wqe_raddr_seg);
	case IB_QPT_SMI:
	case IB_QPT_GSI:
		return sizeof (struct mlx4_wqe_ctrl_seg) +
			ALIGN(MLX4_IB_UD_HEADER_SIZE +
			      DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE,
					   MLX4_INLINE_ALIGN) *
			      sizeof (struct mlx4_wqe_inline_seg),
			      sizeof (struct mlx4_wqe_data_seg)) +
			ALIGN(4 +
			      sizeof (struct mlx4_wqe_inline_seg),
			      sizeof (struct mlx4_wqe_data_seg));
	default:
		return sizeof (struct mlx4_wqe_ctrl_seg);
	}
}

static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
		       int is_user, int has_srq, struct mlx4_ib_qp *qp)
{
	/* Sanity check RQ size before proceeding */
	if (cap->max_recv_wr  > dev->dev->caps.max_wqes ||
	    cap->max_recv_sge > dev->dev->caps.max_rq_sg)
		return -EINVAL;

	if (has_srq) {
		/* QPs attached to an SRQ should have no RQ */
		if (cap->max_recv_wr)
			return -EINVAL;

		qp->rq.wqe_cnt = qp->rq.max_gs = 0;
	} else {
		/* HW requires >= 1 RQ entry with >= 1 gather entry */
		if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge))
			return -EINVAL;

		qp->rq.wqe_cnt	 = roundup_pow_of_two(max(1U, cap->max_recv_wr));
		qp->rq.max_gs	 = roundup_pow_of_two(max(1U, cap->max_recv_sge));
		qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg));
	}

	cap->max_recv_wr  = qp->rq.max_post = qp->rq.wqe_cnt;
	cap->max_recv_sge = qp->rq.max_gs;

	return 0;
}

static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
			      enum ib_qp_type type, struct mlx4_ib_qp *qp)
{
	/* Sanity check SQ size before proceeding */
	if (cap->max_send_wr  > dev->dev->caps.max_wqes ||
	    cap->max_send_sge > dev->dev->caps.max_sq_sg ||
	    cap->max_inline_data + send_wqe_overhead(type) +
	    sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
		return -EINVAL;

	/*
	 * For MLX transport we need 2 extra S/G entries:
	 * one for the header and one for the checksum at the end
	 */
	if ((type == IB_QPT_SMI || type == IB_QPT_GSI) &&
	    cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
		return -EINVAL;

	qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
							sizeof (struct mlx4_wqe_data_seg),
							cap->max_inline_data +
							sizeof (struct mlx4_wqe_inline_seg)) +
						    send_wqe_overhead(type)));
	qp->sq.max_gs = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) /
		sizeof (struct mlx4_wqe_data_seg);

	/*
	 * We need to leave 2 KB + 1 WQE of headroom in the SQ to
	 * allow HW to prefetch.
	 */
	qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
	qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes);

	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
		qp->rq.offset = 0;
		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
	} else {
		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
		qp->sq.offset = 0;
	}

	cap->max_send_wr  = qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;
	cap->max_send_sge = qp->sq.max_gs;
	/* We don't support inline sends for kernel QPs (yet) */
	cap->max_inline_data = 0;

	return 0;
}

static int set_user_sq_size(struct mlx4_ib_qp *qp,
			    struct mlx4_ib_create_qp *ucmd)
{
	qp->sq.wqe_cnt   = 1 << ucmd->log_sq_bb_count;
	qp->sq.wqe_shift = ucmd->log_sq_stride;

	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
		(qp->sq.wqe_cnt << qp->sq.wqe_shift);

	return 0;
}

static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
			    struct ib_qp_init_attr *init_attr,
			    struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
{
	int err;

	mutex_init(&qp->mutex);
	spin_lock_init(&qp->sq.lock);
	spin_lock_init(&qp->rq.lock);

	qp->state	 = IB_QPS_RESET;
	qp->atomic_rd_en = 0;
	qp->resp_depth	 = 0;

	qp->rq.head = 0;
	qp->rq.tail = 0;
	qp->sq.head = 0;
	qp->sq.tail = 0;

	err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
	if (err)
		goto err;

	if (pd->uobject) {
		struct mlx4_ib_create_qp ucmd;

		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
			err = -EFAULT;
			goto err;
		}

		qp->sq_no_prefetch = ucmd.sq_no_prefetch;

		err = set_user_sq_size(qp, &ucmd);
		if (err)
			goto err;

		qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
				       qp->buf_size, 0);
		if (IS_ERR(qp->umem)) {
			err = PTR_ERR(qp->umem);
			goto err;
		}

		err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem),
				    ilog2(qp->umem->page_size), &qp->mtt);
		if (err)
			goto err_buf;

		err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
		if (err)
			goto err_mtt;

		if (!init_attr->srq) {
			err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
						  ucmd.db_addr, &qp->db);
			if (err)
				goto err_mtt;
		}
	} else {
		qp->sq_no_prefetch = 0;

		err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp);
		if (err)
			goto err;

		if (!init_attr->srq) {
			err = mlx4_ib_db_alloc(dev, &qp->db, 0);
			if (err)
				goto err;

			*qp->db.db = 0;
		}

		if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) {
			err = -ENOMEM;
			goto err_db;
		}

		err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
				    &qp->mtt);
		if (err)
			goto err_buf;

		err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf);
		if (err)
			goto err_mtt;

		qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL);
		qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL);

		if (!qp->sq.wrid || !qp->rq.wrid) {
			err = -ENOMEM;
			goto err_wrid;
		}
	}

	err = mlx4_qp_alloc(dev->dev, sqpn, &qp->mqp);
	if (err)
		goto err_wrid;

	/*
	 * Hardware wants QPN written in big-endian order (after
	 * shifting) for send doorbell.  Precompute this value to save
	 * a little bit when posting sends.
	 */
	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);

	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
	else
		qp->sq_signal_bits = 0;

	qp->mqp.event = mlx4_ib_qp_event;

	return 0;

err_wrid:
	if (pd->uobject) {
		if (!init_attr->srq)
			mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context),
					      &qp->db);
	} else {
		kfree(qp->sq.wrid);
		kfree(qp->rq.wrid);
	}

err_mtt:
	mlx4_mtt_cleanup(dev->dev, &qp->mtt);

err_buf:
	if (pd->uobject)
		ib_umem_release(qp->umem);
	else
		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);

err_db:
	if (!pd->uobject && !init_attr->srq)
		mlx4_ib_db_free(dev, &qp->db);

err:
	return err;
}

static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state)
{
	switch (state) {
	case IB_QPS_RESET:	return MLX4_QP_STATE_RST;
	case IB_QPS_INIT:	return MLX4_QP_STATE_INIT;
	case IB_QPS_RTR:	return MLX4_QP_STATE_RTR;
	case IB_QPS_RTS:	return MLX4_QP_STATE_RTS;
	case IB_QPS_SQD:	return MLX4_QP_STATE_SQD;
	case IB_QPS_SQE:	return MLX4_QP_STATE_SQER;
	case IB_QPS_ERR:	return MLX4_QP_STATE_ERR;
	default:		return -1;
	}
}

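/*
 * Take the send and receive CQ locks in a fixed order (lower CQN
 * first) so that two threads locking the same pair of CQs can never
 * deadlock; mlx4_ib_unlock_cqs() releases them in the reverse order.
 */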
static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
{
	if (send_cq == recv_cq)
		spin_lock_irq(&send_cq->lock);
	else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
		spin_lock_irq(&send_cq->lock);
		spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock_irq(&recv_cq->lock);
		spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
	}
}

static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
{
	if (send_cq == recv_cq)
		spin_unlock_irq(&send_cq->lock);
	else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
		spin_unlock(&recv_cq->lock);
		spin_unlock_irq(&send_cq->lock);
	} else {
		spin_unlock(&send_cq->lock);
		spin_unlock_irq(&recv_cq->lock);
	}
}

static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
			      int is_user)
{
	struct mlx4_ib_cq *send_cq, *recv_cq;

	if (qp->state != IB_QPS_RESET)
		if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
				   MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
			printk(KERN_WARNING "mlx4_ib: modify QP %06x to RESET failed.\n",
			       qp->mqp.qpn);

	send_cq = to_mcq(qp->ibqp.send_cq);
	recv_cq = to_mcq(qp->ibqp.recv_cq);

	mlx4_ib_lock_cqs(send_cq, recv_cq);

	if (!is_user) {
		__mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
				   qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL);
		if (send_cq != recv_cq)
			__mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
	}

	mlx4_qp_remove(dev->dev, &qp->mqp);

	mlx4_ib_unlock_cqs(send_cq, recv_cq);

	mlx4_qp_free(dev->dev, &qp->mqp);
	mlx4_mtt_cleanup(dev->dev, &qp->mtt);

	if (is_user) {
		if (!qp->ibqp.srq)
			mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context),
					      &qp->db);
		ib_umem_release(qp->umem);
	} else {
		kfree(qp->sq.wrid);
		kfree(qp->rq.wrid);
		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
		if (!qp->ibqp.srq)
			mlx4_ib_db_free(dev, &qp->db);
	}
}

struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
				struct ib_qp_init_attr *init_attr,
				struct ib_udata *udata)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_sqp *sqp;
	struct mlx4_ib_qp *qp;
	int err;

	switch (init_attr->qp_type) {
	case IB_QPT_RC:
	case IB_QPT_UC:
	case IB_QPT_UD:
	{
		qp = kmalloc(sizeof *qp, GFP_KERNEL);
		if (!qp)
			return ERR_PTR(-ENOMEM);

		err = create_qp_common(dev, pd, init_attr, udata, 0, qp);
		if (err) {
			kfree(qp);
			return ERR_PTR(err);
		}

		qp->ibqp.qp_num = qp->mqp.qpn;

		break;
	}
	case IB_QPT_SMI:
	case IB_QPT_GSI:
	{
		/* Userspace is not allowed to create special QPs: */
		if (pd->uobject)
			return ERR_PTR(-EINVAL);

		sqp = kmalloc(sizeof *sqp, GFP_KERNEL);
		if (!sqp)
			return ERR_PTR(-ENOMEM);

		qp = &sqp->qp;

		err = create_qp_common(dev, pd, init_attr, udata,
				       dev->dev->caps.sqp_start +
				       (init_attr->qp_type == IB_QPT_SMI ? 0 : 2) +
				       init_attr->port_num - 1,
				       qp);
		if (err) {
			kfree(sqp);
			return ERR_PTR(err);
		}

		qp->port	= init_attr->port_num;
		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;

		break;
	}
	default:
		/* Don't support raw QPs */
		return ERR_PTR(-EINVAL);
	}

	return &qp->ibqp;
}

int mlx4_ib_destroy_qp(struct ib_qp *qp)
{
	struct mlx4_ib_dev *dev = to_mdev(qp->device);
	struct mlx4_ib_qp *mqp = to_mqp(qp);

	if (is_qp0(dev, mqp))
		mlx4_CLOSE_PORT(dev->dev, mqp->port);

	destroy_qp_common(dev, mqp, !!qp->pd->uobject);

	if (is_sqp(dev, mqp))
		kfree(to_msqp(mqp));
	else
		kfree(mqp);

	return 0;
}

static int to_mlx4_st(enum ib_qp_type type)
{
	switch (type) {
	case IB_QPT_RC:		return MLX4_QP_ST_RC;
	case IB_QPT_UC:		return MLX4_QP_ST_UC;
	case IB_QPT_UD:		return MLX4_QP_ST_UD;
	case IB_QPT_SMI:
	case IB_QPT_GSI:	return MLX4_QP_ST_MLX;
	default:		return -1;
	}
}

static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
				   int attr_mask)
{
	u8 dest_rd_atomic;
	u32 access_flags;
	u32 hw_access_flags = 0;

	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
		dest_rd_atomic = attr->max_dest_rd_atomic;
	else
		dest_rd_atomic = qp->resp_depth;

	if (attr_mask & IB_QP_ACCESS_FLAGS)
		access_flags = attr->qp_access_flags;
	else
		access_flags = qp->atomic_rd_en;

	if (!dest_rd_atomic)
		access_flags &= IB_ACCESS_REMOTE_WRITE;

	if (access_flags & IB_ACCESS_REMOTE_READ)
		hw_access_flags |= MLX4_QP_BIT_RRE;
	if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
		hw_access_flags |= MLX4_QP_BIT_RAE;
	if (access_flags & IB_ACCESS_REMOTE_WRITE)
		hw_access_flags |= MLX4_QP_BIT_RWE;

	return cpu_to_be32(hw_access_flags);
}

static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr,
			    int attr_mask)
{
	if (attr_mask & IB_QP_PKEY_INDEX)
		sqp->pkey_index = attr->pkey_index;
	if (attr_mask & IB_QP_QKEY)
		sqp->qkey = attr->qkey;
	if (attr_mask & IB_QP_SQ_PSN)
		sqp->send_psn = attr->sq_psn;
}

static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
{
	path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
}

static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
			 struct mlx4_qp_path *path, u8 port)
{
	path->grh_mylmc = ah->src_path_bits & 0x7f;
	path->rlid	= cpu_to_be16(ah->dlid);
	if (ah->static_rate) {
		path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET;
		while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
		       !(1 << path->static_rate & dev->dev->caps.stat_rate_support))
			--path->static_rate;
	} else
		path->static_rate = 0;
	path->counter_index = 0xff;

	if (ah->ah_flags & IB_AH_GRH) {
		if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) {
			printk(KERN_ERR "sgid_index (%u) too large. max is %d\n",
			       ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1);
			return -1;
		}

		path->grh_mylmc |= 1 << 7;
		path->mgid_index = ah->grh.sgid_index;
		path->hop_limit  = ah->grh.hop_limit;
		path->tclass_flowlabel =
			cpu_to_be32((ah->grh.traffic_class << 20) |
				    (ah->grh.flow_label));
		memcpy(path->rgid, ah->grh.dgid.raw, 16);
	}

	path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
		((port - 1) << 6) | ((ah->sl & 0xf) << 2);

	return 0;
}

static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
			       const struct ib_qp_attr *attr, int attr_mask,
			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
{
	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
	struct mlx4_ib_qp *qp = to_mqp(ibqp);
	struct mlx4_qp_context *context;
	enum mlx4_qp_optpar optpar = 0;
	int sqd_event;
	int err = -EINVAL;

	context = kzalloc(sizeof *context, GFP_KERNEL);
	if (!context)
		return -ENOMEM;

	context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
				     (to_mlx4_st(ibqp->qp_type) << 16));
	context->flags |= cpu_to_be32(1 << 8); /* DE? */

	if (!(attr_mask & IB_QP_PATH_MIG_STATE))
		context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
	else {
		optpar |= MLX4_QP_OPTPAR_PM_STATE;
		switch (attr->path_mig_state) {
		case IB_MIG_MIGRATED:
			context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
			break;
		case IB_MIG_REARM:
			context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11);
			break;
		case IB_MIG_ARMED:
			context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11);
			break;
		}
	}

	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
	    ibqp->qp_type == IB_QPT_UD)
		context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
	else if (attr_mask & IB_QP_PATH_MTU) {
		if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
			printk(KERN_ERR "path MTU (%u) is invalid\n",
			       attr->path_mtu);
			goto out;
		}
		context->mtu_msgmax = (attr->path_mtu << 5) | 31;
	}

	if (qp->rq.wqe_cnt)
		context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3;
	context->rq_size_stride |= qp->rq.wqe_shift - 4;

	if (qp->sq.wqe_cnt)
		context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3;
	context->sq_size_stride |= qp->sq.wqe_shift - 4;

	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
		context->sq_size_stride |= !!qp->sq_no_prefetch << 7;

	if (qp->ibqp.uobject)
		context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index);
	else
		context->usr_page = cpu_to_be32(dev->priv_uar.index);

	if (attr_mask & IB_QP_DEST_QPN)
		context->remote_qpn = cpu_to_be32(attr->dest_qp_num);

	if (attr_mask & IB_QP_PORT) {
		if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD &&
		    !(attr_mask & IB_QP_AV)) {
			mlx4_set_sched(&context->pri_path, attr->port_num);
			optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE;
		}
	}

	if (attr_mask & IB_QP_PKEY_INDEX) {
		context->pri_path.pkey_index = attr->pkey_index;
		optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
	}

	if (attr_mask & IB_QP_AV) {
		if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path,
				  attr_mask & IB_QP_PORT ? attr->port_num : qp->port))
			goto out;

		optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
			   MLX4_QP_OPTPAR_SCHED_QUEUE);
	}

	if (attr_mask & IB_QP_TIMEOUT) {
		context->pri_path.ackto = attr->timeout << 3;
		optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
	}

	if (attr_mask & IB_QP_ALT_PATH) {
		if (attr->alt_port_num == 0 ||
		    attr->alt_port_num > dev->dev->caps.num_ports)
			goto out;

		if (attr->alt_pkey_index >=
		    dev->dev->caps.pkey_table_len[attr->alt_port_num])
			goto out;

		if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path,
				  attr->alt_port_num))
			goto out;

		context->alt_path.pkey_index = attr->alt_pkey_index;
		context->alt_path.ackto = attr->alt_timeout << 3;
		optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
	}

	context->pd	    = cpu_to_be32(to_mpd(ibqp->pd)->pdn);
	context->params1    = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);

	if (attr_mask & IB_QP_RNR_RETRY) {
		context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
		optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
	}

	if (attr_mask & IB_QP_RETRY_CNT) {
		context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
		optpar |= MLX4_QP_OPTPAR_RETRY_COUNT;
	}

	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
		if (attr->max_rd_atomic)
			context->params1 |=
				cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
		optpar |= MLX4_QP_OPTPAR_SRA_MAX;
	}

	if (attr_mask & IB_QP_SQ_PSN)
		context->next_send_psn = cpu_to_be32(attr->sq_psn);

	context->cqn_send = cpu_to_be32(to_mcq(ibqp->send_cq)->mcq.cqn);

	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
		if (attr->max_dest_rd_atomic)
			context->params2 |=
				cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
		optpar |= MLX4_QP_OPTPAR_RRA_MAX;
	}

	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
		context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask);
		optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
	}

	if (ibqp->srq)
		context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);

	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
		context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
		optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT;
	}
	if (attr_mask & IB_QP_RQ_PSN)
		context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);

	context->cqn_recv = cpu_to_be32(to_mcq(ibqp->recv_cq)->mcq.cqn);

	if (attr_mask & IB_QP_QKEY) {
		context->qkey = cpu_to_be32(attr->qkey);
		optpar |= MLX4_QP_OPTPAR_Q_KEY;
	}

	if (ibqp->srq)
		context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn);

	if (!ibqp->srq && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
		context->db_rec_addr = cpu_to_be64(qp->db.dma);

	if (cur_state == IB_QPS_INIT &&
	    new_state == IB_QPS_RTR  &&
	    (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
	     ibqp->qp_type == IB_QPT_UD)) {
		context->pri_path.sched_queue = (qp->port - 1) << 6;
		if (is_qp0(dev, qp))
			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
		else
			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
	}

	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD &&
	    attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
		sqd_event = 1;
	else
		sqd_event = 0;

	/*
	 * Before passing a kernel QP to the HW, make sure that the
	 * ownership bits of the send queue are set and the SQ
	 * headroom is stamped so that the hardware doesn't start
	 * processing stale work requests.
	 */
	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
		struct mlx4_wqe_ctrl_seg *ctrl;
		int i;

		for (i = 0; i < qp->sq.wqe_cnt; ++i) {
			ctrl = get_send_wqe(qp, i);
			ctrl->owner_opcode = cpu_to_be32(1 << 31);

			stamp_send_wqe(qp, i);
		}
	}

	err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state),
			     to_mlx4_state(new_state), context, optpar,
			     sqd_event, &qp->mqp);
	if (err)
		goto out;

	qp->state = new_state;

	if (attr_mask & IB_QP_ACCESS_FLAGS)
		qp->atomic_rd_en = attr->qp_access_flags;
	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
		qp->resp_depth = attr->max_dest_rd_atomic;
	if (attr_mask & IB_QP_PORT)
		qp->port = attr->port_num;
	if (attr_mask & IB_QP_ALT_PATH)
		qp->alt_port = attr->alt_port_num;

	if (is_sqp(dev, qp))
		store_sqp_attrs(to_msqp(qp), attr, attr_mask);

	/*
	 * If we moved QP0 to RTR, bring the IB link up; if we moved
	 * QP0 to RESET or ERROR, bring the link back down.
	 */
	if (is_qp0(dev, qp)) {
		if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR)
			if (mlx4_INIT_PORT(dev->dev, qp->port))
				printk(KERN_WARNING "INIT_PORT failed for port %d\n",
				       qp->port);

		if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
		    (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
			mlx4_CLOSE_PORT(dev->dev, qp->port);
	}

	/*
	 * If we moved a kernel QP to RESET, clean up all old CQ
	 * entries and reinitialize the QP.
	 */
	if (new_state == IB_QPS_RESET && !ibqp->uobject) {
		mlx4_ib_cq_clean(to_mcq(ibqp->recv_cq), qp->mqp.qpn,
				 ibqp->srq ? to_msrq(ibqp->srq): NULL);
		if (ibqp->send_cq != ibqp->recv_cq)
			mlx4_ib_cq_clean(to_mcq(ibqp->send_cq), qp->mqp.qpn, NULL);

		qp->rq.head = 0;
		qp->rq.tail = 0;
		qp->sq.head = 0;
		qp->sq.tail = 0;
		if (!ibqp->srq)
			*qp->db.db = 0;
	}

out:
	kfree(context);
	return err;
}

static const struct ib_qp_attr mlx4_ib_qp_attr = { .port_num = 1 };
static const int mlx4_ib_qp_attr_mask_table[IB_QPT_UD + 1] = {
	[IB_QPT_UD]  = (IB_QP_PKEY_INDEX	|
			IB_QP_PORT		|
			IB_QP_QKEY),
	[IB_QPT_UC]  = (IB_QP_PKEY_INDEX	|
			IB_QP_PORT		|
			IB_QP_ACCESS_FLAGS),
	[IB_QPT_RC]  = (IB_QP_PKEY_INDEX	|
			IB_QP_PORT		|
			IB_QP_ACCESS_FLAGS),
	[IB_QPT_SMI] = (IB_QP_PKEY_INDEX	|
			IB_QP_QKEY),
	[IB_QPT_GSI] = (IB_QP_PKEY_INDEX	|
			IB_QP_QKEY),
};

int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
		      int attr_mask, struct ib_udata *udata)
{
	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
	struct mlx4_ib_qp *qp = to_mqp(ibqp);
	enum ib_qp_state cur_state, new_state;
	int err = -EINVAL;

	mutex_lock(&qp->mutex);

	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;

	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask))
		goto out;

	if ((attr_mask & IB_QP_PORT) &&
	    (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) {
		goto out;
	}

	if (attr_mask & IB_QP_PKEY_INDEX) {
		int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
		if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p])
			goto out;
	}

	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
	    attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
		goto out;
	}

	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
	    attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) {
		goto out;
	}

	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
		err = 0;
		goto out;
	}

	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) {
		err = __mlx4_ib_modify_qp(ibqp, &mlx4_ib_qp_attr,
					  mlx4_ib_qp_attr_mask_table[ibqp->qp_type],
					  IB_QPS_RESET, IB_QPS_INIT);
		if (err)
			goto out;
		cur_state = IB_QPS_INIT;
	}

	err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);

out:
	mutex_unlock(&qp->mutex);
	return err;
}

static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
			    void *wqe)
{
	struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev;
	struct mlx4_wqe_mlx_seg *mlx = wqe;
	struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
	struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
	u16 pkey;
	int send_size;
	int header_size;
	int spc;
	int i;

	send_size = 0;
	for (i = 0; i < wr->num_sge; ++i)
		send_size += wr->sg_list[i].length;

	ib_ud_header_init(send_size, mlx4_ib_ah_grh_present(ah), &sqp->ud_header);

	sqp->ud_header.lrh.service_level =
		be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28;
	sqp->ud_header.lrh.destination_lid = ah->av.dlid;
	sqp->ud_header.lrh.source_lid      = cpu_to_be16(ah->av.g_slid & 0x7f);
	if (mlx4_ib_ah_grh_present(ah)) {
		sqp->ud_header.grh.traffic_class =
			(be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20) & 0xff;
		sqp->ud_header.grh.flow_label    =
			ah->av.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
		sqp->ud_header.grh.hop_limit     = ah->av.hop_limit;
		ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.port_pd) >> 24,
				  ah->av.gid_index, &sqp->ud_header.grh.source_gid);
		memcpy(sqp->ud_header.grh.destination_gid.raw,
		       ah->av.dgid, 16);
	}

	mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
	mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
				  (sqp->ud_header.lrh.destination_lid ==
				   IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
				  (sqp->ud_header.lrh.service_level << 8));
	mlx->rlid   = sqp->ud_header.lrh.destination_lid;

	switch (wr->opcode) {
	case IB_WR_SEND:
		sqp->ud_header.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY;
		sqp->ud_header.immediate_present = 0;
		break;
	case IB_WR_SEND_WITH_IMM:
		sqp->ud_header.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
		sqp->ud_header.immediate_present = 1;
		sqp->ud_header.immediate_data    = wr->imm_data;
		break;
	default:
		return -EINVAL;
	}

	sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0;
	if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
		sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
	sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
	if (!sqp->qp.ibqp.qp_num)
		ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
	else
		ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);
	sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
	sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
	sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
					       sqp->qkey : wr->wr.ud.remote_qkey);
	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);

	header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);

	if (0) {
		printk(KERN_ERR "built UD header of size %d:\n", header_size);
		for (i = 0; i < header_size / 4; ++i) {
			if (i % 8 == 0)
				printk("  [%02x] ", i * 4);
			printk(" %08x",
			       be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
			if ((i + 1) % 8 == 0)
				printk("\n");
		}
		printk("\n");
	}

	/*
	 * Inline data segments may not cross a 64 byte boundary.  If
	 * our UD header is bigger than the space available up to the
	 * next 64 byte boundary in the WQE, use two inline data
	 * segments to hold the UD header.
	 */
	spc = MLX4_INLINE_ALIGN -
		((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
	if (header_size <= spc) {
		inl->byte_count = cpu_to_be32(1 << 31 | header_size);
		memcpy(inl + 1, sqp->header_buf, header_size);
		i = 1;
	} else {
		inl->byte_count = cpu_to_be32(1 << 31 | spc);
		memcpy(inl + 1, sqp->header_buf, spc);

		inl = (void *) (inl + 1) + spc;
		memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
		/*
		 * Need a barrier here to make sure all the data is
		 * visible before the byte_count field is set.
		 * Otherwise the HCA prefetcher could grab the 64-byte
		 * chunk with this inline segment and get a valid (!=
		 * 0xffffffff) byte count but stale data, and end up
		 * generating a packet with bad headers.
		 *
		 * The first inline segment's byte_count field doesn't
		 * need a barrier, because it comes after a
		 * control/MLX segment and therefore is at an offset
		 * of 16 mod 64.
		 */
		wmb();
		inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
		i = 2;
	}

	return ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
}

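/*
 * Check whether it is safe to post nreq more work requests: first a
 * cheap check against a possibly stale consumer index, and only if
 * that looks full, a re-check under the CQ lock, which serializes
 * against completion processing advancing wq->tail.
 */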
static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
{
	unsigned cur;
	struct mlx4_ib_cq *cq;

	cur = wq->head - wq->tail;
	if (likely(cur + nreq < wq->max_post))
		return 0;

	cq = to_mcq(ib_cq);
	spin_lock(&cq->lock);
	cur = wq->head - wq->tail;
	spin_unlock(&cq->lock);

	return cur + nreq >= wq->max_post;
}

static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
					  u64 remote_addr, u32 rkey)
{
	rseg->raddr    = cpu_to_be64(remote_addr);
	rseg->rkey     = cpu_to_be32(rkey);
	rseg->reserved = 0;
}

static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *wr)
{
	if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
		aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
		aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add);
	} else {
		aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
		aseg->compare  = 0;
	}
}

static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
			     struct ib_send_wr *wr)
{
	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
	dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
	dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
}

static void set_mlx_icrc_seg(void *dseg)
{
	u32 *t = dseg;
	struct mlx4_wqe_inline_seg *iseg = dseg;

	t[1] = 0;

	/*
	 * Need a barrier here before writing the byte_count field to
	 * make sure that all the data is visible before the
	 * byte_count field is set.  Otherwise, if the segment begins
	 * a new cacheline, the HCA prefetcher could grab the 64-byte
	 * chunk and get a valid (!= 0xffffffff) byte count but
	 * stale data, and end up sending the wrong data.
	 */
	wmb();

	iseg->byte_count = cpu_to_be32((1 << 31) | 4);
}

static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
{
	dseg->lkey = cpu_to_be32(sg->lkey);
	dseg->addr = cpu_to_be64(sg->addr);

	/*
	 * Need a barrier here before writing the byte_count field to
	 * make sure that all the data is visible before the
	 * byte_count field is set.  Otherwise, if the segment begins
	 * a new cacheline, the HCA prefetcher could grab the 64-byte
	 * chunk and get a valid (!= 0xffffffff) byte count but
	 * stale data, and end up sending the wrong data.
	 */
	wmb();

	dseg->byte_count = cpu_to_be32(sg->length);
}

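/*
 * Receive-side variant of set_data_seg(), used by mlx4_ib_post_recv():
 * the receive path does not rely on byte_count as a WQE validity
 * marker, so no barrier is needed before writing it.
 */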
static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
{
	dseg->byte_count = cpu_to_be32(sg->length);
	dseg->lkey       = cpu_to_be32(sg->lkey);
	dseg->addr       = cpu_to_be64(sg->addr);
}

int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
		      struct ib_send_wr **bad_wr)
{
	struct mlx4_ib_qp *qp = to_mqp(ibqp);
	void *wqe;
	struct mlx4_wqe_ctrl_seg *ctrl;
	struct mlx4_wqe_data_seg *dseg;
	unsigned long flags;
	int nreq;
	int err = 0;
	int ind;
	int size;
	int i;

	spin_lock_irqsave(&qp->rq.lock, flags);

	ind = qp->sq.head;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
			err = -ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (unlikely(wr->num_sge > qp->sq.max_gs)) {
			err = -EINVAL;
			*bad_wr = wr;
			goto out;
		}

		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

		ctrl->srcrb_flags =
			(wr->send_flags & IB_SEND_SIGNALED ?
			 cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
			(wr->send_flags & IB_SEND_SOLICITED ?
			 cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
			qp->sq_signal_bits;

		if (wr->opcode == IB_WR_SEND_WITH_IMM ||
		    wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
			ctrl->imm = wr->imm_data;
		else
			ctrl->imm = 0;

		wqe += sizeof *ctrl;
		size = sizeof *ctrl / 16;

		switch (ibqp->qp_type) {
		case IB_QPT_RC:
		case IB_QPT_UC:
			switch (wr->opcode) {
			case IB_WR_ATOMIC_CMP_AND_SWP:
			case IB_WR_ATOMIC_FETCH_AND_ADD:
				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
					      wr->wr.atomic.rkey);
				wqe += sizeof (struct mlx4_wqe_raddr_seg);

				set_atomic_seg(wqe, wr);
				wqe += sizeof (struct mlx4_wqe_atomic_seg);

				size += (sizeof (struct mlx4_wqe_raddr_seg) +
					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;

				break;

			case IB_WR_RDMA_READ:
			case IB_WR_RDMA_WRITE:
			case IB_WR_RDMA_WRITE_WITH_IMM:
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				wqe += sizeof (struct mlx4_wqe_raddr_seg);
				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
				break;

			default:
				/* No extra segments required for sends */
				break;
			}
			break;

		case IB_QPT_UD:
			set_datagram_seg(wqe, wr);
			wqe += sizeof (struct mlx4_wqe_datagram_seg);
			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
			break;

		case IB_QPT_SMI:
		case IB_QPT_GSI:
			err = build_mlx_header(to_msqp(qp), wr, ctrl);
			if (err < 0) {
				*bad_wr = wr;
				goto out;
			}
			wqe += err;
			size += err / 16;

			err = 0;
			break;

		default:
			break;
		}

		/*
		 * Write data segments in reverse order, so as to
		 * overwrite cacheline stamp last within each
		 * cacheline.  This avoids issues with WQE
		 * prefetching.
		 */

		dseg = wqe;
		dseg += wr->num_sge - 1;
		size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);

		/* Add one more inline data segment for ICRC for MLX sends */
		if (unlikely(qp->ibqp.qp_type == IB_QPT_SMI ||
			     qp->ibqp.qp_type == IB_QPT_GSI)) {
			set_mlx_icrc_seg(dseg + 1);
			size += sizeof (struct mlx4_wqe_data_seg) / 16;
		}

		for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
			set_data_seg(dseg, wr->sg_list + i);

		ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
				    MLX4_WQE_CTRL_FENCE : 0) | size;

		/*
		 * Make sure descriptor is fully written before
		 * setting ownership bit (because HW can start
		 * executing as soon as we do).
		 */
		wmb();

		if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
			err = -EINVAL;
			*bad_wr = wr;
			goto out;
		}

		ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
			(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);

		/*
		 * We can improve latency by not stamping the last
		 * send queue WQE until after ringing the doorbell, so
		 * only stamp here if there are still more WQEs to post.
		 */
		if (wr->next)
			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
				       (qp->sq.wqe_cnt - 1));

		++ind;
	}

out:
	if (likely(nreq)) {
		qp->sq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		writel(qp->doorbell_qpn,
		       to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL);

		/*
		 * Make sure doorbells don't leak out of SQ spinlock
		 * and reach the HCA out of order.
		 */
		mmiowb();

		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
			       (qp->sq.wqe_cnt - 1));
	}

	spin_unlock_irqrestore(&qp->rq.lock, flags);

	return err;
}

int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
		      struct ib_recv_wr **bad_wr)
{
	struct mlx4_ib_qp *qp = to_mqp(ibqp);
	struct mlx4_wqe_data_seg *scat;
	unsigned long flags;
	int err = 0;
	int nreq;
	int ind;
	int i;

	spin_lock_irqsave(&qp->rq.lock, flags);

	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.send_cq)) {
			err = -ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
			err = -EINVAL;
			*bad_wr = wr;
			goto out;
		}

		scat = get_recv_wqe(qp, ind);

		for (i = 0; i < wr->num_sge; ++i)
			__set_data_seg(scat + i, wr->sg_list + i);

		if (i < qp->rq.max_gs) {
			scat[i].byte_count = 0;
			scat[i].lkey       = cpu_to_be32(MLX4_INVALID_LKEY);
			scat[i].addr       = 0;
		}

		qp->rq.wrid[ind] = wr->wr_id;

		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
	}

out:
	if (likely(nreq)) {
		qp->rq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		*qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
	}

	spin_unlock_irqrestore(&qp->rq.lock, flags);

	return err;
}

static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state)
{
	switch (mlx4_state) {
	case MLX4_QP_STATE_RST:		return IB_QPS_RESET;
	case MLX4_QP_STATE_INIT:	return IB_QPS_INIT;
	case MLX4_QP_STATE_RTR:		return IB_QPS_RTR;
	case MLX4_QP_STATE_RTS:		return IB_QPS_RTS;
	case MLX4_QP_STATE_SQ_DRAINING:
	case MLX4_QP_STATE_SQD:		return IB_QPS_SQD;
	case MLX4_QP_STATE_SQER:	return IB_QPS_SQE;
	case MLX4_QP_STATE_ERR:		return IB_QPS_ERR;
	default:			return -1;
	}
}

static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state)
{
	switch (mlx4_mig_state) {
	case MLX4_QP_PM_ARMED:		return IB_MIG_ARMED;
	case MLX4_QP_PM_REARM:		return IB_MIG_REARM;
	case MLX4_QP_PM_MIGRATED:	return IB_MIG_MIGRATED;
	default:			return -1;
	}
}

static int to_ib_qp_access_flags(int mlx4_flags)
{
	int ib_flags = 0;

	if (mlx4_flags & MLX4_QP_BIT_RRE)
		ib_flags |= IB_ACCESS_REMOTE_READ;
	if (mlx4_flags & MLX4_QP_BIT_RWE)
		ib_flags |= IB_ACCESS_REMOTE_WRITE;
	if (mlx4_flags & MLX4_QP_BIT_RAE)
		ib_flags |= IB_ACCESS_REMOTE_ATOMIC;

	return ib_flags;
}

static void to_ib_ah_attr(struct mlx4_dev *dev, struct ib_ah_attr *ib_ah_attr,
			  struct mlx4_qp_path *path)
{
	memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
	ib_ah_attr->port_num	  = path->sched_queue & 0x40 ? 2 : 1;

	if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports)
		return;

	ib_ah_attr->dlid	  = be16_to_cpu(path->rlid);
	ib_ah_attr->sl		  = (path->sched_queue >> 2) & 0xf;
	ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f;
	ib_ah_attr->static_rate   = path->static_rate ? path->static_rate - 5 : 0;
	ib_ah_attr->ah_flags	  = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0;
	if (ib_ah_attr->ah_flags) {
		ib_ah_attr->grh.sgid_index = path->mgid_index;
		ib_ah_attr->grh.hop_limit  = path->hop_limit;
		ib_ah_attr->grh.traffic_class =
			(be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff;
		ib_ah_attr->grh.flow_label =
			be32_to_cpu(path->tclass_flowlabel) & 0xfffff;
		memcpy(ib_ah_attr->grh.dgid.raw,
		       path->rgid, sizeof ib_ah_attr->grh.dgid.raw);
	}
}

int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
		     struct ib_qp_init_attr *qp_init_attr)
{
	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
	struct mlx4_ib_qp *qp = to_mqp(ibqp);
	struct mlx4_qp_context context;
	int mlx4_state;
	int err;

	if (qp->state == IB_QPS_RESET) {
		qp_attr->qp_state = IB_QPS_RESET;
		goto done;
	}

	err = mlx4_qp_query(dev->dev, &qp->mqp, &context);
	if (err)
		return -EINVAL;

	mlx4_state = be32_to_cpu(context.flags) >> 28;

	qp_attr->qp_state	     = to_ib_qp_state(mlx4_state);
	qp_attr->path_mtu	     = context.mtu_msgmax >> 5;
	qp_attr->path_mig_state	     =
		to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
	qp_attr->qkey		     = be32_to_cpu(context.qkey);
	qp_attr->rq_psn		     = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
	qp_attr->sq_psn		     = be32_to_cpu(context.next_send_psn) & 0xffffff;
	qp_attr->dest_qp_num	     = be32_to_cpu(context.remote_qpn) & 0xffffff;
	qp_attr->qp_access_flags     =
		to_ib_qp_access_flags(be32_to_cpu(context.params2));

	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
		to_ib_ah_attr(dev->dev, &qp_attr->ah_attr, &context.pri_path);
		to_ib_ah_attr(dev->dev, &qp_attr->alt_ah_attr, &context.alt_path);
		qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
		qp_attr->alt_port_num	= qp_attr->alt_ah_attr.port_num;
	}

	qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
	if (qp_attr->qp_state == IB_QPS_INIT)
		qp_attr->port_num = qp->port;
	else
		qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1;

	/* qp_attr->en_sqd_async_notify is only applicable in modify qp */
	qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;

	qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);

	qp_attr->max_dest_rd_atomic =
		1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
	qp_attr->min_rnr_timer	    =
		(be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
	qp_attr->timeout	    = context.pri_path.ackto >> 3;
	qp_attr->retry_cnt	    = (be32_to_cpu(context.params1) >> 16) & 0x7;
	qp_attr->rnr_retry	    = (be32_to_cpu(context.params1) >> 13) & 0x7;
	qp_attr->alt_timeout	    = context.alt_path.ackto >> 3;

done:
	qp_attr->cur_qp_state	     = qp_attr->qp_state;
	qp_attr->cap.max_recv_wr     = qp->rq.wqe_cnt;
	qp_attr->cap.max_recv_sge    = qp->rq.max_gs;

	if (!ibqp->uobject) {
		qp_attr->cap.max_send_wr  = qp->sq.wqe_cnt;
		qp_attr->cap.max_send_sge = qp->sq.max_gs;
	} else {
		qp_attr->cap.max_send_wr  = 0;
		qp_attr->cap.max_send_sge = 0;
	}

	/*
	 * We don't support inline sends for kernel QPs (yet), and we
	 * don't know what userspace's value should be.
	 */
	qp_attr->cap.max_inline_data = 0;

	qp_init_attr->cap	     = qp_attr->cap;

	return 0;
}