// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */

#include <linux/smp.h>
#include "dr_types.h"

#define QUEUE_SIZE 128
#define SIGNAL_PER_DIV_QUEUE 16
#define TH_NUMS_TO_DRAIN 2

enum {
	CQ_OK = 0,
	CQ_EMPTY = -1,
	CQ_POLL_ERR = -2,
};

struct dr_data_seg {
	u64 addr;
	u32 length;
	u32 lkey;
	unsigned int send_flags;
};

struct postsend_info {
	struct dr_data_seg write;
	struct dr_data_seg read;
	u64 remote_addr;
	u32 rkey;
};

struct dr_qp_rtr_attr {
	struct mlx5dr_cmd_gid_attr dgid_attr;
	enum ib_mtu mtu;
	u32 qp_num;
	u16 port_num;
	u8 min_rnr_timer;
	u8 sgid_index;
	u16 udp_src_port;
	u8 fl:1;
};

struct dr_qp_rts_attr {
	u8 timeout;
	u8 retry_cnt;
	u8 rnr_retry;
};

struct dr_qp_init_attr {
	u32 cqn;
	u32 pdn;
	u32 max_send_wr;
	struct mlx5_uars_page *uar;
	u8 isolate_vl_tc:1;
};

static int dr_parse_cqe(struct mlx5dr_cq *dr_cq, struct mlx5_cqe64 *cqe64)
{
	unsigned int idx;
	u8 opcode;

	opcode = get_cqe_opcode(cqe64);
	if (opcode == MLX5_CQE_REQ_ERR) {
		idx = be16_to_cpu(cqe64->wqe_counter) &
			(dr_cq->qp->sq.wqe_cnt - 1);
		dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;
	} else if (opcode == MLX5_CQE_RESP_ERR) {
		++dr_cq->qp->sq.cc;
	} else {
		idx = be16_to_cpu(cqe64->wqe_counter) &
			(dr_cq->qp->sq.wqe_cnt - 1);
		dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;

		return CQ_OK;
	}

	return CQ_POLL_ERR;
}

static int dr_cq_poll_one(struct mlx5dr_cq *dr_cq)
{
	struct mlx5_cqe64 *cqe64;
	int err;

	cqe64 = mlx5_cqwq_get_cqe(&dr_cq->wq);
	if (!cqe64)
		return CQ_EMPTY;

	mlx5_cqwq_pop(&dr_cq->wq);
	err = dr_parse_cqe(dr_cq, cqe64);
	mlx5_cqwq_update_db_record(&dr_cq->wq);

	return err;
}

static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne)
{
	int npolled;
	int err = 0;

	for (npolled = 0; npolled < ne; ++npolled) {
		err = dr_cq_poll_one(dr_cq);
		if (err != CQ_OK)
			break;
	}

	return err == CQ_POLL_ERR ? err : npolled;
}
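/* Illustrative note (not part of the driver): dr_parse_cqe() maps a CQE back
 * to its SQ slot with a power-of-two mask, which only works because
 * sq.wqe_cnt is rounded up to a power of two at QP creation time below.
 * A minimal sketch of the same arithmetic, assuming wqe_cnt == 8:
 *
 *	u16 wqe_counter = 13;			// producer counter wraps freely
 *	unsigned int idx = wqe_counter & (8 - 1);	// idx == 5, always in range
 *
 * dr_poll_cq() follows the convention used by the callers further down:
 * a negative return value (CQ_POLL_ERR) reports an error, any other value
 * is the number of completions consumed (0 when the CQ is empty).
 */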
static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
					 struct dr_qp_init_attr *attr)
{
	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
	u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {};
	struct mlx5_wq_param wqp;
	struct mlx5dr_qp *dr_qp;
	int inlen;
	void *qpc;
	void *in;
	int err;

	dr_qp = kzalloc(sizeof(*dr_qp), GFP_KERNEL);
	if (!dr_qp)
		return NULL;

	wqp.buf_numa_node = mdev->priv.numa_node;
	wqp.db_numa_node = mdev->priv.numa_node;

	dr_qp->rq.pc = 0;
	dr_qp->rq.cc = 0;
	dr_qp->rq.wqe_cnt = 4;
	dr_qp->sq.pc = 0;
	dr_qp->sq.cc = 0;
	dr_qp->sq.wqe_cnt = roundup_pow_of_two(attr->max_send_wr);

	MLX5_SET(qpc, temp_qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
	MLX5_SET(qpc, temp_qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
	MLX5_SET(qpc, temp_qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
	err = mlx5_wq_qp_create(mdev, &wqp, temp_qpc, &dr_qp->wq,
				&dr_qp->wq_ctrl);
	if (err) {
		mlx5_core_warn(mdev, "Can't create QP WQ\n");
		goto err_wq;
	}

	dr_qp->sq.wqe_head = kcalloc(dr_qp->sq.wqe_cnt,
				     sizeof(dr_qp->sq.wqe_head[0]),
				     GFP_KERNEL);

	if (!dr_qp->sq.wqe_head) {
		mlx5_core_warn(mdev, "Can't allocate wqe head\n");
		goto err_wqe_head;
	}

	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
		dr_qp->wq_ctrl.buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_in;
	}

	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, isolate_vl_tc, attr->isolate_vl_tc);
	MLX5_SET(qpc, qpc, pd, attr->pdn);
	MLX5_SET(qpc, qpc, uar_page, attr->uar->index);
	MLX5_SET(qpc, qpc, log_page_size,
		 dr_qp->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET(qpc, qpc, fre, 1);
	MLX5_SET(qpc, qpc, rlky, 1);
	MLX5_SET(qpc, qpc, cqn_snd, attr->cqn);
	MLX5_SET(qpc, qpc, cqn_rcv, attr->cqn);
	MLX5_SET(qpc, qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
	MLX5_SET(qpc, qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
	MLX5_SET(qpc, qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
	MLX5_SET64(qpc, qpc, dbr_addr, dr_qp->wq_ctrl.db.dma);
	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	mlx5_fill_page_frag_array(&dr_qp->wq_ctrl.buf,
				  (__be64 *)MLX5_ADDR_OF(create_qp_in,
							 in, pas));

	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	dr_qp->qpn = MLX5_GET(create_qp_out, out, qpn);
	kvfree(in);
	if (err)
		goto err_in;
	dr_qp->uar = attr->uar;

	return dr_qp;

err_in:
	kfree(dr_qp->sq.wqe_head);
err_wqe_head:
	mlx5_wq_destroy(&dr_qp->wq_ctrl);
err_wq:
	kfree(dr_qp);
	return NULL;
}

static void dr_destroy_qp(struct mlx5_core_dev *mdev,
			  struct mlx5dr_qp *dr_qp)
{
	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};

	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
	MLX5_SET(destroy_qp_in, in, qpn, dr_qp->qpn);
	mlx5_cmd_exec_in(mdev, destroy_qp, in);

	kfree(dr_qp->sq.wqe_head);
	mlx5_wq_destroy(&dr_qp->wq_ctrl);
	kfree(dr_qp);
}
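/* Illustrative usage sketch (not part of the driver): a caller is expected to
 * fill struct dr_qp_init_attr and pair dr_create_rc_qp() with dr_destroy_qp(),
 * the same way mlx5dr_send_ring_alloc()/mlx5dr_send_ring_free() do later in
 * this file. The cqn/pdn/uar values below are placeholders for resources the
 * caller already owns:
 *
 *	struct dr_qp_init_attr init_attr = {
 *		.cqn = cq->mcq.cqn,
 *		.pdn = pdn,
 *		.uar = uar,
 *		.max_send_wr = QUEUE_SIZE,
 *	};
 *	struct mlx5dr_qp *qp = dr_create_rc_qp(mdev, &init_attr);
 *
 *	if (!qp)
 *		return -ENOMEM;
 *	...
 *	dr_destroy_qp(mdev, qp);
 */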
static void dr_cmd_notify_hw(struct mlx5dr_qp *dr_qp, void *ctrl)
{
	dma_wmb();
	*dr_qp->wq.sq.db = cpu_to_be32(dr_qp->sq.pc & 0xffff);

	/* After wmb() the hw is aware of new work */
	wmb();

	mlx5_write64(ctrl, dr_qp->uar->map + MLX5_BF_OFFSET);
}

static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr,
			     u32 rkey, struct dr_data_seg *data_seg,
			     u32 opcode, bool notify_hw)
{
	struct mlx5_wqe_raddr_seg *wq_raddr;
	struct mlx5_wqe_ctrl_seg *wq_ctrl;
	struct mlx5_wqe_data_seg *wq_dseg;
	unsigned int size;
	unsigned int idx;

	size = sizeof(*wq_ctrl) / 16 + sizeof(*wq_dseg) / 16 +
		sizeof(*wq_raddr) / 16;

	idx = dr_qp->sq.pc & (dr_qp->sq.wqe_cnt - 1);

	wq_ctrl = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx);
	wq_ctrl->imm = 0;
	wq_ctrl->fm_ce_se = (data_seg->send_flags) ?
		MLX5_WQE_CTRL_CQ_UPDATE : 0;
	wq_ctrl->opmod_idx_opcode = cpu_to_be32(((dr_qp->sq.pc & 0xffff) << 8) |
						opcode);
	wq_ctrl->qpn_ds = cpu_to_be32(size | dr_qp->qpn << 8);
	wq_raddr = (void *)(wq_ctrl + 1);
	wq_raddr->raddr = cpu_to_be64(remote_addr);
	wq_raddr->rkey = cpu_to_be32(rkey);
	wq_raddr->reserved = 0;

	wq_dseg = (void *)(wq_raddr + 1);
	wq_dseg->byte_count = cpu_to_be32(data_seg->length);
	wq_dseg->lkey = cpu_to_be32(data_seg->lkey);
	wq_dseg->addr = cpu_to_be64(data_seg->addr);

	dr_qp->sq.wqe_head[idx] = dr_qp->sq.pc++;

	if (notify_hw)
		dr_cmd_notify_hw(dr_qp, wq_ctrl);
}

static void dr_post_send(struct mlx5dr_qp *dr_qp, struct postsend_info *send_info)
{
	dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
			 &send_info->write, MLX5_OPCODE_RDMA_WRITE, false);
	dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
			 &send_info->read, MLX5_OPCODE_RDMA_READ, true);
}

/**
 * mlx5dr_send_fill_and_append_ste_send_info: Add data to be sent
 * with the send_list parameters:
 *
 * @ste:       The STE to which this data is attached
 * @size:      Size of the data to write
 * @offset:    Offset of the data from the start of the hw_ste entry
 * @data:      The data
 * @ste_info:  STE info entry to be appended to the send_list
 * @send_list: The list to append to
 * @copy_data: If true, the data is copied into ste_info, because it is
 *             not backed up anywhere else (e.g. during re-hash).
 *             If false, the data may still be updated after it was
 *             added to the list.
 */
void mlx5dr_send_fill_and_append_ste_send_info(struct mlx5dr_ste *ste, u16 size,
					       u16 offset, u8 *data,
					       struct mlx5dr_ste_send_info *ste_info,
					       struct list_head *send_list,
					       bool copy_data)
{
	ste_info->size = size;
	ste_info->ste = ste;
	ste_info->offset = offset;

	if (copy_data) {
		memcpy(ste_info->data_cont, data, size);
		ste_info->data = ste_info->data_cont;
	} else {
		ste_info->data = data;
	}

	list_add_tail(&ste_info->send_list, send_list);
}
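/* Illustrative usage sketch (not part of the driver): callers typically queue
 * several STE updates on a local list and flush them later with
 * mlx5dr_send_postsend_ste(). The names below (update_list, ste_info, hw_ste)
 * are hypothetical locals used only for illustration:
 *
 *	LIST_HEAD(update_list);
 *	struct mlx5dr_ste_send_info *cur, *tmp;
 *
 *	mlx5dr_send_fill_and_append_ste_send_info(ste, DR_STE_SIZE, 0, hw_ste,
 *						  ste_info, &update_list, false);
 *
 *	list_for_each_entry_safe(cur, tmp, &update_list, send_list) {
 *		list_del(&cur->send_list);
 *		ret = mlx5dr_send_postsend_ste(dmn, cur->ste, cur->data,
 *					       cur->size, cur->offset);
 *		if (ret)
 *			break;
 *	}
 */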
/* The function tries to consume one wc each time. Only when the queue is
 * full, i.e. the hw is a full queue length behind the sw, does the function
 * drain the cq until it is empty.
 */
static int dr_handle_pending_wc(struct mlx5dr_domain *dmn,
				struct mlx5dr_send_ring *send_ring)
{
	bool is_drain = false;
	int ne;

	if (send_ring->pending_wqe < send_ring->signal_th)
		return 0;

	/* Queue is full, start draining it */
	if (send_ring->pending_wqe >=
	    dmn->send_ring->signal_th * TH_NUMS_TO_DRAIN)
		is_drain = true;

	do {
		ne = dr_poll_cq(send_ring->cq, 1);
		if (unlikely(ne < 0)) {
			mlx5_core_warn_once(dmn->mdev, "SMFS QPN 0x%x is disabled/limited",
					    send_ring->qp->qpn);
			send_ring->err_state = true;
			return ne;
		} else if (ne == 1) {
			send_ring->pending_wqe -= send_ring->signal_th;
		}
	} while (is_drain && send_ring->pending_wqe);

	return 0;
}

static void dr_fill_data_segs(struct mlx5dr_send_ring *send_ring,
			      struct postsend_info *send_info)
{
	send_ring->pending_wqe++;

	if (send_ring->pending_wqe % send_ring->signal_th == 0)
		send_info->write.send_flags |= IB_SEND_SIGNALED;

	send_ring->pending_wqe++;
	send_info->read.length = send_info->write.length;
	/* Read into the same write area */
	send_info->read.addr = (uintptr_t)send_info->write.addr;
	send_info->read.lkey = send_ring->mr->mkey;

	if (send_ring->pending_wqe % send_ring->signal_th == 0)
		send_info->read.send_flags = IB_SEND_SIGNALED;
	else
		send_info->read.send_flags = 0;
}

static int dr_postsend_icm_data(struct mlx5dr_domain *dmn,
				struct postsend_info *send_info)
{
	struct mlx5dr_send_ring *send_ring = dmn->send_ring;
	u32 buff_offset;
	int ret;

	if (unlikely(dmn->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR ||
		     send_ring->err_state)) {
		mlx5_core_dbg_once(dmn->mdev,
				   "Skipping post send: QP err state: %d, device state: %d\n",
				   send_ring->err_state, dmn->mdev->state);
		return 0;
	}

	spin_lock(&send_ring->lock);

	ret = dr_handle_pending_wc(dmn, send_ring);
	if (ret)
		goto out_unlock;

	if (send_info->write.length > dmn->info.max_inline_size) {
		buff_offset = (send_ring->tx_head &
			       (dmn->send_ring->signal_th - 1)) *
			      send_ring->max_post_send_size;
		/* Copy to ring mr */
		memcpy(send_ring->buf + buff_offset,
		       (void *)(uintptr_t)send_info->write.addr,
		       send_info->write.length);
		send_info->write.addr = (uintptr_t)send_ring->mr->dma_addr + buff_offset;
		send_info->write.lkey = send_ring->mr->mkey;
	}

	send_ring->tx_head++;
	dr_fill_data_segs(send_ring, send_info);
	dr_post_send(send_ring->qp, send_info);

out_unlock:
	spin_unlock(&send_ring->lock);
	return ret;
}

static int dr_get_tbl_copy_details(struct mlx5dr_domain *dmn,
				   struct mlx5dr_ste_htbl *htbl,
				   u8 **data,
				   u32 *byte_size,
				   int *iterations,
				   int *num_stes)
{
	int alloc_size;

	if (htbl->chunk->byte_size > dmn->send_ring->max_post_send_size) {
		*iterations = htbl->chunk->byte_size /
			dmn->send_ring->max_post_send_size;
		*byte_size = dmn->send_ring->max_post_send_size;
		alloc_size = *byte_size;
		*num_stes = *byte_size / DR_STE_SIZE;
	} else {
		*iterations = 1;
		*num_stes = htbl->chunk->num_of_entries;
		alloc_size = *num_stes * DR_STE_SIZE;
	}

	*data = kvzalloc(alloc_size, GFP_KERNEL);
	if (!*data)
		return -ENOMEM;

	return 0;
}
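/* Worked example (illustrative, based on the defaults in this file): with
 * QUEUE_SIZE 128 and SIGNAL_PER_DIV_QUEUE 16, signal_th is 128 / 16 = 8.
 * Each dr_postsend_icm_data() call adds two WQEs (RDMA write + read), and
 * dr_fill_data_segs() requests a completion only on every 8th WQE, so one
 * polled CQE accounts for signal_th WQEs (hence pending_wqe -= signal_th in
 * dr_handle_pending_wc()). Draining kicks in once pending_wqe reaches
 * signal_th * TH_NUMS_TO_DRAIN, i.e. 16 outstanding WQEs with these defaults.
 */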
/**
 * mlx5dr_send_postsend_ste: write size bytes at offset into the HW ICM.
 *
 * @dmn:    Domain
 * @ste:    The ste struct that contains the data (at
 *          least part of it)
 * @data:   The data to send
 * @size:   Number of bytes to write
 * @offset: The offset from the ICM-mapped data at which to
 *          start writing; used for writing only part of the
 *          buffer.
 *
 * Return: 0 on success.
 */
int mlx5dr_send_postsend_ste(struct mlx5dr_domain *dmn, struct mlx5dr_ste *ste,
			     u8 *data, u16 size, u16 offset)
{
	struct postsend_info send_info = {};

	mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, data, size);

	send_info.write.addr = (uintptr_t)data;
	send_info.write.length = size;
	send_info.write.lkey = 0;
	send_info.remote_addr = mlx5dr_ste_get_mr_addr(ste) + offset;
	send_info.rkey = mlx5dr_icm_pool_get_chunk_rkey(ste->htbl->chunk);

	return dr_postsend_icm_data(dmn, &send_info);
}

int mlx5dr_send_postsend_htbl(struct mlx5dr_domain *dmn,
			      struct mlx5dr_ste_htbl *htbl,
			      u8 *formatted_ste, u8 *mask)
{
	u32 byte_size = htbl->chunk->byte_size;
	int num_stes_per_iter;
	int iterations;
	u8 *data;
	int ret;
	int i;
	int j;

	ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
				      &iterations, &num_stes_per_iter);
	if (ret)
		return ret;

	mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, formatted_ste, DR_STE_SIZE);

	/* Send the data 'iterations' times */
	for (i = 0; i < iterations; i++) {
		u32 ste_index = i * (byte_size / DR_STE_SIZE);
		struct postsend_info send_info = {};

		/* Copy all STEs into the data buffer;
		 * the bit_mask needs to be added as well
		 */
		for (j = 0; j < num_stes_per_iter; j++) {
			struct mlx5dr_ste *ste = &htbl->ste_arr[ste_index + j];
			u32 ste_off = j * DR_STE_SIZE;

			if (mlx5dr_ste_is_not_used(ste)) {
				memcpy(data + ste_off,
				       formatted_ste, DR_STE_SIZE);
			} else {
				/* Copy data */
				memcpy(data + ste_off,
				       htbl->ste_arr[ste_index + j].hw_ste,
				       DR_STE_SIZE_REDUCED);
				/* Copy bit_mask */
				memcpy(data + ste_off + DR_STE_SIZE_REDUCED,
				       mask, DR_STE_SIZE_MASK);
				/* Only when we have a mask do we need to re-arrange the STE */
				mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx,
								data + (j * DR_STE_SIZE),
								DR_STE_SIZE);
			}
		}

		send_info.write.addr = (uintptr_t)data;
		send_info.write.length = byte_size;
		send_info.write.lkey = 0;
		send_info.remote_addr =
			mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
		send_info.rkey = mlx5dr_icm_pool_get_chunk_rkey(htbl->chunk);

		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			goto out_free;
	}

out_free:
	kvfree(data);
	return ret;
}
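/* Layout sketch (illustrative): for a used STE, the per-entry copy above
 * produces DR_STE_SIZE bytes laid out as
 *
 *	[0 .. DR_STE_SIZE_REDUCED - 1]            the reduced hw_ste
 *	[DR_STE_SIZE_REDUCED .. DR_STE_SIZE - 1]  the shared bit_mask
 *
 * assuming DR_STE_SIZE_REDUCED + DR_STE_SIZE_MASK add up to DR_STE_SIZE.
 * Unused entries are simply filled with the pre-formatted default STE.
 */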
/* Initialize htbl with default STEs */
int mlx5dr_send_postsend_formatted_htbl(struct mlx5dr_domain *dmn,
					struct mlx5dr_ste_htbl *htbl,
					u8 *ste_init_data,
					bool update_hw_ste)
{
	u32 byte_size = htbl->chunk->byte_size;
	int iterations;
	int num_stes;
	u8 *copy_dst;
	u8 *data;
	int ret;
	int i;

	ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
				      &iterations, &num_stes);
	if (ret)
		return ret;

	if (update_hw_ste) {
		/* Copy the reduced STE to hash table ste_arr */
		for (i = 0; i < num_stes; i++) {
			copy_dst = htbl->hw_ste_arr + i * DR_STE_SIZE_REDUCED;
			memcpy(copy_dst, ste_init_data, DR_STE_SIZE_REDUCED);
		}
	}

	mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, ste_init_data, DR_STE_SIZE);

	/* Copy the same STE on the data buffer */
	for (i = 0; i < num_stes; i++) {
		copy_dst = data + i * DR_STE_SIZE;
		memcpy(copy_dst, ste_init_data, DR_STE_SIZE);
	}

	/* Send the data 'iterations' times */
	for (i = 0; i < iterations; i++) {
		u32 ste_index = i * (byte_size / DR_STE_SIZE);
		struct postsend_info send_info = {};

		send_info.write.addr = (uintptr_t)data;
		send_info.write.length = byte_size;
		send_info.write.lkey = 0;
		send_info.remote_addr =
			mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
		send_info.rkey = mlx5dr_icm_pool_get_chunk_rkey(htbl->chunk);

		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			goto out_free;
	}

out_free:
	kvfree(data);
	return ret;
}

int mlx5dr_send_postsend_action(struct mlx5dr_domain *dmn,
				struct mlx5dr_action *action)
{
	struct postsend_info send_info = {};
	int ret;

	send_info.write.addr = (uintptr_t)action->rewrite->data;
	send_info.write.length = action->rewrite->num_of_actions *
				 DR_MODIFY_ACTION_SIZE;
	send_info.write.lkey = 0;
	send_info.remote_addr =
		mlx5dr_icm_pool_get_chunk_mr_addr(action->rewrite->chunk);
	send_info.rkey = mlx5dr_icm_pool_get_chunk_rkey(action->rewrite->chunk);

	ret = dr_postsend_icm_data(dmn, &send_info);

	return ret;
}

static int dr_modify_qp_rst2init(struct mlx5_core_dev *mdev,
				 struct mlx5dr_qp *dr_qp,
				 int port)
{
	u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
	void *qpc;

	qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc);

	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, port);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
	MLX5_SET(qpc, qpc, rre, 1);
	MLX5_SET(qpc, qpc, rwe, 1);

	MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP);
	MLX5_SET(rst2init_qp_in, in, qpn, dr_qp->qpn);

	return mlx5_cmd_exec_in(mdev, rst2init_qp, in);
}

static int dr_cmd_modify_qp_rtr2rts(struct mlx5_core_dev *mdev,
				    struct mlx5dr_qp *dr_qp,
				    struct dr_qp_rts_attr *attr)
{
	u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
	void *qpc;

	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, in, qpc);

	MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn);

	MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt);
	MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry);
	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */

	MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
	MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn);

	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, in);
}
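/* Worked example for the "~1ms" note above (illustrative): the ack_timeout
 * field is an exponent, and the local ACK timeout is conventionally
 * 4.096 usec * 2^ack_timeout. With the value 0x8 used here that gives
 * 4.096 usec * 256 ~= 1.05 msec, hence the comment.
 */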
static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev,
				     struct mlx5dr_qp *dr_qp,
				     struct dr_qp_rtr_attr *attr)
{
	u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
	void *qpc;

	qpc = MLX5_ADDR_OF(init2rtr_qp_in, in, qpc);

	MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn);

	MLX5_SET(qpc, qpc, mtu, attr->mtu);
	MLX5_SET(qpc, qpc, log_msg_max, DR_CHUNK_SIZE_MAX - 1);
	MLX5_SET(qpc, qpc, remote_qpn, attr->qp_num);
	memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32),
	       attr->dgid_attr.mac, sizeof(attr->dgid_attr.mac));
	memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip),
	       attr->dgid_attr.gid, sizeof(attr->dgid_attr.gid));
	MLX5_SET(qpc, qpc, primary_address_path.src_addr_index,
		 attr->sgid_index);

	if (attr->dgid_attr.roce_ver == MLX5_ROCE_VERSION_2)
		MLX5_SET(qpc, qpc, primary_address_path.udp_sport,
			 attr->udp_src_port);

	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->port_num);
	MLX5_SET(qpc, qpc, primary_address_path.fl, attr->fl);
	MLX5_SET(qpc, qpc, min_rnr_nak, 1);

	MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
	MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn);

	return mlx5_cmd_exec_in(mdev, init2rtr_qp, in);
}

static bool dr_send_allow_fl(struct mlx5dr_cmd_caps *caps)
{
	/* Check whether RC RoCE QP creation with force loopback is allowed.
	 * There are two separate capability bits for this:
	 *  - force loopback when RoCE is enabled
	 *  - force loopback when RoCE is disabled
	 */
	return ((caps->roce_caps.roce_en &&
		 caps->roce_caps.fl_rc_qp_when_roce_enabled) ||
		(!caps->roce_caps.roce_en &&
		 caps->roce_caps.fl_rc_qp_when_roce_disabled));
}

static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn)
{
	struct mlx5dr_qp *dr_qp = dmn->send_ring->qp;
	struct dr_qp_rts_attr rts_attr = {};
	struct dr_qp_rtr_attr rtr_attr = {};
	enum ib_mtu mtu = IB_MTU_1024;
	u16 gid_index = 0;
	int port = 1;
	int ret;

	/* Init */
	ret = dr_modify_qp_rst2init(dmn->mdev, dr_qp, port);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP rst2init\n");
		return ret;
	}

	/* RTR */
	rtr_attr.mtu = mtu;
	rtr_attr.qp_num = dr_qp->qpn;
	rtr_attr.min_rnr_timer = 12;
	rtr_attr.port_num = port;
	rtr_attr.udp_src_port = dmn->info.caps.roce_min_src_udp;

	/* If QP creation with force loopback is allowed, then there
	 * is no need for GID index when creating the QP.
	 * Otherwise we query GID attributes and use GID index.
	 */
	rtr_attr.fl = dr_send_allow_fl(&dmn->info.caps);
	if (!rtr_attr.fl) {
		ret = mlx5dr_cmd_query_gid(dmn->mdev, port, gid_index,
					   &rtr_attr.dgid_attr);
		if (ret)
			return ret;

		rtr_attr.sgid_index = gid_index;
	}

	ret = dr_cmd_modify_qp_init2rtr(dmn->mdev, dr_qp, &rtr_attr);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP init2rtr\n");
		return ret;
	}

	/* RTS */
	rts_attr.timeout = 14;
	rts_attr.retry_cnt = 7;
	rts_attr.rnr_retry = 7;

	ret = dr_cmd_modify_qp_rtr2rts(dmn->mdev, dr_qp, &rts_attr);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP rtr2rts\n");
		return ret;
	}

	return 0;
}
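/* Illustrative note (not part of the driver): dr_prepare_qp_to_rts() walks
 * the standard RC QP state machine for a QP that talks to itself:
 *
 *	RESET --RST2INIT_QP--> INIT --INIT2RTR_QP--> RTR --RTR2RTS_QP--> RTS
 *
 * remote_qpn is set to the QP's own number, so the send ring both posts and
 * completes its own RDMA traffic; with force loopback (rtr_attr.fl) no GID
 * lookup is needed, otherwise GID index 0 of port 1 is queried and used.
 */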
static void dr_cq_complete(struct mlx5_core_cq *mcq,
			   struct mlx5_eqe *eqe)
{
	pr_err("CQ completion CQ: #%u\n", mcq->cqn);
}

static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev,
				      struct mlx5_uars_page *uar,
				      size_t ncqe)
{
	u32 temp_cqc[MLX5_ST_SZ_DW(cqc)] = {};
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_wq_param wqp;
	struct mlx5_cqe64 *cqe;
	struct mlx5dr_cq *cq;
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;
	u32 i;

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		return NULL;

	ncqe = roundup_pow_of_two(ncqe);
	MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(ncqe));

	wqp.buf_numa_node = mdev->priv.numa_node;
	wqp.db_numa_node = mdev->priv.numa_node;

	err = mlx5_cqwq_create(mdev, &wqp, temp_cqc, &cq->wq,
			       &cq->wq_ctrl);
	if (err)
		goto out;

	for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
		cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
		cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK;
	}

	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		sizeof(u64) * cq->wq_ctrl.buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		goto err_cqwq;

	vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev);
	err = mlx5_vector2eqn(mdev, vector, &eqn);
	if (err) {
		kvfree(in);
		goto err_cqwq;
	}

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma);

	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas);

	cq->mcq.comp = dr_cq_complete;

	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	kvfree(in);

	if (err)
		goto err_cqwq;

	cq->mcq.cqe_sz = 64;
	cq->mcq.set_ci_db = cq->wq_ctrl.db.db;
	cq->mcq.arm_db = cq->wq_ctrl.db.db + 1;
	*cq->mcq.set_ci_db = 0;

	/* Set a non-zero value in order to prevent the HW from running
	 * db-recovery on a CQ that is used in polling mode.
	 */
	*cq->mcq.arm_db = cpu_to_be32(2 << 28);

	cq->mcq.vector = 0;
	cq->mcq.uar = uar;

	return cq;

err_cqwq:
	mlx5_wq_destroy(&cq->wq_ctrl);
out:
	kfree(cq);
	return NULL;
}

static void dr_destroy_cq(struct mlx5_core_dev *mdev, struct mlx5dr_cq *cq)
{
	mlx5_core_destroy_cq(mdev, &cq->mcq);
	mlx5_wq_destroy(&cq->wq_ctrl);
	kfree(cq);
}

static int dr_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, u32 *mkey)
{
	u32 in[MLX5_ST_SZ_DW(create_mkey_in)] = {};
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, a, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, lr, 1);

	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, length64, 1);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	return mlx5_core_create_mkey(mdev, mkey, in, sizeof(in));
}

static struct mlx5dr_mr *dr_reg_mr(struct mlx5_core_dev *mdev,
				   u32 pdn, void *buf, size_t size)
{
	struct mlx5dr_mr *mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	struct device *dma_device;
	dma_addr_t dma_addr;
	int err;

	if (!mr)
		return NULL;

	dma_device = mlx5_core_dma_dev(mdev);
	dma_addr = dma_map_single(dma_device, buf, size,
				  DMA_BIDIRECTIONAL);
	err = dma_mapping_error(dma_device, dma_addr);
	if (err) {
		mlx5_core_warn(mdev, "Can't dma buf\n");
		kfree(mr);
		return NULL;
	}

	err = dr_create_mkey(mdev, pdn, &mr->mkey);
	if (err) {
		mlx5_core_warn(mdev, "Can't create mkey\n");
		dma_unmap_single(dma_device, dma_addr, size,
				 DMA_BIDIRECTIONAL);
		kfree(mr);
		return NULL;
	}

	mr->dma_addr = dma_addr;
	mr->size = size;
	mr->addr = buf;

	return mr;
}

static void dr_dereg_mr(struct mlx5_core_dev *mdev, struct mlx5dr_mr *mr)
{
	mlx5_core_destroy_mkey(mdev, mr->mkey);
	dma_unmap_single(mlx5_core_dma_dev(mdev), mr->dma_addr, mr->size,
			 DMA_BIDIRECTIONAL);
	kfree(mr);
}
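/* Illustrative note (not part of the driver): the mkey created above uses
 * MLX5_MKC_ACCESS_MODE_PA with length64 set, so it is assumed to cover the
 * whole address space and no per-buffer registration is needed. This is why
 * dr_postsend_icm_data() can address the DMA-mapped ring buffer directly:
 *
 *	send_info->write.addr = (uintptr_t)send_ring->mr->dma_addr + buff_offset;
 *	send_info->write.lkey = send_ring->mr->mkey;
 */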
int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn)
{
	struct dr_qp_init_attr init_attr = {};
	int cq_size;
	int size;
	int ret;

	dmn->send_ring = kzalloc(sizeof(*dmn->send_ring), GFP_KERNEL);
	if (!dmn->send_ring)
		return -ENOMEM;

	cq_size = QUEUE_SIZE + 1;
	dmn->send_ring->cq = dr_create_cq(dmn->mdev, dmn->uar, cq_size);
	if (!dmn->send_ring->cq) {
		mlx5dr_err(dmn, "Failed creating CQ\n");
		ret = -ENOMEM;
		goto free_send_ring;
	}

	init_attr.cqn = dmn->send_ring->cq->mcq.cqn;
	init_attr.pdn = dmn->pdn;
	init_attr.uar = dmn->uar;
	init_attr.max_send_wr = QUEUE_SIZE;

	/* Isolated VL is applicable only if force loopback is supported */
	if (dr_send_allow_fl(&dmn->info.caps))
		init_attr.isolate_vl_tc = dmn->info.caps.isolate_vl_tc;

	spin_lock_init(&dmn->send_ring->lock);

	dmn->send_ring->qp = dr_create_rc_qp(dmn->mdev, &init_attr);
	if (!dmn->send_ring->qp) {
		mlx5dr_err(dmn, "Failed creating QP\n");
		ret = -ENOMEM;
		goto clean_cq;
	}

	dmn->send_ring->cq->qp = dmn->send_ring->qp;

	dmn->info.max_send_wr = QUEUE_SIZE;
	dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data,
					DR_STE_SIZE);

	dmn->send_ring->signal_th = dmn->info.max_send_wr /
		SIGNAL_PER_DIV_QUEUE;

	/* Prepare qp to be used */
	ret = dr_prepare_qp_to_rts(dmn);
	if (ret)
		goto clean_qp;

	dmn->send_ring->max_post_send_size =
		mlx5dr_icm_pool_chunk_size_to_byte(DR_CHUNK_SIZE_1K,
						   DR_ICM_TYPE_STE);

	/* Allocating the max size as a buffer for writing */
	size = dmn->send_ring->signal_th * dmn->send_ring->max_post_send_size;
	dmn->send_ring->buf = kzalloc(size, GFP_KERNEL);
	if (!dmn->send_ring->buf) {
		ret = -ENOMEM;
		goto clean_qp;
	}

	dmn->send_ring->buf_size = size;

	dmn->send_ring->mr = dr_reg_mr(dmn->mdev,
				       dmn->pdn, dmn->send_ring->buf, size);
	if (!dmn->send_ring->mr) {
		ret = -ENOMEM;
		goto free_mem;
	}

	dmn->send_ring->sync_mr = dr_reg_mr(dmn->mdev,
					    dmn->pdn, dmn->send_ring->sync_buff,
					    MIN_READ_SYNC);
	if (!dmn->send_ring->sync_mr) {
		ret = -ENOMEM;
		goto clean_mr;
	}

	return 0;

clean_mr:
	dr_dereg_mr(dmn->mdev, dmn->send_ring->mr);
free_mem:
	kfree(dmn->send_ring->buf);
clean_qp:
	dr_destroy_qp(dmn->mdev, dmn->send_ring->qp);
clean_cq:
	dr_destroy_cq(dmn->mdev, dmn->send_ring->cq);
free_send_ring:
	kfree(dmn->send_ring);

	return ret;
}

void mlx5dr_send_ring_free(struct mlx5dr_domain *dmn,
			   struct mlx5dr_send_ring *send_ring)
{
	dr_destroy_qp(dmn->mdev, send_ring->qp);
	dr_destroy_cq(dmn->mdev, send_ring->cq);
	dr_dereg_mr(dmn->mdev, send_ring->sync_mr);
	dr_dereg_mr(dmn->mdev, send_ring->mr);
	kfree(send_ring->buf);
	kfree(send_ring);
}
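/* Worked example (illustrative, assuming DR_STE_SIZE is 64 bytes): with the
 * defaults above, signal_th = 128 / 16 = 8 and max_post_send_size is the
 * byte size of a 1K-entry STE chunk, i.e. 1024 * 64 = 64 KB, so the ring
 * buffer allocated for copying non-inline payloads is 8 * 64 KB = 512 KB.
 * Each in-flight (unsignaled) post gets its own max_post_send_size slot,
 * selected by tx_head & (signal_th - 1) in dr_postsend_icm_data().
 */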
int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn)
{
	struct mlx5dr_send_ring *send_ring = dmn->send_ring;
	struct postsend_info send_info = {};
	u8 data[DR_STE_SIZE];
	int num_of_sends_req;
	int ret;
	int i;

	/* Sending this number of requests makes sure the queue will be drained */
	num_of_sends_req = send_ring->signal_th * TH_NUMS_TO_DRAIN / 2;

	/* Send fake requests forcing the last one to be signaled */
	send_info.write.addr = (uintptr_t)data;
	send_info.write.length = DR_STE_SIZE;
	send_info.write.lkey = 0;
	/* Using the sync_mr in order to write/read */
	send_info.remote_addr = (uintptr_t)send_ring->sync_mr->addr;
	send_info.rkey = send_ring->sync_mr->mkey;

	for (i = 0; i < num_of_sends_req; i++) {
		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			return ret;
	}

	spin_lock(&send_ring->lock);
	ret = dr_handle_pending_wc(dmn, send_ring);
	spin_unlock(&send_ring->lock);

	return ret;
}
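/* Worked example (illustrative, with the defaults in this file): signal_th
 * is 8, so num_of_sends_req = 8 * 2 / 2 = 8 fake posts. Each post adds two
 * WQEs, so roughly signal_th * TH_NUMS_TO_DRAIN (16) WQEs pass through the
 * queue; the intent is that enough signaled completions are generated for
 * the final dr_handle_pending_wc() call to drain whatever is still pending.
 */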