// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */

#include <linux/smp.h>
#include "dr_types.h"

#define QUEUE_SIZE 128
#define SIGNAL_PER_DIV_QUEUE 16
#define TH_NUMS_TO_DRAIN 2

enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };

struct dr_data_seg {
	u64 addr;
	u32 length;
	u32 lkey;
	unsigned int send_flags;
};

struct postsend_info {
	struct dr_data_seg write;
	struct dr_data_seg read;
	u64 remote_addr;
	u32 rkey;
};

struct dr_qp_rtr_attr {
	struct mlx5dr_cmd_gid_attr dgid_attr;
	enum ib_mtu mtu;
	u32 qp_num;
	u16 port_num;
	u8 min_rnr_timer;
	u8 sgid_index;
	u16 udp_src_port;
	u8 fl:1;
};

struct dr_qp_rts_attr {
	u8 timeout;
	u8 retry_cnt;
	u8 rnr_retry;
};

struct dr_qp_init_attr {
	u32 cqn;
	u32 pdn;
	u32 max_send_wr;
	struct mlx5_uars_page *uar;
	u8 isolate_vl_tc:1;
};

static int dr_parse_cqe(struct mlx5dr_cq *dr_cq, struct mlx5_cqe64 *cqe64)
{
	unsigned int idx;
	u8 opcode;

	opcode = get_cqe_opcode(cqe64);
	if (opcode == MLX5_CQE_REQ_ERR) {
		idx = be16_to_cpu(cqe64->wqe_counter) &
			(dr_cq->qp->sq.wqe_cnt - 1);
		dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;
	} else if (opcode == MLX5_CQE_RESP_ERR) {
		++dr_cq->qp->sq.cc;
	} else {
		idx = be16_to_cpu(cqe64->wqe_counter) &
			(dr_cq->qp->sq.wqe_cnt - 1);
		dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;

		return CQ_OK;
	}

	return CQ_POLL_ERR;
}

static int dr_cq_poll_one(struct mlx5dr_cq *dr_cq)
{
	struct mlx5_cqe64 *cqe64;
	int err;

	cqe64 = mlx5_cqwq_get_cqe(&dr_cq->wq);
	if (!cqe64)
		return CQ_EMPTY;

	mlx5_cqwq_pop(&dr_cq->wq);
	err = dr_parse_cqe(dr_cq, cqe64);
	mlx5_cqwq_update_db_record(&dr_cq->wq);

	return err;
}

static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne)
{
	int npolled;
	int err = 0;

	for (npolled = 0; npolled < ne; ++npolled) {
		err = dr_cq_poll_one(dr_cq);
		if (err != CQ_OK)
			break;
	}

	return err == CQ_POLL_ERR ? err : npolled;
}

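/* Create the RC QP used by SW steering for posting RDMA operations
 * towards ICM. Only the send side is used, so the RQ is kept minimal
 * while the SQ is sized according to the requested max_send_wr.
 */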
static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
					 struct dr_qp_init_attr *attr)
{
	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
	u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {};
	struct mlx5_wq_param wqp;
	struct mlx5dr_qp *dr_qp;
	int inlen;
	void *qpc;
	void *in;
	int err;

	dr_qp = kzalloc(sizeof(*dr_qp), GFP_KERNEL);
	if (!dr_qp)
		return NULL;

	wqp.buf_numa_node = mdev->priv.numa_node;
	wqp.db_numa_node = mdev->priv.numa_node;

	dr_qp->rq.pc = 0;
	dr_qp->rq.cc = 0;
	dr_qp->rq.wqe_cnt = 4;
	dr_qp->sq.pc = 0;
	dr_qp->sq.cc = 0;
	dr_qp->sq.wqe_cnt = roundup_pow_of_two(attr->max_send_wr);

	MLX5_SET(qpc, temp_qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
	MLX5_SET(qpc, temp_qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
	MLX5_SET(qpc, temp_qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
	err = mlx5_wq_qp_create(mdev, &wqp, temp_qpc, &dr_qp->wq,
				&dr_qp->wq_ctrl);
	if (err) {
		mlx5_core_warn(mdev, "Can't create QP WQ\n");
		goto err_wq;
	}

	dr_qp->sq.wqe_head = kcalloc(dr_qp->sq.wqe_cnt,
				     sizeof(dr_qp->sq.wqe_head[0]),
				     GFP_KERNEL);

	if (!dr_qp->sq.wqe_head) {
		mlx5_core_warn(mdev, "Can't allocate wqe head\n");
		goto err_wqe_head;
	}

	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
		dr_qp->wq_ctrl.buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_in;
	}

	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, isolate_vl_tc, attr->isolate_vl_tc);
	MLX5_SET(qpc, qpc, pd, attr->pdn);
	MLX5_SET(qpc, qpc, uar_page, attr->uar->index);
	MLX5_SET(qpc, qpc, log_page_size,
		 dr_qp->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET(qpc, qpc, fre, 1);
	MLX5_SET(qpc, qpc, rlky, 1);
	MLX5_SET(qpc, qpc, cqn_snd, attr->cqn);
	MLX5_SET(qpc, qpc, cqn_rcv, attr->cqn);
	MLX5_SET(qpc, qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
	MLX5_SET(qpc, qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
	MLX5_SET(qpc, qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
	MLX5_SET64(qpc, qpc, dbr_addr, dr_qp->wq_ctrl.db.dma);
	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	mlx5_fill_page_frag_array(&dr_qp->wq_ctrl.buf,
				  (__be64 *)MLX5_ADDR_OF(create_qp_in,
							 in, pas));

	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	dr_qp->qpn = MLX5_GET(create_qp_out, out, qpn);
	kvfree(in);
	if (err)
		goto err_in;
	dr_qp->uar = attr->uar;

	return dr_qp;

err_in:
	kfree(dr_qp->sq.wqe_head);
err_wqe_head:
	mlx5_wq_destroy(&dr_qp->wq_ctrl);
err_wq:
	kfree(dr_qp);
	return NULL;
}

static void dr_destroy_qp(struct mlx5_core_dev *mdev,
			  struct mlx5dr_qp *dr_qp)
{
	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};

	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
	MLX5_SET(destroy_qp_in, in, qpn, dr_qp->qpn);
	mlx5_cmd_exec_in(mdev, destroy_qp, in);

	kfree(dr_qp->sq.wqe_head);
	mlx5_wq_destroy(&dr_qp->wq_ctrl);
	kfree(dr_qp);
}

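/* Ring the doorbell for new work: update the SQ doorbell record with the
 * current producer counter, then write the WQE control segment to the UAR
 * BlueFlame register so HW starts processing.
 */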
static void dr_cmd_notify_hw(struct mlx5dr_qp *dr_qp, void *ctrl)
{
	dma_wmb();
	*dr_qp->wq.sq.db = cpu_to_be32(dr_qp->sq.pc & 0xffff);

	/* After the wmb() the HW is aware of the new work */
	wmb();

	mlx5_write64(ctrl, dr_qp->uar->map + MLX5_BF_OFFSET);
}

static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr,
			     u32 rkey, struct dr_data_seg *data_seg,
			     u32 opcode, bool notify_hw)
{
	struct mlx5_wqe_raddr_seg *wq_raddr;
	struct mlx5_wqe_ctrl_seg *wq_ctrl;
	struct mlx5_wqe_data_seg *wq_dseg;
	unsigned int size;
	unsigned int idx;

	size = sizeof(*wq_ctrl) / 16 + sizeof(*wq_dseg) / 16 +
		sizeof(*wq_raddr) / 16;

	idx = dr_qp->sq.pc & (dr_qp->sq.wqe_cnt - 1);

	wq_ctrl = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx);
	wq_ctrl->imm = 0;
	wq_ctrl->fm_ce_se = (data_seg->send_flags) ?
		MLX5_WQE_CTRL_CQ_UPDATE : 0;
	wq_ctrl->opmod_idx_opcode = cpu_to_be32(((dr_qp->sq.pc & 0xffff) << 8) |
						opcode);
	wq_ctrl->qpn_ds = cpu_to_be32(size | dr_qp->qpn << 8);
	wq_raddr = (void *)(wq_ctrl + 1);
	wq_raddr->raddr = cpu_to_be64(remote_addr);
	wq_raddr->rkey = cpu_to_be32(rkey);
	wq_raddr->reserved = 0;

	wq_dseg = (void *)(wq_raddr + 1);
	wq_dseg->byte_count = cpu_to_be32(data_seg->length);
	wq_dseg->lkey = cpu_to_be32(data_seg->lkey);
	wq_dseg->addr = cpu_to_be64(data_seg->addr);

	dr_qp->sq.wqe_head[idx] = dr_qp->sq.pc++;

	if (notify_hw)
		dr_cmd_notify_hw(dr_qp, wq_ctrl);
}

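/* Each post issues two WQEs: an RDMA WRITE of the data followed by an
 * RDMA READ from the same remote address back into the ring buffer;
 * HW is notified only once, after the second WQE is built.
 */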
static void dr_post_send(struct mlx5dr_qp *dr_qp, struct postsend_info *send_info)
{
	dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
			 &send_info->write, MLX5_OPCODE_RDMA_WRITE, false);
	dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
			 &send_info->read, MLX5_OPCODE_RDMA_READ, true);
}

/**
 * mlx5dr_send_fill_and_append_ste_send_info: Add data to be sent
 * with the send_list.
 *
 * @ste:       The STE to which this data is attached
 * @size:      Size of the data to write
 * @offset:    Offset of the data from the start of the hw_ste entry
 * @data:      The data to write
 * @ste_info:  STE info container to append to send_list
 * @send_list: The list to append ste_info to
 * @copy_data: If true, the data is copied into ste_info because it is
 *             not backed up anywhere else (e.g. during re-hash).
 *             If false, the data may still be updated after it was
 *             added to the list.
 */
void mlx5dr_send_fill_and_append_ste_send_info(struct mlx5dr_ste *ste, u16 size,
					       u16 offset, u8 *data,
					       struct mlx5dr_ste_send_info *ste_info,
					       struct list_head *send_list,
					       bool copy_data)
{
	ste_info->size = size;
	ste_info->ste = ste;
	ste_info->offset = offset;

	if (copy_data) {
		memcpy(ste_info->data_cont, data, size);
		ste_info->data = ste_info->data_cont;
	} else {
		ste_info->data = data;
	}

	list_add_tail(&ste_info->send_list, send_list);
}

/* The function consumes one work completion at a time, unless the queue
 * is full (i.e. the HW is behind the SW by a full queue length), in which
 * case it drains the CQ until it is empty.
 */
static int dr_handle_pending_wc(struct mlx5dr_domain *dmn,
				struct mlx5dr_send_ring *send_ring)
{
	bool is_drain = false;
	int ne;

	if (send_ring->pending_wqe < send_ring->signal_th)
		return 0;

	/* Queue is full, start draining it */
	if (send_ring->pending_wqe >=
	    dmn->send_ring->signal_th * TH_NUMS_TO_DRAIN)
		is_drain = true;

	do {
		ne = dr_poll_cq(send_ring->cq, 1);
		if (ne < 0)
			return ne;
		else if (ne == 1)
			send_ring->pending_wqe -= send_ring->signal_th;
	} while (is_drain && send_ring->pending_wqe);

	return 0;
}

static void dr_fill_data_segs(struct mlx5dr_send_ring *send_ring,
			      struct postsend_info *send_info)
{
	send_ring->pending_wqe++;

	if (send_ring->pending_wqe % send_ring->signal_th == 0)
		send_info->write.send_flags |= IB_SEND_SIGNALED;

	send_ring->pending_wqe++;
	send_info->read.length = send_info->write.length;
	/* Read into the same write area */
	send_info->read.addr = (uintptr_t)send_info->write.addr;
	send_info->read.lkey = send_ring->mr->mkey.key;

	if (send_ring->pending_wqe % send_ring->signal_th == 0)
		send_info->read.send_flags = IB_SEND_SIGNALED;
	else
		send_info->read.send_flags = 0;
}

static int dr_postsend_icm_data(struct mlx5dr_domain *dmn,
				struct postsend_info *send_info)
{
	struct mlx5dr_send_ring *send_ring = dmn->send_ring;
	u32 buff_offset;
	int ret;

	spin_lock(&send_ring->lock);

	ret = dr_handle_pending_wc(dmn, send_ring);
	if (ret)
		goto out_unlock;

	if (send_info->write.length > dmn->info.max_inline_size) {
		buff_offset = (send_ring->tx_head &
			       (dmn->send_ring->signal_th - 1)) *
			      send_ring->max_post_send_size;
		/* Copy to ring mr */
		memcpy(send_ring->buf + buff_offset,
		       (void *)(uintptr_t)send_info->write.addr,
		       send_info->write.length);
		send_info->write.addr = (uintptr_t)send_ring->mr->dma_addr + buff_offset;
		send_info->write.lkey = send_ring->mr->mkey.key;
	}

	send_ring->tx_head++;
	dr_fill_data_segs(send_ring, send_info);
	dr_post_send(send_ring->qp, send_info);

out_unlock:
	spin_unlock(&send_ring->lock);
	return ret;
}

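/* Compute how a hash table copy is split into post-send iterations:
 * tables larger than max_post_send_size are written in max-sized chunks,
 * smaller tables in a single iteration. Also allocates the staging
 * buffer that holds the STEs to be written.
 */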
static int dr_get_tbl_copy_details(struct mlx5dr_domain *dmn,
				   struct mlx5dr_ste_htbl *htbl,
				   u8 **data,
				   u32 *byte_size,
				   int *iterations,
				   int *num_stes)
{
	int alloc_size;

	if (htbl->chunk->byte_size > dmn->send_ring->max_post_send_size) {
		*iterations = htbl->chunk->byte_size /
			dmn->send_ring->max_post_send_size;
		*byte_size = dmn->send_ring->max_post_send_size;
		alloc_size = *byte_size;
		*num_stes = *byte_size / DR_STE_SIZE;
	} else {
		*iterations = 1;
		*num_stes = htbl->chunk->num_of_entries;
		alloc_size = *num_stes * DR_STE_SIZE;
	}

	*data = kvzalloc(alloc_size, GFP_KERNEL);
	if (!*data)
		return -ENOMEM;

	return 0;
}

/**
 * mlx5dr_send_postsend_ste: write size bytes at offset into the STE's
 * hw ICM area.
 *
 * @dmn:    Domain
 * @ste:    The STE struct that contains the data (at least part of it)
 * @data:   The data to write
 * @size:   Number of bytes to write
 * @offset: Offset from the ICM mapped data at which to start writing;
 *          used to write only part of the buffer
 *
 * Return: 0 on success.
 */
int mlx5dr_send_postsend_ste(struct mlx5dr_domain *dmn, struct mlx5dr_ste *ste,
			     u8 *data, u16 size, u16 offset)
{
	struct postsend_info send_info = {};

	mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, data, size);

	send_info.write.addr = (uintptr_t)data;
	send_info.write.length = size;
	send_info.write.lkey = 0;
	send_info.remote_addr = mlx5dr_ste_get_mr_addr(ste) + offset;
	send_info.rkey = ste->htbl->chunk->rkey;

	return dr_postsend_icm_data(dmn, &send_info);
}

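/* Write a whole STE hash table to ICM: used entries are built from their
 * reduced hw_ste plus the table bit mask, unused entries are written with
 * the default formatted STE.
 */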
int mlx5dr_send_postsend_htbl(struct mlx5dr_domain *dmn,
			      struct mlx5dr_ste_htbl *htbl,
			      u8 *formatted_ste, u8 *mask)
{
	u32 byte_size = htbl->chunk->byte_size;
	int num_stes_per_iter;
	int iterations;
	u8 *data;
	int ret;
	int i;
	int j;

	ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
				      &iterations, &num_stes_per_iter);
	if (ret)
		return ret;

	mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, formatted_ste, DR_STE_SIZE);

	/* Send the data iteration times */
	for (i = 0; i < iterations; i++) {
		u32 ste_index = i * (byte_size / DR_STE_SIZE);
		struct postsend_info send_info = {};

		/* Copy all the STEs into the data buffer; used STEs also
		 * need the bit mask appended.
		 */
		for (j = 0; j < num_stes_per_iter; j++) {
			struct mlx5dr_ste *ste = &htbl->ste_arr[ste_index + j];
			u32 ste_off = j * DR_STE_SIZE;

			if (mlx5dr_ste_is_not_used(ste)) {
				memcpy(data + ste_off,
				       formatted_ste, DR_STE_SIZE);
			} else {
				/* Copy data */
				memcpy(data + ste_off,
				       htbl->ste_arr[ste_index + j].hw_ste,
				       DR_STE_SIZE_REDUCED);
				/* Copy bit_mask */
				memcpy(data + ste_off + DR_STE_SIZE_REDUCED,
				       mask, DR_STE_SIZE_MASK);
				/* Only when there is a mask do we need to re-arrange the STE */
				mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx,
								data + (j * DR_STE_SIZE),
								DR_STE_SIZE);
			}
		}

		send_info.write.addr = (uintptr_t)data;
		send_info.write.length = byte_size;
		send_info.write.lkey = 0;
		send_info.remote_addr =
			mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
		send_info.rkey = htbl->chunk->rkey;

		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			goto out_free;
	}

out_free:
	kvfree(data);
	return ret;
}

/* Initialize htbl with default STEs */
int mlx5dr_send_postsend_formatted_htbl(struct mlx5dr_domain *dmn,
					struct mlx5dr_ste_htbl *htbl,
					u8 *ste_init_data,
					bool update_hw_ste)
{
	u32 byte_size = htbl->chunk->byte_size;
	int iterations;
	int num_stes;
	u8 *copy_dst;
	u8 *data;
	int ret;
	int i;

	ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
				      &iterations, &num_stes);
	if (ret)
		return ret;

	if (update_hw_ste) {
		/* Copy the reduced STE to hash table ste_arr */
		for (i = 0; i < num_stes; i++) {
			copy_dst = htbl->hw_ste_arr + i * DR_STE_SIZE_REDUCED;
			memcpy(copy_dst, ste_init_data, DR_STE_SIZE_REDUCED);
		}
	}

	mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, ste_init_data, DR_STE_SIZE);

	/* Copy the same STE on the data buffer */
	for (i = 0; i < num_stes; i++) {
		copy_dst = data + i * DR_STE_SIZE;
		memcpy(copy_dst, ste_init_data, DR_STE_SIZE);
	}

	/* Send the data iteration times */
	for (i = 0; i < iterations; i++) {
		u32 ste_index = i * (byte_size / DR_STE_SIZE);
		struct postsend_info send_info = {};

		send_info.write.addr = (uintptr_t)data;
		send_info.write.length = byte_size;
		send_info.write.lkey = 0;
		send_info.remote_addr =
			mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
		send_info.rkey = htbl->chunk->rkey;

		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			goto out_free;
	}

out_free:
	kvfree(data);
	return ret;
}

int mlx5dr_send_postsend_action(struct mlx5dr_domain *dmn,
				struct mlx5dr_action *action)
{
	struct postsend_info send_info = {};
	int ret;

	send_info.write.addr = (uintptr_t)action->rewrite->data;
	send_info.write.length = action->rewrite->num_of_actions *
				 DR_MODIFY_ACTION_SIZE;
	send_info.write.lkey = 0;
	send_info.remote_addr = action->rewrite->chunk->mr_addr;
	send_info.rkey = action->rewrite->chunk->rkey;

	ret = dr_postsend_icm_data(dmn, &send_info);

	return ret;
}

static int dr_modify_qp_rst2init(struct mlx5_core_dev *mdev,
				 struct mlx5dr_qp *dr_qp,
				 int port)
{
	u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
	void *qpc;

	qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc);

	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, port);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
	MLX5_SET(qpc, qpc, rre, 1);
	MLX5_SET(qpc, qpc, rwe, 1);

	MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP);
	MLX5_SET(rst2init_qp_in, in, qpn, dr_qp->qpn);

	return mlx5_cmd_exec_in(mdev, rst2init_qp, in);
}

static int dr_cmd_modify_qp_rtr2rts(struct mlx5_core_dev *mdev,
				    struct mlx5dr_qp *dr_qp,
				    struct dr_qp_rts_attr *attr)
{
	u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
	void *qpc;

	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, in, qpc);

	MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn);

	MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt);
	MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry);

	MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
	MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn);

	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, in);
}

static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev,
				     struct mlx5dr_qp *dr_qp,
				     struct dr_qp_rtr_attr *attr)
{
	u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
	void *qpc;

	qpc = MLX5_ADDR_OF(init2rtr_qp_in, in, qpc);

	MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn);

	MLX5_SET(qpc, qpc, mtu, attr->mtu);
	MLX5_SET(qpc, qpc, log_msg_max, DR_CHUNK_SIZE_MAX - 1);
	MLX5_SET(qpc, qpc, remote_qpn, attr->qp_num);
	memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32),
	       attr->dgid_attr.mac, sizeof(attr->dgid_attr.mac));
	memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip),
	       attr->dgid_attr.gid, sizeof(attr->dgid_attr.gid));
	MLX5_SET(qpc, qpc, primary_address_path.src_addr_index,
		 attr->sgid_index);

	if (attr->dgid_attr.roce_ver == MLX5_ROCE_VERSION_2)
		MLX5_SET(qpc, qpc, primary_address_path.udp_sport,
			 attr->udp_src_port);

	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->port_num);
	MLX5_SET(qpc, qpc, primary_address_path.fl, attr->fl);
	MLX5_SET(qpc, qpc, min_rnr_nak, 1);

	MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
	MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn);

	return mlx5_cmd_exec_in(mdev, init2rtr_qp, in);
}

static bool dr_send_allow_fl(struct mlx5dr_cmd_caps *caps)
{
	/* Check whether RC RoCE QP creation with force loopback is allowed.
	 * There are two separate capability bits for this:
	 *  - force loopback when RoCE is enabled
	 *  - force loopback when RoCE is disabled
	 */
	return ((caps->roce_caps.roce_en &&
		 caps->roce_caps.fl_rc_qp_when_roce_enabled) ||
		(!caps->roce_caps.roce_en &&
		 caps->roce_caps.fl_rc_qp_when_roce_disabled));
}

static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn)
{
	struct mlx5dr_qp *dr_qp = dmn->send_ring->qp;
	struct dr_qp_rts_attr rts_attr = {};
	struct dr_qp_rtr_attr rtr_attr = {};
	enum ib_mtu mtu = IB_MTU_1024;
	u16 gid_index = 0;
	int port = 1;
	int ret;

	/* Init */
	ret = dr_modify_qp_rst2init(dmn->mdev, dr_qp, port);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP rst2init\n");
		return ret;
	}

	/* RTR */
	rtr_attr.mtu = mtu;
	rtr_attr.qp_num = dr_qp->qpn;
	rtr_attr.min_rnr_timer = 12;
	rtr_attr.port_num = port;
	rtr_attr.udp_src_port = dmn->info.caps.roce_min_src_udp;

	/* If QP creation with force loopback is allowed, then there
	 * is no need for GID index when creating the QP.
	 * Otherwise we query GID attributes and use GID index.
	 */
	rtr_attr.fl = dr_send_allow_fl(&dmn->info.caps);
	if (!rtr_attr.fl) {
		ret = mlx5dr_cmd_query_gid(dmn->mdev, port, gid_index,
					   &rtr_attr.dgid_attr);
		if (ret)
			return ret;

		rtr_attr.sgid_index = gid_index;
	}

	ret = dr_cmd_modify_qp_init2rtr(dmn->mdev, dr_qp, &rtr_attr);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP init2rtr\n");
		return ret;
	}

	/* RTS */
	rts_attr.timeout = 14;
	rts_attr.retry_cnt = 7;
	rts_attr.rnr_retry = 7;

	ret = dr_cmd_modify_qp_rtr2rts(dmn->mdev, dr_qp, &rts_attr);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP rtr2rts\n");
		return ret;
	}

	return 0;
}

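/* The send ring CQ is used in polling mode, so a completion event here is
 * unexpected and is only logged.
 */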
static void dr_cq_complete(struct mlx5_core_cq *mcq,
			   struct mlx5_eqe *eqe)
{
	pr_err("CQ completion CQ: #%u\n", mcq->cqn);
}

static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev,
				      struct mlx5_uars_page *uar,
				      size_t ncqe)
{
	u32 temp_cqc[MLX5_ST_SZ_DW(cqc)] = {};
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_wq_param wqp;
	struct mlx5_cqe64 *cqe;
	struct mlx5dr_cq *cq;
	int inlen, err, eqn;
	unsigned int irqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;
	u32 i;

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		return NULL;

	ncqe = roundup_pow_of_two(ncqe);
	MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(ncqe));

	wqp.buf_numa_node = mdev->priv.numa_node;
	wqp.db_numa_node = mdev->priv.numa_node;

	err = mlx5_cqwq_create(mdev, &wqp, temp_cqc, &cq->wq,
			       &cq->wq_ctrl);
	if (err)
		goto out;

	for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
		cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
		cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK;
	}

	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		sizeof(u64) * cq->wq_ctrl.buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		goto err_cqwq;

	vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev);
	err = mlx5_vector2eqn(mdev, vector, &eqn, &irqn);
	if (err) {
		kvfree(in);
		goto err_cqwq;
	}

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn, eqn);
	MLX5_SET(cqc, cqc, uar_page, uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma);

	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas);

	cq->mcq.comp = dr_cq_complete;

	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	kvfree(in);

	if (err)
		goto err_cqwq;

	cq->mcq.cqe_sz = 64;
	cq->mcq.set_ci_db = cq->wq_ctrl.db.db;
	cq->mcq.arm_db = cq->wq_ctrl.db.db + 1;
	*cq->mcq.set_ci_db = 0;

	/* Set a non-zero value to prevent HW from running db-recovery on a
	 * CQ that is used in polling mode.
	 */
	*cq->mcq.arm_db = cpu_to_be32(2 << 28);

	cq->mcq.vector = 0;
	cq->mcq.irqn = irqn;
	cq->mcq.uar = uar;

	return cq;

err_cqwq:
	mlx5_wq_destroy(&cq->wq_ctrl);
out:
	kfree(cq);
	return NULL;
}

static void dr_destroy_cq(struct mlx5_core_dev *mdev, struct mlx5dr_cq *cq)
{
	mlx5_core_destroy_cq(mdev, &cq->mcq);
	mlx5_wq_destroy(&cq->wq_ctrl);
	kfree(cq);
}

static int
dr_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, struct mlx5_core_mkey *mkey)
{
	u32 in[MLX5_ST_SZ_DW(create_mkey_in)] = {};
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, a, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, lr, 1);

	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, length64, 1);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	return mlx5_core_create_mkey(mdev, mkey, in, sizeof(in));
}

static struct mlx5dr_mr *dr_reg_mr(struct mlx5_core_dev *mdev,
				   u32 pdn, void *buf, size_t size)
{
	struct mlx5dr_mr *mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	struct device *dma_device;
	dma_addr_t dma_addr;
	int err;

	if (!mr)
		return NULL;

	dma_device = mlx5_core_dma_dev(mdev);
	dma_addr = dma_map_single(dma_device, buf, size,
				  DMA_BIDIRECTIONAL);
	err = dma_mapping_error(dma_device, dma_addr);
	if (err) {
		mlx5_core_warn(mdev, "Can't dma buf\n");
		kfree(mr);
		return NULL;
	}

	err = dr_create_mkey(mdev, pdn, &mr->mkey);
	if (err) {
		mlx5_core_warn(mdev, "Can't create mkey\n");
		dma_unmap_single(dma_device, dma_addr, size,
				 DMA_BIDIRECTIONAL);
		kfree(mr);
		return NULL;
	}

	mr->dma_addr = dma_addr;
	mr->size = size;
	mr->addr = buf;

	return mr;
}

static void dr_dereg_mr(struct mlx5_core_dev *mdev, struct mlx5dr_mr *mr)
{
	mlx5_core_destroy_mkey(mdev, &mr->mkey);
	dma_unmap_single(mlx5_core_dma_dev(mdev), mr->dma_addr, mr->size,
			 DMA_BIDIRECTIONAL);
	kfree(mr);
}

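/* Allocate the per-domain send ring: a CQ, an RC QP brought to RTS, a
 * bounce buffer MR used when the write payload exceeds the max inline
 * size, and a small MR used for read/sync operations.
 */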
int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn)
{
	struct dr_qp_init_attr init_attr = {};
	int cq_size;
	int size;
	int ret;

	dmn->send_ring = kzalloc(sizeof(*dmn->send_ring), GFP_KERNEL);
	if (!dmn->send_ring)
		return -ENOMEM;

	cq_size = QUEUE_SIZE + 1;
	dmn->send_ring->cq = dr_create_cq(dmn->mdev, dmn->uar, cq_size);
	if (!dmn->send_ring->cq) {
		mlx5dr_err(dmn, "Failed creating CQ\n");
		ret = -ENOMEM;
		goto free_send_ring;
	}

	init_attr.cqn = dmn->send_ring->cq->mcq.cqn;
	init_attr.pdn = dmn->pdn;
	init_attr.uar = dmn->uar;
	init_attr.max_send_wr = QUEUE_SIZE;

	/* Isolated VL is applicable only if force loopback is supported */
	if (dr_send_allow_fl(&dmn->info.caps))
		init_attr.isolate_vl_tc = dmn->info.caps.isolate_vl_tc;

	spin_lock_init(&dmn->send_ring->lock);

	dmn->send_ring->qp = dr_create_rc_qp(dmn->mdev, &init_attr);
	if (!dmn->send_ring->qp) {
		mlx5dr_err(dmn, "Failed creating QP\n");
		ret = -ENOMEM;
		goto clean_cq;
	}

	dmn->send_ring->cq->qp = dmn->send_ring->qp;

	dmn->info.max_send_wr = QUEUE_SIZE;
	dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data,
					DR_STE_SIZE);

	dmn->send_ring->signal_th = dmn->info.max_send_wr /
		SIGNAL_PER_DIV_QUEUE;

	/* Prepare QP to be used */
	ret = dr_prepare_qp_to_rts(dmn);
	if (ret)
		goto clean_qp;

	dmn->send_ring->max_post_send_size =
		mlx5dr_icm_pool_chunk_size_to_byte(DR_CHUNK_SIZE_1K,
						   DR_ICM_TYPE_STE);

	/* Allocating the max size as a buffer for writing */
	size = dmn->send_ring->signal_th * dmn->send_ring->max_post_send_size;
	dmn->send_ring->buf = kzalloc(size, GFP_KERNEL);
	if (!dmn->send_ring->buf) {
		ret = -ENOMEM;
		goto clean_qp;
	}

	dmn->send_ring->buf_size = size;

	dmn->send_ring->mr = dr_reg_mr(dmn->mdev,
				       dmn->pdn, dmn->send_ring->buf, size);
	if (!dmn->send_ring->mr) {
		ret = -ENOMEM;
		goto free_mem;
	}

	dmn->send_ring->sync_mr = dr_reg_mr(dmn->mdev,
					    dmn->pdn, dmn->send_ring->sync_buff,
					    MIN_READ_SYNC);
	if (!dmn->send_ring->sync_mr) {
		ret = -ENOMEM;
		goto clean_mr;
	}

	return 0;

clean_mr:
	dr_dereg_mr(dmn->mdev, dmn->send_ring->mr);
free_mem:
	kfree(dmn->send_ring->buf);
clean_qp:
	dr_destroy_qp(dmn->mdev, dmn->send_ring->qp);
clean_cq:
	dr_destroy_cq(dmn->mdev, dmn->send_ring->cq);
free_send_ring:
	kfree(dmn->send_ring);

	return ret;
}

void mlx5dr_send_ring_free(struct mlx5dr_domain *dmn,
			   struct mlx5dr_send_ring *send_ring)
{
	dr_destroy_qp(dmn->mdev, send_ring->qp);
	dr_destroy_cq(dmn->mdev, send_ring->cq);
	dr_dereg_mr(dmn->mdev, send_ring->sync_mr);
	dr_dereg_mr(dmn->mdev, send_ring->mr);
	kfree(send_ring->buf);
	kfree(send_ring);
}

int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn)
{
	struct mlx5dr_send_ring *send_ring = dmn->send_ring;
	struct postsend_info send_info = {};
	u8 data[DR_STE_SIZE];
	int num_of_sends_req;
	int ret;
	int i;

	/* Sending this number of requests makes sure the queue is drained */
	num_of_sends_req = send_ring->signal_th * TH_NUMS_TO_DRAIN / 2;

	/* Send fake requests forcing the last to be signaled */
	send_info.write.addr = (uintptr_t)data;
	send_info.write.length = DR_STE_SIZE;
	send_info.write.lkey = 0;
	/* Using the sync_mr in order to write/read */
	send_info.remote_addr = (uintptr_t)send_ring->sync_mr->addr;
	send_info.rkey = send_ring->sync_mr->mkey.key;

	for (i = 0; i < num_of_sends_req; i++) {
		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			return ret;
	}

	spin_lock(&send_ring->lock);
	ret = dr_handle_pending_wc(dmn, send_ring);
	spin_unlock(&send_ring->lock);

	return ret;
}