// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/*
 * Copyright(c) 2020 - Cornelis Networks, Inc.
 * Copyright(c) 2015 - 2018 Intel Corporation.
 */

#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>

#include "hfi.h"
#include "sdma.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

static unsigned initial_pkt_count = 8;

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req);
static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen);
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen);
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 len);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret);
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent);
static void activate_packet_queue(struct iowait *wait, int reason);
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len);
static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *arg2, bool *stop);
static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);

static struct mmu_rb_ops sdma_rb_ops = {
	.filter = sdma_rb_filter,
	.insert = sdma_rb_insert,
	.evict = sdma_rb_evict,
	.remove = sdma_rb_remove,
	.invalidate = sdma_rb_invalidate
};

static int add_system_pages_to_sdma_packet(struct user_sdma_request *req,
					   struct user_sdma_txreq *tx,
					   struct user_sdma_iovec *iovec,
					   u32 *pkt_remaining);

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);

	write_seqlock(&sde->waitlock);
	trace_hfi1_usdma_defer(pq, sde, &pq->busy);
	if (sdma_progress(sde, seq, txreq))
		goto eagain;
	/*
	 * We are assuming that if the list is enqueued somewhere, it
	 * is to the dmawait list since that is the only place where
	 * it is supposed to be enqueued.
	 */
	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
	if (list_empty(&pq->busy.list)) {
		pq->busy.lock = &sde->waitlock;
		iowait_get_priority(&pq->busy);
		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
	}
	write_sequnlock(&sde->waitlock);
	return -EBUSY;
eagain:
	write_sequnlock(&sde->waitlock);
	return -EAGAIN;
}

static void activate_packet_queue(struct iowait *wait, int reason)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);

	trace_hfi1_usdma_activate(pq, wait, reason);
	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
	wake_up(&wait->wait_dma);
};

int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
				struct hfi1_filedata *fd)
{
	int ret = -ENOMEM;
	char buf[64];
	struct hfi1_devdata *dd;
	struct hfi1_user_sdma_comp_q *cq;
	struct hfi1_user_sdma_pkt_q *pq;

	if (!uctxt || !fd)
		return -EBADF;

	if (!hfi1_sdma_comp_ring_size)
		return -EINVAL;

	dd = uctxt->dd;

	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
	if (!pq)
		return -ENOMEM;
	pq->dd = dd;
	pq->ctxt = uctxt->ctxt;
	pq->subctxt = fd->subctxt;
	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
	atomic_set(&pq->n_reqs, 0);
	init_waitqueue_head(&pq->wait);
	atomic_set(&pq->n_locked, 0);

	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
		    activate_packet_queue, NULL, NULL);
	pq->reqidx = 0;

	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
			   sizeof(*pq->reqs),
			   GFP_KERNEL);
	if (!pq->reqs)
		goto pq_reqs_nomem;

	pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL);
	if (!pq->req_in_use)
		goto pq_reqs_no_in_use;

	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
		 fd->subctxt);
	pq->txreq_cache = kmem_cache_create(buf,
					    sizeof(struct user_sdma_txreq),
					    L1_CACHE_BYTES,
					    SLAB_HWCACHE_ALIGN,
					    NULL);
	if (!pq->txreq_cache) {
		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
			   uctxt->ctxt);
		goto pq_txreq_nomem;
	}

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		goto cq_nomem;

	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
				 * hfi1_sdma_comp_ring_size));
	if (!cq->comps)
		goto cq_comps_nomem;

	cq->nentries = hfi1_sdma_comp_ring_size;

	ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq,
				   &pq->handler);
	if (ret) {
		dd_dev_err(dd, "Failed to register with MMU %d", ret);
		goto pq_mmu_fail;
	}

	rcu_assign_pointer(fd->pq, pq);
	fd->cq = cq;

	return 0;

pq_mmu_fail:
	vfree(cq->comps);
cq_comps_nomem:
	kfree(cq);
cq_nomem:
	kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
	bitmap_free(pq->req_in_use);
pq_reqs_no_in_use:
	kfree(pq->reqs);
pq_reqs_nomem:
	kfree(pq);

	return ret;
}

static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
{
	unsigned long flags;
	seqlock_t *lock = pq->busy.lock;

	if (!lock)
		return;
	write_seqlock_irqsave(lock, flags);
	if (!list_empty(&pq->busy.list)) {
		list_del_init(&pq->busy.list);
		pq->busy.lock = NULL;
	}
	write_sequnlock_irqrestore(lock, flags);
}

int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
			       struct hfi1_ctxtdata *uctxt)
{
	struct hfi1_user_sdma_pkt_q *pq;

	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);

	spin_lock(&fd->pq_rcu_lock);
	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
				    lockdep_is_held(&fd->pq_rcu_lock));
	if (pq) {
		rcu_assign_pointer(fd->pq, NULL);
		spin_unlock(&fd->pq_rcu_lock);
		synchronize_srcu(&fd->pq_srcu);
		/* at this point there can be no more new requests */
		if (pq->handler)
			hfi1_mmu_rb_unregister(pq->handler);
		iowait_sdma_drain(&pq->busy);
		/* Wait until all requests have been freed. */
		wait_event_interruptible(
			pq->wait,
			!atomic_read(&pq->n_reqs));
		kfree(pq->reqs);
		bitmap_free(pq->req_in_use);
		kmem_cache_destroy(pq->txreq_cache);
		flush_pq_iowait(pq);
		kfree(pq);
	} else {
		spin_unlock(&fd->pq_rcu_lock);
	}
	if (fd->cq) {
		vfree(fd->cq->comps);
		kfree(fd->cq);
		fd->cq = NULL;
	}
	return 0;
}

static u8 dlid_to_selector(u16 dlid)
{
	static u8 mapping[256];
	static int initialized;
	static u8 next;
	int hash;

	if (!initialized) {
		memset(mapping, 0xFF, 256);
		initialized = 1;
	}

	hash = ((dlid >> 8) ^ dlid) & 0xFF;
	if (mapping[hash] == 0xFF) {
		mapping[hash] = next;
		next = (next + 1) & 0x7F;
	}

	return mapping[hash];
}

/**
 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 * @fd: valid file descriptor
 * @iovec: array of io vectors to process
 * @dim: overall iovec array size
 * @count: number of io vector array entries processed
 */
int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
				   struct iovec *iovec, unsigned long dim,
				   unsigned long *count)
{
	int ret = 0, i;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq =
		srcu_dereference(fd->pq, &fd->pq_srcu);
	struct hfi1_user_sdma_comp_q *cq = fd->cq;
	struct hfi1_devdata *dd = pq->dd;
	unsigned long idx = 0;
	u8 pcount = initial_pkt_count;
	struct sdma_req_info info;
	struct user_sdma_request *req;
	u8 opcode, sc, vl;
	u16 pkey;
	u32 slid;
	u16 dlid;
	u32 selector;

	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
		hfi1_cdbg(
			SDMA,
			"[%u:%u:%u] First vector not big enough for header %lu/%lu",
			dd->unit, uctxt->ctxt, fd->subctxt,
			iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
		return -EINVAL;
	}
	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
	if (ret) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
		return -EFAULT;
	}

	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
				     (u16 *)&info);
	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid comp index",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/*
	 * Sanity check the header io vector count. Need at least 1 vector
	 * (header) and cannot be larger than the actual io vector count.
	 */
	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
			  req_iovcnt(info.ctrl), dim);
		return -EINVAL;
	}

	if (!info.fragsize) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Request does not specify fragsize",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/* Try to claim the request. */
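	/*
	 * test_and_set_bit() is atomic, so concurrent submitters racing on
	 * the same completion-ring slot cannot both claim it.
	 */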
	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
			  dd->unit, uctxt->ctxt, fd->subctxt,
			  info.comp_idx);
		return -EBADSLT;
	}
	/*
	 * All safety checks have been done and this request has been claimed.
	 */
	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
					     info.comp_idx);
	req = pq->reqs + info.comp_idx;
	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
	req->data_len = 0;
	req->pq = pq;
	req->cq = cq;
	req->ahg_idx = -1;
	req->iov_idx = 0;
	req->sent = 0;
	req->seqnum = 0;
	req->seqcomp = 0;
	req->seqsubmitted = 0;
	req->tids = NULL;
	req->has_error = 0;
	INIT_LIST_HEAD(&req->txps);

	memcpy(&req->info, &info, sizeof(info));

	/* The request is initialized, count it */
	atomic_inc(&pq->n_reqs);

	if (req_opcode(info.ctrl) == EXPECTED) {
		/* expected must have a TID info and at least one data vector */
		if (req->data_iovs < 2) {
			SDMA_DBG(req,
				 "Not enough vectors for expected request");
			ret = -EINVAL;
			goto free_req;
		}
		req->data_iovs--;
	}

	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
			 MAX_VECTORS_PER_REQ);
		ret = -EINVAL;
		goto free_req;
	}

	/* Copy the header from the user buffer */
	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
			     sizeof(req->hdr));
	if (ret) {
		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
		ret = -EFAULT;
		goto free_req;
	}

	/* If Static rate control is not enabled, sanitize the header. */
	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
		req->hdr.pbc[2] = 0;

	/* Validate the opcode. Do not trust packets from user space blindly. */
	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
	if ((opcode & USER_OPCODE_CHECK_MASK) !=
	    USER_OPCODE_CHECK_VAL) {
		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
		ret = -EINVAL;
		goto free_req;
	}
	/*
	 * Validate the vl. Do not trust packets from user space blindly.
	 * VL comes from PBC, SC comes from LRH, and the VL needs to
	 * match the SC look up.
	 */
	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
	if (vl >= dd->pport->vls_operational ||
	    vl != sc_to_vlt(dd, sc)) {
		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
		ret = -EINVAL;
		goto free_req;
	}

	/* Checking P_KEY for requests from user-space */
	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
	slid = be16_to_cpu(req->hdr.lrh[3]);
	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
		ret = -EINVAL;
		goto free_req;
	}

	/*
	 * Also should check the BTH.lnh. If it says the next header is GRH then
	 * the RXE parsing will be off and will land in the middle of the KDETH
	 * or miss it entirely.
	 */
	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
		SDMA_DBG(req, "User tried to pass in a GRH");
		ret = -EINVAL;
		goto free_req;
	}

	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
	/*
	 * Calculate the initial TID offset based on the values of
	 * KDETH.OFFSET and KDETH.OM that are passed in.
	 */
	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
			 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			  KDETH_OM_LARGE : KDETH_OM_SMALL);
	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
					       info.comp_idx, req->tidoffset);
	idx++;

	/* Save all the IO vector structures */
	for (i = 0; i < req->data_iovs; i++) {
		req->iovs[i].offset = 0;
		INIT_LIST_HEAD(&req->iovs[i].list);
		memcpy(&req->iovs[i].iov,
		       iovec + idx++,
		       sizeof(req->iovs[i].iov));
		if (req->iovs[i].iov.iov_len == 0) {
			ret = -EINVAL;
			goto free_req;
		}
		req->data_len += req->iovs[i].iov.iov_len;
	}
	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
					 info.comp_idx, req->data_len);
	if (pcount > req->info.npkts)
		pcount = req->info.npkts;
	/*
	 * Copy any TID info
	 * User space will provide the TID info only when the
	 * request type is EXPECTED. This is true even if there is
	 * only one packet in the request and the header is already
	 * setup. The reason for the singular TID case is that the
	 * driver needs to perform safety checks.
	 */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
		u32 *tmp;

		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
			ret = -EINVAL;
			goto free_req;
		}

		/*
		 * We have to copy all of the tids because they may vary
		 * in size and, therefore, the TID count might not be
		 * equal to the pkt count. However, there is no way to
		 * tell at this point.
		 */
		tmp = memdup_user(iovec[idx].iov_base,
				  ntids * sizeof(*req->tids));
		if (IS_ERR(tmp)) {
			ret = PTR_ERR(tmp);
			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				 ntids, ret);
			goto free_req;
		}
		req->tids = tmp;
		req->n_tids = ntids;
		req->tididx = 0;
		idx++;
	}

	dlid = be16_to_cpu(req->hdr.lrh[1]);
	selector = dlid_to_selector(dlid);
	selector += uctxt->ctxt + fd->subctxt;
	req->sde = sdma_select_user_engine(dd, selector, vl);

	if (!req->sde || !sdma_running(req->sde)) {
		ret = -ECOMM;
		goto free_req;
	}

	/* We don't need an AHG entry if the request contains only one packet */
	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
		req->ahg_idx = sdma_ahg_alloc(req->sde);

	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
	pq->state = SDMA_PKT_Q_ACTIVE;

	/*
	 * This is a somewhat blocking send implementation.
	 * The driver will block the caller until all packets of the
	 * request have been submitted to the SDMA engine. However, it
	 * will not wait for send completions.
	 */
	while (req->seqsubmitted != req->info.npkts) {
		ret = user_sdma_send_pkts(req, pcount);
		if (ret < 0) {
			int we_ret;

			if (ret != -EBUSY)
				goto free_req;
			we_ret = wait_event_interruptible_timeout(
				pq->busy.wait_dma,
				pq->state == SDMA_PKT_Q_ACTIVE,
				msecs_to_jiffies(
					SDMA_IOWAIT_TIMEOUT));
			trace_hfi1_usdma_we(pq, we_ret);
			if (we_ret <= 0)
				flush_pq_iowait(pq);
		}
	}
	*count += idx;
	return 0;
free_req:
	/*
	 * If seqsubmitted == npkts, the completion routine controls the
	 * final state. If seqsubmitted < npkts, wait for any
	 * outstanding packets to finish before cleaning up.
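	 * The wait below keys off req->seqcomp, which the completion
	 * callback user_sdma_txreq_cb() advances as each packet retires.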
	 */
	if (req->seqsubmitted < req->info.npkts) {
		if (req->seqsubmitted)
			wait_event(pq->busy.wait_dma,
				   (req->seqcomp == req->seqsubmitted - 1));
		user_sdma_free_request(req);
		pq_update(pq);
		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
	}
	return ret;
}

static inline u32 compute_data_length(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx)
{
	/*
	 * Determine the proper size of the packet data.
	 * The size of the data of the first packet is in the header
	 * template. However, it includes the header and ICRC, which need
	 * to be subtracted.
	 * The minimum representable packet data length in a header is 4 bytes,
	 * therefore, when the data length request is less than 4 bytes, there's
	 * only one packet, and the packet data length is equal to that of the
	 * request data length.
	 * The size of the remaining packets is the minimum of the frag
	 * size (MTU) or remaining data in the request.
	 */
	u32 len;

	if (!req->seqnum) {
		if (req->data_len < sizeof(u32))
			len = req->data_len;
		else
			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
			       (sizeof(tx->hdr) - 4));
	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
			     PAGE_SIZE;
		/*
		 * Get the data length based on the remaining space in the
		 * TID pair.
		 */
		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
		/* If we've filled up the TID pair, move to the next one. */
		if (unlikely(!len) && ++req->tididx < req->n_tids &&
		    req->tids[req->tididx]) {
			tidlen = EXP_TID_GET(req->tids[req->tididx],
					     LEN) * PAGE_SIZE;
			req->tidoffset = 0;
			len = min_t(u32, tidlen, req->info.fragsize);
		}
		/*
		 * Since the TID pairs map entire pages, make sure that we
		 * are not going to try to send more data than we have
		 * remaining.
		 */
		len = min(len, req->data_len - req->sent);
	} else {
		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
	}
	trace_hfi1_sdma_user_compute_length(req->pq->dd,
					    req->pq->ctxt,
					    req->pq->subctxt,
					    req->info.comp_idx,
					    len);
	return len;
}

static inline u32 pad_len(u32 len)
{
	if (len & (sizeof(u32) - 1))
		len += sizeof(u32) - (len & (sizeof(u32) - 1));
	return len;
}

static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
}

static int user_sdma_txadd_ahg(struct user_sdma_request *req,
			       struct user_sdma_txreq *tx,
			       u32 datalen)
{
	int ret;
	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	/*
	 * Copy the request header into the tx header
	 * because the HW needs a cacheline-aligned
	 * address.
	 * This copy can be optimized out if the hdr
	 * member of user_sdma_request were also
	 * cacheline aligned.
	 */
	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
	}
	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
	if (ret)
		return ret;
	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
			      sizeof(tx->hdr) + datalen, req->ahg_idx,
			      0, NULL, 0, user_sdma_txreq_cb);
	if (ret)
		return ret;
	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
	if (ret)
		sdma_txclean(pq->dd, &tx->txreq);
	return ret;
}

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
{
	int ret = 0;
	u16 count;
	unsigned npkts = 0;
	struct user_sdma_txreq *tx = NULL;
	struct hfi1_user_sdma_pkt_q *pq = NULL;
	struct user_sdma_iovec *iovec = NULL;

	if (!req->pq)
		return -EINVAL;

	pq = req->pq;

	/* If tx completion has reported an error, we are done. */
	if (READ_ONCE(req->has_error))
		return -EFAULT;

	/*
	 * Check if we might have sent the entire request already
	 */
	if (unlikely(req->seqnum == req->info.npkts)) {
		if (!list_empty(&req->txps))
			goto dosend;
		return ret;
	}

	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
		maxpkts = req->info.npkts - req->seqnum;

	while (npkts < maxpkts) {
		u32 datalen = 0;

		/*
		 * Check whether any of the completions have come back
		 * with errors. If so, we are not going to process any
		 * more packets from this request.
		 */
		if (READ_ONCE(req->has_error))
			return -EFAULT;

		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
		if (!tx)
			return -ENOMEM;

		tx->flags = 0;
		tx->req = req;
		INIT_LIST_HEAD(&tx->list);

		/*
		 * For the last packet set the ACK request
		 * and disable header suppression.
		 */
		if (req->seqnum == req->info.npkts - 1)
			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
				      TXREQ_FLAGS_REQ_DISABLE_SH);

		/*
		 * Calculate the payload size - this is min of the fragment
		 * (MTU) size or the remaining bytes in the request but only
		 * if we have payload data.
		 */
		if (req->data_len) {
			iovec = &req->iovs[req->iov_idx];
			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
				if (++req->iov_idx == req->data_iovs) {
					ret = -EFAULT;
					goto free_tx;
				}
				iovec = &req->iovs[req->iov_idx];
				WARN_ON(iovec->offset);
			}

			datalen = compute_data_length(req, tx);

			/*
			 * Disable header suppression for payloads <= 8 DWS.
			 * If there is an uncorrectable error in the receive
			 * data FIFO when the received payload size is less
			 * than or equal to 8 DWS, RxDmaDataFifoRdUncErr is
			 * not reported. RHF.EccErr is set instead when the
			 * header is not suppressed.
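			 * (8 DWS is 32 bytes, the threshold tested below.)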
			 */
			if (!datalen) {
				SDMA_DBG(req,
					 "Request has data but pkt len is 0");
				ret = -EFAULT;
				goto free_tx;
			} else if (datalen <= 32) {
				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
			}
		}

		if (req->ahg_idx >= 0) {
			if (!req->seqnum) {
				ret = user_sdma_txadd_ahg(req, tx, datalen);
				if (ret)
					goto free_tx;
			} else {
				int changes;

				changes = set_txreq_header_ahg(req, tx,
							       datalen);
				if (changes < 0) {
					ret = changes;
					goto free_tx;
				}
			}
		} else {
			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
					  datalen, user_sdma_txreq_cb);
			if (ret)
				goto free_tx;
			/*
			 * Modify the header for this packet. This only needs
			 * to be done if we are not going to use AHG. Otherwise,
			 * the HW will do it based on the changes we gave it
			 * during sdma_txinit_ahg().
			 */
			ret = set_txreq_header(req, tx, datalen);
			if (ret)
				goto free_txreq;
		}

		req->koffset += datalen;
		if (req_opcode(req->info.ctrl) == EXPECTED)
			req->tidoffset += datalen;
		req->sent += datalen;
		while (datalen) {
			ret = add_system_pages_to_sdma_packet(req, tx, iovec,
							      &datalen);
			if (ret)
				goto free_txreq;
			iovec = &req->iovs[req->iov_idx];
		}
		list_add_tail(&tx->txreq.list, &req->txps);
		/*
		 * It is important to increment this here as it is used to
		 * generate the BTH.PSN and, therefore, can't be bulk-updated
		 * outside of the loop.
		 */
		tx->seqnum = req->seqnum++;
		npkts++;
	}
dosend:
	ret = sdma_send_txlist(req->sde,
			       iowait_get_ib_work(&pq->busy),
			       &req->txps, &count);
	req->seqsubmitted += count;
	if (req->seqsubmitted == req->info.npkts) {
		/*
		 * The txreq has already been submitted to the HW queue
		 * so we can free the AHG entry now. Corruption will not
		 * happen due to the sequential manner in which
		 * descriptors are processed.
		 */
		if (req->ahg_idx >= 0)
			sdma_ahg_free(req->sde, req->ahg_idx);
	}
	return ret;

free_txreq:
	sdma_txclean(pq->dd, &tx->txreq);
free_tx:
	kmem_cache_free(pq->txreq_cache, tx);
	return ret;
}

static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{
	struct evict_data evict_data;
	struct mmu_rb_handler *handler = pq->handler;

	evict_data.cleared = 0;
	evict_data.target = npages;
	hfi1_mmu_rb_evict(handler, &evict_data);
	return evict_data.cleared;
}

static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen)
{
	/*
	 * Perform safety checks for any type of packet:
	 * - transfer size is a multiple of 64 bytes
	 * - packet length is a multiple of 4 bytes
	 * - packet length is not larger than MTU size
	 *
	 * These checks are only done for the first packet of the
	 * transfer since the header is "given" to us by user space.
	 * For the remainder of the packets we compute the values.
	 */
	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
		return -EINVAL;

	if (req_opcode(req->info.ctrl) == EXPECTED) {
		/*
		 * The header is checked only on the first packet. Furthermore,
		 * we ensure that at least one TID entry is copied when the
		 * request is submitted. Therefore, we don't have to verify that
		 * tididx points to something sane.
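		 * (hfi1_user_sdma_process_request() rejects a zero TID count
		 * before the request is started.)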
		 */
		u32 tidval = req->tids[req->tididx],
			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
			tididx = EXP_TID_GET(tidval, IDX),
			tidctrl = EXP_TID_GET(tidval, CTRL),
			tidoff;
		__le32 kval = hdr->kdeth.ver_tid_offset;

		tidoff = KDETH_GET(kval, OFFSET) *
			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			   KDETH_OM_LARGE : KDETH_OM_SMALL);
		/*
		 * Expected receive packets have the following
		 * additional checks:
		 * - offset is not larger than the TID size
		 * - TIDCtrl values match between header and TID array
		 * - TID indexes match between header and TID array
		 */
		if ((tidoff + datalen > tidlen) ||
		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
		    KDETH_GET(kval, TID) != tididx)
			return -EINVAL;
	}
	return 0;
}

/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
	u32 val = be32_to_cpu(bthpsn),
		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
			0xffffffull),
		psn = val & mask;
	if (expct)
		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
	else
		psn = psn + frags;
	return psn & mask;
}

static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &tx->hdr;
	u8 omfactor; /* KDETH.OM */
	u16 pbclen;
	int ret;
	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

	/* Copy the header template to the request before modification */
	memcpy(hdr, &req->hdr, sizeof(*hdr));

	/*
	 * Check if the PBC and LRH length are mismatched. If so
	 * adjust both in the header.
	 */
	pbclen = le16_to_cpu(hdr->pbc[0]);
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		hdr->pbc[0] = cpu_to_le16(pbclen);
		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
		/*
		 * Third packet
		 * This is the first packet in the sequence that has
		 * a "static" size that can be used for the rest of
		 * the packets (besides the last one).
		 */
		if (unlikely(req->seqnum == 2)) {
			/*
			 * From this point on the lengths in both the
			 * PBC and LRH are the same until the last
			 * packet.
			 * Adjust the template so we don't have to update
			 * every packet
			 */
			req->hdr.pbc[0] = hdr->pbc[0];
			req->hdr.lrh[2] = hdr->lrh[2];
		}
	}
	/*
	 * We only have to modify the header if this is not the
	 * first packet in the request. Otherwise, we use the
	 * header given to us.
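	 * The first packet only needs to pass the template safety checks
	 * run just below.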
	 */
	if (unlikely(!req->seqnum)) {
		ret = check_header_template(req, hdr, lrhlen, datalen);
		if (ret)
			return ret;
		goto done;
	}

	hdr->bth[2] = cpu_to_be32(
		set_pkt_bth_psn(hdr->bth[2],
				(req_opcode(req->info.ctrl) == EXPECTED),
				req->seqnum));

	/* Set ACK request on last packet */
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		hdr->bth[2] |= cpu_to_be32(1UL << 31);

	/* Set the new offset */
	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
	/* Expected packets have to fill in the new TID information */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		tidval = req->tids[req->tididx];
		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
				 KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
				 KDETH_OM_SMALL_SHIFT;
		/* Set KDETH.TIDCtrl based on value for this TID. */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
			  EXP_TID_GET(tidval, CTRL));
		/* Set KDETH.TID based on value for this TID */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
			  EXP_TID_GET(tidval, IDX));
		/* Clear KDETH.SH when DISABLE_SH flag is set */
		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
		/*
		 * Set the KDETH.OFFSET and KDETH.OM based on size of
		 * transfer.
		 */
		trace_hfi1_sdma_user_tid_info(
			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
			req->tidoffset, req->tidoffset >> omfactor,
			omfactor != KDETH_OM_SMALL_SHIFT);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
			  req->tidoffset >> omfactor);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
			  omfactor != KDETH_OM_SMALL_SHIFT);
	}
done:
	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
				    req->info.comp_idx, hdr, tidval);
	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}

static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 datalen)
{
	u32 ahg[AHG_KDETH_ARRAY_SIZE];
	int idx = 0;
	u8 omfactor; /* KDETH.OM */
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &req->hdr;
	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
	size_t array_size = ARRAY_SIZE(ahg);

	if (PBC2LRH(pbclen) != lrhlen) {
		/* PBC.PbcLengthDWs */
		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
		if (idx < 0)
			return idx;
		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
				     (__force u16)cpu_to_be16(lrhlen >> 2));
		if (idx < 0)
			return idx;
	}

	/*
	 * Do the common updates
	 */
	/* BTH.PSN and BTH.A */
	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		val32 |= 1UL << 31;
	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
			     (__force u16)cpu_to_be16(val32 >> 16));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
			     (__force u16)cpu_to_be16(val32 & 0xffff));
	if (idx < 0)
		return idx;
	/* KDETH.Offset */
	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
			     (__force u16)cpu_to_le16(req->koffset >> 16));
	if (idx < 0)
		return idx;
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		__le16 val;

		tidval = req->tids[req->tididx];

		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx])
				return -EINVAL;
			tidval = req->tids[req->tididx];
		}
		omfactor = ((EXP_TID_GET(tidval, LEN) *
			     PAGE_SIZE) >=
			    KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
			    KDETH_OM_SMALL_SHIFT;
		/* KDETH.OM and KDETH.OFFSET (TID) */
		idx = ahg_header_set(
			ahg, idx, array_size, 7, 0, 16,
			((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
			 ((req->tidoffset >> omfactor)
			  & 0x7fff)));
		if (idx < 0)
			return idx;
		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
				  (EXP_TID_GET(tidval, IDX) & 0x3ff));

		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		} else {
			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		}

		idx = ahg_header_set(ahg, idx, array_size,
				     7, 16, 14, (__force u16)val);
		if (idx < 0)
			return idx;
	}

	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
					req->info.comp_idx, req->sde->this_idx,
					req->ahg_idx, ahg, idx, tidval);
	sdma_txinit_ahg(&tx->txreq,
			SDMA_TXREQ_F_USE_AHG,
			datalen, req->ahg_idx, idx,
			ahg, sizeof(req->hdr),
			user_sdma_txreq_cb);

	return idx;
}

/**
 * user_sdma_txreq_cb() - SDMA tx request completion callback.
 * @txreq: valid sdma tx request
 * @status: success/failure of request
 *
 * Called when the SDMA progress state machine gets notification that
 * the SDMA descriptors for this tx request have been processed by the
 * DMA engine. Called in interrupt context.
 * Only do work on completed sequences.
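 * Cleanup and the completion-ring update are deferred until the final
 * packet of the request has retired.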
 */
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);
	struct user_sdma_request *req;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	enum hfi1_sdma_comp_state state = COMPLETE;

	if (!tx->req)
		return;

	req = tx->req;
	pq = req->pq;
	cq = req->cq;

	if (status != SDMA_TXREQ_S_OK) {
		SDMA_DBG(req, "SDMA completion with error %d",
			 status);
		WRITE_ONCE(req->has_error, 1);
		state = ERROR;
	}

	req->seqcomp = tx->seqnum;
	kmem_cache_free(pq->txreq_cache, tx);

	/* sequence isn't complete? We are done */
	if (req->seqcomp != req->info.npkts - 1)
		return;

	user_sdma_free_request(req);
	set_comp_state(pq, cq, req->info.comp_idx, state, status);
	pq_update(pq);
}

static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
{
	if (atomic_dec_and_test(&pq->n_reqs))
		wake_up(&pq->wait);
}

static void user_sdma_free_request(struct user_sdma_request *req)
{
	if (!list_empty(&req->txps)) {
		struct sdma_txreq *t, *p;

		list_for_each_entry_safe(t, p, &req->txps, list) {
			struct user_sdma_txreq *tx =
				container_of(t, struct user_sdma_txreq, txreq);
			list_del_init(&t->list);
			sdma_txclean(req->pq->dd, t);
			kmem_cache_free(req->pq->txreq_cache, tx);
		}
	}

	kfree(req->tids);
	clear_bit(req->info.comp_idx, req->pq->req_in_use);
}

static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret)
{
	if (state == ERROR)
		cq->comps[idx].errcode = -ret;
	smp_wmb(); /* make sure errcode is visible first */
	cq->comps[idx].status = state;
	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
					idx, state, ret);
}

static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned int start, unsigned int npages)
{
	hfi1_release_user_pages(mm, pages + start, npages, false);
	kfree(pages);
}

static void free_system_node(struct sdma_mmu_node *node)
{
	if (node->npages) {
		unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0,
				   node->npages);
		atomic_sub(node->npages, &node->pq->n_locked);
	}
	kfree(node);
}

static inline void acquire_node(struct sdma_mmu_node *node)
{
	atomic_inc(&node->refcount);
	WARN_ON(atomic_read(&node->refcount) < 0);
}

static inline void release_node(struct mmu_rb_handler *handler,
				struct sdma_mmu_node *node)
{
	atomic_dec(&node->refcount);
	WARN_ON(atomic_read(&node->refcount) < 0);
}

static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler,
					      unsigned long start,
					      unsigned long end)
{
	struct mmu_rb_node *rb_node;
	struct sdma_mmu_node *node;
	unsigned long flags;

	spin_lock_irqsave(&handler->lock, flags);
	rb_node = hfi1_mmu_rb_get_first(handler, start, (end - start));
	if (!rb_node) {
		spin_unlock_irqrestore(&handler->lock, flags);
		return NULL;
	}
	node = container_of(rb_node, struct sdma_mmu_node, rb);
	acquire_node(node);
	spin_unlock_irqrestore(&handler->lock, flags);

	return node;
}

static int pin_system_pages(struct user_sdma_request *req,
			    uintptr_t start_address, size_t length,
			    struct sdma_mmu_node *node, int npages)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	int pinned, cleared;
	struct page **pages;

	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

retry:
	if (!hfi1_can_pin_pages(pq->dd, current->mm, atomic_read(&pq->n_locked),
				npages)) {
		SDMA_DBG(req, "Evicting: nlocked %u npages %u",
			 atomic_read(&pq->n_locked), npages);
		cleared = sdma_cache_evict(pq, npages);
		if (cleared >= npages)
			goto retry;
	}

	SDMA_DBG(req, "Acquire user pages start_address %lx node->npages %u npages %u",
		 start_address, node->npages, npages);
	pinned = hfi1_acquire_user_pages(current->mm, start_address, npages, 0,
					 pages);

	if (pinned < 0) {
		kfree(pages);
		SDMA_DBG(req, "pinned %d", pinned);
		return pinned;
	}
	if (pinned != npages) {
		unpin_vector_pages(current->mm, pages, node->npages, pinned);
		SDMA_DBG(req, "npages %u pinned %d", npages, pinned);
		return -EFAULT;
	}
	node->rb.addr = start_address;
	node->rb.len = length;
	node->pages = pages;
	node->npages = npages;
	atomic_add(pinned, &pq->n_locked);
	SDMA_DBG(req, "done. pinned %d", pinned);
	return 0;
}

static int add_system_pinning(struct user_sdma_request *req,
			      struct sdma_mmu_node **node_p,
			      unsigned long start, unsigned long len)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct sdma_mmu_node *node;
	int ret;

	node = kzalloc(sizeof(*node), GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	node->pq = pq;
	ret = pin_system_pages(req, start, len, node, PFN_DOWN(len));
	if (ret == 0) {
		ret = hfi1_mmu_rb_insert(pq->handler, &node->rb);
		if (ret)
			free_system_node(node);
		else
			*node_p = node;

		return ret;
	}

	kfree(node);
	return ret;
}

static int get_system_cache_entry(struct user_sdma_request *req,
				  struct sdma_mmu_node **node_p,
				  size_t req_start, size_t req_len)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	u64 start = ALIGN_DOWN(req_start, PAGE_SIZE);
	u64 end = PFN_ALIGN(req_start + req_len);
	struct mmu_rb_handler *handler = pq->handler;
	int ret;

	if ((end - start) == 0) {
		SDMA_DBG(req,
			 "Request for empty cache entry req_start %lx req_len %lx start %llx end %llx",
			 req_start, req_len, start, end);
		return -EINVAL;
	}

	SDMA_DBG(req, "req_start %lx req_len %lu", req_start, req_len);

	while (1) {
		struct sdma_mmu_node *node =
			find_system_node(handler, start, end);
		u64 prepend_len = 0;

		SDMA_DBG(req, "node %p start %llx end %llu", node, start, end);
		if (!node) {
			ret = add_system_pinning(req, node_p, start,
						 end - start);
			if (ret == -EEXIST) {
				/*
				 * Another execution context has inserted a
				 * conflicting entry first.
				 */
				continue;
			}
			return ret;
		}

		if (node->rb.addr <= start) {
			/*
			 * This entry covers at least part of the region. If it doesn't extend
			 * to the end, then this will be called again for the next segment.
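			 * add_system_iovec_to_sdma_packet() loops until the
			 * whole iovec has been mapped, so the remainder is
			 * picked up on a later call.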
			 */
			*node_p = node;
			return 0;
		}

		SDMA_DBG(req, "prepend: node->rb.addr %lx, node->refcount %d",
			 node->rb.addr, atomic_read(&node->refcount));
		prepend_len = node->rb.addr - start;

		/*
		 * This node will not be returned, instead a new node
		 * will be. So release the reference.
		 */
		release_node(handler, node);

		/* Prepend a node to cover the beginning of the allocation */
		ret = add_system_pinning(req, node_p, start, prepend_len);
		if (ret == -EEXIST) {
			/* Another execution context has inserted a conflicting entry first. */
			continue;
		}
		return ret;
	}
}

static int add_mapping_to_sdma_packet(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx,
				      struct sdma_mmu_node *cache_entry,
				      size_t start,
				      size_t from_this_cache_entry)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	unsigned int page_offset;
	unsigned int from_this_page;
	size_t page_index;
	void *ctx;
	int ret;

	/*
	 * Because the cache may be more fragmented than the memory that is being accessed,
	 * it's not strictly necessary to have a descriptor per cache entry.
	 */

	while (from_this_cache_entry) {
		page_index = PFN_DOWN(start - cache_entry->rb.addr);

		if (page_index >= cache_entry->npages) {
			SDMA_DBG(req,
				 "Request for page_index %zu >= cache_entry->npages %u",
				 page_index, cache_entry->npages);
			return -EINVAL;
		}

		page_offset = start - ALIGN_DOWN(start, PAGE_SIZE);
		from_this_page = PAGE_SIZE - page_offset;

		if (from_this_page < from_this_cache_entry) {
			ctx = NULL;
		} else {
			/*
			 * In the case they are equal the next line has no practical effect,
			 * but it's better to do a register to register copy than a conditional
			 * branch.
			 */
			from_this_page = from_this_cache_entry;
			ctx = cache_entry;
		}

		ret = sdma_txadd_page(pq->dd, ctx, &tx->txreq,
				      cache_entry->pages[page_index],
				      page_offset, from_this_page);
		if (ret) {
			/*
			 * When there's a failure, the entire request is freed by
			 * user_sdma_send_pkts().
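			 * (Its free_txreq/free_tx error paths clean up the
			 * partially built txreq.)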
			 */
			SDMA_DBG(req,
				 "sdma_txadd_page failed %d page_index %lu page_offset %u from_this_page %u",
				 ret, page_index, page_offset, from_this_page);
			return ret;
		}
		start += from_this_page;
		from_this_cache_entry -= from_this_page;
	}
	return 0;
}

static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req,
					   struct user_sdma_txreq *tx,
					   struct user_sdma_iovec *iovec,
					   size_t from_this_iovec)
{
	struct mmu_rb_handler *handler = req->pq->handler;

	while (from_this_iovec > 0) {
		struct sdma_mmu_node *cache_entry;
		size_t from_this_cache_entry;
		size_t start;
		int ret;

		start = (uintptr_t)iovec->iov.iov_base + iovec->offset;
		ret = get_system_cache_entry(req, &cache_entry, start,
					     from_this_iovec);
		if (ret) {
			SDMA_DBG(req, "pin system segment failed %d", ret);
			return ret;
		}

		from_this_cache_entry = cache_entry->rb.len - (start - cache_entry->rb.addr);
		if (from_this_cache_entry > from_this_iovec)
			from_this_cache_entry = from_this_iovec;

		ret = add_mapping_to_sdma_packet(req, tx, cache_entry, start,
						 from_this_cache_entry);
		if (ret) {
			/*
			 * We're guaranteed that there will be no descriptor
			 * completion callback that releases this node
			 * because only the last descriptor referencing it
			 * has a context attached, and a failure means the
			 * last descriptor was never added.
			 */
			release_node(handler, cache_entry);
			SDMA_DBG(req, "add system segment failed %d", ret);
			return ret;
		}

		iovec->offset += from_this_cache_entry;
		from_this_iovec -= from_this_cache_entry;
	}

	return 0;
}

static int add_system_pages_to_sdma_packet(struct user_sdma_request *req,
					   struct user_sdma_txreq *tx,
					   struct user_sdma_iovec *iovec,
					   u32 *pkt_data_remaining)
{
	size_t remaining_to_add = *pkt_data_remaining;
	/*
	 * Walk through iovec entries, ensure the associated pages
	 * are pinned and mapped, add data to the packet until no more
	 * data remains to be added.
	 */
	while (remaining_to_add > 0) {
		struct user_sdma_iovec *cur_iovec;
		size_t from_this_iovec;
		int ret;

		cur_iovec = iovec;
		from_this_iovec = iovec->iov.iov_len - iovec->offset;

		if (from_this_iovec > remaining_to_add) {
			from_this_iovec = remaining_to_add;
		} else {
			/* The current iovec entry will be consumed by this pass. */
			req->iov_idx++;
			iovec++;
		}

		ret = add_system_iovec_to_sdma_packet(req, tx, cur_iovec,
						      from_this_iovec);
		if (ret)
			return ret;

		remaining_to_add -= from_this_iovec;
	}
	*pkt_data_remaining = remaining_to_add;

	return 0;
}

void system_descriptor_complete(struct hfi1_devdata *dd,
				struct sdma_desc *descp)
{
	switch (sdma_mapping_type(descp)) {
	case SDMA_MAP_SINGLE:
		dma_unmap_single(&dd->pcidev->dev, sdma_mapping_addr(descp),
				 sdma_mapping_len(descp), DMA_TO_DEVICE);
		break;
	case SDMA_MAP_PAGE:
		dma_unmap_page(&dd->pcidev->dev, sdma_mapping_addr(descp),
			       sdma_mapping_len(descp), DMA_TO_DEVICE);
		break;
	}

	if (descp->pinning_ctx) {
		struct sdma_mmu_node *node = descp->pinning_ctx;

		release_node(node->rb.handler, node);
	}
}

static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len)
{
	return (bool)(node->addr == addr);
}

static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	atomic_inc(&node->refcount);
	return 0;
}

/*
 * Return 1 to remove the node from the rb tree and call the remove op.
 *
 * Called with the rb tree lock held.
 */
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *evict_arg, bool *stop)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);
	struct evict_data *evict_data = evict_arg;

	/* is this node still being used? */
	if (atomic_read(&node->refcount))
		return 0; /* keep this node */

	/* this node will be evicted, add its pages to our count */
	evict_data->cleared += node->npages;

	/* have enough pages been cleared? */
	if (evict_data->cleared >= evict_data->target)
		*stop = true;

	return 1; /* remove this node */
}

static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	free_system_node(node);
}

static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	if (!atomic_read(&node->refcount))
		return 1;
	return 0;
}