// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/*
 * Copyright(c) 2020 - Cornelis Networks, Inc.
 * Copyright(c) 2015 - 2018 Intel Corporation.
 */

#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>

#include "hfi.h"
#include "sdma.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

static unsigned initial_pkt_count = 8;

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req);
static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen);
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen);
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 len);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret);
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent);
static void activate_packet_queue(struct iowait *wait, int reason);
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len);
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *arg2, bool *stop);
static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);

static struct mmu_rb_ops sdma_rb_ops = {
	.filter = sdma_rb_filter,
	.evict = sdma_rb_evict,
	.remove = sdma_rb_remove,
};

static int add_system_pages_to_sdma_packet(struct user_sdma_request *req,
					   struct user_sdma_txreq *tx,
					   struct user_sdma_iovec *iovec,
					   u32 *pkt_remaining);

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);

	write_seqlock(&sde->waitlock);
	trace_hfi1_usdma_defer(pq, sde, &pq->busy);
	if (sdma_progress(sde, seq, txreq))
		goto eagain;
	/*
	 * We are assuming that if the list is enqueued somewhere, it
	 * is to the dmawait list since that is the only place where
	 * it is supposed to be enqueued.
	 */
	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
	if (list_empty(&pq->busy.list)) {
		pq->busy.lock = &sde->waitlock;
		iowait_get_priority(&pq->busy);
		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
	}
	write_sequnlock(&sde->waitlock);
	return -EBUSY;
eagain:
	write_sequnlock(&sde->waitlock);
	return -EAGAIN;
}

static void activate_packet_queue(struct iowait *wait, int reason)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);

	trace_hfi1_usdma_activate(pq, wait, reason);
	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
	wake_up(&wait->wait_dma);
}

int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
				struct hfi1_filedata *fd)
{
	int ret = -ENOMEM;
	char buf[64];
	struct hfi1_devdata *dd;
	struct hfi1_user_sdma_comp_q *cq;
	struct hfi1_user_sdma_pkt_q *pq;

	if (!uctxt || !fd)
		return -EBADF;

	if (!hfi1_sdma_comp_ring_size)
		return -EINVAL;

	dd = uctxt->dd;

	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
	if (!pq)
		return -ENOMEM;
	pq->dd = dd;
	pq->ctxt = uctxt->ctxt;
	pq->subctxt = fd->subctxt;
	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
	atomic_set(&pq->n_reqs, 0);
	init_waitqueue_head(&pq->wait);
	atomic_set(&pq->n_locked, 0);

	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
		    activate_packet_queue, NULL, NULL);
	pq->reqidx = 0;

	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
			   sizeof(*pq->reqs),
			   GFP_KERNEL);
	if (!pq->reqs)
		goto pq_reqs_nomem;

	pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL);
	if (!pq->req_in_use)
		goto pq_reqs_no_in_use;

	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
		 fd->subctxt);
	pq->txreq_cache = kmem_cache_create(buf,
					    sizeof(struct user_sdma_txreq),
					    L1_CACHE_BYTES,
					    SLAB_HWCACHE_ALIGN,
					    NULL);
	if (!pq->txreq_cache) {
		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
			   uctxt->ctxt);
		goto pq_txreq_nomem;
	}

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		goto cq_nomem;

	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
				 * hfi1_sdma_comp_ring_size));
	if (!cq->comps)
		goto cq_comps_nomem;

	cq->nentries = hfi1_sdma_comp_ring_size;

	ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq,
				   &pq->handler);
	if (ret) {
		dd_dev_err(dd, "Failed to register with MMU %d", ret);
		goto pq_mmu_fail;
	}

	rcu_assign_pointer(fd->pq, pq);
	fd->cq = cq;

	return 0;

pq_mmu_fail:
	vfree(cq->comps);
cq_comps_nomem:
	kfree(cq);
cq_nomem:
	kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
	bitmap_free(pq->req_in_use);
pq_reqs_no_in_use:
	kfree(pq->reqs);
pq_reqs_nomem:
	kfree(pq);

	return ret;
}

static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
{
	unsigned long flags;
	seqlock_t *lock = pq->busy.lock;

	if (!lock)
		return;
	write_seqlock_irqsave(lock, flags);
	if (!list_empty(&pq->busy.list)) {
		list_del_init(&pq->busy.list);
		pq->busy.lock = NULL;
	}
	write_sequnlock_irqrestore(lock, flags);
}

int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
			       struct hfi1_ctxtdata *uctxt)
{
	struct hfi1_user_sdma_pkt_q *pq;

	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);

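	/*
	 * Detach the packet queue from the file data under pq_rcu_lock and
	 * wait out SRCU readers before draining, so no new requests can be
	 * started against a queue that is being torn down.
	 */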
	spin_lock(&fd->pq_rcu_lock);
	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
				    lockdep_is_held(&fd->pq_rcu_lock));
	if (pq) {
		rcu_assign_pointer(fd->pq, NULL);
		spin_unlock(&fd->pq_rcu_lock);
		synchronize_srcu(&fd->pq_srcu);
		/* at this point there can be no more new requests */
		iowait_sdma_drain(&pq->busy);
		/* Wait until all requests have been freed. */
		wait_event_interruptible(
			pq->wait,
			!atomic_read(&pq->n_reqs));
		kfree(pq->reqs);
		if (pq->handler)
			hfi1_mmu_rb_unregister(pq->handler);
		bitmap_free(pq->req_in_use);
		kmem_cache_destroy(pq->txreq_cache);
		flush_pq_iowait(pq);
		kfree(pq);
	} else {
		spin_unlock(&fd->pq_rcu_lock);
	}
	if (fd->cq) {
		vfree(fd->cq->comps);
		kfree(fd->cq);
		fd->cq = NULL;
	}
	return 0;
}

static u8 dlid_to_selector(u16 dlid)
{
	static u8 mapping[256];
	static int initialized;
	static u8 next;
	int hash;

	if (!initialized) {
		memset(mapping, 0xFF, 256);
		initialized = 1;
	}

	hash = ((dlid >> 8) ^ dlid) & 0xFF;
	if (mapping[hash] == 0xFF) {
		mapping[hash] = next;
		next = (next + 1) & 0x7F;
	}

	return mapping[hash];
}

/**
 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 * @fd: valid file descriptor
 * @iovec: array of io vectors to process
 * @dim: overall iovec array size
 * @count: number of io vector array entries processed
 */
int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
				   struct iovec *iovec, unsigned long dim,
				   unsigned long *count)
{
	int ret = 0, i;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq =
		srcu_dereference(fd->pq, &fd->pq_srcu);
	struct hfi1_user_sdma_comp_q *cq = fd->cq;
	struct hfi1_devdata *dd = pq->dd;
	unsigned long idx = 0;
	u8 pcount = initial_pkt_count;
	struct sdma_req_info info;
	struct user_sdma_request *req;
	u8 opcode, sc, vl;
	u16 pkey;
	u32 slid;
	u16 dlid;
	u32 selector;

	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
		hfi1_cdbg(
			SDMA,
			"[%u:%u:%u] First vector not big enough for header %lu/%lu",
			dd->unit, uctxt->ctxt, fd->subctxt,
			iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
		return -EINVAL;
	}
	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
	if (ret) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
		return -EFAULT;
	}

	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
				     (u16 *)&info);
	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid comp index",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/*
	 * Sanity check the header io vector count.  Need at least 1 vector
	 * (header) and cannot be larger than the actual io vector count.
	 */
	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
			  req_iovcnt(info.ctrl), dim);
		return -EINVAL;
	}

	if (!info.fragsize) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Request does not specify fragsize",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/* Try to claim the request. */
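	/*
	 * comp_idx also selects the slot in pq->reqs and the entry in the
	 * completion ring, so claiming its bit in req_in_use reserves both.
	 */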
	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
			  dd->unit, uctxt->ctxt, fd->subctxt,
			  info.comp_idx);
		return -EBADSLT;
	}
	/*
	 * All safety checks have been done and this request has been claimed.
	 */
	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
					     info.comp_idx);
	req = pq->reqs + info.comp_idx;
	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
	req->data_len = 0;
	req->pq = pq;
	req->cq = cq;
	req->ahg_idx = -1;
	req->iov_idx = 0;
	req->sent = 0;
	req->seqnum = 0;
	req->seqcomp = 0;
	req->seqsubmitted = 0;
	req->tids = NULL;
	req->has_error = 0;
	INIT_LIST_HEAD(&req->txps);

	memcpy(&req->info, &info, sizeof(info));

	/* The request is initialized, count it */
	atomic_inc(&pq->n_reqs);

	if (req_opcode(info.ctrl) == EXPECTED) {
		/* expected must have a TID info and at least one data vector */
		if (req->data_iovs < 2) {
			SDMA_DBG(req,
				 "Not enough vectors for expected request");
			ret = -EINVAL;
			goto free_req;
		}
		req->data_iovs--;
	}

	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
			 MAX_VECTORS_PER_REQ);
		ret = -EINVAL;
		goto free_req;
	}

	/* Copy the header from the user buffer */
	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
			     sizeof(req->hdr));
	if (ret) {
		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
		ret = -EFAULT;
		goto free_req;
	}

	/* If Static rate control is not enabled, sanitize the header. */
	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
		req->hdr.pbc[2] = 0;

	/* Validate the opcode. Do not trust packets from user space blindly. */
	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
	if ((opcode & USER_OPCODE_CHECK_MASK) !=
	     USER_OPCODE_CHECK_VAL) {
		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
		ret = -EINVAL;
		goto free_req;
	}
	/*
	 * Validate the vl. Do not trust packets from user space blindly.
	 * VL comes from PBC, SC comes from LRH, and the VL needs to
	 * match the SC look up.
	 */
	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
	if (vl >= dd->pport->vls_operational ||
	    vl != sc_to_vlt(dd, sc)) {
		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
		ret = -EINVAL;
		goto free_req;
	}

	/* Checking P_KEY for requests from user-space */
	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
	slid = be16_to_cpu(req->hdr.lrh[3]);
	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
		ret = -EINVAL;
		goto free_req;
	}

	/*
	 * We should also check the BTH.lnh. If it says the next header is a
	 * GRH, then the RXE parsing will be off and will land in the middle
	 * of the KDETH or miss it entirely.
	 */
	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
		SDMA_DBG(req, "User tried to pass in a GRH");
		ret = -EINVAL;
		goto free_req;
	}

	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
	/*
	 * Calculate the initial TID offset based on the values of
	 * KDETH.OFFSET and KDETH.OM that are passed in.
	 */
	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
			 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			  KDETH_OM_LARGE : KDETH_OM_SMALL);
	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
					       info.comp_idx, req->tidoffset);
	idx++;

	/* Save all the IO vector structures */
	for (i = 0; i < req->data_iovs; i++) {
		req->iovs[i].offset = 0;
		INIT_LIST_HEAD(&req->iovs[i].list);
		memcpy(&req->iovs[i].iov,
		       iovec + idx++,
		       sizeof(req->iovs[i].iov));
		if (req->iovs[i].iov.iov_len == 0) {
			ret = -EINVAL;
			goto free_req;
		}
		req->data_len += req->iovs[i].iov.iov_len;
	}
	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
					 info.comp_idx, req->data_len);
	if (pcount > req->info.npkts)
		pcount = req->info.npkts;
	/*
	 * Copy any TID info
	 * User space will provide the TID info only when the
	 * request type is EXPECTED. This is true even if there is
	 * only one packet in the request and the header is already
	 * setup. The reason for the singular TID case is that the
	 * driver needs to perform safety checks.
	 */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
		u32 *tmp;

		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
			ret = -EINVAL;
			goto free_req;
		}

		/*
		 * We have to copy all of the tids because they may vary
		 * in size and, therefore, the TID count might not be
		 * equal to the pkt count. However, there is no way to
		 * tell at this point.
		 */
		tmp = memdup_user(iovec[idx].iov_base,
				  ntids * sizeof(*req->tids));
		if (IS_ERR(tmp)) {
			ret = PTR_ERR(tmp);
			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				 ntids, ret);
			goto free_req;
		}
		req->tids = tmp;
		req->n_tids = ntids;
		req->tididx = 0;
		idx++;
	}

	dlid = be16_to_cpu(req->hdr.lrh[1]);
	selector = dlid_to_selector(dlid);
	selector += uctxt->ctxt + fd->subctxt;
	req->sde = sdma_select_user_engine(dd, selector, vl);

	if (!req->sde || !sdma_running(req->sde)) {
		ret = -ECOMM;
		goto free_req;
	}

	/* We don't need an AHG entry if the request contains only one packet */
	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
		req->ahg_idx = sdma_ahg_alloc(req->sde);

	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
	pq->state = SDMA_PKT_Q_ACTIVE;

	/*
	 * This is a somewhat blocking send implementation.
	 * The driver will block the caller until all packets of the
	 * request have been submitted to the SDMA engine. However, it
	 * will not wait for send completions.
	 */
	while (req->seqsubmitted != req->info.npkts) {
		ret = user_sdma_send_pkts(req, pcount);
		if (ret < 0) {
			int we_ret;

			if (ret != -EBUSY)
				goto free_req;
			we_ret = wait_event_interruptible_timeout(
				pq->busy.wait_dma,
				pq->state == SDMA_PKT_Q_ACTIVE,
				msecs_to_jiffies(
					SDMA_IOWAIT_TIMEOUT));
			trace_hfi1_usdma_we(pq, we_ret);
			if (we_ret <= 0)
				flush_pq_iowait(pq);
		}
	}
	*count += idx;
	return 0;
free_req:
	/*
	 * If the submitted seqsubmitted == npkts, the completion routine
	 * controls the final state.  If seqsubmitted < npkts, wait for any
	 * outstanding packets to finish before cleaning up.
	 */
	if (req->seqsubmitted < req->info.npkts) {
		if (req->seqsubmitted)
			wait_event(pq->busy.wait_dma,
				   (req->seqcomp == req->seqsubmitted - 1));
		user_sdma_free_request(req);
		pq_update(pq);
		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
	}
	return ret;
}

static inline u32 compute_data_length(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx)
{
	/*
	 * Determine the proper size of the packet data.
	 * The size of the data of the first packet is in the header
	 * template. However, it includes the header and ICRC, which need
	 * to be subtracted.
	 * The minimum representable packet data length in a header is 4 bytes,
	 * therefore, when the data length request is less than 4 bytes, there's
	 * only one packet, and the packet data length is equal to that of the
	 * request data length.
	 * The size of the remaining packets is the minimum of the frag
	 * size (MTU) or remaining data in the request.
	 */
	u32 len;

	if (!req->seqnum) {
		if (req->data_len < sizeof(u32))
			len = req->data_len;
		else
			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
			       (sizeof(tx->hdr) - 4));
	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
			PAGE_SIZE;
		/*
		 * Get the data length based on the remaining space in the
		 * TID pair.
		 */
		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
		/* If we've filled up the TID pair, move to the next one. */
		if (unlikely(!len) && ++req->tididx < req->n_tids &&
		    req->tids[req->tididx]) {
			tidlen = EXP_TID_GET(req->tids[req->tididx],
					     LEN) * PAGE_SIZE;
			req->tidoffset = 0;
			len = min_t(u32, tidlen, req->info.fragsize);
		}
		/*
		 * Since the TID pairs map entire pages, make sure that we
		 * are not going to try to send more data than we have
		 * remaining.
		 */
		len = min(len, req->data_len - req->sent);
	} else {
		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
	}
	trace_hfi1_sdma_user_compute_length(req->pq->dd,
					    req->pq->ctxt,
					    req->pq->subctxt,
					    req->info.comp_idx,
					    len);
	return len;
}

static inline u32 pad_len(u32 len)
{
	if (len & (sizeof(u32) - 1))
		len += sizeof(u32) - (len & (sizeof(u32) - 1));
	return len;
}

static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
}

static int user_sdma_txadd_ahg(struct user_sdma_request *req,
			       struct user_sdma_txreq *tx,
			       u32 datalen)
{
	int ret;
	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	/*
	 * Copy the request header into the tx header
	 * because the HW needs a cacheline-aligned
	 * address.
	 * This copy could be optimized out if the hdr
	 * member of user_sdma_request were also
	 * cacheline aligned.
	 */
	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
	}
	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
	if (ret)
		return ret;
	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
			      sizeof(tx->hdr) + datalen, req->ahg_idx,
			      0, NULL, 0, user_sdma_txreq_cb);
	if (ret)
		return ret;
	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
	if (ret)
		sdma_txclean(pq->dd, &tx->txreq);
	return ret;
}

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
{
	int ret = 0;
	u16 count;
	unsigned npkts = 0;
	struct user_sdma_txreq *tx = NULL;
	struct hfi1_user_sdma_pkt_q *pq = NULL;
	struct user_sdma_iovec *iovec = NULL;

	if (!req->pq)
		return -EINVAL;

	pq = req->pq;

	/* If tx completion has reported an error, we are done. */
	if (READ_ONCE(req->has_error))
		return -EFAULT;

	/*
	 * Check if we might have sent the entire request already
	 */
	if (unlikely(req->seqnum == req->info.npkts)) {
		if (!list_empty(&req->txps))
			goto dosend;
		return ret;
	}

	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
		maxpkts = req->info.npkts - req->seqnum;

	while (npkts < maxpkts) {
		u32 datalen = 0;

		/*
		 * Check whether any of the completions have come back
		 * with errors. If so, we are not going to process any
		 * more packets from this request.
		 */
		if (READ_ONCE(req->has_error))
			return -EFAULT;

		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
		if (!tx)
			return -ENOMEM;

		tx->flags = 0;
		tx->req = req;
		INIT_LIST_HEAD(&tx->list);

		/*
		 * For the last packet set the ACK request
		 * and disable header suppression.
		 */
		if (req->seqnum == req->info.npkts - 1)
			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
				      TXREQ_FLAGS_REQ_DISABLE_SH);

		/*
		 * Calculate the payload size - this is min of the fragment
		 * (MTU) size or the remaining bytes in the request but only
		 * if we have payload data.
		 */
		if (req->data_len) {
			iovec = &req->iovs[req->iov_idx];
			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
				if (++req->iov_idx == req->data_iovs) {
					ret = -EFAULT;
					goto free_tx;
				}
				iovec = &req->iovs[req->iov_idx];
				WARN_ON(iovec->offset);
			}

			datalen = compute_data_length(req, tx);

			/*
			 * Disable header suppression for the payload <= 8DWS.
			 * If there is an uncorrectable error in the receive
			 * data FIFO when the received payload size is less
			 * than or equal to 8DWS, then the
			 * RxDmaDataFifoRdUncErr is not reported. Instead,
			 * RHF.EccErr is set if the header is not suppressed.
			 */
			if (!datalen) {
				SDMA_DBG(req,
					 "Request has data but pkt len is 0");
				ret = -EFAULT;
				goto free_tx;
			} else if (datalen <= 32) {
				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
			}
		}

		if (req->ahg_idx >= 0) {
			if (!req->seqnum) {
				ret = user_sdma_txadd_ahg(req, tx, datalen);
				if (ret)
					goto free_tx;
			} else {
				int changes;

				changes = set_txreq_header_ahg(req, tx,
							       datalen);
				if (changes < 0) {
					ret = changes;
					goto free_tx;
				}
			}
		} else {
			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
					  datalen, user_sdma_txreq_cb);
			if (ret)
				goto free_tx;
			/*
			 * Modify the header for this packet. This only needs
			 * to be done if we are not going to use AHG. Otherwise,
			 * the HW will do it based on the changes we gave it
			 * during sdma_txinit_ahg().
			 */
			ret = set_txreq_header(req, tx, datalen);
			if (ret)
				goto free_txreq;
		}

		req->koffset += datalen;
		if (req_opcode(req->info.ctrl) == EXPECTED)
			req->tidoffset += datalen;
		req->sent += datalen;
		while (datalen) {
			ret = add_system_pages_to_sdma_packet(req, tx, iovec,
							      &datalen);
			if (ret)
				goto free_txreq;
			iovec = &req->iovs[req->iov_idx];
		}
		list_add_tail(&tx->txreq.list, &req->txps);
		/*
		 * It is important to increment this here as it is used to
		 * generate the BTH.PSN and, therefore, can't be bulk-updated
		 * outside of the loop.
		 */
		tx->seqnum = req->seqnum++;
		npkts++;
	}
dosend:
	ret = sdma_send_txlist(req->sde,
			       iowait_get_ib_work(&pq->busy),
			       &req->txps, &count);
	req->seqsubmitted += count;
	if (req->seqsubmitted == req->info.npkts) {
		/*
		 * The txreq has already been submitted to the HW queue
		 * so we can free the AHG entry now. Corruption will not
		 * happen due to the sequential manner in which
		 * descriptors are processed.
		 */
		if (req->ahg_idx >= 0)
			sdma_ahg_free(req->sde, req->ahg_idx);
	}
	return ret;

free_txreq:
	sdma_txclean(pq->dd, &tx->txreq);
free_tx:
	kmem_cache_free(pq->txreq_cache, tx);
	return ret;
}

static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{
	struct evict_data evict_data;
	struct mmu_rb_handler *handler = pq->handler;

	evict_data.cleared = 0;
	evict_data.target = npages;
	hfi1_mmu_rb_evict(handler, &evict_data);
	return evict_data.cleared;
}

static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen)
{
	/*
	 * Perform safety checks for any type of packet:
	 * - transfer size is multiple of 64 bytes
	 * - packet length is multiple of 4 bytes
	 * - packet length is not larger than MTU size
	 *
	 * These checks are only done for the first packet of the
	 * transfer since the header is "given" to us by user space.
	 * For the remainder of the packets we compute the values.
	 */
	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
		return -EINVAL;

	if (req_opcode(req->info.ctrl) == EXPECTED) {
		/*
		 * The header is checked only on the first packet. Furthermore,
		 * we ensure that at least one TID entry is copied when the
		 * request is submitted. Therefore, we don't have to verify that
		 * tididx points to something sane.
		 */
		u32 tidval = req->tids[req->tididx],
			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
			tididx = EXP_TID_GET(tidval, IDX),
			tidctrl = EXP_TID_GET(tidval, CTRL),
			tidoff;
		__le32 kval = hdr->kdeth.ver_tid_offset;

		tidoff = KDETH_GET(kval, OFFSET) *
			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			   KDETH_OM_LARGE : KDETH_OM_SMALL);
		/*
		 * Expected receive packets have the following
		 * additional checks:
		 * - offset is not larger than the TID size
		 * - TIDCtrl values match between header and TID array
		 * - TID indexes match between header and TID array
		 */
		if ((tidoff + datalen > tidlen) ||
		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
		    KDETH_GET(kval, TID) != tididx)
			return -EINVAL;
	}
	return 0;
}

/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
	u32 val = be32_to_cpu(bthpsn),
		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
			0xffffffull),
		psn = val & mask;
	if (expct)
		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
	else
		psn = psn + frags;
	return psn & mask;
}

static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &tx->hdr;
	u8 omfactor; /* KDETH.OM */
	u16 pbclen;
	int ret;
	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

	/* Copy the header template to the request before modification */
	memcpy(hdr, &req->hdr, sizeof(*hdr));

	/*
	 * Check if the PBC and LRH length are mismatched. If so
	 * adjust both in the header.
	 */
	pbclen = le16_to_cpu(hdr->pbc[0]);
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		hdr->pbc[0] = cpu_to_le16(pbclen);
		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
		/*
		 * Third packet
		 * This is the first packet in the sequence that has
		 * a "static" size that can be used for the rest of
		 * the packets (besides the last one).
		 */
		if (unlikely(req->seqnum == 2)) {
			/*
			 * From this point on the lengths in both the
			 * PBC and LRH are the same until the last
			 * packet.
			 * Adjust the template so we don't have to update
			 * every packet
			 */
			req->hdr.pbc[0] = hdr->pbc[0];
			req->hdr.lrh[2] = hdr->lrh[2];
		}
	}
	/*
	 * We only have to modify the header if this is not the
	 * first packet in the request. Otherwise, we use the
	 * header given to us.
	 */
	if (unlikely(!req->seqnum)) {
		ret = check_header_template(req, hdr, lrhlen, datalen);
		if (ret)
			return ret;
		goto done;
	}

	hdr->bth[2] = cpu_to_be32(
		set_pkt_bth_psn(hdr->bth[2],
				(req_opcode(req->info.ctrl) == EXPECTED),
				req->seqnum));

	/* Set ACK request on last packet */
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		hdr->bth[2] |= cpu_to_be32(1UL << 31);

	/* Set the new offset */
	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
	/* Expected packets have to fill in the new TID information */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		tidval = req->tids[req->tididx];
		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
				 KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
				 KDETH_OM_SMALL_SHIFT;
		/* Set KDETH.TIDCtrl based on value for this TID. */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
			  EXP_TID_GET(tidval, CTRL));
		/* Set KDETH.TID based on value for this TID */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
			  EXP_TID_GET(tidval, IDX));
		/* Clear KDETH.SH when DISABLE_SH flag is set */
		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
		/*
		 * Set the KDETH.OFFSET and KDETH.OM based on size of
		 * transfer.
		 */
		trace_hfi1_sdma_user_tid_info(
			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
			req->tidoffset, req->tidoffset >> omfactor,
			omfactor != KDETH_OM_SMALL_SHIFT);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
			  req->tidoffset >> omfactor);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
			  omfactor != KDETH_OM_SMALL_SHIFT);
	}
done:
	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
				    req->info.comp_idx, hdr, tidval);
	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}

static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 datalen)
{
	u32 ahg[AHG_KDETH_ARRAY_SIZE];
	int idx = 0;
	u8 omfactor; /* KDETH.OM */
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &req->hdr;
	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
	size_t array_size = ARRAY_SIZE(ahg);

	if (PBC2LRH(pbclen) != lrhlen) {
		/* PBC.PbcLengthDWs */
		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
		if (idx < 0)
			return idx;
		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
				     (__force u16)cpu_to_be16(lrhlen >> 2));
		if (idx < 0)
			return idx;
	}

	/*
	 * Do the common updates
	 */
	/* BTH.PSN and BTH.A */
	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ?
		 0x7fffffff : 0xffffff);
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		val32 |= 1UL << 31;
	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
			     (__force u16)cpu_to_be16(val32 >> 16));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
			     (__force u16)cpu_to_be16(val32 & 0xffff));
	if (idx < 0)
		return idx;
	/* KDETH.Offset */
	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
			     (__force u16)cpu_to_le16(req->koffset >> 16));
	if (idx < 0)
		return idx;
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		__le16 val;

		tidval = req->tids[req->tididx];

		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx])
				return -EINVAL;
			tidval = req->tids[req->tididx];
		}
		omfactor = ((EXP_TID_GET(tidval, LEN) *
			     PAGE_SIZE) >=
			     KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
			     KDETH_OM_SMALL_SHIFT;
		/* KDETH.OM and KDETH.OFFSET (TID) */
		idx = ahg_header_set(
				ahg, idx, array_size, 7, 0, 16,
				((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
				 ((req->tidoffset >> omfactor)
				  & 0x7fff)));
		if (idx < 0)
			return idx;
		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
				  (EXP_TID_GET(tidval, IDX) & 0x3ff));

		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		} else {
			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		}

		idx = ahg_header_set(ahg, idx, array_size,
				     7, 16, 14, (__force u16)val);
		if (idx < 0)
			return idx;
	}

	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
					req->info.comp_idx, req->sde->this_idx,
					req->ahg_idx, ahg, idx, tidval);
	sdma_txinit_ahg(&tx->txreq,
			SDMA_TXREQ_F_USE_AHG,
			datalen, req->ahg_idx, idx,
			ahg, sizeof(req->hdr),
			user_sdma_txreq_cb);

	return idx;
}

/**
 * user_sdma_txreq_cb() - SDMA tx request completion callback.
 * @txreq: valid sdma tx request
 * @status: success/failure of request
 *
 * Called when the SDMA progress state machine gets notification that
 * the SDMA descriptors for this tx request have been processed by the
 * DMA engine. Called in interrupt context.
 * Only do work on completed sequences.
 */
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);
	struct user_sdma_request *req;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	enum hfi1_sdma_comp_state state = COMPLETE;

	if (!tx->req)
		return;

	req = tx->req;
	pq = req->pq;
	cq = req->cq;

	if (status != SDMA_TXREQ_S_OK) {
		SDMA_DBG(req, "SDMA completion with error %d",
			 status);
		WRITE_ONCE(req->has_error, 1);
		state = ERROR;
	}

	req->seqcomp = tx->seqnum;
	kmem_cache_free(pq->txreq_cache, tx);

	/* sequence isn't complete?  We are done */
	if (req->seqcomp != req->info.npkts - 1)
		return;

	user_sdma_free_request(req);
	set_comp_state(pq, cq, req->info.comp_idx, state, status);
	pq_update(pq);
}

static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
{
	if (atomic_dec_and_test(&pq->n_reqs))
		wake_up(&pq->wait);
}

static void user_sdma_free_request(struct user_sdma_request *req)
{
	if (!list_empty(&req->txps)) {
		struct sdma_txreq *t, *p;

		list_for_each_entry_safe(t, p, &req->txps, list) {
			struct user_sdma_txreq *tx =
				container_of(t, struct user_sdma_txreq, txreq);
			list_del_init(&t->list);
			sdma_txclean(req->pq->dd, t);
			kmem_cache_free(req->pq->txreq_cache, tx);
		}
	}

	kfree(req->tids);
	clear_bit(req->info.comp_idx, req->pq->req_in_use);
}

static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret)
{
	if (state == ERROR)
		cq->comps[idx].errcode = -ret;
	smp_wmb(); /* make sure errcode is visible first */
	cq->comps[idx].status = state;
	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
					idx, state, ret);
}

static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned int start, unsigned int npages)
{
	hfi1_release_user_pages(mm, pages + start, npages, false);
	kfree(pages);
}

static void free_system_node(struct sdma_mmu_node *node)
{
	if (node->npages) {
		unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0,
				   node->npages);
		atomic_sub(node->npages, &node->pq->n_locked);
	}
	kfree(node);
}

/*
 * Takes an additional kref on the returned rb_node to prevent it from being
 * released until after rb_node is assigned to an SDMA descriptor
 * (struct sdma_desc) under add_system_iovec_to_sdma_packet(), even if the
 * virtual address range for rb_node is invalidated between now and then.
 */
static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler,
					      unsigned long start,
					      unsigned long end)
{
	struct mmu_rb_node *rb_node;
	unsigned long flags;

	spin_lock_irqsave(&handler->lock, flags);
	rb_node = hfi1_mmu_rb_get_first(handler, start, (end - start));
	if (!rb_node) {
		spin_unlock_irqrestore(&handler->lock, flags);
		return NULL;
	}

	/* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */
	kref_get(&rb_node->refcount);
	spin_unlock_irqrestore(&handler->lock, flags);

	return container_of(rb_node, struct sdma_mmu_node, rb);
}

static int pin_system_pages(struct user_sdma_request *req,
			    uintptr_t start_address, size_t length,
			    struct sdma_mmu_node *node, int npages)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	int pinned, cleared;
	struct page **pages;

	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

retry:
	if (!hfi1_can_pin_pages(pq->dd, current->mm, atomic_read(&pq->n_locked),
				npages)) {
		SDMA_DBG(req, "Evicting: nlocked %u npages %u",
			 atomic_read(&pq->n_locked), npages);
		cleared = sdma_cache_evict(pq, npages);
		if (cleared >= npages)
			goto retry;
	}

	SDMA_DBG(req, "Acquire user pages start_address %lx node->npages %u npages %u",
		 start_address, node->npages, npages);
	pinned = hfi1_acquire_user_pages(current->mm, start_address, npages, 0,
					 pages);

	if (pinned < 0) {
		kfree(pages);
		SDMA_DBG(req, "pinned %d", pinned);
		return pinned;
	}
	if (pinned != npages) {
		unpin_vector_pages(current->mm, pages, node->npages, pinned);
		SDMA_DBG(req, "npages %u pinned %d", npages, pinned);
		return -EFAULT;
	}
	node->rb.addr = start_address;
	node->rb.len = length;
	node->pages = pages;
	node->npages = npages;
	atomic_add(pinned, &pq->n_locked);
	SDMA_DBG(req, "done. pinned %d", pinned);
	return 0;
}

/*
 * kref refcount on *node_p will be 2 on successful addition: one kref from
 * kref_init() for mmu_rb_handler and one kref to prevent *node_p from being
 * released until after *node_p is assigned to an SDMA descriptor (struct
 * sdma_desc) under add_system_iovec_to_sdma_packet(), even if the virtual
 * address range for *node_p is invalidated between now and then.
 */
static int add_system_pinning(struct user_sdma_request *req,
			      struct sdma_mmu_node **node_p,
			      unsigned long start, unsigned long len)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct sdma_mmu_node *node;
	int ret;

	node = kzalloc(sizeof(*node), GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	/* First kref "moves" to mmu_rb_handler */
	kref_init(&node->rb.refcount);

	/* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */
	kref_get(&node->rb.refcount);

	node->pq = pq;
	ret = pin_system_pages(req, start, len, node, PFN_DOWN(len));
	if (ret == 0) {
		ret = hfi1_mmu_rb_insert(pq->handler, &node->rb);
		if (ret)
			free_system_node(node);
		else
			*node_p = node;

		return ret;
	}

	kfree(node);
	return ret;
}

static int get_system_cache_entry(struct user_sdma_request *req,
				  struct sdma_mmu_node **node_p,
				  size_t req_start, size_t req_len)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	u64 start = ALIGN_DOWN(req_start, PAGE_SIZE);
	u64 end = PFN_ALIGN(req_start + req_len);
	struct mmu_rb_handler *handler = pq->handler;
	int ret;

	if ((end - start) == 0) {
		SDMA_DBG(req,
			 "Request for empty cache entry req_start %lx req_len %lx start %llx end %llx",
			 req_start, req_len, start, end);
		return -EINVAL;
	}

	SDMA_DBG(req, "req_start %lx req_len %lu", req_start, req_len);

	while (1) {
		struct sdma_mmu_node *node =
			find_system_node(handler, start, end);
		u64 prepend_len = 0;

		SDMA_DBG(req, "node %p start %llx end %llu", node, start, end);
		if (!node) {
			ret = add_system_pinning(req, node_p, start,
						 end - start);
			if (ret == -EEXIST) {
				/*
				 * Another execution context has inserted a
				 * conflicting entry first.
				 */
				continue;
			}
			return ret;
		}

		if (node->rb.addr <= start) {
			/*
			 * This entry covers at least part of the region. If it doesn't extend
			 * to the end, then this will be called again for the next segment.
			 */
			*node_p = node;
			return 0;
		}

		SDMA_DBG(req, "prepend: node->rb.addr %lx, node->rb.refcount %d",
			 node->rb.addr, kref_read(&node->rb.refcount));
		prepend_len = node->rb.addr - start;

		/*
		 * This node will not be returned, instead a new node
		 * will be. So release the reference.
		 */
		kref_put(&node->rb.refcount, hfi1_mmu_rb_release);

		/* Prepend a node to cover the beginning of the allocation */
		ret = add_system_pinning(req, node_p, start, prepend_len);
		if (ret == -EEXIST) {
			/*
			 * Another execution context has inserted a
			 * conflicting entry first.
			 */
			continue;
		}
		return ret;
	}
}

static void sdma_mmu_rb_node_get(void *ctx)
{
	struct mmu_rb_node *node = ctx;

	kref_get(&node->refcount);
}

static void sdma_mmu_rb_node_put(void *ctx)
{
	struct sdma_mmu_node *node = ctx;

	kref_put(&node->rb.refcount, hfi1_mmu_rb_release);
}

static int add_mapping_to_sdma_packet(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx,
				      struct sdma_mmu_node *cache_entry,
				      size_t start,
				      size_t from_this_cache_entry)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	unsigned int page_offset;
	unsigned int from_this_page;
	size_t page_index;
	void *ctx;
	int ret;

	/*
	 * Because the cache may be more fragmented than the memory that is
	 * being accessed, it's not strictly necessary to have a descriptor
	 * per cache entry.
	 */

	while (from_this_cache_entry) {
		page_index = PFN_DOWN(start - cache_entry->rb.addr);

		if (page_index >= cache_entry->npages) {
			SDMA_DBG(req,
				 "Request for page_index %zu >= cache_entry->npages %u",
				 page_index, cache_entry->npages);
			return -EINVAL;
		}

		page_offset = start - ALIGN_DOWN(start, PAGE_SIZE);
		from_this_page = PAGE_SIZE - page_offset;

		if (from_this_page < from_this_cache_entry) {
			ctx = NULL;
		} else {
			/*
			 * In the case they are equal the next line has no
			 * practical effect, but it's better to do a register
			 * to register copy than a conditional branch.
			 */
			from_this_page = from_this_cache_entry;
			ctx = cache_entry;
		}

		ret = sdma_txadd_page(pq->dd, &tx->txreq,
				      cache_entry->pages[page_index],
				      page_offset, from_this_page,
				      ctx,
				      sdma_mmu_rb_node_get,
				      sdma_mmu_rb_node_put);
		if (ret) {
			/*
			 * When there's a failure, the entire request is freed by
			 * user_sdma_send_pkts().
			 */
			SDMA_DBG(req,
				 "sdma_txadd_page failed %d page_index %lu page_offset %u from_this_page %u",
				 ret, page_index, page_offset, from_this_page);
			return ret;
		}
		start += from_this_page;
		from_this_cache_entry -= from_this_page;
	}
	return 0;
}

static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req,
					   struct user_sdma_txreq *tx,
					   struct user_sdma_iovec *iovec,
					   size_t from_this_iovec)
{
	while (from_this_iovec > 0) {
		struct sdma_mmu_node *cache_entry;
		size_t from_this_cache_entry;
		size_t start;
		int ret;

		start = (uintptr_t)iovec->iov.iov_base + iovec->offset;
		ret = get_system_cache_entry(req, &cache_entry, start,
					     from_this_iovec);
		if (ret) {
			SDMA_DBG(req, "pin system segment failed %d", ret);
			return ret;
		}

		from_this_cache_entry = cache_entry->rb.len - (start - cache_entry->rb.addr);
		if (from_this_cache_entry > from_this_iovec)
			from_this_cache_entry = from_this_iovec;

		ret = add_mapping_to_sdma_packet(req, tx, cache_entry, start,
						 from_this_cache_entry);

		/*
		 * Done adding cache_entry to zero or more sdma_desc. Can
		 * kref_put() the "safety" kref taken under
		 * get_system_cache_entry().
		 */
		kref_put(&cache_entry->rb.refcount, hfi1_mmu_rb_release);

		if (ret) {
			SDMA_DBG(req, "add system segment failed %d", ret);
			return ret;
		}

		iovec->offset += from_this_cache_entry;
		from_this_iovec -= from_this_cache_entry;
	}

	return 0;
}

static int add_system_pages_to_sdma_packet(struct user_sdma_request *req,
					   struct user_sdma_txreq *tx,
					   struct user_sdma_iovec *iovec,
					   u32 *pkt_data_remaining)
{
	size_t remaining_to_add = *pkt_data_remaining;
	/*
	 * Walk through iovec entries, ensure the associated pages
	 * are pinned and mapped, add data to the packet until no more
	 * data remains to be added.
	 */
	while (remaining_to_add > 0) {
		struct user_sdma_iovec *cur_iovec;
		size_t from_this_iovec;
		int ret;

		cur_iovec = iovec;
		from_this_iovec = iovec->iov.iov_len - iovec->offset;

		if (from_this_iovec > remaining_to_add) {
			from_this_iovec = remaining_to_add;
		} else {
			/* The current iovec entry will be consumed by this pass. */
			req->iov_idx++;
			iovec++;
		}

		ret = add_system_iovec_to_sdma_packet(req, tx, cur_iovec,
						      from_this_iovec);
		if (ret)
			return ret;

		remaining_to_add -= from_this_iovec;
	}
	*pkt_data_remaining = remaining_to_add;

	return 0;
}

static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len)
{
	return (bool)(node->addr == addr);
}

/*
 * Return 1 to remove the node from the rb tree and call the remove op.
 *
 * Called with the rb tree lock held.
 */
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *evict_arg, bool *stop)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);
	struct evict_data *evict_data = evict_arg;

	/* this node will be evicted, add its pages to our count */
	evict_data->cleared += node->npages;

	/* have enough pages been cleared? */
	if (evict_data->cleared >= evict_data->target)
		*stop = true;

	return 1; /* remove this node */
}

static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	free_system_node(node);
}