1 /* 2 * Copyright(c) 2015 - 2018 Intel Corporation. 3 * 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 * redistributing this file, you may do so under either license. 6 * 7 * GPL LICENSE SUMMARY 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of version 2 of the GNU General Public License as 11 * published by the Free Software Foundation. 12 * 13 * This program is distributed in the hope that it will be useful, but 14 * WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * General Public License for more details. 17 * 18 * BSD LICENSE 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 24 * - Redistributions of source code must retain the above copyright 25 * notice, this list of conditions and the following disclaimer. 26 * - Redistributions in binary form must reproduce the above copyright 27 * notice, this list of conditions and the following disclaimer in 28 * the documentation and/or other materials provided with the 29 * distribution. 30 * - Neither the name of Intel Corporation nor the names of its 31 * contributors may be used to endorse or promote products derived 32 * from this software without specific prior written permission. 33 * 34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 45 * 46 */ 47 #include <linux/mm.h> 48 #include <linux/types.h> 49 #include <linux/device.h> 50 #include <linux/dmapool.h> 51 #include <linux/slab.h> 52 #include <linux/list.h> 53 #include <linux/highmem.h> 54 #include <linux/io.h> 55 #include <linux/uio.h> 56 #include <linux/rbtree.h> 57 #include <linux/spinlock.h> 58 #include <linux/delay.h> 59 #include <linux/kthread.h> 60 #include <linux/mmu_context.h> 61 #include <linux/module.h> 62 #include <linux/vmalloc.h> 63 #include <linux/string.h> 64 65 #include "hfi.h" 66 #include "sdma.h" 67 #include "mmu_rb.h" 68 #include "user_sdma.h" 69 #include "verbs.h" /* for the headers */ 70 #include "common.h" /* for struct hfi1_tid_info */ 71 #include "trace.h" 72 73 static uint hfi1_sdma_comp_ring_size = 128; 74 module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO); 75 MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128"); 76 77 static unsigned initial_pkt_count = 8; 78 79 static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts); 80 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status); 81 static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq); 82 static void user_sdma_free_request(struct user_sdma_request *req, bool unpin); 83 static int pin_vector_pages(struct user_sdma_request *req, 84 struct user_sdma_iovec *iovec); 85 static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, 86 unsigned start, unsigned npages); 87 static int check_header_template(struct user_sdma_request *req, 88 struct hfi1_pkt_header *hdr, u32 lrhlen, 89 u32 datalen); 90 static int set_txreq_header(struct user_sdma_request *req, 91 struct user_sdma_txreq *tx, u32 datalen); 92 static int set_txreq_header_ahg(struct user_sdma_request *req, 93 struct user_sdma_txreq *tx, u32 len); 94 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, 95 struct hfi1_user_sdma_comp_q *cq, 96 u16 idx, enum hfi1_sdma_comp_state state, 97 int ret); 98 static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags); 99 static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); 100 101 static int defer_packet_queue( 102 struct sdma_engine *sde, 103 struct iowait_work *wait, 104 struct sdma_txreq *txreq, 105 uint seq, 106 bool pkts_sent); 107 static void activate_packet_queue(struct iowait *wait, int reason); 108 static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, 109 unsigned long len); 110 static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode); 111 static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, 112 void *arg2, bool *stop); 113 static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode); 114 static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode); 115 116 static struct mmu_rb_ops sdma_rb_ops = { 117 .filter = sdma_rb_filter, 118 .insert = sdma_rb_insert, 119 .evict = sdma_rb_evict, 120 .remove = sdma_rb_remove, 121 .invalidate = sdma_rb_invalidate 122 }; 123 124 static int defer_packet_queue( 125 struct sdma_engine *sde, 126 struct iowait_work *wait, 127 struct sdma_txreq *txreq, 128 uint seq, 129 bool pkts_sent) 130 { 131 struct hfi1_user_sdma_pkt_q *pq = 132 container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy); 133 struct hfi1_ibdev *dev = &pq->dd->verbs_dev; 134 struct user_sdma_txreq *tx = 135 container_of(txreq, struct user_sdma_txreq, txreq); 136 137 if (sdma_progress(sde, seq, txreq)) { 138 if (tx->busycount++ < MAX_DEFER_RETRY_COUNT) 139 goto eagain; 140 } 141 /* 142 * We are assuming that if the list is enqueued somewhere, it 143 * is to the dmawait list since that is the only place where 144 * it is supposed to be enqueued. 145 */ 146 xchg(&pq->state, SDMA_PKT_Q_DEFERRED); 147 write_seqlock(&dev->iowait_lock); 148 if (list_empty(&pq->busy.list)) 149 iowait_queue(pkts_sent, &pq->busy, &sde->dmawait); 150 write_sequnlock(&dev->iowait_lock); 151 return -EBUSY; 152 eagain: 153 return -EAGAIN; 154 } 155 156 static void activate_packet_queue(struct iowait *wait, int reason) 157 { 158 struct hfi1_user_sdma_pkt_q *pq = 159 container_of(wait, struct hfi1_user_sdma_pkt_q, busy); 160 xchg(&pq->state, SDMA_PKT_Q_ACTIVE); 161 wake_up(&wait->wait_dma); 162 }; 163 164 int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, 165 struct hfi1_filedata *fd) 166 { 167 int ret = -ENOMEM; 168 char buf[64]; 169 struct hfi1_devdata *dd; 170 struct hfi1_user_sdma_comp_q *cq; 171 struct hfi1_user_sdma_pkt_q *pq; 172 173 if (!uctxt || !fd) 174 return -EBADF; 175 176 if (!hfi1_sdma_comp_ring_size) 177 return -EINVAL; 178 179 dd = uctxt->dd; 180 181 pq = kzalloc(sizeof(*pq), GFP_KERNEL); 182 if (!pq) 183 return -ENOMEM; 184 185 pq->dd = dd; 186 pq->ctxt = uctxt->ctxt; 187 pq->subctxt = fd->subctxt; 188 pq->n_max_reqs = hfi1_sdma_comp_ring_size; 189 atomic_set(&pq->n_reqs, 0); 190 init_waitqueue_head(&pq->wait); 191 atomic_set(&pq->n_locked, 0); 192 pq->mm = fd->mm; 193 194 iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue, 195 activate_packet_queue, NULL); 196 pq->reqidx = 0; 197 198 pq->reqs = kcalloc(hfi1_sdma_comp_ring_size, 199 sizeof(*pq->reqs), 200 GFP_KERNEL); 201 if (!pq->reqs) 202 goto pq_reqs_nomem; 203 204 pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size), 205 sizeof(*pq->req_in_use), 206 GFP_KERNEL); 207 if (!pq->req_in_use) 208 goto pq_reqs_no_in_use; 209 210 snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt, 211 fd->subctxt); 212 pq->txreq_cache = kmem_cache_create(buf, 213 sizeof(struct user_sdma_txreq), 214 L1_CACHE_BYTES, 215 SLAB_HWCACHE_ALIGN, 216 NULL); 217 if (!pq->txreq_cache) { 218 dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n", 219 uctxt->ctxt); 220 goto pq_txreq_nomem; 221 } 222 223 cq = kzalloc(sizeof(*cq), GFP_KERNEL); 224 if (!cq) 225 goto cq_nomem; 226 227 cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps) 228 * hfi1_sdma_comp_ring_size)); 229 if (!cq->comps) 230 goto cq_comps_nomem; 231 232 cq->nentries = hfi1_sdma_comp_ring_size; 233 234 ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq, 235 &pq->handler); 236 if (ret) { 237 dd_dev_err(dd, "Failed to register with MMU %d", ret); 238 goto pq_mmu_fail; 239 } 240 241 fd->pq = pq; 242 fd->cq = cq; 243 244 return 0; 245 246 pq_mmu_fail: 247 vfree(cq->comps); 248 cq_comps_nomem: 249 kfree(cq); 250 cq_nomem: 251 kmem_cache_destroy(pq->txreq_cache); 252 pq_txreq_nomem: 253 kfree(pq->req_in_use); 254 pq_reqs_no_in_use: 255 kfree(pq->reqs); 256 pq_reqs_nomem: 257 kfree(pq); 258 259 return ret; 260 } 261 262 int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, 263 struct hfi1_ctxtdata *uctxt) 264 { 265 struct hfi1_user_sdma_pkt_q *pq; 266 267 trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt); 268 269 pq = fd->pq; 270 if (pq) { 271 if (pq->handler) 272 hfi1_mmu_rb_unregister(pq->handler); 273 iowait_sdma_drain(&pq->busy); 274 /* Wait until all requests have been freed. */ 275 wait_event_interruptible( 276 pq->wait, 277 !atomic_read(&pq->n_reqs)); 278 kfree(pq->reqs); 279 kfree(pq->req_in_use); 280 kmem_cache_destroy(pq->txreq_cache); 281 kfree(pq); 282 fd->pq = NULL; 283 } 284 if (fd->cq) { 285 vfree(fd->cq->comps); 286 kfree(fd->cq); 287 fd->cq = NULL; 288 } 289 return 0; 290 } 291 292 static u8 dlid_to_selector(u16 dlid) 293 { 294 static u8 mapping[256]; 295 static int initialized; 296 static u8 next; 297 int hash; 298 299 if (!initialized) { 300 memset(mapping, 0xFF, 256); 301 initialized = 1; 302 } 303 304 hash = ((dlid >> 8) ^ dlid) & 0xFF; 305 if (mapping[hash] == 0xFF) { 306 mapping[hash] = next; 307 next = (next + 1) & 0x7F; 308 } 309 310 return mapping[hash]; 311 } 312 313 /** 314 * hfi1_user_sdma_process_request() - Process and start a user sdma request 315 * @fd: valid file descriptor 316 * @iovec: array of io vectors to process 317 * @dim: overall iovec array size 318 * @count: number of io vector array entries processed 319 */ 320 int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, 321 struct iovec *iovec, unsigned long dim, 322 unsigned long *count) 323 { 324 int ret = 0, i; 325 struct hfi1_ctxtdata *uctxt = fd->uctxt; 326 struct hfi1_user_sdma_pkt_q *pq = fd->pq; 327 struct hfi1_user_sdma_comp_q *cq = fd->cq; 328 struct hfi1_devdata *dd = pq->dd; 329 unsigned long idx = 0; 330 u8 pcount = initial_pkt_count; 331 struct sdma_req_info info; 332 struct user_sdma_request *req; 333 u8 opcode, sc, vl; 334 u16 pkey; 335 u32 slid; 336 u16 dlid; 337 u32 selector; 338 339 if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) { 340 hfi1_cdbg( 341 SDMA, 342 "[%u:%u:%u] First vector not big enough for header %lu/%lu", 343 dd->unit, uctxt->ctxt, fd->subctxt, 344 iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr)); 345 return -EINVAL; 346 } 347 ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info)); 348 if (ret) { 349 hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)", 350 dd->unit, uctxt->ctxt, fd->subctxt, ret); 351 return -EFAULT; 352 } 353 354 trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt, 355 (u16 *)&info); 356 if (info.comp_idx >= hfi1_sdma_comp_ring_size) { 357 hfi1_cdbg(SDMA, 358 "[%u:%u:%u:%u] Invalid comp index", 359 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); 360 return -EINVAL; 361 } 362 363 /* 364 * Sanity check the header io vector count. Need at least 1 vector 365 * (header) and cannot be larger than the actual io vector count. 366 */ 367 if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) { 368 hfi1_cdbg(SDMA, 369 "[%u:%u:%u:%u] Invalid iov count %d, dim %ld", 370 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx, 371 req_iovcnt(info.ctrl), dim); 372 return -EINVAL; 373 } 374 375 if (!info.fragsize) { 376 hfi1_cdbg(SDMA, 377 "[%u:%u:%u:%u] Request does not specify fragsize", 378 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); 379 return -EINVAL; 380 } 381 382 /* Try to claim the request. */ 383 if (test_and_set_bit(info.comp_idx, pq->req_in_use)) { 384 hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use", 385 dd->unit, uctxt->ctxt, fd->subctxt, 386 info.comp_idx); 387 return -EBADSLT; 388 } 389 /* 390 * All safety checks have been done and this request has been claimed. 391 */ 392 trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt, 393 info.comp_idx); 394 req = pq->reqs + info.comp_idx; 395 req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */ 396 req->data_len = 0; 397 req->pq = pq; 398 req->cq = cq; 399 req->ahg_idx = -1; 400 req->iov_idx = 0; 401 req->sent = 0; 402 req->seqnum = 0; 403 req->seqcomp = 0; 404 req->seqsubmitted = 0; 405 req->tids = NULL; 406 req->has_error = 0; 407 INIT_LIST_HEAD(&req->txps); 408 409 memcpy(&req->info, &info, sizeof(info)); 410 411 /* The request is initialized, count it */ 412 atomic_inc(&pq->n_reqs); 413 414 if (req_opcode(info.ctrl) == EXPECTED) { 415 /* expected must have a TID info and at least one data vector */ 416 if (req->data_iovs < 2) { 417 SDMA_DBG(req, 418 "Not enough vectors for expected request"); 419 ret = -EINVAL; 420 goto free_req; 421 } 422 req->data_iovs--; 423 } 424 425 if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) { 426 SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs, 427 MAX_VECTORS_PER_REQ); 428 ret = -EINVAL; 429 goto free_req; 430 } 431 /* Copy the header from the user buffer */ 432 ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info), 433 sizeof(req->hdr)); 434 if (ret) { 435 SDMA_DBG(req, "Failed to copy header template (%d)", ret); 436 ret = -EFAULT; 437 goto free_req; 438 } 439 440 /* If Static rate control is not enabled, sanitize the header. */ 441 if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL)) 442 req->hdr.pbc[2] = 0; 443 444 /* Validate the opcode. Do not trust packets from user space blindly. */ 445 opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff; 446 if ((opcode & USER_OPCODE_CHECK_MASK) != 447 USER_OPCODE_CHECK_VAL) { 448 SDMA_DBG(req, "Invalid opcode (%d)", opcode); 449 ret = -EINVAL; 450 goto free_req; 451 } 452 /* 453 * Validate the vl. Do not trust packets from user space blindly. 454 * VL comes from PBC, SC comes from LRH, and the VL needs to 455 * match the SC look up. 456 */ 457 vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF; 458 sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) | 459 (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4)); 460 if (vl >= dd->pport->vls_operational || 461 vl != sc_to_vlt(dd, sc)) { 462 SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl); 463 ret = -EINVAL; 464 goto free_req; 465 } 466 467 /* Checking P_KEY for requests from user-space */ 468 pkey = (u16)be32_to_cpu(req->hdr.bth[0]); 469 slid = be16_to_cpu(req->hdr.lrh[3]); 470 if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) { 471 ret = -EINVAL; 472 goto free_req; 473 } 474 475 /* 476 * Also should check the BTH.lnh. If it says the next header is GRH then 477 * the RXE parsing will be off and will land in the middle of the KDETH 478 * or miss it entirely. 479 */ 480 if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) { 481 SDMA_DBG(req, "User tried to pass in a GRH"); 482 ret = -EINVAL; 483 goto free_req; 484 } 485 486 req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]); 487 /* 488 * Calculate the initial TID offset based on the values of 489 * KDETH.OFFSET and KDETH.OM that are passed in. 490 */ 491 req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) * 492 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? 493 KDETH_OM_LARGE : KDETH_OM_SMALL); 494 trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt, 495 info.comp_idx, req->tidoffset); 496 idx++; 497 498 /* Save all the IO vector structures */ 499 for (i = 0; i < req->data_iovs; i++) { 500 req->iovs[i].offset = 0; 501 INIT_LIST_HEAD(&req->iovs[i].list); 502 memcpy(&req->iovs[i].iov, 503 iovec + idx++, 504 sizeof(req->iovs[i].iov)); 505 ret = pin_vector_pages(req, &req->iovs[i]); 506 if (ret) { 507 req->data_iovs = i; 508 goto free_req; 509 } 510 req->data_len += req->iovs[i].iov.iov_len; 511 } 512 trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt, 513 info.comp_idx, req->data_len); 514 if (pcount > req->info.npkts) 515 pcount = req->info.npkts; 516 /* 517 * Copy any TID info 518 * User space will provide the TID info only when the 519 * request type is EXPECTED. This is true even if there is 520 * only one packet in the request and the header is already 521 * setup. The reason for the singular TID case is that the 522 * driver needs to perform safety checks. 523 */ 524 if (req_opcode(req->info.ctrl) == EXPECTED) { 525 u16 ntids = iovec[idx].iov_len / sizeof(*req->tids); 526 u32 *tmp; 527 528 if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) { 529 ret = -EINVAL; 530 goto free_req; 531 } 532 533 /* 534 * We have to copy all of the tids because they may vary 535 * in size and, therefore, the TID count might not be 536 * equal to the pkt count. However, there is no way to 537 * tell at this point. 538 */ 539 tmp = memdup_user(iovec[idx].iov_base, 540 ntids * sizeof(*req->tids)); 541 if (IS_ERR(tmp)) { 542 ret = PTR_ERR(tmp); 543 SDMA_DBG(req, "Failed to copy %d TIDs (%d)", 544 ntids, ret); 545 goto free_req; 546 } 547 req->tids = tmp; 548 req->n_tids = ntids; 549 req->tididx = 0; 550 idx++; 551 } 552 553 dlid = be16_to_cpu(req->hdr.lrh[1]); 554 selector = dlid_to_selector(dlid); 555 selector += uctxt->ctxt + fd->subctxt; 556 req->sde = sdma_select_user_engine(dd, selector, vl); 557 558 if (!req->sde || !sdma_running(req->sde)) { 559 ret = -ECOMM; 560 goto free_req; 561 } 562 563 /* We don't need an AHG entry if the request contains only one packet */ 564 if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) 565 req->ahg_idx = sdma_ahg_alloc(req->sde); 566 567 set_comp_state(pq, cq, info.comp_idx, QUEUED, 0); 568 pq->state = SDMA_PKT_Q_ACTIVE; 569 /* Send the first N packets in the request to buy us some time */ 570 ret = user_sdma_send_pkts(req, pcount); 571 if (unlikely(ret < 0 && ret != -EBUSY)) 572 goto free_req; 573 574 /* 575 * This is a somewhat blocking send implementation. 576 * The driver will block the caller until all packets of the 577 * request have been submitted to the SDMA engine. However, it 578 * will not wait for send completions. 579 */ 580 while (req->seqsubmitted != req->info.npkts) { 581 ret = user_sdma_send_pkts(req, pcount); 582 if (ret < 0) { 583 if (ret != -EBUSY) 584 goto free_req; 585 wait_event_interruptible_timeout( 586 pq->busy.wait_dma, 587 (pq->state == SDMA_PKT_Q_ACTIVE), 588 msecs_to_jiffies( 589 SDMA_IOWAIT_TIMEOUT)); 590 } 591 } 592 *count += idx; 593 return 0; 594 free_req: 595 /* 596 * If the submitted seqsubmitted == npkts, the completion routine 597 * controls the final state. If sequbmitted < npkts, wait for any 598 * outstanding packets to finish before cleaning up. 599 */ 600 if (req->seqsubmitted < req->info.npkts) { 601 if (req->seqsubmitted) 602 wait_event(pq->busy.wait_dma, 603 (req->seqcomp == req->seqsubmitted - 1)); 604 user_sdma_free_request(req, true); 605 pq_update(pq); 606 set_comp_state(pq, cq, info.comp_idx, ERROR, ret); 607 } 608 return ret; 609 } 610 611 static inline u32 compute_data_length(struct user_sdma_request *req, 612 struct user_sdma_txreq *tx) 613 { 614 /* 615 * Determine the proper size of the packet data. 616 * The size of the data of the first packet is in the header 617 * template. However, it includes the header and ICRC, which need 618 * to be subtracted. 619 * The minimum representable packet data length in a header is 4 bytes, 620 * therefore, when the data length request is less than 4 bytes, there's 621 * only one packet, and the packet data length is equal to that of the 622 * request data length. 623 * The size of the remaining packets is the minimum of the frag 624 * size (MTU) or remaining data in the request. 625 */ 626 u32 len; 627 628 if (!req->seqnum) { 629 if (req->data_len < sizeof(u32)) 630 len = req->data_len; 631 else 632 len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - 633 (sizeof(tx->hdr) - 4)); 634 } else if (req_opcode(req->info.ctrl) == EXPECTED) { 635 u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) * 636 PAGE_SIZE; 637 /* 638 * Get the data length based on the remaining space in the 639 * TID pair. 640 */ 641 len = min(tidlen - req->tidoffset, (u32)req->info.fragsize); 642 /* If we've filled up the TID pair, move to the next one. */ 643 if (unlikely(!len) && ++req->tididx < req->n_tids && 644 req->tids[req->tididx]) { 645 tidlen = EXP_TID_GET(req->tids[req->tididx], 646 LEN) * PAGE_SIZE; 647 req->tidoffset = 0; 648 len = min_t(u32, tidlen, req->info.fragsize); 649 } 650 /* 651 * Since the TID pairs map entire pages, make sure that we 652 * are not going to try to send more data that we have 653 * remaining. 654 */ 655 len = min(len, req->data_len - req->sent); 656 } else { 657 len = min(req->data_len - req->sent, (u32)req->info.fragsize); 658 } 659 trace_hfi1_sdma_user_compute_length(req->pq->dd, 660 req->pq->ctxt, 661 req->pq->subctxt, 662 req->info.comp_idx, 663 len); 664 return len; 665 } 666 667 static inline u32 pad_len(u32 len) 668 { 669 if (len & (sizeof(u32) - 1)) 670 len += sizeof(u32) - (len & (sizeof(u32) - 1)); 671 return len; 672 } 673 674 static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len) 675 { 676 /* (Size of complete header - size of PBC) + 4B ICRC + data length */ 677 return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len); 678 } 679 680 static int user_sdma_txadd_ahg(struct user_sdma_request *req, 681 struct user_sdma_txreq *tx, 682 u32 datalen) 683 { 684 int ret; 685 u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); 686 u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen)); 687 struct hfi1_user_sdma_pkt_q *pq = req->pq; 688 689 /* 690 * Copy the request header into the tx header 691 * because the HW needs a cacheline-aligned 692 * address. 693 * This copy can be optimized out if the hdr 694 * member of user_sdma_request were also 695 * cacheline aligned. 696 */ 697 memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr)); 698 if (PBC2LRH(pbclen) != lrhlen) { 699 pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); 700 tx->hdr.pbc[0] = cpu_to_le16(pbclen); 701 } 702 ret = check_header_template(req, &tx->hdr, lrhlen, datalen); 703 if (ret) 704 return ret; 705 ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY, 706 sizeof(tx->hdr) + datalen, req->ahg_idx, 707 0, NULL, 0, user_sdma_txreq_cb); 708 if (ret) 709 return ret; 710 ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr)); 711 if (ret) 712 sdma_txclean(pq->dd, &tx->txreq); 713 return ret; 714 } 715 716 static int user_sdma_txadd(struct user_sdma_request *req, 717 struct user_sdma_txreq *tx, 718 struct user_sdma_iovec *iovec, u32 datalen, 719 u32 *queued_ptr, u32 *data_sent_ptr, 720 u64 *iov_offset_ptr) 721 { 722 int ret; 723 unsigned int pageidx, len; 724 unsigned long base, offset; 725 u64 iov_offset = *iov_offset_ptr; 726 u32 queued = *queued_ptr, data_sent = *data_sent_ptr; 727 struct hfi1_user_sdma_pkt_q *pq = req->pq; 728 729 base = (unsigned long)iovec->iov.iov_base; 730 offset = offset_in_page(base + iovec->offset + iov_offset); 731 pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >> 732 PAGE_SHIFT); 733 len = offset + req->info.fragsize > PAGE_SIZE ? 734 PAGE_SIZE - offset : req->info.fragsize; 735 len = min((datalen - queued), len); 736 ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx], 737 offset, len); 738 if (ret) { 739 SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret); 740 return ret; 741 } 742 iov_offset += len; 743 queued += len; 744 data_sent += len; 745 if (unlikely(queued < datalen && pageidx == iovec->npages && 746 req->iov_idx < req->data_iovs - 1)) { 747 iovec->offset += iov_offset; 748 iovec = &req->iovs[++req->iov_idx]; 749 iov_offset = 0; 750 } 751 752 *queued_ptr = queued; 753 *data_sent_ptr = data_sent; 754 *iov_offset_ptr = iov_offset; 755 return ret; 756 } 757 758 static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts) 759 { 760 int ret = 0; 761 u16 count; 762 unsigned npkts = 0; 763 struct user_sdma_txreq *tx = NULL; 764 struct hfi1_user_sdma_pkt_q *pq = NULL; 765 struct user_sdma_iovec *iovec = NULL; 766 767 if (!req->pq) 768 return -EINVAL; 769 770 pq = req->pq; 771 772 /* If tx completion has reported an error, we are done. */ 773 if (READ_ONCE(req->has_error)) 774 return -EFAULT; 775 776 /* 777 * Check if we might have sent the entire request already 778 */ 779 if (unlikely(req->seqnum == req->info.npkts)) { 780 if (!list_empty(&req->txps)) 781 goto dosend; 782 return ret; 783 } 784 785 if (!maxpkts || maxpkts > req->info.npkts - req->seqnum) 786 maxpkts = req->info.npkts - req->seqnum; 787 788 while (npkts < maxpkts) { 789 u32 datalen = 0, queued = 0, data_sent = 0; 790 u64 iov_offset = 0; 791 792 /* 793 * Check whether any of the completions have come back 794 * with errors. If so, we are not going to process any 795 * more packets from this request. 796 */ 797 if (READ_ONCE(req->has_error)) 798 return -EFAULT; 799 800 tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL); 801 if (!tx) 802 return -ENOMEM; 803 804 tx->flags = 0; 805 tx->req = req; 806 tx->busycount = 0; 807 INIT_LIST_HEAD(&tx->list); 808 809 /* 810 * For the last packet set the ACK request 811 * and disable header suppression. 812 */ 813 if (req->seqnum == req->info.npkts - 1) 814 tx->flags |= (TXREQ_FLAGS_REQ_ACK | 815 TXREQ_FLAGS_REQ_DISABLE_SH); 816 817 /* 818 * Calculate the payload size - this is min of the fragment 819 * (MTU) size or the remaining bytes in the request but only 820 * if we have payload data. 821 */ 822 if (req->data_len) { 823 iovec = &req->iovs[req->iov_idx]; 824 if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) { 825 if (++req->iov_idx == req->data_iovs) { 826 ret = -EFAULT; 827 goto free_tx; 828 } 829 iovec = &req->iovs[req->iov_idx]; 830 WARN_ON(iovec->offset); 831 } 832 833 datalen = compute_data_length(req, tx); 834 835 /* 836 * Disable header suppression for the payload <= 8DWS. 837 * If there is an uncorrectable error in the receive 838 * data FIFO when the received payload size is less than 839 * or equal to 8DWS then the RxDmaDataFifoRdUncErr is 840 * not reported.There is set RHF.EccErr if the header 841 * is not suppressed. 842 */ 843 if (!datalen) { 844 SDMA_DBG(req, 845 "Request has data but pkt len is 0"); 846 ret = -EFAULT; 847 goto free_tx; 848 } else if (datalen <= 32) { 849 tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH; 850 } 851 } 852 853 if (req->ahg_idx >= 0) { 854 if (!req->seqnum) { 855 ret = user_sdma_txadd_ahg(req, tx, datalen); 856 if (ret) 857 goto free_tx; 858 } else { 859 int changes; 860 861 changes = set_txreq_header_ahg(req, tx, 862 datalen); 863 if (changes < 0) { 864 ret = changes; 865 goto free_tx; 866 } 867 } 868 } else { 869 ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) + 870 datalen, user_sdma_txreq_cb); 871 if (ret) 872 goto free_tx; 873 /* 874 * Modify the header for this packet. This only needs 875 * to be done if we are not going to use AHG. Otherwise, 876 * the HW will do it based on the changes we gave it 877 * during sdma_txinit_ahg(). 878 */ 879 ret = set_txreq_header(req, tx, datalen); 880 if (ret) 881 goto free_txreq; 882 } 883 884 /* 885 * If the request contains any data vectors, add up to 886 * fragsize bytes to the descriptor. 887 */ 888 while (queued < datalen && 889 (req->sent + data_sent) < req->data_len) { 890 ret = user_sdma_txadd(req, tx, iovec, datalen, 891 &queued, &data_sent, &iov_offset); 892 if (ret) 893 goto free_txreq; 894 } 895 /* 896 * The txreq was submitted successfully so we can update 897 * the counters. 898 */ 899 req->koffset += datalen; 900 if (req_opcode(req->info.ctrl) == EXPECTED) 901 req->tidoffset += datalen; 902 req->sent += data_sent; 903 if (req->data_len) 904 iovec->offset += iov_offset; 905 list_add_tail(&tx->txreq.list, &req->txps); 906 /* 907 * It is important to increment this here as it is used to 908 * generate the BTH.PSN and, therefore, can't be bulk-updated 909 * outside of the loop. 910 */ 911 tx->seqnum = req->seqnum++; 912 npkts++; 913 } 914 dosend: 915 ret = sdma_send_txlist(req->sde, 916 iowait_get_ib_work(&pq->busy), 917 &req->txps, &count); 918 req->seqsubmitted += count; 919 if (req->seqsubmitted == req->info.npkts) { 920 /* 921 * The txreq has already been submitted to the HW queue 922 * so we can free the AHG entry now. Corruption will not 923 * happen due to the sequential manner in which 924 * descriptors are processed. 925 */ 926 if (req->ahg_idx >= 0) 927 sdma_ahg_free(req->sde, req->ahg_idx); 928 } 929 return ret; 930 931 free_txreq: 932 sdma_txclean(pq->dd, &tx->txreq); 933 free_tx: 934 kmem_cache_free(pq->txreq_cache, tx); 935 return ret; 936 } 937 938 static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) 939 { 940 struct evict_data evict_data; 941 942 evict_data.cleared = 0; 943 evict_data.target = npages; 944 hfi1_mmu_rb_evict(pq->handler, &evict_data); 945 return evict_data.cleared; 946 } 947 948 static int pin_sdma_pages(struct user_sdma_request *req, 949 struct user_sdma_iovec *iovec, 950 struct sdma_mmu_node *node, 951 int npages) 952 { 953 int pinned, cleared; 954 struct page **pages; 955 struct hfi1_user_sdma_pkt_q *pq = req->pq; 956 957 pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); 958 if (!pages) 959 return -ENOMEM; 960 memcpy(pages, node->pages, node->npages * sizeof(*pages)); 961 962 npages -= node->npages; 963 retry: 964 if (!hfi1_can_pin_pages(pq->dd, pq->mm, 965 atomic_read(&pq->n_locked), npages)) { 966 cleared = sdma_cache_evict(pq, npages); 967 if (cleared >= npages) 968 goto retry; 969 } 970 pinned = hfi1_acquire_user_pages(pq->mm, 971 ((unsigned long)iovec->iov.iov_base + 972 (node->npages * PAGE_SIZE)), npages, 0, 973 pages + node->npages); 974 if (pinned < 0) { 975 kfree(pages); 976 return pinned; 977 } 978 if (pinned != npages) { 979 unpin_vector_pages(pq->mm, pages, node->npages, pinned); 980 return -EFAULT; 981 } 982 kfree(node->pages); 983 node->rb.len = iovec->iov.iov_len; 984 node->pages = pages; 985 atomic_add(pinned, &pq->n_locked); 986 return pinned; 987 } 988 989 static void unpin_sdma_pages(struct sdma_mmu_node *node) 990 { 991 if (node->npages) { 992 unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages); 993 atomic_sub(node->npages, &node->pq->n_locked); 994 } 995 } 996 997 static int pin_vector_pages(struct user_sdma_request *req, 998 struct user_sdma_iovec *iovec) 999 { 1000 int ret = 0, pinned, npages; 1001 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1002 struct sdma_mmu_node *node = NULL; 1003 struct mmu_rb_node *rb_node; 1004 struct iovec *iov; 1005 bool extracted; 1006 1007 extracted = 1008 hfi1_mmu_rb_remove_unless_exact(pq->handler, 1009 (unsigned long) 1010 iovec->iov.iov_base, 1011 iovec->iov.iov_len, &rb_node); 1012 if (rb_node) { 1013 node = container_of(rb_node, struct sdma_mmu_node, rb); 1014 if (!extracted) { 1015 atomic_inc(&node->refcount); 1016 iovec->pages = node->pages; 1017 iovec->npages = node->npages; 1018 iovec->node = node; 1019 return 0; 1020 } 1021 } 1022 1023 if (!node) { 1024 node = kzalloc(sizeof(*node), GFP_KERNEL); 1025 if (!node) 1026 return -ENOMEM; 1027 1028 node->rb.addr = (unsigned long)iovec->iov.iov_base; 1029 node->pq = pq; 1030 atomic_set(&node->refcount, 0); 1031 } 1032 1033 iov = &iovec->iov; 1034 npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len); 1035 if (node->npages < npages) { 1036 pinned = pin_sdma_pages(req, iovec, node, npages); 1037 if (pinned < 0) { 1038 ret = pinned; 1039 goto bail; 1040 } 1041 node->npages += pinned; 1042 npages = node->npages; 1043 } 1044 iovec->pages = node->pages; 1045 iovec->npages = npages; 1046 iovec->node = node; 1047 1048 ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb); 1049 if (ret) { 1050 iovec->node = NULL; 1051 goto bail; 1052 } 1053 return 0; 1054 bail: 1055 unpin_sdma_pages(node); 1056 kfree(node); 1057 return ret; 1058 } 1059 1060 static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, 1061 unsigned start, unsigned npages) 1062 { 1063 hfi1_release_user_pages(mm, pages + start, npages, false); 1064 kfree(pages); 1065 } 1066 1067 static int check_header_template(struct user_sdma_request *req, 1068 struct hfi1_pkt_header *hdr, u32 lrhlen, 1069 u32 datalen) 1070 { 1071 /* 1072 * Perform safety checks for any type of packet: 1073 * - transfer size is multiple of 64bytes 1074 * - packet length is multiple of 4 bytes 1075 * - packet length is not larger than MTU size 1076 * 1077 * These checks are only done for the first packet of the 1078 * transfer since the header is "given" to us by user space. 1079 * For the remainder of the packets we compute the values. 1080 */ 1081 if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 || 1082 lrhlen > get_lrh_len(*hdr, req->info.fragsize)) 1083 return -EINVAL; 1084 1085 if (req_opcode(req->info.ctrl) == EXPECTED) { 1086 /* 1087 * The header is checked only on the first packet. Furthermore, 1088 * we ensure that at least one TID entry is copied when the 1089 * request is submitted. Therefore, we don't have to verify that 1090 * tididx points to something sane. 1091 */ 1092 u32 tidval = req->tids[req->tididx], 1093 tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE, 1094 tididx = EXP_TID_GET(tidval, IDX), 1095 tidctrl = EXP_TID_GET(tidval, CTRL), 1096 tidoff; 1097 __le32 kval = hdr->kdeth.ver_tid_offset; 1098 1099 tidoff = KDETH_GET(kval, OFFSET) * 1100 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? 1101 KDETH_OM_LARGE : KDETH_OM_SMALL); 1102 /* 1103 * Expected receive packets have the following 1104 * additional checks: 1105 * - offset is not larger than the TID size 1106 * - TIDCtrl values match between header and TID array 1107 * - TID indexes match between header and TID array 1108 */ 1109 if ((tidoff + datalen > tidlen) || 1110 KDETH_GET(kval, TIDCTRL) != tidctrl || 1111 KDETH_GET(kval, TID) != tididx) 1112 return -EINVAL; 1113 } 1114 return 0; 1115 } 1116 1117 /* 1118 * Correctly set the BTH.PSN field based on type of 1119 * transfer - eager packets can just increment the PSN but 1120 * expected packets encode generation and sequence in the 1121 * BTH.PSN field so just incrementing will result in errors. 1122 */ 1123 static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags) 1124 { 1125 u32 val = be32_to_cpu(bthpsn), 1126 mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull : 1127 0xffffffull), 1128 psn = val & mask; 1129 if (expct) 1130 psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK); 1131 else 1132 psn = psn + frags; 1133 return psn & mask; 1134 } 1135 1136 static int set_txreq_header(struct user_sdma_request *req, 1137 struct user_sdma_txreq *tx, u32 datalen) 1138 { 1139 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1140 struct hfi1_pkt_header *hdr = &tx->hdr; 1141 u8 omfactor; /* KDETH.OM */ 1142 u16 pbclen; 1143 int ret; 1144 u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); 1145 1146 /* Copy the header template to the request before modification */ 1147 memcpy(hdr, &req->hdr, sizeof(*hdr)); 1148 1149 /* 1150 * Check if the PBC and LRH length are mismatched. If so 1151 * adjust both in the header. 1152 */ 1153 pbclen = le16_to_cpu(hdr->pbc[0]); 1154 if (PBC2LRH(pbclen) != lrhlen) { 1155 pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); 1156 hdr->pbc[0] = cpu_to_le16(pbclen); 1157 hdr->lrh[2] = cpu_to_be16(lrhlen >> 2); 1158 /* 1159 * Third packet 1160 * This is the first packet in the sequence that has 1161 * a "static" size that can be used for the rest of 1162 * the packets (besides the last one). 1163 */ 1164 if (unlikely(req->seqnum == 2)) { 1165 /* 1166 * From this point on the lengths in both the 1167 * PBC and LRH are the same until the last 1168 * packet. 1169 * Adjust the template so we don't have to update 1170 * every packet 1171 */ 1172 req->hdr.pbc[0] = hdr->pbc[0]; 1173 req->hdr.lrh[2] = hdr->lrh[2]; 1174 } 1175 } 1176 /* 1177 * We only have to modify the header if this is not the 1178 * first packet in the request. Otherwise, we use the 1179 * header given to us. 1180 */ 1181 if (unlikely(!req->seqnum)) { 1182 ret = check_header_template(req, hdr, lrhlen, datalen); 1183 if (ret) 1184 return ret; 1185 goto done; 1186 } 1187 1188 hdr->bth[2] = cpu_to_be32( 1189 set_pkt_bth_psn(hdr->bth[2], 1190 (req_opcode(req->info.ctrl) == EXPECTED), 1191 req->seqnum)); 1192 1193 /* Set ACK request on last packet */ 1194 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) 1195 hdr->bth[2] |= cpu_to_be32(1UL << 31); 1196 1197 /* Set the new offset */ 1198 hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset); 1199 /* Expected packets have to fill in the new TID information */ 1200 if (req_opcode(req->info.ctrl) == EXPECTED) { 1201 tidval = req->tids[req->tididx]; 1202 /* 1203 * If the offset puts us at the end of the current TID, 1204 * advance everything. 1205 */ 1206 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * 1207 PAGE_SIZE)) { 1208 req->tidoffset = 0; 1209 /* 1210 * Since we don't copy all the TIDs, all at once, 1211 * we have to check again. 1212 */ 1213 if (++req->tididx > req->n_tids - 1 || 1214 !req->tids[req->tididx]) { 1215 return -EINVAL; 1216 } 1217 tidval = req->tids[req->tididx]; 1218 } 1219 omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >= 1220 KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT : 1221 KDETH_OM_SMALL_SHIFT; 1222 /* Set KDETH.TIDCtrl based on value for this TID. */ 1223 KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL, 1224 EXP_TID_GET(tidval, CTRL)); 1225 /* Set KDETH.TID based on value for this TID */ 1226 KDETH_SET(hdr->kdeth.ver_tid_offset, TID, 1227 EXP_TID_GET(tidval, IDX)); 1228 /* Clear KDETH.SH when DISABLE_SH flag is set */ 1229 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) 1230 KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0); 1231 /* 1232 * Set the KDETH.OFFSET and KDETH.OM based on size of 1233 * transfer. 1234 */ 1235 trace_hfi1_sdma_user_tid_info( 1236 pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx, 1237 req->tidoffset, req->tidoffset >> omfactor, 1238 omfactor != KDETH_OM_SMALL_SHIFT); 1239 KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET, 1240 req->tidoffset >> omfactor); 1241 KDETH_SET(hdr->kdeth.ver_tid_offset, OM, 1242 omfactor != KDETH_OM_SMALL_SHIFT); 1243 } 1244 done: 1245 trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt, 1246 req->info.comp_idx, hdr, tidval); 1247 return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr)); 1248 } 1249 1250 static int set_txreq_header_ahg(struct user_sdma_request *req, 1251 struct user_sdma_txreq *tx, u32 datalen) 1252 { 1253 u32 ahg[AHG_KDETH_ARRAY_SIZE]; 1254 int idx = 0; 1255 u8 omfactor; /* KDETH.OM */ 1256 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1257 struct hfi1_pkt_header *hdr = &req->hdr; 1258 u16 pbclen = le16_to_cpu(hdr->pbc[0]); 1259 u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); 1260 size_t array_size = ARRAY_SIZE(ahg); 1261 1262 if (PBC2LRH(pbclen) != lrhlen) { 1263 /* PBC.PbcLengthDWs */ 1264 idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12, 1265 (__force u16)cpu_to_le16(LRH2PBC(lrhlen))); 1266 if (idx < 0) 1267 return idx; 1268 /* LRH.PktLen (we need the full 16 bits due to byte swap) */ 1269 idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16, 1270 (__force u16)cpu_to_be16(lrhlen >> 2)); 1271 if (idx < 0) 1272 return idx; 1273 } 1274 1275 /* 1276 * Do the common updates 1277 */ 1278 /* BTH.PSN and BTH.A */ 1279 val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) & 1280 (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff); 1281 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) 1282 val32 |= 1UL << 31; 1283 idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16, 1284 (__force u16)cpu_to_be16(val32 >> 16)); 1285 if (idx < 0) 1286 return idx; 1287 idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16, 1288 (__force u16)cpu_to_be16(val32 & 0xffff)); 1289 if (idx < 0) 1290 return idx; 1291 /* KDETH.Offset */ 1292 idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16, 1293 (__force u16)cpu_to_le16(req->koffset & 0xffff)); 1294 if (idx < 0) 1295 return idx; 1296 idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16, 1297 (__force u16)cpu_to_le16(req->koffset >> 16)); 1298 if (idx < 0) 1299 return idx; 1300 if (req_opcode(req->info.ctrl) == EXPECTED) { 1301 __le16 val; 1302 1303 tidval = req->tids[req->tididx]; 1304 1305 /* 1306 * If the offset puts us at the end of the current TID, 1307 * advance everything. 1308 */ 1309 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * 1310 PAGE_SIZE)) { 1311 req->tidoffset = 0; 1312 /* 1313 * Since we don't copy all the TIDs, all at once, 1314 * we have to check again. 1315 */ 1316 if (++req->tididx > req->n_tids - 1 || 1317 !req->tids[req->tididx]) 1318 return -EINVAL; 1319 tidval = req->tids[req->tididx]; 1320 } 1321 omfactor = ((EXP_TID_GET(tidval, LEN) * 1322 PAGE_SIZE) >= 1323 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT : 1324 KDETH_OM_SMALL_SHIFT; 1325 /* KDETH.OM and KDETH.OFFSET (TID) */ 1326 idx = ahg_header_set( 1327 ahg, idx, array_size, 7, 0, 16, 1328 ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 | 1329 ((req->tidoffset >> omfactor) 1330 & 0x7fff))); 1331 if (idx < 0) 1332 return idx; 1333 /* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */ 1334 val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) | 1335 (EXP_TID_GET(tidval, IDX) & 0x3ff)); 1336 1337 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) { 1338 val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset, 1339 INTR) << 1340 AHG_KDETH_INTR_SHIFT)); 1341 } else { 1342 val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ? 1343 cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) : 1344 cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset, 1345 INTR) << 1346 AHG_KDETH_INTR_SHIFT)); 1347 } 1348 1349 idx = ahg_header_set(ahg, idx, array_size, 1350 7, 16, 14, (__force u16)val); 1351 if (idx < 0) 1352 return idx; 1353 } 1354 1355 trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt, 1356 req->info.comp_idx, req->sde->this_idx, 1357 req->ahg_idx, ahg, idx, tidval); 1358 sdma_txinit_ahg(&tx->txreq, 1359 SDMA_TXREQ_F_USE_AHG, 1360 datalen, req->ahg_idx, idx, 1361 ahg, sizeof(req->hdr), 1362 user_sdma_txreq_cb); 1363 1364 return idx; 1365 } 1366 1367 /** 1368 * user_sdma_txreq_cb() - SDMA tx request completion callback. 1369 * @txreq: valid sdma tx request 1370 * @status: success/failure of request 1371 * 1372 * Called when the SDMA progress state machine gets notification that 1373 * the SDMA descriptors for this tx request have been processed by the 1374 * DMA engine. Called in interrupt context. 1375 * Only do work on completed sequences. 1376 */ 1377 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) 1378 { 1379 struct user_sdma_txreq *tx = 1380 container_of(txreq, struct user_sdma_txreq, txreq); 1381 struct user_sdma_request *req; 1382 struct hfi1_user_sdma_pkt_q *pq; 1383 struct hfi1_user_sdma_comp_q *cq; 1384 enum hfi1_sdma_comp_state state = COMPLETE; 1385 1386 if (!tx->req) 1387 return; 1388 1389 req = tx->req; 1390 pq = req->pq; 1391 cq = req->cq; 1392 1393 if (status != SDMA_TXREQ_S_OK) { 1394 SDMA_DBG(req, "SDMA completion with error %d", 1395 status); 1396 WRITE_ONCE(req->has_error, 1); 1397 state = ERROR; 1398 } 1399 1400 req->seqcomp = tx->seqnum; 1401 kmem_cache_free(pq->txreq_cache, tx); 1402 1403 /* sequence isn't complete? We are done */ 1404 if (req->seqcomp != req->info.npkts - 1) 1405 return; 1406 1407 user_sdma_free_request(req, false); 1408 set_comp_state(pq, cq, req->info.comp_idx, state, status); 1409 pq_update(pq); 1410 } 1411 1412 static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq) 1413 { 1414 if (atomic_dec_and_test(&pq->n_reqs)) 1415 wake_up(&pq->wait); 1416 } 1417 1418 static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) 1419 { 1420 int i; 1421 1422 if (!list_empty(&req->txps)) { 1423 struct sdma_txreq *t, *p; 1424 1425 list_for_each_entry_safe(t, p, &req->txps, list) { 1426 struct user_sdma_txreq *tx = 1427 container_of(t, struct user_sdma_txreq, txreq); 1428 list_del_init(&t->list); 1429 sdma_txclean(req->pq->dd, t); 1430 kmem_cache_free(req->pq->txreq_cache, tx); 1431 } 1432 } 1433 1434 for (i = 0; i < req->data_iovs; i++) { 1435 struct sdma_mmu_node *node = req->iovs[i].node; 1436 1437 if (!node) 1438 continue; 1439 1440 req->iovs[i].node = NULL; 1441 1442 if (unpin) 1443 hfi1_mmu_rb_remove(req->pq->handler, 1444 &node->rb); 1445 else 1446 atomic_dec(&node->refcount); 1447 } 1448 1449 kfree(req->tids); 1450 clear_bit(req->info.comp_idx, req->pq->req_in_use); 1451 } 1452 1453 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, 1454 struct hfi1_user_sdma_comp_q *cq, 1455 u16 idx, enum hfi1_sdma_comp_state state, 1456 int ret) 1457 { 1458 if (state == ERROR) 1459 cq->comps[idx].errcode = -ret; 1460 smp_wmb(); /* make sure errcode is visible first */ 1461 cq->comps[idx].status = state; 1462 trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt, 1463 idx, state, ret); 1464 } 1465 1466 static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, 1467 unsigned long len) 1468 { 1469 return (bool)(node->addr == addr); 1470 } 1471 1472 static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode) 1473 { 1474 struct sdma_mmu_node *node = 1475 container_of(mnode, struct sdma_mmu_node, rb); 1476 1477 atomic_inc(&node->refcount); 1478 return 0; 1479 } 1480 1481 /* 1482 * Return 1 to remove the node from the rb tree and call the remove op. 1483 * 1484 * Called with the rb tree lock held. 1485 */ 1486 static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, 1487 void *evict_arg, bool *stop) 1488 { 1489 struct sdma_mmu_node *node = 1490 container_of(mnode, struct sdma_mmu_node, rb); 1491 struct evict_data *evict_data = evict_arg; 1492 1493 /* is this node still being used? */ 1494 if (atomic_read(&node->refcount)) 1495 return 0; /* keep this node */ 1496 1497 /* this node will be evicted, add its pages to our count */ 1498 evict_data->cleared += node->npages; 1499 1500 /* have enough pages been cleared? */ 1501 if (evict_data->cleared >= evict_data->target) 1502 *stop = true; 1503 1504 return 1; /* remove this node */ 1505 } 1506 1507 static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode) 1508 { 1509 struct sdma_mmu_node *node = 1510 container_of(mnode, struct sdma_mmu_node, rb); 1511 1512 unpin_sdma_pages(node); 1513 kfree(node); 1514 } 1515 1516 static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode) 1517 { 1518 struct sdma_mmu_node *node = 1519 container_of(mnode, struct sdma_mmu_node, rb); 1520 1521 if (!atomic_read(&node->refcount)) 1522 return 1; 1523 return 0; 1524 } 1525