/*
 * Copyright(c) 2015 - 2017 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>

#include "hfi.h"
#include "sdma.h"
#include "mmu_rb.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

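/*
 * Number of packets submitted up front by hfi1_user_sdma_process_request()
 * to buy some time before the rest of the request is streamed out.
 */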
Default: 128"); 76 77 static unsigned initial_pkt_count = 8; 78 79 static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts); 80 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status); 81 static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq); 82 static void user_sdma_free_request(struct user_sdma_request *req, bool unpin); 83 static int pin_vector_pages(struct user_sdma_request *req, 84 struct user_sdma_iovec *iovec); 85 static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, 86 unsigned start, unsigned npages); 87 static int check_header_template(struct user_sdma_request *req, 88 struct hfi1_pkt_header *hdr, u32 lrhlen, 89 u32 datalen); 90 static int set_txreq_header(struct user_sdma_request *req, 91 struct user_sdma_txreq *tx, u32 datalen); 92 static int set_txreq_header_ahg(struct user_sdma_request *req, 93 struct user_sdma_txreq *tx, u32 len); 94 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, 95 struct hfi1_user_sdma_comp_q *cq, 96 u16 idx, enum hfi1_sdma_comp_state state, 97 int ret); 98 static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags); 99 static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); 100 101 static int defer_packet_queue( 102 struct sdma_engine *sde, 103 struct iowait *wait, 104 struct sdma_txreq *txreq, 105 uint seq, 106 bool pkts_sent); 107 static void activate_packet_queue(struct iowait *wait, int reason); 108 static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, 109 unsigned long len); 110 static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode); 111 static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, 112 void *arg2, bool *stop); 113 static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode); 114 static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode); 115 116 static struct mmu_rb_ops sdma_rb_ops = { 117 .filter = sdma_rb_filter, 118 .insert = sdma_rb_insert, 119 .evict = sdma_rb_evict, 120 .remove = sdma_rb_remove, 121 .invalidate = sdma_rb_invalidate 122 }; 123 124 static int defer_packet_queue( 125 struct sdma_engine *sde, 126 struct iowait *wait, 127 struct sdma_txreq *txreq, 128 uint seq, 129 bool pkts_sent) 130 { 131 struct hfi1_user_sdma_pkt_q *pq = 132 container_of(wait, struct hfi1_user_sdma_pkt_q, busy); 133 struct hfi1_ibdev *dev = &pq->dd->verbs_dev; 134 struct user_sdma_txreq *tx = 135 container_of(txreq, struct user_sdma_txreq, txreq); 136 137 if (sdma_progress(sde, seq, txreq)) { 138 if (tx->busycount++ < MAX_DEFER_RETRY_COUNT) 139 goto eagain; 140 } 141 /* 142 * We are assuming that if the list is enqueued somewhere, it 143 * is to the dmawait list since that is the only place where 144 * it is supposed to be enqueued. 
	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
	write_seqlock(&dev->iowait_lock);
	if (list_empty(&pq->busy.list))
		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
	write_sequnlock(&dev->iowait_lock);
	return -EBUSY;
eagain:
	return -EAGAIN;
}

static void activate_packet_queue(struct iowait *wait, int reason)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
	wake_up(&wait->wait_dma);
};

int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
				struct hfi1_filedata *fd)
{
	int ret = -ENOMEM;
	char buf[64];
	struct hfi1_devdata *dd;
	struct hfi1_user_sdma_comp_q *cq;
	struct hfi1_user_sdma_pkt_q *pq;

	if (!uctxt || !fd)
		return -EBADF;

	if (!hfi1_sdma_comp_ring_size)
		return -EINVAL;

	dd = uctxt->dd;

	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
	if (!pq)
		return -ENOMEM;

	pq->dd = dd;
	pq->ctxt = uctxt->ctxt;
	pq->subctxt = fd->subctxt;
	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
	atomic_set(&pq->n_reqs, 0);
	init_waitqueue_head(&pq->wait);
	atomic_set(&pq->n_locked, 0);
	pq->mm = fd->mm;

	iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
		    activate_packet_queue, NULL);
	pq->reqidx = 0;

	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
			   sizeof(*pq->reqs),
			   GFP_KERNEL);
	if (!pq->reqs)
		goto pq_reqs_nomem;

	pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size),
				 sizeof(*pq->req_in_use),
				 GFP_KERNEL);
	if (!pq->req_in_use)
		goto pq_reqs_no_in_use;

	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
		 fd->subctxt);
	pq->txreq_cache = kmem_cache_create(buf,
					    sizeof(struct user_sdma_txreq),
					    L1_CACHE_BYTES,
					    SLAB_HWCACHE_ALIGN,
					    NULL);
	if (!pq->txreq_cache) {
		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
			   uctxt->ctxt);
		goto pq_txreq_nomem;
	}

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		goto cq_nomem;

	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
				 * hfi1_sdma_comp_ring_size));
	if (!cq->comps)
		goto cq_comps_nomem;

	cq->nentries = hfi1_sdma_comp_ring_size;

	ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
				   &pq->handler);
	if (ret) {
		dd_dev_err(dd, "Failed to register with MMU %d", ret);
		goto pq_mmu_fail;
	}

	fd->pq = pq;
	fd->cq = cq;

	return 0;

pq_mmu_fail:
	vfree(cq->comps);
cq_comps_nomem:
	kfree(cq);
cq_nomem:
	kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
	kfree(pq->req_in_use);
pq_reqs_no_in_use:
	kfree(pq->reqs);
pq_reqs_nomem:
	kfree(pq);

	return ret;
}

int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
			       struct hfi1_ctxtdata *uctxt)
{
	struct hfi1_user_sdma_pkt_q *pq;

	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);

	pq = fd->pq;
	if (pq) {
		if (pq->handler)
			hfi1_mmu_rb_unregister(pq->handler);
		iowait_sdma_drain(&pq->busy);
		/* Wait until all requests have been freed. */
		wait_event_interruptible(
			pq->wait,
			!atomic_read(&pq->n_reqs));
		kfree(pq->reqs);
		kfree(pq->req_in_use);
		kmem_cache_destroy(pq->txreq_cache);
		kfree(pq);
		fd->pq = NULL;
	}
	if (fd->cq) {
		vfree(fd->cq->comps);
		kfree(fd->cq);
		fd->cq = NULL;
	}
	return 0;
}

/*
 * Map a DLID to a small selector value: fold the 16-bit DLID into an
 * 8-bit hash and lazily assign selectors from a wrapping 7-bit counter,
 * so traffic to the same destination keeps getting the same selector.
 */
static u8 dlid_to_selector(u16 dlid)
{
	static u8 mapping[256];
	static int initialized;
	static u8 next;
	int hash;

	if (!initialized) {
		memset(mapping, 0xFF, 256);
		initialized = 1;
	}

	hash = ((dlid >> 8) ^ dlid) & 0xFF;
	if (mapping[hash] == 0xFF) {
		mapping[hash] = next;
		next = (next + 1) & 0x7F;
	}

	return mapping[hash];
}

/**
 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 * @fd: valid file descriptor
 * @iovec: array of io vectors to process
 * @dim: overall iovec array size
 * @count: number of io vector array entries processed
 */
int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
				   struct iovec *iovec, unsigned long dim,
				   unsigned long *count)
{
	int ret = 0, i;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq = fd->pq;
	struct hfi1_user_sdma_comp_q *cq = fd->cq;
	struct hfi1_devdata *dd = pq->dd;
	unsigned long idx = 0;
	u8 pcount = initial_pkt_count;
	struct sdma_req_info info;
	struct user_sdma_request *req;
	u8 opcode, sc, vl;
	u16 pkey;
	u32 slid;
	u16 dlid;
	u32 selector;

	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
		hfi1_cdbg(
		   SDMA,
		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
		   dd->unit, uctxt->ctxt, fd->subctxt,
		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
		return -EINVAL;
	}
	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
	if (ret) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
		return -EFAULT;
	}

	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
				     (u16 *)&info);
	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid comp index",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/*
	 * Sanity check the header io vector count. Need at least 1 vector
	 * (header) and cannot be larger than the actual io vector count.
	 */
	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
			  req_iovcnt(info.ctrl), dim);
		return -EINVAL;
	}

	if (!info.fragsize) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Request does not specify fragsize",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/* Try to claim the request. */
	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
			  dd->unit, uctxt->ctxt, fd->subctxt,
			  info.comp_idx);
		return -EBADSLT;
	}
	/*
	 * All safety checks have been done and this request has been claimed.
	 */
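	/*
	 * Bind the request to the claimed completion ring slot and reset
	 * all per-request bookkeeping before parsing the remaining vectors.
	 */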
	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
					     info.comp_idx);
	req = pq->reqs + info.comp_idx;
	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
	req->data_len = 0;
	req->pq = pq;
	req->cq = cq;
	req->ahg_idx = -1;
	req->iov_idx = 0;
	req->sent = 0;
	req->seqnum = 0;
	req->seqcomp = 0;
	req->seqsubmitted = 0;
	req->tids = NULL;
	req->has_error = 0;
	INIT_LIST_HEAD(&req->txps);

	memcpy(&req->info, &info, sizeof(info));

	/* The request is initialized, count it */
	atomic_inc(&pq->n_reqs);

	if (req_opcode(info.ctrl) == EXPECTED) {
		/* an EXPECTED request must have TID info and at least one data vector */
		if (req->data_iovs < 2) {
			SDMA_DBG(req,
				 "Not enough vectors for expected request");
			ret = -EINVAL;
			goto free_req;
		}
		req->data_iovs--;
	}

	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
			 MAX_VECTORS_PER_REQ);
		ret = -EINVAL;
		goto free_req;
	}
	/* Copy the header from the user buffer */
	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
			     sizeof(req->hdr));
	if (ret) {
		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
		ret = -EFAULT;
		goto free_req;
	}

	/* If Static rate control is not enabled, sanitize the header. */
	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
		req->hdr.pbc[2] = 0;

	/* Validate the opcode. Do not trust packets from user space blindly. */
	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
	if ((opcode & USER_OPCODE_CHECK_MASK) !=
	     USER_OPCODE_CHECK_VAL) {
		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
		ret = -EINVAL;
		goto free_req;
	}
	/*
	 * Validate the vl. Do not trust packets from user space blindly.
	 * VL comes from PBC, SC comes from LRH, and the VL needs to
	 * match the SC look up.
	 */
	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
	if (vl >= dd->pport->vls_operational ||
	    vl != sc_to_vlt(dd, sc)) {
		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
		ret = -EINVAL;
		goto free_req;
	}

	/* Checking P_KEY for requests from user-space */
	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
	slid = be16_to_cpu(req->hdr.lrh[3]);
	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
		ret = -EINVAL;
		goto free_req;
	}

	/*
	 * We should also check BTH.lnh. If it says the next header is a GRH,
	 * then the RXE parsing will be off and will land in the middle of
	 * the KDETH or miss it entirely.
	 */
	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
		SDMA_DBG(req, "User tried to pass in a GRH");
		ret = -EINVAL;
		goto free_req;
	}

	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
	/*
	 * Calculate the initial TID offset based on the values of
	 * KDETH.OFFSET and KDETH.OM that are passed in.
	 */
	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
			 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			  KDETH_OM_LARGE : KDETH_OM_SMALL);
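	/*
	 * From here on req->tidoffset is tracked in bytes; KDETH.OM only
	 * selects the unit (KDETH_OM_SMALL or KDETH_OM_LARGE bytes per
	 * KDETH.OFFSET increment) used when the offset is written back
	 * into the header.
	 */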
	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
					       info.comp_idx, req->tidoffset);
	idx++;

	/* Save all the IO vector structures */
	for (i = 0; i < req->data_iovs; i++) {
		req->iovs[i].offset = 0;
		INIT_LIST_HEAD(&req->iovs[i].list);
		memcpy(&req->iovs[i].iov,
		       iovec + idx++,
		       sizeof(req->iovs[i].iov));
		ret = pin_vector_pages(req, &req->iovs[i]);
		if (ret) {
			req->data_iovs = i;
			goto free_req;
		}
		req->data_len += req->iovs[i].iov.iov_len;
	}
	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
					 info.comp_idx, req->data_len);
	if (pcount > req->info.npkts)
		pcount = req->info.npkts;
	/*
	 * Copy any TID info
	 * User space will provide the TID info only when the
	 * request type is EXPECTED. This is true even if there is
	 * only one packet in the request and the header is already
	 * setup. The reason for the singular TID case is that the
	 * driver needs to perform safety checks.
	 */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
		u32 *tmp;

		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
			ret = -EINVAL;
			goto free_req;
		}

		/*
		 * We have to copy all of the tids because they may vary
		 * in size and, therefore, the TID count might not be
		 * equal to the pkt count. However, there is no way to
		 * tell at this point.
		 */
		tmp = memdup_user(iovec[idx].iov_base,
				  ntids * sizeof(*req->tids));
		if (IS_ERR(tmp)) {
			ret = PTR_ERR(tmp);
			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				 ntids, ret);
			goto free_req;
		}
		req->tids = tmp;
		req->n_tids = ntids;
		req->tididx = 0;
		idx++;
	}

	dlid = be16_to_cpu(req->hdr.lrh[1]);
	selector = dlid_to_selector(dlid);
	selector += uctxt->ctxt + fd->subctxt;
	req->sde = sdma_select_user_engine(dd, selector, vl);

	if (!req->sde || !sdma_running(req->sde)) {
		ret = -ECOMM;
		goto free_req;
	}

	/* We don't need an AHG entry if the request contains only one packet */
	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
		req->ahg_idx = sdma_ahg_alloc(req->sde);

	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
	pq->state = SDMA_PKT_Q_ACTIVE;
	/* Send the first N packets in the request to buy us some time */
	ret = user_sdma_send_pkts(req, pcount);
	if (unlikely(ret < 0 && ret != -EBUSY))
		goto free_req;

	/*
	 * This is a somewhat blocking send implementation.
	 * The driver will block the caller until all packets of the
	 * request have been submitted to the SDMA engine. However, it
	 * will not wait for send completions.
	 */
	while (req->seqsubmitted != req->info.npkts) {
		ret = user_sdma_send_pkts(req, pcount);
		if (ret < 0) {
			if (ret != -EBUSY)
				goto free_req;
			wait_event_interruptible_timeout(
				pq->busy.wait_dma,
				(pq->state == SDMA_PKT_Q_ACTIVE),
				msecs_to_jiffies(
					SDMA_IOWAIT_TIMEOUT));
		}
	}
	*count += idx;
	return 0;
free_req:
	/*
	 * If seqsubmitted == npkts, the completion routine controls the
	 * final state. If seqsubmitted < npkts, wait for any outstanding
	 * packets to finish before cleaning up.
	 */
	if (req->seqsubmitted < req->info.npkts) {
		if (req->seqsubmitted)
			wait_event(pq->busy.wait_dma,
				   (req->seqcomp == req->seqsubmitted - 1));
		user_sdma_free_request(req, true);
		pq_update(pq);
		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
	}
	return ret;
}

static inline u32 compute_data_length(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx)
{
	/*
	 * Determine the proper size of the packet data.
	 * The size of the data of the first packet is in the header
	 * template. However, it includes the header and ICRC, which need
	 * to be subtracted.
	 * The minimum representable packet data length in a header is 4
	 * bytes; therefore, when the requested data length is less than
	 * 4 bytes, there is only one packet, and the packet data length
	 * is equal to the request data length.
	 * The size of the remaining packets is the minimum of the frag
	 * size (MTU) or remaining data in the request.
	 */
	u32 len;

	if (!req->seqnum) {
		if (req->data_len < sizeof(u32))
			len = req->data_len;
		else
			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
			       (sizeof(tx->hdr) - 4));
	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
			PAGE_SIZE;
		/*
		 * Get the data length based on the remaining space in the
		 * TID pair.
		 */
		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
		/* If we've filled up the TID pair, move to the next one. */
		if (unlikely(!len) && ++req->tididx < req->n_tids &&
		    req->tids[req->tididx]) {
			tidlen = EXP_TID_GET(req->tids[req->tididx],
					     LEN) * PAGE_SIZE;
			req->tidoffset = 0;
			len = min_t(u32, tidlen, req->info.fragsize);
		}
		/*
		 * Since the TID pairs map entire pages, make sure that we
		 * are not going to try to send more data than we have
		 * remaining.
		 */
		len = min(len, req->data_len - req->sent);
	} else {
		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
	}
	trace_hfi1_sdma_user_compute_length(req->pq->dd,
					    req->pq->ctxt,
					    req->pq->subctxt,
					    req->info.comp_idx,
					    len);
	return len;
}

/* Round the payload length up to a multiple of 4 bytes (dword). */
static inline u32 pad_len(u32 len)
{
	if (len & (sizeof(u32) - 1))
		len += sizeof(u32) - (len & (sizeof(u32) - 1));
	return len;
}

static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
}

static int user_sdma_txadd_ahg(struct user_sdma_request *req,
			       struct user_sdma_txreq *tx,
			       u32 datalen)
{
	int ret;
	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	/*
	 * Copy the request header into the tx header
	 * because the HW needs a cacheline-aligned
	 * address.
	 * This copy could be optimized out if the hdr
	 * member of user_sdma_request were also
	 * cacheline aligned.
	 */
	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
	}
	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
	if (ret)
		return ret;
	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
			      sizeof(tx->hdr) + datalen, req->ahg_idx,
			      0, NULL, 0, user_sdma_txreq_cb);
	if (ret)
		return ret;
	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
	if (ret)
		sdma_txclean(pq->dd, &tx->txreq);
	return ret;
}

static int user_sdma_txadd(struct user_sdma_request *req,
			   struct user_sdma_txreq *tx,
			   struct user_sdma_iovec *iovec, u32 datalen,
			   u32 *queued_ptr, u32 *data_sent_ptr,
			   u64 *iov_offset_ptr)
{
	int ret;
	unsigned int pageidx, len;
	unsigned long base, offset;
	u64 iov_offset = *iov_offset_ptr;
	u32 queued = *queued_ptr, data_sent = *data_sent_ptr;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	base = (unsigned long)iovec->iov.iov_base;
	offset = offset_in_page(base + iovec->offset + iov_offset);
	pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
		   PAGE_SHIFT);
	len = offset + req->info.fragsize > PAGE_SIZE ?
		PAGE_SIZE - offset : req->info.fragsize;
	len = min((datalen - queued), len);
	ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
			      offset, len);
	if (ret) {
		SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
		return ret;
	}
	iov_offset += len;
	queued += len;
	data_sent += len;
	if (unlikely(queued < datalen && pageidx == iovec->npages &&
		     req->iov_idx < req->data_iovs - 1)) {
		iovec->offset += iov_offset;
		iovec = &req->iovs[++req->iov_idx];
		iov_offset = 0;
	}

	*queued_ptr = queued;
	*data_sent_ptr = data_sent;
	*iov_offset_ptr = iov_offset;
	return ret;
}

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
{
	int ret = 0;
	u16 count;
	unsigned npkts = 0;
	struct user_sdma_txreq *tx = NULL;
	struct hfi1_user_sdma_pkt_q *pq = NULL;
	struct user_sdma_iovec *iovec = NULL;

	if (!req->pq)
		return -EINVAL;

	pq = req->pq;

	/* If tx completion has reported an error, we are done. */
	if (READ_ONCE(req->has_error))
		return -EFAULT;

	/*
	 * Check if we might have sent the entire request already
	 */
	if (unlikely(req->seqnum == req->info.npkts)) {
		if (!list_empty(&req->txps))
			goto dosend;
		return ret;
	}

	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
		maxpkts = req->info.npkts - req->seqnum;

	while (npkts < maxpkts) {
		u32 datalen = 0, queued = 0, data_sent = 0;
		u64 iov_offset = 0;

		/*
		 * Check whether any of the completions have come back
		 * with errors. If so, we are not going to process any
		 * more packets from this request.
		 */
		if (READ_ONCE(req->has_error))
			return -EFAULT;

		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
		if (!tx)
			return -ENOMEM;

		tx->flags = 0;
		tx->req = req;
		tx->busycount = 0;
		INIT_LIST_HEAD(&tx->list);

		/*
		 * For the last packet set the ACK request
		 * and disable header suppression.
		 */
		if (req->seqnum == req->info.npkts - 1)
			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
				      TXREQ_FLAGS_REQ_DISABLE_SH);

		/*
		 * Calculate the payload size - this is the minimum of the
		 * fragment (MTU) size and the remaining bytes in the
		 * request, but only if we have payload data.
		 */
		if (req->data_len) {
			iovec = &req->iovs[req->iov_idx];
			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
				if (++req->iov_idx == req->data_iovs) {
					ret = -EFAULT;
					goto free_txreq;
				}
				iovec = &req->iovs[req->iov_idx];
				WARN_ON(iovec->offset);
			}

			datalen = compute_data_length(req, tx);

			/*
			 * Disable header suppression for payloads <= 8 DWs.
			 * If there is an uncorrectable error in the receive
			 * data FIFO when the received payload size is less
			 * than or equal to 8 DWs, then RxDmaDataFifoRdUncErr
			 * is not reported; RHF.EccErr is set instead when the
			 * header is not suppressed.
			 */
			if (!datalen) {
				SDMA_DBG(req,
					 "Request has data but pkt len is 0");
				ret = -EFAULT;
				goto free_tx;
			} else if (datalen <= 32) {
				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
			}
		}

		if (req->ahg_idx >= 0) {
			if (!req->seqnum) {
				ret = user_sdma_txadd_ahg(req, tx, datalen);
				if (ret)
					goto free_tx;
			} else {
				int changes;

				changes = set_txreq_header_ahg(req, tx,
							       datalen);
				if (changes < 0)
					goto free_tx;
			}
		} else {
			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
					  datalen, user_sdma_txreq_cb);
			if (ret)
				goto free_tx;
			/*
			 * Modify the header for this packet. This only needs
			 * to be done if we are not going to use AHG. Otherwise,
			 * the HW will do it based on the changes we gave it
			 * during sdma_txinit_ahg().
			 */
			ret = set_txreq_header(req, tx, datalen);
			if (ret)
				goto free_txreq;
		}

		/*
		 * If the request contains any data vectors, add up to
		 * fragsize bytes to the descriptor.
		 */
		while (queued < datalen &&
		       (req->sent + data_sent) < req->data_len) {
			ret = user_sdma_txadd(req, tx, iovec, datalen,
					      &queued, &data_sent, &iov_offset);
			if (ret)
				goto free_txreq;
		}
		/*
		 * The txreq was submitted successfully so we can update
		 * the counters.
		 */
		req->koffset += datalen;
		if (req_opcode(req->info.ctrl) == EXPECTED)
			req->tidoffset += datalen;
		req->sent += data_sent;
		if (req->data_len)
			iovec->offset += iov_offset;
		list_add_tail(&tx->txreq.list, &req->txps);
		/*
		 * It is important to increment this here as it is used to
		 * generate the BTH.PSN and, therefore, can't be bulk-updated
		 * outside of the loop.
		 */
		tx->seqnum = req->seqnum++;
		npkts++;
	}
dosend:
	ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count);
	req->seqsubmitted += count;
	if (req->seqsubmitted == req->info.npkts) {
		/*
		 * The txreq has already been submitted to the HW queue
		 * so we can free the AHG entry now. Corruption will not
		 * happen due to the sequential manner in which
		 * descriptors are processed.
		 */
		if (req->ahg_idx >= 0)
			sdma_ahg_free(req->sde, req->ahg_idx);
	}
	return ret;

free_txreq:
	sdma_txclean(pq->dd, &tx->txreq);
free_tx:
	kmem_cache_free(pq->txreq_cache, tx);
	return ret;
}

static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{
	struct evict_data evict_data;

	evict_data.cleared = 0;
	evict_data.target = npages;
	hfi1_mmu_rb_evict(pq->handler, &evict_data);
	return evict_data.cleared;
}

static int pin_sdma_pages(struct user_sdma_request *req,
			  struct user_sdma_iovec *iovec,
			  struct sdma_mmu_node *node,
			  int npages)
{
	int pinned, cleared;
	struct page **pages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;
	memcpy(pages, node->pages, node->npages * sizeof(*pages));

	npages -= node->npages;
retry:
	if (!hfi1_can_pin_pages(pq->dd, pq->mm,
				atomic_read(&pq->n_locked), npages)) {
		cleared = sdma_cache_evict(pq, npages);
		if (cleared >= npages)
			goto retry;
	}
	pinned = hfi1_acquire_user_pages(pq->mm,
					 ((unsigned long)iovec->iov.iov_base +
					 (node->npages * PAGE_SIZE)), npages, 0,
					 pages + node->npages);
	if (pinned < 0) {
		kfree(pages);
		return pinned;
	}
	if (pinned != npages) {
		unpin_vector_pages(pq->mm, pages, node->npages, pinned);
		return -EFAULT;
	}
	kfree(node->pages);
	node->rb.len = iovec->iov.iov_len;
	node->pages = pages;
	atomic_add(pinned, &pq->n_locked);
	return pinned;
}

static void unpin_sdma_pages(struct sdma_mmu_node *node)
{
	if (node->npages) {
		unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);
		atomic_sub(node->npages, &node->pq->n_locked);
	}
}

static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec)
{
	int ret = 0, pinned, npages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct sdma_mmu_node *node = NULL;
	struct mmu_rb_node *rb_node;
	struct iovec *iov;
	bool extracted;

	extracted =
		hfi1_mmu_rb_remove_unless_exact(pq->handler,
						(unsigned long)
						iovec->iov.iov_base,
						iovec->iov.iov_len, &rb_node);
	if (rb_node) {
		node = container_of(rb_node, struct sdma_mmu_node, rb);
		if (!extracted) {
			atomic_inc(&node->refcount);
			iovec->pages = node->pages;
			iovec->npages = node->npages;
			iovec->node = node;
			return 0;
		}
	}

	if (!node) {
		node = kzalloc(sizeof(*node), GFP_KERNEL);
		if (!node)
			return -ENOMEM;

		node->rb.addr = (unsigned long)iovec->iov.iov_base;
		node->pq = pq;
		atomic_set(&node->refcount, 0);
	}

	iov = &iovec->iov;
	npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
	if (node->npages < npages) {
		pinned = pin_sdma_pages(req, iovec, node, npages);
		if (pinned < 0) {
			ret = pinned;
			goto bail;
		}
		node->npages += pinned;
		npages = node->npages;
	}
	iovec->pages = node->pages;
	iovec->npages = npages;
	iovec->node = node;

	ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
	if (ret) {
		iovec->node = NULL;
		goto bail;
	}
	return 0;
bail:
	unpin_sdma_pages(node);
	kfree(node);
	return ret;
}

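/* Unpin @npages pages starting at @start and free the page-pointer array. */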
static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages)
{
	hfi1_release_user_pages(mm, pages + start, npages, false);
	kfree(pages);
}

static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen)
{
	/*
	 * Perform safety checks for any type of packet:
	 * - transfer size is multiple of 64 bytes
	 * - packet length is multiple of 4 bytes
	 * - packet length is not larger than MTU size
	 *
	 * These checks are only done for the first packet of the
	 * transfer since the header is "given" to us by user space.
	 * For the remainder of the packets we compute the values.
	 */
	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
		return -EINVAL;

	if (req_opcode(req->info.ctrl) == EXPECTED) {
		/*
		 * The header is checked only on the first packet. Furthermore,
		 * we ensure that at least one TID entry is copied when the
		 * request is submitted. Therefore, we don't have to verify that
		 * tididx points to something sane.
		 */
		u32 tidval = req->tids[req->tididx],
			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
			tididx = EXP_TID_GET(tidval, IDX),
			tidctrl = EXP_TID_GET(tidval, CTRL),
			tidoff;
		__le32 kval = hdr->kdeth.ver_tid_offset;

		tidoff = KDETH_GET(kval, OFFSET) *
			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			   KDETH_OM_LARGE : KDETH_OM_SMALL);
		/*
		 * Expected receive packets have the following
		 * additional checks:
		 * - offset is not larger than the TID size
		 * - TIDCtrl values match between header and TID array
		 * - TID indexes match between header and TID array
		 */
		if ((tidoff + datalen > tidlen) ||
		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
		    KDETH_GET(kval, TID) != tididx)
			return -EINVAL;
	}
	return 0;
}

/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
	u32 val = be32_to_cpu(bthpsn),
		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
			0xffffffull),
		psn = val & mask;
	if (expct)
		psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
	else
		psn = psn + frags;
	return psn & mask;
}

static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &tx->hdr;
	u8 omfactor; /* KDETH.OM */
	u16 pbclen;
	int ret;
	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

	/* Copy the header template to the request before modification */
	memcpy(hdr, &req->hdr, sizeof(*hdr));

	/*
	 * Check if the PBC and LRH length are mismatched. If so
	 * adjust both in the header.
	 */
	pbclen = le16_to_cpu(hdr->pbc[0]);
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		hdr->pbc[0] = cpu_to_le16(pbclen);
		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
		/*
		 * Third packet
		 * This is the first packet in the sequence that has
		 * a "static" size that can be used for the rest of
		 * the packets (besides the last one).
		 */
		if (unlikely(req->seqnum == 2)) {
			/*
			 * From this point on the lengths in both the
			 * PBC and LRH are the same until the last
			 * packet.
			 * Adjust the template so we don't have to update
			 * every packet.
			 */
			req->hdr.pbc[0] = hdr->pbc[0];
			req->hdr.lrh[2] = hdr->lrh[2];
		}
	}
	/*
	 * We only have to modify the header if this is not the
	 * first packet in the request. Otherwise, we use the
	 * header given to us.
	 */
	if (unlikely(!req->seqnum)) {
		ret = check_header_template(req, hdr, lrhlen, datalen);
		if (ret)
			return ret;
		goto done;
	}

	hdr->bth[2] = cpu_to_be32(
		set_pkt_bth_psn(hdr->bth[2],
				(req_opcode(req->info.ctrl) == EXPECTED),
				req->seqnum));

	/* Set ACK request on last packet */
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		hdr->bth[2] |= cpu_to_be32(1UL << 31);

	/* Set the new offset */
	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
	/* Expected packets have to fill in the new TID information */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		tidval = req->tids[req->tididx];
		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
			KDETH_OM_SMALL_SHIFT;
		/* Set KDETH.TIDCtrl based on value for this TID. */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
			  EXP_TID_GET(tidval, CTRL));
		/* Set KDETH.TID based on value for this TID */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
			  EXP_TID_GET(tidval, IDX));
		/* Clear KDETH.SH when DISABLE_SH flag is set */
		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
		/*
		 * Set the KDETH.OFFSET and KDETH.OM based on size of
		 * transfer.
		 */
		trace_hfi1_sdma_user_tid_info(
			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
			req->tidoffset, req->tidoffset >> omfactor,
			omfactor != KDETH_OM_SMALL_SHIFT);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
			  req->tidoffset >> omfactor);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
			  omfactor != KDETH_OM_SMALL_SHIFT);
	}
done:
	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
				    req->info.comp_idx, hdr, tidval);
	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}

static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 datalen)
{
	u32 ahg[AHG_KDETH_ARRAY_SIZE];
	int idx = 0;
	u8 omfactor; /* KDETH.OM */
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &req->hdr;
	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
	size_t array_size = ARRAY_SIZE(ahg);

	if (PBC2LRH(pbclen) != lrhlen) {
		/* PBC.PbcLengthDWs */
		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
		if (idx < 0)
			return idx;
		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
				     (__force u16)cpu_to_be16(lrhlen >> 2));
		if (idx < 0)
			return idx;
	}

	/*
	 * Do the common updates
	 */
	/* BTH.PSN and BTH.A */
	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		val32 |= 1UL << 31;
	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
			     (__force u16)cpu_to_be16(val32 >> 16));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
			     (__force u16)cpu_to_be16(val32 & 0xffff));
	if (idx < 0)
		return idx;
	/* KDETH.Offset */
	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
			     (__force u16)cpu_to_le16(req->koffset >> 16));
	if (idx < 0)
		return idx;
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		__le16 val;

		tidval = req->tids[req->tididx];

		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx])
				return -EINVAL;
			tidval = req->tids[req->tididx];
		}
		omfactor = ((EXP_TID_GET(tidval, LEN) *
			     PAGE_SIZE) >=
			    KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
			    KDETH_OM_SMALL_SHIFT;
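		/*
		 * omfactor is the shift that converts the byte offset into
		 * KDETH.OFFSET units: large OM units once the TID spans at
		 * least KDETH_OM_MAX_SIZE bytes, small units otherwise.
		 */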
		/* KDETH.OM and KDETH.OFFSET (TID) */
		idx = ahg_header_set(
				ahg, idx, array_size, 7, 0, 16,
				((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
				((req->tidoffset >> omfactor)
				 & 0x7fff)));
		if (idx < 0)
			return idx;
		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
				   (EXP_TID_GET(tidval, IDX) & 0x3ff));

		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		} else {
			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		}

		idx = ahg_header_set(ahg, idx, array_size,
				     7, 16, 14, (__force u16)val);
		if (idx < 0)
			return idx;
	}

	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
					req->info.comp_idx, req->sde->this_idx,
					req->ahg_idx, ahg, idx, tidval);
	sdma_txinit_ahg(&tx->txreq,
			SDMA_TXREQ_F_USE_AHG,
			datalen, req->ahg_idx, idx,
			ahg, sizeof(req->hdr),
			user_sdma_txreq_cb);

	return idx;
}

/**
 * user_sdma_txreq_cb() - SDMA tx request completion callback.
 * @txreq: valid sdma tx request
 * @status: success/failure of request
 *
 * Called when the SDMA progress state machine gets notification that
 * the SDMA descriptors for this tx request have been processed by the
 * DMA engine. Called in interrupt context.
 * Only do work on completed sequences.
 */
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);
	struct user_sdma_request *req;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	enum hfi1_sdma_comp_state state = COMPLETE;

	if (!tx->req)
		return;

	req = tx->req;
	pq = req->pq;
	cq = req->cq;

	if (status != SDMA_TXREQ_S_OK) {
		SDMA_DBG(req, "SDMA completion with error %d",
			 status);
		WRITE_ONCE(req->has_error, 1);
		state = ERROR;
	}

	req->seqcomp = tx->seqnum;
	kmem_cache_free(pq->txreq_cache, tx);

	/* sequence isn't complete?  We are done */
	if (req->seqcomp != req->info.npkts - 1)
		return;

	user_sdma_free_request(req, false);
	set_comp_state(pq, cq, req->info.comp_idx, state, status);
	pq_update(pq);
}

static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
{
	if (atomic_dec_and_test(&pq->n_reqs))
		wake_up(&pq->wait);
}

static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
{
	int i;

	if (!list_empty(&req->txps)) {
		struct sdma_txreq *t, *p;

		list_for_each_entry_safe(t, p, &req->txps, list) {
			struct user_sdma_txreq *tx =
				container_of(t, struct user_sdma_txreq, txreq);
			list_del_init(&t->list);
			sdma_txclean(req->pq->dd, t);
			kmem_cache_free(req->pq->txreq_cache, tx);
		}
	}

	for (i = 0; i < req->data_iovs; i++) {
		struct sdma_mmu_node *node = req->iovs[i].node;

		if (!node)
			continue;

		req->iovs[i].node = NULL;

		if (unpin)
			hfi1_mmu_rb_remove(req->pq->handler,
					   &node->rb);
		else
			atomic_dec(&node->refcount);
	}

	kfree(req->tids);
	clear_bit(req->info.comp_idx, req->pq->req_in_use);
}

static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret)
{
	if (state == ERROR)
		cq->comps[idx].errcode = -ret;
	smp_wmb(); /* make sure errcode is visible first */
	cq->comps[idx].status = state;
	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
					idx, state, ret);
}

static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len)
{
	return (bool)(node->addr == addr);
}

static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	atomic_inc(&node->refcount);
	return 0;
}

/*
 * Return 1 to remove the node from the rb tree and call the remove op.
 *
 * Called with the rb tree lock held.
 */
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *evict_arg, bool *stop)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);
	struct evict_data *evict_data = evict_arg;

	/* is this node still being used? */
	if (atomic_read(&node->refcount))
		return 0; /* keep this node */

	/* this node will be evicted, add its pages to our count */
	evict_data->cleared += node->npages;

	/* have enough pages been cleared? */
	if (evict_data->cleared >= evict_data->target)
		*stop = true;

	return 1; /* remove this node */
}

static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	unpin_sdma_pages(node);
	kfree(node);
}

static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	if (!atomic_read(&node->refcount))
		return 1;
	return 0;
}