/*
 * Copyright(c) 2015 - 2018 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>

#include "hfi.h"
#include "sdma.h"
#include "mmu_rb.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

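/*
 * Number of packets handed to the SDMA engine per user_sdma_send_pkts()
 * call; the first batch buys time before the caller may have to block.
 */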
static unsigned initial_pkt_count = 8;

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec);
static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages);
static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen);
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen);
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 len);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret);
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent);
static void activate_packet_queue(struct iowait *wait, int reason);
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len);
static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *arg2, bool *stop);
static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);

static struct mmu_rb_ops sdma_rb_ops = {
	.filter = sdma_rb_filter,
	.insert = sdma_rb_insert,
	.evict = sdma_rb_evict,
	.remove = sdma_rb_remove,
	.invalidate = sdma_rb_invalidate
};

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);

	if (sdma_progress(sde, seq, txreq)) {
		if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
			goto eagain;
	}
	/*
	 * We are assuming that if the list is enqueued somewhere, it
	 * is to the dmawait list since that is the only place where
	 * it is supposed to be enqueued.
	 */
	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
	write_seqlock(&sde->waitlock);
	if (list_empty(&pq->busy.list))
		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
	write_sequnlock(&sde->waitlock);
	return -EBUSY;
eagain:
	return -EAGAIN;
}

static void activate_packet_queue(struct iowait *wait, int reason)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
	wake_up(&wait->wait_dma);
}

int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
				struct hfi1_filedata *fd)
{
	int ret = -ENOMEM;
	char buf[64];
	struct hfi1_devdata *dd;
	struct hfi1_user_sdma_comp_q *cq;
	struct hfi1_user_sdma_pkt_q *pq;

	if (!uctxt || !fd)
		return -EBADF;

	if (!hfi1_sdma_comp_ring_size)
		return -EINVAL;

	dd = uctxt->dd;

	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
	if (!pq)
		return -ENOMEM;

	pq->dd = dd;
	pq->ctxt = uctxt->ctxt;
	pq->subctxt = fd->subctxt;
	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
	atomic_set(&pq->n_reqs, 0);
	init_waitqueue_head(&pq->wait);
	atomic_set(&pq->n_locked, 0);
	pq->mm = fd->mm;

	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
		    activate_packet_queue, NULL);
	pq->reqidx = 0;

	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
			   sizeof(*pq->reqs),
			   GFP_KERNEL);
	if (!pq->reqs)
		goto pq_reqs_nomem;

	pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size),
				 sizeof(*pq->req_in_use),
				 GFP_KERNEL);
	if (!pq->req_in_use)
		goto pq_reqs_no_in_use;

	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
		 fd->subctxt);
	pq->txreq_cache = kmem_cache_create(buf,
					    sizeof(struct user_sdma_txreq),
					    L1_CACHE_BYTES,
					    SLAB_HWCACHE_ALIGN,
					    NULL);
	if (!pq->txreq_cache) {
		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
			   uctxt->ctxt);
		goto pq_txreq_nomem;
	}

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		goto cq_nomem;

	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
				 * hfi1_sdma_comp_ring_size));
	if (!cq->comps)
		goto cq_comps_nomem;

	cq->nentries = hfi1_sdma_comp_ring_size;

	ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
				   &pq->handler);
	if (ret) {
		dd_dev_err(dd, "Failed to register with MMU %d", ret);
		goto pq_mmu_fail;
	}

	fd->pq = pq;
	fd->cq = cq;

	return 0;

pq_mmu_fail:
	vfree(cq->comps);
cq_comps_nomem:
	kfree(cq);
cq_nomem:
	kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
	kfree(pq->req_in_use);
pq_reqs_no_in_use:
	kfree(pq->reqs);
pq_reqs_nomem:
	kfree(pq);

	return ret;
}

int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
			       struct hfi1_ctxtdata *uctxt)
{
	struct hfi1_user_sdma_pkt_q *pq;

	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);

	pq = fd->pq;
	if (pq) {
		if (pq->handler)
			hfi1_mmu_rb_unregister(pq->handler);
		iowait_sdma_drain(&pq->busy);
		/* Wait until all requests have been freed. */
		wait_event_interruptible(
			pq->wait,
			!atomic_read(&pq->n_reqs));
		kfree(pq->reqs);
		kfree(pq->req_in_use);
		kmem_cache_destroy(pq->txreq_cache);
		kfree(pq);
		fd->pq = NULL;
	}
	if (fd->cq) {
		vfree(fd->cq->comps);
		kfree(fd->cq);
		fd->cq = NULL;
	}
	return 0;
}

static u8 dlid_to_selector(u16 dlid)
{
	static u8 mapping[256];
	static int initialized;
	static u8 next;
	int hash;

	if (!initialized) {
		memset(mapping, 0xFF, 256);
		initialized = 1;
	}

	hash = ((dlid >> 8) ^ dlid) & 0xFF;
	if (mapping[hash] == 0xFF) {
		mapping[hash] = next;
		next = (next + 1) & 0x7F;
	}

	return mapping[hash];
}

/**
 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 * @fd: valid file descriptor
 * @iovec: array of io vectors to process
 * @dim: overall iovec array size
 * @count: number of io vector array entries processed
 */
int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
				   struct iovec *iovec, unsigned long dim,
				   unsigned long *count)
{
	int ret = 0, i;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq = fd->pq;
	struct hfi1_user_sdma_comp_q *cq = fd->cq;
	struct hfi1_devdata *dd = pq->dd;
	unsigned long idx = 0;
	u8 pcount = initial_pkt_count;
	struct sdma_req_info info;
	struct user_sdma_request *req;
	u8 opcode, sc, vl;
	u16 pkey;
	u32 slid;
	u16 dlid;
	u32 selector;

	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
		hfi1_cdbg(
			SDMA,
			"[%u:%u:%u] First vector not big enough for header %lu/%lu",
			dd->unit, uctxt->ctxt, fd->subctxt,
			iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
		return -EINVAL;
	}
	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
	if (ret) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
		return -EFAULT;
	}

	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
				     (u16 *)&info);
	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid comp index",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/*
	 * Sanity check the header io vector count.  Need at least 1 vector
	 * (header) and cannot be larger than the actual io vector count.
	 */
	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
			  req_iovcnt(info.ctrl), dim);
		return -EINVAL;
	}

	if (!info.fragsize) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Request does not specify fragsize",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/* Try to claim the request. */
	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
			  dd->unit, uctxt->ctxt, fd->subctxt,
			  info.comp_idx);
		return -EBADSLT;
	}
	/*
	 * All safety checks have been done and this request has been claimed.
	 */
	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
					     info.comp_idx);
	req = pq->reqs + info.comp_idx;
	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
	req->data_len = 0;
	req->pq = pq;
	req->cq = cq;
	req->ahg_idx = -1;
	req->iov_idx = 0;
	req->sent = 0;
	req->seqnum = 0;
	req->seqcomp = 0;
	req->seqsubmitted = 0;
	req->tids = NULL;
	req->has_error = 0;
	INIT_LIST_HEAD(&req->txps);

	memcpy(&req->info, &info, sizeof(info));

	/* The request is initialized, count it */
	atomic_inc(&pq->n_reqs);

	if (req_opcode(info.ctrl) == EXPECTED) {
		/* expected must have a TID info and at least one data vector */
		if (req->data_iovs < 2) {
			SDMA_DBG(req,
				 "Not enough vectors for expected request");
			ret = -EINVAL;
			goto free_req;
		}
		req->data_iovs--;
	}

	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
			 MAX_VECTORS_PER_REQ);
		ret = -EINVAL;
		goto free_req;
	}
	/* Copy the header from the user buffer */
	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
			     sizeof(req->hdr));
	if (ret) {
		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
		ret = -EFAULT;
		goto free_req;
	}

	/* If Static rate control is not enabled, sanitize the header. */
	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
		req->hdr.pbc[2] = 0;

	/* Validate the opcode. Do not trust packets from user space blindly. */
	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
	if ((opcode & USER_OPCODE_CHECK_MASK) !=
	    USER_OPCODE_CHECK_VAL) {
		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
		ret = -EINVAL;
		goto free_req;
	}
	/*
	 * Validate the vl. Do not trust packets from user space blindly.
	 * VL comes from PBC, SC comes from LRH, and the VL needs to
	 * match the SC look up.
	 */
	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
	if (vl >= dd->pport->vls_operational ||
	    vl != sc_to_vlt(dd, sc)) {
		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
		ret = -EINVAL;
		goto free_req;
	}

	/* Checking P_KEY for requests from user-space */
	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
	slid = be16_to_cpu(req->hdr.lrh[3]);
	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
		ret = -EINVAL;
		goto free_req;
	}

	/*
	 * Also should check the BTH.lnh. If it says the next header is GRH
	 * then the RXE parsing will be off and will land in the middle of
	 * the KDETH or miss it entirely.
	 */
	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
		SDMA_DBG(req, "User tried to pass in a GRH");
		ret = -EINVAL;
		goto free_req;
	}

	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
	/*
	 * Calculate the initial TID offset based on the values of
	 * KDETH.OFFSET and KDETH.OM that are passed in.
	 */
	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
			 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			  KDETH_OM_LARGE : KDETH_OM_SMALL);
	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
					       info.comp_idx, req->tidoffset);
	idx++;

	/* Save all the IO vector structures */
	for (i = 0; i < req->data_iovs; i++) {
		req->iovs[i].offset = 0;
		INIT_LIST_HEAD(&req->iovs[i].list);
		memcpy(&req->iovs[i].iov,
		       iovec + idx++,
		       sizeof(req->iovs[i].iov));
		ret = pin_vector_pages(req, &req->iovs[i]);
		if (ret) {
			req->data_iovs = i;
			goto free_req;
		}
		req->data_len += req->iovs[i].iov.iov_len;
	}
	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
					 info.comp_idx, req->data_len);
	if (pcount > req->info.npkts)
		pcount = req->info.npkts;
	/*
	 * Copy any TID info.
	 * User space will provide the TID info only when the
	 * request type is EXPECTED. This is true even if there is
	 * only one packet in the request and the header is already
	 * setup. The reason for the singular TID case is that the
	 * driver needs to perform safety checks.
	 */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
		u32 *tmp;

		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
			ret = -EINVAL;
			goto free_req;
		}

		/*
		 * We have to copy all of the tids because they may vary
		 * in size and, therefore, the TID count might not be
		 * equal to the pkt count. However, there is no way to
		 * tell at this point.
		 */
		tmp = memdup_user(iovec[idx].iov_base,
				  ntids * sizeof(*req->tids));
		if (IS_ERR(tmp)) {
			ret = PTR_ERR(tmp);
			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				 ntids, ret);
			goto free_req;
		}
		req->tids = tmp;
		req->n_tids = ntids;
		req->tididx = 0;
		idx++;
	}

	dlid = be16_to_cpu(req->hdr.lrh[1]);
	selector = dlid_to_selector(dlid);
	selector += uctxt->ctxt + fd->subctxt;
	req->sde = sdma_select_user_engine(dd, selector, vl);

	if (!req->sde || !sdma_running(req->sde)) {
		ret = -ECOMM;
		goto free_req;
	}

	/* We don't need an AHG entry if the request contains only one packet */
	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
		req->ahg_idx = sdma_ahg_alloc(req->sde);

	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
	pq->state = SDMA_PKT_Q_ACTIVE;
	/* Send the first N packets in the request to buy us some time */
	ret = user_sdma_send_pkts(req, pcount);
	if (unlikely(ret < 0 && ret != -EBUSY))
		goto free_req;

	/*
	 * This is a somewhat blocking send implementation.
	 * The driver will block the caller until all packets of the
	 * request have been submitted to the SDMA engine. However, it
	 * will not wait for send completions.
	 */
	while (req->seqsubmitted != req->info.npkts) {
		ret = user_sdma_send_pkts(req, pcount);
		if (ret < 0) {
			if (ret != -EBUSY)
				goto free_req;
			wait_event_interruptible_timeout(
				pq->busy.wait_dma,
				(pq->state == SDMA_PKT_Q_ACTIVE),
				msecs_to_jiffies(
					SDMA_IOWAIT_TIMEOUT));
		}
	}
	*count += idx;
	return 0;
free_req:
	/*
	 * If seqsubmitted == npkts, the completion routine controls the
	 * final state. If seqsubmitted < npkts, wait for any outstanding
	 * packets to finish before cleaning up.
	 */
	if (req->seqsubmitted < req->info.npkts) {
		if (req->seqsubmitted)
			wait_event(pq->busy.wait_dma,
				   (req->seqcomp == req->seqsubmitted - 1));
		user_sdma_free_request(req, true);
		pq_update(pq);
		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
	}
	return ret;
}

static inline u32 compute_data_length(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx)
{
	/*
	 * Determine the proper size of the packet data.
	 * The size of the data of the first packet is in the header
	 * template. However, it includes the header and ICRC, which need
	 * to be subtracted.
	 * The minimum representable packet data length in a header is
	 * 4 bytes, therefore, when the requested data length is less than
	 * 4 bytes, there's only one packet, and the packet data length is
	 * equal to that of the request data length.
	 * The size of the remaining packets is the minimum of the frag
	 * size (MTU) or remaining data in the request.
	 */
	u32 len;

	if (!req->seqnum) {
		if (req->data_len < sizeof(u32))
			len = req->data_len;
		else
			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
			       (sizeof(tx->hdr) - 4));
	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
			     PAGE_SIZE;
		/*
		 * Get the data length based on the remaining space in the
		 * TID pair.
		 */
		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
		/* If we've filled up the TID pair, move to the next one. */
		if (unlikely(!len) && ++req->tididx < req->n_tids &&
		    req->tids[req->tididx]) {
			tidlen = EXP_TID_GET(req->tids[req->tididx],
					     LEN) * PAGE_SIZE;
			req->tidoffset = 0;
			len = min_t(u32, tidlen, req->info.fragsize);
		}
		/*
		 * Since the TID pairs map entire pages, make sure that we
		 * are not going to try to send more data than we have
		 * remaining.
		 */
		len = min(len, req->data_len - req->sent);
	} else {
		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
	}
	trace_hfi1_sdma_user_compute_length(req->pq->dd,
					    req->pq->ctxt,
					    req->pq->subctxt,
					    req->info.comp_idx,
					    len);
	return len;
}

static inline u32 pad_len(u32 len)
{
	if (len & (sizeof(u32) - 1))
		len += sizeof(u32) - (len & (sizeof(u32) - 1));
	return len;
}

static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
}

static int user_sdma_txadd_ahg(struct user_sdma_request *req,
			       struct user_sdma_txreq *tx,
			       u32 datalen)
{
	int ret;
	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	/*
	 * Copy the request header into the tx header
	 * because the HW needs a cacheline-aligned
	 * address.
	 * This copy could be optimized out if the hdr
	 * member of user_sdma_request were also
	 * cacheline aligned.
	 */
	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
	}
	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
	if (ret)
		return ret;
	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
			      sizeof(tx->hdr) + datalen, req->ahg_idx,
			      0, NULL, 0, user_sdma_txreq_cb);
	if (ret)
		return ret;
	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
	if (ret)
		sdma_txclean(pq->dd, &tx->txreq);
	return ret;
}

static int user_sdma_txadd(struct user_sdma_request *req,
			   struct user_sdma_txreq *tx,
			   struct user_sdma_iovec *iovec, u32 datalen,
			   u32 *queued_ptr, u32 *data_sent_ptr,
			   u64 *iov_offset_ptr)
{
	int ret;
	unsigned int pageidx, len;
	unsigned long base, offset;
	u64 iov_offset = *iov_offset_ptr;
	u32 queued = *queued_ptr, data_sent = *data_sent_ptr;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	base = (unsigned long)iovec->iov.iov_base;
	offset = offset_in_page(base + iovec->offset + iov_offset);
	pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
		   PAGE_SHIFT);
	len = offset + req->info.fragsize > PAGE_SIZE ?
		PAGE_SIZE - offset : req->info.fragsize;
	len = min((datalen - queued), len);
	ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
			      offset, len);
	if (ret) {
		SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
		return ret;
	}
	iov_offset += len;
	queued += len;
	data_sent += len;
	if (unlikely(queued < datalen && pageidx == iovec->npages &&
		     req->iov_idx < req->data_iovs - 1)) {
		iovec->offset += iov_offset;
		iovec = &req->iovs[++req->iov_idx];
		iov_offset = 0;
	}

	*queued_ptr = queued;
	*data_sent_ptr = data_sent;
	*iov_offset_ptr = iov_offset;
	return ret;
}

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
{
	int ret = 0;
	u16 count;
	unsigned npkts = 0;
	struct user_sdma_txreq *tx = NULL;
	struct hfi1_user_sdma_pkt_q *pq = NULL;
	struct user_sdma_iovec *iovec = NULL;

	if (!req->pq)
		return -EINVAL;

	pq = req->pq;

	/* If tx completion has reported an error, we are done. */
	if (READ_ONCE(req->has_error))
		return -EFAULT;

	/*
	 * Check if we might have sent the entire request already
	 */
	if (unlikely(req->seqnum == req->info.npkts)) {
		if (!list_empty(&req->txps))
			goto dosend;
		return ret;
	}

	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
		maxpkts = req->info.npkts - req->seqnum;

	while (npkts < maxpkts) {
		u32 datalen = 0, queued = 0, data_sent = 0;
		u64 iov_offset = 0;

		/*
		 * Check whether any of the completions have come back
		 * with errors. If so, we are not going to process any
		 * more packets from this request.
		 */
		if (READ_ONCE(req->has_error))
			return -EFAULT;

		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
		if (!tx)
			return -ENOMEM;

		tx->flags = 0;
		tx->req = req;
		tx->busycount = 0;
		INIT_LIST_HEAD(&tx->list);

		/*
		 * For the last packet set the ACK request
		 * and disable header suppression.
		 */
		if (req->seqnum == req->info.npkts - 1)
			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
				      TXREQ_FLAGS_REQ_DISABLE_SH);

		/*
		 * Calculate the payload size - this is min of the fragment
		 * (MTU) size or the remaining bytes in the request but only
		 * if we have payload data.
		 */
		if (req->data_len) {
			iovec = &req->iovs[req->iov_idx];
			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
				if (++req->iov_idx == req->data_iovs) {
					ret = -EFAULT;
					goto free_tx;
				}
				iovec = &req->iovs[req->iov_idx];
				WARN_ON(iovec->offset);
			}

			datalen = compute_data_length(req, tx);

			/*
			 * Disable header suppression for payloads <= 8 DWs.
			 * If there is an uncorrectable error in the receive
			 * data FIFO when the received payload size is less
			 * than or equal to 8 DWs, then RxDmaDataFifoRdUncErr
			 * is not reported; RHF.EccErr is set instead when the
			 * header is not suppressed.
			 */
			if (!datalen) {
				SDMA_DBG(req,
					 "Request has data but pkt len is 0");
				ret = -EFAULT;
				goto free_tx;
			} else if (datalen <= 32) {
				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
			}
		}

		if (req->ahg_idx >= 0) {
			if (!req->seqnum) {
				ret = user_sdma_txadd_ahg(req, tx, datalen);
				if (ret)
					goto free_tx;
			} else {
				int changes;

				changes = set_txreq_header_ahg(req, tx,
							       datalen);
				if (changes < 0) {
					ret = changes;
					goto free_tx;
				}
			}
		} else {
			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
					  datalen, user_sdma_txreq_cb);
			if (ret)
				goto free_tx;
			/*
			 * Modify the header for this packet. This only needs
			 * to be done if we are not going to use AHG. Otherwise,
			 * the HW will do it based on the changes we gave it
			 * during sdma_txinit_ahg().
			 */
			ret = set_txreq_header(req, tx, datalen);
			if (ret)
				goto free_txreq;
		}

		/*
		 * If the request contains any data vectors, add up to
		 * fragsize bytes to the descriptor.
		 */
		while (queued < datalen &&
		       (req->sent + data_sent) < req->data_len) {
			ret = user_sdma_txadd(req, tx, iovec, datalen,
					      &queued, &data_sent, &iov_offset);
			if (ret)
				goto free_txreq;
		}
		/*
		 * The txreq was submitted successfully so we can update
		 * the counters.
		 */
		req->koffset += datalen;
		if (req_opcode(req->info.ctrl) == EXPECTED)
			req->tidoffset += datalen;
		req->sent += data_sent;
		if (req->data_len)
			iovec->offset += iov_offset;
		list_add_tail(&tx->txreq.list, &req->txps);
		/*
		 * It is important to increment this here as it is used to
		 * generate the BTH.PSN and, therefore, can't be bulk-updated
		 * outside of the loop.
		 */
		tx->seqnum = req->seqnum++;
		npkts++;
	}
dosend:
	ret = sdma_send_txlist(req->sde,
			       iowait_get_ib_work(&pq->busy),
			       &req->txps, &count);
	req->seqsubmitted += count;
	if (req->seqsubmitted == req->info.npkts) {
		/*
		 * The txreq has already been submitted to the HW queue
		 * so we can free the AHG entry now. Corruption will not
		 * happen due to the sequential manner in which
		 * descriptors are processed.
		 */
		if (req->ahg_idx >= 0)
			sdma_ahg_free(req->sde, req->ahg_idx);
	}
	return ret;

free_txreq:
	sdma_txclean(pq->dd, &tx->txreq);
free_tx:
	kmem_cache_free(pq->txreq_cache, tx);
	return ret;
}

static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{
	struct evict_data evict_data;

	evict_data.cleared = 0;
	evict_data.target = npages;
	hfi1_mmu_rb_evict(pq->handler, &evict_data);
	return evict_data.cleared;
}

static int pin_sdma_pages(struct user_sdma_request *req,
			  struct user_sdma_iovec *iovec,
			  struct sdma_mmu_node *node,
			  int npages)
{
	int pinned, cleared;
	struct page **pages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;
	memcpy(pages, node->pages, node->npages * sizeof(*pages));

	npages -= node->npages;
retry:
	if (!hfi1_can_pin_pages(pq->dd, pq->mm,
				atomic_read(&pq->n_locked), npages)) {
		cleared = sdma_cache_evict(pq, npages);
		if (cleared >= npages)
			goto retry;
	}
	pinned = hfi1_acquire_user_pages(pq->mm,
					 ((unsigned long)iovec->iov.iov_base +
					  (node->npages * PAGE_SIZE)), npages, 0,
					 pages + node->npages);
	if (pinned < 0) {
		kfree(pages);
		return pinned;
	}
	if (pinned != npages) {
		unpin_vector_pages(pq->mm, pages, node->npages, pinned);
		return -EFAULT;
	}
	kfree(node->pages);
	node->rb.len = iovec->iov.iov_len;
	node->pages = pages;
	atomic_add(pinned, &pq->n_locked);
	return pinned;
}

static void unpin_sdma_pages(struct sdma_mmu_node *node)
{
	if (node->npages) {
		unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);
		atomic_sub(node->npages, &node->pq->n_locked);
	}
}

static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec)
{
	int ret = 0, pinned, npages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct sdma_mmu_node *node = NULL;
	struct mmu_rb_node *rb_node;
	struct iovec *iov;
	bool extracted;

	extracted =
		hfi1_mmu_rb_remove_unless_exact(pq->handler,
						(unsigned long)
						iovec->iov.iov_base,
						iovec->iov.iov_len, &rb_node);
	if (rb_node) {
		node = container_of(rb_node, struct sdma_mmu_node, rb);
		if (!extracted) {
			atomic_inc(&node->refcount);
			iovec->pages = node->pages;
			iovec->npages = node->npages;
			iovec->node = node;
			return 0;
		}
	}

	if (!node) {
		node = kzalloc(sizeof(*node), GFP_KERNEL);
		if (!node)
			return -ENOMEM;

		node->rb.addr = (unsigned long)iovec->iov.iov_base;
		node->pq = pq;
		atomic_set(&node->refcount, 0);
	}

	iov = &iovec->iov;
	npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
	if (node->npages < npages) {
		pinned = pin_sdma_pages(req, iovec, node, npages);
		if (pinned < 0) {
			ret = pinned;
			goto bail;
		}
		node->npages += pinned;
		npages = node->npages;
	}
	iovec->pages = node->pages;
	iovec->npages = npages;
	iovec->node = node;

	ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
	if (ret) {
		iovec->node = NULL;
		goto bail;
	}
	return 0;
bail:
	unpin_sdma_pages(node);
	kfree(node);
	return ret;
}

static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages)
{
	hfi1_release_user_pages(mm, pages + start, npages, false);
	kfree(pages);
}

static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen)
{
	/*
	 * Perform safety checks for any type of packet:
	 * - transfer size is a multiple of 64 bytes
	 * - packet length is a multiple of 4 bytes
	 * - packet length is not larger than MTU size
	 *
	 * These checks are only done for the first packet of the
	 * transfer since the header is "given" to us by user space.
	 * For the remainder of the packets we compute the values.
	 */
	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
		return -EINVAL;

	if (req_opcode(req->info.ctrl) == EXPECTED) {
		/*
		 * The header is checked only on the first packet. Furthermore,
		 * we ensure that at least one TID entry is copied when the
		 * request is submitted. Therefore, we don't have to verify that
		 * tididx points to something sane.
		 */
		u32 tidval = req->tids[req->tididx],
			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
			tididx = EXP_TID_GET(tidval, IDX),
			tidctrl = EXP_TID_GET(tidval, CTRL),
			tidoff;
		__le32 kval = hdr->kdeth.ver_tid_offset;

		tidoff = KDETH_GET(kval, OFFSET) *
			 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			  KDETH_OM_LARGE : KDETH_OM_SMALL);
		/*
		 * Expected receive packets have the following
		 * additional checks:
		 * - offset is not larger than the TID size
		 * - TIDCtrl values match between header and TID array
		 * - TID indexes match between header and TID array
		 */
		if ((tidoff + datalen > tidlen) ||
		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
		    KDETH_GET(kval, TID) != tididx)
			return -EINVAL;
	}
	return 0;
}

/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
	u32 val = be32_to_cpu(bthpsn),
		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
			0xffffffull),
		psn = val & mask;
	if (expct)
		psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
	else
		psn = psn + frags;
	return psn & mask;
}

static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &tx->hdr;
	u8 omfactor; /* KDETH.OM */
	u16 pbclen;
	int ret;
	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

	/* Copy the header template to the request before modification */
	memcpy(hdr, &req->hdr, sizeof(*hdr));

	/*
	 * Check if the PBC and LRH length are mismatched. If so
	 * adjust both in the header.
	 */
	pbclen = le16_to_cpu(hdr->pbc[0]);
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		hdr->pbc[0] = cpu_to_le16(pbclen);
		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
		/*
		 * Third packet
		 * This is the first packet in the sequence that has
		 * a "static" size that can be used for the rest of
		 * the packets (besides the last one).
		 */
		if (unlikely(req->seqnum == 2)) {
			/*
			 * From this point on the lengths in both the
			 * PBC and LRH are the same until the last
			 * packet.
			 * Adjust the template so we don't have to update
			 * every packet.
			 */
			req->hdr.pbc[0] = hdr->pbc[0];
			req->hdr.lrh[2] = hdr->lrh[2];
		}
	}
	/*
	 * We only have to modify the header if this is not the
	 * first packet in the request. Otherwise, we use the
	 * header given to us.
	 */
	if (unlikely(!req->seqnum)) {
		ret = check_header_template(req, hdr, lrhlen, datalen);
		if (ret)
			return ret;
		goto done;
	}

	hdr->bth[2] = cpu_to_be32(
		set_pkt_bth_psn(hdr->bth[2],
				(req_opcode(req->info.ctrl) == EXPECTED),
				req->seqnum));

	/* Set ACK request on last packet */
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		hdr->bth[2] |= cpu_to_be32(1UL << 31);

	/* Set the new offset */
	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
	/* Expected packets have to fill in the new TID information */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		tidval = req->tids[req->tididx];
		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
			KDETH_OM_SMALL_SHIFT;
		/* Set KDETH.TIDCtrl based on value for this TID. */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
			  EXP_TID_GET(tidval, CTRL));
		/* Set KDETH.TID based on value for this TID */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
			  EXP_TID_GET(tidval, IDX));
		/* Clear KDETH.SH when DISABLE_SH flag is set */
		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
		/*
		 * Set the KDETH.OFFSET and KDETH.OM based on size of
		 * transfer.
		 */
		trace_hfi1_sdma_user_tid_info(
			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
			req->tidoffset, req->tidoffset >> omfactor,
			omfactor != KDETH_OM_SMALL_SHIFT);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
			  req->tidoffset >> omfactor);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
			  omfactor != KDETH_OM_SMALL_SHIFT);
	}
done:
	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
				    req->info.comp_idx, hdr, tidval);
	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}

static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 datalen)
{
	u32 ahg[AHG_KDETH_ARRAY_SIZE];
	int idx = 0;
	u8 omfactor; /* KDETH.OM */
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &req->hdr;
	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
	size_t array_size = ARRAY_SIZE(ahg);

	if (PBC2LRH(pbclen) != lrhlen) {
		/* PBC.PbcLengthDWs */
		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
		if (idx < 0)
			return idx;
		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
				     (__force u16)cpu_to_be16(lrhlen >> 2));
		if (idx < 0)
			return idx;
	}

	/*
	 * Do the common updates
	 */
	/* BTH.PSN and BTH.A */
	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		val32 |= 1UL << 31;
	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
			     (__force u16)cpu_to_be16(val32 >> 16));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
			     (__force u16)cpu_to_be16(val32 & 0xffff));
	if (idx < 0)
		return idx;
	/* KDETH.Offset */
	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
			     (__force u16)cpu_to_le16(req->koffset >> 16));
	if (idx < 0)
		return idx;
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		__le16 val;

		tidval = req->tids[req->tididx];

		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx])
				return -EINVAL;
			tidval = req->tids[req->tididx];
		}
		omfactor = ((EXP_TID_GET(tidval, LEN) *
			     PAGE_SIZE) >=
			    KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
			    KDETH_OM_SMALL_SHIFT;
		/* KDETH.OM and KDETH.OFFSET (TID) */
		idx = ahg_header_set(
				ahg, idx, array_size, 7, 0, 16,
				((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
				 ((req->tidoffset >> omfactor)
				  & 0x7fff)));
		if (idx < 0)
			return idx;
		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
				  (EXP_TID_GET(tidval, IDX) & 0x3ff));

		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		} else {
			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		}

		idx = ahg_header_set(ahg, idx, array_size,
				     7, 16, 14, (__force u16)val);
		if (idx < 0)
			return idx;
	}

	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
					req->info.comp_idx, req->sde->this_idx,
					req->ahg_idx, ahg, idx, tidval);
	sdma_txinit_ahg(&tx->txreq,
			SDMA_TXREQ_F_USE_AHG,
			datalen, req->ahg_idx, idx,
			ahg, sizeof(req->hdr),
			user_sdma_txreq_cb);

	return idx;
}

/**
 * user_sdma_txreq_cb() - SDMA tx request completion callback.
 * @txreq: valid sdma tx request
 * @status: success/failure of request
 *
 * Called when the SDMA progress state machine gets notification that
 * the SDMA descriptors for this tx request have been processed by the
 * DMA engine. Called in interrupt context.
 * Only do work on completed sequences.
 */
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);
	struct user_sdma_request *req;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	enum hfi1_sdma_comp_state state = COMPLETE;

	if (!tx->req)
		return;

	req = tx->req;
	pq = req->pq;
	cq = req->cq;

	if (status != SDMA_TXREQ_S_OK) {
		SDMA_DBG(req, "SDMA completion with error %d",
			 status);
		WRITE_ONCE(req->has_error, 1);
		state = ERROR;
	}

	req->seqcomp = tx->seqnum;
	kmem_cache_free(pq->txreq_cache, tx);

	/* sequence isn't complete?  We are done */
	if (req->seqcomp != req->info.npkts - 1)
		return;

	user_sdma_free_request(req, false);
	set_comp_state(pq, cq, req->info.comp_idx, state, status);
	pq_update(pq);
}

static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
{
	if (atomic_dec_and_test(&pq->n_reqs))
		wake_up(&pq->wait);
}

static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
{
	int i;

	if (!list_empty(&req->txps)) {
		struct sdma_txreq *t, *p;

		list_for_each_entry_safe(t, p, &req->txps, list) {
			struct user_sdma_txreq *tx =
				container_of(t, struct user_sdma_txreq, txreq);
			list_del_init(&t->list);
			sdma_txclean(req->pq->dd, t);
			kmem_cache_free(req->pq->txreq_cache, tx);
		}
	}

	for (i = 0; i < req->data_iovs; i++) {
		struct sdma_mmu_node *node = req->iovs[i].node;

		if (!node)
			continue;

		req->iovs[i].node = NULL;

		if (unpin)
			hfi1_mmu_rb_remove(req->pq->handler,
					   &node->rb);
		else
			atomic_dec(&node->refcount);
	}

	kfree(req->tids);
	clear_bit(req->info.comp_idx, req->pq->req_in_use);
}

static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret)
{
	if (state == ERROR)
		cq->comps[idx].errcode = -ret;
	smp_wmb(); /* make sure errcode is visible first */
	cq->comps[idx].status = state;
	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
					idx, state, ret);
}

static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len)
{
	return (bool)(node->addr == addr);
}

static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	atomic_inc(&node->refcount);
	return 0;
}

/*
 * Return 1 to remove the node from the rb tree and call the remove op.
 *
 * Called with the rb tree lock held.
 */
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *evict_arg, bool *stop)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);
	struct evict_data *evict_data = evict_arg;

	/* is this node still being used? */
	if (atomic_read(&node->refcount))
		return 0; /* keep this node */

	/* this node will be evicted, add its pages to our count */
	evict_data->cleared += node->npages;

	/* have enough pages been cleared? */
	if (evict_data->cleared >= evict_data->target)
		*stop = true;

	return 1; /* remove this node */
}

static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	unpin_sdma_pages(node);
	kfree(node);
}

static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	if (!atomic_read(&node->refcount))
		return 1;
	return 0;
}