/*
 * Copyright(c) 2015 - 2018 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>

#include "hfi.h"
#include "sdma.h"
#include "mmu_rb.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

static unsigned initial_pkt_count = 8;

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec);
static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages);
static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen);
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen);
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 len);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret);
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent);
static void activate_packet_queue(struct iowait *wait, int reason);
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len);
static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *arg2, bool *stop);
static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);

static struct mmu_rb_ops sdma_rb_ops = {
	.filter = sdma_rb_filter,
	.insert = sdma_rb_insert,
	.evict = sdma_rb_evict,
	.remove = sdma_rb_remove,
	.invalidate = sdma_rb_invalidate
};

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);

	if (sdma_progress(sde, seq, txreq)) {
		if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
			goto eagain;
	}
	/*
	 * We are assuming that if the list is enqueued somewhere, it
	 * is to the dmawait list since that is the only place where
	 * it is supposed to be enqueued.
	 */
	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
	write_seqlock(&sde->waitlock);
	if (list_empty(&pq->busy.list)) {
		iowait_get_priority(&pq->busy);
		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
	}
	write_sequnlock(&sde->waitlock);
	return -EBUSY;
eagain:
	return -EAGAIN;
}

static void activate_packet_queue(struct iowait *wait, int reason)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
	wake_up(&wait->wait_dma);
};

int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
				struct hfi1_filedata *fd)
{
	int ret = -ENOMEM;
	char buf[64];
	struct hfi1_devdata *dd;
	struct hfi1_user_sdma_comp_q *cq;
	struct hfi1_user_sdma_pkt_q *pq;

	if (!uctxt || !fd)
		return -EBADF;

	if (!hfi1_sdma_comp_ring_size)
		return -EINVAL;

	dd = uctxt->dd;

	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
	if (!pq)
		return -ENOMEM;

	pq->dd = dd;
	pq->ctxt = uctxt->ctxt;
	pq->subctxt = fd->subctxt;
	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
	atomic_set(&pq->n_reqs, 0);
	init_waitqueue_head(&pq->wait);
	atomic_set(&pq->n_locked, 0);
	pq->mm = fd->mm;

	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
		    activate_packet_queue, NULL, NULL);
	pq->reqidx = 0;

	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
			   sizeof(*pq->reqs),
			   GFP_KERNEL);
	if (!pq->reqs)
		goto pq_reqs_nomem;

	pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size),
				 sizeof(*pq->req_in_use),
				 GFP_KERNEL);
	if (!pq->req_in_use)
		goto pq_reqs_no_in_use;

	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
		 fd->subctxt);
	pq->txreq_cache = kmem_cache_create(buf,
					    sizeof(struct user_sdma_txreq),
					    L1_CACHE_BYTES,
					    SLAB_HWCACHE_ALIGN,
					    NULL);
	if (!pq->txreq_cache) {
		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
			   uctxt->ctxt);
		goto pq_txreq_nomem;
	}

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		goto cq_nomem;

	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
				 * hfi1_sdma_comp_ring_size));
	if (!cq->comps)
		goto cq_comps_nomem;

	cq->nentries = hfi1_sdma_comp_ring_size;

	ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
				   &pq->handler);
	if (ret) {
		dd_dev_err(dd, "Failed to register with MMU %d", ret);
		goto pq_mmu_fail;
	}

	fd->pq = pq;
	fd->cq = cq;

	return 0;

pq_mmu_fail:
	vfree(cq->comps);
cq_comps_nomem:
	kfree(cq);
cq_nomem:
	kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
	kfree(pq->req_in_use);
pq_reqs_no_in_use:
	kfree(pq->reqs);
pq_reqs_nomem:
	kfree(pq);

	return ret;
}

int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
			       struct hfi1_ctxtdata *uctxt)
{
	struct hfi1_user_sdma_pkt_q *pq;

	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);

	pq = fd->pq;
	if (pq) {
		if (pq->handler)
			hfi1_mmu_rb_unregister(pq->handler);
		iowait_sdma_drain(&pq->busy);
		/* Wait until all requests have been freed. */
		wait_event_interruptible(
			pq->wait,
			!atomic_read(&pq->n_reqs));
		kfree(pq->reqs);
		kfree(pq->req_in_use);
		kmem_cache_destroy(pq->txreq_cache);
		kfree(pq);
		fd->pq = NULL;
	}
	if (fd->cq) {
		vfree(fd->cq->comps);
		kfree(fd->cq);
		fd->cq = NULL;
	}
	return 0;
}

static u8 dlid_to_selector(u16 dlid)
{
	static u8 mapping[256];
	static int initialized;
	static u8 next;
	int hash;

	if (!initialized) {
		memset(mapping, 0xFF, 256);
		initialized = 1;
	}

	hash = ((dlid >> 8) ^ dlid) & 0xFF;
	if (mapping[hash] == 0xFF) {
		mapping[hash] = next;
		next = (next + 1) & 0x7F;
	}

	return mapping[hash];
}

/**
 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 * @fd: valid file descriptor
 * @iovec: array of io vectors to process
 * @dim: overall iovec array size
 * @count: number of io vector array entries processed
 */
int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
				   struct iovec *iovec, unsigned long dim,
				   unsigned long *count)
{
	int ret = 0, i;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq = fd->pq;
	struct hfi1_user_sdma_comp_q *cq = fd->cq;
	struct hfi1_devdata *dd = pq->dd;
	unsigned long idx = 0;
	u8 pcount = initial_pkt_count;
	struct sdma_req_info info;
	struct user_sdma_request *req;
	u8 opcode, sc, vl;
	u16 pkey;
	u32 slid;
	u16 dlid;
	u32 selector;

	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
		hfi1_cdbg(
			SDMA,
			"[%u:%u:%u] First vector not big enough for header %lu/%lu",
			dd->unit, uctxt->ctxt, fd->subctxt,
			iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
		return -EINVAL;
	}
	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
	if (ret) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
		return -EFAULT;
	}

	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
				     (u16 *)&info);
	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid comp index",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/*
	 * Sanity check the header io vector count. Need at least 1 vector
	 * (header) and cannot be larger than the actual io vector count.
	 */
	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
			  req_iovcnt(info.ctrl), dim);
		return -EINVAL;
	}

	if (!info.fragsize) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Request does not specify fragsize",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/* Try to claim the request. */
	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
			  dd->unit, uctxt->ctxt, fd->subctxt,
			  info.comp_idx);
		return -EBADSLT;
	}
	/*
	 * All safety checks have been done and this request has been claimed.
	 */
	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
					     info.comp_idx);
	req = pq->reqs + info.comp_idx;
	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
	req->data_len = 0;
	req->pq = pq;
	req->cq = cq;
	req->ahg_idx = -1;
	req->iov_idx = 0;
	req->sent = 0;
	req->seqnum = 0;
	req->seqcomp = 0;
	req->seqsubmitted = 0;
	req->tids = NULL;
	req->has_error = 0;
	INIT_LIST_HEAD(&req->txps);

	memcpy(&req->info, &info, sizeof(info));

	/* The request is initialized, count it */
	atomic_inc(&pq->n_reqs);

	if (req_opcode(info.ctrl) == EXPECTED) {
		/* expected must have a TID info and at least one data vector */
		if (req->data_iovs < 2) {
			SDMA_DBG(req,
				 "Not enough vectors for expected request");
			ret = -EINVAL;
			goto free_req;
		}
		req->data_iovs--;
	}

	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
			 MAX_VECTORS_PER_REQ);
		ret = -EINVAL;
		goto free_req;
	}
	/* Copy the header from the user buffer */
	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
			     sizeof(req->hdr));
	if (ret) {
		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
		ret = -EFAULT;
		goto free_req;
	}

	/* If Static rate control is not enabled, sanitize the header. */
	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
		req->hdr.pbc[2] = 0;

	/* Validate the opcode. Do not trust packets from user space blindly. */
	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
	if ((opcode & USER_OPCODE_CHECK_MASK) !=
	    USER_OPCODE_CHECK_VAL) {
		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
		ret = -EINVAL;
		goto free_req;
	}
	/*
	 * Validate the vl. Do not trust packets from user space blindly.
	 * VL comes from PBC, SC comes from LRH, and the VL needs to
	 * match the SC look up.
	 */
	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
	if (vl >= dd->pport->vls_operational ||
	    vl != sc_to_vlt(dd, sc)) {
		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
		ret = -EINVAL;
		goto free_req;
	}

	/* Checking P_KEY for requests from user-space */
	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
	slid = be16_to_cpu(req->hdr.lrh[3]);
	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
		ret = -EINVAL;
		goto free_req;
	}

	/*
	 * Also should check the BTH.lnh. If it says the next header is GRH then
	 * the RXE parsing will be off and will land in the middle of the KDETH
	 * or miss it entirely.
	 */
	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
		SDMA_DBG(req, "User tried to pass in a GRH");
		ret = -EINVAL;
		goto free_req;
	}

	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
	/*
	 * Calculate the initial TID offset based on the values of
	 * KDETH.OFFSET and KDETH.OM that are passed in.
	 */
	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
			 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			  KDETH_OM_LARGE : KDETH_OM_SMALL);
	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
					       info.comp_idx, req->tidoffset);
	idx++;

	/* Save all the IO vector structures */
	for (i = 0; i < req->data_iovs; i++) {
		req->iovs[i].offset = 0;
		INIT_LIST_HEAD(&req->iovs[i].list);
		memcpy(&req->iovs[i].iov,
		       iovec + idx++,
		       sizeof(req->iovs[i].iov));
		ret = pin_vector_pages(req, &req->iovs[i]);
		if (ret) {
			req->data_iovs = i;
			goto free_req;
		}
		req->data_len += req->iovs[i].iov.iov_len;
	}
	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
					 info.comp_idx, req->data_len);
	if (pcount > req->info.npkts)
		pcount = req->info.npkts;
	/*
	 * Copy any TID info.
	 * User space will provide the TID info only when the
	 * request type is EXPECTED. This is true even if there is
	 * only one packet in the request and the header is already
	 * set up. The reason for the singular TID case is that the
	 * driver needs to perform safety checks.
	 */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
		u32 *tmp;

		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
			ret = -EINVAL;
			goto free_req;
		}

		/*
		 * We have to copy all of the tids because they may vary
		 * in size and, therefore, the TID count might not be
		 * equal to the pkt count. However, there is no way to
		 * tell at this point.
		 */
		tmp = memdup_user(iovec[idx].iov_base,
				  ntids * sizeof(*req->tids));
		if (IS_ERR(tmp)) {
			ret = PTR_ERR(tmp);
			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				 ntids, ret);
			goto free_req;
		}
		req->tids = tmp;
		req->n_tids = ntids;
		req->tididx = 0;
		idx++;
	}

	dlid = be16_to_cpu(req->hdr.lrh[1]);
	selector = dlid_to_selector(dlid);
	selector += uctxt->ctxt + fd->subctxt;
	req->sde = sdma_select_user_engine(dd, selector, vl);

	if (!req->sde || !sdma_running(req->sde)) {
		ret = -ECOMM;
		goto free_req;
	}

	/* We don't need an AHG entry if the request contains only one packet */
	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
		req->ahg_idx = sdma_ahg_alloc(req->sde);

	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
	pq->state = SDMA_PKT_Q_ACTIVE;
	/* Send the first N packets in the request to buy us some time */
	ret = user_sdma_send_pkts(req, pcount);
	if (unlikely(ret < 0 && ret != -EBUSY))
		goto free_req;

	/*
	 * This is a somewhat blocking send implementation.
	 * The driver will block the caller until all packets of the
	 * request have been submitted to the SDMA engine. However, it
	 * will not wait for send completions.
	 */
	while (req->seqsubmitted != req->info.npkts) {
		ret = user_sdma_send_pkts(req, pcount);
		if (ret < 0) {
			if (ret != -EBUSY)
				goto free_req;
			wait_event_interruptible_timeout(
				pq->busy.wait_dma,
				(pq->state == SDMA_PKT_Q_ACTIVE),
				msecs_to_jiffies(
					SDMA_IOWAIT_TIMEOUT));
		}
	}
	*count += idx;
	return 0;
free_req:
	/*
	 * If seqsubmitted == npkts, the completion routine controls the
	 * final state. If seqsubmitted < npkts, wait for any outstanding
	 * packets to finish before cleaning up.
	 */
	if (req->seqsubmitted < req->info.npkts) {
		if (req->seqsubmitted)
			wait_event(pq->busy.wait_dma,
				   (req->seqcomp == req->seqsubmitted - 1));
		user_sdma_free_request(req, true);
		pq_update(pq);
		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
	}
	return ret;
}

static inline u32 compute_data_length(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx)
{
	/*
	 * Determine the proper size of the packet data.
	 * The size of the data of the first packet is in the header
	 * template. However, it includes the header and ICRC, which need
	 * to be subtracted.
	 * The minimum representable packet data length in a header is 4 bytes,
	 * therefore, when the data length request is less than 4 bytes, there's
	 * only one packet, and the packet data length is equal to that of the
	 * request data length.
	 * The size of the remaining packets is the minimum of the frag
	 * size (MTU) or remaining data in the request.
	 */
	u32 len;

	if (!req->seqnum) {
		if (req->data_len < sizeof(u32))
			len = req->data_len;
		else
			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
			       (sizeof(tx->hdr) - 4));
	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
			PAGE_SIZE;
		/*
		 * Get the data length based on the remaining space in the
		 * TID pair.
		 */
		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
		/* If we've filled up the TID pair, move to the next one. */
		if (unlikely(!len) && ++req->tididx < req->n_tids &&
		    req->tids[req->tididx]) {
			tidlen = EXP_TID_GET(req->tids[req->tididx],
					     LEN) * PAGE_SIZE;
			req->tidoffset = 0;
			len = min_t(u32, tidlen, req->info.fragsize);
		}
		/*
		 * Since the TID pairs map entire pages, make sure that we
		 * are not going to try to send more data than we have
		 * remaining.
		 */
		len = min(len, req->data_len - req->sent);
	} else {
		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
	}
	trace_hfi1_sdma_user_compute_length(req->pq->dd,
					    req->pq->ctxt,
					    req->pq->subctxt,
					    req->info.comp_idx,
					    len);
	return len;
}

static inline u32 pad_len(u32 len)
{
	if (len & (sizeof(u32) - 1))
		len += sizeof(u32) - (len & (sizeof(u32) - 1));
	return len;
}

static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
}

static int user_sdma_txadd_ahg(struct user_sdma_request *req,
			       struct user_sdma_txreq *tx,
			       u32 datalen)
{
	int ret;
	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	/*
	 * Copy the request header into the tx header
	 * because the HW needs a cacheline-aligned
	 * address.
	 * This copy can be optimized out if the hdr
	 * member of user_sdma_request were also
	 * cacheline aligned.
	 */
	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
	}
	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
	if (ret)
		return ret;
	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
			      sizeof(tx->hdr) + datalen, req->ahg_idx,
			      0, NULL, 0, user_sdma_txreq_cb);
	if (ret)
		return ret;
	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
	if (ret)
		sdma_txclean(pq->dd, &tx->txreq);
	return ret;
}

static int user_sdma_txadd(struct user_sdma_request *req,
			   struct user_sdma_txreq *tx,
			   struct user_sdma_iovec *iovec, u32 datalen,
			   u32 *queued_ptr, u32 *data_sent_ptr,
			   u64 *iov_offset_ptr)
{
	int ret;
	unsigned int pageidx, len;
	unsigned long base, offset;
	u64 iov_offset = *iov_offset_ptr;
	u32 queued = *queued_ptr, data_sent = *data_sent_ptr;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	base = (unsigned long)iovec->iov.iov_base;
	offset = offset_in_page(base + iovec->offset + iov_offset);
	pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
		   PAGE_SHIFT);
	len = offset + req->info.fragsize > PAGE_SIZE ?
		PAGE_SIZE - offset : req->info.fragsize;
	len = min((datalen - queued), len);
	ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
			      offset, len);
	if (ret) {
		SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
		return ret;
	}
	iov_offset += len;
	queued += len;
	data_sent += len;
	if (unlikely(queued < datalen && pageidx == iovec->npages &&
		     req->iov_idx < req->data_iovs - 1)) {
		iovec->offset += iov_offset;
		iovec = &req->iovs[++req->iov_idx];
		iov_offset = 0;
	}

	*queued_ptr = queued;
	*data_sent_ptr = data_sent;
	*iov_offset_ptr = iov_offset;
	return ret;
}

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
{
	int ret = 0;
	u16 count;
	unsigned npkts = 0;
	struct user_sdma_txreq *tx = NULL;
	struct hfi1_user_sdma_pkt_q *pq = NULL;
	struct user_sdma_iovec *iovec = NULL;

	if (!req->pq)
		return -EINVAL;

	pq = req->pq;

	/* If tx completion has reported an error, we are done. */
	if (READ_ONCE(req->has_error))
		return -EFAULT;

	/*
	 * Check if we might have sent the entire request already
	 */
	if (unlikely(req->seqnum == req->info.npkts)) {
		if (!list_empty(&req->txps))
			goto dosend;
		return ret;
	}

	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
		maxpkts = req->info.npkts - req->seqnum;

	while (npkts < maxpkts) {
		u32 datalen = 0, queued = 0, data_sent = 0;
		u64 iov_offset = 0;

		/*
		 * Check whether any of the completions have come back
		 * with errors. If so, we are not going to process any
		 * more packets from this request.
		 */
		if (READ_ONCE(req->has_error))
			return -EFAULT;

		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
		if (!tx)
			return -ENOMEM;

		tx->flags = 0;
		tx->req = req;
		tx->busycount = 0;
		INIT_LIST_HEAD(&tx->list);

		/*
		 * For the last packet set the ACK request
		 * and disable header suppression.
		 */
		if (req->seqnum == req->info.npkts - 1)
			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
				      TXREQ_FLAGS_REQ_DISABLE_SH);

		/*
		 * Calculate the payload size - this is min of the fragment
		 * (MTU) size or the remaining bytes in the request but only
		 * if we have payload data.
		 */
		if (req->data_len) {
			iovec = &req->iovs[req->iov_idx];
			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
				if (++req->iov_idx == req->data_iovs) {
					ret = -EFAULT;
					goto free_tx;
				}
				iovec = &req->iovs[req->iov_idx];
				WARN_ON(iovec->offset);
			}

			datalen = compute_data_length(req, tx);

			/*
			 * Disable header suppression for payloads <= 8 DWs.
			 * If there is an uncorrectable error in the receive
			 * data FIFO when the received payload size is less
			 * than or equal to 8 DWs, then RxDmaDataFifoRdUncErr
			 * is not reported; RHF.EccErr is set instead when the
			 * header is not suppressed.
			 */
			if (!datalen) {
				SDMA_DBG(req,
					 "Request has data but pkt len is 0");
				ret = -EFAULT;
				goto free_tx;
			} else if (datalen <= 32) {
				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
			}
		}

		if (req->ahg_idx >= 0) {
			if (!req->seqnum) {
				ret = user_sdma_txadd_ahg(req, tx, datalen);
				if (ret)
					goto free_tx;
			} else {
				int changes;

				changes = set_txreq_header_ahg(req, tx,
							       datalen);
				if (changes < 0) {
					ret = changes;
					goto free_tx;
				}
			}
		} else {
			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
					  datalen, user_sdma_txreq_cb);
			if (ret)
				goto free_tx;
			/*
			 * Modify the header for this packet. This only needs
			 * to be done if we are not going to use AHG. Otherwise,
			 * the HW will do it based on the changes we gave it
			 * during sdma_txinit_ahg().
			 */
			ret = set_txreq_header(req, tx, datalen);
			if (ret)
				goto free_txreq;
		}

		/*
		 * If the request contains any data vectors, add up to
		 * fragsize bytes to the descriptor.
		 */
		while (queued < datalen &&
		       (req->sent + data_sent) < req->data_len) {
			ret = user_sdma_txadd(req, tx, iovec, datalen,
					      &queued, &data_sent, &iov_offset);
			if (ret)
				goto free_txreq;
		}
		/*
		 * The txreq was submitted successfully so we can update
		 * the counters.
		 */
		req->koffset += datalen;
		if (req_opcode(req->info.ctrl) == EXPECTED)
			req->tidoffset += datalen;
		req->sent += data_sent;
		if (req->data_len)
			iovec->offset += iov_offset;
		list_add_tail(&tx->txreq.list, &req->txps);
		/*
		 * It is important to increment this here as it is used to
		 * generate the BTH.PSN and, therefore, can't be bulk-updated
		 * outside of the loop.
		 */
		tx->seqnum = req->seqnum++;
		npkts++;
	}
dosend:
	ret = sdma_send_txlist(req->sde,
			       iowait_get_ib_work(&pq->busy),
			       &req->txps, &count);
	req->seqsubmitted += count;
	if (req->seqsubmitted == req->info.npkts) {
		/*
		 * The txreq has already been submitted to the HW queue
		 * so we can free the AHG entry now. Corruption will not
		 * happen due to the sequential manner in which
		 * descriptors are processed.
		 */
		if (req->ahg_idx >= 0)
			sdma_ahg_free(req->sde, req->ahg_idx);
	}
	return ret;

free_txreq:
	sdma_txclean(pq->dd, &tx->txreq);
free_tx:
	kmem_cache_free(pq->txreq_cache, tx);
	return ret;
}

static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{
	struct evict_data evict_data;

	evict_data.cleared = 0;
	evict_data.target = npages;
	hfi1_mmu_rb_evict(pq->handler, &evict_data);
	return evict_data.cleared;
}

static int pin_sdma_pages(struct user_sdma_request *req,
			  struct user_sdma_iovec *iovec,
			  struct sdma_mmu_node *node,
			  int npages)
{
	int pinned, cleared;
	struct page **pages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;
	memcpy(pages, node->pages, node->npages * sizeof(*pages));

	npages -= node->npages;
retry:
	if (!hfi1_can_pin_pages(pq->dd, pq->mm,
				atomic_read(&pq->n_locked), npages)) {
		cleared = sdma_cache_evict(pq, npages);
		if (cleared >= npages)
			goto retry;
	}
	pinned = hfi1_acquire_user_pages(pq->mm,
					 ((unsigned long)iovec->iov.iov_base +
					  (node->npages * PAGE_SIZE)), npages, 0,
					 pages + node->npages);
	if (pinned < 0) {
		kfree(pages);
		return pinned;
	}
	if (pinned != npages) {
		unpin_vector_pages(pq->mm, pages, node->npages, pinned);
		return -EFAULT;
	}
	kfree(node->pages);
	node->rb.len = iovec->iov.iov_len;
	node->pages = pages;
	atomic_add(pinned, &pq->n_locked);
	return pinned;
}

static void unpin_sdma_pages(struct sdma_mmu_node *node)
{
	if (node->npages) {
		unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);
		atomic_sub(node->npages, &node->pq->n_locked);
	}
}

static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec)
{
	int ret = 0, pinned, npages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct sdma_mmu_node *node = NULL;
	struct mmu_rb_node *rb_node;
	struct iovec *iov;
	bool extracted;

	extracted =
		hfi1_mmu_rb_remove_unless_exact(pq->handler,
						(unsigned long)
						iovec->iov.iov_base,
						iovec->iov.iov_len, &rb_node);
	if (rb_node) {
		node = container_of(rb_node, struct sdma_mmu_node, rb);
		if (!extracted) {
			atomic_inc(&node->refcount);
			iovec->pages = node->pages;
			iovec->npages = node->npages;
			iovec->node = node;
			return 0;
		}
	}

	if (!node) {
		node = kzalloc(sizeof(*node), GFP_KERNEL);
		if (!node)
			return -ENOMEM;

		node->rb.addr = (unsigned long)iovec->iov.iov_base;
		node->pq = pq;
		atomic_set(&node->refcount, 0);
	}

	iov = &iovec->iov;
	npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
	if (node->npages < npages) {
		pinned = pin_sdma_pages(req, iovec, node, npages);
		if (pinned < 0) {
			ret = pinned;
			goto bail;
		}
		node->npages += pinned;
		npages = node->npages;
	}
	iovec->pages = node->pages;
	iovec->npages = npages;
	iovec->node = node;

	ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
	if (ret) {
		iovec->node = NULL;
		goto bail;
	}
	return 0;
bail:
	unpin_sdma_pages(node);
	kfree(node);
	return ret;
}

static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages)
{
	hfi1_release_user_pages(mm, pages + start, npages, false);
	kfree(pages);
}

static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen)
{
	/*
	 * Perform safety checks for any type of packet:
	 * - transfer size is a multiple of 64 bytes
	 * - packet length is a multiple of 4 bytes
	 * - packet length is not larger than MTU size
	 *
	 * These checks are only done for the first packet of the
	 * transfer since the header is "given" to us by user space.
	 * For the remainder of the packets we compute the values.
	 */
	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
		return -EINVAL;

	if (req_opcode(req->info.ctrl) == EXPECTED) {
		/*
		 * The header is checked only on the first packet. Furthermore,
		 * we ensure that at least one TID entry is copied when the
		 * request is submitted. Therefore, we don't have to verify that
		 * tididx points to something sane.
		 */
		u32 tidval = req->tids[req->tididx],
			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
			tididx = EXP_TID_GET(tidval, IDX),
			tidctrl = EXP_TID_GET(tidval, CTRL),
			tidoff;
		__le32 kval = hdr->kdeth.ver_tid_offset;

		tidoff = KDETH_GET(kval, OFFSET) *
			 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			  KDETH_OM_LARGE : KDETH_OM_SMALL);
		/*
		 * Expected receive packets have the following
		 * additional checks:
		 * - offset is not larger than the TID size
		 * - TIDCtrl values match between header and TID array
		 * - TID indexes match between header and TID array
		 */
		if ((tidoff + datalen > tidlen) ||
		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
		    KDETH_GET(kval, TID) != tididx)
			return -EINVAL;
	}
	return 0;
}

/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
	u32 val = be32_to_cpu(bthpsn),
		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
			0xffffffull),
		psn = val & mask;
	if (expct)
		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
	else
		psn = psn + frags;
	return psn & mask;
}

static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &tx->hdr;
	u8 omfactor; /* KDETH.OM */
	u16 pbclen;
	int ret;
	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

	/* Copy the header template to the request before modification */
	memcpy(hdr, &req->hdr, sizeof(*hdr));

	/*
	 * Check if the PBC and LRH lengths are mismatched. If so,
	 * adjust both in the header.
	 */
	pbclen = le16_to_cpu(hdr->pbc[0]);
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		hdr->pbc[0] = cpu_to_le16(pbclen);
		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
		/*
		 * Third packet
		 * This is the first packet in the sequence that has
		 * a "static" size that can be used for the rest of
		 * the packets (besides the last one).
		 */
		if (unlikely(req->seqnum == 2)) {
			/*
			 * From this point on the lengths in both the
			 * PBC and LRH are the same until the last
			 * packet.
			 * Adjust the template so we don't have to update
			 * every packet.
			 */
			req->hdr.pbc[0] = hdr->pbc[0];
			req->hdr.lrh[2] = hdr->lrh[2];
		}
	}
	/*
	 * We only have to modify the header if this is not the
	 * first packet in the request. Otherwise, we use the
	 * header given to us.
	 */
	if (unlikely(!req->seqnum)) {
		ret = check_header_template(req, hdr, lrhlen, datalen);
		if (ret)
			return ret;
		goto done;
	}

	hdr->bth[2] = cpu_to_be32(
		set_pkt_bth_psn(hdr->bth[2],
				(req_opcode(req->info.ctrl) == EXPECTED),
				req->seqnum));

	/* Set ACK request on last packet */
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		hdr->bth[2] |= cpu_to_be32(1UL << 31);

	/* Set the new offset */
	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
	/* Expected packets have to fill in the new TID information */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		tidval = req->tids[req->tididx];
		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
				 KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
				 KDETH_OM_SMALL_SHIFT;
		/* Set KDETH.TIDCtrl based on value for this TID. */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
			  EXP_TID_GET(tidval, CTRL));
		/* Set KDETH.TID based on value for this TID */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
			  EXP_TID_GET(tidval, IDX));
		/* Clear KDETH.SH when DISABLE_SH flag is set */
		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
		/*
		 * Set the KDETH.OFFSET and KDETH.OM based on size of
		 * transfer.
		 */
		trace_hfi1_sdma_user_tid_info(
			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
			req->tidoffset, req->tidoffset >> omfactor,
			omfactor != KDETH_OM_SMALL_SHIFT);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
			  req->tidoffset >> omfactor);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
			  omfactor != KDETH_OM_SMALL_SHIFT);
	}
done:
	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
				    req->info.comp_idx, hdr, tidval);
	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}

static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 datalen)
{
	u32 ahg[AHG_KDETH_ARRAY_SIZE];
	int idx = 0;
	u8 omfactor; /* KDETH.OM */
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &req->hdr;
	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
	size_t array_size = ARRAY_SIZE(ahg);

	if (PBC2LRH(pbclen) != lrhlen) {
		/* PBC.PbcLengthDWs */
		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
		if (idx < 0)
			return idx;
		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
				     (__force u16)cpu_to_be16(lrhlen >> 2));
		if (idx < 0)
			return idx;
	}

	/*
	 * Do the common updates
	 */
	/* BTH.PSN and BTH.A */
	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		val32 |= 1UL << 31;
	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
			     (__force u16)cpu_to_be16(val32 >> 16));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
			     (__force u16)cpu_to_be16(val32 & 0xffff));
	if (idx < 0)
		return idx;
	/* KDETH.Offset */
	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
			     (__force u16)cpu_to_le16(req->koffset >> 16));
	if (idx < 0)
		return idx;
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		__le16 val;

		tidval = req->tids[req->tididx];

		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx])
				return -EINVAL;
			tidval = req->tids[req->tididx];
		}
		omfactor = ((EXP_TID_GET(tidval, LEN) *
			     PAGE_SIZE) >=
			    KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
			    KDETH_OM_SMALL_SHIFT;
		/* KDETH.OM and KDETH.OFFSET (TID) */
		idx = ahg_header_set(
				ahg, idx, array_size, 7, 0, 16,
				((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
				 ((req->tidoffset >> omfactor)
				  & 0x7fff)));
		if (idx < 0)
			return idx;
		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
				  (EXP_TID_GET(tidval, IDX) & 0x3ff));

		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		} else {
			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		}

		idx = ahg_header_set(ahg, idx, array_size,
				     7, 16, 14, (__force u16)val);
		if (idx < 0)
			return idx;
	}

	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
					req->info.comp_idx, req->sde->this_idx,
					req->ahg_idx, ahg, idx, tidval);
	sdma_txinit_ahg(&tx->txreq,
			SDMA_TXREQ_F_USE_AHG,
			datalen, req->ahg_idx, idx,
			ahg, sizeof(req->hdr),
			user_sdma_txreq_cb);

	return idx;
}

/**
 * user_sdma_txreq_cb() - SDMA tx request completion callback.
 * @txreq: valid sdma tx request
 * @status: success/failure of request
 *
 * Called when the SDMA progress state machine gets notification that
 * the SDMA descriptors for this tx request have been processed by the
 * DMA engine. Called in interrupt context.
 * Only do work on completed sequences.
 */
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);
	struct user_sdma_request *req;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	enum hfi1_sdma_comp_state state = COMPLETE;

	if (!tx->req)
		return;

	req = tx->req;
	pq = req->pq;
	cq = req->cq;

	if (status != SDMA_TXREQ_S_OK) {
		SDMA_DBG(req, "SDMA completion with error %d",
			 status);
		WRITE_ONCE(req->has_error, 1);
		state = ERROR;
	}

	req->seqcomp = tx->seqnum;
	kmem_cache_free(pq->txreq_cache, tx);

	/* sequence isn't complete? We are done */
	if (req->seqcomp != req->info.npkts - 1)
		return;

	user_sdma_free_request(req, false);
	set_comp_state(pq, cq, req->info.comp_idx, state, status);
	pq_update(pq);
}

static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
{
	if (atomic_dec_and_test(&pq->n_reqs))
		wake_up(&pq->wait);
}

static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
{
	int i;

	if (!list_empty(&req->txps)) {
		struct sdma_txreq *t, *p;

		list_for_each_entry_safe(t, p, &req->txps, list) {
			struct user_sdma_txreq *tx =
				container_of(t, struct user_sdma_txreq, txreq);
			list_del_init(&t->list);
			sdma_txclean(req->pq->dd, t);
			kmem_cache_free(req->pq->txreq_cache, tx);
		}
	}

	for (i = 0; i < req->data_iovs; i++) {
		struct sdma_mmu_node *node = req->iovs[i].node;

		if (!node)
			continue;

		req->iovs[i].node = NULL;

		if (unpin)
			hfi1_mmu_rb_remove(req->pq->handler,
					   &node->rb);
		else
			atomic_dec(&node->refcount);
	}

	kfree(req->tids);
	clear_bit(req->info.comp_idx, req->pq->req_in_use);
}

static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret)
{
	if (state == ERROR)
		cq->comps[idx].errcode = -ret;
	smp_wmb(); /* make sure errcode is visible first */
	cq->comps[idx].status = state;
	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
					idx, state, ret);
}

static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len)
{
	return (bool)(node->addr == addr);
}

static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	atomic_inc(&node->refcount);
	return 0;
}

/*
 * Return 1 to remove the node from the rb tree and call the remove op.
 *
 * Called with the rb tree lock held.
 */
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *evict_arg, bool *stop)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);
	struct evict_data *evict_data = evict_arg;

	/* is this node still being used? */
	if (atomic_read(&node->refcount))
		return 0; /* keep this node */

	/* this node will be evicted, add its pages to our count */
	evict_data->cleared += node->npages;

	/* have enough pages been cleared? */
	if (evict_data->cleared >= evict_data->target)
		*stop = true;

	return 1; /* remove this node */
}

static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	unpin_sdma_pages(node);
	kfree(node);
}

static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	if (!atomic_read(&node->refcount))
		return 1;
	return 0;
}
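
/*
 * Editorial sketch (not part of the driver build): set_comp_state() above
 * publishes a completion by writing cq->comps[idx].errcode before the
 * smp_wmb() and cq->comps[idx].status after it, so a hypothetical consumer
 * of the mmapped completion ring would be expected to issue the mirror-image
 * reads, e.g. (illustrative pseudo-code, names assumed):
 *
 *	state = READ_ONCE(comps[idx].status);
 *	smp_rmb();		// pairs with the driver's smp_wmb()
 *	if (state == ERROR)
 *		err = comps[idx].errcode;
 *
 * The actual userspace interface lives outside this file; the snippet only
 * sketches the ordering implied by the barrier comment in set_comp_state().
 */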