/*
 * Copyright(c) 2015 - 2018 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>

#include "hfi.h"
#include "sdma.h"
#include "mmu_rb.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

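/*
 * Number of packets submitted up front by hfi1_user_sdma_process_request()
 * before it falls into its blocking send loop.
 */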
static unsigned initial_pkt_count = 8;

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec);
static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages);
static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen);
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen);
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 len);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret);
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent);
static void activate_packet_queue(struct iowait *wait, int reason);
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len);
static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *arg2, bool *stop);
static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);

static struct mmu_rb_ops sdma_rb_ops = {
	.filter = sdma_rb_filter,
	.insert = sdma_rb_insert,
	.evict = sdma_rb_evict,
	.remove = sdma_rb_remove,
	.invalidate = sdma_rb_invalidate
};

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);

	write_seqlock(&sde->waitlock);
	if (sdma_progress(sde, seq, txreq))
		goto eagain;
	/*
	 * We are assuming that if the list is enqueued somewhere, it
	 * is to the dmawait list since that is the only place where
	 * it is supposed to be enqueued.
	 */
	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
	if (list_empty(&pq->busy.list)) {
		iowait_get_priority(&pq->busy);
		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
	}
	write_sequnlock(&sde->waitlock);
	return -EBUSY;
eagain:
	write_sequnlock(&sde->waitlock);
	return -EAGAIN;
}

static void activate_packet_queue(struct iowait *wait, int reason)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
	wake_up(&wait->wait_dma);
};

int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
				struct hfi1_filedata *fd)
{
	int ret = -ENOMEM;
	char buf[64];
	struct hfi1_devdata *dd;
	struct hfi1_user_sdma_comp_q *cq;
	struct hfi1_user_sdma_pkt_q *pq;

	if (!uctxt || !fd)
		return -EBADF;

	if (!hfi1_sdma_comp_ring_size)
		return -EINVAL;

	dd = uctxt->dd;

	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
	if (!pq)
		return -ENOMEM;

	pq->dd = dd;
	pq->ctxt = uctxt->ctxt;
	pq->subctxt = fd->subctxt;
	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
	atomic_set(&pq->n_reqs, 0);
	init_waitqueue_head(&pq->wait);
	atomic_set(&pq->n_locked, 0);
	pq->mm = fd->mm;

	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
		    activate_packet_queue, NULL, NULL);
	pq->reqidx = 0;

	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
			   sizeof(*pq->reqs),
			   GFP_KERNEL);
	if (!pq->reqs)
		goto pq_reqs_nomem;

	pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size),
				 sizeof(*pq->req_in_use),
				 GFP_KERNEL);
	if (!pq->req_in_use)
		goto pq_reqs_no_in_use;

	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
		 fd->subctxt);
	pq->txreq_cache = kmem_cache_create(buf,
					    sizeof(struct user_sdma_txreq),
					    L1_CACHE_BYTES,
					    SLAB_HWCACHE_ALIGN,
					    NULL);
	if (!pq->txreq_cache) {
		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
			   uctxt->ctxt);
		goto pq_txreq_nomem;
	}

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		goto cq_nomem;

	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
				 * hfi1_sdma_comp_ring_size));
	if (!cq->comps)
		goto cq_comps_nomem;

	cq->nentries = hfi1_sdma_comp_ring_size;

	ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
				   &pq->handler);
	if (ret) {
		dd_dev_err(dd, "Failed to register with MMU %d", ret);
		goto pq_mmu_fail;
	}

	fd->pq = pq;
	fd->cq = cq;

	return 0;

pq_mmu_fail:
	vfree(cq->comps);
cq_comps_nomem:
	kfree(cq);
cq_nomem:
	kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
	kfree(pq->req_in_use);
pq_reqs_no_in_use:
	kfree(pq->reqs);
pq_reqs_nomem:
	kfree(pq);

	return ret;
}

int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
			       struct hfi1_ctxtdata *uctxt)
{
	struct hfi1_user_sdma_pkt_q *pq;

	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);

	pq = fd->pq;
	if (pq) {
		if (pq->handler)
			hfi1_mmu_rb_unregister(pq->handler);
		iowait_sdma_drain(&pq->busy);
		/* Wait until all requests have been freed. */
		wait_event_interruptible(
			pq->wait,
			!atomic_read(&pq->n_reqs));
		kfree(pq->reqs);
		kfree(pq->req_in_use);
		kmem_cache_destroy(pq->txreq_cache);
		kfree(pq);
		fd->pq = NULL;
	}
	if (fd->cq) {
		vfree(fd->cq->comps);
		kfree(fd->cq);
		fd->cq = NULL;
	}
	return 0;
}

static u8 dlid_to_selector(u16 dlid)
{
	static u8 mapping[256];
	static int initialized;
	static u8 next;
	int hash;

	if (!initialized) {
		memset(mapping, 0xFF, 256);
		initialized = 1;
	}

	hash = ((dlid >> 8) ^ dlid) & 0xFF;
	if (mapping[hash] == 0xFF) {
		mapping[hash] = next;
		next = (next + 1) & 0x7F;
	}

	return mapping[hash];
}

/**
 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 * @fd: valid file descriptor
 * @iovec: array of io vectors to process
 * @dim: overall iovec array size
 * @count: number of io vector array entries processed
 */
int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
				   struct iovec *iovec, unsigned long dim,
				   unsigned long *count)
{
	int ret = 0, i;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq = fd->pq;
	struct hfi1_user_sdma_comp_q *cq = fd->cq;
	struct hfi1_devdata *dd = pq->dd;
	unsigned long idx = 0;
	u8 pcount = initial_pkt_count;
	struct sdma_req_info info;
	struct user_sdma_request *req;
	u8 opcode, sc, vl;
	u16 pkey;
	u32 slid;
	u16 dlid;
	u32 selector;

	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
		hfi1_cdbg(
			SDMA,
			"[%u:%u:%u] First vector not big enough for header %lu/%lu",
			dd->unit, uctxt->ctxt, fd->subctxt,
			iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
		return -EINVAL;
	}
	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
	if (ret) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
		return -EFAULT;
	}

	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
				     (u16 *)&info);
	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid comp index",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/*
	 * Sanity check the header io vector count. Need at least 1 vector
	 * (header) and cannot be larger than the actual io vector count.
	 */
	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
			  req_iovcnt(info.ctrl), dim);
		return -EINVAL;
	}

	if (!info.fragsize) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Request does not specify fragsize",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/* Try to claim the request. */
	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
			  dd->unit, uctxt->ctxt, fd->subctxt,
			  info.comp_idx);
		return -EBADSLT;
	}
	/*
	 * All safety checks have been done and this request has been claimed.
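	 * The completion ring entry at info.comp_idx now belongs to this
	 * request until user_sdma_free_request() clears the req_in_use bit.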
	 */
	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
					     info.comp_idx);
	req = pq->reqs + info.comp_idx;
	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
	req->data_len = 0;
	req->pq = pq;
	req->cq = cq;
	req->ahg_idx = -1;
	req->iov_idx = 0;
	req->sent = 0;
	req->seqnum = 0;
	req->seqcomp = 0;
	req->seqsubmitted = 0;
	req->tids = NULL;
	req->has_error = 0;
	INIT_LIST_HEAD(&req->txps);

	memcpy(&req->info, &info, sizeof(info));

	/* The request is initialized, count it */
	atomic_inc(&pq->n_reqs);

	if (req_opcode(info.ctrl) == EXPECTED) {
		/* expected must have a TID info and at least one data vector */
		if (req->data_iovs < 2) {
			SDMA_DBG(req,
				 "Not enough vectors for expected request");
			ret = -EINVAL;
			goto free_req;
		}
		req->data_iovs--;
	}

	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
			 MAX_VECTORS_PER_REQ);
		ret = -EINVAL;
		goto free_req;
	}
	/* Copy the header from the user buffer */
	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
			     sizeof(req->hdr));
	if (ret) {
		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
		ret = -EFAULT;
		goto free_req;
	}

	/* If Static rate control is not enabled, sanitize the header. */
	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
		req->hdr.pbc[2] = 0;

	/* Validate the opcode. Do not trust packets from user space blindly. */
	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
	if ((opcode & USER_OPCODE_CHECK_MASK) !=
	    USER_OPCODE_CHECK_VAL) {
		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
		ret = -EINVAL;
		goto free_req;
	}
	/*
	 * Validate the vl. Do not trust packets from user space blindly.
	 * VL comes from PBC, SC comes from LRH, and the VL needs to
	 * match the SC look up.
	 */
	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
	if (vl >= dd->pport->vls_operational ||
	    vl != sc_to_vlt(dd, sc)) {
		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
		ret = -EINVAL;
		goto free_req;
	}

	/* Checking P_KEY for requests from user-space */
	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
	slid = be16_to_cpu(req->hdr.lrh[3]);
	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
		ret = -EINVAL;
		goto free_req;
	}

	/*
	 * Also should check the BTH.lnh. If it says the next header is GRH then
	 * the RXE parsing will be off and will land in the middle of the KDETH
	 * or miss it entirely.
	 */
	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
		SDMA_DBG(req, "User tried to pass in a GRH");
		ret = -EINVAL;
		goto free_req;
	}

	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
	/*
	 * Calculate the initial TID offset based on the values of
	 * KDETH.OFFSET and KDETH.OM that are passed in.
	 */
	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
			 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			  KDETH_OM_LARGE : KDETH_OM_SMALL);
	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
					       info.comp_idx, req->tidoffset);
	idx++;

	/* Save all the IO vector structures */
	for (i = 0; i < req->data_iovs; i++) {
		req->iovs[i].offset = 0;
		INIT_LIST_HEAD(&req->iovs[i].list);
		memcpy(&req->iovs[i].iov,
		       iovec + idx++,
		       sizeof(req->iovs[i].iov));
		ret = pin_vector_pages(req, &req->iovs[i]);
		if (ret) {
			req->data_iovs = i;
			goto free_req;
		}
		req->data_len += req->iovs[i].iov.iov_len;
	}
	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
					 info.comp_idx, req->data_len);
	if (pcount > req->info.npkts)
		pcount = req->info.npkts;
	/*
	 * Copy any TID info.
	 * User space will provide the TID info only when the
	 * request type is EXPECTED. This is true even if there is
	 * only one packet in the request and the header is already
	 * setup. The reason for the singular TID case is that the
	 * driver needs to perform safety checks.
	 */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
		u32 *tmp;

		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
			ret = -EINVAL;
			goto free_req;
		}

		/*
		 * We have to copy all of the tids because they may vary
		 * in size and, therefore, the TID count might not be
		 * equal to the pkt count. However, there is no way to
		 * tell at this point.
		 */
		tmp = memdup_user(iovec[idx].iov_base,
				  ntids * sizeof(*req->tids));
		if (IS_ERR(tmp)) {
			ret = PTR_ERR(tmp);
			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				 ntids, ret);
			goto free_req;
		}
		req->tids = tmp;
		req->n_tids = ntids;
		req->tididx = 0;
		idx++;
	}

	dlid = be16_to_cpu(req->hdr.lrh[1]);
	selector = dlid_to_selector(dlid);
	selector += uctxt->ctxt + fd->subctxt;
	req->sde = sdma_select_user_engine(dd, selector, vl);

	if (!req->sde || !sdma_running(req->sde)) {
		ret = -ECOMM;
		goto free_req;
	}

	/* We don't need an AHG entry if the request contains only one packet */
	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
		req->ahg_idx = sdma_ahg_alloc(req->sde);

	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
	pq->state = SDMA_PKT_Q_ACTIVE;
	/* Send the first N packets in the request to buy us some time */
	ret = user_sdma_send_pkts(req, pcount);
	if (unlikely(ret < 0 && ret != -EBUSY))
		goto free_req;

	/*
	 * This is a somewhat blocking send implementation.
	 * The driver will block the caller until all packets of the
	 * request have been submitted to the SDMA engine. However, it
	 * will not wait for send completions.
	 */
	while (req->seqsubmitted != req->info.npkts) {
		ret = user_sdma_send_pkts(req, pcount);
		if (ret < 0) {
			if (ret != -EBUSY)
				goto free_req;
			wait_event_interruptible_timeout(
				pq->busy.wait_dma,
				(pq->state == SDMA_PKT_Q_ACTIVE),
				msecs_to_jiffies(
					SDMA_IOWAIT_TIMEOUT));
		}
	}
	*count += idx;
	return 0;
free_req:
	/*
	 * If seqsubmitted == npkts, the completion routine controls the
	 * final state. If seqsubmitted < npkts, wait for any
	 * outstanding packets to finish before cleaning up.
	 */
	if (req->seqsubmitted < req->info.npkts) {
		if (req->seqsubmitted)
			wait_event(pq->busy.wait_dma,
				   (req->seqcomp == req->seqsubmitted - 1));
		user_sdma_free_request(req, true);
		pq_update(pq);
		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
	}
	return ret;
}

static inline u32 compute_data_length(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx)
{
	/*
	 * Determine the proper size of the packet data.
	 * The size of the data of the first packet is in the header
	 * template. However, it includes the header and ICRC, which need
	 * to be subtracted.
	 * The minimum representable packet data length in a header is 4 bytes,
	 * therefore, when the data length request is less than 4 bytes, there's
	 * only one packet, and the packet data length is equal to that of the
	 * request data length.
	 * The size of the remaining packets is the minimum of the frag
	 * size (MTU) or remaining data in the request.
	 */
	u32 len;

	if (!req->seqnum) {
		if (req->data_len < sizeof(u32))
			len = req->data_len;
		else
			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
			       (sizeof(tx->hdr) - 4));
	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
			     PAGE_SIZE;
		/*
		 * Get the data length based on the remaining space in the
		 * TID pair.
		 */
		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
		/* If we've filled up the TID pair, move to the next one. */
		if (unlikely(!len) && ++req->tididx < req->n_tids &&
		    req->tids[req->tididx]) {
			tidlen = EXP_TID_GET(req->tids[req->tididx],
					     LEN) * PAGE_SIZE;
			req->tidoffset = 0;
			len = min_t(u32, tidlen, req->info.fragsize);
		}
		/*
		 * Since the TID pairs map entire pages, make sure that we
		 * are not going to try to send more data than we have
		 * remaining.
		 */
		len = min(len, req->data_len - req->sent);
	} else {
		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
	}
	trace_hfi1_sdma_user_compute_length(req->pq->dd,
					    req->pq->ctxt,
					    req->pq->subctxt,
					    req->info.comp_idx,
					    len);
	return len;
}

static inline u32 pad_len(u32 len)
{
	if (len & (sizeof(u32) - 1))
		len += sizeof(u32) - (len & (sizeof(u32) - 1));
	return len;
}

static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
}

static int user_sdma_txadd_ahg(struct user_sdma_request *req,
			       struct user_sdma_txreq *tx,
			       u32 datalen)
{
	int ret;
	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	/*
	 * Copy the request header into the tx header
	 * because the HW needs a cacheline-aligned
	 * address.
	 * This copy can be optimized out if the hdr
	 * member of user_sdma_request were also
	 * cacheline aligned.
	 */
	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
	}
	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
	if (ret)
		return ret;
	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
			      sizeof(tx->hdr) + datalen, req->ahg_idx,
			      0, NULL, 0, user_sdma_txreq_cb);
	if (ret)
		return ret;
	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
	if (ret)
		sdma_txclean(pq->dd, &tx->txreq);
	return ret;
}

static int user_sdma_txadd(struct user_sdma_request *req,
			   struct user_sdma_txreq *tx,
			   struct user_sdma_iovec *iovec, u32 datalen,
			   u32 *queued_ptr, u32 *data_sent_ptr,
			   u64 *iov_offset_ptr)
{
	int ret;
	unsigned int pageidx, len;
	unsigned long base, offset;
	u64 iov_offset = *iov_offset_ptr;
	u32 queued = *queued_ptr, data_sent = *data_sent_ptr;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	base = (unsigned long)iovec->iov.iov_base;
	offset = offset_in_page(base + iovec->offset + iov_offset);
	pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
		   PAGE_SHIFT);
	len = offset + req->info.fragsize > PAGE_SIZE ?
	      PAGE_SIZE - offset : req->info.fragsize;
	len = min((datalen - queued), len);
	ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
			      offset, len);
	if (ret) {
		SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
		return ret;
	}
	iov_offset += len;
	queued += len;
	data_sent += len;
	if (unlikely(queued < datalen && pageidx == iovec->npages &&
		     req->iov_idx < req->data_iovs - 1)) {
		iovec->offset += iov_offset;
		iovec = &req->iovs[++req->iov_idx];
		iov_offset = 0;
	}

	*queued_ptr = queued;
	*data_sent_ptr = data_sent;
	*iov_offset_ptr = iov_offset;
	return ret;
}

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
{
	int ret = 0;
	u16 count;
	unsigned npkts = 0;
	struct user_sdma_txreq *tx = NULL;
	struct hfi1_user_sdma_pkt_q *pq = NULL;
	struct user_sdma_iovec *iovec = NULL;

	if (!req->pq)
		return -EINVAL;

	pq = req->pq;

	/* If tx completion has reported an error, we are done. */
	if (READ_ONCE(req->has_error))
		return -EFAULT;

	/*
	 * Check if we might have sent the entire request already
	 */
	if (unlikely(req->seqnum == req->info.npkts)) {
		if (!list_empty(&req->txps))
			goto dosend;
		return ret;
	}

	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
		maxpkts = req->info.npkts - req->seqnum;

	while (npkts < maxpkts) {
		u32 datalen = 0, queued = 0, data_sent = 0;
		u64 iov_offset = 0;

		/*
		 * Check whether any of the completions have come back
		 * with errors. If so, we are not going to process any
		 * more packets from this request.
		 */
		if (READ_ONCE(req->has_error))
			return -EFAULT;

		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
		if (!tx)
			return -ENOMEM;

		tx->flags = 0;
		tx->req = req;
		INIT_LIST_HEAD(&tx->list);

		/*
		 * For the last packet set the ACK request
		 * and disable header suppression.
		 */
		if (req->seqnum == req->info.npkts - 1)
			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
				      TXREQ_FLAGS_REQ_DISABLE_SH);

		/*
		 * Calculate the payload size - this is min of the fragment
		 * (MTU) size or the remaining bytes in the request but only
		 * if we have payload data.
		 */
		if (req->data_len) {
			iovec = &req->iovs[req->iov_idx];
			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
				if (++req->iov_idx == req->data_iovs) {
					ret = -EFAULT;
					goto free_tx;
				}
				iovec = &req->iovs[req->iov_idx];
				WARN_ON(iovec->offset);
			}

			datalen = compute_data_length(req, tx);

			/*
			 * Disable header suppression for the payload <= 8DWS.
			 * If there is an uncorrectable error in the receive
			 * data FIFO when the received payload size is less than
			 * or equal to 8DWS then the RxDmaDataFifoRdUncErr is
			 * not reported. RHF.EccErr is set instead if the header
			 * is not suppressed.
			 */
			if (!datalen) {
				SDMA_DBG(req,
					 "Request has data but pkt len is 0");
				ret = -EFAULT;
				goto free_tx;
			} else if (datalen <= 32) {
				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
			}
		}

		if (req->ahg_idx >= 0) {
			if (!req->seqnum) {
				ret = user_sdma_txadd_ahg(req, tx, datalen);
				if (ret)
					goto free_tx;
			} else {
				int changes;

				changes = set_txreq_header_ahg(req, tx,
							       datalen);
				if (changes < 0) {
					ret = changes;
					goto free_tx;
				}
			}
		} else {
			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
					  datalen, user_sdma_txreq_cb);
			if (ret)
				goto free_tx;
			/*
			 * Modify the header for this packet. This only needs
			 * to be done if we are not going to use AHG. Otherwise,
			 * the HW will do it based on the changes we gave it
			 * during sdma_txinit_ahg().
			 */
			ret = set_txreq_header(req, tx, datalen);
			if (ret)
				goto free_txreq;
		}

		/*
		 * If the request contains any data vectors, add up to
		 * fragsize bytes to the descriptor.
		 */
		while (queued < datalen &&
		       (req->sent + data_sent) < req->data_len) {
			ret = user_sdma_txadd(req, tx, iovec, datalen,
					      &queued, &data_sent, &iov_offset);
			if (ret)
				goto free_txreq;
		}
		/*
		 * The txreq was submitted successfully so we can update
		 * the counters.
		 */
		req->koffset += datalen;
		if (req_opcode(req->info.ctrl) == EXPECTED)
			req->tidoffset += datalen;
		req->sent += data_sent;
		if (req->data_len)
			iovec->offset += iov_offset;
		list_add_tail(&tx->txreq.list, &req->txps);
		/*
		 * It is important to increment this here as it is used to
		 * generate the BTH.PSN and, therefore, can't be bulk-updated
		 * outside of the loop.
		 */
		tx->seqnum = req->seqnum++;
		npkts++;
	}
dosend:
	ret = sdma_send_txlist(req->sde,
			       iowait_get_ib_work(&pq->busy),
			       &req->txps, &count);
	req->seqsubmitted += count;
	if (req->seqsubmitted == req->info.npkts) {
		/*
		 * The txreq has already been submitted to the HW queue
		 * so we can free the AHG entry now. Corruption will not
		 * happen due to the sequential manner in which
		 * descriptors are processed.
		 */
		if (req->ahg_idx >= 0)
			sdma_ahg_free(req->sde, req->ahg_idx);
	}
	return ret;

free_txreq:
	sdma_txclean(pq->dd, &tx->txreq);
free_tx:
	kmem_cache_free(pq->txreq_cache, tx);
	return ret;
}

static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{
	struct evict_data evict_data;

	evict_data.cleared = 0;
	evict_data.target = npages;
	hfi1_mmu_rb_evict(pq->handler, &evict_data);
	return evict_data.cleared;
}

static int pin_sdma_pages(struct user_sdma_request *req,
			  struct user_sdma_iovec *iovec,
			  struct sdma_mmu_node *node,
			  int npages)
{
	int pinned, cleared;
	struct page **pages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;
	memcpy(pages, node->pages, node->npages * sizeof(*pages));

	npages -= node->npages;
retry:
	if (!hfi1_can_pin_pages(pq->dd, pq->mm,
				atomic_read(&pq->n_locked), npages)) {
		cleared = sdma_cache_evict(pq, npages);
		if (cleared >= npages)
			goto retry;
	}
	pinned = hfi1_acquire_user_pages(pq->mm,
					 ((unsigned long)iovec->iov.iov_base +
					  (node->npages * PAGE_SIZE)), npages, 0,
					 pages + node->npages);
	if (pinned < 0) {
		kfree(pages);
		return pinned;
	}
	if (pinned != npages) {
		unpin_vector_pages(pq->mm, pages, node->npages, pinned);
		return -EFAULT;
	}
	kfree(node->pages);
	node->rb.len = iovec->iov.iov_len;
	node->pages = pages;
	atomic_add(pinned, &pq->n_locked);
	return pinned;
}

static void unpin_sdma_pages(struct sdma_mmu_node *node)
{
	if (node->npages) {
		unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);
		atomic_sub(node->npages, &node->pq->n_locked);
	}
}

static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec)
{
	int ret = 0, pinned, npages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct sdma_mmu_node *node = NULL;
	struct mmu_rb_node *rb_node;
	struct iovec *iov;
	bool extracted;

	extracted =
		hfi1_mmu_rb_remove_unless_exact(pq->handler,
						(unsigned long)
						iovec->iov.iov_base,
						iovec->iov.iov_len, &rb_node);
	if (rb_node) {
		node = container_of(rb_node, struct sdma_mmu_node, rb);
		if (!extracted) {
			atomic_inc(&node->refcount);
			iovec->pages = node->pages;
			iovec->npages = node->npages;
			iovec->node = node;
			return 0;
		}
	}

	if (!node) {
		node = kzalloc(sizeof(*node), GFP_KERNEL);
		if (!node)
			return -ENOMEM;

		node->rb.addr = (unsigned long)iovec->iov.iov_base;
		node->pq = pq;
		atomic_set(&node->refcount, 0);
	}

	iov = &iovec->iov;
	npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
	if (node->npages < npages) {
		pinned = pin_sdma_pages(req, iovec, node, npages);
		if (pinned < 0) {
			ret = pinned;
			goto bail;
		}
		node->npages += pinned;
		npages = node->npages;
	}
	iovec->pages = node->pages;
	iovec->npages = npages;
	iovec->node = node;

	ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
	if (ret) {
		iovec->node = NULL;
		goto bail;
	}
	return 0;
bail:
	unpin_sdma_pages(node);
	kfree(node);
	return ret;
}

static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages)
{
	hfi1_release_user_pages(mm, pages + start, npages, false);
	kfree(pages);
}

static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen)
{
	/*
	 * Perform safety checks for any type of packet:
	 * - transfer size is multiple of 64 bytes
	 * - packet length is multiple of 4 bytes
	 * - packet length is not larger than MTU size
	 *
	 * These checks are only done for the first packet of the
	 * transfer since the header is "given" to us by user space.
	 * For the remainder of the packets we compute the values.
	 */
	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
		return -EINVAL;

	if (req_opcode(req->info.ctrl) == EXPECTED) {
		/*
		 * The header is checked only on the first packet. Furthermore,
		 * we ensure that at least one TID entry is copied when the
		 * request is submitted. Therefore, we don't have to verify that
		 * tididx points to something sane.
		 */
		u32 tidval = req->tids[req->tididx],
			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
			tididx = EXP_TID_GET(tidval, IDX),
			tidctrl = EXP_TID_GET(tidval, CTRL),
			tidoff;
		__le32 kval = hdr->kdeth.ver_tid_offset;

		tidoff = KDETH_GET(kval, OFFSET) *
			 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			  KDETH_OM_LARGE : KDETH_OM_SMALL);
		/*
		 * Expected receive packets have the following
		 * additional checks:
		 * - offset is not larger than the TID size
		 * - TIDCtrl values match between header and TID array
		 * - TID indexes match between header and TID array
		 */
		if ((tidoff + datalen > tidlen) ||
		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
		    KDETH_GET(kval, TID) != tididx)
			return -EINVAL;
	}
	return 0;
}

/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
	u32 val = be32_to_cpu(bthpsn),
		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
			0xffffffull),
		psn = val & mask;
	if (expct)
		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
	else
		psn = psn + frags;
	return psn & mask;
}

static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &tx->hdr;
	u8 omfactor; /* KDETH.OM */
	u16 pbclen;
	int ret;
	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

	/* Copy the header template from the request before modification */
	memcpy(hdr, &req->hdr, sizeof(*hdr));

	/*
	 * Check if the PBC and LRH length are mismatched. If so
	 * adjust both in the header.
	 */
	pbclen = le16_to_cpu(hdr->pbc[0]);
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		hdr->pbc[0] = cpu_to_le16(pbclen);
		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
		/*
		 * Third packet
		 * This is the first packet in the sequence that has
		 * a "static" size that can be used for the rest of
		 * the packets (besides the last one).
		 */
		if (unlikely(req->seqnum == 2)) {
			/*
			 * From this point on the lengths in both the
			 * PBC and LRH are the same until the last
			 * packet.
			 * Adjust the template so we don't have to update
			 * every packet
			 */
			req->hdr.pbc[0] = hdr->pbc[0];
			req->hdr.lrh[2] = hdr->lrh[2];
		}
	}
	/*
	 * We only have to modify the header if this is not the
	 * first packet in the request. Otherwise, we use the
	 * header given to us.
	 */
	if (unlikely(!req->seqnum)) {
		ret = check_header_template(req, hdr, lrhlen, datalen);
		if (ret)
			return ret;
		goto done;
	}

	hdr->bth[2] = cpu_to_be32(
		set_pkt_bth_psn(hdr->bth[2],
				(req_opcode(req->info.ctrl) == EXPECTED),
				req->seqnum));

	/* Set ACK request on last packet */
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		hdr->bth[2] |= cpu_to_be32(1UL << 31);

	/* Set the new offset */
	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
	/* Expected packets have to fill in the new TID information */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		tidval = req->tids[req->tididx];
		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
				KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
				KDETH_OM_SMALL_SHIFT;
		/* Set KDETH.TIDCtrl based on value for this TID. */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
			  EXP_TID_GET(tidval, CTRL));
		/* Set KDETH.TID based on value for this TID */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
			  EXP_TID_GET(tidval, IDX));
		/* Clear KDETH.SH when DISABLE_SH flag is set */
		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
		/*
		 * Set the KDETH.OFFSET and KDETH.OM based on size of
		 * transfer.
		 */
		trace_hfi1_sdma_user_tid_info(
			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
			req->tidoffset, req->tidoffset >> omfactor,
			omfactor != KDETH_OM_SMALL_SHIFT);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
			  req->tidoffset >> omfactor);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
			  omfactor != KDETH_OM_SMALL_SHIFT);
	}
done:
	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
				    req->info.comp_idx, hdr, tidval);
	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}

static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 datalen)
{
	u32 ahg[AHG_KDETH_ARRAY_SIZE];
	int idx = 0;
	u8 omfactor; /* KDETH.OM */
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &req->hdr;
	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
	size_t array_size = ARRAY_SIZE(ahg);

	if (PBC2LRH(pbclen) != lrhlen) {
		/* PBC.PbcLengthDWs */
		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
		if (idx < 0)
			return idx;
		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
				     (__force u16)cpu_to_be16(lrhlen >> 2));
		if (idx < 0)
			return idx;
	}

	/*
	 * Do the common updates
	 */
	/* BTH.PSN and BTH.A */
	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		val32 |= 1UL << 31;
	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
			     (__force u16)cpu_to_be16(val32 >> 16));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
			     (__force u16)cpu_to_be16(val32 & 0xffff));
	if (idx < 0)
		return idx;
	/* KDETH.Offset */
	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
			     (__force u16)cpu_to_le16(req->koffset >> 16));
	if (idx < 0)
		return idx;
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		__le16 val;

		tidval = req->tids[req->tididx];

		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx])
				return -EINVAL;
			tidval = req->tids[req->tididx];
		}
		omfactor = ((EXP_TID_GET(tidval, LEN) *
			     PAGE_SIZE) >=
			     KDETH_OM_MAX_SIZE) ?
			     KDETH_OM_LARGE_SHIFT :
			     KDETH_OM_SMALL_SHIFT;
		/* KDETH.OM and KDETH.OFFSET (TID) */
		idx = ahg_header_set(
			ahg, idx, array_size, 7, 0, 16,
			((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
			 ((req->tidoffset >> omfactor)
			  & 0x7fff)));
		if (idx < 0)
			return idx;
		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
				  (EXP_TID_GET(tidval, IDX) & 0x3ff));

		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		} else {
			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		}

		idx = ahg_header_set(ahg, idx, array_size,
				     7, 16, 14, (__force u16)val);
		if (idx < 0)
			return idx;
	}

	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
					req->info.comp_idx, req->sde->this_idx,
					req->ahg_idx, ahg, idx, tidval);
	sdma_txinit_ahg(&tx->txreq,
			SDMA_TXREQ_F_USE_AHG,
			datalen, req->ahg_idx, idx,
			ahg, sizeof(req->hdr),
			user_sdma_txreq_cb);

	return idx;
}

/**
 * user_sdma_txreq_cb() - SDMA tx request completion callback.
 * @txreq: valid sdma tx request
 * @status: success/failure of request
 *
 * Called when the SDMA progress state machine gets notification that
 * the SDMA descriptors for this tx request have been processed by the
 * DMA engine. Called in interrupt context.
 * Only do work on completed sequences.
 */
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);
	struct user_sdma_request *req;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	enum hfi1_sdma_comp_state state = COMPLETE;

	if (!tx->req)
		return;

	req = tx->req;
	pq = req->pq;
	cq = req->cq;

	if (status != SDMA_TXREQ_S_OK) {
		SDMA_DBG(req, "SDMA completion with error %d",
			 status);
		WRITE_ONCE(req->has_error, 1);
		state = ERROR;
	}

	req->seqcomp = tx->seqnum;
	kmem_cache_free(pq->txreq_cache, tx);

	/* sequence isn't complete? We are done */
	if (req->seqcomp != req->info.npkts - 1)
		return;

	user_sdma_free_request(req, false);
	set_comp_state(pq, cq, req->info.comp_idx, state, status);
	pq_update(pq);
}

static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
{
	if (atomic_dec_and_test(&pq->n_reqs))
		wake_up(&pq->wait);
}

static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
{
	int i;

	if (!list_empty(&req->txps)) {
		struct sdma_txreq *t, *p;

		list_for_each_entry_safe(t, p, &req->txps, list) {
			struct user_sdma_txreq *tx =
				container_of(t, struct user_sdma_txreq, txreq);
			list_del_init(&t->list);
			sdma_txclean(req->pq->dd, t);
			kmem_cache_free(req->pq->txreq_cache, tx);
		}
	}

	for (i = 0; i < req->data_iovs; i++) {
		struct sdma_mmu_node *node = req->iovs[i].node;

		if (!node)
			continue;

		req->iovs[i].node = NULL;

		if (unpin)
			hfi1_mmu_rb_remove(req->pq->handler,
					   &node->rb);
		else
			atomic_dec(&node->refcount);
	}

	kfree(req->tids);
	clear_bit(req->info.comp_idx, req->pq->req_in_use);
}

static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret)
{
	if (state == ERROR)
		cq->comps[idx].errcode = -ret;
	smp_wmb(); /* make sure errcode is visible first */
	cq->comps[idx].status = state;
	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
					idx, state, ret);
}

static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len)
{
	return (bool)(node->addr == addr);
}

static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	atomic_inc(&node->refcount);
	return 0;
}

/*
 * Return 1 to remove the node from the rb tree and call the remove op.
 *
 * Called with the rb tree lock held.
 */
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *evict_arg, bool *stop)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);
	struct evict_data *evict_data = evict_arg;

	/* is this node still being used? */
	if (atomic_read(&node->refcount))
		return 0; /* keep this node */

	/* this node will be evicted, add its pages to our count */
	evict_data->cleared += node->npages;

	/* have enough pages been cleared? */
	if (evict_data->cleared >= evict_data->target)
		*stop = true;

	return 1; /* remove this node */
}

static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	unpin_sdma_pages(node);
	kfree(node);
}

static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	if (!atomic_read(&node->refcount))
		return 1;
	return 0;
}