/*
 * Copyright(c) 2020 - Cornelis Networks, Inc.
 * Copyright(c) 2015 - 2018 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>

#include "hfi.h"
#include "sdma.h"
#include "mmu_rb.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

static unsigned initial_pkt_count = 8;

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec);
static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages);
static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen);
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen);
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 len);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret);
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent);
static void activate_packet_queue(struct iowait *wait, int reason);
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len);
static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *arg2, bool *stop);
static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);

static struct mmu_rb_ops sdma_rb_ops = {
	.filter = sdma_rb_filter,
	.insert = sdma_rb_insert,
	.evict = sdma_rb_evict,
	.remove = sdma_rb_remove,
	.invalidate = sdma_rb_invalidate
};

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);

	write_seqlock(&sde->waitlock);
	if (sdma_progress(sde, seq, txreq))
		goto eagain;
	/*
	 * We are assuming that if the list is enqueued somewhere, it
	 * is to the dmawait list since that is the only place where
	 * it is supposed to be enqueued.
	 */
	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
	if (list_empty(&pq->busy.list)) {
		pq->busy.lock = &sde->waitlock;
		iowait_get_priority(&pq->busy);
		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
	}
	write_sequnlock(&sde->waitlock);
	return -EBUSY;
eagain:
	write_sequnlock(&sde->waitlock);
	return -EAGAIN;
}

static void activate_packet_queue(struct iowait *wait, int reason)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
	pq->busy.lock = NULL;
	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
	wake_up(&wait->wait_dma);
};

int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
				struct hfi1_filedata *fd)
{
	int ret = -ENOMEM;
	char buf[64];
	struct hfi1_devdata *dd;
	struct hfi1_user_sdma_comp_q *cq;
	struct hfi1_user_sdma_pkt_q *pq;

	if (!uctxt || !fd)
		return -EBADF;

	if (!hfi1_sdma_comp_ring_size)
		return -EINVAL;

	dd = uctxt->dd;

	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
	if (!pq)
		return -ENOMEM;
	pq->dd = dd;
	pq->ctxt = uctxt->ctxt;
	pq->subctxt = fd->subctxt;
	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
	atomic_set(&pq->n_reqs, 0);
	init_waitqueue_head(&pq->wait);
	atomic_set(&pq->n_locked, 0);

	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
		    activate_packet_queue, NULL, NULL);
	pq->reqidx = 0;

	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
			   sizeof(*pq->reqs),
			   GFP_KERNEL);
	if (!pq->reqs)
		goto pq_reqs_nomem;

	pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size),
				 sizeof(*pq->req_in_use),
				 GFP_KERNEL);
	if (!pq->req_in_use)
		goto pq_reqs_no_in_use;

	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
		 fd->subctxt);
	pq->txreq_cache = kmem_cache_create(buf,
					    sizeof(struct user_sdma_txreq),
					    L1_CACHE_BYTES,
					    SLAB_HWCACHE_ALIGN,
					    NULL);
	if (!pq->txreq_cache) {
		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
			   uctxt->ctxt);
		goto pq_txreq_nomem;
	}

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		goto cq_nomem;

	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
				 * hfi1_sdma_comp_ring_size));
	if (!cq->comps)
		goto cq_comps_nomem;

	cq->nentries = hfi1_sdma_comp_ring_size;

	ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq,
				   &pq->handler);
	if (ret) {
		dd_dev_err(dd, "Failed to register with MMU %d", ret);
		goto pq_mmu_fail;
	}

	rcu_assign_pointer(fd->pq, pq);
	fd->cq = cq;

	return 0;

pq_mmu_fail:
	vfree(cq->comps);
cq_comps_nomem:
	kfree(cq);
cq_nomem:
	kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
	kfree(pq->req_in_use);
pq_reqs_no_in_use:
	kfree(pq->reqs);
pq_reqs_nomem:
	kfree(pq);

	return ret;
}

/* Remove the packet queue from the SDMA engine's dmawait list, if queued. */
static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
{
	unsigned long flags;
	seqlock_t *lock = pq->busy.lock;

	if (!lock)
		return;
	write_seqlock_irqsave(lock, flags);
	if (!list_empty(&pq->busy.list)) {
		list_del_init(&pq->busy.list);
		pq->busy.lock = NULL;
	}
	write_sequnlock_irqrestore(lock, flags);
}

int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
			       struct hfi1_ctxtdata *uctxt)
{
	struct hfi1_user_sdma_pkt_q *pq;

	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);

	spin_lock(&fd->pq_rcu_lock);
	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
				    lockdep_is_held(&fd->pq_rcu_lock));
	if (pq) {
		rcu_assign_pointer(fd->pq, NULL);
		spin_unlock(&fd->pq_rcu_lock);
		synchronize_srcu(&fd->pq_srcu);
		/* at this point there can be no more new requests */
		if (pq->handler)
			hfi1_mmu_rb_unregister(pq->handler);
		iowait_sdma_drain(&pq->busy);
		/* Wait until all requests have been freed. */
		wait_event_interruptible(
			pq->wait,
			!atomic_read(&pq->n_reqs));
		kfree(pq->reqs);
		kfree(pq->req_in_use);
		kmem_cache_destroy(pq->txreq_cache);
		flush_pq_iowait(pq);
		kfree(pq);
	} else {
		spin_unlock(&fd->pq_rcu_lock);
	}
	if (fd->cq) {
		vfree(fd->cq->comps);
		kfree(fd->cq);
		fd->cq = NULL;
	}
	return 0;
}

/*
 * Map a DLID to a small selector value; the first time a given DLID hash
 * is seen it is assigned the next selector in round-robin order.
 */
static u8 dlid_to_selector(u16 dlid)
{
	static u8 mapping[256];
	static int initialized;
	static u8 next;
	int hash;

	if (!initialized) {
		memset(mapping, 0xFF, 256);
		initialized = 1;
	}

	hash = ((dlid >> 8) ^ dlid) & 0xFF;
	if (mapping[hash] == 0xFF) {
		mapping[hash] = next;
		next = (next + 1) & 0x7F;
	}

	return mapping[hash];
}

/**
 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 * @fd: valid file descriptor
 * @iovec: array of io vectors to process
 * @dim: overall iovec array size
 * @count: number of io vector array entries processed
 */
int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
				   struct iovec *iovec, unsigned long dim,
				   unsigned long *count)
{
	int ret = 0, i;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq =
		srcu_dereference(fd->pq, &fd->pq_srcu);
	struct hfi1_user_sdma_comp_q *cq = fd->cq;
	struct hfi1_devdata *dd = pq->dd;
	unsigned long idx = 0;
	u8 pcount = initial_pkt_count;
	struct sdma_req_info info;
	struct user_sdma_request *req;
	u8 opcode, sc, vl;
	u16 pkey;
	u32 slid;
	u16 dlid;
	u32 selector;

	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
		hfi1_cdbg(
			SDMA,
			"[%u:%u:%u] First vector not big enough for header %lu/%lu",
			dd->unit, uctxt->ctxt, fd->subctxt,
			iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
		return -EINVAL;
	}
	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
	if (ret) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
		return -EFAULT;
	}

	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
				     (u16 *)&info);
	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid comp index",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/*
	 * Sanity check the header io vector count. Need at least 1 vector
	 * (header) and cannot be larger than the actual io vector count.
	 */
	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
			  req_iovcnt(info.ctrl), dim);
		return -EINVAL;
	}

	if (!info.fragsize) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Request does not specify fragsize",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/* Try to claim the request. */
	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
			  dd->unit, uctxt->ctxt, fd->subctxt,
			  info.comp_idx);
		return -EBADSLT;
	}
	/*
	 * All safety checks have been done and this request has been claimed.
	 */
	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
					     info.comp_idx);
	req = pq->reqs + info.comp_idx;
	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
	req->data_len = 0;
	req->pq = pq;
	req->cq = cq;
	req->ahg_idx = -1;
	req->iov_idx = 0;
	req->sent = 0;
	req->seqnum = 0;
	req->seqcomp = 0;
	req->seqsubmitted = 0;
	req->tids = NULL;
	req->has_error = 0;
	INIT_LIST_HEAD(&req->txps);

	memcpy(&req->info, &info, sizeof(info));

	/* The request is initialized, count it */
	atomic_inc(&pq->n_reqs);

	if (req_opcode(info.ctrl) == EXPECTED) {
		/* expected must have a TID info and at least one data vector */
		if (req->data_iovs < 2) {
			SDMA_DBG(req,
				 "Not enough vectors for expected request");
			ret = -EINVAL;
			goto free_req;
		}
		req->data_iovs--;
	}

	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
			 MAX_VECTORS_PER_REQ);
		ret = -EINVAL;
		goto free_req;
	}
	/* Copy the header from the user buffer */
	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
			     sizeof(req->hdr));
	if (ret) {
		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
		ret = -EFAULT;
		goto free_req;
	}

	/* If Static rate control is not enabled, sanitize the header. */
	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
		req->hdr.pbc[2] = 0;

	/* Validate the opcode. Do not trust packets from user space blindly. */
	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
	if ((opcode & USER_OPCODE_CHECK_MASK) !=
	    USER_OPCODE_CHECK_VAL) {
		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
		ret = -EINVAL;
		goto free_req;
	}
	/*
	 * Validate the vl. Do not trust packets from user space blindly.
	 * VL comes from PBC, SC comes from LRH, and the VL needs to
	 * match the SC look up.
	 */
	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
	if (vl >= dd->pport->vls_operational ||
	    vl != sc_to_vlt(dd, sc)) {
		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
		ret = -EINVAL;
		goto free_req;
	}

	/* Checking P_KEY for requests from user-space */
	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
	slid = be16_to_cpu(req->hdr.lrh[3]);
	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
		ret = -EINVAL;
		goto free_req;
	}

	/*
	 * Also should check the BTH.lnh. If it says the next header is GRH then
	 * the RXE parsing will be off and will land in the middle of the KDETH
	 * or miss it entirely.
	 */
	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
		SDMA_DBG(req, "User tried to pass in a GRH");
		ret = -EINVAL;
		goto free_req;
	}

	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
	/*
	 * Calculate the initial TID offset based on the values of
	 * KDETH.OFFSET and KDETH.OM that are passed in.
	 */
	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
			 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			  KDETH_OM_LARGE : KDETH_OM_SMALL);
	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
					       info.comp_idx, req->tidoffset);
	idx++;

	/* Save all the IO vector structures */
	for (i = 0; i < req->data_iovs; i++) {
		req->iovs[i].offset = 0;
		INIT_LIST_HEAD(&req->iovs[i].list);
		memcpy(&req->iovs[i].iov,
		       iovec + idx++,
		       sizeof(req->iovs[i].iov));
		ret = pin_vector_pages(req, &req->iovs[i]);
		if (ret) {
			req->data_iovs = i;
			goto free_req;
		}
		req->data_len += req->iovs[i].iov.iov_len;
	}
	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
					 info.comp_idx, req->data_len);
	if (pcount > req->info.npkts)
		pcount = req->info.npkts;
	/*
	 * Copy any TID info
	 * User space will provide the TID info only when the
	 * request type is EXPECTED. This is true even if there is
	 * only one packet in the request and the header is already
	 * set up. The reason for the singular TID case is that the
	 * driver needs to perform safety checks.
	 */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
		u32 *tmp;

		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
			ret = -EINVAL;
			goto free_req;
		}

		/*
		 * We have to copy all of the tids because they may vary
		 * in size and, therefore, the TID count might not be
		 * equal to the pkt count. However, there is no way to
		 * tell at this point.
		 */
		tmp = memdup_user(iovec[idx].iov_base,
				  ntids * sizeof(*req->tids));
		if (IS_ERR(tmp)) {
			ret = PTR_ERR(tmp);
			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				 ntids, ret);
			goto free_req;
		}
		req->tids = tmp;
		req->n_tids = ntids;
		req->tididx = 0;
		idx++;
	}

	dlid = be16_to_cpu(req->hdr.lrh[1]);
	selector = dlid_to_selector(dlid);
	selector += uctxt->ctxt + fd->subctxt;
	req->sde = sdma_select_user_engine(dd, selector, vl);

	if (!req->sde || !sdma_running(req->sde)) {
		ret = -ECOMM;
		goto free_req;
	}

	/* We don't need an AHG entry if the request contains only one packet */
	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
		req->ahg_idx = sdma_ahg_alloc(req->sde);

	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
	pq->state = SDMA_PKT_Q_ACTIVE;

	/*
	 * This is a somewhat blocking send implementation.
	 * The driver will block the caller until all packets of the
	 * request have been submitted to the SDMA engine. However, it
	 * will not wait for send completions.
	 */
	while (req->seqsubmitted != req->info.npkts) {
		ret = user_sdma_send_pkts(req, pcount);
		if (ret < 0) {
			if (ret != -EBUSY)
				goto free_req;
			if (wait_event_interruptible_timeout(
				pq->busy.wait_dma,
				pq->state == SDMA_PKT_Q_ACTIVE,
				msecs_to_jiffies(
					SDMA_IOWAIT_TIMEOUT)) <= 0)
				flush_pq_iowait(pq);
		}
	}
	*count += idx;
	return 0;
free_req:
	/*
	 * If seqsubmitted == npkts, the completion routine controls the
	 * final state. If seqsubmitted < npkts, wait for any
	 * outstanding packets to finish before cleaning up.
	 */
	if (req->seqsubmitted < req->info.npkts) {
		if (req->seqsubmitted)
			wait_event(pq->busy.wait_dma,
				   (req->seqcomp == req->seqsubmitted - 1));
		user_sdma_free_request(req, true);
		pq_update(pq);
		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
	}
	return ret;
}

static inline u32 compute_data_length(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx)
{
	/*
	 * Determine the proper size of the packet data.
	 * The size of the data of the first packet is in the header
	 * template. However, it includes the header and ICRC, which need
	 * to be subtracted.
	 * The minimum representable packet data length in a header is 4 bytes,
	 * therefore, when the data length request is less than 4 bytes, there's
	 * only one packet, and the packet data length is equal to that of the
	 * request data length.
	 * The size of the remaining packets is the minimum of the frag
	 * size (MTU) or remaining data in the request.
	 */
	u32 len;

	if (!req->seqnum) {
		if (req->data_len < sizeof(u32))
			len = req->data_len;
		else
			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
			       (sizeof(tx->hdr) - 4));
	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
			PAGE_SIZE;
		/*
		 * Get the data length based on the remaining space in the
		 * TID pair.
		 */
		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
		/* If we've filled up the TID pair, move to the next one. */
		if (unlikely(!len) && ++req->tididx < req->n_tids &&
		    req->tids[req->tididx]) {
			tidlen = EXP_TID_GET(req->tids[req->tididx],
					     LEN) * PAGE_SIZE;
			req->tidoffset = 0;
			len = min_t(u32, tidlen, req->info.fragsize);
		}
		/*
		 * Since the TID pairs map entire pages, make sure that we
		 * are not going to try to send more data than we have
		 * remaining.
		 */
		len = min(len, req->data_len - req->sent);
	} else {
		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
	}
	trace_hfi1_sdma_user_compute_length(req->pq->dd,
					    req->pq->ctxt,
					    req->pq->subctxt,
					    req->info.comp_idx,
					    len);
	return len;
}

static inline u32 pad_len(u32 len)
{
	if (len & (sizeof(u32) - 1))
		len += sizeof(u32) - (len & (sizeof(u32) - 1));
	return len;
}

static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
}

static int user_sdma_txadd_ahg(struct user_sdma_request *req,
			       struct user_sdma_txreq *tx,
			       u32 datalen)
{
	int ret;
	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	/*
	 * Copy the request header into the tx header
	 * because the HW needs a cacheline-aligned
	 * address.
	 * This copy can be optimized out if the hdr
	 * member of user_sdma_request were also
	 * cacheline aligned.
	 */
	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
	}
	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
	if (ret)
		return ret;
	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
			      sizeof(tx->hdr) + datalen, req->ahg_idx,
			      0, NULL, 0, user_sdma_txreq_cb);
	if (ret)
		return ret;
	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
	if (ret)
		sdma_txclean(pq->dd, &tx->txreq);
	return ret;
}

static int user_sdma_txadd(struct user_sdma_request *req,
			   struct user_sdma_txreq *tx,
			   struct user_sdma_iovec *iovec, u32 datalen,
			   u32 *queued_ptr, u32 *data_sent_ptr,
			   u64 *iov_offset_ptr)
{
	int ret;
	unsigned int pageidx, len;
	unsigned long base, offset;
	u64 iov_offset = *iov_offset_ptr;
	u32 queued = *queued_ptr, data_sent = *data_sent_ptr;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	base = (unsigned long)iovec->iov.iov_base;
	offset = offset_in_page(base + iovec->offset + iov_offset);
	pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
		   PAGE_SHIFT);
	len = offset + req->info.fragsize > PAGE_SIZE ?
		PAGE_SIZE - offset : req->info.fragsize;
	len = min((datalen - queued), len);
	ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
			      offset, len);
	if (ret) {
		SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
		return ret;
	}
	iov_offset += len;
	queued += len;
	data_sent += len;
	if (unlikely(queued < datalen && pageidx == iovec->npages &&
		     req->iov_idx < req->data_iovs - 1)) {
		iovec->offset += iov_offset;
		iovec = &req->iovs[++req->iov_idx];
		iov_offset = 0;
	}

	*queued_ptr = queued;
	*data_sent_ptr = data_sent;
	*iov_offset_ptr = iov_offset;
	return ret;
}

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
{
	int ret = 0;
	u16 count;
	unsigned npkts = 0;
	struct user_sdma_txreq *tx = NULL;
	struct hfi1_user_sdma_pkt_q *pq = NULL;
	struct user_sdma_iovec *iovec = NULL;

	if (!req->pq)
		return -EINVAL;

	pq = req->pq;

	/* If tx completion has reported an error, we are done. */
	if (READ_ONCE(req->has_error))
		return -EFAULT;

	/*
	 * Check if we might have sent the entire request already
	 */
	if (unlikely(req->seqnum == req->info.npkts)) {
		if (!list_empty(&req->txps))
			goto dosend;
		return ret;
	}

	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
		maxpkts = req->info.npkts - req->seqnum;

	while (npkts < maxpkts) {
		u32 datalen = 0, queued = 0, data_sent = 0;
		u64 iov_offset = 0;

		/*
		 * Check whether any of the completions have come back
		 * with errors. If so, we are not going to process any
		 * more packets from this request.
		 */
		if (READ_ONCE(req->has_error))
			return -EFAULT;

		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
		if (!tx)
			return -ENOMEM;

		tx->flags = 0;
		tx->req = req;
		INIT_LIST_HEAD(&tx->list);

		/*
		 * For the last packet set the ACK request
		 * and disable header suppression.
		 */
		if (req->seqnum == req->info.npkts - 1)
			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
				      TXREQ_FLAGS_REQ_DISABLE_SH);

		/*
		 * Calculate the payload size - this is min of the fragment
		 * (MTU) size or the remaining bytes in the request but only
		 * if we have payload data.
		 */
		if (req->data_len) {
			iovec = &req->iovs[req->iov_idx];
			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
				if (++req->iov_idx == req->data_iovs) {
					ret = -EFAULT;
					goto free_tx;
				}
				iovec = &req->iovs[req->iov_idx];
				WARN_ON(iovec->offset);
			}

			datalen = compute_data_length(req, tx);

			/*
			 * Disable header suppression for payloads <= 8DWS.
			 * If there is an uncorrectable error in the receive
			 * data FIFO when the received payload size is less
			 * than or equal to 8DWS, the RxDmaDataFifoRdUncErr is
			 * not reported. Instead, RHF.EccErr is set, but only
			 * if the header is not suppressed.
			 */
			if (!datalen) {
				SDMA_DBG(req,
					 "Request has data but pkt len is 0");
				ret = -EFAULT;
				goto free_tx;
			} else if (datalen <= 32) {
				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
			}
		}

		if (req->ahg_idx >= 0) {
			if (!req->seqnum) {
				ret = user_sdma_txadd_ahg(req, tx, datalen);
				if (ret)
					goto free_tx;
			} else {
				int changes;

				changes = set_txreq_header_ahg(req, tx,
							       datalen);
				if (changes < 0) {
					ret = changes;
					goto free_tx;
				}
			}
		} else {
			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
					  datalen, user_sdma_txreq_cb);
			if (ret)
				goto free_tx;
			/*
			 * Modify the header for this packet. This only needs
			 * to be done if we are not going to use AHG. Otherwise,
			 * the HW will do it based on the changes we gave it
			 * during sdma_txinit_ahg().
			 */
			ret = set_txreq_header(req, tx, datalen);
			if (ret)
				goto free_txreq;
		}

		/*
		 * If the request contains any data vectors, add up to
		 * fragsize bytes to the descriptor.
		 */
		while (queued < datalen &&
		       (req->sent + data_sent) < req->data_len) {
			ret = user_sdma_txadd(req, tx, iovec, datalen,
					      &queued, &data_sent, &iov_offset);
			if (ret)
				goto free_txreq;
		}
		/*
		 * The txreq was submitted successfully so we can update
		 * the counters.
		 */
		req->koffset += datalen;
		if (req_opcode(req->info.ctrl) == EXPECTED)
			req->tidoffset += datalen;
		req->sent += data_sent;
		if (req->data_len)
			iovec->offset += iov_offset;
		list_add_tail(&tx->txreq.list, &req->txps);
		/*
		 * It is important to increment this here as it is used to
		 * generate the BTH.PSN and, therefore, can't be bulk-updated
		 * outside of the loop.
		 */
		tx->seqnum = req->seqnum++;
		npkts++;
	}
dosend:
	ret = sdma_send_txlist(req->sde,
			       iowait_get_ib_work(&pq->busy),
			       &req->txps, &count);
	req->seqsubmitted += count;
	if (req->seqsubmitted == req->info.npkts) {
		/*
		 * The txreq has already been submitted to the HW queue
		 * so we can free the AHG entry now. Corruption will not
		 * happen due to the sequential manner in which
		 * descriptors are processed.
		 */
		if (req->ahg_idx >= 0)
			sdma_ahg_free(req->sde, req->ahg_idx);
	}
	return ret;

free_txreq:
	sdma_txclean(pq->dd, &tx->txreq);
free_tx:
	kmem_cache_free(pq->txreq_cache, tx);
	return ret;
}

static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{
	struct evict_data evict_data;

	evict_data.cleared = 0;
	evict_data.target = npages;
	hfi1_mmu_rb_evict(pq->handler, &evict_data);
	return evict_data.cleared;
}

static int pin_sdma_pages(struct user_sdma_request *req,
			  struct user_sdma_iovec *iovec,
			  struct sdma_mmu_node *node,
			  int npages)
{
	int pinned, cleared;
	struct page **pages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;
	memcpy(pages, node->pages, node->npages * sizeof(*pages));

	npages -= node->npages;
retry:
	if (!hfi1_can_pin_pages(pq->dd, current->mm,
				atomic_read(&pq->n_locked), npages)) {
		cleared = sdma_cache_evict(pq, npages);
		if (cleared >= npages)
			goto retry;
	}
	pinned = hfi1_acquire_user_pages(current->mm,
					 ((unsigned long)iovec->iov.iov_base +
					  (node->npages * PAGE_SIZE)), npages, 0,
					 pages + node->npages);
	if (pinned < 0) {
		kfree(pages);
		return pinned;
	}
	if (pinned != npages) {
		unpin_vector_pages(current->mm, pages, node->npages, pinned);
		return -EFAULT;
	}
	kfree(node->pages);
	node->rb.len = iovec->iov.iov_len;
	node->pages = pages;
	atomic_add(pinned, &pq->n_locked);
	return pinned;
}

static void unpin_sdma_pages(struct sdma_mmu_node *node)
{
	if (node->npages) {
		unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0,
				   node->npages);
		atomic_sub(node->npages, &node->pq->n_locked);
	}
}

static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec)
{
	int ret = 0, pinned, npages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct sdma_mmu_node *node = NULL;
	struct mmu_rb_node *rb_node;
	struct iovec *iov;
	bool extracted;

	extracted =
		hfi1_mmu_rb_remove_unless_exact(pq->handler,
						(unsigned long)
						iovec->iov.iov_base,
						iovec->iov.iov_len, &rb_node);
	if (rb_node) {
		node = container_of(rb_node, struct sdma_mmu_node, rb);
		if (!extracted) {
			atomic_inc(&node->refcount);
			iovec->pages = node->pages;
			iovec->npages = node->npages;
			iovec->node = node;
			return 0;
		}
	}

	if (!node) {
		node = kzalloc(sizeof(*node), GFP_KERNEL);
		if (!node)
			return -ENOMEM;

		node->rb.addr = (unsigned long)iovec->iov.iov_base;
		node->pq = pq;
		atomic_set(&node->refcount, 0);
	}

	iov = &iovec->iov;
	npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
	if (node->npages < npages) {
		pinned = pin_sdma_pages(req, iovec, node, npages);
		if (pinned < 0) {
			ret = pinned;
			goto bail;
		}
		node->npages += pinned;
		npages = node->npages;
	}
	iovec->pages = node->pages;
	iovec->npages = npages;
	iovec->node = node;

	ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
	if (ret) {
		iovec->node = NULL;
		goto bail;
	}
	return 0;
bail:
	unpin_sdma_pages(node);
	kfree(node);
	return ret;
}

static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages)
{
	hfi1_release_user_pages(mm, pages + start, npages, false);
	kfree(pages);
}

static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen)
{
	/*
	 * Perform safety checks for any type of packet:
	 * - transfer size is multiple of 64 bytes
	 * - packet length is multiple of 4 bytes
	 * - packet length is not larger than MTU size
	 *
	 * These checks are only done for the first packet of the
	 * transfer since the header is "given" to us by user space.
	 * For the remainder of the packets we compute the values.
	 */
	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
		return -EINVAL;

	if (req_opcode(req->info.ctrl) == EXPECTED) {
		/*
		 * The header is checked only on the first packet. Furthermore,
		 * we ensure that at least one TID entry is copied when the
		 * request is submitted. Therefore, we don't have to verify that
		 * tididx points to something sane.
		 */
		u32 tidval = req->tids[req->tididx],
			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
			tididx = EXP_TID_GET(tidval, IDX),
			tidctrl = EXP_TID_GET(tidval, CTRL),
			tidoff;
		__le32 kval = hdr->kdeth.ver_tid_offset;

		tidoff = KDETH_GET(kval, OFFSET) *
			 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			  KDETH_OM_LARGE : KDETH_OM_SMALL);
		/*
		 * Expected receive packets have the following
		 * additional checks:
		 * - offset is not larger than the TID size
		 * - TIDCtrl values match between header and TID array
		 * - TID indexes match between header and TID array
		 */
		if ((tidoff + datalen > tidlen) ||
		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
		    KDETH_GET(kval, TID) != tididx)
			return -EINVAL;
	}
	return 0;
}

/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
	u32 val = be32_to_cpu(bthpsn),
		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
			0xffffffull),
		psn = val & mask;
	if (expct)
		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
	else
		psn = psn + frags;
	return psn & mask;
}

static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &tx->hdr;
	u8 omfactor; /* KDETH.OM */
	u16 pbclen;
	int ret;
	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

	/* Copy the header template to the request before modification */
	memcpy(hdr, &req->hdr, sizeof(*hdr));

	/*
	 * Check if the PBC and LRH length are mismatched. If so
	 * adjust both in the header.
	 */
	pbclen = le16_to_cpu(hdr->pbc[0]);
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		hdr->pbc[0] = cpu_to_le16(pbclen);
		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
		/*
		 * Third packet
		 * This is the first packet in the sequence that has
		 * a "static" size that can be used for the rest of
		 * the packets (besides the last one).
		 */
		if (unlikely(req->seqnum == 2)) {
			/*
			 * From this point on the lengths in both the
			 * PBC and LRH are the same until the last
			 * packet.
			 * Adjust the template so we don't have to update
			 * every packet
			 */
			req->hdr.pbc[0] = hdr->pbc[0];
			req->hdr.lrh[2] = hdr->lrh[2];
		}
	}
	/*
	 * We only have to modify the header if this is not the
	 * first packet in the request. Otherwise, we use the
	 * header given to us.
	 */
	if (unlikely(!req->seqnum)) {
		ret = check_header_template(req, hdr, lrhlen, datalen);
		if (ret)
			return ret;
		goto done;
	}

	hdr->bth[2] = cpu_to_be32(
		set_pkt_bth_psn(hdr->bth[2],
				(req_opcode(req->info.ctrl) == EXPECTED),
				req->seqnum));

	/* Set ACK request on last packet */
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		hdr->bth[2] |= cpu_to_be32(1UL << 31);

	/* Set the new offset */
	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
	/* Expected packets have to fill in the new TID information */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		tidval = req->tids[req->tididx];
		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs, all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
			KDETH_OM_SMALL_SHIFT;
		/* Set KDETH.TIDCtrl based on value for this TID. */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
			  EXP_TID_GET(tidval, CTRL));
		/* Set KDETH.TID based on value for this TID */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
			  EXP_TID_GET(tidval, IDX));
		/* Clear KDETH.SH when DISABLE_SH flag is set */
		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
		/*
		 * Set the KDETH.OFFSET and KDETH.OM based on size of
		 * transfer.
		 */
		trace_hfi1_sdma_user_tid_info(
			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
			req->tidoffset, req->tidoffset >> omfactor,
			omfactor != KDETH_OM_SMALL_SHIFT);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
			  req->tidoffset >> omfactor);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
			  omfactor != KDETH_OM_SMALL_SHIFT);
	}
done:
	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
				    req->info.comp_idx, hdr, tidval);
	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}

static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 datalen)
{
	u32 ahg[AHG_KDETH_ARRAY_SIZE];
	int idx = 0;
	u8 omfactor; /* KDETH.OM */
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &req->hdr;
	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
	size_t array_size = ARRAY_SIZE(ahg);

	if (PBC2LRH(pbclen) != lrhlen) {
		/* PBC.PbcLengthDWs */
		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
		if (idx < 0)
			return idx;
		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
				     (__force u16)cpu_to_be16(lrhlen >> 2));
		if (idx < 0)
			return idx;
	}

	/*
	 * Do the common updates
	 */
	/* BTH.PSN and BTH.A */
	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		val32 |= 1UL << 31;
	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
			     (__force u16)cpu_to_be16(val32 >> 16));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
			     (__force u16)cpu_to_be16(val32 & 0xffff));
	if (idx < 0)
		return idx;
	/* KDETH.Offset */
	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
			     (__force u16)cpu_to_le16(req->koffset >> 16));
	if (idx < 0)
		return idx;
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		__le16 val;

		tidval = req->tids[req->tididx];

		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs, all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx])
				return -EINVAL;
			tidval = req->tids[req->tididx];
		}
		omfactor = ((EXP_TID_GET(tidval, LEN) *
			     PAGE_SIZE) >=
			    KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
			    KDETH_OM_SMALL_SHIFT;
		/* KDETH.OM and KDETH.OFFSET (TID) */
		idx = ahg_header_set(
			ahg, idx, array_size, 7, 0, 16,
			((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
			 ((req->tidoffset >> omfactor)
			  & 0x7fff)));
		if (idx < 0)
			return idx;
		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
				  (EXP_TID_GET(tidval, IDX) & 0x3ff));

		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		} else {
			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		}

		idx = ahg_header_set(ahg, idx, array_size,
				     7, 16, 14, (__force u16)val);
		if (idx < 0)
			return idx;
	}

	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
					req->info.comp_idx, req->sde->this_idx,
					req->ahg_idx, ahg, idx, tidval);
	sdma_txinit_ahg(&tx->txreq,
			SDMA_TXREQ_F_USE_AHG,
			datalen, req->ahg_idx, idx,
			ahg, sizeof(req->hdr),
			user_sdma_txreq_cb);

	return idx;
}

/**
 * user_sdma_txreq_cb() - SDMA tx request completion callback.
 * @txreq: valid sdma tx request
 * @status: success/failure of request
 *
 * Called when the SDMA progress state machine gets notification that
 * the SDMA descriptors for this tx request have been processed by the
 * DMA engine. Called in interrupt context.
 * Only do work on completed sequences.
 */
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);
	struct user_sdma_request *req;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	enum hfi1_sdma_comp_state state = COMPLETE;

	if (!tx->req)
		return;

	req = tx->req;
	pq = req->pq;
	cq = req->cq;

	if (status != SDMA_TXREQ_S_OK) {
		SDMA_DBG(req, "SDMA completion with error %d",
			 status);
		WRITE_ONCE(req->has_error, 1);
		state = ERROR;
	}

	req->seqcomp = tx->seqnum;
	kmem_cache_free(pq->txreq_cache, tx);

	/* sequence isn't complete?  We are done */
	if (req->seqcomp != req->info.npkts - 1)
		return;

	user_sdma_free_request(req, false);
	set_comp_state(pq, cq, req->info.comp_idx, state, status);
	pq_update(pq);
}

static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
{
	if (atomic_dec_and_test(&pq->n_reqs))
		wake_up(&pq->wait);
}

static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
{
	int i;

	if (!list_empty(&req->txps)) {
		struct sdma_txreq *t, *p;

		list_for_each_entry_safe(t, p, &req->txps, list) {
			struct user_sdma_txreq *tx =
				container_of(t, struct user_sdma_txreq, txreq);
			list_del_init(&t->list);
			sdma_txclean(req->pq->dd, t);
			kmem_cache_free(req->pq->txreq_cache, tx);
		}
	}

	for (i = 0; i < req->data_iovs; i++) {
		struct sdma_mmu_node *node = req->iovs[i].node;

		if (!node)
			continue;

		req->iovs[i].node = NULL;

		if (unpin)
			hfi1_mmu_rb_remove(req->pq->handler,
					   &node->rb);
		else
			atomic_dec(&node->refcount);
	}

	kfree(req->tids);
	clear_bit(req->info.comp_idx, req->pq->req_in_use);
}

static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret)
{
	if (state == ERROR)
		cq->comps[idx].errcode = -ret;
	smp_wmb(); /* make sure errcode is visible first */
	cq->comps[idx].status = state;
	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
					idx, state, ret);
}

static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len)
{
	return (bool)(node->addr == addr);
}

static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	atomic_inc(&node->refcount);
	return 0;
}

/*
 * Return 1 to remove the node from the rb tree and call the remove op.
 *
 * Called with the rb tree lock held.
 */
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *evict_arg, bool *stop)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);
	struct evict_data *evict_data = evict_arg;

	/* is this node still being used? */
	if (atomic_read(&node->refcount))
		return 0; /* keep this node */

	/* this node will be evicted, add its pages to our count */
	evict_data->cleared += node->npages;

	/* have enough pages been cleared? */
	if (evict_data->cleared >= evict_data->target)
		*stop = true;

	return 1; /* remove this node */
}

static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	unpin_sdma_pages(node);
	kfree(node);
}

static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	if (!atomic_read(&node->refcount))
		return 1;
	return 0;
}