/*
 * Copyright(c) 2015, 2016 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>

#include "hfi.h"
#include "sdma.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"
#include "mmu_rb.h"

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

/* The maximum number of Data io vectors per message/request */
#define MAX_VECTORS_PER_REQ 8
/*
 * Maximum number of packets to send from each message/request
 * before moving to the next one.
 */
#define MAX_PKTS_PER_QUEUE 16

#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT))

#define req_opcode(x) \
	(((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
#define req_version(x) \
	(((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
#define req_iovcnt(x) \
	(((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK)

/* Number of BTH.PSN bits used for sequence number in expected rcvs */
#define BTH_SEQ_MASK 0x7ffull

/*
 * Define fields in the KDETH header so we can update the header
 * template.
 */
#define KDETH_OFFSET_SHIFT        0
#define KDETH_OFFSET_MASK         0x7fff
#define KDETH_OM_SHIFT            15
#define KDETH_OM_MASK             0x1
#define KDETH_TID_SHIFT           16
#define KDETH_TID_MASK            0x3ff
#define KDETH_TIDCTRL_SHIFT       26
#define KDETH_TIDCTRL_MASK        0x3
#define KDETH_INTR_SHIFT          28
#define KDETH_INTR_MASK           0x1
#define KDETH_SH_SHIFT            29
#define KDETH_SH_MASK             0x1
#define KDETH_HCRC_UPPER_SHIFT    16
#define KDETH_HCRC_UPPER_MASK     0xff
#define KDETH_HCRC_LOWER_SHIFT    24
#define KDETH_HCRC_LOWER_MASK     0xff

#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)
#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)

#define KDETH_GET(val, field)						\
	(((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK)
#define KDETH_SET(dw, field, val) do {					\
		u32 dwval = le32_to_cpu(dw);				\
		dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \
		dwval |= (((val) & KDETH_##field##_MASK) <<		\
			  KDETH_##field##_SHIFT);			\
		dw = cpu_to_le32(dwval);				\
	} while (0)

#define AHG_HEADER_SET(arr, idx, dw, bit, width, value)			\
	do {								\
		if ((idx) < ARRAY_SIZE((arr)))				\
			(arr)[(idx++)] = sdma_build_ahg_descriptor(	\
				(__force u16)(value), (dw), (bit),	\
				(width));				\
		else							\
			return -ERANGE;					\
	} while (0)

/* KDETH OM multipliers and switch over point */
#define KDETH_OM_SMALL     4
#define KDETH_OM_LARGE     64
#define KDETH_OM_MAX_SIZE  (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))

/* Last packet in the request */
#define TXREQ_FLAGS_REQ_LAST_PKT BIT(0)

#define SDMA_REQ_IN_USE     0
#define SDMA_REQ_FOR_THREAD 1
#define SDMA_REQ_SEND_DONE  2
#define SDMA_REQ_HAVE_AHG   3
#define SDMA_REQ_HAS_ERROR  4
#define SDMA_REQ_DONE_ERROR 5

#define SDMA_PKT_Q_INACTIVE BIT(0)
#define SDMA_PKT_Q_ACTIVE   BIT(1)
#define SDMA_PKT_Q_DEFERRED BIT(2)

/*
 * Maximum retry attempts to submit a TX request
 * before putting the process to sleep.
 */
#define MAX_DEFER_RETRY_COUNT 1

static unsigned initial_pkt_count = 8;

#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */

struct sdma_mmu_node;

struct user_sdma_iovec {
	struct list_head list;
	struct iovec iov;
	/* number of pages in this vector */
	unsigned npages;
	/* array of pinned pages for this vector */
	struct page **pages;
	/*
	 * offset into the virtual address space of the vector at
	 * which we last left off.
	 */
	u64 offset;
	struct sdma_mmu_node *node;
};

#define SDMA_CACHE_NODE_EVICT 0

struct sdma_mmu_node {
	struct mmu_rb_node rb;
	struct list_head list;
	struct hfi1_user_sdma_pkt_q *pq;
	atomic_t refcount;
	struct page **pages;
	unsigned npages;
	unsigned long flags;
};

struct user_sdma_request {
	struct sdma_req_info info;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	/* This is the original header from user space */
	struct hfi1_pkt_header hdr;
	/*
	 * Pointer to the SDMA engine for this request.
	 * Since different requests could be on different VLs,
	 * each request will need its own engine pointer.
	 */
	struct sdma_engine *sde;
	u8 ahg_idx;
	u32 ahg[9];
	/*
	 * KDETH.Offset (Eager) field
	 * We need to remember the initial value so the headers
	 * can be updated properly.
	 */
	u32 koffset;
	/*
	 * KDETH.OFFSET (TID) field
	 * The offset can cover multiple packets, depending on the
	 * size of the TID entry.
	 */
	u32 tidoffset;
	/*
	 * KDETH.OM
	 * Remember this because the header template always sets it
	 * to 0.
	 */
	u8 omfactor;
	/*
	 * We copy the iovs for this request (based on
	 * info.iovcnt). These are only the data vectors.
	 */
	unsigned data_iovs;
	/* total length of the data in the request */
	u32 data_len;
	/* progress index moving along the iovs array */
	unsigned iov_idx;
	struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
	/* number of elements copied to the tids array */
	u16 n_tids;
	/* TID array values copied from the tid_iov vector */
	u32 *tids;
	u16 tididx;
	u32 sent;
	u64 seqnum;
	u64 seqcomp;
	u64 seqsubmitted;
	struct list_head txps;
	unsigned long flags;
	/* status of the last txreq completed */
	int status;
};

/*
 * A single txreq could span up to 3 physical pages when the MTU
 * is sufficiently large (> 4K). Each of the IOV pointers also
 * needs its own set of flags so the vectors can be handled
 * independently of each other.
 */
struct user_sdma_txreq {
	/* Packet header for the txreq */
	struct hfi1_pkt_header hdr;
	struct sdma_txreq txreq;
	struct list_head list;
	struct user_sdma_request *req;
	u16 flags;
	unsigned busycount;
	u64 seqnum;
};

#define SDMA_DBG(req, fmt, ...)					     \
	hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit,   \
		 (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \
		 ##__VA_ARGS__)
#define SDMA_Q_DBG(pq, fmt, ...)				     \
	hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \
		 (pq)->subctxt, ##__VA_ARGS__)

static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
static int num_user_pages(const struct iovec *);
static void user_sdma_txreq_cb(struct sdma_txreq *, int);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *);
static void user_sdma_free_request(struct user_sdma_request *, bool);
static int pin_vector_pages(struct user_sdma_request *,
			    struct user_sdma_iovec *);
static void unpin_vector_pages(struct mm_struct *, struct page **, unsigned,
			       unsigned);
static int check_header_template(struct user_sdma_request *,
				 struct hfi1_pkt_header *, u32, u32);
static int set_txreq_header(struct user_sdma_request *,
			    struct user_sdma_txreq *, u32);
static int set_txreq_header_ahg(struct user_sdma_request *,
				struct user_sdma_txreq *, u32);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *,
				  struct hfi1_user_sdma_comp_q *,
				  u16, enum hfi1_sdma_comp_state, int);
static inline u32 set_pkt_bth_psn(__be32, u8, u32);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
	struct sdma_engine *,
	struct iowait *,
	struct sdma_txreq *,
	unsigned seq);
static void activate_packet_queue(struct iowait *, int);
static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long);
static int sdma_rb_insert(struct rb_root *, struct mmu_rb_node *);
static void sdma_rb_remove(struct rb_root *, struct mmu_rb_node *,
			   struct mm_struct *);
static int sdma_rb_invalidate(struct rb_root *, struct mmu_rb_node *);

static struct mmu_rb_ops sdma_rb_ops = {
	.filter = sdma_rb_filter,
	.insert = sdma_rb_insert,
	.remove = sdma_rb_remove,
	.invalidate = sdma_rb_invalidate
};

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait *wait,
	struct sdma_txreq *txreq,
	unsigned seq)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
	struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);

	if (sdma_progress(sde, seq, txreq)) {
		if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
			goto eagain;
	}
	/*
	 * We are assuming that if the list is enqueued somewhere, it
	 * is to the dmawait list since that is the only place where
	 * it is supposed to be enqueued.
	 */
	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
	write_seqlock(&dev->iowait_lock);
	if (list_empty(&pq->busy.list))
		list_add_tail(&pq->busy.list, &sde->dmawait);
	write_sequnlock(&dev->iowait_lock);
	return -EBUSY;
eagain:
	return -EAGAIN;
}

static void activate_packet_queue(struct iowait *wait, int reason)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
	wake_up(&wait->wait_dma);
};

static void sdma_kmem_cache_ctor(void *obj)
{
	struct user_sdma_txreq *tx = obj;

	memset(tx, 0, sizeof(*tx));
}

int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
{
	struct hfi1_filedata *fd;
	int ret = 0;
	unsigned memsize;
	char buf[64];
	struct hfi1_devdata *dd;
	struct hfi1_user_sdma_comp_q *cq;
	struct hfi1_user_sdma_pkt_q *pq;
	unsigned long flags;

	if (!uctxt || !fp) {
		ret = -EBADF;
		goto done;
	}

	fd = fp->private_data;

	if (!hfi1_sdma_comp_ring_size) {
		ret = -EINVAL;
		goto done;
	}

	dd = uctxt->dd;

	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
	if (!pq)
		goto pq_nomem;

	memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size;
	pq->reqs = kzalloc(memsize, GFP_KERNEL);
	if (!pq->reqs)
		goto pq_reqs_nomem;

	INIT_LIST_HEAD(&pq->list);
	pq->dd = dd;
	pq->ctxt = uctxt->ctxt;
	pq->subctxt = fd->subctxt;
	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
	pq->state = SDMA_PKT_Q_INACTIVE;
	atomic_set(&pq->n_reqs, 0);
	init_waitqueue_head(&pq->wait);
	pq->sdma_rb_root = RB_ROOT;
	INIT_LIST_HEAD(&pq->evict);
	spin_lock_init(&pq->evict_lock);

	iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
		    activate_packet_queue, NULL);
	pq->reqidx = 0;
	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
		 fd->subctxt);
	pq->txreq_cache = kmem_cache_create(buf,
					    sizeof(struct user_sdma_txreq),
					    L1_CACHE_BYTES,
					    SLAB_HWCACHE_ALIGN,
					    sdma_kmem_cache_ctor);
	if (!pq->txreq_cache) {
		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
			   uctxt->ctxt);
		goto pq_txreq_nomem;
	}
	fd->pq = pq;
	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		goto cq_nomem;

	memsize = PAGE_ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size);
	cq->comps = vmalloc_user(memsize);
	if (!cq->comps)
		goto cq_comps_nomem;

	cq->nentries = hfi1_sdma_comp_ring_size;
	fd->cq = cq;

	ret = hfi1_mmu_rb_register(&pq->sdma_rb_root, &sdma_rb_ops);
	if (ret) {
		dd_dev_err(dd, "Failed to register with MMU %d", ret);
		goto done;
	}

	spin_lock_irqsave(&uctxt->sdma_qlock, flags);
	list_add(&pq->list, &uctxt->sdma_queues);
	spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
	goto done;

cq_comps_nomem:
	kfree(cq);
cq_nomem:
	kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
	kfree(pq->reqs);
pq_reqs_nomem:
	kfree(pq);
	fd->pq = NULL;
pq_nomem:
	ret = -ENOMEM;
done:
	return ret;
}

int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq;
	unsigned long flags;

	hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
		  uctxt->ctxt, fd->subctxt);
	pq = fd->pq;
	if (pq) {
		/* Only unregister once we know pq is non-NULL. */
		hfi1_mmu_rb_unregister(&pq->sdma_rb_root);
		spin_lock_irqsave(&uctxt->sdma_qlock, flags);
		if (!list_empty(&pq->list))
			list_del_init(&pq->list);
		spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
		iowait_sdma_drain(&pq->busy);
		/* Wait until all requests have been freed. */
		wait_event_interruptible(
			pq->wait,
			(ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
		kfree(pq->reqs);
		kmem_cache_destroy(pq->txreq_cache);
		kfree(pq);
		fd->pq = NULL;
	}
	if (fd->cq) {
		vfree(fd->cq->comps);
		kfree(fd->cq);
		fd->cq = NULL;
	}
	return 0;
}

int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
				   unsigned long dim, unsigned long *count)
{
	int ret = 0, i = 0;
	struct hfi1_filedata *fd = fp->private_data;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq = fd->pq;
	struct hfi1_user_sdma_comp_q *cq = fd->cq;
	struct hfi1_devdata *dd = pq->dd;
	unsigned long idx = 0;
	u8 pcount = initial_pkt_count;
	struct sdma_req_info info;
	struct user_sdma_request *req;
	u8 opcode, sc, vl;
	int req_queued = 0;

	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
		hfi1_cdbg(
		   SDMA,
		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
		   dd->unit, uctxt->ctxt, fd->subctxt,
		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
		return -EINVAL;
	}
	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
	if (ret) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
		return -EFAULT;
	}

	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
				     (u16 *)&info);
	if (cq->comps[info.comp_idx].status == QUEUED ||
	    test_bit(SDMA_REQ_IN_USE, &pq->reqs[info.comp_idx].flags)) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state",
			  dd->unit, uctxt->ctxt, fd->subctxt,
			  info.comp_idx);
		return -EBADSLT;
	}
	if (!info.fragsize) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Request does not specify fragsize",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}
	/*
	 * We've done all the safety checks that we can up to this point,
	 * "allocate" the request entry.
	 */
	hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
		  uctxt->ctxt, fd->subctxt, info.comp_idx);
	req = pq->reqs + info.comp_idx;
	memset(req, 0, sizeof(*req));
	/* Mark the request as IN_USE before we start filling it in. */
	set_bit(SDMA_REQ_IN_USE, &req->flags);
	req->data_iovs = req_iovcnt(info.ctrl) - 1;
	req->pq = pq;
	req->cq = cq;
	req->status = -1;
	INIT_LIST_HEAD(&req->txps);

	memcpy(&req->info, &info, sizeof(info));

	if (req_opcode(info.ctrl) == EXPECTED)
		req->data_iovs--;

	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
			 MAX_VECTORS_PER_REQ);
		return -EINVAL;
	}
	/* Copy the header from the user buffer */
	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
			     sizeof(req->hdr));
	if (ret) {
		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
		ret = -EFAULT;
		goto free_req;
	}

	/* If Static rate control is not enabled, sanitize the header. */
	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
		req->hdr.pbc[2] = 0;

	/* Validate the opcode. Do not trust packets from user space blindly. */
	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
	if ((opcode & USER_OPCODE_CHECK_MASK) !=
	     USER_OPCODE_CHECK_VAL) {
		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
		ret = -EINVAL;
		goto free_req;
	}
	/*
	 * Validate the vl. Do not trust packets from user space blindly.
	 * VL comes from PBC, SC comes from LRH, and the VL needs to
	 * match the SC look up.
	 */
	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
	if (vl >= dd->pport->vls_operational ||
	    vl != sc_to_vlt(dd, sc)) {
		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
		ret = -EINVAL;
		goto free_req;
	}

	/* Checking P_KEY for requests from user-space */
	if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc,
			      PKEY_CHECK_INVALID)) {
		ret = -EINVAL;
		goto free_req;
	}

	/*
	 * Also should check the BTH.lnh. If it says the next header is GRH then
	 * the RXE parsing will be off and will land in the middle of the KDETH
	 * or miss it entirely.
	 */
	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
		SDMA_DBG(req, "User tried to pass in a GRH");
		ret = -EINVAL;
		goto free_req;
	}

	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
	/*
	 * Calculate the initial TID offset based on the values of
	 * KDETH.OFFSET and KDETH.OM that are passed in.
	 */
	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
		(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
		 KDETH_OM_LARGE : KDETH_OM_SMALL);
	SDMA_DBG(req, "Initial TID offset %u", req->tidoffset);
	idx++;

	/* Save all the IO vector structures */
	while (i < req->data_iovs) {
		INIT_LIST_HEAD(&req->iovs[i].list);
		memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
		ret = pin_vector_pages(req, &req->iovs[i]);
		if (ret) {
			req->status = ret;
			goto free_req;
		}
		req->data_len += req->iovs[i++].iov.iov_len;
	}
	SDMA_DBG(req, "total data length %u", req->data_len);

	if (pcount > req->info.npkts)
		pcount = req->info.npkts;
	/*
	 * Copy any TID info
	 * User space will provide the TID info only when the
	 * request type is EXPECTED. This is true even if there is
	 * only one packet in the request and the header is already
	 * setup. The reason for the singular TID case is that the
	 * driver needs to perform safety checks.
	 */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);

		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
			ret = -EINVAL;
			goto free_req;
		}
		req->tids = kcalloc(ntids, sizeof(*req->tids), GFP_KERNEL);
		if (!req->tids) {
			ret = -ENOMEM;
			goto free_req;
		}
		/*
		 * We have to copy all of the tids because they may vary
		 * in size and, therefore, the TID count might not be
		 * equal to the pkt count. However, there is no way to
		 * tell at this point.
		 */
		ret = copy_from_user(req->tids, iovec[idx].iov_base,
				     ntids * sizeof(*req->tids));
		if (ret) {
			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				 ntids, ret);
			ret = -EFAULT;
			goto free_req;
		}
		req->n_tids = ntids;
		idx++;
	}

	/* Have to select the engine */
	req->sde = sdma_select_engine_vl(dd,
					 (u32)(uctxt->ctxt + fd->subctxt),
					 vl);
	if (!req->sde || !sdma_running(req->sde)) {
		ret = -ECOMM;
		goto free_req;
	}

	/* We don't need an AHG entry if the request contains only one packet */
	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) {
		int ahg = sdma_ahg_alloc(req->sde);

		if (likely(ahg >= 0)) {
			req->ahg_idx = (u8)ahg;
			set_bit(SDMA_REQ_HAVE_AHG, &req->flags);
		}
	}

	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
	atomic_inc(&pq->n_reqs);
	req_queued = 1;
	/* Send the first N packets in the request to buy us some time */
	ret = user_sdma_send_pkts(req, pcount);
	if (unlikely(ret < 0 && ret != -EBUSY)) {
		req->status = ret;
		goto free_req;
	}

	/*
	 * It is possible that the SDMA engine would have processed all the
	 * submitted packets by the time we get here. Therefore, only set
	 * packet queue state to ACTIVE if there are still uncompleted
	 * requests.
	 */
	if (atomic_read(&pq->n_reqs))
		xchg(&pq->state, SDMA_PKT_Q_ACTIVE);

	/*
	 * This is a somewhat blocking send implementation.
	 * The driver will block the caller until all packets of the
	 * request have been submitted to the SDMA engine. However, it
	 * will not wait for send completions.
	 */
	while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
		ret = user_sdma_send_pkts(req, pcount);
		if (ret < 0) {
			if (ret != -EBUSY) {
				req->status = ret;
				set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
				if (ACCESS_ONCE(req->seqcomp) ==
				    req->seqsubmitted - 1)
					goto free_req;
				return ret;
			}
			wait_event_interruptible_timeout(
				pq->busy.wait_dma,
				(pq->state == SDMA_PKT_Q_ACTIVE),
				msecs_to_jiffies(
					SDMA_IOWAIT_TIMEOUT));
		}
	}
	*count += idx;
	return 0;
free_req:
	user_sdma_free_request(req, true);
	if (req_queued)
		pq_update(pq);
	set_comp_state(pq, cq, info.comp_idx, ERROR, req->status);
	return ret;
}

static inline u32 compute_data_length(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx)
{
	/*
	 * Determine the proper size of the packet data.
	 * The size of the data of the first packet is in the header
	 * template. However, it includes the header and ICRC, which need
	 * to be subtracted.
	 * The size of the remaining packets is the minimum of the frag
	 * size (MTU) or remaining data in the request.
	 */
	u32 len;

	if (!req->seqnum) {
		len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
		       (sizeof(tx->hdr) - 4));
	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
			PAGE_SIZE;
		/*
		 * Get the data length based on the remaining space in the
		 * TID pair.
		 */
		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
		/* If we've filled up the TID pair, move to the next one. */
		if (unlikely(!len) && ++req->tididx < req->n_tids &&
		    req->tids[req->tididx]) {
			tidlen = EXP_TID_GET(req->tids[req->tididx],
					     LEN) * PAGE_SIZE;
			req->tidoffset = 0;
			len = min_t(u32, tidlen, req->info.fragsize);
		}
		/*
		 * Since the TID pairs map entire pages, make sure that we
		 * are not going to try to send more data than we have
		 * remaining.
		 */
		len = min(len, req->data_len - req->sent);
	} else {
		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
	}
	SDMA_DBG(req, "Data Length = %u", len);
	return len;
}

static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
}

static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
{
	int ret = 0;
	unsigned npkts = 0;
	struct user_sdma_txreq *tx = NULL;
	struct hfi1_user_sdma_pkt_q *pq = NULL;
	struct user_sdma_iovec *iovec = NULL;

	if (!req->pq)
		return -EINVAL;

	pq = req->pq;

	/* If tx completion has reported an error, we are done. */
	if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
		set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
		return -EFAULT;
	}

	/*
	 * Check if we might have sent the entire request already
	 */
	if (unlikely(req->seqnum == req->info.npkts)) {
		if (!list_empty(&req->txps))
			goto dosend;
		return ret;
	}

	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
		maxpkts = req->info.npkts - req->seqnum;

	while (npkts < maxpkts) {
		u32 datalen = 0, queued = 0, data_sent = 0;
		u64 iov_offset = 0;

		/*
		 * Check whether any of the completions have come back
		 * with errors. If so, we are not going to process any
		 * more packets from this request.
		 */
		if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
			set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
			return -EFAULT;
		}

		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
		if (!tx)
			return -ENOMEM;

		tx->flags = 0;
		tx->req = req;
		tx->busycount = 0;
		INIT_LIST_HEAD(&tx->list);

		if (req->seqnum == req->info.npkts - 1)
			tx->flags |= TXREQ_FLAGS_REQ_LAST_PKT;

		/*
		 * Calculate the payload size - this is min of the fragment
		 * (MTU) size or the remaining bytes in the request but only
		 * if we have payload data.
		 */
		if (req->data_len) {
			iovec = &req->iovs[req->iov_idx];
			if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
				if (++req->iov_idx == req->data_iovs) {
					ret = -EFAULT;
					goto free_txreq;
				}
				iovec = &req->iovs[req->iov_idx];
				WARN_ON(iovec->offset);
			}

			datalen = compute_data_length(req, tx);
			if (!datalen) {
				SDMA_DBG(req,
					 "Request has data but pkt len is 0");
				ret = -EFAULT;
				goto free_tx;
			}
		}

		if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) {
			if (!req->seqnum) {
				u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
				u32 lrhlen = get_lrh_len(req->hdr, datalen);
				/*
				 * Copy the request header into the tx header
				 * because the HW needs a cacheline-aligned
				 * address.
				 * This copy can be optimized out if the hdr
				 * member of user_sdma_request were also
				 * cacheline aligned.
				 */
				memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
				if (PBC2LRH(pbclen) != lrhlen) {
					pbclen = (pbclen & 0xf000) |
						LRH2PBC(lrhlen);
					tx->hdr.pbc[0] = cpu_to_le16(pbclen);
				}
				ret = sdma_txinit_ahg(&tx->txreq,
						      SDMA_TXREQ_F_AHG_COPY,
						      sizeof(tx->hdr) + datalen,
						      req->ahg_idx, 0, NULL, 0,
						      user_sdma_txreq_cb);
				if (ret)
					goto free_tx;
				ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq,
							&tx->hdr,
							sizeof(tx->hdr));
				if (ret)
					goto free_txreq;
			} else {
				int changes;

				changes = set_txreq_header_ahg(req, tx,
							       datalen);
				if (changes < 0)
					goto free_tx;
				sdma_txinit_ahg(&tx->txreq,
						SDMA_TXREQ_F_USE_AHG,
						datalen, req->ahg_idx, changes,
						req->ahg, sizeof(req->hdr),
						user_sdma_txreq_cb);
			}
		} else {
			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
					  datalen, user_sdma_txreq_cb);
			if (ret)
				goto free_tx;
			/*
			 * Modify the header for this packet. This only needs
			 * to be done if we are not going to use AHG. Otherwise,
			 * the HW will do it based on the changes we gave it
			 * during sdma_txinit_ahg().
			 */
			ret = set_txreq_header(req, tx, datalen);
			if (ret)
				goto free_txreq;
		}

		/*
		 * If the request contains any data vectors, add up to
		 * fragsize bytes to the descriptor.
		 */
		while (queued < datalen &&
		       (req->sent + data_sent) < req->data_len) {
			unsigned long base, offset;
			unsigned pageidx, len;

			base = (unsigned long)iovec->iov.iov_base;
			offset = offset_in_page(base + iovec->offset +
						iov_offset);
			pageidx = (((iovec->offset + iov_offset +
				     base) - (base & PAGE_MASK)) >> PAGE_SHIFT);
			len = offset + req->info.fragsize > PAGE_SIZE ?
				PAGE_SIZE - offset : req->info.fragsize;
			len = min((datalen - queued), len);
			ret = sdma_txadd_page(pq->dd, &tx->txreq,
					      iovec->pages[pageidx],
					      offset, len);
			if (ret) {
				SDMA_DBG(req, "SDMA txreq add page failed %d\n",
					 ret);
				goto free_txreq;
			}
			iov_offset += len;
			queued += len;
			data_sent += len;
			if (unlikely(queued < datalen &&
				     pageidx == iovec->npages &&
				     req->iov_idx < req->data_iovs - 1)) {
				iovec->offset += iov_offset;
				iovec = &req->iovs[++req->iov_idx];
				iov_offset = 0;
			}
		}
		/*
		 * The txreq was submitted successfully so we can update
		 * the counters.
		 */
		req->koffset += datalen;
		if (req_opcode(req->info.ctrl) == EXPECTED)
			req->tidoffset += datalen;
		req->sent += data_sent;
		if (req->data_len)
			iovec->offset += iov_offset;
		list_add_tail(&tx->txreq.list, &req->txps);
		/*
		 * It is important to increment this here as it is used to
		 * generate the BTH.PSN and, therefore, can't be bulk-updated
		 * outside of the loop.
		 */
		tx->seqnum = req->seqnum++;
		npkts++;
	}
dosend:
	ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps);
	if (list_empty(&req->txps)) {
		req->seqsubmitted = req->seqnum;
		if (req->seqnum == req->info.npkts) {
			set_bit(SDMA_REQ_SEND_DONE, &req->flags);
			/*
			 * The txreq has already been submitted to the HW queue
			 * so we can free the AHG entry now. Corruption will not
			 * happen due to the sequential manner in which
			 * descriptors are processed.
			 */
			if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags))
				sdma_ahg_free(req->sde, req->ahg_idx);
		}
	} else if (ret > 0) {
		req->seqsubmitted += ret;
		ret = 0;
	}
	return ret;

free_txreq:
	sdma_txclean(pq->dd, &tx->txreq);
free_tx:
	kmem_cache_free(pq->txreq_cache, tx);
	return ret;
}

/*
 * How many pages in this iovec element?
 */
static inline int num_user_pages(const struct iovec *iov)
{
	const unsigned long addr  = (unsigned long)iov->iov_base;
	const unsigned long len   = iov->iov_len;
	const unsigned long spage = addr & PAGE_MASK;
	const unsigned long epage = (addr + len - 1) & PAGE_MASK;

	return 1 + ((epage - spage) >> PAGE_SHIFT);
}

static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{
	u32 cleared = 0;
	struct sdma_mmu_node *node, *ptr;
	struct list_head to_evict = LIST_HEAD_INIT(to_evict);

	spin_lock(&pq->evict_lock);
	list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) {
		/* Make sure that no one is still using the node. */
		if (!atomic_read(&node->refcount)) {
			set_bit(SDMA_CACHE_NODE_EVICT, &node->flags);
			list_del_init(&node->list);
			list_add(&node->list, &to_evict);
			cleared += node->npages;
			if (cleared >= npages)
				break;
		}
	}
	spin_unlock(&pq->evict_lock);

	list_for_each_entry_safe(node, ptr, &to_evict, list)
		hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);

	return cleared;
}

static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec)
{
	int ret = 0, pinned, npages, cleared;
	struct page **pages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct sdma_mmu_node *node = NULL;
	struct mmu_rb_node *rb_node;

	rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root,
				      (unsigned long)iovec->iov.iov_base,
				      iovec->iov.iov_len);
	if (rb_node && !IS_ERR(rb_node))
		node = container_of(rb_node, struct sdma_mmu_node, rb);
	else
		rb_node = NULL;

	if (!node) {
		node = kzalloc(sizeof(*node), GFP_KERNEL);
		if (!node)
			return -ENOMEM;

		node->rb.addr = (unsigned long)iovec->iov.iov_base;
		node->pq = pq;
		atomic_set(&node->refcount, 0);
		INIT_LIST_HEAD(&node->list);
	}

	npages = num_user_pages(&iovec->iov);
	if (node->npages < npages) {
		pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
		if (!pages) {
			SDMA_DBG(req, "Failed page array alloc");
			ret = -ENOMEM;
			goto bail;
		}
		memcpy(pages, node->pages, node->npages * sizeof(*pages));

		npages -= node->npages;

		/*
		 * If rb_node is NULL, it means that this is a brand new node
		 * and, therefore, not on the eviction list.
		 * If, however, the rb_node is non-NULL, it means that the
		 * node is already in the RB tree and, therefore, on the
		 * eviction list (nodes are unconditionally inserted in the
		 * eviction list). In that case, we have to remove the node
		 * prior to calling the eviction function in order to prevent
		 * it from freeing this node.
		 */
		if (rb_node) {
			spin_lock(&pq->evict_lock);
			list_del_init(&node->list);
			spin_unlock(&pq->evict_lock);
		}
retry:
		if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) {
			cleared = sdma_cache_evict(pq, npages);
			if (cleared >= npages)
				goto retry;
		}
		pinned = hfi1_acquire_user_pages(
			((unsigned long)iovec->iov.iov_base +
			 (node->npages * PAGE_SIZE)), npages, 0,
			pages + node->npages);
		if (pinned < 0) {
			kfree(pages);
			ret = pinned;
			goto bail;
		}
		if (pinned != npages) {
			unpin_vector_pages(current->mm, pages, node->npages,
					   pinned);
			ret = -EFAULT;
			goto bail;
		}
		kfree(node->pages);
		node->rb.len = iovec->iov.iov_len;
		node->pages = pages;
		node->npages += pinned;
		npages = node->npages;
		spin_lock(&pq->evict_lock);
		list_add(&node->list, &pq->evict);
		pq->n_locked += pinned;
		spin_unlock(&pq->evict_lock);
	}
	iovec->pages = node->pages;
	iovec->npages = npages;
	iovec->node = node;

	ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
	if (ret) {
		spin_lock(&pq->evict_lock);
		if (!list_empty(&node->list))
			list_del(&node->list);
		pq->n_locked -= node->npages;
		spin_unlock(&pq->evict_lock);
		goto bail;
	}
	return 0;
bail:
	if (rb_node)
		unpin_vector_pages(current->mm, node->pages, 0, node->npages);
	kfree(node);
	return ret;
}

static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages)
{
	hfi1_release_user_pages(mm, pages + start, npages, 0);
	kfree(pages);
}

static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen)
{
	/*
	 * Perform safety checks for any type of packet:
	 * - transfer size is multiple of 64bytes
	 * - packet length is multiple of 4bytes
	 * - entire request length is multiple of 4bytes
	 * - packet length is not larger than MTU size
	 *
	 * These checks are only done for the first packet of the
	 * transfer since the header is "given" to us by user space.
	 * For the remainder of the packets we compute the values.
	 */
	if (req->info.fragsize % PIO_BLOCK_SIZE ||
	    lrhlen & 0x3 || req->data_len & 0x3 ||
	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
		return -EINVAL;

	if (req_opcode(req->info.ctrl) == EXPECTED) {
		/*
		 * The header is checked only on the first packet. Furthermore,
		 * we ensure that at least one TID entry is copied when the
		 * request is submitted. Therefore, we don't have to verify that
		 * tididx points to something sane.
		 */
		u32 tidval = req->tids[req->tididx],
			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
			tididx = EXP_TID_GET(tidval, IDX),
			tidctrl = EXP_TID_GET(tidval, CTRL),
			tidoff;
		__le32 kval = hdr->kdeth.ver_tid_offset;

		tidoff = KDETH_GET(kval, OFFSET) *
			(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			 KDETH_OM_LARGE : KDETH_OM_SMALL);
		/*
		 * Expected receive packets have the following
		 * additional checks:
		 * - offset is not larger than the TID size
		 * - TIDCtrl values match between header and TID array
		 * - TID indexes match between header and TID array
		 */
		if ((tidoff + datalen > tidlen) ||
		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
		    KDETH_GET(kval, TID) != tididx)
			return -EINVAL;
	}
	return 0;
}

/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
	u32 val = be32_to_cpu(bthpsn),
		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
			0xffffffull),
		psn = val & mask;
	if (expct)
		psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
	else
		psn = psn + frags;
	return psn & mask;
}

static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &tx->hdr;
	u16 pbclen;
	int ret;
	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen);

	/* Copy the header template to the request before modification */
	memcpy(hdr, &req->hdr, sizeof(*hdr));

	/*
	 * Check if the PBC and LRH length are mismatched. If so,
	 * adjust both in the header.
	 */
	pbclen = le16_to_cpu(hdr->pbc[0]);
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		hdr->pbc[0] = cpu_to_le16(pbclen);
		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
		/*
		 * Third packet
		 * This is the first packet in the sequence that has
		 * a "static" size that can be used for the rest of
		 * the packets (besides the last one).
		 */
		if (unlikely(req->seqnum == 2)) {
			/*
			 * From this point on the lengths in both the
			 * PBC and LRH are the same until the last
			 * packet.
			 * Adjust the template so we don't have to update
			 * every packet.
			 */
			req->hdr.pbc[0] = hdr->pbc[0];
			req->hdr.lrh[2] = hdr->lrh[2];
		}
	}
	/*
	 * We only have to modify the header if this is not the
	 * first packet in the request. Otherwise, we use the
	 * header given to us.
	 */
	if (unlikely(!req->seqnum)) {
		ret = check_header_template(req, hdr, lrhlen, datalen);
		if (ret)
			return ret;
		goto done;
	}

	hdr->bth[2] = cpu_to_be32(
		set_pkt_bth_psn(hdr->bth[2],
				(req_opcode(req->info.ctrl) == EXPECTED),
				req->seqnum));

	/* Set ACK request on last packet */
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
		hdr->bth[2] |= cpu_to_be32(1UL << 31);

	/* Set the new offset */
	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
	/* Expected packets have to fill in the new TID information */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		tidval = req->tids[req->tididx];
		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL;
		/* Set KDETH.TIDCtrl based on value for this TID. */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
			  EXP_TID_GET(tidval, CTRL));
		/* Set KDETH.TID based on value for this TID */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
			  EXP_TID_GET(tidval, IDX));
		/* Clear KDETH.SH only on the last packet */
		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
		/*
		 * Set the KDETH.OFFSET and KDETH.OM based on size of
		 * transfer.
		 */
		SDMA_DBG(req, "TID offset %ubytes %uunits om%u",
			 req->tidoffset, req->tidoffset / req->omfactor,
			 req->omfactor != KDETH_OM_SMALL);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
			  req->tidoffset / req->omfactor);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
			  req->omfactor != KDETH_OM_SMALL);
	}
done:
	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
				    req->info.comp_idx, hdr, tidval);
	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}

static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 len)
{
	int diff = 0;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &req->hdr;
	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len);

	if (PBC2LRH(pbclen) != lrhlen) {
		/* PBC.PbcLengthDWs */
		AHG_HEADER_SET(req->ahg, diff, 0, 0, 12,
			       cpu_to_le16(LRH2PBC(lrhlen)));
		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
		AHG_HEADER_SET(req->ahg, diff, 3, 0, 16,
			       cpu_to_be16(lrhlen >> 2));
	}

	/*
	 * Do the common updates
	 */
	/* BTH.PSN and BTH.A */
	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
		val32 |= 1UL << 31;
	AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
	AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
	/* KDETH.Offset */
	AHG_HEADER_SET(req->ahg, diff, 15, 0, 16,
		       cpu_to_le16(req->koffset & 0xffff));
	AHG_HEADER_SET(req->ahg, diff, 15, 16, 16,
		       cpu_to_le16(req->koffset >> 16));
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		__le16 val;

		tidval = req->tids[req->tididx];

		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		req->omfactor = ((EXP_TID_GET(tidval, LEN) *
				  PAGE_SIZE) >=
				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE :
				 KDETH_OM_SMALL;
		/* KDETH.OM and KDETH.OFFSET (TID) */
		AHG_HEADER_SET(req->ahg, diff, 7, 0, 16,
			       ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 |
				((req->tidoffset / req->omfactor) & 0x7fff)));
		/* KDETH.TIDCtrl, KDETH.TID */
		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
				  (EXP_TID_GET(tidval, IDX) & 0x3ff));
		/* Clear KDETH.SH on last packet */
		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) {
			val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset,
						     INTR) >> 16);
			val &= cpu_to_le16(~(1U << 13));
			AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
		} else {
			AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val);
		}
	}

	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
					req->info.comp_idx, req->sde->this_idx,
					req->ahg_idx, req->ahg, diff, tidval);
	return diff;
}

/*
 * SDMA tx request completion callback. Called when the SDMA progress
 * state machine gets notification that the SDMA descriptors for this
 * tx request have been processed by the DMA engine. Called in
 * interrupt context.
 */
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);
	struct user_sdma_request *req;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	u16 idx;

	if (!tx->req)
		return;

	req = tx->req;
	pq = req->pq;
	cq = req->cq;

	if (status != SDMA_TXREQ_S_OK) {
		SDMA_DBG(req, "SDMA completion with error %d",
			 status);
		set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
	}

	req->seqcomp = tx->seqnum;
	kmem_cache_free(pq->txreq_cache, tx);
	tx = NULL;

	idx = req->info.comp_idx;
	if (req->status == -1 && status == SDMA_TXREQ_S_OK) {
		if (req->seqcomp == req->info.npkts - 1) {
			req->status = 0;
			user_sdma_free_request(req, false);
			pq_update(pq);
			set_comp_state(pq, cq, idx, COMPLETE, 0);
		}
	} else {
		if (status != SDMA_TXREQ_S_OK)
			req->status = status;
		if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) &&
		    (test_bit(SDMA_REQ_SEND_DONE, &req->flags) ||
		     test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) {
			user_sdma_free_request(req, false);
			pq_update(pq);
			set_comp_state(pq, cq, idx, ERROR, req->status);
		}
	}
}

/*
 * Drop the queue's active request count. The last completion marks the
 * queue inactive and wakes any waiter in hfi1_user_sdma_free_queues().
 */
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
{
	if (atomic_dec_and_test(&pq->n_reqs)) {
		xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
		wake_up(&pq->wait);
	}
}

static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
{
	if (!list_empty(&req->txps)) {
		struct sdma_txreq *t, *p;

		list_for_each_entry_safe(t, p, &req->txps, list) {
			struct user_sdma_txreq *tx =
				container_of(t, struct user_sdma_txreq, txreq);
			list_del_init(&t->list);
			sdma_txclean(req->pq->dd, t);
			kmem_cache_free(req->pq->txreq_cache, tx);
		}
	}
	if (req->data_iovs) {
		struct sdma_mmu_node *node;
		int i;

		for (i = 0; i < req->data_iovs; i++) {
			node = req->iovs[i].node;
			if (!node)
				continue;

			if (unpin)
				hfi1_mmu_rb_remove(&req->pq->sdma_rb_root,
						   &node->rb);
			else
				atomic_dec(&node->refcount);
		}
	}
	kfree(req->tids);
	clear_bit(SDMA_REQ_IN_USE, &req->flags);
}

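/*
 * Descriptive comment added for clarity: set_comp_state() publishes the
 * state of one completion ring entry to user space. On ERROR it also
 * records the positive error code, and every transition is mirrored by
 * a trace event.
 */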
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret)
{
	hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Setting completion status %u %d",
		  pq->dd->unit, pq->ctxt, pq->subctxt, idx, state, ret);
	cq->comps[idx].status = state;
	if (state == ERROR)
		cq->comps[idx].errcode = -ret;
	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
					idx, state, ret);
}

static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len)
{
	return (bool)(node->addr == addr);
}

static int sdma_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	atomic_inc(&node->refcount);
	return 0;
}

static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode,
			   struct mm_struct *mm)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	spin_lock(&node->pq->evict_lock);
	/*
	 * We've been called by the MMU notifier but this node has been
	 * scheduled for eviction. The eviction function will take care
	 * of freeing this node.
	 * We have to take the above lock first because we are racing
	 * against the setting of the bit in the eviction function.
	 */
	if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) {
		spin_unlock(&node->pq->evict_lock);
		return;
	}

	if (!list_empty(&node->list))
		list_del(&node->list);
	node->pq->n_locked -= node->npages;
	spin_unlock(&node->pq->evict_lock);

	/*
	 * If mm is set, we are being called by the MMU notifier and we
	 * should not pass a mm_struct to unpin_vector_pages(). This is to
	 * prevent a deadlock when hfi1_release_user_pages() attempts to
	 * take the mmap_sem, which the MMU notifier has already taken.
	 */
	unpin_vector_pages(mm ? NULL : current->mm, node->pages, 0,
			   node->npages);
	/*
	 * If called by the MMU notifier, we have to adjust the pinned
	 * page count ourselves.
	 */
	if (mm)
		mm->pinned_vm -= node->npages;
	kfree(node);
}

static int sdma_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	if (!atomic_read(&node->refcount))
		return 1;
	return 0;
}