1 /* 2 * Copyright(c) 2015, 2016 Intel Corporation. 3 * 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 * redistributing this file, you may do so under either license. 6 * 7 * GPL LICENSE SUMMARY 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of version 2 of the GNU General Public License as 11 * published by the Free Software Foundation. 12 * 13 * This program is distributed in the hope that it will be useful, but 14 * WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * General Public License for more details. 17 * 18 * BSD LICENSE 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 24 * - Redistributions of source code must retain the above copyright 25 * notice, this list of conditions and the following disclaimer. 26 * - Redistributions in binary form must reproduce the above copyright 27 * notice, this list of conditions and the following disclaimer in 28 * the documentation and/or other materials provided with the 29 * distribution. 30 * - Neither the name of Intel Corporation nor the names of its 31 * contributors may be used to endorse or promote products derived 32 * from this software without specific prior written permission. 33 * 34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 45 * 46 */ 47 #include <linux/mm.h> 48 #include <linux/types.h> 49 #include <linux/device.h> 50 #include <linux/dmapool.h> 51 #include <linux/slab.h> 52 #include <linux/list.h> 53 #include <linux/highmem.h> 54 #include <linux/io.h> 55 #include <linux/uio.h> 56 #include <linux/rbtree.h> 57 #include <linux/spinlock.h> 58 #include <linux/delay.h> 59 #include <linux/kthread.h> 60 #include <linux/mmu_context.h> 61 #include <linux/module.h> 62 #include <linux/vmalloc.h> 63 #include <linux/string.h> 64 65 #include "hfi.h" 66 #include "sdma.h" 67 #include "user_sdma.h" 68 #include "verbs.h" /* for the headers */ 69 #include "common.h" /* for struct hfi1_tid_info */ 70 #include "trace.h" 71 #include "mmu_rb.h" 72 73 static uint hfi1_sdma_comp_ring_size = 128; 74 module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO); 75 MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128"); 76 77 /* The maximum number of Data io vectors per message/request */ 78 #define MAX_VECTORS_PER_REQ 8 79 /* 80 * Maximum number of packet to send from each message/request 81 * before moving to the next one. 82 */ 83 #define MAX_PKTS_PER_QUEUE 16 84 85 #define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT)) 86 87 #define req_opcode(x) \ 88 (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) 89 #define req_version(x) \ 90 (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) 91 #define req_iovcnt(x) \ 92 (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK) 93 94 /* Number of BTH.PSN bits used for sequence number in expected rcvs */ 95 #define BTH_SEQ_MASK 0x7ffull 96 97 /* 98 * Define fields in the KDETH header so we can update the header 99 * template. 100 */ 101 #define KDETH_OFFSET_SHIFT 0 102 #define KDETH_OFFSET_MASK 0x7fff 103 #define KDETH_OM_SHIFT 15 104 #define KDETH_OM_MASK 0x1 105 #define KDETH_TID_SHIFT 16 106 #define KDETH_TID_MASK 0x3ff 107 #define KDETH_TIDCTRL_SHIFT 26 108 #define KDETH_TIDCTRL_MASK 0x3 109 #define KDETH_INTR_SHIFT 28 110 #define KDETH_INTR_MASK 0x1 111 #define KDETH_SH_SHIFT 29 112 #define KDETH_SH_MASK 0x1 113 #define KDETH_HCRC_UPPER_SHIFT 16 114 #define KDETH_HCRC_UPPER_MASK 0xff 115 #define KDETH_HCRC_LOWER_SHIFT 24 116 #define KDETH_HCRC_LOWER_MASK 0xff 117 118 #define AHG_KDETH_INTR_SHIFT 12 119 #define AHG_KDETH_SH_SHIFT 13 120 121 #define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4) 122 #define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff) 123 124 #define KDETH_GET(val, field) \ 125 (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK) 126 #define KDETH_SET(dw, field, val) do { \ 127 u32 dwval = le32_to_cpu(dw); \ 128 dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \ 129 dwval |= (((val) & KDETH_##field##_MASK) << \ 130 KDETH_##field##_SHIFT); \ 131 dw = cpu_to_le32(dwval); \ 132 } while (0) 133 134 #define AHG_HEADER_SET(arr, idx, dw, bit, width, value) \ 135 do { \ 136 if ((idx) < ARRAY_SIZE((arr))) \ 137 (arr)[(idx++)] = sdma_build_ahg_descriptor( \ 138 (__force u16)(value), (dw), (bit), \ 139 (width)); \ 140 else \ 141 return -ERANGE; \ 142 } while (0) 143 144 /* KDETH OM multipliers and switch over point */ 145 #define KDETH_OM_SMALL 4 146 #define KDETH_OM_LARGE 64 147 #define KDETH_OM_MAX_SIZE (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1)) 148 149 /* Tx request flag bits */ 150 #define TXREQ_FLAGS_REQ_ACK BIT(0) /* Set the ACK bit in the header */ 151 #define TXREQ_FLAGS_REQ_DISABLE_SH BIT(1) /* Disable header suppression */ 152 153 /* SDMA request flag bits */ 154 #define SDMA_REQ_FOR_THREAD 1 155 #define SDMA_REQ_SEND_DONE 2 156 #define SDMA_REQ_HAVE_AHG 3 157 #define SDMA_REQ_HAS_ERROR 4 158 #define SDMA_REQ_DONE_ERROR 5 159 160 #define SDMA_PKT_Q_INACTIVE BIT(0) 161 #define SDMA_PKT_Q_ACTIVE BIT(1) 162 #define SDMA_PKT_Q_DEFERRED BIT(2) 163 164 /* 165 * Maximum retry attempts to submit a TX request 166 * before putting the process to sleep. 167 */ 168 #define MAX_DEFER_RETRY_COUNT 1 169 170 static unsigned initial_pkt_count = 8; 171 172 #define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */ 173 174 struct sdma_mmu_node; 175 176 struct user_sdma_iovec { 177 struct list_head list; 178 struct iovec iov; 179 /* number of pages in this vector */ 180 unsigned npages; 181 /* array of pinned pages for this vector */ 182 struct page **pages; 183 /* 184 * offset into the virtual address space of the vector at 185 * which we last left off. 186 */ 187 u64 offset; 188 struct sdma_mmu_node *node; 189 }; 190 191 struct sdma_mmu_node { 192 struct mmu_rb_node rb; 193 struct hfi1_user_sdma_pkt_q *pq; 194 atomic_t refcount; 195 struct page **pages; 196 unsigned npages; 197 }; 198 199 /* evict operation argument */ 200 struct evict_data { 201 u32 cleared; /* count evicted so far */ 202 u32 target; /* target count to evict */ 203 }; 204 205 struct user_sdma_request { 206 struct sdma_req_info info; 207 struct hfi1_user_sdma_pkt_q *pq; 208 struct hfi1_user_sdma_comp_q *cq; 209 /* This is the original header from user space */ 210 struct hfi1_pkt_header hdr; 211 /* 212 * Pointer to the SDMA engine for this request. 213 * Since different request could be on different VLs, 214 * each request will need it's own engine pointer. 215 */ 216 struct sdma_engine *sde; 217 u8 ahg_idx; 218 u32 ahg[9]; 219 /* 220 * KDETH.Offset (Eager) field 221 * We need to remember the initial value so the headers 222 * can be updated properly. 223 */ 224 u32 koffset; 225 /* 226 * KDETH.OFFSET (TID) field 227 * The offset can cover multiple packets, depending on the 228 * size of the TID entry. 229 */ 230 u32 tidoffset; 231 /* 232 * KDETH.OM 233 * Remember this because the header template always sets it 234 * to 0. 235 */ 236 u8 omfactor; 237 /* 238 * We copy the iovs for this request (based on 239 * info.iovcnt). These are only the data vectors 240 */ 241 unsigned data_iovs; 242 /* total length of the data in the request */ 243 u32 data_len; 244 /* progress index moving along the iovs array */ 245 unsigned iov_idx; 246 struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ]; 247 /* number of elements copied to the tids array */ 248 u16 n_tids; 249 /* TID array values copied from the tid_iov vector */ 250 u32 *tids; 251 u16 tididx; 252 u32 sent; 253 u64 seqnum; 254 u64 seqcomp; 255 u64 seqsubmitted; 256 struct list_head txps; 257 unsigned long flags; 258 /* status of the last txreq completed */ 259 int status; 260 }; 261 262 /* 263 * A single txreq could span up to 3 physical pages when the MTU 264 * is sufficiently large (> 4K). Each of the IOV pointers also 265 * needs it's own set of flags so the vector has been handled 266 * independently of each other. 267 */ 268 struct user_sdma_txreq { 269 /* Packet header for the txreq */ 270 struct hfi1_pkt_header hdr; 271 struct sdma_txreq txreq; 272 struct list_head list; 273 struct user_sdma_request *req; 274 u16 flags; 275 unsigned busycount; 276 u64 seqnum; 277 }; 278 279 #define SDMA_DBG(req, fmt, ...) \ 280 hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \ 281 (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \ 282 ##__VA_ARGS__) 283 #define SDMA_Q_DBG(pq, fmt, ...) \ 284 hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \ 285 (pq)->subctxt, ##__VA_ARGS__) 286 287 static int user_sdma_send_pkts(struct user_sdma_request *, unsigned); 288 static int num_user_pages(const struct iovec *); 289 static void user_sdma_txreq_cb(struct sdma_txreq *, int); 290 static inline void pq_update(struct hfi1_user_sdma_pkt_q *); 291 static void user_sdma_free_request(struct user_sdma_request *, bool); 292 static int pin_vector_pages(struct user_sdma_request *, 293 struct user_sdma_iovec *); 294 static void unpin_vector_pages(struct mm_struct *, struct page **, unsigned, 295 unsigned); 296 static int check_header_template(struct user_sdma_request *, 297 struct hfi1_pkt_header *, u32, u32); 298 static int set_txreq_header(struct user_sdma_request *, 299 struct user_sdma_txreq *, u32); 300 static int set_txreq_header_ahg(struct user_sdma_request *, 301 struct user_sdma_txreq *, u32); 302 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *, 303 struct hfi1_user_sdma_comp_q *, 304 u16, enum hfi1_sdma_comp_state, int); 305 static inline u32 set_pkt_bth_psn(__be32, u8, u32); 306 static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); 307 308 static int defer_packet_queue( 309 struct sdma_engine *, 310 struct iowait *, 311 struct sdma_txreq *, 312 unsigned seq); 313 static void activate_packet_queue(struct iowait *, int); 314 static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long); 315 static int sdma_rb_insert(void *, struct mmu_rb_node *); 316 static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, 317 void *arg2, bool *stop); 318 static void sdma_rb_remove(void *, struct mmu_rb_node *); 319 static int sdma_rb_invalidate(void *, struct mmu_rb_node *); 320 321 static struct mmu_rb_ops sdma_rb_ops = { 322 .filter = sdma_rb_filter, 323 .insert = sdma_rb_insert, 324 .evict = sdma_rb_evict, 325 .remove = sdma_rb_remove, 326 .invalidate = sdma_rb_invalidate 327 }; 328 329 static int defer_packet_queue( 330 struct sdma_engine *sde, 331 struct iowait *wait, 332 struct sdma_txreq *txreq, 333 unsigned seq) 334 { 335 struct hfi1_user_sdma_pkt_q *pq = 336 container_of(wait, struct hfi1_user_sdma_pkt_q, busy); 337 struct hfi1_ibdev *dev = &pq->dd->verbs_dev; 338 struct user_sdma_txreq *tx = 339 container_of(txreq, struct user_sdma_txreq, txreq); 340 341 if (sdma_progress(sde, seq, txreq)) { 342 if (tx->busycount++ < MAX_DEFER_RETRY_COUNT) 343 goto eagain; 344 } 345 /* 346 * We are assuming that if the list is enqueued somewhere, it 347 * is to the dmawait list since that is the only place where 348 * it is supposed to be enqueued. 349 */ 350 xchg(&pq->state, SDMA_PKT_Q_DEFERRED); 351 write_seqlock(&dev->iowait_lock); 352 if (list_empty(&pq->busy.list)) 353 list_add_tail(&pq->busy.list, &sde->dmawait); 354 write_sequnlock(&dev->iowait_lock); 355 return -EBUSY; 356 eagain: 357 return -EAGAIN; 358 } 359 360 static void activate_packet_queue(struct iowait *wait, int reason) 361 { 362 struct hfi1_user_sdma_pkt_q *pq = 363 container_of(wait, struct hfi1_user_sdma_pkt_q, busy); 364 xchg(&pq->state, SDMA_PKT_Q_ACTIVE); 365 wake_up(&wait->wait_dma); 366 }; 367 368 static void sdma_kmem_cache_ctor(void *obj) 369 { 370 struct user_sdma_txreq *tx = obj; 371 372 memset(tx, 0, sizeof(*tx)); 373 } 374 375 int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) 376 { 377 struct hfi1_filedata *fd; 378 int ret = 0; 379 unsigned memsize; 380 char buf[64]; 381 struct hfi1_devdata *dd; 382 struct hfi1_user_sdma_comp_q *cq; 383 struct hfi1_user_sdma_pkt_q *pq; 384 unsigned long flags; 385 386 if (!uctxt || !fp) { 387 ret = -EBADF; 388 goto done; 389 } 390 391 fd = fp->private_data; 392 393 if (!hfi1_sdma_comp_ring_size) { 394 ret = -EINVAL; 395 goto done; 396 } 397 398 dd = uctxt->dd; 399 400 pq = kzalloc(sizeof(*pq), GFP_KERNEL); 401 if (!pq) 402 goto pq_nomem; 403 404 memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size; 405 pq->reqs = kzalloc(memsize, GFP_KERNEL); 406 if (!pq->reqs) 407 goto pq_reqs_nomem; 408 409 memsize = BITS_TO_LONGS(hfi1_sdma_comp_ring_size) * sizeof(long); 410 pq->req_in_use = kzalloc(memsize, GFP_KERNEL); 411 if (!pq->req_in_use) 412 goto pq_reqs_no_in_use; 413 414 INIT_LIST_HEAD(&pq->list); 415 pq->dd = dd; 416 pq->ctxt = uctxt->ctxt; 417 pq->subctxt = fd->subctxt; 418 pq->n_max_reqs = hfi1_sdma_comp_ring_size; 419 pq->state = SDMA_PKT_Q_INACTIVE; 420 atomic_set(&pq->n_reqs, 0); 421 init_waitqueue_head(&pq->wait); 422 atomic_set(&pq->n_locked, 0); 423 pq->mm = fd->mm; 424 425 iowait_init(&pq->busy, 0, NULL, defer_packet_queue, 426 activate_packet_queue, NULL); 427 pq->reqidx = 0; 428 snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt, 429 fd->subctxt); 430 pq->txreq_cache = kmem_cache_create(buf, 431 sizeof(struct user_sdma_txreq), 432 L1_CACHE_BYTES, 433 SLAB_HWCACHE_ALIGN, 434 sdma_kmem_cache_ctor); 435 if (!pq->txreq_cache) { 436 dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n", 437 uctxt->ctxt); 438 goto pq_txreq_nomem; 439 } 440 fd->pq = pq; 441 cq = kzalloc(sizeof(*cq), GFP_KERNEL); 442 if (!cq) 443 goto cq_nomem; 444 445 memsize = PAGE_ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size); 446 cq->comps = vmalloc_user(memsize); 447 if (!cq->comps) 448 goto cq_comps_nomem; 449 450 cq->nentries = hfi1_sdma_comp_ring_size; 451 fd->cq = cq; 452 453 ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq, 454 &pq->handler); 455 if (ret) { 456 dd_dev_err(dd, "Failed to register with MMU %d", ret); 457 goto done; 458 } 459 460 spin_lock_irqsave(&uctxt->sdma_qlock, flags); 461 list_add(&pq->list, &uctxt->sdma_queues); 462 spin_unlock_irqrestore(&uctxt->sdma_qlock, flags); 463 goto done; 464 465 cq_comps_nomem: 466 kfree(cq); 467 cq_nomem: 468 kmem_cache_destroy(pq->txreq_cache); 469 pq_txreq_nomem: 470 kfree(pq->req_in_use); 471 pq_reqs_no_in_use: 472 kfree(pq->reqs); 473 pq_reqs_nomem: 474 kfree(pq); 475 fd->pq = NULL; 476 pq_nomem: 477 ret = -ENOMEM; 478 done: 479 return ret; 480 } 481 482 int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd) 483 { 484 struct hfi1_ctxtdata *uctxt = fd->uctxt; 485 struct hfi1_user_sdma_pkt_q *pq; 486 unsigned long flags; 487 488 hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit, 489 uctxt->ctxt, fd->subctxt); 490 pq = fd->pq; 491 if (pq) { 492 if (pq->handler) 493 hfi1_mmu_rb_unregister(pq->handler); 494 spin_lock_irqsave(&uctxt->sdma_qlock, flags); 495 if (!list_empty(&pq->list)) 496 list_del_init(&pq->list); 497 spin_unlock_irqrestore(&uctxt->sdma_qlock, flags); 498 iowait_sdma_drain(&pq->busy); 499 /* Wait until all requests have been freed. */ 500 wait_event_interruptible( 501 pq->wait, 502 (ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE)); 503 kfree(pq->reqs); 504 kfree(pq->req_in_use); 505 kmem_cache_destroy(pq->txreq_cache); 506 kfree(pq); 507 fd->pq = NULL; 508 } 509 if (fd->cq) { 510 vfree(fd->cq->comps); 511 kfree(fd->cq); 512 fd->cq = NULL; 513 } 514 return 0; 515 } 516 517 static u8 dlid_to_selector(u16 dlid) 518 { 519 static u8 mapping[256]; 520 static int initialized; 521 static u8 next; 522 int hash; 523 524 if (!initialized) { 525 memset(mapping, 0xFF, 256); 526 initialized = 1; 527 } 528 529 hash = ((dlid >> 8) ^ dlid) & 0xFF; 530 if (mapping[hash] == 0xFF) { 531 mapping[hash] = next; 532 next = (next + 1) & 0x7F; 533 } 534 535 return mapping[hash]; 536 } 537 538 int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, 539 unsigned long dim, unsigned long *count) 540 { 541 int ret = 0, i; 542 struct hfi1_filedata *fd = fp->private_data; 543 struct hfi1_ctxtdata *uctxt = fd->uctxt; 544 struct hfi1_user_sdma_pkt_q *pq = fd->pq; 545 struct hfi1_user_sdma_comp_q *cq = fd->cq; 546 struct hfi1_devdata *dd = pq->dd; 547 unsigned long idx = 0; 548 u8 pcount = initial_pkt_count; 549 struct sdma_req_info info; 550 struct user_sdma_request *req; 551 u8 opcode, sc, vl; 552 int req_queued = 0; 553 u16 dlid; 554 u32 selector; 555 556 if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) { 557 hfi1_cdbg( 558 SDMA, 559 "[%u:%u:%u] First vector not big enough for header %lu/%lu", 560 dd->unit, uctxt->ctxt, fd->subctxt, 561 iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr)); 562 return -EINVAL; 563 } 564 ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info)); 565 if (ret) { 566 hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)", 567 dd->unit, uctxt->ctxt, fd->subctxt, ret); 568 return -EFAULT; 569 } 570 571 trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt, 572 (u16 *)&info); 573 574 if (info.comp_idx >= hfi1_sdma_comp_ring_size) { 575 hfi1_cdbg(SDMA, 576 "[%u:%u:%u:%u] Invalid comp index", 577 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); 578 return -EINVAL; 579 } 580 581 /* 582 * Sanity check the header io vector count. Need at least 1 vector 583 * (header) and cannot be larger than the actual io vector count. 584 */ 585 if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) { 586 hfi1_cdbg(SDMA, 587 "[%u:%u:%u:%u] Invalid iov count %d, dim %ld", 588 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx, 589 req_iovcnt(info.ctrl), dim); 590 return -EINVAL; 591 } 592 593 if (!info.fragsize) { 594 hfi1_cdbg(SDMA, 595 "[%u:%u:%u:%u] Request does not specify fragsize", 596 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); 597 return -EINVAL; 598 } 599 600 /* Try to claim the request. */ 601 if (test_and_set_bit(info.comp_idx, pq->req_in_use)) { 602 hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use", 603 dd->unit, uctxt->ctxt, fd->subctxt, 604 info.comp_idx); 605 return -EBADSLT; 606 } 607 /* 608 * All safety checks have been done and this request has been claimed. 609 */ 610 hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit, 611 uctxt->ctxt, fd->subctxt, info.comp_idx); 612 req = pq->reqs + info.comp_idx; 613 memset(req, 0, sizeof(*req)); 614 req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */ 615 req->pq = pq; 616 req->cq = cq; 617 req->status = -1; 618 INIT_LIST_HEAD(&req->txps); 619 620 memcpy(&req->info, &info, sizeof(info)); 621 622 if (req_opcode(info.ctrl) == EXPECTED) { 623 /* expected must have a TID info and at least one data vector */ 624 if (req->data_iovs < 2) { 625 SDMA_DBG(req, 626 "Not enough vectors for expected request"); 627 ret = -EINVAL; 628 goto free_req; 629 } 630 req->data_iovs--; 631 } 632 633 if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) { 634 SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs, 635 MAX_VECTORS_PER_REQ); 636 ret = -EINVAL; 637 goto free_req; 638 } 639 /* Copy the header from the user buffer */ 640 ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info), 641 sizeof(req->hdr)); 642 if (ret) { 643 SDMA_DBG(req, "Failed to copy header template (%d)", ret); 644 ret = -EFAULT; 645 goto free_req; 646 } 647 648 /* If Static rate control is not enabled, sanitize the header. */ 649 if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL)) 650 req->hdr.pbc[2] = 0; 651 652 /* Validate the opcode. Do not trust packets from user space blindly. */ 653 opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff; 654 if ((opcode & USER_OPCODE_CHECK_MASK) != 655 USER_OPCODE_CHECK_VAL) { 656 SDMA_DBG(req, "Invalid opcode (%d)", opcode); 657 ret = -EINVAL; 658 goto free_req; 659 } 660 /* 661 * Validate the vl. Do not trust packets from user space blindly. 662 * VL comes from PBC, SC comes from LRH, and the VL needs to 663 * match the SC look up. 664 */ 665 vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF; 666 sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) | 667 (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4)); 668 if (vl >= dd->pport->vls_operational || 669 vl != sc_to_vlt(dd, sc)) { 670 SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl); 671 ret = -EINVAL; 672 goto free_req; 673 } 674 675 /* Checking P_KEY for requests from user-space */ 676 if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc, 677 PKEY_CHECK_INVALID)) { 678 ret = -EINVAL; 679 goto free_req; 680 } 681 682 /* 683 * Also should check the BTH.lnh. If it says the next header is GRH then 684 * the RXE parsing will be off and will land in the middle of the KDETH 685 * or miss it entirely. 686 */ 687 if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) { 688 SDMA_DBG(req, "User tried to pass in a GRH"); 689 ret = -EINVAL; 690 goto free_req; 691 } 692 693 req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]); 694 /* 695 * Calculate the initial TID offset based on the values of 696 * KDETH.OFFSET and KDETH.OM that are passed in. 697 */ 698 req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) * 699 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? 700 KDETH_OM_LARGE : KDETH_OM_SMALL); 701 SDMA_DBG(req, "Initial TID offset %u", req->tidoffset); 702 idx++; 703 704 /* Save all the IO vector structures */ 705 for (i = 0; i < req->data_iovs; i++) { 706 INIT_LIST_HEAD(&req->iovs[i].list); 707 memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec)); 708 ret = pin_vector_pages(req, &req->iovs[i]); 709 if (ret) { 710 req->status = ret; 711 goto free_req; 712 } 713 req->data_len += req->iovs[i].iov.iov_len; 714 } 715 SDMA_DBG(req, "total data length %u", req->data_len); 716 717 if (pcount > req->info.npkts) 718 pcount = req->info.npkts; 719 /* 720 * Copy any TID info 721 * User space will provide the TID info only when the 722 * request type is EXPECTED. This is true even if there is 723 * only one packet in the request and the header is already 724 * setup. The reason for the singular TID case is that the 725 * driver needs to perform safety checks. 726 */ 727 if (req_opcode(req->info.ctrl) == EXPECTED) { 728 u16 ntids = iovec[idx].iov_len / sizeof(*req->tids); 729 u32 *tmp; 730 731 if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) { 732 ret = -EINVAL; 733 goto free_req; 734 } 735 736 /* 737 * We have to copy all of the tids because they may vary 738 * in size and, therefore, the TID count might not be 739 * equal to the pkt count. However, there is no way to 740 * tell at this point. 741 */ 742 tmp = memdup_user(iovec[idx].iov_base, 743 ntids * sizeof(*req->tids)); 744 if (IS_ERR(tmp)) { 745 ret = PTR_ERR(tmp); 746 SDMA_DBG(req, "Failed to copy %d TIDs (%d)", 747 ntids, ret); 748 goto free_req; 749 } 750 req->tids = tmp; 751 req->n_tids = ntids; 752 idx++; 753 } 754 755 dlid = be16_to_cpu(req->hdr.lrh[1]); 756 selector = dlid_to_selector(dlid); 757 selector += uctxt->ctxt + fd->subctxt; 758 req->sde = sdma_select_user_engine(dd, selector, vl); 759 760 if (!req->sde || !sdma_running(req->sde)) { 761 ret = -ECOMM; 762 goto free_req; 763 } 764 765 /* We don't need an AHG entry if the request contains only one packet */ 766 if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) { 767 int ahg = sdma_ahg_alloc(req->sde); 768 769 if (likely(ahg >= 0)) { 770 req->ahg_idx = (u8)ahg; 771 set_bit(SDMA_REQ_HAVE_AHG, &req->flags); 772 } 773 } 774 775 set_comp_state(pq, cq, info.comp_idx, QUEUED, 0); 776 atomic_inc(&pq->n_reqs); 777 req_queued = 1; 778 /* Send the first N packets in the request to buy us some time */ 779 ret = user_sdma_send_pkts(req, pcount); 780 if (unlikely(ret < 0 && ret != -EBUSY)) { 781 req->status = ret; 782 goto free_req; 783 } 784 785 /* 786 * It is possible that the SDMA engine would have processed all the 787 * submitted packets by the time we get here. Therefore, only set 788 * packet queue state to ACTIVE if there are still uncompleted 789 * requests. 790 */ 791 if (atomic_read(&pq->n_reqs)) 792 xchg(&pq->state, SDMA_PKT_Q_ACTIVE); 793 794 /* 795 * This is a somewhat blocking send implementation. 796 * The driver will block the caller until all packets of the 797 * request have been submitted to the SDMA engine. However, it 798 * will not wait for send completions. 799 */ 800 while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) { 801 ret = user_sdma_send_pkts(req, pcount); 802 if (ret < 0) { 803 if (ret != -EBUSY) { 804 req->status = ret; 805 set_bit(SDMA_REQ_DONE_ERROR, &req->flags); 806 if (ACCESS_ONCE(req->seqcomp) == 807 req->seqsubmitted - 1) 808 goto free_req; 809 return ret; 810 } 811 wait_event_interruptible_timeout( 812 pq->busy.wait_dma, 813 (pq->state == SDMA_PKT_Q_ACTIVE), 814 msecs_to_jiffies( 815 SDMA_IOWAIT_TIMEOUT)); 816 } 817 } 818 *count += idx; 819 return 0; 820 free_req: 821 user_sdma_free_request(req, true); 822 if (req_queued) 823 pq_update(pq); 824 set_comp_state(pq, cq, info.comp_idx, ERROR, req->status); 825 return ret; 826 } 827 828 static inline u32 compute_data_length(struct user_sdma_request *req, 829 struct user_sdma_txreq *tx) 830 { 831 /* 832 * Determine the proper size of the packet data. 833 * The size of the data of the first packet is in the header 834 * template. However, it includes the header and ICRC, which need 835 * to be subtracted. 836 * The minimum representable packet data length in a header is 4 bytes, 837 * therefore, when the data length request is less than 4 bytes, there's 838 * only one packet, and the packet data length is equal to that of the 839 * request data length. 840 * The size of the remaining packets is the minimum of the frag 841 * size (MTU) or remaining data in the request. 842 */ 843 u32 len; 844 845 if (!req->seqnum) { 846 if (req->data_len < sizeof(u32)) 847 len = req->data_len; 848 else 849 len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - 850 (sizeof(tx->hdr) - 4)); 851 } else if (req_opcode(req->info.ctrl) == EXPECTED) { 852 u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) * 853 PAGE_SIZE; 854 /* 855 * Get the data length based on the remaining space in the 856 * TID pair. 857 */ 858 len = min(tidlen - req->tidoffset, (u32)req->info.fragsize); 859 /* If we've filled up the TID pair, move to the next one. */ 860 if (unlikely(!len) && ++req->tididx < req->n_tids && 861 req->tids[req->tididx]) { 862 tidlen = EXP_TID_GET(req->tids[req->tididx], 863 LEN) * PAGE_SIZE; 864 req->tidoffset = 0; 865 len = min_t(u32, tidlen, req->info.fragsize); 866 } 867 /* 868 * Since the TID pairs map entire pages, make sure that we 869 * are not going to try to send more data that we have 870 * remaining. 871 */ 872 len = min(len, req->data_len - req->sent); 873 } else { 874 len = min(req->data_len - req->sent, (u32)req->info.fragsize); 875 } 876 SDMA_DBG(req, "Data Length = %u", len); 877 return len; 878 } 879 880 static inline u32 pad_len(u32 len) 881 { 882 if (len & (sizeof(u32) - 1)) 883 len += sizeof(u32) - (len & (sizeof(u32) - 1)); 884 return len; 885 } 886 887 static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len) 888 { 889 /* (Size of complete header - size of PBC) + 4B ICRC + data length */ 890 return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len); 891 } 892 893 static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) 894 { 895 int ret = 0, count; 896 unsigned npkts = 0; 897 struct user_sdma_txreq *tx = NULL; 898 struct hfi1_user_sdma_pkt_q *pq = NULL; 899 struct user_sdma_iovec *iovec = NULL; 900 901 if (!req->pq) 902 return -EINVAL; 903 904 pq = req->pq; 905 906 /* If tx completion has reported an error, we are done. */ 907 if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) { 908 set_bit(SDMA_REQ_DONE_ERROR, &req->flags); 909 return -EFAULT; 910 } 911 912 /* 913 * Check if we might have sent the entire request already 914 */ 915 if (unlikely(req->seqnum == req->info.npkts)) { 916 if (!list_empty(&req->txps)) 917 goto dosend; 918 return ret; 919 } 920 921 if (!maxpkts || maxpkts > req->info.npkts - req->seqnum) 922 maxpkts = req->info.npkts - req->seqnum; 923 924 while (npkts < maxpkts) { 925 u32 datalen = 0, queued = 0, data_sent = 0; 926 u64 iov_offset = 0; 927 928 /* 929 * Check whether any of the completions have come back 930 * with errors. If so, we are not going to process any 931 * more packets from this request. 932 */ 933 if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) { 934 set_bit(SDMA_REQ_DONE_ERROR, &req->flags); 935 return -EFAULT; 936 } 937 938 tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL); 939 if (!tx) 940 return -ENOMEM; 941 942 tx->flags = 0; 943 tx->req = req; 944 tx->busycount = 0; 945 INIT_LIST_HEAD(&tx->list); 946 947 /* 948 * For the last packet set the ACK request 949 * and disable header suppression. 950 */ 951 if (req->seqnum == req->info.npkts - 1) 952 tx->flags |= (TXREQ_FLAGS_REQ_ACK | 953 TXREQ_FLAGS_REQ_DISABLE_SH); 954 955 /* 956 * Calculate the payload size - this is min of the fragment 957 * (MTU) size or the remaining bytes in the request but only 958 * if we have payload data. 959 */ 960 if (req->data_len) { 961 iovec = &req->iovs[req->iov_idx]; 962 if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) { 963 if (++req->iov_idx == req->data_iovs) { 964 ret = -EFAULT; 965 goto free_txreq; 966 } 967 iovec = &req->iovs[req->iov_idx]; 968 WARN_ON(iovec->offset); 969 } 970 971 datalen = compute_data_length(req, tx); 972 973 /* 974 * Disable header suppression for the payload <= 8DWS. 975 * If there is an uncorrectable error in the receive 976 * data FIFO when the received payload size is less than 977 * or equal to 8DWS then the RxDmaDataFifoRdUncErr is 978 * not reported.There is set RHF.EccErr if the header 979 * is not suppressed. 980 */ 981 if (!datalen) { 982 SDMA_DBG(req, 983 "Request has data but pkt len is 0"); 984 ret = -EFAULT; 985 goto free_tx; 986 } else if (datalen <= 32) { 987 tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH; 988 } 989 } 990 991 if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) { 992 if (!req->seqnum) { 993 u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); 994 u32 lrhlen = get_lrh_len(req->hdr, 995 pad_len(datalen)); 996 /* 997 * Copy the request header into the tx header 998 * because the HW needs a cacheline-aligned 999 * address. 1000 * This copy can be optimized out if the hdr 1001 * member of user_sdma_request were also 1002 * cacheline aligned. 1003 */ 1004 memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr)); 1005 if (PBC2LRH(pbclen) != lrhlen) { 1006 pbclen = (pbclen & 0xf000) | 1007 LRH2PBC(lrhlen); 1008 tx->hdr.pbc[0] = cpu_to_le16(pbclen); 1009 } 1010 ret = check_header_template(req, &tx->hdr, 1011 lrhlen, datalen); 1012 if (ret) 1013 goto free_tx; 1014 ret = sdma_txinit_ahg(&tx->txreq, 1015 SDMA_TXREQ_F_AHG_COPY, 1016 sizeof(tx->hdr) + datalen, 1017 req->ahg_idx, 0, NULL, 0, 1018 user_sdma_txreq_cb); 1019 if (ret) 1020 goto free_tx; 1021 ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, 1022 &tx->hdr, 1023 sizeof(tx->hdr)); 1024 if (ret) 1025 goto free_txreq; 1026 } else { 1027 int changes; 1028 1029 changes = set_txreq_header_ahg(req, tx, 1030 datalen); 1031 if (changes < 0) 1032 goto free_tx; 1033 sdma_txinit_ahg(&tx->txreq, 1034 SDMA_TXREQ_F_USE_AHG, 1035 datalen, req->ahg_idx, changes, 1036 req->ahg, sizeof(req->hdr), 1037 user_sdma_txreq_cb); 1038 } 1039 } else { 1040 ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) + 1041 datalen, user_sdma_txreq_cb); 1042 if (ret) 1043 goto free_tx; 1044 /* 1045 * Modify the header for this packet. This only needs 1046 * to be done if we are not going to use AHG. Otherwise, 1047 * the HW will do it based on the changes we gave it 1048 * during sdma_txinit_ahg(). 1049 */ 1050 ret = set_txreq_header(req, tx, datalen); 1051 if (ret) 1052 goto free_txreq; 1053 } 1054 1055 /* 1056 * If the request contains any data vectors, add up to 1057 * fragsize bytes to the descriptor. 1058 */ 1059 while (queued < datalen && 1060 (req->sent + data_sent) < req->data_len) { 1061 unsigned long base, offset; 1062 unsigned pageidx, len; 1063 1064 base = (unsigned long)iovec->iov.iov_base; 1065 offset = offset_in_page(base + iovec->offset + 1066 iov_offset); 1067 pageidx = (((iovec->offset + iov_offset + 1068 base) - (base & PAGE_MASK)) >> PAGE_SHIFT); 1069 len = offset + req->info.fragsize > PAGE_SIZE ? 1070 PAGE_SIZE - offset : req->info.fragsize; 1071 len = min((datalen - queued), len); 1072 ret = sdma_txadd_page(pq->dd, &tx->txreq, 1073 iovec->pages[pageidx], 1074 offset, len); 1075 if (ret) { 1076 SDMA_DBG(req, "SDMA txreq add page failed %d\n", 1077 ret); 1078 goto free_txreq; 1079 } 1080 iov_offset += len; 1081 queued += len; 1082 data_sent += len; 1083 if (unlikely(queued < datalen && 1084 pageidx == iovec->npages && 1085 req->iov_idx < req->data_iovs - 1)) { 1086 iovec->offset += iov_offset; 1087 iovec = &req->iovs[++req->iov_idx]; 1088 iov_offset = 0; 1089 } 1090 } 1091 /* 1092 * The txreq was submitted successfully so we can update 1093 * the counters. 1094 */ 1095 req->koffset += datalen; 1096 if (req_opcode(req->info.ctrl) == EXPECTED) 1097 req->tidoffset += datalen; 1098 req->sent += data_sent; 1099 if (req->data_len) 1100 iovec->offset += iov_offset; 1101 list_add_tail(&tx->txreq.list, &req->txps); 1102 /* 1103 * It is important to increment this here as it is used to 1104 * generate the BTH.PSN and, therefore, can't be bulk-updated 1105 * outside of the loop. 1106 */ 1107 tx->seqnum = req->seqnum++; 1108 npkts++; 1109 } 1110 dosend: 1111 ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count); 1112 req->seqsubmitted += count; 1113 if (req->seqsubmitted == req->info.npkts) { 1114 set_bit(SDMA_REQ_SEND_DONE, &req->flags); 1115 /* 1116 * The txreq has already been submitted to the HW queue 1117 * so we can free the AHG entry now. Corruption will not 1118 * happen due to the sequential manner in which 1119 * descriptors are processed. 1120 */ 1121 if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) 1122 sdma_ahg_free(req->sde, req->ahg_idx); 1123 } 1124 return ret; 1125 1126 free_txreq: 1127 sdma_txclean(pq->dd, &tx->txreq); 1128 free_tx: 1129 kmem_cache_free(pq->txreq_cache, tx); 1130 return ret; 1131 } 1132 1133 /* 1134 * How many pages in this iovec element? 1135 */ 1136 static inline int num_user_pages(const struct iovec *iov) 1137 { 1138 const unsigned long addr = (unsigned long)iov->iov_base; 1139 const unsigned long len = iov->iov_len; 1140 const unsigned long spage = addr & PAGE_MASK; 1141 const unsigned long epage = (addr + len - 1) & PAGE_MASK; 1142 1143 return 1 + ((epage - spage) >> PAGE_SHIFT); 1144 } 1145 1146 static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) 1147 { 1148 struct evict_data evict_data; 1149 1150 evict_data.cleared = 0; 1151 evict_data.target = npages; 1152 hfi1_mmu_rb_evict(pq->handler, &evict_data); 1153 return evict_data.cleared; 1154 } 1155 1156 static int pin_vector_pages(struct user_sdma_request *req, 1157 struct user_sdma_iovec *iovec) 1158 { 1159 int ret = 0, pinned, npages, cleared; 1160 struct page **pages; 1161 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1162 struct sdma_mmu_node *node = NULL; 1163 struct mmu_rb_node *rb_node; 1164 1165 rb_node = hfi1_mmu_rb_extract(pq->handler, 1166 (unsigned long)iovec->iov.iov_base, 1167 iovec->iov.iov_len); 1168 if (rb_node) 1169 node = container_of(rb_node, struct sdma_mmu_node, rb); 1170 else 1171 rb_node = NULL; 1172 1173 if (!node) { 1174 node = kzalloc(sizeof(*node), GFP_KERNEL); 1175 if (!node) 1176 return -ENOMEM; 1177 1178 node->rb.addr = (unsigned long)iovec->iov.iov_base; 1179 node->pq = pq; 1180 atomic_set(&node->refcount, 0); 1181 } 1182 1183 npages = num_user_pages(&iovec->iov); 1184 if (node->npages < npages) { 1185 pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); 1186 if (!pages) { 1187 SDMA_DBG(req, "Failed page array alloc"); 1188 ret = -ENOMEM; 1189 goto bail; 1190 } 1191 memcpy(pages, node->pages, node->npages * sizeof(*pages)); 1192 1193 npages -= node->npages; 1194 1195 retry: 1196 if (!hfi1_can_pin_pages(pq->dd, pq->mm, 1197 atomic_read(&pq->n_locked), npages)) { 1198 cleared = sdma_cache_evict(pq, npages); 1199 if (cleared >= npages) 1200 goto retry; 1201 } 1202 pinned = hfi1_acquire_user_pages(pq->mm, 1203 ((unsigned long)iovec->iov.iov_base + 1204 (node->npages * PAGE_SIZE)), npages, 0, 1205 pages + node->npages); 1206 if (pinned < 0) { 1207 kfree(pages); 1208 ret = pinned; 1209 goto bail; 1210 } 1211 if (pinned != npages) { 1212 unpin_vector_pages(pq->mm, pages, node->npages, 1213 pinned); 1214 ret = -EFAULT; 1215 goto bail; 1216 } 1217 kfree(node->pages); 1218 node->rb.len = iovec->iov.iov_len; 1219 node->pages = pages; 1220 node->npages += pinned; 1221 npages = node->npages; 1222 atomic_add(pinned, &pq->n_locked); 1223 } 1224 iovec->pages = node->pages; 1225 iovec->npages = npages; 1226 iovec->node = node; 1227 1228 ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb); 1229 if (ret) { 1230 atomic_sub(node->npages, &pq->n_locked); 1231 iovec->node = NULL; 1232 goto bail; 1233 } 1234 return 0; 1235 bail: 1236 if (rb_node) 1237 unpin_vector_pages(pq->mm, node->pages, 0, node->npages); 1238 kfree(node); 1239 return ret; 1240 } 1241 1242 static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, 1243 unsigned start, unsigned npages) 1244 { 1245 hfi1_release_user_pages(mm, pages + start, npages, false); 1246 kfree(pages); 1247 } 1248 1249 static int check_header_template(struct user_sdma_request *req, 1250 struct hfi1_pkt_header *hdr, u32 lrhlen, 1251 u32 datalen) 1252 { 1253 /* 1254 * Perform safety checks for any type of packet: 1255 * - transfer size is multiple of 64bytes 1256 * - packet length is multiple of 4 bytes 1257 * - packet length is not larger than MTU size 1258 * 1259 * These checks are only done for the first packet of the 1260 * transfer since the header is "given" to us by user space. 1261 * For the remainder of the packets we compute the values. 1262 */ 1263 if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 || 1264 lrhlen > get_lrh_len(*hdr, req->info.fragsize)) 1265 return -EINVAL; 1266 1267 if (req_opcode(req->info.ctrl) == EXPECTED) { 1268 /* 1269 * The header is checked only on the first packet. Furthermore, 1270 * we ensure that at least one TID entry is copied when the 1271 * request is submitted. Therefore, we don't have to verify that 1272 * tididx points to something sane. 1273 */ 1274 u32 tidval = req->tids[req->tididx], 1275 tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE, 1276 tididx = EXP_TID_GET(tidval, IDX), 1277 tidctrl = EXP_TID_GET(tidval, CTRL), 1278 tidoff; 1279 __le32 kval = hdr->kdeth.ver_tid_offset; 1280 1281 tidoff = KDETH_GET(kval, OFFSET) * 1282 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? 1283 KDETH_OM_LARGE : KDETH_OM_SMALL); 1284 /* 1285 * Expected receive packets have the following 1286 * additional checks: 1287 * - offset is not larger than the TID size 1288 * - TIDCtrl values match between header and TID array 1289 * - TID indexes match between header and TID array 1290 */ 1291 if ((tidoff + datalen > tidlen) || 1292 KDETH_GET(kval, TIDCTRL) != tidctrl || 1293 KDETH_GET(kval, TID) != tididx) 1294 return -EINVAL; 1295 } 1296 return 0; 1297 } 1298 1299 /* 1300 * Correctly set the BTH.PSN field based on type of 1301 * transfer - eager packets can just increment the PSN but 1302 * expected packets encode generation and sequence in the 1303 * BTH.PSN field so just incrementing will result in errors. 1304 */ 1305 static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags) 1306 { 1307 u32 val = be32_to_cpu(bthpsn), 1308 mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull : 1309 0xffffffull), 1310 psn = val & mask; 1311 if (expct) 1312 psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK); 1313 else 1314 psn = psn + frags; 1315 return psn & mask; 1316 } 1317 1318 static int set_txreq_header(struct user_sdma_request *req, 1319 struct user_sdma_txreq *tx, u32 datalen) 1320 { 1321 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1322 struct hfi1_pkt_header *hdr = &tx->hdr; 1323 u16 pbclen; 1324 int ret; 1325 u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); 1326 1327 /* Copy the header template to the request before modification */ 1328 memcpy(hdr, &req->hdr, sizeof(*hdr)); 1329 1330 /* 1331 * Check if the PBC and LRH length are mismatched. If so 1332 * adjust both in the header. 1333 */ 1334 pbclen = le16_to_cpu(hdr->pbc[0]); 1335 if (PBC2LRH(pbclen) != lrhlen) { 1336 pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); 1337 hdr->pbc[0] = cpu_to_le16(pbclen); 1338 hdr->lrh[2] = cpu_to_be16(lrhlen >> 2); 1339 /* 1340 * Third packet 1341 * This is the first packet in the sequence that has 1342 * a "static" size that can be used for the rest of 1343 * the packets (besides the last one). 1344 */ 1345 if (unlikely(req->seqnum == 2)) { 1346 /* 1347 * From this point on the lengths in both the 1348 * PBC and LRH are the same until the last 1349 * packet. 1350 * Adjust the template so we don't have to update 1351 * every packet 1352 */ 1353 req->hdr.pbc[0] = hdr->pbc[0]; 1354 req->hdr.lrh[2] = hdr->lrh[2]; 1355 } 1356 } 1357 /* 1358 * We only have to modify the header if this is not the 1359 * first packet in the request. Otherwise, we use the 1360 * header given to us. 1361 */ 1362 if (unlikely(!req->seqnum)) { 1363 ret = check_header_template(req, hdr, lrhlen, datalen); 1364 if (ret) 1365 return ret; 1366 goto done; 1367 } 1368 1369 hdr->bth[2] = cpu_to_be32( 1370 set_pkt_bth_psn(hdr->bth[2], 1371 (req_opcode(req->info.ctrl) == EXPECTED), 1372 req->seqnum)); 1373 1374 /* Set ACK request on last packet */ 1375 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) 1376 hdr->bth[2] |= cpu_to_be32(1UL << 31); 1377 1378 /* Set the new offset */ 1379 hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset); 1380 /* Expected packets have to fill in the new TID information */ 1381 if (req_opcode(req->info.ctrl) == EXPECTED) { 1382 tidval = req->tids[req->tididx]; 1383 /* 1384 * If the offset puts us at the end of the current TID, 1385 * advance everything. 1386 */ 1387 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * 1388 PAGE_SIZE)) { 1389 req->tidoffset = 0; 1390 /* 1391 * Since we don't copy all the TIDs, all at once, 1392 * we have to check again. 1393 */ 1394 if (++req->tididx > req->n_tids - 1 || 1395 !req->tids[req->tididx]) { 1396 return -EINVAL; 1397 } 1398 tidval = req->tids[req->tididx]; 1399 } 1400 req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >= 1401 KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL; 1402 /* Set KDETH.TIDCtrl based on value for this TID. */ 1403 KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL, 1404 EXP_TID_GET(tidval, CTRL)); 1405 /* Set KDETH.TID based on value for this TID */ 1406 KDETH_SET(hdr->kdeth.ver_tid_offset, TID, 1407 EXP_TID_GET(tidval, IDX)); 1408 /* Clear KDETH.SH when DISABLE_SH flag is set */ 1409 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) 1410 KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0); 1411 /* 1412 * Set the KDETH.OFFSET and KDETH.OM based on size of 1413 * transfer. 1414 */ 1415 SDMA_DBG(req, "TID offset %ubytes %uunits om%u", 1416 req->tidoffset, req->tidoffset / req->omfactor, 1417 req->omfactor != KDETH_OM_SMALL); 1418 KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET, 1419 req->tidoffset / req->omfactor); 1420 KDETH_SET(hdr->kdeth.ver_tid_offset, OM, 1421 req->omfactor != KDETH_OM_SMALL); 1422 } 1423 done: 1424 trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt, 1425 req->info.comp_idx, hdr, tidval); 1426 return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr)); 1427 } 1428 1429 static int set_txreq_header_ahg(struct user_sdma_request *req, 1430 struct user_sdma_txreq *tx, u32 len) 1431 { 1432 int diff = 0; 1433 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1434 struct hfi1_pkt_header *hdr = &req->hdr; 1435 u16 pbclen = le16_to_cpu(hdr->pbc[0]); 1436 u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(len)); 1437 1438 if (PBC2LRH(pbclen) != lrhlen) { 1439 /* PBC.PbcLengthDWs */ 1440 AHG_HEADER_SET(req->ahg, diff, 0, 0, 12, 1441 cpu_to_le16(LRH2PBC(lrhlen))); 1442 /* LRH.PktLen (we need the full 16 bits due to byte swap) */ 1443 AHG_HEADER_SET(req->ahg, diff, 3, 0, 16, 1444 cpu_to_be16(lrhlen >> 2)); 1445 } 1446 1447 /* 1448 * Do the common updates 1449 */ 1450 /* BTH.PSN and BTH.A */ 1451 val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) & 1452 (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff); 1453 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) 1454 val32 |= 1UL << 31; 1455 AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16)); 1456 AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff)); 1457 /* KDETH.Offset */ 1458 AHG_HEADER_SET(req->ahg, diff, 15, 0, 16, 1459 cpu_to_le16(req->koffset & 0xffff)); 1460 AHG_HEADER_SET(req->ahg, diff, 15, 16, 16, 1461 cpu_to_le16(req->koffset >> 16)); 1462 if (req_opcode(req->info.ctrl) == EXPECTED) { 1463 __le16 val; 1464 1465 tidval = req->tids[req->tididx]; 1466 1467 /* 1468 * If the offset puts us at the end of the current TID, 1469 * advance everything. 1470 */ 1471 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * 1472 PAGE_SIZE)) { 1473 req->tidoffset = 0; 1474 /* 1475 * Since we don't copy all the TIDs, all at once, 1476 * we have to check again. 1477 */ 1478 if (++req->tididx > req->n_tids - 1 || 1479 !req->tids[req->tididx]) { 1480 return -EINVAL; 1481 } 1482 tidval = req->tids[req->tididx]; 1483 } 1484 req->omfactor = ((EXP_TID_GET(tidval, LEN) * 1485 PAGE_SIZE) >= 1486 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE : 1487 KDETH_OM_SMALL; 1488 /* KDETH.OM and KDETH.OFFSET (TID) */ 1489 AHG_HEADER_SET(req->ahg, diff, 7, 0, 16, 1490 ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 | 1491 ((req->tidoffset / req->omfactor) & 0x7fff))); 1492 /* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */ 1493 val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) | 1494 (EXP_TID_GET(tidval, IDX) & 0x3ff)); 1495 1496 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) { 1497 val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset, 1498 INTR) << 1499 AHG_KDETH_INTR_SHIFT)); 1500 } else { 1501 val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ? 1502 cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) : 1503 cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset, 1504 INTR) << 1505 AHG_KDETH_INTR_SHIFT)); 1506 } 1507 1508 AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val); 1509 } 1510 1511 trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt, 1512 req->info.comp_idx, req->sde->this_idx, 1513 req->ahg_idx, req->ahg, diff, tidval); 1514 return diff; 1515 } 1516 1517 /* 1518 * SDMA tx request completion callback. Called when the SDMA progress 1519 * state machine gets notification that the SDMA descriptors for this 1520 * tx request have been processed by the DMA engine. Called in 1521 * interrupt context. 1522 */ 1523 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) 1524 { 1525 struct user_sdma_txreq *tx = 1526 container_of(txreq, struct user_sdma_txreq, txreq); 1527 struct user_sdma_request *req; 1528 struct hfi1_user_sdma_pkt_q *pq; 1529 struct hfi1_user_sdma_comp_q *cq; 1530 u16 idx; 1531 1532 if (!tx->req) 1533 return; 1534 1535 req = tx->req; 1536 pq = req->pq; 1537 cq = req->cq; 1538 1539 if (status != SDMA_TXREQ_S_OK) { 1540 SDMA_DBG(req, "SDMA completion with error %d", 1541 status); 1542 set_bit(SDMA_REQ_HAS_ERROR, &req->flags); 1543 } 1544 1545 req->seqcomp = tx->seqnum; 1546 kmem_cache_free(pq->txreq_cache, tx); 1547 tx = NULL; 1548 1549 idx = req->info.comp_idx; 1550 if (req->status == -1 && status == SDMA_TXREQ_S_OK) { 1551 if (req->seqcomp == req->info.npkts - 1) { 1552 req->status = 0; 1553 user_sdma_free_request(req, false); 1554 pq_update(pq); 1555 set_comp_state(pq, cq, idx, COMPLETE, 0); 1556 } 1557 } else { 1558 if (status != SDMA_TXREQ_S_OK) 1559 req->status = status; 1560 if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) && 1561 (test_bit(SDMA_REQ_SEND_DONE, &req->flags) || 1562 test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) { 1563 user_sdma_free_request(req, false); 1564 pq_update(pq); 1565 set_comp_state(pq, cq, idx, ERROR, req->status); 1566 } 1567 } 1568 } 1569 1570 static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq) 1571 { 1572 if (atomic_dec_and_test(&pq->n_reqs)) { 1573 xchg(&pq->state, SDMA_PKT_Q_INACTIVE); 1574 wake_up(&pq->wait); 1575 } 1576 } 1577 1578 static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) 1579 { 1580 if (!list_empty(&req->txps)) { 1581 struct sdma_txreq *t, *p; 1582 1583 list_for_each_entry_safe(t, p, &req->txps, list) { 1584 struct user_sdma_txreq *tx = 1585 container_of(t, struct user_sdma_txreq, txreq); 1586 list_del_init(&t->list); 1587 sdma_txclean(req->pq->dd, t); 1588 kmem_cache_free(req->pq->txreq_cache, tx); 1589 } 1590 } 1591 if (req->data_iovs) { 1592 struct sdma_mmu_node *node; 1593 int i; 1594 1595 for (i = 0; i < req->data_iovs; i++) { 1596 node = req->iovs[i].node; 1597 if (!node) 1598 continue; 1599 1600 if (unpin) 1601 hfi1_mmu_rb_remove(req->pq->handler, 1602 &node->rb); 1603 else 1604 atomic_dec(&node->refcount); 1605 } 1606 } 1607 kfree(req->tids); 1608 clear_bit(req->info.comp_idx, req->pq->req_in_use); 1609 } 1610 1611 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, 1612 struct hfi1_user_sdma_comp_q *cq, 1613 u16 idx, enum hfi1_sdma_comp_state state, 1614 int ret) 1615 { 1616 hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Setting completion status %u %d", 1617 pq->dd->unit, pq->ctxt, pq->subctxt, idx, state, ret); 1618 cq->comps[idx].status = state; 1619 if (state == ERROR) 1620 cq->comps[idx].errcode = -ret; 1621 trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt, 1622 idx, state, ret); 1623 } 1624 1625 static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, 1626 unsigned long len) 1627 { 1628 return (bool)(node->addr == addr); 1629 } 1630 1631 static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode) 1632 { 1633 struct sdma_mmu_node *node = 1634 container_of(mnode, struct sdma_mmu_node, rb); 1635 1636 atomic_inc(&node->refcount); 1637 return 0; 1638 } 1639 1640 /* 1641 * Return 1 to remove the node from the rb tree and call the remove op. 1642 * 1643 * Called with the rb tree lock held. 1644 */ 1645 static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, 1646 void *evict_arg, bool *stop) 1647 { 1648 struct sdma_mmu_node *node = 1649 container_of(mnode, struct sdma_mmu_node, rb); 1650 struct evict_data *evict_data = evict_arg; 1651 1652 /* is this node still being used? */ 1653 if (atomic_read(&node->refcount)) 1654 return 0; /* keep this node */ 1655 1656 /* this node will be evicted, add its pages to our count */ 1657 evict_data->cleared += node->npages; 1658 1659 /* have enough pages been cleared? */ 1660 if (evict_data->cleared >= evict_data->target) 1661 *stop = true; 1662 1663 return 1; /* remove this node */ 1664 } 1665 1666 static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode) 1667 { 1668 struct sdma_mmu_node *node = 1669 container_of(mnode, struct sdma_mmu_node, rb); 1670 1671 atomic_sub(node->npages, &node->pq->n_locked); 1672 1673 unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages); 1674 1675 kfree(node); 1676 } 1677 1678 static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode) 1679 { 1680 struct sdma_mmu_node *node = 1681 container_of(mnode, struct sdma_mmu_node, rb); 1682 1683 if (!atomic_read(&node->refcount)) 1684 return 1; 1685 return 0; 1686 } 1687