xref: /openbmc/linux/drivers/ntb/test/ntb_perf.c (revision d236d361)
/*
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 *   redistributing this file, you may do so under either license.
 *
 *   GPL LICENSE SUMMARY
 *
 *   Copyright(c) 2015 Intel Corporation. All rights reserved.
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of version 2 of the GNU General Public License as
 *   published by the Free Software Foundation.
 *
 *   BSD LICENSE
 *
 *   Copyright(c) 2015 Intel Corporation. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *   PCIe NTB Perf Linux driver
 */

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/timer.h>
#include <linux/dma-mapping.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/dmaengine.h>
#include <linux/delay.h>
#include <linux/sizes.h>
#include <linux/ntb.h>
#include <linux/mutex.h>

#define DRIVER_NAME		"ntb_perf"
#define DRIVER_DESCRIPTION	"PCIe NTB Performance Measurement Tool"

#define DRIVER_LICENSE		"Dual BSD/GPL"
#define DRIVER_VERSION		"1.0"
#define DRIVER_AUTHOR		"Dave Jiang <dave.jiang@intel.com>"

#define PERF_LINK_DOWN_TIMEOUT	10
#define PERF_VERSION		0xffff0001
#define MAX_THREADS		32
#define MAX_TEST_SIZE		SZ_1M
#define MAX_SRCS		32
#define DMA_OUT_RESOURCE_TO	msecs_to_jiffies(50)
#define DMA_RETRIES		20
#define SZ_4G			(1ULL << 32)
#define MAX_SEG_ORDER		20 /* no larger than 1M for kmalloc buffer */

MODULE_LICENSE(DRIVER_LICENSE);
MODULE_VERSION(DRIVER_VERSION);
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESCRIPTION);

static struct dentry *perf_debugfs_dir;

static unsigned long max_mw_size;
module_param(max_mw_size, ulong, 0644);
MODULE_PARM_DESC(max_mw_size, "Limit size of large memory windows");

static unsigned int seg_order = 19; /* 512K */
module_param(seg_order, uint, 0644);
MODULE_PARM_DESC(seg_order, "size order [2^n] of buffer segment for testing");

static unsigned int run_order = 32; /* 4G */
module_param(run_order, uint, 0644);
MODULE_PARM_DESC(run_order, "size order [2^n] of total data to transfer");

static bool use_dma; /* default to 0 */
module_param(use_dma, bool, 0644);
MODULE_PARM_DESC(use_dma, "Use the DMA engine to measure performance");

struct perf_mw {
	phys_addr_t	phys_addr;
	resource_size_t	phys_size;
	resource_size_t	xlat_align;
	resource_size_t	xlat_align_size;
	void __iomem	*vbase;
	size_t		xlat_size;
	size_t		buf_size;
	void		*virt_addr;
	dma_addr_t	dma_addr;
};

struct perf_ctx;

struct pthr_ctx {
	struct task_struct	*thread;
	struct perf_ctx		*perf;
	atomic_t		dma_sync;
	struct dma_chan		*dma_chan;
	int			dma_prep_err;
	int			src_idx;
	void			*srcs[MAX_SRCS];
	wait_queue_head_t	*wq;
	int			status;
	u64			copied;
	u64			diff_us;
};

struct perf_ctx {
	struct ntb_dev		*ntb;
	spinlock_t		db_lock;
	struct perf_mw		mw;
	bool			link_is_up;
	struct delayed_work	link_work;
	wait_queue_head_t	link_wq;
	struct dentry		*debugfs_node_dir;
	struct dentry		*debugfs_run;
	struct dentry		*debugfs_threads;
	u8			perf_threads;
	/* mutex ensures only one set of threads run at once */
	struct mutex		run_mutex;
	struct pthr_ctx		pthr_ctx[MAX_THREADS];
	atomic_t		tsync;
	atomic_t		tdone;
};

enum {
	VERSION = 0,
	MW_SZ_HIGH,
	MW_SZ_LOW,
	MAX_SPAD
};

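/*
 * Link event callback: when the NTB link comes up, kick off the delayed
 * scratchpad handshake in perf_link_work(); when it drops, cancel a
 * still-pending handshake and mark the link as down.
 */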
static void perf_link_event(void *ctx)
{
	struct perf_ctx *perf = ctx;

	if (ntb_link_is_up(perf->ntb, NULL, NULL) == 1) {
		schedule_delayed_work(&perf->link_work, 2*HZ);
	} else {
		dev_dbg(&perf->ntb->pdev->dev, "link down\n");

		if (!perf->link_is_up)
			cancel_delayed_work_sync(&perf->link_work);

		perf->link_is_up = false;
	}
}

static void perf_db_event(void *ctx, int vec)
{
	struct perf_ctx *perf = ctx;
	u64 db_bits, db_mask;

	db_mask = ntb_db_vector_mask(perf->ntb, vec);
	db_bits = ntb_db_read(perf->ntb);

	dev_dbg(&perf->ntb->dev, "doorbell vec %d mask %#llx bits %#llx\n",
		vec, db_mask, db_bits);
}

static const struct ntb_ctx_ops perf_ops = {
	.link_event = perf_link_event,
	.db_event = perf_db_event,
};

static void perf_copy_callback(void *data)
{
	struct pthr_ctx *pctx = data;

	atomic_dec(&pctx->dma_sync);
}

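/*
 * Copy one buffer segment into the peer's memory window.  Without use_dma
 * this is a plain memcpy_toio() of @size bytes.  With use_dma the segment is
 * mapped for DMA, a memcpy descriptor is prepared (retrying up to DMA_RETRIES
 * times when the engine runs out of descriptors) and submitted; completions
 * are tracked via pctx->dma_sync, which perf_copy_callback() decrements.
 */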
static ssize_t perf_copy(struct pthr_ctx *pctx, char __iomem *dst,
			 char *src, size_t size)
{
	struct perf_ctx *perf = pctx->perf;
	struct dma_async_tx_descriptor *txd;
	struct dma_chan *chan = pctx->dma_chan;
	struct dma_device *device;
	struct dmaengine_unmap_data *unmap;
	dma_cookie_t cookie;
	size_t src_off, dst_off;
	struct perf_mw *mw = &perf->mw;
	void __iomem *vbase;
	void __iomem *dst_vaddr;
	dma_addr_t dst_phys;
	int retries = 0;

	if (!use_dma) {
		memcpy_toio(dst, src, size);
		return size;
	}

	if (!chan) {
		dev_err(&perf->ntb->dev, "DMA engine does not exist\n");
		return -EINVAL;
	}

	device = chan->device;
	src_off = (uintptr_t)src & ~PAGE_MASK;
	dst_off = (uintptr_t __force)dst & ~PAGE_MASK;

	if (!is_dma_copy_aligned(device, src_off, dst_off, size))
		return -ENODEV;

	vbase = mw->vbase;
	dst_vaddr = dst;
	dst_phys = mw->phys_addr + (dst_vaddr - vbase);

	unmap = dmaengine_get_unmap_data(device->dev, 1, GFP_NOWAIT);
	if (!unmap)
		return -ENOMEM;

	unmap->len = size;
	unmap->addr[0] = dma_map_page(device->dev, virt_to_page(src),
				      src_off, size, DMA_TO_DEVICE);
	if (dma_mapping_error(device->dev, unmap->addr[0]))
		goto err_get_unmap;

	unmap->to_cnt = 1;

	do {
		txd = device->device_prep_dma_memcpy(chan, dst_phys,
						     unmap->addr[0],
						     size, DMA_PREP_INTERRUPT);
		if (!txd) {
			set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(DMA_OUT_RESOURCE_TO);
		}
	} while (!txd && (++retries < DMA_RETRIES));

	if (!txd) {
		pctx->dma_prep_err++;
		goto err_get_unmap;
	}

	txd->callback = perf_copy_callback;
	txd->callback_param = pctx;
	dma_set_unmap(txd, unmap);

	cookie = dmaengine_submit(txd);
	if (dma_submit_error(cookie))
		goto err_set_unmap;

	dmaengine_unmap_put(unmap);

	atomic_inc(&pctx->dma_sync);
	dma_async_issue_pending(chan);

	return size;

err_set_unmap:
	dmaengine_unmap_put(unmap);
err_get_unmap:
	dmaengine_unmap_put(unmap);
	return 0;
}

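/*
 * Push @total bytes through the memory window in @buf_size chunks, wrapping
 * back to the start of the window every @win_size bytes.  Throughput is
 * derived from the ktime delta around the copy loop; with use_dma the loop
 * only submits descriptors, so the thread waits for all outstanding DMA
 * completions (dma_sync reaching zero) before taking the end timestamp.
 */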
static int perf_move_data(struct pthr_ctx *pctx, char __iomem *dst, char *src,
			  u64 buf_size, u64 win_size, u64 total)
{
	int chunks, total_chunks, i;
	int copied_chunks = 0;
	u64 copied = 0, result;
	char __iomem *tmp = dst;
	u64 perf, diff_us;
	ktime_t kstart, kstop, kdiff;
	unsigned long last_sleep = jiffies;

	chunks = div64_u64(win_size, buf_size);
	total_chunks = div64_u64(total, buf_size);
	kstart = ktime_get();

	for (i = 0; i < total_chunks; i++) {
		result = perf_copy(pctx, tmp, src, buf_size);
		copied += result;
		copied_chunks++;
		if (copied_chunks == chunks) {
			tmp = dst;
			copied_chunks = 0;
		} else
			tmp += buf_size;

		/* Probably should schedule every 5s to prevent soft hang. */
		if (unlikely((jiffies - last_sleep) > 5 * HZ)) {
			last_sleep = jiffies;
			set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(1);
		}

		if (unlikely(kthread_should_stop()))
			break;
	}

	if (use_dma) {
		pr_debug("%s: All DMA descriptors submitted\n", current->comm);
		while (atomic_read(&pctx->dma_sync) != 0) {
			if (kthread_should_stop())
				break;
			msleep(20);
		}
	}

	kstop = ktime_get();
	kdiff = ktime_sub(kstop, kstart);
	diff_us = ktime_to_us(kdiff);

	pr_debug("%s: copied %llu bytes\n", current->comm, copied);

	pr_debug("%s: lasted %llu usecs\n", current->comm, diff_us);

	perf = div64_u64(copied, diff_us);

	pr_debug("%s: MBytes/s: %llu\n", current->comm, perf);

	pctx->copied = copied;
	pctx->diff_us = diff_us;

	return 0;
}

static bool perf_dma_filter_fn(struct dma_chan *chan, void *node)
{
	return dev_to_node(&chan->dev->device) == (int)(unsigned long)node;
}

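/*
 * Per-thread worker: optionally grabs a DMA channel on the NTB device's NUMA
 * node, allocates the source buffers, spins on perf->tsync until every
 * launched thread is ready so they start roughly together, runs the transfer,
 * then parks until debugfs_run_write() reaps it with kthread_stop().
 */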
static int ntb_perf_thread(void *data)
{
	struct pthr_ctx *pctx = data;
	struct perf_ctx *perf = pctx->perf;
	struct pci_dev *pdev = perf->ntb->pdev;
	struct perf_mw *mw = &perf->mw;
	char __iomem *dst;
	u64 win_size, buf_size, total;
	void *src;
	int rc, node, i;
	struct dma_chan *dma_chan = NULL;

	pr_debug("kthread %s starting...\n", current->comm);

	node = dev_to_node(&pdev->dev);

	if (use_dma && !pctx->dma_chan) {
		dma_cap_mask_t dma_mask;

		dma_cap_zero(dma_mask);
		dma_cap_set(DMA_MEMCPY, dma_mask);
		dma_chan = dma_request_channel(dma_mask, perf_dma_filter_fn,
					       (void *)(unsigned long)node);
		if (!dma_chan) {
			pr_warn("%s: cannot acquire DMA channel, quitting\n",
				current->comm);
			return -ENODEV;
		}
		pctx->dma_chan = dma_chan;
	}

	for (i = 0; i < MAX_SRCS; i++) {
		pctx->srcs[i] = kmalloc_node(MAX_TEST_SIZE, GFP_KERNEL, node);
		if (!pctx->srcs[i]) {
			rc = -ENOMEM;
			goto err;
		}
	}

	win_size = mw->phys_size;
	buf_size = 1ULL << seg_order;
	total = 1ULL << run_order;

	if (buf_size > MAX_TEST_SIZE)
		buf_size = MAX_TEST_SIZE;

	dst = (char __iomem *)mw->vbase;

	atomic_inc(&perf->tsync);
	while (atomic_read(&perf->tsync) != perf->perf_threads)
		schedule();

	src = pctx->srcs[pctx->src_idx];
	pctx->src_idx = (pctx->src_idx + 1) & (MAX_SRCS - 1);

	rc = perf_move_data(pctx, dst, src, buf_size, win_size, total);

	atomic_dec(&perf->tsync);

	if (rc < 0) {
		pr_err("%s: failed\n", current->comm);
		rc = -ENXIO;
		goto err;
	}

	for (i = 0; i < MAX_SRCS; i++) {
		kfree(pctx->srcs[i]);
		pctx->srcs[i] = NULL;
	}

	atomic_inc(&perf->tdone);
	wake_up(pctx->wq);
	rc = 0;
	goto done;

err:
	for (i = 0; i < MAX_SRCS; i++) {
		kfree(pctx->srcs[i]);
		pctx->srcs[i] = NULL;
	}

	if (dma_chan) {
		dma_release_channel(dma_chan);
		pctx->dma_chan = NULL;
	}

done:
	/* Wait until we are told to stop */
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (kthread_should_stop())
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);

	return rc;
}

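/* Tear down the inbound memory window translation and free its buffer. */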
static void perf_free_mw(struct perf_ctx *perf)
{
	struct perf_mw *mw = &perf->mw;
	struct pci_dev *pdev = perf->ntb->pdev;

	if (!mw->virt_addr)
		return;

	ntb_mw_clear_trans(perf->ntb, 0);
	dma_free_coherent(&pdev->dev, mw->buf_size,
			  mw->virt_addr, mw->dma_addr);
	mw->xlat_size = 0;
	mw->buf_size = 0;
	mw->virt_addr = NULL;
}

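/*
 * Allocate a coherent buffer rounded up to the hardware alignment constraints
 * and program it as the translation for inbound memory window 0, so the
 * peer's writes land in local memory.  An existing buffer of a different size
 * is freed and replaced.
 */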
static int perf_set_mw(struct perf_ctx *perf, resource_size_t size)
{
	struct perf_mw *mw = &perf->mw;
	size_t xlat_size, buf_size;
	int rc;

	if (!size)
		return -EINVAL;

	xlat_size = round_up(size, mw->xlat_align_size);
	buf_size = round_up(size, mw->xlat_align);

	if (mw->xlat_size == xlat_size)
		return 0;

	if (mw->buf_size)
		perf_free_mw(perf);

	mw->xlat_size = xlat_size;
	mw->buf_size = buf_size;

	mw->virt_addr = dma_alloc_coherent(&perf->ntb->pdev->dev, buf_size,
					   &mw->dma_addr, GFP_KERNEL);
	if (!mw->virt_addr) {
		mw->xlat_size = 0;
		mw->buf_size = 0;
		return -ENOMEM;
	}

	rc = ntb_mw_set_trans(perf->ntb, 0, mw->dma_addr, mw->xlat_size);
	if (rc) {
		dev_err(&perf->ntb->dev, "Unable to set mw0 translation\n");
		perf_free_mw(perf);
		return -EIO;
	}

	return 0;
}

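/*
 * Delayed-work side of the link-up handshake: advertise our memory window
 * size and protocol version through the peer's scratchpad registers, then
 * read back what the peer advertised.  If the versions match, size the local
 * window buffer to the peer's value and wake any writer blocked in
 * debugfs_run_write(); otherwise retry while the link stays up.
 */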
static void perf_link_work(struct work_struct *work)
{
	struct perf_ctx *perf =
		container_of(work, struct perf_ctx, link_work.work);
	struct ntb_dev *ndev = perf->ntb;
	struct pci_dev *pdev = ndev->pdev;
	u32 val;
	u64 size;
	int rc;

	dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);

	size = perf->mw.phys_size;

	if (max_mw_size && size > max_mw_size)
		size = max_mw_size;

	ntb_peer_spad_write(ndev, MW_SZ_HIGH, upper_32_bits(size));
	ntb_peer_spad_write(ndev, MW_SZ_LOW, lower_32_bits(size));
	ntb_peer_spad_write(ndev, VERSION, PERF_VERSION);

	/* now read what peer wrote */
	val = ntb_spad_read(ndev, VERSION);
	if (val != PERF_VERSION) {
		dev_dbg(&pdev->dev, "Remote version = %#x\n", val);
		goto out;
	}

	val = ntb_spad_read(ndev, MW_SZ_HIGH);
	size = (u64)val << 32;

	val = ntb_spad_read(ndev, MW_SZ_LOW);
	size |= val;

	dev_dbg(&pdev->dev, "Remote MW size = %#llx\n", size);

	rc = perf_set_mw(perf, size);
	if (rc)
		goto out1;

	perf->link_is_up = true;
	wake_up(&perf->link_wq);

	return;

out1:
	perf_free_mw(perf);

out:
	if (ntb_link_is_up(ndev, NULL, NULL) == 1)
		schedule_delayed_work(&perf->link_work,
				      msecs_to_jiffies(PERF_LINK_DOWN_TIMEOUT));
}

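/*
 * Query the geometry of outbound memory window 0 and map it write-combined
 * so the copy threads can write directly into the peer's buffer.
 */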
static int perf_setup_mw(struct ntb_dev *ntb, struct perf_ctx *perf)
{
	struct perf_mw *mw;
	int rc;

	mw = &perf->mw;

	rc = ntb_mw_get_range(ntb, 0, &mw->phys_addr, &mw->phys_size,
			      &mw->xlat_align, &mw->xlat_align_size);
	if (rc)
		return rc;

	perf->mw.vbase = ioremap_wc(mw->phys_addr, mw->phys_size);
	if (!mw->vbase)
		return -ENOMEM;

	return 0;
}

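/*
 * debugfs "run" read: report "running" while a measurement still holds
 * run_mutex, otherwise print one line per thread with bytes copied, elapsed
 * time and the resulting MBytes/s figure.
 */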
static ssize_t debugfs_run_read(struct file *filp, char __user *ubuf,
				size_t count, loff_t *offp)
{
	struct perf_ctx *perf = filp->private_data;
	char *buf;
	ssize_t ret, out_off = 0;
	struct pthr_ctx *pctx;
	int i;
	u64 rate;

	if (!perf)
		return 0;

	buf = kmalloc(1024, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	if (mutex_is_locked(&perf->run_mutex)) {
		out_off = scnprintf(buf, 64, "running\n");
		goto read_from_buf;
	}

	for (i = 0; i < MAX_THREADS; i++) {
		pctx = &perf->pthr_ctx[i];

		if (pctx->status == -ENODATA)
			break;

		if (pctx->status) {
			out_off += scnprintf(buf + out_off, 1024 - out_off,
					    "%d: error %d\n", i,
					    pctx->status);
			continue;
		}

		rate = div64_u64(pctx->copied, pctx->diff_us);
		out_off += scnprintf(buf + out_off, 1024 - out_off,
			"%d: copied %llu bytes in %llu usecs, %llu MBytes/s\n",
			i, pctx->copied, pctx->diff_us, rate);
	}

read_from_buf:
	ret = simple_read_from_buffer(ubuf, count, offp, buf, out_off);
	kfree(buf);

	return ret;
}

static void threads_cleanup(struct perf_ctx *perf)
{
	struct pthr_ctx *pctx;
	int i;

	for (i = 0; i < MAX_THREADS; i++) {
		pctx = &perf->pthr_ctx[i];
		if (pctx->thread) {
			pctx->status = kthread_stop(pctx->thread);
			pctx->thread = NULL;
		}
	}
}

static void perf_clear_thread_status(struct perf_ctx *perf)
{
	int i;

	for (i = 0; i < MAX_THREADS; i++)
		perf->pthr_ctx[i].status = -ENODATA;
}

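/*
 * debugfs "run" write: wait for the link-up handshake to finish, clamp the
 * thread count and segment/run orders to their limits, spawn one
 * ntb_perf_thread per requested thread on the device's NUMA node, wait for
 * all of them to signal completion via perf->tdone, and collect their exit
 * status through kthread_stop() in threads_cleanup().
 */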
static ssize_t debugfs_run_write(struct file *filp, const char __user *ubuf,
				 size_t count, loff_t *offp)
{
	struct perf_ctx *perf = filp->private_data;
	int node, i;
	DECLARE_WAIT_QUEUE_HEAD(wq);

	if (wait_event_interruptible(perf->link_wq, perf->link_is_up))
		return -ENOLINK;

	if (perf->perf_threads == 0)
		return -EINVAL;

	if (!mutex_trylock(&perf->run_mutex))
		return -EBUSY;

	perf_clear_thread_status(perf);

	if (perf->perf_threads > MAX_THREADS) {
		perf->perf_threads = MAX_THREADS;
		pr_info("Reset total threads to: %u\n", MAX_THREADS);
	}

	/* no greater than 1M */
	if (seg_order > MAX_SEG_ORDER) {
		seg_order = MAX_SEG_ORDER;
		pr_info("Fix seg_order to %u\n", seg_order);
	}

	if (run_order < seg_order) {
		run_order = seg_order;
		pr_info("Fix run_order to %u\n", run_order);
	}

	node = dev_to_node(&perf->ntb->pdev->dev);
	atomic_set(&perf->tdone, 0);

	/* launch kernel thread */
	for (i = 0; i < perf->perf_threads; i++) {
		struct pthr_ctx *pctx;

		pctx = &perf->pthr_ctx[i];
		atomic_set(&pctx->dma_sync, 0);
		pctx->perf = perf;
		pctx->wq = &wq;
		pctx->thread =
			kthread_create_on_node(ntb_perf_thread,
					       (void *)pctx,
					       node, "ntb_perf %d", i);
		if (IS_ERR(pctx->thread)) {
			pctx->thread = NULL;
			goto err;
		} else {
			wake_up_process(pctx->thread);
		}
	}

	wait_event_interruptible(wq,
		atomic_read(&perf->tdone) == perf->perf_threads);

	threads_cleanup(perf);
	mutex_unlock(&perf->run_mutex);
	return count;

err:
	threads_cleanup(perf);
	mutex_unlock(&perf->run_mutex);
	return -ENXIO;
}

static const struct file_operations ntb_perf_debugfs_run = {
	.owner = THIS_MODULE,
	.open = simple_open,
	.read = debugfs_run_read,
	.write = debugfs_run_write,
};

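/*
 * Create the per-device debugfs nodes: <debugfs>/ntb_perf/<pci-id>/run to
 * start a measurement and read back results, and .../threads to set how many
 * copy threads the next run uses.
 */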
static int perf_debugfs_setup(struct perf_ctx *perf)
{
	struct pci_dev *pdev = perf->ntb->pdev;

	if (!debugfs_initialized())
		return -ENODEV;

	if (!perf_debugfs_dir) {
		perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
		if (!perf_debugfs_dir)
			return -ENODEV;
	}

	perf->debugfs_node_dir = debugfs_create_dir(pci_name(pdev),
						    perf_debugfs_dir);
	if (!perf->debugfs_node_dir)
		return -ENODEV;

	perf->debugfs_run = debugfs_create_file("run", S_IRUSR | S_IWUSR,
						perf->debugfs_node_dir, perf,
						&ntb_perf_debugfs_run);
	if (!perf->debugfs_run)
		return -ENODEV;

	perf->debugfs_threads = debugfs_create_u8("threads", S_IRUSR | S_IWUSR,
						  perf->debugfs_node_dir,
						  &perf->perf_threads);
	if (!perf->debugfs_threads)
		return -ENODEV;

	return 0;
}

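/*
 * NTB client probe: allocate the driver context on the device's NUMA node,
 * map the memory window, register the link/doorbell callbacks, enable the
 * link and expose the debugfs interface.
 */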
static int perf_probe(struct ntb_client *client, struct ntb_dev *ntb)
{
	struct pci_dev *pdev = ntb->pdev;
	struct perf_ctx *perf;
	int node;
	int rc = 0;

	if (ntb_spad_count(ntb) < MAX_SPAD) {
		dev_err(&ntb->dev, "Not enough scratch pad registers for %s",
			DRIVER_NAME);
		return -EIO;
	}

	node = dev_to_node(&pdev->dev);

	perf = kzalloc_node(sizeof(*perf), GFP_KERNEL, node);
	if (!perf) {
		rc = -ENOMEM;
		goto err_perf;
	}

	perf->ntb = ntb;
	perf->perf_threads = 1;
	atomic_set(&perf->tsync, 0);
	mutex_init(&perf->run_mutex);
	spin_lock_init(&perf->db_lock);
	init_waitqueue_head(&perf->link_wq);
	INIT_DELAYED_WORK(&perf->link_work, perf_link_work);

	rc = perf_setup_mw(ntb, perf);
	if (rc)
		goto err_ctx;

	rc = ntb_set_ctx(ntb, perf, &perf_ops);
	if (rc)
		goto err_ctx;

	perf->link_is_up = false;
	ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
	ntb_link_event(ntb);

	rc = perf_debugfs_setup(perf);
	if (rc)
		goto err_ctx;

	perf_clear_thread_status(perf);

	return 0;

err_ctx:
	cancel_delayed_work_sync(&perf->link_work);
	kfree(perf);
err_perf:
	return rc;
}

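/*
 * NTB client remove: block until any running measurement releases run_mutex,
 * stop the handshake work, drop the NTB context, disable the link, remove the
 * debugfs entries and release any DMA channels the threads left cached.
 */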
static void perf_remove(struct ntb_client *client, struct ntb_dev *ntb)
{
	struct perf_ctx *perf = ntb->ctx;
	int i;

	dev_dbg(&perf->ntb->dev, "%s called\n", __func__);

	mutex_lock(&perf->run_mutex);

	cancel_delayed_work_sync(&perf->link_work);

	ntb_clear_ctx(ntb);
	ntb_link_disable(ntb);

	debugfs_remove_recursive(perf_debugfs_dir);
	perf_debugfs_dir = NULL;

	if (use_dma) {
		for (i = 0; i < MAX_THREADS; i++) {
			struct pthr_ctx *pctx = &perf->pthr_ctx[i];

			if (pctx->dma_chan)
				dma_release_channel(pctx->dma_chan);
		}
	}

	kfree(perf);
}

static struct ntb_client perf_client = {
	.ops = {
		.probe = perf_probe,
		.remove = perf_remove,
	},
};
module_ntb_client(perf_client);