10b1de5d5SChris Wilson /*
20b1de5d5SChris Wilson * Copyright © 2016 Intel Corporation
30b1de5d5SChris Wilson *
40b1de5d5SChris Wilson * Permission is hereby granted, free of charge, to any person obtaining a
50b1de5d5SChris Wilson * copy of this software and associated documentation files (the "Software"),
60b1de5d5SChris Wilson * to deal in the Software without restriction, including without limitation
70b1de5d5SChris Wilson * the rights to use, copy, modify, merge, publish, distribute, sublicense,
80b1de5d5SChris Wilson * and/or sell copies of the Software, and to permit persons to whom the
90b1de5d5SChris Wilson * Software is furnished to do so, subject to the following conditions:
100b1de5d5SChris Wilson *
110b1de5d5SChris Wilson * The above copyright notice and this permission notice (including the next
120b1de5d5SChris Wilson * paragraph) shall be included in all copies or substantial portions of the
130b1de5d5SChris Wilson * Software.
140b1de5d5SChris Wilson *
150b1de5d5SChris Wilson * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
160b1de5d5SChris Wilson * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
170b1de5d5SChris Wilson * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
180b1de5d5SChris Wilson * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
190b1de5d5SChris Wilson * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
200b1de5d5SChris Wilson * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
210b1de5d5SChris Wilson * IN THE SOFTWARE.
220b1de5d5SChris Wilson *
230b1de5d5SChris Wilson */
240b1de5d5SChris Wilson
250b1de5d5SChris Wilson #include <linux/kernel.h>
260b1de5d5SChris Wilson #include <asm/fpu/api.h>
270b1de5d5SChris Wilson
289c9082b9SJani Nikula #include "i915_memcpy.h"
290b1de5d5SChris Wilson
306aacb5a3SChris Wilson #if IS_ENABLED(CONFIG_DRM_I915_DEBUG)
316aacb5a3SChris Wilson #define CI_BUG_ON(expr) BUG_ON(expr)
326aacb5a3SChris Wilson #else
336aacb5a3SChris Wilson #define CI_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr)
346aacb5a3SChris Wilson #endif
356aacb5a3SChris Wilson
3621aea5ccSChris Wilson static DEFINE_STATIC_KEY_FALSE(has_movntdqa);
370b1de5d5SChris Wilson
__memcpy_ntdqa(void * dst,const void * src,unsigned long len)380b1de5d5SChris Wilson static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
390b1de5d5SChris Wilson {
400b1de5d5SChris Wilson kernel_fpu_begin();
410b1de5d5SChris Wilson
420b1de5d5SChris Wilson while (len >= 4) {
430b1de5d5SChris Wilson asm("movntdqa (%0), %%xmm0\n"
440b1de5d5SChris Wilson "movntdqa 16(%0), %%xmm1\n"
450b1de5d5SChris Wilson "movntdqa 32(%0), %%xmm2\n"
460b1de5d5SChris Wilson "movntdqa 48(%0), %%xmm3\n"
470b1de5d5SChris Wilson "movaps %%xmm0, (%1)\n"
480b1de5d5SChris Wilson "movaps %%xmm1, 16(%1)\n"
490b1de5d5SChris Wilson "movaps %%xmm2, 32(%1)\n"
500b1de5d5SChris Wilson "movaps %%xmm3, 48(%1)\n"
510b1de5d5SChris Wilson :: "r" (src), "r" (dst) : "memory");
520b1de5d5SChris Wilson src += 64;
530b1de5d5SChris Wilson dst += 64;
540b1de5d5SChris Wilson len -= 4;
550b1de5d5SChris Wilson }
560b1de5d5SChris Wilson while (len--) {
570b1de5d5SChris Wilson asm("movntdqa (%0), %%xmm0\n"
580b1de5d5SChris Wilson "movaps %%xmm0, (%1)\n"
590b1de5d5SChris Wilson :: "r" (src), "r" (dst) : "memory");
600b1de5d5SChris Wilson src += 16;
610b1de5d5SChris Wilson dst += 16;
620b1de5d5SChris Wilson }
630b1de5d5SChris Wilson
640b1de5d5SChris Wilson kernel_fpu_end();
650b1de5d5SChris Wilson }
666aacb5a3SChris Wilson
__memcpy_ntdqu(void * dst,const void * src,unsigned long len)676aacb5a3SChris Wilson static void __memcpy_ntdqu(void *dst, const void *src, unsigned long len)
686aacb5a3SChris Wilson {
696aacb5a3SChris Wilson kernel_fpu_begin();
706aacb5a3SChris Wilson
716aacb5a3SChris Wilson while (len >= 4) {
726aacb5a3SChris Wilson asm("movntdqa (%0), %%xmm0\n"
736aacb5a3SChris Wilson "movntdqa 16(%0), %%xmm1\n"
746aacb5a3SChris Wilson "movntdqa 32(%0), %%xmm2\n"
756aacb5a3SChris Wilson "movntdqa 48(%0), %%xmm3\n"
766aacb5a3SChris Wilson "movups %%xmm0, (%1)\n"
776aacb5a3SChris Wilson "movups %%xmm1, 16(%1)\n"
786aacb5a3SChris Wilson "movups %%xmm2, 32(%1)\n"
796aacb5a3SChris Wilson "movups %%xmm3, 48(%1)\n"
806aacb5a3SChris Wilson :: "r" (src), "r" (dst) : "memory");
816aacb5a3SChris Wilson src += 64;
826aacb5a3SChris Wilson dst += 64;
836aacb5a3SChris Wilson len -= 4;
846aacb5a3SChris Wilson }
856aacb5a3SChris Wilson while (len--) {
866aacb5a3SChris Wilson asm("movntdqa (%0), %%xmm0\n"
876aacb5a3SChris Wilson "movups %%xmm0, (%1)\n"
886aacb5a3SChris Wilson :: "r" (src), "r" (dst) : "memory");
896aacb5a3SChris Wilson src += 16;
906aacb5a3SChris Wilson dst += 16;
916aacb5a3SChris Wilson }
926aacb5a3SChris Wilson
936aacb5a3SChris Wilson kernel_fpu_end();
946aacb5a3SChris Wilson }
950b1de5d5SChris Wilson
960b1de5d5SChris Wilson /**
970b1de5d5SChris Wilson * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
980b1de5d5SChris Wilson * @dst: destination pointer
990b1de5d5SChris Wilson * @src: source pointer
1000b1de5d5SChris Wilson * @len: how many bytes to copy
1010b1de5d5SChris Wilson *
1020b1de5d5SChris Wilson * i915_memcpy_from_wc copies @len bytes from @src to @dst using
1030b1de5d5SChris Wilson * non-temporal instructions where available. Note that all arguments
1040b1de5d5SChris Wilson * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
1050b1de5d5SChris Wilson * of 16.
1060b1de5d5SChris Wilson *
1070b1de5d5SChris Wilson * To test whether accelerated reads from WC are supported, use
1080b1de5d5SChris Wilson * i915_memcpy_from_wc(NULL, NULL, 0);
1090b1de5d5SChris Wilson *
1100b1de5d5SChris Wilson * Returns true if the copy was successful, false if the preconditions
1110b1de5d5SChris Wilson * are not met.
1120b1de5d5SChris Wilson */
i915_memcpy_from_wc(void * dst,const void * src,unsigned long len)1130b1de5d5SChris Wilson bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
1140b1de5d5SChris Wilson {
1150b1de5d5SChris Wilson if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
1160b1de5d5SChris Wilson return false;
1170b1de5d5SChris Wilson
1180b1de5d5SChris Wilson if (static_branch_likely(&has_movntdqa)) {
1190b1de5d5SChris Wilson if (likely(len))
1206aacb5a3SChris Wilson __memcpy_ntdqa(dst, src, len >> 4);
1210b1de5d5SChris Wilson return true;
1220b1de5d5SChris Wilson }
1230b1de5d5SChris Wilson
1240b1de5d5SChris Wilson return false;
1250b1de5d5SChris Wilson }
1260b1de5d5SChris Wilson
/**
 * i915_unaligned_memcpy_from_wc: perform a mostly accelerated read from WC
 * @dst: destination pointer
 * @src: source pointer
 * @len: how many bytes to copy
 *
 * Like i915_memcpy_from_wc(), the unaligned variant copies @len bytes from
 * @src to @dst using non-temporal instructions, but accepts arguments that
 * are not aligned to 16 bytes.
 *
 * Only the whole 16-byte chunks that start at the first 16-byte aligned
 * source address take the accelerated path. The unaligned head and any
 * trailing partial chunk are copied with memcpy(), so exactly @len bytes
 * are read from @src and written to @dst; neither buffer needs padding
 * beyond @len.
 */
void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len)
{
	unsigned long addr;
	unsigned long tail;

	CI_BUG_ON(!i915_has_memcpy_from_wc());

	/* Head: plain copy up to the next 16-byte boundary of the source. */
	addr = (unsigned long)src;
	if (!IS_ALIGNED(addr, 16)) {
		unsigned long x = min(ALIGN(addr, 16) - addr, len);

		memcpy(dst, src, x);

		len -= x;
		dst += x;
		src += x;
	}

	/*
	 * Split what is left into whole 16-byte chunks plus a partial tail.
	 * Rounding the chunk count up instead would store a full 16 bytes
	 * for the last chunk and overrun the end of @dst.
	 */
	tail = len & 15;
	len -= tail;

	/* Body: accelerated copy; the helper takes a chunk count. */
	if (likely(len)) {
		__memcpy_ntdqu(dst, src, len >> 4);
		dst += len;
		src += len;
	}

	/* Tail: fewer than 16 bytes remain, copy exactly that many. */
	if (tail)
		memcpy(dst, src, tail);
}
1586aacb5a3SChris Wilson
i915_memcpy_init_early(struct drm_i915_private * dev_priv)1590b1de5d5SChris Wilson void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
1600b1de5d5SChris Wilson {
161219af733SChangbin Du /*
162219af733SChangbin Du * Some hypervisors (e.g. KVM) don't support VEX-prefix instructions
163219af733SChangbin Du * emulation. So don't enable movntdqa in hypervisor guest.
164219af733SChangbin Du */
165219af733SChangbin Du if (static_cpu_has(X86_FEATURE_XMM4_1) &&
166219af733SChangbin Du !boot_cpu_has(X86_FEATURE_HYPERVISOR))
1670b1de5d5SChris Wilson static_branch_enable(&has_movntdqa);
1680b1de5d5SChris Wilson }
169