/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
 */

#include <linux/linkage.h>
#include <asm/asm.h>
#include <asm/export.h>

/*
 * copy_user_nocache - Uncached memory copy with exception handling
 *
 * This copies from user space into kernel space, but the kernel
 * space accesses can take a machine check exception, so they too
 * need exception handling.
 *
 * Note: only 32-bit and 64-bit stores have non-temporal versions,
 * and we only use aligned versions. Any unaligned parts at the
 * start or end of the copy will be done using normal cached stores.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * rax uncopied bytes or 0 if successful.
 */
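/*
 * As a rough sketch of the C-level view (the exact declaration
 * lives in the uaccess headers), the register usage above maps
 * to something like:
 *
 *	unsigned long __copy_user_nocache(void *dst,
 *			const void __user *src, unsigned size);
 *
 * where a return value of 0 means everything was copied, and any
 * other value is the number of bytes left uncopied.
 */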
SYM_FUNC_START(__copy_user_nocache)
	/* If the destination is not 8-byte aligned, we'll have to align it */
	testb $7,%dil
	jne .Lalign

.Lis_aligned:
	cmp $64,%edx
	jb .Lquadwords

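/*
 * Main unrolled loop: each iteration copies 64 bytes, as two
 * groups of four quadword loads from the source followed by
 * four non-temporal stores to the destination.
 */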
	.p2align 4,0x90
.Lunrolled:
10:	movq (%rsi),%r8
11:	movq 8(%rsi),%r9
12:	movq 16(%rsi),%r10
13:	movq 24(%rsi),%r11
20:	movnti %r8,(%rdi)
21:	movnti %r9,8(%rdi)
22:	movnti %r10,16(%rdi)
23:	movnti %r11,24(%rdi)
30:	movq 32(%rsi),%r8
31:	movq 40(%rsi),%r9
32:	movq 48(%rsi),%r10
33:	movq 56(%rsi),%r11
40:	movnti %r8,32(%rdi)
41:	movnti %r9,40(%rdi)
42:	movnti %r10,48(%rdi)
43:	movnti %r11,56(%rdi)

	addq $64,%rsi
	addq $64,%rdi
	sub $64,%edx
	cmp $64,%edx
	jae .Lunrolled

/*
 * The first set of user mode loads has been done
 * without any stores, so if any of them fail, we
 * can just retry with the non-unrolled loop.
 */
_ASM_EXTABLE_UA(10b, .Lquadwords)
_ASM_EXTABLE_UA(11b, .Lquadwords)
_ASM_EXTABLE_UA(12b, .Lquadwords)
_ASM_EXTABLE_UA(13b, .Lquadwords)

/*
 * The second set of user mode loads has been
 * done with 32 bytes already stored to the
 * destination, so we need to take that into
 * account before falling back to the
 * non-unrolled loop.
 */
_ASM_EXTABLE_UA(30b, .Lfixup32)
_ASM_EXTABLE_UA(31b, .Lfixup32)
_ASM_EXTABLE_UA(32b, .Lfixup32)
_ASM_EXTABLE_UA(33b, .Lfixup32)

/*
 * An exception on a write means that we're
 * done, but we need to update the count
 * depending on where in the unrolled loop
 * we were.
 */
_ASM_EXTABLE_UA(20b, .Ldone0)
_ASM_EXTABLE_UA(21b, .Ldone8)
_ASM_EXTABLE_UA(22b, .Ldone16)
_ASM_EXTABLE_UA(23b, .Ldone24)
_ASM_EXTABLE_UA(40b, .Ldone32)
_ASM_EXTABLE_UA(41b, .Ldone40)
_ASM_EXTABLE_UA(42b, .Ldone48)
_ASM_EXTABLE_UA(43b, .Ldone56)

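/*
 * Non-unrolled loop for the remaining full quadwords:
 * copy one 8-byte word at a time until fewer than
 * eight bytes are left.
 */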
.Lquadwords:
	cmp $8,%edx
	jb .Llong
50:	movq (%rsi),%rax
51:	movnti %rax,(%rdi)
	addq $8,%rsi
	addq $8,%rdi
	sub $8,%edx
	jmp .Lquadwords

/*
 * If we fail on the last full quadword, we will
 * not try to do any byte-wise cached accesses.
 * We will try to do one more 4-byte uncached
 * one, though.
 */
_ASM_EXTABLE_UA(50b, .Llast4)
_ASM_EXTABLE_UA(51b, .Ldone0)

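/*
 * Tail handling: at most 7 bytes remain. Do one final 4-byte
 * non-temporal store if possible, fence the non-temporal stores,
 * then finish any remaining 2-byte and 1-byte pieces with normal
 * cached stores.
 */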
.Llong:
	test $4,%dl
	je .Lword
60:	movl (%rsi),%eax
61:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
.Lword:
	sfence
	test $2,%dl
	je .Lbyte
70:	movw (%rsi),%ax
71:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lbyte:
	test $1,%dl
	je .Ldone
80:	movb (%rsi),%al
81:	movb %al,(%rdi)
	dec %edx
.Ldone:
	mov %edx,%eax
	RET

/*
 * If we fail on the last four bytes, we won't
 * bother with any fixups. It's dead, Jim. Note
 * that there's no need for 'sfence' for any
 * of this, since the exception will have been
 * serializing.
 */
_ASM_EXTABLE_UA(60b, .Ldone)
_ASM_EXTABLE_UA(61b, .Ldone)
_ASM_EXTABLE_UA(70b, .Ldone)
_ASM_EXTABLE_UA(71b, .Ldone)
_ASM_EXTABLE_UA(80b, .Ldone)
_ASM_EXTABLE_UA(81b, .Ldone)

/*
 * This is the "head needs aligning" case when
 * the destination isn't 8-byte aligned. The
 * 4-byte case can be done uncached, but any
 * smaller alignment is done with regular stores.
 */
.Lalign:
	test $1,%dil
	je .Lalign_word
	test %edx,%edx
	je .Ldone
90:	movb (%rsi),%al
91:	movb %al,(%rdi)
	inc %rsi
	inc %rdi
	dec %edx
.Lalign_word:
	test $2,%dil
	je .Lalign_long
	cmp $2,%edx
	jb .Lbyte
92:	movw (%rsi),%ax
93:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lalign_long:
	test $4,%dil
	je .Lis_aligned
	cmp $4,%edx
	jb .Lword
94:	movl (%rsi),%eax
95:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
	jmp .Lis_aligned

/*
 * If we fail on the initial alignment accesses,
 * we're all done. Again, no point in trying to
 * do byte-by-byte probing if the 4-byte load
 * fails - we're not doing any uncached accesses
 * any more.
 */
_ASM_EXTABLE_UA(90b, .Ldone)
_ASM_EXTABLE_UA(91b, .Ldone)
_ASM_EXTABLE_UA(92b, .Ldone)
_ASM_EXTABLE_UA(93b, .Ldone)
_ASM_EXTABLE_UA(94b, .Ldone)
_ASM_EXTABLE_UA(95b, .Ldone)

/*
 * Exception table fixups for faults in the middle of the
 * unrolled store sequence. Entering at .LdoneNN falls through
 * NN/8 of the 'sub $8' instructions below, subtracting the NN
 * bytes that had already been stored from the remaining count
 * before returning it.
 */
.Ldone56: sub $8,%edx
.Ldone48: sub $8,%edx
.Ldone40: sub $8,%edx
.Ldone32: sub $8,%edx
.Ldone24: sub $8,%edx
.Ldone16: sub $8,%edx
.Ldone8: sub $8,%edx
.Ldone0:
	mov %edx,%eax
	RET

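/*
 * A load in the second group of the unrolled loop faulted:
 * the first 32 bytes of this iteration were already copied,
 * so skip past them before retrying with the non-unrolled loop.
 */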
.Lfixup32:
	addq $32,%rsi
	addq $32,%rdi
	sub $32,%edx
	jmp .Lquadwords

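/*
 * The load of the last full quadword faulted: probe its first
 * four bytes with one more uncached copy before giving up.
 */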
.Llast4:
52:	movl (%rsi),%eax
53:	movnti %eax,(%rdi)
	sfence
	sub $4,%edx
	mov %edx,%eax
	RET
_ASM_EXTABLE_UA(52b, .Ldone0)
_ASM_EXTABLE_UA(53b, .Ldone0)

SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)