/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
 */

#include <linux/linkage.h>
#include <asm/asm.h>
#include <asm/export.h>

/*
 * copy_user_nocache - Uncached memory copy with exception handling
 *
 * This copies from user space into kernel space, but the kernel
 * space accesses can take a machine check exception, so they too
 * need exception handling.
 *
 * Note: only 32-bit and 64-bit stores have non-temporal versions,
 * and we only use aligned versions. Any unaligned parts at the
 * start or end of the copy will be done using normal cached stores.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * rax uncopied bytes or 0 if successful.
 */
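
/*
 * For orientation, a minimal sketch of how a kernel caller might drive
 * this routine.  The stac()/clac() bracketing shown here is an assumed
 * illustration of the usual x86 uaccess pattern, not a quote of the
 * real call sites:
 *
 *	long left;
 *
 *	stac();				// open the user access window
 *	left = __copy_user_nocache(dst, src, len);
 *	clac();				// close it again
 *	// left == 0 on success, otherwise the number of bytes not copied
 */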
SYM_FUNC_START(__copy_user_nocache)
	/* If destination is not 8-byte aligned, we'll have to align it */
	testb $7,%dil
	jne .Lalign
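/*
 * The test above is, in C terms, roughly
 *
 *	if ((unsigned long)dst & 7)	// i.e. dst % 8 != 0
 *		goto align_head;	// the .Lalign path below
 *
 * so a destination that is already 8-byte aligned goes straight on to
 * the main loop.
 */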

.Lis_aligned:
	cmp $64,%edx
	jb .Lquadwords

	.p2align 4,0x90
.Lunrolled:
10:	movq (%rsi),%r8
11:	movq 8(%rsi),%r9
12:	movq 16(%rsi),%r10
13:	movq 24(%rsi),%r11
20:	movnti %r8,(%rdi)
21:	movnti %r9,8(%rdi)
22:	movnti %r10,16(%rdi)
23:	movnti %r11,24(%rdi)
30:	movq 32(%rsi),%r8
31:	movq 40(%rsi),%r9
32:	movq 48(%rsi),%r10
33:	movq 56(%rsi),%r11
40:	movnti %r8,32(%rdi)
41:	movnti %r9,40(%rdi)
42:	movnti %r10,48(%rdi)
43:	movnti %r11,56(%rdi)

	addq $64,%rsi
	addq $64,%rdi
	sub $64,%edx
	cmp $64,%edx
	jae .Lunrolled

/*
 * The first set of user mode loads has been done
 * without any stores, so if they fail, we can
 * just try the non-unrolled loop.
 */
_ASM_EXTABLE_UA(10b, .Lquadwords)
_ASM_EXTABLE_UA(11b, .Lquadwords)
_ASM_EXTABLE_UA(12b, .Lquadwords)
_ASM_EXTABLE_UA(13b, .Lquadwords)

/*
 * The second set of user mode loads has been
 * done with 32 bytes stored to the destination,
 * so we need to take that into account before
 * falling back to the non-unrolled loop.
 */
_ASM_EXTABLE_UA(30b, .Lfixup32)
_ASM_EXTABLE_UA(31b, .Lfixup32)
_ASM_EXTABLE_UA(32b, .Lfixup32)
_ASM_EXTABLE_UA(33b, .Lfixup32)
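
/*
 * What .Lfixup32 (near the end of the file) does for these, written
 * out in C terms: the first 32 bytes of this iteration have already
 * been stored, but rsi/rdi/edx have not been advanced yet, so
 *
 *	src   += 32;
 *	dst   += 32;
 *	count -= 32;
 *	// ...then retry the rest via the quadword-at-a-time loop
 */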

/*
 * An exception on a write means that we're
 * done, but we need to update the count
 * depending on where in the unrolled loop
 * we were.
 */
_ASM_EXTABLE_UA(20b, .Ldone0)
_ASM_EXTABLE_UA(21b, .Ldone8)
_ASM_EXTABLE_UA(22b, .Ldone16)
_ASM_EXTABLE_UA(23b, .Ldone24)
_ASM_EXTABLE_UA(40b, .Ldone32)
_ASM_EXTABLE_UA(41b, .Ldone40)
_ASM_EXTABLE_UA(42b, .Ldone48)
_ASM_EXTABLE_UA(43b, .Ldone56)
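
/*
 * Concretely: a fault at label 23 (the store to 24(%rdi)) means the
 * stores at offsets 0, 8 and 16 already succeeded, so 24 bytes of this
 * iteration made it out and the uncopied count is edx - 24.  That is
 * what jumping to .Ldone24 computes: the .LdoneNN labels near the end
 * of the file fall through each other, subtracting 8 per label, until
 * .Ldone0 returns the adjusted count.  In C terms the fixup is just
 *
 *	return count - bytes_already_stored_this_iteration;
 */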

.Lquadwords:
	cmp $8,%edx
	jb .Llong
50:	movq (%rsi),%rax
51:	movnti %rax,(%rdi)
	addq $8,%rsi
	addq $8,%rdi
	sub $8,%edx
	jmp .Lquadwords

/*
 * If we fail on the last full quadword, we will
 * not try to do any byte-wise cached accesses.
 * We will try to do one more 4-byte uncached
 * one, though.
 */
_ASM_EXTABLE_UA(50b, .Llast4)
_ASM_EXTABLE_UA(51b, .Ldone0)
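
/*
 * In other words: if the 8-byte user load at 50 faults, we do not fall
 * back to cached byte probing, but make one final 4-byte uncached
 * attempt via .Llast4 near the end of the file.  Roughly, with purely
 * illustrative helper names:
 *
 *	val = load_4_bytes(src);	// faults -> return count unchanged
 *	store_4_bytes_nt(dst, val);	// faults -> return count unchanged
 *	sfence();
 *	return count - 4;
 */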

.Llong:
	test $4,%dl
	je .Lword
60:	movl (%rsi),%eax
61:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
.Lword:
	sfence
	test $2,%dl
	je .Lbyte
70:	movw (%rsi),%ax
71:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lbyte:
	test $1,%dl
	je .Ldone
80:	movb (%rsi),%al
81:	movb %al,(%rdi)
	dec %edx
.Ldone:
	mov %edx,%eax
	RET
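
/*
 * Summary of the tail above, in C terms (count is below 8 on the
 * fall-through path, and the helper names are purely illustrative):
 *
 *	if (count & 4) {		// 4-byte piece, still uncached
 *		store_4_bytes_nt(dst, load_4_bytes(src));
 *		src += 4; dst += 4; count -= 4;
 *	}
 *	sfence();			// all movnti stores are done now
 *	if (count & 2) {		// 2-byte piece, plain cached store
 *		copy_2_bytes(dst, src);
 *		src += 2; dst += 2; count -= 2;
 *	}
 *	if (count & 1) {		// final byte, plain cached store
 *		copy_1_byte(dst, src);
 *		count -= 1;
 *	}
 *	return count;			// 0 here: everything was copied
 */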

/*
 * If we fail on the last few bytes, we won't
 * bother with any fixups. It's dead, Jim. Note
 * that there's no need for 'sfence' for any
 * of this, since the exception will have been
 * serializing.
 */
_ASM_EXTABLE_UA(60b, .Ldone)
_ASM_EXTABLE_UA(61b, .Ldone)
_ASM_EXTABLE_UA(70b, .Ldone)
_ASM_EXTABLE_UA(71b, .Ldone)
_ASM_EXTABLE_UA(80b, .Ldone)
_ASM_EXTABLE_UA(81b, .Ldone)

/*
 * This is the "head needs aligning" case when
 * the destination isn't 8-byte aligned. The
 * 4-byte case can be done uncached, but any
 * smaller alignment is done with regular stores.
 */
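
/*
 * The same logic, sketched in C with purely illustrative helper names
 * (the early exits for a count smaller than the piece being copied are
 * left out here):
 *
 *	if (dst & 1) {			// cached
 *		copy_1_byte(dst, src);
 *		src += 1; dst += 1; count -= 1;
 *	}
 *	if (dst & 2) {			// cached
 *		copy_2_bytes(dst, src);
 *		src += 2; dst += 2; count -= 2;
 *	}
 *	if (dst & 4) {			// uncached, via movnti
 *		store_4_bytes_nt(dst, load_4_bytes(src));
 *		src += 4; dst += 4; count -= 4;
 *	}
 *	// dst is now 8-byte aligned: continue at .Lis_aligned
 */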
.Lalign:
	test $1,%dil
	je .Lalign_word
	test %edx,%edx
	je .Ldone
90:	movb (%rsi),%al
91:	movb %al,(%rdi)
	inc %rsi
	inc %rdi
	dec %edx
.Lalign_word:
	test $2,%dil
	je .Lalign_long
	cmp $2,%edx
	jb .Lbyte
92:	movw (%rsi),%ax
93:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lalign_long:
	test $4,%dil
	je .Lis_aligned
	cmp $4,%edx
	jb .Lword
94:	movl (%rsi),%eax
95:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
	jmp .Lis_aligned

/*
 * If we fail on the initial alignment accesses,
 * we're all done. Again, no point in trying to
 * do byte-by-byte probing if the 4-byte load
 * fails - we're not doing any uncached accesses
 * any more.
 */
_ASM_EXTABLE_UA(90b, .Ldone)
_ASM_EXTABLE_UA(91b, .Ldone)
_ASM_EXTABLE_UA(92b, .Ldone)
_ASM_EXTABLE_UA(93b, .Ldone)
_ASM_EXTABLE_UA(94b, .Ldone)
_ASM_EXTABLE_UA(95b, .Ldone)

/*
 * Exception table fixups for faults in the middle
 * of the unrolled loop.
 */
.Ldone56: sub $8,%edx
.Ldone48: sub $8,%edx
.Ldone40: sub $8,%edx
.Ldone32: sub $8,%edx
.Ldone24: sub $8,%edx
.Ldone16: sub $8,%edx
.Ldone8: sub $8,%edx
.Ldone0:
	mov %edx,%eax
	RET

.Lfixup32:
	addq $32,%rsi
	addq $32,%rdi
	sub $32,%edx
	jmp .Lquadwords

.Llast4:
52:	movl (%rsi),%eax
53:	movnti %eax,(%rdi)
	sfence
	sub $4,%edx
	mov %edx,%eax
	RET
_ASM_EXTABLE_UA(52b, .Ldone0)
_ASM_EXTABLE_UA(53b, .Ldone0)

SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)