/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
 */

#include <linux/linkage.h>
#include <asm/asm.h>
#include <asm/export.h>

/*
 * copy_user_nocache - Uncached memory copy with exception handling
 *
 * This copies from user space into kernel space, but the kernel
 * space accesses can take a machine check exception, so they too
 * need exception handling.
 *
 * Note: only 32-bit and 64-bit stores have non-temporal versions,
 * and we only use aligned versions. Any unaligned parts at the
 * start or end of the copy will be done using normal cached stores.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * rax uncopied bytes or 0 if successful.
 */
SYM_FUNC_START(__copy_user_nocache)
	/* If the destination is not 8-byte aligned, we'll have to align it */
	testb $7,%dil
	jne .Lalign

.Lis_aligned:
	cmp $64,%edx
	jb .Lquadwords

	.p2align 4,0x90
.Lunrolled:
10:	movq (%rsi),%r8
11:	movq 8(%rsi),%r9
12:	movq 16(%rsi),%r10
13:	movq 24(%rsi),%r11
20:	movnti %r8,(%rdi)
21:	movnti %r9,8(%rdi)
22:	movnti %r10,16(%rdi)
23:	movnti %r11,24(%rdi)
30:	movq 32(%rsi),%r8
31:	movq 40(%rsi),%r9
32:	movq 48(%rsi),%r10
33:	movq 56(%rsi),%r11
40:	movnti %r8,32(%rdi)
41:	movnti %r9,40(%rdi)
42:	movnti %r10,48(%rdi)
43:	movnti %r11,56(%rdi)

	addq $64,%rsi
	addq $64,%rdi
	sub $64,%edx
	cmp $64,%edx
	jae .Lunrolled
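
/*
 * For orientation, the fault-free path of this whole function
 * follows the rough C model below. This is an illustrative sketch
 * only, not the kernel's implementation: "movnti_store" is a
 * made-up stand-in for the movnti instruction used above, and the
 * real function recovers from faults via the _ASM_EXTABLE_UA()
 * entries rather than via C control flow.
 *
 *	while (count >= 64) {			// .Lunrolled
 *		// eight movq loads, eight movnti_store()s
 *		src += 64; dst += 64; count -= 64;
 *	}
 *	while (count >= 8) {			// .Lquadwords
 *		movnti_store(dst, *(u64 *)src);
 *		src += 8; dst += 8; count -= 8;
 *	}
 *	// 4-byte step uncached, then sfence, then cached
 *	// 2-byte/1-byte tail (.Llong / .Lword / .Lbyte)
 *	return count;				// 0 on success
 */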
/*
 * The first set of user mode loads has been done
 * without any stores, so if any of them fail, we
 * can just try the non-unrolled loop.
 */
_ASM_EXTABLE_UA(10b, .Lquadwords)
_ASM_EXTABLE_UA(11b, .Lquadwords)
_ASM_EXTABLE_UA(12b, .Lquadwords)
_ASM_EXTABLE_UA(13b, .Lquadwords)

/*
 * The second set of user mode loads has been
 * done with 32 bytes already stored to the
 * destination, so we need to take that into
 * account before falling back to the
 * non-unrolled loop.
 */
_ASM_EXTABLE_UA(30b, .Lfixup32)
_ASM_EXTABLE_UA(31b, .Lfixup32)
_ASM_EXTABLE_UA(32b, .Lfixup32)
_ASM_EXTABLE_UA(33b, .Lfixup32)

/*
 * An exception on a write means that we're
 * done, but we need to update the count
 * depending on where in the unrolled loop
 * we were.
 */
_ASM_EXTABLE_UA(20b, .Ldone0)
_ASM_EXTABLE_UA(21b, .Ldone8)
_ASM_EXTABLE_UA(22b, .Ldone16)
_ASM_EXTABLE_UA(23b, .Ldone24)
_ASM_EXTABLE_UA(40b, .Ldone32)
_ASM_EXTABLE_UA(41b, .Ldone40)
_ASM_EXTABLE_UA(42b, .Ldone48)
_ASM_EXTABLE_UA(43b, .Ldone56)

.Lquadwords:
	cmp $8,%edx
	jb .Llong
50:	movq (%rsi),%rax
51:	movnti %rax,(%rdi)
	addq $8,%rsi
	addq $8,%rdi
	sub $8,%edx
	jmp .Lquadwords
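
/*
 * Worked example of the two loops above: for count = 100, one
 * pass through .Lunrolled copies 64 bytes (count = 36), four
 * passes through .Lquadwords copy 32 more (count = 4), and the
 * 4-byte uncached tail store at .Llong below finishes the job
 * (count = 0).
 */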
/*
 * If we fail on the last full quadword, we will
 * not try to do any byte-wise cached accesses.
 * We will try to do one more 4-byte uncached
 * one, though.
 */
_ASM_EXTABLE_UA(50b, .Llast4)
_ASM_EXTABLE_UA(51b, .Ldone0)

.Llong:
	test $4,%dl
	je .Lword
60:	movl (%rsi),%eax
61:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
.Lword:
	sfence
	test $2,%dl
	je .Lbyte
70:	movw (%rsi),%ax
71:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lbyte:
	test $1,%dl
	je .Ldone
80:	movb (%rsi),%al
81:	movb %al,(%rdi)
	dec %edx
.Ldone:
	mov %edx,%eax
	RET

/*
 * If we fail on the last four bytes, we won't
 * bother with any fixups. It's dead, Jim. Note
 * that there's no need for 'sfence' for any
 * of this, since the exception will have been
 * serializing.
 */
_ASM_EXTABLE_UA(60b, .Ldone)
_ASM_EXTABLE_UA(61b, .Ldone)
_ASM_EXTABLE_UA(70b, .Ldone)
_ASM_EXTABLE_UA(71b, .Ldone)
_ASM_EXTABLE_UA(80b, .Ldone)
_ASM_EXTABLE_UA(81b, .Ldone)
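
/*
 * Note on the sfence at .Lword above: movnti stores are weakly
 * ordered, so one store fence is needed after the last possible
 * non-temporal store. Every path that has done a non-temporal
 * store reaches .Lword before returning successfully (except
 * .Llast4 below, which does its own sfence), which is why the
 * 2- and 1-byte tail stores after the fence can be plain cached
 * moves.
 */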
/*
 * This is the "head needs aligning" case when
 * the destination isn't 8-byte aligned. The
 * 4-byte case can be done uncached, but any
 * smaller alignment is done with regular stores.
 */
.Lalign:
	test $1,%dil
	je .Lalign_word
	test %edx,%edx
	je .Ldone
90:	movb (%rsi),%al
91:	movb %al,(%rdi)
	inc %rsi
	inc %rdi
	dec %edx
.Lalign_word:
	test $2,%dil
	je .Lalign_long
	cmp $2,%edx
	jb .Lbyte
92:	movw (%rsi),%ax
93:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lalign_long:
	test $4,%dil
	je .Lis_aligned
	cmp $4,%edx
	jb .Lword
94:	movl (%rsi),%eax
95:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
	jmp .Lis_aligned

/*
 * If we fail on the initial alignment accesses,
 * we're all done. Again, no point in trying to
 * do byte-by-byte probing if the 4-byte load
 * fails - we're not doing any uncached accesses
 * any more.
 */
_ASM_EXTABLE_UA(90b, .Ldone)
_ASM_EXTABLE_UA(91b, .Ldone)
_ASM_EXTABLE_UA(92b, .Ldone)
_ASM_EXTABLE_UA(93b, .Ldone)
_ASM_EXTABLE_UA(94b, .Ldone)
_ASM_EXTABLE_UA(95b, .Ldone)

/*
 * Exception table fixups for write faults in the
 * middle of the unrolled loop: each label falls
 * through to the next, so jumping to .LdoneNN
 * subtracts the NN bytes that had already been
 * stored from the remaining count.
 */
.Ldone56: sub $8,%edx
.Ldone48: sub $8,%edx
.Ldone40: sub $8,%edx
.Ldone32: sub $8,%edx
.Ldone24: sub $8,%edx
.Ldone16: sub $8,%edx
.Ldone8: sub $8,%edx
.Ldone0:
	mov %edx,%eax
	RET

.Lfixup32:
	addq $32,%rsi
	addq $32,%rdi
	sub $32,%edx
	jmp .Lquadwords

.Llast4:
52:	movl (%rsi),%eax
53:	movnti %eax,(%rdi)
	sfence
	sub $4,%edx
	mov %edx,%eax
	RET
_ASM_EXTABLE_UA(52b, .Ldone0)
_ASM_EXTABLE_UA(53b, .Ldone0)

SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)
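
/*
 * Sketch of how a caller might use this, assuming the usual SMAP
 * bracketing (illustrative only - the real wrapper is
 * __copy_from_user_inatomic_nocache() in
 * arch/x86/include/asm/uaccess_64.h, which may differ in detail):
 *
 *	unsigned long left;
 *
 *	stac();
 *	left = __copy_user_nocache(dst, src, size);
 *	clac();
 *	// 'left' is the number of bytes NOT copied; only the
 *	// first 'size - left' bytes of 'dst' are valid.
 */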