/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>

/* The BLAKE2s IV, which it shares with SHA-256. */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
/* pshufb masks implementing ror32(x, 16) and ror32(x, 8) on each dword lane. */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
/*
 * The BLAKE2s message schedule, one 16-byte row per round, with each round's
 * word indices pre-reordered for the four-lane column/diagonal layout used
 * below (hence not in the textbook RFC 7693 order).
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
/* The schedule again, as dword indices consumed by vpermi2d in the AVX-512 path. */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
#endif /* CONFIG_AS_AVX512 */

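/*
 * Calling convention, as a sketch: this assumes the SysV x86-64 ABI and a C
 * glue layer like the kernel's blake2s-glue.c; the prototype and struct
 * layout below are this comment's assumptions, not something this file
 * defines:
 *
 *	asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
 *					       const u8 *block,
 *					       const size_t nblocks,
 *					       const u32 inc);
 *
 * That is, %rdi points at the state (u32 h[8] at offset 0, then t[2] and
 * f[2] at offset 0x20), %rsi at nblocks 64-byte message blocks, %rdx holds
 * nblocks, and %rcx the counter increment. blake2s_compress_avx512 takes
 * the same arguments.
 */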
.text
#ifdef CONFIG_AS_SSSE3
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop		/* nothing to do for zero blocks */
	movdqu		(%rdi),%xmm0		/* xmm0,xmm1 = h[0..7] */
	movdqu		0x10(%rdi),%xmm1
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14	/* xmm14 = t[0],t[1],f[0],f[1] */
	movq		%rcx,%xmm15		/* xmm15 = inc */
	leaq		SIGMA+0xa0(%rip),%r8	/* end of the ten schedule rows */
	jmp		.Lbeginofloop
	.align 32
.Lbeginofloop:
	movdqa		%xmm0,%xmm10		/* save h for the final feed-forward */
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14		/* t += inc (64-bit add on the low qword) */
	movdqa		IV(%rip),%xmm2		/* xmm2 = v[8..11] = IV[0..3] */
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3	/* xmm3 = v[12..15] = (t,f) ^ IV[4..7] */
	leaq		SIGMA(%rip),%rcx
.Lroundloop:
	/* Gather the round's first four message words as scheduled by SIGMA. */
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	/* First half of G on all four columns: rotations by 16 and 12. */
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1		/* v[b] = ror32(v[b] ^ v[c], 12) */
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	/* Second half of G: rotations by 8 and 7. */
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Rotate the rows so the diagonals line up as columns. */
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	/* Same G pattern again, now mixing the diagonals. */
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Rotate the rows back. */
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx
	cmpq		%r8,%rcx		/* ten rounds */
	jnz		.Lroundloop
	/* Feed-forward: h ^= v[0..7] ^ v[8..15]. */
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi
	decq		%rdx
	jnz		.Lbeginofloop
	movdqu		%xmm0,(%rdi)		/* write back h and the updated t/f */
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	ret
SYM_FUNC_END(blake2s_compress_ssse3)
#endif /* CONFIG_AS_SSSE3 */

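/*
 * Both the SSSE3 routine above and the AVX-512 routine below vectorize the
 * scalar BLAKE2s mixing function G of RFC 7693; its 16/12/8/7 rotation
 * amounts explain the ROT16/ROR328 pshufb masks, the shift/or pairs above,
 * and the vprord immediates below. A minimal C model, for reference only
 * (not part of the build):
 *
 *	static inline u32 ror32(u32 x, int n)
 *	{
 *		return (x >> n) | (x << (32 - n));
 *	}
 *
 *	static void g(u32 v[16], int a, int b, int c, int d, u32 x, u32 y)
 *	{
 *		v[a] += v[b] + x; v[d] = ror32(v[d] ^ v[a], 16);
 *		v[c] += v[d];     v[b] = ror32(v[b] ^ v[c], 12);
 *		v[a] += v[b] + y; v[d] = ror32(v[d] ^ v[a], 8);
 *		v[c] += v[d];     v[b] = ror32(v[b] ^ v[c], 7);
 *	}
 */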
#ifdef CONFIG_AS_AVX512
SYM_FUNC_START(blake2s_compress_avx512)
	vmovdqu		(%rdi),%xmm0		/* xmm0,xmm1 = h[0..7] */
	vmovdqu		0x10(%rdi),%xmm1
	vmovdqu		0x20(%rdi),%xmm4	/* xmm4 = t[0],t[1],f[0],f[1] */
	vmovq		%rcx,%xmm5		/* xmm5 = inc */
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	vmovdqa		%xmm0,%xmm10		/* save h for the final feed-forward */
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4	/* t += inc */
	vmovdqa		%xmm14,%xmm2		/* v[8..11] = IV[0..3] */
	vpxor		%xmm15,%xmm4,%xmm3	/* v[12..15] = (t,f) ^ IV[4..7] */
	vmovdqu		(%rsi),%ymm6		/* ymm6,ymm7 = the 16 message words */
	vmovdqu		0x20(%rsi),%ymm7
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl		/* ten rounds */
.Lblake2s_compress_avx512_roundloop:
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	/* Use the SIGMA2 rows as indices to select this round's words. */
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7
	/* G on the four columns; the y-words sit in the high 128 bits. */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Rotate the rows so the diagonals line up as columns. */
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	/* G on the diagonals. */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Rotate the rows back. */
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	/* Feed-forward: h ^= v[0..7] ^ v[8..15]. */
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	vmovdqu		%xmm0,(%rdi)		/* write back h and the updated t/f */
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper
	retq
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */
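
/*
 * For completeness, a hedged C model of what either entry point computes
 * per 64-byte block (reference only, not part of the build; g() is the
 * mixing function sketched above, blake2s_sigma is the standard RFC 7693
 * schedule rather than the lane-reordered SIGMA/SIGMA2 tables in this file,
 * the state layout is the assumption noted at the top, and a little-endian
 * host is assumed, as it is by the asm):
 *
 *	static const u8 blake2s_sigma[10][16] = {
 *		{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
 *		{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
 *		{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
 *		{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
 *		{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
 *		{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
 *		{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
 *		{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
 *		{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
 *		{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
 *	};
 *
 *	static void blake2s_compress_model(u32 h[8], u32 tf[4],
 *					   const u8 *block, size_t nblocks,
 *					   u32 inc)
 *	{
 *		static const u32 iv[8] = {
 *			0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
 *			0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
 *		};
 *		while (nblocks--) {
 *			u32 m[16], v[16];
 *			int i, r;
 *
 *			tf[0] += inc;		// t += inc, as a 64-bit add
 *			if (tf[0] < inc)
 *				tf[1]++;
 *			memcpy(m, block, 64);	// little-endian message words
 *			for (i = 0; i < 8; i++) {
 *				v[i] = h[i];
 *				v[i + 8] = iv[i];
 *			}
 *			for (i = 0; i < 4; i++)
 *				v[i + 12] ^= tf[i];	// fold in t and f
 *			for (r = 0; r < 10; r++) {
 *				const u8 *s = blake2s_sigma[r];
 *
 *				g(v, 0, 4,  8, 12, m[s[0]],  m[s[1]]);
 *				g(v, 1, 5,  9, 13, m[s[2]],  m[s[3]]);
 *				g(v, 2, 6, 10, 14, m[s[4]],  m[s[5]]);
 *				g(v, 3, 7, 11, 15, m[s[6]],  m[s[7]]);
 *				g(v, 0, 5, 10, 15, m[s[8]],  m[s[9]]);
 *				g(v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
 *				g(v, 2, 7,  8, 13, m[s[12]], m[s[13]]);
 *				g(v, 3, 4,  9, 14, m[s[14]], m[s[15]]);
 *			}
 *			for (i = 0; i < 8; i++)
 *				h[i] ^= v[i] ^ v[i + 8];
 *			block += 64;
 *		}
 *	}
 */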