/* SPDX-License-Identifier: GPL-2.0 */
/*
 * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions
 *
 * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	/*
	 * Define the symbols .Lv<N>.2d and .Lv<N>.16b for every vector
	 * register number N, each with the value N. The instruction macros
	 * below prefix their operands with ".L" (so "v3.16b" is looked up as
	 * ".Lv3.16b") to obtain the register number for the encoding. Only
	 * the .2d and .16b arrangements are defined, so only those operand
	 * spellings can be passed to the macros.
	 */
	.irp	b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
	.set	.Lv\b\().2d, \b
	.set	.Lv\b\().16b, \b
	.endr

	/*
	 * ARMv8.2 Crypto Extensions instructions
	 *
	 * These are emitted as raw opcodes via .inst, so the assembler does
	 * not need to know the mnemonics. Field layout used by all four:
	 * Rd in bits [4:0], Rn in bits [9:5], Rm in bits [20:16]; bits
	 * [15:10] hold either Ra (eor3, bcax) or the 6-bit immediate (xar).
	 */

	/* eor3 rd, rn, rm, ra: rd = rn ^ rm ^ ra */
	.macro	eor3, rd, rn, rm, ra
	.inst	0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
	.endm

	/* rax1 rd, rn, rm: rd = rn ^ rol64(rm, 1), per 64-bit lane */
	.macro	rax1, rd, rn, rm
	.inst	0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
	.endm

	/* bcax rd, rn, rm, ra: rd = rn ^ (rm & ~ra) */
	.macro	bcax, rd, rn, rm, ra
	.inst	0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
	.endm

	/*
	 * xar rd, rn, rm, imm6: rd = ror64(rn ^ rm, imm6), per 64-bit lane.
	 * Callers below pass (64 - n) to obtain a rotate left by n.
	 */
	.macro	xar, rd, rn, rm, imm6
	.inst	0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
	.endm

	/*
	 * sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
	 *
	 * Absorb 'blocks' input blocks from 'data' into the 25 x 64-bit state
	 * at 'st', running the 24-round permutation after each block.
	 *
	 * In:	x0 = st		state, 25 consecutive 64-bit lanes (200 bytes)
	 *	x1 = data	input; advanced by one block per iteration
	 *	w2 = blocks	number of blocks; must be at least 1, because
	 *			the count is decremented before it is tested
	 *	x3 = dg_size	selects the block size through bits 6, 4 and 2:
	 *			  bit 6 set		 SHA3-512,  72 bytes
	 *			  bit 4 set, 2 clear	 SHA3-384, 104 bytes
	 *			  bit 4 set, 2 set	 SHA3-224, 144 bytes
	 *			  bits 6 and 4 clear	 SHA3-256, 136 bytes
	 *
	 * Register use inside the function:
	 *	x19 = st, x20 = data, w21 = blocks left, x22 = dg_size
	 *	x8  = scratch pointer for state load/store, and (as w8) the
	 *	      round counter while the permutation is running
	 *	x9  = pointer to the next round constant in .Lsha3_rcon
	 *	v0-v24  = state lanes 0-24 (low 64 bits of each register)
	 *	v25-v31 = input data and temporaries
	 *
	 * The arguments are moved to x19-x22 after "frame_push 4"; that macro
	 * comes from <asm/assembler.h> and is presumably what preserves the
	 * caller's x19-x22 - confirm there.
	 *
	 * NOTE(review): v8-v15 are overwritten and never saved here, so the
	 * caller must not rely on their contents - verify that this matches
	 * how the function is called.
	 */
	.text
ENTRY(sha3_ce_transform)
	frame_push	4

	mov	x19, x0
	mov	x20, x1
	mov	x21, x2
	mov	x22, x3

0:	/* load state */
	add	x8, x19, #32
	ld1	{ v0.1d- v3.1d}, [x19]
	ld1	{ v4.1d- v7.1d}, [x8], #32
	ld1	{ v8.1d-v11.1d}, [x8], #32
	ld1	{v12.1d-v15.1d}, [x8], #32
	ld1	{v16.1d-v19.1d}, [x8], #32
	ld1	{v20.1d-v23.1d}, [x8], #32
	ld1	{v24.1d}, [x8]

	/* per-block setup: one block fewer to go, 24 rounds, first constant */
1:	sub	w21, w21, #1
	mov	w8, #24
	adr_l	x9, .Lsha3_rcon

	/* load input */
	/* the first 7 lanes (56 bytes) are absorbed for every digest size */
	ld1	{v25.8b-v28.8b}, [x20], #32
	ld1	{v29.8b-v31.8b}, [x20], #24
	eor	v0.8b, v0.8b, v25.8b
	eor	v1.8b, v1.8b, v26.8b
	eor	v2.8b, v2.8b, v27.8b
	eor	v3.8b, v3.8b, v28.8b
	eor	v4.8b, v4.8b, v29.8b
	eor	v5.8b, v5.8b, v30.8b
	eor	v6.8b, v6.8b, v31.8b

	tbnz	x22, #6, 3f		// SHA3-512

	/* lanes 7-12 (48 more bytes) for every size except SHA3-512 */
	ld1	{v25.8b-v28.8b}, [x20], #32
	ld1	{v29.8b-v30.8b}, [x20], #16
	eor	v7.8b, v7.8b, v25.8b
	eor	v8.8b, v8.8b, v26.8b
	eor	v9.8b, v9.8b, v27.8b
	eor	v10.8b, v10.8b, v28.8b
	eor	v11.8b, v11.8b, v29.8b
	eor	v12.8b, v12.8b, v30.8b

	tbnz	x22, #4, 2f		// SHA3-384 or SHA3-224

	// SHA3-256
	/* lanes 13-16 (32 more bytes) */
	ld1	{v25.8b-v28.8b}, [x20], #32
	eor	v13.8b, v13.8b, v25.8b
	eor	v14.8b, v14.8b, v26.8b
	eor	v15.8b, v15.8b, v27.8b
	eor	v16.8b, v16.8b, v28.8b
	b	4f

2:	tbz	x22, #2, 4f		// bit 2 cleared? SHA-384

	// SHA3-224
	/* lanes 13-17 (40 more bytes) */
	ld1	{v25.8b-v28.8b}, [x20], #32
	ld1	{v29.8b}, [x20], #8
	eor	v13.8b, v13.8b, v25.8b
	eor	v14.8b, v14.8b, v26.8b
	eor	v15.8b, v15.8b, v27.8b
	eor	v16.8b, v16.8b, v28.8b
	eor	v17.8b, v17.8b, v29.8b
	b	4f

	// SHA3-512
	/* lanes 7-8 (16 more bytes) */
3:	ld1	{v25.8b-v26.8b}, [x20], #16
	eor	v7.8b, v7.8b, v25.8b
	eor	v8.8b, v8.8b, v26.8b

	/* one round of the permutation; executed 24 times per block */
4:	sub	w8, w8, #1

	/*
	 * Column parities: v25..v29 = XOR of the five lanes in columns 0..4
	 * (lane index modulo 5).
	 */
	eor3	v29.16b, v4.16b, v9.16b, v14.16b
	eor3	v26.16b, v1.16b, v6.16b, v11.16b
	eor3	v28.16b, v3.16b, v8.16b, v13.16b
	eor3	v25.16b, v0.16b, v5.16b, v10.16b
	eor3	v27.16b, v2.16b, v7.16b, v12.16b
	eor3	v29.16b, v29.16b, v19.16b, v24.16b
	eor3	v26.16b, v26.16b, v16.16b, v21.16b
	eor3	v28.16b, v28.16b, v18.16b, v23.16b
	eor3	v25.16b, v25.16b, v15.16b, v20.16b
	eor3	v27.16b, v27.16b, v17.16b, v22.16b

	/*
	 * bc[x] = parity[x - 1] ^ rol(parity[x + 1], 1). The order matters:
	 * each parity register is overwritten only after its last use as a
	 * source.
	 */
	rax1	v30.2d, v29.2d, v26.2d	// bc[0]
	rax1	v26.2d, v26.2d, v28.2d	// bc[2]
	rax1	v28.2d, v28.2d, v25.2d	// bc[4]
	rax1	v25.2d, v25.2d, v27.2d	// bc[1]
	rax1	v27.2d, v27.2d, v29.2d	// bc[3]

	/*
	 * XOR bc[] into every lane, rotate it and move it to its new
	 * position, all in place. Lane 0 is not rotated. v29, v31 and, once
	 * their last use as bc[] has passed, v25-v28 and v30 hold moved
	 * lanes, so the instruction order must be kept as is.
	 */
	eor	v0.16b, v0.16b, v30.16b
	xar	v29.2d, v1.2d, v25.2d, (64 - 1)
	xar	v1.2d, v6.2d, v25.2d, (64 - 44)
	xar	v6.2d, v9.2d, v28.2d, (64 - 20)
	xar	v9.2d, v22.2d, v26.2d, (64 - 61)
	xar	v22.2d, v14.2d, v28.2d, (64 - 39)
	xar	v14.2d, v20.2d, v30.2d, (64 - 18)
	xar	v31.2d, v2.2d, v26.2d, (64 - 62)
	xar	v2.2d, v12.2d, v26.2d, (64 - 43)
	xar	v12.2d, v13.2d, v27.2d, (64 - 25)
	xar	v13.2d, v19.2d, v28.2d, (64 - 8)
	xar	v19.2d, v23.2d, v27.2d, (64 - 56)
	xar	v23.2d, v15.2d, v30.2d, (64 - 41)
	xar	v15.2d, v4.2d, v28.2d, (64 - 27)
	xar	v28.2d, v24.2d, v28.2d, (64 - 14)
	xar	v24.2d, v21.2d, v25.2d, (64 - 2)
	xar	v8.2d, v8.2d, v27.2d, (64 - 55)
	xar	v4.2d, v16.2d, v25.2d, (64 - 45)
	xar	v16.2d, v5.2d, v30.2d, (64 - 36)
	xar	v5.2d, v3.2d, v27.2d, (64 - 28)
	xar	v27.2d, v18.2d, v27.2d, (64 - 21)
	xar	v3.2d, v17.2d, v26.2d, (64 - 15)
	xar	v25.2d, v11.2d, v25.2d, (64 - 10)
	xar	v26.2d, v7.2d, v26.2d, (64 - 6)
	xar	v30.2d, v10.2d, v30.2d, (64 - 3)

	/*
	 * Non-linear step, one group of five lanes at a time:
	 * new[i] = b[i] ^ (~b[i + 1] & b[i + 2]), indices taken within the
	 * group. Lanes 20-24 first.
	 */
	bcax	v20.16b, v31.16b, v22.16b, v8.16b
	bcax	v21.16b, v8.16b, v23.16b, v22.16b
	bcax	v22.16b, v22.16b, v24.16b, v23.16b
	bcax	v23.16b, v23.16b, v31.16b, v24.16b
	bcax	v24.16b, v24.16b, v8.16b, v31.16b

	/* v31 is free again: fetch this round's constant into both lanes */
	ld1r	{v31.2d}, [x9], #8

	/* lanes 15-19 */
	bcax	v17.16b, v25.16b, v19.16b, v3.16b
	bcax	v18.16b, v3.16b, v15.16b, v19.16b
	bcax	v19.16b, v19.16b, v16.16b, v15.16b
	bcax	v15.16b, v15.16b, v25.16b, v16.16b
	bcax	v16.16b, v16.16b, v3.16b, v25.16b

	/* lanes 10-14 */
	bcax	v10.16b, v29.16b, v12.16b, v26.16b
	bcax	v11.16b, v26.16b, v13.16b, v12.16b
	bcax	v12.16b, v12.16b, v14.16b, v13.16b
	bcax	v13.16b, v13.16b, v29.16b, v14.16b
	bcax	v14.16b, v14.16b, v26.16b, v29.16b

	/* lanes 5-9 */
	bcax	v7.16b, v30.16b, v9.16b, v4.16b
	bcax	v8.16b, v4.16b, v5.16b, v9.16b
	bcax	v9.16b, v9.16b, v6.16b, v5.16b
	bcax	v5.16b, v5.16b, v30.16b, v6.16b
	bcax	v6.16b, v6.16b, v4.16b, v30.16b

	/* lanes 0-4 */
	bcax	v3.16b, v27.16b, v0.16b, v28.16b
	bcax	v4.16b, v28.16b, v1.16b, v0.16b
	bcax	v0.16b, v0.16b, v2.16b, v1.16b
	bcax	v1.16b, v1.16b, v27.16b, v2.16b
	bcax	v2.16b, v2.16b, v28.16b, v27.16b

	/* add the round constant to lane 0 */
	eor	v0.16b, v0.16b, v31.16b

	cbnz	w8, 4b			// more rounds for this block
	cbz	w21, 5f			// that was the last block

	/*
	 * More blocks remain. The three *_yield_neon macros come from
	 * <asm/assembler.h>; presumably the code between the first two runs
	 * only when a yield is about to happen - confirm there. In that case
	 * the state is written back to memory first and reloaded from it at
	 * 0b afterwards. Otherwise execution continues at 1b with the state
	 * still in registers.
	 */
	if_will_cond_yield_neon
	add	x8, x19, #32
	st1	{ v0.1d- v3.1d}, [x19]
	st1	{ v4.1d- v7.1d}, [x8], #32
	st1	{ v8.1d-v11.1d}, [x8], #32
	st1	{v12.1d-v15.1d}, [x8], #32
	st1	{v16.1d-v19.1d}, [x8], #32
	st1	{v20.1d-v23.1d}, [x8], #32
	st1	{v24.1d}, [x8]
	do_cond_yield_neon
	b	0b
	endif_yield_neon

	b	1b

	/* save state */
	/* x19 is post-incremented here; it is not needed afterwards */
5:	st1	{ v0.1d- v3.1d}, [x19], #32
	st1	{ v4.1d- v7.1d}, [x19], #32
	st1	{ v8.1d-v11.1d}, [x19], #32
	st1	{v12.1d-v15.1d}, [x19], #32
	st1	{v16.1d-v19.1d}, [x19], #32
	st1	{v20.1d-v23.1d}, [x19], #32
	st1	{v24.1d}, [x19]
	frame_pop
	ret
ENDPROC(sha3_ce_transform)

	/*
	 * The 24 round constants, one 64-bit value per round, read in order
	 * through x9. The .align operand is a power-of-two exponent on this
	 * target, so 3 gives the 8-byte alignment that 64-bit constants need
	 * (8 would request 256 bytes).
	 */
	.section	".rodata", "a"
	.align	3
.Lsha3_rcon:
	.quad	0x0000000000000001, 0x0000000000008082, 0x800000000000808a
	.quad	0x8000000080008000, 0x000000000000808b, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000000008009, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000080008009, 0x000000008000000a
	.quad	0x000000008000808b, 0x800000000000008b, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008002, 0x8000000000000080
	.quad	0x000000000000800a, 0x800000008000000a, 0x8000000080008081
	.quad	0x8000000000008080, 0x0000000080000001, 0x8000000080008008