// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * sm3-neon-core.S - SM3 secure hash using NEON instructions
 *
 * Linux/arm64 port of the libgcrypt SM3 implementation for AArch64
 *
 * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

/* Context structure */

/* Byte offsets of the eight 32-bit digest words in struct sm3_state. */
#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16
#define state_h5 20
#define state_h6 24
#define state_h7 28

/* Stack structure */

/*
 * Message-schedule scratch area: three 64-byte slabs, each holding
 * 32 bytes of expanded words w[] followed by 32 bytes of the matching
 * w1^w2 values.  The slabs are addressed in rotation by IW_W_ADDR /
 * XW_W_ADDR below as the schedule advances.
 */
#define STACK_W_SIZE (32 * 2 * 3)

#define STACK_W (0)
#define STACK_SIZE (STACK_W + STACK_W_SIZE)

/* Register macros */

/*
 * NOTE(review): in GNU as for AArch64, ';' is a statement separator,
 * not a comment character -- the macros below rely on this to pack
 * several instructions into a single macro body.
 */

/* Function arguments (AAPCS64). */
#define RSTATE x0	/* struct sm3_state *sst */
#define RDATA x1	/* const u8 *src */
#define RNBLKS x2	/* number of 64-byte blocks */
#define RKPTR x28	/* pointer to .LKtable round constants */
#define RFRAME x29	/* saved original stack pointer */

/* SM3 working variables a..h, kept in registers for the whole block. */
#define ra w3
#define rb w4
#define rc w5
#define rd w6
#define re w7
#define rf w8
#define rg w9
#define rh w10

/* Scalar scratch registers used inside the round macro R(). */
#define t0 w11
#define t1 w12
#define t2 w13
#define t3 w14
#define t4 w15
#define t5 w16
#define t6 w17

/* Round constants loaded pairwise by KL() for an even/odd round pair. */
#define k_even w19
#define k_odd w20

/* Addresses into the stack scratch area. */
#define addr0 x21
#define addr1 x22

/* Temporaries holding the previous chaining value during the final xor. */
#define s0 w23
#define s1 w24
#define s2 w25
#define s3 w26

/* Message schedule vectors; 3 live 32-bit words per vector register. */
#define W0 v0
#define W1 v1
#define W2 v2
#define W3 v3
#define W4 v4
#define W5 v5

#define XTMP0 v6
#define XTMP1 v7
#define XTMP2 v16
#define XTMP3 v17
#define XTMP4 v18
#define XTMP5 v19
#define XTMP6 v20

/* Helper macros. */

/* Empty interleave operation: swallows its arguments, emits nothing. */
#define _(...) /*_*/

/* Zero a vector register (used to scrub message material on exit). */
#define clear_vec(x) \
	movi x.8h, #0;

/* o = rotate-left(a, n), implemented as ror by (32 - n). */
#define rolw(o, a, n) \
	ror o, a, #(32 - n);

/* Round function macros. */

/* GG1(x,y,z) = x ^ y ^ z, split into interleavable steps. */
#define GG1_1(x, y, z, o, t) \
	eor o, x, y;
#define GG1_2(x, y, z, o, t) \
	eor o, o, z;
#define GG1_3(x, y, z, o, t)

/* FF1(x,y,z) = x ^ y ^ z (same value; steps placed differently). */
#define FF1_1(x, y, z, o, t) GG1_1(x, y, z, o, t)
#define FF1_2(x, y, z, o, t)
#define FF1_3(x, y, z, o, t) GG1_2(x, y, z, o, t)

/* GG2(x,y,z) = (x & y) | (~x & z), computed as (z & ~x) ^ (y & x). */
#define GG2_1(x, y, z, o, t) \
	bic o, z, x;
#define GG2_2(x, y, z, o, t) \
	and t, y, x;
#define GG2_3(x, y, z, o, t) \
	eor o, o, t;

/* FF2(x,y,z) = (x & y) | (x & z) | (y & z),
 * computed as ((x ^ y) & z) ^ (x & y). */
#define FF2_1(x, y, z, o, t) \
	eor o, x, y;
#define FF2_2(x, y, z, o, t) \
	and t, x, y; \
	and o, o, z;
#define FF2_3(x, y, z, o, t) \
	eor o, o, t;

/*
 * One SM3 round.
 *
 * i:         1 => FF1/GG1 (rounds 0-15), 2 => FF2/GG2 (rounds 16-63)
 * a..h:      working variables; the caller rotates the assignment
 *            from round to round instead of shuffling registers
 * k:         holds the round constant K on entry; becomes SS1
 * K_LOAD:    KL to fetch the next k_even/k_odd pair, or _ for none
 * round:     round number, selects the w1 / w1^w2 stack slots
 * wtype:     IW (byte-swapped input, rounds 0-11) or XW (expanded words)
 * IOP:       interleaved micro-op macro (message schedule or block
 *            load), invoked at 8 points with iop_param; interleaving
 *            keeps in-order cores busy between dependent scalar ops
 */
#define R(i, a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	K_LOAD(round); \
	ldr t5, [sp, #(wtype##_W1_ADDR(round, widx))]; \
	rolw(t0, a, 12);                        /* rol(a, 12) => t0 */ \
	IOP(1, iop_param); \
	FF##i##_1(a, b, c, t1, t2); \
	ldr t6, [sp, #(wtype##_W1W2_ADDR(round, widx))]; \
	add k, k, e; \
	IOP(2, iop_param); \
	GG##i##_1(e, f, g, t3, t4); \
	FF##i##_2(a, b, c, t1, t2); \
	IOP(3, iop_param); \
	add k, k, t0; \
	add h, h, t5; \
	add d, d, t6;                           /* w1w2 + d => d */ \
	IOP(4, iop_param); \
	rolw(k, k, 7);                          /* SS1 = rol(rol(a,12) + e + K, 7) => k */ \
	GG##i##_2(e, f, g, t3, t4); \
	add h, h, k;                            /* h + w1 + k => h */ \
	IOP(5, iop_param); \
	FF##i##_3(a, b, c, t1, t2); \
	eor t0, t0, k;                          /* SS2 = k ^ t0 => t0 */ \
	GG##i##_3(e, f, g, t3, t4); \
	add d, d, t1;                           /* FF(a,b,c) + d => d */ \
	IOP(6, iop_param); \
	add t3, t3, h;                          /* GG(e,f,g) + h => t3 */ \
	rolw(b, b, 9);                          /* rol(b, 9) => b */ \
	eor h, t3, t3, ror #(32-9); \
	IOP(7, iop_param); \
	add d, d, t0;                           /* t0 + d => d */ \
	rolw(f, f, 19);                         /* rol(f, 19) => f */ \
	IOP(8, iop_param); \
	eor h, h, t3, ror #(32-17);             /* P0(t3) => h */

#define R1(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	R(1, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)

#define R2(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	R(2, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)

/* Load the round constant pair K[round], K[round+1] (round is even). */
#define KL(round) \
	ldp k_even, k_odd, [RKPTR, #(4*(round))];

/* Input expansion macros. */

/* Byte-swapped input address. */
#define IW_W_ADDR(round, widx, offs) \
	(STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))

/* Expanded input address. */
#define XW_W_ADDR(round, widx, offs) \
	(STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))

/* Rounds 0-11, byte-swapped input block addresses. */
#define IW_W1_ADDR(round, widx)   IW_W_ADDR(round, widx, 32)
#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 48)

/* Rounds 12 onwards, expanded input block addresses. */
#define XW_W1_ADDR(round, widx)   XW_W_ADDR(round, widx, 0)
#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 16)

/* Input block loading.
 * Interleaving within round function needed for in-order CPUs. */
#define LOAD_W_VEC_1_1() \
	add addr0, sp, #IW_W1_ADDR(0, 0);
#define LOAD_W_VEC_1_2() \
	add addr1, sp, #IW_W1_ADDR(4, 0);
#define LOAD_W_VEC_1_3() \
	ld1 {W0.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_4() \
	ld1 {W1.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_5() \
	ld1 {W2.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_6() \
	ld1 {W3.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_7() \
	rev32 XTMP0.16b, W0.16b;
#define LOAD_W_VEC_1_8() \
	rev32 XTMP1.16b, W1.16b;
#define LOAD_W_VEC_2_1() \
	rev32 XTMP2.16b, W2.16b;
#define LOAD_W_VEC_2_2() \
	rev32 XTMP3.16b, W3.16b;
#define LOAD_W_VEC_2_3() \
	eor XTMP4.16b, XTMP1.16b, XTMP0.16b;
#define LOAD_W_VEC_2_4() \
	eor XTMP5.16b, XTMP2.16b, XTMP1.16b;
#define LOAD_W_VEC_2_5() \
	st1 {XTMP0.16b}, [addr0], #16;
#define LOAD_W_VEC_2_6() \
	st1 {XTMP4.16b}, [addr0]; \
	add addr0, sp, #IW_W1_ADDR(8, 0);
#define LOAD_W_VEC_2_7() \
	eor XTMP6.16b, XTMP3.16b, XTMP2.16b;
#define LOAD_W_VEC_2_8() \
	ext W0.16b, XTMP0.16b, XTMP0.16b, #8;  /* W0: xx, w0, xx, xx */
#define LOAD_W_VEC_3_1() \
	mov W2.16b, XTMP1.16b;                 /* W2: xx, w6, w5, w4 */
#define LOAD_W_VEC_3_2() \
	st1 {XTMP1.16b}, [addr1], #16;
#define LOAD_W_VEC_3_3() \
	st1 {XTMP5.16b}, [addr1]; \
	ext W1.16b, XTMP0.16b, XTMP0.16b, #4;  /* W1: xx, w3, w2, w1 */
#define LOAD_W_VEC_3_4() \
	ext W3.16b, XTMP1.16b, XTMP2.16b, #12; /* W3: xx, w9, w8, w7 */
#define LOAD_W_VEC_3_5() \
	ext W4.16b, XTMP2.16b, XTMP3.16b, #8;  /* W4: xx, w12, w11, w10 */
#define LOAD_W_VEC_3_6() \
	st1 {XTMP2.16b}, [addr0], #16;
#define LOAD_W_VEC_3_7() \
	st1 {XTMP6.16b}, [addr0];
#define LOAD_W_VEC_3_8() \
	ext W5.16b, XTMP3.16b, XTMP3.16b, #4;  /* W5: xx, w15, w14, w13 */

/* Dispatch the iop_num'th micro-op of each block-load stage. */
#define LOAD_W_VEC_1(iop_num, ...) \
	LOAD_W_VEC_1_##iop_num()
#define LOAD_W_VEC_2(iop_num, ...) \
	LOAD_W_VEC_2_##iop_num()
#define LOAD_W_VEC_3(iop_num, ...) \
	LOAD_W_VEC_3_##iop_num()

/* Message scheduling. Note: 3 words per vector register.
 * Interleaving within round function needed for in-order CPUs. */
#define SCHED_W_1_1(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 16]) => XTMP0 */ \
	/* Load (w[i - 13]) => XTMP5 */ \
	ext XTMP0.16b, w0.16b, w0.16b, #12;    /* XTMP0: w0, xx, xx, xx */
#define SCHED_W_1_2(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP5.16b, w1.16b, w1.16b, #12;
#define SCHED_W_1_3(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP0.16b, XTMP0.16b, w1.16b, #12; /* XTMP0: xx, w2, w1, w0 */
#define SCHED_W_1_4(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP5.16b, XTMP5.16b, w2.16b, #12;
#define SCHED_W_1_5(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 9] == w3 */ \
	/* W3 ^ XTMP0 => XTMP0 */ \
	eor XTMP0.16b, XTMP0.16b, w3.16b;
#define SCHED_W_1_6(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 3] == w5 */ \
	/* rol(w5, 15) ^ XTMP0 => XTMP0 */ \
	/* rol(XTMP5, 7) => XTMP1 */ \
	add addr0, sp, #XW_W1_ADDR((round), 0); \
	shl XTMP2.4s, w5.4s, #15;
#define SCHED_W_1_7(round, w0, w1, w2, w3, w4, w5) \
	shl XTMP1.4s, XTMP5.4s, #7;
#define SCHED_W_1_8(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP2.4s, w5.4s, #(32-15);
#define SCHED_W_2_1(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP1.4s, XTMP5.4s, #(32-7);
#define SCHED_W_2_2(round, w0, w1, w2, w3, w4, w5) \
	eor XTMP0.16b, XTMP0.16b, XTMP2.16b;
#define SCHED_W_2_3(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 6] == W4 */ \
	/* W4 ^ XTMP1 => XTMP1 */ \
	eor XTMP1.16b, XTMP1.16b, w4.16b;
#define SCHED_W_2_4(round, w0, w1, w2, w3, w4, w5) \
	/* P1(XTMP0) ^ XTMP1 => W0 */ \
	shl XTMP3.4s, XTMP0.4s, #15;
#define SCHED_W_2_5(round, w0, w1, w2, w3, w4, w5) \
	shl XTMP4.4s, XTMP0.4s, #23;
#define SCHED_W_2_6(round, w0, w1, w2, w3, w4, w5) \
	eor w0.16b, XTMP1.16b, XTMP0.16b;
#define SCHED_W_2_7(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP3.4s, XTMP0.4s, #(32-15);
#define SCHED_W_2_8(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP4.4s, XTMP0.4s, #(32-23);
#define SCHED_W_3_1(round, w0, w1, w2, w3, w4, w5) \
	eor w0.16b, w0.16b, XTMP3.16b;
#define SCHED_W_3_2(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 3]) => XTMP2 */ \
	ext XTMP2.16b, w4.16b, w4.16b, #12;
#define SCHED_W_3_3(round, w0, w1, w2, w3, w4, w5) \
	eor w0.16b, w0.16b, XTMP4.16b;
#define SCHED_W_3_4(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP2.16b, XTMP2.16b, w5.16b, #12;
#define SCHED_W_3_5(round, w0, w1, w2, w3, w4, w5) \
	/* W1 ^ W2 => XTMP3 */ \
	eor XTMP3.16b, XTMP2.16b, w0.16b;
#define SCHED_W_3_6(round, w0, w1, w2, w3, w4, w5)
#define SCHED_W_3_7(round, w0, w1, w2, w3, w4, w5) \
	st1 {XTMP2.16b-XTMP3.16b}, [addr0];
#define SCHED_W_3_8(round, w0, w1, w2, w3, w4, w5)

/*
 * Six rotations of the W0..W5 window, three interleave stages each.
 * The register naming encodes which vector currently holds the oldest
 * schedule words; the round code picks the rotation matching its round.
 */
#define SCHED_W_W0W1W2W3W4W5_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W0, W1, W2, W3, W4, W5)

#define SCHED_W_W1W2W3W4W5W0_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W1, W2, W3, W4, W5, W0)

#define SCHED_W_W2W3W4W5W0W1_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W2, W3, W4, W5, W0, W1)

#define SCHED_W_W3W4W5W0W1W2_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W3, W4, W5, W0, W1, W2)

#define SCHED_W_W4W5W0W1W2W3_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W4, W5, W0, W1, W2, W3)

#define SCHED_W_W5W0W1W2W3W4_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4)


	/*
	 * Transform blocks*64 bytes (blocks*16 32-bit words) at 'src'.
	 *
	 * void sm3_neon_transform(struct sm3_state *sst, u8 const *src,
	 *                         int blocks)
	 *
	 * ABI (AAPCS64):
	 *   In:      x0 = sst, x1 = src, x2 = blocks (caller passes > 0)
	 *   Out:     sst->state[0..7] updated in place
	 *   Saved:   callee-saved x19-x26, x28, x29 (pushed below);
	 *            only caller-saved NEON regs v0-v7/v16-v20 are used
	 *   Stack:   STACK_SIZE bytes of 64-byte-aligned scratch for the
	 *            message schedule; scrubbed before returning
	 */
	.text
.align 3
SYM_TYPED_FUNC_START(sm3_neon_transform)
	/* Load the incoming digest into a..h. */
	ldp		ra, rb, [RSTATE, #0]
	ldp		rc, rd, [RSTATE, #8]
	ldp		re, rf, [RSTATE, #16]
	ldp		rg, rh, [RSTATE, #24]

	/* Save callee-saved registers; RFRAME (x29) keeps the entry sp. */
	stp		x28, x29, [sp, #-16]!
	stp		x19, x20, [sp, #-16]!
	stp		x21, x22, [sp, #-16]!
	stp		x23, x24, [sp, #-16]!
	stp		x25, x26, [sp, #-16]!
	mov		RFRAME, sp

	/* Reserve the schedule scratch area, aligning sp down to 64. */
	sub		addr0, sp, #STACK_SIZE
	adr_l		RKPTR, .LKtable
	and		sp, addr0, #(~63)

	/* Preload first block. */
	LOAD_W_VEC_1(1, 0)
	LOAD_W_VEC_1(2, 0)
	LOAD_W_VEC_1(3, 0)
	LOAD_W_VEC_1(4, 0)
	LOAD_W_VEC_1(5, 0)
	LOAD_W_VEC_1(6, 0)
	LOAD_W_VEC_1(7, 0)
	LOAD_W_VEC_1(8, 0)
	LOAD_W_VEC_2(1, 0)
	LOAD_W_VEC_2(2, 0)
	LOAD_W_VEC_2(3, 0)
	LOAD_W_VEC_2(4, 0)
	LOAD_W_VEC_2(5, 0)
	LOAD_W_VEC_2(6, 0)
	LOAD_W_VEC_2(7, 0)
	LOAD_W_VEC_2(8, 0)
	LOAD_W_VEC_3(1, 0)
	LOAD_W_VEC_3(2, 0)
	LOAD_W_VEC_3(3, 0)
	LOAD_W_VEC_3(4, 0)
	LOAD_W_VEC_3(5, 0)
	LOAD_W_VEC_3(6, 0)
	LOAD_W_VEC_3(7, 0)
	LOAD_W_VEC_3(8, 0)

.balign 16
.Loop:
	/* Transform 0-3 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 0, 0, IW, _, 0)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 1, 1, IW, _, 0)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 2, 2, IW, _, 0)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 3, 3, IW, _, 0)

	/* Transform 4-7 + Precalc 12-14 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 4, 0, IW, _, 0)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 5, 1, IW, _, 0)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 6, 2, IW, SCHED_W_W0W1W2W3W4W5_1, 12)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 7, 3, IW, SCHED_W_W0W1W2W3W4W5_2, 12)

	/* Transform 8-11 + Precalc 12-17 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 8, 0, IW, SCHED_W_W0W1W2W3W4W5_3, 12)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 9, 1, IW, SCHED_W_W1W2W3W4W5W0_1, 15)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 10, 2, IW, SCHED_W_W1W2W3W4W5W0_2, 15)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 11, 3, IW, SCHED_W_W1W2W3W4W5W0_3, 15)

	/* Transform 12-14 + Precalc 18-20 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 12, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 18)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 13, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 18)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 14, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 18)

	/* Transform 15-17 + Precalc 21-23 */
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 15, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 21)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 16, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 21)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 17, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 21)

	/* Transform 18-20 + Precalc 24-26 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 18, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 24)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 19, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 24)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 20, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 24)

	/* Transform 21-23 + Precalc 27-29 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 21, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 27)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 22, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 27)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 23, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 27)

	/* Transform 24-26 + Precalc 30-32 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 24, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 30)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 25, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 30)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 26, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 30)

	/* Transform 27-29 + Precalc 33-35 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 27, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 33)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 28, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 33)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 29, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 33)

	/* Transform 30-32 + Precalc 36-38 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 30, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 36)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 31, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 36)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 32, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 36)

	/* Transform 33-35 + Precalc 39-41 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 33, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 39)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 34, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 39)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 35, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 39)

	/* Transform 36-38 + Precalc 42-44 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 36, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 42)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 37, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 42)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 38, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 42)

	/* Transform 39-41 + Precalc 45-47 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 39, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 45)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 40, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 45)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 41, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 45)

	/* Transform 42-44 + Precalc 48-50 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 42, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 48)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 43, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 48)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 44, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 48)

	/* Transform 45-47 + Precalc 51-53 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 45, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 51)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 46, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 51)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 47, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 51)

	/* Transform 48-50 + Precalc 54-56 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 48, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 54)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 49, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 54)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 50, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 54)

	/* Transform 51-53 + Precalc 57-59 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 51, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 57)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 52, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 57)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 53, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 57)

	/* Transform 54-56 + Precalc 60-62 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 54, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 60)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 55, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 60)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 56, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 60)

	/* Transform 57-59 + Precalc 63 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 57, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 63)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 58, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 63)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 59, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 63)

	/* Transform 60 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 60, 0, XW, _, _)
	subs		RNBLKS, RNBLKS, #1
	b.eq		.Lend

	/* Transform 61-63 + Preload next block */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 61, 1, XW, LOAD_W_VEC_1, _)
	ldp		s0, s1, [RSTATE, #0]
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, LOAD_W_VEC_2, _)
	ldp		s2, s3, [RSTATE, #8]
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 63, 0, XW, LOAD_W_VEC_3, _)

	/* Update the chaining variables (state ^= working vars). */
	eor		ra, ra, s0
	eor		rb, rb, s1
	ldp		s0, s1, [RSTATE, #16]
	eor		rc, rc, s2
	/* k_even/k_odd are free here; reuse them as state temporaries. */
	ldp		k_even, k_odd, [RSTATE, #24]
	eor		rd, rd, s3
	eor		re, re, s0
	stp		ra, rb, [RSTATE, #0]
	eor		rf, rf, s1
	stp		rc, rd, [RSTATE, #8]
	eor		rg, rg, k_even
	stp		re, rf, [RSTATE, #16]
	eor		rh, rh, k_odd
	stp		rg, rh, [RSTATE, #24]
	b		.Loop

.Lend:
	/* Transform 61-63 (last block; nothing left to preload) */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,   _, 61, 1, XW, _, _)
	ldp		s0, s1, [RSTATE, #0]
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, _, _)
	ldp		s2, s3, [RSTATE, #8]
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,   _, 63, 0, XW, _, _)

	/* Update the chaining variables, interleaved with zeroing the
	 * vector registers that held message material. */
	eor		ra, ra, s0
	clear_vec(W0)
	eor		rb, rb, s1
	clear_vec(W1)
	ldp		s0, s1, [RSTATE, #16]
	clear_vec(W2)
	eor		rc, rc, s2
	clear_vec(W3)
	ldp		k_even, k_odd, [RSTATE, #24]
	clear_vec(W4)
	eor		rd, rd, s3
	clear_vec(W5)
	eor		re, re, s0
	clear_vec(XTMP0)
	stp		ra, rb, [RSTATE, #0]
	clear_vec(XTMP1)
	eor		rf, rf, s1
	clear_vec(XTMP2)
	stp		rc, rd, [RSTATE, #8]
	clear_vec(XTMP3)
	eor		rg, rg, k_even
	clear_vec(XTMP4)
	stp		re, rf, [RSTATE, #16]
	clear_vec(XTMP5)
	eor		rh, rh, k_odd
	clear_vec(XTMP6)
	stp		rg, rh, [RSTATE, #24]

	/* Clear message expansion area (3 x 64 bytes of zeros). */
	add		addr0, sp, #STACK_W
	st1		{W0.16b-W3.16b}, [addr0], #64
	st1		{W0.16b-W3.16b}, [addr0], #64
	st1		{W0.16b-W3.16b}, [addr0]

	/* Restore sp and callee-saved registers. */
	mov		sp, RFRAME

	ldp		x25, x26, [sp], #16
	ldp		x23, x24, [sp], #16
	ldp		x21, x22, [sp], #16
	ldp		x19, x20, [sp], #16
	ldp		x28, x29, [sp], #16

	ret
SYM_FUNC_END(sm3_neon_transform)


	.section ".rodata", "a"

	/* Pre-rotated round constants: K[i] = rol32(T[i], i mod 32). */
	.align 4
.LKtable:
	.long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb
	.long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc
	.long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce
	.long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6
	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
	.long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53
	.long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d
	.long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4
	.long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43
	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5