// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * sm3-neon-core.S - SM3 secure hash using NEON instructions
 *
 * Linux/arm64 port of the libgcrypt SM3 implementation for AArch64
 *
 * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

/* Context structure: byte offsets of the eight 32-bit chaining words. */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16
#define state_h5 20
#define state_h6 24
#define state_h7 28

/* Stack structure */

/* Size in bytes of the on-stack scratch area holding message words. */
#define STACK_W_SIZE        (32 * 2 * 3)

#define STACK_W             (0)
#define STACK_SIZE          (STACK_W + STACK_W_SIZE)

/* Register macros */

/* Function arguments. */
#define RSTATE x0
#define RDATA  x1
/*
 * The block count is declared as a 32-bit 'int' in the C prototype, and
 * AAPCS64 leaves bits 63:32 of the argument register unspecified for such
 * an argument, so the counter is accessed through the 32-bit view only.
 */
#define RNBLKS w2
/* Pointer to the round constant table and saved stack pointer. */
#define RKPTR  x28
#define RFRAME x29

/* Working variables A..H of the compression function. */
#define ra w3
#define rb w4
#define rc w5
#define rd w6
#define re w7
#define rf w8
#define rg w9
#define rh w10

/* Scalar temporaries used inside a round. */
#define t0 w11
#define t1 w12
#define t2 w13
#define t3 w14
#define t4 w15
#define t5 w16
#define t6 w17

/* Round constants for the current even/odd round pair (loaded by KL). */
#define k_even w19
#define k_odd  w20

/* Address temporaries for vector stores to the stack scratch area. */
#define addr0 x21
#define addr1 x22

/* Temporaries holding the previous chaining value during the final XOR. */
#define s0 w23
#define s1 w24
#define s2 w25
#define s3 w26

/* Message word vectors (three useful 32-bit words per register). */
#define W0 v0
#define W1 v1
#define W2 v2
#define W3 v3
#define W4 v4
#define W5 v5

/* Vector temporaries. */
#define XTMP0 v6
#define XTMP1 v7
#define XTMP2 v16
#define XTMP3 v17
#define XTMP4 v18
#define XTMP5 v19
#define XTMP6 v20

/* Helper macros. */

/* Expands to nothing; passed where a K_LOAD or IOP hook is not wanted. */
#define _(...) /*_*/

/* Zero a whole vector register. */
#define clear_vec(x) \
	movi x.8h, #0;

/* o = a rotated left by n bits (expressed as a rotate right by 32 - n). */
#define rolw(o, a, n) \
	ror o, a, #(32 - n);

/* Round function macros.
*/

/*
 * Boolean functions, each split into three steps so that the steps can be
 * interleaved with other work inside R(). Arguments: x, y, z are inputs,
 * o receives the result, t is a scratch register.
 *
 * GG1/FF1: x ^ y ^ z
 * GG2:     (x & y) ^ (~x & z)
 * FF2:     (x & y) ^ ((x ^ y) & z)
 */
#define GG1_1(x, y, z, o, t) \
	eor o, x, y;
#define GG1_2(x, y, z, o, t) \
	eor o, o, z;
#define GG1_3(x, y, z, o, t)

#define FF1_1(x, y, z, o, t) GG1_1(x, y, z, o, t)
#define FF1_2(x, y, z, o, t)
#define FF1_3(x, y, z, o, t) GG1_2(x, y, z, o, t)

#define GG2_1(x, y, z, o, t) \
	bic o, z, x;
#define GG2_2(x, y, z, o, t) \
	and t, y, x;
#define GG2_3(x, y, z, o, t) \
	eor o, o, t;

#define FF2_1(x, y, z, o, t) \
	eor o, x, y;
#define FF2_2(x, y, z, o, t) \
	and t, x, y; \
	and o, o, z;
#define FF2_3(x, y, z, o, t) \
	eor o, o, t;

/*
 * One compression round.
 *
 *   i          selects the boolean function pair (FF<i>/GG<i>)
 *   a..h       working variable registers for this round; the caller
 *              rotates the register names between rounds instead of
 *              moving values
 *   k          register holding this round's constant (clobbered)
 *   K_LOAD     hook invoked as K_LOAD(round) before the round (KL or _)
 *   round/widx select the stack slots of the message words via
 *              <wtype>_W1_ADDR and <wtype>_W1W2_ADDR
 *   IOP        hook invoked as IOP(1..8, iop_param); used to interleave
 *              message loading/scheduling steps with the scalar code
 *
 * On exit d holds FF(a,b,c) + d + (k ^ rol(a,12)) + w1w2, h holds
 * P0(GG(e,f,g) + h + k + w1), b is rotated left by 9 and f by 19, where
 * k is the value rol(rol(a,12) + e + K, 7) computed from the round constant.
 * Clobbers t0-t6.
 */
#define R(i, a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	K_LOAD(round); \
	ldr t5, [sp, #(wtype##_W1_ADDR(round, widx))]; \
	rolw(t0, a, 12); /* rol(a, 12) => t0 */ \
	IOP(1, iop_param); \
	FF##i##_1(a, b, c, t1, t2); \
	ldr t6, [sp, #(wtype##_W1W2_ADDR(round, widx))]; \
	add k, k, e; \
	IOP(2, iop_param); \
	GG##i##_1(e, f, g, t3, t4); \
	FF##i##_2(a, b, c, t1, t2); \
	IOP(3, iop_param); \
	add k, k, t0; \
	add h, h, t5; \
	add d, d, t6; /* w1w2 + d => d */ \
	IOP(4, iop_param); \
	rolw(k, k, 7); /* rol(t0 + e + k, 7) => k */ \
	GG##i##_2(e, f, g, t3, t4); \
	add h, h, k; /* h + w1 + k => h */ \
	IOP(5, iop_param); \
	FF##i##_3(a, b, c, t1, t2); \
	eor t0, t0, k; /* k ^ t0 => t0 */ \
	GG##i##_3(e, f, g, t3, t4); \
	add d, d, t1; /* FF(a,b,c) + d => d */ \
	IOP(6, iop_param); \
	add t3, t3, h; /* GG(e,f,g) + h => t3 */ \
	rolw(b, b, 9); /* rol(b, 9) => b */ \
	eor h, t3, t3, ror #(32-9); \
	IOP(7, iop_param); \
	add d, d, t0; /* t0 + d => d */ \
	rolw(f, f, 19); /* rol(f, 19) => f */ \
	IOP(8, iop_param); \
	eor h, h, t3, ror #(32-17); /* P0(t3) => h */

/* Round using the FF1/GG1 boolean functions. */
#define R1(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	R(1, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)

/* Round using the FF2/GG2 boolean functions. */
#define R2(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	R(2, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)

/* Load the constants for rounds 'round' and 'round' + 1 from the K table. */
#define KL(round) \
	ldp k_even, k_odd, [RKPTR, #(4*(round))];

/* Input expansion macros. */

/* Byte-swapped input address.
*/
/* Slots are grouped four rounds per 64-byte row; widx picks the word. */
#define IW_W_ADDR(round, widx, offs) \
	(STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))

/* Expanded input address. */
/* Slots are grouped three rounds per row, alternating between two rows. */
#define XW_W_ADDR(round, widx, offs) \
	(STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))

/* Rounds 1-12, byte-swapped input block addresses. */
#define IW_W1_ADDR(round, widx)   IW_W_ADDR(round, widx, 32)
#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 48)

/* Rounds 13-64, expanded input block addresses. */
#define XW_W1_ADDR(round, widx)   XW_W_ADDR(round, widx, 0)
#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 16)

/* Input block loading.
 * Interleaving within round function needed for in-order CPUs.
*/

/*
 * LOAD_W_VEC_<group>_<step>: 24 single-step pieces that together read one
 * 64-byte block from RDATA (post-incrementing it), byte-swap each 32-bit
 * word, store the words and the XOR of adjacent vectors to the IW stack
 * slots, and arrange the sixteen words in W0-W5.
 * Clobbers addr0, addr1 and XTMP0-XTMP6.
 */
#define LOAD_W_VEC_1_1() \
	add addr0, sp, #IW_W1_ADDR(0, 0);
#define LOAD_W_VEC_1_2() \
	add addr1, sp, #IW_W1_ADDR(4, 0);
#define LOAD_W_VEC_1_3() \
	ld1 {W0.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_4() \
	ld1 {W1.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_5() \
	ld1 {W2.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_6() \
	ld1 {W3.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_7() \
	rev32 XTMP0.16b, W0.16b;
#define LOAD_W_VEC_1_8() \
	rev32 XTMP1.16b, W1.16b;
#define LOAD_W_VEC_2_1() \
	rev32 XTMP2.16b, W2.16b;
#define LOAD_W_VEC_2_2() \
	rev32 XTMP3.16b, W3.16b;
#define LOAD_W_VEC_2_3() \
	eor XTMP4.16b, XTMP1.16b, XTMP0.16b;
#define LOAD_W_VEC_2_4() \
	eor XTMP5.16b, XTMP2.16b, XTMP1.16b;
#define LOAD_W_VEC_2_5() \
	st1 {XTMP0.16b}, [addr0], #16;
#define LOAD_W_VEC_2_6() \
	st1 {XTMP4.16b}, [addr0]; \
	add addr0, sp, #IW_W1_ADDR(8, 0);
#define LOAD_W_VEC_2_7() \
	eor XTMP6.16b, XTMP3.16b, XTMP2.16b;
#define LOAD_W_VEC_2_8() \
	ext W0.16b, XTMP0.16b, XTMP0.16b, #8; /* W0: xx, w0, xx, xx */
#define LOAD_W_VEC_3_1() \
	mov W2.16b, XTMP1.16b; /* W2: xx, w6, w5, w4 */
#define LOAD_W_VEC_3_2() \
	st1 {XTMP1.16b}, [addr1], #16;
#define LOAD_W_VEC_3_3() \
	st1 {XTMP5.16b}, [addr1]; \
	ext W1.16b, XTMP0.16b, XTMP0.16b, #4; /* W1: xx, w3, w2, w1 */
#define LOAD_W_VEC_3_4() \
	ext W3.16b, XTMP1.16b, XTMP2.16b, #12; /* W3: xx, w9, w8, w7 */
#define LOAD_W_VEC_3_5() \
	ext W4.16b, XTMP2.16b, XTMP3.16b, #8; /* W4: xx, w12, w11, w10 */
#define LOAD_W_VEC_3_6() \
	st1 {XTMP2.16b}, [addr0], #16;
#define LOAD_W_VEC_3_7() \
	st1 {XTMP6.16b}, [addr0];
#define LOAD_W_VEC_3_8() \
	ext W5.16b, XTMP3.16b, XTMP3.16b, #4; /* W5: xx, w15, w14, w13 */

/* IOP-compatible dispatchers: select step 'iop_num' of the given group. */
#define LOAD_W_VEC_1(iop_num, ...) \
	LOAD_W_VEC_1_##iop_num()
#define LOAD_W_VEC_2(iop_num, ...) \
	LOAD_W_VEC_2_##iop_num()
#define LOAD_W_VEC_3(iop_num, ...) \
	LOAD_W_VEC_3_##iop_num()

/* Message scheduling. Note: 3 words per vector register.
 * Interleaving within round function needed for in-order CPUs.
*/

/*
 * SCHED_W_<group>_<step>: 24 single-step pieces that together expand three
 * new message words. w0..w5 name the six message vectors, oldest first;
 * w0 is overwritten with the new words:
 *
 *   x  = w[i - 16] ^ w[i - 9] ^ rol(w[i - 3], 15)
 *   w0 = x ^ rol(x, 15) ^ rol(x, 23) ^ rol(w[i - 13], 7) ^ w[i - 6]
 *
 * The final steps store two vectors to the XW stack slots for 'round':
 * the words selected from w4/w5, and those words XORed with the new w0.
 * Clobbers addr0 and XTMP0-XTMP5.
 */
#define SCHED_W_1_1(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 16]) => XTMP0 */ \
	/* Load (w[i - 13]) => XTMP5 */ \
	ext XTMP0.16b, w0.16b, w0.16b, #12; /* XTMP0: w0, xx, xx, xx */
#define SCHED_W_1_2(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP5.16b, w1.16b, w1.16b, #12;
#define SCHED_W_1_3(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP0.16b, XTMP0.16b, w1.16b, #12; /* XTMP0: xx, w2, w1, w0 */
#define SCHED_W_1_4(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP5.16b, XTMP5.16b, w2.16b, #12;
#define SCHED_W_1_5(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 9] == w3 */ \
	/* W3 ^ XTMP0 => XTMP0 */ \
	eor XTMP0.16b, XTMP0.16b, w3.16b;
#define SCHED_W_1_6(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 3] == w5 */ \
	/* rol(w5, 15) ^ XTMP0 => XTMP0 */ \
	/* rol(XTMP5, 7) => XTMP1 */ \
	add addr0, sp, #XW_W1_ADDR((round), 0); \
	shl XTMP2.4s, w5.4s, #15;
#define SCHED_W_1_7(round, w0, w1, w2, w3, w4, w5) \
	shl XTMP1.4s, XTMP5.4s, #7;
#define SCHED_W_1_8(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP2.4s, w5.4s, #(32-15);
#define SCHED_W_2_1(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP1.4s, XTMP5.4s, #(32-7);
#define SCHED_W_2_2(round, w0, w1, w2, w3, w4, w5) \
	eor XTMP0.16b, XTMP0.16b, XTMP2.16b;
#define SCHED_W_2_3(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 6] == W4 */ \
	/* W4 ^ XTMP1 => XTMP1 */ \
	eor XTMP1.16b, XTMP1.16b, w4.16b;
#define SCHED_W_2_4(round, w0, w1, w2, w3, w4, w5) \
	/* P1(XTMP0) ^ XTMP1 => W0 */ \
	shl XTMP3.4s, XTMP0.4s, #15;
#define SCHED_W_2_5(round, w0, w1, w2, w3, w4, w5) \
	shl XTMP4.4s, XTMP0.4s, #23;
#define SCHED_W_2_6(round, w0, w1, w2, w3, w4, w5) \
	eor w0.16b, XTMP1.16b, XTMP0.16b;
#define SCHED_W_2_7(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP3.4s, XTMP0.4s, #(32-15);
#define SCHED_W_2_8(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP4.4s, XTMP0.4s, #(32-23);
#define SCHED_W_3_1(round, w0, w1, w2, w3, w4, w5) \
	eor w0.16b, w0.16b, XTMP3.16b;
#define SCHED_W_3_2(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 3]) => XTMP2 */ \
	ext XTMP2.16b, w4.16b, w4.16b, #12;
#define SCHED_W_3_3(round, w0, w1, w2, w3, w4, w5) \
	eor w0.16b, w0.16b, XTMP4.16b;
#define SCHED_W_3_4(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP2.16b, XTMP2.16b, w5.16b, #12;
#define SCHED_W_3_5(round, w0, w1, w2, w3, w4, w5) \
	/* W1 ^ W2 => XTMP3 */ \
	eor XTMP3.16b, XTMP2.16b, w0.16b;
#define SCHED_W_3_6(round, w0, w1, w2, w3, w4, w5)
#define SCHED_W_3_7(round, w0, w1, w2, w3, w4, w5) \
	st1 {XTMP2.16b-XTMP3.16b}, [addr0];
#define SCHED_W_3_8(round, w0, w1, w2, w3, w4, w5)

/*
 * IOP-compatible dispatchers for the message schedule. The name encodes the
 * rotation of W0-W5 passed to SCHED_W_<group>_<step> (oldest vector first);
 * the trailing digit selects the group and 'iop_num' the step within it.
 */
#define SCHED_W_W0W1W2W3W4W5_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W0, W1, W2, W3, W4, W5)

#define SCHED_W_W1W2W3W4W5W0_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W1, W2, W3, W4, W5, W0)

#define SCHED_W_W2W3W4W5W0W1_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W2, W3, W4, W5, W0, W1)

#define SCHED_W_W3W4W5W0W1W2_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W3, W4, W5, W0, W1, W2)

#define SCHED_W_W4W5W0W1W2W3_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W4, W5, W0, W1, W2, W3)

#define SCHED_W_W5W0W1W2W3W4_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4)


	/*
	 * Transform blocks*64 bytes (blocks*16 32-bit words) at 'src'.
	 *
	 * void sm3_neon_transform(struct sm3_state *sst, u8 const *src,
	 *                         int blocks)
	 *
	 * In:   RSTATE = sst, RDATA = src, RNBLKS = blocks.
	 *       'blocks' must be at least 1: the first block is processed
	 *       before the count is decremented and tested for zero.
	 * Out:  the eight chaining words at 'sst' are updated in place.
	 * Uses: x19-x26, x28 and x29 are saved on entry and restored on exit.
	 *       sp is lowered by STACK_SIZE and aligned down to 64 bytes for
	 *       the message word scratch area; x29 (RFRAME) keeps the sp
	 *       value to restore.
	 *       Before returning, the vector registers used here and the
	 *       stack scratch area are overwritten with zeroes.
	 */
	.text
.align 3
SYM_TYPED_FUNC_START(sm3_neon_transform)
	/* Load the chaining value into the working variables. */
	ldp ra, rb, [RSTATE, #0]
	ldp rc, rd, [RSTATE, #8]
	ldp re, rf, [RSTATE, #16]
	ldp rg, rh, [RSTATE, #24]

	stp x28, x29, [sp, #-16]!
	stp x19, x20, [sp, #-16]!
	stp x21, x22, [sp, #-16]!
	stp x23, x24, [sp, #-16]!
	stp x25, x26, [sp, #-16]!
	mov RFRAME, sp

	/* Allocate the 64-byte aligned scratch area below the saved regs. */
	sub addr0, sp, #STACK_SIZE
	adr_l RKPTR, .LKtable
	and sp, addr0, #(~63)

	/* Preload first block. */
	LOAD_W_VEC_1(1, 0)
	LOAD_W_VEC_1(2, 0)
	LOAD_W_VEC_1(3, 0)
	LOAD_W_VEC_1(4, 0)
	LOAD_W_VEC_1(5, 0)
	LOAD_W_VEC_1(6, 0)
	LOAD_W_VEC_1(7, 0)
	LOAD_W_VEC_1(8, 0)
	LOAD_W_VEC_2(1, 0)
	LOAD_W_VEC_2(2, 0)
	LOAD_W_VEC_2(3, 0)
	LOAD_W_VEC_2(4, 0)
	LOAD_W_VEC_2(5, 0)
	LOAD_W_VEC_2(6, 0)
	LOAD_W_VEC_2(7, 0)
	LOAD_W_VEC_2(8, 0)
	LOAD_W_VEC_3(1, 0)
	LOAD_W_VEC_3(2, 0)
	LOAD_W_VEC_3(3, 0)
	LOAD_W_VEC_3(4, 0)
	LOAD_W_VEC_3(5, 0)
	LOAD_W_VEC_3(6, 0)
	LOAD_W_VEC_3(7, 0)
	LOAD_W_VEC_3(8, 0)

.balign 16
.Loop:
	/* Transform 0-3 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 0, 0, IW, _, 0)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  1, 1, IW, _, 0)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 2, 2, IW, _, 0)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  3, 3, IW, _, 0)

	/* Transform 4-7 + Precalc 12-14 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 4, 0, IW, _, 0)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  5, 1, IW, _, 0)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 6, 2, IW, SCHED_W_W0W1W2W3W4W5_1, 12)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  7, 3, IW, SCHED_W_W0W1W2W3W4W5_2, 12)

	/* Transform 8-11 + Precalc 12-17 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 8, 0, IW, SCHED_W_W0W1W2W3W4W5_3, 12)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  9, 1, IW, SCHED_W_W1W2W3W4W5W0_1, 15)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 10, 2, IW, SCHED_W_W1W2W3W4W5W0_2, 15)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  11, 3, IW, SCHED_W_W1W2W3W4W5W0_3, 15)

	/* Transform 12-14 + Precalc 18-20 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 12, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 18)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  13, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 18)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 14, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 18)

	/* Transform 15-17 + Precalc 21-23 */
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  15, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 21)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 16, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 21)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  17, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 21)

	/* Transform 18-20 + Precalc 24-26 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 18, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 24)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  19, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 24)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 20, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 24)

	/* Transform 21-23 + Precalc 27-29 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  21, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 27)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 22, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 27)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  23, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 27)

	/* Transform 24-26 + Precalc 30-32 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 24, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 30)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  25, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 30)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 26, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 30)

	/* Transform 27-29 + Precalc 33-35 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  27, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 33)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 28, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 33)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  29, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 33)

	/* Transform 30-32 + Precalc 36-38 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 30, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 36)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  31, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 36)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 32, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 36)

	/* Transform 33-35 + Precalc 39-41 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  33, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 39)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 34, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 39)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  35, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 39)

	/* Transform 36-38 + Precalc 42-44 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 36, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 42)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  37, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 42)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 38, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 42)

	/* Transform 39-41 + Precalc 45-47 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  39, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 45)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 40, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 45)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  41, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 45)

	/* Transform 42-44 + Precalc 48-50 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 42, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 48)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  43, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 48)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 44, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 48)

	/* Transform 45-47 + Precalc 51-53 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  45, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 51)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 46, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 51)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  47, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 51)

	/* Transform 48-50 + Precalc 54-56 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 48, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 54)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  49, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 54)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 50, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 54)

	/* Transform 51-53 + Precalc 57-59 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  51, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 57)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 52, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 57)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  53, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 57)

	/* Transform 54-56 + Precalc 60-62 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 54, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 60)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  55, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 60)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 56, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 60)

	/* Transform 57-59 + Precalc 63 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  57, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 63)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 58, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 63)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  59, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 63)

	/* Transform 60 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 60, 0, XW, _, _)
	/* Last block? Then finish without preloading further input. */
	subs RNBLKS, RNBLKS, #1
	b.eq .Lend

	/* Transform 61-63 + Preload next block */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  61, 1, XW, LOAD_W_VEC_1, _)
	ldp s0, s1, [RSTATE, #0]
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, LOAD_W_VEC_2, _)
	ldp s2, s3, [RSTATE, #8]
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  63, 0, XW, LOAD_W_VEC_3, _)

	/* Update the chaining variables. */
	eor ra, ra, s0
	eor rb, rb, s1
	ldp s0, s1, [RSTATE, #16]
	eor rc, rc, s2
	/* k_even/k_odd are free here and serve as extra temporaries. */
	ldp k_even, k_odd, [RSTATE, #24]
	eor rd, rd, s3
	eor re, re, s0
	stp ra, rb, [RSTATE, #0]
	eor rf, rf, s1
	stp rc, rd, [RSTATE, #8]
	eor rg, rg, k_even
	stp re, rf, [RSTATE, #16]
	eor rh, rh, k_odd
	stp rg, rh, [RSTATE, #24]
	b .Loop

.Lend:
	/* Transform 61-63 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  61, 1, XW, _, _)
	ldp s0, s1, [RSTATE, #0]
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, _, _)
	ldp s2, s3, [RSTATE, #8]
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  63, 0, XW, _, _)

	/* Update the chaining variables. */
	/* The interleaved clear_vec calls zero every vector register used. */
	eor ra, ra, s0
	clear_vec(W0)
	eor rb, rb, s1
	clear_vec(W1)
	ldp s0, s1, [RSTATE, #16]
	clear_vec(W2)
	eor rc, rc, s2
	clear_vec(W3)
	ldp k_even, k_odd, [RSTATE, #24]
	clear_vec(W4)
	eor rd, rd, s3
	clear_vec(W5)
	eor re, re, s0
	clear_vec(XTMP0)
	stp ra, rb, [RSTATE, #0]
	clear_vec(XTMP1)
	eor rf, rf, s1
	clear_vec(XTMP2)
	stp rc, rd, [RSTATE, #8]
	clear_vec(XTMP3)
	eor rg, rg, k_even
	clear_vec(XTMP4)
	stp re, rf, [RSTATE, #16]
	clear_vec(XTMP5)
	eor rh, rh, k_odd
	clear_vec(XTMP6)
	stp rg, rh, [RSTATE, #24]

	/* Clear message expansion area */
	/* W0-W3 are zero here; three 64-byte stores cover the scratch area. */
	add addr0, sp, #STACK_W
	st1 {W0.16b-W3.16b}, [addr0], #64
	st1 {W0.16b-W3.16b}, [addr0], #64
	st1 {W0.16b-W3.16b}, [addr0]

	mov sp, RFRAME

	ldp x25, x26, [sp], #16
	ldp x23, x24, [sp], #16
	ldp x21, x22, [sp], #16
	ldp x19, x20, [sp], #16
	ldp x28, x29, [sp], #16

	ret
SYM_FUNC_END(sm3_neon_transform)


	.section	".rodata", "a"

	/* Round constants, one 32-bit word per round, indexed by KL(). */
	.align 4
.LKtable:
	.long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb
	.long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc
	.long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce
	.long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6
	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
	.long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53
	.long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d
	.long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4
	.long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43
	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5