/*
 * RISC-V Vector Crypto Extension Helpers for QEMU.
 *
 * Copyright (C) 2023 SiFive, Inc.
 * Written by Codethink Ltd and SiFive.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2 or later, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/host-utils.h"
#include "qemu/bitops.h"
#include "qemu/bswap.h"
#include "cpu.h"
#include "crypto/aes.h"
#include "crypto/aes-round.h"
#include "crypto/sm4.h"
#include "exec/memop.h"
#include "exec/exec-all.h"
#include "exec/helper-proto.h"
#include "internals.h"
#include "vector_internals.h"

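/*
 * Carry-less multiply helpers for Zvbc.  The carry-less product of two
 * 64-bit values is 128 bits wide: clmul64() returns the low half (vclmul)
 * and clmulh64() the high half (vclmulh).  Both are simple shift-and-XOR
 * loops over the bits of the first operand.
 */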
static uint64_t clmul64(uint64_t y, uint64_t x)
{
    uint64_t result = 0;
    for (int j = 63; j >= 0; j--) {
        if ((y >> j) & 1) {
            result ^= (x << j);
        }
    }
    return result;
}

static uint64_t clmulh64(uint64_t y, uint64_t x)
{
    uint64_t result = 0;
    for (int j = 63; j >= 1; j--) {
        if ((y >> j) & 1) {
            result ^= (x >> (64 - j));
        }
    }
    return result;
}

RVVCALL(OPIVV2, vclmul_vv, OP_UUU_D, H8, H8, H8, clmul64)
GEN_VEXT_VV(vclmul_vv, 8)
RVVCALL(OPIVX2, vclmul_vx, OP_UUU_D, H8, H8, clmul64)
GEN_VEXT_VX(vclmul_vx, 8)
RVVCALL(OPIVV2, vclmulh_vv, OP_UUU_D, H8, H8, H8, clmulh64)
GEN_VEXT_VV(vclmulh_vv, 8)
RVVCALL(OPIVX2, vclmulh_vx, OP_UUU_D, H8, H8, clmulh64)
GEN_VEXT_VX(vclmulh_vx, 8)

RVVCALL(OPIVV2, vror_vv_b, OP_UUU_B, H1, H1, H1, ror8)
RVVCALL(OPIVV2, vror_vv_h, OP_UUU_H, H2, H2, H2, ror16)
RVVCALL(OPIVV2, vror_vv_w, OP_UUU_W, H4, H4, H4, ror32)
RVVCALL(OPIVV2, vror_vv_d, OP_UUU_D, H8, H8, H8, ror64)
GEN_VEXT_VV(vror_vv_b, 1)
GEN_VEXT_VV(vror_vv_h, 2)
GEN_VEXT_VV(vror_vv_w, 4)
GEN_VEXT_VV(vror_vv_d, 8)

RVVCALL(OPIVX2, vror_vx_b, OP_UUU_B, H1, H1, ror8)
RVVCALL(OPIVX2, vror_vx_h, OP_UUU_H, H2, H2, ror16)
RVVCALL(OPIVX2, vror_vx_w, OP_UUU_W, H4, H4, ror32)
RVVCALL(OPIVX2, vror_vx_d, OP_UUU_D, H8, H8, ror64)
GEN_VEXT_VX(vror_vx_b, 1)
GEN_VEXT_VX(vror_vx_h, 2)
GEN_VEXT_VX(vror_vx_w, 4)
GEN_VEXT_VX(vror_vx_d, 8)

RVVCALL(OPIVV2, vrol_vv_b, OP_UUU_B, H1, H1, H1, rol8)
RVVCALL(OPIVV2, vrol_vv_h, OP_UUU_H, H2, H2, H2, rol16)
RVVCALL(OPIVV2, vrol_vv_w, OP_UUU_W, H4, H4, H4, rol32)
RVVCALL(OPIVV2, vrol_vv_d, OP_UUU_D, H8, H8, H8, rol64)
GEN_VEXT_VV(vrol_vv_b, 1)
GEN_VEXT_VV(vrol_vv_h, 2)
GEN_VEXT_VV(vrol_vv_w, 4)
GEN_VEXT_VV(vrol_vv_d, 8)

RVVCALL(OPIVX2, vrol_vx_b, OP_UUU_B, H1, H1, rol8)
RVVCALL(OPIVX2, vrol_vx_h, OP_UUU_H, H2, H2, rol16)
RVVCALL(OPIVX2, vrol_vx_w, OP_UUU_W, H4, H4, rol32)
RVVCALL(OPIVX2, vrol_vx_d, OP_UUU_D, H8, H8, rol64)
GEN_VEXT_VX(vrol_vx_b, 1)
GEN_VEXT_VX(vrol_vx_h, 2)
GEN_VEXT_VX(vrol_vx_w, 4)
GEN_VEXT_VX(vrol_vx_d, 8)

static uint64_t brev8(uint64_t val)
{
    val = ((val & 0x5555555555555555ull) << 1) |
          ((val & 0xAAAAAAAAAAAAAAAAull) >> 1);
    val = ((val & 0x3333333333333333ull) << 2) |
          ((val & 0xCCCCCCCCCCCCCCCCull) >> 2);
    val = ((val & 0x0F0F0F0F0F0F0F0Full) << 4) |
          ((val & 0xF0F0F0F0F0F0F0F0ull) >> 4);

    return val;
}

RVVCALL(OPIVV1, vbrev8_v_b, OP_UU_B, H1, H1, brev8)
RVVCALL(OPIVV1, vbrev8_v_h, OP_UU_H, H2, H2, brev8)
RVVCALL(OPIVV1, vbrev8_v_w, OP_UU_W, H4, H4, brev8)
RVVCALL(OPIVV1, vbrev8_v_d, OP_UU_D, H8, H8, brev8)
GEN_VEXT_V(vbrev8_v_b, 1)
GEN_VEXT_V(vbrev8_v_h, 2)
GEN_VEXT_V(vbrev8_v_w, 4)
GEN_VEXT_V(vbrev8_v_d, 8)

#define DO_IDENTITY(a) (a)
RVVCALL(OPIVV1, vrev8_v_b, OP_UU_B, H1, H1, DO_IDENTITY)
RVVCALL(OPIVV1, vrev8_v_h, OP_UU_H, H2, H2, bswap16)
RVVCALL(OPIVV1, vrev8_v_w, OP_UU_W, H4, H4, bswap32)
RVVCALL(OPIVV1, vrev8_v_d, OP_UU_D, H8, H8, bswap64)
GEN_VEXT_V(vrev8_v_b, 1)
GEN_VEXT_V(vrev8_v_h, 2)
GEN_VEXT_V(vrev8_v_w, 4)
GEN_VEXT_V(vrev8_v_d, 8)

#define DO_ANDN(a, b) ((a) & ~(b))
RVVCALL(OPIVV2, vandn_vv_b, OP_UUU_B, H1, H1, H1, DO_ANDN)
RVVCALL(OPIVV2, vandn_vv_h, OP_UUU_H, H2, H2, H2, DO_ANDN)
RVVCALL(OPIVV2, vandn_vv_w, OP_UUU_W, H4, H4, H4, DO_ANDN)
RVVCALL(OPIVV2, vandn_vv_d, OP_UUU_D, H8, H8, H8, DO_ANDN)
GEN_VEXT_VV(vandn_vv_b, 1)
GEN_VEXT_VV(vandn_vv_h, 2)
GEN_VEXT_VV(vandn_vv_w, 4)
GEN_VEXT_VV(vandn_vv_d, 8)

RVVCALL(OPIVX2, vandn_vx_b, OP_UUU_B, H1, H1, DO_ANDN)
RVVCALL(OPIVX2, vandn_vx_h, OP_UUU_H, H2, H2, DO_ANDN)
RVVCALL(OPIVX2, vandn_vx_w, OP_UUU_W, H4, H4, DO_ANDN)
RVVCALL(OPIVX2, vandn_vx_d, OP_UUU_D, H8, H8, DO_ANDN)
GEN_VEXT_VX(vandn_vx_b, 1)
GEN_VEXT_VX(vandn_vx_h, 2)
GEN_VEXT_VX(vandn_vx_w, 4)
GEN_VEXT_VX(vandn_vx_d, 8)

RVVCALL(OPIVV1, vbrev_v_b, OP_UU_B, H1, H1, revbit8)
RVVCALL(OPIVV1, vbrev_v_h, OP_UU_H, H2, H2, revbit16)
RVVCALL(OPIVV1, vbrev_v_w, OP_UU_W, H4, H4, revbit32)
RVVCALL(OPIVV1, vbrev_v_d, OP_UU_D, H8, H8, revbit64)
GEN_VEXT_V(vbrev_v_b, 1)
GEN_VEXT_V(vbrev_v_h, 2)
GEN_VEXT_V(vbrev_v_w, 4)
GEN_VEXT_V(vbrev_v_d, 8)

RVVCALL(OPIVV1, vclz_v_b, OP_UU_B, H1, H1, clz8)
RVVCALL(OPIVV1, vclz_v_h, OP_UU_H, H2, H2, clz16)
RVVCALL(OPIVV1, vclz_v_w, OP_UU_W, H4, H4, clz32)
RVVCALL(OPIVV1, vclz_v_d, OP_UU_D, H8, H8, clz64)
GEN_VEXT_V(vclz_v_b, 1)
GEN_VEXT_V(vclz_v_h, 2)
GEN_VEXT_V(vclz_v_w, 4)
GEN_VEXT_V(vclz_v_d, 8)

RVVCALL(OPIVV1, vctz_v_b, OP_UU_B, H1, H1, ctz8)
RVVCALL(OPIVV1, vctz_v_h, OP_UU_H, H2, H2, ctz16)
RVVCALL(OPIVV1, vctz_v_w, OP_UU_W, H4, H4, ctz32)
RVVCALL(OPIVV1, vctz_v_d, OP_UU_D, H8, H8, ctz64)
GEN_VEXT_V(vctz_v_b, 1)
GEN_VEXT_V(vctz_v_h, 2)
GEN_VEXT_V(vctz_v_w, 4)
GEN_VEXT_V(vctz_v_d, 8)

RVVCALL(OPIVV1, vcpop_v_b, OP_UU_B, H1, H1, ctpop8)
RVVCALL(OPIVV1, vcpop_v_h, OP_UU_H, H2, H2, ctpop16)
RVVCALL(OPIVV1, vcpop_v_w, OP_UU_W, H4, H4, ctpop32)
RVVCALL(OPIVV1, vcpop_v_d, OP_UU_D, H8, H8, ctpop64)
GEN_VEXT_V(vcpop_v_b, 1)
GEN_VEXT_V(vcpop_v_h, 2)
GEN_VEXT_V(vcpop_v_w, 4)
GEN_VEXT_V(vcpop_v_d, 8)

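/*
 * Zvbb vwsll: widening shift-left logical.  The destination elements are
 * twice SEW wide, and DO_SLL masks the shift amount to the width of the
 * widened type, i.e. the shift count is effectively taken modulo 2*SEW.
 */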
#define DO_SLL(N, M) (N << (M & (sizeof(N) * 8 - 1)))
RVVCALL(OPIVV2, vwsll_vv_b, WOP_UUU_B, H2, H1, H1, DO_SLL)
RVVCALL(OPIVV2, vwsll_vv_h, WOP_UUU_H, H4, H2, H2, DO_SLL)
RVVCALL(OPIVV2, vwsll_vv_w, WOP_UUU_W, H8, H4, H4, DO_SLL)
GEN_VEXT_VV(vwsll_vv_b, 2)
GEN_VEXT_VV(vwsll_vv_h, 4)
GEN_VEXT_VV(vwsll_vv_w, 8)

RVVCALL(OPIVX2, vwsll_vx_b, WOP_UUU_B, H2, H1, DO_SLL)
RVVCALL(OPIVX2, vwsll_vx_h, WOP_UUU_H, H4, H2, DO_SLL)
RVVCALL(OPIVX2, vwsll_vx_w, WOP_UUU_W, H8, H4, DO_SLL)
GEN_VEXT_VX(vwsll_vx_b, 2)
GEN_VEXT_VX(vwsll_vx_h, 4)
GEN_VEXT_VX(vwsll_vx_w, 8)

void HELPER(egs_check)(uint32_t egs, CPURISCVState *env)
{
    uint32_t vl = env->vl;
    uint32_t vstart = env->vstart;

    if (vl % egs != 0 || vstart % egs != 0) {
        riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
    }
}

static inline void xor_round_key(AESState *round_state, AESState *round_key)
{
    round_state->v = round_state->v ^ round_key->v;
}

#define GEN_ZVKNED_HELPER_VV(NAME, ...)                                   \
void HELPER(NAME)(void *vd, void *vs2, CPURISCVState *env,                \
                  uint32_t desc)                                          \
{                                                                         \
    uint32_t vl = env->vl;                                                \
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);            \
    uint32_t vta = vext_vta(desc);                                        \
                                                                          \
    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {            \
        AESState round_key;                                               \
        round_key.d[0] = *((uint64_t *)vs2 + H8(i * 2 + 0));              \
        round_key.d[1] = *((uint64_t *)vs2 + H8(i * 2 + 1));              \
        AESState round_state;                                             \
        round_state.d[0] = *((uint64_t *)vd + H8(i * 2 + 0));             \
        round_state.d[1] = *((uint64_t *)vd + H8(i * 2 + 1));             \
        __VA_ARGS__;                                                      \
        *((uint64_t *)vd + H8(i * 2 + 0)) = round_state.d[0];             \
        *((uint64_t *)vd + H8(i * 2 + 1)) = round_state.d[1];             \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);                  \
}

#define GEN_ZVKNED_HELPER_VS(NAME, ...)                                   \
void HELPER(NAME)(void *vd, void *vs2, CPURISCVState *env,                \
                  uint32_t desc)                                          \
{                                                                         \
    uint32_t vl = env->vl;                                                \
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);            \
    uint32_t vta = vext_vta(desc);                                        \
                                                                          \
    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {            \
        AESState round_key;                                               \
        round_key.d[0] = *((uint64_t *)vs2 + H8(0));                      \
        round_key.d[1] = *((uint64_t *)vs2 + H8(1));                      \
        AESState round_state;                                             \
        round_state.d[0] = *((uint64_t *)vd + H8(i * 2 + 0));             \
        round_state.d[1] = *((uint64_t *)vd + H8(i * 2 + 1));             \
        __VA_ARGS__;                                                      \
        *((uint64_t *)vd + H8(i * 2 + 0)) = round_state.d[0];             \
        *((uint64_t *)vd + H8(i * 2 + 1)) = round_state.d[1];             \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);                  \
}

GEN_ZVKNED_HELPER_VV(vaesef_vv, aesenc_SB_SR_AK(&round_state,
                                                &round_state,
                                                &round_key,
                                                false);)
GEN_ZVKNED_HELPER_VS(vaesef_vs, aesenc_SB_SR_AK(&round_state,
                                                &round_state,
                                                &round_key,
                                                false);)
GEN_ZVKNED_HELPER_VV(vaesdf_vv, aesdec_ISB_ISR_AK(&round_state,
                                                  &round_state,
                                                  &round_key,
                                                  false);)
GEN_ZVKNED_HELPER_VS(vaesdf_vs, aesdec_ISB_ISR_AK(&round_state,
                                                  &round_state,
                                                  &round_key,
                                                  false);)
GEN_ZVKNED_HELPER_VV(vaesem_vv, aesenc_SB_SR_MC_AK(&round_state,
                                                   &round_state,
                                                   &round_key,
                                                   false);)
GEN_ZVKNED_HELPER_VS(vaesem_vs, aesenc_SB_SR_MC_AK(&round_state,
                                                   &round_state,
                                                   &round_key,
                                                   false);)
GEN_ZVKNED_HELPER_VV(vaesdm_vv, aesdec_ISB_ISR_AK_IMC(&round_state,
                                                      &round_state,
                                                      &round_key,
                                                      false);)
GEN_ZVKNED_HELPER_VS(vaesdm_vs, aesdec_ISB_ISR_AK_IMC(&round_state,
                                                      &round_state,
                                                      &round_key,
                                                      false);)
GEN_ZVKNED_HELPER_VS(vaesz_vs, xor_round_key(&round_state, &round_key);)

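/*
 * vaeskf1.vi: one step of the AES-128 forward key schedule.  Each element
 * group holds round-key words rk[0..3] and produces the next four words:
 *   rk[4] = rk[0] ^ SubWord(RotWord(rk[3])) ^ rcon[round]
 *   rk[5] = rk[1] ^ rk[4],  rk[6] = rk[2] ^ rk[5],  rk[7] = rk[3] ^ rk[6]
 * Out-of-range round immediates get bit 3 flipped first, which is presumably
 * the same remapping the Zvkned specification applies to reserved round
 * numbers.  vaeskf2.vi below is the AES-256 equivalent, alternating between
 * the RotWord/Rcon form (even rounds) and the plain SubWord form (odd
 * rounds).
 */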
void HELPER(vaeskf1_vi)(void *vd_vptr, void *vs2_vptr, uint32_t uimm,
                        CPURISCVState *env, uint32_t desc)
{
    uint32_t *vd = vd_vptr;
    uint32_t *vs2 = vs2_vptr;
    uint32_t vl = env->vl;
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);
    uint32_t vta = vext_vta(desc);

    uimm &= 0b1111;
    if (uimm > 10 || uimm == 0) {
        uimm ^= 0b1000;
    }

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        uint32_t rk[8], tmp;
        static const uint32_t rcon[] = {
            0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010,
            0x00000020, 0x00000040, 0x00000080, 0x0000001B, 0x00000036,
        };

        rk[0] = vs2[i * 4 + H4(0)];
        rk[1] = vs2[i * 4 + H4(1)];
        rk[2] = vs2[i * 4 + H4(2)];
        rk[3] = vs2[i * 4 + H4(3)];
        tmp = ror32(rk[3], 8);

        rk[4] = rk[0] ^ (((uint32_t)AES_sbox[(tmp >> 24) & 0xff] << 24) |
                         ((uint32_t)AES_sbox[(tmp >> 16) & 0xff] << 16) |
                         ((uint32_t)AES_sbox[(tmp >> 8) & 0xff] << 8) |
                         ((uint32_t)AES_sbox[(tmp >> 0) & 0xff] << 0))
                      ^ rcon[uimm - 1];
        rk[5] = rk[1] ^ rk[4];
        rk[6] = rk[2] ^ rk[5];
        rk[7] = rk[3] ^ rk[6];

        vd[i * 4 + H4(0)] = rk[4];
        vd[i * 4 + H4(1)] = rk[5];
        vd[i * 4 + H4(2)] = rk[6];
        vd[i * 4 + H4(3)] = rk[7];
    }
    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);
}

void HELPER(vaeskf2_vi)(void *vd_vptr, void *vs2_vptr, uint32_t uimm,
                        CPURISCVState *env, uint32_t desc)
{
    uint32_t *vd = vd_vptr;
    uint32_t *vs2 = vs2_vptr;
    uint32_t vl = env->vl;
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);
    uint32_t vta = vext_vta(desc);

    uimm &= 0b1111;
    if (uimm > 14 || uimm < 2) {
        uimm ^= 0b1000;
    }

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        uint32_t rk[12], tmp;
        static const uint32_t rcon[] = {
            0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010,
            0x00000020, 0x00000040, 0x00000080, 0x0000001B, 0x00000036,
        };

        rk[0] = vd[i * 4 + H4(0)];
        rk[1] = vd[i * 4 + H4(1)];
        rk[2] = vd[i * 4 + H4(2)];
        rk[3] = vd[i * 4 + H4(3)];
        rk[4] = vs2[i * 4 + H4(0)];
        rk[5] = vs2[i * 4 + H4(1)];
        rk[6] = vs2[i * 4 + H4(2)];
        rk[7] = vs2[i * 4 + H4(3)];

        if (uimm % 2 == 0) {
            tmp = ror32(rk[7], 8);
            rk[8] = rk[0] ^ (((uint32_t)AES_sbox[(tmp >> 24) & 0xff] << 24) |
                             ((uint32_t)AES_sbox[(tmp >> 16) & 0xff] << 16) |
                             ((uint32_t)AES_sbox[(tmp >> 8) & 0xff] << 8) |
                             ((uint32_t)AES_sbox[(tmp >> 0) & 0xff] << 0))
                          ^ rcon[(uimm - 1) / 2];
        } else {
            rk[8] = rk[0] ^ (((uint32_t)AES_sbox[(rk[7] >> 24) & 0xff] << 24) |
                             ((uint32_t)AES_sbox[(rk[7] >> 16) & 0xff] << 16) |
                             ((uint32_t)AES_sbox[(rk[7] >> 8) & 0xff] << 8) |
                             ((uint32_t)AES_sbox[(rk[7] >> 0) & 0xff] << 0));
        }
        rk[9] = rk[1] ^ rk[8];
        rk[10] = rk[2] ^ rk[9];
        rk[11] = rk[3] ^ rk[10];

        vd[i * 4 + H4(0)] = rk[8];
        vd[i * 4 + H4(1)] = rk[9];
        vd[i * 4 + H4(2)] = rk[10];
        vd[i * 4 + H4(3)] = rk[11];
    }
    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);
}

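/*
 * SHA-2 message-schedule helpers (Zvknha/Zvknhb).  sig0/sig1 are the small
 * sigma functions from FIPS 180-4, e.g. for SHA-256:
 *   sigma0(x) = ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)
 *   sigma1(x) = ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)
 * vsha2ms produces four new schedule words per element group using the
 * standard recurrence
 *   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
 * with the required older words split across vd, vs1 and vs2.
 */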
static inline uint32_t sig0_sha256(uint32_t x)
{
    return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
}

static inline uint32_t sig1_sha256(uint32_t x)
{
    return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
}

static inline uint64_t sig0_sha512(uint64_t x)
{
    return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7);
}

static inline uint64_t sig1_sha512(uint64_t x)
{
    return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6);
}

static inline void vsha2ms_e32(uint32_t *vd, uint32_t *vs1, uint32_t *vs2)
{
    uint32_t res[4];
    res[0] = sig1_sha256(vs1[H4(2)]) + vs2[H4(1)] + sig0_sha256(vd[H4(1)]) +
             vd[H4(0)];
    res[1] = sig1_sha256(vs1[H4(3)]) + vs2[H4(2)] + sig0_sha256(vd[H4(2)]) +
             vd[H4(1)];
    res[2] =
        sig1_sha256(res[0]) + vs2[H4(3)] + sig0_sha256(vd[H4(3)]) + vd[H4(2)];
    res[3] =
        sig1_sha256(res[1]) + vs1[H4(0)] + sig0_sha256(vs2[H4(0)]) + vd[H4(3)];
    vd[H4(3)] = res[3];
    vd[H4(2)] = res[2];
    vd[H4(1)] = res[1];
    vd[H4(0)] = res[0];
}

static inline void vsha2ms_e64(uint64_t *vd, uint64_t *vs1, uint64_t *vs2)
{
    uint64_t res[4];
    res[0] = sig1_sha512(vs1[2]) + vs2[1] + sig0_sha512(vd[1]) + vd[0];
    res[1] = sig1_sha512(vs1[3]) + vs2[2] + sig0_sha512(vd[2]) + vd[1];
    res[2] = sig1_sha512(res[0]) + vs2[3] + sig0_sha512(vd[3]) + vd[2];
    res[3] = sig1_sha512(res[1]) + vs1[0] + sig0_sha512(vs2[0]) + vd[3];
    vd[3] = res[3];
    vd[2] = res[2];
    vd[1] = res[1];
    vd[0] = res[0];
}

void HELPER(vsha2ms_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                        uint32_t desc)
{
    uint32_t sew = FIELD_EX64(env->vtype, VTYPE, VSEW);
    uint32_t esz = sew == MO_32 ? 4 : 8;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        if (sew == MO_32) {
            vsha2ms_e32(((uint32_t *)vd) + i * 4, ((uint32_t *)vs1) + i * 4,
                        ((uint32_t *)vs2) + i * 4);
        } else {
            /* If not 32 then SEW should be 64 */
            vsha2ms_e64(((uint64_t *)vd) + i * 4, ((uint64_t *)vs1) + i * 4,
                        ((uint64_t *)vs2) + i * 4);
        }
    }
    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

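/*
 * SHA-2 compression helpers.  sum0/sum1 are the big Sigma functions and
 * ch/maj the choose/majority functions from FIPS 180-4.  vsha2c_32/_64 run
 * two rounds of the compression function on half of the working state:
 * vs2 carries {a, b, e, f}, vd carries {c, d, g, h} and receives the updated
 * {f, e, b, a}, and vs1 supplies the two message words W0/W1 (no round
 * constants are added here, so those words are expected to have the round
 * constant K already folded in).  The vsha2ch helpers consume the high word
 * pair of each vs1 element group, the vsha2cl helpers the low pair.
 */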
static inline uint64_t sum0_64(uint64_t x)
{
    return ror64(x, 28) ^ ror64(x, 34) ^ ror64(x, 39);
}

static inline uint32_t sum0_32(uint32_t x)
{
    return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22);
}

static inline uint64_t sum1_64(uint64_t x)
{
    return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41);
}

static inline uint32_t sum1_32(uint32_t x)
{
    return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25);
}

#define ch(x, y, z) ((x & y) ^ ((~x) & z))

#define maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))

static void vsha2c_64(uint64_t *vs2, uint64_t *vd, uint64_t *vs1)
{
    uint64_t a = vs2[3], b = vs2[2], e = vs2[1], f = vs2[0];
    uint64_t c = vd[3], d = vd[2], g = vd[1], h = vd[0];
    uint64_t W0 = vs1[0], W1 = vs1[1];
    uint64_t T1 = h + sum1_64(e) + ch(e, f, g) + W0;
    uint64_t T2 = sum0_64(a) + maj(a, b, c);

    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;

    T1 = h + sum1_64(e) + ch(e, f, g) + W1;
    T2 = sum0_64(a) + maj(a, b, c);
    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;

    vd[0] = f;
    vd[1] = e;
    vd[2] = b;
    vd[3] = a;
}

static void vsha2c_32(uint32_t *vs2, uint32_t *vd, uint32_t *vs1)
{
    uint32_t a = vs2[H4(3)], b = vs2[H4(2)], e = vs2[H4(1)], f = vs2[H4(0)];
    uint32_t c = vd[H4(3)], d = vd[H4(2)], g = vd[H4(1)], h = vd[H4(0)];
    uint32_t W0 = vs1[H4(0)], W1 = vs1[H4(1)];
    uint32_t T1 = h + sum1_32(e) + ch(e, f, g) + W0;
    uint32_t T2 = sum0_32(a) + maj(a, b, c);

    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;

    T1 = h + sum1_32(e) + ch(e, f, g) + W1;
    T2 = sum0_32(a) + maj(a, b, c);
    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;

    vd[H4(0)] = f;
    vd[H4(1)] = e;
    vd[H4(2)] = b;
    vd[H4(3)] = a;
}

void HELPER(vsha2ch32_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                          uint32_t desc)
{
    const uint32_t esz = 4;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        vsha2c_32(((uint32_t *)vs2) + 4 * i, ((uint32_t *)vd) + 4 * i,
                  ((uint32_t *)vs1) + 4 * i + 2);
    }

    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

void HELPER(vsha2ch64_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                          uint32_t desc)
{
    const uint32_t esz = 8;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        vsha2c_64(((uint64_t *)vs2) + 4 * i, ((uint64_t *)vd) + 4 * i,
                  ((uint64_t *)vs1) + 4 * i + 2);
    }

    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

void HELPER(vsha2cl32_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                          uint32_t desc)
{
    const uint32_t esz = 4;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        vsha2c_32(((uint32_t *)vs2) + 4 * i, ((uint32_t *)vd) + 4 * i,
                  (((uint32_t *)vs1) + 4 * i));
    }

    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

void HELPER(vsha2cl64_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                          uint32_t desc)
{
    uint32_t esz = 8;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        vsha2c_64(((uint64_t *)vs2) + 4 * i, ((uint64_t *)vd) + 4 * i,
                  (((uint64_t *)vs1) + 4 * i));
    }

    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

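/*
 * Zvksh helpers for the SM3 hash function.  p1() is the P1 permutation and
 * zvksh_w() one step of the SM3 message expansion:
 *   W[j] = P1(W[j-16] ^ W[j-9] ^ (W[j-3] <<< 15)) ^ (W[j-13] <<< 7) ^ W[j-6]
 * SM3 is specified on big-endian words, hence the bswap32() calls when
 * words are moved between the vector registers and the working arrays.
 */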
static inline uint32_t p1(uint32_t x)
{
    return x ^ rol32(x, 15) ^ rol32(x, 23);
}

static inline uint32_t zvksh_w(uint32_t m16, uint32_t m9, uint32_t m3,
                               uint32_t m13, uint32_t m6)
{
    return p1(m16 ^ m9 ^ rol32(m3, 15)) ^ rol32(m13, 7) ^ m6;
}

void HELPER(vsm3me_vv)(void *vd_vptr, void *vs1_vptr, void *vs2_vptr,
                       CPURISCVState *env, uint32_t desc)
{
    uint32_t esz = memop_size(FIELD_EX64(env->vtype, VTYPE, VSEW));
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    uint32_t *vd = vd_vptr;
    uint32_t *vs1 = vs1_vptr;
    uint32_t *vs2 = vs2_vptr;

    for (int i = env->vstart / 8; i < env->vl / 8; i++) {
        uint32_t w[24];
        for (int j = 0; j < 8; j++) {
            w[j] = bswap32(vs1[H4((i * 8) + j)]);
            w[j + 8] = bswap32(vs2[H4((i * 8) + j)]);
        }
        for (int j = 0; j < 8; j++) {
            w[j + 16] =
                zvksh_w(w[j], w[j + 7], w[j + 13], w[j + 3], w[j + 10]);
        }
        for (int j = 0; j < 8; j++) {
            vd[(i * 8) + j] = bswap32(w[H4(j + 16)]);
        }
    }
    vext_set_elems_1s(vd_vptr, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

static inline uint32_t ff1(uint32_t x, uint32_t y, uint32_t z)
{
    return x ^ y ^ z;
}

static inline uint32_t ff2(uint32_t x, uint32_t y, uint32_t z)
{
    return (x & y) | (x & z) | (y & z);
}

static inline uint32_t ff_j(uint32_t x, uint32_t y, uint32_t z, uint32_t j)
{
    return (j <= 15) ? ff1(x, y, z) : ff2(x, y, z);
}

static inline uint32_t gg1(uint32_t x, uint32_t y, uint32_t z)
{
    return x ^ y ^ z;
}

static inline uint32_t gg2(uint32_t x, uint32_t y, uint32_t z)
{
    return (x & y) | (~x & z);
}

static inline uint32_t gg_j(uint32_t x, uint32_t y, uint32_t z, uint32_t j)
{
    return (j <= 15) ? gg1(x, y, z) : gg2(x, y, z);
}

static inline uint32_t t_j(uint32_t j)
{
    return (j <= 15) ? 0x79cc4519 : 0x7a879d8a;
}

static inline uint32_t p_0(uint32_t x)
{
    return x ^ rol32(x, 9) ^ rol32(x, 17);
}

static void sm3c(uint32_t *vd, uint32_t *vs1, uint32_t *vs2, uint32_t uimm)
{
    uint32_t x0, x1;
    uint32_t j;
    uint32_t ss1, ss2, tt1, tt2;
    x0 = vs2[0] ^ vs2[4];
    x1 = vs2[1] ^ vs2[5];
    j = 2 * uimm;
    ss1 = rol32(rol32(vs1[0], 12) + vs1[4] + rol32(t_j(j), j % 32), 7);
    ss2 = ss1 ^ rol32(vs1[0], 12);
    tt1 = ff_j(vs1[0], vs1[1], vs1[2], j) + vs1[3] + ss2 + x0;
    tt2 = gg_j(vs1[4], vs1[5], vs1[6], j) + vs1[7] + ss1 + vs2[0];
    vs1[3] = vs1[2];
    vd[3] = rol32(vs1[1], 9);
    vs1[1] = vs1[0];
    vd[1] = tt1;
    vs1[7] = vs1[6];
    vd[7] = rol32(vs1[5], 19);
    vs1[5] = vs1[4];
    vd[5] = p_0(tt2);
    j = 2 * uimm + 1;
    ss1 = rol32(rol32(vd[1], 12) + vd[5] + rol32(t_j(j), j % 32), 7);
    ss2 = ss1 ^ rol32(vd[1], 12);
    tt1 = ff_j(vd[1], vs1[1], vd[3], j) + vs1[3] + ss2 + x1;
    tt2 = gg_j(vd[5], vs1[5], vd[7], j) + vs1[7] + ss1 + vs2[1];
    vd[2] = rol32(vs1[1], 9);
    vd[0] = tt1;
    vd[6] = rol32(vs1[5], 19);
    vd[4] = p_0(tt2);
}

void HELPER(vsm3c_vi)(void *vd_vptr, void *vs2_vptr, uint32_t uimm,
                      CPURISCVState *env, uint32_t desc)
{
    uint32_t esz = memop_size(FIELD_EX64(env->vtype, VTYPE, VSEW));
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    uint32_t *vd = vd_vptr;
    uint32_t *vs2 = vs2_vptr;
    uint32_t v1[8], v2[8], v3[8];

    for (int i = env->vstart / 8; i < env->vl / 8; i++) {
        for (int k = 0; k < 8; k++) {
            v2[k] = bswap32(vd[H4(i * 8 + k)]);
            v3[k] = bswap32(vs2[H4(i * 8 + k)]);
        }
        sm3c(v1, v2, v3, uimm);
        for (int k = 0; k < 8; k++) {
            vd[i * 8 + k] = bswap32(v1[H4(k)]);
        }
    }
    vext_set_elems_1s(vd_vptr, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

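/*
 * Zvkg helpers: multiplication in the GHASH field GF(2^128).  GHASH uses a
 * bit-reflected representation, so operands pass through brev8() on the way
 * in and out.  The product is formed with a 128-step shift-and-XOR loop;
 * whenever the top bit of H is shifted out, the value is reduced modulo the
 * GCM polynomial x^128 + x^7 + x^2 + x + 1, which in this bit order is the
 * XOR with 0x87.  vghsh computes one GHASH step, (Y ^ X) * H, while vgmul
 * computes the plain product Y * H.
 */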
void HELPER(vghsh_vv)(void *vd_vptr, void *vs1_vptr, void *vs2_vptr,
                      CPURISCVState *env, uint32_t desc)
{
    uint64_t *vd = vd_vptr;
    uint64_t *vs1 = vs1_vptr;
    uint64_t *vs2 = vs2_vptr;
    uint32_t vta = vext_vta(desc);
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        uint64_t Y[2] = {vd[i * 2 + 0], vd[i * 2 + 1]};
        uint64_t H[2] = {brev8(vs2[i * 2 + 0]), brev8(vs2[i * 2 + 1])};
        uint64_t X[2] = {vs1[i * 2 + 0], vs1[i * 2 + 1]};
        uint64_t Z[2] = {0, 0};

        uint64_t S[2] = {brev8(Y[0] ^ X[0]), brev8(Y[1] ^ X[1])};

        for (int j = 0; j < 128; j++) {
            if ((S[j / 64] >> (j % 64)) & 1) {
                Z[0] ^= H[0];
                Z[1] ^= H[1];
            }
            bool reduce = ((H[1] >> 63) & 1);
            H[1] = H[1] << 1 | H[0] >> 63;
            H[0] = H[0] << 1;
            if (reduce) {
                H[0] ^= 0x87;
            }
        }

        vd[i * 2 + 0] = brev8(Z[0]);
        vd[i * 2 + 1] = brev8(Z[1]);
    }
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, env->vl * 4, total_elems * 4);
    env->vstart = 0;
}

void HELPER(vgmul_vv)(void *vd_vptr, void *vs2_vptr, CPURISCVState *env,
                      uint32_t desc)
{
    uint64_t *vd = vd_vptr;
    uint64_t *vs2 = vs2_vptr;
    uint32_t vta = vext_vta(desc);
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        uint64_t Y[2] = {brev8(vd[i * 2 + 0]), brev8(vd[i * 2 + 1])};
        uint64_t H[2] = {brev8(vs2[i * 2 + 0]), brev8(vs2[i * 2 + 1])};
        uint64_t Z[2] = {0, 0};

        for (int j = 0; j < 128; j++) {
            if ((Y[j / 64] >> (j % 64)) & 1) {
                Z[0] ^= H[0];
                Z[1] ^= H[1];
            }
            bool reduce = ((H[1] >> 63) & 1);
            H[1] = H[1] << 1 | H[0] >> 63;
            H[0] = H[0] << 1;
            if (reduce) {
                H[0] ^= 0x87;
            }
        }

        vd[i * 2 + 0] = brev8(Z[0]);
        vd[i * 2 + 1] = brev8(Z[1]);
    }
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, env->vl * 4, total_elems * 4);
    env->vstart = 0;
}

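/*
 * Zvksed helpers for the SM4 block cipher.  vsm4k.vi derives four round
 * keys per element group with the key-schedule transform
 *   rk[i+4] = rk[i] ^ L'(tau(rk[i+1] ^ rk[i+2] ^ rk[i+3] ^ CK[4*rnd + i]))
 * where tau is sm4_subword() (the S-box applied to each byte) and
 * L'(x) = x ^ (x <<< 13) ^ (x <<< 23).  do_sm4_round() applies four cipher
 * rounds using the data-path transform
 *   L(x) = x ^ (x <<< 2) ^ (x <<< 10) ^ (x <<< 18) ^ (x <<< 24).
 * vsm4r.vv takes one round-key group per element group; vsm4r.vs reuses the
 * element group at the start of vs2 for every group.
 */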
void HELPER(vsm4k_vi)(void *vd, void *vs2, uint32_t uimm5, CPURISCVState *env,
                      uint32_t desc)
{
    const uint32_t egs = 4;
    uint32_t rnd = uimm5 & 0x7;
    uint32_t group_start = env->vstart / egs;
    uint32_t group_end = env->vl / egs;
    uint32_t esz = sizeof(uint32_t);
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);

    for (uint32_t i = group_start; i < group_end; ++i) {
        uint32_t vstart = i * egs;
        uint32_t vend = (i + 1) * egs;
        uint32_t rk[4] = {0};
        uint32_t tmp[8] = {0};

        for (uint32_t j = vstart; j < vend; ++j) {
            rk[j - vstart] = *((uint32_t *)vs2 + H4(j));
        }

        for (uint32_t j = 0; j < egs; ++j) {
            tmp[j] = rk[j];
        }

        for (uint32_t j = 0; j < egs; ++j) {
            uint32_t b, s;
            b = tmp[j + 1] ^ tmp[j + 2] ^ tmp[j + 3] ^ sm4_ck[rnd * 4 + j];

            s = sm4_subword(b);

            tmp[j + 4] = tmp[j] ^ (s ^ rol32(s, 13) ^ rol32(s, 23));
        }

        for (uint32_t j = vstart; j < vend; ++j) {
            *((uint32_t *)vd + H4(j)) = tmp[egs + (j - vstart)];
        }
    }

    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vext_vta(desc), env->vl * esz, total_elems * esz);
}

static void do_sm4_round(uint32_t *rk, uint32_t *buf)
{
    const uint32_t egs = 4;
    uint32_t s, b;

    for (uint32_t j = egs; j < egs * 2; ++j) {
        b = buf[j - 3] ^ buf[j - 2] ^ buf[j - 1] ^ rk[j - 4];

        s = sm4_subword(b);

        buf[j] = buf[j - 4] ^ (s ^ rol32(s, 2) ^ rol32(s, 10) ^ rol32(s, 18) ^
                               rol32(s, 24));
    }
}

void HELPER(vsm4r_vv)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
{
    const uint32_t egs = 4;
    uint32_t group_start = env->vstart / egs;
    uint32_t group_end = env->vl / egs;
    uint32_t esz = sizeof(uint32_t);
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);

    for (uint32_t i = group_start; i < group_end; ++i) {
        uint32_t vstart = i * egs;
        uint32_t vend = (i + 1) * egs;
        uint32_t rk[4] = {0};
        uint32_t tmp[8] = {0};

        for (uint32_t j = vstart; j < vend; ++j) {
            rk[j - vstart] = *((uint32_t *)vs2 + H4(j));
        }

        for (uint32_t j = vstart; j < vend; ++j) {
            tmp[j - vstart] = *((uint32_t *)vd + H4(j));
        }

        do_sm4_round(rk, tmp);

        for (uint32_t j = vstart; j < vend; ++j) {
            *((uint32_t *)vd + H4(j)) = tmp[egs + (j - vstart)];
        }
    }

    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vext_vta(desc), env->vl * esz, total_elems * esz);
}

void HELPER(vsm4r_vs)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
{
    const uint32_t egs = 4;
    uint32_t group_start = env->vstart / egs;
    uint32_t group_end = env->vl / egs;
    uint32_t esz = sizeof(uint32_t);
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);

    for (uint32_t i = group_start; i < group_end; ++i) {
        uint32_t vstart = i * egs;
        uint32_t vend = (i + 1) * egs;
        uint32_t rk[4] = {0};
        uint32_t tmp[8] = {0};

        for (uint32_t j = 0; j < egs; ++j) {
            rk[j] = *((uint32_t *)vs2 + H4(j));
        }

        for (uint32_t j = vstart; j < vend; ++j) {
            tmp[j - vstart] = *((uint32_t *)vd + H4(j));
        }

        do_sm4_round(rk, tmp);

        for (uint32_t j = vstart; j < vend; ++j) {
            *((uint32_t *)vd + H4(j)) = tmp[egs + (j - vstart)];
        }
    }

    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vext_vta(desc), env->vl * esz, total_elems * esz);
}