/*
 * RISC-V Vector Crypto Extension Helpers for QEMU.
 *
 * Copyright (C) 2023 SiFive, Inc.
 * Written by Codethink Ltd and SiFive.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2 or later, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/host-utils.h"
#include "qemu/bitops.h"
#include "qemu/bswap.h"
#include "cpu.h"
#include "crypto/aes.h"
#include "crypto/aes-round.h"
#include "crypto/sm4.h"
#include "exec/memop.h"
#include "exec/exec-all.h"
#include "exec/helper-proto.h"
#include "internals.h"
#include "vector_internals.h"
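
/*
 * Zvbc: carry-less multiply. clmul64() returns the low 64 bits and
 * clmulh64() the high 64 bits of the 128-bit carry-less product of x and y.
 */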
static uint64_t clmul64(uint64_t y, uint64_t x)
{
    uint64_t result = 0;
    for (int j = 63; j >= 0; j--) {
        if ((y >> j) & 1) {
            result ^= (x << j);
        }
    }
    return result;
}

static uint64_t clmulh64(uint64_t y, uint64_t x)
{
    uint64_t result = 0;
    for (int j = 63; j >= 1; j--) {
        if ((y >> j) & 1) {
            result ^= (x >> (64 - j));
        }
    }
    return result;
}

RVVCALL(OPIVV2, vclmul_vv, OP_UUU_D, H8, H8, H8, clmul64)
GEN_VEXT_VV(vclmul_vv, 8)
RVVCALL(OPIVX2, vclmul_vx, OP_UUU_D, H8, H8, clmul64)
GEN_VEXT_VX(vclmul_vx, 8)
RVVCALL(OPIVV2, vclmulh_vv, OP_UUU_D, H8, H8, H8, clmulh64)
GEN_VEXT_VV(vclmulh_vv, 8)
RVVCALL(OPIVX2, vclmulh_vx, OP_UUU_D, H8, H8, clmulh64)
GEN_VEXT_VX(vclmulh_vx, 8)

RVVCALL(OPIVV2, vror_vv_b, OP_UUU_B, H1, H1, H1, ror8)
RVVCALL(OPIVV2, vror_vv_h, OP_UUU_H, H2, H2, H2, ror16)
RVVCALL(OPIVV2, vror_vv_w, OP_UUU_W, H4, H4, H4, ror32)
RVVCALL(OPIVV2, vror_vv_d, OP_UUU_D, H8, H8, H8, ror64)
GEN_VEXT_VV(vror_vv_b, 1)
GEN_VEXT_VV(vror_vv_h, 2)
GEN_VEXT_VV(vror_vv_w, 4)
GEN_VEXT_VV(vror_vv_d, 8)

RVVCALL(OPIVX2, vror_vx_b, OP_UUU_B, H1, H1, ror8)
RVVCALL(OPIVX2, vror_vx_h, OP_UUU_H, H2, H2, ror16)
RVVCALL(OPIVX2, vror_vx_w, OP_UUU_W, H4, H4, ror32)
RVVCALL(OPIVX2, vror_vx_d, OP_UUU_D, H8, H8, ror64)
GEN_VEXT_VX(vror_vx_b, 1)
GEN_VEXT_VX(vror_vx_h, 2)
GEN_VEXT_VX(vror_vx_w, 4)
GEN_VEXT_VX(vror_vx_d, 8)

RVVCALL(OPIVV2, vrol_vv_b, OP_UUU_B, H1, H1, H1, rol8)
RVVCALL(OPIVV2, vrol_vv_h, OP_UUU_H, H2, H2, H2, rol16)
RVVCALL(OPIVV2, vrol_vv_w, OP_UUU_W, H4, H4, H4, rol32)
RVVCALL(OPIVV2, vrol_vv_d, OP_UUU_D, H8, H8, H8, rol64)
GEN_VEXT_VV(vrol_vv_b, 1)
GEN_VEXT_VV(vrol_vv_h, 2)
GEN_VEXT_VV(vrol_vv_w, 4)
GEN_VEXT_VV(vrol_vv_d, 8)

RVVCALL(OPIVX2, vrol_vx_b, OP_UUU_B, H1, H1, rol8)
RVVCALL(OPIVX2, vrol_vx_h, OP_UUU_H, H2, H2, rol16)
RVVCALL(OPIVX2, vrol_vx_w, OP_UUU_W, H4, H4, rol32)
RVVCALL(OPIVX2, vrol_vx_d, OP_UUU_D, H8, H8, rol64)
GEN_VEXT_VX(vrol_vx_b, 1)
GEN_VEXT_VX(vrol_vx_h, 2)
GEN_VEXT_VX(vrol_vx_w, 4)
GEN_VEXT_VX(vrol_vx_d, 8)
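
/*
 * Reverse the bit order within each byte of a 64-bit value (Zvbb vbrev8.v).
 * Also reused by the GHASH helpers further down to convert between GCM's
 * reflected bit ordering and the natural one.
 */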
static uint64_t brev8(uint64_t val)
{
    val = ((val & 0x5555555555555555ull) << 1) |
          ((val & 0xAAAAAAAAAAAAAAAAull) >> 1);
    val = ((val & 0x3333333333333333ull) << 2) |
          ((val & 0xCCCCCCCCCCCCCCCCull) >> 2);
    val = ((val & 0x0F0F0F0F0F0F0F0Full) << 4) |
          ((val & 0xF0F0F0F0F0F0F0F0ull) >> 4);

    return val;
}

RVVCALL(OPIVV1, vbrev8_v_b, OP_UU_B, H1, H1, brev8)
RVVCALL(OPIVV1, vbrev8_v_h, OP_UU_H, H2, H2, brev8)
RVVCALL(OPIVV1, vbrev8_v_w, OP_UU_W, H4, H4, brev8)
RVVCALL(OPIVV1, vbrev8_v_d, OP_UU_D, H8, H8, brev8)
GEN_VEXT_V(vbrev8_v_b, 1)
GEN_VEXT_V(vbrev8_v_h, 2)
GEN_VEXT_V(vbrev8_v_w, 4)
GEN_VEXT_V(vbrev8_v_d, 8)

#define DO_IDENTITY(a) (a)
RVVCALL(OPIVV1, vrev8_v_b, OP_UU_B, H1, H1, DO_IDENTITY)
RVVCALL(OPIVV1, vrev8_v_h, OP_UU_H, H2, H2, bswap16)
RVVCALL(OPIVV1, vrev8_v_w, OP_UU_W, H4, H4, bswap32)
RVVCALL(OPIVV1, vrev8_v_d, OP_UU_D, H8, H8, bswap64)
GEN_VEXT_V(vrev8_v_b, 1)
GEN_VEXT_V(vrev8_v_h, 2)
GEN_VEXT_V(vrev8_v_w, 4)
GEN_VEXT_V(vrev8_v_d, 8)

#define DO_ANDN(a, b) ((a) & ~(b))
RVVCALL(OPIVV2, vandn_vv_b, OP_UUU_B, H1, H1, H1, DO_ANDN)
RVVCALL(OPIVV2, vandn_vv_h, OP_UUU_H, H2, H2, H2, DO_ANDN)
RVVCALL(OPIVV2, vandn_vv_w, OP_UUU_W, H4, H4, H4, DO_ANDN)
RVVCALL(OPIVV2, vandn_vv_d, OP_UUU_D, H8, H8, H8, DO_ANDN)
GEN_VEXT_VV(vandn_vv_b, 1)
GEN_VEXT_VV(vandn_vv_h, 2)
GEN_VEXT_VV(vandn_vv_w, 4)
GEN_VEXT_VV(vandn_vv_d, 8)

RVVCALL(OPIVX2, vandn_vx_b, OP_UUU_B, H1, H1, DO_ANDN)
RVVCALL(OPIVX2, vandn_vx_h, OP_UUU_H, H2, H2, DO_ANDN)
RVVCALL(OPIVX2, vandn_vx_w, OP_UUU_W, H4, H4, DO_ANDN)
RVVCALL(OPIVX2, vandn_vx_d, OP_UUU_D, H8, H8, DO_ANDN)
GEN_VEXT_VX(vandn_vx_b, 1)
GEN_VEXT_VX(vandn_vx_h, 2)
GEN_VEXT_VX(vandn_vx_w, 4)
GEN_VEXT_VX(vandn_vx_d, 8)

RVVCALL(OPIVV1, vbrev_v_b, OP_UU_B, H1, H1, revbit8)
RVVCALL(OPIVV1, vbrev_v_h, OP_UU_H, H2, H2, revbit16)
RVVCALL(OPIVV1, vbrev_v_w, OP_UU_W, H4, H4, revbit32)
RVVCALL(OPIVV1, vbrev_v_d, OP_UU_D, H8, H8, revbit64)
GEN_VEXT_V(vbrev_v_b, 1)
GEN_VEXT_V(vbrev_v_h, 2)
GEN_VEXT_V(vbrev_v_w, 4)
GEN_VEXT_V(vbrev_v_d, 8)

RVVCALL(OPIVV1, vclz_v_b, OP_UU_B, H1, H1, clz8)
RVVCALL(OPIVV1, vclz_v_h, OP_UU_H, H2, H2, clz16)
RVVCALL(OPIVV1, vclz_v_w, OP_UU_W, H4, H4, clz32)
RVVCALL(OPIVV1, vclz_v_d, OP_UU_D, H8, H8, clz64)
GEN_VEXT_V(vclz_v_b, 1)
GEN_VEXT_V(vclz_v_h, 2)
GEN_VEXT_V(vclz_v_w, 4)
GEN_VEXT_V(vclz_v_d, 8)

RVVCALL(OPIVV1, vctz_v_b, OP_UU_B, H1, H1, ctz8)
RVVCALL(OPIVV1, vctz_v_h, OP_UU_H, H2, H2, ctz16)
RVVCALL(OPIVV1, vctz_v_w, OP_UU_W, H4, H4, ctz32)
RVVCALL(OPIVV1, vctz_v_d, OP_UU_D, H8, H8, ctz64)
GEN_VEXT_V(vctz_v_b, 1)
GEN_VEXT_V(vctz_v_h, 2)
GEN_VEXT_V(vctz_v_w, 4)
GEN_VEXT_V(vctz_v_d, 8)

RVVCALL(OPIVV1, vcpop_v_b, OP_UU_B, H1, H1, ctpop8)
RVVCALL(OPIVV1, vcpop_v_h, OP_UU_H, H2, H2, ctpop16)
RVVCALL(OPIVV1, vcpop_v_w, OP_UU_W, H4, H4, ctpop32)
RVVCALL(OPIVV1, vcpop_v_d, OP_UU_D, H8, H8, ctpop64)
GEN_VEXT_V(vcpop_v_b, 1)
GEN_VEXT_V(vcpop_v_h, 2)
GEN_VEXT_V(vcpop_v_w, 4)
GEN_VEXT_V(vcpop_v_d, 8)

#define DO_SLL(N, M) (N << (M & (sizeof(N) * 8 - 1)))
RVVCALL(OPIVV2, vwsll_vv_b, WOP_UUU_B, H2, H1, H1, DO_SLL)
RVVCALL(OPIVV2, vwsll_vv_h, WOP_UUU_H, H4, H2, H2, DO_SLL)
RVVCALL(OPIVV2, vwsll_vv_w, WOP_UUU_W, H8, H4, H4, DO_SLL)
GEN_VEXT_VV(vwsll_vv_b, 2)
GEN_VEXT_VV(vwsll_vv_h, 4)
GEN_VEXT_VV(vwsll_vv_w, 8)

RVVCALL(OPIVX2, vwsll_vx_b, WOP_UUU_B, H2, H1, DO_SLL)
RVVCALL(OPIVX2, vwsll_vx_h, WOP_UUU_H, H4, H2, DO_SLL)
RVVCALL(OPIVX2, vwsll_vx_w, WOP_UUU_W, H8, H4, DO_SLL)
GEN_VEXT_VX(vwsll_vx_b, 2)
GEN_VEXT_VX(vwsll_vx_h, 4)
GEN_VEXT_VX(vwsll_vx_w, 8)
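
/*
 * The Zvk* element-group instructions operate on groups of EGS elements:
 * both vl and vstart must be multiples of the group size, otherwise the
 * instruction raises an illegal-instruction exception.
 */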
void HELPER(egs_check)(uint32_t egs, CPURISCVState *env)
{
    uint32_t vl = env->vl;
    uint32_t vstart = env->vstart;

    if (vl % egs != 0 || vstart % egs != 0) {
        riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
    }
}

static inline void xor_round_key(AESState *round_state, AESState *round_key)
{
    round_state->v = round_state->v ^ round_key->v;
}

#define GEN_ZVKNED_HELPER_VV(NAME, ...)                                   \
    void HELPER(NAME)(void *vd, void *vs2, CPURISCVState *env,            \
                      uint32_t desc)                                      \
    {                                                                     \
        uint32_t vl = env->vl;                                            \
        uint32_t total_elems = vext_get_total_elems(env, desc, 4);        \
        uint32_t vta = vext_vta(desc);                                    \
                                                                          \
        VSTART_CHECK_EARLY_EXIT(env);                                     \
                                                                          \
        for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {        \
            AESState round_key;                                           \
            round_key.d[0] = *((uint64_t *)vs2 + H8(i * 2 + 0));          \
            round_key.d[1] = *((uint64_t *)vs2 + H8(i * 2 + 1));          \
            AESState round_state;                                         \
            round_state.d[0] = *((uint64_t *)vd + H8(i * 2 + 0));         \
            round_state.d[1] = *((uint64_t *)vd + H8(i * 2 + 1));         \
            __VA_ARGS__;                                                  \
            *((uint64_t *)vd + H8(i * 2 + 0)) = round_state.d[0];         \
            *((uint64_t *)vd + H8(i * 2 + 1)) = round_state.d[1];         \
        }                                                                 \
        env->vstart = 0;                                                  \
        /* set tail elements to 1s */                                     \
        vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);              \
    }

#define GEN_ZVKNED_HELPER_VS(NAME, ...)                                   \
    void HELPER(NAME)(void *vd, void *vs2, CPURISCVState *env,            \
                      uint32_t desc)                                      \
    {                                                                     \
        uint32_t vl = env->vl;                                            \
        uint32_t total_elems = vext_get_total_elems(env, desc, 4);        \
        uint32_t vta = vext_vta(desc);                                    \
                                                                          \
        VSTART_CHECK_EARLY_EXIT(env);                                     \
                                                                          \
        for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {        \
            AESState round_key;                                           \
            round_key.d[0] = *((uint64_t *)vs2 + H8(0));                  \
            round_key.d[1] = *((uint64_t *)vs2 + H8(1));                  \
            AESState round_state;                                         \
            round_state.d[0] = *((uint64_t *)vd + H8(i * 2 + 0));         \
            round_state.d[1] = *((uint64_t *)vd + H8(i * 2 + 1));         \
            __VA_ARGS__;                                                  \
            *((uint64_t *)vd + H8(i * 2 + 0)) = round_state.d[0];         \
            *((uint64_t *)vd + H8(i * 2 + 1)) = round_state.d[1];         \
        }                                                                 \
        env->vstart = 0;                                                  \
        /* set tail elements to 1s */                                     \
        vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);              \
    }

GEN_ZVKNED_HELPER_VV(vaesef_vv, aesenc_SB_SR_AK(&round_state,
                                                &round_state,
                                                &round_key,
                                                false);)
GEN_ZVKNED_HELPER_VS(vaesef_vs, aesenc_SB_SR_AK(&round_state,
                                                &round_state,
                                                &round_key,
                                                false);)
GEN_ZVKNED_HELPER_VV(vaesdf_vv, aesdec_ISB_ISR_AK(&round_state,
                                                  &round_state,
                                                  &round_key,
                                                  false);)
GEN_ZVKNED_HELPER_VS(vaesdf_vs, aesdec_ISB_ISR_AK(&round_state,
                                                  &round_state,
                                                  &round_key,
                                                  false);)
GEN_ZVKNED_HELPER_VV(vaesem_vv, aesenc_SB_SR_MC_AK(&round_state,
                                                   &round_state,
                                                   &round_key,
                                                   false);)
GEN_ZVKNED_HELPER_VS(vaesem_vs, aesenc_SB_SR_MC_AK(&round_state,
                                                   &round_state,
                                                   &round_key,
                                                   false);)
GEN_ZVKNED_HELPER_VV(vaesdm_vv, aesdec_ISB_ISR_AK_IMC(&round_state,
                                                      &round_state,
                                                      &round_key,
                                                      false);)
GEN_ZVKNED_HELPER_VS(vaesdm_vs, aesdec_ISB_ISR_AK_IMC(&round_state,
                                                      &round_state,
                                                      &round_key,
                                                      false);)
GEN_ZVKNED_HELPER_VS(vaesz_vs, xor_round_key(&round_state, &round_key);)
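
/*
 * vaeskf1.vi: one round of the AES-128 forward key schedule per element
 * group. The immediate selects the round (1-10); out-of-range round numbers
 * are mapped back into range by flipping bit 3, which is what the uimm
 * fix-up below implements.
 */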
void HELPER(vaeskf1_vi)(void *vd_vptr, void *vs2_vptr, uint32_t uimm,
                        CPURISCVState *env, uint32_t desc)
{
    uint32_t *vd = vd_vptr;
    uint32_t *vs2 = vs2_vptr;
    uint32_t vl = env->vl;
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env);

    uimm &= 0b1111;
    if (uimm > 10 || uimm == 0) {
        uimm ^= 0b1000;
    }

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        uint32_t rk[8], tmp;
        static const uint32_t rcon[] = {
            0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010,
            0x00000020, 0x00000040, 0x00000080, 0x0000001B, 0x00000036,
        };

        rk[0] = vs2[i * 4 + H4(0)];
        rk[1] = vs2[i * 4 + H4(1)];
        rk[2] = vs2[i * 4 + H4(2)];
        rk[3] = vs2[i * 4 + H4(3)];
        tmp = ror32(rk[3], 8);

        rk[4] = rk[0] ^ (((uint32_t)AES_sbox[(tmp >> 24) & 0xff] << 24) |
                         ((uint32_t)AES_sbox[(tmp >> 16) & 0xff] << 16) |
                         ((uint32_t)AES_sbox[(tmp >> 8) & 0xff] << 8) |
                         ((uint32_t)AES_sbox[(tmp >> 0) & 0xff] << 0))
                      ^ rcon[uimm - 1];
        rk[5] = rk[1] ^ rk[4];
        rk[6] = rk[2] ^ rk[5];
        rk[7] = rk[3] ^ rk[6];

        vd[i * 4 + H4(0)] = rk[4];
        vd[i * 4 + H4(1)] = rk[5];
        vd[i * 4 + H4(2)] = rk[6];
        vd[i * 4 + H4(3)] = rk[7];
    }
    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);
}

void HELPER(vaeskf2_vi)(void *vd_vptr, void *vs2_vptr, uint32_t uimm,
                        CPURISCVState *env, uint32_t desc)
{
    uint32_t *vd = vd_vptr;
    uint32_t *vs2 = vs2_vptr;
    uint32_t vl = env->vl;
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env);

    uimm &= 0b1111;
    if (uimm > 14 || uimm < 2) {
        uimm ^= 0b1000;
    }

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        uint32_t rk[12], tmp;
        static const uint32_t rcon[] = {
            0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010,
            0x00000020, 0x00000040, 0x00000080, 0x0000001B, 0x00000036,
        };

        rk[0] = vd[i * 4 + H4(0)];
        rk[1] = vd[i * 4 + H4(1)];
        rk[2] = vd[i * 4 + H4(2)];
        rk[3] = vd[i * 4 + H4(3)];
        rk[4] = vs2[i * 4 + H4(0)];
        rk[5] = vs2[i * 4 + H4(1)];
        rk[6] = vs2[i * 4 + H4(2)];
        rk[7] = vs2[i * 4 + H4(3)];

        if (uimm % 2 == 0) {
            tmp = ror32(rk[7], 8);
            rk[8] = rk[0] ^ (((uint32_t)AES_sbox[(tmp >> 24) & 0xff] << 24) |
                             ((uint32_t)AES_sbox[(tmp >> 16) & 0xff] << 16) |
                             ((uint32_t)AES_sbox[(tmp >> 8) & 0xff] << 8) |
                             ((uint32_t)AES_sbox[(tmp >> 0) & 0xff] << 0))
                          ^ rcon[(uimm - 1) / 2];
        } else {
            rk[8] = rk[0] ^ (((uint32_t)AES_sbox[(rk[7] >> 24) & 0xff] << 24) |
                             ((uint32_t)AES_sbox[(rk[7] >> 16) & 0xff] << 16) |
                             ((uint32_t)AES_sbox[(rk[7] >> 8) & 0xff] << 8) |
                             ((uint32_t)AES_sbox[(rk[7] >> 0) & 0xff] << 0));
        }
        rk[9] = rk[1] ^ rk[8];
        rk[10] = rk[2] ^ rk[9];
        rk[11] = rk[3] ^ rk[10];

        vd[i * 4 + H4(0)] = rk[8];
        vd[i * 4 + H4(1)] = rk[9];
        vd[i * 4 + H4(2)] = rk[10];
        vd[i * 4 + H4(3)] = rk[11];
    }
    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);
}
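
/*
 * SHA-2 message-schedule sigma functions (sigma0/sigma1 for SHA-256 and
 * SHA-512), used by the vsha2ms helpers below to compute the next four
 * message-schedule words of each element group.
 */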
static inline uint32_t sig0_sha256(uint32_t x)
{
    return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
}

static inline uint32_t sig1_sha256(uint32_t x)
{
    return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
}

static inline uint64_t sig0_sha512(uint64_t x)
{
    return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7);
}

static inline uint64_t sig1_sha512(uint64_t x)
{
    return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6);
}

static inline void vsha2ms_e32(uint32_t *vd, uint32_t *vs1, uint32_t *vs2)
{
    uint32_t res[4];
    res[0] = sig1_sha256(vs1[H4(2)]) + vs2[H4(1)] + sig0_sha256(vd[H4(1)]) +
             vd[H4(0)];
    res[1] = sig1_sha256(vs1[H4(3)]) + vs2[H4(2)] + sig0_sha256(vd[H4(2)]) +
             vd[H4(1)];
    res[2] =
        sig1_sha256(res[0]) + vs2[H4(3)] + sig0_sha256(vd[H4(3)]) + vd[H4(2)];
    res[3] =
        sig1_sha256(res[1]) + vs1[H4(0)] + sig0_sha256(vs2[H4(0)]) + vd[H4(3)];
    vd[H4(3)] = res[3];
    vd[H4(2)] = res[2];
    vd[H4(1)] = res[1];
    vd[H4(0)] = res[0];
}

static inline void vsha2ms_e64(uint64_t *vd, uint64_t *vs1, uint64_t *vs2)
{
    uint64_t res[4];
    res[0] = sig1_sha512(vs1[2]) + vs2[1] + sig0_sha512(vd[1]) + vd[0];
    res[1] = sig1_sha512(vs1[3]) + vs2[2] + sig0_sha512(vd[2]) + vd[1];
    res[2] = sig1_sha512(res[0]) + vs2[3] + sig0_sha512(vd[3]) + vd[2];
    res[3] = sig1_sha512(res[1]) + vs1[0] + sig0_sha512(vs2[0]) + vd[3];
    vd[3] = res[3];
    vd[2] = res[2];
    vd[1] = res[1];
    vd[0] = res[0];
}

void HELPER(vsha2ms_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                        uint32_t desc)
{
    uint32_t sew = FIELD_EX64(env->vtype, VTYPE, VSEW);
    uint32_t esz = sew == MO_32 ? 4 : 8;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        if (sew == MO_32) {
            vsha2ms_e32(((uint32_t *)vd) + i * 4, ((uint32_t *)vs1) + i * 4,
                        ((uint32_t *)vs2) + i * 4);
        } else {
            /* If not 32 then SEW should be 64 */
            vsha2ms_e64(((uint64_t *)vd) + i * 4, ((uint64_t *)vs1) + i * 4,
                        ((uint64_t *)vs2) + i * 4);
        }
    }
    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}
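
/*
 * SHA-2 compression: Sigma0/Sigma1, ch and maj. vsha2c_32/vsha2c_64 perform
 * two rounds of compression on the working variables, which the vector
 * instructions split as {a, b, e, f} in vs2 and {c, d, g, h} in vd, using
 * two message-schedule words taken from vs1. The "ch" helpers below pass
 * the high pair of words of each element group, the "cl" helpers the low
 * pair.
 */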
static inline uint64_t sum0_64(uint64_t x)
{
    return ror64(x, 28) ^ ror64(x, 34) ^ ror64(x, 39);
}

static inline uint32_t sum0_32(uint32_t x)
{
    return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22);
}

static inline uint64_t sum1_64(uint64_t x)
{
    return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41);
}

static inline uint32_t sum1_32(uint32_t x)
{
    return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25);
}

#define ch(x, y, z) ((x & y) ^ ((~x) & z))

#define maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))

static void vsha2c_64(uint64_t *vs2, uint64_t *vd, uint64_t *vs1)
{
    uint64_t a = vs2[3], b = vs2[2], e = vs2[1], f = vs2[0];
    uint64_t c = vd[3], d = vd[2], g = vd[1], h = vd[0];
    uint64_t W0 = vs1[0], W1 = vs1[1];
    uint64_t T1 = h + sum1_64(e) + ch(e, f, g) + W0;
    uint64_t T2 = sum0_64(a) + maj(a, b, c);

    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;

    T1 = h + sum1_64(e) + ch(e, f, g) + W1;
    T2 = sum0_64(a) + maj(a, b, c);
    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;

    vd[0] = f;
    vd[1] = e;
    vd[2] = b;
    vd[3] = a;
}

static void vsha2c_32(uint32_t *vs2, uint32_t *vd, uint32_t *vs1)
{
    uint32_t a = vs2[H4(3)], b = vs2[H4(2)], e = vs2[H4(1)], f = vs2[H4(0)];
    uint32_t c = vd[H4(3)], d = vd[H4(2)], g = vd[H4(1)], h = vd[H4(0)];
    uint32_t W0 = vs1[H4(0)], W1 = vs1[H4(1)];
    uint32_t T1 = h + sum1_32(e) + ch(e, f, g) + W0;
    uint32_t T2 = sum0_32(a) + maj(a, b, c);

    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;

    T1 = h + sum1_32(e) + ch(e, f, g) + W1;
    T2 = sum0_32(a) + maj(a, b, c);
    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;

    vd[H4(0)] = f;
    vd[H4(1)] = e;
    vd[H4(2)] = b;
    vd[H4(3)] = a;
}

void HELPER(vsha2ch32_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                          uint32_t desc)
{
    const uint32_t esz = 4;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        vsha2c_32(((uint32_t *)vs2) + 4 * i, ((uint32_t *)vd) + 4 * i,
                  ((uint32_t *)vs1) + 4 * i + 2);
    }

    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

void HELPER(vsha2ch64_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                          uint32_t desc)
{
    const uint32_t esz = 8;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        vsha2c_64(((uint64_t *)vs2) + 4 * i, ((uint64_t *)vd) + 4 * i,
                  ((uint64_t *)vs1) + 4 * i + 2);
    }

    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

void HELPER(vsha2cl32_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                          uint32_t desc)
{
    const uint32_t esz = 4;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        vsha2c_32(((uint32_t *)vs2) + 4 * i, ((uint32_t *)vd) + 4 * i,
                  (((uint32_t *)vs1) + 4 * i));
    }

    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

void HELPER(vsha2cl64_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                          uint32_t desc)
{
    uint32_t esz = 8;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        vsha2c_64(((uint64_t *)vs2) + 4 * i, ((uint64_t *)vd) + 4 * i,
                  (((uint64_t *)vs1) + 4 * i));
    }

    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}
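
/*
 * SM3 (Zvksh) message expansion: p1() is the P1 permutation from the SM3
 * standard and zvksh_w() computes one expanded word,
 *   W[j] = P1(W[j-16] ^ W[j-9] ^ (W[j-3] <<< 15)) ^ (W[j-13] <<< 7) ^ W[j-6].
 * The message words are big-endian in the vector registers, hence the
 * bswap32() calls in vsm3me_vv and vsm3c_vi.
 */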
static inline uint32_t p1(uint32_t x)
{
    return x ^ rol32(x, 15) ^ rol32(x, 23);
}

static inline uint32_t zvksh_w(uint32_t m16, uint32_t m9, uint32_t m3,
                               uint32_t m13, uint32_t m6)
{
    return p1(m16 ^ m9 ^ rol32(m3, 15)) ^ rol32(m13, 7) ^ m6;
}

void HELPER(vsm3me_vv)(void *vd_vptr, void *vs1_vptr, void *vs2_vptr,
                       CPURISCVState *env, uint32_t desc)
{
    uint32_t esz = memop_size(FIELD_EX64(env->vtype, VTYPE, VSEW));
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    uint32_t *vd = vd_vptr;
    uint32_t *vs1 = vs1_vptr;
    uint32_t *vs2 = vs2_vptr;

    VSTART_CHECK_EARLY_EXIT(env);

    for (int i = env->vstart / 8; i < env->vl / 8; i++) {
        uint32_t w[24];
        for (int j = 0; j < 8; j++) {
            w[j] = bswap32(vs1[H4((i * 8) + j)]);
            w[j + 8] = bswap32(vs2[H4((i * 8) + j)]);
        }
        for (int j = 0; j < 8; j++) {
            w[j + 16] =
                zvksh_w(w[j], w[j + 7], w[j + 13], w[j + 3], w[j + 10]);
        }
        for (int j = 0; j < 8; j++) {
            vd[(i * 8) + j] = bswap32(w[H4(j + 16)]);
        }
    }
    vext_set_elems_1s(vd_vptr, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

static inline uint32_t ff1(uint32_t x, uint32_t y, uint32_t z)
{
    return x ^ y ^ z;
}

static inline uint32_t ff2(uint32_t x, uint32_t y, uint32_t z)
{
    return (x & y) | (x & z) | (y & z);
}

static inline uint32_t ff_j(uint32_t x, uint32_t y, uint32_t z, uint32_t j)
{
    return (j <= 15) ? ff1(x, y, z) : ff2(x, y, z);
}

static inline uint32_t gg1(uint32_t x, uint32_t y, uint32_t z)
{
    return x ^ y ^ z;
}

static inline uint32_t gg2(uint32_t x, uint32_t y, uint32_t z)
{
    return (x & y) | (~x & z);
}

static inline uint32_t gg_j(uint32_t x, uint32_t y, uint32_t z, uint32_t j)
{
    return (j <= 15) ? gg1(x, y, z) : gg2(x, y, z);
}

static inline uint32_t t_j(uint32_t j)
{
    return (j <= 15) ? 0x79cc4519 : 0x7a879d8a;
}

static inline uint32_t p_0(uint32_t x)
{
    return x ^ rol32(x, 9) ^ rol32(x, 17);
}

static void sm3c(uint32_t *vd, uint32_t *vs1, uint32_t *vs2, uint32_t uimm)
{
    uint32_t x0, x1;
    uint32_t j;
    uint32_t ss1, ss2, tt1, tt2;
    x0 = vs2[0] ^ vs2[4];
    x1 = vs2[1] ^ vs2[5];
    j = 2 * uimm;
    ss1 = rol32(rol32(vs1[0], 12) + vs1[4] + rol32(t_j(j), j % 32), 7);
    ss2 = ss1 ^ rol32(vs1[0], 12);
    tt1 = ff_j(vs1[0], vs1[1], vs1[2], j) + vs1[3] + ss2 + x0;
    tt2 = gg_j(vs1[4], vs1[5], vs1[6], j) + vs1[7] + ss1 + vs2[0];
    vs1[3] = vs1[2];
    vd[3] = rol32(vs1[1], 9);
    vs1[1] = vs1[0];
    vd[1] = tt1;
    vs1[7] = vs1[6];
    vd[7] = rol32(vs1[5], 19);
    vs1[5] = vs1[4];
    vd[5] = p_0(tt2);
    j = 2 * uimm + 1;
    ss1 = rol32(rol32(vd[1], 12) + vd[5] + rol32(t_j(j), j % 32), 7);
    ss2 = ss1 ^ rol32(vd[1], 12);
    tt1 = ff_j(vd[1], vs1[1], vd[3], j) + vs1[3] + ss2 + x1;
    tt2 = gg_j(vd[5], vs1[5], vd[7], j) + vs1[7] + ss1 + vs2[1];
    vd[2] = rol32(vs1[1], 9);
    vd[0] = tt1;
    vd[6] = rol32(vs1[5], 19);
    vd[4] = p_0(tt2);
}

void HELPER(vsm3c_vi)(void *vd_vptr, void *vs2_vptr, uint32_t uimm,
                      CPURISCVState *env, uint32_t desc)
{
    uint32_t esz = memop_size(FIELD_EX64(env->vtype, VTYPE, VSEW));
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    uint32_t *vd = vd_vptr;
    uint32_t *vs2 = vs2_vptr;
    uint32_t v1[8], v2[8], v3[8];

    VSTART_CHECK_EARLY_EXIT(env);

    for (int i = env->vstart / 8; i < env->vl / 8; i++) {
        for (int k = 0; k < 8; k++) {
            v2[k] = bswap32(vd[H4(i * 8 + k)]);
            v3[k] = bswap32(vs2[H4(i * 8 + k)]);
        }
        sm3c(v1, v2, v3, uimm);
        for (int k = 0; k < 8; k++) {
            vd[i * 8 + k] = bswap32(v1[H4(k)]);
        }
    }
    vext_set_elems_1s(vd_vptr, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}
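
/*
 * GHASH (Zvkg): multiplication in GF(2^128) as used by AES-GCM. GCM stores
 * field elements with a reflected bit order, so the operands are byte-wise
 * bit reversed with brev8() before and after the shift-and-add loop; the
 * reduction polynomial x^128 + x^7 + x^2 + x + 1 then appears as the
 * constant 0x87.
 */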
void HELPER(vghsh_vv)(void *vd_vptr, void *vs1_vptr, void *vs2_vptr,
                      CPURISCVState *env, uint32_t desc)
{
    uint64_t *vd = vd_vptr;
    uint64_t *vs1 = vs1_vptr;
    uint64_t *vs2 = vs2_vptr;
    uint32_t vta = vext_vta(desc);
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        uint64_t Y[2] = {vd[i * 2 + 0], vd[i * 2 + 1]};
        uint64_t H[2] = {brev8(vs2[i * 2 + 0]), brev8(vs2[i * 2 + 1])};
        uint64_t X[2] = {vs1[i * 2 + 0], vs1[i * 2 + 1]};
        uint64_t Z[2] = {0, 0};

        uint64_t S[2] = {brev8(Y[0] ^ X[0]), brev8(Y[1] ^ X[1])};

        for (int j = 0; j < 128; j++) {
            if ((S[j / 64] >> (j % 64)) & 1) {
                Z[0] ^= H[0];
                Z[1] ^= H[1];
            }
            bool reduce = ((H[1] >> 63) & 1);
            H[1] = H[1] << 1 | H[0] >> 63;
            H[0] = H[0] << 1;
            if (reduce) {
                H[0] ^= 0x87;
            }
        }

        vd[i * 2 + 0] = brev8(Z[0]);
        vd[i * 2 + 1] = brev8(Z[1]);
    }
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, env->vl * 4, total_elems * 4);
    env->vstart = 0;
}

void HELPER(vgmul_vv)(void *vd_vptr, void *vs2_vptr, CPURISCVState *env,
                      uint32_t desc)
{
    uint64_t *vd = vd_vptr;
    uint64_t *vs2 = vs2_vptr;
    uint32_t vta = vext_vta(desc);
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        uint64_t Y[2] = {brev8(vd[i * 2 + 0]), brev8(vd[i * 2 + 1])};
        uint64_t H[2] = {brev8(vs2[i * 2 + 0]), brev8(vs2[i * 2 + 1])};
        uint64_t Z[2] = {0, 0};

        for (int j = 0; j < 128; j++) {
            if ((Y[j / 64] >> (j % 64)) & 1) {
                Z[0] ^= H[0];
                Z[1] ^= H[1];
            }
            bool reduce = ((H[1] >> 63) & 1);
            H[1] = H[1] << 1 | H[0] >> 63;
            H[0] = H[0] << 1;
            if (reduce) {
                H[0] ^= 0x87;
            }
        }

        vd[i * 2 + 0] = brev8(Z[0]);
        vd[i * 2 + 1] = brev8(Z[1]);
    }
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, env->vl * 4, total_elems * 4);
    env->vstart = 0;
}

void HELPER(vsm4k_vi)(void *vd, void *vs2, uint32_t uimm5, CPURISCVState *env,
                      uint32_t desc)
{
    const uint32_t egs = 4;
    uint32_t rnd = uimm5 & 0x7;
    uint32_t group_start = env->vstart / egs;
    uint32_t group_end = env->vl / egs;
    uint32_t esz = sizeof(uint32_t);
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = group_start; i < group_end; ++i) {
        uint32_t vstart = i * egs;
        uint32_t vend = (i + 1) * egs;
        uint32_t rk[4] = {0};
        uint32_t tmp[8] = {0};

        for (uint32_t j = vstart; j < vend; ++j) {
            rk[j - vstart] = *((uint32_t *)vs2 + H4(j));
        }

        for (uint32_t j = 0; j < egs; ++j) {
            tmp[j] = rk[j];
        }

        for (uint32_t j = 0; j < egs; ++j) {
            uint32_t b, s;
            b = tmp[j + 1] ^ tmp[j + 2] ^ tmp[j + 3] ^ sm4_ck[rnd * 4 + j];

            s = sm4_subword(b);

            tmp[j + 4] = tmp[j] ^ (s ^ rol32(s, 13) ^ rol32(s, 23));
        }

        for (uint32_t j = vstart; j < vend; ++j) {
            *((uint32_t *)vd + H4(j)) = tmp[egs + (j - vstart)];
        }
    }

    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vext_vta(desc), env->vl * esz, total_elems * esz);
}

static void do_sm4_round(uint32_t *rk, uint32_t *buf)
{
    const uint32_t egs = 4;
    uint32_t s, b;

    for (uint32_t j = egs; j < egs * 2; ++j) {
        b = buf[j - 3] ^ buf[j - 2] ^ buf[j - 1] ^ rk[j - 4];

        s = sm4_subword(b);

        buf[j] = buf[j - 4] ^ (s ^ rol32(s, 2) ^ rol32(s, 10) ^ rol32(s, 18) ^
                               rol32(s, 24));
    }
}

void HELPER(vsm4r_vv)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
{
    const uint32_t egs = 4;
    uint32_t group_start = env->vstart / egs;
    uint32_t group_end = env->vl / egs;
    uint32_t esz = sizeof(uint32_t);
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = group_start; i < group_end; ++i) {
        uint32_t vstart = i * egs;
        uint32_t vend = (i + 1) * egs;
        uint32_t rk[4] = {0};
        uint32_t tmp[8] = {0};

        for (uint32_t j = vstart; j < vend; ++j) {
            rk[j - vstart] = *((uint32_t *)vs2 + H4(j));
        }

        for (uint32_t j = vstart; j < vend; ++j) {
            tmp[j - vstart] = *((uint32_t *)vd + H4(j));
        }

        do_sm4_round(rk, tmp);

        for (uint32_t j = vstart; j < vend; ++j) {
            *((uint32_t *)vd + H4(j)) = tmp[egs + (j - vstart)];
        }
    }

    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vext_vta(desc), env->vl * esz, total_elems * esz);
}

void HELPER(vsm4r_vs)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
{
    const uint32_t egs = 4;
    uint32_t group_start = env->vstart / egs;
    uint32_t group_end = env->vl / egs;
    uint32_t esz = sizeof(uint32_t);
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);

    VSTART_CHECK_EARLY_EXIT(env);

    for (uint32_t i = group_start; i < group_end; ++i) {
        uint32_t vstart = i * egs;
        uint32_t vend = (i + 1) * egs;
        uint32_t rk[4] = {0};
        uint32_t tmp[8] = {0};

        for (uint32_t j = 0; j < egs; ++j) {
            rk[j] = *((uint32_t *)vs2 + H4(j));
        }

        for (uint32_t j = vstart; j < vend; ++j) {
            tmp[j - vstart] = *((uint32_t *)vd + H4(j));
        }

        do_sm4_round(rk, tmp);

        for (uint32_t j = vstart; j < vend; ++j) {
            *((uint32_t *)vd + H4(j)) = tmp[egs + (j - vstart)];
        }
    }

    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vext_vta(desc), env->vl * esz, total_elems * esz);
}