/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-CCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in rfc8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto

/*
 * Define one symbol per vector register name (.Lv<N>.4s = N) so that the
 * sm4e macro below can turn a register operand into its 5-bit number.
 */
.irp b, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

/*
 * Emit the SM4E instruction as a raw opcode: Vn goes in bits [9:5], Vd in
 * bits [4:0].
 * NOTE(review): presumably hand-encoded so that assemblers without the
 * mnemonic can still build this file - confirm.
 */
.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

/* Register macros */

/* Running CBC-MAC value. */
#define RMAC	v16

/* Helper macros. */

/*
 * inc_le128(vctr): materialise the current counter into vctr and advance it.
 *
 * x7 and x8 hold the two 64-bit halves of the 128-bit counter after the
 * byte reversal done by the callers (x7: upper half, x8: lower half).
 * vctr receives x7 in d[0] and x8 in d[1], each lane byte-reversed again by
 * rev64, i.e. the counter value as it was *before* the increment.
 * x7:x8 is then incremented by one, with the carry out of x8 added to x7.
 * Clobbers the condition flags (adds/adc).
 */
#define inc_le128(vctr)					\
		mov		vctr.d[1], x8;		\
		mov		vctr.d[0], x7;		\
		adds		x8, x8, #1;		\
		rev64		vctr.16b, vctr.16b;	\
		adc		x7, x7, xzr;


/*
 * sm4_ce_cbcmac_update - fold whole 16-byte blocks into the running MAC.
 *
 * For every block the MAC register is first run through SM4_CRYPT_BLK and
 * then XORed with the block, so the value written back to mac has the last
 * block XORed in but has not been run through SM4_CRYPT_BLK again.
 * Blocks are consumed four at a time while at least four remain, then one
 * at a time.
 *
 * NOTE(review): SM4_PREPARE / SM4_CRYPT_BLK come from sm4-ce-asm.h, which is
 * not visible here; presumably they load the round keys and encrypt the
 * named register in place - confirm against that header.
 */
.align 3
SYM_FUNC_START(sm4_ce_cbcmac_update)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: mac
	 *   x2: src
	 *   w3: nblocks
	 */

	/*
	 * Nothing to do for an empty input. Without this check a zero count
	 * reaches the 1x loop, where "sub w3, w3, #1" wraps to 0xffffffff and
	 * the cbnz keeps iterating, reading far beyond src.
	 */
	cbz		w3, .Lcbcmac_ret

	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x1]

.Lcbcmac_loop_4x:
	cmp		w3, #4
	blt		.Lcbcmac_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	/* The four blocks are chained serially through RMAC. */
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v3.16b

	cbz		w3, .Lcbcmac_end
	b		.Lcbcmac_loop_4x

.Lcbcmac_loop_1x:
	/* Entered with 1..3 blocks left (w3 != 0 is guaranteed here). */
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v0.16b

	cbnz		w3, .Lcbcmac_loop_1x

.Lcbcmac_end:
	st1		{RMAC.16b}, [x1]
.Lcbcmac_ret:
	ret
SYM_FUNC_END(sm4_ce_cbcmac_update)

/*
 * sm4_ce_ccm_final - produce the final tag value in place.
 *
 * Runs SM4_CRYPT_BLK2 over the MAC and the ctr0 block together, XORs the two
 * results and stores the outcome back to mac. ctr0 itself is only read.
 */
.align 3
SYM_FUNC_START(sm4_ce_ccm_final)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: ctr0 (big endian, 128 bit)
	 *   x2: mac
	 */
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x2]
	ld1		{v0.16b}, [x1]

	SM4_CRYPT_BLK2(RMAC, v0)

	/* en-/decrypt the mac with ctr0 */
	eor		RMAC.16b, RMAC.16b, v0.16b
	st1		{RMAC.16b}, [x2]

	ret
SYM_FUNC_END(sm4_ce_ccm_final)

/*
 * sm4_ce_ccm_enc - CTR-encrypt src into dst while MACing the plaintext.
 *
 * Full blocks: each counter block and the MAC are run through
 * SM4_CRYPT_BLK2 together; the input block is then XORed into both, giving
 * the output block and the updated MAC. When nbytes is a multiple of 16 the
 * MAC and the advanced counter are written back to mac / ctr.
 *
 * Trailing partial block (nbytes % 16 != 0): the processed MAC is stored to
 * mac in full, then its leading bytes are overwritten one at a time with
 * MAC ^ input. On this path the counter is NOT written back to ctr.
 *
 * nbytes == 0 returns immediately without touching dst, ctr or mac.
 *
 * Uses w0, w6, x7, x8, w9 as scratch in addition to the vector registers.
 *
 * NOTE(review): declared with SYM_TYPED_FUNC_START (linux/cfi_types.h),
 * presumably because it is called through a function pointer - confirm
 * against the C glue code.
 */
.align 3
SYM_TYPED_FUNC_START(sm4_ce_ccm_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: mac
	 */

	/*
	 * Empty input: bail out before anything is modified. Without this
	 * check a zero length falls into the tail path, which rewrites mac
	 * and then loops with w4 wrapped to 0xffffffff by the subs, storing
	 * bytes far beyond dst and mac.
	 */
	cbz		w4, .Lccm_enc_ret

	SM4_PREPARE(x0)

	/* Load the counter halves and byte-reverse them for arithmetic. */
	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

	ld1		{RMAC.16b}, [x5]

.Lccm_enc_loop_4x:
	cmp		w4, #(4 * 16)
	blt		.Lccm_enc_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc_le128(v8)			/* +0 */
	inc_le128(v9)			/* +1 */
	inc_le128(v10)			/* +2 */
	inc_le128(v11)			/* +3 */

	ld1		{v0.16b-v3.16b}, [x2], #64

	/* v8..v11 become output blocks; RMAC absorbs the input blocks. */
	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK2(v9, RMAC)
	eor		v9.16b, v9.16b, v1.16b
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK2(v10, RMAC)
	eor		v10.16b, v10.16b, v2.16b
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK2(v11, RMAC)
	eor		v11.16b, v11.16b, v3.16b
	eor		RMAC.16b, RMAC.16b, v3.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	cbz		w4, .Lccm_enc_end
	b		.Lccm_enc_loop_4x

.Lccm_enc_loop_1x:
	cmp		w4, #16
	blt		.Lccm_enc_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc_le128(v8)

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v0.16b

	st1		{v8.16b}, [x1], #16

	cbz		w4, .Lccm_enc_end
	b		.Lccm_enc_loop_1x

.Lccm_enc_tail:
	/* 1..15 bytes remain (w4 != 0 is guaranteed here). */

	/* construct CTRs */
	inc_le128(v8)

	SM4_CRYPT_BLK2(RMAC, v8)

	/* store new MAC */
	st1		{RMAC.16b}, [x5]

.Lccm_enc_tail_loop:
	ldrb		w0, [x2], #1		/* get 1 byte from input */
	umov		w9, v8.b[0]		/* get top crypted CTR byte */
	umov		w6, RMAC.b[0]		/* get top MAC byte */

	eor		w9, w9, w0		/* w9 = CTR ^ input */
	eor		w6, w6, w0		/* w6 = MAC ^ input */

	strb		w9, [x1], #1		/* store out byte */
	strb		w6, [x5], #1		/* store MAC byte */

	subs		w4, w4, #1
	beq		.Lccm_enc_ret

	/* shift out one byte */
	ext		RMAC.16b, RMAC.16b, RMAC.16b, #1
	ext		v8.16b, v8.16b, v8.16b, #1

	b		.Lccm_enc_tail_loop

.Lccm_enc_end:
	/* store new MAC */
	st1		{RMAC.16b}, [x5]

	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

.Lccm_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_ccm_enc)

/*
 * sm4_ce_ccm_dec - CTR-decrypt src into dst while MACing the plaintext.
 *
 * Same structure as sm4_ce_ccm_enc, with one difference: the MAC absorbs
 * the *output* of the XOR with the processed counter block (the recovered
 * data) rather than the input.
 *
 * When nbytes is a multiple of 16 the MAC and the advanced counter are
 * written back to mac / ctr. With a trailing partial block the processed
 * MAC is stored in full and its leading bytes are overwritten with
 * MAC ^ output; the counter is NOT written back on that path.
 *
 * nbytes == 0 returns immediately without touching dst, ctr or mac.
 *
 * Uses w0, w6, x7, x8, w9 as scratch in addition to the vector registers.
 *
 * NOTE(review): declared with SYM_TYPED_FUNC_START (linux/cfi_types.h),
 * presumably because it is called through a function pointer - confirm
 * against the C glue code.
 */
.align 3
SYM_TYPED_FUNC_START(sm4_ce_ccm_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: mac
	 */

	/*
	 * Empty input: bail out before anything is modified. Without this
	 * check a zero length falls into the tail path, which rewrites mac
	 * and then loops with w4 wrapped to 0xffffffff by the subs, storing
	 * bytes far beyond dst and mac.
	 */
	cbz		w4, .Lccm_dec_ret

	SM4_PREPARE(x0)

	/* Load the counter halves and byte-reverse them for arithmetic. */
	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

	ld1		{RMAC.16b}, [x5]

.Lccm_dec_loop_4x:
	cmp		w4, #(4 * 16)
	blt		.Lccm_dec_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc_le128(v8)			/* +0 */
	inc_le128(v9)			/* +1 */
	inc_le128(v10)			/* +2 */
	inc_le128(v11)			/* +3 */

	ld1		{v0.16b-v3.16b}, [x2], #64

	/* v8..v11 become output blocks, which RMAC then absorbs. */
	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v8.16b
	SM4_CRYPT_BLK2(v9, RMAC)
	eor		v9.16b, v9.16b, v1.16b
	eor		RMAC.16b, RMAC.16b, v9.16b
	SM4_CRYPT_BLK2(v10, RMAC)
	eor		v10.16b, v10.16b, v2.16b
	eor		RMAC.16b, RMAC.16b, v10.16b
	SM4_CRYPT_BLK2(v11, RMAC)
	eor		v11.16b, v11.16b, v3.16b
	eor		RMAC.16b, RMAC.16b, v11.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	cbz		w4, .Lccm_dec_end
	b		.Lccm_dec_loop_4x

.Lccm_dec_loop_1x:
	cmp		w4, #16
	blt		.Lccm_dec_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc_le128(v8)

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK2(v8, RMAC)
	eor		v8.16b, v8.16b, v0.16b
	eor		RMAC.16b, RMAC.16b, v8.16b

	st1		{v8.16b}, [x1], #16

	cbz		w4, .Lccm_dec_end
	b		.Lccm_dec_loop_1x

.Lccm_dec_tail:
	/* 1..15 bytes remain (w4 != 0 is guaranteed here). */

	/* construct CTRs */
	inc_le128(v8)

	SM4_CRYPT_BLK2(RMAC, v8)

	/* store new MAC */
	st1		{RMAC.16b}, [x5]

.Lccm_dec_tail_loop:
	ldrb		w0, [x2], #1		/* get 1 byte from input */
	umov		w9, v8.b[0]		/* get top crypted CTR byte */
	umov		w6, RMAC.b[0]		/* get top MAC byte */

	eor		w9, w9, w0		/* w9 = CTR ^ input */
	eor		w6, w6, w9		/* w6 = MAC ^ output */

	strb		w9, [x1], #1		/* store out byte */
	strb		w6, [x5], #1		/* store MAC byte */

	subs		w4, w4, #1
	beq		.Lccm_dec_ret

	/* shift out one byte */
	ext		RMAC.16b, RMAC.16b, RMAC.16b, #1
	ext		v8.16b, v8.16b, v8.16b, #1

	b		.Lccm_dec_tail_loop

.Lccm_dec_end:
	/* store new MAC */
	st1		{RMAC.16b}, [x5]

	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

.Lccm_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_ccm_dec)