// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
 * including ChaCha20 (RFC7539)
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <crypto/algapi.h>
#include <crypto/internal/chacha.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sizes.h>
#include <asm/simd.h>

asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
					unsigned int len, int nrounds);
asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);

asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);

asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);

static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);

/*
 * Number of blocks the block counter must advance for a request of @len
 * bytes handled by a routine that processes up to @maxblocks blocks at a
 * time: a trailing partial block still consumes one full counter increment.
 */
static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
{
	len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
	return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
}

static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
{
	if (IS_ENABLED(CONFIG_AS_AVX512) &&
	    static_branch_likely(&chacha_use_avx512vl)) {
		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
			chacha_8block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			bytes -= CHACHA_BLOCK_SIZE * 8;
			src += CHACHA_BLOCK_SIZE * 8;
			dst += CHACHA_BLOCK_SIZE * 8;
			state[12] += 8;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 4) {
			chacha_8block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state[12] += chacha_advance(bytes, 8);
			return;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 2) {
			chacha_4block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state[12] += chacha_advance(bytes, 4);
			return;
		}
		if (bytes) {
			chacha_2block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state[12] += chacha_advance(bytes, 2);
			return;
		}
	}

	if (static_branch_likely(&chacha_use_avx2)) {
		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
			bytes -= CHACHA_BLOCK_SIZE * 8;
			src += CHACHA_BLOCK_SIZE * 8;
			dst += CHACHA_BLOCK_SIZE * 8;
			state[12] += 8;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 4) {
			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
			state[12] += chacha_advance(bytes, 8);
			return;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 2) {
			chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
			state[12] += chacha_advance(bytes, 4);
			return;
		}
		if (bytes > CHACHA_BLOCK_SIZE) {
			chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
			state[12] += chacha_advance(bytes, 2);
			return;
		}
	}

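	/*
	 * SSSE3 path: taken when the wider AVX2/AVX-512VL routines are not
	 * available, and to mop up a final block of at most CHACHA_BLOCK_SIZE
	 * bytes left over by the AVX2 path above.
	 */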
	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
		bytes -= CHACHA_BLOCK_SIZE * 4;
		src += CHACHA_BLOCK_SIZE * 4;
		dst += CHACHA_BLOCK_SIZE * 4;
		state[12] += 4;
	}
	if (bytes > CHACHA_BLOCK_SIZE) {
		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
		state[12] += chacha_advance(bytes, 4);
		return;
	}
	if (bytes) {
		chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
		state[12]++;
	}
}

void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
		hchacha_block_generic(state, stream, nrounds);
	} else {
		kernel_fpu_begin();
		hchacha_block_ssse3(state, stream, nrounds);
		kernel_fpu_end();
	}
}
EXPORT_SYMBOL(hchacha_block_arch);

void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
	chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);

void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
		       int nrounds)
{
	if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
	    bytes <= CHACHA_BLOCK_SIZE)
		return chacha_crypt_generic(state, dst, src, bytes, nrounds);

	/*
	 * Process at most 4 KiB per FPU section so that preemption is not
	 * held off for too long between kernel_fpu_begin() and
	 * kernel_fpu_end().
	 */
	do {
		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);

		kernel_fpu_begin();
		chacha_dosimd(state, dst, src, todo, nrounds);
		kernel_fpu_end();

		bytes -= todo;
		src += todo;
		dst += todo;
	} while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);

static int chacha_simd_stream_xor(struct skcipher_request *req,
				  const struct chacha_ctx *ctx, const u8 *iv)
{
	u32 state[CHACHA_STATE_WORDS] __aligned(8);
	struct skcipher_walk walk;
	int err;

	err = skcipher_walk_virt(&walk, req, false);

	chacha_init_generic(state, ctx->key, iv);

	while (walk.nbytes > 0) {
		unsigned int nbytes = walk.nbytes;

		if (nbytes < walk.total)
			nbytes = round_down(nbytes, walk.stride);

		if (!static_branch_likely(&chacha_use_simd) ||
		    !crypto_simd_usable()) {
			chacha_crypt_generic(state, walk.dst.virt.addr,
					     walk.src.virt.addr, nbytes,
					     ctx->nrounds);
		} else {
			kernel_fpu_begin();
			chacha_dosimd(state, walk.dst.virt.addr,
				      walk.src.virt.addr, nbytes,
				      ctx->nrounds);
			kernel_fpu_end();
		}
		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
	}

	return err;
}

static int chacha_simd(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);

	return chacha_simd_stream_xor(req, ctx, req->iv);
}

static int xchacha_simd(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
	u32 state[CHACHA_STATE_WORDS] __aligned(8);
	struct chacha_ctx subctx;
	u8 real_iv[16];

	chacha_init_generic(state, ctx->key, req->iv);

	if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
		kernel_fpu_begin();
		hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
		kernel_fpu_end();
	} else {
		hchacha_block_generic(state, subctx.key, ctx->nrounds);
	}
	subctx.nrounds = ctx->nrounds;

	/* Inner ChaCha IV: iv[24..31] (counter words), then iv[16..23]. */
	memcpy(&real_iv[0], req->iv + 24, 8);
	memcpy(&real_iv[8], req->iv + 16, 8);
	return chacha_simd_stream_xor(req, &subctx, real_iv);
}
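
/*
 * skcipher templates for chacha20, xchacha20 and xchacha12.  Priority 300
 * ranks these above the generic C implementations (priority 100), so the
 * crypto API prefers the SIMD versions whenever this module is loaded.
 */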
static struct skcipher_alg algs[] = {
	{
		.base.cra_name		= "chacha20",
		.base.cra_driver_name	= "chacha20-simd",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= CHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha20_setkey,
		.encrypt		= chacha_simd,
		.decrypt		= chacha_simd,
	}, {
		.base.cra_name		= "xchacha20",
		.base.cra_driver_name	= "xchacha20-simd",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha20_setkey,
		.encrypt		= xchacha_simd,
		.decrypt		= xchacha_simd,
	}, {
		.base.cra_name		= "xchacha12",
		.base.cra_driver_name	= "xchacha12-simd",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha12_setkey,
		.encrypt		= xchacha_simd,
		.decrypt		= xchacha_simd,
	},
};

static int __init chacha_simd_mod_init(void)
{
	/* SSSE3 is the baseline requirement; without it, register nothing. */
	if (!boot_cpu_has(X86_FEATURE_SSSE3))
		return 0;

	static_branch_enable(&chacha_use_simd);

	if (boot_cpu_has(X86_FEATURE_AVX) &&
	    boot_cpu_has(X86_FEATURE_AVX2) &&
	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
		static_branch_enable(&chacha_use_avx2);

		if (IS_ENABLED(CONFIG_AS_AVX512) &&
		    boot_cpu_has(X86_FEATURE_AVX512VL) &&
		    boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
			static_branch_enable(&chacha_use_avx512vl);
	}
	return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ?
		crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}

static void __exit chacha_simd_mod_fini(void)
{
	if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3))
		crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}

module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-simd");