/*
 * x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
 * including ChaCha20 (RFC7539)
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/fpu/api.h>
#include <asm/simd.h>

#define CHACHA_STATE_ALIGN 16

asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
					unsigned int len, int nrounds);
asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
#ifdef CONFIG_AS_AVX2
asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
static bool chacha_use_avx2;
#ifdef CONFIG_AS_AVX512
asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
					   unsigned int len, int nrounds);
static bool chacha_use_avx512vl;
#endif
#endif

static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
{
	len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
	return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
}

static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
{
#ifdef CONFIG_AS_AVX2
#ifdef CONFIG_AS_AVX512
	if (chacha_use_avx512vl) {
		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
			chacha_8block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			bytes -= CHACHA_BLOCK_SIZE * 8;
			src += CHACHA_BLOCK_SIZE * 8;
			dst += CHACHA_BLOCK_SIZE * 8;
			state[12] += 8;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 4) {
			chacha_8block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state[12] += chacha_advance(bytes, 8);
			return;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 2) {
			chacha_4block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state[12] += chacha_advance(bytes, 4);
			return;
		}
		if (bytes) {
			chacha_2block_xor_avx512vl(state, dst, src, bytes,
						   nrounds);
			state[12] += chacha_advance(bytes, 2);
			return;
		}
	}
#endif
	if (chacha_use_avx2) {
		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
			bytes -= CHACHA_BLOCK_SIZE * 8;
			src += CHACHA_BLOCK_SIZE * 8;
			dst += CHACHA_BLOCK_SIZE * 8;
			state[12] += 8;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 4) {
			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
			state[12] += chacha_advance(bytes, 8);
			return;
		}
		if (bytes > CHACHA_BLOCK_SIZE * 2) {
			chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
			state[12] += chacha_advance(bytes, 4);
			return;
		}
		if (bytes > CHACHA_BLOCK_SIZE) {
			chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
			state[12] += chacha_advance(bytes, 2);
			return;
		}
	}
#endif
	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
		bytes -= CHACHA_BLOCK_SIZE * 4;
		src += CHACHA_BLOCK_SIZE * 4;
		dst += CHACHA_BLOCK_SIZE * 4;
		state[12] += 4;
	}
	if (bytes > CHACHA_BLOCK_SIZE) {
		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
		state[12] += chacha_advance(bytes, 4);
		return;
	}
	if (bytes) {
		chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
		state[12]++;
	}
}
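
/*
 * Walk the request in chunks, XOR'ing the ChaCha keystream into the data.
 * Intermediate steps are rounded down to the walk stride so the keystream
 * stays block aligned; only the final step may cover a partial block.
 * Because the caller holds the FPU across the whole walk, the FPU section
 * is briefly exited roughly every 4096 bytes to keep preemption latency
 * bounded on large requests.  state_buf is over-allocated so the state
 * can be aligned to 16 bytes with PTR_ALIGN, as the stack itself is only
 * guaranteed 8-byte alignment.
 */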
static int chacha_simd_stream_xor(struct skcipher_walk *walk,
				  struct chacha_ctx *ctx, u8 *iv)
{
	u32 *state, state_buf[16 + 2] __aligned(8);
	int next_yield = 4096; /* bytes until next FPU yield */
	int err = 0;

	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);

	crypto_chacha_init(state, ctx, iv);

	while (walk->nbytes > 0) {
		unsigned int nbytes = walk->nbytes;

		if (nbytes < walk->total) {
			nbytes = round_down(nbytes, walk->stride);
			next_yield -= nbytes;
		}

		chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr,
			      nbytes, ctx->nrounds);

		if (next_yield <= 0) {
			/* temporarily allow preemption */
			kernel_fpu_end();
			kernel_fpu_begin();
			next_yield = 4096;
		}

		err = skcipher_walk_done(walk, walk->nbytes - nbytes);
	}

	return err;
}

static int chacha_simd(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
	struct skcipher_walk walk;
	int err;

	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
		return crypto_chacha_crypt(req);

	err = skcipher_walk_virt(&walk, req, true);
	if (err)
		return err;

	kernel_fpu_begin();
	err = chacha_simd_stream_xor(&walk, ctx, req->iv);
	kernel_fpu_end();
	return err;
}

static int xchacha_simd(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
	struct skcipher_walk walk;
	struct chacha_ctx subctx;
	u32 *state, state_buf[16 + 2] __aligned(8);
	u8 real_iv[16];
	int err;

	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
		return crypto_xchacha_crypt(req);

	err = skcipher_walk_virt(&walk, req, true);
	if (err)
		return err;

	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
	crypto_chacha_init(state, ctx, req->iv);

	kernel_fpu_begin();

	hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
	subctx.nrounds = ctx->nrounds;

	memcpy(&real_iv[0], req->iv + 24, 8);
	memcpy(&real_iv[8], req->iv + 16, 8);
	err = chacha_simd_stream_xor(&walk, &subctx, real_iv);

	kernel_fpu_end();

	return err;
}
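
/*
 * The registered algorithms: ChaCha20 plus the XChaCha20 and XChaCha12
 * variants.  cra_priority 300 ranks these drivers above the generic C
 * implementations, a blocksize of 1 marks them as stream ciphers, and
 * chunksize tells the skcipher walk code to split requests on ChaCha
 * block boundaries.
 */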
static struct skcipher_alg algs[] = {
	{
		.base.cra_name		= "chacha20",
		.base.cra_driver_name	= "chacha20-simd",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= CHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= crypto_chacha20_setkey,
		.encrypt		= chacha_simd,
		.decrypt		= chacha_simd,
	}, {
		.base.cra_name		= "xchacha20",
		.base.cra_driver_name	= "xchacha20-simd",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= crypto_chacha20_setkey,
		.encrypt		= xchacha_simd,
		.decrypt		= xchacha_simd,
	}, {
		.base.cra_name		= "xchacha12",
		.base.cra_driver_name	= "xchacha12-simd",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= crypto_chacha12_setkey,
		.encrypt		= xchacha_simd,
		.decrypt		= xchacha_simd,
	},
};

static int __init chacha_simd_mod_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_SSSE3))
		return -ENODEV;

#ifdef CONFIG_AS_AVX2
	chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
			  boot_cpu_has(X86_FEATURE_AVX2) &&
			  cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
#ifdef CONFIG_AS_AVX512
	chacha_use_avx512vl = chacha_use_avx2 &&
			      boot_cpu_has(X86_FEATURE_AVX512VL) &&
			      boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
#endif
#endif
	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
}

static void __exit chacha_simd_mod_fini(void)
{
	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}

module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-simd");