/*
 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is AES128/192/256 CTR mode optimization implementation. It requires
 * the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/linkage.h>

/*
 * Unaligned load/store used for the caller's in/out buffers; the round-key
 * schedule is loaded with vmovdqa (assumed 16-byte aligned - TODO confirm
 * against the key-expansion code).
 */
#define VMOVDQ vmovdqu

/* xdata0..xdata7: the up-to-eight counter blocks encrypted in parallel */
#define xdata0	%xmm0
#define xdata1	%xmm1
#define xdata2	%xmm2
#define xdata3	%xmm3
#define xdata4	%xmm4
#define xdata5	%xmm5
#define xdata6	%xmm6
#define xdata7	%xmm7
/* xcounter: the CTR counter, held byte-reversed (see byteswap_const) */
#define xcounter %xmm8
/* xbyteswap: vpshufb mask for the byte-order conversion */
#define xbyteswap %xmm9
/* xkey0/4/8/12: round keys cached across main-loop iterations
 * (for AES-128 they actually hold rounds 0/3/6/9 - see do_aes) */
#define xkey0	%xmm10
#define xkey4	%xmm11
#define xkey8	%xmm12
#define xkey12	%xmm13
/* xkeyA/xkeyB: scratch - non-cached round keys, then plaintext blocks */
#define xkeyA	%xmm14
#define xkeyB	%xmm15

/* SysV AMD64 argument registers (args 1..5 of the exported routines) */
#define p_in	%rdi
#define p_iv	%rsi
#define p_keys	%rdx
#define p_out	%rcx
#define num_bytes %r8

#define tmp	%r10
#define	DDQ_DATA	0
#define	XDATA		1
/* key-length selectors: choose 10/12/14 AES rounds in the macros below */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3

.section .rodata
.align 16

/* vpshufb mask reversing all 16 bytes (big-endian <-> little-endian) */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* selects the low 64 bits of the 128-bit counter, for the wrap test */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* adds 1 to the high qword: carry propagation when the low qword wraps */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/* ddq_add_N: adds N to the low qword of the counter */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text

/*
 * setxdata: bind the assembler variable var_xdata to register %xmm<n>,
 * so the .rept loops below can address xdata0..xdata7 by numeric index.
 */
.macro setxdata n
	var_xdata = %xmm\n
.endm

/* club the numeric 'id' to the symbol 'name' (only XDATA is used) */

.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm

/*
 * do_aes num_in_par load_keys key_len
 *
 * Encrypt "by" (= num_in_par, 1..8) consecutive counter blocks and XOR
 * them with the next 16*by bytes from p_in, storing to p_out:
 *   b (by):         number of blocks processed in parallel
 *   k (load_keys):  non-zero => (re)load the cached round keys
 *                   xkey0/xkey4/xkey8/xkey12 from p_keys
 *   key_len (klen): KEY_128/KEY_192/KEY_256 => 10/12/14 rounds
 *
 * This increments p_in, but not p_out.  xcounter is advanced by "by",
 * carrying into the high qword when the low 64 bits wrap.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	/* block 0 is the current counter, byte-reversed back for encryption */
	vpshufb	xbyteswap, xcounter, xdata0

	/*
	 * Blocks 1..by-1: block i = counter + i.  vptest sets ZF when the
	 * low qword of the sum is all-zero, i.e. the 64-bit half wrapped;
	 * then carry into the block's high qword and bump xcounter's high
	 * qword too, so the remaining low-qword adds (and the final counter
	 * update below) stay consistent.
	 */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	/* round 0 whitening XOR; advance xcounter past this whole group */
	vpxor	xkey0, xdata0, xdata0
	vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	/*
	 * From here on, key loads are interleaved with the vaesenc rounds.
	 * The cached registers hold rounds 3/6/9 for AES-128 but 4/8/12 for
	 * AES-192/256, so each ladder picks cached (xkey*) vs scratch
	 * (xkeyA/xkeyB) per key length.
	 */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	/* consume the input now; the XOR stage uses negative offsets */
	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10: final round for AES-128 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	/* AES-192/256 only: rounds 11 and up */
	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				/* key 12: final round for AES-192 */
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		/* AES-256 only: rounds 13 and 14 */
		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14: final round for AES-256 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/*
	 * XOR the encrypted counter blocks with the input, two blocks per
	 * iteration.  p_in was already advanced above, hence the negative
	 * offsets.  xkeyA/xkeyB are dead here and reused as input scratch.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd "by": one leftover block to XOR */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	/* store the "by" output blocks */
	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm

/* do_aes with round-key (re)load */
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm

/* do_aes reusing the already-cached xkey0/4/8/12 */
.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm

/*
 * do_aes_ctrmain key_len - main body of the AES CTR "by8" routines.
 *
 * Encrypts the leading num_bytes mod 128 (i.e. 1..7 leftover blocks)
 * first, loading the round keys along the way, then loops over full
 * groups of 8 blocks with the cached keys.  On exit the incremented
 * counter is written back to (p_iv) as the updated IV.
 *
 * NOTE(review): the num_bytes < 16 case jumps straight to the return
 * path, which stores xcounter (byte-swapped via xbyteswap) to (p_iv)
 * before either register has been initialized - callers presumably
 * always pass at least one full block; confirm against the glue code.
 */
.macro do_aes_ctrmain key_len
	cmp	$16, num_bytes
	jb	.Ldo_return2\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	/* tmp = num_bytes mod 128 = bytes in the leading partial group */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* 1..7 leftover blocks (16 <= tmp <= 112): binary dispatch */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	/* round num_bytes down to a multiple of 128 for the main loop */
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	/* no partial group: preload the cached round keys for the loop */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a multiple of 8 blocks (128 bytes) and > 0 */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV, byte-reversed back to its stored form */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
	ret
.endm

/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
	/* expand the AES-128 (10-round) variant of the common CTR body */
	do_aes_ctrmain KEY_128

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
	/* expand the AES-192 (12-round) variant of the common CTR body */
	do_aes_ctrmain KEY_192

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
	/* expand the AES-256 (14-round) variant of the common CTR body */
	do_aes_ctrmain KEY_256

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)