/*
 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is AES128/192/256 CTR mode optimization implementation. It requires
 * the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62 * 63 */ 64 65#include <linux/linkage.h> 66#include <asm/inst.h> 67 68#define CONCAT(a,b) a##b 69#define VMOVDQ vmovdqu 70 71#define xdata0 %xmm0 72#define xdata1 %xmm1 73#define xdata2 %xmm2 74#define xdata3 %xmm3 75#define xdata4 %xmm4 76#define xdata5 %xmm5 77#define xdata6 %xmm6 78#define xdata7 %xmm7 79#define xcounter %xmm8 80#define xbyteswap %xmm9 81#define xkey0 %xmm10 82#define xkey4 %xmm11 83#define xkey8 %xmm12 84#define xkey12 %xmm13 85#define xkeyA %xmm14 86#define xkeyB %xmm15 87 88#define p_in %rdi 89#define p_iv %rsi 90#define p_keys %rdx 91#define p_out %rcx 92#define num_bytes %r8 93 94#define tmp %r10 95#define DDQ(i) CONCAT(ddq_add_,i) 96#define XMM(i) CONCAT(%xmm, i) 97#define DDQ_DATA 0 98#define XDATA 1 99#define KEY_128 1 100#define KEY_192 2 101#define KEY_256 3 102 103.section .rodata 104.align 16 105 106byteswap_const: 107 .octa 0x000102030405060708090A0B0C0D0E0F 108ddq_low_msk: 109 .octa 0x0000000000000000FFFFFFFFFFFFFFFF 110ddq_high_add_1: 111 .octa 0x00000000000000010000000000000000 112ddq_add_1: 113 .octa 0x00000000000000000000000000000001 114ddq_add_2: 115 .octa 0x00000000000000000000000000000002 116ddq_add_3: 117 .octa 0x00000000000000000000000000000003 118ddq_add_4: 119 .octa 0x00000000000000000000000000000004 120ddq_add_5: 121 .octa 0x00000000000000000000000000000005 122ddq_add_6: 123 .octa 0x00000000000000000000000000000006 124ddq_add_7: 125 .octa 0x00000000000000000000000000000007 126ddq_add_8: 127 .octa 0x00000000000000000000000000000008 128 129.text 130 131/* generate a unique variable for ddq_add_x */ 132 133.macro setddq n 134 var_ddq_add = DDQ(\n) 135.endm 136 137/* generate a unique variable for xmm register */ 138.macro setxdata n 139 var_xdata = XMM(\n) 140.endm 141 142/* club the numeric 'id' to the symbol 'name' */ 143 144.macro club name, id 145.altmacro 146 .if \name == DDQ_DATA 147 setddq %\id 148 .elseif \name == XDATA 149 setxdata %\id 150 .endif 151.noaltmacro 152.endm 153 154/* 155 * do_aes num_in_par 
load_keys key_len 156 * This increments p_in, but not p_out 157 */ 158.macro do_aes b, k, key_len 159 .set by, \b 160 .set load_keys, \k 161 .set klen, \key_len 162 163 .if (load_keys) 164 vmovdqa 0*16(p_keys), xkey0 165 .endif 166 167 vpshufb xbyteswap, xcounter, xdata0 168 169 .set i, 1 170 .rept (by - 1) 171 club DDQ_DATA, i 172 club XDATA, i 173 vpaddq var_ddq_add(%rip), xcounter, var_xdata 174 vptest ddq_low_msk(%rip), var_xdata 175 jnz 1f 176 vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata 177 vpaddq ddq_high_add_1(%rip), xcounter, xcounter 178 1: 179 vpshufb xbyteswap, var_xdata, var_xdata 180 .set i, (i +1) 181 .endr 182 183 vmovdqa 1*16(p_keys), xkeyA 184 185 vpxor xkey0, xdata0, xdata0 186 club DDQ_DATA, by 187 vpaddq var_ddq_add(%rip), xcounter, xcounter 188 vptest ddq_low_msk(%rip), xcounter 189 jnz 1f 190 vpaddq ddq_high_add_1(%rip), xcounter, xcounter 191 1: 192 193 .set i, 1 194 .rept (by - 1) 195 club XDATA, i 196 vpxor xkey0, var_xdata, var_xdata 197 .set i, (i +1) 198 .endr 199 200 vmovdqa 2*16(p_keys), xkeyB 201 202 .set i, 0 203 .rept by 204 club XDATA, i 205 vaesenc xkeyA, var_xdata, var_xdata /* key 1 */ 206 .set i, (i +1) 207 .endr 208 209 .if (klen == KEY_128) 210 .if (load_keys) 211 vmovdqa 3*16(p_keys), xkeyA 212 .endif 213 .else 214 vmovdqa 3*16(p_keys), xkeyA 215 .endif 216 217 .set i, 0 218 .rept by 219 club XDATA, i 220 vaesenc xkeyB, var_xdata, var_xdata /* key 2 */ 221 .set i, (i +1) 222 .endr 223 224 add $(16*by), p_in 225 226 .if (klen == KEY_128) 227 vmovdqa 4*16(p_keys), xkey4 228 .else 229 .if (load_keys) 230 vmovdqa 4*16(p_keys), xkey4 231 .endif 232 .endif 233 234 .set i, 0 235 .rept by 236 club XDATA, i 237 vaesenc xkeyA, var_xdata, var_xdata /* key 3 */ 238 .set i, (i +1) 239 .endr 240 241 vmovdqa 5*16(p_keys), xkeyA 242 243 .set i, 0 244 .rept by 245 club XDATA, i 246 vaesenc xkey4, var_xdata, var_xdata /* key 4 */ 247 .set i, (i +1) 248 .endr 249 250 .if (klen == KEY_128) 251 .if (load_keys) 252 vmovdqa 6*16(p_keys), 
xkeyB 253 .endif 254 .else 255 vmovdqa 6*16(p_keys), xkeyB 256 .endif 257 258 .set i, 0 259 .rept by 260 club XDATA, i 261 vaesenc xkeyA, var_xdata, var_xdata /* key 5 */ 262 .set i, (i +1) 263 .endr 264 265 vmovdqa 7*16(p_keys), xkeyA 266 267 .set i, 0 268 .rept by 269 club XDATA, i 270 vaesenc xkeyB, var_xdata, var_xdata /* key 6 */ 271 .set i, (i +1) 272 .endr 273 274 .if (klen == KEY_128) 275 vmovdqa 8*16(p_keys), xkey8 276 .else 277 .if (load_keys) 278 vmovdqa 8*16(p_keys), xkey8 279 .endif 280 .endif 281 282 .set i, 0 283 .rept by 284 club XDATA, i 285 vaesenc xkeyA, var_xdata, var_xdata /* key 7 */ 286 .set i, (i +1) 287 .endr 288 289 .if (klen == KEY_128) 290 .if (load_keys) 291 vmovdqa 9*16(p_keys), xkeyA 292 .endif 293 .else 294 vmovdqa 9*16(p_keys), xkeyA 295 .endif 296 297 .set i, 0 298 .rept by 299 club XDATA, i 300 vaesenc xkey8, var_xdata, var_xdata /* key 8 */ 301 .set i, (i +1) 302 .endr 303 304 vmovdqa 10*16(p_keys), xkeyB 305 306 .set i, 0 307 .rept by 308 club XDATA, i 309 vaesenc xkeyA, var_xdata, var_xdata /* key 9 */ 310 .set i, (i +1) 311 .endr 312 313 .if (klen != KEY_128) 314 vmovdqa 11*16(p_keys), xkeyA 315 .endif 316 317 .set i, 0 318 .rept by 319 club XDATA, i 320 /* key 10 */ 321 .if (klen == KEY_128) 322 vaesenclast xkeyB, var_xdata, var_xdata 323 .else 324 vaesenc xkeyB, var_xdata, var_xdata 325 .endif 326 .set i, (i +1) 327 .endr 328 329 .if (klen != KEY_128) 330 .if (load_keys) 331 vmovdqa 12*16(p_keys), xkey12 332 .endif 333 334 .set i, 0 335 .rept by 336 club XDATA, i 337 vaesenc xkeyA, var_xdata, var_xdata /* key 11 */ 338 .set i, (i +1) 339 .endr 340 341 .if (klen == KEY_256) 342 vmovdqa 13*16(p_keys), xkeyA 343 .endif 344 345 .set i, 0 346 .rept by 347 club XDATA, i 348 .if (klen == KEY_256) 349 /* key 12 */ 350 vaesenc xkey12, var_xdata, var_xdata 351 .else 352 vaesenclast xkey12, var_xdata, var_xdata 353 .endif 354 .set i, (i +1) 355 .endr 356 357 .if (klen == KEY_256) 358 vmovdqa 14*16(p_keys), xkeyB 359 360 .set i, 0 361 
.rept by 362 club XDATA, i 363 /* key 13 */ 364 vaesenc xkeyA, var_xdata, var_xdata 365 .set i, (i +1) 366 .endr 367 368 .set i, 0 369 .rept by 370 club XDATA, i 371 /* key 14 */ 372 vaesenclast xkeyB, var_xdata, var_xdata 373 .set i, (i +1) 374 .endr 375 .endif 376 .endif 377 378 .set i, 0 379 .rept (by / 2) 380 .set j, (i+1) 381 VMOVDQ (i*16 - 16*by)(p_in), xkeyA 382 VMOVDQ (j*16 - 16*by)(p_in), xkeyB 383 club XDATA, i 384 vpxor xkeyA, var_xdata, var_xdata 385 club XDATA, j 386 vpxor xkeyB, var_xdata, var_xdata 387 .set i, (i+2) 388 .endr 389 390 .if (i < by) 391 VMOVDQ (i*16 - 16*by)(p_in), xkeyA 392 club XDATA, i 393 vpxor xkeyA, var_xdata, var_xdata 394 .endif 395 396 .set i, 0 397 .rept by 398 club XDATA, i 399 VMOVDQ var_xdata, i*16(p_out) 400 .set i, (i+1) 401 .endr 402.endm 403 404.macro do_aes_load val, key_len 405 do_aes \val, 1, \key_len 406.endm 407 408.macro do_aes_noload val, key_len 409 do_aes \val, 0, \key_len 410.endm 411 412/* main body of aes ctr load */ 413 414.macro do_aes_ctrmain key_len 415 416 cmp $16, num_bytes 417 jb .Ldo_return2\key_len 418 419 vmovdqa byteswap_const(%rip), xbyteswap 420 vmovdqu (p_iv), xcounter 421 vpshufb xbyteswap, xcounter, xcounter 422 423 mov num_bytes, tmp 424 and $(7*16), tmp 425 jz .Lmult_of_8_blks\key_len 426 427 /* 1 <= tmp <= 7 */ 428 cmp $(4*16), tmp 429 jg .Lgt4\key_len 430 je .Leq4\key_len 431 432.Llt4\key_len: 433 cmp $(2*16), tmp 434 jg .Leq3\key_len 435 je .Leq2\key_len 436 437.Leq1\key_len: 438 do_aes_load 1, \key_len 439 add $(1*16), p_out 440 and $(~7*16), num_bytes 441 jz .Ldo_return2\key_len 442 jmp .Lmain_loop2\key_len 443 444.Leq2\key_len: 445 do_aes_load 2, \key_len 446 add $(2*16), p_out 447 and $(~7*16), num_bytes 448 jz .Ldo_return2\key_len 449 jmp .Lmain_loop2\key_len 450 451 452.Leq3\key_len: 453 do_aes_load 3, \key_len 454 add $(3*16), p_out 455 and $(~7*16), num_bytes 456 jz .Ldo_return2\key_len 457 jmp .Lmain_loop2\key_len 458 459.Leq4\key_len: 460 do_aes_load 4, \key_len 461 add 
$(4*16), p_out 462 and $(~7*16), num_bytes 463 jz .Ldo_return2\key_len 464 jmp .Lmain_loop2\key_len 465 466.Lgt4\key_len: 467 cmp $(6*16), tmp 468 jg .Leq7\key_len 469 je .Leq6\key_len 470 471.Leq5\key_len: 472 do_aes_load 5, \key_len 473 add $(5*16), p_out 474 and $(~7*16), num_bytes 475 jz .Ldo_return2\key_len 476 jmp .Lmain_loop2\key_len 477 478.Leq6\key_len: 479 do_aes_load 6, \key_len 480 add $(6*16), p_out 481 and $(~7*16), num_bytes 482 jz .Ldo_return2\key_len 483 jmp .Lmain_loop2\key_len 484 485.Leq7\key_len: 486 do_aes_load 7, \key_len 487 add $(7*16), p_out 488 and $(~7*16), num_bytes 489 jz .Ldo_return2\key_len 490 jmp .Lmain_loop2\key_len 491 492.Lmult_of_8_blks\key_len: 493 .if (\key_len != KEY_128) 494 vmovdqa 0*16(p_keys), xkey0 495 vmovdqa 4*16(p_keys), xkey4 496 vmovdqa 8*16(p_keys), xkey8 497 vmovdqa 12*16(p_keys), xkey12 498 .else 499 vmovdqa 0*16(p_keys), xkey0 500 vmovdqa 3*16(p_keys), xkey4 501 vmovdqa 6*16(p_keys), xkey8 502 vmovdqa 9*16(p_keys), xkey12 503 .endif 504.align 16 505.Lmain_loop2\key_len: 506 /* num_bytes is a multiple of 8 and >0 */ 507 do_aes_noload 8, \key_len 508 add $(8*16), p_out 509 sub $(8*16), num_bytes 510 jne .Lmain_loop2\key_len 511 512.Ldo_return2\key_len: 513 /* return updated IV */ 514 vpshufb xbyteswap, xcounter, xcounter 515 vmovdqu xcounter, (p_iv) 516 ret 517.endm 518 519/* 520 * routine to do AES128 CTR enc/decrypt "by8" 521 * XMM registers are clobbered. 522 * Saving/restoring must be done at a higher level 523 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out, 524 * unsigned int num_bytes) 525 */ 526ENTRY(aes_ctr_enc_128_avx_by8) 527 /* call the aes main loop */ 528 do_aes_ctrmain KEY_128 529 530ENDPROC(aes_ctr_enc_128_avx_by8) 531 532/* 533 * routine to do AES192 CTR enc/decrypt "by8" 534 * XMM registers are clobbered. 
535 * Saving/restoring must be done at a higher level 536 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out, 537 * unsigned int num_bytes) 538 */ 539ENTRY(aes_ctr_enc_192_avx_by8) 540 /* call the aes main loop */ 541 do_aes_ctrmain KEY_192 542 543ENDPROC(aes_ctr_enc_192_avx_by8) 544 545/* 546 * routine to do AES256 CTR enc/decrypt "by8" 547 * XMM registers are clobbered. 548 * Saving/restoring must be done at a higher level 549 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out, 550 * unsigned int num_bytes) 551 */ 552ENTRY(aes_ctr_enc_256_avx_by8) 553 /* call the aes main loop */ 554 do_aes_ctrmain KEY_256 555 556ENDPROC(aes_ctr_enc_256_avx_by8) 557