/*
 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is the AES128/192/256 CTR mode optimization implementation. It
 * requires the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in the Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/linkage.h>
#include <asm/inst.h>

#define CONCAT(a,b)	a##b
#define VMOVDQ		vmovdqu

#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8
#define xbyteswap	%xmm9
#define xkey0		%xmm10
#define xkey3		%xmm11
#define xkey6		%xmm12
#define xkey9		%xmm13
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
#define xkeyA		%xmm14
#define xkeyB		%xmm15

#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8

#define tmp		%r10
#define	DDQ(i)		CONCAT(ddq_add_,i)
#define	XMM(i)		CONCAT(%xmm, i)
#define	DDQ_DATA	0
#define	XDATA		1
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3

.section .rodata
.align 16

/* pshufb mask that reverses the byte order of a 128-bit word */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* increments (1..8) added to the byteswapped counter with vpaddd */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text

/* generate a unique variable for ddq_add_x */

.macro setddq n
	var_ddq_add = DDQ(\n)
.endm

/* generate a unique variable for xmm register */
.macro setxdata n
	var_xdata = XMM(\n)
.endm

/* club (concatenate) the numeric 'id' with the symbol 'name' */

.macro club name, id
.altmacro
	.if \name == DDQ_DATA
		setddq %\id
	.elseif \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm

/*
 * do_aes num_in_par load_keys key_len
 * This increments p_in, but not p_out.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	/* block 0 uses the counter as-is; blocks 1..by-1 add 1..by-1 */
	vpshufb	xbyteswap, xcounter, xdata0

	.set i, 1
	.rept (by - 1)
		club DDQ_DATA, i
		club XDATA, i
		vpaddd	var_ddq_add(%rip), xcounter, var_xdata
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i + 1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0
	club DDQ_DATA, by
	vpaddd	var_ddq_add(%rip), xcounter, xcounter

	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i + 1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkeyA
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i + 1)
	.endr

	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkey4
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 3 */
		.set i, (i + 1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkey4, var_xdata, var_xdata		/* key 4 */
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkeyB
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i + 1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 6 */
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkey8
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkeyA
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkey8, var_xdata, var_xdata		/* key 8 */
		.set i, (i + 1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 9 */
		.set i, (i + 1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10 */
		.if (klen == KEY_128)
			vaesenclast xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i + 1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i + 1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i + 1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14 */
				vaesenclast xkeyB, var_xdata, var_xdata
				.set i, (i + 1)
			.endr
		.endif
	.endif

	/*
	 * XOR the keystream with the input; p_in has already been advanced,
	 * hence the negative offsets. xkeyA/xkeyB are reused as scratch here.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i + 1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i + 2)
	.endr

	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	/* store the result */
	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i + 1)
	.endr
.endm

.macro do_aes_load val, key_len
	do_aes	\val, 1, \key_len
.endm

.macro do_aes_noload val, key_len
	do_aes	\val, 0, \key_len
.endm

/* main body of the AES CTR routines */

.macro do_aes_ctrmain key_len

	cmp	$16, num_bytes
	jb	.Ldo_return2\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* tmp is the remainder: 1 to 7 blocks (16 to 112 bytes) */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len
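
/*
 * Remainder handling: each .LeqN stub below encrypts the N leftover blocks
 * with the round keys loaded (do_aes_load), rounds num_bytes down to a
 * multiple of 8 blocks, and joins the 8-block main loop if anything is left.
 */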

.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	/* for 128-bit keys, xkey4/xkey8/xkey12 alias xkey3/xkey6/xkey9 */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a multiple of 8 blocks (8*16 bytes) and > 0 */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
	ret
.endm

/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level.
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

ENDPROC(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level.
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

ENDPROC(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level.
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256

ENDPROC(aes_ctr_enc_256_avx_by8)
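
/*
 * Caller sketch (illustrative only, not part of this file). The wrapper
 * below is hypothetical; in practice these routines are driven from C glue
 * code. 'keys' must point to the expanded AES round keys, num_bytes is
 * expected to be a multiple of the 16-byte block size, and because all XMM
 * registers are clobbered, the call must be bracketed by
 * kernel_fpu_begin()/kernel_fpu_end():
 *
 *	asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
 *			void *keys, u8 *out, unsigned int num_bytes);
 *
 *	static void ctr_crypt_by8_sketch(const u8 *in, u8 *out, u8 *iv,
 *					 void *round_keys, unsigned int nbytes)
 *	{
 *		kernel_fpu_begin();
 *		aes_ctr_enc_128_avx_by8(in, iv, round_keys, out, nbytes);
 *		kernel_fpu_end();
 *	}
 *
 * CTR mode is symmetric, so the same call performs decryption; the updated
 * counter is written back through 'iv' for chained calls.
 */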