xref: /openbmc/linux/arch/x86/crypto/aesni-intel_avx-x86_64.S (revision ca55b2fef3a9373fcfc30f82fd26bc7fccbda732)
1########################################################################
2# Copyright (c) 2013, Intel Corporation
3#
4# This software is available to you under a choice of one of two
5# licenses.  You may choose to be licensed under the terms of the GNU
6# General Public License (GPL) Version 2, available from the file
7# COPYING in the main directory of this source tree, or the
8# OpenIB.org BSD license below:
9#
10# Redistribution and use in source and binary forms, with or without
11# modification, are permitted provided that the following conditions are
12# met:
13#
14# * Redistributions of source code must retain the above copyright
15#   notice, this list of conditions and the following disclaimer.
16#
17# * Redistributions in binary form must reproduce the above copyright
18#   notice, this list of conditions and the following disclaimer in the
19#   documentation and/or other materials provided with the
20#   distribution.
21#
22# * Neither the name of the Intel Corporation nor the names of its
23#   contributors may be used to endorse or promote products derived from
24#   this software without specific prior written permission.
25#
26#
27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38########################################################################
39##
40## Authors:
41##	Erdinc Ozturk <erdinc.ozturk@intel.com>
42##	Vinodh Gopal <vinodh.gopal@intel.com>
43##	James Guilford <james.guilford@intel.com>
44##	Tim Chen <tim.c.chen@linux.intel.com>
45##
46## References:
47##       This code was derived and highly optimized from the code described in the paper:
48##               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
49##			on Intel Architecture Processors. August, 2010
50##       The details of the implementation are explained in:
51##               Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
52##			on Intel Architecture Processors. October, 2012.
53##
54## Assumptions:
55##
56##
57##
58## iv:
59##       0                   1                   2                   3
60##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62##       |                             Salt  (From the SA)               |
63##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64##       |                     Initialization Vector                     |
65##       |         (This is the sequence number from IPSec header)       |
66##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67##       |                              0x1                              |
68##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
69##
70##
71##
72## AAD:
73##       AAD padded to 128 bits with 0
74##       for example, assume AAD is a u32 vector
75##
76##       if AAD is 8 bytes:
77##       AAD[3] = {A0, A1};
78##       padded AAD in xmm register = {A1 A0 0 0}
79##
80##       0                   1                   2                   3
81##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83##       |                               SPI (A1)                        |
84##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85##       |                     32-bit Sequence Number (A0)               |
86##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87##       |                              0x0                              |
88##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89##
90##                                       AAD Format with 32-bit Sequence Number
91##
92##       if AAD is 12 bytes:
93##       AAD[3] = {A0, A1, A2};
94##       padded AAD in xmm register = {A2 A1 A0 0}
95##
96##       0                   1                   2                   3
97##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99##       |                               SPI (A2)                        |
100##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101##       |                 64-bit Extended Sequence Number {A1,A0}       |
102##       |                                                               |
103##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104##       |                              0x0                              |
105##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106##
107##        AAD Format with 64-bit Extended Sequence Number
108##
109##
110## aadLen:
111##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
112##	 The code additionally supports aadLen of length 16 bytes.
113##
114## TLen:
115##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
116##
117## poly = x^128 + x^127 + x^126 + x^121 + 1
118## Throughout the code, one-tab and two-tab indentations are used: one tab is
119## for the GHASH part, two tabs are for the AES part.
120##
121
122#include <linux/linkage.h>
123#include <asm/inst.h>
124
# 128-bit constants (.octa), placed in .data and aligned to 16 bytes.
125.data
126.align 16
127
# POLY has bits 127, 126, 121 and 0 set, matching the reduction polynomial
# x^128 + x^127 + x^126 + x^121 + 1 with the x^128 term left implicit.
# POLY2 and TWOONE are not referenced in the part of the file shown here.
128POLY:            .octa     0xC2000000000000000000000000000001
129POLY2:           .octa     0xC20000000000000000000001C2000000
130TWOONE:          .octa     0x00000001000000000000000000000001
131
132# order of these constants should not change.
133# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
134
# SHUF_MASK: vpshufb control that reverses the 16 bytes of a register
# (result byte k = source byte 15-k); used for every "16Byte swap".
135SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
136SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
137ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
138ZERO:            .octa     0x00000000000000000000000000000000
# ONE: vpaddd operand that adds 1 to the lowest dword (byte-swapped counter).
139ONE:             .octa     0x00000000000000000000000000000001
# ONEf: vpaddd operand that adds 1 to byte 15; used on the path that
# increments the counter without byte-swapping it first.
140ONEf:            .octa     0x01000000000000000000000000000000
141
142.text
143
144
145##define the fields of the gcm aes context
146#{
147#        u8 expanded_keys[16*11] store expanded keys
148#        u8 shifted_hkey_1[16]   store HashKey <<1 mod poly here
149#        u8 shifted_hkey_2[16]   store HashKey^2 <<1 mod poly here
150#        u8 shifted_hkey_3[16]   store HashKey^3 <<1 mod poly here
151#        u8 shifted_hkey_4[16]   store HashKey^4 <<1 mod poly here
152#        u8 shifted_hkey_5[16]   store HashKey^5 <<1 mod poly here
153#        u8 shifted_hkey_6[16]   store HashKey^6 <<1 mod poly here
154#        u8 shifted_hkey_7[16]   store HashKey^7 <<1 mod poly here
155#        u8 shifted_hkey_8[16]   store HashKey^8 <<1 mod poly here
156#        u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
157#        u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
158#        u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
159#        u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
160#        u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
161#        u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
162#        u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
163#        u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
164#} gcm_ctx#
165
# Byte offsets, relative to arg1, of the hash-key table. The first 16*11
# bytes at arg1 are read as AES round keys (16*0 .. 16*10), and the entries
# below follow in consecutive 16-byte slots.
166HashKey        = 16*11   # store HashKey <<1 mod poly here
167HashKey_2      = 16*12   # store HashKey^2 <<1 mod poly here
168HashKey_3      = 16*13   # store HashKey^3 <<1 mod poly here
169HashKey_4      = 16*14   # store HashKey^4 <<1 mod poly here
170HashKey_5      = 16*15   # store HashKey^5 <<1 mod poly here
171HashKey_6      = 16*16   # store HashKey^6 <<1 mod poly here
172HashKey_7      = 16*17   # store HashKey^7 <<1 mod poly here
173HashKey_8      = 16*18   # store HashKey^8 <<1 mod poly here
174HashKey_k      = 16*19   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
175HashKey_2_k    = 16*20   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
176HashKey_3_k    = 16*21   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
177HashKey_4_k    = 16*22   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
178HashKey_5_k    = 16*23   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
179HashKey_6_k    = 16*24   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
180HashKey_7_k    = 16*25   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
181HashKey_8_k    = 16*26   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
182
# Argument aliases. arg1..arg6 name the six integer argument registers in
# System V x86-64 order. arg7..arg9 are memory operands located at
# STACK_OFFSET + 8*k from %r14.
# NOTE(review): %r14 is presumably set to the stack pointer after the
# register pushes by code outside this chunk - confirm.
183#define arg1 %rdi
184#define arg2 %rsi
185#define arg3 %rdx
186#define arg4 %rcx
187#define arg5 %r8
188#define arg6 %r9
189#define arg7 STACK_OFFSET+8*1(%r14)
190#define arg8 STACK_OFFSET+8*2(%r14)
191#define arg9 STACK_OFFSET+8*3(%r14)
192
# Assembler-time symbols (not registers or memory).
# i and j are scratch indices; setreg turns their current values into the
# register aliases reg_i and reg_j.
193i = 0
194j = 0
195
# Values compared against the loop_idx macro argument.
196out_order = 0
197in_order = 1
# Values compared against the ENC_DEC macro argument.
198DEC = 0
199ENC = 1
200
# define_reg r n: define the assembler symbol reg_<r> as an alias of %xmm<n>.
201.macro define_reg r n
202reg_\r = %xmm\n
203.endm
204
# setreg: rebind reg_i and reg_j to the xmm registers whose numbers are the
# current values of the symbols i and j. Must be invoked again after every
# change of i or j for the aliases to follow.
205.macro setreg
206.altmacro
# In altmacro mode %i and %j are evaluated and passed as numeric strings.
207define_reg i %i
208define_reg j %j
209.noaltmacro
210.endm
211
212# need to push 4 registers into stack to maintain STACK_OFFSET:
# 4 pushed registers of 8 bytes each lie between %r14 and the stack
# arguments, which is why arg7..arg9 add STACK_OFFSET to their displacement.
# NOTE(review): the pushes themselves are outside this chunk - confirm.
213STACK_OFFSET = 8*4
214
# Offsets of the 16-byte scratch slots; they are addressed relative to %rsp.
215TMP1 =   16*0    # Temporary storage for AAD
216TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
217TMP3 =   16*2    # Temporary storage for AES State 3
218TMP4 =   16*3    # Temporary storage for AES State 4
219TMP5 =   16*4    # Temporary storage for AES State 5
220TMP6 =   16*5    # Temporary storage for AES State 6
221TMP7 =   16*6    # Temporary storage for AES State 7
222TMP8 =   16*7    # Temporary storage for AES State 8
223
# 8 slots of 16 bytes, i.e. the span covered by TMP1..TMP8.
224VARIABLE_OFFSET = 16*8
225
226################################
227# Utility Macros
228################################
229
230# Encryption of a single block
# XMM0 (in/out): register holding the block; it is encrypted in place.
# The key schedule is read from arg1: round key 0 is XORed in, round keys
# 1..9 feed vaesenc and round key 10 feeds vaesenclast.
# Side effect at assembly time: the symbol i is left at 10 and reg_i/reg_j
# are rebound by setreg, although reg_i is not used inside this macro.
231.macro ENCRYPT_SINGLE_BLOCK XMM0
232                vpxor    (arg1), \XMM0, \XMM0
233		i = 1
234		setreg
235.rep 9
236                vaesenc  16*i(arg1), \XMM0, \XMM0
237		i = (i+1)
238		setreg
239.endr
240                vaesenclast 16*10(arg1), \XMM0, \XMM0
241.endm
242
243#ifdef CONFIG_AS_AVX
244###############################################################################
245# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
246# Input: A and B (128-bits each, bit-reflected)
247# Output: C = A*B*x mod poly, (i.e. >>1 )
248# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
249# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
#
# Operands: GH is the first multiplicand and receives the result, HK is only
# read, and T1-T5 are scratch registers that are overwritten.
# The 128x128 product is built from three carry-less multiplies (Karatsuba)
# and is then reduced in two phases of dword shifts and XORs.
250###############################################################################
251.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
252
253        vpshufd         $0b01001110, \GH, \T2   # T2 = GH with its 64-bit halves swapped
254        vpshufd         $0b01001110, \HK, \T3   # T3 = HK with its 64-bit halves swapped
255        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
256        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
257
258        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
259        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
260        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
261        vpxor           \GH, \T2,\T2
262        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
263
264        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
265        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
266        vpxor           \T3, \GH, \GH
267        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
268
269        #first phase of the reduction
270        vpslld  $31, \GH, \T2                   # packed left shift of each dword by 31
271        vpslld  $30, \GH, \T3                   # packed left shift of each dword by 30
272        vpslld  $25, \GH, \T4                   # packed left shift of each dword by 25
273
274        vpxor   \T3, \T2, \T2                   # xor the shifted versions
275        vpxor   \T4, \T2, \T2
276
277        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
278
279        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
280        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
281
282        #second phase of the reduction
283
284        vpsrld  $1,\GH, \T2                     # packed right shift of each dword by 1
285        vpsrld  $2,\GH, \T3                     # packed right shift of each dword by 2
286        vpsrld  $7,\GH, \T4                     # packed right shift of each dword by 7
287        vpxor   \T3, \T2, \T2                   # xor the shifted versions
288        vpxor   \T4, \T2, \T2
289
290        vpxor   \T5, \T2, \T2
291        vpxor   \T2, \GH, \GH
292        vpxor   \T1, \GH, \GH                   # the result is in GH
293
294
295.endm
296
# PRECOMPUTE_AVX: starting from HK, repeatedly multiply by HK with
# GHASH_MUL_AVX and store the results at HashKey_2 .. HashKey_8 of the
# context at arg1. For HK and for each stored power, the XOR of its two
# 64-bit halves is stored at HashKey_k .. HashKey_8_k.
# HK is only read; T1-T6 are overwritten. HashKey(arg1) is not written here.
# All stores use vmovdqa, so the destination addresses must be 16-byte aligned.
297.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
298
299        # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
300        vmovdqa  \HK, \T5
301
302        vpshufd  $0b01001110, \T5, \T1
303        vpxor    \T5, \T1, \T1
304        vmovdqa  \T1, HashKey_k(arg1)
305
306        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
307        vmovdqa  \T5, HashKey_2(arg1)                    #  [HashKey_2] = HashKey^2<<1 mod poly
308        vpshufd  $0b01001110, \T5, \T1
309        vpxor    \T5, \T1, \T1
310        vmovdqa  \T1, HashKey_2_k(arg1)
311
312        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
313        vmovdqa  \T5, HashKey_3(arg1)
314        vpshufd  $0b01001110, \T5, \T1
315        vpxor    \T5, \T1, \T1
316        vmovdqa  \T1, HashKey_3_k(arg1)
317
318        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
319        vmovdqa  \T5, HashKey_4(arg1)
320        vpshufd  $0b01001110, \T5, \T1
321        vpxor    \T5, \T1, \T1
322        vmovdqa  \T1, HashKey_4_k(arg1)
323
324        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
325        vmovdqa  \T5, HashKey_5(arg1)
326        vpshufd  $0b01001110, \T5, \T1
327        vpxor    \T5, \T1, \T1
328        vmovdqa  \T1, HashKey_5_k(arg1)
329
330        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
331        vmovdqa  \T5, HashKey_6(arg1)
332        vpshufd  $0b01001110, \T5, \T1
333        vpxor    \T5, \T1, \T1
334        vmovdqa  \T1, HashKey_6_k(arg1)
335
336        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
337        vmovdqa  \T5, HashKey_7(arg1)
338        vpshufd  $0b01001110, \T5, \T1
339        vpxor    \T5, \T1, \T1
340        vmovdqa  \T1, HashKey_7_k(arg1)
341
342        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
343        vmovdqa  \T5, HashKey_8(arg1)
344        vpshufd  $0b01001110, \T5, \T1
345        vpxor    \T5, \T1, \T1
346        vmovdqa  \T1, HashKey_8_k(arg1)
347
348.endm
349
350## if a = number of total plaintext bytes
351## b = floor(a/16)
352## num_initial_blocks = b mod 4
## NOTE(review): "mod 4" looks stale. The register indexing used below
## (8 - num_initial_blocks and 9 - num_initial_blocks) fits values 0..7 -
## confirm against the caller, which is outside this chunk.
353## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
354## r10, r11, r12, rax are clobbered
355## arg1, arg2, arg3, r14 are used as a pointer only, not modified
##
## Register arguments as used below:
##   T2      is passed to GHASH_MUL_AVX as the HK operand and is only read
##   T1, T3-T6, T_key are scratch; T3 holds a copy of the hash state on exit
##   CTR     receives the byte-swapped counter loaded through arg5 and is
##           incremented once per block processed
##   XMM1-XMM8 are overwritten (see the 8-block section near the end)
## r13 is compared against 128 but is never written by this macro.
## On exit r11 is the byte offset of the next unprocessed input block.
356
357.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
358	i = (8-\num_initial_blocks)
359	setreg
360
361        mov     arg6, %r10                      # r10 = AAD
362        mov     arg7, %r12                      # r12 = aadLen
363
364
365        mov     %r12, %r11                      # r11 = aadLen, kept for the padding loop
366
367        vpxor   reg_i, reg_i, reg_i
        # Gather the AAD one dword at a time: each new dword enters at the top
        # of reg_i while the dwords already gathered move down by 4 bytes.
        # NOTE(review): whole dwords are loaded and the body runs at least
        # once, so aadLen is presumably a non-zero multiple of 4 (the file
        # header lists 8, 12 and 16) - confirm.
368_get_AAD_loop\@:
369        vmovd   (%r10), \T1
370        vpslldq $12, \T1, \T1
371        vpsrldq $4, reg_i, reg_i
372        vpxor   \T1, reg_i, reg_i
373
374        add     $4, %r10
375        sub     $4, %r12
376        jg      _get_AAD_loop\@
377
378
379        cmp     $16, %r11
380        je      _get_AAD_loop2_done\@
381        mov     $16, %r12
382
        # aadLen != 16: shift right by 4 bytes once per missing dword, so the
        # AAD ends up in the low bytes with zeros above it.
383_get_AAD_loop2\@:
384        vpsrldq $4, reg_i, reg_i
385        sub     $4, %r12
386        cmp     %r11, %r12
387        jg      _get_AAD_loop2\@
388
389_get_AAD_loop2_done\@:
390
391        #byte-reflect the AAD data
392        vpshufb SHUF_MASK(%rip), reg_i, reg_i
393
394	# initialize the data pointer offset as zero
395	xor     %r11, %r11
396
397	# start AES for num_initial_blocks blocks
398	mov     arg5, %rax                     # rax = *Y0
399	vmovdqu (%rax), \CTR                   # CTR = Y0
400	vpshufb SHUF_MASK(%rip), \CTR, \CTR
401
402
	# Build one counter block per initial block in xmm(9-n) .. xmm8.
403	i = (9-\num_initial_blocks)
404	setreg
405.rep \num_initial_blocks
406                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
407                vmovdqa \CTR, reg_i
408                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
409	i = (i+1)
410	setreg
411.endr
412
	# AES round 0: XOR every counter block with round key 0.
413	vmovdqa  (arg1), \T_key
414	i = (9-\num_initial_blocks)
415	setreg
416.rep \num_initial_blocks
417                vpxor   \T_key, reg_i, reg_i
418	i = (i+1)
419	setreg
420.endr
421
	# AES rounds 1..9: j selects the round key, i walks the blocks.
422	j = 1
423	setreg
424.rep 9
425	vmovdqa  16*j(arg1), \T_key
426	i = (9-\num_initial_blocks)
427	setreg
428.rep \num_initial_blocks
429        vaesenc \T_key, reg_i, reg_i
430	i = (i+1)
431	setreg
432.endr
433
434	j = (j+1)
435	setreg
436.endr
437
438
	# Final AES round with round key 10.
439	vmovdqa  16*10(arg1), \T_key
440	i = (9-\num_initial_blocks)
441	setreg
442.rep \num_initial_blocks
443        vaesenclast      \T_key, reg_i, reg_i
444	i = (i+1)
445	setreg
446.endr
447
	# XOR the keystream with the input at arg3 and store the result at arg2.
	# For DEC the input block (the ciphertext) replaces the register
	# contents, so the hash is always computed over ciphertext.
448	i = (9-\num_initial_blocks)
449	setreg
450.rep \num_initial_blocks
451                vmovdqu (arg3, %r11), \T1
452                vpxor   \T1, reg_i, reg_i
453                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for num_initial_blocks blocks
454                add     $16, %r11
455.if  \ENC_DEC == DEC
456                vmovdqa \T1, reg_i
457.endif
458                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
459	i = (i+1)
460	setreg
461.endr
462
463
	# reg_i = AAD block, reg_j = first ciphertext block.
464	i = (8-\num_initial_blocks)
465	j = (9-\num_initial_blocks)
466	setreg
467        GHASH_MUL_AVX       reg_i, \T2, \T1, \T3, \T4, \T5, \T6
468
	# Chain the hash: fold the previous result into the next block, multiply.
469.rep \num_initial_blocks
470        vpxor    reg_i, reg_j, reg_j
471        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
472	i = (i+1)
473	j = (j+1)
474	setreg
475.endr
476        # XMM8 has the combined result here
477
478        vmovdqa  \XMM8, TMP1(%rsp)
479        vmovdqa  \XMM8, \T3
480
481        cmp     $128, %r13
482        jl      _initial_blocks_done\@                  # no need for precomputed constants
483
484###############################################################################
485# r13 >= 128 here: process 8 more blocks. Their byte-swapped ciphertext is
# left in XMM1-XMM8 with the running hash XORed into XMM1; these blocks are
# not multiplied by the hash key inside this macro.
486                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
487                vmovdqa  \CTR, \XMM1
488                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
489
490                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
491                vmovdqa  \CTR, \XMM2
492                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
493
494                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
495                vmovdqa  \CTR, \XMM3
496                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
497
498                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
499                vmovdqa  \CTR, \XMM4
500                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
501
502                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
503                vmovdqa  \CTR, \XMM5
504                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
505
506                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
507                vmovdqa  \CTR, \XMM6
508                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
509
510                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
511                vmovdqa  \CTR, \XMM7
512                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
513
514                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
515                vmovdqa  \CTR, \XMM8
516                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
517
                # AES round 0 for the 8 blocks
518                vmovdqa  (arg1), \T_key
519                vpxor    \T_key, \XMM1, \XMM1
520                vpxor    \T_key, \XMM2, \XMM2
521                vpxor    \T_key, \XMM3, \XMM3
522                vpxor    \T_key, \XMM4, \XMM4
523                vpxor    \T_key, \XMM5, \XMM5
524                vpxor    \T_key, \XMM6, \XMM6
525                vpxor    \T_key, \XMM7, \XMM7
526                vpxor    \T_key, \XMM8, \XMM8
527
528		i = 1
529		setreg
530.rep    9       # do 9 rounds
531                vmovdqa  16*i(arg1), \T_key
532                vaesenc  \T_key, \XMM1, \XMM1
533                vaesenc  \T_key, \XMM2, \XMM2
534                vaesenc  \T_key, \XMM3, \XMM3
535                vaesenc  \T_key, \XMM4, \XMM4
536                vaesenc  \T_key, \XMM5, \XMM5
537                vaesenc  \T_key, \XMM6, \XMM6
538                vaesenc  \T_key, \XMM7, \XMM7
539                vaesenc  \T_key, \XMM8, \XMM8
540		i = (i+1)
541		setreg
542.endr
543
544
                # i is 10 after the loop, so this loads the last round key
545                vmovdqa  16*i(arg1), \T_key
546                vaesenclast  \T_key, \XMM1, \XMM1
547                vaesenclast  \T_key, \XMM2, \XMM2
548                vaesenclast  \T_key, \XMM3, \XMM3
549                vaesenclast  \T_key, \XMM4, \XMM4
550                vaesenclast  \T_key, \XMM5, \XMM5
551                vaesenclast  \T_key, \XMM6, \XMM6
552                vaesenclast  \T_key, \XMM7, \XMM7
553                vaesenclast  \T_key, \XMM8, \XMM8
554
                # XOR with the input, store the output, and for DEC keep the
                # input ciphertext in the register for hashing
555                vmovdqu  (arg3, %r11), \T1
556                vpxor    \T1, \XMM1, \XMM1
557                vmovdqu  \XMM1, (arg2 , %r11)
558                .if   \ENC_DEC == DEC
559                vmovdqa  \T1, \XMM1
560                .endif
561
562                vmovdqu  16*1(arg3, %r11), \T1
563                vpxor    \T1, \XMM2, \XMM2
564                vmovdqu  \XMM2, 16*1(arg2 , %r11)
565                .if   \ENC_DEC == DEC
566                vmovdqa  \T1, \XMM2
567                .endif
568
569                vmovdqu  16*2(arg3, %r11), \T1
570                vpxor    \T1, \XMM3, \XMM3
571                vmovdqu  \XMM3, 16*2(arg2 , %r11)
572                .if   \ENC_DEC == DEC
573                vmovdqa  \T1, \XMM3
574                .endif
575
576                vmovdqu  16*3(arg3, %r11), \T1
577                vpxor    \T1, \XMM4, \XMM4
578                vmovdqu  \XMM4, 16*3(arg2 , %r11)
579                .if   \ENC_DEC == DEC
580                vmovdqa  \T1, \XMM4
581                .endif
582
583                vmovdqu  16*4(arg3, %r11), \T1
584                vpxor    \T1, \XMM5, \XMM5
585                vmovdqu  \XMM5, 16*4(arg2 , %r11)
586                .if   \ENC_DEC == DEC
587                vmovdqa  \T1, \XMM5
588                .endif
589
590                vmovdqu  16*5(arg3, %r11), \T1
591                vpxor    \T1, \XMM6, \XMM6
592                vmovdqu  \XMM6, 16*5(arg2 , %r11)
593                .if   \ENC_DEC == DEC
594                vmovdqa  \T1, \XMM6
595                .endif
596
597                vmovdqu  16*6(arg3, %r11), \T1
598                vpxor    \T1, \XMM7, \XMM7
599                vmovdqu  \XMM7, 16*6(arg2 , %r11)
600                .if   \ENC_DEC == DEC
601                vmovdqa  \T1, \XMM7
602                .endif
603
604                vmovdqu  16*7(arg3, %r11), \T1
605                vpxor    \T1, \XMM8, \XMM8
606                vmovdqu  \XMM8, 16*7(arg2 , %r11)
607                .if   \ENC_DEC == DEC
608                vmovdqa  \T1, \XMM8
609                .endif
610
611                add     $128, %r11
612
613                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
614                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
615                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
616                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
617                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
618                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
619                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
620                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
621                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
622
623###############################################################################
624
625_initial_blocks_done\@:
626
627.endm
628
629# encrypt 8 blocks at a time
630# ghash the 8 previously encrypted ciphertext blocks
631# arg1, arg2, arg3 are used as pointers only, not modified
632# r11 is the data offset value
633.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
634
635        vmovdqa \XMM1, \T2
636        vmovdqa \XMM2, TMP2(%rsp)
637        vmovdqa \XMM3, TMP3(%rsp)
638        vmovdqa \XMM4, TMP4(%rsp)
639        vmovdqa \XMM5, TMP5(%rsp)
640        vmovdqa \XMM6, TMP6(%rsp)
641        vmovdqa \XMM7, TMP7(%rsp)
642        vmovdqa \XMM8, TMP8(%rsp)
643
644.if \loop_idx == in_order
645                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
646                vpaddd  ONE(%rip), \XMM1, \XMM2
647                vpaddd  ONE(%rip), \XMM2, \XMM3
648                vpaddd  ONE(%rip), \XMM3, \XMM4
649                vpaddd  ONE(%rip), \XMM4, \XMM5
650                vpaddd  ONE(%rip), \XMM5, \XMM6
651                vpaddd  ONE(%rip), \XMM6, \XMM7
652                vpaddd  ONE(%rip), \XMM7, \XMM8
653                vmovdqa \XMM8, \CTR
654
655                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
656                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
657                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
658                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
659                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
660                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
661                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
662                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
663.else
664                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
665                vpaddd  ONEf(%rip), \XMM1, \XMM2
666                vpaddd  ONEf(%rip), \XMM2, \XMM3
667                vpaddd  ONEf(%rip), \XMM3, \XMM4
668                vpaddd  ONEf(%rip), \XMM4, \XMM5
669                vpaddd  ONEf(%rip), \XMM5, \XMM6
670                vpaddd  ONEf(%rip), \XMM6, \XMM7
671                vpaddd  ONEf(%rip), \XMM7, \XMM8
672                vmovdqa \XMM8, \CTR
673.endif
674
675
676        #######################################################################
677
678                vmovdqu (arg1), \T1
679                vpxor   \T1, \XMM1, \XMM1
680                vpxor   \T1, \XMM2, \XMM2
681                vpxor   \T1, \XMM3, \XMM3
682                vpxor   \T1, \XMM4, \XMM4
683                vpxor   \T1, \XMM5, \XMM5
684                vpxor   \T1, \XMM6, \XMM6
685                vpxor   \T1, \XMM7, \XMM7
686                vpxor   \T1, \XMM8, \XMM8
687
688        #######################################################################
689
690
691
692
693
694                vmovdqu 16*1(arg1), \T1
695                vaesenc \T1, \XMM1, \XMM1
696                vaesenc \T1, \XMM2, \XMM2
697                vaesenc \T1, \XMM3, \XMM3
698                vaesenc \T1, \XMM4, \XMM4
699                vaesenc \T1, \XMM5, \XMM5
700                vaesenc \T1, \XMM6, \XMM6
701                vaesenc \T1, \XMM7, \XMM7
702                vaesenc \T1, \XMM8, \XMM8
703
704                vmovdqu 16*2(arg1), \T1
705                vaesenc \T1, \XMM1, \XMM1
706                vaesenc \T1, \XMM2, \XMM2
707                vaesenc \T1, \XMM3, \XMM3
708                vaesenc \T1, \XMM4, \XMM4
709                vaesenc \T1, \XMM5, \XMM5
710                vaesenc \T1, \XMM6, \XMM6
711                vaesenc \T1, \XMM7, \XMM7
712                vaesenc \T1, \XMM8, \XMM8
713
714
715        #######################################################################
716
717        vmovdqa         HashKey_8(arg1), \T5
718        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
719        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
720
721        vpshufd         $0b01001110, \T2, \T6
722        vpxor           \T2, \T6, \T6
723
724        vmovdqa         HashKey_8_k(arg1), \T5
725        vpclmulqdq      $0x00, \T5, \T6, \T6
726
727                vmovdqu 16*3(arg1), \T1
728                vaesenc \T1, \XMM1, \XMM1
729                vaesenc \T1, \XMM2, \XMM2
730                vaesenc \T1, \XMM3, \XMM3
731                vaesenc \T1, \XMM4, \XMM4
732                vaesenc \T1, \XMM5, \XMM5
733                vaesenc \T1, \XMM6, \XMM6
734                vaesenc \T1, \XMM7, \XMM7
735                vaesenc \T1, \XMM8, \XMM8
736
737        vmovdqa         TMP2(%rsp), \T1
738        vmovdqa         HashKey_7(arg1), \T5
739        vpclmulqdq      $0x11, \T5, \T1, \T3
740        vpxor           \T3, \T4, \T4
741        vpclmulqdq      $0x00, \T5, \T1, \T3
742        vpxor           \T3, \T7, \T7
743
744        vpshufd         $0b01001110, \T1, \T3
745        vpxor           \T1, \T3, \T3
746        vmovdqa         HashKey_7_k(arg1), \T5
747        vpclmulqdq      $0x10, \T5, \T3, \T3
748        vpxor           \T3, \T6, \T6
749
750                vmovdqu 16*4(arg1), \T1
751                vaesenc \T1, \XMM1, \XMM1
752                vaesenc \T1, \XMM2, \XMM2
753                vaesenc \T1, \XMM3, \XMM3
754                vaesenc \T1, \XMM4, \XMM4
755                vaesenc \T1, \XMM5, \XMM5
756                vaesenc \T1, \XMM6, \XMM6
757                vaesenc \T1, \XMM7, \XMM7
758                vaesenc \T1, \XMM8, \XMM8
759
760        #######################################################################
761
762        vmovdqa         TMP3(%rsp), \T1
763        vmovdqa         HashKey_6(arg1), \T5
764        vpclmulqdq      $0x11, \T5, \T1, \T3
765        vpxor           \T3, \T4, \T4
766        vpclmulqdq      $0x00, \T5, \T1, \T3
767        vpxor           \T3, \T7, \T7
768
769        vpshufd         $0b01001110, \T1, \T3
770        vpxor           \T1, \T3, \T3
771        vmovdqa         HashKey_6_k(arg1), \T5
772        vpclmulqdq      $0x10, \T5, \T3, \T3
773        vpxor           \T3, \T6, \T6
774
775                vmovdqu 16*5(arg1), \T1
776                vaesenc \T1, \XMM1, \XMM1
777                vaesenc \T1, \XMM2, \XMM2
778                vaesenc \T1, \XMM3, \XMM3
779                vaesenc \T1, \XMM4, \XMM4
780                vaesenc \T1, \XMM5, \XMM5
781                vaesenc \T1, \XMM6, \XMM6
782                vaesenc \T1, \XMM7, \XMM7
783                vaesenc \T1, \XMM8, \XMM8
784
785        vmovdqa         TMP4(%rsp), \T1
786        vmovdqa         HashKey_5(arg1), \T5
787        vpclmulqdq      $0x11, \T5, \T1, \T3
788        vpxor           \T3, \T4, \T4
789        vpclmulqdq      $0x00, \T5, \T1, \T3
790        vpxor           \T3, \T7, \T7
791
792        vpshufd         $0b01001110, \T1, \T3
793        vpxor           \T1, \T3, \T3
794        vmovdqa         HashKey_5_k(arg1), \T5
795        vpclmulqdq      $0x10, \T5, \T3, \T3
796        vpxor           \T3, \T6, \T6
797
798                vmovdqu 16*6(arg1), \T1
799                vaesenc \T1, \XMM1, \XMM1
800                vaesenc \T1, \XMM2, \XMM2
801                vaesenc \T1, \XMM3, \XMM3
802                vaesenc \T1, \XMM4, \XMM4
803                vaesenc \T1, \XMM5, \XMM5
804                vaesenc \T1, \XMM6, \XMM6
805                vaesenc \T1, \XMM7, \XMM7
806                vaesenc \T1, \XMM8, \XMM8
807
808
809        vmovdqa         TMP5(%rsp), \T1
810        vmovdqa         HashKey_4(arg1), \T5
811        vpclmulqdq      $0x11, \T5, \T1, \T3
812        vpxor           \T3, \T4, \T4
813        vpclmulqdq      $0x00, \T5, \T1, \T3
814        vpxor           \T3, \T7, \T7
815
816        vpshufd         $0b01001110, \T1, \T3
817        vpxor           \T1, \T3, \T3
818        vmovdqa         HashKey_4_k(arg1), \T5
819        vpclmulqdq      $0x10, \T5, \T3, \T3
820        vpxor           \T3, \T6, \T6
821
822                vmovdqu 16*7(arg1), \T1
823                vaesenc \T1, \XMM1, \XMM1
824                vaesenc \T1, \XMM2, \XMM2
825                vaesenc \T1, \XMM3, \XMM3
826                vaesenc \T1, \XMM4, \XMM4
827                vaesenc \T1, \XMM5, \XMM5
828                vaesenc \T1, \XMM6, \XMM6
829                vaesenc \T1, \XMM7, \XMM7
830                vaesenc \T1, \XMM8, \XMM8
831
832        vmovdqa         TMP6(%rsp), \T1
833        vmovdqa         HashKey_3(arg1), \T5
834        vpclmulqdq      $0x11, \T5, \T1, \T3
835        vpxor           \T3, \T4, \T4
836        vpclmulqdq      $0x00, \T5, \T1, \T3
837        vpxor           \T3, \T7, \T7
838
839        vpshufd         $0b01001110, \T1, \T3
840        vpxor           \T1, \T3, \T3
841        vmovdqa         HashKey_3_k(arg1), \T5
842        vpclmulqdq      $0x10, \T5, \T3, \T3
843        vpxor           \T3, \T6, \T6
844
845
846                vmovdqu 16*8(arg1), \T1
847                vaesenc \T1, \XMM1, \XMM1
848                vaesenc \T1, \XMM2, \XMM2
849                vaesenc \T1, \XMM3, \XMM3
850                vaesenc \T1, \XMM4, \XMM4
851                vaesenc \T1, \XMM5, \XMM5
852                vaesenc \T1, \XMM6, \XMM6
853                vaesenc \T1, \XMM7, \XMM7
854                vaesenc \T1, \XMM8, \XMM8
855
856        vmovdqa         TMP7(%rsp), \T1
857        vmovdqa         HashKey_2(arg1), \T5
858        vpclmulqdq      $0x11, \T5, \T1, \T3
859        vpxor           \T3, \T4, \T4
860        vpclmulqdq      $0x00, \T5, \T1, \T3
861        vpxor           \T3, \T7, \T7
862
863        vpshufd         $0b01001110, \T1, \T3
864        vpxor           \T1, \T3, \T3
865        vmovdqa         HashKey_2_k(arg1), \T5
866        vpclmulqdq      $0x10, \T5, \T3, \T3
867        vpxor           \T3, \T6, \T6
868
869        #######################################################################
870
871                vmovdqu 16*9(arg1), \T5
872                vaesenc \T5, \XMM1, \XMM1
873                vaesenc \T5, \XMM2, \XMM2
874                vaesenc \T5, \XMM3, \XMM3
875                vaesenc \T5, \XMM4, \XMM4
876                vaesenc \T5, \XMM5, \XMM5
877                vaesenc \T5, \XMM6, \XMM6
878                vaesenc \T5, \XMM7, \XMM7
879                vaesenc \T5, \XMM8, \XMM8
880
881        vmovdqa         TMP8(%rsp), \T1
882        vmovdqa         HashKey(arg1), \T5
883        vpclmulqdq      $0x11, \T5, \T1, \T3
884        vpxor           \T3, \T4, \T4
885        vpclmulqdq      $0x00, \T5, \T1, \T3
886        vpxor           \T3, \T7, \T7
887
888        vpshufd         $0b01001110, \T1, \T3
889        vpxor           \T1, \T3, \T3
890        vmovdqa         HashKey_k(arg1), \T5
891        vpclmulqdq      $0x10, \T5, \T3, \T3
892        vpxor           \T3, \T6, \T6
893
894        vpxor           \T4, \T6, \T6
895        vpxor           \T7, \T6, \T6
896
897                vmovdqu 16*10(arg1), \T5
898
899	i = 0
900	j = 1
901	setreg
902.rep 8
903		vpxor	16*i(arg3, %r11), \T5, \T2
904                .if \ENC_DEC == ENC
905                vaesenclast     \T2, reg_j, reg_j
906                .else
907                vaesenclast     \T2, reg_j, \T3
908                vmovdqu 16*i(arg3, %r11), reg_j
909                vmovdqu \T3, 16*i(arg2, %r11)
910                .endif
911	i = (i+1)
912	j = (j+1)
913	setreg
914.endr
915	#######################################################################
916
917
918	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
919	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
920	vpxor	\T3, \T7, \T7
921	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
922
923
924
925	#######################################################################
926	#first phase of the reduction
927	#######################################################################
928        vpslld  $31, \T7, \T2                           # packed right shifting << 31
929        vpslld  $30, \T7, \T3                           # packed right shifting shift << 30
930        vpslld  $25, \T7, \T4                           # packed right shifting shift << 25
931
932        vpxor   \T3, \T2, \T2                           # xor the shifted versions
933        vpxor   \T4, \T2, \T2
934
935        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
936
937        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
938        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
939	#######################################################################
940                .if \ENC_DEC == ENC
941		vmovdqu	 \XMM1,	16*0(arg2,%r11)		# Write to the Ciphertext buffer
942		vmovdqu	 \XMM2,	16*1(arg2,%r11)		# Write to the Ciphertext buffer
943		vmovdqu	 \XMM3,	16*2(arg2,%r11)		# Write to the Ciphertext buffer
944		vmovdqu	 \XMM4,	16*3(arg2,%r11)		# Write to the Ciphertext buffer
945		vmovdqu	 \XMM5,	16*4(arg2,%r11)		# Write to the Ciphertext buffer
946		vmovdqu	 \XMM6,	16*5(arg2,%r11)		# Write to the Ciphertext buffer
947		vmovdqu	 \XMM7,	16*6(arg2,%r11)		# Write to the Ciphertext buffer
948		vmovdqu	 \XMM8,	16*7(arg2,%r11)		# Write to the Ciphertext buffer
949                .endif
950
951	#######################################################################
952	#second phase of the reduction
953        vpsrld  $1, \T7, \T2                            # packed left shifting >> 1
954        vpsrld  $2, \T7, \T3                            # packed left shifting >> 2
955        vpsrld  $7, \T7, \T4                            # packed left shifting >> 7
956        vpxor   \T3, \T2, \T2                           # xor the shifted versions
957        vpxor   \T4, \T2, \T2
958
959        vpxor   \T1, \T2, \T2
960        vpxor   \T2, \T7, \T7
961        vpxor   \T7, \T6, \T6                           # the result is in T6
962	#######################################################################
963
964		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
965		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
966		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
967		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
968		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
969		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
970		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
971		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
972
973
974	vpxor	\T6, \XMM1, \XMM1
975
976
977
978.endm
979
980
981# GHASH the last 8 ciphertext blocks (XMM1..XMM8); the reduced result is left in T6.
982.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
983
984        ## Karatsuba Method
985        ## T6 collects the high*high products, T7 the low*low products and XMM1
986        ## (overwritten after its own products are taken) the (hi^lo) middle products
987        vpshufd         $0b01001110, \XMM1, \T2        # T2 = XMM1 with its two qwords swapped
988        vpxor           \XMM1, \T2, \T2                # both qwords of T2 = hi(XMM1) ^ lo(XMM1)
989        vmovdqa         HashKey_8(arg1), \T5
990        vpclmulqdq      $0x11, \T5, \XMM1, \T6         # T6 = hi(XMM1) * hi(HashKey_8)
991        vpclmulqdq      $0x00, \T5, \XMM1, \T7         # T7 = lo(XMM1) * lo(HashKey_8)
992
993        vmovdqa         HashKey_8_k(arg1), \T3
994        vpclmulqdq      $0x00, \T3, \T2, \XMM1         # XMM1 = lo(T2) * lo(HashKey_8_k)
995
996        ###################### XMM2 with HashKey_7 / HashKey_7_k
997
998        vpshufd         $0b01001110, \XMM2, \T2
999        vpxor           \XMM2, \T2, \T2
1000        vmovdqa         HashKey_7(arg1), \T5
1001        vpclmulqdq      $0x11, \T5, \XMM2, \T4
1002        vpxor           \T4, \T6, \T6
1003
1004        vpclmulqdq      $0x00, \T5, \XMM2, \T4
1005        vpxor           \T4, \T7, \T7
1006
1007        vmovdqa         HashKey_7_k(arg1), \T3
1008        vpclmulqdq      $0x00, \T3, \T2, \T2
1009        vpxor           \T2, \XMM1, \XMM1
1010
1011        ###################### XMM3 with HashKey_6 / HashKey_6_k
1012
1013        vpshufd         $0b01001110, \XMM3, \T2
1014        vpxor           \XMM3, \T2, \T2
1015        vmovdqa         HashKey_6(arg1), \T5
1016        vpclmulqdq      $0x11, \T5, \XMM3, \T4
1017        vpxor           \T4, \T6, \T6
1018
1019        vpclmulqdq      $0x00, \T5, \XMM3, \T4
1020        vpxor           \T4, \T7, \T7
1021
1022        vmovdqa         HashKey_6_k(arg1), \T3
1023        vpclmulqdq      $0x00, \T3, \T2, \T2
1024        vpxor           \T2, \XMM1, \XMM1
1025
1026        ###################### XMM4 with HashKey_5 / HashKey_5_k
1027
1028        vpshufd         $0b01001110, \XMM4, \T2
1029        vpxor           \XMM4, \T2, \T2
1030        vmovdqa         HashKey_5(arg1), \T5
1031        vpclmulqdq      $0x11, \T5, \XMM4, \T4
1032        vpxor           \T4, \T6, \T6
1033
1034        vpclmulqdq      $0x00, \T5, \XMM4, \T4
1035        vpxor           \T4, \T7, \T7
1036
1037        vmovdqa         HashKey_5_k(arg1), \T3
1038        vpclmulqdq      $0x00, \T3, \T2, \T2
1039        vpxor           \T2, \XMM1, \XMM1
1040
1041        ###################### XMM5 with HashKey_4 / HashKey_4_k
1042
1043        vpshufd         $0b01001110, \XMM5, \T2
1044        vpxor           \XMM5, \T2, \T2
1045        vmovdqa         HashKey_4(arg1), \T5
1046        vpclmulqdq      $0x11, \T5, \XMM5, \T4
1047        vpxor           \T4, \T6, \T6
1048
1049        vpclmulqdq      $0x00, \T5, \XMM5, \T4
1050        vpxor           \T4, \T7, \T7
1051
1052        vmovdqa         HashKey_4_k(arg1), \T3
1053        vpclmulqdq      $0x00, \T3, \T2, \T2
1054        vpxor           \T2, \XMM1, \XMM1
1055
1056        ###################### XMM6 with HashKey_3 / HashKey_3_k
1057
1058        vpshufd         $0b01001110, \XMM6, \T2
1059        vpxor           \XMM6, \T2, \T2
1060        vmovdqa         HashKey_3(arg1), \T5
1061        vpclmulqdq      $0x11, \T5, \XMM6, \T4
1062        vpxor           \T4, \T6, \T6
1063
1064        vpclmulqdq      $0x00, \T5, \XMM6, \T4
1065        vpxor           \T4, \T7, \T7
1066
1067        vmovdqa         HashKey_3_k(arg1), \T3
1068        vpclmulqdq      $0x00, \T3, \T2, \T2
1069        vpxor           \T2, \XMM1, \XMM1
1070
1071        ###################### XMM7 with HashKey_2 / HashKey_2_k
1072
1073        vpshufd         $0b01001110, \XMM7, \T2
1074        vpxor           \XMM7, \T2, \T2
1075        vmovdqa         HashKey_2(arg1), \T5
1076        vpclmulqdq      $0x11, \T5, \XMM7, \T4
1077        vpxor           \T4, \T6, \T6
1078
1079        vpclmulqdq      $0x00, \T5, \XMM7, \T4
1080        vpxor           \T4, \T7, \T7
1081
1082        vmovdqa         HashKey_2_k(arg1), \T3
1083        vpclmulqdq      $0x00, \T3, \T2, \T2
1084        vpxor           \T2, \XMM1, \XMM1
1085
1086        ###################### XMM8 with HashKey / HashKey_k (T5 ends up holding HashKey)
1087
1088        vpshufd         $0b01001110, \XMM8, \T2
1089        vpxor           \XMM8, \T2, \T2
1090        vmovdqa         HashKey(arg1), \T5
1091        vpclmulqdq      $0x11, \T5, \XMM8, \T4
1092        vpxor           \T4, \T6, \T6
1093
1094        vpclmulqdq      $0x00, \T5, \XMM8, \T4
1095        vpxor           \T4, \T7, \T7
1096
1097        vmovdqa         HashKey_k(arg1), \T3
1098        vpclmulqdq      $0x00, \T3, \T2, \T2
1099        # finish Karatsuba: middle term = (sum of hi^lo products) ^ T6 ^ T7
1100        vpxor           \T2, \XMM1, \XMM1
1101        vpxor           \T6, \XMM1, \XMM1
1102        vpxor           \T7, \XMM1, \T2                # T2 = middle term
1103
1104        # split the middle term across the 256-bit product <T6:T7>:
1105        # its low 8 bytes are folded into the upper half of T7 and
1106        # its high 8 bytes into the lower half of T6
1107        vpslldq $8, \T2, \T4
1108        vpsrldq $8, \T2, \T2
1109
1110        vpxor   \T4, \T7, \T7
1111        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
1112				# the accumulated carry-less multiplications
1113
1114        #######################################################################
1115        #first phase of the reduction
1116        vpslld  $31, \T7, \T2   # T2 = each dword of T7 << 31
1117        vpslld  $30, \T7, \T3   # T3 = each dword of T7 << 30
1118        vpslld  $25, \T7, \T4   # T4 = each dword of T7 << 25
1119
1120        vpxor   \T3, \T2, \T2   # xor the shifted versions
1121        vpxor   \T4, \T2, \T2
1122
1123        vpsrldq $4, \T2, \T1    # T1 = T2 shifted right by 1 DW (consumed in the second phase)
1124
1125        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
1126        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
1127        #######################################################################
1128
1129
1130        #second phase of the reduction
1131        vpsrld  $1, \T7, \T2    # T2 = each dword of T7 >> 1
1132        vpsrld  $2, \T7, \T3    # T3 = each dword of T7 >> 2
1133        vpsrld  $7, \T7, \T4    # T4 = each dword of T7 >> 7
1134        vpxor   \T3, \T2, \T2   # xor the shifted versions
1135        vpxor   \T4, \T2, \T2
1136
1137        vpxor   \T1, \T2, \T2   # bring in the DW saved by the first phase
1138        vpxor   \T2, \T7, \T7
1139        vpxor   \T7, \T6, \T6   # the result is in T6
1140
1141.endm
1142
1143
1144# GCM_ENC_DEC_AVX: shared body of the GCM encrypt (ENC) and decrypt (DEC) entry points
1145# clobbering all xmm registers
1146# clobbering r10, r11 and rax; r12-r15 are used as scratch but pushed on entry and popped on exit
1147.macro  GCM_ENC_DEC_AVX     ENC_DEC
1148
1149        #the number of pushes must equal STACK_OFFSET
1150        push    %r12
1151        push    %r13
1152        push    %r14
1153        push    %r15
1154
1155        mov     %rsp, %r14
1156        # r14 keeps the stack pointer as it was after the pushes; it is copied
1157        # back to rsp at _return_T_done, which drops the aligned scratch frame
1158        # reserved below
1159
1160        sub     $VARIABLE_OFFSET, %rsp
1161        and     $~63, %rsp                  # align rsp to 64 bytes
1162
1163
1164        vmovdqu  HashKey(arg1), %xmm13      # xmm13 = HashKey
1165
1166        mov     arg4, %r13                  # save the number of bytes of plaintext/ciphertext
1167        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
1168        # r12 = (number of whole 16-byte blocks) mod 8 = block count handed to INITIAL_BLOCKS_AVX
1169        mov     %r13, %r12
1170        shr     $4, %r12
1171        and     $7, %r12
1172        jz      _initial_num_blocks_is_0\@
1173
1174        cmp     $7, %r12
1175        je      _initial_num_blocks_is_7\@
1176        cmp     $6, %r12
1177        je      _initial_num_blocks_is_6\@
1178        cmp     $5, %r12
1179        je      _initial_num_blocks_is_5\@
1180        cmp     $4, %r12
1181        je      _initial_num_blocks_is_4\@
1182        cmp     $3, %r12
1183        je      _initial_num_blocks_is_3\@
1184        cmp     $2, %r12
1185        je      _initial_num_blocks_is_2\@
1186
1187        jmp     _initial_num_blocks_is_1\@
1188
1189_initial_num_blocks_is_7\@:
1190        INITIAL_BLOCKS_AVX  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1191        sub     $16*7, %r13
1192        jmp     _initial_blocks_encrypted\@
1193
1194_initial_num_blocks_is_6\@:
1195        INITIAL_BLOCKS_AVX  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1196        sub     $16*6, %r13
1197        jmp     _initial_blocks_encrypted\@
1198
1199_initial_num_blocks_is_5\@:
1200        INITIAL_BLOCKS_AVX  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1201        sub     $16*5, %r13
1202        jmp     _initial_blocks_encrypted\@
1203
1204_initial_num_blocks_is_4\@:
1205        INITIAL_BLOCKS_AVX  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1206        sub     $16*4, %r13
1207        jmp     _initial_blocks_encrypted\@
1208
1209_initial_num_blocks_is_3\@:
1210        INITIAL_BLOCKS_AVX  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1211        sub     $16*3, %r13
1212        jmp     _initial_blocks_encrypted\@
1213
1214_initial_num_blocks_is_2\@:
1215        INITIAL_BLOCKS_AVX  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1216        sub     $16*2, %r13
1217        jmp     _initial_blocks_encrypted\@
1218
1219_initial_num_blocks_is_1\@:
1220        INITIAL_BLOCKS_AVX  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1221        sub     $16*1, %r13
1222        jmp     _initial_blocks_encrypted\@
1223
1224_initial_num_blocks_is_0\@:
1225        INITIAL_BLOCKS_AVX  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1226        # NOTE(review): r11 serves as the byte offset into arg3/arg2 from here on but is never initialised in this macro; presumably INITIAL_BLOCKS_AVX sets it - confirm
1227        # r13 = whole-block bytes still to process; after the subtractions above it is a multiple of 128
1228_initial_blocks_encrypted\@:
1229        cmp     $0, %r13
1230        je      _zero_cipher_left\@
1231
1232        sub     $128, %r13
1233        je      _eight_cipher_left\@
1234
1235        # r15d = lowest byte of the counter register xmm9, tracked so the loop
1236        # below can tell when adding 8 would carry out of that byte
1237        # NOTE(review): the byte order xmm9 arrives in is set by INITIAL_BLOCKS_AVX - confirm
1238        vmovd   %xmm9, %r15d
1239        and     $255, %r15d
1240        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1241
1242
1243_encrypt_by_8_new\@:
1244        cmp     $(255-8), %r15d
1245        jg      _encrypt_by_8\@
1246
1247        # r15d <= 247: adding 8 cannot wrap the tracked counter byte, so the
1248        # out_order variant of the 8-block macro is expanded
1249        add     $8, %r15b
1250        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
1251        add     $128, %r11
1252        sub     $128, %r13
1253        jne     _encrypt_by_8_new\@
1254
1255        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1256        jmp     _eight_cipher_left\@
1257        # r15d > 247: xmm9 is byte-swapped around an in_order expansion of the 8-block macro
1258_encrypt_by_8\@:
1259        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1260        add     $8, %r15b
1261        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
1262        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1263        add     $128, %r11
1264        sub     $128, %r13
1265        jne     _encrypt_by_8_new\@
1266
1267        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1268
1269
1270        # GHASH the eight blocks still held in xmm1..xmm8; the macro leaves the hash in
1271        # its T6 argument (xmm14) and ends with HashKey reloaded into its T5 argument (xmm13)
1272_eight_cipher_left\@:
1273        GHASH_LAST_8_AVX    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
1274
1275
1276_zero_cipher_left\@:
1277        cmp     $16, arg4
1278        jl      _only_less_than_16\@
1279
1280        mov     arg4, %r13
1281        and     $15, %r13                            # r13 = (arg4 mod 16)
1282
1283        je      _multiple_of_16_bytes\@
1284
1285        # handle the last <16 Byte block separately
1286
1287
1288        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
1289        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1290        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
1291        # move r11 back by 16-r13 so the 16-byte load below ends at the last input byte
1292        sub     $16, %r11
1293        add     %r13, %r11
1294        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
1295
1296        lea     SHIFT_MASK+16(%rip), %r12
1297        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
1298						     # able to shift 16-r13 bytes (r13 is the
1299						     # number of bytes in plaintext mod 16)
1300        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
1301        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
1302        jmp     _final_ghash_mul\@
1303
1304_only_less_than_16\@:
1305        # check for 0 length
1306        mov     arg4, %r13
1307        and     $15, %r13                            # r13 = (arg4 mod 16)
1308
1309        je      _multiple_of_16_bytes\@
1310
1311        # handle the last <16 Byte block separately
1312
1313
1314        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
1315        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1316        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
1317
1318
1319        lea     SHIFT_MASK+16(%rip), %r12
1320        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
1321						     # able to shift 16-r13 bytes (r13 is the
1322						     # number of bytes in plaintext mod 16)
1323        # copy the input byte by byte into the TMP1 stack slot until r11 == r13; NOTE(review): relies on r11 == 0 on entry - confirm
1324_get_last_16_byte_loop\@:
1325        movb    (arg3, %r11),  %al
1326        movb    %al,  TMP1 (%rsp , %r11)
1327        add     $1, %r11
1328        cmp     %r13,  %r11
1329        jne     _get_last_16_byte_loop\@
1330
1331        vmovdqu  TMP1(%rsp), %xmm1
1332        # bytes of TMP1 past r13 were not written by the loop; they are cleared by the mask applied below
1333        sub     $16, %r11
1334
1335_final_ghash_mul\@:
1336        .if  \ENC_DEC ==  DEC
1337        vmovdqa %xmm1, %xmm2                         # keep a copy of the ciphertext for GHASH
1338        vpxor   %xmm1, %xmm9, %xmm9                  # Ciphertext XOR E(K, Yn)
1339        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
1340						     # mask out top 16-r13 bytes of xmm9
1341        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
1342        vpand   %xmm1, %xmm2, %xmm2                  # same mask on the ciphertext copy
1343        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
1344        vpxor   %xmm2, %xmm14, %xmm14
1345	#GHASH computation for the last <16 Byte block
1346        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1347        sub     %r13, %r11
1348        add     $16, %r11
1349        .else
1350        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
1351        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
1352						     # mask out top 16-r13 bytes of xmm9
1353        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
1354        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1355        vpxor   %xmm9, %xmm14, %xmm14
1356	#GHASH computation for the last <16 Byte block
1357        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1358        sub     %r13, %r11
1359        add     $16, %r11
1360        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
1361        .endif
1362
1363
1364        #############################
1365        # output r13 Bytes
1366        vmovq   %xmm9, %rax
1367        cmp     $8, %r13
1368        jle     _less_than_8_bytes_left\@
1369        # more than 8 bytes: store the low qword at once, then continue with the high qword
1370        mov     %rax, (arg2 , %r11)
1371        add     $8, %r11
1372        vpsrldq $8, %xmm9, %xmm9
1373        vmovq   %xmm9, %rax
1374        sub     $8, %r13
1375        # store the remaining r13 bytes one at a time from the low end of rax
1376_less_than_8_bytes_left\@:
1377        movb    %al, (arg2 , %r11)
1378        add     $1, %r11
1379        shr     $8, %rax
1380        sub     $1, %r13
1381        jne     _less_than_8_bytes_left\@
1382        #############################
1383
1384_multiple_of_16_bytes\@:
1385        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
1386        shl     $3, %r12                             # convert into number of bits
1387        vmovd   %r12d, %xmm15                        # len(A) in xmm15 (only the low 32 bits of r12 are moved)
1388
1389        shl     $3, arg4                             # len(C) in bits  (*8); arg4 itself is modified
1390        vmovq   arg4, %xmm1
1391        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
1392        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
1393
1394        vpxor   %xmm15, %xmm14, %xmm14
1395        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
1396        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
1397
1398        mov     arg5, %rax                           # rax = *Y0
1399        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
1400
1401        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
1402
1403        vpxor   %xmm14, %xmm9, %xmm9                 # xmm9 = GHASH result XOR E(K, Y0), written out as the tag below
1404
1405
1406
1407_return_T\@:
1408        mov     arg8, %r10              # r10 = authTag
1409        mov     arg9, %r11              # r11 = auth_tag_len
1410
1411        cmp     $16, %r11
1412        je      _T_16\@
1413
1414        cmp     $12, %r11
1415        je      _T_12\@
1416        # any auth_tag_len other than 16 or 12 falls through and stores 8 bytes
1417_T_8\@:
1418        vmovq   %xmm9, %rax
1419        mov     %rax, (%r10)
1420        jmp     _return_T_done\@
1421_T_12\@:
1422        vmovq   %xmm9, %rax
1423        mov     %rax, (%r10)
1424        vpsrldq $8, %xmm9, %xmm9
1425        vmovd   %xmm9, %eax
1426        mov     %eax, 8(%r10)
1427        jmp     _return_T_done\@
1428
1429_T_16\@:
1430        vmovdqu %xmm9, (%r10)
1431
1432_return_T_done\@:
1433        mov     %r14, %rsp              # drop the aligned scratch frame
1434
1435        pop     %r15
1436        pop     %r14
1437        pop     %r13
1438        pop     %r12
1439.endm
1440
1441
1442#############################################################
1443#void   aesni_gcm_precomp_avx_gen2
1444#        (gcm_data     *my_ctx_data,
1445#        u8     *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1446#############################################################
1447ENTRY(aesni_gcm_precomp_avx_gen2)
1448        #the number of pushes must equal STACK_OFFSET
1449        push    %r12
1450        push    %r13
1451        push    %r14
1452        push    %r15
1453
1454        mov     %rsp, %r14
1455        # r14 keeps the stack pointer as it was after the pushes; it is copied
1456        # back to rsp before the pops, dropping the aligned frame reserved below
1457
1458        sub     $VARIABLE_OFFSET, %rsp
1459        and     $~63, %rsp                  # align rsp to 64 bytes
1460        # derive HashKey<<1 mod poly from the raw subkey at arg2 and store it at HashKey(arg1)
1461        vmovdqu  (arg2), %xmm6              # xmm6 = HashKey
1462
1463        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
1464        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
1465        vmovdqa  %xmm6, %xmm2
1466        vpsllq   $1, %xmm6, %xmm6           # each qword << 1
1467        vpsrlq   $63, %xmm2, %xmm2          # xmm2 = the bit shifted out of each qword
1468        vmovdqa  %xmm2, %xmm1
1469        vpslldq  $8, %xmm2, %xmm2           # carry out of the low qword, moved up to the high qword
1470        vpsrldq  $8, %xmm1, %xmm1           # carry out of the high qword, moved down to the low qword
1471        vpor     %xmm2, %xmm6, %xmm6        # xmm6 = HashKey << 1 as one 128-bit value
1472        #reduction
1473        vpshufd  $0b00100100, %xmm1, %xmm2  # copy the carry dword of xmm1 into dwords 0 and 3 of xmm2
1474        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 # per-dword all-ones mask where xmm2 equals TWOONE (constant defined outside this view)
1475        vpand    POLY(%rip), %xmm2, %xmm2   # keep POLY only in the dwords that matched
1476        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
1477        #######################################################################
1478        vmovdqa  %xmm6, HashKey(arg1)       # store HashKey<<1 mod poly
1479        # the aligned store above needs HashKey(arg1) to sit on a 16-byte boundary
1480        # PRECOMPUTE_AVX is defined outside this view; presumably it fills the remaining HashKey_i tables - confirm
1481        PRECOMPUTE_AVX  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1482
1483        mov     %r14, %rsp
1484
1485        pop     %r15
1486        pop     %r14
1487        pop     %r13
1488        pop     %r12
1489        ret
1490ENDPROC(aesni_gcm_precomp_avx_gen2)
1491
1492###############################################################################
1493#void   aesni_gcm_enc_avx_gen2(
1494#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1495#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
1496#        const   u8 *in, /* Plaintext input */
1497#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
1498#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1499#			(from Security Association) concatenated with 8 byte
1500#			Initialisation Vector (from IPSec ESP Payload)
1501#			concatenated with 0x00000001. 16-byte aligned pointer. */
1502#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1503#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1504#        u8      *auth_tag, /* Authenticated Tag output. */
1505#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
1506#				Valid values are 16 (most likely), 12 or 8; any other value stores 8 bytes. */
1507###############################################################################
1508ENTRY(aesni_gcm_enc_avx_gen2)
1509        GCM_ENC_DEC_AVX     ENC             # shared GCM body in encrypt mode; it saves and restores r12-r15 and rsp itself
1510	ret
1511ENDPROC(aesni_gcm_enc_avx_gen2)
1512
1513###############################################################################
1514#void   aesni_gcm_dec_avx_gen2(
1515#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1516#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
1517#        const   u8 *in, /* Ciphertext input */
1518#        u64     plaintext_len, /* Length of data in Bytes for decryption. */
1519#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1520#			(from Security Association) concatenated with 8 byte
1521#			Initialisation Vector (from IPSec ESP Payload)
1522#			concatenated with 0x00000001. 16-byte aligned pointer. */
1523#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1524#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1525#        u8      *auth_tag, /* Authenticated Tag output. */
1526#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
1527#				Valid values are 16 (most likely), 12 or 8; any other value stores 8 bytes. */
1528###############################################################################
1529ENTRY(aesni_gcm_dec_avx_gen2)
1530        GCM_ENC_DEC_AVX     DEC             # shared GCM body in decrypt mode; it saves and restores r12-r15 and rsp itself
1531	ret
1532ENDPROC(aesni_gcm_dec_avx_gen2)
1533#endif /* CONFIG_AS_AVX */
1534
1535#ifdef CONFIG_AS_AVX2
1536###############################################################################
1537# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1538# Input: A and B (128-bits each, bit-reflected)
1539# Output: C = A*B*x mod poly, (i.e. >>1 )
1540# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1541# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1542###############################################################################
1543.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1544        # GH is updated in place, HK is only read; T1-T3 are scratch, T4 and T5 are accepted but unused
1545        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
1546        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
1547        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
1548        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
1549        vpxor           \T3, \GH, \GH          # GH = sum of the two cross products (middle term)
1550
1551        # fold the middle term into the 256-bit product <T1:GH>
1552        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
1553        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
1554
1555        vpxor           \T3, \T1, \T1
1556        vpxor           \T2, \GH, \GH
1557
1558        #######################################################################
1559        #first phase of the reduction
1560        vmovdqa         POLY2(%rip), \T3
1561
1562        vpclmulqdq      $0x01, \GH, \T3, \T2
1563        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
1564
1565        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
1566        #######################################################################
1567        #second phase of the reduction
1568        vpclmulqdq      $0x00, \GH, \T3, \T2
1569        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1570
1571        vpclmulqdq      $0x10, \GH, \T3, \GH
1572        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1573
1574        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
1575        #######################################################################
1576        vpxor           \T1, \GH, \GH          # the result is in GH
1577
1578
1579.endm
1580
1581.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1582        # Stores HashKey^2 .. HashKey^8 (each <<1 mod poly) at HashKey_2 .. HashKey_8 of arg1; HK is only read, T5 carries the running power
1583        # HashKey_i_k would hold the XOR of the low and high parts of HashKey_i, but no _k values are written by this macro
1584        vmovdqa  \HK, \T5
1585        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
1586        vmovdqa  \T5, HashKey_2(arg1)                       #  [HashKey_2] = HashKey^2<<1 mod poly
1587
1588        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
1589        vmovdqa  \T5, HashKey_3(arg1)                       #  [HashKey_3] = HashKey^3<<1 mod poly
1590
1591        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
1592        vmovdqa  \T5, HashKey_4(arg1)                       #  [HashKey_4] = HashKey^4<<1 mod poly
1593
1594        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
1595        vmovdqa  \T5, HashKey_5(arg1)                       #  [HashKey_5] = HashKey^5<<1 mod poly
1596
1597        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
1598        vmovdqa  \T5, HashKey_6(arg1)                       #  [HashKey_6] = HashKey^6<<1 mod poly
1599
1600        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
1601        vmovdqa  \T5, HashKey_7(arg1)                       #  [HashKey_7] = HashKey^7<<1 mod poly
1602
1603        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
1604        vmovdqa  \T5, HashKey_8(arg1)                       #  [HashKey_8] = HashKey^8<<1 mod poly
1605
1606.endm
1607
1608
1609## if a = number of total plaintext bytes
1610## b = floor(a/16)
1611## num_initial_blocks = b mod 8
1612## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1613## r10, r11, r12, rax are clobbered
1614## arg1, arg2, arg3, r14 are used as a pointer only, not modified
1615
# INITIAL_BLOCKS_AVX2 - hash the AAD, then run AES-CTR over the first
# num_initial_blocks blocks and fold them into the hash; when r13 >= 128 the
# following eight blocks are also run through AES-CTR (but not yet hashed).
#   num_initial_blocks - assemble-time count of leading blocks to process
#   T1, T3-T6          - scratch registers
#   T2                 - multiplier handed to GHASH_MUL_AVX2 (the visible caller
#                        passes the register it loaded from HashKey(arg1))
#   CTR                - running counter block, loaded from the address in arg5
#   XMM1-XMM8          - on the eight-block path: receive the byte-swapped blocks
#                        to be hashed next, XMM1 with the running hash XORed in
#   T_key              - holds the current AES round key
#   ENC_DEC            - ENC or DEC; for DEC the input block is what gets hashed
#   VER                - not referenced in the body
# The running hash is also saved to TMP1(%rsp) and copied to T3.
# reg_i / reg_j are register aliases selected from i / j by setreg, which is
# defined elsewhere in this file (presumably %xmm<i> / %xmm<j> - TODO confirm).
1616.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1617	i = (8-\num_initial_blocks)
1618	setreg
1619
1620        mov     arg6, %r10                       # r10 = AAD
1621        mov     arg7, %r12                       # r12 = aadLen
1622
1623
1624        mov     %r12, %r11
1625
        # Gather the AAD into reg_i, 4 bytes per iteration: each new dword is
        # placed in the top dword while earlier data moves down by 4 bytes.
        # NOTE(review): the body runs before the length is tested, so 4 bytes
        # are read from the AAD pointer even when aadLen is 0, and an aadLen
        # that is not a multiple of 4 reads past the end of the AAD.
1626        vpxor   reg_i, reg_i, reg_i
1627_get_AAD_loop\@:
1628        vmovd   (%r10), \T1
1629        vpslldq $12, \T1, \T1
1630        vpsrldq $4, reg_i, reg_i
1631        vpxor   \T1, reg_i, reg_i
1632
1633        add     $4, %r10
1634        sub     $4, %r12
1635        jg      _get_AAD_loop\@
1636
1637
        # When aadLen is not 16, keep shifting right by 4 bytes while r12
        # (counting down from 16) is still greater than aadLen (kept in r11).
        # NOTE(review): a single 16-byte register accumulates the AAD, so this
        # appears to assume aadLen <= 16 - confirm against the callers.
1638        cmp     $16, %r11
1639        je      _get_AAD_loop2_done\@
1640        mov     $16, %r12
1641
1642_get_AAD_loop2\@:
1643        vpsrldq $4, reg_i, reg_i
1644        sub     $4, %r12
1645        cmp     %r11, %r12
1646        jg      _get_AAD_loop2\@
1647
1648_get_AAD_loop2_done\@:
1649
1650        #byte-reflect the AAD data
1651        vpshufb SHUF_MASK(%rip), reg_i, reg_i
1652
1653	# initialize the data pointer offset as zero
1654	xor     %r11, %r11
1655
1656	# start AES for num_initial_blocks blocks
1657	mov     arg5, %rax                     # rax = *Y0
1658	vmovdqu (%rax), \CTR                   # CTR = Y0
1659	vpshufb SHUF_MASK(%rip), \CTR, \CTR
1660
1661
	# build one counter block per initial block in registers
	# (9-num_initial_blocks) .. 8; register (8-num_initial_blocks) keeps the AAD
1662	i = (9-\num_initial_blocks)
1663	setreg
1664.rep \num_initial_blocks
1665                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
1666                vmovdqa \CTR, reg_i
1667                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
1668	i = (i+1)
1669	setreg
1670.endr
1671
	# AES round 0: XOR with the round key stored at offset 0 of arg1
1672	vmovdqa  (arg1), \T_key
1673	i = (9-\num_initial_blocks)
1674	setreg
1675.rep \num_initial_blocks
1676                vpxor   \T_key, reg_i, reg_i
1677	i = (i+1)
1678	setreg
1679.endr
1680
	# AES rounds 1-9: round key j is read from 16*j(arg1)
1681	j = 1
1682	setreg
1683.rep 9
1684	vmovdqa  16*j(arg1), \T_key
1685	i = (9-\num_initial_blocks)
1686	setreg
1687.rep \num_initial_blocks
1688        vaesenc \T_key, reg_i, reg_i
1689	i = (i+1)
1690	setreg
1691.endr
1692
1693	j = (j+1)
1694	setreg
1695.endr
1696
1697
	# final AES round with the key at 16*10(arg1)
1698	vmovdqa  16*10(arg1), \T_key
1699	i = (9-\num_initial_blocks)
1700	setreg
1701.rep \num_initial_blocks
1702        vaesenclast      \T_key, reg_i, reg_i
1703	i = (i+1)
1704	setreg
1705.endr
1706
	# XOR the keystream with the input at (arg3,r11), store to (arg2,r11),
	# and leave the ciphertext (byte-swapped) in reg_i for hashing
1707	i = (9-\num_initial_blocks)
1708	setreg
1709.rep \num_initial_blocks
1710                vmovdqu (arg3, %r11), \T1
1711                vpxor   \T1, reg_i, reg_i
1712                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for
1713						       # num_initial_blocks blocks
1714                add     $16, %r11
1715.if  \ENC_DEC == DEC
1716                vmovdqa \T1, reg_i                     # DEC: hash the input block, not the output
1717.endif
1718                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
1719	i = (i+1)
1720	setreg
1721.endr
1722
1723
	# hash chain: multiply the AAD block by T2, then for each initial block
	# XOR the running value into the next register and multiply again
1724	i = (8-\num_initial_blocks)
1725	j = (9-\num_initial_blocks)
1726	setreg
1727        GHASH_MUL_AVX2       reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1728
1729.rep \num_initial_blocks
1730        vpxor    reg_i, reg_j, reg_j
1731        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
1732	i = (i+1)
1733	j = (j+1)
1734	setreg
1735.endr
1736        # XMM8 has the combined result here
1737
1738        vmovdqa  \XMM8, TMP1(%rsp)
1739        vmovdqa  \XMM8, \T3
1740
        # r13 is not set in this macro; the visible caller loads it with the
        # byte count rounded down to a multiple of 16
1741        cmp     $128, %r13
1742        jl      _initial_blocks_done\@                  # no need for precomputed constants
1743
1744###############################################################################
1745# r13 >= 128 here: generate and encrypt the next eight counter blocks
1746                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1747                vmovdqa  \CTR, \XMM1
1748                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
1749
1750                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1751                vmovdqa  \CTR, \XMM2
1752                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
1753
1754                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1755                vmovdqa  \CTR, \XMM3
1756                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
1757
1758                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1759                vmovdqa  \CTR, \XMM4
1760                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
1761
1762                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1763                vmovdqa  \CTR, \XMM5
1764                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
1765
1766                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1767                vmovdqa  \CTR, \XMM6
1768                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
1769
1770                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1771                vmovdqa  \CTR, \XMM7
1772                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
1773
1774                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1775                vmovdqa  \CTR, \XMM8
1776                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
1777
                # AES round 0 for all eight blocks
1778                vmovdqa  (arg1), \T_key
1779                vpxor    \T_key, \XMM1, \XMM1
1780                vpxor    \T_key, \XMM2, \XMM2
1781                vpxor    \T_key, \XMM3, \XMM3
1782                vpxor    \T_key, \XMM4, \XMM4
1783                vpxor    \T_key, \XMM5, \XMM5
1784                vpxor    \T_key, \XMM6, \XMM6
1785                vpxor    \T_key, \XMM7, \XMM7
1786                vpxor    \T_key, \XMM8, \XMM8
1787
1788		i = 1
1789		setreg
1790.rep    9       # do 9 rounds
1791                vmovdqa  16*i(arg1), \T_key
1792                vaesenc  \T_key, \XMM1, \XMM1
1793                vaesenc  \T_key, \XMM2, \XMM2
1794                vaesenc  \T_key, \XMM3, \XMM3
1795                vaesenc  \T_key, \XMM4, \XMM4
1796                vaesenc  \T_key, \XMM5, \XMM5
1797                vaesenc  \T_key, \XMM6, \XMM6
1798                vaesenc  \T_key, \XMM7, \XMM7
1799                vaesenc  \T_key, \XMM8, \XMM8
1800		i = (i+1)
1801		setreg
1802.endr
1803
1804
                # i is 10 after the loop above: final round key
1805                vmovdqa  16*i(arg1), \T_key
1806                vaesenclast  \T_key, \XMM1, \XMM1
1807                vaesenclast  \T_key, \XMM2, \XMM2
1808                vaesenclast  \T_key, \XMM3, \XMM3
1809                vaesenclast  \T_key, \XMM4, \XMM4
1810                vaesenclast  \T_key, \XMM5, \XMM5
1811                vaesenclast  \T_key, \XMM6, \XMM6
1812                vaesenclast  \T_key, \XMM7, \XMM7
1813                vaesenclast  \T_key, \XMM8, \XMM8
1814
                # XOR keystream with input, store the output, and for DEC keep
                # the input block in the register so that it is what gets hashed
1815                vmovdqu  (arg3, %r11), \T1
1816                vpxor    \T1, \XMM1, \XMM1
1817                vmovdqu  \XMM1, (arg2 , %r11)
1818                .if   \ENC_DEC == DEC
1819                vmovdqa  \T1, \XMM1
1820                .endif
1821
1822                vmovdqu  16*1(arg3, %r11), \T1
1823                vpxor    \T1, \XMM2, \XMM2
1824                vmovdqu  \XMM2, 16*1(arg2 , %r11)
1825                .if   \ENC_DEC == DEC
1826                vmovdqa  \T1, \XMM2
1827                .endif
1828
1829                vmovdqu  16*2(arg3, %r11), \T1
1830                vpxor    \T1, \XMM3, \XMM3
1831                vmovdqu  \XMM3, 16*2(arg2 , %r11)
1832                .if   \ENC_DEC == DEC
1833                vmovdqa  \T1, \XMM3
1834                .endif
1835
1836                vmovdqu  16*3(arg3, %r11), \T1
1837                vpxor    \T1, \XMM4, \XMM4
1838                vmovdqu  \XMM4, 16*3(arg2 , %r11)
1839                .if   \ENC_DEC == DEC
1840                vmovdqa  \T1, \XMM4
1841                .endif
1842
1843                vmovdqu  16*4(arg3, %r11), \T1
1844                vpxor    \T1, \XMM5, \XMM5
1845                vmovdqu  \XMM5, 16*4(arg2 , %r11)
1846                .if   \ENC_DEC == DEC
1847                vmovdqa  \T1, \XMM5
1848                .endif
1849
1850                vmovdqu  16*5(arg3, %r11), \T1
1851                vpxor    \T1, \XMM6, \XMM6
1852                vmovdqu  \XMM6, 16*5(arg2 , %r11)
1853                .if   \ENC_DEC == DEC
1854                vmovdqa  \T1, \XMM6
1855                .endif
1856
1857                vmovdqu  16*6(arg3, %r11), \T1
1858                vpxor    \T1, \XMM7, \XMM7
1859                vmovdqu  \XMM7, 16*6(arg2 , %r11)
1860                .if   \ENC_DEC == DEC
1861                vmovdqa  \T1, \XMM7
1862                .endif
1863
1864                vmovdqu  16*7(arg3, %r11), \T1
1865                vpxor    \T1, \XMM8, \XMM8
1866                vmovdqu  \XMM8, 16*7(arg2 , %r11)
1867                .if   \ENC_DEC == DEC
1868                vmovdqa  \T1, \XMM8
1869                .endif
1870
1871                add     $128, %r11
1872
1873                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
1874                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
1875							   # the corresponding ciphertext
1876                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
1877                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
1878                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
1879                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
1880                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
1881                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
1882                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
1883
1884###############################################################################
1885
1886_initial_blocks_done\@:
1887
1888
1889.endm
1890
1890
1891
1892
1893# encrypt 8 blocks at a time
1894# ghash the 8 previously encrypted ciphertext blocks
1895# arg1, arg2, arg3 are used as pointers only, not modified
1896# r11 is the data offset value
# GHASH_8_ENCRYPT_8_PARALLEL_AVX2 - run AES-CTR over the next eight blocks
# while folding the previous eight blocks into the hash; the AES rounds and
# the carry-less multiplies are interleaved.
#   T1-T7     - scratch registers
#   CTR       - counter block; left equal to the eighth new counter value
#   XMM1-XMM8 - in:  the previous eight blocks, prepared for hashing
#               out: the eight ciphertext blocks of this pass (the output for
#                    ENC, the input for DEC), byte-swapped, with the reduced
#                    hash XORed into XMM1
#   loop_idx  - in_order: add ONE to the counter and byte-swap each block;
#               otherwise: add ONEf and skip the swap
#   ENC_DEC   - ENC or DEC
# TMP2(%rsp)..TMP8(%rsp) are used as spill slots.  Input is read from
# (arg3,%r11) and output written to (arg2,%r11); r11 is not advanced here.
1897.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1898
        # free XMM1-XMM8 for the new counter blocks: the first previous block
        # stays in a register (T2), the other seven are spilled to the stack
1899        vmovdqa \XMM1, \T2
1900        vmovdqa \XMM2, TMP2(%rsp)
1901        vmovdqa \XMM3, TMP3(%rsp)
1902        vmovdqa \XMM4, TMP4(%rsp)
1903        vmovdqa \XMM5, TMP5(%rsp)
1904        vmovdqa \XMM6, TMP6(%rsp)
1905        vmovdqa \XMM7, TMP7(%rsp)
1906        vmovdqa \XMM8, TMP8(%rsp)
1907
1908.if \loop_idx == in_order
1909                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
1910                vpaddd  ONE(%rip), \XMM1, \XMM2
1911                vpaddd  ONE(%rip), \XMM2, \XMM3
1912                vpaddd  ONE(%rip), \XMM3, \XMM4
1913                vpaddd  ONE(%rip), \XMM4, \XMM5
1914                vpaddd  ONE(%rip), \XMM5, \XMM6
1915                vpaddd  ONE(%rip), \XMM6, \XMM7
1916                vpaddd  ONE(%rip), \XMM7, \XMM8
1917                vmovdqa \XMM8, \CTR
1918
1919                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
1920                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
1921                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
1922                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
1923                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
1924                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
1925                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
1926                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
1927.else
                # this path adds ONEf and performs no byte swap (presumably CTR
                # is already held in swapped form by the caller - TODO confirm)
1928                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
1929                vpaddd  ONEf(%rip), \XMM1, \XMM2
1930                vpaddd  ONEf(%rip), \XMM2, \XMM3
1931                vpaddd  ONEf(%rip), \XMM3, \XMM4
1932                vpaddd  ONEf(%rip), \XMM4, \XMM5
1933                vpaddd  ONEf(%rip), \XMM5, \XMM6
1934                vpaddd  ONEf(%rip), \XMM6, \XMM7
1935                vpaddd  ONEf(%rip), \XMM7, \XMM8
1936                vmovdqa \XMM8, \CTR
1937.endif
1938
1939
1940        #######################################################################
1941
                # AES round 0
1942                vmovdqu (arg1), \T1
1943                vpxor   \T1, \XMM1, \XMM1
1944                vpxor   \T1, \XMM2, \XMM2
1945                vpxor   \T1, \XMM3, \XMM3
1946                vpxor   \T1, \XMM4, \XMM4
1947                vpxor   \T1, \XMM5, \XMM5
1948                vpxor   \T1, \XMM6, \XMM6
1949                vpxor   \T1, \XMM7, \XMM7
1950                vpxor   \T1, \XMM8, \XMM8
1951
1952        #######################################################################
1953
1954
1955
1956
1957
1958                vmovdqu 16*1(arg1), \T1
1959                vaesenc \T1, \XMM1, \XMM1
1960                vaesenc \T1, \XMM2, \XMM2
1961                vaesenc \T1, \XMM3, \XMM3
1962                vaesenc \T1, \XMM4, \XMM4
1963                vaesenc \T1, \XMM5, \XMM5
1964                vaesenc \T1, \XMM6, \XMM6
1965                vaesenc \T1, \XMM7, \XMM7
1966                vaesenc \T1, \XMM8, \XMM8
1967
1968                vmovdqu 16*2(arg1), \T1
1969                vaesenc \T1, \XMM1, \XMM1
1970                vaesenc \T1, \XMM2, \XMM2
1971                vaesenc \T1, \XMM3, \XMM3
1972                vaesenc \T1, \XMM4, \XMM4
1973                vaesenc \T1, \XMM5, \XMM5
1974                vaesenc \T1, \XMM6, \XMM6
1975                vaesenc \T1, \XMM7, \XMM7
1976                vaesenc \T1, \XMM8, \XMM8
1977
1978
1979        #######################################################################
1980
        # first previous block (in T2) times HashKey_8; this starts the three
        # accumulators: T4 = high products, T7 = low products, T6 = cross products
1981        vmovdqa         HashKey_8(arg1), \T5
1982        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
1983        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
1984        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
1985        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
1986        vpxor           \T5, \T6, \T6
1987
1988                vmovdqu 16*3(arg1), \T1
1989                vaesenc \T1, \XMM1, \XMM1
1990                vaesenc \T1, \XMM2, \XMM2
1991                vaesenc \T1, \XMM3, \XMM3
1992                vaesenc \T1, \XMM4, \XMM4
1993                vaesenc \T1, \XMM5, \XMM5
1994                vaesenc \T1, \XMM6, \XMM6
1995                vaesenc \T1, \XMM7, \XMM7
1996                vaesenc \T1, \XMM8, \XMM8
1997
        # second previous block times HashKey_7, accumulated into T4 / T7 / T6
1998        vmovdqa         TMP2(%rsp), \T1
1999        vmovdqa         HashKey_7(arg1), \T5
2000        vpclmulqdq      $0x11, \T5, \T1, \T3
2001        vpxor           \T3, \T4, \T4
2002
2003        vpclmulqdq      $0x00, \T5, \T1, \T3
2004        vpxor           \T3, \T7, \T7
2005
2006        vpclmulqdq      $0x01, \T5, \T1, \T3
2007        vpxor           \T3, \T6, \T6
2008
2009        vpclmulqdq      $0x10, \T5, \T1, \T3
2010        vpxor           \T3, \T6, \T6
2011
2012                vmovdqu 16*4(arg1), \T1
2013                vaesenc \T1, \XMM1, \XMM1
2014                vaesenc \T1, \XMM2, \XMM2
2015                vaesenc \T1, \XMM3, \XMM3
2016                vaesenc \T1, \XMM4, \XMM4
2017                vaesenc \T1, \XMM5, \XMM5
2018                vaesenc \T1, \XMM6, \XMM6
2019                vaesenc \T1, \XMM7, \XMM7
2020                vaesenc \T1, \XMM8, \XMM8
2021
2022        #######################################################################
2023
        # third previous block times HashKey_6
2024        vmovdqa         TMP3(%rsp), \T1
2025        vmovdqa         HashKey_6(arg1), \T5
2026        vpclmulqdq      $0x11, \T5, \T1, \T3
2027        vpxor           \T3, \T4, \T4
2028
2029        vpclmulqdq      $0x00, \T5, \T1, \T3
2030        vpxor           \T3, \T7, \T7
2031
2032        vpclmulqdq      $0x01, \T5, \T1, \T3
2033        vpxor           \T3, \T6, \T6
2034
2035        vpclmulqdq      $0x10, \T5, \T1, \T3
2036        vpxor           \T3, \T6, \T6
2037
2038                vmovdqu 16*5(arg1), \T1
2039                vaesenc \T1, \XMM1, \XMM1
2040                vaesenc \T1, \XMM2, \XMM2
2041                vaesenc \T1, \XMM3, \XMM3
2042                vaesenc \T1, \XMM4, \XMM4
2043                vaesenc \T1, \XMM5, \XMM5
2044                vaesenc \T1, \XMM6, \XMM6
2045                vaesenc \T1, \XMM7, \XMM7
2046                vaesenc \T1, \XMM8, \XMM8
2047
        # fourth previous block times HashKey_5
2048        vmovdqa         TMP4(%rsp), \T1
2049        vmovdqa         HashKey_5(arg1), \T5
2050        vpclmulqdq      $0x11, \T5, \T1, \T3
2051        vpxor           \T3, \T4, \T4
2052
2053        vpclmulqdq      $0x00, \T5, \T1, \T3
2054        vpxor           \T3, \T7, \T7
2055
2056        vpclmulqdq      $0x01, \T5, \T1, \T3
2057        vpxor           \T3, \T6, \T6
2058
2059        vpclmulqdq      $0x10, \T5, \T1, \T3
2060        vpxor           \T3, \T6, \T6
2061
2062                vmovdqu 16*6(arg1), \T1
2063                vaesenc \T1, \XMM1, \XMM1
2064                vaesenc \T1, \XMM2, \XMM2
2065                vaesenc \T1, \XMM3, \XMM3
2066                vaesenc \T1, \XMM4, \XMM4
2067                vaesenc \T1, \XMM5, \XMM5
2068                vaesenc \T1, \XMM6, \XMM6
2069                vaesenc \T1, \XMM7, \XMM7
2070                vaesenc \T1, \XMM8, \XMM8
2071
2072
        # fifth previous block times HashKey_4
2073        vmovdqa         TMP5(%rsp), \T1
2074        vmovdqa         HashKey_4(arg1), \T5
2075        vpclmulqdq      $0x11, \T5, \T1, \T3
2076        vpxor           \T3, \T4, \T4
2077
2078        vpclmulqdq      $0x00, \T5, \T1, \T3
2079        vpxor           \T3, \T7, \T7
2080
2081        vpclmulqdq      $0x01, \T5, \T1, \T3
2082        vpxor           \T3, \T6, \T6
2083
2084        vpclmulqdq      $0x10, \T5, \T1, \T3
2085        vpxor           \T3, \T6, \T6
2086
2087                vmovdqu 16*7(arg1), \T1
2088                vaesenc \T1, \XMM1, \XMM1
2089                vaesenc \T1, \XMM2, \XMM2
2090                vaesenc \T1, \XMM3, \XMM3
2091                vaesenc \T1, \XMM4, \XMM4
2092                vaesenc \T1, \XMM5, \XMM5
2093                vaesenc \T1, \XMM6, \XMM6
2094                vaesenc \T1, \XMM7, \XMM7
2095                vaesenc \T1, \XMM8, \XMM8
2096
        # sixth previous block times HashKey_3
2097        vmovdqa         TMP6(%rsp), \T1
2098        vmovdqa         HashKey_3(arg1), \T5
2099        vpclmulqdq      $0x11, \T5, \T1, \T3
2100        vpxor           \T3, \T4, \T4
2101
2102        vpclmulqdq      $0x00, \T5, \T1, \T3
2103        vpxor           \T3, \T7, \T7
2104
2105        vpclmulqdq      $0x01, \T5, \T1, \T3
2106        vpxor           \T3, \T6, \T6
2107
2108        vpclmulqdq      $0x10, \T5, \T1, \T3
2109        vpxor           \T3, \T6, \T6
2110
2111                vmovdqu 16*8(arg1), \T1
2112                vaesenc \T1, \XMM1, \XMM1
2113                vaesenc \T1, \XMM2, \XMM2
2114                vaesenc \T1, \XMM3, \XMM3
2115                vaesenc \T1, \XMM4, \XMM4
2116                vaesenc \T1, \XMM5, \XMM5
2117                vaesenc \T1, \XMM6, \XMM6
2118                vaesenc \T1, \XMM7, \XMM7
2119                vaesenc \T1, \XMM8, \XMM8
2120
        # seventh previous block times HashKey_2
2121        vmovdqa         TMP7(%rsp), \T1
2122        vmovdqa         HashKey_2(arg1), \T5
2123        vpclmulqdq      $0x11, \T5, \T1, \T3
2124        vpxor           \T3, \T4, \T4
2125
2126        vpclmulqdq      $0x00, \T5, \T1, \T3
2127        vpxor           \T3, \T7, \T7
2128
2129        vpclmulqdq      $0x01, \T5, \T1, \T3
2130        vpxor           \T3, \T6, \T6
2131
2132        vpclmulqdq      $0x10, \T5, \T1, \T3
2133        vpxor           \T3, \T6, \T6
2134
2135
2136        #######################################################################
2137
                # AES round 9 (the round key is loaded into T5 this time)
2138                vmovdqu 16*9(arg1), \T5
2139                vaesenc \T5, \XMM1, \XMM1
2140                vaesenc \T5, \XMM2, \XMM2
2141                vaesenc \T5, \XMM3, \XMM3
2142                vaesenc \T5, \XMM4, \XMM4
2143                vaesenc \T5, \XMM5, \XMM5
2144                vaesenc \T5, \XMM6, \XMM6
2145                vaesenc \T5, \XMM7, \XMM7
2146                vaesenc \T5, \XMM8, \XMM8
2147
        # eighth previous block times HashKey; the high-product total is
        # moved from T4 into T1 by the last vpxor of this group
2148        vmovdqa         TMP8(%rsp), \T1
2149        vmovdqa         HashKey(arg1), \T5
2150
2151        vpclmulqdq      $0x00, \T5, \T1, \T3
2152        vpxor           \T3, \T7, \T7
2153
2154        vpclmulqdq      $0x01, \T5, \T1, \T3
2155        vpxor           \T3, \T6, \T6
2156
2157        vpclmulqdq      $0x10, \T5, \T1, \T3
2158        vpxor           \T3, \T6, \T6
2159
2160        vpclmulqdq      $0x11, \T5, \T1, \T3
2161        vpxor           \T3, \T4, \T1
2162
2163
2164                vmovdqu 16*10(arg1), \T5
2165
	# T5 = last round key.  XORing it with the input block first lets one
	# vaesenclast yield keystream XOR input directly.  For DEC the result is
	# stored at once and reg_j is reloaded with the input block for hashing.
2166	i = 0
2167	j = 1
2168	setreg
2169.rep 8
2170		vpxor	16*i(arg3, %r11), \T5, \T2
2171                .if \ENC_DEC == ENC
2172                vaesenclast     \T2, reg_j, reg_j
2173                .else
2174                vaesenclast     \T2, reg_j, \T3
2175                vmovdqu 16*i(arg3, %r11), reg_j
2176                vmovdqu \T3, 16*i(arg2, %r11)
2177                .endif
2178	i = (i+1)
2179	j = (j+1)
2180	setreg
2181.endr
2182	#######################################################################
2183
2184
2185	vpslldq	$8, \T6, \T3				# T3 = T6 shifted left 2 DWs
2186	vpsrldq	$8, \T6, \T6				# T6 shifted right 2 DWs
2187	vpxor	\T3, \T7, \T7
2188	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
2189
2190
2191
2192	#######################################################################
2193	#first phase of the reduction
2194	vmovdqa         POLY2(%rip), \T3
2195
2196	vpclmulqdq	$0x01, \T7, \T3, \T2
2197	vpslldq		$8, \T2, \T2			# shift-L T2 2 DWs
2198
2199	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
2200	#######################################################################
2201                .if \ENC_DEC == ENC
2202		vmovdqu	 \XMM1,	16*0(arg2,%r11)		# Write to the Ciphertext buffer
2203		vmovdqu	 \XMM2,	16*1(arg2,%r11)		# Write to the Ciphertext buffer
2204		vmovdqu	 \XMM3,	16*2(arg2,%r11)		# Write to the Ciphertext buffer
2205		vmovdqu	 \XMM4,	16*3(arg2,%r11)		# Write to the Ciphertext buffer
2206		vmovdqu	 \XMM5,	16*4(arg2,%r11)		# Write to the Ciphertext buffer
2207		vmovdqu	 \XMM6,	16*5(arg2,%r11)		# Write to the Ciphertext buffer
2208		vmovdqu	 \XMM7,	16*6(arg2,%r11)		# Write to the Ciphertext buffer
2209		vmovdqu	 \XMM8,	16*7(arg2,%r11)		# Write to the Ciphertext buffer
2210                .endif
2211
2212	#######################################################################
2213	#second phase of the reduction
2214	vpclmulqdq	$0x00, \T7, \T3, \T2
2215	vpsrldq		$4, \T2, \T2			# shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2216
2217	vpclmulqdq	$0x10, \T7, \T3, \T4
2218	vpslldq		$4, \T4, \T4			# shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2219
2220	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
2221	#######################################################################
2222	vpxor		\T4, \T1, \T1			# the result is in T1
2223
2224		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
2225		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
2226		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
2227		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
2228		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
2229		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
2230		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
2231		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
2232
2233
	# fold the reduced hash of the previous eight blocks into the first new block
2234	vpxor	\T1, \XMM1, \XMM1
2235
2236
2237
2238.endm
2239
2239
2240
2241# GHASH the last 8 ciphertext blocks.
# GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 ... XMM8
#   XMM1-XMM8 - in: eight blocks to hash; XMM1 is clobbered (it accumulates
#               the Karatsuba middle products)
#   T2-T5     - scratch registers
#   T6        - accumulates the high products; holds the reduced result on exit
#   T7        - accumulates the low products; clobbered
#   T1        - not referenced in the body
# XMM1 is multiplied by HashKey_8, XMM2 by HashKey_7, ... and XMM8 by HashKey.
# Each multiply uses three VPCLMULQDQ: high*high, low*low and
# (high^low)*(high^low).  The sums are combined and reduced with POLY2.
2242.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2243
2244        ## Karatsuba Method
2245
2246        vmovdqa         HashKey_8(arg1), \T5
2247
        # vpshufd with 0b01001110 swaps the two 64-bit halves, so after the
        # vpxor the low half of T2 (T3) is high^low of the block (of the key)
2248        vpshufd         $0b01001110, \XMM1, \T2
2249        vpshufd         $0b01001110, \T5, \T3
2250        vpxor           \XMM1, \T2, \T2
2251        vpxor           \T5, \T3, \T3
2252
2253        vpclmulqdq      $0x11, \T5, \XMM1, \T6
2254        vpclmulqdq      $0x00, \T5, \XMM1, \T7
2255
2256        vpclmulqdq      $0x00, \T3, \T2, \XMM1
2257
2258        ######################
2259
2260        vmovdqa         HashKey_7(arg1), \T5
2261        vpshufd         $0b01001110, \XMM2, \T2
2262        vpshufd         $0b01001110, \T5, \T3
2263        vpxor           \XMM2, \T2, \T2
2264        vpxor           \T5, \T3, \T3
2265
2266        vpclmulqdq      $0x11, \T5, \XMM2, \T4
2267        vpxor           \T4, \T6, \T6
2268
2269        vpclmulqdq      $0x00, \T5, \XMM2, \T4
2270        vpxor           \T4, \T7, \T7
2271
2272        vpclmulqdq      $0x00, \T3, \T2, \T2
2273
2274        vpxor           \T2, \XMM1, \XMM1
2275
2276        ######################
2277
2278        vmovdqa         HashKey_6(arg1), \T5
2279        vpshufd         $0b01001110, \XMM3, \T2
2280        vpshufd         $0b01001110, \T5, \T3
2281        vpxor           \XMM3, \T2, \T2
2282        vpxor           \T5, \T3, \T3
2283
2284        vpclmulqdq      $0x11, \T5, \XMM3, \T4
2285        vpxor           \T4, \T6, \T6
2286
2287        vpclmulqdq      $0x00, \T5, \XMM3, \T4
2288        vpxor           \T4, \T7, \T7
2289
2290        vpclmulqdq      $0x00, \T3, \T2, \T2
2291
2292        vpxor           \T2, \XMM1, \XMM1
2293
2294        ######################
2295
2296        vmovdqa         HashKey_5(arg1), \T5
2297        vpshufd         $0b01001110, \XMM4, \T2
2298        vpshufd         $0b01001110, \T5, \T3
2299        vpxor           \XMM4, \T2, \T2
2300        vpxor           \T5, \T3, \T3
2301
2302        vpclmulqdq      $0x11, \T5, \XMM4, \T4
2303        vpxor           \T4, \T6, \T6
2304
2305        vpclmulqdq      $0x00, \T5, \XMM4, \T4
2306        vpxor           \T4, \T7, \T7
2307
2308        vpclmulqdq      $0x00, \T3, \T2, \T2
2309
2310        vpxor           \T2, \XMM1, \XMM1
2311
2312        ######################
2313
2314        vmovdqa         HashKey_4(arg1), \T5
2315        vpshufd         $0b01001110, \XMM5, \T2
2316        vpshufd         $0b01001110, \T5, \T3
2317        vpxor           \XMM5, \T2, \T2
2318        vpxor           \T5, \T3, \T3
2319
2320        vpclmulqdq      $0x11, \T5, \XMM5, \T4
2321        vpxor           \T4, \T6, \T6
2322
2323        vpclmulqdq      $0x00, \T5, \XMM5, \T4
2324        vpxor           \T4, \T7, \T7
2325
2326        vpclmulqdq      $0x00, \T3, \T2, \T2
2327
2328        vpxor           \T2, \XMM1, \XMM1
2329
2330        ######################
2331
2332        vmovdqa         HashKey_3(arg1), \T5
2333        vpshufd         $0b01001110, \XMM6, \T2
2334        vpshufd         $0b01001110, \T5, \T3
2335        vpxor           \XMM6, \T2, \T2
2336        vpxor           \T5, \T3, \T3
2337
2338        vpclmulqdq      $0x11, \T5, \XMM6, \T4
2339        vpxor           \T4, \T6, \T6
2340
2341        vpclmulqdq      $0x00, \T5, \XMM6, \T4
2342        vpxor           \T4, \T7, \T7
2343
2344        vpclmulqdq      $0x00, \T3, \T2, \T2
2345
2346        vpxor           \T2, \XMM1, \XMM1
2347
2348        ######################
2349
2350        vmovdqa         HashKey_2(arg1), \T5
2351        vpshufd         $0b01001110, \XMM7, \T2
2352        vpshufd         $0b01001110, \T5, \T3
2353        vpxor           \XMM7, \T2, \T2
2354        vpxor           \T5, \T3, \T3
2355
2356        vpclmulqdq      $0x11, \T5, \XMM7, \T4
2357        vpxor           \T4, \T6, \T6
2358
2359        vpclmulqdq      $0x00, \T5, \XMM7, \T4
2360        vpxor           \T4, \T7, \T7
2361
2362        vpclmulqdq      $0x00, \T3, \T2, \T2
2363
2364        vpxor           \T2, \XMM1, \XMM1
2365
2366        ######################
2367
2368        vmovdqa         HashKey(arg1), \T5
2369        vpshufd         $0b01001110, \XMM8, \T2
2370        vpshufd         $0b01001110, \T5, \T3
2371        vpxor           \XMM8, \T2, \T2
2372        vpxor           \T5, \T3, \T3
2373
2374        vpclmulqdq      $0x11, \T5, \XMM8, \T4
2375        vpxor           \T4, \T6, \T6
2376
2377        vpclmulqdq      $0x00, \T5, \XMM8, \T4
2378        vpxor           \T4, \T7, \T7
2379
2380        vpclmulqdq      $0x00, \T3, \T2, \T2
2381
        # Karatsuba recombination: middle term = (sum of the products of the
        # XORed halves) ^ high products ^ low products, left in T2
2382        vpxor           \T2, \XMM1, \XMM1
2383        vpxor           \T6, \XMM1, \XMM1
2384        vpxor           \T7, \XMM1, \T2
2385
2386
2387
2388
        # split the middle term across the low (T7) and high (T6) halves
2389        vpslldq $8, \T2, \T4
2390        vpsrldq $8, \T2, \T2
2391
2392        vpxor   \T4, \T7, \T7
2393        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
2394						   # accumulated carry-less multiplications
2395
2396        #######################################################################
2397        #first phase of the reduction
2398        vmovdqa         POLY2(%rip), \T3
2399
2400        vpclmulqdq      $0x01, \T7, \T3, \T2
2401        vpslldq         $8, \T2, \T2               # shift-L T2 2 DWs
2402
2403        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
2404        #######################################################################
2405
2406
2407        #second phase of the reduction
2408        vpclmulqdq      $0x00, \T7, \T3, \T2
2409        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2410
2411        vpclmulqdq      $0x10, \T7, \T3, \T4
2412        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2413
2414        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
2415        #######################################################################
2416        vpxor           \T4, \T6, \T6              # the result is in T6
2417.endm
2418
2418
2419
2420
2421# combined for GCM encrypt and decrypt functions
2422# clobbering all xmm registers
2423# clobbering r10, r11, r12, r13, r14, r15
#
# GCM_ENC_DEC_AVX2 ENC_DEC
#   Common body expanded by aesni_gcm_enc_avx_gen4 (ENC_DEC = ENC) and
#   aesni_gcm_dec_avx_gen4 (ENC_DEC = DEC).  The expansion does not end in
#   a ret instruction; the expanding function supplies it.
#
#   Arguments read directly by this macro:
#     arg1 - context; HashKey(arg1) is loaded into xmm13
#     arg2 - output buffer
#     arg3 - input buffer
#     arg4 - data length in bytes (overwritten: shifted left by 3 near the end)
#     arg5 - address of Y0, the block that is encrypted to mask the tag
#     arg7 - AAD length in bytes
#     arg8 - address the authentication tag is written to
#     arg9 - tag length; 16 and 12 are handled explicitly, any other value
#            stores 8 bytes
#   (arg6 is not referenced here; presumably it is consumed inside
#   INITIAL_BLOCKS_AVX2, which is defined outside this view - TODO confirm.)
#
#   Register roles visible in this macro:
#     r11   - byte offset into the input and output buffers
#     r12   - scratch (block count mod 8, mask pointer, AAD bit length)
#     r13   - bytes of whole blocks still to process, later (length mod 16)
#     r14   - copy of rsp taken before the scratch area is reserved
#     r15   - low byte of the low dword of xmm9 (the counter), used to pick
#             the out_order or in_order 8-block variant
#     xmm9  - counter block, finally the tag
#     xmm13 - HashKey
#     xmm14 - running GHASH value
#   r12-r15 are pushed on entry and popped on exit; rax is also overwritten.
2424.macro  GCM_ENC_DEC_AVX2     ENC_DEC
2425
2426        #the number of pushes must equal STACK_OFFSET
2427        push    %r12
2428        push    %r13
2429        push    %r14
2430        push    %r15
2431
2432        mov     %rsp, %r14                         # remember rsp so it can be restored before the pops
2433
2434
2435
2436
2437        sub     $VARIABLE_OFFSET, %rsp
2438        and     $~63, %rsp                         # align rsp to 64 bytes
2439
2440
2441        vmovdqu  HashKey(arg1), %xmm13             # xmm13 = HashKey
2442
2443        mov     arg4, %r13                         # save the number of bytes of plaintext/ciphertext
2444        and     $-16, %r13                         # r13 = r13 - (r13 mod 16)
2445
        # r12 = (number of whole 16-byte blocks) mod 8.  That many blocks are
        # handed to INITIAL_BLOCKS_AVX2 first, so that what is left in r13
        # afterwards is a multiple of 128 bytes.
2446        mov     %r13, %r12
2447        shr     $4, %r12
2448        and     $7, %r12
2449        jz      _initial_num_blocks_is_0\@
2450
2451        cmp     $7, %r12
2452        je      _initial_num_blocks_is_7\@
2453        cmp     $6, %r12
2454        je      _initial_num_blocks_is_6\@
2455        cmp     $5, %r12
2456        je      _initial_num_blocks_is_5\@
2457        cmp     $4, %r12
2458        je      _initial_num_blocks_is_4\@
2459        cmp     $3, %r12
2460        je      _initial_num_blocks_is_3\@
2461        cmp     $2, %r12
2462        je      _initial_num_blocks_is_2\@
2463
2464        jmp     _initial_num_blocks_is_1\@
2465
        # Each case below expands INITIAL_BLOCKS_AVX2 for a fixed block count
        # and then removes those bytes from r13.
2466_initial_num_blocks_is_7\@:
2467        INITIAL_BLOCKS_AVX2  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2468        sub     $16*7, %r13
2469        jmp     _initial_blocks_encrypted\@
2470
2471_initial_num_blocks_is_6\@:
2472        INITIAL_BLOCKS_AVX2  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2473        sub     $16*6, %r13
2474        jmp     _initial_blocks_encrypted\@
2475
2476_initial_num_blocks_is_5\@:
2477        INITIAL_BLOCKS_AVX2  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2478        sub     $16*5, %r13
2479        jmp     _initial_blocks_encrypted\@
2480
2481_initial_num_blocks_is_4\@:
2482        INITIAL_BLOCKS_AVX2  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2483        sub     $16*4, %r13
2484        jmp     _initial_blocks_encrypted\@
2485
2486_initial_num_blocks_is_3\@:
2487        INITIAL_BLOCKS_AVX2  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2488        sub     $16*3, %r13
2489        jmp     _initial_blocks_encrypted\@
2490
2491_initial_num_blocks_is_2\@:
2492        INITIAL_BLOCKS_AVX2  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2493        sub     $16*2, %r13
2494        jmp     _initial_blocks_encrypted\@
2495
2496_initial_num_blocks_is_1\@:
2497        INITIAL_BLOCKS_AVX2  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2498        sub     $16*1, %r13
2499        jmp     _initial_blocks_encrypted\@
2500
2501_initial_num_blocks_is_0\@:
2502        INITIAL_BLOCKS_AVX2  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2503
2504
2505_initial_blocks_encrypted\@:
2506        cmp     $0, %r13
2507        je      _zero_cipher_left\@                  # no whole blocks remain
2508
        # NOTE(review): 128 bytes are discounted before the loop and
        # GHASH_LAST_8_AVX2 runs after it, so presumably INITIAL_BLOCKS_AVX2
        # has already produced eight blocks whose GHASH is still pending -
        # confirm against that macro.
2509        sub     $128, %r13
2510        je      _eight_cipher_left\@                 # nothing beyond those 128 bytes: skip the loop
2511
2512
2513
2514
2515        vmovd   %xmm9, %r15d
2516        and     $255, %r15d                          # r15d = low byte of the low dword of xmm9
2517        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # 16-byte swap of the counter block
2518
2519
        # Main loop: 128 bytes per iteration.  When r15d is at most 255-8 the
        # out_order variant is expanded directly; otherwise xmm9 is swapped
        # back first and the in_order variant is used.  Presumably this is
        # about a carry out of the low counter byte - both variants live in
        # GHASH_8_ENCRYPT_8_PARALLEL_AVX2, outside this view.
2520_encrypt_by_8_new\@:
2521        cmp     $(255-8), %r15d
2522        jg      _encrypt_by_8\@
2523
2524
2525
2526        add     $8, %r15b                            # byte add: wraps modulo 256
2527        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
2528        add     $128, %r11                           # advance the buffer offset
2529        sub     $128, %r13                           # loop ends when r13 reaches 0
2530        jne     _encrypt_by_8_new\@
2531
2532        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # undo the swap done before the loop
2533        jmp     _eight_cipher_left\@
2534
2535_encrypt_by_8\@:
2536        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2537        add     $8, %r15b
2538        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
2539        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2540        add     $128, %r11
2541        sub     $128, %r13
2542        jne     _encrypt_by_8_new\@
2543
2544        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2545
2546
2547
2548
2549_eight_cipher_left\@:
2550        GHASH_LAST_8_AVX2    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
2551
2552
2553_zero_cipher_left\@:
2554        cmp     $16, arg4
2555        jl      _only_less_than_16\@                 # signed compare on the byte length
2556
2557        mov     arg4, %r13
2558        and     $15, %r13                            # r13 = (arg4 mod 16)
2559
2560        je      _multiple_of_16_bytes\@
2561
2562        # handle the last <16 Byte block separately
2563
2564
2565        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
2566        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2567        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
2568
        # r11 = r11 - 16 + r13, so the 16-byte load below ends r13 bytes past
        # the previous offset.  The wanted r13 bytes therefore sit at the top
        # of xmm1 and are moved down by the shuffle that follows.
2569        sub     $16, %r11
2570        add     %r13, %r11
2571        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
2572
2573        lea     SHIFT_MASK+16(%rip), %r12
2574        sub     %r13, %r12                           # adjust the shuffle mask pointer
2575						     # to be able to shift 16-r13 bytes
2576						     # (r13 is the number of bytes in plaintext mod 16)
2577        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
2578        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
2579        jmp     _final_ghash_mul\@
2580
2581_only_less_than_16\@:
2582        # check for 0 length
2583        mov     arg4, %r13
2584        and     $15, %r13                            # r13 = (arg4 mod 16)
2585
2586        je      _multiple_of_16_bytes\@
2587
2588        # handle the last <16 Byte block separately
2589
2590
2591        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
2592        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2593        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
2594
2595
2596        lea     SHIFT_MASK+16(%rip), %r12
2597        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
2598						     # able to shift 16-r13 bytes (r13 is the
2599						     # number of bytes in plaintext mod 16)
2600
        # Copy the input one byte at a time into the TMP1 stack slot, so the
        # 16-byte load below reads the stack rather than the input buffer.
        # NOTE(review): the loop exits only when r11 == r13, so it relies on
        # r11 being below r13 (presumably 0) on entry - confirm against
        # INITIAL_BLOCKS_AVX2.  Bytes of TMP1 past r13 are not written here;
        # they are removed by the vpand with the ALL_F mask further down.
2601_get_last_16_byte_loop\@:
2602        movb    (arg3, %r11),  %al
2603        movb    %al,  TMP1 (%rsp , %r11)
2604        add     $1, %r11
2605        cmp     %r13,  %r11
2606        jne     _get_last_16_byte_loop\@
2607
2608        vmovdqu  TMP1(%rsp), %xmm1
2609
2610        sub     $16, %r11                            # r11 = r13 - 16; the sub/add pair below brings it to 0
2611
        # xmm1 = last partial input block in its low r13 bytes,
        # xmm9 = E(K, Yn), r12 = mask pointer.  For DEC the GHASH input is the
        # masked copy of the input block (xmm2); for ENC it is the masked
        # result (xmm9).
2612_final_ghash_mul\@:
2613        .if  \ENC_DEC ==  DEC
2614        vmovdqa %xmm1, %xmm2
2615        vpxor   %xmm1, %xmm9, %xmm9                  # Ciphertext XOR E(K, Yn)
2616        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2617        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
2618        vpand   %xmm1, %xmm2, %xmm2
2619        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
2620        vpxor   %xmm2, %xmm14, %xmm14
2621	#GHASH computation for the last <16 Byte block
2622        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2623        sub     %r13, %r11
2624        add     $16, %r11                            # r11 = output offset of the partial block
2625        .else
2626        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
2627        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2628        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
2629        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2630        vpxor   %xmm9, %xmm14, %xmm14
2631	#GHASH computation for the last <16 Byte block
2632        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2633        sub     %r13, %r11
2634        add     $16, %r11                            # r11 = output offset of the partial block
2635        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
2636        .endif
2637
2638
2639        #############################
2640        # output r13 Bytes
        # More than 8 bytes: store the low qword at once and continue with
        # the high qword; the remaining 1..8 bytes are stored one at a time.
2641        vmovq   %xmm9, %rax
2642        cmp     $8, %r13
2643        jle     _less_than_8_bytes_left\@
2644
2645        mov     %rax, (arg2 , %r11)
2646        add     $8, %r11
2647        vpsrldq $8, %xmm9, %xmm9
2648        vmovq   %xmm9, %rax
2649        sub     $8, %r13
2650
2651_less_than_8_bytes_left\@:
2652        movb    %al, (arg2 , %r11)
2653        add     $1, %r11
2654        shr     $8, %rax                             # next byte into al
2655        sub     $1, %r13
2656        jne     _less_than_8_bytes_left\@
2657        #############################
2658
        # Fold the length block into the hash, then mask the result with
        # E(K, Y0) to form the tag in xmm9.
2659_multiple_of_16_bytes\@:
2660        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
2661        shl     $3, %r12                             # convert into number of bits
2662        vmovd   %r12d, %xmm15                        # len(A) in xmm15 (low 32 bits of the bit count only)
2663
2664        shl     $3, arg4                             # len(C) in bits  (*8); arg4 is overwritten
2665        vmovq   arg4, %xmm1
2666        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
2667        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
2668
2669        vpxor   %xmm15, %xmm14, %xmm14
2670        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
2671        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14              # perform a 16Byte swap
2672
2673        mov     arg5, %rax                           # rax = address of Y0
2674        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
2675
2676        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
2677
2678        vpxor   %xmm14, %xmm9, %xmm9                 # xmm9 = tag
2679
2680
2681
        # Store the tag: 16 bytes, 12 bytes (8 + 4), or 8 bytes for any other
        # requested length.
2682_return_T\@:
2683        mov     arg8, %r10              # r10 = authTag
2684        mov     arg9, %r11              # r11 = auth_tag_len
2685
2686        cmp     $16, %r11
2687        je      _T_16\@
2688
2689        cmp     $12, %r11
2690        je      _T_12\@
2691
2692_T_8\@:
2693        vmovq   %xmm9, %rax
2694        mov     %rax, (%r10)
2695        jmp     _return_T_done\@
2696_T_12\@:
2697        vmovq   %xmm9, %rax
2698        mov     %rax, (%r10)
2699        vpsrldq $8, %xmm9, %xmm9
2700        vmovd   %xmm9, %eax
2701        mov     %eax, 8(%r10)
2702        jmp     _return_T_done\@
2703
2704_T_16\@:
2705        vmovdqu %xmm9, (%r10)
2706
2707_return_T_done\@:
2708        mov     %r14, %rsp              # drop the aligned scratch area
2709
        # pops mirror the pushes at the top of the macro, in reverse order
2710        pop     %r15
2711        pop     %r14
2712        pop     %r13
2713        pop     %r12
2714.endm
2715
2716
2717#############################################################
2718#void   aesni_gcm_precomp_avx_gen4
2719#        (gcm_data     *my_ctx_data,
2720#        u8     *hash_subkey)# /* H, the Hash sub key input.
2721#				Data starts on a 16-byte boundary. */
#
# Loads the 16-byte hash subkey from arg2, byte-swaps it, computes
# HashKey<<1 mod poly, stores the result at HashKey(arg1) and then expands
# PRECOMPUTE_AVX2 on it (that macro is defined outside this view).
# r12-r15 are saved and restored; xmm0-xmm6 are overwritten.
2722#############################################################
2723ENTRY(aesni_gcm_precomp_avx_gen4)
2724        #the number of pushes must equal STACK_OFFSET
2725        push    %r12
2726        push    %r13
2727        push    %r14
2728        push    %r15
2729
2730        mov     %rsp, %r14                    # remember rsp so it can be restored before the pops
2731
2732
2733
2734        sub     $VARIABLE_OFFSET, %rsp
2735        and     $~63, %rsp                    # align rsp to 64 bytes
2736
2737        vmovdqu  (arg2), %xmm6                # xmm6 = HashKey
2738
2739        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
2740        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        # 128-bit left shift by one, built from 64-bit lane shifts: xmm6 gets
        # each lane shifted left by 1, xmm2 the bit shifted out of each lane.
2741        vmovdqa  %xmm6, %xmm2
2742        vpsllq   $1, %xmm6, %xmm6
2743        vpsrlq   $63, %xmm2, %xmm2
2744        vmovdqa  %xmm2, %xmm1
2745        vpslldq  $8, %xmm2, %xmm2             # carry of the low lane -> bit 0 of the high lane
2746        vpsrldq  $8, %xmm1, %xmm1             # carry of the high lane -> bit 0 of xmm1
2747        vpor     %xmm2, %xmm6, %xmm6          # xmm6 = HashKey << 1 across all 128 bits
2748        #reduction
        # The carry out of bit 127 decides whether POLY is xored in: it is
        # placed in dwords 0 and 3, compared per dword with TWOONE, and the
        # compare result masks POLY.  (TWOONE and POLY are constants defined
        # outside this view.)
2749        vpshufd  $0b00100100, %xmm1, %xmm2    # dwords 0-2 kept, dword 3 = dword 0 of xmm1
2750        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
2751        vpand    POLY(%rip), %xmm2, %xmm2
2752        vpxor    %xmm2, %xmm6, %xmm6          # xmm6 holds the HashKey<<1 mod poly
2753        #######################################################################
2754        vmovdqa  %xmm6, HashKey(arg1)         # store HashKey<<1 mod poly (aligned store: needs a 16-byte aligned address)
2755
2756
2757        PRECOMPUTE_AVX2  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
2758
2759        mov     %r14, %rsp                    # drop the aligned scratch area
2760
        # pops mirror the pushes above, in reverse order
2761        pop     %r15
2762        pop     %r14
2763        pop     %r13
2764        pop     %r12
2765        ret
2766ENDPROC(aesni_gcm_precomp_avx_gen4)
2767
2768
2769###############################################################################
2770#void   aesni_gcm_enc_avx_gen4(
2771#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2772#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
2773#        const   u8 *in, /* Plaintext input */
2774#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
2775#        u8      *iv, /* Pre-counter block j0: 4 byte salt
2776#			(from Security Association) concatenated with 8 byte
2777#			 Initialisation Vector (from IPSec ESP Payload)
2778#			 concatenated with 0x00000001. 16-byte aligned pointer. */
2779#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2780#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2781#        u8      *auth_tag, /* Authenticated Tag output. */
2782#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
2783#				Valid values are 16 (most likely), 12 or 8. */
#
# The whole body is the GCM_ENC_DEC_AVX2 macro expanded in ENC mode; the
# macro saves and restores r12-r15 and rsp itself, so only ret is added.
2784###############################################################################
2785ENTRY(aesni_gcm_enc_avx_gen4)
2786        GCM_ENC_DEC_AVX2     ENC
2787	ret
2788ENDPROC(aesni_gcm_enc_avx_gen4)
2789
2790###############################################################################
2791#void   aesni_gcm_dec_avx_gen4(
2792#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2793#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
2794#        const   u8 *in, /* Ciphertext input */
2795#        u64     plaintext_len, /* Length of data in Bytes for decryption. */
2796#        u8      *iv, /* Pre-counter block j0: 4 byte salt
2797#			(from Security Association) concatenated with 8 byte
2798#			Initialisation Vector (from IPSec ESP Payload)
2799#			concatenated with 0x00000001. 16-byte aligned pointer. */
2800#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2801#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2802#        u8      *auth_tag, /* Authenticated Tag output. */
2803#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
2804#				Valid values are 16 (most likely), 12 or 8. */
#
# The whole body is the GCM_ENC_DEC_AVX2 macro expanded in DEC mode; the
# macro saves and restores r12-r15 and rsp itself, so only ret is added.
# The tag is written to auth_tag; no comparison against an expected tag is
# visible in this function.
2805###############################################################################
2806ENTRY(aesni_gcm_dec_avx_gen4)
2807        GCM_ENC_DEC_AVX2     DEC
2808	ret
2809ENDPROC(aesni_gcm_dec_avx_gen4)
2810
2811#endif /* CONFIG_AS_AVX2 */
2812