1########################################################################
2# Copyright (c) 2013, Intel Corporation
3#
4# This software is available to you under a choice of one of two
5# licenses.  You may choose to be licensed under the terms of the GNU
6# General Public License (GPL) Version 2, available from the file
7# COPYING in the main directory of this source tree, or the
8# OpenIB.org BSD license below:
9#
10# Redistribution and use in source and binary forms, with or without
11# modification, are permitted provided that the following conditions are
12# met:
13#
14# * Redistributions of source code must retain the above copyright
15#   notice, this list of conditions and the following disclaimer.
16#
17# * Redistributions in binary form must reproduce the above copyright
18#   notice, this list of conditions and the following disclaimer in the
19#   documentation and/or other materials provided with the
20#   distribution.
21#
22# * Neither the name of the Intel Corporation nor the names of its
23#   contributors may be used to endorse or promote products derived from
24#   this software without specific prior written permission.
25#
26#
27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
34# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38########################################################################
39##
40## Authors:
41##	Erdinc Ozturk <erdinc.ozturk@intel.com>
42##	Vinodh Gopal <vinodh.gopal@intel.com>
43##	James Guilford <james.guilford@intel.com>
44##	Tim Chen <tim.c.chen@linux.intel.com>
45##
46## References:
47##       This code was derived and highly optimized from the code described in paper:
48##               Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
49##			on Intel Architecture Processors. August, 2010
50##       The details of the implementation is explained in:
51##               Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
52##			on Intel Architecture Processors. October, 2012.
53##
54## Assumptions:
55##
56##
57##
58## iv:
59##       0                   1                   2                   3
60##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62##       |                             Salt  (From the SA)               |
63##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64##       |                     Initialization Vector                     |
65##       |         (This is the sequence number from IPSec header)       |
66##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67##       |                              0x1                              |
68##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
69##
70##
71##
72## AAD:
73##       AAD padded to 128 bits with 0
74##       for example, assume AAD is a u32 vector
75##
76##       if AAD is 8 bytes:
77##       AAD[3] = {A0, A1}#
78##       padded AAD in xmm register = {A1 A0 0 0}
79##
80##       0                   1                   2                   3
81##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83##       |                               SPI (A1)                        |
84##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85##       |                     32-bit Sequence Number (A0)               |
86##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87##       |                              0x0                              |
88##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89##
90##                                       AAD Format with 32-bit Sequence Number
91##
92##       if AAD is 12 bytes:
93##       AAD[3] = {A0, A1, A2}#
94##       padded AAD in xmm register = {A2 A1 A0 0}
95##
96##       0                   1                   2                   3
97##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99##       |                               SPI (A2)                        |
100##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101##       |                 64-bit Extended Sequence Number {A1,A0}       |
102##       |                                                               |
103##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104##       |                              0x0                              |
105##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106##
107##        AAD Format with 64-bit Extended Sequence Number
108##
109##
110## aadLen:
111##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
112##	 The code additionally supports aadLen of length 16 bytes.
113##
114## TLen:
115##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
116##
117## poly = x^128 + x^127 + x^126 + x^121 + 1
118## throughout the code, one tab and two tab indentations are used. one tab is
119## for GHASH part, two tabs is for AES part.
120##
121
122#include <linux/linkage.h>
123#include <asm/inst.h>
124
125# constants in mergeable sections, linker can reorder and merge
126.section	.rodata.cst16.POLY, "aM", @progbits, 16
127.align 16
128POLY:            .octa     0xC2000000000000000000000000000001
129
130.section	.rodata.cst16.POLY2, "aM", @progbits, 16
131.align 16
132POLY2:           .octa     0xC20000000000000000000001C2000000
133
134.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
135.align 16
136TWOONE:          .octa     0x00000001000000000000000000000001
137
138.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
139.align 16
140SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
141
142.section	.rodata.cst16.ONE, "aM", @progbits, 16
143.align 16
144ONE:             .octa     0x00000000000000000000000000000001
145
146.section	.rodata.cst16.ONEf, "aM", @progbits, 16
147.align 16
148ONEf:            .octa     0x01000000000000000000000000000000
149
150# order of these constants should not change.
151# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
152.section	.rodata, "a", @progbits
153.align 16
154SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
155ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
156                 .octa     0x00000000000000000000000000000000
157
158.text
159
160
161##define the fields of the gcm aes context
162#{
163#        u8 expanded_keys[16*11] store expanded keys
164#        u8 shifted_hkey_1[16]   store HashKey <<1 mod poly here
165#        u8 shifted_hkey_2[16]   store HashKey^2 <<1 mod poly here
166#        u8 shifted_hkey_3[16]   store HashKey^3 <<1 mod poly here
167#        u8 shifted_hkey_4[16]   store HashKey^4 <<1 mod poly here
168#        u8 shifted_hkey_5[16]   store HashKey^5 <<1 mod poly here
169#        u8 shifted_hkey_6[16]   store HashKey^6 <<1 mod poly here
170#        u8 shifted_hkey_7[16]   store HashKey^7 <<1 mod poly here
171#        u8 shifted_hkey_8[16]   store HashKey^8 <<1 mod poly here
172#        u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
173#        u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
174#        u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
175#        u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
176#        u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
177#        u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
178#        u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
179#        u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
180#} gcm_ctx#
181
182HashKey        = 16*11   # store HashKey <<1 mod poly here
183HashKey_2      = 16*12   # store HashKey^2 <<1 mod poly here
184HashKey_3      = 16*13   # store HashKey^3 <<1 mod poly here
185HashKey_4      = 16*14   # store HashKey^4 <<1 mod poly here
186HashKey_5      = 16*15   # store HashKey^5 <<1 mod poly here
187HashKey_6      = 16*16   # store HashKey^6 <<1 mod poly here
188HashKey_7      = 16*17   # store HashKey^7 <<1 mod poly here
189HashKey_8      = 16*18   # store HashKey^8 <<1 mod poly here
190HashKey_k      = 16*19   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
191HashKey_2_k    = 16*20   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
192HashKey_3_k    = 16*21   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
193HashKey_4_k    = 16*22   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
194HashKey_5_k    = 16*23   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
195HashKey_6_k    = 16*24   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
196HashKey_7_k    = 16*25   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
197HashKey_8_k    = 16*26   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
198
199#define arg1 %rdi
200#define arg2 %rsi
201#define arg3 %rdx
202#define arg4 %rcx
203#define arg5 %r8
204#define arg6 %r9
205#define arg7 STACK_OFFSET+8*1(%r14)
206#define arg8 STACK_OFFSET+8*2(%r14)
207#define arg9 STACK_OFFSET+8*3(%r14)
208
209i = 0
210j = 0
211
212out_order = 0
213in_order = 1
214DEC = 0
215ENC = 1
216
217.macro define_reg r n
218reg_\r = %xmm\n
219.endm
220
221.macro setreg
222.altmacro
223define_reg i %i
224define_reg j %j
225.noaltmacro
226.endm
227
228# need to push 4 registers into stack to maintain
229STACK_OFFSET = 8*4
230
231TMP1 =   16*0    # Temporary storage for AAD
232TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
233TMP3 =   16*2    # Temporary storage for AES State 3
234TMP4 =   16*3    # Temporary storage for AES State 4
235TMP5 =   16*4    # Temporary storage for AES State 5
236TMP6 =   16*5    # Temporary storage for AES State 6
237TMP7 =   16*6    # Temporary storage for AES State 7
238TMP8 =   16*7    # Temporary storage for AES State 8
239
240VARIABLE_OFFSET = 16*8
241
242################################
243# Utility Macros
244################################
245
246# Encryption of a single block
247.macro ENCRYPT_SINGLE_BLOCK XMM0
248                vpxor    (arg1), \XMM0, \XMM0
249		i = 1
250		setreg
251.rep 9
252                vaesenc  16*i(arg1), \XMM0, \XMM0
253		i = (i+1)
254		setreg
255.endr
256                vaesenclast 16*10(arg1), \XMM0, \XMM0
257.endm
258
259#ifdef CONFIG_AS_AVX
260###############################################################################
261# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
262# Input: A and B (128-bits each, bit-reflected)
263# Output: C = A*B*x mod poly, (i.e. >>1 )
264# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
265# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
266###############################################################################
267.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
268
269        vpshufd         $0b01001110, \GH, \T2
270        vpshufd         $0b01001110, \HK, \T3
271        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
272        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
273
274        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
275        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
276        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
277        vpxor           \GH, \T2,\T2
278        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
279
280        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
281        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
282        vpxor           \T3, \GH, \GH
283        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
284
285        #first phase of the reduction
286        vpslld  $31, \GH, \T2                   # packed right shifting << 31
287        vpslld  $30, \GH, \T3                   # packed right shifting shift << 30
288        vpslld  $25, \GH, \T4                   # packed right shifting shift << 25
289
290        vpxor   \T3, \T2, \T2                   # xor the shifted versions
291        vpxor   \T4, \T2, \T2
292
293        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
294
295        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
296        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
297
298        #second phase of the reduction
299
300        vpsrld  $1,\GH, \T2                     # packed left shifting >> 1
301        vpsrld  $2,\GH, \T3                     # packed left shifting >> 2
302        vpsrld  $7,\GH, \T4                     # packed left shifting >> 7
303        vpxor   \T3, \T2, \T2                   # xor the shifted versions
304        vpxor   \T4, \T2, \T2
305
306        vpxor   \T5, \T2, \T2
307        vpxor   \T2, \GH, \GH
308        vpxor   \T1, \GH, \GH                   # the result is in GH
309
310
311.endm
312
313.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
314
315        # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
316        vmovdqa  \HK, \T5
317
318        vpshufd  $0b01001110, \T5, \T1
319        vpxor    \T5, \T1, \T1
320        vmovdqa  \T1, HashKey_k(arg1)
321
322        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
323        vmovdqa  \T5, HashKey_2(arg1)                    #  [HashKey_2] = HashKey^2<<1 mod poly
324        vpshufd  $0b01001110, \T5, \T1
325        vpxor    \T5, \T1, \T1
326        vmovdqa  \T1, HashKey_2_k(arg1)
327
328        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
329        vmovdqa  \T5, HashKey_3(arg1)
330        vpshufd  $0b01001110, \T5, \T1
331        vpxor    \T5, \T1, \T1
332        vmovdqa  \T1, HashKey_3_k(arg1)
333
334        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
335        vmovdqa  \T5, HashKey_4(arg1)
336        vpshufd  $0b01001110, \T5, \T1
337        vpxor    \T5, \T1, \T1
338        vmovdqa  \T1, HashKey_4_k(arg1)
339
340        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
341        vmovdqa  \T5, HashKey_5(arg1)
342        vpshufd  $0b01001110, \T5, \T1
343        vpxor    \T5, \T1, \T1
344        vmovdqa  \T1, HashKey_5_k(arg1)
345
346        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
347        vmovdqa  \T5, HashKey_6(arg1)
348        vpshufd  $0b01001110, \T5, \T1
349        vpxor    \T5, \T1, \T1
350        vmovdqa  \T1, HashKey_6_k(arg1)
351
352        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
353        vmovdqa  \T5, HashKey_7(arg1)
354        vpshufd  $0b01001110, \T5, \T1
355        vpxor    \T5, \T1, \T1
356        vmovdqa  \T1, HashKey_7_k(arg1)
357
358        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
359        vmovdqa  \T5, HashKey_8(arg1)
360        vpshufd  $0b01001110, \T5, \T1
361        vpxor    \T5, \T1, \T1
362        vmovdqa  \T1, HashKey_8_k(arg1)
363
364.endm
365
366## if a = number of total plaintext bytes
367## b = floor(a/16)
368## num_initial_blocks = b mod 4#
369## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
370## r10, r11, r12, rax are clobbered
371## arg1, arg2, arg3, r14 are used as a pointer only, not modified
372
373.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
374	i = (8-\num_initial_blocks)
375	setreg
376
377        mov     arg6, %r10                      # r10 = AAD
378        mov     arg7, %r12                      # r12 = aadLen
379
380
381        mov     %r12, %r11
382
383        vpxor   reg_i, reg_i, reg_i
384_get_AAD_loop\@:
385        vmovd   (%r10), \T1
386        vpslldq $12, \T1, \T1
387        vpsrldq $4, reg_i, reg_i
388        vpxor   \T1, reg_i, reg_i
389
390        add     $4, %r10
391        sub     $4, %r12
392        jg      _get_AAD_loop\@
393
394
395        cmp     $16, %r11
396        je      _get_AAD_loop2_done\@
397        mov     $16, %r12
398
399_get_AAD_loop2\@:
400        vpsrldq $4, reg_i, reg_i
401        sub     $4, %r12
402        cmp     %r11, %r12
403        jg      _get_AAD_loop2\@
404
405_get_AAD_loop2_done\@:
406
407        #byte-reflect the AAD data
408        vpshufb SHUF_MASK(%rip), reg_i, reg_i
409
410	# initialize the data pointer offset as zero
411	xor     %r11, %r11
412
413	# start AES for num_initial_blocks blocks
414	mov     arg5, %rax                     # rax = *Y0
415	vmovdqu (%rax), \CTR                   # CTR = Y0
416	vpshufb SHUF_MASK(%rip), \CTR, \CTR
417
418
419	i = (9-\num_initial_blocks)
420	setreg
421.rep \num_initial_blocks
422                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
423                vmovdqa \CTR, reg_i
424                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
425	i = (i+1)
426	setreg
427.endr
428
429	vmovdqa  (arg1), \T_key
430	i = (9-\num_initial_blocks)
431	setreg
432.rep \num_initial_blocks
433                vpxor   \T_key, reg_i, reg_i
434	i = (i+1)
435	setreg
436.endr
437
438	j = 1
439	setreg
440.rep 9
441	vmovdqa  16*j(arg1), \T_key
442	i = (9-\num_initial_blocks)
443	setreg
444.rep \num_initial_blocks
445        vaesenc \T_key, reg_i, reg_i
446	i = (i+1)
447	setreg
448.endr
449
450	j = (j+1)
451	setreg
452.endr
453
454
455	vmovdqa  16*10(arg1), \T_key
456	i = (9-\num_initial_blocks)
457	setreg
458.rep \num_initial_blocks
459        vaesenclast      \T_key, reg_i, reg_i
460	i = (i+1)
461	setreg
462.endr
463
464	i = (9-\num_initial_blocks)
465	setreg
466.rep \num_initial_blocks
467                vmovdqu (arg3, %r11), \T1
468                vpxor   \T1, reg_i, reg_i
469                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for num_initial_blocks blocks
470                add     $16, %r11
471.if  \ENC_DEC == DEC
472                vmovdqa \T1, reg_i
473.endif
474                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
475	i = (i+1)
476	setreg
477.endr
478
479
480	i = (8-\num_initial_blocks)
481	j = (9-\num_initial_blocks)
482	setreg
483        GHASH_MUL_AVX       reg_i, \T2, \T1, \T3, \T4, \T5, \T6
484
485.rep \num_initial_blocks
486        vpxor    reg_i, reg_j, reg_j
487        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
488	i = (i+1)
489	j = (j+1)
490	setreg
491.endr
492        # XMM8 has the combined result here
493
494        vmovdqa  \XMM8, TMP1(%rsp)
495        vmovdqa  \XMM8, \T3
496
497        cmp     $128, %r13
498        jl      _initial_blocks_done\@                  # no need for precomputed constants
499
500###############################################################################
501# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
502                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
503                vmovdqa  \CTR, \XMM1
504                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
505
506                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
507                vmovdqa  \CTR, \XMM2
508                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
509
510                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
511                vmovdqa  \CTR, \XMM3
512                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
513
514                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
515                vmovdqa  \CTR, \XMM4
516                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
517
518                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
519                vmovdqa  \CTR, \XMM5
520                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
521
522                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
523                vmovdqa  \CTR, \XMM6
524                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
525
526                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
527                vmovdqa  \CTR, \XMM7
528                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
529
530                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
531                vmovdqa  \CTR, \XMM8
532                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
533
534                vmovdqa  (arg1), \T_key
535                vpxor    \T_key, \XMM1, \XMM1
536                vpxor    \T_key, \XMM2, \XMM2
537                vpxor    \T_key, \XMM3, \XMM3
538                vpxor    \T_key, \XMM4, \XMM4
539                vpxor    \T_key, \XMM5, \XMM5
540                vpxor    \T_key, \XMM6, \XMM6
541                vpxor    \T_key, \XMM7, \XMM7
542                vpxor    \T_key, \XMM8, \XMM8
543
544		i = 1
545		setreg
546.rep    9       # do 9 rounds
547                vmovdqa  16*i(arg1), \T_key
548                vaesenc  \T_key, \XMM1, \XMM1
549                vaesenc  \T_key, \XMM2, \XMM2
550                vaesenc  \T_key, \XMM3, \XMM3
551                vaesenc  \T_key, \XMM4, \XMM4
552                vaesenc  \T_key, \XMM5, \XMM5
553                vaesenc  \T_key, \XMM6, \XMM6
554                vaesenc  \T_key, \XMM7, \XMM7
555                vaesenc  \T_key, \XMM8, \XMM8
556		i = (i+1)
557		setreg
558.endr
559
560
561                vmovdqa  16*i(arg1), \T_key
562                vaesenclast  \T_key, \XMM1, \XMM1
563                vaesenclast  \T_key, \XMM2, \XMM2
564                vaesenclast  \T_key, \XMM3, \XMM3
565                vaesenclast  \T_key, \XMM4, \XMM4
566                vaesenclast  \T_key, \XMM5, \XMM5
567                vaesenclast  \T_key, \XMM6, \XMM6
568                vaesenclast  \T_key, \XMM7, \XMM7
569                vaesenclast  \T_key, \XMM8, \XMM8
570
571                vmovdqu  (arg3, %r11), \T1
572                vpxor    \T1, \XMM1, \XMM1
573                vmovdqu  \XMM1, (arg2 , %r11)
574                .if   \ENC_DEC == DEC
575                vmovdqa  \T1, \XMM1
576                .endif
577
578                vmovdqu  16*1(arg3, %r11), \T1
579                vpxor    \T1, \XMM2, \XMM2
580                vmovdqu  \XMM2, 16*1(arg2 , %r11)
581                .if   \ENC_DEC == DEC
582                vmovdqa  \T1, \XMM2
583                .endif
584
585                vmovdqu  16*2(arg3, %r11), \T1
586                vpxor    \T1, \XMM3, \XMM3
587                vmovdqu  \XMM3, 16*2(arg2 , %r11)
588                .if   \ENC_DEC == DEC
589                vmovdqa  \T1, \XMM3
590                .endif
591
592                vmovdqu  16*3(arg3, %r11), \T1
593                vpxor    \T1, \XMM4, \XMM4
594                vmovdqu  \XMM4, 16*3(arg2 , %r11)
595                .if   \ENC_DEC == DEC
596                vmovdqa  \T1, \XMM4
597                .endif
598
599                vmovdqu  16*4(arg3, %r11), \T1
600                vpxor    \T1, \XMM5, \XMM5
601                vmovdqu  \XMM5, 16*4(arg2 , %r11)
602                .if   \ENC_DEC == DEC
603                vmovdqa  \T1, \XMM5
604                .endif
605
606                vmovdqu  16*5(arg3, %r11), \T1
607                vpxor    \T1, \XMM6, \XMM6
608                vmovdqu  \XMM6, 16*5(arg2 , %r11)
609                .if   \ENC_DEC == DEC
610                vmovdqa  \T1, \XMM6
611                .endif
612
613                vmovdqu  16*6(arg3, %r11), \T1
614                vpxor    \T1, \XMM7, \XMM7
615                vmovdqu  \XMM7, 16*6(arg2 , %r11)
616                .if   \ENC_DEC == DEC
617                vmovdqa  \T1, \XMM7
618                .endif
619
620                vmovdqu  16*7(arg3, %r11), \T1
621                vpxor    \T1, \XMM8, \XMM8
622                vmovdqu  \XMM8, 16*7(arg2 , %r11)
623                .if   \ENC_DEC == DEC
624                vmovdqa  \T1, \XMM8
625                .endif
626
627                add     $128, %r11
628
629                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
630                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
631                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
632                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
633                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
634                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
635                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
636                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
637                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
638
639###############################################################################
640
641_initial_blocks_done\@:
642
643.endm
644
645# encrypt 8 blocks at a time
646# ghash the 8 previously encrypted ciphertext blocks
647# arg1, arg2, arg3 are used as pointers only, not modified
648# r11 is the data offset value
649.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
650
651        vmovdqa \XMM1, \T2
652        vmovdqa \XMM2, TMP2(%rsp)
653        vmovdqa \XMM3, TMP3(%rsp)
654        vmovdqa \XMM4, TMP4(%rsp)
655        vmovdqa \XMM5, TMP5(%rsp)
656        vmovdqa \XMM6, TMP6(%rsp)
657        vmovdqa \XMM7, TMP7(%rsp)
658        vmovdqa \XMM8, TMP8(%rsp)
659
660.if \loop_idx == in_order
661                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
662                vpaddd  ONE(%rip), \XMM1, \XMM2
663                vpaddd  ONE(%rip), \XMM2, \XMM3
664                vpaddd  ONE(%rip), \XMM3, \XMM4
665                vpaddd  ONE(%rip), \XMM4, \XMM5
666                vpaddd  ONE(%rip), \XMM5, \XMM6
667                vpaddd  ONE(%rip), \XMM6, \XMM7
668                vpaddd  ONE(%rip), \XMM7, \XMM8
669                vmovdqa \XMM8, \CTR
670
671                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
672                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
673                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
674                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
675                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
676                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
677                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
678                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
679.else
680                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
681                vpaddd  ONEf(%rip), \XMM1, \XMM2
682                vpaddd  ONEf(%rip), \XMM2, \XMM3
683                vpaddd  ONEf(%rip), \XMM3, \XMM4
684                vpaddd  ONEf(%rip), \XMM4, \XMM5
685                vpaddd  ONEf(%rip), \XMM5, \XMM6
686                vpaddd  ONEf(%rip), \XMM6, \XMM7
687                vpaddd  ONEf(%rip), \XMM7, \XMM8
688                vmovdqa \XMM8, \CTR
689.endif
690
691
692        #######################################################################
693
694                vmovdqu (arg1), \T1
695                vpxor   \T1, \XMM1, \XMM1
696                vpxor   \T1, \XMM2, \XMM2
697                vpxor   \T1, \XMM3, \XMM3
698                vpxor   \T1, \XMM4, \XMM4
699                vpxor   \T1, \XMM5, \XMM5
700                vpxor   \T1, \XMM6, \XMM6
701                vpxor   \T1, \XMM7, \XMM7
702                vpxor   \T1, \XMM8, \XMM8
703
704        #######################################################################
705
706
707
708
709
710                vmovdqu 16*1(arg1), \T1
711                vaesenc \T1, \XMM1, \XMM1
712                vaesenc \T1, \XMM2, \XMM2
713                vaesenc \T1, \XMM3, \XMM3
714                vaesenc \T1, \XMM4, \XMM4
715                vaesenc \T1, \XMM5, \XMM5
716                vaesenc \T1, \XMM6, \XMM6
717                vaesenc \T1, \XMM7, \XMM7
718                vaesenc \T1, \XMM8, \XMM8
719
720                vmovdqu 16*2(arg1), \T1
721                vaesenc \T1, \XMM1, \XMM1
722                vaesenc \T1, \XMM2, \XMM2
723                vaesenc \T1, \XMM3, \XMM3
724                vaesenc \T1, \XMM4, \XMM4
725                vaesenc \T1, \XMM5, \XMM5
726                vaesenc \T1, \XMM6, \XMM6
727                vaesenc \T1, \XMM7, \XMM7
728                vaesenc \T1, \XMM8, \XMM8
729
730
731        #######################################################################
732
733        vmovdqa         HashKey_8(arg1), \T5
734        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
735        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
736
737        vpshufd         $0b01001110, \T2, \T6
738        vpxor           \T2, \T6, \T6
739
740        vmovdqa         HashKey_8_k(arg1), \T5
741        vpclmulqdq      $0x00, \T5, \T6, \T6
742
743                vmovdqu 16*3(arg1), \T1
744                vaesenc \T1, \XMM1, \XMM1
745                vaesenc \T1, \XMM2, \XMM2
746                vaesenc \T1, \XMM3, \XMM3
747                vaesenc \T1, \XMM4, \XMM4
748                vaesenc \T1, \XMM5, \XMM5
749                vaesenc \T1, \XMM6, \XMM6
750                vaesenc \T1, \XMM7, \XMM7
751                vaesenc \T1, \XMM8, \XMM8
752
753        vmovdqa         TMP2(%rsp), \T1
754        vmovdqa         HashKey_7(arg1), \T5
755        vpclmulqdq      $0x11, \T5, \T1, \T3
756        vpxor           \T3, \T4, \T4
757        vpclmulqdq      $0x00, \T5, \T1, \T3
758        vpxor           \T3, \T7, \T7
759
760        vpshufd         $0b01001110, \T1, \T3
761        vpxor           \T1, \T3, \T3
762        vmovdqa         HashKey_7_k(arg1), \T5
763        vpclmulqdq      $0x10, \T5, \T3, \T3
764        vpxor           \T3, \T6, \T6
765
766                vmovdqu 16*4(arg1), \T1
767                vaesenc \T1, \XMM1, \XMM1
768                vaesenc \T1, \XMM2, \XMM2
769                vaesenc \T1, \XMM3, \XMM3
770                vaesenc \T1, \XMM4, \XMM4
771                vaesenc \T1, \XMM5, \XMM5
772                vaesenc \T1, \XMM6, \XMM6
773                vaesenc \T1, \XMM7, \XMM7
774                vaesenc \T1, \XMM8, \XMM8
775
776        #######################################################################
777
778        vmovdqa         TMP3(%rsp), \T1
779        vmovdqa         HashKey_6(arg1), \T5
780        vpclmulqdq      $0x11, \T5, \T1, \T3
781        vpxor           \T3, \T4, \T4
782        vpclmulqdq      $0x00, \T5, \T1, \T3
783        vpxor           \T3, \T7, \T7
784
785        vpshufd         $0b01001110, \T1, \T3
786        vpxor           \T1, \T3, \T3
787        vmovdqa         HashKey_6_k(arg1), \T5
788        vpclmulqdq      $0x10, \T5, \T3, \T3
789        vpxor           \T3, \T6, \T6
790
791                vmovdqu 16*5(arg1), \T1
792                vaesenc \T1, \XMM1, \XMM1
793                vaesenc \T1, \XMM2, \XMM2
794                vaesenc \T1, \XMM3, \XMM3
795                vaesenc \T1, \XMM4, \XMM4
796                vaesenc \T1, \XMM5, \XMM5
797                vaesenc \T1, \XMM6, \XMM6
798                vaesenc \T1, \XMM7, \XMM7
799                vaesenc \T1, \XMM8, \XMM8
800
801        vmovdqa         TMP4(%rsp), \T1
802        vmovdqa         HashKey_5(arg1), \T5
803        vpclmulqdq      $0x11, \T5, \T1, \T3
804        vpxor           \T3, \T4, \T4
805        vpclmulqdq      $0x00, \T5, \T1, \T3
806        vpxor           \T3, \T7, \T7
807
808        vpshufd         $0b01001110, \T1, \T3
809        vpxor           \T1, \T3, \T3
810        vmovdqa         HashKey_5_k(arg1), \T5
811        vpclmulqdq      $0x10, \T5, \T3, \T3
812        vpxor           \T3, \T6, \T6
813
814                vmovdqu 16*6(arg1), \T1
815                vaesenc \T1, \XMM1, \XMM1
816                vaesenc \T1, \XMM2, \XMM2
817                vaesenc \T1, \XMM3, \XMM3
818                vaesenc \T1, \XMM4, \XMM4
819                vaesenc \T1, \XMM5, \XMM5
820                vaesenc \T1, \XMM6, \XMM6
821                vaesenc \T1, \XMM7, \XMM7
822                vaesenc \T1, \XMM8, \XMM8
823
824
825        vmovdqa         TMP5(%rsp), \T1
826        vmovdqa         HashKey_4(arg1), \T5
827        vpclmulqdq      $0x11, \T5, \T1, \T3
828        vpxor           \T3, \T4, \T4
829        vpclmulqdq      $0x00, \T5, \T1, \T3
830        vpxor           \T3, \T7, \T7
831
832        vpshufd         $0b01001110, \T1, \T3
833        vpxor           \T1, \T3, \T3
834        vmovdqa         HashKey_4_k(arg1), \T5
835        vpclmulqdq      $0x10, \T5, \T3, \T3
836        vpxor           \T3, \T6, \T6
837
838                vmovdqu 16*7(arg1), \T1
839                vaesenc \T1, \XMM1, \XMM1
840                vaesenc \T1, \XMM2, \XMM2
841                vaesenc \T1, \XMM3, \XMM3
842                vaesenc \T1, \XMM4, \XMM4
843                vaesenc \T1, \XMM5, \XMM5
844                vaesenc \T1, \XMM6, \XMM6
845                vaesenc \T1, \XMM7, \XMM7
846                vaesenc \T1, \XMM8, \XMM8
847
848        vmovdqa         TMP6(%rsp), \T1
849        vmovdqa         HashKey_3(arg1), \T5
850        vpclmulqdq      $0x11, \T5, \T1, \T3
851        vpxor           \T3, \T4, \T4
852        vpclmulqdq      $0x00, \T5, \T1, \T3
853        vpxor           \T3, \T7, \T7
854
855        vpshufd         $0b01001110, \T1, \T3
856        vpxor           \T1, \T3, \T3
857        vmovdqa         HashKey_3_k(arg1), \T5
858        vpclmulqdq      $0x10, \T5, \T3, \T3
859        vpxor           \T3, \T6, \T6
860
861
862                vmovdqu 16*8(arg1), \T1
863                vaesenc \T1, \XMM1, \XMM1
864                vaesenc \T1, \XMM2, \XMM2
865                vaesenc \T1, \XMM3, \XMM3
866                vaesenc \T1, \XMM4, \XMM4
867                vaesenc \T1, \XMM5, \XMM5
868                vaesenc \T1, \XMM6, \XMM6
869                vaesenc \T1, \XMM7, \XMM7
870                vaesenc \T1, \XMM8, \XMM8
871
872        vmovdqa         TMP7(%rsp), \T1
873        vmovdqa         HashKey_2(arg1), \T5
874        vpclmulqdq      $0x11, \T5, \T1, \T3
875        vpxor           \T3, \T4, \T4
876        vpclmulqdq      $0x00, \T5, \T1, \T3
877        vpxor           \T3, \T7, \T7
878
879        vpshufd         $0b01001110, \T1, \T3
880        vpxor           \T1, \T3, \T3
881        vmovdqa         HashKey_2_k(arg1), \T5
882        vpclmulqdq      $0x10, \T5, \T3, \T3
883        vpxor           \T3, \T6, \T6
884
885        #######################################################################
886
887                vmovdqu 16*9(arg1), \T5
888                vaesenc \T5, \XMM1, \XMM1
889                vaesenc \T5, \XMM2, \XMM2
890                vaesenc \T5, \XMM3, \XMM3
891                vaesenc \T5, \XMM4, \XMM4
892                vaesenc \T5, \XMM5, \XMM5
893                vaesenc \T5, \XMM6, \XMM6
894                vaesenc \T5, \XMM7, \XMM7
895                vaesenc \T5, \XMM8, \XMM8
896
897        vmovdqa         TMP8(%rsp), \T1
898        vmovdqa         HashKey(arg1), \T5
899        vpclmulqdq      $0x11, \T5, \T1, \T3
900        vpxor           \T3, \T4, \T4
901        vpclmulqdq      $0x00, \T5, \T1, \T3
902        vpxor           \T3, \T7, \T7
903
904        vpshufd         $0b01001110, \T1, \T3
905        vpxor           \T1, \T3, \T3
906        vmovdqa         HashKey_k(arg1), \T5
907        vpclmulqdq      $0x10, \T5, \T3, \T3
908        vpxor           \T3, \T6, \T6
909
910        vpxor           \T4, \T6, \T6
911        vpxor           \T7, \T6, \T6
912
913                vmovdqu 16*10(arg1), \T5
914
915	i = 0
916	j = 1
917	setreg
918.rep 8
919		vpxor	16*i(arg3, %r11), \T5, \T2
920                .if \ENC_DEC == ENC
921                vaesenclast     \T2, reg_j, reg_j
922                .else
923                vaesenclast     \T2, reg_j, \T3
924                vmovdqu 16*i(arg3, %r11), reg_j
925                vmovdqu \T3, 16*i(arg2, %r11)
926                .endif
927	i = (i+1)
928	j = (j+1)
929	setreg
930.endr
931	#######################################################################
932
933
934	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
935	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
936	vpxor	\T3, \T7, \T7
937	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
938
939
940
941	#######################################################################
942	#first phase of the reduction
943	#######################################################################
944        vpslld  $31, \T7, \T2                           # packed right shifting << 31
945        vpslld  $30, \T7, \T3                           # packed right shifting shift << 30
946        vpslld  $25, \T7, \T4                           # packed right shifting shift << 25
947
948        vpxor   \T3, \T2, \T2                           # xor the shifted versions
949        vpxor   \T4, \T2, \T2
950
951        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
952
953        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
954        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
955	#######################################################################
956                .if \ENC_DEC == ENC
957		vmovdqu	 \XMM1,	16*0(arg2,%r11)		# Write to the Ciphertext buffer
958		vmovdqu	 \XMM2,	16*1(arg2,%r11)		# Write to the Ciphertext buffer
959		vmovdqu	 \XMM3,	16*2(arg2,%r11)		# Write to the Ciphertext buffer
960		vmovdqu	 \XMM4,	16*3(arg2,%r11)		# Write to the Ciphertext buffer
961		vmovdqu	 \XMM5,	16*4(arg2,%r11)		# Write to the Ciphertext buffer
962		vmovdqu	 \XMM6,	16*5(arg2,%r11)		# Write to the Ciphertext buffer
963		vmovdqu	 \XMM7,	16*6(arg2,%r11)		# Write to the Ciphertext buffer
964		vmovdqu	 \XMM8,	16*7(arg2,%r11)		# Write to the Ciphertext buffer
965                .endif
966
967	#######################################################################
968	#second phase of the reduction
969        vpsrld  $1, \T7, \T2                            # packed left shifting >> 1
970        vpsrld  $2, \T7, \T3                            # packed left shifting >> 2
971        vpsrld  $7, \T7, \T4                            # packed left shifting >> 7
972        vpxor   \T3, \T2, \T2                           # xor the shifted versions
973        vpxor   \T4, \T2, \T2
974
975        vpxor   \T1, \T2, \T2
976        vpxor   \T2, \T7, \T7
977        vpxor   \T7, \T6, \T6                           # the result is in T6
978	#######################################################################
979
980		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
981		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
982		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
983		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
984		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
985		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
986		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
987		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
988
989
990	vpxor	\T6, \XMM1, \XMM1
991
992
993
994.endm
995
996
997# GHASH the last 4 ciphertext blocks.
998.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
999
1000        ## Karatsuba Method
1001
1002
1003        vpshufd         $0b01001110, \XMM1, \T2
1004        vpxor           \XMM1, \T2, \T2
1005        vmovdqa         HashKey_8(arg1), \T5
1006        vpclmulqdq      $0x11, \T5, \XMM1, \T6
1007        vpclmulqdq      $0x00, \T5, \XMM1, \T7
1008
1009        vmovdqa         HashKey_8_k(arg1), \T3
1010        vpclmulqdq      $0x00, \T3, \T2, \XMM1
1011
1012        ######################
1013
1014        vpshufd         $0b01001110, \XMM2, \T2
1015        vpxor           \XMM2, \T2, \T2
1016        vmovdqa         HashKey_7(arg1), \T5
1017        vpclmulqdq      $0x11, \T5, \XMM2, \T4
1018        vpxor           \T4, \T6, \T6
1019
1020        vpclmulqdq      $0x00, \T5, \XMM2, \T4
1021        vpxor           \T4, \T7, \T7
1022
1023        vmovdqa         HashKey_7_k(arg1), \T3
1024        vpclmulqdq      $0x00, \T3, \T2, \T2
1025        vpxor           \T2, \XMM1, \XMM1
1026
1027        ######################
1028
1029        vpshufd         $0b01001110, \XMM3, \T2
1030        vpxor           \XMM3, \T2, \T2
1031        vmovdqa         HashKey_6(arg1), \T5
1032        vpclmulqdq      $0x11, \T5, \XMM3, \T4
1033        vpxor           \T4, \T6, \T6
1034
1035        vpclmulqdq      $0x00, \T5, \XMM3, \T4
1036        vpxor           \T4, \T7, \T7
1037
1038        vmovdqa         HashKey_6_k(arg1), \T3
1039        vpclmulqdq      $0x00, \T3, \T2, \T2
1040        vpxor           \T2, \XMM1, \XMM1
1041
1042        ######################
1043
1044        vpshufd         $0b01001110, \XMM4, \T2
1045        vpxor           \XMM4, \T2, \T2
1046        vmovdqa         HashKey_5(arg1), \T5
1047        vpclmulqdq      $0x11, \T5, \XMM4, \T4
1048        vpxor           \T4, \T6, \T6
1049
1050        vpclmulqdq      $0x00, \T5, \XMM4, \T4
1051        vpxor           \T4, \T7, \T7
1052
1053        vmovdqa         HashKey_5_k(arg1), \T3
1054        vpclmulqdq      $0x00, \T3, \T2, \T2
1055        vpxor           \T2, \XMM1, \XMM1
1056
1057        ######################
1058
1059        vpshufd         $0b01001110, \XMM5, \T2
1060        vpxor           \XMM5, \T2, \T2
1061        vmovdqa         HashKey_4(arg1), \T5
1062        vpclmulqdq      $0x11, \T5, \XMM5, \T4
1063        vpxor           \T4, \T6, \T6
1064
1065        vpclmulqdq      $0x00, \T5, \XMM5, \T4
1066        vpxor           \T4, \T7, \T7
1067
1068        vmovdqa         HashKey_4_k(arg1), \T3
1069        vpclmulqdq      $0x00, \T3, \T2, \T2
1070        vpxor           \T2, \XMM1, \XMM1
1071
1072        ######################
1073
1074        vpshufd         $0b01001110, \XMM6, \T2
1075        vpxor           \XMM6, \T2, \T2
1076        vmovdqa         HashKey_3(arg1), \T5
1077        vpclmulqdq      $0x11, \T5, \XMM6, \T4
1078        vpxor           \T4, \T6, \T6
1079
1080        vpclmulqdq      $0x00, \T5, \XMM6, \T4
1081        vpxor           \T4, \T7, \T7
1082
1083        vmovdqa         HashKey_3_k(arg1), \T3
1084        vpclmulqdq      $0x00, \T3, \T2, \T2
1085        vpxor           \T2, \XMM1, \XMM1
1086
1087        ######################
1088
1089        vpshufd         $0b01001110, \XMM7, \T2
1090        vpxor           \XMM7, \T2, \T2
1091        vmovdqa         HashKey_2(arg1), \T5
1092        vpclmulqdq      $0x11, \T5, \XMM7, \T4
1093        vpxor           \T4, \T6, \T6
1094
1095        vpclmulqdq      $0x00, \T5, \XMM7, \T4
1096        vpxor           \T4, \T7, \T7
1097
1098        vmovdqa         HashKey_2_k(arg1), \T3
1099        vpclmulqdq      $0x00, \T3, \T2, \T2
1100        vpxor           \T2, \XMM1, \XMM1
1101
1102        ######################
1103
1104        vpshufd         $0b01001110, \XMM8, \T2
1105        vpxor           \XMM8, \T2, \T2
1106        vmovdqa         HashKey(arg1), \T5
1107        vpclmulqdq      $0x11, \T5, \XMM8, \T4
1108        vpxor           \T4, \T6, \T6
1109
1110        vpclmulqdq      $0x00, \T5, \XMM8, \T4
1111        vpxor           \T4, \T7, \T7
1112
1113        vmovdqa         HashKey_k(arg1), \T3
1114        vpclmulqdq      $0x00, \T3, \T2, \T2
1115
1116        vpxor           \T2, \XMM1, \XMM1
1117        vpxor           \T6, \XMM1, \XMM1
1118        vpxor           \T7, \XMM1, \T2
1119
1120
1121
1122
1123        vpslldq $8, \T2, \T4
1124        vpsrldq $8, \T2, \T2
1125
1126        vpxor   \T4, \T7, \T7
1127        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
1128				# the accumulated carry-less multiplications
1129
1130        #######################################################################
1131        #first phase of the reduction
1132        vpslld  $31, \T7, \T2   # packed right shifting << 31
1133        vpslld  $30, \T7, \T3   # packed right shifting shift << 30
1134        vpslld  $25, \T7, \T4   # packed right shifting shift << 25
1135
1136        vpxor   \T3, \T2, \T2   # xor the shifted versions
1137        vpxor   \T4, \T2, \T2
1138
1139        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
1140
1141        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
1142        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
1143        #######################################################################
1144
1145
1146        #second phase of the reduction
1147        vpsrld  $1, \T7, \T2    # packed left shifting >> 1
1148        vpsrld  $2, \T7, \T3    # packed left shifting >> 2
1149        vpsrld  $7, \T7, \T4    # packed left shifting >> 7
1150        vpxor   \T3, \T2, \T2   # xor the shifted versions
1151        vpxor   \T4, \T2, \T2
1152
1153        vpxor   \T1, \T2, \T2
1154        vpxor   \T2, \T7, \T7
1155        vpxor   \T7, \T6, \T6   # the result is in T6
1156
1157.endm
1158
1159
1160# combined for GCM encrypt and decrypt functions
1161# clobbering all xmm registers
1162# clobbering r10, r11, r12, r13, r14, r15
1163.macro  GCM_ENC_DEC_AVX     ENC_DEC
1164
1165        #the number of pushes must equal STACK_OFFSET
1166        push    %r12
1167        push    %r13
1168        push    %r14
1169        push    %r15
1170
1171        mov     %rsp, %r14
1172
1173
1174
1175
1176        sub     $VARIABLE_OFFSET, %rsp
1177        and     $~63, %rsp                  # align rsp to 64 bytes
1178
1179
1180        vmovdqu  HashKey(arg1), %xmm13      # xmm13 = HashKey
1181
1182        mov     arg4, %r13                  # save the number of bytes of plaintext/ciphertext
1183        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
1184
1185        mov     %r13, %r12
1186        shr     $4, %r12
1187        and     $7, %r12
1188        jz      _initial_num_blocks_is_0\@
1189
1190        cmp     $7, %r12
1191        je      _initial_num_blocks_is_7\@
1192        cmp     $6, %r12
1193        je      _initial_num_blocks_is_6\@
1194        cmp     $5, %r12
1195        je      _initial_num_blocks_is_5\@
1196        cmp     $4, %r12
1197        je      _initial_num_blocks_is_4\@
1198        cmp     $3, %r12
1199        je      _initial_num_blocks_is_3\@
1200        cmp     $2, %r12
1201        je      _initial_num_blocks_is_2\@
1202
1203        jmp     _initial_num_blocks_is_1\@
1204
1205_initial_num_blocks_is_7\@:
1206        INITIAL_BLOCKS_AVX  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1207        sub     $16*7, %r13
1208        jmp     _initial_blocks_encrypted\@
1209
1210_initial_num_blocks_is_6\@:
1211        INITIAL_BLOCKS_AVX  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1212        sub     $16*6, %r13
1213        jmp     _initial_blocks_encrypted\@
1214
1215_initial_num_blocks_is_5\@:
1216        INITIAL_BLOCKS_AVX  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1217        sub     $16*5, %r13
1218        jmp     _initial_blocks_encrypted\@
1219
1220_initial_num_blocks_is_4\@:
1221        INITIAL_BLOCKS_AVX  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1222        sub     $16*4, %r13
1223        jmp     _initial_blocks_encrypted\@
1224
1225_initial_num_blocks_is_3\@:
1226        INITIAL_BLOCKS_AVX  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1227        sub     $16*3, %r13
1228        jmp     _initial_blocks_encrypted\@
1229
1230_initial_num_blocks_is_2\@:
1231        INITIAL_BLOCKS_AVX  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1232        sub     $16*2, %r13
1233        jmp     _initial_blocks_encrypted\@
1234
1235_initial_num_blocks_is_1\@:
1236        INITIAL_BLOCKS_AVX  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1237        sub     $16*1, %r13
1238        jmp     _initial_blocks_encrypted\@
1239
1240_initial_num_blocks_is_0\@:
1241        INITIAL_BLOCKS_AVX  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1242
1243
1244_initial_blocks_encrypted\@:
1245        cmp     $0, %r13
1246        je      _zero_cipher_left\@
1247
1248        sub     $128, %r13
1249        je      _eight_cipher_left\@
1250
1251
1252
1253
1254        vmovd   %xmm9, %r15d
1255        and     $255, %r15d
1256        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1257
1258
1259_encrypt_by_8_new\@:
1260        cmp     $(255-8), %r15d
1261        jg      _encrypt_by_8\@
1262
1263
1264
1265        add     $8, %r15b
1266        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
1267        add     $128, %r11
1268        sub     $128, %r13
1269        jne     _encrypt_by_8_new\@
1270
1271        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1272        jmp     _eight_cipher_left\@
1273
1274_encrypt_by_8\@:
1275        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1276        add     $8, %r15b
1277        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
1278        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1279        add     $128, %r11
1280        sub     $128, %r13
1281        jne     _encrypt_by_8_new\@
1282
1283        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1284
1285
1286
1287
1288_eight_cipher_left\@:
1289        GHASH_LAST_8_AVX    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
1290
1291
1292_zero_cipher_left\@:
1293        cmp     $16, arg4
1294        jl      _only_less_than_16\@
1295
1296        mov     arg4, %r13
1297        and     $15, %r13                            # r13 = (arg4 mod 16)
1298
1299        je      _multiple_of_16_bytes\@
1300
1301        # handle the last <16 Byte block seperately
1302
1303
1304        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
1305        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1306        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
1307
1308        sub     $16, %r11
1309        add     %r13, %r11
1310        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
1311
1312        lea     SHIFT_MASK+16(%rip), %r12
1313        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
1314						     # able to shift 16-r13 bytes (r13 is the
1315						     # number of bytes in plaintext mod 16)
1316        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
1317        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
1318        jmp     _final_ghash_mul\@
1319
1320_only_less_than_16\@:
1321        # check for 0 length
1322        mov     arg4, %r13
1323        and     $15, %r13                            # r13 = (arg4 mod 16)
1324
1325        je      _multiple_of_16_bytes\@
1326
1327        # handle the last <16 Byte block seperately
1328
1329
1330        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
1331        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1332        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
1333
1334
1335        lea     SHIFT_MASK+16(%rip), %r12
1336        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
1337						     # able to shift 16-r13 bytes (r13 is the
1338						     # number of bytes in plaintext mod 16)
1339
1340_get_last_16_byte_loop\@:
1341        movb    (arg3, %r11),  %al
1342        movb    %al,  TMP1 (%rsp , %r11)
1343        add     $1, %r11
1344        cmp     %r13,  %r11
1345        jne     _get_last_16_byte_loop\@
1346
1347        vmovdqu  TMP1(%rsp), %xmm1
1348
1349        sub     $16, %r11
1350
1351_final_ghash_mul\@:
1352        .if  \ENC_DEC ==  DEC
1353        vmovdqa %xmm1, %xmm2
1354        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
1355        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
1356						     # mask out top 16-r13 bytes of xmm9
1357        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
1358        vpand   %xmm1, %xmm2, %xmm2
1359        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
1360        vpxor   %xmm2, %xmm14, %xmm14
1361	#GHASH computation for the last <16 Byte block
1362        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1363        sub     %r13, %r11
1364        add     $16, %r11
1365        .else
1366        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
1367        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
1368						     # mask out top 16-r13 bytes of xmm9
1369        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
1370        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1371        vpxor   %xmm9, %xmm14, %xmm14
1372	#GHASH computation for the last <16 Byte block
1373        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1374        sub     %r13, %r11
1375        add     $16, %r11
1376        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
1377        .endif
1378
1379
1380        #############################
1381        # output r13 Bytes
1382        vmovq   %xmm9, %rax
1383        cmp     $8, %r13
1384        jle     _less_than_8_bytes_left\@
1385
1386        mov     %rax, (arg2 , %r11)
1387        add     $8, %r11
1388        vpsrldq $8, %xmm9, %xmm9
1389        vmovq   %xmm9, %rax
1390        sub     $8, %r13
1391
1392_less_than_8_bytes_left\@:
1393        movb    %al, (arg2 , %r11)
1394        add     $1, %r11
1395        shr     $8, %rax
1396        sub     $1, %r13
1397        jne     _less_than_8_bytes_left\@
1398        #############################
1399
1400_multiple_of_16_bytes\@:
1401        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
1402        shl     $3, %r12                             # convert into number of bits
1403        vmovd   %r12d, %xmm15                        # len(A) in xmm15
1404
1405        shl     $3, arg4                             # len(C) in bits  (*128)
1406        vmovq   arg4, %xmm1
1407        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
1408        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
1409
1410        vpxor   %xmm15, %xmm14, %xmm14
1411        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
1412        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
1413
1414        mov     arg5, %rax                           # rax = *Y0
1415        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
1416
1417        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
1418
1419        vpxor   %xmm14, %xmm9, %xmm9
1420
1421
1422
1423_return_T\@:
1424        mov     arg8, %r10              # r10 = authTag
1425        mov     arg9, %r11              # r11 = auth_tag_len
1426
1427        cmp     $16, %r11
1428        je      _T_16\@
1429
1430        cmp     $12, %r11
1431        je      _T_12\@
1432
1433_T_8\@:
1434        vmovq   %xmm9, %rax
1435        mov     %rax, (%r10)
1436        jmp     _return_T_done\@
1437_T_12\@:
1438        vmovq   %xmm9, %rax
1439        mov     %rax, (%r10)
1440        vpsrldq $8, %xmm9, %xmm9
1441        vmovd   %xmm9, %eax
1442        mov     %eax, 8(%r10)
1443        jmp     _return_T_done\@
1444
1445_T_16\@:
1446        vmovdqu %xmm9, (%r10)
1447
1448_return_T_done\@:
1449        mov     %r14, %rsp
1450
1451        pop     %r15
1452        pop     %r14
1453        pop     %r13
1454        pop     %r12
1455.endm
1456
1457
1458#############################################################
1459#void   aesni_gcm_precomp_avx_gen2
1460#        (gcm_data     *my_ctx_data,
1461#        u8     *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1462#############################################################
1463ENTRY(aesni_gcm_precomp_avx_gen2)
1464        #the number of pushes must equal STACK_OFFSET
1465        push    %r12
1466        push    %r13
1467        push    %r14
1468        push    %r15
1469
1470        mov     %rsp, %r14
1471
1472
1473
1474        sub     $VARIABLE_OFFSET, %rsp
1475        and     $~63, %rsp                  # align rsp to 64 bytes
1476
1477        vmovdqu  (arg2), %xmm6              # xmm6 = HashKey
1478
1479        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
1480        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
1481        vmovdqa  %xmm6, %xmm2
1482        vpsllq   $1, %xmm6, %xmm6
1483        vpsrlq   $63, %xmm2, %xmm2
1484        vmovdqa  %xmm2, %xmm1
1485        vpslldq  $8, %xmm2, %xmm2
1486        vpsrldq  $8, %xmm1, %xmm1
1487        vpor     %xmm2, %xmm6, %xmm6
1488        #reduction
1489        vpshufd  $0b00100100, %xmm1, %xmm2
1490        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1491        vpand    POLY(%rip), %xmm2, %xmm2
1492        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
1493        #######################################################################
1494        vmovdqa  %xmm6, HashKey(arg1)       # store HashKey<<1 mod poly
1495
1496
1497        PRECOMPUTE_AVX  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1498
1499        mov     %r14, %rsp
1500
1501        pop     %r15
1502        pop     %r14
1503        pop     %r13
1504        pop     %r12
1505        ret
1506ENDPROC(aesni_gcm_precomp_avx_gen2)
1507
1508###############################################################################
1509#void   aesni_gcm_enc_avx_gen2(
1510#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1511#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
1512#        const   u8 *in, /* Plaintext input */
1513#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
1514#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1515#			(from Security Association) concatenated with 8 byte
1516#			Initialisation Vector (from IPSec ESP Payload)
1517#			concatenated with 0x00000001. 16-byte aligned pointer. */
1518#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1519#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1520#        u8      *auth_tag, /* Authenticated Tag output. */
1521#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
1522#				Valid values are 16 (most likely), 12 or 8. */
1523###############################################################################
1524ENTRY(aesni_gcm_enc_avx_gen2)
1525        GCM_ENC_DEC_AVX     ENC
1526	ret
1527ENDPROC(aesni_gcm_enc_avx_gen2)
1528
1529###############################################################################
1530#void   aesni_gcm_dec_avx_gen2(
1531#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1532#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
1533#        const   u8 *in, /* Ciphertext input */
1534#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
1535#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1536#			(from Security Association) concatenated with 8 byte
1537#			Initialisation Vector (from IPSec ESP Payload)
1538#			concatenated with 0x00000001. 16-byte aligned pointer. */
1539#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1540#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1541#        u8      *auth_tag, /* Authenticated Tag output. */
1542#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
1543#				Valid values are 16 (most likely), 12 or 8. */
1544###############################################################################
1545ENTRY(aesni_gcm_dec_avx_gen2)
1546        GCM_ENC_DEC_AVX     DEC
1547	ret
1548ENDPROC(aesni_gcm_dec_avx_gen2)
1549#endif /* CONFIG_AS_AVX */
1550
1551#ifdef CONFIG_AS_AVX2
1552###############################################################################
1553# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1554# Input: A and B (128-bits each, bit-reflected)
1555# Output: C = A*B*x mod poly, (i.e. >>1 )
1556# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1557# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1558###############################################################################
1559.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1560
1561        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
1562        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
1563        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
1564        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
1565        vpxor           \T3, \GH, \GH
1566
1567
1568        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
1569        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
1570
1571        vpxor           \T3, \T1, \T1
1572        vpxor           \T2, \GH, \GH
1573
1574        #######################################################################
1575        #first phase of the reduction
1576        vmovdqa         POLY2(%rip), \T3
1577
1578        vpclmulqdq      $0x01, \GH, \T3, \T2
1579        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
1580
1581        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
1582        #######################################################################
1583        #second phase of the reduction
1584        vpclmulqdq      $0x00, \GH, \T3, \T2
1585        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1586
1587        vpclmulqdq      $0x10, \GH, \T3, \GH
1588        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1589
1590        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
1591        #######################################################################
1592        vpxor           \T1, \GH, \GH          # the result is in GH
1593
1594
1595.endm
1596
1597.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1598
1599        # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
1600        vmovdqa  \HK, \T5
1601        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
1602        vmovdqa  \T5, HashKey_2(arg1)                       #  [HashKey_2] = HashKey^2<<1 mod poly
1603
1604        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
1605        vmovdqa  \T5, HashKey_3(arg1)
1606
1607        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
1608        vmovdqa  \T5, HashKey_4(arg1)
1609
1610        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
1611        vmovdqa  \T5, HashKey_5(arg1)
1612
1613        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
1614        vmovdqa  \T5, HashKey_6(arg1)
1615
1616        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
1617        vmovdqa  \T5, HashKey_7(arg1)
1618
1619        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
1620        vmovdqa  \T5, HashKey_8(arg1)
1621
1622.endm
1623
1624
1625## if a = number of total plaintext bytes
1626## b = floor(a/16)
1627## num_initial_blocks = b mod 4#
1628## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1629## r10, r11, r12, rax are clobbered
1630## arg1, arg2, arg3, r14 are used as a pointer only, not modified
1631
1632.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1633	i = (8-\num_initial_blocks)
1634	setreg
1635
1636        mov     arg6, %r10                       # r10 = AAD
1637        mov     arg7, %r12                       # r12 = aadLen
1638
1639
1640        mov     %r12, %r11
1641
1642        vpxor   reg_i, reg_i, reg_i
1643_get_AAD_loop\@:
1644        vmovd   (%r10), \T1
1645        vpslldq $12, \T1, \T1
1646        vpsrldq $4, reg_i, reg_i
1647        vpxor   \T1, reg_i, reg_i
1648
1649        add     $4, %r10
1650        sub     $4, %r12
1651        jg      _get_AAD_loop\@
1652
1653
1654        cmp     $16, %r11
1655        je      _get_AAD_loop2_done\@
1656        mov     $16, %r12
1657
1658_get_AAD_loop2\@:
1659        vpsrldq $4, reg_i, reg_i
1660        sub     $4, %r12
1661        cmp     %r11, %r12
1662        jg      _get_AAD_loop2\@
1663
1664_get_AAD_loop2_done\@:
1665
1666        #byte-reflect the AAD data
1667        vpshufb SHUF_MASK(%rip), reg_i, reg_i
1668
1669	# initialize the data pointer offset as zero
1670	xor     %r11, %r11
1671
1672	# start AES for num_initial_blocks blocks
1673	mov     arg5, %rax                     # rax = *Y0
1674	vmovdqu (%rax), \CTR                   # CTR = Y0
1675	vpshufb SHUF_MASK(%rip), \CTR, \CTR
1676
1677
1678	i = (9-\num_initial_blocks)
1679	setreg
1680.rep \num_initial_blocks
1681                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
1682                vmovdqa \CTR, reg_i
1683                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
1684	i = (i+1)
1685	setreg
1686.endr
1687
1688	vmovdqa  (arg1), \T_key
1689	i = (9-\num_initial_blocks)
1690	setreg
1691.rep \num_initial_blocks
1692                vpxor   \T_key, reg_i, reg_i
1693	i = (i+1)
1694	setreg
1695.endr
1696
1697	j = 1
1698	setreg
1699.rep 9
1700	vmovdqa  16*j(arg1), \T_key
1701	i = (9-\num_initial_blocks)
1702	setreg
1703.rep \num_initial_blocks
1704        vaesenc \T_key, reg_i, reg_i
1705	i = (i+1)
1706	setreg
1707.endr
1708
1709	j = (j+1)
1710	setreg
1711.endr
1712
1713
1714	vmovdqa  16*10(arg1), \T_key
1715	i = (9-\num_initial_blocks)
1716	setreg
1717.rep \num_initial_blocks
1718        vaesenclast      \T_key, reg_i, reg_i
1719	i = (i+1)
1720	setreg
1721.endr
1722
1723	i = (9-\num_initial_blocks)
1724	setreg
1725.rep \num_initial_blocks
1726                vmovdqu (arg3, %r11), \T1
1727                vpxor   \T1, reg_i, reg_i
1728                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for
1729						       # num_initial_blocks blocks
1730                add     $16, %r11
1731.if  \ENC_DEC == DEC
1732                vmovdqa \T1, reg_i
1733.endif
1734                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
1735	i = (i+1)
1736	setreg
1737.endr
1738
1739
1740	i = (8-\num_initial_blocks)
1741	j = (9-\num_initial_blocks)
1742	setreg
1743        GHASH_MUL_AVX2       reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1744
1745.rep \num_initial_blocks
1746        vpxor    reg_i, reg_j, reg_j
1747        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
1748	i = (i+1)
1749	j = (j+1)
1750	setreg
1751.endr
1752        # XMM8 has the combined result here
1753
1754        vmovdqa  \XMM8, TMP1(%rsp)
1755        vmovdqa  \XMM8, \T3
1756
1757        cmp     $128, %r13
1758        jl      _initial_blocks_done\@                  # no need for precomputed constants
1759
1760###############################################################################
1761# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
1762                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1763                vmovdqa  \CTR, \XMM1
1764                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
1765
1766                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1767                vmovdqa  \CTR, \XMM2
1768                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
1769
1770                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1771                vmovdqa  \CTR, \XMM3
1772                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
1773
1774                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1775                vmovdqa  \CTR, \XMM4
1776                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
1777
1778                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1779                vmovdqa  \CTR, \XMM5
1780                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
1781
1782                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1783                vmovdqa  \CTR, \XMM6
1784                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
1785
1786                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1787                vmovdqa  \CTR, \XMM7
1788                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
1789
1790                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1791                vmovdqa  \CTR, \XMM8
1792                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
1793
1794                vmovdqa  (arg1), \T_key
1795                vpxor    \T_key, \XMM1, \XMM1
1796                vpxor    \T_key, \XMM2, \XMM2
1797                vpxor    \T_key, \XMM3, \XMM3
1798                vpxor    \T_key, \XMM4, \XMM4
1799                vpxor    \T_key, \XMM5, \XMM5
1800                vpxor    \T_key, \XMM6, \XMM6
1801                vpxor    \T_key, \XMM7, \XMM7
1802                vpxor    \T_key, \XMM8, \XMM8
1803
1804		i = 1
1805		setreg
1806.rep    9       # do 9 rounds
1807                vmovdqa  16*i(arg1), \T_key
1808                vaesenc  \T_key, \XMM1, \XMM1
1809                vaesenc  \T_key, \XMM2, \XMM2
1810                vaesenc  \T_key, \XMM3, \XMM3
1811                vaesenc  \T_key, \XMM4, \XMM4
1812                vaesenc  \T_key, \XMM5, \XMM5
1813                vaesenc  \T_key, \XMM6, \XMM6
1814                vaesenc  \T_key, \XMM7, \XMM7
1815                vaesenc  \T_key, \XMM8, \XMM8
1816		i = (i+1)
1817		setreg
1818.endr
1819
1820
1821                vmovdqa  16*i(arg1), \T_key
1822                vaesenclast  \T_key, \XMM1, \XMM1
1823                vaesenclast  \T_key, \XMM2, \XMM2
1824                vaesenclast  \T_key, \XMM3, \XMM3
1825                vaesenclast  \T_key, \XMM4, \XMM4
1826                vaesenclast  \T_key, \XMM5, \XMM5
1827                vaesenclast  \T_key, \XMM6, \XMM6
1828                vaesenclast  \T_key, \XMM7, \XMM7
1829                vaesenclast  \T_key, \XMM8, \XMM8
1830
1831                vmovdqu  (arg3, %r11), \T1
1832                vpxor    \T1, \XMM1, \XMM1
1833                vmovdqu  \XMM1, (arg2 , %r11)
1834                .if   \ENC_DEC == DEC
1835                vmovdqa  \T1, \XMM1
1836                .endif
1837
1838                vmovdqu  16*1(arg3, %r11), \T1
1839                vpxor    \T1, \XMM2, \XMM2
1840                vmovdqu  \XMM2, 16*1(arg2 , %r11)
1841                .if   \ENC_DEC == DEC
1842                vmovdqa  \T1, \XMM2
1843                .endif
1844
1845                vmovdqu  16*2(arg3, %r11), \T1
1846                vpxor    \T1, \XMM3, \XMM3
1847                vmovdqu  \XMM3, 16*2(arg2 , %r11)
1848                .if   \ENC_DEC == DEC
1849                vmovdqa  \T1, \XMM3
1850                .endif
1851
1852                vmovdqu  16*3(arg3, %r11), \T1
1853                vpxor    \T1, \XMM4, \XMM4
1854                vmovdqu  \XMM4, 16*3(arg2 , %r11)
1855                .if   \ENC_DEC == DEC
1856                vmovdqa  \T1, \XMM4
1857                .endif
1858
1859                vmovdqu  16*4(arg3, %r11), \T1
1860                vpxor    \T1, \XMM5, \XMM5
1861                vmovdqu  \XMM5, 16*4(arg2 , %r11)
1862                .if   \ENC_DEC == DEC
1863                vmovdqa  \T1, \XMM5
1864                .endif
1865
1866                vmovdqu  16*5(arg3, %r11), \T1
1867                vpxor    \T1, \XMM6, \XMM6
1868                vmovdqu  \XMM6, 16*5(arg2 , %r11)
1869                .if   \ENC_DEC == DEC
1870                vmovdqa  \T1, \XMM6
1871                .endif
1872
1873                vmovdqu  16*6(arg3, %r11), \T1
1874                vpxor    \T1, \XMM7, \XMM7
1875                vmovdqu  \XMM7, 16*6(arg2 , %r11)
1876                .if   \ENC_DEC == DEC
1877                vmovdqa  \T1, \XMM7
1878                .endif
1879
1880                vmovdqu  16*7(arg3, %r11), \T1
1881                vpxor    \T1, \XMM8, \XMM8
1882                vmovdqu  \XMM8, 16*7(arg2 , %r11)
1883                .if   \ENC_DEC == DEC
1884                vmovdqa  \T1, \XMM8
1885                .endif
1886
1887                add     $128, %r11
1888
1889                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
1890                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
1891							   # the corresponding ciphertext
1892                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
1893                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
1894                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
1895                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
1896                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
1897                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
1898                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
1899
1900###############################################################################
1901
1902_initial_blocks_done\@:
1903
1904
1905.endm
1906
1907
1908
1909# encrypt 8 blocks at a time
1910# ghash the 8 previously encrypted ciphertext blocks
1911# arg1, arg2, arg3 are used as pointers only, not modified
1912# r11 is the data offset value
1913.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1914
1915        vmovdqa \XMM1, \T2
1916        vmovdqa \XMM2, TMP2(%rsp)
1917        vmovdqa \XMM3, TMP3(%rsp)
1918        vmovdqa \XMM4, TMP4(%rsp)
1919        vmovdqa \XMM5, TMP5(%rsp)
1920        vmovdqa \XMM6, TMP6(%rsp)
1921        vmovdqa \XMM7, TMP7(%rsp)
1922        vmovdqa \XMM8, TMP8(%rsp)
1923
1924.if \loop_idx == in_order
1925                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
1926                vpaddd  ONE(%rip), \XMM1, \XMM2
1927                vpaddd  ONE(%rip), \XMM2, \XMM3
1928                vpaddd  ONE(%rip), \XMM3, \XMM4
1929                vpaddd  ONE(%rip), \XMM4, \XMM5
1930                vpaddd  ONE(%rip), \XMM5, \XMM6
1931                vpaddd  ONE(%rip), \XMM6, \XMM7
1932                vpaddd  ONE(%rip), \XMM7, \XMM8
1933                vmovdqa \XMM8, \CTR
1934
1935                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
1936                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
1937                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
1938                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
1939                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
1940                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
1941                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
1942                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
1943.else
1944                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
1945                vpaddd  ONEf(%rip), \XMM1, \XMM2
1946                vpaddd  ONEf(%rip), \XMM2, \XMM3
1947                vpaddd  ONEf(%rip), \XMM3, \XMM4
1948                vpaddd  ONEf(%rip), \XMM4, \XMM5
1949                vpaddd  ONEf(%rip), \XMM5, \XMM6
1950                vpaddd  ONEf(%rip), \XMM6, \XMM7
1951                vpaddd  ONEf(%rip), \XMM7, \XMM8
1952                vmovdqa \XMM8, \CTR
1953.endif
1954
1955
1956        #######################################################################
1957
1958                vmovdqu (arg1), \T1
1959                vpxor   \T1, \XMM1, \XMM1
1960                vpxor   \T1, \XMM2, \XMM2
1961                vpxor   \T1, \XMM3, \XMM3
1962                vpxor   \T1, \XMM4, \XMM4
1963                vpxor   \T1, \XMM5, \XMM5
1964                vpxor   \T1, \XMM6, \XMM6
1965                vpxor   \T1, \XMM7, \XMM7
1966                vpxor   \T1, \XMM8, \XMM8
1967
1968        #######################################################################
1969
1970
1971
1972
1973
1974                vmovdqu 16*1(arg1), \T1
1975                vaesenc \T1, \XMM1, \XMM1
1976                vaesenc \T1, \XMM2, \XMM2
1977                vaesenc \T1, \XMM3, \XMM3
1978                vaesenc \T1, \XMM4, \XMM4
1979                vaesenc \T1, \XMM5, \XMM5
1980                vaesenc \T1, \XMM6, \XMM6
1981                vaesenc \T1, \XMM7, \XMM7
1982                vaesenc \T1, \XMM8, \XMM8
1983
1984                vmovdqu 16*2(arg1), \T1
1985                vaesenc \T1, \XMM1, \XMM1
1986                vaesenc \T1, \XMM2, \XMM2
1987                vaesenc \T1, \XMM3, \XMM3
1988                vaesenc \T1, \XMM4, \XMM4
1989                vaesenc \T1, \XMM5, \XMM5
1990                vaesenc \T1, \XMM6, \XMM6
1991                vaesenc \T1, \XMM7, \XMM7
1992                vaesenc \T1, \XMM8, \XMM8
1993
1994
1995        #######################################################################
1996
1997        vmovdqa         HashKey_8(arg1), \T5
1998        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
1999        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
2000        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
2001        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
2002        vpxor           \T5, \T6, \T6
2003
2004                vmovdqu 16*3(arg1), \T1
2005                vaesenc \T1, \XMM1, \XMM1
2006                vaesenc \T1, \XMM2, \XMM2
2007                vaesenc \T1, \XMM3, \XMM3
2008                vaesenc \T1, \XMM4, \XMM4
2009                vaesenc \T1, \XMM5, \XMM5
2010                vaesenc \T1, \XMM6, \XMM6
2011                vaesenc \T1, \XMM7, \XMM7
2012                vaesenc \T1, \XMM8, \XMM8
2013
2014        vmovdqa         TMP2(%rsp), \T1
2015        vmovdqa         HashKey_7(arg1), \T5
2016        vpclmulqdq      $0x11, \T5, \T1, \T3
2017        vpxor           \T3, \T4, \T4
2018
2019        vpclmulqdq      $0x00, \T5, \T1, \T3
2020        vpxor           \T3, \T7, \T7
2021
2022        vpclmulqdq      $0x01, \T5, \T1, \T3
2023        vpxor           \T3, \T6, \T6
2024
2025        vpclmulqdq      $0x10, \T5, \T1, \T3
2026        vpxor           \T3, \T6, \T6
2027
2028                vmovdqu 16*4(arg1), \T1
2029                vaesenc \T1, \XMM1, \XMM1
2030                vaesenc \T1, \XMM2, \XMM2
2031                vaesenc \T1, \XMM3, \XMM3
2032                vaesenc \T1, \XMM4, \XMM4
2033                vaesenc \T1, \XMM5, \XMM5
2034                vaesenc \T1, \XMM6, \XMM6
2035                vaesenc \T1, \XMM7, \XMM7
2036                vaesenc \T1, \XMM8, \XMM8
2037
2038        #######################################################################
2039
2040        vmovdqa         TMP3(%rsp), \T1
2041        vmovdqa         HashKey_6(arg1), \T5
2042        vpclmulqdq      $0x11, \T5, \T1, \T3
2043        vpxor           \T3, \T4, \T4
2044
2045        vpclmulqdq      $0x00, \T5, \T1, \T3
2046        vpxor           \T3, \T7, \T7
2047
2048        vpclmulqdq      $0x01, \T5, \T1, \T3
2049        vpxor           \T3, \T6, \T6
2050
2051        vpclmulqdq      $0x10, \T5, \T1, \T3
2052        vpxor           \T3, \T6, \T6
2053
2054                vmovdqu 16*5(arg1), \T1
2055                vaesenc \T1, \XMM1, \XMM1
2056                vaesenc \T1, \XMM2, \XMM2
2057                vaesenc \T1, \XMM3, \XMM3
2058                vaesenc \T1, \XMM4, \XMM4
2059                vaesenc \T1, \XMM5, \XMM5
2060                vaesenc \T1, \XMM6, \XMM6
2061                vaesenc \T1, \XMM7, \XMM7
2062                vaesenc \T1, \XMM8, \XMM8
2063
2064        vmovdqa         TMP4(%rsp), \T1
2065        vmovdqa         HashKey_5(arg1), \T5
2066        vpclmulqdq      $0x11, \T5, \T1, \T3
2067        vpxor           \T3, \T4, \T4
2068
2069        vpclmulqdq      $0x00, \T5, \T1, \T3
2070        vpxor           \T3, \T7, \T7
2071
2072        vpclmulqdq      $0x01, \T5, \T1, \T3
2073        vpxor           \T3, \T6, \T6
2074
2075        vpclmulqdq      $0x10, \T5, \T1, \T3
2076        vpxor           \T3, \T6, \T6
2077
2078                vmovdqu 16*6(arg1), \T1
2079                vaesenc \T1, \XMM1, \XMM1
2080                vaesenc \T1, \XMM2, \XMM2
2081                vaesenc \T1, \XMM3, \XMM3
2082                vaesenc \T1, \XMM4, \XMM4
2083                vaesenc \T1, \XMM5, \XMM5
2084                vaesenc \T1, \XMM6, \XMM6
2085                vaesenc \T1, \XMM7, \XMM7
2086                vaesenc \T1, \XMM8, \XMM8
2087
2088
2089        vmovdqa         TMP5(%rsp), \T1
2090        vmovdqa         HashKey_4(arg1), \T5
2091        vpclmulqdq      $0x11, \T5, \T1, \T3
2092        vpxor           \T3, \T4, \T4
2093
2094        vpclmulqdq      $0x00, \T5, \T1, \T3
2095        vpxor           \T3, \T7, \T7
2096
2097        vpclmulqdq      $0x01, \T5, \T1, \T3
2098        vpxor           \T3, \T6, \T6
2099
2100        vpclmulqdq      $0x10, \T5, \T1, \T3
2101        vpxor           \T3, \T6, \T6
2102
2103                vmovdqu 16*7(arg1), \T1
2104                vaesenc \T1, \XMM1, \XMM1
2105                vaesenc \T1, \XMM2, \XMM2
2106                vaesenc \T1, \XMM3, \XMM3
2107                vaesenc \T1, \XMM4, \XMM4
2108                vaesenc \T1, \XMM5, \XMM5
2109                vaesenc \T1, \XMM6, \XMM6
2110                vaesenc \T1, \XMM7, \XMM7
2111                vaesenc \T1, \XMM8, \XMM8
2112
2113        vmovdqa         TMP6(%rsp), \T1
2114        vmovdqa         HashKey_3(arg1), \T5
2115        vpclmulqdq      $0x11, \T5, \T1, \T3
2116        vpxor           \T3, \T4, \T4
2117
2118        vpclmulqdq      $0x00, \T5, \T1, \T3
2119        vpxor           \T3, \T7, \T7
2120
2121        vpclmulqdq      $0x01, \T5, \T1, \T3
2122        vpxor           \T3, \T6, \T6
2123
2124        vpclmulqdq      $0x10, \T5, \T1, \T3
2125        vpxor           \T3, \T6, \T6
2126
2127                vmovdqu 16*8(arg1), \T1
2128                vaesenc \T1, \XMM1, \XMM1
2129                vaesenc \T1, \XMM2, \XMM2
2130                vaesenc \T1, \XMM3, \XMM3
2131                vaesenc \T1, \XMM4, \XMM4
2132                vaesenc \T1, \XMM5, \XMM5
2133                vaesenc \T1, \XMM6, \XMM6
2134                vaesenc \T1, \XMM7, \XMM7
2135                vaesenc \T1, \XMM8, \XMM8
2136
2137        vmovdqa         TMP7(%rsp), \T1
2138        vmovdqa         HashKey_2(arg1), \T5
2139        vpclmulqdq      $0x11, \T5, \T1, \T3
2140        vpxor           \T3, \T4, \T4
2141
2142        vpclmulqdq      $0x00, \T5, \T1, \T3
2143        vpxor           \T3, \T7, \T7
2144
2145        vpclmulqdq      $0x01, \T5, \T1, \T3
2146        vpxor           \T3, \T6, \T6
2147
2148        vpclmulqdq      $0x10, \T5, \T1, \T3
2149        vpxor           \T3, \T6, \T6
2150
2151
2152        #######################################################################
2153
2154                vmovdqu 16*9(arg1), \T5
2155                vaesenc \T5, \XMM1, \XMM1
2156                vaesenc \T5, \XMM2, \XMM2
2157                vaesenc \T5, \XMM3, \XMM3
2158                vaesenc \T5, \XMM4, \XMM4
2159                vaesenc \T5, \XMM5, \XMM5
2160                vaesenc \T5, \XMM6, \XMM6
2161                vaesenc \T5, \XMM7, \XMM7
2162                vaesenc \T5, \XMM8, \XMM8
2163
2164        vmovdqa         TMP8(%rsp), \T1
2165        vmovdqa         HashKey(arg1), \T5
2166
2167        vpclmulqdq      $0x00, \T5, \T1, \T3
2168        vpxor           \T3, \T7, \T7
2169
2170        vpclmulqdq      $0x01, \T5, \T1, \T3
2171        vpxor           \T3, \T6, \T6
2172
2173        vpclmulqdq      $0x10, \T5, \T1, \T3
2174        vpxor           \T3, \T6, \T6
2175
2176        vpclmulqdq      $0x11, \T5, \T1, \T3
2177        vpxor           \T3, \T4, \T1
2178
2179
2180                vmovdqu 16*10(arg1), \T5
2181
2182	i = 0
2183	j = 1
2184	setreg
2185.rep 8
2186		vpxor	16*i(arg3, %r11), \T5, \T2
2187                .if \ENC_DEC == ENC
2188                vaesenclast     \T2, reg_j, reg_j
2189                .else
2190                vaesenclast     \T2, reg_j, \T3
2191                vmovdqu 16*i(arg3, %r11), reg_j
2192                vmovdqu \T3, 16*i(arg2, %r11)
2193                .endif
2194	i = (i+1)
2195	j = (j+1)
2196	setreg
2197.endr
2198	#######################################################################
2199
2200
2201	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
2202	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
2203	vpxor	\T3, \T7, \T7
2204	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
2205
2206
2207
2208	#######################################################################
2209	#first phase of the reduction
2210	vmovdqa         POLY2(%rip), \T3
2211
2212	vpclmulqdq	$0x01, \T7, \T3, \T2
2213	vpslldq		$8, \T2, \T2			# shift-L xmm2 2 DWs
2214
2215	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
2216	#######################################################################
2217                .if \ENC_DEC == ENC
2218		vmovdqu	 \XMM1,	16*0(arg2,%r11)		# Write to the Ciphertext buffer
2219		vmovdqu	 \XMM2,	16*1(arg2,%r11)		# Write to the Ciphertext buffer
2220		vmovdqu	 \XMM3,	16*2(arg2,%r11)		# Write to the Ciphertext buffer
2221		vmovdqu	 \XMM4,	16*3(arg2,%r11)		# Write to the Ciphertext buffer
2222		vmovdqu	 \XMM5,	16*4(arg2,%r11)		# Write to the Ciphertext buffer
2223		vmovdqu	 \XMM6,	16*5(arg2,%r11)		# Write to the Ciphertext buffer
2224		vmovdqu	 \XMM7,	16*6(arg2,%r11)		# Write to the Ciphertext buffer
2225		vmovdqu	 \XMM8,	16*7(arg2,%r11)		# Write to the Ciphertext buffer
2226                .endif
2227
2228	#######################################################################
2229	#second phase of the reduction
2230	vpclmulqdq	$0x00, \T7, \T3, \T2
2231	vpsrldq		$4, \T2, \T2			# shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2232
2233	vpclmulqdq	$0x10, \T7, \T3, \T4
2234	vpslldq		$4, \T4, \T4			# shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2235
2236	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
2237	#######################################################################
2238	vpxor		\T4, \T1, \T1			# the result is in T1
2239
2240		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
2241		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
2242		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
2243		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
2244		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
2245		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
2246		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
2247		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
2248
2249
2250	vpxor	\T1, \XMM1, \XMM1
2251
2252
2253
2254.endm
2255
2256
2257# GHASH the last 4 ciphertext blocks.
2258.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2259
2260        ## Karatsuba Method
2261
2262        vmovdqa         HashKey_8(arg1), \T5
2263
2264        vpshufd         $0b01001110, \XMM1, \T2
2265        vpshufd         $0b01001110, \T5, \T3
2266        vpxor           \XMM1, \T2, \T2
2267        vpxor           \T5, \T3, \T3
2268
2269        vpclmulqdq      $0x11, \T5, \XMM1, \T6
2270        vpclmulqdq      $0x00, \T5, \XMM1, \T7
2271
2272        vpclmulqdq      $0x00, \T3, \T2, \XMM1
2273
2274        ######################
2275
2276        vmovdqa         HashKey_7(arg1), \T5
2277        vpshufd         $0b01001110, \XMM2, \T2
2278        vpshufd         $0b01001110, \T5, \T3
2279        vpxor           \XMM2, \T2, \T2
2280        vpxor           \T5, \T3, \T3
2281
2282        vpclmulqdq      $0x11, \T5, \XMM2, \T4
2283        vpxor           \T4, \T6, \T6
2284
2285        vpclmulqdq      $0x00, \T5, \XMM2, \T4
2286        vpxor           \T4, \T7, \T7
2287
2288        vpclmulqdq      $0x00, \T3, \T2, \T2
2289
2290        vpxor           \T2, \XMM1, \XMM1
2291
2292        ######################
2293
2294        vmovdqa         HashKey_6(arg1), \T5
2295        vpshufd         $0b01001110, \XMM3, \T2
2296        vpshufd         $0b01001110, \T5, \T3
2297        vpxor           \XMM3, \T2, \T2
2298        vpxor           \T5, \T3, \T3
2299
2300        vpclmulqdq      $0x11, \T5, \XMM3, \T4
2301        vpxor           \T4, \T6, \T6
2302
2303        vpclmulqdq      $0x00, \T5, \XMM3, \T4
2304        vpxor           \T4, \T7, \T7
2305
2306        vpclmulqdq      $0x00, \T3, \T2, \T2
2307
2308        vpxor           \T2, \XMM1, \XMM1
2309
2310        ######################
2311
2312        vmovdqa         HashKey_5(arg1), \T5
2313        vpshufd         $0b01001110, \XMM4, \T2
2314        vpshufd         $0b01001110, \T5, \T3
2315        vpxor           \XMM4, \T2, \T2
2316        vpxor           \T5, \T3, \T3
2317
2318        vpclmulqdq      $0x11, \T5, \XMM4, \T4
2319        vpxor           \T4, \T6, \T6
2320
2321        vpclmulqdq      $0x00, \T5, \XMM4, \T4
2322        vpxor           \T4, \T7, \T7
2323
2324        vpclmulqdq      $0x00, \T3, \T2, \T2
2325
2326        vpxor           \T2, \XMM1, \XMM1
2327
2328        ######################
2329
2330        vmovdqa         HashKey_4(arg1), \T5
2331        vpshufd         $0b01001110, \XMM5, \T2
2332        vpshufd         $0b01001110, \T5, \T3
2333        vpxor           \XMM5, \T2, \T2
2334        vpxor           \T5, \T3, \T3
2335
2336        vpclmulqdq      $0x11, \T5, \XMM5, \T4
2337        vpxor           \T4, \T6, \T6
2338
2339        vpclmulqdq      $0x00, \T5, \XMM5, \T4
2340        vpxor           \T4, \T7, \T7
2341
2342        vpclmulqdq      $0x00, \T3, \T2, \T2
2343
2344        vpxor           \T2, \XMM1, \XMM1
2345
2346        ######################
2347
2348        vmovdqa         HashKey_3(arg1), \T5
2349        vpshufd         $0b01001110, \XMM6, \T2
2350        vpshufd         $0b01001110, \T5, \T3
2351        vpxor           \XMM6, \T2, \T2
2352        vpxor           \T5, \T3, \T3
2353
2354        vpclmulqdq      $0x11, \T5, \XMM6, \T4
2355        vpxor           \T4, \T6, \T6
2356
2357        vpclmulqdq      $0x00, \T5, \XMM6, \T4
2358        vpxor           \T4, \T7, \T7
2359
2360        vpclmulqdq      $0x00, \T3, \T2, \T2
2361
2362        vpxor           \T2, \XMM1, \XMM1
2363
2364        ######################
2365
2366        vmovdqa         HashKey_2(arg1), \T5
2367        vpshufd         $0b01001110, \XMM7, \T2
2368        vpshufd         $0b01001110, \T5, \T3
2369        vpxor           \XMM7, \T2, \T2
2370        vpxor           \T5, \T3, \T3
2371
2372        vpclmulqdq      $0x11, \T5, \XMM7, \T4
2373        vpxor           \T4, \T6, \T6
2374
2375        vpclmulqdq      $0x00, \T5, \XMM7, \T4
2376        vpxor           \T4, \T7, \T7
2377
2378        vpclmulqdq      $0x00, \T3, \T2, \T2
2379
2380        vpxor           \T2, \XMM1, \XMM1
2381
2382        ######################
2383
2384        vmovdqa         HashKey(arg1), \T5
2385        vpshufd         $0b01001110, \XMM8, \T2
2386        vpshufd         $0b01001110, \T5, \T3
2387        vpxor           \XMM8, \T2, \T2
2388        vpxor           \T5, \T3, \T3
2389
2390        vpclmulqdq      $0x11, \T5, \XMM8, \T4
2391        vpxor           \T4, \T6, \T6
2392
2393        vpclmulqdq      $0x00, \T5, \XMM8, \T4
2394        vpxor           \T4, \T7, \T7
2395
2396        vpclmulqdq      $0x00, \T3, \T2, \T2
2397
2398        vpxor           \T2, \XMM1, \XMM1
2399        vpxor           \T6, \XMM1, \XMM1
2400        vpxor           \T7, \XMM1, \T2
2401
2402
2403
2404
2405        vpslldq $8, \T2, \T4
2406        vpsrldq $8, \T2, \T2
2407
2408        vpxor   \T4, \T7, \T7
2409        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
2410						   # accumulated carry-less multiplications
2411
2412        #######################################################################
2413        #first phase of the reduction
2414        vmovdqa         POLY2(%rip), \T3
2415
2416        vpclmulqdq      $0x01, \T7, \T3, \T2
2417        vpslldq         $8, \T2, \T2               # shift-L xmm2 2 DWs
2418
2419        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
2420        #######################################################################
2421
2422
2423        #second phase of the reduction
2424        vpclmulqdq      $0x00, \T7, \T3, \T2
2425        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2426
2427        vpclmulqdq      $0x10, \T7, \T3, \T4
2428        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2429
2430        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
2431        #######################################################################
2432        vpxor           \T4, \T6, \T6              # the result is in T6
2433.endm
2434
2435
2436
2437# combined for GCM encrypt and decrypt functions
2438# clobbering all xmm registers
2439# clobbering r10, r11, r12, r13, r14, r15
2440.macro  GCM_ENC_DEC_AVX2     ENC_DEC
2441
2442        #the number of pushes must equal STACK_OFFSET
2443        push    %r12
2444        push    %r13
2445        push    %r14
2446        push    %r15
2447
2448        mov     %rsp, %r14
2449
2450
2451
2452
2453        sub     $VARIABLE_OFFSET, %rsp
2454        and     $~63, %rsp                         # align rsp to 64 bytes
2455
2456
2457        vmovdqu  HashKey(arg1), %xmm13             # xmm13 = HashKey
2458
2459        mov     arg4, %r13                         # save the number of bytes of plaintext/ciphertext
2460        and     $-16, %r13                         # r13 = r13 - (r13 mod 16)
2461
2462        mov     %r13, %r12
2463        shr     $4, %r12
2464        and     $7, %r12
2465        jz      _initial_num_blocks_is_0\@
2466
2467        cmp     $7, %r12
2468        je      _initial_num_blocks_is_7\@
2469        cmp     $6, %r12
2470        je      _initial_num_blocks_is_6\@
2471        cmp     $5, %r12
2472        je      _initial_num_blocks_is_5\@
2473        cmp     $4, %r12
2474        je      _initial_num_blocks_is_4\@
2475        cmp     $3, %r12
2476        je      _initial_num_blocks_is_3\@
2477        cmp     $2, %r12
2478        je      _initial_num_blocks_is_2\@
2479
2480        jmp     _initial_num_blocks_is_1\@
2481
2482_initial_num_blocks_is_7\@:
2483        INITIAL_BLOCKS_AVX2  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2484        sub     $16*7, %r13
2485        jmp     _initial_blocks_encrypted\@
2486
2487_initial_num_blocks_is_6\@:
2488        INITIAL_BLOCKS_AVX2  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2489        sub     $16*6, %r13
2490        jmp     _initial_blocks_encrypted\@
2491
2492_initial_num_blocks_is_5\@:
2493        INITIAL_BLOCKS_AVX2  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2494        sub     $16*5, %r13
2495        jmp     _initial_blocks_encrypted\@
2496
2497_initial_num_blocks_is_4\@:
2498        INITIAL_BLOCKS_AVX2  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2499        sub     $16*4, %r13
2500        jmp     _initial_blocks_encrypted\@
2501
2502_initial_num_blocks_is_3\@:
2503        INITIAL_BLOCKS_AVX2  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2504        sub     $16*3, %r13
2505        jmp     _initial_blocks_encrypted\@
2506
2507_initial_num_blocks_is_2\@:
2508        INITIAL_BLOCKS_AVX2  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2509        sub     $16*2, %r13
2510        jmp     _initial_blocks_encrypted\@
2511
2512_initial_num_blocks_is_1\@:
2513        INITIAL_BLOCKS_AVX2  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2514        sub     $16*1, %r13
2515        jmp     _initial_blocks_encrypted\@
2516
2517_initial_num_blocks_is_0\@:
2518        INITIAL_BLOCKS_AVX2  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2519
2520
2521_initial_blocks_encrypted\@:
2522        cmp     $0, %r13
2523        je      _zero_cipher_left\@
2524
2525        sub     $128, %r13
2526        je      _eight_cipher_left\@
2527
2528
2529
2530
2531        vmovd   %xmm9, %r15d
2532        and     $255, %r15d
2533        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2534
2535
2536_encrypt_by_8_new\@:
2537        cmp     $(255-8), %r15d
2538        jg      _encrypt_by_8\@
2539
2540
2541
2542        add     $8, %r15b
2543        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
2544        add     $128, %r11
2545        sub     $128, %r13
2546        jne     _encrypt_by_8_new\@
2547
2548        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2549        jmp     _eight_cipher_left\@
2550
2551_encrypt_by_8\@:
2552        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2553        add     $8, %r15b
2554        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
2555        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2556        add     $128, %r11
2557        sub     $128, %r13
2558        jne     _encrypt_by_8_new\@
2559
2560        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2561
2562
2563
2564
2565_eight_cipher_left\@:
2566        GHASH_LAST_8_AVX2    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
2567
2568
2569_zero_cipher_left\@:
2570        cmp     $16, arg4
2571        jl      _only_less_than_16\@
2572
2573        mov     arg4, %r13
2574        and     $15, %r13                            # r13 = (arg4 mod 16)
2575
2576        je      _multiple_of_16_bytes\@
2577
2578        # handle the last <16 Byte block seperately
2579
2580
2581        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
2582        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2583        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
2584
2585        sub     $16, %r11
2586        add     %r13, %r11
2587        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
2588
2589        lea     SHIFT_MASK+16(%rip), %r12
2590        sub     %r13, %r12                           # adjust the shuffle mask pointer
2591						     # to be able to shift 16-r13 bytes
2592						     # (r13 is the number of bytes in plaintext mod 16)
2593        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
2594        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
2595        jmp     _final_ghash_mul\@
2596
2597_only_less_than_16\@:
2598        # check for 0 length
2599        mov     arg4, %r13
2600        and     $15, %r13                            # r13 = (arg4 mod 16)
2601
2602        je      _multiple_of_16_bytes\@
2603
2604        # handle the last <16 Byte block seperately
2605
2606
2607        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
2608        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2609        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
2610
2611
2612        lea     SHIFT_MASK+16(%rip), %r12
2613        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
2614						     # able to shift 16-r13 bytes (r13 is the
2615						     # number of bytes in plaintext mod 16)
2616
2617_get_last_16_byte_loop\@:
2618        movb    (arg3, %r11),  %al
2619        movb    %al,  TMP1 (%rsp , %r11)
2620        add     $1, %r11
2621        cmp     %r13,  %r11
2622        jne     _get_last_16_byte_loop\@
2623
2624        vmovdqu  TMP1(%rsp), %xmm1
2625
2626        sub     $16, %r11
2627
2628_final_ghash_mul\@:
2629        .if  \ENC_DEC ==  DEC
2630        vmovdqa %xmm1, %xmm2
2631        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
2632        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2633        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
2634        vpand   %xmm1, %xmm2, %xmm2
2635        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
2636        vpxor   %xmm2, %xmm14, %xmm14
2637	#GHASH computation for the last <16 Byte block
2638        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2639        sub     %r13, %r11
2640        add     $16, %r11
2641        .else
2642        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
2643        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2644        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
2645        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2646        vpxor   %xmm9, %xmm14, %xmm14
2647	#GHASH computation for the last <16 Byte block
2648        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2649        sub     %r13, %r11
2650        add     $16, %r11
2651        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
2652        .endif
2653
2654
2655        #############################
2656        # output r13 Bytes
2657        vmovq   %xmm9, %rax
2658        cmp     $8, %r13
2659        jle     _less_than_8_bytes_left\@
2660
2661        mov     %rax, (arg2 , %r11)
2662        add     $8, %r11
2663        vpsrldq $8, %xmm9, %xmm9
2664        vmovq   %xmm9, %rax
2665        sub     $8, %r13
2666
2667_less_than_8_bytes_left\@:
2668        movb    %al, (arg2 , %r11)
2669        add     $1, %r11
2670        shr     $8, %rax
2671        sub     $1, %r13
2672        jne     _less_than_8_bytes_left\@
2673        #############################
2674
2675_multiple_of_16_bytes\@:
2676        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
2677        shl     $3, %r12                             # convert into number of bits
2678        vmovd   %r12d, %xmm15                        # len(A) in xmm15
2679
2680        shl     $3, arg4                             # len(C) in bits  (*128)
2681        vmovq   arg4, %xmm1
2682        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
2683        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
2684
2685        vpxor   %xmm15, %xmm14, %xmm14
2686        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
2687        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14              # perform a 16Byte swap
2688
2689        mov     arg5, %rax                           # rax = *Y0
2690        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
2691
2692        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
2693
2694        vpxor   %xmm14, %xmm9, %xmm9
2695
2696
2697
2698_return_T\@:
2699        mov     arg8, %r10              # r10 = authTag
2700        mov     arg9, %r11              # r11 = auth_tag_len
2701
2702        cmp     $16, %r11
2703        je      _T_16\@
2704
2705        cmp     $12, %r11
2706        je      _T_12\@
2707
2708_T_8\@:
2709        vmovq   %xmm9, %rax
2710        mov     %rax, (%r10)
2711        jmp     _return_T_done\@
2712_T_12\@:
2713        vmovq   %xmm9, %rax
2714        mov     %rax, (%r10)
2715        vpsrldq $8, %xmm9, %xmm9
2716        vmovd   %xmm9, %eax
2717        mov     %eax, 8(%r10)
2718        jmp     _return_T_done\@
2719
2720_T_16\@:
2721        vmovdqu %xmm9, (%r10)
2722
2723_return_T_done\@:
2724        mov     %r14, %rsp
2725
2726        pop     %r15
2727        pop     %r14
2728        pop     %r13
2729        pop     %r12
2730.endm
2731
2732
2733#############################################################
2734#void   aesni_gcm_precomp_avx_gen4
2735#        (gcm_data     *my_ctx_data,
2736#        u8     *hash_subkey)# /* H, the Hash sub key input.
2737#				Data starts on a 16-byte boundary. */
2738#############################################################
2739ENTRY(aesni_gcm_precomp_avx_gen4)
2740        #the number of pushes must equal STACK_OFFSET
2741        push    %r12
2742        push    %r13
2743        push    %r14
2744        push    %r15
2745
2746        mov     %rsp, %r14
2747
2748
2749
2750        sub     $VARIABLE_OFFSET, %rsp
2751        and     $~63, %rsp                    # align rsp to 64 bytes
2752
2753        vmovdqu  (arg2), %xmm6                # xmm6 = HashKey
2754
2755        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
2756        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
2757        vmovdqa  %xmm6, %xmm2
2758        vpsllq   $1, %xmm6, %xmm6
2759        vpsrlq   $63, %xmm2, %xmm2
2760        vmovdqa  %xmm2, %xmm1
2761        vpslldq  $8, %xmm2, %xmm2
2762        vpsrldq  $8, %xmm1, %xmm1
2763        vpor     %xmm2, %xmm6, %xmm6
2764        #reduction
2765        vpshufd  $0b00100100, %xmm1, %xmm2
2766        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
2767        vpand    POLY(%rip), %xmm2, %xmm2
2768        vpxor    %xmm2, %xmm6, %xmm6          # xmm6 holds the HashKey<<1 mod poly
2769        #######################################################################
2770        vmovdqa  %xmm6, HashKey(arg1)         # store HashKey<<1 mod poly
2771
2772
2773        PRECOMPUTE_AVX2  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
2774
2775        mov     %r14, %rsp
2776
2777        pop     %r15
2778        pop     %r14
2779        pop     %r13
2780        pop     %r12
2781        ret
2782ENDPROC(aesni_gcm_precomp_avx_gen4)
2783
2784
2785###############################################################################
2786#void   aesni_gcm_enc_avx_gen4(
2787#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2788#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
2789#        const   u8 *in, /* Plaintext input */
2790#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
2791#        u8      *iv, /* Pre-counter block j0: 4 byte salt
2792#			(from Security Association) concatenated with 8 byte
2793#			 Initialisation Vector (from IPSec ESP Payload)
2794#			 concatenated with 0x00000001. 16-byte aligned pointer. */
2795#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2796#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2797#        u8      *auth_tag, /* Authenticated Tag output. */
2798#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
2799#				Valid values are 16 (most likely), 12 or 8. */
2800###############################################################################
2801ENTRY(aesni_gcm_enc_avx_gen4)
2802        GCM_ENC_DEC_AVX2     ENC
2803	ret
2804ENDPROC(aesni_gcm_enc_avx_gen4)
2805
2806###############################################################################
2807#void   aesni_gcm_dec_avx_gen4(
2808#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2809#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
2810#        const   u8 *in, /* Ciphertext input */
2811#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
2812#        u8      *iv, /* Pre-counter block j0: 4 byte salt
2813#			(from Security Association) concatenated with 8 byte
2814#			Initialisation Vector (from IPSec ESP Payload)
2815#			concatenated with 0x00000001. 16-byte aligned pointer. */
2816#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2817#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2818#        u8      *auth_tag, /* Authenticated Tag output. */
2819#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
2820#				Valid values are 16 (most likely), 12 or 8. */
2821###############################################################################
2822ENTRY(aesni_gcm_dec_avx_gen4)
2823        GCM_ENC_DEC_AVX2     DEC
2824	ret
2825ENDPROC(aesni_gcm_dec_avx_gen4)
2826
2827#endif /* CONFIG_AS_AVX2 */
2828