1########################################################################
2# Copyright (c) 2013, Intel Corporation
3#
4# This software is available to you under a choice of one of two
5# licenses.  You may choose to be licensed under the terms of the GNU
6# General Public License (GPL) Version 2, available from the file
7# COPYING in the main directory of this source tree, or the
8# OpenIB.org BSD license below:
9#
10# Redistribution and use in source and binary forms, with or without
11# modification, are permitted provided that the following conditions are
12# met:
13#
14# * Redistributions of source code must retain the above copyright
15#   notice, this list of conditions and the following disclaimer.
16#
17# * Redistributions in binary form must reproduce the above copyright
18#   notice, this list of conditions and the following disclaimer in the
19#   documentation and/or other materials provided with the
20#   distribution.
21#
22# * Neither the name of the Intel Corporation nor the names of its
23#   contributors may be used to endorse or promote products derived from
24#   this software without specific prior written permission.
25#
26#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38########################################################################
39##
40## Authors:
41##	Erdinc Ozturk <erdinc.ozturk@intel.com>
42##	Vinodh Gopal <vinodh.gopal@intel.com>
43##	James Guilford <james.guilford@intel.com>
44##	Tim Chen <tim.c.chen@linux.intel.com>
45##
46## References:
##       This code was derived and highly optimized from the code described in the paper:
##               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
##                       on Intel Architecture Processors. August, 2010
##       The details of the implementation are explained in:
##               Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
##                       on Intel Architecture Processors. October, 2012.
53##
54## Assumptions:
55##
56##
57##
58## iv:
59##       0                   1                   2                   3
60##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62##       |                             Salt  (From the SA)               |
63##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64##       |                     Initialization Vector                     |
65##       |         (This is the sequence number from IPSec header)       |
66##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67##       |                              0x1                              |
68##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
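##
##       For example (illustrative values only): a 4-byte salt of 0x00112233
##       from the SA and an 8-byte IV of 0x445566778899AABB from the packet
##       give the 16-byte counter block
##
##               00 11 22 33 44 55 66 77 88 99 AA BB 00 00 00 01
##
##       i.e. salt || IV || 0x00000001, exactly the layout shown above.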
69##
70##
71##
72## AAD:
73##       AAD padded to 128 bits with 0
74##       for example, assume AAD is a u32 vector
75##
76##       if AAD is 8 bytes:
##       AAD[3] = {A0, A1};
78##       padded AAD in xmm register = {A1 A0 0 0}
79##
80##       0                   1                   2                   3
81##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83##       |                               SPI (A1)                        |
84##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85##       |                     32-bit Sequence Number (A0)               |
86##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87##       |                              0x0                              |
88##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89##
90##                                       AAD Format with 32-bit Sequence Number
91##
92##       if AAD is 12 bytes:
##       AAD[3] = {A0, A1, A2};
94##       padded AAD in xmm register = {A2 A1 A0 0}
95##
96##       0                   1                   2                   3
97##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99##       |                               SPI (A2)                        |
100##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101##       |                 64-bit Extended Sequence Number {A1,A0}       |
102##       |                                                               |
103##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104##       |                              0x0                              |
105##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106##
107##        AAD Format with 64-bit Extended Sequence Number
108##
109##
110## aadLen:
##       From the definition of the spec, aadLen can only be 8 or 12 bytes.
##       The code additionally supports an aadLen of 16 bytes.
113##
114## TLen:
##       From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
116##
117## poly = x^128 + x^127 + x^126 + x^121 + 1
## Throughout the code, one-tab and two-tab indentation is used: one tab is
## for the GHASH part, two tabs are for the AES part.
120##
121
122#include <linux/linkage.h>
123#include <asm/inst.h>
124
125# constants in mergeable sections, linker can reorder and merge
126.section	.rodata.cst16.POLY, "aM", @progbits, 16
127.align 16
128POLY:            .octa     0xC2000000000000000000000000000001
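# POLY holds the bit pattern of the GHASH field polynomial
# x^128 + x^127 + x^126 + x^121 + 1 (the x^128 term is implicit): the top
# byte 0xC2 covers the x^127, x^126 and x^121 terms and the low byte 0x01 is
# the constant term.  INIT uses it for the conditional reduction when
# computing HashKey<<1 mod poly.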
129
130.section	.rodata.cst16.POLY2, "aM", @progbits, 16
131.align 16
132POLY2:           .octa     0xC20000000000000000000001C2000000
133
134.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
135.align 16
136TWOONE:          .octa     0x00000001000000000000000000000001
137
138.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
139.align 16
140SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
141
142.section	.rodata.cst16.ONE, "aM", @progbits, 16
143.align 16
144ONE:             .octa     0x00000000000000000000000000000001
145
146.section	.rodata.cst16.ONEf, "aM", @progbits, 16
147.align 16
148ONEf:            .octa     0x01000000000000000000000000000000
149
# The order of these constants should not change.
# More specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F.
152.section	.rodata, "a", @progbits
153.align 16
154SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
155ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
156                 .octa     0x00000000000000000000000000000000
157
158.section .rodata
159.align 16
160.type aad_shift_arr, @object
161.size aad_shift_arr, 272
162aad_shift_arr:
163        .octa     0xffffffffffffffffffffffffffffffff
164        .octa     0xffffffffffffffffffffffffffffff0C
165        .octa     0xffffffffffffffffffffffffffff0D0C
166        .octa     0xffffffffffffffffffffffffff0E0D0C
167        .octa     0xffffffffffffffffffffffff0F0E0D0C
168        .octa     0xffffffffffffffffffffff0C0B0A0908
169        .octa     0xffffffffffffffffffff0D0C0B0A0908
170        .octa     0xffffffffffffffffff0E0D0C0B0A0908
171        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
172        .octa     0xffffffffffffff0C0B0A090807060504
173        .octa     0xffffffffffff0D0C0B0A090807060504
174        .octa     0xffffffffff0E0D0C0B0A090807060504
175        .octa     0xffffffff0F0E0D0C0B0A090807060504
176        .octa     0xffffff0C0B0A09080706050403020100
177        .octa     0xffff0D0C0B0A09080706050403020100
178        .octa     0xff0E0D0C0B0A09080706050403020100
179        .octa     0x0F0E0D0C0B0A09080706050403020100
180
181
182.text
183
184
185#define AadHash 16*0
186#define AadLen 16*1
187#define InLen (16*1)+8
188#define PBlockEncKey 16*2
189#define OrigIV 16*3
190#define CurCount 16*4
191#define PBlockLen 16*5
192
193HashKey        = 16*6   # store HashKey <<1 mod poly here
194HashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
195HashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
196HashKey_4      = 16*9   # store HashKey^4 <<1 mod poly here
197HashKey_5      = 16*10   # store HashKey^5 <<1 mod poly here
198HashKey_6      = 16*11   # store HashKey^6 <<1 mod poly here
199HashKey_7      = 16*12   # store HashKey^7 <<1 mod poly here
200HashKey_8      = 16*13   # store HashKey^8 <<1 mod poly here
201HashKey_k      = 16*14   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
202HashKey_2_k    = 16*15   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
203HashKey_3_k    = 16*16   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
204HashKey_4_k    = 16*17   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
205HashKey_5_k    = 16*18   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
206HashKey_6_k    = 16*19   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
207HashKey_7_k    = 16*20   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
208HashKey_8_k    = 16*21   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
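# The offsets above describe the per-request context that arg2 points at
# (the gcm context structure kept by the C glue code, assuming its usual
# gcm_context_data layout): six 16-byte slots of running state
# (AadHash .. PBlockLen) followed by the eight HashKey powers and their
# XORed Karatsuba halves filled in by PRECOMPUTE.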
209
210#define arg1 %rdi
211#define arg2 %rsi
212#define arg3 %rdx
213#define arg4 %rcx
214#define arg5 %r8
215#define arg6 %r9
216#define arg7 STACK_OFFSET+8*1(%r14)
217#define arg8 STACK_OFFSET+8*2(%r14)
218#define arg9 STACK_OFFSET+8*3(%r14)
219#define arg10 STACK_OFFSET+8*4(%r14)
220#define keysize 2*15*16(arg1)
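# keysize reads the key length stored right after the expanded key schedule
# that arg1 points to: 15 encryption round keys followed by 15 decryption
# round keys of 16 bytes each, hence the 2*15*16 offset (this assumes the
# kernel's crypto_aes_ctx layout).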
221
222i = 0
223j = 0
224
225out_order = 0
226in_order = 1
227DEC = 0
228ENC = 1
229
230.macro define_reg r n
231reg_\r = %xmm\n
232.endm
233
234.macro setreg
235.altmacro
236define_reg i %i
237define_reg j %j
238.noaltmacro
239.endm
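# Example of the altmacro expansion above: with i = 3, setreg evaluates %i
# to its current value, so "define_reg i %i" becomes "define_reg i 3" and
# assigns reg_i = %xmm3; uses of reg_i below then mean %xmm3 until the next
# setreg.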
240
# FUNC_SAVE below pushes 4 registers onto the stack; STACK_OFFSET must match
# so that the stack arguments arg7-arg10 (addressed relative to %r14) remain
# reachable.
242STACK_OFFSET = 8*4
243
244TMP1 =   16*0    # Temporary storage for AAD
245TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
246TMP3 =   16*2    # Temporary storage for AES State 3
247TMP4 =   16*3    # Temporary storage for AES State 4
248TMP5 =   16*4    # Temporary storage for AES State 5
249TMP6 =   16*5    # Temporary storage for AES State 6
250TMP7 =   16*6    # Temporary storage for AES State 7
251TMP8 =   16*7    # Temporary storage for AES State 8
252
253VARIABLE_OFFSET = 16*8
254
255################################
256# Utility Macros
257################################
258
259.macro FUNC_SAVE
260        #the number of pushes must equal STACK_OFFSET
261        push    %r12
262        push    %r13
263        push    %r14
264        push    %r15
265
266        mov     %rsp, %r14
267
268
269
270        sub     $VARIABLE_OFFSET, %rsp
271        and     $~63, %rsp                    # align rsp to 64 bytes
272.endm
273
274.macro FUNC_RESTORE
275        mov     %r14, %rsp
276
277        pop     %r15
278        pop     %r14
279        pop     %r13
280        pop     %r12
281.endm
282
283# Encryption of a single block
284.macro ENCRYPT_SINGLE_BLOCK REP XMM0
285                vpxor    (arg1), \XMM0, \XMM0
286               i = 1
287               setreg
288.rep \REP
289                vaesenc  16*i(arg1), \XMM0, \XMM0
290               i = (i+1)
291               setreg
292.endr
293                vaesenclast 16*i(arg1), \XMM0, \XMM0
294.endm
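# Illustrative use (as in GCM_ENC_DEC/GCM_COMPLETE below): REP is the number
# of middle rounds, i.e. 9/11/13 for AES-128/192/256, and XMM0 holds the
# already byte-swapped counter block on entry and E(K, counter) on exit:
#
#               vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
#               ENCRYPT_SINGLE_BLOCK    \REP, %xmm9        # xmm9 = E(K, Yn)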
295
296# combined for GCM encrypt and decrypt functions
297# clobbering all xmm registers
298# clobbering r10, r11, r12, r13, r14, r15
299.macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
300        vmovdqu AadHash(arg2), %xmm8
301        vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
302        add arg5, InLen(arg2)
303
304        # initialize the data pointer offset as zero
305        xor     %r11d, %r11d
306
307        PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
308        sub %r11, arg5
309
310        mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
311        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
312
313        mov     %r13, %r12
314        shr     $4, %r12
315        and     $7, %r12
316        jz      _initial_num_blocks_is_0\@
317
318        cmp     $7, %r12
319        je      _initial_num_blocks_is_7\@
320        cmp     $6, %r12
321        je      _initial_num_blocks_is_6\@
322        cmp     $5, %r12
323        je      _initial_num_blocks_is_5\@
324        cmp     $4, %r12
325        je      _initial_num_blocks_is_4\@
326        cmp     $3, %r12
327        je      _initial_num_blocks_is_3\@
328        cmp     $2, %r12
329        je      _initial_num_blocks_is_2\@
330
331        jmp     _initial_num_blocks_is_1\@
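        # Example: with 100 bytes left after the partial-block fixup, r13 is
        # 96 (6 full blocks) and 6 mod 8 = 6, so INITIAL_BLOCKS consumes all
        # 6 blocks, r13 drops to 0 at the target label and the
        # 8-blocks-at-a-time loop below is skipped; the trailing 4 bytes are
        # handled in _zero_cipher_left.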
332
333_initial_num_blocks_is_7\@:
334        \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
335        sub     $16*7, %r13
336        jmp     _initial_blocks_encrypted\@
337
338_initial_num_blocks_is_6\@:
339        \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
340        sub     $16*6, %r13
341        jmp     _initial_blocks_encrypted\@
342
343_initial_num_blocks_is_5\@:
344        \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
345        sub     $16*5, %r13
346        jmp     _initial_blocks_encrypted\@
347
348_initial_num_blocks_is_4\@:
349        \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
350        sub     $16*4, %r13
351        jmp     _initial_blocks_encrypted\@
352
353_initial_num_blocks_is_3\@:
354        \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
355        sub     $16*3, %r13
356        jmp     _initial_blocks_encrypted\@
357
358_initial_num_blocks_is_2\@:
359        \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
360        sub     $16*2, %r13
361        jmp     _initial_blocks_encrypted\@
362
363_initial_num_blocks_is_1\@:
364        \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
365        sub     $16*1, %r13
366        jmp     _initial_blocks_encrypted\@
367
368_initial_num_blocks_is_0\@:
369        \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
370
371
372_initial_blocks_encrypted\@:
373        cmp     $0, %r13
374        je      _zero_cipher_left\@
375
376        sub     $128, %r13
377        je      _eight_cipher_left\@
378
379
380
381
382        vmovd   %xmm9, %r15d
383        and     $255, %r15d
384        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
385
386
387_encrypt_by_8_new\@:
388        cmp     $(255-8), %r15d
389        jg      _encrypt_by_8\@
390
391
392
393        add     $8, %r15b
394        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
395        add     $128, %r11
396        sub     $128, %r13
397        jne     _encrypt_by_8_new\@
398
399        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
400        jmp     _eight_cipher_left\@
401
402_encrypt_by_8\@:
403        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
404        add     $8, %r15b
405        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
406        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
407        add     $128, %r11
408        sub     $128, %r13
409        jne     _encrypt_by_8_new\@
410
411        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
412
413
414
415
416_eight_cipher_left\@:
417        \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
418
419
420_zero_cipher_left\@:
421        vmovdqu %xmm14, AadHash(arg2)
422        vmovdqu %xmm9, CurCount(arg2)
423
424        # check for 0 length
425        mov     arg5, %r13
426        and     $15, %r13                            # r13 = (arg5 mod 16)
427
428        je      _multiple_of_16_bytes\@
429
430        # handle the last <16 Byte block separately
431
432        mov %r13, PBlockLen(arg2)
433
434        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
435        vmovdqu %xmm9, CurCount(arg2)
436        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
437
438        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
439        vmovdqu %xmm9, PBlockEncKey(arg2)
440
441        cmp $16, arg5
442        jge _large_enough_update\@
443
444        lea (arg4,%r11,1), %r10
445        mov %r13, %r12
446
447        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
448
449        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
                                                     # able to shift 16-r13 bytes (r13 is the
                                                     # number of bytes in plaintext mod 16)
453
454        jmp _final_ghash_mul\@
455
456_large_enough_update\@:
457        sub $16, %r11
458        add %r13, %r11
459
460        # receive the last <16 Byte block
461        vmovdqu	(arg4, %r11, 1), %xmm1
462
463        sub	%r13, %r11
464        add	$16, %r11
465
466        lea	SHIFT_MASK+16(%rip), %r12
467        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
468        # (r13 is the number of bytes in plaintext mod 16)
469        sub	%r13, %r12
470        # get the appropriate shuffle mask
471        vmovdqu	(%r12), %xmm2
472        # shift right 16-r13 bytes
473        vpshufb  %xmm2, %xmm1, %xmm1
474
475_final_ghash_mul\@:
476        .if  \ENC_DEC ==  DEC
477        vmovdqa %xmm1, %xmm2
478        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
479        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
480						     # mask out top 16-r13 bytes of xmm9
481        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
482        vpand   %xmm1, %xmm2, %xmm2
483        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
484        vpxor   %xmm2, %xmm14, %xmm14
485
486        vmovdqu %xmm14, AadHash(arg2)
487        .else
488        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
489        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
490						     # mask out top 16-r13 bytes of xmm9
491        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
492        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
493        vpxor   %xmm9, %xmm14, %xmm14
494
495        vmovdqu %xmm14, AadHash(arg2)
496        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
497        .endif
498
499
500        #############################
501        # output r13 Bytes
502        vmovq   %xmm9, %rax
503        cmp     $8, %r13
504        jle     _less_than_8_bytes_left\@
505
506        mov     %rax, (arg3 , %r11)
507        add     $8, %r11
508        vpsrldq $8, %xmm9, %xmm9
509        vmovq   %xmm9, %rax
510        sub     $8, %r13
511
512_less_than_8_bytes_left\@:
513        movb    %al, (arg3 , %r11)
514        add     $1, %r11
515        shr     $8, %rax
516        sub     $1, %r13
517        jne     _less_than_8_bytes_left\@
518        #############################
519
520_multiple_of_16_bytes\@:
521.endm
522
523
# GCM_COMPLETE finishes the GHASH of any remaining partial block and
# computes the authentication tag.
# Output: Authentication Tag (AUTH_TAG)
526# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
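# In SP 800-38D terms this computes S = GHASH_H(A || C || len(A)_64 ||
# len(C)_64) from the saved running hash and lengths, then returns
# T = E(K, Y0) XOR S, truncated to auth_tag_len bytes when written out.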
527.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
528        vmovdqu AadHash(arg2), %xmm14
529        vmovdqu HashKey(arg2), %xmm13
530
531        mov PBlockLen(arg2), %r12
532        cmp $0, %r12
533        je _partial_done\@
534
535	#GHASH computation for the last <16 Byte block
536        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
537
538_partial_done\@:
539        mov AadLen(arg2), %r12                          # r12 = aadLen (number of bytes)
540        shl     $3, %r12                             # convert into number of bits
541        vmovd   %r12d, %xmm15                        # len(A) in xmm15
542
543        mov InLen(arg2), %r12
        shl     $3, %r12                        # len(C) in bits  (*8)
545        vmovq   %r12, %xmm1
546        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
547        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
548
549        vpxor   %xmm15, %xmm14, %xmm14
550        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
551        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
552
553        vmovdqu OrigIV(arg2), %xmm9
554
555        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Y0)
556
557        vpxor   %xmm14, %xmm9, %xmm9
558
559
560
561_return_T\@:
562        mov     \AUTH_TAG, %r10              # r10 = authTag
563        mov     \AUTH_TAG_LEN, %r11              # r11 = auth_tag_len
564
565        cmp     $16, %r11
566        je      _T_16\@
567
568        cmp     $8, %r11
569        jl      _T_4\@
570
571_T_8\@:
572        vmovq   %xmm9, %rax
573        mov     %rax, (%r10)
574        add     $8, %r10
575        sub     $8, %r11
576        vpsrldq $8, %xmm9, %xmm9
577        cmp     $0, %r11
578        je     _return_T_done\@
579_T_4\@:
580        vmovd   %xmm9, %eax
581        mov     %eax, (%r10)
582        add     $4, %r10
583        sub     $4, %r11
584        vpsrldq     $4, %xmm9, %xmm9
585        cmp     $0, %r11
586        je     _return_T_done\@
587_T_123\@:
588        vmovd     %xmm9, %eax
589        cmp     $2, %r11
590        jl     _T_1\@
591        mov     %ax, (%r10)
592        cmp     $2, %r11
593        je     _return_T_done\@
594        add     $2, %r10
595        sar     $16, %eax
596_T_1\@:
597        mov     %al, (%r10)
598        jmp     _return_T_done\@
599
600_T_16\@:
601        vmovdqu %xmm9, (%r10)
602
603_return_T_done\@:
604.endm
605
606.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
607
608	mov     \AAD, %r10                      # r10 = AAD
609	mov     \AADLEN, %r12                      # r12 = aadLen
610
611
612	mov     %r12, %r11
613
614	vpxor   \T8, \T8, \T8
615	vpxor   \T7, \T7, \T7
616	cmp     $16, %r11
617	jl      _get_AAD_rest8\@
618_get_AAD_blocks\@:
619	vmovdqu (%r10), \T7
620	vpshufb SHUF_MASK(%rip), \T7, \T7
621	vpxor   \T7, \T8, \T8
622	\GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
623	add     $16, %r10
624	sub     $16, %r12
625	sub     $16, %r11
626	cmp     $16, %r11
627	jge     _get_AAD_blocks\@
628	vmovdqu \T8, \T7
629	cmp     $0, %r11
630	je      _get_AAD_done\@
631
632	vpxor   \T7, \T7, \T7
633
634	/* read the last <16B of AAD. since we have at least 4B of
635	data right after the AAD (the ICV, and maybe some CT), we can
636	read 4B/8B blocks safely, and then get rid of the extra stuff */
637_get_AAD_rest8\@:
638	cmp     $4, %r11
639	jle     _get_AAD_rest4\@
640	movq    (%r10), \T1
641	add     $8, %r10
642	sub     $8, %r11
643	vpslldq $8, \T1, \T1
644	vpsrldq $8, \T7, \T7
645	vpxor   \T1, \T7, \T7
646	jmp     _get_AAD_rest8\@
647_get_AAD_rest4\@:
648	cmp     $0, %r11
649	jle      _get_AAD_rest0\@
650	mov     (%r10), %eax
651	movq    %rax, \T1
652	add     $4, %r10
653	sub     $4, %r11
654	vpslldq $12, \T1, \T1
655	vpsrldq $4, \T7, \T7
656	vpxor   \T1, \T7, \T7
657_get_AAD_rest0\@:
658	/* finalize: shift out the extra bytes we read, and align
659	left. since pslldq can only shift by an immediate, we use
660	vpshufb and an array of shuffle masks */
661	movq    %r12, %r11
662	salq    $4, %r11
663	vmovdqu  aad_shift_arr(%r11), \T1
664	vpshufb \T1, \T7, \T7
665_get_AAD_rest_final\@:
666	vpshufb SHUF_MASK(%rip), \T7, \T7
667	vpxor   \T8, \T7, \T7
668	\GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6
669
670_get_AAD_done\@:
671        vmovdqu \T7, AadHash(arg2)
672.endm
673
674.macro INIT GHASH_MUL PRECOMPUTE
675        mov arg6, %r11
676        mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
677        xor %r11d, %r11d
678        mov %r11, InLen(arg2) # ctx_data.in_length = 0
679
680        mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
681        mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
682        mov arg3, %rax
683        movdqu (%rax), %xmm0
684        movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
685
686        vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
687        movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
688
689        vmovdqu  (arg4), %xmm6              # xmm6 = HashKey
690
691        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
692        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
693        vmovdqa  %xmm6, %xmm2
694        vpsllq   $1, %xmm6, %xmm6
695        vpsrlq   $63, %xmm2, %xmm2
696        vmovdqa  %xmm2, %xmm1
697        vpslldq  $8, %xmm2, %xmm2
698        vpsrldq  $8, %xmm1, %xmm1
699        vpor     %xmm2, %xmm6, %xmm6
700        #reduction
701        vpshufd  $0b00100100, %xmm1, %xmm2
702        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
703        vpand    POLY(%rip), %xmm2, %xmm2
704        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
705        #######################################################################
706        vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly
707
708        CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
709
710        \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
711.endm
712
713
714# Reads DLEN bytes starting at DPTR and stores in XMMDst
715# where 0 < DLEN < 16
716# Clobbers %rax, DLEN
717.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
718        vpxor \XMMDst, \XMMDst, \XMMDst
719
720        cmp $8, \DLEN
721        jl _read_lt8_\@
722        mov (\DPTR), %rax
723        vpinsrq $0, %rax, \XMMDst, \XMMDst
724        sub $8, \DLEN
725        jz _done_read_partial_block_\@
726        xor %eax, %eax
727_read_next_byte_\@:
728        shl $8, %rax
729        mov 7(\DPTR, \DLEN, 1), %al
730        dec \DLEN
731        jnz _read_next_byte_\@
732        vpinsrq $1, %rax, \XMMDst, \XMMDst
733        jmp _done_read_partial_block_\@
734_read_lt8_\@:
735        xor %eax, %eax
736_read_next_byte_lt8_\@:
737        shl $8, %rax
738        mov -1(\DPTR, \DLEN, 1), %al
739        dec \DLEN
740        jnz _read_next_byte_lt8_\@
741        vpinsrq $0, %rax, \XMMDst, \XMMDst
742_done_read_partial_block_\@:
743.endm
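# Worked example: for DLEN = 13 the first 8 bytes are inserted as the low
# quadword, the byte loop then packs bytes 8..12 into %rax (byte 8 ending up
# least significant) and inserts them as the high quadword, so byte i of the
# source lands in byte i of XMMDst and bytes 13..15 stay zero.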
744
745# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
746# between update calls.
747# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates the hash and partial-block state in the gcm context data
749# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
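# Worked example: if an earlier update left PBlockLen = 6, the saved
# E(K, Yn) keystream is shifted so its unused bytes 6..15 line up with the
# new data; a 10-byte call then completes the block (6 + 10 - 16 = 0), the
# finished block is folded into AadHash, PBlockLen is reset to 0 and the 10
# freshly produced bytes are written out.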
750.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
751        AAD_HASH ENC_DEC
752        mov 	PBlockLen(arg2), %r13
753        cmp	$0, %r13
754        je	_partial_block_done_\@	# Leave Macro if no partial blocks
755        # Read in input data without over reading
756        cmp	$16, \PLAIN_CYPH_LEN
757        jl	_fewer_than_16_bytes_\@
758        vmovdqu	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
759        jmp	_data_read_\@
760
761_fewer_than_16_bytes_\@:
762        lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
763        mov	\PLAIN_CYPH_LEN, %r12
764        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
765
766        mov PBlockLen(arg2), %r13
767
768_data_read_\@:				# Finished reading in data
769
770        vmovdqu	PBlockEncKey(arg2), %xmm9
771        vmovdqu	HashKey(arg2), %xmm13
772
773        lea	SHIFT_MASK(%rip), %r12
774
775        # adjust the shuffle mask pointer to be able to shift r13 bytes
        # (16-r13 is the number of bytes in plaintext mod 16)
777        add	%r13, %r12
778        vmovdqu	(%r12), %xmm2		# get the appropriate shuffle mask
779        vpshufb %xmm2, %xmm9, %xmm9		# shift right r13 bytes
780
781.if  \ENC_DEC ==  DEC
782        vmovdqa	%xmm1, %xmm3
783        pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
784
785        mov	\PLAIN_CYPH_LEN, %r10
786        add	%r13, %r10
787        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
788        sub	$16, %r10
        # Determine if the partial block is not being filled and
790        # shift mask accordingly
791        jge	_no_extra_mask_1_\@
792        sub	%r10, %r12
793_no_extra_mask_1_\@:
794
795        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
796        # get the appropriate mask to mask out bottom r13 bytes of xmm9
797        vpand	%xmm1, %xmm9, %xmm9		# mask out bottom r13 bytes of xmm9
798
799        vpand	%xmm1, %xmm3, %xmm3
800        vmovdqa	SHUF_MASK(%rip), %xmm10
801        vpshufb	%xmm10, %xmm3, %xmm3
802        vpshufb	%xmm2, %xmm3, %xmm3
803        vpxor	%xmm3, \AAD_HASH, \AAD_HASH
804
805        cmp	$0, %r10
806        jl	_partial_incomplete_1_\@
807
808        # GHASH computation for the last <16 Byte block
809        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
810        xor	%eax,%eax
811
812        mov	%rax, PBlockLen(arg2)
813        jmp	_dec_done_\@
814_partial_incomplete_1_\@:
815        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
816_dec_done_\@:
817        vmovdqu	\AAD_HASH, AadHash(arg2)
818.else
819        vpxor	%xmm1, %xmm9, %xmm9			# Plaintext XOR E(K, Yn)
820
821        mov	\PLAIN_CYPH_LEN, %r10
822        add	%r13, %r10
823        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
824        sub	$16, %r10
        # Determine if the partial block is not being filled and
826        # shift mask accordingly
827        jge	_no_extra_mask_2_\@
828        sub	%r10, %r12
829_no_extra_mask_2_\@:
830
831        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
832        # get the appropriate mask to mask out bottom r13 bytes of xmm9
833        vpand	%xmm1, %xmm9, %xmm9
834
835        vmovdqa	SHUF_MASK(%rip), %xmm1
836        vpshufb %xmm1, %xmm9, %xmm9
837        vpshufb %xmm2, %xmm9, %xmm9
838        vpxor	%xmm9, \AAD_HASH, \AAD_HASH
839
840        cmp	$0, %r10
841        jl	_partial_incomplete_2_\@
842
843        # GHASH computation for the last <16 Byte block
844        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
845        xor	%eax,%eax
846
847        mov	%rax, PBlockLen(arg2)
848        jmp	_encode_done_\@
849_partial_incomplete_2_\@:
850        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
851_encode_done_\@:
852        vmovdqu	\AAD_HASH, AadHash(arg2)
853
854        vmovdqa	SHUF_MASK(%rip), %xmm10
855        # shuffle xmm9 back to output as ciphertext
856        vpshufb	%xmm10, %xmm9, %xmm9
857        vpshufb	%xmm2, %xmm9, %xmm9
858.endif
859        # output encrypted Bytes
860        cmp	$0, %r10
861        jl	_partial_fill_\@
862        mov	%r13, %r12
863        mov	$16, %r13
864        # Set r13 to be the number of bytes to write out
865        sub	%r12, %r13
866        jmp	_count_set_\@
867_partial_fill_\@:
868        mov	\PLAIN_CYPH_LEN, %r13
869_count_set_\@:
870        vmovdqa	%xmm9, %xmm0
871        vmovq	%xmm0, %rax
872        cmp	$8, %r13
873        jle	_less_than_8_bytes_left_\@
874
875        mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
876        add	$8, \DATA_OFFSET
877        psrldq	$8, %xmm0
878        vmovq	%xmm0, %rax
879        sub	$8, %r13
880_less_than_8_bytes_left_\@:
881        movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
882        add	$1, \DATA_OFFSET
883        shr	$8, %rax
884        sub	$1, %r13
885        jne	_less_than_8_bytes_left_\@
886_partial_block_done_\@:
887.endm # PARTIAL_BLOCK
888
889###############################################################################
890# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
891# Input: A and B (128-bits each, bit-reflected)
892# Output: C = A*B*x mod poly, (i.e. >>1 )
893# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
894# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
895###############################################################################
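# The 128x128-bit carry-less product is formed with one level of Karatsuba:
# writing GH = a1:a0 and HK = b1:b0 (64-bit halves, + meaning XOR),
#
#   GH * HK = a1*b1*x^128 + (a1*b1 + a0*b0 + (a1+a0)*(b1+b0))*x^64 + a0*b0
#
# Below, T1 = a1*b1, GH = a0*b0 and T2 = (a1+a0)*(b1+b0); the middle term is
# split across the <T1:GH> pair before the two-phase shift-based reduction
# modulo x^128 + x^127 + x^126 + x^121 + 1.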
896.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
897
898        vpshufd         $0b01001110, \GH, \T2
899        vpshufd         $0b01001110, \HK, \T3
900        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
901        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
902
903        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
904        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
905        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
906        vpxor           \GH, \T2,\T2
907        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
908
909        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
910        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
911        vpxor           \T3, \GH, \GH
912        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
913
        #first phase of the reduction
        vpslld  $31, \GH, \T2                   # packed left shift of GH by 31
        vpslld  $30, \GH, \T3                   # packed left shift of GH by 30
        vpslld  $25, \GH, \T4                   # packed left shift of GH by 25
918
919        vpxor   \T3, \T2, \T2                   # xor the shifted versions
920        vpxor   \T4, \T2, \T2
921
922        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
923
924        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
925        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
926
927        #second phase of the reduction
928
        vpsrld  $1,\GH, \T2                     # packed right shift of GH by 1
        vpsrld  $2,\GH, \T3                     # packed right shift of GH by 2
        vpsrld  $7,\GH, \T4                     # packed right shift of GH by 7
932        vpxor   \T3, \T2, \T2                   # xor the shifted versions
933        vpxor   \T4, \T2, \T2
934
935        vpxor   \T5, \T2, \T2
936        vpxor   \T2, \GH, \GH
937        vpxor   \T1, \GH, \GH                   # the result is in GH
938
939
940.endm
941
942.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
943
        # HashKey_i_k holds the XORed low and high halves of HashKey_i (for Karatsuba)
945        vmovdqa  \HK, \T5
946
947        vpshufd  $0b01001110, \T5, \T1
948        vpxor    \T5, \T1, \T1
949        vmovdqu  \T1, HashKey_k(arg2)
950
951        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
952        vmovdqu  \T5, HashKey_2(arg2)                    #  [HashKey_2] = HashKey^2<<1 mod poly
953        vpshufd  $0b01001110, \T5, \T1
954        vpxor    \T5, \T1, \T1
955        vmovdqu  \T1, HashKey_2_k(arg2)
956
957        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
958        vmovdqu  \T5, HashKey_3(arg2)
959        vpshufd  $0b01001110, \T5, \T1
960        vpxor    \T5, \T1, \T1
961        vmovdqu  \T1, HashKey_3_k(arg2)
962
963        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
964        vmovdqu  \T5, HashKey_4(arg2)
965        vpshufd  $0b01001110, \T5, \T1
966        vpxor    \T5, \T1, \T1
967        vmovdqu  \T1, HashKey_4_k(arg2)
968
969        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
970        vmovdqu  \T5, HashKey_5(arg2)
971        vpshufd  $0b01001110, \T5, \T1
972        vpxor    \T5, \T1, \T1
973        vmovdqu  \T1, HashKey_5_k(arg2)
974
975        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
976        vmovdqu  \T5, HashKey_6(arg2)
977        vpshufd  $0b01001110, \T5, \T1
978        vpxor    \T5, \T1, \T1
979        vmovdqu  \T1, HashKey_6_k(arg2)
980
981        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
982        vmovdqu  \T5, HashKey_7(arg2)
983        vpshufd  $0b01001110, \T5, \T1
984        vpxor    \T5, \T1, \T1
985        vmovdqu  \T1, HashKey_7_k(arg2)
986
987        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
988        vmovdqu  \T5, HashKey_8(arg2)
989        vpshufd  $0b01001110, \T5, \T1
990        vpxor    \T5, \T1, \T1
991        vmovdqu  \T1, HashKey_8_k(arg2)
992
993.endm
994
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as pointers only, not modified
1001
1002.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
1003	i = (8-\num_initial_blocks)
1004	setreg
1005        vmovdqu AadHash(arg2), reg_i
1006
1007	# start AES for num_initial_blocks blocks
1008	vmovdqu CurCount(arg2), \CTR
1009
1010	i = (9-\num_initial_blocks)
1011	setreg
1012.rep \num_initial_blocks
1013                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
1014                vmovdqa \CTR, reg_i
1015                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
1016	i = (i+1)
1017	setreg
1018.endr
1019
1020	vmovdqa  (arg1), \T_key
1021	i = (9-\num_initial_blocks)
1022	setreg
1023.rep \num_initial_blocks
1024                vpxor   \T_key, reg_i, reg_i
1025	i = (i+1)
1026	setreg
1027.endr
1028
1029       j = 1
1030       setreg
1031.rep \REP
1032       vmovdqa  16*j(arg1), \T_key
1033	i = (9-\num_initial_blocks)
1034	setreg
1035.rep \num_initial_blocks
1036        vaesenc \T_key, reg_i, reg_i
1037	i = (i+1)
1038	setreg
1039.endr
1040
1041       j = (j+1)
1042       setreg
1043.endr
1044
1045	vmovdqa  16*j(arg1), \T_key
1046	i = (9-\num_initial_blocks)
1047	setreg
1048.rep \num_initial_blocks
1049        vaesenclast      \T_key, reg_i, reg_i
1050	i = (i+1)
1051	setreg
1052.endr
1053
1054	i = (9-\num_initial_blocks)
1055	setreg
1056.rep \num_initial_blocks
1057                vmovdqu (arg4, %r11), \T1
1058                vpxor   \T1, reg_i, reg_i
1059                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for num_initial_blocks blocks
1060                add     $16, %r11
1061.if  \ENC_DEC == DEC
1062                vmovdqa \T1, reg_i
1063.endif
1064                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
1065	i = (i+1)
1066	setreg
1067.endr
1068
1069
1070	i = (8-\num_initial_blocks)
1071	j = (9-\num_initial_blocks)
1072	setreg
1073
1074.rep \num_initial_blocks
1075        vpxor    reg_i, reg_j, reg_j
1076        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1077	i = (i+1)
1078	j = (j+1)
1079	setreg
1080.endr
1081        # XMM8 has the combined result here
1082
1083        vmovdqa  \XMM8, TMP1(%rsp)
1084        vmovdqa  \XMM8, \T3
1085
1086        cmp     $128, %r13
1087        jl      _initial_blocks_done\@                  # no need for precomputed constants
1088
1089###############################################################################
# HashKey_i_k holds the XORed low and high halves of HashKey_i
1091                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1092                vmovdqa  \CTR, \XMM1
1093                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
1094
1095                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1096                vmovdqa  \CTR, \XMM2
1097                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
1098
1099                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1100                vmovdqa  \CTR, \XMM3
1101                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
1102
1103                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1104                vmovdqa  \CTR, \XMM4
1105                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
1106
1107                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1108                vmovdqa  \CTR, \XMM5
1109                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
1110
1111                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1112                vmovdqa  \CTR, \XMM6
1113                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
1114
1115                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1116                vmovdqa  \CTR, \XMM7
1117                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
1118
1119                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1120                vmovdqa  \CTR, \XMM8
1121                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
1122
1123                vmovdqa  (arg1), \T_key
1124                vpxor    \T_key, \XMM1, \XMM1
1125                vpxor    \T_key, \XMM2, \XMM2
1126                vpxor    \T_key, \XMM3, \XMM3
1127                vpxor    \T_key, \XMM4, \XMM4
1128                vpxor    \T_key, \XMM5, \XMM5
1129                vpxor    \T_key, \XMM6, \XMM6
1130                vpxor    \T_key, \XMM7, \XMM7
1131                vpxor    \T_key, \XMM8, \XMM8
1132
1133               i = 1
1134               setreg
1135.rep    \REP       # do REP rounds
1136                vmovdqa  16*i(arg1), \T_key
1137                vaesenc  \T_key, \XMM1, \XMM1
1138                vaesenc  \T_key, \XMM2, \XMM2
1139                vaesenc  \T_key, \XMM3, \XMM3
1140                vaesenc  \T_key, \XMM4, \XMM4
1141                vaesenc  \T_key, \XMM5, \XMM5
1142                vaesenc  \T_key, \XMM6, \XMM6
1143                vaesenc  \T_key, \XMM7, \XMM7
1144                vaesenc  \T_key, \XMM8, \XMM8
1145               i = (i+1)
1146               setreg
1147.endr
1148
1149                vmovdqa  16*i(arg1), \T_key
1150                vaesenclast  \T_key, \XMM1, \XMM1
1151                vaesenclast  \T_key, \XMM2, \XMM2
1152                vaesenclast  \T_key, \XMM3, \XMM3
1153                vaesenclast  \T_key, \XMM4, \XMM4
1154                vaesenclast  \T_key, \XMM5, \XMM5
1155                vaesenclast  \T_key, \XMM6, \XMM6
1156                vaesenclast  \T_key, \XMM7, \XMM7
1157                vaesenclast  \T_key, \XMM8, \XMM8
1158
1159                vmovdqu  (arg4, %r11), \T1
1160                vpxor    \T1, \XMM1, \XMM1
1161                vmovdqu  \XMM1, (arg3 , %r11)
1162                .if   \ENC_DEC == DEC
1163                vmovdqa  \T1, \XMM1
1164                .endif
1165
1166                vmovdqu  16*1(arg4, %r11), \T1
1167                vpxor    \T1, \XMM2, \XMM2
1168                vmovdqu  \XMM2, 16*1(arg3 , %r11)
1169                .if   \ENC_DEC == DEC
1170                vmovdqa  \T1, \XMM2
1171                .endif
1172
1173                vmovdqu  16*2(arg4, %r11), \T1
1174                vpxor    \T1, \XMM3, \XMM3
1175                vmovdqu  \XMM3, 16*2(arg3 , %r11)
1176                .if   \ENC_DEC == DEC
1177                vmovdqa  \T1, \XMM3
1178                .endif
1179
1180                vmovdqu  16*3(arg4, %r11), \T1
1181                vpxor    \T1, \XMM4, \XMM4
1182                vmovdqu  \XMM4, 16*3(arg3 , %r11)
1183                .if   \ENC_DEC == DEC
1184                vmovdqa  \T1, \XMM4
1185                .endif
1186
1187                vmovdqu  16*4(arg4, %r11), \T1
1188                vpxor    \T1, \XMM5, \XMM5
1189                vmovdqu  \XMM5, 16*4(arg3 , %r11)
1190                .if   \ENC_DEC == DEC
1191                vmovdqa  \T1, \XMM5
1192                .endif
1193
1194                vmovdqu  16*5(arg4, %r11), \T1
1195                vpxor    \T1, \XMM6, \XMM6
1196                vmovdqu  \XMM6, 16*5(arg3 , %r11)
1197                .if   \ENC_DEC == DEC
1198                vmovdqa  \T1, \XMM6
1199                .endif
1200
1201                vmovdqu  16*6(arg4, %r11), \T1
1202                vpxor    \T1, \XMM7, \XMM7
1203                vmovdqu  \XMM7, 16*6(arg3 , %r11)
1204                .if   \ENC_DEC == DEC
1205                vmovdqa  \T1, \XMM7
1206                .endif
1207
1208                vmovdqu  16*7(arg4, %r11), \T1
1209                vpxor    \T1, \XMM8, \XMM8
1210                vmovdqu  \XMM8, 16*7(arg3 , %r11)
1211                .if   \ENC_DEC == DEC
1212                vmovdqa  \T1, \XMM8
1213                .endif
1214
1215                add     $128, %r11
1216
1217                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
1218                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
1219                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
1220                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
1221                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
1222                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
1223                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
1224                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
1225                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
1226
1227###############################################################################
1228
1229_initial_blocks_done\@:
1230
1231.endm
1232
1233# encrypt 8 blocks at a time
1234# ghash the 8 previously encrypted ciphertext blocks
1235# arg1, arg3, arg4 are used as pointers only, not modified
1236# r11 is the data offset value
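# The macro software-pipelines the two halves of GCM: the AES rounds for the
# next 8 counter blocks are interleaved with the GHASH (Karatsuba multiplies
# against HashKey_8 .. HashKey) of the 8 ciphertext blocks produced on the
# previous iteration and saved in \T2 and TMP2-TMP8; the partial products
# accumulate in T4 (high), T7 (low) and T6 (middle) ahead of the reduction.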
1237.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1238
1239        vmovdqa \XMM1, \T2
1240        vmovdqa \XMM2, TMP2(%rsp)
1241        vmovdqa \XMM3, TMP3(%rsp)
1242        vmovdqa \XMM4, TMP4(%rsp)
1243        vmovdqa \XMM5, TMP5(%rsp)
1244        vmovdqa \XMM6, TMP6(%rsp)
1245        vmovdqa \XMM7, TMP7(%rsp)
1246        vmovdqa \XMM8, TMP8(%rsp)
1247
1248.if \loop_idx == in_order
1249                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
1250                vpaddd  ONE(%rip), \XMM1, \XMM2
1251                vpaddd  ONE(%rip), \XMM2, \XMM3
1252                vpaddd  ONE(%rip), \XMM3, \XMM4
1253                vpaddd  ONE(%rip), \XMM4, \XMM5
1254                vpaddd  ONE(%rip), \XMM5, \XMM6
1255                vpaddd  ONE(%rip), \XMM6, \XMM7
1256                vpaddd  ONE(%rip), \XMM7, \XMM8
1257                vmovdqa \XMM8, \CTR
1258
1259                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
1260                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
1261                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
1262                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
1263                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
1264                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
1265                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
1266                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
1267.else
1268                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
1269                vpaddd  ONEf(%rip), \XMM1, \XMM2
1270                vpaddd  ONEf(%rip), \XMM2, \XMM3
1271                vpaddd  ONEf(%rip), \XMM3, \XMM4
1272                vpaddd  ONEf(%rip), \XMM4, \XMM5
1273                vpaddd  ONEf(%rip), \XMM5, \XMM6
1274                vpaddd  ONEf(%rip), \XMM6, \XMM7
1275                vpaddd  ONEf(%rip), \XMM7, \XMM8
1276                vmovdqa \XMM8, \CTR
1277.endif
1278
1279
1280        #######################################################################
1281
1282                vmovdqu (arg1), \T1
1283                vpxor   \T1, \XMM1, \XMM1
1284                vpxor   \T1, \XMM2, \XMM2
1285                vpxor   \T1, \XMM3, \XMM3
1286                vpxor   \T1, \XMM4, \XMM4
1287                vpxor   \T1, \XMM5, \XMM5
1288                vpxor   \T1, \XMM6, \XMM6
1289                vpxor   \T1, \XMM7, \XMM7
1290                vpxor   \T1, \XMM8, \XMM8
1291
1292        #######################################################################
1293
1294
1295
1296
1297
1298                vmovdqu 16*1(arg1), \T1
1299                vaesenc \T1, \XMM1, \XMM1
1300                vaesenc \T1, \XMM2, \XMM2
1301                vaesenc \T1, \XMM3, \XMM3
1302                vaesenc \T1, \XMM4, \XMM4
1303                vaesenc \T1, \XMM5, \XMM5
1304                vaesenc \T1, \XMM6, \XMM6
1305                vaesenc \T1, \XMM7, \XMM7
1306                vaesenc \T1, \XMM8, \XMM8
1307
1308                vmovdqu 16*2(arg1), \T1
1309                vaesenc \T1, \XMM1, \XMM1
1310                vaesenc \T1, \XMM2, \XMM2
1311                vaesenc \T1, \XMM3, \XMM3
1312                vaesenc \T1, \XMM4, \XMM4
1313                vaesenc \T1, \XMM5, \XMM5
1314                vaesenc \T1, \XMM6, \XMM6
1315                vaesenc \T1, \XMM7, \XMM7
1316                vaesenc \T1, \XMM8, \XMM8
1317
1318
1319        #######################################################################
1320
1321        vmovdqu         HashKey_8(arg2), \T5
1322        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
1323        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
1324
1325        vpshufd         $0b01001110, \T2, \T6
1326        vpxor           \T2, \T6, \T6
1327
1328        vmovdqu         HashKey_8_k(arg2), \T5
1329        vpclmulqdq      $0x00, \T5, \T6, \T6
1330
1331                vmovdqu 16*3(arg1), \T1
1332                vaesenc \T1, \XMM1, \XMM1
1333                vaesenc \T1, \XMM2, \XMM2
1334                vaesenc \T1, \XMM3, \XMM3
1335                vaesenc \T1, \XMM4, \XMM4
1336                vaesenc \T1, \XMM5, \XMM5
1337                vaesenc \T1, \XMM6, \XMM6
1338                vaesenc \T1, \XMM7, \XMM7
1339                vaesenc \T1, \XMM8, \XMM8
1340
1341        vmovdqa         TMP2(%rsp), \T1
1342        vmovdqu         HashKey_7(arg2), \T5
1343        vpclmulqdq      $0x11, \T5, \T1, \T3
1344        vpxor           \T3, \T4, \T4
1345        vpclmulqdq      $0x00, \T5, \T1, \T3
1346        vpxor           \T3, \T7, \T7
1347
1348        vpshufd         $0b01001110, \T1, \T3
1349        vpxor           \T1, \T3, \T3
1350        vmovdqu         HashKey_7_k(arg2), \T5
1351        vpclmulqdq      $0x10, \T5, \T3, \T3
1352        vpxor           \T3, \T6, \T6
1353
1354                vmovdqu 16*4(arg1), \T1
1355                vaesenc \T1, \XMM1, \XMM1
1356                vaesenc \T1, \XMM2, \XMM2
1357                vaesenc \T1, \XMM3, \XMM3
1358                vaesenc \T1, \XMM4, \XMM4
1359                vaesenc \T1, \XMM5, \XMM5
1360                vaesenc \T1, \XMM6, \XMM6
1361                vaesenc \T1, \XMM7, \XMM7
1362                vaesenc \T1, \XMM8, \XMM8
1363
1364        #######################################################################
1365
1366        vmovdqa         TMP3(%rsp), \T1
1367        vmovdqu         HashKey_6(arg2), \T5
1368        vpclmulqdq      $0x11, \T5, \T1, \T3
1369        vpxor           \T3, \T4, \T4
1370        vpclmulqdq      $0x00, \T5, \T1, \T3
1371        vpxor           \T3, \T7, \T7
1372
1373        vpshufd         $0b01001110, \T1, \T3
1374        vpxor           \T1, \T3, \T3
1375        vmovdqu         HashKey_6_k(arg2), \T5
1376        vpclmulqdq      $0x10, \T5, \T3, \T3
1377        vpxor           \T3, \T6, \T6
1378
1379                vmovdqu 16*5(arg1), \T1
1380                vaesenc \T1, \XMM1, \XMM1
1381                vaesenc \T1, \XMM2, \XMM2
1382                vaesenc \T1, \XMM3, \XMM3
1383                vaesenc \T1, \XMM4, \XMM4
1384                vaesenc \T1, \XMM5, \XMM5
1385                vaesenc \T1, \XMM6, \XMM6
1386                vaesenc \T1, \XMM7, \XMM7
1387                vaesenc \T1, \XMM8, \XMM8
1388
1389        vmovdqa         TMP4(%rsp), \T1
1390        vmovdqu         HashKey_5(arg2), \T5
1391        vpclmulqdq      $0x11, \T5, \T1, \T3
1392        vpxor           \T3, \T4, \T4
1393        vpclmulqdq      $0x00, \T5, \T1, \T3
1394        vpxor           \T3, \T7, \T7
1395
1396        vpshufd         $0b01001110, \T1, \T3
1397        vpxor           \T1, \T3, \T3
1398        vmovdqu         HashKey_5_k(arg2), \T5
1399        vpclmulqdq      $0x10, \T5, \T3, \T3
1400        vpxor           \T3, \T6, \T6
1401
1402                vmovdqu 16*6(arg1), \T1
1403                vaesenc \T1, \XMM1, \XMM1
1404                vaesenc \T1, \XMM2, \XMM2
1405                vaesenc \T1, \XMM3, \XMM3
1406                vaesenc \T1, \XMM4, \XMM4
1407                vaesenc \T1, \XMM5, \XMM5
1408                vaesenc \T1, \XMM6, \XMM6
1409                vaesenc \T1, \XMM7, \XMM7
1410                vaesenc \T1, \XMM8, \XMM8
1411
1412
1413        vmovdqa         TMP5(%rsp), \T1
1414        vmovdqu         HashKey_4(arg2), \T5
1415        vpclmulqdq      $0x11, \T5, \T1, \T3
1416        vpxor           \T3, \T4, \T4
1417        vpclmulqdq      $0x00, \T5, \T1, \T3
1418        vpxor           \T3, \T7, \T7
1419
1420        vpshufd         $0b01001110, \T1, \T3
1421        vpxor           \T1, \T3, \T3
1422        vmovdqu         HashKey_4_k(arg2), \T5
1423        vpclmulqdq      $0x10, \T5, \T3, \T3
1424        vpxor           \T3, \T6, \T6
1425
1426                vmovdqu 16*7(arg1), \T1
1427                vaesenc \T1, \XMM1, \XMM1
1428                vaesenc \T1, \XMM2, \XMM2
1429                vaesenc \T1, \XMM3, \XMM3
1430                vaesenc \T1, \XMM4, \XMM4
1431                vaesenc \T1, \XMM5, \XMM5
1432                vaesenc \T1, \XMM6, \XMM6
1433                vaesenc \T1, \XMM7, \XMM7
1434                vaesenc \T1, \XMM8, \XMM8
1435
1436        vmovdqa         TMP6(%rsp), \T1
1437        vmovdqu         HashKey_3(arg2), \T5
1438        vpclmulqdq      $0x11, \T5, \T1, \T3
1439        vpxor           \T3, \T4, \T4
1440        vpclmulqdq      $0x00, \T5, \T1, \T3
1441        vpxor           \T3, \T7, \T7
1442
1443        vpshufd         $0b01001110, \T1, \T3
1444        vpxor           \T1, \T3, \T3
1445        vmovdqu         HashKey_3_k(arg2), \T5
1446        vpclmulqdq      $0x10, \T5, \T3, \T3
1447        vpxor           \T3, \T6, \T6
1448
1449
1450                vmovdqu 16*8(arg1), \T1
1451                vaesenc \T1, \XMM1, \XMM1
1452                vaesenc \T1, \XMM2, \XMM2
1453                vaesenc \T1, \XMM3, \XMM3
1454                vaesenc \T1, \XMM4, \XMM4
1455                vaesenc \T1, \XMM5, \XMM5
1456                vaesenc \T1, \XMM6, \XMM6
1457                vaesenc \T1, \XMM7, \XMM7
1458                vaesenc \T1, \XMM8, \XMM8
1459
1460        vmovdqa         TMP7(%rsp), \T1
1461        vmovdqu         HashKey_2(arg2), \T5
1462        vpclmulqdq      $0x11, \T5, \T1, \T3
1463        vpxor           \T3, \T4, \T4
1464        vpclmulqdq      $0x00, \T5, \T1, \T3
1465        vpxor           \T3, \T7, \T7
1466
1467        vpshufd         $0b01001110, \T1, \T3
1468        vpxor           \T1, \T3, \T3
1469        vmovdqu         HashKey_2_k(arg2), \T5
1470        vpclmulqdq      $0x10, \T5, \T3, \T3
1471        vpxor           \T3, \T6, \T6
1472
1473        #######################################################################
1474
1475                vmovdqu 16*9(arg1), \T5
1476                vaesenc \T5, \XMM1, \XMM1
1477                vaesenc \T5, \XMM2, \XMM2
1478                vaesenc \T5, \XMM3, \XMM3
1479                vaesenc \T5, \XMM4, \XMM4
1480                vaesenc \T5, \XMM5, \XMM5
1481                vaesenc \T5, \XMM6, \XMM6
1482                vaesenc \T5, \XMM7, \XMM7
1483                vaesenc \T5, \XMM8, \XMM8
1484
1485        vmovdqa         TMP8(%rsp), \T1
1486        vmovdqu         HashKey(arg2), \T5
1487        vpclmulqdq      $0x11, \T5, \T1, \T3
1488        vpxor           \T3, \T4, \T4
1489        vpclmulqdq      $0x00, \T5, \T1, \T3
1490        vpxor           \T3, \T7, \T7
1491
1492        vpshufd         $0b01001110, \T1, \T3
1493        vpxor           \T1, \T3, \T3
1494        vmovdqu         HashKey_k(arg2), \T5
1495        vpclmulqdq      $0x10, \T5, \T3, \T3
1496        vpxor           \T3, \T6, \T6
1497
1498        vpxor           \T4, \T6, \T6
1499        vpxor           \T7, \T6, \T6
1500
1501                vmovdqu 16*10(arg1), \T5
1502
1503        i = 11
1504        setreg
1505.rep (\REP-9)
1506
1507        vaesenc \T5, \XMM1, \XMM1
1508        vaesenc \T5, \XMM2, \XMM2
1509        vaesenc \T5, \XMM3, \XMM3
1510        vaesenc \T5, \XMM4, \XMM4
1511        vaesenc \T5, \XMM5, \XMM5
1512        vaesenc \T5, \XMM6, \XMM6
1513        vaesenc \T5, \XMM7, \XMM7
1514        vaesenc \T5, \XMM8, \XMM8
1515
1516        vmovdqu 16*i(arg1), \T5
1517        i = i + 1
1518        setreg
1519.endr
1520
1521	i = 0
1522	j = 1
1523	setreg
1524.rep 8
1525		vpxor	16*i(arg4, %r11), \T5, \T2
1526                .if \ENC_DEC == ENC
1527                vaesenclast     \T2, reg_j, reg_j
1528                .else
1529                vaesenclast     \T2, reg_j, \T3
1530                vmovdqu 16*i(arg4, %r11), reg_j
1531                vmovdqu \T3, 16*i(arg3, %r11)
1532                .endif
1533	i = (i+1)
1534	j = (j+1)
1535	setreg
1536.endr
1537	#######################################################################
1538
1539
1540	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
1541	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
1542	vpxor	\T3, \T7, \T7
1543	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
1544
1545
1546
1547	#######################################################################
1548	#first phase of the reduction
1549	#######################################################################
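	# (explanatory note, not in the original: the shift distances in this
	#  reduction come from the GHASH polynomial
	#  g(x) = x^128 + x^127 + x^126 + x^121 + 1; folding the low half of
	#  the 256-bit product needs copies offset by 1, 2 and 7 bits for the
	#  x^127, x^126 and x^121 terms, and the dword shifts by 31 = 32-1,
	#  30 = 32-2 and 25 = 32-7 below, together with the byte shifts that
	#  follow, assemble exactly those offsets)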
1550        vpslld  $31, \T7, \T2                           # packed left shifting << 31
1551        vpslld  $30, \T7, \T3                           # packed left shifting << 30
1552        vpslld  $25, \T7, \T4                           # packed left shifting << 25
1553
1554        vpxor   \T3, \T2, \T2                           # xor the shifted versions
1555        vpxor   \T4, \T2, \T2
1556
1557        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
1558
1559        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
1560        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
1561	#######################################################################
1562                .if \ENC_DEC == ENC
1563		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
1564		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
1565		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
1566		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
1567		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
1568		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
1569		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
1570		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
1571                .endif
1572
1573	#######################################################################
1574	#second phase of the reduction
1575        vpsrld  $1, \T7, \T2                            # packed right shifting >> 1
1576        vpsrld  $2, \T7, \T3                            # packed right shifting >> 2
1577        vpsrld  $7, \T7, \T4                            # packed right shifting >> 7
1578        vpxor   \T3, \T2, \T2                           # xor the shifted versions
1579        vpxor   \T4, \T2, \T2
1580
1581        vpxor   \T1, \T2, \T2
1582        vpxor   \T2, \T7, \T7
1583        vpxor   \T7, \T6, \T6                           # the result is in T6
1584	#######################################################################
1585
1586		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
1587		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
1588		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
1589		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
1590		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
1591		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
1592		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
1593		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
1594
1595
1596	vpxor	\T6, \XMM1, \XMM1
1597
1598
1599
1600.endm
1601
1602
1603# GHASH the last 8 ciphertext blocks.
1604.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1605
1606        ## Karatsuba Method
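        ## (explanatory reminder, not in the original: writing each 128-bit
        ##  operand as hi:lo, carry-less Karatsuba uses
        ##      A*B = A_hi*B_hi*x^128 ^ A_lo*B_lo
        ##            ^ ((A_hi^A_lo)*(B_hi^B_lo) ^ A_hi*B_hi ^ A_lo*B_lo)*x^64
        ##  so only three vpclmulqdq are needed per block; the HashKey_i_k
        ##  values loaded below are the precomputed XORs of the two 64-bit
        ##  halves of the corresponding HashKey powers)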
1607
1608
1609        vpshufd         $0b01001110, \XMM1, \T2
1610        vpxor           \XMM1, \T2, \T2
1611        vmovdqu         HashKey_8(arg2), \T5
1612        vpclmulqdq      $0x11, \T5, \XMM1, \T6
1613        vpclmulqdq      $0x00, \T5, \XMM1, \T7
1614
1615        vmovdqu         HashKey_8_k(arg2), \T3
1616        vpclmulqdq      $0x00, \T3, \T2, \XMM1
1617
1618        ######################
1619
1620        vpshufd         $0b01001110, \XMM2, \T2
1621        vpxor           \XMM2, \T2, \T2
1622        vmovdqu         HashKey_7(arg2), \T5
1623        vpclmulqdq      $0x11, \T5, \XMM2, \T4
1624        vpxor           \T4, \T6, \T6
1625
1626        vpclmulqdq      $0x00, \T5, \XMM2, \T4
1627        vpxor           \T4, \T7, \T7
1628
1629        vmovdqu         HashKey_7_k(arg2), \T3
1630        vpclmulqdq      $0x00, \T3, \T2, \T2
1631        vpxor           \T2, \XMM1, \XMM1
1632
1633        ######################
1634
1635        vpshufd         $0b01001110, \XMM3, \T2
1636        vpxor           \XMM3, \T2, \T2
1637        vmovdqu         HashKey_6(arg2), \T5
1638        vpclmulqdq      $0x11, \T5, \XMM3, \T4
1639        vpxor           \T4, \T6, \T6
1640
1641        vpclmulqdq      $0x00, \T5, \XMM3, \T4
1642        vpxor           \T4, \T7, \T7
1643
1644        vmovdqu         HashKey_6_k(arg2), \T3
1645        vpclmulqdq      $0x00, \T3, \T2, \T2
1646        vpxor           \T2, \XMM1, \XMM1
1647
1648        ######################
1649
1650        vpshufd         $0b01001110, \XMM4, \T2
1651        vpxor           \XMM4, \T2, \T2
1652        vmovdqu         HashKey_5(arg2), \T5
1653        vpclmulqdq      $0x11, \T5, \XMM4, \T4
1654        vpxor           \T4, \T6, \T6
1655
1656        vpclmulqdq      $0x00, \T5, \XMM4, \T4
1657        vpxor           \T4, \T7, \T7
1658
1659        vmovdqu         HashKey_5_k(arg2), \T3
1660        vpclmulqdq      $0x00, \T3, \T2, \T2
1661        vpxor           \T2, \XMM1, \XMM1
1662
1663        ######################
1664
1665        vpshufd         $0b01001110, \XMM5, \T2
1666        vpxor           \XMM5, \T2, \T2
1667        vmovdqu         HashKey_4(arg2), \T5
1668        vpclmulqdq      $0x11, \T5, \XMM5, \T4
1669        vpxor           \T4, \T6, \T6
1670
1671        vpclmulqdq      $0x00, \T5, \XMM5, \T4
1672        vpxor           \T4, \T7, \T7
1673
1674        vmovdqu         HashKey_4_k(arg2), \T3
1675        vpclmulqdq      $0x00, \T3, \T2, \T2
1676        vpxor           \T2, \XMM1, \XMM1
1677
1678        ######################
1679
1680        vpshufd         $0b01001110, \XMM6, \T2
1681        vpxor           \XMM6, \T2, \T2
1682        vmovdqu         HashKey_3(arg2), \T5
1683        vpclmulqdq      $0x11, \T5, \XMM6, \T4
1684        vpxor           \T4, \T6, \T6
1685
1686        vpclmulqdq      $0x00, \T5, \XMM6, \T4
1687        vpxor           \T4, \T7, \T7
1688
1689        vmovdqu         HashKey_3_k(arg2), \T3
1690        vpclmulqdq      $0x00, \T3, \T2, \T2
1691        vpxor           \T2, \XMM1, \XMM1
1692
1693        ######################
1694
1695        vpshufd         $0b01001110, \XMM7, \T2
1696        vpxor           \XMM7, \T2, \T2
1697        vmovdqu         HashKey_2(arg2), \T5
1698        vpclmulqdq      $0x11, \T5, \XMM7, \T4
1699        vpxor           \T4, \T6, \T6
1700
1701        vpclmulqdq      $0x00, \T5, \XMM7, \T4
1702        vpxor           \T4, \T7, \T7
1703
1704        vmovdqu         HashKey_2_k(arg2), \T3
1705        vpclmulqdq      $0x00, \T3, \T2, \T2
1706        vpxor           \T2, \XMM1, \XMM1
1707
1708        ######################
1709
1710        vpshufd         $0b01001110, \XMM8, \T2
1711        vpxor           \XMM8, \T2, \T2
1712        vmovdqu         HashKey(arg2), \T5
1713        vpclmulqdq      $0x11, \T5, \XMM8, \T4
1714        vpxor           \T4, \T6, \T6
1715
1716        vpclmulqdq      $0x00, \T5, \XMM8, \T4
1717        vpxor           \T4, \T7, \T7
1718
1719        vmovdqu         HashKey_k(arg2), \T3
1720        vpclmulqdq      $0x00, \T3, \T2, \T2
1721
1722        vpxor           \T2, \XMM1, \XMM1
1723        vpxor           \T6, \XMM1, \XMM1
1724        vpxor           \T7, \XMM1, \T2
1725
1726
1727
1728
1729        vpslldq $8, \T2, \T4
1730        vpsrldq $8, \T2, \T2
1731
1732        vpxor   \T4, \T7, \T7
1733        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
1734				# the accumulated carry-less multiplications
1735
1736        #######################################################################
1737        #first phase of the reduction
1738        vpslld  $31, \T7, \T2   # packed left shifting << 31
1739        vpslld  $30, \T7, \T3   # packed left shifting << 30
1740        vpslld  $25, \T7, \T4   # packed left shifting << 25
1741
1742        vpxor   \T3, \T2, \T2   # xor the shifted versions
1743        vpxor   \T4, \T2, \T2
1744
1745        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
1746
1747        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
1748        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
1749        #######################################################################
1750
1751
1752        #second phase of the reduction
1753        vpsrld  $1, \T7, \T2    # packed right shifting >> 1
1754        vpsrld  $2, \T7, \T3    # packed right shifting >> 2
1755        vpsrld  $7, \T7, \T4    # packed right shifting >> 7
1756        vpxor   \T3, \T2, \T2   # xor the shifted versions
1757        vpxor   \T4, \T2, \T2
1758
1759        vpxor   \T1, \T2, \T2
1760        vpxor   \T2, \T7, \T7
1761        vpxor   \T7, \T6, \T6   # the result is in T6
1762
1763.endm
1764
1765#############################################################
1766#void   aesni_gcm_init_avx_gen2
1767#        (gcm_data     *my_ctx_data,
1768#         gcm_context_data *data,
1769#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1770#			(from Security Association) concatenated with 8 byte
1771#			Initialisation Vector (from IPSec ESP Payload)
1772#			concatenated with 0x00000001. 16-byte aligned pointer. */
1773#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1774#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1775#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1776#############################################################
1777SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1778        FUNC_SAVE
1779        INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1780        FUNC_RESTORE
1781        ret
1782SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1783
1784###############################################################################
1785#void   aesni_gcm_enc_update_avx_gen2(
1786#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1787#        gcm_context_data *data,
1788#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
1789#        const   u8 *in, /* Plaintext input */
1790#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
1791###############################################################################
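# Note: the trailing GCM_ENC_DEC argument below is the number of full vaesenc
# rounds per block: 9 for AES-128 (keysize 16), 11 for AES-192 and 13 for
# AES-256 (keysize 32); the initial key whitening and the final vaesenclast
# round are handled separately inside the macros.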
1792SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1793        FUNC_SAVE
1794        mov     keysize, %eax
1795        cmp     $32, %eax
1796        je      key_256_enc_update
1797        cmp     $16, %eax
1798        je      key_128_enc_update
1799        # must be 192
1800        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1801        FUNC_RESTORE
1802        ret
1803key_128_enc_update:
1804        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1805        FUNC_RESTORE
1806        ret
1807key_256_enc_update:
1808        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1809        FUNC_RESTORE
1810        ret
1811SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1812
1813###############################################################################
1814#void   aesni_gcm_dec_update_avx_gen2(
1815#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1816#        gcm_context_data *data,
1817#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
1818#        const   u8 *in, /* Ciphertext input */
1819#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
1820###############################################################################
1821SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1822        FUNC_SAVE
1823        mov     keysize,%eax
1824        cmp     $32, %eax
1825        je      key_256_dec_update
1826        cmp     $16, %eax
1827        je      key_128_dec_update
1828        # must be 192
1829        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1830        FUNC_RESTORE
1831        ret
1832key_128_dec_update:
1833        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1834        FUNC_RESTORE
1835        ret
1836key_256_dec_update:
1837        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1838        FUNC_RESTORE
1839        ret
1840SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1841
1842###############################################################################
1843#void   aesni_gcm_finalize_avx_gen2(
1844#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1845#        gcm_context_data *data,
1846#        u8      *auth_tag, /* Authenticated Tag output. */
1847#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
1848#				Valid values are 16 (most likely), 12 or 8. */
1849###############################################################################
1850SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1851        FUNC_SAVE
1852        mov	keysize,%eax
1853        cmp     $32, %eax
1854        je      key_256_finalize
1855        cmp     $16, %eax
1856        je      key_128_finalize
1857        # must be 192
1858        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1859        FUNC_RESTORE
1860        ret
1861key_128_finalize:
1862        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1863        FUNC_RESTORE
1864        ret
1865key_256_finalize:
1866        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1867        FUNC_RESTORE
1868        ret
1869SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
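###############################################################################
# Illustrative call sequence (explanatory sketch only, derived from the
# prototype comments above; ctx, gdata and the buffer names are placeholders
# rather than symbols defined in this file):
#
#     aesni_gcm_init_avx_gen2(ctx, gdata, iv, hash_subkey, aad, aad_len);
#     aesni_gcm_enc_update_avx_gen2(ctx, gdata, out, in, plaintext_len);
#     /* enc_update may be called again for further data */
#     aesni_gcm_finalize_avx_gen2(ctx, gdata, auth_tag, auth_tag_len);
#
# Decryption follows the same pattern with aesni_gcm_dec_update_avx_gen2.
###############################################################################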
1870
1871###############################################################################
1872# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1873# Input: A and B (128-bits each, bit-reflected)
1874# Output: C = A*B*x mod poly, (i.e. >>1 )
1875# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1876# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1877###############################################################################
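# (explanatory note: "(128,127,126,121,0)" denotes the GHASH polynomial
#  g(x) = x^128 + x^127 + x^126 + x^121 + 1; unlike the shift-based reduction
#  in the AVX variant above, this variant folds the 256-bit product with
#  carry-less multiplies against the POLY2 constant)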
1878.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1879
1880        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
1881        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
1882        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
1883        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
1884        vpxor           \T3, \GH, \GH
1885
1886
1887        vpsrldq         $8, \GH, \T3           # shift-R T3 2 DWs
1888        vpslldq         $8, \GH, \GH           # shift-L GH 2 DWs
1889
1890        vpxor           \T3, \T1, \T1
1891        vpxor           \T2, \GH, \GH
1892
1893        #######################################################################
1894        #first phase of the reduction
1895        vmovdqa         POLY2(%rip), \T3
1896
1897        vpclmulqdq      $0x01, \GH, \T3, \T2
1898        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
1899
1900        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
1901        #######################################################################
1902        #second phase of the reduction
1903        vpclmulqdq      $0x00, \GH, \T3, \T2
1904        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1905
1906        vpclmulqdq      $0x10, \GH, \T3, \GH
1907        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1908
1909        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
1910        #######################################################################
1911        vpxor           \T1, \GH, \GH          # the result is in GH
1912
1913
1914.endm
1915
1916.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1917
1918        # Precompute HashKey^2 through HashKey^8 (each stored as HashKey^i<<1 mod poly)
1919        vmovdqa  \HK, \T5
1920        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
1921        vmovdqu  \T5, HashKey_2(arg2)                       #  [HashKey_2] = HashKey^2<<1 mod poly
1922
1923        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
1924        vmovdqu  \T5, HashKey_3(arg2)
1925
1926        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
1927        vmovdqu  \T5, HashKey_4(arg2)
1928
1929        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
1930        vmovdqu  \T5, HashKey_5(arg2)
1931
1932        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
1933        vmovdqu  \T5, HashKey_6(arg2)
1934
1935        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
1936        vmovdqu  \T5, HashKey_7(arg2)
1937
1938        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
1939        vmovdqu  \T5, HashKey_8(arg2)
1940
1941.endm
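# (explanatory note: with HashKey^1 .. HashKey^8 precomputed, the main loop
#  folds eight ciphertext blocks into the running GHASH per iteration, pairing
#  the oldest of the eight blocks with HashKey_8 down to the newest with
#  HashKey, as GHASH_8_ENCRYPT_8_PARALLEL_AVX2 below does)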
1942
1943## if a = number of total plaintext bytes
1944## b = floor(a/16)
1945## num_initial_blocks = b mod 8 (see the example below)
1946## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1947## r10, r11, r12, rax are clobbered
1948## arg1, arg3, arg4, r14 are used as pointers only, not modified
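## (illustration: a = 50 plaintext bytes gives b = 3 and num_initial_blocks = 3;
##  the 2 bytes past the last full block are handled later as a partial block)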
1949
1950.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1951	i = (8-\num_initial_blocks)
1952	setreg
1953	vmovdqu AadHash(arg2), reg_i
1954
1955	# start AES for num_initial_blocks blocks
1956	vmovdqu CurCount(arg2), \CTR
1957
1958	i = (9-\num_initial_blocks)
1959	setreg
1960.rep \num_initial_blocks
1961                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
1962                vmovdqa \CTR, reg_i
1963                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
1964	i = (i+1)
1965	setreg
1966.endr
1967
1968	vmovdqa  (arg1), \T_key
1969	i = (9-\num_initial_blocks)
1970	setreg
1971.rep \num_initial_blocks
1972                vpxor   \T_key, reg_i, reg_i
1973	i = (i+1)
1974	setreg
1975.endr
1976
1977	j = 1
1978	setreg
1979.rep \REP
1980	vmovdqa  16*j(arg1), \T_key
1981	i = (9-\num_initial_blocks)
1982	setreg
1983.rep \num_initial_blocks
1984        vaesenc \T_key, reg_i, reg_i
1985	i = (i+1)
1986	setreg
1987.endr
1988
1989	j = (j+1)
1990	setreg
1991.endr
1992
1993
1994	vmovdqa  16*j(arg1), \T_key
1995	i = (9-\num_initial_blocks)
1996	setreg
1997.rep \num_initial_blocks
1998        vaesenclast      \T_key, reg_i, reg_i
1999	i = (i+1)
2000	setreg
2001.endr
2002
2003	i = (9-\num_initial_blocks)
2004	setreg
2005.rep \num_initial_blocks
2006                vmovdqu (arg4, %r11), \T1
2007                vpxor   \T1, reg_i, reg_i
2008                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for
2009						       # num_initial_blocks blocks
2010                add     $16, %r11
2011.if  \ENC_DEC == DEC
2012                vmovdqa \T1, reg_i
2013.endif
2014                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
2015	i = (i+1)
2016	setreg
2017.endr
2018
2019
2020	i = (8-\num_initial_blocks)
2021	j = (9-\num_initial_blocks)
2022	setreg
2023
2024.rep \num_initial_blocks
2025        vpxor    reg_i, reg_j, reg_j
2026        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
2027	i = (i+1)
2028	j = (j+1)
2029	setreg
2030.endr
2031        # XMM8 has the combined result here
2032
2033        vmovdqa  \XMM8, TMP1(%rsp)
2034        vmovdqa  \XMM8, \T3
2035
2036        cmp     $128, %r13
2037        jl      _initial_blocks_done\@                  # no need for precomputed constants
2038
2039###############################################################################
2040# Prepare and AES-encrypt eight counter blocks (XMM1-XMM8) for the first pass of the main loop
2041                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2042                vmovdqa  \CTR, \XMM1
2043                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
2044
2045                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2046                vmovdqa  \CTR, \XMM2
2047                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
2048
2049                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2050                vmovdqa  \CTR, \XMM3
2051                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
2052
2053                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2054                vmovdqa  \CTR, \XMM4
2055                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
2056
2057                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2058                vmovdqa  \CTR, \XMM5
2059                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
2060
2061                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2062                vmovdqa  \CTR, \XMM6
2063                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
2064
2065                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2066                vmovdqa  \CTR, \XMM7
2067                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
2068
2069                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
2070                vmovdqa  \CTR, \XMM8
2071                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
2072
2073                vmovdqa  (arg1), \T_key
2074                vpxor    \T_key, \XMM1, \XMM1
2075                vpxor    \T_key, \XMM2, \XMM2
2076                vpxor    \T_key, \XMM3, \XMM3
2077                vpxor    \T_key, \XMM4, \XMM4
2078                vpxor    \T_key, \XMM5, \XMM5
2079                vpxor    \T_key, \XMM6, \XMM6
2080                vpxor    \T_key, \XMM7, \XMM7
2081                vpxor    \T_key, \XMM8, \XMM8
2082
2083		i = 1
2084		setreg
2085.rep    \REP       # do REP rounds
2086                vmovdqa  16*i(arg1), \T_key
2087                vaesenc  \T_key, \XMM1, \XMM1
2088                vaesenc  \T_key, \XMM2, \XMM2
2089                vaesenc  \T_key, \XMM3, \XMM3
2090                vaesenc  \T_key, \XMM4, \XMM4
2091                vaesenc  \T_key, \XMM5, \XMM5
2092                vaesenc  \T_key, \XMM6, \XMM6
2093                vaesenc  \T_key, \XMM7, \XMM7
2094                vaesenc  \T_key, \XMM8, \XMM8
2095		i = (i+1)
2096		setreg
2097.endr
2098
2099
2100                vmovdqa  16*i(arg1), \T_key
2101                vaesenclast  \T_key, \XMM1, \XMM1
2102                vaesenclast  \T_key, \XMM2, \XMM2
2103                vaesenclast  \T_key, \XMM3, \XMM3
2104                vaesenclast  \T_key, \XMM4, \XMM4
2105                vaesenclast  \T_key, \XMM5, \XMM5
2106                vaesenclast  \T_key, \XMM6, \XMM6
2107                vaesenclast  \T_key, \XMM7, \XMM7
2108                vaesenclast  \T_key, \XMM8, \XMM8
2109
2110                vmovdqu  (arg4, %r11), \T1
2111                vpxor    \T1, \XMM1, \XMM1
2112                vmovdqu  \XMM1, (arg3 , %r11)
2113                .if   \ENC_DEC == DEC
2114                vmovdqa  \T1, \XMM1
2115                .endif
2116
2117                vmovdqu  16*1(arg4, %r11), \T1
2118                vpxor    \T1, \XMM2, \XMM2
2119                vmovdqu  \XMM2, 16*1(arg3 , %r11)
2120                .if   \ENC_DEC == DEC
2121                vmovdqa  \T1, \XMM2
2122                .endif
2123
2124                vmovdqu  16*2(arg4, %r11), \T1
2125                vpxor    \T1, \XMM3, \XMM3
2126                vmovdqu  \XMM3, 16*2(arg3 , %r11)
2127                .if   \ENC_DEC == DEC
2128                vmovdqa  \T1, \XMM3
2129                .endif
2130
2131                vmovdqu  16*3(arg4, %r11), \T1
2132                vpxor    \T1, \XMM4, \XMM4
2133                vmovdqu  \XMM4, 16*3(arg3 , %r11)
2134                .if   \ENC_DEC == DEC
2135                vmovdqa  \T1, \XMM4
2136                .endif
2137
2138                vmovdqu  16*4(arg4, %r11), \T1
2139                vpxor    \T1, \XMM5, \XMM5
2140                vmovdqu  \XMM5, 16*4(arg3 , %r11)
2141                .if   \ENC_DEC == DEC
2142                vmovdqa  \T1, \XMM5
2143                .endif
2144
2145                vmovdqu  16*5(arg4, %r11), \T1
2146                vpxor    \T1, \XMM6, \XMM6
2147                vmovdqu  \XMM6, 16*5(arg3 , %r11)
2148                .if   \ENC_DEC == DEC
2149                vmovdqa  \T1, \XMM6
2150                .endif
2151
2152                vmovdqu  16*6(arg4, %r11), \T1
2153                vpxor    \T1, \XMM7, \XMM7
2154                vmovdqu  \XMM7, 16*6(arg3 , %r11)
2155                .if   \ENC_DEC == DEC
2156                vmovdqa  \T1, \XMM7
2157                .endif
2158
2159                vmovdqu  16*7(arg4, %r11), \T1
2160                vpxor    \T1, \XMM8, \XMM8
2161                vmovdqu  \XMM8, 16*7(arg3 , %r11)
2162                .if   \ENC_DEC == DEC
2163                vmovdqa  \T1, \XMM8
2164                .endif
2165
2166                add     $128, %r11
2167
2168                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2169                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
2170							   # the corresponding ciphertext
2171                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2172                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2173                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2174                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2175                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2176                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2177                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2178
2179###############################################################################
2180
2181_initial_blocks_done\@:
2182
2183
2184.endm
2185
2186
2187
2188# encrypt 8 blocks at a time
2189# ghash the 8 previously encrypted ciphertext blocks
2190# arg1, arg3, arg4 are used as pointers only, not modified
2191# r11 is the data offset value
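# (explanatory note: the vaesenc groups for the current eight counter blocks
#  and the vpclmulqdq groups for the previous eight ciphertext blocks are
#  interleaved throughout this macro so that the two instruction streams
#  overlap)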
2192.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2193
2194        vmovdqa \XMM1, \T2
2195        vmovdqa \XMM2, TMP2(%rsp)
2196        vmovdqa \XMM3, TMP3(%rsp)
2197        vmovdqa \XMM4, TMP4(%rsp)
2198        vmovdqa \XMM5, TMP5(%rsp)
2199        vmovdqa \XMM6, TMP6(%rsp)
2200        vmovdqa \XMM7, TMP7(%rsp)
2201        vmovdqa \XMM8, TMP8(%rsp)
2202
2203.if \loop_idx == in_order
2204                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
2205                vpaddd  ONE(%rip), \XMM1, \XMM2
2206                vpaddd  ONE(%rip), \XMM2, \XMM3
2207                vpaddd  ONE(%rip), \XMM3, \XMM4
2208                vpaddd  ONE(%rip), \XMM4, \XMM5
2209                vpaddd  ONE(%rip), \XMM5, \XMM6
2210                vpaddd  ONE(%rip), \XMM6, \XMM7
2211                vpaddd  ONE(%rip), \XMM7, \XMM8
2212                vmovdqa \XMM8, \CTR
2213
2214                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2215                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2216                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2217                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2218                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2219                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2220                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2221                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2222.else
2223                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
2224                vpaddd  ONEf(%rip), \XMM1, \XMM2
2225                vpaddd  ONEf(%rip), \XMM2, \XMM3
2226                vpaddd  ONEf(%rip), \XMM3, \XMM4
2227                vpaddd  ONEf(%rip), \XMM4, \XMM5
2228                vpaddd  ONEf(%rip), \XMM5, \XMM6
2229                vpaddd  ONEf(%rip), \XMM6, \XMM7
2230                vpaddd  ONEf(%rip), \XMM7, \XMM8
2231                vmovdqa \XMM8, \CTR
2232.endif
2233
2234
2235        #######################################################################
2236
2237                vmovdqu (arg1), \T1
2238                vpxor   \T1, \XMM1, \XMM1
2239                vpxor   \T1, \XMM2, \XMM2
2240                vpxor   \T1, \XMM3, \XMM3
2241                vpxor   \T1, \XMM4, \XMM4
2242                vpxor   \T1, \XMM5, \XMM5
2243                vpxor   \T1, \XMM6, \XMM6
2244                vpxor   \T1, \XMM7, \XMM7
2245                vpxor   \T1, \XMM8, \XMM8
2246
2247        #######################################################################
2248
2249
2250
2251
2252
2253                vmovdqu 16*1(arg1), \T1
2254                vaesenc \T1, \XMM1, \XMM1
2255                vaesenc \T1, \XMM2, \XMM2
2256                vaesenc \T1, \XMM3, \XMM3
2257                vaesenc \T1, \XMM4, \XMM4
2258                vaesenc \T1, \XMM5, \XMM5
2259                vaesenc \T1, \XMM6, \XMM6
2260                vaesenc \T1, \XMM7, \XMM7
2261                vaesenc \T1, \XMM8, \XMM8
2262
2263                vmovdqu 16*2(arg1), \T1
2264                vaesenc \T1, \XMM1, \XMM1
2265                vaesenc \T1, \XMM2, \XMM2
2266                vaesenc \T1, \XMM3, \XMM3
2267                vaesenc \T1, \XMM4, \XMM4
2268                vaesenc \T1, \XMM5, \XMM5
2269                vaesenc \T1, \XMM6, \XMM6
2270                vaesenc \T1, \XMM7, \XMM7
2271                vaesenc \T1, \XMM8, \XMM8
2272
2273
2274        #######################################################################
2275
2276        vmovdqu         HashKey_8(arg2), \T5
2277        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
2278        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
2279        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
2280        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
2281        vpxor           \T5, \T6, \T6
2282
2283                vmovdqu 16*3(arg1), \T1
2284                vaesenc \T1, \XMM1, \XMM1
2285                vaesenc \T1, \XMM2, \XMM2
2286                vaesenc \T1, \XMM3, \XMM3
2287                vaesenc \T1, \XMM4, \XMM4
2288                vaesenc \T1, \XMM5, \XMM5
2289                vaesenc \T1, \XMM6, \XMM6
2290                vaesenc \T1, \XMM7, \XMM7
2291                vaesenc \T1, \XMM8, \XMM8
2292
2293        vmovdqa         TMP2(%rsp), \T1
2294        vmovdqu         HashKey_7(arg2), \T5
2295        vpclmulqdq      $0x11, \T5, \T1, \T3
2296        vpxor           \T3, \T4, \T4
2297
2298        vpclmulqdq      $0x00, \T5, \T1, \T3
2299        vpxor           \T3, \T7, \T7
2300
2301        vpclmulqdq      $0x01, \T5, \T1, \T3
2302        vpxor           \T3, \T6, \T6
2303
2304        vpclmulqdq      $0x10, \T5, \T1, \T3
2305        vpxor           \T3, \T6, \T6
2306
2307                vmovdqu 16*4(arg1), \T1
2308                vaesenc \T1, \XMM1, \XMM1
2309                vaesenc \T1, \XMM2, \XMM2
2310                vaesenc \T1, \XMM3, \XMM3
2311                vaesenc \T1, \XMM4, \XMM4
2312                vaesenc \T1, \XMM5, \XMM5
2313                vaesenc \T1, \XMM6, \XMM6
2314                vaesenc \T1, \XMM7, \XMM7
2315                vaesenc \T1, \XMM8, \XMM8
2316
2317        #######################################################################
2318
2319        vmovdqa         TMP3(%rsp), \T1
2320        vmovdqu         HashKey_6(arg2), \T5
2321        vpclmulqdq      $0x11, \T5, \T1, \T3
2322        vpxor           \T3, \T4, \T4
2323
2324        vpclmulqdq      $0x00, \T5, \T1, \T3
2325        vpxor           \T3, \T7, \T7
2326
2327        vpclmulqdq      $0x01, \T5, \T1, \T3
2328        vpxor           \T3, \T6, \T6
2329
2330        vpclmulqdq      $0x10, \T5, \T1, \T3
2331        vpxor           \T3, \T6, \T6
2332
2333                vmovdqu 16*5(arg1), \T1
2334                vaesenc \T1, \XMM1, \XMM1
2335                vaesenc \T1, \XMM2, \XMM2
2336                vaesenc \T1, \XMM3, \XMM3
2337                vaesenc \T1, \XMM4, \XMM4
2338                vaesenc \T1, \XMM5, \XMM5
2339                vaesenc \T1, \XMM6, \XMM6
2340                vaesenc \T1, \XMM7, \XMM7
2341                vaesenc \T1, \XMM8, \XMM8
2342
2343        vmovdqa         TMP4(%rsp), \T1
2344        vmovdqu         HashKey_5(arg2), \T5
2345        vpclmulqdq      $0x11, \T5, \T1, \T3
2346        vpxor           \T3, \T4, \T4
2347
2348        vpclmulqdq      $0x00, \T5, \T1, \T3
2349        vpxor           \T3, \T7, \T7
2350
2351        vpclmulqdq      $0x01, \T5, \T1, \T3
2352        vpxor           \T3, \T6, \T6
2353
2354        vpclmulqdq      $0x10, \T5, \T1, \T3
2355        vpxor           \T3, \T6, \T6
2356
2357                vmovdqu 16*6(arg1), \T1
2358                vaesenc \T1, \XMM1, \XMM1
2359                vaesenc \T1, \XMM2, \XMM2
2360                vaesenc \T1, \XMM3, \XMM3
2361                vaesenc \T1, \XMM4, \XMM4
2362                vaesenc \T1, \XMM5, \XMM5
2363                vaesenc \T1, \XMM6, \XMM6
2364                vaesenc \T1, \XMM7, \XMM7
2365                vaesenc \T1, \XMM8, \XMM8
2366
2367
2368        vmovdqa         TMP5(%rsp), \T1
2369        vmovdqu         HashKey_4(arg2), \T5
2370        vpclmulqdq      $0x11, \T5, \T1, \T3
2371        vpxor           \T3, \T4, \T4
2372
2373        vpclmulqdq      $0x00, \T5, \T1, \T3
2374        vpxor           \T3, \T7, \T7
2375
2376        vpclmulqdq      $0x01, \T5, \T1, \T3
2377        vpxor           \T3, \T6, \T6
2378
2379        vpclmulqdq      $0x10, \T5, \T1, \T3
2380        vpxor           \T3, \T6, \T6
2381
2382                vmovdqu 16*7(arg1), \T1
2383                vaesenc \T1, \XMM1, \XMM1
2384                vaesenc \T1, \XMM2, \XMM2
2385                vaesenc \T1, \XMM3, \XMM3
2386                vaesenc \T1, \XMM4, \XMM4
2387                vaesenc \T1, \XMM5, \XMM5
2388                vaesenc \T1, \XMM6, \XMM6
2389                vaesenc \T1, \XMM7, \XMM7
2390                vaesenc \T1, \XMM8, \XMM8
2391
2392        vmovdqa         TMP6(%rsp), \T1
2393        vmovdqu         HashKey_3(arg2), \T5
2394        vpclmulqdq      $0x11, \T5, \T1, \T3
2395        vpxor           \T3, \T4, \T4
2396
2397        vpclmulqdq      $0x00, \T5, \T1, \T3
2398        vpxor           \T3, \T7, \T7
2399
2400        vpclmulqdq      $0x01, \T5, \T1, \T3
2401        vpxor           \T3, \T6, \T6
2402
2403        vpclmulqdq      $0x10, \T5, \T1, \T3
2404        vpxor           \T3, \T6, \T6
2405
2406                vmovdqu 16*8(arg1), \T1
2407                vaesenc \T1, \XMM1, \XMM1
2408                vaesenc \T1, \XMM2, \XMM2
2409                vaesenc \T1, \XMM3, \XMM3
2410                vaesenc \T1, \XMM4, \XMM4
2411                vaesenc \T1, \XMM5, \XMM5
2412                vaesenc \T1, \XMM6, \XMM6
2413                vaesenc \T1, \XMM7, \XMM7
2414                vaesenc \T1, \XMM8, \XMM8
2415
2416        vmovdqa         TMP7(%rsp), \T1
2417        vmovdqu         HashKey_2(arg2), \T5
2418        vpclmulqdq      $0x11, \T5, \T1, \T3
2419        vpxor           \T3, \T4, \T4
2420
2421        vpclmulqdq      $0x00, \T5, \T1, \T3
2422        vpxor           \T3, \T7, \T7
2423
2424        vpclmulqdq      $0x01, \T5, \T1, \T3
2425        vpxor           \T3, \T6, \T6
2426
2427        vpclmulqdq      $0x10, \T5, \T1, \T3
2428        vpxor           \T3, \T6, \T6
2429
2430
2431        #######################################################################
2432
2433                vmovdqu 16*9(arg1), \T5
2434                vaesenc \T5, \XMM1, \XMM1
2435                vaesenc \T5, \XMM2, \XMM2
2436                vaesenc \T5, \XMM3, \XMM3
2437                vaesenc \T5, \XMM4, \XMM4
2438                vaesenc \T5, \XMM5, \XMM5
2439                vaesenc \T5, \XMM6, \XMM6
2440                vaesenc \T5, \XMM7, \XMM7
2441                vaesenc \T5, \XMM8, \XMM8
2442
2443        vmovdqa         TMP8(%rsp), \T1
2444        vmovdqu         HashKey(arg2), \T5
2445
2446        vpclmulqdq      $0x00, \T5, \T1, \T3
2447        vpxor           \T3, \T7, \T7
2448
2449        vpclmulqdq      $0x01, \T5, \T1, \T3
2450        vpxor           \T3, \T6, \T6
2451
2452        vpclmulqdq      $0x10, \T5, \T1, \T3
2453        vpxor           \T3, \T6, \T6
2454
2455        vpclmulqdq      $0x11, \T5, \T1, \T3
2456        vpxor           \T3, \T4, \T1
2457
2458
2459                vmovdqu 16*10(arg1), \T5
2460
2461        i = 11
2462        setreg
2463.rep (\REP-9)
2464        vaesenc \T5, \XMM1, \XMM1
2465        vaesenc \T5, \XMM2, \XMM2
2466        vaesenc \T5, \XMM3, \XMM3
2467        vaesenc \T5, \XMM4, \XMM4
2468        vaesenc \T5, \XMM5, \XMM5
2469        vaesenc \T5, \XMM6, \XMM6
2470        vaesenc \T5, \XMM7, \XMM7
2471        vaesenc \T5, \XMM8, \XMM8
2472
2473        vmovdqu 16*i(arg1), \T5
2474        i = i + 1
2475        setreg
2476.endr
2477
2478	i = 0
2479	j = 1
2480	setreg
2481.rep 8
2482		vpxor	16*i(arg4, %r11), \T5, \T2
2483                .if \ENC_DEC == ENC
2484                vaesenclast     \T2, reg_j, reg_j
2485                .else
2486                vaesenclast     \T2, reg_j, \T3
2487                vmovdqu 16*i(arg4, %r11), reg_j
2488                vmovdqu \T3, 16*i(arg3, %r11)
2489                .endif
2490	i = (i+1)
2491	j = (j+1)
2492	setreg
2493.endr
2494	#######################################################################
2495
2496
2497	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
2498	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
2499	vpxor	\T3, \T7, \T7
2500	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
2501
2502
2503
2504	#######################################################################
2505	#first phase of the reduction
2506	vmovdqa         POLY2(%rip), \T3
2507
2508	vpclmulqdq	$0x01, \T7, \T3, \T2
2509	vpslldq		$8, \T2, \T2			# shift-L T2 2 DWs
2510
2511	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
2512	#######################################################################
2513                .if \ENC_DEC == ENC
2514		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
2515		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
2516		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
2517		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
2518		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
2519		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
2520		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
2521		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
2522                .endif
2523
2524	#######################################################################
2525	#second phase of the reduction
2526	vpclmulqdq	$0x00, \T7, \T3, \T2
2527	vpsrldq		$4, \T2, \T2			# shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2528
2529	vpclmulqdq	$0x10, \T7, \T3, \T4
2530	vpslldq		$4, \T4, \T4			# shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2531
2532	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
2533	#######################################################################
2534	vpxor		\T4, \T1, \T1			# the result is in T1
2535
2536		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
2537		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
2538		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
2539		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
2540		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
2541		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
2542		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
2543		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
2544
2545
2546	vpxor	\T1, \XMM1, \XMM1
2547
2548
2549
2550.endm
2551
2552
2553# GHASH the last 8 ciphertext blocks.
2554.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2555
2556        ## Karatsuba Method
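        ## (explanatory note: same Karatsuba scheme as GHASH_LAST_8_AVX, but
        ##  the XOR of the two halves of each HashKey power is recomputed here
        ##  with vpshufd/vpxor instead of being loaded from HashKey_i_k)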
2557
2558        vmovdqu         HashKey_8(arg2), \T5
2559
2560        vpshufd         $0b01001110, \XMM1, \T2
2561        vpshufd         $0b01001110, \T5, \T3
2562        vpxor           \XMM1, \T2, \T2
2563        vpxor           \T5, \T3, \T3
2564
2565        vpclmulqdq      $0x11, \T5, \XMM1, \T6
2566        vpclmulqdq      $0x00, \T5, \XMM1, \T7
2567
2568        vpclmulqdq      $0x00, \T3, \T2, \XMM1
2569
2570        ######################
2571
2572        vmovdqu         HashKey_7(arg2), \T5
2573        vpshufd         $0b01001110, \XMM2, \T2
2574        vpshufd         $0b01001110, \T5, \T3
2575        vpxor           \XMM2, \T2, \T2
2576        vpxor           \T5, \T3, \T3
2577
2578        vpclmulqdq      $0x11, \T5, \XMM2, \T4
2579        vpxor           \T4, \T6, \T6
2580
2581        vpclmulqdq      $0x00, \T5, \XMM2, \T4
2582        vpxor           \T4, \T7, \T7
2583
2584        vpclmulqdq      $0x00, \T3, \T2, \T2
2585
2586        vpxor           \T2, \XMM1, \XMM1
2587
2588        ######################
2589
2590        vmovdqu         HashKey_6(arg2), \T5
2591        vpshufd         $0b01001110, \XMM3, \T2
2592        vpshufd         $0b01001110, \T5, \T3
2593        vpxor           \XMM3, \T2, \T2
2594        vpxor           \T5, \T3, \T3
2595
2596        vpclmulqdq      $0x11, \T5, \XMM3, \T4
2597        vpxor           \T4, \T6, \T6
2598
2599        vpclmulqdq      $0x00, \T5, \XMM3, \T4
2600        vpxor           \T4, \T7, \T7
2601
2602        vpclmulqdq      $0x00, \T3, \T2, \T2
2603
2604        vpxor           \T2, \XMM1, \XMM1
2605
2606        ######################
2607
2608        vmovdqu         HashKey_5(arg2), \T5
2609        vpshufd         $0b01001110, \XMM4, \T2
2610        vpshufd         $0b01001110, \T5, \T3
2611        vpxor           \XMM4, \T2, \T2
2612        vpxor           \T5, \T3, \T3
2613
2614        vpclmulqdq      $0x11, \T5, \XMM4, \T4
2615        vpxor           \T4, \T6, \T6
2616
2617        vpclmulqdq      $0x00, \T5, \XMM4, \T4
2618        vpxor           \T4, \T7, \T7
2619
2620        vpclmulqdq      $0x00, \T3, \T2, \T2
2621
2622        vpxor           \T2, \XMM1, \XMM1
2623
2624        ######################
2625
2626        vmovdqu         HashKey_4(arg2), \T5
2627        vpshufd         $0b01001110, \XMM5, \T2
2628        vpshufd         $0b01001110, \T5, \T3
2629        vpxor           \XMM5, \T2, \T2
2630        vpxor           \T5, \T3, \T3
2631
2632        vpclmulqdq      $0x11, \T5, \XMM5, \T4
2633        vpxor           \T4, \T6, \T6
2634
2635        vpclmulqdq      $0x00, \T5, \XMM5, \T4
2636        vpxor           \T4, \T7, \T7
2637
2638        vpclmulqdq      $0x00, \T3, \T2, \T2
2639
2640        vpxor           \T2, \XMM1, \XMM1
2641
2642        ######################
2643
2644        vmovdqu         HashKey_3(arg2), \T5
2645        vpshufd         $0b01001110, \XMM6, \T2
2646        vpshufd         $0b01001110, \T5, \T3
2647        vpxor           \XMM6, \T2, \T2
2648        vpxor           \T5, \T3, \T3
2649
2650        vpclmulqdq      $0x11, \T5, \XMM6, \T4
2651        vpxor           \T4, \T6, \T6
2652
2653        vpclmulqdq      $0x00, \T5, \XMM6, \T4
2654        vpxor           \T4, \T7, \T7
2655
2656        vpclmulqdq      $0x00, \T3, \T2, \T2
2657
2658        vpxor           \T2, \XMM1, \XMM1
2659
2660        ######################
2661
2662        vmovdqu         HashKey_2(arg2), \T5
2663        vpshufd         $0b01001110, \XMM7, \T2
2664        vpshufd         $0b01001110, \T5, \T3
2665        vpxor           \XMM7, \T2, \T2
2666        vpxor           \T5, \T3, \T3
2667
2668        vpclmulqdq      $0x11, \T5, \XMM7, \T4
2669        vpxor           \T4, \T6, \T6
2670
2671        vpclmulqdq      $0x00, \T5, \XMM7, \T4
2672        vpxor           \T4, \T7, \T7
2673
2674        vpclmulqdq      $0x00, \T3, \T2, \T2
2675
2676        vpxor           \T2, \XMM1, \XMM1
2677
2678        ######################
2679
2680        vmovdqu         HashKey(arg2), \T5
2681        vpshufd         $0b01001110, \XMM8, \T2
2682        vpshufd         $0b01001110, \T5, \T3
2683        vpxor           \XMM8, \T2, \T2
2684        vpxor           \T5, \T3, \T3
2685
2686        vpclmulqdq      $0x11, \T5, \XMM8, \T4
2687        vpxor           \T4, \T6, \T6
2688
2689        vpclmulqdq      $0x00, \T5, \XMM8, \T4
2690        vpxor           \T4, \T7, \T7
2691
2692        vpclmulqdq      $0x00, \T3, \T2, \T2
2693
2694        vpxor           \T2, \XMM1, \XMM1
2695        vpxor           \T6, \XMM1, \XMM1
2696        vpxor           \T7, \XMM1, \T2
2697
2698
2699
2700
2701        vpslldq $8, \T2, \T4
2702        vpsrldq $8, \T2, \T2
2703
2704        vpxor   \T4, \T7, \T7
2705        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
2706						   # accumulated carry-less multiplications
2707
2708        #######################################################################
2709        #first phase of the reduction
2710        vmovdqa         POLY2(%rip), \T3
2711
2712        vpclmulqdq      $0x01, \T7, \T3, \T2
2713        vpslldq         $8, \T2, \T2               # shift-L T2 2 DWs
2714
2715        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
2716        #######################################################################
2717
2718
2719        #second phase of the reduction
2720        vpclmulqdq      $0x00, \T7, \T3, \T2
2721        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2722
2723        vpclmulqdq      $0x10, \T7, \T3, \T4
2724        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2725
2726        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
2727        #######################################################################
2728        vpxor           \T4, \T6, \T6              # the result is in T6
2729.endm
2730
2731
2732
2733#############################################################
2734#void   aesni_gcm_init_avx_gen4
2735#        (gcm_data     *my_ctx_data,
2736#         gcm_context_data *data,
2737#        u8      *iv, /* Pre-counter block j0: 4 byte salt
2738#			(from Security Association) concatenated with 8 byte
2739#			Initialisation Vector (from IPSec ESP Payload)
2740#			concatenated with 0x00000001. 16-byte aligned pointer. */
2741#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
2742#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2743#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2744#############################################################
2745SYM_FUNC_START(aesni_gcm_init_avx_gen4)
2746        FUNC_SAVE
2747        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
2748        FUNC_RESTORE
2749        ret
2750SYM_FUNC_END(aesni_gcm_init_avx_gen4)
2751
2752###############################################################################
2753#void   aesni_gcm_enc_update_avx_gen4(
2754#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2755#        gcm_context_data *data,
2756#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
2757#        const   u8 *in, /* Plaintext input */
2758#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
2759###############################################################################
2760SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
2761        FUNC_SAVE
2762        mov     keysize,%eax
2763        cmp     $32, %eax
2764        je      key_256_enc_update4
2765        cmp     $16, %eax
2766        je      key_128_enc_update4
2767        # must be 192
2768        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
2769        FUNC_RESTORE
2770	ret
2771key_128_enc_update4:
2772        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
2773        FUNC_RESTORE
2774	ret
2775key_256_enc_update4:
2776        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
2777        FUNC_RESTORE
2778	ret
2779SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
2780
2781###############################################################################
2782#void   aesni_gcm_dec_update_avx_gen4(
2783#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2784#        gcm_context_data *data,
2785#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
2786#        const   u8 *in, /* Ciphertext input */
2787#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
2788###############################################################################
2789SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
2790        FUNC_SAVE
2791        mov     keysize,%eax
2792        cmp     $32, %eax
2793        je      key_256_dec_update4
2794        cmp     $16, %eax
2795        je      key_128_dec_update4
2796        # must be 192
2797        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
2798        FUNC_RESTORE
2799        ret
2800key_128_dec_update4:
2801        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
2802        FUNC_RESTORE
2803        ret
2804key_256_dec_update4:
2805        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
2806        FUNC_RESTORE
2807        ret
2808SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
2809
2810###############################################################################
2811#void   aesni_gcm_finalize_avx_gen4(
2812#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2813#        gcm_context_data *data,
2814#        u8      *auth_tag, /* Authenticated Tag output. */
2815#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
2816#                              Valid values are 16 (most likely), 12 or 8. */
2817###############################################################################
2818SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
2819        FUNC_SAVE
2820        mov	keysize,%eax
2821        cmp     $32, %eax
2822        je      key_256_finalize4
2823        cmp     $16, %eax
2824        je      key_128_finalize4
2825        # must be 192
2826        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
2827        FUNC_RESTORE
2828        ret
2829key_128_finalize4:
2830        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
2831        FUNC_RESTORE
2832        ret
2833key_256_finalize4:
2834        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
2835        FUNC_RESTORE
2836        ret
2837SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
2838