/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Shared glue code for 128bit block ciphers, AVX2 assembler macros
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu (0*32)(src), x0; \
	vmovdqu (1*32)(src), x1; \
	vmovdqu (2*32)(src), x2; \
	vmovdqu (3*32)(src), x3; \
	vmovdqu (4*32)(src), x4; \
	vmovdqu (5*32)(src), x5; \
	vmovdqu (6*32)(src), x6; \
	vmovdqu (7*32)(src), x7;

#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu x0, (0*32)(dst); \
	vmovdqu x1, (1*32)(dst); \
	vmovdqu x2, (2*32)(dst); \
	vmovdqu x3, (3*32)(dst); \
	vmovdqu x4, (4*32)(dst); \
	vmovdqu x5, (5*32)(dst); \
	vmovdqu x6, (6*32)(dst); \
	vmovdqu x7, (7*32)(dst);

#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \
	vpxor t0, t0, t0; \
	vinserti128 $1, (src), t0, t0; \
	vpxor t0, x0, x0; \
	vpxor (0*32+16)(src), x1, x1; \
	vpxor (1*32+16)(src), x2, x2; \
	vpxor (2*32+16)(src), x3, x3; \
	vpxor (3*32+16)(src), x4, x4; \
	vpxor (4*32+16)(src), x5, x5; \
	vpxor (5*32+16)(src), x6, x6; \
	vpxor (6*32+16)(src), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
	vpcmpeqq minus_one, x, tmp1; \
	vpcmpeqq minus_two, x, tmp2; \
	vpsubq minus_two, x, x; \
	vpor tmp2, tmp1, tmp1; \
	vpslldq $8, tmp1, tmp1; \
	vpsubq tmp1, x, x;
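
/*
 * Note on inc_le128()/add2_le128(): each 128-bit lane of x holds one
 * counter stored as a little-endian 128-bit integer, and minus_one /
 * minus_two are expected to contain { lo: -1, hi: 0 } resp.
 * { lo: -2, hi: 0 } per lane (load_ctr_16way below sets them up that
 * way).  AVX2 has no 128-bit add, so per lane this computes, with
 * N = 1 for inc_le128() and N = 2 for add2_le128():
 *
 *	carry = (lo >= (u64)-N);	(i.e. lo + N wraps)
 *	lo += N;
 *	hi += carry;
 *
 * vpcmpeqq builds an all-ones mask in the low qword from the old value,
 * vpsubq of minus_one/minus_two adds N to the low qword, vpslldq $8
 * moves the mask into the high qword, and the final vpsubq turns a
 * mask of -1 into a carry of +1 on the high qword.
 */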

#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
		       t1x, t2, t2x, t3, t3x, t4, t5) \
	vpcmpeqd t0, t0, t0; \
	vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \
	vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\
	\
	/* load IV and byteswap */ \
	vmovdqu (iv), t2x; \
	vmovdqa t2x, t3x; \
	inc_le128(t2x, t0x, t1x); \
	vbroadcasti128 bswap, t1; \
	vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \
	vpshufb t1, t2, x0; \
	\
	/* construct IVs */ \
	add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \
	vpshufb t1, t2, x1; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x2; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x3; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x4; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x5; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x6; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x7; \
	vextracti128 $1, t2, t2x; \
	inc_le128(t2x, t0x, t3x); \
	vmovdqu t2x, (iv);

#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*32)(src), x0, x0; \
	vpxor (1*32)(src), x1, x1; \
	vpxor (2*32)(src), x2, x2; \
	vpxor (3*32)(src), x3, x3; \
	vpxor (4*32)(src), x4, x4; \
	vpxor (5*32)(src), x5, x5; \
	vpxor (6*32)(src), x6, x6; \
	vpxor (7*32)(src), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
	vpsrad $31, iv, tmp0; \
	vpaddq iv, iv, tmp1; \
	vpsllq $2, iv, iv; \
	vpshufd $0x13, tmp0, tmp0; \
	vpsrad $31, tmp1, tmp1; \
	vpand mask2, tmp0, tmp0; \
	vpshufd $0x13, tmp1, tmp1; \
	vpxor tmp0, iv, iv; \
	vpand mask1, tmp1, tmp1; \
	vpxor tmp1, iv, iv;
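
/*
 * gf128mul_x_ble()/gf128mul_x2_ble() multiply an XTS tweak by x resp.
 * x^2 in GF(2^128), the tweak being stored as a 128-bit little-endian
 * integer in each 128-bit lane.  Per multiplication by x this is:
 *
 *	carry = tweak >> 127;
 *	tweak = (tweak << 1) ^ (carry ? 0x87 : 0);
 *
 * Since AVX2 lacks a 128-bit shift, the shift is done per qword
 * (vpaddq / vpsllq $2) and the bits shifted out of each qword are
 * recovered from the dword sign masks (vpsrad $31 + vpshufd $0x13) and
 * folded back in via the mask operands.  The including cipher
 * implementation is expected to provide xts_gf128mul_and_shl1_mask_*
 * constants that hold the reduction constant 0x87 (0x10e for the
 * doubled shift) in the low qword and the low-to-high carry bit(s) in
 * the high qword.
 */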

#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
		       tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
		       xts_gf128mul_and_shl1_mask_0, \
		       xts_gf128mul_and_shl1_mask_1) \
	vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \
	\
	/* load IV and construct second IV */ \
	vmovdqu (iv), tivx; \
	vmovdqa tivx, t0x; \
	gf128mul_x_ble(tivx, t1x, t2x); \
	vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \
	vinserti128 $1, tivx, t0, tiv; \
	vpxor (0*32)(src), tiv, x0; \
	vmovdqu tiv, (0*32)(dst); \
	\
	/* construct and store IVs, also xor with source */ \
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (1*32)(src), tiv, x1; \
	vmovdqu tiv, (1*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (2*32)(src), tiv, x2; \
	vmovdqu tiv, (2*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (3*32)(src), tiv, x3; \
	vmovdqu tiv, (3*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (4*32)(src), tiv, x4; \
	vmovdqu tiv, (4*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (5*32)(src), tiv, x5; \
	vmovdqu tiv, (5*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (6*32)(src), tiv, x6; \
	vmovdqu tiv, (6*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (7*32)(src), tiv, x7; \
	vmovdqu tiv, (7*32)(dst); \
	\
	vextracti128 $1, tiv, tivx; \
	gf128mul_x_ble(tivx, t1x, t2x); \
	vmovdqu tivx, (iv);

#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*32)(dst), x0, x0; \
	vpxor (1*32)(dst), x1, x1; \
	vpxor (2*32)(dst), x2, x2; \
	vpxor (3*32)(dst), x3, x3; \
	vpxor (4*32)(dst), x4, x4; \
	vpxor (5*32)(dst), x5, x5; \
	vpxor (6*32)(dst), x6, x6; \
	vpxor (7*32)(dst), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
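
/*
 * These macros are intended to be #include'd from a cipher's AVX2
 * implementation and wrapped around its 16-way encrypt/decrypt core.
 * A rough, illustrative sketch of a CBC decryption path (register
 * names and the callee below are hypothetical, not part of this
 * header):
 *
 *	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 *	call __cipher_dec_blk16;
 *	store_cbc_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
 *			RD2, RT0);
 *
 * with %rdx/%rsi as source/destination pointers.  Note that
 * store_cbc_16way() chains block n against ciphertext block n - 1 only
 * within the 16-block batch; xoring the very first block with the IV
 * (or the previous batch's last ciphertext block) is left to the
 * caller.
 */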