// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved.
 * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <crypto/curve25519.h>
#include <crypto/internal/kpp.h>

#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>

#include <asm/cpufeature.h>
#include <asm/processor.h>

/*
 * Static-branch keys, both defined in the "false" state.
 * NOTE(review): presumably switched on during module init when the CPU
 * reports BMI2 / ADX support; the code that enables and tests them is not
 * in this part of the file - confirm there.
 */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_adx);

/*
 * eltfp25519_1w is an array of NUM_WORDS_ELTFP25519 (4) u64 words;
 * eltfp25519_1w_buffer has twice as many words (8). Both types are
 * declared with 32-byte alignment.
 */
enum { NUM_WORDS_ELTFP25519 = 4 };
typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519];
typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519];

/*
 * Multiply / square helpers.
 *
 * Every macro below expands to two calls: an integer multiply (or square)
 * that writes into m.buffer, followed by a reduction of m.buffer into the
 * destination. "m" is NOT a macro parameter: a variable named m with a
 * member named buffer must be in scope wherever these macros are expanded.
 *
 * The _adx / _bmi2 suffix selects which family of sibling routines is
 * called; the _1w / _2w infix selects the single or the "2" (double)
 * variant of those routines.
 *
 * mul_*(c, a, b): product of a and b, reduced into c.
 */
#define mul_eltfp25519_1w_adx(c, a, b) do { \
	mul_256x256_integer_adx(m.buffer, a, b); \
	red_eltfp25519_1w_adx(c, m.buffer); \
} while (0)

#define mul_eltfp25519_1w_bmi2(c, a, b) do { \
	mul_256x256_integer_bmi2(m.buffer, a, b); \
	red_eltfp25519_1w_bmi2(c, m.buffer); \
} while (0)

/* sqr_*(a): square of a, reduced back into a (in place). */
#define sqr_eltfp25519_1w_adx(a) do { \
	sqr_256x256_integer_adx(m.buffer, a); \
	red_eltfp25519_1w_adx(a, m.buffer); \
} while (0)

#define sqr_eltfp25519_1w_bmi2(a) do { \
	sqr_256x256_integer_bmi2(m.buffer, a); \
	red_eltfp25519_1w_bmi2(a, m.buffer); \
} while (0)

/* Same as the two groups above, but calling the mul2_/sqr2_/red_*_2w routines. */
#define mul_eltfp25519_2w_adx(c, a, b) do { \
	mul2_256x256_integer_adx(m.buffer, a, b); \
	red_eltfp25519_2w_adx(c, m.buffer); \
} while (0)

#define mul_eltfp25519_2w_bmi2(c, a, b) do { \
	mul2_256x256_integer_bmi2(m.buffer, a, b); \
	red_eltfp25519_2w_bmi2(c, m.buffer); \
} while (0)

#define sqr_eltfp25519_2w_adx(a) do { \
	sqr2_256x256_integer_adx(m.buffer, a); \
	red_eltfp25519_2w_adx(a, m.buffer); \
} while (0)

#define sqr_eltfp25519_2w_bmi2(a) do { \
	sqr2_256x256_integer_bmi2(m.buffer, a); \
	red_eltfp25519_2w_bmi2(a, m.buffer); \
} while (0)

/*
 * sqrn_*(a, times): apply the in-place square "times" times.
 * "times" is evaluated exactly once and converted to int; a value of zero
 * or less performs no squaring at all.
 */
#define sqrn_eltfp25519_1w_adx(a, times) do { \
	int ____counter = (times); \
	while (____counter-- > 0) \
		sqr_eltfp25519_1w_adx(a); \
} while (0)

#define sqrn_eltfp25519_1w_bmi2(a, times) do { \
	int ____counter = (times); \
	while (____counter-- > 0) \
		sqr_eltfp25519_1w_bmi2(a); \
} while (0)

/*
 * Word-by-word copy of the four words of A into C.
 * Both arguments are evaluated four times, so they must be free of
 * side effects.
 */
#define copy_eltfp25519_1w(C, A) do { \
	(C)[0] = (A)[0]; \
	(C)[1] = (A)[1]; \
	(C)[2] = (A)[2]; \
	(C)[3] = (A)[3]; \
} while (0)

/* Store zero into the four words of C (C is evaluated four times). */
#define setzero_eltfp25519_1w(C) do { \
	(C)[0] = 0; \
	(C)[1] = 0; \
	(C)[2] = 0; \
	(C)[3] = 0; \
} while (0)

/*
 * Constant table of 252 entries, each NUM_WORDS_ELTFP25519 (4) u64 words,
 * i.e. 1008 words in total; the numbered comments give the 1-based entry
 * index. The array is 32-byte aligned.
 * NOTE(review): judging by the name this is precomputed data for a
 * ladder-style scalar multiplication; the code that reads it is not in
 * this part of the file - confirm against the consumer.
 */
__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = {
	/* 1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL,
		0xffffffffffffffffUL, 0x5fffffffffffffffUL,
	/* 2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL,
		0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL,
	/* 3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL,
		0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL,
	/* 4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL,
		0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL,
	/* 5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL,
		0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL,
	/* 6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL,
		0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL,
	/* 7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL,
		0xc1c20d06231f7614UL, 0x2938218da274f972UL,
	/* 8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL,
		0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL,
	/* 9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL,
		0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL,
	/* 10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL,
		0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL,
	/* 11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL,
		0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL,
	/* 12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL,
0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL, 117 /* 13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL, 118 0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL, 119 /* 14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL, 120 0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL, 121 /* 15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL, 122 0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL, 123 /* 16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL, 124 0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL, 125 /* 17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL, 126 0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL, 127 /* 18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL, 128 0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL, 129 /* 19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL, 130 0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL, 131 /* 20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL, 132 0x9d4935467caaf22eUL, 0x5166408eee85ff49UL, 133 /* 21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL, 134 0x5259729241159b1cUL, 0x6a621892d5b0ab33UL, 135 /* 22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL, 136 0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL, 137 /* 23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL, 138 0x23758739f630a257UL, 0x295a407a01a78580UL, 139 /* 24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL, 140 0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL, 141 /* 25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL, 142 0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL, 143 /* 26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL, 144 0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL, 145 /* 27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL, 146 0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL, 147 /* 28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL, 148 0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL, 149 /* 29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL, 150 0x74b4c4ceab102f64UL, 0x183abadd10139845UL, 151 /* 30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL, 152 0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL, 153 /* 31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL, 154 
0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL, 155 /* 32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL, 156 0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL, 157 /* 33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL, 158 0xd88768e4904032d8UL, 0x131384427b3aaeecUL, 159 /* 34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL, 160 0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL, 161 /* 35 */ 0xec5c57efe843faddUL, 0x2843ce40f0bb9918UL, 162 0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL, 163 /* 36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL, 164 0xa401760b882c797aUL, 0x1fc223e28dc88730UL, 165 /* 37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL, 166 0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL, 167 /* 38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL, 168 0xfdbf177988bbc586UL, 0x2959894fcad81df5UL, 169 /* 39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL, 170 0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL, 171 /* 40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL, 172 0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL, 173 /* 41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL, 174 0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL, 175 /* 42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL, 176 0x5c217736fa279374UL, 0x7dde05734afeb1faUL, 177 /* 43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL, 178 0xe6053bf89595bf7aUL, 0x394faf38da245530UL, 179 /* 44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL, 180 0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL, 181 /* 45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL, 182 0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL, 183 /* 46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL, 184 0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL, 185 /* 47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL, 186 0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL, 187 /* 48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL, 188 0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL, 189 /* 49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL, 190 0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL, 191 /* 50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL, 192 
0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL, 193 /* 51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL, 194 0xc189218075e91436UL, 0x6d9284169b3b8484UL, 195 /* 52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL, 196 0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL, 197 /* 53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL, 198 0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL, 199 /* 54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL, 200 0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL, 201 /* 55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL, 202 0x19346a65d3224a08UL, 0x0f5034e49b9af466UL, 203 /* 56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL, 204 0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL, 205 /* 57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL, 206 0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL, 207 /* 58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL, 208 0xf826842130f5ad28UL, 0x3ea988f75301a441UL, 209 /* 59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL, 210 0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL, 211 /* 60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL, 212 0xd01469df811d644bUL, 0x77fea47d81a5d71fUL, 213 /* 61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL, 214 0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL, 215 /* 62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL, 216 0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL, 217 /* 63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL, 218 0xbea450e1dbd885d5UL, 0x61b68649320f712cUL, 219 /* 64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL, 220 0x25232973322dbef4UL, 0x445dc4758c17f770UL, 221 /* 65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL, 222 0x1efebefdc053db34UL, 0x4adbe867c65daf99UL, 223 /* 66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL, 224 0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL, 225 /* 67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL, 226 0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL, 227 /* 68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL, 228 0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL, 229 /* 69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL, 230 
0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL, 231 /* 70 */ 0x41ea07aa4e18fc41UL, 0xd9fc19527c87a51eUL, 232 0xbdaacb805831ca6fUL, 0x445b652dc916694fUL, 233 /* 71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL, 234 0xa1823aafe04c314aUL, 0x790a2d94437cf586UL, 235 /* 72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL, 236 0xbf70903b204f5169UL, 0x2f7a89891ba319feUL, 237 /* 73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL, 238 0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL, 239 /* 74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL, 240 0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL, 241 /* 75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL, 242 0x674f1288f8e11217UL, 0x5682250f329f93d0UL, 243 /* 76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL, 244 0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL, 245 /* 77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL, 246 0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL, 247 /* 78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL, 248 0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL, 249 /* 79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL, 250 0x5deadacec9f04973UL, 0x29275b5d41d29b27UL, 251 /* 80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL, 252 0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL, 253 /* 81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL, 254 0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL, 255 /* 82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL, 256 0x894d1d855ae52359UL, 0x68e122157b743d69UL, 257 /* 83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL, 258 0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL, 259 /* 84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL, 260 0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL, 261 /* 85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL, 262 0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL, 263 /* 86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL, 264 0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL, 265 /* 87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL, 266 0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL, 267 /* 88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL, 268 
0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL, 269 /* 89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL, 270 0x45adb16e76cefcf2UL, 0x01f768aead232999UL, 271 /* 90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL, 272 0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL, 273 /* 91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL, 274 0x5eefa966de2a701dUL, 0x23b20565de55e3efUL, 275 /* 92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL, 276 0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL, 277 /* 93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL, 278 0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL, 279 /* 94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL, 280 0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL, 281 /* 95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL, 282 0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL, 283 /* 96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL, 284 0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL, 285 /* 97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL, 286 0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL, 287 /* 98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL, 288 0xb9886314844006b1UL, 0x40d2a72ab454cc60UL, 289 /* 99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL, 290 0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL, 291 /* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL, 292 0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL, 293 /* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL, 294 0x40727064c416d74fUL, 0x6e15c6114b502ef0UL, 295 /* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL, 296 0x4a497962066e6043UL, 0x705b3aab41355b44UL, 297 /* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL, 298 0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL, 299 /* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL, 300 0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL, 301 /* 105 */ 0x7e824e90a5dc3853UL, 0xccb5f6641f135cbdUL, 302 0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL, 303 /* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL, 304 0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL, 305 /* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL, 
306 0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL, 307 /* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL, 308 0x2088ce1570033c68UL, 0x7fba1f495c837987UL, 309 /* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL, 310 0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL, 311 /* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL, 312 0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL, 313 /* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL, 314 0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL, 315 /* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL, 316 0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL, 317 /* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL, 318 0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL, 319 /* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL, 320 0x00f52e3f67280294UL, 0x566d4fc14730c509UL, 321 /* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL, 322 0x216730fba68d6095UL, 0x22e8c3843f69cea7UL, 323 /* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL, 324 0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL, 325 /* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL, 326 0x508e862f121692fcUL, 0x3a81907fa093c291UL, 327 /* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL, 328 0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL, 329 /* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL, 330 0xbee595ce8a9df2e5UL, 0x25e496c722422236UL, 331 /* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL, 332 0xe488de11d761e352UL, 0x0e878a01a085545cUL, 333 /* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL, 334 0x9ea37a487ae80d67UL, 0x67a9958011e41794UL, 335 /* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL, 336 0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL, 337 /* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL, 338 0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL, 339 /* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL, 340 0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL, 341 /* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL, 342 0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL, 343 /* 126 */ 0x57547d4cfb654ce1UL, 
0xd3565b82a058e2adUL, 344 0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL, 345 /* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL, 346 0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL, 347 /* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL, 348 0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL, 349 /* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL, 350 0x97134556a9832d06UL, 0x269bb0360a84f8a0UL, 351 /* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL, 352 0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL, 353 /* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL, 354 0x904659bb686e3772UL, 0x7215c371746ba8c8UL, 355 /* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL, 356 0x266fd5809208f294UL, 0x5c847085619a26b9UL, 357 /* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL, 358 0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL, 359 /* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL, 360 0x2b1641917b307614UL, 0x117c554fc4f45b7cUL, 361 /* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL, 362 0xd7e803f4171b2827UL, 0x1015e87487d225eaUL, 363 /* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL, 364 0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL, 365 /* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL, 366 0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL, 367 /* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL, 368 0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL, 369 /* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL, 370 0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL, 371 /* 140 */ 0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL, 372 0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL, 373 /* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL, 374 0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL, 375 /* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL, 376 0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL, 377 /* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL, 378 0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL, 379 /* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL, 380 0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL, 381 /* 145 */ 
0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL, 382 0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL, 383 /* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL, 384 0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL, 385 /* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL, 386 0xec219c48fbd21604UL, 0x1aaf1af517c36731UL, 387 /* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL, 388 0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL, 389 /* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL, 390 0x4dbbc207f531561aUL, 0x0253b7f082128a27UL, 391 /* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL, 392 0x52d17436309d4253UL, 0x356f97e13efae576UL, 393 /* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL, 394 0x0c776128bed92c98UL, 0x1d34ae93032885b8UL, 395 /* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL, 396 0x66124c6f97bda770UL, 0x0f81a0290654124aUL, 397 /* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL, 398 0xff08d03f93d8c20aUL, 0x52a148199faef26bUL, 399 /* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL, 400 0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL, 401 /* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL, 402 0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL, 403 /* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL, 404 0x5da643cb4bf30035UL, 0x77db28d63940f721UL, 405 /* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL, 406 0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL, 407 /* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL, 408 0x140a69245ca575edUL, 0x0cf1c37134273a4cUL, 409 /* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL, 410 0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL, 411 /* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL, 412 0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL, 413 /* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL, 414 0x497d723f802e88e1UL, 0x30684dea602f408dUL, 415 /* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL, 416 0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL, 417 /* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL, 418 0x7468c7423a543258UL, 0x4a7f11464eb5642fUL, 
419 /* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL, 420 0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL, 421 /* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL, 422 0x026df551dbb85c20UL, 0x74fcd91047e21901UL, 423 /* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL, 424 0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL, 425 /* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL, 426 0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL, 427 /* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL, 428 0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL, 429 /* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL, 430 0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL, 431 /* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL, 432 0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL, 433 /* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL, 434 0x13033ac001f66697UL, 0x273b24fe3b367d75UL, 435 /* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL, 436 0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL, 437 /* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL, 438 0xacc63ca34b8ec145UL, 0x74621888fee66574UL, 439 /* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL, 440 0xed6a50eb5ec2647bUL, 0x0694283a9dca7502UL, 441 /* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL, 442 0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL, 443 /* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL, 444 0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL, 445 /* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL, 446 0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL, 447 /* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL, 448 0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL, 449 /* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL, 450 0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL, 451 /* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL, 452 0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL, 453 /* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL, 454 0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL, 455 /* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL, 456 0xa33eb2d75fcfd3c6UL, 
0x078cd6ec4199a01fUL, 457 /* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL, 458 0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL, 459 /* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL, 460 0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL, 461 /* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL, 462 0x81004b71e33cc191UL, 0x44e6be345122803cUL, 463 /* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL, 464 0x49c8c4281af60c29UL, 0x21edb518de701aeeUL, 465 /* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL, 466 0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL, 467 /* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL, 468 0x12bc8d6915783712UL, 0x498194c0fc620abbUL, 469 /* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL, 470 0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL, 471 /* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL, 472 0x1e60c24598c71fffUL, 0x59f2f014979983efUL, 473 /* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL, 474 0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL, 475 /* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL, 476 0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL, 477 /* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL, 478 0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL, 479 /* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL, 480 0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL, 481 /* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL, 482 0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL, 483 /* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL, 484 0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL, 485 /* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL, 486 0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL, 487 /* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL, 488 0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL, 489 /* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL, 490 0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL, 491 /* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL, 492 0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL, 493 /* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL, 494 
0x883ada83a6a1652cUL, 0x585f1974034d6c17UL, 495 /* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL, 496 0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL, 497 /* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL, 498 0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL, 499 /* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL, 500 0x33979624f0e917beUL, 0x2c018dc527356b30UL, 501 /* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL, 502 0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL, 503 /* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL, 504 0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL, 505 /* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL, 506 0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL, 507 /* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL, 508 0x345ead5e972d091eUL, 0x18c8df11a83103baUL, 509 /* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL, 510 0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL, 511 /* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL, 512 0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL, 513 /* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL, 514 0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL, 515 /* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL, 516 0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL, 517 /* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL, 518 0x79952a008221e738UL, 0x4322e1a7535cd2bbUL, 519 /* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL, 520 0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL, 521 /* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL, 522 0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL, 523 /* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL, 524 0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL, 525 /* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL, 526 0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL, 527 /* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL, 528 0x1df4c0af01314a60UL, 0x09a62dab89289527UL, 529 /* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL, 530 0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL, 531 /* 220 */ 0x328689761e451eabUL, 
0x2e4d598bff59594aUL, 532 0x49b96853d7a7084aUL, 0x4980a319601420a8UL, 533 /* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL, 534 0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL, 535 /* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL, 536 0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL, 537 /* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL, 538 0xddeb34a061615d99UL, 0x5129cecceb64b773UL, 539 /* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL, 540 0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL, 541 /* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL, 542 0x680bd77c73edad2eUL, 0x487c02354edd9041UL, 543 /* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL, 544 0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL, 545 /* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL, 546 0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL, 547 /* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL, 548 0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL, 549 /* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL, 550 0xe9834262d13921edUL, 0x27fedafaa54bb592UL, 551 /* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL, 552 0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL, 553 /* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL, 554 0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL, 555 /* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL, 556 0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL, 557 /* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL, 558 0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL, 559 /* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL, 560 0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL, 561 /* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL, 562 0x645b426f3d1d58acUL, 0x4804a82227a557bcUL, 563 /* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL, 564 0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL, 565 /* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL, 566 0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL, 567 /* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL, 568 0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL, 569 /* 239 */ 
0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL,
		0xc26ccff352b37ec7UL, 0x056f68341d797b21UL,
	/* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL,
		0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL,
	/* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL,
		0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL,
	/* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL,
		0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL,
	/* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL,
		0x2c43ecea0107c1ddUL, 0x526028809372de35UL,
	/* 244 */ 0x3415c56af9213b1fUL, 0x5bee1a4d017e98dbUL,
		0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL,
	/* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL,
		0x899fc38fc4b5c515UL, 0x250386b124ffc207UL,
	/* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL,
		0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL,
	/* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL,
		0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL,
	/* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL,
		0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL,
	/* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL,
		0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL,
	/* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL,
		0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL,
	/* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL,
		0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL,
	/* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL,
		0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL
};

/*
 * mul2_256x256_integer_adx - two independent 256x256 -> 512-bit integer
 * multiplications, written with MULX and the ADOX/ADCX carry chains.
 *
 * c: output, 16 words holding two 512-bit products:
 *    c[0..7]  = a[0..3] * b[0..3]
 *    c[8..15] = a[4..7] * b[4..7]
 * a: input, two 256-bit integers: a[0..3] and a[4..7]
 * b: input, two 256-bit integers: b[0..3] and b[4..7]
 *
 * Words are stored least-significant first: the low half of the first
 * partial product is written to c[0]. No modular reduction is done here.
 *
 * Each product is built as a schoolbook multiplication, one row per word
 * of a. For a row, rdx holds the word of a and four MULX instructions
 * form the 128-bit partial products with b. ADOX (carry in OF) links the
 * partial products of the row together, ADCX (carry in CF) adds the row
 * into the running sum kept in rax/rbx/rcx/r15. r14 is zeroed once at the
 * top and then only used as a zero addend to fold the final carries of
 * each chain into the top word of the row.
 *
 * In the inline comments of the second half, C[i] and D[j] denote
 * a[4 + i] and b[4 + j] (byte offsets 32..56).
 *
 * The asm statement has no output operands; results are stored through
 * the pointer in %0, which the "memory" clobber accounts for. All scratch
 * registers and the flags are listed as clobbers.
 */
static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a,
				     const u64 *const b)
{
	asm volatile(
		/* r14 = 0: constant zero used to absorb the last carry of each chain */
		"xorl %%r14d, %%r14d ;"
		"movq (%1), %%rdx; " /* A[0] */
		"mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
		/*
		 * XOR resets CF and OF before the first carry chain; the value
		 * left in r10 is overwritten by the MULX two lines below.
		 */
		"xorl %%r10d, %%r10d ;"
		"movq %%r8, (%0) ;"
		"mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
		"adox %%r10, %%r15 ;"
		"mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
		"adox %%r8, %%rax ;"
		"mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
		"adox %%r10, %%rbx ;"
		/******************************************/
		"adox %%r14, %%rcx ;"

		/*
		 * NOTE(review): unlike the matching C[1] row of the second
		 * product, this row does not reset the flags with xorl; it uses
		 * CF/OF as left by the A[0] row. Presumably both are zero here
		 * (CF is not written after the xorl above, and adding a carry
		 * to the high word of a 64x64 product should not overflow) -
		 * confirm before touching the instruction order.
		 */
		"movq 8(%1), %%rdx; " /* A[1] */
		"mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
		"adox %%r15, %%r8 ;"
		"movq %%r8, 8(%0) ;"
		"mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
		"adox %%r10, %%r9 ;"
		"adcx %%r9, %%rax ;"
		"mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
		"adox %%r8, %%r11 ;"
		"adcx %%r11, %%rbx ;"
		"mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
		"adox %%r10, %%r13 ;"
		"adcx %%r13, %%rcx ;"
		/******************************************/
		"adox %%r14, %%r15 ;"
		"adcx %%r14, %%r15 ;"

		"movq 16(%1), %%rdx; " /* A[2] */
		"xorl %%r10d, %%r10d ;"
		"mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
		"adox %%rax, %%r8 ;"
		"movq %%r8, 16(%0) ;"
		"mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
		"adox %%r10, %%r9 ;"
		"adcx %%r9, %%rbx ;"
		"mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
		"adox %%r8, %%r11 ;"
		"adcx %%r11, %%rcx ;"
		"mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
		"adox %%r10, %%r13 ;"
		"adcx %%r13, %%r15 ;"
		/******************************************/
		"adox %%r14, %%rax ;"
		"adcx %%r14, %%rax ;"

		/* Last row of the first product: finished words are stored as they complete. */
		"movq 24(%1), %%rdx; " /* A[3] */
		"xorl %%r10d, %%r10d ;"
		"mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
		"adox %%rbx, %%r8 ;"
		"movq %%r8, 24(%0) ;"
		"mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
		"adox %%r10, %%r9 ;"
		"adcx %%r9, %%rcx ;"
		"movq %%rcx, 32(%0) ;"
		"mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
		"adox %%r8, %%r11 ;"
		"adcx %%r11, %%r15 ;"
		"movq %%r15, 40(%0) ;"
		"mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
		"adox %%r10, %%r13 ;"
		"adcx %%r13, %%rax ;"
		"movq %%rax, 48(%0) ;"
		/******************************************/
		"adox %%r14, %%rbx ;"
		"adcx %%r14, %%rbx ;"
		"movq %%rbx, 56(%0) ;"

		/* Second product: same sequence on a[4..7], b[4..7] -> c[8..15]. */
		"movq 32(%1), %%rdx; " /* C[0] */
		"mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */
		"xorl %%r10d, %%r10d ;"
		"movq %%r8, 64(%0);"
		"mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
		"adox %%r10, %%r15 ;"
		"mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
		"adox %%r8, %%rax ;"
		"mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
		"adox %%r10, %%rbx ;"
		/******************************************/
		"adox %%r14, %%rcx ;"

		"movq 40(%1), %%rdx; " /* C[1] */
		"xorl %%r10d, %%r10d ;"
		"mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
		"adox %%r15, %%r8 ;"
		"movq %%r8, 72(%0);"
		"mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
		"adox %%r10, %%r9 ;"
		"adcx %%r9, %%rax ;"
		"mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
		"adox %%r8, %%r11 ;"
		"adcx %%r11, %%rbx ;"
		"mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
		"adox %%r10, %%r13 ;"
		"adcx %%r13, %%rcx ;"
		/******************************************/
		"adox %%r14, %%r15 ;"
		"adcx %%r14, %%r15 ;"

		"movq 48(%1), %%rdx; " /* C[2] */
		"xorl %%r10d, %%r10d ;"
		"mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
		"adox %%rax, %%r8 ;"
		"movq %%r8, 80(%0);"
		"mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
		"adox %%r10, %%r9 ;"
		"adcx %%r9, %%rbx ;"
		"mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
		"adox %%r8, %%r11 ;"
		"adcx %%r11, %%rcx ;"
		"mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
		"adox %%r10, %%r13 ;"
		"adcx %%r13, %%r15 ;"
		/******************************************/
		"adox %%r14, %%rax ;"
		"adcx %%r14, %%rax ;"

		"movq 56(%1), %%rdx; " /* C[3] */
		"xorl %%r10d, %%r10d ;"
		"mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
		"adox %%rbx, %%r8 ;"
		"movq %%r8, 88(%0);"
		"mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
		"adox %%r10, %%r9 ;"
		"adcx %%r9, %%rcx ;"
		"movq %%rcx, 96(%0) ;"
		"mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
		"adox %%r8, %%r11 ;"
		"adcx %%r11, %%r15 ;"
		"movq %%r15, 104(%0) ;"
		"mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
		"adox %%r10, %%r13 ;"
		"adcx %%r13, %%rax ;"
		"movq %%rax, 112(%0) ;"
		/******************************************/
		"adox %%r14, %%rbx ;"
		"adcx %%r14, %%rbx ;"
		"movq %%rbx, 120(%0) ;"
		:
		: "r"(c), "r"(a), "r"(b)
		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
		  "%r10", "%r11", "%r13", "%r14", "%r15");
}

static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a,
				      const u64 *const b)
{
	asm volatile(
		"movq (%1), %%rdx; " /* A[0] */
		"mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
		"movq %%r8, (%0) ;"
		"mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
		"addq %%r10, %%r15 ;"
		"mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
		"adcq %%r8, %%rax ;"
		"mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
		"adcq %%r10, %%rbx ;"
		/******************************************/
		"adcq $0, %%rcx ;"

		"movq 8(%1), %%rdx; " /* A[1] */
		"mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
		"addq %%r15, %%r8 ;"
		"movq %%r8, 8(%0) ;"
		"mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
		"adcq %%r10, %%r9 ;"
		"mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
		"adcq %%r8, %%r11 ;"
		"mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
		"adcq %%r10, %%r13 ;"
		/******************************************/
		"adcq $0, %%r15 ;"

		"addq %%r9, %%rax ;"
		"adcq %%r11, %%rbx ;"
		"adcq %%r13, %%rcx ;"
		"adcq $0, %%r15 ;"

		"movq 16(%1), %%rdx; " /* A[2] */
		"mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
		"addq %%rax, %%r8 ;"
		"movq %%r8, 16(%0) ;"
		"mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
		"adcq %%r10, %%r9 ;"
		"mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
		"adcq %%r8, %%r11 ;"
		"mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
		"adcq %%r10, %%r13 ;"
		/******************************************/
		"adcq $0, %%rax ;"

		"addq %%r9, 
%%rbx ;" 802 "adcq %%r11, %%rcx ;" 803 "adcq %%r13, %%r15 ;" 804 "adcq $0, %%rax ;" 805 806 "movq 24(%1), %%rdx; " /* A[3] */ 807 "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ 808 "addq %%rbx, %%r8 ;" 809 "movq %%r8, 24(%0) ;" 810 "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ 811 "adcq %%r10, %%r9 ;" 812 "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ 813 "adcq %%r8, %%r11 ;" 814 "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ 815 "adcq %%r10, %%r13 ;" 816 /******************************************/ 817 "adcq $0, %%rbx ;" 818 819 "addq %%r9, %%rcx ;" 820 "movq %%rcx, 32(%0) ;" 821 "adcq %%r11, %%r15 ;" 822 "movq %%r15, 40(%0) ;" 823 "adcq %%r13, %%rax ;" 824 "movq %%rax, 48(%0) ;" 825 "adcq $0, %%rbx ;" 826 "movq %%rbx, 56(%0) ;" 827 828 "movq 32(%1), %%rdx; " /* C[0] */ 829 "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */ 830 "movq %%r8, 64(%0) ;" 831 "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ 832 "addq %%r10, %%r15 ;" 833 "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ 834 "adcq %%r8, %%rax ;" 835 "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ 836 "adcq %%r10, %%rbx ;" 837 /******************************************/ 838 "adcq $0, %%rcx ;" 839 840 "movq 40(%1), %%rdx; " /* C[1] */ 841 "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ 842 "addq %%r15, %%r8 ;" 843 "movq %%r8, 72(%0) ;" 844 "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ 845 "adcq %%r10, %%r9 ;" 846 "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ 847 "adcq %%r8, %%r11 ;" 848 "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */ 849 "adcq %%r10, %%r13 ;" 850 /******************************************/ 851 "adcq $0, %%r15 ;" 852 853 "addq %%r9, %%rax ;" 854 "adcq %%r11, %%rbx ;" 855 "adcq %%r13, %%rcx ;" 856 "adcq $0, %%r15 ;" 857 858 "movq 48(%1), %%rdx; " /* C[2] */ 859 "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ 860 "addq %%rax, %%r8 ;" 861 "movq %%r8, 80(%0) ;" 862 "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ 863 "adcq %%r10, %%r9 ;" 864 "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ 865 "adcq %%r8, %%r11 ;" 866 "mulx 56(%2), 
%%r10, %%rax; " /* C[2]*D[3] */ 867 "adcq %%r10, %%r13 ;" 868 /******************************************/ 869 "adcq $0, %%rax ;" 870 871 "addq %%r9, %%rbx ;" 872 "adcq %%r11, %%rcx ;" 873 "adcq %%r13, %%r15 ;" 874 "adcq $0, %%rax ;" 875 876 "movq 56(%1), %%rdx; " /* C[3] */ 877 "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ 878 "addq %%rbx, %%r8 ;" 879 "movq %%r8, 88(%0) ;" 880 "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ 881 "adcq %%r10, %%r9 ;" 882 "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ 883 "adcq %%r8, %%r11 ;" 884 "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ 885 "adcq %%r10, %%r13 ;" 886 /******************************************/ 887 "adcq $0, %%rbx ;" 888 889 "addq %%r9, %%rcx ;" 890 "movq %%rcx, 96(%0) ;" 891 "adcq %%r11, %%r15 ;" 892 "movq %%r15, 104(%0) ;" 893 "adcq %%r13, %%rax ;" 894 "movq %%rax, 112(%0) ;" 895 "adcq $0, %%rbx ;" 896 "movq %%rbx, 120(%0) ;" 897 : 898 : "r"(c), "r"(a), "r"(b) 899 : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", 900 "%r10", "%r11", "%r13", "%r15"); 901 } 902 903 static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a) 904 { 905 asm volatile( 906 "movq (%1), %%rdx ;" /* A[0] */ 907 "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ 908 "xorl %%r15d, %%r15d;" 909 "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ 910 "adcx %%r14, %%r9 ;" 911 "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ 912 "adcx %%rax, %%r10 ;" 913 "movq 24(%1), %%rdx ;" /* A[3] */ 914 "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */ 915 "adcx %%rcx, %%r11 ;" 916 "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ 917 "adcx %%rax, %%rbx ;" 918 "movq 8(%1), %%rdx ;" /* A[1] */ 919 "adcx %%r15, %%r13 ;" 920 "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ 921 "movq $0, %%r14 ;" 922 /******************************************/ 923 "adcx %%r15, %%r14 ;" 924 925 "xorl %%r15d, %%r15d;" 926 "adox %%rax, %%r10 ;" 927 "adcx %%r8, %%r8 ;" 928 "adox %%rcx, %%r11 ;" 929 "adcx %%r9, %%r9 ;" 930 "adox %%r15, %%rbx ;" 931 "adcx %%r10, %%r10 ;" 932 "adox %%r15, 
%%r13 ;" 933 "adcx %%r11, %%r11 ;" 934 "adox %%r15, %%r14 ;" 935 "adcx %%rbx, %%rbx ;" 936 "adcx %%r13, %%r13 ;" 937 "adcx %%r14, %%r14 ;" 938 939 "movq (%1), %%rdx ;" 940 "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ 941 /*******************/ 942 "movq %%rax, 0(%0) ;" 943 "addq %%rcx, %%r8 ;" 944 "movq %%r8, 8(%0) ;" 945 "movq 8(%1), %%rdx ;" 946 "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ 947 "adcq %%rax, %%r9 ;" 948 "movq %%r9, 16(%0) ;" 949 "adcq %%rcx, %%r10 ;" 950 "movq %%r10, 24(%0) ;" 951 "movq 16(%1), %%rdx ;" 952 "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ 953 "adcq %%rax, %%r11 ;" 954 "movq %%r11, 32(%0) ;" 955 "adcq %%rcx, %%rbx ;" 956 "movq %%rbx, 40(%0) ;" 957 "movq 24(%1), %%rdx ;" 958 "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ 959 "adcq %%rax, %%r13 ;" 960 "movq %%r13, 48(%0) ;" 961 "adcq %%rcx, %%r14 ;" 962 "movq %%r14, 56(%0) ;" 963 964 965 "movq 32(%1), %%rdx ;" /* B[0] */ 966 "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */ 967 "xorl %%r15d, %%r15d;" 968 "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */ 969 "adcx %%r14, %%r9 ;" 970 "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */ 971 "adcx %%rax, %%r10 ;" 972 "movq 56(%1), %%rdx ;" /* B[3] */ 973 "mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */ 974 "adcx %%rcx, %%r11 ;" 975 "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */ 976 "adcx %%rax, %%rbx ;" 977 "movq 40(%1), %%rdx ;" /* B[1] */ 978 "adcx %%r15, %%r13 ;" 979 "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */ 980 "movq $0, %%r14 ;" 981 /******************************************/ 982 "adcx %%r15, %%r14 ;" 983 984 "xorl %%r15d, %%r15d;" 985 "adox %%rax, %%r10 ;" 986 "adcx %%r8, %%r8 ;" 987 "adox %%rcx, %%r11 ;" 988 "adcx %%r9, %%r9 ;" 989 "adox %%r15, %%rbx ;" 990 "adcx %%r10, %%r10 ;" 991 "adox %%r15, %%r13 ;" 992 "adcx %%r11, %%r11 ;" 993 "adox %%r15, %%r14 ;" 994 "adcx %%rbx, %%rbx ;" 995 "adcx %%r13, %%r13 ;" 996 "adcx %%r14, %%r14 ;" 997 998 "movq 32(%1), %%rdx ;" 999 "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */ 1000 /*******************/ 1001 "movq %%rax, 64(%0) ;" 1002 
"addq %%rcx, %%r8 ;" 1003 "movq %%r8, 72(%0) ;" 1004 "movq 40(%1), %%rdx ;" 1005 "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */ 1006 "adcq %%rax, %%r9 ;" 1007 "movq %%r9, 80(%0) ;" 1008 "adcq %%rcx, %%r10 ;" 1009 "movq %%r10, 88(%0) ;" 1010 "movq 48(%1), %%rdx ;" 1011 "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */ 1012 "adcq %%rax, %%r11 ;" 1013 "movq %%r11, 96(%0) ;" 1014 "adcq %%rcx, %%rbx ;" 1015 "movq %%rbx, 104(%0) ;" 1016 "movq 56(%1), %%rdx ;" 1017 "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */ 1018 "adcq %%rax, %%r13 ;" 1019 "movq %%r13, 112(%0) ;" 1020 "adcq %%rcx, %%r14 ;" 1021 "movq %%r14, 120(%0) ;" 1022 : 1023 : "r"(c), "r"(a) 1024 : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", 1025 "%r10", "%r11", "%r13", "%r14", "%r15"); 1026 } 1027 1028 static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a) 1029 { 1030 asm volatile( 1031 "movq 8(%1), %%rdx ;" /* A[1] */ 1032 "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */ 1033 "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */ 1034 "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */ 1035 1036 "movq 16(%1), %%rdx ;" /* A[2] */ 1037 "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */ 1038 "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */ 1039 1040 "addq %%rax, %%r9 ;" 1041 "adcq %%rdx, %%r10 ;" 1042 "adcq %%rcx, %%r11 ;" 1043 "adcq %%r14, %%r15 ;" 1044 "adcq $0, %%r13 ;" 1045 "movq $0, %%r14 ;" 1046 "adcq $0, %%r14 ;" 1047 1048 "movq (%1), %%rdx ;" /* A[0] */ 1049 "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */ 1050 1051 "addq %%rax, %%r10 ;" 1052 "adcq %%rcx, %%r11 ;" 1053 "adcq $0, %%r15 ;" 1054 "adcq $0, %%r13 ;" 1055 "adcq $0, %%r14 ;" 1056 1057 "shldq $1, %%r13, %%r14 ;" 1058 "shldq $1, %%r15, %%r13 ;" 1059 "shldq $1, %%r11, %%r15 ;" 1060 "shldq $1, %%r10, %%r11 ;" 1061 "shldq $1, %%r9, %%r10 ;" 1062 "shldq $1, %%r8, %%r9 ;" 1063 "shlq $1, %%r8 ;" 1064 1065 /*******************/ 1066 "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */ 1067 /*******************/ 1068 "movq %%rax, 0(%0) ;" 1069 "addq %%rcx, %%r8 ;" 1070 "movq %%r8, 
8(%0) ;" 1071 "movq 8(%1), %%rdx ;" 1072 "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */ 1073 "adcq %%rax, %%r9 ;" 1074 "movq %%r9, 16(%0) ;" 1075 "adcq %%rcx, %%r10 ;" 1076 "movq %%r10, 24(%0) ;" 1077 "movq 16(%1), %%rdx ;" 1078 "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */ 1079 "adcq %%rax, %%r11 ;" 1080 "movq %%r11, 32(%0) ;" 1081 "adcq %%rcx, %%r15 ;" 1082 "movq %%r15, 40(%0) ;" 1083 "movq 24(%1), %%rdx ;" 1084 "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */ 1085 "adcq %%rax, %%r13 ;" 1086 "movq %%r13, 48(%0) ;" 1087 "adcq %%rcx, %%r14 ;" 1088 "movq %%r14, 56(%0) ;" 1089 1090 "movq 40(%1), %%rdx ;" /* B[1] */ 1091 "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */ 1092 "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */ 1093 "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */ 1094 1095 "movq 48(%1), %%rdx ;" /* B[2] */ 1096 "mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */ 1097 "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */ 1098 1099 "addq %%rax, %%r9 ;" 1100 "adcq %%rdx, %%r10 ;" 1101 "adcq %%rcx, %%r11 ;" 1102 "adcq %%r14, %%r15 ;" 1103 "adcq $0, %%r13 ;" 1104 "movq $0, %%r14 ;" 1105 "adcq $0, %%r14 ;" 1106 1107 "movq 32(%1), %%rdx ;" /* B[0] */ 1108 "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */ 1109 1110 "addq %%rax, %%r10 ;" 1111 "adcq %%rcx, %%r11 ;" 1112 "adcq $0, %%r15 ;" 1113 "adcq $0, %%r13 ;" 1114 "adcq $0, %%r14 ;" 1115 1116 "shldq $1, %%r13, %%r14 ;" 1117 "shldq $1, %%r15, %%r13 ;" 1118 "shldq $1, %%r11, %%r15 ;" 1119 "shldq $1, %%r10, %%r11 ;" 1120 "shldq $1, %%r9, %%r10 ;" 1121 "shldq $1, %%r8, %%r9 ;" 1122 "shlq $1, %%r8 ;" 1123 1124 /*******************/ 1125 "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */ 1126 /*******************/ 1127 "movq %%rax, 64(%0) ;" 1128 "addq %%rcx, %%r8 ;" 1129 "movq %%r8, 72(%0) ;" 1130 "movq 40(%1), %%rdx ;" 1131 "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */ 1132 "adcq %%rax, %%r9 ;" 1133 "movq %%r9, 80(%0) ;" 1134 "adcq %%rcx, %%r10 ;" 1135 "movq %%r10, 88(%0) ;" 1136 "movq 48(%1), %%rdx ;" 1137 "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */ 1138 "adcq %%rax, 
%%r11 ;" 1139 "movq %%r11, 96(%0) ;" 1140 "adcq %%rcx, %%r15 ;" 1141 "movq %%r15, 104(%0) ;" 1142 "movq 56(%1), %%rdx ;" 1143 "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */ 1144 "adcq %%rax, %%r13 ;" 1145 "movq %%r13, 112(%0) ;" 1146 "adcq %%rcx, %%r14 ;" 1147 "movq %%r14, 120(%0) ;" 1148 : 1149 : "r"(c), "r"(a) 1150 : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", 1151 "%r11", "%r13", "%r14", "%r15"); 1152 } 1153 1154 static void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a) 1155 { 1156 asm volatile( 1157 "movl $38, %%edx; " /* 2*c = 38 = 2^256 */ 1158 "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */ 1159 "xorl %%ebx, %%ebx ;" 1160 "adox (%1), %%r8 ;" 1161 "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */ 1162 "adcx %%r10, %%r9 ;" 1163 "adox 8(%1), %%r9 ;" 1164 "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */ 1165 "adcx %%r11, %%r10 ;" 1166 "adox 16(%1), %%r10 ;" 1167 "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */ 1168 "adcx %%rax, %%r11 ;" 1169 "adox 24(%1), %%r11 ;" 1170 /***************************************/ 1171 "adcx %%rbx, %%rcx ;" 1172 "adox %%rbx, %%rcx ;" 1173 "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ 1174 "adcx %%rcx, %%r8 ;" 1175 "adcx %%rbx, %%r9 ;" 1176 "movq %%r9, 8(%0) ;" 1177 "adcx %%rbx, %%r10 ;" 1178 "movq %%r10, 16(%0) ;" 1179 "adcx %%rbx, %%r11 ;" 1180 "movq %%r11, 24(%0) ;" 1181 "mov $0, %%ecx ;" 1182 "cmovc %%edx, %%ecx ;" 1183 "addq %%rcx, %%r8 ;" 1184 "movq %%r8, (%0) ;" 1185 1186 "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */ 1187 "xorl %%ebx, %%ebx ;" 1188 "adox 64(%1), %%r8 ;" 1189 "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */ 1190 "adcx %%r10, %%r9 ;" 1191 "adox 72(%1), %%r9 ;" 1192 "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */ 1193 "adcx %%r11, %%r10 ;" 1194 "adox 80(%1), %%r10 ;" 1195 "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */ 1196 "adcx %%rax, %%r11 ;" 1197 "adox 88(%1), %%r11 ;" 1198 /****************************************/ 1199 "adcx %%rbx, %%rcx ;" 1200 "adox %%rbx, %%rcx ;" 1201 "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ 1202 
"adcx %%rcx, %%r8 ;" 1203 "adcx %%rbx, %%r9 ;" 1204 "movq %%r9, 40(%0) ;" 1205 "adcx %%rbx, %%r10 ;" 1206 "movq %%r10, 48(%0) ;" 1207 "adcx %%rbx, %%r11 ;" 1208 "movq %%r11, 56(%0) ;" 1209 "mov $0, %%ecx ;" 1210 "cmovc %%edx, %%ecx ;" 1211 "addq %%rcx, %%r8 ;" 1212 "movq %%r8, 32(%0) ;" 1213 : 1214 : "r"(c), "r"(a) 1215 : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", 1216 "%r10", "%r11"); 1217 } 1218 1219 static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a) 1220 { 1221 asm volatile( 1222 "movl $38, %%edx ; " /* 2*c = 38 = 2^256 */ 1223 "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ 1224 "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ 1225 "addq %%r10, %%r9 ;" 1226 "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ 1227 "adcq %%r11, %%r10 ;" 1228 "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ 1229 "adcq %%rax, %%r11 ;" 1230 /***************************************/ 1231 "adcq $0, %%rcx ;" 1232 "addq (%1), %%r8 ;" 1233 "adcq 8(%1), %%r9 ;" 1234 "adcq 16(%1), %%r10 ;" 1235 "adcq 24(%1), %%r11 ;" 1236 "adcq $0, %%rcx ;" 1237 "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ 1238 "addq %%rcx, %%r8 ;" 1239 "adcq $0, %%r9 ;" 1240 "movq %%r9, 8(%0) ;" 1241 "adcq $0, %%r10 ;" 1242 "movq %%r10, 16(%0) ;" 1243 "adcq $0, %%r11 ;" 1244 "movq %%r11, 24(%0) ;" 1245 "mov $0, %%ecx ;" 1246 "cmovc %%edx, %%ecx ;" 1247 "addq %%rcx, %%r8 ;" 1248 "movq %%r8, (%0) ;" 1249 1250 "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */ 1251 "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */ 1252 "addq %%r10, %%r9 ;" 1253 "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */ 1254 "adcq %%r11, %%r10 ;" 1255 "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */ 1256 "adcq %%rax, %%r11 ;" 1257 /****************************************/ 1258 "adcq $0, %%rcx ;" 1259 "addq 64(%1), %%r8 ;" 1260 "adcq 72(%1), %%r9 ;" 1261 "adcq 80(%1), %%r10 ;" 1262 "adcq 88(%1), %%r11 ;" 1263 "adcq $0, %%rcx ;" 1264 "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ 1265 "addq %%rcx, %%r8 ;" 1266 "adcq $0, %%r9 ;" 1267 "movq %%r9, 40(%0) ;" 1268 "adcq $0, %%r10 ;" 
1269 "movq %%r10, 48(%0) ;" 1270 "adcq $0, %%r11 ;" 1271 "movq %%r11, 56(%0) ;" 1272 "mov $0, %%ecx ;" 1273 "cmovc %%edx, %%ecx ;" 1274 "addq %%rcx, %%r8 ;" 1275 "movq %%r8, 32(%0) ;" 1276 : 1277 : "r"(c), "r"(a) 1278 : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", 1279 "%r11"); 1280 } 1281 1282 static void mul_256x256_integer_adx(u64 *const c, const u64 *const a, 1283 const u64 *const b) 1284 { 1285 asm volatile( 1286 "movq (%1), %%rdx; " /* A[0] */ 1287 "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */ 1288 "xorl %%r10d, %%r10d ;" 1289 "movq %%r8, (%0) ;" 1290 "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */ 1291 "adox %%r9, %%r10 ;" 1292 "movq %%r10, 8(%0) ;" 1293 "mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */ 1294 "adox %%r11, %%r15 ;" 1295 "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */ 1296 "adox %%r13, %%r14 ;" 1297 "movq $0, %%rax ;" 1298 /******************************************/ 1299 "adox %%rdx, %%rax ;" 1300 1301 "movq 8(%1), %%rdx; " /* A[1] */ 1302 "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ 1303 "xorl %%r10d, %%r10d ;" 1304 "adcx 8(%0), %%r8 ;" 1305 "movq %%r8, 8(%0) ;" 1306 "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ 1307 "adox %%r9, %%r10 ;" 1308 "adcx %%r15, %%r10 ;" 1309 "movq %%r10, 16(%0) ;" 1310 "mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */ 1311 "adox %%r11, %%r15 ;" 1312 "adcx %%r14, %%r15 ;" 1313 "movq $0, %%r8 ;" 1314 "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */ 1315 "adox %%r13, %%r14 ;" 1316 "adcx %%rax, %%r14 ;" 1317 "movq $0, %%rax ;" 1318 /******************************************/ 1319 "adox %%rdx, %%rax ;" 1320 "adcx %%r8, %%rax ;" 1321 1322 "movq 16(%1), %%rdx; " /* A[2] */ 1323 "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ 1324 "xorl %%r10d, %%r10d ;" 1325 "adcx 16(%0), %%r8 ;" 1326 "movq %%r8, 16(%0) ;" 1327 "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ 1328 "adox %%r9, %%r10 ;" 1329 "adcx %%r15, %%r10 ;" 1330 "movq %%r10, 24(%0) ;" 1331 "mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */ 1332 "adox %%r11, %%r15 ;" 1333 "adcx %%r14, 
%%r15 ;" 1334 "movq $0, %%r8 ;" 1335 "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */ 1336 "adox %%r13, %%r14 ;" 1337 "adcx %%rax, %%r14 ;" 1338 "movq $0, %%rax ;" 1339 /******************************************/ 1340 "adox %%rdx, %%rax ;" 1341 "adcx %%r8, %%rax ;" 1342 1343 "movq 24(%1), %%rdx; " /* A[3] */ 1344 "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ 1345 "xorl %%r10d, %%r10d ;" 1346 "adcx 24(%0), %%r8 ;" 1347 "movq %%r8, 24(%0) ;" 1348 "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ 1349 "adox %%r9, %%r10 ;" 1350 "adcx %%r15, %%r10 ;" 1351 "movq %%r10, 32(%0) ;" 1352 "mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */ 1353 "adox %%r11, %%r15 ;" 1354 "adcx %%r14, %%r15 ;" 1355 "movq %%r15, 40(%0) ;" 1356 "movq $0, %%r8 ;" 1357 "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */ 1358 "adox %%r13, %%r14 ;" 1359 "adcx %%rax, %%r14 ;" 1360 "movq %%r14, 48(%0) ;" 1361 "movq $0, %%rax ;" 1362 /******************************************/ 1363 "adox %%rdx, %%rax ;" 1364 "adcx %%r8, %%rax ;" 1365 "movq %%rax, 56(%0) ;" 1366 : 1367 : "r"(c), "r"(a), "r"(b) 1368 : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", 1369 "%r13", "%r14", "%r15"); 1370 } 1371 1372 static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a, 1373 const u64 *const b) 1374 { 1375 asm volatile( 1376 "movq (%1), %%rdx; " /* A[0] */ 1377 "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ 1378 "movq %%r8, (%0) ;" 1379 "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ 1380 "addq %%r10, %%r15 ;" 1381 "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ 1382 "adcq %%r8, %%rax ;" 1383 "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ 1384 "adcq %%r10, %%rbx ;" 1385 /******************************************/ 1386 "adcq $0, %%rcx ;" 1387 1388 "movq 8(%1), %%rdx; " /* A[1] */ 1389 "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ 1390 "addq %%r15, %%r8 ;" 1391 "movq %%r8, 8(%0) ;" 1392 "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ 1393 "adcq %%r10, %%r9 ;" 1394 "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ 1395 "adcq %%r8, %%r11 ;" 
1396 "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ 1397 "adcq %%r10, %%r13 ;" 1398 /******************************************/ 1399 "adcq $0, %%r15 ;" 1400 1401 "addq %%r9, %%rax ;" 1402 "adcq %%r11, %%rbx ;" 1403 "adcq %%r13, %%rcx ;" 1404 "adcq $0, %%r15 ;" 1405 1406 "movq 16(%1), %%rdx; " /* A[2] */ 1407 "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ 1408 "addq %%rax, %%r8 ;" 1409 "movq %%r8, 16(%0) ;" 1410 "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ 1411 "adcq %%r10, %%r9 ;" 1412 "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ 1413 "adcq %%r8, %%r11 ;" 1414 "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ 1415 "adcq %%r10, %%r13 ;" 1416 /******************************************/ 1417 "adcq $0, %%rax ;" 1418 1419 "addq %%r9, %%rbx ;" 1420 "adcq %%r11, %%rcx ;" 1421 "adcq %%r13, %%r15 ;" 1422 "adcq $0, %%rax ;" 1423 1424 "movq 24(%1), %%rdx; " /* A[3] */ 1425 "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ 1426 "addq %%rbx, %%r8 ;" 1427 "movq %%r8, 24(%0) ;" 1428 "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ 1429 "adcq %%r10, %%r9 ;" 1430 "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ 1431 "adcq %%r8, %%r11 ;" 1432 "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ 1433 "adcq %%r10, %%r13 ;" 1434 /******************************************/ 1435 "adcq $0, %%rbx ;" 1436 1437 "addq %%r9, %%rcx ;" 1438 "movq %%rcx, 32(%0) ;" 1439 "adcq %%r11, %%r15 ;" 1440 "movq %%r15, 40(%0) ;" 1441 "adcq %%r13, %%rax ;" 1442 "movq %%rax, 48(%0) ;" 1443 "adcq $0, %%rbx ;" 1444 "movq %%rbx, 56(%0) ;" 1445 : 1446 : "r"(c), "r"(a), "r"(b) 1447 : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", 1448 "%r10", "%r11", "%r13", "%r15"); 1449 } 1450 1451 static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a) 1452 { 1453 asm volatile( 1454 "movq (%1), %%rdx ;" /* A[0] */ 1455 "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ 1456 "xorl %%r15d, %%r15d;" 1457 "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ 1458 "adcx %%r14, %%r9 ;" 1459 "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ 1460 "adcx 
%%rax, %%r10 ;" 1461 "movq 24(%1), %%rdx ;" /* A[3] */ 1462 "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */ 1463 "adcx %%rcx, %%r11 ;" 1464 "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ 1465 "adcx %%rax, %%rbx ;" 1466 "movq 8(%1), %%rdx ;" /* A[1] */ 1467 "adcx %%r15, %%r13 ;" 1468 "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ 1469 "movq $0, %%r14 ;" 1470 /******************************************/ 1471 "adcx %%r15, %%r14 ;" 1472 1473 "xorl %%r15d, %%r15d;" 1474 "adox %%rax, %%r10 ;" 1475 "adcx %%r8, %%r8 ;" 1476 "adox %%rcx, %%r11 ;" 1477 "adcx %%r9, %%r9 ;" 1478 "adox %%r15, %%rbx ;" 1479 "adcx %%r10, %%r10 ;" 1480 "adox %%r15, %%r13 ;" 1481 "adcx %%r11, %%r11 ;" 1482 "adox %%r15, %%r14 ;" 1483 "adcx %%rbx, %%rbx ;" 1484 "adcx %%r13, %%r13 ;" 1485 "adcx %%r14, %%r14 ;" 1486 1487 "movq (%1), %%rdx ;" 1488 "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ 1489 /*******************/ 1490 "movq %%rax, 0(%0) ;" 1491 "addq %%rcx, %%r8 ;" 1492 "movq %%r8, 8(%0) ;" 1493 "movq 8(%1), %%rdx ;" 1494 "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ 1495 "adcq %%rax, %%r9 ;" 1496 "movq %%r9, 16(%0) ;" 1497 "adcq %%rcx, %%r10 ;" 1498 "movq %%r10, 24(%0) ;" 1499 "movq 16(%1), %%rdx ;" 1500 "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ 1501 "adcq %%rax, %%r11 ;" 1502 "movq %%r11, 32(%0) ;" 1503 "adcq %%rcx, %%rbx ;" 1504 "movq %%rbx, 40(%0) ;" 1505 "movq 24(%1), %%rdx ;" 1506 "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ 1507 "adcq %%rax, %%r13 ;" 1508 "movq %%r13, 48(%0) ;" 1509 "adcq %%rcx, %%r14 ;" 1510 "movq %%r14, 56(%0) ;" 1511 : 1512 : "r"(c), "r"(a) 1513 : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", 1514 "%r10", "%r11", "%r13", "%r14", "%r15"); 1515 } 1516 1517 static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a) 1518 { 1519 asm volatile( 1520 "movq 8(%1), %%rdx ;" /* A[1] */ 1521 "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */ 1522 "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */ 1523 "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */ 1524 1525 "movq 16(%1), %%rdx ;" /* 
A[2] */ 1526 "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */ 1527 "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */ 1528 1529 "addq %%rax, %%r9 ;" 1530 "adcq %%rdx, %%r10 ;" 1531 "adcq %%rcx, %%r11 ;" 1532 "adcq %%r14, %%r15 ;" 1533 "adcq $0, %%r13 ;" 1534 "movq $0, %%r14 ;" 1535 "adcq $0, %%r14 ;" 1536 1537 "movq (%1), %%rdx ;" /* A[0] */ 1538 "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */ 1539 1540 "addq %%rax, %%r10 ;" 1541 "adcq %%rcx, %%r11 ;" 1542 "adcq $0, %%r15 ;" 1543 "adcq $0, %%r13 ;" 1544 "adcq $0, %%r14 ;" 1545 1546 "shldq $1, %%r13, %%r14 ;" 1547 "shldq $1, %%r15, %%r13 ;" 1548 "shldq $1, %%r11, %%r15 ;" 1549 "shldq $1, %%r10, %%r11 ;" 1550 "shldq $1, %%r9, %%r10 ;" 1551 "shldq $1, %%r8, %%r9 ;" 1552 "shlq $1, %%r8 ;" 1553 1554 /*******************/ 1555 "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ 1556 /*******************/ 1557 "movq %%rax, 0(%0) ;" 1558 "addq %%rcx, %%r8 ;" 1559 "movq %%r8, 8(%0) ;" 1560 "movq 8(%1), %%rdx ;" 1561 "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ 1562 "adcq %%rax, %%r9 ;" 1563 "movq %%r9, 16(%0) ;" 1564 "adcq %%rcx, %%r10 ;" 1565 "movq %%r10, 24(%0) ;" 1566 "movq 16(%1), %%rdx ;" 1567 "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ 1568 "adcq %%rax, %%r11 ;" 1569 "movq %%r11, 32(%0) ;" 1570 "adcq %%rcx, %%r15 ;" 1571 "movq %%r15, 40(%0) ;" 1572 "movq 24(%1), %%rdx ;" 1573 "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ 1574 "adcq %%rax, %%r13 ;" 1575 "movq %%r13, 48(%0) ;" 1576 "adcq %%rcx, %%r14 ;" 1577 "movq %%r14, 56(%0) ;" 1578 : 1579 : "r"(c), "r"(a) 1580 : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", 1581 "%r11", "%r13", "%r14", "%r15"); 1582 } 1583 1584 static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a) 1585 { 1586 asm volatile( 1587 "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */ 1588 "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ 1589 "xorl %%ebx, %%ebx ;" 1590 "adox (%1), %%r8 ;" 1591 "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ 1592 "adcx %%r10, %%r9 ;" 1593 "adox 8(%1), %%r9 ;" 1594 "mulx 48(%1), %%r10, %%rax ;" 
/* c*C[6] */ 1595 "adcx %%r11, %%r10 ;" 1596 "adox 16(%1), %%r10 ;" 1597 "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ 1598 "adcx %%rax, %%r11 ;" 1599 "adox 24(%1), %%r11 ;" 1600 /***************************************/ 1601 "adcx %%rbx, %%rcx ;" 1602 "adox %%rbx, %%rcx ;" 1603 "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ 1604 "adcx %%rcx, %%r8 ;" 1605 "adcx %%rbx, %%r9 ;" 1606 "movq %%r9, 8(%0) ;" 1607 "adcx %%rbx, %%r10 ;" 1608 "movq %%r10, 16(%0) ;" 1609 "adcx %%rbx, %%r11 ;" 1610 "movq %%r11, 24(%0) ;" 1611 "mov $0, %%ecx ;" 1612 "cmovc %%edx, %%ecx ;" 1613 "addq %%rcx, %%r8 ;" 1614 "movq %%r8, (%0) ;" 1615 : 1616 : "r"(c), "r"(a) 1617 : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", 1618 "%r10", "%r11"); 1619 } 1620 1621 static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a) 1622 { 1623 asm volatile( 1624 "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */ 1625 "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ 1626 "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ 1627 "addq %%r10, %%r9 ;" 1628 "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ 1629 "adcq %%r11, %%r10 ;" 1630 "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ 1631 "adcq %%rax, %%r11 ;" 1632 /***************************************/ 1633 "adcq $0, %%rcx ;" 1634 "addq (%1), %%r8 ;" 1635 "adcq 8(%1), %%r9 ;" 1636 "adcq 16(%1), %%r10 ;" 1637 "adcq 24(%1), %%r11 ;" 1638 "adcq $0, %%rcx ;" 1639 "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ 1640 "addq %%rcx, %%r8 ;" 1641 "adcq $0, %%r9 ;" 1642 "movq %%r9, 8(%0) ;" 1643 "adcq $0, %%r10 ;" 1644 "movq %%r10, 16(%0) ;" 1645 "adcq $0, %%r11 ;" 1646 "movq %%r11, 24(%0) ;" 1647 "mov $0, %%ecx ;" 1648 "cmovc %%edx, %%ecx ;" 1649 "addq %%rcx, %%r8 ;" 1650 "movq %%r8, (%0) ;" 1651 : 1652 : "r"(c), "r"(a) 1653 : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", 1654 "%r11"); 1655 } 1656 1657 static __always_inline void 1658 add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b) 1659 { 1660 asm volatile( 1661 "mov $38, %%eax ;" 1662 "xorl %%ecx, 
%%ecx ;" 1663 "movq (%2), %%r8 ;" 1664 "adcx (%1), %%r8 ;" 1665 "movq 8(%2), %%r9 ;" 1666 "adcx 8(%1), %%r9 ;" 1667 "movq 16(%2), %%r10 ;" 1668 "adcx 16(%1), %%r10 ;" 1669 "movq 24(%2), %%r11 ;" 1670 "adcx 24(%1), %%r11 ;" 1671 "cmovc %%eax, %%ecx ;" 1672 "xorl %%eax, %%eax ;" 1673 "adcx %%rcx, %%r8 ;" 1674 "adcx %%rax, %%r9 ;" 1675 "movq %%r9, 8(%0) ;" 1676 "adcx %%rax, %%r10 ;" 1677 "movq %%r10, 16(%0) ;" 1678 "adcx %%rax, %%r11 ;" 1679 "movq %%r11, 24(%0) ;" 1680 "mov $38, %%ecx ;" 1681 "cmovc %%ecx, %%eax ;" 1682 "addq %%rax, %%r8 ;" 1683 "movq %%r8, (%0) ;" 1684 : 1685 : "r"(c), "r"(a), "r"(b) 1686 : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); 1687 } 1688 1689 static __always_inline void 1690 add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b) 1691 { 1692 asm volatile( 1693 "mov $38, %%eax ;" 1694 "movq (%2), %%r8 ;" 1695 "addq (%1), %%r8 ;" 1696 "movq 8(%2), %%r9 ;" 1697 "adcq 8(%1), %%r9 ;" 1698 "movq 16(%2), %%r10 ;" 1699 "adcq 16(%1), %%r10 ;" 1700 "movq 24(%2), %%r11 ;" 1701 "adcq 24(%1), %%r11 ;" 1702 "mov $0, %%ecx ;" 1703 "cmovc %%eax, %%ecx ;" 1704 "addq %%rcx, %%r8 ;" 1705 "adcq $0, %%r9 ;" 1706 "movq %%r9, 8(%0) ;" 1707 "adcq $0, %%r10 ;" 1708 "movq %%r10, 16(%0) ;" 1709 "adcq $0, %%r11 ;" 1710 "movq %%r11, 24(%0) ;" 1711 "mov $0, %%ecx ;" 1712 "cmovc %%eax, %%ecx ;" 1713 "addq %%rcx, %%r8 ;" 1714 "movq %%r8, (%0) ;" 1715 : 1716 : "r"(c), "r"(a), "r"(b) 1717 : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); 1718 } 1719 1720 static __always_inline void 1721 sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b) 1722 { 1723 asm volatile( 1724 "mov $38, %%eax ;" 1725 "movq (%1), %%r8 ;" 1726 "subq (%2), %%r8 ;" 1727 "movq 8(%1), %%r9 ;" 1728 "sbbq 8(%2), %%r9 ;" 1729 "movq 16(%1), %%r10 ;" 1730 "sbbq 16(%2), %%r10 ;" 1731 "movq 24(%1), %%r11 ;" 1732 "sbbq 24(%2), %%r11 ;" 1733 "mov $0, %%ecx ;" 1734 "cmovc %%eax, %%ecx ;" 1735 "subq %%rcx, %%r8 ;" 1736 "sbbq $0, %%r9 ;" 1737 
		"movq %%r9, 8(%0) ;"
		"sbbq $0, %%r10 ;"
		"movq %%r10, 16(%0) ;"
		"sbbq $0, %%r11 ;"
		"movq %%r11, 24(%0) ;"
		"mov $0, %%ecx ;"
		"cmovc %%eax, %%ecx ;"
		"subq %%rcx, %%r8 ;"
		"movq %%r8, (%0) ;"
		:
		: "r"(c), "r"(a), "r"(b)
		: "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
}

/*
 * Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666
 *
 * Computes c = a24 * a on four 64-bit words. The fifth (overflow) word of
 * the product is multiplied by 38 and added back into the low words, and a
 * carry out of that addition is folded in once more, so the value stored
 * in c fits in 256 bits.
 *
 * The multiplication uses the mulx instruction.
 */
static __always_inline void
mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a)
{
	const u64 a24 = 121666;
	asm volatile(
		"movq %2, %%rdx ;" /* mulx takes its implicit multiplicand from rdx */
		"mulx (%1), %%r8, %%r10 ;"
		"mulx 8(%1), %%r9, %%r11 ;"
		"addq %%r10, %%r9 ;"
		"mulx 16(%1), %%r10, %%rax ;"
		"adcq %%r11, %%r10 ;"
		"mulx 24(%1), %%r11, %%rcx ;"
		"adcq %%rax, %%r11 ;"
		/**************************/
		"adcq $0, %%rcx ;" /* rcx = overflow word of the 5-word product */
		"movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/
		"imul %%rdx, %%rcx ;"
		"addq %%rcx, %%r8 ;"
		"adcq $0, %%r9 ;"
		"movq %%r9, 8(%0) ;"
		"adcq $0, %%r10 ;"
		"movq %%r10, 16(%0) ;"
		"adcq $0, %%r11 ;"
		"movq %%r11, 24(%0) ;"
		"mov $0, %%ecx ;"
		"cmovc %%edx, %%ecx ;" /* carry out of the fold ? 38 : 0 */
		"addq %%rcx, %%r8 ;"
		"movq %%r8, (%0) ;"
		:
		: "r"(c), "r"(a), "r"(a24)
		: "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
		  "%r11");
}

/*
 * Field inversion (ADX variant): c = a^(2^255 - 21), computed with a fixed
 * chain of squarings and multiplications. With p = 2^255 - 19 this exponent
 * is p - 2; the in-source note on T[1] calls the result x^(-1).
 *
 * The local struct must be named "m" and carry a "buffer" member: the
 * mul_/sqr_ helper macros reference m.buffer by name.
 *
 * NOTE(review): a is read again (first multiplication) after c has already
 * been overwritten, so c must not alias a.
 */
static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
{
	struct {
		eltfp25519_1w_buffer buffer;
		eltfp25519_1w x0, x1, x2;
	} __aligned(32) m;
	u64 *T[4];

	T[0] = m.x0;
	T[1] = c; /* x^(-1) */
	T[2] = m.x1;
	T[3] = m.x2;

	copy_eltfp25519_1w(T[1], a);
	sqrn_eltfp25519_1w_adx(T[1], 1);		/* T1 = a^2 */
	copy_eltfp25519_1w(T[2], T[1]);
	sqrn_eltfp25519_1w_adx(T[2], 2);		/* T2 = a^8 */
	mul_eltfp25519_1w_adx(T[0], a, T[2]);		/* T0 = a^9 */
	mul_eltfp25519_1w_adx(T[1], T[1], T[0]);	/* T1 = a^11 */
	copy_eltfp25519_1w(T[2], T[1]);
	sqrn_eltfp25519_1w_adx(T[2], 1);		/* T2 = a^22 */
	mul_eltfp25519_1w_adx(T[0], T[0], T[2]);	/* T0 = a^(2^5-1) */
	copy_eltfp25519_1w(T[2], T[0]);
	sqrn_eltfp25519_1w_adx(T[2], 5);
	mul_eltfp25519_1w_adx(T[0], T[0], T[2]);	/* T0 = a^(2^10-1) */
	copy_eltfp25519_1w(T[2], T[0]);
	sqrn_eltfp25519_1w_adx(T[2], 10);
	mul_eltfp25519_1w_adx(T[2], T[2], T[0]);	/* T2 = a^(2^20-1) */
	copy_eltfp25519_1w(T[3], T[2]);
	sqrn_eltfp25519_1w_adx(T[3], 20);
	mul_eltfp25519_1w_adx(T[3], T[3], T[2]);	/* T3 = a^(2^40-1) */
	sqrn_eltfp25519_1w_adx(T[3], 10);
	mul_eltfp25519_1w_adx(T[3], T[3], T[0]);	/* T3 = a^(2^50-1) */
	copy_eltfp25519_1w(T[0], T[3]);
	sqrn_eltfp25519_1w_adx(T[0], 50);
	mul_eltfp25519_1w_adx(T[0], T[0], T[3]);	/* T0 = a^(2^100-1) */
	copy_eltfp25519_1w(T[2], T[0]);
	sqrn_eltfp25519_1w_adx(T[2], 100);
	mul_eltfp25519_1w_adx(T[2], T[2], T[0]);	/* T2 = a^(2^200-1) */
	sqrn_eltfp25519_1w_adx(T[2], 50);
	mul_eltfp25519_1w_adx(T[2], T[2], T[3]);	/* T2 = a^(2^250-1) */
	sqrn_eltfp25519_1w_adx(T[2], 5);		/* T2 = a^(2^255-2^5) */
	mul_eltfp25519_1w_adx(T[1], T[1], T[2]);	/* c = a^(2^255-21) */

	/* The scratch values are derived from the input; wipe them. */
	memzero_explicit(&m, sizeof(m));
}

/*
 * Field inversion (BMI2 variant): same square-and-multiply chain as
 * inv_eltfp25519_1w_adx(), using the *_bmi2 helper macros, so
 * c = a^(2^255 - 21).
 *
 * As above, the struct must be named "m" with a "buffer" member, and c must
 * not alias a.
 *
 * NOTE(review): T is declared with five slots but only T[0]..T[3] are
 * assigned or used; the ADX variant declares four.
 */
static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
{
	struct {
		eltfp25519_1w_buffer buffer;
		eltfp25519_1w x0, x1, x2;
	} __aligned(32) m;
	u64 *T[5];

	T[0] = m.x0;
	T[1] = c; /* x^(-1) */
	T[2] = m.x1;
	T[3] = m.x2;

	copy_eltfp25519_1w(T[1], a);
	sqrn_eltfp25519_1w_bmi2(T[1], 1);		/* T1 = a^2 */
	copy_eltfp25519_1w(T[2], T[1]);
	sqrn_eltfp25519_1w_bmi2(T[2], 2);		/* T2 = a^8 */
	mul_eltfp25519_1w_bmi2(T[0], a, T[2]);		/* T0 = a^9 */
	mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]);	/* T1 = a^11 */
	copy_eltfp25519_1w(T[2], T[1]);
	sqrn_eltfp25519_1w_bmi2(T[2], 1);		/* T2 = a^22 */
	mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);	/* T0 = a^(2^5-1) */
	copy_eltfp25519_1w(T[2], T[0]);
	sqrn_eltfp25519_1w_bmi2(T[2], 5);
	mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);	/* T0 = a^(2^10-1) */
	copy_eltfp25519_1w(T[2], T[0]);
	sqrn_eltfp25519_1w_bmi2(T[2], 10);
	mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);	/* T2 = a^(2^20-1) */
	copy_eltfp25519_1w(T[3], T[2]);
	sqrn_eltfp25519_1w_bmi2(T[3], 20);
	mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]);	/* T3 = a^(2^40-1) */
	sqrn_eltfp25519_1w_bmi2(T[3], 10);
	mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]);	/* T3 = a^(2^50-1) */
	copy_eltfp25519_1w(T[0], T[3]);
	sqrn_eltfp25519_1w_bmi2(T[0], 50);
	mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]);	/* T0 = a^(2^100-1) */
	copy_eltfp25519_1w(T[2], T[0]);
	sqrn_eltfp25519_1w_bmi2(T[2], 100);
	mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);	/* T2 = a^(2^200-1) */
	sqrn_eltfp25519_1w_bmi2(T[2], 50);
	mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]);	/* T2 = a^(2^250-1) */
	sqrn_eltfp25519_1w_bmi2(T[2], 5);		/* T2 = a^(2^255-2^5) */
	mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]);	/* c = a^(2^255-21) */

	/* The scratch values are derived from the input; wipe them. */
	memzero_explicit(&m, sizeof(m));
}

/* Given c, a 256-bit number, fred_eltfp25519_1w updates c
 * with a number such that 0 <= C < 2**255-19.
 *
 * The four words of c are operated on in registers ("+r" operands) and the
 * sequence contains no branches: the amounts to add and subtract are picked
 * with cmov. tmp0 starts at 38 and tmp1 at 19; tmp0 is reused as the
 * addend/subtrahend.
 */
static __always_inline void fred_eltfp25519_1w(u64 *const c)
{
	u64 tmp0 = 38, tmp1 = 19;
	asm volatile(
		"btrq $63, %3 ;" /* Put bit 255 in carry flag and clear */
		"cmovncl %k5, %k4 ;" /* c[255] ? 38 : 19 */

		/* Add either 19 or 38 to c */
		"addq %4, %0 ;"
		"adcq $0, %1 ;"
		"adcq $0, %2 ;"
		"adcq $0, %3 ;"

		/* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */
		"movl $0, %k4 ;" /* mov leaves the sign flag from the adcq above intact */
		"cmovnsl %k5, %k4 ;" /* c[255] ? 0 : 19 */
		"btrq $63, %3 ;" /* Clear bit 255 */

		/* Subtract 19 if necessary */
		"subq %4, %0 ;"
		"sbbq $0, %1 ;"
		"sbbq $0, %2 ;"
		"sbbq $0, %3 ;"

		: "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0),
		  "+r"(tmp1)
		:
		: "memory", "cc");
}

/*
 * Conditional swap: if bit is non-zero, exchange the four 64-bit words at px
 * with the four at py; otherwise leave both unchanged.
 *
 * Operands %0-%3 are px[0..3], %4-%7 are py[0..3], %8 is a scratch register
 * and %9 is bit. The flags are set once by the initial test and each word is
 * exchanged with mov/cmovnz, so the asm itself contains no branch on bit.
 */
static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py)
{
	u64 temp;
	asm volatile(
		"test %9, %9 ;"
		"movq %0, %8 ;"
		"cmovnzq %4, %0 ;"
		"cmovnzq %8, %4 ;"
		"movq %1, %8 ;"
		"cmovnzq %5, %1 ;"
		"cmovnzq %8, %5 ;"
		"movq %2, %8 ;"
		"cmovnzq %6, %2 ;"
		"cmovnzq %8, %6 ;"
		"movq %3, %8 ;"
		"cmovnzq %7, %3 ;"
		"cmovnzq %8, %7 ;"
		: "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]),
		  "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]),
		  "=r"(temp)
		: "r"(bit)
		: "cc"
	);
}

/*
 * Conditional select: if bit is non-zero, overwrite the four 64-bit words at
 * px with the four at py; otherwise leave px unchanged. py is only read.
 * Implemented with test + cmovnz, so the asm contains no branch on bit.
 */
static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py)
{
	asm volatile(
		"test %4, %4 ;"
		"cmovnzq %5, %0 ;"
		"cmovnzq %6, %1 ;"
		"cmovnzq %7, %2 ;"
		"cmovnzq %8, %3 ;"
		: "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3])
		: "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3])
		: "cc"
	);
}

/*
 * Variable-base scalar multiplication, ADX variant.
 *
 * Copies private_key and session_key into a local scratch struct, clamps the
 * private scalar, masks the top bit of the session (peer) value, then runs a
 * ladder over scalar bits 254 down to 0. The projective result Qx/Qz is
 * converted with one field inversion, fully reduced with
 * fred_eltfp25519_1w() and written to shared. All scratch state, including
 * the key copies, is wiped before returning.
 *
 * The struct must be named "m" with a "buffer" member because the mul_/sqr_
 * helper macros reference m.buffer by name.
 *
 * NOTE(review): shared is written through a (u64 *) cast; this presumably
 * relies on x86 tolerating unaligned 64-bit stores - confirm.
 */
static void curve25519_adx(u8 shared[CURVE25519_KEY_SIZE],
			   const u8 private_key[CURVE25519_KEY_SIZE],
			   const u8 session_key[CURVE25519_KEY_SIZE])
{
	struct {
		u64 buffer[4 * NUM_WORDS_ELTFP25519];
		u64 coordinates[4 * NUM_WORDS_ELTFP25519];
		u64 workspace[6 * NUM_WORDS_ELTFP25519];
		u8 session[CURVE25519_KEY_SIZE];
		u8 private[CURVE25519_KEY_SIZE];
	} __aligned(32) m;

	int i = 0, j = 0;
	u64 prev = 0;
	u64 *const X1 = (u64 *)m.session;
	u64 *const key = (u64 *)m.private;
	/* coordinates holds four field elements: Px, Pz, Qx, Qz */
	u64 *const Px = m.coordinates + 0;
	u64 *const Pz = m.coordinates + 4;
	u64 *const Qx = m.coordinates + 8;
	u64 *const Qz = m.coordinates + 12;
	/* Aliases: (X2,Z2) is Q, (X3,Z3) is P; X2Z2/X3Z3 name the adjacent pairs */
	u64 *const X2 = Qx;
	u64 *const Z2 = Qz;
	u64 *const X3 = Px;
	u64 *const Z3 = Pz;
	u64 *const X2Z2 = Qx;
	u64 *const X3Z3 = Px;

	/* workspace holds six field elements, laid out A, B, D, C, DA, CB */
	u64 *const A = m.workspace + 0;
	u64 *const B = m.workspace + 4;
	u64 *const D = m.workspace + 8;
	u64 *const C = m.workspace + 12;
	u64 *const DA = m.workspace + 16;
	u64 *const CB = m.workspace + 20;
	u64 *const AB = A;
	u64 *const DC = D;
	u64 *const DACB = DA;

	memcpy(m.private, private_key, sizeof(m.private));
	memcpy(m.session, session_key, sizeof(m.session));

	curve25519_clamp_secret(m.private);

	/* As in the draft:
	 * When receiving such an array, implementations of curve25519
	 * MUST mask the most-significant bit in the final byte. This
	 * is done to preserve compatibility with point formats which
	 * reserve the sign bit for use in other protocols and to
	 * increase resistance to implementation fingerprinting
	 */
	m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;

	/* P = (X1 : 1), Q = (1 : 0) */
	copy_eltfp25519_1w(Px, X1);
	setzero_eltfp25519_1w(Pz);
	setzero_eltfp25519_1w(Qx);
	setzero_eltfp25519_1w(Qz);

	Pz[0] = 1;
	Qx[0] = 1;

	/* main-loop */
	prev = 0;
	j = 62; /* start at bit 62 of key[3], i.e. scalar bit 254 */
	for (i = 3; i >= 0; --i) {
		while (j >= 0) {
			u64 bit = (key[i] >> j) & 0x1;
			u64 swap = bit ^ prev; /* 1 only when the bit differs from the previous one */
			prev = bit;

			add_eltfp25519_1w_adx(A, X2, Z2);	/* A = (X2+Z2) */
			sub_eltfp25519_1w(B, X2, Z2);		/* B = (X2-Z2) */
			add_eltfp25519_1w_adx(C, X3, Z3);	/* C = (X3+Z3) */
			sub_eltfp25519_1w(D, X3, Z3);		/* D = (X3-Z3) */
			mul_eltfp25519_2w_adx(DACB, AB, DC);	/* [DA|CB] = [A|B]*[D|C] */

			/* if swap: A = C and B = D (selected without branching) */
			cselect(swap, A, C);
			cselect(swap, B, D);

			sqr_eltfp25519_2w_adx(AB);		/* [AA|BB] = [A^2|B^2] */
			add_eltfp25519_1w_adx(X3, DA, CB);	/* X3 = (DA+CB) */
			sub_eltfp25519_1w(Z3, DA, CB);		/* Z3 = (DA-CB) */
			sqr_eltfp25519_2w_adx(X3Z3);		/* [X3|Z3] = [(DA+CB)|(DA-CB)]^2 */

			copy_eltfp25519_1w(X2, B);		/* X2 = B^2 */
			sub_eltfp25519_1w(Z2, A, B);		/* Z2 = E = AA-BB */

			mul_a24_eltfp25519_1w(B, Z2);		/* B = a24*E */
			add_eltfp25519_1w_adx(B, B, X2);	/* B = a24*E+B */
			mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB);	/* [X2|Z2] = [B|E]*[A|a24*E+B] */
			mul_eltfp25519_1w_adx(Z3, Z3, X1);	/* Z3 = Z3*X1 */
			--j;
		}
		j = 63; /* remaining words are scanned from their top bit */
	}

	/* shared = Qx / Qz, fully reduced */
	inv_eltfp25519_1w_adx(A, Qz);
	mul_eltfp25519_1w_adx((u64 *)shared, Qx, A);
	fred_eltfp25519_1w((u64 *)shared);

	memzero_explicit(&m, sizeof(m));
}

/*
 * Fixed-base scalar multiplication, ADX variant.
 *
 * Copies and clamps private_key, then processes scalar bits 3..254 with a
 * ladder that multiplies by one four-word entry of the precomputed table
 * table_ladder_8k per bit (entry index k = bit position - 3). The three
 * skipped low bits are accounted for by three doublings afterwards. The
 * projective result Ur1/Zr1 is converted with one field inversion, fully
 * reduced with fred_eltfp25519_1w() and written to session_key. All scratch
 * state is wiped before returning.
 *
 * The struct must be named "m" with a "buffer" member because the mul_/sqr_
 * helper macros reference m.buffer by name.
 */
static void curve25519_adx_base(u8 session_key[CURVE25519_KEY_SIZE],
				const u8 private_key[CURVE25519_KEY_SIZE])
{
	struct {
		u64 buffer[4 * NUM_WORDS_ELTFP25519];
		u64 coordinates[4 * NUM_WORDS_ELTFP25519];
		u64 workspace[4 * NUM_WORDS_ELTFP25519];
		u8 private[CURVE25519_KEY_SIZE];
	} __aligned(32) m;

	/* Upper bit bound per key word: bit 63 of the last word is not scanned */
	const int ite[4] = { 64, 64, 64, 63 };
	const int q = 3; /* first scalar bit scanned; also the number of final doublings */
	u64 swap = 1;

	int i = 0, j = 0, k = 0;
	u64 *const key = (u64 *)m.private;
	u64 *const Ur1 = m.coordinates + 0;
	u64 *const Zr1 = m.coordinates + 4;
	u64 *const Ur2 = m.coordinates + 8;
	u64 *const Zr2 = m.coordinates + 12;

	/* Names for the adjacent pairs [Ur1|Zr1] and [Ur2|Zr2] */
	u64 *const UZr1 = m.coordinates + 0;
	u64 *const ZUr2 = m.coordinates + 8;

	u64 *const A = m.workspace + 0;
	u64 *const B = m.workspace + 4;
	u64 *const C = m.workspace + 8;
	u64 *const D = m.workspace + 12;

	u64 *const AB = m.workspace + 0;
	u64 *const CD = m.workspace + 8;

	const u64 *const P = table_ladder_8k;

	memcpy(m.private, private_key, sizeof(m.private));

	curve25519_clamp_secret(m.private);

	setzero_eltfp25519_1w(Ur1);
	setzero_eltfp25519_1w(Zr1);
	setzero_eltfp25519_1w(Zr2);
	Ur1[0] = 1;
	Zr1[0] = 1;
	Zr2[0] = 1;

	/* G-S */
	Ur2[3] = 0x1eaecdeee27cab34UL;
	Ur2[2] = 0xadc7a0b9235d48e2UL;
	Ur2[1] = 0xbbf095ae14b2edf8UL;
	Ur2[0] = 0x7e94e1fec82faabdUL;

	/* main-loop */
	j = q;
	for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
		while (j < ite[i]) {
			u64 bit = (key[i] >> j) & 0x1;
			k = (64 * i + j - q); /* table entry for this bit: 0..251 */
			swap = swap ^ bit;
			cswap(swap, Ur1, Ur2);
			cswap(swap, Zr1, Zr2);
			swap = bit;
			/* Addition */
			sub_eltfp25519_1w(B, Ur1, Zr1);		/* B = Ur1-Zr1 */
			add_eltfp25519_1w_adx(A, Ur1, Zr1);	/* A = Ur1+Zr1 */
			mul_eltfp25519_1w_adx(C, &P[4 * k], B);	/* C = M*B, M = P[4k..4k+3] */
			sub_eltfp25519_1w(B, A, C);		/* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
			add_eltfp25519_1w_adx(A, A, C);		/* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
			sqr_eltfp25519_2w_adx(AB);		/* A = A^2 | B = B^2 */
			mul_eltfp25519_2w_adx(UZr1, ZUr2, AB);	/* Ur1 = Zr2*A | Zr1 = Ur2*B */
			++j;
		}
		j = 0; /* subsequent words are scanned from bit 0 */
	}

	/* Doubling */
	for (i = 0; i < q; ++i) {
		add_eltfp25519_1w_adx(A, Ur1, Zr1);	/* A = Ur1+Zr1 */
		sub_eltfp25519_1w(B, Ur1, Zr1);		/* B = Ur1-Zr1 */
		sqr_eltfp25519_2w_adx(AB);		/* A = A**2 B = B**2 */
		copy_eltfp25519_1w(C, B);		/* C = B */
		sub_eltfp25519_1w(B, A, B);		/* B = A-B */
		mul_a24_eltfp25519_1w(D, B);		/* D = my_a24*B */
		add_eltfp25519_1w_adx(D, D, C);		/* D = D+C */
		mul_eltfp25519_2w_adx(UZr1, AB, CD);	/* [Ur1|Zr1] = [A|B]*[C|D] */
	}

	/* Convert to affine coordinates */
	inv_eltfp25519_1w_adx(A, Zr1);
	mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A);
	fred_eltfp25519_1w((u64 *)session_key);

	memzero_explicit(&m, sizeof(m));
}

/*
 * Variable-base scalar multiplication, BMI2 variant.
 *
 * Same sequence of steps as curve25519_adx(), using the *_bmi2 field
 * helpers: copy and clamp the private scalar, mask the top bit of the
 * session (peer) value, run the ladder over scalar bits 254 down to 0,
 * convert Qx/Qz with one inversion, fully reduce, write to shared and wipe
 * all scratch state.
 *
 * The struct must be named "m" with a "buffer" member because the mul_/sqr_
 * helper macros reference m.buffer by name.
 */
static void curve25519_bmi2(u8 shared[CURVE25519_KEY_SIZE],
			    const u8 private_key[CURVE25519_KEY_SIZE],
			    const u8 session_key[CURVE25519_KEY_SIZE])
{
	struct {
		u64 buffer[4 * NUM_WORDS_ELTFP25519];
		u64 coordinates[4 * NUM_WORDS_ELTFP25519];
		u64 workspace[6 * NUM_WORDS_ELTFP25519];
		u8 session[CURVE25519_KEY_SIZE];
		u8 private[CURVE25519_KEY_SIZE];
	} __aligned(32) m;

	int i = 0, j = 0;
	u64 prev = 0;
	u64 *const X1 = (u64 *)m.session;
	u64 *const key = (u64 *)m.private;
	/* coordinates holds four field elements: Px, Pz, Qx, Qz */
	u64 *const Px = m.coordinates + 0;
	u64 *const Pz = m.coordinates + 4;
	u64 *const Qx = m.coordinates + 8;
	u64 *const Qz = m.coordinates + 12;
	/* Aliases: (X2,Z2) is Q, (X3,Z3) is P; X2Z2/X3Z3 name the adjacent pairs */
	u64 *const X2 = Qx;
	u64 *const Z2 = Qz;
	u64 *const X3 = Px;
	u64 *const Z3 = Pz;
	u64 *const X2Z2 = Qx;
	u64 *const X3Z3 = Px;

	/* workspace holds six field elements, laid out A, B, D, C, DA, CB */
	u64 *const A = m.workspace + 0;
	u64 *const B = m.workspace + 4;
	u64 *const D = m.workspace + 8;
	u64 *const C = m.workspace + 12;
	u64 *const DA = m.workspace + 16;
	u64 *const CB = m.workspace + 20;
	u64 *const AB = A;
	u64 *const DC = D;
	u64 *const DACB = DA;

	memcpy(m.private, private_key, sizeof(m.private));
	memcpy(m.session, session_key, sizeof(m.session));

	curve25519_clamp_secret(m.private);

	/* As in the draft:
	 * When receiving such an array, implementations of curve25519
	 * MUST mask the most-significant bit in the final byte. This
	 * is done to preserve compatibility with point formats which
	 * reserve the sign bit for use in other protocols and to
	 * increase resistance to implementation fingerprinting
	 */
	m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;

	/* P = (X1 : 1), Q = (1 : 0) */
	copy_eltfp25519_1w(Px, X1);
	setzero_eltfp25519_1w(Pz);
	setzero_eltfp25519_1w(Qx);
	setzero_eltfp25519_1w(Qz);

	Pz[0] = 1;
	Qx[0] = 1;

	/* main-loop */
	prev = 0;
	j = 62; /* start at bit 62 of key[3], i.e. scalar bit 254 */
	for (i = 3; i >= 0; --i) {
		while (j >= 0) {
			u64 bit = (key[i] >> j) & 0x1;
			u64 swap = bit ^ prev; /* 1 only when the bit differs from the previous one */
			prev = bit;

			add_eltfp25519_1w_bmi2(A, X2, Z2);	/* A = (X2+Z2) */
			sub_eltfp25519_1w(B, X2, Z2);		/* B = (X2-Z2) */
			add_eltfp25519_1w_bmi2(C, X3, Z3);	/* C = (X3+Z3) */
			sub_eltfp25519_1w(D, X3, Z3);		/* D = (X3-Z3) */
			mul_eltfp25519_2w_bmi2(DACB, AB, DC);	/* [DA|CB] = [A|B]*[D|C] */

			/* if swap: A = C and B = D (selected without branching) */
			cselect(swap, A, C);
			cselect(swap, B, D);

			sqr_eltfp25519_2w_bmi2(AB);		/* [AA|BB] = [A^2|B^2] */
			add_eltfp25519_1w_bmi2(X3, DA, CB);	/* X3 = (DA+CB) */
			sub_eltfp25519_1w(Z3, DA, CB);		/* Z3 = (DA-CB) */
			sqr_eltfp25519_2w_bmi2(X3Z3);		/* [X3|Z3] = [(DA+CB)|(DA-CB)]^2 */

			copy_eltfp25519_1w(X2, B);		/* X2 = B^2 */
			sub_eltfp25519_1w(Z2, A, B);		/* Z2 = E = AA-BB */

			mul_a24_eltfp25519_1w(B, Z2);		/* B = a24*E */
			add_eltfp25519_1w_bmi2(B, B, X2);	/* B = a24*E+B */
			mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB);	/* [X2|Z2] = [B|E]*[A|a24*E+B] */
			mul_eltfp25519_1w_bmi2(Z3, Z3, X1);	/* Z3 = Z3*X1 */
			--j;
		}
		j = 63; /* remaining words are scanned from their top bit */
	}

	/* shared = Qx / Qz, fully reduced */
	inv_eltfp25519_1w_bmi2(A, Qz);
	mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A);
	fred_eltfp25519_1w((u64 *)shared);

	memzero_explicit(&m, sizeof(m));
}

/*
 * Fixed-base scalar multiplication, BMI2 variant.
 *
 * Same sequence of steps as curve25519_adx_base(), using the *_bmi2 field
 * helpers: clamp the private scalar, process scalar bits 3..254 against
 * table_ladder_8k (entry index k = bit position - 3), perform three
 * doublings, convert Ur1/Zr1 with one inversion, fully reduce, write to
 * session_key and wipe all scratch state.
 *
 * The struct must be named "m" with a "buffer" member because the mul_/sqr_
 * helper macros reference m.buffer by name.
 */
static void curve25519_bmi2_base(u8 session_key[CURVE25519_KEY_SIZE],
				 const u8 private_key[CURVE25519_KEY_SIZE])
{
	struct {
		u64 buffer[4 * NUM_WORDS_ELTFP25519];
		u64 coordinates[4 * NUM_WORDS_ELTFP25519];
		u64 workspace[4 * NUM_WORDS_ELTFP25519];
		u8 private[CURVE25519_KEY_SIZE];
	} __aligned(32) m;

	/* Upper bit bound per key word: bit 63 of the last word is not scanned */
	const int ite[4] = { 64, 64, 64, 63 };
	const int q = 3; /* first scalar bit scanned; also the number of final doublings */
	u64 swap = 1;

	int i = 0, j = 0, k = 0;
	u64 *const key = (u64 *)m.private;
	u64 *const Ur1 = m.coordinates + 0;
	u64 *const Zr1 = m.coordinates + 4;
	u64 *const Ur2 = m.coordinates + 8;
	u64 *const Zr2 = m.coordinates + 12;

	/* Names for the adjacent pairs [Ur1|Zr1] and [Ur2|Zr2] */
	u64 *const UZr1 = m.coordinates + 0;
	u64 *const ZUr2 = m.coordinates + 8;

	u64 *const A = m.workspace + 0;
	u64 *const B = m.workspace + 4;
	u64 *const C = m.workspace + 8;
	u64 *const D = m.workspace + 12;

	u64 *const AB = m.workspace + 0;
	u64 *const CD = m.workspace + 8;

	const u64 *const P = table_ladder_8k;

	memcpy(m.private, private_key, sizeof(m.private));

	curve25519_clamp_secret(m.private);

	setzero_eltfp25519_1w(Ur1);
	setzero_eltfp25519_1w(Zr1);
	setzero_eltfp25519_1w(Zr2);
	Ur1[0] = 1;
	Zr1[0] = 1;
	Zr2[0] = 1;

	/* G-S */
	Ur2[3] = 0x1eaecdeee27cab34UL;
	Ur2[2] = 0xadc7a0b9235d48e2UL;
	Ur2[1] = 0xbbf095ae14b2edf8UL;
	Ur2[0] = 0x7e94e1fec82faabdUL;

	/* main-loop */
	j = q;
	for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
		while (j < ite[i]) {
			u64 bit = (key[i] >> j) & 0x1;
			k = (64 * i + j - q); /* table entry for this bit: 0..251 */
			swap = swap ^ bit;
			cswap(swap, Ur1, Ur2);
			cswap(swap, Zr1, Zr2);
			swap = bit;
			/* Addition */
			sub_eltfp25519_1w(B, Ur1, Zr1);		/* B = Ur1-Zr1 */
			add_eltfp25519_1w_bmi2(A, Ur1, Zr1);	/* A = Ur1+Zr1 */
			mul_eltfp25519_1w_bmi2(C, &P[4 * k], B);/* C = M*B, M = P[4k..4k+3] */
			sub_eltfp25519_1w(B, A, C);		/* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
			add_eltfp25519_1w_bmi2(A, A, C);	/* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
			sqr_eltfp25519_2w_bmi2(AB);		/* A = A^2 | B = B^2 */
			mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB);	/* Ur1 = Zr2*A | Zr1 = Ur2*B */
			++j;
		}
		j = 0; /* subsequent words are scanned from bit 0 */
	}

	/* Doubling */
	for (i = 0; i < q; ++i) {
		add_eltfp25519_1w_bmi2(A, Ur1, Zr1);	/* A = Ur1+Zr1 */
		sub_eltfp25519_1w(B, Ur1, Zr1);		/* B = Ur1-Zr1 */
		sqr_eltfp25519_2w_bmi2(AB);		/* A = A**2 B = B**2 */
		copy_eltfp25519_1w(C, B);		/* C = B */
		sub_eltfp25519_1w(B, A, B);		/* B = A-B */
		mul_a24_eltfp25519_1w(D, B);		/* D = my_a24*B */
		add_eltfp25519_1w_bmi2(D, D, C);	/* D = D+C */
		mul_eltfp25519_2w_bmi2(UZr1, AB, CD);	/* [Ur1|Zr1] = [A|B]*[C|D] */
	}

	/* Convert to affine coordinates */
	inv_eltfp25519_1w_bmi2(A, Zr1);
	mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A);
	fred_eltfp25519_1w((u64 *)session_key);

	memzero_explicit(&m, sizeof(m));
}

/*
 * Exported entry point for variable-base scalar multiplication.
 *
 * Dispatches on the static keys: the ADX implementation is tried first, then
 * the BMI2 one, and curve25519_generic() is used when neither key is
 * enabled.
 */
void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
		     const u8 secret[CURVE25519_KEY_SIZE],
		     const u8 basepoint[CURVE25519_KEY_SIZE])
{
	if (static_branch_likely(&curve25519_use_adx))
		curve25519_adx(mypublic, secret, basepoint);
	else if (static_branch_likely(&curve25519_use_bmi2))
		curve25519_bmi2(mypublic, secret, basepoint);
	else
		curve25519_generic(mypublic, secret, basepoint);
}
EXPORT_SYMBOL(curve25519_arch);

/*
 * Exported entry point for fixed-base scalar multiplication (public key
 * derivation).
 *
 * Same dispatch order as curve25519_arch(); the fallback calls
 * curve25519_generic() with curve25519_base_point.
 */
void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
			  const u8 secret[CURVE25519_KEY_SIZE])
{
	if (static_branch_likely(&curve25519_use_adx))
		curve25519_adx_base(pub, secret);
	else if (static_branch_likely(&curve25519_use_bmi2))
		curve25519_bmi2_base(pub, secret);
	else
		curve25519_generic(pub, secret, curve25519_base_point);
}
EXPORT_SYMBOL(curve25519_base_arch);

/*
 * kpp .set_secret callback: stores the private key in the transform context.
 *
 * len == 0 generates a fresh secret; len == CURVE25519_KEY_SIZE copies buf
 * unless it equals curve25519_null_point. Any other input returns -EINVAL.
 */
static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
				 unsigned int len)
{
	u8 *secret = kpp_tfm_ctx(tfm);

	if (!len)
		curve25519_generate_secret(secret);
	else if (len == CURVE25519_KEY_SIZE &&
		 crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
		memcpy(secret, buf, CURVE25519_KEY_SIZE);
	else
		return -EINVAL;
	return 0;
}

/*
 * kpp .generate_public_key callback.
 *
 * Rejects requests that carry a source scatterlist, derives the public key
 * from the stored secret and copies up to CURVE25519_KEY_SIZE bytes (capped
 * by req->dst_len) into req->dst. Returns -EINVAL if fewer bytes than
 * intended were copied, 0 otherwise.
 */
static int curve25519_generate_public_key(struct kpp_request *req)
{
	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
	const u8 *secret = kpp_tfm_ctx(tfm);
	u8 buf[CURVE25519_KEY_SIZE];
	int copied, nbytes;

	if (req->src)
		return -EINVAL;

	curve25519_base_arch(buf, secret);

	/* might want less than we've got */
	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
								nbytes),
				     buf, nbytes);
	if (copied != nbytes)
		return -EINVAL;
	return 0;
}

/*
 * kpp .compute_shared_secret callback.
 *
 * Requires a source scatterlist holding the peer's CURVE25519_KEY_SIZE-byte
 * public value, computes the shared secret with the stored private key and
 * copies up to CURVE25519_KEY_SIZE bytes (capped by req->dst_len) into
 * req->dst. Returns -EINVAL on a missing source or a short copy in either
 * direction, 0 otherwise.
 *
 * NOTE(review): buf holds the shared secret and is left on the stack without
 * being wiped; consider memzero_explicit() before returning.
 */
static int curve25519_compute_shared_secret(struct kpp_request *req)
{
	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
	const u8 *secret = kpp_tfm_ctx(tfm);
	u8 public_key[CURVE25519_KEY_SIZE];
	u8 buf[CURVE25519_KEY_SIZE];
	int copied, nbytes;

	if (!req->src)
		return -EINVAL;

	copied = sg_copy_to_buffer(req->src,
				   sg_nents_for_len(req->src,
						    CURVE25519_KEY_SIZE),
				   public_key, CURVE25519_KEY_SIZE);
	if (copied != CURVE25519_KEY_SIZE)
		return -EINVAL;

	curve25519_arch(buf, secret, public_key);

	/* might want less than we've got */
	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
								nbytes),
				     buf, nbytes);
	if (copied != nbytes)
		return -EINVAL;
	return 0;
}

/* kpp .max_size callback: output is always CURVE25519_KEY_SIZE bytes. */
static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
{
	return CURVE25519_KEY_SIZE;
}

/* kpp algorithm descriptor; the per-transform context is the private key. */
static struct kpp_alg curve25519_alg = {
	.base.cra_name		= "curve25519",
	.base.cra_driver_name	= "curve25519-x86",
	.base.cra_priority	= 200,
	.base.cra_module	= THIS_MODULE,
	.base.cra_ctxsize	= CURVE25519_KEY_SIZE,
2445 2446 .set_secret = curve25519_set_secret, 2447 .generate_public_key = curve25519_generate_public_key, 2448 .compute_shared_secret = curve25519_compute_shared_secret, 2449 .max_size = curve25519_max_size, 2450 }; 2451 2452 static int __init curve25519_mod_init(void) 2453 { 2454 if (boot_cpu_has(X86_FEATURE_BMI2)) 2455 static_branch_enable(&curve25519_use_bmi2); 2456 else if (boot_cpu_has(X86_FEATURE_ADX)) 2457 static_branch_enable(&curve25519_use_adx); 2458 else 2459 return 0; 2460 return IS_REACHABLE(CONFIG_CRYPTO_KPP) ? 2461 crypto_register_kpp(&curve25519_alg) : 0; 2462 } 2463 2464 static void __exit curve25519_mod_exit(void) 2465 { 2466 if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && 2467 (boot_cpu_has(X86_FEATURE_BMI2) || boot_cpu_has(X86_FEATURE_ADX))) 2468 crypto_unregister_kpp(&curve25519_alg); 2469 } 2470 2471 module_init(curve25519_mod_init); 2472 module_exit(curve25519_mod_exit); 2473 2474 MODULE_ALIAS_CRYPTO("curve25519"); 2475 MODULE_ALIAS_CRYPTO("curve25519-x86"); 2476 MODULE_LICENSE("GPL v2"); 2477