// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
 */

#include <crypto/curve25519.h>
#include <crypto/internal/kpp.h>

#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/scatterlist.h>

#include <asm/cpufeature.h>
#include <asm/processor.h>

static __always_inline u64 eq_mask(u64 a, u64 b)
{
	u64 x = a ^ b;
	u64 minus_x = ~x + (u64)1U;
	u64 x_or_minus_x = x | minus_x;
	u64 xnx = x_or_minus_x >> (u32)63U;
	return xnx - (u64)1U;
}

static __always_inline u64 gte_mask(u64 a, u64 b)
{
	u64 x = a;
	u64 y = b;
	u64 x_xor_y = x ^ y;
	u64 x_sub_y = x - y;
	u64 x_sub_y_xor_y = x_sub_y ^ y;
	u64 q = x_xor_y | x_sub_y_xor_y;
	u64 x_xor_q = x ^ q;
	u64 x_xor_q_ = x_xor_q >> (u32)63U;
	return x_xor_q_ - (u64)1U;
}

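/*
 * eq_mask() and gte_mask() return an all-ones u64 when the predicate holds
 * and all-zeroes when it does not, so callers can select between values
 * without a data-dependent branch. A minimal sketch of how such a mask is
 * consumed (ct_select() is illustrative only, not part of this file):
 *
 *	static inline u64 ct_select(u64 mask, u64 a, u64 b)
 *	{
 *		return (a & mask) | (b & ~mask); // a if mask == ~0, else b
 *	}
 *
 *	u64 r = ct_select(eq_mask(x, y), on_equal, on_unequal);
 */
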
/* Computes the addition of the four-limb bignum f1 with the scalar value f2
 * and returns the carry (if any) */
static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
{
	u64 carry_r;

	asm volatile(
		/* Clear registers to propagate the carry bit */
		"  xor %%r8d, %%r8d;"
		"  xor %%r9d, %%r9d;"
		"  xor %%r10d, %%r10d;"
		"  xor %%r11d, %%r11d;"
		"  xor %k1, %k1;"

		/* Begin addition chain */
		"  addq 0(%3), %0;"
		"  movq %0, 0(%2);"
		"  adcxq 8(%3), %%r8;"
		"  movq %%r8, 8(%2);"
		"  adcxq 16(%3), %%r9;"
		"  movq %%r9, 16(%2);"
		"  adcxq 24(%3), %%r10;"
		"  movq %%r10, 24(%2);"

		/* Return the carry bit in a register */
		"  adcx %%r11, %1;"
		: "+&r"(f2), "=&r"(carry_r)
		: "r"(out), "r"(f1)
		: "%r8", "%r9", "%r10", "%r11", "memory", "cc");

	return carry_r;
}

/* Computes the field addition of two field elements */
static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
{
	asm volatile(
		/* Compute the raw addition of f1 + f2 */
		"  movq 0(%0), %%r8;"
		"  addq 0(%2), %%r8;"
		"  movq 8(%0), %%r9;"
		"  adcxq 8(%2), %%r9;"
		"  movq 16(%0), %%r10;"
		"  adcxq 16(%2), %%r10;"
		"  movq 24(%0), %%r11;"
		"  adcxq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $0, %%rax;"
		"  mov $38, %0;"
		"  cmovc %0, %%rax;"

		/* Step 2: Add carry*38 to the original sum */
		"  xor %%ecx, %%ecx;"
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %0, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
		: "+&r"(f2)
		: "r"(out), "r"(f1)
		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}

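/*
 * Why folding "carry * 38" reduces modulo the field prime: with
 * p = 2^255 - 19 we have 2^256 = 2 * (p + 19), so 2^256 == 38 (mod p).
 * A carry out of a 256-bit addition has weight 2^256 and therefore
 * re-enters the low limb with weight 38; fsub() below applies the same
 * identity with the sign flipped, subtracting borrow * 38. The first fold
 * can itself produce a carry, hence the second, branch-free cmovc-based
 * fold in Step 3, which can no longer overflow because the intermediate
 * value is by then far below 2^256 - 38.
 */
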
/* Computes the field subtraction of two field elements */
static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
{
	asm volatile(
		/* Compute the raw subtraction of f1-f2 */
		"  movq 0(%1), %%r8;"
		"  subq 0(%2), %%r8;"
		"  movq 8(%1), %%r9;"
		"  sbbq 8(%2), %%r9;"
		"  movq 16(%1), %%r10;"
		"  sbbq 16(%2), %%r10;"
		"  movq 24(%1), %%r11;"
		"  sbbq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $0, %%rax;"
		"  mov $38, %%rcx;"
		"  cmovc %%rcx, %%rax;"

		/* Step 2: Subtract carry*38 from the original difference */
		"  sub %%rax, %%r8;"
		"  sbb $0, %%r9;"
		"  sbb $0, %%r10;"
		"  sbb $0, %%r11;"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rcx, %%rax;"
		"  sub %%rax, %%r8;"

		/* Store the result */
		"  movq %%r8, 0(%0);"
		"  movq %%r9, 8(%0);"
		"  movq %%r10, 16(%0);"
		"  movq %%r11, 24(%0);"
		:
		: "r"(out), "r"(f1), "r"(f2)
		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}

/* Computes a field multiplication: out <- f1 * f2
 * Uses the 8-element buffer tmp for intermediate results */
static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	asm volatile(

		/* Compute the raw multiplication: tmp <- src1 * src2 */

		/* Compute src1[0] * src2 */
		"  movq 0(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  movq %%r8, 0(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  movq %%r10, 8(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 8(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 8(%2), %%r8;"
		"  movq %%r8, 8(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 16(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 16(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 16(%2), %%r8;"
		"  movq %%r8, 16(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 24(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 24(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 24(%2), %%r8;"
		"  movq %%r8, 24(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 32(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  movq %%rbx, 40(%2);"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  movq %%r14, 48(%2);"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"
		"  movq %%rax, 56(%2);"

		/* Line up pointers */
		"  mov %2, %0;"
		"  mov %3, %2;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		"  xor %k1, %k1;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		"  adcx %1, %%rax;"
		"  adox %1, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %1, %%r9;"
		"  movq %%r9, 8(%2);"
		"  adcx %1, %%r10;"
		"  movq %%r10, 16(%2);"
		"  adcx %1, %%r11;"
		"  movq %%r11, 24(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%2);"
		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
		  "%r14", "memory", "cc");
}

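/*
 * For reference, a portable-C sketch of the fold that fmul() performs
 * after the 4x4 schoolbook multiply (illustration only -- reduce_sketch()
 * is not part of this file; the asm instead uses MULX with the dual
 * ADCX/ADOX carry chains so the two additions per limb do not serialize
 * on a single flags bit):
 *
 *	static void reduce_sketch(u64 out[4], const u64 t[8])
 *	{
 *		unsigned __int128 acc = 0;
 *		u64 c;
 *		int i;
 *
 *		for (i = 0; i < 4; ++i) {	// out + carry = t_lo + 38*t_hi
 *			acc += (unsigned __int128)38 * t[4 + i] + t[i];
 *			out[i] = (u64)acc;
 *			acc >>= 64;
 *		}
 *		acc = (unsigned __int128)out[0] + (u64)acc * 38;
 *		out[0] = (u64)acc;		// fold the weight-2^256 part
 *		for (i = 1, c = (u64)(acc >> 64); i < 4; ++i) {
 *			acc = (unsigned __int128)out[i] + c;
 *			out[i] = (u64)acc;
 *			c = (u64)(acc >> 64);
 *		}
 *		out[0] += c * 38;		// final fold; cannot carry again
 *	}
 *
 * Like the asm, this leaves a value that fits in four limbs but may still
 * exceed p; full canonicalization is deferred to store_felem().
 */
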
/* Computes two field multiplications:
 *   out[0] <- f1[0] * f2[0]
 *   out[1] <- f1[1] * f2[1]
 * Uses the 16-element buffer tmp for intermediate results */
static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	asm volatile(

		/* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */

		/* Compute src1[0] * src2 */
		"  movq 0(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  movq %%r8, 0(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  movq %%r10, 8(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 8(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 8(%2), %%r8;"
		"  movq %%r8, 8(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 16(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 16(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 16(%2), %%r8;"
		"  movq %%r8, 16(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 24(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 24(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 24(%2), %%r8;"
		"  movq %%r8, 24(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 32(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  movq %%rbx, 40(%2);"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  movq %%r14, 48(%2);"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"
		"  movq %%rax, 56(%2);"

		/* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */

		/* Compute src1[0] * src2 */
		"  movq 32(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  movq %%r8, 64(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  movq %%r10, 72(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 40(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 72(%2), %%r8;"
		"  movq %%r8, 72(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 80(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 48(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 80(%2), %%r8;"
		"  movq %%r8, 80(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 88(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 56(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 88(%2), %%r8;"
		"  movq %%r8, 88(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 96(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  movq %%rbx, 104(%2);"
		"  mov $0, %%r8;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  movq %%r14, 112(%2);"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"
		"  movq %%rax, 120(%2);"

		/* Line up pointers */
		"  mov %2, %0;"
		"  mov %3, %2;"

		/* Wrap the results back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		"  xor %k1, %k1;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		"  adcx %1, %%rax;"
		"  adox %1, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %1, %%r9;"
		"  movq %%r9, 8(%2);"
		"  adcx %1, %%r10;"
		"  movq %%r10, 16(%2);"
		"  adcx %1, %%r11;"
		"  movq %%r11, 24(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%2);"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 96(%0), %%r8, %%r13;"
		"  xor %k1, %k1;"
		"  adoxq 64(%0), %%r8;"
		"  mulxq 104(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 72(%0), %%r9;"
		"  mulxq 112(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 80(%0), %%r10;"
		"  mulxq 120(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 88(%0), %%r11;"
		"  adcx %1, %%rax;"
		"  adox %1, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %1, %%r9;"
		"  movq %%r9, 40(%2);"
		"  adcx %1, %%r10;"
		"  movq %%r10, 48(%2);"
		"  adcx %1, %%r11;"
		"  movq %%r11, 56(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 32(%2);"
		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
		  "%r14", "memory", "cc");
}

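/*
 * fmul2() is two independent fmul() bodies laid back to back, operating on
 * contiguous pairs of field elements (f1[0..3] with f2[0..3], and f1[4..7]
 * with f2[4..7]). Each Montgomery ladder step below has two
 * multiplications with no data dependency between them, so batching them
 * into one asm block saves call overhead and gives the out-of-order core a
 * longer stretch of independent MULX/ADCX/ADOX work to schedule.
 */
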
/* Computes the field multiplication of the four-limb f1 with the scalar value f2.
 * Requires f2 to be smaller than 2^17 */
static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
{
	register u64 f2_r asm("rdx") = f2;

	asm volatile(
		/* Compute the raw multiplication of f1*f2 */
		"  mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
		"  mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
		"  add %%rcx, %%r9;"
		"  mov $0, %%rcx;"
		"  mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
		"  adcx %%rbx, %%r10;"
		"  mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
		"  adcx %%r13, %%r11;"
		"  adcx %%rcx, %%rax;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $38, %%rdx;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
		: "+&r"(f2_r)
		: "r"(out), "r"(f1)
		: "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
		  "memory", "cc");
}

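/*
 * The f2 < 2^17 precondition keeps the scalar fold safe: the carry limb
 * collected in rax is bounded by f2, so rax * 38 < 2^23 fits easily in 64
 * bits and the fold behaves exactly as in fadd(). The only value this file
 * passes is the curve constant 121665 < 2^17.
 */
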
/* Conditionally swaps the eight-limb buffers p1 and p2 in constant time:
 * p1 <-> p2 when bit is 1, no change when bit is 0 */
static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
{
	asm volatile(
		/* Transfer bit into CF flag */
		"  add $18446744073709551615, %0;"

		/* cswap p1[0], p2[0] */
		"  movq 0(%1), %%r8;"
		"  movq 0(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 0(%1);"
		"  movq %%r9, 0(%2);"

		/* cswap p1[1], p2[1] */
		"  movq 8(%1), %%r8;"
		"  movq 8(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 8(%1);"
		"  movq %%r9, 8(%2);"

		/* cswap p1[2], p2[2] */
		"  movq 16(%1), %%r8;"
		"  movq 16(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 16(%1);"
		"  movq %%r9, 16(%2);"

		/* cswap p1[3], p2[3] */
		"  movq 24(%1), %%r8;"
		"  movq 24(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 24(%1);"
		"  movq %%r9, 24(%2);"

		/* cswap p1[4], p2[4] */
		"  movq 32(%1), %%r8;"
		"  movq 32(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 32(%1);"
		"  movq %%r9, 32(%2);"

		/* cswap p1[5], p2[5] */
		"  movq 40(%1), %%r8;"
		"  movq 40(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 40(%1);"
		"  movq %%r9, 40(%2);"

		/* cswap p1[6], p2[6] */
		"  movq 48(%1), %%r8;"
		"  movq 48(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 48(%1);"
		"  movq %%r9, 48(%2);"

		/* cswap p1[7], p2[7] */
		"  movq 56(%1), %%r8;"
		"  movq 56(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 56(%1);"
		"  movq %%r9, 56(%2);"
		: "+&r"(bit)
		: "r"(p1), "r"(p2)
		: "%r8", "%r9", "%r10", "memory", "cc");
}

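/*
 * The "add $0xffffffffffffffff" above sets CF to the value of bit (for bit
 * in {0, 1}), and each cmovc pair then either swaps or preserves one limb
 * of the two points -- no branch, so the timing and access pattern are
 * independent of the secret bit. A portable constant-time equivalent using
 * an arithmetic mask instead of CMOV (sketch only, not part of this file):
 *
 *	static void cswap2_sketch(u64 bit, u64 *p1, u64 *p2)
 *	{
 *		u64 mask = 0 - bit;	// all-ones if bit == 1, else 0
 *		int i;
 *
 *		for (i = 0; i < 8; ++i) {
 *			u64 d = (p1[i] ^ p2[i]) & mask;
 *			p1[i] ^= d;
 *			p2[i] ^= d;
 *		}
 *	}
 */
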
/* Computes the square of a field element: out <- f * f
 * Uses the 8-element buffer tmp for intermediate results */
static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
{
	asm volatile(
		/* Compute the raw multiplication: tmp <- f * f */

		/* Step 1: Compute all partial products */
		"  movq 0(%0), %%rdx;" /* f[0] */
		"  mulxq 8(%0), %%r8, %%r14;"
		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
		"  mulxq 16(%0), %%r9, %%r10;"
		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
		"  mulxq 24(%0), %%rax, %%rcx;"
		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
		"  movq 24(%0), %%rdx;" /* f[3] */
		"  mulxq 8(%0), %%r11, %%rbx;"
		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
		"  mulxq 16(%0), %%rax, %%r13;"
		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
		"  movq 8(%0), %%rdx;"
		"  adcx %%r15, %%r13;" /* f1 */
		"  mulxq 16(%0), %%rax, %%rcx;"
		"  mov $0, %%r14;" /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15d, %%r15d;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%rbx;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%rbx, %%rbx;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 0(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
		"  movq %%rax, 0(%1);"
		"  add %%rcx, %%r8;"
		"  movq %%r8, 8(%1);"
		"  movq 8(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
		"  adcx %%rax, %%r9;"
		"  movq %%r9, 16(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 24(%1);"
		"  movq 16(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
		"  adcx %%rax, %%r11;"
		"  movq %%r11, 32(%1);"
		"  adcx %%rcx, %%rbx;"
		"  movq %%rbx, 40(%1);"
		"  movq 24(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
		"  adcx %%rax, %%r13;"
		"  movq %%r13, 48(%1);"
		"  adcx %%rcx, %%r14;"
		"  movq %%r14, 56(%1);"

		/* Line up pointers */
		"  mov %1, %0;"
		"  mov %2, %1;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		"  xor %%ecx, %%ecx;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
		: "+&r"(f), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
		  "%r13", "%r14", "%r15", "memory", "cc");
}

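/*
 * Squaring needs roughly half the partial products of a general
 * multiplication because the cross terms pair up:
 *
 *	f*f = sum_i f[i]^2 * 2^(128*i)
 *	    + 2 * sum_{i<j} f[i]*f[j] * 2^(64*(i+j))
 *
 * Step 1 computes the six distinct cross products f[i]*f[j] with i < j,
 * Step 2 doubles them using an "adcx r, r" chain, and Step 3 adds the four
 * diagonal squares f[i]^2. The trailing fold modulo 2^255 - 19 is the same
 * as in fmul().
 */
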
/* Computes two field squarings:
 *   out[0] <- f[0] * f[0]
 *   out[1] <- f[1] * f[1]
 * Uses the 16-element buffer tmp for intermediate results */
static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
	asm volatile(
		/* Step 1: Compute all partial products */
		"  movq 0(%0), %%rdx;" /* f[0] */
		"  mulxq 8(%0), %%r8, %%r14;"
		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
		"  mulxq 16(%0), %%r9, %%r10;"
		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
		"  mulxq 24(%0), %%rax, %%rcx;"
		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
		"  movq 24(%0), %%rdx;" /* f[3] */
		"  mulxq 8(%0), %%r11, %%rbx;"
		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
		"  mulxq 16(%0), %%rax, %%r13;"
		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
		"  movq 8(%0), %%rdx;"
		"  adcx %%r15, %%r13;" /* f1 */
		"  mulxq 16(%0), %%rax, %%rcx;"
		"  mov $0, %%r14;" /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15d, %%r15d;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%rbx;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%rbx, %%rbx;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 0(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
		"  movq %%rax, 0(%1);"
		"  add %%rcx, %%r8;"
		"  movq %%r8, 8(%1);"
		"  movq 8(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
		"  adcx %%rax, %%r9;"
		"  movq %%r9, 16(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 24(%1);"
		"  movq 16(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
		"  adcx %%rax, %%r11;"
		"  movq %%r11, 32(%1);"
		"  adcx %%rcx, %%rbx;"
		"  movq %%rbx, 40(%1);"
		"  movq 24(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
		"  adcx %%rax, %%r13;"
		"  movq %%r13, 48(%1);"
		"  adcx %%rcx, %%r14;"
		"  movq %%r14, 56(%1);"

		/* Step 1: Compute all partial products */
		"  movq 32(%0), %%rdx;" /* f[0] */
		"  mulxq 40(%0), %%r8, %%r14;"
		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
		"  mulxq 48(%0), %%r9, %%r10;"
		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
		"  mulxq 56(%0), %%rax, %%rcx;"
		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
		"  movq 56(%0), %%rdx;" /* f[3] */
		"  mulxq 40(%0), %%r11, %%rbx;"
		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
		"  mulxq 48(%0), %%rax, %%r13;"
		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
		"  movq 40(%0), %%rdx;"
		"  adcx %%r15, %%r13;" /* f1 */
		"  mulxq 48(%0), %%rax, %%rcx;"
		"  mov $0, %%r14;" /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15d, %%r15d;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%rbx;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%rbx, %%rbx;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 32(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
		"  movq %%rax, 64(%1);"
		"  add %%rcx, %%r8;"
		"  movq %%r8, 72(%1);"
		"  movq 40(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
		"  adcx %%rax, %%r9;"
		"  movq %%r9, 80(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 88(%1);"
		"  movq 48(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
		"  adcx %%rax, %%r11;"
		"  movq %%r11, 96(%1);"
		"  adcx %%rcx, %%rbx;"
		"  movq %%rbx, 104(%1);"
		"  movq 56(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
		"  adcx %%rax, %%r13;"
		"  movq %%r13, 112(%1);"
		"  adcx %%rcx, %%r14;"
		"  movq %%r14, 120(%1);"

		/* Line up pointers */
		"  mov %1, %0;"
		"  mov %2, %1;"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		"  xor %%ecx, %%ecx;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 96(%0), %%r8, %%r13;"
		"  xor %%ecx, %%ecx;"
		"  adoxq 64(%0), %%r8;"
		"  mulxq 104(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 72(%0), %%r9;"
		"  mulxq 112(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 80(%0), %%r10;"
		"  mulxq 120(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 88(%0), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 40(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 48(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 56(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 32(%1);"
		: "+&r"(f), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
		  "%r13", "%r14", "%r15", "memory", "cc");
}

static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
{
	u64 *nq = p01_tmp1;
	u64 *nq_p1 = p01_tmp1 + (u32)8U;
	u64 *tmp1 = p01_tmp1 + (u32)16U;
	u64 *x1 = q;
	u64 *x2 = nq;
	u64 *z2 = nq + (u32)4U;
	u64 *z3 = nq_p1 + (u32)4U;
	u64 *a = tmp1;
	u64 *b = tmp1 + (u32)4U;
	u64 *ab = tmp1;
	u64 *dc = tmp1 + (u32)8U;
	u64 *x3;
	u64 *z31;
	u64 *d0;
	u64 *c0;
	u64 *a1;
	u64 *b1;
	u64 *d;
	u64 *c;
	u64 *ab1;
	u64 *dc1;
	fadd(a, x2, z2);
	fsub(b, x2, z2);
	x3 = nq_p1;
	z31 = nq_p1 + (u32)4U;
	d0 = dc;
	c0 = dc + (u32)4U;
	fadd(c0, x3, z31);
	fsub(d0, x3, z31);
	fmul2(dc, dc, ab, tmp2);
	fadd(x3, d0, c0);
	fsub(z31, d0, c0);
	a1 = tmp1;
	b1 = tmp1 + (u32)4U;
	d = tmp1 + (u32)8U;
	c = tmp1 + (u32)12U;
	ab1 = tmp1;
	dc1 = tmp1 + (u32)8U;
	fsqr2(dc1, ab1, tmp2);
	fsqr2(nq_p1, nq_p1, tmp2);
	a1[0U] = c[0U];
	a1[1U] = c[1U];
	a1[2U] = c[2U];
	a1[3U] = c[3U];
	fsub(c, d, c);
	fmul_scalar(b1, c, (u64)121665U);
	fadd(b1, b1, d);
	fmul2(nq, dc1, ab1, tmp2);
	fmul(z3, z3, x1, tmp2);
}

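/*
 * One step of the Montgomery ladder: given the base-point coordinate
 * x1 = x(P) in q and the pair (x2:z2) = [n]P, (x3:z3) = [n+1]P in
 * p01_tmp1, point_add_and_double() produces [2n]P and [2n+1]P via the
 * standard differential addition-and-doubling formulas (with A = x2 + z2,
 * B = x2 - z2, C = x3 + z3, D = x3 - z3):
 *
 *	x2' = (A^2) * (B^2)
 *	z2' = E * (A^2 + 121665 * E),	where E = A^2 - B^2
 *	x3' = (D*A + C*B)^2
 *	z3' = x1 * (D*A - C*B)^2
 *
 * Note A^2 + 121665 * E = B^2 + 121666 * E, so this matches the usual
 * a24 = (486662 + 2) / 4 = 121666 formulation.
 */
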
static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2)
{
	u64 *x2 = nq;
	u64 *z2 = nq + (u32)4U;
	u64 *a = tmp1;
	u64 *b = tmp1 + (u32)4U;
	u64 *d = tmp1 + (u32)8U;
	u64 *c = tmp1 + (u32)12U;
	u64 *ab = tmp1;
	u64 *dc = tmp1 + (u32)8U;
	fadd(a, x2, z2);
	fsub(b, x2, z2);
	fsqr2(dc, ab, tmp2);
	a[0U] = c[0U];
	a[1U] = c[1U];
	a[2U] = c[2U];
	a[3U] = c[3U];
	fsub(c, d, c);
	fmul_scalar(b, c, (u64)121665U);
	fadd(b, b, d);
	fmul2(nq, dc, ab, tmp2);
}

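/*
 * point_double() is the doubling half of the formulas above taken alone:
 * x2' = A^2 * B^2 and z2' = E * (A^2 + 121665 * E) with A = x2 + z2,
 * B = x2 - z2 and E = A^2 - B^2. montgomery_ladder() uses it for the three
 * final doublings corresponding to the cleared low bits of the scalar.
 */
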
static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
{
	u64 tmp2[16U] = { 0U };
	u64 p01_tmp1_swap[33U] = { 0U };
	u64 *p0 = p01_tmp1_swap;
	u64 *p01 = p01_tmp1_swap;
	u64 *p03 = p01;
	u64 *p11 = p01 + (u32)8U;
	u64 *x0;
	u64 *z0;
	u64 *p01_tmp1;
	u64 *p01_tmp11;
	u64 *nq10;
	u64 *nq_p11;
	u64 *swap1;
	u64 sw0;
	u64 *nq1;
	u64 *tmp1;
	memcpy(p11, init1, (u32)8U * sizeof(init1[0U]));
	x0 = p03;
	z0 = p03 + (u32)4U;
	x0[0U] = (u64)1U;
	x0[1U] = (u64)0U;
	x0[2U] = (u64)0U;
	x0[3U] = (u64)0U;
	z0[0U] = (u64)0U;
	z0[1U] = (u64)0U;
	z0[2U] = (u64)0U;
	z0[3U] = (u64)0U;
	p01_tmp1 = p01_tmp1_swap;
	p01_tmp11 = p01_tmp1_swap;
	nq10 = p01_tmp1_swap;
	nq_p11 = p01_tmp1_swap + (u32)8U;
	swap1 = p01_tmp1_swap + (u32)32U;
	cswap2((u64)1U, nq10, nq_p11);
	point_add_and_double(init1, p01_tmp11, tmp2);
	swap1[0U] = (u64)1U;
	{
		u32 i;
		for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) {
			u64 *p01_tmp12 = p01_tmp1_swap;
			u64 *swap2 = p01_tmp1_swap + (u32)32U;
			u64 *nq2 = p01_tmp12;
			u64 *nq_p12 = p01_tmp12 + (u32)8U;
			u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U);
			u64 sw = swap2[0U] ^ bit;
			cswap2(sw, nq2, nq_p12);
			point_add_and_double(init1, p01_tmp12, tmp2);
			swap2[0U] = bit;
		}
	}
	sw0 = swap1[0U];
	cswap2(sw0, nq10, nq_p11);
	nq1 = p01_tmp1;
	tmp1 = p01_tmp1 + (u32)16U;
	point_double(nq1, tmp1, tmp2);
	point_double(nq1, tmp1, tmp2);
	point_double(nq1, tmp1, tmp2);
	memcpy(out, p0, (u32)8U * sizeof(p0[0U]));

	memzero_explicit(tmp2, sizeof(tmp2));
	memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
}

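/*
 * The X25519 scalar clamp (clear bits 0-2 and 255, set bit 254) is built
 * into the ladder structure rather than applied to the key bytes: the
 * unconditional first step before the loop plays the role of bit 254 being
 * 1, the loop consumes bits 253 down to 3, the three trailing
 * point_double() calls realize the cleared bits 2-0, and bit 255 is never
 * read at all. The swap flag is folded as sw = swap ^ bit so that every
 * iteration executes exactly one cswap2(), keeping the instruction stream
 * and memory access pattern independent of the secret key.
 */
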
static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
{
	u32 i;
	fsqr(o, inp, tmp);
	for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U)
		fsqr(o, o, tmp);
}

static void finv(u64 *o, const u64 *i, u64 *tmp)
{
	u64 t1[16U] = { 0U };
	u64 *a0 = t1;
	u64 *b = t1 + (u32)4U;
	u64 *c = t1 + (u32)8U;
	u64 *t00 = t1 + (u32)12U;
	u64 *tmp1 = tmp;
	u64 *a;
	u64 *t0;
	fsquare_times(a0, i, tmp1, (u32)1U);
	fsquare_times(t00, a0, tmp1, (u32)2U);
	fmul(b, t00, i, tmp);
	fmul(a0, b, a0, tmp);
	fsquare_times(t00, a0, tmp1, (u32)1U);
	fmul(b, t00, b, tmp);
	fsquare_times(t00, b, tmp1, (u32)5U);
	fmul(b, t00, b, tmp);
	fsquare_times(t00, b, tmp1, (u32)10U);
	fmul(c, t00, b, tmp);
	fsquare_times(t00, c, tmp1, (u32)20U);
	fmul(t00, t00, c, tmp);
	fsquare_times(t00, t00, tmp1, (u32)10U);
	fmul(b, t00, b, tmp);
	fsquare_times(t00, b, tmp1, (u32)50U);
	fmul(c, t00, b, tmp);
	fsquare_times(t00, c, tmp1, (u32)100U);
	fmul(t00, t00, c, tmp);
	fsquare_times(t00, t00, tmp1, (u32)50U);
	fmul(t00, t00, b, tmp);
	fsquare_times(t00, t00, tmp1, (u32)5U);
	a = t1;
	t0 = t1 + (u32)12U;
	fmul(o, t0, a, tmp);
}

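/*
 * finv() computes the inverse as i^(p-2) mod p by Fermat's little theorem,
 * with exponent p - 2 = 2^255 - 21. The fixed schedule above is an
 * addition chain for that exponent costing 254 squarings and 11
 * multiplications, and since it never branches on the data it runs in
 * constant time.
 */
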
static void store_felem(u64 *b, u64 *f)
{
	u64 f30 = f[3U];
	u64 top_bit0 = f30 >> (u32)63U;
	u64 f31;
	u64 top_bit;
	u64 f0;
	u64 f1;
	u64 f2;
	u64 f3;
	u64 m0;
	u64 m1;
	u64 m2;
	u64 m3;
	u64 mask;
	u64 f0_;
	u64 f1_;
	u64 f2_;
	u64 f3_;
	u64 o0;
	u64 o1;
	u64 o2;
	u64 o3;
	f[3U] = f30 & (u64)0x7fffffffffffffffU;
	add_scalar(f, f, (u64)19U * top_bit0);
	f31 = f[3U];
	top_bit = f31 >> (u32)63U;
	f[3U] = f31 & (u64)0x7fffffffffffffffU;
	add_scalar(f, f, (u64)19U * top_bit);
	f0 = f[0U];
	f1 = f[1U];
	f2 = f[2U];
	f3 = f[3U];
	m0 = gte_mask(f0, (u64)0xffffffffffffffedU);
	m1 = eq_mask(f1, (u64)0xffffffffffffffffU);
	m2 = eq_mask(f2, (u64)0xffffffffffffffffU);
	m3 = eq_mask(f3, (u64)0x7fffffffffffffffU);
	mask = ((m0 & m1) & m2) & m3;
	f0_ = f0 - (mask & (u64)0xffffffffffffffedU);
	f1_ = f1 - (mask & (u64)0xffffffffffffffffU);
	f2_ = f2 - (mask & (u64)0xffffffffffffffffU);
	f3_ = f3 - (mask & (u64)0x7fffffffffffffffU);
	o0 = f0_;
	o1 = f1_;
	o2 = f2_;
	o3 = f3_;
	b[0U] = o0;
	b[1U] = o1;
	b[2U] = o2;
	b[3U] = o3;
}

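/*
 * store_felem() canonicalizes the result into [0, p): since
 * 2^255 == 19 (mod p), folding the top bit in as +19 twice brings the
 * value below 2^255, and the final masked subtraction removes one more p
 * exactly when the limbs (high to low) reach 0x7fffffffffffffff,
 * 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffed -- all
 * decided with eq_mask()/gte_mask() rather than branches.
 */
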
static void encode_point(u8 *o, const u64 *i)
{
	const u64 *x = i;
	const u64 *z = i + (u32)4U;
	u64 tmp[4U] = { 0U };
	u64 tmp_w[16U] = { 0U };
	finv(tmp, z, tmp_w);
	fmul(tmp, tmp, x, tmp_w);
	store_felem((u64 *)o, tmp);
}

static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub)
{
	u64 init1[8U] = { 0U };
	u64 tmp[4U] = { 0U };
	u64 tmp3;
	u64 *x;
	u64 *z;
	{
		u32 i;
		for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) {
			u64 *os = tmp;
			const u8 *bj = pub + i * (u32)8U;
			u64 u = *(u64 *)bj;
			u64 r = u;
			u64 x0 = r;
			os[i] = x0;
		}
	}
	tmp3 = tmp[3U];
	tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU;
	x = init1;
	z = init1 + (u32)4U;
	z[0U] = (u64)1U;
	z[1U] = (u64)0U;
	z[2U] = (u64)0U;
	z[3U] = (u64)0U;
	x[0U] = tmp[0U];
	x[1U] = tmp[1U];
	x[2U] = tmp[2U];
	x[3U] = tmp[3U];
	montgomery_ladder(init1, priv, init1);
	encode_point(out, init1);
}

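/*
 * Decoding follows RFC 7748: the peer's u-coordinate is read as a
 * little-endian integer with the top bit of the last byte masked off (the
 * tmp[3] masking above); the unaligned u64 loads in the copy loop are fine
 * on x86. The ladder output (X:Z) is then mapped to the affine u = X/Z and
 * serialized by encode_point() above.
 */
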
/* The below constants were generated using this sage script:
 *
 * #!/usr/bin/env sage
 * import sys
 * from sage.all import *
 * def limbs(n):
 * 	n = int(n)
 * 	l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64)
 * 	return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l
 * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0])
 * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0]
 * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s))
 * print("static const u64 table_ladder[] = {")
 * p = ec.lift_x(9)
 * for i in range(252):
 * 	l = (p[0] + p[2]) / (p[0] - p[2])
 * 	print(("\t%s" + ("," if i != 251 else "")) % limbs(l))
 * 	p = p * 2
 * print("};")
 *
 */

static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL };

static const u64 table_ladder[] = {
	0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
	0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
	0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
	0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
	0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
	0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
	0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
	0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
	0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
	0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
	0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
	0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
	0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
	0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
	0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
	0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
	0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
	0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
	0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
	0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL,
	0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL,
	0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL,
	0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL,
	0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL,
	0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL,
	0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL,
	0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL,
	0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL,
	0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL,
	0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL,
	0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL,
	0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL,
	0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL,
	0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL,
	0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL,
	0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL,
	0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL,
	0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL,
	0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL,
	0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL,
	0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL,
	0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL,
	0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL,
	0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL,
	0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL,
	0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL,
	0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL,
	0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL,
	0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL,
	0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL,
	0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL,
	0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL,
	0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL,
	0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL,
	0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL,
	0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL,
	0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL,
	0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL,
	0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL,
	0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL,
	0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL,
	0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL,
	0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL,
	0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL,
	0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL,
	0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL,
	0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL,
	0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL,
	0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL,
	0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL,
	0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL,
	0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL,
	0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL,
	0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL,
	0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL,
	0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL,
	0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL,
	0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL,
	0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL,
	0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL,
	0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL,
	0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL,
	0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL,
	0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL,
	0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL,
	0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL,
	0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL,
	0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL,
	0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL,
	0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL,
	0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL,
	0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL,
	0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL,
	0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL,
	0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL,
	0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL,
	0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL,
	0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL,
	0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL,
	0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL,
	0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL,
	0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL,
	0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL,
	0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL,
	0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL,
	0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL,
	0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL,
	0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL,
	0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL,
	0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL,
	0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL,
	0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL,
	0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL,
	0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL,
	0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL,
	0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL,
	0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL,
	0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL,
	0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL,
	0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL,
	0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL,
	0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL,
	0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL,
	0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL,
	0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL,
	0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL,
	0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL,
	0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL,
	0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL,
	0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL,
	0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL,
	0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL,
	0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL,
	0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL,
	0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL,
	0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL,
	0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL,
	0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL,
	0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL,
	0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL,
	0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL,
	0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL,
	0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL,
	0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL,
	0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL,
	0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL,
	0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL,
	0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL,
	0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL,
	0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL,
	0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL,
	0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL,
	0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL,
	0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL,
	0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL,
	0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL,
	0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL,
	0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL,
	0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL,
	0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL,
	0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL,
	0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL,
	0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL,
	0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL,
	0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL,
	0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL,
	0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL,
	0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL,
	0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL,
	0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL,
	0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL,
	0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL,
	0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL,
	0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL,
	0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL,
	0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL,
	0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL,
	0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL,
	0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL,
	0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL,
	0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL,
	0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL,
	0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL,
	0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL,
	0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL,
	0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL,
	0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL,
	0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL,
	0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL,
	0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL,
	0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL,
	0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL,
	0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL,
	0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL,
	0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL,
	0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL,
	0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL,
	0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL,
	0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL,
	0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL,
	0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL,
	0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL,
	0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL,
	0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL,
	0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL,
	0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL,
	0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL,
1488  	0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL,
1489  	0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL,
1490  	0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL,
1491  	0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL,
1492  	0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL,
1493  	0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL,
1494  	0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL,
1495  	0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL,
1496  	0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL,
1497  	0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL,
1498  	0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL,
1499  	0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL,
1500  	0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL,
1501  	0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL,
1502  	0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL,
1503  	0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL,
1504  	0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL,
1505  	0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL,
1506  	0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL,
1507  	0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL,
1508  	0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL,
1509  	0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL,
1510  	0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL,
1511  	0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL,
1512  	0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL,
1513  	0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL,
1514  	0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL,
1515  	0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL,
1516  	0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL,
1517  	0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL,
1518  	0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL,
1519  	0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL,
1520  	0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL,
1521  	0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL,
1522  	0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL,
1523  	0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL,
1524  	0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL,
1525  	0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL,
1526  	0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL,
1527  	0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL,
1528  	0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL,
1529  	0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL,
1530  	0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL,
1531  	0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL,
1532  	0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL
1533  };
1534  
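/*
 * Fixed-base scalar multiplication: computes priv * G for the Curve25519
 * generator G by walking the precomputed table_ladder, one ladder step
 * per scalar bit.
 */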
static void curve25519_ever64_base(u8 *out, const u8 *priv)
{
	u64 swap = 1;
	int i, j, k;
	u64 tmp[16 + 32 + 4];
	/* tmp is carved into overlapping views: x1/z1/x2/z2 are the four
	 * ladder field elements, also addressed pairwise through xz1/xz2,
	 * while a/b/c and the wider ab/abcd/ef/efgh names alias the same
	 * scratch area used by the two-element operations.
	 */
	u64 *x1 = &tmp[0];
	u64 *z1 = &tmp[4];
	u64 *x2 = &tmp[8];
	u64 *z2 = &tmp[12];
	u64 *xz1 = &tmp[0];
	u64 *xz2 = &tmp[8];
	u64 *a = &tmp[0 + 16];
	u64 *b = &tmp[4 + 16];
	u64 *c = &tmp[8 + 16];
	u64 *ab = &tmp[0 + 16];
	u64 *abcd = &tmp[0 + 16];
	u64 *ef = &tmp[16 + 16];
	u64 *efgh = &tmp[16 + 16];
	u64 *key = &tmp[0 + 16 + 32];

	memcpy(key, priv, 32);
	/* Clamp the scalar per RFC 7748: clear bits 0-2 and bit 255,
	 * set bit 254.
	 */
	((u8 *)key)[0] &= 248;
	((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64;

	/* Ladder state: (x1:z1) starts at (1:1), and (x2:z2) is seeded
	 * with the precomputed constant p_minus_s over z2 = 1.
	 */
	x1[0] = 1, x1[1] = x1[2] = x1[3] = 0;
	z1[0] = 1, z1[1] = z1[2] = z1[3] = 0;
	z2[0] = 1, z2[1] = z2[2] = z2[3] = 0;
	memcpy(x2, p_minus_s, sizeof(p_minus_s));

	/* Walk the clamped scalar bits 3..254, consuming one table_ladder
	 * entry per bit; the compound literal gives the bit count per
	 * 64-bit limb (only 63 for the last limb, since clamping cleared
	 * bit 255).
	 */
	j = 3;
	for (i = 0; i < 4; ++i) {
		while (j < (const int[]){ 64, 64, 64, 63 }[i]) {
			u64 bit = (key[i] >> j) & 1;
			k = (64 * i + j - 3);
			swap = swap ^ bit;
			cswap2(swap, xz1, xz2);
			swap = bit;
			fsub(b, x1, z1);
			fadd(a, x1, z1);
			fmul(c, &table_ladder[4 * k], b, ef);
			fsub(b, a, c);
			fadd(a, a, c);
			fsqr2(ab, ab, efgh);
			fmul2(xz1, xz2, ab, efgh);
			++j;
		}
		j = 0;
	}

	/* The ladder skipped the three low bits that clamping zeroed, so
	 * double three times to restore the factor of eight before
	 * serializing.
	 */
	point_double(xz1, abcd, efgh);
	point_double(xz1, abcd, efgh);
	point_double(xz1, abcd, efgh);
	encode_point(out, xz1);

	memzero_explicit(tmp, sizeof(tmp));
}
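
/*
 * For reference: in curve25519_ever64_base() above, clamping forces the
 * scalar into the form s = 8*t with 2^251 <= t < 2^252; the ladder
 * consumes the 252 bits of t and the three trailing doublings restore
 * the factor of eight, so out = s * G.
 */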

/* Enabled once at module init when the CPU advertises both BMI2 and ADX. */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx);

void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
		     const u8 secret[CURVE25519_KEY_SIZE],
		     const u8 basepoint[CURVE25519_KEY_SIZE])
{
	if (static_branch_likely(&curve25519_use_bmi2_adx))
		curve25519_ever64(mypublic, secret, basepoint);
	else
		curve25519_generic(mypublic, secret, basepoint);
}
EXPORT_SYMBOL(curve25519_arch);

void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
			  const u8 secret[CURVE25519_KEY_SIZE])
{
	if (static_branch_likely(&curve25519_use_bmi2_adx))
		curve25519_ever64_base(pub, secret);
	else
		curve25519_generic(pub, secret, curve25519_base_point);
}
EXPORT_SYMBOL(curve25519_base_arch);

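/*
 * Usage sketch (illustrative only, not compiled here): in-kernel users
 * normally reach the two exports above through the wrappers declared in
 * <crypto/curve25519.h> rather than calling them directly. Assuming a
 * hypothetical caller holding a peer public key peer_pub, a full
 * exchange looks roughly like:
 *
 *	u8 secret[CURVE25519_KEY_SIZE];
 *	u8 pub[CURVE25519_KEY_SIZE];
 *	u8 shared[CURVE25519_KEY_SIZE];
 *
 *	curve25519_generate_secret(secret);
 *	if (!curve25519_generate_public(pub, secret))
 *		return -EINVAL;
 *	if (!curve25519(shared, secret, peer_pub))
 *		return -EINVAL;
 *
 * Both boolean returns signal an all-zero result, which callers must
 * reject.
 */
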
static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
				 unsigned int len)
{
	u8 *secret = kpp_tfm_ctx(tfm);

	/* A zero-length key requests a freshly generated secret; an
	 * explicit key must be exactly one key in size and must not be
	 * all zeros.
	 */
	if (!len)
		curve25519_generate_secret(secret);
	else if (len == CURVE25519_KEY_SIZE &&
		 crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
		memcpy(secret, buf, CURVE25519_KEY_SIZE);
	else
		return -EINVAL;
	return 0;
}

static int curve25519_generate_public_key(struct kpp_request *req)
{
	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
	const u8 *secret = kpp_tfm_ctx(tfm);
	u8 buf[CURVE25519_KEY_SIZE];
	int copied, nbytes;

	if (req->src)
		return -EINVAL;

	curve25519_base_arch(buf, secret);

	/* might want less than we've got */
	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
								nbytes),
				     buf, nbytes);
	if (copied != nbytes)
		return -EINVAL;
	return 0;
}

static int curve25519_compute_shared_secret(struct kpp_request *req)
{
	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
	const u8 *secret = kpp_tfm_ctx(tfm);
	u8 public_key[CURVE25519_KEY_SIZE];
	u8 buf[CURVE25519_KEY_SIZE];
	int copied, nbytes;

	if (!req->src)
		return -EINVAL;

	copied = sg_copy_to_buffer(req->src,
				   sg_nents_for_len(req->src,
						    CURVE25519_KEY_SIZE),
				   public_key, CURVE25519_KEY_SIZE);
	if (copied != CURVE25519_KEY_SIZE)
		return -EINVAL;

	curve25519_arch(buf, secret, public_key);

	/* might want less than we've got */
	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
								nbytes),
				     buf, nbytes);
	if (copied != nbytes)
		return -EINVAL;
	return 0;
}

static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
{
	return CURVE25519_KEY_SIZE;
}

static struct kpp_alg curve25519_alg = {
	.base.cra_name		= "curve25519",
	.base.cra_driver_name	= "curve25519-x86",
	.base.cra_priority	= 200,
	.base.cra_module	= THIS_MODULE,
	.base.cra_ctxsize	= CURVE25519_KEY_SIZE,

	.set_secret		= curve25519_set_secret,
	.generate_public_key	= curve25519_generate_public_key,
	.compute_shared_secret	= curve25519_compute_shared_secret,
	.max_size		= curve25519_max_size,
};

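/*
 * Usage sketch (illustrative only, assuming the standard kernel KPP
 * API): a caller would drive the transform registered above roughly as
 * follows, with error handling elided:
 *
 *	struct crypto_kpp *tfm = crypto_alloc_kpp("curve25519", 0, 0);
 *	struct kpp_request *req = kpp_request_alloc(tfm, GFP_KERNEL);
 *	struct scatterlist dst;
 *	u8 pub[CURVE25519_KEY_SIZE];
 *
 *	crypto_kpp_set_secret(tfm, NULL, 0);	(have a secret generated)
 *	sg_init_one(&dst, pub, sizeof(pub));
 *	kpp_request_set_input(req, NULL, 0);
 *	kpp_request_set_output(req, &dst, sizeof(pub));
 *	crypto_kpp_generate_public_key(req);
 *
 * Computing a shared secret is the same dance with the peer's public
 * key supplied through kpp_request_set_input() before calling
 * crypto_kpp_compute_shared_secret(req).
 */
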
static int __init curve25519_mod_init(void)
{
	/* Without both BMI2 and ADX the module stays loaded but registers
	 * nothing, and the library interface falls back to
	 * curve25519_generic().
	 */
	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
		static_branch_enable(&curve25519_use_bmi2_adx);
	else
		return 0;
	return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
		crypto_register_kpp(&curve25519_alg) : 0;
}

static void __exit curve25519_mod_exit(void)
{
	if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
	    static_branch_likely(&curve25519_use_bmi2_adx))
		crypto_unregister_kpp(&curve25519_alg);
}

module_init(curve25519_mod_init);
module_exit(curve25519_mod_exit);

MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-x86");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");