smult_donna.c 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864
  1. /* Copyright 2008, Google Inc.
  2. * All rights reserved.
  3. *
  4. * Redistribution and use in source and binary forms, with or without
  5. * modification, are permitted provided that the following conditions are
  6. * met:
  7. *
  8. * * Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * * Redistributions in binary form must reproduce the above
  11. * copyright notice, this list of conditions and the following disclaimer
  12. * in the documentation and/or other materials provided with the
  13. * distribution.
  14. * * Neither the name of Google Inc. nor the names of its
  15. * contributors may be used to endorse or promote products derived from
  16. * this software without specific prior written permission.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. *
  30. * curve25519-donna: Curve25519 elliptic curve, public key function
  31. *
  32. * http://code.google.com/p/curve25519-donna/
  33. *
  34. * Adam Langley <agl@imperialviolet.org>
  35. *
  36. * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
  37. *
  38. * More information about curve25519 can be found here
  39. * http://cr.yp.to/ecdh.html
  40. *
  41. * djb's sample implementation of curve25519 is written in a special assembly
  42. * language called qhasm and uses the floating point registers.
  43. *
  44. * This is, almost, a clean room reimplementation from the curve25519 paper. It
  45. * uses many of the tricks described therein. Only the crecip function is taken
  46. * from the sample implementation. */
  47. #if defined(__arm__) || defined(__i386__)
  48. #include <string.h>
  49. #include <stdint.h>
  50. #ifdef _MSC_VER
  51. #define inline __inline
  52. #endif
  53. typedef uint8_t u8;
  54. typedef int32_t s32;
  55. typedef int64_t limb;
  56. /* Field element representation:
  57. *
  58. * Field elements are written as an array of signed, 64-bit limbs, least
  59. * significant first. The value of the field element is:
  60. * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ...
  61. *
  62. * i.e. the limbs are 26, 25, 26, 25, ... bits wide. */
  63. /* Sum two numbers: output += in */
  64. static void fsum(limb *output, const limb *in) {
  65. unsigned i;
  66. for (i = 0; i < 10; i += 2) {
  67. output[0+i] = output[0+i] + in[0+i];
  68. output[1+i] = output[1+i] + in[1+i];
  69. }
  70. }
  71. /* Find the difference of two numbers: output = in - output
  72. * (note the order of the arguments!). */
  73. static void fdifference(limb *output, const limb *in) {
  74. unsigned i;
  75. for (i = 0; i < 10; ++i) {
  76. output[i] = in[i] - output[i];
  77. }
  78. }
  79. /* Multiply a number by a scalar: output = in * scalar */
  80. static void fscalar_product(limb *output, const limb *in, const limb scalar) {
  81. unsigned i;
  82. for (i = 0; i < 10; ++i) {
  83. output[i] = in[i] * scalar;
  84. }
  85. }
  86. /* Multiply two numbers: output = in2 * in
  87. *
  88. * output must be distinct to both inputs. The inputs are reduced coefficient
  89. * form, the output is not.
  90. *
  91. * output[x] <= 14 * the largest product of the input limbs. */
  92. static void fproduct(limb *output, const limb *in2, const limb *in) {
  93. output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]);
  94. output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) +
  95. ((limb) ((s32) in2[1])) * ((s32) in[0]);
  96. output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) +
  97. ((limb) ((s32) in2[0])) * ((s32) in[2]) +
  98. ((limb) ((s32) in2[2])) * ((s32) in[0]);
  99. output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2]) +
  100. ((limb) ((s32) in2[2])) * ((s32) in[1]) +
  101. ((limb) ((s32) in2[0])) * ((s32) in[3]) +
  102. ((limb) ((s32) in2[3])) * ((s32) in[0]);
  103. output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2]) +
  104. 2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) +
  105. ((limb) ((s32) in2[3])) * ((s32) in[1])) +
  106. ((limb) ((s32) in2[0])) * ((s32) in[4]) +
  107. ((limb) ((s32) in2[4])) * ((s32) in[0]);
  108. output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3]) +
  109. ((limb) ((s32) in2[3])) * ((s32) in[2]) +
  110. ((limb) ((s32) in2[1])) * ((s32) in[4]) +
  111. ((limb) ((s32) in2[4])) * ((s32) in[1]) +
  112. ((limb) ((s32) in2[0])) * ((s32) in[5]) +
  113. ((limb) ((s32) in2[5])) * ((s32) in[0]);
  114. output[6] = 2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) +
  115. ((limb) ((s32) in2[1])) * ((s32) in[5]) +
  116. ((limb) ((s32) in2[5])) * ((s32) in[1])) +
  117. ((limb) ((s32) in2[2])) * ((s32) in[4]) +
  118. ((limb) ((s32) in2[4])) * ((s32) in[2]) +
  119. ((limb) ((s32) in2[0])) * ((s32) in[6]) +
  120. ((limb) ((s32) in2[6])) * ((s32) in[0]);
  121. output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4]) +
  122. ((limb) ((s32) in2[4])) * ((s32) in[3]) +
  123. ((limb) ((s32) in2[2])) * ((s32) in[5]) +
  124. ((limb) ((s32) in2[5])) * ((s32) in[2]) +
  125. ((limb) ((s32) in2[1])) * ((s32) in[6]) +
  126. ((limb) ((s32) in2[6])) * ((s32) in[1]) +
  127. ((limb) ((s32) in2[0])) * ((s32) in[7]) +
  128. ((limb) ((s32) in2[7])) * ((s32) in[0]);
  129. output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4]) +
  130. 2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) +
  131. ((limb) ((s32) in2[5])) * ((s32) in[3]) +
  132. ((limb) ((s32) in2[1])) * ((s32) in[7]) +
  133. ((limb) ((s32) in2[7])) * ((s32) in[1])) +
  134. ((limb) ((s32) in2[2])) * ((s32) in[6]) +
  135. ((limb) ((s32) in2[6])) * ((s32) in[2]) +
  136. ((limb) ((s32) in2[0])) * ((s32) in[8]) +
  137. ((limb) ((s32) in2[8])) * ((s32) in[0]);
  138. output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5]) +
  139. ((limb) ((s32) in2[5])) * ((s32) in[4]) +
  140. ((limb) ((s32) in2[3])) * ((s32) in[6]) +
  141. ((limb) ((s32) in2[6])) * ((s32) in[3]) +
  142. ((limb) ((s32) in2[2])) * ((s32) in[7]) +
  143. ((limb) ((s32) in2[7])) * ((s32) in[2]) +
  144. ((limb) ((s32) in2[1])) * ((s32) in[8]) +
  145. ((limb) ((s32) in2[8])) * ((s32) in[1]) +
  146. ((limb) ((s32) in2[0])) * ((s32) in[9]) +
  147. ((limb) ((s32) in2[9])) * ((s32) in[0]);
  148. output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) +
  149. ((limb) ((s32) in2[3])) * ((s32) in[7]) +
  150. ((limb) ((s32) in2[7])) * ((s32) in[3]) +
  151. ((limb) ((s32) in2[1])) * ((s32) in[9]) +
  152. ((limb) ((s32) in2[9])) * ((s32) in[1])) +
  153. ((limb) ((s32) in2[4])) * ((s32) in[6]) +
  154. ((limb) ((s32) in2[6])) * ((s32) in[4]) +
  155. ((limb) ((s32) in2[2])) * ((s32) in[8]) +
  156. ((limb) ((s32) in2[8])) * ((s32) in[2]);
  157. output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6]) +
  158. ((limb) ((s32) in2[6])) * ((s32) in[5]) +
  159. ((limb) ((s32) in2[4])) * ((s32) in[7]) +
  160. ((limb) ((s32) in2[7])) * ((s32) in[4]) +
  161. ((limb) ((s32) in2[3])) * ((s32) in[8]) +
  162. ((limb) ((s32) in2[8])) * ((s32) in[3]) +
  163. ((limb) ((s32) in2[2])) * ((s32) in[9]) +
  164. ((limb) ((s32) in2[9])) * ((s32) in[2]);
  165. output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6]) +
  166. 2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) +
  167. ((limb) ((s32) in2[7])) * ((s32) in[5]) +
  168. ((limb) ((s32) in2[3])) * ((s32) in[9]) +
  169. ((limb) ((s32) in2[9])) * ((s32) in[3])) +
  170. ((limb) ((s32) in2[4])) * ((s32) in[8]) +
  171. ((limb) ((s32) in2[8])) * ((s32) in[4]);
  172. output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7]) +
  173. ((limb) ((s32) in2[7])) * ((s32) in[6]) +
  174. ((limb) ((s32) in2[5])) * ((s32) in[8]) +
  175. ((limb) ((s32) in2[8])) * ((s32) in[5]) +
  176. ((limb) ((s32) in2[4])) * ((s32) in[9]) +
  177. ((limb) ((s32) in2[9])) * ((s32) in[4]);
  178. output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) +
  179. ((limb) ((s32) in2[5])) * ((s32) in[9]) +
  180. ((limb) ((s32) in2[9])) * ((s32) in[5])) +
  181. ((limb) ((s32) in2[6])) * ((s32) in[8]) +
  182. ((limb) ((s32) in2[8])) * ((s32) in[6]);
  183. output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8]) +
  184. ((limb) ((s32) in2[8])) * ((s32) in[7]) +
  185. ((limb) ((s32) in2[6])) * ((s32) in[9]) +
  186. ((limb) ((s32) in2[9])) * ((s32) in[6]);
  187. output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8]) +
  188. 2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) +
  189. ((limb) ((s32) in2[9])) * ((s32) in[7]));
  190. output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) +
  191. ((limb) ((s32) in2[9])) * ((s32) in[8]);
  192. output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]);
  193. }
  194. /* Reduce a long form to a short form by taking the input mod 2^255 - 19.
  195. *
  196. * On entry: |output[i]| < 14*2^54
  197. * On exit: |output[0..8]| < 280*2^54 */
  198. static void freduce_degree(limb *output) {
  199. /* Each of these shifts and adds ends up multiplying the value by 19.
  200. *
  201. * For output[0..8], the absolute entry value is < 14*2^54 and we add, at
  202. * most, 19*14*2^54 thus, on exit, |output[0..8]| < 280*2^54. */
  203. output[8] += output[18] << 4;
  204. output[8] += output[18] << 1;
  205. output[8] += output[18];
  206. output[7] += output[17] << 4;
  207. output[7] += output[17] << 1;
  208. output[7] += output[17];
  209. output[6] += output[16] << 4;
  210. output[6] += output[16] << 1;
  211. output[6] += output[16];
  212. output[5] += output[15] << 4;
  213. output[5] += output[15] << 1;
  214. output[5] += output[15];
  215. output[4] += output[14] << 4;
  216. output[4] += output[14] << 1;
  217. output[4] += output[14];
  218. output[3] += output[13] << 4;
  219. output[3] += output[13] << 1;
  220. output[3] += output[13];
  221. output[2] += output[12] << 4;
  222. output[2] += output[12] << 1;
  223. output[2] += output[12];
  224. output[1] += output[11] << 4;
  225. output[1] += output[11] << 1;
  226. output[1] += output[11];
  227. output[0] += output[10] << 4;
  228. output[0] += output[10] << 1;
  229. output[0] += output[10];
  230. }
  231. #if (-1 & 3) != 3
  232. #error "This code only works on a two's complement system"
  233. #endif
  234. /* return v / 2^26, using only shifts and adds.
  235. *
  236. * On entry: v can take any value. */
  237. static inline limb
  238. div_by_2_26(const limb v)
  239. {
  240. /* High word of v; no shift needed. */
  241. const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
  242. /* Set to all 1s if v was negative; else set to 0s. */
  243. const int32_t sign = ((int32_t) highword) >> 31;
  244. /* Set to 0x3ffffff if v was negative; else set to 0. */
  245. const int32_t roundoff = ((uint32_t) sign) >> 6;
  246. /* Should return v / (1<<26) */
  247. return (v + roundoff) >> 26;
  248. }
  249. /* return v / (2^25), using only shifts and adds.
  250. *
  251. * On entry: v can take any value. */
  252. static inline limb
  253. div_by_2_25(const limb v)
  254. {
  255. /* High word of v; no shift needed*/
  256. const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
  257. /* Set to all 1s if v was negative; else set to 0s. */
  258. const int32_t sign = ((int32_t) highword) >> 31;
  259. /* Set to 0x1ffffff if v was negative; else set to 0. */
  260. const int32_t roundoff = ((uint32_t) sign) >> 7;
  261. /* Should return v / (1<<25) */
  262. return (v + roundoff) >> 25;
  263. }
  264. /* Reduce all coefficients of the short form input so that |x| < 2^26.
  265. *
  266. * On entry: |output[i]| < 280*2^54 */
  267. static void freduce_coefficients(limb *output) {
  268. unsigned i;
  269. output[10] = 0;
  270. for (i = 0; i < 10; i += 2) {
  271. limb over = div_by_2_26(output[i]);
  272. /* The entry condition (that |output[i]| < 280*2^54) means that over is, at
  273. * most, 280*2^28 in the first iteration of this loop. This is added to the
  274. * next limb and we can approximate the resulting bound of that limb by
  275. * 281*2^54. */
  276. output[i] -= over << 26;
  277. output[i+1] += over;
  278. /* For the first iteration, |output[i+1]| < 281*2^54, thus |over| <
  279. * 281*2^29. When this is added to the next limb, the resulting bound can
  280. * be approximated as 281*2^54.
  281. *
  282. * For subsequent iterations of the loop, 281*2^54 remains a conservative
  283. * bound and no overflow occurs. */
  284. over = div_by_2_25(output[i+1]);
  285. output[i+1] -= over << 25;
  286. output[i+2] += over;
  287. }
  288. /* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */
  289. output[0] += output[10] << 4;
  290. output[0] += output[10] << 1;
  291. output[0] += output[10];
  292. output[10] = 0;
  293. /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29
  294. * So |over| will be no more than 2^16. */
  295. {
  296. limb over = div_by_2_26(output[0]);
  297. output[0] -= over << 26;
  298. output[1] += over;
  299. }
  300. /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The
  301. * bound on |output[1]| is sufficient to meet our needs. */
  302. }
  303. /* A helpful wrapper around fproduct: output = in * in2.
  304. *
  305. * On entry: |in[i]| < 2^27 and |in2[i]| < 2^27.
  306. *
  307. * output must be distinct to both inputs. The output is reduced degree
  308. * (indeed, one need only provide storage for 10 limbs) and |output[i]| < 2^26. */
  309. static void
  310. fmul(limb *output, const limb *in, const limb *in2) {
  311. limb t[19];
  312. fproduct(t, in, in2);
  313. /* |t[i]| < 14*2^54 */
  314. freduce_degree(t);
  315. freduce_coefficients(t);
  316. /* |t[i]| < 2^26 */
  317. memcpy(output, t, sizeof(limb) * 10);
  318. }
  319. /* Square a number: output = in**2
  320. *
  321. * output must be distinct from the input. The inputs are reduced coefficient
  322. * form, the output is not.
  323. *
  324. * output[x] <= 14 * the largest product of the input limbs. */
  325. static void fsquare_inner(limb *output, const limb *in) {
  326. output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]);
  327. output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]);
  328. output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) +
  329. ((limb) ((s32) in[0])) * ((s32) in[2]));
  330. output[3] = 2 * (((limb) ((s32) in[1])) * ((s32) in[2]) +
  331. ((limb) ((s32) in[0])) * ((s32) in[3]));
  332. output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) +
  333. 4 * ((limb) ((s32) in[1])) * ((s32) in[3]) +
  334. 2 * ((limb) ((s32) in[0])) * ((s32) in[4]);
  335. output[5] = 2 * (((limb) ((s32) in[2])) * ((s32) in[3]) +
  336. ((limb) ((s32) in[1])) * ((s32) in[4]) +
  337. ((limb) ((s32) in[0])) * ((s32) in[5]));
  338. output[6] = 2 * (((limb) ((s32) in[3])) * ((s32) in[3]) +
  339. ((limb) ((s32) in[2])) * ((s32) in[4]) +
  340. ((limb) ((s32) in[0])) * ((s32) in[6]) +
  341. 2 * ((limb) ((s32) in[1])) * ((s32) in[5]));
  342. output[7] = 2 * (((limb) ((s32) in[3])) * ((s32) in[4]) +
  343. ((limb) ((s32) in[2])) * ((s32) in[5]) +
  344. ((limb) ((s32) in[1])) * ((s32) in[6]) +
  345. ((limb) ((s32) in[0])) * ((s32) in[7]));
  346. output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) +
  347. 2 * (((limb) ((s32) in[2])) * ((s32) in[6]) +
  348. ((limb) ((s32) in[0])) * ((s32) in[8]) +
  349. 2 * (((limb) ((s32) in[1])) * ((s32) in[7]) +
  350. ((limb) ((s32) in[3])) * ((s32) in[5])));
  351. output[9] = 2 * (((limb) ((s32) in[4])) * ((s32) in[5]) +
  352. ((limb) ((s32) in[3])) * ((s32) in[6]) +
  353. ((limb) ((s32) in[2])) * ((s32) in[7]) +
  354. ((limb) ((s32) in[1])) * ((s32) in[8]) +
  355. ((limb) ((s32) in[0])) * ((s32) in[9]));
  356. output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) +
  357. ((limb) ((s32) in[4])) * ((s32) in[6]) +
  358. ((limb) ((s32) in[2])) * ((s32) in[8]) +
  359. 2 * (((limb) ((s32) in[3])) * ((s32) in[7]) +
  360. ((limb) ((s32) in[1])) * ((s32) in[9])));
  361. output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) +
  362. ((limb) ((s32) in[4])) * ((s32) in[7]) +
  363. ((limb) ((s32) in[3])) * ((s32) in[8]) +
  364. ((limb) ((s32) in[2])) * ((s32) in[9]));
  365. output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) +
  366. 2 * (((limb) ((s32) in[4])) * ((s32) in[8]) +
  367. 2 * (((limb) ((s32) in[5])) * ((s32) in[7]) +
  368. ((limb) ((s32) in[3])) * ((s32) in[9])));
  369. output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) +
  370. ((limb) ((s32) in[5])) * ((s32) in[8]) +
  371. ((limb) ((s32) in[4])) * ((s32) in[9]));
  372. output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) +
  373. ((limb) ((s32) in[6])) * ((s32) in[8]) +
  374. 2 * ((limb) ((s32) in[5])) * ((s32) in[9]));
  375. output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) +
  376. ((limb) ((s32) in[6])) * ((s32) in[9]));
  377. output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) +
  378. 4 * ((limb) ((s32) in[7])) * ((s32) in[9]);
  379. output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]);
  380. output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]);
  381. }
  382. /* fsquare sets output = in^2.
  383. *
  384. * On entry: The |in| argument is in reduced coefficients form and |in[i]| <
  385. * 2^27.
  386. *
  387. * On exit: The |output| argument is in reduced coefficients form (indeed, one
  388. * need only provide storage for 10 limbs) and |out[i]| < 2^26. */
  389. static void
  390. fsquare(limb *output, const limb *in) {
  391. limb t[19];
  392. fsquare_inner(t, in);
  393. /* |t[i]| < 14*2^54 because the largest product of two limbs will be <
  394. * 2^(27+27) and fsquare_inner adds together, at most, 14 of those
  395. * products. */
  396. freduce_degree(t);
  397. freduce_coefficients(t);
  398. /* |t[i]| < 2^26 */
  399. memcpy(output, t, sizeof(limb) * 10);
  400. }
  401. /* Take a little-endian, 32-byte number and expand it into polynomial form */
  402. static void
  403. fexpand(limb *output, const u8 *input) {
  404. #define F(n,start,shift,mask) \
  405. output[n] = ((((limb) input[start + 0]) | \
  406. ((limb) input[start + 1]) << 8 | \
  407. ((limb) input[start + 2]) << 16 | \
  408. ((limb) input[start + 3]) << 24) >> shift) & mask;
  409. F(0, 0, 0, 0x3ffffff);
  410. F(1, 3, 2, 0x1ffffff);
  411. F(2, 6, 3, 0x3ffffff);
  412. F(3, 9, 5, 0x1ffffff);
  413. F(4, 12, 6, 0x3ffffff);
  414. F(5, 16, 0, 0x1ffffff);
  415. F(6, 19, 1, 0x3ffffff);
  416. F(7, 22, 3, 0x1ffffff);
  417. F(8, 25, 4, 0x3ffffff);
  418. F(9, 28, 6, 0x1ffffff);
  419. #undef F
  420. }
  421. #if (-32 >> 1) != -16
  422. #error "This code only works when >> does sign-extension on negative numbers"
  423. #endif
  424. /* s32_eq returns 0xffffffff iff a == b and zero otherwise. */
  425. static s32 s32_eq(s32 a, s32 b) {
  426. a = ~(a ^ b);
  427. a &= a << 16;
  428. a &= a << 8;
  429. a &= a << 4;
  430. a &= a << 2;
  431. a &= a << 1;
  432. return a >> 31;
  433. }
  434. /* s32_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are
  435. * both non-negative. */
  436. static s32 s32_gte(s32 a, s32 b) {
  437. a -= b;
  438. /* a >= 0 iff a >= b. */
  439. return ~(a >> 31);
  440. }
  441. /* Take a fully reduced polynomial form number and contract it into a
  442. * little-endian, 32-byte array.
  443. *
  444. * On entry: |input_limbs[i]| < 2^26 */
  445. static void
  446. fcontract(u8 *output, limb *input_limbs) {
  447. int i;
  448. int j;
  449. s32 input[10];
  450. s32 mask;
  451. /* |input_limbs[i]| < 2^26, so it's valid to convert to an s32. */
  452. for (i = 0; i < 10; i++) {
  453. input[i] = input_limbs[i];
  454. }
  455. for (j = 0; j < 2; ++j) {
  456. for (i = 0; i < 9; ++i) {
  457. if ((i & 1) == 1) {
  458. /* This calculation is a time-invariant way to make input[i]
  459. * non-negative by borrowing from the next-larger limb. */
  460. const s32 mask = input[i] >> 31;
  461. const s32 carry = -((input[i] & mask) >> 25);
  462. input[i] = input[i] + (carry << 25);
  463. input[i+1] = input[i+1] - carry;
  464. } else {
  465. const s32 mask = input[i] >> 31;
  466. const s32 carry = -((input[i] & mask) >> 26);
  467. input[i] = input[i] + (carry << 26);
  468. input[i+1] = input[i+1] - carry;
  469. }
  470. }
  471. /* There's no greater limb for input[9] to borrow from, but we can multiply
  472. * by 19 and borrow from input[0], which is valid mod 2^255-19. */
  473. {
  474. const s32 mask = input[9] >> 31;
  475. const s32 carry = -((input[9] & mask) >> 25);
  476. input[9] = input[9] + (carry << 25);
  477. input[0] = input[0] - (carry * 19);
  478. }
  479. /* After the first iteration, input[1..9] are non-negative and fit within
  480. * 25 or 26 bits, depending on position. However, input[0] may be
  481. * negative. */
  482. }
  483. /* The first borrow-propagation pass above ended with every limb
  484. except (possibly) input[0] non-negative.
  485. If input[0] was negative after the first pass, then it was because of a
  486. carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most,
  487. one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19.
  488. In the second pass, each limb is decreased by at most one. Thus the second
  489. borrow-propagation pass could only have wrapped around to decrease
  490. input[0] again if the first pass left input[0] negative *and* input[1]
  491. through input[9] were all zero. In that case, input[1] is now 2^25 - 1,
  492. and this last borrow-propagation step will leave input[1] non-negative. */
  493. {
  494. const s32 mask = input[0] >> 31;
  495. const s32 carry = -((input[0] & mask) >> 26);
  496. input[0] = input[0] + (carry << 26);
  497. input[1] = input[1] - carry;
  498. }
  499. /* All input[i] are now non-negative. However, there might be values between
  500. * 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. */
  501. for (j = 0; j < 2; j++) {
  502. for (i = 0; i < 9; i++) {
  503. if ((i & 1) == 1) {
  504. const s32 carry = input[i] >> 25;
  505. input[i] &= 0x1ffffff;
  506. input[i+1] += carry;
  507. } else {
  508. const s32 carry = input[i] >> 26;
  509. input[i] &= 0x3ffffff;
  510. input[i+1] += carry;
  511. }
  512. }
  513. {
  514. const s32 carry = input[9] >> 25;
  515. input[9] &= 0x1ffffff;
  516. input[0] += 19*carry;
  517. }
  518. }
  519. /* If the first carry-chain pass, just above, ended up with a carry from
  520. * input[9], and that caused input[0] to be out-of-bounds, then input[0] was
  521. * < 2^26 + 2*19, because the carry was, at most, two.
  522. *
  523. * If the second pass carried from input[9] again then input[0] is < 2*19 and
  524. * the input[9] -> input[0] carry didn't push input[0] out of bounds. */
  525. /* It still remains the case that input might be between 2^255-19 and 2^255.
  526. * In this case, input[1..9] must take their maximum value and input[0] must
  527. * be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. */
  528. mask = s32_gte(input[0], 0x3ffffed);
  529. for (i = 1; i < 10; i++) {
  530. if ((i & 1) == 1) {
  531. mask &= s32_eq(input[i], 0x1ffffff);
  532. } else {
  533. mask &= s32_eq(input[i], 0x3ffffff);
  534. }
  535. }
  536. /* mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus
  537. * this conditionally subtracts 2^255-19. */
  538. input[0] -= mask & 0x3ffffed;
  539. for (i = 1; i < 10; i++) {
  540. if ((i & 1) == 1) {
  541. input[i] -= mask & 0x1ffffff;
  542. } else {
  543. input[i] -= mask & 0x3ffffff;
  544. }
  545. }
  546. input[1] <<= 2;
  547. input[2] <<= 3;
  548. input[3] <<= 5;
  549. input[4] <<= 6;
  550. input[6] <<= 1;
  551. input[7] <<= 3;
  552. input[8] <<= 4;
  553. input[9] <<= 6;
  554. #define F(i, s) \
  555. output[s+0] |= input[i] & 0xff; \
  556. output[s+1] = (input[i] >> 8) & 0xff; \
  557. output[s+2] = (input[i] >> 16) & 0xff; \
  558. output[s+3] = (input[i] >> 24) & 0xff;
  559. output[0] = 0;
  560. output[16] = 0;
  561. F(0,0);
  562. F(1,3);
  563. F(2,6);
  564. F(3,9);
  565. F(4,12);
  566. F(5,16);
  567. F(6,19);
  568. F(7,22);
  569. F(8,25);
  570. F(9,28);
  571. #undef F
  572. }
  573. /* Input: Q, Q', Q-Q'
  574. * Output: 2Q, Q+Q'
  575. *
  576. * x2 z3: long form
  577. * x3 z3: long form
  578. * x z: short form, destroyed
  579. * xprime zprime: short form, destroyed
  580. * qmqp: short form, preserved
  581. *
  582. * On entry and exit, the absolute value of the limbs of all inputs and outputs
  583. * are < 2^26. */
  584. static void fmonty(limb *x2, limb *z2, /* output 2Q */
  585. limb *x3, limb *z3, /* output Q + Q' */
  586. limb *x, limb *z, /* input Q */
  587. limb *xprime, limb *zprime, /* input Q' */
  588. const limb *qmqp /* input Q - Q' */) {
  589. limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
  590. zzprime[19], zzzprime[19], xxxprime[19];
  591. memcpy(origx, x, 10 * sizeof(limb));
  592. fsum(x, z);
  593. /* |x[i]| < 2^27 */
  594. fdifference(z, origx); /* does x - z */
  595. /* |z[i]| < 2^27 */
  596. memcpy(origxprime, xprime, sizeof(limb) * 10);
  597. fsum(xprime, zprime);
  598. /* |xprime[i]| < 2^27 */
  599. fdifference(zprime, origxprime);
  600. /* |zprime[i]| < 2^27 */
  601. fproduct(xxprime, xprime, z);
  602. /* |xxprime[i]| < 14*2^54: the largest product of two limbs will be <
  603. * 2^(27+27) and fproduct adds together, at most, 14 of those products.
  604. * (Approximating that to 2^58 doesn't work out.) */
  605. fproduct(zzprime, x, zprime);
  606. /* |zzprime[i]| < 14*2^54 */
  607. freduce_degree(xxprime);
  608. freduce_coefficients(xxprime);
  609. /* |xxprime[i]| < 2^26 */
  610. freduce_degree(zzprime);
  611. freduce_coefficients(zzprime);
  612. /* |zzprime[i]| < 2^26 */
  613. memcpy(origxprime, xxprime, sizeof(limb) * 10);
  614. fsum(xxprime, zzprime);
  615. /* |xxprime[i]| < 2^27 */
  616. fdifference(zzprime, origxprime);
  617. /* |zzprime[i]| < 2^27 */
  618. fsquare(xxxprime, xxprime);
  619. /* |xxxprime[i]| < 2^26 */
  620. fsquare(zzzprime, zzprime);
  621. /* |zzzprime[i]| < 2^26 */
  622. fproduct(zzprime, zzzprime, qmqp);
  623. /* |zzprime[i]| < 14*2^52 */
  624. freduce_degree(zzprime);
  625. freduce_coefficients(zzprime);
  626. /* |zzprime[i]| < 2^26 */
  627. memcpy(x3, xxxprime, sizeof(limb) * 10);
  628. memcpy(z3, zzprime, sizeof(limb) * 10);
  629. fsquare(xx, x);
  630. /* |xx[i]| < 2^26 */
  631. fsquare(zz, z);
  632. /* |zz[i]| < 2^26 */
  633. fproduct(x2, xx, zz);
  634. /* |x2[i]| < 14*2^52 */
  635. freduce_degree(x2);
  636. freduce_coefficients(x2);
  637. /* |x2[i]| < 2^26 */
  638. fdifference(zz, xx); // does zz = xx - zz
  639. /* |zz[i]| < 2^27 */
  640. memset(zzz + 10, 0, sizeof(limb) * 9);
  641. fscalar_product(zzz, zz, 121665);
  642. /* |zzz[i]| < 2^(27+17) */
  643. /* No need to call freduce_degree here:
  644. fscalar_product doesn't increase the degree of its input. */
  645. freduce_coefficients(zzz);
  646. /* |zzz[i]| < 2^26 */
  647. fsum(zzz, xx);
  648. /* |zzz[i]| < 2^27 */
  649. fproduct(z2, zz, zzz);
  650. /* |z2[i]| < 14*2^(26+27) */
  651. freduce_degree(z2);
  652. freduce_coefficients(z2);
  653. /* |z2|i| < 2^26 */
  654. }
  655. /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
  656. * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid
  657. * side-channel attacks.
  658. *
  659. * NOTE that this function requires that 'iswap' be 1 or 0; other values give
  660. * wrong results. Also, the two limb arrays must be in reduced-coefficient,
  661. * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
  662. * and all all values in a[0..9],b[0..9] must have magnitude less than
  663. * INT32_MAX. */
  664. static void
  665. swap_conditional(limb a[19], limb b[19], limb iswap) {
  666. unsigned i;
  667. const s32 swap = (s32) -iswap;
  668. for (i = 0; i < 10; ++i) {
  669. const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) );
  670. a[i] = ((s32)a[i]) ^ x;
  671. b[i] = ((s32)b[i]) ^ x;
  672. }
  673. }
  674. /* Calculates nQ where Q is the x-coordinate of a point on the curve
  675. *
  676. * resultx/resultz: the x coordinate of the resulting curve point (short form)
  677. * n: a little endian, 32-byte number
  678. * q: a point of the curve (short form) */
  679. static void
  680. cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) {
  681. limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
  682. limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
  683. limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
  684. limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
  685. unsigned i, j;
  686. memcpy(nqpqx, q, sizeof(limb) * 10);
  687. for (i = 0; i < 32; ++i) {
  688. u8 byte = n[31 - i];
  689. for (j = 0; j < 8; ++j) {
  690. const limb bit = byte >> 7;
  691. swap_conditional(nqx, nqpqx, bit);
  692. swap_conditional(nqz, nqpqz, bit);
  693. fmonty(nqx2, nqz2,
  694. nqpqx2, nqpqz2,
  695. nqx, nqz,
  696. nqpqx, nqpqz,
  697. q);
  698. swap_conditional(nqx2, nqpqx2, bit);
  699. swap_conditional(nqz2, nqpqz2, bit);
  700. t = nqx;
  701. nqx = nqx2;
  702. nqx2 = t;
  703. t = nqz;
  704. nqz = nqz2;
  705. nqz2 = t;
  706. t = nqpqx;
  707. nqpqx = nqpqx2;
  708. nqpqx2 = t;
  709. t = nqpqz;
  710. nqpqz = nqpqz2;
  711. nqpqz2 = t;
  712. byte <<= 1;
  713. }
  714. }
  715. memcpy(resultx, nqx, sizeof(limb) * 10);
  716. memcpy(resultz, nqz, sizeof(limb) * 10);
  717. }
  718. // -----------------------------------------------------------------------------
  719. // Shamelessly copied from djb's code
  720. // -----------------------------------------------------------------------------
  721. static void
  722. crecip(limb *out, const limb *z) {
  723. limb z2[10];
  724. limb z9[10];
  725. limb z11[10];
  726. limb z2_5_0[10];
  727. limb z2_10_0[10];
  728. limb z2_20_0[10];
  729. limb z2_50_0[10];
  730. limb z2_100_0[10];
  731. limb t0[10];
  732. limb t1[10];
  733. int i;
  734. /* 2 */ fsquare(z2,z);
  735. /* 4 */ fsquare(t1,z2);
  736. /* 8 */ fsquare(t0,t1);
  737. /* 9 */ fmul(z9,t0,z);
  738. /* 11 */ fmul(z11,z9,z2);
  739. /* 22 */ fsquare(t0,z11);
  740. /* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9);
  741. /* 2^6 - 2^1 */ fsquare(t0,z2_5_0);
  742. /* 2^7 - 2^2 */ fsquare(t1,t0);
  743. /* 2^8 - 2^3 */ fsquare(t0,t1);
  744. /* 2^9 - 2^4 */ fsquare(t1,t0);
  745. /* 2^10 - 2^5 */ fsquare(t0,t1);
  746. /* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0);
  747. /* 2^11 - 2^1 */ fsquare(t0,z2_10_0);
  748. /* 2^12 - 2^2 */ fsquare(t1,t0);
  749. /* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  750. /* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0);
  751. /* 2^21 - 2^1 */ fsquare(t0,z2_20_0);
  752. /* 2^22 - 2^2 */ fsquare(t1,t0);
  753. /* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  754. /* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0);
  755. /* 2^41 - 2^1 */ fsquare(t1,t0);
  756. /* 2^42 - 2^2 */ fsquare(t0,t1);
  757. /* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
  758. /* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0);
  759. /* 2^51 - 2^1 */ fsquare(t0,z2_50_0);
  760. /* 2^52 - 2^2 */ fsquare(t1,t0);
  761. /* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  762. /* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0);
  763. /* 2^101 - 2^1 */ fsquare(t1,z2_100_0);
  764. /* 2^102 - 2^2 */ fsquare(t0,t1);
  765. /* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
  766. /* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0);
  767. /* 2^201 - 2^1 */ fsquare(t0,t1);
  768. /* 2^202 - 2^2 */ fsquare(t1,t0);
  769. /* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
  770. /* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0);
  771. /* 2^251 - 2^1 */ fsquare(t1,t0);
  772. /* 2^252 - 2^2 */ fsquare(t0,t1);
  773. /* 2^253 - 2^3 */ fsquare(t1,t0);
  774. /* 2^254 - 2^4 */ fsquare(t0,t1);
  775. /* 2^255 - 2^5 */ fsquare(t1,t0);
  776. /* 2^255 - 21 */ fmul(out,t1,z11);
  777. }
  778. int
  779. crypto_scalarmult_curve25519(u8 *mypublic, const u8 *secret, const u8 *basepoint) {
  780. limb bp[10], x[10], z[11], zmone[10];
  781. uint8_t e[32];
  782. int i;
  783. for (i = 0; i < 32; ++i) e[i] = secret[i];
  784. e[0] &= 248;
  785. e[31] &= 127;
  786. e[31] |= 64;
  787. fexpand(bp, basepoint);
  788. cmult(x, z, e, bp);
  789. crecip(zmone, z);
  790. fmul(z, x, zmone);
  791. fcontract(mypublic, z);
  792. return 0;
  793. }
  794. #endif