/* vi: set sw=4 ts=4: */
/*
 * Utility routines.
 *
 * Copyright (C) 2010 Denys Vlasenko
 *
 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
 */

#include "libbb.h"

/* gcc 4.2.1 optimizes rotr64 better with inline than with macro
 * (for rotX32, there is no difference). Why? My guess is that
 * macro requires clever common subexpression elimination heuristics
 * in gcc, while inline basically forces it to happen.
 */
//#define rotl32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
static ALWAYS_INLINE uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}
//#define rotr32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))
static ALWAYS_INLINE uint32_t rotr32(uint32_t x, unsigned n)
{
	return (x >> n) | (x << (32 - n));
}
/* rotr64 is needed for sha512 only: */
//#define rotr64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))
static ALWAYS_INLINE uint64_t rotr64(uint64_t x, unsigned n)
{
	return (x >> n) | (x << (64 - n));
}

/* rotl64 only used for sha3 currently */
static ALWAYS_INLINE uint64_t rotl64(uint64_t x, unsigned n)
{
	return (x << n) | (x >> (64 - n));
}
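
/* A minimal standalone sanity check for the rotate helpers above
 * (a sketch, not part of the build; enable the block to test).
 * Note that the idiom is only defined for n in 1..31 (or 1..63):
 * for n == 0 the expression shifts by the full word width, which is
 * undefined behavior in C. All callers in this file pass nonzero
 * constants.
 */
#if 0
#include <assert.h>
int main(void)
{
	/* rotl32 moves the top byte to the bottom */
	assert(rotl32(0x12345678, 8) == 0x34567812);
	/* rotating left by n then right by n restores the value */
	assert(rotr32(rotl32(0x12345678, 8), 8) == 0x12345678);
	assert(rotr64(0x0000000000000001ULL, 1) == 0x8000000000000000ULL);
	return 0;
}
#endif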

/* Feed data through a temporary buffer.
 * The internal buffer remembers previous data until it has 64
 * bytes worth to pass on.
 */
static void FAST_FUNC common64_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
{
	unsigned bufpos = ctx->total64 & 63;

	ctx->total64 += len;

	while (1) {
		unsigned remaining = 64 - bufpos;
		if (remaining > len)
			remaining = len;
		/* Copy data into aligned buffer */
		memcpy(ctx->wbuffer + bufpos, buffer, remaining);
		len -= remaining;
		buffer = (const char *)buffer + remaining;
		bufpos += remaining;
		/* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
		bufpos -= 64;
		if (bufpos != 0)
			break;
		/* Buffer is filled up, process it */
		ctx->process_block(ctx);
		/*bufpos = 0; - already is */
	}
}

/* Process the remaining bytes in the buffer */
static void FAST_FUNC common64_end(md5_ctx_t *ctx, int swap_needed)
{
	unsigned bufpos = ctx->total64 & 63;
	/* Pad the buffer to the next 64-byte boundary with 0x80,0,0,0... */
	ctx->wbuffer[bufpos++] = 0x80;

	/* This loop iterates either once or twice, no more, no less */
	while (1) {
		unsigned remaining = 64 - bufpos;
		memset(ctx->wbuffer + bufpos, 0, remaining);
		/* Do we have enough space for the length count? */
		if (remaining >= 8) {
			/* Store the 64-bit counter of bits in the buffer */
			uint64_t t = ctx->total64 << 3;
			if (swap_needed)
				t = bb_bswap_64(t);
			/* wbuffer is suitably aligned for this */
			*(bb__aliased_uint64_t *) (&ctx->wbuffer[64 - 8]) = t;
		}
		ctx->process_block(ctx);
		if (remaining >= 8)
			break;
		bufpos = 0;
	}
}
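
/* A worked example of the padding above (a sketch, not normative text):
 * with 55 message bytes in the last block, the 0x80 byte lands at
 * offset 55 and "remaining" is 8, so the bit count fits and the loop
 * runs once. With 56..63 message bytes there is no room left for the
 * 8-byte count after the 0x80 byte, so a second, all-padding block is
 * processed - hence "once or twice, no more, no less".
 */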


/*
 * Compute MD5 checksum of strings according to the
 * definition of MD5 in RFC 1321 from April 1992.
 *
 * Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
 *
 * Copyright (C) 1995-1999 Free Software Foundation, Inc.
 * Copyright (C) 2001 Manuel Novoa III
 * Copyright (C) 2003 Glenn L. McGrath
 * Copyright (C) 2003 Erik Andersen
 *
 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
 */

/* 0: fastest, 3: smallest */
#if CONFIG_MD5_SMALL < 0
# define MD5_SMALL 0
#elif CONFIG_MD5_SMALL > 3
# define MD5_SMALL 3
#else
# define MD5_SMALL CONFIG_MD5_SMALL
#endif

/* These are the four functions used in the four steps of the MD5 algorithm
 * and defined in RFC 1321. The first function is a little bit optimized
 * (as found in Colin Plumb's public domain implementation).
 * #define FF(b, c, d) ((b & c) | (~b & d))
 */
#undef FF
#undef FG
#undef FH
#undef FI
#define FF(b, c, d) (d ^ (b & (c ^ d)))
#define FG(b, c, d) FF(d, b, c)
#define FH(b, c, d) (b ^ c ^ d)
#define FI(b, c, d) (c ^ (b | ~d))
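
/* A small standalone check (a sketch, not part of the build) that the
 * optimized FF above matches the RFC 1321 form ((b & c) | (~b & d)).
 * The rewrite works bitwise: where a bit of b is 1, c ^ d ^ d selects c;
 * where it is 0, d passes through unchanged. Enable the block to test.
 */
#if 0
#include <assert.h>
#include <stdlib.h>
int main(void)
{
	int i;
	srand(1);
	for (i = 0; i < 100000; i++) {
		uint32_t b = rand(), c = rand(), d = rand();
		/* RFC 1321: F(X,Y,Z) = XY v not(X) Z */
		assert(FF(b, c, d) == ((b & c) | (~b & d)));
		/* RFC 1321: G(X,Y,Z) = XZ v Y not(Z); FG swaps arguments of FF */
		assert(FG(b, c, d) == ((b & d) | (c & ~d)));
	}
	return 0;
}
#endif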

/* Hash a single block, 64 bytes long and 4-byte aligned */
static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
{
#if MD5_SMALL > 0
	/* Before we start, one word to the strange constants.
	   They are defined in RFC 1321 as
	   T[i] = (int)(2^32 * fabs(sin(i))), i=1..64
	 */
	static const uint32_t C_array[] = {
		/* round 1 */
		0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
		0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
		0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
		0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
		/* round 2 */
		0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
		0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
		0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
		0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
		/* round 3 */
		0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
		0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
		0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
		0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
		/* round 4 */
		0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
		0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
		0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
		0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
	};
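	/* For reference, the table above can be regenerated from the
	 * RFC 1321 formula with a sketch like this (truncation toward
	 * zero equals floor here because the operand is non-negative):
	 *
	 *   for (i = 1; i <= 64; i++)
	 *       printf("0x%08x,\n", (uint32_t)(4294967296.0 * fabs(sin(i))));
	 */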
	static const char P_array[] ALIGN1 = {
# if MD5_SMALL > 1
		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* 1 */
# endif
		1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, /* 2 */
		5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, /* 3 */
		0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9 /* 4 */
	};
#endif
	uint32_t *words = (void*) ctx->wbuffer;
	uint32_t A = ctx->hash[0];
	uint32_t B = ctx->hash[1];
	uint32_t C = ctx->hash[2];
	uint32_t D = ctx->hash[3];

#if MD5_SMALL >= 2 /* 2 or 3 */

	static const char S_array[] ALIGN1 = {
		7, 12, 17, 22,
		5, 9, 14, 20,
		4, 11, 16, 23,
		6, 10, 15, 21
	};
	const uint32_t *pc;
	const char *pp;
	const char *ps;
	int i;
	uint32_t temp;

	if (BB_BIG_ENDIAN)
		for (i = 0; i < 16; i++)
			words[i] = SWAP_LE32(words[i]);

# if MD5_SMALL == 3
	pc = C_array;
	pp = P_array;
	ps = S_array - 4;

	for (i = 0; i < 64; i++) {
		if ((i & 0x0f) == 0)
			ps += 4;
		temp = A;
		switch (i >> 4) {
		case 0:
			temp += FF(B, C, D);
			break;
		case 1:
			temp += FG(B, C, D);
			break;
		case 2:
			temp += FH(B, C, D);
			break;
		default: /* case 3 */
			temp += FI(B, C, D);
		}
		temp += words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
# else /* MD5_SMALL == 2 */
	pc = C_array;
	pp = P_array;
	ps = S_array;

	for (i = 0; i < 16; i++) {
		temp = A + FF(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
	ps += 4;
	for (i = 0; i < 16; i++) {
		temp = A + FG(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
	ps += 4;
	for (i = 0; i < 16; i++) {
		temp = A + FH(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
	ps += 4;
	for (i = 0; i < 16; i++) {
		temp = A + FI(B, C, D) + words[(int) (*pp++)] + *pc++;
		temp = rotl32(temp, ps[i & 3]);
		temp += B;
		A = D;
		D = C;
		C = B;
		B = temp;
	}
# endif
	/* Add checksum to the starting values */
	ctx->hash[0] += A;
	ctx->hash[1] += B;
	ctx->hash[2] += C;
	ctx->hash[3] += D;

#else /* MD5_SMALL == 0 or 1 */

# if MD5_SMALL == 1
	const uint32_t *pc;
	const char *pp;
	int i;
# endif

	/* First round: using the given function, the context and a constant
	   the next context is computed. Because the algorithm's processing
	   unit is a 32-bit word, and it is defined to work on words in
	   little-endian byte order, we may have to change the byte order
	   before the computation. To reduce the work for the next steps
	   we save swapped words in the WORDS array. */
# undef OP
# define OP(a, b, c, d, s, T) \
	do { \
		a += FF(b, c, d) + (*words IF_BIG_ENDIAN(= SWAP_LE32(*words))) + T; \
		words++; \
		a = rotl32(a, s); \
		a += b; \
	} while (0)

	/* Round 1 */
# if MD5_SMALL == 1
	pc = C_array;
	for (i = 0; i < 4; i++) {
		OP(A, B, C, D, 7, *pc++);
		OP(D, A, B, C, 12, *pc++);
		OP(C, D, A, B, 17, *pc++);
		OP(B, C, D, A, 22, *pc++);
	}
# else
	OP(A, B, C, D, 7, 0xd76aa478);
	OP(D, A, B, C, 12, 0xe8c7b756);
	OP(C, D, A, B, 17, 0x242070db);
	OP(B, C, D, A, 22, 0xc1bdceee);
	OP(A, B, C, D, 7, 0xf57c0faf);
	OP(D, A, B, C, 12, 0x4787c62a);
	OP(C, D, A, B, 17, 0xa8304613);
	OP(B, C, D, A, 22, 0xfd469501);
	OP(A, B, C, D, 7, 0x698098d8);
	OP(D, A, B, C, 12, 0x8b44f7af);
	OP(C, D, A, B, 17, 0xffff5bb1);
	OP(B, C, D, A, 22, 0x895cd7be);
	OP(A, B, C, D, 7, 0x6b901122);
	OP(D, A, B, C, 12, 0xfd987193);
	OP(C, D, A, B, 17, 0xa679438e);
	OP(B, C, D, A, 22, 0x49b40821);
# endif
	words -= 16;

	/* For the second to fourth round we have the possibly swapped words
	   in WORDS. Redefine the macro to take an additional first
	   argument specifying the function to use. */
# undef OP
# define OP(f, a, b, c, d, k, s, T) \
	do { \
		a += f(b, c, d) + words[k] + T; \
		a = rotl32(a, s); \
		a += b; \
	} while (0)

	/* Round 2 */
# if MD5_SMALL == 1
	pp = P_array;
	for (i = 0; i < 4; i++) {
		OP(FG, A, B, C, D, (int) (*pp++), 5, *pc++);
		OP(FG, D, A, B, C, (int) (*pp++), 9, *pc++);
		OP(FG, C, D, A, B, (int) (*pp++), 14, *pc++);
		OP(FG, B, C, D, A, (int) (*pp++), 20, *pc++);
	}
# else
	OP(FG, A, B, C, D, 1, 5, 0xf61e2562);
	OP(FG, D, A, B, C, 6, 9, 0xc040b340);
	OP(FG, C, D, A, B, 11, 14, 0x265e5a51);
	OP(FG, B, C, D, A, 0, 20, 0xe9b6c7aa);
	OP(FG, A, B, C, D, 5, 5, 0xd62f105d);
	OP(FG, D, A, B, C, 10, 9, 0x02441453);
	OP(FG, C, D, A, B, 15, 14, 0xd8a1e681);
	OP(FG, B, C, D, A, 4, 20, 0xe7d3fbc8);
	OP(FG, A, B, C, D, 9, 5, 0x21e1cde6);
	OP(FG, D, A, B, C, 14, 9, 0xc33707d6);
	OP(FG, C, D, A, B, 3, 14, 0xf4d50d87);
	OP(FG, B, C, D, A, 8, 20, 0x455a14ed);
	OP(FG, A, B, C, D, 13, 5, 0xa9e3e905);
	OP(FG, D, A, B, C, 2, 9, 0xfcefa3f8);
	OP(FG, C, D, A, B, 7, 14, 0x676f02d9);
	OP(FG, B, C, D, A, 12, 20, 0x8d2a4c8a);
# endif

	/* Round 3 */
# if MD5_SMALL == 1
	for (i = 0; i < 4; i++) {
		OP(FH, A, B, C, D, (int) (*pp++), 4, *pc++);
		OP(FH, D, A, B, C, (int) (*pp++), 11, *pc++);
		OP(FH, C, D, A, B, (int) (*pp++), 16, *pc++);
		OP(FH, B, C, D, A, (int) (*pp++), 23, *pc++);
	}
# else
	OP(FH, A, B, C, D, 5, 4, 0xfffa3942);
	OP(FH, D, A, B, C, 8, 11, 0x8771f681);
	OP(FH, C, D, A, B, 11, 16, 0x6d9d6122);
	OP(FH, B, C, D, A, 14, 23, 0xfde5380c);
	OP(FH, A, B, C, D, 1, 4, 0xa4beea44);
	OP(FH, D, A, B, C, 4, 11, 0x4bdecfa9);
	OP(FH, C, D, A, B, 7, 16, 0xf6bb4b60);
	OP(FH, B, C, D, A, 10, 23, 0xbebfbc70);
	OP(FH, A, B, C, D, 13, 4, 0x289b7ec6);
	OP(FH, D, A, B, C, 0, 11, 0xeaa127fa);
	OP(FH, C, D, A, B, 3, 16, 0xd4ef3085);
	OP(FH, B, C, D, A, 6, 23, 0x04881d05);
	OP(FH, A, B, C, D, 9, 4, 0xd9d4d039);
	OP(FH, D, A, B, C, 12, 11, 0xe6db99e5);
	OP(FH, C, D, A, B, 15, 16, 0x1fa27cf8);
	OP(FH, B, C, D, A, 2, 23, 0xc4ac5665);
# endif

	/* Round 4 */
# if MD5_SMALL == 1
	for (i = 0; i < 4; i++) {
		OP(FI, A, B, C, D, (int) (*pp++), 6, *pc++);
		OP(FI, D, A, B, C, (int) (*pp++), 10, *pc++);
		OP(FI, C, D, A, B, (int) (*pp++), 15, *pc++);
		OP(FI, B, C, D, A, (int) (*pp++), 21, *pc++);
	}
# else
	OP(FI, A, B, C, D, 0, 6, 0xf4292244);
	OP(FI, D, A, B, C, 7, 10, 0x432aff97);
	OP(FI, C, D, A, B, 14, 15, 0xab9423a7);
	OP(FI, B, C, D, A, 5, 21, 0xfc93a039);
	OP(FI, A, B, C, D, 12, 6, 0x655b59c3);
	OP(FI, D, A, B, C, 3, 10, 0x8f0ccc92);
	OP(FI, C, D, A, B, 10, 15, 0xffeff47d);
	OP(FI, B, C, D, A, 1, 21, 0x85845dd1);
	OP(FI, A, B, C, D, 8, 6, 0x6fa87e4f);
	OP(FI, D, A, B, C, 15, 10, 0xfe2ce6e0);
	OP(FI, C, D, A, B, 6, 15, 0xa3014314);
	OP(FI, B, C, D, A, 13, 21, 0x4e0811a1);
	OP(FI, A, B, C, D, 4, 6, 0xf7537e82);
	OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
	OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
	OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
# undef OP
# endif
	/* Add checksum to the starting values */
	ctx->hash[0] += A;
	ctx->hash[1] += B;
	ctx->hash[2] += C;
	ctx->hash[3] += D;
#endif
}
#undef FF
#undef FG
#undef FH
#undef FI

/* Initialize structure containing state of computation.
 * (RFC 1321, 3.3: Step 3)
 */
void FAST_FUNC md5_begin(md5_ctx_t *ctx)
{
	ctx->hash[0] = 0x67452301;
	ctx->hash[1] = 0xefcdab89;
	ctx->hash[2] = 0x98badcfe;
	ctx->hash[3] = 0x10325476;
	ctx->total64 = 0;
	ctx->process_block = md5_process_block64;
}

/* Used also for sha1 and sha256 */
void FAST_FUNC md5_hash(md5_ctx_t *ctx, const void *buffer, size_t len)
{
	common64_hash(ctx, buffer, len);
}

/* Process the remaining bytes in the buffer and put result from CTX
 * in first 16 bytes following RESBUF. The result is always in little
 * endian byte order, so that a byte-wise output yields the wanted
 * ASCII representation of the message digest.
 */
void FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
{
	/* MD5 stores total in LE, need to swap on BE arches: */
	common64_end(ctx, /*swap_needed:*/ BB_BIG_ENDIAN);

	/* The MD5 result is in little endian byte order */
	if (BB_BIG_ENDIAN) {
		ctx->hash[0] = SWAP_LE32(ctx->hash[0]);
		ctx->hash[1] = SWAP_LE32(ctx->hash[1]);
		ctx->hash[2] = SWAP_LE32(ctx->hash[2]);
		ctx->hash[3] = SWAP_LE32(ctx->hash[3]);
	}

	memcpy(resbuf, ctx->hash, sizeof(ctx->hash[0]) * 4);
}
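
/* A minimal usage sketch of the MD5 API in this file (not part of the
 * build; enable the block to try it). For the string "abc", RFC 1321's
 * test suite gives the digest 900150983cd24fb0d6963f7d28e17f72.
 */
#if 0
#include <stdio.h>
int main(void)
{
	md5_ctx_t ctx;
	uint8_t digest[16];
	unsigned i;

	md5_begin(&ctx);
	md5_hash(&ctx, "abc", 3); /* may be called repeatedly to stream data */
	md5_end(&ctx, digest);

	for (i = 0; i < 16; i++)
		printf("%02x", digest[i]);
	printf("\n");
	return 0;
}
#endif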


/*
 * SHA1 part is:
 * Copyright 2007 Rob Landley <rob@landley.net>
 *
 * Based on the public domain SHA-1 in C by Steve Reid <steve@edmweb.com>
 * from http://www.mirrors.wiretapped.net/security/cryptography/hashes/sha1/
 *
 * Licensed under GPLv2, see file LICENSE in this source tree.
 *
 * ---------------------------------------------------------------------------
 *
 * SHA256 and SHA512 parts are:
 * Released into the Public Domain by Ulrich Drepper <drepper@redhat.com>.
 * Shrank by Denys Vlasenko.
 *
 * ---------------------------------------------------------------------------
 *
 * The best way to test random blocksizes is to go to coreutils/md5_sha1_sum.c
 * and replace "4096" with something like "2000 + time(NULL) % 2097",
 * then rebuild and compare "shaNNNsum bigfile" results.
 */
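
/* Equivalently, a standalone random-chunk test can exercise the
 * buffering logic without rebuilding the tools (a sketch; the chunk
 * sizes and buffer length are arbitrary): stream a buffer in random
 * pieces and compare with the one-shot digest.
 */
#if 0
#include <assert.h>
#include <stdlib.h>
int main(void)
{
	static uint8_t data[100000];
	uint8_t d1[16], d2[16];
	md5_ctx_t ctx;
	size_t i, pos;

	for (i = 0; i < sizeof(data); i++)
		data[i] = (uint8_t)rand();

	/* one-shot */
	md5_begin(&ctx);
	md5_hash(&ctx, data, sizeof(data));
	md5_end(&ctx, d1);

	/* random-sized chunks */
	md5_begin(&ctx);
	pos = 0;
	while (pos < sizeof(data)) {
		size_t n = 1 + rand() % 200;
		if (n > sizeof(data) - pos)
			n = sizeof(data) - pos;
		md5_hash(&ctx, data + pos, n);
		pos += n;
	}
	md5_end(&ctx, d2);

	assert(memcmp(d1, d2, 16) == 0);
	return 0;
}
#endif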

static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
{
	static const uint32_t rconsts[] = {
		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
	};
	int i, j;
	int cnt;
	uint32_t W[16+16];
	uint32_t a, b, c, d, e;

	/* On-stack work buffer frees up one register in the main loop
	 * which otherwise will be needed to hold ctx pointer */
	for (i = 0; i < 16; i++)
		W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);

	a = ctx->hash[0];
	b = ctx->hash[1];
	c = ctx->hash[2];
	d = ctx->hash[3];
	e = ctx->hash[4];

	/* 4 rounds of 20 operations each */
	cnt = 0;
	for (i = 0; i < 4; i++) {
		j = 19;
		do {
			uint32_t work;

			work = c ^ d;
			if (i == 0) {
				work = (work & b) ^ d;
				if (j <= 3)
					goto ge16;
				/* Used to do SWAP_BE32 here, but this
				 * requires ctx (see comment above) */
				work += W[cnt];
			} else {
				if (i == 2)
					work = ((b | c) & d) | (b & c);
				else /* i = 1 or 3 */
					work ^= b;
 ge16:
				W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
				work += W[cnt];
			}
			work += e + rotl32(a, 5) + rconsts[i];

			/* Rotate by one for next time */
			e = d;
			d = c;
			c = /* b = */ rotl32(b, 30);
			b = a;
			a = work;
			cnt = (cnt + 1) & 15;
		} while (--j >= 0);
	}

	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
}

/* Constants for SHA512 from FIPS 180-2:4.2.3.
 * SHA256 constants from FIPS 180-2:4.2.2
 * are the most significant half of first 64 elements
 * of the same array.
 */
static const uint64_t sha_K[80] = {
	0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
	0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
	0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
	0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
	0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
	0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
	0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
	0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
	0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
	0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
	0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
	0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
	0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
	0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
	0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
	0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
	0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
	0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
	0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
	0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
	0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
	0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
	0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
	0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
	0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
	0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
	0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
	0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
	0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
	0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
	0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
	0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
	0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, /* [64]+ are used for sha512 only */
	0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
	0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
	0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
	0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
	0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
	0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
	0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
};

#undef Ch
#undef Maj
#undef S0
#undef S1
#undef R0
#undef R1

static void FAST_FUNC sha256_process_block64(sha256_ctx_t *ctx)
{
	unsigned t;
	uint32_t W[64], a, b, c, d, e, f, g, h;
	const uint32_t *words = (uint32_t*) ctx->wbuffer;

	/* Operators defined in FIPS 180-2:4.1.2. */
#define Ch(x, y, z) ((x & y) ^ (~x & z))
#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
#define S0(x) (rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22))
#define S1(x) (rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25))
#define R0(x) (rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3))
#define R1(x) (rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10))

	/* Compute the message schedule according to FIPS 180-2:6.2.2 step 2. */
	for (t = 0; t < 16; ++t)
		W[t] = SWAP_BE32(words[t]);
	for (/*t = 16*/; t < 64; ++t)
		W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];

	a = ctx->hash[0];
	b = ctx->hash[1];
	c = ctx->hash[2];
	d = ctx->hash[3];
	e = ctx->hash[4];
	f = ctx->hash[5];
	g = ctx->hash[6];
	h = ctx->hash[7];

	/* The actual computation according to FIPS 180-2:6.2.2 step 3. */
	for (t = 0; t < 64; ++t) {
		/* Need to fetch the upper half of sha_K[t]
		 * (I hope the compiler is clever enough to just fetch
		 * the upper half)
		 */
		uint32_t K_t = sha_K[t] >> 32;
		uint32_t T1 = h + S1(e) + Ch(e, f, g) + K_t + W[t];
		uint32_t T2 = S0(a) + Maj(a, b, c);
		h = g;
		g = f;
		f = e;
		e = d + T1;
		d = c;
		c = b;
		b = a;
		a = T1 + T2;
	}
#undef Ch
#undef Maj
#undef S0
#undef S1
#undef R0
#undef R1
	/* Add the starting values of the context according to FIPS 180-2:6.2.2
	   step 4. */
	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
	ctx->hash[5] += f;
	ctx->hash[6] += g;
	ctx->hash[7] += h;
}

static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
{
	unsigned t;
	uint64_t W[80];
	/* On i386, having assignments here (not later as sha256 does)
	 * produces 99 bytes smaller code with gcc 4.3.1
	 */
	uint64_t a = ctx->hash[0];
	uint64_t b = ctx->hash[1];
	uint64_t c = ctx->hash[2];
	uint64_t d = ctx->hash[3];
	uint64_t e = ctx->hash[4];
	uint64_t f = ctx->hash[5];
	uint64_t g = ctx->hash[6];
	uint64_t h = ctx->hash[7];
	const uint64_t *words = (uint64_t*) ctx->wbuffer;

	/* Operators defined in FIPS 180-2:4.1.2. */
#define Ch(x, y, z) ((x & y) ^ (~x & z))
#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
#define S0(x) (rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39))
#define S1(x) (rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41))
#define R0(x) (rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7))
#define R1(x) (rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6))

	/* Compute the message schedule according to FIPS 180-2:6.3.2 step 2. */
	for (t = 0; t < 16; ++t)
		W[t] = SWAP_BE64(words[t]);
	for (/*t = 16*/; t < 80; ++t)
		W[t] = R1(W[t - 2]) + W[t - 7] + R0(W[t - 15]) + W[t - 16];

	/* The actual computation according to FIPS 180-2:6.3.2 step 3. */
	for (t = 0; t < 80; ++t) {
		uint64_t T1 = h + S1(e) + Ch(e, f, g) + sha_K[t] + W[t];
		uint64_t T2 = S0(a) + Maj(a, b, c);
		h = g;
		g = f;
		f = e;
		e = d + T1;
		d = c;
		c = b;
		b = a;
		a = T1 + T2;
	}
#undef Ch
#undef Maj
#undef S0
#undef S1
#undef R0
#undef R1
	/* Add the starting values of the context according to FIPS 180-2:6.3.2
	   step 4. */
	ctx->hash[0] += a;
	ctx->hash[1] += b;
	ctx->hash[2] += c;
	ctx->hash[3] += d;
	ctx->hash[4] += e;
	ctx->hash[5] += f;
	ctx->hash[6] += g;
	ctx->hash[7] += h;
}


void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
{
	ctx->hash[0] = 0x67452301;
	ctx->hash[1] = 0xefcdab89;
	ctx->hash[2] = 0x98badcfe;
	ctx->hash[3] = 0x10325476;
	ctx->hash[4] = 0xc3d2e1f0;
	ctx->total64 = 0;
	ctx->process_block = sha1_process_block64;
}

static const uint32_t init256[] = {
	0,
	0,
	0x6a09e667,
	0xbb67ae85,
	0x3c6ef372,
	0xa54ff53a,
	0x510e527f,
	0x9b05688c,
	0x1f83d9ab,
	0x5be0cd19,
};
static const uint32_t init512_lo[] = {
	0,
	0,
	0xf3bcc908,
	0x84caa73b,
	0xfe94f82b,
	0x5f1d36f1,
	0xade682d1,
	0x2b3e6c1f,
	0xfb41bd6b,
	0x137e2179,
};

/* Initialize structure containing state of computation.
   (FIPS 180-2:5.3.2) */
void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
{
	memcpy(&ctx->total64, init256, sizeof(init256));
	/*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
	ctx->process_block = sha256_process_block64;
}

/* Initialize structure containing state of computation.
   (FIPS 180-2:5.3.3) */
void FAST_FUNC sha512_begin(sha512_ctx_t *ctx)
{
	int i;
	/* The two extra iterations zero out ctx->total64[0..1] */
	uint64_t *tp = ctx->total64;
	for (i = 0; i < 2+8; i++)
		tp[i] = ((uint64_t)(init256[i]) << 32) + init512_lo[i];
	/*ctx->total64[0] = ctx->total64[1] = 0; - already done */
}

void FAST_FUNC sha512_hash(sha512_ctx_t *ctx, const void *buffer, size_t len)
{
	unsigned bufpos = ctx->total64[0] & 127;
	unsigned remaining;

	/* First increment the byte count. FIPS 180-2 specifies the possible
	   length of the file up to 2^128 _bits_.
	   We compute the number of _bytes_ and convert to bits later. */
	ctx->total64[0] += len;
	if (ctx->total64[0] < len)
		ctx->total64[1]++;
#if 0
	remaining = 128 - bufpos;

	/* Hash whole blocks */
	while (len >= remaining) {
		memcpy(ctx->wbuffer + bufpos, buffer, remaining);
		buffer = (const char *)buffer + remaining;
		len -= remaining;
		remaining = 128;
		bufpos = 0;
		sha512_process_block128(ctx);
	}

	/* Save the last, partial block */
	memcpy(ctx->wbuffer + bufpos, buffer, len);
#else
	while (1) {
		remaining = 128 - bufpos;
		if (remaining > len)
			remaining = len;
		/* Copy data into aligned buffer */
		memcpy(ctx->wbuffer + bufpos, buffer, remaining);
		len -= remaining;
		buffer = (const char *)buffer + remaining;
		bufpos += remaining;
		/* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
		bufpos -= 128;
		if (bufpos != 0)
			break;
		/* Buffer is filled up, process it */
		sha512_process_block128(ctx);
		/*bufpos = 0; - already is */
	}
#endif
}

/* Used also for sha256 */
void FAST_FUNC sha1_end(sha1_ctx_t *ctx, void *resbuf)
{
	unsigned hash_size;

	/* SHA stores total in BE, need to swap on LE arches: */
	common64_end(ctx, /*swap_needed:*/ BB_LITTLE_ENDIAN);

	hash_size = (ctx->process_block == sha1_process_block64) ? 5 : 8;
	/* This way we do not impose alignment constraints on resbuf: */
	if (BB_LITTLE_ENDIAN) {
		unsigned i;
		for (i = 0; i < hash_size; ++i)
			ctx->hash[i] = SWAP_BE32(ctx->hash[i]);
	}
	memcpy(resbuf, ctx->hash, sizeof(ctx->hash[0]) * hash_size);
}
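
/* A minimal sha256 usage sketch (not part of the build; enable to try).
 * Per the comments above, md5_hash() and sha1_end() are shared by sha256;
 * this assumes the sha256/md5/sha1 context types are the same structure,
 * as the shared helpers in this file imply. libbb may also provide
 * sha256_hash/sha256_end wrappers, but those are not defined here.
 * FIPS 180-2 gives SHA256("abc") =
 * ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad.
 */
#if 0
#include <stdio.h>
int main(void)
{
	sha256_ctx_t ctx;
	uint8_t digest[32];
	unsigned i;

	sha256_begin(&ctx);
	md5_hash(&ctx, "abc", 3); /* shared 64-byte-block hashing helper */
	sha1_end(&ctx, digest);   /* shared finalizer; writes 8 words here */

	for (i = 0; i < 32; i++)
		printf("%02x", digest[i]);
	printf("\n");
	return 0;
}
#endif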

void FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
{
	unsigned bufpos = ctx->total64[0] & 127;

	/* Pad the buffer to the next 128-byte boundary with 0x80,0,0,0... */
	ctx->wbuffer[bufpos++] = 0x80;

	while (1) {
		unsigned remaining = 128 - bufpos;
		memset(ctx->wbuffer + bufpos, 0, remaining);
		if (remaining >= 16) {
			/* Store the 128-bit counter of bits in the buffer in BE format */
			uint64_t t;
			t = ctx->total64[0] << 3;
			t = SWAP_BE64(t);
			*(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 8]) = t;
			t = (ctx->total64[1] << 3) | (ctx->total64[0] >> 61);
			t = SWAP_BE64(t);
			*(bb__aliased_uint64_t *) (&ctx->wbuffer[128 - 16]) = t;
		}
		sha512_process_block128(ctx);
		if (remaining >= 16)
			break;
		bufpos = 0;
	}

	if (BB_LITTLE_ENDIAN) {
		unsigned i;
		for (i = 0; i < ARRAY_SIZE(ctx->hash); ++i)
			ctx->hash[i] = SWAP_BE64(ctx->hash[i]);
	}
	memcpy(resbuf, ctx->hash, sizeof(ctx->hash));
}
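
/* A matching sha512 usage sketch (not part of the build; enable to try).
 * sha512 has its own hash/end functions because of the 128-byte block
 * and the 128-bit length counter. FIPS 180-2 gives SHA512("abc")
 * starting with ddaf35a193617aba...
 */
#if 0
#include <stdio.h>
int main(void)
{
	sha512_ctx_t ctx;
	uint8_t digest[64];
	unsigned i;

	sha512_begin(&ctx);
	sha512_hash(&ctx, "abc", 3);
	sha512_end(&ctx, digest);

	for (i = 0; i < 64; i++)
		printf("%02x", digest[i]);
	printf("\n");
	return 0;
}
#endif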


/*
 * The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
 * Michael Peeters and Gilles Van Assche. For more information, feedback or
 * questions, please refer to our website: http://keccak.noekeon.org/
 *
 * Implementation by Ronny Van Keer,
 * hereby denoted as "the implementer".
 *
 * To the extent possible under law, the implementer has waived all copyright
 * and related or neighboring rights to the source code in this file.
 * http://creativecommons.org/publicdomain/zero/1.0/
 *
 * Busybox modifications (C) Lauri Kasanen, under the GPLv2.
 */

#if CONFIG_SHA3_SMALL < 0
# define SHA3_SMALL 0
#elif CONFIG_SHA3_SMALL > 1
# define SHA3_SMALL 1
#else
# define SHA3_SMALL CONFIG_SHA3_SMALL
#endif

#define OPTIMIZE_SHA3_FOR_32 0
/*
 * SHA3 can be optimized for 32-bit CPUs with bit-slicing:
 * every 64-bit word of state[] can be split into two 32-bit words
 * by even/odd bits. In this form, all rotations of the sha3 round
 * are 32-bit - and there are lots of them.
 * However, it requires either splitting/combining state words
 * before/after the sha3 round (the code does this now)
 * or shuffling bits before xor'ing them into state and in sha3_end.
 * Without shuffling, bit-slicing results in -130 bytes of code and
 * a marginal speedup (but of course it gives a wrong result).
 * With shuffling it works, but at +260 code bytes, and slower.
 * Disabled for now:
 */
#if 0 /* LONG_MAX == 0x7fffffff */
# undef OPTIMIZE_SHA3_FOR_32
# define OPTIMIZE_SHA3_FOR_32 1
#endif

#if OPTIMIZE_SHA3_FOR_32
/* This splits every 64-bit word into a pair of 32-bit words,
 * even bits go into the first word, odd bits go to the second one.
 * The conversion is done in-place.
 */
static void split_halves(uint64_t *state)
{
	/* Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */
	uint32_t *s32 = (uint32_t*)state;
	uint32_t t, x0, x1;
	int i;
	for (i = 24; i >= 0; --i) {
		x0 = s32[0];
		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
		x1 = s32[1];
		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
		*s32++ = (x0 & 0x0000FFFF) | (x1 << 16);
		*s32++ = (x0 >> 16) | (x1 & 0xFFFF0000);
	}
}
/* The reverse operation */
static void combine_halves(uint64_t *state)
{
	uint32_t *s32 = (uint32_t*)state;
	uint32_t t, x0, x1;
	int i;
	for (i = 24; i >= 0; --i) {
		x0 = s32[0];
		x1 = s32[1];
		t = (x0 & 0x0000FFFF) | (x1 << 16);
		x1 = (x0 >> 16) | (x1 & 0xFFFF0000);
		x0 = t;
		t = (x0 ^ (x0 >> 8)) & 0x0000FF00; x0 = x0 ^ t ^ (t << 8);
		t = (x0 ^ (x0 >> 4)) & 0x00F000F0; x0 = x0 ^ t ^ (t << 4);
		t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C; x0 = x0 ^ t ^ (t << 2);
		t = (x0 ^ (x0 >> 1)) & 0x22222222; x0 = x0 ^ t ^ (t << 1);
		*s32++ = x0;
		t = (x1 ^ (x1 >> 8)) & 0x0000FF00; x1 = x1 ^ t ^ (t << 8);
		t = (x1 ^ (x1 >> 4)) & 0x00F000F0; x1 = x1 ^ t ^ (t << 4);
		t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C; x1 = x1 ^ t ^ (t << 2);
		t = (x1 ^ (x1 >> 1)) & 0x22222222; x1 = x1 ^ t ^ (t << 1);
		*s32++ = x1;
	}
}
#endif

/*
 * In the crypto literature this function is usually called Keccak-f().
 */
static void sha3_process_block72(uint64_t *state)
{
	enum { NROUNDS = 24 };

#if OPTIMIZE_SHA3_FOR_32
	/*
	static const uint32_t IOTA_CONST_0[NROUNDS] = {
		0x00000001UL,
		0x00000000UL,
		0x00000000UL,
		0x00000000UL,
		0x00000001UL,
		0x00000001UL,
		0x00000001UL,
		0x00000001UL,
		0x00000000UL,
		0x00000000UL,
		0x00000001UL,
		0x00000000UL,
		0x00000001UL,
		0x00000001UL,
		0x00000001UL,
		0x00000001UL,
		0x00000000UL,
		0x00000000UL,
		0x00000000UL,
		0x00000000UL,
		0x00000001UL,
		0x00000000UL,
		0x00000001UL,
		0x00000000UL,
	};
	** bits are in lsb: 0101 0000 1111 0100 1111 0001
	*/
	uint32_t IOTA_CONST_0bits = (uint32_t)(0x0050f4f1);
	static const uint32_t IOTA_CONST_1[NROUNDS] = {
		0x00000000UL,
		0x00000089UL,
		0x8000008bUL,
		0x80008080UL,
		0x0000008bUL,
		0x00008000UL,
		0x80008088UL,
		0x80000082UL,
		0x0000000bUL,
		0x0000000aUL,
		0x00008082UL,
		0x00008003UL,
		0x0000808bUL,
		0x8000000bUL,
		0x8000008aUL,
		0x80000081UL,
		0x80000081UL,
		0x80000008UL,
		0x00000083UL,
		0x80008003UL,
		0x80008088UL,
		0x80000088UL,
		0x00008000UL,
		0x80008082UL,
	};

	uint32_t *const s32 = (uint32_t*)state;
	unsigned round;

	split_halves(state);

	for (round = 0; round < NROUNDS; round++) {
		unsigned x;

		/* Theta */
		{
			uint32_t BC[20];
			for (x = 0; x < 10; ++x) {
				BC[x+10] = BC[x] = s32[x]^s32[x+10]^s32[x+20]^s32[x+30]^s32[x+40];
			}
			for (x = 0; x < 10; x += 2) {
				uint32_t ta, tb;
				ta = BC[x+8] ^ rotl32(BC[x+3], 1);
				tb = BC[x+9] ^ BC[x+2];
				s32[x+0] ^= ta;
				s32[x+1] ^= tb;
				s32[x+10] ^= ta;
				s32[x+11] ^= tb;
				s32[x+20] ^= ta;
				s32[x+21] ^= tb;
				s32[x+30] ^= ta;
				s32[x+31] ^= tb;
				s32[x+40] ^= ta;
				s32[x+41] ^= tb;
			}
		}
		/* RhoPi */
		{
			uint32_t t0a,t0b, t1a,t1b;
			t1a = s32[1*2+0];
			t1b = s32[1*2+1];

#define RhoPi(PI_LANE, ROT_CONST) \
	t0a = s32[PI_LANE*2+0];\
	t0b = s32[PI_LANE*2+1];\
	if (ROT_CONST & 1) {\
		s32[PI_LANE*2+0] = rotl32(t1b, ROT_CONST/2+1);\
		s32[PI_LANE*2+1] = ROT_CONST == 1 ? t1a : rotl32(t1a, ROT_CONST/2+0);\
	} else {\
		s32[PI_LANE*2+0] = rotl32(t1a, ROT_CONST/2);\
		s32[PI_LANE*2+1] = rotl32(t1b, ROT_CONST/2);\
	}\
	t1a = t0a; t1b = t0b;

			RhoPi(10, 1)
			RhoPi( 7, 3)
			RhoPi(11, 6)
			RhoPi(17,10)
			RhoPi(18,15)
			RhoPi( 3,21)
			RhoPi( 5,28)
			RhoPi(16,36)
			RhoPi( 8,45)
			RhoPi(21,55)
			RhoPi(24, 2)
			RhoPi( 4,14)
			RhoPi(15,27)
			RhoPi(23,41)
			RhoPi(19,56)
			RhoPi(13, 8)
			RhoPi(12,25)
			RhoPi( 2,43)
			RhoPi(20,62)
			RhoPi(14,18)
			RhoPi(22,39)
			RhoPi( 9,61)
			RhoPi( 6,20)
			RhoPi( 1,44)
#undef RhoPi
		}
		/* Chi */
		for (x = 0; x <= 40;) {
			uint32_t BC0, BC1, BC2, BC3, BC4;
			BC0 = s32[x + 0*2];
			BC1 = s32[x + 1*2];
			BC2 = s32[x + 2*2];
			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
			BC3 = s32[x + 3*2];
			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
			BC4 = s32[x + 4*2];
			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
			x++;
			BC0 = s32[x + 0*2];
			BC1 = s32[x + 1*2];
			BC2 = s32[x + 2*2];
			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
			BC3 = s32[x + 3*2];
			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
			BC4 = s32[x + 4*2];
			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
			x += 9;
		}
		/* Iota */
		s32[0] ^= IOTA_CONST_0bits & 1;
		IOTA_CONST_0bits >>= 1;
		s32[1] ^= IOTA_CONST_1[round];
	}

	combine_halves(state);
#else
	/* Native 64-bit algorithm */
	static const uint16_t IOTA_CONST[NROUNDS] = {
		/* Elements should be 64-bit, but the top half is always zero
		 * or 0x80000000. We encode the 63rd bits in a separate word below.
		 * The same is true for the 31st bits, which lets us use a 16-bit
		 * table instead of a 64-bit one. The speed penalty is lost in the noise.
		 */
		0x0001,
		0x8082,
		0x808a,
		0x8000,
		0x808b,
		0x0001,
		0x8081,
		0x8009,
		0x008a,
		0x0088,
		0x8009,
		0x000a,
		0x808b,
		0x008b,
		0x8089,
		0x8003,
		0x8002,
		0x0080,
		0x800a,
		0x000a,
		0x8081,
		0x8080,
		0x0001,
		0x8008,
	};
	/* bit for CONST[0] is in msb: 0011 0011 0000 0111 1101 1101 */
	const uint32_t IOTA_CONST_bit63 = (uint32_t)(0x3307dd00);
	/* bit for CONST[0] is in msb: 0001 0110 0011 1000 0001 1011 */
	const uint32_t IOTA_CONST_bit31 = (uint32_t)(0x16381b00);

	static const uint8_t ROT_CONST[24] = {
		1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
		27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
	};
	static const uint8_t PI_LANE[24] = {
		10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
		15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
	};
	/*static const uint8_t MOD5[10] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, };*/

	unsigned x;
	unsigned round;

	if (BB_BIG_ENDIAN) {
		for (x = 0; x < 25; x++) {
			state[x] = SWAP_LE64(state[x]);
		}
	}

	for (round = 0; round < NROUNDS; ++round) {
		/* Theta */
		{
			uint64_t BC[10];
			for (x = 0; x < 5; ++x) {
				BC[x + 5] = BC[x] = state[x]
					^ state[x + 5] ^ state[x + 10]
					^ state[x + 15] ^ state[x + 20];
			}
			/* Using 2x5 vector above eliminates the need to use
			 * BC[MOD5[x+N]] trick below to fetch BC[(x+N) % 5],
			 * and the code is a bit _smaller_.
			 */
			for (x = 0; x < 5; ++x) {
				uint64_t temp = BC[x + 4] ^ rotl64(BC[x + 1], 1);
				state[x] ^= temp;
				state[x + 5] ^= temp;
				state[x + 10] ^= temp;
				state[x + 15] ^= temp;
				state[x + 20] ^= temp;
			}
		}

		/* Rho Pi */
		if (SHA3_SMALL) {
			uint64_t t1 = state[1];
			for (x = 0; x < 24; ++x) {
				uint64_t t0 = state[PI_LANE[x]];
				state[PI_LANE[x]] = rotl64(t1, ROT_CONST[x]);
				t1 = t0;
			}
		} else {
			/* Especially large benefit for 32-bit arch (75% faster):
			 * 64-bit rotations by non-constant usually are SLOW on those.
			 * We resort to unrolling here.
			 * This optimizes out PI_LANE[] and ROT_CONST[],
			 * but generates 300-500 more bytes of code.
			 */
			uint64_t t0;
			uint64_t t1 = state[1];
#define RhoPi_twice(x) \
	t0 = state[PI_LANE[x ]]; \
	state[PI_LANE[x ]] = rotl64(t1, ROT_CONST[x ]); \
	t1 = state[PI_LANE[x+1]]; \
	state[PI_LANE[x+1]] = rotl64(t0, ROT_CONST[x+1]);
			RhoPi_twice(0); RhoPi_twice(2);
			RhoPi_twice(4); RhoPi_twice(6);
			RhoPi_twice(8); RhoPi_twice(10);
			RhoPi_twice(12); RhoPi_twice(14);
			RhoPi_twice(16); RhoPi_twice(18);
			RhoPi_twice(20); RhoPi_twice(22);
#undef RhoPi_twice
		}
		/* Chi */
# if LONG_MAX > 0x7fffffff
		for (x = 0; x <= 20; x += 5) {
			uint64_t BC0, BC1, BC2, BC3, BC4;
			BC0 = state[x + 0];
			BC1 = state[x + 1];
			BC2 = state[x + 2];
			state[x + 0] = BC0 ^ ((~BC1) & BC2);
			BC3 = state[x + 3];
			state[x + 1] = BC1 ^ ((~BC2) & BC3);
			BC4 = state[x + 4];
			state[x + 2] = BC2 ^ ((~BC3) & BC4);
			state[x + 3] = BC3 ^ ((~BC4) & BC0);
			state[x + 4] = BC4 ^ ((~BC0) & BC1);
		}
# else
		/* Reduced register pressure version
		 * for register-starved 32-bit arches
		 * (i386: -95 bytes, and it is _faster_)
		 */
		for (x = 0; x <= 40;) {
			uint32_t BC0, BC1, BC2, BC3, BC4;
			uint32_t *const s32 = (uint32_t*)state;
# if SHA3_SMALL
 do_half:
# endif
			BC0 = s32[x + 0*2];
			BC1 = s32[x + 1*2];
			BC2 = s32[x + 2*2];
			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
			BC3 = s32[x + 3*2];
			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
			BC4 = s32[x + 4*2];
			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
			x++;
# if SHA3_SMALL
			if (x & 1)
				goto do_half;
			x += 8;
# else
			BC0 = s32[x + 0*2];
			BC1 = s32[x + 1*2];
			BC2 = s32[x + 2*2];
			s32[x + 0*2] = BC0 ^ ((~BC1) & BC2);
			BC3 = s32[x + 3*2];
			s32[x + 1*2] = BC1 ^ ((~BC2) & BC3);
			BC4 = s32[x + 4*2];
			s32[x + 2*2] = BC2 ^ ((~BC3) & BC4);
			s32[x + 3*2] = BC3 ^ ((~BC4) & BC0);
			s32[x + 4*2] = BC4 ^ ((~BC0) & BC1);
			x += 9;
# endif
		}
# endif /* long is 32-bit */
		/* Iota */
		state[0] ^= IOTA_CONST[round]
			| (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000)
			| (uint64_t)((IOTA_CONST_bit63 << round) & 0x80000000) << 32;
	}

	if (BB_BIG_ENDIAN) {
		for (x = 0; x < 25; x++) {
			state[x] = SWAP_LE64(state[x]);
		}
	}
#endif
}

void FAST_FUNC sha3_begin(sha3_ctx_t *ctx)
{
	memset(ctx, 0, sizeof(*ctx));
	/* SHA3-512, user can override */
	ctx->input_block_bytes = (1600 - 512*2) / 8; /* 72 bytes */
}
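
/* For reference, the corresponding "rate" (input_block_bytes) values
 * for the other standard digest sizes, from the same formula:
 *   SHA3-224: (1600 - 224*2) / 8 = 144 bytes
 *   SHA3-256: (1600 - 256*2) / 8 = 136 bytes
 *   SHA3-384: (1600 - 384*2) / 8 = 104 bytes
 * A caller overrides input_block_bytes after sha3_begin() to select one.
 */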

void FAST_FUNC sha3_hash(sha3_ctx_t *ctx, const void *buffer, size_t len)
{
#if SHA3_SMALL
	const uint8_t *data = buffer;
	unsigned bufpos = ctx->bytes_queued;

	while (1) {
		unsigned remaining = ctx->input_block_bytes - bufpos;
		if (remaining > len)
			remaining = len;
		len -= remaining;
		/* XOR data into buffer */
		while (remaining != 0) {
			uint8_t *buf = (uint8_t*)ctx->state;
			buf[bufpos] ^= *data++;
			bufpos++;
			remaining--;
		}
		/* Clever way to do "if (bufpos != N) break; ... ; bufpos = 0;" */
		bufpos -= ctx->input_block_bytes;
		if (bufpos != 0)
			break;
		/* Buffer is filled up, process it */
		sha3_process_block72(ctx->state);
		/*bufpos = 0; - already is */
	}
	ctx->bytes_queued = bufpos + ctx->input_block_bytes;
#else
	/* +50 bytes of code, but a bit faster because of long-sized XORs */
	const uint8_t *data = buffer;
	unsigned bufpos = ctx->bytes_queued;
	unsigned iblk_bytes = ctx->input_block_bytes;

	/* If there is already queued data, continue filling the queue first */
	if (bufpos != 0) {
		while (len != 0) {
			uint8_t *buf = (uint8_t*)ctx->state;
			buf[bufpos] ^= *data++;
			len--;
			bufpos++;
			if (bufpos == iblk_bytes) {
				bufpos = 0;
				goto do_block;
			}
		}
	}

	/* Absorb complete blocks */
	while (len >= iblk_bytes) {
		/* XOR data onto beginning of state[].
		 * We try to be efficient - operate one word at a time, not byte.
		 * Careful wrt unaligned access: can't just use "*(long*)data"!
		 */
		unsigned count = iblk_bytes / sizeof(long);
		long *buf = (long*)ctx->state;
		do {
			long v;
			move_from_unaligned_long(v, (long*)data);
			*buf++ ^= v;
			data += sizeof(long);
		} while (--count);
		len -= iblk_bytes;
 do_block:
		sha3_process_block72(ctx->state);
	}

	/* Queue remaining data bytes */
	while (len != 0) {
		uint8_t *buf = (uint8_t*)ctx->state;
		buf[bufpos] ^= *data++;
		bufpos++;
		len--;
	}

	ctx->bytes_queued = bufpos;
#endif
}

void FAST_FUNC sha3_end(sha3_ctx_t *ctx, void *resbuf)
{
	/* Padding */
	uint8_t *buf = (uint8_t*)ctx->state;
	/*
	 * Keccak block padding is: add 1 bit after the last bit of input,
	 * then add zero bits until the end of the block, and add the final
	 * 1 bit (the last bit in the block) - the "10*1" pattern.
	 * The SHA3 standard appends two additional bits, 01, before that padding:
	 *
	 * SHA3-224(M) = KECCAK[448](M||01, 224)
	 * SHA3-256(M) = KECCAK[512](M||01, 256)
	 * SHA3-384(M) = KECCAK[768](M||01, 384)
	 * SHA3-512(M) = KECCAK[1024](M||01, 512)
	 * (M is the input, || is bit concatenation)
	 *
	 * The 6 below contains the 01 "SHA3" bits and the first 1 "Keccak" bit:
	 */
	buf[ctx->bytes_queued] ^= 6; /* bit pattern 00000110 */
	buf[ctx->input_block_bytes - 1] ^= 0x80;
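	/* Note the single-byte edge case: when bytes_queued equals
	 * input_block_bytes - 1, both XORs above hit the same byte,
	 * leaving 0x86 there - exactly the "011...1" padding packed
	 * into one byte. Using XOR (not assignment) makes this fall
	 * out naturally.
	 */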

	sha3_process_block72(ctx->state);

	/* Output */
	memcpy(resbuf, ctx->state, 64);
}