This library implements some hash and cryptographic algorithms.

Dependents:   mBuinoBlinky PB_Emma_Ethernet SLOTrashHTTP Garagem ... more

This library implements the following algorithms:

  • RC4
  • AES (AES-128, AES-192, AES-256)
  • DES
  • Triple DES (EDE)
  • MD2
  • MD4
  • MD5
  • SHA-1
  • SHA-2 (SHA-224, SHA-256, SHA-384, SHA-512)

The hash algorithms have been optimized for the mbed and you should get decent performance. However, I did not optimize the ciphers. Also, I have not tested these algorithms extensively: they should work, but you may find some bugs. Block ciphers support two modes: ECB and CBC.

Warning

If you are using SHA-384 or SHA-512, be aware that they produce large binary files and that compilation (using the online compiler) takes much longer. The compiler may occasionally stop because it timed out; in that case, just compile again and it should work.

Computing hash

You can compute the hash of some data in two different ways. The first one is the easiest: each hash algorithm has a static method that takes some data and computes the hash from it.

Computing hash using method 1

#include "Crypto.h"
#include "mbed.h"

static const char msg[] = "mbed is great !";

int main()
{
    // Output buffer for the hash; its 16 bytes are printed below.
    uint8_t digest[16];

    // One-shot API: a single static call hashes the whole message.
    MD2::computeHash(digest, (uint8_t*)msg, strlen(msg));

    // Print the hash as lowercase hex, two characters per byte.
    printf("hash: ");
    for(unsigned idx = 0; idx < sizeof(digest); ++idx)
        printf("%02x", digest[idx]);
    printf("\n");

    return 0;
}

The second one is slightly slower (around 2-3% slower), but it allows you to compute the hash of some data in several steps (by calling the update method repeatedly). This is the method you should use if you need to compute the hash of a large source and you don't have enough memory to store it in a single buffer.

Computing hash using method 2

#include "Crypto.h"
#include "mbed.h"

static const char msg[] = "mbed is great !";

int main()
{
    uint8_t hash[16];
    MD2 h;
    h.update((uint8_t*)msg, strlen(msg));
    h.finalize(hash);
    printf("hash: ");
    for(int i = 0; i < 16; ++i)
        printf("%02x", hash[i]);
    printf("\n");
    
    return 0;
}

TODO

  • optimize ciphers
  • add doc

Files at this revision

API Documentation at this revision

Committer:
feb11
Date:
Wed Sep 11 17:22:40 2013 +0000
Parent:
2:473bac39ae7c
Child:
4:0da19393bd57
Commit message:
improved speed of MD2, MD5, SHA-1 and SHA-2 (32bits)

Changed in this revision

MD2.cpp Show annotated file Show diff for this revision Revisions of this file
MD5.cpp Show annotated file Show diff for this revision Revisions of this file
SHA1.cpp Show annotated file Show diff for this revision Revisions of this file
SHA2_32.cpp Show annotated file Show diff for this revision Revisions of this file
SHA2_64.cpp Show annotated file Show diff for this revision Revisions of this file
--- a/MD2.cpp	Mon Sep 09 16:16:24 2013 +0000
+++ b/MD2.cpp	Wed Sep 11 17:22:40 2013 +0000
@@ -28,17 +28,20 @@
 l(0)
 {
     memset(checksum, 0, 16);
-    memset(x, 0, 48);
+    memset(x, 0, 16);
 }
 
 void MD2::computeBlock(uint8_t *checksum2, uint8_t *x2, uint8_t *l2, uint8_t *buffer2)
 {
-    for(int j = 0; j < 16; ++j)
+    if(checksum2 != buffer2)
     {
-        uint8_t c = buffer2[j];
-        *l2 = (checksum2[j] ^= s[c^(*l2)]);
+        for(int j = 0; j < 16; ++j)
+        {
+            uint8_t c = buffer2[j];
+            *l2 = (checksum2[j] ^= s[c^(*l2)]);
+        }
     }
-
+    
     uint32_t *x3 = (uint32_t*)x2;
     uint32_t *buffer3 = (uint32_t*)buffer2;
     
@@ -53,35 +56,94 @@
     
     for(int j = 0; j < 18; ++j)
     {
-        for(int k = 0; k < 48; ++k)
-        {
-            t = (x2[k] ^= s[t]);
-        }
+        t = (x2[0] ^= s[t]);
+        t = (x2[1] ^= s[t]);
+        t = (x2[2] ^= s[t]);
+        t = (x2[3] ^= s[t]);
+        t = (x2[4] ^= s[t]);
+        t = (x2[5] ^= s[t]);
+        t = (x2[6] ^= s[t]);
+        t = (x2[7] ^= s[t]);
+        t = (x2[8] ^= s[t]);
+        t = (x2[9] ^= s[t]);
+        t = (x2[10] ^= s[t]);
+        t = (x2[11] ^= s[t]);
+        t = (x2[12] ^= s[t]);
+        t = (x2[13] ^= s[t]);
+        t = (x2[14] ^= s[t]);
+        t = (x2[15] ^= s[t]);
+        t = (x2[16] ^= s[t]);
+        t = (x2[17] ^= s[t]);
+        t = (x2[18] ^= s[t]);
+        t = (x2[19] ^= s[t]);
+        t = (x2[20] ^= s[t]);
+        t = (x2[21] ^= s[t]);
+        t = (x2[22] ^= s[t]);
+        t = (x2[23] ^= s[t]);            
+        t = (x2[24] ^= s[t]);
+        t = (x2[25] ^= s[t]);
+        t = (x2[26] ^= s[t]);
+        t = (x2[27] ^= s[t]);
+        t = (x2[28] ^= s[t]);
+        t = (x2[29] ^= s[t]);
+        t = (x2[30] ^= s[t]);
+        t = (x2[31] ^= s[t]);
+        t = (x2[32] ^= s[t]);
+        t = (x2[33] ^= s[t]);
+        t = (x2[34] ^= s[t]);
+        t = (x2[35] ^= s[t]);
+        t = (x2[36] ^= s[t]);
+        t = (x2[37] ^= s[t]);
+        t = (x2[38] ^= s[t]);
+        t = (x2[39] ^= s[t]);
+        t = (x2[40] ^= s[t]);
+        t = (x2[41] ^= s[t]);
+        t = (x2[42] ^= s[t]);
+        t = (x2[43] ^= s[t]);
+        t = (x2[44] ^= s[t]);
+        t = (x2[45] ^= s[t]);
+        t = (x2[46] ^= s[t]);
+        t = (x2[47] ^= s[t]);            
+
         t += j;
     }
 }
 
 void MD2::add(uint8_t *in, uint32_t length)
-{
-    if(length < 16-bufferLength)
+{ 
+    if(bufferLength == 0)
+    {
+        while(length >= 16)
+        {
+            computeBlock(checksum, x, &l, in);
+            length -= 16;
+            in += 16;
+        }
+        bufferLength = length;
+        memcpy(buffer, in, length);
+    }
+    else if(length < 16-bufferLength)
     {
         memcpy(&buffer[bufferLength], in, length);
         bufferLength += length;
-        return;
     }
-    int offset = 16-bufferLength;
-    memcpy(&buffer[bufferLength], in, offset);
-    computeBlock(checksum, x, &l, buffer);
-    while(length-offset > 16)
+    else
     {
-        memcpy(buffer, &in[offset], 16);
+        int offset = 16-bufferLength;
+        memcpy(&buffer[bufferLength], in, offset);
         computeBlock(checksum, x, &l, buffer);
-        offset += 16;
+        in += offset;
+        length -= offset;
+        while(length >= 16)
+        {
+            computeBlock(checksum, x, &l, in);
+            in += 16;
+            length -= 16;
+        }
+        bufferLength = length;
+        memcpy(buffer, &in, length);
     }
-    if(offset > length)
-        offset -= 16;
-    bufferLength = length - offset;
-    memcpy(buffer, &in[offset], bufferLength);
+    
 }
 
 void MD2::computeDigest(uint8_t *digest)
@@ -90,30 +152,19 @@
     int padding = 16 - bufferLength;
     memset(&buffer[bufferLength], padding, padding);
     computeBlock(checksum, x, &l, buffer);
-    
-    for(int j = 0; j < 16; ++j)
-    {
-        x[16+j] = checksum[j];
-        x[32+j] = x[16+j] ^ x[j];
-    }
+    computeBlock(checksum, x, &l, checksum);
+    memcpy(digest, x, 16);
 
-    uint8_t t = 0;
-        
-    for(int j = 0; j < 18; ++j)
-    {
-        for(int k = 0; k < 48; ++k)
-        {
-            t = (x[k] ^= s[t]);
-        }
-        t += j;
-    }
-    
+    uint32_t *x2 = (uint32_t*)x;
+    uint32_t *checksum2 = (uint32_t*)checksum;
+
     // reset state
     bufferLength = 0;
     l = 0;
-    memset(checksum, 0, 16);
-    memcpy(digest, x, 16);
-    memset(x,0,48);
+    checksum2[0] = x2[0] = 0;
+    checksum2[1] = x2[1] = 0;
+    checksum2[2] = x2[2] = 0;
+    checksum2[3] = x2[3] = 0;
 }
 
 uint8_t MD2::outputSize() const
@@ -123,43 +174,23 @@
 
 void MD2::computeDigest(uint8_t *digest, uint8_t *in, uint32_t length)
 {
-    uint8_t data[80];
-    memset(data, 0, 64);
-    uint8_t *x = data;
-    uint32_t *x2 = (uint32_t*)data;
-    uint8_t *checksum = &data[48];
-    uint32_t *checksum2 = (uint32_t*)&data[48];
-    uint8_t *buffer = &data[64];
+    uint8_t x[48];
+    uint8_t checksum[16];
+    uint8_t buffer[16];
+    memset(x, 0, 16);
+    memset(checksum, 0, 16);
     uint8_t l = 0;
-    uint32_t offset = 0;
-    while(length - offset >= 16)
+    while(length >= 16)
     {
-        computeBlock(checksum, x, &l, &in[offset]);
-        offset += 16;
+        computeBlock(checksum, x, &l, in);
+        length -= 16;
+        in += 16;
     }
 
-    uint8_t bufferLength = length - offset;
-    memcpy(buffer, &in[offset], bufferLength);
-    uint8_t padding = 16-bufferLength;
-    memset(&buffer[bufferLength], padding, padding);
+    memcpy(buffer, in, length);
+    uint8_t padding = 16-length;
+    memset(&buffer[length], padding, padding);
     computeBlock(checksum, x, &l, buffer);
-
-    for(int j = 0; j < 4; ++j)
-    {
-        x2[4+j] = checksum2[j];
-        x2[8+j] = x2[4+j] ^ x2[j];
-    }   
-
-    uint8_t t = 0;
-
-    for(int j = 0; j < 18; ++j)
-    {
-        for(int k = 0; k < 48; ++k)
-        {
-            t = (x[k] ^= s[t]);
-        }
-        t += j;
-    }
-
+    computeBlock(checksum,x, &l, checksum);
     memcpy(digest, x, 16);
 }
--- a/MD5.cpp	Mon Sep 09 16:16:24 2013 +0000
+++ b/MD5.cpp	Wed Sep 11 17:22:40 2013 +0000
@@ -1,72 +1,31 @@
 #include "MD5.h"
 #include <string.h>
 
-static const uint32_t T[] =
-{
-    0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
-    0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
-    0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
-    0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
-    0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
-    0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
-    0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
-    0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
-    0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
-    0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
-    0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
-    0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
-    0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
-    0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
-    0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
-    0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
-};
-
 static const uint32_t A = 0x67452301;
 static const uint32_t B = 0xefcdab89;
 static const uint32_t C = 0x98badcfe;
 static const uint32_t D = 0x10325476;
 
-static uint32_t F(uint32_t x, uint32_t y, uint32_t z)
-{
-    return (x & y) | ((~x) & z);
-}
-
-static uint32_t G(uint32_t x, uint32_t y, uint32_t z)
-{
-    return (x & z) | (y & (~z));
-}
 
-static uint32_t H(uint32_t x, uint32_t y, uint32_t z)
-{
-    return x ^ y ^ z;
-}
+#define F(X,Y,Z) (((X) & (Y)) | ((~(X)) & (Z)))
+#define G(X,Y,Z) (((X) & (Z)) | ((Y) & (~(Z))))
+#define H(X,Y,Z) ((X) ^ (Y) ^ (Z))
+#define I(X,Y,Z) ((Y) ^ ((X) | (~(Z))))
 
-static uint32_t I(uint32_t x, uint32_t y, uint32_t z)
-{
-    return y ^ (x | (~z));
-}
+#define ROTL(W,N) (((W) << N) | ((W) >> (32-N)))
 
-static uint32_t rotLeft(uint32_t w, uint8_t s)
-{
-    return (w << s) | (w >> (32-s));
-}
+#define ROUND1(a,b,c,d,x,s,t) \
+    a = ROTL(a + F(b,c,d) + x + t,s) + b; 
+
+#define ROUND2(a,b,c,d,x,s,t) \
+    a = ROTL(a + G(b,c,d) + x + t,s) + b; 
 
-#define ROUND1(a,b,c,d,k,s,i) \
-    a += F(b,c,d) + x[k] + T[i-1]; \
-    a = rotLeft(a,s);\
-    a += b;
-#define ROUND2(a,b,c,d,k,s,i) \
-    a += G(b,c,d) + x[k] + T[i-1]; \
-    a = rotLeft(a,s);\
-    a += b;
-#define ROUND3(a,b,c,d,k,s,i) \
-    a += H(b,c,d) + x[k] + T[i-1]; \
-    a = rotLeft(a,s);\
-    a += b;
-#define ROUND4(a,b,c,d,k,s,i) \
-    a += I(b,c,d) + x[k] + T[i-1]; \
-    a = rotLeft(a,s);\
-    a += b;
+#define ROUND3(a,b,c,d,x,s,t) \
+    a = ROTL(a + H(b,c,d) + x + t,s) + b; 
+
+#define ROUND4(a,b,c,d,x,s,t) \
+    a = ROTL(a + I(b,c,d) + x + t,s) + b; 
+
 
     
 MD5::MD5():
@@ -118,17 +77,22 @@
         padding = 56 - (totalBufferLength % 64);
     else
         padding = 56 + (64 - (totalBufferLength % 64));
-    uint8_t val = 0x80;
-    add(&val, 1);
-    val = 0;
-    for(int i = 0; i < padding-1; ++i)
-        add(&val,1);
-    totalBufferLength -= padding;
-    uint64_t lengthBit = totalBufferLength * 8;
+    buffer[bufferLength++] = 0x80;
+    padding--;
+    if(padding+bufferLength == 56)
+        memset(&buffer[bufferLength], 0, padding);
+    else
+    {
+        memset(&buffer[bufferLength], 0, 64-bufferLength);
+        computeRounds(&a, &b, &c, &d, buffer);
+        memset(buffer, 0, bufferLength);
+    }
+    uint64_t lengthBit = totalBufferLength << 3;
     uint32_t lengthBitLow = lengthBit;
     uint32_t lengthBitHigh = lengthBit >> 32;
-    add((uint8_t*)&lengthBitLow,4);
-    add((uint8_t*)&lengthBitHigh,4);
+    memcpy(&buffer[56], &lengthBitLow, 4);
+    memcpy(&buffer[60], &lengthBitHigh, 4);
+    computeRounds(&a, &b, &c, &d, buffer);
 
     memcpy(digest, &a, 4);
     memcpy(&digest[4], &b, 4);
@@ -149,47 +113,44 @@
     uint32_t a = *a2, b = *b2, c = *c2, d = *d2;
     uint32_t tmpA = a, tmpB = b, tmpC = c, tmpD = d;
 
-    uint32_t x[16];
-    for(int j = 0; j < 16; ++j)
-        memcpy(&x[j], &buffer[j*4], 4); 
-        
+    uint32_t *x = (uint32_t*)buffer;
+       
     // Round 1
-    ROUND1(a,b,c,d,0,7,1);      ROUND1(d,a,b,c,1,12,2);     ROUND1(c,d,a,b,2,17,3);     ROUND1(b,c,d,a,3,22,4);
-    ROUND1(a,b,c,d,4,7,5);      ROUND1(d,a,b,c,5,12,6);     ROUND1(c,d,a,b,6,17,7);     ROUND1(b,c,d,a,7,22,8);
-    ROUND1(a,b,c,d,8,7,9);      ROUND1(d,a,b,c,9,12,10);    ROUND1(c,d,a,b,10,17,11);   ROUND1(b,c,d,a,11,22,12);
-    ROUND1(a,b,c,d,12,7,13);    ROUND1(d,a,b,c,13,12,14);   ROUND1(c,d,a,b,14,17,15);   ROUND1(b,c,d,a,15,22,16);
-    
+    ROUND1(a,b,c,d,x[0],7,0xd76aa478);     ROUND1(d,a,b,c,x[1],12,0xe8c7b756);    ROUND1(c,d,a,b,x[2],17,0x242070db);    ROUND1(b,c,d,a,x[3],22,0xc1bdceee);
+    ROUND1(a,b,c,d,x[4],7,0xf57c0faf);     ROUND1(d,a,b,c,x[5],12,0x4787c62a);    ROUND1(c,d,a,b,x[6],17,0xa8304613);    ROUND1(b,c,d,a,x[7],22,0xfd469501);
+    ROUND1(a,b,c,d,x[8],7,0x698098d8);     ROUND1(d,a,b,c,x[9],12,0x8b44f7af);    ROUND1(c,d,a,b,x[10],17,0xffff5bb1);   ROUND1(b,c,d,a,x[11],22,0x895cd7be);
+    ROUND1(a,b,c,d,x[12],7,0x6b901122);    ROUND1(d,a,b,c,x[13],12,0xfd987193);   ROUND1(c,d,a,b,x[14],17,0xa679438e);   ROUND1(b,c,d,a,x[15],22,0x49b40821);
+
+
     // Round 2      
-    ROUND2(a,b,c,d,1,5,17);     ROUND2(d,a,b,c,6,9,18);     ROUND2(c,d,a,b,11,14,19);   ROUND2(b,c,d,a,0,20,20);
-    ROUND2(a,b,c,d,5,5,21);     ROUND2(d,a,b,c,10,9,22);    ROUND2(c,d,a,b,15,14,23);   ROUND2(b,c,d,a,4,20,24);
-    ROUND2(a,b,c,d,9,5,25);     ROUND2(d,a,b,c,14,9,26);    ROUND2(c,d,a,b,3,14,27);    ROUND2(b,c,d,a,8,20,28);
-    ROUND2(a,b,c,d,13,5,29);    ROUND2(d,a,b,c,2,9,30);     ROUND2(c,d,a,b,7,14,31);    ROUND2(b,c,d,a,12,20,32);
+    ROUND2(a,b,c,d,x[1],5,0xf61e2562);     ROUND2(d,a,b,c,x[6],9,0xc040b340);     ROUND2(c,d,a,b,x[11],14,0x265e5a51);   ROUND2(b,c,d,a,x[0],20,0xe9b6c7aa);
+    ROUND2(a,b,c,d,x[5],5,0xd62f105d);     ROUND2(d,a,b,c,x[10],9,0x02441453);    ROUND2(c,d,a,b,x[15],14,0xd8a1e681);   ROUND2(b,c,d,a,x[4],20,0xe7d3fbc8);
+    ROUND2(a,b,c,d,x[9],5,0x21e1cde6);     ROUND2(d,a,b,c,x[14],9,0xc33707d6);    ROUND2(c,d,a,b,x[3],14,0xf4d50d87);    ROUND2(b,c,d,a,x[8],20,0x455a14ed);
+    ROUND2(a,b,c,d,x[13],5,0xa9e3e905);    ROUND2(d,a,b,c,x[2],9,0xfcefa3f8);     ROUND2(c,d,a,b,x[7],14,0x676f02d9);    ROUND2(b,c,d,a,x[12],20,0x8d2a4c8a);
     
+
     // Round 3      
-    ROUND3(a,b,c,d,5,4,33);     ROUND3(d,a,b,c,8,11,34);    ROUND3(c,d,a,b,11,16,35);   ROUND3(b,c,d,a,14,23,36);
-    ROUND3(a,b,c,d,1,4,37);     ROUND3(d,a,b,c,4,11,38);    ROUND3(c,d,a,b,7,16,39);    ROUND3(b,c,d,a,10,23,40);
-    ROUND3(a,b,c,d,13,4,41);    ROUND3(d,a,b,c,0,11,42);    ROUND3(c,d,a,b,3,16,43);    ROUND3(b,c,d,a,6,23,44);
-    ROUND3(a,b,c,d,9,4,45);     ROUND3(d,a,b,c,12,11,46);   ROUND3(c,d,a,b,15,16,47);   ROUND3(b,c,d,a,2,23,48);
-    
+    ROUND3(a,b,c,d,x[5],4,0xfffa3942);     ROUND3(d,a,b,c,x[8],11,0x8771f681);    ROUND3(c,d,a,b,x[11],16,0x6d9d6122);   ROUND3(b,c,d,a,x[14],23,0xfde5380c);
+    ROUND3(a,b,c,d,x[1],4,0xa4beea44);     ROUND3(d,a,b,c,x[4],11,0x4bdecfa9);    ROUND3(c,d,a,b,x[7],16,0xf6bb4b60);    ROUND3(b,c,d,a,x[10],23,0xbebfbc70);
+    ROUND3(a,b,c,d,x[13],4,0x289b7ec6);    ROUND3(d,a,b,c,x[0],11,0xeaa127fa);    ROUND3(c,d,a,b,x[3],16,0xd4ef3085);    ROUND3(b,c,d,a,x[6],23,0x04881d05);
+    ROUND3(a,b,c,d,x[9],4,0xd9d4d039);     ROUND3(d,a,b,c,x[12],11,0xe6db99e5);   ROUND3(c,d,a,b,x[15],16,0x1fa27cf8);   ROUND3(b,c,d,a,x[2],23,0xc4ac5665);
+ 
+ 
     // Round 4
-    ROUND4(a,b,c,d,0,6,49);     ROUND4(d,a,b,c,7,10,50);    ROUND4(c,d,a,b,14,15,51);   ROUND4(b,c,d,a,5,21,52);
-    ROUND4(a,b,c,d,12,6,53);    ROUND4(d,a,b,c,3,10,54);    ROUND4(c,d,a,b,10,15,55);   ROUND4(b,c,d,a,1,21,56);
-    ROUND4(a,b,c,d,8,6,57);     ROUND4(d,a,b,c,15,10,58);   ROUND4(c,d,a,b,6,15,59);    ROUND4(b,c,d,a,13,21,60);
-    ROUND4(a,b,c,d,4,6,61);     ROUND4(d,a,b,c,11,10,62);   ROUND4(c,d,a,b,2,15,63);    ROUND4(b,c,d,a,9,21,64);
-
-    a += tmpA;
-    b += tmpB;
-    c += tmpC;
-    d += tmpD;
-
-    *a2 = a;
-    *b2 = b;
-    *c2 = c;
-    *d2 = d;
+    ROUND4(a,b,c,d,x[0],6,0xf4292244);     ROUND4(d,a,b,c,x[7],10,0x432aff97);    ROUND4(c,d,a,b,x[14],15,0xab9423a7);   ROUND4(b,c,d,a,x[5],21,0xfc93a039);
+    ROUND4(a,b,c,d,x[12],6,0x655b59c3);    ROUND4(d,a,b,c,x[3],10,0x8f0ccc92);    ROUND4(c,d,a,b,x[10],15,0xffeff47d);   ROUND4(b,c,d,a,x[1],21,0x85845dd1);
+    ROUND4(a,b,c,d,x[8],6,0x6fa87e4f);     ROUND4(d,a,b,c,x[15],10,0xfe2ce6e0);   ROUND4(c,d,a,b,x[6],15,0xa3014314);    ROUND4(b,c,d,a,x[13],21,0x4e0811a1);
+    ROUND4(a,b,c,d,x[4],6,0xf7537e82);     ROUND4(d,a,b,c,x[11],10,0xbd3af235);   ROUND4(c,d,a,b,x[2],15,0x2ad7d2bb);    ROUND4(b,c,d,a,x[9],21,0xeb86d391);
+    
+    *a2 = a + tmpA;
+    *b2 = b + tmpB;
+    *c2 = c + tmpC;
+    *d2 = d + tmpD;
 }
 
 void MD5::computeDigest(uint8_t *digest, uint8_t *msg, uint32_t length)
 {
+    uint64_t lengthBit = length << 3;
     uint16_t padding;
     if(length % 64 < 56)
         padding = 56 - (length % 64);
@@ -197,29 +158,25 @@
         padding = 56 + (64 - (length % 64));
         
     uint32_t a = A, b = B, c = C, d = D;
-
-    uint32_t offset = 0;
-    while(length - offset >= 64)
+    while(length >= 64)
     {
-        computeRounds(&a, &b, &c, &d, &msg[offset]);
-        offset += 64;
+        computeRounds(&a, &b, &c, &d, msg);
+        msg += 64;
+        length -= 64;
     }
     uint8_t buffer[64];
-    memcpy(buffer, &msg[offset], length-offset);
-    uint8_t bufferLength = length - offset;
-    buffer[bufferLength++] = 0x80;
+    memcpy(buffer, msg, length);
+    buffer[length++] = 0x80;
     padding--;
-    while(padding > 0)
+    if(padding+length == 56)
+        memset(&buffer[length], 0, padding);
+    else
     {
-        if(bufferLength == 64)
-        {
-            computeRounds(&a, &b, &c, &d, buffer);
-            bufferLength = 0;
-        }
-        buffer[bufferLength++] = 0;
-        padding--;
+        memset(&buffer[length], 0, 64-length);
+        computeRounds(&a, &b, &c, &d, msg);
+        memset(buffer, 0, length);
     }
-    uint64_t lengthBit = length * 8;
+
     uint32_t lengthBitLow = lengthBit;
     uint32_t lengthBitHigh = lengthBit >> 32;
     memcpy(&buffer[56], &lengthBitLow, 4);
--- a/SHA1.cpp	Mon Sep 09 16:16:24 2013 +0000
+++ b/SHA1.cpp	Wed Sep 11 17:22:40 2013 +0000
@@ -1,47 +1,41 @@
 #include "SHA1.h"
 #include <string.h>
-
-static uint32_t f(uint8_t t, uint32_t B, uint32_t C, uint32_t D)
-{
-    if(t <= 19)
-        return (B & C) | ((~B) & D);
-    else if(t <= 39)
-        return B ^ C ^ D;
-    else if(t <= 59)
-        return (B & C) | (B & D) | (C & D);
-    else if(t <= 79)
-        return B ^ C ^ D;
-    
-    return 0;
-}
+#include <stdio.h>
+#include <stdlib.h>
 
-static uint32_t K(uint8_t t)
-{
-    if(t <= 19)
-      return 0x5A827999;
-    else if(t <= 39)
-      return 0x6ED9EBA1;
-    else if(t <= 59)
-      return 0x8F1BBCDC;
-    else if(t <= 79)
-      return 0xCA62C1D6;
-    
-    return 0;
-}
+#define F0(B,C,D) ((B & C) | ((~B) & D))
+#define F1(B,C,D) (B ^ C ^ D)
+#define F2(B,C,D) ((B & C) | (B & D) | (C & D))
+#define ROTL(W,N) (((W) << N) | ((W) >> (32-N)))
+                        
+static const uint32_t K0 = 0x5A827999;
+static const uint32_t K1 = 0x6ED9EBA1;
+static const uint32_t K2 = 0x8F1BBCDC;
+static const uint32_t K3 = 0xCA62C1D6;
 
-static uint32_t rotLeft(uint32_t w, uint8_t n)
-{
-    return (w << n) | (w >> (32-n));
-}
 
 static const uint32_t H0 = 0x67452301;
 static const uint32_t H1 = 0xEFCDAB89;
 static const uint32_t H2 = 0x98BADCFE;
 static const uint32_t H3 = 0x10325476;
 static const uint32_t H4 = 0xC3D2E1F0;
-static const uint32_t MASK = 0x0000000F;
+
+static const uint32_t MASK = 0xF;
+
+#define W(s) ( w[s] = ROTL(w[((s) + 13) & MASK] ^ w[((s) + 8) & MASK] ^ w[((s) + 2) & MASK] ^ w[s],1))
 
+#define R0(A,B,C,D,E,T) E += ROTL(A, 5) + F0(B, C, D) + w[T] + K0; \
+                        B = ROTL(B,30);
+#define R1(A,B,C,D,E,T) E += ROTL(A, 5) + F0(B, C, D) + W(T & MASK) + K0; \
+                        B = ROTL(B,30); 
+#define R2(A,B,C,D,E,T) E += ROTL(A, 5) + F1(B, C, D) + W(T & MASK) + K1; \
+                        B = ROTL(B,30); 
+#define R3(A,B,C,D,E,T) E += ROTL(A, 5) + F2(B, C, D) + W(T & MASK) + K2; \
+                        B = ROTL(B,30); 
+#define R4(A,B,C,D,E,T) E += ROTL(A, 5) + F1(B, C, D) + W(T & MASK) + K3; \
+                        B = ROTL(B,30); 
 
+                        
 SHA1::SHA1():
 HashAlgorithm(),
 h0(H0),
@@ -87,53 +81,38 @@
 
 void SHA1::computeDigest(uint8_t *digest)
 {
+    uint32_t *digest2 = (uint32_t*)digest;
     uint16_t padding;
     if(totalBufferLength % 64 < 56)
         padding = 56 - (totalBufferLength % 64);
     else
         padding = 56 + (64 - (totalBufferLength % 64));
-    uint8_t val = 0x80;
-    add(&val, 1);
-    val = 0;
-    for(int i = 0; i < padding-1; ++i)
-        add(&val,1);
-    totalBufferLength -= padding;
-    uint64_t lengthBit = totalBufferLength * 8;
+        
+    buffer[bufferLength++] = 0x80;
+    padding--;
+    if(padding+bufferLength == 56)
+        memset(&buffer[bufferLength], 0, padding);
+    else
+    {
+        memset(&buffer[bufferLength], 0, 64-bufferLength);
+        computeBlock(&h0,&h1,&h2,&h3,&h4, buffer);
+        memset(buffer, 0, 48);
+    }
+    
+    uint64_t lengthBit = totalBufferLength << 3;
     uint32_t lengthBitLow = lengthBit;
     uint32_t lengthBitHigh = lengthBit >> 32;
-    uint8_t l[4];
-
-    l[0] = lengthBitHigh >> 24;
-    l[1] = lengthBitHigh >> 16;
-    l[2] = lengthBitHigh >> 8;
-    l[3] = lengthBitHigh;
-    add(l, 4);
-    l[0] = lengthBitLow >> 24;
-    l[1] = lengthBitLow >> 16;
-    l[2] = lengthBitLow >> 8;
-    l[3] = lengthBitLow;
-    add(l, 4);
-
-    digest[0] = h0 >> 24;
-    digest[1] = h0 >> 16;
-    digest[2] = h0 >> 8;
-    digest[3] = h0;
-    digest[4] = h1 >> 24;
-    digest[5] = h1 >> 16;
-    digest[6] = h1 >> 8;
-    digest[7] = h1;
-    digest[8] = h2 >> 24;
-    digest[9] = h2 >> 16;
-    digest[10] = h2 >> 8;
-    digest[11] = h2;
-    digest[12] = h3 >> 24;
-    digest[13] = h3 >> 16;
-    digest[14] = h3 >> 8;
-    digest[15] = h3;
-    digest[16] = h4 >> 24;
-    digest[17] = h4 >> 16;
-    digest[18] = h4 >> 8;
-    digest[19] = h4;
+    lengthBitLow = __rev(lengthBitLow);
+    lengthBitHigh = __rev(lengthBitHigh);
+    memcpy(&buffer[56], &lengthBitHigh, 4);
+    memcpy(&buffer[60], &lengthBitLow, 4);
+    computeBlock(&h0,&h1,&h2,&h3,&h4, buffer);
+    
+    digest2[0] = __rev(h0);
+    digest2[1] = __rev(h1);
+    digest2[2] = __rev(h2);
+    digest2[3] = __rev(h3);
+    digest2[4] = __rev(h4);
     
     // reset state
     h0 = H0;
@@ -147,27 +126,39 @@
 
 void SHA1::computeBlock(uint32_t *h02, uint32_t *h12, uint32_t *h22, uint32_t *h32, uint32_t *h42, uint8_t *buffer)
 {
+    uint32_t *buffer2 = (uint32_t*)buffer;
     uint32_t w[16];
-    for(int j = 0; j < 16; ++j)
-        w[j] = (buffer[j*4] << 24) | (buffer[j*4+1] << 16) | (buffer[j*4+2] << 8) | buffer[j*4+3];
+    for(int t = 0; t < 16; ++t)
+        w[t] = __rev(buffer2[t]);
     
     uint32_t a = *h02, b = *h12, c = *h22, d = *h32, e = *h42;
-    for(uint8_t t = 0; t < 80; ++t)
-    {
-        uint32_t s = t & MASK;
-        if(t >= 16)
-        {
-            w[s%16] = w[((s + 13) & MASK)%16] ^ w[((s + 8) & MASK)%16] ^ w[((s + 2) & MASK)%16] ^ w[s%16];
-            w[s%16] = rotLeft(w[s%16], 1);
-        }
+    
+    R0(a,b,c,d,e, 0) R0(e,a,b,c,d, 1) R0(d,e,a,b,c, 2) R0(c,d,e,a,b, 3)
+    R0(b,c,d,e,a, 4) R0(a,b,c,d,e, 5) R0(e,a,b,c,d, 6) R0(d,e,a,b,c, 7)
+    R0(c,d,e,a,b, 8) R0(b,c,d,e,a, 9) R0(a,b,c,d,e,10) R0(e,a,b,c,d,11)
+    R0(d,e,a,b,c,12) R0(c,d,e,a,b,13) R0(b,c,d,e,a,14) R0(a,b,c,d,e,15)
+    R1(e,a,b,c,d,16) R1(d,e,a,b,c,17) R1(c,d,e,a,b,18) R1(b,c,d,e,a,19)    
+    
+    
+    R2(a,b,c,d,e,20) R2(e,a,b,c,d,21) R2(d,e,a,b,c,22) R2(c,d,e,a,b,23)
+    R2(b,c,d,e,a,24) R2(a,b,c,d,e,25) R2(e,a,b,c,d,26) R2(d,e,a,b,c,27)
+    R2(c,d,e,a,b,28) R2(b,c,d,e,a,29) R2(a,b,c,d,e,30) R2(e,a,b,c,d,31)
+    R2(d,e,a,b,c,32) R2(c,d,e,a,b,33) R2(b,c,d,e,a,34) R2(a,b,c,d,e,35)
+    R2(e,a,b,c,d,36) R2(d,e,a,b,c,37) R2(c,d,e,a,b,38) R2(b,c,d,e,a,39)    
+    
+    R3(a,b,c,d,e,40) R3(e,a,b,c,d,41) R3(d,e,a,b,c,42) R3(c,d,e,a,b,43)
+    R3(b,c,d,e,a,44) R3(a,b,c,d,e,45) R3(e,a,b,c,d,46) R3(d,e,a,b,c,47)
+    R3(c,d,e,a,b,48) R3(b,c,d,e,a,49) R3(a,b,c,d,e,50) R3(e,a,b,c,d,51)
+    R3(d,e,a,b,c,52) R3(c,d,e,a,b,53) R3(b,c,d,e,a,54) R3(a,b,c,d,e,55)
+    R3(e,a,b,c,d,56) R3(d,e,a,b,c,57) R3(c,d,e,a,b,58) R3(b,c,d,e,a,59)    
+    
+    
+    R4(a,b,c,d,e,60) R4(e,a,b,c,d,61) R4(d,e,a,b,c,62) R4(c,d,e,a,b,63)
+    R4(b,c,d,e,a,64) R4(a,b,c,d,e,65) R4(e,a,b,c,d,66) R4(d,e,a,b,c,67)
+    R4(c,d,e,a,b,68) R4(b,c,d,e,a,69) R4(a,b,c,d,e,70) R4(e,a,b,c,d,71)
+    R4(d,e,a,b,c,72) R4(c,d,e,a,b,73) R4(b,c,d,e,a,74) R4(a,b,c,d,e,75)
+    R4(e,a,b,c,d,76) R4(d,e,a,b,c,77) R4(c,d,e,a,b,78) R4(b,c,d,e,a,79)    
         
-        uint32_t temp = rotLeft(a, 5) + f(t, b, c, d) + e + w[s%16] + K(t);
-        e = d;
-        d = c;
-        c = rotLeft(b,30);
-        b = a;
-        a = temp;
-    }
     *h02 += a;
     *h12 += b;
     *h22 += c;
@@ -177,39 +168,37 @@
 
 
 
-/* method 2 */
+/* method 1 */
 void SHA1::computeDigest(uint8_t *digest, uint8_t *in, uint32_t length)
 {
-    uint16_t padding;
+    uint64_t lengthBit = length << 3;
+    uint32_t padding;
     if(length % 64 < 56)
         padding = 56 - (length % 64);
     else
         padding = 56 + (64 - (length % 64));
         
     uint32_t h0 = H0, h1 = H1, h2 = H2, h3 = H3, h4 = H4;
-    uint32_t offset = 0;
-    while(length - offset >= 64)
+    while(length >= 64)
     {
-        computeBlock(&h0,&h1,&h2,&h3,&h4, &in[offset]);
-        offset += 64;
+        computeBlock(&h0,&h1,&h2,&h3,&h4, in);
+        length -= 64;
+        in += 64;
+    }
+   
+    uint8_t buffer[64];
+    memcpy(buffer, in, length);
+    buffer[length++] = 0x80;
+    padding--;
+    if(padding+length+8 == 64)
+        memset(&buffer[length], 0, padding);
+    else
+    {
+        memset(&buffer[length], 0, 64-length);
+        computeBlock(&h0,&h1,&h2,&h3,&h4, buffer);
+        memset(buffer, 0, length);
     }
 
-    uint8_t bufferLength = length - offset;
-    uint8_t buffer[64];
-    memcpy(buffer, &in[offset], bufferLength);
-    buffer[bufferLength++] = 0x80;
-    padding--;
-    if(padding+bufferLength+8 == 64)
-        memset(&buffer[bufferLength], 0, padding);
-    else
-    {
-        memset(&buffer[bufferLength], 0, 64-bufferLength);
-        padding -= 64-bufferLength;
-        computeBlock(&h0,&h1,&h2,&h3,&h4, buffer);
-        memset(buffer, 0, 48);
-    }
-
-    uint64_t lengthBit = length * 8;
     uint32_t lengthBitLow = lengthBit;
     uint32_t lengthBitHigh = lengthBit >> 32;
     lengthBitLow = __rev(lengthBitLow);
--- a/SHA2_32.cpp	Mon Sep 09 16:16:24 2013 +0000
+++ b/SHA2_32.cpp	Wed Sep 11 17:22:40 2013 +0000
@@ -23,47 +23,19 @@
     0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
 };
 
-static uint32_t rotLeft(uint32_t w, uint8_t n)
-{
-    return (w << n) | (w >> (32-n));
-}
-
-static uint32_t rotRight(uint32_t w, uint8_t n)
-{
-    return rotLeft(w,32-n);
-}
-
-static uint32_t CH(uint32_t x, uint32_t y, uint32_t z)
-{
-    return (x & y) ^ ((~x) & z);
-}
-
-static uint32_t MAJ(uint32_t x, uint32_t y, uint32_t z)
-{
-    return (x & y) ^ (x & z) ^ (y & z);
-}
-
-static uint32_t BSIG0(uint32_t x)
-{
-    return rotRight(x,2) ^ rotRight(x,13) ^ rotRight(x,22);
-}
-
-static uint32_t BSIG1(uint32_t x)
-{
-    return rotRight(x,6) ^ rotRight(x,11) ^ rotRight(x,25);
-}
-
-static uint32_t SSIG0(uint32_t x)
-{
-    return rotRight(x,7) ^ rotRight(x,18) ^ (x >> 3);
-}
- 
-static uint32_t SSIG1(uint32_t x)
-{
-    return rotRight(x,17) ^ rotRight(x,19) ^ (x >> 10);
-}
-
-
+#define ROTL(W,N) (((W) << (N)) | ((W) >> (32-(N))))
+#define ROTR(W,N) (((W) >> (N)) | ((W) << (32-(N))))
+#define CH(X,Y,Z) (((X) & (Y)) ^ ((~(X)) & (Z)))
+#define MAJ(X,Y,Z) (((X) & (Y)) ^ ((X) & (Z)) ^ ((Y) & (Z)))
+#define BSIG0(X) (ROTR(X,2) ^ ROTR(X,13) ^ ROTR(X,22))
+#define BSIG1(X) (ROTR(X,6) ^ ROTR(X,11) ^ ROTR(X,25))
+#define SSIG0(X) (ROTR((X),7) ^ ROTR((X),18) ^ ((X) >> 3))
+#define SSIG1(X) (ROTR((X),17) ^ ROTR((X),19) ^ ((X) >> 10))
+#define R(A,B,C,D,E,F,G,H,T)  T1 = H + BSIG1(E) + CH(E,F,G) + K[T] + w[T]; \
+                              T2 = BSIG0(A) + MAJ(A,B,C); \
+                              D += T1; \
+                              H = T1 + T2;
+        
 static const uint32_t H[] =
 {
     // SHA-224
@@ -138,62 +110,46 @@
         padding = 56 - (totalBufferLength % 64);
     else
         padding = 56 + (64 - (totalBufferLength % 64));
-    uint8_t val = 0x80;
-    add(&val, 1);
-    val = 0;
-    for(int i = 0; i < padding-1; ++i)
-        add(&val,1);
-    totalBufferLength -= padding;
-    uint64_t lengthBit = totalBufferLength * 8;
+
+    buffer[bufferLength++] = 0x80;
+    padding--;
+    if(padding+bufferLength == 56)
+        memset(&buffer[bufferLength], 0, padding);
+    else
+    {
+        memset(&buffer[bufferLength], 0, 64-bufferLength);
+        computeBlock(&h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, buffer);
+        memset(buffer, 0, bufferLength);
+    }
+    
+    uint64_t lengthBit = totalBufferLength << 3;
     uint32_t lengthBitLow = lengthBit;
     uint32_t lengthBitHigh = lengthBit >> 32;
-    uint8_t tmp[4];
-    tmp[0] = lengthBitHigh >> 24;
-    tmp[1] = lengthBitHigh >> 16;
-    tmp[2] = lengthBitHigh >> 8;
-    tmp[3] = lengthBitHigh;
-    add(tmp, 4);
-    tmp[0] = lengthBitLow >> 24;
-    tmp[1] = lengthBitLow >> 16;
-    tmp[2] = lengthBitLow >> 8;
-    tmp[3] = lengthBitLow;
-    add(tmp, 4);
+    lengthBitLow = __rev(lengthBitLow);
+    lengthBitHigh = __rev(lengthBitHigh);
+    memcpy(&buffer[60], &lengthBitLow, 4);    
+    memcpy(&buffer[56], &lengthBitHigh, 4);    
+    computeBlock(&h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, buffer);
 
-    digest[0] = h0 >> 24;
-    digest[1] = h0 >> 16;
-    digest[2] = h0 >> 8;
-    digest[3] = h0;
-    digest[4] = h1 >> 24;
-    digest[5] = h1 >> 16;
-    digest[6] = h1 >> 8;
-    digest[7] = h1;
-    digest[8] = h2 >> 24;
-    digest[9] = h2 >> 16;
-    digest[10] = h2 >> 8;
-    digest[11] = h2;
-    digest[12] = h3 >> 24;
-    digest[13] = h3 >> 16;
-    digest[14] = h3 >> 8;
-    digest[15] = h3;
-    digest[16] = h4 >> 24;
-    digest[17] = h4 >> 16;
-    digest[18] = h4 >> 8;
-    digest[19] = h4;
-    digest[20] = h5 >> 24;
-    digest[21] = h5 >> 16;
-    digest[22] = h5 >> 8;
-    digest[23] = h5;
-    digest[24] = h6 >> 24;
-    digest[25] = h6 >> 16;
-    digest[26] = h6 >> 8;
-    digest[27] = h6;
-
+    h0 = __rev(h0);
+    h1 = __rev(h1);
+    h2 = __rev(h2);
+    h3 = __rev(h3);
+    h4 = __rev(h4);
+    h5 = __rev(h5);
+    h6 = __rev(h6);
+    memcpy(digest, &h0, 4);
+    memcpy(&digest[4], &h1, 4);
+    memcpy(&digest[8], &h2, 4);
+    memcpy(&digest[12], &h3, 4);
+    memcpy(&digest[16], &h4, 4);
+    memcpy(&digest[20], &h5, 4);
+    memcpy(&digest[24], &h6, 4);
+    
     if(type == SHA_256)
     {
-        digest[28] = h7 >> 24;
-        digest[29] = h7 >> 16;
-        digest[30] = h7 >> 8;
-        digest[31] = h7;
+        h7 = __rev(h7);
+        memcpy(&digest[28], &h7, 4);
     }
     
     // reset state
@@ -236,27 +192,102 @@
                         uint8_t *buffer)
 {
     uint32_t w[64];
-    for(int t = 0; t < 16; ++t)
-    {
-        w[t] = (buffer[t*4] << 24) | (buffer[t*4+1] << 16) | (buffer[t*4+2] << 8) | buffer[t*4+3]; 
-    }
+    uint32_t *buffer2 = (uint32_t*)buffer;
+    w[0] = __rev(buffer2[0]);
+    w[1] = __rev(buffer2[1]);
+    w[2] = __rev(buffer2[2]);
+    w[3] = __rev(buffer2[3]);
+    w[4] = __rev(buffer2[4]);
+    w[5] = __rev(buffer2[5]);
+    w[6] = __rev(buffer2[6]);
+    w[7] = __rev(buffer2[7]);
+    w[8] = __rev(buffer2[8]);
+    w[9] = __rev(buffer2[9]);
+    w[10] = __rev(buffer2[10]);
+    w[11] = __rev(buffer2[11]);
+    w[12] = __rev(buffer2[12]);
+    w[13] = __rev(buffer2[13]);
+    w[14] = __rev(buffer2[14]);
+    w[15] = __rev(buffer2[15]);
+
     for(int t = 16; t < 64; ++t)
         w[t] = SSIG1(w[t-2]) + w[t-7] + SSIG0(w[t-15]) + w[t-16];
     
-     uint32_t a = *h02, b = *h12, c = *h22, d = *h32, e = *h42, f = *h52, g = *h62, h = *h72;
-    for(int t = 0; t < 64; ++t)
-    {
-        uint32_t T1 = h + BSIG1(e) + CH(e,f,g) + K[t] + w[t];
-        uint32_t T2 = BSIG0(a) + MAJ(a,b,c);
-        h = g;
-        g = f;
-        f = e;
-        e = d + T1;
-        d = c;
-        c = b;
-        b = a;
-        a = T1 + T2;
-    }
+    uint32_t a = *h02, b = *h12, c = *h22, d = *h32, e = *h42, f = *h52, g = *h62, h = *h72;
+    uint32_t T1, T2;
+    
+    R(a,b,c,d,e,f,g,h,0)
+    R(h,a,b,c,d,e,f,g,1)
+    R(g,h,a,b,c,d,e,f,2)
+    R(f,g,h,a,b,c,d,e,3)
+    R(e,f,g,h,a,b,c,d,4)
+    R(d,e,f,g,h,a,b,c,5)
+    R(c,d,e,f,g,h,a,b,6)
+    R(b,c,d,e,f,g,h,a,7)
+
+    R(a,b,c,d,e,f,g,h,8)
+    R(h,a,b,c,d,e,f,g,9)
+    R(g,h,a,b,c,d,e,f,10)
+    R(f,g,h,a,b,c,d,e,11)
+    R(e,f,g,h,a,b,c,d,12)
+    R(d,e,f,g,h,a,b,c,13)
+    R(c,d,e,f,g,h,a,b,14)
+    R(b,c,d,e,f,g,h,a,15)
+    
+    R(a,b,c,d,e,f,g,h,16)
+    R(h,a,b,c,d,e,f,g,17)
+    R(g,h,a,b,c,d,e,f,18)
+    R(f,g,h,a,b,c,d,e,19)
+    R(e,f,g,h,a,b,c,d,20)
+    R(d,e,f,g,h,a,b,c,21)
+    R(c,d,e,f,g,h,a,b,22)
+    R(b,c,d,e,f,g,h,a,23)
+    
+    R(a,b,c,d,e,f,g,h,24)
+    R(h,a,b,c,d,e,f,g,25)
+    R(g,h,a,b,c,d,e,f,26)
+    R(f,g,h,a,b,c,d,e,27)
+    R(e,f,g,h,a,b,c,d,28)
+    R(d,e,f,g,h,a,b,c,29)
+    R(c,d,e,f,g,h,a,b,30)
+    R(b,c,d,e,f,g,h,a,31) 
+    
+    R(a,b,c,d,e,f,g,h,32)
+    R(h,a,b,c,d,e,f,g,33)
+    R(g,h,a,b,c,d,e,f,34)
+    R(f,g,h,a,b,c,d,e,35)
+    R(e,f,g,h,a,b,c,d,36)
+    R(d,e,f,g,h,a,b,c,37)
+    R(c,d,e,f,g,h,a,b,38)
+    R(b,c,d,e,f,g,h,a,39)
+    
+    R(a,b,c,d,e,f,g,h,40)
+    R(h,a,b,c,d,e,f,g,41)
+    R(g,h,a,b,c,d,e,f,42)
+    R(f,g,h,a,b,c,d,e,43)
+    R(e,f,g,h,a,b,c,d,44)
+    R(d,e,f,g,h,a,b,c,45)
+    R(c,d,e,f,g,h,a,b,46)
+    R(b,c,d,e,f,g,h,a,47)
+
+    R(a,b,c,d,e,f,g,h,48)
+    R(h,a,b,c,d,e,f,g,49)
+    R(g,h,a,b,c,d,e,f,50)
+    R(f,g,h,a,b,c,d,e,51)
+    R(e,f,g,h,a,b,c,d,52)
+    R(d,e,f,g,h,a,b,c,53)
+    R(c,d,e,f,g,h,a,b,54)
+    R(b,c,d,e,f,g,h,a,55)
+    
+    R(a,b,c,d,e,f,g,h,56)
+    R(h,a,b,c,d,e,f,g,57)
+    R(g,h,a,b,c,d,e,f,58)
+    R(f,g,h,a,b,c,d,e,59)
+    R(e,f,g,h,a,b,c,d,60)
+    R(d,e,f,g,h,a,b,c,61)
+    R(c,d,e,f,g,h,a,b,62)
+    R(b,c,d,e,f,g,h,a,63)
+    
     
     *h02 += a;
     *h12 += b;
@@ -272,35 +303,32 @@
 {
     uint32_t h0 = H[type*8], h1 = H[type*8+1], h2 = H[type*8+2], h3 = H[type*8+3];
     uint32_t h4 = H[type*8+4], h5 = H[type*8+5], h6 = H[type*8+6], h7 = H[type*8+7];
-    int offset = 0;
-    while(length - offset >= 64)
-    {
-        computeBlock(&h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &in[offset]);
-        offset += 64;
-    }
-    uint8_t bufferLength = length-offset;
-    uint8_t buffer[64];
-    memcpy(buffer, &in[offset],bufferLength); 
+    uint64_t lengthBit = length << 3;
     uint16_t padding;
     if(length % 64 < 56)
         padding = 56 - (length % 64);
     else
         padding = 56 + (64 - (length % 64));
-    buffer[bufferLength] = 0x80;
-    bufferLength++;
-    padding--;
-    while(padding > 0)
+        
+    while(length >= 64)
     {
-        if(bufferLength == 64)
-        {
-            computeBlock(&h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, buffer);
-            bufferLength = 0;
-        }
-        buffer[bufferLength] = 0;
-        bufferLength++;
-        padding--;
+        computeBlock(&h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, in);
+        length -= 64;
+        in += 64;
     }
-    uint64_t lengthBit = length * 8;
+    uint8_t buffer[64];
+    memcpy(buffer, in,length); 
+    buffer[length++] = 0x80;
+    padding--;
+    if(padding+length == 56)
+        memset(&buffer[length], 0, padding);
+    else
+    {
+        memset(&buffer[length], 0, 64-length);
+        computeBlock(&h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, buffer);
+        memset(buffer, 0, length);
+    }
+    
     uint32_t lengthBitLow = lengthBit;
     uint32_t lengthBitHigh = lengthBit >> 32;
     lengthBitLow = __rev(lengthBitLow);
--- a/SHA2_64.cpp	Mon Sep 09 16:16:24 2013 +0000
+++ b/SHA2_64.cpp	Wed Sep 11 17:22:40 2013 +0000
@@ -39,74 +39,26 @@
 
 static uint64_t revWord(uint64_t w)
 {
-    uint8_t buffer[8];
-    buffer[0] = w >> 56;
-    buffer[1] = w >> 48;
-    buffer[2] = w >> 40;
-    buffer[3] = w >> 32;
-    buffer[4] = w >> 24;
-    buffer[5] = w >> 16;
-    buffer[6] = w >> 8;
-    buffer[7] = w;
-    
-    uint64_t res = buffer[7];
-    res <<= 8;
-    res |= buffer[6];
-    res <<= 8;
-    res |= buffer[5];
-    res <<= 8;
-    res |= buffer[4];
-    res <<= 8;
-    res |= buffer[3];
-    res <<= 8;
-    res |= buffer[2];
-    res <<= 8;
-    res |= buffer[1];
-    res <<= 8;
-    res |= buffer[0];
-
-    return res;
-}
-    
-static uint64_t rotLeft(uint64_t w, uint8_t n)
-{
-    return (w << n) | (w >> (64-n));
+    return (w >> 56)                          // reverse the 8 bytes of w: byte 7 (most significant) -> byte 0
+        | ((w & 0x00FF000000000000) >> 40)    // byte 6 -> byte 1
+        | ((w & 0x0000FF0000000000) >> 24)    // byte 5 -> byte 2
+        | ((w & 0x000000FF00000000) >> 8)     // byte 4 -> byte 3
+        | ((w & 0x00000000FF000000) << 8)     // byte 3 -> byte 4
+        | ((w & 0x0000000000FF0000) << 24)    // byte 2 -> byte 5
+        | ((w & 0x000000000000FF00) << 40)    // byte 1 -> byte 6
+        | ((w & 0x00000000000000FF) << 56);   // byte 0 (least significant) -> byte 7
 }
 
-static uint64_t rotRight(uint64_t w, uint8_t n)
-{
-    return rotLeft(w,64-n);
-}
-
-static uint64_t CH(uint64_t x, uint64_t y, uint64_t z)
-{
-    return (x & y) ^ ((~x) & z);
-}
-
-static uint64_t MAJ(uint64_t x, uint64_t y, uint64_t z)
-{
-    return (x & y) ^ (x & z) ^ (y & z);
-}
+#define ROTL(W,N) (((W) << (N)) | ((W) >> (64-(N))))   // rotate W left by N bits; assumes W is an unsigned 64-bit value and 0 < N < 64
+#define ROTR(W,N) (((W) >> (N)) | ((W) << (64-(N))))   // rotate W right by N bits; same assumptions as ROTL
+#define CH(X,Y,Z) (((X) & (Y)) ^ ((~(X)) & (Z)))   // bitwise choose: takes the bit of Y where X is 1, the bit of Z where X is 0
+#define MAJ(X,Y,Z) (((X) & (Y)) ^ ((X) & (Z)) ^ ((Y) & (Z)))   // bitwise majority: each result bit is the value held by at least two of X, Y, Z
+#define BSIG0(X) (ROTR(X,28) ^ ROTR(X,34) ^ ROTR(X,39))   // XOR of X rotated right by 28, 34 and 39 (the SHA-384/512 "big sigma 0" constants)
+#define BSIG1(X) (ROTR(X,14) ^ ROTR(X,18) ^ ROTR(X,41))   // XOR of X rotated right by 14, 18 and 41 (the SHA-384/512 "big sigma 1" constants)
+#define SSIG0(X) (ROTR((X),1) ^ ROTR((X),8) ^ ((X) >> 7))   // rotr 1 ^ rotr 8 ^ logical shift right 7 (SHA-384/512 "small sigma 0"); X is expanded three times, so pass a side-effect-free expression
+#define SSIG1(X) (ROTR((X),19) ^ ROTR((X),61) ^ ((X) >> 6))   // rotr 19 ^ rotr 61 ^ logical shift right 6 (SHA-384/512 "small sigma 1"); X is expanded three times
 
-static uint64_t BSIG0(uint64_t x)
-{
-    return rotRight(x,28) ^ rotRight(x,34) ^ rotRight(x,39);
-}
-
-static uint64_t BSIG1(uint64_t x)
-{
-    return rotRight(x,14) ^ rotRight(x,18) ^ rotRight(x,41);
-}
-
-static uint64_t SSIG0(uint64_t x)
-{
-    return rotRight(x,1) ^ rotRight(x,8) ^ (x >> 7);
-}
-
-static uint64_t SSIG1(uint64_t x)
-{
-    return rotRight(x,19) ^ rotRight(x,61) ^ (x>>6);
-}
+    
 
 SHA2_64::SHA2_64(SHA2_64_TYPE t):
 type(t),
@@ -280,37 +232,37 @@
 
 void SHA2_64::computeDigest(SHA2_64_TYPE type, uint8_t *digest, uint8_t *in, uint32_t length)
 {
+    uint64_t lengthBit = length * 8;
     uint64_t h0 = H[type*8], h1 = H[type*8+1], h2 = H[type*8+2], h3 = H[type*8+3];
     uint64_t h4 = H[type*8+4], h5 = H[type*8+5], h6 = H[type*8+6], h7 = H[type*8+7];
-    int offset = 0;
-    while(length - offset >= 128)
-    {
-        computeBlock(&h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, &in[offset]);
-        offset += 128;
-    }
-    uint8_t bufferLength = length-offset;
-    uint8_t buffer[128];
-    memcpy(buffer, &in[offset],bufferLength); 
-    uint16_t padding;
+    
+    int padding;
     if(length % 128 < 112)
         padding = 112 - (length % 128);
     else
         padding = 112 + (128 - (length % 128));
-    buffer[bufferLength] = 0x80;
-    bufferLength++;
-    padding--;
-    while(padding > 0)
+        
+    while(length >= 128)
     {
-        if(bufferLength == 128)
-        {
-            computeBlock(&h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, buffer);
-            bufferLength = 0;
-        }
-        buffer[bufferLength] = 0;
-        bufferLength++;
-        padding--;
+        computeBlock(&h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, in);
+        in += 128;
+        length -= 128;
     }
-    uint64_t lengthBit = length * 8;
+    uint8_t buffer[128];
+    memcpy(buffer, in,length); 
+    buffer[length] = 0x80;
+    length++;
+    padding--;
+
+    if(padding+length == 112)
+        memset(&buffer[length], 0, padding);
+    else
+    {
+        memset(&buffer[length], 0, 128-length);
+        computeBlock(&h0, &h1, &h2, &h3, &h4, &h5, &h6, &h7, buffer);
+        memset(buffer, 0, length);
+    }
+    
     lengthBit = revWord(lengthBit);
     memset(&buffer[112], 0, 8); 
     memcpy(&buffer[120], &lengthBit, 8);