aes with critcl

yahalom: This is my first real work with critcl and the tcl objects. they were easier to work with than I thought. Basically I replaced the HEAVY math procs with C code that does it faster. In my tests it runs 14-15 times faster, which for me is a big difference as I need to encrypt files that took 40 seconds to encrypt!

      package require critcl
      critcl::ccode {
        #include <string.h>
        unsigned char Sbox[256] = {        
        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
        0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
        0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
        0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
        0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
        0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
        0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
        0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
        0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
        0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
        0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
        0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
        0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
        0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
        0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
        0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
        0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
        0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
        0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
        0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
        0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
        0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
        0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
        0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
        0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
        0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
        0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
        0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
        0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
        0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};

    unsigned char  xtime[256] = {
       0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e, 
       0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, 
       0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e, 
       0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e, 
       0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e, 
       0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe, 
       0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde, 
       0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe, 
       0x1b, 0x19, 0x1f, 0x1d, 0x13, 0x11, 0x17, 0x15, 0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05, 
       0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35, 0x2b, 0x29, 0x2f, 0x2d, 0x23, 0x21, 0x27, 0x25, 
       0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55, 0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45, 
       0x7b, 0x79, 0x7f, 0x7d, 0x73, 0x71, 0x77, 0x75, 0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65, 
       0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95, 0x8b, 0x89, 0x8f, 0x8d, 0x83, 0x81, 0x87, 0x85, 
       0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5, 0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5, 
       0xdb, 0xd9, 0xdf, 0xdd, 0xd3, 0xd1, 0xd7, 0xd5, 0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5, 
       0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5
        };
         unsigned char Xobs[256] = {
                0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38,0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb,
                0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87,0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb,
                0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d,0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e,
                0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2,0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25,
                0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16,0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92,
                0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda,0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84,
                0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a,0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06,
                0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02,0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b,
                0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea,0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73,
                0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85,0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e,
                0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89,0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b,
                0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20,0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4,
                0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31,0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f,
                0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d,0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef,
                0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0,0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61,
                0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26,0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d,
        };

    }

    critcl::ccommand MixColumns {cd interp objc objv} {
        Tcl_Obj * resObj = Tcl_NewListObj (0,0);
        Tcl_Obj * tempObj = NULL;
        int i;
        int r0,r1,r2,r3,s0,s1,s2,s3;
        Tcl_WideInt w;
        for (i=0;i<4;i++) {
                Tcl_ListObjIndex(interp,objv[1],i,&tempObj);
                Tcl_GetWideIntFromObj(interp,tempObj,&w);
                r0= (w >> 24) & 255;
                r1= (w >> 16) & 255;

                r2= (w >> 8 ) & 255;
                r3=  w        & 255;
                s0= xtime[r0] ^ (r1^xtime[r1]) ^ r2 ^ r3;
                s1= r0 ^ xtime[r1] ^ (r2^xtime[r2])^ r3;
                s2= r0 ^ r1 ^ (xtime[r2]) ^ (r3^xtime[r3]);
                s3= r0^xtime[r0] ^ r1 ^ r2 ^ xtime[r3];
                Tcl_Obj * l=Tcl_NewWideIntObj((s0 << 24) | (s1 << 16) | (s2 << 8) | s3 ); 
                Tcl_ListObjAppendElement(interp,resObj,l);
        }
        Tcl_SetObjResult(interp,resObj);
        return TCL_OK;
      }
      critcl::ccommand InvMixColumns {cd interp objc objv} {
        Tcl_Obj * resObj = Tcl_NewListObj (0,0);
        Tcl_Obj * tempObj = NULL;
        int i;
        int r0,r1,r2,r3,s0,s1,s2,s3;
        Tcl_WideInt w;
        for (i=0;i<4;i++) {
                Tcl_ListObjIndex(interp,objv[1],i,&tempObj);
                Tcl_GetWideIntFromObj(interp,tempObj,&w);
                r0= (w >> 24) & 255;
                r1= (w >> 16) & 255;
                r2= (w >> 8 ) & 255;
                r3=  w        & 255;

                s0= (xtime[xtime[xtime[r0]]] ^ (xtime[xtime[r0]]) ^ xtime[r0])  ^  (xtime[xtime[xtime[r1]]] ^ xtime[r1] ^ r1)  ^  (xtime[xtime[xtime[r2]]] ^ (xtime[xtime[r2]] ^ r2))  ^  (xtime[xtime[xtime[r3]]] ^ r3);
                s1=(xtime[xtime[xtime[r0]]] ^ r0)  ^  (xtime[xtime[xtime[r1]]] ^ (xtime[xtime[r1]]) ^ xtime[r1])  ^  (xtime[xtime[xtime[r2]]]^xtime[r2]^r2) ^ (xtime[xtime[xtime[r3]]]^(xtime[xtime[r3]] ^ r3));
                s2=(xtime[xtime[xtime[r0]]] ^ (xtime[xtime[r0]] ^ r0)) ^ (xtime[xtime[xtime[r1]]] ^ r1)  ^  (xtime[xtime[xtime[r2]]] ^ (xtime[xtime[r2]]) ^ xtime[r2])  ^  (xtime[xtime[xtime[r3]]]^xtime[r3]^r3);   
                s3=(xtime[xtime[xtime[r0]]]^xtime[r0]^r0)  ^  (xtime[xtime[xtime[r1]]]^(xtime[xtime[r1]] ^ r1))  ^  (xtime[xtime[xtime[r2]]] ^ r2)  ^  (xtime[xtime[xtime[r3]]] ^ (xtime[xtime[r3]]) ^ xtime[r3]);

                Tcl_Obj * l=Tcl_NewWideIntObj((s0 << 24) | (s1 << 16) | (s2 << 8) | s3 ); 
                Tcl_ListObjAppendElement(interp,resObj,l);
        }
        Tcl_SetObjResult(interp,resObj);
        return TCL_OK;
      }



      critcl::ccommand ShiftRows {cd interp obj objv} {
        SubBytes(interp,objv);
        int n0,n1,n2,n3;
        Tcl_Obj * tempObj = NULL;
        Tcl_Obj * resObj = Tcl_NewListObj (0,0);
        Tcl_WideInt n0l;
        Tcl_WideInt n1l;
        Tcl_WideInt n2l;
        Tcl_WideInt n3l;
        for (n0=0;n0 < 4; n0++) {
           n1= (n0 + 1) % 4;
           n2= (n0 + 2) % 4;
              n3= (n0 + 3) % 4;
                 Tcl_ListObjIndex(interp,objv[1],n0,&tempObj);
                Tcl_GetWideIntFromObj(interp,tempObj,&n0l);
                Tcl_ListObjIndex(interp,objv[1],n1,&tempObj);
                Tcl_GetWideIntFromObj(interp,tempObj,&n1l);
                Tcl_ListObjIndex(interp,objv[1],n2,&tempObj);
                Tcl_GetWideIntFromObj(interp,tempObj,&n2l);
                Tcl_ListObjIndex(interp,objv[1],n3,&tempObj);
                Tcl_GetWideIntFromObj(interp,tempObj,&n3l);
                Tcl_Obj * l=Tcl_NewWideIntObj((n0l & 0xff000000) | (n1l & 0x00ff0000) | (n2l & 0x0000ff00) | (n3l& 0x000000ff));
                Tcl_ListObjAppendElement(interp,resObj,l);
    }
        Tcl_SetObjResult(interp,resObj);
        return TCL_OK;
    }

     critcl::ccommand InvShiftRows {cd interp obj objv} {
        int n0,n1,n2,n3;
        Tcl_Obj * tempObj = NULL;
        Tcl_WideInt n0l;
        Tcl_WideInt n1l;
        Tcl_WideInt n2l;
        Tcl_WideInt n3l;
    for (n0=0;n0 < 4; n0++) {
        n1= (n0 + 1) % 4;
        n2= (n0 + 2) % 4;
        n3= (n0 + 3) % 4;
                Tcl_ListObjIndex(interp,objv[1],n0,&tempObj);
                Tcl_GetWideIntFromObj(interp,tempObj,&n0l);
                Tcl_ListObjIndex(interp,objv[1],n1,&tempObj);
                Tcl_GetWideIntFromObj(interp,tempObj,&n1l);
                Tcl_ListObjIndex(interp,objv[1],n2,&tempObj);
                Tcl_GetWideIntFromObj(interp,tempObj,&n2l);
                Tcl_ListObjIndex(interp,objv[1],n3,&tempObj);
                Tcl_GetWideIntFromObj(interp,tempObj,&n3l);
                Tcl_Obj * l=Tcl_NewWideIntObj((n0l & 0xff000000) | (n3l & 0x00ff0000) | (n2l & 0x0000ff00) | (n1l& 0x000000ff));
        }
        return TCL_OK;
   }

    critcl::ccommand SubWord {cd interp obj objv} {
        int s0,s1,s2,s3;
        Tcl_WideInt w;
        Tcl_GetWideIntFromObj(interp,objv[1],&w);
        s3=Sbox[((w >> 24) & 255)];
        s2=Sbox[((w >> 16) & 255)];
        s1=Sbox[((w >> 8 ) & 255)];
        s0=Sbox[(w        & 255)];
        Tcl_Obj * resObj = Tcl_NewWideIntObj((s3 << 24) | (s2 << 16) | (s1 << 8) | s0);
        Tcl_SetObjResult(interp,resObj);
        return TCL_OK;
   }
     critcl::ccommand InvSubWord {cd interp obj objv} {
        int s0,s1,s2,s3;
        Tcl_WideInt w;
        Tcl_GetWideIntFromObj(interp,objv[1],&w);
        s3=Xobs[((w >> 24) & 255)];
        s2=Xobs[((w >> 16) & 255)];
        s1=Xobs[((w >> 8 ) & 255)];
        s0=Xobs[(w        & 255)];
        Tcl_Obj * resObj = Tcl_NewWideIntObj((s3 << 24) | (s2 << 16) | (s1 << 8) | s0);
        Tcl_SetObjResult(interp,resObj);
        return TCL_OK;
     }

     critcl::ccommand SubBytes {cd interp obj objv} {
        int i;
        int s0,s1,s2,s3;
        Tcl_Obj * tempObj = NULL;
        Tcl_WideInt w;
        for (i=0;i<4;i++) {
                Tcl_ListObjIndex(interp,objv[1],i,&tempObj);
                Tcl_GetWideIntFromObj(interp,tempObj,&w);
                s3=Sbox[((w >> 24) & 255)];
                s2=Sbox[((w >> 16) & 255)];
                s1=Sbox[((w >> 8 ) & 255)];
                s0=Sbox[(w        & 255)];
                tempObj = Tcl_NewWideIntObj((s3 << 24) | (s2 << 16) | (s1 << 8) | s0);
        }
        return TCL_OK;
    }

      critcl::ccommand InvSubBytes {cd interp obj objv} {
        Tcl_Obj * listObj=InvShiftRows(interp,objv);
        int i;
        int s0,s1,s2,s3;
        Tcl_Obj * tempObj = NULL;
        Tcl_Obj * resObj = Tcl_NewListObj (0,0);
        Tcl_WideInt w;
        for (i=0;i<4;i++) {
                Tcl_ListObjIndex(interp,listObj,i,&tempObj);
                Tcl_GetWideIntFromObj(interp,tempObj,&w);
                s3=Xobs[((w >> 24) & 255)];
                s2=Xobs[((w >> 16) & 255)];
                s1=Xobs[((w >> 8 ) & 255)];
                s0=Xobs[(w        & 255)];
                tempObj = Tcl_NewWideIntObj((s3 << 24) | (s2 << 16) | (s1 << 8) | s0);
                Tcl_ListObjAppendElement(interp,resObj,tempObj);
        }
        Tcl_DecrRefCount(listObj);
        Tcl_SetObjResult(interp,resObj);
        return TCL_OK;
 }

it works together with the aes tcl package. delete from the tcl code these procs (or do replace on them) and source the file with the above source code and it shall work. I created a package from it aes-critcl that is simiar in API to the tcl aes package so on platfrom that this does not work I will work with the tcl code. I think this is also a good example of moving some tcl code to C code. all the methods do exactly as the tcl methods so one can compare then both and learn.

PT - 29-June-2008: This is coming along - but the approach is wrong. You should attempt to minimize the number of switches between C code and Tcl code. If you look at the other critcl packages in tcllib (rc4 or md5) you can see that we always split the implementation into a Tcl wrapper that handles the option processing and a core command that actually does all the work. So we can switch the rc4 implementation between two Tcl implemenations and one critcl one just using the [interp alias] command.

To get the maximum speed working with binary data you should try to preallocate the result object. In block ciphers you can calculate how many blocks will be output knowning the length of the input and cal use Tcl_SetByteArrayLength to setup the output buffer held in a byte array.

The md5 implementation uses the same kind of Init/Final/Process split and should be a good basis to work from.

yahalom: well you convinced me not to do it :-). it sounds very complicated and I am really more a tcl developer than a c developer. managing to do this critcl took a lot of work from me. to dive deeper into C means another week will pass. my bosses are used to tcl development pace - not C development pace.

PT: If you are on a time limit then you should make use of the open part of open source. Search for a BSD licensed implementation in C then wrap that up [1 ]. The original AES/Rjindael implementation was in C and public domain I believe.

yahalom: truely this is what I tried in the beginning. I did not find a proper C code that works simple and only needs wrapping. all the C code I found needed more work or did not encrypt properly so I could not decrypt with the tcl aes package. Checking what was wrong again was too compilcated for me. I think you miss one very important feature of critcl. for me it is a tool that allowes a not experinced C developers to speed up his tcl code without diving too deep into C. of course his code will be better/faster if he do it properly in C but then one can argue that you can develop all in C and drop Tcl. Well, the all point of developing in Tcl is fast development and ease. Critcl is also part of this.

yahalom: I finally followed the above suggestion and created proper Tcl binding to a C implementation of AES 256 CBC mode. The C code runs around 250 faster then the Tcl code and 30 times faster than the above critcl-aes implementation according to a basic test I made. For large data better use this code. The code is compatible with the Tcl code so the Tcl implementation will be able to decrypt the C implementation encrypted data (using the same key of course) and vice versa.

I use Critcl to create the binary. The code can be downloaded from https://github.com/yahalom/tclAES256

any code improvements are welcome.

RLE (2015-10-04): I wonder if you would see any further improvement in performance if you compiled in C99 mode and marked all the inner aes functions that you call in the main loop as 'inline' functions (http://www.greenend.org.uk/rjk/tech/inline.html ). Plus, using C99 mode you could drop the malloc/free pair and just utilize a C99 dynamically sized array within aes256Encrypt and aes256Decrypt functions (provided of course that the length in lBuf did not exceed your system/compilers stack size limits).