Optimised Assembler Loop
Go round only 32 times, not 256 (each pass handles 8 bytes: 32 x 8 = 256)
   mov ecx ,32       ; load counter with 32
l1: movq mm0,[esi]    ; load 8 bytes
    add esi,8       ; inc src pntr
    paddusb mm0,[edx] ; packed unsigned add bytes
    add edx,8       ; inc src pntr
    movq [edi],mm0    ; store 8 byte result
    add edi,8       ; inc dest pntr
    loop nz,l1       ; dec counter,
                      ; repeat non zero
Total of 6 instructions in the kernel (not counting loop control)