Optimised Assembler Loop
; Saturating byte-wise add of two source buffers into a destination buffer,
; 8 bytes per pass using MMX (32-bit x86, Intel operand order).
; In:    esi = first source, edx = second source, edi = destination
; Out:   32 * 8 = 256 result bytes stored starting at the incoming edi
; Clobb: ecx (0 on exit), esi/edx/edi (each advanced by 256), mm0, flags
; NOTE:  the MMX registers alias the x87 register stack; issue emms before
;        any x87 floating-point code that follows this loop.
        mov     ecx, 32             ; 32 passes of 8 bytes each
l1:     movq    mm0, [esi]          ; mm0 = next 8 bytes of first source
        add     esi, 8              ; advance first source pointer
        paddusb mm0, [edx]          ; per-byte unsigned add, saturating at 255
        add     edx, 8              ; advance second source pointer
        movq    [edi], mm0          ; store the 8 result bytes
        add     edi, 8              ; advance destination pointer
        dec     ecx                 ; one pass done
        jnz     l1                  ; repeat until the counter reaches zero
The loop goes round only 32 times, not 256, because each pass processes 8 bytes
Total of 6 instructions in kernel