c# - Huge performance difference in byte-array access between x64 and x86 -


I'm currently doing micro-benchmarks to get a better understanding of CLR performance and version issues. The micro-benchmark in question XORs two byte arrays of 64 bytes each together.

I always write a reference implementation in safe .NET first, before I try to beat the .NET Framework implementation with unsafe code and so on.

my reference implementation in question is:

for (int p = 0; p < 64; p++)     a[p] ^= b[p]; 

where a and b are each a byte[] created with new byte[64] and filled with data from the .NET RNG.

This code runs twice as fast on x64 as on x86. At first I thought that was OK, because the JIT would make something like *long ^= *long out of it on x64 and *int ^= *int on x86.

But my optimized unsafe version:

fixed (byte* pa = a) fixed (byte* pb = b) {     long* ppa = (long*)pa;     long* ppb = (long*)pb;      for (int p = 0; p < 8; p++)     {         *ppa ^= *ppb;          ppa++;         ppb++;     } } 

runs about 4 times faster than the x64 reference implementation. So my thoughts about a *long ^= *long and *int ^= *int optimization by the compiler are not right.

Where does this huge performance difference in the reference implementation come from? And now that I have posted the ASM code: why can't the C# compiler also optimize the x86 version that way?

il code x86 , x64 reference implementation (they identical):

il_0059: ldloc.3 il_005a: ldloc.s p il_005c: ldelema [mscorlib]system.byte il_0061: dup il_0062: ldobj [mscorlib]system.byte il_0067: ldloc.s b il_0069: ldloc.s p il_006b: ldelem.u1 il_006c: xor il_006d: conv.u1 il_006e: stobj [mscorlib]system.byte il_0073: ldloc.s p il_0075: ldc.i4.1 il_0076: add il_0077: stloc.s p  il_0079: ldloc.s p il_007b: ldc.i4.s 64 il_007d: blt.s il_0059 

I think ldloc.3 is a.

resulting asm code x86:

                (int p = 0; p < 64; p++) 010900df  xor         edx,edx 010900e1  mov         edi,dword ptr [ebx+4]                     a[p] ^= b[p]; 010900e4  cmp         edx,edi 010900e6  jae         0109010c 010900e8  lea         esi,[ebx+edx+8] 010900ec  mov         eax,dword ptr [ebp-14h] 010900ef  cmp         edx,dword ptr [eax+4] 010900f2  jae         0109010c 010900f4  movzx       eax,byte ptr [eax+edx+8] 010900f9  xor         byte ptr [esi],al                 (int p = 0; p < 64; p++) 010900fb  inc         edx 010900fc  cmp         edx,40h 010900ff  jl          010900e4 

resulting asm code x64:

                    a[p] ^= b[p]; 00007fff4a8b01c6  mov         eax,3eh 00007fff4a8b01cb  cmp         rax,rcx 00007fff4a8b01ce  jae         00007fff4a8b0245 00007fff4a8b01d0  mov         rax,qword ptr [rbx+8] 00007fff4a8b01d4  mov         r9d,3eh 00007fff4a8b01da  cmp         r9,rax 00007fff4a8b01dd  jae         00007fff4a8b0245 00007fff4a8b01df  mov         r9d,3fh 00007fff4a8b01e5  cmp         r9,rcx 00007fff4a8b01e8  jae         00007fff4a8b0245 00007fff4a8b01ea  mov         ecx,3fh 00007fff4a8b01ef  cmp         rcx,rax 00007fff4a8b01f2  jae         00007fff4a8b0245 00007fff4a8b01f4  nop         word ptr [rax+rax] 00007fff4a8b0200  movzx       ecx,byte ptr [rdi+rdx+10h] 00007fff4a8b0205  movzx       eax,byte ptr [rbx+rdx+10h] 00007fff4a8b020a  xor         ecx,eax 00007fff4a8b020c  mov         byte ptr [rdi+rdx+10h],cl 00007fff4a8b0210  movzx       ecx,byte ptr [rdi+rdx+11h] 00007fff4a8b0215  movzx       eax,byte ptr [rbx+rdx+11h] 00007fff4a8b021a  xor         ecx,eax 00007fff4a8b021c  mov         byte ptr [rdi+rdx+11h],cl 00007fff4a8b0220  add         rdx,2                 (int p = 0; p < 64; p++) 00007fff4a8b0224  cmp         rdx,40h 00007fff4a8b0228  jl          00007fff4a8b0200 

You've made a classic mistake: attempting performance analysis on non-optimized code. Here is a complete minimal compilable example:

using System;

namespace SO30558357
{
    /// <summary>
    /// Minimal repro: XORs two 64-byte arrays so the JIT-generated code for
    /// the loop can be inspected in a Release build on x86 and x64.
    /// </summary>
    class Program
    {
        /// <summary>
        /// XORs the first 64 bytes of <paramref name="b"/> into
        /// <paramref name="a"/>, in place.
        /// </summary>
        /// <param name="a">Destination array; must hold at least 64 bytes.</param>
        /// <param name="b">Source array; must hold at least 64 bytes.</param>
        static void XorArray(byte[] a, byte[] b)
        {
            // The trip count is deliberately the literal 64: the article's
            // disassembly listings compare against 40h for this loop.
            for (int p = 0; p < 64; p++)
                a[p] ^= b[p];
        }

        static void Main(string[] args)
        {
            byte[] a = new byte[64];
            byte[] b = new byte[64];
            Random r = new Random();

            r.NextBytes(a);
            r.NextBytes(b);

            XorArray(a, b);
            Console.ReadLine();  // when the program stops here,
                                 // use Debug -> Attach to Process
        }
    }
}

I compiled it using Visual Studio 2013 Update 3, with the default "Release Build" settings for a C# console application except for the architecture, and ran it with CLR v4.0.30319. Oh, I think I have Roslyn installed, but that shouldn't replace the JIT, only the translation to MSIL, which is identical on both architectures.

the actual x86 assembly xorarray:

006f00d8  push        ebp   006f00d9  mov         ebp,esp   006f00db  push        edi   006f00dc  push        esi   006f00dd  push        ebx   006f00de  push        eax   006f00df  mov         dword ptr [ebp-10h],edx   006f00e2  xor         edi,edi   006f00e4  mov         ebx,dword ptr [ecx+4]   006f00e7  cmp         edi,ebx   006f00e9  jae         006f010f   006f00eb  lea         esi,[ecx+edi+8]   006f00ef  movzx       eax,byte ptr [esi]   006f00f2  mov         edx,dword ptr [ebp-10h]   006f00f5  cmp         edi,dword ptr [edx+4]   006f00f8  jae         006f010f   006f00fa  movzx       edx,byte ptr [edx+edi+8]   006f00ff  xor         eax,edx   006f0101  mov         byte ptr [esi],al   006f0103  inc         edi   006f0104  cmp         edi,40h   006f0107  jl          006f00e7   006f0109  pop         ecx   006f010a  pop         ebx   006f010b  pop         esi   006f010c  pop         edi   006f010d  pop         ebp   006f010e  ret 

and x64:

00007ffd4a3000fb  mov         rax,qword ptr [rsi+8]   00007ffd4a3000ff  mov         rax,qword ptr [rbp+8]   00007ffd4a300103  nop         word ptr [rax+rax]   00007ffd4a300110  movzx       ecx,byte ptr [rsi+rdx+10h]   00007ffd4a300115  movzx       eax,byte ptr [rdx+rbp+10h]   00007ffd4a30011a  xor         ecx,eax   00007ffd4a30011c  mov         byte ptr [rsi+rdx+10h],cl   00007ffd4a300120  movzx       ecx,byte ptr [rsi+rdx+11h]   00007ffd4a300125  movzx       eax,byte ptr [rdx+rbp+11h]   00007ffd4a30012a  xor         ecx,eax   00007ffd4a30012c  mov         byte ptr [rsi+rdx+11h],cl   00007ffd4a300130  movzx       ecx,byte ptr [rsi+rdx+12h]   00007ffd4a300135  movzx       eax,byte ptr [rdx+rbp+12h]   00007ffd4a30013a  xor         ecx,eax   00007ffd4a30013c  mov         byte ptr [rsi+rdx+12h],cl   00007ffd4a300140  movzx       ecx,byte ptr [rsi+rdx+13h]   00007ffd4a300145  movzx       eax,byte ptr [rdx+rbp+13h]   00007ffd4a30014a  xor         ecx,eax   00007ffd4a30014c  mov         byte ptr [rsi+rdx+13h],cl   00007ffd4a300150  add         rdx,4   00007ffd4a300154  cmp         rdx,40h   00007ffd4a300158  jl          00007ffd4a300110 

Bottom line: the x64 optimizer worked a lot better. While it is still using byte-sized transfers, it unrolled the loop by a factor of 4 and inlined the function call.

Since in the x86 version the loop control logic corresponds to roughly half the code, the unrolling can be expected to yield almost twice the performance.

The inlining allowed the compiler to perform context-sensitive optimization, knowing the size of the arrays and eliminating the runtime bounds check.

If we inline by hand, the x86 compiler now yields:

00a000b1  xor         edi,edi   00a000b3  mov         eax,dword ptr [ebp-10h]   00a000b6  mov         ebx,dword ptr [eax+4]                   a[p] ^= b[p]; 00a000b9  mov         eax,dword ptr [ebp-10h]   00a000bc  cmp         edi,ebx   00a000be  jae         00a000f5   00a000c0  lea         esi,[eax+edi+8]   00a000c4  movzx       eax,byte ptr [esi]   00a000c7  mov         edx,dword ptr [ebp-14h]   00a000ca  cmp         edi,dword ptr [edx+4]   00a000cd  jae         00a000f5   00a000cf  movzx       edx,byte ptr [edx+edi+8]   00a000d4  xor         eax,edx   00a000d6  mov         byte ptr [esi],al               (int p = 0; p< 64; p++) 00a000d8  inc         edi   00a000d9  cmp         edi,40h   00a000dc  jl          00a000b9  

That didn't help much: the loop still is not unrolled, and the runtime bounds checking is still there.

Notably, the x86 compiler found a register (EBX) to cache the length of one array, but ran out of registers and was forced to read the other array's length from memory on every iteration. This should be a "cheap" L1 cache access, but that's still slower than a register access, and much slower than no bounds check at all.


Comments

Popular posts from this blog

angularjs - ADAL JS Angular- WebAPI add a new role claim to the token -

php - CakePHP HttpSockets send array of paramms -

node.js - Using Node without global install -