I wrote a piece of code to calculate pi using the Monte Carlo method, running on a 2013 MacBook Air with a 1.7 GHz Intel Core i7 (it seems to be the i7-4650U). When the loop count is 10^8 it took 2–3 seconds, and when the loop count is 10^9 it took 25 seconds.
import Foundation

/// Returns a pseudo-random `Double` scaled linearly into `[lowerbound, upperbound]`.
///
/// The value is derived from the C library `rand()`, so both bounds are
/// attainable and the result can take at most `RAND_MAX + 1` distinct values.
/// `srand` is not called anywhere in this script, so the generator runs from
/// its default seed.
///
/// - Parameters:
///   - lowerbound: Value produced when `rand()` returns 0. Unlabeled at the
///     call site, matching the existing calls `randomnumber(-1.0, upperbound: 1.0)`.
///   - upperbound: Value produced when `rand()` returns `RAND_MAX`.
/// - Returns: `lowerbound + rand() / RAND_MAX * (upperbound - lowerbound)`.
func randomnumber(_ lowerbound: Double, upperbound: Double) -> Double {
    return lowerbound + Double(rand()) / Double(RAND_MAX) * (upperbound - lowerbound)
}

// Number of random points to draw in the square [-1, 1] x [-1, 1].
let pointnumber = 1000000000
// Number of drawn points that landed inside (or on) the unit circle.
var pointinsidecount = 0

// Half-open range: exactly `pointnumber` samples are drawn, so the divisor
// used for `result` matches the number of iterations. (A closed range
// `0...pointnumber` would draw one extra sample.)
for _ in 0..<pointnumber {
    let x = randomnumber(-1.0, upperbound: 1.0)
    let y = randomnumber(-1.0, upperbound: 1.0)
    if x * x + y * y <= 1 {
        pointinsidecount += 1
    }
}

// area(circle) / area(square) = pi / 4, so pi ~= 4 * (inside / total).
let result = Double(pointinsidecount) / Double(pointnumber) * 4
let pistring = String(format: "%.50f", result)
print("pi \(pistring)")
I ran "di -n randomnumber" to get the assembly code of the randomnumber function:
swifttest`swifttest.randomnumber (swift.double, upperbound : swift.double) -> swift.double: 0x10023c160 <+0>: pushq %rbp 0x10023c161 <+1>: movq %rsp, %rbp 0x10023c164 <+4>: subq $0x20, %rsp 0x10023c168 <+8>: movsd %xmm0, -0x8(%rbp) 0x10023c16d <+13>: movsd %xmm1, -0x10(%rbp) 0x10023c172 <+18>: movsd %xmm0, -0x18(%rbp) 0x10023c177 <+23>: movsd %xmm1, -0x20(%rbp) 0x10023c17c <+28>: callq 0x10027585e ; symbol stub for: rand 0x10023c181 <+33>: movsd 0x3bc1f(%rip), %xmm0 ; witness table offset swift.valistbuilder.__allocating_init (swift.valistbuilder.type)() -> swift.valistbuilder + 352 0x10023c189 <+41>: cvtsi2sdl %eax, %xmm1 0x10023c18d <+45>: divsd %xmm0, %xmm1 0x10023c191 <+49>: movsd -0x20(%rbp), %xmm0 0x10023c196 <+54>: movsd -0x18(%rbp), %xmm2 0x10023c19b <+59>: subsd %xmm2, %xmm0 0x10023c19f <+63>: mulsd %xmm0, %xmm1 0x10023c1a3 <+67>: addsd %xmm1, %xmm2 0x10023c1a7 <+71>: movaps %xmm2, %xmm0 0x10023c1aa <+74>: addq $0x20, %rsp 0x10023c1ae <+78>: popq %rbp 0x10023c1af <+79>: retq
And I ran "di -f" to get the assembly code of the whole file:
swifttest`main: 0x10023bcd0 <+0>: pushq %rbp 0x10023bcd1 <+1>: movq %rsp, %rbp 0x10023bcd4 <+4>: subq $0x120, %rsp 0x10023bcdb <+11>: leaq 0x9340e(%rip), %rax ; globalinit_33_1bdf70ffc18749bab495a73b459ed2f0_token6 0x10023bce2 <+18>: leaq 0x933ff(%rip), %rcx ; static swift.process._argc : swift.int32 0x10023bce9 <+25>: movl %edi, (%rcx) 0x10023bceb <+27>: cmpq $-0x1, (%rax) 0x10023bcf2 <+34>: movq %rsi, -0x60(%rbp) 0x10023bcf6 <+38>: je 0x10023bd0e ; <+62> @ main.swift 0x10023bcf8 <+40>: leaq 0x933f1(%rip), %rdi ; globalinit_33_1bdf70ffc18749bab495a73b459ed2f0_token6 0x10023bcff <+47>: leaq -0x99d56(%rip), %rax ; globalinit_33_1bdf70ffc18749bab495a73b459ed2f0_func6 0x10023bd06 <+54>: movq %rax, %rsi 0x10023bd09 <+57>: callq 0x100266870 ; swift_once 0x10023bd0e <+62>: leaq 0x933e3(%rip), %rax ; static swift.process._unsafeargv : swift.unsafemutablepointer<swift.unsafemutablepointer<swift.int8>> 0x10023bd15 <+69>: movq -0x60(%rbp), %rcx 0x10023bd19 <+73>: movq %rcx, (%rax) 0x10023bd1c <+76>: movq $0x989680, 0x93499(%rip) ; lazy cache variable type metadata swift.valistbuilder + 4 0x10023bd27 <+87>: movq $0x0, 0x93496(%rip) ; swifttest.pointnumber : swift.int + 4 0x10023bd32 <+98>: movq 0x93487(%rip), %rax ; swifttest.pointnumber : swift.int 0x10023bd39 <+105>: movq %rax, -0x68(%rbp) 0x10023bd3d <+109>: xorl %eax, %eax 0x10023bd3f <+111>: movl %eax, %ecx 0x10023bd41 <+113>: movq -0x68(%rbp), %rdx 0x10023bd45 <+117>: cmpq %rdx, %rcx 0x10023bd48 <+120>: setle %sil 0x10023bd4c <+124>: testb $0x1, %sil 0x10023bd50 <+128>: jne 0x10023bd54 ; <+132> @ main.swift:17 0x10023bd52 <+130>: jmp 0x10023bdb3 ; <+227> @ main.swift:17 0x10023bd54 <+132>: movq -0x68(%rbp), %rax 0x10023bd58 <+136>: incq %rax 0x10023bd5b <+139>: seto %cl 0x10023bd5e <+142>: movq -0x68(%rbp), %rdx 0x10023bd62 <+146>: cmpq %rdx, %rax 0x10023bd65 <+149>: setg %sil 0x10023bd69 <+153>: testb $0x1, %sil 0x10023bd6d <+157>: movb %cl, -0x69(%rbp) 0x10023bd70 <+160>: jne 0x10023bd74 ; <+164> @ main.swift:17 
0x10023bd72 <+162>: jmp 0x10023bd87 ; <+183> @ main.swift:17 0x10023bd74 <+164>: movq -0x68(%rbp), %rax 0x10023bd78 <+168>: incq %rax 0x10023bd7b <+171>: seto %cl 0x10023bd7e <+174>: movq %rax, -0x78(%rbp) 0x10023bd82 <+178>: movb %cl, -0x79(%rbp) 0x10023bd85 <+181>: jmp 0x10023bddf ; <+271> @ main.swift:17 0x10023bd87 <+183>: leaq 0x418a2(%rip), %rdi ; "fatal error" 0x10023bd8e <+190>: movl $0xb, %eax 0x10023bd93 <+195>: movl %eax, %esi 0x10023bd95 <+197>: movl $0x2, %eax 0x10023bd9a <+202>: leaq 0x487af(%rip), %rcx ; "range end index has no valid successor" 0x10023bda1 <+209>: movl $0x26, %edx 0x10023bda6 <+214>: movl %edx, %r8d 0x10023bda9 <+217>: movl %eax, %edx 0x10023bdab <+219>: movl %eax, %r9d 0x10023bdae <+222>: callq 0x1001a80f0 ; function signature specialization <arg[0] = exploded, arg[1] = exploded, arg[2] = dead, arg[3] = dead> of swift._fatalerrormessage (swift.staticstring, swift.staticstring, swift.staticstring, swift.uint) -> () 0x10023bdb3 <+227>: leaq 0x41876(%rip), %rdi ; "fatal error" 0x10023bdba <+234>: movl $0xb, %eax 0x10023bdbf <+239>: movl %eax, %esi 0x10023bdc1 <+241>: movl $0x2, %eax 0x10023bdc6 <+246>: leaq 0x48753(%rip), %rcx ; "can't form range end < start" 0x10023bdcd <+253>: movl $0x21, %edx 0x10023bdd2 <+258>: movl %edx, %r8d 0x10023bdd5 <+261>: movl %eax, %edx 0x10023bdd7 <+263>: movl %eax, %r9d 0x10023bdda <+266>: callq 0x1001a80f0 ; function signature specialization <arg[0] = exploded, arg[1] = exploded, arg[2] = dead, arg[3] = dead> of swift._fatalerrormessage (swift.staticstring, swift.staticstring, swift.staticstring, swift.uint) -> () 0x10023bddf <+271>: leaq -0x30(%rbp), %rdi 0x10023bde3 <+275>: leaq -0x20(%rbp), %rsi 0x10023bde7 <+279>: movq $0x0, -0x20(%rbp) 0x10023bdef <+287>: movq -0x78(%rbp), %rax 0x10023bdf3 <+291>: movq %rax, -0x18(%rbp) 0x10023bdf7 <+295>: callq 0x1000362e0 ; generic specialization <swift.int swift.int : swift.forwardindextype in swift, swift.int swift.int : swift._signedintegertype in swift, 
swift.int swift.int : swift._builtinintegerliteralconvertible in swift, swift.int> of swift.range.generate <a a: swift.forwardindextype> (swift.range<a>)() -> swift.rangegenerator<a> 0x10023bdfc <+300>: movq -0x30(%rbp), %rax 0x10023be00 <+304>: movq -0x28(%rbp), %rsi 0x10023be04 <+308>: movq %rax, -0x10(%rbp) 0x10023be08 <+312>: movq %rsi, -0x8(%rbp) 0x10023be0c <+316>: leaq -0x40(%rbp), %rdi 0x10023be10 <+320>: leaq -0x10(%rbp), %rsi 0x10023be14 <+324>: callq 0x100036960 ; generic specialization <swift.int swift.int : swift.forwardindextype in swift, swift.int swift.int : swift._signedintegertype in swift, swift.int swift.int : swift._builtinintegerliteralconvertible in swift, swift.int> of swift.rangegenerator.next <a a: swift.forwardindextype> (inout swift.rangegenerator<a>)() -> swift.optional<a> 0x10023be19 <+329>: movq -0x40(%rbp), %rsi 0x10023be1d <+333>: movb -0x38(%rbp), %al 0x10023be20 <+336>: xorb $0x1, %al 0x10023be22 <+338>: testb $0x1, %al 0x10023be24 <+340>: movq %rsi, -0x88(%rbp) 0x10023be2b <+347>: jne 0x10023be32 ; <+354> @ main.swift:17 0x10023be2d <+349>: jmp 0x10023bed4 ; <+516> @ main.swift:23 0x10023be32 <+354>: movsd 0x3bf66(%rip), %xmm0 ; witness table offset swift.valistbuilder.__allocating_init (swift.valistbuilder.type)() -> swift.valistbuilder + 344 0x10023be3a <+362>: movsd 0x3bf56(%rip), %xmm1 ; witness table offset swift.valistbuilder.__allocating_init (swift.valistbuilder.type)() -> swift.valistbuilder + 336 0x10023be42 <+370>: movq -0x88(%rbp), %rax 0x10023be49 <+377>: movq %rax, -0x48(%rbp) 0x10023be4d <+381>: callq 0x10023c160 ; swifttest.randomnumber (swift.double, upperbound : swift.double) -> swift.double @ main.swift:11 0x10023be52 <+386>: movsd 0x3bf46(%rip), %xmm1 ; witness table offset swift.valistbuilder.__allocating_init (swift.valistbuilder.type)() -> swift.valistbuilder + 344 0x10023be5a <+394>: movsd 0x3bf36(%rip), %xmm2 ; witness table offset swift.valistbuilder.__allocating_init (swift.valistbuilder.type)() -> 
swift.valistbuilder + 336 0x10023be62 <+402>: movsd %xmm0, -0x50(%rbp) 0x10023be67 <+407>: movsd %xmm0, -0x90(%rbp) 0x10023be6f <+415>: movaps %xmm1, %xmm0 0x10023be72 <+418>: movaps %xmm2, %xmm1 0x10023be75 <+421>: callq 0x10023c160 ; swifttest.randomnumber (swift.double, upperbound : swift.double) -> swift.double @ main.swift:11 0x10023be7a <+426>: movsd 0x3bf16(%rip), %xmm1 ; witness table offset swift.valistbuilder.__allocating_init (swift.valistbuilder.type)() -> swift.valistbuilder + 336 0x10023be82 <+434>: movsd %xmm0, -0x58(%rbp) 0x10023be87 <+439>: movsd -0x90(%rbp), %xmm2 0x10023be8f <+447>: mulsd %xmm2, %xmm2 0x10023be93 <+451>: mulsd %xmm0, %xmm0 0x10023be97 <+455>: addsd %xmm0, %xmm2 0x10023be9b <+459>: ucomisd %xmm2, %xmm1 0x10023be9f <+463>: jb 0x10023becf ; <+511> @ main.swift:23 0x10023bea1 <+465>: movq 0x93320(%rip), %rax ; swifttest.pointinsidecount : swift.int 0x10023bea8 <+472>: incq %rax 0x10023beab <+475>: seto %cl 0x10023beae <+478>: movq %rax, -0x98(%rbp) 0x10023beb5 <+485>: movb %cl, -0x99(%rbp) 0x10023bebb <+491>: jo 0x10023c155 ; <+1157> @ main.swift:21 0x10023bec1 <+497>: movq -0x98(%rbp), %rax 0x10023bec8 <+504>: movq %rax, 0x932f9(%rip) ; swifttest.pointinsidecount : swift.int 0x10023becf <+511>: jmp 0x10023be0c ; <+316> @ main.swift:17 0x10023bed4 <+516>: movsd 0x3beb4(%rip), %xmm0 ; witness table offset swift.valistbuilder.__allocating_init (swift.valistbuilder.type)() -> swift.valistbuilder + 328 0x10023bedc <+524>: cvtsi2sdq 0x932e3(%rip), %xmm1 ; swifttest.pointinsidecount : swift.int 0x10023bee5 <+533>: cvtsi2sdq 0x932d2(%rip), %xmm2 ; swifttest.pointnumber : swift.int 0x10023beee <+542>: divsd %xmm2, %xmm1 0x10023bef2 <+546>: mulsd %xmm0, %xmm1 0x10023bef6 <+550>: movsd %xmm1, 0x932d2(%rip) ; swifttest.result : swift.double 0x10023befe <+558>: callq 0x10023c1b0 ; type metadata accessor swift.cvarargtype 0x10023bf03 <+563>: movl $0x1, %ecx 0x10023bf08 <+568>: movl %ecx, %edi 0x10023bf0a <+570>: movq %rax, %rsi 0x10023bf0d 
<+573>: callq 0x100045770 ; swift._allocateuninitializedarray <a> (builtin.word) -> (swift.array<a>, builtin.rawpointer) 0x10023bf12 <+578>: leaq 0x4865e(%rip), %rdi ; "%.50f" 0x10023bf19 <+585>: movl $0x5, %ecx 0x10023bf1e <+590>: movl %ecx, %esi 0x10023bf20 <+592>: movl $0x1, %ecx 0x10023bf25 <+597>: movq %rdx, -0xa8(%rbp) 0x10023bf2c <+604>: movl %ecx, %edx 0x10023bf2e <+606>: movq %rax, -0xb0(%rbp) 0x10023bf35 <+613>: callq 0x100001aa0 ; swift.string.init (swift.string.type)(_builtinstringliteral : builtin.rawpointer, bytesize : builtin.word, isascii : builtin.int1) -> swift.string 0x10023bf3a <+618>: leaq 0x667b7(%rip), %rsi ; protocol witness table swift.double : swift.cvarargtype in swift 0x10023bf41 <+625>: leaq 0x6a258(%rip), %rdi ; direct type metadata swift.double 0x10023bf48 <+632>: addq $0x8, %rdi 0x10023bf4f <+639>: movq -0xa8(%rbp), %r8 0x10023bf56 <+646>: movq %rdi, 0x18(%r8) 0x10023bf5a <+650>: movq %rsi, 0x20(%r8) 0x10023bf5e <+654>: movsd 0x9326a(%rip), %xmm0 ; swifttest.result : swift.double 0x10023bf66 <+662>: movsd %xmm0, (%r8) 0x10023bf6b <+667>: movq %rax, %rdi 0x10023bf6e <+670>: movq %rdx, %rsi 0x10023bf71 <+673>: movq %rcx, %rdx 0x10023bf74 <+676>: movq -0xb0(%rbp), %rcx 0x10023bf7b <+683>: callq 0x10002dfa0 ; ext.foundation.swift.string.init (swift.string.type)(format : swift.string, swift.array<swift.cvarargtype>...) 
-> swift.string 0x10023bf80 <+688>: movq %rax, 0x93251(%rip) ; swifttest.pistring : swift.string 0x10023bf87 <+695>: movq %rdx, 0x93252(%rip) ; swifttest.pistring : swift.string + 8 0x10023bf8e <+702>: movq %rcx, 0x93253(%rip) ; swifttest.pistring : swift.string + 16 -> 0x10023bf95 <+709>: callq 0x10023c200 ; type metadata accessor protocol<> 0x10023bf9a <+714>: movl $0x1, %r9d 0x10023bfa0 <+720>: movl %r9d, %edi 0x10023bfa3 <+723>: movq %rax, %rsi 0x10023bfa6 <+726>: callq 0x100045770 ; swift._allocateuninitializedarray <a> (builtin.word) -> (swift.array<a>, builtin.rawpointer) 0x10023bfab <+731>: movl $0x3, %r9d 0x10023bfb1 <+737>: movl %r9d, %edi 0x10023bfb4 <+740>: leaq 0x6fe25(%rip), %rcx ; direct type metadata swift.string 0x10023bfbb <+747>: addq $0x8, %rcx 0x10023bfc2 <+754>: movq %rcx, 0x18(%rdx) 0x10023bfc6 <+758>: movq %rcx, %rsi 0x10023bfc9 <+761>: movq %rax, -0xb8(%rbp) 0x10023bfd0 <+768>: movq %rdx, -0xc0(%rbp) 0x10023bfd7 <+775>: callq 0x100045770 ; swift._allocateuninitializedarray <a> (builtin.word) -> (swift.array<a>, builtin.rawpointer) 0x10023bfdc <+780>: leaq 0x4859a(%rip), %rdi ; "pi " 0x10023bfe3 <+787>: movl $0x6, %r9d 0x10023bfe9 <+793>: movl %r9d, %esi 0x10023bfec <+796>: movl $0x1, %r9d 0x10023bff2 <+802>: movq %rdx, -0xc8(%rbp) 0x10023bff9 <+809>: movl %r9d, %edx 0x10023bffc <+812>: movq %rax, -0xd0(%rbp) 0x10023c003 <+819>: callq 0x100001aa0 ; swift.string.init (swift.string.type)(_builtinstringliteral : builtin.rawpointer, bytesize : builtin.word, isascii : builtin.int1) -> swift.string 0x10023c008 <+824>: movq %rax, %rdi 0x10023c00b <+827>: movq %rdx, %rsi 0x10023c00e <+830>: movq %rcx, %rdx 0x10023c011 <+833>: callq 0x1000470d0 ; swift.string.init (swift.string.type)(stringinterpolationsegment : swift.string) -> swift.string 0x10023c016 <+838>: movq -0xc8(%rbp), %rsi 0x10023c01d <+845>: movq %rax, (%rsi) 0x10023c020 <+848>: movq %rdx, 0x8(%rsi) 0x10023c024 <+852>: movq %rcx, 0x10(%rsi) 0x10023c028 <+856>: movq 0x931a9(%rip), %rdi ; 
swifttest.pistring : swift.string 0x10023c02f <+863>: movq 0x931aa(%rip), %rsi ; swifttest.pistring : swift.string + 8 0x10023c036 <+870>: movq 0x931ab(%rip), %rax ; swifttest.pistring : swift.string + 16 0x10023c03d <+877>: movq %rdi, -0xd8(%rbp) 0x10023c044 <+884>: movq %rax, %rdi 0x10023c047 <+887>: movq %rsi, -0xe0(%rbp) 0x10023c04e <+894>: movq %rax, -0xe8(%rbp) 0x10023c055 <+901>: callq 0x100268160 ; swift_unknownretain 0x10023c05a <+906>: movq -0xd8(%rbp), %rdi 0x10023c061 <+913>: movq -0xe0(%rbp), %rsi 0x10023c068 <+920>: movq -0xe8(%rbp), %rdx 0x10023c06f <+927>: callq 0x1000470d0 ; swift.string.init (swift.string.type)(stringinterpolationsegment : swift.string) -> swift.string 0x10023c074 <+932>: leaq 0x40d15(%rip), %rdi ; "" 0x10023c07b <+939>: xorl %r9d, %r9d 0x10023c07e <+942>: movl %r9d, %esi 0x10023c081 <+945>: movl $0x1, %r9d 0x10023c087 <+951>: movq -0xc8(%rbp), %r8 0x10023c08e <+958>: movq %rax, 0x18(%r8) 0x10023c092 <+962>: movq %rdx, 0x20(%r8) 0x10023c096 <+966>: movq %rcx, 0x28(%r8) 0x10023c09a <+970>: movl %r9d, %edx 0x10023c09d <+973>: callq 0x100001aa0 ; swift.string.init (swift.string.type)(_builtinstringliteral : builtin.rawpointer, bytesize : builtin.word, isascii : builtin.int1) -> swift.string 0x10023c0a2 <+978>: movq %rax, %rdi 0x10023c0a5 <+981>: movq %rdx, %rsi 0x10023c0a8 <+984>: movq %rcx, %rdx 0x10023c0ab <+987>: callq 0x1000470d0 ; swift.string.init (swift.string.type)(stringinterpolationsegment : swift.string) -> swift.string 0x10023c0b0 <+992>: movq -0xc8(%rbp), %rsi 0x10023c0b7 <+999>: movq %rax, 0x30(%rsi) 0x10023c0bb <+1003>: movq %rdx, 0x38(%rsi) 0x10023c0bf <+1007>: movq %rcx, 0x40(%rsi) 0x10023c0c3 <+1011>: movq -0xd0(%rbp), %rdi 0x10023c0ca <+1018>: callq 0x1000470c0 ; swift.string.init (swift.string.type)(stringinterpolation : swift.array<swift.string>...) 
-> swift.string 0x10023c0cf <+1023>: movq -0xc0(%rbp), %rsi 0x10023c0d6 <+1030>: movq %rax, (%rsi) 0x10023c0d9 <+1033>: movq %rdx, 0x8(%rsi) 0x10023c0dd <+1037>: movq %rcx, 0x10(%rsi) 0x10023c0e1 <+1041>: callq 0x10012aa70 ; swift.(print (swift.array<protocol<>>, separator : swift.string, terminator : swift.string) -> ()).(default argument 1) 0x10023c0e6 <+1046>: movq %rax, -0xf0(%rbp) 0x10023c0ed <+1053>: movq %rdx, -0xf8(%rbp) 0x10023c0f4 <+1060>: movq %rcx, -0x100(%rbp) 0x10023c0fb <+1067>: callq 0x10012aa90 ; swift.(print (swift.array<protocol<>>, separator : swift.string, terminator : swift.string) -> ()).(default argument 2) 0x10023c100 <+1072>: movq -0xb8(%rbp), %rdi 0x10023c107 <+1079>: movq -0xf0(%rbp), %rsi 0x10023c10e <+1086>: movq -0xf8(%rbp), %r8 0x10023c115 <+1093>: movq %rdx, -0x108(%rbp) 0x10023c11c <+1100>: movq %r8, %rdx 0x10023c11f <+1103>: movq -0x100(%rbp), %r10 0x10023c126 <+1110>: movq %rcx, -0x110(%rbp) 0x10023c12d <+1117>: movq %r10, %rcx 0x10023c130 <+1120>: movq %rax, %r8 0x10023c133 <+1123>: movq -0x108(%rbp), %r9 0x10023c13a <+1130>: movq -0x110(%rbp), %rax 0x10023c141 <+1137>: movq %rax, (%rsp) 0x10023c145 <+1141>: callq 0x10012aab0 ; swift.print (swift.array<protocol<>>, separator : swift.string, terminator : swift.string) -> () 0x10023c14a <+1146>: xorl %eax, %eax 0x10023c14c <+1148>: addq $0x120, %rsp 0x10023c153 <+1155>: popq %rbp 0x10023c154 <+1156>: retq 0x10023c155 <+1157>: ud2 0x10023c157 <+1159>: nopw (%rax,%rax)
Can I estimate the time consumption as below?
The randomnumber function consists of 20 instructions, hence the calculations of x and y consist of 40 instructions. The increment of pointinsidecount executes several more instructions, so in the loop there are 4–5 dozen instructions (assume 50). The time consumption outside of the loop can be ignored.
If I assume the 4650U runs 2 instructions per cycle in this program on average, then when the loop count is 10^8, the whole time consumption is 50 * 10^8 / (1.7 * 10^9 * 2) ≈ 1.5 seconds.
You can't assume the same IPC for all loops. Even if you're sure one loop runs at 2 IPC, that doesn't tell you anything about other loops. You have to analyse the code to find the bottlenecks and the amount of parallelism.
If you can safely assume there are no cache misses or branch mispredicts, you can get reasonable cycle-count estimates for small loops on specific Intel microarchitectures using IACA, Intel's static code analyser. It's far from a full cycle-accurate simulation of the real hardware, but it does have its own model for distributing uops to ports. It gets sensible numbers.
You can do the same sort of analysis by hand (including for CPUs that IACA doesn't know about) using Agner Fog's instruction tables and microarchitecture guides.
Things work out quite accurately when the loop is bottlenecked on the latency of a loop-carried dependency chain, or on saturating one execution port.
At high throughputs, there are many subtle effects that can bottleneck code that you'd hope would run at 4 fused-domain uops per clock. The frontend can only sustain that for quite small loops (~28 or 56 uops), because the uop cache has limited throughput due to uop-cache-line boundaries and uops not coming in groups of 4.
"Significant FMA performance anomaly experienced in the Intel Broadwell processor" is an example of how things can be hard to understand. You'd expect the code to saturate 3 vector execution ports, and it does on Haswell and on Skylake, but it comes nowhere close on Broadwell. And that's not a front-end bottleneck, since the loop is small enough to fit in the loop buffer.
Again, all of this is without considering branch mispredicts or cache misses.
If this sounds hard and complicated, that's because it is. This is why benchmarks are more useful than static analysis. However, microbenchmarks are really easy to get wrong. You should look at the asm to make sure you didn't screw up and let the compiler optimize away the thing you wanted to test. You need to understand a lot about how CPUs work to avoid pitfalls, such as putting something else slow in your microbenchmark and having it dominate the run-time instead of the thing you wanted to test.
Comments
Post a Comment