#!/bin/bash
# buildclang - build a release of clang.
mkdir build || echo 'Using build...'
cd build
CC=gcc CXX=g++ ../configure --enable-optimized --enable-targets=host-only --disable-assertions
make -j6 clang-only
make install-clang

I then got the source code to GCC 6.2.0, unpacked the sources, and ran this script:
# buildgcc - do a bootstrap build of gcc.
./configure --enable-bootstrap --enable-languages=c,c++,fortran CFLAGS='-Os -g0' CXXFLAGS='-Os -g0'
make -j6
make install

So now I had the latest of both compiler trees. For this web page I am not using any Apple-built compiler; I am using only the ones that I built myself.
Both of the builds are lengthy and can consume a lot of drive space (almost 1 GB).
Next I wanted to look at their code generation, so I dusted off a very old benchmark app and streamlined it. This has 3 small loops and it tests how well these modern compilers can shrink a loop. Here is the source code:
/*
 * loops.c - benchmark application.
 * loops is Copyright Daniel K. Allen, 1993-2016.
 * All rights reserved.
 *
 * 26 Aug 2016 - Slimmed down from bench.c.
 *
 */

#include <math.h>
#include <stdio.h>
#include <time.h>

/*
 * Two-step stringification: STR(x) lets the preprocessor expand x first,
 * then STR_HELPER turns the expanded text into a string literal.
 */
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)

/*
 * COMPILER is a string literal naming the compiler and its version; it is
 * printed once by main().
 * NOTE(review): there is no #else branch, so COMPILER is left undefined when
 * neither __clang__ nor __GNUC__ is defined, and main() would then not
 * compile.
 */
#if defined(__clang__)
#define COMPILER "clang " __clang_version__
#elif defined(__GNUC__)
#define COMPILER "gcc " STR(__GNUC__) "." STR(__GNUC_MINOR__) "." STR(__GNUC_PATCHLEVEL__)
#endif

#define PI 3.1415926535897932384626433

/*
 * Adds the integers 1..n (n = 1000000000) in a loop, times the loop with
 * clock(), and checks the result against the closed form n*(n+1)/2 evaluated
 * in floating point. Prints either an error line or the sum and elapsed time.
 *
 * sumLo holds the running sum; sumHi counts how many times sumLo wrapped.
 */
static void AddIntegers()
{
    clock_t t;
    unsigned long i,n,sumLo,sumHi;

    /* n is assigned inside the argument list, then used as the loop bound. */
    printf("\nAdding %lu integers...\n",n=1000000000);
    t = clock();
    for (sumHi = sumLo = 0,i = 1; i <= n; ++i) {
        sumLo += i;
        /* After the add, sumLo < i only if the unsigned add wrapped around,
           so this adds the carry (0 or 1) to sumHi. */
        sumHi += sumLo < i;
    }
    t = clock() - t;
    /* NOTE(review): weighting sumHi by 2^32 matches a 32-bit unsigned long.
       Where unsigned long is 64 bits a carry would be worth 2^64 instead;
       the sum for this n (about 5e17) fits in 64 bits, so sumHi stays 0
       there and the check still holds. */
    if (sumHi * pow(2,32) + sumLo != (n*(n+1.0)/2))
        printf("INTEGER MATH BROKEN!\n");
    else
        printf("Sum: $ %08lX %08lX = %.0f\nTime: %.2f seconds\n",sumHi,sumLo,
            (n*(n+1.0)/2),t / (double) CLOCKS_PER_SEC);
}

/*
 * Accumulates sqrt(i) for i = 1..n (n = 1000000000) into a double, timing
 * the loop with clock(), then prints the sum and the elapsed time.
 */
static void AddSquareRoots()
{
    clock_t t;
    unsigned long i,n;
    double sum;

    printf("\nAdding %lu square roots...\n",n=1000000000);
    t = clock();
    for (sum = 0,i = 1; i <= n; ++i)
        sum += sqrt(i);
    t = clock() - t;
    printf("Sum: %.18g\nTime: %.2f seconds\n",sum,t / (double) CLOCKS_PER_SEC);
}

/*
 * For i = 1..n (n = 10000000) evaluates sin(x)^2 + cos(x)^2 - 1 at
 * x = 2*PI*i/n and accumulates the absolute value, so the printed "Err" is
 * the total deviation from the identity sin^2 + cos^2 = 1. Also prints the
 * elapsed time measured with clock().
 */
static void TrigAccuracy()
{
    clock_t t;
    long i,n;
    double sum;

    printf("\nAdding %ld trig identities sin(x)^2 + cos(x)^2...\n",n=10000000);
    t = clock();
    /* ARG expands in place and reads the loop variables i and n directly. */
    #define ARG (2*PI*(double)i/(double)n)
    for (sum = 0,i = 1; i <= n; ++i)
        sum += fabs(pow(sin(ARG),2) + pow(cos(ARG),2) - 1);
    t = clock() - t;
    printf("Err: %.18g\nTime: %.2f seconds\n",sum,t / (double) CLOCKS_PER_SEC);
}

/*
 * Runs a fixed chain of arithmetic on constants: 2*3 = 6, which is not > 10,
 * then 6+3 = 9 and 9/3 = 3, so the x == 3 branch is taken and 0 is returned.
 * Any other result is returned truncated to int, which main() reports as a
 * failure.
 */
static int ShouldReturnZero() /* tests constant folding & global optimizer */
{
    double x;

    x = 2;
    x *= 3;
    if (x > 10) x = 2;
    x += 3;
    x /= 3;
    if (x == 3) return 0;
    return (int) x;
}

/*
 * Prints the compiler identification, runs the constant-folding sanity check,
 * then runs the three timed loops in order.
 */
int main()
{
    setbuf(stdout,0);   /* null buffer: stdout is unbuffered */
    printf("Compiler: %s\n",COMPILER);
    if (ShouldReturnZero() != 0)
        printf("Constant folding BROKEN!\n");
    AddIntegers();
    AddSquareRoots();
    TrigAccuracy();
    return 0;
}
First up was GCC:
% gcc loops.c -o loops -lm -Os
% ./loops
Compiler: gcc 6.2.0

Adding 1000000000 integers...
Sum: $ 00000000 6F05B59F17F6500 = 500000000500000000
Time: 0.52 seconds

Adding 1000000000 square roots...
Sum: 21081851083600.5586
Time: 7.12 seconds

Adding 10000000 trig identities sin(x)^2 + cos(x)^2...
Err: 3.31648708495890787e-10
Time: 0.35 seconds

Next was clang:
% clang loops.c -o loops -lm -Os
% ./loops
Compiler: clang 3.8.0 (tags/RELEASE_381/final)

Adding 1000000000 integers...
Sum: $ 00000000 6F05B59F17F6500 = 500000000500000000
Time: 0.27 seconds

Adding 1000000000 square roots...
Sum: 21081851083600.5586
Time: 3.62 seconds

Adding 10000000 trig identities sin(x)^2 + cos(x)^2...
Err: 3.31648708495890787e-10
Time: 0.17 seconds

Wow! Quite a difference. Let's look at the generated code.
Tip: Simply tack a '-S' onto your build command and your resulting output file will instead be full of text showing you the generated assembly code.
Here is GCC's code for the integer-adding loop:

    movl $1, %eax
L4:
    xorl %ecx, %ecx
    addq %rax, %rdx
    setc %cl
    incq %rax
    addq %rcx, %rbx
    cmpq $1000000001, %rax
    jne L4

Now let's compare it to Clang's code:
    xorl %eax, %eax
LBB0_1: ## =>This Inner Loop Header: Depth=1
    addq %rbx, %rax
    adcq $0, %r15
    incq %rbx
    cmpq $1000000001, %rbx ## imm = 0x3B9ACA01
    jne LBB0_1

So GCC has 5 main loop instructions before the compare and jump, while clang has only 3. This easily accounts for clang's 48% shorter run time (0.27 vs. 0.52 seconds). GCC really should do this optimization.
For the square-root loop, GCC generates this:

L12:
    cvtsi2sdq %rbx, %xmm0
    incq %rbx
    call _sqrt ## Library function call?? - should be an inline sqrtsd instruction!
    addsd 8(%rsp), %xmm0
    cmpq $1000000001, %rbx
    movsd %xmm0, 8(%rsp)
    jne L12

and clang instead has this:
LBB0_6: ## =>This Inner Loop Header: Depth=1
    movd %rbx, %xmm2
    punpckldq %xmm0, %xmm2 ## xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
    subpd %xmm1, %xmm2
    haddpd %xmm2, %xmm2
    sqrtsd %xmm2, %xmm2 ## Direct inline hardware square root for the win!
    addsd %xmm2, %xmm3
    incq %rbx
    cmpq $1000000001, %rbx ## imm = 0x3B9ACA01
    jne LBB0_6

GCC actually has only six instructions before the inner loop jump, compared to the eight instructions that clang generates, but one of the GCC instructions is a call, which is deadly: clang finishes in 49% less time (3.62 vs. 7.12 seconds)!
If you build our test program with GCC and the '-O3 -march=native -mfpmath=both' flags then it runs in 3.71 seconds, now only 1% slower than clang.
For the trig loop, here is the GCC code:

L15:
    cvtsi2sdq %rbx, %xmm1
    xorps %xmm0, %xmm0
    incq %rbx
    mulsd LC12(%rip), %xmm1
    divsd LC13(%rip), %xmm1
    call _cexp ## uses complex exp() function to do sin & cos
    cmpq $10000001, %rbx
    mulsd %xmm0, %xmm0
    mulsd %xmm1, %xmm1
    addsd %xmm1, %xmm0
    subsd LC14(%rip), %xmm0
    andps LC15(%rip), %xmm0
    addsd 8(%rsp), %xmm0
    movsd %xmm0, 8(%rsp)
    jne L15

and here is the clang code:
LBB0_8: ## =>This Inner Loop Header: Depth=1
    xorps %xmm0, %xmm0
    cvtsi2sdq %rbx, %xmm0
    mulsd LCPI0_5(%rip), %xmm0
    divsd LCPI0_6(%rip), %xmm0
    callq ___sincos_stret ## sin & cos all at once
    mulsd %xmm0, %xmm0
    mulsd %xmm1, %xmm1
    addsd %xmm0, %xmm1
    addsd LCPI0_7(%rip), %xmm1
    andpd LCPI0_8(%rip), %xmm1
    movsd -32(%rbp), %xmm0 ## 8-byte Reload; xmm0 = mem[0],zero
    addsd %xmm1, %xmm0
    movsd %xmm0, -32(%rbp) ## 8-byte Spill
    incq %rbx
    cmpq $10000001, %rbx ## imm = 0x989681
    jne LBB0_8

Not counting the final jump, GCC's loop is 14 instructions while clang's is 15, but clang's loop executes much faster (0.17 vs. 0.35 seconds).
Comments and suggestions about this comparison are welcomed.
Created: 26 Aug 2016 Modified: 20 Sep 2016