#!/bin/bash
# buildclang - build a release of clang.
mkdir build || echo 'Using build...'
cd build
CC=gcc CXX=g++ ../configure --enable-optimized --enable-targets=host-only --disable-assertions
make -j6 clang-only
make install-clang

I then got the source code to GCC 6.2.0, unpacked the sources, and ran this script:
# buildgcc - do a bootstrap build of gcc.
./configure --enable-bootstrap --enable-languages=c,c++,fortran CFLAGS='-Os -g0' CXXFLAGS='-Os -g0'
make -j6
make install

So now I had the latest of both compiler trees. For this web page I am not using any Apple-built compiler; I am using only the compilers that I built myself.
Both of the builds are lengthy and can consume a lot of drive space (almost 1 GB).
Next I wanted to look at their code generation, so I dusted off a very old benchmark app and streamlined it. This has 3 small loops and it tests how well these modern compilers can shrink a loop. Here is the source code:
/*
* loops.c - benchmark application.
* loops is Copyright Daniel K. Allen, 1993-2016.
* All rights reserved.
*
* 26 Aug 2016 - Slimmed down from bench.c.
*
*/
#include <math.h>
#include <stdio.h>
#include <time.h>
/*
 * Two-level stringification: STR(x) first expands x, then STR_HELPER turns
 * the expansion into a string literal, so STR(__GNUC__) yields e.g. "6".
 */
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)

/*
 * COMPILER - string literal naming the compiler (and version) that built
 * this file.  clang is tested first because it also defines __GNUC__ for
 * GCC compatibility.  Any other toolchain gets a generic fallback so that
 * the program still compiles instead of failing on an undefined macro.
 */
#if defined(__clang__)
#define COMPILER "clang " __clang_version__
#elif defined(__GNUC__)
#define COMPILER "gcc " STR(__GNUC__) "." STR(__GNUC_MINOR__) "." STR(__GNUC_PATCHLEVEL__)
#else
#define COMPILER "unknown"
#endif

/* Pi, written with more digits than a double can hold. */
#define PI 3.1415926535897932384626433
/*
 * AddIntegers - time the summation 1 + 2 + ... + n with a two-word accumulator.
 *
 * sumLo holds the low word of the running total and sumHi counts the carries
 * out of it, so together they form a double-width unsigned sum.  The result
 * is verified against the closed form n*(n+1)/2 evaluated in floating point,
 * then printed together with the CPU time reported by clock().
 */
static void AddIntegers(void)
{
    clock_t t;
    unsigned long i, n, sumLo, sumHi;
    double wordWeight, expected;

    n = 1000000000;
    printf("\nAdding %lu integers...\n", n);

    t = clock();
    for (sumHi = sumLo = 0, i = 1; i <= n; ++i) {
        sumLo += i;
        sumHi += sumLo < i;     /* unsigned wrap-around means a carry out of sumLo */
    }
    t = clock() - t;

    /*
     * Each carry in sumHi is worth 2^(bits in unsigned long), not a fixed
     * 2^32.  ~0UL / 2 + 1 is the top power of two of the type, so converting
     * it to double and doubling it gives that weight exactly for any width.
     */
    wordWeight = 2.0 * (double) (~0UL / 2 + 1);
    expected = n * (n + 1.0) / 2;

    if (sumHi * wordWeight + sumLo != expected) {
        printf("INTEGER MATH BROKEN!\n");
    } else {
        printf("Sum: $ %08lX %08lX = %.0f\nTime: %.2f seconds\n", sumHi, sumLo,
               expected, t / (double) CLOCKS_PER_SEC);
    }
}
/*
 * AddSquareRoots - time the accumulation of sqrt(k) for k = 1 .. 1000000000.
 *
 * Announces the run, sums the square roots in ascending order into a double,
 * and prints the total along with the CPU time measured by clock().
 */
static void AddSquareRoots()
{
    unsigned long k;
    unsigned long count = 1000000000;
    double total = 0;
    clock_t start, elapsed;

    printf("\nAdding %lu square roots...\n", count);

    start = clock();
    k = 1;
    while (k <= count) {
        total += sqrt(k);       /* k is converted to double by the sqrt prototype */
        ++k;
    }
    elapsed = clock() - start;

    printf("Sum: %.18g\nTime: %.2f seconds\n", total, elapsed / (double) CLOCKS_PER_SEC);
}
/*
 * TrigAccuracy - time and measure the error of sin(x)^2 + cos(x)^2 == 1.
 *
 * Evaluates the identity at x = 2*PI*i/n for i = 1 .. n (n = 10000000),
 * accumulates the absolute deviation from 1, and prints the accumulated
 * error together with the CPU time measured by clock().
 */
static void TrigAccuracy(void)
{
    clock_t t;
    long i, n;
    double sum, x;

    n = 10000000;
    printf("\nAdding %ld trig identities sin(x)^2 + cos(x)^2...\n", n);

    t = clock();
    for (sum = 0, i = 1; i <= n; ++i) {
        /*
         * A plain local replaces the former function-local ARG macro, which
         * stayed defined for the rest of the translation unit and silently
         * captured the names i and n.
         */
        x = 2 * PI * (double) i / (double) n;
        sum += fabs(pow(sin(x), 2) + pow(cos(x), 2) - 1);
    }
    t = clock() - t;

    printf("Err: %.18g\nTime: %.2f seconds\n", sum, t / (double) CLOCKS_PER_SEC);
}
/*
 * ShouldReturnZero - sanity probe for constant folding and the global optimizer.
 *
 * Runs a fixed chain of floating-point operations whose outcome is known in
 * advance: 2 * 3 = 6, which is not above 10, then (6 + 3) / 3 = 3.
 * Returns 0 when the chain yields exactly 3; otherwise returns the result
 * truncated to int.
 */
static int ShouldReturnZero()
{
    double value = 2;

    value = value * 3;
    if (value > 10) {
        value = 2;
    }
    value = value + 3;
    value = value / 3;

    return (value == 3) ? 0 : (int) value;
}
/*
 * main - report the compiler, run the constant-folding sanity check, then
 * run the three timed loops in order.  Always exits with status 0.
 */
int main()
{
    /* Turn off stdout buffering so each line appears as soon as it is printed. */
    setbuf(stdout, NULL);

    printf("Compiler: %s\n", COMPILER);

    if (ShouldReturnZero()) {
        printf("Constant folding BROKEN!\n");
    }

    AddIntegers();
    AddSquareRoots();
    TrigAccuracy();

    return 0;
}
First up was GCC:
% gcc loops.c -o loops -lm -Os
% ./loops
Compiler: gcc 6.2.0

Adding 1000000000 integers...
Sum: $ 00000000 6F05B59F17F6500 = 500000000500000000
Time: 0.52 seconds

Adding 1000000000 square roots...
Sum: 21081851083600.5586
Time: 7.12 seconds

Adding 10000000 trig identities sin(x)^2 + cos(x)^2...
Err: 3.31648708495890787e-10
Time: 0.35 seconds

Next was clang:
% clang loops.c -o loops -lm -Os
% ./loops
Compiler: clang 3.8.0 (tags/RELEASE_381/final)

Adding 1000000000 integers...
Sum: $ 00000000 6F05B59F17F6500 = 500000000500000000
Time: 0.27 seconds

Adding 1000000000 square roots...
Sum: 21081851083600.5586
Time: 3.62 seconds

Adding 10000000 trig identities sin(x)^2 + cos(x)^2...
Err: 3.31648708495890787e-10
Time: 0.17 seconds

Wow! Quite a difference. Let's look at the generated code.
Tip: Simply tack a '-S' onto your build command and your resulting output file will instead be full of text showing you the generated assembly code.
movl $1, %eax
L4:
xorl %ecx, %ecx
addq %rax, %rdx
setc %cl
incq %rax
addq %rcx, %rbx
cmpq $1000000001, %rax
jne L4
Now let's compare it to Clang's code:
xorl %eax, %eax
LBB0_1: ## =>This Inner Loop Header: Depth=1
addq %rbx, %rax
adcq $0, %r15
incq %rbx
cmpq $1000000001, %rbx ## imm = 0x3B9ACA01
jne LBB0_1
So GCC has 5 main loop instructions before the compare and jump, while clang has only 3.
This easily accounts for clang's 48% shorter run time (0.27 versus 0.52 seconds). GCC really should do this optimization.
L12:
cvtsi2sdq %rbx, %xmm0
incq %rbx
call _sqrt ## Library function call?? - should be an inline sqrtsd instruction!
addsd 8(%rsp), %xmm0
cmpq $1000000001, %rbx
movsd %xmm0, 8(%rsp)
jne L12
and clang instead has this:
LBB0_6: ## =>This Inner Loop Header: Depth=1
movd %rbx, %xmm2
punpckldq %xmm0, %xmm2 ## xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
subpd %xmm1, %xmm2
haddpd %xmm2, %xmm2
sqrtsd %xmm2, %xmm2 ## Direct inline hardware square root for the win!
addsd %xmm2, %xmm3
incq %rbx
cmpq $1000000001, %rbx ## imm = 0x3B9ACA01
jne LBB0_6
GCC actually has only six instructions before the inner loop jump, compared to the eight
instructions that clang generates, but one of the GCC instructions is a call, which is
deadly: clang takes 49% less time (3.62 versus 7.12 seconds)!

If you build our test program with GCC and the '-O3 -march=native -mfpmath=both' flags then it runs in 3.71 seconds, now only about 2.5% slower than clang.
L15:
cvtsi2sdq %rbx, %xmm1
xorps %xmm0, %xmm0
incq %rbx
mulsd LC12(%rip), %xmm1
divsd LC13(%rip), %xmm1
call _cexp ## uses complex exp() function to do sin & cos
cmpq $10000001, %rbx
mulsd %xmm0, %xmm0
mulsd %xmm1, %xmm1
addsd %xmm1, %xmm0
subsd LC14(%rip), %xmm0
andps LC15(%rip), %xmm0
addsd 8(%rsp), %xmm0
movsd %xmm0, 8(%rsp)
jne L15
and here is the clang code:
LBB0_8: ## =>This Inner Loop Header: Depth=1
xorps %xmm0, %xmm0
cvtsi2sdq %rbx, %xmm0
mulsd LCPI0_5(%rip), %xmm0
divsd LCPI0_6(%rip), %xmm0
callq ___sincos_stret ## sin & cos all at once
mulsd %xmm0, %xmm0
mulsd %xmm1, %xmm1
addsd %xmm0, %xmm1
addsd LCPI0_7(%rip), %xmm1
andpd LCPI0_8(%rip), %xmm1
movsd -32(%rbp), %xmm0 ## 8-byte Reload; xmm0 = mem[0],zero
addsd %xmm1, %xmm0
movsd %xmm0, -32(%rbp) ## 8-byte Spill
incq %rbx
cmpq $10000001, %rbx ## imm = 0x989681
jne LBB0_8
GCC's loop is 14 instructions, while clang's is 15 instructions, but clang's loop executes much
faster (0.17 versus 0.35 seconds).
Comments and suggestions about this comparison are welcomed.
Created: 26 Aug 2016 Modified: 20 Sep 2016