/* Written by Clifford Wolf, http://www.clifford.at/
 *
 * Short example program to calculate the APPROXIMATED sum of the first
 * 1 000 000 multiples of pi (including pi itself) brute-force, as example
 * for how to waste cpu cycles in SSE instructions. ;-)
 *
 * Note that a significantly higher setting for LIMIT would just return bogus
 * results because we are running into the limitations of single-precision
 * floating point then...
 *
 * Compile:          gcc -O2 -msse demo.c -o demo
 * SSE Performance:  time ./demo +1000
 * FPU Performance:  time ./demo -1000
 */

#include <stdio.h>
#include <stdlib.h>

/* Number of multiples of PI to add up; must be a multiple of 4 because the
 * SSE variant handles four multiples per loop iteration. */
#define LIMIT 1000000
#define PI 3.1416

#define GCC_VERSION (__GNUC__ * 10000 + \
		__GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)

/* Vector of four packed single-precision floats.  Current GCC releases
 * deprecate the mode(V4SF) spelling in favour of vector_size; the old form
 * is kept for the ancient compilers this file still supports. */
#if GCC_VERSION >= 40000
typedef float v4sf __attribute__ ((vector_size(16)));
#else
typedef float v4sf __attribute__ ((mode(V4SF)));
#endif

/*
 * Sum PI*1 + PI*2 + ... + PI*LIMIT using packed SSE additions.
 *
 * Each of the four lanes accumulates every fourth multiple; the four partial
 * sums are added together at the end.  Returns the single-precision
 * (approximated) total.
 */
float waste_time_sse(void)
{
	float buf[4] = { PI*1, PI*2, PI*3, PI*4 };
	v4sf counter, sum, step;
	unsigned int i;

	/* Lanes start at PI*1 .. PI*4, which is also the first partial sum. */
	sum = counter = __builtin_ia32_loadups(buf);

	/* Every lane advances by PI*4 per iteration. */
	buf[0] = buf[1] = buf[2] = buf[3];
	step = __builtin_ia32_loadups(buf);

	for (i = 1; i < LIMIT/4; i++) {
#if GCC_VERSION >= 30300
		sum += (counter += step);
#else
		/* gcc versions prior to 3.3.0 did not overload math operators */
		counter = __builtin_ia32_addps(counter, step);
		sum = __builtin_ia32_addps(counter, sum);
#endif
	}

	__builtin_ia32_storeups(buf, sum);
	return buf[0] + buf[1] + buf[2] + buf[3];
}

/*
 * Sum PI*1 + PI*2 + ... + PI*LIMIT one scalar addition at a time.
 *
 * Returns the single-precision (approximated) total.
 */
float waste_time_fpu(void)
{
	float sum = PI, counter = PI;
	unsigned int i;

	for (i = 1; i < LIMIT; i++) {
		sum += (counter += PI);
	}

	return sum;
}

/*
 * Usage: demo [COUNT]
 *
 *   no argument, or COUNT parses as 0: print both sums once
 *   COUNT > 0:  run the SSE variant COUNT times (for timing)
 *   COUNT < 0:  run the FPU variant -COUNT times (for timing)
 */
int main(int argc, char **argv)
{
	/* strtol yields 0 for non-numeric input, like atoi did, but has
	 * defined behaviour for out-of-range values. */
	long reps = argc == 2 ? strtol(argv[1], NULL, 10) : 0;

	/* The timing loops throw the results away.  Calling through a
	 * volatile function pointer keeps the optimizer from proving the
	 * calls are side-effect free and deleting the very work we want to
	 * measure. */
	float (*volatile kernel)(void);

	if (reps == 0) {
		printf("SSE: %f\n", waste_time_sse());
		printf("FPU: %f\n", waste_time_fpu());
	}

	kernel = waste_time_sse;
	while (reps > 0) {
		(void)kernel();
		reps--;
	}

	kernel = waste_time_fpu;
	while (reps < 0) {
		(void)kernel();
		reps++;
	}

	return 0;
}