Some demonstrations of nice/obscure gcc features


Nested functions and labels as values:
This doesn't work on systems such as OpenBSD, because it requires an executable stack: gcc generates code that builds a small piece of code (a trampoline) on the stack at run time and uses its address as the function pointer.
#include <stdio.h>
#include <string.h>

/*
 * Read one character from stdin and hand it to the given callback.
 * The callback receives exactly what getc() returned, EOF included.
 */
void bar(void(*callback)(int))
{
        const int ch = getc(stdin);

        callback(ch);
}

/*
 * Read one line of decimal digits from stdin (through bar()) and return its
 * value.  Returns -1 if a negative character code (such as EOF) is read or
 * if the line contained any character other than '0'..'9'.
 *
 * Demonstrates three gcc extensions: local labels (__label__), a nested
 * function that jumps out into its containing function, and labels as
 * values (&&label / goto *ptr).
 */
int foobar()
{
        /* Local label declarations, needed so the nested function can refer to them. */
        __label__ fin_error, fin_multiplex;
        /* Where to continue once the newline is seen; switched to fin_error on bad input. */
        void *finishp = &&fin_ok;
        int value = 0;

        /* Per-character callback; it reads and writes the locals of foobar(). */
        void foo(int ch) {
                /* Negative character code (e.g. EOF): leave the loop and fail at once. */
                if ( ch < 0 ) goto fin_error;
                /* End of line: leave the loop and dispatch through finishp. */
                if ( ch == '\n' ) goto fin_multiplex;
                /* Non-digit: keep consuming the line, but remember to fail at its end. */
                if ( ch < '0' || ch > '9' ) finishp = &&fin_error;
                /* Accumulated unconditionally; the value is unused if finishp was redirected. */
                value = value * 10 + ch - '0';
        }

        /* Only left through one of the gotos inside foo(). */
        while (1) bar(foo);

fin_multiplex:
        /* Computed goto: continues at fin_ok or fin_error. */
        goto *finishp;

fin_error:
        return -1;

fin_ok:
        return value;
}

/* Parse one number from stdin and print the result in decimal and in hex. */
int main()
{
        const int parsed = foobar();

        printf("%d = 0x%X\n", parsed, parsed);

        return 0;
}

(
download)


Compound statements as expressions, ##__VA_ARGS__ and alloca:
#include <alloca.h>
#include <stdio.h>

/*
 * ssprintf(fmt, ...) - printf-style formatting into a buffer taken from the
 * calling function's stack frame.
 *
 * A first snprintf() call with a zero-sized buffer measures the output, the
 * buffer (length plus terminating NUL) is obtained with __builtin_alloca(),
 * and a second snprintf() call fills it.  The statement expression yields
 * the char * to that buffer.  The arguments are expanded, and therefore
 * evaluated, twice.
 */
#define ssprintf(...) \
        ({ int _ss_len = snprintf(0, 0, ##__VA_ARGS__) + 1;     \
        char *_ss_buf = __builtin_alloca(_ss_len);              \
        snprintf(_ss_buf, _ss_len, ##__VA_ARGS__);              \
        _ss_buf; })

/*
 * Demonstrate ssprintf(): format four strings, join them into a fifth, then
 * print the address of the pointer array, the address of every string and
 * finally the joined text.
 */
int main() {
        char *tmp[5];
        tmp[0] = ssprintf("This text is dynamically allocated");
        tmp[1] = ssprintf("on the stack frame of main() and doesn't");
        tmp[2] = ssprintf("need to be freed explicitely. It is");
        tmp[3] = ssprintf("automatically freed when the function returns.");
        tmp[4] = ssprintf("%s\n%s\n%s\n%s\n", tmp[0], tmp[1], tmp[2], tmp[3]);
        /* %p expects a void * argument, so every pointer is converted explicitly. */
        printf("\nt: %p\t0: %p\t1: %p\n2: %p\t3: %p\t4: %p\n\n%s\n",
                        (void *)tmp, (void *)tmp[0], (void *)tmp[1],
                        (void *)tmp[2], (void *)tmp[3], (void *)tmp[4],
                        tmp[4]);
        return 0;
}

(
download)


Wrapping a function with variable parameter list:
#include <stdio.h>

/*
 * printf() wrapper that forwards its own argument list unchanged, using
 * gcc's "constructing function calls" builtins instead of <stdarg.h>.
 *
 * fmt, ...: passed through to printf() exactly as received.
 * Returns whatever the forwarded printf() call returned.
 */
int my_printf(char *fmt, ...)
{
        /* Capture a description of the arguments this function was called with. */
        void *arg = __builtin_apply_args();
        /*
         * Call printf() with those arguments.  The last operand (100) is the
         * number of bytes of stack argument data to copy for the call; it is
         * a fixed guess here, not derived from the actual argument list.
         */
        void *ret = __builtin_apply((void*)printf, arg, 100);
        /* Hand the value recorded by __builtin_apply() back to our caller. */
        __builtin_return(ret);
}

/* Exercise my_printf() with string and integer arguments. */
int main()
{
        /* 3["expand"] is "expand"[3], so its address is the tail "and". */
        char *conjunction = &3["expand"];

        my_printf("%s %s %s %s ", "This", "is", "a", "test");
        my_printf("%s the anser is %d.\n", conjunction, 42);

        return 0;
}

(
download)

As Steve Summit said: "Getting involved with dynamic argument lists is a lot like getting involved with absinthe: darkly exotic and stimulating at first, but destructive and mind-rotting in the end."


Using SSE instructions in C code without assembler templates:
/* Written by Clifford Wolf <clifford@clifford.at>, http://www.clifford.at/
 *
 * Short example program to calculate the APPROXIMATED sum of the first
 * 1 000 000 multiples of pi (including pi itself) brute-force, as example
 * for how to waste cpu cycles in SSE instructions.  ;-)
 *
 * Note that a significantly higher setting for LIMIT would just return bogus
 * results because we are running into the limitations of single-precision
 * floating point then...
 *
 * Compile: gcc -O2 -msse demo.c -o demo
 * SSE Performance: time ./demo +1000
 * FPU Performance: time ./demo -1000
 */

/* printf() and atoi() are used by main() of this program. */
#include <stdio.h>
#include <stdlib.h>

/* Number of multiples of pi that get summed up. */
#define LIMIT 1000000
/* Coarse approximation of pi; precision is irrelevant for this demo. */
#define PI 3.1416

/* Compiler version as a single comparable integer, e.g. 3.3.0 -> 30300. */
#define GCC_VERSION (__GNUC__ * 10000 + \
                __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)

/*
 * Vector of four packed single-precision floats.  Current compilers warn
 * that declaring vector types through the mode attribute is deprecated and
 * ask for vector_size instead; the old spelling is kept for old gcc only.
 */
#if GCC_VERSION >= 40000
typedef float v4sf __attribute__ ((vector_size(16)));
#else
typedef float v4sf __attribute__ ((mode(V4SF)));
#endif

float waste_time_sse()
{
        float buf[4] = { PI*1, PI*2, PI*3, PI*4 };
        v4sf counter, sum, step;
        unsigned int i;

        sum = counter = __builtin_ia32_loadups(buf);

        buf[0] = buf[1] = buf[2] = buf[3];
        step = __builtin_ia32_loadups(buf);

        for (i=1; i < LIMIT/4; i++)
#if GCC_VERSION >= 30300
                sum += (counter += step);
#else
        /* gcc versions prior to 3.3.0 did not overload math operators */
        {
                counter = __builtin_ia32_addps(counter, step);
                sum = __builtin_ia32_addps(counter, sum);
        }
#endif

        __builtin_ia32_storeups(buf, sum);

        return buf[0]+buf[1]+buf[2]+buf[3];
}

/*
 * Scalar counterpart of the vector version: approximate the sum of the
 * first LIMIT multiples of PI one term at a time in single precision.
 */
float waste_time_fpu()
{
        float total = PI;
        float term = PI;
        unsigned int n = 1;

        while (n < LIMIT) {
                term += PI;     /* next multiple of PI */
                total += term;
                n++;
        }

        return total;
}

/*
 * With no usable count argument, print both sums once.  With a positive
 * count N, run the SSE version N times; with a negative count -N, run the
 * scalar version N times (for timing comparisons).
 */
int main(int argc, char ** argv)
{
        int runs = 0;

        if (argc == 2)
                runs = atoi(argv[1]);

        if (runs == 0) {
                printf("SSE: %f\n", waste_time_sse());
                printf("FPU: %f\n", waste_time_fpu());
        }

        for (; runs > 0; runs--)
                waste_time_sse();
        for (; runs < 0; runs++)
                waste_time_fpu();

        return 0;
}

(
download)


Array initialization with ranges and cases with ranges:
#include <stdio.h>

/*
 * Per-character offset that flips the case of an ASCII letter, built with
 * gcc's range designators.  Entries for all other characters stay zero.
 */
char switch_cases_1_tab[256] = {
        ['A' ... 'Z' ] = ('a' - 'A'),
        ['a' ... 'z' ] = ('A' - 'a'),
};

/*
 * Swap the case of ch via table lookup; non-letters are returned unchanged.
 * The index is taken as unsigned char so that negative char values cannot
 * index before the start of the table.
 */
char switch_cases_1(char ch)
{
        const unsigned char slot = (unsigned char)ch;
        const char offset = switch_cases_1_tab[slot];

        return ch + offset;
}

/*
 * Swap the case of ch using gcc's case ranges; non-letters are returned
 * unchanged.
 */
char switch_cases_2(char ch)
{
        int offset;

        switch (ch) {
        case 'a' ... 'z':
                offset = 'A' - 'a';
                break;
        case 'A' ... 'Z':
                offset = 'a' - 'A';
                break;
        default:
                offset = 0;
                break;
        }

        return ch + offset;
}

/*
 * Print a sample line, swap its case with the table-based converter and
 * print it, then swap it back with the switch-based converter and print it
 * once more.
 */
int main()
{
        char demo[] = "This Is A Simple Test.\n";
        char *p;

        printf("%s", demo);

        for (p = demo; *p; p++)
                *p = switch_cases_1(*p);
        printf("%s", demo);

        for (p = demo; *p; p++)
                *p = switch_cases_2(*p);
        printf("%s", demo);

        return 0;
}

(
download)


GCC can create dumps of its intermediate RTL tree. I wrote a small perl script to graph them using graphviz:
(
download)