Optimum Design of Experiments

Speed Comparison: C++, generic C++, D, generic D

Posted on: April 15, 2008

So I wondered in which programming language I should write my next project.There
are two cool sites that compare programming languages:

The source code for different tasks: http://rosettacode.org/wiki/Generic_swap
The speed/memory consumption/compile time: http://shootout.alioth.debian.org/gp4/benchmark.php?test=all〈=all

For small projects I’d always use Python and extend it with C++ using Boost:Python (which works exceptionally well once you get to know it, but takes ages to figure out). However, I think it is better to program the whole core functionality in a statically typed language such that it can easily wrapped and used for example in Python.

The D programming language looks really nice and it also shows up quite high on the benchmark (on linux on second place). So I wondered how it performs speedwise to C++. So I copied some example from the web, the bubble sort problem and implemented it four times:

  1. c++
  2. generic c++
  3. D
  4. generic D

The code is below.

Turns out that the C++ code is approximately twice as fast as the D code which
disqualifies D for me at the moment. Also, there is practically no speed gain
for the templated implementation in D.

So for the comparison of C++ and templated C++ code I obtained.
Its hard to read, but on the y-axis the relative speedgain of the templated
version is plotted. So if your array is longer than 16, the templated version
is even slower than the normal. I actually expected some kind of monotonic
decay. Couldnt figure out why there is a max around array length = 8.

/*****************************************
A comparison between different implementations
of the Bubble sort algorithms. The question is: How much faster is the algorithm with template programming.
Examples inspired by http://ubiety.uwaterloo.ca/~tveldhui/papers/Template-Metaprograms/meta-art.html written by Todd Veldhuizen
*****************************************/

/*****************************************
OUTPUT:
> ./bubblesort
0.027000,       0.030000,       0.128000,       0.016000,       0.016000
*****************************************


#include
#include
#include <SDL.h>
#include <sys/time.h>

using namespace std;

/* isn't actually used, we use the SDL timer instead,
it gives the same results */

int mtime(void)
{
    timeval tv;
    gettimeofday(&tv,NULL);
    return (int)(tv.tv_sec*1000 + (tv.tv_usec / 1000));
}

void print_vec(vector &v, string description)
{
    printf("%s\t",description.c_str());
    printf("[ ");
    for(int i = 0; i< v.size(); ++i)
    {
        printf("%d ",v[i]);
    }
        printf("]\n");
}

inline void swap(int &a, int &b)
{
    int tmp = a; a = b; b = tmp;
}


/* straight forward implementation of Bubblesort */
void bubbleSort1(int *data, int N)
{
    for(int i = N-1; i > 0; --i)
    {
        for(int j = 0; j<i; ++j)
        {
            if(data[j] > data[j+1])
            {
                swap(data[j], data[j+1]);
            }
        }
    }
}

/* recursive Bubble sort, one for loop replaced by recursion */
void bubbleSort2(int *data, int N)
{
    for(int j = 0; j < N-1; ++j)
    {
        if(data[j] > data[j+1])
            swap(data[j], data[j+1]);
    }
    if (N>2)
        bubbleSort2(data, N-1);
}

/* recursive recursive Bubble sort, both loops replaced by recursion */
void bubbleSortLoop3(int *data, int N)
{
    if (N>1)
        bubbleSortLoop3(data, N-1);
    if(data[N-1] > data[N])
        swap(data[N-1],data[N]);
}

void bubbleSort3(int *data, int N)
{
    bubbleSortLoop3(data, N-1);
    if (N>2)
        bubbleSort3(data, N-1);
}

/* template metaprogramming Bubble sort */
template class tswap
{
public:
    static inline void compareAndSwap(int* data)
    {
        if (data[I] > data[J])
            swap(data[I], data[J]);
    }
};

template class tbubbleSortLoop
{
public:
    static inline void loop(int *data)
    {
        tbubbleSortLoop::loop(data);
        tswap<N-1,N>::compareAndSwap(data);
    }
};

template<>
class tbubbleSortLoop
{
public:
    static inline void loop(int *data)
    {
        tswap<0,1>::compareAndSwap(data);
    }
};

template
class tbubbleSort
{
public:
    static inline void sort(int *data)
    {
        tbubbleSortLoop::loop(data);
        tbubbleSort::sort(data);
    }
};

template<>
class tbubbleSort
{
public:
    static inline void sort(int *data)
    {
    tbubbleSortLoop::loop(data);
    }
};

/* Todd Veldhuizens reference implementation */

template
class IntSwap
{
public:
    static inline void compareAndSwap(int* data)
    {
        if (data[I] > data[J])
            swap(data[I], data[J]);
    }
};

template
class IntBubbleSortLoop
{
private:
enum { go = (J <= I-2) };

public:
    static inline void loop(int* data)
    {
        IntSwap<J,J+1>::compareAndSwap(data);
        IntBubbleSortLoop::loop(data);
    }
};

template<>
class IntBubbleSortLoop<0,0>
{
public:
    static inline void loop(int*)
    { }
};

template
class IntBubbleSort
{
public:
    static inline void sort(int* data)
    {
        IntBubbleSortLoop<N-1,0>::loop(data);
        IntBubbleSort::sort(data);
    }
};

template<>
class IntBubbleSort
{
public:
    static inline void sort(int* data)
    { }
};

int main(int argc, char* argv[])
{
    /*generate array */
    const int N = 4; /* array length */
    const int NUMBER_OF_SORTS = 2000000;
    Uint32 start,end; /* for timing */

    vector original_data(N*NUMBER_OF_SORTS);
    for(int j = 0; j < NUMBER_OF_SORTS; ++j)
    {
        for(int i = 0; i < N ; ++i){
            original_data[i+N*j] = N-i;
        }
    }
    vector data(original_data);

    /* bubbleSort1 */
    data = original_data;
    start = SDL_GetTicks();
    for(int i = 0; i < NUMBER_OF_SORTS; ++i)
    {
        bubbleSort1(&data[i*N],N);
    }
    end = SDL_GetTicks();
    printf("%f,\t",(end-start)/1000.);

    /* bubbleSort2 */
    data = original_data;
    start = SDL_GetTicks();
    for(int i = 0; i < NUMBER_OF_SORTS; ++i)
    {
        bubbleSort2(&data[i*N],N);
    }
    end = SDL_GetTicks();
    printf("%f,\t",(end-start)/1000.);

    /* bubbleSort3 */
    data = original_data;
    start = SDL_GetTicks();
    for(int i = 0; i < NUMBER_OF_SORTS; ++i)
    {
        bubbleSort3(&data[i*N],N);
    }
    end = SDL_GetTicks();

    printf("%f,\t",(end-start)/1000.);

    /* templated bubbleSort */
    data = original_data;
    start = SDL_GetTicks();
    for(int i = 0; i < NUMBER_OF_SORTS; ++i)
    {
        tbubbleSort::sort(&data[i*N]);
    }
    end = SDL_GetTicks();
    printf("%f,\t",(end-start)/1000.);

    /* templated bubbleSort, Todd Veldhuizens reference implementation */
    data = original_data;
    start = SDL_GetTicks();
    for(int i = 0; i < NUMBER_OF_SORTS; ++i)
    {
        IntBubbleSort::sort(&data[i*N]);
    }
    end = SDL_GetTicks();
    printf("%f\n",(end-start)/1000.);

    return 0;
}

/**************************************
OUTPUT:
> ./D_bubblesort
straight forward implementation
0.058
recursive implementation
0.071
templated implementation
0.062
testing static array filling
[ 0 1 2 3 4 ]

**************************************/

//compile with dmd -O -inline -release D_bubblesort.d
version(Win32){
    /*
    * GetTickCount() wraps after 47 days or so, but this does
    * not matter, as we use unsigned for creating the difference.
    */
    private import std.c.windows.windows;
    static int
    diffmtime(inout DWORD old){

        int   Result;
        DWORD now = GetTickCount();
        Result = now - old;
        old = now;
        return Result;
    }
}

version(linux){
    /*
    * On linux, we could run with higher resulution,
    * but we don't
    */
    private import std.c.linux.linux;
    static int
    diffmtime(inout timeval old){

        int Result;
        timeval now;
        gettimeofday(&now,null);
        /* Convert to msec */
        Result = (now.tv_sec-old.tv_sec)*1000
        +(now.tv_usec-old.tv_usec)/1000 ;
        old = now;
        return Result;
    }
}

import std.stdio;

void print_vec(in int v[], string description="")
{
    writef("[ ");
    for(int i = 0; i< v.length; ++i)
    {
        writef("%d ",v[i]);
    }
    writef("]\n");
}


/*    NORMAL IMPLEMENTATION    */
void swap(int *a, int *b)
{
    int tmp = *a; *a = *b; *b = tmp;
}

/* straight forward implementation of Bubblesort */
void bubbleSort1(int *v, int N)
{
    for(int i = N; i > 0; --i)
    {
    for(int j = 0; j<i; ++j)
    {
    if(v[j] > v[j+1])
        swap(&v[j], &v[j+1]);
    }
    }
}

/* recursive implementation of Bubblesort */
void bubbleSortLoop2(int *data, int N)
{
    if (N>1) bubbleSortLoop2(data,N-1);
    if(data[N-1] > data[N])
        swap(&data[N-1], &data[N]);
}

void bubbleSort2(int *data, int N)
{
    bubbleSortLoop2(data, N-1);
    if (N>2)
        bubbleSort2(data,N-1);
}

/*    TEMPLATED IMPLEMENTATION OF ARRAY INITIALIZATION    */
template a(int N)
{
    static void fill(int *data){
        static if(N>=0)
        {
            a!(N-1).fill(data);
            data[N] = N;
        }
    }
}

/*     TEMPLATED IMPLEMENTATION OF BUBBLESORT    */

template tswap(int I, int J)
{
    static void perform(int *data){
        if(data[I] > data[J])
        {
            swap(&data[I],&data[J]);
        }
    }
}

template tbubbleSortLoop(int N)
{
    static void perform(int *data)
    {
        static if(N>1) tbubbleSortLoop!(N-1).perform(data);
        tswap!(N-1,N).perform(data);
    }
}

template tbubbleSort(int N)
{
    static void perform(int *data)
    {
        tbubbleSortLoop!(N-1).perform(data);
        static if(N>2) tbubbleSort!(N-1).perform(data);
    }
}

class Foo
{
    template TBar(T)
    {
        static T yy;                // Ok
        static int func(T t, int y)
        {
            return 0;  // Ok
        }    
    }
}

template Factorial(ulong n)
{
    static if( n <= 1 )
        const Factorial = 1;
    else
        const Factorial = n * Factorial!(n-1);
}

int main()
{
    const int N = 4;

    /* array length */
    const int NUMBER_OF_SORTS = 2000000;
    timeval t;

    /* fill some array */
    int[] original_data;
    original_data.length = N*NUMBER_OF_SORTS;
    for(int j = 0; j < NUMBER_OF_SORTS; ++j)
    {
        for(int i = 0; i < N ; ++i)
        {
            original_data[i+N*j] = N-i;
        }
    }

    writefln("straight forward implementation");
    int[] data = original_data.dup;
    gettimeofday(&t,null);
    for(int i = 0; i < NUMBER_OF_SORTS; ++i)
    {
        bubbleSort1(&data[i*N],N);
    }
    writefln(diffmtime(t)/1000.);

    writefln("recursive implementation");
    data = original_data.dup;
    gettimeofday(&t,null);
    for(int i = 0; i < NUMBER_OF_SORTS; ++i)
    {
        bubbleSort2(&data[i*N],N);
    }
    writefln(diffmtime(t)/1000.);

    writefln("templated implementation");
    data = original_data.dup;
    gettimeofday(&t,null);
    for(int i = 0; i < NUMBER_OF_SORTS; ++i)
    {
        tbubbleSort!(N).perform(&data[i*N]);
    }
    writefln(diffmtime(t)/1000.);

    writefln("testing static array filling");
    int[] lala;
    lala.length=5;
    a!(5).fill(&lala[0]);
    print_vec(lala);


    return 0;
}

1 Response to "Speed Comparison: C++, generic C++, D, generic D"

Note the benchmarks game also shows full source code and build & run logs for all the programs, for example

http://shootout.alioth.debian.org/gp4/benchmark.php?test=spectralnorm&lang=nice&id=0

Leave a comment


  • None
  • Isaac Gouy: Note the benchmarks game also shows full source code and build & run logs for all the programs, for example http://shootout.alioth.debian.org/g