Lockless Inc

The Mandelbrot Set

The Mandelbrot set is the "King" of all fractals. It describes all Julia Sets, has endless variety, and yet has an extremely simple definition. To construct it, one starts a complex number (a point on the complex plane). Next, that number is squared. Finally one adds the original number again. These steps are repeated over and over again. "Most" points quickly diverge to infinity: 2, 2*2+2 = 6, 6*6+2 = 38, etc. However, some points do not. 0, 0*0 + 0 = 0, 0*0 + 0 = 0. These points that do not are within the set.

The either-or nature of the set isn't that interesting to draw. However, one can liven things up with a little colour. To do this, count how long it takes for the starting point to diverge beyond some boundary. By colouring by this "escape time" we can see extra detail. By choosing different boundaries, one gets different patterns of colours.

As a general rule people tend to use the smallest circle about the origin that contains the set: If the magnitude of the iterate, z = x+iy, is greater than 2: x2 + y2 > 22, then it can be shown that that sequence must diverge. Thus every iteration we need to do two things. The first is calculate the new point value znew = z2+c = x2-y2+2xyi+c, and the second is the above magnitude test.

Displaying Images

The first thing we need in order to do this is have some way of displaying the results. Fortunately, it doesn't take much code to create a raw X11 window and then use a pixmap to show an image.


#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <pthread.h>
#include <err.h>

#include <X11/Xlib.h>
#include <X11/Xutil.h>

/* X11 data */
static Display *dpy;
static Window win;
static XImage *bitmap;
static Atom wmDeleteMessage;
static GC gc;

static void exit_x11(void)
{
	XDestroyImage(bitmap);
	XDestroyWindow(dpy, win);
	XCloseDisplay(dpy);
}

static void init_x11(int size)
{
	int bytes_per_pixel;
	
	/* Attempt to open the display */
	dpy = XOpenDisplay(NULL);
	
	/* Failure */
	if (!dpy) exit(0);
	
	unsigned long white = WhitePixel(dpy,DefaultScreen(dpy));
	unsigned long black = BlackPixel(dpy,DefaultScreen(dpy));
	

	win = XCreateSimpleWindow(dpy,
                               DefaultRootWindow(dpy),
                               0, 0,
                               size, size,
                               0, black,
                               white);
	
	/* We want to be notified when the window appears */
	XSelectInput(dpy, win, StructureNotifyMask);
	
	/* Make it appear */
	XMapWindow(dpy, win);
	
	while (1)
	{
		XEvent e;
		XNextEvent(dpy, &e);
		if (e.type == MapNotify) break;
	}
	
	XTextProperty tp;
	char name[128] = "Mandelbrot";
	char *n = name;
	Status st = XStringListToTextProperty(&n, 1, &tp);
	if (st) XSetWMName(dpy, win, &tp);

	/* Wait for the MapNotify event */
	XFlush(dpy);
	
	int ii, jj;
	int depth = DefaultDepth(dpy, DefaultScreen(dpy));
	Visual *visual = DefaultVisual(dpy, DefaultScreen(dpy));
	int total;

	/* Determine total bytes needed for image */
	ii = 1;
	jj = (depth - 1) >> 2;
	while (jj >>= 1) ii <<= 1;

	/* Pad the scanline to a multiple of 4 bytes */
	total = size * ii;
	total = (total + 3) & ~3;
	total *= size;
	bytes_per_pixel = ii;
	
	if (bytes_per_pixel != 4)
	{
		printf("Need 32bit colour screen!");
		
	}
	
	/* Make bitmap */
	bitmap = XCreateImage(dpy, visual, depth,
		ZPixmap, 0, malloc(total),
		size, size, 32, 0);
	
	/* Init GC */
	gc = XCreateGC(dpy, win, 0, NULL);
	XSetForeground(dpy, gc, black);
	
	if (bytes_per_pixel != 4)
	{
		printf("Need 32bit colour screen!\n");
		exit_x11();
		exit(0);
	}
	
	XSelectInput(dpy, win, ExposureMask | KeyPressMask | StructureNotifyMask);
	
	wmDeleteMessage = XInternAtom(dpy, "WM_DELETE_WINDOW", False);
	XSetWMProtocols(dpy, win, &wmDeleteMessage, 1);
}

Next we need some sort of colour map for the escape-time colouring. To save space, we will just initialize one randomly, with a little bit of an intensity gradient built-in:


#define MAX_ITER	(1 << 14)
static unsigned cols[MAX_ITER + 1];

static void init_colours(void)
{
	int i;
	
	for (i = 0; i < MAX_ITER; i++)
	{
		char r = (rand() & 0xff) * MAX_ITER / (i + MAX_ITER + 1);
		char g = (rand() & 0xff) * MAX_ITER / (i + MAX_ITER + 1);
		char b = (rand() & 0xff) * MAX_ITER / (i + MAX_ITER + 1);
		
		cols[i] = b + 256 * g + 256 * 256 * r;
	}
	
	cols[MAX_ITER] = 0;
}

Now for the code that fills in the bitmap:


/* The Inner Loop */
static unsigned mandel_double(double cr, double ci)
{
	double zr = cr, zi = ci;
	double tmp;
	
	unsigned i;
	
	for (i = 0; i < MAX_ITER; i++)
	{
		tmp = zr * zr - zi * zi + cr;
		zi *= 2 * zr;
		zi += ci;
		zr = tmp;
		
		if (zr * zr + zi * zi > 4.0) break;
	}
	
	return i;
}

/* For each point, evaluate its colour */
static void display_double(int size, double xmin, double xmax, double ymin, double ymax)
{
	int x, y;
	
	double cr, ci;
	
	double xscal = (xmax - xmin) / size;
	double yscal = (ymax - ymin) / size;
	
	unsigned counts;

	for (y = 0; y < size; y++)
	{
		for (x = 0; x < size; x++)
		{
			cr = xmin + x * xscal;
			ci = ymin + y * yscal;
			
			counts = mandel_double(cr, ci);
			
			((unsigned *) bitmap->data)[x + y*size] = cols[counts];
		}
		
		/* Display it line-by-line for speed */
		XPutImage(dpy, win, gc, bitmap,
			0, y, 0, y,
			size, 1);
	}
	
	XFlush(dpy);
}

Finally all that is needed is the main() routine to finish the program.


/* Image size */
#define ASIZE 1000

/* Comment out this for benchmarking */
#define WAIT_EXIT 

int main(void)
{
	double xmin = -2;
	double xmax = 1.0;
	double ymin = -1.5;
	double ymax = 1.5;
	
	/* Make a window! */
	init_x11(ASIZE);
	
	init_colours();
	
	display_double(ASIZE, xmin, xmax, ymin, ymax);

#ifdef WAIT_EXIT
	while(1)
	{
		XEvent event;
		KeySym key;
		char text[255];
		
		XNextEvent(dpy, &event);
	
		/* Just redraw everything on expose */
		if ((event.type == Expose) && !event.xexpose.count)
		{
			XPutImage(dpy, win, gc, bitmap,
				0, 0, 0, 0,
				ASIZE, ASIZE);
		}
		
		/* Press 'q' to quit */
		if ((event.type == KeyPress) &&
			XLookupString(&event.xkey, text, 255, &key, 0) == 1)
		{
			if (text[0] == 'q') break;
		}
		
		/* Or simply close the window */
		if ((event.type == ClientMessage) &&
			((Atom) event.xclient.data.l[0] == wmDeleteMessage))
		{
			break;
		}
	}
#endif

	/* Done! */
	exit_x11();
	
	return 0;
}

The above code has an extremely simple inner loop... and as a result is quite slow, taking 24.9 seconds to run. It could be made faster by decreasing the maximum iteration count degrading the image somewhat, but as we'll see, that isn't the bottleneck. On the other hand, the results appear nicely:

The Mandelbrot Set

Optimization

The first thing to notice is that we are using double precision. Since the "zoom level" is quite low, this excess precision isn't needed, and we can make do with just single precision floats. Since operations on floats tend to be faster than doubles, we can obtain an immediate speedup:


static unsigned mandel_float(float cr, float ci)
{
	float zr = cr, zi = ci;
	float tmp;
	
	unsigned i;
	
	for (i = 0; i < MAX_ITER; i++)
	{
		tmp = zr * zr - zi * zi + cr;
		zi *= 2 * zr;
		zi += ci;
		zr = tmp;
		
		if (zr * zr + zi * zi > 4.0) break;
	}
	
	return i;
}

static void display_float(int size, float xmin, float xmax, float ymin, float ymax)
{
	int x, y;
	
	float cr, ci;
	
	float xscal = (xmax - xmin) / size;
	float yscal = (ymax - ymin) / size;
	
	unsigned counts;

	for (y = 0; y < size; y++)
	{
		for (x = 0; x < size; x++)
		{
			cr = xmin + x * xscal;
			ci = ymin + y * yscal;
			
			counts = mandel_float(cr, ci);
			
			((unsigned *) bitmap->data)[x + y*size] = cols[counts];
		}
		
		/* Display it */
		XPutImage(dpy, win, gc, bitmap,
			0, y, 0, y,
			size, 1);
	}
	
	XFlush(dpy);
}

Unfortunately, things are still too slow, taking 20.6 to complete the drawing of the image.

The next obvious thing to do is to use vectorization via SSE, where we can work on four floats at a time. The resulting speedup should be impressive. One way to do this is to use SSE intrinsics, however we will go directly to assembly language to try to extract as much speed as possible. First we need an SSE-enabled display function. Fortunately, gcc provides vector intrinsics that allow us to pass the data in SSE form to the inner loop:


typedef float v4sf __attribute__((vector_size(16)));

void mandel_sse(v4sf cr, v4sf ci, unsigned *counts);

static void display_sse(int size, float xmin, float xmax, float ymin, float ymax)
{
	int x, y;
	
	float xscal = (xmax - xmin) / size;
	float yscal = (ymax - ymin) / size;
	
	unsigned counts[4];

	for (y = 0; y < size; y++)
	{
		for (x = 0; x < size; x += 4)
		{
			v4sf cr = {xmin + x * xscal, xmin + (x + 1) * xscal,
				xmin + (x + 2) * xscal, xmin + (x + 3) * xscal};
				
			v4sf ci = {ymin + y * yscal, ymin + y  * yscal,
				ymin + y * yscal, ymin + y * yscal};
			
			mandel_sse(cr, ci, counts);
			
			((unsigned *) bitmap->data)[x + y*size] = cols[counts[0]];
			((unsigned *) bitmap->data)[x + 1 + y*size] = cols[counts[1]];
			((unsigned *) bitmap->data)[x + 2 + y*size] = cols[counts[2]];
			((unsigned *) bitmap->data)[x + 3 + y*size] = cols[counts[3]];
		}
		
		/* Display it */
		XPutImage(dpy, win, gc, bitmap,
			0, y, 0, y,
			size, 1);
	}
	
	XFlush(dpy);
}

The reason we will go directly to asm is that some inner loop functions have already been written. Converting these to the AT&T syntax used by gcc isn't difficult


/* Change the define to point to the function you want to use */
#define mandel_something mandel_sse

/* The gnu assembler supports C-style macros nicely */
#define MAX_ITER (1 << 14)

/* SSE constants, aligned to prevent access errors */
	.align 16

ones:
.float 1.0, 1.0, 1.0, 1.0
twos:
.float 2.0, 2.0, 2.0, 2.0
fours:
.float 4.0, 4.0, 4.0, 4.0
quarter:
.float 0.25, 0.25, 0.25, 0.25
sixteenth:
.float 0.0625, 0.0625, 0.0625, 0.0625

maxiter:
.long MAX_ITER, MAX_ITER, MAX_ITER, MAX_ITER

.globl mandel_softlab
.type mandel_softlab,@function
mandel_softlab:
	xor     %ecx, %ecx
	movaps  fours(%rip), %xmm5
	movaps  %xmm0, %xmm6 # real parts
	movaps  %xmm1, %xmm7 # imaginary parts
	
	xorps   %xmm3, %xmm3 # Clear counters

1:	movaps  %xmm0, %xmm2
	mulps   %xmm1, %xmm2 # xmm2 = x * y
	mulps   %xmm0, %xmm0 # xmm0 = x * x
	mulps   %xmm1, %xmm1 # xmm1 = y * y
	
	movaps  %xmm0, %xmm4
	addps   %xmm1, %xmm4 # xmm4 = x*x + y*y
	subps   %xmm1, %xmm0 # xmm0 = x*x - y*y
	addps   %xmm6, %xmm0 # xmm0 = new real part

	movaps  %xmm2, %xmm1
	addps	%xmm1, %xmm1
	addps   %xmm7, %xmm1 # xmm1 = new imaginary part

	cmpleps %xmm5, %xmm4
	movaps  %xmm4, %xmm2 # xmm2 is all 1s in non-overflowed pixels
	movmskps %xmm4, %eax # lower four bits reflect comparison
	
	andps   ones(%rip), %xmm4
	addps   %xmm4, %xmm3 # Increment counters

	or      %eax, %eax   # Have they all overflowed?
	jz      2f

	inc     %ecx
	cmp     $MAX_ITER, %ecx # Done all the iterations?
	jnz     1b

2:	cvtps2dq %xmm3, %xmm3 # Done: write counters to memory
	movaps	%xmm3, (%rdi)
	ret
.size mandel_softlab, .-mandel_softlab

.globl mandel_codep1
.type mandel_codep1,@function
mandel_codep1:
	movaps  %xmm0, %xmm4
	xorps	%xmm6, %xmm6
	movaps	%xmm1, %xmm5
	mov     $MAX_ITER, %ecx
	movaps  fours(%rip), %xmm7

1:	movaps  %xmm0, %xmm2
	mulps	%xmm0, %xmm0 # xmm0 = x * x
	movaps  %xmm1, %xmm3
	addps   %xmm1, %xmm1 # xmm1 = 2 * y
	mulps   %xmm2, %xmm1 # xmm1 = 2 * x * y
	movaps  %xmm0, %xmm2 # xmm2 = x * x
	mulps   %xmm3, %xmm3 # xmm3 = y * y
	addps   %xmm5, %xmm1 # xmm1 = new imaginary part
	subps   %xmm3, %xmm0 # xmm0 = x * x - y * y
	addps   %xmm3, %xmm2 # xmm2 = x * x + y * y
	
	cmpleps %xmm7, %xmm2 # xmm2 is all 1s in non-overflowed pixels
	addps   %xmm4, %xmm0 # xmm0 = new real part

	movmskps %xmm2, %eax # lower four bits reflect comparison
	test    %eax, %eax
	jz      2f
	andps   %xmm7, %xmm2
	addps   %xmm2, %xmm6 # Increment counters by 4
	
	sub     $1, %ecx     # Done all the iterations? 
	jnz     1b
	
2:	mulps   quarter(%rip), %xmm6 # Done: write (rescaled) counters to memory
	cvtps2dq %xmm6, %xmm6
	movaps	%xmm6, (%rdi)
	ret
.size mandel_codep1, .-mandel_codep1

.globl mandel_codep2
.type mandel_codep2,@function
mandel_codep2:
	movaps  %xmm0, %xmm4
	xorps   %xmm6, %xmm6
	movaps  %xmm1, %xmm5
	mov     $MAX_ITER, %ecx
	movaps  fours(%rip), %xmm7
	
1:	movaps  %xmm0, %xmm2
	mulps   %xmm1, %xmm2 # xmm2 = x * y
	mulps   %xmm0, %xmm0 # xmm0 = x * x
	mulps   %xmm1, %xmm1 # xmm1 = y * y
	addps   %xmm2, %xmm2 # xmm2 = 2 * x * y
	movaps  %xmm0, %xmm3
	addps   %xmm1, %xmm3 # xmm3 = x * x + y * y
	cmpleps %xmm7, %xmm3 # xmm3 is all 1s in non-overflowed pixels

	movmskps %xmm3, %eax # lower four bits reflect comparison
	test    %eax, %eax
	jz      2f
	
	subps   %xmm1, %xmm0 # xmm0 = x * x - y * y
	movaps  %xmm2, %xmm1
	
	andps   %xmm7, %xmm3 # xmm3 is 4.0 if not overflowed
	addps   %xmm3, %xmm6 # Add to counters
	addps   %xmm4, %xmm0 # xmm0 = new real part
	addps   %xmm5, %xmm1 # xmm1 = new imaginary part
	
	dec     %ecx	     # Done all the iterations? 	
	jnz     1b
	
2:	mulps   quarter(%rip), %xmm6 # Done: write (rescaled) counters to memory
	cvtps2dq %xmm6, %xmm6
	movaps	%xmm6, (%rdi)
	ret
.size mandel_codep2, .-mandel_codep2

.globl mandel_vodnaya
.type mandel_vodnaya,@function
mandel_vodnaya:
	movaps  %xmm0, %xmm4
	xorps   %xmm6, %xmm6
	movaps  %xmm1, %xmm5
	mov     $MAX_ITER, %ecx
	movaps  fours(%rip), %xmm7

1:	movaps  %xmm1, %xmm2
	mulps   %xmm2, %xmm2 # xmm2 = y * y
	mulps   %xmm0, %xmm1 # xmm1 = x * y
	movaps  %xmm2, %xmm3
	mulps   %xmm0, %xmm0 # xmm0 = x * x
	addps   %xmm0, %xmm2 # xmm2 = x * x + y * y
	
	cmpleps %xmm7, %xmm2 # xmm2 is all 1s in non-overflowed pixels
	addps   %xmm1, %xmm1 # xmm1 = 2 * x * y
	subps   %xmm3, %xmm0 # xmm0 = x * x - y * y
	addps   %xmm5, %xmm1 # xmm1 = new imaginary part
	addps   %xmm4, %xmm0 # xmm0 = new real part

	movmskps %xmm2, %eax # lower four bits reflect comparison
	test    %eax, %eax
	jz      2f
	
	andps   %xmm7, %xmm2 # xmm2 is 4.0 is not overflowed
	addps   %xmm2, %xmm6 # Add to counters

	sub     $1, %ecx
	jnz     1b

2:	mulps   quarter(%rip), %xmm6 # Done: write (rescaled) counters to memory
	cvtps2dq %xmm6, %xmm6
	movaps	%xmm6, (%rdi)
	ret
.size mandel_vodnaya, .-mandel_vodnaya

These take on average 7.4 seconds for the "softlab" algorithm, 6.7 seconds for the first codeproject algorithm, 7.0 seconds for the second codeproject algorithm, and 6.7 seconds for the "Vodnaya" algorithm on this machine. Note how the "optimal" code depends quite strongly on machine type, but the timings don't vary all that much.

The above code isn't quite the best on this machine. We can speed things up a little more by noticing that that output from the cmpleps instruction is either all-ones or all-zeros. Noting that all-ones is the integer -1, we can use the psubd instruction to update the iteration counters. This saves an instruction on the critical path. A little more rearrangement gives an inner loop that takes 6.2 seconds:


.globl mandel_fast
.type mandel_fast,@function
mandel_fast:
	movaps  %xmm0, %xmm4
	xorps   %xmm6, %xmm6
	movaps  %xmm1, %xmm5
	mov     $MAX_ITER, %ecx
	movaps  fours(%rip), %xmm7

1:	movaps  %xmm1, %xmm2
	mulps   %xmm0, %xmm1 # xmm1 = x * y
	mulps   %xmm2, %xmm2 # xmm2 = y * y
	mulps   %xmm0, %xmm0 # xmm0 = x * x
	
	addps   %xmm1, %xmm1 # xmm1 = 2 * x * y
	movaps  %xmm2, %xmm3
	addps   %xmm0, %xmm2 # xmm2 = x * x + y * y
	
	cmpleps %xmm7, %xmm2 # xmm2 is all 1s in non-overflowed pixels
	subps   %xmm3, %xmm0 # xmm0 = x * x - y * y
	addps   %xmm5, %xmm1 # xmm1 = new imaginary part
	addps   %xmm4, %xmm0 # xmm0 = new real part

	movmskps %xmm2, %eax # lower four bits reflect comparison
	psubd   %xmm2, %xmm6 # Use an integer add to collate the -ve counters
	
	test    %eax, %eax
	jz      2f

	sub     $1, %ecx
	jnz     1b

2:	movaps	%xmm6, (%rdi)
	ret
.size mandel_fast, .-mandel_fast

The above is about as fast as we can go with the algorithm we are using. However, by changing that we can do much better. Note how most of the time spent is in the calculation of points within the Mandelbrot Set itself. Those points need to iterate MAX_ITER times before we are done. Fortunately, in many cases there is a faster way to find out if a point never escapes to infinity.

Periodicity Checking

Many points within the Mandelbrot Set eventually reach periodic orbits. i.e. they converge to a sequence of values that repeats as the square-and-add operation is done. If we can detect this repetition, then we can bail-out early, and perhaps avoid quite a bit of work. To do this, we record a value to test against. We then test the next n iterations against that number. If they become equal to it, then we know we are in a periodic loop. If not, we record a new number and double the number of iterations to test with it. If we ever enter into a loop, this will find it. Some code which implements this in C is:


static unsigned mandel_double_period(double cr, double ci)
{
	double zr = cr, zi = ci;
	double tmp;
	
	double ckr, cki;
	
	unsigned p = 0, ptot = 8;
	
	do
	{
		ckr = zr;
		cki = zi;

		ptot += ptot;
		if (ptot > MAX_ITER) ptot = MAX_ITER;
		
		for (; p < ptot; p++)
		{
			tmp = zr * zr - zi * zi + cr;
			zi *= 2 * zr;
			zi += ci;
			zr = tmp;

			if (zr * zr + zi * zi > 4.0) return p;

			if ((zr == ckr) && (zi == cki)) return MAX_ITER;
		}
	}
	while (ptot != MAX_ITER);
	
	return MAX_ITER;
}

static void display_double_period(int size, double xmin, double xmax, double ymin, double ymax)
{
	int x, y;
	
	double cr, ci;
	
	double xscal = (xmax - xmin) / size;
	double yscal = (ymax - ymin) / size;
	
	unsigned counts;

	for (y = 0; y < size; y++)
	{
		for (x = 0; x < size; x++)
		{
			cr = xmin + x * xscal;
			ci = ymin + y * yscal;
			
			counts = mandel_double_period(cr, ci);
			
			((unsigned *) bitmap->data)[x + y*size] = cols[counts];
		}
		
		/* Display it */
		XPutImage(dpy, win, gc, bitmap,
			0, y, 0, y,
			size, 1);
	}
	
	XFlush(dpy);
}

The above code uses double precision (rather than single precision) and doesn't use any SSE tricks or assembly optimization. However, it completes the benchmark in 2.2 seconds! This is quite a bit faster than the "fastest" code. Of course, we can apply the same techniques to try and speed this up. Rewriting the inner loop to use assembly:


void mandel_sse_period(v4sf cr, v4sf ci, unsigned *counts);

/* SSE +periodicity checking */
static void display_sse_period(int size, float xmin, float xmax, float ymin, float ymax)
{
	int x, y;
	
	float xscal = (xmax - xmin) / size;
	float yscal = (ymax - ymin) / size;
	
	unsigned counts[4];

	for (y = 0; y < size; y++)
	{
		for (x = 0; x < size; x += 4)
		{
			v4sf cr = {xmin + x * xscal, xmin + (x + 1) * xscal,
				xmin + (x + 2) * xscal, xmin + (x + 3) * xscal};
				
			v4sf ci = {ymin + y * yscal, ymin + y  * yscal,
				ymin + y * yscal, ymin + y * yscal};
			
			mandel_sse_period(cr, ci, counts);
			
			((unsigned *) bitmap->data)[x + y*size] = cols[counts[0]];
			((unsigned *) bitmap->data)[x + 1 + y*size] = cols[counts[1]];
			((unsigned *) bitmap->data)[x + 2 + y*size] = cols[counts[2]];
			((unsigned *) bitmap->data)[x + 3 + y*size] = cols[counts[3]];
		}
		
		/* Display it */
		XPutImage(dpy, win, gc, bitmap,
			0, y, 0, y,
			size, 1);
	}
	
	XFlush(dpy);
}

.globl mandel_sse_period
.type mandel_sse_period,@function
mandel_sse_period:
	mov     $8, %esi     # Start with 8*2 = 16 steps
	movaps  %xmm0, %xmm4
	xorps   %xmm6, %xmm6 # Clear counters
	movaps  %xmm1, %xmm5
	xor     %ecx, %ecx
	movaps  fours(%rip), %xmm7
	pxor    %xmm10, %xmm10 # Clear periodicity mask
1:
	movaps  %xmm0, %xmm8 # Update xmm8 and xmm9 with value to test 
	movaps  %xmm1, %xmm9
	add     %esi, %esi   # Double iterations
	cmp     $MAX_ITER, %esi # Don't iterate for more than MAX_ITER steps
	jg      5f

2:	movaps  %xmm1, %xmm2
	mulps   %xmm0, %xmm1 # xmm1 = x * y
	mulps   %xmm2, %xmm2 # xmm2 = y * y
	mulps   %xmm0, %xmm0 # xmm0 = x * x
	
	addps   %xmm1, %xmm1 # xmm1 = 2 * x * y
	movaps  %xmm2, %xmm3
	addps   %xmm0, %xmm2 # xmm2 = x * x + y * y
	
	cmpleps %xmm7, %xmm2 # xmm2 is all 1s in non-overflowed pixels
	subps   %xmm3, %xmm0 # xmm0 = x * x - y * y
	addps   %xmm5, %xmm1 # xmm1 = new imaginary part
	addps   %xmm4, %xmm0 # xmm0 = new real part

	movmskps %xmm2, %eax # lower four bits reflect comparison
	psubd   %xmm2, %xmm6 # Use an integer add to collate the -ve counters
	movaps  %xmm0, %xmm2
	movaps  %xmm1, %xmm3
	
	test    %eax, %eax   # Have all points escaped?
	jz      3f
	
	cmpeqps %xmm8, %xmm2 # Compare with saved value
	cmpeqps %xmm9, %xmm3
	pand    %xmm2, %xmm3
	por     %xmm3, %xmm10 # Record matches
	add     $1, %ecx

	movmskps %xmm10, %edx
	cmp     %edx, %eax   # Are the remaining points periodic? 
	je      4f

	cmp     %ecx, %esi   # Inner loop is complete?
	jne     2b
	cmp     $MAX_ITER, %esi # Are we done?
	jne     1b
	
3:	movaps	%xmm6, (%rdi)
	ret

4:	movaps  %xmm10, %xmm0 # Replace the counts for periodic points with MAX_ITER
	pandn   %xmm6, %xmm10
	pand    maxiter(%rip), %xmm0
	por     %xmm10, %xmm0
	movaps	%xmm0, (%rdi)
	ret

5:	mov     $MAX_ITER, %esi # Max value of counter is MAX_ITER
	jmp     2b
.size mandel_sse_period, .-mandel_sse_period

This gives us a little bit more speed, completing in 1.3 seconds.

Threads

The next thing to try is a little bit of parallelization. This machine is a quad-core, and so far we have only been using a single one of those cores. By using multiple threads we can extract some more speed. Using the pthreads library with the same inner loop:


typedef struct thread_data thread_data;
struct thread_data
{
	int size;
	int num;
	float xmin, xmax;
	float ymin, ymax;
};

/* Change this to 1 to update the image line-by-line */
static const int show = 0;
static pthread_mutex_t x_lock = PTHREAD_MUTEX_INITIALIZER;

static void *thread_func(void *data)
{
	thread_data *td = data;
	
	int x, y;
	
	float xscal = (td->xmax - td->xmin) / td->size;
	float yscal = (td->ymax - td->ymin) / td->size;

	unsigned counts[4];

	for (y = td->num; y < td->size; y += MAX_THREADS)
	{
		for (x = 0; x < td->size; x += 4)
		{
			v4sf cr = {td->xmin + x * xscal, td->xmin + (x + 1) * xscal,
				td->xmin + (x + 2) * xscal, td->xmin + (x + 3) * xscal};
				
			v4sf ci = {td->ymin + y * yscal, td->ymin + y  * yscal,
				td->ymin + y * yscal, td->ymin + y * yscal};
			
			mandel_sse_period(cr, ci, counts);
			
			((unsigned *) bitmap->data)[x + y*td->size] = cols[counts[0]];
			((unsigned *) bitmap->data)[x + 1 + y*td->size] = cols[counts[1]];
			((unsigned *) bitmap->data)[x + 2 + y*td->size] = cols[counts[2]];
			((unsigned *) bitmap->data)[x + 3 + y*td->size] = cols[counts[3]];
		}
		
		if (show)
		{
			pthread_mutex_lock(&x_lock);
			/* Display it */
			XPutImage(dpy, win, gc, bitmap,
				0, y, 0, y,
				td->size, 1);
			pthread_mutex_unlock(&x_lock);
		}
	}
	
	free(td);
	
	return NULL;
}

static void display_sse_period_pthreads(int size, float xmin, float xmax, float ymin, float ymax)
{
	int i;
	thread_data *td;
	
	pthread_t *threads = malloc(sizeof(pthread_t) * MAX_THREADS);
	if (!threads) errx(1, "Out of memory\n");
	
	for (i = 0; i < MAX_THREADS; i++)
	{
		td = malloc(sizeof(thread_data));
		if (!td) errx(1, "Out of memory\n");
		
		td->size = size;
		td->num = i;
		td->xmin = xmin;
		td->xmax = xmax;
		td->ymin = ymin;
		td->ymax = ymax;
		
		if (pthread_create(&threads[i], NULL, thread_func, td)) errx(1, "pthread_create failed\n");
	}
	
	for (i = 0; i < MAX_THREADS; i++)
	{
		if (pthread_join(threads[i], NULL)) errx(1, "pthread_join failed\n");
	}
	
	free(threads);
	
	if (!show)
	{
		XPutImage(dpy, win, gc, bitmap,
			0, 0, 0, 0,
			size, size);
	}
	
	XFlush(dpy);
}

The above completes in 0.3 seconds, which corresponds to the 4-times speedup expected. It works by using scan-line interleaving. Each thread does roughly the same amount of work this way. Unfortunately, updating the screen becomes a bottleneck. This is due to the X11 calls not being thread-safe. Therefore we need to wrap them with locks to prevent data corruption. This locking slows things down. Fortunately, the result is so fast we don't need to worry too much about line-by-line updates. By only printing the image when done, we save a little more time.

The is one more trick available. We can save some more time by exploiting some of the mathematics of the Mandelbrot set. A large part of it is described by some simple mathematical functions. By evaluating these, we can work out if points lie within those regions. If so, then we don't need to evaluate the inner loop at all. This could obviously save quite a bit of time.

Wikipedia has a formula for the largest circle in the Set, given z = x+iy, points within it satisfy (x+1)2+y2<1/16. The Cardioid is a little more complicated. A test for points within that may be cheaply done by first calculating q=(x-1/4)2+y2. Then points within satisfy q(q+x-1/4)<y2/4. Adding these checks to the inner loop function is simple, we just overload the meaning of the periodicity checking mask:


.type mandel_sse_period_card,@function
mandel_sse_period_card:
	mov     $8, %esi     # Start with 8*2 = 16 steps
	movaps  %xmm0, %xmm4
	movaps  %xmm1, %xmm5
	xor     %ecx, %ecx
	
	movaps  %xmm0, %xmm2 # Check for being inside circle
	movaps  %xmm1, %xmm3
	addps   ones(%rip), %xmm2 # xmm2 = x + 1
	mulps   %xmm3, %xmm3 # xmm3 = y * y
	mulps   %xmm2, %xmm2 # xmm2 = (x + 1)^2
	movaps  %xmm3, %xmm10
	addps   %xmm2, %xmm10
	cmpltps sixteenth(%rip), %xmm10 # xmm10 is all-ones inside circle
	
	movaps  %xmm0, %xmm2 # Check for being inside cardioid
	subps   quarter(%rip), %xmm2 # xmm2 = x-1/4
	movaps  %xmm2, %xmm6 # xmm6 = x-1/4
	mulps   %xmm2, %xmm2 # xmm2 = (x-1/4)^2
	addps   %xmm3, %xmm2 # xmm2 = (x-1/4)^2 + y^2 = q
	mulps   quarter(%rip), %xmm3 # xmm3 = y^2/4
	movaps  %xmm2, %xmm7 # xmm7 = q
	addps   %xmm6, %xmm2 # xmm2 = q + x - 1/4
	mulps   %xmm7, %xmm2 # xmm2 = q(q + x - 1/4)
	cmpltps %xmm2, %xmm3 # xmm3 is all-ones inside cardioid
	
	orps    %xmm3, %xmm10 # xmm10 is all-ones inside circle + cardioid
	
	movaps  fours(%rip), %xmm7
	xorps   %xmm6, %xmm6 # Clear counters
	
1:
	movaps  %xmm0, %xmm8 # Update xmm8 and xmm9 with value to test 
	movaps  %xmm1, %xmm9
	add     %esi, %esi   # Double iterations
	cmp     $MAX_ITER, %esi # Don't iterate for more than MAX_ITER steps
	jg      5f

2:	movaps  %xmm1, %xmm2
	mulps   %xmm0, %xmm1 # xmm1 = x * y
	mulps   %xmm2, %xmm2 # xmm2 = y * y
	mulps   %xmm0, %xmm0 # xmm0 = x * x
	
	addps   %xmm1, %xmm1 # xmm1 = 2 * x * y
	movaps  %xmm2, %xmm3
	addps   %xmm0, %xmm2 # xmm2 = x * x + y * y
	
	cmpleps %xmm7, %xmm2 # xmm2 is all 1s in non-overflowed pixels
	subps   %xmm3, %xmm0 # xmm0 = x * x - y * y
	addps   %xmm5, %xmm1 # xmm1 = new imaginary part
	addps   %xmm4, %xmm0 # xmm0 = new real part

	movmskps %xmm2, %eax # lower four bits reflect comparison
	psubd   %xmm2, %xmm6 # Use an integer add to collate the -ve counters
	movaps  %xmm0, %xmm2
	movaps  %xmm1, %xmm3
	
	test    %eax, %eax   # Have all points escaped?
	jz      3f
	
	cmpeqps %xmm8, %xmm2 # Compare with saved value
	cmpeqps %xmm9, %xmm3
	pand    %xmm2, %xmm3
	por     %xmm3, %xmm10 # Record matches
	add     $1, %ecx

	movmskps %xmm10, %edx
	cmp     %edx, %eax   # Are the remaining points periodic? 
	je      4f

	cmp     %ecx, %esi   # Inner loop is complete?
	jne     2b
	cmp     $MAX_ITER, %esi # Are we done?
	jne     1b
	
3:	movaps	%xmm6, (%rdi)
	ret

4:	movaps  %xmm10, %xmm0 # Replace the counts for periodic points with MAX_ITER
	pandn   %xmm6, %xmm10
	pand    maxiter(%rip), %xmm0
	por     %xmm10, %xmm0
	movaps	%xmm0, (%rdi)
	ret

5:	mov     $MAX_ITER, %esi # Max value of counter is MAX_ITER
	jmp     2b
.size mandel_sse_period_card, .-mandel_sse_period_card

So how much faster is it? Unfortunately, the time taken is still 0.3 seconds. It appears that the program can't get that much faster due to the time taken by X11 to create the window. This extra optimization also seems to be mostly duplicating what the periodicity-checking code does (albeit in a much more efficient manner). However, the result is still much (20+ times) faster than the "optimal" asm code discussed on code project. (Another possibility is to use the reflection symmetry along the real axis to try and gain another factor of two, but again this doesn't help much in reality.)

Onwards

As can be seen, from the naive initial code to the final version there are almost a couple of orders of magnitude in difference in speed. So is the above the best code? Nope. There are still other algorithms that can be used to speed things up even more. However, they all add inaccuracy in some way or another.

One way to speed things up is to not calculate every point. Since the image consists of blocks of colours, if we can detect regions of constant colour we can fill the entire block at once. Of course, it is possible to get this checking wrong, and thus draw something incorrect. Different algorithms have differing sensitivities to this. One simple one, known as "solid guessing" looks at the four corner points of a square. If they are all the same colour, it assumes that the entire square is. If they differ, it subdivides and tries again. Another algorithm traces the boundaries of coloured regions. Since the Mandelbrot Set is simply-connected this works fairly well. However, since the boundary-tracing algorithm only tests a finite number of points it doesn't prevent fine features, smaller than a pixel in width from confusing it.

For deeper zooms, one can gain improved speed by noticing that the points on the screen all start out very close to each other in the complex plane. Instead of doing iterations for every point, one can discover how that rectangle (quadrilateral) is moved and distorted by the square-and-add operation. If the size is small enough, then the relative distortion is also small, and can be approximated in a simple functional form. Eventually, the distortion rises to be large enough that the quadrilateral needs to be subdivided. However, in the process we can save a large amount of work.

Even deeper, one needs to worry about arbitrary precision arithmetic. As such, there is no obviously fastest algorithm. The best code needs to seamlessly change from single precision, to double, and then arbitrary precision as required.

Comments

firr said...
Great thing, especially this cycle detections i never heard of. I recently played with mandelbrot and found thet there is possble a version with only two muls not three (when you use four cmp instead of one )

   

   for(n = 0 ; n <= max_iter; n++)
   {
     reim2 = (re + re) * im;

     re = (re - im) * (re + im) + cRe;
     im = reim2 + cIm;

     if( re > 2.0 || re<-2.0 && im> 2.0 || im<-2.0 ) break;

   }

had anyon tried how it works when rewritten to sse ?
(terrible captcha!)
said...
Enter your comments here
Bayu said...
When I tried to run this program, my caolalutcr gave me an error syntax message which it located at lines:IS>(Z,N:Goto 8IS>(C,NIt also gave me a window range error when I attempted to get and type of rectangle on my graph and brought me to the line:Line(J,0,J,KI took out the two syntax error program steps that I listed above and then attempted to run the program again. When I tried the graphing option, the graph would only calculate and show one rectangle before the calculating indicator just kept running without anything happening. When I broke from this and hit the go to option the caolalutcr took me to the step:Lbl DWhen I tried the non graphing option which would just calculate the sum, the same thing happened where the calculating indicator kept running after I had entered the endpoints, what I wanted to calculate, and n. This time the caolalutcr took me to the step:Lbl 7I don't have extensive knowledge in programming or understand the concept of all the different tedious steps of the program. I also don't really have a clue how to fix these errors which is why I need some assistance. I copied the code on this web page verbatim into my caolalutcr and the only theory I have for why it doesn't work is because somewhere along the line I have confused the letter I with the number 1, (or maybe the letter O with the number 0 but I don't think the letter O was used as a variable in this code). My only attempt at a solution was taking out the syntax error lines I have listed above but that didn't work as the caolalutcr wasn't able to complete calculations for either the sum or the rectangles in the graph. This is the best and most detailed and useful program I have found online and if I were able to get it to work I could save loads of time on my Calculus Homework. I hope you will be able to help me solve the problem I am facing.
Rogelio said...
With thanks for this<a href="http://umkuwb.com"> pctiraular</a> very good content; this is the type of consideration that preserves me though out the day.Weve always heard been looking around for your personal web-site right after I observed about them from a pal and was pleased when I was able to unearth it immediately after researching for a while. Being a enthusiastic blogger, I'm pleased to view other folks taking move and adding to to your community. I just needed to comment to exhibit my thanks for ones post as it is incredibly stimulating, and many authors tend not to get the credit score they deserve. I am confident I'll be again and will deliver a number of my associates.
Corina said...
Thanks for your post. I would also love to say that the very first thing you will need to carry out is verify if you rellay need repairing credit. To do that you have got to get your hands on a duplicate of your credit score. That should rellay not be difficult, considering that the government necessitates that you are allowed to obtain one no cost copy of your real credit report each year. You just have to request that from the right persons. You can either look at website with the Federal Trade Commission or even contact one of the main credit agencies instantly.
Kunjeethu said...
I am just commenting to let you<a href="http://wgprqu.com"> uratnsednd</a> of the notable discovery my friend's princess enjoyed reading through your webblog. She mastered a good number of details, most notably what it is like to possess a very effective giving mood to let other folks very easily know just exactly selected complicated topics. You undoubtedly exceeded her expected results. Many thanks for imparting the necessary, dependable, educational and also fun guidance on this topic to Julie.
Your mom just said...
Go to ur room.

Enter the 10 characters above here


Name
About Us Returns Policy Privacy Policy Send us Feedback
Company Info | Product Index | Category Index | Help | Terms of Use
Copyright © Lockless Inc All Rights Reserved.