[Add] Some SSE stuff for speedups.

This commit is contained in:
Allanis 2013-07-11 18:44:37 +01:00
parent 1442259c0b
commit ec6cabf2fe

View File

@ -2,6 +2,10 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#ifdef __SSE__
#include <xmmintrin.h>
#endif /* __SSE__ */
#include "lephisto.h" #include "lephisto.h"
#include "log.h" #include "log.h"
#include "rng.h" #include "rng.h"
@ -45,7 +49,36 @@ static void noise_delete(noise_t noise);
static float lattice(perling_data_t* pdata, int ix, float fx, int iy, static float lattice(perling_data_t* pdata, int ix, float fx, int iy,
float fy, int iz, float fz, int iw, float fw) { float fy, int iz, float fz, int iw, float fw) {
#ifdef __SSE__
(void)iw;
(void)fw;
int nindex;
__m128 a, b, c;
nindex = 0;
nindex = pdata->map[(nindex + ix) & 0xFF];
nindex = pdata->map[(nindex + iy) & 0xFF];
nindex = pdata->map[(nindex + iz) & 0xFF];
float inp_sse1[4] __attribute__((aligned(16))) = {
pdata->buffer[nindex][0],
pdata->buffer[nindex][1],
pdata->buffer[nindex][2],
0.
};
float inp_sse2[4] __attribute__ ((aligned(16))) = {
fx, fy, fz, 0.
};
float out_sse[4] __attribute__((aligned(16)));
a = _mm_load_ps(inp_sse1);
b = _mm_load_ps(inp_sse2);
c = _mm_mul_ps(a, b);
_mm_store_ps(out_sse, c);
return out_sse[0] + out_sse[1] + out_sse[2];
#else /* __SSE__ */
int n[4] = { ix, iy, iz, iw }; int n[4] = { ix, iy, iz, iw };
float f[4] = { fx, fy, fz, fw }; float f[4] = { fx, fy, fz, fw };
int nindex = 0; int nindex = 0;
@ -58,6 +91,7 @@ static float lattice(perling_data_t* pdata, int ix, float fx, int iy,
value += pdata->buffer[nindex][i] * f[i]; value += pdata->buffer[nindex][i] * f[i];
return value; return value;
#endif /* __SSE__ */
} }
#define DEFAULT_SEED 0x15687436 #define DEFAULT_SEED 0x15687436
@ -248,11 +282,14 @@ static float* genNebulaeMap(const int w, const int h, const int n, float rug) {
noise_t noise; noise_t noise;
float* nebulae;; float* nebulae;;
float value; float value;
unsigned int* t, s;
/* Pretty default values. */
octaves = 3.; octaves = 3.;
hurst = NOISE_DEFAULT_HURST; hurst = NOISE_DEFAULT_HURST;
lacunarity = NOISE_DEFAULT_LACUNARITY; lacunarity = NOISE_DEFAULT_LACUNARITY;
/* Create noise and data. */
noise = noise_new(2, hurst, lacunarity); noise = noise_new(2, hurst, lacunarity);
nebulae = malloc(sizeof(float)*w*h*n); nebulae = malloc(sizeof(float)*w*h*n);
@ -261,6 +298,12 @@ static float* genNebulaeMap(const int w, const int h, const int n, float rug) {
return NULL; return NULL;
} }
/* Some debug information and time setting. */
s = SDL_GetTicks();
t = malloc(sizeof(unsigned int)*n);
DEBUG("Generating Nebulae of size %dx%dx%d", w, h, n);
/* Start to create the nebulae. */
f[2] = 0.; f[2] = 0.;
for(z = 0; z < n; z++) { for(z = 0; z < n; z++) {
for(y = 0; y < h; y++) { for(y = 0; y < h; y++) {
@ -278,10 +321,18 @@ static float* genNebulaeMap(const int w, const int h, const int n, float rug) {
} }
} }
f[2] += 0.01; f[2] += 0.01;
/* Record this layer's finish time and log how long it took. */
t[z] = SDL_GetTicks();
DEBUG(" Layer %d/%d generated in %dms", z+1, n,
(z>0) ? t[z] - t[z-1] : t[z] - s);
} }
/* Cleanup. */
noise_delete(noise); noise_delete(noise);
/* Results. */
DEBUG("Nebulae Generated in %dms", SDL_GetTicks() - s);
return nebulae; return nebulae;
} }