diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..caefb80add0205973008c0b6e6003e119b7f8f2a --- /dev/null +++ b/Makefile @@ -0,0 +1,32 @@ + +# Define Lattice size and cache blocking size: +#LATTICE=-Dnx=16 -Dny=16 -Dnz=16 -Dnt=16 -DDIM_BLOCK_X=8 -DDIM_BLOCK_Y=8 -DDIM_BLOCK_Z=8 -DDIM_BLOCK_T=4 +LATTICE=-Dnx=32 -Dny=32 -Dnz=32 -Dnt=32 -DDIM_BLOCK_X=8 -DDIM_BLOCK_Y=8 -DDIM_BLOCK_Z=8 -DDIM_BLOCK_T=4 + +############################################################################### + +CC=nvcc + +# K80 +#CC_CFLAGS=-O3 --ptxas-options=-v -lineinfo -gencode arch=compute_37,code=sm_37 -keep + +# V100 +CC_CFLAGS=-O3 --ptxas-options=-v -lineinfo -gencode arch=compute_70,code=sm_70 -keep + +############################################################################### + +all: cuda + +############################################################################### + + +# NVIDIA CUDA COMPILER + +cuda: common-cuda.h + nvcc $(LATTICE) $(CC_CFLAGS) main.cu -o test-cuda-gpu + +# CLEAN + +clean: + rm -rf *.o + diff --git a/common-cuda.h b/common-cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..0883a31a348fdeef0380178d150add1452fae2a7 --- /dev/null +++ b/common-cuda.h @@ -0,0 +1,399 @@ +#include <math.h> +#include <complex.h> +#include <cuda.h> +#include <cuComplex.h> + +// lattice dimensions +// defined in the Makefile +//#define nx 48 +//#define ny 48 +//#define nz 48 +//#define nt 48 + +// defined in the Makefile +//#define DIM_BLOCK_X 12 // This should divide (nx/2) +//#define DIM_BLOCK_Y 4 // This should divide ny +//#define DIM_BLOCK_Z 1 // This should divide nz*nt + +// Number of iterations +#define NITER 10 + +//Decomment this to allocate and store the 3rd su3 matrix line +//#define ALLOCROW3 +//Decomment this to store and read the 3rd su3 matrix line +//#define READROW3 +//If we want to read it we should also allocate it +#ifdef READROW3 + #define ALLOCROW3 +#endif + +#define vol1 nx +#define vol2 (ny * vol1) +#define vol3 (nz * vol2) +#define vol4 (nt * vol3) + +#define nxh (nx >> 1) // nx/2 +#define nyh (ny >> 1) +#define nzh (nz >> 1) +#define nth (nt >> 1) + + +#define size vol4 +#define size2 (2*size) +#define size3 (3*size) + +#define sizeh (size / 2) +#define no_links (4 * vol4) + +#define ALIGN 128 + + +typedef cuDoubleComplex d_complex; + +typedef struct vec3_soa_t { + d_complex c0[sizeh]; + d_complex c1[sizeh]; + d_complex c2[sizeh]; +} vec3_soa; + +typedef struct su3_soa_t { + vec3_soa r0; + vec3_soa r1; +#ifdef ALLOCROW3 + vec3_soa r2; +#endif +} su3_soa; + +typedef struct vec3_t { + d_complex c0; + d_complex c1; + d_complex c2; +} vec3; + + +// Common functions + +inline void checkCUDAError(const char *msg) { + cudaError_t err = cudaGetLastError(); + if ( cudaSuccess != err) { + fprintf(stderr, "ERROR: %s %s.\n", msg, cudaGetErrorString( err) ); + exit(-1); + } +} + +__host__ __device__ static __inline__ uint snum(uint x, uint y, uint z, uint t) { + + uint ris; + + ris = x + (y*vol1) + (z*vol2) + (t*vol3); + + return ris/2; // <--- /2 Pay attention to even/odd (see init_geo) + +} + +__host__ __device__ static __inline__ d_complex myConj(d_complex a) { + + d_complex res; + + res.x = a.x; + res.y = -a.y; + + return res; + +} + +void loadFermionFromFile( vec3_soa * fermion, const char *filename) { + + FILE *fp; + double ar, ai, br, bi, cr, ci; + int i = 0; + int error =0; + + fp = fopen(filename, "rt"); + + if (fp == NULL) { + printf("Could not open file %s \n", filename); + exit(-1); + } + + while ( (i < sizeh) && (!error) ) { + if (fscanf(fp, "(%lf,%lf) (%lf,%lf) (%lf,%lf) \n", &ar, &ai, &br, &bi, &cr, &ci) == 6) { +// fermion->c0[i] = (ar + ai * I); +// fermion->c1[i] = (br + bi * I); +// fermion->c2[i] = (cr + ci * I); + fermion->c0[i] = make_cuDoubleComplex(ar, ai); + fermion->c1[i] = make_cuDoubleComplex(br, bi); + fermion->c2[i] = make_cuDoubleComplex(cr, ci); + } else { + printf("Read error... \n"); + error = 1; + } + //printf("Read line: (%lf,%lf) (%lf,%lf) (%lf,%lf) \n", ar, ai, br, bi, cr, ci); + i++; + } + + printf("Read %d fermions from file %s \n", i, filename); + + fclose(fp); + +} + + +void loadSu3FromFile(su3_soa * u, const char *filename){ + + FILE *fp; + int nx_l, ny_l, nz_l, nt_l, update_iterations; + double beta_l, mass_l, no_flavours_l; + double ar, ai, br, bi, cr, ci; + int idx; + int i = 0; + int j = 0; + int error = 0; + + fp = fopen(filename, "rt"); + + if (fp == NULL) { + printf("Could not open file %s \n", filename); + exit(-1); + } + + + fscanf(fp, "%d %d %d %d %lf %lf %lf %d \n", &nx_l, &ny_l, &nz_l, &nt_l, + &beta_l, &mass_l, &no_flavours_l, + &update_iterations); + + printf("Reading configuration file with header: \n"); + printf("nx_l: %d, ny_l: %d, nz_l: %d, nt_l: %d \n", nx_l, ny_l, nz_l, nt_l); + printf("beta_l: %lf, mass_l: %lf, no_flavours_l: %lf \n", beta_l, mass_l, no_flavours_l); + printf("update_iterations: %d \n", update_iterations); + +// while ( (i < no_links) && (!error) ) { + + while ( (i < sizeh*8) && (!error) ) { + + j = i / sizeh; + idx = i % sizeh; + + if (fscanf(fp, "(%lf,%lf) (%lf,%lf) (%lf,%lf) \n", &ar, &ai, &br, &bi, &cr, &ci) == 6) { + //u[j].r0.c0[idx] = (ar + ai * I); + //u[j].r0.c1[idx] = (br + bi * I); + //u[j].r0.c2[idx] = (cr + ci * I); + u[j].r0.c0[idx] = make_cuDoubleComplex(ar, ai); + u[j].r0.c1[idx] = make_cuDoubleComplex(br, bi); + u[j].r0.c2[idx] = make_cuDoubleComplex(cr, ci); + } else { + printf("Read error... "); + error = 1; + } + + if (fscanf(fp, "(%lf,%lf) (%lf,%lf) (%lf,%lf) \n", &ar, &ai, &br, &bi, &cr, &ci) == 6) { + //u[j].r1.c0[idx] = (ar + ai * I); + //u[j].r1.c1[idx] = (br + bi * I); + //u[j].r1.c2[idx] = (cr + ci * I); + u[j].r1.c0[idx] = make_cuDoubleComplex(ar, ai); + u[j].r1.c1[idx] = make_cuDoubleComplex(br, bi); + u[j].r1.c2[idx] = make_cuDoubleComplex(cr, ci); + } else { + printf("Read error... "); + error = 1; + } + + if (fscanf(fp, "(%lf,%lf) (%lf,%lf) (%lf,%lf) \n", &ar, &ai, &br, &bi, &cr, &ci) == 6) { +#ifdef ALLOCROW3 + //u[j].r2.c0[idx] = (ar + ai * I); + //u[j].r2.c1[idx] = (br + bi * I); + //u[j].r2.c2[idx] = (cr + ci * I); + u[j].r2.c0[idx] = make_cuDoubleComplex(ar, ai); + u[j].r2.c1[idx] = make_cuDoubleComplex(br, bi); + u[j].r2.c2[idx] = make_cuDoubleComplex(cr, ci); +#endif + } else { + printf("Read error... "); + error = 1; + } + + i++; + + } + + printf("Read %d matrices from file %s \n", i*j, filename); + + fclose(fp); + +} + +void loadFermionFromFileNew( vec3_soa * fermion, const char *filename) { + + FILE *fp; + double ar, ai, br, bi, cr, ci; + int i = 0; + int error =0; + + fp = fopen(filename, "rt"); + + if (fp == NULL) { + printf("Could not open file %s \n", filename); + exit(-1); + } + + while ( (i < sizeh) && (!error) ) { + if (fscanf(fp, "%lf %lf\n%lf %lf\n%lf %lf\n", &ar, &ai, &br, &bi, &cr, &ci) == 6) { +// fermion->c0[i] = (ar + ai * I); +// fermion->c1[i] = (br + bi * I); +// fermion->c2[i] = (cr + ci * I); + fermion->c0[i] = make_cuDoubleComplex(ar, ai); + fermion->c1[i] = make_cuDoubleComplex(br, bi); + fermion->c2[i] = make_cuDoubleComplex(cr, ci); + } else { + printf("Read error... \n"); + error = 1; + } + //printf("Read line: (%lf,%lf) (%lf,%lf) (%lf,%lf) \n", ar, ai, br, bi, cr, ci); + i++; + } + + printf("Read %d fermions from file %s \n", i, filename); + + fclose(fp); + +} + + +void loadSu3FromFileNew(su3_soa * u, const char *filename){ + + FILE *fp; + int nx_l, ny_l, nz_l, nt_l, update_iterations; + double ar, ai, br, bi, cr, ci; + int idx; + int i = 0; + int j = 0; + int error = 0; + + fp = fopen(filename, "rt"); + + if (fp == NULL) { + printf("Could not open file %s \n", filename); + exit(-1); + } + + + fscanf(fp, "%d %d %d %d %d \n", &nx_l, &ny_l, &nz_l, &nt_l,&update_iterations); + + printf("Reading configuration file with header: \n"); + printf("nx_l: %d, ny_l: %d, nz_l: %d, nt_l: %d \n", nx_l, ny_l, nz_l, nt_l); + printf("update_iterations: %d \n", update_iterations); + +// while ( (i < no_links) && (!error) ) { + + while ( (i < sizeh*8) && (!error) ) { + + j = i / sizeh; + idx = i % sizeh; + + if (fscanf(fp, "%lf %lf\n%lf %lf\n%lf %lf\n", &ar, &ai, &br, &bi, &cr, &ci) == 6) { + //u[j].r0.c0[idx] = (ar + ai * I); + //u[j].r0.c1[idx] = (br + bi * I); + //u[j].r0.c2[idx] = (cr + ci * I); + u[j].r0.c0[idx] = make_cuDoubleComplex(ar, ai); + u[j].r0.c1[idx] = make_cuDoubleComplex(br, bi); + u[j].r0.c2[idx] = make_cuDoubleComplex(cr, ci); + } else { + printf("Read error... "); + error = 1; + } + + if (fscanf(fp, "%lf %lf\n%lf %lf\n%lf %lf\n", &ar, &ai, &br, &bi, &cr, &ci) == 6) { + //u[j].r1.c0[idx] = (ar + ai * I); + //u[j].r1.c1[idx] = (br + bi * I); + //u[j].r1.c2[idx] = (cr + ci * I); + u[j].r1.c0[idx] = make_cuDoubleComplex(ar, ai); + u[j].r1.c1[idx] = make_cuDoubleComplex(br, bi); + u[j].r1.c2[idx] = make_cuDoubleComplex(cr, ci); + + } else { + printf("Read error... "); + error = 1; + } + + if (fscanf(fp, "%lf %lf\n%lf %lf\n%lf %lf\n", &ar, &ai, &br, &bi, &cr, &ci) == 6) { +#ifdef ALLOCROW3 + //u[j].r2.c0[idx] = (ar + ai * I); + //u[j].r2.c1[idx] = (br + bi * I); + //u[j].r2.c2[idx] = (cr + ci * I); + u[j].r2.c0[idx] = make_cuDoubleComplex(ar, ai); + u[j].r2.c1[idx] = make_cuDoubleComplex(br, bi); + u[j].r2.c2[idx] = make_cuDoubleComplex(cr, ci); +#endif + } else { + printf("Read error... "); + error = 1; + } + + i++; + + } + + printf("Read %d matrices from file %s \n", i, filename); + + fclose(fp); + +} + + + +void writeFermionToFile(vec3_soa * fermion, const char *filename){ + + FILE *fp; + int i = 0; + int error = 0; + + fp = fopen(filename, "w"); + + if (fp == NULL) { + printf("Could not open file %s \n", filename); + exit(-1); + } + + while ( (i < sizeh) && (!error) ) { + +// if (fprintf(fp, "(%lf,%lf) (%lf,%lf) (%lf,%lf) \n", cuCreal(fermion->c0[i]), cuCimag(fermion->c0[i]), +// cuCreal(fermion->c1[i]), cuCimag(fermion->c1[i]), +// cuCreal(fermion->c2[i]), cuCimag(fermion->c2[i])) < 0) { + + if (fprintf(fp, "%lf %lf\n%lf %lf\n%lf %lf\n", cuCreal(fermion->c0[i]), cuCimag(fermion->c0[i]), + cuCreal(fermion->c1[i]), cuCimag(fermion->c1[i]), + cuCreal(fermion->c2[i]), cuCimag(fermion->c2[i])) < 0) { + + printf("Write error... "); + + error = 1; + + } + + i++; + + } + + printf("Wrote %d fermions from file %s \n", i, filename); + + fclose(fp); + +} + + + + + + +// Just for debugging: +void showbits(unsigned int x) { + + int i; + + for(i=(sizeof(int)*8)-1; i>=0; i--) + (x&(1<<i))?putchar('1'):putchar('0'); + + printf("\n"); + +} + diff --git a/main.cu b/main.cu new file mode 100644 index 0000000000000000000000000000000000000000..3f6da376cbdcc2ab86552b07a655a229ae189533 --- /dev/null +++ b/main.cu @@ -0,0 +1,406 @@ +#include <stdio.h> +#include <stdlib.h> +#include <sys/time.h> + +#include "common-cuda.h" + +__host__ __device__ static __inline__ void mat_vec_mul( const __restrict su3_soa * const matrix, + const int idx_mat, + const int eta, + const __restrict vec3_soa * const in_vect, + const int idx_vect, + __restrict vec3 * const out_vect) { + + cuDoubleComplex vec0 = in_vect->c0[idx_vect]; + cuDoubleComplex vec1 = in_vect->c1[idx_vect]; + cuDoubleComplex vec2 = in_vect->c2[idx_vect]; + + cuDoubleComplex mat00 = matrix->r0.c0[idx_mat]; + cuDoubleComplex mat01 = matrix->r0.c1[idx_mat]; + cuDoubleComplex mat02 = matrix->r0.c2[idx_mat]; + + cuDoubleComplex mat10 = matrix->r1.c0[idx_mat]; + cuDoubleComplex mat11 = matrix->r1.c1[idx_mat]; + cuDoubleComplex mat12 = matrix->r1.c2[idx_mat]; + +#ifdef READROW3 +// Load 3rd matrix row from global memory + cuDoubleComplex mat20 = matrix->r2.c0[idx_mat]; + cuDoubleComplex mat21 = matrix->r2.c1[idx_mat]; + cuDoubleComplex mat22 = matrix->r2.c2[idx_mat]; +#else +//Compute 3rd matrix row from the first two + cuDoubleComplex mat20 = cuConj( cuCsub( cuCmul( mat01, mat12 ), cuCmul( mat02, mat11) ) ); + cuDoubleComplex mat21 = cuConj( cuCsub( cuCmul( mat02, mat10 ), cuCmul( mat00, mat12) ) ); + cuDoubleComplex mat22 = cuConj( cuCsub( cuCmul( mat00, mat11 ), cuCmul( mat01, mat10) ) ); +#endif + +//Multiply 3rd row by eta + mat20 = make_cuDoubleComplex(cuCreal(mat20)*eta, cuCimag(mat20)*eta); + mat21 = make_cuDoubleComplex(cuCreal(mat21)*eta, cuCimag(mat21)*eta); + mat22 = make_cuDoubleComplex(cuCreal(mat22)*eta, cuCimag(mat22)*eta); + + out_vect->c0 = cuCadd( cuCadd( cuCmul( mat00, vec0 ), + cuCmul( mat01, vec1 )), + cuCmul( mat02, vec2 )); + + out_vect->c1 = cuCadd( cuCadd( cuCmul( mat10, vec0 ), + cuCmul( mat11, vec1 )), + cuCmul( mat12, vec2 )); + + out_vect->c2 = cuCadd( cuCadd( cuCmul( mat20, vec0 ), + cuCmul( mat21, vec1 )), + cuCmul( mat22, vec2 )); + +} + +__host__ __device__ static __inline__ void conjmat_vec_mul( const __restrict su3_soa * const matrix, + const int idx_mat, + const int eta, + const __restrict vec3_soa * const in_vect, + const int idx_vect, + __restrict vec3 * const out_vect) { + + cuDoubleComplex vec0 = in_vect->c0[idx_vect]; + cuDoubleComplex vec1 = in_vect->c1[idx_vect]; + cuDoubleComplex vec2 = in_vect->c2[idx_vect]; + + cuDoubleComplex mat00 = matrix->r0.c0[idx_mat]; + cuDoubleComplex mat01 = matrix->r0.c1[idx_mat]; + cuDoubleComplex mat02 = matrix->r0.c2[idx_mat]; + + cuDoubleComplex mat10 = matrix->r1.c0[idx_mat]; + cuDoubleComplex mat11 = matrix->r1.c1[idx_mat]; + cuDoubleComplex mat12 = matrix->r1.c2[idx_mat]; + +#ifdef READROW3 +// Load 3rd matrix row from global memory +// cuDoubleComplex mat20 = matrix->r2.c0[idx_mat]; +// cuDoubleComplex mat21 = matrix->r2.c1[idx_mat]; +// cuDoubleComplex mat22 = matrix->r2.c2[idx_mat]; +#else +//Compute 3rd matrix row from the first two + cuDoubleComplex mat20 = cuConj( cuCsub( cuCmul( mat01, mat12 ), cuCmul( mat02, mat11) ) ); + cuDoubleComplex mat21 = cuConj( cuCsub( cuCmul( mat02, mat10 ), cuCmul( mat00, mat12) ) ); + cuDoubleComplex mat22 = cuConj( cuCsub( cuCmul( mat00, mat11 ), cuCmul( mat01, mat10) ) ); +#endif + +//Multiply 3rd row by eta + mat20 = make_cuDoubleComplex(cuCreal(mat20)*eta, cuCimag(mat20)*eta); + mat21 = make_cuDoubleComplex(cuCreal(mat21)*eta, cuCimag(mat21)*eta); + mat22 = make_cuDoubleComplex(cuCreal(mat22)*eta, cuCimag(mat22)*eta); + + out_vect->c0 = cuCadd( cuCadd( cuCmul( cuConj(mat00), vec0 ), + cuCmul( cuConj(mat10), vec1 )), + cuCmul( cuConj(mat20), vec2 )); + + out_vect->c1 = cuCadd( cuCadd( cuCmul( cuConj(mat01), vec0 ), + cuCmul( cuConj(mat11), vec1 )), + cuCmul( cuConj(mat21), vec2 )); + + out_vect->c2 = cuCadd( cuCadd( cuCmul( cuConj(mat02), vec0 ), + cuCmul( cuConj(mat12), vec1 )), + cuCmul( cuConj(mat22), vec2 )); + +} + +__host__ __device__ static __inline__ vec3 sumResult ( vec3 aux, vec3 aux_tmp) { + + aux.c0 = cuCadd ( aux.c0, aux_tmp.c0); + aux.c1 = cuCadd ( aux.c1, aux_tmp.c1); + aux.c2 = cuCadd ( aux.c2, aux_tmp.c2); + + return aux; + +} + +__host__ __device__ static __inline__ vec3 subResult ( vec3 aux, vec3 aux_tmp) { + + aux.c0 = cuCsub ( aux.c0, aux_tmp.c0); + aux.c1 = cuCsub ( aux.c1, aux_tmp.c1); + aux.c2 = cuCsub ( aux.c2, aux_tmp.c2); + + return aux; + +} + +__global__ void Deo(const __restrict su3_soa * const u, __restrict vec3_soa * const out, const __restrict vec3_soa * const in) { + + int x, y, z, t, xm, ym, zm, tm, xp, yp, zp, tp, idxh, eta; //, idx; + + vec3 aux_tmp; + vec3 aux; + + idxh = ((blockIdx.z * blockDim.z + threadIdx.z) * nxh * ny) + + ((blockIdx.y * blockDim.y + threadIdx.y) * nxh) + + (blockIdx.x * blockDim.x + threadIdx.x); // idxh = snum(x,y,z,t) + +// idx = 2*idxh; +// t = (idx / vol3) % nt; +// z = (idx / vol2) % nz; +// y = (blockIdx.y * blockDim.y + threadIdx.y); +// x = 2*(blockIdx.x * blockDim.x + threadIdx.x) + ((y+z+t) % 2); + + t = (blockIdx.z * blockDim.z + threadIdx.z) / nz; + z = (blockIdx.z * blockDim.z + threadIdx.z) % nz; + y = (blockIdx.y * blockDim.y + threadIdx.y); + x = 2*(blockIdx.x * blockDim.x + threadIdx.x) + ((y+z+t) & 0x1); + + xm = x - 1; + xm = xm + (((xm >> 31) & 0x1) * nx); + ym = y -1; + ym = ym + (((ym >> 31) & 0x1) * ny); + zm = z -1; + zm = zm + (((zm >> 31) & 0x1) * nz); + tm = t -1; + tm = tm + (((tm >> 31) & 0x1) * nt); + + xp = (x+1); + xp *= (((xp-nx) >> 31) & 0x1); + yp = (y+1); + yp *= (((yp-ny) >> 31) & 0x1); + zp = (z+1); + zp *= (((zp-nz) >> 31) & 0x1); + tp = (t+1); + tp *= (((tp-nt) >> 31) & 0x1); + + eta = 1; +// mat_vec_mul( &(u_work[snum(x,y,z,t) ]), &(in[snum(xp,y,z,t)]), &aux_tmp ); + mat_vec_mul( &u[0], idxh, eta, in, snum(xp,y,z,t), &aux_tmp ); + aux = aux_tmp; + + eta = 1 - ( 2*(x & 0x1) ); // if (x % 2 = 0) eta = 1 else -1 +// mat_vec_mul( &(u_work[snum(x,y,z,t) + size ]), &(in[snum(x,yp,z,t)]), &aux_tmp ); + mat_vec_mul( &u[2], idxh, eta, in, snum(x,yp,z,t), &aux_tmp ); + aux = sumResult(aux, aux_tmp); + + eta = 1 - ( 2*((x+y) & 0x1) ); +// mat_vec_mul( &(u_work[snum(x,y,z,t) + size2]), &(in[snum(x,y,zp,t)]), &aux_tmp ); + mat_vec_mul( &u[4], idxh, eta, in, snum(x,y,zp,t), &aux_tmp); + aux = sumResult(aux, aux_tmp); + + eta = 1 - ( 2*((x+y+z) & 0x1) ); +// mat_vec_mul( &(u_work[snum(x,y,z,t) + size3]), &(in[snum(x,y,z,tp)]), &aux_tmp ); + mat_vec_mul( &u[6], idxh, eta, in, snum(x,y,z,tp), &aux_tmp ); + aux = sumResult(aux, aux_tmp); + +////////////////////////////////////////////////////////////////////////////////////////////// + + eta = 1; +// conjmat_vec_mul( &(u_work[sizeh + snum(xm,y,z,t) ]), &(in[ snum(xm,y,z,t) ]), &aux_tmp ); + conjmat_vec_mul( &u[1], snum(xm,y,z,t), eta, in, snum(xm,y,z,t), &aux_tmp ); + aux = subResult(aux, aux_tmp); + + eta = 1 - ( 2*(x & 0x1) ); +// conjmat_vec_mul( &(u_work[sizeh + snum(x,ym,z,t) + size ]), &(in[ snum(x,ym,z,t) ]), &aux_tmp ); + conjmat_vec_mul( &u[3], snum(x,ym,z,t), eta, in, snum(x,ym,z,t), &aux_tmp ); + aux = subResult(aux, aux_tmp); + + eta = 1 - ( 2*((x+y) & 0x1) ); +// conjmat_vec_mul( &(u_work[sizeh + snum(x,y,zm,t) + size2]), &(in[ snum(x,y,zm,t) ]), &aux_tmp ); + conjmat_vec_mul( &u[5], snum(x,y,zm,t), eta, in, snum(x,y,zm,t), &aux_tmp ); + aux = subResult(aux, aux_tmp); + + eta = 1 - ( 2*((x+y+z) & 0x1) ); +// conjmat_vec_mul( &(u_work[sizeh + snum(x,y,z,tm) + size3]), &(in[ snum(x,y,z,tm) ]), &aux_tmp ); + conjmat_vec_mul( &u[7], snum(x,y,z,tm), eta, in, snum(x,y,z,tm), &aux_tmp ); + aux = subResult(aux, aux_tmp); + +////////////////////////////////////////////////////////////////////////////////////////////// + + out->c0[idxh] = make_cuDoubleComplex(cuCreal(aux.c0)*0.5, cuCimag(aux.c0)*0.5); + out->c1[idxh] = make_cuDoubleComplex(cuCreal(aux.c1)*0.5, cuCimag(aux.c1)*0.5); + out->c2[idxh] = make_cuDoubleComplex(cuCreal(aux.c2)*0.5, cuCimag(aux.c2)*0.5); + +} + +__global__ void Doe(const __restrict su3_soa * const u, __restrict vec3_soa * const out, const __restrict vec3_soa * const in) { + + int x, y, z, t, xm, ym, zm, tm, xp, yp, zp, tp, idxh, eta; //, idx; + + vec3 aux_tmp; + vec3 aux; + + idxh = ((blockIdx.z * blockDim.z + threadIdx.z) * nxh * ny) + + ((blockIdx.y * blockDim.y + threadIdx.y) * nxh) + + (blockIdx.x * blockDim.x + threadIdx.x); // idxh = snum(x,y,z,t) + +// idx = 2*idxh; +// t = (idx / vol3) % nt; +// z = (idx / vol2) % nz; +// y = (blockIdx.y * blockDim.y + threadIdx.y); +// x = 2*(blockIdx.x * blockDim.x + threadIdx.x) + ((y+z+t+1) % 2); + + t = (blockIdx.z * blockDim.z + threadIdx.z) / nz; + z = (blockIdx.z * blockDim.z + threadIdx.z) % nz; + y = (blockIdx.y * blockDim.y + threadIdx.y); + x = 2*(blockIdx.x * blockDim.x + threadIdx.x) + ((y+z+t+1) & 0x1); + + xm = x - 1; + xm = xm + (((xm >> 31) & 0x1) * nx); + ym = y -1; + ym = ym + (((ym >> 31) & 0x1) * ny); + zm = z -1; + zm = zm + (((zm >> 31) & 0x1) * nz); + tm = t -1; + tm = tm + (((tm >> 31) & 0x1) * nt); + + xp = (x+1); + xp *= (((xp-nx) >> 31) & 0x1); + yp = (y+1); + yp *= (((yp-ny) >> 31) & 0x1); + zp = (z+1); + zp *= (((zp-nz) >> 31) & 0x1); + tp = (t+1); + tp *= (((tp-nt) >> 31) & 0x1); + + eta = 1; +// mat_vec_mul( &(u_work[snum(x,y,z,t) + sizeh ]), &(in[ snum(xp,y,z,t) ]), &aux_tmp ); + mat_vec_mul( &u[1], idxh, eta, in, snum(xp,y,z,t), &aux_tmp ); + aux = aux_tmp; + + eta = 1 - ( 2*(x & 0x1) ); +// mat_vec_mul( &(u_work[snum(x,y,z,t) + sizeh + size ]), &(in[ snum(x,yp,z,t) ]), &aux_tmp ); + mat_vec_mul( &u[3], idxh, eta, in, snum(x,yp,z,t), &aux_tmp ); + aux = sumResult(aux, aux_tmp); + + eta = 1 - ( 2*((x+y) & 0x1) ); +// mat_vec_mul( &( u_work[snum(x,y,z,t) + sizeh + size2]), &(in[ snum(x,y,zp,t) ]), &aux_tmp ); + mat_vec_mul( &u[5], idxh, eta, in, snum(x,y,zp,t), &aux_tmp ); + aux = sumResult(aux, aux_tmp); + + eta = 1 - ( 2*((x+y+z) & 0x1) ); +// mat_vec_mul( &(u_work[snum(x,y,z,t) + sizeh + size3]), &(in[ snum(x,y,z,tp) ]), &aux_tmp ); + mat_vec_mul( &u[7], idxh, eta, in, snum(x,y,z,tp), &aux_tmp ); + aux = sumResult(aux, aux_tmp); + +////////////////////////////////////////////////////////////////////////////////////////////// + + eta = 1; +// conjmat_vec_mul( &(u_work[snum(xm,y,z,t) ]), &(in[ snum(xm,y,z,t) ]), &aux_tmp ); + conjmat_vec_mul( &u[0], snum(xm,y,z,t), eta, in, snum(xm,y,z,t), &aux_tmp ); + aux = subResult(aux, aux_tmp); + + eta = 1 - ( 2*(x & 0x1) ); +// conjmat_vec_mul( &(u_work[snum(x,ym,z,t) + size ]), &(in[ snum(x,ym,z,t) ]), &aux_tmp ); + conjmat_vec_mul( &u[2], snum(x,ym,z,t), eta, in, snum(x,ym,z,t), &aux_tmp ); + aux = subResult(aux, aux_tmp); + + eta = 1 - ( 2*((x+y) & 0x1) ); +// conjmat_vec_mul( &(u_work[snum(x,y,zm,t) + size2]), &(in[ snum(x,y,zm,t) ]), &aux_tmp ); + conjmat_vec_mul( &u[4], snum(x,y,zm,t), eta, in, snum(x,y,zm,t), &aux_tmp ); + aux = subResult(aux, aux_tmp); + + eta = 1 - ( 2*((x+y+z) & 0x1) ); +// conjmat_vec_mul( &(u_work[snum(x,y,z,tm) + size3]), &(in[ snum(x,y,z,tm) ]), &aux_tmp ); + conjmat_vec_mul( &u[6], snum(x,y,z,tm), eta, in, snum(x,y,z,tm), &aux_tmp ); + aux = subResult(aux, aux_tmp); + +////////////////////////////////////////////////////////////////////////////////////////////// + + out->c0[idxh] = make_cuDoubleComplex(cuCreal(aux.c0)*0.5, cuCimag(aux.c0)*0.5); + out->c1[idxh] = make_cuDoubleComplex(cuCreal(aux.c1)*0.5, cuCimag(aux.c1)*0.5); + out->c2[idxh] = make_cuDoubleComplex(cuCreal(aux.c2)*0.5, cuCimag(aux.c2)*0.5); + +} + + +int main() { + + int i; + struct timeval t0, t1; + double dt_tot = 0.0; + + dim3 dimBlockK1 (DIM_BLOCK_X, DIM_BLOCK_Y, DIM_BLOCK_Z); +// dim3 dimGridK1 ((nx*ny*nz*nt/2)/DIM_BLOCK_X, 1, 1 ); + dim3 dimGridK1 ((nx/2)/DIM_BLOCK_X, ny/DIM_BLOCK_Y, (nz*nt)/DIM_BLOCK_Z ); + + if ( ((nx % 2) != 0) || (((nx/2) % DIM_BLOCK_X) != 0) ) { + fprintf(stderr, "ERROR: nx should be even and nx/2 should be divisible by DIM_BLOCK_X."); + return -1; + } + + su3_soa * u_h; + vec3_soa * fermion1_h; + vec3_soa * fermion2_h; + + // 8 = number of directions times 2 (even/odd) + // no_links = sizeh * 8 + posix_memalign((void **)&u_h, ALIGN, 8*sizeof(su3_soa)); + posix_memalign((void **)&fermion1_h, ALIGN, sizeof(vec3_soa)); + posix_memalign((void **)&fermion2_h, ALIGN, sizeof(vec3_soa)); + +// printf("Sizeof su3_soa is: %d \n", sizeof(su3_soa)); +// printf("Sizeof su3_soa_d is: %d \n", sizeof(su3_soa_d)); + + su3_soa * u_d; + vec3_soa * fermion1_d; + vec3_soa * fermion2_d; + + cudaMalloc ((void**)&u_d, 8*sizeof(su3_soa)); + checkCUDAError("Allocating u_d"); + cudaMalloc ((void**)&fermion1_d, sizeof(vec3_soa)); + checkCUDAError("Allocating fermion1_d"); + cudaMalloc ((void**)&fermion2_d, sizeof(vec3_soa)); + checkCUDAError("Allocating fermion2_d"); + + +if ((nx == 32) && (ny == 32) && (nz == 32) && (nt == 32)) { + loadSu3FromFileNew( u_h, "gaugeconf_save_32_4"); + loadFermionFromFileNew(fermion1_h, "test_fermion_32_4"); +} else if ((nx == 16) && (ny == 16) && (nz == 16) && (nt == 16)) { + loadSu3FromFile( u_h, "TestConf_16_4.cnf"); + loadFermionFromFile(fermion1_h, "StartFermion_16_4.fer"); +} else { + fprintf(stdout, "Lattice not available... \n"); + exit(1); +} + + cudaMemcpy( u_d, u_h, 8*sizeof(su3_soa), cudaMemcpyHostToDevice ); + checkCUDAError("Copying u_d to device"); + cudaMemcpy( fermion1_d, fermion1_h, sizeof(vec3_soa), cudaMemcpyHostToDevice ); + checkCUDAError("Copying fermion1_d to device"); + cudaMemcpy( fermion2_d, fermion2_h, sizeof(vec3_soa), cudaMemcpyHostToDevice ); + checkCUDAError("Copying fermion2_d to device"); + + // Prefer larger L1 cache than shared mem + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + // Prefer larger shared mem than L1 cache + // cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); + + gettimeofday ( &t0, NULL ); + + for (i = 0; i < NITER; i++) { + Deo<<< dimGridK1, dimBlockK1 >>>( u_d, fermion2_d, fermion1_d); + checkCUDAError("Running kernel Deo"); + //cudaDeviceSynchronize(); + //checkCUDAError("Cuda synch after Deo"); + Doe<<< dimGridK1, dimBlockK1 >>>( u_d, fermion1_d, fermion2_d); + checkCUDAError("Running kernel Doe"); + //cudaDeviceSynchronize(); + //checkCUDAError("Cuda synch after Doe"); + } + + cudaDeviceSynchronize(); + gettimeofday ( &t1, NULL ); + +// cudaMemcpy( fermion1_h, fermion2_d, sizeof(vec3_soa), cudaMemcpyDeviceToHost ); + cudaMemcpy( fermion1_h, fermion1_d, sizeof(vec3_soa), cudaMemcpyDeviceToHost ); + checkCUDAError("Copying fermion1_d to host"); + + dt_tot = (double)(t1.tv_sec - t0.tv_sec) + ((double)(t1.tv_usec - t0.tv_usec)/1.0e6); + + printf("TOTAL Exec time: Tot time: % 3.2f sec Avg: % 3.02f ms Avg/site: % 3.02f ns\n", + dt_tot, \ + (dt_tot/NITER)*(1.0e3), + ((dt_tot/NITER)/size)*(1.0e9) ); + + writeFermionToFile(fermion1_h, "EndFermion.fer"); + + free(u_h); + free(fermion1_h); + free(fermion2_h); + + return 0; + +}