Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
Dirac Operator
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
COKA
Dirac Operator
Commits
bf34b408
Commit
bf34b408
authored
5 years ago
by
Enrico Calore
Browse files
Options
Downloads
Patches
Plain Diff
Initial commit
parent
d782b427
Branches
master
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
Makefile
+32
-0
32 additions, 0 deletions
Makefile
common-cuda.h
+399
-0
399 additions, 0 deletions
common-cuda.h
main.cu
+406
-0
406 additions, 0 deletions
main.cu
with
837 additions
and
0 deletions
Makefile
0 → 100644
+
32
−
0
View file @
bf34b408
# Define Lattice size and cache blocking size:
#LATTICE=-Dnx=16 -Dny=16 -Dnz=16 -Dnt=16 -DDIM_BLOCK_X=8 -DDIM_BLOCK_Y=8 -DDIM_BLOCK_Z=8 -DDIM_BLOCK_T=4
LATTICE
=
-Dnx
=
32
-Dny
=
32
-Dnz
=
32
-Dnt
=
32
-DDIM_BLOCK_X
=
8
-DDIM_BLOCK_Y
=
8
-DDIM_BLOCK_Z
=
8
-DDIM_BLOCK_T
=
4
###############################################################################
CC
=
nvcc
# K80
#CC_CFLAGS=-O3 --ptxas-options=-v -lineinfo -gencode arch=compute_37,code=sm_37 -keep
# V100
CC_CFLAGS
=
-O3
--ptxas-options
=
-v
-lineinfo
-gencode
arch
=
compute_70,code
=
sm_70
-keep
###############################################################################
all
:
cuda
###############################################################################
# NVIDIA CUDA COMPILER
cuda
:
common-cuda.h
nvcc
$(
LATTICE
)
$(
CC_CFLAGS
)
main.cu
-o
test-cuda-gpu
# CLEAN
clean
:
rm
-rf
*
.o
This diff is collapsed.
Click to expand it.
common-cuda.h
0 → 100644
+
399
−
0
View file @
bf34b408
#include
<math.h>
#include
<complex.h>
#include
<cuda.h>
#include
<cuComplex.h>
// lattice dimensions
// defined in the Makefile
//#define nx 48
//#define ny 48
//#define nz 48
//#define nt 48
// defined in the Makefile
//#define DIM_BLOCK_X 12 // This should divide (nx/2)
//#define DIM_BLOCK_Y 4 // This should divide ny
//#define DIM_BLOCK_Z 1 // This should divide nz*nt
// Number of iterations
#define NITER 10
//Decomment this to allocate and store the 3rd su3 matrix line
//#define ALLOCROW3
//Decomment this to store and read the 3rd su3 matrix line
//#define READROW3
//If we want to read it we should also allocate it
#ifdef READROW3
#define ALLOCROW3
#endif
#define vol1 nx
#define vol2 (ny * vol1)
#define vol3 (nz * vol2)
#define vol4 (nt * vol3)
#define nxh (nx >> 1) // nx/2
#define nyh (ny >> 1)
#define nzh (nz >> 1)
#define nth (nt >> 1)
#define size vol4
#define size2 (2*size)
#define size3 (3*size)
#define sizeh (size / 2)
#define no_links (4 * vol4)
#define ALIGN 128
typedef
cuDoubleComplex
d_complex
;
typedef
struct
vec3_soa_t
{
d_complex
c0
[
sizeh
];
d_complex
c1
[
sizeh
];
d_complex
c2
[
sizeh
];
}
vec3_soa
;
typedef
struct
su3_soa_t
{
vec3_soa
r0
;
vec3_soa
r1
;
#ifdef ALLOCROW3
vec3_soa
r2
;
#endif
}
su3_soa
;
typedef
struct
vec3_t
{
d_complex
c0
;
d_complex
c1
;
d_complex
c2
;
}
vec3
;
// Common functions
inline
void
checkCUDAError
(
const
char
*
msg
)
{
cudaError_t
err
=
cudaGetLastError
();
if
(
cudaSuccess
!=
err
)
{
fprintf
(
stderr
,
"ERROR: %s %s.
\n
"
,
msg
,
cudaGetErrorString
(
err
)
);
exit
(
-
1
);
}
}
__host__
__device__
static
__inline__
uint
snum
(
uint
x
,
uint
y
,
uint
z
,
uint
t
)
{
uint
ris
;
ris
=
x
+
(
y
*
vol1
)
+
(
z
*
vol2
)
+
(
t
*
vol3
);
return
ris
/
2
;
// <--- /2 Pay attention to even/odd (see init_geo)
}
__host__
__device__
static
__inline__
d_complex
myConj
(
d_complex
a
)
{
d_complex
res
;
res
.
x
=
a
.
x
;
res
.
y
=
-
a
.
y
;
return
res
;
}
void
loadFermionFromFile
(
vec3_soa
*
fermion
,
const
char
*
filename
)
{
FILE
*
fp
;
double
ar
,
ai
,
br
,
bi
,
cr
,
ci
;
int
i
=
0
;
int
error
=
0
;
fp
=
fopen
(
filename
,
"rt"
);
if
(
fp
==
NULL
)
{
printf
(
"Could not open file %s
\n
"
,
filename
);
exit
(
-
1
);
}
while
(
(
i
<
sizeh
)
&&
(
!
error
)
)
{
if
(
fscanf
(
fp
,
"(%lf,%lf) (%lf,%lf) (%lf,%lf)
\n
"
,
&
ar
,
&
ai
,
&
br
,
&
bi
,
&
cr
,
&
ci
)
==
6
)
{
// fermion->c0[i] = (ar + ai * I);
// fermion->c1[i] = (br + bi * I);
// fermion->c2[i] = (cr + ci * I);
fermion
->
c0
[
i
]
=
make_cuDoubleComplex
(
ar
,
ai
);
fermion
->
c1
[
i
]
=
make_cuDoubleComplex
(
br
,
bi
);
fermion
->
c2
[
i
]
=
make_cuDoubleComplex
(
cr
,
ci
);
}
else
{
printf
(
"Read error...
\n
"
);
error
=
1
;
}
//printf("Read line: (%lf,%lf) (%lf,%lf) (%lf,%lf) \n", ar, ai, br, bi, cr, ci);
i
++
;
}
printf
(
"Read %d fermions from file %s
\n
"
,
i
,
filename
);
fclose
(
fp
);
}
void
loadSu3FromFile
(
su3_soa
*
u
,
const
char
*
filename
){
FILE
*
fp
;
int
nx_l
,
ny_l
,
nz_l
,
nt_l
,
update_iterations
;
double
beta_l
,
mass_l
,
no_flavours_l
;
double
ar
,
ai
,
br
,
bi
,
cr
,
ci
;
int
idx
;
int
i
=
0
;
int
j
=
0
;
int
error
=
0
;
fp
=
fopen
(
filename
,
"rt"
);
if
(
fp
==
NULL
)
{
printf
(
"Could not open file %s
\n
"
,
filename
);
exit
(
-
1
);
}
fscanf
(
fp
,
"%d %d %d %d %lf %lf %lf %d
\n
"
,
&
nx_l
,
&
ny_l
,
&
nz_l
,
&
nt_l
,
&
beta_l
,
&
mass_l
,
&
no_flavours_l
,
&
update_iterations
);
printf
(
"Reading configuration file with header:
\n
"
);
printf
(
"nx_l: %d, ny_l: %d, nz_l: %d, nt_l: %d
\n
"
,
nx_l
,
ny_l
,
nz_l
,
nt_l
);
printf
(
"beta_l: %lf, mass_l: %lf, no_flavours_l: %lf
\n
"
,
beta_l
,
mass_l
,
no_flavours_l
);
printf
(
"update_iterations: %d
\n
"
,
update_iterations
);
// while ( (i < no_links) && (!error) ) {
while
(
(
i
<
sizeh
*
8
)
&&
(
!
error
)
)
{
j
=
i
/
sizeh
;
idx
=
i
%
sizeh
;
if
(
fscanf
(
fp
,
"(%lf,%lf) (%lf,%lf) (%lf,%lf)
\n
"
,
&
ar
,
&
ai
,
&
br
,
&
bi
,
&
cr
,
&
ci
)
==
6
)
{
//u[j].r0.c0[idx] = (ar + ai * I);
//u[j].r0.c1[idx] = (br + bi * I);
//u[j].r0.c2[idx] = (cr + ci * I);
u
[
j
].
r0
.
c0
[
idx
]
=
make_cuDoubleComplex
(
ar
,
ai
);
u
[
j
].
r0
.
c1
[
idx
]
=
make_cuDoubleComplex
(
br
,
bi
);
u
[
j
].
r0
.
c2
[
idx
]
=
make_cuDoubleComplex
(
cr
,
ci
);
}
else
{
printf
(
"Read error... "
);
error
=
1
;
}
if
(
fscanf
(
fp
,
"(%lf,%lf) (%lf,%lf) (%lf,%lf)
\n
"
,
&
ar
,
&
ai
,
&
br
,
&
bi
,
&
cr
,
&
ci
)
==
6
)
{
//u[j].r1.c0[idx] = (ar + ai * I);
//u[j].r1.c1[idx] = (br + bi * I);
//u[j].r1.c2[idx] = (cr + ci * I);
u
[
j
].
r1
.
c0
[
idx
]
=
make_cuDoubleComplex
(
ar
,
ai
);
u
[
j
].
r1
.
c1
[
idx
]
=
make_cuDoubleComplex
(
br
,
bi
);
u
[
j
].
r1
.
c2
[
idx
]
=
make_cuDoubleComplex
(
cr
,
ci
);
}
else
{
printf
(
"Read error... "
);
error
=
1
;
}
if
(
fscanf
(
fp
,
"(%lf,%lf) (%lf,%lf) (%lf,%lf)
\n
"
,
&
ar
,
&
ai
,
&
br
,
&
bi
,
&
cr
,
&
ci
)
==
6
)
{
#ifdef ALLOCROW3
//u[j].r2.c0[idx] = (ar + ai * I);
//u[j].r2.c1[idx] = (br + bi * I);
//u[j].r2.c2[idx] = (cr + ci * I);
u
[
j
].
r2
.
c0
[
idx
]
=
make_cuDoubleComplex
(
ar
,
ai
);
u
[
j
].
r2
.
c1
[
idx
]
=
make_cuDoubleComplex
(
br
,
bi
);
u
[
j
].
r2
.
c2
[
idx
]
=
make_cuDoubleComplex
(
cr
,
ci
);
#endif
}
else
{
printf
(
"Read error... "
);
error
=
1
;
}
i
++
;
}
printf
(
"Read %d matrices from file %s
\n
"
,
i
*
j
,
filename
);
fclose
(
fp
);
}
void
loadFermionFromFileNew
(
vec3_soa
*
fermion
,
const
char
*
filename
)
{
FILE
*
fp
;
double
ar
,
ai
,
br
,
bi
,
cr
,
ci
;
int
i
=
0
;
int
error
=
0
;
fp
=
fopen
(
filename
,
"rt"
);
if
(
fp
==
NULL
)
{
printf
(
"Could not open file %s
\n
"
,
filename
);
exit
(
-
1
);
}
while
(
(
i
<
sizeh
)
&&
(
!
error
)
)
{
if
(
fscanf
(
fp
,
"%lf %lf
\n
%lf %lf
\n
%lf %lf
\n
"
,
&
ar
,
&
ai
,
&
br
,
&
bi
,
&
cr
,
&
ci
)
==
6
)
{
// fermion->c0[i] = (ar + ai * I);
// fermion->c1[i] = (br + bi * I);
// fermion->c2[i] = (cr + ci * I);
fermion
->
c0
[
i
]
=
make_cuDoubleComplex
(
ar
,
ai
);
fermion
->
c1
[
i
]
=
make_cuDoubleComplex
(
br
,
bi
);
fermion
->
c2
[
i
]
=
make_cuDoubleComplex
(
cr
,
ci
);
}
else
{
printf
(
"Read error...
\n
"
);
error
=
1
;
}
//printf("Read line: (%lf,%lf) (%lf,%lf) (%lf,%lf) \n", ar, ai, br, bi, cr, ci);
i
++
;
}
printf
(
"Read %d fermions from file %s
\n
"
,
i
,
filename
);
fclose
(
fp
);
}
void
loadSu3FromFileNew
(
su3_soa
*
u
,
const
char
*
filename
){
FILE
*
fp
;
int
nx_l
,
ny_l
,
nz_l
,
nt_l
,
update_iterations
;
double
ar
,
ai
,
br
,
bi
,
cr
,
ci
;
int
idx
;
int
i
=
0
;
int
j
=
0
;
int
error
=
0
;
fp
=
fopen
(
filename
,
"rt"
);
if
(
fp
==
NULL
)
{
printf
(
"Could not open file %s
\n
"
,
filename
);
exit
(
-
1
);
}
fscanf
(
fp
,
"%d %d %d %d %d
\n
"
,
&
nx_l
,
&
ny_l
,
&
nz_l
,
&
nt_l
,
&
update_iterations
);
printf
(
"Reading configuration file with header:
\n
"
);
printf
(
"nx_l: %d, ny_l: %d, nz_l: %d, nt_l: %d
\n
"
,
nx_l
,
ny_l
,
nz_l
,
nt_l
);
printf
(
"update_iterations: %d
\n
"
,
update_iterations
);
// while ( (i < no_links) && (!error) ) {
while
(
(
i
<
sizeh
*
8
)
&&
(
!
error
)
)
{
j
=
i
/
sizeh
;
idx
=
i
%
sizeh
;
if
(
fscanf
(
fp
,
"%lf %lf
\n
%lf %lf
\n
%lf %lf
\n
"
,
&
ar
,
&
ai
,
&
br
,
&
bi
,
&
cr
,
&
ci
)
==
6
)
{
//u[j].r0.c0[idx] = (ar + ai * I);
//u[j].r0.c1[idx] = (br + bi * I);
//u[j].r0.c2[idx] = (cr + ci * I);
u
[
j
].
r0
.
c0
[
idx
]
=
make_cuDoubleComplex
(
ar
,
ai
);
u
[
j
].
r0
.
c1
[
idx
]
=
make_cuDoubleComplex
(
br
,
bi
);
u
[
j
].
r0
.
c2
[
idx
]
=
make_cuDoubleComplex
(
cr
,
ci
);
}
else
{
printf
(
"Read error... "
);
error
=
1
;
}
if
(
fscanf
(
fp
,
"%lf %lf
\n
%lf %lf
\n
%lf %lf
\n
"
,
&
ar
,
&
ai
,
&
br
,
&
bi
,
&
cr
,
&
ci
)
==
6
)
{
//u[j].r1.c0[idx] = (ar + ai * I);
//u[j].r1.c1[idx] = (br + bi * I);
//u[j].r1.c2[idx] = (cr + ci * I);
u
[
j
].
r1
.
c0
[
idx
]
=
make_cuDoubleComplex
(
ar
,
ai
);
u
[
j
].
r1
.
c1
[
idx
]
=
make_cuDoubleComplex
(
br
,
bi
);
u
[
j
].
r1
.
c2
[
idx
]
=
make_cuDoubleComplex
(
cr
,
ci
);
}
else
{
printf
(
"Read error... "
);
error
=
1
;
}
if
(
fscanf
(
fp
,
"%lf %lf
\n
%lf %lf
\n
%lf %lf
\n
"
,
&
ar
,
&
ai
,
&
br
,
&
bi
,
&
cr
,
&
ci
)
==
6
)
{
#ifdef ALLOCROW3
//u[j].r2.c0[idx] = (ar + ai * I);
//u[j].r2.c1[idx] = (br + bi * I);
//u[j].r2.c2[idx] = (cr + ci * I);
u
[
j
].
r2
.
c0
[
idx
]
=
make_cuDoubleComplex
(
ar
,
ai
);
u
[
j
].
r2
.
c1
[
idx
]
=
make_cuDoubleComplex
(
br
,
bi
);
u
[
j
].
r2
.
c2
[
idx
]
=
make_cuDoubleComplex
(
cr
,
ci
);
#endif
}
else
{
printf
(
"Read error... "
);
error
=
1
;
}
i
++
;
}
printf
(
"Read %d matrices from file %s
\n
"
,
i
,
filename
);
fclose
(
fp
);
}
void
writeFermionToFile
(
vec3_soa
*
fermion
,
const
char
*
filename
){
FILE
*
fp
;
int
i
=
0
;
int
error
=
0
;
fp
=
fopen
(
filename
,
"w"
);
if
(
fp
==
NULL
)
{
printf
(
"Could not open file %s
\n
"
,
filename
);
exit
(
-
1
);
}
while
(
(
i
<
sizeh
)
&&
(
!
error
)
)
{
// if (fprintf(fp, "(%lf,%lf) (%lf,%lf) (%lf,%lf) \n", cuCreal(fermion->c0[i]), cuCimag(fermion->c0[i]),
// cuCreal(fermion->c1[i]), cuCimag(fermion->c1[i]),
// cuCreal(fermion->c2[i]), cuCimag(fermion->c2[i])) < 0) {
if
(
fprintf
(
fp
,
"%lf %lf
\n
%lf %lf
\n
%lf %lf
\n
"
,
cuCreal
(
fermion
->
c0
[
i
]),
cuCimag
(
fermion
->
c0
[
i
]),
cuCreal
(
fermion
->
c1
[
i
]),
cuCimag
(
fermion
->
c1
[
i
]),
cuCreal
(
fermion
->
c2
[
i
]),
cuCimag
(
fermion
->
c2
[
i
]))
<
0
)
{
printf
(
"Write error... "
);
error
=
1
;
}
i
++
;
}
printf
(
"Wrote %d fermions from file %s
\n
"
,
i
,
filename
);
fclose
(
fp
);
}
// Just for debugging:
void
showbits
(
unsigned
int
x
)
{
int
i
;
for
(
i
=
(
sizeof
(
int
)
*
8
)
-
1
;
i
>=
0
;
i
--
)
(
x
&
(
1
<<
i
))
?
putchar
(
'1'
)
:
putchar
(
'0'
);
printf
(
"
\n
"
);
}
This diff is collapsed.
Click to expand it.
main.cu
0 → 100644
+
406
−
0
View file @
bf34b408
#include
<stdio.h>
#include
<stdlib.h>
#include
<sys/time.h>
#include
"common-cuda.h"
__host__
__device__
static
__inline__
void
mat_vec_mul
(
const
__restrict
su3_soa
*
const
matrix
,
const
int
idx_mat
,
const
int
eta
,
const
__restrict
vec3_soa
*
const
in_vect
,
const
int
idx_vect
,
__restrict
vec3
*
const
out_vect
)
{
cuDoubleComplex
vec0
=
in_vect
->
c0
[
idx_vect
];
cuDoubleComplex
vec1
=
in_vect
->
c1
[
idx_vect
];
cuDoubleComplex
vec2
=
in_vect
->
c2
[
idx_vect
];
cuDoubleComplex
mat00
=
matrix
->
r0
.
c0
[
idx_mat
];
cuDoubleComplex
mat01
=
matrix
->
r0
.
c1
[
idx_mat
];
cuDoubleComplex
mat02
=
matrix
->
r0
.
c2
[
idx_mat
];
cuDoubleComplex
mat10
=
matrix
->
r1
.
c0
[
idx_mat
];
cuDoubleComplex
mat11
=
matrix
->
r1
.
c1
[
idx_mat
];
cuDoubleComplex
mat12
=
matrix
->
r1
.
c2
[
idx_mat
];
#ifdef READROW3
// Load 3rd matrix row from global memory
cuDoubleComplex
mat20
=
matrix
->
r2
.
c0
[
idx_mat
];
cuDoubleComplex
mat21
=
matrix
->
r2
.
c1
[
idx_mat
];
cuDoubleComplex
mat22
=
matrix
->
r2
.
c2
[
idx_mat
];
#else
//Compute 3rd matrix row from the first two
cuDoubleComplex
mat20
=
cuConj
(
cuCsub
(
cuCmul
(
mat01
,
mat12
),
cuCmul
(
mat02
,
mat11
)
)
);
cuDoubleComplex
mat21
=
cuConj
(
cuCsub
(
cuCmul
(
mat02
,
mat10
),
cuCmul
(
mat00
,
mat12
)
)
);
cuDoubleComplex
mat22
=
cuConj
(
cuCsub
(
cuCmul
(
mat00
,
mat11
),
cuCmul
(
mat01
,
mat10
)
)
);
#endif
//Multiply 3rd row by eta
mat20
=
make_cuDoubleComplex
(
cuCreal
(
mat20
)
*
eta
,
cuCimag
(
mat20
)
*
eta
);
mat21
=
make_cuDoubleComplex
(
cuCreal
(
mat21
)
*
eta
,
cuCimag
(
mat21
)
*
eta
);
mat22
=
make_cuDoubleComplex
(
cuCreal
(
mat22
)
*
eta
,
cuCimag
(
mat22
)
*
eta
);
out_vect
->
c0
=
cuCadd
(
cuCadd
(
cuCmul
(
mat00
,
vec0
),
cuCmul
(
mat01
,
vec1
)),
cuCmul
(
mat02
,
vec2
));
out_vect
->
c1
=
cuCadd
(
cuCadd
(
cuCmul
(
mat10
,
vec0
),
cuCmul
(
mat11
,
vec1
)),
cuCmul
(
mat12
,
vec2
));
out_vect
->
c2
=
cuCadd
(
cuCadd
(
cuCmul
(
mat20
,
vec0
),
cuCmul
(
mat21
,
vec1
)),
cuCmul
(
mat22
,
vec2
));
}
__host__
__device__
static
__inline__
void
conjmat_vec_mul
(
const
__restrict
su3_soa
*
const
matrix
,
const
int
idx_mat
,
const
int
eta
,
const
__restrict
vec3_soa
*
const
in_vect
,
const
int
idx_vect
,
__restrict
vec3
*
const
out_vect
)
{
cuDoubleComplex
vec0
=
in_vect
->
c0
[
idx_vect
];
cuDoubleComplex
vec1
=
in_vect
->
c1
[
idx_vect
];
cuDoubleComplex
vec2
=
in_vect
->
c2
[
idx_vect
];
cuDoubleComplex
mat00
=
matrix
->
r0
.
c0
[
idx_mat
];
cuDoubleComplex
mat01
=
matrix
->
r0
.
c1
[
idx_mat
];
cuDoubleComplex
mat02
=
matrix
->
r0
.
c2
[
idx_mat
];
cuDoubleComplex
mat10
=
matrix
->
r1
.
c0
[
idx_mat
];
cuDoubleComplex
mat11
=
matrix
->
r1
.
c1
[
idx_mat
];
cuDoubleComplex
mat12
=
matrix
->
r1
.
c2
[
idx_mat
];
#ifdef READROW3
// Load 3rd matrix row from global memory
// cuDoubleComplex mat20 = matrix->r2.c0[idx_mat];
// cuDoubleComplex mat21 = matrix->r2.c1[idx_mat];
// cuDoubleComplex mat22 = matrix->r2.c2[idx_mat];
#else
//Compute 3rd matrix row from the first two
cuDoubleComplex
mat20
=
cuConj
(
cuCsub
(
cuCmul
(
mat01
,
mat12
),
cuCmul
(
mat02
,
mat11
)
)
);
cuDoubleComplex
mat21
=
cuConj
(
cuCsub
(
cuCmul
(
mat02
,
mat10
),
cuCmul
(
mat00
,
mat12
)
)
);
cuDoubleComplex
mat22
=
cuConj
(
cuCsub
(
cuCmul
(
mat00
,
mat11
),
cuCmul
(
mat01
,
mat10
)
)
);
#endif
//Multiply 3rd row by eta
mat20
=
make_cuDoubleComplex
(
cuCreal
(
mat20
)
*
eta
,
cuCimag
(
mat20
)
*
eta
);
mat21
=
make_cuDoubleComplex
(
cuCreal
(
mat21
)
*
eta
,
cuCimag
(
mat21
)
*
eta
);
mat22
=
make_cuDoubleComplex
(
cuCreal
(
mat22
)
*
eta
,
cuCimag
(
mat22
)
*
eta
);
out_vect
->
c0
=
cuCadd
(
cuCadd
(
cuCmul
(
cuConj
(
mat00
),
vec0
),
cuCmul
(
cuConj
(
mat10
),
vec1
)),
cuCmul
(
cuConj
(
mat20
),
vec2
));
out_vect
->
c1
=
cuCadd
(
cuCadd
(
cuCmul
(
cuConj
(
mat01
),
vec0
),
cuCmul
(
cuConj
(
mat11
),
vec1
)),
cuCmul
(
cuConj
(
mat21
),
vec2
));
out_vect
->
c2
=
cuCadd
(
cuCadd
(
cuCmul
(
cuConj
(
mat02
),
vec0
),
cuCmul
(
cuConj
(
mat12
),
vec1
)),
cuCmul
(
cuConj
(
mat22
),
vec2
));
}
__host__
__device__
static
__inline__
vec3
sumResult
(
vec3
aux
,
vec3
aux_tmp
)
{
aux
.
c0
=
cuCadd
(
aux
.
c0
,
aux_tmp
.
c0
);
aux
.
c1
=
cuCadd
(
aux
.
c1
,
aux_tmp
.
c1
);
aux
.
c2
=
cuCadd
(
aux
.
c2
,
aux_tmp
.
c2
);
return
aux
;
}
__host__
__device__
static
__inline__
vec3
subResult
(
vec3
aux
,
vec3
aux_tmp
)
{
aux
.
c0
=
cuCsub
(
aux
.
c0
,
aux_tmp
.
c0
);
aux
.
c1
=
cuCsub
(
aux
.
c1
,
aux_tmp
.
c1
);
aux
.
c2
=
cuCsub
(
aux
.
c2
,
aux_tmp
.
c2
);
return
aux
;
}
__global__
void
Deo
(
const
__restrict
su3_soa
*
const
u
,
__restrict
vec3_soa
*
const
out
,
const
__restrict
vec3_soa
*
const
in
)
{
int
x
,
y
,
z
,
t
,
xm
,
ym
,
zm
,
tm
,
xp
,
yp
,
zp
,
tp
,
idxh
,
eta
;
//, idx;
vec3
aux_tmp
;
vec3
aux
;
idxh
=
((
blockIdx
.
z
*
blockDim
.
z
+
threadIdx
.
z
)
*
nxh
*
ny
)
+
((
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
)
*
nxh
)
+
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
// idxh = snum(x,y,z,t)
// idx = 2*idxh;
// t = (idx / vol3) % nt;
// z = (idx / vol2) % nz;
// y = (blockIdx.y * blockDim.y + threadIdx.y);
// x = 2*(blockIdx.x * blockDim.x + threadIdx.x) + ((y+z+t) % 2);
t
=
(
blockIdx
.
z
*
blockDim
.
z
+
threadIdx
.
z
)
/
nz
;
z
=
(
blockIdx
.
z
*
blockDim
.
z
+
threadIdx
.
z
)
%
nz
;
y
=
(
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
);
x
=
2
*
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
+
((
y
+
z
+
t
)
&
0x1
);
xm
=
x
-
1
;
xm
=
xm
+
(((
xm
>>
31
)
&
0x1
)
*
nx
);
ym
=
y
-
1
;
ym
=
ym
+
(((
ym
>>
31
)
&
0x1
)
*
ny
);
zm
=
z
-
1
;
zm
=
zm
+
(((
zm
>>
31
)
&
0x1
)
*
nz
);
tm
=
t
-
1
;
tm
=
tm
+
(((
tm
>>
31
)
&
0x1
)
*
nt
);
xp
=
(
x
+
1
);
xp
*=
(((
xp
-
nx
)
>>
31
)
&
0x1
);
yp
=
(
y
+
1
);
yp
*=
(((
yp
-
ny
)
>>
31
)
&
0x1
);
zp
=
(
z
+
1
);
zp
*=
(((
zp
-
nz
)
>>
31
)
&
0x1
);
tp
=
(
t
+
1
);
tp
*=
(((
tp
-
nt
)
>>
31
)
&
0x1
);
eta
=
1
;
// mat_vec_mul( &(u_work[snum(x,y,z,t) ]), &(in[snum(xp,y,z,t)]), &aux_tmp );
mat_vec_mul
(
&
u
[
0
],
idxh
,
eta
,
in
,
snum
(
xp
,
y
,
z
,
t
),
&
aux_tmp
);
aux
=
aux_tmp
;
eta
=
1
-
(
2
*
(
x
&
0x1
)
);
// if (x % 2 = 0) eta = 1 else -1
// mat_vec_mul( &(u_work[snum(x,y,z,t) + size ]), &(in[snum(x,yp,z,t)]), &aux_tmp );
mat_vec_mul
(
&
u
[
2
],
idxh
,
eta
,
in
,
snum
(
x
,
yp
,
z
,
t
),
&
aux_tmp
);
aux
=
sumResult
(
aux
,
aux_tmp
);
eta
=
1
-
(
2
*
((
x
+
y
)
&
0x1
)
);
// mat_vec_mul( &(u_work[snum(x,y,z,t) + size2]), &(in[snum(x,y,zp,t)]), &aux_tmp );
mat_vec_mul
(
&
u
[
4
],
idxh
,
eta
,
in
,
snum
(
x
,
y
,
zp
,
t
),
&
aux_tmp
);
aux
=
sumResult
(
aux
,
aux_tmp
);
eta
=
1
-
(
2
*
((
x
+
y
+
z
)
&
0x1
)
);
// mat_vec_mul( &(u_work[snum(x,y,z,t) + size3]), &(in[snum(x,y,z,tp)]), &aux_tmp );
mat_vec_mul
(
&
u
[
6
],
idxh
,
eta
,
in
,
snum
(
x
,
y
,
z
,
tp
),
&
aux_tmp
);
aux
=
sumResult
(
aux
,
aux_tmp
);
//////////////////////////////////////////////////////////////////////////////////////////////
eta
=
1
;
// conjmat_vec_mul( &(u_work[sizeh + snum(xm,y,z,t) ]), &(in[ snum(xm,y,z,t) ]), &aux_tmp );
conjmat_vec_mul
(
&
u
[
1
],
snum
(
xm
,
y
,
z
,
t
),
eta
,
in
,
snum
(
xm
,
y
,
z
,
t
),
&
aux_tmp
);
aux
=
subResult
(
aux
,
aux_tmp
);
eta
=
1
-
(
2
*
(
x
&
0x1
)
);
// conjmat_vec_mul( &(u_work[sizeh + snum(x,ym,z,t) + size ]), &(in[ snum(x,ym,z,t) ]), &aux_tmp );
conjmat_vec_mul
(
&
u
[
3
],
snum
(
x
,
ym
,
z
,
t
),
eta
,
in
,
snum
(
x
,
ym
,
z
,
t
),
&
aux_tmp
);
aux
=
subResult
(
aux
,
aux_tmp
);
eta
=
1
-
(
2
*
((
x
+
y
)
&
0x1
)
);
// conjmat_vec_mul( &(u_work[sizeh + snum(x,y,zm,t) + size2]), &(in[ snum(x,y,zm,t) ]), &aux_tmp );
conjmat_vec_mul
(
&
u
[
5
],
snum
(
x
,
y
,
zm
,
t
),
eta
,
in
,
snum
(
x
,
y
,
zm
,
t
),
&
aux_tmp
);
aux
=
subResult
(
aux
,
aux_tmp
);
eta
=
1
-
(
2
*
((
x
+
y
+
z
)
&
0x1
)
);
// conjmat_vec_mul( &(u_work[sizeh + snum(x,y,z,tm) + size3]), &(in[ snum(x,y,z,tm) ]), &aux_tmp );
conjmat_vec_mul
(
&
u
[
7
],
snum
(
x
,
y
,
z
,
tm
),
eta
,
in
,
snum
(
x
,
y
,
z
,
tm
),
&
aux_tmp
);
aux
=
subResult
(
aux
,
aux_tmp
);
//////////////////////////////////////////////////////////////////////////////////////////////
out
->
c0
[
idxh
]
=
make_cuDoubleComplex
(
cuCreal
(
aux
.
c0
)
*
0.5
,
cuCimag
(
aux
.
c0
)
*
0.5
);
out
->
c1
[
idxh
]
=
make_cuDoubleComplex
(
cuCreal
(
aux
.
c1
)
*
0.5
,
cuCimag
(
aux
.
c1
)
*
0.5
);
out
->
c2
[
idxh
]
=
make_cuDoubleComplex
(
cuCreal
(
aux
.
c2
)
*
0.5
,
cuCimag
(
aux
.
c2
)
*
0.5
);
}
__global__
void
Doe
(
const
__restrict
su3_soa
*
const
u
,
__restrict
vec3_soa
*
const
out
,
const
__restrict
vec3_soa
*
const
in
)
{
int
x
,
y
,
z
,
t
,
xm
,
ym
,
zm
,
tm
,
xp
,
yp
,
zp
,
tp
,
idxh
,
eta
;
//, idx;
vec3
aux_tmp
;
vec3
aux
;
idxh
=
((
blockIdx
.
z
*
blockDim
.
z
+
threadIdx
.
z
)
*
nxh
*
ny
)
+
((
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
)
*
nxh
)
+
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
// idxh = snum(x,y,z,t)
// idx = 2*idxh;
// t = (idx / vol3) % nt;
// z = (idx / vol2) % nz;
// y = (blockIdx.y * blockDim.y + threadIdx.y);
// x = 2*(blockIdx.x * blockDim.x + threadIdx.x) + ((y+z+t+1) % 2);
t
=
(
blockIdx
.
z
*
blockDim
.
z
+
threadIdx
.
z
)
/
nz
;
z
=
(
blockIdx
.
z
*
blockDim
.
z
+
threadIdx
.
z
)
%
nz
;
y
=
(
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
);
x
=
2
*
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
+
((
y
+
z
+
t
+
1
)
&
0x1
);
xm
=
x
-
1
;
xm
=
xm
+
(((
xm
>>
31
)
&
0x1
)
*
nx
);
ym
=
y
-
1
;
ym
=
ym
+
(((
ym
>>
31
)
&
0x1
)
*
ny
);
zm
=
z
-
1
;
zm
=
zm
+
(((
zm
>>
31
)
&
0x1
)
*
nz
);
tm
=
t
-
1
;
tm
=
tm
+
(((
tm
>>
31
)
&
0x1
)
*
nt
);
xp
=
(
x
+
1
);
xp
*=
(((
xp
-
nx
)
>>
31
)
&
0x1
);
yp
=
(
y
+
1
);
yp
*=
(((
yp
-
ny
)
>>
31
)
&
0x1
);
zp
=
(
z
+
1
);
zp
*=
(((
zp
-
nz
)
>>
31
)
&
0x1
);
tp
=
(
t
+
1
);
tp
*=
(((
tp
-
nt
)
>>
31
)
&
0x1
);
eta
=
1
;
// mat_vec_mul( &(u_work[snum(x,y,z,t) + sizeh ]), &(in[ snum(xp,y,z,t) ]), &aux_tmp );
mat_vec_mul
(
&
u
[
1
],
idxh
,
eta
,
in
,
snum
(
xp
,
y
,
z
,
t
),
&
aux_tmp
);
aux
=
aux_tmp
;
eta
=
1
-
(
2
*
(
x
&
0x1
)
);
// mat_vec_mul( &(u_work[snum(x,y,z,t) + sizeh + size ]), &(in[ snum(x,yp,z,t) ]), &aux_tmp );
mat_vec_mul
(
&
u
[
3
],
idxh
,
eta
,
in
,
snum
(
x
,
yp
,
z
,
t
),
&
aux_tmp
);
aux
=
sumResult
(
aux
,
aux_tmp
);
eta
=
1
-
(
2
*
((
x
+
y
)
&
0x1
)
);
// mat_vec_mul( &( u_work[snum(x,y,z,t) + sizeh + size2]), &(in[ snum(x,y,zp,t) ]), &aux_tmp );
mat_vec_mul
(
&
u
[
5
],
idxh
,
eta
,
in
,
snum
(
x
,
y
,
zp
,
t
),
&
aux_tmp
);
aux
=
sumResult
(
aux
,
aux_tmp
);
eta
=
1
-
(
2
*
((
x
+
y
+
z
)
&
0x1
)
);
// mat_vec_mul( &(u_work[snum(x,y,z,t) + sizeh + size3]), &(in[ snum(x,y,z,tp) ]), &aux_tmp );
mat_vec_mul
(
&
u
[
7
],
idxh
,
eta
,
in
,
snum
(
x
,
y
,
z
,
tp
),
&
aux_tmp
);
aux
=
sumResult
(
aux
,
aux_tmp
);
//////////////////////////////////////////////////////////////////////////////////////////////
eta
=
1
;
// conjmat_vec_mul( &(u_work[snum(xm,y,z,t) ]), &(in[ snum(xm,y,z,t) ]), &aux_tmp );
conjmat_vec_mul
(
&
u
[
0
],
snum
(
xm
,
y
,
z
,
t
),
eta
,
in
,
snum
(
xm
,
y
,
z
,
t
),
&
aux_tmp
);
aux
=
subResult
(
aux
,
aux_tmp
);
eta
=
1
-
(
2
*
(
x
&
0x1
)
);
// conjmat_vec_mul( &(u_work[snum(x,ym,z,t) + size ]), &(in[ snum(x,ym,z,t) ]), &aux_tmp );
conjmat_vec_mul
(
&
u
[
2
],
snum
(
x
,
ym
,
z
,
t
),
eta
,
in
,
snum
(
x
,
ym
,
z
,
t
),
&
aux_tmp
);
aux
=
subResult
(
aux
,
aux_tmp
);
eta
=
1
-
(
2
*
((
x
+
y
)
&
0x1
)
);
// conjmat_vec_mul( &(u_work[snum(x,y,zm,t) + size2]), &(in[ snum(x,y,zm,t) ]), &aux_tmp );
conjmat_vec_mul
(
&
u
[
4
],
snum
(
x
,
y
,
zm
,
t
),
eta
,
in
,
snum
(
x
,
y
,
zm
,
t
),
&
aux_tmp
);
aux
=
subResult
(
aux
,
aux_tmp
);
eta
=
1
-
(
2
*
((
x
+
y
+
z
)
&
0x1
)
);
// conjmat_vec_mul( &(u_work[snum(x,y,z,tm) + size3]), &(in[ snum(x,y,z,tm) ]), &aux_tmp );
conjmat_vec_mul
(
&
u
[
6
],
snum
(
x
,
y
,
z
,
tm
),
eta
,
in
,
snum
(
x
,
y
,
z
,
tm
),
&
aux_tmp
);
aux
=
subResult
(
aux
,
aux_tmp
);
//////////////////////////////////////////////////////////////////////////////////////////////
out
->
c0
[
idxh
]
=
make_cuDoubleComplex
(
cuCreal
(
aux
.
c0
)
*
0.5
,
cuCimag
(
aux
.
c0
)
*
0.5
);
out
->
c1
[
idxh
]
=
make_cuDoubleComplex
(
cuCreal
(
aux
.
c1
)
*
0.5
,
cuCimag
(
aux
.
c1
)
*
0.5
);
out
->
c2
[
idxh
]
=
make_cuDoubleComplex
(
cuCreal
(
aux
.
c2
)
*
0.5
,
cuCimag
(
aux
.
c2
)
*
0.5
);
}
int
main
()
{
int
i
;
struct
timeval
t0
,
t1
;
double
dt_tot
=
0.0
;
dim3
dimBlockK1
(
DIM_BLOCK_X
,
DIM_BLOCK_Y
,
DIM_BLOCK_Z
);
// dim3 dimGridK1 ((nx*ny*nz*nt/2)/DIM_BLOCK_X, 1, 1 );
dim3
dimGridK1
((
nx
/
2
)
/
DIM_BLOCK_X
,
ny
/
DIM_BLOCK_Y
,
(
nz
*
nt
)
/
DIM_BLOCK_Z
);
if
(
((
nx
%
2
)
!=
0
)
||
(((
nx
/
2
)
%
DIM_BLOCK_X
)
!=
0
)
)
{
fprintf
(
stderr
,
"ERROR: nx should be even and nx/2 should be divisible by DIM_BLOCK_X."
);
return
-
1
;
}
su3_soa
*
u_h
;
vec3_soa
*
fermion1_h
;
vec3_soa
*
fermion2_h
;
// 8 = number of directions times 2 (even/odd)
// no_links = sizeh * 8
posix_memalign
((
void
**
)
&
u_h
,
ALIGN
,
8
*
sizeof
(
su3_soa
));
posix_memalign
((
void
**
)
&
fermion1_h
,
ALIGN
,
sizeof
(
vec3_soa
));
posix_memalign
((
void
**
)
&
fermion2_h
,
ALIGN
,
sizeof
(
vec3_soa
));
// printf("Sizeof su3_soa is: %d \n", sizeof(su3_soa));
// printf("Sizeof su3_soa_d is: %d \n", sizeof(su3_soa_d));
su3_soa
*
u_d
;
vec3_soa
*
fermion1_d
;
vec3_soa
*
fermion2_d
;
cudaMalloc
((
void
**
)
&
u_d
,
8
*
sizeof
(
su3_soa
));
checkCUDAError
(
"Allocating u_d"
);
cudaMalloc
((
void
**
)
&
fermion1_d
,
sizeof
(
vec3_soa
));
checkCUDAError
(
"Allocating fermion1_d"
);
cudaMalloc
((
void
**
)
&
fermion2_d
,
sizeof
(
vec3_soa
));
checkCUDAError
(
"Allocating fermion2_d"
);
if
((
nx
==
32
)
&&
(
ny
==
32
)
&&
(
nz
==
32
)
&&
(
nt
==
32
))
{
loadSu3FromFileNew
(
u_h
,
"gaugeconf_save_32_4"
);
loadFermionFromFileNew
(
fermion1_h
,
"test_fermion_32_4"
);
}
else
if
((
nx
==
16
)
&&
(
ny
==
16
)
&&
(
nz
==
16
)
&&
(
nt
==
16
))
{
loadSu3FromFile
(
u_h
,
"TestConf_16_4.cnf"
);
loadFermionFromFile
(
fermion1_h
,
"StartFermion_16_4.fer"
);
}
else
{
fprintf
(
stdout
,
"Lattice not available...
\n
"
);
exit
(
1
);
}
cudaMemcpy
(
u_d
,
u_h
,
8
*
sizeof
(
su3_soa
),
cudaMemcpyHostToDevice
);
checkCUDAError
(
"Copying u_d to device"
);
cudaMemcpy
(
fermion1_d
,
fermion1_h
,
sizeof
(
vec3_soa
),
cudaMemcpyHostToDevice
);
checkCUDAError
(
"Copying fermion1_d to device"
);
cudaMemcpy
(
fermion2_d
,
fermion2_h
,
sizeof
(
vec3_soa
),
cudaMemcpyHostToDevice
);
checkCUDAError
(
"Copying fermion2_d to device"
);
// Prefer larger L1 cache than shared mem
cudaDeviceSetCacheConfig
(
cudaFuncCachePreferL1
);
// Prefer larger shared mem than L1 cache
// cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
gettimeofday
(
&
t0
,
NULL
);
for
(
i
=
0
;
i
<
NITER
;
i
++
)
{
Deo
<<<
dimGridK1
,
dimBlockK1
>>>
(
u_d
,
fermion2_d
,
fermion1_d
);
checkCUDAError
(
"Running kernel Deo"
);
//cudaDeviceSynchronize();
//checkCUDAError("Cuda synch after Deo");
Doe
<<<
dimGridK1
,
dimBlockK1
>>>
(
u_d
,
fermion1_d
,
fermion2_d
);
checkCUDAError
(
"Running kernel Doe"
);
//cudaDeviceSynchronize();
//checkCUDAError("Cuda synch after Doe");
}
cudaDeviceSynchronize
();
gettimeofday
(
&
t1
,
NULL
);
// cudaMemcpy( fermion1_h, fermion2_d, sizeof(vec3_soa), cudaMemcpyDeviceToHost );
cudaMemcpy
(
fermion1_h
,
fermion1_d
,
sizeof
(
vec3_soa
),
cudaMemcpyDeviceToHost
);
checkCUDAError
(
"Copying fermion1_d to host"
);
dt_tot
=
(
double
)(
t1
.
tv_sec
-
t0
.
tv_sec
)
+
((
double
)(
t1
.
tv_usec
-
t0
.
tv_usec
)
/
1.0e6
);
printf
(
"TOTAL Exec time: Tot time: % 3.2f sec Avg: % 3.02f ms Avg/site: % 3.02f ns
\n
"
,
dt_tot
,
\
(
dt_tot
/
NITER
)
*
(
1.0e3
),
((
dt_tot
/
NITER
)
/
size
)
*
(
1.0e9
)
);
writeFermionToFile
(
fermion1_h
,
"EndFermion.fer"
);
free
(
u_h
);
free
(
fermion1_h
);
free
(
fermion2_h
);
return
0
;
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment