#include <det_omp.h>
#include <stdio.h>
#define L_LINE_X 4 //fix to N for N cores (N=4,16,64)
#define LINE_X (1<<L_LINE_X)
#define L_COLUMN_X (L_LINE_X-1)
#define COLUMN_X (1<<L_COLUMN_X)
#define L_LINE_Y L_COLUMN_X
#define LINE_Y COLUMN_X
#define L_COLUMN_Y L_LINE_X
#define COLUMN_Y (1<<L_COLUMN_Y)
#define L_LINE_Z L_LINE_X
#define LINE_Z LINE_X
#define L_COLUMN_Z L_COLUMN_Y
#define COLUMN_Z COLUMN_Y
#define L_NB_CORE (L_LINE_X-2)
#define NB_CORE (1<<L_NB_CORE)
#define L_NB_HART 2
#define NB_HART (1<<L_NB_HART)
#define L_NUM_THREADS (L_NB_CORE+L_NB_HART)
#define NUM_THREADS (1<<L_NUM_THREADS)
#define L_DATA_MEM_SIZE (L_LINE_Z+L_COLUMN_Z+1)
#define DATA_MEM_SIZE (1<<L_DATA_MEM_SIZE)
#define LINE_X_PER_THREAD (1<<(L_LINE_X-L_NUM_THREADS))
#define LINE_Z_PER_THREAD (1<<(L_LINE_Z-L_NUM_THREADS))
#define SIZE_Z_PER_THREAD (1<<(L_LINE_Z-L_NUM_THREADS+L_COLUMN_Z))
int global_mem[DATA_MEM_SIZE]={[0 ... DATA_MEM_SIZE-1]=1};
void thread(int t){
int *p_ax, *p_ay, *p_az, *p_ay_old, *p_az_end1, *p_az_end2;
int tmp, offset_x, offset_y, offset_z;
offset_x=0;
offset_y=LINE_X*COLUMN_X;
offset_z=LINE_X*COLUMN_X+LINE_Y*COLUMN_Y;
p_ax=t*LINE_X_PER_THREAD*COLUMN_X+offset_x+global_mem;
p_az=t*LINE_Z_PER_THREAD*COLUMN_Z+offset_z+global_mem;
p_az_end1=p_az;
p_az_end2=p_az+SIZE_Z_PER_THREAD;
do{
p_ay_old=offset_y+global_mem;
p_az_end1+=COLUMN_Z;
do{
tmp = 0;
p_ay=p_ay_old;
do{
tmp += *p_ax * *p_ay;
p_ax++;
p_ay+=COLUMN_Y;
} while (p_ay!=p_ay_old+COLUMN_X*COLUMN_Y);
*p_az=tmp;
p_az++;
p_ay_old++;
} while (p_az!=p_az_end1);
} while (p_az!=p_az_end2);
}
void main(){
int t;
int l, c, offset_z;
omp_set_num_threads(NUM_THREADS);
#pragma omp parallel for
for (t=0; t<NUM_THREADS; t++) thread(t);
offset_z=LINE_X*COLUMN_X+LINE_Y*COLUMN_Y;
for (l=0; l<LINE_Z; l++){
for (c=0; c<COLUMN_Z; c++)
printf("%3d ",*(global_mem+offset_z+l*COLUMN_Z+c));
printf("\n");
}
}
#include <det_omp.h>
#include <stdio.h>
#define L_LINE_X 4 //fix to N for N cores (N=4,16,64)
#define LINE_X (1<<L_LINE_X)
#define L_COLUMN_X (L_LINE_X-1)
#define COLUMN_X (1<<L_COLUMN_X)
#define L_LINE_Y L_COLUMN_X
#define LINE_Y COLUMN_X
#define L_COLUMN_Y L_LINE_X
#define COLUMN_Y (1<<L_COLUMN_Y)
#define L_LINE_Z L_LINE_X
#define LINE_Z LINE_X
#define L_COLUMN_Z L_COLUMN_Y
#define COLUMN_Z COLUMN_Y
#define L_NB_CORE (L_LINE_X-2)
#define NB_CORE (1<<L_NB_CORE)
#define L_NB_HART 2
#define NB_HART (1<<L_NB_HART)
#define L_NUM_THREADS (L_NB_CORE+L_NB_HART)
#define NUM_THREADS (1<<L_NUM_THREADS)
#define L_DATA_MEM_SIZE (L_LINE_Z+L_COLUMN_Z+1)
#define DATA_MEM_SIZE (1<<L_DATA_MEM_SIZE)
#define LINE_X_PER_THREAD (1<<(L_LINE_X-L_NUM_THREADS))
#define LINE_Z_PER_THREAD (1<<(L_LINE_Z-L_NUM_THREADS))
#define SIZE_Z_PER_THREAD (1<<(L_LINE_Z-L_NUM_THREADS+L_COLUMN_Z))
int global_mem[DATA_MEM_SIZE]={[0 ... DATA_MEM_SIZE-1]=1};
void thread(int t){
int *p_at, *p_ax, *p_ay, *p_az, *p_ay_old, *p_at_end, *p_az_end1, *p_az_end2;
int line_x[COLUMN_X], tmp, offset_x, offset_y, offset_z;
offset_x=0;
offset_y=LINE_X*COLUMN_X;
offset_z=LINE_X*COLUMN_X+LINE_Y*COLUMN_Y;
p_ax=t*LINE_X_PER_THREAD*COLUMN_X+offset_x+global_mem;
p_az=t*LINE_Z_PER_THREAD*COLUMN_Z+offset_z+global_mem;
p_az_end1=p_az;
p_az_end2=p_az+SIZE_Z_PER_THREAD;
p_at_end=line_x+COLUMN_X;
do{
p_ay_old=offset_y+global_mem;
p_az_end1+=COLUMN_Z;
p_at=line_x;
do{
*p_at=*p_ax;
p_at++;
p_ax++;
} while (p_at!=p_at_end);
do{
tmp = 0;
p_ay=p_ay_old;
p_at=line_x;
do{
tmp += *p_at * *p_ay;
p_at++;
p_ay+=COLUMN_Y;
} while (p_ay!=p_ay_old+COLUMN_X*COLUMN_Y);
*p_az=tmp;
p_az++;
p_ay_old++;
} while (p_az!=p_az_end1);
} while (p_az!=p_az_end2);
}
void main(){
int t;
int l, c, offset_z;
omp_set_num_threads(NUM_THREADS);
#pragma omp parallel for
for (t=0; t<NUM_THREADS; t++) thread(t);
offset_z=LINE_X*COLUMN_X+LINE_Y*COLUMN_Y;
for (l=0; l<LINE_Z; l++){
for (c=0; c<COLUMN_Z; c++)
printf("%3d ",*(global_mem+offset_z+l*COLUMN_Z+c));
printf("\n");
}
#include <det_omp.h>
#include <stdio.h>
#define L_LINE_X 4 //fix to N for N cores (N=4,16,64)
#define LINE_X (1<<L_LINE_X)
#define L_COLUMN_X (L_LINE_X-1)
#define COLUMN_X (1<<L_COLUMN_X)
#define L_LINE_Y L_COLUMN_X
#define LINE_Y COLUMN_X
#define L_COLUMN_Y L_LINE_X
#define COLUMN_Y LINE_X
#define L_LINE_Z L_LINE_X
#define LINE_Z LINE_X
#define L_COLUMN_Z L_COLUMN_Y
#define COLUMN_Z COLUMN_Y
#define L_NB_CORE (L_LINE_X-2)
#define NB_CORE (1<<L_NB_CORE)
#define L_NB_HART 2
#define NB_HART (1<<L_NB_HART)
#define L_NUM_THREADS (L_NB_CORE+L_NB_HART)
#define NUM_THREADS (1<<L_NUM_THREADS)
#define L_DATA_MEM_SIZE (L_LINE_Z+L_COLUMN_Z+1)
#define DATA_MEM_SIZE (1<<L_DATA_MEM_SIZE)
#define L_BANK_SIZE (L_DATA_MEM_SIZE-L_NB_CORE)
#define BANK_SIZE (1<<L_BANK_SIZE)
#define NUM_BANKS NB_CORE
#define LINE_X_PER_THREAD (1<<(L_LINE_X-L_NUM_THREADS))
#define LINE_Z_PER_THREAD (1<<(L_LINE_Z-L_NUM_THREADS))
#define LINE_X_PER_BANK (1<<(L_BANK_SIZE-2-L_COLUMN_X))
#define LINE_Y_PER_BANK (1<<(L_BANK_SIZE-2-L_COLUMN_Y))
#define LINE_Z_PER_BANK (1<<(L_BANK_SIZE-1-L_COLUMN_Z))
#define SIZE_Z_PER_THREAD (1<<(L_LINE_Z-L_NUM_THREADS+L_COLUMN_Z))
int global_mem[DATA_MEM_SIZE]={[0 ... DATA_MEM_SIZE-1]=1};
void thread(int t){
int *p_ax, *p_ay, *p_az, *p_ay_old, *p_ay_end1, *p_ay_end2, *p_az_end1, *p_az_end2;
int tmp, offset_x, offset_z;
int bx, bz, sx, sz;
bx=t>>(L_BANK_SIZE+L_NUM_THREADS-L_LINE_X-L_COLUMN_X-2);
sx=(t<<(L_LINE_X-L_NUM_THREADS))%(1<<(L_BANK_SIZE-2-L_COLUMN_X));
bz=t>>(L_BANK_SIZE+L_NUM_THREADS-L_LINE_Z-L_COLUMN_Z-1);
sz=(t<<(L_LINE_Z-L_NUM_THREADS))%(1<<(L_BANK_SIZE-1-L_COLUMN_Z));
offset_x=bx<<L_BANK_SIZE;
offset_z=(bz<<L_BANK_SIZE)+(BANK_SIZE/2);
p_az=sz*COLUMN_Z+offset_z+global_mem;
p_az_end1=p_az;
p_az_end2=p_az+SIZE_Z_PER_THREAD;
do{
p_ay_old=BANK_SIZE/4+global_mem;
p_az_end1+=COLUMN_Z;
do{
tmp = 0;
p_ax=sx*COLUMN_X+offset_x+global_mem;
p_ay=p_ay_old-BANK_SIZE/2-BANK_SIZE/4;
p_ay_end2=p_ay_old+(NUM_BANKS-1)*BANK_SIZE+BANK_SIZE/4;
do{
p_ay+=BANK_SIZE/2+BANK_SIZE/4;
p_ay_end1=p_ay+COLUMN_Y*LINE_Y_PER_BANK;
do{
tmp += *p_ax * *p_ay;
p_ax++;
p_ay+=COLUMN_Y;
} while (p_ay!=p_ay_end1);
} while (p_ay!=p_ay_end2);
*p_az=tmp;
p_az++;
p_ay_old++;
} while (p_az!=p_az_end1);
} while (p_az!=p_az_end2);
}
void main(){
int t;
int l, c, offset_z;
omp_set_num_threads(NUM_THREADS);
#pragma omp parallel for
for (t=0; t<NUM_THREADS; t++) thread(t);
offset_z=(BANK_SIZE/2);
for (l=0; l<LINE_Z; l++){
for (c=0; c<COLUMN_Z; c++)
printf("%3d ",*(global_mem+offset_z+l*COLUMN_Z+c));
if ((l%LINE_Z_PER_BANK)==LINE_Z_PER_BANK-1) offset_z+=(BANK_SIZE/2);
printf("\n");
}
}
#include <det_omp.h>
#include <stdio.h>
#define L_LINE_X 4 //fix to N for N cores (N=4,16,64)
#define LINE_X (1<<L_LINE_X)
#define L_COLUMN_X (L_LINE_X-1)
#define COLUMN_X (1<<L_COLUMN_X)
#define L_LINE_Y L_COLUMN_X
#define LINE_Y COLUMN_X
#define L_COLUMN_Y L_LINE_X
#define COLUMN_Y (1<<L_COLUMN_Y)
#define L_LINE_Z L_LINE_X
#define LINE_Z LINE_X
#define L_COLUMN_Z L_COLUMN_Y
#define COLUMN_Z COLUMN_Y
#define L_NB_CORE (L_LINE_X-2)
#define NB_CORE (1<<L_NB_CORE)
#define L_NB_HART 2
#define NB_HART (1<<L_NB_HART)
#define L_NUM_THREADS (L_NB_CORE+L_NB_HART)
#define NUM_THREADS (1<<L_NUM_THREADS)
#define L_DATA_MEM_SIZE (L_LINE_Z+L_COLUMN_Z+1)
#define DATA_MEM_SIZE (1<<L_DATA_MEM_SIZE)
#define L_BANK_SIZE (L_DATA_MEM_SIZE-L_NB_CORE)
#define BANK_SIZE (1<<L_BANK_SIZE)
#define NUM_BANKS NB_CORE
#define LINE_X_PER_THREAD (1<<(L_LINE_X-L_NUM_THREADS))
#define LINE_Z_PER_THREAD (1<<(L_LINE_Z-L_NUM_THREADS))
#define LINE_X_PER_BANK (1<<(L_BANK_SIZE-2-L_COLUMN_X))
#define LINE_Y_PER_BANK (1<<(L_BANK_SIZE-2-L_COLUMN_Y))
#define LINE_Z_PER_BANK (1<<(L_BANK_SIZE-1-L_COLUMN_Z))
#define SIZE_Z_PER_THREAD (1<<(L_LINE_Z-L_NUM_THREADS+L_COLUMN_Z))
int global_mem[DATA_MEM_SIZE]={[0 ... DATA_MEM_SIZE-1]=1};
void thread(int t){
int *p_at, *p_ax, *p_ay, *p_az, *p_ay_old, *p_ay_end1, *p_ay_end2, *p_at_end, *p_az_end1, *p_az_end2;
int line_x[COLUMN_X], tmp, offset_x, offset_z;
int bx, bz, sx, sz;
bx=t>>(L_BANK_SIZE+L_NUM_THREADS-L_LINE_X-L_COLUMN_X-2);
sx=(t<<(L_LINE_X-L_NUM_THREADS))%(1<<(L_BANK_SIZE-2-L_COLUMN_X));
bz=t>>(L_BANK_SIZE+L_NUM_THREADS-L_LINE_Z-L_COLUMN_Z-1);
sz=(t<<(L_LINE_Z-L_NUM_THREADS))%(1<<(L_BANK_SIZE-1-L_COLUMN_Z));
offset_x=bx<<L_BANK_SIZE;
offset_z=(bz<<L_BANK_SIZE)+(BANK_SIZE/2);
p_ax=sx*COLUMN_X+offset_x+global_mem;
p_az=sz*COLUMN_Z+offset_z+global_mem;
p_az_end1=p_az;
p_az_end2=p_az+SIZE_Z_PER_THREAD;
p_at_end=line_x+COLUMN_X;
do{
p_ay_old=BANK_SIZE/4+global_mem;
p_at=line_x;
do{
*p_at=*p_ax;
p_at++;
p_ax++;
} while (p_at!=p_at_end);
p_az_end1+=COLUMN_Z;
do{
tmp = 0;
p_ay=p_ay_old-BANK_SIZE/2-BANK_SIZE/4;
p_ay_end2=p_ay_old+(NUM_BANKS-1)*BANK_SIZE+BANK_SIZE/4;
do{
p_ay+=BANK_SIZE/2+BANK_SIZE/4;
p_ay_end1=p_ay+COLUMN_Y*LINE_Y_PER_BANK;
p_at=line_x;
do{
tmp += *p_at * *p_ay;
p_at++;
p_ay+=COLUMN_Y;
} while (p_ay!=p_ay_end1);
} while (p_ay!=p_ay_end2);
*p_az=tmp;
p_az++;
p_ay_old++;
} while (p_az!=p_az_end1);
} while (p_az!=p_az_end2);
}
void main(){
int t;
int l, c, offset_z;
omp_set_num_threads(NUM_THREADS);
#pragma omp parallel for
for (t=0; t<NUM_THREADS; t++) thread(t);
offset_z=(BANK_SIZE/2);
for (l=0; l<LINE_Z; l++){
for (c=0; c<COLUMN_Z; c++)
printf("%3d ",*(global_mem+offset_z+l*COLUMN_Z+c));
if ((l%LINE_Z_PER_BANK)==LINE_Z_PER_BANK-1) offset_z+=(BANK_SIZE/2);
printf("\n");
}
}
#include <det_omp.h>
#include <stdio.h>
#define SQRT_LLX 2 //fix to 2,3,4 for 4,16,64 cores
#define LINE_BX (1<<SQRT_LLX)
#define COLUMN_BX (LINE_BX/2)
#define LINE_BY COLUMN_BX
#define COLUMN_BY LINE_BX
#define LINE_BZ LINE_BX
#define COLUMN_BZ COLUMN_BY
#define L_LINE_X (2*SQRT_LLX)
#define LINE_X (1<<L_LINE_X)
#define L_COLUMN_X (L_LINE_X-1)
#define COLUMN_X (LINE_X/2)
#define L_LINE_Y L_COLUMN_X
#define LINE_Y COLUMN_X
#define L_COLUMN_Y L_LINE_X
#define COLUMN_Y LINE_X
#define L_LINE_Z L_LINE_X
#define LINE_Z LINE_X
#define L_COLUMN_Z L_COLUMN_Y
#define COLUMN_Z COLUMN_Y
#define L_NB_CORE (L_LINE_X-2)
#define NB_CORE (1<<L_NB_CORE)
#define L_NB_HART 2
#define NB_HART (1<<L_NB_HART)
#define L_NUM_THREADS (L_NB_CORE+L_NB_HART)
#define NUM_THREADS (1<<L_NUM_THREADS)
#define L_DATA_MEM_SIZE (L_LINE_Z+L_COLUMN_Z+1)
#define DATA_MEM_SIZE (1<<L_DATA_MEM_SIZE)
int global_mem[DATA_MEM_SIZE]={
[0 ... LINE_X*COLUMN_X+LINE_Y*COLUMN_Y-1]=1
};
void read_block(int block[], int lb, int cb, int offset, int cm){
int *ab, *ag, *ab_last1, *ab_last2, *old_ag;
ab=block;
ag=global_mem+offset;
old_ag=ag;
ab_last1=ab;
ab_last2=ab+cb*lb;
do{
ab_last1+=cb;
do{
*ab=*ag;
ab++;
ag++;
} while (ab!=ab_last1);
ag=old_ag+cm;
old_ag=ag;
} while (ab!=ab_last2);
}
void write_block(int block[], int lb, int cb, int offset, int cm){
int *ab, *ag, *ab_last1, *ab_last2, *old_ag;
ab=block;
ag=global_mem+offset;
old_ag=ag;
ab_last1=ab;
ab_last2=ab+cb*lb;
do{
ab_last1+=cb;
do{
*ag=*ab;
ab++;
ag++;
} while (ab!=ab_last1);
ag=old_ag+cm;
old_ag=ag;
} while (ab!=ab_last2);
}
void thread(int t){
int i, j, k, offset_matrix, offset_block;
int tmp;
int block_x[LINE_BX*COLUMN_BX], *bx, *bx_old, *bx_last;
int block_y[LINE_BY*COLUMN_BY], *by, *by_old;
int block_z[LINE_BZ*COLUMN_BZ], *bz, *bz_last1, *bz_last2;
i=t/LINE_BZ;
j=t%LINE_BZ;
offset_matrix=LINE_X*COLUMN_X+LINE_Y*COLUMN_Y;
offset_block=offset_matrix+i*LINE_BZ*COLUMN_Z+j*COLUMN_BZ;
read_block(block_z,LINE_BZ,COLUMN_BZ,offset_block,COLUMN_Z);
for (k=0; k<COLUMN_X/COLUMN_BX; k++){
offset_block=i*LINE_BX*COLUMN_X+k*COLUMN_BX;
read_block(block_x,LINE_BX,COLUMN_BX,offset_block,COLUMN_X);
offset_matrix=LINE_X*COLUMN_X;
offset_block=offset_matrix+k*LINE_BY*COLUMN_Y+j*COLUMN_BY;
read_block(block_y,LINE_BY,COLUMN_BY,offset_block,COLUMN_Y);
bx_old=block_x;
bz=block_z;
bx_last=block_x;
bz_last1=block_z;
bz_last2=block_z+LINE_BZ*COLUMN_BZ;
do{
bx_last+=COLUMN_BX;
bz_last1+=COLUMN_BZ;
by_old=block_y;
do{
by=by_old;
bx=bx_old;
tmp=*bz;
do{
tmp+=*bx * *by;
bx++;
by+=COLUMN_BY;
} while (bx!=bx_last);
*bz=tmp;
bz++;
by_old++;
} while (bz!=bz_last1);
bx_old+=COLUMN_BX;
} while (bz!=bz_last2);
}
offset_matrix=LINE_X*COLUMN_X+LINE_Y*COLUMN_Y;
offset_block=offset_matrix+i*LINE_BZ*COLUMN_Z+j*COLUMN_BZ;
write_block(block_z,LINE_BZ,COLUMN_BZ,offset_block,COLUMN_Z);
}
void main(){
int t;
int l, c, offset_z;
omp_set_num_threads(NUM_THREADS);
#pragma omp parallel for
for (t=0; t<NUM_THREADS; t++) thread(t);
offset_z=LINE_X*COLUMN_X+LINE_Y*COLUMN_Y;
for (l=0; l<LINE_Z; l++){
for (c=0; c<COLUMN_Z; c++)
printf("%3d ",*(global_mem+offset_z+l*COLUMN_Z+c));
printf("\n");
}
}