The Deterministic OpenMP matrix multiplication source code

The Deterministic OpenMP base version
    
#include <det_omp.h>
#include <stdio.h>
#define L_LINE_X                  4 //fix to N for N cores (N=4,16,64)
#define LINE_X        (1<<L_LINE_X)
#define L_COLUMN_X     (L_LINE_X-1) 
#define COLUMN_X    (1<<L_COLUMN_X)
#define L_LINE_Y         L_COLUMN_X
#define LINE_Y             COLUMN_X
#define L_COLUMN_Y         L_LINE_X 
#define COLUMN_Y    (1<<L_COLUMN_Y)
#define L_LINE_Z           L_LINE_X
#define LINE_Z               LINE_X
#define L_COLUMN_Z       L_COLUMN_Y
#define COLUMN_Z           COLUMN_Y
#define L_NB_CORE              (L_LINE_X-2)
#define NB_CORE              (1<<L_NB_CORE)
#define L_NB_HART                         2
#define NB_HART              (1<<L_NB_HART)
#define L_NUM_THREADS (L_NB_CORE+L_NB_HART)
#define NUM_THREADS      (1<<L_NUM_THREADS)
#define L_DATA_MEM_SIZE (L_LINE_Z+L_COLUMN_Z+1)
#define DATA_MEM_SIZE      (1<<L_DATA_MEM_SIZE)            
#define LINE_X_PER_THREAD (1<<(L_LINE_X-L_NUM_THREADS))
#define LINE_Z_PER_THREAD (1<<(L_LINE_Z-L_NUM_THREADS))
#define SIZE_Z_PER_THREAD (1<<(L_LINE_Z-L_NUM_THREADS+L_COLUMN_Z))
int global_mem[DATA_MEM_SIZE]={[0 ... DATA_MEM_SIZE-1]=1};
void thread(int t){
 int *p_ax, *p_ay, *p_az, *p_ay_old, *p_az_end1, *p_az_end2;
 int tmp, offset_x, offset_y, offset_z;
 offset_x=0;
 offset_y=LINE_X*COLUMN_X;
 offset_z=LINE_X*COLUMN_X+LINE_Y*COLUMN_Y;
 p_ax=t*LINE_X_PER_THREAD*COLUMN_X+offset_x+global_mem;
 p_az=t*LINE_Z_PER_THREAD*COLUMN_Z+offset_z+global_mem;
 p_az_end1=p_az;
 p_az_end2=p_az+SIZE_Z_PER_THREAD;
 do{
  p_ay_old=offset_y+global_mem;
  p_az_end1+=COLUMN_Z;
  do{
   tmp = 0;
   p_ay=p_ay_old;
   do{
    tmp += *p_ax * *p_ay;
    p_ax++;
    p_ay+=COLUMN_Y;
   } while (p_ay!=p_ay_old+COLUMN_X*COLUMN_Y);
   *p_az=tmp;
   p_az++;
   p_ay_old++;
  } while (p_az!=p_az_end1);
 } while (p_az!=p_az_end2);
}
void main(){
 int t;
 int l, c, offset_z;
 omp_set_num_threads(NUM_THREADS);
#pragma omp parallel for
 for (t=0; t<NUM_THREADS; t++) thread(t);
 offset_z=LINE_X*COLUMN_X+LINE_Y*COLUMN_Y;
 for (l=0; l<LINE_Z; l++){
  for (c=0; c<COLUMN_Z; c++)
   printf("%3d ",*(global_mem+offset_z+l*COLUMN_Z+c));
  printf("\n");
 }
}
    
  
The Deterministic OpenMP copy version
    
#include <det_omp.h>
#include <stdio.h>
#define L_LINE_X                  4 //fix to N for N cores (N=4,16,64)
#define LINE_X        (1<<L_LINE_X)
#define L_COLUMN_X     (L_LINE_X-1) 
#define COLUMN_X    (1<<L_COLUMN_X)
#define L_LINE_Y         L_COLUMN_X
#define LINE_Y             COLUMN_X
#define L_COLUMN_Y         L_LINE_X 
#define COLUMN_Y    (1<<L_COLUMN_Y)
#define L_LINE_Z           L_LINE_X
#define LINE_Z               LINE_X
#define L_COLUMN_Z       L_COLUMN_Y
#define COLUMN_Z           COLUMN_Y
#define L_NB_CORE              (L_LINE_X-2)
#define NB_CORE              (1<<L_NB_CORE)
#define L_NB_HART                         2
#define NB_HART              (1<<L_NB_HART)
#define L_NUM_THREADS (L_NB_CORE+L_NB_HART)
#define NUM_THREADS      (1<<L_NUM_THREADS)
#define L_DATA_MEM_SIZE (L_LINE_Z+L_COLUMN_Z+1)
#define DATA_MEM_SIZE      (1<<L_DATA_MEM_SIZE)
#define LINE_X_PER_THREAD (1<<(L_LINE_X-L_NUM_THREADS))
#define LINE_Z_PER_THREAD (1<<(L_LINE_Z-L_NUM_THREADS))
#define SIZE_Z_PER_THREAD (1<<(L_LINE_Z-L_NUM_THREADS+L_COLUMN_Z))
int global_mem[DATA_MEM_SIZE]={[0 ... DATA_MEM_SIZE-1]=1};
void thread(int t){
 int *p_at, *p_ax, *p_ay, *p_az, *p_ay_old, *p_at_end, *p_az_end1, *p_az_end2;
 int line_x[COLUMN_X], tmp, offset_x, offset_y, offset_z;
 offset_x=0;
 offset_y=LINE_X*COLUMN_X;
 offset_z=LINE_X*COLUMN_X+LINE_Y*COLUMN_Y;
 p_ax=t*LINE_X_PER_THREAD*COLUMN_X+offset_x+global_mem;
 p_az=t*LINE_Z_PER_THREAD*COLUMN_Z+offset_z+global_mem;
 p_az_end1=p_az;
 p_az_end2=p_az+SIZE_Z_PER_THREAD;
 p_at_end=line_x+COLUMN_X;
 do{
  p_ay_old=offset_y+global_mem;
  p_az_end1+=COLUMN_Z;
  p_at=line_x;
  do{
   *p_at=*p_ax;
   p_at++;
   p_ax++;
  } while (p_at!=p_at_end);
  do{
   tmp = 0;
   p_ay=p_ay_old;
   p_at=line_x;
   do{
    tmp += *p_at * *p_ay;
    p_at++;
    p_ay+=COLUMN_Y;
   } while (p_ay!=p_ay_old+COLUMN_X*COLUMN_Y);
   *p_az=tmp;
   p_az++;
   p_ay_old++;
  } while (p_az!=p_az_end1);
 } while (p_az!=p_az_end2);
}
void main(){
 int t;
 int l, c, offset_z;
 omp_set_num_threads(NUM_THREADS);
#pragma omp parallel for
 for (t=0; t<NUM_THREADS; t++) thread(t);
 offset_z=LINE_X*COLUMN_X+LINE_Y*COLUMN_Y;
 for (l=0; l<LINE_Z; l++){
  for (c=0; c<COLUMN_Z; c++)
   printf("%3d ",*(global_mem+offset_z+l*COLUMN_Z+c));
  printf("\n");
 }
    
  
The Deterministic OpenMP distributed version
    
#include <det_omp.h>
#include <stdio.h>
#define L_LINE_X                  4 //fix to N for N cores (N=4,16,64)
#define LINE_X        (1<<L_LINE_X)
#define L_COLUMN_X     (L_LINE_X-1)
#define COLUMN_X    (1<<L_COLUMN_X)
#define L_LINE_Y         L_COLUMN_X
#define LINE_Y             COLUMN_X
#define L_COLUMN_Y         L_LINE_X
#define COLUMN_Y             LINE_X
#define L_LINE_Z           L_LINE_X
#define LINE_Z               LINE_X
#define L_COLUMN_Z       L_COLUMN_Y
#define COLUMN_Z           COLUMN_Y
#define L_NB_CORE              (L_LINE_X-2)
#define NB_CORE              (1<<L_NB_CORE)
#define L_NB_HART                         2
#define NB_HART              (1<<L_NB_HART)
#define L_NUM_THREADS (L_NB_CORE+L_NB_HART)
#define NUM_THREADS      (1<<L_NUM_THREADS)
#define L_DATA_MEM_SIZE (L_LINE_Z+L_COLUMN_Z+1)
#define DATA_MEM_SIZE      (1<<L_DATA_MEM_SIZE)
#define L_BANK_SIZE (L_DATA_MEM_SIZE-L_NB_CORE)
#define BANK_SIZE              (1<<L_BANK_SIZE)
#define NUM_BANKS                       NB_CORE
#define LINE_X_PER_THREAD (1<<(L_LINE_X-L_NUM_THREADS))
#define LINE_Z_PER_THREAD (1<<(L_LINE_Z-L_NUM_THREADS))
#define LINE_X_PER_BANK   (1<<(L_BANK_SIZE-2-L_COLUMN_X))
#define LINE_Y_PER_BANK   (1<<(L_BANK_SIZE-2-L_COLUMN_Y))
#define LINE_Z_PER_BANK   (1<<(L_BANK_SIZE-1-L_COLUMN_Z))
#define SIZE_Z_PER_THREAD (1<<(L_LINE_Z-L_NUM_THREADS+L_COLUMN_Z))
int global_mem[DATA_MEM_SIZE]={[0 ... DATA_MEM_SIZE-1]=1};
void thread(int t){
 int *p_ax, *p_ay, *p_az, *p_ay_old, *p_ay_end1, *p_ay_end2, *p_az_end1, *p_az_end2;
 int tmp, offset_x, offset_z;
 int bx, bz, sx, sz;
 bx=t>>(L_BANK_SIZE+L_NUM_THREADS-L_LINE_X-L_COLUMN_X-2);
 sx=(t<<(L_LINE_X-L_NUM_THREADS))%(1<<(L_BANK_SIZE-2-L_COLUMN_X));
 bz=t>>(L_BANK_SIZE+L_NUM_THREADS-L_LINE_Z-L_COLUMN_Z-1);
 sz=(t<<(L_LINE_Z-L_NUM_THREADS))%(1<<(L_BANK_SIZE-1-L_COLUMN_Z));
 offset_x=bx<<L_BANK_SIZE;
 offset_z=(bz<<L_BANK_SIZE)+(BANK_SIZE/2);
 p_az=sz*COLUMN_Z+offset_z+global_mem;
 p_az_end1=p_az;
 p_az_end2=p_az+SIZE_Z_PER_THREAD;
 do{
  p_ay_old=BANK_SIZE/4+global_mem;
  p_az_end1+=COLUMN_Z;
  do{
   tmp = 0;
   p_ax=sx*COLUMN_X+offset_x+global_mem;
   p_ay=p_ay_old-BANK_SIZE/2-BANK_SIZE/4;
   p_ay_end2=p_ay_old+(NUM_BANKS-1)*BANK_SIZE+BANK_SIZE/4;
   do{
    p_ay+=BANK_SIZE/2+BANK_SIZE/4;
    p_ay_end1=p_ay+COLUMN_Y*LINE_Y_PER_BANK;
    do{
     tmp += *p_ax * *p_ay;
     p_ax++;
     p_ay+=COLUMN_Y;
    } while (p_ay!=p_ay_end1);
   } while (p_ay!=p_ay_end2);
   *p_az=tmp;
   p_az++;
   p_ay_old++;
  } while (p_az!=p_az_end1);
 } while (p_az!=p_az_end2);
}
void main(){
 int t;
 int l, c, offset_z;
 omp_set_num_threads(NUM_THREADS);
#pragma omp parallel for
 for (t=0; t<NUM_THREADS; t++) thread(t);
 offset_z=(BANK_SIZE/2);
 for (l=0; l<LINE_Z; l++){
  for (c=0; c<COLUMN_Z; c++)
   printf("%3d ",*(global_mem+offset_z+l*COLUMN_Z+c));
  if ((l%LINE_Z_PER_BANK)==LINE_Z_PER_BANK-1) offset_z+=(BANK_SIZE/2);
  printf("\n");
 }
}
    
  
The Deterministic OpenMP d+c version
    
#include <det_omp.h>
#include <stdio.h>
#define L_LINE_X                  4 //fix to N for N cores (N=4,16,64)
#define LINE_X        (1<<L_LINE_X)
#define L_COLUMN_X     (L_LINE_X-1) 
#define COLUMN_X    (1<<L_COLUMN_X)
#define L_LINE_Y         L_COLUMN_X
#define LINE_Y             COLUMN_X
#define L_COLUMN_Y         L_LINE_X 
#define COLUMN_Y    (1<<L_COLUMN_Y)
#define L_LINE_Z           L_LINE_X
#define LINE_Z               LINE_X
#define L_COLUMN_Z       L_COLUMN_Y
#define COLUMN_Z           COLUMN_Y
#define L_NB_CORE              (L_LINE_X-2) 
#define NB_CORE              (1<<L_NB_CORE)
#define L_NB_HART                         2
#define NB_HART              (1<<L_NB_HART)
#define L_NUM_THREADS (L_NB_CORE+L_NB_HART)
#define NUM_THREADS      (1<<L_NUM_THREADS)
#define L_DATA_MEM_SIZE (L_LINE_Z+L_COLUMN_Z+1)
#define DATA_MEM_SIZE      (1<<L_DATA_MEM_SIZE)            
#define L_BANK_SIZE (L_DATA_MEM_SIZE-L_NB_CORE)
#define BANK_SIZE              (1<<L_BANK_SIZE)
#define NUM_BANKS                       NB_CORE
#define LINE_X_PER_THREAD (1<<(L_LINE_X-L_NUM_THREADS))
#define LINE_Z_PER_THREAD (1<<(L_LINE_Z-L_NUM_THREADS))
#define LINE_X_PER_BANK   (1<<(L_BANK_SIZE-2-L_COLUMN_X))
#define LINE_Y_PER_BANK   (1<<(L_BANK_SIZE-2-L_COLUMN_Y))
#define LINE_Z_PER_BANK   (1<<(L_BANK_SIZE-1-L_COLUMN_Z))
#define SIZE_Z_PER_THREAD (1<<(L_LINE_Z-L_NUM_THREADS+L_COLUMN_Z))
int global_mem[DATA_MEM_SIZE]={[0 ... DATA_MEM_SIZE-1]=1};
void thread(int t){
 int *p_at, *p_ax, *p_ay, *p_az, *p_ay_old, *p_ay_end1, *p_ay_end2, *p_at_end, *p_az_end1, *p_az_end2;
 int line_x[COLUMN_X], tmp, offset_x, offset_z;
 int bx, bz, sx, sz;
 bx=t>>(L_BANK_SIZE+L_NUM_THREADS-L_LINE_X-L_COLUMN_X-2);
 sx=(t<<(L_LINE_X-L_NUM_THREADS))%(1<<(L_BANK_SIZE-2-L_COLUMN_X));
 bz=t>>(L_BANK_SIZE+L_NUM_THREADS-L_LINE_Z-L_COLUMN_Z-1);
 sz=(t<<(L_LINE_Z-L_NUM_THREADS))%(1<<(L_BANK_SIZE-1-L_COLUMN_Z));
 offset_x=bx<<L_BANK_SIZE;
 offset_z=(bz<<L_BANK_SIZE)+(BANK_SIZE/2);
 p_ax=sx*COLUMN_X+offset_x+global_mem;
 p_az=sz*COLUMN_Z+offset_z+global_mem;
 p_az_end1=p_az;
 p_az_end2=p_az+SIZE_Z_PER_THREAD;
 p_at_end=line_x+COLUMN_X;
 do{
  p_ay_old=BANK_SIZE/4+global_mem;
  p_at=line_x;
  do{
   *p_at=*p_ax;
   p_at++;
   p_ax++;
  } while (p_at!=p_at_end);
  p_az_end1+=COLUMN_Z;
  do{
   tmp = 0;
   p_ay=p_ay_old-BANK_SIZE/2-BANK_SIZE/4;
   p_ay_end2=p_ay_old+(NUM_BANKS-1)*BANK_SIZE+BANK_SIZE/4;
   do{
    p_ay+=BANK_SIZE/2+BANK_SIZE/4;
    p_ay_end1=p_ay+COLUMN_Y*LINE_Y_PER_BANK;
    p_at=line_x;
    do{
     tmp += *p_at * *p_ay;
     p_at++;
     p_ay+=COLUMN_Y;
    } while (p_ay!=p_ay_end1);
   } while (p_ay!=p_ay_end2);
   *p_az=tmp;
   p_az++;
   p_ay_old++;
  } while (p_az!=p_az_end1);
 } while (p_az!=p_az_end2);
}
void main(){
 int t;
 int l, c, offset_z;
 omp_set_num_threads(NUM_THREADS);
#pragma omp parallel for
 for (t=0; t<NUM_THREADS; t++) thread(t);
 offset_z=(BANK_SIZE/2);
 for (l=0; l<LINE_Z; l++){
  for (c=0; c<COLUMN_Z; c++)
   printf("%3d ",*(global_mem+offset_z+l*COLUMN_Z+c));
  if ((l%LINE_Z_PER_BANK)==LINE_Z_PER_BANK-1) offset_z+=(BANK_SIZE/2);
  printf("\n");
 }
}
    
  
The Deterministic OpenMP tiled version
    
#include <det_omp.h>
#include <stdio.h>
#define SQRT_LLX                  2 //fix to 2,3,4 for 4,16,64 cores
#define LINE_BX       (1<<SQRT_LLX)
#define COLUMN_BX       (LINE_BX/2)
#define LINE_BY           COLUMN_BX
#define COLUMN_BY           LINE_BX
#define LINE_BZ             LINE_BX
#define COLUMN_BZ         COLUMN_BY
#define L_LINE_X       (2*SQRT_LLX)
#define LINE_X        (1<<L_LINE_X)
#define L_COLUMN_X     (L_LINE_X-1) 
#define COLUMN_X         (LINE_X/2)
#define L_LINE_Y         L_COLUMN_X
#define LINE_Y             COLUMN_X
#define L_COLUMN_Y         L_LINE_X 
#define COLUMN_Y             LINE_X
#define L_LINE_Z           L_LINE_X
#define LINE_Z               LINE_X
#define L_COLUMN_Z       L_COLUMN_Y
#define COLUMN_Z           COLUMN_Y
#define L_NB_CORE              (L_LINE_X-2)
#define NB_CORE              (1<<L_NB_CORE)
#define L_NB_HART                         2
#define NB_HART              (1<<L_NB_HART)
#define L_NUM_THREADS (L_NB_CORE+L_NB_HART)
#define NUM_THREADS      (1<<L_NUM_THREADS)
#define L_DATA_MEM_SIZE (L_LINE_Z+L_COLUMN_Z+1)
#define DATA_MEM_SIZE      (1<<L_DATA_MEM_SIZE)
int global_mem[DATA_MEM_SIZE]={
[0 ... LINE_X*COLUMN_X+LINE_Y*COLUMN_Y-1]=1
};
void read_block(int block[], int lb, int cb, int offset, int cm){
 int *ab, *ag, *ab_last1, *ab_last2, *old_ag;
 ab=block;
 ag=global_mem+offset;
 old_ag=ag;
 ab_last1=ab;
 ab_last2=ab+cb*lb;
 do{
  ab_last1+=cb;
  do{
   *ab=*ag;
   ab++;
   ag++;
  } while (ab!=ab_last1);
  ag=old_ag+cm;
  old_ag=ag;
 } while (ab!=ab_last2);
}
void write_block(int block[], int lb, int cb, int offset, int cm){
 int *ab, *ag, *ab_last1, *ab_last2, *old_ag;
 ab=block;
 ag=global_mem+offset;
 old_ag=ag;
 ab_last1=ab;
 ab_last2=ab+cb*lb;
 do{
  ab_last1+=cb;
  do{
   *ag=*ab;
   ab++;
   ag++;
  } while (ab!=ab_last1);
  ag=old_ag+cm;
  old_ag=ag;
 } while (ab!=ab_last2);
}
void thread(int t){
 int i, j, k, offset_matrix, offset_block;
 int tmp;
 int block_x[LINE_BX*COLUMN_BX], *bx, *bx_old, *bx_last;
 int block_y[LINE_BY*COLUMN_BY], *by, *by_old;
 int block_z[LINE_BZ*COLUMN_BZ], *bz, *bz_last1, *bz_last2;
 i=t/LINE_BZ;
 j=t%LINE_BZ;
 offset_matrix=LINE_X*COLUMN_X+LINE_Y*COLUMN_Y;
 offset_block=offset_matrix+i*LINE_BZ*COLUMN_Z+j*COLUMN_BZ;
 read_block(block_z,LINE_BZ,COLUMN_BZ,offset_block,COLUMN_Z);
 for (k=0; k<COLUMN_X/COLUMN_BX; k++){
  offset_block=i*LINE_BX*COLUMN_X+k*COLUMN_BX;
  read_block(block_x,LINE_BX,COLUMN_BX,offset_block,COLUMN_X);
  offset_matrix=LINE_X*COLUMN_X;
  offset_block=offset_matrix+k*LINE_BY*COLUMN_Y+j*COLUMN_BY;
  read_block(block_y,LINE_BY,COLUMN_BY,offset_block,COLUMN_Y);
  bx_old=block_x;
  bz=block_z;
  bx_last=block_x;
  bz_last1=block_z;
  bz_last2=block_z+LINE_BZ*COLUMN_BZ;
  do{
   bx_last+=COLUMN_BX;
   bz_last1+=COLUMN_BZ;
   by_old=block_y;
   do{
    by=by_old;
    bx=bx_old;
    tmp=*bz;
    do{
     tmp+=*bx * *by;
     bx++;
     by+=COLUMN_BY;
    } while (bx!=bx_last);
    *bz=tmp;
    bz++;
    by_old++;
   } while (bz!=bz_last1);
   bx_old+=COLUMN_BX;
  } while (bz!=bz_last2);
 }
 offset_matrix=LINE_X*COLUMN_X+LINE_Y*COLUMN_Y;
 offset_block=offset_matrix+i*LINE_BZ*COLUMN_Z+j*COLUMN_BZ;
 write_block(block_z,LINE_BZ,COLUMN_BZ,offset_block,COLUMN_Z);
}
void main(){
 int t;
 int l, c, offset_z;
 omp_set_num_threads(NUM_THREADS);
#pragma omp parallel for
 for (t=0; t<NUM_THREADS; t++) thread(t);
 offset_z=LINE_X*COLUMN_X+LINE_Y*COLUMN_Y;
 for (l=0; l<LINE_Z; l++){
  for (c=0; c<COLUMN_Z; c++)
   printf("%3d ",*(global_mem+offset_z+l*COLUMN_Z+c));
  printf("\n");
 }
}