// ---------------------------------------------------------------------------
// Compile-time configuration and protocol constants for the GPU extensible-
// array machine: launch geometry, CPU<->GPU mailbox slot indices, command
// opcodes, and the record layouts of the flat int arrays shared with the host.
// ---------------------------------------------------------------------------
// NOTE(review): the five leading #include directives had lost their header
// names (bare "#include" tokens — presumably angle-bracket contents stripped
// by an extraction tool). Restored the usual standard host headers below;
// verify against the original build before relying on this choice.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include "./book.h"
#include "./scriptio.h"

// Launch geometry.
#define NUM_BLOCKS 4  // ------------ BLOCKS ---------------
#define NUM_THREADS 4 // +++++++ THREADS +++++++++
#define WHATEVER 10
#define ESF_EMPTY (-1)   // sentinel "no array name" value

// cdata[]: CPU -> GPU command mailbox slot indices.
#define CDATA_SIZE 7
#define COutReady 0
#define COutData_CMD 1
#define COutData_A 2
#define COutData_IND 3
#define COutData_VAL 4
#define CInAck 5
#define COutData_N 6

// gpudata[]: GPU -> CPU reply mailbox slot indices.
#define GPUDATA_SIZE 6
#define GPUOutReady 0
#define GPUOutData 1
#define GPUInAck 2
#define GPUOutError 3
#define GPUStart 4
#define GPUStop 5

#define CELL_LIMIT NUM_THREADS
#define NUM_CELLS (NUM_BLOCKS * NUM_THREADS)

// Command opcodes.
#define UPDATE 103     // see John's table
#define LOOKUP 105     // see John's table
#define DELETE 104     // see John's table
#define INIT 101       // see John's table
#define FETCH_CELL 106 // value left by John's table
#define NOP 107        // value left by John's table
#define TERMINATE 102  // see John's table

#define WAIT_GPU 1000
#define WAIT_LIMIT 30
#define FOLD_NOT_READY 1000000
#define WARP_SIZE 32
#define HALF_WARP 16

// device_results[]: slot indices in the kernel's scratch/result array.
#define DEVICE_RESULT_NEWNAME_ERROR 0
#define DEVICE_RESULT_NEWNAME 1
#define DEVICE_RESULT_C_ERROR 2
#define DEVICE_RESULT_C 3
#define DEVICE_RESULT_CNEW 4
#define DEVICE_RESULT_CNEW_PLACEHOLDER 5
#define DEVICE_RESULT_INPUT_INDEX 6
#define DEVICE_RESULT_NOP 7
#define DEVICE_RESULT_FOLD_NEWNAME 8
#define DEVICE_RESULT_FOLD_CNEW_PLACEHOLDER 9
#define DEVICE_RESULT_FOLD_NEWNAME_ERROR 10
#define DEVICE_RESULT_FOLD_C 11
#define DEVICE_RESULT_FOLD_CNEW 12
#define DEVICE_RESULT_FOLD_C_ERROR 13
#define DEVICE_RESULT_A 14
#define DEVICE_RESULT_IND 15
#define DEVICE_RESULT_VAL 16
#define DEVICE_RESULT_I 17
#define DEVICE_RESULT_FOLD_LOOKUP 18
#define DEVICE_RESULT_FOLD_LOOKUP_ERROR 19
#define DEVICE_RESULT_LOOKUP 20
#define DEVICE_RESULT_FOLD_RANK 21
#define DEVICE_RESULT_FOLD_RANK_ERROR 22
#define DEVICE_RESULT_RANK 23
#define DEVICE_RESULT_LOOKUP_ERROR 24
#define DEVICE_RESULT_RANK_ERROR 25
#define DEVICE_RESULT_OUTPUT_INDEX 26
#define DEVICE_RESULT_CMD 27
#define DEVICE_RESULT_N 28
#define DEVICE_RESULT_GPU_CLOCK 29
#define DEVICE_RESULT_SIZE 50 // number of slots in the device_results array

// Selector passed to fold()/fold_op() choosing the combining operator.
#define UPDATE_FOLD 0
#define LOOKUP_FOLD 1
// Error/success codes surfaced through gpudata[GPUOutError].
#define UPDATE_ERROR_CODE 0
#define LOOKUP_ERROR_CODE (-1) // parenthesized for expansion safety, matching ESF_EMPTY
#define UPDATE_SUCCESS 1

#define OUT_PRINT_NUM 10 // ints per cell in the out/outprint dump records

// input_data[] record layout.
#define INPUT_DATA_OPCODE 0
#define INPUT_DATA_NAME 1
#define INPUT_DATA_INDEX 2
#define INPUT_DATA_VALUE 3
#define INPUT_DATA_NOTIFY 4
#define INPUT_DATA_RECORD_SIZE 5

// output_data[] record layout.
#define OUTPUT_DATA_ERROR 0
#define OUTPUT_DATA_VALUE 1
#define OUTPUT_DATA_RECORD_SIZE 2

// oparray[] record layout.
#define OPARRAY_LINE_NUMBER 0
#define OPARRAY_OPCODE 1
#define OPARRAY_NAME 2
#define OPARRAY_INDEX 3
#define OPARRAY_VALUE 4
#define OPARRAY_NOTIFY 5
#define OPARRAY_ERROR 6
#define OPARRAY_EXPECTED_VALUE 7
#define OPARRAY_RECORD_SIZE 8

#define PROBE_LIMIT (NUM_BLOCKS * NUM_THREADS)
// 100 - this is the limit of the loop on the cpu that ensures the output is
// produced and we wish we could use something more logical. We have tried
// lower numbers on the 32 x 256 version (our largest) and they do not work,
// so this is what we choose for all of the versions.
#define MAGIC_LOOP 100
/* __device__ void broadcast (int value,int *y) { y[0] = value; __shared__ int x[NUM_THREADS]; // x[threadIdx.x] = y[threadIdx.x + blockIdx.x * blockDim.x]; int j,j2,tid; j = NUM_THREADS; j2 = j/2; tid = threadIdx.x; while (j2 > 0 && tid < blockDim.x) { if ((tid % j) == 0) { x[tid + j2] = x[tid]; } else {}; j = j2; j2 = j2 / 2; __syncthreads(); } } */ /* __device__ void check_array_for_zeros (int *probe, volatile int *arr, int limit, int code,int loc) { int i,bad; bad = 1; for (i = 0; i < limit; i++) if (arr[i] == 0) bad = 0; if (bad == 0) probe[loc] = code; } */ __device__ void init_gpu_sync (int *arr1,int *arr2) { int idx = threadIdx.x + blockIdx.x * blockDim.x; while (idx < NUM_BLOCKS * NUM_THREADS) { arr1[idx] = 0; arr2[idx] = 0; idx += blockDim.x * gridDim.x; } } __device__ __shared__ volatile int theNextGoal; __device__ void gpu_sync (volatile int goalVal, volatile int *Arrayin, volatile int *Arrayout,int *y) { int tid_in_block = threadIdx.x * blockDim.y + threadIdx.y; int nBlockNum = gridDim.x * gridDim.y; int bid = blockIdx.x * gridDim.y + blockIdx.y; if (tid_in_block == 0) { Arrayin[bid] = goalVal; } // line in the paper that appeared below -- if (bid == 1) { if (bid == 0) { if (tid_in_block < nBlockNum) { while (Arrayin[tid_in_block] != goalVal) { y[tid_in_block] = atomicCAS(y,y[tid_in_block],1); } } else {}; __syncthreads(); if (tid_in_block < nBlockNum) { Arrayout[tid_in_block] = goalVal; } else {}; } if (tid_in_block == 0) { while (Arrayout[bid] != goalVal) { y[tid_in_block] = atomicCAS(y,y[tid_in_block],0); } } else {}; __syncthreads(); } // ------------------------------------------------------------------------------- // Broadcast a value from global y[0] to all the elements of a shared array x // ------------------------------------------------------------------------------- __device__ void broadcasting (volatile int value,volatile int *x,volatile int *y) { y[threadIdx.x + blockIdx.x * blockDim.x] = 0; if (threadIdx.x == 0 && blockIdx.x == 0) y[0] 
= value; else {}; int j,j2,tid; j = NUM_BLOCKS; j2 = j/2; tid = threadIdx.x; // Phase 1: broadcast the value y[0] to the first NUMBLOCKS elements of y while (j2 > 0 && tid < NUM_BLOCKS) { if ((tid % j) == 0) { y[tid + j2] = y[tid]; } else {}; j = j2; j2 = j2 / 2; __syncthreads(); } // Phase 2: copy y[i] to x[0] within block i, assuming each block addresses x[0] on up... // x[threadIdx.x] = y[threadIdx.x + blockIdx.x * blockDim.x]; x[0] = y [blockIdx.x]; // Phase 3: within each block, broadcast x[0] to all elements within thab block, // i.e x[0] is copied to x[0], ..., x[NTHREADS-1]. This is similar to the Phase 1 loop, // but there are differences: // (1) here, we are doing the copies within a shared array, but Phase 1 uses global // (2) we are copying more values: up to index NTHREADS-1, but Phase 1 only copies up to NBLOCKS-1 // (3) the Phase 2 code is executed by every block, but Phase 1 is executed only by block 0 j = NUM_THREADS; j2 = j/2; tid = threadIdx.x; while (j2 > 0 && tid < blockDim.x) { if ((tid % j) == 0) { x[tid + j2] = x[tid]; } else {}; j = j2; j2 = j2 / 2; __syncthreads(); } y[threadIdx.x + blockIdx.x * blockDim.x] = x[threadIdx.x]; } // ------------------------------------------------------------------------------- __device__ int wait_gpu (int j) { while (j-- > 0) {}; } int wait_cpu (int j) { while (j-- > 0) {}; } __device__ int get_object_right (int w) {int new_w = (int) w & 0x0000ffff; return new_w; } int cpu_get_object_right (int w) { int new_w = (int) w & 0x0000ffff; return new_w; } __device__ int get_object_left (int w) {int new_w = (int) w >> 16; return new_w; } int cpu_get_object_left (int w) { int new_w = (int) w >> 16; return new_w; } __device__ int mk_object (int left, int right) { int w1 = 0x0000ffff & right; int w2 = left << 16; int new_w = w1 | w2; return new_w; } int cpu_mk_object(int left,int right) { int w1 = 0x0000ffff & right; int w2 = left << 16; int new_w = w1 | w2; return new_w; } __device__ unsigned int get_dde_object_right 
(int w) { unsigned int w2 = w >> 1; unsigned int new_w = w2 & 0x00007fff; return new_w; } unsigned int cpu_get_dde_object_right (int w) { unsigned int w2 = w >> 1; unsigned int new_w = w2 & 0x00007fff; return new_w; } __device__ unsigned int get_dde_object_left (int w) { unsigned int new_w2 = w >> 16; return new_w2; } unsigned int cpu_get_dde_object_left (int w) {unsigned int new_w2 = w >> 16; return new_w2; } __device__ unsigned int mk_dde_object(int left,int right) { unsigned int w1 = (0x00007fff & right) << 1; unsigned int w2 = left << 16; unsigned int new_w = (w1 | w2) | 0x00000001; return new_w; } unsigned int cpu_mk_dde_object(int left,int right) { unsigned int w1 = (0x00007fff & right) << 1; unsigned int w2 = left << 16; unsigned int new_w = (w1 | w2) | 0x00000001; return new_w; } /* put a vector of numbers into probe, starting at probebaseaddress. Each thread should put a value into its slot, which is at probebaseaddress + threadid. The value which the thread puts in should be an expression that uses threadid and blockid, so that it will have a different value for each thread. 
For example, call probe_update (probe, currentprobeloc, threadid.x + blockid.x * blockdim.x + 5000) Another useful call would be probe_update (probe, currentprobeloc, cellID[threadid.x + blockid.x * blockdim.x]) */ __device__ void probe_update (volatile int *probe, int probebaseaddress, int val) { int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < NUM_BLOCKS * NUM_THREADS) { probe[probebaseaddress + i +1] = val; } else {}; } __device__ void probe_init (int *probe, int address, int probecode) { /* probecode is a special recognizable number for searching the otput */ probe[address] = probecode; } __device__ int wait_gpu (int *probe,int j) { // probe_update(probe,100+NUM_THREADS + NUM_THREADS,WAIT_GPU); ++++++++++++++++++++ while (j-- > 0); return 1; } __device__ int add_op (int a, int b) {return (a + b);} __device__ int update_op (volatile int *probe, int location, int x_o, int y_o) { int x_b = get_object_left (x_o); int x_i = get_object_right(x_o); int y_b = get_object_left(y_o); int y_i = get_object_right(y_o); int res_b = x_b || y_b; int res_i; if (x_b == 1) {res_i = x_i;} else {res_i = y_i;} return mk_object (res_b,res_i); } __device__ int lookup_op (volatile int *probe, int location, int x_o, int y_o) { int d_x = get_object_left (x_o); int d_y = get_object_left(y_o); if (d_x > d_y) {return x_o;} else {return y_o;} } __device__ int fold_op (volatile int *probe, int location, int x_o, int y_o,int which_op) { switch (which_op) { case 0: return update_op(probe,location,x_o,y_o); case 1: return lookup_op(probe,location,x_o,y_o); } } __device__ void update_fold_in_parallel (volatile int *probe,volatile int *s_data, volatile int *sums, int which_op) { int j = threadIdx.x; int offset = 1; int c = 0; while (offset < blockDim.x) { if ((j % (offset * 2)) == 0) { s_data[j] = fold_op (probe,1,s_data[j],s_data[(j + offset)],which_op); } else {}; c++; offset *= 2; __syncthreads(); } __syncthreads(); //sums[blockIdx.x * blockDim.x] = s_data[0]; sums[blockIdx.x * blockDim.x] 
= s_data[0]; __syncthreads(); } __device__ void update_fold_over_blocks (volatile int *probe,volatile int *sums_data, volatile int *sums,int which_op) { int j = threadIdx.x; int offset = 1; //int c = 0; while (offset < blockDim.x) { if ((j % (offset * 2)) == 0) { sums_data[j] = fold_op (probe,2,sums_data[j],sums_data[j + offset],which_op); } else {}; // if (c == 1) probe[threadIdx.x + blockIdx.x * blockDim.x] = q; // c++; offset *= 2; __syncthreads(); } __syncthreads(); if (threadIdx.x == 0 && blockIdx.x == 0) sums[0] = sums_data[0]; else {}; __syncthreads(); } __device__ void transpose1 (volatile int *probe,volatile int *s_data, volatile int *sums_data, volatile int *sums) { int i; int keepwaiting = 1; int waitcount = 0; int waitlimit = WAIT_LIMIT; i = threadIdx.x; // probe_update(probe,100,i); +++++++++++++++ // probe_update(probe,100+NUM_THREADS,i); +++++++++++++++++++ int bid = 0; int tid = threadIdx.x; while (bid < NUM_BLOCKS && tid < NUM_THREADS) { /* while (keepwaiting) { if (sums[bid * blockDim.x] != 0 || waitcount>waitlimit) {keepwaiting = 0;} else { waitcount++; wait_gpu(probe,100);} } __syncthreads(); */ sums_data[tid] = sums[bid * blockDim.x]; /* keepwaiting = 1; waitcount = 0; while (keepwaiting) { if (sums_data[tid] != 0 || waitcount>waitlimit) {keepwaiting = 0;} else { waitcount++; wait_gpu(probe,100);} } */ tid++; bid++; } __syncthreads(); } __device__ void fold (volatile int *probe,int initial_value, volatile int *sums, int *gs_in, int *gs_out, volatile int *device_results,volatile int *s_data, int *result, int *result_error, int which_op,int *y) { volatile __shared__ int sums_data [NUM_THREADS]; // note that NUM_THREADS should be >= NUM_BLOCKS int idx,sum,i,ci; *result = FOLD_NOT_READY; *result_error = FOLD_NOT_READY; __syncthreads(); sums_data[threadIdx.x] = initial_value; __syncthreads(); sums[threadIdx.x + blockIdx.x * blockDim.x] = initial_value; __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; else {}; __syncthreads(); 
update_fold_in_parallel(probe,s_data,sums,which_op); __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); s_data[threadIdx.x] = initial_value; //= __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; else {}; __syncthreads(); transpose1(probe,s_data,s_data,sums); //= __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; else {}; __syncthreads(); update_fold_over_blocks (probe,s_data,sums,which_op); //= __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); *result = get_object_right(sums[0]); *result_error = get_object_left(sums[0]); __syncthreads(); } __device__ int delete_apply_mapdef_change (volatile int *device_results, int cmd,int array_name,int code,int code_ok,int mapdef,int id) { if (device_results[DEVICE_RESULT_GPU_CLOCK] == device_results[DEVICE_RESULT_N] && cmd == DELETE && array_name != ESF_EMPTY && code_ok == 1) return (mapdef && (array_name != id)); else return mapdef; } __device__ int delete_apply_mapcode_change (volatile int *device_results, int cmd,int array_name,int code,int code_ok,int mapdef,int mapcode) { if (device_results[DEVICE_RESULT_GPU_CLOCK] == device_results[DEVICE_RESULT_N] && cmd == DELETE && array_name != ESF_EMPTY && code_ok == 1 && mapdef == 1 && mapcode > code) return (mapcode - 1); else return mapcode; } __device__ int delete_apply_low_change (volatile int *device_results, int cmd,int array_name,int code,int code_ok,int eltdef,int low) { if (device_results[DEVICE_RESULT_GPU_CLOCK] == device_results[DEVICE_RESULT_N] && cmd == DELETE && array_name != ESF_EMPTY && code_ok == 1 && eltdef == 1 && low > code) return (low - 1); else return low; } __device__ int delete_apply_high_change (volatile int *device_results, int cmd,int array_name,int code,int code_ok,int eltdef,int high) { if (device_results[DEVICE_RESULT_GPU_CLOCK] == device_results[DEVICE_RESULT_N] && cmd == DELETE && array_name != ESF_EMPTY && code_ok == 1 && eltdef == 1 && high 
>= code) return (high - 1); else return high; } __device__ int delete_apply_eltdef_change(volatile int *device_results, int cmd,int array_name,int code,int code_ok,int eltdef, int low, int high) { if (device_results[DEVICE_RESULT_GPU_CLOCK] == device_results[DEVICE_RESULT_N] && cmd == DELETE && array_name != ESF_EMPTY && code_ok == 1) return (eltdef && (low <= high)); else return eltdef; } void dump_cell (int cellid,int eltdef,int high,int index, int low,int mapcode, int mapdef,int mark,int rank,int value) { if (eltdef || mapdef) { printf (" %d. ",cellid); if (eltdef) {printf ("*");} else {printf (" ");} if (mark) {printf ("@");} else {printf (" ");} if (eltdef) { printf ("Low=%d, High=%d, [%d] = %d",low,high,index,value); } else {printf(" ");} if (mapdef) { printf (" : array %d -> %d",cellid,mapcode); } printf ("\n"); } return; } void dump_cells (int *outprint) { int i; for (i = 0; i < OUT_PRINT_NUM * NUM_BLOCKS * NUM_THREADS; i+=OUT_PRINT_NUM) { dump_cell(outprint[i],outprint[i+1],outprint[i+2],outprint[i+3],outprint[i+4],outprint[i+5], outprint[i+6],outprint[i+7],outprint[i+8],outprint[i+9]); } for (i = 0; i < OUT_PRINT_NUM * NUM_BLOCKS * NUM_THREADS; i++) outprint[i] = 0; } __device__ void prepare_out(int *out, int *cellid, int *eltdef, int *high, int *index, int *low, int *mapcode, int *mapdef, int *mark, int *rank, int *value, volatile int *gpudata, volatile int *device_results,volatile int *y) { int tid11 = threadIdx.x * OUT_PRINT_NUM; int blockD11 = blockIdx.x * blockDim.x * OUT_PRINT_NUM; int whole_tid = threadIdx.x; __syncthreads(); out[tid11 + blockD11] = cellid[whole_tid]; __syncthreads(); out[tid11 + blockD11 + 1]= eltdef[whole_tid]; __syncthreads(); out[tid11 + blockD11 + 2] = high[whole_tid]; __syncthreads(); out[tid11 + blockD11 + 3]= index[whole_tid]; __syncthreads(); out[tid11 + blockD11 + 4] = low[whole_tid]; __syncthreads(); out[tid11 + blockD11 + 5] = mapcode[whole_tid]; __syncthreads(); out[tid11 + blockD11 + 6] = mapdef[whole_tid]; 
__syncthreads(); out[tid11 + blockD11 + 7] = mark[whole_tid]; __syncthreads(); out[tid11 + blockD11 + 8] = rank[whole_tid]; __syncthreads(); out[tid11 + blockD11 + 9] = value[whole_tid]; __syncthreads(); } __device__ void terminate(volatile int *gpudata,volatile int *device_results, int *broadcast,volatile int *y, int output_index) { if (threadIdx.x == 0 && blockIdx.x == 0) { gpudata[GPUOutData] = 0; gpudata[GPUOutError] = 0; } __syncthreads(); } __device__ int get_mapcode(int a,int mapdef,int mapcode,int cellid) { if (a == cellid && mapdef) return mk_object(1,mapcode); else return mk_object(0,0); } __device__ int get_rank(int a,int mapdef,int rank,int cellid) { if (a == cellid && mapdef) return mk_object(1,rank); else return mk_object(0,0); } __device__ void encode (volatile int *probe, int *cellid, int *eltdef, int *mapcode,int *mapdef, int *rank, int *gs_in, int *gs_out, int a,volatile int *device_results, volatile int *encode_result,int *y,int *result, int *result_error, int *var_rank) { volatile __shared__ int s_data[NUM_THREADS]; int var_rank_error; __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; __syncthreads(); s_data[threadIdx.x] = get_mapcode(a,mapdef[threadIdx.x],mapcode[threadIdx.x],cellid[threadIdx.x]); __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); fold (probe,mk_object(0,0),encode_result,gs_in,gs_out,device_results,s_data, result,result_error,UPDATE_FOLD,y); __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; __syncthreads(); s_data[threadIdx.x] = get_rank(a,mapdef[threadIdx.x],rank[threadIdx.x],cellid[threadIdx.x]); __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); fold (probe,mk_object(0,0),encode_result,gs_in,gs_out,device_results,s_data, var_rank,&var_rank_error,UPDATE_FOLD,y); __syncthreads(); } __device__ void encode2 (int *probe, int *cellid, int *eltdef, int *mapcode,int *mapdef, int *rank, int *gs_in, int *gs_out, int a,volatile int *device_results, volatile int *encode_result,int 
*y,int *result, int *result_error, int *var_rank) { int mapd,mapc,vrank; //init_gpu_sync(gs_in,gs_out); //gpu_sync(1,gs_in,gs_out); __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; else {}; __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); encode_result[threadIdx.x + blockIdx.x * blockDim.x] = mapdef[threadIdx.x]; __syncthreads(); mapd = encode_result[a]; __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; else {}; __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); encode_result[threadIdx.x + blockIdx.x * blockDim.x] = mapcode[threadIdx.x]; __syncthreads(); mapc = encode_result[a]; __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; else {}; __syncthreads(); encode_result[threadIdx.x + blockIdx.x * blockDim.x] = rank[threadIdx.x]; __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); vrank = encode_result[a]; if (mapd == 1) { *result = mapc; *result_error = UPDATE_SUCCESS; *var_rank = vrank; } else { *result = 0; *result_error = UPDATE_ERROR_CODE; *var_rank = 0; }; } __device__ int map_alloc (int eltdef, int cellid) { return mk_object(!eltdef,cellid); } __device__ void allocateCell (volatile int *probe, int *cellid, int *eltdef, volatile int *sums, int *gs_in, int *gs_out, int ok,volatile int *device_results, int *result, int *result_error, int *y) { volatile __shared__ int s_data [NUM_THREADS]; __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; __syncthreads(); s_data[threadIdx.x] = map_alloc(eltdef[threadIdx.x],cellid[threadIdx.x]); __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); fold (probe,mk_object(0,0),sums,gs_in,gs_out,device_results,s_data, result,result_error,UPDATE_FOLD,y); __syncthreads(); } __device__ int lookup_map_fun (int mark, int rank,int value) { if (mark) {return mk_object (rank,value);} else {return mk_object (-1,0);} } __device__ int apply_lookup_change (int ind,int i,int low,int c,int high,int mark) { if (ind == i && low <= c && c <= 
high) return 1; else return 0; } __device__ void lookup_fold (volatile int *probe, int *high, int *index, int *low, int *mark,int *rank,int *value,int *gs_in, int *gs_out, int ok,int c,int i, volatile int *device_results, volatile int *temp, int *result, int *result_error, int *y) { volatile __shared__ int s_data[NUM_THREADS]; __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; __syncthreads(); mark[threadIdx.x] = apply_lookup_change(index[threadIdx.x],i,low[threadIdx.x],c,high[threadIdx.x],mark[threadIdx.x]); __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; __syncthreads(); s_data[threadIdx.x] = lookup_map_fun(mark[threadIdx.x],rank[threadIdx.x],value[threadIdx.x]); __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); fold (probe,mk_object(-1,0),temp,gs_in,gs_out,device_results,s_data, result,result_error, LOOKUP_FOLD,y); __syncthreads(); mark[threadIdx.x] = 0; __syncthreads(); } __device__ int update_maybe_inc_low (volatile int *probe,volatile int *device_results,int cmd, int array_name, int code,int code_ok,int newname,int newname_ok, int cellid,int eltdef,int low) { int the_code,the_code_ok,the_cnew; if (array_name == ESF_EMPTY) { the_code = 0; the_code_ok = 1; the_cnew = 1; } else { the_code = code; the_code_ok = code_ok; the_cnew = code + 1; }; __syncthreads(); if (device_results[DEVICE_RESULT_GPU_CLOCK] == device_results[DEVICE_RESULT_N] && cmd == UPDATE && the_code_ok == 1 && newname_ok == 1) { if (cellid == newname) return the_cnew; else if (eltdef == 1 && low > the_code) return (low + 1); else return low; } else return low; } __device__ int update_maybe_inc_high (volatile int *probe,volatile int *device_results, int cmd, int array_name, int code,int code_ok,int newname, int newname_ok, int cellid, int eltdef,int high) { int the_code,the_code_ok, the_cnew; if (array_name == ESF_EMPTY) { the_code = 0; the_code_ok = 1; the_cnew = 1; } else { the_code = code; the_code_ok = 
code_ok; the_cnew = code + 1; }; __syncthreads(); if (device_results[DEVICE_RESULT_GPU_CLOCK] == device_results[DEVICE_RESULT_N] && cmd == UPDATE && the_code_ok == 1 && newname_ok == 1) { if (cellid == newname) return the_cnew; else if (eltdef == 1 && high >= the_code) return (high + 1); else return high; } else return high; } __device__ int update_maybe_inc_mapcode (volatile int *probe,volatile int *device_results, int cmd, int array_name,int code,int code_ok,int newname, int newname_ok,int cellid,int mapdef,int mapcode) { int the_code,the_code_ok,the_cnew; if (array_name == ESF_EMPTY) { the_code = 0; the_code_ok = 1; the_cnew = 1; } else { the_code = code; the_code_ok = code_ok; the_cnew = code + 1; }; __syncthreads(); if (device_results[DEVICE_RESULT_GPU_CLOCK] == device_results[DEVICE_RESULT_N] && cmd == UPDATE && the_code_ok == 1 && newname_ok == 1) { if (cellid == newname) return the_cnew; else if (mapdef == 1 && mapcode > the_code) return (mapcode + 1); else return mapcode; } else return mapcode; } __device__ int update_change_bool (volatile int *probe,volatile int *device_results, int cmd, int array_name, int code, int code_ok, int newname,int newname_ok, int id, int bool_val) { int the_code,the_code_ok; if (array_name == ESF_EMPTY) { the_code = 0; the_code_ok = 1; } else { the_code = code; the_code_ok = code_ok; }; __syncthreads(); if (device_results[DEVICE_RESULT_GPU_CLOCK] == device_results[DEVICE_RESULT_N] && cmd == UPDATE && the_code_ok == 1 && newname_ok == 1 && id == newname) return 1; else return bool_val; } __device__ int update_change_elt (volatile int *probe,volatile int *device_results, int cmd, int array_name, int code, int code_ok, int newname,int newname_ok, int id, int value, int oldvalue) { int the_code,the_code_ok; if (array_name == ESF_EMPTY) { the_code = 0; the_code_ok = 1; } else { the_code = code; the_code_ok = code_ok; }; __syncthreads(); probe[48] = 555555; probe[49] = newname; if (device_results[DEVICE_RESULT_GPU_CLOCK] == 
device_results[DEVICE_RESULT_N] && cmd == UPDATE && the_code_ok == 1 && newname_ok == 1 && id == newname) return value; else return oldvalue; } __device__ void init (volatile int *probe,int *cellid,int *eltdef, int *high, int *index,int *low, int *mapcode, int *mapdef, int *mark, int *rank, int *value, volatile int *gpudata, volatile int *device_results,int *y, int output_index) { __syncthreads(); eltdef[threadIdx.x] = 0; __syncthreads(); high[threadIdx.x] = 0; __syncthreads(); index[threadIdx.x] = 0; __syncthreads(); low[threadIdx.x] = 0; __syncthreads(); mapcode[threadIdx.x] = 0; __syncthreads(); mapdef[threadIdx.x] = 0; __syncthreads(); mark[threadIdx.x] = 0; __syncthreads(); rank[threadIdx.x] = 0; __syncthreads(); value[threadIdx.x] = 0; __syncthreads(); } __global__ void kernel (volatile int *probe,int *out,int *gs_in, volatile int *sums, volatile int *temp,int *gs_out, volatile int *gpudata, volatile int *cdata, int *rank_temp,int *y, volatile int *device_results, volatile int *device_results_unsigned,volatile int *encode_result) { __shared__ int cellid [NUM_THREADS]; __shared__ int eltdef [NUM_THREADS]; __shared__ int high [NUM_THREADS]; __shared__ int index [NUM_THREADS]; __shared__ int low [NUM_THREADS]; __shared__ int mapcode [NUM_THREADS]; __shared__ int mapdef [NUM_THREADS]; __shared__ int mark [NUM_THREADS]; __shared__ int rank [NUM_THREADS]; __shared__ int value [NUM_THREADS]; __shared__ int broadcast [NUM_THREADS]; __syncthreads(); cellid[threadIdx.x] = threadIdx.x + blockIdx.x * blockDim.x; __syncthreads(); int input_index; int output_index; int cmd; int not_yet_finished; __syncthreads(); input_index = 0; output_index = 0; theNextGoal = 0; cmd = 0; not_yet_finished = 1; __syncthreads(); init_gpu_sync(gs_in,gs_out); __syncthreads(); if (threadIdx.x == 0 && blockIdx.x == 0) { gpudata[GPUOutData] = 0; gpudata[GPUOutReady] = 0; gpudata[GPUInAck] = 0; gpudata[GPUStart] = 1; } else {}; __syncthreads(); device_results[DEVICE_RESULT_GPU_CLOCK] = 0; int 
array_name,array_index,array_value,code,code_ok, var_rank,lookup,lookup_error,newname,newname_ok; while (not_yet_finished) { if (threadIdx.x == 0) theNextGoal += 1; else {}; __syncthreads(); if (threadIdx.x == 0 && blockIdx.x == 0) { while (cdata[COutReady] == 0) {rank_temp[0] = atomicCAS(rank_temp,rank_temp[1],0);}; } else {}; __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; else {}; __syncthreads(); if (threadIdx.x == 0 && blockIdx.x == 0) { device_results[DEVICE_RESULT_CMD] = cdata[COutData_CMD]; device_results[DEVICE_RESULT_A] = cdata[COutData_A]; device_results[DEVICE_RESULT_IND] = cdata[COutData_IND]; device_results[DEVICE_RESULT_VAL] = cdata[COutData_VAL]; device_results[DEVICE_RESULT_N] = cdata[COutData_N]; __threadfence(); } else {}; __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); /* broadcasting(device_results[DEVICE_RESULT_CMD],broadcast,y); __syncthreads(); cmd = broadcast[threadIdx.x]; __syncthreads(); broadcasting(device_results[DEVICE_RESULT_A],broadcast,y); __syncthreads(); array_name = broadcast[threadIdx.x]; __syncthreads(); broadcasting(device_results[DEVICE_RESULT_IND],broadcast,y); __syncthreads(); array_index = broadcast[threadIdx.x]; __syncthreads(); broadcasting(device_results[DEVICE_RESULT_VAL],broadcast,y); __syncthreads(); array_value = broadcast[threadIdx.x]; __syncthreads(); */ cmd = device_results[DEVICE_RESULT_CMD]; array_name = device_results[DEVICE_RESULT_A]; array_index = device_results[DEVICE_RESULT_IND]; array_value = device_results[DEVICE_RESULT_VAL]; __syncthreads(); if (cmd == UPDATE) probe[5] = newname; code = -1; code_ok = -1; var_rank = -1; lookup = -1; lookup_error = -1; newname = -1; newname_ok = -1; __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; else {}; __syncthreads(); if (threadIdx.x == 0 && blockIdx.x == 0) { gpudata[GPUInAck] = 1; } else {}; __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); if (cmd 
== TERMINATE) not_yet_finished = 0; else {}; if (cmd == INIT) { init(probe,cellid,eltdef,high,index,low,mapcode,mapdef,mark, rank,value,gpudata,device_results,y,output_index); } else {}; __syncthreads(); encode (probe,cellid,eltdef,mapcode,mapdef,rank,gs_in,gs_out, array_name,device_results, encode_result,y,&code,&code_ok,&var_rank); __syncthreads(); lookup_fold (probe,high,index,low,mark,rank,value,gs_in,gs_out,code_ok,code,array_index,device_results, temp,&lookup,&lookup_error,y); __syncthreads(); allocateCell (probe,cellid,eltdef,sums,gs_in,gs_out,code_ok,device_results,&newname,&newname_ok,y); __syncthreads(); if (cmd == UPDATE) probe[6] = newname; __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; else {}; __syncthreads(); low[threadIdx.x] = update_maybe_inc_low(probe,device_results, cmd,array_name,code,code_ok,newname,newname_ok,cellid[threadIdx.x], eltdef[threadIdx.x],low[threadIdx.x]); __syncthreads(); high[threadIdx.x] = update_maybe_inc_high(probe,device_results, cmd,array_name,code,code_ok,newname,newname_ok,cellid[threadIdx.x], eltdef[threadIdx.x],high[threadIdx.x]); __syncthreads(); mapcode[threadIdx.x] = update_maybe_inc_mapcode(probe,device_results, cmd,array_name,code,code_ok,newname,newname_ok,cellid[threadIdx.x], mapdef[threadIdx.x],mapcode[threadIdx.x]); __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); if (threadIdx.x == 0) theNextGoal += 1; else {}; __syncthreads(); index [threadIdx.x] = update_change_elt(probe,device_results, cmd,array_name,code,code_ok,newname,newname_ok,cellid[threadIdx.x], array_index,index[threadIdx.x]); __syncthreads(); value [threadIdx.x] = update_change_elt(probe,device_results, cmd,array_name,code,code_ok,newname,newname_ok,cellid[threadIdx.x], array_value,value[threadIdx.x]); __syncthreads(); rank [threadIdx.x] = update_change_elt(probe,device_results, cmd,array_name,code,code_ok,newname,newname_ok,cellid[threadIdx.x], var_rank + 1,rank[threadIdx.x]); __syncthreads(); mapdef [threadIdx.x] = 
update_change_bool(probe,device_results, cmd,array_name,code,code_ok,newname,newname_ok, cellid[threadIdx.x],mapdef[threadIdx.x]); __syncthreads(); eltdef [threadIdx.x] = update_change_bool(probe,device_results, cmd,array_name,code,code_ok,newname,newname_ok, cellid[threadIdx.x],eltdef[threadIdx.x]); __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; else {}; __syncthreads(); low[threadIdx.x] = delete_apply_low_change(device_results, cmd,array_name,code,code_ok,eltdef[threadIdx.x],low[threadIdx.x]); __syncthreads(); high[threadIdx.x] = delete_apply_high_change(device_results, cmd,array_name,code,code_ok,eltdef[threadIdx.x],high[threadIdx.x]); __syncthreads(); mapcode[threadIdx.x] = delete_apply_mapcode_change(device_results, cmd,array_name,code,code_ok, mapdef[threadIdx.x],mapcode[threadIdx.x]); __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; else {}; __syncthreads(); mapdef[threadIdx.x] = delete_apply_mapdef_change(device_results,cmd,array_name,code,code_ok, mapdef[threadIdx.x],cellid[threadIdx.x]); __syncthreads(); eltdef[threadIdx.x] = delete_apply_eltdef_change(device_results,cmd,array_name,code,code_ok, eltdef[threadIdx.x],low[threadIdx.x],high[threadIdx.x]); __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); // default is error if (device_results[DEVICE_RESULT_GPU_CLOCK] == device_results[DEVICE_RESULT_N]) { if (threadIdx.x == 0 && blockIdx.x == 0) { gpudata[GPUOutData] = 0; gpudata[GPUOutError] = 0; } else {}; __syncthreads(); } else {}; int the_code_ok; if (array_name == ESF_EMPTY) the_code_ok = 1; else the_code_ok = code_ok; // note - only for UPDATE __syncthreads(); if (device_results[DEVICE_RESULT_GPU_CLOCK] == device_results[DEVICE_RESULT_N] && cmd == UPDATE && the_code_ok == 1 && newname_ok == 1) { if (threadIdx.x == 0 && blockIdx.x == 0) { gpudata[GPUOutError] = newname_ok; gpudata[GPUOutData] = newname; } 
else {}; __syncthreads(); } else {}; if (device_results[DEVICE_RESULT_GPU_CLOCK] == device_results[DEVICE_RESULT_N] && cmd == UPDATE && (the_code_ok == UPDATE_ERROR_CODE || newname_ok == UPDATE_ERROR_CODE)) { if (threadIdx.x == 0 && blockIdx.x == 0) { gpudata[GPUOutData] = 0; gpudata[GPUOutError] = UPDATE_ERROR_CODE; } else {}; __syncthreads(); } else {}; if (device_results[DEVICE_RESULT_GPU_CLOCK] == device_results[DEVICE_RESULT_N] && cmd == DELETE && code_ok == 1 && array_name != ESF_EMPTY) { if (threadIdx.x == 0 && blockIdx.x == 0) { gpudata[GPUOutError] = 1; gpudata[GPUOutData] = 0; } else {}; __syncthreads(); } else {}; if (device_results[DEVICE_RESULT_GPU_CLOCK] == device_results[DEVICE_RESULT_N] && cmd == DELETE && (code_ok == 0 || array_name == ESF_EMPTY)) { if (threadIdx.x == 0 && blockIdx.x == 0) { gpudata[GPUOutData] = 0; gpudata[GPUOutError] = UPDATE_ERROR_CODE; } else {}; __syncthreads(); } else {}; if (cmd == LOOKUP && code_ok == 1 && array_name != ESF_EMPTY) { if (threadIdx.x == 0 && blockIdx.x == 0) { gpudata[GPUOutError] = lookup_error; gpudata[GPUOutData] = lookup; } else {}; __syncthreads(); } else {}; if (cmd == LOOKUP && (code_ok == 0 || array_name == ESF_EMPTY)) { if (threadIdx.x == 0 && blockIdx.x == 0) { gpudata[GPUOutData] = 0; gpudata[GPUOutError] = UPDATE_ERROR_CODE; } else {}; __syncthreads(); } else {}; if (device_results[DEVICE_RESULT_GPU_CLOCK] == device_results[DEVICE_RESULT_N] && (cmd == INIT || cmd == TERMINATE)) { if (threadIdx.x == 0 && blockIdx.x == 0) { gpudata[GPUOutError] = 1; gpudata[GPUOutData] = 0; } else {}; __syncthreads(); } else {}; if (threadIdx.x == 0) theNextGoal += 1; else {}; __syncthreads(); if (threadIdx.x == 0 && blockIdx.x == 0) { gpudata[GPUOutReady] = 1; } else {}; __syncthreads(); gpu_sync(theNextGoal,gs_in,gs_out,y); __syncthreads(); if (threadIdx.x == 0) theNextGoal += 1; else {}; __syncthreads(); if (threadIdx.x == 0 && blockIdx.x == 0) { while (cdata[CInAck] == 0) {rank_temp[0] = 
atomicCAS(rank_temp,rank_temp[1],1);}; } else {};
  __syncthreads();
  gpu_sync(theNextGoal,gs_in,gs_out,y);
  __syncthreads();
  //if (threadIdx.x == 0) theNextGoal += 1; else {};
  //__syncthreads();
  // Thread 0 of block 0 lowers both handshake flags before the next command.
  if (threadIdx.x == 0 && blockIdx.x == 0) { gpudata[GPUInAck] = 0; gpudata[GPUOutReady] = 0; } else {};
  __syncthreads();
  //gpu_sync(theNextGoal,gs_in,gs_out,y);
  //__syncthreads();
  __threadfence();
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    if (device_results[DEVICE_RESULT_GPU_CLOCK] == device_results[DEVICE_RESULT_N])
      device_results[DEVICE_RESULT_GPU_CLOCK] += 1;
    else {};
  } else {};
  __syncthreads();
  }
  prepare_out(out,cellid,eltdef,high,index,low,mapcode,mapdef,mark,rank,value,gpudata,device_results,y);
  if (threadIdx.x == 0) theNextGoal += 1; else {};
  __syncthreads();
  if (threadIdx.x == 0 && blockIdx.x == 0) gpudata[GPUStop] = 1; else {};
  __syncthreads();
  gpu_sync(theNextGoal,gs_in,gs_out,y);
}

/* Build a canned command script in n[]: records of 4 ints
 * (opcode, name, index, value).  Layout: one INIT record, one UPDATE
 * record, UPDATE records up to k == 196, then FETCH_CELL, then
 * TERMINATE.  The caller's array must hold at least 208 ints.
 */
void setup_commands (int *n)
{
  n[0] = INIT;   n[1] = 0;         n[2] = 0; n[3] = 0;
  n[4] = UPDATE; n[5] = ESF_EMPTY; n[6] = 0;
  n[7] = 0;  /* BUG FIX: original wrote "n[6] = 0; n[6] = 0;", leaving n[7] uninitialized */
  int k;
  /* NOTE: n[k+3] receives 1 and the post-increment then bumps n[k+2]
   * to 2 -- preserved exactly from the original code. */
  for (k = 8; k < 198; k+=4) {n[k] = UPDATE; n[k+1] = 0; n[k+2] = 1; n[k+3] = n[k+2]++;}
  n[k] = FETCH_CELL; n[k+1] = k; n[k+2] = 0; n[k+3] = 0;
  k+=4;
  n[k] = TERMINATE; n[k+1] = k; n[k+2] = 0; n[k+3] = 0;
}

/* Print the packed (left,right) halves of each entry of out_fold,
 * one printf row per block of NUM_THREADS entries. */
void dump_fold_info (int *out_fold)
{
  int k,n;
  for (k = 0; k < NUM_BLOCKS; k++) {
    printf("\n");
    for (n = 0; n < NUM_THREADS; n++) {
      printf("[%d]=b %d, id %d",n+(NUM_THREADS * k),
             cpu_get_object_left(out_fold[n+(NUM_THREADS * k)]),
             cpu_get_object_right(out_fold[n+(NUM_THREADS * k)]));
    }
  }
}

/* Dump every (error, value) pair of the output_data record array. */
void dump_data(int *output_data)
{
  int k;
  for (k = 0; k < OUTPUT_DATA_RECORD_SIZE * OPERATIONSLIMIT; k+= OUTPUT_DATA_RECORD_SIZE)
    printf("error [%d]=%d, value [%d]=%d\n",k,output_data[k],k+1,output_data[k+1]);
}

/* Normalize the raw device error word per opcode:
 * LOOKUP: raw -1 -> 1, otherwise 0; UPDATE: raw 0 -> 1, otherwise 0;
 * any other opcode -> 0. */
int interpret_error (int code,int raw_error)
{
  if (code == LOOKUP) {if (raw_error == -1) return 1; else return 0;}
  else if (code == UPDATE) {if (raw_error == 0) return 1; else return 0;}
  else return 0;
}

void make_report (int stopped_ln_no,int *oparray, int
*output_data) {
  /* Walk the operation array and the collected device output side by
   * side, printing one report line per operation up to (and including)
   * the line number at which the run stopped.  Flags ERRORS when any
   * returned value differs from the expected value. */
  int ok,k,k2;
  k2 = 0; ok = 1; k = 0;
  while (oparray[k + OPARRAY_LINE_NUMBER] <= stopped_ln_no &&
         oparray[k + OPARRAY_LINE_NUMBER] != 0) {
    printf("line number %7d, ",oparray[k+OPARRAY_LINE_NUMBER]);
    switch (oparray[k+OPARRAY_OPCODE]) {
      case INIT:      printf("I, "); break;  /* 101 */
      case UPDATE:    printf("U, "); break;  /* 103 */
      case DELETE:    printf("D, "); break;  /* 104 */
      case LOOKUP:    printf("L, "); break;  /* 105 */
      case TERMINATE: printf("T, "); break;  /* 102 */
    }
    printf("%7d, [%7d], %7d,",
           oparray[k+OPARRAY_NAME],oparray[k+OPARRAY_INDEX],oparray[k+OPARRAY_VALUE]);
    int error = interpret_error(oparray[k+OPARRAY_OPCODE],
                                output_data[k2 + OUTPUT_DATA_ERROR]);
    printf("%7d, < %d >",error,error == oparray[k+OPARRAY_ERROR]);
    printf("%7d, ( %d ) *%d\n",
           output_data[k2+OUTPUT_DATA_VALUE],
           output_data[k2+OUTPUT_DATA_VALUE] == oparray[k+OPARRAY_EXPECTED_VALUE],
           oparray[k+OPARRAY_EXPECTED_VALUE]);
    if (output_data[k2+OUTPUT_DATA_VALUE] != oparray[k+OPARRAY_EXPECTED_VALUE]) ok = 0;
    k2 += OUTPUT_DATA_RECORD_SIZE;
    k += OPARRAY_RECORD_SIZE;
  }
  if (ok) printf("No errors\n");
  else printf ("+++++++++++++++++++ ERRORS ++++++++++++++++++++\n");
}

/* Copy the parsed operation records into the input_data layout the
 * device protocol expects; NOTIFY is cleared on every record. */
void copy_to_input_data (int *oparray,int *input_data)
{
  int k,k2;
  k2 = 0;
  for (k = 0; k < OPSDATASIZE; k += OPARRAY_RECORD_SIZE) {
    input_data[k2+INPUT_DATA_OPCODE] = oparray[k+OPARRAY_OPCODE];
    input_data[k2+INPUT_DATA_NAME]   = oparray[k+OPARRAY_NAME];
    input_data[k2+INPUT_DATA_INDEX]  = oparray[k+OPARRAY_INDEX];
    input_data[k2+INPUT_DATA_VALUE]  = oparray[k+OPARRAY_VALUE];
    input_data[k2+INPUT_DATA_NOTIFY] = 0;
    k2 += INPUT_DATA_RECORD_SIZE;
  }
}

/* Host half of the CPU<->GPU mailbox handshake over zero-copy memory:
 * wait for the device ack, drop the request flags, wait for the reply,
 * read the payload, and acknowledge it.  Factored out of the five
 * identical copies that the original main() contained.
 * NOTE(review): the spin loops read plain (non-volatile) mapped memory;
 * this relies on the probe side-effect preventing the compiler from
 * caching the loads -- TODO confirm on the target toolchain. */
static void exchange_with_gpu (int *cdata, int *gpudata, int *probe,
                               int *result, int *result_err)
{
  while (gpudata[GPUInAck] == 0) {probe[0] = probe[1]++;}
  cdata[CInAck] = 0;
  cdata[COutReady] = 0;
  while (gpudata[GPUOutReady] == 0) {probe[0] = probe[1]++;}
  *result = gpudata[GPUOutData];
  *result_err = gpudata[GPUOutError];
  cdata[CInAck] = 1;
}

/* Driver: allocate host/device buffers, launch the resident kernel once,
 * then feed it commands one at a time through the cdata/gpudata mailbox,
 * collecting each reply into output_data.  Stops at TERMINATE or on the
 * first value mismatch (in which case it fast-forwards to TERMINATE so
 * the kernel can shut down), then prints timing and a report. */
int main (int argc, char* argv[])
{
  cudaDeviceProp prop;
  int whichDevice;
  cudaGetDevice (&whichDevice);
  cudaGetDeviceProperties(&prop, whichDevice );
  /* Zero-copy (mapped host) memory is required for the mailbox protocol. */
  if (prop.canMapHostMemory != 1) { printf("Device cannot map memory\n"); return 0; }
  cudaSetDeviceFlags( cudaDeviceMapHost );
  cudaEvent_t start,stop;
  float elapsedTime;
  int gs_in[NUM_BLOCKS * NUM_THREADS] = {0};                        int *dev_gs_in;
  int sums [NUM_BLOCKS * NUM_THREADS] = {0};                        int *dev_sums;
  int encode_result[NUM_BLOCKS * NUM_THREADS] = {0};                int *dev_encode_result;
  int temp[NUM_BLOCKS * NUM_THREADS] = {0};                         int *dev_temp;
  int rank_temp[NUM_BLOCKS * NUM_THREADS] = {0};                    int *dev_rank_temp;
  int y[NUM_BLOCKS * NUM_THREADS] = {0};                            int *dev_y;
  int device_results[DEVICE_RESULT_SIZE] = {0};                     int *dev_device_results;
  int device_results_unsigned[20] = {0};                            int *dev_device_results_unsigned;
  int gs_out[NUM_BLOCKS * NUM_THREADS] = {0};                       int *dev_gs_out;
  int out_print[OUT_PRINT_NUM * NUM_BLOCKS * NUM_THREADS] = {0};    int *dev_out_print;
  int input_data[INPUT_DATA_RECORD_SIZE * OPERATIONSLIMIT] = {0};   int *dev_input_data;
  int output_data[OUTPUT_DATA_RECORD_SIZE * OPERATIONSLIMIT] = {0}; int *dev_output_data;
  int *cdata,*dev_cdata,*gpudata,*dev_gpudata;  /* mailbox words, host-mapped */
  int *probe, *dev_probe;                       /* debug scratch, host-mapped */
  int oparray [OPSDATASIZE];
  int special_k;
  special_k = readoperations(oparray);
  printoperations (oparray, special_k);
  copy_to_input_data(oparray,input_data);
  cudaEventCreate( &start);
  cudaEventCreate( &stop);
  /* Mapped (zero-copy) host allocations for the mailbox and the probe. */
  cudaHostAlloc ( (void**)&cdata,   CDATA_SIZE   * sizeof(int), cudaHostAllocWriteCombined | cudaHostAllocMapped );
  cudaHostAlloc ( (void**)&gpudata, GPUDATA_SIZE * sizeof(int), cudaHostAllocWriteCombined | cudaHostAllocMapped );
  cudaHostAlloc ( (void**)&probe,   PROBE_LIMIT  * sizeof(int), cudaHostAllocWriteCombined | cudaHostAllocMapped );
  /* Plain device buffers. */
  cudaMalloc ( (void**)&dev_gs_in,         NUM_BLOCKS * NUM_THREADS * sizeof(int));
  cudaMalloc ( (void**)&dev_sums,          NUM_BLOCKS * NUM_THREADS * sizeof(int));
  cudaMalloc ( (void**)&dev_encode_result, NUM_BLOCKS * NUM_THREADS * sizeof(int));
  cudaMalloc ( (void**)&dev_temp,          NUM_BLOCKS * NUM_THREADS * sizeof(int));
  cudaMalloc ( (void**)&dev_y,             NUM_BLOCKS * NUM_THREADS * sizeof(int));
  cudaMalloc ( (void**)&dev_rank_temp,     NUM_BLOCKS * NUM_THREADS * sizeof(int));
  cudaMalloc ( (void**)&dev_device_results,          DEVICE_RESULT_SIZE * sizeof(int));
  cudaMalloc ( (void**)&dev_device_results_unsigned, 20 * sizeof(int));
  cudaMalloc ( (void**)&dev_gs_out,        NUM_BLOCKS * NUM_THREADS * sizeof(int));
  cudaMalloc ( (void**)&dev_out_print,     OUT_PRINT_NUM * NUM_BLOCKS * NUM_THREADS * sizeof(int));
  cudaMalloc ( (void**)&dev_input_data,    INPUT_DATA_RECORD_SIZE * OPERATIONSLIMIT * sizeof(int));
  cudaMalloc ( (void**)&dev_output_data,   OUTPUT_DATA_RECORD_SIZE * OPERATIONSLIMIT * sizeof(int));
  /* BUG FIX: the original also did cudaMalloc((void**)&dev_probe, ...)
   * here, whose result was immediately overwritten by the
   * cudaHostGetDevicePointer call below -- a device-memory leak. */
  cudaHostGetDevicePointer( &dev_cdata,   cdata,   0);
  cudaHostGetDevicePointer( &dev_gpudata, gpudata, 0);
  cudaHostGetDevicePointer( &dev_probe,   probe,   0);
  gpudata[GPUStart] = -1;
  gpudata[GPUStop]  = -1;
  int j;
  for (j = 0; j < PROBE_LIMIT; j++) probe[j] = -10;
  for (j = 0; j < DEVICE_RESULT_SIZE; j++) device_results[j] = 0;
  /* NOTE(review): only the first NUM_BLOCKS entries are (re)cleared here,
   * but both arrays are already zero-initialized above -- harmless. */
  for (j = 0; j < NUM_BLOCKS; j++) {gs_in[j] = 0; gs_out[j] = 0;}
  for (j = 0; j < OUT_PRINT_NUM * NUM_BLOCKS * NUM_THREADS; j++) out_print[j] = 0;
  cudaMemcpy (dev_gs_in,         gs_in,         NUM_BLOCKS * NUM_THREADS * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy (dev_gs_out,        gs_out,        NUM_BLOCKS * NUM_THREADS * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy (dev_encode_result, encode_result, NUM_BLOCKS * NUM_THREADS * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy (dev_temp,          temp,          NUM_BLOCKS * NUM_THREADS * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy (dev_rank_temp,     rank_temp,     NUM_BLOCKS * NUM_THREADS * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy (dev_sums,          sums,          NUM_BLOCKS * NUM_THREADS * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy (dev_y,             y,             NUM_BLOCKS * NUM_THREADS * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy (dev_device_results, device_results, DEVICE_RESULT_SIZE * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy (dev_device_results_unsigned, device_results_unsigned, 20 * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy (dev_out_print, out_print, OUT_PRINT_NUM * NUM_BLOCKS * NUM_THREADS * sizeof(int), cudaMemcpyHostToDevice);
  cudaEventRecord( start, 0);
  /* BUG FIX: launch configuration restored; the source had been mangled
   * to "kernel<<>>(...)", which does not compile. */
  kernel<<<NUM_BLOCKS, NUM_THREADS>>>(dev_probe,dev_out_print,dev_gs_in,dev_sums,dev_temp,dev_gs_out,
                                      dev_gpudata,dev_cdata,dev_rank_temp,dev_y,
                                      dev_device_results,dev_device_results_unsigned,dev_encode_result);
  int cmd,a,ind,val,n,result,result_err,stopped,line_number;
  n = 0; result = -1; stopped = 0; line_number = 0;
  cdata[COutData_CMD] = 0;
  cdata[COutData_A]   = 0;
  cdata[COutData_IND] = 0;
  cdata[COutData_VAL] = 0;
  cdata[COutReady]    = 0;
  cdata[CInAck]       = 0;
  printf("starting loop\n");
  /* Wait until the resident kernel signals it is up. */
  while (gpudata[GPUStart] != 1) {probe[0] = probe[1]++;}
  while (n < OPERATIONSLIMIT && stopped == 0) {
    /* Publish the next command record into the mailbox. */
    cdata[COutData_CMD] = cmd = input_data[(n * INPUT_DATA_RECORD_SIZE) + INPUT_DATA_OPCODE];
    cdata[COutData_A]   = a   = input_data[(n * INPUT_DATA_RECORD_SIZE) + INPUT_DATA_NAME];
    cdata[COutData_IND] = ind = input_data[(n * INPUT_DATA_RECORD_SIZE) + INPUT_DATA_INDEX];
    cdata[COutData_VAL] = val = input_data[(n * INPUT_DATA_RECORD_SIZE) + INPUT_DATA_VALUE];
    cdata[COutData_N]   = n;
    cdata[COutReady]    = 1;
    switch (cmd) {
      case UPDATE:   /* 103 */
      case LOOKUP:   /* 105 */
      case DELETE:   /* 104 */
      case INIT: {   /* 101 -- all four share the same reply handling */
        exchange_with_gpu(cdata,gpudata,probe,&result,&result_err);
        output_data [(n * OUTPUT_DATA_RECORD_SIZE) + OUTPUT_DATA_VALUE] = result;
        output_data [(n * OUTPUT_DATA_RECORD_SIZE) + OUTPUT_DATA_ERROR] = result_err;
        break;
      }
      case TERMINATE: {  /* 102 */
        printf("TERMINATING!\n");
        exchange_with_gpu(cdata,gpudata,probe,&result,&result_err);
        output_data [(n * OUTPUT_DATA_RECORD_SIZE) + OUTPUT_DATA_VALUE] = 0;
        output_data [(n * OUTPUT_DATA_RECORD_SIZE) + OUTPUT_DATA_ERROR] = 0;
        while (gpudata[GPUStop] != 1) {probe[0] = probe[1]++;}
        stopped = 1;
        break;
      }
    }
    int i;
    for (i = 0; i < MAGIC_LOOP; i++) {if (probe[1] != 0) { probe[0] += probe[1]; } }
    /*
    for (i = 4; i < PROBE_LIMIT + 4; i++) { if (probe[4] != probe[i]) probe[3] = 9999; }
    */
    line_number = oparray[(n * OPARRAY_RECORD_SIZE) + OPARRAY_LINE_NUMBER];
    /* On the first wrong value, fast-forward to the TERMINATE record and
     * send it so the resident kernel shuts down cleanly. */
    if (output_data [(n * OUTPUT_DATA_RECORD_SIZE) + OUTPUT_DATA_VALUE]
          != oparray[(n * OPARRAY_RECORD_SIZE) + OPARRAY_EXPECTED_VALUE] && !stopped) {
      printf("line no is %d\n",line_number);
      while (input_data [(n * INPUT_DATA_RECORD_SIZE) + INPUT_DATA_OPCODE] != TERMINATE) n++;
      cdata[COutData_CMD] = cmd = input_data[(n * INPUT_DATA_RECORD_SIZE) + INPUT_DATA_OPCODE];
      cdata[COutData_A]   = a   = input_data[(n * INPUT_DATA_RECORD_SIZE) + INPUT_DATA_NAME];
      cdata[COutData_IND] = ind = input_data[(n * INPUT_DATA_RECORD_SIZE) + INPUT_DATA_INDEX];
      cdata[COutData_VAL] = val = input_data[(n * INPUT_DATA_RECORD_SIZE) + INPUT_DATA_VALUE];
      cdata[COutData_N]   = n;
      cdata[COutReady]    = 1;
      exchange_with_gpu(cdata,gpudata,probe,&result,&result_err);
      output_data [(n * OUTPUT_DATA_RECORD_SIZE) + OUTPUT_DATA_VALUE] = 0;
      output_data [(n * OUTPUT_DATA_RECORD_SIZE) + OUTPUT_DATA_ERROR] = 0;
      while (gpudata[GPUStop] != 1) {probe[0] = probe[1]++;}
      stopped = 1;
    }
    n++;
  }
  printf("synch threads\n");
  cudaEventRecord( stop, 0);
  cudaEventSynchronize( stop );
  cudaEventElapsedTime( &elapsedTime, start, stop );
  cudaDeviceSynchronize();
  cudaMemcpy (device_results, dev_device_results, DEVICE_RESULT_SIZE * sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy (out_print, dev_out_print, OUT_PRINT_NUM * NUM_BLOCKS * NUM_THREADS * sizeof(int), cudaMemcpyDeviceToHost);
  int i;
  for (i = 0; i < 80; i++)
    printf("probe[%d]=%d, left=%d, right=%d\n",i,probe[i],cpu_get_object_left(probe[i]),cpu_get_object_right(probe[i]));
  printf("\n\n");
  /*
  for (i = 0; i < DEVICE_RESULT_SIZE; i++) printf("dr[%d]=%d\n",i,device_results[i]);
  printf("\n\n");
  printf("\n\n");
  dump_data(output_data);
  */
  make_report(line_number,oparray,output_data);
  printf("\n\n");
  dump_cells(out_print);
  printf("\nSHOULD END NOW\n");
  printf("Time taken: %3.1f ms \n", elapsedTime);
  /*
  unsigned int h = 1;
  unsigned int g = 4;
  unsigned int obj = cpu_mk_dde_object(h,g);
  unsigned int lft = cpu_get_dde_object_left (obj);
  unsigned int rght = cpu_get_dde_object_right (obj);
  printf("args = %d, %d. object = (%08x) left=%d, right=%d\n", h, g, obj, lft, rght);
  */
  /* Mapped allocations are released with cudaFreeHost; their device
   * aliases (dev_cdata/dev_gpudata/dev_probe) must not be cudaFree'd. */
  cudaFreeHost(cdata);
  cudaFreeHost(gpudata);
  cudaFreeHost(probe);
  cudaFree(dev_y);
  cudaFree(dev_out_print);
  cudaFree(dev_gs_in);
  cudaFree(dev_sums);
  cudaFree(dev_device_results);
  cudaFree(dev_device_results_unsigned);
  cudaFree(dev_gs_out);
  cudaFree(dev_encode_result);
  cudaFree(dev_temp);
  cudaFree(dev_rank_temp);
  cudaFree(dev_input_data);
  cudaFree(dev_output_data);
  cudaEventDestroy( start );
  cudaEventDestroy( stop );
  return 0;
}