c++ - CUDA: Group every n-th point of array passed to GPU -
I am trying to implement the k-means algorithm in CUDA, using a Tesla card on an external Unix machine. I read the input file and store the coordinates of the data points in the datax and datay arrays. The next step is to select every centreinterval-th point and store it in an array allocated in GPU memory. However, I have no idea how to check what the problem is when all I get is a 'segmentation error', and for obvious reasons I can't print any kind of output from the kernel.
Edit 2: I simplified the example to the shortest possible version. I found the solution during that process, but decided to provide the not-yet-solved version in the question to make it clearer what caused the problem.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include <time.h>
#include <unistd.h>

#define block_size 16

// Kernel - selects the centres at the beginning of the algorithm and stores
// them at the appropriate place: thread i copies data point i * centreinterval
// into centre slot i.
// NOTE(review): there is no bounds check, so every launched thread performs
// the copy, including threads whose index lies past the number of centres.
__global__ void kmeansselectinitialcentres(float* d_datax, float* d_datay, float* d_centresx, float* d_centresy, int centreinterval) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int idx = i * centreinterval;
    d_centresx[i] = d_datax[idx];
    d_centresy[i] = d_datay[idx];
}

// Simplified example (this is the version from the question and still
// contains the memory-size mistake the question asks about).
int main(int argn, char ** argc) {
    // Data - let's say 32 floats in each array
    int datasize = 32;
    float* datax = new float[datasize];
    float* datay = new float[datasize];

    // Fill the arrays with numbers
    for (int i = 0; i < datasize; i++) {
        datax[i] = i;
        datay[i] = i;
    }

    // Interval - select the first number, then every centreinterval-th one
    int centreinterval = 2;

    // Here the results are stored in the host program
    int centresize = datasize / centreinterval;
    float* centresx = new float[centresize];
    float* centresy = new float[centresize];

    // Pointers to the arrays stored in GPU memory
    float* d_datax;
    float* d_datay;
    float* d_centresx;
    float* d_centresy;

    // Allocate memory for the arrays
    // Calculate how much space in memory is needed
    size_t d_centresize = sizeof(float) * centresize;
    size_t d_datasize = sizeof(float) * datasize;

    // Memory for the raw data
    cudaMalloc((void**)&d_datax, d_datasize);
    cudaMalloc((void**)&d_datay, d_datasize);

    // Copy the raw data to device memory so the kernel can operate on it freely
    cudaMemcpy(d_datay, datay, d_datasize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_datax, datax, d_datasize, cudaMemcpyHostToDevice);

    // Memory for the centre results
    // NOTE: sized with d_datasize although d_centresize was computed above
    // for exactly this purpose.
    cudaMalloc((void**)&d_centresx, d_datasize);
    cudaMalloc((void**)&d_centresy, d_datasize);

    // Call the kernel
    // NOTE(review): (centresize + dimblock.x) / dimblock.x yields 2 blocks of
    // 16 threads for 16 centres, i.e. twice as many threads as centres.
    dim3 dimblock(block_size);
    dim3 dimgridk((centresize + dimblock.x) / dimblock.x);
    kmeansselectinitialcentres <<<dimgridk, dimblock>>> (d_datax, d_datay, d_centresx, d_centresy, centreinterval);

    // Check the results - every n-th point
    float* check_x = new float[centresize];
    float* check_y = new float[centresize];

    // BUG (the subject of the question): check_x / check_y hold centresize
    // floats, but d_datasize bytes are copied into them, which overruns the
    // host buffers.
    cudaMemcpy(check_x, d_centresx, d_datasize, cudaMemcpyDeviceToHost);
    cudaMemcpy(check_y, d_centresy, d_datasize, cudaMemcpyDeviceToHost);

    printf("x: ");
    for (int i = 0; i < centresize; i++)
        printf("%.2f ", check_x[i]);
    printf("\ny: ");
    for (int i = 0; i < centresize; i++)
        printf("%.2f ", check_y[i]);
    printf("\n");
}
Main question: what is wrong with the kernel / the check-out of the data?
Side question: is there a reasonable way to debug programs with kernels in such situations?
So, here's the solution I came up with after simplifying the case. There was a problem with memory usage: I tried to store/read a different amount of data than I claimed to use when allocating it. I hope it will be helpful to someone in the future:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include <time.h>
#include <unistd.h>

#define block_size 16

// Aborts the program with a readable message when a CUDA runtime call fails.
// `what` names the operation so the failing step can be identified without a
// debugger.
static void check_cuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Kernel - selects the centres at the beginning of the algorithm and stores
// them at the appropriate place: thread i copies data point i * centreinterval
// into centre slot i.
//
// d_datax, d_datay       input coordinates (read at index i * centreinterval)
// d_centresx, d_centresy output centres (written at index i)
// centreinterval         distance between two selected data points
// centrecount            number of centres to write; threads with an index
//                        at or beyond this value do nothing
__global__ void kmeansselectinitialcentres(float* d_datax, float* d_datay, float* d_centresx, float* d_centresy, int centreinterval, int centrecount) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so surplus threads must not
    // touch memory outside the arrays.
    if (i >= centrecount)
        return;

    int idx = i * centreinterval;
    d_centresx[i] = d_datax[idx];
    d_centresy[i] = d_datay[idx];
}

// Simplified example: copies every centreinterval-th point of a small data
// set into a centre array on the GPU and prints the result.
int main(int argn, char ** argc) {
    // Data - let's say 32 floats in each array
    int datasize = 32;
    float* datax = new float[datasize];
    float* datay = new float[datasize];

    // Fill the arrays with numbers
    for (int i = 0; i < datasize; i++) {
        datax[i] = i;
        datay[i] = i;
    }

    // Interval - select the first number, then every centreinterval-th one
    int centreinterval = 2;

    // Number of centres; integer division keeps the last selected index,
    // (centresize - 1) * centreinterval, inside the data arrays.
    int centresize = datasize / centreinterval;

    // Pointers to the arrays stored in GPU memory
    float* d_datax;
    float* d_datay;
    float* d_centresx;
    float* d_centresy;

    // Calculate how much space in memory is needed
    size_t d_centresize = sizeof(float) * centresize;
    size_t d_datasize = sizeof(float) * datasize;

    // Memory for the raw data
    check_cuda(cudaMalloc((void**)&d_datax, d_datasize), "cudaMalloc d_datax");
    check_cuda(cudaMalloc((void**)&d_datay, d_datasize), "cudaMalloc d_datay");

    // Copy the raw data to device memory so the kernel can operate on it freely
    check_cuda(cudaMemcpy(d_datay, datay, d_datasize, cudaMemcpyHostToDevice), "cudaMemcpy datay to device");
    check_cuda(cudaMemcpy(d_datax, datax, d_datasize, cudaMemcpyHostToDevice), "cudaMemcpy datax to device");

    // Memory for the centre results - sized for the centres, not for the raw data
    check_cuda(cudaMalloc((void**)&d_centresx, d_centresize), "cudaMalloc d_centresx");
    check_cuda(cudaMalloc((void**)&d_centresy, d_centresize), "cudaMalloc d_centresy");

    // Call the kernel with just enough blocks to cover centresize threads
    // (ceiling division).
    dim3 dimblock(block_size);
    dim3 dimgridk((centresize + dimblock.x - 1) / dimblock.x);
    kmeansselectinitialcentres <<<dimgridk, dimblock>>> (d_datax, d_datay, d_centresx, d_centresy, centreinterval, centresize);

    // A kernel launch returns no status itself: query the launch error, then
    // wait for the kernel so that execution errors are reported here as well.
    check_cuda(cudaGetLastError(), "kernel launch");
    check_cuda(cudaDeviceSynchronize(), "kernel execution");

    // Check the results - every n-th point. The host buffers and the copy
    // size both use the centre count, so the copy cannot overrun them.
    float* check_x = new float[centresize];
    float* check_y = new float[centresize];
    check_cuda(cudaMemcpy(check_x, d_centresx, d_centresize, cudaMemcpyDeviceToHost), "cudaMemcpy centresx to host");
    check_cuda(cudaMemcpy(check_y, d_centresy, d_centresize, cudaMemcpyDeviceToHost), "cudaMemcpy centresy to host");

    printf("x: ");
    for (int i = 0; i < centresize; i++)
        printf("%.2f ", check_x[i]);
    printf("\ny: ");
    for (int i = 0; i < centresize; i++)
        printf("%.2f ", check_y[i]);
    printf("\n");

    // Release device and host memory
    check_cuda(cudaFree(d_datax), "cudaFree d_datax");
    check_cuda(cudaFree(d_datay), "cudaFree d_datay");
    check_cuda(cudaFree(d_centresx), "cudaFree d_centresx");
    check_cuda(cudaFree(d_centresy), "cudaFree d_centresy");
    delete[] check_x;
    delete[] check_y;
    delete[] datax;
    delete[] datay;

    return 0;
}
Comments
Post a Comment