c++ - CUDA: Group every n-th point of array passed to GPU -

I am trying to implement the k-means algorithm on CUDA using a Tesla card on an external Unix machine. I read the input file and store the coordinates of the data points in the datax and datay arrays. The next step is to select every centreinterval-th point and store it in another array allocated in GPU memory. However, I have no idea how I may check what the problem is if all I get is a 'segmentation error' and, for obvious reasons, I can't print any kind of output from the kernel.

Edit 2: I simplified the example to the shortest possible version. I found the solution during the process, but decided to keep the not-yet-solved version in the question, to make it clearer what caused the problem.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include <time.h>
#include <unistd.h>

#define block_size 16

// Kernel - selects the centres at the beginning of the algorithm and stores
// them at the appropriate place: thread i copies data point i * centreinterval
// into centre slot i.
__global__ void kmeansselectinitialcentres(float* d_datax, float* d_datay, float* d_centresx, float* d_centresy, int centreinterval) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int idx = i * centreinterval;
    d_centresx[i] = d_datax[idx];
    d_centresy[i] = d_datay[idx];
}

// Simplified example (the not-yet-solved version from the question).
int main(int argn, char ** argc) {

    // Data - let's say 32 floats in each array
    int datasize = 32;
    float* datax = new float[datasize];
    float* datay = new float[datasize];

    // Fill the arrays with numbers
    for (int i = 0; i < datasize; i++) {
        datax[i] = i;
        datay[i] = i;
    }

    // Interval - select the first point, then every centreinterval-th one
    int centreinterval = 2;

    // This is where the results are stored in the host program
    int centresize = datasize / centreinterval;
    float* centresx = new float[centresize];
    float* centresy = new float[centresize];

    // Pointers to the arrays stored in GPU memory
    float* d_datax;
    float* d_datay;
    float* d_centresx;
    float* d_centresy;

    // Allocate memory for the arrays
    // Calculate how much space in memory is needed
    size_t d_centresize = sizeof(float) * centresize;
    size_t d_datasize = sizeof(float) * datasize;

    // Memory for the raw data
    cudaMalloc((void**)&d_datax, d_datasize);
    cudaMalloc((void**)&d_datay, d_datasize);

    // Copy the raw data to device memory so the kernel can operate on it freely
    cudaMemcpy(d_datay, datay, d_datasize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_datax, datax, d_datasize, cudaMemcpyHostToDevice);

    // Memory for the centre results
    cudaMalloc((void**)&d_centresx, d_datasize);
    cudaMalloc((void**)&d_centresy, d_datasize);

    // Call the kernel
    dim3 dimblock(block_size);
    dim3 dimgridk((centresize + dimblock.x) / dimblock.x);
    kmeansselectinitialcentres<<<dimgridk, dimblock>>>(d_datax, d_datay, d_centresx, d_centresy, centreinterval);

    // Check the results - every n-th point
    float* check_x = new float[centresize];
    float* check_y = new float[centresize];

    // NOTE(review): these copies transfer d_datasize bytes (datasize floats)
    // into host buffers that hold only centresize floats. This size mismatch
    // is the problem the question is asking about and is left in on purpose.
    cudaMemcpy(check_x, d_centresx, d_datasize, cudaMemcpyDeviceToHost);
    cudaMemcpy(check_y, d_centresy, d_datasize, cudaMemcpyDeviceToHost);

    printf("x: ");
    for (int i = 0; i < centresize; i++)
        printf("%.2f ", check_x[i]);
    printf("\ny: ");
    for (int i = 0; i < centresize; i++)
        printf("%.2f ", check_y[i]);
    printf("\n");

}

Main question: what is wrong with the kernel / the check-out of the data?

Side question: is there a fair way to debug a program with kernels in such situations?

So, here's the solution I came up with after simplifying my case. There was a problem with memory usage - I tried to store / read a different amount of data than I claimed to use when allocating it. I hope it will be helpful to someone in the future:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include <time.h>
#include <unistd.h>

#define block_size 16

// Stops the program with a readable message when a CUDA runtime call fails.
// Checking every status is what turns a bare host-side crash into a message
// naming the failing step.
//   status - value returned by the CUDA runtime call
//   what   - short description of the step, used only in the message
static void checkcuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Kernel - selects the centres at the beginning of the algorithm and stores
// them at the appropriate place: thread i copies data point i * centreinterval
// into centre slot i.
//   d_datax, d_datay       - device arrays with the input coordinates
//   d_centresx, d_centresy - device arrays receiving the selected centres
//   centreinterval         - distance between two selected data points
//   centresize             - number of centres to write; threads with an
//                            index at or beyond it do nothing
__global__ void kmeansselectinitialcentres(const float* d_datax, const float* d_datay, float* d_centresx, float* d_centresy, int centreinterval, int centresize) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so surplus threads must not
    // touch memory outside the centre (and data) arrays.
    if (i >= centresize)
        return;

    int idx = i * centreinterval;
    d_centresx[i] = d_datax[idx];
    d_centresy[i] = d_datay[idx];
}

// Simplified example: picks every centreinterval-th point of a small data set
// on the GPU and prints the selected coordinates.
int main(int argn, char ** argc) {

    // Data - let's say 32 floats in each array
    int datasize = 32;
    float* datax = new float[datasize];
    float* datay = new float[datasize];

    // Fill the arrays with numbers
    for (int i = 0; i < datasize; i++) {
        datax[i] = i;
        datay[i] = i;
    }

    // Interval - select the first point, then every centreinterval-th one
    int centreinterval = 2;

    // Number of centres that will be selected
    int centresize = datasize / centreinterval;

    // Pointers to the arrays stored in GPU memory
    float* d_datax = NULL;
    float* d_datay = NULL;
    float* d_centresx = NULL;
    float* d_centresy = NULL;

    // Calculate how much space in memory is needed
    size_t d_centresize = sizeof(float) * centresize;
    size_t d_datasize = sizeof(float) * datasize;

    // Memory for the raw data
    checkcuda(cudaMalloc((void**)&d_datax, d_datasize), "allocate d_datax");
    checkcuda(cudaMalloc((void**)&d_datay, d_datasize), "allocate d_datay");

    // Copy the raw data to device memory so the kernel can operate on it freely
    checkcuda(cudaMemcpy(d_datay, datay, d_datasize, cudaMemcpyHostToDevice), "upload datay");
    checkcuda(cudaMemcpy(d_datax, datax, d_datasize, cudaMemcpyHostToDevice), "upload datax");

    // Memory for the centre results - sized for the centres, not for the data
    checkcuda(cudaMalloc((void**)&d_centresx, d_centresize), "allocate d_centresx");
    checkcuda(cudaMalloc((void**)&d_centresy, d_centresize), "allocate d_centresy");

    // Call the kernel with just enough blocks to cover centresize threads
    // (ceiling division; adding a full block size would launch one block too
    // many whenever centresize is a multiple of the block size).
    dim3 dimblock(block_size);
    dim3 dimgridk((centresize + dimblock.x - 1) / dimblock.x);
    kmeansselectinitialcentres<<<dimgridk, dimblock>>>(d_datax, d_datay, d_centresx, d_centresy, centreinterval, centresize);

    // A launch returns no status itself: fetch launch errors explicitly, then
    // wait for the kernel so execution errors are reported here as well.
    checkcuda(cudaGetLastError(), "launch kernel");
    checkcuda(cudaDeviceSynchronize(), "run kernel");

    // Check the results - every n-th point. The host buffers and the copies
    // use the same size as the device centre arrays.
    float* check_x = new float[centresize];
    float* check_y = new float[centresize];

    checkcuda(cudaMemcpy(check_x, d_centresx, d_centresize, cudaMemcpyDeviceToHost), "download centresx");
    checkcuda(cudaMemcpy(check_y, d_centresy, d_centresize, cudaMemcpyDeviceToHost), "download centresy");

    printf("x: ");
    for (int i = 0; i < centresize; i++)
        printf("%.2f ", check_x[i]);
    printf("\ny: ");
    for (int i = 0; i < centresize; i++)
        printf("%.2f ", check_y[i]);
    printf("\n");

    // Release device and host memory
    checkcuda(cudaFree(d_centresy), "free d_centresy");
    checkcuda(cudaFree(d_centresx), "free d_centresx");
    checkcuda(cudaFree(d_datay), "free d_datay");
    checkcuda(cudaFree(d_datax), "free d_datax");

    delete[] check_y;
    delete[] check_x;
    delete[] datay;
    delete[] datax;

    return 0;
}

Search This Blog

Call

c++ - CUDA: Group every n-th point of array passed to GPU -

Comments

Post a Comment

Popular posts from this blog

node.js - Using Node without global install -

How to access a php class file from PHPFox framework into javascript code written in simple HTML file? -

java - Null response to php query in android, even though php works properly -