debugging - (Homework) Converting a function to a CUDA kernel function -


i'm working on assignment asks optimise this c program using cuda parallelisation.

this managed come with:

//...  __global__ void gpu_score_function(void *gpu_frame_pixels, void *gpu_pattern_pixels, void *gpu_results,                                    int frame_rowstride, int pattern_rowstride,                                    int pattern_width, int pattern_height,                                    int frame_width, int frame_height) {     if ((blockidx.y * blockdim.y + threadidx.y < frame_height - pattern_height) &&         (blockidx.x * blockdim.x + threadidx.x < frame_width - pattern_width)) {         guchar *frame_pixels = (guchar *) gpu_frame_pixels +                                (blockidx.y * blockdim.y + threadidx.y) * frame_rowstride +                                (blockidx.x * blockdim.x + threadidx.x) * n_channels;         guchar *pattern_pixels = (guchar *) gpu_pattern_pixels;         int *results = (int *) gpu_results;          int res = 0;         (int y = 0; y < pattern_height; ++y) {             if (blockidx.y * blockdim.y + threadidx.y + y < frame_height - pattern_height) {                 (int x = 0; x < pattern_width; ++x) {                     if (blockidx.x * blockdim.x + threadidx.x + x < frame_width - pattern_width) {                         const guchar *frame_pixel = frame_pixels + x * n_channels;                         const guchar *pattern_pixel = pattern_pixels + x * n_channels;                         (int c = 0; c < n_channels; ++c) {                             res += (frame_pixel[c] - 128) * (pattern_pixel[c] - 128);                         }                     } else {                         break;                     }                 }                 frame_pixels += frame_rowstride;                 pattern_pixels += pattern_rowstride;             } else {                 break;             }         }          results[(blockidx.y * blockdim.y + threadidx.y) * (frame_width - pattern_width) + blockidx.x * blockdim.x + threadidx.x] = res;     } }  int main(int argc, const char *argv[]) {     //...      /**      * cuda      */     void *gpu_pattern_pixels;     void *gpu_frame_pixels;     void *gpu_results;      cudamalloc(&gpu_pattern_pixels, pattern_height * pattern_rowstride * sizeof(guchar));     cudamalloc(&gpu_frame_pixels, frame_height * frame_rowstride * sizeof(guchar));     cudamalloc(&gpu_results, (frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int));      cudamemcpy(gpu_pattern_pixels, (void *) pattern_pixels, pattern_height * pattern_rowstride * sizeof(guchar),                cudamemcpyhosttodevice);     cudamemcpy(gpu_frame_pixels, (void *) frame_pixels, frame_height * frame_rowstride * sizeof(guchar),                cudamemcpyhosttodevice);      //kernel configuration, two-dimensional grid ,     //three-dimensional blocks configured.     dim3 dimgrid(ceil((float) (frame_width - pattern_width) / 32), ceil((float) (frame_height - pattern_height) / 32));     dim3 dimblock(32, 32);     gpu_score_function<<<dimgrid, dimblock>>>(gpu_frame_pixels, gpu_pattern_pixels, gpu_results, frame_rowstride, pattern_rowstride, pattern_width, pattern_height, frame_width, frame_height);     cudadevicesynchronize();      int *results = (int *) malloc((frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int));     cudamemcpy((void *) results, gpu_results,                (frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int), cudamemcpydevicetohost);      int gpu_x_best, gpu_y_best;     double gpu_best_score;      (int *cur = results; cur != results + (frame_width - pattern_width) * (frame_height - pattern_height); cur++) {         if (cur == results || *cur > gpu_best_score) {             gpu_best_score = *cur;             gpu_x_best = (cur - results) % (frame_width - pattern_width);             gpu_y_best = (cur - results) / (frame_width - pattern_width);         }     }      cudafree(gpu_pattern_pixels);     cudafree(gpu_frame_pixels);     cudafree(gpu_results);     free(results);      // cudadevicereset causes driver clean state. while     // not mandatory in normal operation, practice.      // needed ensure correct operation when application being     // profiled. calling cudadevicereset causes profile data     // flushed before application exits     cudadevicereset();      /**      * end cuda      */      //...      return 0; } 

the program doesn't segfault, cuda-memcheck gives 0 errors , result matrix filled. problem is, results wrong.

i'm quite sure it's off-by-one pointer error, have no idea how spot it.

i'm working on osx 10.9, tools use debug program?

any appreciated.

i found bug.

the 2 if statements inside loops of gpu_score_function make no sense. deleting them solved problem.


Comments

Popular posts from this blog

angularjs - ADAL JS Angular- WebAPI add a new role claim to the token -

node.js - Using Node without global install -

php - CakePHP HttpSockets send array of paramms -