debugging - (Homework) Converting a function to a CUDA kernel function -
i'm working on assignment asks optimise this c program using cuda parallelisation.
this managed come with:
//... __global__ void gpu_score_function(void *gpu_frame_pixels, void *gpu_pattern_pixels, void *gpu_results, int frame_rowstride, int pattern_rowstride, int pattern_width, int pattern_height, int frame_width, int frame_height) { if ((blockidx.y * blockdim.y + threadidx.y < frame_height - pattern_height) && (blockidx.x * blockdim.x + threadidx.x < frame_width - pattern_width)) { guchar *frame_pixels = (guchar *) gpu_frame_pixels + (blockidx.y * blockdim.y + threadidx.y) * frame_rowstride + (blockidx.x * blockdim.x + threadidx.x) * n_channels; guchar *pattern_pixels = (guchar *) gpu_pattern_pixels; int *results = (int *) gpu_results; int res = 0; (int y = 0; y < pattern_height; ++y) { if (blockidx.y * blockdim.y + threadidx.y + y < frame_height - pattern_height) { (int x = 0; x < pattern_width; ++x) { if (blockidx.x * blockdim.x + threadidx.x + x < frame_width - pattern_width) { const guchar *frame_pixel = frame_pixels + x * n_channels; const guchar *pattern_pixel = pattern_pixels + x * n_channels; (int c = 0; c < n_channels; ++c) { res += (frame_pixel[c] - 128) * (pattern_pixel[c] - 128); } } else { break; } } frame_pixels += frame_rowstride; pattern_pixels += pattern_rowstride; } else { break; } } results[(blockidx.y * blockdim.y + threadidx.y) * (frame_width - pattern_width) + blockidx.x * blockdim.x + threadidx.x] = res; } } int main(int argc, const char *argv[]) { //... /** * cuda */ void *gpu_pattern_pixels; void *gpu_frame_pixels; void *gpu_results; cudamalloc(&gpu_pattern_pixels, pattern_height * pattern_rowstride * sizeof(guchar)); cudamalloc(&gpu_frame_pixels, frame_height * frame_rowstride * sizeof(guchar)); cudamalloc(&gpu_results, (frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int)); cudamemcpy(gpu_pattern_pixels, (void *) pattern_pixels, pattern_height * pattern_rowstride * sizeof(guchar), cudamemcpyhosttodevice); cudamemcpy(gpu_frame_pixels, (void *) frame_pixels, frame_height * frame_rowstride * sizeof(guchar), cudamemcpyhosttodevice); //kernel configuration, two-dimensional grid , //three-dimensional blocks configured. dim3 dimgrid(ceil((float) (frame_width - pattern_width) / 32), ceil((float) (frame_height - pattern_height) / 32)); dim3 dimblock(32, 32); gpu_score_function<<<dimgrid, dimblock>>>(gpu_frame_pixels, gpu_pattern_pixels, gpu_results, frame_rowstride, pattern_rowstride, pattern_width, pattern_height, frame_width, frame_height); cudadevicesynchronize(); int *results = (int *) malloc((frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int)); cudamemcpy((void *) results, gpu_results, (frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int), cudamemcpydevicetohost); int gpu_x_best, gpu_y_best; double gpu_best_score; (int *cur = results; cur != results + (frame_width - pattern_width) * (frame_height - pattern_height); cur++) { if (cur == results || *cur > gpu_best_score) { gpu_best_score = *cur; gpu_x_best = (cur - results) % (frame_width - pattern_width); gpu_y_best = (cur - results) / (frame_width - pattern_width); } } cudafree(gpu_pattern_pixels); cudafree(gpu_frame_pixels); cudafree(gpu_results); free(results); // cudadevicereset causes driver clean state. while // not mandatory in normal operation, practice. // needed ensure correct operation when application being // profiled. calling cudadevicereset causes profile data // flushed before application exits cudadevicereset(); /** * end cuda */ //... return 0; }
the program doesn't segfault, cuda-memcheck gives 0 errors , result matrix filled. problem is, results wrong.
i'm quite sure it's off-by-one pointer error, have no idea how spot it.
i'm working on osx 10.9, tools use debug program?
any appreciated.
i found bug.
the 2 if statements inside loops of gpu_score_function make no sense. deleting them solved problem.
Comments
Post a Comment