c++ - Allocate 1 Dimension array with cudaMallocPitch and then copy to device with cudaMemcpy2D 3 -


i have read post allocate 2d array cudamallocpitch , copying cudamemcpy2d among many others including nvidia docs , can't cudamallocpitch work cudamemcpy2d.

i need copy big matrix in array format (matrix[width*height]) along simple array perform matrix * vector operations. not optional me use cudamallocpitch in order avoid conflicts , have better performance.

so, started trying copy matrix (vector in case) device , check if correctly copied code not print anything. if use cudamalloc , cudamemcpy works fine. not know cudamallocpitch , cudamemcpy2d.

what can fix this?

#include <stdio.h> __global__ void kernel(size_t mpitch, double * a, int n) {     int idx = threadidx.x + blockidx.x * blockdim.x;     while (idx < n)     {         double e = *(double *)(((char *) + idx * mpitch) + n);         printf("(%f)", e);     } } int main() {     int n = 1500;     double *  = new double[n], * d_a;     size_t pitch;      (int = 0; < n; ++i)     {         a[i] = i;     }     cudamallocpitch(&d_a, &pitch,  sizeof(double) * n, 1);     cudamemcpy2d(d_a, pitch, a, n * sizeof(double), sizeof(double) * n, 1, cudamemcpyhosttodevice);     unsigned int blocksize = 1024;     unsigned int nblocks = (n + blocksize - 1) / blocksize;     kernel <<<nblocks, blocksize>>>(pitch, d_a, n);     cudafree(d_a);     delete [] a;     return 0; } 

error checking can make big difference in debugging. should use before coming here.

it wasn't clear if wanted row or column vector i.e. matrix of [1xn] or [nx1]

i've added explanation on talomnies suggestion, first 'working slabs of code'

here's [nx1]

#include <cstdio> #include <iostream> #include <cuda.h>  using namespace std;  __global__ void kernel(size_t mpitch, double * a, int n) {     int idx = threadidx.x + blockidx.x * blockdim.x;     if(idx>=n) return;     double e = *(double *)(((char *) + idx * mpitch));     printf("(%f)", e);  } int main() {     int n = 15;     double *  = new double[n], * d_a;     size_t pitch;      (int = 0; < n; ++i)     {         a[i] = i;     }      cudaerror_t err = cudamallocpitch(&d_a, &pitch, sizeof(double), n);     if(err!=cudasuccess) cout<<"err0:"<<cudageterrorstring(err)<<endl;      err = cudamemcpy2d(d_a, pitch, a, sizeof(double), sizeof(double), n, cudamemcpyhosttodevice);     if(err!=cudasuccess) cout<<"err1:"<<cudageterrorstring(err)<<endl;      unsigned int blocksize = 1024;     unsigned int nblocks = (n + blocksize - 1) / blocksize;     kernel <<<nblocks, blocksize>>>(pitch, d_a, n);      cudadevicesynchronize();     err = cudagetlasterror();     if(err!=cudasuccess) cout<<"err2:"<<cudageterrorstring(err)<<endl;      cudafree(d_a);     delete [] a;     return 0; } 

[1xn]:

#include <cstdio> #include <iostream> #include <cuda.h>  using namespace std;  __global__ void kernel(size_t mpitch, double * a, int n) {     int idx = threadidx.x + blockidx.x * blockdim.x;     if(idx>=n) return;     int row=0;//only 1 row      double *row_ptr = (double *)( (char *) (a + mpitch * row) );     double e = row_ptr[idx];     printf("(%f)", e);  } int main() {     int n = 15;     double *  = new double[n], * d_a;     size_t pitch;      (int = 0; < n; ++i)     {         a[i] = i;     }      cudaerror_t err = cudamallocpitch(&d_a, &pitch, sizeof(double)*n, 1);     if(err!=cudasuccess) cout<<"err0:"<<cudageterrorstring(err)<<endl;      err = cudamemcpy2d(d_a, pitch, a, sizeof(double)*n, sizeof(double)*n, 1, cudamemcpyhosttodevice);     if(err!=cudasuccess) cout<<"err1:"<<cudageterrorstring(err)<<endl;      unsigned int blocksize = 1024;     unsigned int nblocks = (n + blocksize - 1) / blocksize;     kernel <<<nblocks, blocksize>>>(pitch, d_a, n);      cudadevicesynchronize();     err = cudagetlasterror();     if(err!=cudasuccess) cout<<"err2:"<<cudageterrorstring(err)<<endl;      cudafree(d_a);     delete [] a;     return 0; } 

explanation

firslty, error handling:

considering how easy error handling in cuda there isn't excuse not put in.

cudaerror_t err = cudamallocpitch(&d_a, &pitch, sizeof(double)*n, 1); if(err!=cudasuccess) cout<<"err0:"<<cudageterrorstring(err)<<endl; 

second, didn't specify if wanted column vector or row vector. since row vector 1-d array in linear memory , don't need pitched memory that, assume explanation meant column vector.

the reoccurring problem having "misaligned address" in kernel. indicates problem book-keeping, lets walk through 3 major steps of handling aligned 2d array (even though our arrays either column or row vector).

allocating: allocation written out as

cudamallocpitch(&d_a, &pitch,  sizeof(double) * n, 1); 

this correct row vector api cudamallocpitch(void*** pointer, size_t* pitch_return, size_t row_width_in_bytes, size_t count_of_rows) if column vector correct call is

cudamallocpitch(&d_a, &pitch, sizeof(double), n); 

accessing: accessing mixing accessing row, , accessing element in row.

double e = *(double *)(((char *) + idx * mpitch) + n); 

once again stick documentation. api documentation cudamallocpitch includes

t* pelement = (t*)((char*)baseaddress + row * pitch) + column; 

for translates into

int column=0; double element=(double*) ((char*)a + idx * mpitch) + column; 

i've used column = 0 completeness since not have more 1 column.

copying:

cudamemcpy2d(d_a, pitch, a, n * sizeof(double), sizeof(double) * n, 1, cudamemcpyhosttodevice); 

for case correct. api cudamemcpy2d is

cudamemcpy2d(void* destination, size_t pitch_from_mallocpitch, const void* source, size_t source_pitch_bytes, size_t src_width_in_bytes, size_t src_rows_count, enum type_of_xfer); 

Comments

Popular posts from this blog

node.js - Using Node without global install -

How to access a php class file from PHPFox framework into javascript code written in simple HTML file? -

java - Null response to php query in android, even though php works properly -