c++ - Allocate 1 Dimension array with cudaMallocPitch and then copy to device with cudaMemcpy2D 3 -
I have read the post "Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D", among many others including the NVIDIA docs, and I still can't get cudaMallocPitch to work together with cudaMemcpy2D.
I need to copy a very big matrix stored in array format (matrix[width*height]), along with a simple array, to perform matrix * vector operations. It is not optional for me to use cudaMallocPitch, in order to avoid conflicts and to have better performance.
So, I started by just trying to copy the matrix (a vector in my case) to the device and to check that it was copied correctly, but my code does not print anything. If I use cudaMalloc and cudaMemcpy everything works fine, but I do not know how to handle cudaMallocPitch and cudaMemcpy2D.
What can I do to fix this?
#include <stdio.h>

// Program as posted in the question. Tokens lost in extraction have been
// restored (the `for` keyword, the loop variable `i`, the array name `a`, and
// the capitalisation of the CUDA identifiers). The asker's logic is left
// unchanged on purpose, because the discussion that follows refers to it;
// the NOTE(review) comments mark the defects.
__global__ void kernel(size_t mpitch, double * a, int n)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // NOTE(review): idx is never advanced inside the loop, so a thread with
    // idx < n never leaves it.
    while (idx < n)
    {
        // NOTE(review): `idx * mpitch` steps one whole row per thread although
        // main() allocates a single row, and `+ n` adds n *bytes* (not n
        // elements) to the char pointer. For n = 1500 the resulting address is
        // not 8-byte aligned, so the double load is misaligned.
        double e = *(double *)(((char *) a + idx * mpitch) + n);
        printf("(%f)", e);
    }
}

int main()
{
    int n = 1500;
    double * a = new double[n], * d_a;
    size_t pitch;

    for (int i = 0; i < n; ++i)
    {
        a[i] = i;
    }

    // NOTE(review): none of the CUDA calls below has its return value checked,
    // and the launch is not followed by cudaGetLastError() or
    // cudaDeviceSynchronize(), so a failing kernel goes unreported.

    // One row of n doubles (width in bytes = sizeof(double) * n, height = 1).
    cudaMallocPitch(&d_a, &pitch, sizeof(double) * n, 1);
    cudaMemcpy2D(d_a, pitch, a, n * sizeof(double), sizeof(double) * n, 1, cudaMemcpyHostToDevice);

    unsigned int blocksize = 1024;
    unsigned int nblocks = (n + blocksize - 1) / blocksize;
    kernel <<<nblocks, blocksize>>>(pitch, d_a, n);

    cudaFree(d_a);
    delete [] a;
    return 0;
}
Error checking can make a big difference in debugging. You should always use it before coming here.
It wasn't clear whether you wanted a row or a column vector, i.e. a matrix of [1xn] or [nx1].
I've added an explanation at talomnies' suggestion, but first some 'working slabs of code'.
Here's the [nx1] version:
#include <cstdio>
#include <iostream>
#include <cuda.h>
using namespace std;

// Prints element `idx` of an [n x 1] column vector held in pitched memory.
//   mpitch : row pitch in bytes, as returned by cudaMallocPitch
//   a      : base address of the pitched allocation
//   n      : number of rows (one double per row)
// Expects a 1D launch with at least n threads; surplus threads return at once.
__global__ void kernel(size_t mpitch, double * a, int n)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= n) return;

    // Row `idx`, column 0: apply the row offset in bytes on a char pointer,
    // then reinterpret the row start as a double.
    double e = *(double *)((char *) a + idx * mpitch);
    printf("(%f)", e);
}

int main()
{
    int n = 15;
    // d_a starts as NULL so a failed allocation does not leave it holding an
    // indeterminate value that is then passed to later calls.
    double * a = new double[n], * d_a = NULL;
    size_t pitch;

    for (int i = 0; i < n; ++i)
    {
        a[i] = i;
    }

    // Column vector: n rows, each sizeof(double) bytes wide.
    cudaError_t err = cudaMallocPitch(&d_a, &pitch, sizeof(double), n);
    if (err != cudaSuccess) cout << "err0:" << cudaGetErrorString(err) << endl;

    // The host array is densely packed, so its pitch is sizeof(double).
    err = cudaMemcpy2D(d_a, pitch, a, sizeof(double), sizeof(double), n, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) cout << "err1:" << cudaGetErrorString(err) << endl;

    unsigned int blocksize = 1024;
    unsigned int nblocks = (n + blocksize - 1) / blocksize;
    kernel <<<nblocks, blocksize>>>(pitch, d_a, n);

    // Wait for the kernel to finish, then pick up any launch or execution error.
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if (err != cudaSuccess) cout << "err2:" << cudaGetErrorString(err) << endl;

    cudaFree(d_a);
    delete [] a;
    return 0;
}

[1xn]:
#include <cstdio>
#include <iostream>
#include <cuda.h>
using namespace std;

// Prints element `idx` of a [1 x n] row vector held in pitched memory.
//   mpitch : row pitch in bytes, as returned by cudaMallocPitch
//   a      : base address of the pitched allocation
//   n      : number of elements in the single row
// Expects a 1D launch with at least n threads; surplus threads return at once.
__global__ void kernel(size_t mpitch, double * a, int n)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= n) return;

    int row = 0; // only 1 row
    // The pitch is a byte count, so the row offset must be applied to the
    // char pointer *before* converting back to double*. The previous form,
    // (char *)(a + mpitch * row), scaled the pitch by sizeof(double) and was
    // only harmless because row is 0.
    double *row_ptr = (double *)((char *) a + mpitch * row);
    double e = row_ptr[idx];
    printf("(%f)", e);
}

int main()
{
    int n = 15;
    // d_a starts as NULL so a failed allocation does not leave it holding an
    // indeterminate value that is then passed to later calls.
    double * a = new double[n], * d_a = NULL;
    size_t pitch;

    for (int i = 0; i < n; ++i)
    {
        a[i] = i;
    }

    // Row vector: a single row that is sizeof(double) * n bytes wide.
    cudaError_t err = cudaMallocPitch(&d_a, &pitch, sizeof(double) * n, 1);
    if (err != cudaSuccess) cout << "err0:" << cudaGetErrorString(err) << endl;

    // The host array is one densely packed row, so its pitch equals its width.
    err = cudaMemcpy2D(d_a, pitch, a, sizeof(double) * n, sizeof(double) * n, 1, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) cout << "err1:" << cudaGetErrorString(err) << endl;

    unsigned int blocksize = 1024;
    unsigned int nblocks = (n + blocksize - 1) / blocksize;
    kernel <<<nblocks, blocksize>>>(pitch, d_a, n);

    // Wait for the kernel to finish, then pick up any launch or execution error.
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    if (err != cudaSuccess) cout << "err2:" << cudaGetErrorString(err) << endl;

    cudaFree(d_a);
    delete [] a;
    return 0;
}

Explanation
Firstly, error handling:
Considering how easy error handling is in CUDA, there isn't a good excuse not to put it in.
cudaError_t err = cudaMallocPitch(&d_a, &pitch, sizeof(double)*n, 1);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;

Second, you didn't specify whether you wanted a column vector or a row vector. Since a row vector is simply a 1-D array in linear memory, and you don't need pitched memory for that, I will assume for this explanation that you meant a column vector.
The recurring problem you were having was "misaligned address" in the kernel. This indicates that the problem is book-keeping, so let's walk through the three major steps of handling an aligned 2D array (even though our arrays will be either a column or a row vector).
Allocating: your allocation is written out as
cudaMallocPitch(&d_a, &pitch, sizeof(double) * n, 1);

This is correct for the row vector, as the API is cudaMallocPitch(void** pointer, size_t* pitch_return, size_t row_width_in_bytes, size_t count_of_rows). However, if we would like a column vector, the correct call is
cudaMallocPitch(&d_a, &pitch, sizeof(double), n);

Accessing: your access expression mixes up accessing a row with accessing an element within the row.
double e = *(double *)(((char *) a + idx * mpitch) + n);

Once again, stick to the documentation. The API documentation for cudaMallocPitch includes
T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;

For us this translates into
int column = 0;
double element = *((double*)((char*)a + idx * mpitch) + column);

I've used column = 0 for completeness, since we do not have more than one column.
copying:
cudaMemcpy2D(d_a, pitch, a, n * sizeof(double), sizeof(double) * n, 1, cudaMemcpyHostToDevice);

For your (row-vector) case this is correct. The API for cudaMemcpy2D is
cudaMemcpy2D(void* destination, size_t pitch_from_mallocPitch, const void* source, size_t source_pitch_bytes, size_t src_width_in_bytes, size_t src_rows_count, enum type_of_xfer);
Comments
Post a Comment