15 cudaError_t status = cudaSetDevice(n);
22 cudaError_t status = cudaGetDevice(&n);
// Reports CUDA failures from two sources: `status`, the result code handed in
// by the caller, and status2, whatever cudaGetLastError() returns on entry.
// For each one that is not cudaSuccess, the message is printed and also
// formatted into `buffer` (declared on a line not shown in this extract).
27 void check_error(cudaError_t status)
// cudaGetLastError() reads and clears the runtime's last-error state, so an
// earlier failure is picked up here even when `status` itself is fine.
30 cudaError_t status2 = cudaGetLastError();
31 if (status != cudaSuccess)
33 const char *s = cudaGetErrorString(status);
35 printf(
"CUDA Error: %s\n", s);
37 snprintf(buffer, 256,
"CUDA Error: %s", s);
40 if (status2 != cudaSuccess)
// Describe status2, the code this branch actually tests. `status` may be
// cudaSuccess here, which would make the message read "no error".
42 const char *s = cudaGetErrorString(status2);
44 printf(
"CUDA Error Prev: %s\n", s);
46 snprintf(buffer, 256,
"CUDA Error Prev: %s", s);
// Returns a dim3 derived from the element count n. Only part of the body is
// visible in this extract; x and y are declared on lines not shown.
51 dim3 cuda_gridsize(
size_t n){
// k = ceil(n / BLOCK) for n >= 1, written as (n-1)/BLOCK + 1.
// NOTE(review): n is size_t, so n == 0 makes (n-1) wrap around to SIZE_MAX
// and k becomes huge — confirm callers never pass 0.
52 size_t k = (n-1) / BLOCK + 1;
// y = ceil(n / (x*BLOCK)) for n >= 1; same wrap-around caveat as above.
57 y = (n-1)/(x*BLOCK) + 1;
// Returns a cudnnHandle_t. Handles live in function-local static storage,
// one slot per index returned by cuda_get_device().
65 cudnnHandle_t cudnn_handle()
// Zero-initialized per-slot flags; presumably record which slots already hold
// a created handle — the test that reads them is on a line not shown.
67 static int init[16] = {0};
68 static cudnnHandle_t handle[16];
// NOTE(review): both arrays have 16 entries and no bounds check on i is
// visible here — confirm the device index is always in [0, 16).
69 int i = cuda_get_device();
// NOTE(review): the status returned by cudnnCreate is discarded.
71 cudnnCreate(&handle[i]);
// Returns a cublasHandle_t. Handles live in function-local static storage,
// one slot per index returned by cuda_get_device().
78 cublasHandle_t blas_handle()
// Zero-initialized per-slot flags; presumably record which slots already hold
// a created handle — the test that reads them is on a line not shown.
80 static int init[16] = {0};
81 static cublasHandle_t handle[16];
// NOTE(review): both arrays have 16 entries and no bounds check on i is
// visible here — confirm the device index is always in [0, 16).
82 int i = cuda_get_device();
// NOTE(review): the status returned by cublasCreate is discarded.
84 cublasCreate(&handle[i]);
// Allocates device memory for n floats with cudaMalloc and copies n floats
// from host pointer x into it. Returns float*; x_gpu is declared on a line
// not shown in this extract.
90 float *cuda_make_array(
float *x,
size_t n)
// Byte count for n floats.
93 size_t size =
sizeof(float)*n;
// cudaMalloc stores the new device pointer in x_gpu.
94 cudaError_t status = cudaMalloc((
void **)&x_gpu, size);
// Host-to-device copy of `size` bytes. Any guard around this copy (e.g. for a
// null x) would be on lines not shown — TODO confirm.
97 status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
// A null device pointer is reported through error().
102 if(!x_gpu)
error(
"Cuda malloc failed\n");
// Fills the device buffer x_gpu with n uniformly distributed floats using
// cuRAND. One generator is kept per index returned by cuda_get_device().
106 void cuda_random(
float *x_gpu,
size_t n)
// Function-local static state: generator slots and zero-initialized flags.
// The test that reads init[] is on a line not shown in this extract.
108 static curandGenerator_t gen[16];
109 static int init[16] = {0};
// NOTE(review): arrays have 16 entries and no bounds check on i is visible.
110 int i = cuda_get_device();
// Generator setup; seeded from time(0), so the sequence depends on when the
// seed call runs.
// NOTE(review): the curandStatus_t results of these calls are discarded.
112 curandCreateGenerator(&gen[i], CURAND_RNG_PSEUDO_DEFAULT);
113 curandSetPseudoRandomGeneratorSeed(gen[i], time(0));
116 curandGenerateUniform(gen[i], x_gpu, n);
// cudaPeekAtLastError() reads the last runtime error without clearing it;
// check_error reports it if it is not cudaSuccess.
117 check_error(cudaPeekAtLastError());
// Pulls n floats from device buffer x_gpu into a host scratch buffer and
// prints sqrt(err/n) labelled with s, where err is the dot product of the
// scratch buffer with itself. Lines 124-126 of the original are not shown in
// this extract; presumably they combine the scratch buffer with x first —
// TODO confirm.
120 float cuda_compare(
float *x_gpu,
float *x,
size_t n,
char *s)
// Zero-initialized host scratch buffer of n floats.
// NOTE(review): no null check on the calloc result is visible here.
122 float *tmp = calloc(n,
sizeof(
float));
// Device-to-host copy of n floats into tmp.
123 cuda_pull_array(x_gpu, tmp, n);
// Sum of squares of tmp (dot product of tmp with itself, unit strides).
127 float err =
dot_cpu(n, tmp, 1, tmp, 1);
// Root of the mean of those squares.
128 printf(
"Error %s: %f\n", s, sqrt(err/n));
// Allocates device memory for n ints with cudaMalloc and copies n ints from
// host pointer x into it. Returns int*; x_gpu is declared on a line not shown
// in this extract.
133 int *cuda_make_int_array(
int *x,
size_t n)
// Byte count for n ints.
136 size_t size =
sizeof(int)*n;
// cudaMalloc stores the new device pointer in x_gpu.
137 cudaError_t status = cudaMalloc((
void **)&x_gpu, size);
// Host-to-device copy of `size` bytes. Any guard around this copy would be on
// lines not shown — TODO confirm.
140 status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
// A null device pointer is reported through error().
143 if(!x_gpu)
error(
"Cuda malloc failed\n");
// Releases the device allocation x_gpu with cudaFree.
147 void cuda_free(
float *x_gpu)
// The returned status is captured; how it is handled is on a line not shown
// in this extract.
149 cudaError_t status = cudaFree(x_gpu);
// Copies n floats from host buffer x to device buffer x_gpu.
153 void cuda_push_array(
float *x_gpu,
float *x,
size_t n)
// Byte count for n floats.
155 size_t size =
sizeof(float)*n;
// Blocking host-to-device copy; handling of `status` is on a line not shown.
156 cudaError_t status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
// Copies n floats from device buffer x_gpu to host buffer x.
160 void cuda_pull_array(
float *x_gpu,
float *x,
size_t n)
// Byte count for n floats.
162 size_t size =
sizeof(float)*n;
// Blocking device-to-host copy; handling of `status` is on a line not shown.
163 cudaError_t status = cudaMemcpy(x, x_gpu, size, cudaMemcpyDeviceToHost);
// Returns a float computed from the n floats in device buffer x_gpu. Only the
// staging step is visible in this extract; presumably the host copy is then
// passed to mag_array — TODO confirm.
167 float cuda_mag_array(
float *x_gpu,
size_t n)
// Zero-initialized host scratch buffer of n floats.
// NOTE(review): no null check on the calloc result is visible here.
169 float *temp = calloc(n,
sizeof(
float));
// Device-to-host copy of n floats into temp.
170 cuda_pull_array(x_gpu, temp, n);
void fill_gpu(int N, float ALPHA, float *X, int INCX)
void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
void cuda_set_device(int n)
float dot_cpu(int N, float *X, int INCX, float *Y, int INCY)
float mag_array(float *a, int n)
void error(const char *s)