8 void gemm_bin(
int M,
int N,
int K,
float ALPHA,
14 for(i = 0; i < M; ++i){
15 for(k = 0; k < K; ++k){
16 char A_PART = A[i*lda+k];
18 for(j = 0; j < N; ++j){
19 C[i*ldc+j] += B[k*ldb+j];
22 for(j = 0; j < N; ++j){
23 C[i*ldc+j] -= B[k*ldb+j];
33 float *m = calloc(rows*cols,
sizeof(
float));
34 for(i = 0; i < rows*cols; ++i){
35 m[i] = (float)rand()/RAND_MAX;
53 clock_t start = clock(), end;
54 for(i = 0; i<10; ++i){
55 gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
58 printf(
"Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n",m,k,k,n, TA, TB, (
float)(end-start)/CLOCKS_PER_SEC);
65 void gemm(
int TA,
int TB,
int M,
int N,
int K,
float ALPHA,
71 gemm_cpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
74 void gemm_nn(
int M,
int N,
int K,
float ALPHA,
80 #pragma omp parallel for 81 for(i = 0; i < M; ++i){
82 for(k = 0; k < K; ++k){
83 register float A_PART = ALPHA*A[i*lda+k];
84 for(j = 0; j < N; ++j){
85 C[i*ldc+j] += A_PART*B[k*ldb+j];
91 void gemm_nt(
int M,
int N,
int K,
float ALPHA,
97 #pragma omp parallel for 98 for(i = 0; i < M; ++i){
99 for(j = 0; j < N; ++j){
100 register float sum = 0;
101 for(k = 0; k < K; ++k){
102 sum += ALPHA*A[i*lda+k]*B[j*ldb + k];
109 void gemm_tn(
int M,
int N,
int K,
float ALPHA,
115 #pragma omp parallel for 116 for(i = 0; i < M; ++i){
117 for(k = 0; k < K; ++k){
118 register float A_PART = ALPHA*A[k*lda+i];
119 for(j = 0; j < N; ++j){
120 C[i*ldc+j] += A_PART*B[k*ldb+j];
126 void gemm_tt(
int M,
int N,
int K,
float ALPHA,
132 #pragma omp parallel for 133 for(i = 0; i < M; ++i){
134 for(j = 0; j < N; ++j){
135 register float sum = 0;
136 for(k = 0; k < K; ++k){
137 sum += ALPHA*A[i+k*lda]*B[k+j*ldb];
145 void gemm_cpu(
int TA,
int TB,
int M,
int N,
int K,
float ALPHA,
153 for(i = 0; i < M; ++i){
154 for(j = 0; j < N; ++j){
155 C[i*ldc + j] *= BETA;
159 gemm_nn(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
161 gemm_tn(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
163 gemm_nt(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
165 gemm_tt(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
172 void gemm_gpu(
int TA,
int TB,
int M,
int N,
int K,
float ALPHA,
173 float *A_gpu,
int lda,
174 float *B_gpu,
int ldb,
176 float *C_gpu,
int ldc)
178 cublasHandle_t handle = blas_handle();
179 cudaError_t status = cublasSgemm(handle, (TB ? CUBLAS_OP_T : CUBLAS_OP_N),
180 (TA ? CUBLAS_OP_T : CUBLAS_OP_N), N, M, K, &ALPHA, B_gpu, ldb, A_gpu, lda, &BETA, C_gpu, ldc);
189 void time_gpu_random_matrix(
int TA,
int TB,
int m,
int k,
int n)
202 clock_t start = clock(), end;
203 for(i = 0; i<32; ++i){
204 gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
207 printf(
"Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s\n",m,k,k,n, TA, TB, (
float)(end-start)/CLOCKS_PER_SEC);
213 void time_gpu(
int TA,
int TB,
int m,
int k,
int n)
224 float *a_cl = cuda_make_array(a, m*k);
225 float *b_cl = cuda_make_array(b, k*n);
226 float *c_cl = cuda_make_array(c, m*n);
229 clock_t start = clock(), end;
230 for(i = 0; i<iter; ++i){
231 gemm_gpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
232 cudaThreadSynchronize();
234 double flop = ((double)m)*n*(2.*k + 2.)*iter;
235 double gflop = flop/pow(10., 9);
237 double seconds =
sec(end-start);
238 printf(
"Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",m,k,k,n, TA, TB, seconds, gflop/seconds);
248 void test_gpu_accuracy(
int TA,
int TB,
int m,
int k,
int n)
262 memset(c, 0, m*n*
sizeof(
float));
263 memset(c_gpu, 0, m*n*
sizeof(
float));
266 gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c_gpu,n);
270 gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
274 for(i = 0; i < m*n; ++i) {
276 sse += pow(c[i]-c_gpu[i], 2);
278 printf(
"Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %g SSE\n",m,k,k,n, TA, TB, sse/(m*n));
312 time_gpu(0,0,64,75,12544);
313 time_gpu(0,0,64,75,12544);
314 time_gpu(0,0,64,75,12544);
315 time_gpu(0,0,64,576,12544);
316 time_gpu(0,0,256,2304,784);
317 time_gpu(1,1,2304,256,784);
318 time_gpu(0,0,512,4608,196);
319 time_gpu(1,1,4608,512,196);
void gemm_tt(int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)
float * random_matrix(int rows, int cols)
void time_random_matrix(int TA, int TB, int m, int k, int n)
void gemm_nt(int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)
void gemm_tn(int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)
void gemm(int TA, int TB, int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float BETA, float *C, int ldc)
void gemm_bin(int M, int N, int K, float ALPHA, char *A, int lda, float *B, int ldb, float *C, int ldc)
float sec(clock_t clocks)
void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float BETA, float *C, int ldc)
void gemm_nn(int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)