cudaFunctions.cu 16 KB
Newer Older
1
2
3
#include <stdio.h>
#include <math.h>
#include <stdio.h>
4
5
6
7
8
9
10
11
12
13
14
15
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen ,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
16
//    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
//      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file was written by A. Marek, MPCDF


54
55
56
57
58
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <alloca.h>
#include <stdint.h>
59
#include <complex.h>
Pavel Kus's avatar
Pavel Kus committed
60
#include <cublas_v2.h>
61
62
63
64
65
66
67
68
69
70
71
72
73

#include "config-f90.h"

#define errormessage(x, ...) do { fprintf(stderr, "%s:%d " x, __FILE__, __LINE__, __VA_ARGS__ ); } while (0)

#ifdef DEBUG_CUDA
#define debugmessage(x, ...) do { fprintf(stderr, "%s:%d " x, __FILE__, __LINE__, __VA_ARGS__ ); } while (0)
#else
#define debugmessage(x, ...)
#endif

#ifdef WITH_GPU_VERSION
extern "C" {
Pavel Kus's avatar
Pavel Kus committed
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
  
    int cublasCreateFromC(intptr_t **cublas_handle) {
//     printf("in c: %p\n", *cublas_handle);
    *cublas_handle = (intptr_t*) malloc(sizeof(cublasHandle_t));
//     printf("in c: %p\n", *cublas_handle);
    cublasStatus_t status = cublasCreate((cublasHandle_t*) *cublas_handle);
    if (status == CUBLAS_STATUS_SUCCESS) {
//       printf("all OK\n");
      return 1;
    }
    else if (status == CUBLAS_STATUS_NOT_INITIALIZED) {
      errormessage("Error in cublasCreate: %s\n", "the CUDA Runtime initialization failed");
      return 0;      
    }       
    else if (status == CUBLAS_STATUS_ALLOC_FAILED) {
      errormessage("Error in cublasCreate: %s\n", "the resources could not be allocated");
      return 0;
    }
    else{
      errormessage("Error in cublasCreate: %s\n", "unknown error");
      return 0;
    }                
  }

  int cublasDestroyFromC(intptr_t *cublas_handle) {
    cublasStatus_t status = cublasDestroy(*((cublasHandle_t*) *cublas_handle));
    *cublas_handle = (intptr_t) NULL;
    if (status == CUBLAS_STATUS_SUCCESS) {
//       printf("all OK\n");
      return 1;
    }
    else if (status == CUBLAS_STATUS_NOT_INITIALIZED) {
      errormessage("Error in cublasDestroy: %s\n", "the library has not been initialized");
      return 0;      
    }       
    else{
      errormessage("Error in cublasCreate: %s\n", "unknown error");
      return 0;
    }                
  }
114
115
116
117
118
119
120
121
122
123
124

  int cudaThreadSynchronizeFromC() {
    cudaError_t cuerr = cudaThreadSynchronize();
    if (cuerr != cudaSuccess) {
      errormessage("Error in cudaThreadSynchronize: %s\n",cudaGetErrorString(cuerr));
      return 0;
    }
    return 1;
  }


125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
  int cudaSetDeviceFromC(int n) {

    cudaError_t cuerr = cudaSetDevice(n);
    if (cuerr != cudaSuccess) {
      errormessage("Error in cudaSetDevice: %s\n",cudaGetErrorString(cuerr));
      return 0;
    }
    return 1;
  }

  int cudaGetDeviceCountFromC(int *count) {

    cudaError_t cuerr = cudaGetDeviceCount(count);
    if (cuerr != cudaSuccess) {
      errormessage("Error in cudaGetDeviceCount: %s\n",cudaGetErrorString(cuerr));
      return 0;
    }
    return 1;
  }

  int cudaDeviceSynchronizeFromC() {

    cudaError_t cuerr = cudaDeviceSynchronize();
    if (cuerr != cudaSuccess) {
      errormessage("Error in cudaGetDeviceCount: %s\n",cudaGetErrorString(cuerr));
      return 0;
    }
    return 1;
  }


  int cudaMallocFromC(intptr_t *a, size_t width_height) {

    cudaError_t cuerr = cudaMalloc((void **) a, width_height);
#ifdef DEBUG_CUDA
    printf("Malloc pointer address: %p \n", *a);
#endif
    if (cuerr != cudaSuccess) {
      errormessage("Error in cudaMalloc: %s\n",cudaGetErrorString(cuerr));
      return 0;
    }
    return 1;
  }
  int cudaFreeFromC(intptr_t *a) {
#ifdef DEBUG_CUDA
    printf("Free pointer address: %p \n", a);
#endif
    cudaError_t cuerr = cudaFree(a);

    if (cuerr != cudaSuccess) {
      errormessage("Error in cudaFree: %s\n",cudaGetErrorString(cuerr));
      return 0;
    }
    return 1;
  }

  int cudaMemsetFromC(intptr_t *a, int value, size_t count) {

    cudaError_t cuerr = cudaMemset( a, value, count);
    if (cuerr != cudaSuccess) {
      errormessage("Error in cudaMemset: %s\n",cudaGetErrorString(cuerr));
      return 0;
    }
    return 1;
  }

  int cudaMemcpyFromC(intptr_t *dest, intptr_t *src, size_t count, int dir) {

    cudaError_t cuerr = cudaMemcpy( dest, src, count, (cudaMemcpyKind)dir);
    if (cuerr != cudaSuccess) {
      errormessage("Error in cudaMemcpy: %s\n",cudaGetErrorString(cuerr));
      return 0;
    }
    return 1;
  }

  int cudaMemcpy2dFromC(intptr_t *dest, size_t dpitch, intptr_t *src, size_t spitch, size_t width, size_t height, int dir) {

    cudaError_t cuerr = cudaMemcpy2D( dest, dpitch, src, spitch, width, height, (cudaMemcpyKind)dir);
    if (cuerr != cudaSuccess) {
      errormessage("Error in cudaMemcpy2d: %s\n",cudaGetErrorString(cuerr));
      return 0;
    }
    return 1;
  }
  int cudaMemcpyDeviceToDeviceFromC(void) {
      int val = cudaMemcpyDeviceToDevice;
      return val;
  }
  int cudaMemcpyHostToDeviceFromC(void) {
      int val = cudaMemcpyHostToDevice;
      return val;
  }
  int cudaMemcpyDeviceToHostFromC(void) {
      int val = cudaMemcpyDeviceToHost;
      return val;
  }
  int cudaHostRegisterPortableFromC(void) {
      int val = cudaHostRegisterPortable;
      return val;
  }
  int cudaHostRegisterMappedFromC(void) {
      int val = cudaHostRegisterMapped;
      return val;
  }
230
  
Pavel Kus's avatar
Pavel Kus committed
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
  cublasOperation_t operation_new_api(char trans) {
    if (trans == 'N' || trans == 'n') {
      return CUBLAS_OP_N;
    }
    else if (trans == 'T' || trans == 't') {
      return CUBLAS_OP_T;
    }
    else if (trans == 'C' || trans == 'c') {
      return CUBLAS_OP_C;
    }
    else {
      errormessage("Error when transfering %c to cublasOperation_t\n",trans);
      // or abort?
      return CUBLAS_OP_N;
    }
  }


  cublasFillMode_t fill_mode_new_api(char uplo) {
    if (uplo == 'L' || uplo == 'l') {
      return CUBLAS_FILL_MODE_LOWER;
    }
    else if(uplo == 'U' || uplo == 'u') {
      return CUBLAS_FILL_MODE_UPPER;
    }
    else {
      errormessage("Error when transfering %c to cublasFillMode_t\n", uplo);
      // or abort?
      return CUBLAS_FILL_MODE_LOWER;
    }
  }
  
  cublasSideMode_t side_mode_new_api(char side) {
    if (side == 'L' || side == 'l') {
      return CUBLAS_SIDE_LEFT;
    }    
    else if (side == 'R' || side == 'r') {
      return CUBLAS_SIDE_RIGHT;
    }
    else{
      errormessage("Error when transfering %c to cublasSideMode_t\n", side);
      // or abort?
      return CUBLAS_SIDE_LEFT;
    }
  }
  
  cublasDiagType_t diag_type_new_api(char diag) {
    if (diag == 'N' || diag == 'n') {
      return CUBLAS_DIAG_NON_UNIT;
    }
    else if (diag == 'U' || diag == 'u') {
      return CUBLAS_DIAG_UNIT;
    }
    else {
      errormessage("Error when transfering %c to cublasDiagMode_t\n", diag);
      // or abort?
      return CUBLAS_DIAG_NON_UNIT;
    }
  }
      

  
  void cublasDgemv_elpa_wrapper (intptr_t handle, char trans, int m, int n, double alpha,
                               const double *A, int lda,  const double *x, int incx,
                               double beta, double *y, int incy) {    
    
    cublasDgemv(*((cublasHandle_t*)handle), operation_new_api(trans), 
                m, n, &alpha, A, lda, x, incx, &beta, y, incy);     
  }
  
  void cublasSgemv_elpa_wrapper (intptr_t handle, char trans, int m, int n, float alpha,
                               const float *A, int lda,  const float *x, int incx,
                               float beta, float *y, int incy) {    
    
    cublasSgemv(*((cublasHandle_t*)handle), operation_new_api(trans), 
                m, n, &alpha, A, lda, x, incx, &beta, y, incy);     
  }
  
  void cublasZgemv_elpa_wrapper (intptr_t handle, char trans, int m, int n, double complex alpha,
310
311
312
313
314
315
316
317
318
319
                               const double complex *A, int lda,  const double complex *x, int incx,
                               double complex beta, double complex *y, int incy) {    

    cuDoubleComplex alpha_casted = *((cuDoubleComplex*)(&alpha));
    cuDoubleComplex beta_casted = *((cuDoubleComplex*)(&beta));
    
    const cuDoubleComplex* A_casted = (const cuDoubleComplex*) A;
    const cuDoubleComplex* x_casted = (const cuDoubleComplex*) x;
    cuDoubleComplex* y_casted = (cuDoubleComplex*) y;
    
Pavel Kus's avatar
Pavel Kus committed
320
321
    cublasZgemv(*((cublasHandle_t*)handle), operation_new_api(trans), 
                m, n, &alpha_casted, A_casted, lda, x_casted, incx, &beta_casted, y_casted, incy);     
322
323
  }
  
Pavel Kus's avatar
Pavel Kus committed
324
  void cublasCgemv_elpa_wrapper (intptr_t handle, char trans, int m, int n, float complex alpha,
325
326
                               const float complex *A, int lda,  const float complex *x, int incx,
                               float complex beta, float complex *y, int incy) {    
327

328
329
    cuFloatComplex alpha_casted = *((cuFloatComplex*)(&alpha));
    cuFloatComplex beta_casted = *((cuFloatComplex*)(&beta));
330
    
331
332
333
334
    const cuFloatComplex* A_casted = (const cuFloatComplex*) A;
    const cuFloatComplex* x_casted = (const cuFloatComplex*) x;
    cuFloatComplex* y_casted = (cuFloatComplex*) y;
    
Pavel Kus's avatar
Pavel Kus committed
335
336
    cublasCgemv(*((cublasHandle_t*)handle), operation_new_api(trans), 
                m, n, &alpha_casted, A_casted, lda, x_casted, incx, &beta_casted, y_casted, incy);     
337
338
  }
  
Pavel Kus's avatar
Pavel Kus committed
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
  
  void cublasDgemm_elpa_wrapper (intptr_t handle, char transa, char transb, int m, int n, int k,
                               double alpha, const double *A, int lda,
                               const double *B, int ldb, double beta,
                               double *C, int ldc) {
    
    cublasDgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb), 
                m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
  }

  void cublasSgemm_elpa_wrapper (intptr_t handle, char transa, char transb, int m, int n, int k,
                               float alpha, const float *A, int lda,
                               const float *B, int ldb, float beta,
                               float *C, int ldc) {
    
    cublasSgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb), 
                m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc);
  }

  void cublasZgemm_elpa_wrapper (intptr_t handle, char transa, char transb, int m, int n, int k,
359
360
361
362
363
364
365
366
367
368
369
                               double complex alpha, const double complex *A, int lda,
                               const double complex *B, int ldb, double complex beta,
                               double complex *C, int ldc) {
    
    cuDoubleComplex alpha_casted = *((cuDoubleComplex*)(&alpha));
    cuDoubleComplex beta_casted = *((cuDoubleComplex*)(&beta));
    
    const cuDoubleComplex* A_casted = (const cuDoubleComplex*) A;
    const cuDoubleComplex* B_casted = (const cuDoubleComplex*) B;
    cuDoubleComplex* C_casted = (cuDoubleComplex*) C;
    
Pavel Kus's avatar
Pavel Kus committed
370
371
    cublasZgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb), 
                m, n, k, &alpha_casted, A_casted, lda, B_casted, ldb, &beta_casted, C_casted, ldc);
372
373
  }

Pavel Kus's avatar
Pavel Kus committed
374
  void cublasCgemm_elpa_wrapper (intptr_t handle, char transa, char transb, int m, int n, int k,
375
376
377
378
379
380
                               float complex alpha, const float complex *A, int lda,
                               const float complex *B, int ldb, float complex beta,
                               float complex *C, int ldc) {
    
    cuFloatComplex alpha_casted = *((cuFloatComplex*)(&alpha));
    cuFloatComplex beta_casted = *((cuFloatComplex*)(&beta));
381
    
382
383
384
385
    const cuFloatComplex* A_casted = (const cuFloatComplex*) A;
    const cuFloatComplex* B_casted = (const cuFloatComplex*) B;
    cuFloatComplex* C_casted = (cuFloatComplex*) C;
    
Pavel Kus's avatar
Pavel Kus committed
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
    cublasCgemm(*((cublasHandle_t*)handle), operation_new_api(transa), operation_new_api(transb), 
                m, n, k, &alpha_casted, A_casted, lda, B_casted, ldb, &beta_casted, C_casted, ldc);
  }

  
  // todo: new CUBLAS API diverged from standard BLAS api for these functions
  // todo: it provides out-of-place (and apparently more efficient) implementation
  // todo: by passing B twice (in place of C as well), we should fall back to in-place algorithm
  
  void cublasDtrmm_elpa_wrapper (intptr_t handle, char side, char uplo, char transa, char diag,
                               int m, int n, double alpha, const double *A,
                               int lda, double *B, int ldb){

    cublasDtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa), 
                diag_type_new_api(diag), m, n, &alpha, A, lda, B, ldb, B, ldb);
  }

  void cublasStrmm_elpa_wrapper (intptr_t handle, char side, char uplo, char transa, char diag,
                               int m, int n, float alpha, const float *A,
                               int lda, float *B, int ldb){

    cublasStrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa), 
                diag_type_new_api(diag), m, n, &alpha, A, lda, B, ldb, B, ldb);
409
410
  }

Pavel Kus's avatar
Pavel Kus committed
411
  void cublasZtrmm_elpa_wrapper (intptr_t handle, char side, char uplo, char transa, char diag,
412
413
414
415
416
417
418
419
                               int m, int n, double complex alpha, const double complex *A,
                               int lda, double complex *B, int ldb){

    cuDoubleComplex alpha_casted = *((cuDoubleComplex*)(&alpha));
    
    const cuDoubleComplex* A_casted = (const cuDoubleComplex*) A;
    cuDoubleComplex* B_casted = (cuDoubleComplex*) B;    
    
Pavel Kus's avatar
Pavel Kus committed
420
421
    cublasZtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa), 
                diag_type_new_api(diag), m, n, &alpha_casted, A_casted, lda, B_casted, ldb, B_casted, ldb);
422
423
  }

Pavel Kus's avatar
Pavel Kus committed
424
  void cublasCtrmm_elpa_wrapper (intptr_t handle, char side, char uplo, char transa, char diag,
425
426
                               int m, int n, float complex alpha, const float complex *A,
                               int lda, float complex *B, int ldb){
427

428
429
430
431
    cuFloatComplex alpha_casted = *((cuFloatComplex*)(&alpha));
    
    const cuFloatComplex* A_casted = (const cuFloatComplex*) A;
    cuFloatComplex* B_casted = (cuFloatComplex*) B;    
432
    
Pavel Kus's avatar
Pavel Kus committed
433
434
    cublasCtrmm(*((cublasHandle_t*)handle), side_mode_new_api(side), fill_mode_new_api(uplo), operation_new_api(transa), 
                diag_type_new_api(diag), m, n, &alpha_casted, A_casted, lda, B_casted, ldb, B_casted, ldb);
435
436
437
  }

  
438
439
}
#endif /* WITH_GPU_VERSION */