ev_tridi_band_gpu_c_v2_complex_template.cu 17.8 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen ,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13
//    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
//      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file was originally written by NVIDIA
// and re-written by A. Marek, MPCDF


#include <stdio.h>
53
#include <assert.h>
54
55
56
57
#include <cuda_runtime.h>
#include <stdlib.h>
#include <cuComplex.h>
#include "config-f90.h"
58
59
60
61
62


#define BLOCK_CYCLIC_BLOCKSIZE 128
#define GLOBAL_STRIPE_WIDTH 256

63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
// ===========================================================================================================
// Important:   due to the use of warp shuffling, the C version of the backtransformation kernel only works on
//              devices with compute capability 3.x; for older devices, please use the Fortran kernel version
// ===========================================================================================================


#if 0
static __device__ __forceinline__ cuDoubleComplex  shfl_xor_complex(cuDoubleComplex r, int mask)
{
    double real = cuCreal(r) ;
    double imag =  cuCimag(r);


    int hr = __shfl_xor(__double2hiint(real), mask);
    int lr = __shfl_xor(__double2loint(real), mask);

    int hi = __shfl_xor(__double2hiint(imag), mask);
    int li = __shfl_xor(__double2loint(imag), mask);



    real =      __hiloint2double(hr, lr);
    imag = __hiloint2double(hi, li);
    return       make_cuDoubleComplex(real, imag);

}
#endif


// Perform the equivalent of "__shfl_down" on an 8-byte value
#ifdef DOUBLE_PRECISION_COMPLEX
94
static __device__ __forceinline__ double shfl_down_complex_double(double r, int offset)
95
#else
96
static __device__ __forceinline__ float shfl_down_complex_single(float r, int offset)
97
98
#endif
{
99
100
101
    // The following operations do not exist in CUDA 10.1 any more
    // It has been commented out. The code is still compiled, but not used
    // TODO do it properly
102

103
104
105
106
107
108
    assert(0);
    //int hi = __shfl_down(__double2hiint(r), offset);
    //int lo = __shfl_down(__double2loint(r), offset);

    //return __hiloint2double(hi, lo);
    return 0.;
109
110
111
}

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
112
__device__ void warp_reduce_1_complex_double( cuDoubleComplex *s_block)
113
#else
Andreas Marek's avatar
Andreas Marek committed
114
__device__ void warp_reduce_1_complex_single( cuFloatComplex *s_block)
115
116
117
118
119
#endif
{
    int t_idx ;
    t_idx = threadIdx.x;
    __syncthreads();
120
    // attention
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#ifdef DOUBLE_PRECISION_COMPLEX
        if (t_idx < 32)
        {

	s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 32]) , cuCadd( s_block[t_idx + 64], s_block[t_idx + 96]) );
        if (t_idx < 8)
        {
	s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 8] ) , cuCadd( s_block[t_idx + 16] , s_block[t_idx + 24] ) );

        }
        if (t_idx < 4)
        {
        s_block[t_idx] = cuCadd(s_block[t_idx] , s_block[t_idx + 4]) ;
        }
        if (t_idx < 1)
        {
	s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 1] ) , cuCadd( s_block[t_idx +2] , s_block[t_idx + 3] ) );
        }
        }
#else
        if (t_idx < 32)
        {

	s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 32]) , cuCaddf( s_block[t_idx + 64], s_block[t_idx + 96]) );
        if (t_idx < 8)
        {
	s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 8] ) , cuCaddf( s_block[t_idx + 16] , s_block[t_idx + 24] ) );

        }
        if (t_idx < 4)
        {
        s_block[t_idx] = cuCaddf(s_block[t_idx] , s_block[t_idx + 4]) ;
        }
        if (t_idx < 1)
        {
	s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 1] ) , cuCaddf( s_block[t_idx +2] , s_block[t_idx + 3] ) );
        }
        }
#endif
}

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
163
__device__ void warp_reduce_2_complex_double( cuDoubleComplex *s_block)
164
#else
Andreas Marek's avatar
Andreas Marek committed
165
__device__ void warp_reduce_2_complex_single( cuFloatComplex *s_block)
166
167
168
169
170
#endif
{
    int t_idx ;
    t_idx = threadIdx.x;
    __syncthreads();
Andreas Marek's avatar
Andreas Marek committed
171
        // attention
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#ifdef DOUBLE_PRECISION_COMPLEX
        if(t_idx < 64)
        {
	s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 64]) , cuCadd( s_block[t_idx + 128], s_block[t_idx + 192]) );
        if (t_idx < 32)
        {
        s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 32]) , cuCadd( s_block[t_idx + 64], s_block[t_idx + 96]) );
        }
        if (t_idx < 8)
        {
        s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 8] ) , cuCadd( s_block[t_idx + 16] , s_block[t_idx + 24] ) );

        }
        if (t_idx < 4)
        {
        s_block[t_idx] = cuCadd(s_block[t_idx] , s_block[t_idx + 4]) ;
        }
        if (t_idx < 1)
        {
        s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 1] ) , cuCadd( s_block[t_idx +2] , s_block[t_idx + 3] ) );
        }
        }
#else
        if(t_idx < 64)
        {
	s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 64]) , cuCaddf( s_block[t_idx + 128], s_block[t_idx + 192]) );
        if (t_idx < 32)
        {
        s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 32]) , cuCaddf( s_block[t_idx + 64], s_block[t_idx + 96]) );
        }
        if (t_idx < 8)
        {
        s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 8] ) , cuCaddf( s_block[t_idx + 16] , s_block[t_idx + 24] ) );

        }
        if (t_idx < 4)
        {
        s_block[t_idx] = cuCaddf(s_block[t_idx] , s_block[t_idx + 4]) ;
        }
        if (t_idx < 1)
        {
        s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 1] ) , cuCaddf( s_block[t_idx +2] , s_block[t_idx + 3] ) );
        }
        }

#endif
}

template <unsigned int REDUCE_START_OFFSET>
#ifdef DOUBLE_PRECISION_COMPLEX
222
__device__ __forceinline__ cuDoubleComplex warp_reduce_complex_double( cuDoubleComplex r)
223
#else
224
__device__ __forceinline__ cuFloatComplex warp_reduce_complex_single( cuFloatComplex r)
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#endif
{

#ifdef DOUBLE_PRECISION_COMPLEX
     double real = cuCreal(r);
     double imag = cuCimag(r);
#else
     float real = cuCrealf(r);
     float imag = cuCimagf(r);
#endif

#pragma unroll
    for (int i = REDUCE_START_OFFSET; i >= 1; i >>= 1)
    {
#ifdef DOUBLE_PRECISION_COMPLEX
240
        real += shfl_down_complex_double(real, i);
241
#else
242
        real += shfl_down_complex_single(real, i);
243
244
245
246
247
248
#endif
    }
#pragma unroll
    for (int i = REDUCE_START_OFFSET; i >= 1; i >>= 1)
    {
#ifdef DOUBLE_PRECISION_COMPLEX
249
        imag += shfl_down_complex_double(imag, i);
250
#else
251
        imag += shfl_down_complex_single(imag, i);
252
253
254
255
256
257
258
259
260
261
#endif
    }

#ifdef DOUBLE_PRECISION_COMPLEX
    return make_cuDoubleComplex(real,imag);
#else
    return make_cuFloatComplex(real,imag);
#endif
}

262
#if 0 /* not used anywhere */
263
264
template <unsigned int REDUCE_START_OFFSET, bool HAVE_2_WARPS>
#ifdef DOUBLE_PRECISION_COMPLEX
265
__device__ __forceinline__ void driver_warp_reduce_complex_double(cuDoubleComplex * dotp_s, int w_off)
266
#else
267
__device__ __forceinline__ void driver_warp_reduce_complex_single(cuFloatComplex * dotp_s, int w_off)
268
269
270
271
272
273
274
#endif
{
    int t_idx = threadIdx.x;

    if (HAVE_2_WARPS)
    {
        // In this case, we have 2 warps, each doing 1 reduction
275
	//attention
276
277
278
        if (t_idx < 64)
        {
#ifdef DOUBLE_PRECISION_COMPLEX
279
            dotp_s[w_off + t_idx] = warp_reduce_complex_double<REDUCE_START_OFFSET>(cuCadd(dotp_s[w_off + t_idx] , dotp_s[w_off + t_idx + 32]));
280
#else
281
            dotp_s[w_off + t_idx] = warp_reduce_complex_single<REDUCE_START_OFFSET>(cuCaddf(dotp_s[w_off + t_idx] , dotp_s[w_off + t_idx + 32]));
282
283
284
285
286
287
#endif
        }
    }
    else
    {
        // In this case we have 1 warp that performs both reductions
288
	// attention
289
290
291
        if (t_idx < 32)
        {
#ifdef DOUBLE_PRECISION_COMPLEX
292
293
            dotp_s[t_idx] = warp_reduce_complex_double<REDUCE_START_OFFSET>(cuCadd(dotp_s[t_idx] ,  dotp_s[t_idx + 32]));
            dotp_s[t_idx + 64] = warp_reduce_complex_double<REDUCE_START_OFFSET>(cuCadd(dotp_s[t_idx + 64] ,  dotp_s[t_idx + 96]));
294
#else
295
296
            dotp_s[t_idx] = warp_reduce_complex_single<REDUCE_START_OFFSET>(cuCaddf(dotp_s[t_idx] ,  dotp_s[t_idx + 32]));
            dotp_s[t_idx + 64] = warp_reduce_complex_single<REDUCE_START_OFFSET>(cuCaddf(dotp_s[t_idx + 64] ,  dotp_s[t_idx + 96]));
297
298
299
300
#endif
        }
    }
}
301
#endif /* not used anywhere */
Andreas Marek's avatar
Andreas Marek committed
302

303

304
305
306
#ifndef ALREADY_DEFINED_SYNC
// Synchronization wrapper, removing explicit synchronization when the thread-block is at most 32 threads (1 warp) in size
template <bool MUST_SYNC>
Andreas Marek's avatar
Andreas Marek committed
307
__device__ __forceinline__ void sync_real_threads()
308
309
310
311
312
313
314
315
316
{
    if (MUST_SYNC)
    {
        __syncthreads();
    }
}
#define ALREADY_DEFINED_SYNC 1
#endif

317
318
319
320
321
322
#ifdef DOUBLE_PRECISION_COMPLEX
__device__  void reset_dotp_buffers_complex_double( cuDoubleComplex  * const __restrict__ s_block)
#else
__device__  void reset_dotp_buffers_complex_single( cuFloatComplex  * const __restrict__ s_block)
#endif
{
323
    // attention
324
325
326
327
328
329
330
331
332
333
334
335
336
    if (blockDim.x >= 64)
    {
        int t_idx = threadIdx.x;

        if (t_idx < 64)
        {
            s_block[t_idx].x = s_block[t_idx + 64].x = 0.0;
	    s_block[t_idx].y = s_block[t_idx + 64].y = 0.0;

        }
    }
    else
    {
337
        int s_chunk = BLOCK_CYCLIC_BLOCKSIZE / blockDim.x;
338
339
340
341
342
343
344
345
346
347
348
349
350
#ifdef DOUBLE_PRECISION_COMPLEX
        int s_chunk_size = s_chunk * sizeof(cuDoubleComplex);
#else
        int s_chunk_size = s_chunk * sizeof(cuFloatComplex);
#endif

        // Each thread resets an equally-sized, contiguous portion of the buffer
        memset(&(s_block[ threadIdx.x * s_chunk].x), 0, s_chunk_size);
	memset( & (s_block[ threadIdx.x * s_chunk].y), 0, s_chunk_size);

    }
}
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
351
__device__  void reset_dotp_buffers_2_complex_double( cuDoubleComplex  * const __restrict__ s_block)
352
#else
Andreas Marek's avatar
Andreas Marek committed
353
__device__  void reset_dotp_buffers_2_complex_single( cuFloatComplex  * const __restrict__ s_block)
354
355
#endif
{
356
    if (blockDim.x >= BLOCK_CYCLIC_BLOCKSIZE)
357
358
359
    {
        int t_idx = threadIdx.x;

360
        if (t_idx < BLOCK_CYCLIC_BLOCKSIZE)
361
        {
362
363
            s_block[t_idx].x = s_block[t_idx + BLOCK_CYCLIC_BLOCKSIZE].x = 0.0;
            s_block[t_idx].y = s_block[t_idx + BLOCK_CYCLIC_BLOCKSIZE].y = 0.0;
364
365
366
367
368

        }
    }
    else
    {
369
        int s_chunk = GLOBAL_STRIPE_WIDTH / blockDim.x;
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
#ifdef DOUBLE_PRECISION_COMPLEX
        int s_chunk_size = s_chunk * sizeof(cuDoubleComplex);
#else
        int s_chunk_size = s_chunk * sizeof(cuFloatComplex);
#endif
        // Each thread resets an equally-sized, contiguous portion of the buffer
        memset(&(s_block[ threadIdx.x * s_chunk].x), 0, s_chunk_size);
        memset( & (s_block[ threadIdx.x * s_chunk].y), 0, s_chunk_size);

    }
}


// =========================
// Backtransformation kernel
// =========================
#ifdef DOUBLE_PRECISION_COMPLEX
387
template<unsigned int REDUCE_START_OFFSET>__global__ void compute_hh_trafo_kernel_2_2_complex_double(cuDoubleComplex * const __restrict__  q, const cuDoubleComplex  * const __restrict__   hh,   const cuDoubleComplex * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
388
#else
389
template<unsigned int REDUCE_START_OFFSET>__global__ void compute_hh_trafo_kernel_2_2_complex_single(cuFloatComplex * const __restrict__  q, const cuFloatComplex  * const __restrict__   hh,   const cuFloatComplex * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
390
391
392
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
393
394
    __shared__ cuDoubleComplex q_s[BLOCK_CYCLIC_BLOCKSIZE];
    __shared__ cuDoubleComplex dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
395
396
397

     cuDoubleComplex q_v2, tau ;
#else
398
399
    __shared__ cuFloatComplex q_s[BLOCK_CYCLIC_BLOCKSIZE];
    __shared__ cuFloatComplex dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444

     cuFloatComplex q_v2, tau ;
#endif

    int  t_idx,q_off, h_off, j , b_idx;

    // The thread index selects the position inside the eigenvector selected above
    t_idx = threadIdx.x;
    b_idx = blockIdx.x ;

    // Compute intial index
    j = ncols ;
     q_off = b_idx + (j + t_idx) * ldq;
         h_off = j * nb + t_idx;

   if(t_idx>0)
   {    q_s[t_idx] = q[ q_off ];
   }

   while (j>=1)
   {

        if ((j == ncols) || (t_idx ==0))
        {
              q_s[t_idx] = q[q_off ];
        }

        q_v2 = q_s[t_idx];
       tau =  hh_tau[j];

        __syncthreads();

        if(t_idx==0)
        {
                dotp_s[t_idx]= q_v2  ;
        }
       else
        {
#ifdef DOUBLE_PRECISION_COMPLEX
		dotp_s[t_idx]  =  cuCmul(q_v2,cuConj( hh[h_off]));
#else
		dotp_s[t_idx]  =  cuCmulf(q_v2,cuConjf( hh[h_off]));
#endif
        }
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
445
        warp_reduce_1_complex_double( dotp_s);
446
#else
Andreas Marek's avatar
Andreas Marek committed
447
        warp_reduce_1_complex_single( dotp_s);
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
#endif

        __syncthreads();
        if(t_idx ==0)
        {
#ifdef DOUBLE_PRECISION_COMPLEX
		q_v2 =  cuCsub(q_v2,cuCmul(dotp_s[0], tau) );
#else
		q_v2 =  cuCsubf(q_v2,cuCmulf(dotp_s[0], tau) );
#endif
        }
        else
        {
#ifdef DOUBLE_PRECISION_COMPLEX
		q_v2 =  cuCsub(q_v2,cuCmul(cuCmul(dotp_s[0], tau),hh[h_off]));
#else
		q_v2 =  cuCsubf(q_v2,cuCmulf(cuCmulf(dotp_s[0], tau),hh[h_off]));
#endif
        }

        if(t_idx < blockDim.x-1)
       {q_s[t_idx+1 ] = q_v2;
        }
       if ((j ==  1) || (t_idx == blockDim.x-1))
       {q[q_off] = q_v2;
        }
       __syncthreads();
       q_off -= ldq;
       h_off -= nb;
	j -=1;
}
}
#ifdef DOUBLE_PRECISION_COMPLEX
extern "C" void launch_compute_hh_trafo_c_kernel_complex_double( cuDoubleComplex* q, cuDoubleComplex * hh, cuDoubleComplex * hh_tau, const int nev, const int nb, const int ldq, const int off, const int ncols)
#else
extern "C" void launch_compute_hh_trafo_c_kernel_complex_single( cuFloatComplex* q, cuFloatComplex * hh, cuFloatComplex * hh_tau, const int nev, const int nb, const int ldq, const int off, const int ncols)
#endif
{

487
#if 0
488
489
490
	cudaDeviceSynchronize();
        cudaError_t err = cudaGetLastError();
        if(err != cudaSuccess) printf("error prior to compute_ hh_ trafo c kernel: %s, %d\n",cudaGetErrorString(err), err);
491
        dim3 n_block, n_thread;
492
493
	n_block = dim3(nev,1,1);
	n_thread = dim3(nb,1,1);
494
#endif
495
496
497

    switch (nb)
    {
498
      // attention
499
500
501
502
      case  256:
       case 128:
        case 64:
#ifdef DOUBLE_PRECISION_COMPLEX
503
	     compute_hh_trafo_kernel_2_2_complex_double<16><<<nev, nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
504
#else
505
	     compute_hh_trafo_kernel_2_2_complex_single<16><<<nev, nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
506
507
508
509
510
#endif
            break;

        case 32:
#ifdef DOUBLE_PRECISION_COMPLEX
511
            compute_hh_trafo_kernel_2_2_complex_double<8><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
512
#else
513
            compute_hh_trafo_kernel_2_2_complex_single<8><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
514
515
516
517
518
#endif
            break;

        case 16:
#ifdef DOUBLE_PRECISION_COMPLEX
519
            compute_hh_trafo_kernel_2_2_complex_double<4><<<nev ,nb>>>(q, hh,  hh_tau, nb, ldq, off, ncols);
520
#else
521
            compute_hh_trafo_kernel_2_2_complex_single<4><<<nev ,nb>>>(q, hh,  hh_tau, nb, ldq, off, ncols);
522
523
524
525
526
#endif
            break;

        case 8:
#ifdef DOUBLE_PRECISION_COMPLEX
527
            compute_hh_trafo_kernel_2_2_complex_double<2><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
528
#else
529
            compute_hh_trafo_kernel_2_2_complex_single<2><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
530
531
532
533
534
#endif
            break;

        case 4:
#ifdef DOUBLE_PRECISION_COMPLEX
535
            compute_hh_trafo_kernel_2_2_complex_double<1><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
536
#else
537
            compute_hh_trafo_kernel_2_2_complex_single<1><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
538
539
540
541
542
543
#endif
            break;

        case 2:
        case 1:
#ifdef DOUBLE_PRECISION_COMPLEX
544
	    compute_hh_trafo_kernel_2_2_complex_double<0><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
545
#else
546
	    compute_hh_trafo_kernel_2_2_complex_single<0><<<nev ,nb>>>(q, hh, hh_tau, nb, ldq, off, ncols);
547
548
549
#endif
            break;
        default:
550
            printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and BLOCK_CYCLIC_BLOCKSIZE.\n");
551
552
    }

553
#if 0
554
555
556
557
558
559
	cudaDeviceSynchronize();
	 err = cudaGetLastError();
        if ( err!= cudaSuccess)
        {
                printf("\n compute hh trafo c kernel failed  %s \n",cudaGetErrorString(err) );
        }
560
#endif
561
562
563
564

}