ev_tridi_band_gpu_c_v2_real_template.Xcu 15.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen ,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13
//    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
//      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file was originally written by NVIDIA
// and re-written by A. Marek, MPCDF


#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include "config-f90.h"

57 58 59
#define BLOCK_CYCLIC_BLOCKSIZE 128
#define GLOBAL_STRIPE_WIDTH 256

Andreas Marek's avatar
Andreas Marek committed
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
// Perform the equivalent of "__shfl_xor" on an 8-byte value
#ifdef DOUBLE_PRECISION_REAL
static __device__ __forceinline__ double shfl_xor_real_double(double r, int mask)
#else
static __device__ __forceinline__ float shfl_xor_real_single(float r, int mask)
#endif
{
    int hi = __shfl_xor(__double2hiint(r), mask);
    int lo = __shfl_xor(__double2loint(r), mask);

    return __hiloint2double(hi, lo);
}

// Perform the equivalent of "__shfl_down" on an 8-byte value
#ifdef DOUBLE_PRECISION_REAL
static __device__ __forceinline__ double shfl_down_real_double(double r, int offset)
#else
static __device__ __forceinline__ float shfl_down_real_single(float r, int offset)
#endif
{
    int hi = __shfl_down(__double2hiint(r), offset);
    int lo = __shfl_down(__double2loint(r), offset);

    return __hiloint2double(hi, lo);
}

// Perform a reduction on a warp or the first part of it
template <unsigned int REDUCE_START_OFFSET>
#ifdef DOUBLE_PRECISION_REAL
__device__ __forceinline__ double warp_reduce_real_double(double r)
#else
__device__ __forceinline__ float warp_reduce_real_single(float r)
#endif
{
#pragma unroll
    for (int i = REDUCE_START_OFFSET; i >= 1; i >>= 1)
    {
#ifdef DOUBLE_PRECISION_REAL
        r += shfl_down_real_double(r, i);
#else
        r += shfl_down_real_single(r, i);
#endif
    }

    return r;
}

// Perform 2 reductions, using either 1 or 2 warps
template <unsigned int REDUCE_START_OFFSET, bool HAVE_2_WARPS>
#ifdef DOUBLE_PRECISION_REAL
__device__ __forceinline__ void double_warp_reduce_real_double(double * dotp_s, int w_off)
#else
__device__ __forceinline__ void float_warp_reduce_real_single(float * dotp_s, int w_off)
#endif
{
    int t_idx = threadIdx.x;

    if (HAVE_2_WARPS)
    {
        // In this case, we have 2 warps, each doing 1 reduction
        // attention
        if (t_idx < 64)
        {
#ifdef DOUBLE_PRECISION_REAL
            dotp_s[w_off + t_idx] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[w_off + t_idx] + dotp_s[w_off + t_idx + 32]);
#else
            dotp_s[w_off + t_idx] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[w_off + t_idx] + dotp_s[w_off + t_idx + 32]);
#endif
        }
    }
    else
    {
        // In this case we have 1 warp that performs both reductions
        // attention
        if (t_idx < 32)
        {
#ifdef DOUBLE_PRECISION_REAL
            dotp_s[t_idx] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
            dotp_s[t_idx + 64] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]);
#else
            dotp_s[t_idx] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
            dotp_s[t_idx + 64] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx + 64] + dotp_s[t_idx + 96]);
#endif
        }
    }
}

147 148
// Reset the entire contents of a shared reduction block; the thread block size must be a power-of-2
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
149
__device__ __forceinline__ void reset_dotp_buffers_real_double(double * const __restrict__ s_block)
150
#else
Andreas Marek's avatar
Andreas Marek committed
151
__device__ __forceinline__ void reset_dotp_buffers_real_single(float * const __restrict__ s_block)
152 153
#endif
{
154
    // attention
155 156 157 158 159 160 161 162 163 164 165
    if (blockDim.x >= 64)
    {
        int t_idx = threadIdx.x;

        if (t_idx < 64)
        {
            s_block[t_idx] = s_block[t_idx + 64] = 0.0;
        }
    }
    else
    {
166
        int s_chunk = BLOCK_CYCLIC_BLOCKSIZE / blockDim.x;
167 168 169 170 171 172 173 174 175 176
#ifdef DOUBLE_PRECISION_REAL
        int s_chunk_size = s_chunk * sizeof(double);
#else
        int s_chunk_size = s_chunk * sizeof(float);
#endif
        // Each thread resets an equally-sized, contiguous portion of the buffer
        memset(s_block + threadIdx.x * s_chunk, 0, s_chunk_size);
    }
}

177 178 179 180 181 182 183
// =========================
// Backtransformation kernel
// =========================

// We use templates here to avoid additional branching based on the actual size of the thread-block
template<unsigned int REDUCE_START_OFFSET, bool HAVE_2_WARPS>
#ifdef DOUBLE_PRECISION_REAL
184
__global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_kernel_real_double(double * const __restrict__ q, const double * const __restrict__ hh, const double * const __restrict__ hh_dot,
185 186
    const double * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
#else
187
__global__ void __launch_bounds__( BLOCK_CYCLIC_BLOCKSIZE ) compute_hh_trafo_kernel_real_single(float * const __restrict__ q, const float * const __restrict__ hh, const float * const __restrict__ hh_dot,
188 189 190 191 192
    const float * const __restrict__ hh_tau, const int nb, const int ldq, const int off, const int ncols)
#endif

{
#ifdef DOUBLE_PRECISION_REAL
193 194
    __shared__ double dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
    __shared__ double q_s[BLOCK_CYCLIC_BLOCKSIZE+1];
195
#else
196 197
    __shared__ float dotp_s[BLOCK_CYCLIC_BLOCKSIZE];
    __shared__ float q_s[BLOCK_CYCLIC_BLOCKSIZE+1];
198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
#endif

    int b_idx, t_idx, q_off, h_off, w_off, j, t_s, q_delta, hh_delta;
#ifdef DOUBLE_PRECISION_REAL
    double q_v_1, q_v_2, hh_v_1, hh_v_2, tau_1, tau_2, s_1, s_2, dot_p, hh_v_3, my_r1, my_r2;
#else
    float q_v_1, q_v_2, hh_v_1, hh_v_2, tau_1, tau_2, s_1, s_2, dot_p, hh_v_3, my_r1, my_r2;
#endif
    // The block index selects the eigenvector (EV) which the current block is responsible for
    b_idx = blockIdx.x;

    // The thread index selects the position inside the eigenvector selected above
    t_idx = threadIdx.x;

    // The warp offset for the current thread: 0 for the first warp, 32 for the second etc.
    w_off = (t_idx >> 5) << 5;

    // The entire contents of the shared reduction buffers must be reset

#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
218
   reset_dotp_buffers_real_double(dotp_s);
219
#else
Andreas Marek's avatar
Andreas Marek committed
220
    reset_dotp_buffers_real_single(dotp_s);
221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237
#endif

    // Compute initial access indices
    j = off + ncols - 1;
    q_off = b_idx + (j + t_idx) * ldq;
    h_off = j * nb + t_idx;
    t_s = t_idx >> 1;
    q_delta = ldq << 1;
    hh_delta = nb << 1;

    // Load the last EV components in the EV cache
    if (t_idx > 0)
    {
        q_s[t_idx + 1] = q[q_off];
    }

    // Ensure the ring buffer and reduction buffers are initialized
Andreas Marek's avatar
Andreas Marek committed
238
    sync_real_threads<HAVE_2_WARPS>();
239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271

    while (j >= off + 1)
    {
        // Per-iteration GMem I/O reads are in order to improve cache hit ratio

        // Read the corresponding compotents in the 2 Householder reflectors
        hh_v_1 = __ldg(&hh[h_off]);
        hh_v_2 = __ldg(&hh[h_off - nb]);
        hh_v_3 = (t_idx == 0)? 0.0 : __ldg(&hh[h_off - 1]);

        // Read the pre-computed dot-product of the 2 Householder reflectors
        dot_p = __ldg(&hh_dot[j - 1]);

        // Read the pre-computed values for "Tau" corresponding to the 2 Householder reflectors
        tau_1 = __ldg(&hh_tau[j]);
        tau_2 = __ldg(&hh_tau[j - 1]);

        // Only read the new EV components (the others are already stored in the shared EV cache, q_s)
        if (t_idx == 0)
        {
            q_s[0] = q[q_off - ldq];
            q_s[1] = q[q_off];
        }

        // Fill the shared buffers for the dot products bewtween the EV subset and the Householder reflectors
        q_v_1 = q_s[t_idx + 1];
        q_v_2 = q_s[t_idx];

        my_r1 = q_v_1 * hh_v_1 * tau_1;
        my_r2 = q_v_2 * hh_v_2 * tau_2;

        // After using "shfl_xor", both threads in a pair will hold the same values
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
272 273
        my_r1 += shfl_xor_real_double(my_r1, 1);
        my_r2 += shfl_xor_real_double(my_r2, 1);
274
#else
Andreas Marek's avatar
Andreas Marek committed
275 276
        my_r1 += shfl_xor_real_single(my_r1, 1);
        my_r2 += shfl_xor_real_single(my_r2, 1);
277 278 279 280
#endif

        // Now both threads in a pair can write to the same reduction buffer address without race-condition issues
        dotp_s[t_s] = my_r1;
281
	//attention
282 283 284
        dotp_s[t_s + 64] = my_r2;

        // Ensure the reduction buffers are fully populated
Andreas Marek's avatar
Andreas Marek committed
285
        sync_real_threads<HAVE_2_WARPS>();
286 287 288

        // Perform the 2 reductions using only the first warp (we assume the warp size is 32, valid up to CC 3.x)
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
289
        double_warp_reduce_real_double<REDUCE_START_OFFSET, HAVE_2_WARPS>(dotp_s, w_off);
290
#else
Andreas Marek's avatar
Andreas Marek committed
291
        float_warp_reduce_real_single<REDUCE_START_OFFSET, HAVE_2_WARPS>(dotp_s, w_off);
292 293
#endif
        // Ensure every thread will have access to the reduction results
Andreas Marek's avatar
Andreas Marek committed
294
        sync_real_threads<HAVE_2_WARPS>();
295 296 297

        // Each thread collects the reduction results
        s_1 = dotp_s[0];
298 299

	// attention
300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316
        s_2 = dotp_s[64];

        // Each thread updates its corresponding EV component
        q_v_2 = q_v_2 - hh_v_3 * s_1 - hh_v_2 * s_2 + tau_2 * hh_v_2 * s_1 * dot_p;

        if (t_idx == blockDim.x - 1)
        {
            // The last thread writes the last 2 EV components to the EV matrix
            q[q_off] = q_v_1 - hh_v_1 * s_1;
            q[q_off - ldq] = q_v_2;
        }
        else
        {
            // All other threads update the EV cache for the next iteration
            q_s[t_idx + 2] = q_v_2;
        }

Andreas Marek's avatar
Andreas Marek committed
317
        sync_real_threads<HAVE_2_WARPS>();
318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351

        // Update access indices
        q_off -= q_delta;
        h_off -= hh_delta;
        j -= 2;
    }

    // Once the previous loop has finished, we have at most 1 more iteration to perform

    if (j == off - 1)
    {
        // No iterations remain, so the final contents of the EV matrix are updated
        if (t_idx < blockDim.x - 1)
        {
            q[q_off + ldq] = q_v_2;
        }
    }
    else
    {
        // One iteration remains; it must be processed separately
        if (t_idx == 0)
        {
            // Only one more EV element needs to be loaded
            q_s[1] = q[q_off];
        }

        // As before, we first read the EV and Householder components
        q_v_1 = q_s[t_idx + 1];
        hh_v_1 = __ldg(&hh[h_off]);
        tau_1 = __ldg(&hh_tau[j]);

        // We prepare the reduction buffer
        my_r1 = q_v_1 * hh_v_1 * tau_1;
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
352
        my_r1 += shfl_xor_real_double(my_r1, 1);
353
#else
Andreas Marek's avatar
Andreas Marek committed
354
        my_r1 += shfl_xor_real_single(my_r1, 1);
355 356 357
#endif
        dotp_s[t_s] = my_r1;

Andreas Marek's avatar
Andreas Marek committed
358
        sync_real_threads<HAVE_2_WARPS>();
359 360

        // We perform the reduction using the first warp only
361
	// attention
362 363 364
        if (t_idx < 32)
        {
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
365
            dotp_s[t_idx] = warp_reduce_real_double<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
366
#else
Andreas Marek's avatar
Andreas Marek committed
367
            dotp_s[t_idx] = warp_reduce_real_single<REDUCE_START_OFFSET>(dotp_s[t_idx] + dotp_s[t_idx + 32]);
368 369 370
#endif
        }

Andreas Marek's avatar
Andreas Marek committed
371
        sync_real_threads<HAVE_2_WARPS>();
372 373 374 375 376 377 378 379

        // The last EV components are written to the EV matrix
        q[q_off] = q_v_1 - hh_v_1 * dotp_s[0];
    }
}

// This is a host wrapper for calling the appropriate back-transformation kernel, based on the SCALAPACK block size
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
380
 extern "C" void launch_compute_hh_trafo_c_kernel_real_double(double * const q, const double * const hh, const double * const hh_dot,  const double * const hh_tau, const int nev, const int nb, const int ldq, const int off, const int ncols)
381
#else
Andreas Marek's avatar
Andreas Marek committed
382
 extern "C" void launch_compute_hh_trafo_c_kernel_real_single(float * const q, const float * const hh, const float * const hh_dot,  const float * const hh_tau, const int nev, const int nb, const int ldq, const int off, const int ncols)
383 384 385 386
#endif
{
    switch (nb)
    {
387
        // attention
388 389 390
        case 128:
        case 64:
#ifdef DOUBLE_PRECISION_REAL
391
            compute_hh_trafo_kernel_real_double<16, true><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
392
#else
393
            compute_hh_trafo_kernel_real_single<16, true><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
394 395 396 397 398
#endif
            break;

        case 32:
#ifdef DOUBLE_PRECISION_REAL
399
            compute_hh_trafo_kernel_real_double<8, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
400
#else
401
            compute_hh_trafo_kernel_real_single<8, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
402 403 404 405 406
#endif
            break;

        case 16:
#ifdef DOUBLE_PRECISION_REAL
407
            compute_hh_trafo_kernel_real_double<4, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
408
#else
409
            compute_hh_trafo_kernel_real_single<4, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
410 411 412 413 414
#endif
            break;

        case 8:
#ifdef DOUBLE_PRECISION_REAL
415
            compute_hh_trafo_kernel_real_double<2, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
416
#else
417
            compute_hh_trafo_kernel_real_single<2, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
418 419 420 421 422
#endif
            break;

        case 4:
#ifdef DOUBLE_PRECISION_REAL
423
            compute_hh_trafo_kernel_real_double<1, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
424
#else
425
            compute_hh_trafo_kernel_real_single<1, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
426 427 428 429 430 431
#endif
            break;

        case 2:
        case 1:
#ifdef DOUBLE_PRECISION_REAL
432
            compute_hh_trafo_kernel_real_double<0, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
433
#else
434
            compute_hh_trafo_kernel_real_single<0, false><<<nev, nb>>>(q, hh, hh_dot, hh_tau, nb, ldq, off, ncols);
435 436 437 438
#endif
            break;

        default:
439
            printf("Error: please use a power-of-2 SCALAPACK block size which is between 1 and BLOCK_CYCLIC_BLOCKSIZE .\n");
440 441 442
    }
}