Commit 45a9b338 authored by Andreas Marek's avatar Andreas Marek

Merge branch 'master_fix_up' into ELPA_KNL

parents b6e1b918 51003ba0
......@@ -963,8 +963,7 @@ EXTRA_DIST = \
src/elpa_qr/elpa_qrkernels.X90 \
src/ev_tridi_band_gpu_c_v2_complex_template.Xcu \
src/ev_tridi_band_gpu_c_v2_real_template.Xcu \
src/cuUtils_complex_template.Xcu \
src/cuUtils_real_template.Xcu \
src/cuUtils_template.Xcu \
nvcc_wrap \
test_project/Makefile.am \
test_project/autogen.sh \
......
......@@ -51,26 +51,30 @@
#include "config-f90.h"
// The real part
#define REALCASE 1
#undef COMPLEXCASE
#define DOUBLE_PRECISION_REAL 1
#include "cuUtils_real_template.Xcu"
#include "cuUtils_template.Xcu"
#undef DOUBLE_PRECISION_REAL
#if WANT_SINGLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
#include "cuUtils_real_template.Xcu"
#include "cuUtils_template.Xcu"
#endif
// The complex part
#define COMPLEXCASE 1
#undef REALCASE
#define DOUBLE_PRECISION_COMPLEX 1
#include "cuUtils_complex_template.Xcu"
#include "cuUtils_template.Xcu"
#undef DOUBLE_PRECISION_COMPLEX
#if WANT_SINGLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_COMPLEX
#include "cuUtils_complex_template.Xcu"
#include "cuUtils_template.Xcu"
#endif
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file was originally written by NVIDIA
// and re-written by A. Marek, MPCDF
#include <cuda_runtime.h>
#include <stdlib.h>
#include <stdio.h>
// Reset a reduction block
// Limitation: the thread-block size must be a divider of the reduction block's size
#ifdef DOUBLE_PRECISION_REAL
__device__ void reset_shared_block_c_double ( double * s_block, int b_size)
#else
__device__ void reset_shared_block_c_single ( float * s_block, int b_size)
#endif
{
int i, t_idx, s_chunk ;
t_idx = threadIdx.x;
s_chunk = b_size / blockDim.x;
for(i = ((t_idx - 1) * s_chunk + 1) ; i < (t_idx * s_chunk); i++)
s_block[i] = 0.0 ;
__syncthreads();
}
// Reset 2 reduction blocks without an explicit synchronization at the end
// Limitation: : the thread-block size must be a divider of the reduction block's size
#ifdef DOUBLE_PRECISION_REAL
__device__ void reset_shared_block_pair_c_double( double *s_block_1, double *s_block_2, int b_size)
#else
__device__ void reset_shared_block_pair_c_single( float *s_block_1, float *s_block_2, int b_size)
#endif
{
int i, t_idx, s_chunk;
t_idx = threadIdx.x;
s_chunk = b_size / blockDim.x;
for(i = ((t_idx - 1) * s_chunk + 1); i < (t_idx * s_chunk); i++)
{ s_block_1[i] = 0.0 ;
s_block_2[i] = 0.0 ;
}
}
// Reset a reduction block
// Limitation: the thread-block size must be a divider of the reduction block's size
#ifdef DOUBLE_PRECISION_REAL
__device__ void warp_reduce_c_double( double *s_block)
#else
__device__ void warp_reduce_c_single( float *s_block)
#endif
{
int t_idx ;
t_idx = threadIdx.x;
__syncthreads();
if (t_idx < 32)
{
s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 32] + s_block[t_idx + 64] + s_block[t_idx + 96] ;
if (t_idx < 8)
s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 8] + s_block[t_idx + 16] + s_block[t_idx + 24];
if (t_idx < 4)
s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 4];
if (t_idx < 1)
s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 1] + s_block[t_idx + 2] + s_block[t_idx + 3];
}
}
#ifdef DOUBLE_PRECISION_REAL
__global__ void my_pack_c_kernel_double(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double* src, double* dst)
#else
__global__ void my_pack_c_kernel_single(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, float* src, float* dst)
#endif
{
int b_id, t_id ;
int dst_ind ;
b_id = blockIdx.y;
t_id = threadIdx.x;
dst_ind = b_id * stripe_width + t_id;
if (dst_ind < max_idx)
{
// dimension of dst - lnev, nblk
// dimension of src - stripe_width,a_dim2,stripe_count
*(dst + dst_ind + (l_nev*blockIdx.x) ) = *(src + t_id + (stripe_width*(n_offset + blockIdx.x)) + ( b_id *stripe_width*a_dim2 ));
}
}
#ifdef DOUBLE_PRECISION_REAL
__global__ void my_unpack_c_kernel_double( const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double* src, double* dst)
#else
__global__ void my_unpack_c_kernel_single( const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, float* src, float* dst)
#endif
{
int b_id, t_id ;
int src_ind;
b_id = blockIdx.y;
t_id = threadIdx.x;
src_ind = b_id * stripe_width + t_id;
if (src_ind < max_idx)
*(dst + (t_id + ((n_offset + blockIdx.x) * stripe_width) + (b_id * stripe_width * a_dim2 ))) = *(src + src_ind + (blockIdx.x) *l_nev );
}
#ifdef DOUBLE_PRECISION_REAL
__global__ void extract_hh_tau_c_kernel_double(double* hh, double* hh_tau, const int nbw, const int n, int val)
#else
__global__ void extract_hh_tau_c_kernel_single(float* hh, float* hh_tau, const int nbw, const int n, int val)
#endif
{
int h_idx ;
h_idx = (blockIdx.x) * blockDim.x + threadIdx.x;
if (h_idx < n)
{
//dimension of hh - (nbw, max_blk_size)
//dimension of hh_tau - max_blk_size
*(hh_tau + h_idx ) = *(hh + (h_idx * nbw)) ;
// Replace the first element in the HH reflector with 1.0 or 0.0
if( val == 0)
*(hh + (h_idx * nbw)) = 1.0;
else
*(hh + (h_idx * nbw)) = 0.0;
}
}
#ifdef DOUBLE_PRECISION_REAL
__global__ void compute_hh_dotp_c_kernel_double(double* hh, double* v_dot, const int nbw, const int n)
{
__shared__ double hh_s[128] ;
#else
__global__ void compute_hh_dotp_c_kernel_single(float* hh, float* v_dot, const int nbw, const int n)
{
__shared__ float hh_s[128] ;
#endif
int t_idx, v_idx;
// The vector index (v_idx) identifies the pair of HH reflectors from which the dot product is computed
v_idx = blockIdx.x ;
// The thread index indicates the position within the two HH reflectors
t_idx = threadIdx.x ;
// // The contents of the shared memory must be fully reset
// reset_shared_block_c(hh_s, 128);
// Initialize the contents of the shared buffer (preparing for reduction)
if (t_idx > 0)
*(hh_s + t_idx) = *(hh + t_idx + v_idx * nbw ) * (*(hh + (t_idx - 1) + (v_idx +1)* nbw)) ;
else
*(hh_s + t_idx) = 0.0 ;
// Compute the dot product using a fast reduction
#ifdef DOUBLE_PRECISION_REAL
warp_reduce_c_double(hh_s);
#else
warp_reduce_c_single(hh_s);
#endif
if(t_idx == 0)
*(v_dot + v_idx) = *(hh_s) ;
}
#ifdef DOUBLE_PRECISION_REAL
extern "C" void launch_my_pack_c_kernel_double(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double* a_dev, double* row_group_dev)
#else
extern "C" void launch_my_pack_c_kernel_single(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, float* a_dev, float* row_group_dev)
#endif
{
dim3 grid_size;
grid_size = dim3(row_count, stripe_count, 1);
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) printf("error prior to mypack kernel: %s, %d\n",cudaGetErrorString(err), err);
#ifdef DOUBLE_PRECISION_REAL
my_pack_c_kernel_double<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev);
#else
my_pack_c_kernel_single<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev);
#endif
err = cudaGetLastError();
if ( err!= cudaSuccess)
{
printf("\n my pack_kernel failed %s \n",cudaGetErrorString(err) );
}
}
#ifdef DOUBLE_PRECISION_REAL
extern "C" void launch_compute_hh_dotp_c_kernel_double(double* bcast_buffer_dev, double* hh_dot_dev,const int nbw,const int n)
#else
extern "C" void launch_compute_hh_dotp_c_kernel_single(float* bcast_buffer_dev, float* hh_dot_dev,const int nbw,const int n)
#endif
{
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) printf("error prior to compute_hh kernel: %s, %d\n",cudaGetErrorString(err), err);
#ifdef DOUBLE_PRECISION_REAL
compute_hh_dotp_c_kernel_double<<< n-1, nbw >>>(bcast_buffer_dev, hh_dot_dev, nbw, n);
#else
compute_hh_dotp_c_kernel_single<<< n-1, nbw >>>(bcast_buffer_dev, hh_dot_dev, nbw, n);
#endif
err = cudaGetLastError();
if ( err!= cudaSuccess)
{
printf("\n compute _kernel failed %s \n",cudaGetErrorString(err) );
}
}
#ifdef DOUBLE_PRECISION_REAL
extern "C" void launch_extract_hh_tau_c_kernel_double(double* bcast_buffer_dev, double* hh_tau_dev, const int nbw, const int n , const int is_zero)
#else
extern "C" void launch_extract_hh_tau_c_kernel_single(float* bcast_buffer_dev, float* hh_tau_dev, const int nbw, const int n , const int is_zero)
#endif
{
int grid_size;
grid_size = 1 + (n - 1) / 256;
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) printf("error prior to extract kernel: %s, %d\n",cudaGetErrorString(err), err);
#ifdef DOUBLE_PRECISION_REAL
extract_hh_tau_c_kernel_double<<<grid_size,256>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
#else
extract_hh_tau_c_kernel_single<<<grid_size,256>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
#endif
err = cudaGetLastError();
if ( err!= cudaSuccess)
{
printf("\n extract _kernel failed %s \n",cudaGetErrorString(err) );
}
}
#ifdef DOUBLE_PRECISION_REAL
extern "C" void launch_my_unpack_c_kernel_double( const int row_count, const int n_offset, const int max_idx, const int stripe_width,const int a_dim2, const int stripe_count, const int l_nev, double* row_group_dev, double* a_dev)
#else
extern "C" void launch_my_unpack_c_kernel_single( const int row_count, const int n_offset, const int max_idx, const int stripe_width,const int a_dim2, const int stripe_count, const int l_nev, float* row_group_dev, float* a_dev)
#endif
{
dim3 grid_size;
grid_size = dim3(row_count, stripe_count, 1);
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) printf("error prior to unpack kernel: %s, %d\n",cudaGetErrorString(err), err);
#ifdef DOUBLE_PRECISION_REAL
my_unpack_c_kernel_double<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, row_group_dev , a_dev);
#else
my_unpack_c_kernel_single<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, row_group_dev , a_dev);
#endif
err = cudaGetLastError();
if ( err != cudaSuccess)
{
printf("\n my_unpack_c_kernel failed %s \n",cudaGetErrorString(err) );
}
}
#ifndef MEMCPY_ALREADY_DEFINED
extern "C" int cuda_MemcpyDeviceToDevice(int val)
{
val = cudaMemcpyDeviceToDevice;
return val;
}
#define MEMCPY_ALREADY_DEFINED 1
#endif
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment