Commit 411e07e1 authored by Max Lindqvist

Fixed maxlinpc_gpu_check.sh

parent 995e2c16
@@ -47,50 +47,55 @@ cd $FOLDER_NAME
# ------------ HIP ------------ #
echo "=== Compiling hello_hip_gpu ==="
#hipcc -o hello_hip_gpu hello_hip_gpu.cpp
# hipcc -o cpp/hello_hip_gpu cpp/hello_hip_gpu.cpp
echo "=== Running ./hello_hip_gpu ==="
#./hello_hip_gpu > hello_hip_gpu.out
# ./cpp/hello_hip_gpu > cpp/hello_hip_gpu.out
# ------------ CUDA ------------ #
echo "=== Compiling hello_cuda_gpu ==="
nvcc -o hello_cuda_gpu hello_cuda_gpu.c
nvcc -o cuda/hello_cuda cuda/hello.cu
echo "=== Running ./hello_cuda_gpu ==="
./hello_cuda_gpu > hello_cuda_gpu.out
echo "=== Running ./cuda/hello_cuda_gpu ==="
./cuda/hello_cuda > cuda/hello.out
# ------------ OpenMP ------------ #
echo "=== Compiling hello_openmp_gpu_c ==="
# nvcc -Xcompiler -fopenmp hello_openmp_gpu.c -o hello_openmp_gpu
clang -fopenmp -fopenmp-targets=nvptx64 -L/usr/lib/llvm-18/lib -Wl,-rpath,/usr/lib/llvm-18/lib -o hello_openmp_gpu_c hello_openmp_gpu_c.c
clang -fopenmp -fopenmp-targets=nvptx64 -L/usr/lib/llvm-18/lib -Wl,-rpath,/usr/lib/llvm-18/lib -o C/hello_openmp_gpu C/hello_openmp_gpu.c
clang -fopenmp -fopenmp-targets=nvptx64 -L/usr/lib/llvm-18/lib -Wl,-rpath,/usr/lib/llvm-18/lib -o C/matmul_openmp_gpu C/matmul_openmp_gpu.c
echo "=== Running ./hello_openmp_gpu_c ==="
./hello_openmp_gpu_c > hello_openmp_gpu_c.out
./C/hello_openmp_gpu > C/hello_openmp_gpu.out
# ./C/matmul_openmp_gpu > ./C/matmul_openmp_gpu.out
echo "=== Compiling hello_openmp_gpu_fortran ==="
nvfortran -gpu=cc75 -O3 -fast -acc -mp=gpu -o hello_openmp_gpu_fortran hello_openmp_gpu_fortran.f90
nvfortran -gpu=cc75 -O3 -fast -acc -mp=gpu -o fortran/hello_openmp_gpu_fortran fortran/hello_openmp_gpu_fortran.f90
echo "=== Running ./fortran/hello_openmp_gpu_fortran ==="
./fortran/hello_openmp_gpu_fortran > fortran/hello_openmp_gpu_fortran.out
echo "=== Running ./hello_openmp_gpu_fortran ==="
./hello_openmp_gpu_fortran > hello_openmp_gpu_fortran.out
# # ------------ OpenMP + pyccel ------------ #
export LD_LIBRARY_PATH="/usr/lib/llvm-18/lib:$LD_LIBRARY_PATH"
echo "=== Compiling hello_openmp_gpu_pyccel ==="
pyccel --language c --openmp --compiler compiler_clang_maxlinpc.json --verbose hello_openmp_gpu_pyccel.py
#pyccel --language c --openmp --compiler compiler_clang_maxlinpc.json --verbose hello_openmp_gpu_pyccel.py
#pyccel --language fortran --openmp --compiler compiler_nvfortran_maxlinpc.json --verbose hello_openmp_gpu_pyccel.py
echo "=== Running ./hello_openmp_gpu_pyccel ==="
nsys profile ./hello_openmp_gpu_pyccel > hello_openmp_gpu_pyccel.out
#echo "=== Running ./hello_openmp_gpu_pyccel ==="
#nsys profile ./hello_openmp_gpu_pyccel > hello_openmp_gpu_pyccel.out
echo "=== Compiling pyccel_kernels ==="
pyccel --language fortran --openmp --compiler compiler_nvfortran_maxlinpc.json --verbose pyccel_kernels.py
echo "=== Running compute_gpu_pyccel.py ==="
#echo "=== Compiling pyccel_kernels ==="
#pyccel --language fortran --openmp --compiler compiler_nvfortran_maxlinpc.json --verbose pyccel_kernels.py
#echo "=== Running compute_gpu_pyccel.py ==="
# python compute_gpu_pyccel.py
nsys profile --stats=true python compute_gpu_pyccel.py
#nsys profile --stats=true python compute_gpu_pyccel.py
# ------------ Cleanup ------------ #
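For reference, the hello_openmp_gpu_pyccel.py program compiled above is not shown in this diff. A minimal sketch of such a check, assuming only the pyccel.stdlib.internal.openmp bindings and the #$ omp directive style used in pyccel_kernels below, could look like this (illustrative only, not the repository file):

# Illustrative sketch only -- not the repository's hello_openmp_gpu_pyccel.py.
from pyccel.stdlib.internal.openmp import omp_get_num_devices

def hello_gpu(x: 'float[:]'):
    n: int = x.shape[0]
    # Report how many offload devices the OpenMP runtime can see.
    print("Number of available GPUs: ", omp_get_num_devices())
    # Trivial offloaded loop, same directive style as the kernels in this commit.
    #$ omp target teams distribute parallel for schedule(static)
    for i in range(n):
        x[i] = 2.0 * x[i]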
@@ -16,7 +16,8 @@ all: $(OUTPUT_SO)
# Rule to run pyccel and generate the shared library
$(OUTPUT_SO): $(PYTHON_FILE)
	$(PYCCEL) --language $(LANGUAGE) --openmp --compiler $(COMPILER) --libdir $(LIBDIR) --verbose $(PYTHON_FILE)
	# $(PYCCEL) --language $(LANGUAGE) --openmp --compiler $(COMPILER) --libdir $(LIBDIR) --verbose $(PYTHON_FILE)
	$(PYCCEL) --language $(LANGUAGE) --openmp --compiler $(COMPILER) --verbose $(PYTHON_FILE)
#--------------------------------------
@@ -10,86 +10,86 @@ from pyccel.stdlib.internal.openmp import (
def set_pi(pi: float) -> None:
    pi = 3.14159
# def print_cpu_gpu_thread_info(
#     N : int,
#     data : 'int[:,:]',
#     ):
#     # Get the number of available GPU devices.
#     num_devices = omp_get_num_devices()
#     print("Number of available GPUs: ", num_devices)
#     temp : int = 0
#     # data[:,:] = np.empty((N, 4), dtype=int)
#     # CPU
#     print('Loop with CPU:')
#     #$ omp parallel for
#     for i in range(N):
#         tid = omp_get_thread_num() # Thread id within the team.
#         nthreads = omp_get_num_threads() # Number of threads.
#         team = omp_get_team_num() # Team number.
#         nteams = omp_get_num_teams()
def print_cpu_gpu_thread_info(
    N : int,
    data : 'int[:,:]',
    ):
    # Get the number of available GPU devices.
    num_devices = omp_get_num_devices()
    print("Number of available GPUs: ", num_devices)
    temp : int = 0
    # data[:,:] = np.empty((N, 4), dtype=int)
    # CPU
    print('Loop with CPU:')
    #$ omp parallel for
    for i in range(N):
        tid = omp_get_thread_num() # Thread id within the team.
        nthreads = omp_get_num_threads() # Number of threads.
        team = omp_get_team_num() # Team number.
        nteams = omp_get_num_teams()
        data[i, 0] = team
        data[i, 1] = nteams
        data[i, 2] = tid
        data[i, 3] = nthreads
        temp = temp + 1
    for i in range(N):
        print("CPU: Iteration", i, "processed by team", data[i, 0], "/", data[i, 1], ", thread", data[i, 2], "/", data[i, 3])
    print('----------------------------------\n\n\n')
    # GPU
    print('Loop with GPU:')
    #
    #$ omp target teams distribute parallel for schedule(static) private(temp)
    for i in range(N):
        tid = omp_get_thread_num() # Thread id within the team.
        nthreads = omp_get_num_threads() # Number of threads.
        team = omp_get_team_num() # Team number.
        nteams = omp_get_num_teams()
        # print("GPU: Iteration", i, "processed by team", team, "/", nteams, "thread", tid, "/", nthreads, "threads")
        # data[i, 0] = team
        # data[i, 1] = nteams
        # data[i, 2] = tid
        # data[i, 3] = nthreads
        # temp = temp + 1
    # for i in range(N):
    #     print("CPU: Iteration", i, "processed by team", data[i, 0], "/", data[i, 1], ", thread", data[i, 2], "/", data[i, 3])
    # print('----------------------------------\n\n\n')
#     # GPU
#     print('Loop with GPU:')
#     #
#     #$ omp target teams distribute parallel for schedule(static) private(temp)
#     for i in range(N):
#         tid = omp_get_thread_num() # Thread id within the team.
#         nthreads = omp_get_num_threads() # Number of threads.
#         team = omp_get_team_num() # Team number.
#         nteams = omp_get_num_teams()
#         # print("GPU: Iteration", i, "processed by team", team, "/", nteams, "thread", tid, "/", nthreads, "threads")
#         # data[i, 0] = team
#         # data[i, 1] = nteams
#         # data[i, 2] = tid
#         # data[i, 3] = nthreads
#         # temp = temp + 1
#     # for i in range(N):
#     #     print("GPU: Iteration", i, "processed by team", data[i, 0], "/", data[i, 1], ", thread", data[i, 2], "/", data[i, 3])
#     print("GPU: Iteration", i, "processed by team", data[i, 0], "/", data[i, 1], ", thread", data[i, 2], "/", data[i, 3])
def axpy(a: 'float', x: 'float[:]', y: 'float[:]'):
    N: int = x.shape[0]
    for i in range(N):
        y[i] = a * x[i] + y[i]
# def axpy_gpu(a: float, x: 'float[:]', y: 'float[:]'):
#     N: int = x.shape[0]
#     #$ omp target teams distribute parallel for schedule(static)
#     for i in range(N):
#         y[i] = a * x[i] + y[i]
def axpy_gpu(a: float, x: 'float[:]', y: 'float[:]'):
    N: int = x.shape[0]
    #$ omp target teams distribute parallel for schedule(static)
    for i in range(N):
        y[i] = a * x[i] + y[i]
# def heavy_compute_cpu(x: 'float[:]', y: 'float[:]'):
#     N: int = x.shape[0]
#     temp: float = 0.0
#     for i in range(N):
#         temp = x[i]
#         # A heavy inner loop to increase arithmetic intensity
#         for j in range(1000):
#             temp = np.sqrt(temp + 1.0)
#         y[i] = temp
# # GPU version: offloaded using OpenMP target directive
# def heavy_compute_gpu(x: 'float[:]', y: 'float[:]'):
#     N: int = x.shape[0]
#     temp: float = 0.0
#     #$ omp target teams distribute parallel for schedule(static)
#     for i in range(N):
#         temp = x[i]
#         for j in range(1000):
#             temp = np.sqrt(temp + 1.0)
#         y[i] = temp
def heavy_compute_cpu(x: 'float[:]', y: 'float[:]'):
    N: int = x.shape[0]
    temp: float = 0.0
    for i in range(N):
        temp = x[i]
        # A heavy inner loop to increase arithmetic intensity
        for j in range(1000):
            temp = np.sqrt(temp + 1.0)
        y[i] = temp
# GPU version: offloaded using OpenMP target directive
def heavy_compute_gpu(x: 'float[:]', y: 'float[:]'):
    N: int = x.shape[0]
    temp: float = 0.0
    #$ omp target teams distribute parallel for schedule(static)
    for i in range(N):
        temp = x[i]
        for j in range(1000):
            temp = np.sqrt(temp + 1.0)
        y[i] = temp
def matmul_cpu(A: 'float[:,:]', B: 'float[:,:]', C: 'float[:,:]'):
    N: int = A.shape[0]
@@ -101,16 +101,16 @@ def matmul_cpu(A: 'float[:,:]', B: 'float[:,:]', C: 'float[:,:]'):
                s += A[i, k] * B[k, j]
            C[i, j] = s
# def matmul_gpu(A: 'float[:,:]', B: 'float[:,:]', C: 'float[:,:]'):
#     N: int = A.shape[0]
#     s: float = 0.0
#     #$ omp target teams distribute parallel for collapse(2) map(tofrom:C)
#     for i in range(N):
#         for j in range(N):
#             s = 0.0
#             for k in range(N):
#                 s += A[i, k] * B[k, j]
#             C[i, j] = s
def matmul_gpu(A: 'float[:,:]', B: 'float[:,:]', C: 'float[:,:]'):
    N: int = A.shape[0]
    s: float = 0.0
    #$ omp target teams distribute parallel for collapse(2) map(tofrom:C)
    for i in range(N):
        for j in range(N):
            s = 0.0
            for k in range(N):
                s += A[i, k] * B[k, j]
            C[i, j] = s
def test_matmul():
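The driver compute_gpu_pyccel.py profiled by the script is not part of this diff either. A hypothetical sketch of how the kernels above might be exercised once pyccel has compiled pyccel_kernels.py (the import name and problem sizes are assumptions):

# Hypothetical driver sketch -- compute_gpu_pyccel.py itself is not shown in this diff.
import numpy as np
from pyccel_kernels import axpy, axpy_gpu, matmul_cpu, matmul_gpu  # assumes the pyccel build succeeded

n = 1_000_000
a = 2.5
x = np.random.rand(n)
y_cpu = np.random.rand(n)
y_gpu = y_cpu.copy()

axpy(a, x, y_cpu)      # plain CPU loop
axpy_gpu(a, x, y_gpu)  # OpenMP target offload
assert np.allclose(y_cpu, y_gpu)

m = 256
A = np.random.rand(m, m)
B = np.random.rand(m, m)
C_cpu = np.zeros((m, m))
C_gpu = np.zeros((m, m))
matmul_cpu(A, B, C_cpu)
matmul_gpu(A, B, C_gpu)
assert np.allclose(C_cpu, C_gpu)
print("CPU and GPU kernels agree")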