From 751aedeba15a0018924c8baf4de90b11fc282d48 Mon Sep 17 00:00:00 2001
From: Repo Updater <noreply@mpcdf.mpg.de>
Date: Wed, 6 Oct 2021 07:08:16 +0200
Subject: [PATCH] 62e5e3b4 some adaptations of Diffusion and Cython

---
 notebooks/2b--Cython.ipynb    |  53 ++++++++------
 notebooks/2d--Diffusion.ipynb | 133 ++++++++++++++++++----------------
 2 files changed, 102 insertions(+), 84 deletions(-)

diff --git a/notebooks/2b--Cython.ipynb b/notebooks/2b--Cython.ipynb
index 0dc9768..647474c 100644
--- a/notebooks/2b--Cython.ipynb
+++ b/notebooks/2b--Cython.ipynb
@@ -11,7 +11,7 @@
     "# Cython\n",
     "**Python for HPC course**\n",
     "\n",
-    "Sebastian Ohlmann, Klaus Reuter\n",
+    "2018 - 2021 Sebastian Ohlmann, Klaus Reuter\n",
     "\n",
     "Max Planck Computing and Data Facility, Garching"
    ]
@@ -45,16 +45,16 @@
     }
    },
    "source": [
-    "### Cython Overview\n",
-    "* \"Cython is Python with C data types\"\n",
-    "    * Cython is a superset of the Python language\n",
-    "    * Cython is a source-to-source compiler\n",
+    "### Cython?\n",
     "\n",
+    "* Cython is Python extended with C data types $\\to$ Cython is a superset of the Python language\n",
+    "* Cython is a source-to-source compiler\n",
     "\n",
-    "### Cython Workflow\n",
-    "1.  move performance-critical code from Python to Cython (`.pyx`)\n",
-    "2.  Cython compiler translates `.pyx` code into `.c` code\n",
-    "3.  C code is finally compiled into a Python module (Linux `.so`) by a C compiler (e.g. `gcc`)\n"
+    "### Workflow\n",
+    "\n",
+    "1. create a Cython source file (`.pyx`), e.g. by moving performance-critical code from Python to Cython\n",
+    "2. run the Cython compiler, which translates the `.pyx` code into `.c` code\n",
+    "3. C code is finally compiled into a Python module (Linux `.so`) by a C compiler (e.g. `gcc`)\n"
    ]
   },
   {
@@ -65,8 +65,9 @@
     }
    },
    "source": [
-    "### Cython Advantages\n",
-    "* achieve performance close to native C/C++ or Fortran code\n",
+    "### Advantages\n",
+    "\n",
+    "* achieve performance close to native C/C++ or Fortran code while keeping Python-like code\n",
     "    * Cython code is compiled, not interpreted\n",
     "    * compiler optimizations can be applied (e.g. vectorization)\n",
     "    * OpenMP thread parallelization becomes possible\n",
@@ -82,9 +83,10 @@
     }
    },
    "source": [
-    "### Cython Compilation\n",
-    "* in principle, shell scripts or Makefiles can be used, but avoid this\n",
-    "* better: use a simple `setup.py` to compile your Cython code reliably\n",
+    "### Compilation\n",
+    "\n",
+    "* in principle, shell scripts or Makefiles can be used (but avoid this)\n",
+    "* better: use a simple `setup.py` script to compile and install your Cython code properly\n",
     "* see the simple example at `cython/hello_world`"
    ]
   },
@@ -157,10 +159,11 @@
     }
    },
    "source": [
-    "### Integration with Jupyter notebooks\n",
-    "* Cython code can be compiled and used directly from a Jupyter notebook (`%load_ext Cython`)\n",
-    "* use the cell magic `%%cython` to compile a Jupyter cell\n",
-    "* using `-c=` is is possible to specify compiler optimization flags"
+    "### Cython integration with Jupyter notebooks\n",
+    "\n",
+    "* Cython code can be compiled and used directly from a Jupyter notebook after loading (`%load_ext Cython`)\n",
+    "* use the cell magic `%%cython` to compile a Jupyter cell with Cython code\n",
+    "* use `-c=` to specify compiler optimization flags and `--link-args` to specify linker flags"
    ]
   },
   {
@@ -356,7 +359,7 @@
    "source": [
     "## Interfacing C/C++ code with Cython\n",
     "\n",
-    "### $\\rightarrow$ see `Interfacing_with_C_and_F.ipynb`"
+    "### $\\rightarrow$ continue with the `Interfacing_with_C_and_F.ipynb` notebook"
    ]
   },
   {
@@ -368,9 +371,10 @@
    },
    "source": [
     "## Cython summary\n",
+    "\n",
     "* Cython speeds up Python code by converting it into C and compiling it\n",
     "* Workflow\n",
-    "    * start with existing Python code, move it into `.pyx` file, create basic `setup.py`\n",
+    "    * start with existing (performance-critical) Python code, move it into a `.pyx` file, create a basic `setup.py`\n",
     "    * introduce basic type declarations, e.g. `cdef int a`\n",
     "    * introduce NumPy array declarations, e.g.  \n",
     "      `np.ndarray[np.float64_t, ndim=2] grid`\n",
@@ -380,6 +384,13 @@
     "    * http://cython.org/ for in-depth information, in particular\n",
     "    * http://cython.readthedocs.io/en/latest/src/tutorial/numpy.html"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -399,7 +410,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.3"
+   "version": "3.8.8"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/2d--Diffusion.ipynb b/notebooks/2d--Diffusion.ipynb
index c152ebe..6d75981 100644
--- a/notebooks/2d--Diffusion.ipynb
+++ b/notebooks/2d--Diffusion.ipynb
@@ -246,8 +246,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 192 ms, sys: 1.37 ms, total: 194 ms\n",
-      "Wall time: 192 ms\n"
+      "CPU times: user 192 ms, sys: 3.83 ms, total: 196 ms\n",
+      "Wall time: 195 ms\n"
      ]
     }
    ],
@@ -299,8 +299,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 4.32 ms, sys: 67 µs, total: 4.39 ms\n",
-      "Wall time: 3.41 ms\n"
+      "CPU times: user 4.39 ms, sys: 29 µs, total: 4.42 ms\n",
+      "Wall time: 3.51 ms\n"
      ]
     }
    ],
@@ -339,8 +339,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 2.9 ms, sys: 0 ns, total: 2.9 ms\n",
-      "Wall time: 2.38 ms\n"
+      "CPU times: user 4.59 ms, sys: 48 µs, total: 4.64 ms\n",
+      "Wall time: 3.64 ms\n"
      ]
     }
    ],
@@ -362,8 +362,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 14.3 s, sys: 4.85 s, total: 19.1 s\n",
-      "Wall time: 19.1 s\n"
+      "CPU times: user 13.7 s, sys: 5.24 s, total: 19 s\n",
+      "Wall time: 19 s\n"
      ]
     }
    ],
@@ -481,8 +481,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 12.1 s, sys: 3.87 ms, total: 12.1 s\n",
-      "Wall time: 12.1 s\n"
+      "CPU times: user 12.2 s, sys: 5.63 ms, total: 12.2 s\n",
+      "Wall time: 12.2 s\n"
      ]
     }
    ],
@@ -553,8 +553,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 12 s, sys: 7.75 ms, total: 12 s\n",
-      "Wall time: 12 s\n"
+      "CPU times: user 12.5 s, sys: 18.8 ms, total: 12.5 s\n",
+      "Wall time: 12.5 s\n"
      ]
     }
    ],
@@ -633,8 +633,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 1.72 ms, sys: 210 µs, total: 1.93 ms\n",
-      "Wall time: 1.93 ms\n"
+      "CPU times: user 694 µs, sys: 90 µs, total: 784 µs\n",
+      "Wall time: 788 µs\n"
      ]
     }
    ],
@@ -656,8 +656,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 297 µs, sys: 36 µs, total: 333 µs\n",
-      "Wall time: 243 µs\n"
+      "CPU times: user 228 µs, sys: 19 µs, total: 247 µs\n",
+      "Wall time: 185 µs\n"
      ]
     }
    ],
@@ -698,8 +698,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 9.91 s, sys: 11.8 ms, total: 9.92 s\n",
-      "Wall time: 9.92 s\n"
+      "CPU times: user 10.1 s, sys: 19.3 ms, total: 10.1 s\n",
+      "Wall time: 10.1 s\n"
      ]
     }
    ],
@@ -770,8 +770,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 16.5 s, sys: 3.72 ms, total: 16.5 s\n",
-      "Wall time: 16.5 s\n"
+      "CPU times: user 16.1 s, sys: 7.69 ms, total: 16.1 s\n",
+      "Wall time: 16.1 s\n"
      ]
     }
    ],
@@ -891,8 +891,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 2.48 ms, sys: 183 µs, total: 2.66 ms\n",
-      "Wall time: 2.05 ms\n"
+      "CPU times: user 2.58 ms, sys: 0 ns, total: 2.58 ms\n",
+      "Wall time: 2.03 ms\n"
      ]
     }
    ],
@@ -4726,8 +4726,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 2.12 s, sys: 368 µs, total: 2.12 s\n",
-      "Wall time: 2.12 s\n"
+      "CPU times: user 1.93 s, sys: 4 ms, total: 1.93 s\n",
+      "Wall time: 1.93 s\n"
      ]
     }
    ],
@@ -4831,8 +4831,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 3.37 s, sys: 22 µs, total: 3.37 s\n",
-      "Wall time: 3.37 s\n"
+      "CPU times: user 3.36 s, sys: 54 µs, total: 3.37 s\n",
+      "Wall time: 3.36 s\n"
      ]
     }
    ],
@@ -4878,7 +4878,7 @@
      "output_type": "stream",
      "text": [
       "rm -f diff_mpi.exe diff.exe *.o *.mod\n",
-      "gfortran -O3 -march=native -fopt-info-vec -fopenmp -fno-strict-aliasing -o diff.exe diff.F90\n",
+      "f77 -O3 -march=native -fopt-info-vec -fopenmp -o diff.exe diff.F90\n",
       "diff.F90:30:0: optimized: loop vectorized using 32 byte vectors\n",
       "diff.F90:30:0: optimized:  loop versioned for vectorization because of possible aliasing\n",
       "diff.F90:19:0: optimized: loop vectorized using 32 byte vectors\n",
@@ -4931,7 +4931,7 @@
       "          90 %\n",
       "          95 %\n",
       "         100 %\n",
-      " main loop time =   1.2827631459999793     \n"
+      " main loop time =   1.2748106609797105     \n"
      ]
     }
    ],
@@ -5046,8 +5046,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 2.19 s, sys: 7.93 ms, total: 2.2 s\n",
-      "Wall time: 1.11 s\n"
+      "CPU times: user 2.07 s, sys: 434 µs, total: 2.07 s\n",
+      "Wall time: 1.04 s\n"
      ]
     }
    ],
@@ -5096,7 +5096,7 @@
       "          90 %\n",
       "          95 %\n",
       "         100 %\n",
-      " main loop time =   1.2201892590001080     \n"
+      " main loop time =   1.1477611069567502     \n"
      ]
     }
    ],
@@ -5130,7 +5130,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
+   "execution_count": 63,
    "metadata": {
     "slideshow": {
      "slide_type": "-"
@@ -5143,7 +5143,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 64,
    "metadata": {
     "slideshow": {
      "slide_type": "subslide"
@@ -5151,7 +5151,7 @@
    },
    "outputs": [],
    "source": [
-    "@jit(nopython=True, fastmath=True)\n",
+    "@jit(nopython=True)\n",
     "def apply_periodic_bc_python_numba(grid, n_points):\n",
     "    \"\"\"Explicitly apply periodic boundary conditions, via Python loops.\"\"\"\n",
     "    for j in range(n_points + 2):\n",
@@ -5164,7 +5164,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
+   "execution_count": 65,
    "metadata": {
     "slideshow": {
      "slide_type": "-"
@@ -5172,7 +5172,7 @@
    },
    "outputs": [],
    "source": [
-    "@jit(nopython=True, fastmath=True)  # parallel=True turns out to slow down the code a bit\n",
+    "@jit(nopython=True)\n",
     "def evolve_python_numba(grid, grid_tmp, n_points, dt, D):\n",
     "    apply_periodic_bc_python_numba(grid, n_points)\n",
     "    for i in range(1, n_points+1):\n",
@@ -5185,7 +5185,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
+   "execution_count": 66,
    "metadata": {
     "scrolled": true,
     "slideshow": {
@@ -5200,7 +5200,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 67,
    "metadata": {
     "scrolled": true,
     "slideshow": {
@@ -5212,8 +5212,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 3.91 s, sys: 3.84 ms, total: 3.91 s\n",
-      "Wall time: 3.91 s\n"
+      "CPU times: user 3.69 s, sys: 271 µs, total: 3.69 s\n",
+      "Wall time: 3.7 s\n"
      ]
     }
    ],
@@ -5225,7 +5225,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 51,
+   "execution_count": 68,
    "metadata": {
     "slideshow": {
      "slide_type": "subslide"
@@ -5233,7 +5233,7 @@
    },
    "outputs": [],
    "source": [
-    "@jit(nopython=True, fastmath=True, parallel=True)\n",
+    "@jit(nopython=True, parallel=True)\n",
     "def evolve_python_numba_parallel(grid, grid_tmp, n_points, dt, D):\n",
     "    apply_periodic_bc_python_numba(grid, n_points)\n",
     "    for i in prange(1, n_points+1):\n",
@@ -5246,7 +5246,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 52,
+   "execution_count": 69,
    "metadata": {
     "slideshow": {
      "slide_type": "fragment"
@@ -5260,7 +5260,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": 70,
    "metadata": {
     "slideshow": {
      "slide_type": "-"
@@ -5271,8 +5271,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 3.98 s, sys: 126 ms, total: 4.11 s\n",
-      "Wall time: 1.96 s\n"
+      "CPU times: user 3.76 s, sys: 95.6 ms, total: 3.85 s\n",
+      "Wall time: 1.84 s\n"
      ]
     }
    ],
@@ -5284,7 +5284,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": 71,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
@@ -5326,7 +5326,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 55,
+   "execution_count": 72,
    "metadata": {
     "slideshow": {
      "slide_type": "subslide"
@@ -5334,7 +5334,7 @@
    },
    "outputs": [],
    "source": [
-    "@jit(nopython=True, fastmath=True, parallel=True)\n",
+    "@jit(nopython=True, parallel=True)\n",
     "def apply_periodic_bc_numba(grid, n_points):\n",
     "    \"\"\"Explicitly apply periodic boundary conditions, using NumPy ranges.\"\"\"\n",
     "    grid[ 0, :] = grid[-2, :]\n",
@@ -5345,7 +5345,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": 73,
    "metadata": {
     "slideshow": {
      "slide_type": "fragment"
@@ -5353,7 +5353,7 @@
    },
    "outputs": [],
    "source": [
-    "@jit(nopython=True, fastmath=True)\n",
+    "@jit(nopython=True)\n",
     "def evolve_np_slicing_numba(grid, grid_tmp, n_points, dt, D):\n",
     "    \"\"\"Time step based on an explicitly coded Laplacian using array slicing.\"\"\"\n",
     "    apply_periodic_bc_numba(grid, n_points)\n",
@@ -5365,7 +5365,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 57,
+   "execution_count": 74,
    "metadata": {
     "slideshow": {
      "slide_type": "fragment"
@@ -5379,7 +5379,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": 75,
    "metadata": {
     "slideshow": {
      "slide_type": "fragment"
@@ -5390,8 +5390,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 9.37 s, sys: 342 ms, total: 9.71 s\n",
-      "Wall time: 8.5 s\n"
+      "CPU times: user 9.02 s, sys: 267 ms, total: 9.29 s\n",
+      "Wall time: 7.97 s\n"
      ]
     }
    ],
@@ -5403,7 +5403,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 59,
+   "execution_count": 76,
    "metadata": {
     "slideshow": {
      "slide_type": "subslide"
@@ -5411,7 +5411,7 @@
    },
    "outputs": [],
    "source": [
-    "@jit(nopython=True, fastmath=True, parallel=True)\n",
+    "@jit(nopython=True, parallel=True)\n",
     "def evolve_np_slicing_numba_parallel(grid, grid_tmp, n_points, dt, D):\n",
     "    \"\"\"Time step based on an explicitly coded Laplacian using array slicing.\"\"\"\n",
     "    apply_periodic_bc_numba(grid, n_points)\n",
@@ -5423,7 +5423,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": 77,
    "metadata": {
     "slideshow": {
      "slide_type": "fragment"
@@ -5437,7 +5437,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 61,
+   "execution_count": 78,
    "metadata": {
     "slideshow": {
      "slide_type": "fragment"
@@ -5448,8 +5448,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 4.75 s, sys: 423 ms, total: 5.17 s\n",
-      "Wall time: 2.03 s\n"
+      "CPU times: user 4.06 s, sys: 425 ms, total: 4.49 s\n",
+      "Wall time: 1.75 s\n"
      ]
     }
    ],
@@ -5461,7 +5461,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": 79,
    "metadata": {
     "slideshow": {
      "slide_type": "skip"
@@ -5515,8 +5515,15 @@
     "* due to knowledge about the structure of the stencil, the code can be highly optimized and may outperform cython or Numba\n",
     "* install via conda or pip, find more information at  \n",
     "  https://i10git.cs.fau.de/pycodegen/pystencils\n",
-    "* optional exercise: implement the computation using pystencils"
+    "* optional exercise: implement the diffusion computation using pystencils"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
-- 
GitLab