From 22ae6ac2ad0b003ebd1c627146b9e7539b0abd18 Mon Sep 17 00:00:00 2001
From: Haoyu Yang <yanghaoyu97@outlook.com>
Date: Mon, 7 Apr 2025 11:07:48 +0000
Subject: [PATCH] Resolve "Optimize APP startup time by lazily import some packages"

---
 nomad/atomutils.py                            |  4 +-
 nomad/config/__init__.py                      |  2 +-
 nomad/datamodel/datamodel.py                  | 14 ++--
 nomad/datamodel/metainfo/eln/__init__.py      | 37 +++++++--
 nomad/datamodel/metainfo/eln/eqe_parser.py    |  5 +-
 nomad/datamodel/metainfo/plot.py              |  9 ++-
 .../datamodel/metainfo/simulation/workflow.py | 55 +++++++------
 nomad/parsing/parser.py                       | 77 +++++++++++--------
 nomad/patch.py                                |  2 +-
 nomad/utils/__init__.py                       |  3 +-
 10 files changed, 128 insertions(+), 80 deletions(-)

diff --git a/nomad/atomutils.py b/nomad/atomutils.py
index 517fb63388..4a5aea5dd2 100644
--- a/nomad/atomutils.py
+++ b/nomad/atomutils.py
@@ -34,7 +34,6 @@ import numpy as np
 from ase import Atoms
 from ase.formula import Formula as ASEFormula
 from ase.utils import pbc2pbc
-from scipy.spatial import Voronoi  # pylint: disable=no-name-in-module
 
 from nomad.aflow_prototypes import aflow_prototypes
 from nomad.constants import atomic_masses
@@ -810,6 +809,9 @@ def get_brillouin_zone(reciprocal_lattice: np.ndarray) -> dict:
         first Brillouin zone. The order of these indices matter, because
         only when combined sequentially they form the correct face.
     """
+    # Lazily import expensive `scipy`
+    from scipy.spatial import Voronoi
+
     # Create the near lattice points that surround the origin
     b1 = reciprocal_lattice[0, :]
     b2 = reciprocal_lattice[1, :]
diff --git a/nomad/config/__init__.py b/nomad/config/__init__.py
index b898ed97ee..7fc6618898 100644
--- a/nomad/config/__init__.py
+++ b/nomad/config/__init__.py
@@ -24,7 +24,7 @@ this module.
 
 All parameters are structured into objects for two reasons. First, to have
 categories. Second, to allow runtime manipulation that is not effected
-by python import logic. The categories are choosen along infrastructure components:
+by python import logic. The categories are chosen along infrastructure components:
 ``mongo``, ``elastic``, etc.
 
 This module also provides utilities to read the configuration from environment variables
diff --git a/nomad/datamodel/datamodel.py b/nomad/datamodel/datamodel.py
index 3b048eec51..e067705c84 100644
--- a/nomad/datamodel/datamodel.py
+++ b/nomad/datamodel/datamodel.py
@@ -57,13 +57,13 @@ from .util import parse_path
 # due to the next imports requiring the m_package already, this would be too late.
 m_package = Package()
 
-from .results import Results  # noqa
-from .data import EntryData, ArchiveSection, User, UserReference, AuthorReference  # noqa
-from .optimade import OptimadeEntry  # noqa
-from .metainfo.simulation.legacy_workflows import Workflow as LegacySimulationWorkflow  # noqa
-from .metainfo.workflow import Workflow  # noqa
-from .metainfo.measurements import Measurement  # noqa
-from .metainfo.tabulartree import TabularTree  # noqa
+from .results import Results  # noqa: I001
+from .data import EntryData, ArchiveSection, User, UserReference, AuthorReference
+from .optimade import OptimadeEntry
+from .metainfo.simulation.legacy_workflows import Workflow as LegacySimulationWorkflow
+from .metainfo.workflow import Workflow
+from .metainfo.measurements import Measurement
+from .metainfo.tabulartree import TabularTree
 
 try:
     from runschema.run import Run as run_def
diff --git a/nomad/datamodel/metainfo/eln/__init__.py b/nomad/datamodel/metainfo/eln/__init__.py
index e9faca6042..f8db1db864 100644
--- a/nomad/datamodel/metainfo/eln/__init__.py
+++ b/nomad/datamodel/metainfo/eln/__init__.py
@@ -17,7 +17,9 @@
 #
 
 import datetime
+import importlib
 import re
+import warnings
 from typing import TYPE_CHECKING, Any
 
 import numpy as np
@@ -25,12 +27,6 @@ from unidecode import unidecode
 
 from nomad.datamodel.metainfo.plot import PlotSection
 
-if TYPE_CHECKING:
-    from structlog.stdlib import (
-        BoundLogger,
-    )
-from ase.data import atomic_masses, atomic_numbers, chemical_symbols
-
 from nomad import utils
 from nomad.datamodel.data import (
     ArchiveSection,
@@ -68,7 +64,6 @@ from nomad.datamodel.metainfo.basesections.v1 import (
     SystemComponent as Component,
 )
 from nomad.datamodel.metainfo.common import ProvenanceTracker
-from nomad.datamodel.metainfo.eln.eqe_parser import EQEAnalyzer
 from nomad.datamodel.results import (
     ELN,
     BandGap,
@@ -87,6 +82,31 @@ from nomad.metainfo.metainfo import Category, MCategory, MEnum, MProxy, MSection
 from nomad.units import ureg
 
 
+class _LazyEQEAnalyzer:
+    """Lazily import expensive EQEAnalyzer."""
+
+    def __new__(cls, *args, **kwargs):
+        warnings.warn(
+            "Importing 'EQEAnalyzer' from this module is deprecated. "
+            "Please import it directly from 'nomad.datamodel.metainfo.eln.eqe_parser'.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        EQEAnalyzer = importlib.import_module(
+            'nomad.datamodel.metainfo.eln.eqe_parser'
+        ).EQEAnalyzer
+        return EQEAnalyzer(*args, **kwargs)
+
+
+EQEAnalyzer = _LazyEQEAnalyzer
+
+if TYPE_CHECKING:
+    from structlog.stdlib import (
+        BoundLogger,
+    )
+
+
 def add_band_gap(archive, band_gap):
     """Adds a band gap value (in eV) with the additional section structure for solar
     cell data.eV=
@@ -1679,6 +1699,9 @@ class SolarCellEQE(PlotSection):
 
         if self.eqe_data_file:
             with archive.m_context.raw_file(self.eqe_data_file) as f:
+                # Import `EQEAnalyzer` is slow (owing to scipy)
+                from nomad.datamodel.metainfo.eln.eqe_parser import EQEAnalyzer
+
                 eqe_dict = EQEAnalyzer(
                     f.name, header_lines=self.header_lines
                 ).eqe_dict()
diff --git a/nomad/datamodel/metainfo/eln/eqe_parser.py b/nomad/datamodel/metainfo/eln/eqe_parser.py
index fb8bcf5177..ab52e9a9d4 100644
--- a/nomad/datamodel/metainfo/eln/eqe_parser.py
+++ b/nomad/datamodel/metainfo/eln/eqe_parser.py
@@ -24,7 +24,6 @@
 
 import os
 
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 from scipy import integrate, optimize
@@ -335,6 +334,8 @@ class EQEAnalyzer:
         """
         Plots the extrapolated eqe ad the raw eqe.
         """
+        import matplotlib.pyplot as plt
+
         x, y = self.arrange_eqe_columns()
         photon_energy_extrapolated, eqe_extrapolated = self.extrapolate_eqe()
         bandgap = self.calculate_bandgap()
@@ -355,6 +356,8 @@ class EQEAnalyzer:
         plt.show()
 
     def plot_eqe_raw(self):
+        import matplotlib.pyplot as plt
+
         x, y = self.arrange_eqe_columns()
         plt.rcParams.update({'font.size': 16, 'font.family': 'Arial'})
         plt.ylim(1e-4, 1.1)
diff --git a/nomad/datamodel/metainfo/plot.py b/nomad/datamodel/metainfo/plot.py
index caaf30037f..098b12871c 100644
--- a/nomad/datamodel/metainfo/plot.py
+++ b/nomad/datamodel/metainfo/plot.py
@@ -19,9 +19,6 @@ from copy import deepcopy
 from datetime import datetime
 
 import numpy as np
-import plotly.express as px
-import plotly.graph_objs as go
-from plotly.subplots import make_subplots
 
 from nomad.datamodel.data import ArchiveSection
 from nomad.metainfo import JSON, MSection, Package, Quantity, Section, SubSection
@@ -100,6 +97,8 @@ def resolve_plot_references(annotations, section, archive, logger):
 
 
 def express_do_plot(plotly_express_annotation, section, archive, logger):
+    import plotly.express as px
+
     method_name = plotly_express_annotation.pop('method')
    layout = plotly_express_annotation.get('layout', None)
     if layout:
@@ -271,6 +270,8 @@ class PlotSection(ArchiveSection):
                 )
 
                 if plotly_express_annotations:
+                    import plotly.graph_objs as go
+
                     for plotly_express_annotation in plotly_express_annotations:
                         try:
                             label, figure_index, figure_open = get_figure_layout(
@@ -318,6 +319,8 @@ class PlotSection(ArchiveSection):
                     raise PlotSectionError(error)
 
                 if plotly_subplots_annotations:
+                    from plotly.subplots import make_subplots
+
                     for plotly_subplots_annotation in plotly_subplots_annotations:
                         try:
                             label, figure_index, figure_open = get_figure_layout(
diff --git a/nomad/datamodel/metainfo/simulation/workflow.py b/nomad/datamodel/metainfo/simulation/workflow.py
index f0cbe78470..dc831eccff 100644
--- a/nomad/datamodel/metainfo/simulation/workflow.py
+++ b/nomad/datamodel/metainfo/simulation/workflow.py
@@ -22,9 +22,7 @@
 
 import numpy as np
 from ase import Atoms
-from ase.eos import EquationOfState as aseEOS
 
-from nomad.atomutils import get_volume
 from nomad.datamodel.data import ArchiveSection
 from nomad.datamodel.metainfo.common import FastAccess
 from nomad.datamodel.metainfo.simulation.calculation import (
@@ -3718,6 +3716,8 @@ class EquationOfState(ParallelSimulation):
                 pass
 
         if self.results.volumes is None:
+            from nomad.atomutils import get_volume
+
             try:
                 volumes = []
                 unit = 1
@@ -3730,7 +3730,14 @@ class EquationOfState(ParallelSimulation):
             except Exception:
                 pass
 
-        if not self.results.eos_fit:
+        if (
+            not self.results.eos_fit
+            and self.results.volumes is not None
+            and self.results.energies is not None
+        ):
+            # `aseEOS` import is slow (owing to internal `scipy`)
+            from ase.eos import EquationOfState as aseEOS
+
             function_name_map = {
                 'birch_murnaghan': 'birchmurnaghan',
                 'pourier_tarantola': 'pouriertarantola',
@@ -3738,27 +3745,27 @@
                 'murnaghan': 'murnaghan',
                 'birch_euler': 'birch',
             }
-            if self.results.volumes is not None and self.results.energies is not None:
-                # convert to ase units in order for function optimization to work
-                volumes = self.results.volumes.to('angstrom ** 3').magnitude
-                energies = self.results.energies.to('eV').magnitude
-                for function_name, ase_name in function_name_map.items():
-                    try:
-                        eos = aseEOS(volumes, energies, ase_name)
-                        eos.fit()
-                        fitted_energies = eos.func(volumes, *eos.eos_parameters)
-                        rms_error = np.sqrt(np.mean((fitted_energies - energies) ** 2))
-                        eos_fit = EOSFit(
-                            function_name=function_name,
-                            fitted_energies=fitted_energies * ureg.eV,
-                            bulk_modulus=eos.B * ureg.eV / ureg.angstrom**3,
-                            equilibrium_volume=eos.v0 * ureg.angstrom**3,
-                            equilibrium_energy=eos.e0 * ureg.eV,
-                            rms_error=rms_error,
-                        )
-                        self.results.eos_fit.append(eos_fit)
-                    except Exception:
-                        self.logger.warning('EOS fit not succesful.')
+
+            # convert to ase units in order for function optimization to work
+            volumes = self.results.volumes.to('angstrom ** 3').magnitude
+            energies = self.results.energies.to('eV').magnitude
+            for function_name, ase_name in function_name_map.items():
+                try:
+                    eos = aseEOS(volumes, energies, ase_name)
+                    eos.fit()
+                    fitted_energies = eos.func(volumes, *eos.eos_parameters)
+                    rms_error = np.sqrt(np.mean((fitted_energies - energies) ** 2))
+                    eos_fit = EOSFit(
+                        function_name=function_name,
+                        fitted_energies=fitted_energies * ureg.eV,
+                        bulk_modulus=eos.B * ureg.eV / ureg.angstrom**3,
+                        equilibrium_volume=eos.v0 * ureg.angstrom**3,
+                        equilibrium_energy=eos.e0 * ureg.eV,
+                        rms_error=rms_error,
+                    )
+                    self.results.eos_fit.append(eos_fit)
+                except Exception:
+                    self.logger.warning('EOS fit not succesful.')
 
 
 class ChemicalReactionMethod(SimulationWorkflowMethod):
diff --git a/nomad/parsing/parser.py b/nomad/parsing/parser.py
index a8bc104ca9..3c8f8ac41b 100644
--- a/nomad/parsing/parser.py
+++ b/nomad/parsing/parser.py
@@ -26,7 +26,6 @@ from collections.abc import Iterable
 from functools import lru_cache
 from typing import IO, Any
 
-import h5py
 import numpy as np
 import yaml
 from pydantic import BaseModel, Extra  # noqa: F401
@@ -250,6 +249,9 @@ class MatchingParser(Parser):
 
         self._ls = lru_cache(maxsize=16)(lambda directory: os.listdir(directory))
 
+    def __repr__(self):
+        return self.name
+
     def read_metadata_file(self, metadata_file: str) -> dict[str, Any]:
         """
         Read parser metadata from a yaml file.
@@ -311,39 +313,47 @@ class MatchingParser(Parser):
                 if sibling_is_mainfile:
                     return False
 
-        def match(value, reference):
-            if not isinstance(value, dict):
-                equal = value == (
-                    reference[()] if isinstance(reference, h5py.Dataset) else reference
-                )
-                return equal.all() if isinstance(equal, np.ndarray) else equal
+        if self._mainfile_contents_dict is not None:
+            import h5py
+
+            def match(value, reference):
+                if not isinstance(value, dict):
+                    equal = value == (
+                        reference[()]
+                        if isinstance(reference, h5py.Dataset)
+                        else reference
+                    )
+                    return equal.all() if isinstance(equal, np.ndarray) else equal
 
-            if not hasattr(reference, 'keys'):
-                return False
+                if not hasattr(reference, 'keys'):
+                    return False
 
-            matches = []
-            reference_keys = list(reference.keys())
-            tmp = value.pop('__has_comment', None)
-            for key, val in value.items():
-                if key == '__has_key':
-                    matches.append(val in reference_keys)
-                elif key == '__has_all_keys':
-                    assert isinstance(val, list) and isinstance(reference_keys, list)
-                    matches.append(False not in [v in reference_keys for v in val])
-                elif key == '__has_only_keys':
-                    assert isinstance(val, list) and isinstance(reference_keys, list)
-                    matches.append(False not in [v in val for v in reference_keys])
-                else:
-                    if key not in reference_keys:
-                        matches.append(False)
-                        continue
-
-                    matches.append(match(val, reference[key]))
-            if tmp:
-                value.update({'__has_comment': tmp})
-            return False not in matches
+                matches = []
+                reference_keys = list(reference.keys())
+                tmp = value.pop('__has_comment', None)
+                for key, val in value.items():
+                    if key == '__has_key':
+                        matches.append(val in reference_keys)
+                    elif key == '__has_all_keys':
+                        assert isinstance(val, list) and isinstance(
+                            reference_keys, list
+                        )
+                        matches.append(False not in [v in reference_keys for v in val])
+                    elif key == '__has_only_keys':
+                        assert isinstance(val, list) and isinstance(
+                            reference_keys, list
+                        )
+                        matches.append(False not in [v in val for v in reference_keys])
+                    else:
+                        if key not in reference_keys:
+                            matches.append(False)
+                            continue
+
+                        matches.append(match(val, reference[key]))
+                if tmp:
+                    value.update({'__has_comment': tmp})
+                return False not in matches
 
-        if self._mainfile_contents_dict is not None:
             is_match = False
             if (
                 mime.startswith('application/json')
@@ -389,12 +399,11 @@
     ) -> None:
         raise NotImplementedError()
 
-    def __repr__(self):
-        return self.name
-
 
 # TODO remove this after merging hdf5 reference, only for parser compatibility
 def to_hdf5(value: Any, f: str | IO, path: str):
+    import h5py
+
     with h5py.File(f, 'a') as root:
         segments = path.rsplit('/', 1)
         group = root.require_group(segments[0]) if len(segments) == 2 else root
diff --git a/nomad/patch.py b/nomad/patch.py
index 975076767d..13a982d1a8 100644
--- a/nomad/patch.py
+++ b/nomad/patch.py
@@ -22,7 +22,7 @@ import matid.utils.segfault_protect  # pylint: disable=import-error
 
 # A patch for the segfault protection of systax (internally uses protection for spglib calls.)
 # We basically disable the protection. The multiprocessing based original protection.
-# somehow interfers with the celery work infrastructure and leads to a deadlock. Its a TODO.
+# somehow interferes with the celery work infrastructure and leads to a deadlock. Its a TODO.
 # It also seems to deadlock without celery .. just not working consistently.
 def segfault_protect_patch(f, *args, **kwargs):
     return f(*args, **kwargs)
diff --git a/nomad/utils/__init__.py b/nomad/utils/__init__.py
index f124328c29..01705e668b 100644
--- a/nomad/utils/__init__.py
+++ b/nomad/utils/__init__.py
@@ -60,7 +60,6 @@ import orjson
 import os
 import unicodedata
 import re
-import pandas as pd
 
 from nomad.config import config
 
@@ -1011,6 +1010,8 @@ def dict_to_dataframe(
         result: Pandas DataFrame with flattened and sorted data.
     """
 
+    import pandas as pd
+
     if not keys_to_filter:
         keys_to_filter = []
 
-- 
GitLab
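For readers unfamiliar with the pattern: the patch defers heavy third-party imports (scipy, matplotlib, plotly, h5py, pandas) from module level into the functions that actually need them, so importing the nomad package no longer pays their import cost at startup. The sketch below is a minimal, self-contained illustration of the two techniques used above, not code from the NOMAD repository; the names compute_voronoi and _LazyAnalyzer are hypothetical, and decimal.Decimal merely stands in for an expensive class defined in a slow-to-import module.

import importlib
import warnings


def compute_voronoi(points):
    """Hypothetical helper: the heavy dependency is imported only on first call."""
    # Deferred import: scipy is loaded when this function runs, not when the
    # enclosing module is imported, which keeps module import time small.
    from scipy.spatial import Voronoi

    return Voronoi(points)


class _LazyAnalyzer:
    """Keeps a re-exported class importable while deferring the real import.

    Mirrors the _LazyEQEAnalyzer shim in the patch; decimal.Decimal is a
    stand-in for a class whose defining module is expensive to import.
    """

    def __new__(cls, *args, **kwargs):
        warnings.warn(
            'Import the class from its defining module instead.',
            DeprecationWarning,
            stacklevel=2,
        )
        real_cls = importlib.import_module('decimal').Decimal
        # Construct and return an instance of the real class, not of the shim.
        return real_cls(*args, **kwargs)

The trade-off is that the first call into such a function pays the import cost at run time, so the approach suits dependencies that are rarely needed on the startup path.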