From 22ae6ac2ad0b003ebd1c627146b9e7539b0abd18 Mon Sep 17 00:00:00 2001
From: Haoyu Yang <yanghaoyu97@outlook.com>
Date: Mon, 7 Apr 2025 11:07:48 +0000
Subject: [PATCH] Resolve "Optimize APP startup time by lazily import some packages"

---
 nomad/atomutils.py                            |  4 +-
 nomad/config/__init__.py                      |  2 +-
 nomad/datamodel/datamodel.py                  | 14 ++--
 nomad/datamodel/metainfo/eln/__init__.py      | 37 +++++++--
 nomad/datamodel/metainfo/eln/eqe_parser.py    |  5 +-
 nomad/datamodel/metainfo/plot.py              |  9 ++-
 .../datamodel/metainfo/simulation/workflow.py | 55 +++++++------
 nomad/parsing/parser.py                       | 77 +++++++++++--------
 nomad/patch.py                                |  2 +-
 nomad/utils/__init__.py                       |  3 +-
 10 files changed, 128 insertions(+), 80 deletions(-)

diff --git a/nomad/atomutils.py b/nomad/atomutils.py
index 517fb63388..4a5aea5dd2 100644
--- a/nomad/atomutils.py
+++ b/nomad/atomutils.py
@@ -34,7 +34,6 @@ import numpy as np
 from ase import Atoms
 from ase.formula import Formula as ASEFormula
 from ase.utils import pbc2pbc
-from scipy.spatial import Voronoi  # pylint: disable=no-name-in-module
 
 from nomad.aflow_prototypes import aflow_prototypes
 from nomad.constants import atomic_masses
@@ -810,6 +809,9 @@ def get_brillouin_zone(reciprocal_lattice: np.ndarray) -> dict:
         first Brillouin zone. The order of these indices matter, because
         only when combined sequentially they form the correct face.
     """
+    # Lazily import expensive `scipy`
+    from scipy.spatial import Voronoi
+
     # Create the near lattice points that surround the origin
     b1 = reciprocal_lattice[0, :]
     b2 = reciprocal_lattice[1, :]
diff --git a/nomad/config/__init__.py b/nomad/config/__init__.py
index b898ed97ee..7fc6618898 100644
--- a/nomad/config/__init__.py
+++ b/nomad/config/__init__.py
@@ -24,7 +24,7 @@ this module.
 
 All parameters are structured into objects for two reasons. First, to have
 categories. Second, to allow runtime manipulation that is not effected
-by python import logic. The categories are choosen along infrastructure components:
+by python import logic. The categories are chosen along infrastructure components:
 ``mongo``, ``elastic``, etc.
 
 This module also provides utilities to read the configuration from environment variables
diff --git a/nomad/datamodel/datamodel.py b/nomad/datamodel/datamodel.py
index 3b048eec51..e067705c84 100644
--- a/nomad/datamodel/datamodel.py
+++ b/nomad/datamodel/datamodel.py
@@ -57,13 +57,13 @@ from .util import parse_path
 # due to the next imports requiring the m_package already, this would be too late.
 m_package = Package()
 
-from .results import Results  # noqa
-from .data import EntryData, ArchiveSection, User, UserReference, AuthorReference  # noqa
-from .optimade import OptimadeEntry  # noqa
-from .metainfo.simulation.legacy_workflows import Workflow as LegacySimulationWorkflow  # noqa
-from .metainfo.workflow import Workflow  # noqa
-from .metainfo.measurements import Measurement  # noqa
-from .metainfo.tabulartree import TabularTree  # noqa
+from .results import Results  # noqa: I001
+from .data import EntryData, ArchiveSection, User, UserReference, AuthorReference
+from .optimade import OptimadeEntry
+from .metainfo.simulation.legacy_workflows import Workflow as LegacySimulationWorkflow
+from .metainfo.workflow import Workflow
+from .metainfo.measurements import Measurement
+from .metainfo.tabulartree import TabularTree
 
 try:
     from runschema.run import Run as run_def
diff --git a/nomad/datamodel/metainfo/eln/__init__.py b/nomad/datamodel/metainfo/eln/__init__.py
index e9faca6042..f8db1db864 100644
--- a/nomad/datamodel/metainfo/eln/__init__.py
+++ b/nomad/datamodel/metainfo/eln/__init__.py
@@ -17,7 +17,9 @@
 #
 
 import datetime
+import importlib
 import re
+import warnings
 from typing import TYPE_CHECKING, Any
 
 import numpy as np
@@ -25,12 +27,6 @@ from unidecode import unidecode
 
 from nomad.datamodel.metainfo.plot import PlotSection
 
-if TYPE_CHECKING:
-    from structlog.stdlib import (
-        BoundLogger,
-    )
-from ase.data import atomic_masses, atomic_numbers, chemical_symbols
-
 from nomad import utils
 from nomad.datamodel.data import (
     ArchiveSection,
@@ -68,7 +64,6 @@ from nomad.datamodel.metainfo.basesections.v1 import (
     SystemComponent as Component,
 )
 from nomad.datamodel.metainfo.common import ProvenanceTracker
-from nomad.datamodel.metainfo.eln.eqe_parser import EQEAnalyzer
 from nomad.datamodel.results import (
     ELN,
     BandGap,
@@ -87,6 +82,31 @@ from nomad.metainfo.metainfo import Category, MCategory, MEnum, MProxy, MSection
 from nomad.units import ureg
 
 
+class _LazyEQEAnalyzer:
+    """Lazily import expensive EQEAnalyzer."""
+
+    def __new__(cls, *args, **kwargs):
+        warnings.warn(
+            "Importing 'EQEAnalyzer' from this module is deprecated. "
+            "Please import it directly from 'nomad.datamodel.metainfo.eln.eqe_parser'.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        EQEAnalyzer = importlib.import_module(
+            'nomad.datamodel.metainfo.eln.eqe_parser'
+        ).EQEAnalyzer
+        return EQEAnalyzer(*args, **kwargs)
+
+
+EQEAnalyzer = _LazyEQEAnalyzer
+
+if TYPE_CHECKING:
+    from structlog.stdlib import (
+        BoundLogger,
+    )
+
+
 def add_band_gap(archive, band_gap):
     """Adds a band gap value (in eV) with the additional section structure for solar
     cell data.eV=
@@ -1679,6 +1699,9 @@ class SolarCellEQE(PlotSection):
 
         if self.eqe_data_file:
             with archive.m_context.raw_file(self.eqe_data_file) as f:
+                # Import `EQEAnalyzer` is slow (owing to scipy)
+                from nomad.datamodel.metainfo.eln.eqe_parser import EQEAnalyzer
+
                 eqe_dict = EQEAnalyzer(
                     f.name, header_lines=self.header_lines
                 ).eqe_dict()
diff --git a/nomad/datamodel/metainfo/eln/eqe_parser.py b/nomad/datamodel/metainfo/eln/eqe_parser.py
index fb8bcf5177..ab52e9a9d4 100644
--- a/nomad/datamodel/metainfo/eln/eqe_parser.py
+++ b/nomad/datamodel/metainfo/eln/eqe_parser.py
@@ -24,7 +24,6 @@
 
 import os
 
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 from scipy import integrate, optimize
@@ -335,6 +334,8 @@ class EQEAnalyzer:
         """
         Plots the extrapolated eqe ad the raw eqe.
         """
+        import matplotlib.pyplot as plt
+
         x, y = self.arrange_eqe_columns()
         photon_energy_extrapolated, eqe_extrapolated = self.extrapolate_eqe()
         bandgap = self.calculate_bandgap()
@@ -355,6 +356,8 @@ class EQEAnalyzer:
         plt.show()
 
     def plot_eqe_raw(self):
+        import matplotlib.pyplot as plt
+
         x, y = self.arrange_eqe_columns()
         plt.rcParams.update({'font.size': 16, 'font.family': 'Arial'})
         plt.ylim(1e-4, 1.1)
diff --git a/nomad/datamodel/metainfo/plot.py b/nomad/datamodel/metainfo/plot.py
index caaf30037f..098b12871c 100644
--- a/nomad/datamodel/metainfo/plot.py
+++ b/nomad/datamodel/metainfo/plot.py
@@ -19,9 +19,6 @@ from copy import deepcopy
 from datetime import datetime
 
 import numpy as np
-import plotly.express as px
-import plotly.graph_objs as go
-from plotly.subplots import make_subplots
 
 from nomad.datamodel.data import ArchiveSection
 from nomad.metainfo import JSON, MSection, Package, Quantity, Section, SubSection
@@ -100,6 +97,8 @@ def resolve_plot_references(annotations, section, archive, logger):
 
 
 def express_do_plot(plotly_express_annotation, section, archive, logger):
+    import plotly.express as px
+
     method_name = plotly_express_annotation.pop('method')
    layout = plotly_express_annotation.get('layout', None)
     if layout:
@@ -271,6 +270,8 @@ class PlotSection(ArchiveSection):
                 )
 
                 if plotly_express_annotations:
+                    import plotly.graph_objs as go
+
                     for plotly_express_annotation in plotly_express_annotations:
                         try:
                             label, figure_index, figure_open = get_figure_layout(
@@ -318,6 +319,8 @@ class PlotSection(ArchiveSection):
                     raise PlotSectionError(error)
 
                 if plotly_subplots_annotations:
+                    from plotly.subplots import make_subplots
+
                     for plotly_subplots_annotation in plotly_subplots_annotations:
                         try:
                             label, figure_index, figure_open = get_figure_layout(
diff --git a/nomad/datamodel/metainfo/simulation/workflow.py b/nomad/datamodel/metainfo/simulation/workflow.py
index f0cbe78470..dc831eccff 100644
--- a/nomad/datamodel/metainfo/simulation/workflow.py
+++ b/nomad/datamodel/metainfo/simulation/workflow.py
@@ -22,9 +22,7 @@
 
 import numpy as np
 from ase import Atoms
-from ase.eos import EquationOfState as aseEOS
 
-from nomad.atomutils import get_volume
 from nomad.datamodel.data import ArchiveSection
 from nomad.datamodel.metainfo.common import FastAccess
 from nomad.datamodel.metainfo.simulation.calculation import (
@@ -3718,6 +3716,8 @@ class EquationOfState(ParallelSimulation):
                 pass
 
         if self.results.volumes is None:
+            from nomad.atomutils import get_volume
+
             try:
                 volumes = []
                 unit = 1
@@ -3730,7 +3730,14 @@ class EquationOfState(ParallelSimulation):
             except Exception:
                 pass
 
-        if not self.results.eos_fit:
+        if (
+            not self.results.eos_fit
+            and self.results.volumes is not None
+            and self.results.energies is not None
+        ):
+            # `aseEOS` import is slow (owing to internal `scipy`)
+            from ase.eos import EquationOfState as aseEOS
+
             function_name_map = {
                 'birch_murnaghan': 'birchmurnaghan',
                 'pourier_tarantola': 'pouriertarantola',
@@ -3738,27 +3745,27 @@
                 'murnaghan': 'murnaghan',
                 'birch_euler': 'birch',
             }
-            if self.results.volumes is not None and self.results.energies is not None:
-                # convert to ase units in order for function optimization to work
-                volumes = self.results.volumes.to('angstrom ** 3').magnitude
-                energies = self.results.energies.to('eV').magnitude
-                for function_name, ase_name in function_name_map.items():
-                    try:
-                        eos = aseEOS(volumes, energies, ase_name)
-                        eos.fit()
-                        fitted_energies = eos.func(volumes, *eos.eos_parameters)
-                        rms_error = np.sqrt(np.mean((fitted_energies - energies) ** 2))
-                        eos_fit = EOSFit(
-                            function_name=function_name,
-                            fitted_energies=fitted_energies * ureg.eV,
-                            bulk_modulus=eos.B * ureg.eV / ureg.angstrom**3,
-                            equilibrium_volume=eos.v0 * ureg.angstrom**3,
-                            equilibrium_energy=eos.e0 * ureg.eV,
-                            rms_error=rms_error,
-                        )
-                        self.results.eos_fit.append(eos_fit)
-                    except Exception:
-                        self.logger.warning('EOS fit not succesful.')
+
+            # convert to ase units in order for function optimization to work
+            volumes = self.results.volumes.to('angstrom ** 3').magnitude
+            energies = self.results.energies.to('eV').magnitude
+            for function_name, ase_name in function_name_map.items():
+                try:
+                    eos = aseEOS(volumes, energies, ase_name)
+                    eos.fit()
+                    fitted_energies = eos.func(volumes, *eos.eos_parameters)
+                    rms_error = np.sqrt(np.mean((fitted_energies - energies) ** 2))
+                    eos_fit = EOSFit(
+                        function_name=function_name,
+                        fitted_energies=fitted_energies * ureg.eV,
+                        bulk_modulus=eos.B * ureg.eV / ureg.angstrom**3,
+                        equilibrium_volume=eos.v0 * ureg.angstrom**3,
+                        equilibrium_energy=eos.e0 * ureg.eV,
+                        rms_error=rms_error,
+                    )
+                    self.results.eos_fit.append(eos_fit)
+                except Exception:
+                    self.logger.warning('EOS fit not succesful.')
 
 
 class ChemicalReactionMethod(SimulationWorkflowMethod):
diff --git a/nomad/parsing/parser.py b/nomad/parsing/parser.py
index a8bc104ca9..3c8f8ac41b 100644
--- a/nomad/parsing/parser.py
+++ b/nomad/parsing/parser.py
@@ -26,7 +26,6 @@ from collections.abc import Iterable
 from functools import lru_cache
 from typing import IO, Any
 
-import h5py
 import numpy as np
 import yaml
 from pydantic import BaseModel, Extra  # noqa: F401
@@ -250,6 +249,9 @@ class MatchingParser(Parser):
 
         self._ls = lru_cache(maxsize=16)(lambda directory: os.listdir(directory))
 
+    def __repr__(self):
+        return self.name
+
     def read_metadata_file(self, metadata_file: str) -> dict[str, Any]:
         """
         Read parser metadata from a yaml file.
@@ -311,39 +313,47 @@ class MatchingParser(Parser):
                 if sibling_is_mainfile:
                     return False
 
-        def match(value, reference):
-            if not isinstance(value, dict):
-                equal = value == (
-                    reference[()] if isinstance(reference, h5py.Dataset) else reference
-                )
-                return equal.all() if isinstance(equal, np.ndarray) else equal
+        if self._mainfile_contents_dict is not None:
+            import h5py
+
+            def match(value, reference):
+                if not isinstance(value, dict):
+                    equal = value == (
+                        reference[()]
+                        if isinstance(reference, h5py.Dataset)
+                        else reference
+                    )
+                    return equal.all() if isinstance(equal, np.ndarray) else equal
 
-            if not hasattr(reference, 'keys'):
-                return False
+                if not hasattr(reference, 'keys'):
+                    return False
 
-            matches = []
-            reference_keys = list(reference.keys())
-            tmp = value.pop('__has_comment', None)
-            for key, val in value.items():
-                if key == '__has_key':
-                    matches.append(val in reference_keys)
-                elif key == '__has_all_keys':
-                    assert isinstance(val, list) and isinstance(reference_keys, list)
-                    matches.append(False not in [v in reference_keys for v in val])
-                elif key == '__has_only_keys':
-                    assert isinstance(val, list) and isinstance(reference_keys, list)
-                    matches.append(False not in [v in val for v in reference_keys])
-                else:
-                    if key not in reference_keys:
-                        matches.append(False)
-                        continue
-
-                    matches.append(match(val, reference[key]))
-            if tmp:
-                value.update({'__has_comment': tmp})
-            return False not in matches
+                matches = []
+                reference_keys = list(reference.keys())
+                tmp = value.pop('__has_comment', None)
+                for key, val in value.items():
+                    if key == '__has_key':
+                        matches.append(val in reference_keys)
+                    elif key == '__has_all_keys':
+                        assert isinstance(val, list) and isinstance(
+                            reference_keys, list
+                        )
+                        matches.append(False not in [v in reference_keys for v in val])
+                    elif key == '__has_only_keys':
+                        assert isinstance(val, list) and isinstance(
+                            reference_keys, list
+                        )
+                        matches.append(False not in [v in val for v in reference_keys])
+                    else:
+                        if key not in reference_keys:
+                            matches.append(False)
+                            continue
+
+                        matches.append(match(val, reference[key]))
+                if tmp:
+                    value.update({'__has_comment': tmp})
+                return False not in matches
 
-        if self._mainfile_contents_dict is not None:
             is_match = False
             if (
                 mime.startswith('application/json')
@@ -389,12 +399,11 @@
     ) -> None:
         raise NotImplementedError()
 
-    def __repr__(self):
-        return self.name
-
 
 # TODO remove this after merging hdf5 reference, only for parser compatibility
 def to_hdf5(value: Any, f: str | IO, path: str):
+    import h5py
+
     with h5py.File(f, 'a') as root:
         segments = path.rsplit('/', 1)
         group = root.require_group(segments[0]) if len(segments) == 2 else root
diff --git a/nomad/patch.py b/nomad/patch.py
index 975076767d..13a982d1a8 100644
--- a/nomad/patch.py
+++ b/nomad/patch.py
@@ -22,7 +22,7 @@ import matid.utils.segfault_protect  # pylint: disable=import-error
 
 # A patch for the segfault protection of systax (internally uses protection for spglib calls.)
 # We basically disable the protection. The multiprocessing based original protection.
-# somehow interfers with the celery work infrastructure and leads to a deadlock. Its a TODO.
+# somehow interferes with the celery work infrastructure and leads to a deadlock. Its a TODO.
 # It also seems to deadlock without celery .. just not working consistently.
 def segfault_protect_patch(f, *args, **kwargs):
     return f(*args, **kwargs)
diff --git a/nomad/utils/__init__.py b/nomad/utils/__init__.py
index f124328c29..01705e668b 100644
--- a/nomad/utils/__init__.py
+++ b/nomad/utils/__init__.py
@@ -60,7 +60,6 @@ import orjson
 import os
 import unicodedata
 import re
-import pandas as pd
 
 from nomad.config import config
 
@@ -1011,6 +1010,8 @@ def dict_to_dataframe(
         result: Pandas DataFrame with flattened and sorted data.
     """
 
+    import pandas as pd
+
     if not keys_to_filter:
         keys_to_filter = []
 
-- 
GitLab
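For readers unfamiliar with the pattern: the patch defers heavy third-party imports (scipy, matplotlib, plotly, h5py, pandas) from module level into the functions that actually need them, so importing the nomad package no longer pays their import cost at startup. The sketch below is a minimal, self-contained illustration of the two techniques used above, not code from the NOMAD repository; the names compute_voronoi and _LazyAnalyzer are hypothetical, and decimal.Decimal merely stands in for an expensive class defined in a slow-to-import module.

import importlib
import warnings


def compute_voronoi(points):
    """Hypothetical helper: the heavy dependency is imported only on first call."""
    # Deferred import: scipy is loaded when this function runs, not when the
    # enclosing module is imported, which keeps module import time small.
    from scipy.spatial import Voronoi

    return Voronoi(points)


class _LazyAnalyzer:
    """Keeps a re-exported class importable while deferring the real import.

    Mirrors the _LazyEQEAnalyzer shim in the patch; decimal.Decimal is a
    stand-in for a class whose defining module is expensive to import.
    """

    def __new__(cls, *args, **kwargs):
        warnings.warn(
            'Import the class from its defining module instead.',
            DeprecationWarning,
            stacklevel=2,
        )
        real_cls = importlib.import_module('decimal').Decimal
        # Construct and return an instance of the real class, not of the shim.
        return real_cls(*args, **kwargs)

The trade-off is that the first call into such a function pays the import cost at run time, so the approach suits dependencies that are rarely needed on the startup path.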