From 29682053937b75accc60bd406ebec1db9440b61a Mon Sep 17 00:00:00 2001 From: Theodore Chang <theodore.chang@physik.hu-berlin.de> Date: Fri, 28 Mar 2025 11:20:30 +0000 Subject: [PATCH] Resolve "Extend contexts to provide same functionality in CLI, on Server, and during Tests" --- docs/howto/develop/normalizing.md | 79 +++++++++++++++++++++ nomad/client/processing.py | 23 ++++-- nomad/datamodel/context.py | 114 ++++++++++++++++++++++++++++++ nomad/metainfo/metainfo.py | 40 +++++++++++ nomad/parsing/parsers.py | 27 ++++--- 5 files changed, 263 insertions(+), 20 deletions(-) create mode 100644 docs/howto/develop/normalizing.md diff --git a/docs/howto/develop/normalizing.md b/docs/howto/develop/normalizing.md new file mode 100644 index 0000000000..73f06a6746 --- /dev/null +++ b/docs/howto/develop/normalizing.md @@ -0,0 +1,79 @@ +## The `update_entry` method + +The root context, which is available from the `.m_context` of a `EntryArchive`, which could be accessed via `section.m_root().m_context` if `section` is attached to a `EntryArchive`, provides the functionality to update/create child entries on-the-fly and invoke the processing if necessary. + +!!! note + The usage of this functionality is strongly discouraged and should be avoided if possible. + +The method has the following signature. + +```python + @contextmanager + def update_entry( + self, + mainfile: str, + *, + write: bool = False, + process: bool = False, + **kwargs, + ): + """ + Open the target file and send it to the updater function. + The updater function shall return the updated file content. + The updated file will be stored and processed if needed. + + WARNING: + If `process=True`, the updated file will be processed immediately. + Please be aware of the fact that this method may be called during the processing of + the parent/main file. + This means if there are any data dependencies, there is a risk of infinite loops, + racing conditions and/or other unexpected behavior. + You must carefully design the logic to mitigate these risks. + + To use this function, you shall use the with-statement as follows: + + ```python + with context.update_entry('mainfile.json',**kwargs) as content: + # do something with content + ``` + + Parameters: + mainfile: The relative path (from upload root) to the file to update. + write: Whether to write the updated file back to the storage. + If False, no processing will be triggered whatsoever. + process: Whether to trigger processing of the updated file. + """ + ... +``` + +It is wrapped with a `@contextmanager` decorator, thus it shall be used with a `with` block. +It yields a plain `dict` object that represents the content of the file. + +```python +# get the context from the current archive +context = section.m_root().m_context +# create/update the file 'mainfile.json' and process it +with context.update_entry('mainfile.json', process=True) as content_dict: + # do something with content + content_dict['key'] = 'value' + ... +``` + +The main file must be a `json` or `yaml` file. +Other formats are not supported. + +If only need to read the content, leave `write=False`. +Otherwise, set `write=True` to store the updated content back to the storage. + +It is possible to invoke the processing immediately by setting `process=True`. +However, this is not recommended due to various security concerns. + +The following caveats must be acknowledged when using this method: + +1. The specific logic of creating/updating the file must be re-entrant safe, see [details](https://en.wikipedia.org/wiki/Reentrancy_(computing)). + To put simply, the first call and subsequent calls must yield the same result regardless of what is already stored in the file. +2. A child entry must **not** be accessed by multiple parent entries. + Because the parent entries are processed in parallel (by multiple `celery` workers), there is a risk of racing conditions if the child entry is accessed by multiple parent entries. +3. The child entry shall not modify the parent entry (and any other entries). + Otherwise, there is a risk of infinite loops and data corruption. +4. A child entry shall **not** depend on other child entries. diff --git a/nomad/client/processing.py b/nomad/client/processing.py index 03d54ba78c..209412ed26 100644 --- a/nomad/client/processing.py +++ b/nomad/client/processing.py @@ -41,7 +41,15 @@ def parse( Run the given parser on the provided mainfile. If parser_name is given, we only try to match this parser, otherwise we try to match all parsers. """ - from nomad import parsing + recursive_kwargs: dict = dict( + backend_factory=backend_factory, + strict=strict, + logger=logger, + server_context=server_context, + username=username, + password=password, + ) + from nomad.parsing import parsers mainfile = os.path.basename(mainfile_path) @@ -53,10 +61,6 @@ def parse( parser, mainfile_keys = parsers.match_parser( mainfile_path, strict=strict, parser_name=parser_name ) - if isinstance(parser, parsing.MatchingParser): - parser_name = parser.name - else: - parser_name = parser.__class__.__name__ assert parser is not None, f'there is no parser matching {mainfile}' logger = logger.bind(parser=parser.name) # type: ignore @@ -65,7 +69,14 @@ def parse( setattr(parser, 'backend_factory', backend_factory) entry_archives = parsers.run_parser( - mainfile_path, parser, mainfile_keys, logger, server_context, username, password + mainfile_path, + parser, + mainfile_keys, + logger, + server_context, + username, + password, + recursive_kwargs=recursive_kwargs, ) logger.info('ran parser') diff --git a/nomad/datamodel/context.py b/nomad/datamodel/context.py index 18d579893e..4f4653ef2c 100644 --- a/nomad/datamodel/context.py +++ b/nomad/datamodel/context.py @@ -16,11 +16,17 @@ # limitations under the License. # +from __future__ import annotations + +import json import os.path import re +from contextlib import contextmanager +from pathlib import Path from urllib.parse import urlsplit, urlunsplit import requests +from ruamel import yaml from nomad import utils from nomad.config import config @@ -31,6 +37,20 @@ from nomad.metainfo import Context as MetainfoContext from nomad.metainfo import MetainfoReferenceError, MSection, Package, Quantity +def _opener_and_dumper(file_name: str) -> tuple: + if not file_name.endswith(('json', 'yaml', 'yml')): + raise AssertionError('Only json/yaml files can be updated.') + + if file_name.endswith('json'): + loader = json.load + dumper = json.dump + else: + loader = yaml.safe_load # type: ignore + dumper = yaml.dump # type: ignore + + return loader, dumper + + class Context(MetainfoContext): """ The nomad implementation of a metainfo context. @@ -407,6 +427,50 @@ class ServerContext(Context): return self.upload_files.archive_hdf5_location(entry_id) + @contextmanager + def update_entry( + self, + mainfile: str, + *, + write: bool = False, + process: bool = False, + **kwargs, + ): + if not self.upload: + raise AssertionError( + 'A valid upload must be attached before updating files.' + ) + + loader, dumper = _opener_and_dumper(mainfile) + + content: dict = {} + if self.upload_files.raw_path_exists(mainfile): + with self.upload_files.raw_file(mainfile, 'r') as f: + content = loader(f) + + yield content + + if not write: + return + + with self.upload_files.raw_file(mainfile, 'w') as f: + dumper(content, f, indent=2) + + if process: + self.upload.process_updated_raw_file(mainfile, True) + + +class ServerLocalContext(Context): + def __init__(self, mainfile_dir): + super().__init__() + self._mainfile_dir: Path = Path(mainfile_dir) + + def raw_file(self, path, *args, **kwargs): + return (self._mainfile_dir / path).open(*args, **kwargs) + + def raw_path_exists(self, path: str) -> bool: + return (self._mainfile_dir / path).exists() + def _validate_url(url): return config.api_url(api='api/v1') if url is None else url @@ -426,10 +490,12 @@ class ClientContext(Context): def __init__( self, installation_url: str = None, + *, local_dir: str = None, upload_id: str = None, username: str = None, password: str = None, + recursive_kwargs: dict = None, auth=None, ): super().__init__( @@ -444,6 +510,14 @@ class ClientContext(Context): self._auth = Auth(user=username, password=password) self._upload_id = upload_id + self._recursive_kwargs = recursive_kwargs if recursive_kwargs else {} + self._child_archives: list = [] + + @property + def child_archives(self): + holder = self._child_archives + self._child_archives = [] + return holder @property def upload_id(self): @@ -552,3 +626,43 @@ class ClientContext(Context): ) return response.json()['data'] + + @contextmanager + def update_entry( + self, + mainfile: str, + *, + write: bool = False, + process: bool = False, + **kwargs, + ): + if not self.local_dir: + raise AssertionError( + 'A valid local folder must be attached before updating files.' + ) + + loader, dumper = _opener_and_dumper(mainfile) + + file_path: Path = Path(self.local_dir) / mainfile + + content: dict = {} + if file_path.exists(): + with file_path.open() as f: + content = loader(f) + + yield content + + if not write: + return + + with file_path.open('w') as f: + dumper(content, f, indent=2) + + if process: + from nomad.client import parse + + self._child_archives.extend( + parse( + file_path.absolute().as_posix(), **(self._recursive_kwargs | kwargs) + ) + ) diff --git a/nomad/metainfo/metainfo.py b/nomad/metainfo/metainfo.py index c6deb3a24f..4f992866ca 100644 --- a/nomad/metainfo/metainfo.py +++ b/nomad/metainfo/metainfo.py @@ -729,6 +729,10 @@ class Context: multiple hierarchies (e.g. archives) with references. """ + @property + def child_archives(self): + return [] + def warning(self, event, **kwargs): """ Used to log (or otherwise handle) warning that are issued, e.g. while serialization, @@ -819,6 +823,42 @@ class Context: def hdf5_path(self, section: MSection): raise NotImplementedError + def update_entry( + self, + mainfile: str, + *, + write: bool = True, + process: bool = True, + **kwargs, + ): + """ + Open the target file and send it to the updater function. + The updater function shall return the updated file content. + The updated file will be stored and processed if needed. + + WARNING: + If `process=True`, the updated file will be processed immediately. + Please be aware of the fact that this method may be called during the processing of + the parent/main file. + This means if there are any data dependencies, there is a risk of infinite loops, + racing conditions and/or other unexpected behavior. + You must carefully design the logic to mitigate these risks. + + To use this function, you shall use the with-statement as follows: + + ```python + with context.update_entry('mainfile.json',**kwargs) as content: + # do something with content + ``` + + Parameters: + mainfile: The relative path (from upload root) to the file to update. + write: Whether to write the updated file back to the storage. + If False, no processing will be triggered whatsoever. + process: Whether to trigger processing of the updated file. + """ + raise NotImplementedError + # TODO find a way to make this a subclass of collections.abs.Mapping class MSection(metaclass=MObjectMeta): diff --git a/nomad/parsing/parsers.py b/nomad/parsing/parsers.py index efc42e2a8f..ba20b1291a 100644 --- a/nomad/parsing/parsers.py +++ b/nomad/parsing/parsers.py @@ -16,6 +16,8 @@ # limitations under the License. # +from __future__ import annotations + import os.path from collections.abc import Iterable @@ -23,7 +25,7 @@ from nomad.config import config from nomad.config.models.plugins import Parser as ParserPlugin from nomad.config.models.plugins import ParserEntryPoint from nomad.datamodel import EntryArchive, EntryMetadata, results -from nomad.datamodel.context import ClientContext, Context +from nomad.datamodel.context import ClientContext, ServerLocalContext from .artificial import ChaosParser, EmptyParser, GenerateRandomParser, TemplateParser from .parser import ( @@ -152,17 +154,6 @@ def match_parser( return None, None -class ParserContext(Context): - def __init__(self, mainfile_dir): - self._mainfile_dir = mainfile_dir - - def raw_file(self, path, *args, **kwargs): - return open(os.path.join(self._mainfile_dir, path), *args, **kwargs) - - def raw_path_exists(self, path: str) -> bool: - return os.path.exists(os.path.join(self._mainfile_dir, path)) - - def run_parser( mainfile_path: str, parser: Parser, @@ -171,6 +162,8 @@ def run_parser( server_context: bool = False, username: str = None, password: str = None, + *, + recursive_kwargs: dict = None, ) -> list[EntryArchive]: """ Parses a file, given the path, the parser, and mainfile_keys, as returned by @@ -184,11 +177,14 @@ def run_parser( # TODO this looks totally wrong. ParserContext is not a server context at all. # There should be three different context. Client, Server and Parser. Currently, # ClientContext seems to cover both the client and the local parser use-case. - entry_archive = EntryArchive(m_context=ParserContext(directory)) + entry_archive = EntryArchive(m_context=ServerLocalContext(directory)) else: entry_archive = EntryArchive( m_context=ClientContext( - local_dir=directory, username=username, password=password + local_dir=directory, + username=username, + password=password, + recursive_kwargs=recursive_kwargs, ) ) @@ -223,6 +219,9 @@ def run_parser( for entry_archive in entry_archives: if entry_archive.metadata.domain is None: entry_archive.metadata.domain = parser.domain + + entry_archives.extend(entry_archives[0].m_context.child_archives) + return entry_archives -- GitLab