From 29682053937b75accc60bd406ebec1db9440b61a Mon Sep 17 00:00:00 2001
From: Theodore Chang <theodore.chang@physik.hu-berlin.de>
Date: Fri, 28 Mar 2025 11:20:30 +0000
Subject: [PATCH] Resolve "Extend contexts to provide same functionality in
 CLI, on Server, and during Tests"

---
 docs/howto/develop/normalizing.md |  79 +++++++++++++++++++++
 nomad/client/processing.py        |  23 ++++--
 nomad/datamodel/context.py        | 114 ++++++++++++++++++++++++++++++
 nomad/metainfo/metainfo.py        |  40 +++++++++++
 nomad/parsing/parsers.py          |  27 ++++---
 5 files changed, 263 insertions(+), 20 deletions(-)
 create mode 100644 docs/howto/develop/normalizing.md

diff --git a/docs/howto/develop/normalizing.md b/docs/howto/develop/normalizing.md
new file mode 100644
index 0000000000..73f06a6746
--- /dev/null
+++ b/docs/howto/develop/normalizing.md
@@ -0,0 +1,79 @@
+## The `update_entry` method
+
+The root context, available from the `.m_context` attribute of an `EntryArchive` (accessible via `section.m_root().m_context` if `section` is attached to an `EntryArchive`), provides the functionality to update/create child entries on-the-fly and invoke processing if necessary.
+
+!!! note
+    The usage of this functionality is strongly discouraged and should be avoided if possible.
+
+The method has the following signature.
+
+```python
+    @contextmanager
+    def update_entry(
+        self,
+        mainfile: str,
+        *,
+        write: bool = False,
+        process: bool = False,
+        **kwargs,
+    ):
+        """
+        Open the target file and send it to the updater function.
+        The updater function shall return the updated file content.
+        The updated file will be stored and processed if needed.
+
+        WARNING:
+            If `process=True`, the updated file will be processed immediately.
+            Be aware that this method may be called during the processing of
+            the parent/main file.
+            This means if there are any data dependencies, there is a risk of infinite loops,
+            race conditions and/or other unexpected behavior.
+            You must carefully design the logic to mitigate these risks.
+
+        To use this function, you shall use the with-statement as follows:
+
+        ```python
+        with context.update_entry('mainfile.json', **kwargs) as content:
+            # do something with content
+        ```
+
+        Parameters:
+            mainfile: The relative path (from upload root) to the file to update.
+            write: Whether to write the updated file back to the storage.
+                If False, no processing will be triggered whatsoever.
+            process: Whether to trigger processing of the updated file.
+        """
+        ...
+```
+
+It is wrapped with a `@contextmanager` decorator, thus it shall be used with a `with` block.
+It yields a plain `dict` object that represents the content of the file.
+
+```python
+# get the context from the current archive
+context = section.m_root().m_context
+# create/update the file 'mainfile.json' and process it
+with context.update_entry('mainfile.json', process=True) as content_dict:
+    # do something with content
+    content_dict['key'] = 'value'
+    ...
+```
+
+The main file must be a `json` or `yaml` file.
+Other formats are not supported.
+
+If you only need to read the content, leave `write=False`.
+Otherwise, set `write=True` to store the updated content back to the storage.
+
+It is possible to invoke the processing immediately by setting `process=True`.
+However, this is not recommended due to various security concerns.
+
+The following caveats must be acknowledged when using this method:
+
+1. The specific logic of creating/updating the file must be re-entrant safe, see [details](https://en.wikipedia.org/wiki/Reentrancy_(computing)).
+   Put simply, the first call and subsequent calls must yield the same result regardless of what is already stored in the file.
+2. A child entry must **not** be accessed by multiple parent entries.
+   Because the parent entries are processed in parallel (by multiple `celery` workers), there is a risk of race conditions if the child entry is accessed by multiple parent entries.
+3. The child entry shall not modify the parent entry (and any other entries).
+   Otherwise, there is a risk of infinite loops and data corruption.
+4. A child entry shall **not** depend on other child entries.
diff --git a/nomad/client/processing.py b/nomad/client/processing.py
index 03d54ba78c..209412ed26 100644
--- a/nomad/client/processing.py
+++ b/nomad/client/processing.py
@@ -41,7 +41,15 @@ def parse(
     Run the given parser on the provided mainfile. If parser_name is given, we only try
     to match this parser, otherwise we try to match all parsers.
     """
-    from nomad import parsing
+    recursive_kwargs: dict = dict(
+        backend_factory=backend_factory,
+        strict=strict,
+        logger=logger,
+        server_context=server_context,
+        username=username,
+        password=password,
+    )
+
     from nomad.parsing import parsers
 
     mainfile = os.path.basename(mainfile_path)
@@ -53,10 +61,6 @@ def parse(
     parser, mainfile_keys = parsers.match_parser(
         mainfile_path, strict=strict, parser_name=parser_name
     )
-    if isinstance(parser, parsing.MatchingParser):
-        parser_name = parser.name
-    else:
-        parser_name = parser.__class__.__name__
 
     assert parser is not None, f'there is no parser matching {mainfile}'
     logger = logger.bind(parser=parser.name)  # type: ignore
@@ -65,7 +69,14 @@ def parse(
         setattr(parser, 'backend_factory', backend_factory)
 
     entry_archives = parsers.run_parser(
-        mainfile_path, parser, mainfile_keys, logger, server_context, username, password
+        mainfile_path,
+        parser,
+        mainfile_keys,
+        logger,
+        server_context,
+        username,
+        password,
+        recursive_kwargs=recursive_kwargs,
     )
 
     logger.info('ran parser')
diff --git a/nomad/datamodel/context.py b/nomad/datamodel/context.py
index 18d579893e..4f4653ef2c 100644
--- a/nomad/datamodel/context.py
+++ b/nomad/datamodel/context.py
@@ -16,11 +16,17 @@
 # limitations under the License.
 #
 
+from __future__ import annotations
+
+import json
 import os.path
 import re
+from contextlib import contextmanager
+from pathlib import Path
 from urllib.parse import urlsplit, urlunsplit
 
 import requests
+from ruamel import yaml
 
 from nomad import utils
 from nomad.config import config
@@ -31,6 +37,20 @@ from nomad.metainfo import Context as MetainfoContext
 from nomad.metainfo import MetainfoReferenceError, MSection, Package, Quantity
 
 
+def _opener_and_dumper(file_name: str) -> tuple:
+    if not file_name.endswith(('json', 'yaml', 'yml')):
+        raise AssertionError('Only json/yaml files can be updated.')
+
+    if file_name.endswith('json'):
+        loader = json.load
+        dumper = json.dump
+    else:
+        loader = yaml.safe_load  # type: ignore
+        dumper = yaml.dump  # type: ignore
+
+    return loader, dumper
+
+
 class Context(MetainfoContext):
     """
     The nomad implementation of a metainfo context.
@@ -407,6 +427,50 @@ class ServerContext(Context):
 
         return self.upload_files.archive_hdf5_location(entry_id)
 
+    @contextmanager
+    def update_entry(
+        self,
+        mainfile: str,
+        *,
+        write: bool = False,
+        process: bool = False,
+        **kwargs,
+    ):
+        if not self.upload:
+            raise AssertionError(
+                'A valid upload must be attached before updating files.'
+            )
+
+        loader, dumper = _opener_and_dumper(mainfile)
+
+        content: dict = {}
+        if self.upload_files.raw_path_exists(mainfile):
+            with self.upload_files.raw_file(mainfile, 'r') as f:
+                content = loader(f)
+
+        yield content
+
+        if not write:
+            return
+
+        with self.upload_files.raw_file(mainfile, 'w') as f:
+            dumper(content, f, indent=2)
+
+        if process:
+            self.upload.process_updated_raw_file(mainfile, True)
+
+
+class ServerLocalContext(Context):
+    def __init__(self, mainfile_dir):
+        super().__init__()
+        self._mainfile_dir: Path = Path(mainfile_dir)
+
+    def raw_file(self, path, *args, **kwargs):
+        return (self._mainfile_dir / path).open(*args, **kwargs)
+
+    def raw_path_exists(self, path: str) -> bool:
+        return (self._mainfile_dir / path).exists()
+
 
 def _validate_url(url):
     return config.api_url(api='api/v1') if url is None else url
@@ -426,10 +490,12 @@ class ClientContext(Context):
     def __init__(
         self,
         installation_url: str = None,
+        *,
         local_dir: str = None,
         upload_id: str = None,
         username: str = None,
         password: str = None,
+        recursive_kwargs: dict = None,
         auth=None,
     ):
         super().__init__(
@@ -444,6 +510,14 @@ class ClientContext(Context):
 
             self._auth = Auth(user=username, password=password)
         self._upload_id = upload_id
+        self._recursive_kwargs = recursive_kwargs if recursive_kwargs else {}
+        self._child_archives: list = []
+
+    @property
+    def child_archives(self):
+        holder = self._child_archives
+        self._child_archives = []
+        return holder
 
     @property
     def upload_id(self):
@@ -552,3 +626,43 @@ class ClientContext(Context):
             )
 
         return response.json()['data']
+
+    @contextmanager
+    def update_entry(
+        self,
+        mainfile: str,
+        *,
+        write: bool = False,
+        process: bool = False,
+        **kwargs,
+    ):
+        if not self.local_dir:
+            raise AssertionError(
+                'A valid local folder must be attached before updating files.'
+            )
+
+        loader, dumper = _opener_and_dumper(mainfile)
+
+        file_path: Path = Path(self.local_dir) / mainfile
+
+        content: dict = {}
+        if file_path.exists():
+            with file_path.open() as f:
+                content = loader(f)
+
+        yield content
+
+        if not write:
+            return
+
+        with file_path.open('w') as f:
+            dumper(content, f, indent=2)
+
+        if process:
+            from nomad.client import parse
+
+            self._child_archives.extend(
+                parse(
+                    file_path.absolute().as_posix(), **(self._recursive_kwargs | kwargs)
+                )
+            )
diff --git a/nomad/metainfo/metainfo.py b/nomad/metainfo/metainfo.py
index c6deb3a24f..4f992866ca 100644
--- a/nomad/metainfo/metainfo.py
+++ b/nomad/metainfo/metainfo.py
@@ -729,6 +729,10 @@ class Context:
     multiple hierarchies (e.g. archives) with references.
     """
 
+    @property
+    def child_archives(self):
+        return []
+
     def warning(self, event, **kwargs):
         """
         Used to log (or otherwise handle) warning that are issued, e.g. while serialization,
@@ -819,6 +823,42 @@ class Context:
     def hdf5_path(self, section: MSection):
         raise NotImplementedError
 
+    def update_entry(
+        self,
+        mainfile: str,
+        *,
+        write: bool = True,
+        process: bool = True,
+        **kwargs,
+    ):
+        """
+        Open the target file and send it to the updater function.
+        The updater function shall return the updated file content.
+        The updated file will be stored and processed if needed.
+
+        WARNING:
+            If `process=True`, the updated file will be processed immediately.
+            Be aware that this method may be called during the processing of
+            the parent/main file.
+            This means if there are any data dependencies, there is a risk of infinite loops,
+            race conditions and/or other unexpected behavior.
+            You must carefully design the logic to mitigate these risks.
+
+        To use this function, you shall use the with-statement as follows:
+
+        ```python
+        with context.update_entry('mainfile.json', **kwargs) as content:
+            # do something with content
+        ```
+
+        Parameters:
+            mainfile: The relative path (from upload root) to the file to update.
+            write: Whether to write the updated file back to the storage.
+                If False, no processing will be triggered whatsoever.
+            process: Whether to trigger processing of the updated file.
+        """
+        raise NotImplementedError
+
 
 # TODO find a way to make this a subclass of collections.abs.Mapping
 class MSection(metaclass=MObjectMeta):
diff --git a/nomad/parsing/parsers.py b/nomad/parsing/parsers.py
index efc42e2a8f..ba20b1291a 100644
--- a/nomad/parsing/parsers.py
+++ b/nomad/parsing/parsers.py
@@ -16,6 +16,8 @@
 # limitations under the License.
 #
 
+from __future__ import annotations
+
 import os.path
 from collections.abc import Iterable
 
@@ -23,7 +25,7 @@ from nomad.config import config
 from nomad.config.models.plugins import Parser as ParserPlugin
 from nomad.config.models.plugins import ParserEntryPoint
 from nomad.datamodel import EntryArchive, EntryMetadata, results
-from nomad.datamodel.context import ClientContext, Context
+from nomad.datamodel.context import ClientContext, ServerLocalContext
 
 from .artificial import ChaosParser, EmptyParser, GenerateRandomParser, TemplateParser
 from .parser import (
@@ -152,17 +154,6 @@ def match_parser(
     return None, None
 
 
-class ParserContext(Context):
-    def __init__(self, mainfile_dir):
-        self._mainfile_dir = mainfile_dir
-
-    def raw_file(self, path, *args, **kwargs):
-        return open(os.path.join(self._mainfile_dir, path), *args, **kwargs)
-
-    def raw_path_exists(self, path: str) -> bool:
-        return os.path.exists(os.path.join(self._mainfile_dir, path))
-
-
 def run_parser(
     mainfile_path: str,
     parser: Parser,
@@ -171,6 +162,8 @@ def run_parser(
     server_context: bool = False,
     username: str = None,
     password: str = None,
+    *,
+    recursive_kwargs: dict = None,
 ) -> list[EntryArchive]:
     """
     Parses a file, given the path, the parser, and mainfile_keys, as returned by
@@ -184,11 +177,14 @@ def run_parser(
         # TODO this looks totally wrong. ParserContext is not a server context at all.
         # There should be three different context. Client, Server and Parser. Currently,
         # ClientContext seems to cover both the client and the local parser use-case.
-        entry_archive = EntryArchive(m_context=ParserContext(directory))
+        entry_archive = EntryArchive(m_context=ServerLocalContext(directory))
     else:
         entry_archive = EntryArchive(
             m_context=ClientContext(
-                local_dir=directory, username=username, password=password
+                local_dir=directory,
+                username=username,
+                password=password,
+                recursive_kwargs=recursive_kwargs,
             )
         )
 
@@ -223,6 +219,9 @@ def run_parser(
     for entry_archive in entry_archives:
         if entry_archive.metadata.domain is None:
             entry_archive.metadata.domain = parser.domain
+
+    entry_archives.extend(entry_archives[0].m_context.child_archives)
+
     return entry_archives
 
 
-- 
GitLab