From b428aad8ac606e1e6027bde15b978022688e3642 Mon Sep 17 00:00:00 2001
From: Theodore Chang <theodore.chang@physik.hu-berlin.de>
Date: Wed, 9 Oct 2024 15:01:31 +0000
Subject: [PATCH] Further metainfo refactoring

---
 nomad/app/v1/routers/entries.py |   2 +-
 nomad/datamodel/datamodel.py    |   4 +-
 nomad/metainfo/data_type.py     |  21 +++++-
 nomad/metainfo/metainfo.py      | 114 ++++++++++++++++++++------------
 nomad/parsing/parser.py         |   2 +-
 nomad/processing/data.py        |   4 +-
 tests/metainfo/test_metainfo.py |  11 +++
 7 files changed, 107 insertions(+), 51 deletions(-)

diff --git a/nomad/app/v1/routers/entries.py b/nomad/app/v1/routers/entries.py
index 0e3f4d8e97..9480871f93 100644
--- a/nomad/app/v1/routers/entries.py
+++ b/nomad/app/v1/routers/entries.py
@@ -1502,7 +1502,7 @@ def edit(
             for entry in proc.Entry.objects(entry_id__in=entry_ids):
                 entry_metadata = entry.mongo_metadata(entry.upload)
                 # Ensure that updated fields are marked as "set", even if they are cleared
-                entry_metadata.m_update_from_dict(mongo_update)
+                entry_metadata.m_update_from_dict(mongo_update, force_none=True)
                 # Add to list
                 updated_metadata.append(entry_metadata)
 
diff --git a/nomad/datamodel/datamodel.py b/nomad/datamodel/datamodel.py
index ebbaa47736..b5a48f2204 100644
--- a/nomad/datamodel/datamodel.py
+++ b/nomad/datamodel/datamodel.py
@@ -1222,8 +1222,8 @@ class EntryArchive(ArchiveSection):
         if not archive.metadata.entry_name and archive.metadata.mainfile:
             archive.metadata.entry_name = os.path.basename(archive.metadata.mainfile)
 
-    def m_update_from_dict(self, dct) -> None:
-        super().m_update_from_dict(dct)
+    def m_update_from_dict(self, data: dict, **kwargs) -> None:
+        super().m_update_from_dict(data, **kwargs)
         if self.definitions is not None:
             self.definitions.archive = self
 
diff --git a/nomad/metainfo/data_type.py b/nomad/metainfo/data_type.py
index 72b068c2ba..7ee24a76bc 100644
--- a/nomad/metainfo/data_type.py
+++ b/nomad/metainfo/data_type.py
@@ -518,6 +518,25 @@ class InexactNumber(Number):
         super().__init__(dtype)
         self._np_base = np.inexact
 
+    def normalize(self, value, **kwargs):
+        """
+        Accept the following additional flags to tweak the behavior.
+
+        Parameters:
+            treat_none_as_nan: If True, treat None as NaN.
+        """
+
+        def _preprocess(v):
+            if isinstance(v, (list, tuple)):
+                return [_preprocess(x) for x in v]
+
+            return float('nan') if v is None else v
+
+        if kwargs.get('treat_none_as_nan', False):
+            value = _preprocess(value)
+
+        return super().normalize(value, **kwargs)
+
 
 class m_float(InexactNumber):
     """
@@ -966,7 +985,7 @@ class Datetime(NonPrimitive):
 
     __slots__ = ()
 
-    def _normalize_impl(self, value, section=None):
+    def _normalize_impl(self, value, **kwargs):
         if isinstance(value, datetime):
             datetime_obj = value
         elif isinstance(value, date):
diff --git a/nomad/metainfo/metainfo.py b/nomad/metainfo/metainfo.py
index 05302daff9..3e8305e6df 100644
--- a/nomad/metainfo/metainfo.py
+++ b/nomad/metainfo/metainfo.py
@@ -61,6 +61,7 @@ from nomad.metainfo.data_type import (
     Any as AnyType,
     check_dimensionality,
     ExactNumber,
+    InexactNumber,
 )
 from nomad.metainfo.util import (
     Annotation,
@@ -549,7 +550,7 @@ class Reference:
 
         raise TypeError(f'{value} is not a valid value of {self._definition}.')
 
-    def normalize(self, value, *, section=None):
+    def normalize(self, value, *, section=None, **kwargs):
         def _convert(_v):
             if isinstance(_v, list):
                 return [_convert(v) for v in _v]
@@ -714,15 +715,15 @@ def metainfo_setter(method):
     This differs from `modifier` in that it is used for setter methods.
     """
     assert method.__name__ == '__set__'
-    assert method.__code__.co_argcount == 3
+    assert method.__code__.co_argcount >= 3
 
     @wraps(method)
-    def wrapper(self, obj, value):
+    def wrapper(self, obj, value, **kwargs):
         if obj is None:
             raise KeyError('Cannot overwrite definition. Only data can be set.')
 
         obj.m_mod_count += 1
-        return method(self, obj, value)
+        return method(self, obj, value, **kwargs)
 
     return wrapper
 
@@ -732,11 +733,11 @@ def metainfo_getter(method):
     Decorate a method as a getter.
     """
     assert method.__name__ == '__get__'
-    assert method.__code__.co_argcount == 3
+    assert method.__code__.co_argcount >= 3
 
     @wraps(method)
-    def wrapper(self, obj, cls=None):
-        return self if obj is None else method(self, obj, cls)
+    def wrapper(self, obj, cls=None, **kwargs):
+        return self if obj is None else method(self, obj, cls, **kwargs)
 
     return wrapper
 
@@ -1229,6 +1230,7 @@ class MSection(metaclass=MObjectMeta):
         skip_virtual: bool = False,
         index: int | None = None,
         context: Context | MSection | None = None,
+        **kwargs,
     ):
         """
         Set the given value for the given property.
@@ -1246,6 +1248,7 @@ class MSection(metaclass=MObjectMeta):
                 No effect in 'append' mode.
             context: The context to use when deserializing the value.
                 This is used in setting sections, while the value is given as a serialized dictionary.
+            kwargs: Additional keyword arguments to pass to the property setter.
         """
         definition: Property = self._ensure_definition(def_or_name, hint=hint)
 
@@ -1253,18 +1256,18 @@ class MSection(metaclass=MObjectMeta):
             if isinstance(value, list):
                 # this is used when deserializing a list of subsections
                 for v in value:
-                    self.m_append(definition, v, context=context)
+                    self.m_append(definition, v, context=context, **kwargs)
                 return
 
             if isinstance(value, dict):
                 value = definition.sub_section.section_cls.m_from_dict(
-                    value, m_context=context, m_parent=self
+                    value, m_context=context, m_parent=self, **kwargs
                 )
         elif definition.virtual and skip_virtual:
             return
 
         if index is None:
-            return definition.__set__(self, value)
+            return definition.__set__(self, value, **kwargs)
 
         def _populate(_target):
             # overwrite mode with valid index
@@ -1278,7 +1281,7 @@ class MSection(metaclass=MObjectMeta):
                 # guaranteed to be a MSubSectionList
                 _populate(definition.__get__(self))
             else:
-                definition.__set__(self, value)
+                definition.__set__(self, value, **kwargs)
         elif isinstance(definition, Quantity):
             if definition.is_scalar:
                 target = value
@@ -1289,7 +1292,7 @@ class MSection(metaclass=MObjectMeta):
                     target = []
                 _populate(target)
             # regardless, need to revalidate the value
-            definition.__set__(self, target)
+            definition.__set__(self, target, **kwargs)
         else:
             # should never reach here
             raise NotImplementedError
@@ -1339,11 +1342,9 @@ class MSection(metaclass=MObjectMeta):
 
         if isinstance(definition, Quantity):
             if definition.use_full_storage:
-                target = self.__dict__.get(definition.name, {}).get(def_or_name, None)
-                if full or target is None:
-                    return _wrap(target)
-
-                return _wrap(cast(MQuantity, target).get())
+                return _wrap(
+                    definition.__get__(self, full=full, actual_name=def_or_name)
+                )
 
             target = definition.__get__(self)
             if isinstance(target, list) and index is not None:
@@ -1388,6 +1389,7 @@ class MSection(metaclass=MObjectMeta):
         hint: str | None = None,
         skip_virtual: bool = False,
         context: Context | MSection | None = None,
+        **kwargs,
     ):
         """
         Append the given value to the given property.
@@ -1413,18 +1415,18 @@ class MSection(metaclass=MObjectMeta):
                 assert definition.repeats
                 # this is used when deserializing a list of subsections
                 for v in value:
-                    self.m_append(definition, v, context=context)
+                    self.m_append(definition, v, context=context, **kwargs)
                 return
 
             if isinstance(value, dict):
                 value = definition.sub_section.section_cls.m_from_dict(
-                    value, m_context=context, m_parent=self
+                    value, m_context=context, m_parent=self, **kwargs
                 )
 
             if definition.repeats:
                 definition.__get__(self).append(value)
             else:
-                definition.__set__(self, value)
+                definition.__set__(self, value, **kwargs)
         elif isinstance(definition, Quantity):
             if definition.virtual and skip_virtual:
                 return
@@ -1438,7 +1440,7 @@ class MSection(metaclass=MObjectMeta):
                     target = []
                 target.append(value)
             # regardless, need to revalidate the value
-            definition.__set__(self, target)
+            definition.__set__(self, target, **kwargs)
         else:
             # should never reach here
             raise NotImplementedError
@@ -2124,13 +2126,17 @@ class MSection(metaclass=MObjectMeta):
 
         return {key: value for key, value in items()}
 
-    def m_update_from_dict(self, data: dict) -> None:
+    def m_update_from_dict(self, data: dict, **kwargs) -> None:
         """
         Updates this section with the serialized data from the given dict, e.g. data
         produced by :func:`m_to_dict`.
         """
         m_context: Context | MSection = self.m_context if self.m_context else self
 
+        # todo: the hack flag shall be removed
+        # it is required due to json serialize nan and inf to null
+        treat_none_as_nan = kwargs.get('treat_none_as_nan', False)
+
         # need to deserialize the definitions first as they are needed for the rest
         if 'definitions' in data:
             self.m_set('definitions', data['definitions'], context=m_context)
@@ -2151,13 +2157,16 @@ class MSection(metaclass=MObjectMeta):
                     if definition.repeats and isinstance(value, dict)
                     else value,
                     context=m_context,
+                    treat_none_as_nan=treat_none_as_nan,
                 )
-            elif definition.use_full_storage:
-                self.m_set(definition, value)
             elif not definition.virtual:
-                # todo: setting None has different implications
-                self.__dict__[definition.name] = definition.type.normalize(
-                    value, section=self
+                self.m_set(
+                    definition,
+                    value,
+                    # todo: the hack flag shall be removed
+                    # it is required in editing, None is used to remove the value in indexing
+                    force_none=kwargs.get('force_none', False),
+                    treat_none_as_nan=treat_none_as_nan,
                 )
 
         for name, value in data.get('m_attributes', {}).items():
@@ -2190,6 +2199,9 @@ class MSection(metaclass=MObjectMeta):
         through the optional parameter. Otherwise, the section definition is read from
         the `m_def` key in the section data.
         """
+
+        treat_none_as_nan: bool = kwargs.pop('treat_none_as_nan', False)
+
         if 'm_ref_archives' in dct and isinstance(m_context, Context):
             # dct['m_ref_archives'] guarantees that 'm_def' exists
             for entry_url, archive_json in dct['m_ref_archives'].items():
@@ -2251,7 +2263,7 @@ class MSection(metaclass=MObjectMeta):
             section.m_annotations.update(m_annotations)
             section.m_parse_annotations()
 
-        section.m_update_from_dict(dct)
+        section.m_update_from_dict(dct, treat_none_as_nan=treat_none_as_nan)
         return section
 
     def m_to_json(self, **kwargs):
@@ -3167,16 +3179,19 @@ class Quantity(Property):
 
         check_dimensionality(self, self.unit)
 
-    def __get__(self, obj, cls=None):
+    def __get__(self, obj, cls=None, **kwargs):
         if obj is None:
             return self
 
         if self.name in obj.__dict__:
             value = obj.__dict__[self.name]
+            actual_name = kwargs.get('actual_name', self.name)
             # appears to be a quantity using full storage
             # cannot use .use_full_storage as this is not set yet
-            if isinstance(value, dict) and self.name in value:
-                value = value[self.name].get()
+            if isinstance(value, dict) and actual_name in value:
+                if kwargs.get('full', False):  # full storage requested
+                    return value[actual_name]
+                value = value[actual_name].get()
         else:
             if self.derived is not None:
                 try:
@@ -3219,24 +3234,33 @@ class Quantity(Property):
         return value
 
     @metainfo_setter
-    def __set__(self, obj, value):
+    def __set__(self, obj, value, **kwargs):
         if self.derived is not None:
             raise MetainfoError(f'The quantity {self} is derived and cannot be set.')
 
         item_name: str = self.name
 
         if value is None:
-            # This implements the implicit "unset" semantics of assigned None as a value
-            to_remove: dict | None = obj.__dict__.pop(item_name, None)
-            # if full storage is used, also need to clear quantities created for convenient access
-            if self.use_full_storage and to_remove:
-                # self.__dict__[full_name] is guaranteed to be a 'dict[str, MQuantity]'
-                for key in to_remove.keys():
-                    obj.__dict__.pop(key, None)
-            return
+            if kwargs.get('force_none', False):
+                # todo: this is due to the bug in editing which set None as label to remove it in indexing
+                # this should be removed when the bug is fixed
+                obj.__dict__[item_name] = value
+            else:
+                # This implements the implicit "unset" semantics of assigned None as a value
+                to_remove: dict | None = obj.__dict__.pop(item_name, None)
+                # if full storage is used, also need to clear quantities created for convenient access
+                if self.use_full_storage and to_remove:
+                    # self.__dict__[full_name] is guaranteed to be a 'dict[str, MQuantity]'
+                    for key in to_remove.keys():
+                        obj.__dict__.pop(key, None)
+            if not (
+                kwargs.get('treat_none_as_nan', False)
+                and isinstance(self.type, InexactNumber)
+            ):
+                return
 
         if not self.use_full_storage:
-            value = self.type.normalize(value, section=obj)
+            value = self.type.normalize(value, section=obj, **kwargs)
             if isinstance(self.type, Reference):
                 if value == _UNSET_:
                     return
@@ -3288,7 +3312,9 @@ class Quantity(Property):
                     "Variadic quantities only accept raw values wrapped in 'MQuantity'."
                 )
 
-            m_quantity.value = self.type.normalize(m_quantity.value, section=obj)
+            m_quantity.value = self.type.normalize(
+                m_quantity.value, section=obj, **kwargs
+            )
 
             if self.unit is None:
                 # no prescribed unit, need to check dimensionality, no need to convert
@@ -3401,7 +3427,7 @@ class DirectQuantity(Quantity):
         return obj.__dict__.get(self._name, self._default)
 
     @metainfo_setter
-    def __set__(self, obj, value):
+    def __set__(self, obj, value, **kwargs):
         if value is None:
             obj.__dict__.pop(self._name, None)
         else:
@@ -3448,7 +3474,7 @@ class SubSection(Property):
         return obj.__dict__.get(self.name, None)
 
     @metainfo_setter
-    def __set__(self, obj, value):
+    def __set__(self, obj, value, **kwargs):
         """
         Accepts a single section or a list of sections.
         Accepts None to unset the subsection.
diff --git a/nomad/parsing/parser.py b/nomad/parsing/parser.py
index f571e88690..6808267483 100644
--- a/nomad/parsing/parser.py
+++ b/nomad/parsing/parser.py
@@ -543,7 +543,7 @@ class ArchiveParser(MatchingParser):
 
             del archive_data['definitions']
 
-        archive.m_update_from_dict(archive_data)
+        archive.m_update_from_dict(archive_data, treat_none_as_nan=True)
 
     def parse(
         self, mainfile: str, archive: EntryArchive, logger=None, child_archives=None
diff --git a/nomad/processing/data.py b/nomad/processing/data.py
index e96805dfaf..2d8b816c29 100644
--- a/nomad/processing/data.py
+++ b/nomad/processing/data.py
@@ -2879,8 +2879,8 @@ class Upload(Proc):
             # Create updates for ES
             entry_metadata = entry.mongo_metadata(self)
             if upload_updates:
-                entry_metadata.m_update_from_dict(upload_updates)
-            entry_metadata.m_update_from_dict(entry_updates)
+                entry_metadata.m_update_from_dict(upload_updates, force_none=True)
+            entry_metadata.m_update_from_dict(entry_updates, force_none=True)
             updated_metadata.append(entry_metadata)
 
         # Update mongo
diff --git a/tests/metainfo/test_metainfo.py b/tests/metainfo/test_metainfo.py
index 1e6c4a9cf5..5e91c5c765 100644
--- a/tests/metainfo/test_metainfo.py
+++ b/tests/metainfo/test_metainfo.py
@@ -19,6 +19,7 @@
 # Contains more general test cases that are replaced continiously by more specialized
 # in-depth tests in test_* files of the same module.
 
+from math import isnan
 import pytest
 import numpy as np
 import pandas as pd
@@ -678,6 +679,16 @@ class TestM1:
         assert test_section.test_quantity == 12
         assert type(test_section.test_quantity) is np.int32
 
+    def test_scalar_float(self):
+        class TestSection(MSection):
+            test_quantity = Quantity(type=float)
+
+        assert isnan(
+            TestSection.m_from_dict(
+                {'test_quantity': None}, treat_none_as_nan=True
+            ).test_quantity
+        )
+
     def test_pd_dataframe_quantity(self):
         class TestSection(MSection):
             test_quantity = Quantity(type=np.dtype('int32'), shape=['*', '*'])
-- 
GitLab