From 24b00e804db9da3e17708adc35ed415ef6f514d4 Mon Sep 17 00:00:00 2001
From: Theodore Chang <theodore.chang@physik.hu-berlin.de>
Date: Wed, 4 Sep 2024 12:53:24 +0000
Subject: [PATCH] Revise `m_to_dict` serialization, simplify logic.

Simplify pseudo reference types, decouple from `MProxy`.
---
 gui/tests/artifacts.js                    |   6 +-
 nomad/datamodel/data.py                   |  39 +-
 nomad/datamodel/datamodel.py              |  17 +-
 nomad/datamodel/metainfo/annotations.py   |   2 +-
 nomad/metainfo/__init__.py                |   1 -
 nomad/metainfo/data_type.py               |  75 +--
 nomad/metainfo/elasticsearch_extension.py |   2 +-
 nomad/metainfo/metainfo.py                | 560 +++++++---------------
 nomad/metainfo/util.py                    |  86 ----
 tests/metainfo/__init__.py                |  48 ++
 tests/metainfo/test_metainfo.py           |   2 +-
 tests/metainfo/test_quantities.py         |   2 +-
 tests/metainfo/test_references.py         |  18 -
 tests/processing/test_data.py             |   2 +-
 14 files changed, 310 insertions(+), 550 deletions(-)

diff --git a/gui/tests/artifacts.js b/gui/tests/artifacts.js
index 6962daea15..f32f40faf3 100644
--- a/gui/tests/artifacts.js
+++ b/gui/tests/artifacts.js
@@ -6598,8 +6598,7 @@ window.nomadArtifacts = {
             ],
             "constraints": [
               "dimensions",
-              "has_type",
-              "higher_shapes_require_dtype"
+              "has_type"
             ]
           },
           {
@@ -60799,8 +60798,7 @@ window.nomadArtifacts = {
             ],
             "constraints": [
               "dimensions",
-              "has_type",
-              "higher_shapes_require_dtype"
+              "has_type"
             ],
             "quantities": [
               {
diff --git a/nomad/datamodel/data.py b/nomad/datamodel/data.py
index a3c149c8a8..fb35e9d3f6 100644
--- a/nomad/datamodel/data.py
+++ b/nomad/datamodel/data.py
@@ -30,7 +30,6 @@ from nomad.metainfo.metainfo import (
     MCategory,
     MSection,
     Quantity,
-    MProxy,
     Capitalized,
     Section,
     Datetime,
@@ -199,13 +198,24 @@ class UserReference(Reference):
         return {'type_kind': 'User', 'type_data': 'User'}
 
     def _normalize_impl(self, section, value):
-        # todo: need data validation
+        if isinstance(value, User):
+            return value
+
         if isinstance(value, str):
-            return MProxy(value, m_proxy_section=section, m_proxy_type=self._proxy_type)
-        return value
+            try:
+                return User.get(value)
+            except Exception as _exc:  # noqa
+                return value
+
+        raise ValueError(f'Cannot normalize {value}.')
 
     def _serialize_impl(self, section, value):
-        return value.user_id
+        if isinstance(value, str):
+            return value
+        if isinstance(value, User):
+            return value.user_id
+
+        raise ValueError(f'Cannot serialize {value}.')
 
 
 class AuthorReference(Reference):
@@ -216,12 +226,23 @@ class AuthorReference(Reference):
         return {'type_kind': 'Author', 'type_data': 'Author'}
 
     def _normalize_impl(self, section, value):
-        # todo: need data validation
-        if isinstance(value, (str, dict)):
-            return MProxy(value, m_proxy_section=section, m_proxy_type=self._proxy_type)
-        return value
+        if isinstance(value, Author):
+            return value
+
+        if isinstance(value, dict):
+            return Author.m_from_dict(value)
+
+        if isinstance(value, str):
+            try:
+                return User.get(value)
+            except Exception as _exc:  # noqa
+                return value
+
+        raise ValueError(f'Cannot normalize {value}.')
 
     def _serialize_impl(self, section, value):
+        if isinstance(value, str):
+            return value
         if isinstance(value, User):
             return value.user_id
         if isinstance(value, Author):
diff --git a/nomad/datamodel/datamodel.py b/nomad/datamodel/datamodel.py
index 7621971c6c..69b1e773fe 100644
--- a/nomad/datamodel/datamodel.py
+++ b/nomad/datamodel/datamodel.py
@@ -40,13 +40,11 @@ from ..metainfo import (
     Bytes,
     Package,
     Definition,
-    MProxy,
     MSection,
     MCategory,
     Section,
     SubSection,
     Quantity,
-    Reference,
     MEnum,
     Datetime,
     JSON,
@@ -189,14 +187,19 @@ class DatasetReference(Reference):
         super().__init__(Dataset.m_def)
 
     def _normalize_impl(self, section, value):
-        # todo: need data validation
+        if isinstance(value, Dataset):
+            return value
+
         if isinstance(value, str):
-            return MProxy(value, m_proxy_section=section, m_proxy_type=self._proxy_type)
-        return value
+            if (target := Dataset.m_def.a_mongo.get(dataset_id=value)) is not None:
+                return target
+            return value
+
+        raise ValueError(f'Cannot normalize {value}.')
 
     def _serialize_impl(self, section, value):
-        if isinstance(value, MProxy):
-            return value.m_proxy_value
+        if isinstance(value, str):
+            return value
 
         return value.dataset_id
 
diff --git a/nomad/datamodel/metainfo/annotations.py b/nomad/datamodel/metainfo/annotations.py
index 60fc442795..8de3a066ff 100644
--- a/nomad/datamodel/metainfo/annotations.py
+++ b/nomad/datamodel/metainfo/annotations.py
@@ -24,7 +24,7 @@ import re
 from pydantic.main import BaseModel
 
 from nomad.utils import strip
-from nomad.metainfo import AnnotationModel, MEnum, MTypes, Datetime, Reference, Quantity
+from nomad.metainfo import AnnotationModel, MEnum, Datetime, Reference, Quantity
 from .plot import PlotlyError
 from ..data import Query
 from ...metainfo.data_type import Datatype
diff --git a/nomad/metainfo/__init__.py b/nomad/metainfo/__init__.py
index 49e83565cb..2beac67794 100644
--- a/nomad/metainfo/__init__.py
+++ b/nomad/metainfo/__init__.py
@@ -30,7 +30,6 @@ including JSON, (HDF5), mongodb, and elastic search.
 """
 
 from .metainfo import (
-    MTypes,
     MSectionBound,
     MSection,
     MCategory,
diff --git a/nomad/metainfo/data_type.py b/nomad/metainfo/data_type.py
index facd79daf6..d6a05aa5f5 100644
--- a/nomad/metainfo/data_type.py
+++ b/nomad/metainfo/data_type.py
@@ -20,6 +20,7 @@ from __future__ import annotations
 import builtins
 import importlib
 import re
+import typing
 from base64 import b64decode, b64encode
 from datetime import datetime, date
 from functools import reduce
@@ -165,6 +166,19 @@ class Datatype:
         The given value is the actual value stored in the corresponding section.
 
         This method shall return an object that is JSON serializable.
+
+        Optional keyword arguments:
+            section: the section object that the value belongs to
+            transform: a function that transforms the value; if the value is an array or a nested array,
+                the function is applied to each element individually.
+
+                The function shall have the following signature:
+                    ```python
+                    def transform(value, path):
+                        pass
+                    ```
+                The value is the actual value, or the element in the array.
+                The path shall be None if the value is a scalar, or a list of indices if the value is an array.
         """
         raise NotImplementedError()
 
@@ -351,13 +365,24 @@ class Primitive(Datatype):
         """
         This handles both scalar and array like values.
         """
+
+        transform: typing.Callable | None = kwargs.get('transform', None)
+
+        def _convert(v, p=None):
+            if isinstance(v, list):
+                return [
+                    _convert(x, [i] if p is None else p + [i]) for i, x in enumerate(v)
+                ]
+
+            return v if transform is None else transform(v, p)
+
         if isinstance(value, np.ndarray):
-            return value.tolist()
+            return _convert(value.tolist())
 
         if isinstance(value, np.generic):
-            return value.item()
+            return _convert(value.item())
 
-        return value
+        return _convert(value)
 
 
 class Number(Primitive):
@@ -738,11 +763,17 @@ class NonPrimitive(Datatype):
         Transparently return the given value.
         """
 
-        def _convert(v):
+        transform: typing.Callable | None = kwargs.get('transform', None)
+
+        def _convert(v, p=None):
             if isinstance(v, list):
-                return [_convert(x) for x in v]
+                return [
+                    _convert(x, [i] if p is None else p + [i]) for i, x in enumerate(v)
+                ]
+
+            intermediate = self._serialize_impl(v, **kwargs)
 
-            return self._serialize_impl(v, **kwargs)
+            return intermediate if transform is None else transform(intermediate, p)
 
         return _convert(value)
 
@@ -894,7 +925,7 @@ class Unit(NonPrimitive):
         else:
             raise TypeError('Units must be given as str or pint.Unit instances.')
 
-        _check_dimensionality(self._definition, unit_obj)
+        check_dimensionality(self._definition, unit_obj)
 
         return unit_obj
 
@@ -1400,7 +1431,7 @@ def _normalize_complex(value, complex_type, to_unit: str | ureg.Unit | None):
     raise ValueError(f'Cannot convert {value} to complex number.')
 
 
-def _check_dimensionality(quantity_def, unit: pint.Unit | None) -> None:
+def check_dimensionality(quantity_def, unit: pint.Unit | None) -> None:
     if quantity_def is None or unit is None:
         return
 
@@ -1422,33 +1453,5 @@ def _check_dimensionality(quantity_def, unit: pint.Unit | None) -> None:
     raise TypeError(f'Dimensionality {dimensionality} is not met by unit {unit}.')
 
 
-def _split_python_definition(definition_with_id: str) -> tuple[list, str | None]:
-    """
-    Split a Python type name into names and an optional ID.
-
-    Example:
-        my_package.my_section            ==> (['my_package', 'my_section'], None)
-        my_package.my_section@my_id      ==> (['my_package', 'my_section'], 'my_id')
-        my_package/section_definitions/0 ==> (['my_package', 'section_definitions/0'], None)
-    """
-
-    def __split(name: str):
-        # The definition name must contain at least one dot which comes from the module name.
-        # The actual definition could be either a path (e.g., my_package/section_definitions/0)
-        # or a name (e.g., my_section).
-        # If it is a path (e.g., a.b.c/section_definitions/0), after splitting at '.', the last segment
-        # (c/section_definitions/0) contains the package name (c). It needs to be relocated.
-        segments: list = name.split('.')
-        if '/' in segments[-1]:
-            segments.extend(segments.pop().split('/', 1))
-        return segments
-
-    if '@' not in definition_with_id:
-        return __split(definition_with_id), None
-
-    definition_names, definition_id = definition_with_id.split('@')
-    return __split(definition_names), definition_id
-
-
 if __name__ == '__main__':
     pass
diff --git a/nomad/metainfo/elasticsearch_extension.py b/nomad/metainfo/elasticsearch_extension.py
index 598915b9f7..5181fe56b1 100644
--- a/nomad/metainfo/elasticsearch_extension.py
+++ b/nomad/metainfo/elasticsearch_extension.py
@@ -285,7 +285,7 @@ class DocumentType:
                             suggestion_value = value
                         section_path = section.m_path()[len(root.m_path()) :]
                         name = elasticsearch_annotation.property_name
-                        if path:
+                        if not isinstance(quantity.type, Datatype):
                             suggestion_path = f'{section_path}/{path}/{name}'
                         else:
                             suggestion_path = f'{section_path}/{name}'
diff --git a/nomad/metainfo/metainfo.py b/nomad/metainfo/metainfo.py
index 92d0de6e32..39d5e63c7b 100644
--- a/nomad/metainfo/metainfo.py
+++ b/nomad/metainfo/metainfo.py
@@ -40,6 +40,7 @@ from typing import (
     cast,
     ClassVar,
 )
+from urllib.parse import urlsplit, urlunsplit
 
 import docstring_parser
 import jmespath
@@ -65,6 +66,7 @@ from nomad.metainfo.data_type import (
     File as FileType,
     HDF5Reference as HDF5ReferenceType,
     Any as AnyType,
+    check_dimensionality,
 )
 from nomad.metainfo.util import (
     Annotation,
@@ -72,10 +74,7 @@ from nomad.metainfo.util import (
     MEnum,
     MQuantity,
     MSubSectionList,
-    MTypes,
-    ReferenceURL,
     SectionAnnotation,
-    check_dimensionality,
     convert_to,
     default_hash,
     dict_to_named_list,
@@ -184,7 +183,7 @@ class MProxy:
 
     def __init__(
         self,
-        m_proxy_value: Union[str, int, dict],
+        m_proxy_value: str | int,
         m_proxy_section: MSection = None,
         m_proxy_context: Context = None,
         m_proxy_type: Reference = None,
@@ -248,26 +247,17 @@ class MProxy:
         return context_section.m_resolve(fragment_with_id)
 
     def _resolve(self):
-        from nomad.datamodel.datamodel import Dataset, DatasetReference
-        from nomad.datamodel.data import UserReference, AuthorReference, User, Author
-
-        if isinstance(self.m_proxy_type, DatasetReference):
-            return Dataset.m_def.a_mongo.get(dataset_id=self.m_proxy_value)
-        if isinstance(self.m_proxy_type, UserReference):
-            return User.get(user_id=self.m_proxy_value)
-        if isinstance(self.m_proxy_type, AuthorReference):
-            if isinstance(self.m_proxy_value, str):
-                return User.get(user_id=self.m_proxy_value)
-            if isinstance(self.m_proxy_value, dict):
-                return Author.m_from_dict(self.m_proxy_value)
-
-            raise MetainfoReferenceError()
-
-        url = ReferenceURL(self.m_proxy_value)
+        url_parts = urlsplit(
+            self.m_proxy_value
+            if '#' in self.m_proxy_value
+            else f'#{self.m_proxy_value}'
+        )
+        archive_url: str = str(urlunsplit(url_parts[:4] + ('',)))
+        fragment = url_parts.fragment
         context_section = self.m_proxy_section
         if context_section is not None:
             context_section = context_section.m_root()
-        if url.archive_url or '@' in url.fragment:
+        if archive_url or '@' in fragment:
             context = self.m_proxy_context
             if context is None:
                 context = context_section.m_context
@@ -275,21 +265,19 @@ class MProxy:
                 raise MetainfoReferenceError(
                     'Proxy with archive url, but no context to resolve it.'
                 )
-            if '@' in url.fragment:
+            if '@' in fragment:
                 # It's a reference to a section definition
-                definition, definition_id = f'{url.archive_url}#{url.fragment}'.split(
-                    '@'
-                )
+                definition, definition_id = f'{archive_url}#{fragment}'.split('@')
                 return context.resolve_section_definition(
                     definition, definition_id
                 ).m_def
 
-            context_section = context.resolve_archive_url(url.archive_url)
+            context_section = context.resolve_archive_url(archive_url)
 
-        if isinstance(context_section, Package) and 'definitions' in url.fragment:
-            url.fragment = url.fragment.replace('/definitions', '')
+        if isinstance(context_section, Package) and 'definitions' in fragment:
+            fragment = fragment.replace('/definitions', '')
 
-        return self._resolve_fragment(context_section, url.fragment)
+        return self._resolve_fragment(context_section, fragment)
 
     def m_proxy_resolve(self):
         if not self.m_proxy_resolved:
@@ -448,7 +436,9 @@ class QuantityType(Datatype):
         if isinstance(value, Datatype):
             return value.serialize_self()
         if isinstance(value, Reference):
-            return value.serialize_self(kwargs.get('section'))
+            transform = kwargs.get('transform')
+            serialized = value.serialize_self(kwargs.get('section'))
+            return transform(serialized) if transform is not None else serialized
 
         raise MetainfoError(f'Type {value} is not a valid quantity type.')
 
@@ -567,12 +557,19 @@ class Reference:
             value,
         )
 
-    def serialize(self, section, value):
-        def _convert(_v):
-            if isinstance(_v, list):
-                return [_convert(v) for v in _v]
+    def serialize(self, value, *, section, transform=None):
+        def _convert(v, p=None):
+            if isinstance(v, list):
+                return [
+                    _convert(x, [i] if p is None else p + [i]) for i, x in enumerate(v)
+                ]
+
+            if isinstance(v, MProxy) and v.m_proxy_resolved is None:
+                intermediate = v.m_serialize_proxy_value()
+            else:
+                intermediate = self._serialize_impl(section, v)
 
-            return self._serialize_impl(section, _v)
+            return intermediate if transform is None else transform(intermediate, p)
 
         return _convert(value)
 
@@ -1769,171 +1766,93 @@ class MSection(
                     )
 
         def serialize_quantity(quantity, is_set, is_derived, path, target_value=None):
-            quantity_type = quantity.type
-
-            if resolve_references and isinstance(quantity_type, QuantityReference):
-                quantity_type = quantity_type.target_quantity_def.type
-
-            serialize: TypingCallable[[Any], Any]
-
-            # define serialization functions for all valid data types
-            is_reference = False
-            if isinstance(quantity_type, Reference):
-                is_reference = True
-
-                def serialize_reference(value, path_override):
-                    if resolve_references:
-                        assert not isinstance(quantity_type, QuantityReference)
-                        value = value.m_resolved()
-                        ref_kwargs = dict(kwargs)
-                        if kwargs['transform']:
-                            ref_kwargs['transform'] = lambda q, s, v, p: kwargs[
-                                'transform'
-                            ](q, s, v, path_override)
-                        return value.m_to_dict(**ref_kwargs)
-
-                    type_with_def = quantity_type.attach_definition(quantity)
-
-                    if isinstance(value, MProxy):
-                        if value.m_proxy_resolved is not None:
-                            return type_with_def.serialize(self, value)
-
-                        return value.m_serialize_proxy_value()
-
-                    return type_with_def.serialize(self, value)
-
-                serialize = serialize_reference
-
-            elif isinstance(quantity_type, Datatype):
-                serialize = None
-            else:
-                raise MetainfoError(
-                    f'Do not know how to serialize data with type {quantity_type} for quantity {quantity}'
-                )
-
-            quantity_type = quantity.type
-            if resolve_references and isinstance(quantity_type, QuantityReference):
-                serialize_before_reference_resolution = serialize
-
-                def serialize_reference_v2(value: Any):
-                    resolved = value.m_resolved()
-                    target_name = quantity_type.target_quantity_def.name
-                    try:
-                        # should not use the following line alone
-                        # to account for derived quantities
-                        value = resolved.__dict__[target_name]
-                    except KeyError:
-                        # should not use the following line directly as
-                        # it returns `pint.Quantity` for quantities with units
-                        # here we want to get the value of the quantity stored in memory
-                        value = getattr(resolved, target_name)
-
-                    if isinstance(quantity_type.target_quantity_def.type, Datatype):
-                        return quantity_type.target_quantity_def.type.serialize(value)
-
-                    return serialize_before_reference_resolution(value)
-
-                serialize = serialize_reference_v2
-
             # get the value to be serialized
-            # explicitly assigning the target value overrides the value from the section
+            # an explicitly assigned target value overrides the value from the section
             if target_value is None:
                 if is_set:
                     target_value = self.__dict__[quantity.name]
                 elif is_derived:
                     try:
                         target_value = quantity.derived(self)
-                    except Exception:
+                    except Exception:  # noqa
                         target_value = quantity.default
                 else:
                     target_value = quantity.default
 
-            if transform is not None:
-                serialize_before_transform = serialize
+            def _transform_wrapper(_value, _stack=None):
+                _path = path
+                if _stack is not None:
+                    _path += '/' + '/'.join(str(i) for i in _stack)
+                return (
+                    _value
+                    if transform is None
+                    else transform(quantity, self, _value, _path)
+                )
 
-                def serialize_and_transform(value: Any, path_override=None):
-                    if not is_reference:
-                        return transform(
-                            quantity,
-                            self,
-                            serialize_before_transform(value),
-                            path_override,
-                        )
+            quantity_type = quantity.type
 
-                    return transform(
-                        quantity,
-                        self,
-                        serialize_before_transform(value, path_override),
-                        path_override,
-                    )
+            if isinstance(quantity_type, Datatype) or not resolve_references:
+                return quantity_type.serialize(
+                    target_value, section=self, transform=_transform_wrapper
+                )
 
-                serialize = serialize_and_transform
-
-            if isinstance(quantity_type, Datatype):
-                intermediate_value = quantity_type.serialize(target_value, section=self)
-                if transform is None:
-                    return intermediate_value
-                if isinstance(quantity_type, Number) or len(quantity.shape) == 0:
-                    return transform(
-                        quantity,
-                        self,
-                        intermediate_value,
-                        None,
-                    )
+            # need to resolve references
+            if isinstance(quantity_type, QuantityReference):
+                target_definition = quantity_type.target_quantity_def
+                target_name = target_definition.name
+                target_type = target_definition.type
 
-                if len(quantity.shape) == 1:
-                    return [
-                        transform(
-                            quantity,
-                            self,
-                            x,
-                            None,
-                        )
-                        for x in intermediate_value
-                    ]
+                def _serialize_resolved(v, p=None):
+                    if isinstance(v, list):
+                        return [
+                            _serialize_resolved(x, [i] if p is None else p + [i])
+                            for i, x in enumerate(v)
+                        ]
 
-                raise NotImplementedError('nOtSupporteD')
+                    resolved_section = v.m_resolved()
+                    try:
+                        # this lookup alone is not sufficient: the fallback below
+                        # is needed to account for derived quantities
+                        resolved_value = resolved_section.__dict__[target_name]
+                    except KeyError:
+                        # `getattr` is only a fallback and should not be used directly, as
+                        # it returns `pint.Quantity` for quantities with units, whereas
+                        # here we want to get the value of the quantity stored in memory
+                        resolved_value = getattr(resolved_section, target_name)
 
-            # serialization starts here
-            if len(quantity.shape) == 0:
-                return (
-                    serialize(target_value, path)
-                    if is_reference
-                    else serialize(target_value)
-                )
+                    return target_type.serialize(
+                        resolved_value,
+                        section=resolved_section,
+                        transform=_transform_wrapper,
+                    )
 
-            if len(quantity.shape) == 1:
-                if not is_reference:
-                    return [serialize(item) for item in target_value]
+                return _serialize_resolved(target_value)
 
-                return [
-                    serialize(item, f'{path}/{index}')
-                    for index, item in enumerate(target_value)
-                ]
+            # other references
+            def _serialize_section(v, p):
+                if isinstance(v, list):
+                    return [_serialize_section(x, f'{p}/{i}') for i, x in enumerate(v)]
 
-            raise NotImplementedError(
-                f'Higher shapes ({quantity.shape}) not supported: {quantity}'
-            )
+                ref_kwargs = {k: v for k, v in kwargs.items() if k != 'transform'}
+                if transform:
+
+                    def _new_transform(_q, _s, _v, _):
+                        return transform(_q, _s, _v, p)
 
-        def serialize_attribute(attribute: Attribute, value: Any) -> Any:
-            if isinstance(attribute.type, Datatype):
-                return attribute.type.serialize(value)
+                    ref_kwargs['transform'] = _new_transform
 
-            if isinstance(attribute.type, Reference):
-                return attribute.type.attach_definition(None).serialize(self, value)
+                return v.m_resolved().m_to_dict(**ref_kwargs)
 
-            raise MetainfoError()
+            return _serialize_section(target_value, path)
 
-        def collect_attributes(attr_map: dict, all_attr: dict):
+        def serialize_attributes(attr_map: dict, all_attr: dict):
             result: dict = {}
             for attr_key, attr_value in attr_map.items():
                 attr_def = resolve_variadic_name(all_attr, attr_key)
-                result[attr_key] = serialize_attribute(attr_def, attr_value)
+                result[attr_key] = attr_def.type.serialize(attr_value, section=self)
             return result
 
-        def serialize_full_quantity(
-            quantity_def: Quantity, values: Dict[str, MQuantity]
-        ):
+        def serialize_full(quantity_def: Quantity, values: dict[str, MQuantity]):
             result: dict = {}
             for m_quantity in values.values():
                 m_result: dict = {
@@ -1946,10 +1865,9 @@ class MSection(
                 if m_quantity.original_unit:
                     m_result['m_original_unit'] = str(m_quantity.original_unit)
                 if m_quantity.attributes:
-                    a_result: dict = collect_attributes(
+                    if a_result := serialize_attributes(
                         m_quantity.attributes, quantity_def.all_attributes
-                    )
-                    if a_result:
+                    ):
                         m_result['m_attributes'] = a_result
                 result[m_quantity.name] = m_result
 
@@ -1958,51 +1876,52 @@ class MSection(
         def serialize_annotation(annotation):
             if isinstance(annotation, Annotation):
                 return annotation.m_to_dict()
-            elif isinstance(annotation, Dict):
-                try:
-                    json.dumps(annotation)
-                    return annotation
-                except Exception:
-                    return str(annotation)
-            else:
+
+            if not isinstance(annotation, dict):
+                return str(annotation)
+
+            try:
+                json.dumps(annotation)
+                return annotation
+            except Exception:  # noqa
                 return str(annotation)
 
         def items() -> Iterable[Tuple[str, Any]]:
             # metadata
-            if with_meta:
+            if (
+                with_meta
+                or with_root_def
+                or (
+                    self.m_parent
+                    and self.m_parent_sub_section.sub_section != self.m_def
+                )
+            ):
                 yield 'm_def', self.m_def.definition_reference(self)
                 if with_def_id:
                     yield 'm_def_id', self.m_def.definition_id
+
+            if with_meta:
                 if self.m_parent_index != -1:
                     yield 'm_parent_index', self.m_parent_index
                 if self.m_parent_sub_section is not None:
                     yield 'm_parent_sub_section', self.m_parent_sub_section.name
 
-            elif with_root_def:
-                yield 'm_def', self.m_def.definition_reference(self)
-                if with_def_id:
-                    yield 'm_def_id', self.m_def.definition_id
-            elif self.m_parent and self.m_parent_sub_section.sub_section != self.m_def:
-                # The subsection definition's section def is different from our
-                # own section def. We are probably a specialized derived section
-                # from the base section that was used in the subsection def. To allow
-                # clients to recognize the concrete section def, we force the export
-                # of the section def.
-                yield 'm_def', self.m_def.definition_reference(self)
-                if with_def_id:
-                    yield 'm_def_id', self.m_def.definition_id
-
-            annotations = {}
-            for annotation_name, annotation in self.m_annotations.items():
-                if isinstance(annotation, list):
-                    annotation_value = [
-                        serialize_annotation(item) for item in annotation
+            if len(self.m_annotations) > 0:
+                m_annotations: dict = {
+                    k: [
+                        serialize_annotation(item)
+                        for item in (v if isinstance(v, list) else [v])
                     ]
-                else:
-                    annotation_value = [serialize_annotation(annotation)]
-                annotations[annotation_name] = annotation_value
-            if len(annotations) > 0:
-                yield 'm_annotations', annotations
+                    for k, v in self.m_annotations.items()
+                }
+                yield 'm_annotations', m_annotations
+
+            # section attributes
+            if attributes := self.__dict__.get('m_attributes', {}):
+                yield (
+                    'm_attributes',
+                    serialize_attributes(attributes, self.m_def.all_attributes),
+                )
 
             # quantities
             sec_path = self.m_path()
@@ -2017,55 +1936,34 @@ class MSection(
                             yield name, serialize_quantity(quantity, False, True, path)
                         continue
 
-                    is_set = self.m_is_set(quantity)
-                    if not is_set:
-                        if not include_defaults or not quantity.m_is_set(
-                            Quantity.default
-                        ):
-                            continue
+                    if not (is_set := self.m_is_set(quantity)) and (
+                        not include_defaults or not quantity.m_is_set(Quantity.default)
+                    ):
+                        continue
 
-                    if not quantity.use_full_storage:
-                        yield name, serialize_quantity(quantity, is_set, False, path)
+                    if quantity.use_full_storage:
+                        yield name, serialize_full(quantity, self.__dict__[name])
                     else:
-                        yield (
-                            name,
-                            serialize_full_quantity(
-                                quantity, self.__dict__[quantity.name]
-                            ),
-                        )
+                        yield name, serialize_quantity(quantity, is_set, False, path)
 
                 except ValueError as e:
                     raise ValueError(f'Value error ({str(e)}) for {quantity}')
 
-            # section attributes
-            if 'm_attributes' in self.__dict__:
-                yield (
-                    'm_attributes',
-                    collect_attributes(
-                        self.__dict__['m_attributes'], self.m_def.all_attributes
-                    ),
-                )
-
             # subsections
             for name, sub_section_def in self.m_def.all_sub_sections.items():
                 if exclude(sub_section_def, self):
                     continue
 
-                is_set = False
                 if sub_section_def.repeats:
                     if self.m_sub_section_count(sub_section_def) > 0:
-                        is_set = True
                         subsections = self.m_get_sub_sections(sub_section_def)
                         if subsection_as_dict:
-                            subsection_keys: list = [
+                            all_keys: list = [
                                 item.m_key
                                 for item in subsections
                                 if item and item.m_key
                             ]
-                            has_dup: bool = (
-                                0 < len(subsection_keys) != len(set(subsection_keys))
-                            )
-                            if not has_dup:
+                            if not (0 < len(all_keys) != len(set(all_keys))):
                                 serialised_dict: dict = {}
                                 for index, item in enumerate(subsections):
                                     if item is None:
@@ -2073,92 +1971,74 @@ class MSection(
                                     item_key = item.m_key if item.m_key else index
                                     serialised_dict[item_key] = item.m_to_dict(**kwargs)
                                 yield name, serialised_dict
-                            else:
-                                yield (
-                                    name,
-                                    [
-                                        None
-                                        if item is None
-                                        else item.m_to_dict(**kwargs)
-                                        for item in subsections
-                                    ],
-                                )
-                        else:
-                            yield (
-                                name,
-                                [
-                                    None if item is None else item.m_to_dict(**kwargs)
-                                    for item in subsections
-                                ],
-                            )
-                else:
-                    sub_section = self.m_get_sub_section(sub_section_def, -1)
-                    if sub_section is not None:
-                        is_set = True
-                        yield name, sub_section.m_to_dict(**kwargs)
-
-                # attributes are disabled for subsections
-                # if is_set:
-                #     yield from collect_attributes(sub_section_def.all_attributes)
+                                continue
+
+                        serialised_list: list = [
+                            None if item is None else item.m_to_dict(**kwargs)
+                            for item in subsections
+                        ]
+                        yield name, serialised_list
+                elif (
+                    sub_section := self.m_get_sub_section(sub_section_def, -1)
+                ) is not None:
+                    yield name, sub_section.m_to_dict(**kwargs)
 
         return {key: value for key, value in items()}
 
-    def m_update_from_dict(self, dct: Dict[str, Any]) -> None:
+    def m_update_from_dict(self, data: dict) -> None:
         """
         Updates this section with the serialized data from the given dict, e.g. data
         produced by :func:`m_to_dict`.
         """
-        section_def = self.m_def
-        section = self
         m_context = self.m_context if self.m_context else self
 
-        if 'definitions' in dct:
-            definition_def = section_def.all_aliases['definitions']
+        if 'definitions' in data:
+            definition_def = self.m_def.all_aliases['definitions']
             definition_cls = definition_def.sub_section.section_cls
             definition_section = definition_cls.m_from_dict(
-                dct['definitions'], m_parent=self, m_context=m_context
+                data['definitions'], m_parent=self, m_context=m_context
             )
-            section.m_add_sub_section(definition_def, definition_section)
+            self.m_add_sub_section(definition_def, definition_section)
 
-        for name, property_def in section_def.all_aliases.items():
-            if name not in dct or name == 'definitions':
+        for name, property_def in self.m_def.all_aliases.items():
+            if name not in data or name == 'definitions':
                 continue
 
+            target_value = data.get(name)
+
             if isinstance(property_def, SubSection):
                 sub_section_def = property_def
-                sub_section_value = dct.get(name)
                 sub_section_cls = sub_section_def.sub_section.section_cls
+
+                def _append(value=None):
+                    sub_section = None
+                    if value is not None:
+                        sub_section = sub_section_cls.m_from_dict(
+                            value, m_parent=self, m_context=m_context
+                        )
+                    self.m_add_sub_section(sub_section_def, sub_section)
+
                 if sub_section_def.repeats:
                     for sub_section_dct in (
-                        sub_section_value
-                        if isinstance(sub_section_value, list)
-                        else sub_section_value.values()
+                        target_value
+                        if isinstance(target_value, list)
+                        else target_value.values()
                     ):
-                        sub_section = None
-                        if sub_section_dct is not None:
-                            sub_section = sub_section_cls.m_from_dict(
-                                sub_section_dct, m_parent=self, m_context=m_context
-                            )
-                        section.m_add_sub_section(sub_section_def, sub_section)
+                        _append(sub_section_dct)
                 else:
-                    sub_section = sub_section_cls.m_from_dict(
-                        sub_section_value, m_parent=self, m_context=m_context
-                    )
-                    section.m_add_sub_section(sub_section_def, sub_section)
+                    _append(target_value)
 
-            if isinstance(property_def, Quantity):
+            elif isinstance(property_def, Quantity):
                 quantity_def = property_def
-                quantity_value = dct[name]
 
                 if quantity_def.virtual:
-                    # We silently ignore this, similar to how we ignore additional values.
                     continue
 
                 if quantity_def.use_full_storage:
-                    if not isinstance(quantity_value, dict):
+                    if not isinstance(target_value, dict):
                         raise MetainfoError('Full storage quantity must be a dict')
 
-                    for each_name, each_quantity in quantity_value.items():
+                    for each_name, each_quantity in target_value.items():
                         m_quantity = MQuantity(each_name, each_quantity['m_value'])
                         if 'm_unit' in each_quantity:
                             m_quantity.unit = units.parse_units(each_quantity['m_unit'])
@@ -2169,16 +2049,15 @@ class MSection(
                         if 'm_attributes' in each_quantity:
                             m_quantity.attributes = each_quantity['m_attributes']
 
-                        section.m_set(quantity_def, m_quantity)
+                        self.m_set(quantity_def, m_quantity)
                 else:
                     # todo: setting None has different implications
-                    section.__dict__[property_def.name] = quantity_def.type.normalize(
-                        quantity_value, section=section
+                    self.__dict__[property_def.name] = quantity_def.type.normalize(
+                        target_value, section=self
                     )
 
-        if 'm_attributes' in dct:
-            for attr_key, attr_value in dct['m_attributes'].items():
-                section.m_set_section_attribute(attr_key, attr_value)
+        for attr_key, attr_value in data.get('m_attributes', {}).items():
+            self.m_set_section_attribute(attr_key, attr_value)
 
     @classmethod
     def m_from_dict(
@@ -3234,19 +3113,6 @@ class Quantity(Property):
         if self.derived is not None:
             self.virtual = True  # type: ignore
 
-        # replace the quantity implementation with an optimized version for the most
-        # primitive quantities if applicable
-        is_primitive = not self.derived and not self.use_full_storage
-        is_primitive = is_primitive and len(self.shape) <= 1
-        is_primitive = is_primitive and self.type in [str, bool, float, int]
-        is_primitive = is_primitive and self.type not in MTypes.num_numpy
-        if is_primitive:
-            self._default = self.default
-            self._name = self.name
-            self._type = self.type
-            self._list = len(self.shape) == 1
-            self.__class__ = PrimitiveQuantity
-
         check_dimensionality(self, self.unit)
 
     def __get__(self, obj, cls):
@@ -3367,13 +3233,6 @@ class Quantity(Property):
                 f'and int ({dim_quantity.type}) typed.'
             )
 
-    @constraint(warning=True)
-    def higher_shapes_require_dtype(self):
-        if len(self.shape) > 1:
-            assert (
-                self.type in MTypes.numpy
-            ), f'Higher dimensional quantities ({self}) need a dtype and will be treated as numpy arrays.'
-
     def _hash_seed(self) -> str:
         """
         Generate a unique representation for this quantity.
@@ -3459,73 +3318,6 @@ class DirectQuantity(Quantity):
         obj.__dict__[self._name] = ensure_complete_type(value, obj)
 
 
-class PrimitiveQuantity(Quantity):
-    """An optimized replacement for Quantity suitable for primitive properties."""
-
-    def __get__(self, obj, cls):
-        try:
-            value = obj.__dict__[self._name]
-        except KeyError:
-            value = self._default
-        except AttributeError:
-            return self
-        if value is not None and self.unit is not None and self.type in MTypes.num:
-            return value * self.unit  # type: ignore
-        return value
-
-    def __set__(self, obj, value):
-        obj.m_mod_count += 1
-
-        if value is None:
-            obj.__dict__.pop(self.name, None)
-            return
-
-        # Handle pint quantities. Conversion is done automatically between
-        # units. Notice that currently converting from float to int or vice
-        # versa is not allowed for primitive types.
-        if isinstance(value, pint.Quantity):
-            if self.unit is None:
-                if value.units.dimensionless:
-                    value = value.magnitude
-                else:
-                    raise TypeError(
-                        f'The quantity {self} does not have a unit, but value {value} has.'
-                    )
-            elif self.type in MTypes.int:
-                raise TypeError(
-                    f'Cannot save data with unit conversion into the quantity {self} '
-                    'with integer data type due to possible precision loss.'
-                )
-            else:
-                value = value.to(self.unit).magnitude
-
-        if self._list:
-            if not isinstance(value, list):
-                if hasattr(value, 'tolist'):
-                    value = value.tolist()
-                else:
-                    raise TypeError(
-                        f'The value {value} for quantity {self} has no shape {self.shape}'
-                    )
-
-            if any(v is not None and type(v) is not self._type for v in value):
-                raise TypeError(
-                    f'The value {value} with type {type(value)} for quantity {self} is not of type {self.type}'
-                )
-
-        elif type(value) is not self._type:
-            raise TypeError(
-                f'The value {value} with type {type(value)} for quantity {self} is not of type {self.type}'
-            )
-
-        try:
-            obj.__dict__[self._name] = value
-        except AttributeError:
-            raise KeyError(
-                'Cannot overwrite quantity definition. Only values can be set.'
-            )
-
-
 class SubSection(Property):
     """
     Like quantities, subsections are defined in a `section class` as attributes
diff --git a/nomad/metainfo/util.py b/nomad/metainfo/util.py
index a511368462..32db44e213 100644
--- a/nomad/metainfo/util.py
+++ b/nomad/metainfo/util.py
@@ -18,12 +18,9 @@
 
 import hashlib
 import re
-from dataclasses import dataclass
 from difflib import SequenceMatcher
 from typing import Any, Dict, Optional, Tuple, Union
-from urllib.parse import SplitResult, urlsplit, urlunsplit
 
-import numpy as np
 import pint
 
 from nomad.metainfo.data_type import Enum
@@ -32,51 +29,6 @@ from nomad.units import ureg
 __hash_method = 'sha1'  # choose from hashlib.algorithms_guaranteed
 
 
-@dataclass(frozen=True)
-class MTypes:
-    # todo: account for bytes which cannot be naturally serialized to JSON
-    primitive = {
-        str: lambda v: None if v is None else str(v),
-        int: lambda v: None if v is None else int(v),
-        float: lambda v: None if v is None else float(v),
-        complex: lambda v: None if v is None else complex(v),
-        bool: lambda v: None if v is None else bool(v),
-        np.bool_: lambda v: None if v is None else bool(v),
-    }
-
-    primitive_name = {v.__name__: v for v in primitive} | {
-        'string': str,
-        'boolean': bool,
-    }
-
-    int_numpy = {
-        np.int8,
-        np.int16,
-        np.int32,
-        np.int64,
-        np.uint8,
-        np.uint16,
-        np.uint32,
-        np.uint64,
-    }
-    int_python = {int}
-    int = int_python | int_numpy
-    float_numpy = {np.float16, np.float32, np.float64}
-    complex_numpy = {np.complex64, np.complex128}
-    float_python = {float}
-    complex_python = {complex}
-    float = float_python | float_numpy
-    complex = complex_python | complex_numpy
-    num_numpy = int_numpy | float_numpy | complex_numpy
-    num_python = int_python | float_python | complex_python
-    num = num_python | num_numpy
-    str_numpy = {np.str_}
-    bool_numpy = {np.bool_}
-    bool = {bool, np.bool_}
-    numpy = num_numpy | str_numpy | bool_numpy
-    str = {str} | str_numpy
-
-
 MEnum = Enum  # type: ignore
 
 
@@ -217,22 +169,6 @@ class MSubSectionList(list):
             self.section._on_remove_sub_section(self.sub_section_def, old_value)
 
 
-@dataclass
-class ReferenceURL:
-    fragment: str
-    archive_url: str
-    url_parts: SplitResult
-
-    def __init__(self, url: str):
-        if '#' not in url:
-            url = f'#{url}'
-
-        self.url_parts = urlsplit(url)
-        archive_url = urlunsplit(self.url_parts[0:4] + ('',))
-        self.archive_url = None if archive_url is None else archive_url
-        self.fragment = self.url_parts.fragment
-
-
 class Annotation:
     """Base class for annotations."""
 
@@ -454,28 +390,6 @@ def split_python_definition(definition_with_id: str) -> Tuple[list, Optional[str
     return __split(definition_names), definition_id
 
 
-def check_dimensionality(quantity_def, unit: Optional[pint.Unit]) -> None:
-    if quantity_def is None or unit is None:
-        return
-
-    dimensionality = getattr(quantity_def, 'dimensionality', None)
-
-    if dimensionality is None:  # not set, do not validate
-        return
-
-    if dimensionality in ('dimensionless', '1') and unit.dimensionless:  # dimensionless
-        return
-
-    if dimensionality == 'transformation':
-        # todo: check transformation dimensionality
-        return
-
-    if ureg.Quantity(1 * unit).check(dimensionality):  # dimensional
-        return
-
-    raise TypeError(f'Dimensionality {dimensionality} is not met by unit {unit}')
-
-
 def dict_to_named_list(data) -> list:
     if not isinstance(data, dict):
         return data
diff --git a/tests/metainfo/__init__.py b/tests/metainfo/__init__.py
index e69de29bb2..6735eb66d8 100644
--- a/tests/metainfo/__init__.py
+++ b/tests/metainfo/__init__.py
@@ -0,0 +1,48 @@
+from dataclasses import dataclass
+
+import numpy as np
+
+
+@dataclass(frozen=True)
+class MTypes:
+    # todo: account for bytes which cannot be naturally serialized to JSON
+    primitive = {
+        str: lambda v: None if v is None else str(v),
+        int: lambda v: None if v is None else int(v),
+        float: lambda v: None if v is None else float(v),
+        complex: lambda v: None if v is None else complex(v),
+        bool: lambda v: None if v is None else bool(v),
+        np.bool_: lambda v: None if v is None else bool(v),
+    }
+
+    primitive_name = {v.__name__: v for v in primitive} | {
+        'string': str,
+        'boolean': bool,
+    }
+
+    int_numpy = {
+        np.int8,
+        np.int16,
+        np.int32,
+        np.int64,
+        np.uint8,
+        np.uint16,
+        np.uint32,
+        np.uint64,
+    }
+    int_python = {int}
+    int = int_python | int_numpy
+    float_numpy = {np.float16, np.float32, np.float64}
+    complex_numpy = {np.complex64, np.complex128}
+    float_python = {float}
+    complex_python = {complex}
+    float = float_python | float_numpy
+    complex = complex_python | complex_numpy
+    num_numpy = int_numpy | float_numpy | complex_numpy
+    num_python = int_python | float_python | complex_python
+    num = num_python | num_numpy
+    str_numpy = {np.str_}
+    bool_numpy = {np.bool_}
+    bool = {bool, np.bool_}
+    numpy = num_numpy | str_numpy | bool_numpy
+    str = {str} | str_numpy
diff --git a/tests/metainfo/test_metainfo.py b/tests/metainfo/test_metainfo.py
index fb6fbc9781..8cdac1efb5 100644
--- a/tests/metainfo/test_metainfo.py
+++ b/tests/metainfo/test_metainfo.py
@@ -41,7 +41,6 @@ from nomad.metainfo.metainfo import (
     Context,
     DefinitionAnnotation,
     derived,
-    MTypes,
 )
 from nomad.metainfo.example import (
     Run,
@@ -56,6 +55,7 @@ from nomad import utils
 from nomad.units import ureg
 
 from tests import utils as test_utils
+from tests.metainfo import MTypes
 
 
 def assert_section_def(section_def: Section):
diff --git a/tests/metainfo/test_quantities.py b/tests/metainfo/test_quantities.py
index 3e57b2386d..feed120d44 100644
--- a/tests/metainfo/test_quantities.py
+++ b/tests/metainfo/test_quantities.py
@@ -30,12 +30,12 @@ from nomad.metainfo.metainfo import (
     Dimension,
     JSON,
     MSection,
-    MTypes,
     Quantity,
     URL,
     Unit,
     units,
 )
+from tests.metainfo import MTypes
 
 
 @pytest.mark.parametrize(
diff --git a/tests/metainfo/test_references.py b/tests/metainfo/test_references.py
index e050719d9a..c9f20fcc98 100644
--- a/tests/metainfo/test_references.py
+++ b/tests/metainfo/test_references.py
@@ -420,16 +420,6 @@ def test_user_author(def_type, value, expected_name):
 
     # test assignment
     section.quantity = value
-    quantity = section.quantity
-    resolved_quantity = quantity.m_resolved()
-
-    assert quantity.m_proxy_value == value
-    assert (
-        quantity.m_proxy_type.target_section_def.name
-        == def_type().target_section_def.name
-    )
-    assert quantity.m_proxy_section == section
-    assert resolved_quantity.name == expected_name
 
     # test serialization
     serialized_section = section.m_to_dict()
@@ -438,11 +428,3 @@ def test_user_author(def_type, value, expected_name):
     # test deserialization
     deserialized_section = UserAuthorSection().m_from_dict(serialized_section)
     deserialized_quantity = deserialized_section.quantity
-    resolved_deserialized_quantity = deserialized_quantity.m_resolved()
-
-    assert deserialized_quantity.m_proxy_value == value
-    assert (
-        deserialized_quantity.m_proxy_type.target_section_def.name
-        == def_type().target_section_def.name
-    )
-    assert resolved_deserialized_quantity.name == expected_name
diff --git a/tests/processing/test_data.py b/tests/processing/test_data.py
index 0a674c08b5..408ac84a21 100644
--- a/tests/processing/test_data.py
+++ b/tests/processing/test_data.py
@@ -1160,7 +1160,7 @@ def test_read_metadata_from_file(proc_infra, user1, user2, tmp):
         assert entry_metadata.comment == comment[i]
         assert entry_metadata.references == references[i]
         assert entry_metadata.external_id == external_ids[i]
-        coauthors = [a.m_proxy_resolve() for a in entry_metadata.coauthors]
+        coauthors = entry_metadata.coauthors
         assert len(coauthors) == len(expected_coauthors)
         for j in range(len(coauthors)):
             assert coauthors[j].user_id == expected_coauthors[j].user_id
-- 
GitLab