Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Source

Select target project
No results found

Target

Select target project
  • nomad-lab/nomad-FAIR
  • pgoel/nomad-FAIR
  • jpd47/nomad-FAIR
3 results
Show changes
Showing 2167 additions and 4953 deletions
......@@ -122,7 +122,7 @@ export const searchQuantities = window.nomadArtifacts.searchQuantities
export const metainfo = window.nomadArtifacts.metainfo
export const parserMetadata = window.nomadArtifacts.parserMetadata
export const toolkitMetadata = window.nomadArtifacts.toolkitMetadata
export const exampleUploads = window.nomadArtifacts.exampleUploads || {}
export const exampleUploads = {}
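// Example uploads are now collected from plugin entry points (below) instead of the
// pre-generated window.nomadArtifacts.exampleUploads object.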
Object.values(entry_points?.options || [])
.filter(entry_point => entry_point.entry_point_type === 'example_upload')
.forEach(entry_point => {
......
......@@ -13,7 +13,7 @@ fs:
plugins:
entry_points:
include:
- schema/simulation/run
- schema/simulation/workflow
- parsers/vasp
- runschema:run_schema_entry_point
- simulationworkflowschema:simulationworkflow_schema_entry_point
- electronicparsers:vasp_parser_entry_point
......@@ -83,7 +83,7 @@ nav:
- Data structure: explanation/data.md
- Processing: explanation/processing.md
- Architecture: explanation/architecture.md
- Why you need an Oasis: explanation/oasis.md
- Federation and Oasis: explanation/oasis.md
- Reference:
- reference/config.md
- reference/annotations.md
......@@ -156,4 +156,4 @@ extra_css:
extra_javascript:
- javascript.js
- https://polyfill.io/v3/polyfill.min.js?features=es6
- https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
\ No newline at end of file
- https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
......@@ -117,19 +117,24 @@ async def get_user_groups(
group_id: Optional[List[str]] = Query(
None, description='Search groups by their full id.'
),
user_id: Optional[str] = Query(
None, description='Search groups by their owner or members ids.'
),
search_terms: Optional[str] = Query(
None, description='Search groups by parts of their name.'
),
):
"""Get data about user groups."""
if group_id is not None and search_terms is not None:
if sum(param is not None for param in (group_id, user_id, search_terms)) > 1:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail='Only one of group_id or search_terms may be used at a time.',
detail='Only one of (group_id, user_id, search_terms) may be used at a time.',
)
if group_id is not None:
user_groups = MongoUserGroup.get_by_ids(group_id)
elif user_id is not None:
user_groups = MongoUserGroup.get_by_user_id(user_id)
elif search_terms is not None:
user_groups = MongoUserGroup.get_by_search_terms(search_terms)
else:
......
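The mutual-exclusion check above generalizes the previous two-parameter comparison. A minimal standalone sketch of the same idiom (the function and parameter names here are illustrative, not part of the API):

from typing import Optional


def ensure_at_most_one(**params: Optional[str]) -> None:
    # Counting non-None values scales to any number of mutually exclusive parameters.
    if sum(value is not None for value in params.values()) > 1:
        names = ', '.join(params)
        raise ValueError(f'Only one of ({names}) may be used at a time.')


ensure_at_most_one(group_id=None, user_id='1234', search_terms=None)  # passes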
......@@ -35,15 +35,14 @@ section annotations/categories.
from .storage import (
to_json,
write_archive,
read_archive,
ArchiveError,
ArchiveReader,
ArchiveWriter,
ArchiveDict,
ArchiveList,
ArchiveItem,
)
from .storage_v2 import write_archive
from .query import query_archive, filter_archive, ArchiveQueryError
from .partial import (
read_partial_archive_from_mongo,
......
......@@ -58,6 +58,7 @@ def convert_archive(
delete_old: bool = False,
counter: Counter = None,
force_repack: bool = False,
size_limit: int = -1,
):
"""
Convert an archive of the old format to the new format.
......@@ -83,6 +84,7 @@ def convert_archive(
delete_old (bool, optional): Whether to delete the old file after conversion. Defaults to False.
counter (Counter, optional): A counter to track the progress of the conversion. Defaults to None.
force_repack (bool, optional): Force repacking the archive that is already in the new format. Defaults to False.
size_limit (int, optional): The size limit in GB for the archive. Defaults to -1 (no limit).
"""
prefix: str = counter.increment() if counter else ''
......@@ -111,6 +113,13 @@ def convert_archive(
flush(f'{prefix} [ERROR] File already exists: {new_path}')
return
original_size = os.path.getsize(original_path)
if size_limit > 0 and original_size > size_limit * 1024**3:
flush(
f'{prefix} [WARNING] File size exceeds limit {size_limit} GB: {original_path}'
)
return
def safe_remove(path: str):
if not path:
return
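The guard above converts the GB limit to bytes before comparing. A minimal sketch of the same check in isolation (the function name is illustrative):

import os


def exceeds_size_limit(path: str, size_limit_gb: int) -> bool:
    # A non-positive limit disables the check, mirroring the -1 default above.
    if size_limit_gb <= 0:
        return False
    # 1 GB is treated as 1024**3 bytes, as in convert_archive.
    return os.path.getsize(path) > size_limit_gb * 1024**3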
......@@ -165,6 +174,7 @@ def convert_folder(
overwrite: bool = False,
delete_old: bool = False,
force_repack: bool = False,
size_limit: int = -1,
):
"""
Convert archives in the specified folder to the new format using parallel processing.
......@@ -181,6 +191,7 @@ def convert_folder(
overwrite (bool): Whether to overwrite existing files (default is False).
delete_old (bool): Whether to delete the old file after conversion (default is False).
force_repack (bool): Force repacking the archive (default is False).
size_limit (int): Size limit in GB for the archive (default is -1, no limit).
"""
file_list: list = []
......@@ -217,6 +228,7 @@ def convert_folder(
delete_old=delete_old,
counter=counter,
force_repack=force_repack,
size_limit=size_limit,
)
with ProcessPoolExecutor(max_workers=processes) as executor:
......@@ -242,6 +254,7 @@ def convert_upload(
overwrite: bool = False,
delete_old: bool = False,
force_repack: bool = False,
size_limit: int = -1,
):
"""
Function to convert an upload with the given upload_id to the new format.
......@@ -258,6 +271,7 @@ def convert_upload(
overwrite (bool, optional): Whether to overwrite existing files. Defaults to False.
delete_old (bool, optional): Whether to delete the old file after conversion. Defaults to False.
force_repack (bool, optional): Force repacking the existing archive (in new format). Defaults to False.
size_limit (int, optional): Size limit in GB for the archive. Defaults to -1 (no limit).
"""
if isinstance(uploads, (str, Upload)):
uploads = [uploads]
......@@ -289,6 +303,7 @@ def convert_upload(
overwrite=overwrite,
delete_old=delete_old,
force_repack=force_repack,
size_limit=size_limit,
)
......
......@@ -17,40 +17,26 @@
#
from __future__ import annotations
from typing import Iterable, Any, Tuple, Dict, BinaryIO, Union, List, cast, Generator
from typing import Any, Tuple, Dict, Union, cast, Generator
from io import BytesIO, BufferedReader
from collections.abc import Mapping, Sequence
import msgpack
from msgpack.fallback import Packer, StringIO
import struct
import json
from nomad import utils
from nomad.config import config
__packer = msgpack.Packer(autoreset=True, use_bin_type=True)
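# The constants below give every top-level TOC item a fixed packed size, so the TOC can
# be split into blocks of whole entries; a blocked reader can then seek to and unpack a
# single block instead of loading the entire TOC (the use_blocked_toc mode exercised in
# the benchmark at the bottom of this file).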
_toc_uuid_size = utils.default_hash_len + 1
_toc_item_size = _toc_uuid_size + 25 # packed(uuid + [10-byte-pos, 10-byte-pos])
_entries_per_block = config.archive.block_size // _toc_item_size
_bytes_per_block = _entries_per_block * _toc_item_size
def packb(o):
return __packer.pack(o)
def unpackb(o):
return msgpack.unpackb(o, raw=False)
def _encode(start: int, end: int) -> bytes:
return start.to_bytes(5, byteorder='little', signed=False) + end.to_bytes(
5, byteorder='little', signed=False
)
def _decode(position: bytes) -> Tuple[int, int]:
return int.from_bytes(
position[:5], byteorder='little', signed=False
......@@ -77,208 +63,6 @@ class ArchiveError(Exception):
pass
class TOCPacker(Packer):
"""
A special msgpack packer that records a TOC while packing.
Uses a combination of the pure python msgpack fallback packer and the "real"
c-based packing.
"""
def __init__(self, toc_depth: int, *args, **kwargs):
self.toc_depth = toc_depth
# noinspection PyTypeChecker
self.toc: Dict[str, Any] = None
self._depth = 0
# Because we cannot change msgpacks interface of _pack, this _stack is used to
# transfer the result of _pack calls in terms of the TOC.
self._stack: List[Any] = []
super().__init__(*args, **kwargs)
def _pos(self):
return self._buffer.getbuffer().nbytes
def _pack_list(self, obj, *args, **kwargs):
pack_result = super()._pack(obj, *args, **kwargs)
toc_result = []
# same assumption and condition as above
if len(obj) > 0 and isinstance(obj[0], dict):
for _ in obj:
toc_result.append(self._stack.pop())
self._stack.append(list(reversed(toc_result)))
return pack_result
def _pack_dict(self, obj, *args, **kwargs):
toc_result = {}
start = self._pos()
if self._depth >= self.toc_depth:
pack_result = self._buffer.write(packb(obj))
else:
self._depth += 1
pack_result = super()._pack(obj, *args, **kwargs)
self._depth -= 1
toc = {}
for key, value in reversed(list(obj.items())):
if isinstance(value, dict) or (
isinstance(value, (list, tuple))
and len(value) > 0
and isinstance(value[0], dict)
):
# assumes non emptiness and uniformity of array items
toc[key] = self._stack.pop()
toc_result['toc'] = {
key: value for key, value in reversed(list(toc.items()))
}
end = self._pos()
toc_result['pos'] = [start, end]
self._stack.append(toc_result)
return pack_result
def _pack(self, obj, *args, **kwargs):
if isinstance(obj, dict):
return self._pack_dict(obj, *args, **kwargs)
if isinstance(obj, list):
return self._pack_list(obj, *args, **kwargs)
return self._buffer.write(packb(obj))
def pack(self, obj):
assert isinstance(obj, dict), f'TOC packer can only pack dicts, {obj.__class__}'
self._depth = 0
self._buffer = StringIO()
result = super().pack(obj)
self.toc = self._stack.pop()
assert len(self._stack) == 0
return result
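A minimal usage sketch of the packer above (the nested dict and the toc_depth value are illustrative):

packer = TOCPacker(toc_depth=2)
packed = packer.pack({'run': {'system': {'atoms': [1, 2, 3]}}})
# packer.toc mirrors the dict structure down to toc_depth; each node carries the
# [start, end] byte range of the corresponding sub-object within the packed buffer.
run_start, run_end = packer.toc['toc']['run']['pos']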
class ArchiveWriter:
def __init__(
self, file_or_path: Union[str, BytesIO], n_entries: int, entry_toc_depth: int
):
self.file_or_path = file_or_path
self.n_entries = n_entries
self._pos = 0
# noinspection PyTypeChecker
self._toc_position: Tuple[int, int] = None
self._toc: Dict[str, Tuple[Tuple[int, int], Tuple[int, int]]] = {}
# noinspection PyTypeChecker
self._f: BinaryIO = None
self._toc_packer = TOCPacker(toc_depth=entry_toc_depth)
def __enter__(self):
if isinstance(self.file_or_path, str):
self._f = open(self.file_or_path, 'wb')
elif isinstance(self.file_or_path, BytesIO):
self._f = self.file_or_path
self._f.seek(0)
else:
raise ValueError('not a file or path')
# write empty placeholder header
self._write_map_header(3)
self._writeb('toc_pos')
self._writeb(_encode(0, 0))
self._writeb('toc')
toc_start, _ = self._write_map_header(self.n_entries)
_, toc_end = self._write(b'0' * _toc_item_size * self.n_entries)
self._toc_position = toc_start, toc_end
self._writeb('data')
self._write_map_header(self.n_entries)
return self
def __exit__(self, exc_type, exc_val, exc_tb):
if exc_val is not None:
raise exc_val
# go back and write the real TOC to the header
self._f.seek(0)
self._pos = 0
assert len(self._toc) == self.n_entries
toc_items = sorted(self._toc.items(), key=lambda item: item[0])
toc = {
uuid: [_encode(*positions[0]), _encode(*positions[1])]
for uuid, positions in toc_items
}
self._write_map_header(3)
self._writeb('toc_pos')
self._writeb(_encode(*self._toc_position))
self._writeb('toc')
toc_position = self._writeb(toc)
assert (
toc_position == self._toc_position
), f'{toc_position} - {self._toc_position}'
if isinstance(self.file_or_path, str):
self._f.close()
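# The three cases below mirror msgpack's map headers: fixmap (0x80 | n) for up to 15
# keys, map 16 (0xde + big-endian uint16 length) and map 32 (0xdf + big-endian uint32
# length) for larger dicts; e.g. msgpack.packb({'a': 1, 'b': 2}) also starts with 0x82.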
def _write_map_header(self, n):
if n <= 0x0F:
return self._write(struct.pack('B', 0x80 + n))
if n <= 0xFFFF:
return self._write(struct.pack('>BH', 0xDE, n))
if n <= 0xFFFFFFFF:
return self._write(struct.pack('>BI', 0xDF, n))
raise ValueError('Dict is too large')
def _write(self, b: bytes) -> Tuple[int, int]:
start = self._pos
self._pos += self._f.write(b)
return start, self._pos
def _writeb(self, obj):
return self._write(packb(obj))
def _write_entry(self, uuid: str, toc: dict, packed: bytes | Generator):
uuid = utils.adjust_uuid_size(uuid)
self._writeb(uuid)
self._write_map_header(2)
self._writeb('toc')
toc_pos = self._writeb(toc)
self._writeb('data')
if isinstance(packed, bytes):
data_pos = self._write(packed)
elif isinstance(packed, Generator):
start = self._pos
for chunk in packed:
self._pos += self._f.write(chunk)
data_pos = start, self._pos
else:
raise ValueError('Invalid type for packed data.')
self._toc[uuid] = (toc_pos, data_pos)
def add(self, uuid: str, data: Any) -> None:
self._toc_packer.reset()
packed = self._toc_packer.pack(data)
toc = self._toc_packer.toc
self._write_entry(uuid, toc, packed)
def add_raw(self, uuid: str, toc: dict, packed: Generator) -> None:
self._write_entry(uuid, toc, packed)
class ArchiveItem:
def __init__(self, f: BytesIO, offset: int = 0):
self._f = f
......@@ -528,112 +312,6 @@ class ArchiveReader(ArchiveDict):
return self._f.closed if isinstance(self._file_or_path, str) else True
def combine_archive(path: str, n_entries: int, data: Iterable[Tuple[str, Any]]):
if config.archive.use_new_writer:
from .storage_v2 import (
ArchiveWriter as ArchiveWriterNew,
ArchiveReader as ArchiveReaderNew,
)
with ArchiveWriterNew(
path, n_entries, toc_depth=config.archive.toc_depth
) as writer:
for uuid, reader in data:
if not reader:
writer.add(uuid, {})
elif isinstance(reader, ArchiveReaderNew):
toc, data = reader.get_raw(uuid)
writer.add_raw(uuid, toc, data)
else:
# rare case, old reader new writer, toc is not compatible, has to repack
writer.add(uuid, to_json(reader[uuid]))
else:
with ArchiveWriter(path, n_entries, entry_toc_depth=2) as writer:
for uuid, reader in data:
if not reader:
writer.add(uuid, {})
else:
toc, data = reader.get_raw(uuid)
writer.add_raw(uuid, toc, data)
def write_archive(
path_or_file: Union[str, BytesIO],
n_entries: int,
data: Iterable[Tuple[str, Any]],
entry_toc_depth: int = 2,
) -> None:
"""
Writes a msgpack-based archive file. The file contents will be a valid msgpack-object.
The data will contain extra table-of-contents (TOC) objects that map some keys to
positions in the file. Data can be partially read from these positions and deserialized
with msgpack.
The data in the archive file will have the following layout:
.. code-block:: python
{
'toc_pos': b[start, end],
'toc': {
entry_uuid: [b[start, end], b[start, end]], ...
},
'data': {
entry_uuid: {
'toc': {
key: {
'pos': [start, end],
'toc': ...
},
key: [
{
'pos': [start, end]
'toc': ...
}, ...
],
...
},
'data': ...
}, ...
}
}
The top-level TOC will map entry_uuids to positions. The key 'toc_pos' holds the
position of the entry TOC, the second ('toc') the position of each entry. These positions
will be absolute positions in the file. The top-level TOC will be ordered by entry_uuid.
The top-level TOC positions are encoded as two 5-byte integers. This gives the top-level TOC a
predictable layout and allows it to be read partially.
The TOC of each entry has the same structure as the data, up to a certain
TOC depth. A TOC object holds the position of the object it refers to (key 'pos')
and further, deeper TOC data (key 'toc'). Only data objects (dict instances)
have TOC objects, and only objects count towards the TOC depth. Positions in the entry
TOCs are regular msgpack-encoded integers.
Arguments:
path_or_file: A file path or file-like to the archive file that should be written.
n_entries: The number of entries that will be added to the file.
data: The file contents as an iterator of entry id, data tuples.
entry_toc_depth: The depth of the table of contents in each entry. Only objects will
count for calculating the depth.
"""
if config.archive.use_new_writer:
from .storage_v2 import ArchiveWriter as ArchiveWriterNew
with ArchiveWriterNew(
path_or_file, n_entries, toc_depth=entry_toc_depth
) as writer:
for uuid, entry in data:
writer.add(uuid, entry)
else:
with ArchiveWriter(
path_or_file, n_entries, entry_toc_depth=entry_toc_depth
) as writer:
for uuid, entry in data:
writer.add(uuid, entry)
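A minimal usage sketch of the two module-level helpers, modeled on the benchmark at the bottom of this file (the entry contents are illustrative):

from io import BytesIO
from nomad import utils

entries = [(utils.create_uuid(), {'run': {'system': {'atoms': n}}}) for n in (3, 5)]

buffer = BytesIO()
write_archive(buffer, len(entries), entries, entry_toc_depth=2)

# The TOC allows a single entry (and sub-sections of it) to be read without
# unpacking the whole archive.
with read_archive(BytesIO(buffer.getbuffer())) as archive:
    system = archive[entries[0][0]]['run']['system']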
def read_archive(file_or_path: Union[str, BytesIO], **kwargs) -> ArchiveReader:
"""
Allows reading a msgpack-based archive.
......@@ -676,57 +354,4 @@ def read_archive(file_or_path: Union[str, BytesIO], **kwargs) -> ArchiveReader:
if __name__ == '__main__':
def benchmark():
from time import time
import sys
with open('archive_test.json') as f:
example_data = json.load(f)
size = 5000 if len(sys.argv) == 1 else int(sys.argv[1])
access_every = 2
example_archive = [(utils.create_uuid(), example_data) for _ in range(0, size)]
example_uuid = example_archive[int(size / 2)][0]
# this impl
# create archive
start = time()
buffer = BytesIO()
write_archive(buffer, len(example_archive), example_archive, entry_toc_depth=2)
print('archive.py: create archive (1): ', time() - start)
# read single entry from archive
buffer = BytesIO(buffer.getbuffer())
for use_blocked_toc in [False, True]:
start = time()
for _ in range(0, 23):
read_archive(buffer, use_blocked_toc=use_blocked_toc)[example_uuid][
'run'
]['system']
print(
f'archive.py: access single entry system (23), blocked {use_blocked_toc:d}: ',
(time() - start) / 23,
)
# read every n-th entry from archive
buffer = BytesIO(buffer.getbuffer())
for use_blocked_toc in [False, True]:
start = time()
for _ in range(0, 23):
with read_archive(buffer, use_blocked_toc=use_blocked_toc) as data:
for i, entry in enumerate(example_archive):
if i % access_every == 0:
data[entry[0]]['run']['system']
print(
f'archive.py: access every {access_every:d}-ed entry single entry system (23), '
f'blocked {use_blocked_toc:d}: ',
(time() - start) / 23,
)
# just msgpack
start = time()
packb(example_archive)
print('msgpack: create archive (1): ', time() - start)
benchmark()
pass
......@@ -18,8 +18,8 @@
from __future__ import annotations
import struct
from collections.abc import Generator, Iterable
from io import BytesIO
from typing import Generator
import msgpack
from bitarray import bitarray
......@@ -373,7 +373,7 @@ class ArchiveWriter:
def to_json(v):
return v.to_json() if isinstance(v, ArchiveItem) else v
return v.to_json() if hasattr(v, 'to_json') else v
class ArchiveReadCounter:
......@@ -842,9 +842,25 @@ class ArchiveReader(ArchiveItem):
return self._full_cache
def combine_archive(path: str, n_entries: int, data: Iterable[tuple]):
with ArchiveWriter(path, n_entries, toc_depth=config.archive.toc_depth) as writer:
for uuid, reader in data:
if not reader:
writer.add(uuid, {})
elif isinstance(reader, ArchiveReader):
toc, data = reader.get_raw(uuid)
writer.add_raw(uuid, toc, data)
else:
# rare case, old reader new writer, toc is not compatible, has to repack
writer.add(uuid, to_json(reader[uuid]))
def write_archive(
path_or_file: str | BytesIO, data: list, toc_depth: int = config.archive.toc_depth
):
with ArchiveWriter(path_or_file, len(data), toc_depth=toc_depth) as writer:
path_or_file: str | BytesIO,
n_entries: int,
data: Iterable[tuple],
entry_toc_depth: int = config.archive.toc_depth,
) -> None:
with ArchiveWriter(path_or_file, n_entries, toc_depth=entry_toc_depth) as writer:
for uuid, entry in data:
writer.add(uuid, entry)
......@@ -24,5 +24,5 @@ Use it from the command line with ``nomad --help`` or ``python -m nomad.cli --he
more.
"""
from . import dev, parse, client, admin # noqa
from . import dev, parse, client, admin, clean # noqa
from .cli import run_cli, cli # noqa
......@@ -30,6 +30,7 @@ import bs4
import time
import os.path
import nomad.archive.storage_v2
from nomad import archive
from nomad.config import config
from nomad.archive import read_archive
......@@ -244,7 +245,7 @@ def update_springer(max_n_query: int = 10, retry_time: int = 120):
page += 1
archive.write_archive(
nomad.archive.storage_v2.write_archive(
config.normalize.springer_db_path,
len(sp_data),
sp_data.items(),
......
......@@ -1429,11 +1429,13 @@ def only_v1(path: str):
)
@click.option(
'--migrate',
'-m',
is_flag=True,
help='Only convert v1 archive files to v1.2 archive files.',
)
@click.option(
'--force-repack',
'-f',
is_flag=True,
help='Force repacking existing archives that are already in the new format',
)
......@@ -1444,9 +1446,16 @@ def only_v1(path: str):
default=os.cpu_count(),
help='Number of processes to use for conversion. Default is os.cpu_count().',
)
@click.option(
'--size-limit',
'-s',
type=int,
default=-1,
help='Only handle archives below the given size limit in GB. Default is -1 (no limit).',
)
@click.pass_context
def convert_archive(
ctx, uploads, overwrite, delete_old, migrate, force_repack, parallel
ctx, uploads, overwrite, delete_old, migrate, force_repack, parallel, size_limit
):
_, selected = _query_uploads(uploads, **ctx.obj.uploads_kwargs)
......@@ -1461,6 +1470,7 @@ def convert_archive(
if_include=only_v1,
processes=parallel,
force_repack=force_repack,
size_limit=size_limit,
)
else:
convert_upload(
......@@ -1469,4 +1479,5 @@ def convert_archive(
delete_old=delete_old,
processes=parallel,
force_repack=force_repack,
size_limit=size_limit,
)
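A hedged sketch of calling the converter directly with the new option; the upload id is hypothetical and convert_upload is the helper shown earlier in this diff:

# Convert one upload, skipping any archive file larger than 10 GB.
convert_upload('SomeUploadId', overwrite=True, delete_old=False, size_limit=10)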
#
# Copyright The NOMAD Authors.
#
# This file is part of NOMAD. See https://nomad-lab.eu for further info.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import click
from tqdm import tqdm
from .cli import cli
@cli.command(
help='Cleanse the given path by removing empty folders.',
name='clean',
)
@click.option(
'--path',
type=str,
help='Cleanse the given path by removing empty folders.',
)
def clean_staging(path):
if 'staging' not in path:
print('Path must contain "staging".')
return
print(f'Cleaning path: "{path}".')
print('Are you sure you want to continue? (y/N)', end=' ')
response = input()
if response.lower() != 'y':
print('Exiting...')
return
print('Cleaning...')
def safe_remove(_p):
try:
os.rmdir(_p)
except Exception: # noqa
pass
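# Walk bottom-up (topdown=False) so that parent folders which become empty once their
# children are removed are cleaned up in the same pass.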
for root, folders, _ in tqdm(os.walk(path, topdown=False)):
for folder in folders:
if not os.listdir(full_path := os.path.join(root, folder)): # noqa
safe_remove(full_path)
......@@ -87,7 +87,6 @@ def get_gui_artifacts_js() -> str:
'searchQuantities': _generate_search_quantities(),
'metainfo': _generate_metainfo(all_metainfo_packages),
'parserMetadata': code_metadata,
'exampleUploads': _generate_example_upload_metadata(),
'northTools': {k: v.dict() for k, v in config.north.tools.filtered_items()},
'unitList': unit_list_json,
'unitPrefixes': prefixes_json,
......@@ -272,32 +271,6 @@ def gui_config():
print(get_gui_config())
def _generate_example_upload_metadata():
import yaml
example_uploads_path = 'examples/data/uploads/example_uploads.yml'
if not os.path.exists(example_uploads_path):
example_uploads_path = os.path.join(
os.path.dirname(__file__),
'../../',
'examples/data/uploads/example_uploads.yml',
)
if not os.path.exists(example_uploads_path):
raise FileNotFoundError('Cannot find example_uploads.yml file')
with open(example_uploads_path, 'r') as infile:
return yaml.load(infile, Loader=yaml.SafeLoader)
@dev.command(
help='Generates a JSON file from example-uploads metadata in the YAML file.'
)
def example_upload_metadata():
print(json.dumps(_generate_example_upload_metadata(), indent=2))
@dev.command(
help=(
'Updates parser`s README files by combining a general template with '
......
......@@ -879,7 +879,6 @@ class Archive(ConfigBaseModel):
""",
)
toc_depth = Field(10, description='Depth of the table of contents in the archive.')
use_new_writer = True # todo: to be removed
small_obj_optimization_threshold = Field(
1 * 2**20,
description="""
......@@ -1142,15 +1141,20 @@ class Config(ConfigBaseModel):
for key, plugin in _plugins['entry_points']['options'].items():
if key not in plugin_entry_point_ids:
plugin_config = load_plugin_yaml(key, plugin)
plugin_config['id'] = key
plugin_class = {
'parser': Parser,
'normalizer': Normalizer,
'schema': Schema,
}.get(plugin_config['plugin_type'])
_plugins['entry_points']['options'][key] = plugin_class.parse_obj(
plugin_config
)
# Handle new style plugins that are declared directly in nomad.yaml
if plugin.get('entry_point_type') and not plugin.get('id'):
plugin['id'] = key
# Update information for old style plugins
else:
plugin_config = load_plugin_yaml(key, plugin)
plugin_config['id'] = key
plugin_class = {
'parser': Parser,
'normalizer': Normalizer,
'schema': Schema,
}.get(plugin_config['plugin_type'])
_plugins['entry_points']['options'][key] = (
plugin_class.parse_obj(plugin_config)
)
self.plugins = Plugins.parse_obj(_plugins)
......@@ -123,7 +123,7 @@ class ParserEntryPoint(EntryPoint, metaclass=ABCMeta):
level will attempt to match raw files first.
""",
)
aliases: List[str] = Field([], description="""List of alternative parser names.""")
mainfile_contents_re: Optional[str] = Field(
description="""
A regular expression that is applied the content of a potential mainfile.
......@@ -215,7 +215,8 @@ class ExampleUploadEntryPoint(EntryPoint):
local_path: Optional[str] = Field(
description="""
The final path to use when creating the upload. This field will be
automatically generated by the 'load' function.
automatically generated by the 'load' function and is typically not set
manually.
"""
)
......@@ -224,10 +225,11 @@ class ExampleUploadEntryPoint(EntryPoint):
"""Checks that only either path or url is given."""
path = values.get('path')
url = values.get('url')
local_path = values.get('local_path')
if path and url:
raise ValueError('Provide only "path" or "url", not both.')
if not path and not url:
raise ValueError('Provide either "path" or "url".')
if not path and not url and not local_path:
raise ValueError('Provide one of "path", "url" or "local_path".')
return values
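As a hedged illustration, the accepted ways to point an example-upload entry point at its data; any fields beyond path, url, and local_path are omitted here, and the values are hypothetical:

# Provide either 'path' (bundled with the plugin) or 'url' (downloaded by load()),
# or a pre-resolved 'local_path'; 'path' and 'url' must not be combined.
bundled = ExampleUploadEntryPoint(path='example_uploads/getting_started')
remote = ExampleUploadEntryPoint(url='https://example.org/data.zip')
prepared = ExampleUploadEntryPoint(local_path='/tmp/already_downloaded.zip')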
......@@ -245,6 +247,10 @@ class ExampleUploadEntryPoint(EntryPoint):
startup.
"""
path = self.path
# If local path is already set, use it
if self.local_path:
return
# Create local path from given path or url
if not path and self.url:
final_folder = os.path.join(
get_package_path(self.plugin_package), 'example_uploads'
......@@ -465,7 +471,11 @@ class Parser(PythonPluginBase):
)
mainfile_contents_dict: Optional[dict] = Field(
description="""
Is used to match structured data files like JSON or HDF5.
Is used to match structured data files like JSON, HDF5, or csv/excel files. For a csv/excel file,
for example, to check whether certain columns exist in a given sheet, set this attribute to
`{'<sheet name>': {'__has_all_keys': [<column names>]}}`. If the csv/excel file contains comments
that should be ignored, add the reserved key-value pair
`'__comment_symbol': '<symbol>'` at the top level of the dict, next to the <sheet name>.
"""
)
supported_compressions: List[str] = Field(
......
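For instance, a matching rule following the pattern described above could look like this (sheet and column names are purely illustrative):

# Match csv/excel mainfiles that have a 'measurements' sheet containing both columns,
# ignoring lines that start with '#'.
mainfile_contents_dict = {
    'measurements': {'__has_all_keys': ['temperature', 'pressure']},
    '__comment_symbol': '#',
}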
......@@ -30,7 +30,6 @@ from nomad.metainfo.metainfo import (
MCategory,
MSection,
Quantity,
MProxy,
Capitalized,
Section,
Datetime,
......@@ -199,13 +198,24 @@ class UserReference(Reference):
return {'type_kind': 'User', 'type_data': 'User'}
def _normalize_impl(self, section, value):
# todo: need data validation
if isinstance(value, User):
return value
if isinstance(value, str):
return MProxy(value, m_proxy_section=section, m_proxy_type=self._proxy_type)
return value
try:
return User.get(value)
except Exception as _exc: # noqa
return value
raise ValueError(f'Cannot normalize {value}.')
def _serialize_impl(self, section, value):
return value.user_id
if isinstance(value, str):
return value
if isinstance(value, User):
return value.user_id
raise ValueError(f'Cannot serialize {value}.')
class AuthorReference(Reference):
......@@ -216,12 +226,23 @@ class AuthorReference(Reference):
return {'type_kind': 'Author', 'type_data': 'Author'}
def _normalize_impl(self, section, value):
# todo: need data validation
if isinstance(value, (str, dict)):
return MProxy(value, m_proxy_section=section, m_proxy_type=self._proxy_type)
return value
if isinstance(value, Author):
return value
if isinstance(value, dict):
return Author.m_from_dict(value)
if isinstance(value, str):
try:
return User.get(value)
except Exception as _exc: # noqa
return value
raise ValueError(f'Cannot normalize {value}.')
def _serialize_impl(self, section, value):
if isinstance(value, str):
return value
if isinstance(value, User):
return value.user_id
if isinstance(value, Author):
......