Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Source

Select target project
No results found

Target

Select target project
  • nomad-lab/nomad-FAIR
  • pgoel/nomad-FAIR
  • jpd47/nomad-FAIR
3 results
Show changes
Showing 2167 additions and 4953 deletions
......@@ -122,7 +122,7 @@ export const searchQuantities = window.nomadArtifacts.searchQuantities
export const metainfo = window.nomadArtifacts.metainfo
export const parserMetadata = window.nomadArtifacts.parserMetadata
export const toolkitMetadata = window.nomadArtifacts.toolkitMetadata
export const exampleUploads = window.nomadArtifacts.exampleUploads || {}
export const exampleUploads = {}
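// Example uploads are now collected from plugin entry points (below) instead of the
// pre-generated window.nomadArtifacts.exampleUploads object.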
Object.values(entry_points?.options || [])
.filter(entry_point => entry_point.entry_point_type === 'example_upload')
.forEach(entry_point => {
......
......@@ -13,7 +13,7 @@ fs:
plugins:
entry_points:
include:
- schema/simulation/run
- schema/simulation/workflow
- parsers/vasp
- runschema:run_schema_entry_point
- simulationworkflowschema:simulationworkflow_schema_entry_point
- electronicparsers:vasp_parser_entry_point
......@@ -83,7 +83,7 @@ nav:
- Data structure: explanation/data.md
- Processing: explanation/processing.md
- Architecture: explanation/architecture.md
- Why you need an Oasis: explanation/oasis.md
- Federation and Oasis: explanation/oasis.md
- Reference:
- reference/config.md
- reference/annotations.md
......@@ -156,4 +156,4 @@ extra_css:
extra_javascript:
- javascript.js
- https://polyfill.io/v3/polyfill.min.js?features=es6
- https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
\ No newline at end of file
- https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
......@@ -117,19 +117,24 @@ async def get_user_groups(
group_id: Optional[List[str]] = Query(
None, description='Search groups by their full id.'
),
user_id: Optional[str] = Query(
None, description='Search groups by their owner or members ids.'
),
search_terms: Optional[str] = Query(
None, description='Search groups by parts of their name.'
),
):
"""Get data about user groups."""
if group_id is not None and search_terms is not None:
if sum(param is not None for param in (group_id, user_id, search_terms)) > 1:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail='Only one of group_id or search_terms may be used at a time.',
detail='Only one of (group_id, user_id, search_terms) may be used at a time.',
)
if group_id is not None:
user_groups = MongoUserGroup.get_by_ids(group_id)
elif user_id is not None:
user_groups = MongoUserGroup.get_by_user_id(user_id)
elif search_terms is not None:
user_groups = MongoUserGroup.get_by_search_terms(search_terms)
else:
......
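The mutual-exclusion check above generalizes the previous two-parameter comparison. A minimal standalone sketch of the same idiom (the function and parameter names here are illustrative, not part of the API):

from typing import Optional


def ensure_at_most_one(**params: Optional[str]) -> None:
    # Counting non-None values scales to any number of mutually exclusive parameters.
    if sum(value is not None for value in params.values()) > 1:
        names = ', '.join(params)
        raise ValueError(f'Only one of ({names}) may be used at a time.')


ensure_at_most_one(group_id=None, user_id='1234', search_terms=None)  # passes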
......@@ -35,15 +35,14 @@ section annotations/categories.
from .storage import (
to_json,
write_archive,
read_archive,
ArchiveError,
ArchiveReader,
ArchiveWriter,
ArchiveDict,
ArchiveList,
ArchiveItem,
)
from .storage_v2 import write_archive
from .query import query_archive, filter_archive, ArchiveQueryError
from .partial import (
read_partial_archive_from_mongo,
......
......@@ -58,6 +58,7 @@ def convert_archive(
delete_old: bool = False,
counter: Counter = None,
force_repack: bool = False,
size_limit: int = -1,
):
"""
Convert an archive of the old format to the new format.
......@@ -83,6 +84,7 @@ def convert_archive(
delete_old (bool, optional): Whether to delete the old file after conversion. Defaults to False.
counter (Counter, optional): A counter to track the progress of the conversion. Defaults to None.
force_repack (bool, optional): Force repacking the archive that is already in the new format. Defaults to False.
size_limit (int, optional): The size limit in GB for the archive. Defaults to -1 (no limit).
"""
prefix: str = counter.increment() if counter else ''
......@@ -111,6 +113,13 @@ def convert_archive(
flush(f'{prefix} [ERROR] File already exists: {new_path}')
return
original_size = os.path.getsize(original_path)
if size_limit > 0 and original_size > size_limit * 1024**3:
flush(
f'{prefix} [WARNING] File size exceeds limit {size_limit} GB: {original_path}'
)
return
def safe_remove(path: str):
if not path:
return
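The guard above converts the GB limit to bytes before comparing. A minimal sketch of the same check in isolation (the function name is illustrative):

import os


def exceeds_size_limit(path: str, size_limit_gb: int) -> bool:
    # A non-positive limit disables the check, mirroring the -1 default above.
    if size_limit_gb <= 0:
        return False
    # 1 GB is treated as 1024**3 bytes, as in convert_archive.
    return os.path.getsize(path) > size_limit_gb * 1024**3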
......@@ -165,6 +174,7 @@ def convert_folder(
overwrite: bool = False,
delete_old: bool = False,
force_repack: bool = False,
size_limit: int = -1,
):
"""
Convert archives in the specified folder to the new format using parallel processing.
......@@ -181,6 +191,7 @@ def convert_folder(
overwrite (bool): Whether to overwrite existing files (default is False).
delete_old (bool): Whether to delete the old file after conversion (default is False).
force_repack (bool): Force repacking the archive (default is False).
size_limit (int): Size limit in GB for the archive (default is -1, no limit).
"""
file_list: list = []
......@@ -217,6 +228,7 @@ def convert_folder(
delete_old=delete_old,
counter=counter,
force_repack=force_repack,
size_limit=size_limit,
)
with ProcessPoolExecutor(max_workers=processes) as executor:
......@@ -242,6 +254,7 @@ def convert_upload(
overwrite: bool = False,
delete_old: bool = False,
force_repack: bool = False,
size_limit: int = -1,
):
"""
Function to convert an upload with the given upload_id to the new format.
......@@ -258,6 +271,7 @@ def convert_upload(
overwrite (bool, optional): Whether to overwrite existing files. Defaults to False.
delete_old (bool, optional): Whether to delete the old file after conversion. Defaults to False.
force_repack (bool, optional): Force repacking the existing archive (in new format). Defaults to False.
size_limit (int, optional): Size limit in GB for the archive. Defaults to -1 (no limit).
"""
if isinstance(uploads, (str, Upload)):
uploads = [uploads]
......@@ -289,6 +303,7 @@ def convert_upload(
overwrite=overwrite,
delete_old=delete_old,
force_repack=force_repack,
size_limit=size_limit,
)
......
......@@ -17,40 +17,26 @@
#
from __future__ import annotations
from typing import Iterable, Any, Tuple, Dict, BinaryIO, Union, List, cast, Generator
from typing import Any, Tuple, Dict, Union, cast, Generator
from io import BytesIO, BufferedReader
from collections.abc import Mapping, Sequence
import msgpack
from msgpack.fallback import Packer, StringIO
import struct
import json
from nomad import utils
from nomad.config import config
__packer = msgpack.Packer(autoreset=True, use_bin_type=True)
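# The constants below give every top-level TOC item a fixed packed size, so the TOC can
# be split into blocks of whole entries; a blocked reader can then seek to and unpack a
# single block instead of loading the entire TOC (the use_blocked_toc mode exercised in
# the benchmark at the bottom of this file).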
_toc_uuid_size = utils.default_hash_len + 1
_toc_item_size = _toc_uuid_size + 25 # packed(uuid + [10-byte-pos, 10-byte-pos])
_entries_per_block = config.archive.block_size // _toc_item_size
_bytes_per_block = _entries_per_block * _toc_item_size
def packb(o):
return __packer.pack(o)
def unpackb(o):
return msgpack.unpackb(o, raw=False)
def _encode(start: int, end: int) -> bytes:
return start.to_bytes(5, byteorder='little', signed=False) + end.to_bytes(
5, byteorder='little', signed=False
)
def _decode(position: bytes) -> Tuple[int, int]:
return int.from_bytes(
position[:5], byteorder='little', signed=False
......@@ -77,208 +63,6 @@ class ArchiveError(Exception):
pass
class TOCPacker(Packer):
"""
A special msgpack packer that records a TOC while packing.
Uses a combination of the pure python msgpack fallback packer and the "real"
c-based packing.
"""
def __init__(self, toc_depth: int, *args, **kwargs):
self.toc_depth = toc_depth
# noinspection PyTypeChecker
self.toc: Dict[str, Any] = None
self._depth = 0
# Because we cannot change msgpacks interface of _pack, this _stack is used to
# transfer the result of _pack calls in terms of the TOC.
self._stack: List[Any] = []
super().__init__(*args, **kwargs)
def _pos(self):
return self._buffer.getbuffer().nbytes
def _pack_list(self, obj, *args, **kwargs):
pack_result = super()._pack(obj, *args, **kwargs)
toc_result = []
# same assumption and condition as above
if len(obj) > 0 and isinstance(obj[0], dict):
for _ in obj:
toc_result.append(self._stack.pop())
self._stack.append(list(reversed(toc_result)))
return pack_result
def _pack_dict(self, obj, *args, **kwargs):
toc_result = {}
start = self._pos()
if self._depth >= self.toc_depth:
pack_result = self._buffer.write(packb(obj))
else:
self._depth += 1
pack_result = super()._pack(obj, *args, **kwargs)
self._depth -= 1
toc = {}
for key, value in reversed(list(obj.items())):
if isinstance(value, dict) or (
isinstance(value, (list, tuple))
and len(value) > 0
and isinstance(value[0], dict)
):
# assumes non emptiness and uniformity of array items
toc[key] = self._stack.pop()
toc_result['toc'] = {
key: value for key, value in reversed(list(toc.items()))
}
end = self._pos()
toc_result['pos'] = [start, end]
self._stack.append(toc_result)
return pack_result
def _pack(self, obj, *args, **kwargs):
if isinstance(obj, dict):
return self._pack_dict(obj, *args, **kwargs)
if isinstance(obj, list):
return self._pack_list(obj, *args, **kwargs)
return self._buffer.write(packb(obj))
def pack(self, obj):
assert isinstance(obj, dict), f'TOC packer can only pack dicts, {obj.__class__}'
self._depth = 0
self._buffer = StringIO()
result = super().pack(obj)
self.toc = self._stack.pop()
assert len(self._stack) == 0
return result
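A minimal usage sketch of the packer above (the nested dict and the toc_depth value are illustrative):

packer = TOCPacker(toc_depth=2)
packed = packer.pack({'run': {'system': {'atoms': [1, 2, 3]}}})
# packer.toc mirrors the dict structure down to toc_depth; each node carries the
# [start, end] byte range of the corresponding sub-object within the packed buffer.
run_start, run_end = packer.toc['toc']['run']['pos']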
class ArchiveWriter:
def __init__(
self, file_or_path: Union[str, BytesIO], n_entries: int, entry_toc_depth: int
):
self.file_or_path = file_or_path
self.n_entries = n_entries
self._pos = 0
# noinspection PyTypeChecker
self._toc_position: Tuple[int, int] = None
self._toc: Dict[str, Tuple[Tuple[int, int], Tuple[int, int]]] = {}
# noinspection PyTypeChecker
self._f: BinaryIO = None
self._toc_packer = TOCPacker(toc_depth=entry_toc_depth)
def __enter__(self):
if isinstance(self.file_or_path, str):
self._f = open(self.file_or_path, 'wb')
elif isinstance(self.file_or_path, BytesIO):
self._f = self.file_or_path
self._f.seek(0)
else:
raise ValueError('not a file or path')
# write empty placeholder header
self._write_map_header(3)
self._writeb('toc_pos')
self._writeb(_encode(0, 0))
self._writeb('toc')
toc_start, _ = self._write_map_header(self.n_entries)
_, toc_end = self._write(b'0' * _toc_item_size * self.n_entries)
self._toc_position = toc_start, toc_end
self._writeb('data')
self._write_map_header(self.n_entries)
return self
def __exit__(self, exc_type, exc_val, exc_tb):
if exc_val is not None:
raise exc_val
# go back and write the real TOC to the header
self._f.seek(0)
self._pos = 0
assert len(self._toc) == self.n_entries
toc_items = sorted(self._toc.items(), key=lambda item: item[0])
toc = {
uuid: [_encode(*positions[0]), _encode(*positions[1])]
for uuid, positions in toc_items
}
self._write_map_header(3)
self._writeb('toc_pos')
self._writeb(_encode(*self._toc_position))
self._writeb('toc')
toc_position = self._writeb(toc)
assert (
toc_position == self._toc_position
), f'{toc_position} - {self._toc_position}'
if isinstance(self.file_or_path, str):
self._f.close()
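# The three cases below mirror msgpack's map headers: fixmap (0x80 | n) for up to 15
# keys, map 16 (0xde + big-endian uint16 length) and map 32 (0xdf + big-endian uint32
# length) for larger dicts; e.g. msgpack.packb({'a': 1, 'b': 2}) also starts with 0x82.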
def _write_map_header(self, n):
if n <= 0x0F:
return self._write(struct.pack('B', 0x80 + n))
if n <= 0xFFFF:
return self._write(struct.pack('>BH', 0xDE, n))
if n <= 0xFFFFFFFF:
return self._write(struct.pack('>BI', 0xDF, n))
raise ValueError('Dict is too large')
def _write(self, b: bytes) -> Tuple[int, int]:
start = self._pos
self._pos += self._f.write(b)
return start, self._pos
def _writeb(self, obj):
return self._write(packb(obj))
def _write_entry(self, uuid: str, toc: dict, packed: bytes | Generator):
uuid = utils.adjust_uuid_size(uuid)
self._writeb(uuid)
self._write_map_header(2)
self._writeb('toc')
toc_pos = self._writeb(toc)
self._writeb('data')
if isinstance(packed, bytes):
data_pos = self._write(packed)
elif isinstance(packed, Generator):
start = self._pos
for chunk in packed:
self._pos += self._f.write(chunk)
data_pos = start, self._pos
else:
raise ValueError('Invalid type for packed data.')
self._toc[uuid] = (toc_pos, data_pos)
def add(self, uuid: str, data: Any) -> None:
self._toc_packer.reset()
packed = self._toc_packer.pack(data)
toc = self._toc_packer.toc
self._write_entry(uuid, toc, packed)
def add_raw(self, uuid: str, toc: dict, packed: Generator) -> None:
self._write_entry(uuid, toc, packed)
class ArchiveItem:
def __init__(self, f: BytesIO, offset: int = 0):
self._f = f
......@@ -528,112 +312,6 @@ class ArchiveReader(ArchiveDict):
return self._f.closed if isinstance(self._file_or_path, str) else True
def combine_archive(path: str, n_entries: int, data: Iterable[Tuple[str, Any]]):
if config.archive.use_new_writer:
from .storage_v2 import (
ArchiveWriter as ArchiveWriterNew,
ArchiveReader as ArchiveReaderNew,
)
with ArchiveWriterNew(
path, n_entries, toc_depth=config.archive.toc_depth
) as writer:
for uuid, reader in data:
if not reader:
writer.add(uuid, {})
elif isinstance(reader, ArchiveReaderNew):
toc, data = reader.get_raw(uuid)
writer.add_raw(uuid, toc, data)
else:
# rare case, old reader new writer, toc is not compatible, has to repack
writer.add(uuid, to_json(reader[uuid]))
else:
with ArchiveWriter(path, n_entries, entry_toc_depth=2) as writer:
for uuid, reader in data:
if not reader:
writer.add(uuid, {})
else:
toc, data = reader.get_raw(uuid)
writer.add_raw(uuid, toc, data)
def write_archive(
path_or_file: Union[str, BytesIO],
n_entries: int,
data: Iterable[Tuple[str, Any]],
entry_toc_depth: int = 2,
) -> None:
"""
Writes a msgpack-based archive file. The file contents will be a valid msgpack-object.
The data will contain extra table-of-contents (TOC) objects that map some keys to
positions in the file. Data can be partially read from these positions and deserialized
with msgpack.
The data in the archive file will have the following layout:
.. code-block:: python
{
'toc_pos': b[start, end],
'toc': {
entry_uuid: [b[start, end], b[start, end]], ...
},
'data': {
entry_uuid: {
'toc': {
key: {
'pos': [start, end],
'toc': ...
},
key: [
{
'pos': [start, end]
'toc': ...
}, ...
],
...
},
'data': ...
}, ...
}
}
The top-level TOC will map entry_uuids to positions. The key 'toc_pos' holds the
position of the entry TOC, the second ('toc') the position of each entry. These positions
will be absolute positions in the file. The top-level TOC will be ordered by entry_uuid.
The top-level TOC positions are encoded as two 5-byte integers. This gives the top-level TOC a
predictable layout and allows it to be read partially.
The TOC of each entry has the same structure as the data, up to a certain
TOC depth. A TOC object holds the position of the object it refers to (key 'pos')
and further, deeper TOC data (key 'toc'). Only data objects (dict instances)
have TOC objects, and only objects count towards the TOC depth. Positions in the entry
TOCs are regular msgpack-encoded integers.
Arguments:
path_or_file: A file path or file-like to the archive file that should be written.
n_entries: The number of entries that will be added to the file.
data: The file contents as an iterator of entry id, data tuples.
entry_toc_depth: The depth of the table of contents in each entry. Only objects will
count for calculating the depth.
"""
if config.archive.use_new_writer:
from .storage_v2 import ArchiveWriter as ArchiveWriterNew
with ArchiveWriterNew(
path_or_file, n_entries, toc_depth=entry_toc_depth
) as writer:
for uuid, entry in data:
writer.add(uuid, entry)
else:
with ArchiveWriter(
path_or_file, n_entries, entry_toc_depth=entry_toc_depth
) as writer:
for uuid, entry in data:
writer.add(uuid, entry)
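A minimal usage sketch of the two module-level helpers, modeled on the benchmark at the bottom of this file (the entry contents are illustrative):

from io import BytesIO
from nomad import utils

entries = [(utils.create_uuid(), {'run': {'system': {'atoms': n}}}) for n in (3, 5)]

buffer = BytesIO()
write_archive(buffer, len(entries), entries, entry_toc_depth=2)

# The TOC allows a single entry (and sub-sections of it) to be read without
# unpacking the whole archive.
with read_archive(BytesIO(buffer.getbuffer())) as archive:
    system = archive[entries[0][0]]['run']['system']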
def read_archive(file_or_path: Union[str, BytesIO], **kwargs) -> ArchiveReader:
"""
Allows reading a msgpack-based archive.
......@@ -676,57 +354,4 @@ def read_archive(file_or_path: Union[str, BytesIO], **kwargs) -> ArchiveReader:
if __name__ == '__main__':
def benchmark():
from time import time
import sys
with open('archive_test.json') as f:
example_data = json.load(f)
size = 5000 if len(sys.argv) == 1 else int(sys.argv[1])
access_every = 2
example_archive = [(utils.create_uuid(), example_data) for _ in range(0, size)]
example_uuid = example_archive[int(size / 2)][0]
# this impl
# create archive
start = time()
buffer = BytesIO()
write_archive(buffer, len(example_archive), example_archive, entry_toc_depth=2)
print('archive.py: create archive (1): ', time() - start)
# read single entry from archive
buffer = BytesIO(buffer.getbuffer())
for use_blocked_toc in [False, True]:
start = time()
for _ in range(0, 23):
read_archive(buffer, use_blocked_toc=use_blocked_toc)[example_uuid][
'run'
]['system']
print(
f'archive.py: access single entry system (23), blocked {use_blocked_toc:d}: ',
(time() - start) / 23,
)
# read every n-th entry from archive
buffer = BytesIO(buffer.getbuffer())
for use_blocked_toc in [False, True]:
start = time()
for _ in range(0, 23):
with read_archive(buffer, use_blocked_toc=use_blocked_toc) as data:
for i, entry in enumerate(example_archive):
if i % access_every == 0:
data[entry[0]]['run']['system']
print(
f'archive.py: access every {access_every:d}-ed entry single entry system (23), '
f'blocked {use_blocked_toc:d}: ',
(time() - start) / 23,
)
# just msgpack
start = time()
packb(example_archive)
print('msgpack: create archive (1): ', time() - start)
benchmark()
pass
......@@ -18,8 +18,8 @@
from __future__ import annotations
import struct
from collections.abc import Generator, Iterable
from io import BytesIO
from typing import Generator
import msgpack
from bitarray import bitarray
......@@ -373,7 +373,7 @@ class ArchiveWriter:
def to_json(v):
return v.to_json() if isinstance(v, ArchiveItem) else v
return v.to_json() if hasattr(v, 'to_json') else v
class ArchiveReadCounter:
......@@ -842,9 +842,25 @@ class ArchiveReader(ArchiveItem):
return self._full_cache
def combine_archive(path: str, n_entries: int, data: Iterable[tuple]):
with ArchiveWriter(path, n_entries, toc_depth=config.archive.toc_depth) as writer:
for uuid, reader in data:
if not reader:
writer.add(uuid, {})
elif isinstance(reader, ArchiveReader):
toc, data = reader.get_raw(uuid)
writer.add_raw(uuid, toc, data)
else:
# rare case, old reader new writer, toc is not compatible, has to repack
writer.add(uuid, to_json(reader[uuid]))
def write_archive(
path_or_file: str | BytesIO, data: list, toc_depth: int = config.archive.toc_depth
):
with ArchiveWriter(path_or_file, len(data), toc_depth=toc_depth) as writer:
path_or_file: str | BytesIO,
n_entries: int,
data: Iterable[tuple],
entry_toc_depth: int = config.archive.toc_depth,
) -> None:
with ArchiveWriter(path_or_file, n_entries, toc_depth=entry_toc_depth) as writer:
for uuid, entry in data:
writer.add(uuid, entry)
......@@ -24,5 +24,5 @@ Use it from the command line with ``nomad --help`` or ``python -m nomad.cli --he
more.
"""
from . import dev, parse, client, admin # noqa
from . import dev, parse, client, admin, clean # noqa
from .cli import run_cli, cli # noqa
......@@ -30,6 +30,7 @@ import bs4
import time
import os.path
import nomad.archive.storage_v2
from nomad import archive
from nomad.config import config
from nomad.archive import read_archive
......@@ -244,7 +245,7 @@ def update_springer(max_n_query: int = 10, retry_time: int = 120):
page += 1
archive.write_archive(
nomad.archive.storage_v2.write_archive(
config.normalize.springer_db_path,
len(sp_data),
sp_data.items(),
......
......@@ -1429,11 +1429,13 @@ def only_v1(path: str):
)
@click.option(
'--migrate',
'-m',
is_flag=True,
help='Only convert v1 archive files to v1.2 archive files.',
)
@click.option(
'--force-repack',
'-f',
is_flag=True,
help='Force repacking existing archives that are already in the new format',
)
......@@ -1444,9 +1446,16 @@ def only_v1(path: str):
default=os.cpu_count(),
help='Number of processes to use for conversion. Default is os.cpu_count().',
)
@click.option(
'--size-limit',
'-s',
type=int,
default=-1,
help='Only handle archives below the given size limit in GB. Default is -1 (no limit).',
)
@click.pass_context
def convert_archive(
ctx, uploads, overwrite, delete_old, migrate, force_repack, parallel
ctx, uploads, overwrite, delete_old, migrate, force_repack, parallel, size_limit
):
_, selected = _query_uploads(uploads, **ctx.obj.uploads_kwargs)
......@@ -1461,6 +1470,7 @@ def convert_archive(
if_include=only_v1,
processes=parallel,
force_repack=force_repack,
size_limit=size_limit,
)
else:
convert_upload(
......@@ -1469,4 +1479,5 @@ def convert_archive(
delete_old=delete_old,
processes=parallel,
force_repack=force_repack,
size_limit=size_limit,
)
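A hedged sketch of calling the converter directly with the new option; the upload id is hypothetical and convert_upload is the helper shown earlier in this diff:

# Convert one upload, skipping any archive file larger than 10 GB.
convert_upload('SomeUploadId', overwrite=True, delete_old=False, size_limit=10)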
#
# Copyright The NOMAD Authors.
#
# This file is part of NOMAD. See https://nomad-lab.eu for further info.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import click
from tqdm import tqdm
from .cli import cli
@cli.command(
help='Cleanse the given path by removing empty folders.',
name='clean',
)
@click.option(
'--path',
type=str,
help='Cleanse the given path by removing empty folders.',
)
def clean_staging(path):
if 'staging' not in path:
print('Path must contain "staging".')
return
print(f'Cleaning path: "{path}".')
print('Are you sure you want to continue? (y/N)', end=' ')
response = input()
if response.lower() != 'y':
print('Exiting...')
return
print('Cleaning...')
def safe_remove(_p):
try:
os.rmdir(_p)
except Exception: # noqa
pass
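# Walk bottom-up (topdown=False) so that parent folders which become empty once their
# children are removed are cleaned up in the same pass.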
for root, folders, _ in tqdm(os.walk(path, topdown=False)):
for folder in folders:
if not os.listdir(full_path := os.path.join(root, folder)): # noqa
safe_remove(full_path)
......@@ -87,7 +87,6 @@ def get_gui_artifacts_js() -> str:
'searchQuantities': _generate_search_quantities(),
'metainfo': _generate_metainfo(all_metainfo_packages),
'parserMetadata': code_metadata,
'exampleUploads': _generate_example_upload_metadata(),
'northTools': {k: v.dict() for k, v in config.north.tools.filtered_items()},
'unitList': unit_list_json,
'unitPrefixes': prefixes_json,
......@@ -272,32 +271,6 @@ def gui_config():
print(get_gui_config())
def _generate_example_upload_metadata():
import yaml
example_uploads_path = 'examples/data/uploads/example_uploads.yml'
if not os.path.exists(example_uploads_path):
example_uploads_path = os.path.join(
os.path.dirname(__file__),
'../../',
'examples/data/uploads/example_uploads.yml',
)
if not os.path.exists(example_uploads_path):
raise FileNotFoundError('Cannot find example_uploads.yml file')
with open(example_uploads_path, 'r') as infile:
return yaml.load(infile, Loader=yaml.SafeLoader)
@dev.command(
help='Generates a JSON file from example-uploads metadata in the YAML file.'
)
def example_upload_metadata():
print(json.dumps(_generate_example_upload_metadata(), indent=2))
@dev.command(
help=(
'Updates parser`s README files by combining a general template with '
......
......@@ -879,7 +879,6 @@ class Archive(ConfigBaseModel):
""",
)
toc_depth = Field(10, description='Depth of the table of contents in the archive.')
use_new_writer = True # todo: to be removed
small_obj_optimization_threshold = Field(
1 * 2**20,
description="""
......@@ -1142,15 +1141,20 @@ class Config(ConfigBaseModel):
for key, plugin in _plugins['entry_points']['options'].items():
if key not in plugin_entry_point_ids:
plugin_config = load_plugin_yaml(key, plugin)
plugin_config['id'] = key
plugin_class = {
'parser': Parser,
'normalizer': Normalizer,
'schema': Schema,
}.get(plugin_config['plugin_type'])
_plugins['entry_points']['options'][key] = plugin_class.parse_obj(
plugin_config
)
# Handle new style plugins that are declared directly in nomad.yaml
if plugin.get('entry_point_type') and not plugin.get('id'):
plugin['id'] = key
# Update information for old style plugins
else:
plugin_config = load_plugin_yaml(key, plugin)
plugin_config['id'] = key
plugin_class = {
'parser': Parser,
'normalizer': Normalizer,
'schema': Schema,
}.get(plugin_config['plugin_type'])
_plugins['entry_points']['options'][key] = (
plugin_class.parse_obj(plugin_config)
)
self.plugins = Plugins.parse_obj(_plugins)
......@@ -123,7 +123,7 @@ class ParserEntryPoint(EntryPoint, metaclass=ABCMeta):
level will attempt to match raw files first.
""",
)
aliases: List[str] = Field([], description="""List of alternative parser names.""")
mainfile_contents_re: Optional[str] = Field(
description="""
A regular expression that is applied the content of a potential mainfile.
......@@ -215,7 +215,8 @@ class ExampleUploadEntryPoint(EntryPoint):
local_path: Optional[str] = Field(
description="""
The final path to use when creating the upload. This field will be
automatically generated by the 'load' function.
automatically generated by the 'load' function and is typically not set
manually.
"""
)
......@@ -224,10 +225,11 @@ class ExampleUploadEntryPoint(EntryPoint):
"""Checks that only either path or url is given."""
path = values.get('path')
url = values.get('url')
local_path = values.get('local_path')
if path and url:
raise ValueError('Provide only "path" or "url", not both.')
if not path and not url:
raise ValueError('Provide either "path" or "url".')
if not path and not url and not local_path:
raise ValueError('Provide one of "path", "url" or "local_path".')
return values
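As a hedged illustration, the accepted ways to point an example-upload entry point at its data; any fields beyond path, url, and local_path are omitted here, and the values are hypothetical:

# Provide either 'path' (bundled with the plugin) or 'url' (downloaded by load()),
# or a pre-resolved 'local_path'; 'path' and 'url' must not be combined.
bundled = ExampleUploadEntryPoint(path='example_uploads/getting_started')
remote = ExampleUploadEntryPoint(url='https://example.org/data.zip')
prepared = ExampleUploadEntryPoint(local_path='/tmp/already_downloaded.zip')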
......@@ -245,6 +247,10 @@ class ExampleUploadEntryPoint(EntryPoint):
startup.
"""
path = self.path
# If local path is already set, use it
if self.local_path:
return
# Create local path from given path or url
if not path and self.url:
final_folder = os.path.join(
get_package_path(self.plugin_package), 'example_uploads'
......@@ -465,7 +471,11 @@ class Parser(PythonPluginBase):
)
mainfile_contents_dict: Optional[dict] = Field(
description="""
Is used to match structured data files like JSON or HDF5.
Is used to match structured data files like JSON, HDF5, or csv/excel files. For a csv/excel file,
for example, to check whether certain columns exist in a given sheet, set this attribute to
`{'<sheet name>': {'__has_all_keys': [<column names>]}}`. If the csv/excel file contains comments
that should be ignored, add the reserved key-value pair
`'__comment_symbol': '<symbol>'` at the top level of the dict, next to the <sheet name>.
"""
)
supported_compressions: List[str] = Field(
......
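For instance, a matching rule following the pattern described above could look like this (sheet and column names are purely illustrative):

# Match csv/excel mainfiles that have a 'measurements' sheet containing both columns,
# ignoring lines that start with '#'.
mainfile_contents_dict = {
    'measurements': {'__has_all_keys': ['temperature', 'pressure']},
    '__comment_symbol': '#',
}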
......@@ -30,7 +30,6 @@ from nomad.metainfo.metainfo import (
MCategory,
MSection,
Quantity,
MProxy,
Capitalized,
Section,
Datetime,
......@@ -199,13 +198,24 @@ class UserReference(Reference):
return {'type_kind': 'User', 'type_data': 'User'}
def _normalize_impl(self, section, value):
# todo: need data validation
if isinstance(value, User):
return value
if isinstance(value, str):
return MProxy(value, m_proxy_section=section, m_proxy_type=self._proxy_type)
return value
try:
return User.get(value)
except Exception as _exc: # noqa
return value
raise ValueError(f'Cannot normalize {value}.')
def _serialize_impl(self, section, value):
return value.user_id
if isinstance(value, str):
return value
if isinstance(value, User):
return value.user_id
raise ValueError(f'Cannot serialize {value}.')
class AuthorReference(Reference):
......@@ -216,12 +226,23 @@ class AuthorReference(Reference):
return {'type_kind': 'Author', 'type_data': 'Author'}
def _normalize_impl(self, section, value):
# todo: need data validation
if isinstance(value, (str, dict)):
return MProxy(value, m_proxy_section=section, m_proxy_type=self._proxy_type)
return value
if isinstance(value, Author):
return value
if isinstance(value, dict):
return Author.m_from_dict(value)
if isinstance(value, str):
try:
return User.get(value)
except Exception as _exc: # noqa
return value
raise ValueError(f'Cannot normalize {value}.')
def _serialize_impl(self, section, value):
if isinstance(value, str):
return value
if isinstance(value, User):
return value.user_id
if isinstance(value, Author):
......