diff --git a/nomad/archive_library/README.MD b/docs/archive_tutorial.md similarity index 86% rename from nomad/archive_library/README.MD rename to docs/archive_tutorial.md index cc90b676c7263fa0acd5f19b263383b51ed5cbe0..01324d091c7ffb823d83350c5cd01a763670ef92 100644 --- a/nomad/archive_library/README.MD +++ b/docs/archive_tutorial.md @@ -1,4 +1,4 @@ -# New archive implementation +# Archive API tutorial This document contains tutorials for using the new archive query functionality. It uses the new metainfo definition for the archive data. In addition, the archive data @@ -46,15 +46,12 @@ parameters are downloaded. The results are then expressed in the new metainfo sc which offers an auto-completion feature, among others. ## Msgpack container -The archive data are now stored in a binary format called msgpack. The archive data are -fragmented and upon query will access only the relevant fragment without loading the whole -archives collection. This is beneficial when one only query small chunks but will approach the -efficiency of zip files when one accesses the whole archive. To create a msgpack database +The archive data are now stored in a binary format called msgpack. To create a msgpack database from the archive data and query it, one uses ArchiveFileDB. ```python from nomad.archive_library.filedb import ArchiveFileDB -db = ArchiveFileDB('archive.msg', mode='w', max_lfragment=2) +db = ArchiveFileDB('archive.msg', mode='w', entry_toc_depth=2) db.add_data({'calc1':{'secA': {'subsecA': {'propA': 1.0}}, 'secB': {'propB': 'X'}}}) db.add_data({'calc2':{'secA': {'subsecA': {'propA': 2.0}}, 'secB': {'propB': 'Y'}}}) db.close() @@ -64,4 +61,4 @@ db.query({'calc1':{'secA': None}}) ``` In the example, we first create a database in 'archive.msg'; the data added to it are fragmented down to the subsection level. We then reload it for reading and query all entries -under 'secA' of 'calc1'. \ No newline at end of file +under 'secA' of 'calc1'. diff --git a/docs/index.rst b/docs/index.rst index 2dbc41f6f172ec50cb893b71580b9abf27a332d1..3b447e5c7fcaeac92d2a56ec8130c70eedd61dc1 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -14,5 +14,6 @@ and infrastructure with a simplified architecture and consolidated code base.
api metainfo parser_tutorial + archive_tutorial reference ops diff --git a/nomad/app/api/archive.py b/nomad/app/api/archive.py index 03029d304fb626b51f5e9345dcc908c8c206cdf1..28375d2710f7261ea3efc73cb53899be063fcdc2 100644 --- a/nomad/app/api/archive.py +++ b/nomad/app/api/archive.py @@ -30,9 +30,8 @@ import nomad_meta_info from nomad.files import UploadFiles, Restricted from nomad import search, config -from nomad.archive_library.utils import get_dbs -from nomad import search, config from nomad.app import common +from nomad.archive import ArchiveFileDB from .auth import authenticate, create_authorization_predicate from .api import api @@ -228,6 +227,17 @@ _archive_query_model['Scroll'] = _archive_query_model.pop('scroll') _archive_query_model['Pagination'] = _archive_query_model.pop('pagination') +def get_dbs(upload_id): + upload_files = UploadFiles.get(upload_id, create_authorization_predicate(upload_id)) + + if upload_files is None: + return [] + + files = upload_files.archive_file_msg(',') + msgdbs = [ArchiveFileDB(f) for f in files if f is not None] + return msgdbs + + @ns.route('/query') class ArchiveQueryResource(Resource): @api.doc('post_archive_query') diff --git a/nomad/archive.py b/nomad/archive.py index 40938334e34974ae919fd7fa2b121bc89e11125a..fdfb1911ee815344c39a19b6d441378b7573cb67 100644 --- a/nomad/archive.py +++ b/nomad/archive.py @@ -1,11 +1,12 @@ from typing import Iterable, Any, Tuple, Dict, BinaryIO, Union, List, cast -from io import BytesIO +from io import BytesIO, BufferedReader from collections.abc import Mapping, Sequence import msgpack from msgpack.fallback import Packer, StringIO import struct import json import math +import os.path from nomad import utils @@ -98,6 +99,10 @@ class TOCPacker(Packer): return pack_result +_toc_uuid_size = utils.default_hash_len + 1 +_toc_item_size = _toc_uuid_size + 25 # packed(uuid + [10-byte-pos, 10-byte-pos]) + + class ArchiveWriter: def __init__(self, file_or_path: Union[str, BytesIO], n_entries: int, entry_toc_depth: int): self.file_or_path = file_or_path @@ -119,17 +124,13 @@ class ArchiveWriter: raise ValueError('not a file or path') # write empty placeholder header - toc_item_length = len(packb(utils.create_uuid())) - toc_item_length += len(packb([ - ArchiveWriter._encode_position(0, 0), ArchiveWriter._encode_position(0, 0)])) - self._write_map_header(3) self.write(packb('toc_pos')) self.write(packb(ArchiveWriter._encode_position(0, 0))) self.write(packb('toc')) toc_start, _ = self._write_map_header(self.n_entries) - _, toc_end = self.write(b'0' * toc_item_length * self.n_entries) + _, toc_end = self.write(b'0' * _toc_item_size * self.n_entries) self._toc_position = toc_start, toc_end self.write(packb('data')) @@ -162,7 +163,7 @@ class ArchiveWriter: self.write(packb('toc')) toc_position = self.write(packb(toc, use_bin_type=True)) - assert toc_position == self._toc_position + assert toc_position == self._toc_position, '%s - %s' % (toc_position, self._toc_position) if isinstance(self.file_or_path, str): self._f.close() @@ -272,6 +273,8 @@ class ArchiveReader(ArchiveObject): f = cast(BytesIO, open(self.file_or_path, 'rb')) elif isinstance(self.file_or_path, BytesIO): f = self.file_or_path + elif isinstance(self.file_or_path, BufferedReader): + f = self.file_or_path else: raise ValueError('not a file or path') @@ -309,10 +312,8 @@ class ArchiveReader(ArchiveObject): return self fs_block_size = 4096 - toc_uuid_size = 23 - toc_entry_size = 48 - toc_block_size_entries = math.ceil(4096 / 48) - toc_block_size_bytes = math.ceil(4096 
/ 48) * 48 + toc_block_size_entries = math.ceil(4096 / _toc_item_size) + toc_block_size_bytes = math.ceil(4096 / 54) * _toc_item_size def _load_toc_block(self, i_entry: int): i_block = math.floor(i_entry / ArchiveReader.toc_block_size_entries) @@ -325,9 +326,9 @@ class ArchiveReader(ArchiveObject): self._n_toc - i_block * ArchiveReader.toc_block_size_entries) for i in range(0, toc_block_size_entries): - offset = i * ArchiveReader.toc_entry_size - entry_uuid = unpackb(block_data[offset:offset + ArchiveReader.toc_uuid_size]) - positions_encoded = unpackb(block_data[offset + ArchiveReader.toc_uuid_size:offset + ArchiveReader.toc_entry_size]) + offset = i * _toc_item_size + entry_uuid = unpackb(block_data[offset:offset + _toc_uuid_size]) + positions_encoded = unpackb(block_data[offset + _toc_uuid_size:offset + _toc_item_size]) positions = ( ArchiveReader._decode_position(positions_encoded[0]), ArchiveReader._decode_position(positions_encoded[1])) @@ -468,13 +469,161 @@ def read_archive(file_or_path: str, **kwargs) -> ArchiveReader: return ArchiveReader(file_or_path, **kwargs) +class ArchiveFileDB: + def __init__(self, fileio: Union[str, BytesIO], mode: str = 'r', entry_toc_depth: int = 2, **kwargs): + self._fileobj = fileio + self._mode = mode + self._entry_toc_depth = entry_toc_depth + self._data: Dict[str, Any] = {} + self._key_length = utils.default_hash_len + self._db = None + self._ids: List[str] = [] + self._infokey = self._adjust_key('INFO', 'X') + + def write(self, abspath: str, relpath: str): + """ + Mimic the zipfile function to write files to database. + Arguments: + abspath: The absolute path to the file to be read + relpath: For compatibility with zipfile + """ + self.add_data(abspath) + + def close(self, save: bool = True): + """ + Mimic the zipfile function to close the msgpack file. + Will trigger the creation of the database when in write mode. + Arguments: + save: If True will add the current data in memory to database + """ + if 'w' in self._mode: + self.create_db() + if isinstance(self._fileobj, BytesIO) and save: + self._fileobj.close() + self._fileobj = None + + def create_db(self): + with ArchiveWriter(self._fileobj, len(self._data) + 1, self._entry_toc_depth) as db: + for key, val in self._data.items(): + key = self._adjust_key(key) + self._ids.append(key) + db.add(key, val) + db.add(self._infokey, dict(ids=self._ids, entry_toc_depth=self._entry_toc_depth)) + + def _adjust_key(self, key: str, fill_with: str = ' '): + key = key.rjust(self._key_length, fill_with) + assert len(key) == self._key_length + return key + + def add_data(self, data: Union[str, Dict[str, Any], List[Union[str, Dict]]]): + """ + Add data to the msgpack database. 
+ Arguments: + data: Can be a filename or dictionary or list of both + """ + if isinstance(data, str): + key = os.path.basename(data) + if data.endswith('json'): + uid = key.split('.')[0] + val = json.load(open(data)) + if isinstance(val, dict): + self._data[uid] = val + + else: + try: + uid = key.split('.')[0] + dtype = key.split('.')[-1] + val = open(data).read() + if dtype not in self._data: + self._data[dtype] = {} + if val: + self._data[dtype].update({uid: val}) + + except Exception: + pass + + elif isinstance(data, dict): + for key, val in data.items(): + if val: + self._data[key] = val + + elif isinstance(data, list): + for i in range(len(data)): + self.add_data(data[i]) + + else: + raise ValueError + + @property + def ids(self): + if not self._ids: + with ArchiveReader(self._fileobj) as db: + self._ids = db[self._infokey]['ids'] + return self._ids + + @staticmethod + def _get_index(key: str) -> Union[Tuple[int, int], int]: + key = key.strip() + bracket = key.find('[') + if bracket <= 0: + return None + + assert key[-1] == ']' + str_index = key[bracket + 1: -1] + if ':' in str_index: + lo_str, hi_str = str_index.split(':') + lo = int(lo_str) if lo_str else 0 + hi = int(hi_str) if hi_str else 10000000 + return lo, hi + + else: + # if db structure should be maintained, return lo, lo + 1 + # if conform with python indexing, return lo + lo = int(str_index) + return lo + + def _load_data(self, qdict: Dict[str, Any], db: ArchiveObject, main_section: bool = False): + if not isinstance(qdict, dict): + if isinstance(db, ArchiveObject): + return db.to_dict() + elif isinstance(db, ArchiveList): + return list(db) + elif isinstance(db, ArchiveItem): + return dict(db) + else: + return db + + res = {} + for key, val in qdict.items(): + index = ArchiveFileDB._get_index(key) + dbkey = key.split('[')[0] + if main_section: + dbkey = self._adjust_key(dbkey) + try: + if index is None: + res[key] = self._load_data(val, db[dbkey]) + elif isinstance(index, int): + res[key] = self._load_data(val, db[dbkey])[index] + else: + res[key] = self._load_data(val, db[dbkey])[index[0]: index[1]] + + except Exception: + continue + + return res + + def query(self, qdict): + with ArchiveReader(self._fileobj) as db: + return self._load_data(qdict, db, True) + + if __name__ == '__main__': def benchmark(): from time import time import sys - with open('local/test_be.json') as f: + with open('archive_test.json') as f: example_data = json.load(f) size = 5000 if len(sys.argv) == 1 else int(sys.argv[1]) @@ -513,30 +662,4 @@ if __name__ == '__main__': packb(example_archive) print('msgpack: create archive (1): ', time() - start) - # v0.8.0 impl - from nomad.archive_library import filedb - start = time() - buffer = BytesIO() - db = filedb.ArchiveFileDB(buffer, mode='w', max_lfragment=3) - db.add_data({ - uuid: data for uuid, data in example_archive}) - db.close(save=False) - print('filedb.py: create archive (1): ', time() - start) - - buffer = BytesIO(buffer.getbuffer()) - start = time() - for _ in range(0, 23): - db = filedb.ArchiveFileDB(buffer, mode='r', max_lfragment=3) - db.get_docs(db.ids[example_uuid + '/section_run/section_system'][0]) - print('filedb.py: access single entry system (23): ', (time() - start) / 23) - - buffer = BytesIO(buffer.getbuffer()) - start = time() - db = filedb.ArchiveFileDB(buffer, mode='r', max_lfragment=3) - for _ in range(0, 23): - for i, entry in enumerate(example_archive): - if i % access_every == 0: - db.get_docs(db.ids[entry[0] + '/section_run/section_system'][0]) - print('filedb.py: access 
every %d-ed entry single entry system (23): ' % access_every, (time() - start) / 23) - benchmark() diff --git a/nomad/archive_library/filedb.py b/nomad/archive_library/filedb.py deleted file mode 100644 index 92361a964bf7668227ee1c3313d4e8710d14f8ea..0000000000000000000000000000000000000000 --- a/nomad/archive_library/filedb.py +++ /dev/null @@ -1,496 +0,0 @@ -# Copyright 2019 Alvin Noe Ladines, Markus Scheidgen -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an"AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Module for storage of archive data using the msgpack module. - -In module ``ArchiveFileDB, the data are fragmented and saved as a list -in a msgpack file. -A component can be retrieved by giving an offset to the msgpack file to -be unpacked. The database can then be queried by giving a schema similar -to the archive data. - -To build the database, - -.. code-block:: python - db = ArchiveFileDB("db.msg", mode='w', max_lfragment=3) - db.add_data(["archive1.json", "archive2.json"]) - db.close() - -To query the database, - -.. code-block:: python - db = ArchiveFileDB("db.msg", mode='r') - db.query({'idX':{'sectionX':{'propertyX':'*'}}}) - db.close() -""" -import msgpack -import json -import os -import re -from io import StringIO, BufferedWriter, BufferedReader, BytesIO -from typing import Union, Dict, List, Any, IO - - -_PLACEHOLDER = '*' - - -class JSONdata: - """ - Provides a graphQL-style query for a given json data and query schema. 
- Arguments: - data: The json data to be queried - """ - def __init__(self, data: Dict[str, Any]): - self.data = self._merge_list(data) - - def _merge_list(self, data: Dict[str, Any]) -> Dict[str, Any]: - if not isinstance(data, dict): - return data - - merged: Dict[str, Any] = {} - main_keys = [] - for key, val in data.items(): - bracket = key.find('[') - val = self._merge_list(val) - if bracket > 0: - index = int(key[bracket + 1:].strip().rstrip(']')) - main_key = key[:bracket] - if main_key not in merged: - main_keys.append(main_key) - merged[main_key] = {} - merged[main_key][index] = val - else: - merged[key] = val - - for key in main_keys: - merged[key] = [merged[key][n] for n in sorted(merged[key].keys())] - return merged - - def _get_index(self, str_index: str, nval_ref: int): - str_index = str_index.strip().lstrip('[').rstrip(']') - if ':' in str_index: - lo_str, hi_str = str_index.split(':') - lo = int(lo_str) if lo_str else 0 - hi = int(hi_str) if hi_str else 0 - lo = nval_ref + lo if lo < 0 else lo - hi = nval_ref + hi if hi <= 0 else hi - if hi > nval_ref: - hi = nval_ref - if lo > hi or lo < 0 or hi < 0: - return - if lo > hi or lo < 0 or hi < 0: - return - index = list(range(lo, hi)) - else: - index = [int(str_index)] - return index - - def get_data(self, entry: Dict[str, Any], ref=None) -> Dict[str, Any]: - out_data: Dict[str, Any] = {} - if ref is None: - ref = self.data - - if not isinstance(entry, dict): - return ref - - for key, val in entry.items(): - index = None - bracket = key.find('[') - if bracket > 0: - str_index = key[bracket:] - key = key[:bracket] - index = self._get_index(str_index, len(ref[key])) - - if key not in ref: - continue - - if index is None: - out_data[key] = self.get_data(val, ref[key]) - else: - try: - data = [self.get_data(val, ref[key][n]) for n in index] - out_data[key] = data - except Exception: - continue - - out_data = self._merge_list(out_data) - - return out_data - - -class ArchiveFileDB: - """ - An interface to the messagepack module to provide an searchable - container of archive data. 
- Arguments: - fileio: can be a string or file object to read/write the msgpack file - mode: r/w to indicate read or write the msgpack file - max_lfragment: the maximum level for which the archive data will - be fragmented for more efficient unpacking of msgpack components - """ - def __init__(self, fileio: Union[str, IO, BytesIO], mode: str = 'r', max_lfragment: int = None): - self._fileobj = fileio - if isinstance(fileio, str) or isinstance(fileio, BytesIO): - self._mode = mode - elif isinstance(fileio, BufferedReader): - self._mode = 'rb' - elif isinstance(fileio, BufferedWriter): - self._mode = 'wb' - else: - raise TypeError - self._max_lfragment = max_lfragment - if 'w' in self._mode and self._max_lfragment is None: - self._max_lfragment = 2 - self._ids = None - self._data: Dict[str, Any] = {} - - @property - def max_lfragment(self) -> int: - if self._max_lfragment is None: - orig_mode = self.mode - self.mode = 'rb' - self._max_lfragment = self.get_docs('max_lfragment') - self.mode = orig_mode - return self._max_lfragment - - def _fragment_json(self, data: Dict[str, Any], key='', cur_lfragment=0) -> List[Dict[str, Dict]]: - if cur_lfragment >= self.max_lfragment: - pass - - elif isinstance(data, list): - res: List[Dict[str, Any]] = [] - main = dict(path=key, data=[]) - for i in range(len(data)): - if not isinstance(data[i], dict): - break - p = '%s[%d]' % (key, i) - res += self._fragment_json(data[i], p, cur_lfragment) - main['data'].append(p) - res += [main] - return res - - elif isinstance(data, dict): - res = [] - cur_lfragment += 1 - main = dict(path=key, data=[]) - for k, v in data.items(): - p = os.path.join(key, k) - res += self._fragment_json(v, p, cur_lfragment) - main['data'].append(p) - res += [main] - return res - - return [dict(path=key, data={os.path.basename(key): data})] - - def write(self, abspath: str, relpath: str): - """ - Mimic the zipfile function to write files to database. - Arguments: - abspath: The absolute path to the file to be read - relpath: For compatibility with zipfile - """ - self.add_data(abspath) - - def close(self, save=True): - """ - Mimic the zipfile function to close the msgpack file. - Will trigger the creation of the database when in write mode. - Arguments: - save: If True will add the current data in memory to database - """ - if 'w' in self._mode: - self.create_db() - if self._fileobj and save: - self._fileobj.close() - self._fileobj = None - - def save(self): - """ - Commits current data in memory to database - """ - self.create_db() - - def add_data(self, data: Union[str, Dict[str, Any], List[Union[str, Dict]]]): - """ - Add data to the msgpack database. - Arguments: - data: Can be a filename or dictionary or list of both - """ - if isinstance(data, str): - key = os.path.basename(data) - if data.endswith('json'): - key = key.split('.')[0] - val = json.load(open(data)) - if val: - self._data[key] = val - else: - key = key.replace('.', '_') - val = open(data).read() - if val: - self._data[key] = val - - elif isinstance(data, dict): - for key, val in data.items(): - if val: - self._data[key] = val - - elif isinstance(data, list): - for i in range(len(data)): - self.add_data(data[i]) - - else: - raise NotImplementedError - - def _load_data(self): - orig_mode = self.mode - self.mode = 'rb' - self.fileobj.seek(0) - data_loaded = msgpack.load(self.fileobj) - self.mode = orig_mode - return data_loaded - - def create_db(self): - """ - Creates the database and writes it to the msgpack file. 
- The database consists of the list of the fragmented data - and the list of footers such as the ids of the data. - """ - # data to be added in memory - data = self._data - # segment the data, each entry is a dict with 'path' and 'data' values - entries = self._fragment_json(data) - - # initialize packer - packer = msgpack.Packer() - cur_pointer = 0 - - # make space for header to write offset to toc - cur_pointer += self.fileobj.write(packer.pack(' ')) - - # write data to msgpack and get pointers - pointers = {} - for entry in entries: - path = entry['path'] - data = entry['data'] - pointers[path] = [cur_pointer] - cur_pointer += self.fileobj.write(packer.pack(data)) - - # add fragmentation level info - pointers['max_lfragment'] = cur_pointer - cur_pointer += self.fileobj.write(packer.pack(self.max_lfragment)) - - # add toc - pointers['ids'] = cur_pointer - self.fileobj.write(packer.pack(pointers)) - # write offset to toc at start - self.fileobj.seek(0) - self.fileobj.write(packer.pack(cur_pointer)) - - self._data = {} - - def _reduce_to_section(self, entry: Dict[str, Any], cur_lfragment=0) -> Union[Dict[str, Any], str, None]: - if not isinstance(entry, dict): - return entry - - cur_lfragment += 1 - if cur_lfragment > self.max_lfragment: - return _PLACEHOLDER - - new_dict = {} - for key, val in entry.items(): - v = self._reduce_to_section(val, cur_lfragment) - new_dict[key] = v - return new_dict - - @staticmethod - def to_list_path_str(entries: Dict[str, Any], root: str = '', paths: List = []) -> Union[List[str], None]: - if not isinstance(entries, dict): - return None - - if len(paths) > 0: - paths.remove(root) - - for key, val in entries.items(): - p = os.path.join(root, key) - paths.append(p) - ArchiveFileDB.to_list_path_str(val, p, paths) - - return list(paths) - - @staticmethod - def to_nested_dict(path_str: Union[str, List]) -> Dict[str, Any]: - if isinstance(path_str, str): - path_str = path_str.split('/') - - if len(path_str) == 1: - return {path_str[0]: _PLACEHOLDER} - else: - pdict = {} - pdict[path_str[0]] = ArchiveFileDB.to_nested_dict(path_str[1:]) - return pdict - - @staticmethod - def append_data(entry: Dict[str, Any], val: Any) -> Dict[str, Any]: - for k, v in entry.items(): - if v == _PLACEHOLDER or v is None: - entry[k] = val - else: - entry[k] = ArchiveFileDB.append_data(v, val) - return entry - - @staticmethod - def merge_dict(dict0: Dict[str, Any], dict1: Dict[str, Any]) -> Dict[str, Any]: - if not isinstance(dict1, dict) or not isinstance(dict0, dict): - return dict1 - - for k, v in dict1.items(): - if k in dict0: - dict0[k] = ArchiveFileDB.merge_dict(dict0[k], v) - else: - dict0[k] = v - return dict0 - - @property - def mode(self) -> str: - if 'b' not in self._mode: - self._mode += 'b' - return self._mode - - @mode.setter - def mode(self, m: str): - if self.mode == m or isinstance(self._fileobj, str): - return - self._mode = m - if self._fileobj and not isinstance(self._fileobj, BytesIO): - self._fileobj.close() - self._fileobj = None - - @property - def fileobj(self) -> IO: - if self._fileobj is None or isinstance(self._fileobj, str): - mode = self.mode - self._fileobj = open(self._fileobj, mode) - return self._fileobj - - def get_docs(self, key: Union[int, str]) -> Any: - """ - Provides an entry in the database. 
- Arguments: - key: int to indicate the offset for unpacking the - msgpack file or a string corresponding to the id of the entry - """ - if isinstance(key, str): - # get offset to toc - self.fileobj.seek(0) - unpacker = msgpack.Unpacker(self.fileobj, raw=False) - index = unpacker.unpack() - # load toc - self.fileobj.seek(index) - unpacker = msgpack.Unpacker(self.fileobj, raw=False) - info = unpacker.unpack() - # get offset to key - offset = info.get(key, None) - if offset is None: - return - else: - offset = key - - self.fileobj.seek(offset) - unpacker = msgpack.Unpacker(self.fileobj, raw=False) - res = unpacker.unpack() - return res - - @property - def ids(self) -> Dict[str, Union[List, int]]: - if self._ids is None: - self._ids = self.get_docs('ids') - return self._ids - - def _query_path(self, path_str: str, path_index: int) -> Union[Dict[str, Any], None]: - data = self.get_docs(path_index) - if isinstance(data, dict): - entry = ArchiveFileDB.to_nested_dict(path_str) - return ArchiveFileDB.append_data(entry, list(data.values())[0]) - - elif isinstance(data, list): - data_all: Dict[str, Any] = {} - for p in data: - data_all = ArchiveFileDB.merge_dict(data_all, self._query(p)) - return data_all - - else: - return None - - def _query(self, path_str: str) -> Union[Dict[str, Any], None]: - data: Dict[str, Any] = {} - # if section list is not fragmented, remove dangling index - if '[' in path_str[-3:]: - path_str = path_str[:-3] - - if "[:]" in path_str: - # if path_str contains a wildcard, get all - path_re = path_str.replace('[:]', '...') - for path in self.ids: - if not re.search(path_re, path): - continue - data = ArchiveFileDB.merge_dict(data, self._query(path)) - return data - else: - path_indexes = self.ids.get(path_str, []) - - if isinstance(path_indexes, int): - path_indexes = [path_indexes] - if len(path_indexes) == 0: - return None - - for path_index in path_indexes: - datai = self._query_path(path_str, path_index) - data = ArchiveFileDB.merge_dict(data, datai) - - return data - - def query(self, entries: Dict[str, Any], dtype='dict') -> Union[Dict[str, Any], List, IO, str]: - """ - Queries the database given a schema. - Arguments: - entries: A dictionary with a schema similar to the database - entries but containing null values corresponding to the - desired quantity. - dtype: format of the outfile can be file, string, dict - """ - reduced_entries = self._reduce_to_section(entries) - if not isinstance(reduced_entries, dict): - return {} - - path_strs = ArchiveFileDB.to_list_path_str(reduced_entries, root='', paths=[]) - - data_to_query: Dict[str, Any] = {} - for path_str in path_strs: - data_entry = self._query(path_str) - if data_entry: - data_to_query = ArchiveFileDB.merge_dict(data_to_query, data_entry) - - if data_to_query: - jdata = JSONdata(data_to_query) - result = jdata.get_data(entries) - else: - return {} - - if dtype == 'file': - return StringIO(json.dumps(result, indent=4)) - elif dtype == 'string': - return json.dumps(result) - else: - return result diff --git a/nomad/archive_library/query.py b/nomad/archive_library/query.py deleted file mode 100644 index 88d33c9e31f80514acfe947efde8c8ff47ef09db..0000000000000000000000000000000000000000 --- a/nomad/archive_library/query.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright 2019 Alvin Noe Ladines, Markus Scheidgen -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an"AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Interface to archive api. A dict of query parameters and a query schema similar to -the archive json format can be provided to filter archive. - -q = ArchiveQuery({'atoms':'Si'}) -metainfo = q.query() - -for c in metainfo.calcs: - print(c.section_run.section_single_configuration_calculation[0]({'energy_total':None})) -""" - -import requests -import os.path -from urllib.parse import urlparse - -from nomad import config as nomad_config -from nomad.archive_library.metainfo import ArchiveMetainfo -from nomad.cli.client.client import KeycloakAuthenticator - - -class ArchiveQuery: - def __init__(self, *args, **kwargs): - self._archive_path = 'archive' - self._query_path = 'query' - self.archive_data = [] - self._scroll_id = None - self._page = None - self._query_params = {} - if args: - self._query_params = args[0] - if kwargs: - self._query_params.update(kwargs) - self._archive_schema = self._query_params.pop('archive_data', None) - if not isinstance(self._archive_schema, list): - self._archive_schema = [self._archive_schema] - self._max_n_pages = self._query_params.pop('max_n_pages', 100000) - self._authentication = self._query_params.pop('authentication', None) - self._url = self._query_params.pop('url', None) - self._user = self._query_params.pop('user', None) - self._password = self._query_params.pop('password', None) - if self._url: - nomad_config.client.url = self._url - if self._user: - nomad_config.client.user = self._user - if self._password: - nomad_config.client.password = self._password - - def _get_value(self, name, in_dict): - if not isinstance(in_dict, dict): - return - for key, val in in_dict.items(): - if key == name: - res = val - else: - res = self._get_value(name, val) - return res - - def _set_value(self, name, value, in_dict): - if not isinstance(in_dict, dict): - return - for key, val in in_dict.items(): - if key == name: - in_dict[name] = value - return - else: - self._set_value(name, value, val) - in_dict[name] = value - - def _get_authentication(self): - if self._authentication is None: - host = urlparse(nomad_config.client.url).netloc.split(':')[0] - self._authentication = KeycloakAuthenticator( - host=host, - user=nomad_config.client.user, - password=nomad_config.client.password, - server_url=nomad_config.keycloak.server_external_url, - realm_name=nomad_config.keycloak.realm_name, - client_id=nomad_config.keycloak.public_client_id) - if isinstance(self._authentication, KeycloakAuthenticator): - return self._authentication.apply() - else: - return self._authentication - - def _api_query(self): - url = os.path.join(nomad_config.client.url, self._archive_path, self._query_path) - data = self._query_params - data['results'] = self._archive_schema - - if self._page is not None: - # increment the page number - self._set_value('page', self._page + 1, data) - if self._scroll_id is not None: - self._set_value('scroll_id', self._scroll_id, data) - - response = requests.post(url, headers=self._get_authentication(), json=data) - if response.status_code != 200: - raise Exception('Query returned %s' % response.status_code) - - data = response.json - if not 
isinstance(data, dict): - data = data() - - results = data.get('results', []) - scroll = data.get('Scroll', None) - if scroll: - self._scroll_id = scroll.get('scroll_id', None) - pagination = data.get('Pagination', None) - if pagination: - self._page = pagination.get('page', None) - - return results - - def _get_archive_data(self): - n_page = 0 - while True: - results = self._api_query() - self.archive_data += results - n_page += 1 - if n_page >= self._max_n_pages: - break - if len(results) == 0: - break - - def query(self): - self._get_archive_data() - if self.archive_data: - self.metainfo = ArchiveMetainfo(archive_data=self.archive_data, archive_schema='*') diff --git a/nomad/archive_library/utils.py b/nomad/archive_library/utils.py deleted file mode 100644 index de3deaf8ff9ffadef39b8de85fb589f3bc944570..0000000000000000000000000000000000000000 --- a/nomad/archive_library/utils.py +++ /dev/null @@ -1,14 +0,0 @@ -from nomad.files import UploadFiles -from nomad.app.api.auth import create_authorization_predicate -from nomad.archive_library.filedb import ArchiveFileDB - - -def get_dbs(upload_id): - upload_files = UploadFiles.get(upload_id, create_authorization_predicate(upload_id)) - - if upload_files is None: - return [] - - files = upload_files.archive_file_msg('X') - msgdbs = [ArchiveFileDB(f) for f in files if f is not None] - return msgdbs diff --git a/nomad/archive_library/metainfo.py b/nomad/archive_query.py similarity index 61% rename from nomad/archive_library/metainfo.py rename to nomad/archive_query.py index 132a0d0ba9e6cecc3fb096b4e63babbd5bec5d83..ae5a1ee77102c19d612b825f9d73ae0cab46ba72 100644 --- a/nomad/archive_library/metainfo.py +++ b/nomad/archive_query.py @@ -13,7 +13,7 @@ # limitations under the License. """ -Module for converting archive data into the new metainfo format. +Contains interfaces to the archive metainfo and query. In module ``ArchiveMetainfo``, the data is provided either from raw json data or as a filename of an existing msgpack database. The metainfo @@ -23,14 +23,30 @@ can then queried by providing a schema. am = ArchiveMetainfo("db.msg") for calc in am.calcs: c.section_run.section_single_configuration_calculation[0]({'energy_total':None}) + +The ArchiveQuery enables a query interface to the archive data. A dict of query parameters +and a query schema similar to the archive json format can be provided to filter archive. + +.. 
code-block: python + q = ArchiveQuery({'atoms':'Si'}) + metainfo = q.query() + for c in metainfo.calcs: + print(c.section_run.section_single_configuration_calculation[0]({'energy_total':'*'})) """ import numpy as np from io import BytesIO +import json +import requests +import os.path +from urllib.parse import urlparse +from typing import Dict, List, Any, Union from nomad.metainfo import MSection, Quantity, SubSection from nomad.metainfo.metainfo import MObjectMeta -from nomad.archive_library.filedb import ArchiveFileDB +from nomad.archive import ArchiveFileDB +from nomad import config as nomad_config +from nomad.cli.client.client import KeycloakAuthenticator class ArchiveMetainfo: @@ -72,13 +88,17 @@ class ArchiveMetainfo: db.create_db() self._archive_db = db + @property + def archive_schema(self): + return json.loads(json.dumps(self._archive_schema)) + def _init_calcs(self): for i in range(len(self.calc_ids)): calc_id = self.calc_ids[i] if self._archive_schema is None: self._calcs[calc_id] = self.base_metainfo else: - data = self._archive_db.query({calc_id: self._archive_schema})[calc_id] + data = self._archive_db.query({calc_id: self.archive_schema})[calc_id] self._calcs[calc_id] = self.base_metacls.m_from_dict(data) self._calcs[calc_id].archive_db = self._archive_db @@ -118,6 +138,27 @@ class ArchiveMetainfo: calc.archive_db = self._archive_db return calc + @staticmethod + def to_nested_dict(path_str: Union[str, List]) -> Dict[str, Any]: + if isinstance(path_str, str): + path_str = path_str.split('/') + + if len(path_str) == 1: + return {path_str[0]: '*'} + else: + pdict = {} + pdict[path_str[0]] = ArchiveMetainfo.to_nested_dict(path_str[1:]) + return pdict + + @staticmethod + def append_data(entry: Dict[str, Any], val: Any) -> Dict[str, Any]: + for k, v in entry.items(): + if not isinstance(v, dict): + entry[k] = val + else: + entry[k] = ArchiveMetainfo.append_data(v, val) + return entry + @staticmethod def get_path_from_section(content): path = content.m_path() @@ -136,7 +177,7 @@ class ArchiveMetainfo: db = content.m_root().archive_db calc_id = content.m_root().calc_id root = calc_id + ArchiveMetainfo.get_path_from_section(content) - qs = ArchiveFileDB.append_data(ArchiveFileDB.to_nested_dict(root), qschema) + qs = ArchiveMetainfo.append_data(ArchiveMetainfo.to_nested_dict(root), qschema) data = db.query(qs) return data @@ -156,10 +197,7 @@ class ArchiveMetainfo: @property def calc_ids(self): if not self._calc_ids: - ids = self._archive_db.ids - ids = [idx for idx in ids if '/' not in idx and idx] - ids = [idx for idx in ids if idx not in ['ids', 'max_lfragment']] - self._calc_ids = ids + self._calc_ids = [s.strip() for s in self._archive_db.ids] return self._calc_ids def _nullify_metainfo(self, metainfo): @@ -185,7 +223,7 @@ class ArchiveMetainfo: def base_data(self): if self._base_data is None: calc_id = self.calc_ids[0] - self._base_data = self._archive_db.query({calc_id: self._archive_schema})[calc_id] + self._base_data = self._archive_db.query({calc_id: self.archive_schema})[calc_id] return self._base_data @property @@ -280,3 +318,112 @@ class ArchiveMetainfo: if data is None: data = self._archive_data self.metainfo = self.base_metacls.m_from_dict(data) + + +class ArchiveQuery: + def __init__(self, *args, **kwargs): + self._archive_path = 'archive' + self._query_path = 'query' + self.archive_data = [] + self._scroll_id = None + self._page = None + self._query_params = {} + if args: + self._query_params = args[0] + if kwargs: + self._query_params.update(kwargs) + 
self._archive_schema = self._query_params.pop('archive_data', None) + if not isinstance(self._archive_schema, list): + self._archive_schema = [self._archive_schema] + self._max_n_pages = self._query_params.pop('max_n_pages', 100000) + self._authentication = self._query_params.pop('authentication', None) + self._url = self._query_params.pop('url', None) + self._user = self._query_params.pop('user', None) + self._password = self._query_params.pop('password', None) + if self._url: + nomad_config.client.url = self._url + if self._user: + nomad_config.client.user = self._user + if self._password: + nomad_config.client.password = self._password + + def _get_value(self, name, in_dict): + if not isinstance(in_dict, dict): + return + for key, val in in_dict.items(): + if key == name: + res = val + else: + res = self._get_value(name, val) + return res + + def _set_value(self, name, value, in_dict): + if not isinstance(in_dict, dict): + return + for key, val in in_dict.items(): + if key == name: + in_dict[name] = value + return + else: + self._set_value(name, value, val) + in_dict[name] = value + + def _get_authentication(self): + if self._authentication is None: + host = urlparse(nomad_config.client.url).netloc.split(':')[0] + self._authentication = KeycloakAuthenticator( + host=host, + user=nomad_config.client.user, + password=nomad_config.client.password, + server_url=nomad_config.keycloak.server_external_url, + realm_name=nomad_config.keycloak.realm_name, + client_id=nomad_config.keycloak.public_client_id) + if isinstance(self._authentication, KeycloakAuthenticator): + return self._authentication.apply() + else: + return self._authentication + + def _api_query(self): + url = os.path.join(nomad_config.client.url, self._archive_path, self._query_path) + data = self._query_params + data['results'] = self._archive_schema + + if self._page is not None: + # increment the page number + self._set_value('page', self._page + 1, data) + if self._scroll_id is not None: + self._set_value('scroll_id', self._scroll_id, data) + + response = requests.post(url, headers=self._get_authentication(), json=data) + if response.status_code != 200: + raise response.raise_for_status() + + data = response.json + if not isinstance(data, dict): + data = data() + + results = data.get('results', []) + scroll = data.get('Scroll', None) + if scroll: + self._scroll_id = scroll.get('scroll_id', None) + pagination = data.get('Pagination', None) + if pagination: + self._page = pagination.get('page', None) + + return results + + def _get_archive_data(self): + n_page = 0 + while True: + results = self._api_query() + self.archive_data += results + n_page += 1 + if n_page >= self._max_n_pages: + break + if len(results) == 0: + break + + def query(self): + self._get_archive_data() + if self.archive_data: + self.metainfo = ArchiveMetainfo(archive_data=self.archive_data, archive_schema='*') diff --git a/nomad/files.py b/nomad/files.py index 3b7407323619b3b0370a1bd7b83e8961a44be9de..73882ab3c2cf9bfe5c8872442933e7a4269ba35e 100644 --- a/nomad/files.py +++ b/nomad/files.py @@ -61,7 +61,7 @@ import pickle from nomad import config, utils from nomad.datamodel import UploadWithMetadata -from nomad.archive_library.filedb import ArchiveFileDB +from nomad.archive import ArchiveFileDB # TODO this should become obsolete, once we are going beyong python 3.6. 
For now # python 3.6's zipfile does not allow to seek/tell within a file-like opened from a diff --git a/nomad/normalizing/data/springer_msgpack.py b/nomad/normalizing/data/springer_msgpack.py index 158d7203ba52bd2da0e4d84fdf3f193a591a415a..0ceca12421e6aa7e314a9eaa113228e9f1f641a1 100644 --- a/nomad/normalizing/data/springer_msgpack.py +++ b/nomad/normalizing/data/springer_msgpack.py @@ -28,7 +28,7 @@ import re import os from bs4 import BeautifulSoup -from nomad.archive_library.filedb import ArchiveFileDB +from nomad.archive import ArchiveFileDB DB_NAME = '.springer.msg' @@ -187,6 +187,18 @@ def parse(htmltext): return results +def _merge_dict(dict0, dict1): + if not isinstance(dict1, dict) or not isinstance(dict0, dict): + return dict1 + + for k, v in dict1.items(): + if k in dict0: + dict0[k] = _merge_dict(dict0[k], v) + else: + dict0[k] = v + return dict0 + + def download_entries(formula, space_group_number): """ Downloads the springer quantities related to a structure from springer. @@ -216,7 +228,7 @@ def download_entries(formula, space_group_number): compound = data.get('compound_classes', None) classification = data.get('classification', None) entry = dict(id=id, aformula=aformula, url=path, compound=compound, classification=classification) - entries = ArchiveFileDB.merge_dict(entries, {str(space_group_number): {normalized_formula: {id: entry}}}) + entries = _merge_dict(entries, {str(space_group_number): {normalized_formula: {id: entry}}}) return entries diff --git a/tests/app/test_api.py b/tests/app/test_api.py index 6da865ea8c0d61822766da892f1e0e19e7a6c64b..01c836b15aa9fcc979adadca6c34976cbfedb0d4 100644 --- a/tests/app/test_api.py +++ b/tests/app/test_api.py @@ -31,7 +31,6 @@ from nomad import search, parsing, files, config, utils, infrastructure from nomad.files import UploadFiles, PublicUploadFiles from nomad.processing import Upload, Calc, SUCCESS from nomad.datamodel import UploadWithMetadata, CalcWithMetadata, User, Dataset -from nomad.archive_library.filedb import _PLACEHOLDER from tests.conftest import create_auth_headers, clear_elastic, create_test_structure from tests.test_files import example_file, example_file_mainfile, example_file_contents @@ -670,7 +669,7 @@ class TestArchive(UploadFilesBasedTests): assert_zip_file(rv, files=1) def test_post_archive_query(self, api, published_wo_user_metadata): - schema = {"section_run": {"section_single_configuration_calculation": {"energy_total": _PLACEHOLDER}}} + schema = {"section_run": {"section_single_configuration_calculation": {"energy_total": '*'}}} data = {'results': [schema], 'per_page': 5} uri = '/archive/query' rv = api.post(uri, content_type='application/json', data=json.dumps(data)) diff --git a/tests/test_archive.py b/tests/test_archive.py index 57bb8be123ec6ac25a30779f16f5f31ad7dfcf55..d33d133622f9b95afe468c3d74ac10ac9ecc8e24 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -4,12 +4,16 @@ import msgpack from io import BytesIO from nomad import utils -from nomad.archive import TOCPacker, write_archive, read_archive, ArchiveReader +from nomad.archive import TOCPacker, write_archive, read_archive, ArchiveReader, ArchiveFileDB + + +def create_example_uuid(index: int = 0): + return ('{:%dd}' % utils.default_hash_len).format(index) @pytest.fixture(scope='session') def example_uuid(): - return '0' * len(utils.create_uuid()) + return create_example_uuid() @pytest.fixture(scope='session') @@ -76,6 +80,7 @@ def test_toc_packer(example_entry): def test_write_archive_single(example_uuid, example_entry): f = 
BytesIO() + print('#', len(example_uuid)) write_archive(f, 1, [(example_uuid, example_entry)]) packed_archive = f.getbuffer() archive = _unpack(packed_archive) @@ -102,7 +107,7 @@ def test_write_archive_single(example_uuid, example_entry): def test_write_archive_multi(example_uuid, example_entry): f = BytesIO() - example_uuids = utils.create_uuid(), utils.create_uuid() + example_uuids = create_example_uuid(0), create_example_uuid(1) write_archive(f, 2, [ (example_uuids[0], example_entry), (example_uuids[1], example_entry)]) @@ -143,18 +148,39 @@ def test_read_archive_multi(example_uuid, example_entry, use_blocked_toc): f = BytesIO() write_archive( f, archive_size, - [('{:22d}'.format(i), example_entry) for i in range(0, archive_size)]) + [(create_example_uuid(i), example_entry) for i in range(0, archive_size)]) packed_archive = f.getbuffer() f = BytesIO(packed_archive) with ArchiveReader(f, use_blocked_toc=use_blocked_toc) as reader: if use_blocked_toc: reader._load_toc_block(0) - assert reader._toc.get('{:22d}'.format(0)) is not None + assert reader._toc.get(create_example_uuid(0)) is not None assert len(reader._toc) == ArchiveReader.toc_block_size_entries reader._load_toc_block(archive_size - 1) - assert reader._toc.get('{:22d}'.format(archive_size - 1)) is not None + assert reader._toc.get(create_example_uuid(archive_size - 1)) is not None assert len(reader._toc) > ArchiveReader.toc_block_size_entries for i in range(0, archive_size): - reader.get('{:22d}'.format(i)) is not None + reader.get(create_example_uuid(i)) is not None + + +def test_archivefiledb(): + payload = [ + {'calc1': { + 'secA': {'subsecA1': [{'propA1a': 1.0}]}, 'secB': {'propB1a': ['a', 'b']}}}, + {'calc2': { + 'secA': {'subsecA1': [{'propA1a': 2.0}]}, 'secB': {'propB1a': ['c', 'd']}}}] + + f = BytesIO() + msgdb = ArchiveFileDB(f, mode='w', entry_toc_depth=1) + msgdb.add_data(payload) + msgdb.close(save=False) + + f = BytesIO(f.getbuffer()) + msgdb = ArchiveFileDB(f, mode='r') + + assert msgdb.query({'calc1': '*'}) == payload[0] + assert msgdb.query({'calc2': {'secA': {'subsecA1[0]': '*'}}}) == {'calc2': {'secA': {'subsecA1[0]': {'propA1a': 2.0}}}} + + msgdb.close() diff --git a/tests/test_archive_library.py b/tests/test_archive_library.py index f9cfb888c67c0dff536b119f495542e43465ca06..11b47eedbab6a8e1385bf9b115c0b14ae90a1c1e 100644 --- a/tests/test_archive_library.py +++ b/tests/test_archive_library.py @@ -1,9 +1,8 @@ import pytest import os -from nomad.archive_library.filedb import ArchiveFileDB -from nomad.archive_library.metainfo import ArchiveMetainfo -from nomad.archive_library.query import ArchiveQuery +from nomad.archive import ArchiveFileDB +from nomad.archive_query import ArchiveQuery, ArchiveMetainfo from tests.app.test_app import BlueprintClient @@ -11,7 +10,7 @@ from tests.app.test_app import BlueprintClient def example_msgdb(): def create_msgdb(payload): filename = 'archive_test.msg' - msgdbo = ArchiveFileDB(filename, mode='w', max_lfragment=1) + msgdbo = ArchiveFileDB(filename, mode='w', entry_toc_depth=1) msgdbo.add_data(payload) msgdbo.close() msgdbo = ArchiveFileDB(filename, mode='r') @@ -22,73 +21,6 @@ def example_msgdb(): os.remove(filename) -class TestArchiveFileDB: - def get_value(self, data, key): - if key in data: - return data[key] - if isinstance(data, list): - for i in range(len(data)): - return self.get_value(data[i], key) - else: - for v in data.values(): - return self.get_value(v, key) - - def get_keys(self, data, key=''): - if data is None: - return [key] - keys = [] - for k, v in 
data.items(): - keys += self.get_keys(v, k) - return keys - - @pytest.mark.parametrize('payload', [ - [ - 'tests/data/proc/examples_archive/3Sqa0yIQnBrAautsn38YNhyZrOoE.json', - 'tests/data/proc/examples_archive/3Sqa0yIQnBrAautsn38YNhyZrOoE.log'], - [ - {'secA': {'propA': 'X'}}]]) - def test_pack(self, example_msgdb, payload): - fo = example_msgdb(payload) - assert fo is not None - - @pytest.mark.parametrize('schema, dtype', [ - ({'secA': {'subsecA1[0]': {'propA1a': None}}}, {'propA1a': float}), - ({'secB': {'propB1a': None}}, {'propB1a': list}), - ({'secA': {'subsecA1[-1:]': {'propA1a': None}}}, {'propA1a': float})]) - def test_query(self, example_msgdb, schema, dtype): - payload = [ - {'calc1': { - 'secA': {'subsecA1': [{'propA1a': 1.0}]}, 'secB': {'propB1a': ['a', 'b']}}}, - {'calc2': { - 'secA': {'subsecA1': [{'propA1a': 2.0}]}, 'secB': {'propB1a': ['c', 'd']}}}] - msgdb = example_msgdb(payload) - calc_ids = msgdb.ids.keys() - calc_ids = [c for c in calc_ids if not os.path.dirname(c)] - calc_ids = [c for c in calc_ids if not c.endswith('log') and c] - calc_ids = [c for c in calc_ids if c not in ['ids', 'max_lfragment']] - qs = {calc_id: schema for calc_id in calc_ids} - results = msgdb.query(qs) - assert len(results) == len(calc_ids) - for calc_id in results: - for key in self.get_keys(schema): - assert(isinstance(self.get_value(results[calc_id], key), dtype[key])) - - def test_error(self, example_msgdb): - vals = [ - [{'atom_labels': ['X']}, {'atom_labels': ['X', 'X']}], - [{'atom_labels': ['X']}], {'atom_labels': 'X'}] - payload = [{'calc_%d' % i: {'section_run': {'section_system': vals[i]}}} for i in range(len(vals))] - msgdb = example_msgdb(payload) - # invalid key - qs = {'calc_%d' % i: {'sction_run': {'section_system[:]': {'atom_labels': None}}} for i in range(len(vals))} - results = msgdb.query(qs) - assert results == {'calc_%d' % i: {} for i in range(len(vals))} - # invalid calculation - qs = {'calc_100': None} - results = msgdb.query(qs) - assert results == {} - - class TestArchiveMetainfo: @pytest.fixture(scope='function') def data(self): @@ -98,12 +30,12 @@ class TestArchiveMetainfo: def assert_metainfo(self, metainfo): for calc in metainfo.calcs: - assert calc.secA({'propA': None}) is not None - assert calc({'secA': {'propA': None, 'propB': None}}) is not None + assert calc.secA({'propA': '*'}) is not None + assert calc({'secA': {'propA': '*', 'propB': '*'}}) is not None def test_query_from_file(self, data, example_msgdb): _ = example_msgdb(data) - metainfo = ArchiveMetainfo(archive_data='archive_test.msg') + metainfo = ArchiveMetainfo(archive_data='archive_test.msg', archive_schema={'secA': '*'}) self.assert_metainfo(metainfo) def test_query_from_data(self, data): @@ -117,18 +49,18 @@ class TestArchiveQuery: monkeypatch.setattr('nomad.config.client.url', '') return BlueprintClient(client, '/api') - def test_query_from_json(self, api, published_wo_user_metadata, other_test_user_auth, monkeypatch): - monkeypatch.setattr('nomad.archive_library.query.requests', api) + def test_query_from_json(self, api, published_wo_user_metadata, test_user_auth, monkeypatch): + monkeypatch.setattr('nomad.archive_query.requests', api) q_params = {'Pagination': {'order': 1, 'per_page': 5}} - q_schema = {'section_entry_info': None} - q = ArchiveQuery(q_params, archive_data=q_schema, authentication=other_test_user_auth) + q_schema = {'section_entry_info': '*'} + q = ArchiveQuery(q_params, archive_data=q_schema, authentication=test_user_auth) q.query() for calc in q.metainfo: assert 
calc.section_entry_info.calc_id is not None def test_query_from_kwargs(self, api, published_wo_user_metadata, other_test_user_auth, monkeypatch): - monkeypatch.setattr('nomad.archive_library.query.requests', api) - q_schema = {'section_entry_info': None} + monkeypatch.setattr('nomad.archive_query.requests', api) + q_schema = {'section_entry_info': '*'} q = ArchiveQuery(order=1, per_page=5, scroll=True, archive_data=q_schema, authentication=other_test_user_auth) q.query() for calc in q.metainfo:
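For reference, a minimal end-to-end sketch of the consolidated `ArchiveFileDB` API that this change moves into `nomad.archive`. It mirrors the renamed tutorial and the new `test_archivefiledb` test above; the in-memory `BytesIO` buffer and the example payload are illustrative only.

```python
from io import BytesIO

from nomad.archive import ArchiveFileDB

# example payload with two entries, as used in tests/test_archive.py
payload = [
    {'calc1': {'secA': {'subsecA1': [{'propA1a': 1.0}]}, 'secB': {'propB1a': ['a', 'b']}}},
    {'calc2': {'secA': {'subsecA1': [{'propA1a': 2.0}]}, 'secB': {'propB1a': ['c', 'd']}}}]

# write: each entry is fragmented one sub-section level deep (entry_toc_depth=1)
buffer = BytesIO()
db = ArchiveFileDB(buffer, mode='w', entry_toc_depth=1)
db.add_data(payload)
db.close(save=False)  # save=False keeps the in-memory buffer open for re-reading

# read: '*' marks the values to return, bracket syntax selects list items
db = ArchiveFileDB(BytesIO(buffer.getbuffer()), mode='r')
assert db.query({'calc1': '*'}) == payload[0]
assert db.query({'calc2': {'secA': {'subsecA1[0]': '*'}}}) == {
    'calc2': {'secA': {'subsecA1[0]': {'propA1a': 2.0}}}}
db.close()
```

The `entry_toc_depth` argument replaces the old `max_lfragment` and controls how deep the per-entry table of contents goes, i.e. how small a fragment can be read without unpacking the whole entry.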
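Likewise, a sketch of the client-side `ArchiveQuery` moved into `nomad.archive_query`, following the module docstring and the tests above. It assumes a reachable NOMAD API and credentials configured in `nomad.config.client` (or passed via the `url`, `user`, `password` keyword arguments), and that the query returns at least one result.

```python
from nomad.archive_query import ArchiveQuery

# query parameters plus an archive schema; '*' marks the quantities to retrieve
query = ArchiveQuery(
    {'Pagination': {'order': 1, 'per_page': 5}},
    archive_data={'section_entry_info': '*'})

query.query()  # pages through /archive/query and builds the metainfo
for calc in query.metainfo:
    print(calc.section_entry_info.calc_id)
```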