diff --git a/gui/src/components/entry/RepoEntryView.js b/gui/src/components/entry/RepoEntryView.js index 52a5c71a2fb5a4ce8b0d105af0b1a04089f5fdfe..91dc6859d345d3bec876d026719df2b283b94956 100644 --- a/gui/src/components/entry/RepoEntryView.js +++ b/gui/src/components/entry/RepoEntryView.js @@ -146,12 +146,12 @@ class RepoEntryView extends React.Component { <Quantity quantity="upload_id" label='upload id' {...quantityProps} noWrap withClipboard /> <Quantity quantity="upload_time" label='upload time' noWrap {...quantityProps} > <Typography noWrap> - {new Date(calcData.upload_time * 1000).toLocaleString()} + {new Date(calcData.upload_time).toLocaleString()} </Typography> </Quantity> <Quantity quantity="last_processing" label='last processing' loading={loading} placeholder="not processed" noWrap {...quantityProps}> <Typography noWrap> - {new Date(calcData.last_processing * 1000).toLocaleString()} + {new Date(calcData.last_processing).toLocaleString()} </Typography> </Quantity> <Quantity quantity="last_processing" label='processing version' loading={loading} noWrap placeholder="not processed" {...quantityProps}> diff --git a/nomad/app/api/archive.py b/nomad/app/api/archive.py index fd9685720ab8ac3f7da8879cce25dfdde6e09039..2bbce2321673c3feb995eab27050adacc4854d11 100644 --- a/nomad/app/api/archive.py +++ b/nomad/app/api/archive.py @@ -26,7 +26,6 @@ import json import orjson import importlib import urllib.parse -from collections.abc import Mapping import metainfo @@ -100,9 +99,10 @@ class ArchiveCalcResource(Resource): try: with upload_files.read_archive(calc_id) as archive: return { - key: value.to_dict() - for key, value in archive[calc_id].items() - if isinstance(value, Mapping)} + key: value + for key, value in archive[calc_id].to_dict().items() + if key != 'processing_logs'} + except Restricted: abort(401, message='Not authorized to access %s/%s.' 
% (upload_id, calc_id)) except KeyError: diff --git a/nomad/app/api/repo.py b/nomad/app/api/repo.py index 6ddfcc689f783df423d892d26d45f11e5593d887..18f5937b022191a4f6b1596262f805a9f08b1463 100644 --- a/nomad/app/api/repo.py +++ b/nomad/app/api/repo.py @@ -25,7 +25,7 @@ from elasticsearch.exceptions import NotFoundError import elasticsearch.helpers from datetime import datetime -from nomad import search, utils, datamodel, processing as proc, infrastructure +from nomad import search, utils, datamodel, processing as proc, infrastructure, files from nomad.metainfo import search_extension from nomad.datamodel import Dataset, User, EditableUserMetadata from nomad.app import common @@ -315,6 +315,7 @@ def edit(parsed_query: Dict[str, Any], mongo_update: Dict[str, Any] = None, re_i apply_search_parameters(search_request, parsed_query) upload_ids = set() calc_ids = [] + for hit in search_request.execute_scan(): calc_ids.append(hit['calc_id']) upload_ids.add(hit['upload_id']) @@ -330,12 +331,24 @@ def edit(parsed_query: Dict[str, Any], mongo_update: Dict[str, Any] = None, re_i with utils.timer(common.logger, 'edit elastic update executed', size=len(calc_ids)): if re_index: def elastic_updates(): + upload_files_cache: Dict[str, files.UploadFiles] = dict() + for calc in proc.Calc.objects(calc_id__in=calc_ids): - entry_metadata = datamodel.EntryMetadata.m_from_dict(calc['metadata']) + upload_id = calc.upload_id + upload_files = upload_files_cache.get(upload_id) + if upload_files is None: + upload_files = files.UploadFiles.get(upload_id, is_authorized=lambda: True) + upload_files_cache[upload_id] = upload_files + + entry_metadata = calc.entry_metadata(upload_files) entry = entry_metadata.a_elastic.create_index_entry().to_dict(include_meta=True) entry['_op_type'] = 'index' + yield entry + for upload_files in upload_files_cache.values(): + upload_files.close() + _, failed = elasticsearch.helpers.bulk( infrastructure.elastic_client, elastic_updates(), stats_only=True) search.refresh() diff --git a/nomad/app/api/upload.py b/nomad/app/api/upload.py index f84aa6012f616e3436d99699edb4ff9ca19bb249..be05ee57b904e18b28df96e2f9400886a76e017c 100644 --- a/nomad/app/api/upload.py +++ b/nomad/app/api/upload.py @@ -27,7 +27,7 @@ import os import io from functools import wraps -from nomad import config, utils, files, datamodel +from nomad import config, utils, files, search from nomad.processing import Upload, FAILURE from nomad.processing import ProcessAlreadyRunning from nomad.app import common @@ -42,13 +42,6 @@ ns = api.namespace( 'uploads', description='Uploading data and tracing uploaded data and its processing.') - -class CalcMetadata(fields.Raw): - def format(self, value): - entry_metadata = datamodel.EntryMetadata.m_from_dict(value) - return entry_metadata.a_elastic.create_index_entry().to_dict() - - proc_model = api.model('Processing', { 'tasks': fields.List(fields.String), 'current_task': fields.String, @@ -96,7 +89,9 @@ calc_model = api.inherit('UploadCalculationProcessing', proc_model, { 'mainfile': fields.String, 'upload_id': fields.String, 'parser': fields.String, - 'metadata': CalcMetadata(description='The repository metadata for this entry.') + 'metadata': fields.Raw( + attribute='_entry_metadata', + description='The repository metadata for this entry.') }) upload_with_calcs_model = api.inherit('UploadWithPaginatedCalculations', upload_model, { @@ -381,13 +376,24 @@ class UploadResource(Resource): order_by = ('-%s' if order == -1 else '+%s') % order_by - calcs = upload.all_calcs((page - 1) * per_page, 
page * per_page, order_by=order_by) + # load upload's calcs + calcs = list(upload.all_calcs( + (page - 1) * per_page, page * per_page, order_by=order_by)) + + calc_ids = [calc.calc_id for calc in calcs] + search_results = { + hit['calc_id']: hit + for hit in search.SearchRequest().search_parameter('calc_id', calc_ids).execute_scan()} + + for calc in calcs: + calc._entry_metadata = search_results.get(calc.calc_id) + failed_calcs = upload.failed_calcs result = ProxyUpload(upload, { 'pagination': dict( total=upload.total_calcs, page=page, per_page=per_page, successes=upload.processed_calcs - failed_calcs, failures=failed_calcs), - 'results': [calc for calc in calcs] + 'results': calcs }) return result, 200 diff --git a/nomad/app/optimade/endpoints.py b/nomad/app/optimade/endpoints.py index accc47f0d2b66260b580d76cf14f3bd42c3ff234..8b48fa4da007bf080261fde012d2740a2c79e791 100644 --- a/nomad/app/optimade/endpoints.py +++ b/nomad/app/optimade/endpoints.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import List, Dict, Any from flask_restplus import Resource, abort from flask import request from elasticsearch_dsl import Q -from nomad import search +from nomad import search, files, datamodel from nomad.datamodel import OptimadeEntry from .api import api, url @@ -46,6 +47,31 @@ def base_search_request(): Q('exists', field='dft.optimade.elements')) # TODO use the elastic annotations when done +def to_calc_with_metadata(results: List[Dict[str, Any]]): + ''' Translates search results into :class:`EntryMetadata` objects read from archive. ''' + + upload_files_cache: Dict[str, files.UploadFiles] = {} + + def transform(result): + calc_id, upload_id = result['calc_id'], result['upload_id'] + upload_files = upload_files_cache.get(upload_id) + + if upload_files is None: + upload_files = files.UploadFiles.get(upload_id) + upload_files_cache[upload_id] = upload_files + + archive = upload_files.read_archive(calc_id) # , access='public') + metadata = archive[calc_id]['section_metadata'].to_dict() + return datamodel.EntryMetadata.m_from_dict(metadata) + + result = [transform(result) for result in results] + + for upload_files in upload_files_cache.values(): + upload_files.close() + + return result + + @ns.route('/calculations') class CalculationList(Resource): @api.doc('list_calculations') @@ -65,7 +91,7 @@ class CalculationList(Resource): except Exception: abort(400, message='bad parameter types') # TODO Specific json API error handling - search_request = base_search_request().include('calc_id') + search_request = base_search_request().include('calc_id', 'upload_id') if filter is not None: try: @@ -79,7 +105,7 @@ class CalculationList(Resource): # order_by='optimade.%s' % sort) # TODO map the Optimade property available = result['pagination']['total'] - results = search.to_calc_with_metadata(result['results']) + results = to_calc_with_metadata(result['results']) assert len(results) == len(result['results']), 'Mongodb and elasticsearch are not consistent' return dict( @@ -115,7 +141,7 @@ class Calculation(Resource): per_page=1) available = result['pagination']['total'] - results = search.to_calc_with_metadata(result['results']) + results = to_calc_with_metadata(result['results']) assert len(results) == len(result['results']), 'Mongodb and elasticsearch are not consistent' if available == 0: diff --git a/nomad/cli/admin/admin.py b/nomad/cli/admin/admin.py index 
88558c1104903720fd0d1c7b2a611e5267b9e8d8..54861ec2bb6f92f8dc4592434a10b0cd2641d983 100644 --- a/nomad/cli/admin/admin.py +++ b/nomad/cli/admin/admin.py @@ -158,7 +158,8 @@ def lift_embargo(dry, parallel): uploads_to_repack.append(upload) upload.save() - search.index_all(upload.entries_metadata()) + with upload.entries_metadata() as entries: + search.index_all(entries) if not dry: __run_processing(uploads_to_repack, parallel, lambda upload: upload.re_pack(), 're-packing') diff --git a/nomad/cli/admin/uploads.py b/nomad/cli/admin/uploads.py index f2fa2abfbe1f4058eb77b1b0c0b7795a4a77e4fb..56434bff8c223891d5666c9545fc6aad0cadce20 100644 --- a/nomad/cli/admin/uploads.py +++ b/nomad/cli/admin/uploads.py @@ -146,16 +146,17 @@ def chown(ctx, username, uploads): upload.user_id = user.user_id calcs = upload.entries_metadata() - def create_update(calc): + def create_update(calc_id): return UpdateOne( - {'_id': calc.calc_id}, + {'_id': calc_id}, {'$set': {'metadata.uploader': user.user_id}}) - proc.Calc._get_collection().bulk_write([create_update(calc) for calc in calcs]) + proc.Calc._get_collection().bulk_write( + [create_update(calc_id) for calc_id in upload.entry_ids()]) upload.save() - calcs = upload.entries_metadata() - search.index_all(calcs, do_refresh=False) + with upload.entries_metadata() as calcs: + search.index_all(calcs, do_refresh=False) search.refresh() @@ -192,9 +193,9 @@ def index(ctx, uploads): i, failed = 0, 0 for upload in uploads: - calcs = upload.entries_metadata() - failed += search.index_all(calcs) - i += 1 + with upload.entries_metadata() as calcs: + failed += search.index_all(calcs) + i += 1 print(' indexed %d of %d uploads, failed to index %d entries' % (i, uploads_count, failed)) diff --git a/nomad/cli/client/mirror.py b/nomad/cli/client/mirror.py index 8bc6ded4a4f99fad29498ac278c30bd93ac1422d..89787f0c161d27aaf7ee220b01b395da53332c02 100644 --- a/nomad/cli/client/mirror.py +++ b/nomad/cli/client/mirror.py @@ -318,7 +318,8 @@ def mirror( proc.Calc._get_collection().insert(upload_data.calcs) # index es - search.index_all(upload.entries_metadata()) + with upload.entries_metadata() as entries: + search.index_all(entries) print( 'Mirrored %s with %d calcs at %s' % diff --git a/nomad/cli/parse.py b/nomad/cli/parse.py index 55e39848efca1eed8605c2882dec6ae653b68bdc..5f311861e738043d1d7abf1081cd62cc67b904f8 100644 --- a/nomad/cli/parse.py +++ b/nomad/cli/parse.py @@ -4,7 +4,7 @@ import json import click import sys -from nomad import config, utils +from nomad import utils from nomad.parsing import Backend, parser_dict, match_parser, MatchingParser from nomad.normalizing import normalizers from nomad.datamodel import EntryMetadata @@ -48,14 +48,6 @@ def parse( if not parser_backend.status[0] == 'ParseSuccess': logger.error('parsing was not successful', status=parser_backend.status) - parser_backend.openNonOverlappingSection('section_entry_info') - parser_backend.addValue('upload_id', config.services.unavailable_value) - parser_backend.addValue('calc_id', config.services.unavailable_value) - parser_backend.addValue('calc_hash', "no hash") - parser_backend.addValue('mainfile', mainfile) - parser_backend.addValue('parser_name', parser_name) - parser_backend.closeNonOverlappingSection('section_entry_info') - logger.info('ran parser') return parser_backend diff --git a/nomad/client.py b/nomad/client.py index 7567f8d12fb5be5e57c04b8690cbdc47f7c1da88..2265da0396798f1452f7ea55a6859abab12da24f 100644 --- a/nomad/client.py +++ b/nomad/client.py @@ -40,8 +40,12 @@ from collections import 
Sequence import requests from urllib.parse import urlparse -from nomad import config, metainfo, parsing +from nomad import config from nomad.cli.client.client import KeycloakAuthenticator +from nomad.datamodel import EntryArchive + +# TODO this import is necessary to load all metainfo definitions that the parsers are using +from nomad import parsing # pylint: disable=unused-import class ArchiveQuery(Sequence): @@ -123,13 +127,7 @@ class ArchiveQuery(Sequence): results = data.get('results', []) for result in results: - parser_name = result['parser_name'] - parser = parsing.parser_dict[parser_name] - metainfo_env = parser.metainfo_env - - root_section_key = next(iter(result['archive'])) - section_def = metainfo_env.resolve_definition(root_section_key, metainfo.Section) - archive = section_def.section_cls.m_from_dict(result['archive'][root_section_key]) + archive = EntryArchive.m_from_dict(result['archive']) self._results.append(archive) diff --git a/nomad/datamodel/__init__.py b/nomad/datamodel/__init__.py index c3fea7df964a01f7104665b0290ac3c1d6328048..e0a1db9997d9d11ee8dcd4106037a3fdff7620ba 100644 --- a/nomad/datamodel/__init__.py +++ b/nomad/datamodel/__init__.py @@ -32,9 +32,9 @@ The class :class:`Dataset` is used to represent datasets and their attributes. .. autoclass:: nomad.datamodel.Dataset :members: -The class :class:`UserMetadata` is used to represent user determined entry metadata. +The class :class:`MongoMetadata` is used to tag metadata stored in mongodb. -.. autoclass:: nomad.datamodel.UserMetadata +.. autoclass:: nomad.datamodel.MongoMetadata :members: The class :class:`EntryMetadata` is used to represent all metadata about an entry. @@ -56,7 +56,7 @@ In addition there are domain specific metadata classes: from .dft import DFTMetadata from .ems import EMSMetadata -from .datamodel import Dataset, User, EditableUserMetadata, UserMetadata, EntryMetadata +from .datamodel import Dataset, User, EditableUserMetadata, MongoMetadata, EntryMetadata, EntryArchive from .optimade import OptimadeEntry, Species domains = { diff --git a/nomad/datamodel/datamodel.py b/nomad/datamodel/datamodel.py index 8288ad688bcbe3e9274fd2ecaf2f4b59d6fa0ffa..69d2695423685f2872a58b919acf3ebf8be90c77 100644 --- a/nomad/datamodel/datamodel.py +++ b/nomad/datamodel/datamodel.py @@ -26,6 +26,8 @@ from nomad.metainfo.mongoengine_extension import Mongo, MongoDocument from .dft import DFTMetadata from .ems import EMSMetadata +from .metainfo.public import section_run +from .metainfo.general_experimental import section_experiment def _only_atoms(atoms): @@ -189,8 +191,8 @@ class EditableUserMetadata(metainfo.MCategory): ''' NOMAD entry quantities that can be edited by the user after publish. ''' -class UserMetadata(metainfo.MCategory): - ''' NOMAD entry quantities that are given by the user or determined by user actions. ''' +class MongoMetadata(metainfo.MCategory): + ''' NOMAD entry quantities that are stored in mongodb and not necessarily in the archive.
''' pass @@ -249,6 +251,7 @@ class EntryMetadata(metainfo.MSection): calc_hash = metainfo.Quantity( type=str, description='A raw file content based checksum/hash.', + categories=[MongoMetadata], a_search=Search( many_or='append', metric_name='unique_entries', metric='cardinality')) @@ -280,39 +283,48 @@ class EntryMetadata(metainfo.MSection): pid = metainfo.Quantity( type=int, description='The unique, sequentially enumerated, integer persistent identifier', + categories=[MongoMetadata], a_search=Search(many_or='append')) raw_id = metainfo.Quantity( type=str, description='A raw format specific id that was acquired from the files of this entry', + categories=[MongoMetadata], a_search=Search(many_or='append')) domain = metainfo.Quantity( type=metainfo.MEnum('dft', 'ems'), description='The material science domain', + categories=[MongoMetadata], a_search=Search()) published = metainfo.Quantity( type=bool, default=False, description='Indicates if the entry is published', + categories=[MongoMetadata], a_search=Search()) processed = metainfo.Quantity( type=bool, default=False, description='Indicates that the entry is successfully processed.', + categories=[MongoMetadata], a_search=Search()) last_processing = metainfo.Quantity( type=metainfo.Datetime, - description='The datetime of the last attempted processing.') + description='The datetime of the last attempted processing.', + categories=[MongoMetadata], + a_search=Search()) nomad_version = metainfo.Quantity( type=str, description='The NOMAD version used for the last processing attempt.', + categories=[MongoMetadata], a_search=Search(many_or='append')) nomad_commit = metainfo.Quantity( type=str, description='The NOMAD commit used for the last processing attempt.', + categories=[MongoMetadata], a_search=Search(many_or='append')) parser_name = metainfo.Quantity( type=str, @@ -320,17 +332,17 @@ class EntryMetadata(metainfo.MSection): a_search=Search(many_or='append')) comment = metainfo.Quantity( - type=str, categories=[UserMetadata, EditableUserMetadata], + type=str, categories=[MongoMetadata, EditableUserMetadata], description='A user provided comment.', a_search=Search(mapping=Text())) references = metainfo.Quantity( - type=str, shape=['0..*'], categories=[UserMetadata, EditableUserMetadata], + type=str, shape=['0..*'], categories=[MongoMetadata, EditableUserMetadata], description='User provided references (URLs).', a_search=Search()) uploader = metainfo.Quantity( - type=user_reference, categories=[UserMetadata], + type=user_reference, categories=[MongoMetadata], description='The uploader of the entry', a_flask=dict(admin_only=True, verify=User), a_search=[ @@ -343,7 +355,7 @@ class EntryMetadata(metainfo.MSection): ]) coauthors = metainfo.Quantity( - type=user_reference, shape=['0..*'], default=[], categories=[UserMetadata, EditableUserMetadata], + type=user_reference, shape=['0..*'], default=[], categories=[MongoMetadata, EditableUserMetadata], description='A user provided list of co-authors.', a_flask=dict(verify=User)) @@ -357,7 +369,7 @@ class EntryMetadata(metainfo.MSection): many_or='append', search_field='authors.name.keyword', statistic_size=1000)) shared_with = metainfo.Quantity( - type=user_reference, shape=['0..*'], default=[], categories=[UserMetadata, EditableUserMetadata], + type=user_reference, shape=['0..*'], default=[], categories=[MongoMetadata, EditableUserMetadata], description='A user provided list of userts to share the entry with.', a_flask=dict(verify=User)) @@ -370,24 +382,24 @@ class EntryMetadata(metainfo.MSection): 
many_or='append', search_field='owners.name.keyword')) with_embargo = metainfo.Quantity( - type=bool, default=False, categories=[UserMetadata, EditableUserMetadata], + type=bool, default=False, categories=[MongoMetadata, EditableUserMetadata], description='Indicated if this entry is under an embargo', a_search=Search()) upload_time = metainfo.Quantity( - type=metainfo.Datetime, categories=[UserMetadata], + type=metainfo.Datetime, categories=[MongoMetadata], description='The datetime this entry was uploaded to nomad', a_flask=dict(admin_only=True), a_search=Search(order_default=True)) upload_name = metainfo.Quantity( - type=str, categories=[UserMetadata], + type=str, categories=[MongoMetadata], description='The user provided upload name', a_search=Search(many_or='append')) datasets = metainfo.Quantity( type=dataset_reference, shape=['0..*'], default=[], - categories=[UserMetadata, EditableUserMetadata], + categories=[MongoMetadata, EditableUserMetadata], description='A list of user curated datasets this entry belongs to.', a_flask=dict(verify=Dataset), a_search=[ @@ -401,12 +413,12 @@ class EntryMetadata(metainfo.MSection): description='Search for a particular dataset by its id.')]) external_id = metainfo.Quantity( - type=str, categories=[UserMetadata], + type=str, categories=[MongoMetadata], description='A user provided external id.', a_search=Search(many_or='split')) last_edit = metainfo.Quantity( - type=metainfo.Datetime, categories=[UserMetadata], + type=metainfo.Datetime, categories=[MongoMetadata], description='The datetime the user metadata was edited last.', a_search=Search()) @@ -441,7 +453,24 @@ class EntryMetadata(metainfo.MSection): def apply_domain_metadata(self, backend): assert self.domain is not None, 'all entries must have a domain' - domain_section_def = self.m_def.all_sub_sections.get(self.domain).sub_section + domain_sub_section_def = self.m_def.all_sub_sections.get(self.domain) + domain_section_def = domain_sub_section_def.sub_section assert domain_section_def is not None, 'unknown domain %s' % self.domain - domain_section = self.m_create(domain_section_def.section_cls) + + # add domain section if not already there + domain_section = self.m_get_sub_section(domain_sub_section_def, -1) + if domain_section is None: + domain_section = self.m_create(domain_section_def.section_cls) + domain_section.apply_domain_metadata(backend) + + +class EntryArchive(metainfo.MSection): + + section_run = metainfo.SubSection(sub_section=section_run, repeats=True) + section_experiment = metainfo.SubSection(sub_section=section_experiment) + section_metadata = metainfo.SubSection(sub_section=EntryMetadata) + + processing_logs = metainfo.Quantity( + type=Any, shape=['0..*'], + description='The processing logs for this entry as a list of structlog entries.') diff --git a/nomad/datamodel/dft.py b/nomad/datamodel/dft.py index 909ee11f7f7313ce1618503fc27947f0b9576396..1a557bc95af6ed98c9f4a8c52a595fa10aeacc77 100644 --- a/nomad/datamodel/dft.py +++ b/nomad/datamodel/dft.py @@ -24,7 +24,6 @@ from nomad.metainfo.search_extension import Search from .common import get_optional_backend_value from .optimade import OptimadeEntry -from .metainfo.public import section_run xc_treatments = { @@ -235,14 +234,11 @@ class DFTMetadata(MSection): n_total_energies = 0 n_geometries = 0 - for root_section in backend.resource.contents: - if not root_section.m_follows(section_run.m_def): - continue - - quantities.add(root_section.m_def.name) + for section_run in backend.entry_archive.section_run: + 
quantities.add(section_run.m_def.name) n_quantities += 1 - for section, property_def, _ in root_section.m_traverse(): + for section, property_def, _ in section_run.m_traverse(): property_name = property_def.name quantities.add(property_name) n_quantities += 1 @@ -284,8 +280,3 @@ class DFTMetadata(MSection): if aflow_id is not None and aflow_label is not None: self.labels.append(Label(label=aflow_label, type='prototype', source='aflow_prototype_library')) self.labels.append(Label(label=aflow_id, type='prototype_id', source='aflow_prototype_library')) - - # optimade - optimade = backend.get_mi2_section(OptimadeEntry.m_def) - if optimade is not None: - self.optimade = optimade.m_copy() diff --git a/nomad/datamodel/ems.py b/nomad/datamodel/ems.py index 3a85b46b65d63201ce2d65624d69030cecd72e91..f29816c0b9fd4d5541f6618f238fa069e5afc0ec 100644 --- a/nomad/datamodel/ems.py +++ b/nomad/datamodel/ems.py @@ -21,7 +21,6 @@ from nomad.metainfo import Quantity, MSection, Section, Datetime from nomad.metainfo.search_extension import Search from .common import get_optional_backend_value -from .metainfo.general_experimental import section_experiment class EMSMetadata(MSection): @@ -107,12 +106,9 @@ class EMSMetadata(MSection): quantities = set() - for root_section in backend.resource.contents: - if not root_section.m_follows(section_experiment.m_def): - continue - - quantities.add(root_section.m_def.name) - for _, property_def, _ in root_section.m_traverse(): - quantities.add(property_def.name) + root_section = backend.entry_archive.section_experiment + quantities.add(root_section.m_def.name) + for _, property_def, _ in root_section.m_traverse(): + quantities.add(property_def.name) self.quantities = list(quantities) diff --git a/nomad/files.py b/nomad/files.py index c758ca26341b2264ea2217b6270e34b7897ae51d..672fda251e3712876e4c28d9836a35a468809cb9 100644 --- a/nomad/files.py +++ b/nomad/files.py @@ -230,7 +230,7 @@ class UploadFiles(DirectoryObject, metaclass=ABCMeta): with open(self._user_metadata_file.os_path, 'wb') as f: pickle.dump(data, f) - def to_staging_upload_files(self, create: bool = False) -> 'StagingUploadFiles': + def to_staging_upload_files(self, create: bool = False, **kwargs) -> 'StagingUploadFiles': ''' Casts to or creates corresponding staging upload files or returns None. 
''' raise NotImplementedError() @@ -308,7 +308,7 @@ class StagingUploadFiles(UploadFiles): self._size = 0 self._shared = DirectoryObject(config.fs.public, upload_id, create=create) - def to_staging_upload_files(self, create: bool = False) -> 'StagingUploadFiles': + def to_staging_upload_files(self, create: bool = False, **kwargs) -> 'StagingUploadFiles': return self @property @@ -348,7 +348,11 @@ class StagingUploadFiles(UploadFiles): if not self._is_authorized(): raise Restricted - return read_archive(self.archive_file_object(calc_id).os_path) + try: + return read_archive(self.archive_file_object(calc_id).os_path) + + except FileNotFoundError: + raise KeyError(calc_id) def archive_file_object(self, calc_id: str) -> PathObject: return self._archive_dir.join_file('%s.%s' % (calc_id, 'msg')) @@ -678,13 +682,14 @@ class PublicUploadFilesBasedStagingUploadFiles(StagingUploadFiles): def extract(self, include_archive: bool = False) -> None: assert next(self.raw_file_manifest(), None) is None, 'can only extract once' for access in ['public', 'restricted']: - super().add_rawfiles( - self.public_upload_files._zip_file_object('raw', access, 'plain').os_path, - force_archive=True) + raw_file_zip = self.public_upload_files._zip_file_object('raw', access, 'plain') + if raw_file_zip.exists(): + super().add_rawfiles(raw_file_zip.os_path, force_archive=True) if include_archive: with self.public_upload_files._open_msg_file('archive', access, 'msg') as archive: for calc_id, data in archive.items(): + calc_id = calc_id.strip() self.write_archive(calc_id, data.to_dict()) def add_rawfiles(self, *args, **kwargs) -> None: @@ -771,13 +776,15 @@ class PublicUploadFiles(UploadFiles): def to_staging_upload_files(self, create: bool = False, **kwargs) -> 'StagingUploadFiles': exists = False try: - staging_upload_files = PublicUploadFilesBasedStagingUploadFiles(self, is_authorized=lambda: True) + staging_upload_files = PublicUploadFilesBasedStagingUploadFiles( + self, is_authorized=lambda: True) exists = True except KeyError: if not create: return None - staging_upload_files = PublicUploadFilesBasedStagingUploadFiles(self, create=True, is_authorized=lambda: True) + staging_upload_files = PublicUploadFilesBasedStagingUploadFiles( + self, create=True, is_authorized=lambda: True) staging_upload_files.extract(**kwargs) if exists and create: @@ -854,10 +861,10 @@ class PublicUploadFiles(UploadFiles): for access in accesses: try: archive = self._open_msg_file('archive', access, 'msg') - if access == 'restricted' and not self._is_authorized(): - raise Restricted - if calc_id in archive: + if access == 'restricted' and not self._is_authorized(): + raise Restricted + return archive except FileNotFoundError: pass diff --git a/nomad/metainfo/example.py b/nomad/metainfo/example.py index d5ec015378d0561d107bef3ff31d54d4e168cc67..f585a2c4a2aef8ddac7f6a012ff008f419535e2f 100644 --- a/nomad/metainfo/example.py +++ b/nomad/metainfo/example.py @@ -70,6 +70,8 @@ class System(MSection): type=bool, shape=[3], default=[False, False, False], categories=[SystemHash], description='A vector of booleans indicating in which dimensions the unit cell is repeated.') + system_type = Quantity(type=str) + class SCC(MSection): diff --git a/nomad/metainfo/metainfo.py b/nomad/metainfo/metainfo.py index d8554da8132309ee55111ca29ddf205f7e4d093c..5afc41201caa5e8e2f174932deaf4836bac37a63 100644 --- a/nomad/metainfo/metainfo.py +++ b/nomad/metainfo/metainfo.py @@ -520,8 +520,6 @@ class MResource(): self.contents.append(section) def remove(self, section): - 
import traceback - traceback.print_stack() assert section.m_resource == self, 'Can only remove section from the resource that contains it.' section.m_resource = None self.__data.get(section.m_def).remove(section) @@ -1029,7 +1027,9 @@ class MSection(metaclass=MObjectMeta): # TODO find a way to make this a subclas raise TypeError('%s is of class Section' % definition) return self.m_def == definition or definition in self.m_def.all_base_sections - def m_to_dict(self, with_meta: bool = False, include_defaults: bool = False) -> Dict[str, Any]: + def m_to_dict( + self, with_meta: bool = False, include_defaults: bool = False, + categories: List[Union['Category', Type['MCategory']]] = None) -> Dict[str, Any]: ''' Returns the data of this section as a json serializeable dictionary. @@ -1037,7 +1037,19 @@ class MSection(metaclass=MObjectMeta): # TODO find a way to make this a subclas with_meta: Include information about the section definition and the sections position in its parent. include_defaults: Include default values of unset quantities. + categories: A list of category classes or category definitions that is used + to filter the included quantities and sub sections. ''' + category_defs: List[Category] = None + if categories is not None: + category_defs = [] + for category in categories: + if issubclass(category, MCategory): # type: ignore + category_defs.append(category.m_def) # type: ignore + elif isinstance(category, Category): + category_defs.append(category) + else: + raise TypeError('%s is not a category' % category) def items() -> Iterable[Tuple[str, Any]]: # metadata @@ -1050,6 +1062,12 @@ class MSection(metaclass=MObjectMeta): # TODO find a way to make this a subclas # quantities for name, quantity in self.m_def.all_quantities.items(): + if categories is not None: + if not any( + quantity in category.get_all_definitions() + for category in category_defs): + continue + if quantity.virtual: continue @@ -1126,6 +1144,12 @@ class MSection(metaclass=MObjectMeta): # TODO find a way to make this a subclas # sub sections for name, sub_section_def in self.m_def.all_sub_sections.items(): + if categories is not None: + if not any( + sub_section_def in category.get_all_definitions() + for category in category_defs): + continue + if sub_section_def.repeats: if self.m_sub_section_count(sub_section_def) > 0: yield name, [ @@ -1138,16 +1162,13 @@ class MSection(metaclass=MObjectMeta): # TODO find a way to make this a subclas return {key: value for key, value in items()} - @classmethod - def m_from_dict(cls: Type[MSectionBound], dct: Dict[str, Any]) -> MSectionBound: - ''' Creates a section from the given serializable data dictionary. - - This is the 'opposite' of :func:`m_to_dict`. It takes a deserialised dict, e.g - loaded from JSON, and turns it into a proper section, i.e. instance of the given - section class. + def m_update_from_dict(self, dct: Dict[str, Any]) -> None: ''' - - section_def = cls.m_def + Updates this section with the serialized data from the given dict, e.g. data + produced by :func:`m_to_dict`. 
+ ''' + section_def = self.m_def + section = self # remove m_def, m_parent_index, m_parent_sub_section metadata, # they set themselves automatically @@ -1155,8 +1176,6 @@ class MSection(metaclass=MObjectMeta): # TODO find a way to make this a subclas dct.pop('m_parent_index', None) dct.pop('m_parent_sub_section', None) - section = cls() - for name, sub_section_def in section_def.all_sub_sections.items(): if name in dct: sub_section_value = dct.pop(name) @@ -1191,6 +1210,16 @@ class MSection(metaclass=MObjectMeta): # TODO find a way to make this a subclas section.__dict__[name] = quantity_value # type: ignore + @classmethod + def m_from_dict(cls: Type[MSectionBound], dct: Dict[str, Any]) -> MSectionBound: + ''' Creates a section from the given serializable data dictionary. + + This is the 'opposite' of :func:`m_to_dict`. It takes a deserialised dict, e.g + loaded from JSON, and turns it into a proper section, i.e. instance of the given + section class. + ''' + section = cls() + section.m_update_from_dict(dct) return section def m_to_json(self, **kwargs): diff --git a/nomad/normalizing/optimade.py b/nomad/normalizing/optimade.py index 30bd8735b00389407d946d651e764b7a0c48357b..6e5dc4694f6510fa45aac74a640d3237878e091e 100644 --- a/nomad/normalizing/optimade.py +++ b/nomad/normalizing/optimade.py @@ -12,16 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Dict +from typing import Any, Dict, cast import numpy as np import re import ase.data from string import ascii_uppercase import pint.quantity +from nomad.parsing.legacy import Backend from nomad.normalizing.normalizer import SystemBasedNormalizer from nomad.metainfo import units -from nomad.datamodel import OptimadeEntry, Species +from nomad.datamodel import OptimadeEntry, Species, DFTMetadata, EntryMetadata species_re = re.compile(r'^([A-Z][a-z]?)(\d*)$') @@ -35,13 +36,19 @@ class OptimadeNormalizer(SystemBasedNormalizer): def __init__(self, backend): super().__init__(backend, only_representatives=True) - def get_optimade_data(self, index) -> OptimadeEntry: + def add_optimade_data(self, index) -> OptimadeEntry: ''' The 'main' method of this :class:`SystemBasedNormalizer`. Normalizes the section with the given `index`. Normalizes geometry, classifies, system_type, and runs symmetry analysis. ''' - optimade = OptimadeEntry() + + backend = cast(Backend, self._backend) + if backend.entry_archive.section_metadata is None: + backend.entry_archive.m_create(EntryMetadata) + if backend.entry_archive.section_metadata.dft is None: + backend.entry_archive.section_metadata.m_create(DFTMetadata) + optimade = backend.entry_archive.section_metadata.dft.m_create(OptimadeEntry) def get_value(key: str, default: Any = None, numpy: bool = False, unit=None) -> Any: try: @@ -125,8 +132,7 @@ class OptimadeNormalizer(SystemBasedNormalizer): return False try: - optimade = self.get_optimade_data(index) - self._backend.add_mi2_section(optimade) + self.add_optimade_data(index) return True except Exception as e: diff --git a/nomad/parsing/legacy.py b/nomad/parsing/legacy.py index 7e1949ab6a0838e2d7def8b105957aee8846bc14..362d05c8ea48bf931ae147302070447c25c9ef49 100644 --- a/nomad/parsing/legacy.py +++ b/nomad/parsing/legacy.py @@ -168,14 +168,6 @@ class AbstractParserBackend(metaclass=ABCMeta): ''' Used to catch parser warnings. ''' pass - # The following are extensions to the origin NOMAD-coe parser backend. 
And allow - # access to existing data - - # @property - # @abstractmethod - # def data(self) -> Results: - # pass - @abstractmethod def get_sections(self, meta_name: str, g_index: int = -1) -> List[int]: ''' Return all gIndices for existing sections of the given meta_name and parent section index. ''' @@ -189,19 +181,6 @@ class AbstractParserBackend(metaclass=ABCMeta): ''' pass - # def add_mi2_section(self, section: MSection): - # ''' Allows to mix a metainfo2 style section into backend. ''' - # pass - - # def get_mi2_section(self, section_def: MI2Section): - # ''' Allows to mix a metainfo2 style section into backend. ''' - # pass - - # def traverse(self, *args, **kwargs) -> Iterable[Tuple[str, str, Any]]: - # ''' Traverses the backend data and yiels tuples with metainfo name, event type, - # and value ''' - # pass - @abstractmethod def __getitem__(self, key): pass @@ -221,6 +200,7 @@ class Backend(AbstractParserBackend): domain: The domain that this backend contains data for. env: The metainfo environment (all available definitions). resource: The metainfo resource that contains all data. + entry_archive: The root section of the archive behind this backend. logger: A logger that can be used to log metainfo and backend operation related warnings and errors. ''' @@ -242,6 +222,8 @@ class Backend(AbstractParserBackend): self.env: LegacyMetainfoEnvironment = cast(LegacyMetainfoEnvironment, metainfo) self.__legacy_env = None self.resource = MResource() + self.entry_archive = datamodel.EntryArchive() + self.resource.add(self.entry_archive) self.__open_sections: Dict[Tuple[Section, int], MSection] = {} self.strict = False # TODO @@ -286,7 +268,7 @@ class Backend(AbstractParserBackend): if section_def is None: raise BadContextUri(context_uri) i = 0 - for content in self.resource.contents: + for content in self.entry_archive.m_contents(): if content.m_follows(section_def): if i == index: current = content @@ -339,7 +321,7 @@ class Backend(AbstractParserBackend): section_def = self.resolve_definition(name, Section) if name in datamodel.root_sections: - section = self.resource.create(section_def.section_cls) + section = self.entry_archive.m_create(section_def.section_cls) else: sub_section_def = self.resolve_definition(name, SubSection) @@ -483,14 +465,6 @@ class Backend(AbstractParserBackend): return value - def add_mi2_section(self, section: MSection): - self.resource.add(section) - - def get_mi2_section(self, section_def: Section): - for content in self.resource.contents: - if content.m_def == section_def: - return content - def startedParsingSession( self, mainFileUri, parserInfo, parserStatus=None, parserErrors=None): self.reset_status() diff --git a/nomad/processing/data.py b/nomad/processing/data.py index e25e9246323f9ac59aebe8d31fcfbc17c5b73ccd..1444db3aad4cdc655ffddda4b2b6078d43bf7992 100644 --- a/nomad/processing/data.py +++ b/nomad/processing/data.py @@ -24,7 +24,7 @@ calculations, and files ''' -from typing import cast, List, Any, Tuple, Generator, Dict, cast, Iterable +from typing import cast, List, Any, Tuple, Iterator, Dict, cast, Iterable from mongoengine import StringField, DateTimeField, DictField, BooleanField, IntField import logging from structlog import wrap_logger @@ -110,15 +110,7 @@ class Calc(Proc): self._upload_files: ArchiveBasedStagingUploadFiles = None self._calc_proc_logs: List[Any] = None - @classmethod - def from_entry_metadata(cls, entry_metadata): - calc = Calc.create( - calc_id=entry_metadata.calc_id, - upload_id=entry_metadata.upload_id, - 
mainfile=entry_metadata.mainfile, - metadata=entry_metadata.m_to_dict(include_defaults=True)) - - return calc + self._entry_metadata = None @classmethod def get(cls, id): @@ -135,6 +127,61 @@ class Calc(Proc): self._upload.worker_hostname = self.worker_hostname return self._upload + def apply_entry_metadata(self, entry_metadata: datamodel.EntryMetadata): + self.metadata = entry_metadata.m_to_dict( + include_defaults=True, + categories=[datamodel.MongoMetadata]) # TODO use embedded doc? + + def create_metadata(self) -> datamodel.EntryMetadata: + ''' + Returns a :class:`nomad.datamodel.EntryMetadata` with values from this + processing object, not necessarily the user metadata nor the metadata from + the archive. + ''' + entry_metadata = datamodel.EntryMetadata() + entry_metadata.domain = parser_dict[self.parser].domain + entry_metadata.upload_id = self.upload_id + entry_metadata.calc_id = self.calc_id + entry_metadata.mainfile = self.mainfile + entry_metadata.nomad_version = config.version + entry_metadata.nomad_commit = config.commit + entry_metadata.uploader = self.upload.user_id + entry_metadata.upload_time = self.upload.upload_time + entry_metadata.upload_name = self.upload.name + + return entry_metadata + + def entry_metadata(self, upload_files: UploadFiles) -> datamodel.EntryMetadata: + ''' + Returns a complete set of :class:`nomad.datamodel.EntryMetadata` including + the user metadata and metadata from the archive. + + Arguments: + upload_files: + The :class:`nomad.files.UploadFiles` instance to read the archive from. + cache: + A boolean that indicates if the archive file should be left unclosed, + e.g. if this method is called for many entries of the same upload. + ''' + archive = upload_files.read_archive(self.calc_id) + entry_metadata = datamodel.EntryMetadata.m_from_dict( + archive[self.calc_id][datamodel.EntryArchive.section_metadata.name].to_dict()) + + entry_metadata.m_update_from_dict(self.metadata) + + return entry_metadata + + def user_metadata(self) -> datamodel.EntryMetadata: + ''' + Returns a :class:`nomad.datamodel.EntryMetadata` with values from this + processing object and the user metadata, not necessarily the metadata from + the archive. + ''' + entry_metadata = self.create_metadata() + entry_metadata.m_update_from_dict(self.metadata) + + return entry_metadata + @property def upload_files(self) -> ArchiveBasedStagingUploadFiles: if not self._upload_files: @@ -176,10 +223,19 @@ class Calc(Proc): records.
''' parser = match_parser(self.upload_files.raw_file_object(self.mainfile).os_path, strict=False) + logger = self.get_logger() if parser is None and not config.reprocess_unmatched: self.errors = ['no parser matches during re-process, will not re-process this calc'] + try: + upload_files = PublicUploadFiles(self.upload_id, is_authorized=lambda: True) + with upload_files.read_archive(self.calc_id) as archive: + self.upload_files.write_archive(self.calc_id, archive[self.calc_id].to_dict()) + + except Exception as e: + logger.error('could not copy archive for non matching, non reprocessed entry', exc_info=e) + # mock the steps of actual processing self._continue_with('parsing') self._continue_with('normalizing') @@ -187,7 +243,6 @@ class Calc(Proc): self._complete() return - logger = self.get_logger() if parser is None: self.get_logger().error('no parser matches during re-process, use the old parser') self.errors = ['no matching parser found during re-processing'] @@ -198,16 +253,12 @@ class Calc(Proc): parser=parser.name) try: - entry_metadata = datamodel.EntryMetadata.m_from_dict(self.metadata) - entry_metadata.upload_id = self.upload_id - entry_metadata.calc_id = self.calc_id - entry_metadata.calc_hash = self.upload_files.calc_hash(self.mainfile) - entry_metadata.mainfile = self.mainfile - entry_metadata.nomad_version = config.version - entry_metadata.nomad_commit = config.commit - entry_metadata.last_processing = datetime.utcnow() - entry_metadata.files = self.upload_files.calc_files(self.mainfile) - self.metadata = entry_metadata.m_to_dict(include_defaults=True) + self._entry_metadata = self.user_metadata() + self._entry_metadata.calc_hash = self.upload_files.calc_hash(self.mainfile) + self._entry_metadata.nomad_version = config.version + self._entry_metadata.nomad_commit = config.commit + self._entry_metadata.last_processing = datetime.utcnow() + self._entry_metadata.files = self.upload_files.calc_files(self.mainfile) self.parsing() self.normalizing() @@ -233,23 +284,12 @@ class Calc(Proc): try: # save preliminary minimum calc metadata in case processing fails # successful processing will replace it with the actual metadata - calc_metadata = datamodel.EntryMetadata() - calc_metadata.domain = parser_dict[self.parser].domain - calc_metadata.upload_id = self.upload_id - calc_metadata.calc_id = self.calc_id - calc_metadata.calc_hash = self.upload_files.calc_hash(self.mainfile) - calc_metadata.mainfile = self.mainfile - calc_metadata.calc_hash = self.upload_files.calc_hash(self.mainfile) - calc_metadata.nomad_version = config.version - calc_metadata.nomad_commit = config.commit - calc_metadata.last_processing = datetime.utcnow() - calc_metadata.files = self.upload_files.calc_files(self.mainfile) - calc_metadata.uploader = self.upload.user_id - calc_metadata.upload_time = self.upload.upload_time - calc_metadata.upload_name = self.upload.name - self.metadata = calc_metadata.m_to_dict(include_defaults=True) # TODO use embedded doc? - - if len(calc_metadata.files) >= config.auxfile_cutoff: + self._entry_metadata = self.create_metadata() + self._entry_metadata.calc_hash = self.upload_files.calc_hash(self.mainfile) + self._entry_metadata.last_processing = datetime.utcnow() + self._entry_metadata.files = self.upload_files.calc_files(self.mainfile) + + if len(self._entry_metadata.files) >= config.auxfile_cutoff: self.warning( 'This calc has many aux files in its directory. 
' 'Have you placed many calculations in the same directory?') @@ -269,22 +309,28 @@ class Calc(Proc): # in case of failure, index a minimum set of metadata and mark # processing failure try: - entry_metadata = datamodel.EntryMetadata.m_from_dict(self.metadata) if self.parser is not None: parser = parser_dict[self.parser] if hasattr(parser, 'code_name'): - entry_metadata.code_name = parser.code_name + self._entry_metadata.code_name = parser.code_name - entry_metadata.processed = False - self.metadata = entry_metadata.m_to_dict(include_defaults=True) - entry_metadata.a_elastic.index() + self._entry_metadata.processed = False + self.apply_entry_metadata(self._entry_metadata) + self._entry_metadata.a_elastic.index() except Exception as e: - self.get_logger().error('could not index after processing failure', exc_info=e) + self.get_logger().error( + 'could not index after processing failure', exc_info=e) try: - self.upload_files.write_archive(self.calc_id, {'processing_logs': self._calc_proc_logs}) + archive = datamodel.EntryArchive() + archive.m_add_sub_section( + datamodel.EntryArchive.section_metadata, self._entry_metadata) + archive.processing_logs = self._calc_proc_logs + + self.upload_files.write_archive(self.calc_id, archive.m_to_dict()) except Exception as e: - self.get_logger().error('could not write archive (logs) after processing failure', exc_info=e) + self.get_logger().error( + 'could not write archive after processing failure', exc_info=e) super().fail(*errors, log_level=log_level, **kwargs) @@ -302,7 +348,7 @@ class Calc(Proc): context = dict(parser=self.parser, step=self.parser) logger = self.get_logger(**context) parser = parser_dict[self.parser] - self.metadata['parser_name'] = self.parser + self._entry_metadata.parser_name = self.parser with utils.timer(logger, 'parser executed', input_size=self.mainfile_file.size): try: @@ -315,26 +361,8 @@ class Calc(Proc): self.fail('parser raised system exit', error='system exit', **context) return - # add the non code specific calc metadata to the backend - # all other quantities have been determined by parsers/normalizers - self._parser_backend.openNonOverlappingSection('section_entry_info') - self._parser_backend.addValue('upload_id', self.upload_id) - self._parser_backend.addValue('calc_id', self.calc_id) - self._parser_backend.addValue('calc_hash', self.metadata['calc_hash']) - self._parser_backend.addValue('mainfile', self.mainfile) - self._parser_backend.addValue('parser_name', self.parser) - filepaths = self.metadata['files'] - self._parser_backend.addValue('number_of_files', len(filepaths)) - self._parser_backend.addValue('filepaths', filepaths) - uploader = self.upload.uploader - self._parser_backend.addValue( - 'entry_uploader_name', '%s, %s' % (uploader.first_name, uploader.last_name)) - self._parser_backend.addValue( - 'entry_uploader_id', str(uploader.user_id)) - self._parser_backend.addValue('entry_upload_time', int(self.upload.upload_time.timestamp())) - self._parser_backend.closeNonOverlappingSection('section_entry_info') - - self.add_processor_info(self.parser) + self._parser_backend.entry_archive.m_add_sub_section( + datamodel.EntryArchive.section_metadata, self._entry_metadata) if self._parser_backend.status[0] != 'ParseSuccess': error = self._parser_backend.status[1] @@ -344,27 +372,25 @@ class Calc(Proc): def use_parser_backend(self, processor_name): self._parser_backend.reset_status() yield self._parser_backend - self.add_processor_info(processor_name) - - def add_processor_info(self, processor_name: str) -> None: - 
self._parser_backend.openContext('/section_entry_info/0') - self._parser_backend.openNonOverlappingSection('section_archive_processing_info') - self._parser_backend.addValue('archive_processor_name', processor_name) if self._parser_backend.status[0] == 'ParseSuccess': warnings = getattr(self._parser_backend, '_warnings', []) + if len(warnings) > 0: - self._parser_backend.addValue('archive_processor_status', 'WithWarnings') - self._parser_backend.addValue('number_of_archive_processor_warnings', len(warnings)) - self._parser_backend.addArrayValues('archive_processor_warnings', [str(warning) for warning in warnings]) + self.get_logger().warn( + 'processor completed successfully with warnings', + processor=processor_name, warnings=[str(warning) for warning in warnings]) + else: - self._parser_backend.addValue('archive_processor_status', 'Success') + self.get_logger().info( + 'processor completed successfully', + processor=processor_name) + else: errors = self._parser_backend.status[1] - self._parser_backend.addValue('archive_processor_error', str(errors)) - - self._parser_backend.closeNonOverlappingSection('section_archive_processing_info') - self._parser_backend.closeContext('/section_entry_info/0') + self.get_logger().error( + 'processor completed with failure', + processor=processor_name, errors=str(errors)) @task def normalizing(self): @@ -401,29 +427,26 @@ class Calc(Proc): ''' The *task* that encapsulates all archival related actions. ''' logger = self.get_logger() - entry_metadata = datamodel.EntryMetadata.m_from_dict(self.metadata) - entry_metadata.apply_domain_metadata(self._parser_backend) - entry_metadata.processed = True + self._entry_metadata.apply_domain_metadata(self._parser_backend) + self._entry_metadata.processed = True # persist the calc metadata with utils.timer(logger, 'saved calc metadata', step='metadata'): - self.metadata = entry_metadata.m_to_dict(include_defaults=True) + self.apply_entry_metadata(self._entry_metadata) # index in search with utils.timer(logger, 'indexed', step='index'): - entry_metadata.a_elastic.index() + self._entry_metadata.a_elastic.index() # persist the archive with utils.timer( logger, 'archived', step='archive', input_size=self.mainfile_file.size) as log_data: - archive_data = self._parser_backend.resource.m_to_dict( - lambda section: section.m_def.name in datamodel.root_sections) - - archive_data['processing_logs'] = self._calc_proc_logs + self._parser_backend.entry_archive.processing_logs = self._calc_proc_logs self._calc_proc_logs = None + archive_data = self._parser_backend.entry_archive.m_to_dict() archive_size = self.upload_files.write_archive(self.calc_id, archive_data) log_data.update(archive_size=archive_size) @@ -587,44 +610,45 @@ class Upload(Proc): logger.info('started to publish') with utils.lnr(logger, 'publish failed'): - calcs = self.entries_metadata(self.metadata) + with self.entries_metadata(self.metadata) as calcs: - with utils.timer( - logger, 'upload metadata updated', step='metadata', - upload_size=self.upload_files.size): - - def create_update(calc): - calc.published = True - calc.with_embargo = calc.with_embargo if calc.with_embargo is not None else False - return UpdateOne( - {'_id': calc.calc_id}, - {'$set': {'metadata': calc.m_to_dict(include_defaults=True)}}) - - Calc._get_collection().bulk_write([create_update(calc) for calc in calcs]) - - if isinstance(self.upload_files, StagingUploadFiles): with utils.timer( - logger, 'staged upload files packed', step='pack',
upload_size=self.upload_files.size): - self.upload_files.pack(calcs) - with utils.timer( - logger, 'index updated', step='index', - upload_size=self.upload_files.size): - search.publish(calcs) + def create_update(calc): + calc.published = True + calc.with_embargo = calc.with_embargo if calc.with_embargo is not None else False + return UpdateOne( + {'_id': calc.calc_id}, + {'$set': {'metadata': calc.m_to_dict( + include_defaults=True, categories=[datamodel.MongoMetadata])}}) + + Calc._get_collection().bulk_write([create_update(calc) for calc in calcs]) + + if isinstance(self.upload_files, StagingUploadFiles): + with utils.timer( + logger, 'staged upload files packed', step='pack', + upload_size=self.upload_files.size): + self.upload_files.pack(calcs) - if isinstance(self.upload_files, StagingUploadFiles): with utils.timer( - logger, 'staged upload deleted', step='delete staged', + logger, 'index updated', step='index', upload_size=self.upload_files.size): - self.upload_files.delete() - self.published = True - self.publish_time = datetime.utcnow() + search.publish(calcs) + + if isinstance(self.upload_files, StagingUploadFiles): + with utils.timer( + logger, 'staged upload deleted', step='delete staged', + upload_size=self.upload_files.size): + self.upload_files.delete() + self.published = True + self.publish_time = datetime.utcnow() + self.last_update = datetime.utcnow() + self.save() + else: self.last_update = datetime.utcnow() self.save() - else: - self.last_update = datetime.utcnow() - self.save() @process def re_process_upload(self): @@ -696,7 +720,7 @@ class Upload(Proc): self._continue_with('parse_all') self._continue_with('cleanup') - self.upload_files.re_pack(self.entries_metadata()) + self.upload_files.re_pack(self.user_metadata()) self.joined = True self._complete() @@ -785,7 +809,7 @@ class Upload(Proc): self.staging_upload_files.raw_file_object(path).os_path, self.staging_upload_files.raw_file_object(stripped_path).os_path)) - def match_mainfiles(self) -> Generator[Tuple[str, object], None, None]: + def match_mainfiles(self) -> Iterator[Tuple[str, object]]: ''' Generator function that matches all files in the upload to all parsers to determine the upload's mainfiles. @@ -908,7 +932,7 @@ class Upload(Proc): logger, 'reprocessed staged upload packed', step='delete staged', upload_size=self.upload_files.size): - staging_upload_files.pack(self.entries_metadata(), skip_raw=True) + staging_upload_files.pack(self.user_metadata(), skip_raw=True) with utils.timer( logger, 'reprocessed staged upload deleted', step='delete staged', @@ -982,16 +1006,19 @@ class Upload(Proc): ''' All successfully processed calculations. ''' return Calc.objects(upload_id=self.upload_id, tasks_status=SUCCESS) - def entries_metadata(self, user_metadata: dict = None) -> Iterable[datamodel.EntryMetadata]: + @contextmanager + def entries_metadata( + self, user_metadata: dict = None) -> Iterator[Iterable[datamodel.EntryMetadata]]: ''' This is the :py:mod:`nomad.datamodel` transformation method to transform - processing uploads into datamodel uploads. It will also implicitely transform - all calculations of this upload. + processing upload's entries into list of :class:`nomad.datamodel.EntryMetadata` objects. Arguments: user_metadata: A dict of user metadata that is applied to the resulting datamodel data and the respective calculations. 
''' + upload_files = self.upload_files + # prepare user metadata per upload and per calc if user_metadata is not None: entries_metadata_dict: Dict[str, Any] = dict() @@ -1008,7 +1035,7 @@ class Upload(Proc): user_upload_name = upload_metadata.get('upload_name', None) def get_metadata(calc: Calc): - entry_metadata = datamodel.EntryMetadata.m_from_dict(calc.metadata) + entry_metadata = calc.entry_metadata(upload_files) entry_user_metadata = dict(upload_metadata) entry_user_metadata.pop('embargo_length', None) # this is for uploads only entry_user_metadata.update(entries_metadata_dict.get(calc.mainfile, {})) @@ -1023,13 +1050,25 @@ class Upload(Proc): user_upload_time = None def get_metadata(calc: Calc): - entry_metadata = datamodel.EntryMetadata.m_from_dict(calc.metadata) + entry_metadata = calc.entry_metadata(upload_files) entry_metadata.upload_time = self.upload_time entry_metadata.upload_name = self.name return entry_metadata - return [get_metadata(calc) for calc in Calc.objects(upload_id=self.upload_id)] + try: + yield [ + get_metadata(calc) + for calc in Calc.objects(upload_id=self.upload_id)] + + finally: + upload_files.close() + + def entry_ids(self) -> Iterable[str]: + return [calc.calc_id for calc in Calc.objects(upload_id=self.upload_id)] + + def user_metadata(self) -> Iterable[datamodel.EntryMetadata]: + return [calc.user_metadata() for calc in Calc.objects(upload_id=self.upload_id)] def compress_and_set_metadata(self, metadata: Dict[str, Any]) -> None: ''' diff --git a/nomad/search.py b/nomad/search.py index 603d8dde3c50ae9b421c31a87f4176f29ca13e16..f9ac993acb6f6f06e706efaebb07eaa7e7baa740 100644 --- a/nomad/search.py +++ b/nomad/search.py @@ -23,7 +23,7 @@ from elasticsearch.exceptions import NotFoundError from datetime import datetime import json -from nomad import config, datamodel, infrastructure, datamodel, utils, processing as proc +from nomad import config, datamodel, infrastructure, datamodel, utils from nomad.metainfo.search_extension import search_quantities, metrics, order_default_quantities, default_statistics @@ -638,14 +638,6 @@ class SearchRequest: return json.dumps(self._search.to_dict(), indent=2) -def to_calc_with_metadata(results: List[Dict[str, Any]]): - ''' Translates search results into :class:`EntryMetadata` objects read from mongo. 
''' - ids = [result['calc_id'] for result in results] - return [ - datamodel.EntryMetadata.m_from_dict(calc.metadata) - for calc in proc.Calc.objects(calc_id__in=ids)] - - def flat(obj, prefix=None): ''' Helper that translates nested result objects into flattened dicts with diff --git a/tests/app/test_api.py b/tests/app/test_api.py index bf5a176f41fa1756433f54cec8243a24830b9668..5790b17ab6344d8df7e4fb6543f14f39d2d3249f 100644 --- a/tests/app/test_api.py +++ b/tests/app/test_api.py @@ -33,7 +33,7 @@ from nomad.files import UploadFiles, PublicUploadFiles from nomad.processing import Upload, Calc, SUCCESS from nomad.datamodel import EntryMetadata, User, Dataset -from tests.conftest import create_auth_headers, clear_elastic, create_test_structure +from tests.conftest import create_auth_headers, clear_elastic, clear_raw_files from tests.test_files import example_file, example_file_mainfile, example_file_contents from tests.test_files import create_staging_upload, create_public_upload, assert_upload_files from tests.test_search import assert_search_upload @@ -260,19 +260,19 @@ class TestUploads: assert upload_proc is not None assert upload_proc.published is True assert upload_proc.embargo_length == min(36, metadata.get('embargo_length', 36)) - entries = upload_proc.entries_metadata() - - for entry in entries: - for key, transform in { - 'comment': lambda e: e.comment, - 'with_embargo': lambda e: e.with_embargo, - 'references': lambda e: e.references, - 'coauthors': lambda e: [u.user_id for u in e.coauthors], - '_uploader': lambda e: e.uploader.user_id, - '_pid': lambda e: e.pid, - 'external_id': lambda e: e.external_id}.items(): - if key in metadata: - assert transform(entry) == metadata[key], key + + with upload_proc.entries_metadata() as entries: + for entry in entries: + for key, transform in { + 'comment': lambda e: e.comment, + 'with_embargo': lambda e: e.with_embargo, + 'references': lambda e: e.references, + 'coauthors': lambda e: [u.user_id for u in e.coauthors], + '_uploader': lambda e: e.uploader.user_id, + '_pid': lambda e: e.pid, + 'external_id': lambda e: e.external_id}.items(): + if key in metadata: + assert transform(entry) == metadata[key], key assert_upload_files(upload_id, entries, files.PublicUploadFiles, published=True) assert_search_upload(entries, additional_keys=additional_keys, published=True) @@ -614,7 +614,10 @@ class TestArchive(UploadFilesBasedTests): def test_get(self, api, upload, auth_headers): rv = api.get('/archive/%s/0' % upload, headers=auth_headers) assert rv.status_code == 200 - assert json.loads(rv.data) is not None + data = json.loads(rv.data) + assert data is not None + assert 'section_metadata' in data + assert 'section_run' in data @UploadFilesBasedTests.ignore_authorization def test_get_signed(self, api, upload, _, test_user_signature_token): @@ -707,7 +710,7 @@ class TestArchive(UploadFilesBasedTests): class TestRepo(): @pytest.fixture(scope='class') def example_elastic_calcs( - self, elastic_infra, normalized: parsing.Backend, + self, elastic_infra, raw_files_infra, normalized: parsing.Backend, test_user: User, other_test_user: User): clear_elastic(elastic_infra) @@ -748,6 +751,7 @@ class TestRepo(): yield example_dataset.a_mongo.delete() + clear_raw_files() def assert_search(self, rv: Any, number_of_calcs: int) -> dict: if rv.status_code != 200: @@ -1183,21 +1187,30 @@ class TestEditRepo(): Dataset.m_def.a_mongo.objects(name='new_ds').delete() @pytest.fixture(autouse=True) - def example_data(self, class_api, test_user, other_test_user): - def 
create_entry(id, user, **kwargs): - metadata = dict(uploader=user.user_id, **kwargs) - create_test_structure(id, 2, 1, [], 0, metadata=metadata) + def example_data(self, class_api, test_user, other_test_user, raw_files): + from tests.app.utils import Upload + + uploads = {} + for i in range(1, 4): + upload_id = 'upload_%d' % i + upload = Upload() + upload.upload_id = upload_id + uploads[upload_id] = upload entries = [ - dict(calc_id='1', upload_id='upload_1', user=test_user, published=True, with_embargo=False), - dict(calc_id='2', upload_id='upload_2', user=test_user, published=True, with_embargo=True), - dict(calc_id='3', upload_id='upload_2', user=test_user, published=False, with_embargo=False), - dict(calc_id='4', upload_id='upload_3', user=other_test_user, published=True, with_embargo=False)] + dict(upload_id='upload_1', user=test_user, published=True, with_embargo=False), + dict(upload_id='upload_2', user=test_user, published=True, with_embargo=True), + dict(upload_id='upload_2', user=test_user, published=False, with_embargo=False), + dict(upload_id='upload_3', user=other_test_user, published=True, with_embargo=False)] + + for i, entry in enumerate(entries): + upload = uploads[entry.pop('upload_id')] + user = entry.pop('user') + metadata = dict(uploader=user.user_id, **entry) + upload.create_test_structure(i + 1, 2, 1, [], 0, metadata=metadata) - i = 0 - for entry in entries: - create_entry(i, **entry) - i += 1 + for upload in uploads.values(): + upload.create_upload_files() search.refresh() @@ -1242,7 +1255,7 @@ class TestEditRepo(): def mongo(self, *args, edited: bool = True, **kwargs): for calc_id in args: - calc = Calc.objects(calc_id=str(calc_id)).first() + calc = Calc.objects(calc_id='test_calc_id_%d' % calc_id).first() assert calc is not None metadata = calc.metadata if edited: @@ -1254,7 +1267,7 @@ class TestEditRepo(): def elastic(self, *args, **kwargs): for calc_id in args: - for calc in search.SearchRequest().search_parameters(calc_id=str(calc_id)).execute_scan(): + for calc in search.SearchRequest().search_parameters(calc_id='test_calc_id_%d' % calc_id).execute_scan(): for key, value in kwargs.items(): if key in ['authors', 'owners']: ids = [user['user_id'] for user in calc.get(key)] @@ -1800,13 +1813,16 @@ class TestDataset: assert rv.status_code == 404 @pytest.fixture() - def example_dataset_with_entry(self, mongo, elastic, example_datasets): + def example_dataset_with_entry(self, mongo, elastic, raw_files, example_datasets): entry_metadata = EntryMetadata( domain='dft', calc_id='1', upload_id='1', published=True, with_embargo=False, datasets=['1']) Calc( calc_id='1', upload_id='1', create_time=datetime.datetime.now(), metadata=entry_metadata.m_to_dict()).save() + upload_files = files.StagingUploadFiles(upload_id='1', create=True) + upload_files.write_archive('1', dict(section_metadata=entry_metadata.m_to_dict())) + upload_files.close() entry_metadata.a_elastic.index(refresh=True) def test_delete_dataset(self, api, test_user_auth, example_dataset_with_entry): diff --git a/tests/app/test_optimade.py b/tests/app/test_optimade.py index 21e99ca7d9783b7045b8ab47b2adaae4ebcc8e1c..60cc09c7b9107e61aa5f5394ae2c13aec31aba6e 100644 --- a/tests/app/test_optimade.py +++ b/tests/app/test_optimade.py @@ -21,7 +21,7 @@ from nomad import search from nomad.app.optimade import parse_filter, url from tests.app.test_app import BlueprintClient -from tests.conftest import clear_elastic, create_test_structure +from tests.conftest import clear_elastic, clear_raw_files 
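Not part of the diff: a minimal usage sketch of the new tests.app.utils.Upload test helper that the rewritten fixtures above rely on (the helper itself is added later in this diff); the upload id and structure parameters below are placeholders, and the mongo, elastic and raw files test fixtures are assumed to be active.

from tests.app.utils import Upload

upload = Upload()                                # defaults to upload_id 'test_upload_id'
upload.upload_id = 'upload_1'                    # placeholder upload id
upload.create_test_structure(1, 2, 1, [], 0)     # H2O-like entry with calc id test_calc_id_1
upload.create_test_structure(2, 2, 1, ['C'], 1)  # adds a C atom, periodic in one dimension
upload.create_upload_files()                     # packs the archives so files.UploadFiles.get() finds them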
@pytest.fixture(scope='session') @@ -33,14 +33,18 @@ def test_get_entry(published: Upload): calc_id = list(published.calcs)[0].calc_id with published.upload_files.read_archive(calc_id) as archive: data = archive[calc_id] - assert 'OptimadeEntry' in data + assert data['section_metadata']['dft']['optimade'] is not None + search_result = search.SearchRequest().search_parameter('calc_id', calc_id).execute_paginated()['results'][0] assert 'dft.optimade.chemical_formula_hill' in search.flat(search_result) -def test_no_optimade(mongo, elastic, api): - create_test_structure(1, 2, 1, [], 0) - create_test_structure(2, 2, 1, [], 0, optimade=False) +def test_no_optimade(mongo, elastic, raw_files, api): + from tests.app.utils import Upload + upload = Upload() + upload.create_test_structure(1, 2, 1, [], 0) + upload.create_test_structure(2, 2, 1, [], 0, optimade=False) + upload.create_upload_files() search.refresh() rv = api.get('/calculations') @@ -51,18 +55,22 @@ def test_no_optimade(mongo, elastic, api): @pytest.fixture(scope='module') -def example_structures(elastic_infra, mongo_infra): +def example_structures(elastic_infra, mongo_infra, raw_files_infra): clear_elastic(elastic_infra) mongo_infra.drop_database('test_db') - create_test_structure(1, 2, 1, [], 0) - create_test_structure(2, 2, 1, ['C'], 0) - create_test_structure(3, 2, 1, [], 1) - create_test_structure(4, 1, 1, [], 0) + from tests.app.utils import Upload + upload = Upload() + upload.create_test_structure(1, 2, 1, [], 0) + upload.create_test_structure(2, 2, 1, ['C'], 0) + upload.create_test_structure(3, 2, 1, [], 1) + upload.create_test_structure(4, 1, 1, [], 0) + upload.create_upload_files() search.refresh() yield clear_elastic(elastic_infra) + clear_raw_files() @pytest.mark.parametrize('query, results', [ @@ -155,6 +163,7 @@ def test_list_endpoint_request_fields(api, example_structures): assert rv.status_code == 200 data = json.loads(rv.data) ref_elements = [['H', 'O'], ['C', 'H', 'O'], ['H', 'O'], ['H', 'O']] + data['data'] = sorted(data['data'], key=lambda x: x['id']) for i in range(len(data['data'])): rf = list(data['data'][i]['attributes'].keys()) rf.sort() diff --git a/tests/app/utils.py b/tests/app/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f7730c7504e97d7452415f72edfc728f64fa9d67 --- /dev/null +++ b/tests/app/utils.py @@ -0,0 +1,110 @@ +# Copyright 2018 Markus Scheidgen +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an"AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List +import numpy as np + +from nomad import processing, files +from nomad.datamodel import EntryMetadata, MongoMetadata, EntryArchive +from nomad.parsing import Backend + +from tests.test_normalizing import run_normalize + + +class Upload(): + + def __init__(self): + self.entries: List[EntryMetadata] = [] + self.upload_id = 'test_upload_id' + + def create_upload_files(self) -> None: + upload_files = files.StagingUploadFiles(self.upload_id, create=True) + for entry_metadata in self.entries: + archive = entry_metadata.m_parent + if archive is None: + archive = EntryArchive() + archive.m_add_sub_section(EntryArchive.section_metadata, entry_metadata) + + upload_files.write_archive(entry_metadata.calc_id, archive.m_to_dict()) + + upload_files.pack(self.entries, skip_raw=True) + upload_files.delete() + + assert files.UploadFiles.get(self.upload_id) is not None + + def add_entry(self, entry_metadata: EntryMetadata): + self.entries.append(entry_metadata) + + processing.Calc.create( + calc_id=entry_metadata.calc_id, + upload_id=entry_metadata.upload_id, + mainfile=entry_metadata.mainfile, + metadata=entry_metadata.m_to_dict( + include_defaults=True, categories=[MongoMetadata])).save() + + entry_metadata.a_elastic.index() + + def create_test_structure( + self, id: int, h: int, o: int, extra: List[str], periodicity: int, + optimade: bool = True, metadata: dict = None): + ''' Creates a calculation in Elastic and Mongodb with the given properties. + + Requires initialized :func:`elastic_infra` and :func:`mongo_infra`. + + Args: + id: A number to create ``test_calc_id_<number>`` ids. + h: The number of H atoms + o: The number of O atoms + extra: A list of further atoms + periodicity: The number of dimensions to repeat the structure in + optimade: A boolean. Iff true, the entry will have optimade metadata. Default is True. + metadata: Additional (user) metadata.
+ ''' + atom_labels = ['H' for i in range(0, h)] + ['O' for i in range(0, o)] + extra + test_vector = np.array([0, 0, 0]) + + backend = Backend('public', False, True) # type: ignore + backend.openSection('section_run') + backend.addValue('program_name', 'test_code') + backend.openSection('section_system') + + backend.addArrayValues('atom_labels', np.array(atom_labels)) + backend.addArrayValues( + 'atom_positions', np.array([test_vector for i in range(0, len(atom_labels))])) + backend.addArrayValues( + 'lattice_vectors', np.array([test_vector, test_vector, test_vector])) + backend.addArrayValues( + 'configuration_periodic_dimensions', + np.array([True for _ in range(0, periodicity)] + [False for _ in range(periodicity, 3)])) + + backend.closeSection('section_system', 0) + backend.closeSection('section_run', 0) + + backend = run_normalize(backend) + entry_metadata = backend.entry_archive.section_metadata + + entry_metadata.m_update( + domain='dft', upload_id=self.upload_id, calc_id='test_calc_id_%d' % id, + mainfile='test_mainfile', published=True, with_embargo=False) + + entry_metadata.apply_domain_metadata(backend) + + if metadata is not None: + entry_metadata.m_update(**metadata) + + if not optimade: + entry_metadata.dft.optimade = None + + self.add_entry(entry_metadata) diff --git a/tests/conftest.py b/tests/conftest.py index 4b7731e1503f99c052bc7f74dc0f93b89c343c4f..b6faf5b56252640abe45136d223287d42bb42555 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -27,19 +27,16 @@ from bravado.client import SwaggerClient from flask import request, g import elasticsearch.exceptions from typing import List -import numpy as np import json import logging from nomad import config, infrastructure, parsing, processing, app, utils -from nomad.datamodel import User, EntryMetadata -from nomad.parsing import Backend +from nomad.datamodel import User from tests import test_parsing, test_normalizing from tests.processing import test_data as test_processing from tests.test_files import example_file, empty_file from tests.bravado_flask import FlaskTestHttpClient -from tests.test_normalizing import run_normalize test_log_level = logging.CRITICAL example_files = [empty_file, example_file] @@ -82,11 +79,16 @@ def raw_files(raw_files_infra): try: yield finally: - for directory in directories: - try: - shutil.rmtree(directory) - except FileNotFoundError: - pass + clear_raw_files() + + +def clear_raw_files(): + directories = [config.fs.staging, config.fs.public, config.fs.tmp] + for directory in directories: + try: + shutil.rmtree(directory) + except FileNotFoundError: + pass @pytest.fixture(scope='session') @@ -634,59 +636,3 @@ def reset_config(): def reset_infra(mongo, elastic): ''' Fixture that resets infrastructure after deleting db or search index. ''' yield None - - -def create_test_structure( - id: int, h: int, o: int, extra: List[str], periodicity: int, - optimade: bool = True, metadata: dict = None): - ''' Creates a calculation in Elastic and Mongodb with the given properties. - - Does require initialized :func:`elastic_infra` and :func:`mongo_infra`. - - Args: - meta_info: A legace metainfo env. - id: A number to create ``test_calc_id_<number>`` ids. - h: The amount of H atoms - o: The amount of O atoms - extra: A list of further atoms - periodicity: The number of dimensions to repeat the structure in - optimade: A boolean. Iff true the entry will have optimade metadata. Default is True. - metadata: Additional (user) metadata. 
- ''' - - atom_labels = ['H' for i in range(0, h)] + ['O' for i in range(0, o)] + extra - test_vector = np.array([0, 0, 0]) - - backend = Backend('public', False, True) # type: ignore - backend.openSection('section_run') - backend.addValue('program_name', 'test_code') - backend.openSection('section_system') - - backend.addArrayValues('atom_labels', np.array(atom_labels)) - backend.addArrayValues( - 'atom_positions', np.array([test_vector for i in range(0, len(atom_labels))])) - backend.addArrayValues( - 'lattice_vectors', np.array([test_vector, test_vector, test_vector])) - backend.addArrayValues( - 'configuration_periodic_dimensions', - np.array([True for _ in range(0, periodicity)] + [False for _ in range(periodicity, 3)])) - - backend.closeSection('section_system', 0) - backend.closeSection('section_run', 0) - - backend = run_normalize(backend) - calc = EntryMetadata( - domain='dft', upload_id='test_uload_id', calc_id='test_calc_id_%d' % id, - mainfile='test_mainfile', published=True, with_embargo=False) - calc.apply_domain_metadata(backend) - if metadata is not None: - calc.m_update(**metadata) - - if not optimade: - calc.dft.optimade = None - - proc_calc = processing.Calc.from_entry_metadata(calc) - proc_calc.save() - calc.a_elastic.index() - - assert processing.Calc.objects(calc_id__in=[calc.calc_id]).count() == 1 diff --git a/tests/metainfo/test_metainfo.py b/tests/metainfo/test_metainfo.py index ea86b3cb73bba8e6b971ebc4f77293970f3733e5..f5fdcd0b546be733720115f749363f54f835aa0a 100644 --- a/tests/metainfo/test_metainfo.py +++ b/tests/metainfo/test_metainfo.py @@ -420,6 +420,14 @@ class TestM1: self.assert_example_data(new_example_data) + def test_to_dict_category_filter(self, example_data: Run): + system = example_data.systems[0] + system.system_type = 'bulk' + dct = system.m_to_dict(categories=[SystemHash]) + assert 'atom_labels' in dct + assert 'n_atoms' not in dct # derived + assert 'system_type' not in dct # not system hash + def test_to_dict_defaults(self, example_data): dct = example_data.m_to_dict() assert 'nomad_version' not in dct['parsing'] diff --git a/tests/processing/test_data.py b/tests/processing/test_data.py index ef4182a41787647c8c36ea185c8b008f073cdb9f..e25ed455e646e2cf5c18f72cdd40909347da1a82 100644 --- a/tests/processing/test_data.py +++ b/tests/processing/test_data.py @@ -86,7 +86,7 @@ def assert_processing(upload: Upload, published: bool = False): with upload_files.read_archive(calc.calc_id) as archive: calc_archive = archive[calc.calc_id] assert 'section_run' in calc_archive - assert 'section_entry_info' in calc_archive + assert 'section_metadata' in calc_archive assert 'processing_logs' in calc_archive has_test_event = False @@ -101,15 +101,19 @@ def assert_processing(upload: Upload, published: bool = False): with upload_files.raw_file(calc.mainfile) as f: f.read() - for path in calc.metadata['files']: + entry_metadata = calc.entry_metadata(upload_files) + + for path in entry_metadata.files: with upload_files.raw_file(path) as f: f.read() # check some domain metadata - assert calc.metadata['n_atoms'] > 0 - assert len(calc.metadata['atoms']) > 0 + assert entry_metadata.n_atoms > 0 + assert len(entry_metadata.atoms) > 0 + + assert upload.get_calc(calc.calc_id) is not None - assert upload.get_calc(calc.calc_id).metadata is not None + upload_files.close() def test_processing(processed, no_warn, mails, monkeypatch): @@ -139,10 +143,9 @@ def test_publish(non_empty_processed: Upload, no_warn, internal_example_user_met except Exception: pass - entries = 
processed.entries_metadata(internal_example_user_metadata) - - assert_upload_files(processed.upload_id, entries, PublicUploadFiles, published=True) - assert_search_upload(entries, additional_keys, published=True) + with processed.entries_metadata(internal_example_user_metadata) as entries: + assert_upload_files(processed.upload_id, entries, PublicUploadFiles, published=True) + assert_search_upload(entries, additional_keys, published=True) assert_processing(Upload.get(processed.upload_id, include_published=True), published=True) @@ -160,10 +163,9 @@ def test_republish(non_empty_processed: Upload, no_warn, internal_example_user_m processed.publish_upload() processed.block_until_complete(interval=.01) - entries = processed.entries_metadata(internal_example_user_metadata) - - assert_upload_files(processed.upload_id, entries, PublicUploadFiles, published=True) - assert_search_upload(entries, additional_keys, published=True) + with processed.entries_metadata(internal_example_user_metadata) as entries: + assert_upload_files(processed.upload_id, entries, PublicUploadFiles, published=True) + assert_search_upload(entries, additional_keys, published=True) def test_publish_failed( @@ -183,9 +185,8 @@ def test_publish_failed( except Exception: pass - entries = processed.entries_metadata(internal_example_user_metadata) - - assert_search_upload(entries, additional_keys, published=True, processed=False) + with processed.entries_metadata(internal_example_user_metadata) as entries: + assert_search_upload(entries, additional_keys, published=True, processed=False) @pytest.mark.timeout(config.tests.default_timeout) @@ -228,16 +229,20 @@ def test_re_processing(published: Upload, internal_example_user_metadata, monkey old_calc_time = first_calc.metadata['last_processing'] with published.upload_files.read_archive(first_calc.calc_id) as archive: - old_logs = archive[first_calc.calc_id]['processing_logs'] + archive[first_calc.calc_id]['processing_logs'] old_archive_files = list( archive_file for archive_file in os.listdir(published.upload_files.os_path) if 'archive' in archive_file) - for archive_file in old_archive_files: - with open(published.upload_files.join_file(archive_file).os_path, 'wt') as f: - f.write('') + with published.entries_metadata(internal_example_user_metadata) as entries_generator: + entries = list(entries_generator) + + if with_failure != 'not-matched': + for archive_file in old_archive_files: + with open(published.upload_files.join_file(archive_file).os_path, 'wt') as f: + f.write('') if with_failure == 'after': raw_files = 'tests/data/proc/examples_template_unparsable.zip' @@ -249,8 +254,6 @@ def test_re_processing(published: Upload, internal_example_user_metadata, monkey shutil.copyfile( raw_files, published.upload_files.join_file('raw-restricted.plain.zip').os_path) - entries = published.entries_metadata(internal_example_user_metadata) - # reprocess monkeypatch.setattr('nomad.config.version', 're_process_test_version') monkeypatch.setattr('nomad.config.commit', 're_process_test_commit') @@ -277,24 +280,11 @@ def test_re_processing(published: Upload, internal_example_user_metadata, monkey # assert changed archive files if with_failure == 'after': with published.upload_files.read_archive(first_calc.calc_id) as archive: - assert list(archive[first_calc.calc_id].keys()) == ['processing_logs'] - - elif with_failure == 'not-matched': - with published.upload_files.read_archive(first_calc.calc_id) as archive: - assert len(archive[first_calc.calc_id]) == 0 - - else: - with 
published.upload_files.read_archive(first_calc.calc_id) as archive: - assert len(archive[first_calc.calc_id]) > 1 # contains more then logs - - # assert changed archive log files - if with_failure in ['not-matched']: - with published.upload_files.read_archive(first_calc.calc_id) as archive: - assert len(archive[first_calc.calc_id]) == 0 + assert list(archive[first_calc.calc_id].keys()) == ['processing_logs', 'section_metadata'] else: with published.upload_files.read_archive(first_calc.calc_id) as archive: - assert archive[first_calc.calc_id]['processing_logs'] != old_logs + assert len(archive[first_calc.calc_id]) > 2 # contains more than logs and metadata # assert maintained user metadata (mongo+es) assert_upload_files(published.upload_id, entries, PublicUploadFiles, published=True) @@ -303,10 +293,13 @@ assert_processing(Upload.get(published.upload_id, include_published=True), published=True) # assert changed calc metadata (mongo) + entry_metadata = first_calc.entry_metadata(published.upload_files) if with_failure not in ['after', 'not-matched']: - assert first_calc.metadata['atoms'][0] == 'H' + assert entry_metadata.atoms[0] == 'H' + elif with_failure == 'not-matched': + assert entry_metadata.atoms[0] == 'Si' else: - assert first_calc.metadata['atoms'][0] == 'Si' + assert entry_metadata.atoms == [] @pytest.mark.timeout(config.tests.default_timeout) @@ -409,6 +402,6 @@ def test_ems_data(proc_infra, test_user): assert upload.total_calcs == 1 assert len(upload.calcs) == 1 - entries = upload.entries_metadata() - assert_upload_files(upload.upload_id, entries, StagingUploadFiles, published=False) - assert_search_upload(entries, additional_keys, published=False) + with upload.entries_metadata() as entries: + assert_upload_files(upload.upload_id, entries, StagingUploadFiles, published=False) + assert_search_upload(entries, additional_keys, published=False) diff --git a/tests/test_archive.py b/tests/test_archive.py index a380ecc444570044ce42ef987e1b518542691d12..35855761206a280b0ae25cde8c7cd3853fd42134 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -221,3 +221,11 @@ def test_query(): assert query_archive(f, {'calc2': {'secA': {'subsecA1[0]': '*'}}}) == {'calc2': {'secA': {'subsecA1[0]': [{'propA1a': 2.0}]}}} # TODO # test [:][-1][0:1] ...
+ + +if __name__ == '__main__': + import sys + import pprint + with open(sys.argv[1], 'rb') as f: + data = msgpack.unpack(f) + pprint.pprint(data) diff --git a/tests/test_files.py b/tests/test_files.py index 1269b98bba9f5e379569e1934f5c245b2ed5c1a3..d8ab592220a7a005b3bcc81453e97ff0b1dcaeba 100644 --- a/tests/test_files.py +++ b/tests/test_files.py @@ -44,7 +44,11 @@ example_file_contents = [ 'examples_template/4.aux'] example_file_mainfile = 'examples_template/template.json' empty_file = 'tests/data/proc/empty.zip' -example_archive_contents = {"archive": True, "processing_logs": [{"entry": "test"}]} +example_archive_contents = { + "section_run": [], + "section_metadata": {}, + "processing_logs": [{"entry": "test"}] +} example_bucket = 'test_bucket' example_data = dict(test_key='test_value') diff --git a/tests/test_normalizing.py b/tests/test_normalizing.py index cf3d52fae737032120cadc4ea73244d3ac6f3b93..385c7c0ecec19e3b1143eea96bcac1fe7aac661a 100644 --- a/tests/test_normalizing.py +++ b/tests/test_normalizing.py @@ -226,7 +226,8 @@ def assert_normalized(backend: Backend): assert metadata.dft.crystal_system is not None assert metadata.dft.spacegroup is not None - exceptions = parser_exceptions.get(backend.get_value('parser_name'), []) + parser_name = backend.entry_archive.section_metadata.parser_name + exceptions = parser_exceptions.get(parser_name, []) if metadata.formula != config.services.unavailable_value: assert len(metadata.atoms) > 0 diff --git a/tests/test_parsing.py b/tests/test_parsing.py index aa28ebff62c7cd168321e93b8ab7ea38e9bbd607..58a3743e1ae82a63b39e68266fbd284ced2cdcac 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -19,7 +19,7 @@ import pytest import os from shutil import copyfile -from nomad import utils, files +from nomad import utils, files, datamodel from nomad.parsing import parser_dict, match_parser, BrokenParser, BadContextUri, Backend @@ -153,19 +153,17 @@ class TestBackend(object): backend.addValue('program_name', 't0') backend.closeSection('section_run', 0) - g_index = backend.openSection('section_entry_info') - assert g_index == 0 - backend.addValue('parser_name', 'p0') - backend.closeSection('section_entry_info', 0) + g_index = backend.openSection('section_run') + assert g_index == 1 + backend.addValue('program_name', 't1') + backend.closeSection('section_run', 1) - assert backend.get_sections('section_run') == [0] - assert backend.get_sections('section_entry_info') == [0] + assert backend.get_sections('section_run') == [0, 1] output = StringIO() json.dump(backend.resource.m_to_dict(), output) archive = json.loads(output.getvalue()) - assert 'section_run' in archive - assert 'section_entry_info' in archive + assert 'section_run' in archive['EntryArchive'] def test_subsection(self, backend: Backend, no_warn): backend.openSection('section_run') @@ -341,14 +339,12 @@ def parsed_example(request) -> Backend: def add_calculation_info(backend: Backend, **kwargs) -> Backend: - backend.openNonOverlappingSection('section_entry_info') - backend.addValue('upload_id', 'test_upload_id') - backend.addValue('calc_id', 'test_calc_id') - backend.addValue('calc_hash', 'test_calc_hash') - backend.addValue('mainfile', 'test/mainfile.txt') - for key, value in kwargs.items(): - backend.addValue(key, value) - backend.closeNonOverlappingSection('section_entry_info') + entry_metadata = backend.entry_archive.m_create(datamodel.EntryMetadata) + entry_metadata.upload_id = 'test_upload_id' + entry_metadata.calc_id = 'test_calc_id' + entry_metadata.calc_hash = 
'test_calc_hash' + entry_metadata.mainfile = 'test/mainfile.txt' + entry_metadata.m_update(**kwargs) return backend diff --git a/tests/test_search.py b/tests/test_search.py index 6678017ccf41a685a1dd5948181cd8f56bf7698f..16324a259ccad14d447d4fd8028212f0b764430e 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -34,7 +34,8 @@ def test_index_skeleton_calc(elastic): def test_index_normalized_calc(elastic, normalized: parsing.Backend): - entry_metadata = datamodel.EntryMetadata( + entry_metadata = normalized.entry_archive.section_metadata + entry_metadata.m_update( domain='dft', upload_id='test upload id', calc_id='test id') entry_metadata.apply_domain_metadata(normalized) @@ -49,7 +50,8 @@ def test_index_normalized_calc(elastic, normalized: parsing.Backend): def test_index_normalized_calc_with_metadata( elastic, normalized: parsing.Backend, internal_example_user_metadata: dict): - entry_metadata = datamodel.EntryMetadata( + entry_metadata = normalized.entry_archive.section_metadata + entry_metadata.m_update( domain='dft', upload_id='test upload id', calc_id='test id') entry_metadata.apply_domain_metadata(normalized) internal_example_user_metadata.pop('embargo_length') # is for uploads only @@ -67,7 +69,8 @@ def test_index_upload(elastic, processed: processing.Upload): @pytest.fixture() def example_search_data(elastic, normalized: parsing.Backend): - entry_metadata = datamodel.EntryMetadata( + entry_metadata = normalized.entry_archive.section_metadata + entry_metadata.m_update( domain='dft', upload_id='test upload id', calc_id='test id') entry_metadata.apply_domain_metadata(normalized) create_entry(entry_metadata) @@ -78,7 +81,8 @@ def example_search_data(elastic, normalized: parsing.Backend): @pytest.fixture() def example_ems_search_data(elastic, parsed_ems: parsing.Backend): - entry_metadata = datamodel.EntryMetadata( + entry_metadata = parsed_ems.entry_archive.section_metadata + entry_metadata.m_update( domain='ems', upload_id='test upload id', calc_id='test id') entry_metadata.apply_domain_metadata(parsed_ems) create_entry(entry_metadata)
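Not part of the diff: a rough usage sketch of the reworked Upload.entries_metadata context manager that the test changes above build on; the upload id is a placeholder and an already processed upload is assumed.

from nomad import processing

upload = processing.Upload.get('some_upload_id', include_published=True)  # placeholder id
with upload.entries_metadata() as entries:   # upload files are opened for the duration
    for entry in entries:                    # one EntryMetadata instance per calculation
        print(entry.calc_id, entry.formula)
# the underlying upload files are closed when the context exits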