diff --git a/docs/datamodel_metadataflow.png b/docs/datamodel_metadataflow.png deleted file mode 100644 index 5dc4c1f634d93ebd6911022d5bd3e9763781ecee..0000000000000000000000000000000000000000 Binary files a/docs/datamodel_metadataflow.png and /dev/null differ diff --git a/docs/datamodel_transformations.png b/docs/datamodel_transformations.png deleted file mode 100644 index 7898b158421f07d6e6e1b2d714185c968f848e59..0000000000000000000000000000000000000000 Binary files a/docs/datamodel_transformations.png and /dev/null differ diff --git a/docs/introduction.md b/docs/introduction.md index 2d297945d78a39776cee2ff7a77f218f4282dadf..e1b6d17fb8f735d453425dfb5cf77c628a5e903c 100644 --- a/docs/introduction.md +++ b/docs/introduction.md @@ -216,9 +216,6 @@ There are three catergories of metadata: Those sets of metadata along with the actual raw and archive data are often transformed, passed, stored, etc. by the various nomad modules. -.. figure:: datamodel_metadataflow.png - :alt: nomad's metadata flow - ### Implementation The different entities have often multiple implementations for different storage systems. For example, aspects of calculations are stored in files (raw files, calc metadata, archive data), diff --git a/examples/domain.py b/examples/domain.py index 0454da98da5af84e5ebfa74aedee810a32b90e44..5342cb684a236e77fcabc7c91b1ba16340ce9126 100644 --- a/examples/domain.py +++ b/examples/domain.py @@ -1,5 +1,5 @@ from nomad import datamodel -print(datamodel.CalcWithMetadata(domain='DFT', calc_id='test').__class__.__name__) -print(datamodel.CalcWithMetadata(calc_id='test').__class__.__name__) -print(datamodel.CalcWithMetadata(domain='EMS', calc_id='test').__class__.__name__) +print(datamodel.EntryMetadata(domain='DFT', calc_id='test').__class__.__name__) +print(datamodel.EntryMetadata(calc_id='test').__class__.__name__) +print(datamodel.EntryMetadata(domain='EMS', calc_id='test').__class__.__name__) diff --git a/gui/src/components/metaInfoBrowser/MetaInfoBrowser.js b/gui/src/components/metaInfoBrowser/MetaInfoBrowser.js index 73ae7513b9c9181aa3c6da36905c713cf0aec743..f6b61bafed640795ed7d8d5ca74f22c9ac453fc0 100644 --- a/gui/src/components/metaInfoBrowser/MetaInfoBrowser.js +++ b/gui/src/components/metaInfoBrowser/MetaInfoBrowser.js @@ -89,7 +89,7 @@ class MetaInfoBrowser extends Component { update(pkg) { this.props.api.getInfo().then(info => { - const domain = info.domains.find(domain => domain.name === 'dft') // TODO deal with domains + const domain = info.domains.find(domain => domain.name === 'dft') // TODO deal with domains this.props.api.getMetaInfo(pkg || domain.metainfo.all_package).then(metainfos => { const metainfoName = this.props.metainfo || domain.metainfo.root_sections[0] const definition = metainfos.get(metainfoName) @@ -108,7 +108,7 @@ class MetaInfoBrowser extends Component { init() { this.props.api.getInfo().then(info => { - const domain = info.domains.find(domain => domain.name === 'dft') // TODO deal with domains + const domain = info.domains.find(domain => domain.name === 'dft') // TODO deal with domains this.props.api.getMetaInfo(domain.metainfo.all_package).then(metainfos => { const metainfoName = this.props.metainfo || domain.metainfo.root_sections[0] const definition = metainfos.get(metainfoName) diff --git a/nomad/app/__init__.py b/nomad/app/__init__.py index 0a8d0f71fbb1c32f7f40f69f0b2dd4835a7deedb..7b1d3a3ee58c38ae59794825c3eb0a8734401048 100644 --- a/nomad/app/__init__.py +++ b/nomad/app/__init__.py @@ -12,11 +12,11 @@ # See the License for the specific language 
governing permissions and # limitations under the License. -""" +''' This module comprises the nomad@FAIRDI APIs. Currently there is NOMAD's official api, and we will soon at the optimade api. The app module also servers documentation, gui, and alive. -""" +''' from flask import Flask, Blueprint, jsonify, url_for, abort, request from flask_restplus import Api from flask_cors import CORS @@ -36,11 +36,11 @@ from . import common @property # type: ignore def specs_url(self): - """ + ''' Fixes issue where swagger-ui makes a call to swagger.json over HTTP. This can ONLY be used on servers that actually use HTTPS. On servers that use HTTP, this code should not be used at all. - """ + ''' return url_for(self.endpoint('specs'), _external=True, _scheme='https') @@ -49,7 +49,7 @@ if config.services.https: app = Flask(__name__) -""" The Flask app that serves all APIs. """ +''' The Flask app that serves all APIs. ''' app.config.APPLICATION_ROOT = common.base_path # type: ignore app.config.RESTPLUS_MASK_HEADER = False # type: ignore @@ -105,7 +105,7 @@ def handle(error: Exception): @app.route('/alive') def alive(): - """ Simple endpoint to utilize kubernetes liveness/readiness probing. """ + ''' Simple endpoint to utilize kubernetes liveness/readiness probing. ''' return "I am, alive!" diff --git a/nomad/app/api/__init__.py b/nomad/app/api/__init__.py index e154b7e29fb63183555efa6fcffa92773cc4b5da..de4ed2fde4ba792edb0da23eaf7f06a380b3658d 100644 --- a/nomad/app/api/__init__.py +++ b/nomad/app/api/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' The official NOMAD API. There is a separate documentation for the API endpoints from a client perspective. @@ -22,7 +22,7 @@ There is a separate documentation for the API endpoints from a client perspectiv .. automodule:: nomad.app.api.upload .. automodule:: nomad.app.api.repo .. automodule:: nomad.app.api.archive -""" +''' from .api import blueprint from . import info, auth, upload, repo, archive, raw, mirror, dataset diff --git a/nomad/app/api/api.py b/nomad/app/api/api.py index 9a949d703978c9f7401e1e4a52fa3b9d54e7b244..45df901c6cf1dc50c081728925f3416ba5b100aa 100644 --- a/nomad/app/api/api.py +++ b/nomad/app/api/api.py @@ -23,7 +23,7 @@ api = Api( version='1.0', title='NOMAD API', description='Official NOMAD API', validate=True) -""" Provides the flask restplus api instance for the regular NOMAD api""" +''' Provides the flask restplus api instance for the regular NOMAD api''' # For some unknown reason it is necessary for each fr api to have a handler. # Otherwise the global app error handler won't be called. diff --git a/nomad/app/api/archive.py b/nomad/app/api/archive.py index 5c635f7e7652a8b54d4f06114e7a09b21ffd164b..4c9f66dd50cf1deea65b3e5ee90ad07606ab06bf 100644 --- a/nomad/app/api/archive.py +++ b/nomad/app/api/archive.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' The archive API of the nomad@FAIRDI APIs. This API is about serving processed (parsed and normalized) calculation data in nomad's *meta-info* format. -""" +''' from typing import Dict, Any from io import BytesIO @@ -51,11 +51,11 @@ class ArchiveCalcLogResource(Resource): @api.response(200, 'Archive data send', headers={'Content-Type': 'application/plain'}) @authenticate(signature_token=True) def get(self, upload_id, calc_id): - """ + ''' Get calculation processing log. 
Calcs are references via *upload_id*, *calc_id* pairs. - """ + ''' archive_id = '%s/%s' % (upload_id, calc_id) upload_files = UploadFiles.get( @@ -85,11 +85,11 @@ class ArchiveCalcResource(Resource): @api.response(200, 'Archive data send') @authenticate(signature_token=True) def get(self, upload_id, calc_id): - """ + ''' Get calculation data in archive form. Calcs are references via *upload_id*, *calc_id* pairs. - """ + ''' archive_id = '%s/%s' % (upload_id, calc_id) upload_file = UploadFiles.get( @@ -128,7 +128,7 @@ class ArchiveDownloadResource(Resource): @api.response(200, 'File(s) send', headers={'Content-Type': 'application/zip'}) @authenticate(signature_token=True) def get(self): - """ + ''' Get calculation data in archive form from all query results. See ``/repo`` endpoint for documentation on the search @@ -138,7 +138,7 @@ class ArchiveDownloadResource(Resource): any files that the user is not authorized to access. The zip file will contain a ``manifest.json`` with the repository meta data. - """ + ''' try: args = _archive_download_parser.parse_args() compress = args.get('compress', False) @@ -229,7 +229,7 @@ class ArchiveQueryResource(Resource): @api.marshal_with(_archive_query_model, skip_none=True, code=200, description='Search results sent') @authenticate() def post(self): - """ + ''' Post a query schema and return it filled with archive data. See ``/repo`` endpoint for documentation on the search @@ -237,7 +237,7 @@ class ArchiveQueryResource(Resource): The actual data are in results and a supplementary python code (curl) to execute search is in python (curl). - """ + ''' try: data_in = request.get_json() scroll = data_in.get('scroll', None) @@ -323,9 +323,9 @@ class MetainfoResource(Resource): @api.response(404, 'The metainfo does not exist') @api.response(200, 'Metainfo data send') def get(self, metainfo_package_name): - """ + ''' Get a metainfo definition file. - """ + ''' try: return load_metainfo(metainfo_package_name), 200 except FileNotFoundError: @@ -345,7 +345,7 @@ metainfo_main_path = os.path.dirname(os.path.abspath(nomad_meta_info.__file__)) def load_metainfo( package_name_or_dependency: str, dependency_source: str = None, loaded_packages: Dict[str, Any] = None) -> Dict[str, Any]: - """ + ''' Loads the given metainfo package and all its dependencies. Returns a dict with all loaded package_names and respective packages. @@ -354,7 +354,7 @@ def load_metainfo( dependency_source: The path of the metainfo that uses this function to load a relative dependency. loaded_packages: Give a dict and the function will added freshly loaded packages to it and return it. - """ + ''' if loaded_packages is None: loaded_packages = {} diff --git a/nomad/app/api/auth.py b/nomad/app/api/auth.py index 0ba9876d9a96b49c665259e4141ff8b8a30ef16a..61b1fbc2399ccca2b5390426bb6308c730b88cd9 100644 --- a/nomad/app/api/auth.py +++ b/nomad/app/api/auth.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' The API is protected with *keycloak* and *OpenIDConnect*. All API endpoints that require or support authentication accept OIDC bearer tokens via HTTP header (``Authentication``). These token can be acquired from the NOMAD keycloak server or through the ``/auth`` endpoint @@ -29,7 +29,7 @@ decorator. To allow authentification with signed urls, use this decorator: .. 
autofunction:: with_signature_token -""" +''' from flask import g, request from flask_restplus import abort, Resource, fields import functools @@ -69,11 +69,11 @@ api.authorizations = { def _verify_upload_token(token) -> str: - """ + ''' Verifies the upload token generated with :func:`generate_upload_token`. Returns: The user UUID or None if the toke could not be verified. - """ + ''' payload, signature = token.split('.') payload = utils.base64_decode(payload) signature = utils.base64_decode(signature) @@ -92,7 +92,7 @@ def _verify_upload_token(token) -> str: def authenticate( basic: bool = False, upload_token: bool = False, signature_token: bool = False, required: bool = False, admin_only: bool = False): - """ + ''' A decorator to protect API endpoints with authentication. Uses keycloak access token to authenticate users. Other methods might apply. Will abort with 401 if necessary. @@ -103,7 +103,7 @@ def authenticate( signature_token: Also allow signed urls required: Authentication is required admin_only: Only the admin user is allowed to use the endpoint. - """ + ''' methods = ['OpenIDConnect Bearer Token'] if basic: methods.append('HTTP Basic Authentication') @@ -192,7 +192,7 @@ class AuthResource(Resource): @api.marshal_with(auth_model, skip_none=True, code=200, description='Auth info send') @authenticate(required=True, basic=True) def get(self): - """ + ''' Provides authentication information. This endpoint requires authentification. Like all endpoints the OIDC access token based authentification. In additional, basic HTTP authentification can be used. This allows to login and acquire an @@ -202,7 +202,7 @@ class AuthResource(Resource): URLs with a ``signature_token`` query parameter, e.g. for file downloads on the raw or archive api endpoints; a short ``upload_token`` that is used in ``curl`` command line based uploads; and the OIDC JWT access token. - """ + ''' def signature_token(): expires_at = datetime.datetime.utcnow() + datetime.timedelta(seconds=10) @@ -239,7 +239,7 @@ class UsersResource(Resource): @api.marshal_with(users_model, code=200, description='User suggestions send') @api.expect(users_parser, validate=True) def get(self): - """ Get existing users. """ + ''' Get existing users. ''' args = users_parser.parse_args() return dict(users=infrastructure.keycloak.search_user(args.get('query'))) @@ -248,7 +248,7 @@ class UsersResource(Resource): @api.marshal_with(user_model, code=200, skip_none=True, description='User invited') @api.expect(user_model, validate=True) def put(self): - """ Invite a new user. """ + ''' Invite a new user. ''' if config.keycloak.oasis: abort(400, 'User invide does not work this NOMAD OASIS') @@ -273,10 +273,10 @@ class UsersResource(Resource): def with_signature_token(func): - """ + ''' A decorator for API endpoint implementations that validates signed URLs. Token to sign URLs can be retrieved via the ``/auth`` endpoint. - """ + ''' @functools.wraps(func) @api.response(401, 'Invalid or expired signature token') def wrapper(*args, **kwargs): @@ -302,10 +302,10 @@ def with_signature_token(func): def create_authorization_predicate(upload_id, calc_id=None): - """ + ''' Returns a predicate that determines if the logged in user has the authorization to access the given upload and calculation. 
- """ + ''' def func(): if g.user is None: # guest users don't have authorized access to anything diff --git a/nomad/app/api/common.py b/nomad/app/api/common.py index f9e0692969d9a398da10c53725a32d07516dbf12..f69f5570b60d0fec82dabcb802f86d78b4a33367 100644 --- a/nomad/app/api/common.py +++ b/nomad/app/api/common.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' Common data, variables, decorators, models used throughout the API. -""" +''' from typing import Callable, IO, Set, Tuple, Iterable, Dict, Any from flask_restplus import fields import zipstream @@ -24,8 +24,7 @@ from urllib.parse import urlencode import sys import os.path -from nomad import search, config -from nomad.datamodel import Domain +from nomad import search, config, datamodel from nomad.app.optimade import filterparser from nomad.app.common import RFC3339DateTime, rfc3339DateTime from nomad.files import Restricted @@ -57,7 +56,7 @@ pagination_model = api.model('Pagination', { 'order_by': fields.String(description='Sorting criterion.'), 'order': fields.Integer(description='Sorting order -1 for descending, 1 for asceding.') }) -""" Model used in responses with pagination. """ +''' Model used in responses with pagination. ''' scroll_model = api.model('Scroll', { 'scroll': fields.Boolean(default=False, description='Flag if scrolling is enables.'), @@ -79,13 +78,13 @@ search_model_fields = { search_model = api.model('Search', search_model_fields) query_model_fields = { - quantity.qualified_name: fields.Raw(description=quantity.description) - for quantity in Domain.all_quantities()} + qualified_name: fields.Raw(description=quantity.description) + for qualified_name, quantity in search.search_quantities.items()} query_model_fields.update(**{ 'owner': fields.String(description='The group the calculations belong to.', allow_null=True, skip_none=True), 'domain': fields.String(description='Specify the domain to search in: %s, default is ``%s``' % ( - ', '.join(['``%s``' % key for key in Domain.instances.keys()]), config.default_domain)), + ', '.join(['``%s``' % domain for domain in datamodel.domains]), config.default_domain)), 'from_time': fields.Raw(description='The minimum entry time.', allow_null=True, skip_none=True), 'until_time': fields.Raw(description='The maximum entry time.', allow_null=True, skip_none=True) }) @@ -94,7 +93,7 @@ query_model = api.model('Query', query_model_fields) def add_pagination_parameters(request_parser): - """ Add pagination parameters to Flask querystring parser. """ + ''' Add pagination parameters to Flask querystring parser. ''' request_parser.add_argument( 'page', type=int, help='The page, starting with 1.', location='args') request_parser.add_argument( @@ -111,7 +110,7 @@ pagination_request_parser = request_parser.copy() def add_scroll_parameters(request_parser): - """ Add scroll parameters to Flask querystring parser. """ + ''' Add scroll parameters to Flask querystring parser. ''' request_parser.add_argument( 'scroll', type=bool, help='Enable scrolling') request_parser.add_argument( @@ -119,12 +118,12 @@ def add_scroll_parameters(request_parser): def add_search_parameters(request_parser): - """ Add search parameters to Flask querystring parser. """ + ''' Add search parameters to Flask querystring parser. 
''' # more search parameters request_parser.add_argument( 'domain', type=str, help='Specify the domain to search in: %s, default is ``%s``' % ( - ', '.join(['``%s``' % key for key in Domain.instances.keys()]), + ', '.join(['``%s``' % domain for domain in datamodel.domains]), config.default_domain)) request_parser.add_argument( 'owner', type=str, @@ -137,20 +136,18 @@ def add_search_parameters(request_parser): help='A yyyy-MM-ddTHH:mm:ss (RFC3339) maximum entry time (e.g. upload time)') # main search parameters - for quantity in Domain.all_quantities(): + for qualified_name, quantity in search.search_quantities.items(): request_parser.add_argument( - quantity.qualified_name, help=quantity.description, - action=quantity.argparse_action if quantity.multi else None) + qualified_name, help=quantity.description, action=quantity.argparse_action) -_search_quantities = set([ - domain.qualified_name for domain in Domain.all_quantities()]) +_search_quantities = set(search.search_quantities.keys()) def apply_search_parameters(search_request: search.SearchRequest, args: Dict[str, Any]): - """ + ''' Help that adds query relevant request args to the given SearchRequest. - """ + ''' args = {key: value for key, value in args.items() if value is not None} # domain @@ -196,7 +193,7 @@ def apply_search_parameters(search_request: search.SearchRequest, args: Dict[str def calc_route(ns, prefix: str = ''): - """ A resource decorator for /<upload>/<calc> based routes. """ + ''' A resource decorator for /<upload>/<calc> based routes. ''' def decorator(func): ns.route('%s/<string:upload_id>/<string:calc_id>' % prefix)( api.doc(params={ @@ -208,7 +205,7 @@ def calc_route(ns, prefix: str = ''): def upload_route(ns, prefix: str = ''): - """ A resource decorator for /<upload> based routes. """ + ''' A resource decorator for /<upload> based routes. ''' def decorator(func): ns.route('%s/<string:upload_id>' % prefix)( api.doc(params={ @@ -221,7 +218,7 @@ def upload_route(ns, prefix: str = ''): def streamed_zipfile( files: Iterable[Tuple[str, str, Callable[[str], IO], Callable[[str], int]]], zipfile_name: str, compress: bool = False): - """ + ''' Creates a response that streams the given files as a streamed zip file. Ensures that each given file is only streamed once, based on its filename in the resulting zipfile. @@ -232,17 +229,17 @@ def streamed_zipfile( zipfile_name: A name that will be used in the content disposition attachment used as an HTTP respone. compress: Uses compression. Default is stored only. - """ + ''' streamed_files: Set[str] = set() def generator(): - """ Stream a zip file with all files using zipstream. """ + ''' Stream a zip file with all files using zipstream. ''' def iterator(): - """ + ''' Replace the directory based iter of zipstream with an iter over all given files. - """ + ''' # the actual contents for zipped_filename, file_id, open_io, file_size in files: if zipped_filename in streamed_files: @@ -286,12 +283,12 @@ def streamed_zipfile( def query_api_url(*args, query_string: Dict[str, Any] = None): - """ + ''' Creates a API URL. 
Arguments: *args: URL path segments after the API base URL query_string: A dict with query string parameters - """ + ''' url = os.path.join(config.api_url(False), *args) if query_string is not None: url = '%s?%s' % (url, urlencode(query_string, doseq=True)) @@ -300,10 +297,10 @@ def query_api_url(*args, query_string: Dict[str, Any] = None): def query_api_python(*args, **kwargs): - """ + ''' Creates a string of python code to execute a search query to the repository using the requests library. - """ + ''' url = query_api_url(*args, **kwargs) return '''import requests response = requests.post("{}") @@ -311,8 +308,8 @@ data = response.json()'''.format(url) def query_api_curl(*args, **kwargs): - """ + ''' Creates a string of curl command to execute a search query to the repository. - """ + ''' url = query_api_url(*args, **kwargs) return 'curl -X POST %s -H "accept: application/json" --output "nomad.json"' % url diff --git a/nomad/app/api/dataset.py b/nomad/app/api/dataset.py index 7ff4d875675fbf4400ed55223823a5f1eeed8054..5adb190e15f8c0e15f32c724eed877000cbf1c1b 100644 --- a/nomad/app/api/dataset.py +++ b/nomad/app/api/dataset.py @@ -49,7 +49,7 @@ class DatasetListResource(Resource): @api.expect(list_datasets_parser) @authenticate(required=True) def get(self): - """ Retrieve a list of all datasets of the authenticated user. """ + ''' Retrieve a list of all datasets of the authenticated user. ''' args = { key: value for key, value in list_datasets_parser.parse_args().items() if value is not None} @@ -76,7 +76,7 @@ class DatasetListResource(Resource): @api.expect(dataset_model) @authenticate(required=True) def put(self): - """ Creates a new dataset. """ + ''' Creates a new dataset. ''' data = request.get_json() if data is None: data = {} @@ -112,7 +112,7 @@ class DatasetResource(Resource): @api.marshal_with(dataset_model, skip_none=True, code=200, description='Dateset send') @authenticate(required=True) def get(self, name: str): - """ Retrieve a dataset by name. """ + ''' Retrieve a dataset by name. ''' try: result = Dataset.m_def.m_x('me').get(user_id=g.user.user_id, name=name) except KeyError: @@ -126,7 +126,7 @@ class DatasetResource(Resource): @api.marshal_with(dataset_model, skip_none=True, code=200, description='DOI assigned') @authenticate(required=True) def post(self, name: str): - """ Assign a DOI to the dataset. """ + ''' Assign a DOI to the dataset. ''' try: result = Dataset.m_def.m_x('me').get(user_id=g.user.user_id, name=name) except KeyError: @@ -168,7 +168,7 @@ class DatasetResource(Resource): @api.marshal_with(dataset_model, skip_none=True, code=200, description='Dateset deleted') @authenticate(required=True) def delete(self, name: str): - """ Delete the dataset. """ + ''' Delete the dataset. ''' try: result = Dataset.m_def.m_x('me').get(user_id=g.user.user_id, name=name) except KeyError: diff --git a/nomad/app/api/info.py b/nomad/app/api/info.py index e6e87e43781ebd07708901660c23b9aae87794d7..0630498d380f87faa7109ffb86f63b6df8d925ad 100644 --- a/nomad/app/api/info.py +++ b/nomad/app/api/info.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' API endpoint that deliver backend configuration details. -""" +''' from flask_restplus import Resource, fields @@ -69,7 +69,7 @@ class InfoResource(Resource): @api.doc('get_info') @api.marshal_with(info_model, skip_none=True, code=200, description='Info send') def get(self): - """ Return information about the nomad backend and its configuration. 
""" + ''' Return information about the nomad backend and its configuration. ''' codes = [ parser.code_name for parser in parsing.parser_dict.values() @@ -83,16 +83,13 @@ class InfoResource(Resource): 'normalizers': [normalizer.__name__ for normalizer in normalizing.normalizers], 'domains': [ { - 'name': domain.name, - 'quantities': [quantity for quantity in domain.quantities.values()], - 'metrics_names': domain.metrics_names, - 'aggregations_names': domain.aggregations_names, + 'name': domain_name, 'metainfo': { - 'all_package': domain.metainfo_all_package, - 'root_sections': domain.root_sections + 'all_package': domain['metainfo_all_package'], + 'root_section': domain['root_section'] } } - for domain in datamodel.Domain.instances.values() + for domain_name, domain in datamodel.domains.items() ], 'version': config.version, 'release': config.release, diff --git a/nomad/app/api/mirror.py b/nomad/app/api/mirror.py index 9353600c16fd8044b2986978bb754a0733a0f8bb..e0794d0da9baa521132ff8dc2f77af1af6e0ec10 100644 --- a/nomad/app/api/mirror.py +++ b/nomad/app/api/mirror.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' The mirror API of the nomad@FAIRDI APIs. Allows to export upload metadata. -""" +''' from flask import request from flask_restplus import Resource, abort, fields @@ -82,9 +82,9 @@ class MirrorUploadResource(Resource): @api.doc('get_upload_mirror') @authenticate(admin_only=True) def get(self, upload_id): - """ + ''' Export upload (and all calc) metadata for mirrors. - """ + ''' try: upload = proc.Upload.get(upload_id) except KeyError: diff --git a/nomad/app/api/raw.py b/nomad/app/api/raw.py index b08abd4b056a34d42dffe4472352d24edb81d179..b529c7b43909e3897fa097f307cf4ada02091cc5 100644 --- a/nomad/app/api/raw.py +++ b/nomad/app/api/raw.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' The raw API of the nomad@FAIRDI APIs. Can be used to retrieve raw calculation files. -""" +''' from typing import IO, Any, Union, List import os.path @@ -71,13 +71,13 @@ _raw_file_from_path_parser.add_argument( class FileView: - """ + ''' File-like wrapper that restricts the contents to a portion of the file. Arguments: f: the file-like offset: the offset length: the amount of bytes - """ + ''' def __init__(self, f, offset, length): self.f = f self.f_offset = offset @@ -110,10 +110,10 @@ class FileView: def get_raw_file_from_upload_path( upload_files, upload_filepath, authorization_predicate, mainfile: str = None): - """ + ''' Helper method used by func:`RawFileFromUploadPathResource.get` and func:`RawFileFromCalcPathResource.get`. - """ + ''' upload_filepath = upload_filepath.rstrip('/') if upload_filepath[-1:] == '*': @@ -197,7 +197,7 @@ class RawFileFromUploadPathResource(Resource): @api.expect(_raw_file_from_path_parser, validate=True) @authenticate(signature_token=True) def get(self, upload_id: str, path: str): - """ Get a single raw calculation file, directory contents, or whole directory sub-tree + ''' Get a single raw calculation file, directory contents, or whole directory sub-tree from a given upload. The 'upload_id' parameter needs to identify an existing upload. @@ -223,7 +223,7 @@ class RawFileFromUploadPathResource(Resource): match the given path at the start, will be returned as a .zip file body. 
Zip files are streamed; instead of 401 errors, the zip file will just not contain any files that the user is not authorized to access. - """ + ''' # TODO this is a quick fix, since swagger cannot deal with not encoded path parameters if path is not None: path = urllib.parse.unquote(path) @@ -258,7 +258,7 @@ class RawFileFromCalcPathResource(Resource): @api.expect(_raw_file_from_path_parser, validate=True) @authenticate(signature_token=True) def get(self, upload_id: str, calc_id: str, path: str): - """ Get a single raw calculation file, calculation contents, or all files for a + ''' Get a single raw calculation file, calculation contents, or all files for a given calculation. The 'upload_id' parameter needs to identify an existing upload. @@ -266,7 +266,7 @@ class RawFileFromCalcPathResource(Resource): This endpoint behaves exactly like /raw/<upload_id>/<path>, but the path is now relative to the calculation and not the upload. - """ + ''' # TODO this is a quick fix, since swagger cannot deal with not encoded path parameters if path is not None: path = urllib.parse.unquote(path) @@ -300,11 +300,11 @@ class RawFileFromCalcEmptyPathResource(RawFileFromCalcPathResource): @api.expect(_raw_file_from_path_parser, validate=True) @authenticate(signature_token=True) def get(self, upload_id: str, calc_id: str): - """ Get calculation contents. + ''' Get calculation contents. This is basically /raw/calc/<upload_id>/<calc_id>/<path> with an empty path, since having an empty path parameter is not possible. - """ + ''' return super().get(upload_id, calc_id, None) @@ -336,11 +336,11 @@ class RawFilesResource(Resource): @api.expect(_raw_files_request_model, validate=True) @authenticate() def post(self, upload_id): - """ Download multiple raw calculation files in a .zip file. + ''' Download multiple raw calculation files in a .zip file. Zip files are streamed; instead of 401 errors, the zip file will just not contain any files that the user is not authorized to access. - """ + ''' json_data = request.get_json() compress = json_data.get('compress', False) files = [file.strip() for file in json_data['files']] @@ -353,12 +353,12 @@ class RawFilesResource(Resource): @api.expect(_raw_files_request_parser, validate=True) @authenticate(signature_token=True) def get(self, upload_id): - """ + ''' Download multiple raw calculation files. Download multiple raw calculation files in a .zip file. Zip files are streamed; instead of 401 errors, the zip file will just not contain any files that the user is not authorized to access. - """ + ''' args = _raw_files_request_parser.parse_args() files_str = args.get('files') @@ -401,7 +401,7 @@ class RawFileQueryResource(Resource): @api.response(200, 'File(s) send', headers={'Content-Type': 'application/zip'}) @authenticate(signature_token=True) def get(self): - """ Download a .zip file with all raw-files for all entries that match the given + ''' Download a .zip file with all raw-files for all entries that match the given search parameters. See ``/repo`` endpoint for documentation on the search @@ -411,7 +411,7 @@ class RawFileQueryResource(Resource): any files that the user is not authorized to access. The zip file will contain a ``manifest.json`` with the repository meta data. 
- """ + ''' logger = common.logger.bind(query=urllib.parse.urlencode(request.args, doseq=True)) patterns: List[str] = None diff --git a/nomad/app/api/repo.py b/nomad/app/api/repo.py index 6864a2909efa5fc0cb6c883df50dcc2566c904db..dfd783519be1d191f1a5cb7f22e8fa04f5f1fb8d 100644 --- a/nomad/app/api/repo.py +++ b/nomad/app/api/repo.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' The repository API of the nomad@FAIRDI APIs. Currently allows to resolve repository meta-data. -""" +''' from typing import List, Dict, Any from flask_restplus import Resource, abort, fields @@ -26,7 +26,7 @@ import elasticsearch.helpers from datetime import datetime from nomad import search, utils, datamodel, processing as proc, infrastructure -from nomad.datamodel import UserMetadata, Dataset, User, Domain +from nomad.datamodel import Dataset, User, EditableUserMetadata from nomad.app import common from nomad.app.common import RFC3339DateTime, DotKeyNested @@ -47,12 +47,12 @@ class RepoCalcResource(Resource): @api.doc('get_repo_calc') @authenticate() def get(self, upload_id, calc_id): - """ + ''' Get calculation metadata in repository form. Repository metadata only entails the quantities shown in the repository. Calcs are references via *upload_id*, *calc_id* pairs. - """ + ''' try: calc = search.Entry.get(calc_id) except NotFoundError: @@ -88,7 +88,7 @@ _search_request_parser.add_argument( 'exclude', type=str, action='split', help='Excludes the given keys in the returned data.') for group_name in search.groups: _search_request_parser.add_argument( - group_name, type=bool, help=('Return %s group data.' % group_name)) + 'group_%s' % group_name, type=bool, help=('Return %s group data.' % group_name)) _search_request_parser.add_argument( '%s_after' % group_name, type=str, help='The last %s id of the last scroll window for the %s group' % (group_name, group_name)) @@ -100,14 +100,14 @@ _repo_calcs_model_fields = { 'There is a pseudo quantity "total" with a single value "all" that contains the ' ' metrics over all results. ' % ', '.join(search.metrics_names)))} -for group_name, (group_quantity, _) in search.groups.items(): +for group_name in search.groups: _repo_calcs_model_fields[group_name] = (DotKeyNested if '.' in group_name else fields.Nested)(api.model('RepoGroup', { 'after': fields.String(description='The after value that can be used to retrieve the next %s.' % group_name), - 'values': fields.Raw(description='A dict with %s as key. The values are dicts with "total" and "examples" keys.' % group_quantity) + 'values': fields.Raw(description='A dict with %s as key. The values are dicts with "total" and "examples" keys.' % group_name) }), skip_none=True) -for quantity in Domain.all_quantities(): - _repo_calcs_model_fields[quantity.name] = fields.Raw( +for qualified_name, quantity in search.search_quantities.items(): + _repo_calcs_model_fields[qualified_name] = fields.Raw( description=quantity.description, allow_null=True, skip_none=True) _repo_calcs_model = api.inherit('RepoCalculations', search_model, _repo_calcs_model_fields) @@ -121,7 +121,7 @@ class RepoCalcsResource(Resource): @api.marshal_with(_repo_calcs_model, skip_none=True, code=200, description='Search results send') @authenticate() def get(self): - """ + ''' Search for calculations in the repository form, paginated. The ``owner`` parameter determines the overall entries to search through. 
@@ -151,7 +151,7 @@ class RepoCalcsResource(Resource): Ordering is determined by ``order_by`` and ``order`` parameters. Default is ``upload_time`` in decending order. - """ + ''' try: parsed_args = _search_request_parser.parse_args() @@ -170,7 +170,7 @@ class RepoCalcsResource(Resource): metrics: List[str] = request.args.getlist('metrics') with_statistics = args.get('statistics', False) or \ - any(args.get(group_name, False) for group_name in search.groups) + any(args.get('group_%s' % group_name, False) for group_name in search.groups) except Exception as e: abort(400, message='bad parameters: %s' % str(e)) @@ -196,9 +196,9 @@ class RepoCalcsResource(Resource): search_request.default_statistics(metrics_to_use=metrics) additional_metrics = [ - metric - for group_name, (_, metric) in search.groups.items() - if args.get(group_name, False)] + group_quantity.metric_name + for group_name, group_quantity in search.groups.items() + if args.get('group_%s' % group_name, False)] total_metrics = metrics + additional_metrics @@ -217,13 +217,13 @@ class RepoCalcsResource(Resource): results = search_request.execute_scrolled(scroll_id=scroll_id, size=per_page) else: - for group_name, (group_quantity, _) in search.groups.items(): - if args.get(group_name, False): + for group_name, group_quantity in search.groups.items(): + if args.get('group_%s' % group_name, False): kwargs: Dict[str, Any] = {} - if group_name == 'uploads': + if group_name == 'group_uploads': kwargs.update(order_by='upload_time', order='desc') search_request.quantity( - group_quantity, size=per_page, examples=1, + group_quantity.qualified_name, size=per_page, examples=1, after=request.args.get('%s_after' % group_name, None), **kwargs) @@ -239,9 +239,9 @@ class RepoCalcsResource(Resource): if 'quantities' in results: quantities = results.pop('quantities') - for group_name, (group_quantity, _) in search.groups.items(): - if args.get(group_name, False): - results[group_name] = quantities[group_quantity] + for group_name, group_quantity in search.groups.items(): + if args.get('group_%s' % group_name, False): + results[group_name] = quantities[group_quantity.qualified_name] # build python code/curl snippet code_args = dict(request.args) @@ -265,13 +265,13 @@ _query_model_parameters = { 'until_time': RFC3339DateTime(description='A yyyy-MM-ddTHH:mm:ss (RFC3339) maximum entry time (e.g. upload time)') } -for quantity in datamodel.Domain.all_quantities(): - if quantity.multi and quantity.argparse_action is None: +for qualified_name, quantity in search.search_quantities.items(): + if quantity.many_and: def field(**kwargs): return fields.List(fields.String(**kwargs)) else: field = fields.String - _query_model_parameters[quantity.name] = field(description=quantity.description) + _query_model_parameters[qualified_name] = field(description=quantity.description) _repo_query_model = api.model('RepoQuery', _query_model_parameters, skip_none=True) @@ -296,13 +296,16 @@ _repo_edit_model = api.model('RepoEdit', { 'actions': fields.Nested( api.model('RepoEditActions', { quantity.name: repo_edit_action_field(quantity) - for quantity in UserMetadata.m_def.all_quantities.values() + for quantity in EditableUserMetadata.m_def.definitions }), skip_none=True, description='Each action specifies a single value (even for multi valued quantities).'), 'success': fields.Boolean(description='If the overall edit can/could be done. Only in API response.'), 'message': fields.String(description='A message that details the overall edit result. 
Only in API response.') }) +_editable_quantities = { + quantity.name: quantity for quantity in EditableUserMetadata.m_def.definitions} + def edit(parsed_query: Dict[str, Any], mongo_update: Dict[str, Any] = None, re_index=True) -> List[str]: # get all calculations that have to change @@ -327,8 +330,8 @@ def edit(parsed_query: Dict[str, Any], mongo_update: Dict[str, Any] = None, re_i if re_index: def elastic_updates(): for calc in proc.Calc.objects(calc_id__in=calc_ids): - entry = search.Entry.from_calc_with_metadata( - datamodel.CalcWithMetadata(**calc['metadata'])) + entry = search.create_entry( + datamodel.EntryMetadata.m_from_dict(calc['metadata'])) entry = entry.to_dict(include_meta=True) entry['_op_type'] = 'index' yield entry @@ -345,7 +348,7 @@ def edit(parsed_query: Dict[str, Any], mongo_update: Dict[str, Any] = None, re_i def get_uploader_ids(query): - """ Get all the uploader from the query, to check coauthers and shared_with for uploaders. """ + ''' Get all the uploader from the query, to check coauthers and shared_with for uploaders. ''' search_request = search.SearchRequest() apply_search_parameters(search_request, query) search_request.quantity(name='uploader_id') @@ -360,7 +363,7 @@ class EditRepoCalcsResource(Resource): @api.marshal_with(_repo_edit_model, skip_none=True, code=200, description='Edit verified/performed') @authenticate() def post(self): - """ Edit repository metadata. """ + ''' Edit repository metadata. ''' # basic body parsing and some semantic checks json_data = request.get_json() @@ -382,9 +385,10 @@ class EditRepoCalcsResource(Resource): parsed_query = {} for quantity_name, value in query.items(): if quantity_name in _search_quantities: - quantity = datamodel.Domain.get_quantity(quantity_name) - if quantity.multi and quantity.argparse_action == 'split' and not isinstance(value, list): - value = value.split(',') + quantity = search.search_quantities[quantity_name] + if quantity.many: + if not isinstance(value, list): + value = value.split(',') parsed_query[quantity_name] = value parsed_query['owner'] = owner parsed_query['domain'] = query.get('domain') @@ -398,7 +402,7 @@ class EditRepoCalcsResource(Resource): with utils.timer(common.logger, 'edit verified'): for action_quantity_name, quantity_actions in actions.items(): - quantity = UserMetadata.m_def.all_quantities.get(action_quantity_name) + quantity = _editable_quantities.get(action_quantity_name) if quantity is None: abort(400, 'Unknown quantity %s' % action_quantity_name) @@ -564,7 +568,7 @@ class RepoQuantityResource(Resource): @api.marshal_with(_repo_quantity_values_model, skip_none=True, code=200, description='Search results send') @authenticate() def get(self, quantity: str): - """ + ''' Retrieve quantity values from entries matching the search. You can use the various quantities to search/filter for. For some of the @@ -580,7 +584,7 @@ class RepoQuantityResource(Resource): The result will contain a 'quantity' key with quantity values and the "after" value. There will be upto 'size' many values. For the rest of the values use the "after" parameter in another request. - """ + ''' search_request = search.SearchRequest() args = { @@ -631,7 +635,7 @@ class RepoQuantitiesResource(Resource): @api.marshal_with(_repo_quantities_model, skip_none=True, code=200, description='Search results send') @authenticate() def get(self): - """ + ''' Retrieve quantity values for multiple quantities at once. You can use the various quantities to search/filter for. 
For some of the @@ -645,7 +649,7 @@ class RepoQuantitiesResource(Resource): The result will contain a 'quantities' key with a dict of quantity names and the retrieved values as values. - """ + ''' search_request = search.SearchRequest() args = { diff --git a/nomad/app/api/upload.py b/nomad/app/api/upload.py index 8655a0e8db4ecf05480369b7652ca7d66587cda5..cb04f51589a96fbf16b75489d3fbef593a642a3f 100644 --- a/nomad/app/api/upload.py +++ b/nomad/app/api/upload.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' The upload API of the nomad@FAIRDI APIs. Provides endpoints to upload files and get the processing status of uploads. -""" +''' +from typing import Dict, Any from flask import g, request, Response from flask_restplus import Resource, fields, abort from datetime import datetime @@ -44,8 +45,8 @@ ns = api.namespace( class CalcMetadata(fields.Raw): def format(self, value): - calc_with_metadata = datamodel.CalcWithMetadata(**value) - return search.Entry.from_calc_with_metadata(calc_with_metadata).to_dict() + entry_metadata = datamodel.EntryMetadata.m_from_dict(value) + return search.create_entry(entry_metadata).to_dict() proc_model = api.model('Processing', { @@ -141,10 +142,10 @@ def disable_marshalling(f): def marshal_with(*args, **kwargs): - """ + ''' A special version of the RESTPlus marshal_with decorator that allows to disable marshalling at runtime by raising DisableMarshalling. - """ + ''' def decorator(func): @api.marshal_with(*args, **kwargs) def with_marshalling(*args, **kwargs): @@ -175,7 +176,7 @@ class UploadListResource(Resource): @api.expect(upload_list_parser) @authenticate(required=True) def get(self): - """ Get the list of all uploads from the authenticated user. """ + ''' Get the list of all uploads from the authenticated user. ''' try: state = request.args.get('state', 'unpublished') name = request.args.get('name', None) @@ -220,7 +221,7 @@ class UploadListResource(Resource): @marshal_with(upload_model, skip_none=True, code=200, description='Upload received') @authenticate(required=True, upload_token=True) def put(self): - """ + ''' Upload a file and automatically create a new upload in the process. Can be used to upload files via browser or other http clients like curl. This will also start the processing of the upload. @@ -237,7 +238,7 @@ class UploadListResource(Resource): There is a general limit on how many unpublished uploads a user can have. Will return 400 if this limit is exceeded. - """ + ''' # check existence of local_path if local_path is used local_path = request.args.get('local_path') if local_path: @@ -345,12 +346,12 @@ class UploadResource(Resource): @api.expect(pagination_request_parser) @authenticate(required=True) def get(self, upload_id: str): - """ + ''' Get an update for an existing upload. Will not only return the upload, but also its calculations paginated. Use the pagination params to determine the page. - """ + ''' try: upload = Upload.get(upload_id) except KeyError: @@ -398,12 +399,12 @@ class UploadResource(Resource): @api.marshal_with(upload_model, skip_none=True, code=200, description='Upload deleted') @authenticate(required=True) def delete(self, upload_id: str): - """ + ''' Delete an existing upload. Only uploads that are sill in staging, not already deleted, not still uploaded, and not currently processed, can be deleted. 
- """ + ''' try: upload = Upload.get(upload_id) except KeyError: @@ -436,7 +437,7 @@ class UploadResource(Resource): @api.expect(upload_operation_model) @authenticate(required=True) def post(self, upload_id): - """ + ''' Execute an upload operation. Available operations are ``publish`` and ``re-process`` Publish accepts further meta data that allows to provide coauthors, comments, @@ -449,7 +450,7 @@ class UploadResource(Resource): Re-process will re-process the upload and produce updated repository metadata and archive. Only published uploads that are not processing at the moment are allowed. Only for uploads where calculations have been processed with an older nomad version. - """ + ''' try: upload = Upload.get(upload_id) except KeyError: @@ -464,12 +465,18 @@ class UploadResource(Resource): operation = json_data.get('operation') - metadata = json_data.get('metadata', {}) - for key in metadata: - if key.startswith('_'): + user_metadata: Dict[str, Any] = json_data.get('metadata', {}) + metadata: Dict[str, Any] = {} + for user_key in user_metadata: + if user_key.startswith('_'): if not g.user.is_admin: abort(401, message='Only admin users can use _metadata_keys.') - break + + key = user_key[1:] + else: + key = user_key + + metadata[key] = user_metadata[user_key] if operation == 'publish': if upload.tasks_running: @@ -519,7 +526,7 @@ class UploadCommandResource(Resource): @api.marshal_with(upload_command_model, code=200, description='Upload command send') @authenticate(required=True) def get(self): - """ Get url and example command for shell based uploads. """ + ''' Get url and example command for shell based uploads. ''' token = generate_upload_token(g.user) upload_url = '%s/uploads/?token=%s' % (config.api_url(ssl=False), token) upload_url_with_name = upload_url + '&name=<name>' diff --git a/nomad/app/common.py b/nomad/app/common.py index 3ca5c2999592ae2abfe58b6e649d08461ce44a18..b4c5323b864af04864976c2097c0daad7210a211 100644 --- a/nomad/app/common.py +++ b/nomad/app/common.py @@ -22,10 +22,10 @@ from nomad import config logger: BoundLogger = None -""" A logger pre configured with information about the current request. """ +''' A logger pre configured with information about the current request. ''' base_path = config.services.api_base_path -""" Provides the root path of the nomad APIs. """ +''' Provides the root path of the nomad APIs. ''' class RFC3339DateTime(fields.DateTime): @@ -41,7 +41,7 @@ rfc3339DateTime = RFC3339DateTime() class DotKeyFieldMixin: - """ Allows use of flask_restplus fields with '.' in key names. By default, '.' + ''' Allows use of flask_restplus fields with '.' in key names. By default, '.' is used as a separator for accessing nested properties. Mixin prevents this, allowing fields to use '.' in the key names. @@ -53,7 +53,7 @@ class DotKeyFieldMixin: flask_restplus tries to fetch values for data['my']['dot']['field'] instead of data['my.dot.field'] which is the desired behaviour in this case. 
- """ + ''' def output(self, key, obj, **kwargs): transformed_obj = {k.replace(".", "___"): v for k, v in obj.items()} @@ -67,10 +67,10 @@ class DotKeyFieldMixin: @contextmanager def toggle_attribute(self): - """ Context manager to temporarily set self.attribute to None + ''' Context manager to temporarily set self.attribute to None Yields self.attribute before setting to None - """ + ''' attribute = self.attribute self.attribute = None yield attribute diff --git a/nomad/app/optimade/__init__.py b/nomad/app/optimade/__init__.py index b2573598d83d2481db75ddc24a7bd71adc42f59e..913892a96a55341268645f3f0519086b17df30fb 100644 --- a/nomad/app/optimade/__init__.py +++ b/nomad/app/optimade/__init__.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' The optimade implementation of NOMAD. -""" +''' from flask import Blueprint from flask_restplus import Api diff --git a/nomad/app/optimade/api.py b/nomad/app/optimade/api.py index 6a9057b4729da46f4c0bbb12f5276bad68d44f4b..c974d712e234498921401f9d0e428db739ee6167 100644 --- a/nomad/app/optimade/api.py +++ b/nomad/app/optimade/api.py @@ -26,7 +26,7 @@ base_url = 'http://%s/%s/optimade' % ( def url(endpoint: str = None, **kwargs): - """ Returns the full optimade api url (for a given endpoint) including query parameters. """ + ''' Returns the full optimade api url (for a given endpoint) including query parameters. ''' if endpoint is None: url = base_url else: @@ -43,7 +43,7 @@ api = Api( version='1.0', title='NOMAD\'s OPTiMaDe API implementation', description='NOMAD\'s OPTiMaDe API implementation, version 0.10.0.', validate=True) -""" Provides the flask restplust api instance for the optimade api""" +''' Provides the flask restplust api instance for the optimade api''' # For some unknown reason it is necessary for each fr api to have a handler. diff --git a/nomad/app/optimade/endpoints.py b/nomad/app/optimade/endpoints.py index 9efb314e01825993b6037f159fbcccd1e508b0de..5f41a8f292bf282c189112b95e1fd8a0beef36e4 100644 --- a/nomad/app/optimade/endpoints.py +++ b/nomad/app/optimade/endpoints.py @@ -41,9 +41,9 @@ def base_request_args(): def base_search_request(): - """ Creates a search request for all public and optimade enabled data. """ + ''' Creates a search request for all public and optimade enabled data. ''' return search.SearchRequest().owner('all', None).query( - Q('exists', field='dft.optimade.nelements')) # TODO use the elastic annotations when done + Q('exists', field='dft.optimade.elements')) # TODO use the elastic annotations when done @ns.route('/calculations') @@ -53,7 +53,7 @@ class CalculationList(Resource): @api.expect(entry_listing_endpoint_parser, validate=True) @api.marshal_with(json_api_list_response_model, skip_none=True, code=200) def get(self): - """ Retrieve a list of calculations that match the given Optimade filter expression. """ + ''' Retrieve a list of calculations that match the given Optimade filter expression. ''' request_fields = base_request_args() try: @@ -106,7 +106,7 @@ class Calculation(Resource): @api.expect(single_entry_endpoint_parser, validate=True) @api.marshal_with(json_api_single_response_model, skip_none=True, code=200) def get(self, id: str): - """ Retrieve a single calculation for the given id. """ + ''' Retrieve a single calculation for the given id. 
''' request_fields = base_request_args() search_request = base_search_request().search_parameters(calc_id=id) @@ -134,7 +134,7 @@ class CalculationInfo(Resource): @api.expect(base_endpoint_parser, validate=True) @api.marshal_with(json_api_info_response_model, skip_none=True, code=200) def get(self): - """ Returns information relating to the API implementation- """ + ''' Returns information relating to the API implementation- ''' base_request_args() result = { @@ -160,7 +160,7 @@ class Info(Resource): @api.expect(base_endpoint_parser, validate=True) @api.marshal_with(json_api_single_response_model, skip_none=True, code=200) def get(self): - """ Returns information relating to the API implementation- """ + ''' Returns information relating to the API implementation- ''' base_request_args() result = { diff --git a/nomad/app/optimade/filterparser.py b/nomad/app/optimade/filterparser.py index c238709292e9ef938c78cd04eaf380769425826a..a95e78100af84b5bd8a457ed596ea40919f5b95a 100644 --- a/nomad/app/optimade/filterparser.py +++ b/nomad/app/optimade/filterparser.py @@ -20,17 +20,17 @@ from nomad.metainfo.optimade import OptimadeEntry class FilterException(Exception): - """ Raised on parsing a filter expression with syntactic of semantic errors. """ + ''' Raised on parsing a filter expression with syntactic of semantic errors. ''' pass quantities: Dict[str, Quantity] = { q.name: Quantity( q.name, es_field='dft.optimade.%s' % q.name, - elastic_mapping_type=q.m_annotations['elastic']['type']) + elastic_mapping_type=q.m_x('search').es_mapping.__class__) for q in OptimadeEntry.m_def.all_quantities.values() - if 'elastic' in q.m_annotations} + if 'search' in q.m_annotations} quantities['elements'].length_quantity = quantities['nelements'] quantities['dimension_types'].length_quantity = quantities['dimension_types'] @@ -43,7 +43,7 @@ _transformer = Transformer(quantities=quantities.values()) def parse_filter(filter_str: str) -> Q: - """ Parses the given optimade filter str and returns a suitable elastic search query. + ''' Parses the given optimade filter str and returns a suitable elastic search query. Arguments: filter_str: Can be direct user input with no prior processing. @@ -51,7 +51,7 @@ def parse_filter(filter_str: str) -> Q: Raises: FilterException: If the given str cannot be parsed, or if there are any semantic errors in the given expression. - """ + ''' try: parse_tree = _parser.parse(filter_str) diff --git a/nomad/app/optimade/models.py b/nomad/app/optimade/models.py index ac0643b283cf5b32381f4e06b03108cbb21f072f..9f423a171279a491b21e7ba57c5df9c3cc499386 100644 --- a/nomad/app/optimade/models.py +++ b/nomad/app/optimade/models.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' All the API flask restplus models. 
-""" +''' from typing import Set from flask_restplus import fields @@ -23,7 +23,7 @@ import math from nomad import config from nomad.app.common import RFC3339DateTime -from nomad.datamodel import CalcWithMetadata +from nomad.datamodel import EntryMetadata from .api import api, base_url, url @@ -235,7 +235,7 @@ json_api_calculation_info_model = api.model('CalculationInfo', { class CalculationDataObject: - def __init__(self, calc: CalcWithMetadata, request_fields: Set[str] = None): + def __init__(self, calc: EntryMetadata, request_fields: Set[str] = None): def include(key): if request_fields is None or (key in request_fields): @@ -243,7 +243,7 @@ class CalculationDataObject: return False - attrs = {key: value for key, value in calc['optimade'].items() if include(key)} + attrs = {key: value for key, value in calc.dft.optimade.m_to_dict().items() if include(key)} self.type = 'calculation' self.id = calc.calc_id diff --git a/nomad/archive.py b/nomad/archive.py index f68c5d85399ac3c8e8b1d97303115cc0aad9ed68..67963f1d0268431be6485241c59fca03bbab3e26 100644 --- a/nomad/archive.py +++ b/nomad/archive.py @@ -33,12 +33,12 @@ class ArchiveError(Exception): class TOCPacker(Packer): - """ + ''' A special msgpack packer that records a TOC while packing. Uses a combination of the pure python msgpack fallback packer and the "real" c-based packing. - """ + ''' def __init__(self, toc_depth: int, *args, **kwargs): self.toc_depth = toc_depth self.toc: Dict[str, Any] = None @@ -403,7 +403,7 @@ class ArchiveReader(ArchiveObject): def write_archive( path_or_file: Union[str, BytesIO], n_entries: int, data: Iterable[Tuple[str, Any]], entry_toc_depth: int = 2) -> None: - """ + ''' Writes a msgpack-based archive file. The file contents will be a valid msgpack-object. The data will contain extra table-of-contents (TOC) objects that map some keys to positions in the file. Data can be partially read from these positions and deserialized @@ -456,14 +456,14 @@ def write_archive( data: The file contents as an iterator of entry id, data tuples. entry_toc_depth: The depth of the table of contents in each entry. Only objects will count for calculating the depth. - """ + ''' with ArchiveWriter(path_or_file, n_entries, entry_toc_depth=entry_toc_depth) as writer: for uuid, entry in data: writer.add(uuid, entry) def read_archive(file_or_path: str, **kwargs) -> ArchiveReader: - """ + ''' Allows to read a msgpack-based archive. Arguments: @@ -475,7 +475,7 @@ def read_archive(file_or_path: str, **kwargs) -> ArchiveReader: A mapping (dict-like) that can be used to access the archive data. The mapping will lazyly load data as it is used. The mapping needs to be closed or used within a 'with' statement to free the underlying file resource after use. - """ + ''' return ArchiveReader(file_or_path, **kwargs) diff --git a/nomad/archive_query.py b/nomad/archive_query.py index 893c39e09975691aef8fb84241bd2a73c7bb8f37..cf1af8b3d510d1e1aa655be215cb732930496720 100644 --- a/nomad/archive_query.py +++ b/nomad/archive_query.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' Contains interfaces to the archive metainfo and query. 
In module ``ArchiveMetainfo``, the data is provided either from raw @@ -32,7 +32,7 @@ and a query schema similar to the archive json format can be provided to filter metainfo = q.query() for c in metainfo.calcs: print(c.section_run.section_single_configuration_calculation[0]({'energy_total':'*'})) -""" +''' import numpy as np import requests @@ -47,11 +47,11 @@ from nomad.cli.client.client import KeycloakAuthenticator class ArchiveMetainfo: - """ + ''' Converts archive data in json format to the new nomad metainfo model Arguments: archive_data: the archive data in json format - """ + ''' def __init__(self, archive_data: List[Dict[str, Any]]): self._archive_data = archive_data self.metainfo = None @@ -107,10 +107,10 @@ class ArchiveMetainfo: @property def calcs(self): - """ + ''' Calculations in metainfo form which can be actively queried by using the get functionality and providing a schema - """ + ''' if not self._calcs: self._init_calcs() for calc_id, calc in self._calcs.items(): @@ -126,9 +126,9 @@ class ArchiveMetainfo: @property def base_metacls(self): - """ + ''' The base metaclass to apply a calculation - """ + ''' if self._base_metacls is None: name = self._prefix self._base_metacls = self._build_meta_cls(self.base_data, name) diff --git a/nomad/cli/__init__.py b/nomad/cli/__init__.py index a04890b79dcae5ef7e0d963514de3712bc05a814..7fb192f3956e326735f05cac81aa2d00584bb1c1 100644 --- a/nomad/cli/__init__.py +++ b/nomad/cli/__init__.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' Command line interface (CLI) for nomad. Provides a group/sub-command structure, think git, that offers various functionality to the command line user. Use it from the command line with ``nomad --help`` or ``python -m nomad.cli --help`` to learn more. -""" +''' from nomad.utils import POPO diff --git a/nomad/cli/admin/admin.py b/nomad/cli/admin/admin.py index c44107b25cbd4420568abb9de8e451c2f721682f..8ee72493e58939b1df9a6f5c0a2f1b8d112b694a 100644 --- a/nomad/cli/admin/admin.py +++ b/nomad/cli/admin/admin.py @@ -158,9 +158,7 @@ def lift_embargo(dry, parallel): uploads_to_repack.append(upload) upload.save() - upload_with_metadata = upload.to_upload_with_metadata() - calcs = upload_with_metadata.calcs - search.index_all(calcs) + search.index_all(upload.entries_metadata()) if not dry: __run_processing(uploads_to_repack, parallel, lambda upload: upload.re_pack(), 're-packing') @@ -182,8 +180,8 @@ def index(threads, dry): for calc in proc.Calc.objects(): eta.add() entry = None - entry = search.Entry.from_calc_with_metadata( - datamodel.CalcWithMetadata(**calc.metadata)) + entry = search.create_entry( + datamodel.EntryMetadata.m_from_dict(calc.metadata)) entry = entry.to_dict(include_meta=True) entry['_op_type'] = 'index' yield entry @@ -335,20 +333,20 @@ AllowEncodedSlashes On def write_prototype_data_file(aflow_prototypes: dict, filepath) -> None: - """Writes the prototype data file in a compressed format to a python + '''Writes the prototype data file in a compressed format to a python module. Args: aflow_prototypes - """ + ''' class NoIndent(object): def __init__(self, value): self.value = value class NoIndentEncoder(json.JSONEncoder): - """A custom JSON encoder that can pretty-print objects wrapped in the + '''A custom JSON encoder that can pretty-print objects wrapped in the NoIndent class. 
- """ + ''' def __init__(self, *args, **kwargs): super(NoIndentEncoder, self).__init__(*args, **kwargs) self.kwargs = dict(kwargs) diff --git a/nomad/cli/admin/migration.py b/nomad/cli/admin/migration.py index 72a4de7d7e6672942170e79fc4b3c01e1d3c25ef..a515b66dd738779b4e49ea0464dd0d15bd5693f3 100644 --- a/nomad/cli/admin/migration.py +++ b/nomad/cli/admin/migration.py @@ -20,7 +20,7 @@ import datetime import json from nomad import utils, processing as proc, search -from nomad.datamodel import CalcWithMetadata +from nomad.datamodel import EntryMetadata from nomad.cli.client.mirror import transform_reference, tarnsform_user_id, transform_dataset @@ -28,14 +28,14 @@ __logger = utils.get_logger(__name__) class SourceCalc(Document): - """ + ''' Mongo document used as a calculation, upload, and metadata db and index build from a given source db. Each :class:`SourceCacl` entry relates a pid, mainfile, upload "id" with each other for a corressponding calculation. It might alos contain the user metadata. The uploads are "id"ed via the specific path segment that identifies an upload on the CoE repo FS(s) without any prefixes (e.g. $EXTRACTED, /data/upload, etc.) - """ + ''' pid = IntField(primary_key=True) mainfile = StringField() upload = StringField() @@ -53,14 +53,14 @@ class SourceCalc(Document): def update_user_metadata(bulk_size: int = 1000, update_index: bool = False, **kwargs): - """ Goes through the whole source index to sync differences between repo user metadata + ''' Goes through the whole source index to sync differences between repo user metadata and metadata in fairdi. It goes through the source index calc by calc, working in bulks. Getting the samedata for fairdi and updating the different calcs in mongo. Will only update user metadata. Uses kwargs as filters for the used source index query. 
- """ + ''' logger = utils.get_logger(__name__) start_time = time.time() @@ -96,7 +96,7 @@ def update_user_metadata(bulk_size: int = 1000, update_index: bool = False, **kw important_changes['missing_calcs'].setdefault(source.upload, []).append(source.pid) continue - target_metadata = CalcWithMetadata(**target.metadata) + target_metadata = EntryMetadata(**target.metadata) source_metadata_normalized: Dict[str, Any] = dict( comment=source.metadata.get('comment'), references={transform_reference(ref) for ref in source.metadata['references']}, diff --git a/nomad/cli/admin/uploads.py b/nomad/cli/admin/uploads.py index 7d7feb6271b03cc32c54aa3bde72c80c457f2ddd..3fe4d3e87f687700831df158d038c1091c3937aa 100644 --- a/nomad/cli/admin/uploads.py +++ b/nomad/cli/admin/uploads.py @@ -144,8 +144,7 @@ def chown(ctx, username, uploads): for upload in uploads: upload.user_id = user.user_id - upload_with_metadata = upload.to_upload_with_metadata() - calcs = upload_with_metadata.calcs + calcs = upload.entries_metadata() def create_update(calc): return UpdateOne( @@ -155,8 +154,7 @@ def chown(ctx, username, uploads): proc.Calc._get_collection().bulk_write([create_update(calc) for calc in calcs]) upload.save() - upload_with_metadata = upload.to_upload_with_metadata() - calcs = upload_with_metadata.calcs + calcs = upload.entries_metadata() search.index_all(calcs, do_refresh=False) search.refresh() @@ -194,8 +192,7 @@ def index(ctx, uploads): i, failed = 0, 0 for upload in uploads: - upload_with_metadata = upload.to_upload_with_metadata() - calcs = upload_with_metadata.calcs + calcs = upload.entries_metadata() failed += search.index_all(calcs) i += 1 diff --git a/nomad/cli/client/client.py b/nomad/cli/client/client.py index 48c7265a76f85e00f749e98fc1c67f5828740a95..d846a42f8fe2f2915fd4b545d13abba949bd4836 100644 --- a/nomad/cli/client/client.py +++ b/nomad/cli/client/client.py @@ -65,7 +65,7 @@ def __create_client( user: str = nomad_config.client.user, password: str = nomad_config.client.password, ssl_verify: bool = True, use_token: bool = True): - """ A factory method to create the client. """ + ''' A factory method to create the client. ''' if not ssl_verify: import warnings warnings.filterwarnings("ignore") diff --git a/nomad/cli/client/integrationtests.py b/nomad/cli/client/integrationtests.py index 1baca89ed3296311c9a350aa9e7dbe2eafc0d0ee..f22dde7a6f5e2af9361c4353c7854a1da7a6f469 100644 --- a/nomad/cli/client/integrationtests.py +++ b/nomad/cli/client/integrationtests.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' A command that runs some example operations on a working nomad@FAIRDI installation as a final integration test. -""" +''' import time import os diff --git a/nomad/cli/client/local.py b/nomad/cli/client/local.py index e6001c09ca63810fbf469a50a43f1ec0800ee960..427210522eda2c04944f91dc708eb55ff59a8dbc 100644 --- a/nomad/cli/client/local.py +++ b/nomad/cli/client/local.py @@ -24,7 +24,7 @@ import bravado.exception from nomad import config, utils from nomad.files import ArchiveBasedStagingUploadFiles -from nomad.datamodel import CalcWithMetadata +from nomad.datamodel import EntryMetadata from nomad.parsing import LocalBackend from nomad.cli.parse import parse, normalize, normalize_all @@ -32,7 +32,7 @@ from .client import client class CalcProcReproduction: - """ + ''' Instances represent a local reproduction of the processing for a single calculation. 
It allows to download raw data from a nomad server and reproduce its processing (parsing, normalizing) with the locally installed parsers and normalizers. @@ -44,7 +44,7 @@ class CalcProcReproduction: Arguments: calc_id: The calc_id of the calculation to locally process. override: Set to true to override any existing local calculation data. - """ + ''' def __init__(self, archive_id: str, override: bool = False, mainfile: str = None) -> None: if '/' in archive_id: self.calc_id = utils.archive.calc_id(archive_id) @@ -125,25 +125,25 @@ class CalcProcReproduction: self.upload_files.delete() def parse(self, parser_name: str = None, **kwargs) -> LocalBackend: - """ + ''' Run the given parser on the downloaded calculation. If no parser is given, do parser matching and use the respective parser. - """ + ''' return parse(self.mainfile, self.upload_files, parser_name=parser_name, logger=self.logger, **kwargs) def normalize(self, normalizer: Union[str, Callable], parser_backend: LocalBackend = None): - """ + ''' Parse the downloaded calculation and run the given normalizer. - """ + ''' if parser_backend is None: parser_backend = self.parse() return normalize(parser_backend=parser_backend, normalizer=normalizer, logger=self.logger) def normalize_all(self, parser_backend: LocalBackend = None): - """ + ''' Parse the downloaded calculation and run the whole normalizer chain. - """ + ''' return normalize_all(parser_backend=parser_backend, logger=self.logger) @@ -173,6 +173,6 @@ def local(calc_id, show_backend, show_metadata, skip_normalizers, not_strict, ** backend.write_json(sys.stdout, pretty=True) if show_metadata: - metadata = CalcWithMetadata(domain=local.parser.domain) + metadata = EntryMetadata(domain=local.parser.domain) metadata.apply_domain_metadata(backend) - ujson.dump(metadata.to_dict(), sys.stdout, indent=4) + ujson.dump(metadata.m_to_dict(), sys.stdout, indent=4) diff --git a/nomad/cli/client/mirror.py b/nomad/cli/client/mirror.py index 85b388d65a5d8488949948524a522ee79b339ed8..d3f07fe9ba435458f9574905549a680925491328 100644 --- a/nomad/cli/client/mirror.py +++ b/nomad/cli/client/mirror.py @@ -30,7 +30,7 @@ from .client import client __in_test = False -""" Will be monkeypatched by tests to alter behavior for testing. """ +''' Will be monkeypatched by tests to alter behavior for testing. ''' _Dataset = Dataset.m_def.m_x('me').me_cls __logger = utils.get_logger(__name__) @@ -82,7 +82,7 @@ def transform_reference(reference): def v0Dot6(upload_data): - """ Inplace transforms v0.6.x upload data into v0.7.x upload data. """ + ''' Inplace transforms v0.6.x upload data into v0.7.x upload data. ''' upload = json.loads(upload_data.upload) upload['user_id'] = tarnsform_user_id(upload['user_id']) upload_data.upload = json.dumps(upload) @@ -318,7 +318,7 @@ def mirror( proc.Calc._get_collection().insert(upload_data.calcs) # index es - search.index_all(upload.to_upload_with_metadata().calcs) + search.index_all(upload.entries_metadata()) print( 'Mirrored %s with %d calcs at %s' % diff --git a/nomad/cli/client/statistics.py b/nomad/cli/client/statistics.py index 58339bc2f28e9dccb58541d67f343a513e85a701..1878c5f2a474995c48d4d1d6c1e65f80c0830a7d 100644 --- a/nomad/cli/client/statistics.py +++ b/nomad/cli/client/statistics.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' A command that generates various statistics. 
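A usage sketch for the CalcProcReproduction class above, put together from its constructor and methods in this hunk; the archive id is a placeholder and a reachable nomad installation is assumed, since the class downloads the calculation's raw data.

import sys
from nomad.cli.client.local import CalcProcReproduction

# '/' separated ids are split into upload and calc id by the constructor
repro = CalcProcReproduction('some-upload-id/some-calc-id')
backend = repro.parse()                  # parser matching plus parsing
backend = repro.normalize_all(backend)   # run the whole normalizer chain
backend.write_json(sys.stdout, pretty=True)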
-""" +''' from matplotlib import scale as mscale from matplotlib import transforms as mtransforms diff --git a/nomad/cli/client/update_database.py b/nomad/cli/client/update_database.py index 703796fc2ade509c657a10216bc09d440614d0c6..4d1f44bdfc58db3934b1de8e2b06fb30c086f7da 100644 --- a/nomad/cli/client/update_database.py +++ b/nomad/cli/client/update_database.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' Automatically synchronizes nomad it with a given database. It creates a list of paths to mainfiles in nomad and compares it with paths in the external database. The missing paths in nomad will then be downloaded from the external database and subsequently uploaded to nomad. The downloaded files are by default saved in '/nomad/fairdi/external'. -""" +''' import requests import re diff --git a/nomad/cli/client/upload.py b/nomad/cli/client/upload.py index fd1ac0848de191fae226cd660f89d5eb28c1a933..a37fe80853a99a8d08be13f710a46bf3084ea080 100644 --- a/nomad/cli/client/upload.py +++ b/nomad/cli/client/upload.py @@ -41,7 +41,7 @@ def stream_upload_with_client(client, stream, name=None): def upload_file(file_path: str, name: str = None, offline: bool = False, publish: bool = False, client=None): - """ + ''' Upload a file to nomad. Arguments: @@ -51,7 +51,7 @@ def upload_file(file_path: str, name: str = None, offline: bool = False, publish publish: automatically publish after successful processing Returns: The upload_id - """ + ''' if client is None: from nomad.cli.client import create_client client = create_client() diff --git a/nomad/cli/parse.py b/nomad/cli/parse.py index f1faaa30cb83c050c02f516b75caacd2926ee43f..be462f842cd0554af68ea08b486d72a5618a14cf 100644 --- a/nomad/cli/parse.py +++ b/nomad/cli/parse.py @@ -8,7 +8,7 @@ from nomad import config, utils, files from nomad.parsing import LocalBackend, parser_dict, match_parser, MatchingParser, MetainfoBackend from nomad.metainfo.legacy import LegacyMetainfoEnvironment from nomad.normalizing import normalizers -from nomad.datamodel import CalcWithMetadata +from nomad.datamodel import EntryMetadata from nomadcore import simple_parser @@ -20,10 +20,10 @@ def parse( parser_name: str = None, backend_factory: Callable = None, strict: bool = True, logger=None) -> LocalBackend: - """ + ''' Run the given parser on the downloaded calculation. If no parser is given, do parser matching and use the respective parser. - """ + ''' if logger is None: logger = utils.get_logger(__name__) if parser_name is not None: @@ -87,9 +87,9 @@ def normalize( def normalize_all(parser_backend: LocalBackend = None, logger=None) -> LocalBackend: - """ + ''' Parse the downloaded calculation and run the whole normalizer chain. 
- """ + ''' for normalizer in normalizers: parser_backend = normalize(normalizer, parser_backend=parser_backend, logger=logger) @@ -129,6 +129,6 @@ def _parse( if show_backend: backend.write_json(sys.stdout, pretty=True) if show_metadata: - metadata = CalcWithMetadata(domain='dft') # TODO take domain from matched parser + metadata = EntryMetadata(domain='dft') # TODO take domain from matched parser metadata.apply_domain_metadata(backend) - json.dump(metadata.to_dict(), sys.stdout, indent=4) + json.dump(metadata.m_to_dict(), sys.stdout, indent=4) diff --git a/nomad/config.py b/nomad/config.py index 06bcb228eeb28059950f5cbec69193d491dcb990..e195cc62533207246b56cf19864386983df28c6b 100644 --- a/nomad/config.py +++ b/nomad/config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' This module describes all configurable parameters for the nomad python code. The configuration is used for all executed python code including API, worker, CLI, and other scripts. To use the configuration in your own scripts or new modules, simply import @@ -30,7 +30,7 @@ over defaults. .. autoclass:: nomad.config.NomadConfig .. autofunction:: nomad.config.apply .. autofunction:: nomad.config.load_config -""" +''' import logging import os @@ -46,10 +46,10 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed") class NomadConfig(dict): - """ + ''' A class for configuration categories. It is a dict subclass that uses attributes as key/value pairs. - """ + ''' def __init__(self, **kwargs): super().__init__(**kwargs) @@ -246,11 +246,11 @@ logger = logging.getLogger(__name__) def apply(key, value) -> None: - """ + ''' Changes the config according to given key and value. The keys are interpreted as paths to config values with ``_`` as a separator. E.g. ``fs_staging`` leading to ``config.fs.staging`` - """ + ''' path = list(reversed(key.split('_'))) child_segment = None current_value = None @@ -299,13 +299,13 @@ def apply(key, value) -> None: def load_config(config_file: str = os.environ.get('NOMAD_CONFIG', 'nomad.yaml')) -> None: - """ + ''' Loads the configuration from the ``config_file`` and environment. Arguments: config_file: Override the configfile, default is file stored in env variable NOMAD_CONFIG or ``nomad.yaml``. - """ + ''' # load yaml and override defaults (only when not in test) if os.path.exists(config_file): with open(config_file, 'r') as stream: diff --git a/nomad/datamodel/__init__.py b/nomad/datamodel/__init__.py index ef643602377f3bd3222377cb7a78e2762c411873..c4f152be38c377a9defbf687b879f9b393e96b16 100644 --- a/nomad/datamodel/__init__.py +++ b/nomad/datamodel/__init__.py @@ -12,39 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' This module contains classes that allow to represent the core -nomad data entities :class:`Upload` and :class:`Calc` on a high level of abstraction +nomad data entities (entries/calculations, users, datasets) on a high level of abstraction independent from their representation in the different modules :py:mod:`nomad.processing`, :py:mod:`nomad.coe_repo`, :py:mod:`nomad.parsing`, :py:mod:`nomad.search`, :py:mod:`nomad.app`, :py:mod:`nomad.migration`. It is not about representing every detail, but those parts that are directly involved in -api, processing, migration, mirroring, or other 'infrastructure' operations. - -Transformations between different implementations of the same entity can be build -and used. 
To ease the number of necessary transformations the classes -:class:`UploadWithMetadata` and :class:`CalcWithMetadata` can act as intermediate -representations. Therefore, implement only transformation from and to these -classes. These are the implemented transformations: - -.. image:: datamodel_transformations.png - -.. autoclass:: nomad.datamodel.UploadWithMetadata - :members: -.. autoclass:: nomad.datamodel.CalcWithMetadata - :members: - -The class :class:`CalcWithMetadata` only defines non domain specific metadata quantities -about ids, user metadata, etc. To define domain specific quantities :class:`CalcWithMetadata` -must be subclassed. The classes -:class:`Domain` and :class:`DomainQuantity` can be used to further define domain specific -quantities. - -.. autoclass:: nomad.datamodel.Domain - :members: -.. autoclass:: nomad.datamodel.DomainQuantity - :members: +api, processing, mirroring, or other 'infrastructure' operations. The class :class:`User` is used to represent users and their attributes. @@ -55,12 +31,33 @@ The class :class:`Dataset` is used to represent datasets and their attributes. .. autoclass:: nomad.datamodel.Dataset :members: -""" -import sys +The class :class:`UserMetadata` is used to represent user determined entry metadata. + +.. autoclass:: nomad.datamodel.UserMetadata + :members: + +The class :class:`EntryMetadata` is used to represent all metadata about an entry. -from nomad.datamodel.base import UploadWithMetadata, CalcWithMetadata, Domain, DomainQuantity -from nomad.datamodel import ems, dft -from nomad.datamodel.dft import DFTCalcWithMetadata -from nomad.datamodel.ems import EMSEntryWithMetadata -from nomad.datamodel.metainfo import Dataset, User, UserMetadata +.. autoclass:: nomad.datamodel.EntryMetadata + :members: +''' + +from .dft import DFTMetadata +from .ems import EMSMetadata +from .metainfo import Dataset, User, EditableUserMetadata, UserMetadata, EntryMetadata + +domains = { + 'dft': { + 'metadata': DFTMetadata, + 'metainfo_all_package': 'all.nomadmetainfo.json', + 'root_section': 'section_run' + }, + 'ems': { + 'metadata': EMSMetadata, + 'metainfo_all_package': 'all.experimental.nomadmetainfo.json', + 'root_section': 'section_experiment' + } +} + +root_sections = [domain['root_section'] for domain in domains.values()] + ['section_entry_info'] diff --git a/nomad/datamodel/base.py b/nomad/datamodel/base.py index 5dd084ebc92f3825b81e5e9390aca271323d08a4..99705430c081d9468eec637d3538da74a3e1a34e 100644 --- a/nomad/datamodel/base.py +++ b/nomad/datamodel/base.py @@ -12,508 +12,335 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Iterable, List, Dict, Type, Tuple, Callable, Any -import datetime -from elasticsearch_dsl import Keyword, Integer -from collections.abc import Mapping import numpy as np -import ase.data from nomad import config -from .metainfo import Dataset, User - - -class UploadWithMetadata(): - """ - See :class:`CalcWithMetadata`. - """ - - def __init__(self, **kwargs): - self.upload_id: str = None - self.uploader: str = None - self.upload_time: datetime.datetime = None - - self.calcs: Iterable['CalcWithMetadata'] = list() - - for key, value in kwargs.items(): - setattr(self, key, value) - - @property - def calcs_dict(self) -> Dict[str, 'CalcWithMetadata']: - return {calc.calc_id: calc for calc in self.calcs} - - -class CalcWithMetadata(Mapping): - """ - A dict/POPO class that can be used for mapping calc representations with calc metadata. 
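The new module-level domains registry added to nomad/datamodel/__init__.py above replaces the Domain class machinery that the rest of this hunk removes; a short sketch of how the registry can be consulted.

from nomad.datamodel import domains, root_sections

dft = domains['dft']
print(dft['root_section'])            # 'section_run'
print(dft['metainfo_all_package'])    # 'all.nomadmetainfo.json'
DomainMetadata = dft['metadata']      # the DFTMetadata section class
print(root_sections)                  # all domain root sections plus 'section_entry_info'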
- We have multi representations of calcs and their calc metadata. To avoid implement - mappings between all combinations, just implement mappings with the class and use - mapping transitivity. E.g. instead of A -> B, A -> this -> B. - - This is basically an abstract class and it has to be subclassed for each :class:`Domain`. - Subclasses can define additional attributes and have to implement :func:`apply_domain_metadata` - to fill these attributes from processed entries, i.e. instance of :class:`nomad.parsing.LocalBackend`. - - Attributes: - domain: Must be the key for a registered domain. This determines which actual - subclass is instantiated. - upload_id: The ``upload_id`` of the calculations upload (random UUID). - calc_id: The unique mainfile based calculation id. - calc_hash: The raw file content based checksum/hash of this calculation. - pid: The unique persistent id of this calculation. - mainfile: The upload relative mainfile path. - - files: A list of all files, relative to upload. - upload_time: The time when the calc was uploaded. - uploader: An object describing the uploading user, has at least ``user_id`` - processed: Boolean indicating if this calc was successfully processed and archive - data and calc metadata is available. - last_processing: A datatime with the time of the last successful processing. - nomad_version: A string that describes the version of the nomad software that was - used to do the last successful processing. - - with_embargo: Show if user set an embargo on the calculation. - coauthors: List of coauther user objects with at ``user_id``. - shared_with: List of users this calcs ownership is shared with, objects with at ``user_id``. - comment: String comment. - references: Objects describing user provided references, keys are ``id`` and ``value``. - datasets: A list of dataset ids. The corresponding :class:`Dataset`s must exist. - """ - - def __new__(cls, domain: str = None, **kwargs): - if domain is not None: - domain_obj = Domain.instances.get(domain) - assert domain_obj is not None - return super().__new__(domain_obj.domain_entry_class) - else: - return super().__new__(cls) - - def __init__(self, domain: str = None, **kwargs): - self.domain = domain - - # id relevant metadata - self.upload_id: str = None - self.calc_id: str = None - self.calc_hash: str = None - self.mainfile: str = None - self.pid: int = None - self.raw_id: str = None - - # basic upload and processing related metadata - self.upload_time: datetime.datetime = None - self.upload_name: str = None - self.files: List[str] = None - self.uploader: str = None - self.processed: bool = False - self.last_processing: datetime.datetime = None - self.nomad_version: str = None - self.nomad_commit: str = None - - # user metadata, i.e. 
quantities given and editable by the user - self.with_embargo: bool = None - self.published: bool = False - self.coauthors: List[str] = [] - self.shared_with: List[str] = [] - self.comment: str = None - self.references: List[str] = [] - self.datasets: List[str] = [] - self.external_id: str = None - self.last_edit: datetime.datetime = None - - # parser related general (not domain specific) metadata - self.parser_name = None - - # domain generic metadata - self.formula: str = None - self.atoms: List[str] = [] - self.n_atoms: int = 0 - - self.update(**kwargs) - - def __getitem__(self, key): - value = getattr(self, key, None) - - if value is None or key in ['backend']: - raise KeyError() - - return value - - def __iter__(self): - for key, value in self.__dict__.items(): - if value is None or key in ['backend']: - continue - - yield key - - def __len__(self): - count = 0 - for key, value in self.__dict__.items(): - if value is None or key in ['backend']: - continue - count += 1 - - return count - - def to_dict(self): - return {key: value for key, value in self.items()} - - def __str__(self): - return str(self.to_dict()) - - def update(self, **kwargs): - for key, value in kwargs.items(): - if value is None: - continue - - setattr(self, key, value) - - def apply_user_metadata(self, metadata: dict): - """ - Applies a user provided metadata dict to this calc. - """ - self.pid = metadata.get('_pid', self.pid) - self.comment = metadata.get('comment', self.comment) - self.upload_time = metadata.get('_upload_time', self.upload_time) - uploader_id = metadata.get('_uploader') - if uploader_id is not None: - self.uploader = uploader_id - self.references = metadata.get('references', []) - self.with_embargo = metadata.get('with_embargo', self.with_embargo) - self.coauthors = [ - user_id for user_id in metadata.get('coauthors', self.coauthors) - if User.get(user_id=user_id) is not None] - self.shared_with = [ - user_id for user_id in metadata.get('shared_with', self.shared_with) - if User.get(user_id=user_id) is not None] - self.datasets = [ - dataset_id for dataset_id in metadata.get('datasets', self.datasets) - if Dataset.m_def.m_x('me').get(dataset_id=dataset_id) is not None] - self.external_id = metadata.get('external_id') - - def apply_domain_metadata(self, backend): - raise NotImplementedError() - - -class DomainQuantity: - """ - This class can be used to define further details about a domain specific metadata - quantity. - - Attributes: - name: The name of the quantity, also the key used to store values in - :class:`CalcWithMetadata` - description: A human friendly description. The description is used to define - the swagger documentation on the relevant API endpoints. - multi: Indicates a list of values. This is important for the elastic mapping. - order_default: Indicates that this metric should be used for the default order of - search results. - aggregations: Indicates that search aggregations (and how many) should be provided. - 0 (the default) means no aggregations. - metric: Indicates that this quantity should be used as search metric. Values need - to be tuples with metric name and elastic aggregation (e.g. sum, cardinality) - elastic_mapping: An optional elasticsearch_dsl mapping. Default is ``Keyword``. - elastic_search_type: An optional elasticsearch search type. Default is ``term``. - elastic_field: An optional elasticsearch key. Default is the name of the quantity. 
- elastic_value: A collable that takes a :class:`CalcWithMetadata` as input and produces the - value for the elastic search index. - argparse_action: Action to use on argparse, either append or split for multi values. Append is default. - """ - - def __init__( - self, description: str = None, multi: bool = False, aggregations: int = 0, - order_default: bool = False, metric: Tuple[str, str] = None, - metadata_field: str = None, elastic_mapping: type = None, - elastic_search_type: str = 'term', elastic_field: str = None, - elastic_value: Callable[[Any], Any] = None, - argparse_action: str = 'append'): - - self.domain: str = None - self._name: str = None - self.description = description - self.multi = multi - self.order_default = order_default - self.aggregations = aggregations - self.metric = metric - self.elastic_mapping = elastic_mapping - self.elastic_search_type = elastic_search_type - self.metadata_field = metadata_field - self.elastic_field = elastic_field - self.argparse_action = argparse_action - - self.elastic_value = elastic_value - if self.elastic_value is None: - self.elastic_value = lambda o: o - - if self.elastic_mapping is None: - self.elastic_mapping = Keyword(multi=self.multi) - - @property - def name(self) -> str: - return self._name - - @name.setter - def name(self, name: str) -> None: - self._name = name - if self.metadata_field is None: - self.metadata_field = name - if self.elastic_field is None: - self.elastic_field = self.name - - @property - def qualified_elastic_field(self) -> str: - if self.domain is None: - return self.elastic_field - else: - return '%s.%s' % (self.domain, self.elastic_field) - - @property - def qualified_name(self) -> str: - if self.domain is None: - return self.name - else: - return '%s.%s' % (self.domain, self.name) - - -def only_atoms(atoms): - numbers = [ase.data.atomic_numbers[atom] for atom in atoms] - only_atoms = [ase.data.chemical_symbols[number] for number in sorted(numbers)] - return ''.join(only_atoms) - - -class Domain: - """ - A domain defines all metadata quantities that are specific to a certain scientific - domain, e.g. DFT calculations, or experimental material science. - - Each domain needs to define a subclass of :class:`CalcWithMetadata`. This - class has to define the necessary domain specific metadata quantities and how these - are filled from parser results (usually an instance of :class:LocalBackend). - - Furthermore, the class method :func:`register_domain` of this ``Domain`` class has - to be used to register a domain with ``domain_nam``. This also allows to provide - further descriptions on each domain specific quantity via instance of :class:`DomainQuantity`. - - While there can be multiple domains registered. Currently, only one domain can be - active. This active domain is define in the configuration using the ``domain_name``. - - Arguments: - name: A name for the domain. This is used as key in the configuration ``config.domain``. - domain_entry_class: A subclass of :class:`CalcWithMetadata` that adds the - domain specific quantities. - quantities: Additional specifications for the quantities in ``domain_entry_class`` as - instances of :class:`DomainQuantity`. - metrics: Tuples of elastic field name and elastic aggregation operation that - can be used to create statistic values. - groups: Tuple of quantity name and metric that describes quantities that - can be used to group entries by quantity values. - root_sections: The name of the possible root sections for this domain. 
- metainfo_all_package: The name of the full metainfo package for this domain. - """ - instances: Dict[str, 'Domain'] = {} - - base_quantities = dict( - authors=DomainQuantity( - elastic_field='authors.name.keyword', multi=True, aggregations=1000, - description=( - 'Search for the given author. Exact keyword matches in the form "Lastname, ' - 'Firstname".')), - uploader_id=DomainQuantity( - elastic_field='uploader.user_id', multi=False, aggregations=5, - description=('Search for the given uploader id.')), - uploader_name=DomainQuantity( - elastic_field='uploader.name.keyword', multi=False, - description=('Search for the exact uploader\'s full name')), - comment=DomainQuantity( - elastic_search_type='match', multi=True, - description='Search within the comments. This is a text search ala google.'), - paths=DomainQuantity( - elastic_search_type='match', elastic_field='files', multi=True, - description='Search for elements in one of the file paths. The paths are split at all "/".'), - files=DomainQuantity( - elastic_field='files.keyword', multi=True, - description='Search for exact file name with full path.'), - quantities=DomainQuantity( - multi=True, - description='Search for the existence of a certain meta-info quantity'), - upload_id=DomainQuantity( - description='Search for the upload_id.', - multi=True, argparse_action='split', elastic_search_type='terms'), - upload_time=DomainQuantity( - description='Search for the exact upload time.', elastic_search_type='terms'), - upload_name=DomainQuantity( - description='Search for the upload_name.', - multi=True, argparse_action='split', elastic_search_type='terms'), - calc_id=DomainQuantity( - description='Search for the calc_id.', - multi=True, argparse_action='split', elastic_search_type='terms'), - pid=DomainQuantity( - description='Search for the pid.', - multi=True, argparse_action='split', elastic_search_type='terms'), - raw_id=DomainQuantity( - description='Search for the raw_id.', - multi=True, argparse_action='split', elastic_search_type='terms'), - mainfile=DomainQuantity( - description='Search for the mainfile.', - multi=True, argparse_action='append', elastic_search_type='terms'), - external_id=DomainQuantity( - description='External user provided id. Does not have to be unique necessarily.', - multi=True, argparse_action='split', elastic_search_type='terms'), - calc_hash=DomainQuantity( - description='Search for the entries hash.', - multi=True, argparse_action='split', elastic_search_type='terms'), - dataset=DomainQuantity( - elastic_field='datasets.name', multi=True, elastic_search_type='match', - description='Search for a particular dataset by name.'), - dataset_id=DomainQuantity( - elastic_field='datasets.id', multi=True, - description='Search for a particular dataset by its id.'), - doi=DomainQuantity( - elastic_field='datasets.doi', multi=True, - description='Search for a particular dataset by doi (incl. http://dx.doi.org).'), - formula=DomainQuantity( - 'The chemical (hill) formula of the simulated system.', - order_default=True), - atoms=DomainQuantity( - 'The atom labels of all atoms in the simulated system.', - aggregations=len(ase.data.chemical_symbols), multi=True), - only_atoms=DomainQuantity( - 'The atom labels concatenated in species-number order. 
Used with keyword search ' - 'to facilitate exclusive searches.', - elastic_value=only_atoms, metadata_field='atoms', multi=True), - n_atoms=DomainQuantity( - 'Number of atoms in the simulated system', - elastic_mapping=Integer())) - - base_metrics = dict( - datasets=('dataset_id', 'cardinality'), - uploads=('upload_id', 'cardinality'), - uploaders=('uploader_name', 'cardinality'), - authors=('authors', 'cardinality'), - unique_entries=('calc_hash', 'cardinality')) - - base_groups = dict( - datasets=('dataset_id', 'datasets'), - uploads=('upload_id', 'uploads')) - - @classmethod - def get_quantity(cls, name_spec) -> DomainQuantity: - """ - Returns the quantity definition for the given quantity name. The name can be the - qualified name (``domain.quantity``) or in Django-style (``domain__quantity``). - """ - qualified_name = name_spec.replace('__', '.') - split_name = qualified_name.split('.') - if len(split_name) == 1: - return cls.base_quantities[split_name[0]] - elif len(split_name) == 2: - return cls.instances[split_name[0]].quantities[split_name[1]] - else: - assert False, 'qualified quantity name depth must be 2 max' - - @classmethod - def all_quantities(cls) -> Iterable[DomainQuantity]: - return set([quantity for domain in cls.instances.values() for quantity in domain.quantities.values()]) - - def __init__( - self, name: str, domain_entry_class: Type[CalcWithMetadata], - quantities: Dict[str, DomainQuantity], - metrics: Dict[str, Tuple[str, str]], - groups: Dict[str, Tuple[str, str]], - default_statistics: List[str], - root_sections=['section_run', 'section_entry_info'], - metainfo_all_package='all.nomadmetainfo.json') -> None: - - domain_quantities = quantities - - Domain.instances[name] = self - - self.name = name - self.domain_entry_class = domain_entry_class - self.domain_quantities: Dict[str, DomainQuantity] = {} - self.root_sections = root_sections - self.metainfo_all_package = metainfo_all_package - self.default_statistics = default_statistics - - reference_domain_calc = CalcWithMetadata(domain=name) - reference_general_calc = CalcWithMetadata(domain=None) - - # add non specified quantities from additional metadata class fields - for quantity_name in reference_domain_calc.__dict__.keys(): - if not hasattr(reference_general_calc, quantity_name): - quantity = domain_quantities.get(quantity_name, None) - if quantity is None: - domain_quantities[quantity_name] = DomainQuantity() - - # ensure domain quantity names and domains - for quantity_name, quantity in domain_quantities.items(): - quantity.domain = name - quantity.name = quantity_name - - # add domain prefix to domain metrics and groups - domain_metrics = { - '%s.%s' % (name, key): (quantities[quantity].qualified_elastic_field, es_op) - for key, (quantity, es_op) in metrics.items()} - domain_groups = { - '%s.%s' % (name, key): (quantities[quantity].qualified_name, '%s.%s' % (name, metric)) - for key, (quantity, metric) in groups.items()} - - # add all domain quantities - for quantity_name, quantity in domain_quantities.items(): - self.domain_quantities[quantity.name] = quantity - - # update the multi status from an example value - if quantity.metadata_field in reference_domain_calc.__dict__: - quantity.multi = isinstance( - reference_domain_calc.__dict__[quantity.metadata_field], list) - - assert not hasattr(reference_general_calc, quantity_name), \ - 'quantity overrides general non domain quantity: %s' % quantity_name - - # construct search quantities from base and domain quantities - self.quantities = 
dict(**Domain.base_quantities) - for quantity_name, quantity in self.quantities.items(): - quantity.name = quantity_name - self.quantities.update(self.domain_quantities) - - assert any(quantity.order_default for quantity in Domain.instances[name].quantities.values()), \ - 'you need to define a order default quantity' - - # construct metrics from base and domain metrics - self.metrics = dict(**Domain.base_metrics) - self.metrics.update(**domain_metrics) - self.groups = dict(**Domain.base_groups) - self.groups.update(**domain_groups) - - @property - def metrics_names(self) -> Iterable[str]: - """ Just the names of all metrics. """ - return list(self.metrics.keys()) - - @property - def aggregations(self) -> Dict[str, int]: - """ - The search aggregations and the default maximum number of calculated buckets. See also - :func:`nomad.search.aggregations`. - """ - return { - quantity.name: quantity.aggregations - for quantity in self.quantities.values() - if quantity.aggregations > 0 - } - - @property - def aggregations_names(self) -> Iterable[str]: - """ Just the names of all metrics. """ - return list(self.aggregations.keys()) - - @property - def order_default_quantity(self) -> str: - for quantity in self.quantities.values(): - if quantity.order_default: - return quantity.qualified_name - - assert False, 'each domain must defina an order_default quantity' +# from .metainfo import Dataset, User, EntryMetadata + + +# class DomainQuantity: +# ''' +# This class can be used to define further details about a domain specific metadata +# quantity. + +# Attributes: +# name: The name of the quantity, also the key used to store values in +# :class:`EntryMetadata` +# description: A human friendly description. The description is used to define +# the swagger documentation on the relevant API endpoints. +# multi: Indicates a list of values. This is important for the elastic mapping. +# order_default: Indicates that this metric should be used for the default order of +# search results. +# aggregations: Indicates that search aggregations (and how many) should be provided. +# 0 (the default) means no aggregations. +# metric: Indicates that this quantity should be used as search metric. Values need +# to be tuples with metric name and elastic aggregation (e.g. sum, cardinality) +# elastic_mapping: An optional elasticsearch_dsl mapping. Default is ``Keyword``. +# elastic_search_type: An optional elasticsearch search type. Default is ``term``. +# elastic_field: An optional elasticsearch key. Default is the name of the quantity. +# elastic_value: A collable that takes a :class:`EntryMetadata` as input and produces the +# value for the elastic search index. +# argparse_action: Action to use on argparse, either append or split for multi values. Append is default. 
+# ''' + +# def __init__( +# self, description: str = None, multi: bool = False, aggregations: int = 0, +# order_default: bool = False, metric: Tuple[str, str] = None, +# metadata_field: str = None, elastic_mapping: type = None, +# elastic_search_type: str = 'term', elastic_field: str = None, +# elastic_value: Callable[[Any], Any] = None, +# argparse_action: str = 'append'): + +# self.domain: str = None +# self._name: str = None +# self.description = description +# self.multi = multi +# self.order_default = order_default +# self.aggregations = aggregations +# self.metric = metric +# self.elastic_mapping = elastic_mapping +# self.elastic_search_type = elastic_search_type +# self.metadata_field = metadata_field +# self.elastic_field = elastic_field +# self.argparse_action = argparse_action + +# self.elastic_value = elastic_value +# if self.elastic_value is None: +# self.elastic_value = lambda o: o + +# if self.elastic_mapping is None: +# self.elastic_mapping = Keyword(multi=self.multi) + +# @property +# def name(self) -> str: +# return self._name + +# @name.setter +# def name(self, name: str) -> None: +# self._name = name +# if self.metadata_field is None: +# self.metadata_field = name +# if self.elastic_field is None: +# self.elastic_field = self.name + +# @property +# def qualified_elastic_field(self) -> str: +# if self.domain is None: +# return self.elastic_field +# else: +# return '%s.%s' % (self.domain, self.elastic_field) + +# @property +# def qualified_name(self) -> str: +# if self.domain is None: +# return self.name +# else: +# return '%s.%s' % (self.domain, self.name) + + +# def only_atoms(atoms): +# numbers = [ase.data.atomic_numbers[atom] for atom in atoms] +# only_atoms = [ase.data.chemical_symbols[number] for number in sorted(numbers)] +# return ''.join(only_atoms) + + +# class Domain: +# ''' +# A domain defines all metadata quantities that are specific to a certain scientific +# domain, e.g. DFT calculations, or experimental material science. + +# Each domain needs to define a subclass of :class:`EntryMetadata`. This +# class has to define the necessary domain specific metadata quantities and how these +# are filled from parser results (usually an instance of :class:LocalBackend). + +# Furthermore, the class method :func:`register_domain` of this ``Domain`` class has +# to be used to register a domain with ``domain_nam``. This also allows to provide +# further descriptions on each domain specific quantity via instance of :class:`DomainQuantity`. + +# While there can be multiple domains registered. Currently, only one domain can be +# active. This active domain is define in the configuration using the ``domain_name``. + +# Arguments: +# name: A name for the domain. This is used as key in the configuration ``config.domain``. +# domain_entry_class: A subclass of :class:`EntryMetadata` that adds the +# domain specific quantities. +# quantities: Additional specifications for the quantities in ``domain_entry_class`` as +# instances of :class:`DomainQuantity`. +# metrics: Tuples of elastic field name and elastic aggregation operation that +# can be used to create statistic values. +# groups: Tuple of quantity name and metric that describes quantities that +# can be used to group entries by quantity values. +# root_sections: The name of the possible root sections for this domain. +# metainfo_all_package: The name of the full metainfo package for this domain. 
+# ''' +# instances: Dict[str, 'Domain'] = {} + +# base_quantities = dict( +# authors=DomainQuantity( +# elastic_field='authors.name.keyword', multi=True, aggregations=1000, +# description=( +# 'Search for the given author. Exact keyword matches in the form "Lastname, ' +# 'Firstname".')), +# uploader_id=DomainQuantity( +# elastic_field='uploader.user_id', multi=False, aggregations=5, +# description=('Search for the given uploader id.')), +# uploader_name=DomainQuantity( +# elastic_field='uploader.name.keyword', multi=False, +# description=('Search for the exact uploader\'s full name')), +# comment=DomainQuantity( +# elastic_search_type='match', multi=True, +# description='Search within the comments. This is a text search ala google.'), +# paths=DomainQuantity( +# elastic_search_type='match', elastic_field='files', multi=True, +# description='Search for elements in one of the file paths. The paths are split at all "/".'), +# files=DomainQuantity( +# elastic_field='files.keyword', multi=True, +# description='Search for exact file name with full path.'), +# quantities=DomainQuantity( +# multi=True, +# description='Search for the existence of a certain meta-info quantity'), +# upload_id=DomainQuantity( +# description='Search for the upload_id.', +# multi=True, argparse_action='split', elastic_search_type='terms'), +# upload_time=DomainQuantity( +# description='Search for the exact upload time.', elastic_search_type='terms'), +# upload_name=DomainQuantity( +# description='Search for the upload_name.', +# multi=True, argparse_action='split', elastic_search_type='terms'), +# calc_id=DomainQuantity( +# description='Search for the calc_id.', +# multi=True, argparse_action='split', elastic_search_type='terms'), +# pid=DomainQuantity( +# description='Search for the pid.', +# multi=True, argparse_action='split', elastic_search_type='terms'), +# raw_id=DomainQuantity( +# description='Search for the raw_id.', +# multi=True, argparse_action='split', elastic_search_type='terms'), +# mainfile=DomainQuantity( +# description='Search for the mainfile.', +# multi=True, argparse_action='append', elastic_search_type='terms'), +# external_id=DomainQuantity( +# description='External user provided id. Does not have to be unique necessarily.', +# multi=True, argparse_action='split', elastic_search_type='terms'), +# calc_hash=DomainQuantity( +# description='Search for the entries hash.', +# multi=True, argparse_action='split', elastic_search_type='terms'), +# dataset=DomainQuantity( +# elastic_field='datasets.name', multi=True, elastic_search_type='match', +# description='Search for a particular dataset by name.'), +# dataset_id=DomainQuantity( +# elastic_field='datasets.id', multi=True, +# description='Search for a particular dataset by its id.'), +# doi=DomainQuantity( +# elastic_field='datasets.doi', multi=True, +# description='Search for a particular dataset by doi (incl. http://dx.doi.org).'), +# formula=DomainQuantity( +# 'The chemical (hill) formula of the simulated system.', +# order_default=True), +# atoms=DomainQuantity( +# 'The atom labels of all atoms in the simulated system.', +# aggregations=len(ase.data.chemical_symbols), multi=True), +# only_atoms=DomainQuantity( +# 'The atom labels concatenated in species-number order. 
Used with keyword search ' +# 'to facilitate exclusive searches.', +# elastic_value=only_atoms, metadata_field='atoms', multi=True), +# n_atoms=DomainQuantity( +# 'Number of atoms in the simulated system', +# elastic_mapping=Integer())) + +# base_metrics = dict( +# datasets=('dataset_id', 'cardinality'), +# uploads=('upload_id', 'cardinality'), +# uploaders=('uploader_name', 'cardinality'), +# authors=('authors', 'cardinality'), +# unique_entries=('calc_hash', 'cardinality')) + +# base_groups = dict( +# datasets=('dataset_id', 'datasets'), +# uploads=('upload_id', 'uploads')) + +# @classmethod +# def get_quantity(cls, name_spec) -> DomainQuantity: +# ''' +# Returns the quantity definition for the given quantity name. The name can be the +# qualified name (``domain.quantity``) or in Django-style (``domain__quantity``). +# ''' +# qualified_name = name_spec.replace('__', '.') +# split_name = qualified_name.split('.') +# if len(split_name) == 1: +# return cls.base_quantities[split_name[0]] +# elif len(split_name) == 2: +# return cls.instances[split_name[0]].quantities[split_name[1]] +# else: +# assert False, 'qualified quantity name depth must be 2 max' + +# @classmethod +# def all_quantities(cls) -> Iterable[DomainQuantity]: +# return set([quantity for domain in cls.instances.values() for quantity in domain.quantities.values()]) + +# def __init__( +# self, name: str, domain_entry_class: Type[EntryMetadata], +# quantities: Dict[str, DomainQuantity], +# metrics: Dict[str, Tuple[str, str]], +# groups: Dict[str, Tuple[str, str]], +# default_statistics: List[str], +# root_sections=['section_run', 'section_entry_info'], +# metainfo_all_package='all.nomadmetainfo.json') -> None: + +# domain_quantities = quantities + +# Domain.instances[name] = self + +# self.name = name +# self.domain_entry_class = domain_entry_class +# self.domain_quantities: Dict[str, DomainQuantity] = {} +# self.root_sections = root_sections +# self.metainfo_all_package = metainfo_all_package +# self.default_statistics = default_statistics + +# # TODO +# return + +# reference_domain_calc = EntryMetadata(domain=name) +# reference_general_calc = EntryMetadata(domain=None) + +# # add non specified quantities from additional metadata class fields +# for quantity_name in reference_domain_calc.__dict__.keys(): +# if not hasattr(reference_general_calc, quantity_name): +# quantity = domain_quantities.get(quantity_name, None) +# if quantity is None: +# domain_quantities[quantity_name] = DomainQuantity() + +# # ensure domain quantity names and domains +# for quantity_name, quantity in domain_quantities.items(): +# quantity.domain = name +# quantity.name = quantity_name + +# # add domain prefix to domain metrics and groups +# domain_metrics = { +# '%s.%s' % (name, key): (quantities[quantity].qualified_elastic_field, es_op) +# for key, (quantity, es_op) in metrics.items()} +# domain_groups = { +# '%s.%s' % (name, key): (quantities[quantity].qualified_name, '%s.%s' % (name, metric)) +# for key, (quantity, metric) in groups.items()} + +# # add all domain quantities +# for quantity_name, quantity in domain_quantities.items(): +# self.domain_quantities[quantity.name] = quantity + +# # update the multi status from an example value +# if quantity.metadata_field in reference_domain_calc.__dict__: +# quantity.multi = isinstance( +# reference_domain_calc.__dict__[quantity.metadata_field], list) + +# assert not hasattr(reference_general_calc, quantity_name), \ +# 'quantity overrides general non domain quantity: %s' % quantity_name + +# # construct 
search quantities from base and domain quantities +# self.quantities = dict(**Domain.base_quantities) +# for quantity_name, quantity in self.quantities.items(): +# quantity.name = quantity_name +# self.quantities.update(self.domain_quantities) + +# assert any(quantity.order_default for quantity in Domain.instances[name].quantities.values()), \ +# 'you need to define a order default quantity' + +# # construct metrics from base and domain metrics +# self.metrics = dict(**Domain.base_metrics) +# self.metrics.update(**domain_metrics) +# self.groups = dict(**Domain.base_groups) +# self.groups.update(**domain_groups) + +# @property +# def metrics_names(self) -> Iterable[str]: +# ''' Just the names of all metrics. ''' +# return list(self.metrics.keys()) + +# @property +# def aggregations(self) -> Dict[str, int]: +# ''' +# The search aggregations and the default maximum number of calculated buckets. See also +# :func:`nomad.search.aggregations`. +# ''' +# return { +# quantity.name: quantity.aggregations +# for quantity in self.quantities.values() +# if quantity.aggregations > 0 +# } + +# @property +# def aggregations_names(self) -> Iterable[str]: +# ''' Just the names of all metrics. ''' +# return list(self.aggregations.keys()) + +# @property +# def order_default_quantity(self) -> str: +# for quantity in self.quantities.values(): +# if quantity.order_default: +# return quantity.qualified_name + +# assert False, 'each domain must defina an order_default quantity' def get_optional_backend_value(backend, key, section, unavailable_value=None, logger=None): diff --git a/nomad/datamodel/dft.py b/nomad/datamodel/dft.py index f9cdb22341f0dd2ee7fdb677852a72afa8aa444f..49efdfcd8165e1c7aadbd3c1fe95e443f2852a08 100644 --- a/nomad/datamodel/dft.py +++ b/nomad/datamodel/dft.py @@ -12,21 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' DFT specific metadata -""" +''' -from typing import List import re -from elasticsearch_dsl import Integer, Object, InnerDoc, Keyword from nomadcore.local_backend import ParserEvent from nomad import utils, config -from nomad.metainfo import optimade, MSection, Section, Quantity, MEnum -from nomad.metainfo.elastic import elastic_mapping, elastic_obj +from nomad.metainfo import optimade, MSection, Section, Quantity, MEnum, SubSection +from nomad.metainfo.search import SearchQuantity -from .base import CalcWithMetadata, DomainQuantity, Domain, get_optional_backend_value +from .base import get_optional_backend_value xc_treatments = { @@ -38,7 +36,7 @@ xc_treatments = { 'vdw': 'vdW', 'lda': 'LDA', } -""" https://gitlab.mpcdf.mpg.de/nomad-lab/nomad-meta-info/wikis/metainfo/XC-functional """ +''' https://gitlab.mpcdf.mpg.de/nomad-lab/nomad-meta-info/wikis/metainfo/XC-functional ''' basis_sets = { 'gaussians': 'gaussians', @@ -70,81 +68,130 @@ def simplify_version(version): class Label(MSection): - """ + ''' Label that further classify a structure. Attributes: label: The label as a string type: The type of the label source: The source that this label was taken from. 
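A small sketch for the Label section described above and the DFTMetadata section defined next in this hunk; it assumes that metainfo sections accept their quantities as keyword arguments (as EntryMetadata is used elsewhere in this diff) and all values are made up.

from nomad.datamodel.dft import DFTMetadata, Label

dft = DFTMetadata(code_name='VASP', basis_set='plane waves', spacegroup=221)
# labels is a repeating sub-section; list assignment mirrors the m_update code below
dft.labels = [Label(label='bcc', type='classification', source='springer')]
print(dft.m_to_dict())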
- """ - m_def = Section(a_elastic=dict(type=InnerDoc)) - - label = Quantity(type=str, a_elastic=dict(type=Keyword)) + ''' + label = Quantity(type=str, a_search=SearchQuantity()) type = Quantity(type=MEnum( 'compound_class', 'classification', 'prototype', 'prototype_id'), - a_elastic=dict(type=Keyword)) + a_search=SearchQuantity()) source = Quantity( type=MEnum('springer', 'aflow_prototype_library'), - a_elastic=dict(type=Keyword)) - - -ESLabel = elastic_mapping(Label.m_def, InnerDoc) - - -class DFTCalcWithMetadata(CalcWithMetadata): - - def __init__(self, **kwargs): - self.basis_set: str = None - self.xc_functional: str = None - self.system: str = None - self.crystal_system: str = None - self.spacegroup: str = None - self.spacegroup_symbol: str = None - self.code_name: str = None - self.code_version: str = None - - self.n_geometries = 0 - self.n_calculations = 0 - self.n_total_energies = 0 - self.n_quantities = 0 - self.quantities = [] - self.geometries = [] - self.group_hash: str = None - - self.labels: List[Label] = [] - self.optimade: optimade.OptimadeEntry = None - - super().__init__(**kwargs) - - def update(self, **kwargs): - super().update(**kwargs) - - if len(self.labels) > 0: - self.labels = [Label.m_from_dict(label) for label in self.labels] - - if self.optimade is not None and isinstance(self.optimade, dict): - self.optimade = optimade.OptimadeEntry.m_from_dict(self.optimade) - - def __getitem__(self, key): - value = super().__getitem__(key) - - if key == 'labels': - return [item.m_to_dict() for item in value] - - if key == 'optimade': - return value.m_to_dict() - - return value + a_search=SearchQuantity()) + + +class DFTMetadata(MSection): + m_def = Section(a_domain='dft') + + basis_set = Quantity( + type=str, default='not processed', + description='The used basis set functions.', + a_search=SearchQuantity(statistic_size=20, default_statistic=True)) + + xc_functional = Quantity( + type=str, default='not processed', + description='The libXC based xc functional classification used in the simulation.', + a_search=SearchQuantity(statistic_size=20, default_statistic=True)) + + system = Quantity( + type=str, default='not processed', + description='The system type of the simulated system.', + a_search=SearchQuantity(default_statistic=True)) + + crystal_system = Quantity( + type=str, default='not processed', + description='The crystal system type of the simulated system.', + a_search=SearchQuantity(default_statistic=True)) + + spacegroup = Quantity( + type=int, default='not processed', + description='The spacegroup of the simulated system as number.', + a_search=SearchQuantity()) + + spacegroup_symbol = Quantity( + type=str, default='not processed', + description='The spacegroup as international short symbol.', + a_search=SearchQuantity()) + + code_name = Quantity( + type=str, default='not processed', + description='The name of the used code.', + a_search=SearchQuantity(statistic_size=40, default_statistic=True)) + + code_version = Quantity( + type=str, default='not processed', + description='The version of the used code.', + a_search=SearchQuantity()) + + n_geometries = Quantity( + type=int, description='Number of unique geometries.', + a_sesrch=SearchQuantity(metric_name='geometries', metric='sum')) + + n_calculations = Quantity( + type=int, + description='Number of single configuration calculation sections', + a_search=SearchQuantity(metric_name='calculations', metric='sum')) + + n_total_energies = Quantity( + type=int, description='Number of total energy calculations', + 
a_search=SearchQuantity(metric_name='total_energies', metric='sum')) + + n_quantities = Quantity( + type=int, description='Number of metainfo quantities parsed from the entry.', + a_search=SearchQuantity(metric='sum', metric_name='quantities')) + + quantities = Quantity( + type=str, shape=['0..*'], + description='All quantities that are used by this entry.', + a_search=SearchQuantity( + metric_name='distinct_quantities', metric='cardinality', many_and='append')) + + geometries = Quantity( + type=str, shape=['0..*'], + description='Hashes for each simulated geometry', + a_search=SearchQuantity(metric_name='unique_geometries', metric='cardinality')) + + group_hash = Quantity( + type=str, + description='Hashes that describe unique geometries simulated by this code run.', + a_search=SearchQuantity(many_or='append', group='groups', metric_name='groups', metric='cardinality')) + + labels = SubSection( + sub_section=Label, repeats=True, + description='The labels taken from AFLOW prototypes and springer.', + a_search='labels') + + optimade = SubSection( + sub_section=optimade.OptimadeEntry, + description='Metadata used for the optimade API.', + a_search='optimade') + + def m_update(self, **kwargs): + # TODO necessary? + if 'labels' in kwargs: + print('########################## A') + self.labels = [Label.m_from_dict(label) for label in kwargs.pop('labels')] + + if 'optimade' in kwargs: + print('########################## B') + self.optimade = optimade.OptimadeEntry.m_from_dict(kwargs.pop('optimade')) + + super().m_update(**kwargs) def apply_domain_metadata(self, backend): from nomad.normalizing.system import normalized_atom_labels + entry = self.m_parent logger = utils.get_logger(__name__).bind( - upload_id=self.upload_id, calc_id=self.calc_id, mainfile=self.mainfile) + upload_id=entry.upload_id, calc_id=entry.calc_id, mainfile=entry.mainfile) # code and code specific ids self.code_name = backend.get_value('program_name', 0) @@ -153,44 +200,44 @@ class DFTCalcWithMetadata(CalcWithMetadata): except KeyError: self.code_version = config.services.unavailable_value - self.raw_id = get_optional_backend_value(backend, 'raw_id', 'section_run', 0) + raw_id = get_optional_backend_value(backend, 'raw_id', 'section_run', None) + if raw_id is not None: + entry.raw_id = raw_id # metadata (system, method, chemistry) - self.atoms = get_optional_backend_value(backend, 'atom_labels', 'section_system', [], logger=logger) - if hasattr(self.atoms, 'tolist'): - self.atoms = self.atoms.tolist() - self.n_atoms = len(self.atoms) - self.atoms = list(set(normalized_atom_labels(set(self.atoms)))) - self.atoms.sort() + atoms = get_optional_backend_value(backend, 'atom_labels', 'section_system', [], logger=logger) + if hasattr(atoms, 'tolist'): + atoms = atoms.tolist() + entry.n_atoms = len(atoms) + atoms = list(set(normalized_atom_labels(set(atoms)))) + atoms.sort() + entry.atoms = atoms self.crystal_system = get_optional_backend_value( backend, 'crystal_system', 'section_symmetry', logger=logger) self.spacegroup = get_optional_backend_value( backend, 'space_group_number', 'section_symmetry', 0, logger=logger) self.spacegroup_symbol = get_optional_backend_value( - backend, 'international_short_symbol', 'section_symmetry', 0, logger=logger) + backend, 'international_short_symbol', 'section_symmetry', logger=logger) self.basis_set = map_basis_set_to_basis_set_label( get_optional_backend_value(backend, 'program_basis_set_type', 'section_run', logger=logger)) self.system = get_optional_backend_value( backend, 'system_type', 
'section_system', logger=logger) - self.formula = get_optional_backend_value( + entry.formula = get_optional_backend_value( backend, 'chemical_composition_bulk_reduced', 'section_system', logger=logger) self.xc_functional = map_functional_name_to_xc_treatment( get_optional_backend_value(backend, 'XC_functional_name', 'section_method', logger=logger)) # grouping self.group_hash = utils.hash( - self.formula, + entry.formula, self.spacegroup, self.basis_set, self.xc_functional, self.code_name, self.code_version, - self.with_embargo, - self.comment, - self.references, - self.uploader, - self.coauthors) + entry.with_embargo, + entry.uploader) # metrics and quantities quantities = set() @@ -247,69 +294,3 @@ class DFTCalcWithMetadata(CalcWithMetadata): # optimade self.optimade = backend.get_mi2_section(optimade.OptimadeEntry.m_def) - - -def _elastic_label_value(label): - if isinstance(label, str): - return label - else: - return elastic_obj(label, ESLabel) - - -Domain( - 'dft', DFTCalcWithMetadata, - quantities=dict( - basis_set=DomainQuantity( - 'The used basis set functions.', aggregations=20), - xc_functional=DomainQuantity( - 'The xc functional type used for the simulation.', aggregations=20), - system=DomainQuantity( - 'The system type of the simulated system.', aggregations=10), - crystal_system=DomainQuantity( - 'The crystal system type of the simulated system.', aggregations=10), - code_name=DomainQuantity( - 'The code name.', aggregations=40), - spacegroup=DomainQuantity('The spacegroup of the simulated system as number'), - spacegroup_symbol=DomainQuantity('The spacegroup as international short symbol'), - geometries=DomainQuantity( - 'Hashes that describe unique geometries simulated by this code run.', multi=True), - group_hash=DomainQuantity( - 'A hash from key metadata used to group similar entries.'), - quantities=DomainQuantity( - 'All quantities that are used by this calculation', - metric=('quantities', 'value_count'), multi=True), - n_total_energies=DomainQuantity( - 'Number of total energy calculations', - elastic_mapping=Integer()), - n_calculations=DomainQuantity( - 'Number of single configuration calculation sections', - elastic_mapping=Integer()), - n_quantities=DomainQuantity( - 'Number of overall parsed quantities', - elastic_mapping=Integer()), - n_geometries=DomainQuantity( - 'Number of unique geometries', - elastic_mapping=Integer()), - labels=DomainQuantity( - 'Search based for springer classification and aflow prototypes', - elastic_field='labels.label', - elastic_mapping=Object(ESLabel), - elastic_value=lambda labels: [_elastic_label_value(label) for label in labels], - multi=True), - optimade=DomainQuantity( - 'Search based on optimade\'s filter query language', - elastic_mapping=Object(optimade.ESOptimadeEntry), - elastic_value=lambda entry: elastic_obj(entry, optimade.ESOptimadeEntry) - )), - metrics=dict( - total_energies=('n_total_energies', 'sum'), - calculations=('n_calculations', 'sum'), - quantities=('n_quantities', 'sum'), - geometries=('n_geometries', 'sum'), - unique_geometries=('geometries', 'cardinality'), - groups=('group_hash', 'cardinality') - ), - groups=dict( - groups=('group_hash', 'groups')), - default_statistics=[ - 'atoms', 'dft.basis_set', 'dft.xc_functional', 'dft.system', 'dft.crystal_system', 'dft.code_name']) diff --git a/nomad/datamodel/ems.py b/nomad/datamodel/ems.py index 14277f4a9f213a3b4086e1b84c732001780e6912..ff6a983567124ba34fe46e22ba52c314e18f2c01 100644 --- a/nomad/datamodel/ems.py +++ b/nomad/datamodel/ems.py @@ -12,55 +12,60 
@@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' Experimental material science specific metadata -""" +''' from nomad import utils +from nomad.metainfo import Quantity, MSection, Section, Datetime +from nomad.metainfo.search import SearchQuantity -from .base import CalcWithMetadata, DomainQuantity, Domain, get_optional_backend_value +from .base import get_optional_backend_value -class EMSEntryWithMetadata(CalcWithMetadata): +class EMSMetadata(MSection): + m_def = Section(a_domain='ems') - def __init__(self, **kwargs): - # sample quantities - self.chemical: str = None - self.sample_constituents: str = None - self.sample_microstructure: str = None + # sample quantities + chemical = Quantity(type=str, default='not processed', a_search=SearchQuantity()) + sample_constituents = Quantity(type=str, default='not processed', a_search=SearchQuantity(default_statistic=True)) + sample_microstructure = Quantity(type=str, default='not processed', a_search=SearchQuantity(default_statistic=True)) - # general metadata - self.experiment_summary: str = None - self.experiment_location: str = None - self.experiment_time: str = None + # general metadata + experiment_summary = Quantity(type=str, default='not processed', a_search=SearchQuantity()) + experiment_location = Quantity(type=str, default='not processed', a_search=SearchQuantity()) + experiment_time = Quantity(type=Datetime, default='not processed', a_search=SearchQuantity()) - # method - self.method: str = None - self.probing_method: str = None + # method + method = Quantity(type=str, default='not processed', a_search=SearchQuantity(default_statistic=True)) + probing_method = Quantity(type=str, default='not processed', a_search=SearchQuantity(default_statistic=True)) - # data metadata - self.repository_name: str = None - self.repository_url: str = None - self.preview_url: str = None + # data metadata + repository_name = Quantity(type=str, default='not processed', a_search=SearchQuantity()) + repository_url = Quantity(type=str, default='not processed', a_search=SearchQuantity()) + preview_url = Quantity(type=str, default='not processed', a_search=SearchQuantity()) - self.quantities = [] - self.group_hash: str = None - - super().__init__(**kwargs) + # TODO move + quantities = Quantity(type=str, shape=['0..*'], default=[], a_search=SearchQuantity()) + group_hash = Quantity(type=str, a_search=SearchQuantity()) def apply_domain_metadata(self, backend): + entry = self.m_parent logger = utils.get_logger(__name__).bind( - upload_id=self.upload_id, calc_id=self.calc_id, mainfile=self.mainfile) + upload_id=entry.upload_id, calc_id=entry.calc_id, mainfile=entry.mainfile) - self.formula = get_optional_backend_value( + entry.formula = get_optional_backend_value( backend, 'sample_chemical_formula', 'section_sample', logger=logger) - self.atoms = get_optional_backend_value( + atoms = get_optional_backend_value( backend, 'sample_atom_labels', 'section_sample', logger=logger) - if hasattr(self.atoms, 'tolist'): - self.atoms = self.atoms.tolist() - self.n_atoms = len(self.atoms) - self.atoms = list(set(self.atoms)) - self.atoms.sort() + if hasattr(atoms, 'tolist'): + atoms = atoms.tolist() + entry.n_atoms = len(atoms) + + atoms = list(set(atoms)) + atoms.sort() + entry.atoms = atoms + self.chemical = get_optional_backend_value( backend, 'sample_chemical_name', 'section_sample', logger=logger) self.sample_microstructure = get_optional_backend_value( @@ -88,14 +93,11 @@ class 
EMSEntryWithMetadata(CalcWithMetadata): backend, 'data_preview_url', 'section_data', logger=logger) self.group_hash = utils.hash( - self.formula, + entry.formula, self.method, self.experiment_location, - self.with_embargo, - self.comment, - self.references, - self.uploader, - self.coauthors) + entry.with_embargo, + entry.uploader) quantities = set() @@ -103,26 +105,3 @@ class EMSEntryWithMetadata(CalcWithMetadata): quantities.add(meta_info) self.quantities = list(quantities) - - -Domain( - 'ems', EMSEntryWithMetadata, - root_sections=['section_experiment', 'section_entry_info'], - metainfo_all_package='all.experimental.nomadmetainfo.json', - quantities=dict( - method=DomainQuantity( - 'The experimental method used.', aggregations=20), - probing_method=DomainQuantity( - 'The used probing method.', aggregations=10), - sample_microstructure=DomainQuantity( - 'The sample micro structure.', aggregations=10), - sample_constituents=DomainQuantity( - 'The sample constituents.', aggregations=10), - quantities=DomainQuantity( - 'All quantities that are used by this calculation')), - metrics=dict( - quantities=('quantities', 'value_count')), - groups=dict(), - default_statistics=[ - 'atoms', 'ems.method', 'ems.probing_method', 'ems.sample_microstructure', - 'ems.sample_constituents']) diff --git a/nomad/datamodel/metainfo.py b/nomad/datamodel/metainfo.py index ff6eaebd254ae01b4d462eeeeaaa74c59d2022ba..07212222e06dfcb19a171c8d298daff921a857de 100644 --- a/nomad/datamodel/metainfo.py +++ b/nomad/datamodel/metainfo.py @@ -12,20 +12,36 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' This duplicates functionality for .base.py. It represents first pieces of a transition towards using the new metainfo system for all repository metadata. -""" -from typing import Dict +''' +from typing import Dict, Any from cachetools import cached, TTLCache -from elasticsearch_dsl import Keyword +from elasticsearch_dsl import Keyword, Text, analyzer, tokenizer +import ase.data from nomad import metainfo, config +from nomad.metainfo.search import SearchQuantity import nomad.metainfo.mongoengine +from .dft import DFTMetadata +from .ems import EMSMetadata + + +def _only_atoms(atoms): + numbers = [ase.data.atomic_numbers[atom] for atom in atoms] + only_atoms = [ase.data.chemical_symbols[number] for number in sorted(numbers)] + return ''.join(only_atoms) + + +path_analyzer = analyzer( + 'path_analyzer', + tokenizer=tokenizer('path_tokenizer', 'pattern', pattern='/')) + class User(metainfo.MSection): - """ A NOMAD user. + ''' A NOMAD user. Typically a NOMAD user has a NOMAD account. The user related data is managed by NOMAD keycloak user-management system. 
Users are used to denote uploaders, authors, @@ -41,16 +57,26 @@ class User(metainfo.MSection): create: The time the account was created repo_user_id: The id that was used to identify this user in the NOMAD CoE Repository is_admin: Bool that indicated, iff the user the use admin user - """ + ''' + + user_id = metainfo.Quantity( + type=str, + a_me=dict(primary_key=True), + a_search=SearchQuantity()) - user_id = metainfo.Quantity(type=str, a_me=dict(primary_key=True)) name = metainfo.Quantity( type=str, - derived=lambda user: ('%s %s' % (user.first_name, user.last_name)).strip()) + derived=lambda user: ('%s %s' % (user.first_name, user.last_name)).strip(), + a_search=SearchQuantity(es_mapping=Text(fields={'keyword': Keyword()}))) + first_name = metainfo.Quantity(type=str) last_name = metainfo.Quantity(type=str) email = metainfo.Quantity( - type=str, a_me=dict(index=True), a_elastic=dict(mapping=Keyword)) + type=str, + a_me=dict(index=True), + a_elastic=dict(mapping=Keyword), # TODO remove? + a_search=SearchQuantity()) + username = metainfo.Quantity(type=str) affiliation = metainfo.Quantity(type=str) affiliation_address = metainfo.Quantity(type=str) @@ -76,8 +102,33 @@ class User(metainfo.MSection): } +class UserReference(metainfo.Reference): + ''' + Special metainfo reference type that allows to use user_ids as values. It automatically + resolves user_ids to User objects. This is done lazily on getting the value. + ''' + + def __init__(self): + super().__init__(User.m_def) + + def set_normalize(self, section: metainfo.MSection, quantity_def: metainfo.Quantity, value: Any) -> Any: + if isinstance(value, str): + return metainfo.MProxy(value) + else: + return super().set_normalize(section, quantity_def, value) + + def resolve(self, section: metainfo.MSection, quantity_def: metainfo.Quantity, value: Any) -> metainfo.MSection: + return User.get(user_id=value.url) + + def serialize(self, section: metainfo.MSection, quantity_def: metainfo.Quantity, value: Any) -> Any: + return value.user_id + + +user_reference = UserReference() + + class Dataset(metainfo.MSection): - """ A Dataset is attached to one or many entries to form a set of data. + ''' A Dataset is attached to one or many entries to form a set of data. Args: dataset_id: The unique identifier for this dataset as a string. It should be @@ -94,31 +145,96 @@ class Dataset(metainfo.MSection): pid: The original NOMAD CoE Repository dataset PID. Old DOIs still reference datasets based on this id. Is not used for new datasets. created: The date when the dataset was first created. - """ + ''' dataset_id = metainfo.Quantity( type=str, - a_me=dict(primary_key=True)) + a_me=dict(primary_key=True), + a_search=SearchQuantity()) name = metainfo.Quantity( type=str, - a_me=dict(index=True)) + a_me=dict(index=True), + a_search=SearchQuantity()) user_id = metainfo.Quantity( type=str, a_me=dict(index=True)) doi = metainfo.Quantity( type=str, - a_me=dict(index=True)) + a_me=dict(index=True), + a_search=SearchQuantity()) pid = metainfo.Quantity( type=str, a_me=dict(index=True)) created = metainfo.Quantity( type=metainfo.Datetime, - a_me=dict(index=True)) + a_me=dict(index=True), + a_search=SearchQuantity()) -class UserMetadata(metainfo.MSection): - """ NOMAD entry quantities that are given by the user or determined by user actions. +class DatasetReference(metainfo.Reference): + ''' + Special metainfo reference type that allows to use dataset_ids as values. It automatically + resolves dataset_ids to Dataset objects. This is done lazily on getting the value. 
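
A brief, illustrative sketch of how the reference quantities above are intended to be used; it is not part of the patch. `EntryMetadata` and `user_reference` are the definitions from this diff, the user id is a placeholder, and lazy resolution assumes a configured NOMAD installation because `User.get()` goes through keycloak.

```python
from nomad import datamodel

entry = datamodel.EntryMetadata(domain='dft', calc_id='test')
entry.uploader = '<some-keycloak-user-id>'  # set_normalize stores the id as an MProxy
print(entry.uploader.name)                  # resolve() lazily turns the proxy into a User on access
```
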
+    '''
+
+    def __init__(self):
+        super().__init__(Dataset.m_def)
+
+    def set_normalize(self, section: metainfo.MSection, quantity_def: metainfo.Quantity, value: Any) -> Any:
+        if isinstance(value, str):
+            return metainfo.MProxy(value)
+        else:
+            return super().set_normalize(section, quantity_def, value)
+
+    def resolve(self, section: metainfo.MSection, quantity_def: metainfo.Quantity, value: Any) -> metainfo.MSection:
+        return Dataset.m_def.m_x('me').get(dataset_id=value.url)
+
+    def serialize(self, section: metainfo.MSection, quantity_def: metainfo.Quantity, value: Any) -> Any:
+        if isinstance(value, metainfo.MProxy):
+            return value.url
+        else:
+            return value.dataset_id
+
+    def deserialize(self, section: metainfo.MSection, quantity_def: metainfo.Quantity, value: Any) -> Any:
+        return metainfo.MProxy(value)
+
+
+dataset_reference = DatasetReference()
+
+
+class EditableUserMetadata(metainfo.MCategory):
+    ''' NOMAD entry quantities that can be edited by the user after publish. '''
+
+
+class UserMetadata(metainfo.MCategory):
+    ''' NOMAD entry quantities that are given by the user or determined by user actions. '''
+    pass
+
+
+class DomainMetadata(metainfo.MCategory):
+    ''' NOMAD entry quantities that are determined by the uploaded data. '''
+    pass
+
+
+class EntryMetadata(metainfo.MSection):
+    '''
+    Attributes:
+        upload_id: The ``upload_id`` of the calculations upload (random UUID).
+        calc_id: The unique mainfile based calculation id.
+        calc_hash: The raw file content based checksum/hash of this calculation.
+        pid: The unique persistent id of this calculation.
+        mainfile: The upload relative mainfile path.
+        domain: Must be the key for a registered domain. This determines which actual
+            subclass is instantiated.
+
+        files: A list of all files, relative to upload.
+        upload_time: The time when the calc was uploaded.
+        uploader: An object describing the uploading user, has at least ``user_id``
+        processed: Boolean indicating if this calc was successfully processed and archive
+            data and calc metadata is available.
+        last_processing: A datetime with the time of the last successful processing.
+        nomad_version: A string that describes the version of the nomad software that was
+            used to do the last successful processing.
 
-    Args:
         comment: An arbitrary string with user provided information about the entry.
         references: A list of URLs for resources that are related to the entry.
         uploader: Id of the uploader of this entry.
@@ -131,16 +247,217 @@ class UserMetadata(metainfo.MSection):
            user, and users the entry is shared with (see shared_with).
upload_time: The time that this entry was uploaded datasets: Ids of all datasets that this entry appears in - """ - - comment = metainfo.Quantity(type=str) - references = metainfo.Quantity(type=str, shape=['0..*']) - uploader = metainfo.Quantity(type=str, a_flask=dict(admin_only=True, verify=User)) - coauthors = metainfo.Quantity(type=str, shape=['0..*'], a_flask=dict(verify=User)) - shared_with = metainfo.Quantity(type=str, shape=['0..*'], a_flask=dict(verify=User)) - with_embargo = metainfo.Quantity(type=bool) - upload_time = metainfo.Quantity(type=metainfo.Datetime, a_flask=dict(admin_only=True)) - datasets = metainfo.Quantity(type=str, shape=['0..*'], a_flask=dict(verify=Dataset)) + ''' + upload_id = metainfo.Quantity( + type=str, + description='A random UUID that uniquely identifies the upload of the entry.', + a_search=SearchQuantity( + many_or='append', group='uploads', metric_name='uploads', metric='cardinality')) + + calc_id = metainfo.Quantity( + type=str, + description='A unique ID based on the upload id and entry\'s mainfile.', + a_search=SearchQuantity(many_or='append')) + + calc_hash = metainfo.Quantity( + type=str, + description='A raw file content based checksum/hash.', + a_search=SearchQuantity( + many_or='append', metric_name='unique_entries', metric='cardinality')) + + mainfile = metainfo.Quantity( + type=str, + description='The upload relative mainfile path.', + a_search=[ + SearchQuantity( + description='Search within the mainfile path.', + es_mapping=Text(multi=True, analyzer=path_analyzer, fields={'keyword': Keyword()}), + many_or='append', es_quantity='mainfile.keyword'), + SearchQuantity( + description='Search for the exact mainfile.', + many_and='append', name='mainfile_path', es_quantity='mainfile.keyword')]) + + files = metainfo.Quantity( + type=str, shape=['0..*'], + description='The entries raw file paths relative to its upload.', + a_search=[ + SearchQuantity( + description='Search within the paths.', name='path', + es_mapping=Text( + multi=True, analyzer=path_analyzer, fields={'keyword': Keyword()}) + ), + SearchQuantity( + description='Search for exact paths.', + many_or='append', name='files', es_quantity='files.keyword')]) + + pid = metainfo.Quantity( + type=int, + description='The unique, sequentially enumerated, integer persistent identifier', + a_search=SearchQuantity(many_or='append')) + + raw_id = metainfo.Quantity( + type=str, + description='A raw format specific id that was acquired from the files of this entry', + a_search=SearchQuantity(many_or='append')) + + domain = metainfo.Quantity( + type=metainfo.MEnum('dft', 'ems'), + description='The material science domain', + a_search=SearchQuantity()) + + published = metainfo.Quantity( + type=bool, default=False, + description='Indicates if the entry is published', + a_search=SearchQuantity()) + + processed = metainfo.Quantity( + type=bool, default=False, + description='Indicates that the entry is successfully processed.', + a_search=SearchQuantity()) + + last_processing = metainfo.Quantity( + type=metainfo.Datetime, + description='The datetime of the last attempted processing.') + + nomad_version = metainfo.Quantity( + type=str, + description='The NOMAD version used for the last processing attempt.', + a_search=SearchQuantity(many_or='append')) + nomad_commit = metainfo.Quantity( + type=str, + description='The NOMAD commit used for the last processing attempt.', + a_search=SearchQuantity(many_or='append')) + parser_name = metainfo.Quantity( + type=str, + description='The NOMAD parser used for the last 
processing attempt.',
+        a_search=SearchQuantity(many_or='append'))
+
+    comment = metainfo.Quantity(
+        type=str, categories=[UserMetadata, EditableUserMetadata],
+        description='A user provided comment.',
+        a_search=SearchQuantity(es_mapping=Text()))
+
+    references = metainfo.Quantity(
+        type=str, shape=['0..*'], categories=[UserMetadata, EditableUserMetadata],
+        description='User provided references (URLs).',
+        a_search=SearchQuantity())
+
+    uploader = metainfo.Quantity(
+        type=user_reference, categories=[UserMetadata],
+        description='The uploader of the entry',
+        a_flask=dict(admin_only=True, verify=User),
+        a_search=[
+            SearchQuantity(
+                description='Search uploader with exact names.',
+                metric_name='uploaders', metric='cardinality',
+                many_or='append', es_quantity='uploader.name.keyword'),
+            SearchQuantity(
+                name='uploader_id', es_quantity='uploader.user_id')
+        ])
+
+    coauthors = metainfo.Quantity(
+        type=user_reference, shape=['0..*'], default=[], categories=[UserMetadata, EditableUserMetadata],
+        description='A user provided list of co-authors.',
+        a_flask=dict(verify=User))
+
+    authors = metainfo.Quantity(
+        type=user_reference, shape=['0..*'],
+        description='All authors (uploader and co-authors).',
+        derived=lambda entry: ([entry.uploader] if entry.uploader is not None else []) + entry.coauthors,
+        a_search=SearchQuantity(
+            description='Search authors with exact names.',
+            metric='cardinality',
+            many_or='append', es_quantity='authors.name.keyword', statistic_size=1000))
+
+    shared_with = metainfo.Quantity(
+        type=user_reference, shape=['0..*'], default=[], categories=[UserMetadata, EditableUserMetadata],
+        description='A user provided list of users to share the entry with.',
+        a_flask=dict(verify=User))
+
+    owners = metainfo.Quantity(
+        type=user_reference, shape=['0..*'],
+        description='All owners (uploader and shared with users).',
+        derived=lambda entry: ([entry.uploader] if entry.uploader is not None else []) + entry.shared_with,
+        a_search=SearchQuantity(
+            description='Search owners with exact names.',
+            many_or='append', es_quantity='owners.name.keyword'))
+
+    with_embargo = metainfo.Quantity(
+        type=bool, default=False, categories=[UserMetadata, EditableUserMetadata],
+        description='Indicates if this entry is under an embargo',
+        a_search=SearchQuantity())
+
+    upload_time = metainfo.Quantity(
+        type=metainfo.Datetime, categories=[UserMetadata],
+        description='The datetime this entry was uploaded to nomad',
+        a_flask=dict(admin_only=True),
+        a_search=SearchQuantity(order_default=True))
+
+    upload_name = metainfo.Quantity(
+        type=str, categories=[UserMetadata],
+        description='The user provided upload name',
+        a_search=SearchQuantity(many_or='append'))
+
+    datasets = metainfo.Quantity(
+        type=dataset_reference, shape=['0..*'], default=[],
+        categories=[UserMetadata, EditableUserMetadata],
+        description='A list of user curated datasets this entry belongs to.',
+        a_flask=dict(verify=Dataset),
+        a_search=[
+            SearchQuantity(
+                es_quantity='datasets.name', many_or='append',
+                description='Search for a particular dataset by exact name.'),
+            SearchQuantity(
+                name='dataset_id', es_quantity='datasets.dataset_id', many_or='append',
+                group='datasets',
+                metric='cardinality', metric_name='datasets',
+                description='Search for a particular dataset by its id.')])
+
+    external_id = metainfo.Quantity(
+        type=str, categories=[UserMetadata],
+        description='A user provided external id.',
+        a_search=SearchQuantity(many_or='split'))
+
+    last_edit = metainfo.Quantity(
+        type=metainfo.Datetime,
categories=[UserMetadata], + description='The datetime the user metadata was edited last.', + a_search=SearchQuantity()) + + formula = metainfo.Quantity( + type=str, categories=[DomainMetadata], + description='A (reduced) chemical formula.', + a_search=SearchQuantity()) + + atoms = metainfo.Quantity( + type=str, shape=['n_atoms'], default=[], categories=[DomainMetadata], + description='The atom labels of all atoms of the entry\'s material.', + a_search=SearchQuantity( + many_and='append', default_statistic=True, statistic_size=len(ase.data.chemical_symbols))) + + only_atoms = metainfo.Quantity( + type=str, categories=[DomainMetadata], + description='The atom labels concatenated in order-number order.', + derived=lambda entry: _only_atoms(entry.atoms), + a_search=SearchQuantity(many_and='append', derived=_only_atoms)) + + n_atoms = metainfo.Quantity( + type=int, categories=[DomainMetadata], + description='The number of atoms in the entry\'s material', + a_search=SearchQuantity()) + + ems = metainfo.SubSection(sub_section=EMSMetadata, a_search='ems') + dft = metainfo.SubSection(sub_section=DFTMetadata, a_search='dft') + + def apply_user_metadata(self, metadata: dict): + ''' Applies a user provided metadata dict to this calc. ''' + self.m_update(**metadata) + + def apply_domain_metadata(self, backend): + assert self.domain is not None, 'all entries must have a domain' + domain_section_def = self.m_def.all_sub_sections.get(self.domain).sub_section + assert domain_section_def is not None, 'unknown domain %s' % self.domain + domain_section = self.m_create(domain_section_def.section_cls) + domain_section.apply_domain_metadata(backend) nomad.metainfo.mongoengine.init_section(User) diff --git a/nomad/doi.py b/nomad/doi.py index f05cf45a15bbff8a50c459cc4277f0764752c11b..8582a2fd125a0af658a8382f77e461e0cb98720a 100644 --- a/nomad/doi.py +++ b/nomad/doi.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' This module contains all functions necessary to manage DOI via datacite.org and its MDS API (https://support.datacite.org/docs/mds-api-guide). -""" +''' import xml.etree.ElementTree as ET import datetime import requests @@ -28,7 +28,7 @@ from nomad import config, utils def edit_url(doi: str, url: str = None): - """ Changes the URL of an already findable DOI. """ + ''' Changes the URL of an already findable DOI. ''' if url is None: url = 'https://repository.nomad-coe.eu/app/gui/datasets/doi/%s' % doi @@ -70,7 +70,7 @@ class DOI(Document): @staticmethod def create(title: str, user: User) -> 'DOI': - """ Creates a unique DOI with the NOMAD DOI prefix. """ + ''' Creates a unique DOI with the NOMAD DOI prefix. ''' # TODO We use a collection of all DOIs in mongo to ensure uniqueness. We attempt # to create new DOIs based on a counter per day until we find a non existing DOI. # This might be bad if many DOIs per day are to be expected. diff --git a/nomad/files.py b/nomad/files.py index 40a0456a7dbb7c33e50f4a7e0e6dfcaab1b7fe36..f6918865b653bbcf157e910a36bf08dd1ce3669e 100644 --- a/nomad/files.py +++ b/nomad/files.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' Uploads contains classes and functions to create and maintain file structures for uploads. @@ -46,7 +46,7 @@ might be published! There are multiple ways to solve this. 
Due to the rarity of the case, we take the most simple solution: if one file is public, all files are made public, execpt those being other mainfiles. Therefore, the aux files of a restricted calc might become public! -""" +''' from abc import ABCMeta import sys @@ -60,8 +60,7 @@ import io import pickle import json -from nomad import config, utils -from nomad.datamodel import UploadWithMetadata +from nomad import config, utils, datamodel from nomad.archive import write_archive # TODO this should become obsolete, once we are going beyong python 3.6. For now @@ -76,21 +75,21 @@ user_metadata_filename = 'user_metadata.pickle' def always_restricted(path: str): - """ + ''' Used to put general restrictions on files, e.g. due to licensing issues. Will be called during packing and while accessing public files. - """ + ''' basename = os.path.basename(path) if basename.startswith('POTCAR') and not basename.endswith('.stripped'): return True def copytree(src, dst): - """ + ''' A close on ``shutils.copytree`` that does not try to copy the stats on all files. This is unecessary for our usecase and also causes permission denies for unknown reasons. - """ + ''' os.makedirs(dst, exist_ok=False) for item in os.listdir(src): @@ -103,7 +102,7 @@ def copytree(src, dst): class PathObject: - """ + ''' Object storage-like abstraction for paths in general. Arguments: bucket: The bucket to store this object in @@ -111,7 +110,7 @@ class PathObject: os_path: Override the "object storage" path with the given path. prefix: Add a x-digit prefix directory, e.g. foo/test/ -> foo/tes/test create_prefix: Create the prefix right away - """ + ''' def __init__( self, bucket: str, object_id: str, os_path: str = None, prefix: bool = False, create_prefix: bool = False) -> None: @@ -153,7 +152,7 @@ class PathObject: @property def size(self) -> int: - """ The os determined file size. """ + ''' The os determined file size. ''' return os.stat(self.os_path).st_size def __repr__(self) -> str: @@ -161,13 +160,13 @@ class PathObject: class DirectoryObject(PathObject): - """ + ''' Object storage-like abstraction for directories. Arguments: bucket: The bucket to store this object in object_id: The object id (i.e. directory path) create: True if the directory structure should be created. Default is False. - """ + ''' def __init__(self, bucket: str, object_id: str, create: bool = False, **kwargs) -> None: super().__init__(bucket, object_id, **kwargs) self._create = create @@ -234,7 +233,7 @@ class UploadFiles(DirectoryObject, metaclass=ABCMeta): pickle.dump(data, f) def to_staging_upload_files(self, create: bool = False) -> 'StagingUploadFiles': - """ Casts to or creates corresponding staging upload files or returns None. """ + ''' Casts to or creates corresponding staging upload files or returns None. ''' raise NotImplementedError() @staticmethod @@ -247,7 +246,7 @@ class UploadFiles(DirectoryObject, metaclass=ABCMeta): return None def raw_file(self, file_path: str, *args, **kwargs) -> IO: - """ + ''' Opens a raw file and returns a file-like object. Additional args, kwargs are delegated to the respective `open` call. Arguments: @@ -255,38 +254,38 @@ class UploadFiles(DirectoryObject, metaclass=ABCMeta): Raises: KeyError: If the file does not exist. Restricted: If the file is restricted and upload access evaluated to False. - """ + ''' raise NotImplementedError() def raw_file_size(self, file_path: str) -> int: - """ + ''' Returns: The size of the given raw file. 
- """ + ''' raise NotImplementedError() def raw_file_manifest(self, path_prefix: str = None) -> Generator[str, None, None]: - """ + ''' Returns the path for all raw files in the archive (with a given prefix). Arguments: path_prefix: An optional prefix; only returns those files that have the prefix. Returns: An iterable over all (matching) raw files. - """ + ''' raise NotImplementedError() def raw_file_list(self, directory: str) -> List[Tuple[str, int]]: - """ + ''' Gives a list of directory contents and its size. Arguments: directory: The directory to list Returns: A list of tuples with file name and size. - """ + ''' raise NotImplementedError() def archive_file(self, calc_id: str, *args, **kwargs) -> IO: - """ + ''' Opens a archive file and returns a file-like objects. Additional args, kwargs are delegated to the respective `open` call. Arguments: @@ -294,18 +293,18 @@ class UploadFiles(DirectoryObject, metaclass=ABCMeta): Raises: KeyError: If the calc does not exist. Restricted: If the file is restricted and upload access evaluated to False. - """ + ''' raise NotImplementedError() def archive_file_size(self, calc_id: str) -> int: - """ + ''' Returns: The size of the archive. - """ + ''' raise NotImplementedError() def archive_log_file(self, calc_id: str, *args, **kwargs) -> IO: - """ + ''' Opens a archive log file and returns a file-like objects. Additional args, kwargs are delegated to the respective `open` call. Arguments: @@ -313,11 +312,11 @@ class UploadFiles(DirectoryObject, metaclass=ABCMeta): Raises: KeyError: If the calc does not exist. Restricted: If the file is restricted and upload access evaluated to False. - """ + ''' raise NotImplementedError() def open_zipfile_cache(self): - """ Allows to reuse the same zipfile for multiple file operations. Must be closed. """ + ''' Allows to reuse the same zipfile for multiple file operations. Must be closed. ''' pass def close_zipfile_cache(self): @@ -398,7 +397,7 @@ class StagingUploadFiles(UploadFiles): def add_rawfiles( self, path: str, move: bool = False, prefix: str = None, force_archive: bool = False, target_dir: DirectoryObject = None) -> None: - """ + ''' Add rawfiles to the upload. The given file will be copied, moved, or extracted. Arguments: @@ -408,7 +407,7 @@ class StagingUploadFiles(UploadFiles): force_archive: Expect the file to be a zip or other support archive file. Usually those files are only extracted if they can be extracted and copied instead. target_dir: Overwrite the used directory to extract to. Default is the raw directory of this upload. - """ + ''' assert not self.is_frozen assert os.path.exists(path) self._size += os.stat(path).st_size @@ -449,13 +448,13 @@ class StagingUploadFiles(UploadFiles): @property def is_frozen(self) -> bool: - """ Returns True if this upload is already *bagged*. """ + ''' Returns True if this upload is already *bagged*. ''' return self._frozen_file.exists() def pack( - self, upload: UploadWithMetadata, target_dir: DirectoryObject = None, + self, entries: Iterable[datamodel.EntryMetadata], target_dir: DirectoryObject = None, skip_raw: bool = False, skip_archive: bool = False) -> None: - """ + ''' Replaces the staging upload data with a public upload record by packing all data into files. It is only available if upload *is_bag*. This is potentially a long running operation. @@ -466,7 +465,7 @@ class StagingUploadFiles(UploadFiles): is the corresponding public upload files directory. 
skip_raw: determine to not pack the raw data, only archive and user metadata skip_raw: determine to not pack the archive data, only raw and user metadata - """ + ''' self.logger.info('started to pack upload') # freeze the upload @@ -501,25 +500,25 @@ class StagingUploadFiles(UploadFiles): # zip archives if not skip_archive: with utils.timer(self.logger, 'packed zip json archive'): - self._pack_archive_files(upload, create_zipfile) + self._pack_archive_files(entries, create_zipfile) with utils.timer(self.logger, 'packed msgpack archive'): - self._pack_archive_files_msgpack(upload, write_msgfile) + self._pack_archive_files_msgpack(entries, write_msgfile) # zip raw files if not skip_raw: with utils.timer(self.logger, 'packed raw files'): - self._pack_raw_files(upload, create_zipfile) + self._pack_raw_files(entries, create_zipfile) - def _pack_archive_files_msgpack(self, upload: UploadWithMetadata, write_msgfile): + def _pack_archive_files_msgpack(self, entries: Iterable[datamodel.EntryMetadata], write_msgfile): restricted, public = 0, 0 - for calc in upload.calcs: + for calc in entries: if calc.with_embargo: restricted += 1 else: public += 1 def create_iterator(with_embargo: bool): - for calc in upload.calcs: + for calc in entries: if with_embargo == calc.with_embargo: archive_file = self.archive_file_object(calc.calc_id) if archive_file.exists(): @@ -535,12 +534,12 @@ class StagingUploadFiles(UploadFiles): except Exception as e: self.logger.error('exception during packing archives', exc_info=e) - def _pack_archive_files(self, upload: UploadWithMetadata, create_zipfile): + def _pack_archive_files(self, entries: Iterable[datamodel.EntryMetadata], create_zipfile): archive_public_zip = create_zipfile('archive', 'public', self._archive_ext) archive_restricted_zip = create_zipfile('archive', 'restricted', self._archive_ext) try: - for calc in upload.calcs: + for calc in entries: archive_zip = archive_restricted_zip if calc.with_embargo else archive_public_zip archive_filename = '%s.%s' % (calc.calc_id, self._archive_ext) @@ -560,7 +559,7 @@ class StagingUploadFiles(UploadFiles): archive_restricted_zip.close() archive_public_zip.close() - def _pack_raw_files(self, upload: UploadWithMetadata, create_zipfile): + def _pack_raw_files(self, entries: Iterable[datamodel.EntryMetadata], create_zipfile): raw_public_zip = create_zipfile('raw', 'public', 'plain') raw_restricted_zip = create_zipfile('raw', 'restricted', 'plain') @@ -568,7 +567,7 @@ class StagingUploadFiles(UploadFiles): # 1. add all public raw files # 1.1 collect all public mainfiles and aux files public_files: Dict[str, str] = {} - for calc in upload.calcs: + for calc in entries: if not calc.with_embargo: mainfile = calc.mainfile assert mainfile is not None @@ -578,7 +577,7 @@ class StagingUploadFiles(UploadFiles): if not always_restricted(filepath): public_files[filepath] = None # 1.2 remove the non public mainfiles that have been added as auxfiles of public mainfiles - for calc in upload.calcs: + for calc in entries: if calc.with_embargo: mainfile = calc.mainfile assert mainfile is not None @@ -629,14 +628,14 @@ class StagingUploadFiles(UploadFiles): return results def calc_files(self, mainfile: str, with_mainfile: bool = True, with_cutoff: bool = True) -> Iterable[str]: - """ + ''' Returns all the auxfiles and mainfile for a given mainfile. This implements nomad's logic about what is part of a calculation and what not. The mainfile is first entry, the rest is sorted. 
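
An illustrative sketch of the changed packing API, not part of the patch: callers now pass an iterable of `EntryMetadata` instead of an `UploadWithMetadata` object. The `publish` wrapper, its `upload_files` argument, and the `entries` list are hypothetical stand-ins for the processing code that owns them.

```python
from typing import List
from nomad import datamodel

def publish(upload_files, entries: List[datamodel.EntryMetadata]) -> None:
    # previously this call was upload_files.pack(upload_with_metadata)
    upload_files.pack(entries)
```
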
Arguments: mainfile: The mainfile relative to upload with_mainfile: Do include the mainfile, default is True - """ + ''' mainfile_object = self._raw_dir.join_file(mainfile) if not mainfile_object.exists(): raise KeyError(mainfile) @@ -666,7 +665,7 @@ class StagingUploadFiles(UploadFiles): return aux_files def calc_id(self, mainfile: str) -> str: - """ + ''' Calculates a id for the given calc. Arguments: mainfile: The mainfile path relative to the upload that identifies the calc in the folder structure. @@ -674,11 +673,11 @@ class StagingUploadFiles(UploadFiles): The calc id Raises: KeyError: If the mainfile does not exist. - """ + ''' return utils.hash(self.upload_id, mainfile) def calc_hash(self, mainfile: str) -> str: - """ + ''' Calculates a hash for the given calc based on file contents and aux file contents. Arguments: mainfile: The mainfile path relative to the upload that identifies the calc in the folder structure. @@ -686,7 +685,7 @@ class StagingUploadFiles(UploadFiles): The calculated hash Raises: KeyError: If the mainfile does not exist. - """ + ''' hash = hashlib.sha512() for filepath in self.calc_files(mainfile): with open(self._raw_dir.join_file(filepath).os_path, 'rb') as f: @@ -702,12 +701,12 @@ class StagingUploadFiles(UploadFiles): class ArchiveBasedStagingUploadFiles(StagingUploadFiles): - """ + ''' :class:`StagingUploadFiles` based on a single uploaded archive file (.zip) Arguments: upload_path: The path to the uploaded file. - """ + ''' def __init__( self, upload_id: str, upload_path: str, *args, **kwargs) -> None: @@ -736,12 +735,12 @@ class ArchiveBasedStagingUploadFiles(StagingUploadFiles): class PublicUploadFilesBasedStagingUploadFiles(StagingUploadFiles): - """ + ''' :class:`StagingUploadFiles` based on a single uploaded archive file (.zip) Arguments: upload_path: The path to the uploaded file. - """ + ''' def __init__( self, public_upload_files: 'PublicUploadFiles', *args, **kwargs) -> None: @@ -763,9 +762,9 @@ class PublicUploadFilesBasedStagingUploadFiles(StagingUploadFiles): def add_rawfiles(self, *args, **kwargs) -> None: assert False, 'do not add_rawfiles to a %s' % self.__class__.__name__ - def pack(self, upload: UploadWithMetadata, *args, **kwargs) -> None: - """ Packs only the archive contents and stores it in the existing public upload files. """ - super().pack(upload, target_dir=self.public_upload_files, skip_raw=True) + def pack(self, entries: Iterable[datamodel.EntryMetadata], *args, **kwargs) -> None: + ''' Packs only the archive contents and stores it in the existing public upload files. ''' + super().pack(entries, target_dir=self.public_upload_files, skip_raw=True) class PublicUploadFiles(UploadFiles): @@ -952,13 +951,13 @@ class PublicUploadFiles(UploadFiles): return self._file('archive', self._archive_ext, '%s.log' % calc_id, *args, **kwargs) def re_pack( - self, upload: UploadWithMetadata, skip_raw: bool = False, + self, entries: Iterable[datamodel.EntryMetadata], skip_raw: bool = False, skip_archive: bool = False) -> None: - """ + ''' Replaces the existing public/restricted data file pairs with new ones, based on current restricted information in the metadata. Should be used after updating the restrictions on calculations. This is potentially a long running operation. 
- """ + ''' # compute a list of files to repack files = [] kinds = [] @@ -991,10 +990,10 @@ class PublicUploadFiles(UploadFiles): # perform the repacking try: if not skip_archive: - staging_upload._pack_archive_files(upload, create_zipfile) - staging_upload._pack_archive_files_msgpack(upload, write_msgfile) + staging_upload._pack_archive_files(entries, create_zipfile) + staging_upload._pack_archive_files_msgpack(entries, write_msgfile) if not skip_raw: - staging_upload._pack_raw_files(upload, create_zipfile) + staging_upload._pack_raw_files(entries, create_zipfile) finally: staging_upload.delete() diff --git a/nomad/infrastructure.py b/nomad/infrastructure.py index 84da75147b87b5309d6558aff2ff6f111902ab74..11aa8d2d6f9ff50813684d0121df6499c56ead2d 100644 --- a/nomad/infrastructure.py +++ b/nomad/infrastructure.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' This module provides function to establish connections to the database, searchengine, etc. infrastructure services. Usually everything is setup at once with :func:`setup`. This is run once for each *api* and *worker* process. Individual functions for partial setups exist to facilitate testing, :py:mod:`nomad.migration`, aspects of :py:mod:`nomad.cli`, etc. -""" +''' import os.path import shutil @@ -42,19 +42,19 @@ from nomad import config, utils logger = None elastic_client = None -""" The elastic search client. """ +''' The elastic search client. ''' mongo_client = None -""" The pymongo mongodb client. """ +''' The pymongo mongodb client. ''' def setup(): - """ + ''' Uses the current configuration (nomad/config.py and environment) to setup all the infrastructure services (repository db, mongo, elastic search) and logging. Will create client instances for the databases and has to be called before they can be used. - """ + ''' setup_logging() setup_mongo() setup_elastic() @@ -75,7 +75,7 @@ def setup_logging(): def setup_mongo(): - """ Creates connection to mongodb. """ + ''' Creates connection to mongodb. ''' global mongo_client try: mongo_client = connect(db=config.mongo.db_name, host=config.mongo.host, port=config.mongo.port) @@ -88,7 +88,7 @@ def setup_mongo(): def setup_elastic(): - """ Creates connection to elastic search. """ + ''' Creates connection to elastic search. ''' global elastic_client elastic_client = connections.create_connection( hosts=['%s:%d' % (config.elastic.host, config.elastic.port)], @@ -111,10 +111,10 @@ def setup_elastic(): class Keycloak(): - """ + ''' A class that encapsulates all keycloak related functions for easier mocking and configuration - """ + ''' def __init__(self): self.__oidc_client = None self.__admin_client = None @@ -148,7 +148,7 @@ class Keycloak(): return self.__public_keys def authorize_flask(self, basic: bool = True) -> str: - """ + ''' Authorizes the current flask request with keycloak. Uses either Bearer or Basic authentication, depending on available headers in the request. Bearer auth is basically offline (besides retrieving and caching keycloaks public key for signature @@ -157,7 +157,7 @@ class Keycloak(): Will set ``g.user``, either with None or user data from the respective OIDC token. 
Returns: An error message or None - """ + ''' g.oidc_access_token = None if 'Authorization' in request.headers and request.headers['Authorization'].startswith('Bearer '): g.oidc_access_token = request.headers['Authorization'].split(None, 1)[1].strip() @@ -235,10 +235,10 @@ class Keycloak(): pass def add_user(self, user, bcrypt_password=None, invite=False): - """ + ''' Adds the given :class:`nomad.datamodel.User` instance to the configured keycloak realm using the keycloak admin API. - """ + ''' from nomad import datamodel if not isinstance(user, datamodel.User): if 'user_id' not in user: @@ -337,12 +337,12 @@ class Keycloak(): for keycloak_user in keycloak_results] def get_user(self, user_id: str = None, username: str = None, user=None) -> object: - """ + ''' Retrives all available information about a user from the keycloak admin interface. This must be used to retrieve complete user information, because the info solely gathered from tokens (i.e. for the authenticated user ``g.user``) is generally incomplete. - """ + ''' if user is not None and user_id is None: user_id = user.user_id @@ -390,7 +390,7 @@ keycloak = Keycloak() def reset(remove: bool): - """ + ''' Resets the databases mongo, elastic/calcs, and all files. Be careful. In contrast to :func:`remove`, it will only remove the contents of dbs and indicies. This function just attempts to remove everything, there is no exception handling @@ -398,7 +398,7 @@ def reset(remove: bool): Args: remove: Do not try to recreate empty databases, remove entirely. - """ + ''' try: if not mongo_client: setup_mongo() diff --git a/nomad/metainfo/CONCEPT.md b/nomad/metainfo/CONCEPT.md index 9d1fb4324d5695a4bdd3dfda7db36c345cdb97be..f99214f37c77f143e75a1e41dff2d5e1adad0e05 100644 --- a/nomad/metainfo/CONCEPT.md +++ b/nomad/metainfo/CONCEPT.md @@ -179,9 +179,9 @@ Arbitrary serializable objects that can contain additional information. This could be code, from a python module that represents the NOMAD *common* package `nomad.metainfo.common`: ```python class System(MSection): - """ + ''' The system is ... - """ + ''' n_atoms = Quantity(type=int, derived_from='atom_labels') @@ -189,9 +189,9 @@ class System(MSection): shape=['n_atoms'], type=MEnum(ase.data.chemical_symbols), annotations=[ElasticSearchQuantity('keyword')]) - """ + ''' Atom labels are ... - """ + ''' formula_hill = Quantity(type=str, derived_from=['atom_labels']) diff --git a/nomad/metainfo/__init__.py b/nomad/metainfo/__init__.py index 9a54c3c36ac07652ba13b80bf5d239907dbd5400..0521e9d1cb0d621e00d45355459a6db6c56f0f1f 100644 --- a/nomad/metainfo/__init__.py +++ b/nomad/metainfo/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' The NOMAD meta-info allows to define schemas for physics data independent of the used storage format. It allows to define physics quantities with types, complex shapes (vetors, matrices, etc.), units, links, and descriptions. It allows to organize large @@ -32,15 +32,15 @@ Starting example from nomad.metainfo import MSection, Quantity, SubSection, Units class System(MSection): - \"\"\" + \'\'\' A system section includes all quantities that describe a single a simulated system (a.k.a. geometry). - \"\"\" + \'\'\' n_atoms = Quantity( - type=int, description=''' + type=int, description=\'\'\' A Defines the number of atoms in the system. 
- ''') + \'\'\') atom_labels = Quantity(type=MEnum(ase.data.chemical_symbols), shape['n_atoms']) atom_positions = Quantity(type=float, shape=['n_atoms', 3], unit=Units.m) @@ -146,7 +146,7 @@ A `section class` looks like this: .. code-block:: python class SectionName(BaseSection): - ''' Section description ''' + \'\'\' Section description \'\'\' m_def = Section(**section_attributes) quantity_name = Quantity(**quantity_attributes) @@ -186,7 +186,7 @@ category looks like this: .. code-block:: python class CategoryName(MCategory): - ''' Category description ''' + \'\'\' Category description \'\'\' m_def = Category(links=['http://further.explanation.eu'], categories=[ParentCategory]) Packages @@ -272,7 +272,7 @@ A more complex example .. literalinclude:: ../nomad/metainfo/example.py :language: python -""" +''' from .metainfo import MSection, MCategory, Definition, Property, Quantity, SubSection, \ Section, Category, Package, Environment, MEnum, Datetime, MProxy, MetainfoError, DeriveError, \ diff --git a/nomad/metainfo/elastic.py b/nomad/metainfo/elastic.py index 352b74b7aec3eda5f355ea1653065a2167c1ada0..c386703864b22cda1bf727d4b6cedeaedfad3d3f 100644 --- a/nomad/metainfo/elastic.py +++ b/nomad/metainfo/elastic.py @@ -12,15 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' Adds elastic search support to the metainfo. -""" +''' from . import Section, MSection def elastic_mapping(section: Section, base_cls: type) -> type: - """ Creates an elasticsearch_dsl document class from a section definition. """ + ''' Creates an elasticsearch_dsl document class from a section definition. ''' dct = { name: quantity.m_annotations['elastic']['type']() diff --git a/nomad/metainfo/example.py b/nomad/metainfo/example.py index 3be207f5678cbee913ab704b4168074a935dd2f9..3f04abd9cb0324ac0f5646dc18f51f4b09a168b5 100644 --- a/nomad/metainfo/example.py +++ b/nomad/metainfo/example.py @@ -1,4 +1,4 @@ -""" An example metainfo package. """ +''' An example metainfo package. ''' import numpy as np from datetime import datetime @@ -9,28 +9,28 @@ m_package = Package(links=['http://metainfo.nomad-coe.eu']) class SystemHash(MCategory): - """ All quantities that contribute to what makes a system unique. """ + ''' All quantities that contribute to what makes a system unique. ''' class Parsing(MSection): - """ All data that describes the NOMAD parsing of this run. + ''' All data that describes the NOMAD parsing of this run. Quantities can also be documented like this: Args: parser_name: 'Name of the used parser' parser_version: 'Version of the used parser' - """ + ''' parser_name = Quantity(type=str) parser_version = Quantity(type=str) - nomad_version = Quantity(type=str) + nomad_version = Quantity(type=str, default='latest') warnings = Quantity(type=str, shape=['0..*']) parse_time = Quantity(type=Datetime) class System(MSection): - """ All data that describes a simulated system. """ + ''' All data that describes a simulated system. ''' n_atoms = Quantity( type=int, derived=lambda system: len(system.atom_labels), @@ -63,7 +63,7 @@ class SCC(MSection): class Run(MSection): - """ All data that belongs to a single code run. """ + ''' All data that belongs to a single code run. ''' code_name = Quantity(type=str, description='The name of the code that was run.') code_version = Quantity(type=str, description='The version of the code that was run.') @@ -78,7 +78,7 @@ class Run(MSection): class VaspRun(Run): - """ All VASP specific quantities for section Run. 
""" + ''' All VASP specific quantities for section Run. ''' m_def = Section(extends_base_section=True) x_vasp_raw_format = Quantity( diff --git a/nomad/metainfo/flask_restplus.py b/nomad/metainfo/flask_restplus.py index 5ee7fa53dea772927a579d05141a1e3e37a5f733..a7621e5089cfff53a93827cf56421c1d75095cd2 100644 --- a/nomad/metainfo/flask_restplus.py +++ b/nomad/metainfo/flask_restplus.py @@ -6,7 +6,7 @@ from .metainfo import Section, Quantity, Datetime def field(quantity: Quantity): - """ Returns a flask restplus field with quantity type and shape. """ + ''' Returns a flask restplus field with quantity type and shape. ''' field = None if quantity.type == int: field = fields.Integer diff --git a/nomad/metainfo/legacy.py b/nomad/metainfo/legacy.py index 3b0a3c4f8485b035f84a9443ae781edc548d09e0..e66b5cf4c5e8e903da9aaee4f7c7fa3d33f09d7e 100644 --- a/nomad/metainfo/legacy.py +++ b/nomad/metainfo/legacy.py @@ -50,7 +50,7 @@ def from_legacy_metainfo(meta_info_env, package_names: List[str] = None) \ class LegacyMetainfoEnvironment: - """ + ''' Args: env: The metainfo environment that is used to manage the definitions. orig_legacy_env: The old metainfo :class:`InfoKindEnv` environment with the @@ -59,7 +59,7 @@ class LegacyMetainfoEnvironment: converted metainfo environment. all_legacy_defs: A dict that stores the original :class:`InfoKindEl`s by name. all_defs: A dict that stroed the converted section and category definitions. - """ + ''' def __init__(self, metainfo=Union[InfoKindEnv, str], package_names: List[str] = None, logger=None): self.logger = utils.get_logger(__name__) if logger is None else logger self.env = Environment() @@ -109,9 +109,9 @@ class LegacyMetainfoEnvironment: def convert_package( self, legacy_definitions: List[InfoKindEl], **kwargs) -> Package: - """ Converts a single legacy metainfo package, i.e. a list of :class:`InfoKindEl` + ''' Converts a single legacy metainfo package, i.e. a list of :class:`InfoKindEl` into a metainfo package. - """ + ''' package = Package(**kwargs) definition: Definition = None @@ -212,7 +212,7 @@ class LegacyMetainfoEnvironment: return package def legacy_info(self, definition: Definition, *args, **kwargs) -> InfoKindEl: - """ Creates a legacy metainfo objects for the given definition. """ + ''' Creates a legacy metainfo objects for the given definition. ''' super_names: List[str] = list() result: Dict[str, Any] = dict( name=definition.name, @@ -266,7 +266,7 @@ class LegacyMetainfoEnvironment: return InfoKindEl(*args, **result, **kwargs) def legacy_info_env(self, packages: List[Package] = None, *args, **kwargs) -> InfoKindEnv: - """ Creates a legacy metainfo environment with all definitions from the given packages. """ + ''' Creates a legacy metainfo environment with all definitions from the given packages. 
''' if packages is None: packages = self.env.packages @@ -331,7 +331,7 @@ class LegacyMetainfoEnvironment: if __name__ == '__main__': - """ Converts the old metainfo and code-generates definitions for the new metainfo """ + ''' Converts the old metainfo and code-generates definitions for the new metainfo ''' env = LegacyMetainfoEnvironment( metainfo='vasp.nomadmetainfo.json', package_names=['%s.nomadmetainfo.json' % pkg for pkg in ['common', 'public', 'vasp']]) diff --git a/nomad/metainfo/metainfo.py b/nomad/metainfo/metainfo.py index 9c96ca4b204683414f4cbe1ed646da89bc0517ef..4febbc2e5a0bb54e1fe29a560e067f3c3f055d53 100644 --- a/nomad/metainfo/metainfo.py +++ b/nomad/metainfo/metainfo.py @@ -29,6 +29,7 @@ import aniso8601 from datetime import datetime import pytz import docstring_parser +import flask_restplus.inputs m_package: 'Package' = None @@ -41,24 +42,24 @@ T = TypeVar('T') # Metainfo errors class MetainfoError(Exception): - """ Metainfo related errors. """ + ''' Metainfo related errors. ''' pass class DeriveError(MetainfoError): - """ An error occurred while computing a derived value. """ + ''' An error occurred while computing a derived value. ''' pass class MetainfoReferenceError(MetainfoError): - """ An error indicating that a reference could not be resolved. """ + ''' An error indicating that a reference could not be resolved. ''' pass # Metainfo quantity data types class MEnum(): - """Allows to define str types with values limited to a pre-set list of possible values.""" + '''Allows to define str types with values limited to a pre-set list of possible values.''' def __init__(self, *args, **kwargs): # Supports one big list in place of args if len(args) == 1 and isinstance(args[0], list): @@ -80,18 +81,18 @@ class MEnum(): class MProxy(): - """ A placeholder object that acts as reference to a value that is not yet resolved. + ''' A placeholder object that acts as reference to a value that is not yet resolved. Attributes: url: The reference represented as an URL string. - """ + ''' def __init__(self, url: str): self.url = url class DataType: - """ + ''' Allows to define custom data types that can be used in the meta-info. The metainfo supports the most types out of the box. These includes the python build-in @@ -102,21 +103,21 @@ class DataType: type checks and various value transformations. This allows to store values in the section differently from how the usermight set/get them, and it allows to have non serializeable values that are transformed on de-/serialization. - """ + ''' def set_normalize(self, section: 'MSection', quantity_def: 'Quantity', value: Any) -> Any: - """ Transforms the given value before it is set and checks its type. """ + ''' Transforms the given value before it is set and checks its type. ''' return value def get_normalize(self, section: 'MSection', quantity_def: 'Quantity', value: Any) -> Any: - """ Transforms the given value when it is get. """ + ''' Transforms the given value when it is get. ''' return value def serialize(self, section: 'MSection', quantity_def: 'Quantity', value: Any) -> Any: - """ Transforms the given value when making the section serializeable. """ + ''' Transforms the given value when making the section serializeable. ''' return value def deserialize(self, section: 'MSection', quantity_def: 'Quantity', value: Any) -> Any: - """ Transforms the given value from its serializeable form. """ + ''' Transforms the given value from its serializeable form. 
''' return value @@ -175,7 +176,7 @@ class _Unit(DataType): units = pint.UnitRegistry() -""" The default pint unit registry that should be used to give units to quantity definitions. """ +''' The default pint unit registry that should be used to give units to quantity definitions. ''' class _Callable(DataType): @@ -187,7 +188,7 @@ class _Callable(DataType): class _QuantityType(DataType): - """ Data type for defining the type of a metainfo quantity. + ''' Data type for defining the type of a metainfo quantity. A metainfo quantity type can be one of @@ -197,7 +198,7 @@ class _QuantityType(DataType): - an MEnum instance to use it's values as possible str values - a custom datatype, i.e. instance of :class:`DataType` - Any - """ + ''' def set_normalize(self, section, quantity_def, value): if value in [str, int, float, bool]: @@ -261,7 +262,7 @@ class _QuantityType(DataType): class Reference(DataType): - """ Datatype used for reference quantities. """ + ''' Datatype used for reference quantities. ''' def __init__(self, section_def: 'Section'): if not isinstance(section_def, Section): @@ -292,12 +293,15 @@ class Reference(DataType): return value + def resolve(self, section: 'MSection', quantity_def: 'Quantity', value: Any) -> 'MSection': + return section.m_resolve(value.url) + def get_normalize(self, section: 'MSection', quantity_def: 'Quantity', value: Any) -> Any: if isinstance(value, MProxy): - resolved: 'MSection' = section.m_resolve(value.url) + resolved: 'MSection' = self.resolve(section, quantity_def, value) if resolved is None: raise ReferenceError('Could not resolve %s from %s.' % (value, section)) - section.m_set(quantity_def, value) + return resolved return value @@ -311,31 +315,46 @@ class Reference(DataType): class _Datetime(DataType): - def __parse(self, datetime_str: str) -> datetime: + def _parse(self, datetime_str: str) -> datetime: try: - try: - return aniso8601.parse_datetime(datetime_str) - except ValueError: - date = aniso8601.parse_date(datetime_str) - return datetime(date.year, date.month, date.day) - except Exception: - raise TypeError('Invalid date literal "{0}"'.format(datetime_str)) + return aniso8601.parse_datetime(datetime_str) + except ValueError: + pass - def set_normalize(self, section: 'MSection', quantity_def: 'Quantity', value: Any) -> Any: + try: + return aniso8601.parse_date(datetime_str) + except ValueError: + pass + + try: + # TODO necessary? + return flask_restplus.inputs.datetime_from_rfc822(datetime_str) + except ValueError: + pass + + raise TypeError('Invalid date literal "{0}"'.format(datetime_str)) + + def _convert(self, value): if isinstance(value, str): - value = self.__parse(value) + value = self._parse(value) + + elif isinstance(value, (int, float)): + value = datetime.fromtimestamp(value) if not isinstance(value, datetime): raise TypeError('%s is not a datetime.' % value) return value + def set_normalize(self, section: 'MSection', quantity_def: 'Quantity', value: Any) -> Any: + return self._convert(value) + def serialize(self, section: 'MSection', quantity_def: 'Quantity', value: Any) -> Any: value.replace(tzinfo=pytz.utc) return value.isoformat() def deserialize(self, section: 'MSection', quantity_def: 'Quantity', value: Any) -> Any: - return self.__parse(value) + return self._convert(value) Dimension = _Dimension() @@ -365,7 +384,7 @@ class MObjectMeta(type): SectionDef = Union[str, 'Section', 'SubSection', Type[MSectionBound]] -""" Type for section definition references. +''' Type for section definition references. 
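
A minimal sketch, not part of the patch, of what the reworked `_Datetime` type accepts when a `Datetime` quantity is set: ISO 8601 strings, numeric timestamps, and `datetime` objects. The `Example` section is hypothetical; only the conversion behaviour is taken from the diff above.

```python
from datetime import datetime
from nomad.metainfo import MSection, Quantity, Datetime

class Example(MSection):
    timestamp = Quantity(type=Datetime)

example = Example()
example.timestamp = '2019-10-01T10:30:00'          # ISO 8601 string, parsed via aniso8601
example.timestamp = 1569925800                     # int/float, converted via datetime.fromtimestamp
example.timestamp = datetime(2019, 10, 1, 10, 30)  # datetime instances pass through unchanged
```
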
This can either be : @@ -373,11 +392,11 @@ This can either be : - the section definition itself - the definition of a sub section - or the section definition Python class -""" +''' class MData: - """ An interface for low-level metainfo data objects. + ''' An interface for low-level metainfo data objects. Metainfo data objects store the data of a single section instance. This interface constitutes the minimal functionality for accessing and modifying section data. @@ -386,7 +405,7 @@ class MData: All section instances will implement this interface, usually be delegating calls to a standalone implementation of this interface. This allows to configure various data backends on section instance creation. - """ + ''' def __getitem__(self, key): raise NotImplementedError() @@ -395,47 +414,52 @@ class MData: raise NotImplementedError() def m_set(self, section: 'MSection', quantity_def: 'Quantity', value: Any) -> None: - """ Set the given value for the given quantity. """ + ''' Set the given value for the given quantity. ''' raise NotImplementedError() def m_get(self, section: 'MSection', quantity_def: 'Quantity') -> Any: - """ Retrieve the given value for the given quantity. """ + ''' Retrieve the given value for the given quantity. ''' raise NotImplementedError() def m_is_set(self, section: 'MSection', quantity_def: 'Quantity') -> bool: - """ True iff this quantity was explicitely set. """ + ''' True iff this quantity was explicitely set. ''' raise NotImplementedError() def m_add_values( self, section: 'MSection', quantity_def: 'Quantity', values: Any, offset: int) -> None: - """ Add (partial) values for the given quantity of higher dimensionality. """ + ''' Add (partial) values for the given quantity of higher dimensionality. ''' raise NotImplementedError() def m_add_sub_section( self, section: 'MSection', sub_section_def: 'SubSection', sub_section: 'MSection') -> None: - """ Adds the given section instance as a sub section of the given sub section definition. """ + ''' Adds the given section instance as a sub section of the given sub section definition. ''' + raise NotImplementedError() + + def m_remove_sub_section( + self, section: 'MSection', sub_section_def: 'SubSection', index: int) -> None: + ''' Removes the given section instance as a sub section of the given sub section definition. ''' raise NotImplementedError() def m_get_sub_section( self, section: 'MSection', sub_section_def: 'SubSection', index: int) -> 'MSection': - """ Retrieves a single sub section of the given sub section definition. """ + ''' Retrieves a single sub section of the given sub section definition. ''' raise NotImplementedError() def m_get_sub_sections( self, section: 'MSection', sub_section_def: 'SubSection') -> Iterable['MSection']: - """ Retrieves all sub sections of the given sub section definition. """ + ''' Retrieves all sub sections of the given sub section definition. ''' raise NotImplementedError() def m_sub_section_count(self, section: 'MSection', sub_section_def: 'SubSection') -> int: - """ Returns the number of sub sections for the given sub section definition. """ + ''' Returns the number of sub sections for the given sub section definition. ''' raise NotImplementedError() class MDataDict(MData): - """ A simple dict backed implementaton of :class:`MData`. It is used by default. """ + ''' A simple dict backed implementaton of :class:`MData`. It is used by default. 
''' def __init__(self, dct: Dict[str, Any] = None): if dct is None: @@ -484,6 +508,15 @@ class MDataDict(MData): else: self.dct[sub_section_name] = sub_section + def m_remove_sub_section( + self, section: 'MSection', sub_section_def: 'SubSection', index: int) -> None: + + if sub_section_def.repeats: + del(self.dct[sub_section_def.name][index]) + + elif sub_section_def.name in self.dct: + del(self.dct[sub_section_def.name]) + def m_get_sub_section( self, section: 'MSection', sub_section_def: 'SubSection', index: int) -> 'MSection': @@ -510,17 +543,17 @@ class MDataDict(MData): class MResource(): - """Represents a collection of related metainfo data, i.e. a set of :class:`MSection` instances. + '''Represents a collection of related metainfo data, i.e. a set of :class:`MSection` instances. MResource allows to keep related objects together and resolve sections of certain section definitions. - """ + ''' def __init__(self): self.__data: Dict['Section', List['MSection']] = dict() self.contents: List['MSection'] = [] def create(self, section_cls: Type[MSectionBound], *args, **kwargs) -> MSectionBound: - """ Create an instance of the given section class and adds it to this resource. """ + ''' Create an instance of the given section class and adds it to this resource. ''' result = section_cls(*args, **kwargs) self.add(result) return cast(MSectionBound, result) @@ -539,11 +572,11 @@ class MResource(): self.contents.remove(section) def all(self, section_cls: Type[MSectionBound]) -> List[MSectionBound]: - """ Returns all instances of the given section class in this resource. """ + ''' Returns all instances of the given section class in this resource. ''' return cast(List[MSectionBound], self.__data.get(section_cls.m_def, [])) def unload(self): - """ Breaks all references among the contain metainfo sections to allow GC. """ + ''' Breaks all references among the contain metainfo sections to allow GC. ''' for collections in self.__data.values(): for section in collections: section.m_parent = None @@ -552,8 +585,8 @@ class MResource(): # TODO break actual references via quantities -class MSection(metaclass=MObjectMeta): - """Base class for all section instances on all meta-info levels. +class MSection(metaclass=MObjectMeta): # TODO find a way to make this a subclass of collections.abs.Mapping + '''Base class for all section instances on all meta-info levels. All `section instances` indirectly instantiate the :class:`MSection` and therefore all members of :class:`MSection` are available on all `section instances`. :class:`MSection` @@ -585,7 +618,7 @@ class MSection(metaclass=MObjectMeta): m_resource: The :class:`MResource` that contains and manages this section. - """ + ''' m_def: 'Section' = None @@ -689,7 +722,7 @@ class MSection(metaclass=MObjectMeta): constraints: Set[str] = set() event_handlers: Set[Callable] = set(m_def.event_handlers) for name, attr in cls.__dict__.items(): - # transfer names and descriptions for properties + # transfer names and descriptions for properties, init properties if isinstance(attr, Property): attr.name = name if attr.description is not None: @@ -703,6 +736,8 @@ class MSection(metaclass=MObjectMeta): else: raise NotImplementedError('Unknown property kind.') + attr.__init_property__() + if inspect.isfunction(attr): method_name = attr.__name__ @@ -856,7 +891,7 @@ class MSection(metaclass=MObjectMeta): return self.__check_np(quantity_def, value) def m_set(self, quantity_def: 'Quantity', value: Any) -> None: - """ Set the given value for the given quantity. 
""" + ''' Set the given value for the given quantity. ''' quantity_def = self.__resolve_synonym(quantity_def) if quantity_def.derived is not None: @@ -890,7 +925,7 @@ class MSection(metaclass=MObjectMeta): handler(self, quantity_def, value) def m_get(self, quantity_def: 'Quantity') -> Any: - """ Retrieve the given value for the given quantity. """ + ''' Retrieve the given value for the given quantity. ''' quantity_def = self.__resolve_synonym(quantity_def) if quantity_def.derived is not None: try: @@ -918,6 +953,10 @@ class MSection(metaclass=MObjectMeta): 'Only numpy arrays and dtypes can be used for higher dimensional ' 'quantities.') + if isinstance(quantity_def.type, Reference): + # save the resolved values for the next access to avoid re-resolve + self.m_data.m_set(self, quantity_def, value) + elif type(quantity_def.type) == np.dtype: if quantity_def.unit is not None: value = value * quantity_def.unit @@ -925,7 +964,7 @@ class MSection(metaclass=MObjectMeta): return value def m_is_set(self, quantity_def: 'Quantity') -> bool: - """ True if the given quantity is set. """ + ''' True if the given quantity is set. ''' quantity_def = self.__resolve_synonym(quantity_def) if quantity_def.derived is not None: return True @@ -933,15 +972,25 @@ class MSection(metaclass=MObjectMeta): return self.m_data.m_is_set(self, quantity_def) def m_add_values(self, quantity_def: 'Quantity', values: Any, offset: int) -> None: - """ Add (partial) values for the given quantity of higher dimensionality. """ + ''' Add (partial) values for the given quantity of higher dimensionality. ''' self.m_data.m_add_values(self, quantity_def, values, offset) def m_add_sub_section(self, sub_section_def: 'SubSection', sub_section: 'MSection') -> None: - """ Adds the given section instance as a sub section of the given sub section definition. """ + ''' Adds the given section instance as a sub section of the given sub section definition. ''' parent_index = -1 if sub_section_def.repeats: parent_index = self.m_sub_section_count(sub_section_def) + + else: + old_sub_section = self.m_data.m_get_sub_section(self, sub_section_def, -1) + if old_sub_section is not None: + old_sub_section.m_parent = None + old_sub_section.m_parent_sub_section = None + old_sub_section.m_parent_index = -1 + if self.m_resource is not None: + self.m_resource.remove(sub_section) + sub_section.m_parent = self sub_section.m_parent_sub_section = sub_section_def sub_section.m_parent_index = parent_index @@ -956,29 +1005,33 @@ class MSection(metaclass=MObjectMeta): if handler.__name__.startswith('on_add_sub_section'): handler(self, sub_section_def, sub_section) + def m_remove_sub_section(self, sub_section_def: 'SubSection', index: int) -> None: + ''' Removes the exiting section for a non repeatable sub section ''' + self.m_data.m_remove_sub_section(self, sub_section_def, index) + def m_get_sub_section(self, sub_section_def: 'SubSection', index: int) -> 'MSection': - """ Retrieves a single sub section of the given sub section definition. """ + ''' Retrieves a single sub section of the given sub section definition. ''' return self.m_data.m_get_sub_section(self, sub_section_def, index) def m_get_sub_sections(self, sub_section_def: 'SubSection') -> Iterable['MSection']: - """ Retrieves all sub sections of the given sub section definition. """ + ''' Retrieves all sub sections of the given sub section definition. 
''' return self.m_data.m_get_sub_sections(self, sub_section_def) def m_sub_section_count(self, sub_section_def: 'SubSection') -> int: - """ Returns the number of sub sections for the given sub section definition. """ + ''' Returns the number of sub sections for the given sub section definition. ''' return self.m_data.m_sub_section_count(self, sub_section_def) def m_create( self, section_cls: Type[MSectionBound], sub_section_def: 'SubSection' = None, **kwargs) -> MSectionBound: - """ Creates a section instance and adds it to this section provided there is a + ''' Creates a section instance and adds it to this section provided there is a corresponding sub section. Args: section_cls: The section class for the sub-secton to create sub_section_def: If there are multiple sub-sections for the given class, this must be used to explicitely state the sub-section definition. - """ + ''' section_def = section_cls.m_def sub_section_defs = self.m_def.all_sub_sections_by_section.get(section_def, []) @@ -1005,7 +1058,7 @@ class MSection(metaclass=MObjectMeta): return cast(MSectionBound, sub_section) def m_update(self, safe: bool = True, **kwargs): - """ Updates all quantities and sub-sections with the given arguments. """ + ''' Updates all quantities and sub-sections with the given arguments. ''' if safe: for name, value in kwargs.items(): prop = self.m_def.all_properties.get(name, None) @@ -1029,15 +1082,22 @@ class MSection(metaclass=MObjectMeta): self.m_data.m_data.dct.update(**kwargs) # type: ignore def m_as(self, section_cls: Type[MSectionBound]) -> MSectionBound: - """ 'Casts' this section to the given extending sections. """ + ''' 'Casts' this section to the given extending sections. ''' return cast(MSectionBound, self) def m_follows(self, definition: 'Section') -> bool: - """ Determines if this section's definition is or is derived from the given definition. """ + ''' Determines if this section's definition is or is derived from the given definition. ''' return self.m_def == definition or definition in self.m_def.all_base_sections - def m_to_dict(self, with_meta: bool = False) -> Dict[str, Any]: - """Returns the data of this section as a json serializeable dictionary. """ + def m_to_dict(self, with_meta: bool = False, include_defaults: bool = False) -> Dict[str, Any]: + ''' + Returns the data of this section as a json serializeable dictionary. + + Arguments: + with_meta: Include information about the section definition and the sections + position in its parent. + include_defaults: Include default values of unset quantities. 
+ ''' def items() -> Iterable[Tuple[str, Any]]: # metadata @@ -1050,81 +1110,100 @@ class MSection(metaclass=MObjectMeta): # quantities for name, quantity in self.m_def.all_quantities.items(): - if quantity.virtual or not self.m_is_set(quantity): + if quantity.virtual: continue - if self.m_is_set(quantity) and quantity.derived is None: - serialize: TypingCallable[[Any], Any] = str - if isinstance(quantity.type, DataType): + is_set = self.m_is_set(quantity) + if not is_set: + if not include_defaults or not quantity.m_is_set(Quantity.default): + continue - def data_type_serialize(value): - return quantity.type.serialize(self, quantity, value) + quantity_type = quantity.type - serialize = data_type_serialize + serialize: TypingCallable[[Any], Any] = str + if isinstance(quantity_type, Reference): - elif quantity.type in [str, int, float, bool]: - serialize = quantity.type + def reference_serialize(value): + if isinstance(value, MProxy): + return value.url + else: + return quantity_type.serialize(self, quantity, value) - elif type(quantity.type) == np.dtype: - pass + serialize = reference_serialize - elif isinstance(quantity.type, MEnum): - pass + elif isinstance(quantity_type, DataType): - elif quantity.type == Any: - def _serialize(value: Any): - if type(value) not in [str, int, float, bool, list, type(None)]: - raise MetainfoError( - 'Only python primitives are allowed for Any typed non ' - 'virtual quantities: %s of quantity %s in section %s' % - (value, quantity, self)) + def data_type_serialize(value): + return quantity_type.serialize(self, quantity, value) - return value + serialize = data_type_serialize - serialize = _serialize + elif quantity_type in [str, int, float, bool]: + serialize = quantity_type - else: - raise MetainfoError( - 'Do not know how to serialize data with type %s for quantity %s' % - (quantity.type, quantity)) + elif type(quantity_type) == np.dtype: + pass + + elif isinstance(quantity_type, MEnum): + pass + + elif quantity_type == Any: + def _serialize(value: Any): + if type(value) not in [str, int, float, bool, list, type(None)]: + raise MetainfoError( + 'Only python primitives are allowed for Any typed non ' + 'virtual quantities: %s of quantity %s in section %s' % + (value, quantity, self)) + + return value + serialize = _serialize + + else: + raise MetainfoError( + 'Do not know how to serialize data with type %s for quantity %s' % + (quantity_type, quantity)) + + if is_set: value = cast(MDataDict, self.m_data).dct[name] + else: + value = quantity.default - if type(quantity.type) == np.dtype: - serializable_value = value.tolist() + if type(quantity_type) == np.dtype: + serializable_value = value.tolist() + else: + if len(quantity.shape) == 0: + serializable_value = serialize(value) + elif len(quantity.shape) == 1: + serializable_value = [serialize(i) for i in value] else: - if len(quantity.shape) == 0: - serializable_value = serialize(value) - elif len(quantity.shape) == 1: - serializable_value = [serialize(i) for i in value] - else: - raise NotImplementedError('Higher shapes (%s) not supported: %s' % (quantity.shape, quantity)) + raise NotImplementedError('Higher shapes (%s) not supported: %s' % (quantity.shape, quantity)) - yield name, serializable_value + yield name, serializable_value # sub sections for name, sub_section_def in self.m_def.all_sub_sections.items(): if sub_section_def.repeats: if self.m_sub_section_count(sub_section_def) > 0: yield name, [ - item.m_to_dict() + item.m_to_dict(with_meta=with_meta, include_defaults=include_defaults) for item in 
self.m_get_sub_sections(sub_section_def)] else: sub_section = self.m_get_sub_section(sub_section_def, -1) if sub_section is not None: - yield name, sub_section.m_to_dict() + yield name, sub_section.m_to_dict(with_meta=with_meta, include_defaults=include_defaults) return {key: value for key, value in items()} @classmethod def m_from_dict(cls: Type[MSectionBound], dct: Dict[str, Any]) -> MSectionBound: - """ Creates a section from the given serializable data dictionary. + ''' Creates a section from the given serializable data dictionary. This is the 'opposite' of :func:`m_to_dict`. It takes a deserialised dict, e.g loaded from JSON, and turns it into a proper section, i.e. instance of the given section class. - """ + ''' section_def = cls.m_def @@ -1173,11 +1252,11 @@ class MSection(metaclass=MObjectMeta): return section def m_to_json(self, **kwargs): - """ Returns the data of this section as a json string. """ + ''' Returns the data of this section as a json string. ''' return json.dumps(self.m_to_dict(), **kwargs) def m_all_contents(self) -> Iterable['MSection']: - """ Returns an iterable over all sub and sub subs sections. """ + ''' Returns an iterable over all sub and sub subs sections. ''' for content in self.m_contents(): for sub_content in content.m_all_contents(): yield sub_content @@ -1185,7 +1264,7 @@ class MSection(metaclass=MObjectMeta): yield content def m_contents(self) -> Iterable['MSection']: - """ Returns an iterable over all direct subs sections. """ + ''' Returns an iterable over all direct subs sections. ''' for sub_section_def in self.m_def.all_sub_sections.values(): if sub_section_def.repeats: index = 0 @@ -1198,7 +1277,7 @@ class MSection(metaclass=MObjectMeta): yield sub_section def m_path(self, quantity_def: 'Quantity' = None) -> str: - """ Returns the path of this section or the given quantity within the section hierarchy. """ + ''' Returns the path of this section or the given quantity within the section hierarchy. ''' if self.m_parent is None: return '/' @@ -1213,19 +1292,21 @@ class MSection(metaclass=MObjectMeta): return '%s/%s' % (self.m_parent.m_path().rstrip('/'), segment) def m_root(self, cls: Type[MSectionBound] = None) -> MSectionBound: - """ Returns the first parent of the parent section that has no parent; the root. """ + ''' Returns the first parent of the parent section that has no parent; the root. ''' if self.m_parent is None: return cast(MSectionBound, self) else: return self.m_parent.m_root(cls) def m_parent_as(self, cls: Type[MSectionBound] = None) -> MSectionBound: - """ Returns the parent section with the given section class type. """ + ''' Returns the parent section with the given section class type. ''' return cast(MSectionBound, self.m_parent) def m_resolve(self, path: str, cls: Type[MSectionBound] = None) -> MSectionBound: - """ Resolves the given path using this section as context. """ - + ''' + Resolves the given path or dotted quantity name using this section as context and + returns the sub_section or value. 
+ ''' if path.startswith('/'): context: 'MSection' = self.m_root() else: @@ -1233,7 +1314,7 @@ class MSection(metaclass=MObjectMeta): path_stack = path.strip('/').split('/') path_stack.reverse() - while len(path_stack) > 1: + while len(path_stack) > 0: prop_name = path_stack.pop() prop_def = context.m_def.all_properties.get(prop_name, None) @@ -1275,7 +1356,7 @@ class MSection(metaclass=MObjectMeta): return cast(MSectionBound, context) def m_x(self, key: str, default=None): - """ Convinience method for get the annotation with name ``key``. """ + ''' Convinience method for get the annotation with name ``key``. ''' return self.m_annotations.get(key, default) def __validate_shape(self, quantity_def: 'Quantity', value): @@ -1301,7 +1382,7 @@ class MSection(metaclass=MObjectMeta): return True def m_validate(self): - """ Evaluates all constraints and shapes of this section and returns a list of errors. """ + ''' Evaluates all constraints and shapes of this section and returns a list of errors. ''' errors: List[str] = [] for constraint_name in self.m_def.constraints: constraint = getattr(self, 'c_%s' % constraint_name, None) @@ -1327,7 +1408,7 @@ class MSection(metaclass=MObjectMeta): return errors def m_all_validate(self): - """ Evaluates all constraints in the whole section hierarchy, incl. this section. """ + ''' Evaluates all constraints in the whole section hierarchy, incl. this section. ''' errors: List[str] = [] for section in itertools.chain([self], self.m_all_contents()): for error in section.m_validate(): @@ -1347,6 +1428,16 @@ class MSection(metaclass=MObjectMeta): return '%s:%s' % (name, m_section_name) + def __getitem__(self, key): + key = key.replace('.', '/') + return self.m_resolve(key) + + def __iter__(self): + return self.m_def.all_properties.__iter__() + + def __len__(self): + return len(self.m_def.all_properties) + class MCategory(metaclass=MObjectMeta): @@ -1374,7 +1465,7 @@ class MCategory(metaclass=MObjectMeta): # Metainfo M3 (i.e. definitions of definitions) class Definition(MSection): - """ A common base for all metainfo definitions. + ''' A common base for all metainfo definitions. All metainfo `definitions` (sections, quantities, sub-sections, packages, ...) share some common attributes. These are defined in a common base: all @@ -1403,7 +1494,7 @@ class Definition(MSection): Additional helper functions for `definitions`: .. automethod:: all_definitions - """ + ''' __all_definitions: Dict[Type[MSection], List[MSection]] = {} @@ -1423,11 +1514,11 @@ class Definition(MSection): @classmethod def all_definitions(cls: Type[MSectionBound]) -> Iterable[MSectionBound]: - """ Class method that returns all definitions of this class. + ''' Class method that returns all definitions of this class. This can be used to get a list of all globally available `defintions` or a certain kind. E.g. to get all `quantities`: ``Quantity.all_definitions()``. - """ + ''' return cast(Iterable[MSectionBound], Definition.__all_definitions.get(cls, [])) def qualified_name(self): @@ -1449,11 +1540,14 @@ class Definition(MSection): class Property(Definition): - pass + + def __init_property__(self): + ''' Is called during section initialisation to allow property initialisation ''' + pass class Quantity(Property): - """ Definition of an atomic piece of data. + ''' Definition of an atomic piece of data. Quantity definitions are the main building block of meta-info schemas. Each quantity represents a single piece of data. 
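As an illustration only (not part of the patch; the Sample class and its quantities are hypothetical), quantity definitions like the ones described here are declared as class attributes of an MSection subclass and then set like normal attributes:

from nomad.metainfo import MSection, Quantity, units

class Sample(MSection):
    # each Quantity class attribute defines one piece of data of this section
    formula = Quantity(type=str, description='A chemical formula for this sample.')
    n_atoms = Quantity(type=int)
    temperature = Quantity(type=float, unit=units.K)

sample = Sample()
sample.formula = 'H2O'      # assignment is routed through m_set, including type checks
sample.n_atoms = 3
print(sample.m_to_dict())   # -> {'formula': 'H2O', 'n_atoms': 3}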
@@ -1551,7 +1645,7 @@ class Quantity(Property): is_scalar: Derived quantity that is True, iff this quantity has shape of length 0 - """ + ''' type: 'Quantity' = None shape: 'Quantity' = None @@ -1564,6 +1658,10 @@ class Quantity(Property): # TODO derived_from = Quantity(type=Quantity, shape=['0..*']) + def __init_property__(self): + if self.derived is not None: + self.virtual = True + def __get__(self, obj, cls): if obj is None: # class (def) attribute case @@ -1610,7 +1708,7 @@ class Quantity(Property): class DirectQuantity(Quantity): - """ Used for quantities that would cause indefinite loops due to bootstrapping. """ + ''' Used for quantities that would cause indefinite loops due to bootstrapping. ''' def __init__(self, **kwargs): super().__init__(**kwargs) @@ -1637,7 +1735,7 @@ class DirectQuantity(Quantity): class SubSection(Property): - """ Defines what sections can appear as sub-sections of another section. + ''' Defines what sections can appear as sub-sections of another section. Like quantities, sub-sections are defined in a `section class` as attributes of this class. An like quantities, each sub-section definition becomes a property of @@ -1656,7 +1754,7 @@ class SubSection(Property): repeats: A boolean that determines wether this sub-section can appear multiple times in the parent section. - """ + ''' sub_section: 'Quantity' = None repeats: 'Quantity' = None @@ -1674,14 +1772,24 @@ class SubSection(Property): return obj.m_get_sub_section(self, -1) def __set__(self, obj, value): - raise NotImplementedError('Sub sections cannot be set directly. Use m_create.') + if obj is None: + raise NotImplementedError() + + if self.repeats: + raise NotImplementedError('Cannot set a repeating sub section use m_create or m_add_sub_section.') + + else: + if value is None: + obj.m_remove_sub_section(self, -1) + else: + obj.m_add_sub_section(self, value) def __delete__(self, obj): raise NotImplementedError('Deleting sub sections is not supported.') class Section(Definition): - """ Sections define blocks of related quantities and allows hierarchical data. + ''' Sections define blocks of related quantities and allows hierarchical data. Section definitions determine what quantities and sub-sections can appear in a following section instance. @@ -1756,7 +1864,7 @@ class Section(Definition): parent_section_sub_section_defs: A helper attribute that gives all sub-section definitions that this section is used in. - """ + ''' section_cls: Type[MSection] = None @@ -1820,7 +1928,7 @@ class Section(Definition): class Package(Definition): - """ Packages organize metainfo defintions alongside Python modules + ''' Packages organize metainfo defintions alongside Python modules Each Python module with metainfo Definition (explicitely or implicitely) has a member ``m_package`` with an instance of this class. Definitions (categories, sections) in @@ -1843,7 +1951,7 @@ class Package(Definition): all_definitions: A helper attribute that provides all section definitions by name. - """ + ''' section_definitions: 'SubSection' = None category_definitions: 'SubSection' = None @@ -1874,7 +1982,7 @@ class Package(Definition): class Category(Definition): - """ Categories allow to organize metainfo definitions (not metainfo data like sections do) + ''' Categories allow to organize metainfo definitions (not metainfo data like sections do) Each definition, including categories themselves, can belong to a set of categories. Categories therefore form a hierarchy of concepts that definitions can belong to, i.e. 
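A minimal, hedged sketch of what the new SubSection.__set__ above enables (System and Run are hypothetical section classes): a non-repeating sub-section can now be assigned and cleared through plain attribute access, while repeating sub-sections still require m_create or m_add_sub_section:

from nomad.metainfo import MSection, SubSection, Quantity

class System(MSection):
    formula = Quantity(type=str)

class Run(MSection):
    # a single, non-repeating sub-section
    system = SubSection(sub_section=System.m_def, repeats=False)

run = Run()
system = System()
system.formula = 'H2O'
run.system = system   # now allowed: delegates to m_add_sub_section
run.system = None     # clears it again via m_remove_sub_section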
@@ -1883,7 +1991,7 @@ class Category(Definition): Args: definitions: A helper attribute that gives all definitions that are directly or indirectly in this category. - """ + ''' def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1955,7 +2063,7 @@ SubSection.__init_cls__() class Environment(MSection): - """ Environments allow to manage many metainfo packages and quickly access all definitions. + ''' Environments allow to manage many metainfo packages and quickly access all definitions. Environments provide a name-table for large-sets of metainfo definitions that span multiple packages. It provides various functions to resolve metainfo definitions by @@ -1963,7 +2071,7 @@ class Environment(MSection): Args: packages: Packages in this environment. - """ + ''' packages = SubSection(sub_section=Package, repeats=True) diff --git a/nomad/metainfo/mongoengine.py b/nomad/metainfo/mongoengine.py index 47455a28d6e209a36547d4440ef632a47777aef8..020704f9db5175ee4954bc86ad9a6254ab356025 100644 --- a/nomad/metainfo/mongoengine.py +++ b/nomad/metainfo/mongoengine.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' Adds mongoengine supports to the metainfo. Allows to create, save, and get metainfo sections from mongoengine. Currently no sub-section support. The annotation key is "a_me", the annotation object support the following keys: - ``primary_key``: *Bool*, renders the quantity to be the primary key. - ``index``: *Bool*, adds this quantity to the index -""" +''' from typing import Any, Dict import mongoengine as me diff --git a/nomad/metainfo/optimade.py b/nomad/metainfo/optimade.py index 62378b1fab4ad0bb6ad99478a4e682ff4b578b7f..18d45824c96d1172c5bddfbad692d9ae5c8dac0d 100644 --- a/nomad/metainfo/optimade.py +++ b/nomad/metainfo/optimade.py @@ -1,11 +1,13 @@ from ase.data import chemical_symbols -from elasticsearch_dsl import Keyword, Integer, Float, InnerDoc, Nested +from elasticsearch_dsl import Keyword, Float, InnerDoc, Nested import numpy as np from . import MSection, Section, Quantity, SubSection, MEnum, units -from .elastic import elastic_mapping +from .search import SearchQuantity +# TODO move the module + def optimade_links(section: str): return [ 'https://github.com/Materials-Consortia/OPTiMaDe/blob/develop/optimade.md#%s' % @@ -29,11 +31,11 @@ class Optimade(): class Species(MSection): - """ + ''' Used to describe the species of the sites of this structure. Species can be pure chemical elements, or virtual-crystal atoms representing a statistical occupation of a given site by multiple chemical elements. - """ + ''' m_def = Section(links=optimade_links('h.6.2.13')) @@ -96,13 +98,12 @@ class Species(MSection): class OptimadeEntry(MSection): m_def = Section( links=optimade_links('h.6.2'), - a_flask=dict(skip_none=True), - a_elastic=dict(type=InnerDoc)) + a_flask=dict(skip_none=True)) elements = Quantity( type=MEnum(chemical_symbols), shape=['1..*'], links=optimade_links('h.6.2.1'), - a_elastic=dict(type=Keyword), + a_search=SearchQuantity(), a_optimade=Optimade(query=True, entry=True), description=''' Names of the different elements present in the structure. @@ -111,7 +112,7 @@ class OptimadeEntry(MSection): nelements = Quantity( type=int, links=optimade_links('h.6.2.2'), - a_elastic=dict(type=Integer), + a_search=SearchQuantity(), a_optimade=Optimade(query=True, entry=True), description=''' Number of different elements in the structure as an integer. 
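To illustrate the annotation switch in the hunks above (ExampleEntry, formula, n_atoms and total_atoms are hypothetical names), the new pattern attaches a SearchQuantity annotation, defined in the nomad/metainfo/search.py module added later in this patch, via the a_search keyword:

from nomad.metainfo import MSection, Quantity
from nomad.metainfo.search import SearchQuantity

class ExampleEntry(MSection):
    formula = Quantity(
        type=str,
        a_search=SearchQuantity())  # indexed under its own name, Keyword mapping by default

    n_atoms = Quantity(
        type=int,
        a_search=SearchQuantity(metric='sum', metric_name='total_atoms'))  # also usable as a statistics metric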
@@ -120,7 +121,7 @@ class OptimadeEntry(MSection): elements_ratios = Quantity( type=float, shape=['nelements'], links=optimade_links('h.6.2.3'), - a_elastic=dict(type=lambda: Nested(ElementRatio), mapping=ElementRatio.from_structure_entry), + a_search=SearchQuantity(es_mapping=Nested(ElementRatio), es_value=ElementRatio.from_structure_entry), a_optimade=Optimade(query=True, entry=True), description=''' Relative proportions of different elements in the structure. @@ -129,7 +130,7 @@ class OptimadeEntry(MSection): chemical_formula_descriptive = Quantity( type=str, links=optimade_links('h.6.2.4'), - a_elastic=dict(type=Keyword), + a_search=SearchQuantity(), a_optimade=Optimade(query=True, entry=True), description=''' The chemical formula for a structure as a string in a form chosen by the API @@ -139,7 +140,7 @@ class OptimadeEntry(MSection): chemical_formula_reduced = Quantity( type=str, links=optimade_links('h.6.2.5'), - a_elastic=dict(type=Keyword), + a_search=SearchQuantity(), a_optimade=Optimade(query=True, entry=True), description=''' The reduced chemical formula for a structure as a string with element symbols and @@ -149,7 +150,7 @@ class OptimadeEntry(MSection): chemical_formula_hill = Quantity( type=str, links=optimade_links('h.6.2.6'), - a_elastic=dict(type=Keyword), + a_search=SearchQuantity(), a_optimade=Optimade(query=True, entry=False), description=''' The chemical formula for a structure in Hill form with element symbols followed by @@ -159,7 +160,7 @@ class OptimadeEntry(MSection): chemical_formula_anonymous = Quantity( type=str, links=optimade_links('h.6.2.7'), - a_elastic=dict(type=Keyword), + a_search=SearchQuantity(), a_optimade=Optimade(query=True, entry=True), description=''' The anonymous formula is the chemical_formula_reduced, but where the elements are @@ -171,7 +172,7 @@ class OptimadeEntry(MSection): dimension_types = Quantity( type=int, shape=[3], links=optimade_links('h.6.2.8'), - a_elastic=dict(type=Integer, mapping=lambda a: sum(a.dimension_types)), + a_search=SearchQuantity(es_value=lambda a: sum(a.dimension_types)), a_optimade=Optimade(query=True, entry=True), description=''' List of three integers. For each of the three directions indicated by the three lattice @@ -201,7 +202,7 @@ class OptimadeEntry(MSection): nsites = Quantity( type=int, links=optimade_links('h.6.2.11'), - a_elastic=dict(type=Integer), + a_search=SearchQuantity(), a_optimade=Optimade(query=True, entry=True), description=''' An integer specifying the length of the cartesian_site_positions property. ''') @@ -220,7 +221,7 @@ class OptimadeEntry(MSection): structure_features = Quantity( type=MEnum(['disorder', 'unknown_positions', 'assemblies']), shape=['1..*'], links=optimade_links('h.6.2.15'), - a_elastic=dict(type=Keyword), + a_search=SearchQuantity(), a_optimade=Optimade(query=True, entry=True), description=''' A list of strings that flag which special features are used by the structure. 
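A brief, hedged note on the es_value callables above: rather than indexing the raw quantity, the callable derives the indexed value from the section. For dimension_types, which presumably holds one flag per lattice direction (1 for periodic, 0 otherwise, following the linked OPTiMaDe field), the indexed value is just the number of periodic directions:

# Hypothetical values, illustration only.
dimension_types = [1, 1, 0]           # e.g. a slab that is periodic along two lattice directions
indexed_value = sum(dimension_types)  # what es_value=lambda a: sum(a.dimension_types) stores, here 2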
@@ -232,6 +233,3 @@ ''') species = SubSection(sub_section=Species.m_def, repeats=True) - - -ESOptimadeEntry = elastic_mapping(OptimadeEntry.m_def, InnerDoc) diff --git a/nomad/metainfo/search.py b/nomad/metainfo/search.py new file mode 100644 index 0000000000000000000000000000000000000000..45f1d317fa3859f3c973ff8ae9d2aea9174197b8 --- /dev/null +++ b/nomad/metainfo/search.py @@ -0,0 +1,116 @@ +from typing import Callable, Any + +from nomad import metainfo + + +# TODO multi, split are more flask related +class SearchQuantity: + ''' + A metainfo quantity annotation class that defines additional properties that determine + how to search for the respective quantity. Only quantities that have this annotation will + be mapped to elastic search. + + Attributes: + name: The name of this search quantity. Will be the name in the elastic index and + the name for the search parameter. Default is the metainfo quantity name. + many_or: Indicates that an 'or' (es terms) search is performed if many values are given. + Otherwise an 'and' (es bool->should->match) is performed. Values are 'split' and + 'append' to indicate how URL search parameters should be treated. + many_and: Indicates that many values can be supplied for search. Values are 'split' and + 'append' to indicate how URL search parameters should be treated. + order_default: Indicates that this quantity is used to order search results + if no other ordering was specified. + metric: Quantity can be used to build statistics. Statistics provide a metric + value for each value of the quantity. E.g. number of datasets with a given atom label. + This defines a metric based on this quantity. Values need to be a valid + elastic search aggregation (e.g. sum, cardinality, etc.). + metric_name: If this quantity is indicated to function as a metric, the metric + needs a name. By default the quantity's name is used. + default_statistic: Indicates that this quantity is part of the default statistics. + statistic_size: + The maximum number of values in a statistic. Default is 10. + group: Indicates that this quantity can be used to group results. The value will + be the name of the group. + es_quantity: The quantity in the elastic mapping that is used to search. This is + especially useful if the quantity represents an inner document and only one + quantity of this inner object is used. Default is the name of the quantity. + es_mapping: A valid elasticsearch_dsl mapping. Default is ``Keyword()``. + es_value: A callable that is applied to a section to get a value for this quantity in the elastic index. + derived: A callable that is applied to search parameter values before search.
+ ''' + + def __init__( + self, + name: str = None, description: str = None, + many_and: str = None, many_or: str = None, + order_default: bool = False, + group: str = None, metric: str = None, metric_name: str = None, + default_statistic: bool = False, + statistic_size: int = 10, + es_quantity: str = None, + es_mapping: Any = None, + es_value: Callable[[Any], Any] = None, + derived: Callable[[Any], Any] = None): + + self.name = name + self.description = description + self.many_and = many_and + self.many_or = many_or + self.order_default = order_default + self.group = group + self.default_statistic = default_statistic + self.metric = metric + self.metric_name = metric_name + self.statistic_size = statistic_size + self.es_quantity = es_quantity + self.es_mapping = es_mapping + self.es_value = es_value + self.derived = derived + + self.prefix: str = None + self.qualified_name: str = None + + assert many_and is None or many_or is None, 'A search quantity can only be used for multi or many search' + assert many_and in [None, 'split', 'append'], 'Only split and append are valid values' + assert many_or in [None, 'split', 'append'], 'Only split and append are valid values' + + def configure(self, quantity: metainfo.Quantity, prefix: str = None): + if self.name is None: + self.name = quantity.name + + if self.description is None: + self.description = quantity.description + + if prefix is not None: + self.qualified_name = '%s.%s' % (prefix, self.name) + if self.es_quantity is not None: + self.es_quantity = '%s.%s' % (prefix, self.es_quantity) + if self.metric_name is not None: + self.metric_name = '%s.%s' % (prefix, self.metric_name) + if self.group is not None: + self.group = '%s.%s' % (prefix, self.group) + else: + self.qualified_name = self.name + + if self.es_quantity is None: + self.es_quantity = self.qualified_name + if self.metric_name is None and self.metric is not None: + self.metric_name = self.qualified_name + + @property + def argparse_action(self): + if self.many_or is not None: + return self.many_or + + if self.many_and is not None: + return self.many_and + + return None + + @property + def many(self): + return self.many_and is not None or self.many_or is not None + + +def init(section: metainfo.MSection): + pass diff --git a/nomad/normalizing/__init__.py b/nomad/normalizing/__init__.py index 20502f9951d5738732309ee998d667a7c654f207..cd527d48da12258045901808535e62c8baf18517 100644 --- a/nomad/normalizing/__init__.py +++ b/nomad/normalizing/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' After parsing calculations have to be normalized with a set of *normalizers*. In NOMAD-coe those were programmed in python (we'll reuse) and scala (we'll rewrite). @@ -29,7 +29,7 @@ There is one ABC for all normalizer: .. autoclass::nomad.normalizing.normalizer.Normalizer :members: -""" +''' from typing import List, Any, Iterable, Type diff --git a/nomad/normalizing/data/springer_msgpack.py b/nomad/normalizing/data/springer_msgpack.py index b0590f20dafcb54dfd6f4c98279e098330f73b9a..9a9bfdda45998ee9a24a7da09cec2d2744ad77f8 100644 --- a/nomad/normalizing/data/springer_msgpack.py +++ b/nomad/normalizing/data/springer_msgpack.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' Generates and queries a msgpack database of springer-related quantities downloaded from http://materials.springer.com. 
The database is stuctured as @@ -21,7 +21,7 @@ space_group_number : normalized_formula : springer_id : entry The msgpack file can be queried using ArchiveFileDB. The html parser was taken from a collection of scripts from FHI without further testing. -""" +''' import requests import re @@ -89,9 +89,9 @@ def normalize_formula(formula_str: str) -> str: def parse(htmltext: str) -> Dict[str, str]: - """ + ''' Parser the quantities in required_items from an html text. - """ + ''' soup = BeautifulSoup(htmltext, "html.parser") results = {} for item in soup.find_all(attrs={"class": "data-list__item"}): @@ -158,10 +158,10 @@ def _download(path: str, max_n_query: int = 10) -> str: def download_springer_data(max_n_query: int = 10): - """ + ''' Downloads the springer quantities related to a structure from springer and updates database. - """ + ''' # load database # querying database with unvailable dataset leads to error, # get toc keys first by making an empty query @@ -219,9 +219,9 @@ def download_springer_data(max_n_query: int = 10): def query_springer_data(normalized_formula: str, space_group_number: int) -> Dict[str, Any]: - """ + ''' Queries a msgpack database for springer-related quantities. - """ + ''' entries = query_archive(DB_NAME, {str(space_group_number): {normalized_formula: '*'}}) db_dict = {} entries = entries.get(str(space_group_number), {}).get(normalized_formula, {}) diff --git a/nomad/normalizing/normalizer.py b/nomad/normalizing/normalizer.py index 2ca1efbf906e3479aa0aa5080537f5006b494e78..ea0d70f2bfe77e8b5945f97ffae19c91464ae9ab 100644 --- a/nomad/normalizing/normalizer.py +++ b/nomad/normalizing/normalizer.py @@ -20,16 +20,16 @@ from nomad.utils import get_logger class Normalizer(metaclass=ABCMeta): - """ + ''' A base class for normalizers. Normalizers work on a :class:`AbstractParserBackend` instance for read and write. Normalizer instances are reused. Arguments: backend: The backend used to read and write data from and to. - """ + ''' domain = 'dft' - """ The domain this normalizer should be used in. Default for all normalizer is 'DFT'. """ + ''' The domain this normalizer should be used in. Default for all normalizer is 'DFT'. ''' def __init__(self, backend: AbstractParserBackend) -> None: self._backend = backend @@ -42,7 +42,7 @@ class Normalizer(metaclass=ABCMeta): class SystemBasedNormalizer(Normalizer, metaclass=ABCMeta): - """ + ''' A normalizer base class for normalizers that only touch a section_system. The normalizer is run on all section systems in a run. However, some systems, @@ -51,7 +51,7 @@ class SystemBasedNormalizer(Normalizer, metaclass=ABCMeta): Args: only_representatives: Will only normalize the `representative` systems. - """ + ''' def __init__(self, backend: AbstractParserBackend, only_representatives: bool = False): super().__init__(backend) self.only_representatives = only_representatives @@ -78,15 +78,15 @@ class SystemBasedNormalizer(Normalizer, metaclass=ABCMeta): @abstractmethod def normalize_system(self, section_system_index: int, is_representative: bool) -> bool: - """ Normalize the given section and returns True, iff successful""" + ''' Normalize the given section and returns True, iff successful''' pass def __representative_system(self): - """Used to select a representative system for this entry. + '''Used to select a representative system for this entry. Attempt to find a single section_system that is representative for the entry. The selection depends on the type of calculation. 
- """ + ''' # Try to find a frame sequence, only first found is considered try: frame_seq = self._backend['section_frame_sequence'][0] diff --git a/nomad/normalizing/optimade.py b/nomad/normalizing/optimade.py index 114b2064143fd6b0d463f193a82cad7521e22797..d0ddf643ac1f4e497cc831d48645c4fcec926314 100644 --- a/nomad/normalizing/optimade.py +++ b/nomad/normalizing/optimade.py @@ -28,19 +28,19 @@ species_re = re.compile(r'^([A-Z][a-z]?)(\d*)$') class OptimadeNormalizer(SystemBasedNormalizer): - """ + ''' This normalizer performs all produces a section all data necessary for the Optimade API. It assumes that the :class:`SystemNormalizer` was run before. - """ + ''' def __init__(self, backend): super().__init__(backend, only_representatives=True) def get_optimade_data(self, index) -> OptimadeEntry: - """ + ''' The 'main' method of this :class:`SystemBasedNormalizer`. Normalizes the section with the given `index`. Normalizes geometry, classifies, system_type, and runs symmetry analysis. - """ + ''' optimade = OptimadeEntry() def get_value(key: str, default: Any = None, numpy: bool = False, unit=None) -> Any: diff --git a/nomad/normalizing/structure.py b/nomad/normalizing/structure.py index 5632d16f118f637554468df4db012878f40079b2..b1661c56616d89882101259a0cf01694e1f524e9 100644 --- a/nomad/normalizing/structure.py +++ b/nomad/normalizing/structure.py @@ -32,7 +32,7 @@ if old_symmetry_tolerance != symmetry_tolerance: def get_normalized_wyckoff(atomic_numbers: np.array, wyckoff_letters: np.array) -> Dict[str, Dict[str, int]]: - """Returns a normalized Wyckoff sequence for the given atomic numbers and + '''Returns a normalized Wyckoff sequence for the given atomic numbers and corresponding wyckoff letters. In a normalized sequence the chemical species are "anonymized" by replacing them with upper case alphabets. @@ -45,7 +45,7 @@ def get_normalized_wyckoff(atomic_numbers: np.array, wyckoff_letters: np.array) dictionary. The dictionary contains the number of atoms for each species, where the species names have been anomymized in the form "X_<index>". - """ + ''' # Count the occurrence of each chemical species atom_count: Dict[int, int] = {} for atomic_number in atomic_numbers: @@ -106,7 +106,7 @@ def get_normalized_wyckoff(atomic_numbers: np.array, wyckoff_letters: np.array) def search_aflow_prototype(space_group: int, norm_wyckoff: dict) -> dict: - """Searches the AFLOW prototype library for a match for the given space + '''Searches the AFLOW prototype library for a match for the given space group and normalized Wyckoff sequence. The normalized Wyckoff sequence is assumed to come from the MatID symmetry routine. @@ -121,7 +121,7 @@ def search_aflow_prototype(space_group: int, norm_wyckoff: dict) -> dict: Returns: Dictionary containing the AFLOW prototype information. - """ + ''' structure_type_info = None type_descriptions = aflow_prototypes["prototypes_by_spacegroup"].get(space_group, []) for type_description in type_descriptions: diff --git a/nomad/normalizing/system.py b/nomad/normalizing/system.py index e91043ab88c15176e26e35edacce3136a8a1ceca..eaca47f82f3ea461d9765fb63234b957f1d4eb86 100644 --- a/nomad/normalizing/system.py +++ b/nomad/normalizing/system.py @@ -40,10 +40,10 @@ springer_db_connection = None def open_springer_database(): - """ + ''' Create a global connection to the Springer database in a way that each worker opens the database just once. 
- """ + ''' global springer_db_connection if springer_db_connection is None: # filepath definition in 'nomad-FAIR/nomad/config.py' @@ -59,22 +59,22 @@ def open_springer_database(): def normalized_atom_labels(atom_labels): - """ + ''' Normalizes the given atom labels: they either are labels right away, or contain additional numbers (to distinguish same species but different labels, see meta-info), or we replace them with ase placeholder atom for unknown elements 'X'. - """ + ''' return [ ase.data.chemical_symbols[0] if match is None else match.group(0) for match in [re.search(atom_label_re, atom_label) for atom_label in atom_labels]] def formula_normalizer(atoms): - """ + ''' Reads the chemical symbols in ase.atoms and returns a normalized formula. Formula normalization is on the basis of atom counting, e.g., Tc -> Tc100, SZn -> S50Zn50, Co2Nb -> Co67Nb33 - """ + ''' # chem_symb = atoms.get_chemical_symbols() atoms_counter = Counter(chem_symb) # dictionary @@ -91,10 +91,10 @@ def formula_normalizer(atoms): class SystemNormalizer(SystemBasedNormalizer): - """ + ''' This normalizer performs all system (atoms, cells, etc.) related normalizations of the legacy NOMAD-coe *stats* normalizer. - """ + ''' @staticmethod def atom_label_to_num(atom_label): @@ -109,13 +109,13 @@ class SystemNormalizer(SystemBasedNormalizer): return 0 def normalize_system(self, index, is_representative) -> bool: - """ + ''' The 'main' method of this :class:`SystemBasedNormalizer`. Normalizes the section with the given `index`. Normalizes geometry, classifies, system_type, and runs symmetry analysis. Returns: True, iff the normalization was successful - """ + ''' def get_value(key: str, default: Any = None, numpy: bool = True) -> Any: try: @@ -262,13 +262,13 @@ class SystemNormalizer(SystemBasedNormalizer): return True def system_type_analysis(self, atoms: Atoms) -> None: - """ + ''' Determine the system type with MatID. Write the system type to the backend. Args: atoms: The structure to analyse - """ + ''' system_type = config.services.unavailable_value if atoms.get_number_of_atoms() <= config.normalize.system_classification_with_clusters_threshold: try: @@ -297,7 +297,7 @@ class SystemNormalizer(SystemBasedNormalizer): self._backend.addValue('system_type', system_type) def symmetry_analysis(self, atoms) -> None: - """Analyze the symmetry of the material being simulated. + '''Analyze the symmetry of the material being simulated. We feed in the parsed values in section_system to the the symmetry analyzer. We then use the Matid library @@ -312,7 +312,7 @@ class SystemNormalizer(SystemBasedNormalizer): Returns: None: The method should write symmetry variables to the backend which is member of this class. - """ + ''' # Try to use Matid's symmetry analyzer to analyze the ASE object. try: symm = SymmetryAnalyzer(atoms, symmetry_tol=config.normalize.symmetry_tolerance) @@ -410,7 +410,7 @@ class SystemNormalizer(SystemBasedNormalizer): # SQL QUERY # (this replaces the four queries done in the old 'classify4me_SM_normalizer.py') - cur.execute(""" + cur.execute(''' SELECT entry.entry_id, entry.alphabetic_formula, @@ -425,7 +425,7 @@ class SystemNormalizer(SystemBasedNormalizer): LEFT JOIN reference ON reference.reference_nr = er.entry_nr WHERE entry.normalized_formula = ( %r ) and entry.space_group_number = '%d' GROUP BY entry.entry_id; - """ % (normalized_formula, space_group_number)) + ''' % (normalized_formula, space_group_number)) results = cur.fetchall() # 'results' is a list of tuples, i.e. 
'[(a,b,c,d), ..., (a,b,c,d)]' @@ -487,14 +487,14 @@ class SystemNormalizer(SystemBasedNormalizer): self.logger.warning('Mismatch in Springer classification or compounds') def prototypes(self, atom_species: np.array, wyckoffs: np.array, spg_number: int) -> None: - """Tries to match the material to an entry in the AFLOW prototype data. + '''Tries to match the material to an entry in the AFLOW prototype data. If a match is found, a section_prototype is added to section_system. Args: atomic_numbers: Array of atomic numbers. wyckoff_letters: Array of Wyckoff letters as strings. spg_number: Space group number. - """ + ''' norm_wyckoff = structure.get_normalized_wyckoff(atom_species, wyckoffs) protoDict = structure.search_aflow_prototype(spg_number, norm_wyckoff) if protoDict is not None: diff --git a/nomad/parsing/__init__.py b/nomad/parsing/__init__.py index 4144ce44a4cb306f1c7b423e338bb01ea35cd074..7327895df2314156ddd87da4f552763f5eb1a72a 100644 --- a/nomad/parsing/__init__.py +++ b/nomad/parsing/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' The *parsing* module is an interface for the existing NOMAD-coe parsers. This module redefines some of the old NOMAD-coe python-common functionality to create a more coherent interface to the parsers. @@ -69,7 +69,7 @@ based on NOMAD-coe's *python-common* module. :members: .. autoclass:: nomad.parsing.LocalBackend :members: -""" +''' from typing import Callable, IO, Union, Dict import magic @@ -96,7 +96,7 @@ encoding_magic = magic.Magic(mime_encoding=True) def match_parser(mainfile: str, upload_files: Union[str, files.StagingUploadFiles], strict=True) -> 'Parser': - """ + ''' Performs parser matching. This means it take the given mainfile and potentially opens it with the given callback and tries to identify a parser that can parse the file. @@ -111,7 +111,7 @@ def match_parser(mainfile: str, upload_files: Union[str, files.StagingUploadFile strict: Only match strict parsers, e.g. no artificial parsers for missing or empty entries. Returns: The parser, or None if no parser could be matched. - """ + ''' if mainfile.startswith('.') or mainfile.startswith('~'): return None @@ -484,7 +484,7 @@ if config.use_empty_parsers: parsers.append(BrokenParser()) -""" Instantiation and constructor based config of all parsers. """ +''' Instantiation and constructor based config of all parsers. ''' parser_dict = {parser.name: parser for parser in parsers} # type: ignore -""" A dict to access parsers by name. Usually 'parsers/<...>', e.g. 'parsers/vasp'. """ +''' A dict to access parsers by name. Usually 'parsers/<...>', e.g. 'parsers/vasp'. ''' diff --git a/nomad/parsing/artificial.py b/nomad/parsing/artificial.py index 969f8f2009a5bdd88548cd97b9afddc3d6a68619..fe85e9bf53dc8d0f996ff0c928aca1c75d2c629f 100644 --- a/nomad/parsing/artificial.py +++ b/nomad/parsing/artificial.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' Parser for creating artificial test, brenchmark, and demonstration data. -""" +''' import json import os.path @@ -40,7 +40,7 @@ meta_info_env, _ = loadJsonFile(filePath=meta_info_path, dependencyLoader=None, class ArtificalParser(Parser): - """ Base class for artifical parsers based on VASP metainfo. """ + ''' Base class for artifical parsers based on VASP metainfo. 
''' def __init__(self): super().__init__() self.backend = None @@ -54,9 +54,9 @@ class ArtificalParser(Parser): class EmptyParser(MatchingParser): - """ + ''' Implementation that produces an empty code_run - """ + ''' def run(self, mainfile: str, logger=None) -> LocalBackend: backend = LocalBackend(metaInfoEnv=meta_info_env, debug=False) # type: ignore backend.openSection('section_run') @@ -66,10 +66,10 @@ class EmptyParser(MatchingParser): class TemplateParser(ArtificalParser): - """ + ''' A parser that generates data based on a template given via the mainfile. The template is basically some archive json. Only - """ + ''' name = 'parsers/template' def is_mainfile( @@ -78,11 +78,11 @@ class TemplateParser(ArtificalParser): return filename.endswith('template.json') def transform_value(self, name, value): - """ allow subclasses to modify values """ + ''' allow subclasses to modify values ''' return value def transform_section(self, name, section): - """ allow subclasses to modify sections """ + ''' allow subclasses to modify sections ''' return section def add_section(self, section): @@ -130,7 +130,7 @@ class TemplateParser(ArtificalParser): class ChaosParser(ArtificalParser): - """ + ''' Parser that emulates typical error situations. Files can contain a json string (or object with key `chaos`) with one of the following string values: - exit @@ -139,7 +139,7 @@ class ChaosParser(ArtificalParser): - exception - segfault - random - """ + ''' name = 'parsers/chaos' def is_mainfile( @@ -212,7 +212,7 @@ class GenerateRandomParser(TemplateParser): return os.path.basename(filename).startswith('random_') def transform_section(self, name, section): - """ allow subclasses to modify sections """ + ''' allow subclasses to modify sections ''' if name == 'section_system': atoms = [] atom_positions = [] diff --git a/nomad/parsing/backend.py b/nomad/parsing/backend.py index 28b118b10a86facbf1e569aac217b582669db63f..0f25a91b546063789ea34d9d910041bd3a713794 100644 --- a/nomad/parsing/backend.py +++ b/nomad/parsing/backend.py @@ -55,124 +55,124 @@ class WrongContextState(Exception): class AbstractParserBackend(metaclass=ABCMeta): - """ + ''' This ABS provides the parser backend interface used by the NOMAD-coe parsers and normalizers. - """ + ''' @abstractmethod def metaInfoEnv(self): - """ Returns the meta info used by this backend. """ + ''' Returns the meta info used by this backend. ''' pass @abstractmethod def startedParsingSession( self, mainFileUri, parserInfo, parserStatus=None, parserErrors=None): - """ + ''' Should be called when the parsing starts. ParserInfo should be a valid json dictionary. - """ + ''' pass @abstractmethod def finishedParsingSession( self, parserStatus, parserErrors, mainFileUri=None, parserInfo=None, parsingStats=None): - """ Called when the parsing finishes. """ + ''' Called when the parsing finishes. ''' pass @abstractmethod def openContext(self, contextUri: str): - """ Open existing archive data to introduce new data into an existing section. """ + ''' Open existing archive data to introduce new data into an existing section. ''' pass @abstractmethod def closeContext(self, contextUri: str): - """ Close priorly opened existing archive data again. """ + ''' Close priorly opened existing archive data again. ''' pass @abstractmethod def openSection(self, metaName, parent_index=-1): - """ Opens a new section and returns its new unique gIndex. """ + ''' Opens a new section and returns its new unique gIndex. 
''' pass @abstractmethod def closeSection(self, metaName, gIndex): - """ + ''' Closes the section with the given meta name and index. After this, no more value can be added to this section. - """ + ''' pass @abstractmethod def openNonOverlappingSection(self, metaName): - """ Opens a new non overlapping section. """ + ''' Opens a new non overlapping section. ''' pass @abstractmethod def setSectionInfo(self, metaName, gIndex, references): - """ + ''' Sets info values of an open section references should be a dictionary with the gIndexes of the root sections this section refers to. - """ + ''' pass @abstractmethod def closeNonOverlappingSection(self, metaName): - """ + ''' Closes the current non overlapping section for the given meta name. After this, no more value can be added to this section. - """ + ''' pass @abstractmethod def openSections(self): - """ Returns the sections that are still open as metaName, gIndex tuples. """ + ''' Returns the sections that are still open as metaName, gIndex tuples. ''' pass @abstractmethod def addValue(self, metaName, value, gIndex=-1): - """ + ''' Adds a json value for the given metaName. The gIndex is used to identify the right parent section. - """ + ''' pass @abstractmethod def addRealValue(self, metaName, value, gIndex=-1): - """ + ''' Adds a float value for the given metaName. The gIndex is used to identify the right parent section. - """ + ''' pass @abstractmethod def addArray(self, metaName, shape, gIndex=-1): - """ + ''' Adds an unannitialized array of the given shape for the given metaName. The gIndex is used to identify the right parent section. This is neccessary before array values can be set with :func:`setArrayValues`. - """ + ''' @abstractmethod def setArrayValues(self, metaName, values, offset=None, gIndex=-1): - """ + ''' Adds values of the given numpy array to the last array added for the given metaName and parent gIndex. - """ + ''' pass @abstractmethod def addArrayValues(self, metaName, values, gIndex=-1, override: bool = False): - """ + ''' Adds an array with the given numpy array values for the given metaName and parent section gIndex. Override determines whether to rewrite exisiting values in the backend. - """ + ''' pass @abstractmethod def pwarn(self, msg): - """ Used to catch parser warnings. """ + ''' Used to catch parser warnings. ''' pass # The following are extensions to the origin NOMAD-coe parser backend. And allow @@ -185,34 +185,34 @@ class AbstractParserBackend(metaclass=ABCMeta): @abstractmethod def get_sections(self, meta_name: str, g_index: int = -1) -> List[int]: - """ Return all gIndices for existing sections of the given meta_name and parent section index. """ + ''' Return all gIndices for existing sections of the given meta_name and parent section index. ''' pass @abstractmethod def get_value(self, metaName: str, g_index=-1) -> Any: - """ + ''' Return the value set to the given meta_name in its parent section of the given index. An index of -1 (default) is only allowed if there is exactly one parent section. - """ + ''' pass def write_json( self, out: TextIO, pretty=True, filter: Callable[[str, Any], Any] = None, root_sections: List[str] = ['section_run', 'section_entry_info']): - """ Writes the backend contents. """ + ''' Writes the backend contents. ''' pass def add_mi2_section(self, section: MSection): - """ Allows to mix a metainfo2 style section into backend. """ + ''' Allows to mix a metainfo2 style section into backend. 
''' pass def get_mi2_section(self, section_def: MI2Section): - """ Allows to mix a metainfo2 style section into backend. """ + ''' Allows to mix a metainfo2 style section into backend. ''' pass def traverse(self, *args, **kwargs) -> Iterable[Tuple[str, str, Any]]: - """ Traverses the backend data and yiels tuples with metainfo name, event type, - and value """ + ''' Traverses the backend data and yiels tuples with metainfo name, event type, + and value ''' pass @@ -222,7 +222,7 @@ class JSONStreamWriter(): ARRAY = 2 KEY_VALUE = 3 - """ + ''' A generator that allows to output JSON based on calling 'event' functions. Its pure python and could be replaced by some faster implementation, e.g. yajl-py. It uses standard json decode to write values. This allows to mix streaming with @@ -236,7 +236,7 @@ class JSONStreamWriter(): Raises: AssertionError: If methods were called in a non JSON fashion. Call :func:`close` to make sure everything was closed properly. - """ + ''' def __init__(self, file, pretty=False): self._fp = file self._pretty = pretty @@ -335,10 +335,10 @@ class JSONStreamWriter(): class LegacyParserBackend(AbstractParserBackend): - """ + ''' Partial implementation of :class:`AbstractParserBackend` that implements some methods that are independent from the core backend implementation. - """ + ''' def __init__(self, logger): self.logger = logger if logger is not None else get_logger(__name__) @@ -365,10 +365,10 @@ class LegacyParserBackend(AbstractParserBackend): self._warnings.append('There are more warnings, check the processing logs.') def _parse_context_uri(self, context_uri: str) -> Tuple[str, int]: - """ + ''' Returns the last segment of the given context uri, i.e. the section that constitutes the context. - """ + ''' path_str = re.sub(r'^(nmd://[^/]+/[^/]+)?/', '', context_uri, count=1) path = path_str.split('/')[::-1] # reversed path via extended slice syntax @@ -388,7 +388,7 @@ class LegacyParserBackend(AbstractParserBackend): @property def status(self) -> ParserStatus: - """ Returns status and potential errors. """ + ''' Returns status and potential errors. ''' return (self._status, self._errors) def reset_status(self) -> None: @@ -398,12 +398,12 @@ class LegacyParserBackend(AbstractParserBackend): class LocalBackend(LegacyParserBackend, metaclass=DelegatingMeta): - """ + ''' This implementation of :class:`AbstractParserBackend` is a extended version of NOMAD-coe's ``LocalBackend`` that allows to write the results in an *archive*-style .json. It can be used like the original thing, but also allows to output archive JSON after parsing via :func:`write_json`. - """ + ''' def __init__(self, *args, **kwargs): logger = kwargs.pop('logger', None) super().__init__(logger=logger) @@ -417,7 +417,7 @@ class LocalBackend(LegacyParserBackend, metaclass=DelegatingMeta): return self.data[metaname] def __getattr__(self, name): - """ Support for unimplemented and unexpected methods. """ + ''' Support for unimplemented and unexpected methods. ''' if name not in self._known_attributes and self._unknown_attributes.get(name) is None: self.logger.debug('Access of unexpected backend attribute/method', attribute=name) self._unknown_attributes[name] = name @@ -425,11 +425,11 @@ class LocalBackend(LegacyParserBackend, metaclass=DelegatingMeta): return getattr(self._delegate, name) def add_mi2_section(self, section: MSection): - """ Allows to mix a metainfo2 style section into backend. """ + ''' Allows to mix a metainfo2 style section into backend. 
''' self.mi2_data[section.m_def.name] = section def get_mi2_section(self, section_def: MI2Section): - """ Allows to mix a metainfo2 style section into backend. """ + ''' Allows to mix a metainfo2 style section into backend. ''' return self.mi2_data.get(section_def.name, None) def finishedParsingSession(self, *args, **kwargs): @@ -558,7 +558,7 @@ class LocalBackend(LegacyParserBackend, metaclass=DelegatingMeta): def write_json( self, out: TextIO, pretty=True, filter: Callable[[str, Any], Any] = None, root_sections: List[str] = ['section_run', 'section_entry_info']): - """ + ''' Writes the results stored in the backend after parsing in an 'archive'.json style format. @@ -566,7 +566,7 @@ class LocalBackend(LegacyParserBackend, metaclass=DelegatingMeta): out: The file-like that is used to write the json to. pretty: Format the json or not. filter: Optional filter that takes metaname, value pairs and returns a new value. - """ + ''' json_writer = JSONStreamWriter(out, pretty=pretty) json_writer.open_object() diff --git a/nomad/parsing/metainfo.py b/nomad/parsing/metainfo.py index 618b1bd8638b4d0100ac4cdfd2a4295e1c5edb1a..8db70c5b27c330b5950fd6918c380a210c3664bb 100644 --- a/nomad/parsing/metainfo.py +++ b/nomad/parsing/metainfo.py @@ -29,7 +29,7 @@ from .backend import LegacyParserBackend class MetainfoBackend(LegacyParserBackend): - """ A backend that uses the new metainfo to store all data. """ + ''' A backend that uses the new metainfo to store all data. ''' def __init__(self, env: LegacyMetainfoEnvironment, logger=None): super().__init__(logger=logger) @@ -69,22 +69,22 @@ class MetainfoBackend(LegacyParserBackend): return current def openContext(self, context_uri: str): - """ Open existing archive data to introduce new data into an existing section. """ + ''' Open existing archive data to introduce new data into an existing section. ''' resolved = self.resolve_context(context_uri) self.open_sections_by_def.setdefault(resolved.m_def, []).append(resolved) def closeContext(self, context_uri: str): - """ Close priorly opened existing archive data again. """ + ''' Close priorly opened existing archive data again. ''' resolved = self.resolve_context(context_uri) self.open_sections_by_def.setdefault(resolved.m_def, []).remove(resolved) def openSection(self, name): - """ + ''' It will assume that there is a sub-section def with the given name. It will use the latest opened section of the sub-sections parent as the parent for the new section. An Exception will be known root sections, e.g. 'section_run'. - """ + ''' if name in ['section_run', 'section_entry_info']: section_def = self.env.resolve_definition(name, Section) sub_section = self.resource.create(section_def.section_cls) @@ -108,7 +108,7 @@ class MetainfoBackend(LegacyParserBackend): return sub_section.m_parent_index def get_open_section_for_quantity(self, name, g_index): - """ Returns the open section that contains the quantity of the given name. """ + ''' Returns the open section that contains the quantity of the given name. ''' quantity_def = self.env.resolve_definition(name, Quantity) section_def = quantity_def.m_parent_as(Section) sections = self.open_sections_by_def.get(section_def, []) @@ -149,10 +149,10 @@ class MetainfoBackend(LegacyParserBackend): return self.openSection(metaName) def setSectionInfo(self, metaName, gIndex, references): - """ + ''' Sets info values of an open section references should be a dictionary with the gIndexes of the root sections this section refers to. 
- """ + ''' # TODO might be necessary to make references work? pass @@ -160,7 +160,7 @@ class MetainfoBackend(LegacyParserBackend): return self.closeSection(name, -1) def openSections(self): - """ Returns the sections that are still open as metaName, gIndex tuples. """ + ''' Returns the sections that are still open as metaName, gIndex tuples. ''' for section_def, sub_sections in self.open_sections_by_def: for sub_section in sub_sections: yield section_def.name, sub_section.m_parent_index @@ -187,26 +187,26 @@ class MetainfoBackend(LegacyParserBackend): self.addValue(name, value, g_index) def addArray(self, name, shape, g_index=-1): - """ + ''' Adds an unannitialized array of the given shape for the given metaName. The gIndex is used to identify the right parent section. This is neccessary before array values can be set with :func:`setArrayValues`. - """ + ''' raise NotImplementedError() def setArrayValues(self, metaName, values, offset=None, gIndex=-1): - """ + ''' Adds values of the given numpy array to the last array added for the given metaName and parent gIndex. - """ + ''' raise NotImplementedError() def addArrayValues(self, name, values, g_index=-1, override: bool = False): - """ + ''' Adds an array with the given numpy array values for the given metaName and parent section gIndex. Override determines whether to rewrite exisiting values in the backend. - """ + ''' section, quantity_def = self.get_open_section_for_quantity(name, g_index) if isinstance(quantity_def.type, Reference): # quantity is a reference @@ -239,17 +239,17 @@ class MetainfoBackend(LegacyParserBackend): 'This method does not make sense in the context of the new metainfo.') def get_sections(self, meta_name: str, g_index: int = -1) -> List[int]: - """ Return all gIndices for existing sections of the given meta_name and parent index. """ + ''' Return all gIndices for existing sections of the given meta_name and parent index. ''' section_def = self.env.resolve_definition(meta_name, Section) return [ section.m_parent_index for section in self.resource.all(section_def.section_cls) if g_index == -1 or section.m_parent.m_parent_index == g_index] def get_value(self, meta_name: str, g_index=-1) -> Any: - """ + ''' Return the value set to the given meta_name in its parent section of the given index. An index of -1 (default) is only allowed if there is exactly one parent section. - """ + ''' try: quantity = self.env.resolve_definition(meta_name, Quantity) except KeyError: diff --git a/nomad/parsing/parser.py b/nomad/parsing/parser.py index a9c14db93fd156ae2c7a248704d4c869ab0c7cff..b4204634ff5c3d6cd42dc8f9f6f04743cf4757ef 100644 --- a/nomad/parsing/parser.py +++ b/nomad/parsing/parser.py @@ -29,10 +29,10 @@ from nomad.parsing.backend import LocalBackend class Parser(metaclass=ABCMeta): - """ + ''' Instances specify a parser. It allows to find *main files* from given uploaded and extracted files. Further, allows to run the parser on those 'main files'. - """ + ''' def __init__(self): self.domain = 'dft' @@ -41,7 +41,7 @@ class Parser(metaclass=ABCMeta): def is_mainfile( self, filename: str, mime: str, buffer: bytes, decoded_buffer: str, compression: str = None) -> bool: - """ + ''' Checks if a file is a mainfile for the parsers. 
Arguments: @@ -49,12 +49,12 @@ class Parser(metaclass=ABCMeta): mime: The mimetype of the mainfile guessed with libmagic buffer: The first 2k of the mainfile contents compression: The compression of the mainfile ``[None, 'gz', 'bz2']`` - """ + ''' pass @abstractmethod def run(self, mainfile: str, logger=None) -> LocalBackend: - """ + ''' Runs the parser on the given mainfile. It uses :class:`LocalBackend` as a backend. The meta-info access is handled by the underlying NOMAD-coe parser. @@ -64,14 +64,14 @@ class Parser(metaclass=ABCMeta): Returns: The used :class:`LocalBackend` with status information and result data. - """ + ''' class BrokenParser(Parser): - """ + ''' A parser implementation that just fails and is used to match mainfiles with known patterns of corruption. - """ + ''' def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.name = 'parser/broken' @@ -97,7 +97,7 @@ class BrokenParser(Parser): class MatchingParser(Parser): - """ + ''' A parser implementation that used regular experessions to match mainfiles. Arguments: @@ -107,7 +107,7 @@ class MatchingParser(Parser): mainfile_name_re: A regexp that is used to match the paths of potential mainfiles domain: The domain that this parser should be used for. Default is 'dft'. supported_compressions: A list of [gz, bz2], if the parser supports compressed files - """ + ''' def __init__( self, name: str, code_name: str, mainfile_contents_re: str = None, @@ -153,10 +153,10 @@ class MatchingParser(Parser): class MissingParser(MatchingParser): - """ + ''' A parser implementation that just fails and is used to match mainfiles with known patterns of corruption. - """ + ''' def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -165,14 +165,14 @@ class MissingParser(MatchingParser): class LegacyParser(MatchingParser): - """ + ''' A parser implementation for legacy NOMAD-coe parsers. It assumes that parsers are installed to the python environment. Arguments: parser_class_name: the main parser class that implements NOMAD-coe's backend_factory: a callable that returns a backend, takes meta_info and logger as argument - """ + ''' def __init__(self, parser_class_name: str, *args, backend_factory=None, **kwargs) -> None: super().__init__(*args, **kwargs) @@ -212,11 +212,11 @@ class LegacyParser(MatchingParser): class VaspOutcarParser(LegacyParser): - """ + ''' LegacyParser that only matches mailfiles, if there is no .xml in the same directory, i.e. to use the VASP OUTCAR parser in absence of .xml output file. - """ + ''' def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.name = 'parsers/vaspoutcar' diff --git a/nomad/processing/__init__.py b/nomad/processing/__init__.py index 4454cdbc60f56ba267f741b804d5d7de86c11dbc..34cd30e2e9ad969498f7d486c937adec238e8e02 100644 --- a/nomad/processing/__init__.py +++ b/nomad/processing/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' Processing comprises everything that is necessary to take an uploaded user file, processes it, and store all necessary data for *repository*, *archive*, and potential future services (e.g. *encyclopedia*). @@ -54,7 +54,7 @@ classes do represent the processing state, as well as the respective entity. :members: .. 
autoclass:: nomad.processing.data.Calc :members: -""" +''' from nomad.processing.base import app, InvalidId, ProcNotRegistered, SUCCESS, FAILURE, \ RUNNING, PENDING, PROCESS_COMPLETED, PROCESS_RUNNING, ProcessAlreadyRunning diff --git a/nomad/processing/base.py b/nomad/processing/base.py index 0fd5c3dc9b17a95aac3c946812a8741934de4a11..3dc11be8791e83a41a0ff1ca623e6899f1e563ea 100644 --- a/nomad/processing/base.py +++ b/nomad/processing/base.py @@ -107,7 +107,7 @@ class ProcMetaclass(TopLevelDocumentMetaclass): class Proc(Document, metaclass=ProcMetaclass): - """ + ''' Base class for objects that are involved in processing and need persistent processing state. @@ -133,14 +133,14 @@ class Proc(Document, metaclass=ProcMetaclass): complete_time: the time that processing completed (successfully or not) current_process: the currently or last run asyncronous process process_status: the status of the currently or last run asyncronous process - """ + ''' meta: Any = { 'abstract': True, } tasks: List[str] = None - """ the ordered list of tasks that comprise a processing run """ + ''' the ordered list of tasks that comprise a processing run ''' current_task = StringField(default=None) tasks_status = StringField(default=CREATED) @@ -158,17 +158,17 @@ class Proc(Document, metaclass=ProcMetaclass): @property def tasks_running(self) -> bool: - """ Returns True of the process has failed or succeeded. """ + ''' Returns True of the process has failed or succeeded. ''' return self.tasks_status not in [SUCCESS, FAILURE] @property def process_running(self) -> bool: - """ Returns True of an asynchrounous process is currently running. """ + ''' Returns True of an asynchrounous process is currently running. ''' return self.process_status is not None and self.process_status != PROCESS_COMPLETED @classmethod def process_running_mongoengine_query(cls): - """ Returns a mongoengine query dict (to be used in objects) to find running processes. """ + ''' Returns a mongoengine query dict (to be used in objects) to find running processes. ''' return dict(process_status__in=[PROCESS_CALLED, PROCESS_RUNNING]) def get_logger(self): @@ -179,9 +179,9 @@ class Proc(Document, metaclass=ProcMetaclass): @classmethod def create(cls, **kwargs): - """ Factory method that must be used instead of regular constructor. """ + ''' Factory method that must be used instead of regular constructor. ''' assert 'tasks_status' not in kwargs, \ - """ do not set the status manually, its managed """ + ''' do not set the status manually, its managed ''' kwargs.setdefault('create_time', datetime.utcnow()) self = cls(**kwargs) @@ -194,7 +194,7 @@ class Proc(Document, metaclass=ProcMetaclass): return self def reset(self, worker_hostname: str = None): - """ Resets the task chain. Assumes there no current running process. """ + ''' Resets the task chain. Assumes there no current running process. ''' assert not self.process_running self.current_task = None @@ -206,7 +206,7 @@ class Proc(Document, metaclass=ProcMetaclass): @classmethod def reset_pymongo_update(cls, worker_hostname: str = None): - """ Returns a pymongo update dict part to reset calculations. """ + ''' Returns a pymongo update dict part to reset calculations. ''' return dict( current_task=None, process_status=None, tasks_status=PENDING, errors=[], warnings=[], worker_hostname=worker_hostname) @@ -244,7 +244,7 @@ class Proc(Document, metaclass=ProcMetaclass): logger.critical(msg, **kwargs) def fail(self, *errors, log_level=logging.ERROR, **kwargs): - """ Allows to fail the process. 
Takes strings or exceptions as args. """ + ''' Allows to fail the process. Takes strings or exceptions as args. ''' assert self.process_running or self.tasks_running, 'Cannot fail a completed process.' failed_with_exception = False @@ -274,7 +274,7 @@ class Proc(Document, metaclass=ProcMetaclass): self.save() def warning(self, *warnings, log_level=logging.WARNING, **kwargs): - """ Allows to save warnings. Takes strings or exceptions as args. """ + ''' Allows to save warnings. Takes strings or exceptions as args. ''' assert self.process_running or self.tasks_running logger = self.get_logger(**kwargs) @@ -326,30 +326,30 @@ class Proc(Document, metaclass=ProcMetaclass): self.get_logger().info('completed process') def on_tasks_complete(self): - """ Callback that is called when the list of task are completed """ + ''' Callback that is called when the list of tasks is completed ''' pass def on_process_complete(self, process_name): - """ Callback that is called when the corrent process completed """ + ''' Callback that is called when the current process completed ''' pass def block_until_complete(self, interval=0.01): - """ + ''' Reloads the process constantly until it sees a completed process. Should be used with care as it can block indefinitely. Just intended for testing purposes. - """ + ''' while self.tasks_running or self.process_running: time.sleep(interval) self.reload() @classmethod def process_all(cls, func, query: Dict[str, Any], exclude: List[str] = []): - """ + ''' Allows to run process functions for all objects on the given query. Calling process functions though the func:`process` wrapper might be slow, because it causes a save on each call. This function will use a query based update to do the same for all objects at once. - """ + ''' running_query = dict(cls.process_running_mongoengine_query()) running_query.update(query) @@ -388,14 +388,14 @@ class Proc(Document, metaclass=ProcMetaclass): def task(func): - """ + ''' The decorator for tasks that will be wrapped in exception handling that will fail the process. The task methods of a :class:`Proc` class/document comprise a sequence (order of methods in class namespace) of tasks. Tasks must be executed in that order. Completion of the last task, will put the :class:`Proc` instance into the SUCCESS state. Calling the first task will put it into RUNNING state. Tasks will only be executed, if the process has not yet reached FAILURE state. - """ + ''' @functools.wraps(func) def wrapper(self, *args, **kwargs): try: @@ -425,20 +425,20 @@ def task(func): def all_subclasses(cls): - """ Helper method to calculate set of all subclasses of a given class. """ + ''' Helper method to calculate set of all subclasses of a given class. ''' return set(cls.__subclasses__()).union( [s for c in cls.__subclasses__() for s in all_subclasses(c)]) all_proc_cls = {cls.__name__: cls for cls in all_subclasses(Proc)} -""" Name dictionary for all Proc classes. """ +''' Name dictionary for all Proc classes. ''' class NomadCeleryRequest(Request): - """ + ''' A custom celery request class that allows to catch error in the worker main thread, which cannot be caught on the worker threads themselves. - """ + ''' def _fail(self, event, **kwargs): args = self._payload[0] @@ -480,9 +480,9 @@ class NomadCeleryTask(Task): def unwarp_task(task, cls_name, self_id, *args, **kwargs): - """ + ''' Retrieves the proc object that the given task is executed on from the database. 
- """ + ''' logger = utils.get_logger(__name__, cls=cls_name, id=self_id) # get the process class @@ -521,13 +521,13 @@ def unwarp_task(task, cls_name, self_id, *args, **kwargs): acks_late=config.celery.acks_late, soft_time_limit=config.celery.timeout, time_limit=config.celery.timeout * 2) def proc_task(task, cls_name, self_id, func_attr): - """ + ''' The celery task that is used to execute async process functions. It ignores results, since all results are handled via the self document. It retries for 3 times with a countdown of 3 on missing 'selfs', since this might happen in sharded, distributed mongo setups where the object might not have yet been propagated and therefore appear missing. - """ + ''' self = unwarp_task(task, cls_name, self_id) logger = self.get_logger() @@ -576,14 +576,14 @@ def proc_task(task, cls_name, self_id, func_attr): def process(func): - """ + ''' The decorator for process functions that will be called async via celery. All calls to the decorated method will result in celery task requests. To transfer state, the instance will be saved to the database and loading on the celery task worker. Process methods can call other (process) functions/methods on other :class:`Proc` instances. Each :class:`Proc` instance can only run one any process at a time. - """ + ''' @functools.wraps(func) def wrapper(self, *args, **kwargs): assert len(args) == 0 and len(kwargs) == 0, 'process functions must not have arguments' diff --git a/nomad/processing/data.py b/nomad/processing/data.py index 465c684af773bc69abd70f5b58bb25af1abc4110..7a58323da1798476f82a242b6a4e3a8f450f7afd 100644 --- a/nomad/processing/data.py +++ b/nomad/processing/data.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' This module comprises a set of persistent document classes that hold all user related data. These are information about users, their uploads and datasets, the associated calculations, and files @@ -22,9 +22,9 @@ calculations, and files .. autoclass:: Upload -""" +''' -from typing import cast, List, Any, ContextManager, Tuple, Generator, Dict, cast +from typing import cast, List, Any, ContextManager, Tuple, Generator, Dict, cast, Iterable from mongoengine import StringField, DateTimeField, DictField, BooleanField, IntField import logging from structlog import wrap_logger @@ -41,7 +41,6 @@ from nomad.files import PathObject, UploadFiles, ExtractError, ArchiveBasedStagi from nomad.processing.base import Proc, process, task, PENDING, SUCCESS, FAILURE from nomad.parsing import parser_dict, match_parser, LocalBackend from nomad.normalizing import normalizers -from nomad.datamodel import UploadWithMetadata def _pack_log_event(logger, method_name, event_dict): @@ -66,14 +65,8 @@ _log_processors = [ TimeStamper(fmt="%Y-%m-%d %H:%M.%S", utc=False)] -_all_root_sections = [] -for domain in datamodel.Domain.instances.values(): - for root_section in domain.root_sections: - _all_root_sections.append(root_section) - - class Calc(Proc): - """ + ''' Instances of this class represent calculations. This class manages the elastic search index entry, files, and archive for the respective calculation. 
@@ -88,8 +81,8 @@ class Calc(Proc): upload_id: the id of the upload used to create this calculation mainfile: the mainfile (including path in upload) that was used to create this calc - metadata: the metadata record wit calc and user metadata, see :class:`datamodel.CalcWithMetadata` - """ + metadata: the metadata record with calc and user metadata, see :class:`datamodel.EntryMetadata` + ''' calc_id = StringField(primary_key=True) upload_id = StringField() mainfile = StringField() @@ -120,12 +113,12 @@ class Calc(Proc): self._calc_proc_logwriter_ctx: ContextManager = None @classmethod - def from_calc_with_metadata(cls, calc_with_metadata): + def from_entry_metadata(cls, entry_metadata): calc = Calc.create( - calc_id=calc_with_metadata.calc_id, - upload_id=calc_with_metadata.upload_id, - mainfile=calc_with_metadata.mainfile, - metadata=calc_with_metadata.to_dict()) + calc_id=entry_metadata.calc_id, + upload_id=entry_metadata.upload_id, + mainfile=entry_metadata.mainfile, + metadata=entry_metadata.m_to_dict(include_defaults=True)) return calc @@ -152,10 +145,10 @@ class Calc(Proc): return self._upload_files def get_logger(self, **kwargs): - """ + ''' Returns a wrapped logger that additionally saves all entries to the calculation processing log in the archive. - """ + ''' logger = super().get_logger() logger = logger.bind( upload_id=self.upload_id, mainfile=self.mainfile, calc_id=self.calc_id, **kwargs) @@ -189,11 +182,11 @@ class Calc(Proc): @process def re_process_calc(self): - """ + ''' Processes a calculation again. This means there is already metadata and instead of creating it initially, we are just updating the existing records. - """ + ''' parser = match_parser(self.mainfile, self.upload_files, strict=False) if parser is None and not config.reprocess_unmatched: @@ -228,16 +221,16 @@ class Calc(Proc): parser=parser.name) try: - calc_with_metadata = datamodel.CalcWithMetadata(**self.metadata) - calc_with_metadata.upload_id = self.upload_id - calc_with_metadata.calc_id = self.calc_id - calc_with_metadata.calc_hash = self.upload_files.calc_hash(self.mainfile) - calc_with_metadata.mainfile = self.mainfile - calc_with_metadata.nomad_version = config.version - calc_with_metadata.nomad_commit = config.commit - calc_with_metadata.last_processing = datetime.utcnow() - calc_with_metadata.files = self.upload_files.calc_files(self.mainfile) - self.metadata = calc_with_metadata.to_dict() + entry_metadata = datamodel.EntryMetadata.m_from_dict(self.metadata) + entry_metadata.upload_id = self.upload_id + entry_metadata.calc_id = self.calc_id + entry_metadata.calc_hash = self.upload_files.calc_hash(self.mainfile) + entry_metadata.mainfile = self.mainfile + entry_metadata.nomad_version = config.version + entry_metadata.nomad_commit = config.commit + entry_metadata.last_processing = datetime.utcnow() + entry_metadata.files = self.upload_files.calc_files(self.mainfile) + self.metadata = entry_metadata.m_to_dict(include_defaults=True) self.parsing() self.normalizing() @@ -253,10 +246,10 @@ class Calc(Proc): @process def process_calc(self): - """ + ''' Processes a new calculation that has no prior records in the mongo, elastic, or filesystem storage. It will create an initial set of (user) metadata. 
- """ + ''' logger = self.get_logger() if self.upload is None: logger.error('calculation upload does not exist') @@ -264,23 +257,23 @@ class Calc(Proc): try: # save preliminary minimum calc metadata in case processing fails # successful processing will replace it with the actual metadata - calc_with_metadata = datamodel.CalcWithMetadata( - domain=parser_dict[self.parser].domain, - upload_id=self.upload_id, - calc_id=self.calc_id, - calc_hash=self.upload_files.calc_hash(self.mainfile), - mainfile=self.mainfile) - calc_with_metadata.published = False - calc_with_metadata.uploader = self.upload.user_id - calc_with_metadata.upload_time = self.upload.upload_time - calc_with_metadata.upload_name = self.upload.name - calc_with_metadata.nomad_version = config.version - calc_with_metadata.nomad_commit = config.commit - calc_with_metadata.last_processing = datetime.utcnow() - calc_with_metadata.files = self.upload_files.calc_files(self.mainfile) - self.metadata = calc_with_metadata.to_dict() - - if len(calc_with_metadata.files) >= config.auxfile_cutoff: + calc_metadata = datamodel.EntryMetadata() + calc_metadata.domain = parser_dict[self.parser].domain + calc_metadata.upload_id = self.upload_id + calc_metadata.calc_id = self.calc_id + calc_metadata.calc_hash = self.upload_files.calc_hash(self.mainfile) + calc_metadata.mainfile = self.mainfile + calc_metadata.calc_hash = self.upload_files.calc_hash(self.mainfile) + calc_metadata.nomad_version = config.version + calc_metadata.nomad_commit = config.commit + calc_metadata.last_processing = datetime.utcnow() + calc_metadata.files = self.upload_files.calc_files(self.mainfile) + calc_metadata.uploader = self.upload.user_id + calc_metadata.upload_time = self.upload.upload_time + calc_metadata.upload_name = self.upload.name + self.metadata = calc_metadata.m_to_dict(include_defaults=True) # TODO use embedded doc? + + if len(calc_metadata.files) >= config.auxfile_cutoff: self.warning( 'This calc has many aux files in its directory. 
' 'Have you placed many calculations in the same directory?') @@ -301,25 +294,16 @@ class Calc(Proc): # in case of failure, index a minimum set of metadata and mark # processing failure try: - calc_with_metadata = datamodel.CalcWithMetadata(**self.metadata) - calc_with_metadata.formula = config.services.not_processed_value - calc_with_metadata.basis_set = config.services.not_processed_value - calc_with_metadata.xc_functional = config.services.not_processed_value - calc_with_metadata.system = config.services.not_processed_value - calc_with_metadata.crystal_system = config.services.not_processed_value - calc_with_metadata.spacegroup = config.services.not_processed_value - calc_with_metadata.spacegroup_symbol = config.services.not_processed_value - calc_with_metadata.code_version = config.services.not_processed_value - - calc_with_metadata.code_name = config.services.not_processed_value + entry_metadata = datamodel.EntryMetadata.m_from_dict(self.metadata) if self.parser is not None: parser = parser_dict[self.parser] if hasattr(parser, 'code_name'): - calc_with_metadata.code_name = parser.code_name + entry_metadata.code_name = parser.code_name - calc_with_metadata.processed = False - self.metadata = calc_with_metadata.to_dict() - search.Entry.from_calc_with_metadata(calc_with_metadata).save() + entry_metadata.processed = False + self.metadata = entry_metadata.m_to_dict(include_defaults=True) + + search.create_entry(entry_metadata).save() except Exception as e: self.get_logger().error('could not index after processing failure', exc_info=e) @@ -335,7 +319,7 @@ class Calc(Proc): @task def parsing(self): - """ The *task* that encapsulates all parsing related actions. """ + ''' The *task* that encapsulates all parsing related actions. ''' context = dict(parser=self.parser, step=self.parser) logger = self.get_logger(**context) parser = parser_dict[self.parser] @@ -405,7 +389,7 @@ class Calc(Proc): @task def normalizing(self): - """ The *task* that encapsulates all normalizing related actions. """ + ''' The *task* that encapsulates all normalizing related actions. ''' for normalizer in normalizers: if normalizer.domain != parser_dict[self.parser].domain: continue @@ -435,27 +419,27 @@ class Calc(Proc): @task def archiving(self): - """ The *task* that encapsulates all archival related actions. """ + ''' The *task* that encapsulates all archival related actions. 
''' logger = self.get_logger() - calc_with_metadata = datamodel.CalcWithMetadata(**self.metadata) - calc_with_metadata.apply_domain_metadata(self._parser_backend) - calc_with_metadata.processed = True + entry_metadata = datamodel.EntryMetadata.m_from_dict(self.metadata) + entry_metadata.apply_domain_metadata(self._parser_backend) + entry_metadata.processed = True # persist the calc metadata with utils.timer(logger, 'saved calc metadata', step='metadata'): - self.metadata = calc_with_metadata.to_dict() + self.metadata = entry_metadata.m_to_dict(include_defaults=True) # index in search with utils.timer(logger, 'indexed', step='index'): - search.Entry.from_calc_with_metadata(calc_with_metadata).save() + search.create_entry(entry_metadata).save() # persist the archive with utils.timer( logger, 'archived', step='archive', input_size=self.mainfile_file.size) as log_data: with self.upload_files.archive_file(self.calc_id, 'wt') as out: - self._parser_backend.write_json(out, pretty=True, root_sections=_all_root_sections) + self._parser_backend.write_json(out, pretty=True, root_sections=datamodel.root_sections) log_data.update(archive_size=self.upload_files.archive_file_object(self.calc_id).size) @@ -474,7 +458,7 @@ class Calc(Proc): class Upload(Proc): - """ + ''' Represents uploads in the databases. Provides persistence access to the files storage, and processing state. @@ -489,7 +473,7 @@ class Upload(Proc): publish_time: Date when the upload was initially published last_update: Date of the last publishing/re-processing joined: Boolean indicates if the running processing has joined (:func:`check_join`) - """ + ''' id_field = 'upload_id' upload_id = StringField(primary_key=True) @@ -518,13 +502,13 @@ class Upload(Proc): @property def metadata(self) -> dict: - """ + ''' Getter, setter for user metadata. Metadata is pickled to and from the public bucket to allow sharing among all processes. Usually uploads do not have (much) user defined metadata, but users provide all metadata per upload as part of the publish process. This will change, when we introduce editing functionality and metadata will be provided through different means. - """ + ''' try: upload_files = PublicUploadFiles(self.upload_id, is_authorized=lambda: True) except KeyError: @@ -542,7 +526,7 @@ class Upload(Proc): @classmethod def user_uploads(cls, user: datamodel.User, **kwargs) -> List['Upload']: - """ Returns all uploads for the given user. Kwargs are passed to mongo query. """ + ''' Returns all uploads for the given user. Kwargs are passed to mongo query. ''' return cls.objects(user_id=str(user.user_id), **kwargs) @property @@ -561,14 +545,14 @@ class Upload(Proc): @classmethod def create(cls, **kwargs) -> 'Upload': - """ + ''' Creates a new upload for the given user, a user given name is optional. It will populate the record with a signed url and pending :class:`UploadProc`. The upload will be already saved to the database. Arguments: user: The user that created the upload. - """ + ''' # use kwargs to keep compatibility with super method user: datamodel.User = kwargs['user'] del(kwargs['user']) @@ -583,15 +567,15 @@ class Upload(Proc): return self def delete(self): - """ Deletes this upload process state entry and its calcs. """ + ''' Deletes this upload process state entry and its calcs. ''' Calc.objects(upload_id=self.upload_id).delete() super().delete() def delete_upload_local(self): - """ + ''' Deletes the upload, including its processing state and staging files. Local version without celery processing. 
- """ + ''' logger = self.get_logger() with utils.lnr(logger, 'staged upload delete failed'): @@ -609,28 +593,27 @@ class Upload(Proc): @process def delete_upload(self): - """ + ''' Deletes of the upload, including its processing state and staging files. This starts the celery process of deleting the upload. - """ + ''' self.delete_upload_local() return True # do not save the process status on the delete upload @process def publish_upload(self): - """ + ''' Moves the upload out of staging to the public area. It will pack the staging upload files in to public upload files. - """ + ''' assert self.processed_calcs > 0 logger = self.get_logger() logger.info('started to publish') with utils.lnr(logger, 'publish failed'): - upload_with_metadata = self.to_upload_with_metadata(self.metadata) - calcs = upload_with_metadata.calcs + calcs = self.entries_metadata(self.metadata) with utils.timer( logger, 'upload metadata updated', step='metadata', @@ -641,7 +624,7 @@ class Upload(Proc): calc.with_embargo = calc.with_embargo if calc.with_embargo is not None else False return UpdateOne( {'_id': calc.calc_id}, - {'$set': {'metadata': calc.to_dict()}}) + {'$set': {'metadata': calc.m_to_dict(include_defaults=True)}}) Calc._get_collection().bulk_write([create_update(calc) for calc in calcs]) @@ -649,7 +632,7 @@ class Upload(Proc): with utils.timer( logger, 'staged upload files packed', step='pack', upload_size=self.upload_files.size): - self.upload_files.pack(upload_with_metadata) + self.upload_files.pack(calcs) with utils.timer( logger, 'index updated', step='index', @@ -671,7 +654,7 @@ class Upload(Proc): @process def re_process_upload(self): - """ + ''' A *process* that performs the re-processing of a earlier processed upload. @@ -681,7 +664,7 @@ class Upload(Proc): TODO this implementation does not do any re-matching. This will be more complex due to handling of new or missing matches. - """ + ''' assert self.published logger = self.get_logger() @@ -730,7 +713,7 @@ class Upload(Proc): @process def re_pack(self): - """ A *process* that repacks the raw and archive data based on the current embargo data. """ + ''' A *process* that repacks the raw and archive data based on the current embargo data. ''' assert self.published # mock the steps of actual processing @@ -739,19 +722,19 @@ class Upload(Proc): self._continue_with('parse_all') self._continue_with('cleanup') - self.upload_files.re_pack(self.to_upload_with_metadata()) + self.upload_files.re_pack(self.entries_metadata()) self.joined = True self._complete() @process def process_upload(self): - """ A *process* that performs the initial upload processing. """ + ''' A *process* that performs the initial upload processing. ''' self.extracting() self.parse_all() @task def uploading(self): - """ A no-op *task* as a stand-in for receiving upload data. """ + ''' A no-op *task* as a stand-in for receiving upload data. ''' pass @property @@ -772,10 +755,10 @@ class Upload(Proc): @task def extracting(self): - """ + ''' The *task* performed before the actual parsing/normalizing: extracting the uploaded files. - """ + ''' # extract the uploaded file self._upload_files = ArchiveBasedStagingUploadFiles( upload_id=self.upload_id, is_authorized=lambda: True, create=True, @@ -800,10 +783,10 @@ class Upload(Proc): return def _preprocess_files(self, path): - """ + ''' Some files need preprocessing. Currently we need to add a stripped POTCAR version and always restrict/embargo the original. 
- """ + ''' if os.path.basename(path).startswith('POTCAR'): # create checksum hash = hashlib.sha224() @@ -829,13 +812,13 @@ class Upload(Proc): self.staging_upload_files.raw_file_object(stripped_path).os_path)) def match_mainfiles(self) -> Generator[Tuple[str, object], None, None]: - """ + ''' Generator function that matches all files in the upload to all parsers to determine the upload's mainfiles. Returns: Tuples of mainfile, filename, and parsers - """ + ''' directories_with_match: Dict[str, str] = dict() upload_files = self.staging_upload_files for filename in upload_files.raw_file_manifest(): @@ -859,10 +842,10 @@ class Upload(Proc): @task def parse_all(self): - """ + ''' The *task* used to identify mainfile/parser combinations among the upload's files, creates respective :class:`Calc` instances, and triggers their processing. - """ + ''' logger = self.get_logger() with utils.timer( @@ -882,14 +865,14 @@ class Upload(Proc): self.check_join() def check_join(self): - """ + ''' Performs an evaluation of the join condition and triggers the :func:`cleanup` task if necessary. The join condition allows to run the ``cleanup`` after all calculations have been processed. The upload processing stops after all calculation processings have been triggered (:func:`parse_all` or :func:`re_process_upload`). The cleanup task is then run within the last calculation process (the one that triggered the join by calling this method). - """ + ''' total_calcs = self.total_calcs processed_calcs = self.processed_calcs @@ -951,7 +934,7 @@ class Upload(Proc): logger, 'reprocessed staged upload packed', step='delete staged', upload_size=self.upload_files.size): - staging_upload_files.pack(self.to_upload_with_metadata(), skip_raw=True) + staging_upload_files.pack(self.entries_metadata(), skip_raw=True) with utils.timer( logger, 'reprocessed staged upload deleted', step='delete staged', @@ -963,10 +946,10 @@ class Upload(Proc): @task def cleanup(self): - """ + ''' The *task* that "cleans" the processing, i.e. removed obsolete files and performs pending archival operations. Depends on the type of processing. - """ + ''' search.refresh() if self.current_process == 're_process_upload': @@ -975,58 +958,58 @@ class Upload(Proc): self._cleanup_after_processing() def get_calc(self, calc_id) -> Calc: - """ Returns the upload calc with the given id or ``None``. """ + ''' Returns the upload calc with the given id or ``None``. ''' return Calc.objects(upload_id=self.upload_id, calc_id=calc_id).first() @property def processed_calcs(self): - """ + ''' The number of successfully or not successfully processed calculations. I.e. calculations that have finished processing. - """ + ''' return Calc.objects(upload_id=self.upload_id, tasks_status__in=[SUCCESS, FAILURE]).count() @property def total_calcs(self): - """ The number of all calculations. """ + ''' The number of all calculations. ''' return Calc.objects(upload_id=self.upload_id).count() @property def failed_calcs(self): - """ The number of calculations with failed processing. """ + ''' The number of calculations with failed processing. ''' return Calc.objects(upload_id=self.upload_id, tasks_status=FAILURE).count() @property def pending_calcs(self) -> int: - """ The number of calculations with pending processing. """ + ''' The number of calculations with pending processing. ''' return Calc.objects(upload_id=self.upload_id, tasks_status=PENDING).count() def all_calcs(self, start, end, order_by=None): - """ + ''' Returns all calculations, paginated and ordered. 
Arguments: start: the start index of the requested page end: the end index of the requested page order_by: the property to order by - """ + ''' query = Calc.objects(upload_id=self.upload_id)[start:end] return query.order_by(order_by) if order_by is not None else query @property def outdated_calcs(self): - """ All successfully processed and outdated calculations. """ + ''' All successfully processed and outdated calculations. ''' return Calc.objects( upload_id=self.upload_id, tasks_status=SUCCESS, metadata__nomad_version__ne=config.version) @property def calcs(self): - """ All successfully processed calculations. """ + ''' All successfully processed calculations. ''' return Calc.objects(upload_id=self.upload_id, tasks_status=SUCCESS) - def to_upload_with_metadata(self, user_metadata: dict = None) -> UploadWithMetadata: - """ + def entries_metadata(self, user_metadata: dict = None) -> Iterable[datamodel.EntryMetadata]: + ''' This is the :py:mod:`nomad.datamodel` transformation method to transform processing uploads into datamodel uploads. It will also implicitely transform all calculations of this upload. @@ -1034,10 +1017,10 @@ class Upload(Proc): Arguments: user_metadata: A dict of user metadata that is applied to the resulting datamodel data and the respective calculations. - """ + ''' # prepare user metadata per upload and per calc if user_metadata is not None: - calc_metadatas: Dict[str, Any] = dict() + entries_metadata_dict: Dict[str, Any] = dict() upload_metadata: Dict[str, Any] = dict() upload_metadata.update(user_metadata) @@ -1045,53 +1028,42 @@ class Upload(Proc): del(upload_metadata['calculations']) for calc in user_metadata.get('calculations', []): # pylint: disable=no-member - calc_metadatas[calc['mainfile']] = calc + entries_metadata_dict[calc['mainfile']] = calc - user_upload_time = upload_metadata.get('_upload_time', None) - user_upload_name = upload_metadata.get('_upload_name', None) + user_upload_time = upload_metadata.get('upload_time', None) + user_upload_name = upload_metadata.get('upload_name', None) def get_metadata(calc: Calc): - """ - Assemble metadata from calc's processed calc metadata and the uploads - user metadata. 
- """ - calc_data = calc.metadata - calc_with_metadata = datamodel.CalcWithMetadata(**calc_data) - calc_metadata = dict(upload_metadata) - calc_metadata.update(calc_metadatas.get(calc.mainfile, {})) - calc_with_metadata.apply_user_metadata(calc_metadata) - if calc_with_metadata.upload_time is None: - calc_with_metadata.upload_time = self.upload_time if user_upload_time is None else user_upload_time - if calc_with_metadata.upload_name is None: - calc_with_metadata.upload_name = self.name if user_upload_name is None else user_upload_name - - return calc_with_metadata + entry_metadata = datamodel.EntryMetadata.m_from_dict(calc.metadata) + entry_user_metadata = dict(upload_metadata) + entry_user_metadata.pop('embargo_length', None) # this is for uploads only + entry_user_metadata.update(entries_metadata_dict.get(calc.mainfile, {})) + entry_metadata.apply_user_metadata(entry_user_metadata) + if entry_metadata.upload_time is None: + entry_metadata.upload_time = self.upload_time if user_upload_time is None else user_upload_time + if entry_metadata.upload_name is None: + entry_metadata.upload_name = self.name if user_upload_name is None else user_upload_name + + return entry_metadata else: user_upload_time = None def get_metadata(calc: Calc): - calc_with_metadata = datamodel.CalcWithMetadata(**calc.metadata) - calc_with_metadata.upload_time = self.upload_time - calc_with_metadata.upload_name = self.name - - return calc_with_metadata - - result = UploadWithMetadata( - upload_id=self.upload_id, - uploader=self.user_id, - upload_time=self.upload_time if user_upload_time is None else user_upload_time) + entry_metadata = datamodel.EntryMetadata.m_from_dict(calc.metadata) + entry_metadata.upload_time = self.upload_time + entry_metadata.upload_name = self.name - result.calcs = [get_metadata(calc) for calc in Calc.objects(upload_id=self.upload_id)] + return entry_metadata - return result + return [get_metadata(calc) for calc in Calc.objects(upload_id=self.upload_id)] def compress_and_set_metadata(self, metadata: Dict[str, Any]) -> None: - """ + ''' Stores the given user metadata in the upload document. This is the metadata adhering to the API model (``UploadMetaData``). Most quantities can be stored for the upload and for each calculation. This method will try to move same values from the calculation to the upload to "compress" the data. - """ + ''' self.embargo_length = min(metadata.get('embargo_length', 36), 36) compressed = { @@ -1103,7 +1075,7 @@ class Upload(Proc): compressed_calc: Dict[str, Any] = {} calculations.append(compressed_calc) for key, value in calc.items(): - if key in ['_pid', 'mainfile', 'external_id']: + if key in ['pid', 'mainfile', 'external_id']: # these quantities are explicitly calc specific and have to stay with # the calc compressed_calc[key] = value diff --git a/nomad/search.py b/nomad/search.py index 0644b12f676790836752d4d9d8f5928580be98b6..d30268fe1ecd287a57f136eca67dc716029bdd67 100644 --- a/nomad/search.py +++ b/nomad/search.py @@ -12,22 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' This module represents calculations in elastic search. 
-""" +''' -from typing import Iterable, Dict, List, Any -from elasticsearch_dsl import Document, InnerDoc, Keyword, Text, Date, \ - Object, Boolean, Search, Q, A, analyzer, tokenizer -from elasticsearch_dsl.document import IndexMeta +from typing import Iterable, Dict, List, Any, Union, cast +from elasticsearch_dsl import Document, InnerDoc, Keyword, Date, \ + Object, Boolean, Integer, Search, Q, A, analyzer, tokenizer import elasticsearch.helpers from elasticsearch.exceptions import NotFoundError from datetime import datetime import json -from nomad import config, datamodel, infrastructure, datamodel, utils, processing as proc -from nomad.datamodel import Domain -import nomad.datamodel.base +from nomad import config, datamodel, infrastructure, datamodel, utils, metainfo, processing as proc +from nomad.metainfo.search import SearchQuantity path_analyzer = analyzer( @@ -44,173 +42,374 @@ class ElasticSearchError(Exception): pass class ScrollIdNotFound(Exception): pass -class User(InnerDoc): +_elastic_documents: Dict[str, Union[Document, InnerDoc]] = {} - @classmethod - def from_user(cls, user): - self = cls(user_id=user.user_id) - self.name = user.name - self.email = user.email +search_quantities: Dict[str, SearchQuantity] = {} +''' All available search quantities by their full qualified name. ''' - return self +metrics: Dict[str, SearchQuantity] = {} +''' +The available search metrics. Metrics are integer values given for each entry that can +be used in statistics (aggregations), e.g. the sum of all total energy calculations or cardinality of +all unique geometries. +''' + +groups: Dict[str, SearchQuantity] = {} +''' The available groupable quantities ''' + +order_default_quantities: Dict[str, SearchQuantity] = {} + +default_statistics: Dict[str, List[SearchQuantity]] = {} + + +# TODO make search the search quantities are initialized even without/before creating an elastic document +# otherwise a dependency on import order is created +def create_elastic_document( + section: metainfo.Section, document_name: str = None, super_cls=Document, + prefix: str = None, domain: str = None, + attrs: Dict[str, Any] = None) -> Union[Document, InnerDoc]: + ''' + Create all elasticsearch_dsl mapping classes for the section and its sub sections. 
+ ''' + domain = section.m_x('domain', domain) + domain_or_all = domain if domain is not None else '__all__' + + if document_name is None: + document_name = section.name + + if attrs is None: + attrs = {} + + def get_inner_document(section: metainfo.Section, **kwargs) -> type: + inner_document = _elastic_documents.get(section.qualified_name()) + if inner_document is None: + inner_document = create_elastic_document( + section, super_cls=InnerDoc, **kwargs) + + return inner_document + + # create an attribute for each sub section + for sub_section in section.all_sub_sections.values(): + sub_section_prefix = sub_section.m_x('search') + if sub_section_prefix is None: + continue + + if prefix is not None: + sub_section_prefix = '%s.%s' % (prefix, sub_section_prefix) + + inner_document = get_inner_document( + sub_section.sub_section, domain=domain, prefix=sub_section_prefix) + attrs[sub_section.name] = Object(inner_document) + + # create an attribute for each quantity + for quantity in section.all_quantities.values(): + local_search_quantities = quantity.m_x('search') + + if local_search_quantities is None: + continue + + if not isinstance(local_search_quantities, List): + local_search_quantities = [local_search_quantities] + + for i, search_quantity in enumerate(local_search_quantities): + search_quantity.configure(quantity=quantity, prefix=prefix) + + # only prefixed or top-level quantities are considered for being + # searched directly. Other nested quantities can only be used via + # other search_quantities's es_quantity + if prefix is not None or super_cls == Document: + qualified_name = search_quantity.qualified_name + assert qualified_name not in search_quantities, 'Search quantities must have a unique name: %s' % qualified_name + search_quantities[qualified_name] = search_quantity + + if search_quantity.metric is not None: + qualified_metric_name = search_quantity.metric_name + assert qualified_metric_name not in metrics, 'Metric names must be unique: %s' % qualified_metric_name + metrics[qualified_metric_name] = search_quantity + + if search_quantity.group is not None: + qualified_group = search_quantity.group + assert qualified_group not in groups, 'Groups must be unique' + groups[qualified_group] = search_quantity + + if search_quantity.default_statistic: + default_statistics.setdefault(domain_or_all, []).append(search_quantity) + + if search_quantity.order_default: + assert order_default_quantities.get(domain_or_all) is None, 'Only one quantity can be the order default' + order_default_quantities[domain_or_all] = search_quantity + + if i != 0: + # only the first quantity gets is mapped, unless the other has an + # explicit mapping + assert search_quantity.es_mapping is None, 'only the first quantity gets is mapped' + continue + + if search_quantity.es_mapping is None: + # find a mapping based on quantity type + if quantity.type == str: + search_quantity.es_mapping = Keyword() + elif quantity.type == int: + search_quantity.es_mapping = Integer() + elif quantity.type == bool: + search_quantity.es_mapping = Boolean() + elif quantity.type == metainfo.Datetime: + search_quantity.es_mapping = Date() + elif isinstance(quantity.type, metainfo.Reference): + inner_document = get_inner_document(quantity.type.target_section_def) + search_quantity.es_mapping = Object(inner_document) + elif isinstance(quantity.type, metainfo.MEnum): + search_quantity.es_mapping = Keyword() + else: + raise NotImplementedError( + 'Quantity type %s for quantity %s is not supported.' 
% (quantity.type, quantity)) + + attrs[quantity.name] = search_quantity.es_mapping + + document = type(document_name, (super_cls,), attrs) + _elastic_documents[section.qualified_name()] = document + return document + + +# TODO move to a init function that is triggered by elastic setup in infrastructure +Entry = cast(Document, create_elastic_document( + datamodel.EntryMetadata.m_def, document_name='Entry', + attrs=dict(Index=type('Index', (), dict(name=config.elastic.index_name))))) +''' The elasticsearch_dsl Document class that constitutes the entry index. ''' + +metrics_names = list(metrics.keys()) +''' Names of all available metrics ''' + +for domain in datamodel.domains: + order_default_quantities.setdefault(domain, order_default_quantities.get('__all__')) + default_statistics.setdefault(domain, []).append(*default_statistics.get('__all__')) + + +# class User(InnerDoc): + +# @classmethod +# def from_user(cls, user): +# self = cls(user_id=user.user_id) +# self.name = user.name +# self.email = user.email + +# return self + +# user_id = Keyword() +# email = Keyword() +# name = Text(fields={'keyword': Keyword()}) + + +# class Dataset(InnerDoc): + +# @classmethod +# def from_dataset_id(cls, dataset_id): +# dataset = datamodel.Dataset.m_def.m_x('me').get(dataset_id=dataset_id) +# return cls(id=dataset.dataset_id, doi=dataset.doi, name=dataset.name, created=dataset.created) + +# id = Keyword() +# doi = Keyword() +# name = Keyword() +# created = Date() + + +# _domain_inner_doc_types: Dict[str, type] = {} - user_id = Keyword() - email = Keyword() - name = Text(fields={'keyword': Keyword()}) - - -class Dataset(InnerDoc): - - @classmethod - def from_dataset_id(cls, dataset_id): - dataset = datamodel.Dataset.m_def.m_x('me').get(dataset_id=dataset_id) - return cls(id=dataset.dataset_id, doi=dataset.doi, name=dataset.name, created=dataset.created) - - id = Keyword() - doi = Keyword() - name = Keyword() - created = Date() - - -_domain_inner_doc_types: Dict[str, type] = {} - - -class WithDomain(IndexMeta): - """ Override elasticsearch_dsl metaclass to sneak in domain specific mappings """ - def __new__(cls, name, bases, attrs): - for domain in Domain.instances.values(): - inner_doc_type = _domain_inner_doc_types.get(domain.name) - if inner_doc_type is None: - domain_attrs = { - quantity.elastic_field: quantity.elastic_mapping - for quantity in domain.domain_quantities.values()} - - inner_doc_type = type(domain.name, (InnerDoc,), domain_attrs) - _domain_inner_doc_types[domain.name] = inner_doc_type - - attrs[domain.name] = Object(inner_doc_type) - - return super(WithDomain, cls).__new__(cls, name, bases, attrs) - - -class Entry(Document, metaclass=WithDomain): - - class Index: - name = config.elastic.index_name - - domain = Keyword() - upload_id = Keyword() - upload_time = Date() - upload_name = Keyword() - calc_id = Keyword() - calc_hash = Keyword() - pid = Keyword() - raw_id = Keyword() - mainfile = Keyword() - files = Text(multi=True, analyzer=path_analyzer, fields={'keyword': Keyword()}) - uploader = Object(User) - - with_embargo = Boolean() - published = Boolean() - - processed = Boolean() - last_processing = Date() - nomad_version = Keyword() - nomad_commit = Keyword() - - authors = Object(User, multi=True) - owners = Object(User, multi=True) - comment = Text() - references = Keyword() - datasets = Object(Dataset) - external_id = Keyword() - - atoms = Keyword() - only_atoms = Keyword() - formula = Keyword() - - @classmethod - def from_calc_with_metadata(cls, source: datamodel.CalcWithMetadata) -> 
'Entry': - entry = Entry(meta=dict(id=source.calc_id)) - entry.update(source) - return entry - - def update(self, source: datamodel.CalcWithMetadata) -> None: - self.domain = source.domain - self.upload_id = source.upload_id - self.upload_time = source.upload_time - self.upload_name = source.upload_name - self.calc_id = source.calc_id - self.calc_hash = source.calc_hash - self.pid = None if source.pid is None else str(source.pid) - self.raw_id = None if source.raw_id is None else str(source.raw_id) - - self.processed = source.processed - self.last_processing = source.last_processing - self.nomad_version = source.nomad_version - self.nomad_commit = source.nomad_commit - - self.mainfile = source.mainfile - if source.files is None: - self.files = [self.mainfile] - elif self.mainfile not in source.files: - self.files = [self.mainfile] + source.files - else: - self.files = source.files - self.with_embargo = bool(source.with_embargo) - self.published = source.published +# class WithDomain(IndexMeta): +# ''' Override elasticsearch_dsl metaclass to sneak in domain specific mappings ''' +# def __new__(cls, name, bases, attrs): +# for domain in Domain.instances.values(): +# inner_doc_type = _domain_inner_doc_types.get(domain.name) +# if inner_doc_type is None: +# domain_attrs = { +# quantity.elastic_field: quantity.elastic_mapping +# for quantity in domain.domain_quantities.values()} + +# inner_doc_type = type(domain.name, (InnerDoc,), domain_attrs) +# _domain_inner_doc_types[domain.name] = inner_doc_type + +# attrs[domain.name] = Object(inner_doc_type) + +# return super(WithDomain, cls).__new__(cls, name, bases, attrs) + + +# class Entry(Document, metaclass=WithDomain): - uploader = datamodel.User.get(user_id=source.uploader) if source.uploader is not None else None - authors = [datamodel.User.get(user_id) for user_id in source.coauthors] - owners = [datamodel.User.get(user_id) for user_id in source.shared_with] - if uploader is not None: - authors.append(uploader) - owners.append(uploader) - authors.sort(key=lambda user: user.last_name + ' ' + user.first_name) - owners.sort(key=lambda user: user.last_name + ' ' + user.first_name) +# class Index: +# name = config.elastic.index_name - self.uploader = User.from_user(uploader) if uploader is not None else None - self.authors = [User.from_user(user) for user in authors] - self.owners = [User.from_user(user) for user in owners] +# domain = Keyword() +# upload_id = Keyword() +# upload_time = Date() +# upload_name = Keyword() +# calc_id = Keyword() +# calc_hash = Keyword() +# pid = Keyword() +# raw_id = Keyword() +# mainfile = Keyword() +# files = Text(multi=True, analyzer=path_analyzer, fields={'keyword': Keyword()}) +# uploader = Object(User) + +# with_embargo = Boolean() +# published = Boolean() + +# processed = Boolean() +# last_processing = Date() +# nomad_version = Keyword() +# nomad_commit = Keyword() + +# authors = Object(User, multi=True) +# owners = Object(User, multi=True) +# comment = Text() +# references = Keyword() +# datasets = Object(Dataset) +# external_id = Keyword() + +# atoms = Keyword() +# only_atoms = Keyword() +# formula = Keyword() + +# @classmethod +# def from_entry_metadata(cls, source: datamodel.EntryMetadata) -> 'Entry': +# entry = Entry(meta=dict(id=source.calc_id)) +# entry.update(source) +# return entry + +# def update(self, source: datamodel.EntryMetadata) -> None: +# self.domain = source.domain +# self.upload_id = source.upload_id +# self.upload_time = source.upload_time +# self.upload_name = source.upload_name +# 
self.calc_id = source.calc_id +# self.calc_hash = source.calc_hash +# self.pid = None if source.pid is None else str(source.pid) +# self.raw_id = None if source.raw_id is None else str(source.raw_id) + +# self.processed = source.processed +# self.last_processing = source.last_processing +# self.nomad_version = source.nomad_version +# self.nomad_commit = source.nomad_commit + +# self.mainfile = source.mainfile +# if source.files is None: +# self.files = [self.mainfile] +# elif self.mainfile not in source.files: +# self.files = [self.mainfile] + source.files +# else: +# self.files = source.files + +# self.with_embargo = bool(source.with_embargo) +# self.published = source.published + +# uploader = datamodel.User.get(user_id=source.uploader) if source.uploader is not None else None +# authors = [datamodel.User.get(user_id) for user_id in source.coauthors] +# owners = [datamodel.User.get(user_id) for user_id in source.shared_with] +# if uploader is not None: +# authors.append(uploader) +# owners.append(uploader) +# authors.sort(key=lambda user: user.last_name + ' ' + user.first_name) +# owners.sort(key=lambda user: user.last_name + ' ' + user.first_name) + +# self.uploader = User.from_user(uploader) if uploader is not None else None +# self.authors = [User.from_user(user) for user in authors] +# self.owners = [User.from_user(user) for user in owners] + +# self.comment = source.comment +# self.references = source.references +# self.datasets = [Dataset.from_dataset_id(dataset_id) for dataset_id in source.datasets] +# self.external_id = source.external_id + +# self.atoms = source.atoms +# self.only_atoms = nomad.datamodel.base.only_atoms(source.atoms) +# self.formula = source.formula +# self.n_atoms = source.n_atoms + +# if self.domain is not None: +# inner_doc_type = _domain_inner_doc_types[self.domain] +# inner_doc = inner_doc_type() +# for quantity in Domain.instances[self.domain].domain_quantities.values(): +# quantity_value = quantity.elastic_value(getattr(source, quantity.metadata_field)) +# setattr(inner_doc, quantity.elastic_field, quantity_value) + +# setattr(self, self.domain, inner_doc) + + +def create_entry(section: metainfo.MSection) -> Any: + ''' Creates a elasticsearch_dsl document for the given section. 
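+
+    An illustrative sketch (not part of this change; ``entry_metadata`` stands for an
+    existing :class:`EntryMetadata` instance)::
+
+        entry = create_entry(entry_metadata)
+        entry.save(refresh=True)
+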
''' + cls = _elastic_documents[section.m_def.qualified_name()] + + if section.m_def == datamodel.EntryMetadata.m_def: + obj = cls(meta=dict(id=section.m_get(datamodel.EntryMetadata.calc_id))) + else: + obj = cls() + + for quantity in section.m_def.all_quantities.values(): + search_quantities = quantity.m_x('search') + if search_quantities is None: + continue + + if not isinstance(search_quantities, list): + search_quantities = [search_quantities] + + value = section.m_get(quantity) + if value is None or value == []: + continue - self.comment = source.comment - self.references = source.references - self.datasets = [Dataset.from_dataset_id(dataset_id) for dataset_id in source.datasets] - self.external_id = source.external_id + for i, search_quantity in enumerate(search_quantities): + if i != 0: + # Only the value is only written for the first quantity + continue - self.atoms = source.atoms - self.only_atoms = nomad.datamodel.base.only_atoms(source.atoms) - self.formula = source.formula - self.n_atoms = source.n_atoms + quantity_type = quantity.type + if isinstance(quantity_type, metainfo.Reference): + if quantity.is_scalar: + value = create_entry(cast(metainfo.MSection, value)) + else: + value = [create_entry(item) for item in value] + + elif search_quantity.es_value is not None: + value = search_quantity.es_value(section) - if self.domain is not None: - inner_doc_type = _domain_inner_doc_types[self.domain] - inner_doc = inner_doc_type() - for quantity in Domain.instances[self.domain].domain_quantities.values(): - quantity_value = quantity.elastic_value(getattr(source, quantity.metadata_field)) - setattr(inner_doc, quantity.elastic_field, quantity_value) + setattr(obj, quantity.name, value) - setattr(self, self.domain, inner_doc) + for sub_section in section.m_def.all_sub_sections.values(): + if not sub_section.m_x('search'): + continue + + if sub_section.repeats: + mi_values = list(section.m_get_sub_sections(sub_section)) + if len(mi_values) == 0: + continue + value = [create_entry(value) for value in mi_values] + else: + mi_value = section.m_get_sub_section(sub_section, -1) + if mi_value is None: + continue + value = create_entry(mi_value) + + setattr(obj, sub_section.name, value) + + return obj def delete_upload(upload_id): - """ Delete all entries with given ``upload_id`` from the index. """ + ''' Delete all entries with given ``upload_id`` from the index. ''' index = Entry._default_index() Search(index=index).query('match', upload_id=upload_id).delete() def delete_entry(calc_id): - """ Delete the entry with the given ``calc_id`` from the index. """ + ''' Delete the entry with the given ``calc_id`` from the index. ''' index = Entry._default_index() Search(index=index).query('match', calc_id=calc_id).delete() -def publish(calcs: Iterable[datamodel.CalcWithMetadata]) -> None: - """ Update all given calcs with their metadata and set ``publish = True``. """ +def publish(calcs: Iterable[datamodel.EntryMetadata]) -> None: + ''' Update all given calcs with their metadata and set ``publish = True``. 
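+
+    Illustrative only (assumes a processing ``Upload`` object ``upload`` whose
+    ``entries_metadata()`` yields :class:`EntryMetadata` instances)::
+
+        publish(upload.entries_metadata())
+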
''' def elastic_updates(): for calc in calcs: - entry = Entry.from_calc_with_metadata(calc) + entry = create_entry(calc) entry.published = True entry = entry.to_dict(include_meta=True) source = entry.pop('_source') @@ -222,16 +421,16 @@ def publish(calcs: Iterable[datamodel.CalcWithMetadata]) -> None: refresh() -def index_all(calcs: Iterable[datamodel.CalcWithMetadata], do_refresh=True) -> None: - """ +def index_all(calcs: Iterable[datamodel.EntryMetadata], do_refresh=True) -> None: + ''' Adds all given calcs with their metadata to the index. Returns: Number of failed entries. - """ + ''' def elastic_updates(): for calc in calcs: - entry = Entry.from_calc_with_metadata(calc) + entry = create_entry(calc) entry = entry.to_dict(include_meta=True) entry['_op_type'] = 'index' yield entry @@ -248,36 +447,6 @@ def refresh(): infrastructure.elastic_client.indices.refresh(config.elastic.index_name) -metrics = { - metric_name: metric - for domain in Domain.instances.values() - for metric_name, metric in domain.metrics.items()} -""" -The available search metrics. Metrics are integer values given for each entry that can -be used in statistics (aggregations), e.g. the sum of all total energy calculations or cardinality of -all unique geometries. -""" - -metrics_names = [metric_name for domain in Domain.instances.values() for metric_name in domain.metrics_names] -""" Names of all available metrics """ - -groups = { - key: value - for domain in Domain.instances.values() - for key, value in domain.groups.items()} -"""The available groupable quantities""" - -order_default_quantities = { - domain_name: domain.order_default_quantity - for domain_name, domain in Domain.instances.items() -} - -default_statistics = { - domain_name: domain.default_statistics - for domain_name, domain in Domain.instances.items() -} - - class SearchRequest: ''' Represents a search request and allows to execute that request. @@ -313,10 +482,10 @@ class SearchRequest: self._search = Search(index=config.elastic.index_name) def domain(self, domain: str = None): - """ + ''' Applies the domain of this request to the query. Allows to optionally update the domain of this request. - """ + ''' if domain is not None: self._domain = domain @@ -324,7 +493,7 @@ class SearchRequest: return self def owner(self, owner_type: str = 'all', user_id: str = None): - """ + ''' Uses the query part of the search to restrict the results based on the owner. The possible types are: ``all`` for all calculations; ``public`` for calculations visible by everyone, excluding embargo-ed entries and entries only visible @@ -340,7 +509,7 @@ class SearchRequest: KeyError: If the given owner_type is not supported ValueError: If the owner_type requires a user but none is given, or the given user is not allowed to use the given owner_type. - """ + ''' if owner_type == 'all': q = Q('term', published=True) if user_id is not None: @@ -378,31 +547,31 @@ class SearchRequest: return self def search_parameters(self, **kwargs): - """ + ''' Configures the existing query with additional search parameters. Kwargs are interpreted as key value pairs. Keys have to coresspond to valid entry quantities in the domain's (DFT calculations) datamodel. Alternatively search parameters can be set via attributes. 
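
        A hedged example; the quantity names and values are just examples taken from
        the tests in this change::

            request = SearchRequest().owner('public')
            request.search_parameters(**{'atoms': ['Fe'], 'dft.system': 'bulk'})
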
- """ + ''' for name, value in kwargs.items(): self.search_parameter(name, value) return self def search_parameter(self, name, value): - quantity = Domain.get_quantity(name) + quantity = search_quantities[name] - if quantity.multi and not isinstance(value, list): + if quantity.many and not isinstance(value, list): value = [value] - value = quantity.elastic_value(value) + if quantity.many_or and isinstance(value, List): + self.q &= Q('terms', **{quantity.es_quantity: value}) + return self - if quantity.elastic_search_type == 'terms': - if not isinstance(value, list): + if quantity.derived: + if quantity.many and not isinstance(value, list): value = [value] - self.q &= Q('terms', **{quantity.qualified_elastic_field: value}) - - return self + value = quantity.derived(value) if isinstance(value, list): values = value @@ -410,18 +579,18 @@ class SearchRequest: values = [value] for item in values: - self.q &= Q(quantity.elastic_search_type, **{quantity.qualified_elastic_field: item}) + self.q &= Q('match', **{quantity.es_quantity: item}) return self def query(self, query): - """ Adds the given query as a 'and' (i.e. 'must') clause to the request. """ + ''' Adds the given query as a 'and' (i.e. 'must') clause to the request. ''' self._query &= query return self def time_range(self, start: datetime, end: datetime): - """ Adds a time range to the query. """ + ''' Adds a time range to the query. ''' if start is None and end is None: return self @@ -436,7 +605,7 @@ class SearchRequest: @property def q(self): - """ The underlying elasticsearch_dsl query object """ + ''' The underlying elasticsearch_dsl query object ''' if self._query is None: return Q('match_all') else: @@ -447,30 +616,30 @@ class SearchRequest: self._query = q def totals(self, metrics_to_use: List[str] = []): - """ + ''' Configure the request to return overall totals for the given metrics. The statics are returned with the other quantity statistics under the pseudo quantity name 'total'. 'total' contains the pseudo value 'all'. It is used to store the metrics aggregated over all entries in the search results. - """ + ''' self._add_metrics(self._search.aggs, metrics_to_use) return self def default_statistics(self, metrics_to_use: List[str] = []): - """ + ''' Configures the domain's default statistics. - """ - for name in default_statistics[self._domain]: + ''' + for search_quantity in default_statistics[self._domain]: self.statistic( - name, - Domain.get_quantity(name).aggregations, + search_quantity.qualified_name, + search_quantity.statistic_size, metrics_to_use=metrics_to_use) return self def statistic(self, quantity_name: str, size: int, metrics_to_use: List[str] = []): - """ + ''' This can be used to display statistics over the searched entries and allows to implement faceted search on the top values for each quantity. @@ -493,9 +662,9 @@ class SearchRequest: metrics_to_use: The metrics calculated over the aggregations. Can be ``unique_code_runs``, ``datasets``, other domain specific metrics. The basic doc_count metric ``code_runs`` is always given. 
- """ - quantity = Domain.get_quantity(quantity_name) - terms = A('terms', field=quantity.qualified_elastic_field, size=size, order=dict(_key='asc')) + ''' + quantity = search_quantities[quantity_name] + terms = A('terms', field=quantity.es_quantity, size=size, order=dict(_key='asc')) buckets = self._search.aggs.bucket('statistics:%s' % quantity_name, terms) self._add_metrics(buckets, metrics_to_use) @@ -507,24 +676,26 @@ class SearchRequest: parent = self._search.aggs for metric in metrics_to_use: - quantity, metric_kind = metrics[metric] - field = Domain.get_quantity(quantity).elastic_field - parent.metric('metric:%s' % metric, A(metric_kind, field=field)) + metric_quantity = metrics[metric] + field = metric_quantity.es_quantity + parent.metric( + 'metric:%s' % metric_quantity.metric_name, + A(metric_quantity.metric, field=field)) def date_histogram(self, metrics_to_use: List[str] = []): - """ + ''' Adds a date histogram on the given metrics to the statistics part. - """ + ''' histogram = A('date_histogram', field='upload_time', interval='1M', format='yyyy-MM-dd') self._add_metrics(self._search.aggs.bucket('statistics:date_histogram', histogram), metrics_to_use) return self def quantities(self, **kwargs): - """ + ''' Shorthand for adding multiple quantities. See :func:`quantity`. Keywork argument keys are quantity name, values are tuples of size and after value. - """ + ''' for name, spec in kwargs: size, after = spec self.quantity(name, after=after, size=size) @@ -534,7 +705,7 @@ class SearchRequest: def quantity( self, name, size=100, after=None, examples=0, examples_source=None, order_by: str = None, order: str = 'desc'): - """ + ''' Adds a requests for values of the given quantity. It allows to scroll through all values via elasticsearch's composite aggregations. The response will contain the quantity values and @@ -564,12 +735,12 @@ class SearchRequest: value bucket is used. order: "desc" or "asc" - """ + ''' if size is None: size = 100 - quantity = Domain.get_quantity(name) - terms = A('terms', field=quantity.qualified_elastic_field) + quantity = search_quantities[name] + terms = A('terms', field=quantity.es_quantity) # We are using elastic searchs 'composite aggregations' here. We do not really # compose aggregations, but only those pseudo composites allow us to use the @@ -597,36 +768,36 @@ class SearchRequest: return self def exclude(self, *args): - """ Exclude certain elastic fields from the search results. """ + ''' Exclude certain elastic fields from the search results. ''' self._search = self._search.source(excludes=args) return self def include(self, *args): - """ Include only the given fields in the search results. """ + ''' Include only the given fields in the search results. ''' self._search = self._search.source(includes=args) return self def execute(self): - """ + ''' Exectutes without returning actual results. Only makes sense if the request was configured for statistics or quantity values. - """ + ''' return self._response(self._search.query(self.q)[0:0].execute()) def execute_scan(self, order_by: str = None, order: int = -1, **kwargs): - """ + ''' This execute the search as scan. The result will be a generator over the found entries. Everything but the query part of this object, will be ignored. 
- """ + ''' search = self._search.query(self.q) if order_by is not None: - order_by_quantity = Domain.get_quantity(order_by) + order_by_quantity = search_quantities[order_by] if order == 1: - search = search.sort(order_by_quantity.qualified_elastic_field) + search = search.sort(order_by_quantity.es_quantity) else: - search = search.sort('-%s' % order_by_quantity.qualified_elastic_field) + search = search.sort('-%s' % order_by_quantity.es_quantity) search = search.params(preserve_order=True) @@ -636,7 +807,7 @@ class SearchRequest: def execute_paginated( self, page: int = 1, per_page=10, order_by: str = None, order: int = -1): - """ + ''' Executes the search and returns paginated results. Those are sorted. Arguments: @@ -644,21 +815,22 @@ class SearchRequest: per_page: The number of entries per page. order_by: The quantity to order by. order: -1 or 1 for descending or ascending order. - """ + ''' if order_by is None: - order_by = order_default_quantities[self._domain] + order_by_quantity = order_default_quantities[self._domain] + else: + order_by_quantity = search_quantities[order_by] search = self._search.query(self.q) - order_by_quantity = Domain.get_quantity(order_by) - if order == 1: - search = search.sort(order_by_quantity.qualified_elastic_field) + search = search.sort(order_by_quantity.es_quantity) else: - search = search.sort('-%s' % order_by_quantity.qualified_elastic_field) + search = search.sort('-%s' % order_by_quantity.es_quantity) search = search[(page - 1) * per_page: page * per_page] es_result = search.execute() + result = self._response(es_result, with_hits=True) result.update(pagination=dict(total=result['total'], page=page, per_page=per_page)) @@ -667,7 +839,7 @@ class SearchRequest: def execute_scrolled( self, scroll_id: str = None, size: int = 1000, scroll: str = u'5m', order_by: str = None, order: int = -1): - """ + ''' Executes a scrolling search. based on ES scroll API. Pagination is replaced with scrolling, no ordering is available, no statistics, no quantities will be provided. @@ -687,7 +859,7 @@ class SearchRequest: to this method) in ES time units. Default is 5 minutes. TODO support order and order_by - """ + ''' es = infrastructure.elastic_client if scroll_id is None: @@ -726,11 +898,11 @@ class SearchRequest: return dict(scroll=scroll_info, results=results) def _response(self, response, with_hits: bool = False) -> Dict[str, Any]: - """ + ''' Prepares a response object covering the total number of results, hits, statistics, and quantities. Other aspects like pagination and scrolling have to be added elsewhere. - """ + ''' result: Dict[str, Any] = dict() aggs = response.aggregations.to_dict() @@ -809,24 +981,25 @@ class SearchRequest: def to_calc_with_metadata(results: List[Dict[str, Any]]): - """ Translates search results into :class:`CalcWithMetadata` objects read from mongo. """ + ''' Translates search results into :class:`EntryMetadata` objects read from mongo. ''' ids = [result['calc_id'] for result in results] return [ - datamodel.CalcWithMetadata(**calc.metadata) + datamodel.EntryMetadata.m_from_dict(calc.metadata) for calc in proc.Calc.objects(calc_id__in=ids)] def flat(obj, prefix=None): - """ + ''' Helper that translates nested result objects into flattened dicts with ``domain.quantity`` as keys. 
- """ + ''' if isinstance(obj, dict): result = {} for key, value in obj.items(): if isinstance(value, dict): + value = flat(value) for child_key, child_value in value.items(): - result['%s.%s' % (key, child_key)] = flat(child_value) + result['%s.%s' % (key, child_key)] = child_value else: result[key] = value diff --git a/nomad/utils.py b/nomad/utils.py index 1cdb8c933b0956e586fbd2d4b499a86c675d13f2..c2a71853c3cbedad352c04f9dcd39a5ef198264e 100644 --- a/nomad/utils.py +++ b/nomad/utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' .. autofunc::nomad.utils.create_uuid .. autofunc::nomad.utils.hash .. autofunc::nomad.utils.timer @@ -31,7 +31,7 @@ Depending on the configuration all logs will also be send to a central logstash. .. autofunc::nomad.utils.create_uuid .. autofunc::nomad.utils.timer .. autofunc::nomad.utils.lnr -""" +''' from typing import List import base64 @@ -53,7 +53,7 @@ from datetime import timedelta from nomad import config default_hash_len = 28 -""" Length of hashes and hash-based ids (e.g. calc, upload) in nomad. """ +''' Length of hashes and hash-based ids (e.g. calc, upload) in nomad. ''' def decode_handle_id(handle_str: str): @@ -73,7 +73,7 @@ def decode_handle_id(handle_str: str): def hash(*args, length: int = default_hash_len) -> str: - """ Creates a websave hash of the given length based on the repr of the given arguments. """ + ''' Creates a websave hash of the given length based on the repr of the given arguments. ''' hash = hashlib.sha512() for arg in args: hash.update(str(arg).encode('utf-8')) @@ -82,7 +82,7 @@ def hash(*args, length: int = default_hash_len) -> str: def make_websave(hash, length: int = default_hash_len) -> str: - """ Creates a websave string for a hashlib hash object. """ + ''' Creates a websave string for a hashlib hash object. ''' if length > 0: return base64.b64encode(hash.digest(), altchars=b'-_')[:length].decode('utf-8') else: @@ -90,30 +90,30 @@ def make_websave(hash, length: int = default_hash_len) -> str: def base64_encode(string): - """ + ''' Removes any `=` used as padding from the encoded string. - """ + ''' encoded = base64.urlsafe_b64encode(string).decode('utf-8') return encoded.rstrip("=") def base64_decode(string): - """ + ''' Adds back in the required padding before decoding. - """ + ''' padding = 4 - (len(string) % 4) bytes = (string + ("=" * padding)).encode('utf-8') return base64.urlsafe_b64decode(bytes) def sanitize_logevent(event: str) -> str: - """ + ''' Prepares a log event or message for analysis in elastic stack. It removes numbers, list, and matrices of numbers from the event string and limits its size. The goal is to make it easier to define aggregations over events by using event strings as representatives for event classes rather than event instances (with concrete numbers, etc). - """ + ''' sanitized_event = event[:120] sanitized_event = re.sub(r'(\d*\.\d+|\d+(\.\d*)?)', 'X', sanitized_event) sanitized_event = re.sub(r'((\[|\()\s*)?X\s*(,\s*X)+(\s*(\]|\)))?', 'L', sanitized_event) @@ -123,7 +123,7 @@ def sanitize_logevent(event: str) -> str: @contextmanager def legacy_logger(logger): - """ Context manager that makes the given logger the logger for legacy log entries. """ + ''' Context manager that makes the given logger the logger for legacy log entries. 
''' LogstashHandler.legacy_logger = logger try: yield @@ -132,14 +132,14 @@ def legacy_logger(logger): class LogstashHandler(logstash.TCPLogstashHandler): - """ + ''' A log handler that emits records to logstash. It also filters logs for being structlog entries. All other entries are diverted to a global `legacy_logger`. This legacy logger is supposed to be a structlog logger that turns legacy records into structlog entries with reasonable binds depending on the current execution context (e.g. parsing/normalizing, etc.). If no legacy logger is set, they get emitted as usual (e.g. non nomad logs, celery, dbs, etc.) - """ + ''' legacy_logger = None @@ -349,15 +349,15 @@ def configure_logging(): def create_uuid() -> str: - """ Returns a web-save base64 encoded random uuid (type 4). """ + ''' Returns a web-save base64 encoded random uuid (type 4). ''' return base64.b64encode(uuid.uuid4().bytes, altchars=b'-_').decode('utf-8')[0:-2] def get_logger(name, **kwargs): - """ + ''' Returns a structlog logger that is already attached with a logstash handler. Use additional *kwargs* to pre-bind some values to all events. - """ + ''' if name.startswith('nomad.'): name = '.'.join(name.split('.')[:2]) @@ -367,14 +367,14 @@ def get_logger(name, **kwargs): @contextmanager def lnr(logger, event, **kwargs): - """ + ''' A context manager that Logs aNd Raises all exceptions with the given logger. Arguments: logger: The logger that should be used for logging exceptions. event: the log message **kwargs: additional properties for the structured log - """ + ''' try: yield except HTTPException as e: @@ -387,7 +387,7 @@ def lnr(logger, event, **kwargs): @contextmanager def timer(logger, event, method='info', **kwargs): - """ + ''' A context manager that takes execution time and produces a log entry with said time. Arguments: @@ -399,7 +399,7 @@ def timer(logger, event, method='info', **kwargs): Returns: The method yields a dictionary that can be used to add further log data. - """ + ''' start = time.time() try: @@ -441,15 +441,15 @@ def to_tuple(self, *args): def chunks(list, n): - """ Chunks up the given list into parts of size n. """ + ''' Chunks up the given list into parts of size n. ''' for i in range(0, len(list), n): yield list[i:i + n] class POPO(dict): - """ + ''' A dict subclass that uses attributes as key/value pairs. - """ + ''' def __init__(self, **kwargs): super().__init__(**kwargs) @@ -470,10 +470,10 @@ class POPO(dict): class SleepTimeBackoff: - """ + ''' Provides increasingly larger sleeps. Useful when observing long running processes with unknown runtime. - """ + ''' def __init__(self, start_time: float = 0.1, max_time: float = 5): self.current_time = start_time @@ -517,10 +517,10 @@ class ETA: def common_prefix(paths): - """ + ''' Computes the longest common file path prefix (with respect to '/' separated segments). Returns empty string is ne common prefix exists. - """ + ''' common_prefix = None for path in paths: diff --git a/tests/__init__.py b/tests/__init__.py index e48f987ae9f076668b44484fd972c9599a585b40..05d19b7d8a11b9f922d7200eb9155c74f5dc3b9d 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' The nomad@FAIRDI tests are based on the pytest library. Pytest uses *fixtures* to modularize setup and teardown of mocks, infrastructure, and other context objects. 
The following depicts the used hierarchy of fixtures: @@ -20,7 +20,7 @@ The following depicts the used hierarchy of fixtures: .. image:: test_fixtures.png Otherwise the test submodules follow the names of the nomad code modules. -""" +''' from nomad import config diff --git a/tests/app/resource.py b/tests/app/resource.py index 877031f9df65f1d5c7e5bf12b4549f8d7df9a216..7215b4bee77b5a8f06723e6f49118213aa381726 100644 --- a/tests/app/resource.py +++ b/tests/app/resource.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' API endpoints that cause various scenarios to test general API aspects like logging, error handling, etc. -""" +''' from flask_restplus import Resource diff --git a/tests/app/test_api.py b/tests/app/test_api.py index 041825488e1c32842c85a683c3e8c912075ce409..5d37a087470ff03c05b11d6e891e6885a1d7128b 100644 --- a/tests/app/test_api.py +++ b/tests/app/test_api.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any +from typing import Any, Iterable import pytest import time import json @@ -30,7 +30,7 @@ from nomad.app.api.auth import generate_upload_token from nomad import search, parsing, files, config, utils, infrastructure from nomad.files import UploadFiles, PublicUploadFiles from nomad.processing import Upload, Calc, SUCCESS -from nomad.datamodel import UploadWithMetadata, CalcWithMetadata, User, Dataset +from nomad.datamodel import EntryMetadata, User, Dataset from tests.conftest import create_auth_headers, clear_elastic, create_test_structure from tests.test_files import example_file, example_file_mainfile, example_file_contents @@ -56,12 +56,11 @@ def test_user_signature_token(api, test_user_auth): return json.loads(rv.data)['signature_token'] -def get_upload_with_metadata(upload: dict) -> UploadWithMetadata: - """ Create a :class:`UploadWithMetadata` from a API upload json record. """ - return UploadWithMetadata( - upload_id=upload['upload_id'], calcs=[ - CalcWithMetadata(domain='dft', calc_id=calc['calc_id'], mainfile=calc['mainfile']) - for calc in upload['calcs']['results']]) +def get_upload_entries_metadata(upload: dict) -> Iterable[EntryMetadata]: + ''' Create an iterable of :class:`EntryMetadata` from an API upload JSON record.
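+
+    Illustrative use, mirroring the tests in this change::
+
+        entries = get_upload_entries_metadata(upload)
+        assert_search_upload(entries, additional_keys=['atoms', 'dft.system'])
+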
''' + return [ + EntryMetadata(domain='dft', calc_id=entry['calc_id'], mainfile=entry['mainfile']) + for entry in upload['calcs']['results']] def assert_zip_file(rv, files: int = -1, basename: bool = None): @@ -233,16 +232,14 @@ class TestUploads: upload = self.assert_upload(rv.data) assert len(upload['calcs']['results']) == 1 - upload_with_metadata = get_upload_with_metadata(upload) - assert_upload_files(upload_with_metadata, files.StagingUploadFiles) - assert_search_upload(upload_with_metadata, additional_keys=['atoms', 'dft.system']) + entries = get_upload_entries_metadata(upload) + assert_upload_files(upload_id, entries, files.StagingUploadFiles) + assert_search_upload(entries, additional_keys=['atoms', 'dft.system']) def assert_published(self, api, test_user_auth, upload_id, proc_infra, metadata={}): rv = api.get('/uploads/%s' % upload_id, headers=test_user_auth) upload = self.assert_upload(rv.data) - upload_with_metadata = get_upload_with_metadata(upload) - rv = api.post( '/uploads/%s' % upload_id, headers=test_user_auth, @@ -263,10 +260,22 @@ class TestUploads: assert upload_proc is not None assert upload_proc.published is True assert upload_proc.embargo_length == min(36, metadata.get('embargo_length', 36)) - upload_with_metadata = upload_proc.to_upload_with_metadata() + entries = upload_proc.entries_metadata() - assert_upload_files(upload_with_metadata, files.PublicUploadFiles, published=True) - assert_search_upload(upload_with_metadata, additional_keys=additional_keys, published=True) + for entry in entries: + for key, transform in { + 'comment': lambda e: e.comment, + 'with_embargo': lambda e: e.with_embargo, + 'references': lambda e: e.references, + 'coauthors': lambda e: [u.user_id for u in e.coauthors], + '_uploader': lambda e: e.uploader.user_id, + '_pid': lambda e: e.pid, + 'external_id': lambda e: e.external_id}.items(): + if key in metadata: + assert transform(entry) == metadata[key], key + + assert_upload_files(upload_id, entries, files.PublicUploadFiles, published=True) + assert_search_upload(entries, additional_keys=additional_keys, published=True) def block_until_completed(self, api, upload_id: str, test_user_auth): while True: @@ -504,6 +513,7 @@ class TestUploads: today = datetime.datetime.utcnow().date() +today_datetime = datetime.datetime(*today.timetuple()[:6]) class UploadFilesBasedTests: @@ -590,9 +600,9 @@ class UploadFilesBasedTests: calc_specs = 'r' if restricted else 'p' Upload.create(user=test_user, upload_id='test_upload') if in_staging: - _, upload_files = create_staging_upload('test_upload', calc_specs=calc_specs) + _, _, upload_files = create_staging_upload('test_upload', calc_specs=calc_specs) else: - _, upload_files = create_public_upload('test_upload', calc_specs=calc_specs) + _, _, upload_files = create_public_upload('test_upload', calc_specs=calc_specs) yield 'test_upload', authorized, auth_headers @@ -697,34 +707,35 @@ class TestRepo(): dataset_id='ds_id', name='ds_name', user_id=test_user.user_id, doi='ds_doi') example_dataset.m_x('me').create() - calc_with_metadata = CalcWithMetadata( - domain='dft', upload_id='example_upload_id', calc_id='0', upload_time=today) - calc_with_metadata.files = ['test/mainfile.txt'] - calc_with_metadata.apply_domain_metadata(normalized) + entry_metadata = EntryMetadata( + domain='dft', upload_id='example_upload_id', calc_id='0', upload_time=today_datetime) + entry_metadata.files = ['test/mainfile.txt'] + entry_metadata.apply_domain_metadata(normalized) - 
calc_with_metadata.update(datasets=[example_dataset.dataset_id]) + entry_metadata.m_update(datasets=[example_dataset.dataset_id]) - calc_with_metadata.update( + entry_metadata.m_update( calc_id='1', uploader=test_user.user_id, published=True, with_embargo=False) - search.Entry.from_calc_with_metadata(calc_with_metadata).save(refresh=True) + search.create_entry(entry_metadata).save(refresh=True) - calc_with_metadata.update( + entry_metadata.m_update( calc_id='2', uploader=other_test_user.user_id, published=True, - with_embargo=False, pid=2, upload_time=today - datetime.timedelta(days=5), + with_embargo=False, pid=2, upload_time=today_datetime - datetime.timedelta(days=5), external_id='external_2') - calc_with_metadata.update( - atoms=['Fe'], comment='this is a specific word', formula='AAA', basis_set='zzz') - search.Entry.from_calc_with_metadata(calc_with_metadata).save(refresh=True) + entry_metadata.m_update( + atoms=['Fe'], comment='this is a specific word', formula='AAA') + entry_metadata.dft.basis_set = 'zzz' + search.create_entry(entry_metadata).save(refresh=True) - calc_with_metadata.update( + entry_metadata.m_update( calc_id='3', uploader=other_test_user.user_id, published=False, with_embargo=False, pid=3, external_id='external_3') - search.Entry.from_calc_with_metadata(calc_with_metadata).save(refresh=True) + search.create_entry(entry_metadata).save(refresh=True) - calc_with_metadata.update( + entry_metadata.m_update( calc_id='4', uploader=other_test_user.user_id, published=True, with_embargo=True, pid=4, external_id='external_4') - search.Entry.from_calc_with_metadata(calc_with_metadata).save(refresh=True) + search.create_entry(entry_metadata).save(refresh=True) yield @@ -780,28 +791,27 @@ class TestRepo(): assert rv.status_code == 404 def test_search_datasets(self, api, example_elastic_calcs, no_warn, other_test_user_auth): - rv = api.get('/repo/?owner=all&datasets=true', headers=other_test_user_auth) + rv = api.get('/repo/?owner=all&group_datasets=true', headers=other_test_user_auth) data = self.assert_search(rv, 4) datasets = data.get('datasets', None) assert datasets is not None values = datasets['values'] assert values['ds_id']['total'] == 4 - assert values['ds_id']['examples'][0]['datasets'][0]['id'] == 'ds_id' + assert values['ds_id']['examples'][0]['datasets'][0]['dataset_id'] == 'ds_id' assert 'after' in datasets assert 'datasets' in data['statistics']['total']['all'] assert data['statistics']['total']['all']['datasets'] > 0 def test_search_uploads(self, api, example_elastic_calcs, no_warn, other_test_user_auth): - rv = api.get('/repo/?owner=all&uploads=true', headers=other_test_user_auth) + rv = api.get('/repo/?owner=all&group_uploads=true', headers=other_test_user_auth) data = self.assert_search(rv, 4) uploads = data.get('uploads', None) assert uploads is not None values = uploads['values'] - # the 4 uploads have "example upload id", but 3 have newer upload time. Therefore, - # only 3 calc will be in the last (and therefore used) bucket of 'example_upload_id'. 
- assert values['example_upload_id']['total'] == 3 + + assert values['example_upload_id']['total'] == 4 assert values['example_upload_id']['examples'][0]['upload_id'] == 'example_upload_id' assert 'after' in uploads assert 'uploads' in data['statistics']['total']['all'] @@ -930,10 +940,10 @@ class TestRepo(): def test_search_aggregation_metrics(self, api, example_elastic_calcs, no_warn, metrics): rv = api.get('/repo/?%s' % urlencode({ 'metrics': metrics, - 'statistics': True, - 'dft.groups': True, - 'datasets': True, - 'uploads': True}, doseq=True)) + 'group_statistics': True, + 'group_dft.groups': True, + 'group_datasets': True, + 'group_uploads': True}, doseq=True)) assert rv.status_code == 200 data = json.loads(rv.data) @@ -1169,10 +1179,10 @@ class TestEditRepo(): create_test_structure(meta_info, id, 2, 1, [], 0, metadata=metadata) entries = [ - dict(calc_id='1', upload_id='upload_1', user=test_user, published=True, embargo=False), - dict(calc_id='2', upload_id='upload_2', user=test_user, published=True, embargo=True), - dict(calc_id='3', upload_id='upload_2', user=test_user, published=False, embargo=False), - dict(calc_id='4', upload_id='upload_3', user=other_test_user, published=True, embargo=False)] + dict(calc_id='1', upload_id='upload_1', user=test_user, published=True, with_embargo=False), + dict(calc_id='2', upload_id='upload_2', user=test_user, published=True, with_embargo=True), + dict(calc_id='3', upload_id='upload_2', user=test_user, published=False, with_embargo=False), + dict(calc_id='4', upload_id='upload_3', user=other_test_user, published=True, with_embargo=False)] i = 0 for entry in entries: @@ -1253,6 +1263,7 @@ class TestEditRepo(): shared_with=[other_test_user.user_id]) rv = self.perform_edit(**edit_data, query=dict(upload_id='upload_1')) result = json.loads(rv.data) + assert rv.status_code == 200 actions = result.get('actions') for key in edit_data: assert key in actions @@ -1393,7 +1404,7 @@ def test_edit_lift_embargo(api, published, other_test_user_auth): } } })) - assert rv.status_code == 200 + assert rv.status_code == 200, rv.data assert not Calc.objects(calc_id=example_calc.calc_id).first().metadata['with_embargo'] Upload.get(published.upload_id).block_until_complete() @@ -1780,13 +1791,13 @@ class TestDataset: @pytest.fixture() def example_dataset_with_entry(self, mongo, elastic, example_datasets): - calc = CalcWithMetadata( + entry_metadata = EntryMetadata( domain='dft', calc_id='1', upload_id='1', published=True, with_embargo=False, datasets=['1']) Calc( calc_id='1', upload_id='1', create_time=datetime.datetime.now(), - metadata=calc.to_dict()).save() - search.Entry.from_calc_with_metadata(calc).save() + metadata=entry_metadata.m_to_dict()).save() + search.create_entry(entry_metadata).save() search.refresh() def test_delete_dataset(self, api, test_user_auth, example_dataset_with_entry): @@ -1818,12 +1829,12 @@ class TestDataset: assert rv.status_code == 400 def test_assign_doi_unpublished(self, api, test_user_auth, example_datasets): - calc = CalcWithMetadata( + entry_metadata = EntryMetadata( domain='dft', calc_id='1', upload_id='1', published=False, with_embargo=False, datasets=['1']) Calc( calc_id='1', upload_id='1', create_time=datetime.datetime.now(), - metadata=calc.to_dict()).save() + metadata=entry_metadata.m_to_dict()).save() rv = api.post('/datasets/ds1', headers=test_user_auth) assert rv.status_code == 400 diff --git a/tests/app/test_optimade.py b/tests/app/test_optimade.py index 
f63a7d7bd0d035e50ea9be6408aa8f9a52b921d0..c531bbf350f6aa2225b5200ce8b108ab1b819859 100644 --- a/tests/app/test_optimade.py +++ b/tests/app/test_optimade.py @@ -36,10 +36,10 @@ def test_get_entry(published: Upload): data = json.load(f) assert 'OptimadeEntry' in data search_result = search.SearchRequest().search_parameter('calc_id', calc_id).execute_paginated()['results'][0] - assert 'dft.optimade' in search.flat(search_result) + assert 'dft.optimade.chemical_formula_hill' in search.flat(search_result) -def test_no_optimade(meta_info, elastic, api): +def test_no_optimade(meta_info, mongo, elastic, api): create_test_structure(meta_info, 1, 2, 1, [], 0) create_test_structure(meta_info, 2, 2, 1, [], 0, optimade=False) search.refresh() diff --git a/tests/bravado_flask.py b/tests/bravado_flask.py index 2616a1bfe795d77392cfe407ee8920587dcb2b40..35c83c279aa641c461089659248b9acfd37d3cb7 100644 --- a/tests/bravado_flask.py +++ b/tests/bravado_flask.py @@ -25,7 +25,7 @@ class FlaskTestHttpClient(HttpClient): self._headers = headers def request(self, request_params, *args, **kwargs): - """ + ''' Taken from `bravado.http_client.HttpClient`. Args: @@ -40,7 +40,7 @@ class FlaskTestHttpClient(HttpClient): `bravado.http_future.HttpFuture`. Returns: `bravado_core.http_future.HttpFuture`: HTTP Future object - """ + ''' request_params.setdefault('headers', {}).update(self._headers) test_future = FlaskTestFutureAdapter(request_params, self._flask_client) @@ -48,7 +48,7 @@ class FlaskTestHttpClient(HttpClient): class FlaskTestFutureAdapter: - """ + ''' Mimics a :class:`concurrent.futures.Future` for the purposes of making it work with Bravado's :class:`bravado.http_future.HttpFuture` when simulating calls to a Falcon API. Those calls will be validated by Bravado. @@ -59,7 +59,7 @@ class FlaskTestFutureAdapter: falcon_api (`falcon.API`): API object to send the request to. response_encoding (str): Encoding that will be used to decode response's body. If set to None then the body won't be decoded. - """ + ''' def __init__(self, request_params, flask_client, response_encoding='utf-8'): self._flask_client = flask_client @@ -70,10 +70,10 @@ class FlaskTestFutureAdapter: self.connection_errors = None def result(self, **_): - """ + ''' Args: **_: Ignore all the keyword arguments (right now it's just timeout) passed by Bravado. - """ + ''' # Bravado will create the URL by appending request path to 'http://localhost' path = self._request_params['url'].replace('http://localhost', '') method = self._request_params.get('method') @@ -100,54 +100,54 @@ class FlaskTestFutureAdapter: class FlaskTestResponseAdapter(IncomingResponse): - """ + ''' Wraps a response from Falcon test client to provide a uniform interface expected by Bravado's :class:`bravado.http_future.HttpFuture`. Args: flask_response: Response to a call simulated with flask's test client. - """ + ''' def __init__(self, flask_response): self._response = flask_response @property def status_code(self): - """ + ''' Returns: int: HTTP status code - """ + ''' return self._response.status_code @property def text(self): - """ + ''' Returns: str: Textual representation of the response's body. - """ + ''' return self._response.data @property def reason(self): - """ + ''' Returns: str: Reason-phrase of the HTTP response (e.g. "OK", or "Not Found") - """ + ''' # status codes from Falcon look like this: "200 OK" return self._response.status[4:] @property def headers(self): - """ + ''' Returns: dict: Headers attached to the response. 
- """ + ''' return self._response.headers def json(self, **kwargs): - """ + ''' Args: **kwargs: This is a part of the interface, but we don't do anything with it. Returns: dict: JSON representation of the response's body. - """ + ''' return json.loads(self._response.data) diff --git a/tests/conftest.py b/tests/conftest.py index 9584b3dd8e23bac1dc6be5b091a1ab03a22d83b9..ff8713b2c870e656fcc40a3718e0f59e558b560f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -35,7 +35,7 @@ from nomadcore.local_meta_info import loadJsonFile import nomad_meta_info from nomad import config, infrastructure, parsing, processing, app, search, utils -from nomad.datamodel import User, CalcWithMetadata +from nomad.datamodel import User, EntryMetadata from nomad.parsing import LocalBackend from tests import test_parsing, test_normalizing @@ -77,7 +77,7 @@ def raw_files_infra(): @pytest.fixture(scope='function') def raw_files(raw_files_infra): - """ Provides cleaned out files directory structure per function. Clears files after test. """ + ''' Provides cleaned out files directory structure per function. Clears files after test. ''' directories = [config.fs.staging, config.fs.public, config.fs.tmp] for directory in directories: if not os.path.exists(directory): @@ -123,10 +123,10 @@ def celery_config(): @pytest.fixture(scope='session') def purged_app(celery_session_app): - """ + ''' Purges all pending tasks of the celery app before test. This is necessary to remove tasks from the queue that might be 'left over' from prior tests. - """ + ''' celery_session_app.control.purge() yield celery_session_app @@ -140,7 +140,7 @@ def celery_inspect(purged_app): # 'bleeding' into successive tests. @pytest.fixture(scope='function') def worker(mongo, celery_session_worker, celery_inspect): - """ Provides a clean worker (no old tasks) per function. Waits for all tasks to be completed. """ + ''' Provides a clean worker (no old tasks) per function. Waits for all tasks to be completed. ''' yield # wait until there no more active tasks, to leave clean worker and queues for the next @@ -164,7 +164,7 @@ def mongo_infra(monkeysession): @pytest.fixture(scope='function') def mongo(mongo_infra): - """ Provides a cleaned mocked mongo per function. """ + ''' Provides a cleaned mocked mongo per function. ''' # Some test cases need to reset the database connection if infrastructure.mongo_client != mongo_infra: mongo_infra = infrastructure.mongo_client @@ -174,7 +174,7 @@ def mongo(mongo_infra): @pytest.fixture(scope='session') def elastic_infra(monkeysession): - """ Provides elastic infrastructure to the session """ + ''' Provides elastic infrastructure to the session ''' monkeysession.setattr('nomad.config.elastic.index_name', 'nomad_fairdi_test') try: return infrastructure.setup_elastic() @@ -199,7 +199,7 @@ def clear_elastic(elastic): @pytest.fixture(scope='function') def elastic(elastic_infra): - """ Provides a clean elastic per function. Clears elastic before test. """ + ''' Provides a clean elastic per function. Clears elastic before test. 
''' clear_elastic(elastic_infra) assert infrastructure.elastic_client is not None @@ -280,7 +280,7 @@ def keycloak(monkeypatch): @pytest.fixture(scope='function') def proc_infra(worker, elastic, mongo, raw_files): - """ Combines all fixtures necessary for processing (elastic, worker, files, mongo) """ + ''' Combines all fixtures necessary for processing (elastic, worker, files, mongo) ''' return dict(elastic=elastic) @@ -384,10 +384,10 @@ def with_warn(caplog): assert count > 0 -""" +''' Fixture for mocked SMTP server for testing. Based on https://gist.github.com/akheron/cf3863cdc424f08929e4cb7dc365ef23. -""" +''' RecordedMessage = namedtuple( 'RecordedMessage', @@ -527,31 +527,38 @@ def example_user_metadata(other_test_user, test_user) -> dict: } +@pytest.fixture(scope='module') +def internal_example_user_metadata(example_user_metadata) -> dict: + return { + key[1:] if key[0] == '_' else key: value + for key, value in example_user_metadata.items()} + + @pytest.fixture(scope='session') def parsed(example_mainfile: Tuple[str, str]) -> parsing.LocalBackend: - """ Provides a parsed calculation in the form of a LocalBackend. """ + ''' Provides a parsed calculation in the form of a LocalBackend. ''' parser, mainfile = example_mainfile return test_parsing.run_parser(parser, mainfile) @pytest.fixture(scope='session') def parsed_ems() -> parsing.LocalBackend: - """ Provides a parsed experiment in the form of a LocalBackend. """ + ''' Provides a parsed experiment in the form of a LocalBackend. ''' return test_parsing.run_parser('parsers/skeleton', 'tests/data/parsers/skeleton/example.metadata.json') @pytest.fixture(scope='session') def normalized(parsed: parsing.LocalBackend) -> parsing.LocalBackend: - """ Provides a normalized calculation in the form of a LocalBackend. """ + ''' Provides a normalized calculation in the form of a LocalBackend. ''' return test_normalizing.run_normalize(parsed) @pytest.fixture(scope='function') def uploaded(example_upload: str, raw_files) -> Tuple[str, str]: - """ + ''' Provides a uploaded with uploaded example file and gives the upload_id. Clears files after test. - """ + ''' example_upload_id = os.path.basename(example_upload).replace('.zip', '') return example_upload_id, example_upload @@ -565,9 +572,9 @@ def non_empty_uploaded(non_empty_example_upload: str, raw_files) -> Tuple[str, s @pytest.mark.timeout(config.tests.default_timeout) @pytest.fixture(scope='function') def processed(uploaded: Tuple[str, str], test_user: User, proc_infra) -> processing.Upload: - """ + ''' Provides a processed upload. Upload was uploaded with test_user. - """ + ''' return test_processing.run_processing(uploaded, test_user) @@ -586,19 +593,19 @@ def processeds(non_empty_example_upload: str, test_user: User, proc_infra) -> Li @pytest.mark.timeout(config.tests.default_timeout) @pytest.fixture(scope='function') def non_empty_processed(non_empty_uploaded: Tuple[str, str], test_user: User, proc_infra) -> processing.Upload: - """ + ''' Provides a processed upload. Upload was uploaded with test_user. - """ + ''' return test_processing.run_processing(non_empty_uploaded, test_user) @pytest.mark.timeout(config.tests.default_timeout) @pytest.fixture(scope='function') -def published(non_empty_processed: processing.Upload, example_user_metadata) -> processing.Upload: - """ +def published(non_empty_processed: processing.Upload, internal_example_user_metadata) -> processing.Upload: + ''' Provides a processed upload. Upload was uploaded with test_user. 
- """ - non_empty_processed.compress_and_set_metadata(example_user_metadata) + ''' + non_empty_processed.compress_and_set_metadata(internal_example_user_metadata) non_empty_processed.publish_upload() try: non_empty_processed.block_until_complete(interval=.01) @@ -611,9 +618,9 @@ def published(non_empty_processed: processing.Upload, example_user_metadata) -> @pytest.mark.timeout(config.tests.default_timeout) @pytest.fixture(scope='function') def published_wo_user_metadata(non_empty_processed: processing.Upload) -> processing.Upload: - """ + ''' Provides a processed upload. Upload was uploaded with test_user. - """ + ''' non_empty_processed.publish_upload() try: non_empty_processed.block_until_complete(interval=.01) @@ -625,7 +632,7 @@ def published_wo_user_metadata(non_empty_processed: processing.Upload) -> proces @pytest.fixture def reset_config(): - """ Fixture that resets configuration. """ + ''' Fixture that resets configuration. ''' service = config.service log_level = config.console_log_level yield None @@ -636,14 +643,14 @@ def reset_config(): @pytest.fixture def reset_infra(mongo, elastic): - """ Fixture that resets infrastructure after deleting db or search index. """ + ''' Fixture that resets infrastructure after deleting db or search index. ''' yield None def create_test_structure( meta_info, id: int, h: int, o: int, extra: List[str], periodicity: int, optimade: bool = True, metadata: dict = None): - """ Creates a calculation in Elastic and Mongodb with the given properties. + ''' Creates a calculation in Elastic and Mongodb with the given properties. Does require initialized :func:`elastic_infra` and :func:`mongo_infra`. @@ -656,7 +663,7 @@ def create_test_structure( periodicity: The number of dimensions to repeat the structure in optimade: A boolean. Iff true the entry will have optimade metadata. Default is True. metadata: Additional (user) metadata. - """ + ''' atom_labels = ['H' for i in range(0, h)] + ['O' for i in range(0, o)] + extra test_vector = np.array([0, 0, 0]) @@ -679,19 +686,19 @@ def create_test_structure( backend.closeSection('section_run', 0) backend = run_normalize(backend) - calc = CalcWithMetadata( + calc = EntryMetadata( domain='dft', upload_id='test_uload_id', calc_id='test_calc_id_%d' % id, mainfile='test_mainfile', published=True, with_embargo=False) calc.apply_domain_metadata(backend) if metadata is not None: - calc.update(**metadata) + calc.m_update(**metadata) if not optimade: - calc.optimade = None # type: ignore + calc.dft.optimade = None - proc_calc = processing.Calc.from_calc_with_metadata(calc) + proc_calc = processing.Calc.from_entry_metadata(calc) proc_calc.save() - search_entry = search.Entry.from_calc_with_metadata(calc) + search_entry = search.create_entry(calc) search_entry.save() assert processing.Calc.objects(calc_id__in=[calc.calc_id]).count() == 1 diff --git a/tests/data/parsers/octopus/stdout.txt b/tests/data/parsers/octopus/stdout.txt index 2b43895be9dd549f5fbadb074ab657f0b9ccf44a..94b5baabf3b5b659be5aa5f48b5100bcde7d7290 100644 --- a/tests/data/parsers/octopus/stdout.txt +++ b/tests/data/parsers/octopus/stdout.txt @@ -7,7 +7,7 @@ _.._ |0) ~ (0) | _.---'`__.-( (_. __.--'`_.. '.__.\ '--. \_.-' ,.--'` `""` ( ,.--'` ',__ /./; ;, '.__.'` __ - _`) ) .---.__.' / | |\ \__..--"" """--.,_ + _`) ) .---.__.' / | |\ \__..--"" '''--.,_ `---' .'.''-._.-'`_./ /\ '. \ _.-~~~````~~~-._`-.__.' | | .' _.-' | | \ \ '. `~---` \ \/ .' \ \ '. 
'-._) diff --git a/tests/processing/test_data.py b/tests/processing/test_data.py index 537aa2c84718c353c00790bd38824941a0a9f328..2a2f401d140c9514c09bcd3d0a2c8aa885fd52f0 100644 --- a/tests/processing/test_data.py +++ b/tests/processing/test_data.py @@ -129,9 +129,9 @@ def test_processing_with_large_dir(test_user, proc_infra): assert len(calc.warnings) == 1 -def test_publish(non_empty_processed: Upload, no_warn, example_user_metadata, monkeypatch): +def test_publish(non_empty_processed: Upload, no_warn, internal_example_user_metadata, monkeypatch): processed = non_empty_processed - processed.compress_and_set_metadata(example_user_metadata) + processed.compress_and_set_metadata(internal_example_user_metadata) additional_keys = ['with_embargo'] @@ -141,17 +141,17 @@ def test_publish(non_empty_processed: Upload, no_warn, example_user_metadata, mo except Exception: pass - upload = processed.to_upload_with_metadata(example_user_metadata) + entries = processed.entries_metadata(internal_example_user_metadata) - assert_upload_files(upload, PublicUploadFiles, published=True) - assert_search_upload(upload, additional_keys, published=True) + assert_upload_files(processed.upload_id, entries, PublicUploadFiles, published=True) + assert_search_upload(entries, additional_keys, published=True) - assert_processing(Upload.get(upload.upload_id, include_published=True), published=True) + assert_processing(Upload.get(processed.upload_id, include_published=True), published=True) -def test_republish(non_empty_processed: Upload, no_warn, example_user_metadata, monkeypatch): +def test_republish(non_empty_processed: Upload, no_warn, internal_example_user_metadata, monkeypatch): processed = non_empty_processed - processed.compress_and_set_metadata(example_user_metadata) + processed.compress_and_set_metadata(internal_example_user_metadata) additional_keys = ['with_embargo'] @@ -162,20 +162,20 @@ def test_republish(non_empty_processed: Upload, no_warn, example_user_metadata, processed.publish_upload() processed.block_until_complete(interval=.01) - upload = processed.to_upload_with_metadata(example_user_metadata) + entries = processed.entries_metadata(internal_example_user_metadata) - assert_upload_files(upload, PublicUploadFiles, published=True) - assert_search_upload(upload, additional_keys, published=True) + assert_upload_files(processed.upload_id, entries, PublicUploadFiles, published=True) + assert_search_upload(entries, additional_keys, published=True) def test_publish_failed( - non_empty_uploaded: Tuple[str, str], example_user_metadata, test_user, + non_empty_uploaded: Tuple[str, str], internal_example_user_metadata, test_user, monkeypatch, proc_infra): mock_failure(Calc, 'parsing', monkeypatch) processed = run_processing(non_empty_uploaded, test_user) - processed.compress_and_set_metadata(example_user_metadata) + processed.compress_and_set_metadata(internal_example_user_metadata) additional_keys = ['with_embargo'] @@ -185,9 +185,9 @@ def test_publish_failed( except Exception: pass - upload = processed.to_upload_with_metadata(example_user_metadata) + entries = processed.entries_metadata(internal_example_user_metadata) - assert_search_upload(upload, additional_keys, published=True, processed=False) + assert_search_upload(entries, additional_keys, published=True, processed=False) @pytest.mark.timeout(config.tests.default_timeout) @@ -211,7 +211,7 @@ def test_process_non_existing(proc_infra, test_user, with_error): @pytest.mark.timeout(config.tests.default_timeout) @pytest.mark.parametrize('with_failure', [None, 
'before', 'after', 'not-matched']) -def test_re_processing(published: Upload, example_user_metadata, monkeypatch, with_failure): +def test_re_processing(published: Upload, internal_example_user_metadata, monkeypatch, with_failure): if with_failure == 'not-matched': monkeypatch.setattr('nomad.config.reprocess_unmatched', False) @@ -249,7 +249,7 @@ def test_re_processing(published: Upload, example_user_metadata, monkeypatch, wi shutil.copyfile( raw_files, published.upload_files.join_file('raw-restricted.plain.zip').os_path) - upload = published.to_upload_with_metadata(example_user_metadata) + entries = published.entries_metadata(internal_example_user_metadata) # reprocess monkeypatch.setattr('nomad.config.version', 're_process_test_version') @@ -292,10 +292,10 @@ def test_re_processing(published: Upload, example_user_metadata, monkeypatch, wi assert old_log_lines != new_log_lines # assert maintained user metadata (mongo+es) - assert_upload_files(upload, PublicUploadFiles, published=True) - assert_search_upload(upload, published=True) + assert_upload_files(published.upload_id, entries, PublicUploadFiles, published=True) + assert_search_upload(entries, published=True) if with_failure not in ['after', 'not-matched']: - assert_processing(Upload.get(upload.upload_id, include_published=True), published=True) + assert_processing(Upload.get(published.upload_id, include_published=True), published=True) # assert changed calc metadata (mongo) if with_failure not in ['after', 'not-matched']: @@ -306,7 +306,7 @@ def test_re_processing(published: Upload, example_user_metadata, monkeypatch, wi @pytest.mark.timeout(config.tests.default_timeout) @pytest.mark.parametrize('with_failure', [None, 'before', 'after']) -def test_re_pack(published: Upload, example_user_metadata, monkeypatch, with_failure): +def test_re_pack(published: Upload, monkeypatch, with_failure): upload_id = published.upload_id calc = Calc.objects(upload_id=upload_id).first() assert calc.metadata['with_embargo'] @@ -403,6 +403,6 @@ def test_ems_data(proc_infra, test_user): assert upload.total_calcs == 1 assert len(upload.calcs) == 1 - upload_with_metadata = upload.to_upload_with_metadata() - assert_upload_files(upload_with_metadata, StagingUploadFiles, published=False) - assert_search_upload(upload_with_metadata, additional_keys, published=False) + entries = upload.entries_metadata() + assert_upload_files(upload.upload_id, entries, StagingUploadFiles, published=False) + assert_search_upload(entries, additional_keys, published=False) diff --git a/tests/test_client.py b/tests/test_client.py index c2ee35226d5f19368b162abfec3342e2abe745c7..354bca0f0cd0318be179021f26f97be865896764 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -15,7 +15,7 @@ import time from nomad.processing import SUCCESS -from nomad.datamodel import CalcWithMetadata +from nomad.datamodel import EntryMetadata from tests.test_files import example_file from tests.test_search import create_entry @@ -37,8 +37,8 @@ def test_upload(bravado, proc_infra, no_warn): def test_get_repo_calc(bravado, proc_infra, raw_files): - create_entry(CalcWithMetadata( - domain='dft', calc_id=0, upload_id='test_upload', published=True, with_embargo=False)) + create_entry(EntryMetadata( + domain='dft', calc_id='0', upload_id='test_upload', published=True, with_embargo=False)) repo = bravado.repo.get_repo_calc(upload_id='test_upload', calc_id='0').response().result assert repo is not None assert repo['calc_id'] is not None diff --git a/tests/test_datamodel.py b/tests/test_datamodel.py index 
f00ea36b5b0e5e6cc6bd81a2f9d30e0996dfdabd..2125b99d895cf60d334556763a0ae40ae168e056 100644 --- a/tests/test_datamodel.py +++ b/tests/test_datamodel.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" +''' A generator for random test calculations. -""" +''' import random from essential_generators import DocumentGenerator @@ -65,49 +65,50 @@ def _gen_ref(): return random.choice(references) -def generate_calc(pid: int = 0, calc_id: str = None, upload_id: str = None) -> datamodel.CalcWithMetadata: +def generate_calc(pid: int = 0, calc_id: str = None, upload_id: str = None) -> datamodel.EntryMetadata: random.seed(pid) - self = datamodel.DFTCalcWithMetadata() - - self.upload_id = upload_id if upload_id is not None else utils.create_uuid() - self.calc_id = calc_id if calc_id is not None else utils.create_uuid() - - self.upload_time = datetime.datetime.utcnow() - self.calc_hash = utils.create_uuid() - self.pid = pid - self.mainfile = random.choice(filepaths) - self.files = list([self.mainfile] + random.choices(filepaths, k=random.choice(low_numbers_for_files))) - self.uploader = _gen_user() - - self.with_embargo = random.choice([True, False]) - self.published = True - self.coauthors = list(_gen_user() for _ in range(0, random.choice(low_numbers_for_refs_and_datasets))) - self.shared_with = list(_gen_user() for _ in range(0, random.choice(low_numbers_for_refs_and_datasets))) - self.comment = random.choice(comments) - self.references = list(_gen_ref() for _ in range(0, random.choice(low_numbers_for_refs_and_datasets))) - self.datasets = list( + entry = datamodel.EntryMetadata() + + entry.upload_id = upload_id if upload_id is not None else utils.create_uuid() + entry.calc_id = calc_id if calc_id is not None else utils.create_uuid() + + entry.upload_time = datetime.datetime.utcnow() + entry.calc_hash = utils.create_uuid() + entry.pid = pid + entry.mainfile = random.choice(filepaths) + entry.files = list([entry.mainfile] + random.choices(filepaths, k=random.choice(low_numbers_for_files))) + entry.uploader = _gen_user() + + entry.with_embargo = random.choice([True, False]) + entry.published = True + entry.coauthors = list(_gen_user() for _ in range(0, random.choice(low_numbers_for_refs_and_datasets))) + entry.shared_with = list(_gen_user() for _ in range(0, random.choice(low_numbers_for_refs_and_datasets))) + entry.comment = random.choice(comments) + entry.references = list(_gen_ref() for _ in range(0, random.choice(low_numbers_for_refs_and_datasets))) + entry.datasets = list( _gen_dataset() for _ in range(0, random.choice(low_numbers_for_refs_and_datasets))) - self.atoms = list(random.choices(chemical_symbols[1:], k=random.choice(low_numbers_for_atoms))) - self.formula = ''.join('%s%d' % (atom, random.choice(low_numbers_for_atoms)) for atom in self.atoms) - self.formula = self.formula.replace('1', '') + entry.atoms = list(random.choices(chemical_symbols[1:], k=random.choice(low_numbers_for_atoms))) + entry.formula = ''.join('%s%d' % (atom, random.choice(low_numbers_for_atoms)) for atom in entry.atoms) + entry.formula = entry.formula.replace('1', '') - self.basis_set = random.choice(basis_sets) - self.xc_functional = random.choice(xc_functionals) - self.system = random.choice(systems) - self.crystal_system = random.choice(crystal_systems) + dft_metadata = entry.m_create(datamodel.DFTMetadata) + dft_metadata.basis_set = random.choice(basis_sets) + dft_metadata.xc_functional = random.choice(xc_functionals) + dft_metadata.system = 
random.choice(systems) + dft_metadata.crystal_system = random.choice(crystal_systems) spacegroup = random.randint(1, 225) - self.spacegroup = str(spacegroup) - self.spacegroup_symbol = Spacegroup(spacegroup).symbol - self.code_name = random.choice(codes) - self.code_version = '1.0.0' + dft_metadata.spacegroup = str(spacegroup) + dft_metadata.spacegroup_symbol = Spacegroup(spacegroup).symbol + dft_metadata.code_name = random.choice(codes) + dft_metadata.code_version = '1.0.0' - self.n_total_energies = random.choice(range(0, 5)) - self.geometries = ['%d' % random.randint(1, 500), '%d' % random.randint(1, 500)] + dft_metadata.n_total_energies = random.choice(range(0, 5)) + dft_metadata.geometries = ['%d' % random.randint(1, 500), '%d' % random.randint(1, 500)] - return self + return entry if __name__ == '__main__': @@ -130,7 +131,6 @@ if __name__ == '__main__': for calcs_per_upload in utils.chunks(range(0, n_calcs), int(n_calcs / n_uploads)): upload_id = utils.create_uuid() - upload = datamodel.UploadWithMetadata(upload_id=upload_id) upload_files = files.StagingUploadFiles( upload_id=upload_id, create=True, is_authorized=lambda: True) @@ -150,7 +150,7 @@ if __name__ == '__main__': with upload_files.archive_log_file(calc.calc_id, 'wt') as f: f.write('this is a generated test file') - search_entry = search.Entry.from_calc_with_metadata(calc) + search_entry = search.Entry.from_entry_metadata(calc) search_entry.n_total_energies = random.choice(low_numbers_for_total_energies) search_entry.n_geometries = low_numbers_for_geometries for _ in range(0, random.choice(search_entry.n_geometries)): @@ -160,11 +160,9 @@ if __name__ == '__main__': pid += 1 calcs.append(calc) - upload.calcs = calcs - bulk( infrastructure.elastic_client, [entry.to_dict(include_meta=True) for entry in search_entries]) - upload_files.pack(upload) + upload_files.pack(calcs) upload_files.delete() diff --git a/tests/test_files.py b/tests/test_files.py index 62d071f4d82e847a3029788250dbf48ba0242cce..ce9d0cffa8952f9f7f294ec7ed517d86d1368a8b 100644 --- a/tests/test_files.py +++ b/tests/test_files.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
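The generate_calc() rewrite above is the heart of this refactoring: domain-specific quantities move from flat attributes on DFTCalcWithMetadata into a DFTMetadata sub-section of the new EntryMetadata class, created with m_create(). A minimal sketch of that construction pattern, using only names that appear in this diff; the concrete values (mainfile path, code name) are placeholders, not part of the patch:

    from nomad import datamodel, utils

    # general entry metadata, as generate_calc() now builds it
    entry = datamodel.EntryMetadata()
    entry.upload_id = utils.create_uuid()
    entry.calc_id = utils.create_uuid()
    entry.mainfile = 'examples/template.json'  # placeholder path

    # domain data no longer lives on flat attributes but in a sub-section
    dft = entry.m_create(datamodel.DFTMetadata)
    dft.code_name = 'VASP'      # placeholder value
    dft.code_version = '1.0.0'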
-from typing import Generator, Any, Dict, Tuple +from typing import Generator, Any, Dict, Tuple, Iterable import os import os.path import shutil @@ -22,8 +22,7 @@ import itertools import zipfile import re -from nomad import config -from nomad.datamodel import UploadWithMetadata, CalcWithMetadata +from nomad import config, datamodel from nomad.files import DirectoryObject, PathObject from nomad.files import StagingUploadFiles, PublicUploadFiles, UploadFiles, Restricted, \ ArchiveBasedStagingUploadFiles @@ -31,10 +30,10 @@ from nomad.files import StagingUploadFiles, PublicUploadFiles, UploadFiles, Rest from tests.utils import assert_exception -CalcWithFiles = Tuple[CalcWithMetadata, str] -UploadWithFiles = Tuple[UploadWithMetadata, UploadFiles] -StagingUploadWithFiles = Tuple[UploadWithMetadata, StagingUploadFiles] -PublicUploadWithFiles = Tuple[UploadWithMetadata, PublicUploadFiles] +CalcWithFiles = Tuple[datamodel.EntryMetadata, str] +UploadWithFiles = Tuple[str, Iterable[datamodel.EntryMetadata], UploadFiles] +StagingUploadWithFiles = Tuple[str, Iterable[datamodel.EntryMetadata], StagingUploadFiles] +PublicUploadWithFiles = Tuple[str, Iterable[datamodel.EntryMetadata], PublicUploadFiles] # example_file uses an artificial parser for faster test execution, can also be # changed to examples_vasp.zip for using vasp parser @@ -56,7 +55,7 @@ example_data = dict(test_key='test_value') @pytest.fixture(scope='function', autouse=True) def raw_files_on_all_tests(raw_files): - """ Autouse fixture to apply raw_files to all tests. """ + ''' Autouse fixture to apply raw_files to all tests. ''' pass @@ -125,9 +124,9 @@ example_calc_id = example_calc['calc_id'] def generate_example_calc( calc_id: int, with_mainfile_prefix: bool, subdirectory: str = None, **kwargs) -> CalcWithFiles: - """ Generate an example calc with :class:`CalcWithMetadata` and rawfile. """ + ''' Generate an example calc with :class:`EntryMetadata` and rawfile. 
''' - example_calc = CalcWithMetadata(domain='dft', calc_id=str(calc_id)) + example_calc = datamodel.EntryMetadata(domain='dft', calc_id=str(calc_id)) if with_mainfile_prefix: mainfile = '%d.template.json' % calc_id @@ -138,7 +137,7 @@ def generate_example_calc( mainfile = os.path.join(subdirectory, mainfile) example_calc.mainfile = mainfile - example_calc.update(**kwargs) + example_calc.m_update(**kwargs) example_file = os.path.join(config.fs.tmp, 'example.zip') example_calc.files = [] @@ -209,8 +208,8 @@ class UploadFilesContract(UploadFilesFixtures): assert UploadFiles.get(empty_test_upload.upload_id).__class__ == empty_test_upload.__class__ def test_rawfile(self, test_upload: UploadWithFiles): - upload, upload_files = test_upload - for calc in upload.calcs: + _, entries, upload_files = test_upload + for calc in entries: try: for file_path in calc.files: with upload_files.raw_file(file_path) as f: @@ -222,8 +221,8 @@ class UploadFilesContract(UploadFilesFixtures): assert calc.with_embargo def test_rawfile_size(self, test_upload: UploadWithFiles): - upload, upload_files = test_upload - for calc in upload.calcs: + _, entries, upload_files = test_upload + for calc in entries: try: for file_path in calc.files: assert upload_files.raw_file_size(file_path) > 0 @@ -235,13 +234,13 @@ class UploadFilesContract(UploadFilesFixtures): @pytest.mark.parametrize('prefix', [None, 'examples']) def test_raw_file_manifest(self, test_upload: UploadWithFiles, prefix: str): - _, upload_files = test_upload + _, _, upload_files = test_upload raw_files = list(upload_files.raw_file_manifest(path_prefix=prefix)) assert_example_files(raw_files) @pytest.mark.parametrize('prefix', [None, 'examples_template']) def test_raw_file_list(self, test_upload: UploadWithFiles, prefix: str): - _, upload_files = test_upload + _, _, upload_files = test_upload raw_files = list(upload_files.raw_file_list(directory=prefix)) if prefix is None: assert len(raw_files) == 0 @@ -256,8 +255,8 @@ class UploadFilesContract(UploadFilesFixtures): @pytest.mark.parametrize('test_logs', [True, False]) def test_archive(self, test_upload: UploadWithFiles, test_logs: bool): - upload, upload_files = test_upload - calcs = upload.calcs_dict + _, entries, upload_files = test_upload + calcs_dict = {entry.calc_id: entry for entry in entries} try: if test_logs: with upload_files.archive_log_file(example_calc_id, 'rt') as f: @@ -267,26 +266,26 @@ class UploadFilesContract(UploadFilesFixtures): assert json.load(f) == json.loads(example_archive_contents) if not upload_files._is_authorized(): - assert not calcs.get(example_calc_id).with_embargo + assert not calcs_dict.get(example_calc_id).with_embargo except Restricted: assert not upload_files._is_authorized() - assert calcs.get(example_calc_id).with_embargo + assert calcs_dict.get(example_calc_id).with_embargo def test_archive_size(self, test_upload: UploadWithFiles): - upload, upload_files = test_upload - calcs = upload.calcs_dict + _, entries, upload_files = test_upload + calcs_dict = {entry.calc_id: entry for entry in entries} try: assert upload_files.archive_file_size(example_calc_id) > 0 if not upload_files._is_authorized(): - assert not calcs.get(example_calc_id).with_embargo + assert not calcs_dict.get(example_calc_id).with_embargo except Restricted: assert not upload_files._is_authorized() - assert calcs.get(example_calc_id).with_embargo + assert calcs_dict.get(example_calc_id).with_embargo def create_staging_upload(upload_id: str, calc_specs: str) -> StagingUploadWithFiles: - """ + ''' Create an 
upload according to given spec. Additional arguments are given to the StagingUploadFiles contstructor. @@ -297,9 +296,8 @@ def create_staging_upload(upload_id: str, calc_specs: str) -> StagingUploadWithF The calcs will be copies of calcs in `example_file`. First calc is at top level, following calcs will be put under 1/, 2/, etc. All calcs with capital `P`/`R` will be put in the same directory under multi/. - """ + ''' upload_files = StagingUploadFiles(upload_id, create=True, is_authorized=lambda: True) - upload = UploadWithMetadata(upload_id=upload_id) calcs = [] prefix = 0 @@ -327,8 +325,7 @@ def create_staging_upload(upload_id: str, calc_specs: str) -> StagingUploadWithF prefix += 1 assert len(calcs) == len(calc_specs) - upload.calcs = calcs - return upload, upload_files + return upload_id, calcs, upload_files class TestStagingUploadFiles(UploadFilesContract): @@ -353,27 +350,27 @@ class TestStagingUploadFiles(UploadFilesContract): assert len(content) > 0 def test_write_archive(self, test_upload: StagingUploadWithFiles): - _, upload_files = test_upload + _, _, upload_files = test_upload assert json.load(upload_files.archive_file(example_calc_id, 'rt')) == json.loads(example_archive_contents) def test_calc_id(self, test_upload: StagingUploadWithFiles): - _, upload_files = test_upload + _, _, upload_files = test_upload assert upload_files.calc_id(example_file_mainfile) is not None def test_pack(self, test_upload: StagingUploadWithFiles): - upload, upload_files = test_upload - upload_files.pack(upload) + _, entries, upload_files = test_upload + upload_files.pack(entries) @pytest.mark.parametrize('with_mainfile', [True, False]) def test_calc_files(self, test_upload: StagingUploadWithFiles, with_mainfile): - upload, upload_files = test_upload - for calc in upload.calcs: + _, entries, upload_files = test_upload + for calc in entries: mainfile = calc.mainfile calc_files = upload_files.calc_files(mainfile, with_mainfile=with_mainfile) assert_example_files(calc_files, with_mainfile=with_mainfile) def test_delete(self, test_upload: StagingUploadWithFiles): - _, upload_files = test_upload + _, _, upload_files = test_upload upload_files.delete() assert not upload_files.exists() @@ -396,17 +393,17 @@ class TestArchiveBasedStagingUploadFiles(UploadFilesFixtures): def create_public_upload( upload_id: str, calc_specs: str, **kwargs) -> PublicUploadWithFiles: - upload, upload_files = create_staging_upload(upload_id, calc_specs) - upload_files.pack(upload) + _, entries, upload_files = create_staging_upload(upload_id, calc_specs) + upload_files.pack(entries) upload_files.delete() - return upload, PublicUploadFiles(upload_id, **kwargs) + return upload_id, entries, PublicUploadFiles(upload_id, **kwargs) class TestPublicUploadFiles(UploadFilesContract): @pytest.fixture(scope='function') def empty_test_upload(self, test_upload_id: str) -> UploadFiles: - _, upload_files = create_public_upload( + _, _, upload_files = create_public_upload( test_upload_id, calc_specs='', is_authorized=lambda: True) return upload_files @@ -415,13 +412,13 @@ class TestPublicUploadFiles(UploadFilesContract): ['r', 'rr', 'pr', 'rp', 'p', 'pp', 'RP', 'RR', 'PP'], [True, False])) def test_upload(self, request, test_upload_id: str) -> PublicUploadWithFiles: calc_specs, protected = request.param - upload, upload_files = create_staging_upload(test_upload_id, calc_specs=calc_specs) - upload_files.pack(upload) + _, entries, upload_files = create_staging_upload(test_upload_id, calc_specs=calc_specs) + upload_files.pack(entries) 
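The fixture changes above replace the old UploadWithMetadata wrapper with a plain (upload_id, entries, upload_files) tuple, and pack() and assert_upload_files() now receive the EntryMetadata objects directly. A condensed sketch of that flow, reusing the helpers defined in tests/test_files.py in this diff; calc_specs='pr' is just one of the parametrized values used above:

    from nomad.files import PublicUploadFiles
    from tests.test_files import create_staging_upload, assert_upload_files

    # stage two example calcs ('p' = public, 'r' = restricted), publish, then verify
    upload_id, entries, staging_files = create_staging_upload('test_upload_id', calc_specs='pr')
    staging_files.pack(entries)   # pack() takes the entry metadata, not an upload object
    staging_files.delete()
    assert_upload_files(upload_id, entries, PublicUploadFiles)  # new (id, entries, cls) signature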
upload_files.delete() - return upload, PublicUploadFiles(test_upload_id, is_authorized=lambda: not protected) + return test_upload_id, entries, PublicUploadFiles(test_upload_id, is_authorized=lambda: not protected) def test_to_staging_upload_files(self, test_upload): - upload, upload_files = test_upload + _, entries, upload_files = test_upload assert upload_files.to_staging_upload_files() is None staging_upload_files = upload_files.to_staging_upload_files(create=True) assert staging_upload_files is not None @@ -438,7 +435,7 @@ class TestPublicUploadFiles(UploadFilesContract): with open(f, 'wt') as fh: fh.write('') - staging_upload_files.pack(upload) + staging_upload_files.pack(entries) staging_upload_files.delete() # We do a very simple check. We made all files empty, those that are rezipped @@ -453,19 +450,20 @@ class TestPublicUploadFiles(UploadFilesContract): assert upload_files.to_staging_upload_files() is None def test_repack(self, test_upload): - upload, upload_files = test_upload - for calc in upload.calcs: + upload_id, entries, upload_files = test_upload + for calc in entries: calc.with_embargo = False - upload_files.re_pack(upload) - assert_upload_files(upload, PublicUploadFiles, with_embargo=False) + upload_files.re_pack(entries) + assert_upload_files(upload_id, entries, PublicUploadFiles, with_embargo=False) assert len(os.listdir(upload_files.os_path)) == 8 with assert_exception(KeyError): StagingUploadFiles(upload_files.upload_id) def assert_upload_files( - upload: UploadWithMetadata, cls, no_archive: bool = False, **kwargs): - """ + upload_id: str, entries: Iterable[datamodel.EntryMetadata], cls, + no_archive: bool = False, **kwargs): + ''' Asserts the files aspect of uploaded data after processing or publishing Arguments: @@ -473,13 +471,13 @@ def assert_upload_files( cls: The :class:`UploadFiles` subclass that this upload should have n_calcs: The number of expected calcs in the upload **kwargs: Key, value pairs that each calc metadata should have - """ - upload_files = UploadFiles.get(upload.upload_id, is_authorized=lambda: True) + ''' + upload_files = UploadFiles.get(upload_id, is_authorized=lambda: True) assert upload_files is not None assert isinstance(upload_files, cls) - upload_files = UploadFiles.get(upload.upload_id) - for calc in upload.calcs: + upload_files = UploadFiles.get(upload_id) + for calc in entries: try: with upload_files.raw_file(calc.mainfile) as f: f.read() diff --git a/tests/test_metainfo.py b/tests/test_metainfo.py index 32adbc749b3a764af2f73d6ea1f9f21318e8fb85..4bb544c277065a16a604cf0a200f7b6cd42a4df4 100644 --- a/tests/test_metainfo.py +++ b/tests/test_metainfo.py @@ -45,7 +45,7 @@ def assert_section_instance(section: MSection): class TestM3: - """ Test for meta-info definition that are used to define other definitions. """ + ''' Test for meta-info definition that are used to define other definitions. ''' def test_section(self): assert Section.m_def == Section.m_def.m_def @@ -84,7 +84,7 @@ class TestM3: class TestPureReflection: - """ Test for using meta-info instances without knowing/using the respective definitions. """ + ''' Test for using meta-info instances without knowing/using the respective definitions. 
''' def test_instantiation(self): test_section_def = Section(name='TestSection') @@ -98,19 +98,19 @@ class TestPureReflection: class MaterialDefining(MCategory): - """Quantities that add to what constitutes a different material.""" + '''Quantities that add to what constitutes a different material.''' pass class TestM2: - """ Test for meta-info definitions. """ + ''' Test for meta-info definitions. ''' def test_basics(self): assert_section_def(Run.m_def) assert_section_def(System.m_def) def test_default_section_def(self): - """ A section class without an explicit section def must set a default section def. """ + ''' A section class without an explicit section def must set a default section def. ''' assert Run.m_def is not None assert Run.m_def.name == 'Run' @@ -231,9 +231,12 @@ class TestM2: def test_qualified_name(self): assert System.m_def.qualified_name() == 'nomad.metainfo.example.System' + def test_derived_virtual(self): + assert System.n_atoms.virtual + class TestM1: - """ Test for meta-info instances. """ + ''' Test for meta-info instances. ''' def test_run(self): class Run(MSection): @@ -257,6 +260,30 @@ class TestM1: assert_section_instance(system) + def test_set_none(self): + run = Run() + run.code_name = 'test' + assert run.code_name is not None + + run.code_name = None + assert run.code_name is None + + def test_set_subsection(self): + run = Run() + first = Parsing() + run.parsing = first + assert first.m_parent == run + assert run.parsing == first + + second = Parsing() + run.parsing = second + assert first.m_parent is None + assert second.m_parent == run + assert run.parsing == second + + run.parsing = None + assert run.parsing is None + def test_defaults(self): assert len(System().periodic_dimensions) == 3 assert System().atom_labels is None @@ -333,6 +360,7 @@ class TestM1: def example_data(self): run = Run() run.code_name = 'test code name' + run.m_create(Parsing) system: System = run.m_create(System) system.atom_labels = ['H', 'H', 'O'] system.atom_positions = np.array([[1.2e-10, 0, 0], [0, 1.2e-10, 0], [0, 0, 1.2e-10]]) @@ -356,6 +384,15 @@ class TestM1: self.assert_example_data(new_example_data) + def test_to_dict_defaults(self, example_data): + dct = example_data.m_to_dict() + assert 'nomad_version' not in dct['parsing'] + assert 'n_atoms' not in dct['systems'][0] + + dct = example_data.m_to_dict(include_defaults=True) + assert 'nomad_version' in dct['parsing'] + assert 'n_atoms' not in dct['systems'][0] + def test_derived(self): system = System() @@ -412,6 +449,17 @@ class TestM1: assert len(resource.all(System)) == 2 + def test_mapping(self): + run = Run() + run.m_create(Parsing).parser_name = 'test' + system = run.m_create(System) + system.atom_labels = ['H', 'O'] + + assert run.systems[0].atom_labels == ['H', 'O'] + assert run['systems.0.atom_labels'] == ['H', 'O'] + assert run['systems/0/atom_labels'] == ['H', 'O'] + assert run['parsing.parser_name'] == 'test' + class TestEnvironment: diff --git a/tests/test_normalizing.py b/tests/test_normalizing.py index 9354d0aedcde713283484f3cbd54001e5a3a7fe9..8e32d6b0e7de47ffe95bce52f10a893e64da531c 100644 --- a/tests/test_normalizing.py +++ b/tests/test_normalizing.py @@ -50,36 +50,36 @@ vasp_parser_dos = ( glucose_atom_labels = ( 'parsers/template', 'tests/data/normalizers/glucose_atom_labels.json') -symmetry_keys = ['spacegroup', 'spacegroup_symbol', 'crystal_system'] +symmetry_keys = ['dft.spacegroup', 'dft.spacegroup_symbol', 'dft.crystal_system'] calc_metadata_keys = [ - 'code_name', 'code_version', 'basis_set', 
'xc_functional', 'system', 'formula'] + symmetry_keys + 'dft.code_name', 'dft.code_version', 'dft.basis_set', 'dft.xc_functional', 'dft.system', 'formula'] + symmetry_keys parser_exceptions = { - 'parsers/wien2k': ['xc_functional'], + 'parsers/wien2k': ['dft.xc_functional'], 'parsers/nwchem': symmetry_keys, 'parsers/bigdft': symmetry_keys, 'parsers/gaussian': symmetry_keys, - 'parsers/abinit': ['formula', 'system'] + symmetry_keys, - 'parsers/dl-poly': ['formula', 'basis_set', 'xc_functional', 'system'] + symmetry_keys, - 'parsers/lib-atoms': ['basis_set', 'xc_functional'], + 'parsers/abinit': ['formula', 'dft.system'] + symmetry_keys, + 'parsers/dl-poly': ['formula', 'dft.basis_set', 'dft.xc_functional', 'dft.system'] + symmetry_keys, + 'parsers/lib-atoms': ['dft.basis_set', 'dft.xc_functional'], 'parsers/orca': symmetry_keys, 'parsers/octopus': symmetry_keys, - 'parsers/phonopy': ['basis_set', 'xc_functional'], + 'parsers/phonopy': ['dft.basis_set', 'dft.xc_functional'], 'parsers/gpaw2': symmetry_keys, - 'parsers/gamess': ['formula', 'system'] + symmetry_keys, - 'parsers/gulp': ['formula', 'xc_functional', 'system', 'basis_set'] + symmetry_keys, + 'parsers/gamess': ['formula', 'dft.system', 'dft.xc_functional'] + symmetry_keys, + 'parsers/gulp': ['formula', 'dft.xc_functional', 'dft.system', 'dft.basis_set'] + symmetry_keys, 'parsers/turbomole': symmetry_keys, - 'parsers/elastic': ['basis_set', 'xc_functional', 'system'] + symmetry_keys, - 'parsers/dmol': ['system'] + symmetry_keys, + 'parsers/elastic': ['dft.basis_set', 'dft.xc_functional', 'dft.system'] + symmetry_keys, + 'parsers/dmol': ['dft.system'] + symmetry_keys, 'parser/molcas': symmetry_keys, - 'parsers/band': ['system'] + symmetry_keys, - 'parsers/qbox': ['xc_functional'], - 'parser/onetep': ['formula', 'basis_set', 'xc_functional', 'system'] + symmetry_keys + 'parsers/band': ['dft.system'] + symmetry_keys, + 'parsers/qbox': ['dft.xc_functional'], + 'parser/onetep': ['formula', 'dft.basis_set', 'dft.xc_functional', 'dft.system'] + symmetry_keys } -""" +''' Keys that the normalizer for certain parsers might not produce. In an ideal world this map would be empty. 
-""" +''' def run_normalize(backend: LocalBackend) -> LocalBackend: @@ -209,17 +209,17 @@ def test_template_example_normalizer(parsed_template_example, no_warn, caplog): def assert_normalized(backend: LocalBackend): - metadata = datamodel.DFTCalcWithMetadata() + metadata = datamodel.EntryMetadata(domain='dft') metadata.apply_domain_metadata(backend) assert metadata.formula is not None - assert metadata.code_name is not None - assert metadata.code_version is not None - assert metadata.basis_set is not None - assert metadata.xc_functional is not None - assert metadata.system is not None - assert metadata.crystal_system is not None + assert metadata.dft.code_name is not None + assert metadata.dft.code_version is not None + assert metadata.dft.basis_set is not None + assert metadata.dft.xc_functional is not None + assert metadata.dft.system is not None + assert metadata.dft.crystal_system is not None assert len(metadata.atoms) is not None - assert metadata.spacegroup is not None + assert metadata.dft.spacegroup is not None exceptions = parser_exceptions.get(backend.get_value('parser_name'), []) @@ -228,7 +228,7 @@ def assert_normalized(backend: LocalBackend): for key in calc_metadata_keys: if key not in exceptions: - assert getattr(metadata, key) != config.services.unavailable_value + assert metadata[key] != config.services.unavailable_value def test_normalizer(normalized_example: LocalBackend): @@ -236,7 +236,7 @@ def test_normalizer(normalized_example: LocalBackend): def test_normalizer_faulty_matid(caplog): - """ Runs normalizer on an example w/ bools for atom pos. Should force matid error.""" + ''' Runs normalizer on an example w/ bools for atom pos. Should force matid error.''' # assert isinstance(backend, LocalBackend) backend = parse_file(boolean_positions) run_normalize(backend) @@ -245,26 +245,26 @@ def test_normalizer_faulty_matid(caplog): def test_normalizer_single_string_atom_labels(caplog): - """ + ''' Runs normalizer on ['Br1SiSiK'] expects error. Should replace the label with 'X' and the numbers of postitions should not match the labels. - """ + ''' backend = parse_file(single_string_atom_labels) run_normalize(backend) assert_log(caplog, 'ERROR', 'len of atom position does not match number of atoms') def test_normalizer_unknown_atom_label(caplog, no_warn): - """ Runs normalizer on ['Br','Si','Si','Za'], for normalizeation Za will be replaced, + ''' Runs normalizer on ['Br','Si','Si','Za'], for normalizeation Za will be replaced, but stays int the labels. - """ + ''' backend = parse_file(unknown_atom_label) run_normalize(backend) assert backend.get_value('atom_labels')[3] == 'Za' def test_symmetry_classification_fcc(): - """Runs normalizer where lattice vectors should give fcc symmetry.""" + '''Runs normalizer where lattice vectors should give fcc symmetry.''' backend = parse_file(fcc_symmetry) backend = run_normalize(backend) expected_crystal_system = 'cubic' @@ -297,9 +297,9 @@ def test_system_classification(atom, molecule, one_d, two_d, surface, bulk): def test_representative_systems(single_point, molecular_dynamics, geometry_optimization, phonon): - """Checks that the representative systems are correctly identified and + '''Checks that the representative systems are correctly identified and processed by SystemNormalizer. - """ + ''' def check_representative_frames(backend): # For systems with multiple frames the first and two last should be processed. 
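As the updated assert_normalized() above shows, domain quantities are now reached through the dft sub-section, and the same values can be addressed with the prefixed keys listed in calc_metadata_keys. A short sketch of that access pattern; backend stands for a parsed and normalized LocalBackend as produced by run_normalize() above:

    from nomad import datamodel

    def summarize_dft(backend):
        # populate entry metadata from the processed backend, as assert_normalized() does
        metadata = datamodel.EntryMetadata(domain='dft')
        metadata.apply_domain_metadata(backend)

        # attribute access on the sub-section and prefixed item access are equivalent
        assert metadata.dft.code_name == metadata['dft.code_name']
        return metadata.formula, metadata.dft.spacegroup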
try: @@ -343,9 +343,9 @@ def test_reduced_chemical_formula(): def test_vasp_incar_system(): - """ + ''' Ensure we can test an incar value in the VASP example - """ + ''' backend = parse_file(vasp_parser) backend = run_normalize(backend) expected_value = 'SrTiO3' # material's formula in vasp.xml @@ -359,8 +359,8 @@ def test_vasp_incar_system(): def test_aflow_prototypes(): - """Tests that some basis structures are matched with the correct AFLOW prototypes - """ + '''Tests that some basis structures are matched with the correct AFLOW prototypes + ''' # No prototype info for non-bulk structures backend = run_normalize_for_structure(ase.build.molecule("H2O")) assert len(backend["section_prototype"]) == 0 @@ -422,9 +422,9 @@ def test_aflow_prototypes(): def test_springer_normalizer(): - """ + ''' Ensure the Springer normalizer works well with the VASP example. - """ + ''' backend = parse_file(vasp_parser) backend = run_normalize(backend) @@ -442,9 +442,9 @@ def test_springer_normalizer(): def test_dos_normalizer(): - """ + ''' Ensure the DOS normalizer acted on the DOS values. We take a VASP example. - """ + ''' backend = parse_file(vasp_parser_dos) backend = run_normalize(backend) diff --git a/tests/test_parsing.py b/tests/test_parsing.py index 65b65c1001f3e9fd5b96e771c01de8034a277047..63cd5d0a353c86fc174daf48987c1ce4231d1bae 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -132,7 +132,7 @@ class TestLocalBackend(object): assert backend.get_sections('section_symmetry', 2) == [1] def test_section_override(self, backend, no_warn): - """ Test whether we can overwrite values already in the backend.""" + ''' Test whether we can overwrite values already in the backend.''' expected_value = ['Cl', 'Zn'] backend.openSection('section_run') backend.openSection('section_system') @@ -328,7 +328,7 @@ def assert_parser_result(backend, error=False): def assert_parser_dir_unchanged(previous_wd, current_wd): - """Assert working directory has not been changed from parser.""" + '''Assert working directory has not been changed from parser.''' assert previous_wd == current_wd diff --git a/tests/test_search.py b/tests/test_search.py index b51f05c777aeeb89026a2a4e0f81f5595a0dabdb..cb8ef43873649b3274fe324da74a36484024f621 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
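All of the normalizer tests above share the same two-step flow before any assertion. A compact restatement of it; parse_file, vasp_parser, run_normalize and assert_normalized are the helpers referenced in tests/test_normalizing.py, and the module that defines parse_file is not visible in this excerpt, so treat that import as assumed:

    # helpers as used in tests/test_normalizing.py above
    backend = parse_file(vasp_parser)   # parse the VASP example into a LocalBackend
    backend = run_normalize(backend)    # run the normalizers (symmetry, springer, DOS, ...)
    assert_normalized(backend)          # the EntryMetadata-based checks defined above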
-from typing import List +from typing import List, Iterable from elasticsearch_dsl import Q import pytest @@ -25,36 +25,39 @@ def test_init_mapping(elastic): def test_index_skeleton_calc(elastic): - calc_with_metadata = datamodel.CalcWithMetadata( - domain='dft', upload_id='test_upload', calc_id='test_calc') + entry_metadata = datamodel.EntryMetadata( + domain='dft', upload_id='test_upload', calc_id='test_calc', + mainfile='test/mainfile', files=['test/file1', 'test/file2']) - create_entry(calc_with_metadata) + create_entry(entry_metadata) def test_index_normalized_calc(elastic, normalized: parsing.LocalBackend): - calc_with_metadata = datamodel.CalcWithMetadata( + entry_metadata = datamodel.EntryMetadata( domain='dft', upload_id='test upload id', calc_id='test id') - calc_with_metadata.apply_domain_metadata(normalized) + entry_metadata.apply_domain_metadata(normalized) - entry = search.flat(create_entry(calc_with_metadata).to_dict()) + search_entry = create_entry(entry_metadata) + entry = search.flat(search_entry.to_dict()) assert 'calc_id' in entry assert 'atoms' in entry assert 'dft.code_name' in entry + assert 'dft.optimade.elements_ratios' in entry def test_index_normalized_calc_with_metadata( - elastic, normalized: parsing.LocalBackend, example_user_metadata: dict): - - calc_with_metadata = datamodel.CalcWithMetadata( + elastic, normalized: parsing.LocalBackend, internal_example_user_metadata: dict): + entry_metadata = datamodel.EntryMetadata( domain='dft', upload_id='test upload id', calc_id='test id') - calc_with_metadata.apply_domain_metadata(normalized) - calc_with_metadata.apply_user_metadata(example_user_metadata) + entry_metadata.apply_domain_metadata(normalized) + internal_example_user_metadata.pop('embargo_length') # is for uploads only + entry_metadata.apply_user_metadata(internal_example_user_metadata) - entry = create_entry(calc_with_metadata) + entry = create_entry(entry_metadata) - assert getattr(entry, 'with_embargo') == example_user_metadata['with_embargo'] - assert getattr(entry, 'comment') == example_user_metadata['comment'] + assert getattr(entry, 'with_embargo') == internal_example_user_metadata['with_embargo'] + assert getattr(entry, 'comment') == internal_example_user_metadata['comment'] def test_index_upload(elastic, processed: processing.Upload): @@ -63,10 +66,10 @@ def test_index_upload(elastic, processed: processing.Upload): @pytest.fixture() def example_search_data(elastic, normalized: parsing.LocalBackend): - calc_with_metadata = datamodel.CalcWithMetadata( + entry_metadata = datamodel.EntryMetadata( domain='dft', upload_id='test upload id', calc_id='test id') - calc_with_metadata.apply_domain_metadata(normalized) - create_entry(calc_with_metadata) + entry_metadata.apply_domain_metadata(normalized) + create_entry(entry_metadata) refresh_index() return normalized @@ -74,10 +77,10 @@ def example_search_data(elastic, normalized: parsing.LocalBackend): @pytest.fixture() def example_ems_search_data(elastic, parsed_ems: parsing.LocalBackend): - calc_with_metadata = datamodel.CalcWithMetadata( + entry_metadata = datamodel.EntryMetadata( domain='ems', upload_id='test upload id', calc_id='test id') - calc_with_metadata.apply_domain_metadata(parsed_ems) - create_entry(calc_with_metadata) + entry_metadata.apply_domain_metadata(parsed_ems) + create_entry(entry_metadata) refresh_index() return parsed_ems @@ -200,15 +203,15 @@ def test_search_quantity( elastic, normalized: parsing.LocalBackend, test_user: datamodel.User, other_test_user: datamodel.User, order_by: str): 
- calc_with_metadata = datamodel.CalcWithMetadata( + entry_metadata = datamodel.EntryMetadata( domain='dft', upload_id='test upload id', calc_id='test id') - calc_with_metadata.apply_domain_metadata(normalized) - calc_with_metadata.uploader = test_user.user_id - create_entry(calc_with_metadata) + entry_metadata.apply_domain_metadata(normalized) + entry_metadata.uploader = test_user.user_id + create_entry(entry_metadata) - calc_with_metadata.calc_id = 'other test id' - calc_with_metadata.uploader = other_test_user.user_id - create_entry(calc_with_metadata) + entry_metadata.calc_id = 'other test id' + entry_metadata.uploader = other_test_user.user_id + create_entry(entry_metadata) refresh_index() request = SearchRequest(domain='dft').quantity( @@ -228,10 +231,10 @@ def refresh_index(): infrastructure.elastic_client.indices.refresh(index=config.elastic.index_name) -def create_entry(calc_with_metadata: datamodel.CalcWithMetadata): - entry = search.Entry.from_calc_with_metadata(calc_with_metadata) +def create_entry(entry_metadata: datamodel.EntryMetadata): + entry = search.create_entry(entry_metadata) entry.save() - assert_entry(calc_with_metadata.calc_id) + assert_entry(entry_metadata.calc_id) return entry @@ -246,11 +249,13 @@ def assert_entry(calc_id): assert results[0]['calc_id'] == calc_id -def assert_search_upload(upload: datamodel.UploadWithMetadata, additional_keys: List[str] = [], **kwargs): +def assert_search_upload( + upload_entries: Iterable[datamodel.EntryMetadata], + additional_keys: List[str] = [], **kwargs): keys = ['calc_id', 'upload_id', 'mainfile', 'calc_hash'] refresh_index() search_results = Entry.search().query('match_all')[0:10] - assert search_results.count() == len(list(upload.calcs)) + assert search_results.count() == len(list(upload_entries)) if search_results.count() > 0: for hit in search_results: hit = search.flat(hit.to_dict()) @@ -287,7 +292,7 @@ if __name__ == '__main__': def gen_data(): for pid in range(0, n): calc = generate_calc(pid) - calc = Entry.from_calc_with_metadata(calc) + calc = Entry.from_entry_metadata(calc) yield calc.to_dict(include_meta=True) bulk(infrastructure.elastic_client, gen_data()) diff --git a/tests/utils.py b/tests/utils.py index 67f8340fab0142e4b93a0ce48a260677e38091d0..194da6e4e0a79110c6fcafaa580b158631f10ed6 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" Methods to help with testing of nomad@FAIRDI.""" +''' Methods to help with testing of nomad@FAIRDI.''' from typing import Type import json @@ -21,7 +21,7 @@ from logging import LogRecord def assert_log(caplog, level: str, event_part: str) -> LogRecord: - """ + ''' Assert whether a log message exists in the logs of the tests at a certain level. Parameters @@ -35,7 +35,7 @@ def assert_log(caplog, level: str, event_part: str) -> LogRecord: The error message we're after. We search the logs matching level if they contain this string. - """ + ''' record = None for record in caplog.get_records(when='call'): if record.levelname == level: @@ -50,10 +50,10 @@ def assert_log(caplog, level: str, event_part: str) -> LogRecord: @contextmanager def assert_exception(exception_cls: Type = Exception): - """ + ''' A context manager that can be used to assert that the given exception is thrown within the respective ``with``clause. - """ + ''' has_exception = False try: yield
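The create_entry() helper above pins down the new indexing path: search.create_entry() builds the elasticsearch document from an EntryMetadata instance and save() writes it to the test index. A minimal sketch under the same assumptions as these tests (a running elastic fixture); the field values mirror the skeleton-calc test above:

    from nomad import datamodel, search

    entry_metadata = datamodel.EntryMetadata(
        domain='dft', upload_id='test_upload', calc_id='test_calc',
        mainfile='test/mainfile', files=['test/file1', 'test/file2'])

    # replaces the former search.Entry.from_calc_with_metadata() construction
    search_entry = search.create_entry(entry_metadata)
    search_entry.save()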