diff --git a/dependencies/encyclopedia-gui b/dependencies/encyclopedia-gui index bdffb455f21577435477daa9c871205e6c118efd..f19b84de11b1004548e8e5a324df78bb356e7896 160000 --- a/dependencies/encyclopedia-gui +++ b/dependencies/encyclopedia-gui @@ -1 +1 @@ -Subproject commit bdffb455f21577435477daa9c871205e6c118efd +Subproject commit f19b84de11b1004548e8e5a324df78bb356e7896 diff --git a/dependencies/parsers/phonopy b/dependencies/parsers/phonopy index 59ecd6cddb2bbf75000a1454fe40ceace2d2c207..bc9be2124405b87987e04d286e385f23b7b5712f 160000 --- a/dependencies/parsers/phonopy +++ b/dependencies/parsers/phonopy @@ -1 +1 @@ -Subproject commit 59ecd6cddb2bbf75000a1454fe40ceace2d2c207 +Subproject commit bc9be2124405b87987e04d286e385f23b7b5712f diff --git a/nomad/app/api/encyclopedia.py b/nomad/app/api/encyclopedia.py index 4836b8a37211fa61536f8ad87ee8649aacd7ae59..a278ffc12562da82b4d67eed0061460c01351a48 100644 --- a/nomad/app/api/encyclopedia.py +++ b/nomad/app/api/encyclopedia.py @@ -13,11 +13,12 @@ # limitations under the License. """ -The encyclopedia API of the nomad@FAIRDI APIs. +API for retrieving material information. """ import re import math import numpy as np +from collections import defaultdict from flask_restplus import Resource, abort, fields, marshal from flask import request, g @@ -29,31 +30,187 @@ from nomad.files import UploadFiles from nomad.units import ureg from nomad.atomutils import get_hill_decomposition from nomad.datamodel.datamodel import EntryArchive -from nomad.datamodel.material import Material +from nomad.datamodel.material import Material, Bulk, Method from .api import api from .auth import authenticate, create_authorization_predicate -ns = api.namespace("encyclopedia", description="Access encyclopedia metadata.") -re_formula = re.compile(r"([A-Z][a-z]?)(\d*)") +ns = api.namespace("encyclopedia", description="Access materials data.") +missing_material_msg = "The specified material {} could not be retrieved. 
class MaterialAccessError(Exception):
    """Raised when a queried material does not exist or is not visible."""
    pass


class MaterialSearch:
    """Convenience class for material searches.

    Automatically ensures the correct visibility of materials when the search
    is constructed through the methods of this class: when the final search is
    built, an authentication filter is applied on the nested calculations
    unless one has already been added.
    """
    def __init__(self):
        self._s = Search(index=config.elastic.materials_index_name)
        self._filters = []  # material level filter queries
        self._musts = []  # material level must queries
        self._extra = {}  # extra search body parameters (size, _source, ...)
        self._authenticated = False  # True once visibility filters are in place

    def add_material_filter(self, query):
        """Adds a material based filter.

        Args:
            query: An elasticsearch_dsl query object.
        """
        self._filters.append(query)

    def add_material_aggregation(self, name, aggregation):
        """Adds a material based aggregation.

        Args:
            name: Name for the aggregation bucket.
            aggregation: An elasticsearch_dsl aggregation object.
        """
        self._s.aggs.bucket(name, aggregation)

    def add_material_must(self, query):
        """Adds a material based must query.

        Args:
            query: An elasticsearch_dsl query object.
        """
        self._musts.append(query)

    def add_calculation_filter(self, queries):
        """Adds calculation based filters. The visibility of calculations is
        automatically checked.

        Args:
            queries: A single query or a list of queries targeting the nested
                calculations.
        """
        if not isinstance(queries, (list, tuple)):
            queries = [queries]
        filters = self.get_authentication_filters_nested() + queries
        nested_bool = Q(
            "bool",
            filter=filters,
        )
        nested_query = Q("nested", path="calculations", query=nested_bool)
        self._musts.append(nested_query)
        self._authenticated = True

    def includes(self, includes):
        """Sets which fields are included in the returned documents."""
        self._extra["_source"] = {"includes": includes}

    def size(self, size):
        """Sets the maximum number of returned hits."""
        self._extra["size"] = size

    def extra(self, extra):
        """Replaces the extra search body parameters as a whole."""
        self._extra = extra

    def s(self):
        """Builds and returns the final elasticsearch_dsl Search object."""
        # If no authentication filters have been added already, add them now.
        if not self._authenticated:
            self._musts.append(Q(
                "nested",
                path="calculations",
                query=Q("bool", filter=self.get_authentication_filters_nested()),
            ))
            self._authenticated = True
        query = Q(
            "bool",
            filter=self._filters,
            must=self._musts,
        )
        return self._s.query(query).extra(**self._extra)

    def execute(self):
        """Builds and executes the search, returning the raw response."""
        return self.s().execute()

    def get_authentication_filters_nested(self):
        """Returns a shared term filter that will leave out unpublished (of
        other users) or embargoed materials.
        """
        # Published, non-embargoed calculations are visible to everyone;
        # authenticated users additionally see calculations they own.
        q = Q('term', calculations__published=True) & Q('term', calculations__with_embargo=False)
        if g.user is not None and g.user.user_id is not None:
            q = q | Q('term', calculations__owners=g.user.user_id)
        return [q]

    def calculations(self):
        """Executes the query and returns a list of visible calculations
        associated with the first found material. Currently fetches all
        calculations associated with a material. If the number of calculations
        per material increases significantly then the inner_hits available for
        nested queries should be used instead.

        Returns:
            List of visible calculations for the first material matching the
            constructed query.

        Raises:
            MaterialAccessError if the queried material could not be found.
        """
        # Make sure the fields required for the visibility check are returned.
        includes = self._extra.setdefault("_source", {}).setdefault("includes", [])
        includes.extend([
            "calculations.published",
            "calculations.with_embargo",
            "calculations.owners",
        ])
        response = self.execute()
        if response.hits.total == 0:
            raise MaterialAccessError

        material = response.hits[0]

        # Filter out calculations based on their visibility
        visible_calcs = []
        for calc in material.calculations:
            if calc.published and not calc.with_embargo:
                visible_calcs.append(calc)
            elif g.user is not None and g.user.user_id in calc.owners:
                visible_calcs.append(calc)
        return visible_calcs
+ """ + # Handle authentication + s = search.SearchRequest() + if g.user is not None: + s.owner('visible', user_id=g.user.user_id) + else: + s.owner('public') + return [ + s.q, + Q("term", encyclopedia__status="success"), + ] + + +def get_range_filter(field, minimum=None, maximum=None, source_unit=None, target_unit=None): + """For adding range filters + """ + query_dict = {} + if minimum is not None: + if source_unit is None and target_unit is None: + gte = minimum + else: + gte = (minimum * source_unit).to(target_unit).magnitude + query_dict["gte"] = gte + if maximum is not None: + if source_unit is None and target_unit is None: + lte = maximum + else: + lte = (maximum * source_unit).to(target_unit).magnitude + query_dict["lte"] = lte + query = Q("range", **{field: query_dict}) + return query def rgetattr(obj, attr_name): @@ -84,22 +241,46 @@ def get_es_doc_values(es_doc, mapping, keys=None): return result -def get_enc_filter(): - """Returns a shared term filter that will leave out unpublished (of other - users), embargoed or invalid entries. +def read_archive(upload_id: str, calc_id: str) -> EntryArchive: + """Used to read data from the archive. + + Args: + upload_id: Upload id. + calc_id: Calculation id. + + Returns: + MSection: The section_run as MSection + For each path, a dictionary containing the path as key and the returned + section as value. 
""" - # Handle authentication - s = search.SearchRequest() - if g.user is not None: - s.owner('visible', user_id=g.user.user_id) - else: - s.owner('public') - return [ - s.q, - Q("term", encyclopedia__status="success"), - ] + upload_files = UploadFiles.get( + upload_id, is_authorized=create_authorization_predicate(upload_id, calc_id)) + + with upload_files.read_archive(calc_id) as archive: + data = archive[calc_id] + root = EntryArchive.m_from_dict(data.to_dict()) + + return root +material_prop_map = { + # General + "material_id": "material_id", + "formula": "formula", + "formula_reduced": "formula_reduced", + "material_type": "material_type", + "material_name": "material_name", + # Bulk + "has_free_wyckoff_parameters": "bulk.has_free_wyckoff_parameters", + "strukturbericht_designation": "bulk.strukturbericht_designation", + "bravais_lattice": "bulk.bravais_lattice", + "crystal_system": "bulk.crystal_system", + "point_group": "bulk.point_group", + "space_group_number": "bulk.space_group_number", + "space_group_international_short_symbol": "bulk.space_group_international_short_symbol", + "structure_type": "bulk.structure_type", + "structure_prototype": "bulk.structure_prototype", +} similarity = api.model("similarity", { # General "material_id": fields.String, @@ -120,7 +301,7 @@ material_result = api.model("material_result", { "material_id": fields.String, "formula": fields.String, "formula_reduced": fields.String, - "system_type": fields.String, + "material_type": fields.String, "n_matches": fields.Integer, # Bulk only "has_free_wyckoff_parameters": fields.Boolean, @@ -144,10 +325,10 @@ class EncMaterialResource(Resource): @api.doc("get_material") @api.expect(material_query) @api.marshal_with(material_result, skip_none=True) + @api.param("material_id", "28 character identifier for the material.") @authenticate() def get(self, material_id): - """Used to retrieve basic information related to the specified - material. 
+ """Used to retrieve basic information related to a material. """ # Parse request arguments args = material_query.parse_args() @@ -159,36 +340,21 @@ class EncMaterialResource(Resource): keys = list(material_prop_map.keys()) es_keys = list(material_prop_map.values()) - # Find the first public entry with this material id and take - # information from there. In principle all other entries should have - # the same information. - s = Search(index=config.elastic.index_name) - query = Q( - "bool", - filter=get_enc_filter() + [ - Q("term", encyclopedia__material__material_id=material_id), - ] - ) - s = s.query(query) - - # Only one representative entry is returned by collapsing the results. - s = s.extra(**{ - "_source": {"includes": es_keys}, - "size": 1, - "collapse": {"field": "encyclopedia.material.material_id"}, - }) + # Get the material info, check that at least one calculation is visible + s = MaterialSearch() + s.add_material_filter(Q("term", material_id=material_id)) + s.includes(es_keys) response = s.execute() # No such material - if len(response) == 0: - abort(404, message="There is no material {}".format(material_id)) + if response.hits.total == 0: + abort(404, message=missing_material_msg.format(material_id)) # Add values from ES entry entry = response[0] result = get_es_doc_values(entry, material_prop_map, keys) - # Add similarity data that is currently stored in MongoDB. In the - # future a lot of the data will be accessed here. + # Add similarity data that is stored in MongoDB. try: material = Material.m_def.a_mongo.get(material_id=material_id) dos_similarity = material.similarity.electronic_dos @@ -199,31 +365,21 @@ class EncMaterialResource(Resource): # Only include similarity for materials that exist on the current # deployment to avoid dead links. 
similar_ids = dos_similarity.material_ids - id_value_map = {key: value for key, value in zip(dos_similarity.material_ids, dos_similarity.values)} - bool_query = Q( - "bool", - filter=get_enc_filter() + [Q("terms", encyclopedia__material__material_id=similar_ids)], - ) - s = Search(index=config.elastic.index_name) - s = s.query(bool_query) - s = s.extra(**{ - "_source": {"includes": [ - "encyclopedia.material.material_id", - "encyclopedia.material.formula_reduced", - "encyclopedia.material.bulk.space_group_number", - ]}, - "size": 5, - "collapse": {"field": "encyclopedia.material.material_id"}, - }) + id_value_map = {key: value for key, value in zip(similar_ids, dos_similarity.values)} + s = MaterialSearch() + s.add_material_filter(Q("terms", material_id=similar_ids)) + s.includes(["material_id", "formula_reduced", "bulk.space_group_number"]) + s.size(5) response = s.execute() + similarity = [] for hit in response.hits: try: similarity.append({ - "material_id": hit.encyclopedia.material.material_id, - "value": id_value_map[hit.encyclopedia.material.material_id], - "formula": hit.encyclopedia.material.formula_reduced, - "space_group_number": hit.encyclopedia.material.bulk.space_group_number, + "material_id": hit.material_id, + "value": id_value_map[hit.material_id], + "formula": hit.formula_reduced, + "space_group_number": hit.bulk.space_group_number, }) except AttributeError: pass @@ -233,66 +389,58 @@ class EncMaterialResource(Resource): return result, 200 +re_formula = re.compile(r"([A-Z][a-z]?)(\d*)") range_query = api.model("range_query", { "max": fields.Float, "min": fields.Float, }) -materials_after = api.model("materials_after", { - "materials": fields.String, -}) materials_query = api.model("materials_input", { "search_by": fields.Nested(api.model("search_query", { "exclusive": fields.Boolean(default=False), "formula": fields.String, "element": fields.String, - "page": fields.Integer(default=1), - "after": fields.Nested(materials_after, allow_null=True), - 
"per_page": fields.Integer(default=25), - "pagination": fields.Boolean, + "page": fields.Integer(default=1, description="Requested page number, indexing starts from 1."), + "per_page": fields.Integer(default=25, description="Number of results per page."), + "restricted": fields.Boolean(default=False, description="Select to restrict the query to individual calculations. If not selected, the query will combine results from several different calculations."), })), - "material_name": fields.List(fields.String), - "structure_type": fields.List(fields.String), - "space_group_number": fields.List(fields.Integer), - "system_type": fields.List(fields.String), - "crystal_system": fields.List(fields.String), - "band_gap": fields.Nested(range_query, description="Band gap range in eV."), - "band_gap_direct": fields.Boolean, - "has_band_structure": fields.Boolean, - "has_dos": fields.Boolean, - "has_fermi_surface": fields.Boolean, - "has_thermal_properties": fields.Boolean, - "functional_type": fields.List(fields.String), - "basis_set_type": fields.List(fields.String), - "code_name": fields.List(fields.String), - "mass_density": fields.Nested(range_query, description="Mass density range in kg / m ** 3."), + "material_type": fields.List(fields.String(enum=list(Material.material_type.type)), description=Material.material_type.description), + "material_name": fields.List(fields.String, description=Material.material_name.description), + "structure_type": fields.List(fields.String, description=Bulk.structure_type.description), + "space_group_number": fields.List(fields.Integer, description=Bulk.space_group_number.description), + "crystal_system": fields.List(fields.String(enum=list(Bulk.crystal_system.type)), description=Bulk.crystal_system.description), + "band_gap": fields.Nested(range_query, description="Band gap range in eV.", allow_null=True), + "has_band_structure": fields.Boolean(description="Set to True if electronic band structure needs to be available for this material."), + 
"has_dos": fields.Boolean(description="Set to True if electronic density of states needs to be available for this material."), + "has_thermal_properties": fields.Boolean(description="Set to True if thermodynamical properties need to be available for this material."), + "functional_type": fields.List(fields.String(enum=list(Method.functional_type.type)), description=Method.functional_type.description), + "basis_set": fields.List(fields.String(enum=list(Method.basis_set.type)), description=Method.basis_set.description), + "code_name": fields.List(fields.String(enum=list(Method.program_name.type)), description=Method.program_name.description), }) pages_result = api.model("page_info", { "per_page": fields.Integer, "total": fields.Integer, "page": fields.Integer, "pages": fields.Integer, - "after": fields.Nested(materials_after), }) materials_result = api.model("materials_result", { "total_results": fields.Integer(allow_null=False), "results": fields.List(fields.Nested(material_result, skip_none=True)), "pages": fields.Nested(pages_result, skip_none=True), - "es_query": fields.String(allow_null=False), }) -@ns.route("/materials") +@ns.route("/materials/") class EncMaterialsResource(Resource): @api.response(404, "No materials found") @api.response(400, "Bad request") - @api.response(200, "Metadata send", fields.Raw) + @api.response(200, "OK", materials_result) @api.expect(materials_query, validate=False) @api.marshal_with(materials_result, skip_none=True) - @api.doc("materials") + @api.doc("search_materials") @authenticate() def post(self): - """Used to query a list of materials with the given search options. + """Search materials based on their properties. 
""" # Get query parameters as json try: @@ -300,77 +448,65 @@ class EncMaterialsResource(Resource): except Exception as e: abort(400, message=str(e)) - def add_terms_filter(filters, source, target, query_type="terms"): - """For adding terms filters - """ - if data[source] is not None: - filters.append(Q(query_type, **{target: data[source]})) - - def add_exists_filter(filters, must_nots, source, target): - """For adding exists filters - """ - param = data[source] - if param is not None: - query = Q("exists", field=target) - if param is True: - filters.append(query) - elif param is False: - must_nots.append(query) - - def add_range_filter(filters, source, target, source_unit=None, target_unit=None): - """For adding range filters - """ - param = data[source] - query_dict = {} - if param["min"] is not None: - if source_unit is None and target_unit is None: - gte = param["min"] - else: - gte = (param["min"] * source_unit).to(target_unit).magnitude - query_dict["gte"] = gte - if param["max"] is not None: - if source_unit is None and target_unit is None: - lte = param["max"] - else: - lte = (param["max"] * source_unit).to(target_unit).magnitude - query_dict["lte"] = lte - if len(query_dict) != 0: - query = Q("range", **{target: query_dict}) - filters.append(query) - - property_map = { - "has_thermal_properties": "encyclopedia.properties.thermodynamical_properties", - "has_band_structure": "encyclopedia.properties.electronic_band_structure", - "has_dos": "encyclopedia.properties.electronic_dos", - "has_fermi_surface": "encyclopedia.properties.fermi_surface", - } - requested_properties = [] - filters = get_enc_filter() - must_nots = [] - musts = [] - add_terms_filter(filters, "material_name", "encyclopedia.material.material_name") - add_terms_filter(filters, "structure_type", "encyclopedia.material.bulk.structure_type") - add_terms_filter(filters, "space_group_number", "encyclopedia.material.bulk.space_group_number") - add_terms_filter(filters, "system_type", 
"encyclopedia.material.material_type") - add_terms_filter(filters, "crystal_system", "encyclopedia.material.bulk.crystal_system") - add_terms_filter(filters, "band_gap_direct", "encyclopedia.properties.band_gap_direct", query_type="term") - add_terms_filter(filters, "functional_type", "encyclopedia.method.functional_type") - add_terms_filter(filters, "basis_set_type", "dft.basis_set") - add_terms_filter(filters, "code_name", "dft.code_name") - add_range_filter(filters, "band_gap", "encyclopedia.properties.band_gap", ureg.eV, ureg.J) - add_range_filter(filters, "mass_density", "encyclopedia.properties.mass_density") - - # Create query for elements or formula + # Create filters from user query + s = MaterialSearch() + + # Material level filters + if data["material_type"] is not None: s.add_material_filter(Q("terms", material_type=data["material_type"])) + if data["material_name"] is not None: s.add_material_filter(Q("terms", material_name=data["material_name"])) + if data["structure_type"] is not None: s.add_material_filter(Q("terms", bulk__structure_type=data["structure_type"])) + if data["space_group_number"] is not None: s.add_material_filter(Q("terms", bulk__space_group_number=data["space_group_number"])) + if data["crystal_system"] is not None: s.add_material_filter(Q("terms", bulk__crystal_system=data["crystal_system"])) + + # Calculation filters + calc_filters = [] + if data["functional_type"] is not None: calc_filters.append(Q("terms", calculations__method__functional_type=data["functional_type"])) + if data["basis_set"] is not None: calc_filters.append(Q("terms", calculations__method__basis_set=data["basis_set"])) + if data["code_name"] is not None: calc_filters.append(Q("terms", calculations__method__program_name=data["code_name"])) + if data["has_band_structure"] is not None: calc_filters.append(Q("term", calculations__properties__has_electronic_band_structure=data["has_band_structure"])) + if data["has_dos"] is not None: calc_filters.append(Q("term", 
calculations__properties__has_electronic_dos=data["has_dos"])) + if data["has_thermal_properties"] is not None: calc_filters.append(Q("term", calculations__properties__has_thermodynamical_properties=data["has_thermal_properties"])) + if data["band_gap"] is not None: calc_filters.append(get_range_filter( + "calculations.properties.band_gap", + minimum=data["band_gap"].get("min"), + maximum=data["band_gap"].get("max"), + source_unit=ureg.eV, + target_unit=ureg.J, + )) search_by = data["search_by"] + restricted = search_by["restricted"] + if restricted: + s.add_calculation_filter(calc_filters) + else: + for f in calc_filters: + s.add_calculation_filter(f) + + # if data["functional_type"] is not None: s.add_calculation_filter(Q("terms", calculations__method__functional_type=data["functional_type"])) + # if data["basis_set"] is not None: s.add_calculation_filter(Q("terms", calculations__method__basis_set=data["basis_set"])) + # if data["code_name"] is not None: s.add_calculation_filter(Q("terms", calculations__method__program_name=data["code_name"])) + # if data["has_band_structure"] is not None: s.add_calculation_filter(Q("term", calculations__properties__has_electronic_band_structure=data["has_band_structure"])) + # if data["has_dos"] is not None: s.add_calculation_filter(Q("term", calculations__properties__has_electronic_dos=data["has_dos"])) + # if data["has_thermal_properties"] is not None: s.add_calculation_filter(Q("term", calculations__properties__has_thermodynamical_properties=data["has_thermal_properties"])) + # if data["band_gap"] is not None: s.add_calculation_filter(get_range_filter( + # "calculations.properties.band_gap", + # minimum=data["band_gap"].get("min"), + # maximum=data["band_gap"].get("max"), + # source_unit=ureg.eV, + # target_unit=ureg.J, + # )) + formula = search_by["formula"] elements = search_by["element"] exclusive = search_by["exclusive"] + # The given list of species/formula is reformatted with the Hill system into a + # query string. 
With exclusive search we look for exact match, with + # non-exclusive search we look for match that includes at least all + # species, possibly even more. if formula is not None: - # Here we determine a list of atom types. The types may occur - # multiple times and at multiple places. element_list = [] matches = re_formula.finditer(formula) + for match in matches: groups = match.groups() symbol = groups[0] @@ -381,11 +517,9 @@ class EncMaterialsResource(Resource): else: element_list += [symbol] * int(count) - # The given list of species is reformatted with the Hill system - # into a query string. The counts are reduced by the greatest - # common divisor. names, reduced_counts = get_hill_decomposition(element_list, reduced=True) query_string = [] + for name, count in zip(names, reduced_counts): if count == 1: query_string.append(name) @@ -393,228 +527,71 @@ class EncMaterialsResource(Resource): query_string.append("{}{}".format(name, int(count))) query_string = " ".join(query_string) - # With exclusive search we look for exact match if exclusive: - filters.append(Q("term", **{"encyclopedia.material.species_and_counts.keyword": query_string})) - # With non-exclusive search we look for match that includes at - # least all parts of the formula, possibly even more. 
+ s.add_material_filter(Q("term", **{"species_and_counts.keyword": query_string})) else: - musts.append(Q( + s.add_material_must(Q( "match", - encyclopedia__material__species_and_counts={"query": query_string, "operator": "and"} + species_and_counts={"query": query_string, "operator": "and"} )) elif elements is not None: - # The given list of species is reformatted with the Hill system into a query string species, _ = get_hill_decomposition(elements.split(",")) query_string = " ".join(species) - # With exclusive search we look for exact match if exclusive: - filters.append(Q("term", **{"encyclopedia.material.species.keyword": query_string})) - # With non-exclusive search we look for match that includes at - # least all species, possibly even more. + s.add_material_filter(Q("term", **{"species.keyword": query_string})) else: - musts.append(Q( + s.add_material_must(Q( "match", - encyclopedia__material__species={"query": query_string, "operator": "and"} + species={"query": query_string, "operator": "and"} )) - # The queries that correspond to AND queries typically need to access - # multiple calculations at once to find the material ids that - # correspond to the query. To implement this behaviour we need to run - # an initial aggregation that checks that the requested properties are - # present for a material. This is a a very crude solution that does not - # scale to complex queries, but I'm not sure we can do much better - # until we have a separate index for materials. The size is set very - # large because all the results need to be returned. We cannot get the - # results in a paginated way with composite aggregation, because - # pipeline aggregations are not compatible with them. 
- agg_parent = A("terms", field="encyclopedia.material.material_id", size=500000) - for key, value in property_map.items(): - if data[key] is True: - agg = A("filter", exists={"field": value}) - agg_parent.bucket(key, agg) - requested_properties.append(key) - if len(requested_properties) > 1: - # First we setup a boolean filter query that filters for of the - # requested properties. This will reduce the size of the initial - # set on top of which the more expensive aggregation stack is run - # on. - bool_query = Q( - "bool", - filter=filters, - must_not=must_nots, - must=musts, - should=[Q("exists", field=property_map[x]) for x in requested_properties], - minimum_should_match=1, # At least one of the should query must match - ) - s = Search(index=config.elastic.index_name) - s = s.query(bool_query) - - # The remaining requested properties have to be queried as a nested - # aggregation. - s.aggs.bucket("materials", agg_parent) - buckets_path = {x: "{}._count".format(x) for x in requested_properties} - script = " && ".join(["params.{} > 0".format(x) for x in requested_properties]) - agg_parent.pipeline("selector", A( - "bucket_selector", - buckets_path=buckets_path, - script=script, - )) - s = s.extra(**{ - "size": 0, - }) - response = s.execute() - material_ids = [x["key"] for x in response.aggs.materials.buckets] - if len(material_ids) == 0: - abort(404, message="No materials found for the given search criteria or pagination.") - - # Add pre-selected material ids if multiple exists filters were - # requested. These IDs are already filtered based on the user query so - # none of the other search terms need be used. 
- if len(requested_properties) > 1: - must_nots = [] - musts = [] - filters = [] - filters.append(Q("terms", encyclopedia__material__material_id=material_ids)) - if len(requested_properties) == 1: - prop_name = requested_properties[0] - add_exists_filter(filters, must_nots, prop_name, property_map[prop_name]) - - # The top query filters out entries based on the user query + # Execute query page = search_by["page"] per_page = search_by["per_page"] - after = search_by["after"] - bool_query = Q( - "bool", - filter=filters, - must_not=must_nots, - must=musts, - ) - s = Search(index=config.elastic.index_name) - s = s.query(bool_query) - - # 1: The paginated approach: No way to know the amount of materials, - # but can return aggregation results in a quick fashion including - # the number of calculation entries per material. - mode = "collapse" - if mode == "aggregation": - # The materials are grouped by using three aggregations: - # "Composite" to enable scrolling, "Terms" to enable selecting - # by material_id and "Top Hits" to fetch a single - # representative material document. Unnecessary fields are - # filtered to reduce data transfer. - terms_agg = A("terms", field="encyclopedia.material.material_id") - composite_kwargs = {"sources": {"materials": terms_agg}, "size": per_page} - - # The number of matched materials is only requested on the first - # search, not for each page. - if after is not None: - composite_kwargs["after"] = after - else: - cardinality_agg = A("cardinality", field="encyclopedia.material.material_id", precision_threshold=1000) - s.aggs.metric("n_materials", cardinality_agg) - - composite_agg = A("composite", **composite_kwargs) - composite_agg.metric("representative", A( - "top_hits", - size=1, - _source={"includes": list(material_prop_map.values())}, - )) - s.aggs.bucket("materials", composite_agg) - - # We ignore the top level hits and sort by reduced material formula. 
- s = s.extra(**{ - "size": 0, - }) - - response = s.execute() - materials = response.aggs.materials.buckets - if len(materials) == 0: - abort(404, message="No materials found for the given search criteria or pagination.") - after_new = response.aggs.materials["after_key"] - - # Gather results from aggregations - result_list = [] - materials = response.aggs.materials.buckets - keys = list(material_prop_map.keys()) - for material in materials: - representative = material["representative"][0] - mat_dict = get_es_doc_values(representative, material_prop_map, keys) - mat_dict["n_matches"] = material.doc_count - result_list.append(mat_dict) - - # Page information is incomplete for aggregations - pages = { - "page": page, - "per_page": per_page, - "after": after_new, - } - if after is None: - n_materials = response.aggs.n_materials.value - pages["total"] = n_materials - - # 2. Collapse approach. Quickly provides a list of materials - # corresponding to the query, offers full pagination, the number of - # matches per material needs to be requested with a separate query. 
- elif mode == "collapse": - s = Search(index=config.elastic.index_name) - s = s.query(bool_query) - - # Add cardinality aggregation that gives out the total number of materials - cardinality_agg = A("cardinality", field="encyclopedia.material.material_id", precision_threshold=1000) - s.aggs.metric("n_materials", cardinality_agg) - - s = s.extra(**{ - "collapse": {"field": "encyclopedia.material.material_id"}, - "size": per_page, - "from": (page - 1) * per_page, - "sort": [{"encyclopedia.material.formula_reduced": {"order": "asc"}}], - "explain": True, - }) - - # Execute query - response = s.execute() + s.extra({ + "size": per_page, + "from": (page - 1) * per_page, + "sort": [{"formula_reduced": {"order": "asc"}}], + "_source": {"includes": list(material_prop_map.values())}, + }) + response = s.execute() - # No matches - if len(response) == 0: - abort(404, message="No materials found for the given search criteria or pagination.") + # Form final response + pages = { + "page": page, + "per_page": per_page, + "pages": math.ceil(response.hits.total / per_page), + "total": response.hits.total, + } - # Gather number of entries per material with a separate query - material_ids = [x.encyclopedia.material.material_id for x in response] - s = Search(index=config.elastic.index_name) - bool_query = Q( - "bool", - filter=Q("terms", encyclopedia__material__material_id=material_ids), + # Gather the number of visible calculation for each returned material + # with an aggregation + if len(response) != 0: + material_ids = [x.material_id for x in response] + s2 = MaterialSearch() + s2.size(0) + matched = s2._s.aggs.bucket("matched", A("filter", filter=Q("terms", material_id=material_ids))) + materials = matched.bucket("materials", A("terms", field="material_id", size=len(material_ids))) + nested = materials.bucket("nested", A("nested", path="calculations")) + nested.bucket( + "visible", + A("filter", filter=Q("bool", filter=s2.get_authentication_filters_nested())) ) - s2 = 
s.query(bool_query) - s2.aggs.bucket("n_matches", A("terms", field="encyclopedia.material.material_id")) response2 = s2.execute() - matmap = {x.key: x.doc_count for x in response2.aggs.n_matches} + agg_dict = {} + for agg in response2.aggs.matched.materials: + agg_dict[agg.key] = agg.nested.visible.doc_count - # Loop over materials - result_list = [] - keys = list(material_prop_map.keys()) - for material in response: - # Get values from the collapsed doc - mat_result = get_es_doc_values(material, material_prop_map, keys) - mat_id = material.encyclopedia.material.material_id - mat_result["n_matches"] = matmap[mat_id] - result_list.append(mat_result) - - # Full page information available for collapse - pages = { - "page": page, - "per_page": per_page, - "pages": math.ceil(response.hits.total / per_page), - "total": response.aggs.n_materials.value, - } + # Form the final list of results + result_list = [] + for x in response: + res = get_es_doc_values(x, material_prop_map, list(material_prop_map.keys())) + material_id = x.material_id + res["n_matches"] = agg_dict[material_id] + result_list.append(res) - result = { - "results": result_list, - "pages": pages, - } - return result, 200 + return {"results": result_list, "pages": pages}, 200 groups_result = api.model("groups_result", { @@ -627,57 +604,70 @@ groups_result = api.model("groups_result", { class EncGroupsResource(Resource): @api.response(404, "Material not found") @api.response(400, "Bad request") - @api.response(200, "Metadata send", fields.Raw) + @api.response(200, "OK", groups_result) @api.marshal_with(groups_result) - @api.doc("enc_materials") + @api.doc("get_material_groups") + @api.param("material_id", "28 character identifier for the material.") @authenticate() def get(self, material_id): - """Returns a summary of the calculation groups that were identified for - this material. + """Returns a summary of the calculation groups that were identified for this material. 
+ + Two types of groups are reported: equation of state groups and + parameter variation groups. Equation of state groups contain + calculations with identical method and material, but different volume. + Parameter variation groups contain identical structure but different + methods. The response contains dictionaries for both groups + ('groups_eos' and 'groups_par'). These dictionaries map a group id with + a list of calculation ids. """ - # Find entries for the given material, which have EOS or parameter - # variation hashes set. - bool_query = Q( - "bool", - filter=get_enc_filter() + [Q("term", encyclopedia__material__material_id=material_id)], - must=[ - Q("exists", field="encyclopedia.properties.energies.energy_total"), - Q("exists", field="encyclopedia.material.idealized_structure.cell_volume"), - ], - should=[ - Q("exists", field="encyclopedia.method.group_eos_id"), - Q("exists", field="encyclopedia.method.group_parametervariation_id"), - ], - minimum_should_match=1, # At least one of the should query must match - ) - - s = Search(index=config.elastic.index_name) - s = s.query(bool_query) - - # Bucket the calculations by the group hashes. Only create a bucket if an - # above-minimum number of documents are found. 
- group_eos_bucket = A("terms", field="encyclopedia.method.group_eos_id", min_doc_count=4) - group_param_bucket = A("terms", field="encyclopedia.method.group_parametervariation_id", min_doc_count=2) - calc_aggregation = A( - "top_hits", - _source={"includes": ["calc_id"]}, - sort=[{"encyclopedia.properties.energies.energy_total": {"order": "asc"}}], - size=100, - ) - group_eos_bucket.bucket("calculations", calc_aggregation) - group_param_bucket.bucket("calculations", calc_aggregation) - s.aggs.bucket("groups_eos", group_eos_bucket) - s.aggs.bucket("groups_param", group_param_bucket) - - # We ignore the top level hits - s = s.extra(**{ - "size": 0, + # Get full entry for this material + s = MaterialSearch() + s.add_material_filter(Q("term", material_id=material_id)) + s.extra({ + "_source": {"includes": [ + "calculations.calc_id", + "calculations.method.group_eos_id", + "calculations.method.group_parametervariation_id", + "calculations.properties.energies.energy_total", + "calculations.idealized_structure.cell_volume", + ]}, + "size": 1, }) - # Collect information for each group from the aggregations - response = s.execute() - groups_eos = {group.key: [calc.calc_id for calc in group.calculations.hits] for group in response.aggs.groups_eos.buckets} - groups_param = {group.key: [calc.calc_id for calc in group.calculations.hits] for group in response.aggs.groups_param.buckets} + # Raise error if material not found + try: + calculations = s.calculations() + except MaterialAccessError: + abort(404, message=missing_material_msg.format(material_id)) + + groups_eos = defaultdict(list) + groups_param = defaultdict(list) + for calc in calculations: + try: + calc.properties.energies.energy_total + calc.idealized_structure.cell_volume + except AttributeError: + continue + try: + group_eos_id = calc.method.group_eos_id + if group_eos_id: + groups_eos[group_eos_id].append(calc.calc_id) + except AttributeError: + pass + try: + group_param_id = 
calc.method.group_parametervariation_id + if group_param_id: + groups_param[group_param_id].append(calc.calc_id) + except AttributeError: + pass + + # Filter out groups with too few entries + for key, items in list(groups_eos.items()): + if len(items) < 4: + del groups_eos[key] + for key, items in list(groups_param.items()): + if len(items) < 2: + del groups_param[key] # Return results result = { @@ -689,156 +679,98 @@ class EncGroupsResource(Resource): group_result = api.model("group_result", { - "calculations": fields.List(fields.String), - "energies": fields.List(fields.Float), - "volumes": fields.List(fields.Float), + "calculations": fields.List(fields.String, description="List of calculation ids."), + "energies": fields.List(fields.Float, description="List of total energies."), + "volumes": fields.List(fields.Float, description="List of cell volumes."), }) -group_source = { - "includes": [ - "calc_id", - "encyclopedia.properties.energies.energy_total", - "encyclopedia.material.idealized_structure.cell_volume", - ] -} @ns.route("/materials/<string:material_id>/groups/<string:group_type>/<string:group_id>") class EncGroupResource(Resource): @api.response(404, "Group not found") @api.response(400, "Bad request") - @api.response(200, "Metadata send", fields.Raw) + @api.response(200, "OK", group_result) @api.marshal_with(group_result) - @api.doc("enc_group") + @api.doc("get_material_group") + @api.param("group_type", "Type of group. Valid options are: 'eos' and 'par'.") + @api.param("group_id", "28 character identifier for the group.") + @api.param("material_id", "28 character identifier for the material.") @authenticate() def get(self, material_id, group_type, group_id): - """Used to query detailed information for a specific calculation group. + """Used to query detailed information about a specific calculation group. """ # Find entries for the given material, which have EOS or parameter # variation hashes set. 
if group_type == "eos": - group_id_source = "encyclopedia.method.group_eos_id" + group_id_source = "group_eos_id" elif group_type == "par": - group_id_source = "encyclopedia.method.group_parametervariation_id" + group_id_source = "group_parametervariation_id" else: abort(400, message="Unsupported group type.") - bool_query = Q( - "bool", - filter=get_enc_filter() + [ - Q("term", encyclopedia__material__material_id=material_id), - Q("term", **{group_id_source: group_id}), - ], - ) - - s = Search(index=config.elastic.index_name) - s = s.query(bool_query) - - # calc_id and energy should be extracted for each matched document. The - # documents are sorted by energy so that the minimum energy one can be - # easily extracted. A maximum request size is set in order to limit the - # result size. ES also has an index-level property - # "index.max_inner_result_window" that limits the number of results - # that an inner result can contain. - energy_aggregation = A( - "top_hits", - _source=group_source, - sort=[{"encyclopedia.properties.energies.energy_total": {"order": "asc"}}], - size=100, - ) - s.aggs.bucket("groups_eos", energy_aggregation) - - # We ignore the top level hits - s = s.extra(**{ - "size": 0, + s = MaterialSearch() + s.add_material_filter(Q("term", material_id=material_id)) + s.extra({ + "_source": {"includes": [ + "calculations.calc_id", + "calculations.properties.energies.energy_total", + "calculations.idealized_structure.cell_volume", + "calculations.method." 
+ group_id_source, + ]}, + "size": 1, }) - # Collect information for each group from the aggregations - response = s.execute() + # Raise error if material not found + try: + calculations = s.calculations() + except MaterialAccessError: + abort(404, message=missing_material_msg.format(material_id)) + + # Gather groups from the calculations + calcs = [] + energies = [] + volumes = [] + for calc in calculations: + try: + i_group_id = getattr(calc.method, group_id_source) + if i_group_id == group_id: + calcs.append(calc.calc_id) + volumes.append(calc.idealized_structure.cell_volume) + energies.append(calc.properties.energies.energy_total) + except Exception: + pass + + # Sort results by energy + energies = np.array(energies) + volumes = np.array(volumes) + calcs = np.array(calcs) + order = energies.argsort() + energies = energies[order] + volumes = volumes[order] + calcs = calcs[order] - hits = response.aggs.groups_eos.hits - calculations = [doc.calc_id for doc in hits] - energies = [doc.encyclopedia.properties.energies.energy_total for doc in hits] - volumes = [doc.encyclopedia.material.idealized_structure.cell_volume for doc in hits] + # Return results group_dict = { - "calculations": calculations, - "energies": energies, - "volumes": volumes, + "calculations": calcs.tolist(), + "energies": energies.tolist(), + "volumes": volumes.tolist(), } return group_dict, 200 -suggestions_map = { - "code_name": "dft.code_name", - "structure_type": "encyclopedia.material.bulk.structure_type", -} -suggestions_query = api.parser() -suggestions_query.add_argument( - "property", - type=str, - choices=("code_name", "structure_type"), - help="The property name for which suggestions are returned.", - location="args" -) -suggestions_result = api.model("suggestions_result", { - "code_name": fields.List(fields.String), - "structure_type": fields.List(fields.String), -}) - - -@ns.route("/suggestions") -class EncSuggestionsResource(Resource): - @api.response(404, "Suggestion not found") - 
@api.response(400, "Bad request") - @api.response(200, "Metadata send", fields.Raw) - @api.expect(suggestions_query, validate=False) - @api.marshal_with(suggestions_result, skip_none=True) - @api.doc("enc_suggestions") - @authenticate() - def get(self): - - # Parse request arguments - args = suggestions_query.parse_args() - prop = args.get("property", None) - - # Use aggregation to return all unique terms for the requested field. - # Without using composite aggregations there is a size limit for the - # number of aggregation buckets. This should, however, not be a problem - # since the number of unique values is low for all supported properties. - s = Search(index=config.elastic.index_name) - query = Q( - "bool", - filter=get_enc_filter() - ) - s = s.query(query) - s = s.extra(**{ - "size": 0, - }) - - terms_agg = A("terms", field=suggestions_map[prop]) - s.aggs.bucket("suggestions", terms_agg) - - # Gather unique values into a list - response = s.execute() - suggestions = [x.key for x in response.aggs.suggestions.buckets] - - return {prop: suggestions}, 200 - - calc_prop_map = { "calc_id": "calc_id", "upload_id": "upload_id", - "code_name": "dft.code_name", - "code_version": "dft.code_version", - "functional_type": "encyclopedia.method.functional_type", - "basis_set_type": "dft.basis_set", - "core_electron_treatment": "encyclopedia.method.core_electron_treatment", - "run_type": "encyclopedia.calculation.calculation_type", - "has_dos": "encyclopedia.properties.electronic_dos", - "has_band_structure": "encyclopedia.properties.electronic_band_structure", - "has_thermal_properties": "encyclopedia.properties.thermodynamical_properties", - "has_phonon_dos": "encyclopedia.properties.phonon_dos", - "has_phonon_band_structure": "encyclopedia.properties.phonon_band_structure", + "code_name": "method.program_name", + "code_version": "method.program_version", + "functional_type": "method.functional_type", + "basis_set_type": "method.basis_set", + "core_electron_treatment": 
"method.core_electron_treatment", + "run_type": "workflow.workflow_type", + "has_dos": "properties.has_electronic_dos", + "has_band_structure": "properties.has_electronic_band_structure", + "has_thermal_properties": "properties.has_thermodynamical_properties", } calculation_result = api.model("calculation_result", { "calc_id": fields.String, @@ -852,8 +784,6 @@ calculation_result = api.model("calculation_result", { "has_dos": fields.Boolean, "has_band_structure": fields.Boolean, "has_thermal_properties": fields.Boolean, - "has_phonon_dos": fields.Boolean, - "has_phonon_band_structure": fields.Boolean, }) representatives_result = api.model("representatives_result", { "idealized_structure": fields.String, @@ -871,41 +801,20 @@ calculations_result = api.model("calculations_result", { @ns.route("/materials/<string:material_id>/calculations") class EncCalculationsResource(Resource): - @api.response(404, "Suggestion not found") + @api.response(404, "Material not found") @api.response(400, "Bad request") - @api.response(200, "Metadata send", fields.Raw) - @api.doc("get_calculations") + @api.response(200, "OK", calculations_result) + @api.doc("get_material_calculations") @authenticate() def get(self, material_id): - """Used to return all calculations related to the given material. Also - returns a representative calculation for each property shown in the - overview page. - """ - s = Search(index=config.elastic.index_name) - query = Q( - "bool", - filter=get_enc_filter() + [ - Q("term", encyclopedia__material__material_id=material_id), - ] - ) - s = s.query(query) - - # The query is filtered already on the ES side so we don"t need to - # transfer so much data. - s = s.extra(**{ - "_source": {"includes": list(calc_prop_map.values()) + ["dft.xc_functional"]}, - "size": 10000, - "from": 0, - }) - response = s.execute() + """Used to return information about all calculations related to the given material. 
- # No such material - if len(response) == 0: - abort(404, message="There is no material {}".format(material_id)) - - # Add representative properties. It might be possible to write a custom - # ES scoring mechanism or aggregation to also perform the selection. - representatives = {} + Returns a list of all calculations and a representative calculation for + few select quantities that are shown in the material overview page. + """ + s = MaterialSearch() + s.add_material_filter(Q("term", material_id=material_id)) + s.extra({"_source": {"includes": ["calculations"]}}) def calc_score(entry): """Custom scoring function used to sort results by their @@ -919,37 +828,53 @@ class EncCalculationsResource(Resource): "GGA": 100 } code_score = { - "FHI-aims": 3, - "VASP": 2, + "VASP": 3, # Prefer VASP data as it is the "cleanest" on average + "FHI-aims": 2, "Quantum Espresso": 1, } - code_name = entry.dft.code_name - functional = entry.dft.xc_functional - has_dos = rgetattr(entry, "encyclopedia.properties.electronic_band_structure") is not None - has_bs = rgetattr(entry, "encyclopedia.properties.electronic_dos") is not None + code_name = entry.method.program_name + functional = entry.method.functional_type + try: + has_bs = entry.properties.has_electronic_band_structure + except AttributeError: + has_bs = False + try: + has_dos = entry.properties.has_electronic_dos + except AttributeError: + has_dos = False score += functional_score.get(functional, 0) score += code_score.get(code_name, 0) if has_dos and has_bs: score += 10 - return (score, entry["calc_id"]) + return (score, entry.calc_id) + + # Raise error if material not found + try: + calculations = s.calculations() + except MaterialAccessError: + abort(404, message=missing_material_msg.format(material_id)) - # The calculations are first sorted by "quality" - sorted_calc = sorted(response, key=lambda x: calc_score(x), reverse=True) + # Sort calculations by "quality" + sorted_calc = sorted(calculations, key=lambda x: 
calc_score(x), reverse=True) # Get the requested representative properties + representatives = {} representatives["idealized_structure"] = sorted_calc[0].calc_id thermo_found = False bs_found = False dos_found = False for calc in sorted_calc: - if rgetattr(calc, "encyclopedia.properties.thermodynamical_properties") is not None: + if not hasattr(calc, "properties"): + continue + + if not thermo_found and calc.properties.has_thermodynamical_properties: representatives["thermodynamical_properties"] = calc.calc_id thermo_found = True - if rgetattr(calc, "encyclopedia.properties.electronic_band_structure") is not None: + if not bs_found and calc.properties.has_electronic_band_structure: representatives["electronic_band_structure"] = calc.calc_id bs_found = True - if rgetattr(calc, "encyclopedia.properties.electronic_dos") is not None: + if not dos_found and calc.properties.has_electronic_dos: representatives["electronic_dos"] = calc.calc_id dos_found = True if thermo_found and bs_found and dos_found: @@ -957,13 +882,8 @@ class EncCalculationsResource(Resource): # Create result JSON results = [] - for entry in response: + for entry in sorted_calc: calc_dict = get_es_doc_values(entry, calc_prop_map) - calc_dict["has_dos"] = calc_dict["has_dos"] is not None - calc_dict["has_band_structure"] = calc_dict["has_band_structure"] is not None - calc_dict["has_thermal_properties"] = calc_dict["has_thermal_properties"] is not None - calc_dict["has_phonon_dos"] = calc_dict["has_phonon_dos"] is not None - calc_dict["has_phonon_band_structure"] = calc_dict["has_phonon_band_structure"] is not None results.append(calc_dict) result = { @@ -1020,10 +940,11 @@ property_map = { class EncStatisticsResource(Resource): @api.response(404, "Suggestion not found") @api.response(400, "Bad request") - @api.response(200, "Metadata send", fields.Raw) + @api.response(200, "OK", statistics_result) @api.expect(statistics_query, validate=False) @api.marshal_with(statistics_result, skip_none=True) - 
@api.doc("enc_statistics") + @api.doc("get_material_statistics") + @api.param("material_id", "28 character identifier for the material.") @authenticate() def post(self, material_id): """Used to return statistics related to the specified material and @@ -1038,7 +959,7 @@ class EncStatisticsResource(Resource): # Find entries for the given material. bool_query = Q( "bool", - filter=get_enc_filter() + [ + filter=get_authentication_filters() + [ Q("term", encyclopedia__material__material_id=material_id), Q("terms", calc_id=data["calculations"]), ] @@ -1059,7 +980,7 @@ class EncStatisticsResource(Resource): # No hits on the top query level response = s.execute() if response.hits.total == 0: - abort(404, message="Could not find matching calculations.") + abort(404, message="The given calculations could not be found for material {}".format(material_id)) # Run a second query that creates histograms with fixed size buckets # based on the min and max from previous query. Might make more sense @@ -1137,48 +1058,61 @@ idealized_structure_result = api.model("idealized_structure_result", { calculation_property_map = { "lattice_parameters": { - "es_source": "encyclopedia.material.idealized_structure.lattice_parameters" + "source": "es", + "path": "encyclopedia.material.idealized_structure.lattice_parameters" }, "energies": { - "es_source": "encyclopedia.properties.energies", + "source": "es", + "path": "encyclopedia.properties.energies", }, "mass_density": { - "es_source": "encyclopedia.properties.mass_density", + "source": "es", + "path": "encyclopedia.properties.mass_density", }, "atomic_density": { - "es_source": "encyclopedia.properties.atomic_density", + "source": "es", + "path": "encyclopedia.properties.atomic_density", }, "cell_volume": { - "es_source": "encyclopedia.material.idealized_structure.cell_volume" + "source": "es", + "path": "encyclopedia.material.idealized_structure.cell_volume" }, "band_gap": { - "es_source": "encyclopedia.properties.band_gap" + "source": "es", 
+ "path": "encyclopedia.properties.band_gap" }, "electronic_band_structure": { - "es_source": "encyclopedia.properties.electronic_band_structure" + "source": "es", + "path": "encyclopedia.properties.electronic_band_structure" }, "electronic_dos": { - "es_source": "encyclopedia.properties.electronic_dos" + "source": "es", + "path": "encyclopedia.properties.electronic_dos" }, "phonon_band_structure": { - "es_source": "encyclopedia.properties.phonon_band_structure" + "source": "es", + "path": "encyclopedia.properties.phonon_band_structure" }, "phonon_dos": { - "es_source": "encyclopedia.properties.phonon_dos" + "source": "es", + "path": "encyclopedia.properties.phonon_dos" }, "thermodynamical_properties": { - "es_source": "encyclopedia.properties.thermodynamical_properties" + "source": "es", + "path": "encyclopedia.properties.thermodynamical_properties" }, "wyckoff_sets": { - "arch_source": "section_metadata/encyclopedia/material/idealized_structure/wyckoff_sets" + "source": "archive", + "path": "section_metadata/encyclopedia/material/idealized_structure/wyckoff_sets" }, "idealized_structure": { - "arch_source": "section_metadata/encyclopedia/material/idealized_structure" + "source": "archive", + "path": "section_metadata/encyclopedia/material/idealized_structure" }, } calculation_property_query = api.model("calculation_query", { - "properties": fields.List(fields.String), + "properties": fields.List(fields.String(enum=list(calculation_property_map.keys())), description="List of calculation properties to return."), }) energies = api.model("energies", { "energy_total": fields.Float, @@ -1216,15 +1150,13 @@ calculation_property_result = api.model("calculation_property_result", { class EncCalculationResource(Resource): @api.response(404, "Material or calculation not found") @api.response(400, "Bad request") - @api.response(200, "Metadata send", fields.Raw) + @api.response(200, "OK", calculation_property_result) @api.expect(calculation_property_query, validate=False) 
@api.marshal_with(calculation_property_result, skip_none=True) @api.doc("get_calculation") @authenticate() def post(self, material_id, calc_id): - """Used to return calculation details. Some properties are not - available in the ES index and are instead read from the Archive - directly. + """Get properties from a specific calculation related to a material. """ # Get query parameters as json try: @@ -1235,7 +1167,7 @@ class EncCalculationResource(Resource): s = Search(index=config.elastic.index_name) query = Q( "bool", - filter=get_enc_filter() + [ + filter=get_authentication_filters() + [ Q("term", encyclopedia__material__material_id=material_id), Q("term", calc_id=calc_id), ] @@ -1245,8 +1177,9 @@ class EncCalculationResource(Resource): # Create dictionaries for requested properties references = [] properties = data["properties"] - arch_properties = {} es_properties = {} + mongo_properties = {} + arch_properties = {} ref_properties = set(( "electronic_dos", "electronic_band_structure", @@ -1255,14 +1188,16 @@ class EncCalculationResource(Resource): "phonon_band_structure", )) for prop in properties: - es_source = calculation_property_map[prop].get("es_source") - if es_source is not None: - es_properties[prop] = es_source + source = calculation_property_map[prop]["source"] + path = calculation_property_map[prop]["path"] + if source == "es": + es_properties[prop] = path if prop in ref_properties: references.append(prop) - arch_source = calculation_property_map[prop].get("arch_source") - if arch_source is not None: - arch_properties[prop] = arch_source + elif source == "mongo": + mongo_properties[prop] = path + elif source == "archive": + arch_properties[prop] = path # The query is filtered already on the ES side so we don't need to # transfer so much data. 
@@ -1282,7 +1217,11 @@ class EncCalculationResource(Resource): # No such material if len(response) == 0: - abort(404, message="There is no material {} with calculation {}".format(material_id, calc_id)) + abort(404, message=( + "Could not retrieve calculation {} for material {}. The " + "entry either does not exist or requires authentication." + .format(calc_id, material_id)) + ) # Add references that are to be read from the archive for ref in references: @@ -1361,9 +1300,86 @@ class EncCalculationResource(Resource): value = value.to_dict() result[prop] = value + # Add results from Mongo + if len(mongo_properties) != 0: + mongo_db = infrastructure.mongo_client[config.mongo.db_name] + archives = mongo_db['archive'] + archive = archives.find_one({"_id": calc_id}) + for prop, mongo_source in mongo_properties.items(): + value = rgetattr(archive, mongo_source) + if value is not None: + result[prop] = value + return result, 200 +suggestions_map = { + "code_name": "dft.code_name", + "structure_type": "bulk.structure_type", +} +suggestions_query = api.parser() +suggestions_query.add_argument( + "property", + type=str, + choices=("code_name", "structure_type"), + help="The property name for which suggestions are returned.", + location="args" +) +suggestions_result = api.model("suggestions_result", { + "code_name": fields.List(fields.String), + "structure_type": fields.List(fields.String), +}) + + +@ns.route("/suggestions") +class EncSuggestionsResource(Resource): + @api.response(404, "Suggestion not found") + @api.response(400, "Bad request") + @api.response(200, "OK", suggestions_result) + @api.expect(suggestions_query, validate=False) + @api.marshal_with(suggestions_result, skip_none=True) + @api.doc("get_material_suggestions") + @authenticate() + def get(self): + """Dynamically retrieves a list of unique values for the given property. + """ + # Uses terms aggregation to return all unique terms for the requested + # field. 
Without using composite aggregations there is a size limit for + # the number of aggregation buckets. This should, however, not be a + # problem since the number of unique values is low for all supported + # properties. + + # Parse request arguments + args = suggestions_query.parse_args() + prop = args.get("property", None) + + # Material level suggestions + if prop == "structure_type": + s = MaterialSearch() + s.size(0) + s.add_material_aggregation("suggestions", A("terms", field=suggestions_map[prop], size=999)) + # Calculation level suggestions + elif prop == "code_name": + s = Search(index=config.elastic.index_name) + query = Q( + "bool", + filter=get_authentication_filters() + ) + s = s.query(query) + s = s.extra(**{ + "size": 0, + }) + + terms_agg = A("terms", field=suggestions_map[prop], size=999) + s.aggs.bucket("suggestions", terms_agg) + + # Gather unique values into a list + response = s.execute() + suggestions = [x.key for x in response.aggs.suggestions.buckets] + + return {prop: suggestions}, 200 + + report_query = api.model("report_query", { "server": fields.String, "username": fields.String, @@ -1381,13 +1397,14 @@ report_query = api.model("report_query", { class ReportsResource(Resource): @api.response(500, "Error sending report") @api.response(400, "Bad request") - @api.response(204, "Report succesfully sent", fields.Raw) - @api.expect(calculation_property_query, validate=False) - @api.marshal_with(calculation_property_result, skip_none=True) - @api.doc("enc_report") + @api.response(204, "Report succesfully sent") + @api.expect(report_query) + @api.doc("post_material_report") + @api.param("material_id", "28 character identifier for the material.") @authenticate(required=True) def post(self, material_id): - + """Post an error report on a material. Requires authentication. 
+ """ # Get query parameters as json try: query = marshal(request.get_json(), report_query) @@ -1414,29 +1431,7 @@ class ReportsResource(Resource): ).format(**query) try: infrastructure.send_mail( - name="webmaster", email="lauri.himanen@gmail.com", message=mail, subject='Encyclopedia error report') + name="webmaster", email="support@nomad-lab.eu", message=mail, subject='Encyclopedia error report') except Exception as e: abort(500, message="Error sending error report email.") return "", 204 - - -def read_archive(upload_id: str, calc_id: str) -> EntryArchive: - """Used to read data from the archive. - - Args: - upload_id: Upload id. - calc_id: Calculation id. - - Returns: - MSection: The section_run as MSection - For each path, a dictionary containing the path as key and the returned - section as value. - """ - upload_files = UploadFiles.get( - upload_id, is_authorized=create_authorization_predicate(upload_id, calc_id)) - - with upload_files.read_archive(calc_id) as archive: - data = archive[calc_id] - root = EntryArchive.m_from_dict(data.to_dict()) - - return root diff --git a/nomad/app/api/repo.py b/nomad/app/api/repo.py index b6fd6834541d3042eaf71b0071ee110d5e7fa6ff..057a6e25f44cca07a76ecee4801f968fbb155ff2 100644 --- a/nomad/app/api/repo.py +++ b/nomad/app/api/repo.py @@ -26,7 +26,6 @@ import elasticsearch.helpers from datetime import datetime from nomad import search, utils, datamodel, processing as proc, infrastructure, files -from nomad.metainfo import search_extension from nomad.datamodel import Dataset, User, EditableUserMetadata from nomad.app import common from nomad.app.common import RFC3339DateTime, DotKeyNested @@ -88,13 +87,13 @@ _search_request_parser.add_argument( _search_request_parser.add_argument( 'metrics', type=str, action='append', help=( 'Metrics to aggregate over all quantities and their values as comma separated list. ' - 'Possible values are %s.' % ', '.join(search_extension.metrics.keys()))) + 'Possible values are %s.' 
% ', '.join(search.metrics.keys()))) _search_request_parser.add_argument( 'statistics', type=str, action='append', help=( 'Quantities for which to aggregate values and their metrics.')) _search_request_parser.add_argument( 'exclude', type=str, action='split', help='Excludes the given keys in the returned data.') -for group_name in search_extension.groups: +for group_name in search.groups: _search_request_parser.add_argument( group_name, type=bool, help=('Return %s group data.' % group_name)) _search_request_parser.add_argument( @@ -106,15 +105,15 @@ _repo_calcs_model_fields = { 'A dict with all statistics. Each statistic is dictionary with a metrics dict as ' 'value and quantity value as key. The possible metrics are code runs(calcs), %s. ' 'There is a pseudo quantity "total" with a single value "all" that contains the ' - ' metrics over all results. ' % ', '.join(search_extension.metrics.keys())))} + ' metrics over all results. ' % ', '.join(search.metrics.keys())))} -for group_name in search_extension.groups: +for group_name in search.groups: _repo_calcs_model_fields[group_name] = (DotKeyNested if '.' in group_name else fields.Nested)(api.model('RepoGroup', { 'after': fields.String(description='The after value that can be used to retrieve the next %s.' % group_name), 'values': fields.Raw(description='A dict with %s as key. The values are dicts with "total" and "examples" keys.' 
% group_name) }), skip_none=True) -for qualified_name, quantity in search_extension.search_quantities.items(): +for qualified_name, quantity in search.search_quantities.items(): _repo_calcs_model_fields[qualified_name] = fields.Raw( description=quantity.description, allow_null=True, skip_none=True) @@ -123,7 +122,7 @@ _repo_calcs_model_fields.update(**{ 'interval': fields.String(description='Interval to use for upload time aggregation.', allow_null=True, skip_none=True), 'metrics': fields.List(fields.String, description=( 'Metrics to aggregate over all quantities and their values as comma separated list. ' - 'Possible values are %s.' % ', '.join(search_extension.metrics.keys())), allow_null=True, skip_none=True), + 'Possible values are %s.' % ', '.join(search.metrics.keys())), allow_null=True, skip_none=True), 'statistics_required': fields.List(fields.String, description='Quantities for which to aggregate values and their metrics.', allow_null=True, skip_none=True), 'exclude': fields.List(fields.String, description='Excludes the given keys in the returned data.', allow_null=True, skip_none=True) }) @@ -192,7 +191,7 @@ class RepoCalcsResource(Resource): abort(400, message='bad parameters: %s' % str(e)) for metric in metrics: - if metric not in search_extension.metrics: + if metric not in search.metrics: abort(400, message='there is no metric %s' % metric) search_request = search.SearchRequest() @@ -214,7 +213,7 @@ class RepoCalcsResource(Resource): group_metrics = [ group_quantity.metric_name - for group_name, group_quantity in search_extension.groups.items() + for group_name, group_quantity in search.groups.items() if args.get(group_name, False)] total_metrics = metrics + group_metrics if len(total_metrics) > 0: @@ -230,7 +229,7 @@ class RepoCalcsResource(Resource): results = search_request.execute_scrolled(scroll_id=scroll_id, size=per_page) else: - for group_name, group_quantity in search_extension.groups.items(): + for group_name, group_quantity in 
search.groups.items(): if args.get(group_name, False): kwargs: Dict[str, Any] = {} if group_name == 'uploads_grouped': @@ -252,7 +251,7 @@ class RepoCalcsResource(Resource): if 'quantities' in results: quantities = results.pop('quantities') - for group_name, group_quantity in search_extension.groups.items(): + for group_name, group_quantity in search.groups.items(): if args.get(group_name, False): results[group_name] = quantities[group_quantity.qualified_name] @@ -335,7 +334,7 @@ class RepoCalcsResource(Resource): abort(400, message='bad parameters: %s' % str(e)) for metric in metrics: - if metric not in search_extension.metrics: + if metric not in search.metrics: abort(400, message='there is no metric %s' % metric) search_request = search.SearchRequest() @@ -363,7 +362,7 @@ class RepoCalcsResource(Resource): group_metrics = [ group_quantity.metric_name - for group_name, group_quantity in search_extension.groups.items() + for group_name, group_quantity in search.groups.items() if group_name in data_in] total_metrics = metrics + group_metrics if len(total_metrics) > 0: @@ -379,7 +378,7 @@ class RepoCalcsResource(Resource): results = search_request.execute_scrolled(scroll_id=scroll_id, size=per_page) else: - for group_name, group_quantity in search_extension.groups.items(): + for group_name, group_quantity in search.groups.items(): if group_name in data_in: kwargs: Dict[str, Any] = {} if group_name == 'uploads_grouped': @@ -401,7 +400,7 @@ class RepoCalcsResource(Resource): if 'quantities' in results: quantities = results.pop('quantities') - for group_name, group_quantity in search_extension.groups.items(): + for group_name, group_quantity in search.groups.items(): if group_name in data_in: results[group_name] = quantities[group_quantity.qualified_name] @@ -865,7 +864,7 @@ _repo_quantities_search_request_parser.add_argument( _repo_quantities_model = api.model('RepoQuantitiesResponse', { 'quantities': fields.Nested(api.model('RepoQuantities', { quantity: 
fields.List(fields.Nested(_repo_quantity_model)) - for quantity in search_extension.search_quantities + for quantity in search.search_quantities })) }) diff --git a/nomad/cli/admin/admin.py b/nomad/cli/admin/admin.py index 643a35efc268abb031a2bf33f4ed2d9ba9670fc8..370b9006dec53c1d31e9a31d50f6118c8daa57bc 100644 --- a/nomad/cli/admin/admin.py +++ b/nomad/cli/admin/admin.py @@ -21,6 +21,10 @@ import threading from nomad import processing as proc, search, datamodel, infrastructure, utils, config from nomad.cli.cli import cli +from nomad.datamodel.material import Material, Calculation +from nomad.datamodel.encyclopedia import EncyclopediaMetadata +from nomad.search import material_document +from nomad.datamodel.material import Material, Calculation, Method, Properties, IdealizedStructure, Energies, Workflow, Bulk def __run_parallel( @@ -209,16 +213,367 @@ def index(threads, dry): if dry: for _ in elastic_updates(): pass - if threads > 1: - print(' use %d threads' % threads) - for _ in elasticsearch.helpers.parallel_bulk( - infrastructure.elastic_client, elastic_updates(), chunk_size=500, - thread_count=threads): + else: + if threads > 1: + print(' use %d threads' % threads) + for _ in elasticsearch.helpers.parallel_bulk( + infrastructure.elastic_client, elastic_updates(), chunk_size=500, + thread_count=threads): + pass + else: + elasticsearch.helpers.bulk( + infrastructure.elastic_client, elastic_updates()) + search.refresh() + + print('') + print('indexing completed') + + +@admin.command() +@click.option('--threads', type=int, default=1, help='Number of threads to use.') +@click.option('--code', multiple=True, type=str, help='Index only calculcations of given codes.') +@click.option('--dry', is_flag=True, help='Do not index, just compute entries.') +@click.option('--in-place', is_flag=True, default=False, help='Perform indexing in the current elastic search index. 
Meant only for small reindex operations.') +@click.option('-n', type=int, default=None, help='Number of calculations to process. Leave undefined to process all calculations.') +@click.option('--source', + type=click.Choice(['mongo', 'es'], case_sensitive=True)) +def index_materials(threads, code, dry, in_place, n, source): + """(Re-)index all materials. + + This command will completely rebuild the materials index. The index is + built from the material metainfo stored in MongoDB. The materials index can + be used normally during the reindexing. + """ + chunk_size = 500 + infrastructure.setup_mongo() + client = infrastructure.setup_elastic() + + # In order to do the reindexing with zero downtime, two different indices + # are rotated and an alias is used + old_index_name = list(client.indices.get(config.elastic.materials_index_name).keys())[0] + if in_place: + target_index_name = old_index_name + else: + if old_index_name == config.elastic.materials_index_name + "_a": + target_index_name = config.elastic.materials_index_name + "_b" + elif old_index_name == config.elastic.materials_index_name + "_b": + target_index_name = config.elastic.materials_index_name + "_a" + else: + raise ValueError( + "Unrecognized index name accociated with the alias {}" + .format(config.elastic.materials_index_name) + ) + + if source == "mongo": + all_calcs = proc.Calc.objects().count() + print('indexing materials from %d calculations ...' 
% all_calcs) + + # Bulk update + def elastic_updates(): + with utils.ETA(all_calcs, ' index %10d of %10d calcs, ETA %s') as eta: + mongo_db = infrastructure.mongo_client[config.mongo.db_name] + mongo_collection = mongo_db['archive'] + i_calc = 0 + for mongo_archive in mongo_collection.find(): + i_calc += 1 + if n is not None: + if i_calc > n: + return + eta.add() + + # Do not process entries that do not have the material + # information + try: + status = mongo_archive["section_metadata"]["encyclopedia"]["status"] + if status != EncyclopediaMetadata.status.type.success: + raise AttributeError + except (KeyError, AttributeError, IndexError): + continue + + # Create material information + metadata = mongo_archive["section_metadata"] + encyclopedia = EncyclopediaMetadata.m_from_dict(metadata["encyclopedia"]) + dft = metadata["dft"] + material: Material = Material() + material.material_id = encyclopedia.material.material_id + material.material_type = encyclopedia.material.material_type + material.material_name = encyclopedia.material.material_name + material.material_classification = encyclopedia.material.material_classification + material.formula = encyclopedia.material.formula + material.formula_reduced = encyclopedia.material.formula_reduced + material.species_and_counts = encyclopedia.material.species_and_counts + material.species = encyclopedia.material.species + enc_bulk = encyclopedia.material.bulk + if enc_bulk: + bulk = Bulk.m_from_dict(enc_bulk.m_to_dict()) + material.m_add_sub_section(Material.bulk, bulk) + + # Create calculation info for this entry + calc = Calculation() + calc.calc_id = metadata["calc_id"] + calc.upload_id = metadata["upload_id"] + mongo_calc = proc.Calc.get(calc.calc_id) + calc.published = mongo_calc["metadata"]["published"] + calc.with_embargo = mongo_calc["metadata"]["with_embargo"] + calc.owners = [mongo_calc["metadata"]["uploader"]] + mongo_calc["metadata"]["shared_with"] + enc_idealized_structure = 
encyclopedia.material.idealized_structure + idealized_structure = IdealizedStructure() + cell_volume = enc_idealized_structure.cell_volume + if cell_volume is not None: + idealized_structure.cell_volume = cell_volume + idealized_structure.lattice_parameters = enc_idealized_structure.lattice_parameters + calc.m_add_sub_section(Calculation.idealized_structure, idealized_structure) + enc_method = encyclopedia.method + method = Method.m_from_dict(enc_method.m_to_dict()) + method.program_name = dft["code_name"] + method.program_version = dft["code_version"] + method.basis_set = dft["basis_set"] + calc.m_add_sub_section(Calculation.method, method) + enc_props = encyclopedia.properties + + # Properties may not exist at all + if enc_props is not None: + properties = Properties() + + # Energies may not be present in all calculations + try: + energies = Energies.m_from_dict(enc_props.energies.m_to_dict()) + properties.m_add_sub_section(Properties.energies, energies) + except AttributeError: + pass + + properties.has_electronic_dos = enc_props.electronic_dos is not None + properties.has_electronic_band_structure = enc_props.electronic_band_structure is not None + properties.has_thermodynamical_properties = enc_props.thermodynamical_properties is not None + atomic_density = enc_props.atomic_density + if atomic_density is not None: + properties.atomic_density = atomic_density + mass_density = enc_props.mass_density + if mass_density is not None: + properties.mass_density = mass_density + band_gap = enc_props.band_gap + if band_gap is not None: + properties.band_gap = band_gap + band_gap_direct = enc_props.band_gap_direct + if band_gap_direct is not None: + properties.band_gap_direct = band_gap_direct + calc.m_add_sub_section(Calculation.properties, properties) + + workflow = Workflow() + workflow.workflow_type = encyclopedia.calculation.calculation_type + calc.m_add_sub_section(Calculation.workflow, workflow) + material.m_add_sub_section(Material.calculations, calc) + + # 
Update entry that inserts the full material info if entry + # does not exists, otherwise only adds the calculation into the + # nested subdocument + entry = {} + entry['_op_type'] = 'update' + entry['_index'] = target_index_name + entry['_id'] = material.material_id + entry['_type'] = 'doc' + entry['_source'] = { + "upsert": material.m_to_dict(include_defaults=False, partial="es"), + "doc_as_upsert": False, + "script": { + "source": "ctx._source.calculations.add(params.calc)", + "params": { + "calc": calc.m_to_dict(include_defaults=False, partial="es") + }, + } + } + yield entry + elif source == "es": + s = elasticsearch_dsl.Search(index=config.elastic.index_name) + filters = [elasticsearch_dsl.Q("term", encyclopedia__status="success")] + if code: + filters.append(elasticsearch_dsl.Q("terms", dft__code_name=code)) + query = elasticsearch_dsl.Q( + "bool", + filter=filters, + ) + s = s.query(query) + s = s.extra(**{ + "size": 0, + }) + all_calcs = s.execute().hits.total + print('indexing materials from %d calculations ...' % all_calcs) + + def elastic_updates(): + with utils.ETA(all_calcs, ' index %10d of %10d calcs, ETA %s', chunk_size) as eta: + + s = elasticsearch_dsl.Search(index=config.elastic.index_name) + filters = [elasticsearch_dsl.Q("term", encyclopedia__status="success")] + if code: + filters.append(elasticsearch_dsl.Q("terms", dft__code_name=code)) + query = elasticsearch_dsl.Q( + "bool", + filter=filters, + ) + s = s.query(query) + s = s.extra(**{ + "size": chunk_size, + }) + i_calc = 0 + for hit in s.scan(): + i_calc += 1 + if n is not None: + if i_calc > n: + return + eta.add() + + material: Material = Material() + calc = Calculation() + + # Check that all required information exists. If not, the + # calculation is skipped. 
+ try: + material.material_id = hit.encyclopedia.material.material_id + material.material_type = hit.encyclopedia.material.material_type + material.formula = hit.encyclopedia.material.formula + material.formula_reduced = hit.encyclopedia.material.formula_reduced + material.species_and_counts = hit.encyclopedia.material.species_and_counts + material.species = hit.encyclopedia.material.species + calc.calc_id = hit.calc_id + calc.upload_id = hit.upload_id + calc.published = hit.published + calc.with_embargo = hit.with_embargo + calc.owners = [x.user_id for x in hit.owners] + idealized_structure = IdealizedStructure.m_from_dict(hit.encyclopedia.material.idealized_structure.to_dict()) + calc.m_add_sub_section(Calculation.idealized_structure, idealized_structure) + + method = Method.m_from_dict(hit.encyclopedia.method.to_dict()) + method.program_name = hit.dft.code_name + method.program_version = hit.dft.code_version + method.basis_set = hit.dft.basis_set + calc.m_add_sub_section(Calculation.method, method) + + workflow = Workflow() + workflow.workflow_type = hit.encyclopedia.calculation.calculation_type + calc.m_add_sub_section(Calculation.workflow, workflow) + except AttributeError: + continue + + # Not all materials have a name + try: + material.material_name = hit.encyclopedia.material.material_name + except AttributeError: + pass + + # Not all materials have a bulk section + try: + bulk = Bulk.m_from_dict(hit.encyclopedia.material.bulk) + material.m_add_sub_section(Material.bulk, bulk) + except AttributeError: + pass + + # Properties may not exist at all + try: + enc_properties = hit.encyclopedia.properties + except AttributeError: + pass + else: + properties = Properties() + + # Energies may not be present in all calculations + try: + energies = Energies.m_from_dict(enc_properties.energies.to_dict()) + properties.m_add_sub_section(Properties.energies, energies) + except AttributeError: + pass + + # Gather the boolean flags that indicate the presence of + # certain 
properties + try: + properties.has_electronic_dos = enc_properties.electronic_dos is not None + except AttributeError: + properties.has_electronic_dos = False + try: + properties.has_electronic_band_structure = enc_properties.electronic_band_structure is not None + except AttributeError: + properties.has_electronic_band_structure = False + try: + properties.has_thermodynamical_properties = enc_properties.thermodynamical_properties is not None + except AttributeError: + properties.has_thermodynamical_properties = False + + # Not all materials have an atomic density + try: + properties.atomic_density = enc_properties.atomic_density + except AttributeError: + pass + + # Not all materials have a mass density + try: + properties.mass_density = enc_properties.mass_density + except AttributeError: + pass + + # Not all materials have band gaps + try: + properties.band_gap = enc_properties.band_gap + except AttributeError: + pass + + # Not all materials have band gap type + try: + properties.band_gap_direct = enc_properties.band_gap_direct + except AttributeError: + pass + + calc.m_add_sub_section(Calculation.properties, properties) + + material.m_add_sub_section(Material.calculations, calc) + + # Update entry that inserts the full material info if entry + # does not exists, otherwise only adds the calculation into + # the nested subdocument + entry = {} + entry['_op_type'] = 'update' + entry['_index'] = target_index_name + entry['_id'] = material.material_id + entry['_type'] = 'doc' + entry['_source'] = { + "upsert": material.m_to_dict(include_defaults=False, partial="es"), + "doc_as_upsert": False, + "script": { + "params": { + "calc": calc.m_to_dict(include_defaults=False, partial="es") + }, + } + } + if in_place: + entry['_source']["script"]["source"] = "ctx._source.calculations.removeIf(x -> x.calc_id == params.calc.calc_id); ctx._source.calculations.add(params.calc)" + else: + entry['_source']["script"]["source"] = "ctx._source.calculations.add(params.calc)" + yield 
entry + + if dry: + for _ in elastic_updates(): pass else: - elasticsearch.helpers.bulk( - infrastructure.elastic_client, elastic_updates()) - search.refresh() + # Create new index into which the data will be inserted. The old index will + # keep working while the new index is being built + material_document.init(index=target_index_name) + + if threads > 1: + print(' use %d threads' % threads) + for _ in elasticsearch.helpers.parallel_bulk( + infrastructure.elastic_client, elastic_updates(), chunk_size=chunk_size, + thread_count=threads): + pass + else: + elasticsearch.helpers.bulk( + infrastructure.elastic_client, elastic_updates()) + search.refresh() + + # Changes materials index alias to point to the new index and remove the + # old index. + if not in_place: + new_index = elasticsearch_dsl.Index(target_index_name) + new_index.put_alias(name=config.elastic.materials_index_name) + old_index = elasticsearch_dsl.Index(old_index_name) + old_index.delete() print('') print('indexing completed') diff --git a/nomad/config.py b/nomad/config.py index 802e374cef6a2fbd11c811849cb932a7d97cd95c..7687a5194dbecb6c6c41ad86a7282f16387f8aef 100644 --- a/nomad/config.py +++ b/nomad/config.py @@ -116,7 +116,8 @@ fs = NomadConfig( elastic = NomadConfig( host='localhost', port=9200, - index_name='nomad_fairdi_calcs' + index_name='nomad_fairdi_calcs', + materials_index_name='nomad_fairdi_materials' ) keycloak = NomadConfig( diff --git a/nomad/datamodel/datamodel.py b/nomad/datamodel/datamodel.py index 038c2994dfdae574eb51b91c4c45664d5c9ca998..06a3dd6cc25321dc176295e237515f6291e336c3 100644 --- a/nomad/datamodel/datamodel.py +++ b/nomad/datamodel/datamodel.py @@ -536,7 +536,7 @@ class EntryMetadata(metainfo.MSection): ems = metainfo.SubSection(sub_section=EMSMetadata, a_search='ems') dft = metainfo.SubSection(sub_section=DFTMetadata, a_search='dft', categories=[fast_access]) qcms = metainfo.SubSection(sub_section=QCMSMetadata, a_search='qcms') - encyclopedia = 
metainfo.SubSection(sub_section=EncyclopediaMetadata, a_search='encyclopedia') + encyclopedia = metainfo.SubSection(sub_section=EncyclopediaMetadata, categories=[fast_access], a_search='encyclopedia') def apply_user_metadata(self, metadata: dict): ''' Applies a user provided metadata dict to this calc. ''' diff --git a/nomad/datamodel/encyclopedia.py b/nomad/datamodel/encyclopedia.py index 10443e1a7f9df61b75681dde896c3a18337ffc0d..48b52e3f8a09af99619c65708cb385ccebdbb7a6 100644 --- a/nomad/datamodel/encyclopedia.py +++ b/nomad/datamodel/encyclopedia.py @@ -64,7 +64,7 @@ class WyckoffSet(MSection): Chemical element at this Wyckoff position. """ ) - variables = SubSection(sub_section=WyckoffVariables.m_def, repeats=False) + variables = SubSection(sub_section=WyckoffVariables.m_def, repeats=False, categories=[fast_access]) class LatticeParameters(MSection): @@ -190,8 +190,8 @@ class IdealizedStructure(MSection): """, a_search=Search() ) - wyckoff_sets = SubSection(sub_section=WyckoffSet.m_def, repeats=True) - lattice_parameters = SubSection(sub_section=LatticeParameters.m_def) + wyckoff_sets = SubSection(sub_section=WyckoffSet.m_def, repeats=True, categories=[fast_access]) + lattice_parameters = SubSection(sub_section=LatticeParameters.m_def, categories=[fast_access]) class Bulk(MSection): @@ -370,10 +370,10 @@ class Material(MSection): ) # Bulk-specific properties - bulk = SubSection(sub_section=Bulk.m_def, repeats=False) + bulk = SubSection(sub_section=Bulk.m_def, repeats=False, categories=[fast_access]) # The idealized structure for this material - idealized_structure = SubSection(sub_section=IdealizedStructure.m_def, repeats=False) + idealized_structure = SubSection(sub_section=IdealizedStructure.m_def, repeats=False, categories=[fast_access]) class Method(MSection): @@ -575,11 +575,10 @@ class Properties(MSection): """, a_search=Search() ) - energies = SubSection(sub_section=Energies.m_def, repeats=False, a_search='energies') + energies = 
SubSection(sub_section=Energies.m_def, repeats=False, categories=[fast_access], a_search='energies') electronic_band_structure = Quantity( type=Reference(section_k_band.m_def), shape=[], - categories=[fast_access], description=""" Reference to an electronic band structure. """, @@ -588,7 +587,6 @@ class Properties(MSection): electronic_dos = Quantity( type=Reference(section_dos.m_def), shape=[], - categories=[fast_access], description=""" Reference to an electronic density of states. """, @@ -597,7 +595,6 @@ class Properties(MSection): phonon_band_structure = Quantity( type=Reference(section_k_band.m_def), shape=[], - categories=[fast_access], description=""" Reference to a phonon band structure. """, @@ -606,7 +603,6 @@ class Properties(MSection): phonon_dos = Quantity( type=Reference(section_dos.m_def), shape=[], - categories=[fast_access], description=""" Reference to a phonon density of states. """, @@ -615,7 +611,6 @@ class Properties(MSection): thermodynamical_properties = Quantity( type=Reference(section_thermodynamical_properties.m_def), shape=[], - categories=[fast_access], description=""" Reference to a section containing thermodynamical properties. """, @@ -631,10 +626,10 @@ class EncyclopediaMetadata(MSection): Section which stores information for the NOMAD Encyclopedia. 
""" ) - material = SubSection(sub_section=Material.m_def, repeats=False, a_search='material') - method = SubSection(sub_section=Method.m_def, repeats=False, a_search='method') - properties = SubSection(sub_section=Properties.m_def, repeats=False, a_search='properties') - calculation = SubSection(sub_section=Calculation.m_def, repeats=False, a_search='calculation') + material = SubSection(sub_section=Material.m_def, repeats=False, categories=[fast_access], a_search='material') + method = SubSection(sub_section=Method.m_def, repeats=False, categories=[fast_access], a_search='method') + properties = SubSection(sub_section=Properties.m_def, repeats=False, categories=[fast_access], a_search='properties') + calculation = SubSection(sub_section=Calculation.m_def, repeats=False, categories=[fast_access], a_search='calculation') status = Quantity( type=MEnum("success", "unsupported_material_type", "unsupported_method_type", "unsupported_calculation_type", "invalid_metainfo", "failure"), description=""" diff --git a/nomad/datamodel/material.py b/nomad/datamodel/material.py index 0a8c6ba1bb2f96d354828a61503e578767966d94..f946dc1896fcae7e9707f507485ba9ff16f6d9f2 100644 --- a/nomad/datamodel/material.py +++ b/nomad/datamodel/material.py @@ -1,10 +1,21 @@ -from nomad.metainfo import MSection, Section, SubSection, Quantity +import numpy as np +from elasticsearch_dsl import Text, Keyword + +from nomad import config +from nomad.metainfo import MSection, Section, SubSection, Quantity, MEnum from nomad.metainfo.mongoengine_extension import Mongo, MongoDocument +from nomad.metainfo.search_extension import Search +from nomad.metainfo.elastic_extension import ElasticDocument +from nomad.datamodel.datamodel import MongoMetadata, EditableUserMetadata class DOSSimilarity(MSection): m_def = Section( - a_mongo=MongoDocument() + a_mongo=MongoDocument(), + description=""" + Contains information about the similarity of the density of states of + this material compared to other materials. 
+ """, ) material_ids = Quantity( type=str, @@ -20,17 +31,668 @@ class DOSSimilarity(MSection): class Similarity(MSection): m_def = Section( - a_mongo=MongoDocument() + a_mongo=MongoDocument(), + description=""" + Contains information about the similarity of this material to other + materials. + """, ) electronic_dos = SubSection(sub_section=DOSSimilarity.m_def, repeats=False) +class Energies(MSection): + m_def = Section( + a_flask=dict(skip_none=True), + a_search="energies", + description=""" + Contains different types of energies extracted from this entry. The + energies are extracted from a representative calculation: for geometry + optimization it is the last optimization step. + """ + ) + energy_total = Quantity( + type=np.dtype(np.float64), + unit="eV", + description=""" + Total energy. + """, + a_search=Search() + ) + energy_total_T0 = Quantity( + type=np.dtype(np.float64), + unit="eV", + description=""" + Total energy projected to T=0. + """, + a_search=Search() + ) + energy_free = Quantity( + type=np.dtype(np.float64), + unit="eV", + description=""" + Free energy. + """, + a_search=Search() + ) + + +class Properties(MSection): + m_def = Section( + a_flask=dict(skip_none=True), + a_search="properties", + description=""" + Contains derived physical properties that are specific to the NOMAD + Encyclopedia. + """ + ) + atomic_density = Quantity( + type=np.dtype(np.float64), + unit="1 / m ** 3", + description=""" + Atomic density of the material (atoms/volume)." + """, + a_search=Search() + ) + mass_density = Quantity( + type=np.dtype(np.float64), + unit="kg / m ** 3", + description=""" + Mass density of the material. + """, + a_search=Search() + ) + band_gap = Quantity( + type=np.dtype(np.float64), + unit="eV", + description=""" + Band gap value. If multiple spin channels are present, this value is + taken from the channel with smallest band gap value. 
+ """, + a_search=Search() + ) + band_gap_direct = Quantity( + type=bool, + description=""" + Whether band gap is direct or not. If multiple spin channels are + present, this value is taken from the channel with smallest band gap + value. + """, + a_search=Search() + ) + energies = SubSection(sub_section=Energies.m_def, repeats=False, a_search='energies') + has_electronic_band_structure = Quantity( + type=bool, + shape=[], + description=""" + True if the calculation contains an electronic band structure. + """, + a_search=Search() + ) + has_electronic_dos = Quantity( + type=bool, + shape=[], + description=""" + True if the calculation contains an electronic density of states. + """, + a_search=Search() + ) + has_thermodynamical_properties = Quantity( + type=bool, + shape=[], + description=""" + True if the calculation contains thermodynamical properties. + """, + a_search=Search() + ) + + +class Method(MSection): + m_def = Section( + a_flask=dict(skip_none=True), + a_search="method", + description=""" + Contains an overview of the methodology that was detected in this + entry. + """ + ) + program_name = Quantity( + type=MEnum("ABINIT", "Amber", "ASAP", "ATK", "BAND", "BigDFT", "CASTEP", "Charmm", "CP2K", "CPMD", "Crystal", "DFTB+", "DL_POLY", "DMol3", "elastic", "elk", "exciting", "FHI-aims", "fleur", "fplo", "GAMESS", "Gaussian", "GPAW", "Gromacs", "Gromos", "gulp", "LAMMPS", "libAtoms", "MOLCAS", "MOPAC", "Namd", "NWChem", "Octopus", "ONETEP", "OpenKIM", "ORCA", "Phonopy", "qbox", "Quantum Espresso", "Siesta", "TINKER", "turbomole", "VASP", "WIEN2k"), + a_search=Search(), + description=""" + Name of the program used for this calculation. + """, + ) + program_version = Quantity( + type=str, + a_search=Search(), + description=""" + Version of the program used for this calculation. 
+ """, + ) + basis_set = Quantity( + type=MEnum('numeric AOs', 'LCAO', 'gaussians', '(L)APW+lo', 'plane waves', 'psinc functions', 'real-space grid'), + description='The used basis set functions.', + a_search=Search(), + ) + method_type = Quantity( + type=MEnum("DFT", "GW", "unavailable", DFTU="DFT+U"), + description=""" + Generic name for the used methodology. + """, + a_search=Search() + ) + core_electron_treatment = Quantity( + type=MEnum("full all electron", "all electron frozen core", "pseudopotential", "unavailable"), + description=""" + How the core electrons are described. + """, + a_search=Search() + ) + functional_long_name = Quantity( + type=str, + description=""" + Full identified for the used exchange-correlation functional. + """, + a_search=Search() + ) + functional_type = Quantity( + type=MEnum("GGA", "LDA", "hybrid-GGA", "hybrid-meta-GGA" "HF", "GW", "meta-GGA"), + description=""" + Basic type of the used exchange-correlation functional. + """, + a_search=Search() + ) + method_id = Quantity( + type=str, + description=""" + A fixed length, unique method identifier in the form of a hash digest. + The hash is created by using several method settings as seed. This hash + is only defined if a set of well-defined method settings is available + for the used program. + """ + ) + group_eos_id = Quantity( + type=str, + description=""" + A fixed length, unique identifier for equation-of-state calculations. + Only calculations within the same upload and with a method hash + available will be grouped under the same hash. + """, + a_search=Search() + ) + group_parametervariation_id = Quantity( + type=str, + description=""" + A fixed length, unique identifier for calculations where structure is + identical but the used computational parameters are varied. Only + calculations within the same upload and with a method hash available + will be grouped under the same hash. 
+ """, + a_search=Search() + ) + gw_starting_point = Quantity( + type=str, + description=""" + The exchange-correlation functional that was used as a starting point + for this GW calculation. + """ + ) + gw_type = Quantity( + type=MEnum("G0W0", "scGW"), + description=""" + Basic type of GW calculation. + """ + ) + smearing_kind = Quantity( + type=str, + description=""" + Smearing function used for the electronic structure calculation. + """ + ) + smearing_parameter = Quantity( + type=np.dtype(np.float64), + description=""" + Parameter for smearing, usually the width. + """ + ) + + +class Workflow(MSection): + m_def = Section( + a_flask=dict(skip_none=True), + a_search="calculation", + description=""" + Contains an overview of the type of workflow that was detected in + this entry. + """ + ) + workflow_type = Quantity( + type=MEnum( + single_point="single point", + geometry_optimization="geometry optimization", + molecular_dynamics="molecular dynamics", + phonon_calculation="phonon calculation", + elastic_constants="elastic constants", + qha_calculation="QHA calculation", + gw_calculation="GW calculation", + equation_of_state="equation of state", + parameter_variation="parameter variation", + unavailable="unavailable"), + description=""" + Defines the type of workflow that was detected for this entry. + """, + a_search=Search() + ) + + +class WyckoffVariables(MSection): + m_def = Section( + a_flask=dict(skip_none=True), + description=""" + Contains the variables associated with a Wyckoff set. + """ + ) + x = Quantity( + type=np.dtype(np.float64), + description=""" + The x variable if present. + """ + ) + y = Quantity( + type=np.dtype(np.float64), + description=""" + The y variable if present. + """ + ) + z = Quantity( + type=np.dtype(np.float64), + description=""" + The z variable if present. + """ + ) + + +class WyckoffSet(MSection): + m_def = Section( + a_flask=dict(skip_none=True), + description=""" + Section for storing Wyckoff set information. 
+ """ + ) + wyckoff_letter = Quantity( + type=str, + description=""" + The Wyckoff letter for this set. + """ + ) + indices = Quantity( + type=np.dtype('i4'), + shape=["1..*"], + description=""" + Indices of the atoms belonging to this group. + """ + ) + element = Quantity( + type=str, + description=""" + Chemical element at this Wyckoff position. + """ + ) + variables = SubSection(sub_section=WyckoffVariables.m_def, repeats=False) + + +class LatticeParameters(MSection): + m_def = Section( + a_flask=dict(skip_none=True), + description=""" + Lattice parameters of the idealized cell. The lattice parameters can + only be reported consistently after idealization and may not perfectly + correspond to the original simulation cell. + """, + ) + a = Quantity( + type=float, + description=""" + Length of the first basis vector. + """, + ) + b = Quantity( + type=float, + description=""" + Length of the second basis vector. + """, + ) + c = Quantity( + type=float, + description=""" + Length of the third basis vector. + """, + ) + alpha = Quantity( + type=float, + description=""" + Angle between second and third basis vector. + """, + ) + beta = Quantity( + type=float, + description=""" + Angle between first and third basis vector. + """, + ) + gamma = Quantity( + type=float, + description=""" + Angle between first and second basis vector. + """, + ) + + +class IdealizedStructure(MSection): + m_def = Section( + a_flask=dict(skip_none=True), + description=""" + Contains structural information for an idealized representation of the + material used in the calculation. This idealization is used for + visualizing the material and for calculating the structural properties. + The properties of the idealized structure may slightly vary from the + original structure used in the calculation. + """, + a_search="idealized_structure", + ) + atom_labels = Quantity( + type=str, + shape=['1..*'], + description=""" + Type (element, species) of each atom. 
+ """ + ) + atom_positions = Quantity( + type=np.dtype(np.float64), + shape=['number_of_atoms', 3], + description=""" + Atom positions given in coordinates that are relative to the idealized + cell. + """ + ) + lattice_vectors = Quantity( + type=np.dtype(np.float64), + shape=[3, 3], + description=""" + Lattice vectors of the idealized structure. For bulk materials it is + the Bravais cell. This cell is representative and is idealized to match + the detected symmetry properties. + """ + ) + lattice_vectors_primitive = Quantity( + type=np.dtype(np.float64), + shape=[3, 3], + description=""" + Lattice vectors of the the primitive unit cell in a form to be visualized + within the idealized cell. This cell is representative and is + idealized to match the detected symmemtry properties. + """ + ) + periodicity = Quantity( + type=np.bool, + shape=[3], + description=""" + Automatically detected true periodicity of each lattice direction. May + not correspond to the periodicity used in the calculation. + """ + ) + number_of_atoms = Quantity( + type=int, + description=""" + Number of atoms in the idealized structure." + """ + ) + cell_volume = Quantity( + type=np.dtype(np.float64), + unit="m ** 3", + description=""" + Volume of the idealized cell. The cell volume can only be reported + consistently after idealization and may not perfectly correspond to the + original simulation cell. + """, + a_search=Search() + ) + wyckoff_sets = SubSection(sub_section=WyckoffSet.m_def, repeats=True) + lattice_parameters = SubSection(sub_section=LatticeParameters.m_def) + + +class Calculation(MSection): + m_def = Section( + description=""" + List of all calculations related to this material. 
+        """
+    )
+    calc_id = Quantity(
+        type=str,
+        a_search=Search()
+    )
+    upload_id = Quantity(
+        type=str,
+        a_search=Search()
+    )
+    published = Quantity(
+        type=bool, default=False,
+        description='Indicates if the entry is published',
+        categories=[MongoMetadata],
+        a_search=Search()
+    )
+    with_embargo = Quantity(
+        type=bool, default=False, categories=[MongoMetadata, EditableUserMetadata],
+        description='Indicates if this entry is under an embargo',
+        a_search=Search()
+    )
+    owners = Quantity(
+        type=str, shape=['0..*'],
+        description='The ids of the users that have access (upload and shared with users) to this calculation',
+        a_search=Search(many_or='append')
+    )
+    method = SubSection(sub_section=Method.m_def, repeats=False, a_search="method")
+    workflow = SubSection(sub_section=Workflow.m_def, repeats=False, a_search="workflow")
+    properties = SubSection(sub_section=Properties.m_def, repeats=False, a_search="properties")
+    idealized_structure = SubSection(sub_section=IdealizedStructure.m_def, repeats=False, a_search="idealized_structure")
+
+
+class Bulk(MSection):
+    m_def = Section(
+        a_flask=dict(skip_none=True),
+        description="""
+        Contains information that is specific to bulk crystalline materials.
+        """
+    )
+    bravais_lattice = Quantity(
+        type=MEnum("aP", "mP", "mS", "oP", "oS", "oI", "oF", "tP", "tI", "hR", "hP", "cP", "cI", "cF"),
+        description="""
+        The Bravais lattice type in the Pearson notation, where the first
+        lowercase letter indicates the crystal system, and the second uppercase
+        letter indicates the lattice type. The value can only be one of the 14
+        different Bravais lattices in three dimensions.
+ + Crystal system letters: + + a = Triclinic + m = Monoclinic + o = Orthorhombic + t = Tetragonal + h = Hexagonal and Trigonal + c = Cubic + + Lattice type letters: + + P = Primitive + S (A, B, C) = One side/face centred + I = Body centered + R = Rhombohedral centring + F = All faces centred + """, + a_search=Search() + ) + crystal_system = Quantity( + type=MEnum("triclinic", "monoclinic", "orthorhombic", "tetragonal", "trigonal", "hexagonal", "cubic"), + description=""" + The detected crystal system. One of seven possibilities in three dimensions. + """, + a_search=Search() + ) + has_free_wyckoff_parameters = Quantity( + type=bool, + description=""" + Whether the material has any Wyckoff sites with free parameters. If a + materials has free Wyckoff parameters, at least some of the atoms are + not bound to a particular location in the structure but are allowed to + move with possible restrictions set by the symmetry. + """, + a_search=Search() + ) + point_group = Quantity( + type=MEnum("1", "-1", "2", "m", "2/m", "222", "mm2", "mmm", "4", "-4", "4/m", "422", "4mm", "-42m", "4/mmm", "3", "-3", "32", "3m", "-3m", "6", "-6", "6/m", "622", "6mm", "-6m2", "6/mmm", "23", "m-3", "432", "-43m", "m-3m"), + description=""" + Point group in Hermann-Mauguin notation, part of crystal structure + classification. There are 32 point groups in three dimensional space. + """, + a_search=Search() + ) + space_group_number = Quantity( + type=int, + description=""" + Integer representation of the space group, part of crystal structure + classification, part of material definition. + """, + a_search=Search() + ) + space_group_international_short_symbol = Quantity( + type=str, + description=""" + International short symbol notation of the space group. + """, + a_search=Search() + ) + structure_prototype = Quantity( + type=str, + description=""" + The prototypical material for this crystal structure. 
+        """,
+        a_search=Search()
+    )
+    structure_type = Quantity(
+        type=str,
+        description="""
+        Classification according to known structure type, considering the point
+        group of the crystal and the occupations with different atom types.
+        """,
+        a_search=Search()
+    )
+    strukturbericht_designation = Quantity(
+        type=str,
+        description="""
+        Classification of the material according to the historically grown "strukturbericht".
+        """,
+        a_search=Search()
+    )
+    program_name = Quantity(
+        type=str,
+        description="""
+        Name of the program that was used to produce this material.
+        """,
+        a_search=Search()
+    )
+    structure_type = Quantity(
+        type=str,
+        description="""
+        Classification according to known structure type, considering the point
+        group of the crystal and the occupations with different atom types.
+        """,
+        a_search=Search()
+    )
+    strukturbericht_designation = Quantity(
+        type=str,
+        description="""
+        Classification of the material according to the historically grown "strukturbericht".
+        """,
+        a_search=Search()
+    )
+
+
 class Material(MSection):
     m_def = Section(
+        a_elastic=ElasticDocument(index_name=config.elastic.materials_index_name, id=lambda x: x.material_id),
         a_mongo=MongoDocument()
     )
     material_id = Quantity(
         type=str,
-        a_mongo=Mongo(primary_key=True)
+        description="""
+        A fixed length, unique material identifier in the form of a hash
+        digest.
+        """,
+        a_mongo=Mongo(primary_key=True),
+        a_search=Search()
+    )
+    material_type = Quantity(
+        type=MEnum(bulk="bulk", two_d="2D", one_d="1D"),
+        description="""
+        Broad structural classification for the material.
+        """,
+        a_search=Search()
+    )
+    material_name = Quantity(
+        type=str,
+        description="""
+        Most meaningful name for a material if one could be assigned.
+        """,
+        a_search=Search()
+    )
+    material_classification = Quantity(
+        type=str,
+        description="""
+        Contains the compound class and classification of the material
+        according to Springer Materials in JSON format.
+        """
+    )
+    formula = Quantity(
+        type=str,
+        description="""
+        Formula giving the composition and occurrences of the elements in the
+        Hill notation. For periodic materials the formula is calculated from the
+        primitive unit cell.
+        """,
+        a_search=Search()
+    )
+    formula_reduced = Quantity(
+        type=str,
+        description="""
+        Formula giving the composition and occurrences of the elements in the
+        Hill notation where the number of occurrences has been divided by the
+        greatest common divisor.
+        """,
+        a_search=Search()
+    )
+    species_and_counts = Quantity(
+        type=str,
+        description="""
+        The formula separated into individual terms containing both the atom
+        type and count. Used for searching parts of a formula.
+        """,
+        a_search=Search(mapping=Text(multi=True, fields={'keyword': Keyword()}))
+    )
+    species = Quantity(
+        type=str,
+        description="""
+        The formula separated into individual terms containing only unique atom
+        species. Used for searching materials containing specific elements.
+        """,
+        a_search=Search(mapping=Text(multi=True, fields={'keyword': Keyword()}))
     )
     similarity = SubSection(sub_section=Similarity.m_def, repeats=False)
+    calculations = SubSection(
+        sub_section=Calculation.m_def,
+        repeats=True,
+        a_search=Search("calculations", nested=True),
+        description="""
+        List of all calculations related to this material.
+        """
+    )
+    bulk = SubSection(sub_section=Bulk.m_def, repeats=False, a_search="bulk")
diff --git a/nomad/infrastructure.py b/nomad/infrastructure.py
index 5077e3bd85a94fd11114d87bd57ee04cd0486d8b..59473e7fbbf4b3a5342c45ffd00113f6ef8e53a0 100644
--- a/nomad/infrastructure.py
+++ b/nomad/infrastructure.py
@@ -80,26 +80,52 @@ def setup_mongo(client=False):
     return mongo_client
 
 
-def setup_elastic():
+def setup_elastic(create_mappings=True):
     '''
     Creates connection to elastic search.
''' + from nomad.search import entry_document, material_document + from elasticsearch_dsl import Index + global elastic_client elastic_client = connections.create_connection( hosts=['%s:%d' % (config.elastic.host, config.elastic.port)], timeout=60, max_retries=10, retry_on_timeout=True) logger.info('setup elastic connection') - try: - from nomad.search import entry_document - entry_document.init(index=config.elastic.index_name) - except RequestError as e: - if e.status_code == 400 and 'resource_already_exists_exception' in e.error: - # happens if two services try this at the same time - pass - else: - raise e + # Setup materials index mapping. An alias is used to be able to reindex the + # materials with zero downtime. First see to which index the alias points + # to. If alias is not set, create it. Update the mapping in the index + # pointed to by the alias. + if create_mappings: + try: + if elastic_client.indices.exists_alias(config.elastic.materials_index_name): + index_name = list(elastic_client.indices.get(config.elastic.materials_index_name).keys())[0] + material_document.init(index_name) + else: + index_name = config.elastic.materials_index_name + "_a" + material_document.init(index_name) + index = Index(index_name) + index.put_alias(name=config.elastic.materials_index_name) + except RequestError as e: + if e.status_code == 400 and 'resource_already_exists_exception' in e.error: + # happens if two services try this at the same time + pass + else: + raise e + + # Initialize calculation index mapping + try: + entry_document.init(index=config.elastic.index_name) + except RequestError as e: + if e.status_code == 400 and 'resource_already_exists_exception' in e.error: + # happens if two services try this at the same time + pass + else: + raise e - entry_document._index._name = config.elastic.index_name - logger.info('initialized elastic index', index_name=config.elastic.index_name) + entry_document._index._name = config.elastic.index_name + 
material_document._index._name = config.elastic.materials_index_name + logger.info('initialized elastic index for calculations', index_name=config.elastic.index_name) + logger.info('initialized elastic index for materials', index_name=config.elastic.materials_index_name) return elastic_client @@ -405,9 +431,12 @@ def reset(remove: bool): if not elastic_client: setup_elastic() elastic_client.indices.delete(index=config.elastic.index_name) - from nomad.search import entry_document + material_index_name = list(elastic_client.indices.get(config.elastic.materials_index_name).keys())[0] + elastic_client.indices.delete(index=material_index_name) + from nomad.search import entry_document, material_document if not remove: entry_document.init(index=config.elastic.index_name) + material_document.init(index=material_index_name) logger.info('elastic index resetted') except Exception as e: logger.error('exception resetting elastic', exc_info=e) diff --git a/nomad/metainfo/elastic_extension.py b/nomad/metainfo/elastic_extension.py index fe91147c1938d78a3930eabd2cfd169171d798b5..f7ec74d0002e40eac1a5ae5f9a289fc564f7ac2f 100644 --- a/nomad/metainfo/elastic_extension.py +++ b/nomad/metainfo/elastic_extension.py @@ -133,12 +133,12 @@ class ElasticDocument(SectionAnnotation): @property def document(self): - return ElasticDocument.create_document(self.definition) + return ElasticDocument.create_document(self.definition, index_name=self.index_name) @classmethod def create_document( - cls, section: Section, inner_doc: bool = False, attrs: Dict[str, Any] = None, - prefix: str = None): + cls, section: Section, attrs: Dict[str, Any] = None, + prefix: str = None, index_name: str = None, root=True): ''' Create all elasticsearch_dsl mapping classes for the section and its sub sections. 
''' @@ -146,19 +146,27 @@ class ElasticDocument(SectionAnnotation): if document is not None: return document - from elasticsearch_dsl import Document, InnerDoc, Keyword, Date, Integer, Boolean, Object, Double, Float, Long + from elasticsearch_dsl import Document, InnerDoc, Keyword, Date, Integer, Boolean, Object, Double, Float, Long, Nested if attrs is None: attrs = {} # create an field for each sub section for sub_section in section.all_sub_sections.values(): - sub_sectoin_prefix = '%s.%s' % (prefix, sub_section.name) if prefix else sub_section.name + sub_section_prefix = '%s.%s' % (prefix, sub_section.name) if prefix else sub_section.name + inner_document = ElasticDocument.create_document( - sub_section.sub_section, inner_doc=True, prefix=sub_sectoin_prefix) + sub_section.sub_section, prefix=sub_section_prefix, index_name=index_name, root=False) if inner_document is not None: - # sub sections with no elastic quantities get a None document - attrs[sub_section.name] = Object(inner_document) + try: + if sub_section.a_search.nested: + assert sub_section.repeats, ( + "Nested fields should be repeatable. If the subsection cannot be repeated, " + "define it as unnested instead." 
+ ) + attrs[sub_section.name] = Nested(inner_document) + except AttributeError: + attrs[sub_section.name] = Object(inner_document) # create an field for each quantity for quantity in section.all_quantities.values(): @@ -187,8 +195,7 @@ class ElasticDocument(SectionAnnotation): if prefix is not None: inner_prefix = '%s.%s' % (prefix, inner_prefix) inner_document = ElasticDocument.create_document( - cast(Section, quantity.type.target_section_def), inner_doc=True, - prefix=inner_prefix) + cast(Section, quantity.type.target_section_def), prefix=inner_prefix, index_name=index_name, root=False) annotation.mapping = Object(inner_document) elif isinstance(quantity.type, MEnum): annotation.mapping = Keyword(**kwargs) @@ -201,7 +208,7 @@ class ElasticDocument(SectionAnnotation): if first: assert annotation.field not in attrs, 'Elastic fields must be unique' attrs[annotation.field] = annotation.mapping - annotation.register(prefix, annotation.field) + annotation.register(prefix, annotation.field, index_name) first = False @@ -209,7 +216,10 @@ class ElasticDocument(SectionAnnotation): # do not create a document/inner document class, if no elastic quantities are defined return None - document = type(section.name, (InnerDoc if inner_doc else Document,), attrs) + doc_cls_obj = InnerDoc + if root: + doc_cls_obj = Document + document = type(section.name, (doc_cls_obj,), attrs) cls._all_documents[section.qualified_name()] = document return document @@ -261,14 +271,14 @@ class Elastic(DefinitionAnnotation): def init_annotation(self, definition): super().init_annotation(definition) - assert isinstance(definition, Quantity), 'The Elastic annotation is only usable with Quantities.' 
 if self.field is None:
            self.field = definition.name
 
         if self.value is None:
             self.value = lambda section: section.m_get(definition)
 
-    def register(self, prefix: str, field: str):
+    def register(self, prefix: str, field: str, index: str):
+
         if prefix is None:
             self.qualified_field = field
         else:
diff --git a/nomad/metainfo/metainfo.py b/nomad/metainfo/metainfo.py
index 168e02a362fb470215a653090ad77aa656957edb..89f3cd9ef2a7315c114ec1af024dfd554a97748e 100644
--- a/nomad/metainfo/metainfo.py
+++ b/nomad/metainfo/metainfo.py
@@ -1094,11 +1094,23 @@ class MSection(metaclass=MObjectMeta): # TODO find a way to make this a subclas
                 by partial.
             partial: A function that determines if a definition should be included in
                 the output dictionary. Takes a definition and the containing section
-                as arguments. Partial is applied recursively on sub-sections.
-                Overrides categories.
+                as arguments. Two default functions can be used by providing a
+                string instead:
+
+                - 'mongo': Only include quantities that have an a_mongo
+                  annotation.
+                - 'es': Only include quantities that have an a_elastic or
+                  an a_search annotation.
+
+                Partial is applied recursively on sub-sections. Overrides
+                categories.
         '''
         # determine partial for sub-sections and partial based on categories
         if partial is not None:
+            if partial == "es":
+                partial = lambda d, s: hasattr(d, "a_elastic") or hasattr(d, "a_search")
+            if partial == "mongo":
+                partial = lambda d, s: hasattr(d, "a_mongo")
             child_partial = partial
         else:
             if categories is None:
diff --git a/nomad/metainfo/search_extension.py b/nomad/metainfo/search_extension.py
index c6d4ba4003a895ca70fdd3a5426045b9010fa17e..9f513aaf2569672122d2240c6616ef1dd8376b5a 100644
--- a/nomad/metainfo/search_extension.py
+++ b/nomad/metainfo/search_extension.py
@@ -12,25 +12,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Callable, Any, Dict, List +from typing import Callable, Any, Dict, List, DefaultDict +from collections import defaultdict from nomad.metainfo.elastic_extension import Elastic -search_quantities: Dict[str, 'Search'] = {} +search_quantities_by_index: DefaultDict[str, Dict[str, 'Search']] = defaultdict(dict) ''' All available search quantities by their full qualified name. ''' -metrics: Dict[str, 'Search'] = {} +metrics_by_index: DefaultDict[str, Dict[str, 'Search']] = defaultdict(dict) ''' The available search metrics. Metrics are integer values given for each entry that can be used in statistics (aggregations), e.g. the sum of all total energy calculations or -cardinality of all unique geometries. The key is the metric name. +cardinality of all unique geometries. First key is the index name, second key +is the metric name. ''' -groups: Dict[str, 'Search'] = {} -''' The available groupable quantities. The key is the group name. ''' +groups_by_index: DefaultDict[str, Dict[str, 'Search']] = defaultdict(dict) +''' The available groupable quantities. First key is the index name, second key +is the metric name. ''' -order_default_quantities: Dict[str, 'Search'] = {} +order_default_quantities_by_index: DefaultDict[str, Dict[str, 'Search']] = defaultdict(dict) ''' The quantity for each domain (key) that is the default quantity to order search results by. ''' @@ -74,6 +77,9 @@ class Search(Elastic): This might be different from the field that is used to store the value in elastic search. This is especially useful if the field represents a inner document and a subfield of this inner object should be used for search. + nested: Indicates if a subsection should be treated as a Nested field. + Defaults to False meaning that the subsection is treated as an inner + object. 
''' def __init__( @@ -87,6 +93,7 @@ class Search(Elastic): statistic_values: List[str] = None, derived: Callable[[Any], Any] = None, search_field: str = None, + nested: bool = False, **kwargs): super().__init__(field=None, **kwargs) @@ -105,6 +112,7 @@ class Search(Elastic): self.statistic_order = statistic_order self.statistic_values = statistic_values self.search_field = search_field + self.nested = nested self.derived = derived @@ -124,7 +132,7 @@ class Search(Elastic): super().init_annotation(definition) - def register(self, prefix, field): + def register(self, prefix, field, index): domain_or_all = self.definition.m_parent.m_get_annotations('domain', '__all__') prefix_and_dot = prefix + '.' if prefix is not None else '' @@ -135,8 +143,8 @@ class Search(Elastic): else: self.search_field = self.qualified_name - assert self.qualified_name not in search_quantities, 'Search quantities must have a unique name: %s' % self.name - search_quantities[self.qualified_name] = self + assert self.qualified_name not in search_quantities_by_index[index], 'Search quantities must have a unique name: %s' % self.name + search_quantities_by_index[index][self.qualified_name] = self if self.metric is not None: if self.metric_name is None: @@ -144,17 +152,17 @@ class Search(Elastic): else: self.metric_name = prefix_and_dot + self.metric_name - assert self.metric_name not in metrics, 'Metric names must be unique: %s' % self.metric_name - metrics[self.metric_name] = self + assert self.metric_name not in metrics_by_index[index], 'Metric names must be unique: %s' % self.metric_name + metrics_by_index[index][self.metric_name] = self if self.group is not None: self.group = prefix_and_dot + self.group - assert self.group not in groups, 'Groups must be unique' - groups[self.group] = self + assert self.group not in groups_by_index[index], 'Groups must be unique' + groups_by_index[index][self.group] = self if self.order_default: - assert order_default_quantities.get(domain_or_all) is None, 'Only 
one quantity can be the order default' - order_default_quantities[domain_or_all] = self + assert order_default_quantities_by_index[index].get(domain_or_all) is None, 'Only one quantity can be the order default' + order_default_quantities_by_index[index][domain_or_all] = self @property def argparse_action(self): diff --git a/nomad/processing/data.py b/nomad/processing/data.py index a8b2f28228917580b2d527dc5826b67017f3cb26..caa7ae248564a0c6998fae4f4048bac1aa14e05f 100644 --- a/nomad/processing/data.py +++ b/nomad/processing/data.py @@ -586,7 +586,6 @@ class Calc(Proc): self._read_metadata_from_file(logger) # persist the calc metadata - # add the calc metadata with utils.timer(logger, 'saved calc metadata', step='metadata'): self.apply_entry_metadata(self._entry_metadata) diff --git a/nomad/search.py b/nomad/search.py index e49f5aa9ce79721e9092ac541fedc805ec143af0..b2c46ba1e3396b82ef8e0a5ec5944e2f42600175 100644 --- a/nomad/search.py +++ b/nomad/search.py @@ -23,8 +23,13 @@ from elasticsearch.exceptions import NotFoundError from datetime import datetime import json +from nomad.datamodel.material import Material from nomad import config, datamodel, infrastructure, utils -from nomad.metainfo.search_extension import search_quantities, metrics, order_default_quantities +from nomad.metainfo.search_extension import search_quantities_by_index, metrics_by_index, order_default_quantities_by_index, groups_by_index +search_quantities = search_quantities_by_index[config.elastic.index_name] +groups = groups_by_index[config.elastic.index_name] +metrics = metrics_by_index[config.elastic.index_name] +order_default_quantities = order_default_quantities_by_index[config.elastic.index_name] path_analyzer = analyzer( @@ -45,6 +50,7 @@ class InvalidQuery(Exception): pass entry_document = datamodel.EntryMetadata.m_def.a_elastic.document +material_document = Material.m_def.a_elastic.document for domain in datamodel.domains: order_default_quantities.setdefault(domain, 
order_default_quantities.get('__all__')) diff --git a/tests/app/test_api.py b/tests/app/test_api.py index f95df5cd6b9569744dd1badceb71a71d31bd2a9f..512abcaf6c756f1498a4734f3826f1334855ee30 100644 --- a/tests/app/test_api.py +++ b/tests/app/test_api.py @@ -1061,7 +1061,7 @@ class TestRepo(): assert 'only_atoms' not in result assert 'dft.basis_set' in result - metrics_permutations = [[], search_extension.metrics] + [[metric] for metric in search_extension.metrics] + metrics_permutations = [[], search_extension.metrics_by_index[config.elastic.index_name]] + [[metric] for metric in search_extension.metrics_by_index[config.elastic.index_name]] def test_search_admin(self, api, example_elastic_calcs, no_warn, admin_user_auth): rv = api.get('/repo/?owner=admin', headers=admin_user_auth) diff --git a/tests/app/test_api_encyclopedia.py b/tests/app/test_api_encyclopedia.py index 4fcaf35bda602ace643f08d43300345a1b7035cb..7389f8b20d04269a30ad680f173fe1fffeb39722 100644 --- a/tests/app/test_api_encyclopedia.py +++ b/tests/app/test_api_encyclopedia.py @@ -11,41 +11,347 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import time +import pytest +import click.testing +import json +import elasticsearch +from nomad import config +from nomad.cli import cli +from nomad import processing as proc, infrastructure -def test_material(): - pass +from tests.app.test_app import BlueprintClient - # Unpublished material should not be found +silicon_id = "fh3UBjhUVm4nxzeRd2JJuqw5oXYa" - # Embargoed material should not be found - # Missing material causes 404 +def exists(value, expected_type=str, is_not=set(["", None])): + return type(value) == expected_type and value not in is_not - # Correctly found materials returns all required values +def validate_material(material): + assert exists(material["material_id"]) + assert exists(material["formula"]) + assert exists(material["formula_reduced"]) + material_type = material["material_type"] + assert exists(material_type) + if material_type == "bulk": + assert exists(material["material_name"]) + assert exists(material["has_free_wyckoff_parameters"], bool) + assert exists(material["strukturbericht_designation"]) + assert exists(material["bravais_lattice"]) + assert exists(material["crystal_system"]) + assert exists(material["point_group"]) + assert exists(material["space_group_number"], int) + assert exists(material["space_group_international_short_symbol"]) + assert exists(material["structure_type"]) + assert exists(material["structure_prototype"]) -def test_materials(): - pass - # Unpublished material should not be found +@pytest.fixture(scope='function') +def api(client): + return BlueprintClient(client, '/api/encyclopedia') - # Embargoed material should not be found - # Missing material causes 404 +def upload(filepath, publish, test_user_bravado_client, proc_infra, test_user_auth, api): - # Correctly found materials returns all required values + # Perform a test upload + arguments = ['client', 'upload', '--offline', '--name', filepath, filepath] + if publish: + arguments.append("--publish") + click.testing.CliRunner().invoke( + cli, + arguments, + 
catch_exceptions=False + ) + upload = proc.Upload.objects(name=filepath).first() + upload_id = upload["upload_id"] + upload.block_until_complete(interval=0.2) - # Exclusive formula works as expected + return upload_id - # Inclusive formula works as expected - # Exclusive elements works as expected +@pytest.fixture(scope='function') +def enc_upload(test_user_bravado_client, proc_infra, test_user_auth, api, mongo_infra): + upload('tests/data/api/enc_public.zip', True, test_user_bravado_client, proc_infra, test_user_auth, api) + upload('tests/data/api/enc_private_material.zip', False, test_user_bravado_client, proc_infra, test_user_auth, api) + upload('tests/data/api/enc_si_private.zip', False, test_user_bravado_client, proc_infra, test_user_auth, api) + upload_id = upload('tests/data/api/enc_si_embargo.zip', True, test_user_bravado_client, proc_infra, test_user_auth, api) - # Inclusive elements works as expected + # Place upload entries on embargo in MongoDB + calculations = mongo_infra["test_db"]['calc'] + calculations.update_many( + {'upload_id': upload_id}, + {'$set': {'metadata.with_embargo': True}} + ) - # Band gap + # Place upload entries on embargo in ES + embargoed = proc.Upload.get(upload_id) + with embargoed.entries_metadata(embargoed.metadata) as calcs: + def elastic_updates(): + for calc in calcs: + entry = calc.a_elastic.create_index_entry() + entry.with_embargo = True + entry = entry.to_dict(include_meta=True) + source = entry.pop('_source') + entry['doc'] = source + entry['_op_type'] = 'update' + entry['_index'] = 'nomad_fairdi_calcs_test' + yield entry - # Mass density + elasticsearch.helpers.bulk(infrastructure.elastic_client, elastic_updates()) + # The indices need to be refreshed after the update + infrastructure.elastic_client.indices.refresh(config.elastic.index_name) - # Property existence + # Index materials + click.testing.CliRunner().invoke( + cli, + ['admin', 'index-materials', "--source=es"], + catch_exceptions=False) + + # Small wait time 
in order for ES indexing to finish up + time.sleep(1) + + +class TestEncyclopedia(): + + def test_material(self, enc_upload, elastic_infra, api, test_user_auth): + # Correctly found material returns all required values. + rv = api.get('/materials/{}'.format(silicon_id)) + assert rv.status_code == 200 + material = rv.json + validate_material(material) + + # Missing material causes 404. + rv = api.get('/materials/does_not_exist') + assert rv.status_code == 404 + + # Empty search should return all visible materials + rv = api.post( + '/materials/', + data="{}", + content_type='application/json' + ) + assert rv.status_code == 200 + results = rv.json['results'] + assert len(results) == 2 + for material in results: + validate_material(material) + + # Test that searches across calculations work as intended + rv = api.post( + '/materials/', + data=json.dumps({ + "has_band_structure": True, + "has_dos": True, + }), + content_type='application/json' + ) + results = rv.json['results'] + assert len(results) == 1 + rv = api.post( + '/materials/', + data=json.dumps({ + "code_name": ["Quantum Espresso"], + "functional_type": ["GGA"], + "has_band_structure": True, + }), + content_type='application/json' + ) + results = rv.json['results'] + assert len(results) == 1 + + # Test that searches within calculations work as intended + rv = api.post( + '/materials/', + data=json.dumps({ + "search_by": { + "restricted": True, + }, + "has_band_structure": True, + "has_dos": True, + }), + content_type='application/json' + ) + results = rv.json['results'] + assert len(results) == 0 + rv = api.post( + '/materials/', + data=json.dumps({ + "search_by": { + "restricted": True, + }, + "code_name": ["Quantum Espresso"], + "functional_type": ["GGA"], + "has_band_structure": True, + }), + content_type='application/json' + ) + results = rv.json['results'] + assert len(results) == 0 + + # Test that EOS groups are found. 
+ rv = api.get('/materials/{}/groups'.format(silicon_id)) + assert rv.status_code == 200 + groups = rv.json + groups_eos = groups['groups_eos'] + assert len(groups_eos) == 1 + eos_id, group = list(groups_eos.items())[0] + exists(eos_id) + assert len(group) == 5 + for calc_id in group: + exists(calc_id) + + # Test that parameter variation groups are found. + rv = api.get('/materials/{}/groups'.format(silicon_id)) + assert rv.status_code == 200 + groups = rv.json + groups_par = groups["groups_par"] + assert len(groups_par) == 1 + par_id, group = list(groups_par.items())[0] + exists(par_id) + assert len(group) == 2 + for calc_id in group: + exists(calc_id) + + # Test query for a specific group. + rv = api.get('/materials/{}/groups/eos/{}'.format(silicon_id, eos_id)) + assert rv.status_code == 200 + group = rv.json + assert len(group['calculations']) == 5 + assert len(group['energies']) == 5 + assert len(group['volumes']) == 5 + rv = api.get('/materials/{}/groups/par/{}'.format(silicon_id, par_id)) + assert rv.status_code == 200 + group = rv.json + assert len(group['calculations']) == 2 + assert len(group['energies']) == 2 + assert len(group['volumes']) == 2 + + # Test suggestions + rv = api.get('/suggestions?property=structure_type') + assert rv.status_code == 200 + structure_types = rv.json + assert structure_types["structure_type"] == ["diamond"] + rv = api.get('/suggestions?property=code_name') + assert rv.status_code == 200 + code_names = rv.json + assert set(code_names["code_name"]) == set(["exciting", "Quantum Espresso", "VASP", "FHI-aims"]) + + # Test calculations: embargoed and private entries should not show up here + rv = api.get('/materials/{}/calculations'.format(silicon_id)) + assert rv.status_code == 200 + calculations = rv.json + assert calculations["total_results"] == 9 + assert len(calculations["results"]) == calculations["total_results"] + calc_ids = [x["calc_id"] for x in calculations["results"]] + for calc in calculations["results"]: + assert 
exists(calc["calc_id"]) + assert exists(calc["upload_id"]) + assert exists(calc["code_name"]) + assert exists(calc["code_version"]) + assert exists(calc["functional_type"]) + assert exists(calc["basis_set_type"]) + assert exists(calc["core_electron_treatment"]) + assert exists(calc["run_type"]) + assert exists(calc["has_dos"], bool) + assert exists(calc["has_band_structure"], bool) + assert exists(calc["has_thermal_properties"], bool) + + # Test statistics + rv = api.post( + '/materials/{}/statistics'.format(silicon_id), + data=json.dumps({ + "calculations": calc_ids, + "properties": [ + "cell_volume", + "atomic_density", + "mass_density", + "lattice_a", + "lattice_b", + "lattice_c", + "alpha", + "beta", + "gamma", + "band_gap", + ], + "n_histogram_bins": 3, + }), + content_type='application/json' + ) + assert rv.status_code == 200 + + # Test fetching information about a specific calculation + rv = api.post( + '/materials/{}/calculations/{}'.format(silicon_id, calc_ids[0]), + data=json.dumps({"properties": [ + "lattice_parameters", + "energies", + "mass_density", + "atomic_density", + "cell_volume", + "wyckoff_sets", + "idealized_structure", + "band_gap", + "electronic_band_structure", + "electronic_dos", + "phonon_band_structure", + "phonon_dos", + "thermodynamical_properties", + ]}), + content_type='application/json' + ) + assert rv.status_code == 200 + calc = rv.json + + # Test that completely private materials only become visible after + # authentication + rv = api.post( + '/materials/', + data=json.dumps({"search_by": {"element": "B"}}), + content_type='application/json', + ) + results = rv.json['results'] + assert len(results) == 0 + rv = api.post( + '/materials/', + data=json.dumps({"search_by": {"element": "B"}}), + content_type='application/json', + headers=test_user_auth, + ) + results = rv.json['results'] + assert len(results) == 1 + private_material = results[0] + private_material_id = private_material["material_id"] + rv = 
api.get('/materials/{}/calculations'.format(private_material_id), headers=test_user_auth) + private_calc_ids = [x["calc_id"] for x in rv.json["results"]] + for headers, code in [({}, 404), (test_user_auth, 200)]: + rv = api.get('/materials/{}'.format(private_material_id), headers=headers) + assert rv.status_code == code + rv = api.get('/materials/{}/groups'.format(private_material_id), headers=headers) + assert rv.status_code == code + rv = api.get('/materials/{}/calculations'.format(private_material_id), headers=headers) + assert rv.status_code == code + rv = api.post( + '/materials/{}/statistics'.format(private_material_id), + data=json.dumps({ + "calculations": private_calc_ids, + "properties": [ + "cell_volume", + "atomic_density", + "mass_density", + "lattice_a", + "lattice_b", + "lattice_c", + "alpha", + "beta", + "gamma", + "band_gap", + ], + "n_histogram_bins": 3, + }), + content_type='application/json', + headers=headers, + ) + assert rv.status_code == code diff --git a/tests/conftest.py b/tests/conftest.py index dbf6233493cdf011fa4097b6122192a843c41f1e..16e17a8ac0d9767ae2b8373b0f03948376c8d67a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -180,21 +180,25 @@ def mongo(mongo_infra): @pytest.fixture(scope='session') def elastic_infra(monkeysession): ''' Provides elastic infrastructure to the session ''' - monkeysession.setattr('nomad.config.elastic.index_name', 'nomad_fairdi_test') + monkeysession.setattr('nomad.config.elastic.index_name', 'nomad_fairdi_calcs_test') + monkeysession.setattr('nomad.config.elastic.materials_index_name', 'nomad_fairdi_materials_test') try: return infrastructure.setup_elastic() except Exception: # try to delete index, error might be caused by changed mapping from elasticsearch_dsl import connections connections.create_connection(hosts=['%s:%d' % (config.elastic.host, config.elastic.port)]) \ - .indices.delete(index='nomad_fairdi_test') + .indices.delete(index='nomad_fairdi_calcs_test') return 
infrastructure.setup_elastic() def clear_elastic(elastic): try: elastic.delete_by_query( - index='nomad_fairdi_test', body=dict(query=dict(match_all={})), + index='nomad_fairdi_calcs_test', body=dict(query=dict(match_all={})), + wait_for_completion=True, refresh=True) + elastic.delete_by_query( + index='nomad_fairdi_materials_test', body=dict(query=dict(match_all={})), wait_for_completion=True, refresh=True) except elasticsearch.exceptions.NotFoundError: # it is unclear why this happens, but it happens at least once, when all tests diff --git a/tests/data/api/enc_private_material.zip b/tests/data/api/enc_private_material.zip new file mode 100644 index 0000000000000000000000000000000000000000..bcf282766627e18f5eb63bcd6e0efca972040a22 Binary files /dev/null and b/tests/data/api/enc_private_material.zip differ diff --git a/tests/data/api/enc_public.zip b/tests/data/api/enc_public.zip new file mode 100644 index 0000000000000000000000000000000000000000..24173533e9d04880edf92a0cf06ab94f09bf5007 Binary files /dev/null and b/tests/data/api/enc_public.zip differ diff --git a/tests/data/api/enc_si_embargo.zip b/tests/data/api/enc_si_embargo.zip new file mode 100644 index 0000000000000000000000000000000000000000..9fff3960b8571fa0ed20df2f397ea3982db425e0 Binary files /dev/null and b/tests/data/api/enc_si_embargo.zip differ diff --git a/tests/data/api/enc_si_private.zip b/tests/data/api/enc_si_private.zip new file mode 100644 index 0000000000000000000000000000000000000000..e4b90301de269d16295fea507981146c0ce6ffae Binary files /dev/null and b/tests/data/api/enc_si_private.zip differ diff --git a/tests/data/normalizers/phonons.zip b/tests/data/normalizers/phonons.zip new file mode 100644 index 0000000000000000000000000000000000000000..9a422d1380e0161037a5db566cc8d9c54437529c Binary files /dev/null and b/tests/data/normalizers/phonons.zip differ diff --git a/tests/normalizing/test_encyclopedia.py b/tests/normalizing/test_encyclopedia.py index 
1e3c3fb5f3d4a2a31eacc4707734343e716c746b..c32b28f1762169270e2414e5f51e4f6a0b28e229 100644 --- a/tests/normalizing/test_encyclopedia.py +++ b/tests/normalizing/test_encyclopedia.py @@ -19,12 +19,14 @@ import ase.build from matid.symmetry.wyckoffset import WyckoffSet from nomad.utils import hash +from nomad.files import UploadFiles from nomad import atomutils from nomad.datamodel import EntryArchive from nomad.datamodel.encyclopedia import ( Calculation, EncyclopediaMetadata, ) +from tests.processing.test_data import run_processing from tests.normalizing.conftest import ( # pylint: disable=unused-import run_normalize_for_structure, geometry_optimization, @@ -509,20 +511,45 @@ def test_electronic_bands(bands_unpolarized_no_gap, bands_polarized_no_gap, band generaltests(band_path_cF_nonstandard.section_metadata.encyclopedia.properties.electronic_band_structure) -def test_phonon(phonon: EntryArchive): +def test_phonon(test_user, proc_infra): """Tests that phonon calculations are correctly processed. 
""" + # Process a phonon calculation + upload = run_processing(("phonon_id", "tests/data/normalizers/phonons.zip"), test_user) + + # Read the resulting archive + upload_id = upload.upload_id + calcs = upload.calcs() + phonon_id = None + for calc in calcs: + if calc.parser == "parsers/phonopy": + phonon_id = calc.calc_id + break + upload_file = UploadFiles.get(upload_id, is_authorized=lambda: True) + archive_reader = upload_file.read_archive(phonon_id) + phonon_archive = archive_reader[phonon_id].to_dict() + phonon = EntryArchive.m_from_dict(phonon_archive) + enc = phonon.section_metadata.encyclopedia calc_type = enc.calculation.calculation_type status = enc.status prop = enc.properties + method = enc.method band = prop.phonon_band_structure dos = prop.phonon_dos thermo_props = prop.thermodynamical_properties assert calc_type == Calculation.calculation_type.type.phonon_calculation + assert status == EncyclopediaMetadata.status.type.success - # The method information is filled after the whole upload has been processed. 
- assert status == EncyclopediaMetadata.status.type.unsupported_method_type + # There should be a reference to the external calculation + assert phonon.section_run[0].section_single_configuration_calculation[0].section_calculation_to_calculation_refs[0].calculation_to_calculation_external_url is not None + + # The method information should have been read from the referenced + # calculation + assert method.method_type == "DFT" + assert method.core_electron_treatment == "full all electron" + assert method.functional_type == "LDA" + assert method.functional_long_name == "LDA_C_PW+LDA_X" # Check dos assert dos is not None diff --git a/tests/test_archive.py b/tests/test_archive.py index da553abe4e2d0f6f7cf547d2811a51dcb816e133..bcb67bd2bac6067d39b62128c040005bc1c010a6 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -339,7 +339,7 @@ def archive(): def assert_partial_archive(archive: EntryArchive) -> EntryArchive: # test contents assert archive.section_workflow.calculation_result_ref is not None - assert archive.section_metadata.encyclopedia is None + assert archive.section_metadata.encyclopedia is not None # test refs assert archive.section_workflow.calculation_result_ref.energy_total is not None assert len(archive.section_workflow.calculation_result_ref.section_eigenvalues) == 0 diff --git a/tests/test_search.py b/tests/test_search.py index 194ccf25ebb2a4e69e0d5202b0ddc632fa0bb2fd..f416319c084bfd0f7c5aafd179eaff40d18b6c9e 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -19,7 +19,6 @@ from datetime import datetime from nomad import datamodel, search, processing, infrastructure, config from nomad.search import entry_document, SearchRequest -from nomad.metainfo import search_extension def test_init_mapping(elastic): @@ -185,11 +184,11 @@ def assert_metrics(container, metrics_names): def test_search_statistics(elastic, example_search_data): - assert 'authors' in search_extension.metrics.keys() - assert 'datasets' in 
search_extension.metrics.keys() - assert 'unique_entries' in search_extension.metrics.keys() + assert 'authors' in search.metrics.keys() + assert 'datasets' in search.metrics.keys() + assert 'unique_entries' in search.metrics.keys() - use_metrics = search_extension.metrics.keys() + use_metrics = search.metrics.keys() request = SearchRequest(domain='dft').statistic( 'dft.system', size=10, metrics_to_use=use_metrics).date_histogram(metrics_to_use=use_metrics) @@ -227,7 +226,7 @@ def test_global_statistics(elastic, example_search_data): def test_search_totals(elastic, example_search_data): - use_metrics = search_extension.metrics.keys() + use_metrics = search.metrics.keys() request = SearchRequest(domain='dft').totals(metrics_to_use=use_metrics) results = request.execute()