Encyclopedia api

Merged Lauri Himanen requested to merge encyclopedia-api into v0.8.1
2 files changed: +139 additions, -77 deletions
@@ -16,6 +16,7 @@
The encyclopedia API of the nomad@FAIRDI APIs.
"""
import re
import math
import numpy as np
from flask_restplus import Resource, abort, fields, marshal
@@ -80,6 +81,17 @@ def get_es_doc_values(es_doc, mapping, keys=None):
return result
def get_enc_filter():
"""Returns a shared term filter that will leave out unpublished, embargoed
or invalid entries.
"""
return [
Q("term", published=True),
Q("term", with_embargo=False),
Q("term", encyclopedia__status="success"),
]
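
For reference, a minimal usage sketch of the helper above (a hedged illustration: Q and Search are the elasticsearch_dsl classes imported in this module, config.elastic.index_name is the index used throughout, and "some_material_id" is a placeholder):

from elasticsearch_dsl import Q, Search

# Combine the shared visibility filter with an additional exact-match
# clause; every clause runs in filter context, so no scoring is done.
query = Q(
    "bool",
    filter=get_enc_filter() + [
        Q("term", encyclopedia__material__material_id="some_material_id"),
    ],
)
s = Search(index=config.elastic.index_name).query(query)
response = s.execute()
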
material_query = api.parser()
material_query.add_argument(
"property",
@@ -107,11 +119,6 @@ material_result = api.model("material_result", {
"structure_prototype": fields.String,
"structure_type": fields.String,
})
enc_filter = [
Q("term", published=True),
Q("term", with_embargo=False),
Q("term", encyclopedia__status="success"),
]
@ns.route("/materials/<string:material_id>")
@@ -122,7 +129,8 @@ class EncMaterialResource(Resource):
@api.expect(material_query)
@api.marshal_with(material_result, skip_none=True)
def get(self, material_id):
"""Used to retrive basic information related to the specified material.
"""Used to retrieve basic information related to the specified
material.
"""
# Parse request arguments
args = material_query.parse_args()
@@ -138,24 +146,18 @@ class EncMaterialResource(Resource):
# information from there. In principle all other entries should have
# the same information.
s = Search(index=config.elastic.index_name)
# Since we are looking for an exact match, we use filter context
# together with term search for speed (instead of query context and
# match search)
query = Q(
"bool",
filter=enc_filter + [
filter=get_enc_filter() + [
Q("term", encyclopedia__material__material_id=material_id),
]
)
s = s.query(query)
# If a representative calculation is requested, all calculations are
# returned in order to perform the scoring with a custom loop.
# Otherwise, only one representative entry is returned.
# Only one representative entry is returned by collapsing the results.
s = s.extra(**{
"_source": {"includes": es_keys},
"size": 10000,
"size": 1,
"collapse": {"field": "encyclopedia.material.material_id"},
})
response = s.execute()
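
To spell out the filter-context remark above, a small hedged sketch contrasting the two ways of matching an exact identifier (field names mirror the ones in this file; neither snippet is part of the diff itself):

from elasticsearch_dsl import Q

# Query context: a "match" query analyzes the input and computes a
# relevance score, which is unnecessary work for an exact identifier.
scored = Q("match", **{"encyclopedia.material.material_id": material_id})

# Filter context: "term" clauses inside bool/filter are unscored and can
# be cached by Elasticsearch, so exact matches are resolved faster.
exact = Q(
    "bool",
    filter=get_enc_filter() + [
        Q("term", encyclopedia__material__material_id=material_id),
    ],
)
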
@@ -237,7 +239,7 @@ class EncMaterialsResource(Resource):
except Exception as e:
abort(400, message=str(e))
filters = enc_filter
filters = get_enc_filter()
must_nots = []
musts = []
@@ -366,61 +368,121 @@ class EncMaterialsResource(Resource):
s = Search(index=config.elastic.index_name)
s = s.query(bool_query)
# The materials are grouped by using three aggregations:
# "Composite" to enable scrolling, "Terms" to enable selecting
# by material_id and "Top Hits" to fetch a single
# representative material document. Unnecessary fields are
# filtered to reduce data transfer.
terms_agg = A("terms", field="encyclopedia.material.material_id")
composite_kwargs = {"sources": {"materials": terms_agg}, "size": per_page}
# The number of matched materials is only requested on the first
# search, not for each page.
if after is not None:
composite_kwargs["after"] = after
else:
# 1: The paginated approach: No way to know the amount of materials,
# but can return aggregation results in a quick fashion including
# the number of calculation entries per material.
mode = "collapse"
if mode == "aggregation":
# The materials are grouped by using three aggregations:
# "Composite" to enable scrolling, "Terms" to enable selecting
# by material_id and "Top Hits" to fetch a single
# representative material document. Unnecessary fields are
# filtered to reduce data transfer.
terms_agg = A("terms", field="encyclopedia.material.material_id")
composite_kwargs = {"sources": {"materials": terms_agg}, "size": per_page}
# The number of matched materials is only requested on the first
# search, not for each page.
if after is not None:
composite_kwargs["after"] = after
else:
cardinality_agg = A("cardinality", field="encyclopedia.material.material_id", precision_threshold=1000)
s.aggs.metric("n_materials", cardinality_agg)
composite_agg = A("composite", **composite_kwargs)
composite_agg.metric("representative", A(
"top_hits",
size=1,
_source={"includes": list(material_prop_map.values())},
))
s.aggs.bucket("materials", composite_agg)
# We ignore the top level hits and sort by reduced material formula.
s = s.extra(**{
"size": 0,
})
response = s.execute()
materials = response.aggs.materials.buckets
if len(materials) == 0:
abort(404, message="No materials found for the given search criteria or pagination.")
after_new = response.aggs.materials["after_key"]
# Gather results from aggregations
result_list = []
materials = response.aggs.materials.buckets
keys = list(material_prop_map.keys())
for material in materials:
representative = material["representative"][0]
mat_dict = get_es_doc_values(representative, material_prop_map, keys)
mat_dict["n_matches"] = material.doc_count
result_list.append(mat_dict)
# Page information is incomplete for aggregations
pages = {
"page": page,
"per_page": per_page,
"after": after_new,
}
if after is None:
n_materials = response.aggs.n_materials.value
pages["total"] = n_materials
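
For clarity, a condensed sketch of the three-aggregation pattern used by the aggregation branch (composite for after_key scrolling, a terms source for grouping by material, top_hits for one representative document); this is an illustrative outline rather than the exact code above:

from elasticsearch_dsl import A, Search

s = Search(index=config.elastic.index_name).query(bool_query)

# Composite aggregation over material_id buckets; passing the previous
# page's after_key as "after" resumes scrolling where it left off.
terms_source = A("terms", field="encyclopedia.material.material_id")
composite = A("composite", sources={"materials": terms_source}, size=per_page)
composite.metric("representative", A(
    "top_hits",
    size=1,
    _source={"includes": list(material_prop_map.values())},
))
s.aggs.bucket("materials", composite)
s = s.extra(size=0)  # only aggregation results are needed, not top-level hits

response = s.execute()
buckets = response.aggs.materials.buckets
next_after = response.aggs.materials["after_key"]  # send back as "after" for the next page
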
# 2. Collapse approach. Quickly provides a list of materials
# corresponding to the query, offers full pagination, doesn't include
# the number of matches per material.
elif mode == "collapse":
s = Search(index=config.elastic.index_name)
s = s.query(bool_query)
# Add cardinality aggregation that gives out the total number of materials
cardinality_agg = A("cardinality", field="encyclopedia.material.material_id", precision_threshold=1000)
s.aggs.metric("n_materials", cardinality_agg)
composite_agg = A("composite", **composite_kwargs)
composite_agg.metric("representative", A(
"top_hits",
size=1,
_source={"includes": list(material_prop_map.values())},
))
s.aggs.bucket("materials", composite_agg)
# We ignore the top level hits
s = s.extra(**{
"size": 0,
})
response = s.execute()
materials = response.aggs.materials.buckets
if len(materials) == 0:
abort(404, message="No materials found for the given search criteria or pagination.")
after_new = response.aggs.materials["after_key"]
# Gather results from aggregations
result_list = []
materials = response.aggs.materials.buckets
keys = list(material_prop_map.keys())
for material in materials:
representative = material["representative"][0]
mat_dict = get_es_doc_values(representative, material_prop_map, keys)
mat_dict["n_matches"] = material.doc_count
result_list.append(mat_dict)
# Page information is incomplete for aggregations
pages = {
"page": page,
"per_page": per_page,
"after": after_new,
}
s = s.extra(**{
"collapse": {"field": "encyclopedia.material.material_id"},
"size": per_page,
"from": (page - 1) * per_page,
"sort": [{"encyclopedia.material.formula_reduced": {"order": "asc"}}],
"explain": True,
})
# Execute query
response = s.execute()
# No matches
if len(response) == 0:
abort(404, message="No materials found for the given search criteria or pagination.")
# Gather number of entries per material with a separate query
material_ids = [x.encyclopedia.material.material_id for x in response]
s = Search(index=config.elastic.index_name)
bool_query = Q(
"bool",
filter=Q("terms", encyclopedia__material__material_id=material_ids),
)
s2 = s.query(bool_query)
s2.aggs.bucket("n_matches", A("terms", field="encyclopedia.material.material_id"))
response2 = s2.execute()
matmap = {x.key: x.doc_count for x in response2.aggs.n_matches}
if after is None:
n_materials = response.aggs.n_materials.value
pages["total"] = n_materials
# Loop over materials
result_list = []
keys = list(material_prop_map.keys())
for material in response:
# Get values from the collapsed doc
mat_result = get_es_doc_values(material, material_prop_map, keys)
mat_id = material.encyclopedia.material.material_id
mat_result["n_matches"] = matmap[mat_id]
result_list.append(mat_result)
# Full page information available for collapse
pages = {
"page": page,
"per_page": per_page,
"pages": math.ceil(response.hits.total / per_page),
"total": response.aggs.n_materials.value,
}
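
As a side note, a hedged sketch of the two extras the collapse branch needs because collapsed hits carry no bucket counts: a cardinality aggregation for the total number of materials and a follow-up terms aggregation that counts entries per returned material (identifiers mirror the surrounding code):

from elasticsearch_dsl import A, Q, Search
import math

# Before executing the collapsed search, attach a cardinality metric that
# approximates the number of distinct materials for the pagination info.
s.aggs.metric("n_materials", A(
    "cardinality",
    field="encyclopedia.material.material_id",
    precision_threshold=1000,
))
response = s.execute()

# Collapse returns one hit per material but no bucket counts, so the number
# of calculation entries per material comes from a second terms aggregation.
material_ids = [hit.encyclopedia.material.material_id for hit in response]
s2 = Search(index=config.elastic.index_name).query(
    Q("bool", filter=Q("terms", encyclopedia__material__material_id=material_ids))
)
s2.aggs.bucket("n_matches", A("terms", field="encyclopedia.material.material_id"))
counts = {b.key: b.doc_count for b in s2.execute().aggs.n_matches.buckets}

n_pages = math.ceil(response.hits.total / per_page)
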
result = {
"results": result_list,
@@ -450,7 +512,7 @@ class EncGroupsResource(Resource):
# variation hashes set.
bool_query = Q(
"bool",
filter=enc_filter + [Q("term", encyclopedia__material__material_id=material_id)],
filter=get_enc_filter() + [Q("term", encyclopedia__material__material_id=material_id)],
must=[
Q("exists", field="encyclopedia.properties.energies.energy_total"),
Q("exists", field="encyclopedia.material.idealized_structure.cell_volume"),
@@ -534,7 +596,7 @@ class EncGroupResource(Resource):
bool_query = Q(
"bool",
filter=enc_filter + [
filter=get_enc_filter() + [
Q("term", encyclopedia__material__material_id=material_id),
Q("term", **{group_id_source: group_id}),
],
@@ -617,7 +679,7 @@ class EncSuggestionsResource(Resource):
s = Search(index=config.elastic.index_name)
query = Q(
"bool",
filter=enc_filter
filter=get_enc_filter()
)
s = s.query(query)
s = s.extra(**{
@@ -690,7 +752,7 @@ class EncCalculationsResource(Resource):
s = Search(index=config.elastic.index_name)
query = Q(
"bool",
filter=enc_filter + [
filter=get_enc_filter() + [
Q("term", encyclopedia__material__material_id=material_id),
]
)
@@ -841,7 +903,7 @@ class EncStatisticsResource(Resource):
# Find entries for the given material.
bool_query = Q(
"bool",
filter=enc_filter + [
filter=get_enc_filter() + [
Q("term", encyclopedia__material__material_id=material_id),
Q("terms", calc_id=data["calculations"]),
]
@@ -1037,7 +1099,7 @@ class EncCalculationResource(Resource):
s = Search(index=config.elastic.index_name)
query = Q(
"bool",
filter=enc_filter + [
filter=get_enc_filter() + [
Q("term", encyclopedia__material__material_id=material_id),
Q("term", calc_id=calc_id),
]
@@ -1133,8 +1195,8 @@ class EncCalculationResource(Resource):
# Pre-calculate k-path length to be used as x-coordinate in
# plots. If the VBM and CBM information is needed later, it
# can be added as indices along the path. The exact
# k-points and occupations are removed to save band width.
# can be added as indices along the path. The exact k-points
# and occupations are removed to save some bandwidth.
if key == "electronic_band_structure" or key == "phonon_band_structure":
segments = value["section_k_band_segment"]
k_path_length = 0
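
To make the pre-calculation concrete, a hedged numpy sketch of accumulating the k-path length across band segments; the band_k_points field and the k_path_distances output key are assumptions for illustration, not names taken from this diff:

import numpy as np

k_path_length = 0
for segment in segments:
    # Distances between consecutive k-points within one segment.
    k_points = np.asarray(segment["band_k_points"])  # assumed field name
    steps = np.linalg.norm(np.diff(k_points, axis=0), axis=1)
    # Cumulative distance along the path, offset by the length of the
    # previous segments so the plot x-axis is continuous.
    distances = k_path_length + np.concatenate(([0.0], np.cumsum(steps)))
    k_path_length = distances[-1]
    segment["k_path_distances"] = distances.tolist()  # hypothetical output key
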