Commit 4ec58a63 authored by Lauri Himanen's avatar Lauri Himanen
Browse files

Refactored the Encyclopedia API. The complex aggregations are now prefiltered...

Refactored the Encyclopedia API. The complex aggregations are now prefiltered corretly according to the user query and the maximum result set size has been lowered.
parent c8d797eb
Pipeline #82940 passed with stages
in 32 minutes and 3 seconds
...@@ -300,79 +300,15 @@ class EncMaterialsResource(Resource): ...@@ -300,79 +300,15 @@ class EncMaterialsResource(Resource):
except Exception as e: except Exception as e:
abort(400, message=str(e)) abort(400, message=str(e))
# The queries that correspond to AND queries typically need to access def add_terms_filter(filters, source, target, query_type="terms"):
# multiple calculations at once to find the material ids that """For adding terms filters
# correspond to the query. To implement this behaviour we need to run """
# an initial aggregation that checks that the requested properties are
# present for a material. This is a a very crude solution that does not
# scale to complex queries, but I'm not sure we can do much better
# until we have a separate index for materials.
property_map = {
"has_thermal_properties": "encyclopedia.properties.thermodynamical_properties",
"has_band_structure": "encyclopedia.properties.electronic_band_structure",
"has_dos": "encyclopedia.properties.electronic_dos",
"has_fermi_surface": "encyclopedia.properties.fermi_surface",
}
requested_properties = []
# The size is set very large because all the results need to be
# returned. We cannot get the results in a paginated way with composite
# aggregation, because pipeline aggregations are not compatible with
# them.
agg_parent = A("terms", field="encyclopedia.material.material_id", size=5000000)
for key, value in property_map.items():
if data[key] is True:
agg = A("filter", exists={"field": value})
agg_parent.bucket(key, agg)
requested_properties.append(key)
if len(requested_properties) > 1:
bool_query = Q(
"bool",
filter=get_enc_filter(),
)
s = Search(index=config.elastic.index_name)
s = s.query(bool_query)
s.aggs.bucket("materials", agg_parent)
buckets_path = {x: "{}._count".format(x) for x in requested_properties}
script = " && ".join(["params.{} > 0".format(x) for x in requested_properties])
agg_parent.pipeline("selector", A(
"bucket_selector",
buckets_path=buckets_path,
script=script,
))
s = s.extra(**{
"size": 0,
})
response = s.execute()
material_ids = [x["key"] for x in response.aggs.materials.buckets]
if len(material_ids) == 0:
abort(404, message="No materials found for the given search criteria or pagination.")
# After finding the material ids that fill the AND conditions, continue
# with a simple OR query.
filters = get_enc_filter()
must_nots = []
musts = []
def add_terms_filter(source, target, query_type="terms"):
if data[source] is not None: if data[source] is not None:
filters.append(Q(query_type, **{target: data[source]})) filters.append(Q(query_type, **{target: data[source]}))
if len(requested_properties) > 1: def add_exists_filter(filters, must_nots, source, target):
filters.append(Q("terms", encyclopedia__material__material_id=material_ids)) """For adding exists filters
add_terms_filter("material_name", "encyclopedia.material.material_name") """
add_terms_filter("structure_type", "encyclopedia.material.bulk.structure_type")
add_terms_filter("space_group_number", "encyclopedia.material.bulk.space_group_number")
add_terms_filter("system_type", "encyclopedia.material.material_type")
add_terms_filter("crystal_system", "encyclopedia.material.bulk.crystal_system")
add_terms_filter("band_gap_direct", "encyclopedia.properties.band_gap_direct", query_type="term")
add_terms_filter("functional_type", "encyclopedia.method.functional_type")
add_terms_filter("basis_set_type", "dft.basis_set")
add_terms_filter("code_name", "dft.code_name")
# Add exists filters if only one property was requested. The initial
# aggregation will handlei multiple simultaneous properties.
def add_exists_filter(source, target):
param = data[source] param = data[source]
if param is not None: if param is not None:
query = Q("exists", field=target) query = Q("exists", field=target)
...@@ -380,12 +316,10 @@ class EncMaterialsResource(Resource): ...@@ -380,12 +316,10 @@ class EncMaterialsResource(Resource):
filters.append(query) filters.append(query)
elif param is False: elif param is False:
must_nots.append(query) must_nots.append(query)
if len(requested_properties) == 1:
prop_name = requested_properties[0]
add_exists_filter(prop_name, property_map[prop_name])
# Add range filters def add_range_filter(filters, source, target, source_unit=None, target_unit=None):
def add_range_filter(source, target, source_unit=None, target_unit=None): """For adding range filters
"""
param = data[source] param = data[source]
query_dict = {} query_dict = {}
if param["min"] is not None: if param["min"] is not None:
...@@ -404,8 +338,27 @@ class EncMaterialsResource(Resource): ...@@ -404,8 +338,27 @@ class EncMaterialsResource(Resource):
query = Q("range", **{target: query_dict}) query = Q("range", **{target: query_dict})
filters.append(query) filters.append(query)
add_range_filter("band_gap", "encyclopedia.properties.band_gap", ureg.eV, ureg.J) property_map = {
add_range_filter("mass_density", "encyclopedia.properties.mass_density") "has_thermal_properties": "encyclopedia.properties.thermodynamical_properties",
"has_band_structure": "encyclopedia.properties.electronic_band_structure",
"has_dos": "encyclopedia.properties.electronic_dos",
"has_fermi_surface": "encyclopedia.properties.fermi_surface",
}
requested_properties = []
filters = get_enc_filter()
must_nots = []
musts = []
add_terms_filter(filters, "material_name", "encyclopedia.material.material_name")
add_terms_filter(filters, "structure_type", "encyclopedia.material.bulk.structure_type")
add_terms_filter(filters, "space_group_number", "encyclopedia.material.bulk.space_group_number")
add_terms_filter(filters, "system_type", "encyclopedia.material.material_type")
add_terms_filter(filters, "crystal_system", "encyclopedia.material.bulk.crystal_system")
add_terms_filter(filters, "band_gap_direct", "encyclopedia.properties.band_gap_direct", query_type="term")
add_terms_filter(filters, "functional_type", "encyclopedia.method.functional_type")
add_terms_filter(filters, "basis_set_type", "dft.basis_set")
add_terms_filter(filters, "code_name", "dft.code_name")
add_range_filter(filters, "band_gap", "encyclopedia.properties.band_gap", ureg.eV, ureg.J)
add_range_filter(filters, "mass_density", "encyclopedia.properties.mass_density")
# Create query for elements or formula # Create query for elements or formula
search_by = data["search_by"] search_by = data["search_by"]
...@@ -466,6 +419,69 @@ class EncMaterialsResource(Resource): ...@@ -466,6 +419,69 @@ class EncMaterialsResource(Resource):
encyclopedia__material__species={"query": query_string, "operator": "and"} encyclopedia__material__species={"query": query_string, "operator": "and"}
)) ))
# The queries that correspond to AND queries typically need to access
# multiple calculations at once to find the material ids that
# correspond to the query. To implement this behaviour we need to run
# an initial aggregation that checks that the requested properties are
# present for a material. This is a a very crude solution that does not
# scale to complex queries, but I'm not sure we can do much better
# until we have a separate index for materials. The size is set very
# large because all the results need to be returned. We cannot get the
# results in a paginated way with composite aggregation, because
# pipeline aggregations are not compatible with them.
agg_parent = A("terms", field="encyclopedia.material.material_id", size=500000)
for key, value in property_map.items():
if data[key] is True:
agg = A("filter", exists={"field": value})
agg_parent.bucket(key, agg)
requested_properties.append(key)
if len(requested_properties) > 1:
# First we setup a boolean filter query that filters for of the
# requested properties. This will reduce the size of the initial
# set on top of which the more expensive aggregation stack is run
# on.
bool_query = Q(
"bool",
filter=filters,
must_not=must_nots,
must=musts,
should=[Q("exists", field=property_map[x]) for x in requested_properties],
minimum_should_match=1, # At least one of the should query must match
)
s = Search(index=config.elastic.index_name)
s = s.query(bool_query)
# The remaining requested properties have to be queried as a nested
# aggregation.
s.aggs.bucket("materials", agg_parent)
buckets_path = {x: "{}._count".format(x) for x in requested_properties}
script = " && ".join(["params.{} > 0".format(x) for x in requested_properties])
agg_parent.pipeline("selector", A(
"bucket_selector",
buckets_path=buckets_path,
script=script,
))
s = s.extra(**{
"size": 0,
})
response = s.execute()
material_ids = [x["key"] for x in response.aggs.materials.buckets]
if len(material_ids) == 0:
abort(404, message="No materials found for the given search criteria or pagination.")
# Add pre-selected material ids if multiple exists filters were
# requested. These IDs are already filtered based on the user query so
# none of the other search terms need be used.
if len(requested_properties) > 1:
must_nots = []
musts = []
filters = []
filters.append(Q("terms", encyclopedia__material__material_id=material_ids))
if len(requested_properties) == 1:
prop_name = requested_properties[0]
add_exists_filter(filters, must_nots, prop_name, property_map[prop_name])
# The top query filters out entries based on the user query
page = search_by["page"] page = search_by["page"]
per_page = search_by["per_page"] per_page = search_by["per_page"]
after = search_by["after"] after = search_by["after"]
...@@ -475,8 +491,6 @@ class EncMaterialsResource(Resource): ...@@ -475,8 +491,6 @@ class EncMaterialsResource(Resource):
must_not=must_nots, must_not=must_nots,
must=musts, must=musts,
) )
# The top query filters out entries based on the user query
s = Search(index=config.elastic.index_name) s = Search(index=config.elastic.index_name)
s = s.query(bool_query) s = s.query(bool_query)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment