Commit 18940633 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Merge branch 'aggregation-improvement' into 'v0.9.0'

Refactored the Encyclopedia API. The complex aggregations are now prefiltered correctly according to the user query and the maximum result set size has been lowered.

See merge request !183
parents c8d797eb 4ec58a63
Pipeline #82958 passed with stages
in 17 minutes and 17 seconds
......@@ -300,79 +300,15 @@ class EncMaterialsResource(Resource):
except Exception as e:
abort(400, message=str(e))
# The queries that correspond to AND queries typically need to access
# multiple calculations at once to find the material ids that
# correspond to the query. To implement this behaviour we need to run
# an initial aggregation that checks that the requested properties are
# present for a material. This is a a very crude solution that does not
# scale to complex queries, but I'm not sure we can do much better
# until we have a separate index for materials.
property_map = {
"has_thermal_properties": "encyclopedia.properties.thermodynamical_properties",
"has_band_structure": "encyclopedia.properties.electronic_band_structure",
"has_dos": "encyclopedia.properties.electronic_dos",
"has_fermi_surface": "encyclopedia.properties.fermi_surface",
}
requested_properties = []
# The size is set very large because all the results need to be
# returned. We cannot get the results in a paginated way with composite
# aggregation, because pipeline aggregations are not compatible with
# them.
agg_parent = A("terms", field="encyclopedia.material.material_id", size=5000000)
for key, value in property_map.items():
if data[key] is True:
agg = A("filter", exists={"field": value})
agg_parent.bucket(key, agg)
requested_properties.append(key)
if len(requested_properties) > 1:
bool_query = Q(
"bool",
filter=get_enc_filter(),
)
s = Search(index=config.elastic.index_name)
s = s.query(bool_query)
s.aggs.bucket("materials", agg_parent)
buckets_path = {x: "{}._count".format(x) for x in requested_properties}
script = " && ".join(["params.{} > 0".format(x) for x in requested_properties])
agg_parent.pipeline("selector", A(
"bucket_selector",
buckets_path=buckets_path,
script=script,
))
s = s.extra(**{
"size": 0,
})
response = s.execute()
material_ids = [x["key"] for x in response.aggs.materials.buckets]
if len(material_ids) == 0:
abort(404, message="No materials found for the given search criteria or pagination.")
# After finding the material ids that fill the AND conditions, continue
# with a simple OR query.
filters = get_enc_filter()
must_nots = []
musts = []
def add_terms_filter(source, target, query_type="terms"):
def add_terms_filter(filters, source, target, query_type="terms"):
"""For adding terms filters
"""
if data[source] is not None:
filters.append(Q(query_type, **{target: data[source]}))
if len(requested_properties) > 1:
filters.append(Q("terms", encyclopedia__material__material_id=material_ids))
add_terms_filter("material_name", "encyclopedia.material.material_name")
add_terms_filter("structure_type", "encyclopedia.material.bulk.structure_type")
add_terms_filter("space_group_number", "encyclopedia.material.bulk.space_group_number")
add_terms_filter("system_type", "encyclopedia.material.material_type")
add_terms_filter("crystal_system", "encyclopedia.material.bulk.crystal_system")
add_terms_filter("band_gap_direct", "encyclopedia.properties.band_gap_direct", query_type="term")
add_terms_filter("functional_type", "encyclopedia.method.functional_type")
add_terms_filter("basis_set_type", "dft.basis_set")
add_terms_filter("code_name", "dft.code_name")
# Add exists filters if only one property was requested. The initial
# aggregation will handlei multiple simultaneous properties.
def add_exists_filter(source, target):
def add_exists_filter(filters, must_nots, source, target):
"""For adding exists filters
"""
param = data[source]
if param is not None:
query = Q("exists", field=target)
......@@ -380,12 +316,10 @@ class EncMaterialsResource(Resource):
filters.append(query)
elif param is False:
must_nots.append(query)
if len(requested_properties) == 1:
prop_name = requested_properties[0]
add_exists_filter(prop_name, property_map[prop_name])
# Add range filters
def add_range_filter(source, target, source_unit=None, target_unit=None):
def add_range_filter(filters, source, target, source_unit=None, target_unit=None):
"""For adding range filters
"""
param = data[source]
query_dict = {}
if param["min"] is not None:
......@@ -404,8 +338,27 @@ class EncMaterialsResource(Resource):
query = Q("range", **{target: query_dict})
filters.append(query)
add_range_filter("band_gap", "encyclopedia.properties.band_gap", ureg.eV, ureg.J)
add_range_filter("mass_density", "encyclopedia.properties.mass_density")
property_map = {
"has_thermal_properties": "encyclopedia.properties.thermodynamical_properties",
"has_band_structure": "encyclopedia.properties.electronic_band_structure",
"has_dos": "encyclopedia.properties.electronic_dos",
"has_fermi_surface": "encyclopedia.properties.fermi_surface",
}
requested_properties = []
filters = get_enc_filter()
must_nots = []
musts = []
add_terms_filter(filters, "material_name", "encyclopedia.material.material_name")
add_terms_filter(filters, "structure_type", "encyclopedia.material.bulk.structure_type")
add_terms_filter(filters, "space_group_number", "encyclopedia.material.bulk.space_group_number")
add_terms_filter(filters, "system_type", "encyclopedia.material.material_type")
add_terms_filter(filters, "crystal_system", "encyclopedia.material.bulk.crystal_system")
add_terms_filter(filters, "band_gap_direct", "encyclopedia.properties.band_gap_direct", query_type="term")
add_terms_filter(filters, "functional_type", "encyclopedia.method.functional_type")
add_terms_filter(filters, "basis_set_type", "dft.basis_set")
add_terms_filter(filters, "code_name", "dft.code_name")
add_range_filter(filters, "band_gap", "encyclopedia.properties.band_gap", ureg.eV, ureg.J)
add_range_filter(filters, "mass_density", "encyclopedia.properties.mass_density")
# Create query for elements or formula
search_by = data["search_by"]
......@@ -466,6 +419,69 @@ class EncMaterialsResource(Resource):
encyclopedia__material__species={"query": query_string, "operator": "and"}
))
# The queries that correspond to AND queries typically need to access
# multiple calculations at once to find the material ids that
# correspond to the query. To implement this behaviour we need to run
# an initial aggregation that checks that the requested properties are
# present for a material. This is a a very crude solution that does not
# scale to complex queries, but I'm not sure we can do much better
# until we have a separate index for materials. The size is set very
# large because all the results need to be returned. We cannot get the
# results in a paginated way with composite aggregation, because
# pipeline aggregations are not compatible with them.
agg_parent = A("terms", field="encyclopedia.material.material_id", size=500000)
for key, value in property_map.items():
if data[key] is True:
agg = A("filter", exists={"field": value})
agg_parent.bucket(key, agg)
requested_properties.append(key)
if len(requested_properties) > 1:
# First we setup a boolean filter query that filters for of the
# requested properties. This will reduce the size of the initial
# set on top of which the more expensive aggregation stack is run
# on.
bool_query = Q(
"bool",
filter=filters,
must_not=must_nots,
must=musts,
should=[Q("exists", field=property_map[x]) for x in requested_properties],
minimum_should_match=1, # At least one of the should query must match
)
s = Search(index=config.elastic.index_name)
s = s.query(bool_query)
# The remaining requested properties have to be queried as a nested
# aggregation.
s.aggs.bucket("materials", agg_parent)
buckets_path = {x: "{}._count".format(x) for x in requested_properties}
script = " && ".join(["params.{} > 0".format(x) for x in requested_properties])
agg_parent.pipeline("selector", A(
"bucket_selector",
buckets_path=buckets_path,
script=script,
))
s = s.extra(**{
"size": 0,
})
response = s.execute()
material_ids = [x["key"] for x in response.aggs.materials.buckets]
if len(material_ids) == 0:
abort(404, message="No materials found for the given search criteria or pagination.")
# Add pre-selected material ids if multiple exists filters were
# requested. These IDs are already filtered based on the user query so
# none of the other search terms need be used.
if len(requested_properties) > 1:
must_nots = []
musts = []
filters = []
filters.append(Q("terms", encyclopedia__material__material_id=material_ids))
if len(requested_properties) == 1:
prop_name = requested_properties[0]
add_exists_filter(filters, must_nots, prop_name, property_map[prop_name])
# The top query filters out entries based on the user query
page = search_by["page"]
per_page = search_by["per_page"]
after = search_by["after"]
......@@ -475,8 +491,6 @@ class EncMaterialsResource(Resource):
must_not=must_nots,
must=musts,
)
# The top query filters out entries based on the user query
s = Search(index=config.elastic.index_name)
s = s.query(bool_query)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment