From 0d16e374aab24429f638e9db5afd6a43ff76f5c4 Mon Sep 17 00:00:00 2001
From: Lauri Himanen <lauri.himanen@gmail.com>
Date: Mon, 22 Jun 2020 12:52:30 +0300
Subject: [PATCH] Now the /materials route behaves exactly like the old one.

---
 nomad/app/api/encyclopedia.py | 167 +++++++++++++++++++++++-----------
 1 file changed, 114 insertions(+), 53 deletions(-)

diff --git a/nomad/app/api/encyclopedia.py b/nomad/app/api/encyclopedia.py
index 016c35735f..8e3c692ca7 100644
--- a/nomad/app/api/encyclopedia.py
+++ b/nomad/app/api/encyclopedia.py
@@ -16,6 +16,7 @@
 The encyclopedia API of the nomad@FAIRDI APIs.
 """
 import re
+import math
 import numpy as np
 
 from flask_restplus import Resource, abort, fields, marshal
@@ -367,61 +368,121 @@ class EncMaterialsResource(Resource):
         s = Search(index=config.elastic.index_name)
         s = s.query(bool_query)
 
-        # The materials are grouped by using three aggregations:
-        # "Composite" to enable scrolling, "Terms" to enable selecting
-        # by material_id and "Top Hits" to fetch a single
-        # representative material document. Unnecessary fields are
-        # filtered to reduce data transfer.
-        terms_agg = A("terms", field="encyclopedia.material.material_id")
-        composite_kwargs = {"sources": {"materials": terms_agg}, "size": per_page}
-
-        # The number of matched materials is only requested on the first
-        # search, not for each page.
-        if after is not None:
-            composite_kwargs["after"] = after
-        else:
+        # 1. Aggregation approach. Page information is incomplete, but the
+        # aggregation results are returned quickly and they include the
+        # number of calculation entries per material.
+        mode = "collapse"
+        if mode == "aggregation":
+            # The materials are grouped by using three aggregations:
+            # "Composite" to enable scrolling, "Terms" to enable selecting
+            # by material_id and "Top Hits" to fetch a single
+            # representative material document. Unnecessary fields are
+            # filtered to reduce data transfer.
+            terms_agg = A("terms", field="encyclopedia.material.material_id")
+            composite_kwargs = {"sources": {"materials": terms_agg}, "size": per_page}
+
+            # The number of matched materials is only requested on the first
+            # search, not for each page.
+            if after is not None:
+                composite_kwargs["after"] = after
+            else:
+                cardinality_agg = A("cardinality", field="encyclopedia.material.material_id", precision_threshold=1000)
+                s.aggs.metric("n_materials", cardinality_agg)
+
+            composite_agg = A("composite", **composite_kwargs)
+            composite_agg.metric("representative", A(
+                "top_hits",
+                size=1,
+                _source={"includes": list(material_prop_map.values())},
+            ))
+            s.aggs.bucket("materials", composite_agg)
+
+            # We ignore the top level hits.
+            s = s.extra(**{
+                "size": 0,
+            })
+
+            response = s.execute()
+            materials = response.aggs.materials.buckets
+            if len(materials) == 0:
+                abort(404, message="No materials found for the given search criteria or pagination.")
+            after_new = response.aggs.materials["after_key"]
+
+            # Gather results from aggregations
+            result_list = []
+            materials = response.aggs.materials.buckets
+            keys = list(material_prop_map.keys())
+            for material in materials:
+                representative = material["representative"][0]
+                mat_dict = get_es_doc_values(representative, material_prop_map, keys)
+                mat_dict["n_matches"] = material.doc_count
+                result_list.append(mat_dict)
+
+            # Page information is incomplete for aggregations
+            pages = {
+                "page": page,
+                "per_page": per_page,
+                "after": after_new,
+            }
+            if after is None:
+                n_materials = response.aggs.n_materials.value
+                pages["total"] = n_materials
+
+        # 2. Collapse approach. Quickly provides a list of materials
+        # corresponding to the query and offers full pagination. The collapsed
+        # hits don't include the number of matches per material.
+        elif mode == "collapse":
+            s = Search(index=config.elastic.index_name)
+            s = s.query(bool_query)
+
+            # Add cardinality aggregation that gives out the total number of materials
             cardinality_agg = A("cardinality", field="encyclopedia.material.material_id", precision_threshold=1000)
             s.aggs.metric("n_materials", cardinality_agg)
 
-        composite_agg = A("composite", **composite_kwargs)
-        composite_agg.metric("representative", A(
-            "top_hits",
-            size=1,
-            _source={"includes": list(material_prop_map.values())},
-        ))
-        s.aggs.bucket("materials", composite_agg)
-
-        # We ignore the top level hits
-        s = s.extra(**{
-            "size": 0,
-        })
-
-        response = s.execute()
-        materials = response.aggs.materials.buckets
-        if len(materials) == 0:
-            abort(404, message="No materials found for the given search criteria or pagination.")
-        after_new = response.aggs.materials["after_key"]
-
-        # Gather results from aggregations
-        result_list = []
-        materials = response.aggs.materials.buckets
-        keys = list(material_prop_map.keys())
-        for material in materials:
-            representative = material["representative"][0]
-            mat_dict = get_es_doc_values(representative, material_prop_map, keys)
-            mat_dict["n_matches"] = material.doc_count
-            result_list.append(mat_dict)
-
-        # Page information is incomplete for aggregations
-        pages = {
-            "page": page,
-            "per_page": per_page,
-            "after": after_new,
-        }
+            s = s.extra(**{
+                "collapse": {"field": "encyclopedia.material.material_id"},
+                "size": per_page,
+                "from": (page - 1) * per_page,
+                "sort": [{"encyclopedia.material.formula_reduced": {"order": "asc"}}],
+                "explain": True,
+            })
+
+            # Execute query
+            response = s.execute()
+
+            # No matches
+            if len(response) == 0:
+                abort(404, message="No materials found for the given search criteria or pagination.")
+
+            # Gather number of entries per material with a separate query
+            material_ids = [x.encyclopedia.material.material_id for x in response]
+            s = Search(index=config.elastic.index_name)
+            bool_query = Q(
+                "bool",
+                filter=Q("terms", encyclopedia__material__material_id=material_ids),
+            )
+            s2 = s.query(bool_query)
+            s2.aggs.bucket("n_matches", A("terms", field="encyclopedia.material.material_id"))
+            response2 = s2.execute()
+            matmap = {x.key: x.doc_count for x in response2.aggs.n_matches}
 
-        if after is None:
-            n_materials = response.aggs.n_materials.value
-            pages["total"] = n_materials
+            # Loop over materials
+            result_list = []
+            keys = list(material_prop_map.keys())
+            for material in response:
+                # Get values from the collapsed doc
+                mat_result = get_es_doc_values(material, material_prop_map, keys)
+                mat_id = material.encyclopedia.material.material_id
+                mat_result["n_matches"] = matmap[mat_id]
+                result_list.append(mat_result)
+
+            # Full page information available for collapse
+            pages = {
+                "page": page,
+                "per_page": per_page,
+                "pages": math.ceil(response.hits.total / per_page),
+                "total": response.aggs.n_materials.value,
+            }
 
         result = {
             "results": result_list,
@@ -1134,8 +1195,8 @@ class EncCalculationResource(Resource):
 
                 # Pre-calculate k-path length to be used as x-coordinate in
                 # plots. If the VBM and CBM information is needed later, it
-                # can be added as indices along the path. The exact
-                # k-points and occupations are removed to save band width.
+                # can be added as indices along the path. The exact k-points
+                # and occupations are removed to save some bandwidth.
                 if key == "electronic_band_structure" or key == "phonon_band_structure":
                     segments = value["section_k_band_segment"]
                     k_path_length = 0
-- 
GitLab