Skip to content
Snippets Groups Projects
Commit 4861d6a4 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Added users metrics and date histogram aggregation to search.

parent b42ef722
Branches
Tags
1 merge request!50Towards v0.5.1
Pipeline #53810 passed
...@@ -110,6 +110,8 @@ repo_request_parser.add_argument( ...@@ -110,6 +110,8 @@ repo_request_parser.add_argument(
'scroll', type=bool, help='Enable scrolling') 'scroll', type=bool, help='Enable scrolling')
repo_request_parser.add_argument( repo_request_parser.add_argument(
'scroll_id', type=str, help='The id of the current scrolling window to use.') 'scroll_id', type=str, help='The id of the current scrolling window to use.')
# Optional flag that adds an upload-time date histogram aggregation to the
# /repo/ search results (consumed in RepoCalcsResource.get below).
# NOTE(review): reqparse's type=bool applies bool() to the raw query string,
# so any non-empty value — including 'false' — parses as True; consider
# flask_restplus.inputs.boolean for real true/false parsing — TODO confirm.
repo_request_parser.add_argument(
    'date_histogram', type=bool, help='Add an additional aggregation over the upload time')
repo_request_parser.add_argument( repo_request_parser.add_argument(
'metrics', type=str, action='append', help=( 'metrics', type=str, action='append', help=(
'Metrics to aggregate over all quantities and their values as comma separated list. ' 'Metrics to aggregate over all quantities and their values as comma separated list. '
...@@ -199,6 +201,7 @@ class RepoCalcsResource(Resource): ...@@ -199,6 +201,7 @@ class RepoCalcsResource(Resource):
try: try:
scroll = bool(request.args.get('scroll', False)) scroll = bool(request.args.get('scroll', False))
date_histogram = bool(request.args.get('date_histogram', False))
scroll_id = request.args.get('scroll_id', None) scroll_id = request.args.get('scroll_id', None)
page = int(request.args.get('page', 1)) page = int(request.args.get('page', 1))
per_page = int(request.args.get('per_page', 10 if not scroll else 1000)) per_page = int(request.args.get('per_page', 10 if not scroll else 1000))
...@@ -250,7 +253,8 @@ class RepoCalcsResource(Resource): ...@@ -250,7 +253,8 @@ class RepoCalcsResource(Resource):
else: else:
results = search.metrics_search( results = search.metrics_search(
q=q, per_page=per_page, page=page, order=order, order_by=order_by, q=q, per_page=per_page, page=page, order=order, order_by=order_by,
time_range=time_range, metrics_to_use=metrics, search_parameters=search_parameters) time_range=time_range, metrics_to_use=metrics, search_parameters=search_parameters,
with_date_histogram=date_histogram)
# TODO just a work around to make things prettier # TODO just a work around to make things prettier
quantities = results['quantities'] quantities = results['quantities']
......
...@@ -219,7 +219,8 @@ search_quantities = datamodel.Domain.instance.search_quantities ...@@ -219,7 +219,8 @@ search_quantities = datamodel.Domain.instance.search_quantities
metrics = { metrics = {
'datasets': ('cardinality', 'datasets.id'), 'datasets': ('cardinality', 'datasets.id'),
'unique_code_runs': ('cardinality', 'calc_hash') 'unique_code_runs': ('cardinality', 'calc_hash'),
'users': ('cardinality', 'uploader.name.keyword')
} }
""" """
The available search metrics. Metrics are integer values given for each entry that can The available search metrics. Metrics are integer values given for each entry that can
...@@ -476,7 +477,7 @@ def quantity_search( ...@@ -476,7 +477,7 @@ def quantity_search(
def metrics_search( def metrics_search(
quantities: Dict[str, int] = aggregations, metrics_to_use: List[str] = [], quantities: Dict[str, int] = aggregations, metrics_to_use: List[str] = [],
with_entries: bool = True, **kwargs) -> Dict[str, Any]: with_entries: bool = True, with_date_histogram: bool = False, **kwargs) -> Dict[str, Any]:
""" """
Performs a search like :func:`entry_search`, but instead of entries, returns the given Performs a search like :func:`entry_search`, but instead of entries, returns the given
metrics aggregated for (a limited set of values) of the given quantities calculated metrics aggregated for (a limited set of values) of the given quantities calculated
...@@ -524,7 +525,6 @@ def metrics_search( ...@@ -524,7 +525,6 @@ def metrics_search(
# We are using elastic searchs 'composite aggregations' here. We do not really # We are using elastic searchs 'composite aggregations' here. We do not really
# compose aggregations, but only those pseudo composites allow us to use the # compose aggregations, but only those pseudo composites allow us to use the
# 'after' feature that allows to scan through all aggregation values. # 'after' feature that allows to scan through all aggregation values.
terms: Dict[str, Any] = None
quantity = search_quantities[quantity_name] quantity = search_quantities[quantity_name]
min_doc_count = 0 if quantity.zero_aggs else 1 min_doc_count = 0 if quantity.zero_aggs else 1
terms = A( terms = A(
...@@ -535,6 +535,10 @@ def metrics_search( ...@@ -535,6 +535,10 @@ def metrics_search(
if quantity_name not in ['authors']: if quantity_name not in ['authors']:
add_metrics(buckets) add_metrics(buckets)
if with_date_histogram:
histogram = A('date_histogram', field='upload_time', interval='1M', format='yyyy-MM-dd')
add_metrics(search.aggs.bucket('date_histogram', histogram))
add_metrics(search.aggs) add_metrics(search.aggs)
response, entry_results = _execute_paginated_search(search, **kwargs) response, entry_results = _execute_paginated_search(search, **kwargs)
...@@ -557,6 +561,12 @@ def metrics_search( ...@@ -557,6 +561,12 @@ def metrics_search(
if quantity_name not in metrics_names # ES aggs for total metrics, and aggs for quantities stand side by side if quantity_name not in metrics_names # ES aggs for total metrics, and aggs for quantities stand side by side
} }
if with_date_histogram:
metrics_results['date_histogram'] = {
bucket.key_as_string: get_metrics(bucket, bucket.doc_count)
for bucket in response.aggregations.date_histogram.buckets
}
total_metrics_result = get_metrics(response.aggregations, entry_results['pagination']['total']) total_metrics_result = get_metrics(response.aggregations, entry_results['pagination']['total'])
metrics_results['total'] = dict(all=total_metrics_result) metrics_results['total'] = dict(all=total_metrics_result)
......
...@@ -826,6 +826,14 @@ class TestRepo(): ...@@ -826,6 +826,14 @@ class TestRepo():
else: else:
assert len(metrics_result) == 1 # code_runs is the only metric for authors assert len(metrics_result) == 1 # code_runs is the only metric for authors
def test_search_date_histogram(self, client, example_elastic_calcs, no_warn):
    """Requesting ``date_histogram=true`` should add a non-empty
    'date_histogram' aggregation to the ``quantities`` of the response.

    Uses the ``example_elastic_calcs`` fixture for indexed test data and the
    ``no_warn`` fixture to assert that no warnings are logged.
    """
    rv = client.get('/repo/?date_histogram=true&metrics=total_energies')
    assert rv.status_code == 200
    data = json.loads(rv.data)
    histogram = data.get('quantities').get('date_histogram')
    # Assert presence explicitly so a missing aggregation fails with a clear
    # assertion instead of a TypeError from len(None).
    assert histogram is not None
    assert len(histogram) > 0
@pytest.mark.parametrize('n_results, page, per_page', [(2, 1, 5), (1, 1, 1), (0, 2, 3)]) @pytest.mark.parametrize('n_results, page, per_page', [(2, 1, 5), (1, 1, 1), (0, 2, 3)])
def test_search_pagination(self, client, example_elastic_calcs, no_warn, n_results, page, per_page): def test_search_pagination(self, client, example_elastic_calcs, no_warn, n_results, page, per_page):
rv = client.get('/repo/?page=%d&per_page=%d' % (page, per_page)) rv = client.get('/repo/?page=%d&per_page=%d' % (page, per_page))
......
...@@ -66,9 +66,13 @@ def test_metrics_search(elastic, normalized: parsing.LocalBackend): ...@@ -66,9 +66,13 @@ def test_metrics_search(elastic, normalized: parsing.LocalBackend):
create_entry(calc_with_metadata) create_entry(calc_with_metadata)
refresh_index() refresh_index()
assert 'users' in search.metrics_names
assert 'datasets' in search.metrics_names
assert 'unique_code_runs' in search.metrics_names
use_metrics = search.metrics_names use_metrics = search.metrics_names
results = metrics_search(metrics_to_use=use_metrics, with_entries=True) results = metrics_search(metrics_to_use=use_metrics, with_entries=True, with_date_histogram=True)
quantities = results['quantities'] quantities = results['quantities']
hits = results['results'] hits = results['results']
assert results['pagination']['total'] == 1 assert results['pagination']['total'] == 1
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment