Skip to content
Snippets Groups Projects
Commit 4861d6a4 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Added users metrics and date histogram aggregation to search.

parent b42ef722
Branches
Tags
1 merge request!50Towards v0.5.1
Pipeline #53810 passed
...@@ -110,6 +110,8 @@ repo_request_parser.add_argument( ...@@ -110,6 +110,8 @@ repo_request_parser.add_argument(
'scroll', type=bool, help='Enable scrolling') 'scroll', type=bool, help='Enable scrolling')
repo_request_parser.add_argument( repo_request_parser.add_argument(
'scroll_id', type=str, help='The id of the current scrolling window to use.') 'scroll_id', type=str, help='The id of the current scrolling window to use.')
# Optional flag that adds an upload-time date histogram aggregation to the
# /repo/ search results (consumed in RepoCalcsResource.get below).
# NOTE(review): reqparse's type=bool applies bool() to the raw query string,
# so any non-empty value — including 'false' — parses as True; consider
# flask_restplus.inputs.boolean for real true/false parsing — TODO confirm.
repo_request_parser.add_argument(
    'date_histogram', type=bool, help='Add an additional aggregation over the upload time')
repo_request_parser.add_argument( repo_request_parser.add_argument(
'metrics', type=str, action='append', help=( 'metrics', type=str, action='append', help=(
'Metrics to aggregate over all quantities and their values as comma separated list. ' 'Metrics to aggregate over all quantities and their values as comma separated list. '
...@@ -199,6 +201,7 @@ class RepoCalcsResource(Resource): ...@@ -199,6 +201,7 @@ class RepoCalcsResource(Resource):
try: try:
scroll = bool(request.args.get('scroll', False)) scroll = bool(request.args.get('scroll', False))
date_histogram = bool(request.args.get('date_histogram', False))
scroll_id = request.args.get('scroll_id', None) scroll_id = request.args.get('scroll_id', None)
page = int(request.args.get('page', 1)) page = int(request.args.get('page', 1))
per_page = int(request.args.get('per_page', 10 if not scroll else 1000)) per_page = int(request.args.get('per_page', 10 if not scroll else 1000))
...@@ -250,7 +253,8 @@ class RepoCalcsResource(Resource): ...@@ -250,7 +253,8 @@ class RepoCalcsResource(Resource):
else: else:
results = search.metrics_search( results = search.metrics_search(
q=q, per_page=per_page, page=page, order=order, order_by=order_by, q=q, per_page=per_page, page=page, order=order, order_by=order_by,
time_range=time_range, metrics_to_use=metrics, search_parameters=search_parameters) time_range=time_range, metrics_to_use=metrics, search_parameters=search_parameters,
with_date_histogram=date_histogram)
# TODO just a work around to make things prettier # TODO just a work around to make things prettier
quantities = results['quantities'] quantities = results['quantities']
......
...@@ -219,7 +219,8 @@ search_quantities = datamodel.Domain.instance.search_quantities ...@@ -219,7 +219,8 @@ search_quantities = datamodel.Domain.instance.search_quantities
metrics = { metrics = {
'datasets': ('cardinality', 'datasets.id'), 'datasets': ('cardinality', 'datasets.id'),
'unique_code_runs': ('cardinality', 'calc_hash') 'unique_code_runs': ('cardinality', 'calc_hash'),
'users': ('cardinality', 'uploader.name.keyword')
} }
""" """
The available search metrics. Metrics are integer values given for each entry that can The available search metrics. Metrics are integer values given for each entry that can
...@@ -476,7 +477,7 @@ def quantity_search( ...@@ -476,7 +477,7 @@ def quantity_search(
def metrics_search( def metrics_search(
quantities: Dict[str, int] = aggregations, metrics_to_use: List[str] = [], quantities: Dict[str, int] = aggregations, metrics_to_use: List[str] = [],
with_entries: bool = True, **kwargs) -> Dict[str, Any]: with_entries: bool = True, with_date_histogram: bool = False, **kwargs) -> Dict[str, Any]:
""" """
Performs a search like :func:`entry_search`, but instead of entries, returns the given Performs a search like :func:`entry_search`, but instead of entries, returns the given
metrics aggregated for (a limited set of values) of the given quantities calculated metrics aggregated for (a limited set of values) of the given quantities calculated
...@@ -524,7 +525,6 @@ def metrics_search( ...@@ -524,7 +525,6 @@ def metrics_search(
# We are using elastic searchs 'composite aggregations' here. We do not really # We are using elastic searchs 'composite aggregations' here. We do not really
# compose aggregations, but only those pseudo composites allow us to use the # compose aggregations, but only those pseudo composites allow us to use the
# 'after' feature that allows to scan through all aggregation values. # 'after' feature that allows to scan through all aggregation values.
terms: Dict[str, Any] = None
quantity = search_quantities[quantity_name] quantity = search_quantities[quantity_name]
min_doc_count = 0 if quantity.zero_aggs else 1 min_doc_count = 0 if quantity.zero_aggs else 1
terms = A( terms = A(
...@@ -535,6 +535,10 @@ def metrics_search( ...@@ -535,6 +535,10 @@ def metrics_search(
if quantity_name not in ['authors']: if quantity_name not in ['authors']:
add_metrics(buckets) add_metrics(buckets)
if with_date_histogram:
histogram = A('date_histogram', field='upload_time', interval='1M', format='yyyy-MM-dd')
add_metrics(search.aggs.bucket('date_histogram', histogram))
add_metrics(search.aggs) add_metrics(search.aggs)
response, entry_results = _execute_paginated_search(search, **kwargs) response, entry_results = _execute_paginated_search(search, **kwargs)
...@@ -557,6 +561,12 @@ def metrics_search( ...@@ -557,6 +561,12 @@ def metrics_search(
if quantity_name not in metrics_names # ES aggs for total metrics, and aggs for quantities stand side by side if quantity_name not in metrics_names # ES aggs for total metrics, and aggs for quantities stand side by side
} }
if with_date_histogram:
metrics_results['date_histogram'] = {
bucket.key_as_string: get_metrics(bucket, bucket.doc_count)
for bucket in response.aggregations.date_histogram.buckets
}
total_metrics_result = get_metrics(response.aggregations, entry_results['pagination']['total']) total_metrics_result = get_metrics(response.aggregations, entry_results['pagination']['total'])
metrics_results['total'] = dict(all=total_metrics_result) metrics_results['total'] = dict(all=total_metrics_result)
......
...@@ -826,6 +826,14 @@ class TestRepo(): ...@@ -826,6 +826,14 @@ class TestRepo():
else: else:
assert len(metrics_result) == 1 # code_runs is the only metric for authors assert len(metrics_result) == 1 # code_runs is the only metric for authors
def test_search_date_histogram(self, client, example_elastic_calcs, no_warn):
    """Requesting ``date_histogram=true`` should add a non-empty
    'date_histogram' aggregation to the ``quantities`` of the response.

    Uses the ``example_elastic_calcs`` fixture for indexed test data and the
    ``no_warn`` fixture to assert that no warnings are logged.
    """
    rv = client.get('/repo/?date_histogram=true&metrics=total_energies')
    assert rv.status_code == 200
    data = json.loads(rv.data)
    histogram = data.get('quantities').get('date_histogram')
    # Assert presence explicitly so a missing aggregation fails with a clear
    # assertion instead of a TypeError from len(None).
    assert histogram is not None
    assert len(histogram) > 0
@pytest.mark.parametrize('n_results, page, per_page', [(2, 1, 5), (1, 1, 1), (0, 2, 3)]) @pytest.mark.parametrize('n_results, page, per_page', [(2, 1, 5), (1, 1, 1), (0, 2, 3)])
def test_search_pagination(self, client, example_elastic_calcs, no_warn, n_results, page, per_page): def test_search_pagination(self, client, example_elastic_calcs, no_warn, n_results, page, per_page):
rv = client.get('/repo/?page=%d&per_page=%d' % (page, per_page)) rv = client.get('/repo/?page=%d&per_page=%d' % (page, per_page))
......
...@@ -66,9 +66,13 @@ def test_metrics_search(elastic, normalized: parsing.LocalBackend): ...@@ -66,9 +66,13 @@ def test_metrics_search(elastic, normalized: parsing.LocalBackend):
create_entry(calc_with_metadata) create_entry(calc_with_metadata)
refresh_index() refresh_index()
assert 'users' in search.metrics_names
assert 'datasets' in search.metrics_names
assert 'unique_code_runs' in search.metrics_names
use_metrics = search.metrics_names use_metrics = search.metrics_names
results = metrics_search(metrics_to_use=use_metrics, with_entries=True) results = metrics_search(metrics_to_use=use_metrics, with_entries=True, with_date_histogram=True)
quantities = results['quantities'] quantities = results['quantities']
hits = results['results'] hits = results['results']
assert results['pagination']['total'] == 1 assert results['pagination']['total'] == 1
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment