Commit 5baba261 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Implemented search in repo search endpoint. Added search for files, path, quantities, etc.

parent b75b8896
Pipeline #43896 passed with stages
in 22 minutes and 54 seconds
......@@ -22,9 +22,9 @@ from .app import api
pagination_model = api.model('Pagination', {
'total': fields.Integer,
'page': fields.Integer,
'per_page': fields.Integer,
'total': fields.Integer(description='Number of total elements.'),
'page': fields.Integer(description='Number of the current page, starting with 0.'),
'per_page': fields.Integer(description='Number of elements per page.'),
""" Model used in responses with pagination. """
......@@ -22,7 +22,7 @@ from flask import request, g
from elasticsearch_dsl import Q
from nomad.files import UploadFiles, Restricted
from import Entry
from nomad import search
from .app import api
from .auth import login_if_available, create_authorization_predicate
......@@ -61,7 +61,12 @@ class RepoCalcResource(Resource):
repo_calcs_model = api.model('RepoCalculations', {
'pagination': fields.Nested(pagination_model),
'results': fields.List(fields.Raw)
'results': fields.List(fields.Raw, description=(
'A list of search results. Each result is a dict with quantity names as keys and '
'values as values')),
'aggregations': fields.Raw(description=(
'A dict with all aggregations. Each aggregation is a dictionary with the amount as '
'value and quantity value as key.'))
repo_request_parser = pagination_request_parser.copy()
......@@ -69,28 +74,35 @@ repo_request_parser.add_argument(
'owner', type=str,
help='Specify which calcs to return: ``all``, ``user``, ``staging``, default is ``all``')
for search_quantity in search.search_quantities.keys():
_, _, description = search.search_quantities[search_quantity]
repo_request_parser.add_argument(search_quantity, type=str, help=description)
class RepoCalcsResource(Resource):
@api.response(400, 'Invalid requests, e.g. wrong owner type')
@api.response(400, 'Invalid requests, e.g. wrong owner type or bad quantities')
@api.expect(repo_request_parser, validate=True)
@api.marshal_with(repo_calcs_model, skip_none=True, code=200, description='Metadata send')
def get(self):
Get *'all'* calculations in repository from, paginated.
This is currently not implemented!
Search for calculations in the repository, paginated.
The ``owner`` parameter determines the overall entries to search through.
You can use the various quantities to search/filter for. For some of the
indexed quantities this endpoint returns aggregation information. This means
you will be given a list of all possible values and the number of entries
that have a certain value. You can also use these aggregations on an empty
search to determine the possible values.
# return dict(pagination=dict(total=0, page=1, per_page=10), results=[]), 200
page = int(request.args.get('page', 1))
page = int(request.args.get('page', 0))
per_page = int(request.args.get('per_page', 10))
owner = request.args.get('owner', 'all')
assert page >= 1
assert page >= 0
assert per_page > 0
except AssertionError:
abort(400, message='invalid pagination')
......@@ -112,13 +124,18 @@ class RepoCalcsResource(Resource):
abort(400, message='Invalid owner value. Valid values are all|user|staging, default is all')
search =
search = search[(page - 1) * per_page: page * per_page]
return {
'pagination': {
'total': search.count(),
'page': page,
'per_page': per_page
'results': [hit.to_dict() for hit in search]
}, 200
data = dict(**request.args)
data.pop('owner', None)
data.pop('page', None)
data.pop('per_page', None)
total, results, aggregations = search.aggregate_search(
page=page, per_page=per_page, q=q, **data)
except KeyError as e:
abort(400, str(e))
return dict(
pagination=dict(total=total, page=page, per_page=per_page),
aggregations=aggregations), 200
......@@ -18,12 +18,16 @@ This module represents calculations in elastic search.
from typing import Iterable, Dict, Tuple, List
from elasticsearch_dsl import Document, InnerDoc, Keyword, Text, Date, \
Object, Boolean, Search, Integer, Q, A
Object, Boolean, Search, Integer, Q, A, analyzer, tokenizer
import elasticsearch.helpers
from nomad import config, datamodel, infrastructure, datamodel, coe_repo, parsing
path_analyzer = analyzer(
tokenizer=tokenizer('path_tokenizer', 'pattern', pattern='/'))
class AlreadyExists(Exception): pass
......@@ -39,13 +43,11 @@ class User(InnerDoc):
name = '%s, %s' % (user['last_name'], user['first_name']) = name
self.name_keyword = name
return self
user_id = Keyword()
name = Text()
name_keyword = Keyword()
name = Text(fields={'keyword': Keyword()})
class Dataset(InnerDoc):
......@@ -72,7 +74,7 @@ class Entry(Document):
calc_hash = Keyword()
pid = Keyword()
mainfile = Keyword()
files = Keyword(multi=True)
files = Text(multi=True, analyzer=path_analyzer, fields={'keyword': Keyword()})
uploader = Object(User)
with_embargo = Boolean()
......@@ -99,11 +101,6 @@ class Entry(Document):
geometries = Keyword(multi=True)
quantities = Keyword(multi=True)
# def __init__(self, *args, **kwargs):
# super().__init__(*args, **kwargs)
# self.authors = []
# self.owners = []
def from_calc_with_metadata(cls, source: datamodel.CalcWithMetadata) -> 'Entry':
entry = Entry(meta=dict(id=source.calc_id))
......@@ -116,8 +113,15 @@ class Entry(Document):
self.calc_id = source.calc_id
self.calc_hash = source.calc_hash = str(
self.mainfile = source.mainfile
self.files = source.files
if source.files is None:
self.files = [self.mainfile]
elif self.mainfile not in source.files:
self.files = [self.mainfile] + source.files
self.files = source.files
self.uploader = User.from_user_popo(source.uploader) if source.uploader is not None else None
self.with_embargo = source.with_embargo
......@@ -179,7 +183,7 @@ def publish(calcs: Iterable[datamodel.CalcWithMetadata]) -> None:
elasticsearch.helpers.bulk(infrastructure.elastic_client, elastic_updates())
default_aggregations = {
aggregations = {
'atoms': len(,
'system': 10,
'crystal_system': 10,
......@@ -187,12 +191,37 @@ default_aggregations = {
'xc_functional': 10,
'authors': 10
""" The available aggregations in :func:`aggregate_search` and their maximum aggregation size """
search_quantities = {
'atoms': ('term', 'atoms', (
'Search the given atom. This quantity can be used multiple times to search for '
'results with all the given atoms. The atoms are given by their case sensitive '
'symbol, e.g. Fe.')),
'system': ('term', 'system', 'Search for the given system type.'),
'crystal_system': ('term', 'crystal_system', 'Search for the given crystal system.'),
'code_name': ('term', 'code_name', 'Search for the given code name.'),
'xc_functional': ('term', 'xc_functional', 'Search for the given xc functional treatment'),
'authors': ('term', '', (
'Search for the given author. Exact keyword matches in the form "Lastname, Firstname".')),
'comment': ('match', 'comment', 'Search within the comments. This is a text search ala google.'),
'paths': ('match', 'files', (
'Search for elements in one of the file paths. The paths are split at all "/".')),
'files': ('term', 'files.keyword', 'Search for exact file name with full path.'),
'quantities': ('term', 'quantities', 'Search for the existence of a certain meta-info quantity')
The available search quantities in :func:`aggregate_search` as tuples with *search type*,
elastic field and description.
def aggregate_search(
page: int = 0, per_page: int = 10, q: Q = None,
aggregations: Dict[str, int] = default_aggregations,
**kwargs) -> Tuple[int, List[dict], Dict[str, Dict[str, int]]]:
page: int = 0, per_page: int = 10, q: Q = None, **kwargs) -> Tuple[int, List[dict], Dict[str, Dict[str, int]]]:
Performs a search and returns paginated search results and aggregation bucket sizes
based on key quantities.
......@@ -203,7 +232,7 @@ def aggregate_search(
q: An *elasticsearch_dsl* query used to further filter the results (via `and`)
aggregations: A customized list of aggregations to perform. Keys are index fields,
and values the amount of buckets to return. Only works on *keyword* field.
**kwargs: Field, value pairs to search for.
**kwargs: Quantity, value pairs to search for.
Returns: A tuple with the total hits, an array with the results, an dictionary with
the aggregation data.
......@@ -211,12 +240,18 @@ def aggregate_search(
search = Search()
if q is not None:
search = search.query(q)
for key, value in kwargs.items():
if key == 'comment':
search = search.query(Q('match', **{key: value}))
query_type, field, _ = search_quantities.get(key, (None, None, None))
if query_type is None:
raise KeyError('Unknown quantity %s' % key)
if isinstance(value, list):
for item in value:
search = search.query(Q(query_type, **{field: item}))
search = search.query(Q('term', **{key: value}))
search = search.query(Q(query_type, **{field: value}))
for aggregation, size in aggregations.items():
if aggregation == 'authors':
......@@ -261,7 +296,7 @@ def authors(per_page: int = 10, after: str = None, prefix: str = None) -> Tuple[
composite = dict(
if after is not None:
......@@ -545,6 +545,7 @@ class TestRepo(UploadFilesBasedTests):
calc_with_metadata.update(calc_id='2', uploader=other_test_user.to_popo(), published=True)
calc_with_metadata.update(atoms=['Fe'], comment='this is a specific word')
calc_with_metadata.update(calc_id='3', uploader=other_test_user.to_popo(), published=False)
......@@ -566,9 +567,9 @@ class TestRepo(UploadFilesBasedTests):
(1, 'user', 'test_user'),
(2, 'user', 'other_test_user'),
(0, 'staging', 'test_user'),
(1, 'staging', 'other_test_user'),
(1, 'staging', 'other_test_user')
def test_search(self, client, example_elastic_calcs, no_warn, test_user_auth, other_test_user_auth, calcs, owner, auth):
def test_search_owner(self, client, example_elastic_calcs, no_warn, test_user_auth, other_test_user_auth, calcs, owner, auth):
auth = dict(none=None, test_user=test_user_auth, other_test_user=other_test_user_auth).get(auth)
rv = client.get('/repo/?owner=%s' % owner, headers=auth)
assert rv.status_code == 200
......@@ -581,7 +582,45 @@ class TestRepo(UploadFilesBasedTests):
for key in ['uploader', 'calc_id', 'formula', 'upload_id']:
assert key in results[0]
def test_calcs_pagination(self, client, example_elastic_calcs, no_warn):
@pytest.mark.parametrize('calcs, quantity, value', [
(2, 'system', 'Bulk'),
(0, 'system', 'Atom'),
(1, 'atoms', 'Br'),
(1, 'atoms', 'Fe'),
(0, 'atoms', ['Fe', 'Br']),
(1, 'comment', 'specific'),
(1, 'authors', 'Hofstadter, Leonard'),
(2, 'files', 'test/mainfile.txt'),
(2, 'paths', 'mainfile.txt'),
(2, 'paths', 'test'),
(2, 'quantities', ['wyckoff_letters_primitive', 'hall_number']),
(0, 'quantities', 'dos')
def test_search_quantities(self, client, example_elastic_calcs, no_warn, test_user_auth, calcs, quantity, value):
if isinstance(value, list):
query_string = '&'.join('%s=%s' % (quantity, item) for item in value)
query_string = '%s=%s' % (quantity, value)
rv = client.get('/repo/?%s' % query_string, headers=test_user_auth)
assert rv.status_code == 200
data = json.loads(
results = data.get('results', None)
assert results is not None
assert isinstance(results, list)
assert len(results) == calcs
aggregations = data.get('aggregations', None)
assert aggregations is not None
if quantity == 'system' and calcs != 0:
# for simplicity we only assert on aggregations for this case
assert 'system' in aggregations
assert len(aggregations['system']) == 1
assert value in aggregations['system']
def test_search_pagination(self, client, example_elastic_calcs, no_warn):
rv = client.get('/repo/?page=1&per_page=1')
assert rv.status_code == 200
data = json.loads(
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment