test_search.py 12.6 KB
Newer Older
Markus Scheidgen's avatar
Markus Scheidgen committed
1
2
3
4
#
# Copyright The NOMAD Authors.
#
# This file is part of NOMAD. See https://nomad-lab.eu for further info.
5
6
7
8
9
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
Markus Scheidgen's avatar
Markus Scheidgen committed
10
#     http://www.apache.org/licenses/LICENSE-2.0
11
12
#
# Unless required by applicable law or agreed to in writing, software
Markus Scheidgen's avatar
Markus Scheidgen committed
13
# distributed under the License is distributed on an "AS IS" BASIS,
14
15
16
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Markus Scheidgen's avatar
Markus Scheidgen committed
17
#
18

19
from typing import List, Iterable
20
from elasticsearch_dsl import Q
21
import pytest
22
from datetime import datetime
23

Markus Scheidgen's avatar
Markus Scheidgen committed
24
from nomad import datamodel, search, processing, infrastructure, config
25
from nomad.search import entry_document, SearchRequest
26
27


Markus Scheidgen's avatar
Markus Scheidgen committed
28
def test_init_mapping(elastic):
    """Smoke test: has no body of its own and only requests the ``elastic`` fixture."""
Markus Scheidgen's avatar
Markus Scheidgen committed
30
31


32
def test_index_skeleton_calc(elastic):
    """Index an entry built from a few explicitly given fields only.

    The checks are the ones performed inside ``create_entry``.
    """
    skeleton = datamodel.EntryMetadata(
        domain='dft',
        upload_id='test_upload',
        calc_id='test_calc',
        mainfile='test/mainfile',
        files=['test/file1', 'test/file2'],
    )

    create_entry(skeleton)
38
39


Markus Scheidgen's avatar
Markus Scheidgen committed
40
41
def test_index_normalized_calc(elastic, normalized: datamodel.EntryArchive):
    """Index the metadata of a normalized archive and check a few flattened keys."""
    metadata = normalized.section_metadata
    metadata.m_update(domain='dft', upload_id='test upload id', calc_id='test id')
    metadata.apply_domain_metadata(normalized)

    indexed = create_entry(metadata)
    flat_entry = search.flat(indexed.to_dict())

    # Keys that must be present in the flattened search document.
    expected_keys = (
        'calc_id',
        'atoms',
        'dft.code_name',
        'dft.optimade.elements_ratios',
    )
    for key in expected_keys:
        assert key in flat_entry
52
53
54


def test_index_normalized_calc_with_metadata(
        elastic, normalized: datamodel.EntryArchive, internal_example_user_metadata: dict):
    """Index an entry with user metadata applied and compare two fields on the result."""
    user_metadata = internal_example_user_metadata

    metadata = normalized.section_metadata
    metadata.m_update(domain='dft', upload_id='test upload id', calc_id='test id')
    metadata.apply_domain_metadata(normalized)

    # is for uploads only
    user_metadata.pop('embargo_length')
    metadata.apply_user_metadata(user_metadata)

    entry = create_entry(metadata)

    # The indexed document must carry the values given as user metadata.
    for field in ('with_embargo', 'comment'):
        assert getattr(entry, field) == user_metadata[field]
67
68


69
70
71
72
73
74
75
76
77
78
79
80
def test_index_normalized_calc_with_author(
        elastic, normalized: datamodel.EntryArchive, internal_example_user_metadata: dict):
    """Index an entry that has a coauthor given as a plain dict and flatten the result."""
    metadata = normalized.section_metadata
    coauthor = dict(first_name='Howard', last_name='Wolowitz')
    metadata.m_update(
        domain='dft',
        upload_id='test upload id',
        calc_id='test id',
        coauthors=[coauthor])
    metadata.apply_domain_metadata(normalized)

    indexed = create_entry(metadata)
    # Nothing is asserted on the flattened document; the call only has to succeed.
    search.flat(indexed.to_dict())


81
82
83
84
def test_index_upload(elastic, processed: processing.Upload):
    """Smoke test: has no body of its own and only requests its two fixtures."""


85
@pytest.fixture()
def example_search_data(elastic, normalized: datamodel.EntryArchive):
    """Index the metadata of ``normalized`` as a single dft entry and return the archive."""
    metadata = normalized.section_metadata
    entry_fields = dict(
        domain='dft',
        upload_id='test upload id',
        calc_id='test id',
        upload_time=datetime.now())
    metadata.m_update(**entry_fields)
    metadata.apply_domain_metadata(normalized)

    create_entry(metadata)
    refresh_index()

    return normalized
96

97

98
@pytest.fixture()
def example_ems_search_data(elastic, parsed_ems: datamodel.EntryArchive):
    """Index the metadata of ``parsed_ems`` as a single ems entry and return the archive."""
    metadata = parsed_ems.section_metadata
    entry_fields = dict(
        domain='ems',
        upload_id='test upload id',
        calc_id='test id')
    metadata.m_update(**entry_fields)
    metadata.apply_domain_metadata(parsed_ems)

    create_entry(metadata)
    refresh_index()

    return parsed_ems


110
def test_search_entry(example_search_data):
    """A plain dft request must report at least one hit in ``total``."""
    request = SearchRequest(domain='dft')
    response = request.execute()
    assert response['total'] > 0
Markus Scheidgen's avatar
Markus Scheidgen committed
113
114


115
def test_search_scan(elastic, example_search_data):
    """Scanning a dft request must yield at least one item."""
    request = SearchRequest(domain='dft')
    hits = list(request.execute_scan())
    assert len(hits) > 0
Markus Scheidgen's avatar
Markus Scheidgen committed
118
119


120
def test_search_paginated(elastic, example_search_data):
    """A paginated dft request returns results plus a ``pagination`` section."""
    response = SearchRequest(domain='dft').execute_paginated()

    assert response['total'] > 0
    assert len(response['results']) > 0

    pagination = response['pagination']
    assert pagination['total'] > 0
    for key in ('page', 'per_page'):
        assert key in pagination
128
129


130
def test_search_scroll(elastic, example_search_data):
    """Scroll through the dft results: one hit first, then an empty final page."""
    request = SearchRequest(domain='dft')

    first_page = request.execute_scrolled()
    scroll_id = first_page['scroll']['scroll_id']
    assert first_page['scroll']['total'] == 1
    assert len(first_page['results']) == 1
    assert scroll_id is not None

    # Continuing with the returned id must give no more results and no further id.
    last_page = request.execute_scrolled(scroll_id=scroll_id)
    scroll = last_page['scroll']
    assert scroll['total'] == 1
    assert len(last_page['results']) == 0
    assert 'scroll_id' not in scroll
142
143


144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
def test_search_aggregated(elastic, example_search_data):
    """Page through aggregated dft results using the returned ``after`` value."""
    request = SearchRequest(domain='dft')

    first_page = request.execute_aggregated()
    after = first_page['aggregation']['after']
    assert first_page['aggregation']['total'] == 1
    assert len(first_page['results']) == 1
    first_result = first_page['results'][0]
    assert 'calc_id' in first_result
    assert 'upload_id' in first_result
    assert after is not None

    # Asking for what comes after the only result must give an empty page.
    last_page = request.execute_aggregated(after=after)
    aggregation = last_page['aggregation']
    assert aggregation['total'] == 1
    assert len(last_page['results']) == 0
    assert aggregation['after'] is None


def test_search_aggregated_includes(elastic, example_search_data):
    """A key requested via ``includes`` must show up in the first aggregated result."""
    response = SearchRequest(domain='dft').execute_aggregated(includes=['with_embargo'])
    first_result = response['results'][0]
    assert 'with_embargo' in first_result


166
167
168
169
170
171
172
173
174
def test_domain(elastic, example_ems_search_data):
    """Check domain scoping and the ``ems.method`` statistic on the ems example entry.

    Requests scoped to ems must find something; requests narrowed with
    ``.domain('dft')`` must come back empty.
    """
    def n_scanned(request):
        # Number of items a request yields when scanned.
        return len(list(request.execute_scan()))

    assert n_scanned(SearchRequest(domain='ems')) > 0
    assert n_scanned(SearchRequest(domain='ems').domain()) > 0
    assert n_scanned(SearchRequest(domain='ems').domain('dft')) == 0
    assert n_scanned(SearchRequest(domain='dft').domain('dft')) == 0

    def assert_method_statistics(results):
        # Both ways of requesting the statistic must report the same method value.
        statistics = results['statistics']
        assert 'ems.method' in statistics
        assert 'electron energy loss spectroscopy' in statistics['ems.method']

    assert_method_statistics(
        SearchRequest(domain='ems').statistic('ems.method', size=10).execute())
    assert_method_statistics(
        SearchRequest(domain='ems').statistics(['ems.method']).execute())
181
182


183
184
185
186
def assert_metrics(container, metrics_names):
    """Assert that ``container`` counts one code run and holds every given metric.

    Args:
        container: Mapping with a ``'code_runs'`` key and one key per metric.
        metrics_names: Iterable of metric names expected as keys of ``container``.

    Raises:
        AssertionError: If ``code_runs`` is not 1 or a metric name is missing.
        KeyError: If ``container`` has no ``'code_runs'`` key.
    """
    code_runs = container['code_runs']
    assert code_runs == 1

    missing = [name for name in metrics_names if name not in container]
    assert not missing
187

188
189

def test_search_statistics(elastic, example_search_data):
    """Request a ``dft.system`` statistic and a date histogram with all known metrics."""
    for metric_name in ('authors', 'datasets', 'unique_entries'):
        assert metric_name in search.metrics.keys()

    use_metrics = search.metrics.keys()

    request = SearchRequest(domain='dft')
    request = request.statistic('dft.system', size=10, metrics_to_use=use_metrics)
    request = request.date_histogram(metrics_to_use=use_metrics)
    results = request.execute()

    statistics = results['statistics']
    assert 'results' not in results
    system_statistics = statistics['dft.system']
    assert 'bulk' in system_statistics
    assert 'date_histogram' in statistics

    assert_metrics(system_statistics['bulk'], use_metrics)

    # Check the first bucket of the date histogram.
    date_histogram = statistics['date_histogram']
    first_bucket = list(date_histogram.keys())[0]
    assert_metrics(date_histogram[first_bucket], use_metrics)

    # For the totals only the code_runs count is checked.
    assert_metrics(statistics['total']['all'], [])

    assert 'quantities' not in results
211

212

213
214
215
216
217
218
219
220
def test_suggest_statistics(elastic, example_search_data):
    """The ``include`` argument limits which ``dft.system`` values are reported."""
    # (include argument, expected number of reported values)
    cases = (('ulk', 1), ('not_ulk', 0))
    for include, expected_count in cases:
        request = SearchRequest(domain='dft').statistic(
            'dft.system', include=include, size=2)
        results = request.execute()
        assert len(results['statistics']['dft.system']) == expected_count


221
222
223
224
225
226
227
228
229
230
def test_global_statistics(elastic, example_search_data):
    """Global statistics must be returned and contain all four counters."""
    response = SearchRequest().global_statistics().execute()
    statistics = response.get('global_statistics')
    assert statistics is not None

    for counter in ('n_entries', 'n_uploads', 'n_calculations', 'n_quantities'):
        assert statistics.get(counter) is not None


231
def test_search_totals(elastic, example_search_data):
    """Requesting totals yields exactly one statistic and neither results nor quantities."""
    use_metrics = search.metrics.keys()

    results = SearchRequest(domain='dft').totals(metrics_to_use=use_metrics).execute()

    statistics = results['statistics']
    assert 'results' not in results
    assert len(statistics) == 1

    # For the totals only the code_runs count is checked.
    totals = statistics['total']['all']
    assert_metrics(totals, [])

    assert 'quantities' not in results


246
def test_search_exclude(elastic, example_search_data):
    """``atoms`` is in the results by default and gone once it is excluded."""
    default_items = SearchRequest().execute_paginated()['results']
    for item in default_items:
        flat_item = search.flat(item)
        assert 'atoms' in flat_item

    reduced_items = SearchRequest().exclude('atoms').execute_paginated()['results']
    for item in reduced_items:
        flat_item = search.flat(item)
        assert 'atoms' not in flat_item
252
253


254
255
def test_search_include(elastic, example_search_data):
    """Including only ``calc_id`` keeps that key and leaves ``atoms`` out."""
    default_items = SearchRequest().execute_paginated()['results']
    for item in default_items:
        flat_item = search.flat(item)
        assert 'atoms' in flat_item

    included_items = SearchRequest().include('calc_id').execute_paginated()['results']
    for item in included_items:
        flat_item = search.flat(item)
        assert 'atoms' not in flat_item
        assert 'calc_id' in flat_item


264
265
@pytest.mark.parametrize("order_by", [None, 'upload_id'])
def test_search_quantity(
        elastic, normalized: datamodel.EntryArchive, test_user: datamodel.User,
        other_test_user: datamodel.User, order_by: str):
    """Request one ``authors`` value with one example, with and without ``order_by``.

    Two entries are indexed from the same metadata object; they differ in
    ``calc_id`` and ``uploader``.
    """
    metadata = datamodel.EntryMetadata(
        domain='dft', upload_id='test upload id', calc_id='test id')
    metadata.apply_domain_metadata(normalized)
    metadata.uploader = test_user.user_id
    create_entry(metadata)

    # Reuse the same metadata object for the second entry.
    metadata.calc_id = 'other test id'
    metadata.uploader = other_test_user.user_id
    create_entry(metadata)
    refresh_index()

    quantity_request = SearchRequest(domain='dft').quantity(
        name='authors', size=1, examples=1, order_by=order_by)
    results = quantity_request.execute()

    authors = results['quantities']['authors']
    names = list(authors['values'].keys())
    assert len(names) == 1
    name = names[0]
    examples = authors['values'][name]['examples']
    assert len(examples) == 1

    # Without order_by the value itself is expected as "after"; otherwise the
    # example's value of the order_by key.
    if order_by is None:
        expected_after = name
    else:
        expected_after = examples[0][order_by]
    assert authors['after'] == expected_after
291
292
293
294
295
296


def refresh_index():
    """Call ``indices.refresh`` on the elastic client for the configured index name."""
    index_name = config.elastic.index_name
    infrastructure.elastic_client.indices.refresh(index=index_name)


297
def create_entry(entry_metadata: datamodel.EntryMetadata):
    """Index ``entry_metadata`` and assert it can be found by its ``calc_id``.

    Returns:
        Whatever ``entry_metadata.a_elastic.index()`` returned.
    """
    indexed = entry_metadata.a_elastic.index()
    assert_entry(entry_metadata.calc_id)
    return indexed
301
302
303


def assert_entry(calc_id):
    """Assert that exactly one indexed document exists for ``calc_id``.

    The index is refreshed first. The document is then looked up directly and
    through a term query on ``calc_id``.
    """
    refresh_index()
    assert entry_document.get(calc_id) is not None

    by_calc_id = Q('term', calc_id=calc_id)
    query = entry_document.search().query(by_calc_id)[0:10]
    assert query.count() == 1

    hits = [hit.to_dict() for hit in query]
    assert hits[0]['calc_id'] == calc_id
312
313


314
315
316
def assert_search_upload(
        upload_entries: Iterable[datamodel.EntryMetadata],
        additional_keys: Iterable[str] = (), **kwargs):
    """Assert that the index holds one well-formed document per upload entry.

    The index is refreshed, then a ``match_all`` query limited to the first
    ten documents is checked hit by hit.

    Args:
        upload_entries: The entries expected in the index; only their number
            is compared with the hit count.
        additional_keys: Keys that must be present in every hit with a value
            other than ``config.services.unavailable_value``.
        **kwargs: Key/value pairs that every flattened hit must match exactly.

    Raises:
        AssertionError: If the hit count or any hit does not meet the
            expectations above.
    """
    required_keys = ['calc_id', 'upload_id', 'mainfile', 'calc_hash']
    # Materialise once: the keys are iterated again for every hit, which would
    # silently skip the checks for a one-shot iterable after the first hit.
    additional_keys = list(additional_keys)

    refresh_index()
    search_results = entry_document.search().query('match_all')[0:10]

    # Ask elastic for the count only once and reuse it below.
    n_hits = search_results.count()
    assert n_hits == len(list(upload_entries))

    if n_hits == 0:
        return

    for hit in search_results:
        flat_hit = search.flat(hit.to_dict())

        for key, value in kwargs.items():
            assert flat_hit.get(key, None) == value

        if 'pid' in flat_hit:
            assert int(flat_hit.get('pid')) > 0

        for key in required_keys:
            assert key in flat_hit

        for key in additional_keys:
            assert key in flat_hit
            assert flat_hit[key] != config.services.unavailable_value

        for coauthor in flat_hit.get('coauthors', []):
            assert coauthor.get('name', None) is not None
340
341
342


if __name__ == '__main__':
    # Script mode: fill the index with generated example entries.
    # NOTE(review): the relative import below only resolves when this file is
    # run as a module of its package, not as a plain script path; confirm
    # how it is meant to be invoked.
    from .test_datamodel import generate_calc
    from elasticsearch.helpers import bulk
    import sys

    print('Generate index with random example calculation data. First arg is number of items')
    infrastructure.setup_mongo()
    infrastructure.setup_elastic()

    # Number of entries to generate; the first command line argument overrides the default.
    n = int(sys.argv[1]) if len(sys.argv) > 1 else 100

    def gen_data():
        # Yield one bulk action dict per generated entry.
        for pid in range(n):
            entry_metadata = generate_calc(pid)
            document = entry_document.from_entry_metadata(entry_metadata)
            yield document.to_dict(include_meta=True)

    bulk(infrastructure.elastic_client, gen_data())