encyclopedia.py 53 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
"""
16
The encyclopedia API of the nomad@FAIRDI APIs.
17
"""
18
import re
19
import math
20
import json
21
import numpy as np
22

23
24
from flask_restplus import Resource, abort, fields, marshal
from flask import request
25
from elasticsearch_dsl import Search, Q, A
26
from elasticsearch_dsl.utils import AttrDict
27

28
from nomad import config, files, infrastructure
29
from nomad.units import ureg
Lauri Himanen's avatar
Lauri Himanen committed
30
from nomad.atomutils import get_hill_decomposition
31
from nomad.datamodel.datamodel import EntryArchive
32
from .api import api
33
from .auth import authenticate
34

35
# API namespace under which the encyclopedia resources below are registered
# through their @ns.route decorators.
ns = api.namespace("encyclopedia", description="Access encyclopedia metadata.")
36
37
# Matches one component of a chemical formula: group 1 is an element symbol
# (an uppercase letter optionally followed by one lowercase letter), group 2
# is the optional integer count (empty string when no count is given).
re_formula = re.compile(r"([A-Z][a-z]?)(\d*)")

38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# Maps the material property names exposed by the API (keys) to the
# dot-separated field paths (values) that are read from the ElasticSearch
# entry documents. The keys are also used as the allowed values of the
# "property" query argument, and the values as the "_source" includes of the
# material searches.
material_prop_map = {
    # General
    "material_id": "encyclopedia.material.material_id",
    "formula": "encyclopedia.material.formula",
    "formula_reduced": "encyclopedia.material.formula_reduced",
    "system_type": "encyclopedia.material.material_type",
    # Bulk only
    "has_free_wyckoff_parameters": "encyclopedia.material.bulk.has_free_wyckoff_parameters",
    "strukturbericht_designation": "encyclopedia.material.bulk.strukturbericht_designation",
    # NOTE(review): unlike its neighbours this path is not below ".bulk" --
    # confirm that listing it in the "Bulk only" section is intended.
    "material_name": "encyclopedia.material.material_name",
    "bravais_lattice": "encyclopedia.material.bulk.bravais_lattice",
    "crystal_system": "encyclopedia.material.bulk.crystal_system",
    "point_group": "encyclopedia.material.bulk.point_group",
    "space_group_number": "encyclopedia.material.bulk.space_group_number",
    "space_group_international_short_symbol": "encyclopedia.material.bulk.space_group_international_short_symbol",
    "structure_prototype": "encyclopedia.material.bulk.structure_prototype",
    "structure_type": "encyclopedia.material.bulk.structure_type",
}
56
57


58
59
60
61
62
63
64
65
66
67
68
69
def rgetattr(obj, attr_name):
    """Resolves a (possibly nested) key that is given as a dot-separated
    string.

    Every dot-separated part of ``attr_name`` is looked up with item access
    (``obj[part]``), starting from ``obj`` and descending into the value that
    was found for the previous part.

    Args:
        obj: Object that supports item access with string keys, e.g. a dict.
        attr_name: Dot-separated path, e.g. "encyclopedia.material.formula".

    Returns:
        The value found at the end of the path, or None if some part of the
        path is missing or an intermediate value cannot be indexed with a
        string key (for example because it is None).
    """
    try:
        for attr in attr_name.split("."):
            obj = obj[attr]
    except (KeyError, TypeError):
        # KeyError: the key does not exist. TypeError: an intermediate value
        # (None, a number, a list, ...) cannot be indexed by a string key. In
        # both cases the requested path simply has no value.
        return None
    return obj


70
def get_es_doc_values(es_doc, mapping, keys=None):
    """Used to form a material definition for "materials/<material_id>" from
    the given ElasticSearch root document.

    Args:
        es_doc: The document from which the values are read.
        mapping: Dictionary that maps each result key to the dot-separated
            path of the corresponding value inside ``es_doc``.
        keys: The keys of ``mapping`` to extract. If None, every key of
            ``mapping`` is extracted.

    Returns:
        Dictionary with one item per requested key. The value is whatever
        ``rgetattr`` returns for the mapped path, i.e. None when the path
        cannot be resolved.

    Raises:
        KeyError: If a requested key is not present in ``mapping``.
    """
    if keys is None:
        keys = mapping.keys()

    return {key: rgetattr(es_doc, mapping[key]) for key in keys}


86
87
88
89
90
91
92
93
94
95
96
def get_enc_filter():
    """Builds the term filters shared by the encyclopedia queries.

    The filters only let through entries that are published, are not under
    embargo and whose encyclopedia status is "success". A new list is created
    on every call, so callers may extend the returned list freely.
    """
    required_terms = (
        ("published", True),
        ("with_embargo", False),
        ("encyclopedia__status", "success"),
    )
    return [Q("term", **{field: value}) for field, value in required_terms]


97
# Parser for the query arguments of "materials/<material_id>". The allowed
# property names are derived from material_prop_map.
material_query = api.parser()
material_query.add_argument(
    "property",
    type=str,
    choices=tuple(material_prop_map.keys()),
    help="Optional single property to retrieve for the given material. If not specified, all properties will be returned.",
    location="args"
)

# Response model for a single material. Apart from "n_matches" the field names
# correspond to the keys of material_prop_map.
material_result = api.model("material_result", {
    # General
    "material_id": fields.String,
    "formula": fields.String,
    "formula_reduced": fields.String,
    "system_type": fields.String,
    "n_matches": fields.Integer,
    # Bulk only
    "has_free_wyckoff_parameters": fields.Boolean,
    "strukturbericht_designation": fields.String,
    "material_name": fields.String,
    "bravais_lattice": fields.String,
    "crystal_system": fields.String,
    "point_group": fields.String,
    "space_group_number": fields.Integer,
    "space_group_international_short_symbol": fields.String,
    "structure_prototype": fields.String,
    "structure_type": fields.String,
})
124
125


126
@ns.route("/materials/<string:material_id>")
class EncMaterialResource(Resource):
    @api.response(404, "The material does not exist")
    @api.response(200, "Metadata send", fields.Raw)
    @api.doc("material/<material_id>")
    @api.expect(material_query)
    @api.marshal_with(material_result, skip_none=True)
    def get(self, material_id):
        """Used to retrieve basic information related to the specified
        material.
        """
        # Parse request arguments. Either a single property or all known
        # properties are returned.
        args = material_query.parse_args()
        prop = args.get("property", None)
        if prop is not None:
            keys = [prop]
            es_keys = [material_prop_map[prop]]
        else:
            keys = list(material_prop_map.keys())
            es_keys = list(material_prop_map.values())

        # Find the first public entry with this material id and take
        # information from there. In principle all other entries should have
        # the same information.
        s = Search(index=config.elastic.index_name)
        query = Q(
            "bool",
            filter=get_enc_filter() + [
                Q("term", encyclopedia__material__material_id=material_id),
            ]
        )
        s = s.query(query)

        # Only one representative entry is returned by collapsing the results.
        s = s.extra(**{
            "_source": {"includes": es_keys},
            "size": 1,
            "collapse": {"field": "encyclopedia.material.material_id"},
        })
        response = s.execute()

        # No such material. The length is checked explicitly because it
        # reflects the number of returned hits.
        if len(response) == 0:
            abort(404, message="There is no material {}".format(material_id))

        # Add values from ES entry
        entry = response[0]
        result = get_es_doc_values(entry, material_prop_map, keys)

        return result, 200


178
# Closed numeric interval used by the range filters; both ends are optional.
range_query = api.model("range_query", {
    "max": fields.Float,
    "min": fields.Float,
})

# Key used for continuing a composite aggregation from a previous page.
materials_after = api.model("materials_after", {
    "materials": fields.String,
})

# Request body of the "materials" search.
materials_query = api.model("materials_input", {
    "search_by": fields.Nested(api.model("search_query", {
        "exclusive": fields.Boolean(default=False),
        "formula": fields.String,
        "element": fields.String,
        "page": fields.Integer(default=1),
        "after": fields.Nested(materials_after, allow_null=True),
        "per_page": fields.Integer(default=25),
        "pagination": fields.Boolean,
    })),
    "material_name": fields.List(fields.String),
    "structure_type": fields.List(fields.String),
    "space_group_number": fields.List(fields.Integer),
    "system_type": fields.List(fields.String),
    "crystal_system": fields.List(fields.String),
    "band_gap": fields.Nested(range_query, description="Band gap range in eV."),
    "band_gap_direct": fields.Boolean,
    "has_band_structure": fields.Boolean,
    "has_dos": fields.Boolean,
    "has_fermi_surface": fields.Boolean,
    "has_thermal_properties": fields.Boolean,
    "functional_type": fields.List(fields.String),
    "basis_set_type": fields.List(fields.String),
    "code_name": fields.List(fields.String),
    "mass_density": fields.Nested(range_query, description="Mass density range in kg / m ** 3."),
})

# Pagination information returned together with the search results.
pages_result = api.model("page_info", {
    "per_page": fields.Integer,
    "total": fields.Integer,
    "page": fields.Integer,
    "pages": fields.Integer,
    "after": fields.Nested(materials_after),
})

# Response body of the "materials" search.
materials_result = api.model("materials_result", {
    "total_results": fields.Integer(allow_null=False),
    "results": fields.List(fields.Nested(material_result, skip_none=True)),
    "pages": fields.Nested(pages_result, skip_none=True),
    "es_query": fields.String(allow_null=False),
})


227
@ns.route("/materials")
class EncMaterialsResource(Resource):
    @api.response(404, "No materials found")
    @api.response(400, "Bad request")
    @api.response(200, "Metadata send", fields.Raw)
    @api.expect(materials_query, validate=False)
    @api.marshal_with(materials_result, skip_none=True)
    @api.doc("materials")
    def post(self):
        """Used to query a list of materials with the given search options.
        """
        # Get query parameters as json
        try:
            data = marshal(request.get_json(), materials_query)
        except Exception as e:
            abort(400, message=str(e))

        # Validate the pagination before any query is run: a non-positive
        # per_page would cause a division by zero when the page count is
        # calculated and a non-positive page would produce a negative offset.
        search_by = data["search_by"]
        page = search_by["page"]
        per_page = search_by["per_page"]
        after = search_by["after"]
        if page < 1 or per_page < 1:
            abort(400, message="Both 'page' and 'per_page' must be positive integers.")

        # The queries that correspond to AND queries typically need to access
        # multiple calculations at once to find the material ids that
        # correspond to the query. To implement this behaviour we need to run
        # an initial aggregation that checks that the requested properties are
        # present for a material. This is a very crude solution that does not
        # scale to complex queries, but I'm not sure we can do much better
        # until we have a separate index for materials.
        property_map = {
            "has_thermal_properties": "encyclopedia.properties.thermodynamical_properties",
            "has_band_structure": "encyclopedia.properties.electronic_band_structure",
            "has_dos": "encyclopedia.properties.electronic_dos",
            "has_fermi_surface": "encyclopedia.properties.fermi_surface",
        }
        requested_properties = []
        # The size is set very large because all the results need to be
        # returned. We cannot get the results in a paginated way with composite
        # aggregation, because pipeline aggregations are not compatible with
        # them.
        agg_parent = A("terms", field="encyclopedia.material.material_id", size=5000000)
        for key, value in property_map.items():
            if data[key] is True:
                agg = A("filter", exists={"field": value})
                agg_parent.bucket(key, agg)
                requested_properties.append(key)
        if len(requested_properties) > 1:
            bool_query = Q(
                "bool",
                filter=get_enc_filter(),
            )
            s = Search(index=config.elastic.index_name)
            s = s.query(bool_query)
            s.aggs.bucket("materials", agg_parent)
            # Only the material buckets in which every requested property has
            # at least one matching entry are kept.
            buckets_path = {x: "{}._count".format(x) for x in requested_properties}
            script = " && ".join(["params.{} > 0".format(x) for x in requested_properties])
            agg_parent.pipeline("selector", A(
                "bucket_selector",
                buckets_path=buckets_path,
                script=script,
            ))
            s = s.extra(**{
                "size": 0,
            })
            response = s.execute()
            material_ids = [x["key"] for x in response.aggs.materials.buckets]
            if len(material_ids) == 0:
                abort(404, message="No materials found for the given search criteria or pagination.")

        # After finding the material ids that fill the AND conditions, continue
        # with a simple OR query.
        filters = get_enc_filter()
        must_nots = []
        musts = []

        def add_terms_filter(source, target, query_type="terms"):
            if data[source] is not None:
                filters.append(Q(query_type, **{target: data[source]}))

        if len(requested_properties) > 1:
            filters.append(Q("terms", encyclopedia__material__material_id=material_ids))
        add_terms_filter("material_name", "encyclopedia.material.material_name")
        add_terms_filter("structure_type", "encyclopedia.material.bulk.structure_type")
        add_terms_filter("space_group_number", "encyclopedia.material.bulk.space_group_number")
        add_terms_filter("system_type", "encyclopedia.material.material_type")
        add_terms_filter("crystal_system", "encyclopedia.material.bulk.crystal_system")
        add_terms_filter("band_gap_direct", "encyclopedia.properties.band_gap_direct", query_type="term")
        add_terms_filter("functional_type", "encyclopedia.method.functional_type")
        add_terms_filter("basis_set_type", "dft.basis_set")
        add_terms_filter("code_name", "dft.code_name")

        # Add exists filters if only one property was requested. The initial
        # aggregation will handle multiple simultaneous properties.
        def add_exists_filter(source, target):
            param = data[source]
            if param is not None:
                query = Q("exists", field=target)
                if param is True:
                    filters.append(query)
                elif param is False:
                    must_nots.append(query)

        if len(requested_properties) == 1:
            prop_name = requested_properties[0]
            add_exists_filter(prop_name, property_map[prop_name])

        # Add range filters
        def add_range_filter(source, target, source_unit=None, target_unit=None):
            param = data[source]
            query_dict = {}
            if param["min"] is not None:
                if source_unit is None and target_unit is None:
                    gte = param["min"]
                else:
                    gte = (param["min"] * source_unit).to(target_unit).magnitude
                query_dict["gte"] = gte
            if param["max"] is not None:
                if source_unit is None and target_unit is None:
                    lte = param["max"]
                else:
                    lte = (param["max"] * source_unit).to(target_unit).magnitude
                query_dict["lte"] = lte
            if len(query_dict) != 0:
                query = Q("range", **{target: query_dict})
                filters.append(query)

        # The band gap limits are converted from eV to J before they are used
        # in the range filter.
        add_range_filter("band_gap", "encyclopedia.properties.band_gap", ureg.eV, ureg.J)
        add_range_filter("mass_density", "encyclopedia.properties.mass_density")

        # Create query for elements or formula
        formula = search_by["formula"]
        elements = search_by["element"]
        exclusive = search_by["exclusive"]

        if formula is not None:
            # Here we determine a list of atom types. The types may occur
            # multiple times and at multiple places.
            element_list = []
            matches = re_formula.finditer(formula)
            for match in matches:
                groups = match.groups()
                symbol = groups[0]
                count = groups[1]
                if symbol != "":
                    if count == "":
                        element_list.append(symbol)
                    else:
                        element_list += [symbol] * int(count)

            # The given list of species is reformatted with the Hill system
            # into a query string. The counts are reduced by the greatest
            # common divisor.
            names, reduced_counts = get_hill_decomposition(element_list, reduced=True)
            query_string = []
            for name, count in zip(names, reduced_counts):
                if count == 1:
                    query_string.append(name)
                else:
                    query_string.append("{}{}".format(name, int(count)))
            query_string = " ".join(query_string)

            # With exclusive search we look for exact match
            if exclusive:
                filters.append(Q("term", **{"encyclopedia.material.species_and_counts.keyword": query_string}))
            # With non-exclusive search we look for match that includes at
            # least all parts of the formula, possibly even more.
            else:
                musts.append(Q(
                    "match",
                    encyclopedia__material__species_and_counts={"query": query_string, "operator": "and"}
                ))
        elif elements is not None:
            # The given list of species is reformatted with the Hill system into a query string
            species, _ = get_hill_decomposition(elements.split(","))
            query_string = " ".join(species)

            # With exclusive search we look for exact match
            if exclusive:
                filters.append(Q("term", **{"encyclopedia.material.species.keyword": query_string}))
            # With non-exclusive search we look for match that includes at
            # least all species, possibly even more.
            else:
                musts.append(Q(
                    "match",
                    encyclopedia__material__species={"query": query_string, "operator": "and"}
                ))

        bool_query = Q(
            "bool",
            filter=filters,
            must_not=must_nots,
            must=musts,
        )

        # The top query filters out entries based on the user query
        s = Search(index=config.elastic.index_name)
        s = s.query(bool_query)

        # 1: The paginated approach: No way to know the amount of materials,
        # but can return aggregation results in a quick fashion including
        # the number of calculation entries per material.
        mode = "collapse"
        if mode == "aggregation":
            # The materials are grouped by using three aggregations:
            # "Composite" to enable scrolling, "Terms" to enable selecting
            # by material_id and "Top Hits" to fetch a single
            # representative material document. Unnecessary fields are
            # filtered to reduce data transfer.
            terms_agg = A("terms", field="encyclopedia.material.material_id")
            composite_kwargs = {"sources": {"materials": terms_agg}, "size": per_page}

            # The number of matched materials is only requested on the first
            # search, not for each page.
            if after is not None:
                composite_kwargs["after"] = after
            else:
                cardinality_agg = A("cardinality", field="encyclopedia.material.material_id", precision_threshold=1000)
                s.aggs.metric("n_materials", cardinality_agg)

            composite_agg = A("composite", **composite_kwargs)
            composite_agg.metric("representative", A(
                "top_hits",
                size=1,
                _source={"includes": list(material_prop_map.values())},
            ))
            s.aggs.bucket("materials", composite_agg)

            # We ignore the top level hits and sort by reduced material formula.
            s = s.extra(**{
                "size": 0,
            })

            response = s.execute()
            materials = response.aggs.materials.buckets
            if len(materials) == 0:
                abort(404, message="No materials found for the given search criteria or pagination.")
            after_new = response.aggs.materials["after_key"]

            # Gather results from aggregations
            result_list = []
            keys = list(material_prop_map.keys())
            for material in materials:
                representative = material["representative"][0]
                mat_dict = get_es_doc_values(representative, material_prop_map, keys)
                mat_dict["n_matches"] = material.doc_count
                result_list.append(mat_dict)

            # Page information is incomplete for aggregations
            pages = {
                "page": page,
                "per_page": per_page,
                "after": after_new,
            }
            if after is None:
                n_materials = response.aggs.n_materials.value
                pages["total"] = n_materials

        # 2. Collapse approach. Quickly provides a list of materials
        # corresponding to the query, offers full pagination, the number of
        # matches per material needs to be requested with a separate query.
        elif mode == "collapse":
            s = Search(index=config.elastic.index_name)
            s = s.query(bool_query)

            # Add cardinality aggregation that gives out the total number of materials
            cardinality_agg = A("cardinality", field="encyclopedia.material.material_id", precision_threshold=1000)
            s.aggs.metric("n_materials", cardinality_agg)

            s = s.extra(**{
                "collapse": {"field": "encyclopedia.material.material_id"},
                "size": per_page,
                "from": (page - 1) * per_page,
                "sort": [{"encyclopedia.material.formula_reduced": {"order": "asc"}}],
                "explain": True,
            })

            # Execute query
            response = s.execute()

            # No matches
            if len(response) == 0:
                abort(404, message="No materials found for the given search criteria or pagination.")

            # Gather number of entries per material with a separate query.
            # The terms aggregation only returns ten buckets by default, so
            # its size must be raised to the number of requested materials.
            # Otherwise materials would be missing from the lookup below
            # whenever a page holds more than ten of them. The hits of this
            # query are not needed, only the aggregation.
            material_ids = [x.encyclopedia.material.material_id for x in response]
            count_query = Q(
                "bool",
                filter=Q("terms", encyclopedia__material__material_id=material_ids),
            )
            s2 = Search(index=config.elastic.index_name)
            s2 = s2.query(count_query)
            s2.aggs.bucket("n_matches", A(
                "terms",
                field="encyclopedia.material.material_id",
                size=len(material_ids),
            ))
            s2 = s2.extra(**{
                "size": 0,
            })
            response2 = s2.execute()
            matmap = {x.key: x.doc_count for x in response2.aggs.n_matches}

            # Loop over materials
            result_list = []
            keys = list(material_prop_map.keys())
            for material in response:
                # Get values from the collapsed doc
                mat_result = get_es_doc_values(material, material_prop_map, keys)
                mat_id = material.encyclopedia.material.material_id
                mat_result["n_matches"] = matmap[mat_id]
                result_list.append(mat_result)

            # Full page information available for collapse. The number of
            # pages is based on the number of materials: the total hit count
            # of a collapsed search still counts the individual entries and
            # would overestimate the number of pages.
            n_materials = response.aggs.n_materials.value
            pages = {
                "page": page,
                "per_page": per_page,
                "pages": math.ceil(n_materials / per_page),
                "total": n_materials,
            }

        result = {
            "results": result_list,
            "pages": pages,
        }
        return result, 200
544
545


546
# Response model for the calculation groups of a material. Both fields are
# filled with dictionaries that map a group id to the list of calculation ids
# in that group.
groups_result = api.model("groups_result", {
    "groups_eos": fields.Raw,
    "groups_par": fields.Raw,
})


552
@ns.route("/materials/<string:material_id>/groups")
class EncGroupsResource(Resource):
    @api.response(404, "Material not found")
    @api.response(400, "Bad request")
    @api.response(200, "Metadata send", fields.Raw)
    @api.marshal_with(groups_result)
    @api.doc("enc_materials")
    def get(self, material_id):
        """Returns a summary of the calculation groups that were identified for
        this material.
        """
        # Find entries for the given material, which have EOS or parameter
        # variation hashes set.
        bool_query = Q(
            "bool",
            filter=get_enc_filter() + [Q("term", encyclopedia__material__material_id=material_id)],
            must=[
                Q("exists", field="encyclopedia.properties.energies.energy_total"),
                Q("exists", field="encyclopedia.material.idealized_structure.cell_volume"),
            ],
            should=[
                Q("exists", field="encyclopedia.method.group_eos_id"),
                Q("exists", field="encyclopedia.method.group_parametervariation_id"),
            ],
            minimum_should_match=1,  # At least one of the should query must match
        )

        s = Search(index=config.elastic.index_name)
        s = s.query(bool_query)

        # Bucket the calculations by the group hashes. Only create a bucket if an
        # above-minimum number of documents are found.
        group_eos_bucket = A("terms", field="encyclopedia.method.group_eos_id", min_doc_count=4)
        group_param_bucket = A("terms", field="encyclopedia.method.group_parametervariation_id", min_doc_count=2)
        # For each group the calculation ids are returned sorted by ascending
        # total energy, limited to 100 calculations per group.
        calc_aggregation = A(
            "top_hits",
            _source={"includes": ["calc_id"]},
            sort=[{"encyclopedia.properties.energies.energy_total": {"order": "asc"}}],
            size=100,
        )
        group_eos_bucket.bucket("calculations", calc_aggregation)
        group_param_bucket.bucket("calculations", calc_aggregation)
        s.aggs.bucket("groups_eos", group_eos_bucket)
        s.aggs.bucket("groups_param", group_param_bucket)

        # We ignore the top level hits
        s = s.extra(**{
            "size": 0,
        })

        # Collect information for each group from the aggregations
        response = s.execute()
        groups_eos = {group.key: [calc.calc_id for calc in group.calculations.hits] for group in response.aggs.groups_eos.buckets}
        groups_param = {group.key: [calc.calc_id for calc in group.calculations.hits] for group in response.aggs.groups_param.buckets}

        # Return results
        result = {
            "groups_eos": groups_eos,
            "groups_par": groups_param,
        }

        return result, 200


# Response model for a single calculation group. The three lists are built
# from the same sequence of hits, so the items at one index belong to the
# same calculation.
group_result = api.model("group_result", {
    "calculations": fields.List(fields.String),
    "energies": fields.List(fields.Float),
    "volumes": fields.List(fields.Float),
})
# ElasticSearch "_source" filter that restricts the returned documents to the
# fields that are needed for filling a group_result.
group_source = {
    "includes": [
        "calc_id",
        "encyclopedia.properties.energies.energy_total",
        "encyclopedia.material.idealized_structure.cell_volume",
    ]
}


@ns.route("/materials/<string:material_id>/groups/<string:group_type>/<string:group_id>")
class EncGroupResource(Resource):
    @api.response(404, "Group not found")
    @api.response(400, "Bad request")
    @api.response(200, "Metadata send", fields.Raw)
    @api.marshal_with(group_result)
    @api.doc("enc_group")
    def get(self, material_id, group_type, group_id):
        """Used to query detailed information for a specific calculation group.
        """
        # Find entries for the given material, which have EOS or parameter
        # variation hashes set.
        if group_type == "eos":
            group_id_source = "encyclopedia.method.group_eos_id"
        elif group_type == "par":
            group_id_source = "encyclopedia.method.group_parametervariation_id"
        else:
            abort(400, message="Unsupported group type.")

        bool_query = Q(
            "bool",
            filter=get_enc_filter() + [
                Q("term", encyclopedia__material__material_id=material_id),
                Q("term", **{group_id_source: group_id}),
            ],
        )

        s = Search(index=config.elastic.index_name)
        s = s.query(bool_query)

        # calc_id and energy should be extracted for each matched document. The
        # documents are sorted by energy so that the minimum energy one can be
        # easily extracted. A maximum request size is set in order to limit the
        # result size. ES also has an index-level property
        # "index.max_inner_result_window" that limits the number of results
        # that an inner result can contain.
        energy_aggregation = A(
            "top_hits",
            _source=group_source,
            sort=[{"encyclopedia.properties.energies.energy_total": {"order": "asc"}}],
            size=100,
        )
        s.aggs.bucket("groups_eos", energy_aggregation)

        # We ignore the top level hits
        s = s.extra(**{
            "size": 0,
        })

        # Collect information for each group from the aggregations
        response = s.execute()

        hits = response.aggs.groups_eos.hits
        calculations = [doc.calc_id for doc in hits]

        # Without any matching calculation the requested group does not exist
        # for this material, which is reported with the documented 404 instead
        # of an empty result.
        if not calculations:
            abort(404, message="There is no group {} of type {} for material {}".format(
                group_id, group_type, material_id))

        energies = [doc.encyclopedia.properties.energies.energy_total for doc in hits]
        volumes = [doc.encyclopedia.material.idealized_structure.cell_volume for doc in hits]
        group_dict = {
            "calculations": calculations,
            "energies": energies,
            "volumes": volumes,
        }

        return group_dict, 200
693
694


695
696
697
698
# Maps the property names for which suggestions can be requested to the
# ElasticSearch fields from which the suggested values are collected.
suggestions_map = {
    "code_name": "dft.code_name",
    "structure_type": "encyclopedia.material.bulk.structure_type",
}

# Parser for the query arguments of "suggestions". The allowed property names
# are derived from suggestions_map so that the two cannot drift apart.
suggestions_query = api.parser()
suggestions_query.add_argument(
    "property",
    type=str,
    choices=tuple(suggestions_map.keys()),
    help="The property name for which suggestions are returned.",
    location="args"
)

# Response model for "suggestions": the values are returned under the name of
# the requested property.
suggestions_result = api.model("suggestions_result", {
    "code_name": fields.List(fields.String),
    "structure_type": fields.List(fields.String),
})


@ns.route("/suggestions")
class EncSuggestionsResource(Resource):
    @api.response(404, "Suggestion not found")
    @api.response(400, "Bad request")
    @api.response(200, "Metadata send", fields.Raw)
    @api.expect(suggestions_query, validate=False)
    @api.marshal_with(suggestions_result, skip_none=True)
    @api.doc("enc_suggestions")
    def get(self):
        """Returns the unique values that are found for the requested
        property.
        """
        # Parse request arguments
        args = suggestions_query.parse_args()
        prop = args.get("property", None)

        # The argument is not marked as required in the parser, so a request
        # without it arrives here with None. Report this as a bad request
        # instead of failing in the field lookup below.
        if prop is None:
            abort(400, message="The query parameter 'property' is required.")

        # Use aggregation to return all unique terms for the requested field.
        # Without using composite aggregations there is a size limit for the
        # number of aggregation buckets. This should, however, not be a problem
        # since the number of unique values is low for all supported properties.
        s = Search(index=config.elastic.index_name)
        query = Q(
            "bool",
            filter=get_enc_filter()
        )
        s = s.query(query)
        s = s.extra(**{
            "size": 0,
        })

        terms_agg = A("terms", field=suggestions_map[prop])
        s.aggs.bucket("suggestions", terms_agg)

        # Gather unique values into a list
        response = s.execute()
        suggestions = [x.key for x in response.aggs.suggestions.buckets]

        return {prop: suggestions}, 200
749
750
751
752
753
754
755
756


# Maps the keys reported for each calculation to the path in the ES document
# that the value is read from. The "has_*" keys point to the property whose
# presence is converted into a boolean by the calculations endpoint.
calc_prop_map = {
    "calc_id": "calc_id",
    "code_name": "dft.code_name",
    "code_version": "dft.code_version",
    "functional_type": "encyclopedia.method.functional_type",
    "basis_set_type": "dft.basis_set",
    "core_electron_treatment": "encyclopedia.method.core_electron_treatment",
    "run_type": "encyclopedia.calculation.calculation_type",
    "has_dos": "encyclopedia.properties.electronic_dos",
    "has_band_structure": "encyclopedia.properties.electronic_band_structure",
    "has_thermal_properties": "encyclopedia.properties.thermodynamical_properties",
    "has_phonon_dos": "encyclopedia.properties.phonon_dos",
    "has_phonon_band_structure": "encyclopedia.properties.phonon_band_structure",
}
# Model for a single calculation in the calculation listing of a material.
calculation_result = api.model("calculation_result", {
    "calc_id": fields.String,
    "code_name": fields.String,
    "code_version": fields.String,
    "functional_type": fields.String,
    "basis_set_type": fields.String,
    "core_electron_treatment": fields.String,
    "run_type": fields.String,
    "has_dos": fields.Boolean,
    "has_band_structure": fields.Boolean,
    "has_thermal_properties": fields.Boolean,
    "has_phonon_dos": fields.Boolean,
    "has_phonon_band_structure": fields.Boolean,
})

# Model for the calculation ids chosen as representative for each property.
representatives_result = api.model("representatives_result", {
    "idealized_structure": fields.String,
    "electronic_band_structure": fields.String,
    "electronic_dos": fields.String,
    "thermodynamical_properties": fields.String,
})

# Model for the full calculation listing of a material.
calculations_result = api.model("calculations_result", {
    "total_results": fields.Integer,
    "pages": fields.Nested(pages_result),
    "results": fields.List(fields.Nested(calculation_result)),
    "representatives": fields.Nested(representatives_result, skip_none=True),
})


@ns.route("/materials/<string:material_id>/calculations")
class EncCalculationsResource(Resource):
    @api.response(404, "Material not found")
    @api.response(400, "Bad request")
    @api.response(200, "Metadata send", fields.Raw)
    @api.doc("enc_calculations")
    def get(self, material_id):
        """Used to return all calculations related to the given material. Also
        returns a representative calculation for each property shown in the
        overview page.

        Aborts with 404 if no calculation is found for the material.
        """
        s = Search(index=config.elastic.index_name)
        query = Q(
            "bool",
            filter=get_enc_filter() + [
                Q("term", encyclopedia__material__material_id=material_id),
            ]
        )
        s = s.query(query)

        # The query is filtered already on the ES side so we don't need to
        # transfer so much data.
        s = s.extra(**{
            "_source": {"includes": list(calc_prop_map.values()) + ["dft.xc_functional"]},
            "size": 10000,
            "from": 0,
        })
        response = s.execute()

        # No such material
        if len(response) == 0:
            abort(404, message="There is no material {}".format(material_id))

        def calc_score(entry):
            """Custom scoring function used to sort results by their
            "quality". Currently built to mimic the scoring that was used
            in the old Encyclopedia GUI. Primarily sorts by quality measure,
            ties are broken by calc_id in order to return consistent
            results.
            """
            score = 0
            functional_score = {
                "GGA": 100
            }
            code_score = {
                "FHI-aims": 3,
                "VASP": 2,
                "Quantum Espresso": 1,
            }
            code_name = entry.dft.code_name
            functional = entry.dft.xc_functional
            has_bs = rgetattr(entry, "encyclopedia.properties.electronic_band_structure") is not None
            has_dos = rgetattr(entry, "encyclopedia.properties.electronic_dos") is not None
            score += functional_score.get(functional, 0)
            score += code_score.get(code_name, 0)
            if has_dos and has_bs:
                score += 10

            return (score, entry["calc_id"])

        # The calculations are first sorted by "quality", best one first
        sorted_calc = sorted(response, key=calc_score, reverse=True)

        # Add representative properties. It might be possible to write a custom
        # ES scoring mechanism or aggregation to also perform the selection.
        # The best calculation always represents the idealized structure. For
        # the other properties the first (i.e. best) calculation that has the
        # property is chosen; a representative that has already been found
        # must not be replaced by a calculation further down the ranking.
        representatives = {"idealized_structure": sorted_calc[0].calc_id}
        representative_paths = {
            "thermodynamical_properties": "encyclopedia.properties.thermodynamical_properties",
            "electronic_band_structure": "encyclopedia.properties.electronic_band_structure",
            "electronic_dos": "encyclopedia.properties.electronic_dos",
        }
        n_representatives = len(representative_paths) + 1
        for calc in sorted_calc:
            for name, path in representative_paths.items():
                if name not in representatives and rgetattr(calc, path) is not None:
                    representatives[name] = calc.calc_id
            if len(representatives) == n_representatives:
                break

        # Create result JSON. The "has_*" values fetched from ES are only
        # used to detect presence and are replaced by booleans.
        flag_keys = (
            "has_dos",
            "has_band_structure",
            "has_thermal_properties",
            "has_phonon_dos",
            "has_phonon_band_structure",
        )
        results = []
        for entry in response:
            calc_dict = get_es_doc_values(entry, calc_prop_map)
            for key in flag_keys:
                calc_dict[key] = calc_dict[key] is not None
            results.append(calc_dict)

        result = {
            "total_results": len(results),
            "results": results,
            "representatives": representatives,
        }

        return result, 200


898
899
900
901
# Model for a histogram: bucket keys ("values") and the number of documents
# in each bucket ("occurrences").
histogram = api.model("histogram", {
    "occurrences": fields.List(fields.Integer),
    "values": fields.List(fields.Float),
})

# Model for the request body of the statistics endpoint.
statistics_query = api.model("statistics_query", {
    "calculations": fields.List(fields.String),
    "properties": fields.List(fields.String),
    "n_histogram_bins": fields.Integer,
})

# Model for the statistics of a single property.
statistics = api.model("statistics", {
    "min": fields.Float,
    "max": fields.Float,
    "avg": fields.Float,
    "histogram": fields.Nested(histogram, skip_none=True)
})

# Model for the response of the statistics endpoint: one optional statistics
# entry per supported property.
statistics_result = api.model("statistics_result", {
    "cell_volume": fields.Nested(statistics, skip_none=True),
    "atomic_density": fields.Nested(statistics, skip_none=True),
    "mass_density": fields.Nested(statistics, skip_none=True),
    "lattice_a": fields.Nested(statistics, skip_none=True),
    "lattice_b": fields.Nested(statistics, skip_none=True),
    "lattice_c": fields.Nested(statistics, skip_none=True),
    "alpha": fields.Nested(statistics, skip_none=True),
    "beta": fields.Nested(statistics, skip_none=True),
    "gamma": fields.Nested(statistics, skip_none=True),
    "band_gap": fields.Nested(statistics, skip_none=True),
})
925
926
927
928
929
930
931
932
933
934
# Maps the property names accepted by the statistics endpoint to the ES
# fields that the stats and histogram aggregations are run against.
property_map = {
    "cell_volume": "encyclopedia.material.idealized_structure.cell_volume",
    "atomic_density": "encyclopedia.properties.atomic_density",
    "mass_density": "encyclopedia.properties.mass_density",
    "lattice_a": "encyclopedia.material.idealized_structure.lattice_parameters.a",
    "lattice_b": "encyclopedia.material.idealized_structure.lattice_parameters.b",
    "lattice_c": "encyclopedia.material.idealized_structure.lattice_parameters.c",
    "alpha": "encyclopedia.material.idealized_structure.lattice_parameters.alpha",
    "beta": "encyclopedia.material.idealized_structure.lattice_parameters.beta",
    "gamma": "encyclopedia.material.idealized_structure.lattice_parameters.gamma",
    "band_gap": "encyclopedia.properties.band_gap",
}
937
938
939


@ns.route("/materials/<string:material_id>/statistics")
class EncStatisticsResource(Resource):
    @api.response(404, "Material or calculations not found")
    @api.response(400, "Bad request")
    @api.response(200, "Metadata send", fields.Raw)
    @api.expect(statistics_query, validate=False)
    @api.marshal_with(statistics_result, skip_none=True)
    @api.doc("enc_statistics")
    def post(self, material_id):
        """Used to return statistics related to the specified material and
        calculations.

        For each requested property the minimum, maximum and average values
        are returned together with a histogram. Aborts with 400 if the request
        body is not valid and with 404 if no matching calculations are found.
        """
        # Get query parameters as json
        try:
            data = marshal(request.get_json(), statistics_query)
        except Exception as e:
            abort(400, message=str(e))

        # The request body is not validated against the model, so missing or
        # invalid values have to be rejected here. Otherwise they would
        # surface later as unhandled exceptions.
        calculations = data["calculations"]
        properties = data["properties"]
        n_bins = data["n_histogram_bins"]
        if calculations is None or properties is None:
            abort(400, message="Both 'calculations' and 'properties' have to be given as lists.")
        unknown = [prop for prop in properties if prop not in property_map]
        if unknown:
            abort(400, message="Unsupported properties: {}. Supported properties are: {}".format(
                ", ".join(str(prop) for prop in unknown),
                ", ".join(sorted(property_map)),
            ))
        if properties and (n_bins is None or n_bins < 1):
            abort(400, message="'n_histogram_bins' has to be a positive integer.")

        # Find entries for the given material.
        bool_query = Q(
            "bool",
            filter=get_enc_filter() + [
                Q("term", encyclopedia__material__material_id=material_id),
                Q("terms", calc_id=calculations),
            ]
        )

        s = Search(index=config.elastic.index_name)
        s = s.query(bool_query)
        s = s.extra(**{
            "size": 0,
        })

        # Add statistics aggregations for each requested property
        for prop in properties:
            stats_agg = A("stats", field=property_map[prop])
            s.aggs.bucket("{}_stats".format(prop), stats_agg)

        # No hits on the top query level
        response = s.execute()
        if response.hits.total == 0:
            abort(404, message="Could not find matching calculations.")

        # Properties for which none of the matched entries has a value are
        # left out from the histogram query and from the result.
        prop_stats = {}
        for prop in properties:
            stats = getattr(response.aggs, "{}_stats".format(prop))
            if stats.count != 0:
                prop_stats[prop] = stats

        # Run a second query that creates histograms with fixed size buckets
        # based on the min and max from previous query. Might make more sense
        # to use the mean and sigma to define the range?
        s = Search(index=config.elastic.index_name)
        s = s.query(bool_query)
        s = s.extra(**{
            "size": 0,
        })
        for prop, stats in prop_stats.items():
            interval = (stats.max * 1.001 - stats.min) / n_bins

            # A zero or negative interval (e.g. when all values are equal) is
            # replaced by a fallback interval.
            if interval <= 0:
                interval = 1
            hist_agg = A("histogram", field=property_map[prop], interval=interval, offset=stats.min, min_doc_count=0)
            s.aggs.bucket("{}_hist".format(prop), hist_agg)
        response_hist = s.execute()

        # Return results
        result = {}
        for prop, stats in prop_stats.items():
            hist = getattr(response_hist.aggs, "{}_hist".format(prop))
            occurrences = [x.doc_count for x in hist.buckets]
            values = [x.key for x in hist.buckets]
            result[prop] = {
                "min": stats.min,
                "max": stats.max,
                "avg": stats.avg,
                "histogram": {
                    "occurrences": occurrences,
                    "values": values,
                }
            }

        return result, 200
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033


# Model for the free Wyckoff parameters of a Wyckoff set.
wyckoff_variables_result = api.model("wyckoff_variables_result", {
    "x": fields.Float,
    "y": fields.Float,
    "z": fields.Float,
})

# Model for a single Wyckoff set.
wyckoff_set_result = api.model("wyckoff_set_result", {
    "wyckoff_letter": fields.String,
    "indices": fields.List(fields.Integer),
    "element": fields.String,
    "variables": fields.Nested(wyckoff_variables_result, skip_none=True),
})

# Model for the lattice parameters: cell lengths and angles.
lattice_parameters = api.model("lattice_parameters", {
    "a": fields.Float,
    "b": fields.Float,
    "c": fields.Float,
    "alpha": fields.Float,
    "beta": fields.Float,
    "gamma": fields.Float,
})

# Model for the idealized structure of a material.
idealized_structure_result = api.model("idealized_structure_result", {
    "atom_labels": fields.List(fields.String),
    "atom_positions": fields.List(fields.List(fields.Float)),
    "lattice_vectors": fields.List(fields.List(fields.Float)),
    "lattice_vectors_primitive": fields.List(fields.List(fields.Float)),
    "lattice_parameters": fields.Nested(lattice_parameters, skip_none=True),
    "periodicity": fields.List(fields.Boolean),
    "number_of_atoms": fields.Integer,
    "cell_volume": fields.Float,
    "wyckoff_sets": fields.List(fields.Nested(wyckoff_set_result, skip_none=True)),
})

1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
# Maps the property names that can be requested for a single calculation to
# the location of the data: "es_source" is a path in the ES document,
# "arch_source" is a path in the archive.
calculation_property_map = {
    "lattice_parameters": {
        "es_source": "encyclopedia.material.idealized_structure.lattice_parameters"
    },
    "energies": {
        "es_source": "encyclopedia.properties.energies",
    },
    "mass_density": {
        "es_source": "encyclopedia.properties.mass_density",
    },
    "atomic_density": {
        "es_source": "encyclopedia.properties.atomic_density",
    },
    "cell_volume": {
        "es_source": "encyclopedia.material.idealized_structure.cell_volume"
    },
    "band_gap": {
        "es_source": "encyclopedia.properties.band_gap"
    },
    "electronic_band_structure": {
        "es_source": "encyclopedia.properties.electronic_band_structure"
    },
    "electronic_dos": {
        "es_source": "encyclopedia.properties.electronic_dos"
    },
    "phonon_band_structure": {
        "es_source": "encyclopedia.properties.phonon_band_structure"
    },
    "phonon_dos": {
        "es_source": "encyclopedia.properties.phonon_dos"
    },
    "thermodynamical_properties": {
        "es_source": "encyclopedia.properties.thermodynamical_properties"
    },
    "wyckoff_sets": {
        "arch_source": "section_metadata/encyclopedia/material/idealized_structure/wyckoff_sets"
    },
    "idealized_structure": {
        "arch_source": "section_metadata/encyclopedia/material/idealized_structure"
    },
}

# Model for the request body of the calculation endpoint: the names of the
# properties to return.
calculation_property_query = api.model("calculation_query", {
    "properties": fields.List(fields.String),
})

# Model for the energies of a calculation.
energies = api.model("energies", {
    "energy_total": fields.Float,
    "energy_total_T0": fields.Float,
    "energy_free": fields.Float,
})

# Model for the electronic band structure of a calculation.
electronic_band_structure = api.model("electronic_band_structure", {
    "reciprocal_cell": fields.List(fields.List(fields.Float)),
    "brillouin_zone": fields.Raw,
    "section_k_band_segment": fields.Raw,
    "section_band_gap": fields.Raw,
})

# Model for the electronic density of states of a calculation.
electronic_dos = api.model("electronic_dos", {
    "dos_energies": fields.List(fields.Float),
    "dos_values": fields.List(fields.List(fields.Float)),
})

# Model for the response of the calculation endpoint: one optional entry per
# property that can be requested.
calculation_property_result = api.model("calculation_property_result", {
    "lattice_parameters": fields.Nested(lattice_parameters, skip_none=True),
    "energies": fields.Nested(energies, skip_none=True),
    "mass_density": fields.Float,
    "atomic_density": fields.Float,
    "cell_volume": fields.Float,
    "wyckoff_sets": fields.Nested(wyckoff_set_result, skip_none=True),
    "idealized_structure": fields.Nested(idealized_structure_result, skip_none=True),
    "band_gap": fields.Float,
    "electronic_band_structure": fields.Nested(electronic_band_structure, skip_none=True),
    "electronic_dos": fields.Nested(electronic_dos, skip_none=True),
    "phonon_band_structure": fields.Raw,
    "phonon_dos": fields.Raw,
    "thermodynamical_properties": fields.Raw,
})


1134
1135
1136
1137
1138
@ns.route("/materials/<string:material_id>/calculations/<string:calc_id>")
class EncCalculationResource(Resource):
    @api.response(404, "Material or calculation not found")
    @api.response(400, "Bad request")
    @api.response(200, "Metadata send", fields.Raw)
1139
    @api.expect(calculation_property_query, validate=False)
1140
    @api.marshal_with(calculation_property_result, skip_none=True)
1141
    @api.doc("enc_calculation")
1142
1143
1144
1145
    def post(self, material_id, calc_id):
        """Used to return calculation details. Some properties are not
        available in the ES index and are instead read from the Archive
        directly.
1146
        """
1147
1148
1149
1150
1151
1152
        # Get query parameters as json
        try:
            data = marshal(request.get_json(), calculation_property_query)
        except Exception as e:
            abort(400, message=str(e))

1153
1154
1155
        s = Search(index=config.elastic.index_name)
        query = Q(
            "bool",
1156
            filter=get_enc_filter() + [
1157
1158
1159
1160
1161
1162
                Q("term", encyclopedia__material__material_id=material_id),
                Q("term", calc_id=calc_id),
            ]
        )
        s = s.query(query)

1163
        # Create dictionaries for requested properties
1164
        references = []
1165
1166
1167
        properties = data["properties"]
        arch_properties = {}
        es_properties = {}
1168
1169
1170
1171
1172
1173
1174
        ref_properties = set((
            "electronic_dos",
            "electronic_band_structure",
            "thermodynamical_properties",
            "phonon_dos",
            "phonon_band_structure",
        ))
1175
1176
1177
1178
        for prop in properties:
            es_source = calculation_property_map[prop].get("es_source")
            if es_source is not None:
                es_properties[prop] = es_source
1179
                if prop in ref_properties:
1180
                    references.append(prop)
1181
1182
1183
1184
            arch_source = calculation_property_map[prop].get("arch_source")
            if arch_source is not None:
                arch_properties[prop] = arch_source

1185
        # The query is filtered already on the ES side so we don't need to
1186
        # transfer so much data.
1187
1188
1189
        sources = [
            "upload_id",
            "calc_id",
1190
            "encyclopedia",
1191
1192
1193
        ]
        sources += list(es_properties.values())

1194
        s = s.extra(**{
1195
            "_source": {"includes": sources},
1196
1197
1198
1199
1200
1201
1202
1203
1204
            "size": 1,
        })

        response = s.execute()

        # No such material
        if len(response) == 0:
            abort(404, message="There is no material {} with calculation {}".format(material_id, calc_id))

1205
1206
1207
        # Add references that are to be read from the archive
        for ref in references:
            arch_path = response[0]
1208
1209
            arch_path = rgetattr(arch_path, es_properties[ref])
            if arch_path is not None:
Lauri Himanen's avatar
Lauri Himanen committed
1210
                arch_properties[ref] = arch_path
1211
1212
            del es_properties[ref]

1213
1214
1215
1216
1217
1218
1219
        # If any of the requested properties require data from the Archive, the
        # file is opened and read.
        result = {}
        if len(arch_properties) != 0:
            entry = response[0]
            upload_id = entry.upload_id
            calc_id = entry.calc_id
1220
            root = read_archive(
1221
1222
1223
1224
1225
                upload_id,
                calc_id,
            )

            # Add results from archive
1226
1227
1228
            for key, arch_path in arch_properties.items():
                value = root[arch_path]

1229
1230
                # Replace unnormalized thermodynamical properties with
                # normalized ones and turn into dict
1231
1232
1233
                if key == "thermodynamical_properties":
                    specific_heat_capacity = value.specific_heat_capacity.magnitude.tolist()
                    specific_free_energy = value.specific_vibrational_free_energy_at_constant_volume.magnitude.tolist()
1234
1235
                    specific_heat_capacity = [x if np.isfinite(x) else None for x in specific_heat_capacity]
                    specific_free_energy = [x if np.isfinite(x) else None for x in specific_free_energy]
1236
1237
1238
1239
                if isinstance(value, list):
                    value = [x.m_to_dict() for x in value]
                else:
                    value = value.m_to_dict()
1240
                if key == "thermodynamical_properties":
1241
1242
                    del value["thermodynamical_property_heat_capacity_C_v"]
                    del value["vibrational_free_energy_at_constant_volume"]
1243
1244
                    value["specific_heat_capacity"] = specific_heat_capacity
                    value["specific_vibrational_free_energy_at_constant_volume"] = specific_free_energy