base.py 12.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
from typing import Iterable, List, Dict, Type, Tuple
16
import datetime
17
from elasticsearch_dsl import Keyword
18

19
from nomad import utils, config
20

21

Markus Scheidgen's avatar
Markus Scheidgen committed
22
class UploadWithMetadata():
    """
    A plain attribute container for the metadata of an upload and its calcs.

    See :class:`CalcWithMetadata`.

    Attributes:
        upload_id: The id of the upload.
        uploader: An object describing the uploading user.
        upload_time: The time of the upload.
        calcs: The calcs of this upload, objects that provide a ``calc_id``.
    """

    def __init__(self, **kwargs):
        self.upload_id: str = None
        self.uploader: utils.POPO = None
        self.upload_time: datetime.datetime = None

        self.calcs: Iterable['CalcWithMetadata'] = list()

        # Every keyword argument becomes (or overrides) an attribute, including
        # attributes that are not predefined above.
        for key, value in kwargs.items():
            setattr(self, key, value)

    @property
    def calcs_dict(self) -> Dict[str, 'CalcWithMetadata']:
        """ The calcs of this upload as a dict with their ``calc_id`` as keys. """
        return {calc.calc_id: calc for calc in self.calcs}

Markus Scheidgen's avatar
Markus Scheidgen committed
41
42

class CalcWithMetadata():
    """
    A dict/POPO class that can be used for mapping calc representations with calc metadata.
    We have multiple representations of calcs and their calc metadata. To avoid implementing
    mappings between all combinations, just implement mappings with the class and use
    mapping transitivity. E.g. instead of A -> B, A -> this -> B.

    This is basically an abstract class and it has to be subclassed for each :class:`Domain`.
    Subclasses can define additional attributes and have to implement :func:`apply_domain_metadata`
    to fill these attributes from processed entries, i.e. instance of :class:`nomad.parsing.LocalBackend`.

    Attributes:
        upload_id: The ``upload_id`` of the calculations upload (random UUID).
        calc_id: The unique mainfile based calculation id.
        calc_hash: The raw file content based checksum/hash of this calculation.
        pid: The unique persistent id of this calculation.
        mainfile: The upload relative mainfile path.

        files: A list of all files, relative to upload.
        upload_time: The time when the calc was uploaded.
        uploader: An object describing the uploading user, has at least ``user_id``
        processed: Boolean indicating if this calc was successfully processed and archive
            data and calc metadata is available.
        last_processing: A datetime with the time of the last successful processing.
        nomad_version: A string that describes the version of the nomad software that was
            used to do the last successful processing.

        with_embargo: Show if user set an embargo on the calculation.
        coauthors: List of coauthor user objects with at least ``user_id``.
        shared_with: List of users this calcs ownership is shared with, objects with at least ``user_id``.
        comment: String comment.
        references: Objects describing user provided references, keys are ``id`` and ``value``.
        datasets: Objects describing the datasets, keys are ``id``, ``name``, ``doi``.
            DOI is optional, is an object with key ``id``, ``value``.
    """
    def __init__(self, **kwargs):
        # id relevant metadata
        self.upload_id: str = None
        self.calc_id: str = None
        self.calc_hash: str = None
        self.mainfile: str = None
        self.pid: int = None

        # basic upload and processing related metadata
        self.upload_time: datetime.datetime = None
        self.files: List[str] = None
        self.uploader: utils.POPO = None
        self.processed: bool = False
        self.last_processing: datetime.datetime = None
        self.nomad_version: str = None
        self.nomad_commit: str = None

        # user metadata, i.e. quantities given and editable by the user
        self.with_embargo: bool = None
        self.published: bool = False
        self.coauthors: List[utils.POPO] = []
        self.shared_with: List[utils.POPO] = []
        self.comment: str = None
        self.references: List[utils.POPO] = []
        self.datasets: List[utils.POPO] = []

        # parser related general (not domain specific) metadata
        self.parser_name = None

        self.update(**kwargs)

    def to_dict(self):
        """
        Returns all attributes with a value other than ``None`` as a dict. An attribute
        called ``backend`` is always left out.
        """
        return {
            key: value for key, value in self.__dict__.items()
            if value is not None and key != 'backend'
        }

    def update(self, **kwargs):
        """
        Sets the given keyword arguments as attributes. ``None`` values and empty lists
        are skipped, i.e. the current attribute value is kept. Plain dicts, and lists
        that start with a plain dict, are converted to :class:`utils.POPO` objects.
        """
        for key, value in kwargs.items():
            if value is None:
                continue

            if isinstance(value, list):
                if len(value) == 0:
                    continue

                # Only the first item is inspected to decide on the conversion.
                if isinstance(value[0], dict) and not isinstance(value[0], utils.POPO):
                    value = [utils.POPO(**item) for item in value]
            elif isinstance(value, dict) and not isinstance(value, utils.POPO):
                value = utils.POPO(**value)

            setattr(self, key, value)

    def apply_user_metadata(self, metadata: dict):
        """
        Applies a user provided metadata dict to this calc.

        ``pid``, ``comment``, ``upload_time``, ``with_embargo`` and all list valued
        attributes are overwritten, also if the respective key is missing in
        ``metadata``. The ``uploader`` is only changed if ``_uploader`` is given.
        List valued keys that are missing or ``None`` result in empty lists.
        """
        # NOTE(review): the user/dataset objects created here carry ``id``, whereas the
        # class docstring speaks of ``user_id`` -- confirm which key consumers expect.
        self.pid = metadata.get('_pid')
        self.comment = metadata.get('comment')
        self.upload_time = metadata.get('_upload_time')
        uploader_id = metadata.get('_uploader')
        if uploader_id is not None:
            self.uploader = utils.POPO(id=int(uploader_id))
        # ``or []`` also covers keys that are present with an explicit ``None`` value,
        # which would otherwise raise a TypeError on iteration.
        self.references = [
            utils.POPO(value=ref) for ref in (metadata.get('references') or [])]
        self.with_embargo = metadata.get('with_embargo', False)
        self.coauthors = [
            utils.POPO(id=int(user)) for user in (metadata.get('coauthors') or [])]
        self.shared_with = [
            utils.POPO(id=int(user)) for user in (metadata.get('shared_with') or [])]
        self.datasets = [
            utils.POPO(id=int(ds['id']), doi=utils.POPO(value=ds.get('_doi')), name=ds.get('_name'))
            for ds in (metadata.get('datasets') or [])]

    def apply_domain_metadata(self, backend):
        """ Has to be implemented by subclasses, this implementation always raises. """
        raise NotImplementedError()


class DomainQuantity:
    """
    This class can be used to define further details about a domain specific metadata
    quantity.

    Attributes:
        name: The name of the quantity, also the key used to store values in
            :class:`CalcWithMetadata`
        description: A human friendly description. The description is used to define
            the swagger documentation on the relevant API endpoints.
        multi: Indicates a list of values. This is important for the elastic mapping.
        order_default: Indicates that this quantity should be used for the default order of
            search results.
        aggregations: Indicates that search aggregations (and how many) should be provided.
            0 (the default) means no aggregations.
        metric: Indicates that this quantity should be used as search metric. Values need
            to be tuples with metric name and elastic aggregation (e.g. sum, cardinality)
        elastic_mapping: An optional elasticsearch_dsl mapping. Default is ``Keyword``.
    """

    def __init__(
            self, description: str = None, multi: bool = False, aggregations: int = 0,
            order_default: bool = False, metric: Tuple[str, str] = None,
            elastic_mapping=None):

        # The name is not a constructor argument, it starts out as None and has to be
        # assigned afterwards (:class:`Domain` does so when a domain is defined).
        self.name: str = None
        self.description = description
        self.multi = multi
        self.order_default = order_default
        self.aggregations = aggregations
        self.metric = metric
        self.elastic_mapping = elastic_mapping

        # NOTE(review): the default mapping captures ``multi`` as given to this
        # constructor. :class:`Domain` overwrites ``multi`` later on, which does not
        # update a mapping that was created here -- confirm that this is intended.
        if self.elastic_mapping is None:
            self.elastic_mapping = Keyword(multi=self.multi)


class Domain:
    """
    A domain defines all metadata quantities that are specific to a certain scientific
    domain, e.g. DFT calculations, or experimental material science.

    Each domain needs to define a subclass of :class:`CalcWithMetadata`. This
    class has to define the necessary domain specific metadata quantities and how these
    are filled from parser results (usually an instance of :class:`LocalBackend`).

    Furthermore, the class method :func:`register_domain` of this ``Domain`` class has
    to be used to register a domain with ``domain_name``. This also allows to provide
    further descriptions on each domain specific quantity via instance of :class:`DomainQuantity`.

    While there can be multiple domains registered. Currently, only one domain can be
    active. This active domain is defined in the configuration using the ``domain_name``.

    Arguments:
        name: A name for the domain. This is used as key in the configuration ``config.domain``.
        domain_entry_class: A subclass of :class:`CalcWithMetadata` that adds the
            domain specific quantities.
        quantities: Additional specifications for the quantities in ``domain_entry_class`` as
            instances of :class:`DomainQuantity`.
        root_sections: A list of section names that is stored on the domain. Defaults
            to ``['section_run', 'section_entry_info']``.
    """
    instance: 'Domain' = None
    instances: Dict[str, 'Domain'] = {}

    def __init__(
            self, name: str, domain_entry_class: Type[CalcWithMetadata],
            quantities: Dict[str, DomainQuantity], root_sections: List[str] = None) -> None:
        # A fresh list per instance; a list literal as parameter default would be
        # one object shared by all domains that rely on the default.
        if root_sections is None:
            root_sections = ['section_run', 'section_entry_info']

        if name == config.domain:
            assert Domain.instance is None, 'you can only define one domain.'
            Domain.instance = self

        Domain.instances[name] = self

        self.name = name
        self.domain_entry_class = domain_entry_class
        self.quantities: Dict[str, DomainQuantity] = {}
        self.root_sections = root_sections

        # Domain quantities are all attributes that a default constructed domain entry
        # has, but a default constructed general entry does not have.
        reference_domain_calc = domain_entry_class()
        reference_general_calc = CalcWithMetadata()

        for quantity_name, value in reference_domain_calc.__dict__.items():
            if not hasattr(reference_general_calc, quantity_name):
                quantity = quantities.get(quantity_name, None)
                if quantity is None:
                    # NOTE: quantities without explicit specification get a default one,
                    # which is also written back into the ``quantities`` dict of the caller.
                    quantity = DomainQuantity()
                    quantities[quantity_name] = quantity
                quantity.name = quantity_name
                # Whether the quantity is multi valued is derived from the default
                # value, overriding what was given to the DomainQuantity.
                quantity.multi = isinstance(value, list)
                self.quantities[quantity.name] = quantity

        for quantity_name in quantities:
            assert hasattr(reference_domain_calc, quantity_name) and not hasattr(reference_general_calc, quantity_name), \
                'quantity does not exist or overrides general non domain quantity'

        assert any(quantity.order_default for quantity in self.quantities.values()), \
            'you need to define a order default quantity'

    @property
    def metrics(self) -> Dict[str, Tuple[str, str]]:
        """
        The metrics specification used for search aggregations. See :func:`nomad.search.metrics`.
        Keys are the metric names, values are tuples of elastic aggregation and quantity name.
        """
        return {
            quantity.metric[0]: (quantity.metric[1], quantity.name)
            for quantity in self.quantities.values()
            if quantity.metric is not None
        }

    @property
    def metrics_names(self) -> Iterable[str]:
        """ Just the names of all metrics. """
        return list(self.metrics.keys())

    @property
    def aggregations(self) -> Dict[str, int]:
        """
        The search aggregations and the maximum number of calculated buckets. See also
        :func:`nomad.search.aggregations`.
        """
        return {
            quantity.name: quantity.aggregations
            for quantity in self.quantities.values()
            if quantity.aggregations > 0
        }

    @property
    def aggregations_names(self) -> Iterable[str]:
        """ Just the names of all aggregations. """
        return list(self.aggregations.keys())
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309


def get_optional_backend_value(backend, key, section, unavailable_value=None, logger=None):
    """
    Reads the value for ``key`` from the sections with name ``section`` in the given
    backend, e.g. from all section_system, section_symmetry, etc.

    Arguments:
        backend: The backend to read from. Has to provide ``get_sections(section)``
            and ``get_value(key, section_index)``.
        key: The key of the value to read.
        section: The name of the sections to read the value from.
        unavailable_value: The value to return if no section provides a value. If it is
            ``None``, ``config.services.unavailable_value`` is used instead.
        logger: An optional logger. It is used to warn about values that differ between
            sections and about unavailable values.

    Returns:
        The value from the last section that provides one, or the unavailable value if
        no section does. This does not depend on a logger being given.
    """
    val = None  # Stays None until a section provides a value.
    for section_index in backend.get_sections(section):
        try:
            new_val = backend.get_value(key, section_index)
        except KeyError:
            new_val = None

        # Sections without a value neither override nor get compared.
        if new_val is None:
            continue

        # Values are compared via their representation.
        if val is not None and logger and repr(val) != repr(new_val):
            logger.warning(
                'The values for %s differ between different %s: %s vs %s' %
                (key, section, str(val), str(new_val)))

        val = new_val

    if val is not None:
        return val

    if logger:
        logger.warning(
            'The values for %s where not available in any %s' % (key, section))

    if unavailable_value is not None:
        return unavailable_value
    return config.services.unavailable_value