models.py 44.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#
# Copyright The NOMAD Authors.
#
# This file is part of NOMAD. See https://nomad-lab.eu for further info.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

19
from typing import List, Dict, Optional, Union, Any, Mapping
20
21
22
import enum
from fastapi import Body, Request, HTTPException, Query as FastApiQuery
import pydantic
23
24
25
26
27
28
29
30
31
32
from pydantic import (  # pylint: disable=unused-import
    BaseModel,
    StrictInt,
    StrictFloat,
    StrictBool,
    Field,
    Extra,
    validator,
    root_validator,
)
33
34
35
36
import datetime
import numpy as np
import re
import fnmatch
37
import json
38
39
40
41

from nomad import datamodel  # pylint: disable=unused-import
from nomad.utils import strip
from nomad.metainfo import Datetime, MEnum
42
from nomad.metainfo.elasticsearch_extension import DocumentType, material_entry_type, material_type
43

44
from .utils import parameter_dependency_from_model, update_url_query_arguments
45

46

47
User = datamodel.User.m_def.a_pydantic.model
48
49
50
51
# It is important that datetime.datetime comes last. Otherwise, number valued strings
# are interpreted as epoch dates by pydantic
Value = Union[StrictInt, StrictFloat, StrictBool, str, datetime.datetime]
ComparableValue = Union[StrictInt, StrictFloat, str, datetime.datetime]
52
53
54
55
56
57
58
59
60
61
62
63
64


class HTTPExceptionModel(BaseModel):
    # Schema with a single string field `detail`; presumably used to describe the
    # body of HTTP error responses (the usage is not visible here, confirm at callers).
    detail: str


class NoneEmptyBaseModel(BaseModel):
    # Base model that refuses instances in which every field value is None.
    @root_validator
    def check_exists(cls, values):  # pylint: disable=no-self-argument
        ''' Require at least one field of the model to carry a value other than None. '''
        provided = [item for item in values.values() if item is not None]
        assert provided
        return values


65
class All(NoneEmptyBaseModel, extra=Extra.forbid):
    # Value list supplied under the key `all`. Other keys are rejected (extra=forbid)
    # and the inherited root validator requires the list to be given.
    op: List[Value] = Field(default=None, alias='all')


69
class None_(NoneEmptyBaseModel, extra=Extra.forbid):
    # Value list supplied under the key `none`. Other keys are rejected (extra=forbid)
    # and the inherited root validator requires the list to be given.
    op: List[Value] = Field(default=None, alias='none')


73
class Any_(NoneEmptyBaseModel, extra=Extra.forbid):
    # Value list supplied under the key `any`. Other keys are rejected (extra=forbid)
    # and the inherited root validator requires the list to be given.
    op: List[Value] = Field(default=None, alias='any')


77
78
79
class Range(BaseModel, extra=Extra.forbid):
    """Represents a finite range which can have open or closed ends. Supports
    several datatypes that have a well-defined comparison operator.
    """
    lt: Optional[ComparableValue] = Field(default=None)
    lte: Optional[ComparableValue] = Field(default=None)
    gt: Optional[ComparableValue] = Field(default=None)
    gte: Optional[ComparableValue] = Field(default=None)

    @root_validator
    def check_range_is_valid(cls, values):  # pylint: disable=no-self-argument
        '''
        Ensure that at least one bound is given and that neither end of the range is
        declared both open and closed.
        '''
        bounds = {name: values.get(name) for name in ('lt', 'lte', 'gt', 'gte')}

        # At least one of the four bounds has to be set
        assert any(bound is not None for bound in bounds.values())

        # The upper end is either open (lt) or closed (lte), never both
        assert bounds['lt'] is None or bounds['lte'] is None

        # The lower end is either open (gt) or closed (gte), never both
        assert bounds['gt'] is None or bounds['gte'] is None

        return values
107
108


109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
class LogicalOperator(NoneEmptyBaseModel):
    # Shared base of the logical operator models. It normalises whatever the subclass
    # stores in `op` (the field is only declared on subclasses, hence check_fields=False).
    @validator('op', check_fields=False)
    def validate_query(cls, query):  # pylint: disable=no-self-argument
        ''' Run `_validate_query` on a single sub-query or on each item of a list. '''
        if not isinstance(query, list):
            return _validate_query(query)

        return list(map(_validate_query, query))


class And(LogicalOperator):
    # Logical `and`: the list of sub-queries is supplied under the key `and`.
    op: List['Query'] = Field(None, alias='and')


class Or(LogicalOperator):
    # Logical `or`: the list of sub-queries is supplied under the key `or`.
    op: List['Query'] = Field(None, alias='or')


class Not(LogicalOperator):
    # Logical `not`: a single sub-query (not a list) is supplied under the key `not`.
    op: 'Query' = Field(None, alias='not')


130
131
132
133
134
135
class Nested(BaseModel):
    # Wrapper around a sub-query. `_validate_query` wraps dict-valued query parameters
    # in this model.
    query: 'Query'

    @validator('query')
    def validate_query(cls, query):  # pylint: disable=no-self-argument
        # Normalise the wrapped query with the shared module-level helper.
        return _validate_query(query)
136
137


138
# Maps each operator name (used as `:<qualifier>` suffix in JSON queries and as
# `__<op>` in URL parameters) to the model class that is instantiated for it.
ops = dict(
    lte=Range,
    lt=Range,
    gte=Range,
    gt=Range,
    all=All,
    none=None_,
    any=Any_)

148
# Everything a single query parameter can be set to.
QueryParameterValue = Union[Value, List[Value], Range, Any_, All, None_, Nested, Dict[str, Any]]

# A query is either one of the logical operators or a mapping from parameter names
# to parameter values.
Query = Union[And, Or, Not, Mapping[str, QueryParameterValue]]


# The models above refer to 'Query' as a string annotation, because the alias did not
# exist when they were declared. Resolve those references now.
for _model in (And, Or, Not, Nested):
    _model.update_forward_refs()
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201


class Owner(str, enum.Enum):
    '''
    The `owner` allows to limit the scope of the searched based on entry ownership.
    This is useful, if you only want to search among all publically downloadable
    entries, or only among your own entries, etc.

    These are the possible owner values and their meaning:
    * `all`: Consider all entries.
    * `public` (default): Consider all entries that can be publically downloaded,
        i.e. only published entries without embargo
    * `user`: Only consider entries that belong to you.
    * `shared`: Only consider entries that belong to you or are shared with you.
    * `visible`: Consider all entries that are visible to you. This includes
        entries with embargo or unpublished entries that belong to you or are
        shared with you.
    * `staging`: Only search through unpublished entries.
    '''

    # NOTE: the docstring above is runtime text. `QueryParameters.__call__` passes
    # `Owner.__doc__` on as the description of the `owner` URL parameter, so edits to
    # it change the published API documentation.
    # NOTE(review): the `admin` member below is not described in the docstring.

    # There seems to be a slight bug in FastAPI. When it creates the example in OpenAPI,
    # it ignores any given default or example and simply takes the first enum value.
    # Therefore, we put `public` first; it is the default and safe in most contexts.
    public = 'public'
    all_ = 'all'
    visible = 'visible'
    shared = 'shared'
    user = 'user'
    staging = 'staging'
    admin = 'admin'


class WithQuery(BaseModel):
    # Request model that combines the `owner` scope with the actual `query`. The query
    # is normalised by `_validate_query` after pydantic has parsed it.
    owner: Optional[Owner] = Body('public')

    # NOTE(review): the suffix example in the description below uses
    # `upload_create_time:gt` and `upload_create_time:lt` together. `_validate_query`
    # asserts that a quantity appears only once after the suffix was stripped, so this
    # combination looks like it would be rejected. Confirm and adjust either the
    # example or the validation.
    query: Optional[Query] = Body(
        None,
        embed=True,
        description=strip('''
            A query can be a very simple list of parameters. Different parameters are combined
            with a logical **and**, values of the same parameter also with a logical **and**.
            The following would search for all entries that are VASP calculations,
            contain *Na* **and** *Cl*, **and** are authored by *Stefano Curtarolo*
            **and** *Chris Wolverton*.
            ```
            {
                "results.material.elements": ["Na", "Cl"],
                "results.method.simulation.program_name": "VASP",
                "authors": ["Stefano Curtarolo", "Chris Wolverton"]
            }
            ```

            A short cut to change the logical combination of values in a list, is to
            add a suffix to the quantity `:any`:
            ```
            {
                "results.material.elements": ["Na", "Cl"],
                "results.method.simulation.program_name": "VASP",
                "authors:any": ["Stefano Curtarolo", "Chris Wolverton"]
            }
            ```

            Otherwise, you can also write complex logical combinations of parameters like this:
            ```
            {
                "and": [
                    {
                        "or": [
                            {
                                "results.material.elements": ["Cl", "Na"]
                            },
                            {
                                "results.material.elements": ["H", "O"]
                            }
                        ]
                    },
                    {
                        "not": {
                            "results.material.symmetry.crystal_system": "cubic"
                        }
                    }
                ]
            }
            ```
            Other short-cut suffixes are `:none` and `:all` (the default).

            By default all quantity values have to **equal** the given values to match. For
            some values you can also use comparison operators like this:
            ```
            {
                "upload_create_time": {
                    "gt": "2020-01-01",
                    "lt": "2020-08-01"
                },
                "results.properties.geometry_optimization.final_energy_difference": {
                    "lte": 1.23e-18
                }
            }
            ```

            or shorter with suffixes:
            ```
            {
                "upload_create_time:gt": "2020-01-01",
                "upload_create_time:lt": "2020-08-01",
                "results.properties.geometry_optimization.final_energy_difference:lte": 1.23e-18
            }
            ```

            The searchable quantities are a subset of the NOMAD Archive quantities defined
            in the NOMAD Metainfo. The searchable quantities also depend on the API endpoint.

            There is also an additional query parameter that you can use to formulate queries based
            on the optimade filter language:
            ```
            {
                "optimade_filter": "nelements >= 2 AND elements HAS ALL 'Ti', 'O'"
            }
            ```
        '''),  # TODO custom documentation for entry and material API
        example={
            'upload_create_time:gt': '2020-01-01',
            'results.material.elements': ['Ti', 'O'],
            'results.method.simulation.program_name': 'VASP',
            'results.properties.geometry_optimization.final_energy_difference:lte': 1.23e-18,
            'results.properties.available_properties': 'section_dos',
            'results.material.type_structural:any': ['bulk', '2d'],
            'optimade_filter': 'nelements >= 2 AND elements HAS ALL "Ti", "O"'
        })

    @validator('query')
    def validate_query(cls, query):  # pylint: disable=no-self-argument
        ''' Normalise the parsed query (suffix qualifiers, lists, nested dicts). '''
        return _validate_query(query)


def _validate_query(query: Query):
    '''
    Normalises a query in place and returns it. Only dict queries are touched; any
    other value is returned unchanged.

    For each key/value pair of a dict query:
    * dict values are wrapped into `Nested`,
    * keys of the form `<quantity>:<qualifier>` are replaced by `<quantity>`, with the
      value wrapped in the model registered for the qualifier in `ops`,
    * list values without a qualifier are wrapped into `All`.

    Raises:
        ValueError: if a key contains more than one `:`.
        AssertionError: if the qualifier is unknown or if the quantity of a qualified
            key is already present in the query.
    '''
    if not isinstance(query, dict):
        return query

    # Note, we loop over a list of items, not query.items(). This is because we
    # may modify the query in the loop.
    for key, value in list(query.items()):
        if isinstance(value, dict):
            value = Nested(query=value)

        if ':' in key:
            fragments = key.split(':')
            if len(fragments) != 2:
                # Unpacking more than two fragments would fail with the unhelpful
                # "too many values to unpack". Report the offending key instead and
                # keep the exception type.
                raise ValueError(
                    'invalid query key %s, use <quantity>[:<qualifier>]' % key)
            quantity, qualifier = fragments
        else:
            quantity, qualifier = key, None

        if qualifier is not None:
            assert quantity not in query, 'a quantity can only appear once in a query'
            assert qualifier in ops, 'unknown quantity qualifier %s' % qualifier
            del query[key]
            query[quantity] = ops[qualifier](**{qualifier: value})  # type: ignore
        elif isinstance(value, list):
            # Lists without a qualifier are combined with a logical and
            query[quantity] = All(all=value)
        else:
            # Write back, because the value might have been wrapped into Nested
            query[quantity] = value

    return query


317
318
319
320
321
322
323
324
325
class QueryParameters:
    '''
    Callable that turns the URL query parameters of a request into a `WithQuery`.

    An instance is bound to one document type; the quantities of that type decide
    which URL parameters are interpreted as search parameters and how their string
    values are converted.
    '''
    def __init__(self, doc_type: DocumentType):
        self.doc_type = doc_type

    @staticmethod
    def _parse_bool(value: str) -> bool:
        '''
        Converts a URL parameter string into a bool. Plain `bool(value)` is true for
        every non-empty string, including `'false'`. Here the usual spellings of
        false are mapped to `False`; every other string keeps the previous result.
        '''
        return value.lower() not in ('', '0', 'false', 'f', 'no', 'n', 'off')

    def __call__(
        self,
        request: Request,
        owner: Optional[Owner] = FastApiQuery(
            'public', description=strip(Owner.__doc__)),
        json_query: Optional[str] = FastApiQuery(None, description=strip('''
                To pass a query string in the format of JSON e.g. '{"results.material.elements": ["H", "O"]}'.
            ''')),
        q: Optional[List[str]] = FastApiQuery(
            [], description=strip('''
                Since we cannot properly offer forms for all parameters in the OpenAPI dashboard,
                you can use the parameter `q` and encode a query parameter like this
                `atoms__H` or `n_atoms__gt__3`. Multiple usage of `q` will combine parameters with
                logical *and*.
            '''))) -> WithQuery:
        '''
        Builds the query from three sources: the plain URL parameters of the request,
        the `q` parameters encoded as `<quantity>[__<op>]__<value>`, and the optional
        `json_query`, whose entries are merged last.

        Raises:
            HTTPException: with status 422 for malformed `q` parameters, unknown
                quantities in `q`, unknown operators, values that cannot be converted
                to the type of the quantity, operators used with several values, and
                a `json_query` that cannot be parsed.
        '''

        # copy parameters from request
        query_params = {
            key: request.query_params.getlist(key)
            for key in request.query_params}

        # add the encoded parameters
        for parameter in q or []:
            fragments = parameter.split('__')
            if len(fragments) == 1 or len(fragments) > 3:
                raise HTTPException(422, detail=[{
                    'loc': ['query', 'q'],
                    'msg': 'wrong format, use <quantity>[__<op>]__<value>'}])
            name_op, value = '__'.join(fragments[:-1]), fragments[-1]
            quantity_name = name_op.split('__')[0]

            doc_type = self.doc_type
            if quantity_name.startswith('entries.'):
                if self.doc_type == material_type:
                    doc_type = material_entry_type
                else:
                    raise HTTPException(422, detail=[{
                        'loc': ['query', parameter],
                        'msg': 'entries can only be nested into material queries'}])

            if quantity_name not in doc_type.quantities:
                raise HTTPException(422, detail=[{
                    'loc': ['query', parameter],
                    'msg': f'{quantity_name} is not a {doc_type} quantity'}])

            query_params.setdefault(name_op, []).append(value)

        # transform query parameters to query
        query: Dict[str, Any] = {}
        for key, raw_values in query_params.items():
            if '__' in key:
                # Split only once: request keys are arbitrary user input and a key with
                # several `__` must not break the unpacking. Any surplus ends up in the
                # operator, which is then reported as unknown.
                quantity_name, op = key.split('__', 1)
            else:
                quantity_name, op = key, None

            if quantity_name.startswith('entries.'):
                quantity = material_entry_type.quantities.get(quantity_name[8:])
            else:
                quantity = self.doc_type.quantities.get(quantity_name)

            if quantity is None:
                # not a search quantity, e.g. a pagination parameter
                continue

            # choose the function that converts the string values
            type_ = quantity.definition.type
            if type_ is Datetime:
                convert = datetime.datetime.fromisoformat
            elif isinstance(type_, MEnum):
                convert = str
            elif isinstance(type_, np.dtype):
                convert = float
            elif type_ not in [int, float, bool]:
                convert = str
            elif type_ is bool:
                convert = self._parse_bool
            else:
                convert = type_

            try:
                values = [convert(value) for value in raw_values]
            except (ValueError, TypeError):
                # Report values that do not fit the quantity as a client error.
                raise HTTPException(
                    status_code=422,
                    detail=[{
                        'loc': ['query', key],
                        'msg': 'cannot parse the value of %s' % quantity_name}])

            if op is None:
                op = 'all' if quantity.many_all else 'any'

            if op == 'all':
                query[quantity_name] = All(all=values)
            elif op == 'any':
                query[quantity_name] = Any_(any=values)
            elif op in ops:
                if len(values) > 1:
                    raise HTTPException(
                        status_code=422,
                        detail=[{
                            'loc': ['query', key],
                            'msg': 'operator %s does not support multiple values' % op}])
                query[quantity_name] = ops[op](**{op: values[0]})
            else:
                raise HTTPException(
                    422, detail=[{'loc': ['query', key], 'msg': 'operator %s is unknown' % op}])

        # process the json_query
        if json_query is not None:
            try:
                query.update(**json.loads(json_query))
            except Exception:
                raise HTTPException(
                    422, detail=[{'loc': ['json_query'], 'msg': 'cannot parse json_query'}])

        return WithQuery(query=query, owner=owner)
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440


class Direction(str, enum.Enum):
    '''
    Order direction, either ascending (`asc`) or descending (`desc`)
    '''
    # The `str` mixin makes the members plain strings, e.g. `Direction.asc == 'asc'`.
    asc = 'asc'
    desc = 'desc'


class MetadataRequired(BaseModel):
    ''' Defines which metadata quantities are included or excluded in the response. '''

    include: Optional[List[str]] = Field(
        default=None,
        description=strip('''
            Quantities to include for each result. Only those quantities will be
            returned. At least one id quantity (e.g. `entry_id`) will always be included.
        '''))
    exclude: Optional[List[str]] = Field(
        default=None,
        description=strip('''
            Quantities to exclude for each result. Only all other quantities will
            be returned. The entity's id quantity (e.g. `entry_id`) cannot be excluded.
        '''))


# Built from the `MetadataRequired` model by `parameter_dependency_from_model` (imported
# from `.utils`); presumably a FastAPI dependency exposing the model's fields as URL
# parameters (confirm in `.utils`). The first argument is the name handed to that helper.
metadata_required_parameters = parameter_dependency_from_model(
    'metadata_required_parameters', MetadataRequired)


class Pagination(BaseModel):
    ''' Defines the order, size, and page of results. '''

    page_size: Optional[int] = Field(
        10, description=strip('''
            The page size, e.g. the maximum number of items contained in one response.
            A `page_size` of 0 will return no results.
        '''))
    order_by: Optional[str] = Field(
        None,
        description=strip('''
            The results are ordered by the values of this field. If omitted, default
            ordering is applied.
        '''))
    order: Optional[Direction] = Field(
        Direction.asc, description=strip('''
            The ordering direction of the results based on `order_by`. It's either
            ascending `asc` or descending `desc`. Default is `asc`.
        '''))
    page_after_value: Optional[str] = Field(
        None, description=strip('''
            This attribute defines the position after which the page begins, and is used
            to navigate through the total list of results.

            When requesting the first page, no value should be provided for
            `page_after_value`. Each response will contain a value `next_page_after_value`,
            which can be used to obtain the next page (by setting `page_after_value` in
            your next request to this value).

            The field is encoded as a string, and the format of `page_after_value` and
            `next_page_after_value` depends on which API method is used.

            Some API functions additionally allow a simplified navigation, by specifying
            the page number in the key `page`. It is however always possible to use
            `page_after_value` and `next_page_after_value` to iterate through the results.
        '''))
    page: Optional[int] = Field(
        None, description=strip('''
            The number of the page (1-based). When provided in a request, this attribute
            can be used instead of `page_after_value` to jump to a particular results page.

            **NOTE #1**: the option to request pages by submitting the `page` number is
            limited. There are api calls where this attribute cannot be used for indexing,
            or where it can only be used partially. **If you want to just iterate through
            all the results, always use the `page_after_value` and `next_page_after_value`!**

            **NOTE #2**: Only one, `page`, `page_offset` or `page_after_value`, can be used.
        '''))
    page_offset: Optional[int] = Field(
        None, description=strip('''
            The number of skipped entries. When provided in a request, this attribute
            can be used instead of `page_after_value` to jump to a particular results page.

            **NOTE #1**: the option to request pages by submitting the `page_offset` number is
            limited. There are api calls where this attribute cannot be used for indexing,
            or where it can only be used partially. **If you want to just iterate through
            all the results, always use the `page_after_value` and `next_page_after_value`!**

            **NOTE #2**: Only one, `page`, `page_offset` or `page_after_value`, can be used.
        '''))

    @validator('page_size')
    def validate_page_size(cls, page_size):  # pylint: disable=no-self-argument
        ''' Reject negative page sizes. '''
        assert page_size >= 0, 'page_size must be >= 0'
        return page_size

    @validator('order_by')
    def validate_order_by(cls, order_by):  # pylint: disable=no-self-argument
        '''
        Override this in your Pagination class to ensure that a valid attribute is selected.
        This method has to be implemented!
        '''
        raise NotImplementedError('Validation of `order_by` not implemented!')

    @validator('page_after_value')
    def validate_page_after_value(cls, page_after_value, values):  # pylint: disable=no-self-argument
        '''
        Override this in your Pagination class to implement validation of the
        `page_after_value` value.
        This method has to be implemented!
        '''
        raise NotImplementedError('Validation of `page_after_value` not implemented!')

    @validator('page')
    def validate_page(cls, page, values):  # pylint: disable=no-self-argument
        ''' Page numbers are 1-based. '''
        if page is not None:
            assert page >= 1, 'page must be >= 1'
        return page

    @validator('page_offset')
    def validate_page_offset(cls, page_offset, values):  # pylint: disable=no-self-argument
        ''' Reject negative offsets. '''
        if page_offset is not None:
            assert page_offset >= 0, 'page_offset must be >= 0'
        return page_offset

    @root_validator(skip_on_failure=True)
    def validate_values(cls, values):  # pylint: disable=no-self-argument
        # Because of a bug in pydantic (#2670), root validators can't be overridden, so
        # we invoke a class method, which *can* be overridden.
        return cls._root_validation(values)

    @classmethod
    def _root_validation(cls, values):
        '''
        Cross-field checks: at most one of `page_offset`, `page` and `page_after_value`
        may be given, and none of them together with a `page_size` of 0.
        '''
        page_offset = values.get('page_offset')
        page = values.get('page')
        page_after_value = values.get('page_after_value')
        page_size = values.get('page_size')

        # Counts by truthiness: a `page_offset` of 0 or an empty `page_after_value` is
        # treated like a criterion that was not specified.
        n_offset_criteria = sum(
            1 for criterion in (page_offset, page, page_after_value) if criterion)
        assert n_offset_criteria <= 1, 'Can only specify one `page_offset`, `page`, or `page_after_value`'

        if page_size == 0:
            assert page_offset is None, 'Cannot specify `page_offset` when `page_size` is set to 0'
            assert page is None, 'Cannot specify `page` when `page_size` is set to 0'
            assert page_after_value is None, 'Cannot specify `page_after_value` when `page_size` is set to 0'

        return values

    def get_simple_index(self):
        '''
        If simple, index-based pagination is used, this method can be used to get the
        corresponding index (0-based). It looks at `page_offset`, `page` and
        `page_after_value`, in this order, and uses the first one that is set.
        If none is provided, we return 0 (i.e. the first index).
        '''
        if self.page_offset is not None:
            return self.page_offset
        if self.page is not None:
            return (self.page - 1) * self.page_size
        if self.page_after_value is not None:
            # `page_after_value` is the string encoded index of the last skipped item
            rv = int(self.page_after_value) + 1
            assert rv >= 0
            return rv
        return 0
David Sikter's avatar
David Sikter committed
587

David Sikter's avatar
David Sikter committed
588
589
590
591
592
593
594

class PaginationResponse(Pagination):
    # Pagination as returned in responses: the request fields plus `total` and the
    # navigation values/urls. All request-side validation is switched off below.
    total: int = Field(
        ..., description=strip('''
        The total number of results that fit the given query. This is independent of
        any pagination and aggregations.
        '''))
    next_page_after_value: Optional[str] = Field(
        None, description=strip('''
        The *next* value to be used as `page_after_value` in a follow-up request, to get
        the next page of results. If no more results are available, `next_page_after_value`
        will not be set.
        '''))
    page_url: Optional[str] = Field(
        None, description=strip('''
        The url of the current page. Only applicable for GET requests.
        '''))
    next_page_url: Optional[str] = Field(
        None, description=strip('''
        The url to get the next page. Only applicable for GET requests.
        '''))
    prev_page_url: Optional[str] = Field(
        None, description=strip('''
        The url to get the previous page. **NOTE:** Only applicable for some API methods,
        (namely, where indexing by `page` is possible), and only for GET requests.
        '''))
    first_page_url: Optional[str] = Field(
        None, description=strip('''
        The url to get the first page. Only applicable for GET requests.
        '''))

    @validator('order_by')
    def validate_order_by(cls, order_by):  # pylint: disable=no-self-argument
        # No validation - behaviour of this field depends on api method
        return order_by

    @validator('page_after_value')
    def validate_page_after_value(cls, page_after_value, values):  # pylint: disable=no-self-argument
        # No validation - behaviour of this field depends on api method
        return page_after_value

    @classmethod
    def _root_validation(cls, values):  # pylint: disable=no-self-argument
        # No validation
        return values

    def populate_urls(self, request: Request):
        '''
        Populates the urls (`page_url`, `first_page_url`, `next_page_url` and
        `prev_page_url`) from the request, `next_page_after_value` and `page`.
        Only applicable for GET requests.
        '''
        assert request.method.upper() == 'GET', 'Trying to populate urls, but method is not GET.'
        original_url = str(request.url)
        self.page_url = original_url

        # NOTE(review): the generated urls reset `page` and `page_after_value`, but a
        # `page_offset` argument of the original url is not reset. Confirm whether
        # `update_url_query_arguments` should also receive `page_offset=None`.
        if self.page_size:
            self.first_page_url = update_url_query_arguments(
                original_url, page=None, page_after_value=None)
        if self.next_page_after_value:
            self.next_page_url = update_url_query_arguments(
                original_url, page=None, page_after_value=self.next_page_after_value)
        if self.page and self.page > 1:
            self.prev_page_url = update_url_query_arguments(
                original_url, page=self.page - 1, page_after_value=None)

    def populate_simple_index_and_urls(self, request: Request):
        '''
        If simple, index-based pagination is used, this method can be used to populate
        the `page`, `page_after_value` and urls (if it is a GET request) automatically.
        Assumes that the field `total` is populated.

        Raises:
            HTTPException: with status 400 if the requested page is out of range.
        '''
        if not self.page_size:
            self.page = 1
            self.page_after_value = None
            self.next_page_after_value = None
        else:
            ind = self.get_simple_index()
            self.page = ind // self.page_size + 1

            # Derive both values from the index itself, not from the page number. The
            # two agree when the index is a multiple of `page_size`, but an arbitrary
            # `page_offset` starts in the middle of a page.
            self.page_after_value = None if ind == 0 else str(ind - 1)
            if ind + self.page_size >= self.total:
                self.next_page_after_value = None
            else:
                self.next_page_after_value = str(ind + self.page_size - 1)

            if self.page < 1 or (
                    self.total == 0 and self.page != 1) or (
                    self.total > 0 and (self.page - 1) * self.page_size >= self.total):
                raise HTTPException(400, detail='Page out of range requested.')

        if request.method.upper() == 'GET':
            self.populate_urls(request)
David Sikter's avatar
David Sikter committed
677
678


679
class MetadataBasedPagination(Pagination):
    # Pagination for metadata searches. The validators that the base class leaves
    # unimplemented are replaced by pass-through versions.
    order_by: Optional[str] = Field(
        default=None,
        description=strip('''
            The results are ordered by the values of this field. If omitted, default
            ordering is applied.
        '''))

    @validator('order_by')
    def validate_order_by(cls, order_by):  # pylint: disable=no-self-argument
        ''' Accepts any value; the actual validation is done during search. '''
        return order_by

    @validator('page_after_value')
    def validate_page_after_value(cls, page_after_value, values):  # pylint: disable=no-self-argument
        ''' Accepts any value; the actual validation is done during search. '''
        return page_after_value
696
697


698
class MetadataPagination(MetadataBasedPagination):
    # Pagination for metadata searches. In addition to the inherited fields it
    # supports index-based access through `page` (1-based page number) or
    # `page_offset` (number of entries to skip). Both are limited to the first
    # 10.000 entries by the validators below.
    # NOTE(review): the 10.000 limit presumably mirrors the result window of
    # the search backend — confirm.

    page: Optional[int] = Field(
        None, description=strip('''
            For simple, index-based pagination, this should contain the number of the
            requested page (1-based). When provided in a request, this attribute can be
            used instead of `page_after_value` to jump to a particular results page.

            However, you can only retreive up to the 10.000th entry with a page number.
            Only one, `page`, `page_offset` or `page_after_value`, can be used.
        '''))

    page_offset: Optional[int] = Field(
        None, description=strip('''
            Return the page that follows the given number of entries. Overwrites
            `page` and `page_after_value`.

            However, you can only retreive up to the 10.000th entry.
            Only one, `page`, `page_offset` or `page_after_value`, can be used.
        ''')
    )

    @validator('page')
    def validate_page(cls, page, values):  # pylint: disable=no-self-argument
        '''
        Accepts `None` or a page number of at least 1 whose last entry index
        (`page * page_size`) stays below 10000. Returns the page unchanged.
        '''
        if page is not None:
            # Pages are 1-based, i.e. 1 is the smallest valid value.
            assert page > 0, 'Page has to be larger than 0.'
            # `values` contains the already validated fields; if `page_size` is
            # missing there, 10 is used as fallback.
            assert page * values.get('page_size', 10) < 10000, 'Pagination by `page` is limited to 10.000 entries.'

        return page

    @validator('page_offset')
    def validate_page_offset(cls, page_offset, values):  # pylint: disable=no-self-argument
        '''
        Accepts `None` or a non-negative offset for which `page_offset + page_size`
        stays below 10000. Returns the offset unchanged.
        '''
        if page_offset is not None:
            # An offset of 0 (start at the first entry) is valid.
            assert page_offset >= 0, 'Page offset has to be 0 or larger.'
            # Same fallback of 10 for a missing `page_size` as in `validate_page`.
            assert page_offset + values.get('page_size', 10) < 10000, 'Page offset plus page size has to be smaller than 10.000.'

        return page_offset

735

736
737
# NOTE(review): presumably builds a FastAPI dependency that exposes the fields
# of `MetadataPagination` as request parameters — confirm against
# `parameter_dependency_from_model` in `.utils`.
metadata_pagination_parameters = parameter_dependency_from_model(
    'metadata_pagination_parameters', MetadataPagination)
738
739


740
class AggregationPagination(MetadataBasedPagination):
    # Pagination for aggregation buckets: `order_by` is re-declared with an
    # aggregation specific description, `page` must not be set, and
    # `page_size` has to be positive.

    order_by: Optional[str] = Field(
        None,  # type: ignore
        description=strip('''
            Either the string "count", "value", or the name of a quantity. If omitted the buckets
            will be ordered by the item "count".

            If you provide a quantity, all items
            in a bucket must have the same value for this quantity. For example, aggregating
            entries on `upload_id` and ordering with the buckets by `upload_create_time` is fine,
            because all entries in an upload have the same `upload_create_time`. The API cannot
            check this rule and the results will be unpredictable.

            If you want to order by the bucket values, you can either use "value" or use
            the aggregation quantity to `order_by`. The result will be the same, because
            the bucket values are the quantity values.
        '''))

    @validator('page')
    def validate_page(cls, page, values):  # pylint: disable=no-self-argument
        # Index based paging is rejected: any value other than None fails.
        assert page is None, 'Pagination by `page` is not possible for aggregations, use `page_after_value`'
        return page

    @validator('page_size')
    def validate_page_size(cls, page_size, values):  # pylint: disable=no-self-argument
        # Only strictly positive page sizes are accepted.
        assert page_size > 0, '0 or smaller page sizes are not allowed for aggregations.'
        return page_size

768
769
770
771

class AggregatedEntities(BaseModel):
    size: Optional[pydantic.conint(gt=0)] = Field(  # type: ignore
        1, description=strip('''
772
        The number of entries that should be returned for each value in the
773
774
775
776
777
778
779
780
        aggregation.
        '''))
    required: Optional[MetadataRequired] = Field(
        None, description=strip('''
        This allows to determined what fields should be returned for each entry.
        '''))


781
class AggregationBase(BaseModel):
    # Common base class for the aggregation models; declares no fields itself.
    pass


class QuantityAggregation(AggregationBase):
    # Base for aggregations that are computed on a single quantity.

    # Required (no default): the name of the aggregated quantity.
    quantity: str = Field(
        ..., description=strip('''
        The manatory name of the quantity for the aggregation. Aggregations
        can only be computed for those search metadata that have discrete values;
        an aggregation buckets entries that have the same value for this quantity.'''))
    # Defaults to False; see the description for the documented limitations.
    exclude_from_search: bool = Field(
        False, description=strip('''
        If set to true, top-level search criteria involving the aggregation quantity, will not
        be applied for this aggregation. Therefore, the aggregation will return all
        values for the quantity, even if the possible values where filtered by the query.

        There are two limitations. This is only supported with queries that start with a
        dictionary. It will not work for queries with a boolean operator. It can only
        exclude top-level criteria at the root of the query dictionary. Nested criteria,
        e.g. within complex and/or constructs, cannot be considered. Using this might also
        prohibit pagination with page_after_value on aggregations in the same request.
        ''')
    )
804
805


806
class BucketAggregation(QuantityAggregation):
    # Base for quantity aggregations that return buckets; adds optional
    # metrics that are reported for each bucket value.

    # List of metric names; defaults to an empty list.
    metrics: Optional[List[str]] = Field(
        [], description=strip('''
        By default the returned aggregations will provide the number of entries for each
        value. You can add more metrics. For each metric an additional number will be
        provided for each value. Metrics are also based on search metadata. Depending on
        the metric the number will represent either a sum (`calculations` for the number
        of individual calculation in each code run) or an amount of different values
        (i.e. `materials` for the amount of different material hashes).'''))
815
816
817
818


class TermsAggregation(BucketAggregation):
    # Bucket aggregation over the values of a quantity. The values are either
    # paged through (`pagination`) or limited in number (`size`); optionally
    # they can be filtered (`value_filter`) and accompanied by example
    # entries (`entries`).
    # NOTE(review): the descriptions state that only one of `pagination` and
    # `size` can be used, but no validator in this class enforces it —
    # presumably checked elsewhere; confirm.

    pagination: Optional[AggregationPagination] = Field(
        None, description=strip('''
        Only the data few values are returned for each API call. Aggregation
        pagination allows to get all available values by pagination. It also allows to
        order values.

        You can only use pagination (to page through all available values) or size (to
        get the size number of values with the most available data).
        '''))
    # Positive integer (gt=0); defaults to None.
    # NOTE(review): this description repeats the pagination wording; according
    # to the `pagination` description, `size` is the number of values to
    # return — consider rewording.
    size: Optional[pydantic.conint(gt=0)] = Field(  # type: ignore
        None, description=strip('''
        Only the data few values are returned for each API call. Pagination allows to
        get the next set of values based on the last value in the last call.
        '''))
    # The regex restricts the filter to letters, digits, `_`, `-` and whitespace.
    value_filter: Optional[pydantic.constr(regex=r'^[a-zA-Z0-9_\-\s]+$')] = Field(  # type: ignore
        None, description=strip('''
        An optional filter for values. Only values that contain the filter as substring
        will be part of the statistics.

        This is only available for non paginated aggregations.
        '''))
    entries: Optional[AggregatedEntities] = Field(
        None, description=strip('''
        Optionally, a set of entries can be returned for each value. These are basically
        example entries that have the respective bucket value.
        '''))


846
class HistogramAggregation(BucketAggregation):
    # Bucket aggregation with a numeric interval. `interval` has no default
    # and therefore is required; it must be a float greater than 0.
    interval: pydantic.confloat(gt=0)  # type: ignore


class DateHistogramAggregation(BucketAggregation):
    # Bucket aggregation with a string valued interval that defaults to '1M'.
    interval: str = Field('1M')  # type: ignore
852
853


854
class MinMaxAggregation(QuantityAggregation):
    # Quantity aggregation without any additional fields.
    pass


858
859
860
861
862
863
864
865
866
class StatisticsAggregation(AggregationBase):
    # Aggregation that is not bound to a single quantity; it only carries a
    # list of metrics.

    # List of metric names; defaults to an empty list.
    metrics: Optional[List[str]] = Field(
        [], description=strip('''
        A list of search quantities to act as metrics on all data. Depending on
        the metric the number will represent either a sum (`calculations` for the number
        of individual calculation in each code run) or an amount (cardinality) of
        different values (i.e. `materials` for the amount of different material hashes).'''))


867
class Aggregation(BaseModel):
    # Container model with one optional field per aggregation type (`terms`,
    # `histogram`, `date_histogram`, `min_max`, `statistics`); all default to
    # None. As the examples in the descriptions show, a request nests the
    # aggregation type below a user chosen aggregation name:
    # `{"aggregations": {"<name>": {"<type>": {...}}}}`.
    # The JSON examples are part of the generated API documentation, keep them
    # in sync with the field names of this model.

    terms: Optional[TermsAggregation] = Body(
        None,
        description=strip('''
            A `terms` aggregation allows to get the values of a quantity that occur in
            the search query result data. For each value, a bucket is created with
            information about how many entries have the value (or additional metrics).
            For example to get all entries that use a certain code, you can use:
            ```json
            {
                "aggregations": {
                    "all_codes": {
                        "terms": {
                            "quantity": "results.method.simulation.program_name"
                        }
                    }
                }
            }
            ```

            Terms aggregations can also be used to paginate though all values of a certain
            quantities. Each page will be companied with a `page_after_value` that
            can be used to retrieve the next value. For example to go through all datasets
            available in the search query:
            ```json
            {
                "aggregations": {
                    "all_datasets": {
                        "terms": {
                            "quantity": "datasets",
                            "pagination": {
                                "page_size": 100
                            }
                        }
                    }
                }
            }
            ```
        '''))

    histogram: Optional[HistogramAggregation] = Body(
        None,
        description=strip('''
            A `histogram` aggregation allows to get the ranges quantity values and
            how many (or another metrics) entries exhibit values in those ranges:

            ```json
            {
                "aggregations": {
                    "calculations_per_entry": {
                        "histogram": {
                            "quantity": "results.properties.n_calculations",
                            "interval": 10
                        }
                    }
                }
            }
            ```

            The used quantity must be a float or int typed quantity. An interval is
            mandatory and determines the bucket size.
        '''))

    date_histogram: Optional[DateHistogramAggregation] = Body(
        None,
        description=strip('''
            A `date_histogram` aggregation is like a normal `histogram` but for date valued
            quantities.:

            ```json
            {
                "aggregations": {
                    "upload_create_times": {
                        "date_histogram": {
                            "quantity": "upload_create_time",
                            "interval": "1M"
                        }
                    }
                }
            }
            ```

            The used quantity must be a datetime typed quantity. Intervals are strings
            that determine a time period. The default is one month, `1M`.
        '''))

    min_max: Optional[MinMaxAggregation] = Body(
        None,
        description=strip('''
            A `min_max` aggregation allows to get the minumum and maximum quantity values:

            ```json
            {
                "aggregations": {
                    "calculations_per_entry": {
                        "min_max": {
                            "quantity": "results.properties.n_calculations"
                        }
                    }
                }
            }
            ```

            The used quantity must be a float or int typed quantity.
        '''))

    statistics: Optional[StatisticsAggregation] = Body(
        None,
        description=strip('''
            A `statistics` aggregation allows to get metrics (sums or cardinalities) from all data
            that matches the search.

            ```json
            {
                "aggregations": {
                    "global": {
                        "statistics": {
                            "metrics": ["results.properties.n_calculations", "results.material.material_id"]
                        }
                    }
                }
            }
            ```
        '''))

992

993
class WithQueryAndPagination(WithQuery):
    # Extends `WithQuery` with an optional `pagination` (defaults to None).
    # The `example` value is passed to `Body` as example data for this field.
    pagination: Optional[MetadataPagination] = Body(
        None,
        example={
            'page_size': 5,
            'order_by': 'upload_create_time'
        })