# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
The repository API of the nomad@FAIRDI APIs. Currently allows to resolve repository
meta-data.
"""

20
from typing import List
Markus Scheidgen's avatar
Markus Scheidgen committed
21
from flask_restplus import Resource, abort, fields
22
from flask import request, g
23
from elasticsearch.exceptions import NotFoundError
Markus Scheidgen's avatar
Markus Scheidgen committed
24

25
from nomad import search, utils, datamodel
Markus Scheidgen's avatar
Markus Scheidgen committed
26
from nomad.app.utils import rfc3339DateTime
27
from nomad.app.optimade import filterparser
Markus Scheidgen's avatar
Markus Scheidgen committed
28

Markus Scheidgen's avatar
Markus Scheidgen committed
29
from .api import api
30
from .auth import authenticate
Markus Scheidgen's avatar
Markus Scheidgen committed
31
32
from .common import pagination_model, pagination_request_parser, calc_route

33
ns = api.namespace('repo', description='Access repository metadata.')
Markus Scheidgen's avatar
Markus Scheidgen committed
34
35
36
37
38


@calc_route(ns)
class RepoCalcResource(Resource):
    @api.response(404, 'The upload or calculation does not exist')
    @api.response(401, 'Not authorized to access the calculation')
    @api.response(200, 'Metadata send', fields.Raw)
    @api.doc('get_repo_calc')
    @authenticate()
    def get(self, upload_id, calc_id):
        """
        Get calculation metadata in repository form.

        Repository metadata only entails the quantities shown in the repository.
        Calcs are referenced via *upload_id*, *calc_id* pairs.
        """
        ids = (upload_id, calc_id)

        try:
            entry = search.Entry.get(calc_id)
        except NotFoundError:
            abort(404, message='There is no calculation %s/%s' % ids)

        # Embargoed or unpublished entries are only served to one of their owners
        # or to an admin; everything else is served without further checks.
        restricted = entry.with_embargo or not entry.published
        if restricted:
            user = g.user
            if user is None:
                abort(401, message='Not logged in to access %s/%s.' % ids)

            is_owner = any(user.user_id == owner.user_id for owner in entry.owners)
            if not is_owner and not user.is_admin:
                abort(401, message='Not authorized to access %s/%s.' % ids)

        return entry.to_dict(), 200
Markus Scheidgen's avatar
Markus Scheidgen committed
63
64
65


# Response model of the search endpoint. All parts are optional; the endpoint
# marshals with ``skip_none=True``, so parts without values are left out.
repo_calcs_model = api.model('RepoCalculations', {
    'pagination': fields.Nested(pagination_model, skip_none=True),
    'scroll': fields.Nested(allow_null=True, skip_none=True, model=api.model('Scroll', {
        'total': fields.Integer(description='The total amount of hits for the search.'),
        'scroll_id': fields.String(allow_null=True, description='The scroll_id that can be used to retrieve the next page.'),
        # ``description`` is the keyword the fields use for documentation; the
        # previously used ``help`` is not a field option and was ignored.
        'size': fields.Integer(description='The size of the returned scroll page.')})),
    'results': fields.List(fields.Raw, description=(
        'A list of search results. Each result is a dict with quantitie names as key and '
        'values as values')),
    'statistics': fields.Raw(description=(
        'A dict with all statistics. Each statistic is dictionary with a metrics dict as '
        'value and quantity value as key. The possible metrics are code runs(calcs), %s. '
        'There is a pseudo quantity "total" with a single value "all" that contains the '
        ' metrics over all results. ' % ', '.join(datamodel.Domain.instance.metrics_names))),
    # A sub-model has to be wrapped in ``fields.Nested``. The first positional
    # argument of ``fields.Raw`` is its *default*, so the model object itself
    # would be used as the value whenever ``datasets`` is missing.
    'datasets': fields.Nested(api.model('RepoDatasets', {
        'after': fields.String(description='The after value that can be used to retrieve the next datasets.'),
        'values': fields.Raw(description='A dict with names as key. The values are dicts with "total" and "examples" keys.')
    }), skip_none=True)
})

85

86
87
# Response model for a resolved entry: the upload_id/calc_id pair that identifies it.
repo_calc_id_model = api.model('RepoCalculationId', dict(
    upload_id=fields.String(),
    calc_id=fields.String()))

90
91
92
93
94
95
96
97
98
99
100
101

def add_common_parameters(request_parser):
    """
    Registers the query arguments shared by the search endpoints on the given parser.

    Adds ``owner``, the ``from_time``/``until_time`` bounds (converted with
    ``rfc3339DateTime.parse``), and one argument for each value in ``search.quantities``.

    Arguments:
        request_parser: The request parser to extend. It is modified in place.
    """
    request_parser.add_argument(
        'owner', type=str,
        help='Specify which calcs to return: ``all``, ``public``, ``user``, ``staging``, default is ``all``')

    time_arguments = (
        ('from_time', 'A yyyy-MM-ddTHH:mm:ss (RFC3339) minimum entry time (e.g. upload time)'),
        ('until_time', 'A yyyy-MM-ddTHH:mm:ss (RFC3339) maximum entry time (e.g. upload time)'))
    for name, description in time_arguments:
        request_parser.add_argument(
            name, type=lambda x: rfc3339DateTime.parse(x), help=description)

    for quantity in search.quantities.values():
        # only multi valued quantities get their own argparse action
        action = quantity.argparse_action if quantity.multi else None
        request_parser.add_argument(
            quantity.name, help=quantity.description, action=action)
106
107


Markus Scheidgen's avatar
Markus Scheidgen committed
108
# Parser for the search endpoint: a copy of the pagination parser, extended by the
# common search arguments and the endpoint specific arguments listed below.
repo_request_parser = pagination_request_parser.copy()
add_common_parameters(repo_request_parser)
for _argument, _options in (
        ('scroll', dict(type=bool, help='Enable scrolling')),
        ('scroll_id', dict(type=str, help='The id of the current scrolling window to use.')),
        ('date_histogram', dict(type=bool, help='Add an additional aggregation over the upload time')),
        ('datasets_after', dict(type=str, help='The last dataset id of the last scroll window for the dataset quantitiy')),
        ('metrics', dict(type=str, action='append', help=(
            'Metrics to aggregate over all quantities and their values as comma separated list. '
            'Possible values are %s.' % ', '.join(datamodel.Domain.instance.metrics_names)))),
        ('datasets', dict(type=bool, help='Return dataset information.')),
        ('statistics', dict(type=bool, help='Return statistics.'))):
    repo_request_parser.add_argument(_argument, **_options)
del _argument, _options


# Parser that only knows the common search arguments.
search_request_parser = api.parser()
add_common_parameters(search_request_parser)


132
def add_query(search_request: search.SearchRequest, parser=repo_request_parser):
    """
    Helper that transfers the query relevant request parameters to the given SearchRequest.

    The arguments are read with ``parser.parse_args()``; arguments without a value
    are ignored. Owner, time range, optimade filter, and all arguments that are
    search quantities are applied to ``search_request`` in that order.

    Arguments:
        search_request: The search request that is modified in place.
        parser: The request parser used to read the current request.

    Aborts the request with 401 if setting the owner raises a ``ValueError`` and
    with 400 on any other owner error, a bad time range, or an unparsable optimade filter.
    """
    args = {}
    for name, value in parser.parse_args().items():
        if value is not None:
            args[name] = value

    # owner
    # NOTE(review): built-in exceptions carry no ``message`` attribute, so for them
    # the getattr calls below fall back to the generic text.
    try:
        owner = args.get('owner', 'all')
        user_id = None if g.user is None else g.user.user_id
        search_request.owner(owner, user_id)
    except ValueError as e:
        abort(401, getattr(e, 'message', 'Invalid owner parameter'))
    except Exception as e:
        abort(400, getattr(e, 'message', 'Invalid owner parameter'))

    # time range
    try:
        start = args.get('from_time')
        if start is not None:
            start = rfc3339DateTime.parse(start)

        end = args.get('until_time')
        if end is not None:
            end = rfc3339DateTime.parse(end)

        search_request.time_range(start=start, end=end)
    except Exception:
        abort(400, message='bad datetime format')

    # optimade
    try:
        if 'optimade' in args:
            search_request.query(filterparser.parse_filter(args['optimade']))
    except filterparser.FilterException:
        abort(400, message='could not parse optimade query')

    # search parameter: everything that is a search quantity, except the optimade
    # filter, which was already applied as a query above
    quantity_values = {}
    for name, value in args.items():
        if name != 'optimade' and name in search.quantities:
            quantity_values[name] = value
    search_request.search_parameters(**quantity_values)
172
173


Markus Scheidgen's avatar
Markus Scheidgen committed
174
175
def _request_flag(name: str) -> bool:
    """
    Reads the boolean query parameter ``name`` from the current request.

    A missing or empty parameter and the values ``0`` and ``false`` (in any case)
    yield ``False``; every other value yields ``True``. A plain ``bool(value)`` must
    not be used here, because every non-empty string, including ``'false'``, is truthy.
    """
    value = request.args.get(name)
    if value is None:
        return False
    return value.strip().lower() not in ('', '0', 'false')


@ns.route('/')
class RepoCalcsResource(Resource):
    @api.doc('search')
    @api.response(400, 'Invalid requests, e.g. wrong owner type or bad search parameters')
    @api.expect(repo_request_parser, validate=True)
    @api.marshal_with(repo_calcs_model, skip_none=True, code=200, description='Search results send')
    @authenticate()
    def get(self):
        """
        Search for calculations in the repository form, paginated.

        The ``owner`` parameter determines the overall entries to search through.
        Possible values are: ``all`` (show all entries visible to the current user), ``public``
        (show all publicly visible entries), ``user`` (show all user entries, requires login),
        ``staging`` (show all user entries in staging area, requires login).

        You can use the various quantities to search/filter for. For some of the
        indexed quantities this endpoint returns aggregation information. This means
        you will be given a list of all possible values and the number of entries
        that have the certain value. You can also use these aggregations on an empty
        search to determine the possible values.

        The pagination parameters allow to determine which page to return via the
        ``page`` and ``per_page`` parameters. Pagination however, is limited to the first
        100k (depending on ES configuration) hits.

        An alternative to pagination is to use ``scroll`` and ``scroll_id``. With ``scroll``
        you will get a ``scroll_id`` on the first request. Each call with ``scroll`` and
        the respective ``scroll_id`` will return the next ``per_page`` (here the default is 1000)
        results. Scroll however, ignores ordering and does not return aggregations.
        The scroll view used in the background will stay alive for 1 minute between requests.
        If the given ``scroll_id`` is not available anymore, a HTTP 400 is raised.

        The search will return aggregations on a predefined set of quantities. Aggregations
        will tell you what quantity values exist and how many entries match those values.

        Ordering is determined by ``order_by`` and ``order`` parameters.
        """

        search_request = search.SearchRequest()
        add_query(search_request, repo_request_parser)

        try:
            scroll = _request_flag('scroll')
            scroll_id = request.args.get('scroll_id', None)
            page = int(request.args.get('page', 1))
            per_page = int(request.args.get('per_page', 10 if not scroll else 1000))
            order = int(request.args.get('order', -1))
            order_by = request.args.get('order_by', 'formula')

            if _request_flag('date_histogram'):
                search_request.date_histogram()
            metrics: List[str] = request.args.getlist('metrics')

            with_datasets = _request_flag('datasets')
            with_statistics = _request_flag('statistics')
        except Exception:
            abort(400, message='bad parameter types')

        # Explicit check instead of ``assert``: assertions are removed when Python
        # runs with -O and would then let invalid pagination values through.
        if page < 1 or per_page <= 0:
            abort(400, message='invalid pagination')

        if order not in [-1, 1]:
            abort(400, message='invalid pagination')

        for metric in metrics:
            if metric not in search.metrics_names:
                abort(400, message='there is no metric %s' % metric)

        if with_statistics:
            search_request.default_statistics(metrics_to_use=metrics)
            # the totals always include the datasets metric
            if 'datasets' not in metrics:
                total_metrics = metrics + ['datasets']
            else:
                total_metrics = metrics
            search_request.totals(metrics_to_use=total_metrics)
            search_request.statistic('authors', 1000)

        try:
            if scroll:
                results = search_request.execute_scrolled(scroll_id=scroll_id, size=per_page)

            else:
                if with_datasets:
                    search_request.quantity(
                        'dataset_id', size=per_page, examples=1,
                        after=request.args.get('datasets_after', None))

                results = search_request.execute_paginated(
                    per_page=per_page, page=page, order=order, order_by=order_by)

                # TODO just a work around to make things prettier
                if with_statistics:
                    statistics = results['statistics']
                    if 'code_name' in statistics and 'currupted mainfile' in statistics['code_name']:
                        del statistics['code_name']['currupted mainfile']

                if with_datasets:
                    # expose the dataset_id quantity under the 'datasets' key of the response
                    datasets = results.pop('quantities')['dataset_id']
                    results['datasets'] = datasets

            return results, 200
        except search.ScrollIdNotFound:
            abort(400, 'The given scroll_id does not exist.')
        except KeyError as e:
            import traceback
            traceback.print_exc()
            abort(400, str(e))
285
286
287


# Values of a single quantity plus the "after" key used to request the next values.
_repo_quantity_model = api.model('RepoQuantity', {
    'after': fields.String(description='The after value that can be used to retrieve the next set of values.'),
    'values': fields.Raw(description='A dict with values as key. Values are dicts with "total" and "examples" keys.')
})

# Response model of the quantity endpoint.
repo_quantity_values_model = api.model('RepoQuantityValues', {
    'quantity': fields.Nested(_repo_quantity_model, allow_null=True)
})

# Parser for the quantity endpoint: the common search arguments plus the
# ``after``/``size`` arguments that control the key based scrolling.
repo_quantity_search_request_parser = api.parser()
add_common_parameters(repo_quantity_search_request_parser)
repo_quantity_search_request_parser.add_argument(
    'after', type=str, help='The after value to use for "scrolling".')
# ``size`` belongs to this parser; it used to be registered on ``repo_request_parser``,
# so the quantity endpoint that reads it did not declare it.
repo_quantity_search_request_parser.add_argument(
    'size', type=int, help='The max size of the returned values.')


@ns.route('/<string:quantity>')
class RepoQuantityResource(Resource):
    @api.doc('quantity_search')
    @api.response(400, 'Invalid requests, e.g. wrong owner type, bad quantity, bad search parameters')
    @api.expect(repo_quantity_search_request_parser, validate=True)
    @api.marshal_with(repo_quantity_values_model, skip_none=True, code=200, description='Search results send')
    @authenticate()
    def get(self, quantity: str):
        """
        Retrieve quantity values from entries matching the search.

        You can use the various quantities to search/filter for. For some of the
        indexed quantities this endpoint returns aggregation information. This means
        you will be given a list of all possible values and the number of entries
        that have the certain value. You can also use these aggregations on an empty
        search to determine the possible values.

        There is no ordering and no pagination. Instead there is an 'after' key based
        scrolling. The result will contain an 'after' value, that can be specified
        for the next request. You can use the 'size' and 'after' parameters accordingly.

        The result will contain a 'quantity' key with quantity values and the "after"
        value. There will be up to 'size' many values. For the rest of the values use the
        "after" parameter in another request.
        """

        search_request = search.SearchRequest()
        add_query(search_request, repo_quantity_search_request_parser)

        try:
            after = request.args.get('after', None)
            size = int(request.args.get('size', 100))
        except (TypeError, ValueError):
            # only the int conversion can fail here
            abort(400, message='bad parameter types')

        # Explicit check instead of ``assert``: assertions are removed when Python
        # runs with -O and would then let a negative size through.
        if size < 0:
            abort(400, message='invalid size')

        try:
            # Registering the quantity is part of the guarded section, so that a
            # KeyError for an unknown quantity is reported as a 400, no matter
            # whether it occurs here or when reading the results.
            search_request.quantity(quantity, size=size, after=after)
            results = search_request.execute()
            quantities = results.pop('quantities')
            results['quantity'] = quantities[quantity]

            return results, 200
        except KeyError as e:
            import traceback
            traceback.print_exc()
            abort(400, 'Given quantity does not exist: %s' % str(e))
354
355
356
357
358
359
360


@ns.route('/pid/<int:pid>')
class RepoPidResource(Resource):
    @api.doc('resolve_pid')
    @api.response(404, 'Entry with PID does not exist')
    @api.marshal_with(repo_calc_id_model, skip_none=True, code=200, description='Entry resolved')
    @authenticate()
    def get(self, pid: int):
        """
        Resolve the given PID to the upload_id and calc_id of the respective entry.

        The search uses the ``all`` owner, with the current user if there is one.
        If there is more than one entry with the PID, an error is logged and the
        first entry is returned.
        """
        search_request = search.SearchRequest()

        if g.user is None:
            search_request.owner('all')
        else:
            search_request.owner('all', user_id=g.user.user_id)

        search_request.search_parameter('pid', pid)

        matches = list(search_request.execute_scan())

        if not matches:
            abort(404, 'Entry with PID %d does not exist' % pid)

        if len(matches) > 1:
            utils.get_logger(__name__).error('Two entries for the same pid', pid=pid)

        first = matches[0]
        return {
            'upload_id': first['upload_id'],
            'calc_id': first['calc_id']}