repo.py 14.3 KB
Newer Older
Markus Scheidgen's avatar
Markus Scheidgen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
The repository API of the nomad@FAIRDI APIs. Currently allows resolving repository
metadata.
"""

20
from typing import List
Markus Scheidgen's avatar
Markus Scheidgen committed
21
from flask_restplus import Resource, abort, fields
22
from flask import request, g
23
from elasticsearch.exceptions import NotFoundError
Markus Scheidgen's avatar
Markus Scheidgen committed
24

25
from nomad import search, utils
Markus Scheidgen's avatar
Markus Scheidgen committed
26

Markus Scheidgen's avatar
Markus Scheidgen committed
27
from .app import api, rfc3339DateTime
28
from .auth import login_if_available
Markus Scheidgen's avatar
Markus Scheidgen committed
29
30
from .common import pagination_model, pagination_request_parser, calc_route

31
ns = api.namespace('repo', description='Access repository metadata.')
Markus Scheidgen's avatar
Markus Scheidgen committed
32
33
34
35
36


@calc_route(ns)
class RepoCalcResource(Resource):
    @api.response(404, 'The upload or calculation does not exist')
    @api.response(401, 'Not authorized to access the calculation')
    @api.response(200, 'Metadata send', fields.Raw)
    @api.doc('get_repo_calc')
    @login_if_available
    def get(self, upload_id, calc_id):
        """
        Get calculation metadata in repository form.

        Repository metadata only entails the quantities shown in the repository.
        Calcs are referenced via *upload_id*, *calc_id* pairs; the lookup itself
        only uses *calc_id*. Entries that are under embargo or not published are
        only returned to the user with id ``0`` or to one of the entry's owners.
        """
        ids = (upload_id, calc_id)

        try:
            calc = search.Entry.get(calc_id)
        except NotFoundError:
            abort(404, message='There is no calculation %s/%s' % ids)

        # Published entries without embargo are visible to everybody.
        if not calc.with_embargo and calc.published:
            return calc.to_dict(), 200

        user = g.user
        if user is None:
            abort(401, message='Not logged in to access %s/%s.' % ids)

        # At some point ids will be emails (strings) anyways. Right now it is hard
        # to make sure that both are either str or int, hence the str comparison.
        has_access = user.user_id == 0 or any(
            str(owner.user_id) == str(user.user_id) for owner in calc.owners)
        if not has_access:
            abort(401, message='Not authorized to access %s/%s.' % ids)

        return calc.to_dict(), 200
Markus Scheidgen's avatar
Markus Scheidgen committed
69
70
71


# Response model of the repository search endpoint. ``pagination`` and ``scroll``
# are both nullable; ``results`` carries the hits and ``statistics`` the
# aggregated metrics.
repo_calcs_model = api.model('RepoCalculations', {
    'pagination': fields.Nested(pagination_model, allow_null=True),
    'scroll': fields.Nested(allow_null=True, skip_none=True, model=api.model('Scroll', {
        'total': fields.Integer(description='The total amount of hits for the search.'),
        'scroll_id': fields.String(allow_null=True, description='The scroll_id that can be used to retrieve the next page.'),
        # Fields take ``description``; the former ``help`` keyword was silently
        # ignored, so this text never showed up in the generated API docs.
        'size': fields.Integer(description='The size of the returned scroll page.')})),
    'results': fields.List(fields.Raw, description=(
        'A list of search results. Each result is a dict with quantitie names as key and '
        'values as values')),
    'statistics': fields.Raw(description=(
        'A dict with all statistics. Each statistic is dictionary with a metrics dict as '
        'value and quantity value as key. The possible metrics are code runs(calcs), %s. '
        'There is a pseudo quantity "total" with a single value "all" that contains the '
        ' metrics over all results. ' % ', '.join(search.metrics_names)))
})

87

88
89
90
91
92
# Response model identifying a single calculation by its upload and calc id.
repo_calc_id_model = api.model('RepoCalculationId', {
    'upload_id': fields.String(),
    'calc_id': fields.String(),
})


93
94
95
96
97
98
99
100
101
102
103
def add_common_parameters(request_parser):
    """
    Register the arguments shared by the repository search endpoints.

    Adds ``owner``, ``from_time`` and ``until_time`` as well as one argument for
    each entry of ``search.search_quantities`` to the given parser. The parser
    is modified in place.
    """
    # which entries to consider
    request_parser.add_argument(
        'owner',
        help='Specify which calcs to return: ``all``, ``public``, ``user``, ``staging``, default is ``all``',
        type=str)

    # time range, both bounds are parsed via rfc3339DateTime
    request_parser.add_argument(
        'from_time',
        help='A yyyy-MM-ddTHH:mm:ss (RFC3339) minimum entry time (e.g. upload time)',
        type=lambda x: rfc3339DateTime.parse(x))
    request_parser.add_argument(
        'until_time',
        help='A yyyy-MM-ddTHH:mm:ss (RFC3339) maximum entry time (e.g. upload time)',
        type=lambda x: rfc3339DateTime.parse(x))

    # one argument per search quantity; multi quantities collect repeated values
    for quantity in search.search_quantities.values():
        action = None
        if quantity.multi:
            action = 'append'

        request_parser.add_argument(
            quantity.name, action=action, help=quantity.description)
108
109


Markus Scheidgen's avatar
Markus Scheidgen committed
110
# Parser for the repository search: the pagination arguments extended by the
# common search arguments and the search specific options below.
repo_request_parser = pagination_request_parser.copy()
add_common_parameters(repo_request_parser)

# scrolling as an alternative to pagination
repo_request_parser.add_argument('scroll', help='Enable scrolling', type=bool)
repo_request_parser.add_argument(
    'scroll_id', help='The id of the current scrolling window to use.', type=str)

# optional aggregations
repo_request_parser.add_argument(
    'date_histogram', help='Add an additional aggregation over the upload time',
    type=bool)
repo_request_parser.add_argument(
    'metrics', action='append', type=str,
    help=(
        'Metrics to aggregate over all quantities and their values as comma separated list. '
        'Possible values are %s.' % ', '.join(search.metrics_names)))


# Parser that only carries the common search arguments.
search_request_parser = api.parser()
add_common_parameters(search_request_parser)


128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
def add_query(search_request: search.SearchRequest):
    """
    Helper that adds the query relevant request parameters to the given SearchRequest.

    Reads ``owner``, ``from_time``, ``until_time`` and all parameters named after a
    search quantity from the query string of the current flask request and applies
    them to ``search_request`` in place.

    Aborts with HTTP 401 if setting the owner raises a ``ValueError``, and with
    HTTP 400 if it fails otherwise or if a time parameter cannot be processed.
    """
    # owner
    try:
        search_request.owner(
            request.args.get('owner', 'all'),
            g.user.user_id if g.user is not None else None)
    except ValueError as e:
        abort(401, getattr(e, 'message', 'Invalid owner parameter'))
    except Exception as e:
        abort(400, getattr(e, 'message', 'Invalid owner parameter'))

    # time range
    from_time_str = request.args.get('from_time', None)
    until_time_str = request.args.get('until_time', None)

    try:
        from_time = rfc3339DateTime.parse(from_time_str) if from_time_str is not None else None
        until_time = rfc3339DateTime.parse(until_time_str) if until_time_str is not None else None
        search_request.time_range(start=from_time, end=until_time)
    except Exception:
        abort(400, message='bad datetime format')

    # search parameter
    # Only quantities flagged as ``multi`` take all given values as a list; the
    # others take a single value. The quantity object itself is always truthy,
    # so the flag has to be tested explicitly.
    search_request.search_parameters(**{
        key: (
            request.args.getlist(key)
            if search.search_quantities[key].multi
            else request.args.get(key))
        for key in request.args.keys()
        if key in search.search_quantities})
158
159


Markus Scheidgen's avatar
Markus Scheidgen committed
160
161
def _parse_flag(value) -> bool:
    """
    Interpret the raw string of a boolean query parameter.

    A missing (``None``) or empty value and the case-insensitive literals
    ``false`` and ``0`` yield ``False``; every other value yields ``True``.
    A plain ``bool(value)`` would treat ``'false'`` as ``True``.
    """
    if value is None:
        return False

    return value.strip().lower() not in ('', 'false', '0')


@ns.route('/')
class RepoCalcsResource(Resource):
    @api.doc('search')
    @api.response(400, 'Invalid requests, e.g. wrong owner type or bad search parameters')
    @api.expect(repo_request_parser, validate=True)
    @api.marshal_with(repo_calcs_model, skip_none=True, code=200, description='Search results send')
    @login_if_available
    def get(self):
        """
        Search for calculations in the repository form, paginated.

        The ``owner`` parameter determines the overall entries to search through.
        Possible values are: ``all`` (show all entries visible to the current user), ``public``
        (show all publicly visible entries), ``user`` (show all user entries, requires login),
        ``staging`` (show all user entries in staging area, requires login).

        You can use the various quantities to search/filter for. For some of the
        indexed quantities this endpoint returns aggregation information. This means
        you will be given a list of all possible values and the number of entries
        that have the certain value. You can also use these aggregations on an empty
        search to determine the possible values.

        The pagination parameters allow to determine which page to return via the
        ``page`` and ``per_page`` parameters. Pagination however, is limited to the first
        100k (depending on ES configuration) hits.

        An alternative to pagination is to use ``scroll`` and ``scroll_id``. With ``scroll``
        you will get a ``scroll_id`` on the first request. Each call with ``scroll`` and
        the respective ``scroll_id`` will return the next ``per_page`` (here the default is 1000)
        results. Scroll however, ignores ordering and does not return aggregations.
        The scroll view used in the background will stay alive for 1 minute between requests.
        If the given ``scroll_id`` is not available anymore, a HTTP 400 is raised.

        The search will return aggregations on a predefined set of quantities. Aggregations
        will tell you what quantity values exist and how many entries match those values.

        Ordering is determined by ``order_by`` and ``order`` parameters.
        """

        search_request = search.SearchRequest()
        add_query(search_request)

        # Boolean flags are parsed explicitly, so that e.g. ``scroll=false``
        # really disables scrolling.
        scroll = _parse_flag(request.args.get('scroll'))
        scroll_id = request.args.get('scroll_id', None)
        order_by = request.args.get('order_by', 'formula')
        metrics: List[str] = request.args.getlist('metrics')

        # Only the int conversions can fail on malformed client input.
        try:
            page = int(request.args.get('page', 1))
            per_page = int(request.args.get('per_page', 10 if not scroll else 1000))
            order = int(request.args.get('order', -1))
        except ValueError:
            abort(400, message='bad parameter types')

        # Explicit checks instead of asserts, which are removed when running with -O.
        if page < 1 or per_page <= 0:
            abort(400, message='invalid pagination')

        if order not in [-1, 1]:
            abort(400, message='invalid pagination')

        if _parse_flag(request.args.get('date_histogram')):
            search_request.date_histogram()

        for metric in metrics:
            if metric not in search.metrics_names:
                abort(400, message='there is not metric %s' % metric)
        search_request.statistics(metrics_to_use=metrics)

        try:
            if scroll:
                results = search_request.execute_scrolled(scroll_id=scroll_id, size=per_page)

            else:
                results = search_request.execute_paginated(
                    per_page=per_page, page=page, order=order, order_by=order_by)

                # TODO just a work around to make things prettier
                statistics = results['statistics']
                if 'code_name' in statistics and 'currupted mainfile' in statistics['code_name']:
                    del(statistics['code_name']['currupted mainfile'])

            return results, 200
        except search.ScrollIdNotFound:
            abort(400, 'The given scroll_id does not exist.')
        except KeyError as e:
            import traceback
            traceback.print_exc()
            abort(400, str(e))
251
252
253


# Response model of the quantity values endpoint: a single nullable ``quantity``
# entry that holds the values and the ``after`` key for the next request.
repo_quantity_values_model = api.model('RepoQuantityValues', {
    'quantity': fields.Nested(
        api.model('RepoQuantity', {
            'after': fields.String(
                description='The after value that can be used to retrieve the next set of values.'),
            'values': fields.Raw(
                description='A dict with values as key and entry count as values.'),
        }),
        allow_null=True),
})

# Parser for the quantity values endpoint: the common search arguments plus the
# ``after``/``size`` arguments of the "after" key based scrolling.
repo_quantity_search_request_parser = api.parser()
add_common_parameters(repo_quantity_search_request_parser)
repo_quantity_search_request_parser.add_argument(
    'after', type=str, help='The after value to use for "scrolling".')
# ``size`` belongs to this parser; it was registered on ``repo_request_parser``
# before, which left it out of the arguments expected by the quantity endpoint.
repo_quantity_search_request_parser.add_argument(
    'size', type=int, help='The max size of the returned values.')


@ns.route('/<string:quantity>')
class RepoQuantityResource(Resource):
    @api.doc('quantity_search')
    @api.response(400, 'Invalid requests, e.g. wrong owner type, bad quantity, bad search parameters')
    @api.expect(repo_quantity_search_request_parser, validate=True)
    @api.marshal_with(repo_quantity_values_model, skip_none=True, code=200, description='Search results send')
    @login_if_available
    def get(self, quantity: str):
        """
        Retrieve quantity values from entries matching the search.

        You can use the various quantities to search/filter for. For some of the
        indexed quantities this endpoint returns aggregation information. This means
        you will be given a list of all possible values and the number of entries
        that have the certain value. You can also use these aggregations on an empty
        search to determine the possible values.

        There is no ordering and no pagination. Instead there is an 'after' key based
        scrolling. The result will contain an 'after' value, that can be specified
        for the next request. You can use the 'size' and 'after' parameters accordingly.

        The result will contain a 'quantity' key with quantity values and the "after"
        value. There will be up to 'size' many values. For the rest of the values use the
        "after" parameter in another request.
        """

        search_request = search.SearchRequest()
        add_query(search_request)

        after = request.args.get('after', None)

        # Only the int conversion can fail on malformed client input.
        try:
            size = int(request.args.get('size', 100))
        except ValueError:
            abort(400, message='bad parameter types')

        # Explicit check instead of an assert, which is removed when running with -O.
        if size < 0:
            abort(400, message='invalid size')

        search_request.quantity(quantity, size=size, after=after)

        try:
            results = search_request.execute()
            quantities = results.pop('quantities')
            # A missing key is reported to the client as an unknown quantity.
            results['quantity'] = quantities[quantity]

            return results, 200
        except KeyError as e:
            import traceback
            traceback.print_exc()
            abort(400, 'Given quantity does not exist: %s' % str(e))
320
321
322
323
324
325
326
327
328


@ns.route('/pid/<int:pid>')
class RepoPidResource(Resource):
    @api.doc('resolve_pid')
    @api.response(404, 'Entry with PID does not exist')
    @api.marshal_with(repo_calc_id_model, skip_none=True, code=200, description='Entry resolved')
    @login_if_available
    def get(self, pid: int):
        """
        Resolve the given PID to the upload_id and calc_id of its entry.

        The search covers the ``all`` owner scope, passing the id of the current
        user if there is one. If more than one entry matches, an error is logged
        and the first match is returned.
        """
        search_request = search.SearchRequest()

        user = g.user
        if user is None:
            search_request.owner('all')
        else:
            search_request.owner('all', user_id=user.user_id)

        search_request.search_parameter('pid', pid)

        matches = list(search_request.execute_scan())

        if not matches:
            abort(404, 'Entry with PID %d does not exist' % pid)

        if len(matches) > 1:
            utils.get_logger(__name__).error('Two entries for the same pid', pid=pid)

        first = matches[0]
        return {
            'upload_id': first['upload_id'],
            'calc_id': first['calc_id'],
        }