repo.py 16.2 KB
Newer Older
Markus Scheidgen's avatar
Markus Scheidgen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
The repository API of the nomad@FAIRDI APIs. Currently allows clients to resolve and
search repository meta-data.
"""

20
from typing import List
Markus Scheidgen's avatar
Markus Scheidgen committed
21
from flask_restplus import Resource, abort, fields
22
23
from flask import request, g
from elasticsearch_dsl import Q
24
from elasticsearch.exceptions import NotFoundError
Markus Scheidgen's avatar
Markus Scheidgen committed
25
import datetime
Markus Scheidgen's avatar
Markus Scheidgen committed
26

27
from nomad import search, utils
Markus Scheidgen's avatar
Markus Scheidgen committed
28

Markus Scheidgen's avatar
Markus Scheidgen committed
29
from .app import api, rfc3339DateTime
30
from .auth import login_if_available
Markus Scheidgen's avatar
Markus Scheidgen committed
31
32
from .common import pagination_model, pagination_request_parser, calc_route

33
# Namespace for the repository resources of this module; the resource classes
# are attached to it via ``@ns.route(...)`` and ``@calc_route(ns)``.
ns = api.namespace('repo', description='Access repository metadata.')
Markus Scheidgen's avatar
Markus Scheidgen committed
34
35
36
37
38


@calc_route(ns)
class RepoCalcResource(Resource):
    # Serves the repository metadata of a single calculation. Entries that are
    # published and not embargoed are returned to anybody; all other entries
    # require a logged-in user that is either one of the owners or has user_id 0
    # (presumably the admin account -- TODO confirm).
    @api.response(404, 'The upload or calculation does not exist')
    @api.response(401, 'Not authorized to access the calculation')
    @api.response(200, 'Metadata send', fields.Raw)
    @api.doc('get_repo_calc')
    @login_if_available
    def get(self, upload_id, calc_id):
        """
        Get calculation metadata in repository form.

        Repository metadata only entails the quantities shown in the repository.
        Calcs are references via *upload_id*, *calc_id* pairs.
        """
        try:
            entry = search.Entry.get(calc_id)
        except NotFoundError:
            abort(404, message='There is no calculation %s/%s' % (upload_id, calc_id))

        # Freely visible entries need no further checks.
        if not entry.with_embargo and entry.published:
            return entry.to_dict(), 200

        user = g.user
        if user is None:
            abort(401, message='Not logged in to access %s/%s.' % (upload_id, calc_id))

        has_access = user.user_id == 0
        if not has_access:
            # At somepoint ids will be emails (strings) anyways.
            # Right now it is hard to make sure that both are either str or int.
            current_user_id = str(user.user_id)
            has_access = any(
                str(owner.user_id) == current_user_id for owner in entry.owners)

        if not has_access:
            abort(401, message='Not authorized to access %s/%s.' % (upload_id, calc_id))

        return entry.to_dict(), 200
Markus Scheidgen's avatar
Markus Scheidgen committed
71
72
73


# Response model of the search endpoint: pagination or scroll information, the
# list of matching entries, and the aggregations over the searched quantities.
repo_calcs_model = api.model('RepoCalculations', {
    'pagination': fields.Nested(pagination_model, allow_null=True),
    'scroll': fields.Nested(allow_null=True, skip_none=True, model=api.model('Scroll', {
        'total': fields.Integer(description='The total amount of hits for the search.'),
        'scroll_id': fields.String(allow_null=True, description='The scroll_id that can be used to retrieve the next page.'),
        # Fields take their documentation via ``description``; the former ``help``
        # keyword was swallowed as an unknown kwarg and never showed up in the docs.
        'size': fields.Integer(description='The size of the returned scroll page.')})),
    'results': fields.List(fields.Raw, description=(
        'A list of search results. Each result is a dict with quantitie names as key and '
        'values as values')),
    'quantities': fields.Raw(description=(
        'A dict with all aggregations. Each aggregation is dictionary with a metrics dict as '
        'value and quantity value as key. The metrics are code runs(calcs), %s. '
        'There is a pseudo quantity "total" with a single value "all" that contains the metrics over all results. ' %
        ', '.join(search.metrics_names)))
})

89

90
91
92
93
94
# Response model holding the upload_id/calc_id pair that identifies one entry.
repo_calc_id_model = api.model('RepoCalculationId', {
    'upload_id': fields.String(),
    'calc_id': fields.String(),
})


95
96
97
98
99
100
101
102
103
104
105
def add_common_parameters(request_parser):
    """
    Register the query arguments shared by the search-like endpoints.

    Adds ``owner``, ``from_time``, ``until_time`` and one argument for each entry
    of ``search.search_quantities`` to *request_parser*. Quantities flagged as
    ``multi`` are registered with ``action='append'``. The parser is modified in
    place; nothing is returned.
    """
    request_parser.add_argument(
        'owner', type=str,
        help='Specify which calcs to return: ``all``, ``public``, ``user``, ``staging``, default is ``all``')

    time_arguments = (
        ('from_time', 'A yyyy-MM-ddTHH:mm:ss (RFC3339) minimum entry time (e.g. upload time)'),
        ('until_time', 'A yyyy-MM-ddTHH:mm:ss (RFC3339) maximum entry time (e.g. upload time)'))
    for argument_name, argument_help in time_arguments:
        request_parser.add_argument(
            argument_name, type=lambda x: rfc3339DateTime.parse(x), help=argument_help)

    for quantity in search.search_quantities.values():
        append_action = 'append' if quantity.multi else None
        request_parser.add_argument(
            quantity.name, help=quantity.description, action=append_action)
110
111


Markus Scheidgen's avatar
Markus Scheidgen committed
112
# Parser documenting the arguments of the paginated search endpoint: the
# pagination arguments, the common search arguments, and the scroll, histogram
# and metrics options.
repo_request_parser = pagination_request_parser.copy()
add_common_parameters(repo_request_parser)
repo_request_parser.add_argument('scroll', type=bool, help='Enable scrolling')
repo_request_parser.add_argument(
    'scroll_id', type=str,
    help='The id of the current scrolling window to use.')
repo_request_parser.add_argument(
    'date_histogram', type=bool,
    help='Add an additional aggregation over the upload time')
repo_request_parser.add_argument(
    'metrics', type=str, action='append',
    help=(
        'Metrics to aggregate over all quantities and their values as comma separated list. '
        'Possible values are %s.' % ', '.join(search.metrics_names)))


# Parser that only carries the common search arguments.
search_request_parser = api.parser()
add_common_parameters(search_request_parser)


def _create_owner_query():
    """
    Build the elasticsearch query that implements the ``owner`` request argument.

    The argument defaults to ``all``. Supported values:

    - ``all``: published entries without embargo, plus all entries owned by the
      logged-in user (if there is one)
    - ``public``: published entries without embargo
    - ``user``: entries owned by the logged-in user
    - ``staging``: unpublished entries owned by the logged-in user
    - ``admin``: no ownership restriction; requires ``g.user.is_admin``

    Returns:
        The query, always combined with the exclusion of entries whose
        ``code_name`` is ``currupted mainfile``.

    Aborts with 401 if the chosen value needs a (admin) login that is missing, and
    with 400 on an unknown ``owner`` value.
    """
    owner = request.args.get('owner', 'all')

    # TODO this should be removed after migration
    # if owner == 'migrated':
    #     q = Q('term', published=True) & Q('term', with_embargo=False)
    #     if g.user is not None:
    #         q = q | Q('term', owners__user_id=g.user.user_id)
    #     q = q & ~Q('term', **{'uploader.user_id': 1})  # pylint: disable=invalid-unary-operand-type
    if owner == 'all':
        q = Q('term', published=True) & Q('term', with_embargo=False)
        if g.user is not None:
            q = q | Q('term', owners__user_id=g.user.user_id)
    elif owner == 'public':
        q = Q('term', published=True) & Q('term', with_embargo=False)
    elif owner == 'user':
        if g.user is None:
            abort(401, message='Authentication required for owner value user.')

        q = Q('term', owners__user_id=g.user.user_id)
    elif owner == 'staging':
        if g.user is None:
            # The message names the value that was actually requested.
            abort(401, message='Authentication required for owner value staging.')

        q = Q('term', published=False) & Q('term', owners__user_id=g.user.user_id)
    elif owner == 'admin':
        if g.user is None or not g.user.is_admin:
            abort(401, message='This can only be used by the admin user.')

        # No ownership restriction; only the exclusion below applies.
        q = None
    else:
        abort(400, message='Invalid owner value. Valid values are all|public|user|staging, default is all')

    # TODO this should be removed after migration
    without_currupted_mainfile = ~Q('term', code_name='currupted mainfile')  # pylint: disable=invalid-unary-operand-type
    if q is None:
        return without_currupted_mainfile

    return q & without_currupted_mainfile


168
def _create_search_parameters():
    """
    Helper that creates a request.args dict with isolated search parameters

    Only request arguments whose name is a key of ``search.search_quantities`` are
    kept. For quantities flagged as ``multi`` the value is the list of all given
    values; for all other quantities it is the single (first) given value. This
    mirrors ``add_common_parameters``, which only registers ``multi`` quantities
    with ``action='append'``.
    """
    parameters = {}
    for key in request.args.keys():
        if key not in search.search_quantities:
            continue

        # Decide on the ``multi`` flag; the quantity object itself was tested
        # before, which made the single-value branch unreachable.
        if search.search_quantities[key].multi:
            parameters[key] = request.args.getlist(key)
        else:
            parameters[key] = request.args.get(key)

    return parameters
174

Markus Scheidgen's avatar
Markus Scheidgen committed
175

176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
def _create_time_range():
    """
    Read the ``from_time``/``until_time`` request arguments as a time range.

    Returns ``None`` if neither argument is given. Otherwise returns a
    ``(from_time, until_time)`` tuple, where a missing ``from_time`` is replaced
    by ``2000-01-01`` and a missing ``until_time`` by the current UTC time.
    Aborts with 400 if a given value cannot be parsed.
    """
    lower_raw = request.args.get('from_time', None)
    upper_raw = request.args.get('until_time', None)

    if lower_raw is None and upper_raw is None:
        return None

    try:
        lower = rfc3339DateTime.parse(lower_raw if lower_raw is not None else '2000-01-01')
        if upper_raw is None:
            upper = datetime.datetime.utcnow()
        else:
            upper = rfc3339DateTime.parse(upper_raw)
        return lower, upper
    except Exception:
        abort(400, message='bad datetime format')


def create_search_kwargs():
    """
    Collect the keyword arguments for the search functions from the current request.

    Returns a dict with the keys ``q`` (owner query), ``time_range`` and
    ``search_parameters``, each produced by the respective ``_create_*`` helper.
    """
    owner_query = _create_owner_query()
    time_range = _create_time_range()
    search_parameters = _create_search_parameters()
    return {
        'q': owner_query,
        'time_range': time_range,
        'search_parameters': search_parameters}


Markus Scheidgen's avatar
Markus Scheidgen committed
198
199
def _is_flag_set(name):
    """
    Read the boolean request argument *name*.

    A missing or empty argument and the literals ``false`` and ``0``
    (case-insensitive) count as not set; every other value counts as set. A plain
    ``bool(value)`` would treat the string ``'false'`` as true.
    """
    raw_value = request.args.get(name, '')
    return raw_value.strip().lower() not in ('', 'false', '0')


@ns.route('/')
class RepoCalcsResource(Resource):
    @api.doc('search')
    @api.response(400, 'Invalid requests, e.g. wrong owner type or bad search parameters')
    @api.expect(repo_request_parser, validate=True)
    @api.marshal_with(repo_calcs_model, skip_none=True, code=200, description='Search results send')
    @login_if_available
    def get(self):
        """
        Search for calculations in the repository form, paginated.

        The ``owner`` parameter determines the overall entries to search through.
        Possible values are: ``all`` (show all entries visible to the current user), ``public``
        (show all publically visible entries), ``user`` (show all user entries, requires login),
        ``staging`` (show all user entries in staging area, requires login).

        You can use the various quantities to search/filter for. For some of the
        indexed quantities this endpoint returns aggregation information. This means
        you will be given a list of all possible values and the number of entries
        that have the certain value. You can also use these aggregations on an empty
        search to determine the possible values.

        The pagination parameters allows determine which page to return via the
        ``page`` and ``per_page`` parameters. Pagination however, is limited to the first
        100k (depending on ES configuration) hits.

        An alternative to pagination is to use ``scroll`` and ``scroll_id``. With ``scroll``
        you will get a ``scroll_id`` on the first request. Each call with ``scroll`` and
        the respective ``scroll_id`` will return the next ``per_page`` (here the default is 1000)
        results. Scroll however, ignores ordering and does not return aggregations.
        The scroll view used in the background will stay alive for 1 minute between requests.
        If the given ``scroll_id`` is not available anymore, a HTTP 400 is raised.

        The search will return aggregations on a predefined set of quantities. Aggregations
        will tell you what quantity values exist and how many entries match those values.

        Ordering is determined by ``order_by`` and ``order`` parameters.
        """

        # The arguments are read from request.args directly, so boolean flags
        # have to be interpreted here ('false' and '0' must not enable them).
        scroll = _is_flag_set('scroll')
        date_histogram = _is_flag_set('date_histogram')
        scroll_id = request.args.get('scroll_id', None)
        metrics: List[str] = request.args.getlist('metrics')

        try:
            page = int(request.args.get('page', 1))
            per_page = int(request.args.get('per_page', 10 if not scroll else 1000))
            order = int(request.args.get('order', -1))
        except (TypeError, ValueError):
            abort(400, message='bad parameter types')

        # Built before the remaining validation so that owner/time errors keep
        # precedence over pagination errors.
        search_kwargs = create_search_kwargs()

        order_by = request.args.get('order_by', 'formula')

        # Explicit checks instead of ``assert``: assertions are removed when
        # Python runs with -O, which would let invalid values through.
        if page < 1 or per_page <= 0:
            abort(400, message='invalid pagination')

        if order not in [-1, 1]:
            abort(400, message='invalid order')

        for metric in metrics:
            if metric not in search.metrics_names:
                abort(400, message='there is not metric %s' % metric)

        try:
            if scroll:
                results = search.scroll_search(
                    scroll_id=scroll_id, size=per_page, **search_kwargs)

            else:
                results = search.metrics_search(
                    per_page=per_page, page=page, order=order, order_by=order_by,
                    metrics_to_use=metrics,
                    with_date_histogram=date_histogram, **search_kwargs)

                # TODO just a work around to make things prettier
                quantities = results['quantities']
                if 'code_name' in quantities and 'currupted mainfile' in quantities['code_name']:
                    del(quantities['code_name']['currupted mainfile'])

            return results, 200
        except search.ScrollIdNotFound:
            abort(400, 'The given scroll_id does not exist.')
        except KeyError as e:
            # A KeyError from the search layer is reported to the client as a bad
            # request; the traceback is printed for debugging.
            import traceback
            traceback.print_exc()
            abort(400, str(e))
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337


# Response model of the quantity endpoint.
repo_quantity_values_model = api.model('RepoQuantityValues', {
    'quantities': fields.Raw(description='''
        A dict with the requested quantity as single key.
        The value is a dictionary with 'after' and 'values' keys.
        The 'values' key holds a dict with actual values as keys and their entry count
        as values (i.e. number of entries with that value). ''')
})

# Parser documenting the arguments of the quantity endpoint: the common search
# arguments plus ``after`` and ``size``.
repo_quantity_search_request_parser = api.parser()
add_common_parameters(repo_quantity_search_request_parser)
repo_quantity_search_request_parser.add_argument(
    'after', type=str, help='The after value to use for "scrolling".')
# ``size`` is an argument of the quantity endpoint and therefore belongs on this
# parser; it used to be registered on ``repo_request_parser`` by mistake.
repo_quantity_search_request_parser.add_argument(
    'size', type=int, help='The max size of the returned values.')


@ns.route('/<string:quantity>')
class RepoQuantityResource(Resource):
    @api.doc('quantity_search')
    @api.response(400, 'Invalid requests, e.g. wrong owner type, bad quantity, bad search parameters')
    @api.expect(repo_quantity_search_request_parser, validate=True)
    @api.marshal_with(repo_quantity_values_model, skip_none=True, code=200, description='Search results send')
    @login_if_available
    def get(self, quantity: str):
        """
        Retrieve quantity values from entries matching the search.

        You can use the various quantities to search/filter for. For some of the
        indexed quantities this endpoint returns aggregation information. This means
        you will be given a list of all possible values and the number of entries
        that have the certain value. You can also use these aggregations on an empty
        search to determine the possible values.

        There is no ordering and no pagination. Instead there is an 'after' key based
        scrolling. The result will contain an 'after' value, that can be specified
        for the next request. You can use the 'size' and 'after' parameters accordingly.

        The result will contain a 'quantities' key with the given quantity and the
        respective values (upto 'size' many). For the rest of the values use the
        'after' parameter accordingly.
        """

        try:
            after = request.args.get('after', None)
            size = int(request.args.get('size', 100))

            # Unlike the paginated search, this endpoint always applies a time
            # range: from 2000-01-01 until now unless the request says otherwise.
            from_time = rfc3339DateTime.parse(request.args.get('from_time', '2000-01-01'))
            until_time_str = request.args.get('until_time', None)
            until_time = rfc3339DateTime.parse(until_time_str) if until_time_str is not None else datetime.datetime.utcnow()
            time_range = (from_time, until_time)
        except Exception:
            abort(400, message='bad parameter types')

        # Explicit check instead of ``assert``: assertions are removed when Python
        # runs with -O, which would let a negative size through.
        if size < 0:
            abort(400, message='invalid size')

        q = _create_owner_query()
        search_parameters = _create_search_parameters()

        try:
            results = search.quantity_search(
                q=q, time_range=time_range, search_parameters=search_parameters,
                quantities={quantity: after}, size=size, with_entries=False)

            return results, 200
        except KeyError as e:
            # A KeyError from the search layer is reported as an unknown quantity;
            # the traceback is printed for debugging.
            import traceback
            traceback.print_exc()
            abort(400, 'Given quantity does not exist: %s' % str(e))
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383


@ns.route('/pid/<int:pid>')
class RepoPidResource(Resource):
    # Resolves a PID to the upload_id/calc_id pair of the matching entry. The
    # lookup is restricted by the owner query of the current request.
    @api.doc('resolve_pid')
    @api.response(404, 'Entry with PID does not exist')
    @api.marshal_with(repo_calc_id_model, skip_none=True, code=200, description='Entry resolved')
    @login_if_available
    def get(self, pid: int):
        owner_query = _create_owner_query()
        found = search.entry_search(
            owner_query, page=1, per_page=1, search_parameters=dict(pid=pid))

        hits = found['pagination']['total']
        if hits == 0:
            abort(404, 'Entry with PID %d does not exist' % pid)

        if hits != 1:
            # More than one hit is logged as an error, but the first hit is still
            # returned.
            utils.get_logger(__name__).error('Two entries for the same pid', pid=pid)

        first_hit = found['results'][0]
        return dict(
            upload_id=first_hit['upload_id'],
            calc_id=first_hit['calc_id'])