archive.py 11.2 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
16
17
18
19
"""
The archive API of the nomad@FAIRDI APIs. This API is about serving processed
(parsed and normalized) calculation data in nomad's *meta-info* format.
"""

20
from typing import Dict, Any, Union
from io import BytesIO
import os.path
import json
import importlib

from flask import send_file
from flask_restplus import abort, Resource, inputs

import nomad_meta_info

from nomad.files import UploadFiles, Restricted
from nomad import utils, search

from .auth import authenticate, create_authorization_predicate
from .api import api
from .repo import search_request_parser, add_query
from .common import calc_route, streamed_zipfile, build_snippet, to_json
37
38

# The flask-restplus namespace that all archive resources in this module are registered on.
ns = api.namespace('archive', description='Access archive data and archive processing logs.')
41
42
43
44


@calc_route(ns, '/logs')
class ArchiveCalcLogResource(Resource):
    # Serves the processing log of a single calculation as a plain text download.

    @api.doc('get_archive_logs')
    @api.response(404, 'The upload or calculation does not exist')
    @api.response(401, 'Not authorized to access the data.')
    @api.response(200, 'Archive data send', headers={'Content-Type': 'text/plain'})
    @authenticate(signature_token=True)
    def get(self, upload_id, calc_id):
        """
        Get calculation processing log.

        Calcs are references via *upload_id*, *calc_id* pairs.
        """
        archive_id = '%s/%s' % (upload_id, calc_id)

        # The predicate is handed to the upload files, which raise Restricted
        # when the file is accessed without authorization.
        upload_files = UploadFiles.get(
            upload_id, is_authorized=create_authorization_predicate(upload_id, calc_id))

        if upload_files is None:
            abort(404, message='Upload %s does not exist.' % upload_id)

        try:
            return send_file(
                upload_files.archive_log_file(calc_id, 'rb'),
                mimetype='text/plain',
                as_attachment=True,
                # cache_timeout=0 disables client side caching of the download
                cache_timeout=0,
                attachment_filename='%s.log' % archive_id)
        except Restricted:
            abort(401, message='Not authorized to access %s.' % archive_id)
        except KeyError:
            # the upload exists, but there is no log for the given calc
            abort(404, message='Calculation %s does not exist.' % archive_id)
75
76
77
78


@calc_route(ns)
class ArchiveCalcResource(Resource):
    # Serves the archive data of a single calculation as a JSON download.

    @api.doc('get_archive_calc')
    @api.response(404, 'The upload or calculation does not exist')
    @api.response(401, 'Not authorized to access the data.')
    @api.response(200, 'Archive data send', headers={'Content-Type': 'application/json'})
    @authenticate(signature_token=True)
    def get(self, upload_id, calc_id):
        """
        Get calculation data in archive form.

        Calcs are references via *upload_id*, *calc_id* pairs.
        """
        archive_id = '%s/%s' % (upload_id, calc_id)

        # The predicate is handed to the upload files, which raise Restricted
        # when the file is accessed without authorization.
        upload_files = UploadFiles.get(
            upload_id, is_authorized=create_authorization_predicate(upload_id, calc_id))

        if upload_files is None:
            # It is the upload that is missing here, not a specific archive.
            abort(404, message='Upload %s does not exist.' % upload_id)

        try:
            return send_file(
                upload_files.archive_file(calc_id, 'rb'),
                mimetype='application/json',
                as_attachment=True,
                # cache_timeout=0 disables client side caching of the download
                cache_timeout=0,
                attachment_filename='%s.json' % archive_id)
        except Restricted:
            abort(401, message='Not authorized to access %s.' % archive_id)
        except KeyError:
            # the upload exists, but there is no archive for the given calc
            abort(404, message='Calculation %s does not exist.' % archive_id)
109
110


111
112
113
114
# Request parser for the archive query endpoint: a copy of the repo search
# parameters, extended by the archive specific download options.
archives_from_query_parser = search_request_parser.copy()
# ``type=bool`` would turn every non empty string (including 'false') into True;
# ``inputs.boolean`` parses 'true'/'false' values properly.
archives_from_query_parser.add_argument(
    name='compress', type=inputs.boolean, help='Use compression on .zip files, default is not.',
    location='args')
archives_from_query_parser.add_argument(
    name='res_type', type=str, help='Type of return value, can be zip or json.',
    location='args'
)
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144


@ns.route('/query')
class ArchiveQueryResource(Resource):
    # The search entry quantities that are copied into the manifest.json of the download.
    manifest_quantities = ['upload_id', 'calc_id', 'external_id', 'raw_id', 'pid', 'calc_hash']

    @api.doc('archives_from_query')
    @api.response(400, 'Invalid requests, e.g. wrong owner type or bad search parameters')
    @api.expect(archives_from_query_parser, validate=True)
    @api.response(200, 'File(s) send', headers={'Content-Type': 'application/zip'})
    @authenticate(signature_token=True)
    def get(self):
        """
        Get calculation data in archive form from all query results.

        See ``/repo`` endpoint for documentation on the search
        parameters.

        Zip files are streamed; instead of 401 errors, the zip file will just not contain
        any files that the user is not authorized to access.

        The zip file will contain a ``manifest.json`` with the repository meta data.
        """
        try:
            args = archives_from_query_parser.parse_args()
            # The request parser stores None for arguments that are missing in the
            # request. Therefore, the defaults have to be applied on None values and
            # not only on missing keys.
            compress = args.get('compress') or False
            res_type = args.get('res_type') or 'zip'
        except Exception:
            abort(400, message='bad parameter types')

        # An unsupported result type is a client error and is rejected before
        # any search is performed.
        if res_type not in ('zip', 'json'):
            abort(400, message='Unknown res_type %s' % res_type)

        search_request = search.SearchRequest()
        add_query(search_request, search_request_parser.parse_args())

        # Ordering by upload_id lets the generator below work on one upload after
        # the other, with only one UploadFiles object open at a time.
        calcs = search_request.execute_scan(order_by='upload_id')

        def generator():
            # Yields (file name, file id, open function, size function) tuples for
            # all archive files and finally for the manifest.
            manifest = {}
            upload_files = None

            for entry in calcs:
                upload_id = entry['upload_id']
                calc_id = entry['calc_id']

                if upload_files is None or upload_files.upload_id != upload_id:
                    # the results moved on to the next upload
                    if upload_files is not None:
                        upload_files.close_zipfile_cache()

                    upload_files = UploadFiles.get(
                        upload_id, create_authorization_predicate(upload_id))

                    if upload_files is None:
                        utils.get_logger(__name__).error('upload files do not exist', upload_id=upload_id)
                        continue

                    upload_files.open_zipfile_cache()

                # upload_files is bound as a default argument, so that each yielded
                # function keeps the upload files of its own entry, even if it is
                # called after the loop has moved on to another upload.
                yield (
                    '%s.%s' % (calc_id, upload_files._archive_ext), calc_id,
                    lambda calc_id, upload_files=upload_files: upload_files.archive_file(calc_id, 'rb'),
                    lambda calc_id, upload_files=upload_files: upload_files.archive_file_size(calc_id))

                manifest[calc_id] = {
                    key: entry[key]
                    for key in ArchiveQueryResource.manifest_quantities
                    if entry.get(key) is not None
                }

            if upload_files is not None:
                upload_files.close_zipfile_cache()

            try:
                manifest_contents = json.dumps(manifest).encode('utf-8')
            except Exception as e:
                # The download should not fail because of the manifest; the error is
                # put into the manifest instead.
                manifest_contents = json.dumps(
                    dict(error='Could not create the manifest: %s' % (e))).encode('utf-8')
                utils.get_logger(__name__).error(
                    'could not create raw query manifest', exc_info=e)

            yield (
                'manifest.json', 'manifest',
                lambda *args: BytesIO(manifest_contents),
                lambda *args: len(manifest_contents))

        if res_type == 'zip':
            return streamed_zipfile(
                generator(), zipfile_name='nomad_archive.zip', compress=compress)

        # res_type == 'json'
        archive_data = to_json(generator())
        code_snippet = build_snippet(args, os.path.join(api.base_url, ns.name, 'query'))
        data = {'archive_data': archive_data, 'code_snippet': code_snippet}
        return data, 200
210
211


212
213
@ns.route('/metainfo/<string:metainfo_package_name>')
@api.doc(params=dict(metainfo_package_name='The name of the metainfo package.'))
class MetainfoResource(Resource):
    # The file name suffix that is stripped from the package name to derive the
    # name of the parser that might provide the metainfo package.
    _metainfo_suffix = '.nomadmetainfo.json'

    @api.doc('get_metainfo')
    @api.response(404, 'The metainfo does not exist')
    @api.response(200, 'Metainfo data send')
    def get(self, metainfo_package_name):
        """
        Get a metainfo definition file.
        """
        # First, try to load the package from the main metainfo directory.
        try:
            return load_metainfo(metainfo_package_name), 200
        except FileNotFoundError:
            pass

        # The parser fallback only makes sense for names that carry the metainfo
        # suffix. Without this check, arbitrary trailing characters would be cut
        # from the name to form the parser module name.
        suffix = MetainfoResource._metainfo_suffix
        if not metainfo_package_name.endswith(suffix):
            abort(404, message='The metainfo %s does not exist.' % metainfo_package_name)

        parser_prefix = metainfo_package_name[:-len(suffix)]

        # Second, try to load the package from the directory of the respective
        # parser module. Errors other than FileNotFoundError are not handled here.
        try:
            return load_metainfo(dict(
                parser='%sparser' % parser_prefix,
                path='%s.nomadmetainfo.json' % parser_prefix)), 200
        except FileNotFoundError:
            abort(404, message='The metainfo %s does not exist.' % metainfo_package_name)
233
234
235
236
237


# The directory of the imported nomad_meta_info module. Metainfo package names and
# 'metainfoPath' dependencies are resolved relative to this directory.
metainfo_main_path = os.path.dirname(os.path.abspath(nomad_meta_info.__file__))


238
239
240
def load_metainfo(
        package_name_or_dependency: Union[str, Dict[str, Any]], dependency_source: str = None,
        loaded_packages: Dict[str, Any] = None) -> Dict[str, Any]:
    """
    Loads the given metainfo package and all its dependencies. Returns a dict with
    all loaded package_names and respective packages.

    Arguments:
        package_name_or_dependency: The name of the package, or a nomadmetainfo dependency object.
            A dependency object is a dict with either a ``relativePath`` key, a
            ``metainfoPath`` key, or ``parser`` and ``path`` keys.
        dependency_source: The path of the metainfo that uses this function to load a relative dependency.
        loaded_packages: Give a dict and the function will add freshly loaded packages
            to it and return it.

    Raises:
        FileNotFoundError: If the resolved metainfo file does not exist.
        Exception: If a relative dependency is given without ``dependency_source``, if
            the parser module of a dependency cannot be imported, or if the
            dependency object has none of the supported keys.
    """
    if loaded_packages is None:
        loaded_packages = {}

    if isinstance(package_name_or_dependency, str):
        # a plain package name is resolved against the main metainfo directory
        metainfo_path = os.path.join(metainfo_main_path, package_name_or_dependency)

    else:
        dependency = package_name_or_dependency
        if 'relativePath' in dependency:
            if dependency_source is None:
                raise Exception(
                    'Can only load relative dependency from within another metainfo package')

            metainfo_path = os.path.join(
                os.path.dirname(dependency_source), dependency['relativePath'])

        elif 'metainfoPath' in dependency:
            metainfo_path = os.path.join(metainfo_main_path, dependency['metainfoPath'])

        elif 'parser' in dependency:
            parser = dependency['parser']
            path = dependency['path']
            try:
                parser_module = importlib.import_module(parser).__file__
            except Exception as e:
                # The metainfo path is not resolved at this point; the message uses
                # the path given by the dependency.
                raise Exception(
                    'Parser not installed %s for metainfo path %s' % (parser, path)) from e

            parser_directory = os.path.dirname(parser_module)
            metainfo_path = os.path.join(parser_directory, path)

        else:
            # There is no path for the dependency itself; the message names the
            # package that declared the dependency.
            raise Exception(
                'Invalid dependency type in metainfo package %s' % dependency_source)

    # packages are identified by their file name only
    package_name = os.path.basename(metainfo_path)

    # This also terminates the recursion for cyclic dependencies.
    if package_name in loaded_packages:
        return loaded_packages

    with open(metainfo_path, 'rt', encoding='utf-8') as f:
        metainfo_json = json.load(f)

    # Register the package before loading its dependencies, so that dependencies
    # pointing back to this package are not loaded again.
    loaded_packages[package_name] = metainfo_json

    for dependency in metainfo_json.get('dependencies', []):
        load_metainfo(dependency, dependency_source=metainfo_path, loaded_packages=loaded_packages)

    return loaded_packages