archive.py 10.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
16
17
18
19
"""
The archive API of the nomad@FAIRDI APIs. This API is about serving processed
(parsed and normalized) calculation data in nomad's *meta-info* format.
"""

20
from typing import Dict, Any
21
from io import BytesIO
22
23
import os.path
from flask import send_file
24
from flask_restplus import abort, Resource
25
import json
26
import importlib
27
import contextlib
28

29
30
import nomad_meta_info

31
from nomad.files import UploadFiles, Restricted
32
from nomad import utils, search
33

34
from .auth import authenticate, create_authorization_predicate
Markus Scheidgen's avatar
Markus Scheidgen committed
35
from .api import api
36
37
from .repo import search_request_parser, add_query
from .common import calc_route, streamed_zipfile
38
39

# Flask-RESTPlus namespace that groups all archive related endpoints.
ns = api.namespace(
    'archive', description='Access archive data and archive processing logs.')
42
43
44
45


@calc_route(ns, '/logs')
class ArchiveCalcLogResource(Resource):
    @api.doc('get_archive_logs')
    @api.response(404, 'The upload or calculation does not exist')
    @api.response(401, 'Not authorized to access the data.')
    # Fix: the handler streams text/plain (see send_file below); 'application/plain'
    # is not a registered media type and contradicted the actual response.
    @api.response(200, 'Archive data send', headers={'Content-Type': 'text/plain'})
    @authenticate(signature_token=True)
    def get(self, upload_id, calc_id):
        """
        Get calculation processing log.

        Calcs are references via *upload_id*, *calc_id* pairs.
        """
        archive_id = '%s/%s' % (upload_id, calc_id)

        upload_files = UploadFiles.get(
            upload_id, is_authorized=create_authorization_predicate(upload_id, calc_id))

        if upload_files is None:
            abort(404, message='Upload %s does not exist.' % upload_id)

        try:
            # cache_timeout=0: logs can change while the upload is still processing.
            return send_file(
                upload_files.archive_log_file(calc_id, 'rb'),
                mimetype='text/plain',
                as_attachment=True,
                cache_timeout=0,
                attachment_filename='%s.log' % archive_id)
        except Restricted:
            abort(401, message='Not authorized to access %s/%s.' % (upload_id, calc_id))
        except KeyError:
            abort(404, message='Calculation %s does not exist.' % archive_id)
76
77
78
79


@calc_route(ns)
class ArchiveCalcResource(Resource):
    @api.doc('get_archive_calc')
    @api.response(404, 'The upload or calculation does not exist')
    @api.response(401, 'Not authorized to access the data.')
    @api.response(200, 'Archive data send')
    @authenticate(signature_token=True)
    def get(self, upload_id, calc_id):
        """
        Get calculation data in archive form.

        Calcs are references via *upload_id*, *calc_id* pairs.
        """
        archive_id = '%s/%s' % (upload_id, calc_id)

        # The authorization predicate decides access; restricted data raises below.
        files = UploadFiles.get(
            upload_id, is_authorized=create_authorization_predicate(upload_id, calc_id))
        if files is None:
            abort(404, message='Archive %s does not exist.' % upload_id)

        try:
            return send_file(
                files.archive_file(calc_id, 'rb'),
                mimetype='application/json',
                as_attachment=True,
                cache_timeout=0,
                attachment_filename='%s.json' % archive_id)
        except Restricted:
            abort(401, message='Not authorized to access %s/%s.' % (upload_id, calc_id))
        except KeyError:
            abort(404, message='Calculation %s does not exist.' % archive_id)
110
111


112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# Request parser for the /archive/query endpoint: the repo search parameters
# extended with an optional `compress` flag for the streamed zip file.
archives_from_query_parser = search_request_parser.copy()
archives_from_query_parser.add_argument(
    name='compress', type=bool, location='args',
    help='Use compression on .zip files, default is not.')


@ns.route('/query')
class ArchiveQueryResource(Resource):
    # Repository quantities copied into the manifest entry of each calc, when present.
    manifest_quantities = ['upload_id', 'calc_id', 'external_id', 'raw_id', 'pid', 'calc_hash']

    @api.doc('archives_from_query')
    @api.response(400, 'Invalid requests, e.g. wrong owner type or bad search parameters')
    @api.expect(archives_from_query_parser, validate=True)
    @api.response(200, 'File(s) send', headers={'Content-Type': 'application/zip'})
    @authenticate(signature_token=True)
    def get(self):
        """
        Get calculation data in archive form from all query results.

        See ``/repo`` endpoint for documentation on the search
        parameters.

        Zip files are streamed; instead of 401 errors, the zip file will just not contain
        any files that the user is not authorized to access.

        The zip file will contain a ``manifest.json`` with the repository meta data.
        """
        try:
            args = archives_from_query_parser.parse_args()
            compress = args.get('compress', False)
        except Exception:
            abort(400, message='bad parameter types')

        search_request = search.SearchRequest()
        add_query(search_request, search_request_parser.parse_args())

        # Order by upload_id so all calcs of one upload are handled consecutively.
        calcs = search_request.execute_scan(order_by='upload_id')

        def generator():
            manifest = {}
            for entry in calcs:
                upload_id = entry['upload_id']
                calc_id = entry['calc_id']
                upload_files = UploadFiles.get(
                    upload_id, create_authorization_predicate(upload_id))
                if upload_files is None:
                    utils.get_logger(__name__).error('upload files do not exist', upload_id=upload_id)
                    continue

                # Keep the upload's zipfile open across both callables, if supported.
                if hasattr(upload_files, 'zipfile_cache'):
                    zipfile_cache = upload_files.zipfile_cache()
                else:
                    zipfile_cache = contextlib.suppress()

                with zipfile_cache:
                    # Fix: bind upload_files as a default argument. The original
                    # lambdas closed over the loop variable late-bound, so a callable
                    # invoked after the loop advanced would read the wrong upload.
                    yield (
                        '%s.%s' % (calc_id, upload_files._archive_ext), calc_id,
                        lambda calc_id, upload_files=upload_files: upload_files.archive_file(calc_id, 'rb'),
                        lambda calc_id, upload_files=upload_files: upload_files.archive_file_size(calc_id))

                manifest[calc_id] = {
                    key: entry[key]
                    for key in ArchiveQueryResource.manifest_quantities
                    if entry.get(key) is not None
                }

            try:
                manifest_contents = json.dumps(manifest).encode('utf-8')
            except Exception as e:
                # Best effort: ship an error manifest rather than aborting the stream.
                manifest_contents = json.dumps(
                    dict(error='Could not create the manifest: %s' % (e))).encode('utf-8')
                utils.get_logger(__name__).error(
                    'could not create raw query manifest', exc_info=e)

            yield (
                'manifest.json', 'manifest',
                lambda *args: BytesIO(manifest_contents),
                lambda *args: len(manifest_contents))

        return streamed_zipfile(
            generator(), zipfile_name='nomad_archive.zip', compress=compress)
193
194


195
196
@ns.route('/metainfo/<string:metainfo_package_name>')
@api.doc(params=dict(metainfo_package_name='The name of the metainfo package.'))
class MetainfoResource(Resource):
    @api.doc('get_metainfo')
    @api.response(404, 'The metainfo does not exist')
    @api.response(200, 'Metainfo data send')
    def get(self, metainfo_package_name):
        """
        Get a metainfo definition file.
        """
        try:
            return load_metainfo(metainfo_package_name), 200
        except FileNotFoundError:
            # Fallback: treat the name as parser specific and resolve the file
            # through the respective parser package instead.
            parser_prefix = metainfo_package_name[:-len('.nomadmetainfo.json')]
            try:
                return load_metainfo(dict(
                    parser='%sparser' % parser_prefix,
                    path='%s.nomadmetainfo.json' % parser_prefix)), 200
            except FileNotFoundError:
                abort(404, message='The metainfo %s does not exist.' % metainfo_package_name)
216
217
218
219
220


# Directory of the installed nomad_meta_info package; the general (non parser
# specific) .nomadmetainfo.json files are resolved relative to this path.
metainfo_main_path = os.path.dirname(os.path.abspath(nomad_meta_info.__file__))


221
222
223
def load_metainfo(
        package_name_or_dependency: str, dependency_source: str = None,
        loaded_packages: Dict[str, Any] = None) -> Dict[str, Any]:
    """
    Loads the given metainfo package and all its dependencies. Returns a dict with
    all loaded package_names and respective packages.

    Arguments:
        package_name_or_dependency: The name of the package, or a nomadmetainfo dependency object.
        dependency_source: The path of the metainfo that uses this function to load a relative dependency.
        loaded_packages: Give a dict and the function will added freshly loaded packages
            to it and return it.
    """
    if loaded_packages is None:
        loaded_packages = {}

    if isinstance(package_name_or_dependency, str):
        package_name = package_name_or_dependency
        metainfo_path = os.path.join(metainfo_main_path, package_name)

    else:
        dependency = package_name_or_dependency
        if 'relativePath' in dependency:
            if dependency_source is None:
                raise Exception(
                    'Can only load relative dependency from within another metainfo package')

            # Relative dependencies are resolved against the package that declared them.
            metainfo_path = os.path.join(
                os.path.dirname(dependency_source), dependency['relativePath'])

        elif 'metainfoPath' in dependency:
            metainfo_path = os.path.join(metainfo_main_path, dependency['metainfoPath'])

        elif 'parser' in dependency:
            parser = dependency['parser']
            path = dependency['path']
            try:
                parser_module = importlib.import_module(parser).__file__
            except Exception:
                # Bug fix: the original interpolated `metainfo_path`, which is not yet
                # assigned in this branch and raised UnboundLocalError, masking the
                # actual problem. Report the dependency's declared path instead.
                raise Exception('Parser not installed %s for metainfo path %s' % (parser, path))

            parser_directory = os.path.dirname(parser_module)
            metainfo_path = os.path.join(parser_directory, path)

        else:
            # Bug fix: `metainfo_path` is unbound here as well; report the source
            # package that declared the invalid dependency instead.
            raise Exception('Invalid dependency type in metainfo package %s' % dependency_source)

        package_name = os.path.basename(metainfo_path)

    package_name = os.path.basename(package_name)

    # Each package is loaded at most once; cycles in dependencies terminate here.
    if package_name in loaded_packages:
        return loaded_packages

    with open(metainfo_path, 'rt') as f:
        metainfo_json = json.load(f)

    loaded_packages[package_name] = metainfo_json

    # Recursively load all declared dependencies into the same dict.
    for dependency in metainfo_json.get('dependencies', []):
        load_metainfo(dependency, dependency_source=metainfo_path, loaded_packages=loaded_packages)

    return loaded_packages