upload.py 22.8 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
The upload API of the nomad@FAIRDI APIs. Provides endpoints to upload files and
get the processing status of uploads.
"""

20
from datetime import datetime
from functools import wraps
import io
import os
import os.path

from flask import g, request, Response
from flask_restplus import Resource, fields, abort
from werkzeug.datastructures import FileStorage

from nomad import config, utils, files
from nomad.processing import Upload, FAILURE
from nomad.processing import ProcessAlreadyRunning
from nomad.app.utils import with_logger, RFC3339DateTime

from .api import api
from .auth import login_really_required
from .common import pagination_request_parser, pagination_model, upload_route
37

38

39
# Namespace for all endpoints under /uploads.
ns = api.namespace(
    'uploads',
    description='Uploading data and tracing uploaded data and its processing.')


# Common processing state shared by uploads and calculations (task progress,
# errors/warnings, timestamps).
proc_model = api.model('Processing', {
    'tasks': fields.List(fields.String),
    'current_task': fields.String,
    'tasks_running': fields.Boolean,
    'tasks_status': fields.String,
    'errors': fields.List(fields.String),
    'warnings': fields.List(fields.String),
    'create_time': RFC3339DateTime,
    'complete_time': RFC3339DateTime,
    'current_process': fields.String,
    'process_running': fields.Boolean,
})

Markus Scheidgen's avatar
Markus Scheidgen committed
57
58
59
60
61
62
# A dataset reference; underscore fields are system-assigned (see metadata_model).
dataset_model = api.model('DataSet', {
    'id': fields.Integer(required=True, description='The repository db dataset id'),
    '_doi': fields.String(description='The DOI of the dataset'),
    '_name': fields.String(description='The unique dataset name')
})

63
64
65
66
# User provided metadata for uploads/calculations. Fields starting with an
# underscore override system data and are admin-only (enforced in UploadResource.post).
metadata_model = api.model('MetaData', {
    'with_embargo': fields.Boolean(default=False, description='Data with embargo is only visible to the upload until the embargo period ended.'),
    'comment': fields.String(description='The comment are shown in the repository for each calculation.'),
    # fixed: the keyword was misspelled ``descriptions``, so the description was
    # silently ignored and never shown in the API docs
    'references': fields.List(fields.String, description='References allow to link calculations to external source, e.g. URLs.'),
    'coauthors': fields.List(fields.Integer, description='A list of co-authors given by user_id.'),
    'shared_with': fields.List(fields.Integer, description='A list of users to share calculations with given by user_id.'),
    '_upload_time': RFC3339DateTime(description='Override the upload time.'),
    '_uploader': fields.Integer(description='Override the uploader with the given user id.'),
    'datasets': fields.List(fields.Nested(model=dataset_model, skip_none=True), description='A list of datasets.')
})

# Per-calculation metadata; extends metadata_model with calculation identifiers.
calc_metadata_model = api.inherit('CalcMetaData', metadata_model, {
    'mainfile': fields.String(description='The calculation main output file is used to identify the calculation in the upload.'),
    '_pid': fields.Integer(description='Assign a specific pid. It must be unique.'),
    'external_id': fields.String(description='External user provided id. Does not have to be unique necessarily.')
})

# Upload-level metadata; per-calculation entries override the upload-wide values.
upload_metadata_model = api.inherit('UploadMetaData', metadata_model, {
    'calculations': fields.List(fields.Nested(model=calc_metadata_model, skip_none=True), description='Specific per calculation data that will override the upload data.')
})

84
# An upload with its processing state.
upload_model = api.inherit('UploadProcessing', proc_model, {
    'name': fields.String(
        description='The name of the upload. This can be provided during upload '
                    'using the name query parameter.'),
    'upload_id': fields.String(
        description='The unique id for the upload.'),
    # TODO just removed during migration, where this get particularily large
    # 'metadata': fields.Nested(model=upload_metadata_model, description='Additional upload and calculation meta data.', skip_none=True),
    'upload_path': fields.String(description='The uploaded file on the server'),
    'published': fields.Boolean(description='If this upload is already published'),
    'upload_time': RFC3339DateTime(),
})

97
98
# Paginated list of uploads, as returned by UploadListResource.get.
upload_list_model = api.model('UploadList', {
    'pagination': fields.Nested(model=pagination_model),
    'results': fields.List(fields.Nested(model=upload_model, skip_none=True))
})

102
# A single calculation of an upload with its processing state.
calc_model = api.inherit('UploadCalculationProcessing', proc_model, {
    'calc_id': fields.String,
    'mainfile': fields.String,
    'upload_id': fields.String,
    'parser': fields.String
})

# An upload together with calculation counts and a paginated list of its
# calculations, as returned by UploadResource.get.
upload_with_calcs_model = api.inherit('UploadWithPaginatedCalculations', upload_model, {
    'processed_calcs': fields.Integer,
    'total_calcs': fields.Integer,
    'failed_calcs': fields.Integer,
    'pending_calcs': fields.Integer,
    'calcs': fields.Nested(model=api.model('UploadPaginatedCalculations', {
        'pagination': fields.Nested(model=api.inherit('UploadCalculationPagination', pagination_model, {
            'successes': fields.Integer,
            'failures': fields.Integer,
        })),
        'results': fields.List(fields.Nested(model=calc_model, skip_none=True))
    }), skip_none=True)
})

123
124
# Request body for UploadResource.post; see that endpoint for operation semantics.
upload_operation_model = api.model('UploadOperation', {
    # description corrected: the post endpoint supports both publish and re-process
    'operation': fields.String(description='The operation to execute: publish or re-process.'),
    'metadata': fields.Nested(model=upload_metadata_model, description='Additional upload and calculation meta data. Will replace previously given metadata.')
})


# Query/file parameters accepted by UploadListResource.put.
upload_metadata_parser = api.parser()
upload_metadata_parser.add_argument('name', type=str, help='An optional name for the upload.', location='args')
upload_metadata_parser.add_argument('local_path', type=str, help='Use a local file on the server.', location='args')
upload_metadata_parser.add_argument('curl', type=bool, help='Provide a human readable message as body.', location='args')
upload_metadata_parser.add_argument('file', type=FileStorage, help='The file to upload.', location='files')

# Query parameters accepted by UploadListResource.get (pagination + filters).
upload_list_parser = pagination_request_parser.copy()
upload_list_parser.add_argument('state', type=str, help='List uploads with given state: all, unpublished, published.', location='args')
upload_list_parser.add_argument('name', type=str, help='Filter for uploads with the given name.', location='args')

139

140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
def disable_marshalling(f):
    """
    Decorator that lets an endpoint bypass RESTPlus marshalling by raising
    :class:`DisableMarshalling`. The exception's pre-built response is returned
    as-is instead of the function's (marshalled) return value.
    """
    @wraps(f)
    def wrapper(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except DisableMarshalling as e:
            # removed leftover debug print of the raw response
            return e.un_marshalled

    return wrapper


def marshal_with(*args, **kwargs):
    """
    A special version of the RESTPlus marshal_with decorator that allows to disable
    marshalling at runtime by raising DisableMarshalling.

    All arguments are passed through to :func:`api.marshal_with`.
    """
    def decorator(func):
        @api.marshal_with(*args, **kwargs)
        def with_marshalling(*args, **kwargs):
            return func(*args, **kwargs)

        @wraps(with_marshalling)
        def wrapper(*args, **kwargs):
            try:
                return with_marshalling(*args, **kwargs)
            except DisableMarshalling as e:
                # removed leftover debug print; return the raw response unchanged
                return e.un_marshalled

        return wrapper
    return decorator


class DisableMarshalling(Exception):
    """
    Raised inside an endpoint wrapped with this module's custom ``marshal_with``
    (or ``disable_marshalling``) decorator to skip RESTPlus marshalling and
    return a pre-built plain response instead.
    """
    def __init__(self, body, status, headers):
        super().__init__()
        # the ready-made flask Response to return un-marshalled
        self.un_marshalled = Response(body, status=status, headers=headers)


180
@ns.route('/')
class UploadListResource(Resource):
    """ Lists the authenticated user's uploads and accepts new uploads. """

    @api.doc('get_uploads')
    @api.response(400, 'Bad parameters')
    @api.marshal_with(upload_list_model, skip_none=True, code=200, description='Uploads send')
    @api.expect(upload_list_parser)
    @login_really_required
    def get(self):
        """ Get the list of all uploads from the authenticated user. """
        try:
            state = request.args.get('state', 'unpublished')
            name = request.args.get('name', None)
            page = int(request.args.get('page', 1))
            per_page = int(request.args.get('per_page', 10))
        except Exception:
            abort(400, message='bad parameter types')

        # explicit validation instead of assert; assert is stripped under ``python -O``
        if page < 1 or per_page <= 0:
            abort(400, message='invalid pagination')

        query_kwargs = {}
        if state == 'published':
            query_kwargs.update(published=True)
        elif state == 'unpublished':
            query_kwargs.update(published=False)
        elif state == 'all':
            pass
        else:
            abort(400, message='bad state value %s' % state)

        if name is not None:
            query_kwargs.update(name=name)

        uploads = Upload.user_uploads(g.user, **query_kwargs)
        total = uploads.count()

        # newest uploads first, sliced to the requested page
        results = list(uploads.order_by('-upload_time')[(page - 1) * per_page: page * per_page])

        return dict(
            pagination=dict(total=total, page=page, per_page=per_page),
            results=results), 200

    @api.doc('upload')
    @api.expect(upload_metadata_parser)
    @api.response(400, 'Too many uploads')
    @marshal_with(upload_model, skip_none=True, code=200, description='Upload received')
    @login_really_required
    @with_logger
    def put(self, logger):
        """
        Upload a file and automatically create a new upload in the process.
        Can be used to upload files via browser or other http clients like curl.
        This will also start the processing of the upload.

        There are two basic ways to upload a file: multipart-formdata or simply streaming
        the file data. Both are supported. The latter one does not allow to transfer a
        filename or other meta-data. If a filename is available, it will become the
        name of the upload.

        Example commands:

            curl -X put ".../nomad/api/uploads/" -F file=@local_file
            curl ".../nomad/api/uploads/" --upload-file local_file

        There is a general limit on how many unpublished uploads a user can have. Will
        return 400 if this limit is exceeded.
        """
        # check existence of local_path if local_path is used
        local_path = request.args.get('local_path')
        if local_path:
            if not os.path.exists(local_path):
                abort(404, message='The given local_path was not found.')

        # check the upload limit; admins are exempt
        if not g.user.is_admin:
            if Upload.user_uploads(g.user, published=False).count() >= config.services.upload_limit:
                abort(400, 'Limit of unpublished uploads exceeded for user.')

        upload_name = request.args.get('name')
        upload_id = utils.create_uuid()

        logger = logger.bind(upload_id=upload_id, upload_name=upload_name)
        logger.info('upload created')

        try:
            if local_path:
                # file is already on the server and does not need to be received
                upload_path = local_path
            elif request.mimetype in ['multipart/form-data', 'application/multipart-formdata']:
                logger.info('receive upload as multipart formdata')
                upload_path = files.PathObject(config.fs.tmp, upload_id).os_path

                # multipart formdata, e.g. with curl -X put "url" -F file=@local_file
                # might have performance issues for large files: https://github.com/pallets/flask/issues/2086
                if 'file' not in request.files:
                    abort(400, message='Bad multipart-formdata, there is no file part.')
                file = request.files['file']
                # fixed: was ``upload_name is ''`` — identity comparison with a
                # literal is unreliable; use truthiness to cover None and ''
                if not upload_name:
                    upload_name = file.filename

                file.save(upload_path)
            else:
                # simple streaming data in HTTP body, e.g. with curl "url" -T local_file
                # (removed leftover debug ``print(request.mimetype)``)
                logger.info('started to receive upload streaming data')
                upload_path = files.PathObject(config.fs.tmp, upload_id).os_path

                try:
                    with open(upload_path, 'wb') as f:
                        received_data = 0
                        received_last = 0
                        while True:
                            data = request.stream.read(io.DEFAULT_BUFFER_SIZE)
                            if len(data) == 0:
                                break

                            received_data += len(data)
                            received_last += len(data)
                            if received_last > 1e9:
                                received_last = 0
                                # TODO remove this logging or reduce it to debug
                                logger.info('received streaming data', size=received_data)
                            f.write(data)

                except Exception as e:
                    logger.warning('Error on streaming upload', exc_info=e)
                    abort(400, message='Some IO went wrong, download probably aborted/disrupted.')
        except Exception as e:
            # clean up a partially received temp file, but never delete a user's local file
            if not local_path and os.path.isfile(upload_path):
                os.remove(upload_path)
            logger.info('Invalid or aborted upload')
            raise e

        logger.info('received uploaded file')

        upload = Upload.create(
            upload_id=upload_id,
            user=g.user,
            name=upload_name,
            upload_time=datetime.utcnow(),
            upload_path=upload_path,
            # only files we received into tmp are temporary; local_path files stay
            temporary=local_path != upload_path)

        upload.process_upload()
        logger.info('initiated processing')

        # NOTE(review): any non-empty ``curl`` value (even 'false') is truthy here
        # and triggers the plain-text response — confirm this is intended
        if bool(request.args.get('curl', False)):
            raise DisableMarshalling(
                '''
Thanks for uploading your data to nomad.
Go back to %s and press reload to see the progress on your upload and publish your data.

''' % upload.gui_url,
                200, {'Content-Type': 'text/plain; charset=utf-8'})

        return upload, 200
340

Markus Scheidgen's avatar
Markus Scheidgen committed
341

342
343
344
345
class ProxyUpload:
    """
    Wraps an upload so that a custom ``calcs`` value can be attached while every
    other attribute read is forwarded to the wrapped upload object.
    """

    def __init__(self, upload, calcs):
        self.upload = upload
        self.calcs = calcs

    def __getattr__(self, name):
        # only invoked for names not found on the proxy itself
        # (``upload`` and ``calcs`` are served from the instance dict)
        return self.upload.__getattribute__(name)


351
@upload_route(ns)
class UploadResource(Resource):
    """ Operations on a single upload: inspect, delete, publish/re-process. """

    @api.doc('get_upload')
    @api.response(404, 'Upload does not exist')
    @api.response(400, 'Invalid parameters')
    @api.marshal_with(upload_with_calcs_model, skip_none=True, code=200, description='Upload send')
    @api.expect(pagination_request_parser)
    @login_really_required
    def get(self, upload_id: str):
        """
        Get an update for an existing upload.

        Will not only return the upload, but also its calculations paginated.
        Use the pagination params to determine the page.
        """
        try:
            upload = Upload.get(upload_id)
        except KeyError:
            abort(404, message='Upload with id %s does not exist.' % upload_id)

        # respond 404 (not 401) to avoid leaking the existence of foreign uploads
        if upload.user_id != str(g.user.user_id) and not g.user.is_admin:
            abort(404, message='Upload with id %s does not exist.' % upload_id)

        try:
            page = int(request.args.get('page', 1))
            per_page = int(request.args.get('per_page', 10))
            order_by = request.args.get('order_by', None)
            # simplified: request.args values are already strings
            order = int(request.args.get('order', -1))
        except Exception:
            abort(400, message='invalid pagination or ordering')

        # explicit validation instead of assert; assert is stripped under ``python -O``
        if page < 1 or per_page <= 0:
            abort(400, message='invalid pagination')

        if order_by is not None:
            order_by = str(order_by)
            if order_by not in ['mainfile', 'tasks_status', 'parser']:
                abort(400, message='invalid order_by field %s' % order_by)

            # mongoengine-style ordering prefix, descending by default
            order_by = ('-%s' if order == -1 else '+%s') % order_by

        calcs = upload.all_calcs((page - 1) * per_page, page * per_page, order_by=order_by)
        failed_calcs = upload.failed_calcs
        result = ProxyUpload(upload, {
            'pagination': dict(
                total=upload.total_calcs, page=page, per_page=per_page,
                successes=upload.processed_calcs - failed_calcs, failures=failed_calcs),
            'results': list(calcs)
        })

        return result, 200

    @api.doc('delete_upload')
    @api.response(404, 'Upload does not exist')
    @api.response(401, 'Upload does not belong to authenticated user.')
    @api.response(400, 'The upload is still/already processed')
    @api.marshal_with(upload_model, skip_none=True, code=200, description='Upload deleted')
    @login_really_required
    @with_logger
    def delete(self, upload_id: str, logger):
        """
        Delete an existing upload.

        Only uploads that are still in staging, not already deleted, not still uploaded, and
        not currently processed, can be deleted.
        """
        try:
            upload = Upload.get(upload_id)
        except KeyError:
            abort(404, message='Upload with id %s does not exist.' % upload_id)

        if upload.user_id != str(g.user.user_id) and not g.user.is_admin:
            abort(401, message='Upload with id %s does not belong to you.' % upload_id)

        if upload.published:
            abort(400, message='The upload is already published')

        if upload.tasks_running:
            abort(400, message='The upload is not processed yet')

        try:
            upload.delete_upload()
        except ProcessAlreadyRunning:
            abort(400, message='The upload is still processed')
        except Exception as e:
            logger.error('could not delete processing upload', exc_info=e)
            raise e

        return upload, 200

    @api.doc('exec_upload_operation')
    @api.response(404, 'Upload does not exist or not in staging')
    @api.response(400, 'Operation is not supported or the upload is still/already processed')
    @api.response(401, 'If the operation is not allowed for the current user')
    @api.marshal_with(upload_model, skip_none=True, code=200, description='Upload published successfully')
    @api.expect(upload_operation_model)
    @login_really_required
    def post(self, upload_id):
        """
        Execute an upload operation. Available operations are ``publish`` and ``re-process``

        Publish accepts further meta data that allows to provide coauthors, comments,
        external references, etc. See the model for details. The fields that start with
        ``_underscore`` are only available for users with administrative privileges.

        Publish changes the visibility of the upload. Clients can specify the visibility
        via meta data.

        Re-process will re-process the upload and produce updated repository metadata and
        archive. Only published uploads that are not processing at the moment are allowed.
        Only for uploads where calculations have been processed with an older nomad version.
        """
        try:
            upload = Upload.get(upload_id)
        except KeyError:
            abort(404, message='Upload with id %s does not exist.' % upload_id)

        # respond 404 (not 401) to avoid leaking the existence of foreign uploads
        if upload.user_id != str(g.user.user_id) and not g.user.is_admin:
            abort(404, message='Upload with id %s does not exist.' % upload_id)

        json_data = request.get_json()
        if json_data is None:
            json_data = {}

        operation = json_data.get('operation')

        metadata = json_data.get('metadata', {})
        for key in metadata:
            # underscore-prefixed keys override system data and are admin-only
            if key.startswith('_'):
                if not g.user.is_admin:
                    abort(401, message='Only admin users can use _metadata_keys.')

                break

        if operation == 'publish':
            if upload.tasks_running:
                abort(400, message='The upload is not processed yet')
            if upload.tasks_status == FAILURE:
                abort(400, message='Cannot publish an upload that failed processing')
            if upload.processed_calcs == 0:
                abort(400, message='Cannot publish an upload without calculations')
            try:
                upload.compress_and_set_metadata(metadata)
                upload.publish_upload()
            except ProcessAlreadyRunning:
                abort(400, message='The upload is still/already processed')

            return upload, 200
        elif operation == 're-process':
            if upload.tasks_running or upload.process_running or not upload.published:
                # message grammar fixed (was 'Can only non processing, re-process published uploads')
                abort(400, message='Can only re-process published uploads that are not currently processing')

            if len(metadata) > 0:
                abort(400, message='You can not provide metadata for re-processing')

            if len(upload.outdated_calcs) == 0:
                abort(400, message='You can only re-process uploads with at least one outdated calculation')

            upload.reset()
            upload.re_process_upload()

            return upload, 200

        abort(400, message='Unsupported operation %s.' % operation)
517
518
519
520


# Shell command strings returned by UploadCommandResource.get.
upload_command_model = api.model('UploadCommand', {
    'upload_url': fields.Url,
    'upload_command': fields.String,
    'upload_command_with_name': fields.String,
    'upload_progress_command': fields.String,
    'upload_command_form': fields.String,
    'upload_tar_command': fields.String
})


@ns.route('/command')
class UploadCommandResource(Resource):
    @api.doc('get_upload_command')
    @api.marshal_with(upload_command_model, code=200, description='Upload command send')
    @login_really_required
    def get(self):
        """ Get url and example command for shell based uploads. """
        upload_url = '%s/uploads/?curl=True' % config.api_url(ssl=False)
        upload_url_with_name = upload_url + '&name=<name>'

        # upload_command = 'curl -X PUT -H "X-Token: %s" "%s" -F file=@<local_file>' % (
        #     g.user.get_auth_token().decode('utf-8'), upload_url)

        # Upload via streaming data tends to work much easier, e.g. no mime type issues, etc.
        # It is also easier for the user to understand IMHO.
        upload_command = 'curl -H X-Token:%s %s -T <local_file>' % (
            g.user.get_auth_token().decode('utf-8'), upload_url)

        upload_command_form = 'curl -H X-Token:%s %s -X PUT -F file=@<local_file>' % (
            g.user.get_auth_token().decode('utf-8'), upload_url)

        upload_command_with_name = 'curl -H X-Token:%s "%s" -X PUT -T <local_file>' % (
            g.user.get_auth_token().decode('utf-8'), upload_url_with_name)

        # echo the server's plain-text response after the upload finishes
        upload_progress_command = upload_command + ' | xargs echo'
        upload_tar_command = 'tar -cf - <local_folder> | curl -# -H X-Token:%s %s -T - | xargs echo' % (
            g.user.get_auth_token().decode('utf-8'), upload_url)

        return dict(
            upload_url=upload_url,
            upload_command=upload_command,
            upload_command_with_name=upload_command_with_name,
            upload_progress_command=upload_progress_command,
            upload_command_form=upload_command_form,
            upload_tar_command=upload_tar_command), 200