Commit ffecdabd authored by Markus Scheidgen

Switch to new files implementation. Not all tests working.

parent 49851e8c
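
The diff below replaces the old per-file classes in nomad.files (ArchiveFile, ArchiveLogFile, RepositoryFile, UploadFile) with a single per-upload abstraction in nomad.uploads. A minimal consumer-side sketch of the new pattern, using only names that appear in the diff; the surrounding function is illustrative, not part of the commit:

    from nomad.uploads import UploadFiles

    def read_calc_log(upload_hash: str, calc_hash: str) -> str:
        # UploadFiles.get resolves all files of one upload; the handlers
        # below treat a missing upload as a KeyError and answer with a 404
        upload_files = UploadFiles.get(upload_hash)
        # per-calc files (archive, log, raw files) are opened through the
        # upload object, addressed by calc hash
        with upload_files.archive_log_file(calc_hash, 'rt') as f:
            return f.read()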
@@ -44,7 +44,7 @@
             "cwd": "${workspaceFolder}",
             "program": "${workspaceFolder}/.pyenv/bin/pytest",
             "args": [
-                "-sv", "tests/test_uploads.py::TestPublicUploadFiles::test_rawfile[Ppr]"
+                "-sv", "tests/processing/test_data.py::test_processing[tests/data/proc/examples_template.zip]"
             ]
         },
         {
...
@@ -25,7 +25,7 @@ from flask_restplus import abort, Resource
 import nomad_meta_info

 from nomad import config
-from nomad.files import ArchiveFile, ArchiveLogFile
+from nomad.uploads import UploadFiles
 from nomad.utils import get_logger

 from .app import api
@@ -52,17 +52,13 @@ class ArchiveCalcLogResource(Resource):
         archive_id = '%s/%s' % (upload_hash, calc_hash)

         try:
-            archive = ArchiveLogFile(archive_id)
-            if not archive.exists():
-                raise FileNotFoundError()
-
-            archive_path = archive.os_path
-
-            rv = send_file(
-                archive_path,
-                mimetype='text/plain',
-                as_attachment=True,
-                attachment_filename=os.path.basename(archive_path))
+            upload_files = UploadFiles.get(upload_hash)
+            with upload_files.archive_log_file(calc_hash, 'rt') as f:
+                rv = send_file(
+                    f,
+                    mimetype='text/plain',
+                    as_attachment=True,
+                    attachment_filename='%s.log' % archive_id)

             return rv
         except FileNotFoundError:
@@ -90,23 +86,20 @@ class ArchiveCalcResource(Resource):
         archive_id = '%s/%s' % (upload_hash, calc_hash)

         try:
-            archive = ArchiveFile(archive_id)
-            if not archive.exists():
-                raise FileNotFoundError()
-
-            archive_path = archive.os_path
-
-            rv = send_file(
-                archive_path,
-                mimetype='application/json',
-                as_attachment=True,
-                attachment_filename=os.path.basename(archive_path))
+            upload_file = UploadFiles.get(upload_hash)
+            mode = 'rb' if config.files.compress_archive else 'rt'
+            with upload_file.archive_file(calc_hash, mode) as f:
+                rv = send_file(
+                    f,
+                    mimetype='application/json',
+                    as_attachment=True,
+                    attachment_filename='%s.json' % archive_id)

             if config.files.compress_archive:
                 rv.headers['Content-Encoding'] = 'gzip'

             return rv
-        except FileNotFoundError:
+        except KeyError:
             abort(404, message='Archive %s does not exist.' % archive_id)
         except Exception as e:
             logger = get_logger(
...
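
A detail worth noting in the hunk above: when config.files.compress_archive is set, archives are stored gzip-compressed, so the handler reads raw bytes ('rb') and sets Content-Encoding: gzip, leaving decompression to the HTTP client. The same idea in isolation, as a hedged sketch; send_file is Flask's, the other names follow the diff:

    from flask import send_file

    def send_archive(upload_files, calc_hash, compressed):
        # raw gzip bytes when compressed, decoded text otherwise
        mode = 'rb' if compressed else 'rt'
        with upload_files.archive_file(calc_hash, mode) as f:
            rv = send_file(f, mimetype='application/json')
        if compressed:
            # the client gunzips transparently because of this header
            rv.headers['Content-Encoding'] = 'gzip'
        return rv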
@@ -26,8 +26,8 @@ from flask import Response, request, send_file
 from flask_restplus import abort, Resource, fields
 from werkzeug.exceptions import HTTPException

-from nomad.files import RepositoryFile
 from nomad.utils import get_logger
+from nomad.uploads import UploadFiles

 from .app import api
 from .auth import login_if_available
@@ -69,15 +69,14 @@ class RawFileFromPathResource(Resource):
         """
         upload_filepath = fix_file_paths(path)

-        repository_file = RepositoryFile(upload_hash)
-        if not repository_file.exists():
+        try:
+            upload_files = UploadFiles.get(upload_hash)
+        except KeyError:
             abort(404, message='The upload with hash %s does not exist.' % upload_hash)

         if upload_filepath[-1:] == '*':
             upload_filepath = upload_filepath[0:-1]
-            files = list(
-                file for file in repository_file.manifest
-                if file.startswith(upload_filepath))
+            files = list(upload_files.raw_file_manifest(path_prefix=upload_filepath))
             if len(files) == 0:
                 abort(404, message='There are no files for %s.' % upload_filepath)
             else:
@@ -85,8 +84,7 @@ class RawFileFromPathResource(Resource):
                 return respond_to_get_raw_files(upload_hash, files, compress)

         try:
-            the_file = repository_file.get_file(upload_filepath)
-            with the_file.open() as f:
+            with upload_files.raw_file(upload_filepath) as f:
                 rv = send_file(
                     f,
                     mimetype='application/octet-stream',
@@ -94,7 +92,7 @@ class RawFileFromPathResource(Resource):
                     attachment_filename=os.path.basename(upload_filepath))
                 return rv
         except KeyError:
-            files = list(file for file in repository_file.manifest if file.startswith(upload_filepath))
+            files = list(file for file in upload_files.raw_file_manifest(upload_filepath))
             if len(files) == 0:
                 abort(404, message='The file %s does not exist.' % upload_filepath)
             else:
@@ -161,8 +159,9 @@ class RawFilesResource(Resource):
 def respond_to_get_raw_files(upload_hash, files, compress=False):
     logger = get_logger(__name__, endpoint='raw', action='get files', upload_hash=upload_hash)

-    repository_file = RepositoryFile(upload_hash)
-    if not repository_file.exists():
+    try:
+        upload_file = UploadFiles.get(upload_hash)
+    except KeyError:
         abort(404, message='The upload with hash %s does not exist.' % upload_hash)

     def generator():
@@ -170,22 +169,21 @@ def respond_to_get_raw_files(upload_hash, files, compress=False):
         def iterator():
             """ Replace the directory based iter of zipstream with an iter over all given files. """
             try:
-                with repository_file.zipped_container.zip_file() as zf:
-                    for filename in files:
-                        # Write a file to the zipstream.
-                        try:
-                            with zf.open(repository_file.zipped_container.get_zip_path(filename)) as f:
-                                def iter_content():
-                                    while True:
-                                        data = f.read(100000)
-                                        if not data:
-                                            break
-                                        yield data
-
-                                yield dict(arcname=filename, iterable=iter_content())
-                        except KeyError as e:
-                            # files that are not found will not be returned
-                            pass
+                for filename in files:
+                    # Write a file to the zipstream.
+                    try:
+                        with upload_file.raw_file(filename) as f:
+                            def iter_content():
+                                while True:
+                                    data = f.read(100000)
+                                    if not data:
+                                        break
+                                    yield data
+
+                            yield dict(arcname=filename, iterable=iter_content())
+                    except KeyError as e:
+                        # files that are not found will not be returned
+                        pass
             except Exception as e:
                 logger.error('Exception while accessing files.', exc_info=e)
...
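
The iterator above hands zipstream one lazily evaluated chunk iterator per member file, so the zip is streamed without buffering whole files in memory. A self-contained sketch of the same pattern, assuming the python-zipstream package (paths and chunk size are illustrative):

    import zipstream

    def stream_zip(paths):
        zf = zipstream.ZipFile(mode='w', compression=zipstream.ZIP_DEFLATED)
        for path in paths:
            # path=path pins the loop variable for each closure
            def iter_content(path=path):
                with open(path, 'rb') as f:
                    while True:
                        data = f.read(100000)
                        if not data:
                            break
                        yield data
            zf.write_iter(path, iter_content())
        # iterating zf yields the zip archive bytes chunk by chunk
        return zf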
@@ -13,8 +13,8 @@
 # limitations under the License.

 """
-The upload API of the nomad@FAIRDI APIs. Provides endpoints to create uploads, upload
-files, and retrieve the processing status of uploads.
+The upload API of the nomad@FAIRDI APIs. Provides endpoints to upload files and
+get the processing status of uploads.
 """

 from flask import g, request
@@ -27,7 +27,7 @@ from nomad import config
 from nomad.processing import Upload
 from nomad.processing import NotAllowedDuringProcessing
 from nomad.utils import get_logger
-from nomad.files import UploadFile
+from nomad.uploads import ArchiveBasedStagingUploadFiles

 from .app import api
 from .auth import login_really_required
@@ -163,9 +163,11 @@ class UploadListResource(Resource):
         logger = get_logger(__name__, endpoint='upload', action='put', upload_id=upload.upload_id)
         logger.info('upload created')

-        uploadFile = UploadFile(upload.upload_id, local_path=local_path)
+        upload_files = ArchiveBasedStagingUploadFiles(
+            upload.upload_id, create=True, local_path=local_path)

         if local_path:
+            # the file is already there and does not need to be received
             pass
         elif request.mimetype == 'application/multipart-formdata':
             # multipart formdata, e.g. with curl -X put "url" -F file=@local_file
@@ -176,11 +178,11 @@
             if upload.name is '':
                 upload.name = file.filename

-            file.save(uploadFile.os_path)
+            file.save(upload_files.upload_file_os_path)
         else:
             # simple streaming data in HTTP body, e.g. with curl "url" -T local_file
             try:
-                with uploadFile.open('wb') as f:
+                with open(upload_files.upload_file_os_path, 'wb') as f:
                     while not request.stream.is_exhausted:
                         f.write(request.stream.read(1024))
@@ -188,10 +190,10 @@
             logger.error('Error on streaming upload', exc_info=e)
             abort(400, message='Some IO went wrong, download probably aborted/disrupted.')

-        if not uploadFile.is_valid:
-            uploadFile.delete()
+        if not upload_files.is_valid:
+            upload_files.delete()
             upload.delete()
-            abort(400, message='Bad file format, expected %s.' % ", ".join(UploadFile.formats))
+            abort(400, message='Bad file format, expected %s.' % ", ".join(upload_files.formats))

         logger.info('received uploaded file')
         upload.upload_time = datetime.now()
...
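
The PUT handler above funnels both upload styles (multipart form data and raw streaming in the HTTP body) into one staging-files object. A short sketch of that lifecycle with the names used in the diff; the upload id and payload are made up:

    from nomad.uploads import ArchiveBasedStagingUploadFiles

    upload_files = ArchiveBasedStagingUploadFiles('some-upload-id', create=True)
    # received bytes land at a well-known path owned by the staging object ...
    with open(upload_files.upload_file_os_path, 'wb') as f:
        f.write(b'...zip bytes from the client...')
    # ... and are validated; invalid uploads are cleaned up again
    if not upload_files.is_valid:
        upload_files.delete()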
@@ -16,7 +16,7 @@
 This module contains classes that allow to represent the core
 nomad data entities :class:`Upload` and :class:`Calc` on a high level of abstraction
 independent from their representation in the different modules :py:mod:`nomad.repo`,
-:py:mod:`nomad.processing`, :py:mod:`nomad.coe_repo`, :py:mod:`nomad.files`.
+:py:mod:`nomad.processing`, :py:mod:`nomad.coe_repo`, :py:mod:`nomad.uploads`.
 It is not about representing every detail, but those parts that are directly involved in
 api, processing, migration, mirroring, or other 'infrastructure' operations.
 """
...
@@ -32,7 +32,7 @@ from structlog import wrap_logger
 from contextlib import contextmanager

 from nomad import utils, coe_repo, datamodel
-from nomad.files import UploadFile, ArchiveFile, ArchiveLogFile, File
+from nomad.uploads import PathObject, ArchiveBasedStagingUploadFiles
 from nomad.repo import RepoCalc, RepoUpload
 from nomad.processing.base import Proc, Chord, process, task, PENDING, SUCCESS, FAILURE
 from nomad.parsing import parsers, parser_dict
@@ -58,13 +58,11 @@ class Calc(Proc, datamodel.Calc):
         parser: the name of the parser used to process this calc
         upload_id: the id of the upload used to create this calculation
         mainfile: the mainfile (including path in upload) that was used to create this calc
-        mainfile_tmp_path: path to the mainfile extracted for processing
     """
     archive_id = StringField(primary_key=True)
     upload_id = StringField()
     mainfile = StringField()
     parser = StringField()
-    mainfile_tmp_path = StringField()

     meta: Any = {
         'indices': [
@@ -75,9 +73,9 @@ class Calc(Proc, datamodel.Calc):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._parser_backend = None
-        self._upload = None
+        self._upload: Upload = None
+        self._upload_files: ArchiveBasedStagingUploadFiles = None
         self._calc_proc_logwriter = None
-        self._calc_proc_logfile = None
         self._calc_proc_logwriter_ctx: ContextManager = None

     @classmethod
@@ -85,8 +83,8 @@ class Calc(Proc, datamodel.Calc):
         return cls.get_by_id(id, 'archive_id')

     @property
-    def mainfile_file(self) -> File:
-        return File(self.mainfile_tmp_path)
+    def mainfile_file(self) -> PathObject:
+        return self.upload_files.raw_file_object(self.mainfile)

     @property
     def calc_hash(self) -> str:
@@ -98,15 +96,24 @@ class Calc(Proc, datamodel.Calc):
             self._upload = Upload.get(self.upload_id)
         return self._upload

+    @property
+    def upload_files(self) -> ArchiveBasedStagingUploadFiles:
+        if not self._upload_files:
+            self._upload_files = ArchiveBasedStagingUploadFiles(self.upload_id, public_only=False)
+        return self._upload_files
+
+    @property
+    def upload_hash(self):
+        return utils.archive.upload_hash(self.archive_id)
+
     def delete(self):
         """
         Delete this calculation and all associated data. This includes all files,
         the archive, and this search index entry.
         TODO is this needed? Or do we always delete whole uploads in bulk.
         """
-        # delete the archive
-        if self.archive_id is not None:
-            ArchiveFile(self.archive_id).delete()
+        # delete all files
+        self.upload_files.delete()

         # delete the search index entry
         try:
@@ -120,11 +127,10 @@ class Calc(Proc, datamodel.Calc):
         super().delete()

     def get_logger(self, **kwargs):
-        upload_hash, calc_hash = self.archive_id.split('/')
         logger = super().get_logger()
         logger = logger.bind(
             upload_id=self.upload_id, mainfile=self.mainfile,
-            upload_hash=upload_hash, calc_hash=calc_hash,
+            upload_hash=self.upload_hash, calc_hash=self.calc_hash,
             archive_id=self.archive_id, **kwargs)

         return logger
@@ -137,8 +143,7 @@ class Calc(Proc, datamodel.Calc):
         logger = self.get_logger(**kwargs)

         if self._calc_proc_logwriter is None:
-            self._calc_proc_logfile = ArchiveLogFile(self.archive_id)
-            self._calc_proc_logwriter_ctx = self._calc_proc_logfile.open('wt')
+            self._calc_proc_logwriter_ctx = self.upload_files.archive_log_file(self.calc_hash, 'wt')
             self._calc_proc_logwriter = self._calc_proc_logwriter_ctx.__enter__()  # pylint: disable=E1101

         def save_to_calc_log(logger, method_name, event_dict):
@@ -184,7 +189,8 @@ class Calc(Proc, datamodel.Calc):
         parser = parser_dict[self.parser]

         with utils.timer(logger, 'parser executed', input_size=self.mainfile_file.size):
-            self._parser_backend = parser.run(self.mainfile_tmp_path, logger=logger)
+            self._parser_backend = parser.run(
+                self.upload_files.raw_file_object(self.mainfile).os_path, logger=logger)

         self._parser_backend.openNonOverlappingSection('section_calculation_info')
         self._parser_backend.addValue('upload_id', self.upload_id)
@@ -263,7 +269,7 @@ class Calc(Proc, datamodel.Calc):
             staging=True,
             restricted=False,
             user_id=self.upload.user_id,
-            aux_files=list(self.upload.upload_file.get_siblings(self.mainfile)))
+            aux_files=list(self.upload_files.calc_files(self.mainfile, with_mainfile=False)))

         with utils.timer(logger, 'indexed', step='index'):
             # persist to elastic search
@@ -280,11 +286,10 @@ class Calc(Proc, datamodel.Calc):
                 input_size=self.mainfile_file.size) as log_data:

             # persist the archive
-            archive_file = ArchiveFile(self.archive_id)
-            with archive_file.write_archive_json() as out:
+            with self.upload_files.archive_file(self.calc_hash, 'wt') as out:
                 self._parser_backend.write_json(out, pretty=True)

-            log_data.update(archive_size=archive_file.size)
+            log_data.update(archive_size=self.upload_files.archive_file_object(self.calc_hash).size)

         # close loghandler
         if self._calc_proc_logwriter is not None:
@@ -294,7 +299,7 @@ class Calc(Proc, datamodel.Calc):
             self._calc_proc_logwriter_ctx.__exit__(None, None, None)  # pylint: disable=E1101
             self._calc_proc_logwriter = None

-            log_data.update(log_size=self._calc_proc_logfile.size)
+            log_data.update(log_size=self.upload_files.archive_log_file_object(self.calc_hash).size)


 class Upload(Chord, datamodel.Upload):
@@ -341,7 +346,7 @@ class Upload(Chord, datamodel.Upload):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self._upload_file = None
+        self._upload_files: ArchiveBasedStagingUploadFiles = None

     @classmethod
     def get(cls, id):
@@ -367,28 +372,17 @@ class Upload(Chord, datamodel.Upload):
         if not (self.completed or self.current_task == 'uploading'):
             raise NotAllowedDuringProcessing()

-        with lnr(logger, 'delete upload file'):
-            try:
-                UploadFile(self.upload_id, local_path=self.local_path).delete()
-            except KeyError:
-                if self.current_task == 'uploading':
-                    logger.debug(
-                        'Upload exist, but file does not exist. '
-                        'It was probably aborted and deleted.')
-                else:
-                    logger.debug('Upload exist, but uploaded file does not exist.')
-
-        with lnr(logger, 'deleting calcs'):
-            # delete archive files
-            ArchiveFile.delete_archives(upload_hash=self.upload_hash)
+        with lnr(logger, 'delete all files of upload'):
+            self.upload_files.delete()

+        with lnr(logger, 'deleting calcs db entries'):
             # delete repo entries
             self.to(RepoUpload).delete()

             # delete calc processings
             Calc.objects(upload_id=self.upload_id).delete()

-        with lnr(logger, 'deleting upload'):
+        with lnr(logger, 'deleting upload db entry'):
             super().delete()
@@ -433,11 +427,10 @@ class Upload(Chord, datamodel.Upload):
         pass

     @property
-    def upload_file(self):
-        """ The :class:`UploadFile` instance that represents the uploaded file of this upload. """
-        if not self._upload_file:
-            self._upload_file = UploadFile(self.upload_id, local_path=self.local_path)
-        return self._upload_file
+    def upload_files(self) -> ArchiveBasedStagingUploadFiles:
+        if not self._upload_files:
+            self._upload_files = ArchiveBasedStagingUploadFiles(self.upload_id, public_only=False)
+        return self._upload_files

     @task
     def extracting(self):
@@ -451,15 +444,15 @@ class Upload(Chord, datamodel.Upload):
         try:
             with utils.timer(
                     logger, 'upload extracted', step='extracting',
-                    upload_size=self.upload_file.size):
-                self.upload_file.extract()
+                    upload_size=self.upload_files.size):
+                self.upload_files.extract()
         except KeyError as e:
             self.fail('process request for non existing upload', level=logging.INFO)