Commit 14cb7dee authored by Markus Scheidgen

Removed upload_hash and replaced it with upload_id.

parent e63f46b1
@@ -95,7 +95,7 @@ def with_logger(func):
         args = inspect.getcallargs(wrapper, *args, **kwargs)
         logger_args = {
             k: v for k, v in args.items()
-            if k in ['upload_id', 'upload_hash', 'calc_hash']}
+            if k in ['upload_id', 'calc_hash']}
         logger = utils.get_logger(__name__, **logger_args)
         args.update(logger=logger)
         try:
@@ -42,19 +42,19 @@ class ArchiveCalcLogResource(Resource):
     @api.response(401, 'Not authorized to access the data.')
     @api.response(200, 'Archive data sent', headers={'Content-Type': 'application/plain'})
     @login_if_available
-    def get(self, upload_hash, calc_hash):
+    def get(self, upload_id, calc_hash):
         """
         Get calculation processing log.

-        Calcs are references via *upload_hash*, *calc_hash* pairs.
+        Calcs are referenced via *upload_id*, *calc_hash* pairs.
         """
-        archive_id = '%s/%s' % (upload_hash, calc_hash)
+        archive_id = '%s/%s' % (upload_id, calc_hash)

         upload_files = UploadFiles.get(
-            upload_hash, is_authorized=create_authorization_predicate(upload_hash, calc_hash))
+            upload_id, is_authorized=create_authorization_predicate(upload_id, calc_hash))
         if upload_files is None:
-            abort(404, message='Archive %s does not exist.' % upload_hash)
+            abort(404, message='Upload %s does not exist.' % upload_id)

         try:
             return send_file(
@@ -63,7 +63,7 @@ class ArchiveCalcLogResource(Resource):
                 as_attachment=True,
                 attachment_filename='%s.log' % archive_id)
         except Restricted:
-            abort(401, message='Not authorized to access %s/%s.' % (upload_hash, calc_hash))
+            abort(401, message='Not authorized to access %s/%s.' % (upload_id, calc_hash))
         except KeyError:
             abort(404, message='Calculation %s does not exist.' % archive_id)
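From the client side, the renamed route would be addressed roughly as follows. This is a hedged sketch only: it assumes the archive namespace registers this resource with a '/logs' prefix via calc_route, which is not visible in this diff, and all values are placeholders.

    import requests

    # Hypothetical client call; api_base, upload_id and calc_hash are placeholders.
    resp = requests.get('%s/archive/logs/%s/%s' % (api_base, upload_id, calc_hash))
    # per the responses above: 404 if the upload does not exist, 401 if restricted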
@@ -75,19 +75,19 @@ class ArchiveCalcResource(Resource):
     @api.response(401, 'Not authorized to access the data.')
     @api.response(200, 'Archive data sent')
     @login_if_available
-    def get(self, upload_hash, calc_hash):
+    def get(self, upload_id, calc_hash):
         """
         Get calculation data in archive form.

-        Calcs are references via *upload_hash*, *calc_hash* pairs.
+        Calcs are referenced via *upload_id*, *calc_hash* pairs.
         """
-        archive_id = '%s/%s' % (upload_hash, calc_hash)
+        archive_id = '%s/%s' % (upload_id, calc_hash)

         upload_file = UploadFiles.get(
-            upload_hash, is_authorized=create_authorization_predicate(upload_hash, calc_hash))
+            upload_id, is_authorized=create_authorization_predicate(upload_id, calc_hash))
         if upload_file is None:
-            abort(404, message='Archive %s does not exist.' % upload_hash)
+            abort(404, message='Upload %s does not exist.' % upload_id)

         try:
             return send_file(
@@ -96,7 +96,7 @@ class ArchiveCalcResource(Resource):
                 as_attachment=True,
                 attachment_filename='%s.json' % archive_id)
         except Restricted:
-            abort(401, message='Not authorized to access %s/%s.' % (upload_hash, calc_hash))
+            abort(401, message='Not authorized to access %s/%s.' % (upload_id, calc_hash))
         except KeyError:
             abort(404, message='Calculation %s does not exist.' % archive_id)
@@ -149,7 +149,7 @@ class TokenResource(Resource):
         'there is no token for you.')


-def create_authorization_predicate(upload_hash, calc_hash=None):
+def create_authorization_predicate(upload_id, calc_hash=None):
    """
    Returns a predicate that determines if the logged in user has the authorization
    to access the given upload and calculation.
@@ -160,18 +160,18 @@ def create_authorization_predicate(upload_id, calc_hash=None):
            return False

        # look in repository
-        upload = coe_repo.Upload.from_upload_hash(upload_hash)
+        upload = coe_repo.Upload.from_upload_id(upload_id)
        if upload is not None:
            return upload.user_id == g.user.user_id

        # look in staging
-        staging_upload = processing.Upload.get(upload_hash)
+        staging_upload = processing.Upload.get(upload_id)
        if staging_upload is not None:
            return str(g.user.user_id) == str(staging_upload.user_id)

        # There are no db entries for the given resource
-        if files.UploadFiles.get(upload_hash) is not None:
-            logger = utils.get_logger(__name__, upload_hash=upload_hash, calc_hash=calc_hash)
+        if files.UploadFiles.get(upload_id) is not None:
+            logger = utils.get_logger(__name__, upload_id=upload_id, calc_hash=calc_hash)
            logger.error('Upload files without respective db entry')
            raise KeyError
@@ -45,10 +45,10 @@ pagination_request_parser.add_argument(
 def calc_route(ns, prefix: str = ''):
     """ A resource decorator for /<upload>/<calc> based routes. """
     def decorator(func):
-        ns.route('%s/<string:upload_hash>/<string:calc_hash>' % prefix)(
+        ns.route('%s/<string:upload_id>/<string:calc_hash>' % prefix)(
             api.doc(params={
-                'upload_hash': 'The unique hash for the requested upload.',
-                'calc_hash': 'The unique hash for the requested calculation.'
+                'upload_id': 'The unique id for the requested upload.',
+                'calc_hash': 'The upload-unique hash for the requested calculation.'
             })(func)
         )
     return decorator
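For orientation, a minimal sketch of how this decorator is applied; the resource class and prefix below are hypothetical, not part of this commit. Note that registration happens as a side effect of `ns.route`, so the decorator intentionally returns nothing for the class itself.

    # Hypothetical usage of calc_route; 'ExampleResource' and '/example'
    # are made up for illustration.
    @calc_route(ns, '/example')
    class ExampleResource(Resource):
        def get(self, upload_id, calc_hash):
            # the renamed route parameters arrive here
            ...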
@@ -38,9 +38,9 @@ raw_file_from_path_parser = api.parser()
 raw_file_from_path_parser.add_argument(**raw_file_compress_argument)


-@ns.route('/<string:upload_hash>/<path:path>')
+@ns.route('/<string:upload_id>/<path:path>')
 @api.doc(params={
-    'upload_hash': 'The unique hash for the requested upload.',
+    'upload_id': 'The unique id for the requested upload.',
     'path': 'The path to a file or directory.'
 })
 @api.header('Content-Type', 'application/gz')
@@ -51,7 +51,7 @@ class RawFileFromPathResource(Resource):
     @api.response(200, 'File(s) sent', headers={'Content-Type': 'application/gz'})
     @api.expect(raw_file_from_path_parser, validate=True)
     @login_if_available
-    def get(self, upload_hash: str, path: str):
+    def get(self, upload_id: str, path: str):
         """
         Get a single raw calculation file or whole directory from a given upload.
@@ -63,9 +63,9 @@ class RawFileFromPathResource(Resource):
         upload_filepath = path

         upload_files = UploadFiles.get(
-            upload_hash, create_authorization_predicate(upload_hash))
+            upload_id, create_authorization_predicate(upload_id))
         if upload_files is None:
-            abort(404, message='The upload with hash %s does not exist.' % upload_hash)
+            abort(404, message='The upload with id %s does not exist.' % upload_id)

         if upload_filepath[-1:] == '*':
             upload_filepath = upload_filepath[0:-1]
@@ -74,7 +74,7 @@ class RawFileFromPathResource(Resource):
             abort(404, message='There are no files for %s.' % upload_filepath)
         else:
             compress = request.args.get('compress', None) is not None
-            return respond_to_get_raw_files(upload_hash, files, compress)
+            return respond_to_get_raw_files(upload_id, files, compress)

         try:
             return send_file(
@@ -83,7 +83,7 @@ class RawFileFromPathResource(Resource):
                 as_attachment=True,
                 attachment_filename=os.path.basename(upload_filepath))
         except Restricted:
-            abort(401, message='Not authorized to access upload %s.' % upload_hash)
+            abort(401, message='Not authorized to access upload %s.' % upload_id)
         except KeyError:
             files = list(file for file in upload_files.raw_file_manifest(upload_filepath))
             if len(files) == 0:
@@ -106,9 +106,9 @@ raw_files_request_parser.add_argument(
     'files', required=True, type=str, help='Comma separated list of files to download.', location='args')


-@ns.route('/<string:upload_hash>')
+@ns.route('/<string:upload_id>')
 @api.doc(params={
-    'upload_hash': 'The unique hash for the requested upload.'
+    'upload_id': 'The unique id for the requested upload.'
 })
 class RawFilesResource(Resource):
     @api.doc('get_files')
@@ -116,7 +116,7 @@ class RawFilesResource(Resource):
     @api.response(200, 'File(s) sent', headers={'Content-Type': 'application/gz'})
     @api.expect(raw_files_request_model, validate=True)
     @login_if_available
-    def post(self, upload_hash):
+    def post(self, upload_id):
         """
         Download multiple raw calculation files in a .zip file.

         Zip files are streamed; instead of 401 errors, the zip file will just not contain
@@ -126,14 +126,14 @@ class RawFilesResource(Resource):
         compress = json_data.get('compress', False)
         files = [file.strip() for file in json_data['files']]

-        return respond_to_get_raw_files(upload_hash, files, compress)
+        return respond_to_get_raw_files(upload_id, files, compress)

     @api.doc('get_files_alternate')
     @api.response(404, 'The upload or path does not exist')
     @api.response(200, 'File(s) sent', headers={'Content-Type': 'application/gz'})
     @api.expect(raw_files_request_parser, validate=True)
     @login_if_available
-    def get(self, upload_hash):
+    def get(self, upload_id):
         """
         Download multiple raw calculation files.
         Download multiple raw calculation files in a .zip file.
@@ -147,14 +147,14 @@ class RawFilesResource(Resource):
             abort(400, message="No files argument given.")
         files = [file.strip() for file in files_str.split(',')]

-        return respond_to_get_raw_files(upload_hash, files, compress)
+        return respond_to_get_raw_files(upload_id, files, compress)


-def respond_to_get_raw_files(upload_hash, files, compress=False):
+def respond_to_get_raw_files(upload_id, files, compress=False):
     upload_files = UploadFiles.get(
-        upload_hash, create_authorization_predicate(upload_hash))
+        upload_id, create_authorization_predicate(upload_id))
     if upload_files is None:
-        abort(404, message='The upload with hash %s does not exist.' % upload_hash)
+        abort(404, message='The upload with id %s does not exist.' % upload_id)

     def generator():
         """ Stream a zip file with all files using zipstream. """
@@ -188,5 +188,5 @@ def respond_to_get_raw_files(upload_id, files, compress=False):
             yield chunk

     response = Response(stream_with_context(generator()), mimetype='application/zip')
-    response.headers['Content-Disposition'] = 'attachment; filename={}'.format('%s.zip' % upload_hash)
+    response.headers['Content-Disposition'] = 'attachment; filename={}'.format('%s.zip' % upload_id)
     return response
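The body of `generator` is collapsed in this diff. As orientation only, a hedged sketch of the zipstream pattern its docstring refers to; it assumes the python-zipstream package and an `upload_files.raw_file` accessor, and the actual implementation in this commit may differ.

    import zipstream

    def generator():
        """ Stream a zip file with all files using zipstream. """
        zip_file = zipstream.ZipFile(
            mode='w',
            compression=zipstream.ZIP_DEFLATED if compress else zipstream.ZIP_STORED)
        for filename in files:
            # raw_file is assumed to return a readable file-like object;
            # per the docstring above, restricted files would simply be
            # omitted from the zip instead of raising a 401
            zip_file.write_iter(filename, upload_files.raw_file(filename))
        for chunk in zip_file:
            yield chunk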
@@ -35,19 +35,19 @@ class RepoCalcResource(Resource):
     @api.response(404, 'The upload or calculation does not exist')
     @api.response(200, 'Metadata sent')
     @api.doc('get_repo_calc')
-    def get(self, upload_hash, calc_hash):
+    def get(self, upload_id, calc_hash):
         """
         Get calculation metadata in repository form.

         Repository metadata only entails the quantities shown in the repository.
         This is basically the elastic search index entry for the
-        requested calculations. Calcs are references via *upload_hash*, *calc_hash*
+        requested calculations. Calcs are referenced via *upload_id*, *calc_hash*
         pairs.
         """
         try:
-            return RepoCalc.get(id='%s/%s' % (upload_hash, calc_hash)).json_dict, 200
+            return RepoCalc.get(id='%s/%s' % (upload_id, calc_hash)).json_dict, 200
         except NotFoundError:
-            abort(404, message='There is no calculation for %s/%s' % (upload_hash, calc_hash))
+            abort(404, message='There is no calculation for %s/%s' % (upload_id, calc_hash))
         except Exception as e:
             abort(500, message=str(e))
@@ -55,12 +55,7 @@ upload_model = api.inherit('UploadProcessing', proc_model, {
         description='The name of the upload. This can be provided during upload '
                     'using the name query parameter.'),
     'upload_id': fields.String(
-        description='The unique id for the upload. Its a random uuid and '
-                    'and used within nomad as long as no upload_hash is available.'),
-    'upload_hash': fields.String(
-        description='The unique upload hash. It is based on the uploaded content and '
-                    'used within nomad to identify uploads.'
-    ),
+        description='The unique id for the upload.'),
     'additional_metadata': fields.Arbitrary,
     'local_path': fields.String,
     'upload_time': fields.DateTime(dt_format='iso8601'),
@@ -143,7 +143,7 @@ class CalcProcReproduction:
     """
     def __init__(self, archive_id: str, override: bool = False) -> None:
         self.calc_hash = utils.archive.calc_hash(archive_id)
-        self.upload_hash = utils.archive.upload_hash(archive_id)
+        self.upload_id = utils.archive.upload_id(archive_id)
         self.mainfile = None
         self.parser = None
         self.logger = utils.get_logger(__name__, archive_id=archive_id)
@@ -156,7 +156,7 @@ class CalcProcReproduction:
             # download with request, since bravado does not support streaming
             # TODO currently only downloads mainfile
             self.logger.info('Downloading calc.')
-            req = requests.get('%s/raw/%s/%s' % (api_base, self.upload_hash, os.path.dirname(self.mainfile)), stream=True)
+            req = requests.get('%s/raw/%s/%s' % (api_base, self.upload_id, os.path.dirname(self.mainfile)), stream=True)
             with open(local_path, 'wb') as f:
                 for chunk in req.iter_content(chunk_size=1024):
                     f.write(chunk)
@@ -89,7 +89,7 @@ class UploadMetaData:

 class Upload(Base, datamodel.Upload):  # type: ignore
     __tablename__ = 'uploads'

-    upload_id = Column(Integer, primary_key=True, autoincrement=True)
+    coe_upload_id = Column('upload_id', Integer, primary_key=True, autoincrement=True)
     upload_name = Column(String)
     user_id = Column(Integer, ForeignKey('users.user_id'))
     is_processed = Column(Boolean)
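A note on the renamed column above: the legacy CoE database keeps its `upload_id` column, but the mapped Python attribute becomes `coe_upload_id`. That frees the `upload_id` name for NOMAD's own id, which is stored in the `upload_name` column (see the property in the next hunk). A small illustration with made-up values:

    # Sketch only; the id value is hypothetical.
    coe_upload = Upload.from_upload_id('some_nomad_upload_id')
    if coe_upload is not None:
        assert coe_upload.upload_id == coe_upload.upload_name  # NOMAD's upload id
        coe_upload.coe_upload_id  # the legacy integer primary key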
@@ -100,17 +100,17 @@ class Upload(Base, datamodel.Upload):  # type: ignore

     @classmethod
     def load_from(cls, obj):
-        return Upload.from_upload_hash(obj.upload_hash)
+        return Upload.from_upload_id(obj.upload_id)

     @staticmethod
-    def from_upload_hash(upload_hash) -> 'Upload':
+    def from_upload_id(upload_id) -> 'Upload':
         repo_db = infrastructure.repository_db
-        uploads = repo_db.query(Upload).filter_by(upload_name=upload_hash)
+        uploads = repo_db.query(Upload).filter_by(upload_name=upload_id)
         assert uploads.count() <= 1, 'Upload id/name must be unique'
         return uploads.first()

     @property
-    def upload_hash(self):
+    def upload_id(self):
         return self.upload_name

     @property
@@ -140,17 +140,14 @@ class Upload(Base, datamodel.Upload):  # type: ignore
         repo_db = infrastructure.repository_db
         repo_db.begin()

-        logger = utils.get_logger(
-            __name__,
-            upload_id=upload.upload_id,
-            upload_hash=upload.upload_hash)
+        logger = utils.get_logger(__name__, upload_id=upload.upload_id)

         result = None
         try:
             # create upload
             coe_upload = Upload(
-                upload_name=upload.upload_hash,
+                upload_name=upload.upload_id,
                 created=meta_data.get('_upload_time', upload.upload_time),
                 user=upload.uploader,
                 is_processed=True)
@@ -76,7 +76,6 @@ class Upload(Entity):

     Attributes:
         upload_id(str): The unique random id that each upload has
-        upload_hash(str): The hash/checksum that describes unique uploads
         upload_time(datetime): The upload time
         uploader(repo.User): The user that uploaded this upload
         calcs(Iterable[Calc]): An iterable over the calculations of this upload
@@ -85,10 +84,6 @@ class Upload(Entity):
     def upload_id(self) -> str:
         return '<not assigned>'

-    @property
-    def upload_hash(self) -> str:
-        raise NotImplementedError
-
     @property
     def upload_time(self) -> Type[datetime.datetime]:
         raise NotImplementedError
@@ -585,10 +585,6 @@ class StagingUploadFiles(UploadFiles):
         return utils.websave_hash(hash.digest(), utils.default_hash_len)

-    def upload_hash(self) -> str:
-        """ Returns: A hash for the whole upload. It is only available if upload *is_bag*. """
-        pass
-

 class ArchiveBasedStagingUploadFiles(StagingUploadFiles):
     """
@@ -100,15 +100,10 @@ class Calc(Proc, datamodel.Calc):
                 self._upload_files = ArchiveBasedStagingUploadFiles(self.upload_id, is_authorized=lambda: True, local_path=self.upload.local_path)
             return self._upload_files

-    @property
-    def upload_hash(self):
-        return utils.archive.upload_hash(self.archive_id)
-
     def get_logger(self, **kwargs):
         logger = super().get_logger()
         logger = logger.bind(
-            upload_id=self.upload_id, mainfile=self.mainfile,
-            upload_hash=self.upload_hash, calc_hash=self.calc_hash,
+            upload_id=self.upload_id, mainfile=self.mainfile, calc_hash=self.calc_hash,
             archive_id=self.archive_id, **kwargs)
         return logger
@@ -240,7 +235,7 @@ class Calc(Proc, datamodel.Calc):
     def archiving(self):
         logger = self.get_logger()

-        upload_hash, calc_hash = self.archive_id.split('/')
+        _, calc_hash = self.archive_id.split('/')
         additional = dict(
             mainfile=self.mainfile,
             upload_time=self.upload.upload_time,
@@ -254,7 +249,6 @@ class Calc(Proc, datamodel.Calc):
         repo_calc = RepoCalc.create_from_backend(
             self._parser_backend,
             additional=additional,
-            upload_hash=upload_hash,
             calc_hash=calc_hash,
             upload_id=self.upload_id)
         repo_calc.persist()
@@ -292,7 +286,6 @@ class Upload(Chord, datamodel.Upload):
         upload_id: the upload id generated by the database
         is_private: true if the upload and its derivatives are only visible to the uploader
         upload_time: the timestamp when the system realised the upload
-        upload_hash: the hash of the uploaded file
         user_id: the id of the user that created this upload
     """
     id_field = 'upload_id'
@@ -306,7 +299,6 @@ class Upload(Chord, datamodel.Upload):
     is_private = BooleanField(default=False)
     upload_time = DateTimeField()
-    upload_hash = StringField(default=None)
     user_id = StringField(required=True)
@@ -316,7 +308,7 @@ class Upload(Chord, datamodel.Upload):
     meta: Any = {
         'indexes': [
-            'upload_hash', 'user_id', 'status'
+            'user_id', 'status'
         ]
     }
@@ -413,17 +405,10 @@ class Upload(Chord, datamodel.Upload):
                     logger, 'upload extracted', step='extracting',
                     upload_size=self.upload_files.size):
                 self.upload_files.extract()
-        except KeyError as e:
+        except KeyError:
             self.fail('process request for non existing upload', level=logging.ERROR)
             return

-        # create and save a hash for the upload
-        try:
-            self.upload_hash = self.upload_id  # TODO self.upload_file.upload_hash()
-        except Exception as e:
-            self.fail('could not create upload hash', e)
-            return
-
         # check if the file was already uploaded and processed before
         if self.to(RepoUpload).exists():
             self.fail('The same file was already uploaded and processed.', level=logging.INFO)
@@ -463,7 +448,7 @@ class Upload(Chord, datamodel.Upload):
         total_calcs = 0
         for filename, parser in self.match_mainfiles():
             calc = Calc.create(
-                archive_id='%s/%s' % (self.upload_hash, utils.hash(filename)),
+                archive_id='%s/%s' % (self.upload_id, utils.hash(filename)),
                 mainfile=filename, parser=parser.name,
                 upload_id=self.upload_id)
@@ -45,13 +45,12 @@ class AlreadyExists(Exception): pass


 class RepoUpload(datamodel.Entity):
-    def __init__(self, upload_id, upload_hash):
+    def __init__(self, upload_id):
         self.upload_id = upload_id
-        self.upload_hash = upload_hash

     @classmethod
     def load_from(cls, obj):
-        return RepoUpload(obj.upload_id, obj.upload_hash)
+        return RepoUpload(obj.upload_id)

     @property
     def calcs(self):
@@ -67,7 +66,7 @@ class RepoUpload(datamodel.Entity):
         """ Returns true if there are already calcs from the given upload. """
         # TODO this is deprecated and should be verified via repository files
         search = Search(using=infrastructure.elastic_client, index=config.elastic.index_name) \
-            .query('match', upload_hash=self.upload_hash) \
+            .query('match', upload_id=self.upload_id) \
             .execute()
         return len(search) > 0
@@ -91,7 +90,6 @@ class RepoCalc(ElasticDocument, datamodel.Entity):
     calc_hash = Keyword()
     mainfile = Keyword()
-    upload_hash = Keyword()
     upload_id = Keyword()
     upload_time = Date()
@@ -116,38 +114,37 @@ class RepoCalc(ElasticDocument, datamodel.Entity):
     @property
     def upload(self):
-        return RepoUpload(self.upload_id, self.upload_hash)
+        return RepoUpload(self.upload_id)

     @property
     def archive_id(self) -> str:
         """ The unique id for this calculation. """
-        return '%s/%s' % (self.upload_hash, self.calc_hash)
+        return '%s/%s' % (self.upload_id, self.calc_hash)

     @classmethod
     def create_from_backend(
             cls, backend: LocalBackend, additional: Dict[str, Any],
-            upload_id: str, upload_hash: str, calc_hash: str) -> 'RepoCalc':
+            upload_id: str, calc_hash: str) -> 'RepoCalc':
         """
         Create a new calculation instance in elastic search. The data from the given backend
-        will be used. Additional meta-data can be given as *kwargs*. ``upload_id``,
-        ``upload_hash``, and ``calc_hash`` are mandatory.
+        will be used. Additional meta-data can be given as *kwargs*.
+        ``upload_id`` and ``calc_hash`` are mandatory.

         Arguments:
             backend: The parsing/normalizing backend that contains the calculation data.
             additional: Additional arguments not stored in the backend. E.g. ``user_id``,
                 ``staging``, ``restricted``
-            upload_hash: The upload hash of the originating upload.
             upload_id: The upload id of the originating upload.
             calc_hash: The upload-unique hash for this calculation.

         Returns:
             The created instance.
         """
-        assert upload_hash is not None and calc_hash is not None and upload_id is not None
-        additional.update(dict(upload_hash=upload_hash, calc_hash=calc_hash, upload_id=upload_id))
+        assert calc_hash is not None and upload_id is not None
+        additional.update(dict(calc_hash=calc_hash, upload_id=upload_id))

         # prepare the entry with all necessary properties from the backend
-        calc = cls(meta=dict(id='%s/%s' % (upload_hash, calc_hash)))
+        calc = cls(meta=dict(id='%s/%s' % (upload_id, calc_hash)))
         for property in cls._doc_type.mapping:
             mapped_property = key_mappings.get(property, property)
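Taken together with the `Calc.archiving` hunk further up, a hedged example of calling the slimmed-down signature; all argument values are placeholders and the backend object would come from parsing, which is not shown here.

    # Placeholder call mirroring Calc.archiving after this commit.
    repo_calc = RepoCalc.create_from_backend(
        parser_backend,
        additional=dict(mainfile='path/to/mainfile', staging=True),
        calc_hash='some_calc_hash',
        upload_id='some_upload_id')
    repo_calc.persist()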
@@ -165,7 +162,7 @@ class RepoCalc(ElasticDocument, datamodel.Entity):
                 program_name = 'unknown'
                 logger.warning(
                     'Missing property value', property=mapped_property, upload_id=upload_id,
-                    upload_hash=upload_hash, calc_hash=calc_hash, code=program_name)
+                    calc_hash=calc_hash, code=program_name)
                 continue

             setattr(calc, property, value)
@@ -98,7 +98,7 @@ class LogstashFormatter(logstash.formatter.LogstashFormatterBase):
             if key in ('event', 'stack_info', 'id', 'timestamp'):
                 continue
             elif key in (
-                    'upload_hash', 'archive_id', 'upload_id', 'calc_hash', 'mainfile',
+                    'archive_id', 'upload_id', 'calc_hash', 'mainfile',
                     'service', 'release'):
                 key = 'nomad.%s' % key
             else:
@@ -258,8 +258,8 @@ def timer(logger, event, method='info', **kwargs):

 class archive:
     @staticmethod
-    def create(upload_hash: str, calc_hash: str) -> str:
-        return '%s/%s' % (upload_hash, calc_hash)
+    def create(upload_id: str, calc_hash: str) -> str:
+        return '%s/%s' % (upload_id, calc_hash)

     @staticmethod
     def items(archive_id: str) -> List[str]:
@@ -274,5 +274,5 @@ class archive:
         return archive.item(archive_id, 1)

     @staticmethod
-    def upload_hash(archive_id: str) -> str:
+    def upload_id(archive_id: str) -> str:
         return archive.item(archive_id, 0)
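A short usage sketch of the renamed helpers; the values are made up, and `calc_hash` is the unchanged counterpart shown above.

    archive_id = archive.create('some_upload_id', 'some_calc_hash')
    assert archive_id == 'some_upload_id/some_calc_hash'
    assert archive.upload_id(archive_id) == 'some_upload_id'
    assert archive.calc_hash(archive_id) == 'some_calc_hash'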
@@ -144,26 +144,22 @@ def admin_user(repository_db):

 @pytest.fixture(scope='function')
 def mocksearch(monkeypatch):
-    uploads_by_hash = {}
+    uploads_by_id = {}
     by_archive_id = {}

     def persist(calc):
-        uploads_by_hash.setdefault(calc.upload_hash, []).append(calc)
+        uploads_by_id.setdefault(calc.upload_id, []).append(calc)
         by_archive_id[calc.archive_id] = calc

     def upload_exists(self):
-        return self.upload_hash in uploads_by_hash
+        return self.upload_id in uploads_by_id

     def upload_delete(self):
         upload_id = self.upload_id
         if upload_id in uploads_by_id:
             for calc in uploads_by_id[upload_id]:
                 del(by_archive_id[calc.archive_id])
-            upload_hash = uploads_by_id[upload_id][0].upload_hash
             del(uploads_by_id[upload_id])
-            del(uploads_by_hash[upload_hash])

     @property
     def upload_calcs(self):
@@ -85,7 +85,7 @@ def processed_upload(uploaded_id, test_user, worker, no_warn) -> Upload:

 def assert_processing(upload: Upload, mocksearch=None):
     assert upload.completed
     assert upload.current_task == 'cleanup'
-    assert upload.upload_hash is not None
+    assert upload.upload_id is not None