Commit 435efe1f authored by Markus Scheidgen

Replaced calc_hash with calc_id.

parent 14cb7dee
......@@ -95,7 +95,7 @@ def with_logger(func):
args = inspect.getcallargs(wrapper, *args, **kwargs)
logger_args = {
k: v for k, v in args.items()
if k in ['upload_id', 'calc_hash']}
if k in ['upload_id', 'calc_id']}
logger = utils.get_logger(__name__, **logger_args)
args.update(logger=logger)
try:
......
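The hunk above shows only the argument-binding half of the decorator. A minimal self-contained sketch of the whole pattern, assuming the wrapped function takes a `logger` keyword; the `try` body and exception handling after the visible fragment are assumptions:

import inspect
from functools import wraps

from nomad import utils

def with_logger(func):
    # Bind a logger that carries upload_id/calc_id to the wrapped function.
    @wraps(func)
    def wrapper(*args, **kwargs):
        bound = inspect.getcallargs(func, *args, **kwargs)  # bind against func directly
        logger_args = {
            k: v for k, v in bound.items()
            if k in ['upload_id', 'calc_id']}
        logger = utils.get_logger(__name__, **logger_args)
        bound.update(logger=logger)
        try:
            return func(**bound)
        except Exception as e:
            logger.error('unexpected exception in decorated function', exc_info=e)
            raise
    return wrapper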
......@@ -42,28 +42,28 @@ class ArchiveCalcLogResource(Resource):
@api.response(401, 'Not authorized to access the data.')
@api.response(200, 'Archive data sent', headers={'Content-Type': 'text/plain'})
@login_if_available
def get(self, upload_id, calc_hash):
def get(self, upload_id, calc_id):
"""
Get calculation processing log.
Calcs are referenced via *upload_id*, *calc_hash* pairs.
Calcs are referenced via *upload_id*, *calc_id* pairs.
"""
archive_id = '%s/%s' % (upload_id, calc_hash)
archive_id = '%s/%s' % (upload_id, calc_id)
upload_files = UploadFiles.get(
upload_id, is_authorized=create_authorization_predicate(upload_id, calc_hash))
upload_id, is_authorized=create_authorization_predicate(upload_id, calc_id))
if upload_files is None:
abort(404, message='Upload %s does not exist.' % upload_id)
try:
return send_file(
upload_files.archive_log_file(calc_hash, 'rt'),
upload_files.archive_log_file(calc_id, 'rt'),
mimetype='text/plain',
as_attachment=True,
attachment_filename='%s.log' % archive_id)
except Restricted:
abort(401, message='Not authorized to access %s/%s.' % (upload_id, calc_hash))
abort(401, message='Not authorized to access %s/%s.' % (upload_id, calc_id))
except KeyError:
abort(404, message='Calculation %s does not exist.' % archive_id)
......@@ -75,28 +75,28 @@ class ArchiveCalcResource(Resource):
@api.response(401, 'Not authorized to access the data.')
@api.response(200, 'Archive data sent')
@login_if_available
def get(self, upload_id, calc_hash):
def get(self, upload_id, calc_id):
"""
Get calculation data in archive form.
Calcs are referenced via *upload_id*, *calc_hash* pairs.
Calcs are referenced via *upload_id*, *calc_id* pairs.
"""
archive_id = '%s/%s' % (upload_id, calc_hash)
archive_id = '%s/%s' % (upload_id, calc_id)
upload_file = UploadFiles.get(
upload_id, is_authorized=create_authorization_predicate(upload_id, calc_hash))
upload_id, is_authorized=create_authorization_predicate(upload_id, calc_id))
if upload_file is None:
abort(404, message='Archive %s does not exist.' % upload_id)
try:
return send_file(
upload_file.archive_file(calc_hash, 'rt'),
upload_file.archive_file(calc_id, 'rt'),
mimetype='application/json',
as_attachment=True,
attachment_filename='%s.json' % archive_id)
except Restricted:
abort(401, message='Not authorized to access %s/%s.' % (upload_id, calc_hash))
abort(401, message='Not authorized to access %s/%s.' % (upload_id, calc_id))
except KeyError:
abort(404, message='Calculation %s does not exist.' % archive_id)
......
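A hedged usage sketch for the two archive endpoints above; the host and the `/archive/logs` and `/archive` route prefixes are assumptions, while the `<upload_id>/<calc_id>` addressing is taken from the diff:

import requests

base = 'http://localhost:8000/nomad/api'  # hypothetical deployment URL
upload_id, calc_id = 'some-upload-id', 'some-calc-id'

# Processing log as plain text.
log = requests.get('%s/archive/logs/%s/%s' % (base, upload_id, calc_id))
log.raise_for_status()
print(log.text)

# Archive data as JSON.
archive = requests.get('%s/archive/%s/%s' % (base, upload_id, calc_id))
print(archive.json())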
......@@ -149,7 +149,7 @@ class TokenResource(Resource):
'there is no token for you.')
def create_authorization_predicate(upload_id, calc_hash=None):
def create_authorization_predicate(upload_id, calc_id=None):
"""
Returns a predicate that determines if the logged in user has the authorization
to access the given upload and calculation.
......@@ -171,7 +171,7 @@ def create_authorization_predicate(upload_id, calc_hash=None):
# There are no db entries for the given resource
if files.UploadFiles.get(upload_id) is not None:
logger = utils.get_logger(__name__, upload_id=upload_id, calc_hash=calc_hash)
logger = utils.get_logger(__name__, upload_id=upload_id, calc_id=calc_id)
logger.error('Upload files without respective db entry')
raise KeyError
......
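The predicate pattern defers the authorization decision until file access actually happens. A minimal sketch under stated assumptions; `get_current_user` and `get_upload` are hypothetical stand-ins, and the real implementation also covers the staging and admin cases elided by the hunk:

def create_authorization_predicate(upload_id: str, calc_id: str = None):
    def predicate() -> bool:
        user = get_current_user()       # hypothetical: the logged-in user, or None
        if user is None:
            return False                # anonymous users only see public data
        upload = get_upload(upload_id)  # hypothetical db lookup
        return upload is not None and upload.user_id == user.user_id
    return predicate

UploadFiles.get stores the predicate and calls it lazily, e.g. right before a restricted archive file would be served.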
......@@ -45,10 +45,10 @@ pagination_request_parser.add_argument(
def calc_route(ns, prefix: str = ''):
""" A resource decorator for /<upload>/<calc> based routes. """
def decorator(func):
ns.route('%s/<string:upload_id>/<string:calc_hash>' % prefix)(
ns.route('%s/<string:upload_id>/<string:calc_id>' % prefix)(
api.doc(params={
'upload_id': 'The unique id for the requested upload.',
'calc_hash': 'The upload unique hash for the requested calculation.'
'calc_id': 'The unique id for the requested calculation.'
})(func)
)
return decorator
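Hypothetical usage of the decorator above; the '/logs' prefix is an assumption, the parameter names come from the route template in the diff:

# Registers the resource under '/logs/<string:upload_id>/<string:calc_id>'
# with both path parameters documented.
@calc_route(ns, '/logs')
class ArchiveCalcLogResource(Resource):
    def get(self, upload_id, calc_id):
        ...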
......@@ -40,7 +40,7 @@ raw_file_from_path_parser.add_argument(**raw_file_compress_argument)
@ns.route('/<string:upload_id>/<path:path>')
@api.doc(params={
'upload_id': 'The unique hash for the requested upload.',
'upload_id': 'The unique id for the requested upload.',
'path': 'The path to a file or directory.'
})
@api.header('Content-Type', 'application/gz')
......@@ -65,7 +65,7 @@ class RawFileFromPathResource(Resource):
upload_files = UploadFiles.get(
upload_id, create_authorization_predicate(upload_id))
if upload_files is None:
abort(404, message='The upload with hash %s does not exist.' % upload_id)
abort(404, message='The upload with id %s does not exist.' % upload_id)
if upload_filepath[-1:] == '*':
upload_filepath = upload_filepath[0:-1]
......@@ -108,7 +108,7 @@ raw_files_request_parser.add_argument(
@ns.route('/<string:upload_id>')
@api.doc(params={
'upload_id': 'The unique hash for the requested upload.'
'upload_id': 'The unique id for the requested upload.'
})
class RawFilesResource(Resource):
@api.doc('get_files')
......@@ -154,7 +154,7 @@ def respond_to_get_raw_files(upload_id, files, compress=False):
upload_files = UploadFiles.get(
upload_id, create_authorization_predicate(upload_id))
if upload_files is None:
abort(404, message='The upload with hash %s does not exist.' % upload_id)
abort(404, message='The upload with id %s does not exist.' % upload_id)
def generator():
""" Stream a zip file with all files using zipstream. """
......
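The `generator` helper above is cut off by the hunk. A minimal sketch of the streaming-zip pattern its docstring names, assuming the python-zipstream package and the enclosing `files` and `upload_files` variables of `respond_to_get_raw_files`:

import zipstream

def generator():
    # Stream a zip with all requested raw files, without buffering it in memory.
    zip_stream = zipstream.ZipFile(mode='w', compression=zipstream.ZIP_DEFLATED)
    for filename in files:
        zip_stream.write(
            upload_files.raw_file_object(filename).os_path, arcname=filename)
    for chunk in zip_stream:
        yield chunk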
......@@ -35,19 +35,19 @@ class RepoCalcResource(Resource):
@api.response(404, 'The upload or calculation does not exist')
@api.response(200, 'Metadata sent')
@api.doc('get_repo_calc')
def get(self, upload_id, calc_hash):
def get(self, upload_id, calc_id):
"""
Get calculation metadata in repository form.
Repository metadata only entails the quantities shown in the repository.
This is basically the elastic search index entry for the
requested calculations. Calcs are referenced via *upload_id*, *calc_hash*
requested calculations. Calcs are referenced via *upload_id*, *calc_id*
pairs.
"""
try:
return RepoCalc.get(id='%s/%s' % (upload_id, calc_hash)).json_dict, 200
return RepoCalc.get(id='%s/%s' % (upload_id, calc_id)).json_dict, 200
except NotFoundError:
abort(404, message='There is no calculation for %s/%s' % (upload_id, calc_hash))
abort(404, message='There is no calculation for %s/%s' % (upload_id, calc_id))
except Exception as e:
abort(500, message=str(e))
......
......@@ -130,7 +130,7 @@ class CalcProcReproduction:
(parsing, normalizing) with the locally installed parsers and normalizers.
The use-case is error/warning reproduction. Use ELK to identify errors, use
the upload and archive ids/hashes given by ELK, and reproduce and fix the error
the upload and archive ids given by ELK, and reproduce and fix the error
in your development environment.
This is a class of :class:`UploadFile`; the downloaded raw data will be treated as
......@@ -142,7 +142,7 @@ class CalcProcReproduction:
override: Set to true to override any existing local calculation data.
"""
def __init__(self, archive_id: str, override: bool = False) -> None:
self.calc_hash = utils.archive.calc_hash(archive_id)
self.calc_id = utils.archive.calc_id(archive_id)
self.upload_id = utils.archive.upload_id(archive_id)
self.mainfile = None
self.parser = None
......@@ -170,10 +170,10 @@ class CalcProcReproduction:
self.logger.info('Extracting calc data.')
self.upload_files.extract()
# find mainfile matching calc_hash
# find mainfile matching calc_id
self.mainfile = next(
filename for filename in self.upload_files.raw_file_manifest()
if self.upload_files.calc_hash(filename) == self.calc_hash)
if self.upload_files.calc_id(filename) == self.calc_id)
assert self.mainfile is not None, 'The mainfile could not be found.'
self.logger = self.logger.bind(mainfile=self.mainfile)
......
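The `utils.archive.calc_id` and `utils.archive.upload_id` helpers used above are not part of this diff. Given the `'%s/%s' % (upload_id, calc_id)` convention visible throughout, they plausibly reduce to simple splits; a hedged sketch:

def upload_id(archive_id: str) -> str:
    return archive_id.split('/')[0]

def calc_id(archive_id: str) -> str:
    return archive_id.split('/')[1]

assert calc_id('some-upload/some-calc') == 'some-calc'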
......@@ -28,7 +28,7 @@ from .base import Base, calc_citation_association, ownership, co_authorship, sha
class Calc(Base, datamodel.Calc): # type: ignore
__tablename__ = 'calculations'
calc_id = Column(Integer, primary_key=True, autoincrement=True)
coe_calc_id = Column('calc_id', Integer, primary_key=True, autoincrement=True)
origin_id = Column(Integer, ForeignKey('uploads.upload_id'))
upload = relationship('Upload')
checksum = Column(String)
......@@ -43,14 +43,14 @@ class Calc(Base, datamodel.Calc): # type: ignore
parents = relationship(
'Calc',
secondary=calc_dataset_containment,
primaryjoin=calc_dataset_containment.c.children_calc_id == calc_id,
secondaryjoin=calc_dataset_containment.c.parent_calc_id == calc_id,
primaryjoin=calc_dataset_containment.c.children_calc_id == coe_calc_id,
secondaryjoin=calc_dataset_containment.c.parent_calc_id == coe_calc_id,
backref='children')
@classmethod
def load_from(cls, obj):
repo_db = infrastructure.repository_db
return repo_db.query(Calc).filter_by(calc_id=int(obj.pid)).first()
return repo_db.query(Calc).filter_by(coe_calc_id=int(obj.pid)).first()
@property
def mainfile(self) -> str:
......@@ -58,14 +58,14 @@ class Calc(Base, datamodel.Calc): # type: ignore
@property
def pid(self):
return self.calc_id
return self.coe_calc_id
@property
def comment(self) -> str:
return self.user_meta_data.label
@property
def calc_hash(self) -> str:
def calc_id(self) -> str:
return self.checksum
@property
......@@ -92,19 +92,19 @@ class Calc(Base, datamodel.Calc): # type: ignore
@property
def all_datasets(self) -> List['DataSet']:
assert self.calc_id is not None
assert self.coe_calc_id is not None
repo_db = infrastructure.repository_db
query = repo_db.query(literal(self.calc_id).label('calc_id')).cte(recursive=True)
query = repo_db.query(literal(self.coe_calc_id).label('coe_calc_id')).cte(recursive=True)
right = aliased(query)
left = aliased(CalcSet)
query = query.union_all(repo_db.query(left.parent_calc_id).join(
right, right.c.calc_id == left.children_calc_id))
right, right.c.coe_calc_id == left.children_calc_id))
query = repo_db.query(query)
dataset_calc_ids = list(r[0] for r in query if not r[0] == self.calc_id)
dataset_calc_ids = list(r[0] for r in query if not r[0] == self.coe_calc_id)
if len(dataset_calc_ids) > 0:
return [
DataSet(dataset_calc)
for dataset_calc in repo_db.query(Calc).filter(Calc.calc_id.in_(dataset_calc_ids))]
for dataset_calc in repo_db.query(Calc).filter(Calc.coe_calc_id.in_(dataset_calc_ids))]
else:
return []
......@@ -132,7 +132,7 @@ class DataSet:
@property
def id(self):
return self._dataset_calc.calc_id
return self._dataset_calc.coe_calc_id
@property
def dois(self) -> List[Citation]:
......
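The recursive CTE in `all_datasets` above walks the dataset containment table from one calc up through all transitive parents. A toy Python equivalent of that traversal, with a hypothetical dict standing in for the `calc_dataset_containment` table:

def all_dataset_ids(coe_calc_id: int, containment: dict) -> set:
    # containment maps children_calc_id -> list of parent_calc_id rows
    seen: set = set()
    frontier = {coe_calc_id}
    while frontier:
        parents = {
            parent
            for child in frontier
            for parent in containment.get(child, [])}
        frontier = parents - seen
        seen |= parents
    return seen  # all transitive parent dataset ids

assert all_dataset_ids(10, {10: [2], 2: [1]}) == {1, 2}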
......@@ -100,17 +100,17 @@ class Upload(Base, datamodel.Upload): # type: ignore
@classmethod
def load_from(cls, obj):
return Upload.from_upload_id(obj.upload_id)
return Upload.from_upload_id(str(obj.upload_id))
@staticmethod
def from_upload_id(upload_id) -> 'Upload':
def from_upload_id(upload_id: str) -> 'Upload':
repo_db = infrastructure.repository_db
uploads = repo_db.query(Upload).filter_by(upload_name=upload_id)
assert uploads.count() <= 1, 'Upload hash/name must be unique'
assert uploads.count() <= 1, 'Upload id/name must be unique'
return uploads.first()
@property
def upload_id(self):
def upload_id(self) -> str:
return self.upload_name
@property
......@@ -163,7 +163,7 @@ class Upload(Base, datamodel.Upload): # type: ignore
if has_calcs:
# empty upload case
repo_db.commit()
result = coe_upload.upload_id
result = coe_upload.coe_upload_id
else:
repo_db.rollback()
except Exception as e:
......@@ -181,8 +181,8 @@ class Upload(Base, datamodel.Upload): # type: ignore
# table based properties
coe_calc = Calc(
calc_id=calc_meta_data.get('_pid', None),
checksum=calc_meta_data.get('_checksum', calc.calc_hash),
coe_calc_id=calc_meta_data.get('_pid', None),
checksum=calc_meta_data.get('_checksum', calc.calc_id),
upload=self)
repo_db.add(coe_calc)
......@@ -242,7 +242,7 @@ class Upload(Base, datamodel.Upload): # type: ignore
# datasets
for dataset_id in calc_meta_data.get('datasets', []):
dataset = CalcSet(parent_calc_id=dataset_id, children_calc_id=coe_calc.calc_id)
dataset = CalcSet(parent_calc_id=dataset_id, children_calc_id=coe_calc.coe_calc_id)
repo_db.add(dataset)
# references
......
......@@ -50,7 +50,7 @@ class Calc(Entity):
Attributes:
pid: The persistent id (pid) for the calculation
mainfile: The mainfile path relative to upload root
calc_hash: A unique hash/checksum that describes unique calculations
calc_id: A unique id/checksum that describes unique calculations
upload: The upload object that this calculation belongs to.
"""
@property
......@@ -62,7 +62,7 @@ class Calc(Entity):
raise NotImplementedError
@property
def calc_hash(self) -> str:
def calc_id(self) -> str:
raise NotImplementedError
@property
......
......@@ -46,6 +46,7 @@ import shutil
from zipfile import ZipFile, BadZipFile, is_zipfile
from bagit import make_bag
import hashlib
import base64
import io
from nomad import config, utils
......@@ -140,11 +141,11 @@ class Metadata(metaclass=ABCMeta):
pass
def insert(self, calc: dict) -> None:
""" Insert a calc, using hash as key. """
""" Insert a calc, using calc_id as key. """
raise NotImplementedError()
def update(self, calc_hash: str, updates: dict) -> dict:
""" Updating a calc, using hash as key and running dict update with the given data. """
def update(self, calc_id: str, updates: dict) -> dict:
""" Updating a calc, using calc_id as key and running dict update with the given data. """
raise NotImplementedError()
def get(self, calc_id: str) -> dict:
......@@ -181,16 +182,16 @@ class StagingMetadata(Metadata):
pass
def insert(self, calc: dict) -> None:
id = calc['hash']
id = calc['calc_id']
path = self._dir.join_file('%s.json' % id)
assert not path.exists()
with open(path.os_path, 'wt') as f:
ujson.dump(calc, f)
def update(self, calc_hash: str, updates: dict) -> dict:
metadata = self.get(calc_hash)
def update(self, calc_id: str, updates: dict) -> dict:
metadata = self.get(calc_id)
metadata.update(updates)
path = self._dir.join_file('%s.json' % calc_hash)
path = self._dir.join_file('%s.json' % calc_id)
with open(path.os_path, 'wt') as f:
ujson.dump(metadata, f)
return metadata
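A hedged round trip through the staging store above, assuming a `metadata_dir` directory object as constructor argument (the hunk does not show the real signature):

md = StagingMetadata(metadata_dir)  # one JSON file per calc, keyed by calc_id
md.insert(dict(calc_id='some-calc-id', mainfile='vasp_run/vasprun.xml'))
md.update('some-calc-id', dict(restricted=True))
assert md.get('some-calc-id')['restricted']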
......@@ -263,24 +264,24 @@ class PublicMetadata(Metadata):
def insert(self, calc: dict) -> None:
assert self.data is not None, "Metadata is not open."
id = calc['hash']
id = calc['calc_id']
assert id not in self.data
self.data[id] = calc
self._modified = True
def update(self, calc_hash: str, updates: dict) -> dict:
def update(self, calc_id: str, updates: dict) -> dict:
assert self.data is not None, "Metadata is not open."
if calc_hash not in self.data:
if calc_id not in self.data:
raise KeyError()
self.data[calc_hash].update(updates)
self.data[calc_id].update(updates)
self._modified = True
return self.data[calc_hash]
return self.data[calc_id]
def get(self, calc_hash: str) -> dict:
def get(self, calc_id: str) -> dict:
assert self.data is not None, "Metadata is not open."
return self.data[calc_hash]
return self.data[calc_id]
def __iter__(self) -> Iterator[dict]:
assert self.data is not None, "Metadata is not open."
......@@ -349,24 +350,24 @@ class UploadFiles(DirectoryObject, metaclass=ABCMeta):
"""
raise NotImplementedError()
def archive_file(self, calc_hash: str, *args, **kwargs) -> IO:
def archive_file(self, calc_id: str, *args, **kwargs) -> IO:
"""
Opens an archive file and returns a file-like object. Additional args and kwargs are
delegated to the respective `open` call.
Arguments:
calc_hash: The hash identifying the calculation.
calc_id: The id identifying the calculation.
Raises:
KeyError: If the calc does not exist.
Restricted: If the file is restricted and upload access evaluated to False.
"""
raise NotImplementedError()
def archive_log_file(self, calc_hash: str, *args, **kwargs) -> IO:
def archive_log_file(self, calc_id: str, *args, **kwargs) -> IO:
"""
Opens an archive log file and returns a file-like object. Additional args and kwargs are
delegated to the respective `open` call.
Arguments:
calc_hash: The hash identifying the calculation.
calc_id: The id identifying the calculation.
Raises:
KeyError: If the calc does not exist.
Restricted: If the file is restricted and upload access evaluated to False.
......@@ -409,21 +410,21 @@ class StagingUploadFiles(UploadFiles):
def raw_file_object(self, file_path: str) -> PathObject:
return self._raw_dir.join_file(file_path)
def archive_file(self, calc_hash: str, *args, **kwargs) -> IO:
def archive_file(self, calc_id: str, *args, **kwargs) -> IO:
if not self._is_authorized():
raise Restricted
return self._file(self.archive_file_object(calc_hash), *args, **kwargs)
return self._file(self.archive_file_object(calc_id), *args, **kwargs)
def archive_log_file(self, calc_hash: str, *args, **kwargs) -> IO:
def archive_log_file(self, calc_id: str, *args, **kwargs) -> IO:
if not self._is_authorized():
raise Restricted
return self._file(self.archive_log_file_object(calc_hash), *args, **kwargs)
return self._file(self.archive_log_file_object(calc_id), *args, **kwargs)
def archive_file_object(self, calc_hash: str) -> PathObject:
return self._archive_dir.join_file('%s.%s' % (calc_hash, self._archive_ext))
def archive_file_object(self, calc_id: str) -> PathObject:
return self._archive_dir.join_file('%s.%s' % (calc_id, self._archive_ext))
def archive_log_file_object(self, calc_hash: str) -> PathObject:
return self._archive_dir.join_file('%s.log' % calc_hash)
def archive_log_file_object(self, calc_id: str) -> PathObject:
return self._archive_dir.join_file('%s.log' % calc_id)
def add_rawfiles(self, path: str, move: bool = False, prefix: str = None) -> None:
"""
......@@ -519,10 +520,10 @@ class StagingUploadFiles(UploadFiles):
for calc in self.metadata:
archive_zip = archive_restricted_zip if calc.get('restricted', False) else archive_public_zip
archive_filename = '%s.%s' % (calc['hash'], self._archive_ext)
archive_filename = '%s.%s' % (calc['calc_id'], self._archive_ext)
archive_zip.write(self._archive_dir.join_file(archive_filename).os_path, archive_filename)
archive_log_filename = '%s.%s' % (calc['hash'], 'log')
archive_log_filename = '%s.%s' % (calc['calc_id'], 'log')
log_file = self._archive_dir.join_file(archive_log_filename)
if log_file.exists():
archive_zip.write(log_file.os_path, archive_log_filename)
......@@ -567,13 +568,34 @@ class StagingUploadFiles(UploadFiles):
os.path.join(calc_relative_dir, path) for path in os.listdir(calc_dir)
if os.path.isfile(os.path.join(calc_dir, path)) and (with_mainfile or path != mainfile))
def _websave_hash(self, hash: bytes, length: int = 0) -> str:
if length > 0:
return base64.b64encode(hash, altchars=b'-_')[0:length].decode('utf-8')
else:
return base64.b64encode(hash, altchars=b'-_')[0:-2].decode('utf-8')
def calc_id(self, mainfile: str) -> str:
"""
Calculates an id for the given calc.
Arguments:
mainfile: The mainfile path relative to the upload that identifies the calc in the folder structure.
Returns:
The calc id
Raises:
KeyError: If the mainfile does not exist.
"""
hash = hashlib.sha512()
hash.update(self.upload_id.encode('utf-8'))
hash.update(mainfile.encode('utf-8'))
return self._websave_hash(hash.digest(), utils.default_hash_len)
def calc_hash(self, mainfile: str) -> str:
"""
Calculates a hash for the given calc.
Calculates a hash for the given calc based on file contents and aux file contents.
Arguments:
mainfile: The mainfile path relative to the upload that identifies the calc in the folder structure.
Returns:
The calc hash
The calculated hash
Raises:
KeyError: If the mainfile does not exist.
"""
......@@ -583,7 +605,7 @@ class StagingUploadFiles(UploadFiles):
for data in iter(lambda: f.read(65536), b''):
hash.update(data)
return utils.websave_hash(hash.digest(), utils.default_hash_len)
return self._websave_hash(hash.digest(), utils.default_hash_len)
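A worked example of the two identifiers introduced above: `calc_id` digests the upload id plus the mainfile path (stable across re-uploads with the same layout), while `calc_hash` digests the actual file contents. A standalone sketch of the id derivation, assuming `utils.default_hash_len` is 28:

import base64
import hashlib

def websafe_hash(digest: bytes, length: int = 28) -> str:
    # URL-safe base64, truncated; mirrors _websave_hash above.
    return base64.b64encode(digest, altchars=b'-_')[0:length].decode('utf-8')

sha = hashlib.sha512()
sha.update('some-upload-id'.encode('utf-8'))
sha.update('vasp_run/vasprun.xml'.encode('utf-8'))
print(websafe_hash(sha.digest()))  # deterministic calc id for this pair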
class ArchiveBasedStagingUploadFiles(StagingUploadFiles):
......@@ -675,11 +697,11 @@ class PublicUploadFiles(UploadFiles):
except FileNotFoundError:
pass
def archive_file(self, calc_hash: str, *args, **kwargs) -> IO:
return self._file('archive', self._archive_ext, '%s.%s' % (calc_hash, self._archive_ext), *args, **kwargs)
def archive_file(self, calc_id: str, *args, **kwargs) -> IO:
return self._file('archive', self._archive_ext, '%s.%s' % (calc_id, self._archive_ext), *args, **kwargs)
def archive_log_file(self, calc_hash: str, *args, **kwargs) -> IO:
return self._file('archive', self._archive_ext, '%s.log' % calc_hash, *args, **kwargs)
def archive_log_file(self, calc_id: str, *args, **kwargs) -> IO:
return self._file('archive', self._archive_ext, '%s.log' % calc_id, *args, **kwargs)
def repack(self) -> None:
"""
......
......@@ -29,7 +29,7 @@ class RepositoryNormalizer(Normalizer):
b.openNonOverlappingSection('section_repository_info')
b.openNonOverlappingSection('section_repository_parserdata')
b.addValue('repository_checksum', utils.archive.calc_hash(b.get_value('archive_id', 0)))
b.addValue('repository_checksum', utils.archive.calc_id(b.get_value('archive_id', 0)))
b.addValue('repository_chemical_formula', b.get_value('chemical_composition_bulk_reduced', 0))
b.addValue('repository_parser_id', b.get_value('parser_name', 0))
atoms = b.get_value('atom_labels', 0)
......
......@@ -52,7 +52,7 @@ class Calc(Proc, datamodel.Calc):
while parsing, including ``program_name``, ``program_version``, etc.
Attributes:
archive_id: the hash based archive id of the calc
archive_id: the full archive id, based on upload_id and calc_id
parser: the name of the parser used to process this calc
upload_id: the id of the upload used to create this calculation
mainfile: the mainfile (including path in upload) that was used to create this calc
......@@ -85,8 +85,8 @@ class Calc(Proc, datamodel.Calc):
return self.upload_files.raw_file_object(self.mainfile)
@property
def calc_hash(self) -> str:
return utils.archive.calc_hash(self.archive_id)
def calc_id(self) -> str:
return utils.archive.calc_id(self.archive_id)
@property
def upload(self) -> 'Upload':
......@@ -103,7 +103,7 @@ class Calc(Proc, datamodel.Calc):
def get_logger(self, **kwargs):
logger = super().get_logger()
logger = logger.bind(
upload_id=self.upload_id, mainfile=self.mainfile, calc_hash=self.calc_hash,
upload_id=self.upload_id, mainfile=self.mainfile, calc_id=self.calc_id,
archive_id=self.archive_id, **kwargs)
return logger
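The `get_logger` override above binds the ids to every subsequent log event. A minimal illustration of the same pattern, assuming structlog (which `utils.get_logger` appears to wrap):

import structlog

logger = structlog.get_logger(__name__).bind(
    upload_id='some-upload-id', calc_id='some-calc-id')
logger.info('processing started')  # the event carries both ids as structured fields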
......@@ -116,7 +116,7 @@ class Calc(Proc, datamodel.Calc):
logger = self.get_logger(**kwargs)
if self._calc_proc_logwriter is None:
self._calc_proc_logwriter_ctx = self.upload_files.archive_log_file(self.calc_hash, 'wt')
self._calc_proc_logwriter_ctx = self.upload_files.archive_log_file(self.calc_id, 'wt')
self._calc_proc_logwriter = self._calc_proc_logwriter_ctx.__enter__() # pylint: disable=E1101
def save_to_calc_log(logger, method_name, event_dict):
......@@ -235,7 +235,7 @@ class Calc(Proc, datamodel.Calc):