diff --git a/nomad/api/app.py b/nomad/api/app.py index 6191983079b4b1eb13d7c38747b63173b839a72f..43cc6866870e89f8769c2d11de79df1c25549ea5 100644 --- a/nomad/api/app.py +++ b/nomad/api/app.py @@ -95,7 +95,7 @@ def with_logger(func): args = inspect.getcallargs(wrapper, *args, **kwargs) logger_args = { k: v for k, v in args.items() - if k in ['upload_id', 'calc_hash']} + if k in ['upload_id', 'calc_id']} logger = utils.get_logger(__name__, **logger_args) args.update(logger=logger) try: diff --git a/nomad/api/archive.py b/nomad/api/archive.py index 02ec6bd35543cab90b8e0e58587d6548c676bfa4..18ff7fbaa6f88400efbf801748a905b9ac31c9bf 100644 --- a/nomad/api/archive.py +++ b/nomad/api/archive.py @@ -42,28 +42,28 @@ class ArchiveCalcLogResource(Resource): @api.response(401, 'Not authorized to access the data.') @api.response(200, 'Archive data send', headers={'Content-Type': 'application/plain'}) @login_if_available - def get(self, upload_id, calc_hash): + def get(self, upload_id, calc_id): """ Get calculation processing log. - Calcs are references via *upload_id*, *calc_hash* pairs. + Calcs are references via *upload_id*, *calc_id* pairs. """ - archive_id = '%s/%s' % (upload_id, calc_hash) + archive_id = '%s/%s' % (upload_id, calc_id) upload_files = UploadFiles.get( - upload_id, is_authorized=create_authorization_predicate(upload_id, calc_hash)) + upload_id, is_authorized=create_authorization_predicate(upload_id, calc_id)) if upload_files is None: abort(404, message='Upload %s does not exist.' % upload_id) try: return send_file( - upload_files.archive_log_file(calc_hash, 'rt'), + upload_files.archive_log_file(calc_id, 'rt'), mimetype='text/plain', as_attachment=True, attachment_filename='%s.log' % archive_id) except Restricted: - abort(401, message='Not authorized to access %s/%s.' % (upload_id, calc_hash)) + abort(401, message='Not authorized to access %s/%s.' % (upload_id, calc_id)) except KeyError: abort(404, message='Calculation %s does not exist.' % archive_id) @@ -75,28 +75,28 @@ class ArchiveCalcResource(Resource): @api.response(401, 'Not authorized to access the data.') @api.response(200, 'Archive data send') @login_if_available - def get(self, upload_id, calc_hash): + def get(self, upload_id, calc_id): """ Get calculation data in archive form. - Calcs are references via *upload_id*, *calc_hash* pairs. + Calcs are references via *upload_id*, *calc_id* pairs. """ - archive_id = '%s/%s' % (upload_id, calc_hash) + archive_id = '%s/%s' % (upload_id, calc_id) upload_file = UploadFiles.get( - upload_id, is_authorized=create_authorization_predicate(upload_id, calc_hash)) + upload_id, is_authorized=create_authorization_predicate(upload_id, calc_id)) if upload_file is None: abort(404, message='Archive %s does not exist.' % upload_id) try: return send_file( - upload_file.archive_file(calc_hash, 'rt'), + upload_file.archive_file(calc_id, 'rt'), mimetype='application/json', as_attachment=True, attachment_filename='%s.json' % archive_id) except Restricted: - abort(401, message='Not authorized to access %s/%s.' % (upload_id, calc_hash)) + abort(401, message='Not authorized to access %s/%s.' % (upload_id, calc_id)) except KeyError: abort(404, message='Calculation %s does not exist.' % archive_id) diff --git a/nomad/api/auth.py b/nomad/api/auth.py index 9cd7a4f4556dfbf6218943e6d05f012c52b7308a..51ca4bfe6047002c0c9be6cd22c08509e4a23f47 100644 --- a/nomad/api/auth.py +++ b/nomad/api/auth.py @@ -149,7 +149,7 @@ class TokenResource(Resource): 'there is no token for you.') -def create_authorization_predicate(upload_id, calc_hash=None): +def create_authorization_predicate(upload_id, calc_id=None): """ Returns a predicate that determines if the logged in user has the authorization to access the given upload and calculation. @@ -171,7 +171,7 @@ def create_authorization_predicate(upload_id, calc_hash=None): # There are no db entries for the given resource if files.UploadFiles.get(upload_id) is not None: - logger = utils.get_logger(__name__, upload_id=upload_id, calc_hash=calc_hash) + logger = utils.get_logger(__name__, upload_id=upload_id, calc_id=calc_id) logger.error('Upload files without respective db entry') raise KeyError diff --git a/nomad/api/common.py b/nomad/api/common.py index f836ec88fcb2164e60339ca3ce1f54096aacced1..9dbd8ad565c1e82f97bd888aabff43b37b5b7ece 100644 --- a/nomad/api/common.py +++ b/nomad/api/common.py @@ -45,10 +45,10 @@ pagination_request_parser.add_argument( def calc_route(ns, prefix: str = ''): """ A resource decorator for /<upload>/<calc> based routes. """ def decorator(func): - ns.route('%s/<string:upload_id>/<string:calc_hash>' % prefix)( + ns.route('%s/<string:upload_id>/<string:calc_id>' % prefix)( api.doc(params={ 'upload_id': 'The unique id for the requested upload.', - 'calc_hash': 'The upload unique hash for the requested calculation.' + 'calc_id': 'The unique id for the requested calculation.' })(func) ) return decorator diff --git a/nomad/api/raw.py b/nomad/api/raw.py index 83f2a8560cb83d7b301ece62abb3eb0ccc4baf2f..8c1c483f1216a79a7ef22a6c9219c7be6f49968f 100644 --- a/nomad/api/raw.py +++ b/nomad/api/raw.py @@ -40,7 +40,7 @@ raw_file_from_path_parser.add_argument(**raw_file_compress_argument) @ns.route('/<string:upload_id>/<path:path>') @api.doc(params={ - 'upload_id': 'The unique hash for the requested upload.', + 'upload_id': 'The unique id for the requested upload.', 'path': 'The path to a file or directory.' }) @api.header('Content-Type', 'application/gz') @@ -65,7 +65,7 @@ class RawFileFromPathResource(Resource): upload_files = UploadFiles.get( upload_id, create_authorization_predicate(upload_id)) if upload_files is None: - abort(404, message='The upload with hash %s does not exist.' % upload_id) + abort(404, message='The upload with id %s does not exist.' % upload_id) if upload_filepath[-1:] == '*': upload_filepath = upload_filepath[0:-1] @@ -108,7 +108,7 @@ raw_files_request_parser.add_argument( @ns.route('/<string:upload_id>') @api.doc(params={ - 'upload_id': 'The unique hash for the requested upload.' + 'upload_id': 'The unique id for the requested upload.' }) class RawFilesResource(Resource): @api.doc('get_files') @@ -154,7 +154,7 @@ def respond_to_get_raw_files(upload_id, files, compress=False): upload_files = UploadFiles.get( upload_id, create_authorization_predicate(upload_id)) if upload_files is None: - abort(404, message='The upload with hash %s does not exist.' % upload_id) + abort(404, message='The upload with id %s does not exist.' % upload_id) def generator(): """ Stream a zip file with all files using zipstream. """ diff --git a/nomad/api/repo.py b/nomad/api/repo.py index ad37408f9e1625f6e599ad8034fe8fb35e59dcda..c3210585086179a17ea8b437c83001ef51b9d8a0 100644 --- a/nomad/api/repo.py +++ b/nomad/api/repo.py @@ -35,19 +35,19 @@ class RepoCalcResource(Resource): @api.response(404, 'The upload or calculation does not exist') @api.response(200, 'Metadata send') @api.doc('get_repo_calc') - def get(self, upload_id, calc_hash): + def get(self, upload_id, calc_id): """ Get calculation metadata in repository form. Repository metadata only entails the quanties shown in the repository. This is basically the elastic search index entry for the - requested calculations. Calcs are references via *upload_id*, *calc_hash* + requested calculations. Calcs are references via *upload_id*, *calc_id* pairs. """ try: - return RepoCalc.get(id='%s/%s' % (upload_id, calc_hash)).json_dict, 200 + return RepoCalc.get(id='%s/%s' % (upload_id, calc_id)).json_dict, 200 except NotFoundError: - abort(404, message='There is no calculation for %s/%s' % (upload_id, calc_hash)) + abort(404, message='There is no calculation for %s/%s' % (upload_id, calc_id)) except Exception as e: abort(500, message=str(e)) diff --git a/nomad/client.py b/nomad/client.py index d3dbd285ac29f8d1f1ffa95169d27ce1e7098a82..dd1c64130e7713b542ff8e5d2866a7b7a74a8a1a 100644 --- a/nomad/client.py +++ b/nomad/client.py @@ -130,7 +130,7 @@ class CalcProcReproduction: (parsing, normalizing) with the locally installed parsers and normalizers. The use-case is error/warning reproduction. Use ELK to identify errors, use - the upload, archive ids/hashes to given by ELK, and reproduce and fix the error + the upload, archive ids to given by ELK, and reproduce and fix the error in your development environment. This is a class of :class:`UploadFile` the downloaded raw data will be treated as @@ -142,7 +142,7 @@ class CalcProcReproduction: override: Set to true to override any existing local calculation data. """ def __init__(self, archive_id: str, override: bool = False) -> None: - self.calc_hash = utils.archive.calc_hash(archive_id) + self.calc_id = utils.archive.calc_id(archive_id) self.upload_id = utils.archive.upload_id(archive_id) self.mainfile = None self.parser = None @@ -170,10 +170,10 @@ class CalcProcReproduction: self.logger.info('Extracting calc data.') self.upload_files.extract() - # find mainfile matching calc_hash + # find mainfile matching calc_id self.mainfile = next( filename for filename in self.upload_files.raw_file_manifest() - if self.upload_files.calc_hash(filename) == self.calc_hash) + if self.upload_files.calc_id(filename) == self.calc_id) assert self.mainfile is not None, 'The mainfile could not be found.' self.logger = self.logger.bind(mainfile=self.mainfile) diff --git a/nomad/coe_repo/calc.py b/nomad/coe_repo/calc.py index e523b5b3ab4b377542b902c35f4210a0b8769969..b0399f3a899e14ec2282bf398b630f8174d48efe 100644 --- a/nomad/coe_repo/calc.py +++ b/nomad/coe_repo/calc.py @@ -28,7 +28,7 @@ from .base import Base, calc_citation_association, ownership, co_authorship, sha class Calc(Base, datamodel.Calc): # type: ignore __tablename__ = 'calculations' - calc_id = Column(Integer, primary_key=True, autoincrement=True) + coe_calc_id = Column('calc_id', Integer, primary_key=True, autoincrement=True) origin_id = Column(Integer, ForeignKey('uploads.upload_id')) upload = relationship('Upload') checksum = Column(String) @@ -43,14 +43,14 @@ class Calc(Base, datamodel.Calc): # type: ignore parents = relationship( 'Calc', secondary=calc_dataset_containment, - primaryjoin=calc_dataset_containment.c.children_calc_id == calc_id, - secondaryjoin=calc_dataset_containment.c.parent_calc_id == calc_id, + primaryjoin=calc_dataset_containment.c.children_calc_id == coe_calc_id, + secondaryjoin=calc_dataset_containment.c.parent_calc_id == coe_calc_id, backref='children') @classmethod def load_from(cls, obj): repo_db = infrastructure.repository_db - return repo_db.query(Calc).filter_by(calc_id=int(obj.pid)).first() + return repo_db.query(Calc).filter_by(coe_calc_id=int(obj.pid)).first() @property def mainfile(self) -> str: @@ -58,14 +58,14 @@ class Calc(Base, datamodel.Calc): # type: ignore @property def pid(self): - return self.calc_id + return self.coe_calc_id @property def comment(self) -> str: return self.user_meta_data.label @property - def calc_hash(self) -> str: + def calc_id(self) -> str: return self.checksum @property @@ -92,19 +92,19 @@ class Calc(Base, datamodel.Calc): # type: ignore @property def all_datasets(self) -> List['DataSet']: - assert self.calc_id is not None + assert self.coe_calc_id is not None repo_db = infrastructure.repository_db - query = repo_db.query(literal(self.calc_id).label('calc_id')).cte(recursive=True) + query = repo_db.query(literal(self.coe_calc_id).label('coe_calc_id')).cte(recursive=True) right = aliased(query) left = aliased(CalcSet) query = query.union_all(repo_db.query(left.parent_calc_id).join( - right, right.c.calc_id == left.children_calc_id)) + right, right.c.coe_calc_id == left.children_calc_id)) query = repo_db.query(query) - dataset_calc_ids = list(r[0] for r in query if not r[0] == self.calc_id) + dataset_calc_ids = list(r[0] for r in query if not r[0] == self.coe_calc_id) if len(dataset_calc_ids) > 0: return [ DataSet(dataset_calc) - for dataset_calc in repo_db.query(Calc).filter(Calc.calc_id.in_(dataset_calc_ids))] + for dataset_calc in repo_db.query(Calc).filter(Calc.coe_calc_id.in_(dataset_calc_ids))] else: return [] @@ -132,7 +132,7 @@ class DataSet: @property def id(self): - return self._dataset_calc.calc_id + return self._dataset_calc.coe_calc_id @property def dois(self) -> List[Citation]: diff --git a/nomad/coe_repo/upload.py b/nomad/coe_repo/upload.py index 974d14ed0fecf0426a912f57d8e1f2db307331bf..85ce22a475a88205681e516380095bea65f3f90f 100644 --- a/nomad/coe_repo/upload.py +++ b/nomad/coe_repo/upload.py @@ -100,17 +100,17 @@ class Upload(Base, datamodel.Upload): # type: ignore @classmethod def load_from(cls, obj): - return Upload.from_upload_id(obj.upload_id) + return Upload.from_upload_id(str(obj.upload_id)) @staticmethod - def from_upload_id(upload_id) -> 'Upload': + def from_upload_id(upload_id: str) -> 'Upload': repo_db = infrastructure.repository_db uploads = repo_db.query(Upload).filter_by(upload_name=upload_id) - assert uploads.count() <= 1, 'Upload hash/name must be unique' + assert uploads.count() <= 1, 'Upload id/name must be unique' return uploads.first() @property - def upload_id(self): + def upload_id(self) -> str: return self.upload_name @property @@ -163,7 +163,7 @@ class Upload(Base, datamodel.Upload): # type: ignore if has_calcs: # empty upload case repo_db.commit() - result = coe_upload.upload_id + result = coe_upload.coe_upload_id else: repo_db.rollback() except Exception as e: @@ -181,8 +181,8 @@ class Upload(Base, datamodel.Upload): # type: ignore # table based properties coe_calc = Calc( - calc_id=calc_meta_data.get('_pid', None), - checksum=calc_meta_data.get('_checksum', calc.calc_hash), + coe_calc_id=calc_meta_data.get('_pid', None), + checksum=calc_meta_data.get('_checksum', calc.calc_id), upload=self) repo_db.add(coe_calc) @@ -242,7 +242,7 @@ class Upload(Base, datamodel.Upload): # type: ignore # datasets for dataset_id in calc_meta_data.get('datasets', []): - dataset = CalcSet(parent_calc_id=dataset_id, children_calc_id=coe_calc.calc_id) + dataset = CalcSet(parent_calc_id=dataset_id, children_calc_id=coe_calc.coe_calc_id) repo_db.add(dataset) # references diff --git a/nomad/datamodel.py b/nomad/datamodel.py index 696ec7420fbde757fa501a13b75bc39a7fa41c06..a9b3f9745a1cde3cac17bdf91a08f4a7710d87fb 100644 --- a/nomad/datamodel.py +++ b/nomad/datamodel.py @@ -50,7 +50,7 @@ class Calc(Entity): Attributes: pid: The persistent id (pid) for the calculation mainfile: The mainfile path relative to upload root - calc_hash: A unique hash/checksum that describes unique calculations + calc_id: A unique id/checksum that describes unique calculations upload: The upload object that this calculation belongs to. """ @property @@ -62,7 +62,7 @@ class Calc(Entity): raise NotImplementedError @property - def calc_hash(self) -> str: + def calc_id(self) -> str: raise NotImplementedError @property diff --git a/nomad/files.py b/nomad/files.py index af811c5b7b7764ab7de04a5dd24d51ea8fe51974..a26f8ed4b6d0433d31abe80dc802cd4e93da8dcc 100644 --- a/nomad/files.py +++ b/nomad/files.py @@ -46,6 +46,7 @@ import shutil from zipfile import ZipFile, BadZipFile, is_zipfile from bagit import make_bag import hashlib +import base64 import io from nomad import config, utils @@ -140,11 +141,11 @@ class Metadata(metaclass=ABCMeta): pass def insert(self, calc: dict) -> None: - """ Insert a calc, using hash as key. """ + """ Insert a calc, using calc_id as key. """ raise NotImplementedError() - def update(self, calc_hash: str, updates: dict) -> dict: - """ Updating a calc, using hash as key and running dict update with the given data. """ + def update(self, calc_id: str, updates: dict) -> dict: + """ Updating a calc, using calc_id as key and running dict update with the given data. """ raise NotImplementedError() def get(self, calc_id: str) -> dict: @@ -181,16 +182,16 @@ class StagingMetadata(Metadata): pass def insert(self, calc: dict) -> None: - id = calc['hash'] + id = calc['calc_id'] path = self._dir.join_file('%s.json' % id) assert not path.exists() with open(path.os_path, 'wt') as f: ujson.dump(calc, f) - def update(self, calc_hash: str, updates: dict) -> dict: - metadata = self.get(calc_hash) + def update(self, calc_id: str, updates: dict) -> dict: + metadata = self.get(calc_id) metadata.update(updates) - path = self._dir.join_file('%s.json' % calc_hash) + path = self._dir.join_file('%s.json' % calc_id) with open(path.os_path, 'wt') as f: ujson.dump(metadata, f) return metadata @@ -263,24 +264,24 @@ class PublicMetadata(Metadata): def insert(self, calc: dict) -> None: assert self.data is not None, "Metadata is not open." - id = calc['hash'] + id = calc['calc_id'] assert id not in self.data self.data[id] = calc self._modified = True - def update(self, calc_hash: str, updates: dict) -> dict: + def update(self, calc_id: str, updates: dict) -> dict: assert self.data is not None, "Metadata is not open." - if calc_hash not in self.data: + if calc_id not in self.data: raise KeyError() - self.data[calc_hash].update(updates) + self.data[calc_id].update(updates) self._modified = True - return self.data[calc_hash] + return self.data[calc_id] - def get(self, calc_hash: str) -> dict: + def get(self, calc_id: str) -> dict: assert self.data is not None, "Metadata is not open." - return self.data[calc_hash] + return self.data[calc_id] def __iter__(self) -> Iterator[dict]: assert self.data is not None, "Metadata is not open." @@ -349,24 +350,24 @@ class UploadFiles(DirectoryObject, metaclass=ABCMeta): """ raise NotImplementedError() - def archive_file(self, calc_hash: str, *args, **kwargs) -> IO: + def archive_file(self, calc_id: str, *args, **kwargs) -> IO: """ Opens a archive file and returns a file-like objects. Additional args, kwargs are delegated to the respective `open` call. Arguments: - calc_hash: The hash identifying the calculation. + calc_id: The id identifying the calculation. Raises: KeyError: If the calc does not exist. Restricted: If the file is restricted and upload access evaluated to False. """ raise NotImplementedError() - def archive_log_file(self, calc_hash: str, *args, **kwargs) -> IO: + def archive_log_file(self, calc_id: str, *args, **kwargs) -> IO: """ Opens a archive log file and returns a file-like objects. Additional args, kwargs are delegated to the respective `open` call. Arguments: - calc_hash: The hash identifying the calculation. + calc_id: The id identifying the calculation. Raises: KeyError: If the calc does not exist. Restricted: If the file is restricted and upload access evaluated to False. @@ -409,21 +410,21 @@ class StagingUploadFiles(UploadFiles): def raw_file_object(self, file_path: str) -> PathObject: return self._raw_dir.join_file(file_path) - def archive_file(self, calc_hash: str, *args, **kwargs) -> IO: + def archive_file(self, calc_id: str, *args, **kwargs) -> IO: if not self._is_authorized(): raise Restricted - return self._file(self.archive_file_object(calc_hash), *args, **kwargs) + return self._file(self.archive_file_object(calc_id), *args, **kwargs) - def archive_log_file(self, calc_hash: str, *args, **kwargs) -> IO: + def archive_log_file(self, calc_id: str, *args, **kwargs) -> IO: if not self._is_authorized(): raise Restricted - return self._file(self.archive_log_file_object(calc_hash), *args, **kwargs) + return self._file(self.archive_log_file_object(calc_id), *args, **kwargs) - def archive_file_object(self, calc_hash: str) -> PathObject: - return self._archive_dir.join_file('%s.%s' % (calc_hash, self._archive_ext)) + def archive_file_object(self, calc_id: str) -> PathObject: + return self._archive_dir.join_file('%s.%s' % (calc_id, self._archive_ext)) - def archive_log_file_object(self, calc_hash: str) -> PathObject: - return self._archive_dir.join_file('%s.log' % calc_hash) + def archive_log_file_object(self, calc_id: str) -> PathObject: + return self._archive_dir.join_file('%s.log' % calc_id) def add_rawfiles(self, path: str, move: bool = False, prefix: str = None) -> None: """ @@ -519,10 +520,10 @@ class StagingUploadFiles(UploadFiles): for calc in self.metadata: archive_zip = archive_restricted_zip if calc.get('restricted', False) else archive_public_zip - archive_filename = '%s.%s' % (calc['hash'], self._archive_ext) + archive_filename = '%s.%s' % (calc['calc_id'], self._archive_ext) archive_zip.write(self._archive_dir.join_file(archive_filename).os_path, archive_filename) - archive_log_filename = '%s.%s' % (calc['hash'], 'log') + archive_log_filename = '%s.%s' % (calc['calc_id'], 'log') log_file = self._archive_dir.join_file(archive_log_filename) if log_file.exists(): archive_zip.write(log_file.os_path, archive_log_filename) @@ -567,13 +568,34 @@ class StagingUploadFiles(UploadFiles): os.path.join(calc_relative_dir, path) for path in os.listdir(calc_dir) if os.path.isfile(os.path.join(calc_dir, path)) and (with_mainfile or path != mainfile)) + def _websave_hash(self, hash: bytes, length: int = 0) -> str: + if length > 0: + return base64.b64encode(hash, altchars=b'-_')[0:28].decode('utf-8') + else: + return base64.b64encode(hash, altchars=b'-_')[0:-2].decode('utf-8') + + def calc_id(self, mainfile: str) -> str: + """ + Calculates a id for the given calc. + Arguments: + mainfile: The mainfile path relative to the upload that identifies the calc in the folder structure. + Returns: + The calc id + Raises: + KeyError: If the mainfile does not exist. + """ + hash = hashlib.sha512() + hash.update(self.upload_id.encode('utf-8')) + hash.update(mainfile.encode('utf-8')) + return self._websave_hash(hash.digest(), utils.default_hash_len) + def calc_hash(self, mainfile: str) -> str: """ - Calculates a hash for the given calc. + Calculates a hash for the given calc based on file contents and aux file contents. Arguments: mainfile: The mainfile path relative to the upload that identifies the calc in the folder structure. Returns: - The calc hash + The calculated hash Raises: KeyError: If the mainfile does not exist. """ @@ -583,7 +605,7 @@ class StagingUploadFiles(UploadFiles): for data in iter(lambda: f.read(65536), b''): hash.update(data) - return utils.websave_hash(hash.digest(), utils.default_hash_len) + return self._websave_hash(hash.digest(), utils.default_hash_len) class ArchiveBasedStagingUploadFiles(StagingUploadFiles): @@ -675,11 +697,11 @@ class PublicUploadFiles(UploadFiles): except FileNotFoundError: pass - def archive_file(self, calc_hash: str, *args, **kwargs) -> IO: - return self._file('archive', self._archive_ext, '%s.%s' % (calc_hash, self._archive_ext), *args, **kwargs) + def archive_file(self, calc_id: str, *args, **kwargs) -> IO: + return self._file('archive', self._archive_ext, '%s.%s' % (calc_id, self._archive_ext), *args, **kwargs) - def archive_log_file(self, calc_hash: str, *args, **kwargs) -> IO: - return self._file('archive', self._archive_ext, '%s.log' % calc_hash, *args, **kwargs) + def archive_log_file(self, calc_id: str, *args, **kwargs) -> IO: + return self._file('archive', self._archive_ext, '%s.log' % calc_id, *args, **kwargs) def repack(self) -> None: """ diff --git a/nomad/normalizing/repository.py b/nomad/normalizing/repository.py index cc15b69fa2902af222b831b9c7c336813e8a037f..ada41862b3349be6897971a9732503a794d93054 100644 --- a/nomad/normalizing/repository.py +++ b/nomad/normalizing/repository.py @@ -29,7 +29,7 @@ class RepositoryNormalizer(Normalizer): b.openNonOverlappingSection('section_repository_info') b.openNonOverlappingSection('section_repository_parserdata') - b.addValue('repository_checksum', utils.archive.calc_hash(b.get_value('archive_id', 0))) + b.addValue('repository_checksum', utils.archive.calc_id(b.get_value('archive_id', 0))) b.addValue('repository_chemical_formula', b.get_value('chemical_composition_bulk_reduced', 0)) b.addValue('repository_parser_id', b.get_value('parser_name', 0)) atoms = b.get_value('atom_labels', 0) diff --git a/nomad/processing/data.py b/nomad/processing/data.py index 4543d79b48a501c8de3a6079edfc71b134cd5c73..a5d8a5ec931c6f4b969a8e81184f74a9f5af524a 100644 --- a/nomad/processing/data.py +++ b/nomad/processing/data.py @@ -52,7 +52,7 @@ class Calc(Proc, datamodel.Calc): while parsing, including ``program_name``, ``program_version``, etc. Attributes: - archive_id: the hash based archive id of the calc + archive_id: the full id upload_id and calc_id based id parser: the name of the parser used to process this calc upload_id: the id of the upload used to create this calculation mainfile: the mainfile (including path in upload) that was used to create this calc @@ -85,8 +85,8 @@ class Calc(Proc, datamodel.Calc): return self.upload_files.raw_file_object(self.mainfile) @property - def calc_hash(self) -> str: - return utils.archive.calc_hash(self.archive_id) + def calc_id(self) -> str: + return utils.archive.calc_id(self.archive_id) @property def upload(self) -> 'Upload': @@ -103,7 +103,7 @@ class Calc(Proc, datamodel.Calc): def get_logger(self, **kwargs): logger = super().get_logger() logger = logger.bind( - upload_id=self.upload_id, mainfile=self.mainfile, calc_hash=self.calc_hash, + upload_id=self.upload_id, mainfile=self.mainfile, calc_id=self.calc_id, archive_id=self.archive_id, **kwargs) return logger @@ -116,7 +116,7 @@ class Calc(Proc, datamodel.Calc): logger = self.get_logger(**kwargs) if self._calc_proc_logwriter is None: - self._calc_proc_logwriter_ctx = self.upload_files.archive_log_file(self.calc_hash, 'wt') + self._calc_proc_logwriter_ctx = self.upload_files.archive_log_file(self.calc_id, 'wt') self._calc_proc_logwriter = self._calc_proc_logwriter_ctx.__enter__() # pylint: disable=E1101 def save_to_calc_log(logger, method_name, event_dict): @@ -235,7 +235,7 @@ class Calc(Proc, datamodel.Calc): def archiving(self): logger = self.get_logger() - _, calc_hash = self.archive_id.split('/') + _, calc_id = self.archive_id.split('/') additional = dict( mainfile=self.mainfile, upload_time=self.upload.upload_time, @@ -249,7 +249,7 @@ class Calc(Proc, datamodel.Calc): repo_calc = RepoCalc.create_from_backend( self._parser_backend, additional=additional, - calc_hash=calc_hash, + calc_id=calc_id, upload_id=self.upload_id) repo_calc.persist() @@ -258,10 +258,10 @@ class Calc(Proc, datamodel.Calc): input_size=self.mainfile_file.size) as log_data: # persist the archive - with self.upload_files.archive_file(self.calc_hash, 'wt') as out: + with self.upload_files.archive_file(self.calc_id, 'wt') as out: self._parser_backend.write_json(out, pretty=True) - log_data.update(archive_size=self.upload_files.archive_file_object(self.calc_hash).size) + log_data.update(archive_size=self.upload_files.archive_file_object(self.calc_id).size) # close loghandler if self._calc_proc_logwriter is not None: @@ -271,7 +271,7 @@ class Calc(Proc, datamodel.Calc): self._calc_proc_logwriter_ctx.__exit__(None, None, None) # pylint: disable=E1101 self._calc_proc_logwriter = None - log_data.update(log_size=self.upload_files.archive_log_file_object(self.calc_hash).size) + log_data.update(log_size=self.upload_files.archive_log_file_object(self.calc_id).size) class Upload(Chord, datamodel.Upload): @@ -448,7 +448,7 @@ class Upload(Chord, datamodel.Upload): total_calcs = 0 for filename, parser in self.match_mainfiles(): calc = Calc.create( - archive_id='%s/%s' % (self.upload_id, utils.hash(filename)), + archive_id='%s/%s' % (self.upload_id, self.upload_files.calc_id(filename)), mainfile=filename, parser=parser.name, upload_id=self.upload_id) diff --git a/nomad/repo.py b/nomad/repo.py index 6cc489a7ab99e3da3c4e5c91570dd68a86b86646..8152ca169f0a2e076cc74dfe6dfb239446e9b5b0 100644 --- a/nomad/repo.py +++ b/nomad/repo.py @@ -88,7 +88,7 @@ class RepoCalc(ElasticDocument, datamodel.Entity): class Index: name = config.elastic.index_name - calc_hash = Keyword() + calc_id = Keyword() mainfile = Keyword() upload_id = Keyword() @@ -119,32 +119,32 @@ class RepoCalc(ElasticDocument, datamodel.Entity): @property def archive_id(self) -> str: """ The unique id for this calculation. """ - return '%s/%s' % (self.upload_id, self.calc_hash) + return '%s/%s' % (self.upload_id, self.calc_id) @classmethod def create_from_backend( cls, backend: LocalBackend, additional: Dict[str, Any], - upload_id: str, calc_hash: str) -> 'RepoCalc': + upload_id: str, calc_id: str) -> 'RepoCalc': """ Create a new calculation instance in elastic search. The data from the given backend will be used. Additional meta-data can be given as *kwargs*. - ``upload_id`` and ``calc_hash`` are mandatory. + ``upload_id`` and ``calc_id`` are mandatory. Arguments: backend: The parsing/normalizing backend that contains the calculation data. additional: Additional arguments not stored in the backend. E.g. ``user_id``, ``staging``, ``restricted`` upload_id: The upload id of the originating upload. - calc_hash: The upload unique hash for this calculation. + calc_id: The upload unique id for this calculation. Returns: The created instance. """ - assert calc_hash is not None and upload_id is not None - additional.update(dict(calc_hash=calc_hash, upload_id=upload_id)) + assert calc_id is not None and upload_id is not None + additional.update(dict(calc_id=calc_id, upload_id=upload_id)) # prepare the entry with all necessary properties from the backend - calc = cls(meta=dict(id='%s/%s' % (upload_id, calc_hash))) + calc = cls(meta=dict(id='%s/%s' % (upload_id, calc_id))) for property in cls._doc_type.mapping: mapped_property = key_mappings.get(property, property) @@ -162,7 +162,7 @@ class RepoCalc(ElasticDocument, datamodel.Entity): program_name = 'unknown' logger.warning( 'Missing property value', property=mapped_property, upload_id=upload_id, - calc_hash=calc_hash, code=program_name) + calc_id=calc_id, code=program_name) continue setattr(calc, property, value) diff --git a/nomad/utils.py b/nomad/utils.py index 926e1457b24e2b71bbe13954e27e1af5ca32b070..50085c89c3248980f3f7bd9d68d7108e59569113 100644 --- a/nomad/utils.py +++ b/nomad/utils.py @@ -19,7 +19,7 @@ Logging in nomad is structured. Structured logging means that log entries contain dictionaries with quantities related to respective events. E.g. having the code, -parser, parser version, calc_hash, mainfile, etc. for all events that happen during +parser, parser version, calc_id, mainfile, etc. for all events that happen during calculation processing. This means the :func:`get_logger` and all logger functions take keyword arguments for structured data. Otherwise :func:`get_logger` can be used similar to the standard *logging.getLogger*. @@ -33,8 +33,7 @@ Depending on the configuration all logs will also be send to a central logstash. .. autofunc::nomad.utils.lnr """ -from typing import Union, IO, cast, List -import hashlib +from typing import List import base64 import logging import structlog @@ -98,7 +97,7 @@ class LogstashFormatter(logstash.formatter.LogstashFormatterBase): if key in ('event', 'stack_info', 'id', 'timestamp'): continue elif key in ( - 'archive_id', 'upload_id', 'calc_hash', 'mainfile', + 'archive_id', 'upload_id', 'calc_id', 'mainfile', 'service', 'release'): key = 'nomad.%s' % key else: @@ -173,28 +172,6 @@ def create_uuid() -> str: return base64.b64encode(uuid.uuid4().bytes, altchars=b'-_').decode('utf-8')[0:-2] -def hash(obj: Union[IO, str], length=default_hash_len) -> str: - """ - Returns a web-save base64 encoded 28 long hash for the given contents. - First 28 character of an URL safe base 64 encoded sha512 digest. - """ - hash = hashlib.sha512() - if getattr(obj, 'read', None) is not None: - for data in iter(lambda: cast(IO, obj).read(65536), b''): - hash.update(data) - elif isinstance(obj, str): - hash.update(obj.encode('utf-8')) - - return websave_hash(hash.digest(), length) - - -def websave_hash(hash, length=0): - if length > 0: - return base64.b64encode(hash, altchars=b'-_')[0:28].decode('utf-8') - else: - return base64.b64encode(hash, altchars=b'-_')[0:-2].decode('utf-8') - - def get_logger(name, **kwargs): """ Returns a structlog logger that is already attached with a logstash handler. @@ -258,8 +235,8 @@ def timer(logger, event, method='info', **kwargs): class archive: @staticmethod - def create(upload_id: str, calc_hash: str) -> str: - return '%s/%s' % (upload_id, calc_hash) + def create(upload_id: str, calc_id: str) -> str: + return '%s/%s' % (upload_id, calc_id) @staticmethod def items(archive_id: str) -> List[str]: @@ -270,7 +247,7 @@ class archive: return archive.items(archive_id)[index] @staticmethod - def calc_hash(archive_id: str) -> str: + def calc_id(archive_id: str) -> str: return archive.item(archive_id, 1) @staticmethod diff --git a/tests/processing/test_data.py b/tests/processing/test_data.py index d8c07a6f8c17568e34d14192708ade683a10a473..067cb718f9e462dbac043bac8be369f4a317fa3d 100644 --- a/tests/processing/test_data.py +++ b/tests/processing/test_data.py @@ -96,14 +96,14 @@ def assert_processing(upload: Upload, mocksearch=None): assert calc.parser is not None assert calc.mainfile is not None assert calc.status == 'SUCCESS', calc.archive_id - calc_hash = utils.archive.calc_hash(calc.archive_id) + calc_id = utils.archive.calc_id(calc.archive_id) - with upload_files.archive_file(calc_hash) as archive_json: + with upload_files.archive_file(calc_id) as archive_json: archive = json.load(archive_json) assert 'section_run' in archive assert 'section_calculation_info' in archive - with upload_files.archive_log_file(calc_hash) as f: + with upload_files.archive_log_file(calc_id) as f: assert 'a test' in f.read() assert len(calc.errors) == 0 diff --git a/tests/test_api.py b/tests/test_api.py index a79e77c997d52015159193ed12a9e3cb04cb8cdd..7a8e11a9f9e5cca567c9e7aaa1b4e7d5823231fd 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -351,7 +351,7 @@ class TestUploads: # class TestRepo: # def test_calc(self, client, example_elastic_calc, no_warn): # rv = client.get( -# '/repo/%s/%s' % (example_elastic_calc.upload_hash, example_elastic_calc.calc_hash)) +# '/repo/%s/%s' % (example_elastic_calc.upload_id, example_elastic_calc.calc_id)) # assert rv.status_code == 200 # def test_non_existing_calcs(self, client): diff --git a/tests/test_coe_repo.py b/tests/test_coe_repo.py index 81ce9e426e1d2745c8183108578fde73fa92dfa7..e498d999c54d8ee583641e7c694196b80b76368a 100644 --- a/tests/test_coe_repo.py +++ b/tests/test_coe_repo.py @@ -56,7 +56,7 @@ def assert_coe_upload(upload_id, empty=False, meta_data={}): def assert_coe_calc(calc: Calc, meta_data={}): assert int(calc.pid) == int(meta_data.get('_pid', calc.pid)) - assert calc.calc_hash == meta_data.get('_checksum', calc.calc_hash) + assert calc.calc_id == meta_data.get('_checksum', calc.calc_id) # calc data assert len(calc.filenames) == 5 @@ -76,11 +76,6 @@ def assert_coe_calc(calc: Calc, meta_data={}): def test_add_upload(clean_repository_db, processed_upload): empty = processed_upload.total_calcs == 0 - processed_upload.upload_id = str(1) - Upload.add(processed_upload) - assert_coe_upload(processed_upload.upload_id, empty=empty) - - processed_upload.upload_id = str(2) Upload.add(processed_upload) assert_coe_upload(processed_upload.upload_id, empty=empty) @@ -127,11 +122,11 @@ class TestDataSets: def test_all(self, datasets): one, two, three = datasets self.assert_datasets(one.all_datasets, []) - self.assert_datasets(two.all_datasets, [one.calc_id]) - self.assert_datasets(three.all_datasets, [one.calc_id, two.calc_id]) + self.assert_datasets(two.all_datasets, [one.coe_calc_id]) + self.assert_datasets(three.all_datasets, [one.coe_calc_id, two.coe_calc_id]) def test_direct(self, datasets): one, two, three = datasets self.assert_datasets(one.direct_datasets, []) - self.assert_datasets(two.direct_datasets, [one.calc_id]) - self.assert_datasets(three.direct_datasets, [two.calc_id]) + self.assert_datasets(two.direct_datasets, [one.coe_calc_id]) + self.assert_datasets(three.direct_datasets, [two.coe_calc_id]) diff --git a/tests/test_files.py b/tests/test_files.py index dfdc6536722453f8bf6d68a0de08fe1e7a02de8b..d00d89694b456d1d586adb4c1ef5670dedfd689a 100644 --- a/tests/test_files.py +++ b/tests/test_files.py @@ -103,11 +103,11 @@ class TestObjects: example_calc: Dict[str, Any] = { - 'hash': '0', + 'calc_id': '0', 'mainfile': 'examples_template/template.json', 'data': 'value' } -example_calc_hash = example_calc['hash'] +example_calc_id = example_calc['calc_id'] def assert_example_calc(calc): @@ -133,7 +133,7 @@ class MetadataContract: def test_insert(self, md: Metadata): md.insert(example_calc) assert len(md) == 1 - assert_example_calc(md.get(example_calc_hash)) + assert_example_calc(md.get(example_calc_id)) def test_insert_fail(self, md: Metadata): failed = False @@ -148,14 +148,14 @@ class MetadataContract: def test_update(self, md: Metadata): md.insert(example_calc) - md.update(example_calc_hash, dict(data='updated')) + md.update(example_calc_id, dict(data='updated')) assert len(md) == 1 - assert md.get(example_calc_hash)['data'] == 'updated' + assert md.get(example_calc_id)['data'] == 'updated' def test_update_fail(self, md: Metadata): failed = False try: - md.update(example_calc_hash, dict(data='updated')) + md.update(example_calc_id, dict(data='updated')) except KeyError: failed = True assert failed @@ -163,12 +163,12 @@ class MetadataContract: def test_get(self, md: Metadata): md.insert(example_calc) - assert_example_calc(md.get(example_calc_hash)) + assert_example_calc(md.get(example_calc_id)) def test_get_fail(self, md: Metadata): failed = False try: - md.get(example_calc_hash) + md.get(example_calc_id) except KeyError: failed = True assert failed @@ -233,11 +233,11 @@ class UploadFilesContract(UploadFilesFixtures): assert len(f.read()) > 0 if not test_upload._is_authorized(): with test_upload.metadata as md: - assert not md.get(example_calc_hash).get('restricted', False) + assert not md.get(example_calc_id).get('restricted', False) except Restricted: assert not test_upload._is_authorized() with test_upload.metadata as md: - assert md.get(example_calc_hash).get('restricted', False) + assert md.get(example_calc_id).get('restricted', False) @pytest.mark.parametrize('prefix', [None, 'examples']) def test_raw_file_manifest(self, test_upload: StagingUploadFiles, prefix: str): @@ -248,30 +248,30 @@ class UploadFilesContract(UploadFilesFixtures): def test_archive(self, test_upload, test_logs: bool): try: if test_logs: - with test_upload.archive_log_file(example_calc_hash, 'rt') as f: + with test_upload.archive_log_file(example_calc_id, 'rt') as f: assert f.read() == 'archive' else: - f = test_upload.archive_file(example_calc_hash, 'rt') + f = test_upload.archive_file(example_calc_id, 'rt') assert json.load(f) == 'archive' if not test_upload._is_authorized(): with test_upload.metadata as md: - assert not md.get(example_calc_hash).get('restricted', False) + assert not md.get(example_calc_id).get('restricted', False) except Restricted: assert not test_upload._is_authorized() with test_upload.metadata as md: - assert md.get(example_calc_hash).get('restricted', False) + assert md.get(example_calc_id).get('restricted', False) def test_metadata(self, test_upload): with test_upload.metadata as md: - assert_example_calc(md.get(example_calc_hash)) + assert_example_calc(md.get(example_calc_id)) def test_update_metadata(self, test_upload): with test_upload.metadata as md: - md.update(example_calc_hash, dict(data='updated')) + md.update(example_calc_id, dict(data='updated')) with test_upload.metadata as md: - assert md.get(example_calc_hash)['data'] == 'updated' + assert md.get(example_calc_id)['data'] == 'updated' def create_staging_upload(upload_id: str, calc_specs: str) -> StagingUploadFiles: @@ -291,13 +291,13 @@ def create_staging_upload(upload_id: str, calc_specs: str) -> StagingUploadFiles prefix = 0 for calc_spec in calc_specs: upload.add_rawfiles(example_file, prefix=None if prefix == 0 else str(prefix)) - hash = str(int(example_calc_hash) + prefix) - with upload.archive_file(hash, 'wt') as f: + calc_id = str(int(example_calc_id) + prefix) + with upload.archive_file(calc_id, 'wt') as f: f.write('"archive"') - with upload.archive_log_file(hash, 'wt') as f: + with upload.archive_log_file(calc_id, 'wt') as f: f.write('archive') calc = dict(**example_calc) - calc['hash'] = hash + calc['calc_id'] = calc_id if prefix > 0: calc['mainfile'] = os.path.join(str(prefix), calc['mainfile']) if calc_spec == 'r': @@ -341,10 +341,10 @@ class TestStagingUploadFiles(UploadFilesContract): assert len(content) > 0 def test_write_archive(self, test_upload): - assert json.load(test_upload.archive_file(example_calc_hash, 'rt')) == 'archive' + assert json.load(test_upload.archive_file(example_calc_id, 'rt')) == 'archive' - def test_calc_hash(self, test_upload): - assert test_upload.calc_hash(example_file_mainfile) is not None + def test_calc_id(self, test_upload): + assert test_upload.calc_id(example_file_mainfile) is not None def test_pack(self, test_upload): test_upload.pack() diff --git a/tests/test_parsing.py b/tests/test_parsing.py index 27c571267046c28443a5c6e2eeb3892c4c85fb00..bfdfaf0a926ceed23138f689db67d6c16ea9f6c8 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -242,7 +242,7 @@ def parsed_example(request) -> LocalBackend: def add_calculation_info(backend: LocalBackend) -> LocalBackend: backend.openNonOverlappingSection('section_calculation_info') backend.addValue('upload_id', 'test_upload_id') - backend.addValue('archive_id', 'test_upload_hash/test_calc_hash') + backend.addValue('archive_id', 'test_upload_id/test_calc_id') backend.addValue('main_file', 'test/mainfile.txt') backend.addValue('parser_name', 'testParser') backend.closeNonOverlappingSection('section_calculation_info') diff --git a/tests/test_repo.py b/tests/test_repo.py index d13ad56f86edb200748c7426b942b5f35b6c7958..9fd96459fe3bbb1276120c06849a826b82b825ac 100644 --- a/tests/test_repo.py +++ b/tests/test_repo.py @@ -34,7 +34,7 @@ # auxfiles = list(upload_file.get_siblings(mainfile)) # try: -# calc = RepoCalc.get(id='test_upload_hash/test_calc_hash') +# calc = RepoCalc.get(id='test_upload_id/test_calc_id') # except NotFoundError: # pass # else: @@ -42,8 +42,8 @@ # entry = RepoCalc.create_from_backend( # normalized_template_example, -# upload_hash='test_upload_hash', -# calc_hash='test_calc_hash', +# upload_id='test_upload_id', +# calc_id='test_calc_id', # upload_id='test_upload_id', # additional=dict( # mainfile=mainfile, @@ -56,7 +56,7 @@ # yield entry # try: -# calc = RepoCalc.get(id='test_upload_hash/test_calc_hash') +# calc = RepoCalc.get(id='test_upload_id/test_calc_id') # except NotFoundError: # pass # else: @@ -76,7 +76,7 @@ # assert example_elastic_calc.upload.exists() # get_result: RepoCalc = RepoCalc.get( -# id='%s/%s' % (example_elastic_calc.upload_hash, example_elastic_calc.calc_hash)) +# id='%s/%s' % (example_elastic_calc.upload_id, example_elastic_calc.calc_id)) # assert_elastic_calc(get_result) @@ -85,8 +85,8 @@ # calc = RepoCalc.create_from_backend( # normalized_template_example, -# upload_hash='test_upload_hash', -# calc_hash='test_calc_hash', +# upload_id='test_upload_id', +# calc_id='test_calc_id', # upload_id='test_upload_id', # additional=dict( # mainfile='/test/mainfile', @@ -105,9 +105,9 @@ # def test_delete_elastic_calc(example_elastic_calc: RepoCalc): # example_elastic_calc.delete() -# assert not ArchiveFile('test_upload_hash/test_calc_hash').exists() +# assert not ArchiveFile('test_upload_id/test_calc_id').exists() # try: -# RepoCalc.get(id='test_upload_hash/test_calc_hash') +# RepoCalc.get(id='test_upload_id/test_calc_id') # assert False # except NotFoundError: # pass @@ -116,10 +116,10 @@ # def test_staging_elastic_calc(example_elastic_calc: RepoCalc, no_warn): -# assert RepoCalc.get(id='test_upload_hash/test_calc_hash').staging +# assert RepoCalc.get(id='test_upload_id/test_calc_id').staging # def test_unstage_elastic_calc(example_elastic_calc: RepoCalc, no_warn): # example_elastic_calc.upload.unstage(staging=False) -# assert not RepoCalc.get(id='test_upload_hash/test_calc_hash').staging +# assert not RepoCalc.get(id='test_upload_id/test_calc_id').staging