diff --git a/gui/src/components/uploads/Upload.js b/gui/src/components/uploads/Upload.js index 7f09f825258597e92e7cec3f45d2a59c780f0f17..e944f0bf4f606f1765939022556e6647b730e4d8 100644 --- a/gui/src/components/uploads/Upload.js +++ b/gui/src/components/uploads/Upload.js @@ -354,7 +354,20 @@ class Upload extends React.Component { const renderRow = (calc, index) => { const { mainfile, upload_id, calc_id, parser, tasks, current_task, tasks_status, errors } = calc - const color = tasks_status === 'FAILURE' ? 'error' : 'default' + let tooltip_html = null + let color = tasks_status === 'FAILURE' ? 'error' : 'default' + if (tasks_status === 'FAILURE') { + tooltip_html = `Calculation processing failed with errors: ${errors.join(', ')}` + color = 'error' + } + if (calc.errors.length > 0) { + color = 'error' + tooltip_html = `Calculation processed with errors: ${calc.errors.join(', ')}` + } + if (calc.warnings.length > 0) { + color = 'error' + tooltip_html = `Calculation processed with warnings: ${calc.warnings.join(', ')}` + } const processed = tasks_status === 'FAILURE' || tasks_status === 'SUCCESS' const row = ( <TableRow key={index} hover={processed} @@ -391,10 +404,9 @@ class Upload extends React.Component { </TableRow> ) - if (tasks_status === 'FAILURE') { - const error_html = `Calculation processing failed with errors: ${errors.join(', ')}` + if (tooltip_html) { return ( - <Tooltip key={calc_id} title={error_html}> + <Tooltip key={calc_id} title={tooltip_html}> {row} </Tooltip> ) diff --git a/nomad/api/raw.py b/nomad/api/raw.py index 145170dd6b8e9df7b60c83adeced6874be11f305..d7930b72cc1003dad6289d72e34adb6d899c2d50 100644 --- a/nomad/api/raw.py +++ b/nomad/api/raw.py @@ -100,7 +100,8 @@ class FileView: return self.f.read(size) -def get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_predicate): +def get_raw_file_from_upload_path( + upload_files, upload_filepath, authorization_predicate, mainfile: str = None): """ Helper method used by 
func:`RawFileFromUploadPathResource.get` and func:`RawFileFromCalcPathResource.get`. @@ -148,11 +149,14 @@ def get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_p directory_files = upload_files.raw_file_list(upload_filepath) if len(directory_files) == 0: abort(404, message='There is nothing to be found at %s.' % upload_filepath) + + contents = sorted([dict(name=name, size=size) for name, size in directory_files], key=lambda x: '' if x['name'] == mainfile else x['name']) + # if mainfile is not None: + # contents = [mainfile] + [content for content in contents if content['name'] != mainfile] return { 'upload_id': upload_files.upload_id, 'directory': upload_filepath, - 'contents': [ - dict(name=name, size=size) for name, size in directory_files] + 'contents': contents }, 200 @@ -252,7 +256,9 @@ class RawFileFromCalcPathResource(Resource): abort(404, message='The calc with id %s is not part of the upload with id %s.' % (calc_id, upload_id)) upload_filepath = os.path.join(os.path.dirname(calc.mainfile), calc_filepath) - return get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_predicate) + return get_raw_file_from_upload_path( + upload_files, upload_filepath, authorization_predicate, + mainfile=os.path.basename(calc.mainfile)) @ns.route('/calc/<string:upload_id>/<string:calc_id>/') diff --git a/nomad/config.py b/nomad/config.py index 77df549174db014d7b42cff5659b5159ae6f896a..ac13f62664972494645952b0f699dff50e023432 100644 --- a/nomad/config.py +++ b/nomad/config.py @@ -176,7 +176,7 @@ commit = gitinfo.commit release = 'devel' domain = 'DFT' service = 'unknown nomad service' -auxfile_cutoff = 30 +auxfile_cutoff = 100 console_log_level = logging.WARNING diff --git a/nomad/datamodel/base.py b/nomad/datamodel/base.py index b8f49a77400a02852c1204e3ff285bbdcac7fd42..deae57fecbb3ce995b91f4d23061904855a0002a 100644 --- a/nomad/datamodel/base.py +++ b/nomad/datamodel/base.py @@ -85,7 +85,6 @@ class CalcWithMetadata(): # basic 
upload and processing related metadata self.upload_time: datetime.datetime = None self.files: List[str] = None - self.file_sizes: List[int] = None self.uploader: utils.POPO = None self.processed: bool = False self.last_processing: datetime.datetime = None diff --git a/nomad/files.py b/nomad/files.py index 71e1b2e0a7c043644e434e0ba868e0eaec31ef93..01e250e23f09380ae51a4e660bcd6db83ed9abb6 100644 --- a/nomad/files.py +++ b/nomad/files.py @@ -468,9 +468,11 @@ class StagingUploadFiles(UploadFiles): if not calc.with_embargo: mainfile = calc.mainfile assert mainfile is not None - for filepath in self.calc_files(mainfile): - if not always_restricted(filepath): - public_files[filepath] = None + # mainfile might already have been added due to being an auxfile to another calc + if mainfile not in public_files: + for filepath in self.calc_files(mainfile, with_cutoff=False): + if not always_restricted(filepath): + public_files[filepath] = None # 1.2 remove the non public mainfiles that have been added as auxfiles of public mainfiles for calc in upload.calcs: if calc.with_embargo: @@ -542,7 +544,7 @@ class StagingUploadFiles(UploadFiles): return results - def calc_files(self, mainfile: str, with_mainfile: bool = True) -> Iterable[str]: + def calc_files(self, mainfile: str, with_mainfile: bool = True, with_cutoff: bool = True) -> Iterable[str]: """ Returns all the auxfiles and mainfile for a given mainfile. This implements nomad's logic about what is part of a calculation and what not. The mainfile @@ -559,16 +561,21 @@ class StagingUploadFiles(UploadFiles): calc_dir = os.path.dirname(mainfile_object.os_path) calc_relative_dir = calc_dir[len(self._raw_dir.os_path) + 1:] - ls = os.listdir(calc_dir) - if len(ls) > config.auxfile_cutoff: - # If there are two many of them, its probably just a directory with lots of - # calculations. In this case it does not make any sense to provide thousands of - # aux files. 
- return [mainfile] if with_mainfile else [] + file_count = 0 + aux_files: List[str] = [] + for filename in os.listdir(calc_dir): + if filename != mainfile_basename and os.path.isfile(os.path.join(calc_dir, filename)): + aux_files.append(os.path.join(calc_relative_dir, filename)) + file_count += 1 + + if with_cutoff and file_count > config.auxfile_cutoff: + # If there are too many of them, it's probably just a directory with lots of + # calculations. In this case it does not make any sense to provide thousands of + # aux files. + break + + aux_files = sorted(aux_files) - aux_files = sorted( - os.path.join(calc_relative_dir, path) for path in ls - if os.path.isfile(os.path.join(calc_dir, path)) and path != mainfile_basename) if with_mainfile: return [mainfile] + aux_files else: diff --git a/nomad/processing/data.py b/nomad/processing/data.py index d54cd60763aa080ccde125c0dae870d0434fd616..53ae8f660e099979c6702c88b901dc556cf5080f 100644 --- a/nomad/processing/data.py +++ b/nomad/processing/data.py @@ -164,9 +164,13 @@ class Calc(Proc): calc_with_metadata.nomad_commit = config.commit calc_with_metadata.last_processing = datetime.now() calc_with_metadata.files = self.upload_files.calc_files(self.mainfile) - self.preprocess_files(calc_with_metadata.files) self.metadata = calc_with_metadata.to_dict() + if len(calc_with_metadata.files) >= config.auxfile_cutoff: + self.warning( + 'This calc has many aux files in its directory. 
' + 'Have you placed many calculations in the same directory?') + self.parsing() self.normalizing() self.archiving() @@ -215,36 +219,6 @@ class Calc(Proc): self.upload.reload() self.upload.check_join() - def preprocess_files(self, filepaths): - for path in filepaths: - if os.path.basename(path) == 'POTCAR': - # create checksum - hash = hashlib.sha224() - with open(self.upload_files.raw_file_object(path).os_path, 'rb') as f: - for line in f.readlines(): - hash.update(line) - - checksum = hash.hexdigest() - - # created stripped POTCAR - stripped_path = path + '.stripped' - with open(self.upload_files.raw_file_object(stripped_path).os_path, 'wt') as f: - f.write('Stripped POTCAR file. Checksum of original file (sha224): %s\n' % checksum) - os.system( - ''' - awk < %s >> %s ' - BEGIN { dump=1 } - /End of Dataset/ { dump=1 } - dump==1 { print } - /END of PSCTR/ { dump=0 }' - ''' % ( - self.upload_files.raw_file_object(path).os_path, - self.upload_files.raw_file_object(stripped_path).os_path)) - - filepaths.append(stripped_path) - - return filepaths - @task def parsing(self): context = dict(parser=self.parser, step=self.parser) @@ -652,6 +626,35 @@ class Upload(Proc): self.fail('bad .zip/.tar file', log_level=logging.INFO) return + def _preprocess_files(self, path): + """ + Some files need preprocessing. Currently we need to add a stripped POTCAR version + and always restrict/embargo the original. + """ + if os.path.basename(path) == 'POTCAR': + # create checksum + hash = hashlib.sha224() + with open(self.staging_upload_files.raw_file_object(path).os_path, 'rb') as orig_f: + for line in orig_f.readlines(): + hash.update(line) + + checksum = hash.hexdigest() + + # created stripped POTCAR + stripped_path = path + '.stripped' + with open(self.staging_upload_files.raw_file_object(stripped_path).os_path, 'wt') as stripped_f: + stripped_f.write('Stripped POTCAR file. 
Checksum of original file (sha224): %s\n' % checksum) + os.system( + ''' + awk < %s >> %s ' + BEGIN { dump=1 } + /End of Dataset/ { dump=1 } + dump==1 { print } + /END of PSCTR/ { dump=0 }' + ''' % ( + self.staging_upload_files.raw_file_object(path).os_path, + self.staging_upload_files.raw_file_object(stripped_path).os_path)) + def match_mainfiles(self) -> Generator[Tuple[str, object], None, None]: """ Generator function that matches all files in the upload to all parsers to @@ -663,6 +666,7 @@ class Upload(Proc): directories_with_match: Dict[str, str] = dict() upload_files = self.staging_upload_files for filename in upload_files.raw_file_manifest(): + self._preprocess_files(filename) try: parser = match_parser(filename, upload_files) if parser is not None: diff --git a/tests/data/proc/examples_large_dir.zip b/tests/data/proc/examples_large_dir.zip new file mode 100644 index 0000000000000000000000000000000000000000..dbf3663359d8e8ec414bcf434d776ae5d8b43fd2 Binary files /dev/null and b/tests/data/proc/examples_large_dir.zip differ diff --git a/tests/processing/test_data.py b/tests/processing/test_data.py index e56455c6d10848459d2a006b6a8a8d1b80749d3b..eb2e415027d7d07b2e12a4e5a1a02525dd091644 100644 --- a/tests/processing/test_data.py +++ b/tests/processing/test_data.py @@ -109,6 +109,14 @@ def test_processing(processed, no_warn, mails, monkeypatch): assert re.search(r'Processing completed', mails.messages[0].data.decode('utf-8')) is not None +def test_processing_with_large_dir(test_user, proc_infra): + upload_path = 'tests/data/proc/examples_large_dir.zip' + upload_id = upload_path[:-4] + upload = run_processing((upload_id, upload_path), test_user) + for calc in upload.calcs: + assert len(calc.warnings) == 1 + + def test_publish(non_empty_processed: Upload, no_warn, example_user_metadata, with_publish_to_coe_repo, monkeypatch): processed = non_empty_processed processed.compress_and_set_metadata(example_user_metadata) diff --git a/tests/test_api.py 
b/tests/test_api.py index acc8a947ac4e4da5b3c855babead6096823b3665..67ba51f68cafdea1983d1ba8968328d4e92eeae3 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -460,6 +460,7 @@ class TestUploads: # assert rv.status_code == 400 def test_potcar(self, client, proc_infra, test_user_auth): + # only the owner and people the upload is shared with are supposed to download the original potcar file example_file = 'tests/data/proc/examples_potcar.zip' rv = client.put('/uploads/?local_path=%s' % example_file, headers=test_user_auth)