Commit a6456e2d authored by Markus Scheidgen

Improved the auxfile handling. Some fixes to the raw file API. Completes the implementation of #171.

parent 5b668898
Pipeline #52158 passed with stages in 29 minutes and 5 seconds
@@ -354,7 +354,20 @@ class Upload extends React.Component {
     const renderRow = (calc, index) => {
       const { mainfile, upload_id, calc_id, parser, tasks, current_task, tasks_status, errors } = calc
-      const color = tasks_status === 'FAILURE' ? 'error' : 'default'
+      let tooltip_html = null
+      let color = tasks_status === 'FAILURE' ? 'error' : 'default'
+      if (tasks_status === 'FAILURE') {
+        tooltip_html = `Calculation processing failed with errors: ${errors.join(', ')}`
+        color = 'error'
+      }
+      if (calc.errors.length > 0) {
+        color = 'error'
+        tooltip_html = `Calculation processed with errors: ${calc.errors.join(', ')}`
+      }
+      if (calc.warnings.length > 0) {
+        color = 'error'
+        tooltip_html = `Calculation processed with warnings: ${calc.warnings.join(', ')}`
+      }
       const processed = tasks_status === 'FAILURE' || tasks_status === 'SUCCESS'
       const row = (
         <TableRow key={index} hover={processed}
@@ -391,10 +404,9 @@ class Upload extends React.Component {
         </TableRow>
       )
-      if (tasks_status === 'FAILURE') {
-        const error_html = `Calculation processing failed with errors: ${errors.join(', ')}`
+      if (tooltip_html) {
         return (
-          <Tooltip key={calc_id} title={error_html}>
+          <Tooltip key={calc_id} title={tooltip_html}>
             {row}
           </Tooltip>
         )
...
@@ -100,7 +100,8 @@ class FileView:
         return self.f.read(size)


-def get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_predicate):
+def get_raw_file_from_upload_path(
+        upload_files, upload_filepath, authorization_predicate, mainfile: str = None):
     """
     Helper method used by func:`RawFileFromUploadPathResource.get` and
     func:`RawFileFromCalcPathResource.get`.
@@ -148,11 +149,14 @@ def get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_p
         directory_files = upload_files.raw_file_list(upload_filepath)
         if len(directory_files) == 0:
             abort(404, message='There is nothing to be found at %s.' % upload_filepath)

+        contents = sorted(
+            [dict(name=name, size=size) for name, size in directory_files],
+            key=lambda x: '' if x['name'] == mainfile else x['name'])
         return {
             'upload_id': upload_files.upload_id,
             'directory': upload_filepath,
-            'contents': [
-                dict(name=name, size=size) for name, size in directory_files]
+            'contents': contents
         }, 200
@@ -252,7 +256,9 @@ class RawFileFromCalcPathResource(Resource):
             abort(404, message='The calc with id %s is not part of the upload with id %s.' % (calc_id, upload_id))

         upload_filepath = os.path.join(os.path.dirname(calc.mainfile), calc_filepath)
-        return get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_predicate)
+        return get_raw_file_from_upload_path(
+            upload_files, upload_filepath, authorization_predicate,
+            mainfile=os.path.basename(calc.mainfile))


 @ns.route('/calc/<string:upload_id>/<string:calc_id>/')
...
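The new `mainfile` parameter only affects ordering: mapping the mainfile's name to the empty string makes it sort before every non-empty name, so directory listings always lead with the calculation's mainfile. A minimal standalone sketch of that sort-key trick (the file names here are made up for illustration):

    # The empty-string key sorts first, so the mainfile ends up at the top;
    # all other entries keep their alphabetical order.
    mainfile = 'OUTCAR'
    contents = [dict(name=n, size=s) for n, s in [('vasprun.xml', 420), ('INCAR', 12), ('OUTCAR', 99)]]
    contents = sorted(contents, key=lambda x: '' if x['name'] == mainfile else x['name'])
    assert [c['name'] for c in contents] == ['OUTCAR', 'INCAR', 'vasprun.xml']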
@@ -176,7 +176,7 @@ commit = gitinfo.commit
 release = 'devel'
 domain = 'DFT'
 service = 'unknown nomad service'
-auxfile_cutoff = 30
+auxfile_cutoff = 100
 console_log_level = logging.WARNING
...
@@ -85,7 +85,6 @@ class CalcWithMetadata():
         # basic upload and processing related metadata
         self.upload_time: datetime.datetime = None
         self.files: List[str] = None
-        self.file_sizes: List[int] = None
         self.uploader: utils.POPO = None
         self.processed: bool = False
         self.last_processing: datetime.datetime = None
...
@@ -468,9 +468,11 @@ class StagingUploadFiles(UploadFiles):
             if not calc.with_embargo:
                 mainfile = calc.mainfile
                 assert mainfile is not None
-                for filepath in self.calc_files(mainfile):
-                    if not always_restricted(filepath):
-                        public_files[filepath] = None
+                # the mainfile might already have been added as an auxfile of another calc
+                if mainfile not in public_files:
+                    for filepath in self.calc_files(mainfile, with_cutoff=False):
+                        if not always_restricted(filepath):
+                            public_files[filepath] = None
         # 1.2 remove the non public mainfiles that have been added as auxfiles of public mainfiles
         for calc in upload.calcs:
             if calc.with_embargo:
@@ -542,7 +544,7 @@ class StagingUploadFiles(UploadFiles):
         return results

-    def calc_files(self, mainfile: str, with_mainfile: bool = True) -> Iterable[str]:
+    def calc_files(self, mainfile: str, with_mainfile: bool = True, with_cutoff: bool = True) -> Iterable[str]:
         """
         Returns all the auxfiles and mainfile for a given mainfile. This implements
         nomad's logic about what is part of a calculation and what not. The mainfile
@@ -559,16 +561,21 @@ class StagingUploadFiles(UploadFiles):
         calc_dir = os.path.dirname(mainfile_object.os_path)
         calc_relative_dir = calc_dir[len(self._raw_dir.os_path) + 1:]

-        ls = os.listdir(calc_dir)
-        if len(ls) > config.auxfile_cutoff:
-            # If there are two many of them, its probably just a directory with lots of
-            # calculations. In this case it does not make any sense to provide thousands of
-            # aux files.
-            return [mainfile] if with_mainfile else []
-
-        aux_files = sorted(
-            os.path.join(calc_relative_dir, path) for path in ls
-            if os.path.isfile(os.path.join(calc_dir, path)) and path != mainfile_basename)
+        file_count = 0
+        aux_files: List[str] = []
+        for filename in os.listdir(calc_dir):
+            if filename != mainfile_basename and os.path.isfile(os.path.join(calc_dir, filename)):
+                aux_files.append(os.path.join(calc_relative_dir, filename))
+                file_count += 1
+            if with_cutoff and file_count > config.auxfile_cutoff:
+                # If there are too many aux files, it's probably just a directory with lots
+                # of calculations. In this case it does not make any sense to provide
+                # thousands of aux files.
+                break
+
+        aux_files = sorted(aux_files)

         if with_mainfile:
             return [mainfile] + aux_files
         else:
...
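Note that the rewritten loop collects one file past the cutoff before breaking, so a calc that ran into the cutoff ends up with at least `auxfile_cutoff` entries in its file list; the warning added to the processing code below keys on exactly that. A rough sketch of the interaction (the file counts are illustrative only):

    # Illustrative only: with auxfile_cutoff = 100, a directory of 500 files
    # yields 101 collected aux files (the loop breaks once the count exceeds
    # the cutoff), so len(files) >= auxfile_cutoff and the warning fires.
    auxfile_cutoff = 100
    aux_files, file_count = [], 0
    for filename in ('aux_%d' % i for i in range(500)):
        aux_files.append(filename)
        file_count += 1
        if file_count > auxfile_cutoff:
            break
    files = ['the_mainfile'] + sorted(aux_files)
    assert len(files) == 102 and len(files) >= auxfile_cutoff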
@@ -164,9 +164,13 @@ class Calc(Proc):
             calc_with_metadata.nomad_commit = config.commit
             calc_with_metadata.last_processing = datetime.now()
             calc_with_metadata.files = self.upload_files.calc_files(self.mainfile)
-            self.preprocess_files(calc_with_metadata.files)
             self.metadata = calc_with_metadata.to_dict()

+            if len(calc_with_metadata.files) >= config.auxfile_cutoff:
+                self.warning(
+                    'This calc has many aux files in its directory. '
+                    'Have you placed many calculations in the same directory?')
+
             self.parsing()
             self.normalizing()
             self.archiving()
@@ -215,36 +219,6 @@ class Calc(Proc):
             self.upload.reload()
             self.upload.check_join()

-    def preprocess_files(self, filepaths):
-        for path in filepaths:
-            if os.path.basename(path) == 'POTCAR':
-                # create checksum
-                hash = hashlib.sha224()
-                with open(self.upload_files.raw_file_object(path).os_path, 'rb') as f:
-                    for line in f.readlines():
-                        hash.update(line)
-
-                checksum = hash.hexdigest()
-
-                # created stripped POTCAR
-                stripped_path = path + '.stripped'
-                with open(self.upload_files.raw_file_object(stripped_path).os_path, 'wt') as f:
-                    f.write('Stripped POTCAR file. Checksum of original file (sha224): %s\n' % checksum)
-                os.system(
-                    '''
-                        awk < %s >> %s '
-                            BEGIN { dump=1 }
-                            /End of Dataset/ { dump=1 }
-                            dump==1 { print }
-                            /END of PSCTR/ { dump=0 }'
-                    ''' % (
-                        self.upload_files.raw_file_object(path).os_path,
-                        self.upload_files.raw_file_object(stripped_path).os_path))
-
-                filepaths.append(stripped_path)
-
-        return filepaths
-
     @task
     def parsing(self):
         context = dict(parser=self.parser, step=self.parser)
@@ -652,6 +626,35 @@ class Upload(Proc):
             self.fail('bad .zip/.tar file', log_level=logging.INFO)
             return

+    def _preprocess_files(self, path):
+        """
+        Some files need preprocessing. Currently we need to add a stripped POTCAR version
+        and always restrict/embargo the original.
+        """
+        if os.path.basename(path) == 'POTCAR':
+            # create checksum
+            hash = hashlib.sha224()
+            with open(self.staging_upload_files.raw_file_object(path).os_path, 'rb') as orig_f:
+                for line in orig_f.readlines():
+                    hash.update(line)
+
+            checksum = hash.hexdigest()
+
+            # create stripped POTCAR
+            stripped_path = path + '.stripped'
+            with open(self.staging_upload_files.raw_file_object(stripped_path).os_path, 'wt') as stripped_f:
+                stripped_f.write('Stripped POTCAR file. Checksum of original file (sha224): %s\n' % checksum)
+            os.system(
+                '''
+                    awk < %s >> %s '
+                        BEGIN { dump=1 }
+                        /End of Dataset/ { dump=1 }
+                        dump==1 { print }
+                        /END of PSCTR/ { dump=0 }'
+                ''' % (
+                    self.staging_upload_files.raw_file_object(path).os_path,
+                    self.staging_upload_files.raw_file_object(stripped_path).os_path))
+
     def match_mainfiles(self) -> Generator[Tuple[str, object], None, None]:
         """
         Generator function that matches all files in the upload to all parsers to
@@ -663,6 +666,7 @@ class Upload(Proc):
         directories_with_match: Dict[str, str] = dict()
         upload_files = self.staging_upload_files
         for filename in upload_files.raw_file_manifest():
+            self._preprocess_files(filename)
             try:
                 parser = match_parser(filename, upload_files)
                 if parser is not None:
...
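The awk program in `_preprocess_files` is terse: it prints everything up to and including the `END of PSCTR` line of each POTCAR entry (the header metadata), suppresses the potential data that follows (which is why the original file must stay restricted), and resumes output at each `End of Dataset` marker. A pure-Python sketch of the same filter, for illustration only:

    # Keep the PSCTR header of every POTCAR entry, drop the potential data
    # after it, and resume output at each 'End of Dataset' marker.
    def strip_potcar_lines(lines):
        dump = True
        for line in lines:
            if 'End of Dataset' in line:
                dump = True
            if dump:
                yield line
            if 'END of PSCTR' in line:
                dump = False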
@@ -109,6 +109,14 @@ def test_processing(processed, no_warn, mails, monkeypatch):
     assert re.search(r'Processing completed', mails.messages[0].data.decode('utf-8')) is not None


+def test_processing_with_large_dir(test_user, proc_infra):
+    upload_path = 'tests/data/proc/examples_large_dir.zip'
+    upload_id = upload_path[:-4]
+    upload = run_processing((upload_id, upload_path), test_user)
+    for calc in upload.calcs:
+        assert len(calc.warnings) == 1
+
+
 def test_publish(non_empty_processed: Upload, no_warn, example_user_metadata, with_publish_to_coe_repo, monkeypatch):
     processed = non_empty_processed
     processed.compress_and_set_metadata(example_user_metadata)
...
@@ -460,6 +460,7 @@ class TestUploads:
         # assert rv.status_code == 400

     def test_potcar(self, client, proc_infra, test_user_auth):
+        # only the owner and people the upload is shared with are supposed to be able
+        # to download the original POTCAR file
         example_file = 'tests/data/proc/examples_potcar.zip'
         rv = client.put('/uploads/?local_path=%s' % example_file, headers=test_user_auth)
...