Commit a6456e2d authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Improved the auxfile handling. Some fixes to rawfile api. Completes implementation of #171.

parent 5b668898
Pipeline #52158 passed with stages
in 29 minutes and 5 seconds
......@@ -354,7 +354,20 @@ class Upload extends React.Component {
const renderRow = (calc, index) => {
const { mainfile, upload_id, calc_id, parser, tasks, current_task, tasks_status, errors } = calc
const color = tasks_status === 'FAILURE' ? 'error' : 'default'
let tooltip_html = null
let color = tasks_status === 'FAILURE' ? 'error' : 'default'
if (tasks_status === 'FAILURE') {
tooltip_html = `Calculation processing failed with errors: ${errors.join(', ')}`
color = 'error'
}
if (calc.errors.length > 0) {
color = 'error'
tooltip_html = `Calculation processed with errors: ${calc.errors.join(', ')}`
}
if (calc.warnings.length > 0) {
color = 'error'
tooltip_html = `Calculation processed with warnings: ${calc.warnings.join(', ')}`
}
const processed = tasks_status === 'FAILURE' || tasks_status === 'SUCCESS'
const row = (
<TableRow key={index} hover={processed}
......@@ -391,10 +404,9 @@ class Upload extends React.Component {
</TableRow>
)
if (tasks_status === 'FAILURE') {
const error_html = `Calculation processing failed with errors: ${errors.join(', ')}`
if (tooltip_html) {
return (
<Tooltip key={calc_id} title={error_html}>
<Tooltip key={calc_id} title={tooltip_html}>
{row}
</Tooltip>
)
......
......@@ -100,7 +100,8 @@ class FileView:
return self.f.read(size)
def get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_predicate):
def get_raw_file_from_upload_path(
upload_files, upload_filepath, authorization_predicate, mainfile: str = None):
"""
Helper method used by func:`RawFileFromUploadPathResource.get` and
func:`RawFileFromCalcPathResource.get`.
......@@ -148,11 +149,14 @@ def get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_p
directory_files = upload_files.raw_file_list(upload_filepath)
if len(directory_files) == 0:
abort(404, message='There is nothing to be found at %s.' % upload_filepath)
contents = sorted([dict(name=name, size=size) for name, size in directory_files], key=lambda x: '' if x['name'] == mainfile else x['name'])
# if mainfile is not None:
# contents = [mainfile] + [content for content in contents if content['name'] != mainfile]
return {
'upload_id': upload_files.upload_id,
'directory': upload_filepath,
'contents': [
dict(name=name, size=size) for name, size in directory_files]
'contents': contents
}, 200
......@@ -252,7 +256,9 @@ class RawFileFromCalcPathResource(Resource):
abort(404, message='The calc with id %s is not part of the upload with id %s.' % (calc_id, upload_id))
upload_filepath = os.path.join(os.path.dirname(calc.mainfile), calc_filepath)
return get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_predicate)
return get_raw_file_from_upload_path(
upload_files, upload_filepath, authorization_predicate,
mainfile=os.path.basename(calc.mainfile))
@ns.route('/calc/<string:upload_id>/<string:calc_id>/')
......
......@@ -176,7 +176,7 @@ commit = gitinfo.commit
release = 'devel'
domain = 'DFT'
service = 'unknown nomad service'
auxfile_cutoff = 30
auxfile_cutoff = 100
console_log_level = logging.WARNING
......
......@@ -85,7 +85,6 @@ class CalcWithMetadata():
# basic upload and processing related metadata
self.upload_time: datetime.datetime = None
self.files: List[str] = None
self.file_sizes: List[int] = None
self.uploader: utils.POPO = None
self.processed: bool = False
self.last_processing: datetime.datetime = None
......
......@@ -468,9 +468,11 @@ class StagingUploadFiles(UploadFiles):
if not calc.with_embargo:
mainfile = calc.mainfile
assert mainfile is not None
for filepath in self.calc_files(mainfile):
if not always_restricted(filepath):
public_files[filepath] = None
# mainfile might already have been added due to being a auxfile to another calc
if mainfile not in public_files:
for filepath in self.calc_files(mainfile, with_cutoff=False):
if not always_restricted(filepath):
public_files[filepath] = None
# 1.2 remove the non public mainfiles that have been added as auxfiles of public mainfiles
for calc in upload.calcs:
if calc.with_embargo:
......@@ -542,7 +544,7 @@ class StagingUploadFiles(UploadFiles):
return results
def calc_files(self, mainfile: str, with_mainfile: bool = True) -> Iterable[str]:
def calc_files(self, mainfile: str, with_mainfile: bool = True, with_cutoff: bool = True) -> Iterable[str]:
"""
Returns all the auxfiles and mainfile for a given mainfile. This implements
nomad's logic about what is part of a calculation and what not. The mainfile
......@@ -559,16 +561,21 @@ class StagingUploadFiles(UploadFiles):
calc_dir = os.path.dirname(mainfile_object.os_path)
calc_relative_dir = calc_dir[len(self._raw_dir.os_path) + 1:]
ls = os.listdir(calc_dir)
if len(ls) > config.auxfile_cutoff:
# If there are two many of them, its probably just a directory with lots of
# calculations. In this case it does not make any sense to provide thousands of
# aux files.
return [mainfile] if with_mainfile else []
file_count = 0
aux_files: List[str] = []
for filename in os.listdir(calc_dir):
if filename != mainfile_basename and os.path.isfile(os.path.join(calc_dir, filename)):
aux_files.append(os.path.join(calc_relative_dir, filename))
file_count += 1
if with_cutoff and file_count > config.auxfile_cutoff:
# If there are two many of them, its probably just a directory with lots of
# calculations. In this case it does not make any sense to provide thousands of
# aux files.
break
aux_files = sorted(aux_files)
aux_files = sorted(
os.path.join(calc_relative_dir, path) for path in ls
if os.path.isfile(os.path.join(calc_dir, path)) and path != mainfile_basename)
if with_mainfile:
return [mainfile] + aux_files
else:
......
......@@ -164,9 +164,13 @@ class Calc(Proc):
calc_with_metadata.nomad_commit = config.commit
calc_with_metadata.last_processing = datetime.now()
calc_with_metadata.files = self.upload_files.calc_files(self.mainfile)
self.preprocess_files(calc_with_metadata.files)
self.metadata = calc_with_metadata.to_dict()
if len(calc_with_metadata.files) >= config.auxfile_cutoff:
self.warning(
'This calc has many aux files in its directory. '
'Have you placed many calculations in the same directory?')
self.parsing()
self.normalizing()
self.archiving()
......@@ -215,36 +219,6 @@ class Calc(Proc):
self.upload.reload()
self.upload.check_join()
def preprocess_files(self, filepaths):
for path in filepaths:
if os.path.basename(path) == 'POTCAR':
# create checksum
hash = hashlib.sha224()
with open(self.upload_files.raw_file_object(path).os_path, 'rb') as f:
for line in f.readlines():
hash.update(line)
checksum = hash.hexdigest()
# created stripped POTCAR
stripped_path = path + '.stripped'
with open(self.upload_files.raw_file_object(stripped_path).os_path, 'wt') as f:
f.write('Stripped POTCAR file. Checksum of original file (sha224): %s\n' % checksum)
os.system(
'''
awk < %s >> %s '
BEGIN { dump=1 }
/End of Dataset/ { dump=1 }
dump==1 { print }
/END of PSCTR/ { dump=0 }'
''' % (
self.upload_files.raw_file_object(path).os_path,
self.upload_files.raw_file_object(stripped_path).os_path))
filepaths.append(stripped_path)
return filepaths
@task
def parsing(self):
context = dict(parser=self.parser, step=self.parser)
......@@ -652,6 +626,35 @@ class Upload(Proc):
self.fail('bad .zip/.tar file', log_level=logging.INFO)
return
def _preprocess_files(self, path):
"""
Some files need preprocessing. Currently we need to add a stripped POTCAR version
and always restrict/embargo the original.
"""
if os.path.basename(path) == 'POTCAR':
# create checksum
hash = hashlib.sha224()
with open(self.staging_upload_files.raw_file_object(path).os_path, 'rb') as orig_f:
for line in orig_f.readlines():
hash.update(line)
checksum = hash.hexdigest()
# created stripped POTCAR
stripped_path = path + '.stripped'
with open(self.staging_upload_files.raw_file_object(stripped_path).os_path, 'wt') as stripped_f:
stripped_f.write('Stripped POTCAR file. Checksum of original file (sha224): %s\n' % checksum)
os.system(
'''
awk < %s >> %s '
BEGIN { dump=1 }
/End of Dataset/ { dump=1 }
dump==1 { print }
/END of PSCTR/ { dump=0 }'
''' % (
self.staging_upload_files.raw_file_object(path).os_path,
self.staging_upload_files.raw_file_object(stripped_path).os_path))
def match_mainfiles(self) -> Generator[Tuple[str, object], None, None]:
"""
Generator function that matches all files in the upload to all parsers to
......@@ -663,6 +666,7 @@ class Upload(Proc):
directories_with_match: Dict[str, str] = dict()
upload_files = self.staging_upload_files
for filename in upload_files.raw_file_manifest():
self._preprocess_files(filename)
try:
parser = match_parser(filename, upload_files)
if parser is not None:
......
......@@ -109,6 +109,14 @@ def test_processing(processed, no_warn, mails, monkeypatch):
assert re.search(r'Processing completed', mails.messages[0].data.decode('utf-8')) is not None
def test_processing_with_large_dir(test_user, proc_infra):
upload_path = 'tests/data/proc/examples_large_dir.zip'
upload_id = upload_path[:-4]
upload = run_processing((upload_id, upload_path), test_user)
for calc in upload.calcs:
assert len(calc.warnings) == 1
def test_publish(non_empty_processed: Upload, no_warn, example_user_metadata, with_publish_to_coe_repo, monkeypatch):
processed = non_empty_processed
processed.compress_and_set_metadata(example_user_metadata)
......
......@@ -460,6 +460,7 @@ class TestUploads:
# assert rv.status_code == 400
def test_potcar(self, client, proc_infra, test_user_auth):
# only the owner, shared with people are supposed to download the original potcar file
example_file = 'tests/data/proc/examples_potcar.zip'
rv = client.put('/uploads/?local_path=%s' % example_file, headers=test_user_auth)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment