Commit 493ada22 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Added implementation and test for multiple mainfiles in one directory.

parent c637a61b
Pipeline #44014 passed with stages
in 24 minutes and 32 seconds
......@@ -135,7 +135,6 @@ class RepoCalcsResource(Resource):
abort(400, message='Invalid owner value. Valid values are all|user|staging, default is all')
data = dict(**request.args)
print(data)
data.pop('owner', None)
data.update(per_page=per_page, page=page, order=order, order_by=order_by)
......
......@@ -35,6 +35,22 @@ almost readonly (beside metadata) storage.
/archive-public.hdf5.zip
/archive-restricted.hdf5.zip
There is an implicit relationship between files, based on them being in the same
directory. Each directory with at least one *mainfile* is a *calculation directory*
and all the files are *aux* files to that *mainfile*. This is independent of the
respective files actually contributing data or not. A *calculation directory* might
contain multiple *mainfiles*. E.g., a user simulated multiple states of the same system, has
one calculation based on the other, etc. In this case the other *mainfile* is an *aux*
file to the original *mainfile* and vice versa.
Published files are kept in pairs of public and restricted files. Here, multiple *mainfiles*
per directory present a dilemma. If one *mainfile* is restricted, all its *aux* files
should be restricted too. But if one of the *aux* files is actually a *mainfile*, it
might be published!
There are multiple ways to solve this. Due to the rarity of the case, we take the
simplest solution: if one file is public, all files are made public, except those
being other mainfiles. Therefore, the aux files of a restricted calc might become public!
"""
from abc import ABCMeta
......@@ -446,15 +462,29 @@ class StagingUploadFiles(UploadFiles):
# copy raw -> .restricted
shutil.copytree(self._raw_dir.os_path, restricted_dir.os_path)
# move public data .restricted -> .public
# We do a trick to deal with multiple mainfiles sharing the same aux files while
# having different restrictions: we first move all aux files to public (including
# potentially restricted mainfiles) and then we move only the restricted mainfiles
# back.
# move public aux files .restricted -> .public
for calc in self.metadata:
if not calc.get('with_embargo', False):
mainfile: str = calc['mainfile']
mainfile = calc['mainfile']
assert mainfile is not None
for filepath in self.calc_files(mainfile):
os.rename(
restricted_dir.join_file(filepath).os_path,
public_dir.join_file(filepath).os_path)
source = restricted_dir.join_file(filepath)
# file might have already been moved due to related calcs among aux files
if source.exists():
os.rename(source.os_path, public_dir.join_file(filepath).os_path)
# move restricted mainfiles back .public -> .restricted
for calc in self.metadata:
if calc.get('with_embargo', False):
mainfile = calc['mainfile']
assert mainfile is not None
source = public_dir.join_file(mainfile)
# file might not have been moved since all mainfiles among aux files were restricted
if source.exists():
os.rename(source.os_path, restricted_dir.join_file(mainfile).os_path)
# create bags
make_bag(restricted_dir.os_path, bag_info=bagit_metadata, checksums=['sha512'])
......@@ -662,9 +692,10 @@ class PublicUploadFiles(UploadFiles):
zip_file = self.join_file('raw-%s.bagit.zip' % access)
with ZipFile(zip_file.os_path) as zf:
for full_path in zf.namelist():
path = full_path[5:] # remove data/
if path_prefix is None or path.startswith(path_prefix):
yield path
if full_path.startswith('data/'):
path = full_path[5:] # remove data/
if path_prefix is None or path.startswith(path_prefix):
yield path
except FileNotFoundError:
pass
......@@ -680,4 +711,4 @@ class PublicUploadFiles(UploadFiles):
on current restricted information in the metadata. Should be used after updating
the restrictions on calculations. This is potentially a long running operation.
"""
pass
raise NotImplementedError()
......@@ -468,13 +468,16 @@ class Upload(Proc):
parser = match_parser(filename, self.upload_files)
if parser is not None:
directory = os.path.dirname(filename)
# TODO this might give us the chance to store directory based relationship
# between calcs for the future?
if directory in directories_with_match:
self.warnings.append(
self.info.append(
'The directory %s contains data from multiple code runs. '
'Nomad only processed %s.' % (directory, os.path.basename(filename)))
else:
directories_with_match[directory] = filename
yield filename, parser
yield filename, parser
except Exception as e:
self.get_logger().error(
'exception while matching pot. mainfile',
......
......@@ -64,17 +64,18 @@ def raw_files_infra(monkeysession):
@pytest.fixture(scope='function')
def raw_files(raw_files_infra):
""" Provides cleaned out files directory structure per function. Clears files after test. """
directories = [config.fs.objects, config.fs.tmp]
for directory in directories:
if not os.path.exists(directory):
os.makedirs(directory)
try:
yield
finally:
try:
shutil.rmtree(config.fs.objects)
except FileNotFoundError:
pass
try:
shutil.rmtree(config.fs.tmp)
except FileNotFoundError:
pass
for directory in directories:
try:
shutil.rmtree(directory)
except FileNotFoundError:
pass
@pytest.fixture(scope='function')
......
{
"section_run": [
{
"_name": "section_run",
"_gIndex": 0,
"program_name": "VASP",
"program_version": "4.6.35 3Apr08 complex parallel LinuxIFC",
"program_basis_set_type": "plane waves",
"section_method": [
{
"_name": "section_method",
"_gIndex": 0,
"electronic_structure_method": "DFT",
"section_XC_functionals": [
{
"_name": "section_XC_functionals",
"_gIndex": 0,
"XC_functional_name": "GGA_X_PBE"
}
]
}
],
"section_system": [
{
"_name": "section_system",
"_gIndex": 0,
"simulation_cell": [
[
5.76372622e-10,
0.0,
0.0
],
[
0.0,
5.76372622e-10,
0.0
],
[
0.0,
0.0,
4.0755698899999997e-10
]
],
"configuration_periodic_dimensions": [
true,
true,
true
],
"atom_positions": [
[
2.88186311e-10,
0.0,
2.0377849449999999e-10
],
[
0.0,
2.88186311e-10,
2.0377849449999999e-10
],
[
0.0,
0.0,
0.0
],
[
2.88186311e-10,
2.88186311e-10,
0.0
]
],
"atom_labels": [
"Br",
"K",
"Si",
"Si"
]
}
],
"section_single_configuration_calculation": [
{
"_name": "section_single_configuration_calculation",
"_gIndex": 0,
"single_configuration_calculation_to_system_ref": 0,
"single_configuration_to_calculation_method_ref": 0,
"energy_free": -1.5936767191492225e-18,
"energy_total": -1.5935696296699573e-18,
"energy_total_T0": -3.2126683561907e-22
}
],
"section_sampling_method": [
{
"_name": "section_sampling_method",
"_gIndex": 0,
"sampling_method": "geometry_optimization"
}
],
"section_frame_sequence": [
{
"_name": "section_frame_sequence",
"_gIndex": 0,
"frame_sequence_to_sampling_ref": 0,
"frame_sequence_local_frames_ref": [
0
]
}
]
}
]
}
\ No newline at end of file
......@@ -109,18 +109,6 @@ def test_processing_with_warning(proc_infra, test_user, with_warn):
assert_processing(upload)
@pytest.mark.timeout(10)
def test_processing_with_multi_calc_dir(proc_infra, test_user, no_warn):
    """ Processes an example upload whose directory contains multiple code runs. """
    example_file = 'tests/data/proc/examples_multi_calc_dir.zip'
    # the upload id is derived from the zip's base name
    example_upload_id = os.path.basename(example_file).replace('.zip', '')
    upload_files = ArchiveBasedStagingUploadFiles(example_upload_id, create=True)
    shutil.copyfile(example_file, upload_files.upload_file_os_path)
    upload = run_processing(example_upload_id, test_user)
    # multiple mainfiles in one directory are expected to be reported on the upload
    assert len(upload.warnings) > 0
    assert_processing(upload)
@pytest.mark.timeout(10)
def test_process_non_existing(proc_infra, test_user, with_error):
upload = run_processing('__does_not_exist', test_user)
......
......@@ -471,7 +471,7 @@ class UploadFilesBasedTests:
return wrapper
@pytest.fixture(scope='function')
def test_data(self, request, postgres, mongo, no_warn, test_user, other_test_user):
def test_data(self, request, postgres, mongo, raw_files, no_warn, test_user, other_test_user):
# delete potential old test files
for _ in [0, 1]:
upload_files = UploadFiles.get('test_upload')
......
......@@ -18,6 +18,9 @@ import os.path
import shutil
import pytest
import json
import itertools
import zipfile
import re
from nomad import config
from nomad.files import DirectoryObject, PathObject
......@@ -29,6 +32,7 @@ from nomad.files import StagingUploadFiles, PublicUploadFiles, UploadFiles, Rest
# example_file uses an artificial parser for faster test execution, can also be
# changed to examples_vasp.zip for using vasp parser
example_file = 'tests/data/proc/examples_template.zip'
example_directory = 'tests/data/proc/examples_template'
example_file_contents = [
'examples_template/template.json',
'examples_template/1.aux',
......@@ -94,6 +98,52 @@ example_calc: Dict[str, Any] = {
example_calc_id = example_calc['calc_id']
def generate_example_calc(calc_id: int, with_mainfile_prefix: bool, subdirectory: str = None, **kwargs):
    """
    Creates example calc metadata and a matching single-calc upload ``.zip`` file.

    Arguments:
        calc_id: Integer id for the generated calc; stored as a string in the metadata.
        with_mainfile_prefix: If True, the mainfile is named ``<calc_id>.template.json``
            so that multiple mainfiles can coexist in one directory.
        subdirectory: Optional directory prefix put in front of all file paths.
        **kwargs: Additional metadata entries (e.g. ``with_embargo``) merged into the
            calc metadata.

    Returns:
        A tuple of the calc metadata dict and the path to the created zip file.
    """
    # use a local name that does not shadow the module-level ``example_calc`` dict
    calc = dict(calc_id=str(calc_id), data='value')

    if with_mainfile_prefix:
        mainfile = '%d.template.json' % calc_id
    else:
        mainfile = 'template.json'

    if subdirectory is not None:
        mainfile = os.path.join(subdirectory, mainfile)
    calc['mainfile'] = mainfile
    calc.update(**kwargs)

    # re-pack the example template files into a fresh zip, applying the
    # mainfile-prefix and subdirectory renames to the archive names
    example_file = os.path.join(config.fs.tmp, 'example.zip')
    with zipfile.ZipFile(example_file, 'w', zipfile.ZIP_DEFLATED) as zf:
        for filepath in example_file_contents:
            filename = os.path.basename(filepath)
            arcname = filename
            if arcname == 'template.json' and with_mainfile_prefix:
                arcname = '%d.template.json' % calc_id
            if subdirectory is not None:
                arcname = os.path.join(subdirectory, arcname)
            zf.write(os.path.join(example_directory, filename), arcname)

    return calc, example_file
def assert_example_files(names, with_mainfile: bool = True):
    """
    Asserts that the given raw file names correspond to ``example_file_contents``,
    after normalizing away calc-id mainfile prefixes and numeric subdirectories.

    Arguments:
        names: The raw file paths to check.
        with_mainfile: If False, mainfiles (``template.json``) are excluded from
            the comparison on both sides.
    """
    # TODO it's complicated
    # To compare the files with the example_file_contents list we have to assume
    # - different subdirectories
    # - mainfile prefixes
    # - mainfiles among aux files
    # a '<digit>.template.json' name marks a multi-mainfile directory
    # NOTE: the dot is escaped on purpose; an unescaped '.' would match any character
    is_multi = any(re.search(r'[0-9]\.t', name) for name in names)

    def normalized_file(name):
        # strip the '<calc_id>.' mainfile prefix and a leading '<digit>/' subdirectory
        name = re.sub(r'[0-9]\.t', 't', name)
        name = re.sub(r'^[0-9]/', '', name)
        return name

    source = sorted(set(
        normalized_file(name) for name in names
        if not name.endswith('template.json') or with_mainfile or not is_multi))
    target = sorted(
        name for name in example_file_contents
        if not name.endswith('template.json') or with_mainfile)
    assert source == target
def assert_example_calc(calc):
assert calc is not None
assert calc['data'] == example_calc['data']
......@@ -208,19 +258,21 @@ class UploadFilesContract(UploadFilesFixtures):
assert UploadFiles.get(empty_test_upload.upload_id).__class__ == empty_test_upload.__class__
def test_rawfile(self, test_upload):
try:
with test_upload.raw_file(example_file_mainfile) as f:
assert len(f.read()) > 0
if not test_upload._is_authorized():
assert not test_upload.metadata.get(example_calc_id).get('with_embargo', False)
except Restricted:
assert not test_upload._is_authorized()
assert test_upload.metadata.get(example_calc_id).get('with_embargo', False)
assert len(test_upload.metadata) > 0
for calc in test_upload.metadata:
try:
with test_upload.raw_file(calc['mainfile']) as f:
assert len(f.read()) > 0
if not test_upload._is_authorized():
assert not test_upload.metadata.get(calc['calc_id']).get('with_embargo', False)
except Restricted:
assert not test_upload._is_authorized()
assert test_upload.metadata.get(calc['calc_id']).get('with_embargo', False)
@pytest.mark.parametrize('prefix', [None, 'examples'])
def test_raw_file_manifest(self, test_upload: StagingUploadFiles, prefix: str):
raw_files = list(test_upload.raw_file_manifest(path_prefix=prefix))
assert sorted(file for file in raw_files if file.startswith('examples')) == sorted(example_file_contents)
assert_example_files(raw_files)
@pytest.mark.parametrize('test_logs', [True, False])
def test_archive(self, test_upload, test_logs: bool):
......@@ -250,45 +302,44 @@ def create_staging_upload(upload_id: str, calc_specs: str) -> StagingUploadFiles
Arguments:
upload_id: The id that should be given to this test upload.
calc_specs: A string that determines the properties of the given upload.
With letters determining example calcs being public `p` or restricted `p`.
With letters determining example calcs being public `p` or restricted `r`.
The calcs will be copies of calcs in `example_file`.
First calc is at top level, following calcs will be put under 1/, 2/, etc.
All calcs with capital `P`/`R` will be put in the same directory under multi/.
"""
upload = StagingUploadFiles(upload_id, create=True, is_authorized=lambda: True)
prefix = 0
for calc_spec in calc_specs:
upload.add_rawfiles(example_file, prefix=None if prefix == 0 else str(prefix))
calc_id = str(int(example_calc_id) + prefix)
is_multi = calc_spec in ['R', 'P']
calc_spec = calc_spec.lower()
if is_multi or prefix == 0:
directory = 'examples_template'
else:
directory = os.path.join(str(prefix), 'examples_template')
calc, upload_file = generate_example_calc(
prefix, with_mainfile_prefix=is_multi, subdirectory=directory, with_embargo=calc_spec == 'r')
calc_id = calc['calc_id']
upload.add_rawfiles(upload_file)
with upload.archive_file(calc_id, 'wt') as f:
f.write('"archive"')
with upload.archive_log_file(calc_id, 'wt') as f:
f.write('archive')
calc = dict(**example_calc)
calc['calc_id'] = calc_id
if prefix > 0:
calc['mainfile'] = os.path.join(str(prefix), calc['mainfile'])
if calc_spec == 'r':
calc['with_embargo'] = True
elif calc_spec == 'p':
calc['with_embargo'] = False
upload.metadata.insert(calc)
prefix += 1
if calc_specs.startswith('P'):
public_only = True
calc_specs = calc_specs[1:]
else:
public_only = False
upload._is_authorized = lambda: not public_only
assert len(upload.metadata) == len(calc_specs)
return upload
class TestStagingUploadFiles(UploadFilesContract):
@pytest.fixture(scope='function', params=['r', 'rr', 'pr', 'rp', 'p', 'pp'])
@pytest.fixture(scope='function', params=['r', 'rr', 'pr', 'rp', 'p', 'pp', 'RP', 'RR', 'PP'])
def test_upload(self, request, test_upload_id: str) -> StagingUploadFiles:
return create_staging_upload(test_upload_id, calc_specs=request.param)
......@@ -321,11 +372,7 @@ class TestStagingUploadFiles(UploadFilesContract):
for calc in test_upload.metadata:
mainfile = calc['mainfile']
calc_files = test_upload.calc_files(mainfile, with_mainfile=with_mainfile)
assert len(list(calc_files)) == len(example_file_contents) - 0 if with_mainfile else 1
if with_mainfile:
for one, two in zip(calc_files, [mainfile] + sorted(example_file_contents[1:])):
assert one.endswith(two)
assert one.startswith(mainfile[:3])
assert_example_files(calc_files, with_mainfile=with_mainfile)
def test_delete(self, test_upload: StagingUploadFiles):
test_upload.delete()
......@@ -368,15 +415,10 @@ class TestPublicUploadFiles(UploadFilesContract):
def empty_test_upload(self, test_upload_id: str) -> Generator[UploadFiles, None, None]:
yield create_public_upload(test_upload_id, calc_specs='', is_authorized=lambda: True)
@pytest.fixture(scope='function', params=['r', 'rr', 'pr', 'rp', 'p', 'pp', 'Ppr', 'Prp'])
@pytest.fixture(scope='function', params=itertools.product(
['r', 'rr', 'pr', 'rp', 'p', 'pp', 'RP', 'RR', 'PP'], [True, False]))
def test_upload(self, request, test_upload_id: str) -> PublicUploadFiles:
calc_specs = request.param
if calc_specs.startswith('P'):
public_only = True
calc_specs = calc_specs[1:]
else:
public_only = False
calc_specs, protected = request.param
staging_upload = create_staging_upload(test_upload_id, calc_specs=calc_specs)
staging_upload.pack()
return PublicUploadFiles(test_upload_id, is_authorized=lambda: not public_only)
return PublicUploadFiles(test_upload_id, is_authorized=lambda: not protected)
......@@ -53,7 +53,6 @@ def normalized_template_example(parsed_template_example) -> LocalBackend:
def test_template_example_normalizer(parsed_template_example, no_warn, caplog):
run_normalize(parsed_template_example)
print(str(caplog.records))
def assert_normalized(backend: LocalBackend):
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment