Commit 67fb5f9b authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Added basic support for matching and parsing compressed mainfiles.

parent 1fb5ef2c
Pipeline #43670 failed with stages
in 17 minutes and 26 seconds
......@@ -44,7 +44,7 @@
"cwd": "${workspaceFolder}",
"program": "${workspaceFolder}/.pyenv/bin/pytest",
"args": [
"-sv", "tests/test_api.py::TestAuth::test_put_user"
"-sv", "tests/test_parsing.py::test_parser[parsers/vasp-tests/data/parsers/vasp_compressed/vasp.xml.gz]"
]
},
{
......
Subproject commit 05e5f608178df11fe5563dcb88c3e475d51cd230
Subproject commit 386c64df4c8e4acba3d3339a5b018f1178c5294f
......@@ -91,7 +91,7 @@ class CalcProcReproduction:
if parser_name is not None:
parser = parser_dict.get(parser_name)
else:
parser = match_parser(self.mainfile, lambda: self.upload_files.raw_file(self.mainfile, 'rb'))
parser = match_parser(self.mainfile, self.upload_files)
assert parser is not None, 'there is not parser matching %s' % self.mainfile
self.logger = self.logger.bind(parser=parser.name) # type: ignore
......
......@@ -50,7 +50,6 @@ import hashlib
import base64
import io
import gzip
import bz2
from nomad import config, utils
......@@ -273,14 +272,12 @@ class UploadFiles(DirectoryObject, metaclass=ABCMeta):
""" The calc metadata for this upload. """
raise NotImplementedError
def raw_file(self, file_path: str, *args, compressed: bool = True, **kwargs) -> IO:
def raw_file(self, file_path: str, *args, **kwargs) -> IO:
"""
Opens a raw file and returns a file-like object. Additional args, kwargs are
delegated to the respective `open` call.
Arguments:
file_path: The path to the file relative to the upload.
compressed: If True will open the raw file as it is, even if it is compressed.
With False, it will transparently open to read the decompressed file contents.
Raises:
KeyError: If the file does not exist.
Restricted: If the file is restricted and upload access evaluated to False.
......@@ -345,26 +342,13 @@ class StagingUploadFiles(UploadFiles):
raise Restricted
return self._metadata
_compressions = {
b'\x1f\x8b\x08': gzip.open,
b'\x42\x5a\x68': bz2.open
}
def _file(self, path_object: PathObject, *args, compressed: bool = True, **kwargs) -> IO:
def _file(self, path_object: PathObject, *args, **kwargs) -> IO:
try:
open_compressed = None
if not compressed:
with open(path_object.os_path, 'rb') as f:
open_compressed = StagingUploadFiles._compressions[f.read(3)]
if open_compressed is not None and not compressed:
return open_compressed(path_object.os_path, *args, **kwargs)
else:
return open(path_object.os_path, *args, **kwargs)
return open(path_object.os_path, *args, **kwargs)
except FileNotFoundError:
raise KeyError()
def raw_file(self, file_path: str, *args, compressed: bool = False, **kwargs) -> IO:
def raw_file(self, file_path: str, *args, **kwargs) -> IO:
if not self._is_authorized():
raise Restricted
return self._file(self.raw_file_object(file_path), *args, **kwargs)
......@@ -432,7 +416,10 @@ class StagingUploadFiles(UploadFiles):
if move:
shutil.move(path, target_dir.os_path)
else:
shutil.copy(path, target_dir.os_path)
if os.path.isdir(path):
shutil.copytree(path, os.path.join(target_dir.os_path, os.path.dirname(path)))
else:
shutil.copy(path, target_dir.os_path)
@property
def is_frozen(self) -> bool:
......@@ -666,8 +653,7 @@ class PublicUploadFiles(UploadFiles):
raise KeyError()
def raw_file(self, file_path: str, *args, compressed: bool = True, **kwargs) -> IO:
assert compressed, 'not supported'
def raw_file(self, file_path: str, *args, **kwargs) -> IO:
return self._file('raw', 'bagit', 'data/' + file_path, *args, *kwargs)
def raw_file_manifest(self, path_prefix: str = None) -> Generator[str, None, None]:
......
......@@ -60,13 +60,23 @@ based on NOMAD-coe's *python-common* module.
"""
from typing import Callable, IO
import magic
import gzip
import bz2
from nomad import files
from nomad.parsing.backend import AbstractParserBackend, LocalBackend, LegacyLocalBackend, JSONStreamWriter, BadContextURI, WrongContextState
from nomad.parsing.parser import Parser, LegacyParser, VaspOutcarParser
from nomad.parsing.artificial import TemplateParser, GenerateRandomParser, ChaosParser
def match_parser(mainfile: str, open: Callable[[], IO]) -> 'Parser':
_compressions = {
b'\x1f\x8b\x08': ('gz', gzip.open),
b'\x42\x5a\x68': ('bz2', bz2.open)
}
def match_parser(mainfile: str, upload_files: files.StagingUploadFiles) -> 'Parser':
"""
Performs parser matching. This means it take the given mainfile and potentially
opens it with the given callback and tries to identify a parser that can parse
......@@ -81,7 +91,10 @@ def match_parser(mainfile: str, open: Callable[[], IO]) -> 'Parser':
Returns: The parser, or None if no parser could be matched.
"""
with open() as f:
with upload_files.raw_file(mainfile, 'rb') as f:
compression, open_compressed = _compressions.get(f.read(3), (None, open))
with open_compressed(upload_files.raw_file_object(mainfile).os_path, 'rb') as f:
buffer = f.read(2048)
mime_type = magic.from_buffer(buffer, mime=True)
......@@ -89,7 +102,7 @@ def match_parser(mainfile: str, open: Callable[[], IO]) -> 'Parser':
return None
for parser in parsers:
if parser.is_mainfile(mainfile, mime_type, buffer.decode('utf-8')):
if parser.is_mainfile(mainfile, mime_type, buffer.decode('utf-8'), compression):
return parser
return None
......@@ -108,7 +121,8 @@ parsers = [
r'?\s*<modeling>'
r'?\s*<generator>'
r'?\s*<i name="program" type="string">\s*vasp\s*</i>'
r'?')
r'?'),
supported_compressions=['gz', 'bz2']
),
VaspOutcarParser(
name='parsers/vasp',
......
......@@ -57,7 +57,7 @@ class TemplateParser(ArtificalParser):
"""
name = 'parsers/template'
def is_mainfile(self, filename: str, mime: str, buffer: str) -> bool:
def is_mainfile(self, filename: str, mime: str, buffer: str, compression: str = None) -> bool:
return filename.endswith('template.json')
def transform_value(self, name, value):
......@@ -124,7 +124,7 @@ class ChaosParser(ArtificalParser):
"""
name = 'parsers/chaos'
def is_mainfile(self, filename: str, mime: str, buffer: str) -> bool:
def is_mainfile(self, filename: str, mime: str, buffer: str, compression: str = None) -> bool:
return filename.endswith('chaos.json')
def run(self, mainfile: str, logger=None) -> LocalBackend:
......@@ -179,7 +179,7 @@ class GenerateRandomParser(TemplateParser):
self.template = json.load(open(template_file, 'r'))
self.random = None
def is_mainfile(self, filename: str, mime: str, buffer: str) -> bool:
def is_mainfile(self, filename: str, mime: str, buffer: str, compression: str = None) -> bool:
return os.path.basename(filename).startswith('random_')
def transform_section(self, name, section):
......
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from abc import ABCMeta, abstractmethod
import sys
import re
......@@ -30,17 +31,10 @@ class Parser(metaclass=ABCMeta):
"""
Instances specify a parser. It allows to find *main files* from given uploaded
and extracted files. Further, allows to run the parser on those 'main files'.
Arguments:
name: The name of the parser
parser_class_name: Full qualified name of the main parser class. We assume it have one
parameter for the backend.
main_file_re: A regexp that matches main file paths that this parser can handle.
main_contents_re: A regexp that matches main file headers that this parser can parse.
"""
@abstractmethod
def is_mainfile(self, filename: str, mime: str, buffer: str) -> bool:
def is_mainfile(self, filename: str, mime: str, buffer: str, compression: str = None) -> bool:
""" Checks if a file is a mainfile for the parsers. """
pass
......@@ -73,23 +67,27 @@ class LegacyParser(Parser):
mainfile_contents_re: A regexp that is used to match the first 1024 bytes of a
potential mainfile.
mainfile_name_re: A regexp that is used to match the paths of potential mainfiles
supported_compressions: A list of [gz, bz2], if the parser supports compressed files
"""
def __init__(
self, name: str, parser_class_name: str,
mainfile_contents_re: str,
mainfile_mime_re: str = r'text/.*',
mainfile_name_re: str = r'.*') -> None:
mainfile_name_re: str = r'.*',
supported_compressions: List[str] = []) -> None:
self.name = name
self.parser_class_name = parser_class_name
self._mainfile_mime_re = re.compile(mainfile_mime_re)
self._mainfile_name_re = re.compile(mainfile_name_re)
self._mainfile_contents_re = re.compile(mainfile_contents_re)
self._supported_compressions = supported_compressions
def is_mainfile(self, filename: str, mime: str, buffer: str) -> bool:
def is_mainfile(self, filename: str, mime: str, buffer: str, compression: str = None) -> bool:
return self._mainfile_name_re.match(filename) is not None and \
self._mainfile_mime_re.match(mime) is not None and \
self._mainfile_contents_re.search(buffer) is not None
self._mainfile_contents_re.search(buffer) is not None and \
(compression is None or compression in self._supported_compressions)
def run(self, mainfile: str, logger=None) -> LocalBackend:
# TODO we need a homogeneous interface to parsers, but we dont have it right now.
......
......@@ -450,7 +450,7 @@ class Upload(Chord):
"""
for filename in self.upload_files.raw_file_manifest():
try:
parser = match_parser(filename, lambda: self.upload_files.raw_file(filename, 'rb'))
parser = match_parser(filename, self.upload_files)
if parser is not None:
yield filename, parser
except Exception as e:
......
......@@ -20,7 +20,7 @@ import pytest
from nomadcore.local_meta_info import loadJsonFile
import nomad_meta_info
from nomad import utils
from nomad import utils, files
from nomad.parsing import JSONStreamWriter, parser_dict, match_parser
from nomad.parsing import LocalBackend, BadContextURI
......@@ -30,6 +30,7 @@ parser_examples = [
('parsers/exciting', 'tests/data/parsers/exciting/Ag/INFO.OUT'),
('parsers/exciting', 'tests/data/parsers/exciting/GW/INFO.OUT'),
('parsers/vasp', 'tests/data/parsers/vasp/vasp.xml'),
('parsers/vasp', 'tests/data/parsers/vasp_compressed/vasp.xml.gz'),
('parsers/vaspoutcar', 'tests/data/parsers/vasp_outcar/OUTCAR'),
('parsers/fhi-aims', 'tests/data/parsers/fhi-aims/aims.out'),
('parsers/cp2k', 'tests/data/parsers/cp2k/si_bulk8.out'),
......@@ -44,7 +45,7 @@ faulty_unknown_one_d_matid_example = [
('parsers/template', 'tests/data/normalizers/no_sim_cell_boolean_positions.json')
]
correct_num_output_files = 15
correct_num_output_files = 16
class TestLocalBackend(object):
......@@ -287,16 +288,15 @@ def test_parser(parser_name, mainfile):
assert_parser_result(parsed_example)
def test_match(no_warn):
directory = 'tests/data/parsers'
def test_match(raw_files, no_warn):
example_upload_id = 'example_upload_id'
upload_files = files.StagingUploadFiles(example_upload_id, create=True, is_authorized=lambda: True)
upload_files.add_rawfiles('tests/data/parsers')
count = 0
for dirpath, _, filenames in os.walk(directory):
for filename in filenames:
fullname = os.path.join(dirpath, filename)
parser = match_parser(fullname, lambda: open(fullname, 'rb'))
if parser is not None:
count += 1
else:
print(fullname)
for mainfile in upload_files.raw_file_manifest():
parser = match_parser(mainfile, upload_files)
if parser is not None:
count += 1
assert count == correct_num_output_files
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment