Commit 3698e26f authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Added mime type based parser matching. Related to #35, #55, #99.

parent 73efe701
Pipeline #43641 canceled with stages
in 48 seconds
......@@ -20,7 +20,7 @@ from typing import Union, Callable, cast
from nomad import config, utils
from nomad.files import ArchiveBasedStagingUploadFiles
from nomad.parsing import parsers, parser_dict, LocalBackend
from nomad.parsing import parser_dict, LocalBackend, match_parser
from nomad.normalizing import normalizers
from .main import cli, api_base
......@@ -91,11 +91,7 @@ class CalcProcReproduction:
if parser_name is not None:
parser = parser_dict.get(parser_name)
else:
for potential_parser in parsers:
with self.upload_files.raw_file(self.mainfile) as mainfile_f:
if potential_parser.is_mainfile(self.mainfile, lambda fn: mainfile_f):
parser = potential_parser
break
parser = match_parser(self.mainfile, lambda: self.upload_files.raw_file(self.mainfile, 'rb'))
assert parser is not None, 'there is not parser matching %s' % self.mainfile
self.logger = self.logger.bind(parser=parser.name) # type: ignore
......
......@@ -92,15 +92,13 @@ def setup_elastic():
from nomad.search import Entry
Entry.init(index=config.elastic.index_name)
Entry._index._name = config.elastic.index_name
logger.info('initialized elastic index', index_name=config.elastic.index_name)
except RequestError as e:
if e.status_code == 400 and 'resource_already_exists_exception' in e.error:
pass # happens if two services try this at the same time
# happens if two services try this at the same time
pass
else:
raise e
else:
logger.info('init elastic index')
return elastic_client
......
......@@ -24,7 +24,7 @@ For now, we make a few assumption about parsers
- they have no conflicting python requirements
- they can be loaded at the same time and can be used within the same python process
- they are uniquely identified by a GIT URL and publicly accessible
- their version is uniquly identified by a GIT commit SHA
- their version is uniquely identified by a GIT commit SHA
Each parser is defined via an instance of :class:`Parser`.
......@@ -42,6 +42,10 @@ The parser definitions are available via the following two variables.
Parsers are reused for multiple calculations.
Parsers and calculation files are matched via regular expressions.
.. autofunction:: nomad.parsing.match_parser
Parsers in NOMAD-coe use a *backend* to create output. There are different NOMAD-coe
backends. In nomad@FAIRDI, we currently only use a single backend, a version of
NOMAD-coe's *LocalBackend*. It stores all parser results in memory. The following
......@@ -54,12 +58,43 @@ based on NOMAD-coe's *python-common* module.
:members:
"""
from typing import Callable, IO
import magic
from nomad.parsing.backend import AbstractParserBackend, LocalBackend, LegacyLocalBackend, JSONStreamWriter, BadContextURI, WrongContextState
from nomad.parsing.parser import Parser, LegacyParser, VaspOutcarParser
from nomad.parsing.artificial import TemplateParser, GenerateRandomParser, ChaosParser
def match_parser(mainfile: str, open: Callable[[], IO]) -> 'Parser':
    """
    Performs parser matching. This means it takes the given mainfile and potentially
    opens it with the given callback and tries to identify a parser that can parse
    the file.

    This is determined by filename (e.g. *.out), mime type (e.g. text/*, application/xml),
    and beginning file contents.

    Arguments:
        mainfile: The upload relative path to the mainfile
        open: A function that allows to open a stream to the file

    Returns: The parser, or None if no parser could be matched.
    """
    with open() as f:
        buffer = f.read(2048)

    mime_type = magic.from_buffer(buffer, mime=True)
    # Binary application/* files (except XML) cannot be a mainfile of any parser.
    if mime_type.startswith('application') and not mime_type.endswith('xml'):
        return None

    # Decode leniently: the fixed-size 2048 byte window may cut a multi-byte
    # UTF-8 character in half, and a text-like mime type does not guarantee
    # valid UTF-8 content. A strict decode would raise UnicodeDecodeError here.
    contents = buffer.decode('utf-8', errors='ignore')

    for parser in parsers:
        if parser.is_mainfile(mainfile, mime_type, contents):
            return parser

    return None
parsers = [
GenerateRandomParser(),
TemplateParser(),
......@@ -67,8 +102,8 @@ parsers = [
LegacyParser(
name='parsers/vasp',
parser_class_name='vaspparser.VASPRunParserInterface',
main_file_re=r'^.*\.xml(\.[^\.]*)?$',
main_contents_re=(
mainfile_mime_re=r'(application/xml)|(text/.*)',
mainfile_contents_re=(
r'^\s*<\?xml version="1\.0" encoding="ISO-8859-1"\?>\s*'
r'?\s*<modeling>'
r'?\s*<generator>'
......@@ -78,14 +113,14 @@ parsers = [
VaspOutcarParser(
name='parsers/vasp',
parser_class_name='vaspparser.VaspOutcarParser',
main_file_re=r'^OUTCAR(\.[^\.]*)?$',
main_contents_re=(r'^\svasp\..*$')
mainfile_name_re=r'(.*/)?OUTCAR(\.[^\.]*)?',
mainfile_contents_re=(r'^\svasp\.')
),
LegacyParser(
name='parsers/exciting',
parser_class_name='excitingparser.ExcitingParser',
main_file_re=r'^.*/INFO\.OUT?',
main_contents_re=(
mainfile_name_re=r'^.*/INFO\.OUT?',
mainfile_contents_re=(
r'^\s*=================================================+\s*'
r'\s*\|\s*EXCITING\s+\S+\s+started\s*='
r'\s*\|\s*version hash id:\s*\S*\s*=')
......@@ -93,8 +128,7 @@ parsers = [
LegacyParser(
name='parsers/fhi-aims',
parser_class_name='fhiaimsparser.FHIaimsParser',
main_file_re=r'^.*\.out$',
main_contents_re=(
mainfile_contents_re=(
r'^(.*\n)*'
r'?\s*Invoking FHI-aims \.\.\.'
r'?\s*Version')
......@@ -102,8 +136,7 @@ parsers = [
LegacyParser(
name='parsers/cp2k',
parser_class_name='cp2kparser.CP2KParser',
main_file_re=r'^.*\.out$', # This looks for files with .out
main_contents_re=(
mainfile_contents_re=(
r'\*\*\*\* \*\*\*\* \*\*\*\*\*\* \*\* PROGRAM STARTED AT\s.*\n'
r' \*\*\*\*\* \*\* \*\*\* \*\*\* \*\* PROGRAM STARTED ON\s*.*\n'
r' \*\* \*\*\*\* \*\*\*\*\*\* PROGRAM STARTED BY .*\n'
......@@ -114,8 +147,7 @@ parsers = [
LegacyParser(
name='parsers/crystal',
parser_class_name='crystalparser.CrystalParser',
main_file_re=r'^.*\.out$',
main_contents_re=(
mainfile_contents_re=(
r'\s*[\*]{22,}' # Looks for '*' 22 times or more in a row.
r'\s*\*\s{20,}\*' # Looks for a '*' sandwiched by whitespace.
r'\s*\*\s{10,}CRYSTAL(?P<majorVersion>[\d]+)\s{10,}\*'
......@@ -128,8 +160,7 @@ parsers = [
LegacyParser(
name='parsers/cpmd',
parser_class_name='cpmdparser.CPMDParser',
main_file_re=r'^.*\.out$',
main_contents_re=(
mainfile_contents_re=(
# r'\s+\*\*\*\*\*\* \*\*\*\*\*\* \*\*\*\* \*\*\*\* \*\*\*\*\*\*\s*'
# r'\s+\*\*\*\*\*\*\* \*\*\*\*\*\*\* \*\*\*\*\*\*\*\*\*\* \*\*\*\*\*\*\*\s+'
r'\*\*\* \*\* \*\*\* \*\* \*\*\*\* \*\* \*\* \*\*\*'
......@@ -143,8 +174,7 @@ parsers = [
LegacyParser(
name='parsers/nwchem',
parser_class_name='nwchemparser.NWChemParser',
main_file_re=r'^.*\.out$',
main_contents_re=(
mainfile_contents_re=(
r'\s+Northwest Computational Chemistry Package \(NWChem\) \d+\.\d+'
r'\s+------------------------------------------------------'
r'\s+Environmental Molecular Sciences Laboratory'
......@@ -155,8 +185,7 @@ parsers = [
LegacyParser(
name='parsers/bigdft',
parser_class_name='bigdftparser.BigDFTParser',
main_file_re=r'^.*\.out$',
main_contents_re=(
mainfile_contents_re=(
r'__________________________________ A fast and precise DFT wavelet code\s*'
r'\| \| \| \| \| \|\s*'
r'\| \| \| \| \| \| BBBB i gggggg\s*'
......@@ -187,8 +216,7 @@ parsers = [
LegacyParser(
name='parsers/wien2k',
parser_class_name='wien2kparser.Wien2kParser',
main_file_re=r'^.*\.scf$', # This looks for files with .scf
main_contents_re=r':ITE[0-9]+: 1. ITERATION'
mainfile_contents_re=r':LABEL\d+: using WIEN2k_\d+\.\d+'
)
]
......
......@@ -16,7 +16,6 @@
Parser for creating artificial test, benchmark, and demonstration data.
"""
from typing import Callable, IO, Any
import json
import os.path
import numpy as np
......@@ -58,7 +57,7 @@ class TemplateParser(ArtificalParser):
"""
name = 'parsers/template'
def is_mainfile(self, filename: str, open: Callable[[str], IO[Any]]) -> bool:
def is_mainfile(self, filename: str, mime: str, buffer: str) -> bool:
    """ Matches by filename only: template mainfiles end in ``template.json``. """
    # Mime type and contents are irrelevant for this artificial parser.
    target_suffix = 'template.json'
    return filename[-len(target_suffix):] == target_suffix
def transform_value(self, name, value):
......@@ -125,7 +124,7 @@ class ChaosParser(ArtificalParser):
"""
name = 'parsers/chaos'
def is_mainfile(self, filename: str, open: Callable[[str], IO[Any]]) -> bool:
def is_mainfile(self, filename: str, mime: str, buffer: str) -> bool:
    """ Matches by filename only: chaos mainfiles end in ``chaos.json``. """
    # Mime type and contents are irrelevant for this artificial parser.
    target_suffix = 'chaos.json'
    return filename[-len(target_suffix):] == target_suffix
def run(self, mainfile: str, logger=None) -> LocalBackend:
......@@ -180,7 +179,7 @@ class GenerateRandomParser(TemplateParser):
self.template = json.load(open(template_file, 'r'))
self.random = None
def is_mainfile(self, filename: str, open: Callable[[str], IO[Any]]) -> bool:
def is_mainfile(self, filename: str, mime: str, buffer: str) -> bool:
    """ Matches by filename only: random mainfiles have a ``random_`` name prefix. """
    # Only the base name matters; directories, mime type, and contents are ignored.
    base_name = os.path.basename(filename)
    return base_name[:len('random_')] == 'random_'
def transform_section(self, name, section):
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Callable, IO
from abc import ABCMeta, abstractmethod
import sys
import re
......@@ -41,8 +40,8 @@ class Parser(metaclass=ABCMeta):
"""
@abstractmethod
def is_mainfile(self, filename: str, open: Callable[[str], IO[Any]]) -> bool:
""" Checks if a file is a mainfile via the parsers ``main_contents_re``. """
def is_mainfile(self, filename: str, mime: str, buffer: str) -> bool:
    """
    Checks if a file is a mainfile for the parsers.

    Arguments:
        filename: The upload relative path of the potential mainfile.
        mime: The mime type of the file, determined from its first bytes.
        buffer: The decoded beginning of the file's contents.

    Returns: True if this parser can parse the given file.
    """
    pass
@abstractmethod
......@@ -70,37 +69,27 @@ class LegacyParser(Parser):
python_get: the git repository and commit that contains the legacy parser
parser_class_name: the main parser class that implements NOMAD-coe's
python-common *ParserInterface*. Instances of this class are currently not reused.
main_file_re: A regexp that is used to match the paths of potential mainfiles
main_contents_re: A regexp that is used to match the first 500 bytes of a
mainfile_mime_re: A regexp that is used to match against a files mime type
mainfile_contents_re: A regexp that is used to match the first 1024 bytes of a
potential mainfile.
mainfile_name_re: A regexp that is used to match the paths of potential mainfiles
"""
def __init__(
self, name: str, parser_class_name: str, main_file_re: str,
main_contents_re: str) -> None:
self, name: str, parser_class_name: str,
mainfile_contents_re: str,
mainfile_mime_re: str = r'text/.*',
mainfile_name_re: str = r'.*') -> None:
self.name = name
self.parser_class_name = parser_class_name
self._main_file_re = re.compile(main_file_re)
self._main_contents_re = re.compile(main_contents_re)
def is_mainfile(self, filename: str, open: Callable[[str], IO[Any]]) -> bool:
# Number of bytes to read at top of file. We might have to change this
# in the future since there is variable size information at the top of the
# file for instance for crystal parser.
num_bytes = 2000
if self._main_file_re.match(filename):
file = None
try:
file = open(filename)
contents = file.read(num_bytes)
fake_var = self._main_contents_re.search(contents) is not None
return fake_var
finally:
if file:
file.close()
return False
self._mainfile_mime_re = re.compile(mainfile_mime_re)
self._mainfile_name_re = re.compile(mainfile_name_re)
self._mainfile_contents_re = re.compile(mainfile_contents_re)
def is_mainfile(self, filename: str, mime: str, buffer: str) -> bool:
    """
    A file is a mainfile if its path, its mime type, and its beginning
    contents all match this parser's respective regular expressions.
    """
    # Guard clauses preserve the original short-circuit evaluation order:
    # name first, then mime, and only then the (more expensive) contents search.
    if self._mainfile_name_re.match(filename) is None:
        return False
    if self._mainfile_mime_re.match(mime) is None:
        return False
    return self._mainfile_contents_re.search(buffer) is not None
def run(self, mainfile: str, logger=None) -> LocalBackend:
# TODO we need a homogeneous interface to parsers, but we dont have it right now.
......
......@@ -33,7 +33,7 @@ from contextlib import contextmanager
from nomad import utils, coe_repo, config, infrastructure, search
from nomad.files import PathObject, UploadFiles, ExtractError, ArchiveBasedStagingUploadFiles
from nomad.processing.base import Proc, Chord, process, task, PENDING, SUCCESS, FAILURE
from nomad.parsing import parsers, parser_dict
from nomad.parsing import parser_dict, match_parser
from nomad.normalizing import normalizers
from nomad.datamodel import UploadWithMetadata, CalcWithMetadata
......@@ -449,15 +449,14 @@ class Upload(Chord):
Tuples of mainfile, filename, and parsers
"""
for filename in self.upload_files.raw_file_manifest():
for parser in parsers:
try:
with self.upload_files.raw_file(filename) as mainfile_f:
if parser.is_mainfile(filename, lambda fn: mainfile_f):
yield filename, parser
except Exception as e:
self.get_logger().error(
'exception while matching pot. mainfile',
mainfile=filename, exc_info=e)
try:
parser = match_parser(filename, lambda: self.upload_files.raw_file(filename, 'rb'))
if parser is not None:
yield filename, parser
except Exception as e:
self.get_logger().error(
'exception while matching pot. mainfile',
mainfile=filename, exc_info=e)
@task
def parse_all(self):
......
......@@ -107,10 +107,24 @@ def purged_app(celery_session_app):
@pytest.fixture(scope='session')
def worker(celery_session_worker):
def celery_inspect(purged_app):
yield purged_app.control.inspect()
@pytest.fixture(scope='session')
def worker(celery_session_worker, celery_inspect):
""" Provides a clean worker (no old tasks) per function. Waits for all tasks to be completed. """
pass
# wait until there are no more active tasks, to leave clean worker and queues for the next
# test run.
while True:
empty = True
for value in celery_inspect.active().values():
empty = empty and len(value) == 0
if empty:
break
@pytest.fixture(scope='session')
def mongo_infra(monkeysession):
......@@ -128,15 +142,28 @@ def mongo(mongo_infra):
def elastic_infra(monkeysession):
""" Provides elastic infrastructure to the session """
monkeysession.setattr('nomad.config.elastic', config.elastic._replace(index_name='test_nomad_fairdi_calcs'))
return infrastructure.setup_elastic()
try:
return infrastructure.setup_elastic()
except Exception:
# try to delete index, error might be caused by changed mapping
from elasticsearch_dsl import connections
connections.create_connection(hosts=['%s:%d' % (config.elastic.host, config.elastic.port)]) \
.indices.delete(index='test_nomad_fairdi_calcs')
return infrastructure.setup_elastic()
@pytest.fixture(scope='function')
def elastic(elastic_infra):
""" Provides a clean elastic per function. Clears elastic before test. """
elastic_infra.delete_by_query(
index='test_nomad_fairdi_calcs', body=dict(query=dict(match_all={})),
wait_for_completion=True, refresh=True)
while True:
try:
elastic_infra.delete_by_query(
index='test_nomad_fairdi_calcs', body=dict(query=dict(match_all={})),
wait_for_completion=True, refresh=True)
break
except Exception:
time.sleep(0.1)
assert infrastructure.elastic_client is not None
return elastic_infra
......
<?xml version="1.0" encoding="ISO-8859-1"?>
<modeling>
<generator>
<i name="program" type="string">vasp</i>
<i name="version" type="string">4.6.35 </i>
<i name="subversion" type="string">3Apr08 complex parallel</i>
<i name="platform" type="string">LinuxIFC</i>
<i name="date" type="string">2009 10 15</i>
<i name="time" type="string">18:03:46</i>
</generator>
MPI-parallelism will be employed.
------------------------------------------------------------
Invoking FHI-aims ...
Version 160210
Git rev. (modified): 3e7c3af [waf] move extension options to Configurat[...]
Compiled on 2016/02/25 at 08:54:27 on host node17.timewarp.
When using FHI-aims, please cite the following reference:
This diff is collapsed.
<?xml version="1.0" encoding="ISO-8859-1"?>
<modeling>
<generator>
<i name="program" type="string">vasp </i>
<i name="version" type="string">5.2.2 </i>
<i name="subversion" type="string">15Apr09 complex parallel </i>
<i name="platform" type="string">LinuxIFC </i>
This diff is collapsed.
<?xml version="1.0" encoding="ISO-8859-1"?>
<modeling>
<generator>
<i name="program" type="string">vasp </i>
<i name="version" type="string">5.3.2 </i>
<i name="subversion" type="string">13Sep12 (build Mar 19 2013 10:46:17) complex serial </i>
<i name="platform" type="string">LinuxIFC </i>
<i name="date" type="string">2013 12 05 </i>
<?xml version="1.0" encoding="ISO-8859-1"?>
<modeling>
<generator>
<i name="program" type="string">vasp</i>
<i name="version" type="string">4.6.35 </i>
<i name="subversion" type="string">3Apr08 complex parallel</i>
<i name="platform" type="string">LinuxIFC</i>
<i name="date" type="string">2013 04 21</i>
<i name="time" type="string">17:15:34</i>
</generator>
<?xml version="1.0" encoding="ISO-8859-1"?>
<modeling>
<generator>
<i name="program" type="string">vasp</i>
......@@ -21,7 +21,7 @@ from nomadcore.local_meta_info import loadJsonFile
import nomad_meta_info
from nomad import utils
from nomad.parsing import JSONStreamWriter, parser_dict
from nomad.parsing import JSONStreamWriter, parser_dict, match_parser
from nomad.parsing import LocalBackend, BadContextURI
parser_examples = [
......@@ -44,7 +44,7 @@ faulty_unknown_one_d_matid_example = [
('parsers/template', 'tests/data/normalizers/no_sim_cell_boolean_positions.json')
]
correct_num_output_files = 14
correct_num_output_files = 15
class TestLocalBackend(object):
......@@ -293,8 +293,10 @@ def test_match(no_warn):
for dirpath, _, filenames in os.walk(directory):
for filename in filenames:
fullname = os.path.join(dirpath, filename)
for parser in parser_dict.values():
if parser.is_mainfile(fullname, lambda fn: open(fn)):
count += 1
parser = match_parser(fullname, lambda: open(fullname, 'rb'))
if parser is not None:
count += 1
else:
print(fullname)
assert count == correct_num_output_files
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment