Commit 8df634f9 authored by Lauri Himanen

Major refactoring.

parent ff3bf906
-import json
 import os
 import logging
-logger = logging.getLogger(__name__)
-from abc import ABCMeta, abstractmethod
-from nomadcore.parser_backend import JsonParseEventsWriterBackend
-from nomadcore.simple_parser import SimpleParserBuilder, defaultParseFile
-from nomadcore.local_meta_info import loadJsonFile
 import StringIO
 import sys
+from abc import ABCMeta, abstractmethod
+from nomadcore.simple_parser import SimpleParserBuilder, defaultParseFile
+from nomadcore.caching_backend import CachingLevel, ActiveBackend
+logger = logging.getLogger(__name__)
 #===============================================================================
@@ -28,102 +26,28 @@ class NomadParser(object):
     interface that can be expected from each parser, but leaves the
     implementation details to the developer.
-    To initialize a NomadParser, you need to give it a JSON string in the
-    constructor. JSON is used because it is language-agnostic and can easily
-    be given as a run argument to the parser. An example of the JSON input
-    might look like this:
-        {
-            "metaInfoFile": "/home/metainfo.json",
-            "tmpDir": "/home",
-            "metainfoToKeep": ["energy"],
-            "metainfoToSkip": ["particle_forces"],
-            "files": {
-                "/home/output.out": "output",
-                "/home/input.inp": "input",
-                "/home/coords.xyz": ""
-            }
-        }
-    Here is an explanation of the different attributes:
-        - metaInfoFile: Path to the JSON file that contains the metainfo
-          definitions used by this parser.
-        - tmpDir: A temporary directory for data.
-        - metainfoToKeep: The metainfo names to parse. If empty, the parser
-          tries to parse everything except the names given in 'metainfoToSkip'.
-        - metainfoToSkip: A list of metainfo names that should be ignored.
-        - files: A dictionary of files. The key is the path to the file, and
-          the value is an optional identifier that can be provided now or
-          determined later by the parser.
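For orientation, a minimal sketch of how such a JSON string was handed to a parser under the old design, using the placeholder paths from the example above (CP2KParser is the concrete subclass touched later in this commit):

    import json
    import sys

    json_input = {
        "metaInfoFile": "/home/metainfo.json",
        "tmpDir": "/home",
        "metainfoToKeep": ["energy"],
        "metainfoToSkip": ["particle_forces"],
        "files": {"/home/output.out": "output", "/home/input.inp": "input"},
    }
    parser = CP2KParser(json.dumps(json_input), stream=sys.stdout)
    parser.parse()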
     Attributes:
-        input_json_string: A string containing the JSON input.
-        input_json_object: The JSON string decoded as an accessible object.
         files: A dictionary of file paths as keys and ids as values. These ids
             only include the ones given at initialization in the input JSON.
-        tmp_dir: Temporary directory location.
-        metainfo_file: Path to the file where the metainfos are declared.
-        metainfo_to_keep:
-        metainfo_to_skip:
         file_ids: A dictionary containing all the assigned ids as keys and their
             respective filepaths as values.
-        test_mode: A boolean for turning on test mode. In test mode the parsed
-            values are not converted to SI or formatted as JSON and they are not
-            sent to the backend but returned directly as one possibly large value.
-        backend: An object responsible for the JSON formatting and sending the
-            results to the scala layer.
+        metainfoenv: A dictionary of the metainfo definitions.
+        backend: An object responsible for the JSON formatting, unit conversion
+            and sending the results to the scala layer.
""" """
__metaclass__ = ABCMeta __metaclass__ = ABCMeta
-    def __init__(self, input_json_string, stream=sys.stdout, test_mode=False):
-        self.input_json_string = input_json_string
-        self.input_json_object = None
-        self.files = {}
-        self.tmp_dir = None
-        self.metainfo_file = None
-        self.metainfoenv = None
-        self.metainfos = {}
-        self.metainfo_to_keep = None
-        self.metainfo_to_skip = None
-        self.file_ids = {}
-        self.results = {}
-        self.filepaths_wo_id = None
-        self.test_mode = test_mode
-        self.backend = None
-        self.stream = stream
+    def __init__(self, parser_context):
+        self.files = parser_context.files
+        self.metainfoenv = parser_context.metainfoenv
+        self.backend = parser_context.backend
+        self.stream = parser_context.stream
+        self.version_id = parser_context.version_id
         self._file_handles = {}
         self._file_contents = {}
         self._file_sizes = {}
+        self.file_ids = {}
-        self.analyze_input_json()
-        self.setup_given_file_ids()
-    def analyze_input_json(self):
-        """Analyze the validity of the JSON string given as input.
-        """
-        # Try to decode the input JSON
-        try:
-            self.input_json_object = json.loads(self.input_json_string)
-        except ValueError as e:
-            logger.error("Error in decoding the given JSON input: {}".format(e))
-        # See if the needed attributes exist
-        self.metainfo_file = self.input_json_object.get("metaInfoFile")
-        if self.metainfo_file is None:
-            logger.error("No metainfo file path specified.")
-        self.tmp_dir = self.input_json_object.get("tmpDir")
-        if self.tmp_dir is None:
-            logger.error("No temporary folder specified.")
-        self.files = self.input_json_object.get("files")
-        if self.files is None:
-            logger.error("No files specified.")
-        self.metainfo_to_keep = self.input_json_object.get("metainfoToKeep")
-        self.metainfo_to_skip = self.input_json_object.get("metainfoToSkip")
-        # Try to decode the metainfo file and setup the backend
-        self.metainfoenv, warnings = loadJsonFile(self.metainfo_file)
-        self.backend = JsonParseEventsWriterBackend(self.metainfoenv, self.stream)
     def setup_given_file_ids(self):
         """Saves the file ids that were given in the JSON input.
@@ -132,24 +56,24 @@ class NomadParser(object):
             if file_id:
                 self.setup_file_id(path, file_id)
-    @abstractmethod
-    def setup_version(self):
-        """Do some version specific setup work. The parsers will have to
-        support many versions of the same code and the results of different
-        versions may have to be parsed differently.
-        With this function you should determine the version of the software,
-        and setup the version specific implementation of the parser.
-        """
-        pass
     @abstractmethod
     def parse(self):
         """Start the parsing. Will try to parse everything unless given special
         rules (metaInfoToKeep, metaInfoToSkip)."""
         pass
-    def parse_file(self, fileToParse, mainFileDescription, metaInfoEnv, backend, parserInfo):
+    def parse_file(
+            self,
+            fileToParse,
+            mainFileDescription,
+            metaInfoEnv,
+            backend,
+            parserInfo,
+            cachingLevelForMetaName={},
+            defaultDataCachingLevel=CachingLevel.ForwardAndCache,
+            defaultSectionCachingLevel=CachingLevel.Forward,
+            superContext=None,
+            onClose={}):
"""Uses the SimpleParser utilities to to parse a file. """Uses the SimpleParser utilities to to parse a file.
Args: Args:
...@@ -167,6 +91,15 @@ class NomadParser(object): ...@@ -167,6 +91,15 @@ class NomadParser(object):
if not parserBuilder.verifyMetaInfo(sys.stderr): if not parserBuilder.verifyMetaInfo(sys.stderr):
sys.exit(1) sys.exit(1)
+        # Setup the backend that caches and handles triggers
+        backend = ActiveBackend.activeBackend(
+            metaInfoEnv=metaInfoEnv,
+            cachingLevelForMetaName=cachingLevelForMetaName,
+            defaultDataCachingLevel=defaultDataCachingLevel,
+            defaultSectionCachingLevel=defaultSectionCachingLevel,
+            onClose=onClose,
+            superBackend=backend)
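With the new keyword arguments, a call might look like the following sketch, reusing the variables from the CP2K parse() method shown later in this commit; the metainfo name and caching level are illustrative choices, not prescribed by this method:

    self.parse_file(
        fileToParse=outputfilename,
        mainFileDescription=outputstructure,
        metaInfoEnv=metainfoenv,
        backend=backend,
        parserInfo={"name": "cp2k-parser", "version": "1.0"},
        cachingLevelForMetaName={"cp2k_svn_revision": CachingLevel.Cache},
        superContext=self)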
         # Compile the SimpleMatcher tree
         parserBuilder.compile()
         if logger.isEnabledFor(logging.DEBUG):
@@ -311,6 +244,19 @@ class NomadParser(object):
             self._file_sizes[file_id] = size
             return size
     # @abstractmethod
     # def get_supported_quantities(self):
     #     """Return a list of the nomad quantities that this parser supports. The
...
This diff is collapsed.
 import os
 import json
-from cp2kparser.implementation.parser import CP2KParser
+from cp2kparser.implementation.cp2kparserbuilder import CP2KParserBuilder
 import sys
@@ -33,12 +33,11 @@ def get_parser(path, test_mode=True, stream=sys.stdout):
     json_input = {
         "version": "nomadparsein.json 1.0",
         "metaInfoFile": metaInfoPath,
-        "tmpDir": "/home",
         "metainfoToKeep": [],
         "metainfoToSkip": [],
         "files": files
     }
-    parser = CP2KParser(json.dumps(json_input), test_mode=test_mode, stream=stream)
+    parser = CP2KParserBuilder(json.dumps(json_input), stream=stream).build_parser()
     return parser
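From the caller's side nothing changes; a usage sketch (the path is a placeholder):

    parser = get_parser("/path/to/cp2k/calculation")
    parser.parse()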
...
@@ -16,7 +16,7 @@ cp2k_input.xml.
 import xml.etree.cElementTree as ET
 import logging
 import cPickle as pickle
-from cp2kparser.engines.cp2kinputenginedata.input_tree import *
+from cp2kparser.implementation.cp2kinputenginedata.input_tree import *
 logger = logging
...
@@ -79,7 +79,7 @@ class CP2KInputEngine(object):
         return self.input_tree
-    def setup_version_number(self, version_number):
+    def setup_version(self, version_number):
        """The pickle file which contains preparsed data from the
        cp2k_input.xml is version specific. By calling this function before
        parsing, the correct file can be found.
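Call order matters here: the version must be set before parsing so that the right pickle is loaded. A usage sketch, mirroring the call site later in this commit (the "262" version id corresponds to CP2K 2.6.2 with the dots stripped):

    inputengine = CP2KInputEngine()
    inputengine.setup_version("262")
    input_tree = inputengine.parse(extended_input)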
...
This diff is collapsed.
-import re
 import os
-import re2 as re
+import logging
-from cp2kparser.generics.nomadparser import NomadParser
-from cp2kparser.implementation.regexs import *
-from cp2kparser.engines.regexengine import RegexEngine
 from cp2kparser.engines.csvengine import CSVEngine
-from cp2kparser.engines.cp2kinputengine import CP2KInputEngine
+from cp2kparser.implementation.cp2kinputparsers import CP2KInputEngine
-from cp2kparser.engines.xmlengine import XMLEngine
+from cp2kparser.implementation.outputparsers import *
 from nomadcore.coordinate_reader import CoordinateReader
-from nomadcore.unit_conversion.unit_conversion import convert_unit, ureg
+from cp2kparser.generics.nomadparser import NomadParser
-from nomadcore.simple_parser import SimpleMatcher as SM
-from cp2kparser.engines.cp2kinputenginedata.input_tree import CP2KInput
-import numpy as np
-import logging
-import sys
 logger = logging.getLogger(__name__)
-import math
 #===============================================================================
-class CP2KParser(NomadParser):
-    """The interface for a NoMaD CP2K parser. All parsing actions will go
-    through this class.
-    The CP2K version 2.6.2 was used as a reference for this basic
-    implementation. For other versions there should be classes that extend
-    from this one.
-    The functions that return certain quantities are tagged with a prefix '_Q_'
-    so that it is possible to automatically determine which quantities have at
-    least some level of support. With the tag they can also be looped through.
+class CP2KImplementation262(NomadParser):
+    """Defines the basic functions that are used to map results to the
+    corresponding NoMaD quantities.
+    This class provides the basic implementations; for version-specific
+    updates and additions, make a new class that inherits from this one.
     """
-    def __init__(self, input_json_string, stream=sys.stdout, test_mode=False):
+    def __init__(self, parser_context):
         # Initialize the base class
-        NomadParser.__init__(self, input_json_string, stream, test_mode)
+        NomadParser.__init__(self, parser_context)
         # Engines are created here
         self.csvengine = CSVEngine(self)
-        self.regexengine = RegexEngine(self)
-        self.xmlengine = XMLEngine(self)
         self.inputengine = CP2KInputEngine()
         self.atomsengine = CoordinateReader()
+        self.outputparser = globals()["CP2KOutputParser{}".format(self.version_id)]()
-        self.version_number = None
-        self.implementation = None
         self.input_tree = None
-        self.regexs = None
         self.extended_input = None
         self.determine_file_ids_pre_setup()
         self.input_preprocessor()
-        self.setup_version()
         self.determine_file_ids_post_setup()
-    def setup_version(self):
-        """Setups the version by looking at the output file and the version
-        specified in it.
-        """
-        # Determine the CP2K version from the output file
-        beginning = self.read_part_of_file("output", 2048)
-        version_regex = re.compile(r"CP2K\|\ version\ string:\s+CP2K\ version\ (\d+\.\d+\.\d+)\n")
-        self.version_number = version_regex.search(beginning).groups()[0].replace('.', '')
-        self.inputengine.setup_version_number(self.version_number)
-        self.input_tree = self.inputengine.parse(self.extended_input)
-        version_name = '_' + self.version_number + '_'
-        # Search for a version specific regex class
-        class_name = "CP2K{}Regexs".format(version_name)
-        self.regexs = globals().get(class_name)
-        if self.regexs:
-            logger.debug("Using version specific regexs '{}'.".format(class_name))
-            self.regexs = self.regexs()
-        else:
-            logger.debug("Using default regexs.")
-            self.regexs = globals()["CP2KRegexs"]()
-        # Search for a version specific implementation
-        class_name = "CP2K{}Implementation".format(version_name)
-        class_object = globals().get(class_name)
-        if class_object:
-            logger.debug("Using version specific implementation '{}'.".format(class_name))
-            self.implementation = class_object(self)
-        else:
-            logger.debug("Using default implementation.")
-            self.implementation = globals()["CP2KImplementation"](self)
-    def read_part_of_file(self, file_id, size=1024):
-        fh = self.get_file_handle(file_id)
-        buffer = fh.read(size)
-        return buffer
+    def determine_file_ids_pre_setup(self):
+        """First resolve the files that can be identified by extension.
+        """
+        # Input and output files
+        for file_path in self.files.iterkeys():
+            if file_path.endswith(".inp"):
+                self.setup_file_id(file_path, "input")
+            if file_path.endswith(".out"):
+                self.setup_file_id(file_path, "output")
+        # Include files
+        input_file = self.get_file_contents("input")
+        for line in input_file.split("\n"):
+            line = line.strip()
+            if line.startswith("@INCLUDE") or line.startswith("@include"):
+                split = line.split(None, 1)
+                filename = split[1]
+                if filename.startswith(('\"', '\'')) and filename.endswith(('\"', '\'')):
+                    filename = filename[1:-1]
+                filepath = self.search_file(filename)
+                self.setup_file_id(filepath, "include")
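For reference, the include resolution above reacts to CP2K input lines of this shape (the file name is illustrative):

    @INCLUDE 'forces.inc'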
     def input_preprocessor(self):
         """Preprocess the input file. Concatenate .inc files into the main input file and
@@ -167,29 +142,8 @@ class CP2KParser(NomadParser):
             input_variables_replaced.append(new_line)
         self.extended_input = '\n'.join(input_variables_replaced)
-        # print self.extended_input
+        self.inputengine.setup_version(self.version_id)
+        self.input_tree = self.inputengine.parse(self.extended_input)
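The variable substitution feeding input_variables_replaced covers CP2K's @SET mechanism (the substitution code itself sits in the collapsed part of this hunk). Assuming standard CP2K syntax, a fragment like

    @SET PROJECT_NAME water
    PROJECT $PROJECT_NAME

would reach the input engine with $PROJECT_NAME already replaced by water.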
-    def determine_file_ids_pre_setup(self):
-        """First resolve the files that can be identified by extension.
-        """
-        # Input and output files
-        for file_path in self.files.iterkeys():
-            if file_path.endswith(".inp"):
-                self.setup_file_id(file_path, "input")
-            if file_path.endswith(".out"):
-                self.setup_file_id(file_path, "output")
-        # Include files
-        input_file = self.get_file_contents("input")
-        for line in input_file.split("\n"):
-            line = line.strip()
-            if line.startswith("@INCLUDE") or line.startswith("@include"):
-                split = line.split(None, 1)
-                filename = split[1]
-                if filename.startswith(('\"', '\'')) and filename.endswith(('\"', '\'')):
-                    filename = filename[1:-1]
-                filepath = self.search_file(filename)
-                self.setup_file_id(filepath, "include")
     def determine_file_ids_post_setup(self):
         """Determines the file ids after the CP2K version has been set
@@ -312,106 +266,27 @@ class CP2KParser(NomadParser):
                     break
         return folders
-    def parse(self):
-        self.implementation.parse()
-    # def get_all_quantities(self):
-    #     """Parse all supported quantities."""
-    #     for method in self.get_supported_quantities:
-    #         self.get_quantity(method)
-    # def start_parsing(self, name):
-    #     """Inherited from NomadParser.
-    #     """
-    #     # Ask the implementation for the quantity
-    #     function = getattr(self.implementation, "_Q_" + name)
-    #     if function:
-    #         return function()
-    #     else:
-    #         logger.error("The function for quantity '{}' is not defined".format(name))
-    # def get_supported_quantities(self):
-    #     """Inherited from NomadParser.
-    #     """
-    #     supported_quantities = []
-    #     implementation_methods = [method for method in dir(self.implementation) if callable(getattr(self.implementation, method))]
-    #     for method in implementation_methods:
-    #         if method.startswith("_Q_"):
-    #             method = method[3:]
-    #             supported_quantities.append(method)
-    #     return supported_quantities
-#===============================================================================
-class CP2KImplementation(object):
-    """Defines the basic functions that are used to map results to the
-    corresponding NoMaD quantities.
-    This class provides the basic implementations; for version-specific
-    updates and additions, make a new class that inherits from this one.
-    The functions that return certain quantities are tagged with a prefix '_Q_'
-    so that it is possible to automatically determine which quantities have at
-    least some level of support. With the tag they can also be looped through.
-    """
-    def __init__(self, parser):
-        self.parser = parser
-        self.regexs = parser.regexs
-        self.regexengine = parser.regexengine
-        self.csvengine = parser.csvengine
-        self.atomsengine = parser.atomsengine
-        self.input_tree = parser.input_tree
-        # Define the output parsing tree for this version
-        self.outputstructure = SM(
-            name='root',
-            startReStr="",
-            subMatchers=[
-                SM(
-                    name='new_run',
-                    startReStr=r" DBCSR\| Multiplication driver",
-                    endReStr="[.\*]+PROGRAM STOPPED IN",
-                    required=True,
-                    sections=['section_run'],
-                    subMatchers=[
-                        SM(
-                            name="run_datetime",
-                            startReStr=r"[\*\s]+PROGRAM STARTED AT\s+(?P<cp2k_run_start_date>\d{4}-\d{2}-\d{2}) (?P<cp2k_run_start_time>\d{2}:\d{2}:\d{2}.\d{3})",
-                        ),
-                        SM(
-                            name="version",
-                            startReStr=r" CP2K\| version string:\s+(?P<program_version>[\w\d\W\s]+)",
-                        ),
-                        SM(
-                            name="svn_revision",
-                            startReStr=r" CP2K\| source code revision number:\s+svn:(?P<cp2k_svn_revision>\d+)",
-                        )
-                    ]
-                )
-            ]
-        )
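The matcher tree above targets CP2K output header lines of roughly this shape, reconstructed here from the regular expressions themselves rather than from a real log:

     DBCSR| Multiplication driver
     **** PROGRAM STARTED AT            2014-01-01 12:00:00.000
     CP2K| version string:              CP2K version 2.6.2
     CP2K| source code revision number: svn:14075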
     def parse(self):
         """Parses everything that can be found from the given files. The
         results are outputted to stdout by using the backend. The scala layer
         will then take over from there.
         """
         # Write the starting bracket
-        self.parser.stream.write("[")
+        self.stream.write("[")
         # Use the SimpleMatcher to extract most of the results
         parserInfo = {"name": "cp2k-parser", "version": "1.0"}
-        outputfilename = self.parser.get_file_handle("output").name
+        outputfilename = self.get_file_handle("output").name
-        metainfoenv = self.parser.metainfoenv
+        metainfoenv = self.metainfoenv
-        backend = self.parser.backend
+        backend = self.backend
-        outputstructure = self.outputstructure
+        outputstructure = self.outputparser.outputstructure
-        self.parser.parse_file(