Commit 8df634f9 authored by Lauri Himanen

Major refactoring.

parent ff3bf906
import json
import os
import logging
logger = logging.getLogger(__name__)
from abc import ABCMeta, abstractmethod
from nomadcore.parser_backend import JsonParseEventsWriterBackend
from nomadcore.simple_parser import SimpleParserBuilder, defaultParseFile
from nomadcore.local_meta_info import loadJsonFile
import StringIO
import sys
from nomadcore.caching_backend import CachingLevel, ActiveBackend
#===============================================================================
@@ -28,102 +26,28 @@ class NomadParser(object):
interface that can be expected from each parser, but leaves the
implementation details to the developer.
To initialize a NomadParser, you need to give it a JSON string in the
constructor. JSON is used because it is language-agnostic and can easily
be given as a run argument to the parser. An example of the input JSON
might look like this:
{
    "metaInfoFile": "/home/metainfo.json",
    "tmpDir": "/home",
    "metainfoToKeep": ["energy"],
    "metainfoToSkip": ["particle_forces"],
    "files": {
        "/home/output.out": "output",
        "/home/input.inp": "input",
        "/home/coords.xyz": ""
    }
}
Here is an explanation of the different attributes:
- metaInfoFile: Path to the metainfo JSON file that contains the metainfo
  definitions used by this parser.
- tmpDir: A temporary directory for data.
- metainfoToKeep: The metainfo names that should be parsed. If empty, the
  parser tries to parse everything except the names listed in
  'metainfoToSkip'.
- metainfoToSkip: A list of metainfo names that should be ignored.
- files: A dictionary of files. The key is the path to the file, and the
  value is an optional identifier that can be provided now or determined
  later by the parser.
Attributes:
input_json_string: A string containing the JSON input.
input_json_object: The JSON string decoded as an accessible object.
files: A dictionary with file paths as keys and file IDs as values. These
IDs only include the ones given at initialization in the input JSON.
tmp_dir: Temporary directory location.
metainfo_file: Path to the file where the metainfo definitions are declared.
metainfo_to_keep: A list of metainfo names that should be parsed.
metainfo_to_skip: A list of metainfo names that should be ignored.
file_ids: A dictionary containing all the assigned IDs as keys and their
respective file paths as values.
test_mode: A boolean for turning on test mode. In test mode the parsed
values are not converted to SI or formatted as JSON and they are not
sent to the backend but returned directly as one possibly large value.
metainfoenv: A dictionary for the metainfo definitions.
backend: An object responsible for the JSON formatting, unit conversion
and sending the results to the scala layer.
"""
__metaclass__ = ABCMeta
def __init__(self, input_json_string, stream=sys.stdout, test_mode=False):
self.input_json_string = input_json_string
self.input_json_object = None
self.files = {}
self.tmp_dir = None
self.metainfo_file = None
self.metainfoenv = None
self.metainfos = {}
self.metainfo_to_keep = None
self.metainfo_to_skip = None
self.file_ids = {}
self.results = {}
self.filepaths_wo_id = None
self.test_mode = test_mode
self.backend = None
self.stream = stream
def __init__(self, parser_context):
self.files = parser_context.files
self.metainfoenv = parser_context.metainfoenv
self.backend = parser_context.backend
self.stream = parser_context.stream
self.version_id = parser_context.version_id
self._file_handles = {}
self._file_contents = {}
self._file_sizes = {}
self.analyze_input_json()
self.setup_given_file_ids()
def analyze_input_json(self):
"""Analyze the validity of the JSON string given as input.
"""
# Try to decode the input JSON
try:
self.input_json_object = json.loads(self.input_json_string)
except ValueError as e:
logger.error("Error in decoding the given JSON input: {}".format(e))
# See if the needed attributes exist
self.metainfo_file = self.input_json_object.get("metaInfoFile")
if self.metainfo_file is None:
logger.error("No metainfo file path specified.")
self.tmp_dir = self.input_json_object.get("tmpDir")
if self.tmp_dir is None:
logger.error("No temporary folder specified.")
self.files = self.input_json_object.get("files")
if self.files is None:
logger.error("No files specified.")
self.metainfo_to_keep = self.input_json_object.get("metainfoToKeep")
self.metainfo_to_skip = self.input_json_object.get("metainfoToSkip")
# Try to decode the metainfo file and setup the backend
self.metainfoenv, warnings = loadJsonFile(self.metainfo_file)
self.backend = JsonParseEventsWriterBackend(self.metainfoenv, self.stream)
self.file_ids = {}
def setup_given_file_ids(self):
"""Saves the file id's that were given in the JSON input.
@@ -132,24 +56,24 @@ class NomadParser(object):
if file_id:
self.setup_file_id(path, file_id)
@abstractmethod
def setup_version(self):
"""Do some version specific setup work. The parsers will have to
support many versions of the same code and the results of different
versions may have to be parsed differently.
With this function you should determine the version of the software,
and setup the version specific implementation of the parser.
"""
pass
@abstractmethod
def parse(self):
"""Start the parsing. Will try to parse everything unless given special
rules (metaInfoToKeep, metaInfoToSkip)."""
pass
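# Illustrative only, not part of this commit: a minimal sketch of how a
# concrete parser is expected to fill in the abstract interface above. The
# class name and version value are hypothetical.
#
# class MyCodeParser(NomadParser):
#
#     def setup_version(self):
#         # e.g. peek at the output file, extract the version string and
#         # select a version-specific implementation
#         self.version_id = "262"
#
#     def parse(self):
#         # parse everything unless metainfoToKeep/metainfoToSkip restrict it
#         self.implementation.parse()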
def parse_file(self, fileToParse, mainFileDescription, metaInfoEnv, backend, parserInfo):
def parse_file(
self,
fileToParse,
mainFileDescription,
metaInfoEnv,
backend,
parserInfo,
cachingLevelForMetaName={},
defaultDataCachingLevel=CachingLevel.ForwardAndCache,
defaultSectionCachingLevel=CachingLevel.Forward,
superContext=None,
onClose={}):
"""Uses the SimpleParser utilities to to parse a file.
Args:
@@ -167,6 +91,15 @@ class NomadParser(object):
if not parserBuilder.verifyMetaInfo(sys.stderr):
sys.exit(1)
# Setup the backend that caches and handles triggers
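# The caching level decides, per metainfo name, whether a parsed value is
# forwarded directly to the superBackend, cached for the onClose callbacks,
# or both; per the defaults above, data is forwarded and cached
# (ForwardAndCache) while sections are only forwarded (Forward).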
backend = ActiveBackend.activeBackend(
metaInfoEnv=metaInfoEnv,
cachingLevelForMetaName=cachingLevelForMetaName,
defaultDataCachingLevel=defaultDataCachingLevel,
defaultSectionCachingLevel=defaultSectionCachingLevel,
onClose=onClose,
superBackend=backend)
# Compile the SimpleMatcher tree
parserBuilder.compile()
if logger.isEnabledFor(logging.DEBUG):
@@ -311,6 +244,19 @@ class NomadParser(object):
self._file_sizes[file_id] = size
return size
# @abstractmethod
# def get_supported_quantities(self):
# """Return a list of the nomad quantities that this parser supports. The
......
import os
import json
from cp2kparser.implementation.parser import CP2KParser
from cp2kparser.implementation.cp2kparserbuilder import CP2KParserBuilder
import sys
@@ -33,12 +33,11 @@ def get_parser(path, test_mode=True, stream=sys.stdout):
json_input = {
"version": "nomadparsein.json 1.0",
"metaInfoFile": metaInfoPath,
"tmpDir": "/home",
"metainfoToKeep": [],
"metainfoToSkip": [],
"files": files
}
parser = CP2KParser(json.dumps(json_input), test_mode=test_mode, stream=stream)
parser = CP2KParserBuilder(json.dumps(json_input), stream=stream).build_parser()
return parser
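# Illustrative usage of the factory above (the path is hypothetical):
#
#   parser = get_parser("/home/calculations/cp2k_run")
#   parser.parse()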
......
@@ -16,7 +16,7 @@ cp2k_input.xml.
import xml.etree.cElementTree as ET
import logging
import cPickle as pickle
from cp2kparser.engines.cp2kinputenginedata.input_tree import *
from cp2kparser.implementation.cp2kinputenginedata.input_tree import *
logger = logging
......
@@ -79,7 +79,7 @@ class CP2KInputEngine(object):
return self.input_tree
def setup_version_number(self, version_number):
def setup_version(self, version_number):
""" The pickle file which contains preparsed data from the
cp2k_input.xml is version specific. By calling this function before
parsing the correct file can be found.
......
import re
import os
import re2 as re
from cp2kparser.generics.nomadparser import NomadParser
from cp2kparser.implementation.regexs import *
from cp2kparser.engines.regexengine import RegexEngine
import logging
from cp2kparser.engines.csvengine import CSVEngine
from cp2kparser.engines.cp2kinputengine import CP2KInputEngine
from cp2kparser.engines.xmlengine import XMLEngine
from cp2kparser.implementation.cp2kinputparsers import CP2KInputEngine
from cp2kparser.implementation.outputparsers import *
from nomadcore.coordinate_reader import CoordinateReader
from nomadcore.unit_conversion.unit_conversion import convert_unit, ureg
from nomadcore.simple_parser import SimpleMatcher as SM
from cp2kparser.engines.cp2kinputenginedata.input_tree import CP2KInput
import numpy as np
import sys
logger = logging.getLogger(__name__)
import math
#===============================================================================
class CP2KParser(NomadParser):
"""The interface for a NoMaD CP2K parser. All parsing actions will go
through this class.
class CP2KImplementation262(NomadParser):
"""Defines the basic functions that are used to map results to the
corresponding NoMaD quantities.
The CP2K version 2.6.2 was used as a reference for this basic
implementation. For other versions there should be classes that extend from
this.
This class provides the basic implementations; for version-specific
updates and additions, please make a new class that inherits from this one.
The functions that return certain quantities are tagged with the prefix
'_Q_' so that it is possible to automatically determine which quantities
have at least some level of support. The tag also makes them easy to loop through.
"""
def __init__(self, input_json_string, stream=sys.stdout, test_mode=False):
def __init__(self, parser_context):
# Initialize the base class
NomadParser.__init__(self, input_json_string, stream, test_mode)
NomadParser.__init__(self, parser_context)
# Engines are created here
self.csvengine = CSVEngine(self)
self.regexengine = RegexEngine(self)
self.xmlengine = XMLEngine(self)
self.inputengine = CP2KInputEngine()
self.atomsengine = CoordinateReader()
self.outputparser = globals()["CP2KOutputParser{}".format(self.version_id)]()
self.version_number = None
self.implementation = None
self.input_tree = None
self.regexs = None
self.extended_input = None
self.determine_file_ids_pre_setup()
self.input_preprocessor()
self.setup_version()
self.determine_file_ids_post_setup()
def setup_version(self):
"""Setups the version by looking at the output file and the version
specified in it.
def determine_file_ids_pre_setup(self):
"""First resolve the files that can be identified by extension.
"""
# Determine the CP2K version from the output file
beginning = self.read_part_of_file("output", 2048)
version_regex = re.compile(r"CP2K\|\ version\ string:\s+CP2K\ version\ (\d+\.\d+\.\d+)\n")
self.version_number = version_regex.search(beginning).groups()[0].replace('.', '')
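# e.g. the output line " CP2K| version string: CP2K version 2.6.2" yields
# "262"; the dots are stripped so the number can be embedded in the
# version-specific class names looked up below (e.g. CP2K_262_Implementation).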
self.inputengine.setup_version_number(self.version_number)
self.input_tree = self.inputengine.parse(self.extended_input)
version_name = '_' + self.version_number + '_'
# Search for a version specific regex class
class_name = "CP2K{}Regexs".format(version_name)
self.regexs = globals().get(class_name)
if self.regexs:
logger.debug("Using version specific regexs '{}'.".format(class_name))
self.regexs = self.regexs()
else:
logger.debug("Using default regexs.")
self.regexs = globals()["CP2KRegexs"]()
# Search for a version specific implementation
class_name = "CP2K{}Implementation".format(version_name)
class_object = globals().get(class_name)
if class_object:
logger.debug("Using version specific implementation '{}'.".format(class_name))
self.implementation = class_object(self)
else:
logger.debug("Using default implementation.")
self.implementation = globals()["CP2KImplementation"](self)
# Input and output files
for file_path in self.files.iterkeys():
if file_path.endswith(".inp"):
self.setup_file_id(file_path, "input")
if file_path.endswith(".out"):
self.setup_file_id(file_path, "output")
def read_part_of_file(self, file_id, size=1024):
fh = self.get_file_handle(file_id)
buffer = fh.read(size)
return buffer
# Include files
input_file = self.get_file_contents("input")
for line in input_file.split("\n"):
line = line.strip()
if line.startswith("@INCLUDE") or line.startswith("@include"):
split = line.split(None, 1)
filename = split[1]
if filename.startswith(('\"', '\'')) and filename.endswith(('\"', '\'')):
filename = filename[1:-1]
filepath = self.search_file(filename)
self.setup_file_id(filepath, "include")
def input_preprocessor(self):
"""Preprocess the input file. Concatenate .inc files into the main input file and
@@ -167,29 +142,8 @@ class CP2KParser(NomadParser):
input_variables_replaced.append(new_line)
self.extended_input = '\n'.join(input_variables_replaced)
# print self.extended_input
self.inputengine.setup_version(self.version_id)
self.input_tree = self.inputengine.parse(self.extended_input)
def determine_file_ids_post_setup(self):
"""Determines the file id's after the CP2K verion has been set
@@ -312,106 +266,27 @@ class CP2KParser(NomadParser):
break
return folders
def parse(self):
self.implementation.parse()
# def get_all_quantities(self):
# """Parse all supported quantities."""
# for method in self.get_supported_quantities:
# self.get_quantity(method)
# def start_parsing(self, name):
# """Inherited from NomadParser.
# """
# # Ask the implementation for the quantity
# function = getattr(self.implementation, "_Q_" + name)
# if function:
# return function()
# else:
# logger.error("The function for quantity '{}' is not defined".format(name))
# def get_supported_quantities(self):
# """Inherited from NomadParser.
# """
# supported_quantities = []
# implementation_methods = [method for method in dir(self.implementation) if callable(getattr(self.implementation, method))]
# for method in implementation_methods:
# if method.startswith("_Q_"):
# method = method[3:]
# supported_quantities.append(method)
# return supported_quantities
#===============================================================================
class CP2KImplementation(object):
"""Defines the basic functions that are used to map results to the
corresponding NoMaD quantities.
This class provides the basic implementations; for version-specific
updates and additions, please make a new class that inherits from this one.
The functions that return certain quantities are tagged with the prefix
'_Q_' so that it is possible to automatically determine which quantities
have at least some level of support. The tag also makes them easy to loop through.
"""
def __init__(self, parser):
self.parser = parser
self.regexs = parser.regexs
self.regexengine = parser.regexengine
self.csvengine = parser.csvengine
self.atomsengine = parser.atomsengine
self.input_tree = parser.input_tree
# Define the output parsing tree for this version
self.outputstructure = SM(
name='root',
startReStr="",
subMatchers=[
SM(
name='new_run',
startReStr=r" DBCSR\| Multiplication driver",
endReStr="[.\*]+PROGRAM STOPPED IN",
required=True,
sections=['section_run'],
subMatchers=[
SM(
name="run_datetime",
startReStr=r"[\*\s]+PROGRAM STARTED AT\s+(?P<cp2k_run_start_date>\d{4}-\d{2}-\d{2}) (?P<cp2k_run_start_time>\d{2}:\d{2}:\d{2}.\d{3})",
),
SM(
name="version",
startReStr=r" CP2K\| version string:\s+(?P<program_version>[\w\d\W\s]+)",
),
SM(
name="svn_revision",
startReStr=r" CP2K\| source code revision number:\s+svn:(?P<cp2k_svn_revision>\d+)",
)
]
)
]
)
def parse(self):
"""Parses everything that can be found from the given files. The
results are outputted to std.out by using the backend. The scala layer
will the take on from that.
"""
# Write the starting bracket
self.parser.stream.write("[")
self.stream.write("[")
# Use the SimpleMatcher to extract most of the results
parserInfo = {"name": "cp2k-parser", "version": "1.0"}
outputfilename = self.parser.get_file_handle("output").name
metainfoenv = self.parser.metainfoenv
backend = self.parser.backend
outputstructure = self.outputstructure
self.parser.parse_file(outputfilename, outputstructure, metainfoenv, backend, parserInfo)
outputfilename = self.get_file_handle("output").name
metainfoenv = self.metainfoenv
backend = self.backend
outputstructure = self.outputparser.outputstructure
cachingLevelForMetaName = self.outputparser.cachingLevelForMetaName
self.parse_file(outputfilename, outputstructure, metainfoenv, backend, parserInfo, cachingLevelForMetaName)
# Then extract the things that cannot be extracted by the SimpleMatcher
# Write the ending bracket
self.parser.stream.write("]\n")
self.stream.write("]\n")
# def dateconverter(datestring):
@@ -758,7 +633,7 @@ class CP2KImplementation(object):
logger.error("Could not find cell declaration.")
#===============================================================================
class CP2K_262_Implementation(CP2KImplementation):
def __init__(self, parser):
CP2KImplementation.__init__(self, parser)
# #===============================================================================
# class CP2K_262_Implementation(CP2KImplementation):
# def __init__(self, parser):
# CP2KImplementation.__init__(self, parser)
from nomadcore.simple_parser import SimpleMatcher as SM
from nomadcore.caching_backend import CachingLevel
#===============================================================================
class CP2KOutputParser262(object):
"""The object that goes through the CP2K outputfile and parses everything
it can using the SimpleParser architecture.
"""
# Define the output parsing tree for this version
outputstructure = SM(
name='root',
startReStr="",
subMatchers=[
SM(
name='new_run',
startReStr=r" DBCSR\| Multiplication driver",
endReStr="[.\*]+PROGRAM STOPPED IN",
required=True,
sections=['section_run'],
subMatchers=[
SM(
name="run_datetime",
startReStr=r"[\*\s]+PROGRAM STARTED AT\s+(?P<cp2k_run_start_date>\d{4}-\d{2}-\d{2}) (?P<cp2k_run_start_time>\d{2}:\d{2}:\d{2}.\d{3})",
),
SM(
name="version",
startReStr=r" CP2K\| version string:\s+(?P<program_version>[\w\d\W\s]+)",
),
SM(
name="svn_revision",
startReStr=r" CP2K\| source code revision number:\s+svn:(?P<cp2k_svn_revision>\d+)",
),
SM(
name="cell",
startReStr=" CELL\|",
forwardMatch=True,
subMatchers=[
SM(
name="cell_a",
startReStr=" CELL\| Vector a \[angstrom\]:\s+(?P<cp2k_cell_vector_a>[\d\.]+\s+[\d\.]+\s+[\d\.]+)+"
),
SM(
name="cell_b",
startReStr=" CELL\| Vector b \[angstrom\]:\s+(?P<cp2k_cell_vector_b>[\d\.]+\s+[\d\.]+\s+[\d\.]+)+"
),
SM(
name="cell_c",
startReStr=" CELL\| Vector c \[angstrom\]:\s+(?P<cp2k_cell_vector_c>[\d\.]+\s+[\d\.]+\s+[\d\.]+)+"
),
]
)
]
)
]
)
# The cache settings
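# A sketch of what these settings might contain (the actual dictionary is
# collapsed from this diff; the entries below are only a guess based on the
# metainfo names captured by the matchers above):
#
# cachingLevelForMetaName = {
#     "cp2k_cell_vector_a": CachingLevel.Cache,
#     "cp2k_cell_vector_b": CachingLevel.Cache,
#     "cp2k_cell_vector_c": CachingLevel.Cache,
# }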