Commit cb222142 authored by Lauri Himanen's avatar Lauri Himanen
Browse files

Better logic for parsing input files, added support for aliases and default/lone keyword values.

parent 88166032
......@@ -33,9 +33,9 @@ the common parser structure when it is available.
# Structure
Currently the python package is divided into three subpackages:
- Engines: Classes for parsing different types of files
- Generics: Generic utility classes and base classes
- Implementation: The classes that actually define the parser functionality.
- Engines: Classes for parsing different types of files
- Generics: Generic utility classes and base classes
- Implementation: The classes that actually define the parser functionality.
# Reusable components and ideas for other parsers
......@@ -56,13 +56,13 @@ the performance of an engine but if the function calls remain the same no other
code has to be changed.
Currently implemented engines that could be reused (not tested properly yet):
- RegexEngine: For parsing text files with regular expressions. Uses the re2
library if available (falls back to default python regex implementation if
re2 not found).
- XYZEngine: For parsing XYZ files and files with similar structure. Has a very
flexible nature as you can specify comments, column delimiters, column
indices and the patterns used to separate different configurations.
- XMLEngine: For parsing XML files using XPath syntax.
- RegexEngine: For parsing text files with regular expressions. Uses the re2
library if available (falls back to default python regex implementation if
re2 not found).
- XYZEngine: For parsing XYZ files and files with similar structure. Has a very
flexible nature as you can specify comments, column delimiters, column
indices and the patterns used to separate different configurations.
- XMLEngine: For parsing XML files using XPath syntax.
## NomadParser base class
In the generics folder there is a module called nomadparser.py that defines a
......@@ -74,11 +74,11 @@ the scala code (will be modified later to conform to the common interface).
This class is also responsible for some common tasks that are present in all
parsers:
- Unit conversion
- JSON encoding
- Caching
- Time measurement for performance analysis
- Providing file contents, sizes and handles
- Unit conversion
- JSON encoding
- Caching
- Time measurement for performance analysis
- Providing file contents, sizes and handles
## Logging
Python has a great [logging package](https://docs.python.org/2/library/logging.html) which helps in
......
This diff is collapsed.
"""The classes which make up the CP2K input tree.
These are defined in their own module, instead of the xmlpreparser module,
because these classes are pickled incorrectly if they are defined in the same
file that is run from the console (their module will then be __main__).
"""
from collections import defaultdict
#===============================================================================
class Keyword(object):
    """A single keyword of a CP2K input section.

    Attributes:
        value: The value assigned to the keyword; starts out as None.
        default_name: The name passed in as the keyword's default name.
        default_value: The value passed in as the keyword's default value.
    """

    def __init__(self, default_name, default_value):
        # Defaults are supplied by the caller; the actual value is filled in
        # later, so it is initialised as None.
        self.default_value = default_value
        self.default_name = default_name
        self.value = None
#===============================================================================
class Section(object):
    """An input section in a CP2K calculation.

    A section holds keywords, an optional section parameter object and nested
    subsections. Keywords and subsections are stored as lists keyed by name,
    because the same name may be registered more than once.

    All lookups take a "/"-separated path that is resolved relative to this
    section, e.g. "FORCE_EVAL/DFT/XC". The last component of a keyword path is
    the keyword name. Lookups that cannot be resolved return None instead of
    raising.

    Attributes:
        name: Name of the section.
        keywords: defaultdict mapping a keyword name to a list of keyword
            objects.
        default_keyword: Text collected by set_keyword() for entries that do
            not match a uniquely defined keyword.
        parameter: The section parameter object, or None.
        sections: defaultdict mapping a subsection name to a list of Section
            objects.
    """

    def __init__(self, name):
        self.name = name
        self.keywords = defaultdict(list)
        self.default_keyword = ""
        self.parameter = None
        self.sections = defaultdict(list)

    def get_section(self, path):
        """Return the subsection found at the given path.

        Returns None if any part of the path does not exist or is repeated
        (repeated sections are not supported yet).
        """
        section = self
        for part in path.split("/"):
            # .get() is used so that the lookup does not create empty entries
            # in the defaultdict.
            candidates = section.sections.get(part)
            if not candidates or len(candidates) != 1:
                return None
            section = candidates[0]
        return section

    def _split_keyword_path(self, path):
        """Split a keyword path into (section object, last path component).

        A path without any "/" refers to this section itself. The returned
        section is None if the section part of the path cannot be resolved.
        """
        section_path, separator, keyword_name = path.rpartition("/")
        if separator:
            section = self.get_section(section_path)
        else:
            section = self
        return section, keyword_name

    def get_keyword_object(self, path):
        """Return the keyword object at the given path.

        Returns None if the section cannot be resolved, or if the keyword does
        not exist or has more than one entry.
        """
        section, keyword_name = self._split_keyword_path(path)
        if section is None:
            return None
        entries = section.keywords.get(keyword_name)
        if entries and len(entries) == 1:
            return entries[0]
        return None

    def get_keyword(self, path):
        """Return the value of the keyword at the given path, or None."""
        keyword = self.get_keyword_object(path)
        if keyword is None:
            return None
        return keyword.value

    def get_default_keyword(self, path):
        """Return the default keyword text of the section at the given path.

        Returns None if the section cannot be resolved.
        """
        section = self.get_section(path)
        if section is None:
            return None
        return section.default_keyword

    def set_keyword(self, path, value):
        """Set the value of the keyword at the given path.

        If no unique keyword object is found, the last path component is
        appended (on a new line) to the default keyword text of the section
        and the given value is not used. If the section itself cannot be
        resolved, a message is printed and nothing is stored.
        """
        keyword = self.get_keyword_object(path)
        if keyword is not None:
            keyword.value = value
            return
        section, text = self._split_keyword_path(path)
        if section is None:
            print("The section for keyword '{}' could not be found.".format(path))
            return
        section.default_keyword += '\n' + text

    def get_keyword_default(self, path):
        """Return the default value of the keyword at the given path, or None."""
        keyword = self.get_keyword_object(path)
        if keyword is None:
            return None
        return keyword.default_value

    def get_parameter_object(self, path):
        """Return the section parameter object of the section at the given path.

        Prints a message and returns None if the section or its parameter
        object is missing.
        """
        section = self.get_section(path)
        parameter = section.parameter if section is not None else None
        if parameter is None:
            # Single-argument call form prints the same text under both
            # Python 2 and Python 3.
            print("The section parameters object '{}' could not be found.".format(path))
        return parameter

    def get_parameter(self, path):
        """Return the value of the section parameter at the given path, or None."""
        parameter = self.get_parameter_object(path)
        if parameter is None:
            return None
        return parameter.value

    def set_parameter(self, path, value):
        """Set the value of the section parameter at the given path.

        Does nothing (apart from the message printed by
        get_parameter_object) if the parameter object is missing.
        """
        parameter = self.get_parameter_object(path)
        if parameter is not None:
            parameter.value = value

    def get_parameter_lone(self, path):
        """Return the lone value of the section parameter at the given path, or None."""
        parameter = self.get_parameter_object(path)
        if parameter is None:
            return None
        return parameter.lone_value

    def get_parameter_default(self, path):
        """Return the default value of the section parameter at the given path, or None."""
        parameter = self.get_parameter_object(path)
        if parameter is None:
            return None
        return parameter.default_value
#===============================================================================
class SectionParameters(object):
    """The parameter of a CP2K input section.

    A section parameter is the short value that may follow a section name
    directly, e.g. in "&PRINT ON" the section parameter is ON.

    Attributes:
        value: The assigned value; starts out as None.
        default_value: The default value passed in by the caller.
        lone_value: The lone value passed in by the caller.
    """

    def __init__(self, default_value, lone_value):
        # The caller-provided values are stored as given; the actual value is
        # filled in later.
        self.lone_value = lone_value
        self.default_value = default_value
        self.value = None
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Provides functions for creating a python object representing a CP2K input
structure.
Creates preparsed versions of the cp2k_input.xmls and pickles them (python
version of serialization). The pickle files can then be easily reused without
doing the xml parsing again.
The actual calculation input contents can later be added to this object. Then
the object can be queried for the results, or the default values defined by the
cp2k_input.xml.
"""
import xml.etree.cElementTree as ET
import logging
import cPickle as pickle
from cp2kparser.engines.cp2kinputenginedata.input_tree import *
# Module-level logger. A named logger (instead of the bare logging module)
# lets applications configure this module's output through the logger
# hierarchy, while still offering the same debug/info/warning/error calls.
logger = logging.getLogger(__name__)
#===============================================================================
def generate_object_tree(xml_file):
    """Parse the given XML file and return the section tree built from it.

    The file is parsed with ElementTree and the parsed tree is handed to
    recursive_tree_generation(), whose result is returned unchanged.
    """
    return recursive_tree_generation(ET.parse(xml_file))
#===============================================================================
def _child_text(element, tag):
    """Return the text of the first child of element matching tag, or None if there is none."""
    child = element.find(tag)
    if child is None:
        return None
    return child.text


def recursive_tree_generation(xml_element):
    """Build a Section object, with all nested content, from an XML element.

    The section name is read from the NAME child; when there is no NAME child
    the name "CP2K_INPUT" is used. The SECTION_PARAMETERS child, the KEYWORD
    children and the SECTION children are converted to SectionParameters,
    Keyword and (recursively) Section objects respectively.

    Args:
        xml_element: An object offering find() and findall(), i.e. a parsed
            ElementTree or one of its elements.

    Returns:
        The Section object created for xml_element.
    """
    # Make new section object for the root
    section_name_element = xml_element.find("NAME")
    if section_name_element is not None:
        section_name = section_name_element.text
    else:
        section_name = "CP2K_INPUT"
    section = Section(section_name)

    # Section parameters. The element must be compared against None: the
    # truth value of an Element only tells whether it has children, so a
    # plain truth test would skip a childless SECTION_PARAMETERS element.
    parameter = xml_element.find("SECTION_PARAMETERS")
    if parameter is not None:
        section.parameter = SectionParameters(
            _child_text(parameter, "DEFAULT_VALUE"),
            _child_text(parameter, "LONE_KEYWORD_VALUE"),
        )

    # Keywords
    for keyword in xml_element.findall("KEYWORD"):
        default_name = None
        aliases = []
        for name in keyword.findall("NAME"):
            if name.get("type") == "default":
                default_name = name.text
            else:
                aliases.append(name.text)

        keyword_object = Keyword(default_name, _child_text(keyword, "DEFAULT_VALUE"))

        # The same object is registered under the default name and under
        # every alias, so a value set through one name is seen through all.
        section.keywords[default_name].append(keyword_object)
        for alias in aliases:
            section.keywords[alias].append(keyword_object)

    # Sections
    for sub_section_element in xml_element.findall("SECTION"):
        sub_section = recursive_tree_generation(sub_section_element)
        section.sections[sub_section.name].append(sub_section)

    # Return section
    return section
#===============================================================================
# Run main function by default
if __name__ == "__main__":
    # Build the object tree from the XML definition. The context manager
    # closes the file even if parsing fails.
    with open("./cp2k_262/cp2k_input.xml", 'r') as xml_file:
        object_tree = generate_object_tree(xml_file)

    # Pickle the tree so that it can be reused without parsing the XML again.
    # Closing the handle guarantees the pickle is fully flushed to disk.
    file_name = "./cp2k_262/cp2k_input_tree.pickle"
    with open(file_name, "wb") as fh:
        pickle.dump(object_tree, fh, protocol=2)
......@@ -6,7 +6,6 @@ ElemenTree API such as lxml.
"""
import xml.etree.cElementTree as ET
import sys
#===============================================================================
......@@ -27,8 +26,11 @@ class XMLEngine(object):
# handle
if isinstance(contents, (str, unicode)):
tree = ET.fromstring(contents)
else:
elif isinstance(contents, file):
tree = ET.parse(contents)
else:
tree = contents
return tree.findall(XPath)
# Get the path
return tree.getroot().findall(XPath)
......@@ -29,11 +29,12 @@ class CP2KParser(NomadParser):
self.version_number = None
# Engines are created here
self.inputengine = CP2KInputEngine(self)
self.xyzengine = XYZEngine(self)
self.regexengine = RegexEngine(self)
self.xmlengine = XMLEngine(self)
self.inputengine = CP2KInputEngine(self)
self.input_tree = None
self.regexs = None
self.analyse_input_json()
self.check_resolved_file_ids()
......@@ -50,6 +51,8 @@ class CP2KParser(NomadParser):
version_regex = re.compile(r"CP2K\|\ version\ string:\s+CP2K\ version\ (\d+\.\d+\.\d+)\n")
self.version_number = version_regex.search(beginning).groups()[0].replace('.', '')
self.inputengine.setup_version_number(self.version_number)
self.inputengine.parse()
self.input_tree = self.inputengine.get_input_tree()
version_name = '_' + self.version_number + '_'
# Search for a version specific regex class
......@@ -112,9 +115,8 @@ class CP2KParser(NomadParser):
"""Inherited from NomadParser.
"""
# Check from input what the other files are called
self.inputengine.parse_input()
force_path = self.inputengine.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME")
project_name = self.inputengine.get_keyword("GLOBAL/PROJECT_NAME")
force_path = self.input_tree.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME")
project_name = self.input_tree.get_keyword("GLOBAL/PROJECT_NAME")
if force_path is not None and force_path != "__STD_OUT__":
# The force path is not typically exactly as written in input
......@@ -181,8 +183,8 @@ class CP2KImplementation(object):
self.parser = parser
self.regexs = parser.regexs
self.regexengine = parser.regexengine
self.inputengine = parser.inputengine
self.xyzengine = parser.xyzengine
self.input_tree = parser.input_tree
def _Q_energy_total(self):
"""Return the total energy from the bottom of the input file"""
......@@ -200,13 +202,13 @@ class CP2KImplementation(object):
"""
# First try to look at the shortcut
xc_shortcut = self.inputengine.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL")
xc_shortcut = self.input_tree.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL")
if xc_shortcut is not None and xc_shortcut != "NONE" and xc_shortcut != "NO_SHORTCUT":
logger.debug("Shortcut defined for XC_FUNCTIONAL")
# If PBE, check version
if xc_shortcut == "PBE":
pbe_version = self.inputengine.get_subsection("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/PBE").get_keyword("PARAMETRIZATION")
pbe_version = self.input_tree.get_keyword("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/PBE/PARAMETRIZATION")
return {
'ORIG': "GGA_X_PBE",
'PBESOL': "GGA_X_PBE_SOL",
......@@ -232,14 +234,14 @@ class CP2KImplementation(object):
# Becke88
xc_components = []
becke_88 = self.inputengine.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE88")
becke_88 = self.input_tree.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE88")
if becke_88 == "TRUE":
xc_components.append("GGA_X_B88")
# Becke 97
becke_97 = self.inputengine.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE97")
becke_97 = self.input_tree.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE97")
if becke_97 == "TRUE":
becke_97_param = self.inputengine.get_keyword("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE97/PARAMETRIZATION")
becke_97_param = self.input_tree.get_keyword("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE97/PARAMETRIZATION")
becke_97_result = {
'B97GRIMME': None,
'B97_GRIMME': None,
......@@ -261,7 +263,7 @@ class CP2KImplementation(object):
# Determine if a separate force file is used or are the forces printed
# in the output file.
separate_file = True
filename = self.inputengine.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME")
filename = self.input_tree.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME")
if not filename or filename == "__STD_OUT__":
separate_file = False
......
......@@ -2,6 +2,7 @@ import unittest
import os
import logging
from cp2kparser.implementation.autoparser import get_parser
from cp2kparser.engines.cp2kinputenginedata.xmlpreparser import *
import cProfile
import pstats
......@@ -92,11 +93,11 @@ class TestForces(unittest.TestCase):
if __name__ == '__main__':
logger = logging.getLogger("cp2kparser")
logger.setLevel(logging.ERROR)
# unittest.main()
suite = unittest.TestLoader().loadTestsFromTestCase(TestForces)
def runtests():
unittest.TextTestRunner().run(suite)
s = cProfile.run("runtests()", sort="cumtime", filename="profile_file")
# unittest.TextTestRunner(verbosity=0).run(suite)
unittest.main()
# suite = unittest.TestLoader().loadTestsFromTestCase(TestForces)
# def runtests():
# unittest.main()
# unittest.TextTestRunner().run(suite)
# unittest.TextTestRunner(verbosity=0).run(suite)
# s = cProfile.run("runtests()", sort="cumtime", filename="profile_file")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment