Skip to content
Snippets Groups Projects
Commit 7fbb08a7 authored by Himanen, Lauri (himanel1)'s avatar Himanen, Lauri (himanel1)
Browse files

Added XMLEngine, added profiling tools

parent e0fb2d07
Branches
Tags
No related merge requests found
cp2kparser/cp2kparser.egg-info cp2kparser.egg-info
*.pyc *.pyc
...@@ -52,6 +52,11 @@ repository where other developers can improve and extend them. One should also ...@@ -52,6 +52,11 @@ repository where other developers can improve and extend them. One should also
write tests for the engines that would validate their behaviour and ease the write tests for the engines that would validate their behaviour and ease the
performance analysis. performance analysis.
The engine classes also work as interfaces. You can change the engine behaviour
while maintaining the same API in the parsers. For example, one might improve
the performance of an engine, but as long as the function calls remain the same
no other code has to be changed.
Currently implemented engines that could be reused (not tested properly yet): Currently implemented engines that could be reused (not tested properly yet):
- RegexEngine: For parsing text files with regular expressions. Uses the re2 - RegexEngine: For parsing text files with regular expressions. Uses the re2
library if available (falls back to default python regex implementation if library if available (falls back to default python regex implementation if
...@@ -83,6 +88,21 @@ cp2kparser the file cp2kparser/generics/logconfig.py defines the behaviour of ...@@ -83,6 +88,21 @@ cp2kparser the file cp2kparser/generics/logconfig.py defines the behaviour of
the logger. There you can setup the log levels even at a modular level. A more the logger. There you can setup the log levels even at a modular level. A more
easily readable formatting is also provided for the log messages. easily readable formatting is also provided for the log messages.
--- ### Testing
## Lessons learned The parsers can become quite complicated and maintaining them without
systematic testing is perhaps not a good idea. Unittests provide one way to
test each parseable quantity and python has a very good [library for
unittesting](https://docs.python.org/2/library/unittest.html).
### Profiling
The parsers have to be reasonably fast. For some codes there is already a
significant amount of data in the NoMaD repository, and the time taken to parse
it will depend on the performance of the parser. Also, each time the parser
evolves after system deployment, the existing data may have to be reparsed at
least partially.
By profiling which functions take the most computational time and memory during
parsing, you can identify the bottlenecks in the parser. There are already
existing profiling tools such as
[cProfile](https://docs.python.org/2/library/profile.html#module-cProfile)
which you can plug into your scripts very easily.
#! /usr/bin/env python #! /usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os
from collections import defaultdict from collections import defaultdict
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -14,20 +15,21 @@ class CP2KInputEngine(object): ...@@ -14,20 +15,21 @@ class CP2KInputEngine(object):
has such a clearly defined structure (unlike the output file of CP2K), it has such a clearly defined structure (unlike the output file of CP2K), it
is better to use a dedicated parser instead of regular expressions. is better to use a dedicated parser instead of regular expressions.
""" """
def __init__(self, cp2k_parser): def __init__(self, parser):
""" """
Args: Args:
cp2k_parser: Instance of a CP2KParser or it's subclass. Allows parser: Instance of a NomadParser or it's subclass. Allows
access to e.g. unified file reading methods. access to e.g. unified file reading methods.
""" """
self.cp2k_parser = cp2k_parser self.parser = parser
self.root_section = None self.root_section = None
self.xml_file = None
def parse_input(self): def parse_input(self):
"""Parses the given CP2K input string""" """Parses the given CP2K input string"""
# The input file should be quite small, so just get the entire contents # The input file should be quite small, so just get the entire contents
inp = self.cp2k_parser.get_file_contents("input") inp = self.parser.get_file_contents("input")
root_section = InputSection('CP2K_INPUT') root_section = InputSection('CP2K_INPUT')
section_stack = [root_section] section_stack = [root_section]
...@@ -65,12 +67,22 @@ class CP2KInputEngine(object): ...@@ -65,12 +67,22 @@ class CP2KInputEngine(object):
keyword = split[1] keyword = split[1]
section = self.root_section.get_subsection(section_path, index) section = self.root_section.get_subsection(section_path, index)
if section is not None: if section is not None:
return section.get_keyword(keyword) return section.get_keyword(keyword, section_path, self)
def get_parameter(self, path, index=0): def get_parameter(self, path, index=0):
section = self.root_section.get_subsection(path, index) section = self.root_section.get_subsection(path, index)
if section is not None: if section is not None:
return section.get_parameter() return section.get_parameter(self, path)
def setup_version_number(self, version_number):
    """Open the version specific CP2K input definition XML file.

    The file is looked up relative to the directory of this module, under
    cp2kinputenginedata/xml/cp2k_<version_number>/cp2k_input.xml, and the
    opened handle is stored in self.xml_file.

    Args:
        version_number: Version identifier that is substituted into the
            directory name 'cp2k_{}'.

    Raises:
        IOError: If the XML file cannot be opened.
    """
    # os.path.join keeps the path relative when dirname(__file__) is empty;
    # plain string concatenation would have produced a path anchored at "/".
    xml_file_path = os.path.join(
        os.path.dirname(__file__),
        "cp2kinputenginedata",
        "xml",
        "cp2k_{}".format(version_number),
        "cp2k_input.xml")

    # Open the new file first so that a failure leaves the old state intact,
    # then close any handle left from an earlier call so it is not leaked.
    new_handle = open(xml_file_path, 'r')
    old_handle = getattr(self, "xml_file", None)
    if old_handle is not None:
        old_handle.close()
    self.xml_file = new_handle
def get_xml_file(self):
    """Return the XML file handle rewound to the beginning of the file.

    Returns:
        The handle stored in self.xml_file, positioned at offset 0.
    """
    # seek() takes (offset, whence). Passing os.SEEK_SET as the only argument
    # used it as the offset, which only worked because its value is 0.
    self.xml_file.seek(0, os.SEEK_SET)
    return self.xml_file
#=============================================================================== #===============================================================================
...@@ -131,9 +143,11 @@ class InputSection(object): ...@@ -131,9 +143,11 @@ class InputSection(object):
return subsection return subsection
return subsection.get_subsection(parts[1]) return subsection.get_subsection(parts[1])
def get_keyword(self, keyword, index=0): def get_keyword(self, keyword, section_path, engine, index=0):
"""Finds a keyword specified by a string. If multiple keywords are found with the same """Finds a keyword specified by a string. If multiple keywords are
name, the one specified by the given index (default 0) is returned. found with the same name, the one specified by the given index (default
0) is returned. If the keyword is not explicitly set, returns the
default specified by the cp2k version specific XML file.
Args: Args:
keyword: String indicating the name of the keyword. The name is the keyword: String indicating the name of the keyword. The name is the
...@@ -143,12 +157,25 @@ class InputSection(object): ...@@ -143,12 +157,25 @@ class InputSection(object):
Returns: Returns:
The keyword value (everything else than the first word on the line). The keyword value (everything else than the first word on the line).
""" """
candidates = self.keywords.get(keyword) # [s for s in self.subsections if s.name == parts[0]] candidates = self.keywords.get(keyword)
if not candidates: if not candidates:
logger.debug("No keywords with name '{}' found in subsection '{}'".format(keyword, self.name)) logger.debug("No keywords with name '{}' found in subsection '{}'. Using the default XML value.".format(keyword, self.name))
return None
# Form a XPath from the given path
xpath = "."
sections = section_path.split("/")
for section in sections:
xpath += "/SECTION[NAME='{}']".format(section)
xpath += "/KEYWORD[NAME='{}']/DEFAULT_VALUE".format(keyword)
xml_file = engine.get_xml_file()
xmlengine = engine.parser.xmlengine
result = xmlengine.parse(xml_file, xpath)
return result[0].text
elif len(candidates) > 1: elif len(candidates) > 1:
logger.warning("Multiple keywords found with name '{}'. If no index is given, the first occurence in the input file is returned.".format(keyword)) logger.warning("Multiple keywords found with name '{}'. If no index is given, the first occurence in the input file is returned.".format(keyword))
try: try:
...@@ -157,7 +184,24 @@ class InputSection(object): ...@@ -157,7 +184,24 @@ class InputSection(object):
logger.error("Invalid keyword index given.") logger.error("Invalid keyword index given.")
return result return result
def get_parameter(self): def get_parameter(self, engine, path):
"""Return the SECTION_PARAMETER for this InputSection. If none is
explicitly set, return the default specified by the cp2k version
specific XML file.
"""
if self.params is None: if self.params is None:
logger.debug("The section '{}' has no parameters set".format(self.name))
# Form a XPath from the given path
xpath = "."
sections = path.split("/")
for section in sections:
xpath += "/SECTION[NAME='{}']".format(section)
xpath += "/SECTION_PARAMETERS/LONE_KEYWORD_VALUE"
xml_file = engine.get_xml_file()
xmlengine = engine.parser.xmlengine
result = xmlengine.parse(xml_file, xpath)
return result[0].text
return self.params return self.params
This diff is collapsed.
This diff is collapsed.
<HTML><BODY><HEAD><TITLE>The cp2k units list</TITLE>
<H1>CP2K Available Units of Measurement</H1>
<H2>Undefined</H2>
If the default unit of a keyword is explicitly undefined, all possible units of measurement can be used to define a proper value.<BR><DL>
<DD><B>internal_cp2k</B></DD>
</DL><P>
<H2>Energy</H2>
Possible units of measurement for Energies. The [energy] entry acts like a dummy flag (assumes the unit of measurement of energy is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>hartree</B></DD>
<DD><B>wavenumber_e</B></DD>
<DD><B>joule</B></DD>
<DD><B>kcalmol</B></DD>
<DD><B>kjmol</B></DD>
<DD><B>Ry</B></DD>
<DD><B>eV</B></DD>
<DD><B>K_e</B></DD>
<DD><B>energy</B></DD>
</DL><P>
<H2>Length</H2>
Possible units of measurement for Lengths. The [length] entry acts like a dummy flag (assumes the unit of measurement of length is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>bohr</B></DD>
<DD><B>m</B></DD>
<DD><B>pm</B></DD>
<DD><B>nm</B></DD>
<DD><B>angstrom</B></DD>
<DD><B>length</B></DD>
</DL><P>
<H2>Temperature</H2>
Possible units of measurement for Temperature. The [temperature] entry acts like a dummy flag (assumes the unit of measurement of temperature is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>K</B></DD>
<DD><B>au_temp</B></DD>
<DD><B>temperature</B></DD>
</DL><P>
<H2>Pressure</H2>
Possible units of measurement for Pressure. The [pressure] entry acts like a dummy flag (assumes the unit of measurement of pressure is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>bar</B></DD>
<DD><B>atm</B></DD>
<DD><B>kbar</B></DD>
<DD><B>Pa</B></DD>
<DD><B>MPa</B></DD>
<DD><B>GPa</B></DD>
<DD><B>au_p</B></DD>
<DD><B>pressure</B></DD>
</DL><P>
<H2>Angle</H2>
Possible units of measurement for Angles. The [angle] entry acts like a dummy flag (assumes the unit of measurement of angle is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>rad</B></DD>
<DD><B>deg</B></DD>
<DD><B>angle</B></DD>
</DL><P>
<H2>Time</H2>
Possible units of measurement for Time. The [time] entry acts like a dummy flag (assumes the unit of measurement of time is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>s</B></DD>
<DD><B>fs</B></DD>
<DD><B>ps</B></DD>
<DD><B>au_t</B></DD>
<DD><B>wavenumber_t</B></DD>
<DD><B>time</B></DD>
</DL><P>
<H2>Mass</H2>
Possible units of measurement for Masses. The [mass] entry acts like a dummy flag (assumes the unit of measurement of mass is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>kg</B></DD>
<DD><B>amu</B></DD>
<DD><B>m_e</B></DD>
<DD><B>mass</B></DD>
</DL><P>
<H2>Potential</H2>
Possible units of measurement for potentials. The [potential] entry acts like a dummy flag (assumes the unit of measurement of potential is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>volt</B></DD>
<DD><B>au_pot</B></DD>
<DD><B>potential</B></DD>
</DL><P>
<H2>Force</H2>
Possible units of measurement for forces. The [force] entry acts like a dummy flag (assumes the unit of measurement of force is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>N</B></DD>
<DD><B>Newton</B></DD>
<DD><B>mN</B></DD>
<DD><B>mNewton</B></DD>
<DD><B>au_f</B></DD>
<DD><B>force</B></DD>
</DL><P>
</BODY></HTML>
"""
This engine is used to parse XML files using XPath commands
(http://www.w3.org/TR/xpath/). It uses the cElementTree package, but it could
be easily replaced with another XML parsing package that implements the
ElementTree API such as lxml.
"""
import xml.etree.cElementTree as ET
import sys
#===============================================================================
class XMLEngine(object):
    """Used to parse out XML content with XPath expressions.
    """
    def __init__(self, parser):
        """
        Args:
            parser: Instance of a NomadParser or its subclass. Allows
                access to e.g. unified file reading methods.
        """
        self.parser = parser

    def parse(self, contents, XPath):
        """Evaluate an XPath expression against the given XML content.

        Args:
            contents: The XML either as a string, or as anything that
                ElementTree's parse() accepts (e.g. an open file handle).
            XPath: Path expression handed to findall(). It is evaluated
                relative to the root element of the document.

        Returns:
            The result of findall() on the root element: the elements
            matching the expression.
        """
        # 'unicode' exists only on Python 2; fall back to plain str elsewhere
        try:
            string_types = (str, unicode)
        except NameError:
            string_types = (str,)

        # Open the XML differently depending on whether it is a string or a
        # file handle. fromstring() already returns the root Element, whereas
        # parse() returns an ElementTree that has to be asked for its root.
        if isinstance(contents, string_types):
            root = ET.fromstring(contents)
        else:
            root = ET.parse(contents).getroot()

        # Get the path
        return root.findall(XPath)
...@@ -8,6 +8,7 @@ from cp2kparser.implementation.regexs import * ...@@ -8,6 +8,7 @@ from cp2kparser.implementation.regexs import *
from cp2kparser.engines.regexengine import RegexEngine from cp2kparser.engines.regexengine import RegexEngine
from cp2kparser.engines.xyzengine import XYZEngine from cp2kparser.engines.xyzengine import XYZEngine
from cp2kparser.engines.cp2kinputengine import CP2KInputEngine from cp2kparser.engines.cp2kinputengine import CP2KInputEngine
from cp2kparser.engines.xmlengine import XMLEngine
import numpy as np import numpy as np
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -25,27 +26,34 @@ class CP2KParser(NomadParser): ...@@ -25,27 +26,34 @@ class CP2KParser(NomadParser):
def __init__(self, input_json_string): def __init__(self, input_json_string):
NomadParser.__init__(self, input_json_string) NomadParser.__init__(self, input_json_string)
self.version_number = None
# Engines are created here # Engines are created here
self.inputengine = CP2KInputEngine(self) self.inputengine = CP2KInputEngine(self)
self.xyzengine = XYZEngine(self) self.xyzengine = XYZEngine(self)
self.regexengine = RegexEngine(self) self.regexengine = RegexEngine(self)
self.xmlengine = XMLEngine(self)
self.regexs = None self.regexs = None
self.analyse_input_json() self.analyse_input_json()
self.determine_file_ids() self.check_resolved_file_ids()
self.open_files() self.determine_file_ids_from_extension()
self.setup_version() self.setup_version()
self.determine_file_ids()
# self.open_files()
def setup_version(self): def setup_version(self):
"""Inherited from NomadParser. """Inherited from NomadParser.
""" """
# Determine the CP2K version from the input file # Determine the CP2K version from the output file
beginning = self.read_part_of_file("output", 2048) beginning = self.read_part_of_file("output", 2048)
version_regex = re.compile(r"CP2K\|\ version\ string:\s+CP2K\ version\ (\d+\.\d+\.\d+)\n") version_regex = re.compile(r"CP2K\|\ version\ string:\s+CP2K\ version\ (\d+\.\d+\.\d+)\n")
version_number = '_' + version_regex.search(beginning).groups()[0].replace('.', '') + '_' self.version_number = version_regex.search(beginning).groups()[0].replace('.', '')
self.inputengine.setup_version_number(self.version_number)
version_name = '_' + self.version_number + '_'
# Search for a version specific regex class # Search for a version specific regex class
class_name = "CP2K{}Regexs".format(version_number) class_name = "CP2K{}Regexs".format(version_name)
self.regexs = globals().get(class_name) self.regexs = globals().get(class_name)
if self.regexs: if self.regexs:
logger.debug("Using version specific regexs '{}'.".format(class_name)) logger.debug("Using version specific regexs '{}'.".format(class_name))
...@@ -55,7 +63,7 @@ class CP2KParser(NomadParser): ...@@ -55,7 +63,7 @@ class CP2KParser(NomadParser):
self.regexs = globals()["CP2KRegexs"]() self.regexs = globals()["CP2KRegexs"]()
# Search for a version specific implementation # Search for a version specific implementation
class_name = "CP2K{}Implementation".format(version_number) class_name = "CP2K{}Implementation".format(version_name)
class_object = globals().get(class_name) class_object = globals().get(class_name)
if class_object: if class_object:
logger.debug("Using version specific implementation '{}'.".format(class_name)) logger.debug("Using version specific implementation '{}'.".format(class_name))
...@@ -70,10 +78,9 @@ class CP2KParser(NomadParser): ...@@ -70,10 +78,9 @@ class CP2KParser(NomadParser):
buffer = fh.read(size) buffer = fh.read(size)
return buffer return buffer
def determine_file_ids(self): def check_resolved_file_ids(self):
"""Inherited from NomadParser. """Save the file id's that were given in the JSON input.
""" """
# Determine a list of filepaths that need id resolution
resolved = {} resolved = {}
resolvable = [] resolvable = []
for file_object in self.files: for file_object in self.files:
...@@ -84,17 +91,27 @@ class CP2KParser(NomadParser): ...@@ -84,17 +91,27 @@ class CP2KParser(NomadParser):
else: else:
resolved[file_id] = path resolved[file_id] = path
# First resolve the file that can be identified by extension for id, path in resolved.iteritems():
input_path = resolved.get("input") self.file_ids[id] = path
if not input_path: self.get_file_handle(id)
for file_path in resolvable:
self.resolvable = resolvable
def determine_file_ids_from_extension(self):
"""First resolve the files that can be identified by extension.
"""
for file_path in self.resolvable:
if file_path.endswith(".inp"): if file_path.endswith(".inp"):
self.file_ids["input"] = file_path self.file_ids["input"] = file_path
self.get_file_handle("input") self.get_file_handle("input")
if file_path.endswith(".out"): if file_path.endswith(".out"):
self.file_ids["output"] = file_path self.file_ids["output"] = file_path
self.get_file_handle("output")
# Now check from input what the other files are called def determine_file_ids(self):
"""Inherited from NomadParser.
"""
# Check from input what the other files are called
self.inputengine.parse_input() self.inputengine.parse_input()
force_path = self.inputengine.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME") force_path = self.inputengine.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME")
project_name = self.inputengine.get_keyword("GLOBAL/PROJECT_NAME") project_name = self.inputengine.get_keyword("GLOBAL/PROJECT_NAME")
...@@ -113,22 +130,22 @@ class CP2KParser(NomadParser): ...@@ -113,22 +130,22 @@ class CP2KParser(NomadParser):
force_path = os.path.basename(force_path) force_path = os.path.basename(force_path)
# Check against the given files # Check against the given files
for file_path in resolvable: for file_path in self.resolvable:
tail = os.path.basename(file_path) tail = os.path.basename(file_path)
if force_path is not None and tail == force_path: if force_path is not None and tail == force_path:
self.file_ids["forces"] = file_path self.file_ids["forces"] = file_path
self.get_file_handle("forces") self.get_file_handle("forces")
def open_files(self): # def open_files(self):
"""Open the file handles and keep them open until program finishes. # """Open the file handles and keep them open until program finishes.
""" # """
for file_id, file_path in self.file_ids.iteritems(): # for file_id, file_path in self.file_ids.iteritems():
try: # try:
file_handle = open(file_path, 'r') # file_handle = open(file_path, 'r')
except (OSError, IOError): # except (OSError, IOError):
logger.error("Could not open file: '{}'".format(file_path)) # logger.error("Could not open file: '{}'".format(file_path))
else: # else:
self.file_handles[file_id] = file_handle # self.file_handles[file_id] = file_handle
def get_unformatted_quantity(self, name): def get_unformatted_quantity(self, name):
"""Inherited from NomadParser. The timing and caching is already """Inherited from NomadParser. The timing and caching is already
...@@ -183,7 +200,7 @@ class CP2KImplementation(object): ...@@ -183,7 +200,7 @@ class CP2KImplementation(object):
""" """
# First try to look at the shortcut # First try to look at the shortcut
xc_shortcut = self.inputengine.get_subsection("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL").get_parameter() xc_shortcut = self.inputengine.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL")
if xc_shortcut is not None and xc_shortcut != "NONE" and xc_shortcut != "NO_SHORTCUT": if xc_shortcut is not None and xc_shortcut != "NONE" and xc_shortcut != "NO_SHORTCUT":
logger.debug("Shortcut defined for XC_FUNCTIONAL") logger.debug("Shortcut defined for XC_FUNCTIONAL")
...@@ -215,7 +232,7 @@ class CP2KImplementation(object): ...@@ -215,7 +232,7 @@ class CP2KImplementation(object):
# Becke88 # Becke88
xc_components = [] xc_components = []
becke_88 = self.inputengine.get_subsection("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE88").get_parameter() becke_88 = self.inputengine.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE88")
if becke_88 == "TRUE": if becke_88 == "TRUE":
xc_components.append("GGA_X_B88") xc_components.append("GGA_X_B88")
... ...
......
"""
Used to analyse the profiling statistics gathered by the test suite.
"""
import pstats
#===============================================================================
if __name__ == '__main__':
    import sys

    # The statistics file may be given as the first command line argument.
    # Without one, the default name "profile_file" is used.
    stats_path = sys.argv[1] if len(sys.argv) > 1 else "profile_file"

    # Strip directory prefixes from the file names, sort by the "tottime"
    # key and print the first 20 entries.
    stats = pstats.Stats(stats_path)
    stats.strip_dirs().sort_stats("tottime").print_stats(20)
File added
import unittest import unittest
import os import os
import logging
from cp2kparser.implementation.autoparser import get_parser from cp2kparser.implementation.autoparser import get_parser
import cProfile
import pstats
#=============================================================================== #===============================================================================
...@@ -87,6 +90,13 @@ class TestForces(unittest.TestCase): ...@@ -87,6 +90,13 @@ class TestForces(unittest.TestCase):
self.assertEqual(forces, None) self.assertEqual(forces, None)
if __name__ == '__main__': if __name__ == '__main__':
logger = logging.getLogger("cp2kparser")
logger.setLevel(logging.ERROR)
# unittest.main() # unittest.main()
suite = unittest.TestLoader().loadTestsFromTestCase(TestForces) suite = unittest.TestLoader().loadTestsFromTestCase(TestForces)
unittest.TextTestRunner(verbosity=0).run(suite)
def runtests():
unittest.TextTestRunner().run(suite)
s = cProfile.run("runtests()", sort="cumtime", filename="profile_file")
# unittest.TextTestRunner(verbosity=0).run(suite)
...@@ -11,7 +11,7 @@ def main(): ...@@ -11,7 +11,7 @@ def main():
author="Lauri Himanen", author="Lauri Himanen",
author_email="lauri.himanen@gmail.com", author_email="lauri.himanen@gmail.com",
license="GPL3", license="GPL3",
packages=["engines", "generics", "implementation"], packages=["cp2kparser"],
zip_safe=False zip_safe=False
) )
... ...
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment