Commit 7fbb08a7 authored by Lauri Himanen's avatar Lauri Himanen
Browse files

Added XMLEngine, added profiling tools

parent e0fb2d07
cp2kparser/cp2kparser.egg-info
cp2kparser.egg-info
*.pyc
......@@ -52,6 +52,11 @@ repository where other developers can improve and extend them. One should also
write tests for the engines that would validate their behaviour and ease the
performance analysis.
The engine classes also work as interfaces. You can change the engine behaviour
while maintaining the same API in the parsers. For example, one might improve
the performance of an engine, but as long as the function calls remain the same,
no other code has to be changed.
Currently implemented engines that could be reused (not tested properly yet):
- RegexEngine: For parsing text files with regular expressions. Uses the re2
library if available (falls back to default python regex implementation if
......@@ -83,6 +88,21 @@ cp2kparser the file cp2kparser/generics/logconfig.py defines the behaviour of
the logger. There you can setup the log levels even at a modular level. A more
easily readable formatting is also provided for the log messages.
---
## Lessons learned
### Testing
The parsers can become quite complicated, and maintaining them without
systematic testing is perhaps not a good idea. Unit tests provide one way to
test each parseable quantity, and Python has a very good [library for
unit testing](https://docs.python.org/2/library/unittest.html).
### Profiling
The parsers have to be reasonably fast. For some codes there is already a
significant amount of data in the NoMaD repository, and the time taken to parse
it will depend on the performance of the parser. Also, each time the parser
evolves after system deployment, the existing data may have to be reparsed, at
least partially.
By profiling which functions take the most computational time and memory during
parsing, you can identify the bottlenecks in the parser. There are already
existing profiling tools such as
[cProfile](https://docs.python.org/2/library/profile.html#module-cProfile)
which you can plug into your scripts very easily.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import os
from collections import defaultdict
import logging
logger = logging.getLogger(__name__)
......@@ -14,20 +15,21 @@ class CP2KInputEngine(object):
has such a clearly defined structure (unlike the output file of CP2K), it
is better to use a dedicated parser instead of regular expressions.
"""
def __init__(self, cp2k_parser):
def __init__(self, parser):
"""
Args:
cp2k_parser: Instance of a CP2KParser or it's subclass. Allows
parser: Instance of a NomadParser or it's subclass. Allows
access to e.g. unified file reading methods.
"""
self.cp2k_parser = cp2k_parser
self.parser = parser
self.root_section = None
self.xml_file = None
def parse_input(self):
"""Parses the given CP2K input string"""
# The input file should be quite small, so just get the entire contents
inp = self.cp2k_parser.get_file_contents("input")
inp = self.parser.get_file_contents("input")
root_section = InputSection('CP2K_INPUT')
section_stack = [root_section]
......@@ -65,12 +67,22 @@ class CP2KInputEngine(object):
keyword = split[1]
section = self.root_section.get_subsection(section_path, index)
if section is not None:
return section.get_keyword(keyword)
return section.get_keyword(keyword, section_path, self)
def get_parameter(self, path, index=0):
section = self.root_section.get_subsection(path, index)
if section is not None:
return section.get_parameter()
return section.get_parameter(self, path)
def setup_version_number(self, version_number):
    """Open the version specific CP2K input description XML file.

    The opened handle is stored in self.xml_file. If a handle from an
    earlier call is still stored there, it is closed once the new file has
    been opened successfully.

    Args:
        version_number: Version identifier used to select the data
            directory "cp2k_<version_number>" next to this module.

    Raises:
        IOError/OSError: Propagated from open() if the XML file for the
            given version cannot be opened. In that case self.xml_file is
            left untouched.
    """
    # Anchor the path to the absolute module directory: dirname(__file__)
    # can be an empty string when __file__ is a bare relative name, and
    # prepending "/" to it would then point at the filesystem root.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    xml_file_path = os.path.join(
        module_dir,
        "cp2kinputenginedata",
        "xml",
        "cp2k_{}".format(version_number),
        "cp2k_input.xml",
    )
    new_file = open(xml_file_path, 'r')

    # Release the previous handle only after the new one is available so
    # that a failed open() does not leave the engine without a file.
    old_file = getattr(self, "xml_file", None)
    if old_file is not None:
        old_file.close()
    self.xml_file = new_file
def get_xml_file(self):
    """Return the XML file handle rewound to the beginning.

    Returns:
        The handle stored in self.xml_file, positioned at offset 0 so that
        it can be read again from the start.
    """
    # seek() takes the offset first and the reference point second.
    # os.SEEK_SET is a reference point constant, so pass it as 'whence'
    # and give the offset explicitly.
    self.xml_file.seek(0, os.SEEK_SET)
    return self.xml_file
#===============================================================================
......@@ -131,9 +143,11 @@ class InputSection(object):
return subsection
return subsection.get_subsection(parts[1])
def get_keyword(self, keyword, index=0):
"""Finds a keyword specified by a string. If multiple keywords are found with the same
name, the one specified by the given index (default 0) is returned.
def get_keyword(self, keyword, section_path, engine, index=0):
"""Finds a keyword specified by a string. If multiple keywords are
found with the same name, the one specified by the given index (default
0) is returned. If the keyword is not explicitly set, returns the
default specified by the cp2k version specific XML file.
Args:
keyword: String indicating the name of the keyword. The name is the
......@@ -143,12 +157,25 @@ class InputSection(object):
Returns:
The keyword value (everything else than the first word on the line).
"""
candidates = self.keywords.get(keyword) # [s for s in self.subsections if s.name == parts[0]]
candidates = self.keywords.get(keyword)
if not candidates:
logger.debug("No keywords with name '{}' found in subsection '{}'".format(keyword, self.name))
return None
logger.debug("No keywords with name '{}' found in subsection '{}'. Using the default XML value.".format(keyword, self.name))
# Form a XPath from the given path
xpath = "."
sections = section_path.split("/")
for section in sections:
xpath += "/SECTION[NAME='{}']".format(section)
xpath += "/KEYWORD[NAME='{}']/DEFAULT_VALUE".format(keyword)
xml_file = engine.get_xml_file()
xmlengine = engine.parser.xmlengine
result = xmlengine.parse(xml_file, xpath)
return result[0].text
elif len(candidates) > 1:
logger.warning("Multiple keywords found with name '{}'. If no index is given, the first occurence in the input file is returned.".format(keyword))
try:
......@@ -157,7 +184,24 @@ class InputSection(object):
logger.error("Invalid keyword index given.")
return result
def get_parameter(self):
def get_parameter(self, engine, path):
    """Return the SECTION_PARAMETER for this InputSection. If none is
    explicitly set, return the default specified by the cp2k version
    specific XML file.

    Args:
        engine: Object that provides get_xml_file() and exposes the XML
            engine as engine.parser.xmlengine.
        path: Section path with the section names separated by "/". It is
            translated into an XPath query against the XML file.

    Returns:
        self.params when it is set. Otherwise the text of the first element
        matched in the XML file, or None if the query matches nothing.
    """
    if self.params is not None:
        return self.params

    logger.debug("The section '{}' has no parameters set".format(self.name))

    # Form a XPath from the given path
    xpath = "." + "".join(
        "/SECTION[NAME='{}']".format(section) for section in path.split("/")
    )
    xpath += "/SECTION_PARAMETERS/LONE_KEYWORD_VALUE"

    xml_file = engine.get_xml_file()
    xmlengine = engine.parser.xmlengine
    result = xmlengine.parse(xml_file, xpath)

    # The XML file may not define a value for this section; indexing an
    # empty result would raise IndexError.
    if not result:
        logger.debug("No default parameter found in the XML file for section '{}'".format(self.name))
        return None
    return result[0].text
<HTML><BODY><HEAD><TITLE>The cp2k units list</TITLE>
<H1>CP2K Available Units of Measurement</H1>
<H2>Undefined</H2>
If the default unit of a keyword is explicitly undefined, all possible units of measurement can be used to define a proper value.<BR><DL>
<DD><B>internal_cp2k</B></DD>
</DL><P>
<H2>Energy</H2>
Possible units of measurement for Energies. The [energy] entry acts like a dummy flag (assumes the unit of measurement of energy is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>hartree</B></DD>
<DD><B>wavenumber_e</B></DD>
<DD><B>joule</B></DD>
<DD><B>kcalmol</B></DD>
<DD><B>kjmol</B></DD>
<DD><B>Ry</B></DD>
<DD><B>eV</B></DD>
<DD><B>K_e</B></DD>
<DD><B>energy</B></DD>
</DL><P>
<H2>Length</H2>
Possible units of measurement for Lengths. The [length] entry acts like a dummy flag (assumes the unit of measurement of length is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>bohr</B></DD>
<DD><B>m</B></DD>
<DD><B>pm</B></DD>
<DD><B>nm</B></DD>
<DD><B>angstrom</B></DD>
<DD><B>length</B></DD>
</DL><P>
<H2>Temperature</H2>
Possible units of measurement for Temperature. The [temperature] entry acts like a dummy flag (assumes the unit of measurement of temperature is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>K</B></DD>
<DD><B>au_temp</B></DD>
<DD><B>temperature</B></DD>
</DL><P>
<H2>Pressure</H2>
Possible units of measurement for Pressure. The [pressure] entry acts like a dummy flag (assumes the unit of measurement of pressure is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>bar</B></DD>
<DD><B>atm</B></DD>
<DD><B>kbar</B></DD>
<DD><B>Pa</B></DD>
<DD><B>MPa</B></DD>
<DD><B>GPa</B></DD>
<DD><B>au_p</B></DD>
<DD><B>pressure</B></DD>
</DL><P>
<H2>Angle</H2>
Possible units of measurement for Angles. The [angle] entry acts like a dummy flag (assumes the unit of measurement of angle is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>rad</B></DD>
<DD><B>deg</B></DD>
<DD><B>angle</B></DD>
</DL><P>
<H2>Time</H2>
Possible units of measurement for Time. The [time] entry acts like a dummy flag (assumes the unit of measurement of time is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>s</B></DD>
<DD><B>fs</B></DD>
<DD><B>ps</B></DD>
<DD><B>au_t</B></DD>
<DD><B>wavenumber_t</B></DD>
<DD><B>time</B></DD>
</DL><P>
<H2>Mass</H2>
Possible units of measurement for Masses. The [mass] entry acts like a dummy flag (assumes the unit of measurement of mass is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>kg</B></DD>
<DD><B>amu</B></DD>
<DD><B>m_e</B></DD>
<DD><B>mass</B></DD>
</DL><P>
<H2>Potential</H2>
Possible units of measurement for potentials. The [potential] entry acts like a dummy flag (assumes the unit of measurement of potential is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>volt</B></DD>
<DD><B>au_pot</B></DD>
<DD><B>potential</B></DD>
</DL><P>
<H2>Force</H2>
Possible units of measurement for forces. The [force] entry acts like a dummy flag (assumes the unit of measurement of force is in internal units), useful for dimensional analysis.<BR><DL>
<DD><B>N</B></DD>
<DD><B>Newton</B></DD>
<DD><B>mN</B></DD>
<DD><B>mNewton</B></DD>
<DD><B>au_f</B></DD>
<DD><B>force</B></DD>
</DL><P>
</BODY></HTML>
"""
This engine is used to parse XML files using XPath commands
(http://www.w3.org/TR/xpath/). It uses the cElementTree package, but it could
be easily replaced with another XML parsing package that implements the
ElementTree API such as lxml.
"""
import xml.etree.cElementTree as ET
import sys
#===============================================================================
class XMLEngine(object):
    """Used to parse out XML content.
    """
    def __init__(self, parser):
        """
        Args:
            parser: Instance of a NomadParser or its subclass. Allows
                access to e.g. unified file reading methods.
        """
        self.parser = parser

    def parse(self, contents, XPath):
        """Evaluate an XPath expression against the given XML content.

        Args:
            contents: Either the XML document itself as a string, or an
                open file handle (any object with a read() method) from
                which the document is read.
            XPath: The XPath expression, evaluated relative to the root
                element of the document.

        Returns:
            A list of the matching elements. The list is empty if nothing
            matches.
        """
        # Open the XML differently depending on whether it is a string or a
        # file handle. ET.parse() returns an ElementTree that has to be
        # unwrapped with getroot(), whereas ET.fromstring() already returns
        # the root element itself.
        if hasattr(contents, "read"):
            root = ET.parse(contents).getroot()
        else:
            root = ET.fromstring(contents)

        # Get the path
        return root.findall(XPath)
......@@ -8,6 +8,7 @@ from cp2kparser.implementation.regexs import *
from cp2kparser.engines.regexengine import RegexEngine
from cp2kparser.engines.xyzengine import XYZEngine
from cp2kparser.engines.cp2kinputengine import CP2KInputEngine
from cp2kparser.engines.xmlengine import XMLEngine
import numpy as np
import logging
logger = logging.getLogger(__name__)
......@@ -25,27 +26,34 @@ class CP2KParser(NomadParser):
def __init__(self, input_json_string):
NomadParser.__init__(self, input_json_string)
self.version_number = None
# Engines are created here
self.inputengine = CP2KInputEngine(self)
self.xyzengine = XYZEngine(self)
self.regexengine = RegexEngine(self)
self.xmlengine = XMLEngine(self)
self.regexs = None
self.analyse_input_json()
self.determine_file_ids()
self.open_files()
self.check_resolved_file_ids()
self.determine_file_ids_from_extension()
self.setup_version()
self.determine_file_ids()
# self.open_files()
def setup_version(self):
"""Inherited from NomadParser.
"""
# Determine the CP2K version from the input file
# Determine the CP2K version from the output file
beginning = self.read_part_of_file("output", 2048)
version_regex = re.compile(r"CP2K\|\ version\ string:\s+CP2K\ version\ (\d+\.\d+\.\d+)\n")
version_number = '_' + version_regex.search(beginning).groups()[0].replace('.', '') + '_'
self.version_number = version_regex.search(beginning).groups()[0].replace('.', '')
self.inputengine.setup_version_number(self.version_number)
version_name = '_' + self.version_number + '_'
# Search for a version specific regex class
class_name = "CP2K{}Regexs".format(version_number)
class_name = "CP2K{}Regexs".format(version_name)
self.regexs = globals().get(class_name)
if self.regexs:
logger.debug("Using version specific regexs '{}'.".format(class_name))
......@@ -55,7 +63,7 @@ class CP2KParser(NomadParser):
self.regexs = globals()["CP2KRegexs"]()
# Search for a version specific implementation
class_name = "CP2K{}Implementation".format(version_number)
class_name = "CP2K{}Implementation".format(version_name)
class_object = globals().get(class_name)
if class_object:
logger.debug("Using version specific implementation '{}'.".format(class_name))
......@@ -70,10 +78,9 @@ class CP2KParser(NomadParser):
buffer = fh.read(size)
return buffer
def determine_file_ids(self):
"""Inherited from NomadParser.
def check_resolved_file_ids(self):
"""Save the file id's that were given in the JSON input.
"""
# Determine a list of filepaths that need id resolution
resolved = {}
resolvable = []
for file_object in self.files:
......@@ -84,17 +91,27 @@ class CP2KParser(NomadParser):
else:
resolved[file_id] = path
# First resolve the file that can be identified by extension
input_path = resolved.get("input")
if not input_path:
for file_path in resolvable:
if file_path.endswith(".inp"):
self.file_ids["input"] = file_path
self.get_file_handle("input")
if file_path.endswith(".out"):
self.file_ids["output"] = file_path
# Now check from input what the other files are called
for id, path in resolved.iteritems():
self.file_ids[id] = path
self.get_file_handle(id)
self.resolvable = resolvable
def determine_file_ids_from_extension(self):
    """Assign file ids to the unresolved paths that have a known extension.

    Every path in self.resolvable that ends with ".inp" is registered as
    "input" and every path that ends with ".out" as "output". The file
    handle of each registered id is requested right after registering it.
    """
    known_extensions = ((".inp", "input"), (".out", "output"))
    for candidate in self.resolvable:
        for extension, file_id in known_extensions:
            if not candidate.endswith(extension):
                continue
            self.file_ids[file_id] = candidate
            self.get_file_handle(file_id)
def determine_file_ids(self):
"""Inherited from NomadParser.
"""
# Check from input what the other files are called
self.inputengine.parse_input()
force_path = self.inputengine.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME")
project_name = self.inputengine.get_keyword("GLOBAL/PROJECT_NAME")
......@@ -113,22 +130,22 @@ class CP2KParser(NomadParser):
force_path = os.path.basename(force_path)
# Check against the given files
for file_path in resolvable:
for file_path in self.resolvable:
tail = os.path.basename(file_path)
if force_path is not None and tail == force_path:
self.file_ids["forces"] = file_path
self.get_file_handle("forces")
def open_files(self):
"""Open the file handles and keep them open until program finishes.
"""
for file_id, file_path in self.file_ids.iteritems():
try:
file_handle = open(file_path, 'r')
except (OSError, IOError):
logger.error("Could not open file: '{}'".format(file_path))
else:
self.file_handles[file_id] = file_handle
# def open_files(self):
# """Open the file handles and keep them open until program finishes.
# """
# for file_id, file_path in self.file_ids.iteritems():
# try:
# file_handle = open(file_path, 'r')
# except (OSError, IOError):
# logger.error("Could not open file: '{}'".format(file_path))
# else:
# self.file_handles[file_id] = file_handle
def get_unformatted_quantity(self, name):
"""Inherited from NomadParser. The timing and caching is already
......@@ -183,7 +200,7 @@ class CP2KImplementation(object):
"""
# First try to look at the shortcut
xc_shortcut = self.inputengine.get_subsection("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL").get_parameter()
xc_shortcut = self.inputengine.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL")
if xc_shortcut is not None and xc_shortcut != "NONE" and xc_shortcut != "NO_SHORTCUT":
logger.debug("Shortcut defined for XC_FUNCTIONAL")
......@@ -215,7 +232,7 @@ class CP2KImplementation(object):
# Becke88
xc_components = []
becke_88 = self.inputengine.get_subsection("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE88").get_parameter()
becke_88 = self.inputengine.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE88")
if becke_88 == "TRUE":
xc_components.append("GGA_X_B88")
......
"""
Used to analyse the profiling statistics gathered by the test suite.
"""
import pstats
#===============================================================================
if __name__ == '__main__':
    # Load the statistics stored in "profile_file" and print the 20 entries
    # with the largest total time, with the directory part of the file
    # names stripped.
    stats = pstats.Stats("profile_file")
    stats.strip_dirs()
    stats.sort_stats("tottime")
    stats.print_stats(20)
import unittest
import os
import logging
from cp2kparser.implementation.autoparser import get_parser
import cProfile
import pstats
#===============================================================================
......@@ -87,6 +90,13 @@ class TestForces(unittest.TestCase):
self.assertEqual(forces, None)
if __name__ == '__main__':
logger = logging.getLogger("cp2kparser")
logger.setLevel(logging.ERROR)
# unittest.main()
suite = unittest.TestLoader().loadTestsFromTestCase(TestForces)
unittest.TextTestRunner(verbosity=0).run(suite)
def runtests():
unittest.TextTestRunner().run(suite)
s = cProfile.run("runtests()", sort="cumtime", filename="profile_file")
# unittest.TextTestRunner(verbosity=0).run(suite)
......@@ -11,7 +11,7 @@ def main():
author="Lauri Himanen",
author_email="lauri.himanen@gmail.com",
license="GPL3",
packages=["engines", "generics", "implementation"],
packages=["cp2kparser"],
zip_safe=False
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment