Commit a1814ab0 authored by Lauri Himanen's avatar Lauri Himanen
Browse files

Added XYZ engine, better logging, tests for forces in a separate XYZ file, etc.

parent 977c3bef
...@@ -4,12 +4,60 @@ the common parser structure when it is available. ...@@ -4,12 +4,60 @@ the common parser structure when it is available.
## QuickStart ## QuickStart
- Clone repository - Clone repository
- Run setup by running the setup.py script: - Run setup by running the setup.py script. For local, user specific install
without sudo permissions use:
$ python setup.py install --user $ python setup.py install --user
- Parsing can be currently tested by simply running the script "parse.py" in a folder For a system-wide install use:
$ python setup.py install
- You can test if everything is running fine by running the test script in tests folder:
$ cd cp2kparser/tests/cp2k_2.6.2
$ python run_tests.py
- If you want to try out parsing for a custom cp2k calculation, place all
relevant output and input files inside a common directory and run the
following command within that folder:
$ python -m cp2kparser
## Structure ## Structure
Currently the python package is divided into three subpackages: Currently the python package is divided into three subpackages:
- Engines: Classes for parsing different type of files - Engines: Classes for parsing different type of files
- Generics: Generic utility classes and base classes - Generics: Generic utility classes and base classes
- Implementation: The classes that actually define the parser functionality. - Implementation: The classes that actually define the parser functionality.
## Reusable components and ideas for other parsers
Some components and ideas could be reused in other parsers as well. If you find
any of the following useful in your parser, you are welcome to reuse them.
### Engines
Basically all the "engines", that is the modules that parse certain type of
files, are reusable as is in other parsers. They could be put into a common
repository where other developers can improve and extend them. One should also
write tests for the engines that would validate their behaviour and ease the
performance analysis.
Currently implemented engines that could be reused (not tested properly yet):
- RegexEngine: For parsing text files with regular expressions. Uses the re2
library if available (falls back to default python regex implementation if
re2 not found).
- XyzEngine: For parsing XYZ files and files with similar structure. Has a very
flexible nature as you can specify comments, column delimiters, column
indices and the patterns used to separate different configurations.
### NomadParser base class
In the generics folder there is a module called nomadparser.py that defines a
class called NomadParser. This acts as a base class for the cp2k parser defined
in the implementation folder.
The NomadParser class defines the interface which is eventually used by e.g.
the scala code (will be modified later to conform to the common interface).
This class is also responsible for some common tasks that are present in all
parsers:
- Unit conversion
- JSON encoding
- Caching
- Time measurement for performance analysis
- Providing file contents, sizes and handles
## Lessons learned
#! /usr/bin/env python
# Convenience entry point: parse all supported quantities from the CP2K
# calculation files located in the current working directory.
import os
import logging
from cp2kparser.implementation.autoparser import get_parser
# logging.basicConfig(level=logging.INFO)
# Use the directory this script is invoked from as the calculation folder.
path = os.getcwd()
# get_parser() presumably selects a parser implementation based on the
# files found under `path` — TODO confirm against autoparser module.
parser = get_parser(path)
parser.get_all_quantities()
#! /usr/bin/env python #! /usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from collections import defaultdict from collections import defaultdict
from cp2kparser.generics.util import * from cp2kparser.generics.nomadlogging import *
#=============================================================================== #===============================================================================
...@@ -48,8 +48,10 @@ class CP2KInputEngine(object): ...@@ -48,8 +48,10 @@ class CP2KInputEngine(object):
section_stack[-1].subsections[name.upper()].append(s) section_stack[-1].subsections[name.upper()].append(s)
section_stack.append(s) section_stack.append(s)
else: else:
keyword_name = line.split(' ', 1)[0] split = line.split(' ', 1)
section_stack[-1].keywords[keyword_name].append(line) keyword_name = split[0]
keyword_value = split[1]
section_stack[-1].keywords[keyword_name].append(keyword_value)
self.root_section = root_section self.root_section = root_section
......
#! /usr/bin/env python #! /usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os import os
from cp2kparser.generics.util import * from cp2kparser.generics.nomadlogging import *
try: try:
import re2 as re import re2 as re
except ImportError: except ImportError:
...@@ -139,10 +140,6 @@ class RegexEngine(object): ...@@ -139,10 +140,6 @@ class RegexEngine(object):
print_debug("Going into full regex search") print_debug("Going into full regex search")
result = self.regex_search_string(data, regex) result = self.regex_search_string(data, regex)
if not result:
print_debug("There was an issue in regex '{}' with index '{}' .".format(regex.regex_string, regex.index))
return None
# See if the tree continues # See if the tree continues
if regex.inner_regex is not None: if regex.inner_regex is not None:
print_debug("Entering next regex recursion level.") print_debug("Entering next regex recursion level.")
...@@ -207,7 +204,6 @@ class RegexEngine(object): ...@@ -207,7 +204,6 @@ class RegexEngine(object):
by piece to avoid loading huge files into memory. The piece-wise search by piece to avoid loading huge files into memory. The piece-wise search
can also be used to search the file from bottom to up. can also be used to search the file from bottom to up.
""" """
compiled_separator = regex.compiled_separator
separator = regex.separator separator = regex.separator
direction = regex.direction direction = regex.direction
index = regex.index index = regex.index
...@@ -217,10 +213,10 @@ class RegexEngine(object): ...@@ -217,10 +213,10 @@ class RegexEngine(object):
# Determine the direction in which the blocks are read # Determine the direction in which the blocks are read
if direction == "up": if direction == "up":
print_debug("Searching from bottom to up.") print_debug("Searching from bottom to up.")
generator = self.reverse_block_generator(file_handle, compiled_separator) generator = self.reverse_block_generator(file_handle, separator)
elif direction == "down": elif direction == "down":
print_debug("Searching from up to bottom.") print_debug("Searching from up to bottom.")
generator = self.block_generator(file_handle, compiled_separator) generator = self.block_generator(file_handle, separator)
else: else:
print_error("Unknown direction specifier: {}".format(direction)) print_error("Unknown direction specifier: {}".format(direction))
return return
...@@ -273,7 +269,7 @@ class RegexEngine(object): ...@@ -273,7 +269,7 @@ class RegexEngine(object):
else: else:
return results[i_result + (n_results-1) - index] return results[i_result + (n_results-1) - index]
def reverse_block_generator(self, fh, separator, buf_size=1000000): def reverse_block_generator(self, fh, separator_pattern, buf_size=1000000):
"""A generator that returns chunks of a file piece-by-piece in reverse """A generator that returns chunks of a file piece-by-piece in reverse
order. order.
""" """
...@@ -283,7 +279,8 @@ class RegexEngine(object): ...@@ -283,7 +279,8 @@ class RegexEngine(object):
total_size = remaining_size = fh.tell() total_size = remaining_size = fh.tell()
# Compile the separator with an added end of string character. # Compile the separator with an added end of string character.
end_match = separator.pattern + r'$' compiled_separator = re.compile(separator_pattern)
end_match = separator_pattern + r'$'
compiled_end_match = re.compile(end_match) compiled_end_match = re.compile(end_match)
while remaining_size > 0: while remaining_size > 0:
...@@ -292,7 +289,7 @@ class RegexEngine(object): ...@@ -292,7 +289,7 @@ class RegexEngine(object):
buffer = fh.read(min(remaining_size, buf_size)) buffer = fh.read(min(remaining_size, buf_size))
remaining_size -= buf_size remaining_size -= buf_size
#print remaining_size #print remaining_size
lines = separator.split(buffer) lines = compiled_separator.split(buffer)
# lines = buffer.split(separator) # lines = buffer.split(separator)
# the first line of the buffer is probably not a complete line so # the first line of the buffer is probably not a complete line so
# we'll save it and append it to the last line of the next buffer # we'll save it and append it to the last line of the next buffer
...@@ -311,7 +308,7 @@ class RegexEngine(object): ...@@ -311,7 +308,7 @@ class RegexEngine(object):
yield lines[index] yield lines[index]
yield segment yield segment
def block_generator(self, fh, separator, buf_size=1000000): def block_generator(self, fh, separator_pattern, buf_size=1000000):
"""A generator that returns chunks of a file piece-by-piece """A generator that returns chunks of a file piece-by-piece
""" """
segment = None segment = None
...@@ -319,19 +316,23 @@ class RegexEngine(object): ...@@ -319,19 +316,23 @@ class RegexEngine(object):
fh.seek(0, os.SEEK_END) fh.seek(0, os.SEEK_END)
total_size = remaining_size = fh.tell() total_size = remaining_size = fh.tell()
fh.seek(0, os.SEEK_SET) fh.seek(0, os.SEEK_SET)
#Compile regex
compiled_separator = re.compile(separator_pattern)
while remaining_size > 0: while remaining_size > 0:
offset = min(total_size, offset) offset = min(total_size, offset)
fh.seek(offset, os.SEEK_SET) fh.seek(offset, os.SEEK_SET)
offset += buf_size offset += buf_size
buffer = fh.read(min(remaining_size, buf_size)) buffer = fh.read(min(remaining_size, buf_size))
remaining_size -= buf_size remaining_size -= buf_size
parts = separator.split(buffer) parts = compiled_separator.split(buffer)
# The last part of the buffer must be appended to the next chunk's first part. # The last part of the buffer must be appended to the next chunk's first part.
if segment is not None: if segment is not None:
# If this chunk starts right with the separator, do not concatenate # If this chunk starts right with the separator, do not concatenate
# the segment to the first line of new chunk instead, yield the # the segment to the first line of new chunk instead, yield the
# segment instead # segment instead
if separator.match(buffer): if compiled_separator.match(buffer):
yield segment yield segment
else: else:
parts[0] = segment + parts[0] parts[0] = segment + parts[0]
......
from cp2kparser.generics.util import * from cp2kparser.generics.nomadlogging import *
import numpy as np import numpy as np
from io import StringIO from io import StringIO
np_version = np.__version__ try:
split = np_version.split(".") import re2 as re
if int(split[1]) < 10 and int(split[0] < 1): except ImportError:
print_warning("Using too old version of numpy, the XYZ Parsing may not work properly!") import re
print_warning((
"re2 package not found. Using re package instead. "
"If you wan't to use re2 please see the following links:"
" https://github.com/google/re2"
" https://pypi.python.org/pypi/re2/"
))
else:
re.set_fallback_notification(re.FALLBACK_WARNING)
#=============================================================================== #===============================================================================
class XYZEngine(object): class XYZEngine(object):
"""Used to parse out XYZ and extended XYZ files. """Used to parse out XYZ content and other content with similar structure.
Currently only can parse floating point information.
When given a file handle to a CP2K input file, this class attemts to parse Reads the given file or string line by line, ignoring commented sections.
out it's structure into an accessible object tree. Because the input file Each line with data is split with a given delimiter expression (regex).
has such a clearly defined structure (unlike the output file of CP2K), it From the split line the specified columns will be returned as floating
is better to use a dedicated parser instead of regular expressions. point numbers in a numpy array.
If given a separator specification (regex), the algorithm will try to split
the contents into different configurations which will be separated by a
line that matches the separator.
""" """
def __init__(self, parser): def __init__(self, parser):
""" """
...@@ -24,28 +37,129 @@ class XYZEngine(object): ...@@ -24,28 +37,129 @@ class XYZEngine(object):
""" """
self.parser = parser self.parser = parser
def parse_file(self, file_handle, columns, exclusion_patterns): def parse(self, contents, columns, delimiter=r"\s+", comments=r"#", separator=r"^\d+$"):
"""Parses floating point numbers from the given file using the given
columns.
The file handle should be opened and closed somewhere else. The columns def split_line(line):
are used to extract only certain components form each line. """Chop off comments, strip, and split at delimiter.
"""
if line.isspace():
return None
if comments:
line = compiled_comments.split(line, maxsplit=1)[0]
line = line.strip('\r\n ')
if line:
return compiled_delimiter.split(line)
else:
return []
Returns: def is_separator(line):
A numpy array of floating point numbers. """Check if the given line matches the separator pattern.
""" Separators are used to split a file into multiple configurations.
converters = {} """
for column in columns: if separator:
converters[column] = float return compiled_separator.search(line)
result = np.loadtxt(file_handle, dtype=np.float64, comments=exclusion_patterns, usecols=columns, converters=converters) return False
return result
# If string or unicode provided, create stream
def parse_string(self, string, columns, exclusion_patterns): if isinstance(contents, (str, unicode)):
"""Parses floating point numbers from the given string using the given contents = StringIO(unicode(contents))
columns.
# Compile the comments to regex objects
Returns: if comments:
3D numpy array of floating point numbers. comments = (re.escape(comment) for comment in comments)
""" compiled_comments = re.compile('|'.join(comments))
stream = StringIO(string)
return self.parse_file(stream, columns, exclusion_patterns) #Compile the separator
if separator:
compiled_separator = re.compile(separator)
#Compile the delimiter
compiled_delimiter = re.compile(delimiter)
# Colums as list
if columns is not None:
columns = list(columns)
# Start iterating
all_forces = []
conf_forces = []
for line in contents:
if is_separator(line):
if conf_forces:
all_forces.append(conf_forces)
conf_forces = []
else:
vals = split_line(line)
line_forces = []
if vals:
for column in columns:
try:
value = vals[column]
except IndexError:
print_warning("The given index '{}' could not be found on the line '{}'. The given delimiter or index could be wrong.".format(column, line))
return
try:
value = float(value)
except ValueError:
print_warning("Could not cast value '{}' to float. Currently only floating point values are accepted".format(value))
return
else:
line_forces.append(value)
conf_forces.append(line_forces)
if conf_forces:
all_forces.append(conf_forces)
# If any forces found, return them as numpy array. Otherwise return None.
if all_forces:
all_forces = np.array(all_forces)
return all_forces
else:
return None
# SLOWER OLD VERSION
# def parse_numpy(self, file_handle, columns, comments):
# """Parses floating point numbers from the given file using the given
# columns.
# The file handle should be opened and closed somewhere else. The columns
# are used to extract only certain components form each line.
# Returns:
# A numpy array of floating point numbers.
# """
# # If string or unicode provided, create stream
# if isinstance(file_handle, (str, unicode)):
# file_handle = StringIO(unicode(file_handle))
# converters = {}
# for column in columns:
# converters[column] = float
# result = np.loadtxt(file_handle, dtype=np.float64, comments=comments, usecols=columns, converters=converters)
# return result
# def parse(self, contents, columns, comments, separator=None):
# """Parse data from a file or string containing XYZ data.
# If a separator pattern is provided, the contents are divided into parts
# separated by the pattern. Each of these parts is handled as a separate
# configuration.
# """
# # If string or unicode provided, create stream
# if isinstance(contents, (str, unicode)):
# contents = StringIO(unicode(string))
# # If separator provided, get contents one block at a time
# if separator is not None:
# generator = block_generator(contents, separator)
# forces = []
# for block in generator:
# if block is not None and not block.isspace():
# array = self.parse_numpy(block, columns, comments)
# if array.size != 0:
# forces.append(array)
# forces = np.dstack(forces)
# else:
# forces = self.parse_numpy(contents, columns, comments)
# return forces
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
"""Misc. utility functions.""" """Misc. utility functions."""
import textwrap import textwrap
import logging import logging
logging.basicConfig(format='%(message)s', level=logging.INFO)
#=============================================================================== #===============================================================================
...@@ -52,7 +53,7 @@ def make_message(message, width=80, spaces=0): ...@@ -52,7 +53,7 @@ def make_message(message, width=80, spaces=0):
#=============================================================================== #===============================================================================
def make_debug_message(message, width=80, spaces=0): def make_titled_message(title, message, width=80, spaces=0):
"""Styles a message to be printed into console. """Styles a message to be printed into console.
""" """
wrapper = textwrap.TextWrapper(width=width-6) wrapper = textwrap.TextWrapper(width=width-6)
...@@ -61,7 +62,7 @@ def make_debug_message(message, width=80, spaces=0): ...@@ -61,7 +62,7 @@ def make_debug_message(message, width=80, spaces=0):
first = True first = True
for line in lines: for line in lines:
if first: if first:
new_line = spaces*" " + " >> DEBUG: " + line + (width-6-len(line))*" " + " " new_line = spaces*" " + " >> {}: ".format(title) + line + (width-6-len(line))*" " + " "
styled_message += new_line styled_message += new_line
first = False first = False
else: else:
...@@ -88,7 +89,14 @@ def print_message(title, message, width=80): ...@@ -88,7 +89,14 @@ def print_message(title, message, width=80):
def print_debug(message, width=80): def print_debug(message, width=80):
"""Returns a styled warning message to be printed into console. """Returns a styled warning message to be printed into console.
""" """
logging.debug(make_debug_message(message)) logging.debug(make_titled_message("DEBUG", message))
#===============================================================================
def print_info(message, width=80):
    """Log an info-level message styled for console output.

    Args:
        message: The text to log.
        width: Unused here; formatting width is left to
            make_titled_message()'s own default.
    """
    logging.info(make_titled_message("INFO", message))
#=============================================================================== #===============================================================================
......
...@@ -4,7 +4,7 @@ import json ...@@ -4,7 +4,7 @@ import json
import os import os
import time import time
from abc import ABCMeta, abstractmethod from abc import ABCMeta, abstractmethod
from cp2kparser.generics.util import * from cp2kparser.generics.nomadlogging import *
from pint import UnitRegistry from pint import UnitRegistry
...@@ -131,12 +131,26 @@ class NomadParser(object): ...@@ -131,12 +131,26 @@ class NomadParser(object):
if result is None: if result is None:
print_debug("The quantity '{}' is not present or could not be succesfully parsed.".format(name)) print_debug("The quantity '{}' is not present or could not be succesfully parsed.".format(name))
# Check results
if result is None:
print_info("There was an issue in parsing quantity '{}'. It is either not present in the files or could not be succesfully parsed.".format(name))
else:
print_info("Succesfully parsed quantity '{}'. Result:\n{}".format(name, result))
# Do the conversion to SI units based on the given units # Do the conversion to SI units based on the given units
stop = time.clock() stop = time.clock()
print_debug("Elapsed time: {} ms".format((stop-start)*1000)) print_debug("Elapsed time: {} ms".format((stop-start)*1000))
return result return result
def get_all_quantities(self):
"""Parse all supported quantities."""
implementation_methods = [method for method in dir(self.implementation) if callable(getattr(self.implementation, method))]
for method in implementation_methods:
if method.startswith("_Q_"):
method = method[3:]
self.get_quantity(method)
@abstractmethod @abstractmethod
def setup_version(self): def setup_version(self):
"""Setup a correct implementation for this version. """Setup a correct implementation for this version.
......
...@@ -2,8 +2,7 @@ ...@@ -2,8 +2,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os import os
import re import re
from cp2kparser.generics.util import * from cp2kparser.generics.nomadlogging import *
from cp2kparser.generics.util import *
from cp2kparser.generics.nomadparser import NomadParser from cp2kparser.generics.nomadparser import NomadParser
from cp2kparser.implementation.regexs import * from cp2kparser.implementation.regexs import *
from cp2kparser.engines.regexengine import RegexEngine from cp2kparser.engines.regexengine import RegexEngine
...@@ -99,13 +98,25 @@ class CP2KParser(NomadParser): ...@@ -99,13 +98,25 @@ class CP2KParser(NomadParser):
# Now check from input what the other files are called # Now check from input what the other files are called
self.inputengine.parse_input() self.inputengine.parse_input()
force_path = self.inputengine.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME") force_path = self.inputengine.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME")
project_name = self.inputengine.get_keyword("GLOBAL/PROJECT_NAME")
if force_path is not None and force_path != "__STD_OUT__": if force_path is not None and force_path != "__STD_OUT__":
force_path = os.path.basename(force_path) + "-1_0"
# The force path is not typically exactly as written in input
if force_path.startswith("="):
print_debug("Using single force file.")
force_path = force_path[1:]
elif re.match(r".?/", force_path):
print_debug("Using separate force file for each step.")
force_path = "{}-1_0.xyz".format(force_path)
else:
print_debug("Using separate force file for each step.")
force_path = "{}-{}-1_0.xyz".format(project_name, force_path)
force_path = os.path.basename(force_path)
# Check against the given files # Check against the given files
for file_path in resolvable: for file_path in resolvable: