Commit a1814ab0 authored by Lauri Himanen's avatar Lauri Himanen
Browse files

Added XYZ engine, better logging, tests for forces in a separate XYZ file, etc.

parent 977c3bef
......@@ -4,12 +4,60 @@ the common parser structure when it is available.
## QuickStart
- Clone repository
- Run setup by running the setup.py script:
- Run setup by running the setup.py script. For local, user specific install
without sudo permissions use:
$ python setup.py install --user
- Parsing can be currently tested by simply running the script "parse.py" in a folder
For a system-wide install use:
$ python setup.py install
- You can test if everything is running fine by running the test script in tests folder:
$ cd cp2kparser/tests/cp2k_2.6.2
$ python run_tests.py
- If you want to try out parsing for a custom cp2k calculation, place all
relevant output and input files inside a common directory and run the
following command within that folder:
$ python -m cp2kparser
## Structure
Currently the python package is divided into three subpackages:
- Engines: Classes for parsing different types of files
- Generics: Generic utility classes and base classes
- Implementation: The classes that actually define the parser functionality.
## Reusable components and ideas for other parsers
Some components and ideas could be reused in other parsers as well. If you find
any of the following useful in your parser, you are welcome to use it.
### Engines
Basically all the "engines", that is, the modules that parse certain types of
files, are reusable as is in other parsers. They could be put into a common
repository where other developers can improve and extend them. One should also
write tests for the engines that would validate their behaviour and ease the
performance analysis.
Currently implemented engines that could be reused (not tested properly yet):
- RegexEngine: For parsing text files with regular expressions. Uses the re2
library if available (falls back to default python regex implementation if
re2 not found).
- XyzEngine: For parsing XYZ files and files with similar structure. Has a very
flexible nature as you can specify comments, column delimiters, column
indices and the patterns used to separate different configurations.
### NomadParser base class
In the generics folder there is a module called nomadparser.py that defines a
class called NomadParser. This acts as a base class for the cp2k parser defined
in the implementation folder.
The NomadParser class defines the interface which is eventually used by e.g.
the scala code (will be modified later to conform to the common interface).
This class is also responsible for some common tasks that are present in all
parsers:
- Unit conversion
- JSON encoding
- Caching
- Time measurement for performance analysis
- Providing file contents, sizes and handles
## Lessons learned
#! /usr/bin/env python
import os
import logging
from cp2kparser.implementation.autoparser import get_parser
# logging.basicConfig(level=logging.INFO)
# Use the directory the command is launched from as the folder to parse.
path = os.getcwd()
# Obtain a parser for that folder from the autoparser module.
parser = get_parser(path)
# Parse all supported quantities; the return value is not used here.
parser.get_all_quantities()
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from collections import defaultdict
from cp2kparser.generics.util import *
from cp2kparser.generics.nomadlogging import *
#===============================================================================
......@@ -48,8 +48,10 @@ class CP2KInputEngine(object):
section_stack[-1].subsections[name.upper()].append(s)
section_stack.append(s)
else:
keyword_name = line.split(' ', 1)[0]
section_stack[-1].keywords[keyword_name].append(line)
split = line.split(' ', 1)
keyword_name = split[0]
keyword_value = split[1]
section_stack[-1].keywords[keyword_name].append(keyword_value)
self.root_section = root_section
......
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import os
from cp2kparser.generics.util import *
from cp2kparser.generics.nomadlogging import *
try:
import re2 as re
except ImportError:
......@@ -139,10 +140,6 @@ class RegexEngine(object):
print_debug("Going into full regex search")
result = self.regex_search_string(data, regex)
if not result:
print_debug("There was an issue in regex '{}' with index '{}' .".format(regex.regex_string, regex.index))
return None
# See if the tree continues
if regex.inner_regex is not None:
print_debug("Entering next regex recursion level.")
......@@ -207,7 +204,6 @@ class RegexEngine(object):
by piece to avoid loading huge files into memory. The piece-wise search
can also be used to search the file from bottom to up.
"""
compiled_separator = regex.compiled_separator
separator = regex.separator
direction = regex.direction
index = regex.index
......@@ -217,10 +213,10 @@ class RegexEngine(object):
# Determine the direction in which the blocks are read
if direction == "up":
print_debug("Searching from bottom to up.")
generator = self.reverse_block_generator(file_handle, compiled_separator)
generator = self.reverse_block_generator(file_handle, separator)
elif direction == "down":
print_debug("Searching from up to bottom.")
generator = self.block_generator(file_handle, compiled_separator)
generator = self.block_generator(file_handle, separator)
else:
print_error("Unknown direction specifier: {}".format(direction))
return
......@@ -273,7 +269,7 @@ class RegexEngine(object):
else:
return results[i_result + (n_results-1) - index]
def reverse_block_generator(self, fh, separator, buf_size=1000000):
def reverse_block_generator(self, fh, separator_pattern, buf_size=1000000):
"""A generator that returns chunks of a file piece-by-piece in reverse
order.
"""
......@@ -283,7 +279,8 @@ class RegexEngine(object):
total_size = remaining_size = fh.tell()
# Compile the separator with an added end of string character.
end_match = separator.pattern + r'$'
compiled_separator = re.compile(separator_pattern)
end_match = separator_pattern + r'$'
compiled_end_match = re.compile(end_match)
while remaining_size > 0:
......@@ -292,7 +289,7 @@ class RegexEngine(object):
buffer = fh.read(min(remaining_size, buf_size))
remaining_size -= buf_size
#print remaining_size
lines = separator.split(buffer)
lines = compiled_separator.split(buffer)
# lines = buffer.split(separator)
# the first line of the buffer is probably not a complete line so
# we'll save it and append it to the last line of the next buffer
......@@ -311,7 +308,7 @@ class RegexEngine(object):
yield lines[index]
yield segment
def block_generator(self, fh, separator, buf_size=1000000):
def block_generator(self, fh, separator_pattern, buf_size=1000000):
"""A generator that returns chunks of a file piece-by-piece
"""
segment = None
......@@ -319,19 +316,23 @@ class RegexEngine(object):
fh.seek(0, os.SEEK_END)
total_size = remaining_size = fh.tell()
fh.seek(0, os.SEEK_SET)
#Compile regex
compiled_separator = re.compile(separator_pattern)
while remaining_size > 0:
offset = min(total_size, offset)
fh.seek(offset, os.SEEK_SET)
offset += buf_size
buffer = fh.read(min(remaining_size, buf_size))
remaining_size -= buf_size
parts = separator.split(buffer)
parts = compiled_separator.split(buffer)
# The last part of the buffer must be appended to the next chunk's first part.
if segment is not None:
# If this chunk starts right with the separator, do not concatenate
# the segment to the first line of new chunk instead, yield the
# segment instead
if separator.match(buffer):
if compiled_separator.match(buffer):
yield segment
else:
parts[0] = segment + parts[0]
......
from cp2kparser.generics.util import *
from cp2kparser.generics.nomadlogging import *
import numpy as np
from io import StringIO
def is_outdated_numpy(version, minimum=(1, 10)):
    """Tell whether a numpy version string is older than `minimum`.

    Only the first two dot-separated components (major and minor) are
    compared, as a tuple, so "1.9.2" is older than (1, 10) while "2.1.0" is
    not.

    Args:
        version: Version string such as "1.26.4".
        minimum: (major, minor) pair; versions below it count as outdated.
            The default mirrors the constants 1 and 10 used by the original
            check, interpreted as "older than numpy 1.10".

    Returns:
        True if the version is older than `minimum`, otherwise False. A
        string whose major/minor parts are missing or not integers yields
        False, so that an unusual version string never triggers the warning.
    """
    parts = version.split(".")
    try:
        major_minor = (int(parts[0]), int(parts[1]))
    except (IndexError, ValueError):
        return False
    return major_minor < tuple(minimum)


# Module-level names kept from the original check.
np_version = np.__version__
split = np_version.split(".")
# The previous condition evaluated `int(split[0] < 1)`, i.e. it compared a
# string with an integer instead of comparing the parsed major version.
if is_outdated_numpy(np_version):
    print_warning("Using too old version of numpy, the XYZ Parsing may not work properly!")
try:
import re2 as re
except ImportError:
import re
print_warning((
"re2 package not found. Using re package instead. "
"If you wan't to use re2 please see the following links:"
" https://github.com/google/re2"
" https://pypi.python.org/pypi/re2/"
))
else:
re.set_fallback_notification(re.FALLBACK_WARNING)
#===============================================================================
class XYZEngine(object):
"""Used to parse out XYZ and extended XYZ files.
"""Used to parse out XYZ content and other content with similar structure.
Currently only can parse floating point information.
When given a file handle to a CP2K input file, this class attemts to parse
out it's structure into an accessible object tree. Because the input file
has such a clearly defined structure (unlike the output file of CP2K), it
is better to use a dedicated parser instead of regular expressions.
Reads the given file or string line by line, ignoring commented sections.
Each line with data is split with a given delimiter expression (regex).
From the split line the specified columns will be returned as floating
point numbers in a numpy array.
If given a separator specification (regex), the algorithm will try to split
the contents into different configurations which will be separated by a
line that matches the separator.
"""
def __init__(self, parser):
"""
......@@ -24,28 +37,129 @@ class XYZEngine(object):
"""
self.parser = parser
def parse_file(self, file_handle, columns, exclusion_patterns):
"""Parses floating point numbers from the given file using the given
columns.
def parse(self, contents, columns, delimiter=r"\s+", comments=r"#", separator=r"^\d+$"):
The file handle should be opened and closed somewhere else. The columns
are used to extract only certain components form each line.
def split_line(line):
"""Chop off comments, strip, and split at delimiter.
"""
if line.isspace():
return None
if comments:
line = compiled_comments.split(line, maxsplit=1)[0]
line = line.strip('\r\n ')
if line:
return compiled_delimiter.split(line)
else:
return []
Returns:
A numpy array of floating point numbers.
"""
converters = {}
for column in columns:
converters[column] = float
result = np.loadtxt(file_handle, dtype=np.float64, comments=exclusion_patterns, usecols=columns, converters=converters)
return result
def parse_string(self, string, columns, exclusion_patterns):
"""Parses floating point numbers from the given string using the given
columns.
Returns:
3D numpy array of floating point numbers.
"""
stream = StringIO(string)
return self.parse_file(stream, columns, exclusion_patterns)
def is_separator(line):
"""Check if the given line matches the separator pattern.
Separators are used to split a file into multiple configurations.
"""
if separator:
return compiled_separator.search(line)
return False
# If string or unicode provided, create stream
if isinstance(contents, (str, unicode)):
contents = StringIO(unicode(contents))
# Compile the comments to regex objects
if comments:
comments = (re.escape(comment) for comment in comments)
compiled_comments = re.compile('|'.join(comments))
#Compile the separator
if separator:
compiled_separator = re.compile(separator)
#Compile the delimiter
compiled_delimiter = re.compile(delimiter)
# Colums as list
if columns is not None:
columns = list(columns)
# Start iterating
all_forces = []
conf_forces = []
for line in contents:
if is_separator(line):
if conf_forces:
all_forces.append(conf_forces)
conf_forces = []
else:
vals = split_line(line)
line_forces = []
if vals:
for column in columns:
try:
value = vals[column]
except IndexError:
print_warning("The given index '{}' could not be found on the line '{}'. The given delimiter or index could be wrong.".format(column, line))
return
try:
value = float(value)
except ValueError:
print_warning("Could not cast value '{}' to float. Currently only floating point values are accepted".format(value))
return
else:
line_forces.append(value)
conf_forces.append(line_forces)
if conf_forces:
all_forces.append(conf_forces)
# If any forces found, return them as numpy array. Otherwise return None.
if all_forces:
all_forces = np.array(all_forces)
return all_forces
else:
return None
# SLOWER OLD VERSION
# def parse_numpy(self, file_handle, columns, comments):
# """Parses floating point numbers from the given file using the given
# columns.
# The file handle should be opened and closed somewhere else. The columns
# are used to extract only certain components form each line.
# Returns:
# A numpy array of floating point numbers.
# """
# # If string or unicode provided, create stream
# if isinstance(file_handle, (str, unicode)):
# file_handle = StringIO(unicode(file_handle))
# converters = {}
# for column in columns:
# converters[column] = float
# result = np.loadtxt(file_handle, dtype=np.float64, comments=comments, usecols=columns, converters=converters)
# return result
# def parse(self, contents, columns, comments, separator=None):
# """Parse data from a file or string containing XYZ data.
# If a separator pattern is provided, the contents are divided into parts
# separated by the pattern. Each of these parts is handled as a separate
# configuration.
# """
# # If string or unicode provided, create stream
# if isinstance(contents, (str, unicode)):
# contents = StringIO(unicode(string))
# # If separator provided, get contents one block at a time
# if separator is not None:
# generator = block_generator(contents, separator)
# forces = []
# for block in generator:
# if block is not None and not block.isspace():
# array = self.parse_numpy(block, columns, comments)
# if array.size != 0:
# forces.append(array)
# forces = np.dstack(forces)
# else:
# forces = self.parse_numpy(contents, columns, comments)
# return forces
......@@ -4,6 +4,7 @@
"""Misc. utility functions."""
import textwrap
import logging
logging.basicConfig(format='%(message)s', level=logging.INFO)
#===============================================================================
......@@ -52,7 +53,7 @@ def make_message(message, width=80, spaces=0):
#===============================================================================
def make_debug_message(message, width=80, spaces=0):
def make_titled_message(title, message, width=80, spaces=0):
"""Styles a message to be printed into console.
"""
wrapper = textwrap.TextWrapper(width=width-6)
......@@ -61,7 +62,7 @@ def make_debug_message(message, width=80, spaces=0):
first = True
for line in lines:
if first:
new_line = spaces*" " + " >> DEBUG: " + line + (width-6-len(line))*" " + " "
new_line = spaces*" " + " >> {}: ".format(title) + line + (width-6-len(line))*" " + " "
styled_message += new_line
first = False
else:
......@@ -88,7 +89,14 @@ def print_message(title, message, width=80):
def print_debug(message, width=80):
    """Log a message at DEBUG level, styled with the title "DEBUG".

    Args:
        message: The text to log.
        width: Line width forwarded to make_titled_message when styling the
            message.

    Returns:
        None. The styled message is handed to logging.debug.
    """
    # make_debug_message was renamed to make_titled_message, so only the
    # titled variant is called here.
    logging.debug(make_titled_message("DEBUG", message, width=width))
#===============================================================================
def print_info(message, width=80):
    """Log a message at INFO level, styled with the title "INFO".

    Args:
        message: The text to log.
        width: Line width forwarded to make_titled_message when styling the
            message.

    Returns:
        None. The styled message is handed to logging.info.
    """
    logging.info(make_titled_message("INFO", message, width=width))
#===============================================================================
......
......@@ -4,7 +4,7 @@ import json
import os
import time
from abc import ABCMeta, abstractmethod
from cp2kparser.generics.util import *
from cp2kparser.generics.nomadlogging import *
from pint import UnitRegistry
......@@ -131,12 +131,26 @@ class NomadParser(object):
if result is None:
print_debug("The quantity '{}' is not present or could not be succesfully parsed.".format(name))
# Check results
if result is None:
print_info("There was an issue in parsing quantity '{}'. It is either not present in the files or could not be succesfully parsed.".format(name))
else:
print_info("Succesfully parsed quantity '{}'. Result:\n{}".format(name, result))
# Do the conversion to SI units based on the given units
stop = time.clock()
print_debug("Elapsed time: {} ms".format((stop-start)*1000))
return result
def get_all_quantities(self):
    """Request every quantity that the implementation object provides.

    Each callable attribute of self.implementation whose name starts with
    "_Q_" is treated as a quantity; the part of the name after that prefix
    is passed to self.get_quantity.
    """
    prefix = "_Q_"
    # Collect the callable attribute names first, then dispatch.
    callable_names = [
        name for name in dir(self.implementation)
        if callable(getattr(self.implementation, name))
    ]
    for name in callable_names:
        if not name.startswith(prefix):
            continue
        self.get_quantity(name[len(prefix):])
@abstractmethod
def setup_version(self):
"""Setup a correct implementation for this version.
......
......@@ -2,8 +2,7 @@
# -*- coding: utf-8 -*-
import os
import re
from cp2kparser.generics.util import *
from cp2kparser.generics.util import *
from cp2kparser.generics.nomadlogging import *
from cp2kparser.generics.nomadparser import NomadParser
from cp2kparser.implementation.regexs import *
from cp2kparser.engines.regexengine import RegexEngine
......@@ -99,13 +98,25 @@ class CP2KParser(NomadParser):
# Now check from input what the other files are called
self.inputengine.parse_input()
force_path = self.inputengine.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME")
project_name = self.inputengine.get_keyword("GLOBAL/PROJECT_NAME")
if force_path is not None and force_path != "__STD_OUT__":
force_path = os.path.basename(force_path) + "-1_0"
# The force path is not typically exactly as written in input
if force_path.startswith("="):
print_debug("Using single force file.")
force_path = force_path[1:]
elif re.match(r".?/", force_path):
print_debug("Using separate force file for each step.")
force_path = "{}-1_0.xyz".format(force_path)
else:
print_debug("Using separate force file for each step.")
force_path = "{}-{}-1_0.xyz".format(project_name, force_path)
force_path = os.path.basename(force_path)
# Check against the given files
for file_path in resolvable:
file_no_ext, file_extension = os.path.splitext(file_path)
if force_path and file_no_ext == force_path and file_extension == ".xyz":
tail = os.path.basename(file_path)
if force_path is not None and tail == force_path:
self.file_ids["forces"] = file_path
self.get_file_handle("forces")
......@@ -131,13 +142,6 @@ class CP2KParser(NomadParser):
else:
print_error("The function for quantity '{}' is not defined".format(name))
def parse_all(self):
    """Invoke every quantity function defined on the implementation object.

    A quantity function is any callable attribute of self.implementation
    whose name starts with "_Q_". Each one is called without arguments and
    its return value is discarded.
    """
    # Collect the callable attribute names first, then call the matching ones.
    callable_names = [
        name for name in dir(self.implementation)
        if callable(getattr(self.implementation, name))
    ]
    for name in callable_names:
        if not name.startswith("_Q_"):
            continue
        quantity_function = getattr(self.implementation, name)
        quantity_function()
def check_quantity_availability(self, name):
"""Inherited from NomadParser.
"""
......@@ -235,6 +239,8 @@ class CP2KImplementation(object):
def _Q_particle_forces(self):
"""Return all the forces for every step found.
Supports forces printed in the output file or in a single .xyz file.
"""
# Determine if a separate force file is used or are the forces printed
......@@ -249,28 +255,31 @@ class CP2KImplementation(object):
print_debug("Looking for forces in output file.")
forces = self.regexengine.parse(self.regexs.particle_forces, self.parser.get_file_handle("output"))
if forces is None:
print_warning("No forces could be found in the output file.")
return None
# Insert force configuration into the array
i_conf = 0
force_array = None
for force_conf in forces:
unicode_force_conf = unicode(force_conf)
i_force_array = self.xyzengine.parse_string(unicode_force_conf, (-3, -2, -1), ("#", "ATOMIC", "SUM"))
i_force_array = self.xyzengine.parse(force_conf, columns=(-3, -2, -1), comments=("#", "ATOMIC", "SUM"), separator=None)
i_force_array = i_force_array[0]
# Initialize the numpy array if not done yet
n_particles = i_force_array.shape[0]
n_dim = i_force_array.shape[1]
n_confs = len(forces)
force_array = np.empty((n_particles, n_dim, n_confs))
force_array = np.empty((n_confs, n_particles, n_dim))
force_array[:, :, i_conf] = i_force_array
force_array[i_conf, :, :] = i_force_array
i_conf += 1
return force_array
else:
print_debug("Looking for forces in separate force file.")
forces = self.xyzengine.parse_file(self.parser.get_file_handle("forces"), (-3, -2, -1), ("#", "ATOMIC", "SUM"))
forces = self.xyzengine.parse(self.parser.get_file_handle("forces"), columns=(-3, -2, -1), comments=("#", "ATOMIC", "SUM"), separator=r"\ ATOMIC FORCES in \[a\.u\.\]")
if forces is None:
print print_warning("No forces could be found in the XYZ file.")
return forces
......
3
i = 0, time = 0.000, E = -13.6788620342
O 0.0000000000 0.0000000000 -0.0655870000
H 0.0000000000 -0.7571360000 0.5205450000
H 0.0000000000 0.7571360000 0.5205450000
# Step Nr. Time[fs] Kin.[a.u.] Temp[K] Pot.[a.u.] Cons Qty[a.u.] UsedTime[s]
0 0.000000 0.002850134 300.000000000 -13.678862034 -13.675536878 0.000000000
0 0.000000 0.001425067 300.000000000 -13.936448531 -13.934548442 0.000000000
2
i = 0, time = 0.000, E = -13.9364485315
Na 0.0000000000 0.0000000000 -0.0655870000
Cl 0.0000000000 -0.7571360000 0.5205450000
......@@ -23,22 +23,21 @@
ABC 6.0 6.0 6.0
&END CELL
&COORD
O 0.000000 0.000000 -0.065587
H 0.000000 -0.757136 0.520545
H 0.000000 0.757136 0.520545
Na 0.000000 0.000000 -0.065587
Cl 0.000000 -0.757136 0.520545
&END COORD
&KIND H
&KIND Na
BASIS_SET DZVP-GTH-PADE
POTENTIAL GTH-PADE-q1
&END KIND
&KIND O
&KIND Cl
BASIS_SET DZVP-GTH-PADE
POTENTIAL GTH-PADE-q6
POTENTIAL GTH-PADE-q7
&END KIND
&END SUBSYS
&END FORCE_EVAL
&GLOBAL
PROJECT H2O-2