Commit 315e9adc authored by Lauri Himanen's avatar Lauri Himanen

Initial commit

parents
cp2kparser/cp2kparser.egg-info
# CP2K
The NoMaD parser for CP2K. Under development.
## QuickStart
- Clone repository
- Install the package by running the setup.py script:
$ python setup.py install --user
- Run tests (TODO)
## Structure
Currently the python package is divided into three subpackages:
- Engines: Classes for parsing different types of files
- Generics: Generic utility classes and base classes
- Implementation: The classes that actually define the parser functionality.
#! /usr/bin/env python
#! /usr/bin/env python
from collections import defaultdict
from cp2kparser.generics.util import *
#===============================================================================
class CP2KInputEngine(object):
    """Used to parse out a CP2K input file.

    When given a file handle to a CP2K input file, this class attempts to
    parse out its structure into an accessible object tree. Because the input
    file has such a clearly defined structure (unlike the output file of
    CP2K), it is better to use a dedicated parser instead of regular
    expressions.
    """
    def __init__(self, cp2k_parser):
        """
        Args:
            cp2k_parser: Instance of a CP2KParser or its subclass. Allows
                access to e.g. unified file reading methods.
        """
        self.cp2k_parser = cp2k_parser
        # Root of the parsed InputSection tree, filled in by parse_input().
        self.root_section = None

    def parse_input(self):
        """Parses the CP2K input registered under the file id "input".

        Builds a tree of InputSection objects below a synthetic 'CP2K_INPUT'
        root section and stores it in self.root_section.
        """
        # The input file should be quite small, so just get the entire contents
        inp = self.cp2k_parser.get_file_contents("input")
        root_section = InputSection('CP2K_INPUT')
        # Stack of currently open sections; the innermost one is last.
        section_stack = [root_section]

        for line in inp.split('\n'):
            # Strip '!' comments and surrounding whitespace.
            line = line.split('!', 1)[0].strip()
            if not line:
                continue
            if line.upper().startswith('&END'):
                # Close the innermost open section. (Bug fix: the popped
                # value was previously bound to an unused variable.)
                section_stack.pop()
            elif line.startswith('&'):
                # Section start line: "&NAME [params]"
                parts = line.split(' ', 1)
                name = parts[0][1:]
                if len(parts) > 1:
                    section = InputSection(name=name, params=parts[1].strip())
                else:
                    section = InputSection(name=name)
                section_stack[-1].subsections[name.upper()].append(section)
                section_stack.append(section)
            else:
                # Keyword line: the first word is the keyword name.
                keyword_name = line.split(' ', 1)[0]
                section_stack[-1].keywords[keyword_name].append(line)

        self.root_section = root_section

    def get_subsection(self, path, index=0):
        """Convenience accessor; see InputSection.get_subsection."""
        return self.root_section.get_subsection(path, index)
#===============================================================================
class InputSection(object):
    """Represents a section in a CP2K input file.

    A section stores its parameter string, its keyword lines and its
    subsections. Keywords and subsections may repeat, so they are kept in
    lists behind their names (upper-cased for subsections).
    """
    def __init__(self, name, params=None):
        self.name = name.upper()
        self.params = params
        # Keyword name -> list of full keyword lines (repeats preserved).
        self.keywords = defaultdict(list)
        # Upper-case subsection name -> list of InputSection objects.
        self.subsections = defaultdict(list)

    def write(self):
        """Outputs input section as a list of strings, one line each.

        Subsections are wrapped in '&NAME [params]' ... '&END NAME' pairs and
        indented by one space per nesting level.
        """
        output = []
        for name, k_list in self.keywords.items():
            for value in k_list:
                output.append(value)
        for name, s_list in self.subsections.items():
            for s in s_list:
                if s.params:
                    output.append('&%s %s' % (s.name, s.params))
                else:
                    output.append('&%s' % s.name)
                for l in s.write():
                    output.append(' %s' % l)
                output.append('&END %s' % s.name)
        return output

    def get_subsection(self, path, index=0):
        """Finds a subsection specified by a string where subsections are
        separated by a slash. If multiple subsections are found with the same
        path, the one specified by the given index (default 0) is returned.

        Example: get_subsection("FORCE_EVAL/PRINT/FORCES")

        Args:
            path: String indicating the path to the subsection.
            index: In case of repeating subsections, return the one specified
                by this index.

        Returns:
            The InputSection object if found, None otherwise.
        """
        parts = path.upper().split('/', 1)
        candidates = self.subsections.get(parts[0])
        if not candidates:
            print_debug("Subsection '{}' not found.".format(parts[0]))
            return None
        elif len(candidates) > 1:
            print_warning("Multiple subsections with the same name found with name '{}' If no index is given, the first occurence in the input file is returned.".format(parts[0]))
        try:
            subsection = candidates[index]
        except IndexError:
            # Bug fix: previously fell through and raised NameError on the
            # undefined local 'subsection'.
            print_error("Invalid subsection index given.")
            return None
        if len(parts) == 1:
            return subsection
        return subsection.get_subsection(parts[1])

    def get_keyword(self, keyword, index=0):
        """Finds a keyword specified by a string. If multiple keywords are found
        with the same name, the one specified by the given index (default 0)
        is returned.

        Args:
            keyword: String indicating the name of the keyword. The name is
                the first word in the line.
            index: In case of repeating keywords, return the one specified
                by this index.

        Returns:
            The full keyword line (including the keyword name itself) if
            found, None otherwise.
        """
        candidates = self.keywords.get(keyword)
        if not candidates:
            print_debug("No keywords with name '{}' found in subsection '{}'".format(keyword, self.name))
            return None
        elif len(candidates) > 1:
            # Bug fix: the message previously referenced the undefined
            # variable 'parts' and raised NameError.
            print_warning("Multiple keywords with the same name found with name '{}' If no index is given, the first occurence in the input file is returned.".format(keyword))
        try:
            return candidates[index]
        except IndexError:
            # Bug fix: previously fell through and raised NameError on the
            # undefined local 'result'.
            print_error("Invalid keyword index given.")
            return None

    def get_parameter(self):
        """Returns the parameter string given after the section name."""
        return self.params
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import os
from cp2kparser.generics.util import *
# Prefer the faster google-re2 bindings when available; otherwise fall back
# to the standard library re module.
try:
    import re2 as re
except ImportError:
    import re
    # Bug fix in message text: "wan't" -> "want".
    print_warning((
        "re2 package not found. Using re package instead. "
        "If you want to use re2 please see the following links:"
        " https://github.com/google/re2"
        " https://pypi.python.org/pypi/re2/"
    ))
else:
    # Warn whenever re2 silently falls back to re for an unsupported pattern.
    re.set_fallback_notification(re.FALLBACK_WARNING)
#===============================================================================
class Regex(object):
    """Represents a regex search used by the RegexEngine class.

    In addition to a regular regex object from the re2 or re module, this
    object wraps additional information about a regex search:

    regex_string: The regular expression as a string. Supports also the
        more verbose form
        (https://docs.python.org/2/library/re.html#re.VERBOSE)
    index: Index for the wanted match. Can be a single integer number (also
        negative indices supported) or if the special value "all" is provided,
        all results will be returned.
    separator: If a separator is defined, the input file can be chopped
        into smaller pieces which are separated by the given separator. The
        separator is a string representing a regular expression. The smaller
        pieces are then searched independently. This approach allows bigger
        files to be handled piece by piece without loading the whole file
        into memory.
    direction: If a separator is defined, this parameter defines whether
        the file is chopped into pieces starting from the end or from the
        start.
    from_beginning: If true, the input must match the regular expression
        right from the start. Any matches in the middle of the input are not
        searched.
    """
    def __init__(self, regex_string, index="all", separator=None, direction="down", from_beginning=False):
        self.regex_string = regex_string
        self.index = index
        self.separator = separator
        self.direction = direction
        self.from_beginning = from_beginning
        self.compiled_regex = None
        self.compiled_separator = None
        self.inner_regex = None
        self.check_input()
        self.compile()

    def set_inner_regex(self, inner_regex):
        """Attach a Regex that is applied to this regex's match result."""
        self.inner_regex = inner_regex

    def compile(self):
        """Compiles the search pattern and, if given, the separator pattern."""
        self.compiled_regex = re.compile(self.regex_string, re.VERBOSE)
        # Bug fix: compiling a None separator raised TypeError, which made it
        # impossible to construct a Regex without a separator (the default).
        if self.separator is not None:
            self.compiled_separator = re.compile(self.separator, re.VERBOSE)

    def check_input(self):
        """Validates the constructor arguments."""
        if self.direction != "down" and self.direction != "up":
            print_error("Unsupported direction value '{}' in a regex".format(self.direction))

    def match(self, string):
        return self.compiled_regex.match(string)

    def search(self, string):
        return self.compiled_regex.search(string)

    def findall(self, string):
        # Bug fix: previously delegated to search(), returning a single match
        # object instead of the list of all matches.
        return self.compiled_regex.findall(string)

    def finditer(self, string):
        return self.compiled_regex.finditer(string)
#===============================================================================
class RegexEngine(object):
    """Used for parsing values from files with regular expressions.

    The searches are described by Regex objects. A Regex may carry an
    inner_regex, in which case the result of the outer search is searched
    again with the inner one (see recursive_extraction). Regexes with a
    separator defined are applied block-by-block so that big files do not
    have to be loaded into memory in one piece.

    NOTE(review): several constructs in this class are Python-2-only (the
    'file' builtin, 'iterator.next()'); a Python 3 port needs io.IOBase and
    next().
    """
    def __init__(self, parser):
        # NOTE(review): the 'parser' argument is accepted but never stored or
        # used by any method of this class -- confirm whether it is needed.
        self.regexs = None
        self.results = {}
        self.regex_dict = {}
        self.target_dict = {}
        self.files = None
        self.extractors = None
        self.extractor_results = {}
        self.output = None
        # NOTE(review): duplicate assignment; self.regexs is already None.
        self.regexs = None
        self.cache = {}
        self.compiled_regexs = {}
        self.file_contents = {}

    def parse(self, regex, file_handle):
        """Use the given regex to parse contents from the given file handle.

        Args:
            regex: The Regex object describing the search.
            file_handle: Open file object; must have a 'name' attribute.

        Returns:
            The extraction result, or None (after logging an error) when
            nothing was found.
        """
        file_name = file_handle.name
        print_debug("Searching regex in file '{}'".format(file_name))
        result = self.recursive_extraction(regex, file_handle)
        if result:
            return result
        # Couldn't find the quantity from any of the specified files
        print_error("Could not find a result for {}.".format(regex.regex_string))

    def recursive_extraction(self, regex, data):
        """Goes through the extractor tree recursively until the final
        extractor is found and returns the value given by it. The value can be
        of any dimension but contains only strings.

        Args:
            regex: The Regex object applied at this recursion level.
            data: A file object at the outermost level; on recursive calls,
                the string result produced by the enclosing regex.

        Returns:
            The match result (string or list of strings) or None on failure.
        """
        # # Early return with cached result
        # result = self.extractor_results.get(extractor_id)
        # if result:
        #     return result
        result = None
        # If separator specified, do a blockwise search
        if regex.separator is not None:
            print_debug("Going into blockwise regex search")
            result = self.regex_block_search(data, regex)
        # Regular string search
        else:
            print_debug("Going into full regex search")
            result = self.regex_search_string(data, regex)
        if not result:
            print_error("There was an issue in regex '{}' with index '{}' .".format(regex.regex_string, regex.index))
            return None
        # See if the tree continues
        if regex.inner_regex is not None:
            print_debug("Entering next regex recursion level.")
            return self.recursive_extraction(regex.inner_regex, result)
        else:
            return result

    def regex_search_string(self, data, regex):
        """Do a regex search on the data. This loads the entire data into
        memory so it might not be the best option for big files. See
        'regex_block_search' for reading the file piece-by-piece.

        Args:
            data: A file object or a plain string to search.
            regex: The Regex object to apply.

        Returns:
            Depending on regex.index: a match object (from_beginning), the
            list of all matches ("all"), or a single captured string.
        """
        from_beginning = regex.from_beginning
        index = regex.index
        # If given a file object, read all as string
        # NOTE(review): 'file' is the Python 2 builtin type; this line raises
        # NameError on Python 3.
        if isinstance(data, file):
            data.seek(0)
            contents = data.read()
        else:
            contents = data
        result = None
        if from_beginning:
            print_debug("Doing full string search from beginning.")
            return regex.match(contents)
        elif index == "all":
            print_debug("Doing full string search for all results.")
            result = regex.findall(contents)
            if not result:
                print_error("No matches.")
        elif index >= 0:
            print_debug("Doing full string search with specified index.")
            # NOTE(review): 'iter' shadows the builtin; '.next()' is Python 2
            # only (use next(iterator) on Python 3).
            iter = regex.finditer(contents)
            i = 0
            # Advance through the matches until the requested index is reached.
            while i <= index:
                try:
                    match = iter.next()
                except StopIteration:
                    if i == 0:
                        print_error("No results.")
                    else:
                        print_error("Invalid regex index.")
                    break
                if i == index:
                    # Assumes the regex has at least one capturing group --
                    # TODO confirm for all regexes used with this engine.
                    result = match.groups()[0]
                i += 1
        elif index < 0:
            # Negative index: count from the end; needs all matches first.
            matches = regex.findall(contents)
            if not matches:
                print_error("No matches.")
            else:
                try:
                    result = matches[index]
                except IndexError:
                    print_error("Invalid regex index.")
        return result

    def regex_block_search(self, file_handle, regex):
        """Do a regex search on the data. This function can load the file piece
        by piece to avoid loading huge files into memory. The piece-wise search
        can also be used to search the file from bottom to up.

        Args:
            file_handle: Open file object to search.
            regex: The Regex object; must have a separator defined.

        Returns:
            The list of all matches (index == "all"), a single captured
            string, or None when nothing matched.
        """
        compiled_separator = regex.compiled_separator
        separator = regex.separator
        direction = regex.direction
        index = regex.index
        from_beginning = regex.from_beginning
        print_debug("Doing blockwise search with separator: '{}', direction: '{}', from_beginning: '{}' and index '{}'".format(separator, direction, from_beginning, index))
        # Determine the direction in which the blocks are read
        if direction == "up":
            print_debug("Searching from bottom to up.")
            generator = self.reverse_block_generator(file_handle, compiled_separator)
        elif direction == "down":
            print_debug("Searching from up to bottom.")
            generator = self.block_generator(file_handle, compiled_separator)
        else:
            print_error("Unknown direction specifier: {}".format(direction))
            return
        # If all results wanted, just get all results from all blocks
        if index == "all":
            results = []
            for block in generator:
                results += regex.findall(block)
            return results
        # If index given, search until the correct index found
        i_result = 0
        counter = 0
        for block in generator:
            counter += 1
            if from_beginning:
                result = regex.match(block)
                if result:
                    print_debug("Found match in beginning of block.")
                    # Skip matches until the requested index is reached.
                    if index + 1 > i_result + 1:
                        i_result += 1
                    else:
                        # Assumes one capturing group -- TODO confirm.
                        return result.groups()[0]
            else:
                results = regex.findall(block)
                n_results = len(results)
                if results:
                    print_debug("Found results within block.")
                    if index + 1 > i_result + n_results:
                        i_result += n_results
                    else:
                        # NOTE(review): this index arithmetic appears to pick
                        # the index-th overall match while accounting for the
                        # read direction -- verify with a test containing
                        # repeated matches across blocks.
                        return results[i_result + (n_results-1) - index]

    def reverse_block_generator(self, fh, separator, buf_size=1000000):
        """A generator that returns chunks of a file piece-by-piece in reverse
        order.

        Args:
            fh: Open file handle.
            separator: Compiled regex pattern used to split each buffer into
                blocks.
            buf_size: Number of bytes read per chunk.
        """
        segment = None
        offset = 0
        fh.seek(0, os.SEEK_END)
        total_size = remaining_size = fh.tell()
        while remaining_size > 0:
            offset = min(total_size, offset + buf_size)
            fh.seek(-offset, os.SEEK_END)
            # NOTE(review): 'buffer' shadows the Python 2 builtin.
            buffer = fh.read(min(remaining_size, buf_size))
            remaining_size -= buf_size
            #print remaining_size
            lines = separator.split(buffer)
            # lines = buffer.split(separator)
            # the first line of the buffer is probably not a complete line so
            # we'll save it and append it to the last line of the next buffer
            # we read
            if segment is not None:
                # if the previous chunk starts right from the beginning of line
                # do not concact the segment to the last line of new chunk
                # instead, yield the segment first
                # NOTE(review): 'separator' is a compiled pattern object here,
                # but str.endswith() expects a string or tuple -- this branch
                # raises TypeError when reached. Confirm and fix.
                if not buffer.endswith(separator):
                    lines[-1] += segment
                else:
                    yield segment
            segment = lines[0]
            # Yield the complete blocks of this chunk from last to first.
            for index in range(len(lines) - 1, 0, -1):
                if len(lines[index]):
                    yield lines[index]
        yield segment

    def block_generator(self, fh, separator, buf_size=1000000):
        """A generator that returns chunks of a file piece-by-piece.

        Args:
            fh: Open file handle.
            separator: Compiled regex pattern used to split each buffer into
                blocks.
            buf_size: Number of bytes read per chunk.
        """
        segment = None
        offset = 0
        fh.seek(0, os.SEEK_END)
        total_size = remaining_size = fh.tell()
        fh.seek(0, os.SEEK_SET)
        while remaining_size > 0:
            offset = min(total_size, offset)
            fh.seek(offset, os.SEEK_SET)
            offset += buf_size
            buffer = fh.read(min(remaining_size, buf_size))
            remaining_size -= buf_size
            lines = separator.split(buffer)
            # lines = buffer.split(separator)
            # the first line of the buffer is probably not a complete line so
            # we'll save it and append it to the last line of the next buffer
            # we read
            if segment is not None:
                # if the previous chunk starts right from the beginning of line
                # do not concact the segment to the last line of new chunk
                # instead, yield the segment first
                # NOTE(review): 'separator' is a compiled pattern object here,
                # but str.startswith() expects a string or tuple -- this branch
                # raises TypeError when reached. Confirm and fix.
                if not buffer.startswith(separator):
                    lines[0] = segment + lines[0]
                else:
                    yield segment
            segment = lines[-1]
            # Yield the complete blocks of this chunk in order.
            for index in range(0, len(lines) - 1, 1):
                if len(lines[index]):
                    yield lines[index]
        yield segment
from cp2kparser.generics.util import *
import numpy as np
from io import StringIO
# Sanity check of the installed numpy: the XYZ parsing relies on
# numpy >= 1.10 behaviour.
np_version = np.__version__
split = np_version.split(".")
# Bug fix: the original condition read int(split[0] < 1), comparing a string
# against an int inside int(), so the check could never work as intended.
# A (major, minor) tuple comparison also correctly flags e.g. 1.9.
if (int(split[0]), int(split[1])) < (1, 10):
    print_warning("Using too old version of numpy, the XYZ Parsing may not work properly!")
#===============================================================================
class XYZEngine(object):
    """Used to parse out XYZ and extended XYZ files.

    Extracts the requested numeric columns of an (extended) XYZ file into
    numpy arrays, skipping lines that match the given exclusion patterns.
    (Bug fix: the previous class docstring was copy-pasted from the CP2K
    input engine and described the wrong file type.)
    """
    def __init__(self, parser):
        """
        Args:
            parser: Instance of a parser object (e.g. CP2KParser). Allows
                access to e.g. unified file reading methods.
        """
        self.parser = parser

    def parse_file(self, file_handle, columns, exclusion_patterns):
        """Parses floating point numbers from the given file using the given
        columns.

        The file handle should be opened and closed somewhere else. The
        columns are used to extract only certain components from each line.

        Args:
            file_handle: An open file-like object to read from.
            columns: Iterable of zero-based column indices to extract.
            exclusion_patterns: Comment marker(s) passed to numpy.loadtxt;
                lines starting with these are ignored.

        Returns:
            A numpy array of floating point numbers.
        """
        # Force every requested column through float() during loading.
        converters = {column: float for column in columns}
        result = np.loadtxt(file_handle, dtype=np.float64, comments=exclusion_patterns, usecols=columns, converters=converters)
        return result

    def parse_string(self, string, columns, exclusion_patterns):
        """Parses floating point numbers from the given string using the given
        columns.

        Returns:
            A numpy array of floating point numbers (one row per input line;
            the previous docstring incorrectly claimed a 3D array).
        """
        stream = StringIO(string)
        return self.parse_file(stream, columns, exclusion_patterns)
#! /usr/bin/env python
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import json
import os
import time
from cp2kparser.generics.util import *
#===============================================================================
class NomadParser(object):
"""The base class for a NoMaD parser.
"""
def __init__(self, input_json_string):
self.input_json_string = input_json_string
self.input_json_object = None
self.files = {}
self.tmp_dir = None
self.metainfo_to_keep = None
self.metainfo_to_skip = None
self.file_ids = {}
self.file_handles = {}
self.interface_object = None
self.implementation = None
self.file_contents = {}
self.file_sizes = {}
self.results = {}
def get_file_contents(self, file_id):
cache_limit = 10000
contents = self.file_contents.get(file_id)
if not contents:
fh = self.file_handles[file_id]
fh.seek(0)
contents = fh.read()
if self.get_file_size(file_id) <= cache_limit:
self.file_contents[file_id] = contents
return contents
def get_file_size(self, file_id):
size = self.file_sizes.get(file_id)
if not size:
fh = self.file_handles[file_id]
fh.seek(0, os.SEEK_END)
size = fh.tell()
self.file_sizes[file_id] = size
return size