Commit 315e9adc authored by Lauri Himanen's avatar Lauri Himanen

Initial commit

parents
cp2kparser/cp2kparser.egg-info
# CP2K
The NoMaD parser for CP2K. Under development.
## QuickStart
- Clone repository
- Run setup by running the setup.py script:
$ python setup.py install --user
- Run tests (TODO)
## Structure
Currently the python package is divided into three subpackages:
- Engines: Classes for parsing different types of files
- Generics: Generic utility classes and base classes
- Implementation: The classes that actually define the parser functionality.
#! /usr/bin/env python
#! /usr/bin/env python
from collections import defaultdict
from cp2kparser.generics.util import *
#===============================================================================
class CP2KInputEngine(object):
    """Parses a CP2K input file into a tree of InputSection objects.

    The CP2K input format consists of nested '&NAME ... &END NAME' sections
    that contain keyword lines. Because this structure is so regular, the file
    is read with a small line-based parser instead of regular expressions.
    """

    def __init__(self, cp2k_parser):
        """
        Args:
            cp2k_parser: Object providing get_file_contents(file_id). The
                contents of the file registered with the id "input" are
                parsed.
        """
        self.cp2k_parser = cp2k_parser
        self.root_section = None

    def parse_input(self):
        """Parses the CP2K input file and stores the result in root_section.

        Everything after a '!' on a line is treated as a comment. Lines
        starting with '&' open a new section (the rest of the line is stored
        as the section parameter), lines starting with '&END' close the
        current section, and every other non-empty line is stored as a keyword
        line under its first word.
        """
        # The input file should be quite small, so just get the entire contents
        inp = self.cp2k_parser.get_file_contents("input")
        root_section = InputSection('CP2K_INPUT')
        section_stack = [root_section]

        for line in inp.split('\n'):
            line = line.split('!', 1)[0].strip()
            if not line:
                continue

            if line.upper().startswith('&END'):
                # Never pop the root: an unmatched '&END' would otherwise
                # empty the stack and make the next line fail with IndexError.
                if len(section_stack) > 1:
                    section_stack.pop()
                else:
                    print_warning("Unmatched '&END' found in the CP2K input file; ignoring it.")
            elif line[0] == '&':
                # Split on any whitespace so that tab separated parameters
                # do not end up as a part of the section name.
                parts = line.split(None, 1)
                name = parts[0][1:]
                params = parts[1].strip() if len(parts) > 1 else None
                s = InputSection(name=name, params=params)
                section_stack[-1].subsections[name.upper()].append(s)
                section_stack.append(s)
            else:
                keyword_name = line.split(None, 1)[0]
                section_stack[-1].keywords[keyword_name].append(line)

        self.root_section = root_section

    def get_subsection(self, path, index=0):
        """Returns the subsection found with the given slash separated path.

        The lookup is delegated to the root section. If the input has not been
        parsed yet, it is parsed first.
        """
        if self.root_section is None:
            self.parse_input()
        return self.root_section.get_subsection(path, index)
#===============================================================================
class InputSection(object):
    """Represents a section in a CP2K input file.

    Attributes:
        name: Upper-cased name of the section.
        params: The text following the section name on the opening line, or
            None.
        keywords: Maps the first word of a keyword line to the list of full
            lines that start with that word.
        subsections: Maps a section name to the list of InputSection objects
            with that name.
    """

    def __init__(self, name, params=None):
        self.name = name.upper()
        self.params = params
        self.keywords = defaultdict(list)
        self.subsections = defaultdict(list)

    def write(self):
        """Outputs the contents of this section as a list of lines.

        Keyword lines come first, followed by each subsection wrapped in its
        '&NAME' and '&END NAME' lines. The lines inside a subsection are
        prefixed with a space.
        """
        output = []
        # values() instead of iteritems(): works on Python 2 and 3 and the
        # dictionary keys are not needed here.
        for k_list in self.keywords.values():
            output.extend(k_list)
        for s_list in self.subsections.values():
            for s in s_list:
                if s.params:
                    output.append('&%s %s' % (s.name, s.params))
                else:
                    output.append('&%s' % s.name)
                for l in s.write():
                    output.append(' %s' % l)
                output.append('&END %s' % s.name)
        return output

    def get_subsection(self, path, index=0):
        """Finds a subsection specified by a string where subsections are
        separated by a slash.

        Example: get_subsection("FORCE_EVAL/PRINT/FORCES")

        Args:
            path: String indicating the path to the subsection. The lookup is
                case-insensitive.
            index: In case of repeating subsections, return the one specified
                by this index. The index is applied to the first component of
                the path only, the remaining components use the default
                index.

        Returns:
            The InputSection object if found, otherwise None.
        """
        parts = path.upper().split('/', 1)
        # .get() is used so that a failed lookup does not create an empty
        # entry in the defaultdict.
        candidates = self.subsections.get(parts[0])
        if not candidates:
            print_debug("Subsection '{}' not found.".format(parts[0]))
            return None
        if len(candidates) > 1:
            print_warning("Multiple subsections with the same name found with name '{}' If no index is given, the first occurence in the input file is returned.".format(parts[0]))

        try:
            subsection = candidates[index]
        except IndexError:
            print_error("Invalid subsection index given.")
            return None

        if len(parts) == 1:
            return subsection
        return subsection.get_subsection(parts[1])

    def get_keyword(self, keyword, index=0):
        """Finds a keyword line specified by the keyword name.

        Args:
            keyword: String indicating the name of the keyword. The name is
                the first word in the line and it is matched exactly as it was
                written in the input file.
            index: In case of repeating keywords, return the one specified
                by this index.

        Returns:
            The stored keyword line (including the keyword name itself), or
            None if the keyword or the index is not found.
        """
        candidates = self.keywords.get(keyword)
        if not candidates:
            print_debug("No keywords with name '{}' found in subsection '{}'".format(keyword, self.name))
            return None
        if len(candidates) > 1:
            print_warning("Multiple keywords with the same name found with name '{}' If no index is given, the first occurence in the input file is returned.".format(keyword))

        try:
            return candidates[index]
        except IndexError:
            print_error("Invalid keyword index given.")
            return None

    def get_parameter(self):
        """Returns the section parameter, i.e. the text that followed the
        section name on the opening line, or None."""
        return self.params
This diff is collapsed.
from cp2kparser.generics.util import *
import numpy as np
from io import StringIO
# Warn at import time if the installed numpy is older than 1.10.
# NOTE(review): presumably 1.10 is required by the np.loadtxt call used for
# the XYZ parsing -- confirm the exact feature that is needed.
np_version = np.__version__
split = np_version.split(".")
try:
    # Compare (major, minor) as integers. The previous check compared a string
    # to an integer because of a misplaced parenthesis.
    _np_too_old = (int(split[0]), int(split[1])) < (1, 10)
except (IndexError, ValueError):
    # Unexpected version string format: skip the check instead of failing the
    # import.
    _np_too_old = False
if _np_too_old:
    print_warning("Using too old version of numpy, the XYZ Parsing may not work properly!")
#===============================================================================
class XYZEngine(object):
    """Used to parse out floating point columns from XYZ-like text.

    The parsing is done with numpy.loadtxt: the caller chooses which columns
    of each line are read and which patterns mark the text that should be
    ignored.
    """

    def __init__(self, parser):
        """
        Args:
            parser: The parser object that owns this engine. It is only
                stored, this class does not call it.
        """
        self.parser = parser

    def parse_file(self, file_handle, columns, exclusion_patterns):
        """Parses floating point numbers from the given file using the given
        columns.

        The file handle should be opened and closed somewhere else. The
        columns are used to extract only certain components from each line.

        Args:
            file_handle: File-like object handed to numpy.loadtxt.
            columns: Iterable of zero-based column indices to read.
            exclusion_patterns: Passed to numpy.loadtxt as the 'comments'
                argument.

        Returns:
            A numpy array of floating point numbers.
        """
        converters = {column: float for column in columns}
        return np.loadtxt(
            file_handle,
            dtype=np.float64,
            comments=exclusion_patterns,
            usecols=columns,
            converters=converters,
        )

    def parse_string(self, string, columns, exclusion_patterns):
        """Parses floating point numbers from the given string using the given
        columns.

        Returns:
            A numpy array of floating point numbers, see parse_file().
        """
        # io.StringIO only accepts unicode text. A byte string (the plain
        # 'str' type on Python 2) has to be decoded first.
        if isinstance(string, bytes):
            string = string.decode("utf-8")
        return self.parse_file(StringIO(string), columns, exclusion_patterns)
#! /usr/bin/env python
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import json
import os
import time
from cp2kparser.generics.util import *
#===============================================================================
class NomadParser(object):
    """The base class for a NoMaD parser.

    Stores the JSON input given to the parser, keeps track of the files and
    their handles, caches file contents, file sizes and parsed quantities, and
    defines the hooks (setup_version, determine_file_ids, parse_quantity) that
    an actual parser implementation overrides.
    """

    def __init__(self, input_json_string):
        self.input_json_string = input_json_string
        self.input_json_object = None
        self.files = {}
        self.tmp_dir = None
        self.metainfo_to_keep = None
        self.metainfo_to_skip = None
        self.file_ids = {}
        self.file_handles = {}
        self.interface_object = None
        self.implementation = None
        self.file_contents = {}
        self.file_sizes = {}
        self.results = {}

    def get_file_contents(self, file_id):
        """Returns the entire contents of the file with the given id.

        The handle is looked up from file_handles. Contents of files whose
        size is at most 10000 are cached.
        """
        cache_limit = 10000
        contents = self.file_contents.get(file_id)
        # 'is None' so that an empty file is served from the cache as well
        if contents is None:
            fh = self.file_handles[file_id]
            fh.seek(0)
            contents = fh.read()
            if self.get_file_size(file_id) <= cache_limit:
                self.file_contents[file_id] = contents
        return contents

    def get_file_size(self, file_id):
        """Returns the size of the file as the position reported by the file
        handle at the end of the file. The result is cached.

        Note that the handle is left positioned at the end of the file.
        """
        size = self.file_sizes.get(file_id)
        # 'is None' so that a size of zero is not measured again on each call
        if size is None:
            fh = self.file_handles[file_id]
            fh.seek(0, os.SEEK_END)
            size = fh.tell()
            self.file_sizes[file_id] = size
        return size

    def get_file_handle(self, file_id):
        """Returns a handle for the file with the given id, rewound to the
        beginning of the file.

        The file is opened for reading on the first request and the handle is
        stored in file_handles.

        Returns:
            The file handle, or None if the file could not be opened.
        """
        handle = self.file_handles.get(file_id)
        if not handle:
            path = self.file_ids[file_id]
            try:
                handle = open(path, "r")
            except (OSError, IOError):
                print_error("Could not open file: '{}'".format(path))
                # There is no handle to rewind or return
                return None
            self.file_handles[file_id] = handle
        handle.seek(0, os.SEEK_SET)
        return handle

    def analyse_input_json(self):
        """Decodes the JSON input and reads the attributes 'tmpDir', 'files',
        'metainfoToKeep' and 'metainfoToSkip' from it.

        An error is logged if 'tmpDir' or 'files' is missing.
        """
        # Try to decode
        self.input_json_object = json.loads(self.input_json_string)

        # See if the needed attributes exist
        self.tmp_dir = self.input_json_object.get("tmpDir")
        if self.tmp_dir is None:
            print_error("No temporary folder specified.")
        self.files = self.input_json_object.get("files")
        if self.files is None:
            print_error("No files specified.")
        self.metainfo_to_keep = self.input_json_object.get("metainfoToKeep")
        self.metainfo_to_skip = self.input_json_object.get("metainfoToSkip")

        # See if the metainfos exist

    def setup_version(self):
        """Setup a correct implementation for this version of CP2K.
        """
        pass

    def determine_file_ids(self):
        """If the files have not been given an id, try to determine the
        correct ids by looking at the input file, contents and file extensions.
        """
        pass

    def get_quantity(self, name):
        """Given a unique quantity id which is present in the metainfo
        declaration, parses the corresponding quantity (if available) and
        returns the value.

        Parsed values are cached by name. A quantity that could not be parsed
        (None) is parsed again on the next request.

        Returns:
            The value given by parse_quantity(), or None if the quantity is
            not available.
        """
        # Start timing. time.clock() no longer exists on Python 3.8+, so use
        # perf_counter when it is available and time.time otherwise.
        timer = getattr(time, "perf_counter", time.time)
        print_debug(74*'-')
        print_debug("Getting quantity '{}'".format(name))
        start = timer()

        # Check availability
        available = self.check_quantity_availability(name)
        if not available:
            print_warning("The quantity '{}' is not available for this parser version.".format(name))
            return None

        # Check cache. The comparison is made against None because the truth
        # value of a cached multi-element numpy array is ambiguous (raises
        # ValueError) and because falsy results such as 0 are valid.
        result = self.results.get(name)
        if result is None:
            # Ask the engine for the quantity
            result = self.parse_quantity(name)
            self.results[name] = result
            if result is None:
                print_debug("The quantity '{}' could not be succesfully parsed.".format(name))
        else:
            print_debug("Using cached result.")

        stop = timer()
        print_debug("Elapsed time: {} ms".format((stop-start)*1000))
        return result

    def parse_quantity(self, name):
        """Override this function in an actual implementation"""
        pass

    def check_quantity_availability(self, name):
        """Check quantity availability.

        Planned checks (none of them is implemented yet, currently this always
        returns True):
            -Check the list of available quantities declared in interface.
            -Check if the run type actually produces the quantity
            -Check if the quantity is allowed by the 'metainfoToKeep' and
            'metainfoToSkip'
        """
        return True
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Misc. utility functions."""
import textwrap
import logging
#===============================================================================
def make_title(title, width=80):
    """Styles a title to be printed into console.

    The title is surrounded by spaces, padded on both sides with '='
    characters and closed with '|' characters so that the whole line is
    'width' characters long (when the title fits).

    Returns:
        The styled title line as a string.
    """
    space = width-len(title)-4
    # Floor division keeps the padding an integer on Python 3 as well, where
    # '/' would give a float that cannot be used to repeat a string.
    pre_space = space//2-1
    post_space = space-pre_space
    line = "|" + pre_space*"=" + " "
    line += title
    line += " " + post_space*"=" + "|"
    return line
#===============================================================================
def print_subtitle(title, width=80):
    """Prints a styled subtitle into console.

    The title is surrounded by spaces, padded on both sides with '-'
    characters and closed with '|' characters so that the whole line is
    'width' characters long (when the title fits).
    """
    space = width-len(title)-4
    # Floor division keeps the padding an integer on Python 3 as well
    pre_space = space//2-1
    post_space = space-pre_space
    line = "|" + pre_space*"-" + " "
    line += title
    line += " " + post_space*"-" + "|"
    # The function call form of print works on both Python 2 and 3
    print(line)
#===============================================================================
def make_message(message, width=80, spaces=0):
    """Wraps a message into bordered lines followed by a footer line.

    The message is wrapped to width-6 characters. Each wrapped line is padded
    to that length and placed between '| ' and ' |', and a footer made of '-'
    characters closes the message. Every line is indented by 'spaces' spaces.

    Returns:
        The styled message as a single string with newline separated lines.
    """
    indent = spaces*" "
    text_width = width-6
    rows = [
        indent + "| " + text + (text_width-len(text))*" " + " |"
        for text in textwrap.TextWrapper(width=text_width).wrap(message)
    ]
    footer = indent + "|" + (width-2)*"-" + "|"
    return "\n".join(rows) + "\n" + footer
#===============================================================================
def make_debug_message(message, width=80, spaces=0):
    """Wraps a message into lines where the first one carries a DEBUG tag.

    The message is wrapped to width-6 characters and each wrapped line is
    padded to that length. The first line is prefixed with ' >> DEBUG: ', the
    following ones with a single space. Every line is indented by 'spaces'
    spaces.

    Returns:
        The styled message as a single string with newline separated lines.
        An empty message gives an empty string.
    """
    indent = spaces*" "
    rows = []
    wrapped = textwrap.TextWrapper(width=width-6).wrap(message)
    for position, text in enumerate(wrapped):
        prefix = " >> DEBUG: " if position == 0 else " "
        rows.append(indent + prefix + text + (width-6-len(text))*" " + " ")
    return "\n".join(rows)
#===============================================================================
def print_title(title, width=80):
    """Prints styled title into console.

    The title is styled with make_title() using the given width.
    """
    # The function call form of print works on both Python 2 and 3
    print(make_title(title, width=width))
#===============================================================================
def print_message(title, message, width=80):
    """Prints a styled title followed by a styled message into console.

    The title is styled with make_title() and the message with make_message().
    The given width is used for both; it was previously ignored, and the
    default of 80 matches the width that was always used before.
    """
    # The function call form of print works on both Python 2 and 3
    print(make_title(title, width=width) + "\n" + make_message(message, width=width) + "\n")
#===============================================================================
def print_debug(message, width=80):
    """Logs a styled debug message with the DEBUG level.

    The message is styled with make_debug_message(). The given width is
    forwarded to it; it was previously ignored, and the default of 80 matches
    the width that was always used before.
    """
    logging.debug(make_debug_message(message, width=width))
#===============================================================================
def print_text(text, spaces=0, width=80):
    """Prints the text into console without borders, footer or header.

    The text is wrapped to width-4 characters. Every wrapped line is indented
    by 'spaces' spaces plus one extra space.
    """
    indent = spaces*" " + " "
    wrapped = textwrap.TextWrapper(width=width-4).wrap(text)
    print("\n".join(indent + row for row in wrapped))
#===============================================================================
def print_warning(message, width=80):
    """Logs a styled warning message with the WARNING level.

    The message is placed under a "WARNING" title. The title and the message
    are always styled 64 characters wide and the message is indented by 8
    spaces; the width argument is accepted but not used.
    """
    banner = make_title("WARNING", width=64)
    body = make_message(message, width=64, spaces=8)
    logging.warning("\n " + banner + "\n" + body + "\n")
#===============================================================================
def print_error(message, width=80):
    """Logs a styled error message with the ERROR level.

    The message is placed under an "ERROR" title. The title and the message
    are always styled 64 characters wide and the message is indented by 8
    spaces; the width argument is accepted but not used.
    """
    banner = make_title("ERROR", width=64)
    body = make_message(message, width=64, spaces=8)
    logging.error("\n " + banner + "\n" + body + "\n")
import logging
import os
import json
from cp2kparser.implementation.parser import CP2KParser
#===============================================================================
def scan_path_for_files(path):
    """Lists the files in the given directory that have a supported extension.

    Only the directory itself is scanned, not its subdirectories. The entries
    are returned sorted by file name.

    Args:
        path: Path of the directory to scan.

    Returns:
        A list of dictionaries with the keys "path" (the directory joined with
        the file name, so that the file can be opened regardless of the
        current working directory) and "file_id" (always an empty string).
    """
    # Define the allowed extensions
    extensions = {
        ".inp",
        ".out",
        ".xyz",
    }
    # os.listdir() gives the names in arbitrary order, sort them to get the
    # same result on every run.
    return [
        {
            "path": os.path.join(path, filename),
            "file_id": "",
        }
        for filename in sorted(os.listdir(path))
        if os.path.splitext(filename)[1] in extensions
    ]
#===============================================================================
def extract(path):
    """Runs the CP2K parser for the files found in the given directory and
    prints a few quantities into console.

    The files are collected with scan_path_for_files(), wrapped into the JSON
    input expected by CP2KParser, and the quantities "energy_total",
    "XC_functional" and "particle_forces" are printed.

    Args:
        path: Path of the directory containing the CP2K files.
    """
    # Imported here because only this function needs it
    import tempfile

    files = scan_path_for_files(path)
    logging.basicConfig(format='%(message)s', level=logging.DEBUG)
    json_input = {
        "version": "nomadparsein.json 1.0",
        # Use the temporary directory of the system instead of a hard-coded
        # home directory that only exists on one machine.
        "tmpDir": tempfile.gettempdir(),
        "metainfoToKeep": [],
        "metainfoToSkip": [],
        "files": files
    }
    parser = CP2KParser(json.dumps(json_input))
    # The function call form of print works on both Python 2 and 3
    print(parser.get_quantity("energy_total"))
    print(parser.get_quantity("XC_functional"))
    print(parser.get_quantity("particle_forces"))
# n = len(parser.get_quantity("particle_forces"))
# print "Number of force configurations found: {}".format(n)
#extract! /usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
from cp2kparser.generics.util import *
from cp2kparser.generics.util import *
from cp2kparser.generics.nomadparser import NomadParser
from cp2kparser.implementation.regexs import *
from cp2kparser.engines.regexengine import RegexEngine
from cp2kparser.engines.xyzengine import XYZEngine
from cp2kparser.engines.cp2kinputengine import CP2KInputEngine
#===============================================================================
class CP2KParser(NomadParser):
"""The interface to a NoMaD CP2K parser.
"""
def __init__(self, input_json_string):
    """Creates the engines and runs the setup steps of the parser.

    After the base class has been initialised and the engines have been
    created, the JSON input is analysed, the file ids are determined, the
    files are opened and the version specific setup is run, in this order.
    """
    super(CP2KParser, self).__init__(input_json_string)
    self.regexs = None

    # Engines are created here
    self.inputengine = CP2KInputEngine(self)
    self.xyzengine = XYZEngine(self)
    self.regexengine = RegexEngine(self)

    # The setup steps depend on each other, so the order matters
    setup_steps = (
        self.analyse_input_json,
        self.determine_file_ids,
        self.open_files,
        self.setup_version,
    )
    for step in setup_steps:
        step()
def setup_version(self):
    """Inherited from NomadParser.

    Reads the CP2K version from the beginning of the output file and looks up
    the regex and implementation classes named after that version
    ("CP2K_<version digits>_Regexs" and "CP2K_<version digits>_Implementation")
    from the module globals. If a version specific class is not found, or the
    version cannot be determined, the default classes "CP2KRegexs" and
    "CP2KImplementation" are used.
    """
    # Determine the CP2K version from the beginning of the output file
    beginning = self.read_part_of_file("output", 2048)
    version_regex = re.compile(r"CP2K\|\ version\ string:\s+CP2K\ version\ (\d+\.\d+\.\d+)\n")
    match = version_regex.search(beginning)

    regexs_name = None
    implementation_name = None
    if match is None:
        # Without this check a missing version string ends in AttributeError
        print_warning("Could not determine the CP2K version from the output file. Using the default regexs and implementation.")
    else:
        version_number = '_' + match.group(1).replace('.', '') + '_'
        regexs_name = "CP2K{}Regexs".format(version_number)
        implementation_name = "CP2K{}Implementation".format(version_number)

    # Search for a version specific regex class
    regexs_class = globals().get(regexs_name) if regexs_name else None
    if regexs_class:
        print_debug("Using version specific regexs '{}'.".format(regexs_name))
    else:
        print_debug("Using default regexs.")
        regexs_class = globals()["CP2KRegexs"]
    self.regexs = regexs_class()

    # Search for a version specific implementation. The class is looked up
    # first and instantiated only after the check: calling the result of
    # .get() directly fails with TypeError when the class does not exist,
    # which made the default implementation unreachable.
    implementation_class = globals().get(implementation_name) if implementation_name else None
    if implementation_class:
        print_debug("Using version specific implementation '{}'.".format(implementation_name))
    else:
        print_debug("Using default implementation.")
        implementation_class = globals()["CP2KImplementation"]
    self.implementation = implementation_class(self)
def read_part_of_file(self, file_id, size=1024):
    """Reads up to 'size' units from the beginning of the file with the
    given id.

    The handle is looked up from file_handles and rewound to the start of the
    file before reading.

    Returns:
        The data given by the read() call of the file handle.
    """
    handle = self.file_handles[file_id]
    handle.seek(0, os.SEEK_SET)
    return handle.read(size)
def determine_file_ids(self):
"""Inherited from NomadParser.
"""
# Determine a list of filepaths that need id resolution
resolved = {}
resolvable = []
for file_object in self.files:
path = file_object.get("path")
file_id = file_object.get("file_id")
if not file_id:
resolvable.append(path)
else:
resolved[file_id] = path
# First resolve the file that can be identified by extension
input_path = resolved.get("input")
if not input_path:
for file_path in resolvable: