Commit 3a5d0ae6 authored by Lauri Himanen's avatar Lauri Himanen
Browse files

Started to transfer stuff to the python-commons repo, now using functions from there.

parent 639a45b1
......@@ -131,8 +131,8 @@ Although CP2K often doesn't care about the file extensions, using them enables
the parser to automatically identify the files and makes it perform better
(only needs to decompress part of files in HDF5). Please use these default file
- Output file: .out
- Input file: .inp
- Output file: .out (Only one)
- Input file: .inp (Only one. If you have "include" files, use some other extension e.g. .inc)
- XYZ coordinate files: .xyz
- Protein Data Bank files: .pdb
- Crystallographic Information Files: .cif
import logging
logger = logging.getLogger(__name__)
class AtomsEngine(object):
"""Used to parse various different atomic coordinate files.
See the dictionary 'formats' for all the supported formats and a brief
explanation.Reading is primarily done by ASE or MDAnalysis, but in some cases own
implementation is used. Returns all coordinates as numpy arrays.
formats = {
"xyz": "(.xyz): The XYZ file format.",
"cif": "(.cif): Crystallographic Information File",
"pdb": "(.pdb): Protein Data Bank",
#"dcd": "(.dcd): Binary trajectory file format used by CHARMM, NAMD, and X-PLOR.",
def determine_tool(self, format):
"""Determines which tool to use for extracting trajectories in the
given format.
formats = {
"xyz": "ASE",
"cif": "ASE",
"pdb": "ASE",
result = formats.get(format)
if result:
return result
logger.warning("The format '{}' is not supported by AtomsEngine.".format(format))
def check_format_support(self, format):
"""Check if the given format is supported.
if format not in AtomsEngine.formats:
logger.error("The format '{}' is not supported by AtomsEngine.".format(format))
return False
return True
def n_atoms(self, file_handle, format):
"""Read the first configuration of the coordinate file to extract the
number of atoms in it.
iterator = self.iread(file_handle, format)
pos =
return pos.shape[0]
def iread(self, file_handle, format, index=0):
"""Returns an iterator that goes through the given trajectory file one
configuration at a time. Good for e.g. streaming the contents to disc as the
whole file doesn't have to be loaded into memory.
if not self.check_format_support(format):
if file_handle is None:
print "NONE"
tool = self.determine_tool(format)
if tool == "ASE":
return self.ase_iread(file_handle, format, index)
elif tool == "custom":
return self.custom_iread(file_handle, format, index)
elif tool == "MDAnalysis":
return self.mdanalysis_iread(file_handle, format, index)
def ase_iread(self, file_handle, format, index):
# After reading the ASE source code, it seems that the ASE iread does
# actually read the entire file into memory and the yields the
# configurations from it. Should be checked at some point.
def ase_generator(iterator):
"""Used to wrap an iterator returned by so that it returns
the positions instead of the ase.Atoms object.
for value in iterator:
yield value.get_positions()
iterator =, format=format)
return ase_generator(iterator)
def custom_iread(self, file_handle, format, index):
......@@ -10,11 +10,39 @@ logger = logging.getLogger(__name__)
class Root(object):
class CP2KInput(object):
"""The contents of a CP2K simulation including default values and default
units from the version-specific xml file.
def __init__(self, root_section):
self.root_section = root_section
def decode_cp2k_unit(unit):
"""Given a CP2K unit name, decode it as Pint unit definition.
map = {
# Length
"bohr": "bohr",
"m": "meter",
"pm": "picometer",
"nm": "nanometer",
"angstrom": "angstrom",
# Angle
"rad": "radian",
"deg": "degree",
"Ry": "rydberg"
pint_unit = map.get(unit)
if pint_unit:
return pint_unit
logger.error("Unknown CP2K unit definition '{}'.".format(unit))
def set_parameter(self, path, value):
parameter, section = self.get_parameter_and_section(path)
parameter.value = value
......@@ -59,7 +87,7 @@ class Root(object):
keyword, section = self.get_keyword_and_section(path)
if keyword:
if keyword.value is not None:
return keyword.value
return keyword.get_value()
if section.accessed:
return keyword.default_value
......@@ -81,6 +109,11 @@ class Root(object):
if keyword:
return keyword.default_unit
def get_unit(self, path):
keyword, section = self.get_keyword_and_section(path)
if keyword:
return keyword.get_unit()
def get_parameter_and_section(self, path):
section = self.get_section(path)
parameter = section.parameter
......@@ -94,31 +127,54 @@ class Root(object):
elif section and section.accessed:
return parameter.lone_value
# def get_parameter_lone(self, path):
# parameter = self.get_parameter_object(path)
# return parameter.lone_value
# def get_parameter_default(self, path):
# parameter = self.get_parameter_object(path)
# return parameter.default_value
class Keyword(object):
"""Information about a keyword in a CP2K calculation.
def __init__(self, default_name, default_value, default_unit_value):
self.value = None
self.unit = None
self.value_no_unit = None
self.default_name = default_name
self.default_value = default_value
self.default_unit = default_unit_value
def get_value():
def get_value(self):
"""If the units of this value can be changed, return a value and the
unit separately.
if self.default_unit:
if not self.value_no_unit:
return self.value_no_unit
return self.value
def get_unit(self):
if self.default_unit:
if not self.unit:
return self.unit
logger.error("The keyword '{}' does not have a unit.".format(self.default_name))
def decode_cp2k_unit_and_value(self):
"""Given a CP2K unit name, decode it as Pint unit definition.
splitted = self.value.split(None, 1)
unit_definition = splitted[0]
if unit_definition.startswith('[') and unit_definition.endswith(']'):
unit_definition = unit_definition[1:-1]
self.unit = CP2KInput.decode_cp2k_unit(self.default_unit)
self.value_no_unit = splitted[1]
elif self.default_unit:
logger.debug("No special unit definition found, returning default unit.")
self.unit = CP2KInput.decode_cp2k_unit(self.default_unit)
self.value_no_unit = self.value
logger.debug("The value has no unit, returning bare value.")
self.value_no_unit = self.value
......@@ -91,7 +91,7 @@ def recursive_tree_generation(xml_element):
# Run main function by default
if __name__ == "__main__":
xml_file = open("./cp2k_262/cp2k_input.xml", 'r')
object_tree = Root(generate_object_tree(xml_file))
object_tree = CP2KInput(generate_object_tree(xml_file))
file_name = "./cp2k_262/cp2k_input_tree.pickle"
fh = open(file_name, "wb")
pickle.dump(object_tree, fh, protocol=2)
......@@ -4,6 +4,7 @@ import logging
import numpy as np
logger = logging.getLogger(__name__)
from abc import ABCMeta, abstractmethod
from nomadcore.unit_conversion.unit_conversion import convert_unit
from enum import Enum
import sys
......@@ -170,7 +171,6 @@ class NomadParser(object):
manual: You use the backend explicitly and do the unit
conversion and JSON formatting "manually". In this case the
function should return 'None'.
......@@ -293,7 +293,7 @@ class NomadParser(object):
# Convert to SI units if unit has been specified
if unit is not None:
value = self.to_SI(value, unit)
value = convert_unit(value, unit)
if len(shape) != 0:
self.backend.addArrayValues(name, value)
......@@ -310,34 +310,62 @@ class NomadParser(object):
return value
def to_SI(self, value, unit):
"""If units have been defined to the result, convert the units to SI.
# Do the conversion to SI units based on the given units and type
value = value * unit
converted = value.to_base_units()
return converted.magnitude
def setup_file_id(self, path, file_id):
"""Used to map a simple identifier string to a file path. When a file
id has been setup, you can easily access the file by using the
functions get_file_handle() or get_file_contents()
if path in self.files:
self.file_ids[file_id] = path
value = self.file_ids.get(file_id)
if value:
pathlist = []
self.file_ids[file_id] = pathlist
logger.error("Trying to setup an id for an undefined path. See that the path was written correctly and it was given in the files attribute of the JSON string.")
def get_filepath_by_id(self, file_id):
"""Get the file paths that were registered with the given id.
value = self.file_ids.get(file_id)
if value:
if isinstance(value, list):
n = len(value)
if n == 1:
return value[0]
elif n == 0:
logger.warning("No files set with id '{}'".format(file_id))
return None
logger.debug("Multiple files set with id '{}'".format(file_id))
return value
logger.warning("No files set with id '{}'".format(file_id))
def get_file_handle(self, file_id):
"""Get the handle for a file with the given id. Uses cached result
"""Get the handle for a single file with the given id. Uses cached result
if available. Always seeks to beginning of file before returning it.
handle = self._file_handles.get(file_id)
if not handle:
path = self.file_ids.get(file_id)
if not path:
logger.warning("The file with id '{}' could not be found. You have either not registered this id, or the parser was not given files with this extension.".format(file_id))
# Get the filepath(s)
path = self.get_filepath_by_id(file_id)
if not path:
logger.warning("No filepaths registered to id '{}'. Register id's with setup_file_id().".format(file_id))
if isinstance(path, list):
if len(path) == 0:
elif len(path) != 1:
logger.error("Multiple filepaths found with id '{}'. Use get_file_handles() instead if you expect to have multiple files.".format(file_id))
path = path[0]
# Search for filehandles, if not present create one
handle = self._file_handles.get(path)
if not handle:
handle = open(path, "r")
except (OSError, IOError):
......@@ -347,6 +375,37 @@ class NomadParser(object):, os.SEEK_SET)
return handle
def get_file_handles(self, file_id):
"""Get the handles for multiple files with the given id. Uses cached result
if available. Always seeks to beginning of files before returning them.
# Get the filepath(s)
paths = self.get_filepath_by_id(file_id)
if not paths:
if not isinstance(paths, list):
paths = [paths]
# Search for filehandles, if not present create one
handles = []
for path in paths:
handle = self._file_handles.get(path)
if not handle:
handle = open(path, "r")
except (OSError, IOError):
logger.error("Could not open file: '{}'".format(path))
self._file_handles[file_id] = handle, os.SEEK_SET)
# Return handles
if len(handles) == 0:
return None
return handles
def get_file_contents(self, file_id):
"""Get the contents for the file with the given id. Uses cached result
if available. Does not cache files that are bigger than a certain
......@@ -16,6 +16,7 @@ def scan_path_for_files(path):
files = {}
for filename in os.listdir(path):
import os
import re
import re2 as re
from cp2kparser.generics.nomadparser import NomadParser, Result, ResultCode
from cp2kparser.implementation.regexs import *
from cp2kparser.engines.regexengine import RegexEngine
from cp2kparser.engines.csvengine import CSVEngine
from cp2kparser.engines.cp2kinputengine import CP2KInputEngine
from cp2kparser.engines.xmlengine import XMLEngine
from cp2kparser.engines.atomsengine import AtomsEngine
from nomadcore.coordinate_reader import CoordinateReader
from nomadcore.unit_conversion.unit_conversion import convert_unit, ureg
from cp2kparser.engines.cp2kinputenginedata.input_tree import CP2KInput
import numpy as np
import logging
import sys
logger = logging.getLogger(__name__)
from cp2kparser import ureg
import math
......@@ -33,15 +35,17 @@ class CP2KParser(NomadParser):
self.regexengine = RegexEngine(self)
self.xmlengine = XMLEngine(self)
self.inputengine = CP2KInputEngine()
self.atomsengine = AtomsEngine()
self.atomsengine = CoordinateReader()
self.version_number = None
self.implementation = None
self.input_tree = None
self.regexs = None
self.extended_input = None
# Use some convenient functions from base
......@@ -65,7 +69,7 @@ class CP2KParser(NomadParser):
version_regex = re.compile(r"CP2K\|\ version\ string:\s+CP2K\ version\ (\d+\.\d+\.\d+)\n")
self.version_number =[0].replace('.', '')
self.input_tree = self.inputengine.parse(self.get_file_contents("input"))
self.input_tree = self.inputengine.parse(self.extended_input)
version_name = '_' + self.version_number + '_'
# Search for a version specific regex class
......@@ -103,15 +107,116 @@ class CP2KParser(NomadParser):
buffer =
return buffer
def input_preprocessor(self):
"""Preprocess the input file. Concatenate separate files into one and
explicitly state all variables.
# Merge include files to input
include_files = self.get_file_handles("include")
input_file = self.get_file_contents("input")
input_lines = input_file.split("\n")
extended_input = input_lines[:] # Make a copy
if include_files:
i_line = 0
for line in input_lines:
line = line.strip()
if line.startswith("@INCLUDE") or line.startswith("@include"):
split = line.split(None, 1)
filename = split[1]
if filename.startswith(('\"', '\'')) and filename.endswith(('\"', '\'')):
filename = filename[1:-1]
filepath = self.search_file(filename)
# Get the content from include file
for handle in include_files:
name =
if name == filepath:
contents =
contents = contents.split('\n')
del extended_input[i_line]
extended_input[i_line:i_line] = contents
i_line += len(contents)
i_line += 1
# Gather the variable definitions
variables = {}
input_set_removed = []
for i_line, line in enumerate(extended_input):
if line.startswith("@SET") or line.startswith("@set"):
components = line.split(None, 2)
name = components[1]
value = components[2]
variables[name] = value
logger.debug("Variable '{}' found with value '{}'".format(name, value))
# print '\n'.join(input_set_removed)
# Place the variables
variable_pattern = r"\@\{(\w+)\}|@(\w+)"
compiled = re.compile(variable_pattern)
reserved = ("include", "set", "if", "endif")
input_variables_replaced = []
for line in input_set_removed:
results = compiled.finditer(line)
new_line = line
offset = 0
for result in results:
options = result.groups()
first = options[0]
second = options[1]
if first:
name = first
elif second:
name = second
if name in reserved:
value = variables.get(name)
if not value:
logger.error("Value for variable '{}' not set.".format(name))
len_value = len(value)
len_name = len(name)
start = result.start()
end = result.end()
beginning = new_line[:offset+start]
rest = new_line[offset+end:]
new_line = beginning + value + rest
offset += len_value - len_name - 1
# print offset
# print "BEG:" + beginning
# print "VALUE:" + value
# print "REST: " + rest
# print new_line
self.extended_input = '\n'.join(input_variables_replaced)
# print self.extended_input
def determine_file_ids_pre_setup(self):
"""First resolve the files that can be identified by extension.
# Input and output files
for file_path in self.files.iterkeys():
if file_path.endswith(".inp"):
self.setup_file_id(file_path, "input")
if file_path.endswith(".out"):
self.setup_file_id(file_path, "output")
# Include files
input_file = self.get_file_contents("input")
for line in input_file.split("\n"):
line = line.strip()
if line.startswith("@INCLUDE") or line.startswith("@include"):
split = line.split(None, 1)
filename = split[1]
if filename.startswith(('\"', '\'')) and filename.endswith(('\"', '\'')):
filename = filename[1:-1]
filepath = self.search_file(filename)
self.setup_file_id(filepath, "include")
def determine_file_ids_post_setup(self):
"""Inherited from NomadParser.
......@@ -134,8 +239,7 @@ class CP2KParser(NomadParser):
# Check against the given files
file_path = self.search_file(force_path)
self.file_ids["forces"] = file_path
self.setup_file_id(file_path, "forces")
# Determine the presence of an initial coordinate file
init_coord_file = self.input_tree.get_keyword("FORCE_EVAL/SUBSYS/TOPOLOGY/COORD_FILE_NAME")
......@@ -161,15 +265,20 @@ class CP2KParser(NomadParser):
file_path = self.search_file(normalized_path)
self.setup_file_id(file_path, "trajectory")
# Determine the presence of a cell file
# Determine the presence of a cell output file
cell_motion_file = self.input_tree.get_keyword("MOTION/PRINT/CELL/FILENAME")
if cell_motion_file is not None:
logger.debug("Cell file found.")
extension = "cell"
normalized_path = self.normalize_cp2k_path(cell_motion_file, extension)
print normalized_path
file_path = self.search_file(normalized_path)
self.setup_file_id(file_path, "cell")
self.setup_file_id(file_path, "cell_output")
# Determine the presence of a cell input file
cell_input_file = self.input_tree.get_keyword("FORCE_EVAL/SUBSYS/CELL/CELL_FILE_NAME")
if cell_input_file is not None:
file_path = self.search_file(cell_input_file)
self.setup_file_id(file_path, "cell_input")
def normalize_cp2k_path(self, path, extension, name=""):
if name:
......@@ -184,13 +293,12 @@ class CP2KParser(NomadParser):
normalized_path = "{}-{}{}-1.{}".format(project_name, path, name, extension)
return normalized_path
def search_file(self, path):
"""Searches the list of given files for a file that is defined in the
CP2K input file.
First compares the basenames, and if multiple matches found descends
the path until only one or zero matches found.
the path until only only one or zero matches found.
matches = {}
resolvable = [x for x in self.files.iterkeys() if x not in self.file_ids.itervalues()]
......@@ -226,9 +334,7 @@ class CP2KParser(NomadParser):
if path != "":