Commit eccf31ab authored by Lauri Himanen's avatar Lauri Himanen
Browse files

Started writing implementation for reading trajectories.

parent 17208913
......@@ -122,5 +122,30 @@ existing profiling tools such as
[cProfile](https://docs.python.org/2/library/profile.html#module-cProfile)
which you can plug into your scripts very easily.
# Manual for uploading a CP2K calculation
All the files that are needed to run the calculation should be included in the
upload, including the basis set and potential files. The folder structure does
not matter, as the whole directory is searched for relevant files.
Although CP2K often doesn't care about the file extensions, using them enables
the parser to automatically identify the files and makes it perform better
(only needs to decompress part of files in HDF5). Please use these default file
extensions:
- Output file: .out
- Input file: .inp
- XYZ coordinate files: .xyz
- Protein Data Bank files: .pdb
- Crystallographic Information Files: .cif
# Notes for CP2K developers
Here is a list of features/fixes that would make the parsing of CP2K results
easier:
- Include the number of simulated atoms in the output file. I could
not find a way to easily determine the number of atoms for all run types (MD,
GEO_OPT,...). Currently the number of atoms is deduced by counting the atoms
in the initial coordinate file/COORD section.
- The pdb trajectory output doesn't seem to conform to the actual standard as
the different configurations are separated by the END keyword which is
supposed to be written only once in the file. The [format specification](http://www.wwpdb.org/documentation/file-format)
states that different configurations should start with MODEL and end with
ENDMDL tags.
......@@ -11,20 +11,30 @@ class AtomsEngine(object):
Supports the following file formats:
- xyz (.xyz):
- cif (.cif): Crystallographic Information File
- pdb (.pdb): Protein Data Bank
- cp2k-pdb (.pdb): Protein Data Bank file written by CP2K, the format
is a bit peculiar so a custom implementation is used
Reading is primarily done by ASE or MDAnalysis, but in some cases own
implementation has to be made.
implementation had to be made.
"""
def __init__(self, parser):
    """
    Args:
        parser: A NomadParser (or subclass) instance. Gives access to
            e.g. the unified file-reading methods.
    """
    self.parser = parser
def determine_tool(self, format):
ASE = "ASE"
custom = "custom"
# MDAnalysis = "MDAnalysis"
# custom = "custom"
formats = {
"xyz": ASE,
"cif": ASE,
"pdb": ASE,
"cp2k-pdb": custom,
}
result = formats.get(format)
if result:
......@@ -32,7 +42,7 @@ class AtomsEngine(object):
else:
logger.warning("The format '{}' is not supported by AtomsEngine.".format(format))
def parse_n_atoms(self, contents, format):
def n_atoms(self, contents, format):
"""Read the first configuration of the coordinate file to extract the
number of atoms in it.
"""
......@@ -49,3 +59,22 @@ class AtomsEngine(object):
n_atoms = len(u.atoms)
return n_atoms
def iread(self, contents, format, index=0):
    """Return an iterator over the configurations in a trajectory file.

    Yields one configuration at a time, which is good for e.g. streaming
    the contents to disc without loading the whole file into memory.
    """
    tool = self.determine_tool(format)
    # NOTE(review): reading the ASE source suggests that ase.io.iread
    # actually loads the entire file into memory and then yields the
    # configurations from it. Should be checked at some point.
    if tool == "ASE":
        return ase.io.iread(contents, None, format)
    if tool == "custom" and format == "cp2k-pdb":
        return self.parser.csvengine.iread(
            contents,
            columns=[3, 4, 5],
            comments=["TITLE", "AUTHOR", "REMARK", "CRYST"],
            separator="END")
......@@ -30,15 +30,24 @@ class CSVEngine(object):
the contents into different configurations which will be separated by a
line that matches the separator.
"""
def __init__(self, parser):
    """
    Args:
        parser: Instance of a NomadParser or its subclass. Allows
            access to e.g. unified file reading methods.
    """
    self.parser = parser
def parse(self, contents, columns, delimiter=r"\s+", comments=r"#", separator=r"^\d+$"):
def iread(self, contents, columns, delimiter=r"\s+", comments=r"#", separator=None):
"""Used to iterate a CSV-like file. If a separator is provided the file
is iterated one configuration at a time. Only keeps one configuration
of the file in memory. If no separator is given, the whole file will be
handled.
The contents are separated into configurations whenever the separator
regex is encountered on a line.
"""
def split_line(line):
"""Chop off comments, strip, and split at delimiter.
......@@ -65,31 +74,29 @@ class CSVEngine(object):
if isinstance(contents, (str, unicode)):
contents = StringIO(unicode(contents))
# Compile the comments to regex objects
# Precompile the different regexs before looping
if comments:
comments = (re.escape(comment) for comment in comments)
compiled_comments = re.compile('|'.join(comments))
#Compile the separator
if separator:
compiled_separator = re.compile(separator)
#Compile the delimiter
compiled_delimiter = re.compile(delimiter)
# Colums as list
# Columns as list
if columns is not None:
columns = list(columns)
# Start iterating
all_forces = []
conf_forces = []
for line in contents:
configuration = []
for line in contents: # This actually reads line by line and only keeps the current line in memory
# If separator encountered, yield the stored configuration
if is_separator(line):
if conf_forces:
all_forces.append(conf_forces)
conf_forces = []
if configuration:
yield np.array(configuration)
configuration = []
else:
# Ignore comments, separate by delimiter
vals = split_line(line)
line_forces = []
if vals:
......@@ -106,61 +113,9 @@ class CSVEngine(object):
return
else:
line_forces.append(value)
conf_forces.append(line_forces)
if conf_forces:
all_forces.append(conf_forces)
# If any forces found, return them as numpy array. Otherwise return None.
if all_forces:
all_forces = np.array(all_forces)
return all_forces
else:
return None
# SLOWER OLD VERSION
# def parse_numpy(self, file_handle, columns, comments):
# """Parses floating point numbers from the given file using the given
# columns.
# The file handle should be opened and closed somewhere else. The columns
# are used to extract only certain components form each line.
# Returns:
# A numpy array of floating point numbers.
# """
# # If string or unicode provided, create stream
# if isinstance(file_handle, (str, unicode)):
# file_handle = StringIO(unicode(file_handle))
# converters = {}
# for column in columns:
# converters[column] = float
# result = np.loadtxt(file_handle, dtype=np.float64, comments=comments, usecols=columns, converters=converters)
# return result
# def parse(self, contents, columns, comments, separator=None):
# """Parse data from a file or string containing XYZ data.
# If a separator pattern is provided, the contents are divided into parts
# separated by the pattern. Each of these parts is handled as a separate
# configuration.
# """
# # If string or unicode provided, create stream
# if isinstance(contents, (str, unicode)):
# contents = StringIO(unicode(string))
# # If separator provided, get contents one block at a time
# if separator is not None:
# generator = block_generator(contents, separator)
# forces = []
# for block in generator:
# if block is not None and not block.isspace():
# array = self.parse_numpy(block, columns, comments)
# if array.size != 0:
# forces.append(array)
# forces = np.dstack(forces)
# else:
# forces = self.parse_numpy(contents, columns, comments)
configuration.append(line_forces)
# return forces
# The last configuration is yielded even if separator is not present at
# the end of file or is not given at all
if configuration:
yield np.array(configuration)
......@@ -72,11 +72,14 @@ class NomadParser(object):
def get_file_handle(self, file_id):
"""Get the handle for a file with the given id. Uses cached result
if available.
if available. Always seeks to beginning of file before returning it.
"""
handle = self.file_handles.get(file_id)
if not handle:
path = self.file_ids[file_id]
path = self.file_ids.get(file_id)
if not path:
logger.warning("The file with id '{}' could not be found. Probable reason for this is that the parser was not given files with this extension.")
return
try:
handle = open(path, "r")
except (OSError, IOError):
......
......@@ -32,7 +32,7 @@ class CP2KParser(NomadParser):
self.regexengine = RegexEngine(self)
self.xmlengine = XMLEngine(self)
self.inputengine = CP2KInputEngine()
self.atomsengine = AtomsEngine()
self.atomsengine = AtomsEngine(self)
self.input_tree = None
self.regexs = None
......@@ -144,11 +144,27 @@ class CP2KParser(NomadParser):
self.file_ids["initial_coordinates"] = file_path
self.get_file_handle("initial_coordinates")
# for file_path in self.resolvable:
# tail = os.path.basename(file_path)
# if force_path is not None and tail == force_path:
# self.file_ids["forces"] = file_path
# self.get_file_handle("forces")
# Determine the presence of a trajectory file
traj_file = self.input_tree.get_keyword("MOTION/PRINT/TRAJECTORY/FILENAME")
if traj_file is not None:
logger.debug("Trajectory file found.")
normalized_path = self.normalize_cp2k_traj_path(traj_file)
file_path = self.search_file(normalized_path)
self.file_ids["trajectory"] = file_path
self.get_file_handle("trajectory")
def normalize_cp2k_traj_path(self, path):
    """Resolve the CP2K MOTION/PRINT/TRAJECTORY/FILENAME keyword into the
    file name that CP2K actually writes.

    The FILENAME keyword has three forms:
      - "=name": the text after '=' is used verbatim as the file name.
      - "./name": used as the root name; the "-pos-1.<ext>" suffix is appended.
      - plain "name": the project name is prepended and "-pos-1.<ext>" appended.

    Args:
        path: The raw FILENAME keyword value from the input tree.

    Returns:
        The normalized trajectory file name as a string.

    Raises:
        KeyError: If the trajectory FORMAT is not one of the supported
            formats (currently only "PDB").
    """
    logger.debug("Normalizing trajectory path")
    project_name = self.input_tree.get_keyword("GLOBAL/PROJECT_NAME")
    file_format = self.input_tree.get_keyword("MOTION/PRINT/TRAJECTORY/FORMAT")
    # Only PDB trajectories are supported so far; any other format raises.
    extension = {"PDB": "pdb"}[file_format]
    if path.startswith("="):
        normalized_path = path[1:]
    elif path.startswith("./"):
        # BUG FIX: the previous check was re.match(r"./", path); the
        # unescaped '.' matched any character, so the test fired for any
        # path whose second character was '/' instead of a literal "./".
        normalized_path = "{}-pos-1.{}".format(path, extension)
    else:
        normalized_path = "{}-{}-pos-1.{}".format(project_name, path, extension)
    return normalized_path
def search_file(self, path):
"""Searches the list of given files for a file that is defined in the
......@@ -342,8 +358,8 @@ class CP2KImplementation(object):
i_conf = 0
force_array = None
for force_conf in forces:
i_force_array = self.csvengine.parse(force_conf, columns=(-3, -2, -1), comments=("#", "ATOMIC", "SUM"), separator=None)
i_force_array = i_force_array[0]
iterator = self.csvengine.iread(force_conf, columns=(-3, -2, -1), comments=("#", "ATOMIC", "SUM"), separator=None)
i_force_array = iterator.next()
# Initialize the numpy array if not done yet
n_particles = i_force_array.shape[0]
......@@ -358,7 +374,12 @@ class CP2KImplementation(object):
return result
else:
logger.debug("Looking for forces in separate force file.")
forces = self.csvengine.parse(self.parser.get_file_handle("forces"), columns=(-3, -2, -1), comments=("#", "ATOMIC", "SUM"), separator=r"\ ATOMIC FORCES in \[a\.u\.\]")
iterator = self.csvengine.iread(self.parser.get_file_handle("forces"), columns=(-3, -2, -1), comments=("#", "SUM"), separator=r"\ ATOMIC FORCES in \[a\.u\.\]")
forces = []
for configuration in iterator:
forces.append(configuration)
forces = np.array(forces)
if forces is None:
msg = "No force configurations were found when searching an external XYZ force file."
logger.warning(msg)
......@@ -366,7 +387,8 @@ class CP2KImplementation(object):
result.code = ResultCode.fail
return result
else:
result.value = forces
if len(forces) != 0:
result.value = forces
return result
def _Q_particle_number(self):
......@@ -395,7 +417,7 @@ class CP2KImplementation(object):
coords.strip()
n_particles = coords.count("\n")
result.value = factor*n_particles
elif coord_format in ["CP2K", "G96", "XTL"]:
elif coord_format in ["CP2K", "G96", "XTL", "CRD"]:
msg = "Tried to read the number of atoms from the initial configuration, but the parser does not yet support the '{}' format that is used by file '{}'.".format(coord_format, self.parser.file_ids["initial_coordinates"])
logger.warning(msg)
result.error_message = msg
......@@ -404,15 +426,34 @@ class CP2KImplementation(object):
# External file, use AtomsEngine
init_coord_file = self.parser.get_file_handle("initial_coordinates")
if coord_format == "XYZ":
n_particles = self.atomsengine.parse_n_atoms(init_coord_file, format="xyz")
n_particles = self.atomsengine.n_atoms(init_coord_file, format="xyz")
if coord_format == "CIF":
n_particles = self.atomsengine.parse_n_atoms(init_coord_file, format="cif")
n_particles = self.atomsengine.n_atoms(init_coord_file, format="cif")
if coord_format == "PDB":
n_particles = self.atomsengine.parse_n_atoms(init_coord_file, format="pdb")
n_particles = self.atomsengine.n_atoms(init_coord_file, format="pdb")
result.value = factor*n_particles
return result
def _Q_particle_position(self):
    """Return the particle positions (trajectory) as a Result.

    Currently materializes the whole trajectory as one big array, which is
    not good because the trajectory can be very large. Once a streaming
    interface is available the coordinates can instead be streamed to an
    output file one configuration at a time.
    """
    result = Result()
    # Iterate the trajectory file and collect every configuration.
    traj_handle = self.parser.get_file_handle("trajectory")
    configurations = [conf for conf in self.atomsengine.iread(traj_handle, format="cp2k-pdb")]
    result.value = np.array(configurations)
    return result
#===============================================================================
class CP2K_262_Implementation(CP2KImplementation):
......
......@@ -134,6 +134,20 @@ class TestParticleNumber(unittest.TestCase):
n = parser.get_quantity_unformatted("particle_number").value
self.assertEqual(n, 2)
#===============================================================================
class TestTrajectory(unittest.TestCase):
    """Tests for reading particle positions from a trajectory file."""

    def test_filenames_bare(self):
        # A bare FILENAME value in the input should still resolve to the
        # trajectory file and yield the full (n_conf, n_particles, n_dim) array.
        parser = getparser("trajectory/filenames/bare")
        positions = parser.get_quantity_unformatted("particle_position").value
        n_conf, n_particles, n_dim = positions.shape
        self.assertEqual(n_conf, 11)
        self.assertEqual(n_particles, 2)
        self.assertEqual(n_dim, 3)
if __name__ == '__main__':
logger = logging.getLogger("cp2kparser")
logger.setLevel(logging.ERROR)
......@@ -143,6 +157,7 @@ if __name__ == '__main__':
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestForces))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestParticleNumber))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestFunctionals))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestTrajectory))
alltests = unittest.TestSuite(suites)
unittest.TextTestRunner(verbosity=0).run(alltests)
......
# Step Nr. Time[fs] Kin.[a.u.] Temp[K] Pot.[a.u.] Cons Qty[a.u.] UsedTime[s]
0 0.000000 0.001425067 300.000000000 -13.936448531 -13.934548442 0.000000000
1 0.100000 0.002507632 527.897770684 -13.937533661 -13.934548860 0.604384274
2 0.200000 0.006293850 1324.958830212 -13.941326201 -13.934550366 0.184237053
3 0.300000 0.012755221 2685.183624409 -13.947800632 -13.934552926 0.190719346
4 0.400000 0.021846287 4599.002336168 -13.956914640 -13.934556497 0.212849671
5 0.500000 0.033504957 7053.343980435 -13.968609623 -13.934560995 0.186353471
6 0.600000 0.047652921 10031.722995929 -13.982811502 -13.934566381 0.127557738
7 0.700000 0.064196119 13514.337834696 -13.999431494 -13.934572600 0.167711532
8 0.800000 0.083025390 17478.208941311 -14.018367225 -13.934579587 0.185877595
9 0.900000 0.104017180 21897.325624978 -14.039503885 -13.934587255 0.173664631
10 1.000000 0.127034217 26742.790133204 -14.062715402 -13.934595475 0.222027603
# Version information for this restart file
# current date 2015-11-25 12:12:38.263
# current working dir /home/lauri/Dropbox/nomad-dev/gitlab/parser-cp2k/cp2kparser/tests/cp2k_2.6.2/trajectory/filenames/bare
# Program compiled at ke 4.11.2015 08.48.42 +0200
# Program compiled on lauri-Lenovo-Z50-70
# Program compiled for Linux-x86-64-gfortran_basic
# Source code revision number svn:15893
&GLOBAL
PRINT_LEVEL LOW
PROJECT_NAME NaCl
RUN_TYPE MD
&END GLOBAL
&MOTION
&MD
ENSEMBLE NVT
STEPS 10
TIMESTEP 9.9999999999999992E-02
STEP_START_VAL 10
TIME_START_VAL 9.9999999999999989E-01
ECONS_START_VAL -1.3934548442284184E+01
TEMPERATURE 3.0000000000000000E+02
&THERMOSTAT
&NOSE
LENGTH 3
YOSHIDA 3
TIMECON 9.9999999999999986E+01
MTS 2
&COORD
6.2120513073037316E-03 -3.1829858298434797E-03 -8.2147280702171048E-04
&END COORD
&VELOCITY
2.0470489274881129E-04 -7.6593567916822758E-05 -2.0957498650578900E-05
&END VELOCITY
&MASS
4.8711897697712084E+04 1.6237299232570693E+04 1.6237299232570693E+04
&END MASS
&FORCE
5.1572267230972504E-06 6.7202265495801951E-08 -5.2643439203876127E-08
&END FORCE
&END NOSE
&END THERMOSTAT
&AVERAGES T
&RESTART_AVERAGES
ITIMES_START 1
AVECPU 2.2553829139997106E-01
AVEHUGONIOT 0.0000000000000000E+00
AVETEMP_BARO 0.0000000000000000E+00
AVEPOT -1.3985501426499425E+01
AVEKIN 5.0283377453672257E-02
AVETEMP 1.0585477207202792E+04
AVEKIN_QM 0.0000000000000000E+00
AVETEMP_QM 0.0000000000000000E+00
AVEVOL 1.4576402699474393E+03
AVECELL_A 1.1338356797313857E+01
AVECELL_B 1.1338356797313857E+01
AVECELL_C 1.1338356797313857E+01
AVEALPHA 9.0000000000000000E+01
AVEBETA 9.0000000000000000E+01
AVEGAMMA 9.0000000000000000E+01
AVE_ECONS -2.9383245550040318E+00
AVE_PRESS 0.0000000000000000E+00
AVE_PXX 0.0000000000000000E+00
&END RESTART_AVERAGES
&END AVERAGES
&END MD
&PRINT
&TRAJECTORY SILENT
FILENAME traj
FORMAT PDB
&END TRAJECTORY
&END PRINT
&END MOTION
&FORCE_EVAL
METHOD QS
&DFT
BASIS_SET_FILE_NAME ../../../data/BASIS_SET
POTENTIAL_FILE_NAME ../../../data/POTENTIAL
&SCF
EPS_SCF 1.0000000000000000E-04
SCF_GUESS ATOMIC
&END SCF
&QS
EPS_DEFAULT 9.9999999999999995E-07
&END QS
&MGRID
CUTOFF 5.0000000000000000E+01
&END MGRID
&XC
DENSITY_CUTOFF 1.0000000000000000E-10
GRADIENT_CUTOFF 1.0000000000000000E-10
TAU_CUTOFF 1.0000000000000000E-10
&XC_FUNCTIONAL NO_SHORTCUT
&PADE T
&END PADE
&END XC_FUNCTIONAL
&END XC
&END DFT
&SUBSYS
&CELL
A 6.0000000000000000E+00 0.0000000000000000E+00 0.0000000000000000E+00
B 0.0000000000000000E+00 6.0000000000000000E+00 0.0000000000000000E+00
C 0.0000000000000000E+00 0.0000000000000000E+00 6.0000000000000000E+00
MULTIPLE_UNIT_CELL 1 1 1
&END CELL
&COORD
Na 4.1544404708899656E-03 1.7482111078164324E-02 -7.7066972632834380E-02
Cl -2.6969121147277422E-03 -7.6857221705712342E-01 5.2809721688312083E-01
&END COORD
&VELOCITY
1.9367109963415344E-04 1.5472469982694170E-03 -1.1022846655765221E-03
-1.2599411815314760E-04 -1.0126765236960966E-03 7.2478636512292196E-04
&END VELOCITY
&KIND Na
BASIS_SET DZVP-GTH-PADE
POTENTIAL GTH-PADE-q1
&BASIS
2
2 0 1 6 3 3
0.1766996857200000E+02 -0.1362341108000000E+00 0.2709525950000000E-01 0.0000000000000000E+00 0.7919629550000000E-01 0.0000000000000000E+00 0.0000000000000000E+00
0.6907180338000000E+01 -0.6458818040000000E-01 0.9874301500000000E-02 0.0000000000000000E+00 0.2184099565000000E+00 0.0000000000000000E+00 0.0000000000000000E+00
0.2609908538000000E+01 0.3866064317000000E+00 -0.7371482500000000E-01 0.0000000000000000E+00 0.3764868181000000E+00 0.0000000000000000E+00 0.0000000000000000E+00
0.9649711690000000E+00 0.5971408467000000E+00 -0.2186736173000000E+00 0.0000000000000000E+00 0.3922334703000000E+00 0.0000000000000000E+00 0.0000000000000000E+00
0.3400394180000000E+00 0.1756759463000000E+00 -0.3011093430000000E-01 0.0000000000000000E+00 0.1808450956000000E+00 0.1000000000000000E+01 0.0000000000000000E+00
0.4324867760000000E-01 -0.1710460900000000E-02 0.1038887373500000E+01 0.1000000000000000E+01 0.5769892900000000E-02 0.0000000000000000E+00 0.1000000000000000E+01
3 2 2 1 1
0.1169000000000000E+00 0.1000000000000000E+01
# Basis set name: DZVP-GTH-PADE for symbol: NA
# Basis set read from the basis set filename: ../../../data/BASIS_SET
&END BASIS
&POTENTIAL
1
0.8855093800000000E+00 1 -0.1238867130000000E+01
2
0.6611039000000000E+00 2 0.1847271350000000E+01 -0.2254090300000000E+00
0.5820036200000001E+00
0.8571192800000000E+00 1 0.4711325800000000E+00
# Potential name: GTH-PADE-Q1 for symbol: NA
# Potential read from the potential filename: ../../../data/POTENTIAL
&END POTENTIAL
&END KIND
&KIND Cl
BASIS_SET DZVP-GTH-PADE
POTENTIAL GTH-PADE-q7
&BASIS
2
3 0 1 4 2 2
0.2232175982800000E+01 0.3787841491000000E+00 0.0000000000000000E+00 0.1555383208000000E+00 0.0000000000000000E+00
0.1378773629000000E+01 -0.1239706468000000E+00 0.0000000000000000E+00 -0.2931737153000000E+00 0.0000000000000000E+00
0.4264909846000000E+00 -0.8298219996000000E+00 0.0000000000000000E+00 -0.5781090940000000E+00 0.0000000000000000E+00
0.1366672601000000E+00 -0.3175034120000000E+00 0.1000000000000000E+01 -0.3901119426000000E+00 0.1000000000000000E+01
3 2 2 1 1
0.7500000000000000E+00 0.1000000000000000E+01
# Basis set name: DZVP-GTH-PADE for symbol: CL
# Basis set read from the basis set filename: ../../../data/BASIS_SET
&END BASIS
&POTENTIAL
2 5
0.4100000000000000E+00 1 -0.6864754310000000E+01
2
0.3382083200000000E+00 2 0.9062239679999999E+01 -0.1961930360000000E+01
0.5065682400000000E+01
0.3761370900000000E+00 1 0.4465876400000000E+01
# Potential name: GTH-PADE-Q7 for symbol: CL
# Potential read from the potential filename: ../../../data/POTENTIAL
&END POTENTIAL
&END KIND
&TOPOLOGY
NUMBER_OF_ATOMS 2
MULTIPLE_UNIT_CELL 1 1 1
&END TOPOLOGY
&END SUBSYS
&END FORCE_EVAL
TITLE PDB file created by CP2K version 2.6.2 (revision svn:15893)
AUTHOR lauri@lauri-Lenovo-Z50-70 2015-11-25 12:12:35
REMARK Step 0, time = .000, E = -13.9364485315
CRYST1 6.000 6.000 6.000 90.00 90.00 90.00
ATOM 1 Na 0.000 0.000 -0.066 0.00 0.00 Na
ATOM 2 Cl 0.000 -0.757 0.521 0.00 0.00 Cl
END
REMARK Step 1, time = .100, E = -13.9375336611
CRYST1 6.000 6.000 6.000 90.00 90.00 90.00
ATOM 1 Na 0.000 0.000 -0.066 0.00 0.00 Na
ATOM 2 Cl -0.000 -0.757 0.521 0.00 0.00 Cl
END
REMARK Step 2, time = .200, E = -13.9413262006
CRYST1 6.000 6.000 6.000 90.00 90.00 90.00
ATOM 1 Na 0.001 0.001 -0.066 0.00 0.00 Na