Commit da96ccfb authored by Lauri Himanen's avatar Lauri Himanen
Browse files

Moved configuration reading routines to nomadcore.

parent afc75b31
import ase.io
import ase.io.formats
import mdtraj as md
import mdtraj.formats
import numpy as np
import logging
logger = logging.getLogger("nomad")
#===============================================================================
def iread(filename, file_format=None):
    """Generator that reads an atomic configuration file (MD trajectory,
    geometry optimization, static snapshot) one frame at a time.

    Only the xyz positions are returned from the file and no unit conversion
    is done, so you have to be careful with units. The generator pattern
    avoids loading the entire trajectory into memory: MDTraj loads a chunk of
    frames at a time (ASE seems to always read one frame), and individual
    frames are served from that chunk. Once the frames of a chunk have been
    iterated, the chunk is garbage collected and the memory is freed.

    Args:
        filename: String for the file path.
        file_format: String for the file format. If not given the format is
            automatically detected from the extension.

    Yields:
        numpy array containing the atomic positions in one frame.
    """
    # If file format is not explicitly stated, determine the format from the
    # filename extension.
    if file_format is None:
        file_format = filename.split(".")[-1]

    # Try to open the file with MDTraj first. With a brief inspection it seems
    # that MDTraj is better performance wise, because it can iteratively load a
    # "chunk" of frames, and still serve the individual frames one by one. ASE
    # on the other hand will iteratively read frames one by one (unnecessary
    # IO).
    mdtraj_chunk = 100  # How many frames MDTraj will load at once
    mdtraj_failed = False

    # Must use the low level MDTraj API to open files without topology.
    class_format_map = {
        "dcd": mdtraj.formats.DCDTrajectoryFile,
        "xyz": mdtraj.formats.XYZTrajectoryFile,
        "pdb": mdtraj.formats.PDBTrajectoryFile,
    }
    traj_class = class_format_map.get(file_format)
    if traj_class is not None:
        try:
            with traj_class(filename, mode="r") as f:
                empty = False
                while not empty:
                    data = f.read(mdtraj_chunk)
                    # Some readers return a tuple (positions, ...), others
                    # return the positions array directly.
                    if isinstance(data, tuple):
                        positions = data[0]
                    else:
                        positions = data
                    if len(positions) == 0:
                        empty = True
                    else:
                        for pos in positions:
                            yield pos
        except IOError:
            logger.warning("MDTraj could not read the file '{}' with format '{}'. The contents might be malformed or wrong format used.".format(filename, file_format))
            return
    else:
        mdtraj_failed = True

    # If MDTraj didn't support the format, try ASE instead
    if mdtraj_failed:
        try:
            # Probe whether ASE knows this format before starting to iterate;
            # raises ValueError for unknown formats.
            ase.io.formats.get_ioformat(file_format)
        except ValueError:
            # BUG FIX: this branch is reached when ASE (not MDTraj) rejects
            # the format, so the message must mention ASE.
            logger.error("ASE could not read the file '{}' with format '{}'. If ASE is supposed to read this format, the contents might be malformed.".format(filename, file_format))
            return
        else:
            # Return the positions in a numpy array instead of an ASE Atoms
            # object.
            generator = ase.io.iread(filename, format=file_format)
            for atoms in generator:
                pos = atoms.positions
                yield pos
import numpy as np
import logging
logger = logging.getLogger(__name__)
from io import StringIO
import re
#===============================================================================
def iread(filepath, columns, delimiter=r"\s+", comments=r"#", start=None, end=None):
    """Used to iterate a CSV-like file.

    If both a start and an end pattern are provided, the file is iterated one
    configuration at a time: lines between a start match and an end match are
    collected and each configuration is yielded as one 2D array. If neither is
    given, every data line is yielded individually as a 1D array. Only one
    configuration of the file is kept in memory at a time.

    Args:
        filepath: Path to the CSV-like file to be processed.
        columns: List of integers indicating the columns of interest in the
            CSV file.
        delimiter: Regex used to split a line into columns.
        comments: Characters used to identify comments in the file; the rest
            of a line after any of them is ignored.
            # NOTE(review): each character is escaped individually, so this is
            # treated as a set of literal comment characters, not a full regex.
        start: A regex that is used to indicate the start of a new
            configuration.
        end: A regex that is used to indicate the end of a configuration.

    Yields:
        numpy arrays: one 1D array per data line when no start/end patterns
        are given, otherwise one 2D array per configuration.
    """
    # Sentinel returned by _parse_columns on an unrecoverable parse problem;
    # it makes the generator stop, mirroring the original early-return.
    _ABORT = object()

    # Precompile the different regexes before looping
    compiled_delimiter = re.compile(delimiter)
    if comments:
        comment_patterns = (re.escape(comment) for comment in comments)
        compiled_comments = re.compile('|'.join(comment_patterns))
    if end:
        compiled_end = re.compile(end)
    if start:
        compiled_start = re.compile(start)

    def split_line(line):
        """Chop off comments, strip, and split at delimiter.

        Returns None for blank or comment-only lines.
        """
        if line.isspace():
            return None
        if comments:
            line = compiled_comments.split(line, maxsplit=1)[0]
        line = line.strip('\r\n ')
        if line:
            return compiled_delimiter.split(line)
        return None

    def is_end(line):
        """Check if the given line matches the end pattern that closes one
        configuration."""
        if end:
            return compiled_end.search(line)
        return False

    def is_start(line):
        """Check if the given line matches the start pattern that opens a new
        configuration."""
        if start:
            return compiled_start.search(line)
        return False

    def _parse_columns(line):
        """Extract the requested columns of one line as floats.

        Returns None when the line contains no data, _ABORT when the line
        cannot be parsed (bad column index or non-float value), otherwise a
        list of float values.
        """
        vals = split_line(line)
        if not vals:
            return None
        values = []
        for column in columns:
            try:
                value = vals[column]
            except IndexError:
                logger.warning("The given index '{}' could not be found on the line '{}'. The given delimiter or index could be wrong.".format(column, line))
                return _ABORT
            try:
                values.append(float(value))
            except ValueError:
                logger.warning("Could not cast value '{}' to float. Currently only floating point values are accepted".format(value))
                return _ABORT
        return values

    # Columns as list
    if columns is not None:
        columns = list(columns)

    # Start iterating
    configuration = []
    started = False

    # If no starting and ending condition are provided, read configuration by
    # line
    if start is None and end is None:
        with open(filepath, "r") as f:
            for line in f:  # Reads line by line; only the current line is kept in memory
                values = _parse_columns(line)
                if values is _ABORT:
                    return
                if values is not None:
                    yield np.array(values)
    # If starting and ending condition are provided, after starting condition
    # is detected, add the values from lines to a new array that is returned
    # when the end condition is met
    elif start is not None and end is not None:
        with open(filepath, "r") as f:
            for line in f:  # Reads line by line; only the current line is kept in memory
                # Detect the start of a configuration
                if is_start(line):
                    started = True
                    continue
                # If end pattern encountered, yield the stored configuration
                if is_end(line):
                    started = False
                    if configuration:
                        yield np.array(configuration)
                        configuration = []
                elif started:
                    values = _parse_columns(line)
                    if values is _ABORT:
                        return
                    if values is not None:
                        configuration.append(values)

    # The last configuration is yielded even if the end pattern is not present
    # at the end of file or is not given at all
    if configuration:
        yield np.array(configuration)
......@@ -4,9 +4,9 @@ from builtins import next
from builtins import range
from nomadcore.simple_parser import SimpleMatcher as SM
from nomadcore.baseclasses import MainHierarchicalParser
import nomadcore.configurationreading
import nomadcore.csvparsing
from .commonparser import CP2KCommonParser
import cp2kparser.generic.configurationreading
import cp2kparser.generic.csvparsing
from nomadcore.caching_backend import CachingLevel
import logging
logger = logging.getLogger("nomad")
......@@ -258,10 +258,10 @@ class CP2KGeoOptParser(MainHierarchicalParser):
# Use special parsing for CP2K pdb files because they don't follow the proper syntax
if traj_format == "PDB":
self.traj_iterator = cp2kparser.generic.csvparsing.iread(traj_file, columns=[3, 4, 5], start="CRYST", end="END")
self.traj_iterator = nomadcore.csvparsing.iread(traj_file, columns=[3, 4, 5], start="CRYST", end="END")
else:
try:
self.traj_iterator = cp2kparser.generic.configurationreading.iread(traj_file)
self.traj_iterator = nomadcore.configurationreading.iread(traj_file)
except ValueError:
pass
......
......@@ -4,9 +4,9 @@ from builtins import range
import numpy as np
from nomadcore.simple_parser import SimpleMatcher as SM
from nomadcore.baseclasses import MainHierarchicalParser
import nomadcore.configurationreading
import nomadcore.csvparsing
from .commonparser import CP2KCommonParser
import cp2kparser.generic.configurationreading
import cp2kparser.generic.csvparsing
from nomadcore.caching_backend import CachingLevel
from nomadcore.unit_conversion.unit_conversion import convert_unit
import logging
......@@ -192,10 +192,10 @@ class CP2KMDParser(MainHierarchicalParser):
# Use special parsing for CP2K pdb files because they don't follow the proper syntax
if traj_format == "PDB":
self.traj_iterator = cp2kparser.generic.csvparsing.iread(coord_filepath, columns=[3, 4, 5], start="CRYST", end="END")
self.traj_iterator = nomadcore.csvparsing.iread(coord_filepath, columns=[3, 4, 5], start="CRYST", end="END")
else:
try:
self.traj_iterator = cp2kparser.generic.configurationreading.iread(coord_filepath)
self.traj_iterator = nomadcore.configurationreading.iread(coord_filepath)
except ValueError:
pass
......@@ -203,17 +203,17 @@ class CP2KMDParser(MainHierarchicalParser):
vel_format = self.cache_service["velocity_format"]
if vel_format is not None and vel_filepath is not None:
try:
self.vel_iterator = cp2kparser.generic.configurationreading.iread(vel_filepath)
self.vel_iterator = nomadcore.configurationreading.iread(vel_filepath)
except ValueError:
pass
# Setup energy file iterator
if energies_filepath is not None:
self.energy_iterator = cp2kparser.generic.csvparsing.iread(energies_filepath, columns=[0, 1, 2, 3, 4, 5, 6], comments="#")
self.energy_iterator = nomadcore.csvparsing.iread(energies_filepath, columns=[0, 1, 2, 3, 4, 5, 6], comments="#")
# Setup cell file iterator
if cell_filepath is not None:
self.cell_iterator = cp2kparser.generic.csvparsing.iread(cell_filepath, columns=[2, 3, 4, 5, 6, 7, 8, 9, 10], comments="#")
self.cell_iterator = nomadcore.csvparsing.iread(cell_filepath, columns=[2, 3, 4, 5, 6, 7, 8, 9, 10], comments="#")
def onClose_x_cp2k_section_md(self, backend, gIndex, section):
......@@ -290,7 +290,7 @@ class CP2KMDParser(MainHierarchicalParser):
try:
pos = next(self.traj_iterator)
except StopIteration:
logger.error("Could not get the next geometries from an external file. It seems that the number of optimization steps in the CP2K outpufile doesn't match the number of steps found in the external trajectory file.")
logger.error("Could not get the next geometries from an external file. It seems that the number of optimization steps in the CP2K outputfile doesn't match the number of steps found in the external trajectory file.")
else:
backend.addArrayValues("atom_positions", pos, unit=traj_unit)
......
......@@ -1034,21 +1034,21 @@ if __name__ == '__main__':
logger.setLevel(logging.ERROR)
suites = []
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestErrors))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestXCFunctional))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestEnergyForce))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestStressTensorMethods))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestSelfInteractionCorrectionMethod))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestConfigurationPeriodicDimensions))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestSCFConvergence))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestForceFiles))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestPreprocessor))
# suites.append(unittest.TestLoader().loadTestsFromTestCase(TestErrors))
# suites.append(unittest.TestLoader().loadTestsFromTestCase(TestXCFunctional))
# suites.append(unittest.TestLoader().loadTestsFromTestCase(TestEnergyForce))
# suites.append(unittest.TestLoader().loadTestsFromTestCase(TestStressTensorMethods))
# suites.append(unittest.TestLoader().loadTestsFromTestCase(TestSelfInteractionCorrectionMethod))
# suites.append(unittest.TestLoader().loadTestsFromTestCase(TestConfigurationPeriodicDimensions))
# suites.append(unittest.TestLoader().loadTestsFromTestCase(TestSCFConvergence))
# suites.append(unittest.TestLoader().loadTestsFromTestCase(TestForceFiles))
# suites.append(unittest.TestLoader().loadTestsFromTestCase(TestPreprocessor))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestGeoOpt))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestGeoOptTrajFormats))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestGeoOptOptimizers))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestGeoOptTrajectory))
# suites.append(unittest.TestLoader().loadTestsFromTestCase(TestGeoOptOptimizers))
# suites.append(unittest.TestLoader().loadTestsFromTestCase(TestGeoOptTrajectory))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestMD))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestMDEnsembles))
suites.append(unittest.TestLoader().loadTestsFromTestCase(TestElectronicStructureMethod))
# suites.append(unittest.TestLoader().loadTestsFromTestCase(TestElectronicStructureMethod))
alltests = unittest.TestSuite(suites)
unittest.TextTestRunner(verbosity=0).run(alltests)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment