From d4f4b4b8cd77a467ce39eef657074175170147e9 Mon Sep 17 00:00:00 2001 From: Lauri Himanen <lauri.himanen@aalto.fi> Date: Wed, 22 Jun 2016 10:34:52 +0300 Subject: [PATCH] Initial commit for CPMD. --- .gitignore | 56 ++++++++++ .gitlab-ci.yml | 17 +++ README.md | 105 +++++++++++++++++- parser/parser-cpmd/cpmdparser/__init__.py | 1 + .../cpmdparser/generic/__init__.py | 0 parser/parser-cpmd/cpmdparser/parser.py | 67 +++++++++++ .../parser-cpmd/cpmdparser/scalainterface.py | 17 +++ parser/parser-cpmd/cpmdparser/setup_paths.py | 17 +++ .../cpmdparser/versions/__init__.py | 0 .../cpmdparser/versions/cpmd41/__init__.py | 0 .../cpmdparser/versions/versionsetup.py | 62 +++++++++++ setup.py | 33 ++++++ .../eu/nomad_lab/parsers/CpmdParser.scala | 56 ++++++++++ .../eu/nomad_lab/parsers/CpmdParserSpec.scala | 15 +++ test/unittests/README.md | 6 + 15 files changed, 447 insertions(+), 5 deletions(-) create mode 100644 .gitignore create mode 100644 .gitlab-ci.yml create mode 100644 parser/parser-cpmd/cpmdparser/__init__.py create mode 100644 parser/parser-cpmd/cpmdparser/generic/__init__.py create mode 100644 parser/parser-cpmd/cpmdparser/parser.py create mode 100644 parser/parser-cpmd/cpmdparser/scalainterface.py create mode 100644 parser/parser-cpmd/cpmdparser/setup_paths.py create mode 100644 parser/parser-cpmd/cpmdparser/versions/__init__.py create mode 100644 parser/parser-cpmd/cpmdparser/versions/cpmd41/__init__.py create mode 100644 parser/parser-cpmd/cpmdparser/versions/versionsetup.py create mode 100644 setup.py create mode 100644 src/main/scala/eu/nomad_lab/parsers/CpmdParser.scala create mode 100644 src/test/scala/eu/nomad_lab/parsers/CpmdParserSpec.scala create mode 100644 test/unittests/README.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6f8ec45 --- /dev/null +++ b/.gitignore @@ -0,0 +1,56 @@ +# use glob syntax. 
+syntax: glob +*.ser +*.class +*~ +*.bak +#*.off +*.old +*.pyc +*.bk +*.swp +.DS_Store + +# logging files +detailed.log + +# eclipse conf file +.settings +.classpath +.project +.manager +.scala_dependencies + +# idea +.idea +*.iml + +# building +target +build +null +tmp* +temp* +dist +test-output +build.log + +# other scm +.svn +.CVS +.hg* + +# switch to regexp syntax. +# syntax: regexp +# ^\.pc/ + +#SHITTY output not in target directory +build.log + +#emacs TAGS +TAGS + +lib/ +env/ + +# CP2K files diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..ad6fb22 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,17 @@ +stages: + - test + +testing: + stage: test + script: + - cd .. && rm -rf nomad-lab-base + - git clone --recursive git@gitlab.mpcdf.mpg.de:nomad-lab/nomad-lab-base.git + - cd nomad-lab-base + - git submodule foreach git checkout master + - git submodule foreach git pull + - sbt cpmd/test + only: + - master + tags: + - test + - spec2 diff --git a/README.md b/README.md index c0fef60..821092d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,101 @@ -[NOMAD Laboratory CoE](http://nomad-lab.eu/) parser for [CPMD](http://www.cpmd.org) +This is the main repository of the [NOMAD](http://nomad-lab.eu) parser for +[CPMD](http://www.cpmd.org/). -The original lives at https://gitlab.rzg.mpg.de/nomad-lab/parser-cpmd -This depends on python common, and nomad-meta-info, you most likely want to -get [nomad-lab-base](https://gitlab.rzg.mpg.de/nomad-lab/nomad-lab-base) that -contains it along with all dependencies. \ No newline at end of file +# Installation +This parser is a submodule of the nomad-lab-base repository. Developers within +the NoMaD project will automatically get a copy of this repository when they +download and install the base repository. 
+ +# Structure +The scala layer can access the parser functionality through the +scalainterface.py file, by calling the following command: + +```python + python scalainterface.py path/to/main/file +``` + +This scala interface is separated into its own file to separate it from the +rest of the code. Some parsers will have the interface in the same file as the +parsing code, but I feel that this is a cleaner approach. + +The parser is designed to support multiple versions of CPMD with a +[DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) approach: The +initial parser class is based on CPMD v4.1, and other versions will be +subclassed from it. By subclassing, all the previous functionality will be +preserved, new functionality can be easily created, and old functionality +overridden only where necessary. + + +# Standalone Mode +The parser is designed to be usable also outside the NoMaD project as a +separate python package. This standalone python-only mode is primarily for +people who want to easily access the parser without the need to set up the whole +"NOMAD Stack". It is also used when running custom unit tests found in the +folder *cpmd/test/unittests*. Here is an example of the call syntax: + +```python + from cpmdparser import CPMDParser + import matplotlib.pyplot as mpl + + # 1. Initialize a parser by giving a path to the CPMD output file and a list of + # default units + path = "path/to/main.file" + default_units = ["eV"] + parser = CPMDParser(path, default_units=default_units) + + # 2. Parse + results = parser.parse() + + # 3. Query the results using the ids created specifically for NOMAD. + scf_energies = results["energy_total_scf_iteration"] + mpl.plot(scf_energies) + mpl.show() +``` + +To install this standalone version, you need to clone the repositories +"python-common", "nomad-meta-info", and "parser-cpmd" into the same folder. +Then install the python-common according to the instructions found in the +README. 
After that, you can install this package by running: + +```sh +python setup.py develop --user +``` + +# Tools and Methods +This section describes some of the guidelines that are used in the development +of this parser. + +## Documentation +This parser tries to follow the [google style +guide](https://google.github.io/styleguide/pyguide.html?showone=Comments#Comments) +for documenting python code. Documenting makes it much easier to follow the +logic behind your parser. + +## Testing +The parsers can become quite complicated and maintaining them without +systematic testing is impossible. There are general tests that are +performed automatically in the scala layer for all parsers. This is essential, +but can only test that the data is outputted in the correct format and +according to some general rules. These tests cannot verify that the contents +are correct. + +In order to truly test the parser output, unit testing is needed. The unit +tests for this parser are located in **cpmd/test/unittests**. Unit tests provide one way +to test each parseable quantity and python has a very good [library for unit +testing](https://docs.python.org/2/library/unittest.html). When the parser +supports a new quantity it is quite fast to create unit tests for it. These +tests will validate the parsing, and also easily detect bugs that may arise when +the code is modified in the future. + +## Profiling +The parsers have to be reasonably fast. For some codes there is already a +significant amount of data in the NoMaD repository and the time taken to parse +it will depend on the performance of the parser. Also each time the parser +evolves after system deployment, the existing data may have to be reparsed at +least partially. + +By profiling which functions take the most computational time and memory during +parsing you can identify the bottlenecks in the parser. 
There are already +existing profiling tools such as +[cProfile](https://docs.python.org/2/library/profile.html#module-cProfile) +which you can plug into your scripts very easily. diff --git a/parser/parser-cpmd/cpmdparser/__init__.py b/parser/parser-cpmd/cpmdparser/__init__.py new file mode 100644 index 0000000..1e218e7 --- /dev/null +++ b/parser/parser-cpmd/cpmdparser/__init__.py @@ -0,0 +1 @@ +from cpmdparser.parser import CPMDParser diff --git a/parser/parser-cpmd/cpmdparser/generic/__init__.py b/parser/parser-cpmd/cpmdparser/generic/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parser/parser-cpmd/cpmdparser/parser.py b/parser/parser-cpmd/cpmdparser/parser.py new file mode 100644 index 0000000..f3f0fae --- /dev/null +++ b/parser/parser-cpmd/cpmdparser/parser.py @@ -0,0 +1,67 @@ +from builtins import next +from builtins import range +import os +import re +import logging +from nomadcore.baseclasses import ParserInterface +from cpmdparser.versions.versionsetup import get_main_parser +logger = logging.getLogger("nomad") + + +#=============================================================================== +class CPMDParser(ParserInterface): + """This class handles the initial setup before any parsing can happen. It + determines which version of CP2K was used to generate the output and then + sets up a correct main parser. + + After the implementation has been setup, you can parse the files with + parse(). + """ + def __init__(self, main_file, metainfo_to_keep=None, backend=None, default_units=None, metainfo_units=None, debug=True, store=True): + super(CPMDParser, self).__init__(main_file, metainfo_to_keep, backend, default_units, metainfo_units, debug, store) + + def setup_version(self): + """Setups the version by looking at the output file and the version + specified in it. + """ + # Search for the CP2K version specification and the RUN_TYPE for the + # calculation. The correct and optimized parser is initialized based on + # this information. 
+ regex_version = re.compile(r" CP2K\| version string:\s+CP2K version ([\d\.]+)") + regex_run_type = re.compile(r"\s+GLOBAL\| Run type\s+(.+)") + n_lines = 50 + version_id = None + run_type = None + with open(self.parser_context.main_file, 'r') as outputfile: + for i_line in range(n_lines): + line = next(outputfile) + result_version = regex_version.match(line) + result_run_type = regex_run_type.match(line) + if result_version: + version_id = result_version.group(1).replace('.', '') + if result_run_type: + run_type = result_run_type.group(1) + if version_id is None: + msg = "Could not find a version specification from the given main file." + logger.exception(msg) + raise RuntimeError(msg) + if run_type is None: + msg = "Could not find a version specification from the given main file." + logger.exception(msg) + raise RuntimeError(msg) + + # Setup the root folder to the fileservice that is used to access files + dirpath, filename = os.path.split(self.parser_context.main_file) + dirpath = os.path.abspath(dirpath) + self.parser_context.file_service.setup_root_folder(dirpath) + self.parser_context.file_service.set_file_id(filename, "output") + + # Setup the correct main parser based on the version id. If no match + # for the version is found, use the main parser for CP2K 2.6.2 + self.main_parser = get_main_parser(version_id, run_type)(self.parser_context.main_file, self.parser_context) + + def get_metainfo_filename(self): + return "cpmd.nomadmetainfo.json" + + def get_parser_info(self): + return {'name': 'cpmd-parser', 'version': '1.0'} diff --git a/parser/parser-cpmd/cpmdparser/scalainterface.py b/parser/parser-cpmd/cpmdparser/scalainterface.py new file mode 100644 index 0000000..eb0a691 --- /dev/null +++ b/parser/parser-cpmd/cpmdparser/scalainterface.py @@ -0,0 +1,17 @@ +""" +This is the access point to the parser for the scala layer in the +nomad project. 
+""" +from __future__ import absolute_import +import sys +import setup_paths +from nomadcore.parser_backend import JsonParseEventsWriterBackend +from cpmdparser import CPMDParser + + +if __name__ == "__main__": + + # Initialise the parser with the main filename and a JSON backend + main_file = sys.argv[1] + parser = CPMDParser(main_file, backend=JsonParseEventsWriterBackend) + parser.parse() diff --git a/parser/parser-cpmd/cpmdparser/setup_paths.py b/parser/parser-cpmd/cpmdparser/setup_paths.py new file mode 100644 index 0000000..e45903e --- /dev/null +++ b/parser/parser-cpmd/cpmdparser/setup_paths.py @@ -0,0 +1,17 @@ +""" +Setups the python-common library in the PYTHONPATH system variable. +""" +import sys +import os +import os.path + +baseDir = os.path.dirname(os.path.abspath(__file__)) +commonDir = os.path.normpath(os.path.join(baseDir, "../../../../../python-common/common/python")) +parserDir = os.path.normpath(os.path.join(baseDir, "../../parser-cpmd")) + +# Using sys.path.insert(1, ...) instead of sys.path.insert(0, ...) 
based on +# this discusssion: +# http://stackoverflow.com/questions/10095037/why-use-sys-path-appendpath-instead-of-sys-path-insert1-path +if commonDir not in sys.path: + sys.path.insert(1, commonDir) + sys.path.insert(1, parserDir) diff --git a/parser/parser-cpmd/cpmdparser/versions/__init__.py b/parser/parser-cpmd/cpmdparser/versions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parser/parser-cpmd/cpmdparser/versions/cpmd41/__init__.py b/parser/parser-cpmd/cpmdparser/versions/cpmd41/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parser/parser-cpmd/cpmdparser/versions/versionsetup.py b/parser/parser-cpmd/cpmdparser/versions/versionsetup.py new file mode 100644 index 0000000..e4f9176 --- /dev/null +++ b/parser/parser-cpmd/cpmdparser/versions/versionsetup.py @@ -0,0 +1,62 @@ +import importlib +import logging +logger = logging.getLogger("nomad") + + +#=============================================================================== +def get_main_parser(version_id, run_type): + """ + Setups a main parser class for this calculation. The main class can be + different for each version and run type. + + Args: + version_id: An integer representing the CP2K version. The version + number is originally a string the form '2.6.2', but here the numbers + are just concatenated into a single integer number 262. + run_type: A string that identifies the RUN_TYPE for the calculation. + All the possible run types can be found in the CP2K reference manual. + + Returns: + A python class that should be instantiated later with the correct + parameters. 
+ """ + + # Search for a RUN_TYPE specific parser + parser_map = { + "ENERGY": "SinglePointParser", + "ENERGY_FORCE": "SinglePointParser", + "WAVEFUNCTION_OPTIMIZATION": "SinglePointParser", + "WFN_OPT": "SinglePointParser", + "GEO_OPT": "GeoOptParser", + "GEOMETRY_OPTIMIZATION": "GeoOptParser", + "MD": "MDParser", + "MOLECULAR_DYNAMICS": "MDParser", + } + try: + parser = parser_map[run_type] + except KeyError: + logger.exception("A parser corresponding to the run_type '{}' could not be found.".format(run_type)) + raise + + # Currently the version id is a pure integer, so it can directly be mapped + # into a package name. + base = "cpmdparser.versions.cp2k{}.{}".format(version_id, parser.lower()) + parser_module = None + parser_class = None + try: + parser_module = importlib.import_module(base) + except ImportError: + logger.warning("Could not find a parser for version '{}' and run type '{}'. Trying to default to the base implementation for CP2K 2.6.2".format(version_id, run_type)) + base = "cp2kparser.versions.cp2k262.{}".format(parser.lower()) + try: + parser_module = importlib.import_module(base) + except ImportError: + logger.exception("Tried to default to the CP2K 2.6.2 implementation but could not find the correct modules for run_type '{}'.".format(run_type)) + raise + try: + parser_class = getattr(parser_module, "CP2K{}".format(parser)) + except AttributeError: + logger.exception("A parser class '{}' could not be found in the module '[]'.".format(parser_class, parser_module)) + raise + + return parser_class diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..fc623cc --- /dev/null +++ b/setup.py @@ -0,0 +1,33 @@ +""" +This is a setup script for installing the parser locally on python path with +all the required dependencies. Used mainly for local testing. 
+""" +from setuptools import setup, find_packages + + +#=============================================================================== +def main(): + # Start package setup + setup( + name="cpmdparser", + version="0.1", + # package_data={ + # 'cp2kparser.versions.cp2k262': ['input_data/cp2k_input_tree.pickle'], + # }, + description="NoMaD parser implementation for CPMD.", + author="Lauri Himanen", + author_email="lauri.himanen@aalto.fi", + license="GPL3", + # package_dir={'': 'parser/parser-cp2k'}, + packages=find_packages(), + install_requires=[ + 'pint', + 'numpy', + # 'mdtraj', + # 'ase' + ], + ) + +# Run main function by default +if __name__ == "__main__": + main() diff --git a/src/main/scala/eu/nomad_lab/parsers/CpmdParser.scala b/src/main/scala/eu/nomad_lab/parsers/CpmdParser.scala new file mode 100644 index 0000000..c0305e7 --- /dev/null +++ b/src/main/scala/eu/nomad_lab/parsers/CpmdParser.scala @@ -0,0 +1,56 @@ +package eu.nomad_lab.parsers + +import eu.{ nomad_lab => lab } +import eu.nomad_lab.DefaultPythonInterpreter +import org.{ json4s => jn } +import scala.collection.breakOut + +object Cp2kParser extends SimpleExternalParserGenerator( + name = "Cp2kParser", + parserInfo = jn.JObject( + ("name" -> jn.JString("CpmdParser")) :: + ("parserId" -> jn.JString("CpmdParser" + lab.Cp2kVersionInfo.version)) :: + ("versionInfo" -> jn.JObject( + ("nomadCoreVersion" -> jn.JObject(lab.NomadCoreVersionInfo.toMap.map { + case (k, v) => k -> jn.JString(v.toString) + }(breakOut): List[(String, jn.JString)])) :: + (lab.CpmdVersionInfo.toMap.map { + case (key, value) => + (key -> jn.JString(value.toString)) + }(breakOut): List[(String, jn.JString)]) + )) :: Nil + ), + mainFileTypes = Seq("text/.*"), + mainFileRe = """ \*\*\*\* \*\*\*\* \*\*\*\*\*\* \*\* PROGRAM STARTED AT\s(?<cpmdStartedAt>.*) + \*\*\*\*\* \*\* \*\*\* \*\*\* \*\* PROGRAM STARTED ON\s*.* + \*\* \*\*\*\* \*\*\*\*\*\* PROGRAM STARTED BY .* + \*\*\*\*\* \*\* \*\* \*\* \*\* PROGRAM PROCESS ID .* + \*\*\*\* \*\* 
\*\*\*\*\*\*\* \*\* PROGRAM STARTED IN .* +(?:\s*\n| \s+.* +)* +(?:\s*CP2K\| version string:\s*(?<cpmdVersionString>.*) +)?(?:\s*CP2K\| source code revision number:\s*(?<cpmdRevision>.*) +)?""".r, + cmd = Seq(DefaultPythonInterpreter.python2Exe(), "${envDir}/parsers/cpmd/parser/parser-cpmd/cpmdparser/scalainterface.py", + "${mainFilePath}"), + cmdCwd = "${mainFilePath}/..", + resList = Seq( + "parser-cpmd/cpmdparser/__init__.py", + "parser-cpmd/cpmdparser/setup_paths.py", + "parser-cpmd/cpmdparser/parser.py", + "parser-cpmd/cpmdparser/generic/__init__.py", + "parser-cpmd/cpmdparser/versions/__init__.py", + "parser-cpmd/cpmdparser/versions/versionsetup.py", + "parser-cpmd/cpmdparser/versions/cpmd41/__init__.py", + "parser-cpmd/cpmdparser/scalainterface.py", + "nomad_meta_info/public.nomadmetainfo.json", + "nomad_meta_info/common.nomadmetainfo.json", + "nomad_meta_info/meta_types.nomadmetainfo.json", + "nomad_meta_info/cpmd.nomadmetainfo.json", + "nomad_meta_info/cpmd.general.nomadmetainfo.json" + ) ++ DefaultPythonInterpreter.commonFiles(), + dirMap = Map( + "parser-cpmd" -> "parsers/cpmd/parser/parser-cpmd", + "nomad_meta_info" -> "nomad-meta-info/meta_info/nomad_meta_info" + ) ++ DefaultPythonInterpreter.commonDirMapping() +) diff --git a/src/test/scala/eu/nomad_lab/parsers/CpmdParserSpec.scala b/src/test/scala/eu/nomad_lab/parsers/CpmdParserSpec.scala new file mode 100644 index 0000000..fd35602 --- /dev/null +++ b/src/test/scala/eu/nomad_lab/parsers/CpmdParserSpec.scala @@ -0,0 +1,15 @@ +package eu.nomad_lab.parsers + +import org.specs2.mutable.Specification + +object CpmdParserSpec extends Specification { + "CpmdParserTest" >> { + "test with json-events" >> { + ParserRun.parse(CpmdParser, "parsers/cpmd/test/examples/energy_force/si_bulk8.out", "json-events") must_== ParseResult.ParseSuccess + } + } + + "test energy_force with json" >> { + ParserRun.parse(CpmdParser, "parsers/cpmd/test/examples/energy_force/si_bulk8.out", "json") must_== ParseResult.ParseSuccess 
+ } +} diff --git a/test/unittests/README.md b/test/unittests/README.md new file mode 100644 index 0000000..550be2b --- /dev/null +++ b/test/unittests/README.md @@ -0,0 +1,6 @@ +# Unit tests +This directory contains unit tests to evaluate the correctness of the parser in +a systematic way. Ideally each parsed metainfo should have at least one unit +test, and if the resulting values are predetermined, the available values +should all be tested individually. Also certain scenarios that should produce a +parsing error should be tested. -- GitLab