From d4f4b4b8cd77a467ce39eef657074175170147e9 Mon Sep 17 00:00:00 2001
From: Lauri Himanen <lauri.himanen@aalto.fi>
Date: Wed, 22 Jun 2016 10:34:52 +0300
Subject: [PATCH] Initial commit for CPMD.

---
 .gitignore                                    |  56 ++++++++++
 .gitlab-ci.yml                                |  17 +++
 README.md                                     | 105 +++++++++++++++++-
 parser/parser-cpmd/cpmdparser/__init__.py     |   1 +
 .../cpmdparser/generic/__init__.py            |   0
 parser/parser-cpmd/cpmdparser/parser.py       |  67 +++++++++++
 .../parser-cpmd/cpmdparser/scalainterface.py  |  17 +++
 parser/parser-cpmd/cpmdparser/setup_paths.py  |  17 +++
 .../cpmdparser/versions/__init__.py           |   0
 .../cpmdparser/versions/cpmd41/__init__.py    |   0
 .../cpmdparser/versions/versionsetup.py       |  62 +++++++++++
 setup.py                                      |  33 ++++++
 .../eu/nomad_lab/parsers/CpmdParser.scala     |  56 ++++++++++
 .../eu/nomad_lab/parsers/CpmdParserSpec.scala |  15 +++
 test/unittests/README.md                      |   6 +
 15 files changed, 447 insertions(+), 5 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 .gitlab-ci.yml
 create mode 100644 parser/parser-cpmd/cpmdparser/__init__.py
 create mode 100644 parser/parser-cpmd/cpmdparser/generic/__init__.py
 create mode 100644 parser/parser-cpmd/cpmdparser/parser.py
 create mode 100644 parser/parser-cpmd/cpmdparser/scalainterface.py
 create mode 100644 parser/parser-cpmd/cpmdparser/setup_paths.py
 create mode 100644 parser/parser-cpmd/cpmdparser/versions/__init__.py
 create mode 100644 parser/parser-cpmd/cpmdparser/versions/cpmd41/__init__.py
 create mode 100644 parser/parser-cpmd/cpmdparser/versions/versionsetup.py
 create mode 100644 setup.py
 create mode 100644 src/main/scala/eu/nomad_lab/parsers/CpmdParser.scala
 create mode 100644 src/test/scala/eu/nomad_lab/parsers/CpmdParserSpec.scala
 create mode 100644 test/unittests/README.md

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6f8ec45
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,56 @@
+# use glob syntax.
+syntax: glob
+*.ser
+*.class
+*~
+*.bak
+#*.off
+*.old
+*.pyc
+*.bk
+*.swp
+.DS_Store
+
+# logging files
+detailed.log
+
+# eclipse conf file
+.settings
+.classpath
+.project
+.manager
+.scala_dependencies
+
+# idea
+.idea
+*.iml
+
+# building
+target
+build
+null
+tmp*
+temp*
+dist
+test-output
+build.log
+
+# other scm
+.svn
+.CVS
+.hg*
+
+# switch to regexp syntax.
+#  syntax: regexp
+#  ^\.pc/
+
+# build output that is not written into the target directory
+build.log
+
+#emacs TAGS
+TAGS
+
+lib/
+env/
+
+# CPMD files
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..ad6fb22
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,17 @@
+stages:
+  - test
+# The job below re-clones nomad-lab-base, moves every submodule to master and runs the cpmd tests.
+testing:
+  stage: test
+  script:
+    - cd .. && rm -rf nomad-lab-base
+    - git clone --recursive git@gitlab.mpcdf.mpg.de:nomad-lab/nomad-lab-base.git
+    - cd nomad-lab-base
+    - git submodule foreach git checkout master
+    - git submodule foreach git pull
+    - sbt cpmd/test
+  only:  # run this job only for the master branch
+    - master
+  tags:  # the runner must carry both of these tags
+    - test
+    - spec2
diff --git a/README.md b/README.md
index c0fef60..821092d 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,101 @@
-[NOMAD Laboratory CoE](http://nomad-lab.eu/) parser for [CPMD](http://www.cpmd.org)
+This is the main repository of the [NOMAD](http://nomad-lab.eu) parser for
+[CPMD](http://www.cpmd.org/).
 
-The original lives at https://gitlab.rzg.mpg.de/nomad-lab/parser-cpmd
-This depends on python common, and nomad-meta-info, you most likely want to
-get [nomad-lab-base](https://gitlab.rzg.mpg.de/nomad-lab/nomad-lab-base) that
-contains it along with all dependencies.
\ No newline at end of file
+# Installation
+This parser is a submodule of the nomad-lab-base repository. Developers within
+the NoMaD project will automatically get a copy of this repository when they
+download and install the base repository.
+
+# Structure
+The scala layer can access the parser functionality through the
+scalainterface.py file, by calling the following command:
+
+```sh
+    python scalainterface.py path/to/main/file
+```
+
+This scala interface is separated into its own file to separate it from the
+rest of the code. Some parsers will have the interface in the same file as the
+parsing code, but I feel that this is a cleaner approach.
+
+The parser is designed to support multiple versions of CPMD with a
+[DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) approach: The
+initial parser class is based on CPMD v4.1, and other versions will be
+subclassed from it. By subclassing, all the previous functionality will be
+preserved, new functionality can be easily created, and old functionality
+overridden only where necessary.
+
+
+# Standalone Mode
+The parser is designed to be usable also outside the NoMaD project as a
+separate python package. This standalone python-only mode is primarily for
+people who want to easily access the parser without the need to setup the whole
+"NOMAD Stack". It is also used when running custom unit tests found in the
+folder *cpmd/test/unittests*. Here is an example of the call syntax:
+
+```python
+    from cpmdparser import CPMDParser
+    import matplotlib.pyplot as mpl
+
+    # 1. Initialize a parser by giving a path to the CPMD output file and a list of
+    # default units
+    path = "path/to/main.file"
+    default_units = ["eV"]
+    parser = CPMDParser(path, default_units=default_units)
+
+    # 2. Parse
+    results = parser.parse()
+
+    # 3. Query the results using the ids created specifically for NOMAD.
+    scf_energies = results["energy_total_scf_iteration"]
+    mpl.plot(scf_energies)
+    mpl.show()
+```
+
+To install this standalone version, you need to clone the repositories
+"python-common", "nomad-meta-info", and "parser-cpmd" into the same folder.
+Then install the python-common according to the instructions found in the
+README. After that, you can install this package by running:
+
+```sh
+python setup.py develop --user
+```
+
+# Tools and Methods
+This section describes some of the guidelines that are used in the development
+of this parser.
+
+## Documentation
+This parser tries to follow the [google style
+guide](https://google.github.io/styleguide/pyguide.html?showone=Comments#Comments)
+for documenting python code. Documenting makes it much easier to follow the
+logic behind your parser.
+
+## Testing
+The parsers can become quite complicated and maintaining them without
+systematic testing is impossible. There are general tests that are
+performed automatically in the scala layer for all parsers. This is essential,
+but can only test that the data is outputted in the correct format and
+according to some general rules. These tests cannot verify that the contents
+are correct.
+
+In order to truly test the parser output, unit testing is needed. The unit
+tests for this parser are located in **cpmd/test/unittests**. Unit tests provide one way
+to test each parseable quantity and python has a very good [library for unit
+testing](https://docs.python.org/2/library/unittest.html).  When the parser
+supports a new quantity it is quite fast to create unit tests for it. These
+tests will validate the parsing, and also easily detect bugs that may arise when
+the code is modified in the future.
+
+## Profiling
+The parsers have to be reasonably fast. For some codes there is already
+a significant amount of data in the NoMaD repository and the time taken to parse
+it will depend on the performance of the parser. Also each time the parser
+evolves after system deployment, the existing data may have to be reparsed at
+least partially.
+
+By profiling what functions take the most computational time and memory during
+parsing you can identify the bottlenecks in the parser. There are already
+existing profiling tools such as
+[cProfile](https://docs.python.org/2/library/profile.html#module-cProfile)
+which you can plug into your scripts very easily.
diff --git a/parser/parser-cpmd/cpmdparser/__init__.py b/parser/parser-cpmd/cpmdparser/__init__.py
new file mode 100644
index 0000000..1e218e7
--- /dev/null
+++ b/parser/parser-cpmd/cpmdparser/__init__.py
@@ -0,0 +1 @@
+from cpmdparser.parser import CPMDParser
diff --git a/parser/parser-cpmd/cpmdparser/generic/__init__.py b/parser/parser-cpmd/cpmdparser/generic/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/parser/parser-cpmd/cpmdparser/parser.py b/parser/parser-cpmd/cpmdparser/parser.py
new file mode 100644
index 0000000..f3f0fae
--- /dev/null
+++ b/parser/parser-cpmd/cpmdparser/parser.py
@@ -0,0 +1,67 @@
+from builtins import next
+from builtins import range
+import os
+import re
+import logging
+from nomadcore.baseclasses import ParserInterface
+from cpmdparser.versions.versionsetup import get_main_parser
+logger = logging.getLogger("nomad")
+
+
+#===============================================================================
+class CPMDParser(ParserInterface):
+    """Selects and sets up the version specific main parser.
+
+    The start of the main output file is scanned for a version string and a
+    run type. After the setup, the files can be parsed with parse().
+    """
+    def __init__(self, main_file, metainfo_to_keep=None, backend=None, default_units=None, metainfo_units=None, debug=True, store=True):
+        super(CPMDParser, self).__init__(main_file, metainfo_to_keep, backend, default_units, metainfo_units, debug, store)
+
+    def setup_version(self):
+        """Setups the main parser based on the header of the main output file.
+
+        Raises:
+            RuntimeError: If the version or the run type cannot be found.
+        """
+        # NOTE(review): these patterns still match the CP2K banner they were
+        # copied from; confirm the corresponding lines of a CPMD output file.
+        regex_version = re.compile(r" CP2K\| version string:\s+CP2K version ([\d\.]+)")
+        regex_run_type = re.compile(r"\s+GLOBAL\| Run type\s+(.+)")
+        n_lines = 50
+        version_id = None
+        run_type = None
+        with open(self.parser_context.main_file, 'r') as outputfile:
+            # Iterate instead of calling next() so short files end the scan cleanly.
+            for i_line, line in enumerate(outputfile):
+                if i_line >= n_lines:
+                    break
+                result_version = regex_version.match(line)
+                result_run_type = regex_run_type.match(line)
+                if result_version:
+                    version_id = result_version.group(1).replace('.', '')
+                if result_run_type:
+                    run_type = result_run_type.group(1)
+        if version_id is None:
+            msg = "Could not find a version specification from the given main file."
+            logger.error(msg)
+            raise RuntimeError(msg)
+        if run_type is None:
+            msg = "Could not find a run type specification from the given main file."
+            logger.error(msg)
+            raise RuntimeError(msg)
+
+        # Setup the root folder to the fileservice that is used to access files
+        dirpath, filename = os.path.split(self.parser_context.main_file)
+        dirpath = os.path.abspath(dirpath)
+        self.parser_context.file_service.setup_root_folder(dirpath)
+        self.parser_context.file_service.set_file_id(filename, "output")
+
+        # Setup the correct main parser class for this version and run type.
+        self.main_parser = get_main_parser(version_id, run_type)(self.parser_context.main_file, self.parser_context)
+
+    def get_metainfo_filename(self):
+        return "cpmd.nomadmetainfo.json"
+
+    def get_parser_info(self):
+        return {'name': 'cpmd-parser', 'version': '1.0'}
diff --git a/parser/parser-cpmd/cpmdparser/scalainterface.py b/parser/parser-cpmd/cpmdparser/scalainterface.py
new file mode 100644
index 0000000..eb0a691
--- /dev/null
+++ b/parser/parser-cpmd/cpmdparser/scalainterface.py
@@ -0,0 +1,17 @@
+"""
+This is the access point to the parser for the scala layer in the
+nomad project.
+"""
+from __future__ import absolute_import
+import sys
+import setup_paths
+from nomadcore.parser_backend import JsonParseEventsWriterBackend
+from cpmdparser import CPMDParser
+
+
+if __name__ == "__main__":
+    # The scala layer passes the path of the main output file as argv[1].
+    # Initialise the parser with the main filename and a JSON backend
+    main_file = sys.argv[1]
+    parser = CPMDParser(main_file, backend=JsonParseEventsWriterBackend)
+    parser.parse()
diff --git a/parser/parser-cpmd/cpmdparser/setup_paths.py b/parser/parser-cpmd/cpmdparser/setup_paths.py
new file mode 100644
index 0000000..e45903e
--- /dev/null
+++ b/parser/parser-cpmd/cpmdparser/setup_paths.py
@@ -0,0 +1,17 @@
+"""
+Setups the python-common library in the PYTHONPATH system variable.
+"""
+import sys
+import os
+import os.path
+
+baseDir = os.path.dirname(os.path.abspath(__file__))
+commonDir = os.path.normpath(os.path.join(baseDir, "../../../../../python-common/common/python"))
+parserDir = os.path.normpath(os.path.join(baseDir, "../../parser-cpmd"))
+
+# Each directory is checked separately so that one being present does not hide
+# the other. sys.path.insert(1, ...) is used instead of insert(0, ...), see:
+# http://stackoverflow.com/questions/10095037/why-use-sys-path-appendpath-instead-of-sys-path-insert1-path
+for _path in (commonDir, parserDir):
+    if _path not in sys.path:
+        sys.path.insert(1, _path)
diff --git a/parser/parser-cpmd/cpmdparser/versions/__init__.py b/parser/parser-cpmd/cpmdparser/versions/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/parser/parser-cpmd/cpmdparser/versions/cpmd41/__init__.py b/parser/parser-cpmd/cpmdparser/versions/cpmd41/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/parser/parser-cpmd/cpmdparser/versions/versionsetup.py b/parser/parser-cpmd/cpmdparser/versions/versionsetup.py
new file mode 100644
index 0000000..e4f9176
--- /dev/null
+++ b/parser/parser-cpmd/cpmdparser/versions/versionsetup.py
@@ -0,0 +1,62 @@
+import importlib
+import logging
+logger = logging.getLogger("nomad")
+
+
+#===============================================================================
+def get_main_parser(version_id, run_type):
+    """
+    Setups a main parser class for this calculation. The main class can be
+    different for each version and run type.
+
+    Args:
+        version_id: The version string with the dots removed, e.g. '41' for
+            version '4.1'. It is used as a part of the package name.
+        run_type: A string that identifies the run type of the calculation.
+            NOTE(review): the names below are CP2K RUN_TYPE values; confirm
+            the CPMD equivalents.
+
+    Returns:
+        A python class that should be instantiated later with the correct
+        parameters.
+
+    Raises:
+        KeyError: If no parser is registered for the run type.
+        ImportError: If no module is found, even for the default version.
+        AttributeError: If the module does not define the parser class.
+    """
+    # Search for a run type specific parser
+    parser_map = {
+        "ENERGY": "SinglePointParser",
+        "ENERGY_FORCE": "SinglePointParser",
+        "WAVEFUNCTION_OPTIMIZATION": "SinglePointParser",
+        "WFN_OPT": "SinglePointParser",
+        "GEO_OPT": "GeoOptParser",
+        "GEOMETRY_OPTIMIZATION": "GeoOptParser",
+        "MD": "MDParser",
+        "MOLECULAR_DYNAMICS": "MDParser",
+    }
+    try:
+        parser = parser_map[run_type]
+    except KeyError:
+        logger.exception("A parser corresponding to the run_type '{}' could not be found.".format(run_type))
+        raise
+
+    # The version id maps directly into a package name, e.g. 'cpmd41'.
+    class_name = "CPMD{}".format(parser)
+    base = "cpmdparser.versions.cpmd{}.{}".format(version_id, parser.lower())
+    try:
+        parser_module = importlib.import_module(base)
+    except ImportError:
+        logger.warning("Could not find a parser for version '{}' and run type '{}'. Trying to default to the base implementation for CPMD 4.1".format(version_id, run_type))
+        base = "cpmdparser.versions.cpmd41.{}".format(parser.lower())
+        try:
+            parser_module = importlib.import_module(base)
+        except ImportError:
+            logger.exception("Tried to default to the CPMD 4.1 implementation but could not find the correct modules for run_type '{}'.".format(run_type))
+            raise
+    try:
+        return getattr(parser_module, class_name)
+    except AttributeError:
+        logger.exception("A parser class '{}' could not be found in the module '{}'.".format(class_name, base))
+        raise
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..fc623cc
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,33 @@
+"""
+This is a setup script for installing the parser locally on python path with
+all the required dependencies. Used mainly for local testing.
+"""
+from setuptools import setup, find_packages
+
+
+#===============================================================================
+def main():
+    """Run the setuptools based installation of the cpmdparser package."""
+    setup(
+        name="cpmdparser",
+        version="0.1",
+        description="NoMaD parser implementation for CPMD.",
+        author="Lauri Himanen",
+        author_email="lauri.himanen@aalto.fi",
+        license="GPL3",
+        # The cpmdparser package is not at the repository root, so both
+        # the package root and the search path must point to its parent.
+        package_dir={'': 'parser/parser-cpmd'},
+        packages=find_packages('parser/parser-cpmd'),
+        install_requires=[
+            'pint',
+            'numpy',
+            # Provides the 'builtins' module imported by the parser when
+            # it is run with Python 2.
+            'future',
+        ],
+    )
+
+# Run main function by default
+if __name__ == "__main__":
+    main()
diff --git a/src/main/scala/eu/nomad_lab/parsers/CpmdParser.scala b/src/main/scala/eu/nomad_lab/parsers/CpmdParser.scala
new file mode 100644
index 0000000..c0305e7
--- /dev/null
+++ b/src/main/scala/eu/nomad_lab/parsers/CpmdParser.scala
@@ -0,0 +1,56 @@
+package eu.nomad_lab.parsers
+
+import eu.{ nomad_lab => lab }
+import eu.nomad_lab.DefaultPythonInterpreter
+import org.{ json4s => jn }
+import scala.collection.breakOut
+
+object CpmdParser extends SimpleExternalParserGenerator( // name matches the reference in CpmdParserSpec
+  name = "CpmdParser",
+  parserInfo = jn.JObject(
+    ("name" -> jn.JString("CpmdParser")) ::
+      ("parserId" -> jn.JString("CpmdParser" + lab.CpmdVersionInfo.version)) ::
+      ("versionInfo" -> jn.JObject(
+        ("nomadCoreVersion" -> jn.JObject(lab.NomadCoreVersionInfo.toMap.map {
+          case (k, v) => k -> jn.JString(v.toString)
+        }(breakOut): List[(String, jn.JString)])) ::
+          (lab.CpmdVersionInfo.toMap.map {
+            case (key, value) =>
+              (key -> jn.JString(value.toString))
+          }(breakOut): List[(String, jn.JString)])
+      )) :: Nil
+  ),
+  mainFileTypes = Seq("text/.*"), // NOTE(review): mainFileRe below still matches the CP2K banner; confirm the CPMD header
+  mainFileRe = """  \*\*\*\* \*\*\*\* \*\*\*\*\*\*  \*\*  PROGRAM STARTED AT\s(?<cpmdStartedAt>.*)
+ \*\*\*\*\* \*\* \*\*\*  \*\*\* \*\*   PROGRAM STARTED ON\s*.*
+ \*\*    \*\*\*\*   \*\*\*\*\*\*    PROGRAM STARTED BY .*
+ \*\*\*\*\* \*\*    \*\* \*\* \*\*   PROGRAM PROCESS ID .*
+  \*\*\*\* \*\*  \*\*\*\*\*\*\*  \*\*  PROGRAM STARTED IN .*
+(?:\s*\n|                                      \s+.*
+)*
+(?:\s*CP2K\| version string:\s*(?<cpmdVersionString>.*)
+)?(?:\s*CP2K\| source code revision number:\s*(?<cpmdRevision>.*)
+)?""".r,
+  cmd = Seq(DefaultPythonInterpreter.python2Exe(), "${envDir}/parsers/cpmd/parser/parser-cpmd/cpmdparser/scalainterface.py",
+    "${mainFilePath}"),
+  cmdCwd = "${mainFilePath}/..",
+  resList = Seq(
+    "parser-cpmd/cpmdparser/__init__.py",
+    "parser-cpmd/cpmdparser/setup_paths.py",
+    "parser-cpmd/cpmdparser/parser.py",
+    "parser-cpmd/cpmdparser/generic/__init__.py",
+    "parser-cpmd/cpmdparser/versions/__init__.py",
+    "parser-cpmd/cpmdparser/versions/versionsetup.py",
+    "parser-cpmd/cpmdparser/versions/cpmd41/__init__.py",
+    "parser-cpmd/cpmdparser/scalainterface.py",
+    "nomad_meta_info/public.nomadmetainfo.json",
+    "nomad_meta_info/common.nomadmetainfo.json",
+    "nomad_meta_info/meta_types.nomadmetainfo.json",
+    "nomad_meta_info/cpmd.nomadmetainfo.json",
+    "nomad_meta_info/cpmd.general.nomadmetainfo.json"
+  ) ++ DefaultPythonInterpreter.commonFiles(),
+  dirMap = Map(
+    "parser-cpmd" -> "parsers/cpmd/parser/parser-cpmd",
+    "nomad_meta_info" -> "nomad-meta-info/meta_info/nomad_meta_info"
+  ) ++ DefaultPythonInterpreter.commonDirMapping()
+)
diff --git a/src/test/scala/eu/nomad_lab/parsers/CpmdParserSpec.scala b/src/test/scala/eu/nomad_lab/parsers/CpmdParserSpec.scala
new file mode 100644
index 0000000..fd35602
--- /dev/null
+++ b/src/test/scala/eu/nomad_lab/parsers/CpmdParserSpec.scala
@@ -0,0 +1,15 @@
+package eu.nomad_lab.parsers
+
+import org.specs2.mutable.Specification
+
+object CpmdParserSpec extends Specification { // each example expects the si_bulk8.out sample to parse successfully
+  "CpmdParserTest" >> {
+    "test with json-events" >> {
+      ParserRun.parse(CpmdParser, "parsers/cpmd/test/examples/energy_force/si_bulk8.out", "json-events") must_== ParseResult.ParseSuccess
+    }
+  }
+  // NOTE(review): this example sits outside the "CpmdParserTest" group above; confirm that is intended.
+  "test energy_force with json" >> {
+    ParserRun.parse(CpmdParser, "parsers/cpmd/test/examples/energy_force/si_bulk8.out", "json") must_== ParseResult.ParseSuccess
+  }
+}
diff --git a/test/unittests/README.md b/test/unittests/README.md
new file mode 100644
index 0000000..550be2b
--- /dev/null
+++ b/test/unittests/README.md
@@ -0,0 +1,6 @@
+# Unit tests
+This directory contains unit tests to evaluate the correctness of the parser in
+a systematic way. Ideally each parsed metainfo should have at least one unit
+test, and if the resulting values are predetermined, the available values
+should all be tested individually. Also certain scenarios that should produce a
+parsing error should be tested.
-- 
GitLab