parsers.py 8.15 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Integration of parsers into the processing
==========================================

Parsers are developed as independent, individual python programs in their own GIT repositories.
They are built on a common module called *python-common*, also in a separate GIT.
All parsers depend on the *meta-info*, which is also maintained in its own GIT.

Assumption about parsers
------------------------
For now, we make a few assumptions about parsers
- they always work on the same *meta-info*
- they have no conflicting python requirements
- they can be loaded at the same time and can be used within the same python process
- they are uniquely identified by a GIT URL and publicly accessible
- their version is uniquely identified by a GIT commit SHA

Preparing dependencies and parsers during python run-time
---------------------------------------------------------
To make GIT maintained python modules available, we use:

.. autoclass:: nomad.parsers.PythonGitRepository

Parsers, as a special case of GIT maintained python modules, can be used via:

.. autoclass:: nomad.parsers.Parser
"""
42
import re
43
44
45
46
47
48
49
50
import os
import os.path
from git import Repo, Git
try:
    from pip import main as pip
except ImportError:
    # Fall back to pip's internal entry point when ``pip.main`` cannot be
    # imported. Only import failures are handled here, so unrelated errors
    # (e.g. KeyboardInterrupt) are no longer swallowed.
    from pip._internal import main as pip
import importlib
51
52
53
54
55

from nomadcore.parser_backend import JsonParseEventsWriterBackend

# Relative path into the nomad-meta-info submodule; presumably the location of
# the meta-info definitions (not referenced in the visible part of this file).
_meta_info_path = './submodules/nomad-meta-info/meta_info/nomad_meta_info/'

56
57
58
59
60
61
62
63
# Root directory, relative to the current working directory, below which
# PythonGitRepository.prepare creates one checkout directory per repository name.
base_dir = './.dependencies'


class PythonGitRepositoryError(Exception):
    """Raised when preparing a git-maintained python module fails.

    The final exception message is the given message followed by the string
    form of the affected repository in square brackets.
    """
    def __init__(self, msg, repo):
        full_message = '%s [%s]' % (msg, repo)
        super().__init__(full_message)

64

65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
class PythonGitRepository():
    """Represents a python module in a git repository.

    It allows to fetch a specific commit, install all requirements to
    the current python environment, and check the installation via module import.
    """
    def __init__(self, name, git_url, git_commit, modules=None):
        """
        Args:
            name: A name that determines the download path, can contain '/' for sub dirs.
            git_url: A publically available and fetchable url to the GIT repository.
            git_commit: The full commit SHA of the desired commit.
            modules: A list of python module names that is used to confirm the installation.
                Defaults to no modules.
        """
        super().__init__()
        self.name = name
        self.git_url = git_url
        self.git_commit = git_commit
        # A fresh list per instance; a literal ``[]`` default would be shared
        # between all instances created without an explicit ``modules``.
        self.modules = [] if modules is None else modules

    def __repr__(self):
        # Used when the repository is formatted into PythonGitRepositoryError messages.
        return '%s(name=%r, git_url=%r, git_commit=%r)' % (
            type(self).__name__, self.name, self.git_url, self.git_commit)

    def prepare(self, force_install=False):
        """Makes sure that the repository is fetched, at the right commit, and installed.

        The working directory is temporarily changed to the repository directory
        below ``base_dir`` and always restored afterwards.

        Args:
            force_install: default is *False*. Allows to force install, e.g. after git commit or
                url change.

        Raises:
            PythonGitRepositoryError: if something went wrong.
        """
        # ``import importlib`` alone does not guarantee that the ``util``
        # submodule is loaded, so import it explicitly.
        import importlib.util

        def is_installed():
            """Return True if all configured modules can be located for import."""
            for module in self.modules:
                try:
                    module_spec = importlib.util.find_spec(module)
                except ImportError:
                    # find_spec imports parent packages of dotted names; a missing
                    # parent means the module is not installed.
                    return False
                if module_spec is None:
                    return False
            return True

        # check/change working directory
        old_cwd = os.getcwd()
        try:
            cwd = os.path.join(base_dir, self.name)
            os.makedirs(cwd, exist_ok=True)
            os.chdir(cwd)

            # check git/do init
            if os.path.exists('.git'):
                git = Repo('./')
            else:
                Git('./').init()
                git = Repo('./')
                git.create_remote('origin', self.git_url)

            # check commit/checkout
            # The desired commit is a SHA string, so it has to be compared with the
            # hexsha of the branch commit, not with the Commit object itself.
            branch_moved = False
            if 'master' not in git.heads:
                git.remote('origin').fetch(self.git_commit)
                git.create_head('master', self.git_commit)
                branch_moved = True
            elif git.heads.master.commit.hexsha != self.git_commit:
                git.remote('origin').fetch(self.git_commit)
                # Point the existing branch at the newly fetched commit.
                git.heads.master.set_commit(self.git_commit)
                branch_moved = True

            # Explicit check instead of ``assert``, which is stripped under ``-O``.
            if git.heads.master.commit.hexsha != self.git_commit:
                raise PythonGitRepositoryError(
                    'Actual and desired commit do not match', repo=self)

            # Force the checkout only if the branch was just created or moved, so
            # that the working tree is reset to the desired commit; an unchanged
            # branch is checked out as before.
            git.heads.master.checkout(force=branch_moved)

            # check install
            if is_installed() and not force_install:
                return

            # check/install requirements.txt
            if os.path.exists('requirements.txt'):
                # try twice to support circular dependencies
                for _ in range(2):
                    pipcode = pip(['install', '-r', 'requirements.txt'])
                    if pipcode == 0:
                        break
                else:
                    raise PythonGitRepositoryError(
                        'Could not install requirements (pip code=%s)' % pipcode, repo=self)

            # check/install setup.py
            if os.path.exists('setup.py'):
                pipcode = pip(['install', '-e', '.'])
                if pipcode != 0:
                    raise PythonGitRepositoryError(
                        'Could not install (pip code=%s)' % pipcode, repo=self)

            # check install again; drop cached finder state first so that modules
            # installed during this run can be found
            importlib.invalidate_caches()
            if not is_installed():
                raise PythonGitRepositoryError(
                    'Some modules are not installed after install', repo=self)

            # reload, loaded modules when installed because of force_install
            # TODO
        except PythonGitRepositoryError:
            raise
        except Exception as e:
            raise PythonGitRepositoryError(
                'Unexpected exception during preparation: %s' % e, repo=self) from e
        finally:
            os.chdir(old_cwd)


class Parser(PythonGitRepository):
    """
    Instances specify a parser. It allows to find *main files* from  given uploaded
    and extracted files. Further, allows to run the parser on those 'main files'.
    """
    def __init__(self, name, git_url, git_commit, parser, main_file_re, main_contents_re):
        """
        Args:
            name: The parser name; the repository is placed below ``parsers/<name>``.
            git_url: The url of the parser's GIT repository.
            git_commit: The commit SHA of the desired parser version.
            parser: Dotted path ``<module path>.<class name>`` of the parser class.
            main_file_re: Regular expression that main file names have to match.
            main_contents_re: Regular expression that the beginning of a main
                file's contents has to match.
        """
        # Everything before the last dot is the module that must be importable
        # once the repository is installed.
        module_name = parser.rpartition('.')[0]
        super().__init__(
            os.path.join('parsers', name), git_url, git_commit, modules=[module_name])
        self.parser = parser
        self._main_file_re = re.compile(main_file_re)
        self._main_contents_re = re.compile(main_contents_re)

    def is_mainfile(self, upload, filename):
        """Checks whether the given file is a main file for this parser.

        Args:
            upload: An object providing ``open_file(filename)``.
            filename: The name of the file to check.

        Returns:
            The match object of the contents pattern applied to the first 500
            units read from the file, or *None* if the file name or the contents
            do not match.
        """
        if not self._main_file_re.match(filename):
            return None

        file = None
        try:
            file = upload.open_file(filename)
            return self._main_contents_re.match(file.read(500))
        finally:
            if file:
                file.close()

    def run(self, mainfile):
        """Imports the parser class, instantiates it, and parses the given main file.

        Args:
            mainfile: The main file, handed to the parser's ``parse`` method.
        """
        # Split at the last dot so that module paths with more than one
        # component (e.g. ``package.module.Class``) resolve to the right class.
        module_name, _, parser_class = self.parser.rpartition('.')
        module = importlib.import_module(module_name)
        Parser = getattr(module, parser_class)
        parser = Parser(backend=JsonParseEventsWriterBackend)
        parser.parse(mainfile)
199
200
201
202
203
204


class VASPRunParser(Parser):
    """Parser specification for the VASP parser at a fixed GIT commit.

    Main file candidates are files whose names end in ``.xml`` and whose
    beginning matches the contents pattern assembled below.
    """
    def __init__(self):
        # Pattern applied to the beginning of a candidate file's contents.
        contents_pattern = (
            r'^\s*<\?xml version="1\.0" encoding="ISO-8859-1"\?>\s*'
            r'?\s*<modeling>'
            r'?\s*<generator>'
            r'?\s*<i name="program" type="string">\s*vasp\s*</i>'
            r'?')
        filename_pattern = r'^.*\.xml$'

        super().__init__(
            name='VASPRunParser',
            git_url='git@gitlab.mpcdf.mpg.de:nomad-lab/parser-vasp.git',
            git_commit='ddf8495944fbbcb62801f69b2c2c6c3d6099129d',
            parser='vaspparser.VASPParser',
            main_file_re=filename_pattern,
            main_contents_re=contents_pattern,
        )

# All parser instances known to this module.
parsers = [VASPRunParser()]
# The same parser instances, keyed by their ``name`` attribute.
parser_dict = {entry.name: entry for entry in parsers}
221
222
223
224
225
226
227
228
229


def prepare_parsers(force_install=False):
    """Prepares every parser in ``parsers``, one after the other.

    Args:
        force_install: Passed on to each parser's ``prepare``; default is *False*.
    """
    for entry in parsers:
        entry.prepare(force_install=force_install)


# When executed as a script, prepare all registered parsers and force their
# installation even if their modules are already importable.
if __name__ == '__main__':
    prepare_parsers(force_install=True)