legacy.py 22 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

'''
This module contains functionality to use old 'legacy' NOMAD CoE parsers with the
new nomad@fairdi infrastructure. This covers aspects like the new metainfo, a unifying
wrapper for parsers, parser logging, and a parser backend.
'''
20

21
from typing import cast, Dict, List, Union, Any, Set, Iterable, Tuple
22
import numpy as np
23
24
from pint.errors import UndefinedUnitError
import os.path
25

26

27
from nomadcore.local_meta_info import loadJsonFile, InfoKindEl, InfoKindEnv
28
29
30
import nomad_meta_info

from nomad import utils
31
32
from nomad.metainfo import (
    Definition, SubSection, Package, Quantity, Category, Section, Reference, units,
33
    Environment, MEnum, MSection, DefinitionAnnotation)
34

35
# Module-level logger; PackageConversion binds per-package context to it.
logger = utils.get_logger(__name__)
36
37


38
39
40
# Legacy packages whose definitions are skipped by EnvironmentConversion; no
# metainfo package is created for them.
_ignored_packages = [
    'meta_types.nomadmetainfo.json',
    'repository.nomadmetainfo.json']
41
42


43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
class LegacyDefinition(DefinitionAnnotation):
    ''' Definition annotation that stores the name a definition has in the legacy metainfo. '''

    def __init__(self, name: str) -> None:
        # The legacy name is kept exactly as given, it is not normalized here.
        self.name = name


class LegacyPackage(LegacyDefinition):
    '''
    Legacy annotation for packages. In addition to the legacy package name, it keeps
    the python module name and the python file path that belong to the package.
    '''

    def __init__(self, name, python_module, python_path):
        super().__init__(name)
        self.python_module, self.python_path = python_module, python_path


def normalize_name(name: str):
    ''' Returns the given name with every dot and every dash replaced by an underscore. '''
    for separator in ('.', '-'):
        name = name.replace(separator, '_')

    return name


def normalized_package_name(name: str):
    '''
    Turns a legacy metainfo '.nomadmetainfo.json' filename into a proper (python)
    identifier: the '.nomadmetainfo.json' part is removed and the remaining name
    is normalized with :func:`normalize_name`.
    '''
    without_extension = name.replace('.nomadmetainfo.json', '')
    return normalize_name(without_extension)


70
71
72
def python_package_mapping(metainfo_package_name: str) -> Tuple[str, str]:
    '''
    Compute the python package for the given metainfo package name. It returns
    a tuple containing a package name and a file path. The filepath denotes the file
    for this package within the nomad git project.

    Arguments:
        metainfo_package_name: The legacy package name, with or without the
            '.nomadmetainfo.json' extension.

    Returns:
        A tuple ``(python_package_name, path)`` with the dotted python module name
        and the relative path of the respective ``.py`` file.
    '''
    # The part before the first dot decides where the package lives.
    prefix = metainfo_package_name.replace('.nomadmetainfo.json', '').split('.')[0]
    metainfo_package_name = normalized_package_name(metainfo_package_name)

    if prefix in ['common', 'general', 'public', 'dft', 'ems']:
        directory = 'nomad/datamodel/metainfo'
        python_package_name = 'nomad.datamodel.metainfo.%s' % metainfo_package_name

    else:
        # All other prefixes are mapped to a parser project: dashes are used in the
        # directory name, while the python package name contains no separator.
        parser_dir = prefix.replace('_', '-')
        prefix = prefix.replace('_', '')

        directory = 'dependencies/parsers/%s/%sparser/metainfo' % (parser_dir, prefix)
        python_package_name = '%sparser.metainfo.%s' % (prefix, metainfo_package_name)

    path = '%s/%s.py' % (directory, metainfo_package_name)

    return python_package_name, path


95
class LegacyMetainfoEnvironment(Environment):
    '''
    A metainfo environment with functions to create a legacy metainfo version of
    the environment.
    '''

    legacy_package_name = Quantity(type=str)

    def legacy_info(self, definition: Definition, *args, **kwargs) -> InfoKindEl:
        '''
        Creates a legacy metainfo objects for the given definition.

        Arguments:
            definition: The section, quantity, or category definition to convert. Its
                ``a_legacy`` annotation provides the legacy name.
            *args, **kwargs: Passed on to the :class:`InfoKindEl` constructor.

        Returns:
            The :class:`InfoKindEl` created from the definition.

        Raises:
            TypeError: If the definition is a quantity with a type that cannot be
                mapped to a legacy ``dtypeStr``.
        '''
        super_names: List[str] = list()
        result: Dict[str, Any] = dict(
            name=definition.a_legacy.name,
            description=definition.description,
            superNames=super_names)

        for category in definition.categories:
            super_names.append(category.a_legacy.name)

        if isinstance(definition, Section):
            # Resolve the sub section definitions that use this section only once;
            # they determine both the repeats flag and the parent super names.
            sub_sections = list(self.resolve_definitions(definition.name, SubSection))

            result['kindStr'] = 'type_section'
            result['repeats'] = any(sub_section.repeats for sub_section in sub_sections)

            for sub_section in sub_sections:
                super_names.append(sub_section.m_parent_as(Definition).a_legacy.name)

        elif isinstance(definition, Quantity):
            result['kindStr'] = 'document_content'
            result['shape'] = definition.shape
            dtype_str = None
            if definition.type == int:
                dtype_str = 'i'
            elif definition.type == float:
                dtype_str = 'f'
            elif definition.type == bool:
                dtype_str = 'b'
            elif definition.type == str:
                dtype_str = 'C'
            elif isinstance(definition.type, Reference):
                dtype_str = 'r'
                result['referencedSections'] = [
                    definition.type.target_section_def.m_resolved().a_legacy.name]
            elif isinstance(definition.type, MEnum):
                dtype_str = 'C'
            elif isinstance(definition.type, np.dtype):
                # isinstance instead of an exact type comparison: dtype instances can
                # be instances of np.dtype subclasses.
                dtype_str = definition.type.name[0]
            elif definition.type == Any:
                dtype_str = 'D'
            else:
                raise TypeError(
                    'Unsupported quantity type %s in %s.' % (definition.type, definition))
            result['dtypeStr'] = dtype_str
            if definition.unit is not None:
                result['units'] = str(definition.unit)
            super_names.append(definition.m_parent_as(Definition).a_legacy.name)

        elif isinstance(definition, Category):
            result['kindStr'] = 'abstract_document_content'

        # Walk up the containment hierarchy until the package is reached.
        package = cast(MSection, definition)
        while not isinstance(package, Package):
            package = package.m_parent

        result['package'] = package.name

        return InfoKindEl(*args, **result, **kwargs)

    def legacy_info_env(self, packages: List[Package] = None, *args, **kwargs) -> InfoKindEnv:
        '''
        Creates a legacy metainfo environment with all definitions from the given packages.

        Arguments:
            packages: The packages to take definitions from. Default is all packages
                of this environment.
            *args, **kwargs: Passed on to the :class:`InfoKindEnv` constructor.
        '''
        if packages is None:
            packages = self.packages

        env = InfoKindEnv(*args, **kwargs)
        for package in packages:
            for definition in package.all_definitions.values():
                # Sections that only extend a base section get no legacy element of
                # their own; their quantities are still added below.
                if not (isinstance(definition, Section) and definition.extends_base_section):
                    env.addInfoKindEl(self.legacy_info(definition))

                if isinstance(definition, Section):
                    for quantity in definition.quantities:
                        env.addInfoKindEl(self.legacy_info(quantity))

        return env

181

182
183
184
class EnvironmentConversion:
    '''
    Converts a legacy metainfo environment into metainfo packages. The conversion of
    all definitions is done in the constructor; :func:`create_env` puts the converted
    packages into a :class:`LegacyMetainfoEnvironment`.

    Arguments:
        legacy_env_or_path: Either the legacy environment itself or the path to a
            legacy metainfo file. A path that does not exist as given is taken
            relative to the directory of the ``nomad_meta_info`` module.
    '''

    def __init__(self, legacy_env_or_path: Union[InfoKindEnv, str]):
        if isinstance(legacy_env_or_path, str):
            if not os.path.exists(legacy_env_or_path):
                legacy_env_or_path = os.path.normpath(os.path.join(
                    os.path.dirname(nomad_meta_info.__file__), legacy_env_or_path))

            self.legacy_env, _ = loadJsonFile(filePath=legacy_env_or_path)

        else:
            self.legacy_env = cast(InfoKindEnv, legacy_env_or_path)

        self.__fix_legacy_super_names()

        self.package_conversions: Dict[str, 'PackageConversion'] = {}

        # Distribute all legacy definitions to a conversion of their package.
        for legacy_def in self.legacy_env.infoKindEls():
            if legacy_def.package in _ignored_packages:
                continue

            package_conversion = self.package_conversions.get(legacy_def.package)
            if package_conversion is None:
                package_conversion = PackageConversion(self, legacy_def.package)
                self.package_conversions[legacy_def.package] = package_conversion

            package_conversion.legacy_defs.append(legacy_def)

        # The conversion runs in three passes over all packages, because each pass
        # resolves definitions (also from other packages) created by the pass before.
        for package_conversion in self.package_conversions.values():
            package_conversion.create_definitions()

        for package_conversion in self.package_conversions.values():
            package_conversion.set_super_names()

        for package_conversion in self.package_conversions.values():
            package_conversion.init_definitions()

    def create_env(self) -> LegacyMetainfoEnvironment:
        '''
        Creates an environment with all converted packages. Each package is validated;
        the first error and the first warning of a package are logged together with
        the number of remaining ones.
        '''
        env = LegacyMetainfoEnvironment()
        env.legacy_package_name = normalized_package_name(self.legacy_env.name)
        for package_conv in self.package_conversions.values():
            package = package_conv.package
            errors, warnings = package.m_all_validate()
            if len(errors) > 0:
                logger.error(
                    '%s. There are %d more errors in converted legacy package %s' %
                    (errors[0], len(errors) - 1, package))
            if len(warnings) > 0:
                logger.warning(
                    '%s. There are %d more warnings in converted legacy package %s' %
                    (warnings[0], len(warnings) - 1, package))
            env.m_add_sub_section(Environment.packages, package)
            package.init_metainfo()
        return env

    def __fix_legacy_super_names(self):
        '''
        Replaces the super names of all legacy definitions: afterwards they consist of
        the parent section (if there is one) followed by all categories that were
        reached through the original super names, directly or via other categories.
        '''

        def get_super_names(legacy_def: InfoKindEl, super_categories: Set[str] = None):
            super_section: str = None
            if super_categories is None:
                super_categories = set()

            for super_name in legacy_def.superNames:
                super_def = self.legacy_env.infoKindEl(super_name)

                if super_def.kindStr == 'type_section':
                    super_section = super_def.name

                elif super_def.kindStr == 'type_abstract_document_content':
                    super_categories.add(super_def.name)
                    # Categories can have a parent section of their own that is
                    # inherited by the definitions in the category.
                    super_super_section, _ = get_super_names(
                        super_def, super_categories=super_categories)

                    if super_super_section is None:
                        pass

                    elif super_section is None:
                        super_section = super_super_section

                    elif super_section == super_super_section:
                        pass

                    else:
                        logger.error('conflicting parent sections %s, %s for %s' % (
                            super_section, super_def.name, legacy_def.name))

            return super_section, super_categories

        for legacy_def in self.legacy_env.infoKindEls():
            super_section, super_categories = get_super_names(legacy_def)

            # Sorted, because the iteration order of a set of strings can differ
            # between runs, which would make the converted definitions differ too.
            category_names = sorted(super_categories)

            if super_section is None:
                legacy_def.superNames = category_names

            else:
                legacy_def.superNames = [super_section] + category_names

    def resolve(self, name: str) -> Iterable[Definition]:
        ''' Yields the definitions with the given (normalized) name from all converted packages. '''
        for package_conversion in self.package_conversions.values():
            definition = package_conversion.package.all_definitions.get(name)
            if definition is not None:
                yield definition


class PackageConversion:
    '''
    The conversion of a single legacy package. It collects the legacy definitions of
    the package, holds the created metainfo package, and keeps the created quantities
    until they are added to their parent section.

    Arguments:
        env_conversion: The conversion of the whole environment; used to resolve
            definitions from other packages.
        name: The name of the legacy package.
    '''

    def __init__(self, env_conversion: EnvironmentConversion, name: str):
        self.env_conversion = env_conversion
        self.legacy_defs: List[InfoKindEl] = []

        python_module, python_path = python_package_mapping(name)

        self.package = Package(
            name=normalize_name(name),
            a_legacy=LegacyPackage(name, python_module, python_path))

        # Quantities by normalized name; they are not part of the package until
        # set_super_names adds them to a section.
        self.quantities: Dict[str, Quantity] = {}

        self.logger = logger.bind(package=name)

    def create_definitions(self):
        ''' Creates an (uninitialized) definition for each legacy definition of the package. '''
        for legacy_def in self.legacy_defs:
            name = normalize_name(legacy_def.name)

            if legacy_def.kindStr == 'type_abstract_document_content':
                self.package.m_create(
                    Category, name=name, a_legacy=LegacyDefinition(name=legacy_def.name))

            elif legacy_def.kindStr == 'type_section':
                self.package.m_create(
                    Section, name=name,
                    a_legacy=LegacyDefinition(name=legacy_def.name))

            elif legacy_def.kindStr in ['type_dimension', 'type_document_content']:
                definition = Quantity(
                    name=name,
                    a_legacy=LegacyDefinition(name=legacy_def.name))
                self.quantities[name] = definition

            else:
                self.logger.error('unknown kindStr %s for %s' % (legacy_def.kindStr, name))

    def __resolve(self, name: str, create_extends: bool = False, required: bool = True):
        '''
        Returns the definition with the given normalized name. Definitions of this
        package are preferred over definitions from other packages.

        Arguments:
            name: The normalized name of the definition.
            create_extends: If True and the definition is a section of another package,
                a section that extends this section is created in this package and
                returned instead.
            required: If True (default) a missing definition is an error, otherwise
                None is returned for a missing definition.

        Raises:
            AssertionError: If there is no such definition and ``required`` is True.
        '''
        definition: Definition = self.package.all_definitions.get(name)
        if definition is None:
            definition = self.quantities.get(name)

        if definition is not None:
            if not (isinstance(definition, Section) and definition.extends_base_section) or create_extends:
                return definition

        for definition in self.env_conversion.resolve(name):
            if isinstance(definition, Section) and definition.extends_base_section:
                continue

            if create_extends and isinstance(definition, Section):
                extending_def = self.package.m_create(
                    Section, name=definition.name,
                    a_legacy=LegacyDefinition(name=definition.a_legacy.name))
                extending_def.base_sections = [definition]
                extending_def.extends_base_section = True
                return extending_def

            return definition

        if required:
            # Raised explicitly, because an assert statement is removed with -O.
            raise AssertionError('definition %s must be created now' % name)

        return None

    def __parent_section(self, legacy_def: InfoKindEl):
        '''
        Resolves all super names of the given legacy definition (creating extending
        sections where necessary) and returns the last one that is a section, or None.
        '''
        parent_section: Section = None
        for super_name in legacy_def.superNames:
            super_def = self.__resolve(normalize_name(super_name), create_extends=True)
            if isinstance(super_def, Section):
                parent_section = cast(Section, super_def)

        return parent_section

    def set_super_names(self):
        '''
        Puts sections and quantities into their parent sections: sections become the
        target of a new sub section, quantities are added to the section's quantities.
        '''
        for legacy_def in self.legacy_defs:
            name = normalize_name(legacy_def.name)
            definition = self.__resolve(name)

            if isinstance(definition, Section):
                parent_section = self.__parent_section(legacy_def)
                if parent_section is not None:
                    sub_section = parent_section.m_create(
                        SubSection, name=definition.name,
                        a_legacy=LegacyDefinition(name=legacy_def.name))
                    sub_section.sub_section = definition
                    # A missing repeats value is treated like repeats=True.
                    sub_section.repeats = legacy_def.repeats is None or legacy_def.repeats

            elif isinstance(definition, Quantity):
                parent_section = self.__parent_section(legacy_def)
                if parent_section is None:
                    self.logger.error('quantity %s has no parent section' % name)
                    continue

                parent_section.m_add_sub_section(Section.quantities, definition)

    def init_definitions(self):
        '''
        Sets the properties of all definitions: description and categories for all
        definitions; type, shape, and unit for quantities.
        '''
        for legacy_def in self.legacy_defs:
            name = normalize_name(legacy_def.name)
            definition = self.__resolve(name)

            def_logger = self.logger.bind(definition=definition.name)

            # common properties
            if legacy_def.description is not None and legacy_def.description.strip() != '':
                definition.description = legacy_def.description

            if isinstance(definition, Definition):
                # deal with categories
                categories: List[Category] = []
                for super_name in legacy_def.superNames:
                    super_def = self.__resolve(normalize_name(super_name))
                    if isinstance(super_def, Category):
                        categories.append(super_def)

                definition.categories = categories

            if isinstance(definition, Quantity):
                # type
                referenced_sections = legacy_def.extra_args.get('referencedSections', [])
                if len(referenced_sections) == 1:
                    # A missing referenced section is logged and replaced by an int
                    # type, therefore it must not be required when resolving.
                    referenced_section = self.__resolve(
                        normalize_name(referenced_sections[0]), required=False)
                    if referenced_section is None:
                        def_logger.error('could not find referencedSection %s of %s' % (
                            referenced_sections[0], definition.name))
                        definition.type = int
                    else:
                        definition.type = Reference(referenced_section)

                elif len(referenced_sections) > 1:
                    def_logger.error(
                        'higher dimensional references not yet supported: %s' % name)
                    definition.type = np.dtype(int)

                elif legacy_def.kindStr == 'type_dimension':
                    definition.type = int
                elif legacy_def.dtypeStr == 'D':
                    definition.type = Any
                elif legacy_def.dtypeStr == 'C':
                    definition.type = str
                elif legacy_def.dtypeStr == 'r':
                    def_logger.error('r typed quantity %s  doesn\'t have referencedSections' % name)
                    definition.type = int
                elif legacy_def.dtypeStr == 'b':
                    definition.type = bool
                elif legacy_def.dtypeStr == 'i64':
                    definition.type = np.dtype(np.int64)
                elif legacy_def.dtypeStr == 'f':
                    definition.type = np.dtype(np.float64)
                else:
                    definition.type = np.dtype(legacy_def.dtypeStr)

                # shapes
                legacy_shape = legacy_def.shape
                if legacy_shape is None:
                    legacy_shape = []

                definition.shape = legacy_shape
                if len(definition.shape) > 1 and definition.type == str:
                    # Usually only np types have higher shapes in old metainfo;
                    # str is one exception.
                    definition.type = np.dtype('U')

                # units
                if legacy_def.units is not None:
                    try:
                        definition.unit = units.parse_units(legacy_def.units)
                    except UndefinedUnitError:
                        def_logger.error('unknown unit %s' % legacy_def.units)
                    except ValueError as e:
                        def_logger.error('cannot parse unit %s' % legacy_def.units, exc_info=e)

def convert(metainfo_path: str) -> LegacyMetainfoEnvironment:
    '''
    Runs an :class:`EnvironmentConversion` on the legacy metainfo with the given
    path and returns the environment created by it.
    '''
    conversion = EnvironmentConversion(metainfo_path)
    return conversion.create_env()


456
def generate_metainfo_code(metainfo_env: LegacyMetainfoEnvironment):
    '''
    Generates python code with metainfo definitions for all packages in the given
    environment.

    One python file is written per package, to the path stored in the package's
    legacy annotation. In addition, an ``__init__.py`` for the whole environment is
    written to the directory that :func:`python_package_mapping` returns for the
    environment's legacy package name. Paths are used as they are, i.e. relative
    paths are relative to the working directory.

    Arguments:
        metainfo_env: The metainfo environment.
    '''
    from jinja2 import Environment as JinjaEnvironment, PackageLoader, select_autoescape
    import textwrap

    def format_description(description, indent=0, width=90):
        paragraphs = [paragraph.strip() for paragraph in description.split('\n')]

        def format_paragraph(paragraph, first):
            lines = textwrap.wrap(text=paragraph, width=width - indent * 4)
            lines = [line.replace('\\', '\\\\') for line in lines]
            # All lines are indented, except the very first line of the description.
            # This is decided by position and not by comparing line contents, because
            # a later line might start with the same text as the first line.
            prefix = ' ' * 4 * indent
            return '\n'.join(
                line if first and index == 0 else prefix + line
                for index, line in enumerate(lines))

        return '\n\n'.join([
            format_paragraph(p, i == 0)
            for i, p in enumerate(paragraphs) if p != ''])

    def format_type(pkg, mi_type):
        # isinstance instead of an exact type comparison: dtype instances can be
        # instances of np.dtype subclasses.
        if isinstance(mi_type, np.dtype):
            if mi_type == np.dtype('U'):
                return 'np.dtype(\'U\')'

            return 'np.dtype(np.%s)' % mi_type

        if mi_type in [int, float, str, bool]:
            return mi_type.__name__

        if isinstance(mi_type, Reference):
            if pkg == mi_type.target_section_def.m_parent:
                return "Reference(SectionProxy('%s'))" % mi_type.target_section_def.name

            # Sections from other packages are referred to via their imported module.
            python_module = mi_type.target_section_def.m_parent.a_legacy.python_module
            return '%s.%s' % (python_module.split('.')[-1], mi_type.target_section_def.name)

        return str(mi_type)

    def format_unit(unit):
        return "'%s'" % unit

    def format_definition_refs(pkg, definitions):
        def format_definition_ref(definition: Definition):
            if pkg == definition.m_parent:
                return definition.name

            python_module = definition.m_parent.a_legacy.python_module
            return '%s.%s' % (python_module.split('.')[-1], definition.name)

        return ', '.join([format_definition_ref(definition) for definition in definitions])

    def format_package_import(pkg):
        python_module = pkg.a_legacy.python_module
        modules = python_module.split('.')
        return 'from %s import %s' % ('.'.join(modules[:-1]), modules[-1])

    def order_categories(categories):
        return sorted(categories, key=lambda c: len(c.categories))

    env = JinjaEnvironment(
        loader=PackageLoader('nomad.metainfo', 'templates'),
        autoescape=select_autoescape(['python']))
    env.globals.update(
        order_categories=order_categories,
        format_description=format_description,
        format_type=format_type,
        format_unit=format_unit,
        format_definition_refs=format_definition_refs,
        # The (misspelled) key is kept as it is the name available to the templates.
        fromat_package_import=format_package_import)

    def write_code(template_name, path, **context):
        ''' Renders the template and writes the result without trailing whitespace. '''
        # Render before the file is opened, so that a failing template does not
        # leave an empty file behind.
        code = env.get_template(template_name).render(**context)
        code = '\n'.join(line.rstrip() for line in code.split('\n'))

        directory = os.path.dirname(path)
        if directory != '':
            os.makedirs(directory, exist_ok=True)

        with open(path, 'wt', encoding='utf-8') as f:
            f.write(code)

    for package in metainfo_env.packages:
        write_code('package.j2', package.a_legacy.python_path, pkg=package)

    _, path = python_package_mapping(metainfo_env.legacy_package_name)
    write_code(
        'environment.j2', os.path.join(os.path.dirname(path), '__init__.py'),
        env=metainfo_env)