legacy.py 24.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an"AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

'''
This module contains functionality to use old 'legacy' NOMAD CoE parsers with the
new nomad@fairdi infrastructure. This covers aspects like the new metainfo, a unifying
wrapper for parsers, parser logging, and a parser backend.
'''
20

21
from typing import cast, Dict, List, Union, Any, Set, Iterable, Tuple
22
import numpy as np
23
24
from pint.errors import UndefinedUnitError
import os.path
25

26

27
from nomadcore.local_meta_info import loadJsonFile, InfoKindEl, InfoKindEnv
28
29

from nomad import utils
30
from nomad.units import ureg
31
from nomad.metainfo import (
32
    Definition, SubSection, Package, Quantity, Category, Section, Reference,
33
    Environment, MEnum, MSection, DefinitionAnnotation)
34

35
logger = utils.get_logger(__name__)
36
37


38
39
40
_ignored_packages = [
    'meta_types.nomadmetainfo.json',
    'repository.nomadmetainfo.json']
41
42


43
44
45
46
47
48
49
50
51
52
53
54
55
56
class LegacyDefinition(DefinitionAnnotation):

    def __init__(self, name: str):
        self.name = name


class LegacyPackage(LegacyDefinition):
    def __init__(self, name, python_module, python_path):
        super().__init__(name)

        self.python_module = python_module
        self.python_path = python_path


57
58
59
60
61
62
63
def def_name(definition):
    try:
        return definition.a_legacy.name
    except AttributeError:
        return definition.name


64
65
66
67
68
69
70
71
72
73
74
75
76
def normalize_name(name: str):
    return name.replace('.', '_').replace('-', '_')


def normalized_package_name(name: str):
    '''
    Transforms legacy metainfo '.nomadmetainfo.json' filenames into proper (python)
    identifier.
    '''
    name = name.replace('.nomadmetainfo.json', '')
    return normalize_name(name)


77
78
79
def python_package_mapping(metainfo_package_name: str) -> Tuple[str, str]:
    '''
    Compute the python package for the given metainfo package name. It returns
80
    a tuple containing a package name and a file path. The filepath denotes the file
81
82
    for this package within the nomad git project.
    '''
83
84
    prefix = metainfo_package_name.replace('.nomadmetainfo.json', '').split('.')[0]
    metainfo_package_name = normalized_package_name(metainfo_package_name)
85

86
    if prefix in ['common', 'general', 'public', 'dft', 'ems']:
87
88
89
90
        directory = 'nomad/datamodel/metainfo'
        python_package_name = 'nomad.datamodel.metainfo.%s' % metainfo_package_name

    else:
91
92
93
94
        parser_dir = prefix.replace('_', '-')
        prefix = prefix.replace('_', '')

        directory = 'dependencies/parsers/%s/%sparser/metainfo' % (parser_dir, prefix)
95
96
97
98
99
100
101
        python_package_name = '%sparser.metainfo.%s' % (prefix, metainfo_package_name)

    path = '%s/%s.py' % (directory, metainfo_package_name)

    return python_package_name, path


102
class LegacyMetainfoEnvironment(Environment):
103
    '''
104
105
    A metainfo environment with functions to create a legacy metainfo version of
    the environment.
106
    '''
107
108
109

    legacy_package_name = Quantity(type=str)

110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__section_to_sub_section_name = None

    @property
    def section_to_sub_section_name(self) -> Dict[str, str]:
        if self.__section_to_sub_section_name is not None:
            return self.__section_to_sub_section_name

        self.__section_to_sub_section_name = dict()
        for definition in self.m_all_contents():
            if definition.m_def == SubSection.m_def:
                self.__section_to_sub_section_name[definition.sub_section.name] = definition.name

        return self.__section_to_sub_section_name

126
    def legacy_info(self, definition: Definition, *args, **kwargs) -> InfoKindEl:
127
        ''' Creates a legacy metainfo object for the given definition. '''
128
129
        super_names: List[str] = list()
        result: Dict[str, Any] = dict(
130
            name=def_name(definition),
131
132
133
134
            description=definition.description,
            superNames=super_names)

        for category in definition.categories:
135
            super_names.append(def_name(category))
136
137

        if isinstance(definition, Section):
138
            sub_section_name = self.section_to_sub_section_name.get(definition.name, definition.name)
139
140
141
            result['kindStr'] = 'type_section'
            result['repeats'] = any(
                sub_section.repeats
142
                for sub_section in self.resolve_definitions(sub_section_name, SubSection))
143

144
145
            for sub_section in self.resolve_definitions(sub_section_name, SubSection):
                super_names.append(def_name(sub_section.m_parent_as(Definition)))
146
147

        elif isinstance(definition, Quantity):
148
            result['kindStr'] = 'type_document_content'
149
150
151
152
153
154
155
156
157
158
159
160
            result['shape'] = definition.shape
            dtype_str = None
            if definition.type == int:
                dtype_str = 'i'
            elif definition.type == float:
                dtype_str = 'f'
            elif definition.type == bool:
                dtype_str = 'b'
            elif definition.type == str:
                dtype_str = 'C'
            elif isinstance(definition.type, Reference):
                dtype_str = 'r'
161
                result['referencedSections'] = [
162
                    def_name(definition.type.target_section_def.m_resolved())]
163
            elif isinstance(definition.type, MEnum):
164
165
166
167
168
169
                dtype_str = 'C'
            elif type(definition.type) == np.dtype:
                dtype_str = definition.type.name[0]
            elif definition.type == Any:
                dtype_str = 'D'
            else:
170
171
172
                dtype_str = str(definition.type)
                # raise TypeError(
                #     'Unsupported quantity type %s in %s.' % (definition.type, definition))
173
174
175
            result['dtypeStr'] = dtype_str
            if definition.unit is not None:
                result['units'] = str(definition.unit)
176
            super_names.append(def_name(definition.m_parent_as(Definition)))
177
178

        elif isinstance(definition, Category):
179
            result['kindStr'] = 'type_abstract_document_content'
180

181
182
183
184
185
186
        package = cast(MSection, definition)
        while not isinstance(package, Package):
            package = package.m_parent

        result['package'] = package.name

187
188
189
        return InfoKindEl(*args, **result, **kwargs)

    def legacy_info_env(self, packages: List[Package] = None, *args, **kwargs) -> InfoKindEnv:
190
        ''' Creates a legacy metainfo environment with all definitions from the given packages. '''
191
        if packages is None:
192
            packages = self.packages
193
194
195
196

        env = InfoKindEnv(*args, **kwargs)
        for package in packages:
            for definition in package.all_definitions.values():
197
198
199
                if not (isinstance(definition, Section) and definition.extends_base_section):
                    env.addInfoKindEl(self.legacy_info(definition))

200
201
202
203
204
205
                if isinstance(definition, Section):
                    for quantity in definition.quantities:
                        env.addInfoKindEl(self.legacy_info(quantity))

        return env

206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
    def to_legacy_dict(
            self, packages: List[Package] = None, description: str = None,
            *args, **kwargs) -> Dict[str, Any]:
        '''
        Creates a dictionary that can be serialized to a legacy metainfo definition file
        (*.nomadmetainfo.json).

        Arguments:
            package: Will add all definitions of these packages as actual definitions,
                all other packages will be added by import.
            description: The description for the legacy file. If None the description of
                the firs package will be used.
        '''
        if packages is None:
            packages = []

        definitions = []
        dependencies = []
        for package in self.packages:
            if package in packages:
                if description is None:
                    description = package.description

                for definition in package.all_definitions.values():
                    if not (isinstance(definition, Section) and definition.extends_base_section):
                        definitions.append(self.legacy_info(definition).toDict())

                    if isinstance(definition, Section):
                        for quantity in definition.quantities:
                            definitions.append(self.legacy_info(quantity).toDict())
            else:
                dependencies.append(package)

        return {
            'type': 'nomad_meta_info_1_0',
            'description': description,
            'dependencies': [
243
                {'relativePath': def_name(dependency)}
244
245
246
247
                for dependency in dependencies],
            'metaInfos': definitions
        }

248

249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
class EnvironmentConversion:
    def __init__(self, legacy_env_or_path: Union[InfoKindEnv, str]):
        if isinstance(legacy_env_or_path, str):
            self.legacy_env, _ = loadJsonFile(filePath=legacy_env_or_path)

        else:
            self.legacy_env = cast(InfoKindEnv, legacy_env_or_path)

        self.__fix_legacy_super_names()

        self.package_conversions: Dict[str, PackageConversion] = {}

        for legacy_def in self.legacy_env.infoKindEls():
            if legacy_def.package in _ignored_packages:
                continue
264
            # legacy_def.package = normalized_package_name(legacy_def.package)
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
            package_conversion = self.package_conversions.get(legacy_def.package)
            if package_conversion is None:
                package_conversion = PackageConversion(self, legacy_def.package)
                self.package_conversions[legacy_def.package] = package_conversion

            package_conversion.legacy_defs.append(legacy_def)

        for package_conversion in self.package_conversions.values():
            package_conversion.create_definitions()

        for package_conversion in self.package_conversions.values():
            package_conversion.set_super_names()

        for package_conversion in self.package_conversions.values():
            package_conversion.init_definitions()

    def create_env(self) -> LegacyMetainfoEnvironment:
        env = LegacyMetainfoEnvironment()
283
        env.legacy_package_name = normalized_package_name(self.legacy_env.name)
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
        for package_conv in self.package_conversions.values():
            package = package_conv.package
            errors, warnings = package.m_all_validate()
            if len(errors) > 0:
                logger.error(
                    '%s. There are %d more errors in converted legacy package %s' %
                    (errors[0], len(errors) - 1, package))
            if len(warnings) > 0:
                logger.warn(
                    '%s. There are %d more warnings in converted legacy package %s' %
                    (warnings[0], len(warnings) - 1, package))
            env.m_add_sub_section(Environment.packages, package)
            package.init_metainfo()
        return env

    def __fix_legacy_super_names(self):

        def get_super_names(legacy_def: InfoKindEl, super_categories: Set[str] = None):
            super_section: str = None
            if super_categories is None:
                super_categories = set()

            for super_name in legacy_def.superNames:
                super_def = self.legacy_env.infoKindEl(super_name)

                if super_def.kindStr == 'type_section':
                    super_section = super_def.name

                elif super_def.kindStr == 'type_abstract_document_content':
                    super_categories.add(super_def.name)
                    super_super_section, _ = get_super_names(super_def, super_categories=super_categories)
315

316
317
                    if super_super_section is None:
                        pass
318

319
320
                    elif super_section is None:
                        super_section = super_super_section
321

322
323
                    elif super_section == super_super_section:
                        pass
324

325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
                    else:
                        logger.error('conflicting parent sections %s, %s for %s' % (
                            super_section, super_def.name, legacy_def.name))

            return super_section, super_categories

        for legacy_def in self.legacy_env.infoKindEls():
            super_section, super_categories = get_super_names(legacy_def)

            if super_section is None:
                legacy_def.superNames = list(super_categories)

            else:
                legacy_def.superNames = [super_section] + list(super_categories)

    def resolve(self, name: str) -> Iterable[Definition]:
        for package_conversion in self.package_conversions.values():
            definition = package_conversion.package.all_definitions.get(name)
            if definition is not None:
                yield definition


class PackageConversion:

    def __init__(self, env_conversion: EnvironmentConversion, name: str):
        self.env_conversion = env_conversion
        self.legacy_defs: List[InfoKindEl] = []

353
354
355
356
357
        python_module, python_path = python_package_mapping(name)

        self.package = Package(
            name=normalize_name(name),
            a_legacy=LegacyPackage(name, python_module, python_path))
358

359
360
361
362
363
364
        self.quantities: Dict[str, Quantity] = {}

        self.logger = logger.bind(package=name)

    def create_definitions(self):
        for legacy_def in self.legacy_defs:
365
            name = normalize_name(legacy_def.name)
366
367

            if legacy_def.kindStr == 'type_abstract_document_content':
368
369
                self.package.m_create(
                    Category, name=name, a_legacy=LegacyDefinition(name=legacy_def.name))
370
371

            elif legacy_def.kindStr == 'type_section':
372
373
374
                self.package.m_create(
                    Section, name=name,
                    a_legacy=LegacyDefinition(name=legacy_def.name))
375
376

            elif legacy_def.kindStr in ['type_dimension', 'type_document_content']:
377
378
379
                definition = Quantity(
                    name=name,
                    a_legacy=LegacyDefinition(name=legacy_def.name))
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
                self.quantities[name] = (definition)

            else:
                logger.error('unknown kindStr %s for %s' % (legacy_def.kindStr, name))

    def __resolve(self, name: str, create_extends: bool = False):
        definition: Definition = self.package.all_definitions.get(name)
        if definition is None:
            definition = self.quantities.get(name)

        if definition is not None:
            if not (isinstance(definition, Section) and definition.extends_base_section) or create_extends:
                return definition

        for definition in self.env_conversion.resolve(name):
            if isinstance(definition, Section) and definition.extends_base_section:
                continue

            if create_extends and isinstance(definition, Section):
399
400
401
                extending_def = self.package.m_create(
                    Section, name=definition.name,
                    a_legacy=LegacyDefinition(name=definition.a_legacy.name))
402
403
404
405
406
407
408
409
410
411
                extending_def.base_sections = [definition]
                extending_def.extends_base_section = True
                return extending_def

            return definition

        assert False, 'definition %s must be created now' % name

    def set_super_names(self):
        for legacy_def in self.legacy_defs:
412
413
414
            name = normalize_name(legacy_def.name)
            definition = self.__resolve(name)
            assert definition is not None, 'definition %s must exist' % name
415
416
417
418

            if isinstance(definition, Section):
                parent_section: Section = None
                for super_name in legacy_def.superNames:
419
                    super_def = self.__resolve(normalize_name(super_name), create_extends=True)
420
421
422
423
                    if isinstance(super_def, Section):
                        parent_section = cast(Section, super_def)

                if parent_section is not None:
424
425
426
                    sub_section = parent_section.m_create(
                        SubSection, name=definition.name,
                        a_legacy=LegacyDefinition(name=legacy_def.name))
427
428
429
430
431
432
                    sub_section.sub_section = definition
                    sub_section.repeats = legacy_def.repeats is None or legacy_def.repeats

            if isinstance(definition, Quantity):
                parent_section: Section = None
                for super_name in legacy_def.superNames:
433
                    super_def = self.__resolve(normalize_name(super_name), create_extends=True)
434
435
436
437
438
439
440
                    if isinstance(super_def, Section):
                        parent_section = cast(Section, super_def)

                parent_section.m_add_sub_section(Section.quantities, definition)

    def init_definitions(self):
        for legacy_def in self.legacy_defs:
441
442
443
            name = normalize_name(legacy_def.name)
            definition = self.__resolve(name)
            assert definition is not None, 'definition %s must exist' % name
444
445
446
            logger = self.logger.bind(definition=definition.name)

            # common properties
447
448
            if legacy_def.description is not None and legacy_def.description.strip() != '':
                definition.description = legacy_def.description
449
450
451
452
453

            if isinstance(definition, Definition):
                # deal with categories
                categories: List[Category] = []
                for super_name in legacy_def.superNames:
454
                    super_def = self.__resolve(normalize_name(super_name))
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
                    if isinstance(super_def, Category):
                        categories.append(super_def)

                definition.categories = categories

            if isinstance(definition, Quantity):
                # type
                referenced_sections = legacy_def.extra_args.get('referencedSections', [])
                if len(referenced_sections) == 1:
                    referenced_section = self.__resolve(referenced_sections[0])
                    if referenced_section is None:
                        logger.error('could not find referencedSection %s of %s' % (
                            referenced_sections[0], definition.name))
                        definition.type = int
                    else:
                        definition.type = Reference(referenced_section)

                elif len(referenced_sections) > 1:
                    logger.error(
474
                        'higher dimensional references not yet supported: %s' % name)
475
476
477
478
479
480
481
482
483
                    definition.type = np.dtype(int)

                elif legacy_def.kindStr == 'type_dimension':
                    definition.type = int
                elif legacy_def.dtypeStr == 'D':
                    definition.type = Any
                elif legacy_def.dtypeStr == 'C':
                    definition.type = str
                elif legacy_def.dtypeStr == 'r':
484
                    logger.error('r typed quantity %s  doesn\'t have referencedSections' % name)
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
                    definition.type = int
                elif legacy_def.dtypeStr == 'b':
                    definition.type = bool
                elif legacy_def.dtypeStr == 'i64':
                    definition.type = np.dtype(np.int64)
                elif legacy_def.dtypeStr == 'f':
                    definition.type = np.dtype(np.float64)
                else:
                    definition.type = np.dtype(legacy_def.dtypeStr)

                # shapes
                legacy_shape = legacy_def.shape
                if legacy_shape is None:
                    legacy_shape = []

                definition.shape = legacy_shape
                if len(definition.shape) > 1 and definition.type == str:
                    # Usually only np types have higher shapes in old metainfo;
                    # str is one exception.
                    definition.type = np.dtype('U')

                # units
                if legacy_def.units is not None:
                    try:
509
                        definition.unit = ureg.parse_units(legacy_def.units)
510
511
512
513
514
515
516
517
518
519
                    except UndefinedUnitError:
                        logger.error('unknown unit %s' % legacy_def.units)
                    except ValueError as e:
                        logger.error('cannot parse unit %s' % legacy_def.units, exc_info=e)


def convert(metainfo_path: str) -> LegacyMetainfoEnvironment:
    return EnvironmentConversion(metainfo_path).create_env()


520
def generate_metainfo_code(metainfo_env: LegacyMetainfoEnvironment):
521
522
523
524
525
526
    '''
    Generates python code with metainfo definitions for all packages in the given
    environement

    Arguments:
        env: The metainfo environment.
527
528
529
        python_package_path: An optional directory path. The directory must exist. Default
            is the working directory. The path will be used to form the module prefix
            for generated Python modules.
530
    '''
531
532
    from jinja2 import Environment as JinjaEnvironment, PackageLoader, select_autoescape
    import textwrap
533
534
535
536
537
538

    def format_description(description, indent=0, width=90):
        paragraphs = [paragraph.strip() for paragraph in description.split('\n')]

        def format_paragraph(paragraph, first):
            lines = textwrap.wrap(text=paragraph, width=width - indent * 4)
539
            lines = [line.replace('\\', '\\\\') for line in lines]
540
541
542
543
544
545
546
            return textwrap.indent(
                '\n'.join(lines), ' ' * 4 * indent, lambda x: not (first and x.startswith(lines[0])))

        return '\n\n'.join([
            format_paragraph(p, i == 0)
            for i, p in enumerate(paragraphs) if p != ''])

547
    def format_type(pkg, mi_type):
548
        if type(mi_type) == np.dtype:
549
550
551
            if mi_type == np.dtype('U'):
                return 'np.dtype(\'U\')'

552
            return 'np.dtype(np.%s)' % mi_type
553

554
555
        if mi_type in [int, float, str, bool]:
            return mi_type.__name__
556

557
        if isinstance(mi_type, Reference):
558
559
560
561
            if pkg == mi_type.target_section_def.m_parent:
                return "Reference(SectionProxy('%s'))" % mi_type.target_section_def.name

            else:
562
563
                python_module = mi_type.target_section_def.m_parent.a_legacy.python_module
                return '%s.%s' % (python_module.split('.')[-1], mi_type.target_section_def.name)
564

565
566
567
568
569
570
571
572
573
574
        else:
            return str(mi_type)

    def format_unit(unit):
        return "'%s'" % unit

    def format_definition_refs(pkg, definitions):
        def format_definition_ref(definition: Definition):
            if pkg == definition.m_parent:
                return definition.name
575
            else:
576
577
                python_module = definition.m_parent.a_legacy.python_module
                return '%s.%s' % (python_module.split('.')[-1], definition.name)
578
579
580

        return ', '.join([format_definition_ref(definition) for definition in definitions])

581
    def fromat_package_import(pkg):
582
583
584
585
586
587
        python_module = pkg.a_legacy.python_module
        modules = python_module.split('.')
        return 'from %s import %s' % ('.'.join(modules[:-1]), modules[-1])

    def order_categories(categories):
        return sorted(categories, key=lambda c: len(c.categories))
588

589
590
591
592
    env = JinjaEnvironment(
        loader=PackageLoader('nomad.metainfo', 'templates'),
        autoescape=select_autoescape(['python']))
    env.globals.update(
593
        order_categories=order_categories,
594
595
596
        format_description=format_description,
        format_type=format_type,
        format_unit=format_unit,
597
598
        format_definition_refs=format_definition_refs,
        fromat_package_import=fromat_package_import)
599
600

    for package in metainfo_env.packages:
601
        path = package.a_legacy.python_path
602
603
604
605
        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))

        with open(path, 'wt') as f:
606
607
608
609
610
            code = env.get_template('package.j2').render(pkg=package)
            code = '\n'.join([
                line.rstrip() if line.strip() != '' else ''
                for line in code.split('\n')])
            f.write(code)
611

612
613
    _, path = python_package_mapping(metainfo_env.legacy_package_name)
    with open(os.path.join(os.path.dirname(path), '__init__.py'), 'wt') as f:
614

615
616
617
618
619
        code = env.get_template('environment.j2').render(env=metainfo_env)
        code = '\n'.join([
            line.rstrip() if line.strip() != '' else ''
            for line in code.split('\n')])
        f.write(code)