legacy.py 23.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

'''
This module contains functionality to use old 'legacy' NOMAD CoE parsers with the
new nomad@fairdi infrastructure. This covers aspects like the new metainfo, a unifying
wrapper for parsers, parser logging, and a parser backend.
'''
20

21
from typing import cast, Dict, List, Union, Any, Set, Iterable, Tuple
22
import numpy as np
23
24
from pint.errors import UndefinedUnitError
import os.path
25

26

27
from nomadcore.local_meta_info import loadJsonFile, InfoKindEl, InfoKindEnv
28
29

from nomad import utils
30
from nomad.units import ureg
31
from nomad.metainfo import (
32
    Definition, SubSection, Package, Quantity, Category, Section, Reference,
33
    Environment, MEnum, MSection, DefinitionAnnotation)
34

35
# Module-level logger, named after this module; used for conversion errors and warnings.
logger = utils.get_logger(__name__)
36
37


38
39
40
# Legacy metainfo packages whose definitions are skipped when a legacy environment
# is converted (see the package check in EnvironmentConversion.__init__).
_ignored_packages = [
    'meta_types.nomadmetainfo.json',
    'repository.nomadmetainfo.json']
41
42


43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
class LegacyDefinition(DefinitionAnnotation):
    '''
    Definition annotation that stores the name a definition has in the legacy
    metainfo.
    '''

    def __init__(self, name: str):
        # the original (not normalized) legacy metainfo name
        self.name = name


class LegacyPackage(LegacyDefinition):
    '''
    Legacy annotation for packages. In addition to the legacy name, it stores the
    python module and the python file path that belong to the package.
    '''

    def __init__(self, name, python_module, python_path):
        super().__init__(name)
        self.python_module, self.python_path = python_module, python_path


def normalize_name(name: str):
    ''' Replaces all dots and dashes in the given name with underscores. '''
    underscore_table = str.maketrans('.-', '__')
    return name.translate(underscore_table)


def normalized_package_name(name: str):
    '''
    Turns a legacy metainfo file name into a (python) identifier: the
    '.nomadmetainfo.json' part is removed and the remainder is normalized with
    :func:`normalize_name`.
    '''
    without_extension = name.replace('.nomadmetainfo.json', '')
    return normalize_name(without_extension)


70
71
72
def python_package_mapping(metainfo_package_name: str) -> Tuple[str, str]:
    '''
    Maps a legacy metainfo package name to its python counterpart.

    Returns:
        A tuple with the python package (module) name and the path of the respective
        python file. The path denotes the file within the nomad git project.
    '''
    # the prefix is everything before the first dot of the name without extension
    stem = metainfo_package_name.replace('.nomadmetainfo.json', '')
    prefix = stem.split('.')[0]
    module_name = normalized_package_name(metainfo_package_name)

    # these prefixes map into the nomad datamodel package
    if prefix in ['common', 'general', 'public', 'dft', 'ems']:
        return (
            'nomad.datamodel.metainfo.%s' % module_name,
            '%s/%s.py' % ('nomad/datamodel/metainfo', module_name))

    # all other prefixes map into the respective parser project
    parser_dir = prefix.replace('_', '-')
    parser_prefix = prefix.replace('_', '')
    directory = 'dependencies/parsers/%s/%sparser/metainfo' % (parser_dir, parser_prefix)

    return (
        '%sparser.metainfo.%s' % (parser_prefix, module_name),
        '%s/%s.py' % (directory, module_name))


95
class LegacyMetainfoEnvironment(Environment):
    '''
    A metainfo environment with functions to create a legacy metainfo version of
    the environment.
    '''

    legacy_package_name = Quantity(type=str)

    def legacy_info(self, definition: Definition, *args, **kwargs) -> InfoKindEl:
        '''
        Creates a legacy metainfo object for the given definition.

        Arguments:
            definition: The section, quantity, or category definition to convert. Its
                legacy annotation (``a_legacy``) provides the legacy name.
            *args, **kwargs: Passed on to the :class:`InfoKindEl` constructor.

        Raises:
            TypeError: If the definition is a quantity with a type that has no legacy
                counterpart.
        '''
        # The list is shared with ``result``; parents are appended below.
        super_names: List[str] = [
            category.a_legacy.name for category in definition.categories]
        result: Dict[str, Any] = dict(
            name=definition.a_legacy.name,
            description=definition.description,
            superNames=super_names)

        if isinstance(definition, Section):
            # Resolve the sub sections that use this section only once.
            sub_sections = list(self.resolve_definitions(definition.name, SubSection))

            result['kindStr'] = 'type_section'
            result['repeats'] = any(sub_section.repeats for sub_section in sub_sections)
            super_names.extend(
                sub_section.m_parent_as(Definition).a_legacy.name
                for sub_section in sub_sections)

        elif isinstance(definition, Quantity):
            quantity_type = definition.type

            result['kindStr'] = 'type_document_content'
            result['shape'] = definition.shape
            dtype_str = None
            if quantity_type == int:
                dtype_str = 'i'
            elif quantity_type == float:
                dtype_str = 'f'
            elif quantity_type == bool:
                dtype_str = 'b'
            elif quantity_type == str:
                dtype_str = 'C'
            elif isinstance(quantity_type, Reference):
                dtype_str = 'r'
                result['referencedSections'] = [
                    quantity_type.target_section_def.m_resolved().a_legacy.name]
            elif isinstance(quantity_type, MEnum):
                dtype_str = 'C'
            elif isinstance(quantity_type, np.dtype):
                # isinstance instead of an exact type comparison: since NumPy 1.20
                # dtype instances belong to subclasses of np.dtype, so
                # ``type(x) == np.dtype`` is never true for them.
                dtype_str = quantity_type.name[0]
            elif quantity_type == Any:
                dtype_str = 'D'
            else:
                raise TypeError(
                    'Unsupported quantity type %s in %s.' % (quantity_type, definition))
            result['dtypeStr'] = dtype_str
            if definition.unit is not None:
                result['units'] = str(definition.unit)
            super_names.append(definition.m_parent_as(Definition).a_legacy.name)

        elif isinstance(definition, Category):
            result['kindStr'] = 'type_abstract_document_content'

        # Walk up the containment hierarchy until the package is reached.
        package = cast(MSection, definition)
        while not isinstance(package, Package):
            package = package.m_parent

        result['package'] = package.name

        return InfoKindEl(*args, **result, **kwargs)

    def legacy_info_env(self, packages: List[Package] = None, *args, **kwargs) -> InfoKindEnv:
        '''
        Creates a legacy metainfo environment with all definitions from the given packages.

        Arguments:
            packages: The packages to take definitions from. Default are all packages
                of this environment.
            *args, **kwargs: Passed on to the :class:`InfoKindEnv` constructor.
        '''
        if packages is None:
            packages = self.packages

        env = InfoKindEnv(*args, **kwargs)
        for package in packages:
            for definition in package.all_definitions.values():
                # Sections that only extend a base section get no legacy definition of
                # their own, but their quantities are still added.
                if not (isinstance(definition, Section) and definition.extends_base_section):
                    env.addInfoKindEl(self.legacy_info(definition))

                if isinstance(definition, Section):
                    for quantity in definition.quantities:
                        env.addInfoKindEl(self.legacy_info(quantity))

        return env

    def to_legacy_dict(
            self, packages: List[Package] = None, description: str = None,
            *args, **kwargs) -> Dict[str, Any]:
        '''
        Creates a dictionary that can be serialized to a legacy metainfo definition file
        (*.nomadmetainfo.json).

        Arguments:
            packages: Will add all definitions of these packages as actual definitions,
                all other packages will be added by import.
            description: The description for the legacy file. If None the description of
                the first package will be used.
        '''
        if packages is None:
            packages = []

        definitions = []
        dependencies = []
        for package in self.packages:
            if package not in packages:
                # Packages that are not requested are only referenced as dependency.
                dependencies.append(package)
                continue

            if description is None:
                description = package.description

            for definition in package.all_definitions.values():
                if not (isinstance(definition, Section) and definition.extends_base_section):
                    definitions.append(self.legacy_info(definition).toDict())

                if isinstance(definition, Section):
                    for quantity in definition.quantities:
                        definitions.append(self.legacy_info(quantity).toDict())

        return {
            'type': 'nomad_meta_info_1_0',
            'description': description,
            'dependencies': [
                {'relativePath': dependency.a_legacy.name}
                for dependency in dependencies],
            'metaInfos': definitions
        }

223

224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
class EnvironmentConversion:
    '''
    Converts a legacy metainfo environment into packages of the new metainfo.

    The conversion runs in the constructor; :func:`create_env` assembles the converted
    packages into a :class:`LegacyMetainfoEnvironment`.

    Arguments:
        legacy_env_or_path: Either a loaded legacy environment or the path to a legacy
            metainfo file that is loaded with ``loadJsonFile``.
    '''

    def __init__(self, legacy_env_or_path: Union[InfoKindEnv, str]):
        if isinstance(legacy_env_or_path, str):
            self.legacy_env, _ = loadJsonFile(filePath=legacy_env_or_path)

        else:
            self.legacy_env = cast(InfoKindEnv, legacy_env_or_path)

        self.__fix_legacy_super_names()

        self.package_conversions: Dict[str, PackageConversion] = {}

        # Group the legacy definitions by their package.
        for legacy_def in self.legacy_env.infoKindEls():
            if legacy_def.package in _ignored_packages:
                continue

            package_conversion = self.package_conversions.get(legacy_def.package)
            if package_conversion is None:
                package_conversion = PackageConversion(self, legacy_def.package)
                self.package_conversions[legacy_def.package] = package_conversion

            package_conversion.legacy_defs.append(legacy_def)

        # Each phase has to be finished for all packages before the next one starts,
        # because definitions are resolved across packages.
        for package_conversion in self.package_conversions.values():
            package_conversion.create_definitions()

        for package_conversion in self.package_conversions.values():
            package_conversion.set_super_names()

        for package_conversion in self.package_conversions.values():
            package_conversion.init_definitions()

    def create_env(self) -> LegacyMetainfoEnvironment:
        '''
        Creates an environment with all converted packages. Validation errors and
        warnings of the packages are logged, they do not abort the creation.
        '''
        env = LegacyMetainfoEnvironment()
        env.legacy_package_name = normalized_package_name(self.legacy_env.name)

        for package_conv in self.package_conversions.values():
            package = package_conv.package
            errors, warnings = package.m_all_validate()
            if len(errors) > 0:
                logger.error(
                    '%s. There are %d more errors in converted legacy package %s' %
                    (errors[0], len(errors) - 1, package))
            if len(warnings) > 0:
                logger.warning(
                    '%s. There are %d more warnings in converted legacy package %s' %
                    (warnings[0], len(warnings) - 1, package))
            env.m_add_sub_section(Environment.packages, package)
            package.init_metainfo()
        return env

    def __fix_legacy_super_names(self):
        '''
        Rewrites the super names of all legacy definitions to the parent section (if
        any) followed by all direct and indirect categories.
        '''

        def get_super_names(legacy_def: InfoKindEl, super_categories: Set[str] = None):
            # Returns the parent section name (or None) and the set of all categories
            # that are reachable through the super names of the given definition.
            super_section: str = None
            if super_categories is None:
                super_categories = set()

            for super_name in legacy_def.superNames:
                super_def = self.legacy_env.infoKindEl(super_name)

                if super_def.kindStr == 'type_section':
                    super_section = super_def.name
                    continue

                if super_def.kindStr != 'type_abstract_document_content':
                    continue

                super_categories.add(super_def.name)
                inherited_section, _ = get_super_names(
                    super_def, super_categories=super_categories)

                if inherited_section is None or inherited_section == super_section:
                    continue

                if super_section is None:
                    super_section = inherited_section
                else:
                    # Report both conflicting sections (not the category's name).
                    logger.error('conflicting parent sections %s, %s for %s' % (
                        super_section, inherited_section, legacy_def.name))

            return super_section, super_categories

        for legacy_def in self.legacy_env.infoKindEls():
            super_section, super_categories = get_super_names(legacy_def)

            # Sorted, because the iteration order of a set of strings is not stable
            # between interpreter runs.
            category_names = sorted(super_categories)

            if super_section is None:
                legacy_def.superNames = category_names

            else:
                legacy_def.superNames = [super_section] + category_names

    def resolve(self, name: str) -> Iterable[Definition]:
        ''' Yields the definitions with the given name from all converted packages. '''
        for package_conversion in self.package_conversions.values():
            definition = package_conversion.package.all_definitions.get(name)
            if definition is not None:
                yield definition


class PackageConversion:
    '''
    Converts the legacy definitions of one legacy package into a new metainfo package.

    The conversion is done in three phases that are driven by the
    :class:`EnvironmentConversion`: :func:`create_definitions`,
    :func:`set_super_names`, and :func:`init_definitions`.

    Arguments:
        env_conversion: The environment conversion this package conversion belongs to.
        name: The legacy name of the package.
    '''

    def __init__(self, env_conversion: EnvironmentConversion, name: str):
        self.env_conversion = env_conversion
        self.legacy_defs: List[InfoKindEl] = []

        python_module, python_path = python_package_mapping(name)

        self.package = Package(
            name=normalize_name(name),
            a_legacy=LegacyPackage(name, python_module, python_path))

        # Quantities are kept here by normalized name until set_super_names adds them
        # to their parent section.
        self.quantities: Dict[str, Quantity] = {}

        self.logger = logger.bind(package=name)

    def create_definitions(self):
        ''' Creates an (uninitialized) definition for each legacy definition. '''
        for legacy_def in self.legacy_defs:
            name = normalize_name(legacy_def.name)

            if legacy_def.kindStr == 'type_abstract_document_content':
                self.package.m_create(
                    Category, name=name, a_legacy=LegacyDefinition(name=legacy_def.name))

            elif legacy_def.kindStr == 'type_section':
                self.package.m_create(
                    Section, name=name,
                    a_legacy=LegacyDefinition(name=legacy_def.name))

            elif legacy_def.kindStr in ['type_dimension', 'type_document_content']:
                self.quantities[name] = Quantity(
                    name=name,
                    a_legacy=LegacyDefinition(name=legacy_def.name))

            else:
                self.logger.error(
                    'unknown kindStr %s for %s' % (legacy_def.kindStr, name))

    def __lookup(self, name: str, create_extends: bool = False) -> Union[Definition, None]:
        '''
        Looks up the definition with the given (normalized) name, first in this package
        and then in all packages of the environment conversion.

        Arguments:
            name: The normalized name of the definition.
            create_extends: If True, a section that is found in another package is not
                returned directly. Instead a section that extends it is created in
                this package and returned.

        Returns: The definition or None, if there is no such definition.
        '''
        definition: Definition = self.package.all_definitions.get(name)
        if definition is None:
            definition = self.quantities.get(name)

        if definition is not None:
            is_extending = isinstance(definition, Section) and definition.extends_base_section
            if not is_extending or create_extends:
                return definition

        for definition in self.env_conversion.resolve(name):
            if isinstance(definition, Section) and definition.extends_base_section:
                continue

            if create_extends and isinstance(definition, Section):
                extending_def = self.package.m_create(
                    Section, name=definition.name,
                    a_legacy=LegacyDefinition(name=definition.a_legacy.name))
                extending_def.base_sections = [definition]
                extending_def.extends_base_section = True
                return extending_def

            return definition

        return None

    def __resolve(self, name: str, create_extends: bool = False):
        '''
        Same as the lookup, but the definition is expected to exist.

        Raises:
            AssertionError: If there is no definition with the given name.
        '''
        definition = self.__lookup(name, create_extends=create_extends)
        if definition is None:
            # Raised explicitly, because an assert statement is removed under -O.
            raise AssertionError('definition %s must be created now' % name)

        return definition

    def __parent_section(self, legacy_def: InfoKindEl) -> Union[Section, None]:
        '''
        Resolves all super names of the given legacy definition (creating extending
        sections where necessary) and returns the last one that is a section, or None.
        '''
        parent_section: Section = None
        for super_name in legacy_def.superNames:
            super_def = self.__resolve(normalize_name(super_name), create_extends=True)
            if isinstance(super_def, Section):
                parent_section = cast(Section, super_def)

        return parent_section

    def set_super_names(self):
        '''
        Puts sections (as sub sections) and quantities into their parent sections.
        '''
        for legacy_def in self.legacy_defs:
            name = normalize_name(legacy_def.name)
            definition = self.__resolve(name)
            assert definition is not None, 'definition %s must exist' % name

            if isinstance(definition, Section):
                parent_section = self.__parent_section(legacy_def)
                if parent_section is not None:
                    sub_section = parent_section.m_create(
                        SubSection, name=definition.name,
                        a_legacy=LegacyDefinition(name=legacy_def.name))
                    sub_section.sub_section = definition
                    # A missing repeats in the legacy definition counts as repeating.
                    sub_section.repeats = legacy_def.repeats is None or legacy_def.repeats

            elif isinstance(definition, Quantity):
                parent_section = self.__parent_section(legacy_def)
                if parent_section is None:
                    # Without this check the call below fails with an AttributeError
                    # on None that does not name the faulty definition.
                    self.logger.error('quantity %s has no parent section' % name)
                    continue

                parent_section.m_add_sub_section(Section.quantities, definition)

    def init_definitions(self):
        '''
        Sets description, categories, and for quantities also type, shape, and unit of
        all definitions based on the legacy definitions.
        '''
        for legacy_def in self.legacy_defs:
            name = normalize_name(legacy_def.name)
            definition = self.__resolve(name)
            assert definition is not None, 'definition %s must exist' % name
            log = self.logger.bind(definition=definition.name)

            # common properties
            if legacy_def.description is not None and legacy_def.description.strip() != '':
                definition.description = legacy_def.description

            if isinstance(definition, Definition):
                # deal with categories
                categories: List[Category] = []
                for super_name in legacy_def.superNames:
                    super_def = self.__resolve(normalize_name(super_name))
                    if isinstance(super_def, Category):
                        categories.append(super_def)

                definition.categories = categories

            if not isinstance(definition, Quantity):
                continue

            # type
            referenced_sections = legacy_def.extra_args.get('referencedSections', [])
            if len(referenced_sections) == 1:
                # Definitions are registered under normalized names. The non raising
                # lookup is used so that a missing section is logged, not raised.
                referenced_section = self.__lookup(normalize_name(referenced_sections[0]))
                if referenced_section is None:
                    log.error('could not find referencedSection %s of %s' % (
                        referenced_sections[0], definition.name))
                    definition.type = int
                else:
                    definition.type = Reference(referenced_section)

            elif len(referenced_sections) > 1:
                log.error(
                    'higher dimensional references not yet supported: %s' % name)
                definition.type = np.dtype(int)

            elif legacy_def.kindStr == 'type_dimension':
                definition.type = int
            elif legacy_def.dtypeStr == 'D':
                definition.type = Any
            elif legacy_def.dtypeStr == 'C':
                definition.type = str
            elif legacy_def.dtypeStr == 'r':
                log.error('r typed quantity %s  doesn\'t have referencedSections' % name)
                definition.type = int
            elif legacy_def.dtypeStr == 'b':
                definition.type = bool
            elif legacy_def.dtypeStr == 'i64':
                definition.type = np.dtype(np.int64)
            elif legacy_def.dtypeStr == 'f':
                definition.type = np.dtype(np.float64)
            else:
                definition.type = np.dtype(legacy_def.dtypeStr)

            # shapes
            legacy_shape = legacy_def.shape
            if legacy_shape is None:
                legacy_shape = []

            definition.shape = legacy_shape
            if len(definition.shape) > 1 and definition.type == str:
                # Usually only np types have higher shapes in old metainfo;
                # str is one exception.
                definition.type = np.dtype('U')

            # units
            if legacy_def.units is not None:
                try:
                    definition.unit = ureg.parse_units(legacy_def.units)
                except UndefinedUnitError:
                    log.error('unknown unit %s' % legacy_def.units)
                except ValueError as e:
                    log.error('cannot parse unit %s' % legacy_def.units, exc_info=e)


def convert(metainfo_path: str) -> LegacyMetainfoEnvironment:
    '''
    Converts the legacy metainfo file at the given path and returns the resulting
    metainfo environment.
    '''
    conversion = EnvironmentConversion(metainfo_path)
    return conversion.create_env()


495
def generate_metainfo_code(metainfo_env: LegacyMetainfoEnvironment):
    '''
    Generates python code with metainfo definitions for all packages in the given
    environment.

    One python module is written per package, to the path stored in the legacy
    annotation of the package (``a_legacy.python_path``). Missing directories are
    created. In addition an ``__init__.py`` is written to the directory that
    :func:`python_package_mapping` returns for the ``legacy_package_name`` of the
    environment. Paths are used as given, i.e. relative paths are resolved against the
    current working directory.

    Arguments:
        metainfo_env: The metainfo environment to generate code for.
    '''
    from jinja2 import Environment as JinjaEnvironment, PackageLoader, select_autoescape
    import textwrap

    def format_description(description, indent=0, width=90):
        paragraphs = [paragraph.strip() for paragraph in description.split('\n')]

        def format_paragraph(paragraph, first):
            lines = textwrap.wrap(text=paragraph, width=width - indent * 4)
            # backslashes have to be escaped to survive in the generated string literal
            lines = [line.replace('\\', '\\\\') for line in lines]
            # the very first line of the description is not indented
            return textwrap.indent(
                '\n'.join(lines), ' ' * 4 * indent, lambda x: not (first and x.startswith(lines[0])))

        return '\n\n'.join([
            format_paragraph(p, i == 0)
            for i, p in enumerate(paragraphs) if p != ''])

    def format_type(pkg, mi_type):
        # isinstance instead of an exact type comparison: since NumPy 1.20 dtype
        # instances belong to subclasses of np.dtype, so ``type(x) == np.dtype`` is
        # never true for them.
        if isinstance(mi_type, np.dtype):
            if mi_type == np.dtype('U'):
                return 'np.dtype(\'U\')'

            return 'np.dtype(np.%s)' % mi_type

        if mi_type in [int, float, str, bool]:
            return mi_type.__name__

        if isinstance(mi_type, Reference):
            if pkg == mi_type.target_section_def.m_parent:
                return "Reference(SectionProxy('%s'))" % mi_type.target_section_def.name

            python_module = mi_type.target_section_def.m_parent.a_legacy.python_module
            return '%s.%s' % (python_module.split('.')[-1], mi_type.target_section_def.name)

        return str(mi_type)

    def format_unit(unit):
        return "'%s'" % unit

    def format_definition_refs(pkg, definitions):
        def format_definition_ref(definition: Definition):
            if pkg == definition.m_parent:
                return definition.name

            # definitions of other packages are referenced through their module
            python_module = definition.m_parent.a_legacy.python_module
            return '%s.%s' % (python_module.split('.')[-1], definition.name)

        return ', '.join([format_definition_ref(definition) for definition in definitions])

    # NOTE(review): the misspelled name is registered as a template global below and
    # is presumably used by the templates under this name; it is therefore kept.
    def fromat_package_import(pkg):
        python_module = pkg.a_legacy.python_module
        modules = python_module.split('.')
        return 'from %s import %s' % ('.'.join(modules[:-1]), modules[-1])

    def order_categories(categories):
        return sorted(categories, key=lambda c: len(c.categories))

    def render(template_name, **context):
        # Renders the template and removes trailing whitespace from all lines;
        # whitespace-only lines become empty lines.
        code = env.get_template(template_name).render(**context)
        return '\n'.join([line.rstrip() for line in code.split('\n')])

    env = JinjaEnvironment(
        loader=PackageLoader('nomad.metainfo', 'templates'),
        autoescape=select_autoescape(['python']))
    env.globals.update(
        order_categories=order_categories,
        format_description=format_description,
        format_type=format_type,
        format_unit=format_unit,
        format_definition_refs=format_definition_refs,
        fromat_package_import=fromat_package_import)

    for package in metainfo_env.packages:
        path = package.a_legacy.python_path
        directory = os.path.dirname(path)
        if directory != '':
            os.makedirs(directory, exist_ok=True)

        # Render before the file is opened, so that a failing template does not leave
        # a truncated file behind.
        code = render('package.j2', pkg=package)
        with open(path, 'wt') as f:
            f.write(code)

    _, path = python_package_mapping(metainfo_env.legacy_package_name)
    code = render('environment.j2', env=metainfo_env)
    with open(os.path.join(os.path.dirname(path), '__init__.py'), 'wt') as f:
        f.write(code)