legacy.py 24.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

'''
This module contains functionality to use old 'legacy' NOMAD CoE parsers with the
new nomad@fairdi infrastructure. This covers aspects like the new metainfo, a unifying
wrapper for parsers, parser logging, and a parser backend.
'''
20

21
from typing import cast, Dict, List, Union, Any, Set, Iterable, Tuple
22
import numpy as np
23
24
from pint.errors import UndefinedUnitError
import os.path
25
import importlib
26

27

28
from nomadcore.local_meta_info import loadJsonFile, InfoKindEl, InfoKindEnv
29
30

from nomad import utils
31
from nomad.units import ureg
32
from nomad.metainfo import (
33
    Definition, SubSection, Package, Quantity, Category, Section, Reference,
34
    Environment, MEnum, MSection, DefinitionAnnotation)
35

36
logger = utils.get_logger(__name__)
37
38


39
40
41
_ignored_packages = [
    'meta_types.nomadmetainfo.json',
    'repository.nomadmetainfo.json']
42
43


44
45
46
47
48
49
50
51
52
53
54
55
56
57
class LegacyDefinition(DefinitionAnnotation):
    ''' Annotation that stores the name a definition has in the legacy metainfo. '''

    def __init__(self, name: str) -> None:
        # the legacy name of the annotated definition
        self.name: str = name


class LegacyPackage(LegacyDefinition):
    '''
    Legacy annotation for packages. In addition to the legacy name, it stores a
    python module and a python path for the package.
    '''

    def __init__(self, name, python_module, python_path):
        super().__init__(name)

        # both values are stored as given
        self.python_module, self.python_path = python_module, python_path


58
59
60
61
62
63
64
def def_name(definition):
    '''
    Returns the legacy name of the given definition, i.e. the name of its ``a_legacy``
    annotation. Falls back to the regular name of the definition, if there is no usable
    legacy annotation.
    '''
    try:
        legacy_annotation = definition.a_legacy
        resolved_name = legacy_annotation.name
    except AttributeError:
        # either no legacy annotation or an annotation without a name
        resolved_name = definition.name

    return resolved_name


65
66
67
68
69
70
71
72
73
74
75
76
77
def normalize_name(name: str):
    ''' Replaces every ``.`` and ``-`` in the given name with ``_``. '''
    return name.translate(str.maketrans('.-', '__'))


def normalized_package_name(name: str):
    '''
    Turns the filename of a legacy metainfo package ('*.nomadmetainfo.json') into a
    name that can be used as a (python) identifier.
    '''
    # drop the legacy extension before the remaining name gets normalized
    without_extension = ''.join(name.split('.nomadmetainfo.json'))
    return normalize_name(without_extension)


78
79
80
def python_package_mapping(metainfo_package_name: str) -> Tuple[str, str]:
    '''
    Compute the python package for the given metainfo package name. It returns
    a tuple containing a package name and a file path. The filepath denotes the file
    for this package within the nomad git project.

    Arguments:
        metainfo_package_name: The legacy package name, with or without the
            '.nomadmetainfo.json' extension.

    Returns:
        A tuple with the dotted python module name and the path of the respective
        python file.
    '''
    # the part of the name before the first dot decides where the package is located
    prefix = metainfo_package_name.replace('.nomadmetainfo.json', '').split('.')[0]
    metainfo_package_name = normalized_package_name(metainfo_package_name)

    if prefix in ('common', 'general', 'public', 'dft', 'ems'):
        # these packages are placed within the nomad datamodel
        directory = 'nomad/datamodel/metainfo'
        python_package_name = 'nomad.datamodel.metainfo.%s' % metainfo_package_name

    else:
        # all other packages are placed within a parser project named after the prefix
        parser_dir = prefix.replace('_', '-')
        prefix = prefix.replace('_', '')

        directory = 'dependencies/parsers/%s/%sparser/metainfo' % (parser_dir, prefix)
        python_package_name = '%sparser.metainfo.%s' % (prefix, metainfo_package_name)

    path = '%s/%s.py' % (directory, metainfo_package_name)

    return python_package_name, path


103
class LegacyMetainfoEnvironment(Environment):
    '''
    A metainfo environment with functions to create a legacy metainfo version of
    the environment.
    '''

    @staticmethod
    def from_legacy_package_path(path):
        '''
        Loads the environment that belongs to the legacy package file with the given path.

        The file name (without '.nomadmetainfo.json' and '.json' extensions) is mapped to
        a python module with :func:`python_package_mapping`. The package that contains
        this module is imported and its ``m_env`` attribute is returned.
        '''
        package = os.path.basename(path)
        for extension in ('.nomadmetainfo.json', '.json'):
            if package.endswith(extension):
                package = package[:-len(extension)]

        python_module_name, _ = python_package_mapping(package)
        # the environment is taken from the package that contains the mapped module
        python_package_name = python_module_name.rpartition('.')[0]
        python_module = importlib.import_module(python_package_name)

        return getattr(python_module, 'm_env')

    legacy_package_name = Quantity(type=str)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # computed on first use by the section_to_sub_section_name property
        self.__section_to_sub_section_name = None

    @property
    def section_to_sub_section_name(self) -> Dict[str, str]:
        '''
        A dictionary that maps the names of sections used by sub sections to the
        names of these sub sections. It is computed once and then cached.
        '''
        if self.__section_to_sub_section_name is not None:
            return self.__section_to_sub_section_name

        mapping: Dict[str, str] = dict()
        for definition in self.m_all_contents():
            if definition.m_def == SubSection.m_def:
                mapping[definition.sub_section.name] = definition.name

        self.__section_to_sub_section_name = mapping
        return mapping

    def legacy_info(self, definition: Definition, *args, **kwargs) -> InfoKindEl:
        '''
        Creates a legacy metainfo object for the given definition.

        Arguments:
            definition: The section, quantity, or category definition to convert.
            *args, **kwargs: Are passed on to the :class:`InfoKindEl` constructor.
        '''
        super_names: List[str] = [def_name(category) for category in definition.categories]
        result: Dict[str, Any] = dict(
            name=def_name(definition),
            description=definition.description,
            superNames=super_names)

        if isinstance(definition, Section):
            sub_section_name = self.section_to_sub_section_name.get(
                definition.name, definition.name)
            # resolved once, the sub sections are needed for 'repeats' and the super names
            sub_sections = list(self.resolve_definitions(sub_section_name, SubSection))

            result['kindStr'] = 'type_section'
            result['repeats'] = any(sub_section.repeats for sub_section in sub_sections)
            super_names.extend(
                def_name(sub_section.m_parent_as(Definition))
                for sub_section in sub_sections)

        elif isinstance(definition, Quantity):
            result['kindStr'] = 'type_document_content'
            result['shape'] = definition.shape
            dtype_str = None
            if definition.type == int:
                dtype_str = 'i'
            elif definition.type == float:
                dtype_str = 'f'
            elif definition.type == bool:
                dtype_str = 'b'
            elif definition.type == str:
                dtype_str = 'C'
            elif isinstance(definition.type, Reference):
                dtype_str = 'r'
                result['referencedSections'] = [
                    def_name(definition.type.target_section_def.m_resolved())]
            elif isinstance(definition.type, MEnum):
                dtype_str = 'C'
            elif isinstance(definition.type, np.dtype):
                # isinstance is required here: numpy dtype objects can be instances of
                # np.dtype subclasses, which an exact type comparison does not match
                dtype_str = definition.type.name[0]
            elif definition.type == Any:
                dtype_str = 'D'
            else:
                # unsupported types do not fail, their string representation is used
                dtype_str = str(definition.type)

            result['dtypeStr'] = dtype_str
            if definition.unit is not None:
                result['units'] = str(definition.unit)
            super_names.append(def_name(definition.m_parent_as(Definition)))

        elif isinstance(definition, Category):
            result['kindStr'] = 'type_abstract_document_content'

        # walk up the containment hierarchy to find the package of the definition
        package = cast(MSection, definition)
        while not isinstance(package, Package):
            package = package.m_parent

        result['package'] = package.name

        return InfoKindEl(*args, **result, **kwargs)

    def _legacy_infos(self, package: Package) -> Iterable[InfoKindEl]:
        '''
        Generates the legacy metainfo objects for all definitions of the given package.
        Sections that extend a base section are skipped, but their quantities are still
        included.
        '''
        for definition in package.all_definitions.values():
            if not (isinstance(definition, Section) and definition.extends_base_section):
                yield self.legacy_info(definition)

            if isinstance(definition, Section):
                for quantity in definition.quantities:
                    yield self.legacy_info(quantity)

    def legacy_info_env(self, packages: List[Package] = None, *args, **kwargs) -> InfoKindEnv:
        '''
        Creates a legacy metainfo environment with all definitions from the given packages.

        Arguments:
            packages: The packages to take the definitions from. Default are all
                packages of this environment.
            *args, **kwargs: Are passed on to the :class:`InfoKindEnv` constructor.
        '''
        if packages is None:
            packages = self.packages

        env = InfoKindEnv(*args, **kwargs)
        for package in packages:
            for legacy_info in self._legacy_infos(package):
                env.addInfoKindEl(legacy_info)

        return env

    def to_legacy_dict(
            self, packages: List[Package] = None, description: str = None,
            *args, **kwargs) -> Dict[str, Any]:
        '''
        Creates a dictionary that can be serialized to a legacy metainfo definition file
        (*.nomadmetainfo.json).

        Arguments:
            packages: Will add all definitions of these packages as actual definitions,
                all other packages will be added by import.
            description: The description for the legacy file. If None the description of
                the first package will be used.
        '''
        if packages is None:
            packages = []

        definitions = []
        dependencies = []
        for package in self.packages:
            if package not in packages:
                dependencies.append(package)
                continue

            if description is None:
                description = package.description

            definitions.extend(
                legacy_info.toDict() for legacy_info in self._legacy_infos(package))

        return {
            'type': 'nomad_meta_info_1_0',
            'description': description,
            'dependencies': [
                {'relativePath': def_name(dependency)}
                for dependency in dependencies],
            'metaInfos': definitions
        }

265

266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
class EnvironmentConversion:
    '''
    Converts a legacy metainfo environment into metainfo packages. The legacy
    definitions are grouped by their legacy package and each group is converted by
    its own :class:`PackageConversion`.

    Arguments:
        legacy_env_or_path: Either the legacy environment or the path to a legacy
            metainfo file that is loaded with ``loadJsonFile``.
    '''

    def __init__(self, legacy_env_or_path: Union[InfoKindEnv, str]):
        if isinstance(legacy_env_or_path, str):
            self.legacy_env, _ = loadJsonFile(filePath=legacy_env_or_path)

        else:
            self.legacy_env = cast(InfoKindEnv, legacy_env_or_path)

        self.__fix_legacy_super_names()

        self.package_conversions: Dict[str, PackageConversion] = {}

        for legacy_def in self.legacy_env.infoKindEls():
            if legacy_def.package in _ignored_packages:
                continue

            package_conversion = self.package_conversions.get(legacy_def.package)
            if package_conversion is None:
                package_conversion = PackageConversion(self, legacy_def.package)
                self.package_conversions[legacy_def.package] = package_conversion

            package_conversion.legacy_defs.append(legacy_def)

        # Each step is run for all packages before the next step starts, because
        # definitions are resolved across packages (see resolve).
        for package_conversion in self.package_conversions.values():
            package_conversion.create_definitions()

        for package_conversion in self.package_conversions.values():
            package_conversion.set_super_names()

        for package_conversion in self.package_conversions.values():
            package_conversion.init_definitions()

    def create_env(self) -> LegacyMetainfoEnvironment:
        '''
        Creates an environment with all converted packages. Each package is validated;
        the first error and the first warning of a package are logged together with the
        number of remaining ones.
        '''
        env = LegacyMetainfoEnvironment()
        env.legacy_package_name = normalized_package_name(self.legacy_env.name)
        for package_conv in self.package_conversions.values():
            package = package_conv.package
            errors, warnings = package.m_all_validate()
            if len(errors) > 0:
                logger.error(
                    '%s. There are %d more errors in converted legacy package %s' %
                    (errors[0], len(errors) - 1, package))
            if len(warnings) > 0:
                logger.warning(
                    '%s. There are %d more warnings in converted legacy package %s' %
                    (warnings[0], len(warnings) - 1, package))
            env.m_add_sub_section(Environment.packages, package)
            package.init_metainfo()
        return env

    def __fix_legacy_super_names(self):
        '''
        Rewrites the superNames of all legacy definitions. Afterwards they contain the
        parent section (if there is one) as first element, followed by all categories
        that are reachable through the original super names.
        '''

        def get_super_names(legacy_def: InfoKindEl, super_categories: Set[str] = None):
            ''' Returns the parent section name (or None) and the set of all categories. '''
            super_section: str = None
            if super_categories is None:
                super_categories = set()

            for super_name in legacy_def.superNames:
                super_def = self.legacy_env.infoKindEl(super_name)

                if super_def.kindStr == 'type_section':
                    super_section = super_def.name

                elif super_def.kindStr == 'type_abstract_document_content':
                    super_categories.add(super_def.name)
                    # categories can have a parent section themselves
                    super_super_section, _ = get_super_names(
                        super_def, super_categories=super_categories)

                    if super_super_section is None:
                        pass

                    elif super_section is None:
                        super_section = super_super_section

                    elif super_section == super_super_section:
                        pass

                    else:
                        # report the two sections that are actually in conflict
                        logger.error('conflicting parent sections %s, %s for %s' % (
                            super_section, super_super_section, legacy_def.name))

            return super_section, super_categories

        for legacy_def in self.legacy_env.infoKindEls():
            super_section, super_categories = get_super_names(legacy_def)

            if super_section is None:
                legacy_def.superNames = list(super_categories)

            else:
                legacy_def.superNames = [super_section] + list(super_categories)

    def resolve(self, name: str) -> Iterable[Definition]:
        ''' Generates the definitions with the given name from all converted packages. '''
        for package_conversion in self.package_conversions.values():
            definition = package_conversion.package.all_definitions.get(name)
            if definition is not None:
                yield definition


class PackageConversion:
    '''
    Converts the legacy definitions of one legacy package into a metainfo package.

    The conversion is done in three steps that are called by the
    :class:`EnvironmentConversion`: :func:`create_definitions`,
    :func:`set_super_names`, and :func:`init_definitions`.

    Arguments:
        env_conversion: The environment conversion that this package conversion is
            part of. It is used to resolve definitions from other packages.
        name: The name of the legacy package.
    '''

    def __init__(self, env_conversion: EnvironmentConversion, name: str):
        self.env_conversion = env_conversion
        self.legacy_defs: List[InfoKindEl] = []

        python_module, python_path = python_package_mapping(name)

        self.package = Package(
            name=normalize_name(name),
            a_legacy=LegacyPackage(name, python_module, python_path))

        # quantities by name; set_super_names adds them to their parent sections
        self.quantities: Dict[str, Quantity] = {}

        self.logger = logger.bind(package=name)

    def create_definitions(self):
        '''
        Creates a category, section, or quantity for each legacy definition of this
        package. Only names and legacy annotations are set.
        '''
        for legacy_def in self.legacy_defs:
            name = normalize_name(legacy_def.name)

            if legacy_def.kindStr == 'type_abstract_document_content':
                self.package.m_create(
                    Category, name=name, a_legacy=LegacyDefinition(name=legacy_def.name))

            elif legacy_def.kindStr == 'type_section':
                self.package.m_create(
                    Section, name=name,
                    a_legacy=LegacyDefinition(name=legacy_def.name))

            elif legacy_def.kindStr in ['type_dimension', 'type_document_content']:
                self.quantities[name] = Quantity(
                    name=name,
                    a_legacy=LegacyDefinition(name=legacy_def.name))

            else:
                self.logger.error('unknown kindStr %s for %s' % (legacy_def.kindStr, name))

    def __resolve(self, name: str, create_extends: bool = False, required: bool = True):
        '''
        Resolves the definition with the given (normalized) name. Definitions of this
        package are preferred over definitions from other packages.

        Arguments:
            name: The normalized name of the definition.
            create_extends: If True and the definition is a section from another
                package, a section that extends this section is created in this
                package and returned instead.
            required: If True (default) an AssertionError is raised if there is no
                definition with the given name. Otherwise None is returned.
        '''
        definition: Definition = self.package.all_definitions.get(name)
        if definition is None:
            definition = self.quantities.get(name)

        if definition is not None:
            if not (isinstance(definition, Section) and definition.extends_base_section) or create_extends:
                return definition

        for definition in self.env_conversion.resolve(name):
            if isinstance(definition, Section) and definition.extends_base_section:
                continue

            if create_extends and isinstance(definition, Section):
                extending_def = self.package.m_create(
                    Section, name=definition.name,
                    a_legacy=LegacyDefinition(name=definition.a_legacy.name))
                extending_def.base_sections = [definition]
                extending_def.extends_base_section = True
                return extending_def

            return definition

        if required:
            # raised explicitly, because assert statements are removed with python -O
            raise AssertionError('definition %s must be created now' % name)

        return None

    def __resolve_parent_section(self, legacy_def: InfoKindEl) -> Section:
        '''
        Resolves all super names of the given legacy definition and returns the last one
        that is a section, or None. Sections from other packages are extended.
        '''
        parent_section: Section = None
        for super_name in legacy_def.superNames:
            super_def = self.__resolve(normalize_name(super_name), create_extends=True)
            if isinstance(super_def, Section):
                parent_section = cast(Section, super_def)

        return parent_section

    def set_super_names(self):
        '''
        Adds sections as sub sections and quantities as quantities to their parent
        sections. Quantities without a parent section are logged and skipped.
        '''
        for legacy_def in self.legacy_defs:
            name = normalize_name(legacy_def.name)
            definition = self.__resolve(name)

            if isinstance(definition, Section):
                parent_section = self.__resolve_parent_section(legacy_def)
                if parent_section is not None:
                    sub_section = parent_section.m_create(
                        SubSection, name=definition.name,
                        a_legacy=LegacyDefinition(name=legacy_def.name))
                    sub_section.sub_section = definition
                    # legacy sections without repeats information are treated as repeating
                    sub_section.repeats = legacy_def.repeats is None or legacy_def.repeats

            elif isinstance(definition, Quantity):
                parent_section = self.__resolve_parent_section(legacy_def)
                if parent_section is None:
                    self.logger.error('quantity %s has no parent section' % name)
                    continue

                parent_section.m_add_sub_section(Section.quantities, definition)

    def init_definitions(self):
        '''
        Sets descriptions and categories for all definitions, and types, shapes, and
        units for quantities.
        '''
        for legacy_def in self.legacy_defs:
            name = normalize_name(legacy_def.name)
            definition = self.__resolve(name)
            log = self.logger.bind(definition=definition.name)

            # common properties
            if legacy_def.description is not None and legacy_def.description.strip() != '':
                definition.description = legacy_def.description

            if isinstance(definition, Definition):
                # deal with categories
                categories: List[Category] = []
                for super_name in legacy_def.superNames:
                    super_def = self.__resolve(normalize_name(super_name))
                    if isinstance(super_def, Category):
                        categories.append(super_def)

                definition.categories = categories

            if isinstance(definition, Quantity):
                self.__init_quantity(definition, legacy_def, name, log)

    def __init_quantity(self, definition: Quantity, legacy_def: InfoKindEl, name: str, log):
        ''' Sets type, shape, and unit of the given quantity from its legacy definition. '''
        # type
        referenced_sections = legacy_def.extra_args.get('referencedSections', [])
        if len(referenced_sections) == 1:
            # definitions are registered with normalized names
            referenced_section = self.__resolve(
                normalize_name(referenced_sections[0]), required=False)
            if referenced_section is None:
                log.error('could not find referencedSection %s of %s' % (
                    referenced_sections[0], definition.name))
                definition.type = int
            else:
                definition.type = Reference(referenced_section)

        elif len(referenced_sections) > 1:
            log.error(
                'higher dimensional references not yet supported: %s' % name)
            definition.type = np.dtype(int)

        elif legacy_def.kindStr == 'type_dimension':
            definition.type = int
        elif legacy_def.dtypeStr == 'D':
            definition.type = Any
        elif legacy_def.dtypeStr == 'C':
            definition.type = str
        elif legacy_def.dtypeStr == 'r':
            log.error('r typed quantity %s  doesn\'t have referencedSections' % name)
            definition.type = int
        elif legacy_def.dtypeStr == 'b':
            definition.type = bool
        elif legacy_def.dtypeStr == 'i64':
            definition.type = np.dtype(np.int64)
        elif legacy_def.dtypeStr == 'f':
            definition.type = np.dtype(np.float64)
        else:
            definition.type = np.dtype(legacy_def.dtypeStr)

        # shapes
        legacy_shape = legacy_def.shape
        if legacy_shape is None:
            legacy_shape = []

        definition.shape = legacy_shape
        if len(definition.shape) > 1 and definition.type == str:
            # Usually only np types have higher shapes in old metainfo;
            # str is one exception.
            definition.type = np.dtype('U')

        # units
        if legacy_def.units is not None:
            try:
                definition.unit = ureg.parse_units(legacy_def.units)
            except UndefinedUnitError:
                log.error('unknown unit %s' % legacy_def.units)
            except ValueError as e:
                log.error('cannot parse unit %s' % legacy_def.units, exc_info=e)


def convert(metainfo_path: str) -> LegacyMetainfoEnvironment:
    '''
    Runs an :class:`EnvironmentConversion` for the given legacy metainfo path and
    returns the environment that it creates.
    '''
    conversion = EnvironmentConversion(metainfo_path)
    return conversion.create_env()


537
def generate_metainfo_code(metainfo_env: LegacyMetainfoEnvironment):
    '''
    Generates python code with metainfo definitions for all packages in the given
    environment.

    Each package is rendered with the 'package.j2' template and written to the python
    path of its legacy annotation. In addition, an '__init__.py' file is rendered with
    the 'environment.j2' template and written to the directory that
    :func:`python_package_mapping` computes for the legacy package name of the
    environment.

    Arguments:
        metainfo_env: The metainfo environment.
    '''
    from jinja2 import Environment as JinjaEnvironment, PackageLoader, select_autoescape
    import textwrap

    def format_description(description, indent=0, width=90):
        ''' Wraps and indents the paragraphs (lines) of the given description. '''
        paragraphs = [paragraph.strip() for paragraph in description.split('\n')]

        def format_paragraph(paragraph, first):
            lines = textwrap.wrap(text=paragraph, width=width - indent * 4)
            lines = [line.replace('\\', '\\\\') for line in lines]
            prefix = ' ' * 4 * indent
            # only the very first line of the description is not indented
            return '\n'.join(
                line if first and index == 0 else prefix + line
                for index, line in enumerate(lines))

        return '\n\n'.join([
            format_paragraph(p, i == 0)
            for i, p in enumerate(paragraphs) if p != ''])

    def format_foreign_definition(definition):
        ''' Formats a definition of another package as ``<module>.<name>``. '''
        python_module = definition.m_parent.a_legacy.python_module
        return '%s.%s' % (python_module.split('.')[-1], definition.name)

    def format_type(pkg, mi_type):
        ''' Formats the given quantity type as python code for the given package. '''
        # isinstance is required here: numpy dtype objects can be instances of
        # np.dtype subclasses, which an exact type comparison does not match
        if isinstance(mi_type, np.dtype):
            if mi_type == np.dtype('U'):
                return 'np.dtype(\'U\')'

            return 'np.dtype(np.%s)' % mi_type

        if mi_type in [int, float, str, bool]:
            return mi_type.__name__

        if isinstance(mi_type, Reference):
            if pkg == mi_type.target_section_def.m_parent:
                return "Reference(SectionProxy('%s'))" % mi_type.target_section_def.name

            return format_foreign_definition(mi_type.target_section_def)

        return str(mi_type)

    def format_unit(unit):
        return "'%s'" % unit

    def format_definition_refs(pkg, definitions):
        ''' Formats the given definitions as comma separated python references. '''
        def format_definition_ref(definition: Definition):
            if pkg == definition.m_parent:
                return definition.name

            return format_foreign_definition(definition)

        return ', '.join([format_definition_ref(definition) for definition in definitions])

    # NOTE: The misspelled name is also the name of the template global below;
    # it is kept, because the templates are not part of this module.
    def fromat_package_import(pkg):
        python_module = pkg.a_legacy.python_module
        modules = python_module.split('.')
        return 'from %s import %s' % ('.'.join(modules[:-1]), modules[-1])

    def order_categories(categories):
        return sorted(categories, key=lambda c: len(c.categories))

    def write_code(path, code):
        ''' Writes the given code without trailing whitespace to the given path. '''
        code = '\n'.join(line.rstrip() for line in code.split('\n'))
        # python source files are utf-8 by default, independent of the platform
        with open(path, 'wt', encoding='utf-8') as f:
            f.write(code)

    env = JinjaEnvironment(
        loader=PackageLoader('nomad.metainfo', 'templates'),
        autoescape=select_autoescape(['python']))
    env.globals.update(
        order_categories=order_categories,
        format_description=format_description,
        format_type=format_type,
        format_unit=format_unit,
        format_definition_refs=format_definition_refs,
        fromat_package_import=fromat_package_import)

    for package in metainfo_env.packages:
        path = package.a_legacy.python_path
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        write_code(path, env.get_template('package.j2').render(pkg=package))

    _, path = python_package_mapping(metainfo_env.legacy_package_name)
    write_code(
        os.path.join(os.path.dirname(path), '__init__.py'),
        env.get_template('environment.j2').render(env=metainfo_env))