diff --git a/examples/data/eln/example-schema.archive.yaml b/examples/data/eln/example-schema.archive.yaml
index 1cbff1fdc1948dd62fc79379f3a062f5167f6777..d577ecb12ce9a1563f99272a8201aabd030b5c51 100644
--- a/examples/data/eln/example-schema.archive.yaml
+++ b/examples/data/eln/example-schema.archive.yaml
@@ -70,12 +70,6 @@ definitions:
             tabular_parser:
               sep: '\t'
               comment: '#'
-              columns:
-                time: Process Time in seconds
-                substrate_temperature:
-                  name: Substrate PV
-                  unit: degC
-                chamber_pressure: Vacuum Pressure1
             browser:
               adaptor: RawFileAdaptor
             eln:
@@ -84,11 +78,16 @@ definitions:
             type: np.float64
             shape: ['*']
             unit: s
+            m_annotations:
+              tabular:
+                name: Process Time in seconds
           chamber_pressure:
             type: np.float64
             shape: ['*']
             unit: mbar
             m_annotations:
+              tabular:
+                name: Vacuum Pressure1
               plot:
                 x: time
                 y: chamber_pressure
@@ -97,6 +96,9 @@ definitions:
             shape: ['*']
             unit: kelvin
             m_annotations:
+              tabular:
+                name: Substrate PV
+                unit: degC
               plot:
                 x: time
                 y: substrate_temperature
diff --git a/examples/data/eln/example.excel.archive.xlsx b/examples/data/eln/example.excel.archive.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..5fea4473696b3f3bedd1060b190d8d835191e354
Binary files /dev/null and b/examples/data/eln/example.excel.archive.xlsx differ
diff --git a/examples/data/eln/excel.archive.yaml b/examples/data/eln/excel.archive.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..862bb8f8a34b69482bca6c20e8f2828891c8a6c8
--- /dev/null
+++ b/examples/data/eln/excel.archive.yaml
@@ -0,0 +1,21 @@
+definitions:
+  sections:
+    Root:
+      base_section: nomad.parsing.tabular.TableRow
+      quantities:
+        enum:
+          type:
+            type_kind: Enum
+            type_data:
+              - A
+              - B
+        float:
+          type: np.float64
+          unit: s
+      sub_sections:
+        sub_section:
+          sub_section: MySubSection
+    MySubSection:
+      quantities:
+        quantity:
+          type: str
\ No newline at end of file
diff --git a/nomad/datamodel/context.py b/nomad/datamodel/context.py
index 408a7b108eeeb65cd716654e2ecd854d6df9a0fb..0ed138bcde2c76cca554a02262faf588924b59fb 100644
--- a/nomad/datamodel/context.py
+++ b/nomad/datamodel/context.py
@@ -297,3 +297,9 @@ class ClientContext(Context):
     def raw_file(self, path, *args, **kwargs):
         file_path = os.path.join(self.local_dir, path)
         return open(file_path, *args, **kwargs)
+
+    def create_reference(self, section: MSection, quantity_def: Quantity, value: MSection) -> str:
+        try:
+            return super().create_reference(section, quantity_def, value)
+        except AssertionError:
+            return f'<unavailable url>/#{value.m_path()}'
diff --git a/nomad/datamodel/metainfo/eln/__init__.py b/nomad/datamodel/metainfo/eln/__init__.py
index bdaa46e51a507babfff057783647a344cbc09dca..ac70c55268b1dbee527f75a8cd8a09de8e79c26f 100644
--- a/nomad/datamodel/metainfo/eln/__init__.py
+++ b/nomad/datamodel/metainfo/eln/__init__.py
@@ -19,7 +19,6 @@
 from nomad.datamodel.data import EntryData
 from nomad.datamodel.results import ELN, Results
 from nomad.metainfo import MSection, Package, Quantity, Datetime
-from nomad.units import ureg


 m_package = Package(name='material_library')
@@ -73,7 +72,7 @@ class ElnBaseSection(MSection):
                 archive.results.eln.descriptions = ''
             archive.results.eln.descriptions = f'|{self.description}'

-        if getattr(self, 'tags'):
+        if getattr(self, 'tags', None):
             if archive.results.eln.tags is None:
                 archive.results.eln.tags = []
             tags = self.tags
@@ -86,7 +85,9 @@ class ElnBaseSection(MSection):
             archive.results.eln.sections = []
         archive.results.eln.sections.append(self.m_def.name)

-    def tabular_parser(self, quantity_def: Quantity, archive, logger, columns, **kwargs):
+    def tabular_parser(self, quantity_def: Quantity, archive, logger, **kwargs):
+        from nomad.parsing.tabular import parse_columns, read_table_data
+
         if not quantity_def.is_scalar:
             raise NotImplementedError('CSV parser is only implemented for single files.')

@@ -94,23 +95,10 @@ class ElnBaseSection(MSection):
         if not value:
             return

-        import pandas as pd
         with archive.m_context.raw_file(self.data_file) as f:
-            data = pd.read_csv(f, engine='python', **kwargs)
-
-        for quantity_name, column_value in columns.items():
-            quantity = self.m_def.all_quantities[quantity_name]
-            if isinstance(column_value, str):
-                column_name = column_value
-                column_unit = quantity.unit
-            elif isinstance(column_value, dict):
-                column_name = column_value['name']
-                column_unit = ureg(column_value['unit'])
-
-            np_values = data.loc[:, column_name].to_numpy()
-            if column_unit is not None:
-                np_values *= column_unit
-            self.m_set(quantity, np_values)
+            data = read_table_data(self.data_file, f, **kwargs)
+
+        parse_columns(data, self)


 class ElnActivityBaseSecton(ElnBaseSection):
diff --git a/nomad/metainfo/metainfo.py b/nomad/metainfo/metainfo.py
index e932687818cb506da5f05ef0dfd8c595c77827a1..6d2cb90af896eef24636793626881ee03ab7e42c 100644
--- a/nomad/metainfo/metainfo.py
+++ b/nomad/metainfo/metainfo.py
@@ -3057,6 +3057,10 @@ class Section(Definition):
         that add their properties to this section via :attr:`extends_base_section`.
         This quantity will be set automatically.

+    inheriting_sections:
+        A list of `section definitions` (:class:`Section`). These are the sections
+        that inherit from (i.e. are subclasses of) this section.
+
     Besides defining quantities and sub-sections, a section definition can also provide
     constraints that are used to validate a section and its quantities and sub-sections.

@@ -3102,6 +3106,9 @@ class Section(Definition):
         all_base_sections:
             A helper attribute that gives direct and indirect base sections.

+        all_inheriting_sections:
+            A helper attribute that gives direct and indirect inheriting sections.
+
         all_properties:
             A helper attribute that gives all properties (sub section and quantity) definitions
             including inherited properties and properties from extending sections as a
@@ -3158,11 +3165,13 @@
     base_sections: 'Quantity' = _placeholder_quantity
     extending_sections: 'Quantity' = _placeholder_quantity
     extends_base_section: 'Quantity' = _placeholder_quantity
+    inheriting_sections: 'Quantity' = _placeholder_quantity
     constraints: 'Quantity' = _placeholder_quantity
     event_handlers: 'Quantity' = _placeholder_quantity

     inherited_sections: 'Quantity' = _placeholder_quantity
     all_base_sections: 'Quantity' = _placeholder_quantity
+    all_inheriting_sections: 'Quantity' = _placeholder_quantity
     all_properties: 'Quantity' = _placeholder_quantity
     all_quantities: 'Quantity' = _placeholder_quantity
     all_sub_sections: 'Quantity' = _placeholder_quantity
@@ -3228,6 +3237,11 @@
                 base_section.extending_sections = base_section.extending_sections + [self]

+        # Init inheriting_sections
+        if not self.extends_base_section:
+            for base_section in self.base_sections:
+                base_section.inheriting_sections = base_section.inheriting_sections + [self]
+
         # Transfer properties of inherited and overwriten property definitions that
         # have not been overwritten
         inherited_properties: Dict[str, Property] = dict()
@@ -3508,6 +3522,8 @@
 Section.extending_sections = Quantity(
     type=SectionReference, shape=['0..*'], default=[], name='extending_sections')
 Section.extends_base_section = Quantity(type=bool, default=False, name='extends_base_section')
+Section.inheriting_sections = Quantity(
+    type=SectionReference, shape=['0..*'], default=[], name='inheriting_sections', virtual=True)
 Section.constraints = Quantity(type=str, shape=['0..*'], default=[], name='constraints')
 Section.event_handlers = Quantity(
     type=Callable, shape=['0..*'], name='event_handlers', virtual=True, default=[])
@@ -3538,6 +3554,20 @@ def all_base_sections(self) -> List[Section]:
     return result


+@derived(cached=True)
+def all_inheriting_sections(self) -> List[Section]:
+    result: Set[Section] = set()
+    for inheriting_section in self.inheriting_sections:
+        if isinstance(inheriting_section, SectionProxy):
+            continue
+        for inheriting_inheriting_section in inheriting_section.all_inheriting_sections:
+            if isinstance(inheriting_inheriting_section, SectionProxy):
+                continue
+            result.add(inheriting_inheriting_section)
+        result.add(inheriting_section)
+    return list(result)
+
+
 @derived(cached=True)
 def all_properties(self) -> Dict[str, Union[SubSection, Quantity]]:
     result: Dict[str, Union[SubSection, Quantity]] = dict()
@@ -3626,6 +3656,7 @@ def section_path(self) -> str:
 Section.inherited_sections = inherited_sections
 Section.all_base_sections = all_base_sections
+Section.all_inheriting_sections = all_inheriting_sections
 Section.all_properties = all_properties
 Section.all_quantities = all_quantities
 Section.all_sub_sections = all_sub_sections
diff --git a/nomad/parsing/parsers.py b/nomad/parsing/parsers.py
index 42d7a23f92528c2db4e68bdd9ad58fdc3811ce3a..262636a2f95093bd5a6221b7e74c7717eabd7a23 100644
--- a/nomad/parsing/parsers.py
+++ b/nomad/parsing/parsers.py
@@ -18,6 +18,7 @@
 import os.path
 from typing import Tuple, List, Dict
+from collections.abc import Iterable

 from nomad import config
 from nomad.datamodel import EntryArchive, EntryMetadata, results
@@ -25,6 +26,7 @@
 from nomad.datamodel.context import Context, ClientContext
 from .parser import MissingParser, BrokenParser, Parser, ArchiveParser, MatchingParserInterface
 from .artificial import EmptyParser, GenerateRandomParser, TemplateParser, ChaosParser
+from .tabular import TabularDataParser


 try:
@@ -107,7 +109,7 @@ def match_parser(mainfile_path: str, strict=True, parser_name: str = None) -> Tu
         match_result = parser.is_mainfile(mainfile_path, mime_type, buffer, decoded_buffer, compression)
         if match_result:
-            if type(match_result) == set:
+            if isinstance(match_result, Iterable):
                 assert parser.creates_children, 'Illegal return value - parser does not specify `creates_children`'
                 for mainfile_key in match_result:  # type: ignore
                     assert mainfile_key and type(mainfile_key) == str, (
@@ -584,6 +586,7 @@ parsers = [
         mainfile_name_re=(r'.*\.nxs'),
         supported_compressions=['gz', 'bz2', 'xz']
     ),
+    TabularDataParser(),
     ArchiveParser()
 ]
diff --git a/nomad/parsing/tabular.py b/nomad/parsing/tabular.py
index b8f479bf25d8864d5297b4b03dad710d8b11e2ad..c19159494791355e1ce63a47be560a415447e36a 100644
--- a/nomad/parsing/tabular.py
+++ b/nomad/parsing/tabular.py
@@ -16,11 +16,18 @@
 # limitations under the License.
 #

-from typing import Union, List, Iterable, Dict
-import csv
+from typing import Union, List, Iterable, Dict, Callable, Set, Any, Tuple, cast
+from memoization import cached
+import os.path
+import re
+
+from nomad import utils
+from nomad.units import ureg
 from nomad.datamodel.datamodel import EntryArchive, EntryData
-from nomad.metainfo import Section, Quantity, Package, Reference, SectionProxy
+from nomad.datamodel.context import Context
+from nomad.metainfo import Section, Quantity, Package, Reference, SectionProxy, MSection, Property
+from nomad.metainfo.metainfo import MetainfoError, SubSection

 from .parser import MatchingParser

@@ -48,21 +55,126 @@ class Table(EntryData):
 m_package.__init_metainfo__()


+@cached(max_size=10)
+def _create_column_to_quantity_mapping(section_def: Section):
+    mapping: Dict[str, Callable[[MSection, Any], MSection]] = {}
+
+    def add_section_def(section_def: Section, path: List[Tuple[SubSection, Section]]):
+        properties: Set[Property] = set()
+
+        for quantity in section_def.all_quantities.values():
+            if quantity in properties:
+                continue
+            properties.add(quantity)
+
+            tabular_annotation = quantity.m_annotations.get('tabular', None)
+            if tabular_annotation and 'name' in tabular_annotation:
+                col_name = tabular_annotation['name']
+            else:
+                col_name = quantity.name
+                if len(path) > 0:
+                    col_name = f'{".".join([item[0].name for item in path])}.{col_name}'
+
+            if col_name in mapping:
+                raise MetainfoError(
+                    f'The schema has non-unique column names. {col_name} exists twice. '
+                    f'Column names must be unique to be used for tabular parsing.')
+
+            def set_value(section: MSection, value, path=path, quantity=quantity, tabular_annotation=tabular_annotation):
+                for sub_section, section_def in path:
+                    next_section = section.m_get_sub_section(sub_section, -1)
+                    if not next_section:
+                        next_section = section_def.section_cls()
+                        section.m_add_sub_section(sub_section, next_section, -1)
+                    section = next_section
+
+                if tabular_annotation and 'unit' in tabular_annotation:
+                    value *= ureg(tabular_annotation['unit'])
+                section.m_set(quantity, value)
+
+            mapping[col_name] = set_value
+
+        for sub_section in section_def.all_sub_sections.values():
+            if sub_section in properties or sub_section.repeats:
+                continue
+            next_base_section = sub_section.sub_section
+            properties.add(sub_section)
+            for sub_section_section in next_base_section.all_inheriting_sections + [next_base_section]:
+                add_section_def(sub_section_section, path + [(sub_section, sub_section_section,)])
+
+    add_section_def(section_def, [])
+    return mapping
+
+
+def parse_columns(pd_dataframe, section: MSection):
+    '''
+    Parses the given pandas dataframe and adds columns (all values as array) to
+    the given section.
+    '''
+    import pandas as pd
+    data: pd.DataFrame = pd_dataframe
+
+    mapping = _create_column_to_quantity_mapping(section.m_def)  # type: ignore
+    for column in data:
+        if column in mapping:
+            mapping[column](section, data.loc[:, column])
+
+
+def parse_table(pd_dataframe, section_def: Section):
+    '''
+    Parses the given pandas dataframe and creates a section based on the given
+    section_def for each row. The sections are filled with the cells from
+    their respective row.
+    '''
+    import pandas as pd
+    data: pd.DataFrame = pd_dataframe
+    sections: List[MSection] = []
+
+    mapping = _create_column_to_quantity_mapping(section_def)  # type: ignore
+    for _, row in data.iterrows():
+        section = section_def.section_cls()
+        for column in data:
+            if column in mapping:
+                mapping[column](section, row[column])
+        sections.append(section)
+
+    return sections
+
+
+def read_table_data(path, file_or_path=None, **kwargs):
+    import pandas as pd
+    if file_or_path is None:
+        file_or_path = path
+    if path.endswith('.xls') or path.endswith('.xlsx'):
+        return pd.read_excel(file_or_path, engine='openpyxl', **kwargs)
+    else:
+        return pd.read_csv(file_or_path, engine='python', **kwargs)
+
+
 class TabularDataParser(MatchingParser):
-    # TODO this is super simple and needs extension. Currently parses files like:
-    #    header_0,header_1
-    #    0_0,0_1
-    #    1_0,1_1
-    # TODO also extend tests/parsing/test_tabular
+    creates_children = True
+
     def __init__(self) -> None:
         super().__init__(
             name='parser/tabular',
             code_name='tabular data',
-            mainfile_name_re=r'.*\.csv$')
+            mainfile_mime_re=r'text/.*|application/.*',
+            mainfile_name_re=r'.*\.archive\.(csv|xlsx?)$')

-    def _read_cvs(self, filename: str) -> List[List[str]]:
-        with open(filename, newline='') as csvfile:
-            reader = csv.reader(csvfile, delimiter=',', quotechar='#')
-            return [row for row in reader]
+    def _get_schema(self, filename: str, mainfile: str):
+        dir = os.path.dirname(filename)
+        match = re.match(r'^(.+\.)?([\w\-]+)\.archive\.(csv|xlsx?)$', os.path.basename(filename))
+        if not match:
+            return None
+
+        schema_name = match.group(2)
+        for extension in ['yaml', 'yml', 'json']:
+            schema_file_base = f'{schema_name}.archive.{extension}'
+            schema_file = os.path.join(dir, schema_file_base)
+            if not os.path.exists(schema_file):
+                continue
+            return os.path.join(os.path.dirname(mainfile), schema_file_base)
+
+        return None

     def is_mainfile(
             self, filename: str, mime: str, buffer: bytes, decoded_buffer: str,
@@ -75,44 +187,53 @@ class TabularDataParser(MatchingParser):
             return False

         try:
-            rows = self._read_cvs(filename)
+            data = read_table_data(filename)
         except Exception:
             # If this cannot be parsed as a .csv file, we don't match with this file
             return False

-        return [str(item) for item in range(1, len(rows))]
+        return [str(item) for item in range(0, data.shape[0])]

     def parse(
             self, mainfile: str, archive: EntryArchive, logger=None,
             child_archives: Dict[str, EntryArchive] = None
     ):
-        rows = self._read_cvs(mainfile)
-        header = rows[0]
-        rows = rows[1:]
-
-        # Create a schema from the header and store it in the main archive.
-        # We are creating a specialized TableRow section that has a quantity
-        # for each column.
-        # TODO do something smart with the headers to create more complex schemas.
-        # e.g. parse headers like 'sec1/sec2/quantity_name?unit=m&type=float'
-        table_row_def = Section(name='TableRow', base_sections=[TableRow.m_def])
-        table_row_def.quantities = [
-            Quantity(name=name, type=str) for name in header]
-        archive.definitions = Package(section_definitions=[table_row_def])
-        MyTableRow = table_row_def.section_cls
-
-        # Create a Table as the main archive contents. This can hold references to the
-        # rows.
+        if logger is None:
+            logger = utils.get_logger(__name__)
+
+        data = read_table_data(mainfile)
+
+        # We use mainfile to check the file's existence in the overall fs,
+        # and archive.metadata.mainfile to get an upload/raw relative schema_file
+        schema_file = self._get_schema(mainfile, archive.metadata.mainfile)
+        if schema_file is None:
+            logger.error('Tabular data file without schema.', details=(
+                'For a tabular file like name.schema.archive.csv, there has to be an '
+                'uploaded schema like schema.archive.yaml'))
+            return
+
+        try:
+            schema_archive = cast(Context, archive.m_context).load_raw_file(
+                schema_file, archive.metadata.upload_id, None)
+            package = schema_archive.definitions
+            section_def = package.section_definitions[0]
+        except Exception as e:
+            logger.error('Could not load schema', exc_info=e)
+            return
+
+        if TableRow.m_def not in section_def.base_sections:
+            logger.error('Schema for tabular data must inherit from TableRow.')
+            return
+
+        child_sections = parse_table(data, section_def)
+        assert len(child_archives) == len(child_sections)
+
         table = Table()
         archive.data = table
-        table_rows = []
-
-        # Create the data for each row by instantiating the generated schema and store it
-        # in the child archives. Use references to connect with the Table section in the
-        # main archive.
-        for index, row in enumerate(rows):
-            key = str(index + 1)
-            archive = child_archives[key]
-            archive.data = MyTableRow(table_ref=table, **dict(zip(header, row)))  # type: ignore
-            table_rows.append(archive.data)
-        table.row_refs = table_rows
+
+        child_section_refs: List[MSection] = []
+        for child_archive, child_section in zip(child_archives.values(), child_sections):
+            child_archive.data = child_section
+            child_section_refs.append(child_section)
+            child_section.table_ref = table
+        table.row_refs = child_section_refs
diff --git a/requirements.txt b/requirements.txt
index 84fdfd4a240040aee91e520b81a8c3ec8414b386..f686b70e618156db03e6d3546e7998eb11b345a4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -46,6 +46,7 @@ pymatgen==2022.0.17
 asr==0.4.1
 bitarray==2.3.5
 xrdtools==0.1.1
+openpyxl==3.0.9

 # [infrastructure]
 optimade[mongo]==0.14.0
diff --git a/tests/metainfo/test_metainfo.py b/tests/metainfo/test_metainfo.py
index c091cb6002f7e0f7ddac255a2dc98d5dd5daa9b0..ae35b059ba649c9913573f7465b01aa2307b0f3f 100644
--- a/tests/metainfo/test_metainfo.py
+++ b/tests/metainfo/test_metainfo.py
@@ -176,6 +176,7 @@ class TestM2:
         assert Definition.m_def in iter(Section.m_def.base_sections)
         assert 'name' in Section.m_def.all_quantities
         assert 'name' in Quantity.m_def.all_quantities
+        assert Quantity.m_def in Definition.m_def.all_inheriting_sections

     def test_unit(self):
         assert System.lattice_vectors.unit is not None
diff --git a/tests/parsing/test_tabular.py b/tests/parsing/test_tabular.py
index 4a5d931eaf2587bad603fe99253e21432f83a133..55a3466d4abb0fc9408d9034d92d4d7bf87565ba 100644
--- a/tests/parsing/test_tabular.py
+++ b/tests/parsing/test_tabular.py
@@ -17,41 +17,110 @@
 #

 import pytest
-import json
+import os.path
+import pandas as pd

-from nomad.files import StagingUploadFiles
+from nomad import config
 from nomad.datamodel.datamodel import EntryArchive, EntryMetadata
-from nomad.datamodel.context import ServerContext
+from nomad.datamodel.context import ClientContext
 from nomad.utils import generate_entry_id, strip
 from nomad.parsing.tabular import TabularDataParser
-from nomad.processing import Upload


-@pytest.mark.parametrize('content', [
-    pytest.param(strip('''
-        header_0,header_1
-        0_0,0_1
-        1_0,1_1
-    '''), id='simple')
+@pytest.mark.parametrize('schema,content', [
+    pytest.param(
+        strip('''
+            definitions:
+              sections:
+                MyTable:
+                  base_section: nomad.parsing.tabular.TableRow
+                  quantities:
+                    header_0:
+                      type: str
+                    header_1:
+                      type: str
+        '''),
+        strip('''
+            header_0,header_1
+            0_0,0_1
+            1_0,1_1
+        '''), id='simple'),
+    pytest.param(
+        strip('''
+            definitions:
+              sections:
+                MyTable:
+                  base_section: nomad.parsing.tabular.TableRow
+                  quantities:
+                    header_0:
+                      type: str
+                    quantity:
+                      type: str
+                      m_annotations:
+                        tabular:
+                          name: header_1
+                  sub_sections:
+                    my_sub_section:
+                      sub_section: MySubSection
+                MySubSection:
+                  quantities:
+                    quantity:
+                      type: str
+        '''),
+        strip('''
+            header_0,header_1,my_sub_section.quantity
+            0_0,0_1,0_2
+            1_0,1_1,1_2
+        '''), id='nested'),
+    pytest.param(
+        strip('''
+            definitions:
+              sections:
+                MyTable:
+                  base_section: nomad.parsing.tabular.TableRow
+                  quantities:
+                    header_0:
+                      type: np.float64
+                    header_1:
+                      type: np.float64
+                      unit: s
+                      m_annotations:
+                        tabular:
+                          unit: ms
+        '''),
+        strip('''
+            header_0,header_1
+            0.0,0.1
+            1.0,1.1
+        '''), id='units')
 ])
-def test_tabular(raw_files, content):
-    upload_files = StagingUploadFiles(upload_id='test_upload', create=True)
-    upload = Upload(upload_id='test_upload')
-    mainfile = 'test.csv'
-    with upload_files.raw_file(mainfile, 'wt') as f:
+def test_tabular(raw_files, monkeypatch, schema, content):
+    mainfile = os.path.join(config.fs.tmp, 'test.my_schema.archive.csv')
+    schema_file = os.path.join(config.fs.tmp, 'my_schema.archive.yaml')
+    with open(mainfile, 'wt') as f:
         f.write(content)
+    with open(schema_file, 'wt') as f:
+        f.write(schema)
+
+    data = pd.read_csv(mainfile)

     parser = TabularDataParser()
-    keys = parser.is_mainfile(
-        upload_files.raw_file_object(mainfile).os_path,
-        'text/application', bytes(), '')
+    keys = parser.is_mainfile(mainfile, 'text/application', bytes(), '')
     assert isinstance(keys, list)
-    assert len(keys) == 2
+    assert len(keys) == data.shape[0]
+
+    class MyContext(ClientContext):
+        def load_raw_file(self, path, upload_id, installation_url):
+            archive = super().load_raw_file(path, upload_id, installation_url)
+            archive.metadata = EntryMetadata(
+                upload_id='test_upload',
+                entry_id=generate_entry_id('test_upload', schema_file))
+            return archive

-    context = ServerContext(upload=upload)
+    context = MyContext(local_dir='')
     main_archive = EntryArchive(m_context=context, metadata=EntryMetadata(
-        upload_id='test_upload',
+        upload_id=None,
         mainfile=mainfile,
         entry_id=generate_entry_id('test_upload', mainfile)))
     child_archives = {
@@ -62,10 +131,13 @@
             entry_id=generate_entry_id('test_upload', mainfile, key)))
         for key in keys}

-    parser.parse(
-        upload_files.raw_file_object(mainfile).os_path,
-        main_archive, None, child_archives)
+    parser.parse(mainfile, main_archive, None, child_archives)
+    main_archive.metadata.upload_id = 'test_upload'
+
+    assert main_archive.data is not None
+    for child_archive in child_archives.values():
+        assert child_archive.data is not None

-    print('# main: ', json.dumps(main_archive.m_to_dict(), indent=2))
-    for key in keys:
-        print(f'# {key}: ', json.dumps(child_archives[key].m_to_dict(), indent=2))
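
Usage sketch (illustrative only, not part of the patch above): `parse_columns`
resolves each dataframe column either through the new per-quantity `tabular`
annotation or, absent one, through the quantity name (prefixed with the
sub-section path for nested quantities), and then writes the column values into
the given section. The `Measurement` section below is hypothetical and exists
only for this example; it assumes a NOMAD environment with this patch applied.

    import numpy as np
    import pandas as pd

    from nomad.metainfo import MSection, Quantity
    from nomad.parsing.tabular import parse_columns

    class Measurement(MSection):
        # Matched via the annotation name; an annotated `unit` would be
        # applied to the raw values before the quantity is set.
        chamber_pressure = Quantity(
            type=np.float64, shape=['*'], unit='mbar',
            a_tabular={'name': 'Vacuum Pressure1'})
        # Without an annotation, the column is matched by quantity name.
        time = Quantity(type=np.float64, shape=['*'], unit='s')

    data = pd.DataFrame({
        'time': [0.0, 1.0, 2.0],
        'Vacuum Pressure1': [1.2e-6, 1.1e-6, 1.0e-6]})

    section = Measurement()
    parse_columns(data, section)  # fills time and chamber_pressure

The standalone `TabularDataParser` applies the same mapping row-wise via
`parse_table`: a file named like `data.my_schema.archive.csv` is matched with an
uploaded `my_schema.archive.yaml` whose first section definition must inherit
from `TableRow`, and every row becomes a child archive referenced from the
`Table` section in the main archive.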