Commit 2a2c0ca5 authored by David Sikter's avatar David Sikter
Browse files

Merge branch '899-parsing-xlsx-files-from-yaml-schema' into 'develop'

Resolve "Parsing .xlsx files from yaml schema"

Closes #899

See merge request !720
parents 639abab0 0a5e917b
Pipeline #137813 passed with stages
in 80 minutes and 32 seconds
......@@ -30,7 +30,6 @@ from nomad.metainfo import Section, Quantity, Package, Reference, SectionProxy,
from nomad.metainfo.metainfo import MetainfoError, SubSection
from nomad.parsing.parser import MatchingParser
# We define a simple base schema for tabular data. The parser will then generate more
# specialized sections based on the table headers. These specialized definitions will use
# this as base section definition.
......@@ -104,6 +103,7 @@ def _create_column_to_quantity_mapping(section_def: Section):
f'Column names must be unique, to be used for tabular parsing.')
def set_value(section: MSection, value, path=path, quantity=quantity, tabular_annotation=tabular_annotation):
import numpy as np
for sub_section, section_def in path:
next_section = section.m_get_sub_section(sub_section, -1)
if not next_section:
......@@ -114,6 +114,9 @@ def _create_column_to_quantity_mapping(section_def: Section):
if tabular_annotation and 'unit' in tabular_annotation:
value *= ureg(tabular_annotation['unit'])
if isinstance(value, (int, float, str)):
value = np.array(value)
if len(value.shape) == 1 and len(quantity.shape) == 0:
if len(value) == 1:
value = value[0]
def parse_columns(pd_dataframe, section: MSection):
    '''
    Parses the contents of the given dataframe into the quantities of the
    given section, column by column.

    The dataframe is expected in the single-row layout produced by
    ``read_table_data``: one cell per sheet, each cell holding the sheet's
    data as a dict. A mapped column name of the form ``<sheet>/<column>``
    addresses a column in a specific sheet; a plain name addresses the
    first sheet (cell ``iloc[0, 0]``).

    Raises:
        ValueError: if a ``<sheet>/...`` name references a sheet that is not
            present in the file.
    '''
    data: pd.DataFrame = pd_dataframe
    mapping = _create_column_to_quantity_mapping(section.m_def)  # type: ignore

    for column in mapping:
        if '/' in column:
            # Extract the sheet & col names if there is a '/' in the 'name'.
            # maxsplit=1 keeps any further '/' as part of the column name.
            sheet_name, col_name = column.split('/', 1)
            if sheet_name not in list(data):
                # f-string so the offending sheet name actually appears in the
                # message (the previous literal was never interpolated).
                raise ValueError(
                    f'The sheet name {sheet_name} does not exist in the excel file')
            sheet_df = pd.DataFrame.from_dict(data.loc[0, sheet_name])
            mapping[column](section, sheet_df.loc[:, col_name])
        else:
            # Otherwise, assume the column lives in the first sheet of the excel/csv
            sheet_df = pd.DataFrame.from_dict(data.iloc[0, 0])
            if column in sheet_df:
                mapping[column](section, sheet_df.loc[:, column])
def parse_table(pd_dataframe, section_def: Section, logger):
def read_table_data(path, file_or_path=None, **kwargs):
    '''
    Reads tabular data from an excel (.xls/.xlsx) or csv file.

    Returns a single-row DataFrame with one cell per sheet: each cell holds a
    one-element list containing that sheet's data converted via ``to_dict()``.
    For csv input there is a single cell under the column label ``0``.
    '''
    if file_or_path is None:
        file_or_path = path

    result = pd.DataFrame()
    if path.endswith(('.xls', '.xlsx')):
        # pd.ExcelFile wants a file name; file-like objects expose it as .name
        source = file_or_path if isinstance(file_or_path, str) else file_or_path.name
        workbook: pd.ExcelFile = pd.ExcelFile(source)
        for sheet in workbook.sheet_names:
            sheet_data = pd.read_excel(workbook, sheet_name=sheet, **kwargs)
            result.loc[0, sheet] = [sheet_data.to_dict()]
    else:
        csv_data = pd.read_csv(file_or_path, engine='python', **kwargs)
        result.loc[0, 0] = [csv_data.to_dict()]
    return result
class TabularDataParser(MatchingParser):
......@@ -226,6 +250,7 @@ class TabularDataParser(MatchingParser):
) -> Union[bool, Iterable[str]]:
# We use the main file regex capabilities of the superclass to check if this is a
# .csv file
import pandas as pd
is_tabular = super().is_mainfile(filename, mime, buffer, decoded_buffer, compression)
if not is_tabular:
return False
......@@ -235,18 +260,17 @@ class TabularDataParser(MatchingParser):
except Exception:
# If this cannot be parsed as a .csv file, we don't match with this file
return False
data = pd.DataFrame.from_dict(data.iloc[0, 0])
return [str(item) for item in range(0, data.shape[0])]
def parse(
self, mainfile: str, archive: EntryArchive, logger=None,
child_archives: Dict[str, EntryArchive] = None
):
import pandas as pd
if logger is None:
logger = utils.get_logger(__name__)
data = read_table_data(mainfile)
# We use mainfile to check the files existence in the overall fs,
# and archive.metadata.mainfile to get an upload/raw relative schema_file
schema_file = self._get_schema(mainfile, archive.metadata.mainfile)
......@@ -269,6 +293,12 @@ class TabularDataParser(MatchingParser):
logger.error('Schema for tabular data must inherit from TableRow.')
return
tabular_parser_annotation = section_def.m_annotations.get('tabular-parser', None)
if tabular_parser_annotation:
data = read_table_data(mainfile, **tabular_parser_annotation)
else:
data = read_table_data(mainfile)
data = pd.DataFrame.from_dict(data.iloc[0, 0])
child_sections = parse_table(data, section_def, logger=logger)
assert len(child_archives) == len(child_sections)
......
definitions:
name: 'A test schema for excel file parsing'
sections:
MovpeSto_schema:
base_section: nomad.datamodel.data.EntryData
sub_sections:
process:
section:
base_section: nomad.parsing.tabular.TableData
quantities:
data_file:
type: str
description: |
A reference to an uploaded .xlsx
m_annotations:
tabular_parser:
comment: '#'
browser:
adaptor: RawFileAdaptor
eln:
component: FileEditQuantity
experiment_identifier:
type: str
m_annotations:
tabular:
name: Experiment Identifier
eln:
component: StringEditQuantity
sub_sections:
process_steps:
section:
quantities:
step_duration:
type: Datetime
description: Past time since process start
m_annotations:
tabular:
name: Overview/Start Time
eln:
component: DateTimeEditQuantity
deposition_control_steps:
section:
quantities:
pyrotemperature:
type: np.float64
shape: ['*']
unit: K
description: My test description here
m_annotations:
tabular:
name: Deposition Control/Pyrotemperature
data:
m_def: MovpeSto_schema
process:
data_file: Test.xlsx
......@@ -87,7 +87,7 @@ for parser, mainfile in parser_examples:
fixed_parser_examples.append((parser, mainfile))
parser_examples = fixed_parser_examples
correct_num_output_files = 122
correct_num_output_files = 123
def create_reference(data, pretty):
......
......@@ -17,6 +17,7 @@
#
import pytest
import os
import os.path
import pandas as pd
......@@ -25,6 +26,8 @@ from nomad.datamodel.datamodel import EntryArchive, EntryMetadata
from nomad.datamodel.context import ClientContext
from nomad.utils import generate_entry_id, strip
from nomad.parsing.tabular import TabularDataParser
from nomad.parsing.parser import ArchiveParser
from tests.normalizing.conftest import run_normalize
@pytest.mark.parametrize('schema,content', [
......@@ -141,3 +144,143 @@ def test_tabular(raw_files, monkeypatch, schema, content):
# print('# main: ', json.dumps(main_archive.m_to_dict(), indent=2))
# for key in keys:
# print(f'# {key}: ', json.dumps(child_archives[key].m_to_dict(), indent=2))
@pytest.mark.parametrize('schema', [
pytest.param(
strip('''
definitions:
name: 'A test schema for excel file parsing'
sections:
MovpeSto_schema:
base_section: nomad.datamodel.data.EntryData
sub_sections:
process:
section:
base_section: nomad.parsing.tabular.TableData
quantities:
data_file:
type: str
description: |
A reference to an uploaded .xlsx
m_annotations:
tabular_parser:
comment: '#'
browser:
adaptor: RawFileAdaptor
eln:
component: FileEditQuantity
experiment_identifier:
type: str
m_annotations:
tabular:
name: Experiment Identifier
eln:
component: StringEditQuantity
data:
m_def: MovpeSto_schema
process:
data_file: Test.xlsx
'''), id='w/o_sheetName_rowMode'),
pytest.param(
strip('''
definitions:
name: 'A test schema for excel file parsing'
sections:
MovpeSto_schema:
base_section: nomad.datamodel.data.EntryData
sub_sections:
process:
section:
base_section: nomad.parsing.tabular.TableData
quantities:
data_file:
type: str
description: |
A reference to an uploaded .xlsx
m_annotations:
tabular_parser:
comment: '#'
browser:
adaptor: RawFileAdaptor
eln:
component: FileEditQuantity
experiment_identifier:
type: str
m_annotations:
tabular:
name: Overview/Experiment Identifier
eln:
component: StringEditQuantity
data:
m_def: MovpeSto_schema
process:
data_file: Test.xlsx
'''), id='w_sheetName_rowMode'),
pytest.param(
strip('''
definitions:
name: 'A test schema for excel file parsing'
sections:
MovpeSto_schema:
base_section: nomad.datamodel.data.EntryData
sub_sections:
process:
section:
base_section: nomad.parsing.tabular.TableData
quantities:
data_file:
type: str
description: |
A reference to an uploaded .xlsx
m_annotations:
tabular_parser:
comment: '#'
browser:
adaptor: RawFileAdaptor
eln:
component: FileEditQuantity
experiment_identifier:
type: str
m_annotations:
tabular:
name: Overview/Experiment Identifier
eln:
component: StringEditQuantity
pyrotemperature:
type: np.float64
shape: ['*']
unit: K
description: My test description here
m_annotations:
tabular:
name: Deposition Control/Pyrotemperature
data:
m_def: MovpeSto_schema
process:
data_file: Test.xlsx
'''), id='w_sheetName_colMode')
])
def test_xlsx_tabular(raw_files, monkeypatch, schema):
    '''
    End-to-end test for xlsx parsing from a yaml schema: writes the
    parametrized schema to a temporary archive file, parses it with the
    ArchiveParser (which triggers the tabular parser for the referenced
    .xlsx data file), normalizes the result, and checks the parsed values.
    '''
    # Write the schema under the test tmp dir so the parser can pick it up.
    schema_file = os.path.join(config.fs.tmp, 'excel_parser.archive.yaml')
    with open(schema_file, 'wt') as f:
        f.write(schema)

    # Static excel fixture shared by all three schema variants.
    excel_file = os.path.join(os.path.dirname(__file__), '../../tests/data/parsers/tabular/Test.xlsx')

    class MyContext(ClientContext):
        # Redirect any raw-file access to the fixture file, regardless of the
        # requested path, so the schema's 'Test.xlsx' reference resolves.
        def raw_file(self, path, *args, **kwargs):
            return open(excel_file, *args, **kwargs)

    context = MyContext(local_dir='')
    main_archive = EntryArchive(m_context=context, metadata=EntryMetadata(
        upload_id=None,
        mainfile=schema_file,
        entry_id=generate_entry_id('test_upload', schema_file)))
    ArchiveParser().parse(schema_file, main_archive)
    run_normalize(main_archive)

    assert main_archive.data is not None
    assert 'experiment_identifier' in main_archive.data.process
    assert main_archive.data.process.experiment_identifier == '22-01-21-MA-255'
    # Only the column-mode schema variant defines pyrotemperature, so this
    # assertion is conditional on the parametrized schema.
    if 'pyrotemperature' in main_archive.data.process:
        assert len(main_archive.data.process['pyrotemperature']) == 6
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment