diff --git a/examples/data/docs/tabular-parser_3_row_current-entry_to-path.archive.yaml b/examples/data/docs/tabular-parser_3_row_current-entry_to-path.archive.yaml
index 6fe345133787b42a5a262dd9fedfcdaca38c9531..db3cc7eebd1b85cd4c85313a6963a3324a2baa72 100644
--- a/examples/data/docs/tabular-parser_3_row_current-entry_to-path.archive.yaml
+++ b/examples/data/docs/tabular-parser_3_row_current-entry_to-path.archive.yaml
@@ -32,7 +32,7 @@ definitions:
           m_annotations:
             eln:
           more:
-            label_quantity: my_quantity_1
+            label_quantity: '#/data/my_quantity_1'
           quantities:
             my_quantity_1:
               type: str
diff --git a/examples/data/docs/tabular-parser_5_row_single-new-entry_to-path.archive.yaml b/examples/data/docs/tabular-parser_5_row_single-new-entry_to-path.archive.yaml
index bd5f910dd3d5be679c8e22ba8a83eb2095eff8ad..b005f6854fd4d0e12e545906111b778f986965ba 100644
--- a/examples/data/docs/tabular-parser_5_row_single-new-entry_to-path.archive.yaml
+++ b/examples/data/docs/tabular-parser_5_row_single-new-entry_to-path.archive.yaml
@@ -41,7 +41,7 @@ definitions:
           m_annotations:
             eln:
           more:
-            label_quantity: my_quantity_1
+            label_quantity: '#/data/my_quantity_1'
           sub_sections:
             my_repeated_sub_section:
               repeats: true
diff --git a/examples/data/docs/tabular-parser_6_row_multiple-new-entries_to-root.archive.yaml b/examples/data/docs/tabular-parser_6_row_multiple-new-entries_to-root.archive.yaml
index d90e5a3910383e4579047a59e3443a36bbec429e..5d71172d680084b6815c685bc79742a7d4c67992 100644
--- a/examples/data/docs/tabular-parser_6_row_multiple-new-entries_to-root.archive.yaml
+++ b/examples/data/docs/tabular-parser_6_row_multiple-new-entries_to-root.archive.yaml
@@ -8,7 +8,7 @@ definitions:
       m_annotations:
         eln:
       more:
-        label_quantity: my_quantity_1
+        label_quantity: '#/data/my_quantity_1'
       quantities:
         data_file:
           type: str
diff --git a/examples/data/docs/tabular-parser_7_row_multiple-new-entries_to-path.archive.yaml b/examples/data/docs/tabular-parser_7_row_multiple-new-entries_to-path.archive.yaml
index 4fa7ac96f330584dc9754ef31c19ce5123e5416c..430da805977ab0fdbde1af71871950c88fd6ec0d 100644
--- a/examples/data/docs/tabular-parser_7_row_multiple-new-entries_to-path.archive.yaml
+++ b/examples/data/docs/tabular-parser_7_row_multiple-new-entries_to-path.archive.yaml
@@ -42,7 +42,7 @@ definitions:
           m_annotations:
             eln:
           more:
-            label_quantity: my_quantity_1
+            label_quantity: '#/data/my_quantity_1'
           quantities:
             my_quantity_1:
               type: str
diff --git a/examples/data/docs/tabular-parser_8_row_current-entry_to-path_subsubsection.archive.yaml b/examples/data/docs/tabular-parser_8_row_current-entry_to-path_subsubsection.archive.yaml
index 23b7b67891f593a833bdcbb311e6c3e5fee0971f..8ed86ab3e4cc2d26e455281f96116fb3231b9286 100644
--- a/examples/data/docs/tabular-parser_8_row_current-entry_to-path_subsubsection.archive.yaml
+++ b/examples/data/docs/tabular-parser_8_row_current-entry_to-path_subsubsection.archive.yaml
@@ -32,7 +32,7 @@ definitions:
           m_annotations:
             eln:
           more:
-            label_quantity: my_quantity_1
+            label_quantity: '#/data/my_quantity_1'
           quantities:
             my_quantity_1:
               type: str
diff --git a/examples/data/tabular/README.md b/examples/data/tabular/README.md
index 283c55d49149753b37d57563c3e22dc2f0bd91ca..6fa314b49611dc36a9be22d6eb60da8e0b313455 100644
--- a/examples/data/tabular/README.md
+++ b/examples/data/tabular/README.md
@@ -1,6 +1,6 @@
-This upload demonstrates the used of tabular data. In this example we use an *xlsx* file in combination with a custom schema. The schema describes what the columns in the excel file mean and NOMAD can parse everything accordingly to produce a **FAIR** dataset.
+This upload demonstrates the use of tabular data. In this example we use an *xlsx* file in combination with a custom schema. The schema describes what the columns in the Excel file mean and how NOMAD should parse and map their content to produce a **FAIR** dataset.
 
-The schema is meant as a starting point. You can download the schema file and
+This schema is meant as a starting point. You can download the schema file and
 extend the schema for your own tables.
 
-Consult our [documentation on the NOMAD Archive and Metainfo](https://nomad-lab.eu/prod/v1/docs/archive.html) to learn more about schemas.
+Consult our [documentation on the NOMAD Archive and Metainfo](https://nomad-lab.eu/prod/v1/staging/docs/) to learn more about schemas.
diff --git a/examples/data/tabular/periodic-table.archive.xlsx b/examples/data/tabular/data.xlsx
similarity index 100%
rename from examples/data/tabular/periodic-table.archive.xlsx
rename to examples/data/tabular/data.xlsx
diff --git a/examples/data/tabular/periodic-table.archive.yaml b/examples/data/tabular/periodic-table.archive.yaml
index 736b2f78a3ba66230ed2c2e79ecd2de333d23d6f..926e605fc97e4c77497a442ef6b50d8c718c7f33 100644
--- a/examples/data/tabular/periodic-table.archive.yaml
+++ b/examples/data/tabular/periodic-table.archive.yaml
@@ -4,20 +4,37 @@ definitions:
   name: Periodic Table
   sections:
     Element:
+      more:
+        label_quantity: '#/data/name'
       base_sections:
       # We use ElnBaseSection here. This provides a few quantities (name, description, tags)
       # that are added to the search index. If we map table columns to these quantities,
       # we can make those cells available for search.
       - nomad.datamodel.metainfo.eln.ElnBaseSection
       # Schemas that are used to directly parse table files (.csv, .xlsx) need to
-      # have the first definition to extend nomad.parsing.tabular.TableRow.
-      - nomad.parsing.tabular.TableRow
+      # have the first definition to extend nomad.parsing.tabular.TableData.
+      - nomad.parsing.tabular.TableData
       m_annotations:
         # We might not want to show all ElnBaseSection quantities.
         eln:
           hide:
             - lab_id
       quantities:
+        # data_file contains the information on how to parse the excel/csv file. Here we want to
+        # create as many entries as there are rows in the excel file and map the quantities
+        # annotated with 'tabular' from the tabular data into the NOMAD schema of each entry.
+        data_file:
+          type: str
+          default: data.xlsx
+          m_annotations:
+            tabular_parser:
+              parsing_options:
+                comment: '#'
+              mapping_options:
+                - mapping_mode: row
+                  file_mode: multiple_new_entries
+                  sections:
+                    - '#root'
       # Tags will be picked up by ElnBaseSection and put into search. We do not really
       # use this to edit the tags, but we define a default that is then added to
       # all row data.
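Taken together, the example changes above switch `label_quantity` from a bare quantity name to an archive reference path (`'#/data/...'`) and make row-mode schemas extend `nomad.parsing.tabular.TableData`. A minimal sketch of a schema using the new conventions, distilled from the periodic-table example; the section name, file name, and column header (`MyRow`, `my_data.xlsx`, `element`) are illustrative assumptions, not part of this change:

```yaml
definitions:
  name: Row-mode tabular example
  sections:
    MyRow:
      more:
        # Reference path into the entry's data section, not a bare quantity name.
        label_quantity: '#/data/name'
      base_sections:
        - nomad.datamodel.metainfo.eln.ElnBaseSection
        - nomad.parsing.tabular.TableData
      quantities:
        data_file:
          type: str
          default: my_data.xlsx          # hypothetical spreadsheet name
          m_annotations:
            tabular_parser:
              parsing_options:
                comment: '#'
              mapping_options:
                - mapping_mode: row      # one child entry per table row
                  file_mode: multiple_new_entries
                  sections:
                    - '#root'
        name:
          type: str
          m_annotations:
            tabular:
              name: element              # hypothetical column header in the table
```

With `file_mode: multiple_new_entries` and `sections: ['#root']`, each spreadsheet row becomes its own child entry, and `label_quantity` tells NOMAD which quantity to use when naming those entries.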
diff --git a/nomad/parsing/tabular.py b/nomad/parsing/tabular.py
index 42c02e8b240b2e7928ba755dfe2e33b4b4385e0a..127c5ae77822f3f0dbefc17d40bd3f91a662be9c 100644
--- a/nomad/parsing/tabular.py
+++ b/nomad/parsing/tabular.py
@@ -247,9 +247,11 @@ class TableData(ArchiveSection):
             except AttributeError:
                 continue
             section_to_write = section_to_entry
-            if not any(item.label == 'EntryData' for item in section_to_entry.m_def.all_base_sections):
+            if not any(
+                    (item.label == 'EntryData' or item.label == 'ArchiveSection')
+                    for item in section_to_entry.m_def.all_base_sections):
                 logger.warning(
-                    f"make sure to inherit from EntryData in your base sections in {section_to_entry.m_def.name}")
+                    f"make sure to inherit from EntryData in the base sections of {section_to_entry.m_def.name}")
             if not is_quantity_def:
                 pass
                 # raise TabularParserError(
@@ -277,8 +279,9 @@ class TableData(ArchiveSection):
                     setattr(self, single_entry_section.split('/')[0], None)
                     self.m_add_sub_section(
                         self.m_def.all_properties[single_entry_section.split('/')[0]], target_section, -1)
-                from nomad.datamodel import EntryArchive, EntryMetadata
 
+                from nomad.datamodel import EntryArchive, EntryMetadata
+                section_to_entry.fill_archive_from_datafile = False
                 child_archive = EntryArchive(
                     data=section_to_entry,
                     m_context=archive.m_context,
@@ -313,7 +316,7 @@ class TableData(ArchiveSection):
                 logger.warning(f"make sure to inherit from EntryData in your base sections in {section.name}")
 
         try:
-            mainfile_name = getattr(getattr(section.m_root(), 'metadata'), 'mainfile')
+            mainfile_name = getattr(child_sections[0], section.m_def.more.get('label_quantity', '').split('#/data/')[-1])
         except (AttributeError, TypeError):
             logger.info('could not extract the mainfile from metadata. Setting a default name.')
             mainfile_name = section.m_def.name
@@ -346,12 +349,14 @@ class TableData(ArchiveSection):
                 current_child_entry_name = [get_nested_value(first_child, segments), '.yaml']
             except Exception:
                 current_child_entry_name = archive.metadata.mainfile.split('.archive')
+            first_child.m_context = archive.m_context
             self.m_update_from_dict(first_child.m_to_dict())
         for index, child_section in enumerate(child_sections):
             if ref_entry_name:
                 ref_entry_name: str = child_section.m_def.more.get('label_quantity', None)
-                segments = ref_entry_name.split('#/data/')[1].split('/')
+                segments = (ref_entry_name.split('#/data/')[1].split('/')
+                            if '#/data/' in ref_entry_name else [ref_entry_name])
                 filename = f"{get_nested_value(child_section, segments)}.entry_data.archive.{file_type}"
                 current_child_entry_name = [get_nested_value(child_section, segments), '.yaml']
             else:
@@ -367,6 +372,7 @@ class TableData(ArchiveSection):
                 annotation = data_quantity_def.m_get_annotations('tabular_parser')
                 if annotation:
                     child_section.m_update_from_dict({annotation.m_definition.name: data_file})
+                child_section.fill_archive_from_datafile = False
                 child_archive = EntryArchive(
                     data=child_section,
                     m_context=archive.m_context,
@@ -391,7 +397,7 @@ m_package.__init_metainfo__()

 def set_entry_name(quantity_def, child_section, index) -> str:
     if name := child_section.m_def.more.get('label_quantity', None):
-        entry_name = f"{child_section[name]}_{index}"
+        entry_name = f"{child_section[name.split('#/data/')[-1]]}_{index}"
     elif isinstance(quantity_def.type, Reference):
         entry_name = f"{quantity_def.type._target_section_def.name}_{index}"
     else:
@@ -674,7 +680,7 @@ def _strip_whitespaces_from_df_columns(df):
         cleaned_col_name = col_name.strip().split('.')[0]
         count = 0
         for string in transformed_column_names.values():
-            if cleaned_col_name in string:
+            if cleaned_col_name == string.split('.')[0]:
                 count += 1
         if count:
             transformed_column_names.update({col_name: f'{cleaned_col_name}.{count}'})
@@ -755,8 +761,8 @@ class TabularDataParser(MatchingParser):
         return None

     def is_mainfile(
-        self, filename: str, mime: str, buffer: bytes, decoded_buffer: str,
-        compression: str = None
+            self, filename: str, mime: str, buffer: bytes, decoded_buffer: str,
+            compression: str = None
     ) -> Union[bool, Iterable[str]]:
         # We use the main file regex capabilities of the superclass to check if this is a
         # .csv file
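On the parser side, these `tabular.py` changes resolve the new reference paths when naming child entries and their generated mainfiles: the `'#/data/'` prefix is stripped, the remainder is split on `'/'` into segments, and `get_nested_value` walks those segments on each child section. The resolved value then yields entry names of the form `<value>_<index>` (see `set_entry_name`) and files named `<value>.entry_data.archive.<file_type>`. A sketch of how the naming plays out for the periodic-table schema above, with a hypothetical example row; this only illustrates the scheme visible in the diff, it is not additional behavior:

```yaml
definitions:
  sections:
    Element:
      more:
        label_quantity: '#/data/name'   # reference path into the entry's data
# For a spreadsheet row whose 'name' cell is 'Hydrogen' (hypothetical data),
# the child entry produced in multiple_new_entries mode gets:
#   entry name: 'Hydrogen_0'                        (value + '_' + row index)
#   mainfile:   'Hydrogen.entry_data.archive.yaml'  (value + '.entry_data.archive.' + file type)
```

Labels that point deeper than `#/data/` (e.g. `'#/data/sub_section/quantity'`) are covered by the new `segments` logic, which splits the remaining path and resolves it with `get_nested_value` when composing the file name.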