diff --git a/nomad/cli/admin/admin.py b/nomad/cli/admin/admin.py
index 3ddd29e1525ee29ada8643e68d813fcbde3bc400..769dfa28d37bc15ced7611253e65f53daf3bf2f0 100644
--- a/nomad/cli/admin/admin.py
+++ b/nomad/cli/admin/admin.py
@@ -192,44 +192,6 @@ def lift_embargo(dry, parallel):
     __run_processing(uploads_to_repack, parallel, lambda upload: upload.re_pack(), 're-packing')
 
 
-@admin.command(help='(Re-)index all calcs.')
-@click.option('--threads', type=int, default=1, help='Number of threads to use.')
-@click.option('--dry', is_flag=True, help='Do not index, just compute entries.')
-def index(threads, dry):
-    infrastructure.setup_mongo()
-    infrastructure.setup_elastic()
-
-    all_calcs = proc.Calc.objects().count()
-    print('indexing %d ...' % all_calcs)
-
-    def elastic_updates():
-        with utils.ETA(all_calcs, ' index %10d or %10d calcs, ETA %s') as eta:
-            for calc in proc.Calc.objects():
-                eta.add()
-                entry_metadata = datamodel.EntryMetadata.m_from_dict(calc.metadata)
-                entry = entry_metadata.a_elastic.create_index_entry().to_dict(include_meta=True)
-                entry['_op_type'] = 'index'
-                yield entry
-
-    if dry:
-        for _ in elastic_updates():
-            pass
-    else:
-        if threads > 1:
-            print(' use %d threads' % threads)
-            for _ in elasticsearch.helpers.parallel_bulk(
-                    infrastructure.elastic_client, elastic_updates(), chunk_size=500,
-                    thread_count=threads):
-                pass
-        else:
-            elasticsearch.helpers.bulk(
-                infrastructure.elastic_client, elastic_updates())
-        search.refresh()
-
-    print('')
-    print('indexing completed')
-
-
 @admin.command()
 @click.option('--threads', type=int, default=1, help='Number of threads to use.')
 @click.option('--code', multiple=True, type=str, help='Index only calculcations of given codes.')
diff --git a/nomad/cli/admin/uploads.py b/nomad/cli/admin/uploads.py
index 9834e32f9d8e3a506d1515f54793b2f49be41059..6d6e58d8d987db9f31b2b868195092bc88f9deba 100644
--- a/nomad/cli/admin/uploads.py
+++ b/nomad/cli/admin/uploads.py
@@ -247,19 +247,34 @@ def reset(ctx, uploads, with_calcs):
 @uploads.command(help='(Re-)index all calcs of the given uploads.')
 @click.argument('UPLOADS', nargs=-1)
 @click.option('--parallel', default=1, type=int, help='Use the given amount of parallel processes. Default is 1.')
+@click.option('--transformer', help='Qualified name to a Python function that should be applied to each EntryMetadata.')
 @click.pass_context
-def index(ctx, uploads, parallel):
+def index(ctx, uploads, parallel, transformer):
+    transformer_func = None
+    if transformer is not None:
+        import importlib
+        module_name, func_name = transformer.rsplit('.', 1)
+        module = importlib.import_module(module_name)
+        transformer_func = getattr(module, func_name)
+
     _, uploads = query_uploads(ctx, uploads)
 
+    def transform(calcs):
+        for calc in calcs:
+            try:
+                # The return value is not used: a transformer has to modify the
+                # given entry in place.
+                transformer_func(calc)
+            except Exception as e:
+                import traceback
+                traceback.print_exc()
+                print(f' ERROR failed to transform calc (stop transforming for upload): {str(e)}')
+                break
+
     def index_upload(upload, logger):
         with upload.entries_metadata() as calcs:
-            # This is just a temporary fix to update the group hash without re-processing
-            try:
-                for calc in calcs:
-                    if calc.dft is not None:
-                        calc.dft.update_group_hash()
-            except Exception:
-                pass
+            if transformer is not None:
+                transform(calcs)
             failed = search.index_all(calcs)
             if failed > 0:
                 print(' WARNING failed to index %d entries' % failed)
diff --git a/nomad/normalizing/optimade.py b/nomad/normalizing/optimade.py
index 99c35af589547ccc0b2eb9bc0e381745e2dc225e..b56a0f6075bc2a806441b604ea2df5011368de04 100644
--- a/nomad/normalizing/optimade.py
+++ b/nomad/normalizing/optimade.py
@@ -33,6 +33,37 @@ from nomad.datamodel.metainfo.public import section_system
 species_re = re.compile(r'^([A-Z][a-z]?)(\d*)$')
 
 
+def transform_to_v1(entry: EntryMetadata) -> EntryMetadata:
+    '''
+    Transformation function to use during re-indexing of entries with outdated optimade
+    format. Fixes formulas and periodic dimensions, and removes the optimade section of
+    entries with X in the reduced formula. The given entry is modified in place and
+    returned.
+    '''
+    optimade = entry.dft.optimade if entry.dft is not None else None
+    if optimade is None:
+        return entry
+
+    # The reduced formula may be missing (the formula helper below accepts None);
+    # `'X' in None` would raise a TypeError, so test for None first.
+    formula_reduced = optimade.chemical_formula_reduced
+    if formula_reduced is not None and 'X' in formula_reduced:
+        entry.dft.m_remove_sub_section(DFTMetadata.optimade, -1)
+        return entry
+
+    optimade.chemical_formula_reduced = optimade_chemical_formula_reduced(formula_reduced)
+    # NOTE(review): confirm that the anonymous/hill helpers also accept None.
+    optimade.chemical_formula_anonymous = optimade_chemical_formula_anonymous(optimade.chemical_formula_reduced)
+    optimade.chemical_formula_hill = optimade_chemical_formula_hill(optimade.chemical_formula_hill)
+    optimade.chemical_formula_descriptive = optimade.chemical_formula_hill
+    dimension_types = optimade.dimension_types
+    if isinstance(dimension_types, int):
+        # An integer n is expanded into n ones followed by (3 - n) zeros.
+        optimade.dimension_types = [1] * dimension_types + [0] * (3 - dimension_types)
+
+    return entry
+
+
 def optimade_chemical_formula_reduced(formula: str):
     if formula is None:
         return formula
diff --git a/tests/test_cli.py b/tests/test_cli.py
index a35c48d76d57b63921fe99992b092fea947548b2..20bf313be31c5d244c9ba60730eaef19ee9e213b 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -117,21 +117,6 @@ class TestAdmin:
         with files.UploadFiles.get(upload_id=upload_id).read_archive(calc_id=calc.calc_id) as archive:
             assert calc.calc_id in archive
 
-    def test_index(self, published):
-        upload_id = published.upload_id
-        calc = Calc.objects(upload_id=upload_id).first()
-        calc.metadata['comment'] = 'specific'
-        calc.save()
-
-        assert search.SearchRequest().search_parameter('comment', 'specific').execute()['total'] == 0
-
-        result = click.testing.CliRunner().invoke(
-            cli, ['admin', 'index', '--threads', '2'], catch_exceptions=False)
-        assert result.exit_code == 0
-        assert 'index' in result.stdout
-
-        assert search.SearchRequest().search_parameter('comment', 'specific').execute()['total'] == 1
-
     def test_delete_entry(self, published):
         upload_id = published.upload_id
         calc = Calc.objects(upload_id=upload_id).first()
@@ -145,6 +130,11 @@ class TestAdmin:
         assert Calc.objects(calc_id=calc.calc_id).first() is None
 
 
+def transform_for_index_test(calc):
+    calc.comment = 'specific'
+    return calc
+
+
 @pytest.mark.usefixtures('reset_config', 'no_warn')
 class TestAdminUploads:
 
@@ -236,6 +226,21 @@ class TestAdminUploads:
 
         assert search.SearchRequest().search_parameters(comment='specific').execute()['total'] == 1
 
+    def test_index_with_transform(self, published):
+        upload_id = published.upload_id
+        assert search.SearchRequest().search_parameters(comment='specific').execute()['total'] == 0
+
+        result = click.testing.CliRunner().invoke(
+            cli, [
+                'admin', 'uploads', 'index',
+                '--transformer', 'tests.test_cli.transform_for_index_test',
+                upload_id],
+            catch_exceptions=False)
+        assert result.exit_code == 0
+        assert 'index' in result.stdout
+
+        assert search.SearchRequest().search_parameters(comment='specific').execute()['total'] == 1
+
     def test_re_process(self, published, monkeypatch):
         monkeypatch.setattr('nomad.config.meta.version', 'test_version')
         upload_id = published.upload_id