From e3276dd8bbca986ce1aceb131d3e65190b5f43a4 Mon Sep 17 00:00:00 2001
From: Markus Scheidgen <markus.scheidgen@gmail.com>
Date: Thu, 13 Aug 2020 18:10:30 +0200
Subject: [PATCH] Added a simple CLI script for data migration.

---
 nomad/app/api/mirror.py     | 54 +++++++++++++++----------
 nomad/cli/admin/__init__.py |  2 +-
 nomad/cli/admin/migrate.py  | 73 +++++++++++++++++++++++++++++++++++++
 3 files changed, 106 insertions(+), 23 deletions(-)
 create mode 100644 nomad/cli/admin/migrate.py

diff --git a/nomad/app/api/mirror.py b/nomad/app/api/mirror.py
index 4f8ed4b256..cce09e3f54 100644
--- a/nomad/app/api/mirror.py
+++ b/nomad/app/api/mirror.py
@@ -76,6 +76,30 @@ class MirrorUploadsResource(Resource):
             abort(400, message='Could not query mongodb: %s' % str(e))
 
 
+def _upload_data(upload_id, upload_json, calcs_col, datasets_col, dois_col):
+    calcs = []
+    datasets = {}
+    dois = {}
+    for calc in calcs_col.find(dict(upload_id=upload_id)):
+        calcs.append(calc)
+        for dataset in calc['metadata'].get('datasets', []):
+            if dataset not in datasets:
+                datasets[dataset] = datasets_col.find_one(dict(_id=dataset))
+                doi = datasets[dataset].get('doi', None)
+                if doi is not None:
+                    doi_obj = dois_col.find_one(dict(_id=doi))
+                    if doi_obj is not None:
+                        dois[doi] = doi_obj
+
+    return {
+        'upload_id': upload_id,
+        'upload': upload_json,
+        'calcs': calcs,
+        'datasets': datasets,
+        'dois': dois
+    }
+
+
 @upload_route(ns)
 class MirrorUploadResource(Resource):
     @api.response(400, 'Not available for the given upload, e.g. upload not published.')
@@ -95,28 +119,14 @@ class MirrorUploadResource(Resource):
         if upload.tasks_running or upload.process_running:
             abort(400, message='Only non processing uploads can be exported')
 
-        calcs = []
-        datasets = {}
-        dois = {}
-        for calc in proc.Calc._get_collection().find(dict(upload_id=upload_id)):
-            calcs.append(calc)
-            for dataset in calc['metadata'].get('datasets', []):
-                if dataset not in datasets:
-                    datasets[dataset] = _Dataset._get_collection().find_one(dict(_id=dataset))
-                    doi = datasets[dataset].get('doi', None)
-                    if doi is not None:
-                        doi_obj = DOI._get_collection().find_one(dict(_id=doi))
-                        if doi_obj is not None:
-                            dois[doi] = doi_obj
-
-        return {
-            'upload_id': upload_id,
-            'upload': upload.to_json(),
-            'calcs': calcs,
-            'datasets': datasets,
-            'dois': dois,
-            'upload_files_path': upload.upload_files.os_path
-        }, 200
+        upload_data = _upload_data(
+            upload.upload_id,
+            upload.to_json(),
+            calcs_col=proc.Calc._get_collection(),
+            datasets_col=_Dataset._get_collection(),
+            dois_col=DOI._get_collection())
+        upload_data.update(upload_files_path=upload.upload_files.os_path)
+        return upload_data, 200
 
 
 _mirror_files_parser = api.parser()
diff --git a/nomad/cli/admin/__init__.py b/nomad/cli/admin/__init__.py
index 6d1a106d55..ecc9cff1c9 100644
--- a/nomad/cli/admin/__init__.py
+++ b/nomad/cli/admin/__init__.py
@@ -51,4 +51,4 @@ lazy_import.lazy_module('nomad.config')
 lazy_import.lazy_module('nomad.files')
 lazy_import.lazy_module('nomad.archive')
 
-from . import admin, uploads, entries, run, clean, users  # noqa
+from . import admin, uploads, entries, run, clean, users, migrate  # noqa
diff --git a/nomad/cli/admin/migrate.py b/nomad/cli/admin/migrate.py
new file mode 100644
index 0000000000..95ea0b5484
--- /dev/null
+++ b/nomad/cli/admin/migrate.py
@@ -0,0 +1,73 @@
+# Copyright 2018 Markus Scheidgen
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import click
+
+from .admin import admin
+
+
+@admin.command(help='Migrate data from older NOMAD versions')
+@click.option('--mongo-db', help='The database name of the existing data', type=str, required=True)
+def migrate(mongo_db: str):
+    import pymongo
+    import sys
+    from bson import json_util
+    from nomad import config, processing as proc, doi as nomad_doi, datamodel
+    from nomad.app.api.mirror import _upload_data
+    from nomad.cli.client.mirror import v0Dot7, fix_time
+
+    _Dataset = datamodel.Dataset.m_def.a_mongo.mongo_cls
+
+    client = pymongo.MongoClient(config.mongo.host, config.mongo.port)
+    if mongo_db not in client.list_database_names():
+        print('The given mongo database %s does not exist' % mongo_db)
+        sys.exit(1)
+    db = client[mongo_db]
+
+    for upload in db.uploads.find():
+        print('migrating upload with id %s' % upload['_id'])
+        # json_util handles the ObjectId and datetime values in the raw mongo document
+        upload_data = _upload_data(upload['_id'], json_util.dumps(upload), calcs_col=db.calcs, datasets_col=db.datasets, dois_col=db.d_o_i)
+        upload_data = v0Dot7(upload_data)
+
+        # create the upload, its datasets, dois, and calcs in the current mongodb
+        try:
+            upload = proc.Upload.from_json(upload_data['upload'], created=True)
+            if upload_data['datasets'] is not None:
+                for dataset in upload_data['datasets'].values():
+                    fix_time(dataset, ['created'])
+                    _Dataset._get_collection().update(dict(_id=dataset['_id']), dataset, upsert=True)
+            if upload_data['dois'] is not None:
+                for doi in upload_data['dois'].values():
+                    if doi is not None and nomad_doi.DOI.objects(doi=doi).first() is None:
+                        fix_time(doi, ['create_time'])
+                        nomad_doi.DOI._get_collection().update(dict(_id=doi['_id']), doi, upsert=True)
+            if len(upload_data['calcs']) > 0:
+                for calc in upload_data['calcs']:
+                    fix_time(calc, ['create_time', 'complete_time'])
+                    fix_time(calc['metadata'], ['upload_time', 'last_processing'])
+                proc.Calc._get_collection().insert(upload_data['calcs'])
+            upload.save()
+        except Exception as e:
+            print('Could not migrate the upload: %s' % str(e))
+            print('Please reset and try again.')
+            sys.exit(1)
+
+        # reprocess the migrated upload with the current NOMAD version
+        upload.reset()
+        upload.re_process_upload()
+        upload.block_until_complete(interval=.5)
+
+        if upload.tasks_status == proc.FAILURE:
+            print('upload processed with failure')
-- 
GitLab
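
Note (not part of the patch): a minimal sketch of how the new _upload_data helper from nomad/app/api/mirror.py could be exercised directly against an old NOMAD mongo database, outside of the Flask endpoint and the CLI command. It only relies on the signature and return value introduced above; the database name 'old_nomad_db' and the host/port are placeholders, and the collection names mirror those used in nomad/cli/admin/migrate.py.

    # Sketch only: inspect one upload via the _upload_data helper added in this patch.
    # 'old_nomad_db', 'localhost', and 27017 are placeholders for an existing source database.
    import pymongo
    from bson import json_util

    from nomad.app.api.mirror import _upload_data

    client = pymongo.MongoClient('localhost', 27017)
    db = client['old_nomad_db']

    upload = db.uploads.find_one()  # pick any one upload document from the old database
    if upload is not None:
        data = _upload_data(
            upload['_id'],
            json_util.dumps(upload),  # serialized similarly to the endpoint's upload.to_json()
            calcs_col=db.calcs,
            datasets_col=db.datasets,
            dois_col=db.d_o_i)
        # the helper returns plain dicts/lists: upload_id, upload, calcs, datasets, dois
        print(data['upload_id'], len(data['calcs']), len(data['datasets']), len(data['dois']))

With the patch applied, the migration itself would presumably be invoked as nomad admin migrate --mongo-db <old-db-name>, which then re-inserts and reprocesses each upload as shown in migrate.py.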