From e3276dd8bbca986ce1aceb131d3e65190b5f43a4 Mon Sep 17 00:00:00 2001
From: Markus Scheidgen <markus.scheidgen@gmail.com>
Date: Thu, 13 Aug 2020 18:10:30 +0200
Subject: [PATCH] Added a simple cli script for data migration.

---
 nomad/app/api/mirror.py     | 54 +++++++++++++++-----------
 nomad/cli/admin/__init__.py |  2 +-
 nomad/cli/admin/migrate.py  | 76 +++++++++++++++++++++++++++++++++++++
 3 files changed, 109 insertions(+), 23 deletions(-)
 create mode 100644 nomad/cli/admin/migrate.py

diff --git a/nomad/app/api/mirror.py b/nomad/app/api/mirror.py
index 4f8ed4b256..cce09e3f54 100644
--- a/nomad/app/api/mirror.py
+++ b/nomad/app/api/mirror.py
@@ -76,6 +76,30 @@ class MirrorUploadsResource(Resource):
             abort(400, message='Could not query mongodb: %s' % str(e))
 
 
+def _upload_data(upload_id, upload_json, calcs_col, datasets_col, dois_col):
+    calcs = []
+    datasets = {}
+    dois = {}
+    for calc in calcs_col.find(dict(upload_id=upload_id)):
+        calcs.append(calc)
+        for dataset in calc['metadata'].get('datasets', []):
+            if dataset not in datasets:
+                datasets[dataset] = datasets_col.find_one(dict(_id=dataset))
+                doi = datasets[dataset].get('doi', None)
+                if doi is not None:
+                    doi_obj = dois_col.find_one(dict(_id=doi))
+                    if doi_obj is not None:
+                        dois[doi] = doi_obj
+
+    return {
+        'upload_id': upload_id,
+        'upload': upload_json,
+        'calcs': calcs,
+        'datasets': datasets,
+        'dois': dois
+    }
+
+
 @upload_route(ns)
 class MirrorUploadResource(Resource):
     @api.response(400, 'Not available for the given upload, e.g. upload not published.')
@@ -95,28 +119,14 @@ class MirrorUploadResource(Resource):
         if upload.tasks_running or upload.process_running:
             abort(400, message='Only non processing uploads can be exported')
 
-        calcs = []
-        datasets = {}
-        dois = {}
-        for calc in proc.Calc._get_collection().find(dict(upload_id=upload_id)):
-            calcs.append(calc)
-            for dataset in calc['metadata'].get('datasets', []):
-                if dataset not in datasets:
-                    datasets[dataset] = _Dataset._get_collection().find_one(dict(_id=dataset))
-                    doi = datasets[dataset].get('doi', None)
-                    if doi is not None:
-                        doi_obj = DOI._get_collection().find_one(dict(_id=doi))
-                        if doi_obj is not None:
-                            dois[doi] = doi_obj
-
-        return {
-            'upload_id': upload_id,
-            'upload': upload.to_json(),
-            'calcs': calcs,
-            'datasets': datasets,
-            'dois': dois,
-            'upload_files_path': upload.upload_files.os_path
-        }, 200
+        upload_data = _upload_data(
+            upload.upload_id,
+            upload.to_json(),
+            calcs_col=proc.Calc._get_collection(),
+            datasets_col=_Dataset._get_collection(),
+            dois_col=DOI._get_collection())
+        upload_data.update(upload_files_path=upload.upload_files.os_path)
+        return upload_data, 200
 
 
 _mirror_files_parser = api.parser()
diff --git a/nomad/cli/admin/__init__.py b/nomad/cli/admin/__init__.py
index 6d1a106d55..ecc9cff1c9 100644
--- a/nomad/cli/admin/__init__.py
+++ b/nomad/cli/admin/__init__.py
@@ -51,4 +51,4 @@ lazy_import.lazy_module('nomad.config')
 lazy_import.lazy_module('nomad.files')
 lazy_import.lazy_module('nomad.archive')
 
-from . import admin, uploads, entries, run, clean, users  # noqa
+from . import admin, uploads, entries, run, clean, users, migrate  # noqa
diff --git a/nomad/cli/admin/migrate.py b/nomad/cli/admin/migrate.py
new file mode 100644
index 0000000000..95ea0b5484
--- /dev/null
+++ b/nomad/cli/admin/migrate.py
@@ -0,0 +1,76 @@
+# Copyright 2018 Markus Scheidgen
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import click
+
+from .admin import admin
+
+
+@admin.group(help='Migrate data from older NOMAD versions')
+@click.option('--mongo-db', help='The database name of the existing data', type=str)
+def migrate(mongo_db: str, elastic_index: str):
+    import pymongo
+    import sys
+    import json
+    from nomad import config, processing as proc, doi as nomad_doi, datamodel
+    from nomad.app.api.mirror import _upload_data
+    from nomad.cli.client.mirror import v0Dot7, fix_time, _Dataset
+
+    _Dataset = datamodel.Dataset.m_def.a_mongo.mongo_cls
+
+    client = pymongo.MongoClient(config.mongo.host, config.mongo.port)
+    db = getattr(client, mongo_db)
+    if db is None:
+        print('The given mongo database %s does not exist' % mongo_db)
+        sys.exit(1)
+
+    for upload in db.uploads.find():
+        print('migrating upload with id %s' % upload['_id'])
+        upload_data = _upload_data(upload['_id'], json.dumps(upload), calcs_col=db.calcs, datasets_col=db.datasets, dois_col=db.d_o_i)
+        upload_data = v0Dot7(upload_data)
+
+        proc.Upload._get_collection().insert(upload)
+        for calc in db.calcs.find(dict(upload_id=upload['_id'])):
+            proc.Upload.from_dict(upload).save()
+
+        # create the mongo documents (upload, datasets, DOIs, calcs)
+        try:
+            upload = proc.Upload.from_json(upload_data['upload'], created=True)
+            if upload_data['datasets'] is not None:
+                for dataset in upload_data['datasets'].values():
+                    fix_time(dataset, ['created'])
+                    _Dataset._get_collection().update(dict(_id=dataset['_id']), dataset, upsert=True)
+            if upload_data['dois'] is not None:
+                for doi in upload_data['dois'].values():
+                    if doi is not None and nomad_doi.DOI.objects(doi=doi).first() is None:
+                        fix_time(doi, ['create_time'])
+                        nomad_doi.DOI._get_collection().update(dict(_id=doi['_id']), doi, upsert=True)
+            if len(upload_data['calcs']) > 0:
+                for calc in upload_data['calcs']:
+                    fix_time(calc, ['create_time', 'complete_time'])
+                    fix_time(calc['metadata'], ['upload_time', 'last_processing'])
+                proc.Calc._get_collection().insert(upload_data['calcs'])
+            upload.save()
+        except Exception as e:
+            print('Could not migrate the uploads: %s' % str(e))
+            print('Please reset and try again.')
+            sys.exit(1)
+
+        # reprocess
+        upload.reset()
+        upload.re_process_upload()
+        upload.block_until_complete(interval=.5)
+
+        if upload.tasks_status == proc.FAILURE:
+            print('upload processed with failure')
-- 
GitLab