Commit e3276dd8 authored by Markus Scheidgen

Added a simple cli script for data migration.

parent fdfd8e6e
@@ -76,6 +76,30 @@ class MirrorUploadsResource(Resource):
            abort(400, message='Could not query mongodb: %s' % str(e))
def _upload_data(upload_id, upload_json, calcs_col, datasets_col, dois_col):
    # Collect the upload's calcs plus the datasets and DOIs they reference
    # from the given mongo collections.
    calcs = []
    datasets = {}
    dois = {}

    for calc in calcs_col.find(dict(upload_id=upload_id)):
        calcs.append(calc)
        for dataset in calc['metadata'].get('datasets', []):
            if dataset not in datasets:
                datasets[dataset] = datasets_col.find_one(dict(_id=dataset))
                doi = datasets[dataset].get('doi', None)
                if doi is not None:
                    doi_obj = dois_col.find_one(dict(_id=doi))
                    if doi_obj is not None:
                        dois[doi] = doi_obj

    return {
        'upload_id': upload_id,
        'upload': upload_json,
        'calcs': calcs,
        'datasets': datasets,
        'dois': dois
    }
@upload_route(ns)
class MirrorUploadResource(Resource):
    @api.response(400, 'Not available for the given upload, e.g. upload not published.')
@@ -95,28 +119,14 @@ class MirrorUploadResource(Resource):
        if upload.tasks_running or upload.process_running:
            abort(400, message='Only non processing uploads can be exported')

        calcs = []
        datasets = {}
        dois = {}

        for calc in proc.Calc._get_collection().find(dict(upload_id=upload_id)):
            calcs.append(calc)
            for dataset in calc['metadata'].get('datasets', []):
                if dataset not in datasets:
                    datasets[dataset] = _Dataset._get_collection().find_one(dict(_id=dataset))
                    doi = datasets[dataset].get('doi', None)
                    if doi is not None:
                        doi_obj = DOI._get_collection().find_one(dict(_id=doi))
                        if doi_obj is not None:
                            dois[doi] = doi_obj

        return {
            'upload_id': upload_id,
            'upload': upload.to_json(),
            'calcs': calcs,
            'datasets': datasets,
            'dois': dois,
            'upload_files_path': upload.upload_files.os_path
        }, 200
        upload_data = _upload_data(
            upload.upload_id,
            upload.to_json(),
            calcs_col=proc.Calc._get_collection(),
            datasets_col=_Dataset._get_collection(),
            dois_col=DOI._get_collection())
        upload_data.update(upload_files_path=upload.upload_files.os_path)

        return upload_data, 200
_mirror_files_parser = api.parser()
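As a reading aid (not part of the commit): the point of extracting _upload_data is that the same gathering logic can now run against raw pymongo collections of another NOMAD mongo database, not just the live mongoengine collections. The sketch below illustrates that; the host, database name and the uploads/calcs/datasets handles are illustrative assumptions, only the d_o_i collection name is taken from this commit, and default=str is only a defensive guard for serialization.

import json
import pymongo

from nomad.app.api.mirror import _upload_data

client = pymongo.MongoClient('localhost', 27017)
old_db = client['nomad_v0_7']  # hypothetical name of an old NOMAD database

first_upload = old_db.uploads.find_one()
data = _upload_data(
    first_upload['_id'], json.dumps(first_upload, default=str),
    calcs_col=old_db.calcs, datasets_col=old_db.datasets, dois_col=old_db.d_o_i)
print('%d calcs, %d datasets, %d dois' % (
    len(data['calcs']), len(data['datasets']), len(data['dois'])))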
@@ -51,4 +51,4 @@ lazy_import.lazy_module('nomad.config')
lazy_import.lazy_module('nomad.files')
lazy_import.lazy_module('nomad.archive')
from . import admin, uploads, entries, run, clean, users # noqa
from . import admin, uploads, entries, run, clean, users, migrate # noqa
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import click

from .admin import admin


@admin.command(help='Migrate data from older NOMAD versions')
@click.option('--mongo-db', help='The database name of the existing data', type=str)
def migrate(mongo_db: str):
    import pymongo
    import sys
    import json

    from nomad import config, processing as proc, doi as nomad_doi, datamodel
    from nomad.app.api.mirror import _upload_data
    from nomad.cli.client.mirror import v0Dot7, fix_time

    _Dataset = datamodel.Dataset.m_def.a_mongo.mongo_cls

    # connect to the mongodb instance that holds the old data
    client = pymongo.MongoClient(config.mongo.host, config.mongo.port)
    db = getattr(client, mongo_db)
    if db is None:
        print('The given mongo database %s does not exist' % mongo_db)
        sys.exit(1)
    for upload in db.uploads.find():
        print('migrating upload with id %s' % upload['_id'])

        # gather the upload's calcs, datasets and DOIs from the old database
        upload_data = _upload_data(
            upload['_id'], json.dumps(upload),
            calcs_col=db.calcs, datasets_col=db.datasets, dois_col=db.d_o_i)
        # transform the gathered data from the old v0.7 layout to the current one
        upload_data = v0Dot7(upload_data)

        # create mongo documents in the current database
        try:
            upload = proc.Upload.from_json(upload_data['upload'], created=True)

            if upload_data['datasets'] is not None:
                for dataset in upload_data['datasets'].values():
                    fix_time(dataset, ['created'])
                    _Dataset._get_collection().update(dict(_id=dataset['_id']), dataset, upsert=True)

            if upload_data['dois'] is not None:
                for doi in upload_data['dois'].values():
                    if doi is not None and nomad_doi.DOI.objects(doi=doi['_id']).first() is None:
                        fix_time(doi, ['create_time'])
                        nomad_doi.DOI._get_collection().update(dict(_id=doi['_id']), doi, upsert=True)

            if len(upload_data['calcs']) > 0:
                for calc in upload_data['calcs']:
                    fix_time(calc, ['create_time', 'complete_time'])
                    fix_time(calc['metadata'], ['upload_time', 'last_processing'])

                proc.Calc._get_collection().insert(upload_data['calcs'])

            upload.save()
        except Exception as e:
            print('Could not migrate the uploads: %s' % str(e))
            print('Please reset and try again.')
            sys.exit(1)

        # reprocess the migrated upload
        upload.reset()
        upload.re_process_upload()
        upload.block_until_complete(interval=.5)

        if upload.tasks_status == proc.FAILURE:
            print('upload processed with failure')
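For reference (not part of the commit): the new command would presumably be run through the regular nomad console script, roughly as nomad admin migrate --mongo-db <old database name>. The sketch below exercises it programmatically with click's test runner; the module path nomad.cli.admin.migrate and the database name are assumptions, not taken from the diff.

from click.testing import CliRunner

from nomad.cli.admin.migrate import migrate  # assumed module path for the new file

runner = CliRunner()
result = runner.invoke(migrate, ['--mongo-db', 'nomad_v0_7'])  # hypothetical old database name
print(result.exit_code)
print(result.output)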