Commit 7a1a338a authored by Markus Scheidgen

Added admin command to rewrite doi url. #818

parent ad52e42a
Pipeline #132644 passed with stages in 84 minutes and 39 seconds
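For context, a minimal sketch of how the new admin command could be invoked programmatically. It assumes Click's default conversion of underscores to dashes, i.e. that the command is exposed as 'rewrite-doi-urls' in the admin group; with no DOI arguments it considers all datasets that have a DOI, and with --dry it only reports without changing anything:

    from click.testing import CliRunner

    from nomad.cli.cli import cli

    # Dry run: check each DOI at DataCite and print its current URL,
    # without rewriting anything.
    runner = CliRunner()
    result = runner.invoke(cli, ['admin', 'rewrite-doi-urls', '--dry'])
    print(result.output)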
@@ -18,7 +18,6 @@
#
import click
import sys
from nomad import config
from nomad.cli.cli import cli
@@ -35,6 +34,7 @@ def admin(ctx):
@click.option('--remove', is_flag=True, help='Do not just reset all dbs, but also remove them.')
@click.option('--i-am-really-sure', is_flag=True, help='Must be set for the command to do anything.')
def reset(remove, i_am_really_sure):
    import sys
    if not i_am_really_sure:
        print('You do not seem to be really sure about what you are doing.')
        sys.exit(1)
@@ -343,6 +343,8 @@ def migrate_mongo(
        host, port, src_db_name, dst_db_name, upload_query, entry_query,
        ids_from_file, failed_ids_to_file, upload_update, entry_update, overwrite, fix_problems, dry):
    import json
    import sys
    from pymongo.database import Database

    from nomad import infrastructure
    import nomad.cli.admin.migrate as migrate
@@ -394,3 +396,54 @@ def migrate_mongo(
    migrate.migrate_mongo_uploads(
        db_src, db_dst, uploads, failed_ids_to_file, upload_update, entry_update, overwrite,
        fix_problems, dry)
@admin.command(
    help='''
    Rewrites the dataset URLs in existing DOI records with freshly generated dataset URLs.
    This is useful if the URL layout has changed.''')
@click.argument('DOIs', nargs=-1)
@click.option('--dry', is_flag=True, help='Just test if the DOI exists and print its current URL.')
@click.option('--save-existing-records', help='A filename to store the existing DOI records in.')
def rewrite_doi_urls(dois, dry, save_existing_records):
    import json
    import requests

    from nomad.doi import edit_doi_url, _create_dataset_url

    existing_records = []
    if len(dois) == 0:
        from nomad import infrastructure
        from nomad.datamodel import Dataset

        infrastructure.setup_mongo()
        datasets = Dataset.m_def.a_mongo.objects(doi__exists=True)
        dois = [dataset.doi for dataset in datasets]

    try:
        for doi in dois:
            # TODO remove this
            if doi == '10.17172/NOMAD/2016.10.14-1':
                continue

            # check if the doi exists
            response = requests.get(f'https://api.datacite.org/dois/{doi}')
            if response.status_code == 404:
                print(f'Cannot rewrite {doi}. DOI does not exist.')
                continue

            data = response.json()
            existing_records.append(data)

            if data['data']['attributes']['url'] == _create_dataset_url(doi):
                print(f'Already up-to-date {doi}')
                continue

            print(f'Updating {doi} ...')
            if not dry:
                edit_doi_url(doi)
    finally:
        if save_existing_records:
            with open(save_existing_records, 'wt') as f:
                json.dump(existing_records, f, indent=2)
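
# Note on --save-existing-records (illustrative summary of the command above): the file
# written in the finally block is a JSON list with one raw DataCite API response per
# processed DOI, i.e. objects of the form {"data": {"attributes": {"url": ...}}}, the
# same structure accessed in the up-to-date check.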
@@ -31,16 +31,43 @@ from nomad.datamodel import User
from nomad import config, utils
def edit_url(doi: str, url: str = None):
def _create_dataset_url(doi: str) -> str:
    '''
    Returns:
        The URL that is set in the DOI record and is used to resolve the DOI. The URL
        points to the dataset page in the NOMAD GUI.
    '''
    return f'{config.gui_url()}/dataset/doi/{doi}'
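
# Illustration with a hypothetical config value: if config.gui_url() returned
# 'https://nomad-lab.eu/prod/rae/gui', then
# _create_dataset_url('10.17172/NOMAD/2016.10.14-1') would yield
# 'https://nomad-lab.eu/prod/rae/gui/dataset/doi/10.17172/NOMAD/2016.10.14-1'.
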
def edit_doi_url(doi: str, url: str = None):
    ''' Changes the URL of an already findable DOI. '''
    if url is None:
        url = 'https://nomad-lab.eu/prod/rae/gui/datasets/doi/%s' % doi
    metadata_url = '%s/doi/%s' % (config.datacite.mds_host, doi)
    response = requests.put(
        metadata_url,
        headers={'Content-Type': 'text/plain;charset=UTF-8'},
        data='doi=%s\nurl=%s' % (doi, url), **_requests_args())
    url = _create_dataset_url(doi)
    doi_url = '%s/doi/%s' % (config.datacite.mds_host, doi)
    headers = {'Content-Type': 'text/plain;charset=UTF-8'}
    data = f'doi={doi}\nurl={url}'
    response = requests.put(doi_url, headers=headers, data=data, **_requests_args())
    # There seems to be a bug in datacite. Some old records might have invalid xml
    # stored at datacite and for those the DOI update can fail. We try to get the xml
    # string, parse and re-serialize it, and put it again. After this the url update
    # might work.
    if response.status_code == 422 and 'No matching global declaration available' in response.text:
        metadata_url = f'{config.datacite.mds_host}/metadata/{doi}'
        response = requests.get(metadata_url, **_requests_args())
        original_xml = response.text
        tree = ET.fromstring(original_xml)
        repaired_xml = ET.tostring(tree, encoding='UTF-8', method='xml').decode('utf-8')
        response = requests.put(
            metadata_url,
            headers={'Content-Type': 'application/xml;charset=UTF-8'},
            data=repaired_xml.encode('utf-8'), **_requests_args())

        response = requests.put(doi_url, headers=headers, data=data, **_requests_args())
        if response.status_code >= 300:
            raise Exception(f'Encountered known xml problems for {doi}, but could not fix them.')

    if response.status_code >= 300:
        raise Exception('Unexpected datacite response (status code %d): %s' % (
@@ -96,7 +123,7 @@ class DOI(Document):
        doi.doi_url = '%s/doi/%s' % (config.datacite.mds_host, doi_str)
        doi.state = 'created'
        doi.create_time = create_time
        doi.url = '%s/dataset/doi/%s' % (config.gui_url(), doi_str)
        doi.url = _create_dataset_url(doi_str)

        affiliation = ''
        if user.affiliation is not None:
......
@@ -31,7 +31,7 @@ app:
worker:
  replicas: 1
  routing: "worker"
  routing: "queue"
  processes: 12
  nomadNodeType: "prod-worker"
......