Commit a41ac458 authored by Markus Scheidgen

Merge branch 'v0.8.0' of gitlab.mpcdf.mpg.de:nomad-lab/nomad-FAIR into v0.8.0

parents 0c5188ef 3077fcd7
Pipeline #70880 canceled with stages in 1 minute and 34 seconds
@@ -21,4 +21,4 @@ target/
 vscode/
 nomad.yaml
 gunicorn.log.conf
-gunicorn.conf
\ No newline at end of file
+gunicorn.conf
@@ -91,7 +91,7 @@ tests:
     NOMAD_ELASTIC_HOST: elastic
     NOMAD_MONGO_HOST: mongo
     NOMAD_KEYCLOAK_PASSWORD: ${CI_KEYCLOAK_ADMIN_PASSWORD}
-    NOMAD_SPRINGER_DB_PATH: /nomad/fairdi/db/data/springer.db
+    NOMAD_NORMALIZE_SPRINGER_DB_PATH: /nomad/fairdi/db/data/springer.msg
   script:
     - cd /app
     - ls /builds
...
@@ -360,8 +360,14 @@ class ArchiveReader(ArchiveObject):
         if positions is None:
             r_start = 0
             r_end = self._n_toc
+            i_block = None
             while positions is None:
-                i_block = r_start + math.floor((r_end - r_start) / 2)
+                new_i_block = r_start + math.floor((r_end - r_start) / 2)
+                if i_block == new_i_block:
+                    break
+                else:
+                    i_block = new_i_block
                 first, last = self._load_toc_block(i_block)
                 if key < first:
                     r_end = i_block - 1
@@ -371,6 +377,9 @@ class ArchiveReader(ArchiveObject):
                     positions = self._toc.get(key)
                     break

+            if positions is None:
+                raise KeyError(key)
+
             toc_position, data_position = positions
         else:
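The two hunks above bound the TOC binary search and turn a miss into a KeyError instead of an endless loop. A minimal standalone sketch of the resulting control flow (simplified names; load_block is a hypothetical stand-in for _load_toc_block that returns a block's first key, last key, and entries; the key > last branch is inferred from the elided middle of the hunk):

import math

def find_in_toc(key, n_toc, load_block):
    # binary search over sorted TOC blocks; stop once the midpoint stops moving
    positions, i_block = None, None
    r_start, r_end = 0, n_toc
    while positions is None:
        new_i_block = r_start + math.floor((r_end - r_start) / 2)
        if i_block == new_i_block:
            break  # search interval exhausted, key is not in the TOC
        i_block = new_i_block
        first, last, entries = load_block(i_block)
        if key < first:
            r_end = i_block - 1
        elif key > last:
            r_start = i_block + 1
        else:
            positions = entries.get(key)
            break
    if positions is None:
        raise KeyError(key)
    return positions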
@@ -496,17 +505,18 @@ def query_archive(f, query_dict: dict):
         key = key.strip()

         # process array indices
-        match = re.match(r'([_a-bA-Z0-9]+)\[([0-9]+|:)\]', key)
+        match = re.match(r'(\w+)\[([-?0-9:]+)\]', key)
         if match:
             archive_key = match.group(1)
             index_str = match.group(2)
-            match = re.match(r'([0-9]*):([0-9]*)', index_str)
+            match = re.match(r'([-?0-9]*):([-?0-9]*)', index_str)
             if match:
                 index = (
                     0 if match.group(1) == '' else int(match.group(1)),
                     None if match.group(2) == '' else int(match.group(2)))
             else:
                 index = int(index_str)  # type: ignore
+            key = archive_key
         else:
             archive_key = key
             index = None
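The widened patterns above accept negative indices and open slices ('ss1[-2]', 'ss1[1:]', 'ss1[-3:-1]'), not just a single non-negative index or ':'. A quick standalone illustration of how a query key now decomposes, using the same two expressions:

import re

key_re = re.compile(r'(\w+)\[([-?0-9:]+)\]')      # key plus index or slice
slice_re = re.compile(r'([-?0-9]*):([-?0-9]*)')   # start:stop, either side optional

for key in ['ss1[0]', 'ss1[1:]', 'ss1[-2]', 'ss1[-3:-1]']:
    match = key_re.match(key)
    archive_key, index_str = match.group(1), match.group(2)
    slice_match = slice_re.match(index_str)
    if slice_match:
        index = (
            0 if slice_match.group(1) == '' else int(slice_match.group(1)),
            None if slice_match.group(2) == '' else int(slice_match.group(2)))
    else:
        index = int(index_str)
    print(archive_key, index)  # e.g. 'ss1' (-3, -1)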
@@ -515,7 +525,6 @@ def query_archive(f, query_dict: dict):
         archive_key = key.split('[')[0]
         if main_section:
             archive_key = adjust_uuid_size(key)
-
         try:
             if index is None:
                 res[key] = _load_data(val, archive_item[archive_key])
...
@@ -161,6 +161,7 @@ class ArchiveMetainfo:
         return Quantity(type=dtype)

     def _create_section(self, name, contents):
+        name = name.split('[')[0]
         section = type(name.title(), (MSection,), contents)
         return section
@@ -245,9 +246,9 @@ class ArchiveQuery:
                 host=host,
                 user=nomad_config.client.user,
                 password=nomad_config.client.password,
-                server_url=nomad_config.keycloak.server_external_url,
+                server_url=nomad_config.keycloak.server_url,
                 realm_name=nomad_config.keycloak.realm_name,
-                client_id=nomad_config.keycloak.public_client_id)
+                client_id=nomad_config.keycloak.client_id)

         if isinstance(self._authentication, KeycloakAuthenticator):
             return self._authentication.apply()
         else:
...
@@ -33,9 +33,10 @@ from bs4 import BeautifulSoup
 from matid import SymmetryAnalyzer

 from nomad import processing as proc, search, datamodel, infrastructure, utils, config
-from nomad.normalizing.structure import get_normalized_wyckoff
+from nomad.normalizing.aflow_prototypes import get_normalized_wyckoff
 from nomad.cli.cli import cli
 from nomad import config
+from nomad.normalizing.springer import update_springer_data


 def __run_processing(
@@ -528,3 +529,10 @@ def prototypes_update(ctx, filepath, matches_only):

     # Write data file to the specified path
     write_prototype_data_file(aflow_prototypes, filepath)
+
+
+@admin.command(help='Updates the springer database in nomad.config.normalize.springer_db_path.')
+@click.option('--max-n-query', default=10, type=int, help='Number of unsuccessful springer requests before returning an error. Default is 10.')
+@click.option('--retry-time', default=120, type=int, help='Time in seconds to wait before retrying after an unsuccessful request. Default is 120.')
+def springer_update(max_n_query, retry_time):
+    update_springer_data(max_n_query, retry_time)
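Assuming the standard nomad admin entry point and click's default renaming of underscores to dashes in command names, the new command would be invoked roughly as:

    nomad admin springer-update --max-n-query 5 --retry-time 60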
@@ -190,7 +190,9 @@ normalize = NomadConfig(
     # The distance tolerance between atoms for grouping them into the same
     # cluster. Used in detecting system type.
     cluster_threshold=3.1,
-)
+    springer_db_path=os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        'normalizing/data/springer.msg'))

 client = NomadConfig(
     user='leonard.hofstadter@nomad-fairdi.tests.de',
@@ -220,10 +222,6 @@ use_empty_parsers = False
 reprocess_unmatched = True

-springer_db_relative_path = 'normalizing/data/SM_all08.db'
-springer_db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), springer_db_relative_path)
-

 def normalize_loglevel(value, default_level=logging.INFO):
     plain_value = value
     if plain_value is None:
...
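Moving springer_db_path into the normalize NomadConfig also changes its environment override, which is why the CI hunk above renames NOMAD_SPRINGER_DB_PATH to NOMAD_NORMALIZE_SPRINGER_DB_PATH. A hedged sketch of the NOMAD_<SECTION>_<KEY> convention this implies (illustrative only, not the actual config loader):

import os

def resolve(section, key, default):
    # mirror how NOMAD_NORMALIZE_SPRINGER_DB_PATH overrides normalize.springer_db_path
    env_name = 'NOMAD_%s_%s' % (section.upper(), key.upper())
    return os.environ.get(env_name, default)

springer_db_path = resolve('normalize', 'springer_db_path', 'normalizing/data/springer.msg')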
 SM_all08.db
-springer.msg
\ No newline at end of file
+springer.msg
@@ -17,10 +17,6 @@ Generates and queries a msgpack database of springer-related quantities downloaded from
 http://materials.springer.com. The database is structured as

     space_group_number : normalized_formula : springer_id : entry
-
-The msgpack file can be queried using ArchiveFileDB.
-
-The html parser was taken from a collection of scripts from FHI without further testing.
 '''

 import requests
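The nesting described in the docstring (space group, then normalized formula, then entries) maps directly onto query_archive queries, as query_springer_data further down in this file does. A hedged usage sketch with made-up values:

from nomad import config
from nomad.archive import query_archive

# all entries for space group 225 with normalized formula 'ClNa'; values illustrative
entries = query_archive(config.normalize.springer_db_path, {'225': {'ClNa': '*'}})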
@@ -28,12 +24,12 @@ import re
 from bs4 import BeautifulSoup
 from typing import Dict, List, Any
 from time import sleep
-import os
+import os.path

 from nomad.archive import query_archive, write_archive, ArchiveReader
 from nomad import config

-DB_NAME = '.springer.msg'
+_DB_PATH = config.normalize.springer_db_path

 required_items = {
     'Alphabetic Formula:': 'alphabetic_formula',
@@ -41,6 +37,7 @@ required_items = {
     'Compound Class(es):': 'compound_classes',
     'Dataset ID': 'id',
     'Space Group:': 'space_group_number',
+    'Phase Label(s):': 'phase_labels'
 }

 spaces_re = re.compile(r'\s+')
@@ -118,11 +115,15 @@ def parse(htmltext: str) -> Dict[str, str]:
         results['compound_classes'] = [x for x in results['compound_classes'] if x != '–']

     normalized_formula = None
-    if 'alphabetic_formula' in results:
-        try:
-            normalized_formula = normalize_formula(results['alphabetic_formula'])
-        except Exception:
-            pass
+    for formula_type in ['alphabetic_formula', 'phase_labels']:
+        formula = results.get(formula_type, None)
+        if formula:
+            try:
+                normalized_formula = normalize_formula(formula)
+                break
+            except Exception:
+                pass

     results['normalized_formula'] = normalized_formula

     return results
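To make the new fallback order concrete: a toy run with normalize_formula stubbed out (the real one lives in this module), showing that phase_labels is only consulted when alphabetic_formula is missing or fails to normalize:

def normalize_formula(formula):
    return formula.replace(' ', '')  # stand-in for the module's real normalizer

for results in ({'alphabetic_formula': 'Cl Na'}, {'phase_labels': 'NaCl rt'}):
    normalized_formula = None
    for formula_type in ['alphabetic_formula', 'phase_labels']:
        formula = results.get(formula_type, None)
        if formula:
            try:
                normalized_formula = normalize_formula(formula)
                break
            except Exception:
                pass
    print(normalized_formula)  # 'ClNa', then 'NaClrt' from the fallback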
@@ -140,7 +141,7 @@ def _merge_dict(dict0: Dict[str, Any], dict1: Dict[str, Any]) -> Dict[str, Any]:
     return dict0


-def _download(path: str, max_n_query: int = 10) -> str:
+def _download(path: str, max_n_query: int = 10, retry_time: int = 120) -> str:
     n_query = 0
     while True:
         response = requests.get(path)
@@ -149,7 +150,7 @@ def _download(path: str, max_n_query: int = 10) -> str:
             if n_query > max_n_query:
                 break
             n_query += 1
-            sleep(120)
+            sleep(retry_time)

     if response.status_code != 200:
         response.raise_for_status()
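Assembled from the two hunks above, the retry behavior of _download is roughly the following (a sketch under the assumption that any non-200 status triggers a retry; the real condition in the elided lines may be narrower):

import requests
from time import sleep

def download_sketch(path, max_n_query=10, retry_time=120):
    n_query = 0
    while True:
        response = requests.get(path)
        if response.status_code == 200:
            break
        if n_query > max_n_query:
            break  # give up and let raise_for_status report the error
        n_query += 1
        sleep(retry_time)  # now configurable instead of a hard-coded 120 s
    if response.status_code != 200:
        response.raise_for_status()
    return response.text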
@@ -157,7 +158,7 @@ def _download(path: str, max_n_query: int = 10) -> str:
     return response.text


-def download_springer_data(max_n_query: int = 10):
+def update_springer_data(max_n_query: int = 10, retry_time: int = 120):
     '''
     Downloads the springer quantities related to a structure from springer and updates the
     database.
@@ -165,11 +166,11 @@ def download_springer_data(max_n_query: int = 10):
     # load database
     # querying the database with an unavailable dataset leads to an error,
     # get toc keys first by making an empty query
-    archive = ArchiveReader(DB_NAME)
+    archive = ArchiveReader(_DB_PATH)
     _ = archive._load_toc_block(0)
     archive_keys = archive._toc.keys()

-    sp_data = query_archive(DB_NAME, {spg: '*' for spg in archive_keys})
+    sp_data = query_archive(_DB_PATH, {spg: '*' for spg in archive_keys})

     sp_ids: List[str] = []
     for spg in sp_data:
@@ -181,19 +182,23 @@ def download_springer_data(max_n_query: int = 10):
     page = 1
     while True:
         # check springer database for new entries by comparing with local database
-        root = 'https://materials.springer.com/search?searchTerm=&pageNumber=%d&datasourceFacet=sm_isp&substanceId=' % page
-        req_text = _download(root, max_n_query)
+        root = 'http://materials.springer.com/search?searchTerm=&pageNumber=%d&datasourceFacet=sm_isp&substanceId=' % page
+        req_text = _download(root, max_n_query, retry_time)
         if 'Sorry,' in req_text:
             break

         paths = search_re.findall(req_text)
+        if len(paths) == 0:
+            break

         for path in paths:
             sp_id = os.path.basename(path)
             if sp_id in sp_ids:
                 continue

             path = 'http://materials.springer.com%s' % path
-            req_text = _download(path, max_n_query)
+            req_text = _download(path, max_n_query, retry_time)
             try:
                 data = parse(req_text)
             except Exception:
@@ -203,10 +208,12 @@ def download_springer_data(max_n_query: int = 10):
             normalized_formula = data.get('normalized_formula', None)
             if space_group_number is None or normalized_formula is None:
                 continue

             aformula = data.get('alphabetic_formula', None)
+            if aformula is None:
+                aformula = data.get('phase_labels', None)
+
             compound = data.get('compound_classes', None)
             classification = data.get('classification', None)

             entry = dict(
                 aformula=aformula, url=path, compound=compound,
                 classification=classification)
@@ -215,14 +222,12 @@ def download_springer_data(max_n_query: int = 10):
         page += 1

-    write_archive(DB_NAME, len(sp_data), sp_data.items(), entry_toc_depth=1)
+    write_archive(_DB_PATH, len(sp_data), sp_data.items(), entry_toc_depth=1)


 def query_springer_data(normalized_formula: str, space_group_number: int) -> Dict[str, Any]:
-    '''
-    Queries a msgpack database for springer-related quantities.
-    '''
-    entries = query_archive(DB_NAME, {str(space_group_number): {normalized_formula: '*'}})
+    ''' Queries a msgpack database for springer-related quantities. '''
+    entries = query_archive(_DB_PATH, {str(space_group_number): {normalized_formula: '*'}})

     db_dict = {}
     entries = entries.get(str(space_group_number), {}).get(normalized_formula, {})
...
@@ -19,16 +19,14 @@ from ase import Atoms
 import numpy as np
 import json
 import re
-import os
-import sqlite3

 from matid import SymmetryAnalyzer, Classifier
 from matid.classifications import Class0D, Atom, Class1D, Material2D, Surface, Class3D

-from nomad.normalizing import structure
 from nomad import utils, config
-from nomad.normalizing.normalizer import SystemBasedNormalizer
-from nomad.normalizing.data.springer_msgpack import query_springer_data
+from . import aflow_prototypes
+from .normalizer import SystemBasedNormalizer
+from .springer import query_springer_data

 # use a regular expression to check atom labels; the expression is built from the list of
 # all labels sorted desc to find Br and not B when searching for Br.
@@ -36,28 +34,6 @@ atom_label_re = re.compile('|'.join(
     sorted(ase.data.chemical_symbols, key=lambda x: len(x), reverse=True)))

-springer_db_connection = None
-
-
-def open_springer_database():
-    '''
-    Create a global connection to the Springer database in a way that
-    each worker opens the database just once.
-    '''
-    global springer_db_connection
-    if springer_db_connection is None:
-        # filepath definition in 'nomad-FAIR/nomad/config.py'
-        db_file = config.springer_db_path
-        if not os.path.exists(db_file):
-            utils.get_logger(__name__).error('Springer database not found')
-            return None
-
-        springer_db_connection = sqlite3.connect(db_file, check_same_thread=False, uri=True)
-        # we lift the thread check because we share the connection among workers
-        # 'uri=True': open a database in read-only mode
-
-    return springer_db_connection
-

 def normalized_atom_labels(atom_labels):
     '''
     Normalizes the given atom labels: they either are labels right away, or contain
@@ -397,94 +373,43 @@ class SystemNormalizer(SystemBasedNormalizer):
             self._backend.closeSection('section_symmetry', symmetry_gid)

-    def springer_classification(self, atoms, space_group_number, database='sqlite'):
-        # SPRINGER NORMALIZER
+    def springer_classification(self, atoms, space_group_number):
         normalized_formula = formula_normalizer(atoms)
-        #
-        if database == 'sqlite':
-            springer_db_connection = open_springer_database()
-            if springer_db_connection is None:
-                return
-
-            cur = springer_db_connection.cursor()
-
-            # SQL QUERY
-            # (this replaces the four queries done in the old 'classify4me_SM_normalizer.py')
-            cur.execute('''
-                SELECT
-                    entry.entry_id,
-                    entry.alphabetic_formula,
-                    GROUP_CONCAT(DISTINCT compound_classes.compound_class_name),
-                    GROUP_CONCAT(DISTINCT classification.classification_name)
-                FROM entry
-                LEFT JOIN entry_compound_class as ecc ON ecc.entry_nr = entry.entry_nr
-                LEFT JOIN compound_classes ON ecc.compound_class_nr = compound_classes.compound_class_nr
-                LEFT JOIN entry_classification as ec ON ec.entry_nr = entry.entry_nr
-                LEFT JOIN classification ON ec.classification_nr = classification.classification_nr
-                LEFT JOIN entry_reference as er ON er.entry_nr = entry.entry_nr
-                LEFT JOIN reference ON reference.reference_nr = er.entry_nr
-                WHERE entry.normalized_formula = ( %r ) and entry.space_group_number = '%d'
-                GROUP BY entry.entry_id;
-            ''' % (normalized_formula, space_group_number))
-
-            results = cur.fetchall()
-            # 'results' is a list of tuples, i.e. '[(a,b,c,d), ..., (a,b,c,d)]'
-            # All SQL queries done
-
-            # Storing 'results' in a dictionary
-            dbdict = {}
-            for ituple in results:
-                # 'spr' means 'springer'
-                spr_id = ituple[0]
-                spr_aformula = ituple[1]  # alphabetical formula
-                spr_url = 'http://materials.springer.com/isp/crystallographic/docs/' + spr_id
-                spr_compound = ituple[2].split(',')  # split to convert string to list
-                spr_classification = ituple[3].split(',')
-                #
-                spr_compound.sort()
-                spr_classification.sort()
-                #
-                dbdict[spr_id] = {
-                    'spr_id': spr_id,
-                    'spr_aformula': spr_aformula,
-                    'spr_url': spr_url,
-                    'spr_compound': spr_compound,
-                    'spr_classification': spr_classification}
-
-        elif database == 'msgpack':
-            dbdict = query_springer_data(normalized_formula, space_group_number)

-        # =============
         # SPRINGER's METAINFO UPDATE
         # LAYOUT: Five sections under 'section_springer_material' for each material ID:
         # id, alphabetical formula, url, compound_class, classification.
         # As per Markus/Luca's emails, we don't expose Springer bib references (Springer's paywall)
-        for material in dbdict.values():
+        springer_data = query_springer_data(normalized_formula, space_group_number)
+        for material in springer_data.values():
             self._backend.openNonOverlappingSection('section_springer_material')

             self._backend.addValue('springer_id', material['spr_id'])
             self._backend.addValue('springer_alphabetical_formula', material['spr_aformula'])
             self._backend.addValue('springer_url', material['spr_url'])
-            self._backend.addArrayValues('springer_compound_class', material['spr_compound'])
-            self._backend.addArrayValues('springer_classification', material['spr_classification'])
+
+            compound_classes = material['spr_compound']
+            if compound_classes is None:
+                compound_classes = []
+            self._backend.addArrayValues('springer_compound_class', compound_classes)
+
+            classifications = material['spr_classification']
+            if classifications is None:
+                classifications = []
+            self._backend.addArrayValues('springer_classification', classifications)

             self._backend.closeNonOverlappingSection('section_springer_material')

         # Check that the 'springer_classification' and 'springer_compound_class' information
         # found is the same for all springer_ids
-        dkeys = list(dbdict.keys())
-        if len(dkeys) != 0:
-            class_0 = dbdict[dkeys[0]]['spr_classification']
-            comp_0 = dbdict[spr_id]['spr_compound']
+        springer_data_keys = list(springer_data.keys())
+        if len(springer_data_keys) != 0:
+            class_0 = springer_data[springer_data_keys[0]]['spr_classification']
+            comp_0 = springer_data[springer_data_keys[0]]['spr_compound']

             # compare 'class_0' and 'comp_0' against the rest
-            for ii in range(1, len(dkeys)):
-                class_test = (class_0 == dbdict[dkeys[ii]]['spr_classification'])
-                comp_test = (comp_0 == dbdict[dkeys[ii]]['spr_compound'])
+            for ii in range(1, len(springer_data_keys)):
+                class_test = (class_0 == springer_data[springer_data_keys[ii]]['spr_classification'])
+                comp_test = (comp_0 == springer_data[springer_data_keys[ii]]['spr_compound'])

                 if (class_test or comp_test) is False:
-                    self.logger.warning('Mismatch in Springer classification or compounds')
+                    self.logger.info('Mismatch in Springer classification or compounds')

     def prototypes(self, atom_species: np.array, wyckoffs: np.array, spg_number: int) -> None:
         '''Tries to match the material to an entry in the AFLOW prototype data.
@@ -495,8 +420,8 @@ class SystemNormalizer(SystemBasedNormalizer):
             wyckoff_letters: Array of Wyckoff letters as strings.
             spg_number: Space group number.
         '''
-        norm_wyckoff = structure.get_normalized_wyckoff(atom_species, wyckoffs)
-        protoDict = structure.search_aflow_prototype(spg_number, norm_wyckoff)
+        norm_wyckoff = aflow_prototypes.get_normalized_wyckoff(atom_species, wyckoffs)
+        protoDict = aflow_prototypes.search_aflow_prototype(spg_number, norm_wyckoff)
         if protoDict is not None:
             aflow_prototype_id = protoDict["aflow_prototype_id"]
             aflow_prototype_url = protoDict["aflow_prototype_url"]
...
@@ -59,7 +59,8 @@ data:
       realm_name: "{{ .Values.keycloak.realmName }}"
       username: "{{ .Values.keycloak.username }}"
       client_id: "{{ .Values.keycloak.clientId }}"
-    springer_db_path: "{{ .Values.springerDbPath }}"
+    normalize:
+      springer_db_path: "{{ .Values.springerDbPath }}"
     datacite:
       enabled: {{ .Values.datacite.enabled }}
       prefix: "{{ .Values.datacite.prefix }}"
@@ -152,7 +152,7 @@ volumes:
 # The domain configuration, currently there is dft and ems
 domain: dft

-springerDbPath: /nomad/fairdi/db/data/springer.db
+springerDbPath: /nomad/fairdi/db/data/springer.msg

 # Will reprocess calculations with their old matched parser, even if they do not
 # match this parser anymore
...
@@ -3,9 +3,11 @@ import pytest
 import msgpack
 from io import BytesIO

-from nomad import utils
+from nomad import utils, config
 from nomad.archive import TOCPacker, write_archive, read_archive, ArchiveReader, query_archive
+from .utils import assert_exception


 def create_example_uuid(index: int = 0):
     return ('{:%dd}' % utils.default_hash_len).format(index)
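assert_exception is imported from the tests' local utils module; its implementation is not part of this diff. A minimal hypothetical version, just to show the contract the new assertions below rely on:

from contextlib import contextmanager

@contextmanager
def assert_exception(exception_type=Exception):
    # hypothetical sketch: the enclosed block must raise exception_type
    try:
        yield
    except exception_type:
        return
    assert False, 'expected %s to be raised' % exception_type.__name__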
@@ -145,6 +147,15 @@ def test_read_archive_single(example_uuid, example_entry, use_blocked_toc):
         assert data[example_uuid]['run'].to_dict() == example_entry['run']
         assert data[example_uuid].to_dict() == example_entry

+        with assert_exception(KeyError):
+            data['does not exist']
+
+        with assert_exception(KeyError):
+            data[example_uuid]['does not exist']
+
+        with assert_exception(IndexError):
+            data[example_uuid]['run']['system'][2]
+

 @pytest.mark.parametrize('use_blocked_toc', [False, True])
 def test_read_archive_multi(example_uuid, example_entry, use_blocked_toc):
@@ -171,15 +182,15 @@ def test_read_archive_multi(example_uuid, example_entry, use_blocked_toc):

 def test_query():
     payload = {
-        'calc1': {
-            'secA': {
-                'subsecA1': [{'propA1a': 1.0}]
+        'c1': {
+            's1': {
+                'ss1': [{'p1': 1.0, 'p2': 'x'}, {'p1': 1.5, 'p2': 'y'}]
             },
-            'secB': {'propB1a': ['a', 'b']}
+            's2': {'p1': ['a', 'b']}
         },
-        'calc2': {
-            'secA': {'subsecA1': [{'propA1a': 2.0}]},
-            'secB': {'propB1a': ['c', 'd']}
+        'c2': {
+            's1': {'ss1': [{'p1': 2.0}]},
+            's2': {'p1': ['c', 'd']}
         }
     }
@@ -188,7 +199,19 @@ def test_query():
     packed_archive = f.getbuffer()

     f = BytesIO(packed_archive)
-    assert query_archive(f, {'calc1': '*'}) == {'calc1': payload['calc1']}
-    assert query_archive(f, {'calc2': {'secA': {'subsecA1[0]': '*'}}}) == {'calc2': {'secA': {'subsecA1[0]': [{'propA1a': 2.0}]}}}
-    # TODO
-    # test [:][-1][0:1] ...
+    assert query_archive(f, {'c1': '*'}) == {'c1': payload['c1']}
+    assert query_archive(f, {'c1': '*', 'c2': {'s1': '*'}}) == {'c1': payload['c1'], 'c2': {'s1': payload['c2']['s1']}}
+    assert query_archive(f, {'c2': {'s1': {'ss1[0]': '*'}}}) == {'c2': {'s1': {'ss1': payload['c2']['s1']['ss1'][0]}}}
+    assert query_archive(f, {'c1': {'s1': {'ss1[1:]': '*'}}}) == {'c1': {'s1': {'ss1': payload['c1']['s1']['ss1'][1:]}}}
+    assert query_archive(f, {'c1': {'s1': {'ss1[:2]': '*'}}}) == {'c1': {'s1': {'ss1': payload['c1']['s1']['ss1'][:2]}}}
+    assert query_archive(f, {'c1': {'s1': {'ss1[0:2]': '*'}}}) == {'c1': {'s1': {'ss1': payload['c1']['s1']['ss1'][0:2]}}}
+    assert query_archive(f, {'c1': {'s1': {'ss1[-2]': '*'}}}) == {'c1': {'s1': {'ss1': payload['c1']['s1']['ss1'][-2]}}}
+    assert query_archive(f, {'c1': {'s1': {'ss1[:-1]': '*'}}}) == {'c1': {'s1': {'ss1': payload['c1']['s1']['ss1'][:-1]}}}
+    assert query_archive(f, {'c1': {'s1': {'ss1[1:-1]': '*'}}}) == {'c1': {'s1': {'ss1': payload['c1']['s1']['ss1'][1:-1]}}}
+    assert query_archive(f, {'c2': {'s1': {'ss1[-3:-1]': '*'}}}) == {'c2': {'s1': {'ss1': payload['c2']['s1']['ss1'][-3:-1]}}}
+
+
+def test_read_springer():
+    springer = read_archive(config.normalize.springer_db_path)
+    with assert_exception(KeyError):
+        springer['doesnotexist']
@@ -437,16 +437,18 @@ def test_springer_normalizer():
     backend = parse_file(vasp_parser)
     backend = run_normalize(backend)

-    backend_value = backend.get_value('springer_id', 89)
-    expected_value = 'sd_1932539'
+    gindex = 0
+
+    backend_value = backend.get_value('springer_id', gindex)
+    expected_value = 'sd_0305232'
     assert expected_value == backend_value

-    backend_value = backend.get_value('springer_alphabetical_formula', 89)
+    backend_value = backend.get_value('springer_alphabetical_formula', gindex)
     expected_value = 'O3SrTi'
     assert expected_value == backend_value

-    backend_value = backend.get_value('springer_url', 89)
-    expected_value = 'http://materials.springer.com/isp/crystallographic/docs/sd_1932539'
+    backend_value = backend.get_value('springer_url', gindex)
+    expected_value = 'http://materials.springer.com/isp/crystallographic/docs/sd_0305232'
     assert expected_value == backend_value
...