Commit 39f46527 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Fixed endless recursion in springer.msg. Refactored springer/aflow normalizer.

parent 481c3d82
Pipeline #70117 canceled with stages
in 3 minutes and 13 seconds
......@@ -21,4 +21,4 @@ target/
vscode/
nomad.yaml
gunicorn.log.conf
gunicorn.conf
\ No newline at end of file
gunicorn.conf
......@@ -360,8 +360,14 @@ class ArchiveReader(ArchiveObject):
if positions is None:
r_start = 0
r_end = self._n_toc
while positions is None and len(self._toc) < self._n_toc:
i_block = r_start + math.floor((r_end - r_start) / 2)
i_block = None
while positions is None:
new_i_block = r_start + math.floor((r_end - r_start) / 2)
if i_block == new_i_block:
break
else:
i_block = new_i_block
first, last = self._load_toc_block(i_block)
if key < first:
r_end = i_block - 1
......
......@@ -190,7 +190,9 @@ normalize = NomadConfig(
# The distance tolerance between atoms for grouping them into the same
# cluster. Used in detecting system type.
cluster_threshold=3.1,
)
springer_db_path=os.path.join(
os.path.dirname(os.path.abspath(__file__)),
'normalizing/data/springer.msg'))
client = NomadConfig(
user='leonard.hofstadter@nomad-fairdi.tests.de',
......@@ -220,14 +222,6 @@ use_empty_parsers = False
reprocess_unmatched = True
springer_db_relative_path = 'normalizing/data/SM_all08.db'
springer_db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), springer_db_relative_path)
springer_msg_db_relative_path = 'normalizing/data/springer.msg'
springer_msg_db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), springer_msg_db_relative_path)
def normalize_loglevel(value, default_level=logging.INFO):
plain_value = value
if plain_value is None:
......
......@@ -30,7 +30,7 @@ import os.path
from nomad.archive import query_archive, write_archive, ArchiveReader
from nomad import config
_DB_PATH = config.springer_msg_db_path
_DB_PATH = config.normalize.springer_db_path
required_items = {
'Alphabetic Formula:': 'alphabetic_formula',
......
......@@ -21,14 +21,14 @@ import json
import re
import os
import sqlite3
from matid import SymmetryAnalyzer, Classifier
from matid.classifications import Class0D, Atom, Class1D, Material2D, Surface, Class3D
from nomad.normalizing import structure
from nomad import utils, config
from nomad.normalizing.normalizer import SystemBasedNormalizer
from nomad.normalizing.data.springer_msgpack import query_springer_data
from . import aflow_prototypes
from .normalizer import SystemBasedNormalizer
from .springer import query_springer_data
# use a regular expression to check atom labels; expression is build from list of
# all labels sorted desc to find Br and not B when searching for Br.
......@@ -397,70 +397,11 @@ class SystemNormalizer(SystemBasedNormalizer):
self._backend.closeSection('section_symmetry', symmetry_gid)
def springer_classification(self, atoms, space_group_number, database='msgpack'):
# SPRINGER NORMALIZER
def springer_classification(self, atoms, space_group_number):
normalized_formula = formula_normalizer(atoms)
#
if database == 'sqlite':
springer_db_connection = open_springer_database()
if springer_db_connection is None:
return
cur = springer_db_connection.cursor()
# SQL QUERY
# (this replaces the four queries done in the old 'classify4me_SM_normalizer.py')
cur.execute("""
SELECT
entry.entry_id,
entry.alphabetic_formula,
GROUP_CONCAT(DISTINCT compound_classes.compound_class_name),
GROUP_CONCAT(DISTINCT classification.classification_name)
FROM entry
LEFT JOIN entry_compound_class as ecc ON ecc.entry_nr = entry.entry_nr
LEFT JOIN compound_classes ON ecc.compound_class_nr = compound_classes.compound_class_nr
LEFT JOIN entry_classification as ec ON ec.entry_nr = entry.entry_nr
LEFT JOIN classification ON ec.classification_nr = classification.classification_nr
LEFT JOIN entry_reference as er ON er.entry_nr = entry.entry_nr
LEFT JOIN reference ON reference.reference_nr = er.entry_nr
WHERE entry.normalized_formula = ( %r ) and entry.space_group_number = '%d'
GROUP BY entry.entry_id;
""" % (normalized_formula, space_group_number))
results = cur.fetchall()
# 'results' is a list of tuples, i.e. '[(a,b,c,d), ..., (a,b,c,d)]'
# All SQL queries done
# Storing 'results' in a dictionary
dbdict = {}
for ituple in results:
# 'spr' means 'springer'
spr_id = ituple[0]
spr_aformula = ituple[1] # alphabetical formula
spr_url = 'http://materials.springer.com/isp/crystallographic/docs/' + spr_id
spr_compound = ituple[2].split(',') # split to convert string to list
spr_classification = ituple[3].split(',')
#
spr_compound.sort()
spr_classification.sort()
#
dbdict[spr_id] = {
'spr_id': spr_id,
'spr_aformula': spr_aformula,
'spr_url': spr_url,
'spr_compound': spr_compound,
'spr_classification': spr_classification}
elif database == 'msgpack':
dbdict = query_springer_data(normalized_formula, space_group_number)
# =============
# SPRINGER's METAINFO UPDATE
# LAYOUT: Five sections under 'section_springer_material' for each material ID:
# id, alphabetical formula, url, compound_class, clasification.
# As per Markus/Luca's emails, we don't expose Springer bib references (Springer's paywall)
for material in dbdict.values():
springer_data = query_springer_data(normalized_formula, space_group_number)
for material in springer_data.values():
self._backend.openNonOverlappingSection('section_springer_material')
self._backend.addValue('springer_id', material['spr_id'])
......@@ -473,15 +414,15 @@ class SystemNormalizer(SystemBasedNormalizer):
# Check the 'springer_classification' and 'springer_compound_class' information
# found is the same for all springer_id's
dkeys = list(dbdict.keys())
if len(dkeys) != 0:
class_0 = dbdict[dkeys[0]]['spr_classification']
comp_0 = dbdict[dkeys[0]]['spr_compound']
springer_data_keys = list(springer_data.keys())
if len(springer_data_keys) != 0:
class_0 = springer_data[springer_data_keys[0]]['spr_classification']
comp_0 = springer_data[springer_data_keys[0]]['spr_compound']
# compare 'class_0' and 'comp_0' against the rest
for ii in range(1, len(dkeys)):
class_test = (class_0 == dbdict[dkeys[ii]]['spr_classification'])
comp_test = (comp_0 == dbdict[dkeys[ii]]['spr_compound'])
for ii in range(1, len(springer_data_keys)):
class_test = (class_0 == springer_data[springer_data_keys[ii]]['spr_classification'])
comp_test = (comp_0 == springer_data[springer_data_keys[ii]]['spr_compound'])
if (class_test or comp_test) is False:
self.logger.warning('Mismatch in Springer classification or compounds')
......@@ -495,8 +436,8 @@ class SystemNormalizer(SystemBasedNormalizer):
wyckoff_letters: Array of Wyckoff letters as strings.
spg_number: Space group number.
"""
norm_wyckoff = structure.get_normalized_wyckoff(atom_species, wyckoffs)
protoDict = structure.search_aflow_prototype(spg_number, norm_wyckoff)
norm_wyckoff = aflow_prototypes.get_normalized_wyckoff(atom_species, wyckoffs)
protoDict = aflow_prototypes.search_aflow_prototype(spg_number, norm_wyckoff)
if protoDict is not None:
aflow_prototype_id = protoDict["aflow_prototype_id"]
aflow_prototype_url = protoDict["aflow_prototype_url"]
......
......@@ -3,7 +3,7 @@ import pytest
import msgpack
from io import BytesIO
from nomad import utils
from nomad import utils, config
from nomad.archive import TOCPacker, write_archive, read_archive, ArchiveReader, query_archive
from .utils import assert_exception
......@@ -203,3 +203,9 @@ def test_query():
assert query_archive(f, {'calc2': {'secA': {'subsecA1[0]': '*'}}}) == {'calc2': {'secA': {'subsecA1[0]': [{'propA1a': 2.0}]}}}
# TODO
# test [:][-1][0:1] ...
def test_read_springer():
springer = read_archive(config.springer_msg_db_path)
with assert_exception(KeyError):
springer['doesnotexist']
......@@ -431,7 +431,7 @@ def test_springer_normalizer():
gindex = 0
backend_value = backend.get_value('springer_id', gindex)
expected_value = 'sd_1932539'
expected_value = 'sd_0305232'
assert expected_value == backend_value
backend_value = backend.get_value('springer_alphabetical_formula', gindex)
......@@ -439,7 +439,7 @@ def test_springer_normalizer():
assert expected_value == backend_value
backend_value = backend.get_value('springer_url', gindex)
expected_value = 'http://materials.springer.com/isp/crystallographic/docs/sd_1932539'
expected_value = 'http://materials.springer.com/isp/crystallographic/docs/sd_0305232'
assert expected_value == backend_value
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment