Commit c1dae8ee authored by Cuauhtemoc Salazar's avatar Cuauhtemoc Salazar
Browse files

Springer normalizer update. Pipeline issues clean up

parent 584a2f39
Subproject commit c3bb8b69771a3cc87ca500f80a182380175e09cf
Subproject commit 6cc658ff3117dfccad9aace953d225385e59c5cf
......@@ -206,6 +206,7 @@ max_upload_size = 32 * (1024 ** 3)
springer_db_relative_path = 'normalizing/data/SM_all08.db'
springer_db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), springer_db_relative_path)
def normalize_loglevel(value, default_level=logging.INFO):
plain_value = value
if plain_value is None:
......
......@@ -42,9 +42,10 @@ atom_label_re = re.compile('|'.join(
springer_db_connection = None
def open_springer_database():
"""
Create a global connection to the Springer database in a way that
Create a global connection to the Springer database in a way that
each worker opens the database just once.
"""
global springer_db_connection
......@@ -54,10 +55,11 @@ def open_springer_database():
if not os.path.exists(db_file):
utils.get_logger(__name__).error('Springer database not found')
return None
springer_db_connection = sqlite3.connect(db_file)
springer_db_connection = sqlite3.connect(db_file)
return springer_db_connection
def normalized_atom_labels(atom_labels):
"""
Normalizes the given atom labels: they either are labels right away, or contain
......@@ -68,26 +70,26 @@ def normalized_atom_labels(atom_labels):
ase.data.chemical_symbols[0] if match is None else match.group(0)
for match in [re.search(atom_label_re, atom_label) for atom_label in atom_labels]]
def formula_normalizer(atoms):
    """
    Reads the chemical symbols in ase.atoms and returns a normalized formula.

    Formula normalization is on the basis of atom counting: every element is
    followed by its share of the total number of atoms, rounded to an integer
    percentage, e.g., Tc -> Tc100, SZn -> S50Zn50, Co2Nb -> Co67Nb33.

    Arguments:
        atoms: An object providing ``get_chemical_symbols()``, which returns
            one chemical symbol per atom.

    Returns:
        The ``<symbol><percentage>`` entries, sorted as strings and joined
        without a separator. An object without atoms yields an empty string.
    """
    atoms_counter = Counter(atoms.get_chemical_symbols())
    atoms_total = sum(atoms_counter.values())

    # Each percentage is rounded on its own, so the numbers of one formula are
    # not guaranteed to add up to exactly 100 (three equal parts give 33 each).
    atoms_normed = sorted(
        symbol + str(round(100 * count / atoms_total))
        for symbol, count in atoms_counter.items())

    return ''.join(atoms_normed)
class SystemNormalizer(SystemBasedNormalizer):
......@@ -170,7 +172,7 @@ class SystemNormalizer(SystemBasedNormalizer):
if atom_species is None:
atom_species = atoms.get_atomic_numbers().tolist()
self._backend.addArrayValues('atom_species', atom_species)
else:
else:
if not isinstance(atom_species, list):
atom_species = [atom_species]
if atom_species != atoms.get_atomic_numbers().tolist():
......@@ -385,31 +387,31 @@ class SystemNormalizer(SystemBasedNormalizer):
self._backend.addArrayValues('equivalent_atoms_original', orig_equivalent_atoms)
self._backend.closeSection('section_original_system', orig_gid)
self._backend.closeSection('section_symmetry', symmetry_gid)
self.springer_classification(atoms, space_group_number) # Springer Normalizer
self.springer_classification(atoms, space_group_number) # Springer Normalizer
self.prototypes(prim_num, prim_wyckoff, space_group_number)
self._backend.closeSection('section_symmetry', symmetry_gid)
def springer_classification(self, atoms, space_group_number):
# SPRINGER NORMALIZER
normalized_formula = formula_normalizer(atoms)
#
normalized_formula = formula_normalizer(atoms)
#
springer_db_connection = open_springer_database()
if springer_db_connection is None:
if springer_db_connection is None:
return
cur = springer_db_connection.cursor()
cur = springer_db_connection.cursor()
# SQL QUERY
# SQL QUERY
# (this replaces the four queries done in the old 'classify4me_SM_normalizer.py')
cur.execute("""
cur.execute("""
SELECT
entry.entry_id,
entry.alphabetic_formula,
GROUP_CONCAT(DISTINCT compound_classes.compound_class_name),
GROUP_CONCAT(DISTINCT classification.classification_name)
GROUP_CONCAT(DISTINCT classification.classification_name)
FROM entry
LEFT JOIN entry_compound_class as ecc ON ecc.entry_nr = entry.entry_nr
LEFT JOIN compound_classes ON ecc.compound_class_nr = compound_classes.compound_class_nr
......@@ -422,57 +424,57 @@ class SystemNormalizer(SystemBasedNormalizer):
""" % (normalized_formula, space_group_number))
results = cur.fetchall()
# All SQL queries done
dbdict = {}
for ituple in results:
for ituple in results:
for item in ituple:
# 'spr' means 'springer'
spr_id = ituple[0]
spr_aformula = ituple[1] # alphabetical formula
spr_aformula = ituple[1] # alphabetical formula
spr_url = 'http://materials.springer.com/isp/crystallographic/docs/' + spr_id
spr_compound = ituple[2].split(',') # convert string to list
spr_classification = ituple[3].split(',')
#
#
spr_compound.sort()
spr_classification.sort()
dbdict[spr_id] = {'spr_id': spr_id,
'spr_aformula': spr_aformula,
'spr_url': spr_url,
'spr_compound': spr_compound,
'spr_classification': spr_classification }
dbdict[spr_id] = {'spr_id': spr_id,
'spr_aformula': spr_aformula,
'spr_url': spr_url,
'spr_compound': spr_compound,
'spr_classification': spr_classification}
# SPRINGER's METAINFO UPDATE
# LAYOUT: Five sections under 'section_springer_material' for each material ID:
# SPRINGER's METAINFO UPDATE
# LAYOUT: Five sections under 'section_springer_material' for each material ID:
# id, alphabetical formula, url, compound_class, classification.
# As per Markus/Luca's emails, we don't expose Springer bib references (Springer's paywall)
for material in dbdict.values():
self._backend.openNonOverlappingSection('section_springer_material')
for material in dbdict.values():
self._backend.openNonOverlappingSection('section_springer_material')
self._backend.addValue('springer_id', material['spr_id'])
self._backend.addValue('springer_alphabetical_formula', material['spr_aformula'])
self._backend.addValue('springer_alphabetical_formula', material['spr_aformula'])
self._backend.addValue('springer_url', material['spr_url'])
self._backend.addArrayValues('springer_compound_class', material['spr_compound'])
self._backend.addArrayValues('springer_classification', material['spr_classification'])
self._backend.closeNonOverlappingSection('section_springer_material')
self._backend.closeNonOverlappingSection('section_springer_material')
# CHECK if the springer_classification and springer_compound_class found for each springer_id match
# Check the 'springer_classification' and 'springer_compound_class' information
# found is the same for all springer_id's
dkeys = list(dbdict.keys())
class_0 = dbdict[dkeys[0]]['spr_classification']
comp_0 = dbdict[spr_id]['spr_compound']
for ii in range(1, len(dkeys)):
class_test = class_0 == dbdict[dkeys[ii]]['spr_classification']
comp_test = comp_0 == dbdict[dkeys[ii]]['spr_compound']
if (class_test or comp_test) is False:
self.logger.warning('Mismatch in Springer classification or compounds')
if len(dkeys) != 0:
class_0 = dbdict[dkeys[0]]['spr_classification']
comp_0 = dbdict[spr_id]['spr_compound']
# compare 'class_0' and 'comp_0' against the rest
for ii in range(1, len(dkeys)):
class_test = (class_0 == dbdict[dkeys[ii]]['spr_classification'])
comp_test = (comp_0 == dbdict[dkeys[ii]]['spr_compound'])
if (class_test or comp_test) is False:
self.logger.warning('Mismatch in Springer classification or compounds')
def prototypes(self, atomSpecies, wyckoffs, spg_nr):
try:
norm_wyckoff = SystemNormalizer.get_normalized_wyckoff(atomSpecies, wyckoffs)
......
......@@ -205,31 +205,31 @@ def test_vasp_incar_system():
"""
backend = parse_file(vasp_parser)
backend = run_normalize(backend)
expected_value = 'SrTiO3' # material's formula in vasp.xml
expected_value = 'SrTiO3' # material's formula in vasp.xml
# backend_value = backend.get_value('x_vasp_unknown_incars') # OK
# backend_value = backend.get_value('x_vasp_atom_kind_refs') # OK
backend_value = backend.get_value('x_vasp_incar_SYSTEM') # OK
#backend_value = backend.get_value('x_vasp_unknown_incars') # OK
#backend_value = backend.get_value('x_vasp_atom_kind_refs') # OK
backend_value = backend.get_value('x_vasp_incar_SYSTEM') # OK
print("backend_value: ", backend_value)
assert expected_value == backend_value
def test_springer_normalizer():
    """
    Ensure the Springer normalizer works well with the VASP example.

    Parses the VASP example, runs the normalizers on it and compares the
    Springer id, alphabetical formula and url stored in the backend against
    the values expected for that example.
    """
    backend = parse_file(vasp_parser)
    backend = run_normalize(backend)

    # NOTE(review): 89 is the second argument every get_value call below used
    # in the original test; presumably the index of the Springer section that
    # holds the expected entry -- confirm against the backend API.
    springer_index = 89

    assert backend.get_value('springer_id', springer_index) == 'sd_1932539'
    assert backend.get_value('springer_alphabetical_formula', springer_index) == 'O3SrTi'
    assert backend.get_value('springer_url', springer_index) == (
        'http://materials.springer.com/isp/crystallographic/docs/sd_1932539')

    # FIXME: search for ID,
    # also check NON empty for the others
    # TODO: add test for prototypes
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment