Commit 2b83f245 authored by Alvin Noe Ladines

Implemented major changes to springer_msgpack

parent 8918e397
Pipeline #69870 passed with stages in 18 minutes and 47 seconds
......@@ -6,7 +6,6 @@ from msgpack.fallback import Packer, StringIO
import struct
import json
import math
import os.path
import re
from nomad import utils
......@@ -481,120 +480,6 @@ def read_archive(file_or_path: str, **kwargs) -> ArchiveReader:
return ArchiveReader(file_or_path, **kwargs)
class ArchiveFileDB:
def __init__(self, fileio: Union[str, BytesIO], mode: str = 'r', entry_toc_depth: int = 2, **kwargs):
self._fileobj = fileio
self._mode = mode
self._entry_toc_depth = entry_toc_depth
self._data: Dict[str, Any] = {}
self._key_length = utils.default_hash_len
self._db = None
self._ids: List[str] = []
self._infokey = self._adjust_key('INFO', 'X')
def write(self, abspath: str, relpath: str):
"""
Mimic the zipfile function to write files to database.
Arguments:
abspath: The absolute path to the file to be read
relpath: For compatibility with zipfile
"""
self.add_data(abspath)
def close(self, save: bool = True):
"""
Mimic the zipfile function to close the msgpack file.
Will trigger the creation of the database when in write mode.
Arguments:
save: If True will add the current data in memory to database
"""
if 'w' in self._mode:
self.create_db()
if isinstance(self._fileobj, BytesIO) and save:
self._fileobj.close()
self._fileobj = None
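A minimal write-then-read round trip (a sketch; the file name and payload are hypothetical, and it assumes ArchiveFileDB accepts a plain file path):
db = ArchiveFileDB('example.msg', mode='w', entry_toc_depth=2)
db.add_data({'some_entry_id': {'section_run': {'program_name': 'VASP'}}})
db.close()  # 'w' in mode triggers create_db()
print(ArchiveFileDB('example.msg').ids)  # stored keys, padded to utils.default_hash_len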
def create_db(self):
with ArchiveWriter(self._fileobj, len(self._data) + 1, self._entry_toc_depth) as db:
for key, val in self._data.items():
key = self._adjust_key(key)
self._ids.append(key)
db.add(key, val)
db.add(self._infokey, dict(ids=self._ids, entry_toc_depth=self._entry_toc_depth))
def _adjust_key(self, key: str, fill_with: str = ' '):
key = key.rjust(self._key_length, fill_with)
assert len(key) == self._key_length
return key
def add_data(self, data: Union[str, Dict[str, Any], List[Union[str, Dict]]]):
"""
Add data to the msgpack database.
Arguments:
data: Can be a filename, a dictionary, or a list of either
"""
if isinstance(data, str):
key = os.path.basename(data)
if data.endswith('json'):
uid = key.split('.')[0]
val = json.load(open(data))
if isinstance(val, dict):
self._data[uid] = val
else:
try:
uid = key.split('.')[0]
dtype = key.split('.')[-1]
val = open(data).read()
if dtype not in self._data:
self._data[dtype] = {}
if val:
self._data[dtype].update({uid: val})
except Exception:
pass
elif isinstance(data, dict):
for key, val in data.items():
if val:
self._data[key] = val
elif isinstance(data, list):
for item in data:
self.add_data(item)
else:
raise ValueError('data must be a filename, a dict, or a list of either')
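Continuing the sketch above, the three accepted input shapes (paths and keys are hypothetical):
db.add_data('/tmp/abc123.json')  # json file: stored under the filename stem 'abc123'
db.add_data({'entry_id': {'n_atoms': 8}})  # dict: non-empty values stored directly
db.add_data(['/tmp/a.json', '/tmp/b.json'])  # list: each element added recursively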
@property
def ids(self):
if not self._ids:
with ArchiveReader(self._fileobj) as db:
self._ids = db[self._infokey]['ids']
return self._ids
@staticmethod
def _get_index(key: str) -> Union[Tuple[int, int], int, None]:
key = key.strip()
bracket = key.find('[')
if bracket <= 0:
return None
assert key[-1] == ']'
str_index = key[bracket + 1: -1]
if ':' in str_index:
lo_str, hi_str = str_index.split(':')
lo = int(lo_str) if lo_str else 0
hi = int(hi_str) if hi_str else 10000000
return lo, hi
else:
# to maintain the db structure, return (lo, lo + 1)
# to conform with python indexing, return lo
lo = int(str_index)
return lo
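Illustrative behaviour of the bracket parsing (derived from the code above):
assert ArchiveFileDB._get_index('mykey') is None  # no bracket
assert ArchiveFileDB._get_index('mykey[3]') == 3  # single index
assert ArchiveFileDB._get_index('mykey[1:5]') == (1, 5)  # slice
assert ArchiveFileDB._get_index('mykey[:5]') == (0, 5)  # an open lower bound defaults to 0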
def query_archive(f, query_dict: dict):
def _load_data(query_dict: Dict[str, Any], archive_item: ArchiveObject, main_section: bool = False):
......@@ -631,12 +516,16 @@ def query_archive(f, query_dict: dict):
if main_section:
archive_key = adjust_uuid_size(key)
if index is None:
res[key] = _load_data(val, archive_item[archive_key])
elif isinstance(index, int):
res[key] = _load_data(val, archive_item[archive_key])[index]
else:
res[key] = _load_data(val, archive_item[archive_key])[index[0]: index[1]]
try:
if index is None:
res[key] = _load_data(val, archive_item[archive_key])
elif isinstance(index, int):
res[key] = _load_data(val, archive_item[archive_key])[index]
else:
res[key] = _load_data(val, archive_item[archive_key])[index[0]: index[1]]
except Exception:
continue
return res
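A query dict mirrors the nested archive structure: '*' requests a whole subtree and keys may carry the bracket indices handled above. A sketch (the uuid and section names are hypothetical):
results = query_archive('archive.msg', {
    'some_entry_uuid': {
        'section_run[0]': {  # first run only
            'section_system[1:3]': '*'  # a slice of systems
        }
    }
})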
......
......@@ -25,169 +25,110 @@ The html parser was taken from a collection of scripts from FHI without further
import requests
import re
import os
from bs4 import BeautifulSoup
from typing import Dict, List, Any
from time import sleep
import os
from nomad.archive import ArchiveFileDB
from nomad.archive import query_archive, write_archive, ArchiveReader
DB_NAME = '.springer.msg'
spacesRe = re.compile(r"\s+")
required_items = {
'Alphabetic Formula:': 'alphabetic_formula',
'Classification by Properties:': 'classification',
'Compound Class(es):': 'compound_classes',
'Dataset ID': 'id',
'Space Group:': 'space_group_number',
}
symbRe = re.compile(r"[A-Z][a-z]{0,3}")
spaces_re = re.compile(r'\s+')
search_re = re.compile(" href=\"(/isp/[^\"]+)")
formula_re = re.compile(r'([A-Z][a-z]?)([0-9.]*)|\[(.*?)\]([0-9]+)')
numRe = re.compile(r"[0-9.]+")
bracketRe = re.compile(r"\[")
def _update_dict(dict0: Dict[str, float], dict1: Dict[str, float]):
for key, val in dict1.items():
if key in dict0:
dict0[key] += val
else:
dict0[key] = val
closingBraketRe = re.compile(r"\]")
columnNames = {
"Normalized_formula": "normalized_formula",
"Alphabetic Formula:": "alphabetic_formula",
"Classification by Properties:": "classification",
"Compound Class(es):": "compound_classes",
"Dataset ID": "id",
"Space Group:": "space_group_number",
}
def _components(formula_str: str, multiplier: float = 1.0) -> Dict[str, float]:
# match atoms and molecules (in brackets)
components = formula_re.findall(formula_str)
symbol_amount: Dict[str, float] = {}
for component in components:
element, amount_e, molecule, amount_m = component
if element:
if not amount_e:
amount_e = 1.0
_update_dict(symbol_amount, {element: float(amount_e) * multiplier})
def parseSymbol(formulaStr):
m = symbRe.match(formulaStr)
if m:
return (m.group(), formulaStr[len(m.group()):])
else:
return (None, formulaStr)
def parseAmount(formulaStr):
m = numRe.match(formulaStr)
if m:
return (float(m.group()), formulaStr[len(m.group()):])
else:
return (1.0, formulaStr)
def parseSimpleEntry(formulaStr):
sym, rest = parseSymbol(formulaStr)
if sym is None:
return (None, formulaStr)
else:
am, rest = parseAmount(rest)
res = {}
res[sym] = am
return (res, rest)
def parseComplexEntry(formulaStr, flatten=True):
res = {}
m = bracketRe.match(formulaStr)
if m is None:
return (None, formulaStr)
else:
rest = formulaStr[len(m.group()):]
while True:
simE, rest = parseEntry(rest)
if simE is None: break
if '#' in simE:
if 'fragments' in res:
res['fragments'].append(simE)
else:
res['fragments'] = [simE]
else:
for sym, am in simE.items():
if sym in res:
res[sym] += am
else:
res[sym] = am
m2 = closingBraketRe.match(rest)
if m2 is None:
return (None, formulaStr)
rest = rest[len(m2.group()):]
am, rest = parseAmount(rest)
for k, v in res.items():
res[k] = v * am
else:
res['#'] = am
return (res, rest)
def parseEntry(formulaStr):
e, rest = parseSimpleEntry(formulaStr)
if e is not None:
return (e, rest)
return parseComplexEntry(formulaStr)
def parseFormula(formulaStr):
res = {}
rest = formulaStr
while len(rest) > 0:
e, rest = parseEntry(rest)
if e is None:
raise Exception("could not parse entry from %r, did parse %s and failed with %r" % (formulaStr, res, rest))
if '#' in e:
if 'fragments' in res:
res['fragments'].append(e)
else:
res['fragments'] = [e]
else:
for sym, am in e.items():
if sym in res:
res[sym] += am
else:
res[sym] = am
return res
def normalizeFormula(formulaDict):
oldTot = sum(formulaDict.values())
res = {}
for symb, amount in formulaDict.items():
res[symb] = int(amount / oldTot * 100.0 + 0.5)
sortedS = list(res.keys())
sortedS.sort()
resStr = ""
for symb in sortedS:
resStr += symb
resStr += str(res[symb])
return resStr
def parse(htmltext):
elif molecule:
if not amount_m:
amount_m = 1.0
_update_dict(symbol_amount, _components(molecule, float(amount_m) * multiplier))
return symbol_amount
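Worked examples of the recursion, where a bracketed group is parsed as a molecule with the outer amount as multiplier:
assert _components('H2O') == {'H': 2.0, 'O': 1.0}
assert _components('Ca[OH]2') == {'Ca': 1.0, 'O': 2.0, 'H': 2.0}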
def normalize_formula(formula_str: str) -> str:
symbol_amount = _components(formula_str)
total = sum(symbol_amount.values())
symbol_normamount = {e: round(a / total * 100.) for e, a in symbol_amount.items()}
formula_sorted = [
'%s%d' % (s, symbol_normamount[s]) for s in sorted(list(symbol_normamount.keys()))]
return ''.join(formula_sorted)
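A worked example: in Ca[OH]2 the amounts 1, 2 and 2 normalize to 20, 40 and 40 out of 100, and symbols are emitted alphabetically:
assert normalize_formula('H2O') == 'H67O33'  # round(2/3 * 100) = 67, round(1/3 * 100) = 33
assert normalize_formula('Ca[OH]2') == 'Ca20H40O40'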
def parse(htmltext: str) -> Dict[str, str]:
"""
Parses the quantities in columnNames from an html text.
Parses the quantities in required_items from an html text.
"""
soup = BeautifulSoup(htmltext, "html.parser")
results = {}
for el in soup.findAll(attrs={"class": "data-list__content"}):
for it in el.findAll(attrs={"class": "data-list__item"}):
key = it.find(attrs={"class": "data-list__item-key"})
keyStr = key.string
value = spacesRe.sub(" ", it.find(attrs={"class": "data-list__item-value"}).get_text())
if value:
value = value.strip()
if keyStr:
keyStr = keyStr.strip()
if keyStr in columnNames:
keyStr = columnNames[keyStr]
results[keyStr] = value
for item in soup.find_all(attrs={"class": "data-list__item"}):
key = item.find(attrs={"class": "data-list__item-key"})
if not key:
continue
key_str = key.string.strip()
if key_str not in required_items:
continue
value = item.find(attrs={"class": "data-list__item-value"})
value = spaces_re.sub(' ', value.get_text()).strip()
results[required_items[key_str]] = value
if len(results) >= len(required_items):
break
if 'classification' in results:
results['classification'] = [x.strip() for x in results['classification'].split(",")]
results['classification'] = [x for x in results['classification'] if x != '–']
if 'compound_classes' in results:
results['compound_classes'] = [x.strip() for x in results['compound_classes'].split(",")]
results['compound_classes'] = [x for x in results['compound_classes'] if x != '–']
normalized_formula = None
if 'alphabetic_formula' in results:
try:
f = parseFormula(results['alphabetic_formula'])
normalized_formula = normalizeFormula(f)
normalized_formula = normalize_formula(results['alphabetic_formula'])
except Exception:
normalized_formula = None
pass
results['normalized_formula'] = normalized_formula
return results
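A minimal sketch of the markup parse() expects (synthetic html, not actual Springer Materials output):
html = (
    '<div class="data-list__item">'
    '<span class="data-list__item-key">Space Group:</span>'
    '<span class="data-list__item-value">225</span>'
    '</div>')
assert parse(html) == {'space_group_number': '225', 'normalized_formula': None}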
def _merge_dict(dict0, dict1):
def _merge_dict(dict0: Dict[str, Any], dict1: Dict[str, Any]) -> Dict[str, Any]:
if not isinstance(dict1, dict) or not isinstance(dict0, dict):
return dict1
......@@ -199,63 +140,95 @@ def _merge_dict(dict0, dict1):
return dict0
def download_entries(formula, space_group_number):
"""
Downloads the quantities related to a structure from Springer Materials.
"""
entries = {}
root = 'https://materials.springer.com/textsearch?searchTerm=%s&datasourceFacet=sm_isp&substanceId=' % formula
response = requests.get(root)
if response.status_code != 200:
return entries
re_search = re.compile(" href=\"(/isp/[^\"]+)")
paths = re_search.findall(response.text)
paths = ['http://materials.springer.com%s' % p for p in paths]
for path in paths:
def _download(path: str, max_n_query: int = 10) -> str:
n_query = 0
while True:
response = requests.get(path)
if response.status_code != 200:
continue
try:
data = parse(response.text)
except Exception:
continue
space_group_number = data.get('space_group_number', None)
normalized_formula = data.get('normalized_formula', None)
id = data.get('id', None)
if space_group_number is None or normalized_formula is None or id is None:
continue
aformula = data.get('alphabetic_formula', None)
compound = data.get('compound_classes', None)
classification = data.get('classification', None)
entry = dict(id=id, aformula=aformula, url=path, compound=compound, classification=classification)
entries = _merge_dict(entries, {str(space_group_number): {normalized_formula: {id: entry}}})
return entries
if response.status_code == 200:
break
if n_query > max_n_query:
break
n_query += 1
sleep(120)
if response.status_code != 200:
response.raise_for_status()
return response.text
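_download retries a failed request every 120 seconds, up to max_n_query attempts, before raising for a persistent non-200 status. A hypothetical call (the url is the search pattern built below):
page_text = _download(
    'https://materials.springer.com/search?searchTerm=&pageNumber=1'
    '&datasourceFacet=sm_isp&substanceId=', max_n_query=5)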
def get_springer_data(normalized_formula, space_group_number):
def download_springer_data(max_n_query: int = 10):
"""
Queries a msgpack database for springer-related quantities. Downloads data if not
found in database and adds it to database.
Downloads springer quantities for entries not yet in the local database and updates
the msgpack database.
"""
entries = {}
mode = 'w'
if os.path.isfile(DB_NAME):
db = ArchiveFileDB(DB_NAME, 'r')
entries = db.query({str(space_group_number): {normalized_formula: '*'}})
db.close()
mode = 'w+'
if not entries:
formula = numRe.sub('', normalized_formula)
entries = download_entries(formula, space_group_number)
db = ArchiveFileDB(DB_NAME, mode, 3)
for key, entry in entries.items():
db.add_data({key: entry})
db.close()
# load database
# querying the database for an unavailable dataset raises an error,
# so get the toc keys first by making an empty query
archive = ArchiveReader(DB_NAME)
_ = archive._load_toc_block(0)
archive_keys = archive._toc.keys()
sp_data = query_archive(DB_NAME, {spg: '*' for spg in archive_keys})
sp_ids: List[str] = []
for spg in sp_data:
if not isinstance(sp_data[spg], dict):
continue
for formula in sp_data[spg]:
sp_ids += list(sp_data[spg][formula].keys())
page = 1
while True:
# check springer database for new entries by comparing with local database
root = 'https://materials.springer.com/search?searchTerm=&pageNumber=%d&datasourceFacet=sm_isp&substanceId=' % page
req_text = _download(root, max_n_query)
if 'Sorry,' in req_text:
break
paths = search_re.findall(req_text)
for path in paths:
sp_id = os.path.basename(path)
if sp_id in sp_ids:
continue
path = 'http://materials.springer.com%s' % path
req_text = _download(path, max_n_query)
try:
data = parse(req_text)
except Exception:
continue
space_group_number = data.get('space_group_number', None)
normalized_formula = data.get('normalized_formula', None)
if space_group_number is None or normalized_formula is None:
continue
aformula = data.get('alphabetic_formula', None)
compound = data.get('compound_classes', None)
classification = data.get('classification', None)
entry = dict(
aformula=aformula, url=path, compound=compound,
classification=classification)
sp_data = _merge_dict(
sp_data, {str(space_group_number): {normalized_formula: {sp_id: entry}}})
page += 1
write_archive(DB_NAME, len(sp_data), sp_data.items(), entry_toc_depth=1)
def query_springer_data(normalized_formula: str, space_group_number: int) -> Dict[str, Any]:
"""
Queries a msgpack database for springer-related quantities.
"""
entries = query_archive(DB_NAME, {str(space_group_number): {normalized_formula: '*'}})
db_dict = {}
entries = entries.get(str(space_group_number), {}).get(normalized_formula, {})
for id, entry in entries.items():
db_dict[id] = {
'spr_id': id,
for sp_id, entry in entries.items():
db_dict[sp_id] = {
'spr_id': sp_id,
'spr_aformula': entry['aformula'],
'spr_url': entry['url'],
'spr_compound': entry['compound'],
......
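Typical usage of the query (a sketch; the formula and space group are hypothetical, and the remaining keys of the returned dict are elided in this diff):
entries = query_springer_data('Ca20H40O40', 225)
for sp_id, metadata in entries.items():
    print(sp_id, metadata['spr_aformula'], metadata['spr_url'])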
......@@ -28,7 +28,7 @@ from matid.classifications import Class0D, Atom, Class1D, Material2D, Surface, C
from nomad.normalizing import structure
from nomad import utils, config
from nomad.normalizing.normalizer import SystemBasedNormalizer
from nomad.normalizing.data.springer_msgpack import get_springer_data
from nomad.normalizing.data.springer_msgpack import query_springer_data
# use a regular expression to check atom labels; expression is build from list of
# all labels sorted desc to find Br and not B when searching for Br.
......@@ -452,7 +452,7 @@ class SystemNormalizer(SystemBasedNormalizer):
'spr_classification': spr_classification}
elif database == 'msgpack':
dbdict = get_springer_data(normalized_formula, space_group_number)
dbdict = query_springer_data(normalized_formula, space_group_number)
# =============
......
......@@ -134,7 +134,7 @@ class TestAdminUploads:
assert Upload.objects(upload_id=upload_id).first() is None
assert Calc.objects(upload_id=upload_id).first() is None
def test_create_msgpack(self, published):
def test_msgpacked(self, published):
upload_id = published.upload_id
result = click.testing.CliRunner().invoke(
......