Commit 6e23aaec authored by Mohamed, Fawzi Roberto (fawzi)'s avatar Mohamed, Fawzi Roberto (fawzi)
Browse files

big improvement: better matching, more robust handling

parent 454feac8
......@@ -15,7 +15,7 @@ from nomadcore.parser_backend import JsonParseEventsWriterBackend
from nomadcore.parse_streamed_dicts import ParseStreamedDicts
import json
import sqlite3
import sys, time, os.path
import sys, time, os.path, re
from io import open
from nomadcore.local_meta_info import loadJsonFile, InfoKindEl
import logging
......@@ -32,27 +32,64 @@ DB = os.path.exists(DB_FILE)
conn = sqlite3.connect(DB_FILE)
cur = conn.cursor()
def formula2dict(formula):
    """Parse a chemical formula string into a {symbol: count} dictionary.

    An element symbol is one uppercase letter followed by any number of
    lowercase letters, optionally followed by a decimal count (a missing
    count means 1). Counts of a symbol that occurs several times are summed.
    Characters that are not part of any symbol/count pair are skipped and
    reported through a logging warning.

    Args:
        formula: the formula text, e.g. "C2H5OH".

    Returns:
        dict mapping each element symbol to its total integer count.
    """
    atomRe = re.compile(r"(?P<symbol>[A-Z][a-z]*)(?P<count>[0-9]*)")
    fDict = {}
    i = 0  # end position of the previous match, used to detect skipped text
    for m in atomRe.finditer(formula):
        if i != m.start():
            # Report the unparsed gap between two matches. The slice must use
            # the function's own argument and *call* m.start().
            logging.warning("skipping %r when parsing %s", formula[i:m.start()], formula)
        i = m.end()
        at = m.group("symbol")
        count = m.group("count")
        fDict[at] = fDict.get(at, 0) + (int(count) if count else 1)
    if i != len(formula):
        # Text after the last match would otherwise be dropped silently.
        logging.warning("skipping %r when parsing %s", formula[i:], formula)
    return fDict
def dict2formula(fDict):
    """Build a formula string from a {symbol: count} dictionary.

    Symbols are emitted in sorted order. A symbol with a falsy count (0) is
    omitted, and a count of exactly 1 is left implicit.

    Args:
        fDict: dict mapping element symbols to counts.

    Returns:
        The concatenated formula, e.g. {"O": 1, "H": 2} -> "H2O".
    """
    parts = []
    for symbol in sorted(fDict):
        count = fDict[symbol]
        if not count:
            continue  # elements with zero count do not appear in the formula
        parts.append(symbol if count == 1 else symbol + str(count))
    return "".join(parts)
def to100(fDict):
    """Rescale the counts of a {symbol: count} dictionary to percentages.

    Each count is converted to its share of the total, multiplied by 100 and
    rounded up with integer arithmetic (ceiling division), so the resulting
    values may add up to slightly more than 100.

    Args:
        fDict: dict mapping element symbols to integer counts.

    Returns:
        A new dict with the same keys and the rescaled counts, or an empty
        dict when the total count is zero (including an empty input).
    """
    tCount = sum(fDict.values())
    if not tCount:
        return {}
    # (100*v + tCount - 1) // tCount is ceil(100*v / tCount) without floats.
    return {k: (100 * v + tCount - 1) // tCount for k, v in fDict.items()}
def classify4me(INPUT_FORMULA, backend):
logging.info('(%r) classified according to Springer Materials as follow: \n \n', INPUT_FORMULA )
##print('***',INPUT_FORMULA, ' classified in Springer Materials as:','\n{')
normalizedF = dict2formula(to100(formula2dict(INPUT_FORMULA)))
# logging.warn('%r -> %r starting Springer Materials normalization', INPUT_FORMULA, normalizedF)
res={}
cur.execute("""
select entry.space_group_number, entry.entry_id from entry
where entry.alphabetic_formula = ( %r ) group by entry.space_group_number, entry.entry_id;""" % INPUT_FORMULA)
select entry.space_group_number, entry.entry_id, entry.alphabetic_formula from entry
where entry.normalized_formula = ( %r ) group by entry.space_group_number, entry.entry_id;""" % normalizedF)
#build dictionary
#print(cur.fetchone()[0])
results = cur.fetchall()
for group , sprId in results:
if not group in res:
res[group]={}
if not "section_springer_id" in res[group]:
res[group]["section_springer_id"]=[]
spRefs = res[group]["section_springer_id"]
for group , sprId, formula in results:
k = (group,formula)
if not k in res:
res[k]={}
if not "section_springer_id" in res[k]:
res[k]["section_springer_id"]=[]
spRefs = res[k]["section_springer_id"]
if sprId.startswith("sd_"):
url="http://materials.springer.com/isp/crystallographic/docs/" + sprId
else:
......@@ -67,25 +104,26 @@ def classify4me(INPUT_FORMULA, backend):
cur.execute("""
select entry.space_group_number, compound_classes.compound_class_name, count(*)
select entry.space_group_number, entry.alphabetic_formula, compound_classes.compound_class_name, count(*)
from
entry
join entry_compound_class on entry.entry_nr = entry_compound_class.entry_nr
join compound_classes on compound_classes.compound_class_nr = entry_compound_class.compound_class_nr
where
entry.alphabetic_formula = ( %r )
entry.normalized_formula = ( %r )
group by entry.space_group_number, compound_classes.compound_class_name
;""" % INPUT_FORMULA)
;""" % normalizedF)
results = cur.fetchall()
#print('compound class:',results, '\n')
for group , sprCC, count in results:
if not group in res:
res[group]={}
if not "section_springer_compound_class" in res[group]:
res[group]["section_springer_compound_class"] = []
spRefs = res[group]["section_springer_compound_class"]
for group , formula, sprCC, count in results:
k = (group, formula)
if not k in res:
res[k]={}
if not "section_springer_compound_class" in res[k]:
res[k]["section_springer_compound_class"] = []
spRefs = res[k]["section_springer_compound_class"]
spRefs.append({
"springer_compound_class": sprCC,
"springer_number_of_compound_class_reference_per_material": count
......@@ -94,24 +132,25 @@ def classify4me(INPUT_FORMULA, backend):
cur.execute("""
select entry.space_group_number, classification.classification_name, count(*)
select entry.space_group_number, entry.alphabetic_formula, classification.classification_name, count(*)
from
entry
join entry_classification on entry.entry_nr = entry_classification.entry_nr
join classification on classification.classification_nr = entry_classification.classification_nr
where
entry.alphabetic_formula = ( %r )
entry.normalized_formula = ( %r )
group by entry.space_group_number, classification.classification_name
;""" % INPUT_FORMULA)
;""" % normalizedF)
results = cur.fetchall()
#print('classification:', results, '\n')
for group , sprC, count in results:
if not group in res:
res[group]={}
if not "section_springer_classification" in res[group]:
res[group]["section_springer_classification"] = []
spRefs = res[group]["section_springer_classification"]
for group , formula, sprC, count in results:
k = (group, formula)
if not k in res:
res[k]={}
if not "section_springer_classification" in res[k]:
res[k]["section_springer_classification"] = []
spRefs = res[k]["section_springer_classification"]
spRefs.append({
"springer_classification": sprC,
"springer_number_of_classification_reference_per_material": count
......@@ -120,27 +159,28 @@ def classify4me(INPUT_FORMULA, backend):
### fOut.write('classification:\n %r \n' % results)
cur.execute("""
select entry.space_group_number, reference.reference_name, entry.entry_id
select entry.space_group_number, entry.alphabetic_formula, reference.reference_name, entry.entry_id
from
entry
join entry_reference on entry.entry_nr = entry_reference.entry_nr
join reference on reference.reference_nr = entry_reference.reference_nr
where
entry.alphabetic_formula = ( %r )
entry.normalized_formula = ( %r )
group by entry.space_group_number, reference.reference_name
;""" % INPUT_FORMULA)
;""" % normalizedF)
results = cur.fetchall()
#print('references:',results)
###
### fOut.write('references: \n %r \n' % results)
#to be corrected
for group , sprRef, entryId in results:
if not group in res:
res[group]={}
if not "section_springer_id" in res[group]:
res[group]["section_springer_id"]=[]
spRefs = res[group]["section_springer_id"]
for group , formula, sprRef, entryId in results:
k = (group, formula)
if not k in res:
res[k]={}
if not "section_springer_id" in res[k]:
res[k]["section_springer_id"]=[]
spRefs = res[k]["section_springer_id"]
if entryId.startswith("sd_"):
......@@ -166,13 +206,13 @@ def classify4me(INPUT_FORMULA, backend):
})
#print('}')
for sp,entry in res.items():
for (sp, f),entry in res.items():
try:
spNr = int(sp)
except:
spNr = -1
entry["springer_formula"] = INPUT_FORMULA
entry["springer_space_group_number"] = spNr
entry["springer_formula"] = f
#json.dump(results, fOut, sort_keys=False, ensure_ascii=False, indent=2)
### fOut.write("} \n")
......@@ -238,28 +278,32 @@ def main():
backend = JsonParseEventsWriterBackend(metaInfoEnv, fOut)
#Start
calcContext=sys.argv[1]
calcContext=sys.argv[1]+"/_"
backend.startedParsingSession(
calcContext,
parserInfo = {'name':'SpringerNormalizer', 'version': '1.0'})
calcContext,
parserInfo = {'name':'SpringerNormalizer', 'version': '1.0'})
res = "ParseSuccess"
dictReader = ParseStreamedDicts(sys.stdin)
knownFormulas = set()
while True:
sectSys = dictReader.readNextDict()
if sectSys is None:
springerInfo = dictReader.readNextDict()
if springerInfo is None:
break
try:
formula = sectSys.get("system_composition")
if formula and not formula in knownFormulas:
knownFormulas.add(formula)
backend.openContext(calcContext)
classify4me(formula, backend)
backend.closeContext(calcContext)
fOut.flush()
except:
logging.exception("exception trying to calculate springer data for %s", sectSys)
backend.finishedParsingSession("ParseSuccess", None)
calcContext = springerInfo['calculation_uri']
knownFormulas = set()
for sysInfo in springerInfo["system_info"]:
try:
formula = sysInfo.get("system_composition")
if formula and formula not in knownFormulas:
knownFormulas.add(formula)
backend.openContext(calcContext)
classify4me(formula, backend)
backend.closeContext(calcContext)
fOut.flush()
except:
res = "ParseFailure"
logging.exception("exception trying to calculate springer data for %s", sysInfo)
backend.finishedParsingSession(res, None)
fOut.flush()
###
# con.create_function("", 1, )
......
{
"system_composition": ["MgNd"],
"calculation_uri": "nmd://R.../C..."
}
......@@ -22,15 +22,31 @@ import scala.collection.breakOut
import eu.nomad_lab.normalize.ExternalNormalizerGenerator
import eu.nomad_lab.meta
import eu.nomad_lab.query
import eu.nomad_lab.ref.ObjectKind
import eu.nomad_lab.resolve._
import eu.nomad_lab.h5.EmitJsonVisitor
import eu.nomad_lab.h5.H5EagerScanner
import eu.nomad_lab.h5.SectionH5
import eu.nomad_lab.h5.CalculationH5
import eu.nomad_lab.parsers.ExternalParserWrapper
import eu.nomad_lab.normalize.Normalizer
import eu.nomad_lab.JsonUtils
import eu.nomad_lab.JsonSupport.formats
import eu.nomad_lab.JsonSupport
import scala.collection.mutable.StringBuilder
/**
 * Information on a system that can match a springer material data
 *
 * @param system_composition string value of the "system_composition" entry of a system section
 * @param space_group_3D_number integer value of the "space_group_3D_number" entry of that section,
 *                              None when the section has no such value
 */
case class SpringerSystemInfo(
system_composition: String,
space_group_3D_number: Option[Int]
)
/**
 * Information to match a calculation to springer material data
 *
 * Serialized to JSON and written to the standard input of the external python normalizer,
 * one object per calculation.
 *
 * @param calculation_uri URI string identifying the calculation the systems belong to
 * @param system_info the distinct systems collected for that calculation
 */
case class SpringerInfo(
calculation_uri: String,
system_info: Seq[SpringerSystemInfo]
)
object SpringerNormalizer extends ExternalNormalizerGenerator(
name = "SpringerNormalizer",
info = jn.JObject(
......@@ -46,8 +62,7 @@ object SpringerNormalizer extends ExternalNormalizerGenerator(
}(breakOut): List[(String, jn.JString)])
)) :: Nil
),
context = "calculation_context",
filter = query.CompiledQuery(query.QueryExpression("program_name = \"FHI-aims\""), meta.KnownMetaInfoEnvs.publicMeta),
context = "archive_context",
cmd = Seq(DefaultPythonInterpreter.pythonExe(), "${envDir}/normalizers/springer/normalizer/normalizer-springer/classify4me_SM_normalizer.py",
"${contextUri}", "${archivePath}"),
resList = Seq(
......@@ -69,7 +84,7 @@ object SpringerNormalizer extends ExternalNormalizerGenerator(
override def stdInHandler(context: ResolvedRef)(wrapper: ExternalParserWrapper)(pIn: java.io.OutputStream): Unit = {
val out: java.io.Writer = new java.io.BufferedWriter(new java.io.OutputStreamWriter(pIn));
val trace: Boolean = Normalizer.trace
val trace: Boolean = Normalizer.trace || true
val stringBuilder = if (trace)
new StringBuilder
else
......@@ -86,41 +101,52 @@ object SpringerNormalizer extends ExternalNormalizerGenerator(
}
}
writeOut("[")
var isFirst = true
var isFirst: Boolean = true
def handleCalculation(c: CalculationH5): Unit = {
val sysTable = c.sectionTable(Seq("section_run", "section_system"))
var systems: Set[SpringerSystemInfo] = Set()
def handleSysSection(sysSection: SectionH5): Unit = {
sysSection.maybeValue("system_composition").map(_.stringValue) match {
case None => ()
case Some(formula) =>
val spaceG = sysSection.maybeValue("space_group_3D_number").map(_.intValue)
systems += SpringerSystemInfo(formula, spaceG)
}
}
val nSys = sysTable.lengthL
if (nSys > 0)
handleSysSection(sysTable(0))
if (nSys > 1)
handleSysSection(sysTable(nSys - 1))
if (nSys > 2)
handleSysSection(sysTable(nSys - 2))
if (!systems.isEmpty) {
if (!isFirst)
writeOut(",")
else
isFirst = false
writeOut(JsonSupport.writeNormalizedStr(
SpringerInfo(c.toRef.toUriStr(ObjectKind.NormalizedData), systems.toSeq)
))
}
flush()
}
try {
context match {
case Calculation(archiveSet, c) =>
val sysTable = c.sectionTable(Seq("section_run", "section_system"))
def outputSysSection(sysSection: SectionH5): Unit = {
if (!isFirst)
writeOut(",")
else
isFirst = false
//writeOut(s"""{
// | "context": ${JsonUtils.escapeString(m.toRef.toUriStr(archiveSet.objectKind))},
// | "section_system": """.stripMargin)
val visitor = new EmitJsonVisitor(
writeOut = writeOut
)
val scanner = new H5EagerScanner
scanner.scanResolvedRef(Section(archiveSet, sysSection), visitor)
//writeOut("}")
flush()
case Archive(archiveSet, a) =>
var i: Int = 0
for (c <- a.calculations) {
//if (i < 5)
handleCalculation(c)
i += 1
}
val nSys = sysTable.lengthL
if (nSys > 0)
// outputSysSection(sysTable(0))
//if (nSys > 1)
outputSysSection(sysTable(nSys - 1))
writeOut("]")
flush()
case Calculation(archiveSet, c) =>
handleCalculation(c)
case r =>
throw new Exception(s"FhiAimsBasisNormalizer expected a calculation as context, but got $r")
throw new Exception(s"SpringerNormalizer expected a calculation as context, but got $r")
}
} finally {
writeOut("]")
out.close()
pIn.close()
wrapper.sendStatus = ExternalParserWrapper.SendStatus.Finished
......
[
{
"calculation_uri": "nmd://R.../C...",
"system_info": [{
"system_composition": "MgNd",
"space_group_3D_number": 3
}]
},
{
"calculation_uri": "nmd://R.../C...",
"system_info": [{
"system_composition": "MgO",
"space_group_3D_number": 3
}]
},
{
"calculation_uri": "nmd://R.../C...",
"system_info": [{
"system_composition": "CoNd",
"space_group_3D_number": 3
}]
}
]
[{"calculation_uri":"nmd://N-8wH3zpeTbJWH_0I0OjMFnAVK7IM/C--6kVLos2hak5UF8yw5KCx5YUnkA","system_info":[{"system_composition":"HfYCr2"}]}
]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment