# Matches one element symbol (capital letter plus optional lowercase letters)
# followed by an optional integer count, e.g. "Cr2" or "Y".
_ATOM_RE = re.compile(r"(?P<symbol>[A-Z][a-z]*)(?P<count>[0-9]*)")


def formula2dict(formula):
    """Parse a chemical formula string into a ``{symbol: count}`` dict.

    A symbol without an explicit count counts as 1, and repeated symbols are
    summed (``"H2OH"`` gives ``{"H": 3, "O": 1}``).  Characters that are not
    part of a ``Symbol[count]`` token are skipped and reported with a warning.

    Args:
        formula: formula such as ``"HfYCr2"``.

    Returns:
        dict mapping each element symbol to its total integer count; empty
        when nothing could be parsed.
    """
    counts = {}
    pos = 0  # end of the previously consumed token
    for m in _ATOM_RE.finditer(formula):
        if pos != m.start():
            # Report the gap between two tokens.  The slice must use the
            # function argument and *call* m.start().
            logging.warning("skipping %r when parsing %s",
                            formula[pos:m.start()], formula)
        pos = m.end()
        symbol = m.group("symbol")
        count_str = m.group("count")
        count = int(count_str) if count_str else 1
        counts[symbol] = counts.get(symbol, 0) + count
    if pos != len(formula):
        # Unparsable tail after the last token would otherwise vanish silently.
        logging.warning("skipping %r when parsing %s", formula[pos:], formula)
    return counts


def dict2formula(fDict):
    """Build a formula string from a ``{symbol: count}`` dict.

    Symbols are emitted in sorted order.  Entries with a falsy count (0) are
    dropped, and a count of 1 is left implicit.

    Args:
        fDict: mapping of element symbol to count.

    Returns:
        The formula string, e.g. ``{"Nd": 50, "Mg": 50}`` -> ``"Mg50Nd50"``.
    """
    parts = []
    for symbol in sorted(fDict):
        count = fDict[symbol]
        if not count:
            continue
        parts.append(symbol if count == 1 else symbol + str(count))
    return "".join(parts)


def to100(fDict):
    """Rescale counts to integer parts per hundred of the total count.

    Each value becomes ``ceil(100 * v / total)`` using integer arithmetic.
    Because every entry is rounded up, the results may sum to more than 100.

    Args:
        fDict: mapping of element symbol to count.

    Returns:
        A new dict with rescaled counts; empty when the total count is 0.
    """
    total = sum(fDict.values())
    if not total:
        return {}
    return {symbol: (100 * v + total - 1) // total
            for symbol, v in fDict.items()}
entry.space_group_number, entry.entry_id from entry - where entry.alphabetic_formula = ( %r ) group by entry.space_group_number, entry.entry_id;""" % INPUT_FORMULA) + select entry.space_group_number, entry.entry_id, entry.alphabetic_formula from entry + where entry.normalized_formula = ( %r ) group by entry.space_group_number, entry.entry_id;""" % normalizedF) #build dictionary #print(cur.fetchone()[0]) results = cur.fetchall() - for group , sprId in results: - if not group in res: - res[group]={} - if not "section_springer_id" in res[group]: - res[group]["section_springer_id"]=[] - spRefs = res[group]["section_springer_id"] + for group , sprId, formula in results: + k = (group,formula) + if not k in res: + res[k]={} + if not "section_springer_id" in res[k]: + res[k]["section_springer_id"]=[] + spRefs = res[k]["section_springer_id"] if sprId.startswith("sd_"): url="http://materials.springer.com/isp/crystallographic/docs/" + sprId else: @@ -67,25 +104,26 @@ def classify4me(INPUT_FORMULA, backend): cur.execute(""" - select entry.space_group_number, compound_classes.compound_class_name, count(*) + select entry.space_group_number, entry.alphabetic_formula, compound_classes.compound_class_name, count(*) from entry join entry_compound_class on entry.entry_nr = entry_compound_class.entry_nr join compound_classes on compound_classes.compound_class_nr = entry_compound_class.compound_class_nr where - entry.alphabetic_formula = ( %r ) + entry.normalized_formula = ( %r ) group by entry.space_group_number, compound_classes.compound_class_name - ;""" % INPUT_FORMULA) + ;""" % normalizedF) results = cur.fetchall() #print('compound class:',results, '\n') - for group , sprCC, count in results: - if not group in res: - res[group]={} - if not "section_springer_compound_class" in res[group]: - res[group]["section_springer_compound_class"] = [] - spRefs = res[group]["section_springer_compound_class"] + for group , formula, sprCC, count in results: + k = (group, formula) + if not k in 
res: + res[k]={} + if not "section_springer_compound_class" in res[k]: + res[k]["section_springer_compound_class"] = [] + spRefs = res[k]["section_springer_compound_class"] spRefs.append({ "springer_compound_class": sprCC, "springer_number_of_compound_class_reference_per_material": count @@ -94,24 +132,25 @@ def classify4me(INPUT_FORMULA, backend): cur.execute(""" - select entry.space_group_number, classification.classification_name, count(*) + select entry.space_group_number, entry.alphabetic_formula, classification.classification_name, count(*) from entry join entry_classification on entry.entry_nr = entry_classification.entry_nr join classification on classification.classification_nr = entry_classification.classification_nr where - entry.alphabetic_formula = ( %r ) + entry.normalized_formula = ( %r ) group by entry.space_group_number, classification.classification_name - ;""" % INPUT_FORMULA) + ;""" % normalizedF) results = cur.fetchall() #print('classification:', results, '\n') - for group , sprC, count in results: - if not group in res: - res[group]={} - if not "section_springer_classification" in res[group]: - res[group]["section_springer_classification"] = [] - spRefs = res[group]["section_springer_classification"] + for group , formula, sprC, count in results: + k = (group, formula) + if not k in res: + res[k]={} + if not "section_springer_classification" in res[k]: + res[k]["section_springer_classification"] = [] + spRefs = res[k]["section_springer_classification"] spRefs.append({ "springer_classification": sprC, "springer_number_of_classification_reference_per_material": count @@ -120,27 +159,28 @@ def classify4me(INPUT_FORMULA, backend): ### fOut.write('classification:\n %r \n' % results) cur.execute(""" - select entry.space_group_number, reference.reference_name, entry.entry_id + select entry.space_group_number, entry.alphabetic_formula, reference.reference_name, entry.entry_id from entry join entry_reference on entry.entry_nr = entry_reference.entry_nr 
join reference on reference.reference_nr = entry_reference.reference_nr where - entry.alphabetic_formula = ( %r ) + entry.normalized_formula = ( %r ) group by entry.space_group_number, reference.reference_name - ;""" % INPUT_FORMULA) + ;""" % normalizedF) results = cur.fetchall() #print('references:',results) ### ### fOut.write('references: \n %r \n' % results) #to be corrected - for group , sprRef, entryId in results: - if not group in res: - res[group]={} - if not "section_springer_id" in res[group]: - res[group]["section_springer_id"]=[] - spRefs = res[group]["section_springer_id"] + for group , formula, sprRef, entryId in results: + k = (group, formula) + if not k in res: + res[k]={} + if not "section_springer_id" in res[k]: + res[k]["section_springer_id"]=[] + spRefs = res[k]["section_springer_id"] if entryId.startswith("sd_"): @@ -166,13 +206,13 @@ def classify4me(INPUT_FORMULA, backend): }) #print('}') - for sp,entry in res.items(): + for (sp, f),entry in res.items(): try: spNr = int(sp) except: spNr = -1 - entry["springer_formula"] = INPUT_FORMULA entry["springer_space_group_number"] = spNr + entry["springer_formula"] = f #json.dump(results, fOut, sort_keys=False, ensure_ascii=False, indent=2) ### fOut.write("} \n") @@ -238,28 +278,32 @@ def main(): backend = JsonParseEventsWriterBackend(metaInfoEnv, fOut) #Start - calcContext=sys.argv[1] + calcContext=sys.argv[1]+"/_" backend.startedParsingSession( - calcContext, - parserInfo = {'name':'SpringerNormalizer', 'version': '1.0'}) - + calcContext, + parserInfo = {'name':'SpringerNormalizer', 'version': '1.0'}) + res = "ParseSuccess" + dictReader = ParseStreamedDicts(sys.stdin) - knownFormulas = set() while True: - sectSys = dictReader.readNextDict() - if sectSys is None: + springerInfo = dictReader.readNextDict() + if springerInfo is None: break - try: - formula = sectSys.get("system_composition") - if formula and not formula in knownFormulas: - knownFormulas.add(formula) - backend.openContext(calcContext) - 
classify4me(formula, backend) - backend.closeContext(calcContext) - fOut.flush() - except: - logging.exception("exception trying to calculate springer data for %s", sectSys) - backend.finishedParsingSession("ParseSuccess", None) + calcContext = springerInfo['calculation_uri'] + knownFormulas = set() + for sysInfo in springerInfo["system_info"]: + try: + formula = sysInfo.get("system_composition") + if formula and formula not in knownFormulas: + knownFormulas.add(formula) + backend.openContext(calcContext) + classify4me(formula, backend) + backend.closeContext(calcContext) + fOut.flush() + except: + res = "ParseFailure" + logging.exception("exception trying to calculate springer data for %s", sysInfo) + backend.finishedParsingSession(res, None) fOut.flush() ### # con.create_function("", 1, ) diff --git a/normalizer/normalizer-springer/input.txt b/normalizer/normalizer-springer/input.txt deleted file mode 100644 index 27a78fdb93e29bb21944029c1bf7e5ac6c8d9f84..0000000000000000000000000000000000000000 --- a/normalizer/normalizer-springer/input.txt +++ /dev/null @@ -1,4 +0,0 @@ -{ -"system_composition": ["MgNd"], -"calculation_uri": "nmd://R.../C..." 
/**
 * Description of one system of a calculation that may match Springer
 * Materials data.
 *
 * Field names are kept in snake_case on purpose: they are the keys of the
 * JSON record written for the external normalizer.
 *
 * @param system_composition chemical formula of the system
 * @param space_group_3D_number space group number, if one is available
 */
case class SpringerSystemInfo(system_composition: String, space_group_3D_number: Option[Int])

/**
 * Everything needed to match one calculation against Springer Materials data.
 *
 * @param calculation_uri URI identifying the calculation
 * @param system_info the systems of that calculation to look up
 */
case class SpringerInfo(calculation_uri: String, system_info: Seq[SpringerSystemInfo])
/**
 * Writes the Springer matching record of one calculation to the output.
 *
 * Looks at the first, the last and the second-to-last `section_system` of
 * the calculation, keeps those that have a `system_composition`, and, if any
 * remain, writes them as one JSON `SpringerInfo` object (preceded by a comma
 * unless it is the first object written). Always flushes at the end.
 *
 * Relies on `writeOut`, `flush` and the mutable `isFirst` flag of the
 * enclosing scope.
 */
def handleCalculation(c: CalculationH5): Unit = {
  val systemSections = c.sectionTable(Seq("section_run", "section_system"))
  val sectionCount = systemSections.lengthL
  var collected: Set[SpringerSystemInfo] = Set()

  // Records a section only when it carries a composition; the space group
  // number is optional.
  def collect(section: SectionH5): Unit = {
    for (composition <- section.maybeValue("system_composition").map(_.stringValue)) {
      val spaceGroup = section.maybeValue("space_group_3D_number").map(_.intValue)
      collected += SpringerSystemInfo(composition, spaceGroup)
    }
  }

  // Same visiting order as before: first, last, second-to-last.
  if (sectionCount > 0)
    collect(systemSections(0))
  if (sectionCount > 1)
    collect(systemSections(sectionCount - 1))
  if (sectionCount > 2)
    collect(systemSections(sectionCount - 2))

  if (collected.nonEmpty) {
    // Separate consecutive JSON objects inside the surrounding array.
    if (isFirst)
      isFirst = false
    else
      writeOut(",")
    val info = SpringerInfo(c.toRef.toUriStr(ObjectKind.NormalizedData), collected.toSeq)
    writeOut(JsonSupport.writeNormalizedStr(info))
  }
  flush()
}
visitor) - //writeOut("}") - flush() + case Archive(archiveSet, a) => + var i: Int = 0 + for (c <- a.calculations) { + //if (i < 5) + handleCalculation(c) + i += 1 } - - val nSys = sysTable.lengthL - if (nSys > 0) - // outputSysSection(sysTable(0)) - //if (nSys > 1) - outputSysSection(sysTable(nSys - 1)) - - writeOut("]") - flush() + case Calculation(archiveSet, c) => + handleCalculation(c) case r => - throw new Exception(s"FhiAimsBasisNormalizer expected a calculation as context, but got $r") + throw new Exception(s"SpringerNormalizer expected a calculation as context, but got $r") } } finally { + writeOut("]") out.close() pIn.close() wrapper.sendStatus = ExternalParserWrapper.SendStatus.Finished diff --git a/test/examples/input.txt b/test/examples/input.txt new file mode 100644 index 0000000000000000000000000000000000000000..5056529d0bfa2d616f040249383a63c7d9391c03 --- /dev/null +++ b/test/examples/input.txt @@ -0,0 +1,24 @@ +[ +{ +"calculation_uri": "nmd://R.../C...", +"system_info": [{ +"system_composition": "MgNd", +"space_group_3D_number": 3 +}] +}, +{ +"calculation_uri": "nmd://R.../C...", +"system_info": [{ +"system_composition": "MgO", +"space_group_3D_number": 3 +}] +} +{ +"calculation_uri": "nmd://R.../C...", +"system_info": [{ +"system_composition": "CoNd", +"space_group_3D_number": 3 +}] +} + +] diff --git a/test/examples/input2.txt b/test/examples/input2.txt new file mode 100644 index 0000000000000000000000000000000000000000..18c8139af42407db3c1ab0b7b096551d90b94bf1 --- /dev/null +++ b/test/examples/input2.txt @@ -0,0 +1,2 @@ +[{"calculation_uri":"nmd://N-8wH3zpeTbJWH_0I0OjMFnAVK7IM/C--6kVLos2hak5UF8yw5KCx5YUnkA","system_info":[{"system_composition":"HfYCr2"}]} +]