Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
nomad-lab
normalizer-springer
Commits
6e23aaec
Commit
6e23aaec
authored
Jan 18, 2018
by
Mohamed, Fawzi Roberto (fawzi)
Browse files
big improvement: better matching, more robust handling
parent
454feac8
Changes
5
Hide whitespace changes
Inline
Side-by-side
normalizer/normalizer-springer/classify4me_SM_normalizer.py
View file @
6e23aaec
...
...
@@ -15,7 +15,7 @@ from nomadcore.parser_backend import JsonParseEventsWriterBackend
from
nomadcore.parse_streamed_dicts
import
ParseStreamedDicts
import
json
import
sqlite3
import
sys
,
time
,
os
.
path
import
sys
,
time
,
os
.
path
,
re
from
io
import
open
from
nomadcore.local_meta_info
import
loadJsonFile
,
InfoKindEl
import
logging
...
...
@@ -32,27 +32,64 @@ DB = os.path.exists(DB_FILE)
conn
=
sqlite3
.
connect
(
DB_FILE
)
cur
=
conn
.
cursor
()
def formula2dict(formula):
    """Parse a chemical formula string into a symbol -> count mapping.

    A symbol is one uppercase letter followed by any number of lowercase
    letters; a run of digits directly after it is its count (a missing
    count means 1).  Counts of a symbol that occurs several times are
    summed.  Any text that is not part of a symbol/count match is skipped
    and reported with a logged warning.

    Args:
        formula: the formula string to parse, e.g. "HfYCr2".

    Returns:
        dict mapping each symbol found to its total integer count
        (empty when nothing matches).
    """
    atomRe = re.compile(r"(?P<symbol>[A-Z][a-z]*)(?P<count>[0-9]*)")
    fDict = {}
    i = 0  # end offset of the previous match, used to detect skipped text
    for m in atomRe.finditer(formula):
        if i != m.start():
            # Report against the local argument: the earlier code referenced
            # INPUT_FORMULA (not defined in this scope) and sliced with the
            # method object m.start instead of calling it.
            logging.warning("skipping %r when parsing %s",
                            formula[i:m.start()], formula)
        i = m.end()
        at = m.group("symbol")
        count = m.group("count")
        fDict[at] = fDict.get(at, 0) + (int(count) if count else 1)
    if i != len(formula):
        # Trailing text after the last match is ignored as well; say so.
        logging.warning("skipping %r when parsing %s", formula[i:], formula)
    return fDict
def dict2formula(fDict):
    """Render a symbol -> count mapping as a formula string.

    Symbols are written in sorted order.  Entries whose count is falsy
    (e.g. 0) are left out, and a count of 1 is not written explicitly.

    Args:
        fDict: dict mapping symbol strings to counts.

    Returns:
        The concatenated formula string ("" for an empty mapping).
    """
    parts = []
    for symbol in sorted(fDict):
        count = fDict[symbol]
        if not count:
            continue
        parts.append(symbol if count == 1 else symbol + str(count))
    # Join once instead of growing the string with += inside the loop.
    return "".join(parts)
def to100(fDict):
    """Rescale the counts of a symbol -> count mapping to parts per hundred.

    Each count is multiplied by 100 and divided by the sum of all counts,
    rounding up (integer ceiling division).  A mapping whose counts sum
    to zero (including an empty one) yields an empty dict.

    Args:
        fDict: dict mapping symbol strings to integer counts.

    Returns:
        A new dict with the same keys and the rescaled integer counts.
    """
    total = sum(fDict.values())
    if not total:
        return {}
    # Adding (total - 1) before the floor division rounds the quotient up.
    return {
        symbol: (100 * amount + total - 1) // total
        for symbol, amount in fDict.items()
    }
def
classify4me
(
INPUT_FORMULA
,
backend
):
logging
.
info
(
'(%r) classified according to Springer Materials as follow:
\n
\n
'
,
INPUT_FORMULA
)
##print('***',INPUT_FORMULA, ' classified in Springer Materials as:','\n{')
normalizedF
=
dict2formula
(
to100
(
formula2dict
(
INPUT_FORMULA
)))
# logging.warn('%r -> %r starting Springer Materials normalization', INPUT_FORMULA, normalizedF)
res
=
{}
cur
.
execute
(
"""
select entry.space_group_number, entry.entry_id from entry
where entry.
alphabetic
_formula = ( %r ) group by entry.space_group_number, entry.entry_id;"""
%
INPUT_FORMULA
)
select entry.space_group_number, entry.entry_id
, entry.alphabetic_formula
from entry
where entry.
normalized
_formula = ( %r ) group by entry.space_group_number, entry.entry_id;"""
%
normalizedF
)
#build dictionary
#print(cur.fetchone()[0])
results
=
cur
.
fetchall
()
for
group
,
sprId
in
results
:
if
not
group
in
res
:
res
[
group
]
=
{}
if
not
"section_springer_id"
in
res
[
group
]:
res
[
group
][
"section_springer_id"
]
=
[]
spRefs
=
res
[
group
][
"section_springer_id"
]
for
group
,
sprId
,
formula
in
results
:
k
=
(
group
,
formula
)
if
not
k
in
res
:
res
[
k
]
=
{}
if
not
"section_springer_id"
in
res
[
k
]:
res
[
k
][
"section_springer_id"
]
=
[]
spRefs
=
res
[
k
][
"section_springer_id"
]
if
sprId
.
startswith
(
"sd_"
):
url
=
"http://materials.springer.com/isp/crystallographic/docs/"
+
sprId
else
:
...
...
@@ -67,25 +104,26 @@ def classify4me(INPUT_FORMULA, backend):
cur
.
execute
(
"""
select entry.space_group_number, compound_classes.compound_class_name, count(*)
select entry.space_group_number,
entry.alphabetic_formula,
compound_classes.compound_class_name, count(*)
from
entry
join entry_compound_class on entry.entry_nr = entry_compound_class.entry_nr
join compound_classes on compound_classes.compound_class_nr = entry_compound_class.compound_class_nr
where
entry.
alphabetic
_formula = ( %r )
entry.
normalized
_formula = ( %r )
group by entry.space_group_number, compound_classes.compound_class_name
;"""
%
INPUT_FORMULA
)
;"""
%
normalizedF
)
results
=
cur
.
fetchall
()
#print('compound class:',results, '\n')
for
group
,
sprCC
,
count
in
results
:
if
not
group
in
res
:
res
[
group
]
=
{}
if
not
"section_springer_compound_class"
in
res
[
group
]:
res
[
group
][
"section_springer_compound_class"
]
=
[]
spRefs
=
res
[
group
][
"section_springer_compound_class"
]
for
group
,
formula
,
sprCC
,
count
in
results
:
k
=
(
group
,
formula
)
if
not
k
in
res
:
res
[
k
]
=
{}
if
not
"section_springer_compound_class"
in
res
[
k
]:
res
[
k
][
"section_springer_compound_class"
]
=
[]
spRefs
=
res
[
k
][
"section_springer_compound_class"
]
spRefs
.
append
({
"springer_compound_class"
:
sprCC
,
"springer_number_of_compound_class_reference_per_material"
:
count
...
...
@@ -94,24 +132,25 @@ def classify4me(INPUT_FORMULA, backend):
cur
.
execute
(
"""
select entry.space_group_number, classification.classification_name, count(*)
select entry.space_group_number,
entry.alphabetic_formula,
classification.classification_name, count(*)
from
entry
join entry_classification on entry.entry_nr = entry_classification.entry_nr
join classification on classification.classification_nr = entry_classification.classification_nr
where
entry.
alphabetic
_formula = ( %r )
entry.
normalized
_formula = ( %r )
group by entry.space_group_number, classification.classification_name
;"""
%
INPUT_FORMULA
)
;"""
%
normalizedF
)
results
=
cur
.
fetchall
()
#print('classification:', results, '\n')
for
group
,
sprC
,
count
in
results
:
if
not
group
in
res
:
res
[
group
]
=
{}
if
not
"section_springer_classification"
in
res
[
group
]:
res
[
group
][
"section_springer_classification"
]
=
[]
spRefs
=
res
[
group
][
"section_springer_classification"
]
for
group
,
formula
,
sprC
,
count
in
results
:
k
=
(
group
,
formula
)
if
not
k
in
res
:
res
[
k
]
=
{}
if
not
"section_springer_classification"
in
res
[
k
]:
res
[
k
][
"section_springer_classification"
]
=
[]
spRefs
=
res
[
k
][
"section_springer_classification"
]
spRefs
.
append
({
"springer_classification"
:
sprC
,
"springer_number_of_classification_reference_per_material"
:
count
...
...
@@ -120,27 +159,28 @@ def classify4me(INPUT_FORMULA, backend):
### fOut.write('classification:\n %r \n' % results)
cur
.
execute
(
"""
select entry.space_group_number, reference.reference_name, entry.entry_id
select entry.space_group_number,
entry.alphabetic_formula,
reference.reference_name, entry.entry_id
from
entry
join entry_reference on entry.entry_nr = entry_reference.entry_nr
join reference on reference.reference_nr = entry_reference.reference_nr
where
entry.
alphabetic
_formula = ( %r )
entry.
normalized
_formula = ( %r )
group by entry.space_group_number, reference.reference_name
;"""
%
INPUT_FORMULA
)
;"""
%
normalizedF
)
results
=
cur
.
fetchall
()
#print('references:',results)
###
### fOut.write('references: \n %r \n' % results)
#to be corrected
for
group
,
sprRef
,
entryId
in
results
:
if
not
group
in
res
:
res
[
group
]
=
{}
if
not
"section_springer_id"
in
res
[
group
]:
res
[
group
][
"section_springer_id"
]
=
[]
spRefs
=
res
[
group
][
"section_springer_id"
]
for
group
,
formula
,
sprRef
,
entryId
in
results
:
k
=
(
group
,
formula
)
if
not
k
in
res
:
res
[
k
]
=
{}
if
not
"section_springer_id"
in
res
[
k
]:
res
[
k
][
"section_springer_id"
]
=
[]
spRefs
=
res
[
k
][
"section_springer_id"
]
if
entryId
.
startswith
(
"sd_"
):
...
...
@@ -166,13 +206,13 @@ def classify4me(INPUT_FORMULA, backend):
})
#print('}')
for
sp
,
entry
in
res
.
items
():
for
(
sp
,
f
),
entry
in
res
.
items
():
try
:
spNr
=
int
(
sp
)
except
:
spNr
=
-
1
entry
[
"springer_formula"
]
=
INPUT_FORMULA
entry
[
"springer_space_group_number"
]
=
spNr
entry
[
"springer_formula"
]
=
f
#json.dump(results, fOut, sort_keys=False, ensure_ascii=False, indent=2)
### fOut.write("} \n")
...
...
@@ -238,28 +278,32 @@ def main():
backend
=
JsonParseEventsWriterBackend
(
metaInfoEnv
,
fOut
)
#Start
calcContext
=
sys
.
argv
[
1
]
calcContext
=
sys
.
argv
[
1
]
+
"/_"
backend
.
startedParsingSession
(
calcContext
,
parserInfo
=
{
'name'
:
'SpringerNormalizer'
,
'version'
:
'1.0'
})
calcContext
,
parserInfo
=
{
'name'
:
'SpringerNormalizer'
,
'version'
:
'1.0'
})
res
=
"ParseSuccess"
dictReader
=
ParseStreamedDicts
(
sys
.
stdin
)
knownFormulas
=
set
()
while
True
:
s
ectSys
=
dictReader
.
readNextDict
()
if
s
ectSys
is
None
:
s
pringerInfo
=
dictReader
.
readNextDict
()
if
s
pringerInfo
is
None
:
break
try
:
formula
=
sectSys
.
get
(
"system_composition"
)
if
formula
and
not
formula
in
knownFormulas
:
knownFormulas
.
add
(
formula
)
backend
.
openContext
(
calcContext
)
classify4me
(
formula
,
backend
)
backend
.
closeContext
(
calcContext
)
fOut
.
flush
()
except
:
logging
.
exception
(
"exception trying to calculate springer data for %s"
,
sectSys
)
backend
.
finishedParsingSession
(
"ParseSuccess"
,
None
)
calcContext
=
springerInfo
[
'calculation_uri'
]
knownFormulas
=
set
()
for
sysInfo
in
springerInfo
[
"system_info"
]:
try
:
formula
=
sysInfo
.
get
(
"system_composition"
)
if
formula
and
formula
not
in
knownFormulas
:
knownFormulas
.
add
(
formula
)
backend
.
openContext
(
calcContext
)
classify4me
(
formula
,
backend
)
backend
.
closeContext
(
calcContext
)
fOut
.
flush
()
except
:
res
=
"ParseFailure"
logging
.
exception
(
"exception trying to calculate springer data for %s"
,
sysInfo
)
backend
.
finishedParsingSession
(
res
,
None
)
fOut
.
flush
()
###
# con.create_function("", 1, )
...
...
normalizer/normalizer-springer/input.txt
deleted
100644 → 0
View file @
454feac8
{
"system_composition": ["MgNd"],
"calculation_uri": "nmd://R.../C..."
}
src/main/scala/eu/nomad_lab/normalizers/SpringerNormalizer.scala
View file @
6e23aaec
...
...
@@ -22,15 +22,31 @@ import scala.collection.breakOut
import
eu.nomad_lab.normalize.ExternalNormalizerGenerator
import
eu.nomad_lab.meta
import
eu.nomad_lab.query
import
eu.nomad_lab.ref.ObjectKind
import
eu.nomad_lab.resolve._
import
eu.nomad_lab.h5.EmitJsonVisitor
import
eu.nomad_lab.h5.H5EagerScanner
import
eu.nomad_lab.h5.SectionH5
import
eu.nomad_lab.h5.CalculationH5
import
eu.nomad_lab.parsers.ExternalParserWrapper
import
eu.nomad_lab.normalize.Normalizer
import
eu.nomad_lab.JsonUtils
import
eu.nomad_lab.JsonSupport.formats
import
eu.nomad_lab.JsonSupport
import
scala.collection.mutable.StringBuilder
/**
 * Information on a system that can match a springer material data.
 *
 * @param system_composition composition (formula) string of the system
 * @param space_group_3D_number space group number of the system, if one is available
 */
case class SpringerSystemInfo(system_composition: String, space_group_3D_number: Option[Int])
/**
 * Information to match a calculation to springer material data.
 *
 * @param calculation_uri URI string identifying the calculation
 * @param system_info the systems of that calculation to look up
 */
case class SpringerInfo(calculation_uri: String, system_info: Seq[SpringerSystemInfo])
object
SpringerNormalizer
extends
ExternalNormalizerGenerator
(
name
=
"SpringerNormalizer"
,
info
=
jn
.
JObject
(
...
...
@@ -46,8 +62,7 @@ object SpringerNormalizer extends ExternalNormalizerGenerator(
}(
breakOut
)
:
List
[(
String
,
jn.JString
)])
))
::
Nil
),
context
=
"calculation_context"
,
filter
=
query
.
CompiledQuery
(
query
.
QueryExpression
(
"program_name = \"FHI-aims\""
),
meta
.
KnownMetaInfoEnvs
.
publicMeta
),
context
=
"archive_context"
,
cmd
=
Seq
(
DefaultPythonInterpreter
.
pythonExe
(),
"${envDir}/normalizers/springer/normalizer/normalizer-springer/classify4me_SM_normalizer.py"
,
"${contextUri}"
,
"${archivePath}"
),
resList
=
Seq
(
...
...
@@ -69,7 +84,7 @@ object SpringerNormalizer extends ExternalNormalizerGenerator(
override
def
stdInHandler
(
context
:
ResolvedRef
)(
wrapper
:
ExternalParserWrapper
)(
pIn
:
java.io.OutputStream
)
:
Unit
=
{
val
out
:
java.io.Writer
=
new
java
.
io
.
BufferedWriter
(
new
java
.
io
.
OutputStreamWriter
(
pIn
));
val
trace
:
Boolean
=
Normalizer
.
trace
val
trace
:
Boolean
=
Normalizer
.
trace
||
true
val
stringBuilder
=
if
(
trace
)
new
StringBuilder
else
...
...
@@ -86,41 +101,52 @@ object SpringerNormalizer extends ExternalNormalizerGenerator(
}
}
writeOut
(
"["
)
var
isFirst
=
true
var
isFirst
:
Boolean
=
true
def
handleCalculation
(
c
:
CalculationH5
)
:
Unit
=
{
val
sysTable
=
c
.
sectionTable
(
Seq
(
"section_run"
,
"section_system"
))
var
systems
:
Set
[
SpringerSystemInfo
]
=
Set
()
def
handleSysSection
(
sysSection
:
SectionH5
)
:
Unit
=
{
sysSection
.
maybeValue
(
"system_composition"
).
map
(
_
.
stringValue
)
match
{
case
None
=>
()
case
Some
(
formula
)
=>
val
spaceG
=
sysSection
.
maybeValue
(
"space_group_3D_number"
).
map
(
_
.
intValue
)
systems
+=
SpringerSystemInfo
(
formula
,
spaceG
)
}
}
val
nSys
=
sysTable
.
lengthL
if
(
nSys
>
0
)
handleSysSection
(
sysTable
(
0
))
if
(
nSys
>
1
)
handleSysSection
(
sysTable
(
nSys
-
1
))
if
(
nSys
>
2
)
handleSysSection
(
sysTable
(
nSys
-
2
))
if
(!
systems
.
isEmpty
)
{
if
(!
isFirst
)
writeOut
(
","
)
else
isFirst
=
false
writeOut
(
JsonSupport
.
writeNormalizedStr
(
SpringerInfo
(
c
.
toRef
.
toUriStr
(
ObjectKind
.
NormalizedData
),
systems
.
toSeq
)
))
}
flush
()
}
try
{
context
match
{
case
Calculation
(
archiveSet
,
c
)
=>
val
sysTable
=
c
.
sectionTable
(
Seq
(
"section_run"
,
"section_system"
))
def
outputSysSection
(
sysSection
:
SectionH5
)
:
Unit
=
{
if
(!
isFirst
)
writeOut
(
","
)
else
isFirst
=
false
//writeOut(s"""{
// | "context": ${JsonUtils.escapeString(m.toRef.toUriStr(archiveSet.objectKind))},
// | "section_system": """.stripMargin)
val
visitor
=
new
EmitJsonVisitor
(
writeOut
=
writeOut
)
val
scanner
=
new
H5EagerScanner
scanner
.
scanResolvedRef
(
Section
(
archiveSet
,
sysSection
),
visitor
)
//writeOut("}")
flush
()
case
Archive
(
archiveSet
,
a
)
=>
var
i
:
Int
=
0
for
(
c
<-
a
.
calculations
)
{
//if (i < 5)
handleCalculation
(
c
)
i
+=
1
}
val
nSys
=
sysTable
.
lengthL
if
(
nSys
>
0
)
// outputSysSection(sysTable(0))
//if (nSys > 1)
outputSysSection
(
sysTable
(
nSys
-
1
))
writeOut
(
"]"
)
flush
()
case
Calculation
(
archiveSet
,
c
)
=>
handleCalculation
(
c
)
case
r
=>
throw
new
Exception
(
s
"
FhiAimsBasis
Normalizer expected a calculation as context, but got $r"
)
throw
new
Exception
(
s
"
Springer
Normalizer expected a calculation as context, but got $r"
)
}
}
finally
{
writeOut
(
"]"
)
out
.
close
()
pIn
.
close
()
wrapper
.
sendStatus
=
ExternalParserWrapper
.
SendStatus
.
Finished
...
...
test/examples/input.txt
0 → 100644
View file @
6e23aaec
[
{
"calculation_uri": "nmd://R.../C...",
"system_info": [{
"system_composition": "MgNd",
"space_group_3D_number": 3
}]
},
{
"calculation_uri": "nmd://R.../C...",
"system_info": [{
"system_composition": "MgO",
"space_group_3D_number": 3
}]
},
{
"calculation_uri": "nmd://R.../C...",
"system_info": [{
"system_composition": "CoNd",
"space_group_3D_number": 3
}]
}
]
test/examples/input2.txt
0 → 100644
View file @
6e23aaec
[{"calculation_uri":"nmd://N-8wH3zpeTbJWH_0I0OjMFnAVK7IM/C--6kVLos2hak5UF8yw5KCx5YUnkA","system_info":[{"system_composition":"HfYCr2"}]}
]
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment