nomad-lab / nomad-FAIR · Commits

Commit 2b83f245
Authored Feb 26, 2020 by Alvin Noe Ladines

Implemented major changes to springer_msgpack

Parent: 8918e397
Pipeline #69870 passed with stages in 18 minutes and 47 seconds
Changes: 4 · Pipelines: 1
nomad/archive.py
```diff
@@ -6,7 +6,6 @@ from msgpack.fallback import Packer, StringIO
 import struct
-import json
 import math
 import os.path
 import re
 from nomad import utils
```
```diff
@@ -481,120 +480,6 @@ def read_archive(file_or_path: str, **kwargs) -> ArchiveReader:
     return ArchiveReader(file_or_path, **kwargs)
-
-
-class ArchiveFileDB:
-    def __init__(
-            self, fileio: Union[str, BytesIO], mode: str = 'r',
-            entry_toc_depth: int = 2, **kwargs):
-        self._fileobj = fileio
-        self._mode = mode
-        self._entry_toc_depth = entry_toc_depth
-        self._data: Dict[str, Any] = {}
-        self._key_length = utils.default_hash_len
-        self._db = None
-        self._ids: List[str] = []
-        self._infokey = self._adjust_key('INFO', 'X')
-
-    def write(self, abspath: str, relpath: str):
-        """
-        Mimic the zipfile function to write files to database.
-        Arguments:
-            abspath: The absolute path to the file to be read
-            relpath: For compatibility with zipfile
-        """
-        self.add_data(abspath)
-
-    def close(self, save: bool = True):
-        """
-        Mimic the zipfile function to close the msgpack file.
-        Will trigger the creation of the database when in write mode.
-        Arguments:
-            save: If True will add the current data in memory to database
-        """
-        if 'w' in self._mode:
-            self.create_db()
-        if isinstance(self._fileobj, BytesIO) and save:
-            self._fileobj.close()
-            self._fileobj = None
-
-    def create_db(self):
-        with ArchiveWriter(self._fileobj, len(self._data) + 1, self._entry_toc_depth) as db:
-            for key, val in self._data.items():
-                key = self._adjust_key(key)
-                self._ids.append(key)
-                db.add(key, val)
-            db.add(self._infokey, dict(ids=self._ids, entry_toc_depth=self._entry_toc_depth))
-
-    def _adjust_key(self, key: str, fill_with: str = ' '):
-        key = key.rjust(self._key_length, fill_with)
-        assert len(key) == self._key_length
-        return key
-
-    def add_data(self, data: Union[str, Dict[str, Any], List[Union[str, Dict]]]):
-        """
-        Add data to the msgpack database.
-        Arguments:
-            data: Can be a filename or dictionary or list of both
-        """
-        if isinstance(data, str):
-            key = os.path.basename(data)
-            if data.endswith('json'):
-                uid = key.split('.')[0]
-                val = json.load(open(data))
-                if isinstance(val, dict):
-                    self._data[uid] = val
-            else:
-                try:
-                    uid = key.split('.')[0]
-                    dtype = key.split('.')[-1]
-                    val = open(data).read()
-                    if dtype not in self._data:
-                        self._data[dtype] = {}
-                    if val:
-                        self._data[dtype].update({uid: val})
-                except Exception:
-                    pass
-        elif isinstance(data, dict):
-            for key, val in data.items():
-                if val:
-                    self._data[key] = val
-        elif isinstance(data, list):
-            for i in range(len(data)):
-                self.add_data(data[i])
-        else:
-            raise ValueError
-
-    @property
-    def ids(self):
-        if not self._ids:
-            with ArchiveReader(self._fileobj) as db:
-                self._ids = db[self._infokey]['ids']
-        return self._ids
-
-    @staticmethod
-    def _get_index(key: str) -> Union[Tuple[int, int], int]:
-        key = key.strip()
-        bracket = key.find('[')
-        if bracket <= 0:
-            return None
-        assert key[-1] == ']'
-        str_index = key[bracket + 1: -1]
-        if ':' in str_index:
-            lo_str, hi_str = str_index.split(':')
-            lo = int(lo_str) if lo_str else 0
-            hi = int(hi_str) if hi_str else 10000000
-            return lo, hi
-        else:
-            # if db structure should be maintained, return lo, lo + 1
-            # if conform with python indexing, return lo
-            lo = int(str_index)
-            return lo


 def query_archive(f, query_dict: dict):

     def _load_data(
             query_dict: Dict[str, Any], archive_item: ArchiveObject, main_section: bool = False):
```
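With ArchiveFileDB removed, writing and reading go through the module-level functions instead. A minimal sketch of the replacement round trip, assuming the signatures used elsewhere in this commit (write_archive(path, n_entries, items, entry_toc_depth=...), read_archive(path)); the entry ids and payloads are made up:

```python
from nomad.archive import write_archive, read_archive

data = {
    'entry0001': {'section_run': {'program_name': 'VASP'}},
    'entry0002': {'section_run': {'program_name': 'exciting'}},
}

# replaces ArchiveFileDB(..., 'w').add_data(...) / close()
write_archive('example.msg', len(data), data.items(), entry_toc_depth=1)

# replaces ArchiveFileDB(..., 'r') access
with read_archive('example.msg') as archive:
    print(archive['entry0001']['section_run']['program_name'])
```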
```diff
@@ -631,12 +516,16 @@ def query_archive(f, query_dict: dict):
             if main_section:
                 archive_key = adjust_uuid_size(key)

-            if index is None:
-                res[key] = _load_data(val, archive_item[archive_key])
-            elif isinstance(index, int):
-                res[key] = _load_data(val, archive_item[archive_key])[index]
-            else:
-                res[key] = _load_data(val, archive_item[archive_key])[index[0]: index[1]]
+            try:
+                if index is None:
+                    res[key] = _load_data(val, archive_item[archive_key])
+                elif isinstance(index, int):
+                    res[key] = _load_data(val, archive_item[archive_key])[index]
+                else:
+                    res[key] = _load_data(val, archive_item[archive_key])[index[0]: index[1]]
+            except Exception:
+                continue

     return res
```
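As the branches above show, a query key can carry an index suffix: an integer selects one element, lo:hi a slice, and with this change a failing lookup is skipped instead of aborting the whole query. A hedged sketch of such a query; the archive path and section names are made up:

```python
# Hypothetical query against a msgpack archive; '*' selects a whole subtree.
query = {
    'entry0001': {
        'section_run[0]': '*',       # integer index: first element only
        'section_system[0:2]': '*',  # slice: elements 0 and 1
    }
}
results = query_archive('example.msg', query)
```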
nomad/normalizing/data/springer_msgpack.py
```diff
@@ -25,169 +25,110 @@ The html parser was taken from a collection of scripts from FHI without further
 import requests
 import re
-import os
 from bs4 import BeautifulSoup
+from typing import Dict, List, Any
+from time import sleep
+import os

-from nomad.archive import ArchiveFileDB
+from nomad.archive import query_archive, write_archive, ArchiveReader

 DB_NAME = '.springer.msg'

-spacesRe = re.compile(r"\s+")
-symbRe = re.compile(r"[A-Z][a-z]{0,3}")
-numRe = re.compile(r"[0-9.]+")
-bracketRe = re.compile(r"\[")
-closingBraketRe = re.compile(r"\]")
-columnNames = {
-    "Normalized_formula": "normalized_formula",
-    "Alphabetic Formula:": "alphabetic_formula",
-    "Classification by Properties:": "classification",
-    "Compound Class(es):": "compound_classes",
-    "Dataset ID": "id",
-    "Space Group:": "space_group_number",
-}
+required_items = {
+    'Alphabetic Formula:': 'alphabetic_formula',
+    'Classification by Properties:': 'classification',
+    'Compound Class(es):': 'compound_classes',
+    'Dataset ID': 'id',
+    'Space Group:': 'space_group_number',
+}
+
+spaces_re = re.compile(r'\s+')
+search_re = re.compile(" href=\"(/isp/[^\"]+)")
+formula_re = re.compile(r'([A-Z][a-z]?)([0-9.]*)|\[(.*?)\]([0-9]+)')
+
+
+def _update_dict(dict0: Dict[str, float], dict1: Dict[str, float]):
+    for key, val in dict1.items():
+        if key in dict0:
+            dict0[key] += val
+        else:
+            dict0[key] = val
```
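_update_dict accumulates amounts in place when a symbol repeats; a quick check, assuming only the definitions above:

```python
counts = {'H': 2.0}
_update_dict(counts, {'H': 1.0, 'O': 1.0})  # existing keys are summed
assert counts == {'H': 3.0, 'O': 1.0}
```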
```diff
-def parseSymbol(formulaStr):
-    m = symbRe.match(formulaStr)
-    if m:
-        return (m.group(), formulaStr[len(m.group()):])
-    else:
-        return (None, formulaStr)
-
-
-def parseAmount(formulaStr):
-    m = numRe.match(formulaStr)
-    if m:
-        return (float(m.group()), formulaStr[len(m.group()):])
-    else:
-        return (1.0, formulaStr)
-
-
-def parseSimpleEntry(formulaStr):
-    sym, rest = parseSymbol(formulaStr)
-    if sym is None:
-        return (None, formulaStr)
-    else:
-        am, rest = parseAmount(rest)
-        res = {}
-        res[sym] = am
-        return (res, rest)
-
-
-def parseComplexEntry(formulaStr, flatten=True):
-    res = {}
-    m = bracketRe.match(formulaStr)
-    if m is None:
-        return (None, formulaStr)
-    else:
-        rest = formulaStr[len(m.group()):]
-        while True:
-            simE, rest = parseEntry(rest)
-            if simE is None:
-                break
-            if '#' in simE:
-                if 'fragments' in res:
-                    res['fragments'].append(simE)
-                else:
-                    res['fragments'] = [simE]
-            else:
-                for sym, am in simE.items():
-                    if sym in res:
-                        res[sym] += am
-                    else:
-                        res[sym] = am
-        m2 = closingBraketRe.match(rest)
-        if m2 is None:
-            return (None, formulaStr)
-        rest = rest[len(m2.group()):]
-        am, rest = parseAmount(rest)
-        if flatten:
-            for k, v in res.items():
-                res[k] = v * am
-        else:
-            res['#'] = am
-        return (res, rest)
-
-
-def parseEntry(formulaStr):
-    e, rest = parseSimpleEntry(formulaStr)
-    if e is not None:
-        return (e, rest)
-    return parseComplexEntry(formulaStr)
-
-
-def parseFormula(formulaStr):
-    res = {}
-    rest = formulaStr
-    while len(rest) > 0:
-        e, rest = parseEntry(rest)
-        if e is None:
-            raise Exception(
-                "could not parse entry from %r, did parse %s and failed with %r" % (formulaStr, res, rest))
-        if '#' in e:
-            if 'fragments' in res:
-                res['fragments'].append(e)
-            else:
-                res['fragments'] = [e]
-        else:
-            for sym, am in e.items():
-                if sym in res:
-                    res[sym] += am
-                else:
-                    res[sym] = am
-    return res
-
-
-def normalizeFormula(formulaDict):
-    oldTot = sum(formulaDict.values())
-    res = {}
-    for symb, amount in formulaDict.items():
-        res[symb] = int(amount / oldTot * 100.0 + 0.5)
-    sortedS = list(res.keys())
-    sortedS.sort()
-    resStr = ""
-    for symb in sortedS:
-        resStr += symb
-        resStr += str(res[symb])
-    return resStr
+def _components(formula_str: str, multiplier: float = 1.0) -> Dict[str, float]:
+    # match atoms and molecules (in brackets)
+    components = formula_re.findall(formula_str)
+
+    symbol_amount: Dict[str, float] = {}
+    for component in components:
+        element, amount_e, molecule, amount_m = component
+        if element:
+            if not amount_e:
+                amount_e = 1.0
+            _update_dict(symbol_amount, {element: float(amount_e) * multiplier})
+        elif molecule:
+            if not amount_m:
+                amount_m = 1.0
+            _update_dict(symbol_amount, _components(molecule, float(amount_m) * multiplier))
+
+    return symbol_amount
+
+
+def normalize_formula(formula_str: str) -> str:
+    symbol_amount = _components(formula_str)
+
+    total = sum(symbol_amount.values())
+    symbol_normamount = {e: round(a / total * 100.) for e, a in symbol_amount.items()}
+
+    formula_sorted = ['%s%d' % (s, symbol_normamount[s]) for s in sorted(list(symbol_normamount.keys()))]
+
+    return ''.join(formula_sorted)
```
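A quick sanity check of the regex-based replacement for the old recursive-descent parser, assuming only the definitions above; amounts are normalized to integer percentages and symbols are sorted alphabetically:

```python
assert _components('H2O') == {'H': 2.0, 'O': 1.0}
assert normalize_formula('H2O') == 'H67O33'   # 2/3 and 1/3, rounded to percent

# bracketed groups are expanded recursively with their multiplier
assert _components('Mg[OH]2') == {'Mg': 1.0, 'O': 2.0, 'H': 2.0}
```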
```diff
-def parse(htmltext):
+def parse(htmltext: str) -> Dict[str, str]:
     """
-    Parses the quantities in columnNames from an html text.
+    Parses the quantities in required_items from an html text.
     """
     soup = BeautifulSoup(htmltext, "html.parser")
     results = {}
-    for el in soup.findAll(attrs={"class": "data-list__content"}):
-        for it in el.findAll(attrs={"class": "data-list__item"}):
-            key = it.find(attrs={"class": "data-list__item-key"})
-            keyStr = key.string
-            value = spacesRe.sub(" ", it.find(attrs={"class": "data-list__item-value"}).get_text())
-            if value:
-                value = value.strip()
-            if keyStr:
-                keyStr = keyStr.strip()
-                if keyStr in columnNames:
-                    keyStr = columnNames[keyStr]
-                results[keyStr] = value
+    for item in soup.find_all(attrs={"class": "data-list__item"}):
+        key = item.find(attrs={"class": "data-list__item-key"})
+        if not key:
+            continue
+
+        key_str = key.string.strip()
+        if key_str not in required_items:
+            continue
+
+        value = item.find(attrs={"class": "data-list__item-value"})
+        value = spaces_re.sub(' ', value.get_text()).strip()
+        results[required_items[key_str]] = value
+
+        if len(results) >= len(required_items):
+            break

     if 'classification' in results:
         results['classification'] = [x.strip() for x in results['classification'].split(",")]
         results['classification'] = [x for x in results['classification'] if x != '–']
     if 'compound_classes' in results:
         results['compound_classes'] = [x.strip() for x in results['compound_classes'].split(",")]
         results['compound_classes'] = [x for x in results['compound_classes'] if x != '–']

     normalized_formula = None
     if 'alphabetic_formula' in results:
         try:
-            f = parseFormula(results['alphabetic_formula'])
-            normalized_formula = normalizeFormula(f)
+            normalized_formula = normalize_formula(results['alphabetic_formula'])
         except Exception:
-            normalized_formula = None
             pass
+
     results['normalized_formula'] = normalized_formula

     return results


-def _merge_dict(dict0, dict1):
+def _merge_dict(dict0: Dict[str, Any], dict1: Dict[str, Any]) -> Dict[str, Any]:
     if not isinstance(dict1, dict) or not isinstance(dict0, dict):
         return dict1
```
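A minimal illustration of the rewritten parse(); the HTML snippet is fabricated, matching only the data-list classes the parser looks for:

```python
html = '''
<div class="data-list__item">
  <div class="data-list__item-key">Space Group:</div>
  <div class="data-list__item-value">  225  </div>
</div>
'''
# only keys listed in required_items are kept, mapped to their short names
print(parse(html))  # {'space_group_number': '225', 'normalized_formula': None}
```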
```diff
@@ -199,63 +140,95 @@ def _merge_dict(dict0, dict1):
     return dict0


-def download_entries(formula, space_group_number):
-    """
-    Downloads the springer quantities related to a structure from springer.
-    """
-    entries = {}
-    root = 'https://materials.springer.com/textsearch?searchTerm=%s&datasourceFacet=sm_isp&substanceId=' % formula
-    response = requests.get(root)
-    if response.status_code != 200:
-        return entries
-    re_search = re.compile(" href=\"(/isp/[^\"]+)")
-    paths = re_search.findall(response.text)
-    paths = ['http://materials.springer.com%s' % p for p in paths]
-    for path in paths:
-        response = requests.get(path)
-        if response.status_code != 200:
-            continue
-        try:
-            data = parse(response.text)
-        except Exception:
-            continue
-        space_group_number = data.get('space_group_number', None)
-        normalized_formula = data.get('normalized_formula', None)
-        id = data.get('id', None)
-        if space_group_number is None or normalized_formula is None or id is None:
-            continue
-        aformula = data.get('alphabetic_formula', None)
-        compound = data.get('compound_classes', None)
-        classification = data.get('classification', None)
-        entry = dict(id=id, aformula=aformula, url=path, compound=compound, classification=classification)
-        entries = _merge_dict(entries, {str(space_group_number): {normalized_formula: {id: entry}}})
-    return entries
+def _download(path: str, max_n_query: int = 10) -> str:
+    n_query = 0
+    while True:
+        response = requests.get(path)
+        if response.status_code == 200:
+            break
+        if n_query > max_n_query:
+            break
+        n_query += 1
+        sleep(120)
+
+    if response.status_code != 200:
+        response.raise_for_status()
+
+    return response.text
```
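Where download_entries gave up on a non-200 response, the new _download helper retries up to max_n_query times with a 120-second sleep between attempts and raises on final failure. A hedged usage sketch, with the URL pattern taken from the crawl loop below:

```python
# Fetch one Springer search page, waiting out rate limits; raises via
# raise_for_status() if the page still fails after the retries.
html = _download(
    'https://materials.springer.com/search?searchTerm=&pageNumber=1'
    '&datasourceFacet=sm_isp&substanceId=',
    max_n_query=3)
```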
```diff
-def get_springer_data(normalized_formula, space_group_number):
+def download_springer_data(max_n_query: int = 10):
     """
-    Queries a msgpack database for springer-related quantities. Downloads data if not
-    found in database and adds it to database.
+    Downloads the springer quantities related to a structure from springer and updates
+    the database.
     """
-    entries = {}
-    mode = 'w'
-    if os.path.isfile(DB_NAME):
-        db = ArchiveFileDB(DB_NAME, 'r')
-        entries = db.query({str(space_group_number): {normalized_formula: '*'}})
-        db.close()
-        mode = 'w+'
-    if not entries:
-        formula = numRe.sub('', normalized_formula)
-        entries = download_entries(formula, space_group_number)
-        db = ArchiveFileDB(DB_NAME, mode, 3)
-        for key, entry in entries.items():
-            db.add_data({key: entry})
-        db.close()
+    # load database
+    # querying the database for an unavailable dataset leads to an error,
+    # so get the toc keys first by making an empty query
+    archive = ArchiveReader(DB_NAME)
+    _ = archive._load_toc_block(0)
+    archive_keys = archive._toc.keys()
+
+    sp_data = query_archive(DB_NAME, {spg: '*' for spg in archive_keys})
+
+    sp_ids: List[str] = []
+    for spg in sp_data:
+        if not isinstance(sp_data[spg], dict):
+            continue
+        for formula in sp_data[spg]:
+            sp_ids += list(sp_data[spg][formula].keys())
+
+    page = 1
+    while True:
+        # check the springer database for new entries by comparing with the local database
+        root = 'https://materials.springer.com/search?searchTerm=&pageNumber=%d&datasourceFacet=sm_isp&substanceId=' % page
+        req_text = _download(root, max_n_query)
+        if 'Sorry,' in req_text:
+            break
+
+        paths = search_re.findall(req_text)
+
+        for path in paths:
+            sp_id = os.path.basename(path)
+            if sp_id in sp_ids:
+                continue
+
+            path = 'http://materials.springer.com%s' % path
+            req_text = _download(path, max_n_query)
+            try:
+                data = parse(req_text)
+            except Exception:
+                continue
+
+            space_group_number = data.get('space_group_number', None)
+            normalized_formula = data.get('normalized_formula', None)
+            if space_group_number is None or normalized_formula is None:
+                continue
+
+            aformula = data.get('alphabetic_formula', None)
+            compound = data.get('compound_classes', None)
+            classification = data.get('classification', None)
+
+            entry = dict(aformula=aformula, url=path, compound=compound, classification=classification)
+            sp_data = _merge_dict(sp_data, {str(space_group_number): {normalized_formula: {sp_id: entry}}})
+
+        page += 1
+
+    write_archive(DB_NAME, len(sp_data), sp_data.items(), entry_toc_depth=1)
```
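The write path and the read path are now separate functions. A hedged sketch of refreshing and then querying the cache, assuming a pre-existing .springer.msg archive (download_springer_data reads its toc before crawling); the space group and formula are example values:

```python
# Refresh the local cache from materials.springer.com, then look something up.
# The archive is keyed space group -> normalized formula -> springer id.
download_springer_data(max_n_query=5)
entries = query_archive(DB_NAME, {'225': {'Cl50Na50': '*'}})
```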
```diff
+def query_springer_data(normalized_formula: str, space_group_number: int) -> Dict[str, Any]:
+    """
+    Queries a msgpack database for springer-related quantities.
+    """
+    entries = query_archive(DB_NAME, {str(space_group_number): {normalized_formula: '*'}})

     db_dict = {}
     entries = entries.get(str(space_group_number), {}).get(normalized_formula, {})
-    for id, entry in entries.items():
-        db_dict[id] = {
-            'spr_id': id,
+    for sp_id, entry in entries.items():
+        db_dict[sp_id] = {
+            'spr_id': sp_id,
             'spr_aformula': entry['aformula'],
             'spr_url': entry['url'],
             'spr_compound': entry['compound'],
```
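Renaming the loop variable from id to sp_id also stops shadowing Python's id builtin. A hedged lookup sketch; the formula and space group are example values, and the returned keys mirror the spr_* fields built above:

```python
springer = query_springer_data('Cl50Na50', 225)
for sp_id, meta in springer.items():
    print(sp_id, meta['spr_aformula'], meta['spr_url'])
```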
nomad/normalizing/system.py
```diff
@@ -28,7 +28,7 @@ from matid.classifications import Class0D, Atom, Class1D, Material2D, Surface, C
 from nomad.normalizing import structure
 from nomad import utils, config
 from nomad.normalizing.normalizer import SystemBasedNormalizer
-from nomad.normalizing.data.springer_msgpack import get_springer_data
+from nomad.normalizing.data.springer_msgpack import query_springer_data

 # use a regular expression to check atom labels; the expression is built from a list of
 # all labels sorted desc to find Br and not B when searching for Br.
```
```diff
@@ -452,7 +452,7 @@ class SystemNormalizer(SystemBasedNormalizer):
                 'spr_classification': spr_classification}
         elif database == 'msgpack':
-            dbdict = get_springer_data(normalized_formula, space_group_number)
+            dbdict = query_springer_data(normalized_formula, space_group_number)

 # =============
```
tests/test_cli.py
```diff
@@ -134,7 +134,7 @@ class TestAdminUploads:
         assert Upload.objects(upload_id=upload_id).first() is None
         assert Calc.objects(upload_id=upload_id).first() is None

-    def test_create_msgpack(self, published):
+    def test_msgpacked(self, published):
         upload_id = published.upload_id
         result = click.testing.CliRunner().invoke(
```