Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
nomad-lab
nomad-FAIR
Commits
ee4d5987
Commit
ee4d5987
authored
Apr 24, 2019
by
Markus Scheidgen
Browse files
Moved upload user metadata from mongo to files. Added special package creation of OQMD migration.
parent
a9415497
Pipeline
#47335
passed with stages
in 16 minutes and 57 seconds
Changes
9
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
nomad/client/migration.py
View file @
ee4d5987
...
@@ -21,6 +21,7 @@ import re
...
@@ -21,6 +21,7 @@ import re
import
shutil
import
shutil
import
multiprocessing
import
multiprocessing
import
queue
import
queue
import
json
from
nomad
import
config
,
infrastructure
from
nomad
import
config
,
infrastructure
from
nomad.migration
import
NomadCOEMigration
,
SourceCalc
,
Package
from
nomad.migration
import
NomadCOEMigration
,
SourceCalc
,
Package
...
@@ -191,3 +192,12 @@ def upload(
...
@@ -191,3 +192,12 @@ def upload(
_Migration
(
threads
=
parallel
).
migrate
(
_Migration
(
threads
=
parallel
).
migrate
(
*
determine_upload_paths
(
upload_paths
,
pattern
),
delete_failed
=
delete_failed
,
*
determine_upload_paths
(
upload_paths
,
pattern
),
delete_failed
=
delete_failed
,
create_packages
=
create_packages
)
create_packages
=
create_packages
)
@
migration
.
command
(
help
=
'Get an report about not migrated calcs.'
)
def
missing
():
infrastructure
.
setup_logging
()
infrastructure
.
setup_mongo
()
report
=
SourceCalc
.
missing
()
print
(
json
.
dumps
(
report
,
indent
=
2
))
nomad/files.py
View file @
ee4d5987
...
@@ -57,11 +57,15 @@ from zipfile import ZipFile, BadZipFile
...
@@ -57,11 +57,15 @@ from zipfile import ZipFile, BadZipFile
import
tarfile
import
tarfile
import
hashlib
import
hashlib
import
io
import
io
import
pickle
from
nomad
import
config
,
utils
from
nomad
import
config
,
utils
from
nomad.datamodel
import
UploadWithMetadata
from
nomad.datamodel
import
UploadWithMetadata
user_metadata_filename
=
'user_metadata.pickle'
class
PathObject
:
class
PathObject
:
"""
"""
Object storage-like abstraction for paths in general.
Object storage-like abstraction for paths in general.
...
@@ -172,6 +176,20 @@ class UploadFiles(DirectoryObject, metaclass=ABCMeta):
...
@@ -172,6 +176,20 @@ class UploadFiles(DirectoryObject, metaclass=ABCMeta):
self
.
upload_id
=
upload_id
self
.
upload_id
=
upload_id
self
.
_is_authorized
=
is_authorized
self
.
_is_authorized
=
is_authorized
self
.
_user_metadata_file
=
self
.
join_file
(
'user_metadata.pickle'
)
@
property
def
user_metadata
(
self
)
->
dict
:
if
self
.
_user_metadata_file
.
exists
():
with
open
(
self
.
_user_metadata_file
.
os_path
,
'rb'
)
as
f
:
return
pickle
.
load
(
f
)
else
:
return
{}
@
user_metadata
.
setter
def
user_metadata
(
self
,
data
:
dict
)
->
None
:
with
open
(
self
.
_user_metadata_file
.
os_path
,
'wb'
)
as
f
:
pickle
.
dump
(
data
,
f
)
@
staticmethod
@
staticmethod
def
get
(
upload_id
:
str
,
*
args
,
**
kwargs
)
->
'UploadFiles'
:
def
get
(
upload_id
:
str
,
*
args
,
**
kwargs
)
->
'UploadFiles'
:
...
@@ -349,6 +367,12 @@ class StagingUploadFiles(UploadFiles):
...
@@ -349,6 +367,12 @@ class StagingUploadFiles(UploadFiles):
create_prefix
=
True
)
create_prefix
=
True
)
assert
target_dir
.
exists
()
assert
target_dir
.
exists
()
# copy user metadata
if
self
.
_user_metadata_file
.
exists
():
shutil
.
copyfile
(
self
.
_user_metadata_file
.
os_path
,
target_dir
.
join_file
(
user_metadata_filename
).
os_path
)
def
create_zipfile
(
kind
:
str
,
prefix
:
str
,
ext
:
str
)
->
ZipFile
:
def
create_zipfile
(
kind
:
str
,
prefix
:
str
,
ext
:
str
)
->
ZipFile
:
file
=
target_dir
.
join_file
(
'%s-%s.%s.zip'
%
(
kind
,
prefix
,
ext
))
file
=
target_dir
.
join_file
(
'%s-%s.%s.zip'
%
(
kind
,
prefix
,
ext
))
return
ZipFile
(
file
.
os_path
,
mode
=
'w'
)
return
ZipFile
(
file
.
os_path
,
mode
=
'w'
)
...
...
nomad/migration.py
View file @
ee4d5987
...
@@ -38,6 +38,7 @@ import io
...
@@ -38,6 +38,7 @@ import io
import
threading
import
threading
from
contextlib
import
contextmanager
from
contextlib
import
contextmanager
import
shutil
import
shutil
import
json
from
nomad
import
utils
,
infrastructure
,
files
,
config
from
nomad
import
utils
,
infrastructure
,
files
,
config
from
nomad.coe_repo
import
User
,
Calc
,
LoginException
from
nomad.coe_repo
import
User
,
Calc
,
LoginException
...
@@ -316,6 +317,16 @@ class Package(Document):
...
@@ -316,6 +317,16 @@ class Package(Document):
if
len
(
files
)
==
0
:
if
len
(
files
)
==
0
:
continue
continue
if
len
(
files
)
<
20
and
any
(
file
.
endswith
(
'.tar.gz'
)
for
file
in
files
):
# TODO the OQMD case, files are managed as bunch of .tar.gz files
for
file
in
files
:
archive_path
=
os
.
path
.
join
(
root
,
file
)
prefix
=
os
.
path
.
dirname
(
archive_path
)[
len
(
upload_path
)
+
1
:]
with
cls
.
extracted_archive
(
archive_path
)
as
extracted_archive
:
for
paths
,
_
,
size
in
cls
.
iterate_upload_directory
(
extracted_archive
):
yield
[
os
.
path
.
join
(
prefix
,
path
)
for
path
in
paths
],
upload_path
,
size
continue
for
file
in
files
:
for
file
in
files
:
filepath
=
os
.
path
.
join
(
root
,
file
)
filepath
=
os
.
path
.
join
(
root
,
file
)
filename
=
filepath
[
len
(
upload_path
)
+
1
:]
filename
=
filepath
[
len
(
upload_path
)
+
1
:]
...
@@ -382,6 +393,60 @@ class SourceCalc(Document):
...
@@ -382,6 +393,60 @@ class SourceCalc(Document):
_dataset_cache
:
dict
=
{}
_dataset_cache
:
dict
=
{}
@
staticmethod
def
missing
():
"""
Produces data about non migrated calcs
"""
tmp_data_path
=
'/tmp/nomad_migration_missing.json'
if
os
.
path
.
exists
(
tmp_data_path
):
with
open
(
tmp_data_path
,
'rt'
)
as
f
:
data
=
utils
.
POPO
(
**
json
.
load
(
f
))
else
:
data
=
utils
.
POPO
(
step
=
0
)
try
:
# get source_uploads that have non migrated calcs
if
data
.
step
<
1
:
import
re
data
.
source_uploads
=
SourceCalc
.
_get_collection
()
\
.
find
({
'migration_version'
:
{
'$lt'
:
0
},
'mainfile'
:
{
'$not'
:
re
.
compile
(
r
'^aflowlib_data.*'
)}})
\
.
distinct
(
'upload'
)
data
.
step
=
1
if
data
.
step
<
2
:
source_uploads
=
[]
data
.
packages
=
utils
.
POPO
()
data
.
uploads_with_no_package
=
[]
for
source_upload
in
data
.
source_uploads
:
package
=
Package
.
objects
(
upload_id
=
source_upload
).
first
()
if
package
is
None
:
data
.
uploads_with_no_package
.
append
(
source_upload
)
else
:
source_uploads
.
append
(
source_upload
)
data
.
source_uploads
=
source_uploads
data
.
step
=
2
if
data
.
step
<
3
:
source_uploads
=
[]
for
source_upload
in
data
.
source_uploads
:
count
=
SourceCalc
.
objects
(
upload
=
source_upload
).
count
()
source_uploads
.
append
(
utils
.
POPO
(
id
=
source_upload
,
calcs
=
count
))
data
.
source_uploads
=
sorted
(
source_uploads
,
key
=
lambda
k
:
k
[
'calcs'
])
data
.
step
=
3
if
data
.
step
<
4
:
source_uploads
=
[]
for
source_upload
in
data
.
source_uploads
:
count
=
Package
.
objects
(
upload_id
=
source_upload
.
get
(
'id'
)).
count
()
source_upload
[
'packages'
]
=
count
data
.
step
=
4
finally
:
with
open
(
tmp_data_path
,
'wt'
)
as
f
:
json
.
dump
(
data
,
f
)
return
data
@
staticmethod
@
staticmethod
def
index
(
source
,
drop
:
bool
=
False
,
with_metadata
:
bool
=
True
,
per_query
:
int
=
100
)
\
def
index
(
source
,
drop
:
bool
=
False
,
with_metadata
:
bool
=
True
,
per_query
:
int
=
100
)
\
->
Generator
[
Tuple
[
'SourceCalc'
,
int
],
None
,
None
]:
->
Generator
[
Tuple
[
'SourceCalc'
,
int
],
None
,
None
]:
...
...
nomad/processing/data.py
View file @
ee4d5987
...
@@ -369,7 +369,6 @@ class Upload(Proc):
...
@@ -369,7 +369,6 @@ class Upload(Proc):
temporary
=
BooleanField
(
default
=
False
)
temporary
=
BooleanField
(
default
=
False
)
name
=
StringField
(
default
=
None
)
name
=
StringField
(
default
=
None
)
metadata
=
DictField
(
default
=
None
)
upload_time
=
DateTimeField
()
upload_time
=
DateTimeField
()
user_id
=
StringField
(
required
=
True
)
user_id
=
StringField
(
required
=
True
)
published
=
BooleanField
(
default
=
False
)
published
=
BooleanField
(
default
=
False
)
...
@@ -385,6 +384,14 @@ class Upload(Proc):
...
@@ -385,6 +384,14 @@ class Upload(Proc):
super
().
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
_upload_files
:
ArchiveBasedStagingUploadFiles
=
None
self
.
_upload_files
:
ArchiveBasedStagingUploadFiles
=
None
@
property
def
metadata
(
self
)
->
dict
:
return
self
.
upload_files
.
user_metadata
@
metadata
.
setter
def
metadata
(
self
,
data
:
dict
)
->
None
:
self
.
upload_files
.
user_metadata
=
data
@
classmethod
@
classmethod
def
get
(
cls
,
id
:
str
,
include_published
:
bool
=
False
)
->
'Upload'
:
def
get
(
cls
,
id
:
str
,
include_published
:
bool
=
False
)
->
'Upload'
:
upload
=
cls
.
get_by_id
(
id
,
'upload_id'
)
upload
=
cls
.
get_by_id
(
id
,
'upload_id'
)
...
@@ -535,10 +542,11 @@ class Upload(Proc):
...
@@ -535,10 +542,11 @@ class Upload(Proc):
@
property
@
property
def
upload_files
(
self
)
->
UploadFiles
:
def
upload_files
(
self
)
->
UploadFiles
:
upload_files_class
=
ArchiveBasedStagingUploadFiles
if
not
self
.
published
else
PublicUploadFiles
upload_files_class
=
ArchiveBasedStagingUploadFiles
if
not
self
.
published
else
PublicUploadFiles
kwargs
=
dict
(
upload_path
=
self
.
upload_path
)
if
not
self
.
published
else
{}
if
not
self
.
_upload_files
or
not
isinstance
(
self
.
_upload_files
,
upload_files_class
):
if
not
self
.
_upload_files
or
not
isinstance
(
self
.
_upload_files
,
upload_files_class
):
self
.
_upload_files
=
upload_files_class
(
self
.
_upload_files
=
upload_files_class
(
self
.
upload_id
,
is_authorized
=
lambda
:
True
,
upload_path
=
self
.
upload_path
)
self
.
upload_id
,
is_authorized
=
lambda
:
True
,
**
kwargs
)
return
self
.
_upload_files
return
self
.
_upload_files
...
...
tests/data/migration/packaging_oqmd/baseline/oqmd/one.tar.gz
0 → 100644
View file @
ee4d5987
File added
tests/data/migration/packaging_oqmd/baseline/oqmd/two.tar.gz
0 → 100644
View file @
ee4d5987
File added
tests/data/migration/packaging_oqmd/restriction/oqmd/archive.tar.gz
0 → 100644
View file @
ee4d5987
File added
tests/data/migration/packaging_oqmd/too_big/oqmd/archive.tar.gz
0 → 100644
View file @
ee4d5987
File added
tests/test_migration.py
View file @
ee4d5987
...
@@ -95,15 +95,12 @@ def source_package(mongo, migration):
...
@@ -95,15 +95,12 @@ def source_package(mongo, migration):
migration
.
package
(
*
glob
.
glob
(
'tests/data/migration/*'
))
migration
.
package
(
*
glob
.
glob
(
'tests/data/migration/*'
))
@
pytest
.
mark
.
parametrize
(
'ar
chived'
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
'
v
ar
iant'
,
[
''
,
'_archived'
,
'_oqmd'
])
@
pytest
.
mark
.
parametrize
(
'n_packages, restriction, upload'
,
[(
1
,
36
,
'baseline'
),
(
2
,
0
,
'too_big'
),
(
1
,
24
,
'restriction'
)])
@
pytest
.
mark
.
parametrize
(
'n_packages, restriction, upload'
,
[(
1
,
36
,
'baseline'
),
(
2
,
0
,
'too_big'
),
(
1
,
24
,
'restriction'
)])
def
test_package
(
def
test_package
(
mongo
,
migration
:
NomadCOEMigration
,
monkeypatch
,
n_packages
,
restriction
,
upload
,
ar
chived
):
mongo
,
migration
:
NomadCOEMigration
,
monkeypatch
,
n_packages
,
restriction
,
upload
,
v
ar
iant
):
monkeypatch
.
setattr
(
'nomad.migration.max_package_size'
,
3
)
monkeypatch
.
setattr
(
'nomad.migration.max_package_size'
,
3
)
if
archived
:
upload
=
os
.
path
.
join
(
'tests/data/migration/packaging%s'
%
variant
,
upload
)
upload
=
os
.
path
.
join
(
'tests/data/migration/packaging_archived'
,
upload
)
else
:
upload
=
os
.
path
.
join
(
'tests/data/migration/packaging'
,
upload
)
migration
.
package_index
(
upload
)
migration
.
package_index
(
upload
)
packages
=
Package
.
objects
()
packages
=
Package
.
objects
()
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment