Commit b4a7f6b9 authored by Lauri Himanen's avatar Lauri Himanen Committed by Markus Scheidgen
Browse files

Added support for ingesting/displaying DOS similarity data in Encyclopedia...

Added support for ingesting/displaying DOS similarity data in Encyclopedia GUI. Additionally, the MongoDB metainfo extension now can handle nested sections.
parent 7f1962f7
Subproject commit da7db6108e125b390a4f44b789f02d8b3b09a81e
Subproject commit bdffb455f21577435477daa9c871205e6c118efd
......@@ -29,6 +29,7 @@ from nomad.files import UploadFiles
from nomad.units import ureg
from nomad.atomutils import get_hill_decomposition
from nomad.datamodel.datamodel import EntryArchive
from nomad.datamodel.material import Material
from .api import api
from .auth import authenticate, create_authorization_predicate
......@@ -99,6 +100,13 @@ def get_enc_filter():
]
# REST response model for a single DOS-similarity entry attached to a
# material result.
similarity = api.model("similarity", {
    # General
    "material_id": fields.String,       # id of the similar material
    "value": fields.Float,              # similarity score for that material
    "formula": fields.String,           # reduced chemical formula
    "space_group_number": fields.Integer,
})
material_query = api.parser()
material_query.add_argument(
"property",
......@@ -125,6 +133,7 @@ material_result = api.model("material_result", {
"space_group_international_short_symbol": fields.String,
"structure_prototype": fields.String,
"structure_type": fields.String,
"similarity": fields.List(fields.Nested(similarity, skip_none=True), skip_none=True),
})
......@@ -178,6 +187,49 @@ class EncMaterialResource(Resource):
entry = response[0]
result = get_es_doc_values(entry, material_prop_map, keys)
# Add similarity data that is currently stored in MongoDB. In the
# future a lot of the data will be accessed here.
try:
material = Material.m_def.a_mongo.get(material_id=material_id)
dos_similarity = material.similarity.electronic_dos
except KeyError:
# No similarity data for this material
pass
else:
# Only include similarity for materials that exist on the current
# deployment to avoid dead links.
similar_ids = dos_similarity.material_ids
id_value_map = {key: value for key, value in zip(dos_similarity.material_ids, dos_similarity.values)}
bool_query = Q(
"bool",
filter=get_enc_filter() + [Q("terms", encyclopedia__material__material_id=similar_ids)],
)
s = Search(index=config.elastic.index_name)
s = s.query(bool_query)
s = s.extra(**{
"_source": {"includes": [
"encyclopedia.material.material_id",
"encyclopedia.material.formula_reduced",
"encyclopedia.material.bulk.space_group_number",
]},
"size": 5,
"collapse": {"field": "encyclopedia.material.material_id"},
})
response = s.execute()
similarity = []
for hit in response.hits:
try:
similarity.append({
"material_id": hit.encyclopedia.material.material_id,
"value": id_value_map[hit.encyclopedia.material.material_id],
"formula": hit.encyclopedia.material.formula_reduced,
"space_group_number": hit.encyclopedia.material.bulk.space_group_number,
})
except AttributeError:
pass
if similarity:
result["similarity"] = similarity
return result, 200
......
......@@ -229,6 +229,11 @@ def ops():
pass
@ops.group(help='Tools for managing the DOS similarity data.')
def similarity():
    # Click command group: subcommands (update, ingest) are registered on it
    # via @similarity.command elsewhere in this module.
    pass
@ops.command(help=('Dump the mongo (calculation metadata) db.'))
@click.option('--restore', is_flag=True, help='Do not dump, but restore.')
def dump(restore: bool):
......@@ -361,3 +366,21 @@ def prototypes_update(ctx, filepath, matches_only):
def springer_update(max_n_query, retry_time):
from nomad.cli.admin import springer
springer.update_springer(max_n_query, retry_time)
@similarity.command(help='Updates the msgpack file containing the similarity information.')
@click.option('--dir', "-d", "input_dir", type=str, help='Path of the folder containing the raw similarity information files')
@click.option('--out', "-o", type=str, help='Path of the output msgpack file.')
@click.option('--verbose', is_flag=True, help='Enable verbose output.')
def update(input_dir, out, verbose):
    # Lazy import: keeps CLI startup fast when this subcommand is not used.
    from nomad.cli.admin import similarity
    similarity.update(input_dir, out, verbose)
@similarity.command(help='Ingests the given similarity information from an msgpack file into MongoDB.')
@click.option('--in', "-i", "input_path", type=str, help='Path of the ingested msgpack file.')
@click.option('--batch_size', type=int, default=10000, help='Batch size for MongoDB bulk ingestion.')
@click.option('--verbose', is_flag=True, help='Enable verbose output.')
def ingest(input_path, batch_size, verbose):
    # Lazy import: keeps CLI startup fast when this subcommand is not used.
    from nomad.cli.admin import similarity
    similarity.ingest(input_path, batch_size, verbose)
import os
import json
from nomad import infrastructure
from nomad.datamodel.material import Material, Similarity, DOSSimilarity
from nomad.archive import ArchiveReader, ArchiveWriter
from typing import List
def ingest(input_path: str, batch_size: int, verbose: bool):
    """Used to ingest the given DOS similarity values into MongoDB.

    Args:
        input_path: Path of the msgpack file to ingest.
        batch_size: Batch size for MongoDB bulk ingest.
        verbose: Enable verbose output.
    """
    # Initialize mongo connection
    infrastructure.setup_mongo()

    mongo_cls = Material.m_def.a_mongo.mongo_cls
    collection = mongo_cls()._get_collection()  # pylint: disable=not-callable
    bulk = collection.initialize_ordered_bulk_op()
    n_pending = 0  # operations queued in the current bulk
    i = 0          # total number of processed entries

    with ArchiveReader(input_path) as reader:
        for material_id in reader:
            material_object = reader[material_id]
            material_dict = material_object.to_dict()

            # Each entry is cycled through the material metainfo definition and
            # the mongo annotation to validate it and only push the annotated
            # data.
            mongo_instance = mongo_cls(**material_dict, _created=False)  # pylint: disable=not-callable
            material_dict = mongo_instance.to_mongo().to_dict()

            # Add an upsert to the bulk operations
            bulk.find({'_id': material_dict["_id"]}).upsert().update_one({'$set': material_dict})
            i += 1
            n_pending += 1

            if n_pending == batch_size:
                bulk.execute()
                bulk = collection.initialize_ordered_bulk_op()
                n_pending = 0
                if verbose:
                    print("{} inserted".format(i))

    # Final push for the remainder. Guarded because pymongo raises
    # InvalidOperation when execute() is called on a bulk with no queued
    # operations (which happens whenever the total entry count is an exact
    # multiple of batch_size, including an empty input file).
    if n_pending > 0:
        bulk.execute()
        if verbose:
            print("{} inserted".format(i))
def update(input_dir: str, output_path: str, verbose: bool):
    """Used to create a compact msgpack file that follows the metainfo schema
    for materials and contains the DOS similarity values.

    Args:
        input_dir: Path of the directory containing the raw similarity files.
        output_path: Path of the output msgpack file.
        verbose: Enable verbose output.
    """
    # Find all valid data files in the given directory
    similarity_files: List[str] = []
    for filename in os.listdir(input_dir):
        if filename.endswith(".dat"):
            similarity_files.append(os.path.join(input_dir, filename))
    n_files: int = len(similarity_files)
    if n_files == 0:
        raise ValueError("Could not find similarity files in directory: {}".format(input_dir))
    if verbose:
        print("{} files found".format(n_files))

    # Gather the number of entries to prepare the msgpack file. Context
    # managers ensure the file handles are closed deterministically (the
    # original left them to the garbage collector).
    n_entries = 0
    for filepath in similarity_files:
        with open(filepath, "r") as f:
            n_entries += sum(1 for _ in f)
    if verbose:
        print("{} entries found".format(n_entries))

    # Read the data files containing similarities and write one msgpack
    # entry per material.
    if verbose:
        print("Writing msgpack file...")
    with ArchiveWriter(output_path, n_entries, entry_toc_depth=1) as writer:
        for filepath in similarity_files:
            with open(filepath, "r") as f:
                for line in f:
                    ientry = json.loads(line)
                    for key, value in ientry.items():
                        # Keys are colon-separated; the third token is the
                        # material id.
                        _, _, imaterial = key.split(":")

                        # Create data according to a metainfo model
                        material = Material()
                        material.material_id = imaterial
                        similarity = material.m_create(Similarity)
                        dos_similarity = similarity.m_create(DOSSimilarity)
                        ids = []
                        values = []
                        for jkey, similarity_value in value.items():
                            _, _, jmaterial = jkey.split(":")
                            ids.append(jmaterial)
                            values.append(similarity_value)
                        dos_similarity.material_ids = ids
                        dos_similarity.values = values

                        # Save as msgpack
                        writer.add(imaterial, material.m_to_dict())
    if verbose:
        print("Finished")
......@@ -251,6 +251,10 @@ normalize = NomadConfig(
)
)
# Filesystem paths used by nomad. `similarity` is presumably the location of
# the pre-computed DOS similarity msgpack file; default is empty —
# TODO confirm where deployments set this.
paths = NomadConfig(
    similarity="",
)
client = NomadConfig(
user='leonard.hofstadter@nomad-fairdi.tests.de',
password='password',
......
from nomad.metainfo import MSection, Section, SubSection, Quantity
from nomad.metainfo.mongoengine_extension import Mongo, MongoDocument
class DOSSimilarity(MSection):
    """Similarity of the electronic density of states of one material against
    a set of other materials, stored as two parallel arrays.
    """
    m_def = Section(
        a_mongo=MongoDocument()
    )
    # Ids of the similar materials; parallel to `values`.
    material_ids = Quantity(
        type=str,
        shape=["n_similar_materials"],
        a_mongo=Mongo(),
    )
    # Similarity scores; parallel to `material_ids`.
    values = Quantity(
        type=float,
        shape=["n_similar_materials"],
        a_mongo=Mongo(),
    )
class Similarity(MSection):
    """Container for the similarity measures stored for a material."""
    m_def = Section(
        a_mongo=MongoDocument()
    )
    # DOS-based similarity data; single (non-repeating) subsection.
    electronic_dos = SubSection(sub_section=DOSSimilarity.m_def, repeats=False)
class Material(MSection):
    """Root metainfo section for a material as persisted in MongoDB."""
    m_def = Section(
        a_mongo=MongoDocument()
    )
    # Unique material id; primary_key=True maps it to the MongoDB `_id` field.
    material_id = Quantity(
        type=str,
        a_mongo=Mongo(primary_key=True)
    )
    similarity = SubSection(sub_section=Similarity.m_def, repeats=False)
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
......@@ -14,7 +14,7 @@
'''
Adds mongoengine supports to the metainfo. Allows to create, save, and get metainfo
sections from mongoengine. Currently no sub-section support. The annotation key is 'mongo'.
sections from mongoengine. The annotation key is 'mongo'.
'''
from typing import Any, Dict, List
......@@ -55,6 +55,7 @@ class MongoDocument(SectionAnnotation):
self._mongoengine_cls = None
self.primary_key: Mongo = None
self.primary_key_name: str = None
def new(self, section):
return dict(mongo=MongoInstance(section))
......@@ -90,30 +91,64 @@ class MongoDocument(SectionAnnotation):
if len(quantity.shape) == 0:
return result
elif len(quantity.shape) == 1:
return me.ListField(result)
return me.ListField(result, default=None)
else:
raise NotImplementedError
indexes: List[str] = []
dct: Dict[str, Any] = {}
for quantity in self.definition.all_quantities.values():
annotation = quantity.m_get_annotations(Mongo)
if annotation is None:
continue
if annotation.index:
indexes.append(quantity.name)
dct[quantity.name] = generate_field(quantity, annotation)
if annotation.primary_key:
self.primary_key = annotation
def create_model_recursive(section, level):
indexes: List[str] = []
dct: Dict[str, Any] = {}
# Add quantities to model
for quantity in section.all_quantities.values():
annotation = quantity.m_get_annotations(Mongo)
if annotation is None:
continue
if annotation.index:
indexes.append(quantity.name)
# Primary key is only stored from the root document.
if level == 0:
if annotation.primary_key:
self.primary_key = annotation
self.primary_key_name = quantity.name
dct[quantity.name] = generate_field(quantity, annotation)
# Add subsections to the model
for subsection in section.all_sub_sections.values():
annotation = subsection.sub_section.m_get_annotations(MongoDocument)
if annotation is None:
continue
embedded_doc_field = type(subsection.sub_section.name, (me.EmbeddedDocumentField,), {})
model = create_model_recursive(subsection.sub_section, level + 1)
if subsection.repeats:
dct[subsection.name] = me.ListField(embedded_doc_field(model))
else:
dct[subsection.name] = embedded_doc_field(model)
# Add meta dictionary. The strict mode is set to false in order to
# not raise an exception when reading data that is not specified in
# the model.
meta = {
"strict": False
}
if len(indexes) > 0:
meta["indexes"] = indexes
dct['meta'] = meta
# Return final model
if level == 0:
model = type(section.name, (me.Document,), dct)
else:
model = type(section.name, (me.EmbeddedDocument,), dct)
if len(indexes) > 0:
dct['meta'] = dict(indexes=indexes)
return model
self._mongoengine_cls = type(self.definition.name, (me.Document,), dct)
self._mongoengine_cls = create_model_recursive(self.definition, 0)
return self._mongoengine_cls
def objects(self, *args, **kwargs):
......@@ -138,13 +173,18 @@ class MongoDocument(SectionAnnotation):
Turns the given mongoengine document instance into its metainfo section instance
counterpart.
'''
section = self.definition.section_cls()
section_cls = self.definition.section_cls
# Get the mongo instance data as dict. This is easy to de-serialize
# into a metainfo section. If a primary key has been declared, rename
# the _id field.
mongo_dict = mongo_instance.to_mongo().to_dict()
if self.primary_key_name is not None:
mongo_dict[self.primary_key_name] = mongo_dict["_id"]
del mongo_dict["_id"]
section = section_cls.m_from_dict(mongo_dict)
section.a_mongo.mongo_instance = mongo_instance
for name, quantity in self.definition.all_quantities.items():
if quantity.m_get_annotations(Mongo) is not None:
value = getattr(mongo_instance, name)
if value is not None:
section.m_set(quantity, value)
return section
......@@ -154,27 +194,30 @@ class MongoInstance(Annotation):
The annotation that is automatically added to all instances of sections that
feature the :class:`MongoDocument` annotation.
'''
def __init__(self, section: MSection):
    # The metainfo section this annotation instance is attached to.
    self.section = section
    # The backing mongoengine document; set after a save or load.
    self.mongo_instance = None
    self._id = None
def save(self):
    ''' Saves the section as mongo entry. Does an upsert. '''
    # The best way to update a complex entry with mongoengine is to create
    # a new Document instance and specify the target ID which should be
    # updated. The target ID is taken from an old previously saved
    # instance. If no previous saves have been done, a new object will be
    # created. See discussion at:
    # https://stackoverflow.com/questions/19002469/update-a-mongoengine-document-using-a-python-dict
    data = self.section.m_to_dict()
    if self.mongo_instance is not None:
        data["id"] = self.mongo_instance.id
    mongo_instance = self.section.m_def.a_mongo.mongo_cls(**data, _created=False)
    self.mongo_instance = mongo_instance.save()
    return self.section
def create(self):
    ''' Creates a new mongo entry and saves it. '''
    # Instantiate an empty backing document; save() fills it from the section.
    self.mongo_instance = self.section.m_def.a_mongo.mongo_cls()
    return self.save()
def delete(self):
......
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy as np
from nomad.metainfo import MSection, Section, Quantity, SubSection
from nomad.metainfo.mongoengine_extension import MongoDocument, Mongo
class B(MSection):
    # Test section with its own mongo document annotation; used as a
    # non-repeating subsection of A.
    m_def = Section(a_mongo=MongoDocument())
    value = Quantity(type=str, a_mongo=Mongo())
class C(MSection):
    # Test section with a mongo document annotation; used as a repeating
    # subsection of A.
    m_def = Section(a_mongo=MongoDocument())
    value = Quantity(type=str, a_mongo=Mongo())
class D(MSection):
    # Test section WITHOUT a MongoDocument annotation on its Section: it
    # should therefore not be persisted even though its quantity is annotated.
    m_def = Section()
    value = Quantity(type=str, a_mongo=Mongo())
class A(MSection):
    """Root level document with primary key.
    """
    m_def = Section(a_mongo=MongoDocument())
    primary_id = Quantity(type=str, a_mongo=Mongo(primary_key=True))
    array = Quantity(type=float, shape=[2], a_mongo=Mongo())
    # Intentionally not annotated with Mongo(): must not reach the database.
    not_in_mongo = Quantity(type=str)
    value1 = Quantity(type=int, a_mongo=Mongo())
    value2 = Quantity(type=int, a_mongo=Mongo())
    b = SubSection(sub_section=B.m_def)
    c = SubSection(sub_section=C.m_def, repeats=True)
    d = SubSection(sub_section=D.m_def)
def test_create_new(mongo):
    # Round-trip test: a section tree saved to mongo and read back must
    # serialize to identical JSON (minus the non-annotated quantity).
    a = A()
    a.primary_id = "123"
    a.not_in_mongo = "not_in_mongo"
    b = a.m_create(B)
    b.value = "b_value"
    c = a.m_create(C)
    c.value = "c_value"
    c = a.m_create(C)
    c.value = "c_value"

    # Create JSON with the values that are supposed to be in mongo
    a_dict = a.m_to_dict()
    del a_dict["not_in_mongo"]
    a_json = json.dumps(a_dict, sort_keys=True)

    # Store to mongo
    mongo_doc = a.a_mongo
    mongo_doc.save()

    # Retrieve from mongo, and convert to JSON
    a_from_db = A.m_def.a_mongo.get(primary_id="123")
    a_from_db_json = json.dumps(a_from_db.m_to_dict(), sort_keys=True)

    # Test equality of the JSON serializations
    assert a_json == a_from_db_json
def test_update_with_new(mongo):
    """Saving a new document with an existing primary key only updates the
    quantities it carries (upsert semantics).

    Fix: takes the `mongo` pytest fixture like the sibling tests
    (test_create_new, test_annotations) so the test runs against a properly
    set-up and cleaned database instead of leftover global state.
    """
    a = A()
    a.primary_id = "123"
    a.value1 = 1
    a.value2 = 2

    # Store to mongo
    a.a_mongo.save()

    # Update with new document that has the same ID
    a_new = A()
    a_new.primary_id = "123"
    a_new.value2 = 3
    a_new.a_mongo.save()

    # Check that the document has only partly been updated
    a_from_db = A.m_def.a_mongo.get(primary_id="123")
    assert a_from_db.value1 == 1
    assert a_from_db.value2 == 3
def test_update_self(mongo):
    """Re-saving the same section after mutating it updates the stored
    document in place.

    Fix: takes the `mongo` pytest fixture like the sibling tests
    (test_create_new, test_annotations) so the test runs against a properly
    set-up and cleaned database instead of leftover global state.
    """
    a = A()
    a.primary_id = "123"
    a.value1 = 1
    a.value2 = 2

    # Store to mongo
    a.a_mongo.save()

    # Update the metainfo and resave
    a.value2 = 3
    a.a_mongo.save()

    # Check that the document has only partly been updated
    a_from_db = A.m_def.a_mongo.get(primary_id="123")
    assert a_from_db.value1 == 1
    assert a_from_db.value2 == 3
def test_annotations(mongo):
    """Test that non-annotated quantities and sections are not stored.
    """
    a = A()
    a.primary_id = "123"
    a.not_in_mongo = "not_in_mongo"
    # D's Section has no MongoDocument annotation, so this subsection must
    # not be persisted.
    d = a.m_create(D)
    d.value = "b_value"

    # Store to mongo
    a.a_mongo.save()

    # Check that values do not exist in mongodb
    a_from_db = A.m_def.a_mongo.get(primary_id="123")
    assert a_from_db.not_in_mongo is None
    assert a_from_db.d is None
def test_repeated_subsections(mongo):
    """Repeating subsections are stored and retrieved as a list.

    Fix: takes the `mongo` pytest fixture like the sibling tests
    (test_create_new, test_annotations) so the test runs against a properly
    set-up and cleaned database instead of leftover global state.
    """
    a = A()
    a.primary_id = "123"
    c = a.m_create(C)
    c.value = "c_value"
    c = a.m_create(C)
    c.value = "c_value"

    # Store to mongo
    a.a_mongo.save()

    # Check that both sections are stored in mongodb
    a_from_db = A.m_def.a_mongo.get(primary_id="123")
    assert len(a_from_db.c) == 2
def test_arrays():
a = A()
a.primary_id = "123"
a.array = np.array([1.2, 3.4])
# Store to mongo