Commit 0bb29458 authored by Markus Scheidgen
Browse files

Refactored domains. Added unique code runs as metric to backend and GUI.

parent a663d4f6
......@@ -290,7 +290,8 @@ class Repo extends React.Component {
code_runs: 'Code runs',
total_energies: 'Total energy calculations',
geometries: 'Unique geometries',
datasets: 'Datasets'
datasets: 'Datasets',
unique_code_runs: 'Unique code runs'
}
return (
<div className={classes.root}>
......@@ -323,7 +324,7 @@ class Repo extends React.Component {
<ExpansionPanel>
<ExpansionPanelSummary expandIcon={<ExpandMoreIcon/>} className={classes.searchSummary}>
<Typography variant="h6" style={{textAlign: 'center', width: '100%', fontWeight: 'normal'}}>
Found <b>{metrics.code_runs}</b> code runs
Found <b>{metrics.code_runs}</b>{metric === 'unique_code_runs' ? (<span>(<b>{metrics.unique_code_runs}</b> unique)</span>) : ''} code runs
{metric === 'geometries' ? (<span> that simulate <b>{metrics.geometries}</b> unique geometries</span>) : ''}
{metric === 'total_energies' ? (<span> with <b>{metrics.total_energies}</b> total energy calculations</span>) : ''}
{metric === 'datasets' ? (<span> curated in <b>{metrics.datasets}</b> datasets</span>) : ''}.
......@@ -334,7 +335,7 @@ class Repo extends React.Component {
<FormControl>
<FormLabel>Metric used in statistics: </FormLabel>
<FormGroup row>
{['code_runs', 'total_energies', 'geometries', 'datasets'].map(metric => (
{['code_runs', 'unique_code_runs', 'total_energies', 'geometries', 'datasets'].map(metric => (
<FormControlLabel key={metric}
control={
<Checkbox checked={this.state.metric === metric} onChange={() => this.handleMetricChange(metric)} value={metric} />
......
......@@ -74,11 +74,11 @@ repo_calcs_model = api.model('RepoCalculations', {
'scroll_id': fields.String(description='Id of the current scroll view in scroll based search.'),
'aggregations': fields.Raw(description=(
'A dict with all aggregations. Each aggregation is dictionary with a metrics dict as '
'value and quantity value as key. The metrics are code runs(calcs), total energies, '
'geometries, and datasets')),
'value and quantity value as key. The metrics are code runs(calcs), %s. ' %
', '.join(search.metrics_names))),
'metrics': fields.Raw(description=(
'A dict with the overall metrics. The metrics are code runs(calcs), total energies, '
'geometries, and datasets'))
'A dict with the overall metrics. The metrics are code runs(calcs), %s.' %
', '.join(search.metrics_names)))
})
repo_request_parser = pagination_request_parser.copy()
......@@ -92,11 +92,11 @@ repo_request_parser.add_argument(
repo_request_parser.add_argument(
'total_metrics', type=str, help=(
'Metrics to aggregate all search results over.'
'Possible values are total_energies, geometries, and datasets.'))
'Possible values are %s.' % ', '.join(search.metrics_names)))
repo_request_parser.add_argument(
'aggregation_metrics', type=str, help=(
'Metrics to aggregate all aggregation buckets over as comma separated list. '
'Possible values are total_energies, geometries, and datasets.'))
'Possible values are %s.' % ', '.join(search.metrics_names)))
for search_quantity in search.search_quantities.keys():
_, _, description = search.search_quantities[search_quantity]
......@@ -147,10 +147,10 @@ class RepoCalcsResource(Resource):
total_metrics = [
metric for metric in total_metrics_str.split(',')
if metric in ['total_energies', 'geometries', 'datasets']]
if metric in search.metrics_names]
aggregation_metrics = [
metric for metric in aggregation_metrics_str.split(',')
if metric in ['total_energies', 'geometries', 'datasets']]
if metric in search.metrics_names]
except Exception:
abort(400, message='bad parameter types')
......
......@@ -51,4 +51,4 @@ from nomad.datamodel.base import UploadWithMetadata, CalcWithMetadata, Domain
from nomad.datamodel.dft import DFTCalcWithMetadata
# Override the CalcWithMetadata with the domain specific descendant
setattr(sys.modules['nomad.datamodel'], 'CalcWithMetadata', Domain.domain_class)
setattr(sys.modules['nomad.datamodel'], 'CalcWithMetadata', Domain.instance.domain_entry_class)
......@@ -46,6 +46,10 @@ class CalcWithMetadata():
mappings between all combinations, just implement mappings with the class and use
mapping transitivity. E.g. instead of A -> B, A -> this -> B.
This is basically an abstract class and it has to be subclassed for each :class:`Domain`.
Subclasses can define additional attributes and have to implement :func:`apply_domain_metadata`
to fill these attributes from processed entries, i.e. instances of :class:`nomad.parsing.LocalBackend`.
Attributes:
upload_id: The ``upload_id`` of the calculations upload (random UUID).
calc_id: The unique mainfile based calculation id.
......@@ -188,7 +192,7 @@ class Domain:
A domain defines all metadata quantities that are specific to a certain scientific
domain, e.g. DFT calculations, or experimental material science.
For each domain there needs to define a subclass of :class:`CalcWithMetadata`. This
Each domain needs to define a subclass of :class:`CalcWithMetadata`. This
class has to define the necessary domain specific metadata quantities and how these
are filled from parser results (usually an instance of :class:`LocalBackend`).
......@@ -198,20 +202,30 @@ class Domain:
While multiple domains can be registered, currently only one domain can be
active. This active domain is defined in the configuration using the ``domain_name``.
Arguments:
name: A name for the domain. This is used as key in the configuration ``config.domain``.
domain_entry_class: A subclass of :class:`CalcWithMetadata` that adds the
domain specific quantities.
quantities: Additional specifications for the quantities in ``domain_entry_class`` as
instances of :class:`DomainQuantity`.
"""
domain_class: Type[CalcWithMetadata] = None
quantities: List[DomainQuantity] = []
instance: 'Domain' = None
@classmethod
def register_domain(cls, domain_class: type, domain_name: str, quantities: Dict[str, DomainQuantity]):
assert cls.domain_class is None, 'you can only define one domain.'
def __init__(
self, name: str, domain_entry_class: Type[CalcWithMetadata],
quantities: Dict[str, DomainQuantity]) -> None:
if not domain_name == config.domain:
return
assert Domain.instance is None, 'you can only define one domain.'
cls.domain_class = domain_class
if name == config.domain:
Domain.instance = self
reference_domain_calc = domain_class()
self.name = name
self.domain_entry_class = domain_entry_class
self.quantities: List[DomainQuantity] = []
reference_domain_calc = domain_entry_class()
reference_general_calc = CalcWithMetadata()
for name, value in reference_domain_calc.__dict__.items():
......@@ -222,11 +236,39 @@ class Domain:
quantities[name] = quantity
quantity.name = name
quantity.multi = isinstance(value, list)
cls.quantities.append(quantity)
self.quantities.append(quantity)
for name in quantities.keys():
assert hasattr(reference_domain_calc, name) and not hasattr(reference_general_calc, name), \
'quantity does not exist or overrides general non domain quantity'
assert any(quantity.order_default for quantity in Domain.quantities), \
assert any(quantity.order_default for quantity in Domain.instance.quantities), \
'you need to define a order default quantity'
@property
def metrics(self) -> Dict[str, Tuple[str, str]]:
"""
The metrics specification used for search aggregations. See :func:`nomad.search.metrics`.
"""
return {
quantity.metric[0]: (quantity.metric[1], quantity.name)
for quantity in self.quantities
if quantity.metric is not None
}
@property
def metrics_names(self) -> Iterable[str]:
""" Just the names of all metrics. """
return self.metrics.keys()
@property
def aggregations(self) -> Dict[str, int]:
"""
The search aggregations and the maximum number of calculated buckets. See also
:func:`nomad.search.aggregations`.
"""
return {
quantity.name: quantity.aggregations
for quantity in self.quantities
if quantity.aggregations > 0
}
......@@ -179,7 +179,7 @@ class DFTCalcWithMetadata(CalcWithMetadata):
self.n_geometries = n_geometries
Domain.register_domain(DFTCalcWithMetadata, 'DFT', quantities=dict(
Domain('DFT', DFTCalcWithMetadata, quantities=dict(
formula=DomainQuantity(
'The chemical (hill) formula of the simulated system.',
order_default=True),
......@@ -195,7 +195,7 @@ Domain.register_domain(DFTCalcWithMetadata, 'DFT', quantities=dict(
crystal_system=DomainQuantity(
'The crystal system type of the simulated system.', aggregations=10),
code_name=DomainQuantity(
'The code name.', aggregations=10),
'The code name.', aggregations=40),
spacegroup=DomainQuantity('The spacegroup of the simulated system as number'),
spacegroup_symbol=DomainQuantity('The spacegroup as international short symbol'),
geometries=DomainQuantity(
......
......@@ -70,7 +70,7 @@ class Dataset(InnerDoc):
class WithDomain(IndexMeta):
""" Override elasticsearch_dsl metaclass to sneak in domain specific mappings """
def __new__(cls, name, bases, attrs):
for quantity in datamodel.Domain.quantities:
for quantity in datamodel.Domain.instance.quantities:
attrs[quantity.name] = quantity.elastic_mapping
return super(WithDomain, cls).__new__(cls, name, bases, attrs)
......@@ -134,7 +134,7 @@ class Entry(Document, metaclass=WithDomain):
self.references = [ref.value for ref in source.references]
self.datasets = [Dataset.from_dataset_popo(ds) for ds in source.datasets]
for quantity in datamodel.Domain.quantities:
for quantity in datamodel.Domain.instance.quantities:
setattr(self, quantity.name, getattr(source, quantity.name))
......@@ -164,14 +164,9 @@ def refresh():
infrastructure.elastic_client.indices.refresh(config.elastic.index_name)
aggregations: Dict[str, int] = {}
aggregations = datamodel.Domain.instance.aggregations
""" The available aggregations in :func:`aggregate_search` and their maximum aggregation size """
for quantity in datamodel.Domain.quantities:
if quantity.aggregations > 0:
aggregations[quantity.name] = quantity.aggregations
search_quantities = {
'authors': ('term', 'authors.name.keyword', (
'Search for the given author. Exact keyword matches in the form "Lastname, Firstname".')),
......@@ -192,13 +187,14 @@ The available search quantities in :func:`aggregate_search` as tuples with *sear
elastic field and description.
"""
for quantity in datamodel.Domain.quantities:
for quantity in datamodel.Domain.instance.quantities:
search_spec = ('term', quantity.name, quantity.description)
search_quantities[quantity.name] = search_spec
metrics = {
'datasets': ('cardinality', 'datasets.id'),
'unique_code_runs': ('cardinality', 'calc_hash')
}
"""
The available search metrics. Metrics are integer values given for each entry that can
......@@ -206,14 +202,14 @@ be used in aggregations, e.g. the sum of all total energy calculations or cardin
all unique geometries.
"""
for quantity in datamodel.Domain.quantities:
if quantity.metric is not None:
metric, aggregation = quantity.metric
metrics[metric] = (aggregation, quantity.name)
for key, value in datamodel.Domain.instance.metrics.items():
metrics[key] = value
metrics_names = list(metric for metric in metrics.keys())
order_default_quantity = None
for quantity in datamodel.Domain.quantities:
for quantity in datamodel.Domain.instance.quantities:
if quantity.order_default:
order_default_quantity = quantity.name
......@@ -304,8 +300,9 @@ def aggregate_search(
**kwargs) -> Tuple[int, List[dict], Dict[str, Dict[str, Dict[str, int]]], Dict[str, int]]:
"""
Performs a search and returns paginated search results and aggregations. The aggregations
contain overall and per quantity value sums of code runs (calcs), datasets, total energies,
and unique geometries.
contain overall and per quantity value sums of code runs (calcs), unique code runs, datasets,
and additional domain specific metrics (e.g. total energies, and unique geometries for DFT
calculations).
Arguments:
page: The page to return starting with page 1
......@@ -313,7 +310,7 @@ def aggregate_search(
q: An *elasticsearch_dsl* query used to further filter the results (via `and`)
aggregations: A customized list of aggregations to perform. Keys are index fields,
and values the amount of buckets to return. Only works on *keyword* field.
aggregation_metrics: The metrics used to aggregate over. Can be ``datasets``,
aggregation_metrics: The metrics used to aggregate over. Can be ``unique_code_runs``, ``datasets``,
or other domain specific metrics. The basic doc_count metric ``code_runs`` is always given.
total_metrics: The metrics used for total numbers (see ``aggregation_metrics``).
**kwargs: Quantity, value pairs to search for.
......@@ -364,7 +361,7 @@ def aggregate_search(
for bucket in getattr(response.aggregations, aggregation).buckets
}
for aggregation in aggregations.keys()
if aggregation not in ['total_energies', 'geometries', 'datasets']
if aggregation not in metrics_names
}
total_metrics_result = get_metrics(response.aggregations, total_metrics, total_results)
......
......@@ -21,9 +21,18 @@ content-type: application/json
}
###
# Delete the calc index
# Delete unpublished calcs
DELETE http://localhost:9200/calcs HTTP/1.1
POST http://localhost:19200/fairdi_nomad_migration/_delete_by_query HTTP/1.1
Content-Type: application/json
{
"query": {
"match": {
"published": true
}
}
}
###
......@@ -55,41 +64,3 @@ content-type: application/json
}
}
}
###
POST http://localhost:9200/calcs/_update_by_query HTTP/1.1
content-type: application/json
{
"script": {
"inline": "ctx._source.staging=false",
"lang": "painless"
},
"query": {
"match": {
"upload_id": "epGwyPvaQ720_COyRvyLEg"
}
}
}
###
GET http://localhost:19200/fairdi_nomad_migration/_search HTTP/1.1
content-type: application/json
{
# "query": {
# "term": {
# "calc_hash": "FuBNAsvdEooB_dKcM-dLAEnpaTEX"
# }
# },
"aggs": {
"hash": {
"terms": {
"field": "calc_hash",
"size": 10000,
"show_term_doc_count_error": true
}
}
}
}
\ No newline at end of file
......@@ -644,7 +644,9 @@ class TestRepo():
assert len(aggregations['system']) == 1
assert value in aggregations['system']
@pytest.mark.parametrize('metrics', [[], ['total_energies'], ['geometries'], ['datasets'], ['total_energies', 'geometries', 'datasets']])
metrics_permutations = [[], search.metrics_names] + [[metric] for metric in search.metrics_names]
@pytest.mark.parametrize('metrics', metrics_permutations)
def test_search_total_metrics(self, client, example_elastic_calcs, no_warn, metrics):
rv = client.get('/repo/?total_metrics=%s' % ','.join(metrics))
assert rv.status_code == 200
......@@ -654,7 +656,7 @@ class TestRepo():
for metric in metrics:
assert metric in metrics_result
@pytest.mark.parametrize('metrics', [[], ['total_energies'], ['geometries'], ['datasets'], ['total_energies', 'geometries', 'datasets']])
@pytest.mark.parametrize('metrics', metrics_permutations)
def test_search_aggregation_metrics(self, client, example_elastic_calcs, no_warn, metrics):
rv = client.get('/repo/?aggregation_metrics=%s' % ','.join(metrics))
assert rv.status_code == 200
......
......@@ -56,7 +56,7 @@ def test_search(elastic, normalized: parsing.LocalBackend):
create_entry(calc_with_metadata)
refresh_index()
use_metrics = ['datasets', 'geometries', 'total_energies']
use_metrics = search.metrics_names
total, hits, aggs, metrics = aggregate_search(
aggregation_metrics=use_metrics,
......@@ -68,14 +68,13 @@ def test_search(elastic, normalized: parsing.LocalBackend):
example_agg = aggs['system']['bulk']
def assert_metrics(container):
def assert_metrics(container, metrics_names):
assert container['code_runs'] == 1
assert 'datasets' in container
assert 'geometries' in container
assert 'total_energies' in container
for metric in metrics_names:
assert metric in container
assert_metrics(example_agg)
assert_metrics(metrics)
assert_metrics(example_agg, use_metrics)
assert_metrics(metrics, use_metrics)
assert 'quantities' not in hits[0]
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment