calc.py 15.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List
import json
from sqlalchemy import Column, Integer, String, ForeignKey
18
19
from sqlalchemy.orm import relationship, aliased
from sqlalchemy.sql.expression import literal
20
from datetime import datetime
21

22
from nomad import infrastructure, utils, config
23
from nomad.datamodel import DFTCalcWithMetadata
24

25
from . import base
26
27
from .user import User
from .base import Base, calc_citation_association, ownership, co_authorship, shareship, \
Markus Scheidgen's avatar
Markus Scheidgen committed
28
29
    Tag, Topics, CalcSet, calc_dataset_containment, Citation, Spacegroup, CalcMetaData, \
    CodeVersion, StructRatio, UserMetaData
30
31


32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
handle_base = '0123456789abcdefghijklmnopqrstuvwxyz'


def create_handle(pid: int) -> str:
    """
    Create a handle for the given pid. The pid is an autoincrement number. The handle
    is a 'base32' encoded string of that number, i.e. only the first 32 characters of
    ``handle_base`` are used as digits. Therefore, its string representation is a
    little shorter. The handle is prefixed with the configured handle prefix.

    A pid that is not greater than zero yields no digits, i.e. only the prefix.
    """

    # collect the digits from least to most significant, 5 bits at a time
    digits = []
    remaining = pid
    while remaining > 0:
        digits.append(handle_base[remaining & 31])
        remaining >>= 5

    encoded = ''.join(reversed(digits))
    return config.repository_db.handle_prefix + encoded


51
class PublishContext:
    """
    Utilities necessary during adding calculations to the repo db.
    Caches queries to avoid unnecessary flushes while bulk creating calcs.
    Faster than even SQLAlchemy with ``autoflush=False``, because of reasons.
    Access to a logger with bound data about the upload, etc.

    Arguments:
        **kwargs: Data that is bound to the logger of this context.
    """

    def __init__(self, **kwargs):
        self._cache = {}
        self.logger = utils.get_logger(__name__, **kwargs)

    def cache(self, entity, **kwargs):
        """
        Returns the first instance of ``entity`` that matches the given ``kwargs``
        (used as ``filter_by`` criteria). Results are looked up in the context's cache
        first; only on a miss the repository db is queried.

        Only found instances are cached. If nothing matches, None is returned and the
        next call with the same arguments will query the db again.

        Arguments:
            entity: The mapped class to query for.
            **kwargs: Column values the instance has to match; must be JSON serializable.
        """
        # The key has to contain the name of the entity itself. ``entity`` is a class,
        # i.e. ``entity.__class__`` would be the (shared) metaclass and different
        # entities queried with the same criteria would end up with the same key.
        entity_name = getattr(entity, '__name__', None) or entity.__class__.__name__
        # sort_keys makes the key independent of the keyword argument order
        key = json.dumps(dict(entity=entity_name, **kwargs), sort_keys=True)

        value = self._cache.get(key, None)
        if value is None:
            value = infrastructure.repository_db.query(entity).filter_by(**kwargs).first()
            if value is not None:
                self._cache[key] = value
        return value


73
74
75
class IllegalCalcMetadata(Exception):
    """
    Raised when the metadata of a calculation cannot be applied to the repository db,
    e.g. because it refers to a user that does not exist.
    """


Markus Scheidgen's avatar
Markus Scheidgen committed
76
class Calc(Base):
    """
    ORM mapping for the ``calculations`` table of the repository db.

    Besides the mapped columns and relationships, the class provides read access to
    the metadata of a calculation via properties, and the conversion from and to
    :class:`DFTCalcWithMetadata` via :func:`apply_calc_with_metadata` and
    :func:`to_calc_with_metadata`. Datasets are stored as calculations too; they are
    the ``parents`` of the calculations they contain.
    """
    __tablename__ = 'calculations'

    coe_calc_id = Column('calc_id', Integer, primary_key=True, autoincrement=True)
    handlepid = Column(String)
    origin_id = Column(Integer, ForeignKey('uploads.upload_id'))
    upload = relationship('Upload', lazy='joined')
    checksum = Column(String)

    calc_metadata = relationship('CalcMetaData', uselist=False, lazy='joined')
    user_metadata = relationship('UserMetaData', uselist=False, lazy='joined')
    citations = relationship('Citation', secondary=calc_citation_association, lazy='joined')
    owners = relationship('User', secondary=ownership, lazy='joined')
    coauthors = relationship('User', secondary=co_authorship, lazy='joined')
    shared_with = relationship('User', secondary=shareship, lazy='joined')
    tags = relationship('Tag', lazy='subquery', join_depth=1)
    spacegroup = relationship('Spacegroup', lazy='joined', uselist=False)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The ids of all topics this calc was tagged with via _set_value; used to
        # detect double tags.
        # NOTE(review): SQLAlchemy does not call __init__ for instances that are loaded
        # from the database, so this attribute presumably only exists on newly
        # constructed objects -- confirm before using _set_value on loaded calcs.
        self.topic_ids = {}

    # self-referential many-to-many relation: the datasets (parents) that directly
    # contain this calc; the backref ``children`` gives the opposite direction
    parents = relationship(
        'Calc',
        secondary=calc_dataset_containment,
        primaryjoin=calc_dataset_containment.c.children_calc_id == coe_calc_id,
        secondaryjoin=calc_dataset_containment.c.parent_calc_id == coe_calc_id,
        backref='children', lazy='subquery', join_depth=1)

    @staticmethod
    def from_calc_id(calc_id: str) -> 'Calc':
        """
        Returns the calc that has the given ``calc_id`` as checksum, None if there is
        no such calc. Asserts that there is not more than one.
        """
        repo_db = infrastructure.repository_db
        calcs = repo_db.query(Calc).filter_by(checksum=calc_id)
        assert calcs.count() <= 1, 'Calc id/checksum must be unique'
        return calcs.first()

    @classmethod
    def load_from(cls, obj):
        """ Returns the calc with the ``pid`` of the given object, None if there is none. """
        repo_db = infrastructure.repository_db
        return repo_db.query(Calc).filter_by(coe_calc_id=int(obj.pid)).first()

    @property
    def mainfile(self) -> str:
        """ The ``location`` stored in the calc metadata. """
        return self.calc_metadata.location

    @property
    def pid(self) -> int:
        """ The primary key of the calc, i.e. ``coe_calc_id``. """
        return self.coe_calc_id

    @property
    def comment(self) -> str:
        """ The ``label`` stored in the user metadata. """
        return self.user_metadata.label

    @property
    def calc_id(self) -> str:
        """ The ``checksum`` of the calc. """
        return self.checksum

    @property
    def references(self) -> List[str]:
        """ The values of all citations of kind ``EXTERNAL``. """
        return [citation.value for citation in self.citations if citation.kind == 'EXTERNAL']

    @property
    def uploader(self) -> User:
        """ The single owner of the calc. Asserts that there is exactly one owner. """
        assert len(self.owners) == 1, 'A calculation must have exactly one owner.'
        return self.owners[0]

    @property
    def with_embargo(self) -> bool:
        """ True, if the ``permission`` stored in the user metadata is ``1``. """
        return self.user_metadata.permission == 1

    @property
    def formula(self) -> str:
        """ The ``chemical_formula`` stored in the calc metadata. """
        return self.calc_metadata.chemical_formula

    @property
    def files(self) -> List[str]:
        """
        The file names stored in the calc metadata. They are stored as utf-8 encoded
        JSON list. An empty list is returned, if there is no metadata or no file names.
        """
        if self.calc_metadata is None or self.calc_metadata.filenames is None:
            return []

        return json.loads(self.calc_metadata.filenames.decode('utf-8'))

    @property
    def all_datasets(self) -> List['DataSet']:
        """
        All datasets that contain this calc directly or indirectly, i.e. all calcs that
        can be reached by following the containment relation from children to parents.
        """
        assert self.coe_calc_id is not None
        repo_db = infrastructure.repository_db

        # recursive CTE: starts with the id of this calc and adds the parent ids of all
        # ids that are already part of the result
        query = repo_db.query(literal(self.coe_calc_id).label('coe_calc_id')).cte(recursive=True)
        right = aliased(query)
        left = aliased(CalcSet)
        query = query.union_all(repo_db.query(left.parent_calc_id).join(
            right, right.c.coe_calc_id == left.children_calc_id))
        query = repo_db.query(query)

        # the CTE result includes the calc itself, which is not one of its datasets
        dataset_calc_ids = [r[0] for r in query if r[0] != self.coe_calc_id]
        if not dataset_calc_ids:
            return []

        dataset_calcs = repo_db.query(Calc).filter(Calc.coe_calc_id.in_(dataset_calc_ids))
        return [DataSet(dataset_calc) for dataset_calc in dataset_calcs]

    @property
    def direct_datasets(self) -> List['DataSet']:
        """ The datasets that directly contain this calc, i.e. its ``parents``. """
        return [DataSet(dataset_calc) for dataset_calc in self.parents]

    def _set_value(self, topic_cid: int, value: str, context: PublishContext) -> None:
        """
        Tags this calc with the topic of the given category (``topic_cid``) and value.
        The topic is created, if it does not exist yet. Nothing happens if ``value`` is
        None. A warning is logged, if the calc was already tagged with the topic.
        """
        if value is None:
            return

        repo_db = infrastructure.repository_db
        topic = context.cache(Topics, cid=topic_cid, topic=value)
        if not topic:
            topic = Topics(cid=topic_cid, topic=value)
            repo_db.add(topic)
            # flush to have the tid of the new topic available
            repo_db.flush()

        if topic.tid not in self.topic_ids:
            tag = Tag(calc=self, topic=topic)
            self.topic_ids[topic.tid] = topic.tid
            repo_db.add(tag)
        else:
            # the upload is optional, do not fail on a mere warning
            upload_id = self.upload.upload_id if self.upload is not None else None
            logger = utils.get_logger(
                __name__, calc_id=self.calc_id, upload_id=upload_id)
            logger.warning('double tag on same calc', cid=topic.cid, tid=topic.tid, value=topic.topic)

    # Cache for the datasets of parent calcs used in to_calc_with_metadata. It is a
    # class attribute, i.e. shared by all calcs.
    # NOTE(review): entries are never removed and keep the cached objects alive --
    # confirm that this is acceptable for long running processes.
    _dataset_cache: dict = {}

    def apply_calc_with_metadata(self, calc: DFTCalcWithMetadata, context: PublishContext) -> None:
        """
        Applies the data from ``calc`` to this coe Calc object.

        Creates the calc metadata, struct ratio, user metadata, spacegroup, topic tags,
        user relations, dataset relations, and citations for this calc and adds them to
        the repository db session. Some of the given metadata is updated in place with
        data from the repository db (users, datasets, and the uploader if it was not
        given).

        Arguments:
            calc: The metadata to apply.
            context: The context used to cache queries and for logging.

        Raises:
            IllegalCalcMetadata: If the metadata refers to a user that does not exist.
        """
        repo_db = infrastructure.repository_db

        self.checksum = calc.calc_id
        source_code_version = calc.code_version  # TODO shorten version names
        code_version_obj = context.cache(CodeVersion, content=source_code_version)
        if code_version_obj is None:
            code_version_obj = CodeVersion(content=source_code_version)
            repo_db.add(code_version_obj)
            repo_db.flush()

        # the upload time of the metadata takes precedence over the upload's time
        if calc.upload_time is not None:
            added_time = calc.upload_time
        elif self.upload is not None and self.upload.upload_time is not None:
            added_time = self.upload.upload_time
        else:
            added_time = datetime.now()

        # The file names are stored as compact JSON list. Using json (instead of string
        # formatting) keeps the value readable by ``files`` for names that contain
        # quotes or backslashes; for all other names the stored bytes are the same.
        filenames = json.dumps(
            [str(filename) for filename in calc.files],
            separators=(',', ':'), ensure_ascii=False).encode('utf-8')

        metadata = CalcMetaData(
            calc=self,
            added=added_time,
            chemical_formula=calc.formula,
            filenames=filenames,
            location=calc.mainfile,
            version=code_version_obj)
        repo_db.add(metadata)

        struct_ratio = StructRatio(
            calc=self,
            chemical_formula=calc.formula,
            formula_units=1, nelem=len(calc.atoms))
        repo_db.add(struct_ratio)

        user_metadata = UserMetaData(
            calc=self,
            label=calc.comment,
            permission=(1 if calc.with_embargo else 0))
        repo_db.add(user_metadata)

        # everything that is not a number (including a missing spacegroup) is stored as '0'
        source_spacegroup = calc.spacegroup
        if source_spacegroup is not None and (
                isinstance(source_spacegroup, int) or source_spacegroup.isdigit()):
            spacegroup = Spacegroup(calc=self, n=source_spacegroup)
        else:
            spacegroup = Spacegroup(calc=self, n='0')
        repo_db.add(spacegroup)

        # topic based properties
        self._set_value(base.topic_code, calc.code_name, context)
        for atom in set(calc.atoms):
            self._set_value(base.topic_atoms, str(atom), context)
        self._set_value(base.topic_system_type, calc.system, context)
        self._set_value(base.topic_xc_treatment, calc.xc_functional, context)
        self._set_value(base.topic_crystal_system, calc.crystal_system, context)
        self._set_value(base.topic_basis_set_type, calc.basis_set, context)

        # user relations
        def add_users_to_relation(source_users, relation):
            for source_user in source_users:
                coe_user = context.cache(User, user_id=int(source_user.id))
                if coe_user is None:
                    raise IllegalCalcMetadata(
                        'User with user_id %s does not exist.' % source_user.id)
                # complete the given user data with the data from the repository db
                source_user.update(coe_user.to_popo())
                relation.append(coe_user)

        if calc.uploader is not None:
            add_users_to_relation([calc.uploader], self.owners)
        elif self.upload is not None and self.upload.user is not None:
            self.owners.append(self.upload.user)
            calc.uploader = self.upload.user.to_popo()

        add_users_to_relation(calc.coauthors, self.coauthors)
        add_users_to_relation(calc.shared_with, self.shared_with)

        # datasets
        seen_dataset_ids = set()
        for dataset in calc.datasets:
            dataset_id = dataset.id
            # the same dataset might be given multiple times
            if dataset_id in seen_dataset_ids:
                continue
            seen_dataset_ids.add(dataset_id)

            coe_dataset_calc: Calc = context.cache(Calc, coe_calc_id=dataset_id)
            if coe_dataset_calc is None:
                coe_dataset_calc = Calc(coe_calc_id=dataset_id)
                repo_db.add(coe_dataset_calc)

                # the upload is optional, fall back to the time used for the calc itself
                dataset_added_time = (
                    self.upload.upload_time if self.upload is not None else added_time)
                metadata = CalcMetaData(
                    calc=coe_dataset_calc,
                    added=dataset_added_time,
                    chemical_formula=dataset.name)
                repo_db.add(metadata)
                repo_db.flush()

                if dataset.doi is not None:
                    self._add_citation(coe_dataset_calc, dataset.doi['value'], 'INTERNAL', context)

                # cause a flush to create the backdirection of the above established
                # metadata-dataset_calc relation
                repo_db.flush()

            self.parents.append(coe_dataset_calc)

            dataset.update(DataSet(coe_dataset_calc).to_popo())

        # references
        for reference in calc.references:
            self._add_citation(self, reference['value'], 'EXTERNAL', context)

        repo_db.flush()

    def _add_citation(self, coe_calc: 'Calc', value: str, kind: str, context: PublishContext) -> None:
        """
        Adds the citation with the given ``value`` and ``kind`` to the given calc. The
        citation is created, if it does not exist yet. If ``value`` or ``kind`` is
        None, a warning is logged and nothing is added.
        """
        if value is None or kind is None:
            context.logger.warning(
                'citation without value or kind str', value=value, kind=kind, calc_id=self.calc_id)
            return

        repo_db = infrastructure.repository_db
        citation = context.cache(Citation, value=value, kind=kind)

        if citation is None:
            citation = Citation(value=value, kind=kind)
            repo_db.add(citation)

        coe_calc.citations.append(citation)

    def to_calc_with_metadata(self) -> DFTCalcWithMetadata:
        """
        Creates a :class:`DFTCalcWithMetadata` instance with UCPM ids, and all UMD/CMD.
        Be aware that ``upload_id`` and ``calc_id``, might be old coe repository
        ``upload_name`` and calculation ``checksum`` depending on the context, i.e. used
        database.

        Raises:
            KeyError: If the calc is tagged with a topic of an unknown category.
        """
        result = DFTCalcWithMetadata(
            upload_id=self.upload.upload_id if self.upload else None,
            calc_id=self.checksum)

        result.pid = self.pid
        result.mainfile = self.mainfile
        result.files = self.files

        for tag in self.tags:
            topic = tag.topic
            if topic is None:
                continue

            if topic.cid == base.topic_code:
                result.code_name = topic.topic
            elif topic.cid == base.topic_basis_set_type:
                result.basis_set = topic.topic
            elif topic.cid == base.topic_xc_treatment:
                result.xc_functional = topic.topic
            elif topic.cid == base.topic_system_type:
                result.system = topic.topic
            elif topic.cid == base.topic_atoms:
                result.atoms.append(topic.topic)
            elif topic.cid == base.topic_crystal_system:
                result.crystal_system = topic.topic
            elif topic.cid in [1996, 1994, 703, 702, 701, 100]:
                # user/author, restriction, formulas?, another category
                pass
            else:
                raise KeyError('topic cid %s.' % str(topic.cid))

        result.code_version = self.calc_metadata.version.content
        result.formula = self.calc_metadata.chemical_formula
        if self.spacegroup is not None:
            result.spacegroup = self.spacegroup.n
        result.atoms.sort()

        # collect the direct datasets and all datasets that contain them; the latter
        # are cached per parent, because many calcs share the same parents
        datasets: List[DataSet] = []
        for parent in self.parents:
            parents = Calc._dataset_cache.get(parent, None)
            if parents is None:
                parents = parent.all_datasets
                Calc._dataset_cache[parent] = parents
            datasets.append(DataSet(parent))
            datasets.extend(parents)

        result.uploader = self.uploader.to_popo()
        result.upload_time = self.calc_metadata.added
        result.datasets = [ds.to_popo() for ds in datasets]
        result.with_embargo = self.with_embargo
        result.comment = self.comment
        result.references = [
            citation.to_popo() for citation in self.citations
            if citation.kind == 'EXTERNAL']
        result.coauthors = [user.to_popo() for user in self.coauthors]
        result.shared_with = [user.to_popo() for user in self.shared_with]

        return result
397
398


399
400
401
402
403
404
class DataSet:
    """
    A read-only view on a calc that represents a dataset.

    Arguments:
        dataset_calc: The calc that represents the dataset.
    """
    def __init__(self, dataset_calc: 'Calc') -> None:
        self._dataset_calc = dataset_calc

    @property
    def id(self):
        """ The ``coe_calc_id`` of the dataset calc. """
        return self._dataset_calc.coe_calc_id

    @property
    def doi(self) -> 'Citation':
        """
        The citation of kind ``INTERNAL`` of the dataset calc, None if there is none.
        If there are multiple, a warning is logged and the last one is returned.
        """
        doi = None
        for citation in self._dataset_calc.citations:
            if citation.kind == 'INTERNAL':
                if doi is not None:
                    utils.get_logger(__name__).warning(
                        'dataset with multiple dois', dataset_id=self.id)
                doi = citation
        return doi

    @property
    def name(self):
        """ The name of the dataset; it is stored as ``chemical_formula`` in the calc metadata. """
        return self._dataset_calc.calc_metadata.chemical_formula

    def to_popo(self):
        """ Returns a :class:`utils.POPO` with the ``id``, ``name``, and ``doi`` of the dataset. """
        # evaluate the property only once: each access iterates all citations and
        # might log a warning
        doi = self.doi
        return utils.POPO(
            id=self.id,
            name=self.name,
            doi=doi.to_popo() if doi is not None else None)