diff --git a/nomad_dos_fingerprints/DOSfingerprint.py b/nomad_dos_fingerprints/DOSfingerprint.py
index 20cf8972fd50d4a92607ab09da3299593db730e6..97cb2a1b6e89cdf46d9b0f3537b7ef9ed832eb1f 100644
--- a/nomad_dos_fingerprints/DOSfingerprint.py
+++ b/nomad_dos_fingerprints/DOSfingerprint.py
@@ -12,14 +12,28 @@ class DOSFingerprint():
         self.indices = []
         self.stepsize = stepsize
         self.filling_factor = 0
+        self.grid_id = None
 
     def calculate(self, dos_energies, dos_values):
         energy, dos = self._convert_dos(dos_energies, dos_values)
         raw_energies, raw_dos = self._integrate_to_bins(energy, dos)
         grid = Grid().create()
+        self.grid_id = grid.get_grid_id()
         self.indices, self.bins = self._calculate_bytes(raw_energies, raw_dos, grid)
         return self
 
+    def to_dict(self):
+        """
+        Returns the data of the fingerprint (bins, indices, stepsize, grid_id and filling_factor) as a dictionary.
+        """
+        return {
+            'bins': self.bins,
+            'indices': self.indices,
+            'stepsize': self.stepsize,
+            'grid_id': self.grid_id,
+            'filling_factor': self.filling_factor,
+        }
+
     def _integrate_to_bins(self, xs, ys):
         """
         Performs stepwise numerical integration of ``ys`` over the range of ``xs``. The stepsize of the generated histogram is controlled by DOSFingerprint().stepsize.
diff --git a/nomad_dos_fingerprints/__init__.py b/nomad_dos_fingerprints/__init__.py
index c99fd870690a71a0b489eb800ac6ac821127833d..678b38f80ff0b04241c565545069697bdaefd08f 100644
--- a/nomad_dos_fingerprints/__init__.py
+++ b/nomad_dos_fingerprints/__init__.py
@@ -1,2 +1,3 @@
 from .DOSfingerprint import DOSFingerprint
-from .grid import Grid
\ No newline at end of file
+from .grid import Grid
+from .similarity import tanimoto_similarity
\ No newline at end of file
diff --git a/nomad_dos_fingerprints/similarity.py b/nomad_dos_fingerprints/similarity.py
new file mode 100644
index 0000000000000000000000000000000000000000..8031f0907e913cf728f2e6a7bdb039b6266df103
--- /dev/null
+++ b/nomad_dos_fingerprints/similarity.py
@@ -0,0 +1,45 @@
+import numpy as np
+from bitarray import bitarray
+
+def tanimoto_similarity(fingerprint1, fingerprint2):
+    """
+    Calculates the Tanimoto coefficient of two DOS fingerprints.
+
+    The fingerprint that starts at the higher grid index is shifted by the index
+    difference; only the range that is covered by both fingerprints is compared.
+
+    Args:
+        fingerprint1: fingerprint object providing ``grid_id`` and ``to_dict()``.
+        fingerprint2: fingerprint object providing ``grid_id`` and ``to_dict()``.
+
+    Returns:
+        ``c / (a + b - c)``, where ``a`` and ``b`` are the numbers of set bits of the
+        two fingerprints in the shared range and ``c`` is the number of bits set in
+        both. ``0.0`` is returned if no bit is set in the shared range.
+
+    Raises:
+        AssertionError: if the fingerprints have different grid ids.
+    """
+    if fingerprint1.grid_id != fingerprint2.grid_id:
+        raise AssertionError('Can not calculate similarity of fingerprints that have been calculated with different grids.')
+    # the second field of the grid id is used as the number of bits per grid index
+    num_bins = int(fingerprint1.grid_id.split(':')[1])
+    later, earlier = sorted([fingerprint1.to_dict(), fingerprint2.to_dict()], key = lambda x: x['indices'][0], reverse=True)
+    # shift in bits instead of bytes: a byte-wise mask only has the right length for num_bins == 8
+    shift = (later['indices'][0] - earlier['indices'][0]) * num_bins
+    fp1 = bitarray()
+    fp1.frombytes(bytes.fromhex(earlier['bins']))
+    fp2 = bitarray()
+    fp2.frombytes(bytes.fromhex(later['bins']))
+    fp2 = bitarray('0') * shift + fp2
+    # restrict both fingerprints to the range that is covered by both of them
+    stop = min(len(fp1), len(fp2))
+    fp1 = fp1[shift:stop]
+    fp2 = fp2[shift:stop]
+    a = fp1.count()
+    b = fp2.count()
+    c = (fp1 & fp2).count()
+    if a + b - c == 0:
+        # no bit is set in the shared range, the coefficient would be 0 / 0
+        return 0.0
+    return c / float(a + b - c)
\ No newline at end of file
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 6cb924da2910ad455e60b0bbacf0e537ff9d59f6..bc3e91c221adb022dce30443793316107e71da32 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -1,5 +1,6 @@
-from nomad_dos_fingerprints import DOSFingerprint
+from nomad_dos_fingerprints import DOSFingerprint, tanimoto_similarity
 import pytest, os, json
+import numpy as np
 
 with open(os.path.join(os.path.dirname(__file__), 'fingerprint_generation_test_data.json'), 'r') as test_data_file:
     test_data = json.load(test_data_file)
@@ -9,5 +10,25 @@ def test_fingerprint_values():
     for fp, mid in test_data['fingerprints']:
         raw_data = test_data[mid]
         new_fingerprint = DOSFingerprint().calculate(raw_data['dos_energies'], raw_data['dos_values'])
-        assert json.loads(fp)['indices'] == new_fingerprint.indices
-        assert json.loads(fp)['bins'] == new_fingerprint.bins
+        old_fingerprint = DOSFingerprint()
+        old_fingerprint.bins = json.loads(fp)['bins']
+        old_fingerprint.indices = json.loads(fp)['indices']
+        old_fingerprint.grid_id = new_fingerprint.grid_id
+        assert old_fingerprint.indices == new_fingerprint.indices
+        assert np.isclose(tanimoto_similarity(old_fingerprint, new_fingerprint),1, atol=5e-2)
+
+def test_materials_similarity():
+
+    fingerprints = test_data['fingerprints']
+    similarity_matrix = test_data['simat']
+    mids = [x[1] for x in fingerprints]
+    raw_data = [test_data[mid] for mid in mids]
+    new_fingerprints = [DOSFingerprint().calculate(entry['dos_energies'], entry['dos_values']) for entry in raw_data]
+    matrix = []
+    for fp1 in new_fingerprints:
+        row = []
+        for fp2 in new_fingerprints:
+            row.append(tanimoto_similarity(fp1,fp2))
+        matrix.append(row)
+    print(matrix - np.array(similarity_matrix))
+    assert np.isclose(similarity_matrix, matrix, atol = 5e-2).all()
\ No newline at end of file
diff --git a/tests/test_similarity.py b/tests/test_similarity.py
new file mode 100644
index 0000000000000000000000000000000000000000..9689f4248074da20c175ad0eff589252d55c96cb
--- /dev/null
+++ b/tests/test_similarity.py
@@ -0,0 +1,39 @@
+import pytest
+from bitarray import bitarray
+from nomad_dos_fingerprints import tanimoto_similarity, DOSFingerprint
+
+def test_tanimoto():
+    # generate fp-type data and check if this can be realized with binary-strings only
+    fp1 = DOSFingerprint()
+    fp2 = DOSFingerprint()
+    fp1.bins = bitarray('00000000111111110000000011111111').tobytes().hex()
+    fp2.bins = bitarray('1111111100000000').tobytes().hex()
+    grid_id = 'a:8:b'
+    fp1.grid_id = grid_id
+    fp2.grid_id = grid_id
+    fp1.indices = [0,3]
+    fp2.indices = [1,2]
+    assert tanimoto_similarity(fp1, fp2) == 1
+    assert tanimoto_similarity(fp1, fp1) == 1
+    assert tanimoto_similarity(fp2, fp2) == 1
+
+def test_tanimoto_wide_grid():
+    # fingerprints on a grid with more than 8 bits per index have to be aligned as well
+    fp1 = DOSFingerprint()
+    fp2 = DOSFingerprint()
+    fp1.bins = 'ffffff0000ff'
+    fp2.bins = 'ff0000ff'
+    fp1.grid_id = 'a:16:b'
+    fp2.grid_id = 'a:16:b'
+    fp1.indices = [0,2]
+    fp2.indices = [1,2]
+    assert tanimoto_similarity(fp1, fp2) == 1
+    assert tanimoto_similarity(fp2, fp1) == 1
+
+def test_tanimoto_empty_fingerprints():
+    # fingerprints without any set bit must not raise a ZeroDivisionError
+    fp1 = DOSFingerprint()
+    fp1.bins = '0000'
+    fp1.grid_id = 'a:8:b'
+    fp1.indices = [0,1]
+    assert tanimoto_similarity(fp1, fp1) == 0