diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..b5daa6fe5e77d2c4942adb56de0d1677e656dd44 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +/data/**/*.json filter=lfs diff=lfs merge=lfs -text diff --git a/Dockerfile b/Dockerfile index 2ec7ece7e3154ed6b399ba8edc7803afa835b0aa..fdf200a1c22e6848ff71dbc9db663692c9cde55b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,4 +38,5 @@ USER ${NB_UID} WORKDIR ${HOME} COPY --chown=${NB_UID}:${NB_GID} assets/ assets/ +COPY --chown=${NB_UID}:${NB_GID} data/ data/ COPY --chown=${NB_UID}:${NB_GID} dos_similarity_search.ipynb . diff --git a/data/materials_results.json b/data/materials_results.json new file mode 100644 index 0000000000000000000000000000000000000000..5020dad88420c216c9ad4c1dadb460bde99da677 --- /dev/null +++ b/data/materials_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7937ac9d6320f88da73059ac8a1753c5cc39df096f892936c77370f69a01b283 +size 3372401 diff --git a/dos_similarity_search.ipynb b/dos_similarity_search.ipynb index 652274c58a240a297e311a52b71164363821e3db..218461681ad9f90d64cda6069f2c09f4a8cab0d1 100644 --- a/dos_similarity_search.ipynb +++ b/dos_similarity_search.ipynb @@ -35,7 +35,7 @@ " <img style=\"float: left; margin: 15px 20px 0px 0px;\" src=\"assets/logos/nomad-infrastructure.svg\" width=\"120\">\n", " <img style=\"float: left; margin: 5px 20px 0px 10px;\" src=\"assets/logos/mpcdf.svg\" width=\"270\">\n", "</div>\n", - "<p style=\"text-align: right; padding: 0px 10px 10px 0px;\">[Last updated: February, 2024]<" + "<p style=\"text-align: right; padding: 0px 10px 10px 0px;\">[Last updated: February, 2024]</p>" ] }, { @@ -97,18 +97,35 @@ "source": [ "import nest_asyncio\n", "\n", + "import json\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "from itertools import islice\n", "\n", "from nomad.client import ArchiveQuery\n", - "from nomad.datamodel.datamodel import EntryArchive\n", + 
"from nomad.datamodel import EntryArchive\n", "from nomad_dos_fingerprints import DOSFingerprint, tanimoto_similarity\n", "\n", "nest_asyncio.apply()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can use already downloaded data from the local cache. Downloading the data from the archive takes about 5 minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "use_cache = True" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -138,7 +155,7 @@ "required_sections = {\n", " # DOS fingerprint\n", " \"workflow\": {\"calculation_result_ref\": {\"dos_electronic\": {\"fingerprint\": \"*\"}}},\n", - " # Upload and calculation id\n", + " # entry id\n", " \"metadata\": {\n", " \"entry_id\": \"*\",\n", " },\n", @@ -146,7 +163,6 @@ " \"results\": {\n", " \"material\": {\n", " \"chemical_formula_reduced\": \"*\",\n", - " \"material_id\": \"*\",\n", " \"symmetry\": {\"space_group_number\": \"*\"},\n", " }\n", " },\n", @@ -171,10 +187,18 @@ " required=required_sections,\n", ")\n", "\n", + "\n", "# Download\n", "reference_result = reference_query.download()[0]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Loading the reference from a cache:" + ] + }, { "cell_type": "code", "execution_count": null, @@ -223,28 +247,40 @@ "metadata": {}, "outputs": [], "source": [ - "# choose a representative from the AFLOW or OQMD data\n", - "materials_query = ArchiveQuery(\n", - " query={\n", - " \"results.method.simulation.program_name\": \"VASP\",\n", - " \"results.material.elements\": {\"all\": [\"Ga\", \"As\"]},\n", - " \"results.properties.available_properties\": [\"dos_electronic\"],\n", - " \"results.method.simulation.dft.xc_functional_type\": [\"GGA\"],\n", - " \"results.material.n_elements\": {\"gte\": 2, \"lte\": 3},\n", - " },\n", - " required=required_sections,\n", - " page_size=100,\n", - " results_max=1366, # TODO: fix this\n", - ")\n", - "\n", -
"materials_query.fetch()\n", - "\n", - "materials_results = []\n", - "while True:\n", - " result = materials_query.download(100)\n", - " if len(result) == 0:\n", - " break\n", - " materials_results.extend(result)\n", + "if not use_cache:\n", + " \n", + " materials_query = ArchiveQuery(\n", + " query={\n", + " \"results.method.simulation.program_name\": \"VASP\",\n", + " \"results.material.elements\": {\"all\": [\"Ga\", \"As\"]},\n", + " \"results.properties.available_properties\": [\"dos_electronic\"],\n", + " \"results.method.simulation.dft.xc_functional_type\": [\"GGA\"],\n", + " \"results.material.n_elements\": {\"gte\": 2, \"lte\": 3},\n", + " },\n", + " required=required_sections,\n", + " page_size=100,\n", + " results_max=1366\n", + " )\n", + "\n", + " materials_query.fetch()\n", + " \n", + " materials_results = []\n", + " while True:\n", + " result = materials_query.download(100)\n", + " if len(result) == 0:\n", + " break\n", + " materials_results.extend(result)\n", + "\n", + " # Cache the results (optional)\n", + " # with open('data/materials_results.json', 'w') as f:\n", + " # json.dump({'data': [entry.m_to_dict() for entry in materials_results]}, f)\n", + "\n", + "else:\n", + " \n", + " with open(\"data/materials_results.json\") as f:\n", + " data = json.load(f)\n", + "\n", + " materials_results = [EntryArchive.m_from_dict(entry) for entry in data[\"data\"]]\n", "\n", "\n", "len(materials_results)" @@ -277,7 +313,6 @@ " fingerprint[\"entry_id\"] = entry.metadata.entry_id\n", " fingerprint[\"upload_id\"] = entry.metadata.upload_id\n", " fingerprint[\"formula\"] = entry.results.material.chemical_formula_reduced\n", - " fingerprint[\"material_id\"] = entry.results.material.material_id\n", " fingerprint[\"space_group\"] = entry.results.material.symmetry.space_group_number\n", "\n", " # Generate a DOS fingerprint object from an Archive entry\n", @@ -308,9 +343,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "<span style='font-family:sans-serif'>\n", -
"\n", - "# Calculation of similarity coefficients\n", + "## Calculation of similarity coefficients\n", "\n", "Now we compute the similarity between two DOS spectra.\n", "\n", @@ -326,8 +359,14 @@ "\n", "It is restricted to values $T_c \\in [0,1]$. 1 means that the DOS of two materials are identical, 0 means no overlap at all. **The Tanimoto coefficient can be interpreted as the ratio between the number of shared features and the total number of features of two fingerprints.** For dichotomous vectors, the complement of the Tanimoto coefficient ($1 - T_c$), also known as Jaccard distance, is a metric. The Tanimoto coefficient is implemented as the function `tanimoto_similarity` in the `nomad_dos_fingerprints` package.\n", "\n", - "The arguments of the function `tanimoto_similarity` are two `DOSFingerprint` objects. Using this, the similarity between the reference material and one of the candidate materials can be calculated, as shown in the following example:\n", - "</span>" + "The arguments of the function `tanimoto_similarity` are two `DOSFingerprint` objects. 
Using this, the similarity between the reference material and one of the candidate materials can be calculated, as shown in the following example:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Selecting the first material in the dictionary" ] }, { @@ -336,7 +375,6 @@ "metadata": {}, "outputs": [], "source": [ - "# Selecting the first material in the dictionary\n", "candidate = next(iter(materials.values()))\n", "\n", "print(f\"Similarity between {reference['formula']} and {candidate['formula']}:\\n\")\n", @@ -370,13 +408,19 @@ " return material[\"dos_fingerprint\"].get_similarity(reference[\"dos_fingerprint\"])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Apply `calculate_similarity` to all entries in `materials`:" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# apply `calculate_similarity` to all entries in `materials`\n", "for material in materials.values():\n", " material[\"similarity\"] = calculate_similarity(reference, material)" ] @@ -473,7 +517,7 @@ " bins=20,\n", " range=[0, 1.0],\n", " ec=\"w\",\n", - " label=f'Reference: {reference[\"formula\"]}',\n", + " label=f\"Reference: {reference['formula']} ({reference['space_group']})\",\n", ")\n", "plt.xticks(np.linspace(0, 1.0, 11))\n", "plt.xlabel(\"Tc\")\n", @@ -489,12 +533,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "<span style='font-family:sans-serif'>\n", - "\n", "Note the logarithmic scale in this histogram. We can see here, that the vast majority of materials has a low similarity to our reference. 
On the right side of the histogram ($\\mathrm{Tc} > 0.7$), the most similar materials can be found, which show exceptionally high similarity scores.\n", "\n", - "We construct a ranking table which shows the similarity of materials to our reference from the most similar to the least similar.\n", - "</span>" + "We construct a ranking table which shows the similarity of materials to our reference from the most similar to the least similar for the top `n = 20` materials. By clicking the link, you will land on the Archive page of the respective calculation, where you can find more information about your material." ] }, { @@ -517,9 +558,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "By clicking the link, you will land on the Archive page of the respective calculation, where you can find more information about your material.\n", - "\n", - "Now we plot the DOS of the most similar materials to our reference. In the variable `ranks_to_download`, we give the rank of the materials from the table above, whose DOS we want to plot. To avoid unnecessary downloading, we check if the spectrum is already in `materials_data` under the key `dos`, if not, we download it." + "Now we plot the DOS of the most (`n = 4`) similar materials to our reference." 
] }, { @@ -592,7 +631,7 @@ "outputs": [], "source": [ "# Number of the calulation with the highest similarity score\n", - "n = 3\n", + "n = 4\n", "\n", "# get entry_id from table\n", "entry_ids = [m[\"entry_id\"] for m in islice(materials.values(), n)]\n", @@ -600,7 +639,7 @@ "materials_dos_query = ArchiveQuery(\n", " query={\"entry_id\": {\"any\": entry_ids}},\n", " required=required_dos,\n", - " results_max=n # TODO: fix this\n", + " results_max=n\n", ")\n", "\n", "materials_dos_results = materials_dos_query.download()\n", @@ -620,14 +659,16 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ "plt.figure(figsize=(10, 5))\n", "plt.fill_between(\n", " reference_dos[\"energies\"].magnitude,\n", " reference_dos[\"values\"].magnitude,\n", - " label=f\"Reference: {reference['formula']}\",\n", + " label=f\"{reference['formula']} ({reference['space_group']}) - Reference\",\n", " facecolor=(\"r\", 0.2),\n", " edgecolor=\"r\",\n", ")\n", @@ -637,9 +678,10 @@ " values = materials_dos[entry_id][\"values\"].magnitude\n", "\n", " chem_formula = materials[entry_id][\"formula\"]\n", + " sg = materials[entry_id][\"space_group\"]\n", " Tc = materials[entry_id][\"similarity\"]\n", "\n", - " plt.plot(energies, values, label=f\"{chem_formula}, Tc = {Tc: .2f}\")\n", + " plt.plot(energies, values, label=f\"{chem_formula} ({sg}), Tc = {Tc: .2f}\")\n", " # plt.plot(energies, values)\n", " # plt.fill_between(energies, values, alpha = 0.2, label = f\"{chem_formula}, Tc = {Tc: .2f}\")\n", "\n", @@ -657,14 +699,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "<span style='font-family:sans-serif'>\n", + "## References\n", "\n", - "# References\n", + "[1] M. Kuban, S. Rigamonti, M. Scheidgen and C. Draxl: [Density-of-states similarity descriptor for unsupervised learning from materials data](https://arxiv.org/abs/2201.02187)\n", "\n", - "[1] M. Kuban, S. Rigamonti, M. Scheidgen, and C. 
Draxl: [Density-of-states similarity descriptor for unsupervised learning from materials data](https://arxiv.org/abs/2201.02187)\n", + "[2] P. Willet, J. M. Barnard, G. M. Downs: [Chemical Similarity Searching](https://pubs.acs.org/doi/abs/10.1021/ci9800211), _J. Chem. Inf. Comput. Sci._, __38__, 983, (1998)\n", "\n", - "[2] P. Willet, J. M. Barnard, G. M. Downs: [Chemical Similarity Searching](https://pubs.acs.org/doi/abs/10.1021/ci9800211), $\\textit{J. Chem. Inf. Comput. Sci.}$, $\\textbf{38}$, 983, (1998)\n", - "</span>" + "[3] M. Kuban, Š. Gabaj, W. Aggoune et al. [Similarity of materials and data-quality assessment by fingerprinting](https://doi.org/10.1557/s43577-022-00339-w), _MRS Bulletin_ __47__, 991–999 (2022)." ] }, { @@ -678,227 +719,6 @@ "We thank Luca Ghiringhelli for help in preparing this notebook. This work recieved partial funding from the European Union’s Horizon 2020 research and innovation program under the grant agreement Nº 951786 (NOMAD CoE), from the NFDI consortium FAIRmat, and from the German Research Foundation (DFG) through the CRC 1404 (FONDA).\n", "</span>" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Debugging" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "# # choose a representative from the AFLOW or OQMD data\n", - "# materials_query = ArchiveQuery(\n", - "# query = {\n", - "# 'results.method.simulation.program_name': 'VASP',\n", - "# 'results.material.elements': {'all': ['Ga', 'As']},\n", - "# 'results.properties.available_properties': ['dos_electronic'],\n", - "# 'results.method.simulation.dft.xc_functional_type': ['GGA'],\n", - "# 'results.material.n_elements': {'gte': 2, 'lte': 3},\n", - "# },\n", - "# required = required_sections = {\n", - "# # DOS fingerprint\n", - "# 'workflow': {\n", - "# 'calculation_result_ref': {\n", - "# 'dos_electronic': {\n", - "# 'fingerprint': '*'\n", - "# }\n", - "# }\n", - "# 
},\n", - "# # Upload and calculation id\n", - "# 'metadata': {\n", - "# 'calc_id': '*',\n", - "# },\n", - "# # chemical formula, material id, and space group number\n", - "# 'results':{\n", - "# 'material':{\n", - "# 'chemical_formula_reduced': '*',\n", - "# 'material_id': '*',\n", - "# 'symmetry': {\n", - "# 'space_group_number': '*'\n", - "# }\n", - "# }\n", - "# }\n", - "# },\n", - "# page_size = 100,\n", - "# results_max = 4177\n", - "# )\n", - "\n", - "# materials_query.fetch()\n", - "\n", - "# materials_results = []\n", - "# while True:\n", - "# result = materials_query.download(100)\n", - "# if len(result) == 0:\n", - "# break\n", - "# materials_results.extend(result)\n", - "\n", - "# print(len(materials_results))\n", - "\n", - "# for m in materials_results[:10]:\n", - "# print(f\"entry_id: {m.metadata.entry_id}, calc_id: {m.metadata.calc_id}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we remove entries without DOS fingerprint data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# # removing entries with empty fingerprints\n", - "# for i, result in enumerate(materials_results):\n", - "# if not result.run[0].calculation[-1].dos_electronic[0].fingerprint.bins:\n", - "# del materials_results[i]\n", - "\n", - "# len(materials_results)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Archive API returns all calculations which fit the query, therefore, **for a single material multiple calculations (e.g. from different authors) are downloaded.** \n", - "\n", - "To simplify the analysis presented here, **we select a representative calculation for each material**. To do so, we define a function called `select_representative`. The cell below shows an example of this function that takes the first encountered calculation of a material as the representative. 
However, different approaches can be used, e.g., based on computational parameters employed in the DFT calculations." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# def select_representative(reference:dict, materials: dict) -> dict:\n", - "# \"\"\"Example of a `select_representative` function.\n", - "# Returns the first calculation of a material it finds.\n", - "\n", - "# Inputs:\n", - "# materials: containing the materials in a dictionary\n", - "# as outputted by `get_fingerprints`\n", - "# \"\"\"\n", - "\n", - "# material_ids = set()\n", - "# output = {}\n", - "\n", - "# # for entry_id, material in materials.items():\n", - "# for entry_id, material in sorted(materials.items(), key = lambda x: x[1]['similarity'], reverse = True):\n", - "\n", - "# if material['formula'] == reference['formula']:\n", - "# continue\n", - "\n", - "# material_id = material['material_id']\n", - "\n", - "# if material_id in material_ids:\n", - "# continue\n", - "\n", - "# material_ids.add(material_id)\n", - "# output[entry_id] = material\n", - "\n", - "# return output\n", - "\n", - "# materials = select_representative(reference, materials)\n", - "# len(materials)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# reference: As4Ga4 (sg: 216)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(f'{\"formula\":12} {\"space_group\"} {\"similarity\":<10} {\"entry_id\":<28}')\n", - "for m in materials.values():\n", - " if m[\"formula\"] == \"AsGa\":\n", - " print(\n", - " f'{m[\"formula\"]:<12} {m[\"space_group\"]:>11} {m[\"similarity\"]:10.4f} https://nomad-lab.eu/prod/v1/gui/entry/id/{m[\"entry_id\"]}'\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get entry_id from table\n", - "entry_ids = [m[\"entry_id\"] for 
m in materials.values() if m[\"formula\"] == \"AsGa\"]\n", - "n = len(entry_ids)\n", - "materials_dos_query = ArchiveQuery(\n", - " query={\"entry_id\": {\"any\": entry_ids}}, required=required_dos, results_max=n\n", - ")\n", - "\n", - "materials_dos_results = materials_dos_query.download()\n", - "\n", - "materials_dos = {}\n", - "for r in materials_dos_results:\n", - " materials_dos[r.metadata.entry_id] = get_dos(r)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.figure(figsize=(10, 5))\n", - "\n", - "plt.fill_between(\n", - " reference_dos[\"energies\"].magnitude,\n", - " reference_dos[\"values\"].magnitude,\n", - " label=f\"Reference: {reference['formula']}\",\n", - " facecolor=(\"r\", 0.2),\n", - " edgecolor=\"r\",\n", - ")\n", - "\n", - "for entry_id in entry_ids:\n", - " energies = materials_dos[entry_id][\"energies\"].magnitude\n", - " values = materials_dos[entry_id][\"values\"].magnitude\n", - "\n", - " chem_formula = materials[entry_id][\"formula\"]\n", - " Tc = materials[entry_id][\"similarity\"]\n", - "\n", - " plt.plot(energies, values, label=f\"{chem_formula}, Tc = {Tc: .2f}\")\n", - "# plt.fill_between(energies, values, alpha = 0.2, )\n", - "\n", - "\n", - "plt.ylabel(r\"DOS [$\\frac{1}{eV}$]\")\n", - "plt.xlabel(r\"Energy [$eV$]\")\n", - "\n", - "plt.xlim(-10, 5)\n", - "plt.ylim(0, 2)\n", - "plt.legend()\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/dos_similarity_search_debugging.ipynb b/dos_similarity_search_debugging.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..652274c58a240a297e311a52b71164363821e3db --- /dev/null +++ b/dos_similarity_search_debugging.ipynb @@ -0,0 +1,976 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<div style=\"\n", + " background-color: #f7f7f7;\n", + " background-image: 
url('data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiIHN0YW5kYWxvbmU9Im5vIj8+CjxzdmcKICAgd2lkdGg9IjcyIgogICBoZWlnaHQ9IjczIgogICB2aWV3Qm94PSIwIDAgNzIgNzMiCiAgIGZpbGw9Im5vbmUiCiAgIHZlcnNpb249IjEuMSIKICAgaWQ9InN2ZzEzMTkiCiAgIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIKICAgeG1sbnM6c3ZnPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+CiAgPGRlZnMKICAgICBpZD0iZGVmczEzMjMiIC8+CiAgPHBhdGgKICAgICBkPSJNIC0wLjQ5OTk4NSwxNDUgQyAzOS41MzMsMTQ1IDcyLDExMi41MzIgNzIsNzIuNSA3MiwzMi40Njc4IDM5LjUzMywwIC0wLjQ5OTk4NSwwIC00MC41MzI5LDAgLTczLDMyLjQ2NzggLTczLDcyLjUgYyAwLDQwLjAzMiAzMi40NjcxLDcyLjUgNzIuNTAwMDE1LDcyLjUgeiIKICAgICBmaWxsPSIjMDA4YTY3IgogICAgIGZpbGwtb3BhY2l0eT0iMC4yNSIKICAgICBpZD0icGF0aDEzMTciIC8+Cjwvc3ZnPgo='), url('data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiIHN0YW5kYWxvbmU9Im5vIj8+CjxzdmcKICAgd2lkdGg9IjIxNyIKICAgaGVpZ2h0PSIyMjMiCiAgIHZpZXdCb3g9IjAgMCAyMTcgMjIzIgogICBmaWxsPSJub25lIgogICB2ZXJzaW9uPSIxLjEiCiAgIGlkPSJzdmcxMTA3IgogICB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciCiAgIHhtbG5zOnN2Zz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPgogIDxkZWZzCiAgICAgaWQ9ImRlZnMxMTExIiAvPgogIDxwYXRoCiAgICAgZD0ibSAyMi4wNDIsNDUuMDEwOSBjIDIxLjM2MjUsMjEuMjc1NyA1NS45NzYsMjEuMjc1NyA3Ny41MTkyLDAgQyAxMTkuNTU4LDI1LjA4IDE1MS41MDIsMjMuNzM1MiAxNzIuODY0LDQxLjM3OCBjIDEuMzQ1LDEuNTI1NCAyLjY5LDMuMjUxNiA0LjIzNiw0Ljc5NzEgMjEuMzYzLDIxLjI3NTYgMjEuMzYzLDU1Ljc5ODkgMCw3Ny4yNTQ5IC0yMS4zNjIsMjEuMjc2IC0yMS4zNjIsNTUuNzk4IDAsNzcuMjU1IDIxLjM2MywyMS40NTYgNTUuOTc2LDIxLjI3NSA3Ny41MiwwIDIxLjU0MywtMjEuMjc2IDIxLjM2MiwtNTUuNzk5IDAsLTc3LjI1NSAtMjEuMzYzLC0yMS4yNzYgLTIxLjM2MywtNTUuNzk4NiAwLC03Ny4yNTQ5IDEyLjY4OSwtMTIuNjQ1IDE3Ljg4OSwtMzAuMTA3MSAxNS4zOTksLTQ2LjU4NTc2IC0xLjU0NiwtMTEuNTAwOTQgLTYuNzI2LC0yMi44MjExNCAtMTUuNTgsLTMxLjYzMjU0IC0yMS4zNjMsLTIxLjI3NTYgLTU1Ljk3NiwtMjEuMjc1NiAtNzcuNTE5LDAgLTIxLjM2MywyMS4yNzU3IC01NS45NzYsMjEuMjc1NyAtNzcuNTE5NCwwIC0yMS4zNjI1LC0yMS4yNzU2IC01NS45NzYxLC0yMS4yNzU2IC03Ny41MTkyLDAgQyAwLjY3OTU2NSwtMTAuNzg3NiAwLjY3OTU5NiwyMy43MzUyIDIyLjA0Miw0NS4wMTA5IFoiCiAgICAgZmlsbD
0iIzJhNGNkZiIKICAgICBzdHJva2U9IiMyYTRjZGYiCiAgICAgc3Ryb2tlLXdpZHRoPSIxMiIKICAgICBzdHJva2UtbWl0ZXJsaW1pdD0iMTAiCiAgICAgaWQ9InBhdGgxMTA1IiAvPgogIDxwYXRoCiAgICAgZD0ibSA1MS45OTUyMTIsMjIyLjczMDEzIGMgMjguMzU5MSwwIDUxLjM1ODM5OCwtMjIuOTk5OSA1MS4zNTgzOTgsLTUxLjM1ODQgMCwtMjguMzU4NiAtMjIuOTk5Mjk4LC01MS4zNTg1OSAtNTEuMzU4Mzk4LC01MS4zNTg1OSAtMjguMzU5MSwwIC01MS4zNTg2MDIsMjIuOTk5OTkgLTUxLjM1ODYwMiw1MS4zNTg1OSAwLDI4LjM1ODUgMjIuOTk5NTAyLDUxLjM1ODQgNTEuMzU4NjAyLDUxLjM1ODQgeiIKICAgICBmaWxsPSIjMTkyZTg2IgogICAgIGZpbGwtb3BhY2l0eT0iMC4zNSIKICAgICBpZD0icGF0aDE5MzciIC8+Cjwvc3ZnPgo=') ;\n", + " background-position: left bottom, right top;\n", + " background-repeat: no-repeat, no-repeat;\n", + " background-size: auto 60px, auto 160px;\n", + " border-radius: 5px;\n", + " box-shadow: 0px 3px 1px -2px rgba(0, 0, 0, 0.2), 0px 2px 2px 0px rgba(0, 0, 0, 0.14), 0px 1px 5px 0px rgba(0,0,0,.12);\">\n", + "\n", + "<h1 style=\"\n", + " color: #2a4cdf;\n", + " font-style: normal;\n", + " font-size: 4rem;\n", + " line-height: 1.4em;\n", + " font-weight: 4600;\n", + " padding: 30px 200px 0px 30px;\">\n", + " Electronic density-of-states similarity search</h1>\n", + "\n", + "<p style=\"font-size: 1.25em; font-style: italic; padding: 5px 200px 30px 30px;\">\n", + " Šimon Gabaj,\n", + " Martin Kuban,\n", + " Santiago Rigamonti and\n", + " Claudia Draxl</p>\n", + "</div>\n", + "\n", + "<div style=\"margin: 10px;\">\n", + " <img style=\"float: left; margin: 5px 20px 0px 0px;\" src=\"assets/logos/hu-berlin.svg\" width=\"110\">\n", + " <img style=\"float: left; margin: 0px 20px 0px 0px;\" src=\"assets/logos/nomad.svg\" width=\"110\">\n", + " <img style=\"float: left; margin: 15px 20px 0px 0px;\" src=\"assets/logos/nomad-infrastructure.svg\" width=\"120\">\n", + " <img style=\"float: left; margin: 5px 20px 0px 10px;\" src=\"assets/logos/mpcdf.svg\" width=\"270\">\n", + "</div>\n", + "<p style=\"text-align: right; padding: 0px 10px 10px 0px;\">[Last updated: February, 2024]<" + ] + }, + { + "cell_type": "markdown", 
+ "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "This notebook shows how to compute the similarity of materials in terms of their electronic density-of-states (DOS) from data retrieved from the [NOMAD Archive](https://nomad-lab.eu/prod/v1/gui/search/entries). \n", + "\n", + "For this purpose, a _DOS fingerprint_ is used which encodes the DOS obtained from density-functional theory (DFT) calculations into a binary valued descriptor. A detailed description of the fingerprint can be found in Ref. [1].\n", + "\n", + "The DOS fingerprints in this notebook are precomputed and available in the NOMAD Archive. \n", + "We first download the respective data from the NOMAD Archive and use the fingerprint to find materials that are similar to a given reference material.\n", + "\n", + "**In this notebook we demonstrate how to find GaAs-based binary and ternary compounds from the NOMAD Archive that have the most similar electronic structure to GaAs.**\n", + "\n", + "### Contents:\n", + "- [Import modules](#Import-modules)\n", + "- [Downloading data from the NOMAD Archive](#Downloading-data-from-the-NOMAD-Archive)\n", + " - [Downloading a single calculation](#Downloading-a-reference-material)\n", + " - [Downloading calculations using search queries](#Downloading-calculations-using-search-queries)\n", + "- [The DOS fingerprint as a descriptor](#The-DOS-fingerprint-as-a-descriptor)\n", + "- [Calculation of similarity coefficients](#Calculation-of-similarity-coefficients)\n", + "- [Visualizing results](#Visualizing-results)\n", + "- [References](#References)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import modules\n", + "\n", + "To interact with the NOMAD Archive API we use the python package `nomad-lab`. To learn more about its usage, please refer to the [documentation](https://nomad-lab.eu/prod/rae/docs/client/client.html). 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "julia" + } + }, + "outputs": [], + "source": [ + "import nomad.config\n", + "\n", + "nomad.config.client.url = 'http://nomad-lab.eu/prod/v1/staging/api'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from itertools import islice\n", + "\n", + "from nomad.client import ArchiveQuery\n", + "from nomad.datamodel.datamodel import EntryArchive\n", + "from nomad_dos_fingerprints import DOSFingerprint, tanimoto_similarity\n", + "\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Downloading data from the NOMAD Archive\n", + "\n", + "**For a detailed overview on how to query the NOMAD Archive using the `nomad-lab` package see the tutorial 'Query the Archive' on the ['AI toolkit tutorials'](https://nomad-lab.eu/services/aitoolkit) page.** Here, we will download all necessary data to perform a similarity search using DOS fingerprints. This is achieved using an instance of `ArchiveQuery`. It allows for querying the NOMAD Achive with only few commands. \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Downloading a reference material\n", + "\n", + "First, we download a reference calculation for [GaAs](https://nomad-lab.eu/prod/v1/gui/entry/id/zkkMIAPyn4OCbdEdW21DZTeretQ3) from the Archive. To download a specific calculation we construct the `query` dictionary only from the calculation ID. The calculation ID is a unique, static identifier for each calculation. \n", + "\n", + "For the here presented analysis, not all of the data of a calculation are required. Therefore, we select the paths to the needed data in the NOMAD Archive entry. 
The paths are contained in the cell below in the variable `required_sections`. This helps to reduce unnecessary download of data. The path to all data of a calculation can be found on the NOMAD [Metainfo](https://nomad-lab.eu/prod/v1/gui/analyze/metainfo) page." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "required_sections = {\n", + " # DOS fingerprint\n", + " \"workflow\": {\"calculation_result_ref\": {\"dos_electronic\": {\"fingerprint\": \"*\"}}},\n", + " # Upload and calculation id\n", + " \"metadata\": {\n", + " \"entry_id\": \"*\",\n", + " },\n", + " # chemical formula, material id, and space group number\n", + " \"results\": {\n", + " \"material\": {\n", + " \"chemical_formula_reduced\": \"*\",\n", + " \"material_id\": \"*\",\n", + " \"symmetry\": {\"space_group_number\": \"*\"},\n", + " }\n", + " },\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "reference_entry_id = \"zkkMIAPyn4OCbdEdW21DZTeretQ3\"\n", + "\n", + "# Create a query\n", + "reference_query = ArchiveQuery(\n", + " query={\n", + " \"entry_id\": reference_entry_id # ID of the reference calculation\n", + " },\n", + " required=required_sections,\n", + ")\n", + "\n", + "# Download\n", + "reference_result = reference_query.download()[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reference_result.metadata.entry_id" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<span style='font-family:sans-serif'>\n", + "\n", + "This calculation stored in the variable `reference_result` will be used as a reference for our similarity search.
\n", + "</span>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<span style='font-family:sans-serif'>\n", + "\n", + "## Downloading calculations using search queries\n", + "\n", + "To perform a similarity search, we compare the fingerprint of the reference to the fingerprints of a large data set.\n", + "In the following, we query the NOMAD Archive for GaAs-based binary and ternary compounds. As a starting point we restrict the search to only calculations computed with the DFT code 'VASP' using a GGA exchange-correlation functional. This information is written to the `query` dictionary that is passed to `ArchiveQuery`. Note that the `required` argument of the `ArchiveQuery` is unchanged.\n", + "</span>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<span style='font-family:sans-serif'>\n", + "\n", + "**Queries can be generated in the GUI of the [NOMAD Archive](https://nomad-lab.eu/prod/v1/gui/search) in the python dictionary format.** They can be found under the `<>` symbol at the top of the search menu.
From there, they can be directly copied into the `query` dictionary of the `ArchiveQuery` function.\n", + " </span>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# choose a representative from the AFLOW or OQMD data\n", + "materials_query = ArchiveQuery(\n", + " query={\n", + " \"results.method.simulation.program_name\": \"VASP\",\n", + " \"results.material.elements\": {\"all\": [\"Ga\", \"As\"]},\n", + " \"results.properties.available_properties\": [\"dos_electronic\"],\n", + " \"results.method.simulation.dft.xc_functional_type\": [\"GGA\"],\n", + " \"results.material.n_elements\": {\"gte\": 2, \"lte\": 3},\n", + " },\n", + " required=required_sections,\n", + " page_size=100,\n", + " results_max=1366, # TODO: fix this\n", + ")\n", + "\n", + "materials_query.fetch()\n", + "\n", + "materials_results = []\n", + "while True:\n", + " result = materials_query.download(100)\n", + " if len(result) == 0:\n", + " break\n", + " materials_results.extend(result)\n", + "\n", + "\n", + "len(materials_results)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The DOS fingerprint as a descriptor\n", + "\n", + "In order to quantitatively evaluate materials similarity, we encode the electronic DOS in a so-called _DOS fingerprint_. The DOS fingerprint is a two-dimensional, binary-valued representation of the electronic DOS. An in-depth description can be found in Ref. [1].\n", + "\n", + "To make use of the fingerprint, the data stored in the NOMAD Archive must be loaded into `DOSFingerprint` objects. Therefore, we scan through the Archive contents that we downloaded previously and extract all data that are related to the fingerprint, as well as identifiers for presenting the results. To do so in a systematic manner, we define functions that collect the relevant information from an Archive entry. An example of such a function, `formula`, is given below. 
These functions are passed in a list `exctract_properties` to the function `get_data`, which extracts the relevant data from `ArchiveQuery`. The extracted data is saved using the name of the function as the keyword.\n", + "\n", + "For convenience, the extracted data are collected in a dictionary which will allow us to efficiently search the results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_fingerprint(entry: EntryArchive) -> dict:\n", + " \"\"\"Retrieve information from an Archive entry 'entry'\"\"\"\n", + "\n", + " fingerprint = dict()\n", + "\n", + " fingerprint[\"entry_id\"] = entry.metadata.entry_id\n", + " fingerprint[\"upload_id\"] = entry.metadata.upload_id\n", + " fingerprint[\"formula\"] = entry.results.material.chemical_formula_reduced\n", + " fingerprint[\"material_id\"] = entry.results.material.material_id\n", + " fingerprint[\"space_group\"] = entry.results.material.symmetry.space_group_number\n", + "\n", + " # Generate a DOS fingerprint object from an Archive entry\n", + " dos_fp = entry.run[0].calculation[-1].dos_electronic[0].fingerprint\n", + " fingerprint[\"dos_fingerprint\"] = DOSFingerprint.from_dict(dos_fp)\n", + "\n", + " return fingerprint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "materials = {}\n", + "for r in materials_results:\n", + " entry_id = r.metadata.entry_id\n", + " materials[entry_id] = get_fingerprint(r)\n", + "\n", + "reference = get_fingerprint(reference_result)\n", + "reference" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<span style='font-family:sans-serif'>\n", + "\n", + "# Calculation of similarity coefficients\n", + "\n", + "Now we compute the similarity between two DOS spectra.\n", + "\n", + "A DOS fingerprint represents the electronic DOS as a binary vector [1]. 
In order to compute the similarity of two fingerprints we use the **Tanimoto coefficient** [2]. The Tanimoto coefficient, $T_c$, between two vectors $\\mathbf{a}$ and $\\mathbf{b}$ is defined as:\n", + "\n", + "\n", + "$$\n", + "\\begin{eqnarray}\n", + "T_c(\\mathbf{a},\\mathbf{b}) = \\frac{\\mathbf{a} \\cdot \\mathbf{b}}{||\\mathbf{a}||^2 + ||\\mathbf{b}||^2 - \\mathbf{a} \\cdot \\mathbf{b}}.\n", + "\\end{eqnarray}\n", + "$$\n", + "\n", + "\n", + "It is restricted to values $T_c \\in [0,1]$. 1 means that the DOS of two materials are identical, 0 means no overlap at all. **The Tanimoto coefficient can be interpreted as the ratio between the number of shared features and the total number of features of two fingerprints.** For dichotomous vectors, the complement of the Tanimoto coefficient ($1 - T_c$), also known as Jaccard distance, is a metric. The Tanimoto coefficient is implemented as the function `tanimoto_similarity` in the `nomad_dos_fingerprints` package.\n", + "\n", + "The arguments of the function `tanimoto_similarity` are two `DOSFingerprint` objects. Using this, the similarity between the reference material and one of the candidate materials can be calculated, as shown in the following example:\n", + "</span>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Selecting the first material in the dictionary\n", + "candidate = next(iter(materials.values()))\n", + "\n", + "print(f\"Similarity between {reference['formula']} and {candidate['formula']}:\\n\")\n", + "print(f\"Tc = {tanimoto_similarity(reference['dos_fingerprint'], candidate['dos_fingerprint'])}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<span style='font-family:sans-serif'>\n", + "\n", + "Now we **use the function `calculate_similarity` to calculate the similarities of the materials in `materials_data` to our reference**, `reference`. 
The function `calculate_similarity` returns the value of the Tanimoto coefficient between the reference and the current calculation; below, this value is stored in the dictionary of each calculation under the key `similarity`. \n", + "</span>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_similarity(reference: dict, material: dict) -> float:\n", + " \"\"\"Calculates the similarity of material `material` to a reference material `reference`.\n", + "\n", + " Inputs:\n", + " reference: generated by `get_fingerprint_data`, contains information about the material\n", + " material: generated by `get_fingerprint_data`, contains information about the material\n", + " \"\"\"\n", + "\n", + " return material[\"dos_fingerprint\"].get_similarity(reference[\"dos_fingerprint\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# apply `calculate_similarity` to all entries in `materials`\n", + "for material in materials.values():\n", + " material[\"similarity\"] = calculate_similarity(reference, material)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "next(iter(materials.values()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Archive API returns all calculations which fit the query, therefore, **for a single material multiple calculations (e.g. from different authors) are downloaded.** \n", + "\n", + "To simplify the analysis presented here, **we select the calculation with the highest similarity score for each formula and space group**. To do so, we define a function called `select_representative`. The cell below shows an example of this function that takes the calculation with the highest similarity score of a material as the representative. 
However, different approaches can be used, e.g., based on computational parameters employed in the DFT calculations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def select_representative(reference: dict, materials: dict) -> dict:\n", + " \"\"\"Example of a `select_representative` function.\n", + " Returns the calculation with highes similarity score of a materials it finds.\n", + "\n", + " Inputs:\n", + " materials: containing the materials in a dictionary\n", + " as outputted by `get_fingerprints`\n", + " \"\"\"\n", + "\n", + " output = dict()\n", + " lookup = set()\n", + "\n", + " for entry_id, material in sorted(\n", + " materials.items(), key=lambda x: x[1][\"similarity\"], reverse=True\n", + " ):\n", + " if material[\"formula\"] == reference[\"formula\"]:\n", + " continue\n", + "\n", + " group_by = (material[\"formula\"], material[\"space_group\"])\n", + "\n", + " if group_by in lookup:\n", + " # Due to the sorting it is garanteed that if the formula is already\n", + " # part of the output that has the highest score as well\n", + " continue\n", + "\n", + " lookup.add(group_by)\n", + " output[entry_id] = material\n", + "\n", + " return output\n", + "\n", + "\n", + "materials = select_representative(reference, materials)\n", + "\n", + "len(materials)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we have computed the similarities of the `reference` to all the materials in `materials_data`.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visualizing results\n", + "\n", + "We want to look at the results of the similarity search to identify the most similar materials to the reference. 
For an overview, we first visualize the found similarity coefficients in a histogram.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "similarities = [value[\"similarity\"] for value in materials.values()]\n", + "\n", + "plt.figure(figsize=(10, 5))\n", + "plt.hist(\n", + " similarities,\n", + " bins=20,\n", + " range=[0, 1.0],\n", + " ec=\"w\",\n", + " label=f'Reference: {reference[\"formula\"]}',\n", + ")\n", + "plt.xticks(np.linspace(0, 1.0, 11))\n", + "plt.xlabel(\"Tc\")\n", + "plt.xlim(0, 1)\n", + "plt.title(\"Frequency of similarity coefficients\")\n", + "plt.ylabel(\"Counts\")\n", + "plt.yscale(\"log\")\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<span style='font-family:sans-serif'>\n", + "\n", + "Note the logarithmic scale in this histogram. We can see here, that the vast majority of materials has a low similarity to our reference. On the right side of the histogram ($\\mathrm{Tc} > 0.7$), the most similar materials can be found, which show exceptionally high similarity scores.\n", + "\n", + "We construct a ranking table which shows the similarity of materials to our reference from the most similar to the least similar.\n", + "</span>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "n = 20\n", + "\n", + "print(f'{\"formula\":12} {\"space_group\"} {\"similarity\":<10} {\"entry_id\"}')\n", + "for m in islice(materials.values(), n):\n", + " print(f'{m[\"formula\"]:<12} {m[\"space_group\"]:>11} {m[\"similarity\"]:10.4f} ', end = '')\n", + " print(f'https://nomad-lab.eu/prod/v1/gui/entry/id/{m[\"entry_id\"]}')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By clicking the link, you will land on the Archive page of the respective calculation, where you can find more information about your material.\n", + 
"\n", + "Now we plot the DOS of the most similar materials to our reference. In the variable `ranks_to_download`, we give the rank of the materials from the table above, whose DOS we want to plot. To avoid unnecessary downloading, we check if the spectrum is already in `materials_data` under the key `dos`, if not, we download it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_dos(entry: EntryArchive):\n", + " \"\"\"Retrieve the density of states spectrum\"\"\"\n", + "\n", + " calc = entry.run[0].calculation[-1]\n", + "\n", + " total = calc.dos_electronic[0].total[0].value.to(\"1/eV\")\n", + " normalization_factor = calc.dos_electronic[0].total[0].normalization_factor\n", + "\n", + " offset = calc.energy.highest_occupied.to(\"eV\")\n", + "\n", + " energies = calc.dos_electronic[0].energies.to(\"eV\") - offset\n", + " values = total * normalization_factor\n", + "\n", + " spectrum = {\"energies\": energies, \"values\": values}\n", + "\n", + " return spectrum" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "required_dos = {\n", + " \"workflow\": {\n", + " \"calculation_result_ref\": {\n", + " \"dos_electronic\": {\n", + " \"energies\": \"*\",\n", + " \"total\": {\"value\": \"*\", \"normalization_factor\": \"*\"},\n", + " },\n", + " \"energy\": {\"highest_occupied\": \"*\"},\n", + " }\n", + " },\n", + " \"metadata\": {\n", + " \"entry_id\": \"*\",\n", + " },\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reference_dos_query = ArchiveQuery(\n", + " query={\"entry_id\": reference_entry_id},\n", + " required=required_dos\n", + ")\n", + "\n", + "reference_dos_result = reference_dos_query.download()[0]\n", + "reference_dos = get_dos(reference_dos_result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, 
+ "outputs": [], + "source": [ + "# Number of the calulation with the highest similarity score\n", + "n = 3\n", + "\n", + "# get entry_id from table\n", + "entry_ids = [m[\"entry_id\"] for m in islice(materials.values(), n)]\n", + "\n", + "materials_dos_query = ArchiveQuery(\n", + " query={\"entry_id\": {\"any\": entry_ids}},\n", + " required=required_dos,\n", + " results_max=n # TODO: fix this\n", + ")\n", + "\n", + "materials_dos_results = materials_dos_query.download()\n", + "\n", + "materials_dos = {}\n", + "for r in materials_dos_results:\n", + " materials_dos[r.metadata.entry_id] = get_dos(r)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And finally we plot the spectra." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10, 5))\n", + "plt.fill_between(\n", + " reference_dos[\"energies\"].magnitude,\n", + " reference_dos[\"values\"].magnitude,\n", + " label=f\"Reference: {reference['formula']}\",\n", + " facecolor=(\"r\", 0.2),\n", + " edgecolor=\"r\",\n", + ")\n", + "\n", + "for entry_id in entry_ids:\n", + " energies = materials_dos[entry_id][\"energies\"].magnitude\n", + " values = materials_dos[entry_id][\"values\"].magnitude\n", + "\n", + " chem_formula = materials[entry_id][\"formula\"]\n", + " Tc = materials[entry_id][\"similarity\"]\n", + "\n", + " plt.plot(energies, values, label=f\"{chem_formula}, Tc = {Tc: .2f}\")\n", + " # plt.plot(energies, values)\n", + " # plt.fill_between(energies, values, alpha = 0.2, label = f\"{chem_formula}, Tc = {Tc: .2f}\")\n", + "\n", + "\n", + "plt.ylabel(r\"DOS [$\\frac{1}{eV}$]\")\n", + "plt.xlabel(r\"Energy [$eV$]\")\n", + "\n", + "plt.xlim(-10, 5)\n", + "plt.ylim(0, 2)\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<span style='font-family:sans-serif'>\n", + "\n", + "# References\n", + "\n", + "[1] M. Kuban, S. Rigamonti, M. 
Scheidgen, and C. Draxl: [Density-of-states similarity descriptor for unsupervised learning from materials data](https://arxiv.org/abs/2201.02187)\n", + "\n", + "[2] P. Willett, J. M. Barnard, G. M. Downs: [Chemical Similarity Searching](https://pubs.acs.org/doi/abs/10.1021/ci9800211), $\\textit{J. Chem. Inf. Comput. Sci.}$, $\\textbf{38}$, 983, (1998)\n", + "</span>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<span style='font-family:sans-serif'>\n", + "\n", + "# Acknowledgements\n", + "\n", + "We thank Luca Ghiringhelli for help in preparing this notebook. This work received partial funding from the European Union’s Horizon 2020 research and innovation program under the grant agreement Nº 951786 (NOMAD CoE), from the NFDI consortium FAIRmat, and from the German Research Foundation (DFG) through the CRC 1404 (FONDA).\n", + "</span>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Debugging" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# # choose a representative from the AFLOW or OQMD data\n", + "# materials_query = ArchiveQuery(\n", + "# query = {\n", + "# 'results.method.simulation.program_name': 'VASP',\n", + "# 'results.material.elements': {'all': ['Ga', 'As']},\n", + "# 'results.properties.available_properties': ['dos_electronic'],\n", + "# 'results.method.simulation.dft.xc_functional_type': ['GGA'],\n", + "# 'results.material.n_elements': {'gte': 2, 'lte': 3},\n", + "# },\n", + "# required = required_sections = {\n", + "# # DOS fingerprint\n", + "# 'workflow': {\n", + "# 'calculation_result_ref': {\n", + "# 'dos_electronic': {\n", + "# 'fingerprint': '*'\n", + "# }\n", + "# }\n", + "# },\n", + "# # Upload and calculation id\n", + "# 'metadata': {\n", + "# 'calc_id': '*',\n", + "# },\n", + "# # chemical formula, material id, and space group number\n", + "# 'results':{\n", + "# 'material':{\n", + "# 
'chemical_formula_reduced': '*',\n", + "# 'material_id': '*',\n", + "# 'symmetry': {\n", + "# 'space_group_number': '*'\n", + "# }\n", + "# }\n", + "# }\n", + "# },\n", + "# page_size = 100,\n", + "# results_max = 4177\n", + "# )\n", + "\n", + "# materials_query.fetch()\n", + "\n", + "# materials_results = []\n", + "# while True:\n", + "# result = materials_query.download(100)\n", + "# if len(result) == 0:\n", + "# break\n", + "# materials_results.extend(result)\n", + "\n", + "# print(len(materials_results))\n", + "\n", + "# for m in materials_results[:10]:\n", + "# print(f\"entry_id: {m.metadata.entry_id}, calc_id: {m.metadata.calc_id}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we remove entries without DOS fingerprint data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # removing entries with empty fingerprints\n", + "# for i, result in enumerate(materials_results):\n", + "# if not result.run[0].calculation[-1].dos_electronic[0].fingerprint.bins:\n", + "# del materials_results[i]\n", + "\n", + "# len(materials_results)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Archive API returns all calculations which fit the query, therefore, **for a single material multiple calculations (e.g. from different authors) are downloaded.** \n", + "\n", + "To simplify the analysis presented here, **we select a representative calculation for each material**. To do so, we define a function called `select_representative`. The cell below shows an example of this function that takes the first encountered calculation of a material as the representative. However, different approaches can be used, e.g., based on computational parameters employed in the DFT calculations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# def select_representative(reference:dict, materials: dict) -> dict:\n", + "# \"\"\"Example of a `select_representative` function.\n", + "# Returns the first calculation of a material it finds.\n", + "\n", + "# Inputs:\n", + "# materials: containing the materials in a dictionary\n", + "# as outputted by `get_fingerprints`\n", + "# \"\"\"\n", + "\n", + "# material_ids = set()\n", + "# output = {}\n", + "\n", + "# # for entry_id, material in materials.items():\n", + "# for entry_id, material in sorted(materials.items(), key = lambda x: x[1]['similarity'], reverse = True):\n", + "\n", + "# if material['formula'] == reference['formula']:\n", + "# continue\n", + "\n", + "# material_id = material['material_id']\n", + "\n", + "# if material_id in material_ids:\n", + "# continue\n", + "\n", + "# material_ids.add(material_id)\n", + "# output[entry_id] = material\n", + "\n", + "# return output\n", + "\n", + "# materials = select_representative(reference, materials)\n", + "# len(materials)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# reference: As4Ga4 (sg: 216)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f'{\"formula\":12} {\"space_group\"} {\"similarity\":<10} {\"entry_id\":<28}')\n", + "for m in materials.values():\n", + " if m[\"formula\"] == \"AsGa\":\n", + " print(\n", + " f'{m[\"formula\"]:<12} {m[\"space_group\"]:>11} {m[\"similarity\"]:10.4f} https://nomad-lab.eu/prod/v1/gui/entry/id/{m[\"entry_id\"]}'\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get entry_id from table\n", + "entry_ids = [m[\"entry_id\"] for m in materials.values() if m[\"formula\"] == \"AsGa\"]\n", + "n = len(entry_ids)\n", + "materials_dos_query = 
ArchiveQuery(\n", + " query={\"entry_id\": {\"any\": entry_ids}}, required=required_dos, results_max=n\n", + ")\n", + "\n", + "materials_dos_results = materials_dos_query.download()\n", + "\n", + "materials_dos = {}\n", + "for r in materials_dos_results:\n", + " materials_dos[r.metadata.entry_id] = get_dos(r)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10, 5))\n", + "\n", + "plt.fill_between(\n", + " reference_dos[\"energies\"].magnitude,\n", + " reference_dos[\"values\"].magnitude,\n", + " label=f\"Reference: {reference['formula']}\",\n", + " facecolor=(\"r\", 0.2),\n", + " edgecolor=\"r\",\n", + ")\n", + "\n", + "for entry_id in entry_ids:\n", + " energies = materials_dos[entry_id][\"energies\"].magnitude\n", + " values = materials_dos[entry_id][\"values\"].magnitude\n", + "\n", + " chem_formula = materials[entry_id][\"formula\"]\n", + " Tc = materials[entry_id][\"similarity\"]\n", + "\n", + " plt.plot(energies, values, label=f\"{chem_formula}, Tc = {Tc: .2f}\")\n", + "# plt.fill_between(energies, values, alpha = 0.2, )\n", + "\n", + "\n", + "plt.ylabel(r\"DOS [$\\frac{1}{eV}$]\")\n", + "plt.xlabel(r\"Energy [$eV$]\")\n", + "\n", + "plt.xlim(-10, 5)\n", + "plt.ylim(0, 2)\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "hide_input": false, + "interpreter": { + "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "metadata": { + "interpreter": { + "hash": 
"2dfbd2783628fbe9267bf55a6c8862bcee5db4b092f4a0aebb9a9eab56757fa6" + } + }, + "toc": { + "base_numbering": "0", + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": true, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}