band_gap_prediction.bkr 7.12 MB
 Lauri Himanen committed Aug 03, 2017 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 { "beaker": "2", "evaluators": [ { "name": "HTML", "plugin": "HTML", "view": { "cm": { "mode": "smartHTMLMode" } } }, { "name": "JavaScript", "plugin": "JavaScript", "view": { "cm": { "mode": "javascript", "background": "#FFE0F0" } }, "languageVersion": "ES2015" }, { "name": "Python3", "plugin": "Python3", "setup": "%matplotlib inline\nimport numpy\nimport matplotlib\nfrom matplotlib import pylab, mlab, pyplot\nnp = numpy\nplt = pyplot\nfrom IPython.display import display\nfrom IPython.core.pylabtools import figsize, getfigs\nfrom pylab import *\nfrom numpy import *\n", "view": { "cm": { "mode": "python" } } }, { "name": "IPython", "plugin": "IPython", "deferred": { "promise": {} }, "setup": "%matplotlib inline\nimport numpy\nimport matplotlib\nfrom matplotlib import pylab, mlab, pyplot\nnp = numpy\nplt = pyplot\nfrom IPython.display import display\nfrom IPython.core.pylabtools import figsize, getfigs\nfrom pylab import *\nfrom numpy import *\n", "view": { "cm": { "mode": "python" } } } ], "cells": [ { "id": "codeOQaB8J", "type": "code", "evaluator": "Python3", "input": { "body": [ "# Define notebook variables",  56 57  "beaker.dir = \"/home/beaker/test/band-gap-prediction\" # Production", "#beaker.dir = \"/home/beaker/host/band-gap-prediction/\" # Development",  Lauri Himanen committed Aug 03, 2017 58 59 60 61 62 63 64  "beaker.json_name = \"3366_gap_no_gap_ratio.json\"" ] }, "output": { "state": {}, "selectedType": "Hidden", "pluginName": "Python3",  65 66  "shellId": "D5A821EEFD7346AAA757F8AB337F9C21", "elapsedTime": 77  Lauri Himanen committed Aug 03, 2017 67 68  }, "evaluatorReader": true,  69  "lineCount": 4,  Lauri Himanen committed Aug 03, 2017 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 
108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131  "isError": false }, { "id": "load_stylesheets", "type": "code", "evaluator": "HTML", "input": { "body": [ "", "", "" ] }, "output": { "state": {}, "result": { "type": "BeakerDisplay", "innertype": "Html", "object": "\n\n\n" }, "selectedType": "BeakerDisplay",  132 133  "elapsedTime": 0, "height": 50  Lauri Himanen committed Aug 03, 2017 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161  }, "evaluatorReader": true, "lineCount": 44, "isError": false }, { "id": "load_javascript", "type": "code", "evaluator": "JavaScript", "input": { "body": [ "/**", " * Load external JavaScripts", " */", ";(function() {", " var asset = \"//cdnjs.cloudflare.com/ajax/libs/chosen/1.6.2\";", "", " // Load external JavaScript libraries", " beaker.loadCSS(asset + \"/chosen.min.css\");", " beaker.loadJS(asset + \"/chosen.jquery.min.js\");", "})();" ] }, "output": { "hidden": true, "state": {}, "selectedType": "BeakerDisplay", "pluginName": "JavaScript",  162  "elapsedTime": 12  Lauri Himanen committed Aug 03, 2017 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205  }, "evaluatorReader": true, "lineCount": 10, "isError": false }, { "id": "codetR2xzL", "type": "code", "evaluator": "HTML", "input": { "body": [ "
", "
", "
", "

NOMAD Analytics Toolkit ", "

", "
", "

Predicting the Existence of a Band Gap from Crystal Structures

", "

", " created by:", "Lauri Himanen1", "Filippo Federici Canova1

", " ", " 1Aalto University School of Science, P. O. Box 15100, FI-00076 Aalto (Espoo), Finland
", " [Last updated: July 31, 2017]", "

", "
", "
", "" ] }, "output": { "state": {}, "result": { "type": "BeakerDisplay", "innertype": "Html",  206  "object": "\n
\n
\n
\n

NOMAD Analytics Toolkit \n

\n
\n

Predicting the Existence of a Band Gap from Crystal Structures

\n

\n created by:\nLauri Himanen1\nFilippo Federici Canova1

\n \n 1Aalto University School of Science, P. O. Box 15100, FI-00076 Aalto (Espoo), Finland
\n [Last updated: July 31, 2017]\n

\n
\n
\n"  Lauri Himanen committed Aug 03, 2017 207 208 209  }, "selectedType": "BeakerDisplay", "elapsedTime": 0,  Lauri Himanen committed Jan 26, 2018 210  "height": 366  Lauri Himanen committed Aug 03, 2017 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237  }, "evaluatorReader": true, "lineCount": 25, "isError": false }, { "id": "codeBI4ymq", "type": "code", "evaluator": "HTML", "input": { "body": [ "
", "

Introduction

", "

In this notebook we show an example of using data in the NOMAD Archive to train a machine learning classifier that can predict the existence of an electronic band gap from crystal structure only. We achieve roughly 80% prediction accuracy with the best classifier and 9894 samples. As input for the learning we use very compact structural descriptors that are invariant under rotation and translation and include only information about the atomic species and their positions.", "

", "
" ] }, "output": { "state": {}, "result": { "type": "BeakerDisplay", "innertype": "Html", "object": "\n
\n

Introduction

\n

In this notebook we show an example of using data in the NOMAD Archive to train a machine learning classifier that can predict the existence of an electronic band gap from crystal structure only. We achieve roughly 80% prediction accuracy with the best classifier and 9894 samples. As input for the learning we use very compact structural descriptors that are invariant under rotation and translation and include only information about the atomic species and their positions.\n

\n
" }, "selectedType": "BeakerDisplay", "elapsedTime": 0,  238  "height": 209  Lauri Himanen committed Aug 03, 2017 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294  }, "evaluatorReader": true, "lineCount": 5, "isError": false }, { "id": "codeGaFZNr", "type": "code", "evaluator": "HTML", "input": { "body": [ "
", "

Data

", "

We use data from the NOMAD Archive to train the classifiers. We have selected calculations with the following criteria:

", "
", "
• VASP calculations originating from the AFLOWLIB1 project
• ", "
• Periodic crystal structures
• ", "
• PBE exchange-correlation functional2
• ", "
• Projector-augmented wavefunction (PAW) potentials3,4
• ", "
• The data should conform to the AFLOWLIB Standard for High-Throughput Computing5, which ensures reproducibility of the data, and provides reasoning for any parameters set in the calculation, such as accuracy thresholds, calculation pathways, and mesh dimensions.
• ", "
• The calculation must have density of states (\"dos_energies_normalized\" and \"dos_values\") available, because we detect and calculate the band gap based on this information. The DOS energies in the Archive have been normalized so that 0 is at the top of the valence band.
• ", "
• No more than 8 atoms in the simulation cell
• ", "
• Ignoring elements with < 0.5% occurrence in the whole dataset. Occurrence of an atomic element is the percentage of samples with at least one atom of that species.
• ", "
• To ensure that we do not allow the same structure to enter the dataset twice, we only allow one sample for each chemical formula.
• ", "
", "

The Archive data alone is not sufficient to ensure some properties of the calculations. The following issues/restrictions have been identified and should be considered when analyzing the results:

", "
", "
• Based on the Archive information we cannot determine if the structure has been relaxed, so we may also be including unrelaxed samples.
• ", "
• The Archive output does not have convergence information (\"single_configuration_calculation_converged\") to determine if the calculation has been converged against some set of convergence criteria (\"settings_scf\").
• ", "
", "

From the calculations that match these criteria we choose all calculations with a band gap, which amounts to 3298 samples. The dataset contains many more samples without a band gap, but we randomly choose 6596 of such samples. The final dataset then consists of 9894 samples. The imbalance between the classes does not seem to significantly affect training as long as the samples are weighted during training.

", "

The band gap distribution is highly skewed towards materials with a low band gap (semiconductors), which poses a challenge for the training of the classifiers as many of the samples will be near the decision boundary.

", " ", " Plot histogram of non-zero band gaps", " Plot element occurence", "
", "", "" ] }, "output": { "state": {}, "result": { "type": "BeakerDisplay", "innertype": "Html", "object": "\n
\n

Data

\n

We use data from the NOMAD Archive to train the classifiers. We have selected calculations with the following criteria:

\n
\n
• VASP calculations originating from the AFLOWLIB1 project
• \n
• Periodic crystal structures
• \n
• PBE exchange-correlation functional2
• \n
• Projector-augmented wavefunction (PAW) potentials3,4
• \n
• The data should conform to the AFLOWLIB Standard for High-Throughput Computing5, which ensures reproducibility of the data, and provides reasoning for any parameters set in the calculation, such as accuracy thresholds, calculation pathways, and mesh dimensions.
• \n
• The calculation must have density of states (\"dos_energies_normalized\" and \"dos_values\") available, because we detect and calculate the band gap based on this information. The DOS energies in the Archive have been normalized so that 0 is at the top of the valence band.
• \n
• No more than 8 atoms in the simulation cell
• \n
• Ignoring elements with < 0.5% occurrence in the whole dataset. Occurrence of an atomic element is the percentage of samples with at least one atom of that species.
• \n
• To ensure that we do not allow the same structure to enter the dataset twice, we only allow one sample for each chemical formula.
• \n
\n

The Archive data alone is not sufficient to ensure some properties of the calculations. The following issues/restrictions have been identified and should be considered when analyzing the results:

\n
\n
• Based on the Archive information we cannot determine if the structure has been relaxed, so we may also be including unrelaxed samples.
• \n
• The Archive output does not have convergence information (\"single_configuration_calculation_converged\") to determine if the calculation has been converged against some set of convergence criteria (\"settings_scf\").
• \n
\n

From the calculations that match these criteria we choose all calculations with a band gap, which amounts to 3298 samples. The dataset contains many more samples without a band gap, but we randomly choose 6596 of such samples. The final dataset then consists of 9894 samples. The imbalance between the classes does not seem to significantly affect training as long as the samples are weighted during training.

\n

The band gap distribution is highly skewed towards materials with a low band gap (semiconductors), which poses a challenge for the training of the classifiers as many of the samples will be near the decision boundary.

\n \n Plot histogram of non-zero band gaps\n Plot element occurence\n
\n\n" }, "selectedType": "BeakerDisplay", "elapsedTime": 0,  295  "height": 702  Lauri Himanen committed Aug 03, 2017 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362  }, "evaluatorReader": true, "lineCount": 34, "isError": false }, { "id": "moi", "type": "code", "evaluator": "Python3", "input": { "body": [ "# This script plots a histogram of the band gap values.", "", "import numpy", "from plotly.offline import download_plotlyjs, init_notebook_mode, iplot", "import plotly.graph_objs as go", "", "# Injects the plotly.js source files into the notebook", "init_notebook_mode()", "", "# Load the band gap data", "band_gaps = np.load(\"{}/band_gaps.npy\".format(beaker.dir))", "non_zero_gaps = band_gaps[band_gaps != 0]", "std = non_zero_gaps.std()", "mean = non_zero_gaps.mean()", "", "# Plot a histogram of the band gap values", "hist = go.Histogram(", " x=non_zero_gaps,", " xbins=dict(", " start=0,", " end=25,", " size=0.05,", " ),", ")", "data = [hist]", "layout = go.Layout(", " width=1000,", " height=500,", " title=\"Histogram for the non-zero band gaps\",", " titlefont={", " \"size\":25,", " },", " xaxis=dict(", " title=\"Band gap (eV)\",", " titlefont=dict(", " size=20,", " ),", " range=[0, mean+3*std]", " ),", " yaxis=dict(", " title=\"Probability\",", " titlefont=dict(", " size=20,", " )", " ),", ")", "fig = go.Figure(data=data, layout=layout)", "iplot(fig)", "" ] }, "output": { "result": { "type": "OutputContainer", "psubtype": "OutputContainer", "items": [  363 364  "