Merge branch 'joss' of gitlab.mpcdf.mpg.de:tpurcell/cpp_sisso into data_overwrite_error

11c7c9f6 · Thomas Purcell · ed9347fc · d4df6d55 · 11c7c9f6 · 11c7c9f6
Commit 11c7c9f6 authored Sep 16, 2021 by Thomas Purcell
--- a/joss/paper.bib
+++ b/joss/paper.bib
@@ -110,3 +110,110 @@ title = {{AI Feynman: A physics-inspired method for symbolic regression}},
 volume = {6},
 year = {2020}
 }
+@article{Bartel2018a,
+abstract = {The Gibbs energy, G, determines the equilibrium conditions of chemical reactions and materials stability. Despite this fundamental and ubiquitous role, G has been tabulated for only a small fraction of known inorganic compounds, impeding a comprehensive perspective on the effects of temperature and composition on materials stability and synthesizability. Here, we use the SISSO (sure independence screening and sparsifying operator) approach to identify a simple and accurate descriptor to predict G for stoichiometric inorganic compounds with {\~{}}50 meV atom$^{-1}$ ({\~{}}1 kcal mol$^{-1}$) resolution, and with minimal computational cost, for temperatures ranging from 300--1800 K. We then apply this descriptor to {\~{}}30,000 known materials curated from the Inorganic Crystal Structure Database (ICSD). Using the resulting predicted thermochemical data, we generate thousands of temperature-dependent phase diagrams to provide insights into the effects of temperature and composition on materials synthesizability and stability and to establish the temperature-dependent scale of metastability for inorganic compounds.},
+archivePrefix = {arXiv},
+arxivId = {1805.08155},
+author = {Bartel, Christopher J. and Millican, Samantha L. and Deml, Ann M. and Rumptz, John R. and Tumas, William and Weimer, Alan W. and Lany, Stephan and Stevanovi{\'{c}}, Vladan and Musgrave, Charles B. and Holder, Aaron M.},
+doi = {10.1038/s41467-018-06682-4},
+eprint = {1805.08155},
+file = {:home/purcell/Documents/Mendeley Desktop/Bartel et al. - Nature Communications - 2018.pdf:pdf},
+issn = {2041-1723},
+journal = {Nat. Commun.},
+keywords = {Statistics,Theory and computation},
+month = oct,
+number = {1},
+pages = {1--10},
+pmid = {30301890},
+publisher = {Nature Publishing Group},
+title = {{Physical descriptor for the Gibbs energy of inorganic crystalline solids and temperature-dependent materials chemistry}},
+url = {https://www.nature.com/articles/s41467-018-06682-4},
+volume = {9},
+year = {2018}
+}
+@article{Schleder2020,
+abstract = {The increasing interest and research on two-dimensional (2D) materials has not yet translated into a reality of diverse materials applications. To go beyond graphene and transition metal dichalcogenides for several applications, suitable candidates with desirable properties must be proposed. Here we use machine learning techniques to identify thermodynamically stable 2D materials, which is the first essential requirement for any application. According to the formation energy and energy above the convex hull, we classify materials as having low, medium, or high stability. The proposed approach enables the stability evaluation of novel 2D compounds for further detailed investigation of promising candidates, using only composition properties and structural symmetry, without the need for information about atomic positions. We demonstrate the usefulness of the model generating more than a thousand novel compounds, corroborating with DFT calculations the classification for five of these materials. To illustrate the applicability of the stable materials, we then perform a screening of electronic materials suitable for photoelectrocatalytic water splitting, identifying the potential candidate Sn2SeTe generated by our model, and also PbTe, both not yet reported for this application.},
+author = {Schleder, Gabriel R. and Acosta, Carlos Mera and Fazzio, Adalberto},
+doi = {10.1021/acsami.9b14530},
+issn = {1944-8252},
+journal = {ACS Appl. Mater. Interfaces},
+keywords = {big data,density functional theory (DFT),high throughput screening,machine learning,two-dimensional materials},
+month = may,
+number = {18},
+pages = {20149--20157},
+pmid = {31692336},
+publisher = {American Chemical Society},
+title = {{Exploring Two-Dimensional Materials Thermodynamic Stability via Machine Learning}},
+url = {https://pubs.acs.org/doi/full/10.1021/acsami.9b14530},
+volume = {12},
+year = {2020}
+}
+@article{Han2021,
+abstract = {Single-atom-alloy catalysts (SAACs) have recently become a frontier in catalysis research. Simultaneous optimization of reactants' facile dissociation and a balanced strength of intermediates' binding make them highly efficient catalysts for several industrially important reactions. However, discovery of new SAACs is hindered by lack of fast yet reliable prediction of catalytic properties of the large number of candidates. We address this problem by applying a compressed-sensing data-analytics approach parameterized with density-functional inputs. Besides consistently predicting efficiency of the experimentally studied SAACs, we identify more than 200 yet unreported promising candidates. Some of these candidates are more stable and efficient than the reported ones. We have also introduced a novel approach to a qualitative analysis of complex symbolic regression models based on the data-mining method subgroup discovery. Our study demonstrates the importance of data analytics for avoiding bias in catalysis design, and provides a recipe for finding best SAACs for various applications.},
+author = {Han, Zhong Kang and Sarker, Debalaya and Ouyang, Runhai and Mazheika, Aliaksei and Gao, Yi and Levchenko, Sergey V.},
+doi = {10.1038/s41467-021-22048-9},
+file = {:home/purcell/Documents/Mendeley Desktop/Han et al. - Nature Communications - 2021.pdf:pdf},
+issn = {2041-1723},
+journal = {Nat. Commun.},
+keywords = {Computational chemistry,Materials for energy and catalysis,Theory and computation},
+month = mar,
+number = {1},
+pages = {1--9},
+pmid = {33758170},
+publisher = {Nature Publishing Group},
+title = {{Single-atom alloy catalysts designed by first-principles calculations and artificial intelligence}},
+url = {https://www.nature.com/articles/s41467-021-22048-9},
+volume = {12},
+year = {2021}
+}
+@article{Andersen2021,
+abstract = {Conspectus: Heterogeneous catalysts are rather complex materials that come in many classes (e.g., metals, oxides, carbides) and shapes. At the same time, the interaction of the catalyst surface with even a relatively simple gas-phase environment such as syngas (CO and H2) may already produce a wide variety of reaction intermediates ranging from atoms to complex molecules. The starting point for creating predictive maps of, e.g., surface coverages or chemical activities of potential catalyst materials is the reliable prediction of adsorption enthalpies of all of these intermediates. For simple systems, direct density functional theory (DFT) calculations are currently the method of choice. However, a wider exploration of complex materials and reaction networks generally requires enthalpy predictions at lower computational cost. The use of machine learning (ML) and related techniques to make accurate and low-cost predictions of quantum-mechanical calculations has gained increasing attention lately. The employed approaches span from physically motivated models over hybrid physics-$\Delta$ML approaches to complete black-box methods such as deep neural networks. In recent works we have explored the possibilities for using a compressed sensing method (Sure Independence Screening and Sparsifying Operator, SISSO) to identify sparse (low-dimensional) descriptors for the prediction of adsorption enthalpies at various active-site motifs of metals and oxides. We start from a set of physically motivated primary features such as atomic acid/base properties, coordination numbers, or band moments and let the data and the compressed sensing method find the best algebraic combination of these features. Here we take this work as a starting point to categorize and compare recent ML-based approaches with a particular focus on model sparsity, data efficiency, and the level of physical insight that one can obtain from the model. Looking ahead, while many works to date have focused only on the mere prediction of databases of, e.g., adsorption enthalpies, there is also an emerging interest in our field to start using ML predictions to answer fundamental science questions about the functioning of heterogeneous catalysts or perhaps even to design better catalysts than we know today. This task is significantly simplified in works that make use of scaling-relation-based models (volcano curves), where the model outcome is determined by only one or two adsorption enthalpies and which consequently become the sole target for ML-based high-throughput screening or design. However, the availability of cheap ML energetics also allows going beyond scaling relations. On the basis of our own work in this direction, we will discuss the additional physical insight that can be achieved by integrating ML-based predictions with traditional catalysis modeling techniques from thermal and electrocatalysis, such as the computational hydrogen electrode and microkinetic modeling, as well as the challenges that lie ahead.},
+author = {Andersen, Mie and Reuter, Karsten},
+doi = {10.1021/acs.accounts.1c00153},
+file = {:home/purcell/Documents/Mendeley Desktop/Andersen, Reuter - Accounts of Chemical Research - 2021.pdf:pdf},
+issn = {0001-4842},
+journal = {Acc. Chem. Res.},
+month = jun,
+number = {12},
+pages = {2741--2749},
+pmid = {34080415},
+publisher = {American Chemical Society},
+title = {{Adsorption Enthalpies for Catalysis Modeling through Machine-Learned Descriptors}},
+url = {https://pubs.acs.org/doi/abs/10.1021/acs.accounts.1c00153},
+volume = {54},
+year = {2021}
+}
+@article{Xu2020,
+abstract = {Computational screening of metal oxide catalysts is challenging due to their more localized and intricate electronic structure as compared to metal catalysts and the resulting lack of suitable activity descriptors to replace expensive density functional theory (DFT) calculations. By using a compressed sensing approach, we here identify descriptors in the form of algebraic expressions of surface-derived features for predicting adsorption enthalpies of oxygen evolution reaction (OER) intermediates at doped RuO2 and IrO2 electrocatalysts. Our descriptors significantly outperform previously highlighted single descriptors both in terms of accuracy and computational cost. Compared to standard scaling relations that employ the oxygen adsorption enthalpy as a unique reactivity descriptor, our analysis reveals that the consideration of features related to the local charge transfer leads to significantly improved refined scaling relations. These allow us to screen for improved OER electrocatalysts with an uncertainty in the theoretical overpotential similar to the expected intrinsic DFT error of 0.2 V.},
+author = {Xu, Wenbin and Andersen, Mie and Reuter, Karsten},
+doi = {10.1021/acscatal.0c04170},
+file = {:home/purcell/Documents/Mendeley Desktop/Xu, Andersen, Reuter - ACS Catalysis - 2021.pdf:pdf},
+issn = {2155-5435},
+journal = {ACS Catal.},
+keywords = {ab initio calculation,compressed sensing,computational screening,heterogeneous catalysis,machine learning,oxygen evolution reaction,transition metal oxides},
+month = jan,
+number = {2},
+pages = {734--742},
+publisher = {American Chemical Society},
+title = {{Data-Driven Descriptor Engineering and Refined Scaling Relations for Predicting Transition Metal Oxide Reactivity}},
+url = {https://pubs.acs.org/doi/full/10.1021/acscatal.0c04170},
+volume = {11},
+year = {2021}
+}
+@article{Pilania2019,
+abstract = {Polyhydroxyalkanoate-based polymers - being ecofriendly, biosynthesizable, and economically viable and possessing a broad range of tunable properties - are currently being actively pursued as promising alternatives for petroleum-based plastics. The vast chemical complexity accessible within this class of polymers gives rise to challenges in the rational discovery of novel polymer chemistries for specific applications. The burgeoning field of polymer informatics addresses this challenge via providing tools and strategies for accelerated property prediction and materials design via surrogate machine-learning models built on reliable past data. In this contribution, we use glass transition temperature Tg as an example target property to demonstrate promise of the data-enabled route to accelerated learning of accurate structure-property mappings in PHA-based polymers. Our analysis uses a data set of experimentally measured Tg values, polymer molecular weights, and a polydispersity index for PHA-based homo- and copolymers that was carefully assembled from the literature. A fingerprinting scheme that captures key properties based on topology, shape, and charge/polarity of specific chemical units or motifs forming the polymer backbone was devised to numerically represent the polymers. A validated statistical learning model is then developed to allow for a mapping of the polymer fingerprints onto the property space in a physically meaningful and reliable manner. Once developed, the model can not only rapidly predict the property of new PHA polymers but also provide uncertainties underlying the predictions. The model is further combined with an evolutionary-algorithm-based search strategy to efficiently identify multicomponent polymer compositions with a prespecified Tg. While the present contribution is focused specifically on Tg, the surrogate model development approach put forward here is general and can, in principle, be extended to a range of other properties.},
+author = {Pilania, Ghanshyam and Iverson, Carl N. and Lookman, Turab and Marrone, Babetta L.},
+doi = {10.1021/acs.jcim.9b00807},
+file = {:home/purcell/Documents/Mendeley Desktop/Pilania et al. - Journal of Chemical Information and Modeling - 2019.pdf:pdf},
+issn = {1520-5142},
+journal = {J. Chem. Inf. Model.},
+month = dec,
+number = {12},
+pages = {5013--5025},
+pmid = {31697891},
+publisher = {American Chemical Society},
+title = {{Machine-Learning-Based Predictive Modeling of Glass Transition Temperatures: A Case of Polyhydroxyalkanoate Homopolymers and Copolymers}},
+url = {https://pubs.acs.org/doi/full/10.1021/acs.jcim.9b00807},
+volume = {59},
+year = {2019}
+}
--- a/joss/paper.md
+++ b/joss/paper.md
@@ -38,26 +38,25 @@ A statement of need: Does the paper have a section titled ‘Statement of Need
 API, documentations, tutorial and quickstart guides are also important features

 # Summary
-The SISSO++ package is a C++ implementation of the sure-independence screening and sparsifying operator (SISSO) method with python bindings.
-SISSO is a symbolic regression method that takes in a set of input primary features and iteratively applies a set of analytical unary and binary operators to build a large and exhaustive feature space [@Ouyang2019a, @Ouyang2017].
-The goal of symbolic regression techniques is to find the mathematical expression that best describes a given target property given a set of primary features, i.e. the input features, and analytic operations.
-Because symbolic regression results an interpretable equation, it is an increasingly popular method across scientific disciplines [@Wang2019a, @Neumann2020, @Udrescu2020a].
-The SISSO++ package provides a modular library, executable, and python interface for the application of SISSO to myriad applications.
-
-SISSO++ implements the SISSO algorithm in a user-friendly, modular C++ library that is connected to both an executable and native python interface.
-The first step of SISSO is to build all possible expressions up to a user-defined maximum complexity, from the initial set of primary features and analytic operations.
-From here, an $\ell_0$-regularization is performed to find the best low-dimensional linear model of the features using the SISSO operator.
+The sure-independence screening and sparsifying operator (SISSO) method [@Ouyang2017] is an algorithm belonging to the field of artificial intelligence.
+As a symbolic regression technique, SISSO is used to identify low-dimensional, analytic functions, the so-called descriptors, that best reproduce or classify a target data set.
+In practice, SISSO first constructs a large and exhaustive feature space of billions of potential descriptors by taking in a set of user-provided primary features, i.e. the input features, and then iteratively applying a set of analytical unary and binary operators, e.g., addition, multiplication, exponentiation, and squaring.
+From this exhaustive pool of candidate descriptors, the best one is identified by performing an $\ell_0$-regularization to find the best low-dimensional linear model of the features using the SISSO operator.
+
+Because symbolic regression generates an interpretable equation, it has become an increasingly popular method across scientific disciplines [@Wang2019a; @Neumann2020; @Udrescu2020a].
+In particular, SISSO has been used successfully in the past to solve numerous problems in materials science, including: the stability of materials [@Bartel2018a; @Schleder2020], catalysis [@Han2021; @Xu2020; @Andersen2021], and glass transition temperatures [@Pilania2019].
+Beyond regression problems, SISSO has also been successfully used to classify materials into different crystal prototypes [@Ouyang2019] or to determine whether a material is a topological insulator [@Cao2019].
+
+The SISSO++ package is a modular and extensible C++ implementation of the SISSO method with python bindings.
 Specifically, SISSO++ applies this methodology for regression, log regression, and classification problems.
 Additionally the library include multiple python functions to facilitate the post-processing, analyzing, and visualizing the resulting models.
-Finally, we designed a code to be modular, allowing for future extensions to existing functionality.

 # Statement of need
-The main goal of the SISSO++ package is to provide a user-friendly, easily-extendable version of the SISSO for the use of the scientific community.
-While existing packages provide a high-performing implementation of SISSO [@Ouyang], multiple external efforts have implemented python wrappers to create a more accessible interface [@Xu, @Waroquiers].
-Additionally, SISSO++ addresses need for an implementation of postprocessing tools that facilitate the standard analysis tasks for the output of SISSO.
-Another key feature of the library is the modular design that simplifies the process of extending the code for other applications.
+The main goal of the SISSO++ package is to provide a user-friendly, easily-extendable version of the SISSO method for the use of the scientific community that can be used both on high-performance architectures for data production and on personal computing devices for analyzing and visualizing the results.
+For this reason, all computationally intensive tasks are written in C++ and support parallelization via MPI and OpenMP.
+Additionally, the Python bindings make it easy to incorporate the methods into computational workflows and to postprocess the results.
+The code is designed in a modular fashion, which simplifies the process of extending the code for other applications.
 Finally the project's extensive documentation and tutorials provide a good access point for new-users of the method.
-SISSO++ will broaden the applicability of SISSO to a wider audience and set of applications.

 # Features
 The following features are implemented in SISSO++: