diff --git a/joss/paper.bib b/joss/paper.bib index cc186e56cd5d8a8c3aba367db130b11773a0f652..5c8f3cbd620b70c379a76c9fcceb281a136d1caa 100644 --- a/joss/paper.bib +++ b/joss/paper.bib @@ -217,6 +217,40 @@ url = {https://pubs.acs.org/doi/full/10.1021/acs.jcim.9b00807}, volume = {59}, year = {2019} } +@article{Bartel2019a, +abstract = {Predicting the stability of the perovskite structure remains a long-standing challenge for the discovery of new functional materials for many applications including photovoltaics and electrocatalysts. We developed an accurate, physically interpretable, and one-dimensional tolerance factor, t, that correctly predicts 92{\%} of compounds as perovskite or nonperovskite for an experimental dataset of 576 ABX$_3$ materials (X = O$^{2-}$, F$^{-}$, Cl$^{-}$, Br$^{-}$, I$^{-}$) using a novel data analytics approach based on SISSO (sure independence screening and sparsifying operator). t is shown to generalize outside the training set for 1034 experimentally realized single and double perovskites (91{\%} accuracy) and is applied to identify 23,314 new double perovskites (A$_2$BB$'$X$_6$) ranked by their probability of being stable as perovskite. This work guides experimentalists and theorists toward which perovskites are most likely to be successfully synthesized and demonstrates an approach to descriptor identification that can be extended to arbitrary applications beyond perovskite stability predictions.}, +archivePrefix = {arXiv}, +arxivId = {1801.07700}, +author = {Bartel, Christopher J. and Sutton, Christopher and Goldsmith, Bryan R. and Ouyang, Runhai and Musgrave, Charles B. and Ghiringhelli, Luca M. and Scheffler, Matthias}, +doi = {10.1126/sciadv.aav0693}, +eprint = {1801.07700}, +issn = {23752548}, +journal = {Sci. 
Adv.}, +month = {feb}, +number = {2}, +pmid = {30783625}, +publisher = {American Association for the Advancement of Science}, +title = {{New tolerance factor to predict the stability of perovskite oxides and halides}}, +volume = {5}, +year = {2019} +} +@article{Cao2020, +abstract = {Significant advances have been made in predicting new topological materials using high-throughput empirical descriptors or symmetry-based indicators. To date, these approaches have been applied to materials in existing databases, and are severely limited to systems with well-defined symmetries, leaving a much larger materials space unexplored. Using tetradymites as a prototypical class of examples, we uncover a two-dimensional descriptor by applying an artificial intelligence (AI)-based approach for fast and reliable identification of the topological characters of a drastically expanded range of materials, without prior determination of their specific symmetries and detailed band structures. By leveraging this descriptor that contains only the atomic number and electronegativity of the constituent species, we have readily scanned a huge number of alloys in the tetradymite family. Strikingly, nearly half of them are identified to be topological insulators, revealing a much larger territory of the topological materials world. The present work also attests to the increasingly important role of such AI-based approaches in modern materials discovery.}, +author = {Cao, Guohua and Ouyang, Runhai and Ghiringhelli, Luca M. and Scheffler, Matthias and Liu, Huijun and Carbogno, Christian and Zhang, Zhenyu}, +doi = {10.1103/PhysRevMaterials.4.034204}, +file = {:home/purcell/Documents/Mendeley Desktop/Cao et al. - Physical Review Materials - 2020.pdf:pdf}, +issn = {24759953}, +journal = {Phys. Rev. 
Mater.}, +keywords = {doi:10.1103/PhysRevMaterials.4.034204 url:https://}, +month = {mar}, +number = {3}, +pages = {034204}, +publisher = {American Physical Society}, +title = {{Artificial intelligence for high-throughput discovery of topological insulators: The example of alloyed tetradymites}}, +url = {https://journals.aps.org/prmaterials/abstract/10.1103/PhysRevMaterials.4.034204}, +volume = {4}, +year = {2020} +} @article{scikit-learn, title={Scikit-learn: Machine Learning in {P}ython}, author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. diff --git a/joss/paper.md b/joss/paper.md index 21c546f860d9ea143e1fae3149f0ab7bedd01ae4..272fd76f6ffa4c0a93654ef6b68bd2e0ffa74635 100644 --- a/joss/paper.md +++ b/joss/paper.md @@ -1,10 +1,11 @@ --- -title: 'FHI-vibes: _Ab Initio_ Vibrational Simulations' +title: 'SISSO++' tags: - - Python + - SISSO + - Symbolic Regression - Physics - - Phonons - - Transport + - C++ + - Python authors: - name: Thomas A. R. Purcell orcid: 0000-0003-4564-7206 @@ -18,31 +19,33 @@ authors: orcid: 0000-0001-5099-3029 affiliation: 1 affiliations: - - name: Fritz Haber Institute of the Max Planck Society, Berlin, Germany + - name: NOMAD Laboratory at the Fritz Haber Institute of the Max Planck Society and Humboldt University, Berlin, Germany index: 1 date: September 2021 bibliography: paper.bib --- # Summary -The sure-independence screening and sparsifying operator (SISSO) method [@Ouyang2017] is an algorithm belonging to the field of artificial intelligence. -As a symbolic regression technique, SISSO is used to identify low-dimensional, analytic functions, the so called descriptors, that best reproduce or classify a target data set. -In practice, SISSO first constructs a large and exhaustive feature space of billions of potential descriptors by taking in a set of user-provided primary features, i.e. 
the input features, and then iteratively applying a set of analytical unary and binary operators, e.g., addition, multiplication, exponentiation, and squaring. -From this exhaustive pool of candidate descriptors, the best one is identified by performing an $\ell_0$-regularization to find the best low-dimensional linear model of the features using the SISSO operator. +The sure-independence screening and sparsifying operator (SISSO) method [@Ouyang2017] is an algorithm belonging to the field of artificial intelligence and, more specifically, supervised machine learning. +As a symbolic-regression technique, SISSO is used to identify low-dimensional, analytic functions, the so-called descriptors, that best predict the labels of a target data set. +SISSO is introduced for both regression and classification tasks. +In practice, SISSO first constructs a large and exhaustive feature space of billions of potential descriptors by taking in a set of user-provided *primary features*, and then iteratively applying a set of unary and binary operators, e.g., addition, multiplication, exponentiation, and squaring. +From this exhaustive pool of candidate descriptors, the best ones are identified via sure-independence screening, from which the best low-dimensional linear models are found via an $\ell_0$ regularization. Because symbolic regression generates an interpretable equation, it has become an increasingly popular method across scientific disciplines [@Wang2019a, @Neumann2020, @Udrescu2020a]. -A particular advantage of these approaches are their capability to model complex phenomena using relatively simple features. -Because of this, SISSO has been used successfully in the past to model, explore, and predict important material properties, including: the stability of different phases [@Bartel2018a, @Schleder2020], the catalytic activity and reactivity [@Han2021, @Xu2020, @Andersen2021], and glass transition temperatures [@Pilania2019]. 
Beyond regression problems, SISSO has also been used successfully to classify materials into different crystal prototypes [@Ouyang2019] or to determine if a material is a topological insulator or not [@Cao2019]. +A particular advantage of these approaches is their capability to model complex phenomena using relatively simple descriptors. +Because of this, SISSO has been used successfully in the past to model, explore, and predict important material properties, including: the stability of different phases [@Bartel2018a; @Schleder2020], the catalytic activity and reactivity [@Han2021; @Xu2020; @Andersen2021], and glass transition temperatures [@Pilania2019]. +Beyond regression problems, SISSO has also been used successfully to classify materials into different crystal prototypes [@Ouyang2019], to predict whether a material crystallizes in its ground state as a perovskite [@Bartel2019a], and to determine whether a material is a topological insulator [@Cao2020]. The SISSO++ package is a modular and extensible C++ implementation of the SISSO method with python bindings. Specifically, SISSO++ applies this methodology for regression, log regression, and classification problems. Additionally the library include multiple python functions to facilitate the post-processing, analyzing, and visualizing the resulting models. # Statement of need -The main goal of the SISSO++ package is to provide a user-friendly, easily-extendable version of the SISSO method for the scientific community that can be used both on high-performance architectures for data production and on personal computing devices for analyzing and visualizing the results. +The main goal of the SISSO++ package is to provide a user-friendly, easily extendable version of the SISSO method for the scientific community that can be used both on high-performance architectures for data production and on personal computing devices for analyzing and visualizing the results. 
For this reason, all computational-intensive task are written in C++ and support parallelization via MPI and openMP. -Additionally, Python bindings allow to easily incorporate the methods in computational workflows and to easily postprocess results. -Furthermore, this can facilitate the future integration of SISSO in existing machine-learning frameworks, e.g. scikit-learn [@scikit-learn] +Additionally, the Python bindings allow one to easily incorporate the methods into computational workflows and postprocess results. +Furthermore, this can facilitate the future integration of SISSO into existing machine-learning frameworks, e.g., scikit-learn [@scikit-learn]. The code is designed in a modular fashion, which simplifies the process of extending the code for other applications. Finally the project's extensive documentation and tutorials provide a good access point for new-users of the method. @@ -51,11 +54,11 @@ The following features are implemented in SISSO++: - A C++ library for using SISSO to find analytical models for a given problem - - Python bindings to be able to interface with the C++ objects in a python environment + - Python bindings to be able to interface with the C++ objects in a Python environment - - Postprocessing tools for visualizing models and analyzing results using matplotlib + - Postprocessing tools for visualizing models and analyzing results using Matplotlib - - Access to solve an n-dimensional classification model using a combination of calculating the convex-hull overlap and a linear-SVM solver + - Access to solve an *n*-dimensional classification model using a combination of calculating the convex-hull overlap and a linear-SVM solver - Features with better defined non-linearaities of the models by automatically optimizing the scale and bias terms to all operations using non-linear optimization @@ -65,6 +68,6 @@ The following features are implemented in SISSO++: # Acknowledgements -The authors would like to thank Markus Rampp and Meisam 
Tabriz for technical support. We would also like to thank Lucas Foppa, Jingkai Quan, Aakash Naik, and Luigi Sabilio for testing and providing valuable feedback. T.P. would like to thank the Alexander von Humboldt Foundation for their support through the Alexander von Humboldt Postdoctoral Fellowship Program. This project was supported by TEC1p (the European Research Council (ERC) Horizon 2020 research and innovation programme, grant agreement No. 740233), BigMax (the Max Planck Society’s Research Network on Big-Data-Driven Materials-Science), and the NOMAD pillar of the FAIR-DI e.V. association. +The authors would like to thank Markus Rampp and Meisam Tabriz for technical support. We would also like to thank Lucas Foppa, Jingkai Quan, Aakash Naik, and Luigi Sbailò for testing and providing valuable feedback. T.P. would like to thank the Alexander von Humboldt Foundation for their support through the Alexander von Humboldt Postdoctoral Fellowship Program. This project was supported by TEC1p (the European Research Council (ERC) Horizon 2020 research and innovation programme, grant agreement No. 740233), BiGmax (the Max Planck Society’s Research Network on Big-Data-Driven Materials-Science), and the NOMAD pillar of the FAIR-DI e.V. association. # References