Merge pull request #30 from ECRL/dev

Documentation Additions
ecrl · Apr 19, 2019 · 66dbe39 · 66dbe39
2 parents f92b742 + 6e0702a
commit 66dbe39
Show file tree

Hide file tree

Showing 7 changed files with 236 additions and 2 deletions.
diff --git a/docs/index.rst b/docs/index.rst
@@ -8,3 +8,4 @@ ECNet Documentation
    usage/installation
    usage/quickstart
    usage/tools
+   usage/examples
diff --git a/docs/tutorials/Getting Started/Getting Started with Predicting Fuel Properties.pdf b/docs/tutorials/Getting Started/Getting Started with Predicting Fuel Properties.pdf
diff --git a/docs/tutorials/Getting Started/scripts/create_parity_plot.py b/docs/tutorials/Getting Started/scripts/create_parity_plot.py
@@ -0,0 +1,47 @@
+from ecnet import Server
+from ecnet.utils.logging import logger
+from ecnet.tools.plotting import ParityPlot
+
+
+def main():
+    '''Plot predicted vs. experimental kinematic viscosity, save as PNG'''
+
+    logger.stream_level = 'info'
+    sv = Server(prj_file='kinetic_viscosity.prj')
+
+    # `train` series: learning set targets followed by validation targets
+    train_exp = list(sv._sets.learn_y) + list(sv._sets.valid_y)
+    train_pred = sv.use(dset='train')
+    train_errors = sv.errors('rmse', 'r2', dset='train')
+
+    test_exp = sv._sets.test_y
+    test_pred = sv.use(dset='test')
+    test_errors = sv.errors('rmse', 'r2', dset='test')
+
+    kv_plot = ParityPlot(
+        title='Predicted vs. Experimental Kinematic Viscosity',
+        x_label='Experimental KV',
+        y_label='Predicted KV'
+    )
+
+    # one data series per set, experimental values first
+    all_series = (
+        (train_exp, train_pred, 'Training Set', 'blue'),
+        (test_exp, test_pred, 'Test Set', 'red')
+    )
+    for exp_vals, pred_vals, label, hue in all_series:
+        kv_plot.add_series(exp_vals, pred_vals, name=label, color=hue)
+    # error bars use the test RMSE; remaining metrics are added as labels
+    kv_plot.add_error_bars(test_errors['rmse'], label='Test RMSE')
+    for text, value in (
+        ('Test R-Squared', test_errors['r2']),
+        ('Train RMSE', train_errors['rmse']),
+        ('Train R-Squared', train_errors['r2'])
+    ):
+        kv_plot._add_label(text, value)
+    kv_plot.save('../kv_parity_plot.png')
+
+
+if __name__ == '__main__':
+    # build the plot only when this file is executed as a script
+    main()
diff --git a/docs/usage/examples.md b/docs/usage/examples.md
@@ -0,0 +1,87 @@
+# Example Scripts
+
+## SMILES String Validation
+
+SMILES strings are the basis for QSPR descriptor generation, and therefore play an immense role in what neural networks learn (and how they correlate QSPR descriptors to given fuel properties). It is paramount that SMILES strings for molecules are correct to ensure neural networks learn from correct molecule representations.
+
+To validate SMILES strings for molecules stored in an ECNet-formatted database, we can use the script below to query PubChem using molecule names. The "validate_smiles" function accepts two arguments: the database you wish to validate and the filename of the resulting validated database. Note that QSPR descriptors in the resulting database do not reflect changes made to SMILES strings, and you will need to create a new database using our [database construction tool](https://ecnet.readthedocs.io/en/latest/usage/tools.html#database-creation) to generate new descriptors.
+
+```python
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Developed in 2019 by Travis Kessler <[email protected]>
+#
+# Example script for validating ECNet-formatted database SMILES strings
+#
+
+from ecnet.utils.data_utils import DataFrame
+from ecnet.tools.database import get_smiles
+from ecnet.utils.logging import logger
+
+
+def validate_smiles(db_name, new_db):
+    '''Validate SMILES strings in an ECNet-formatted database via PubChem
+
+    Args:
+        db_name (str): filename of the database to validate
+        new_db (str): filename of the resulting validated database
+    '''
+
+    # load the database
+    logger.log('info', 'Loading data from {}'.format(db_name))
+    df = DataFrame(db_name)
+
+    # find indices of the `Compound Name` and `SMILES` strings; stop if
+    #   either is missing, as an index of -1 would silently address the
+    #   last string of every data point
+    indices = []
+    for label in ('Compound Name', 'SMILES'):
+        indices.append(next(
+            (i for i, name in enumerate(df.string_names) if name == label),
+            -1
+        ))
+        if indices[-1] == -1:
+            logger.log(
+                'error', '`{}` string not found in database'.format(label)
+            )
+            return
+    name_idx, smiles_idx = indices
+
+    # check each molecule's SMILES, replace if incorrect
+    for pt in df.data_points:
+        smiles = get_smiles(pt.strings[name_idx])
+        if smiles == '':
+            logger.log('warn', '{} not found on PubChem'.format(
+                pt.strings[name_idx]
+            ))
+        elif smiles != pt.strings[smiles_idx]:
+            logger.log(
+                'crit',
+                'Incorrect SMILES for {}:\n\tDatabase SMILES: {}'
+                '\n\tPubChem SMILES: {}'.format(
+                    pt.strings[name_idx],
+                    pt.strings[smiles_idx],
+                    smiles
+                ))
+            pt.strings[smiles_idx] = smiles
+        else:
+            logger.log('info', 'Correct SMILES for {}'.format(
+                pt.strings[name_idx]
+            ))
+
+    # save the validated database
+    logger.log('info', 'Saving validated data to {}'.format(new_db))
+    df.save(new_db)
+
+
+if __name__ == '__main__':
+
+    # initialize logging (stream level `info`)
+    logger.stream_level = 'info'
+    # un-comment this for file logging
+    # logger.file_level = 'info'
+    # validate `unvalidated_db.csv`, saving the result as `validated_db.csv`
+    validate_smiles('unvalidated_db.csv', 'validated_db.csv')
+
+```
diff --git a/docs/usage/img/cn_parity_plot.png b/docs/usage/img/cn_parity_plot.png
diff --git a/docs/usage/quickstart.md b/docs/usage/quickstart.md
@@ -1,10 +1,14 @@
 # Quick Start
 
+A full tutorial and additional databases are provided ([tutorial](https://github.com/ECRL/ECNet/tree/dev/docs/tutorials/Getting%20Started), [databases](https://github.com/ECRL/ECNet/tree/master/databases)) to get you started. The following documentation is intended for users seeking additional information about ECNet's functionality.
+
+## Preface
+
 ECNet operates using a **Server** object that interfaces with data utility classes, error calculation functions, and neural network creation classes. The Server object handles importing data and model creation for your project, and serves the data to models. Configurable variables for neural networks, such as learning rate, number of neurons per hidden layer, activation functions for hidden/input/output layers, and number of training epochs are found in a **.yml** configuration file.
 
 ## Model configuration file
 
-For training, we apply the Adam optimization algorithm to feed-forward neural networks. Here is the default model configuration:
+For training, we apply the Adam optimization algorithm to feed-forward neural networks. Here is the default model configuration, outlining default values we pass to model creation and training functions:
 
 ```
 ---
@@ -24,7 +28,7 @@ output_activation: linear
 
 ## Using the Server object
 
-To get started, create a Python script to handle your task and copy an ECNet-formatted CSV database file to your working directory. The Server object will create a default configuration file if an existing one is not specified or found. Example scripts, configuration files, and databases are provided ([examples/config](https://github.com/ECRL/ECNet/tree/master/examples), [databases](https://github.com/ECRL/ECNet/tree/master/databases)).
+First, create a Python script to handle your task and copy an ECNet-formatted CSV database file to your working directory. The Server object will create a default configuration file if an existing one is not specified or found.
 
 Your first steps are importing the Server object, initializing the Server and importing some data:
 

diff --git a/docs/usage/tools.md b/docs/usage/tools.md
@@ -115,3 +115,98 @@ predict('smiles.txt', 'results.csv', 'my_project.prj', form='smiles')
 ```
 
 Both Open Babel and the Java JRE are required for conversions.
+
+## Constructing parity plots
+
+A common method for visualizing how well neural networks predict data is the parity plot. A parity plot shows how much predictions deviate from experimental values by plotting them alongside a 1:1 line (the closer a data point lies to this line, the more accurate its prediction).
+
+To create a parity plot, let's import the ParityPlot object:
+
+```python
+from ecnet.tools.plotting import ParityPlot
+```
+
+And initialize it:
+
+```python
+my_plot = ParityPlot()
+
+# The plot's title defaults to `Parity Plot`; let's change that:
+my_plot = ParityPlot(title='Cetane Number Parity Plot')
+
+# The plot's axes default to `Experimental Value` (x-axis) and `Predicted Value`
+#   (y-axis); we can change those too:
+my_plot = ParityPlot(
+    title='Cetane Number Parity Plot',
+    x_label='Experimental CN',
+    y_label='Predicted CN'
+)
+
+# The plot's font is Times New Roman by default; to use another font:
+my_plot = ParityPlot(
+    title='Cetane Number Parity Plot',
+    x_label='Experimental CN',
+    y_label='Predicted CN',
+    font='Calibri'
+)
+```
+
+Now that our plot is initialized, we can add data:
+
+```python
+my_plot.add_series(x_vals, y_vals)
+```
+
+Say, for example, we obtained results from ECNet's Server object using the "use" method; let's plot predicted vs. experimental for the test set:
+```python
+'''Let's assume you've trained your model using a Server, `sv`'''
+
+# Obtain predictions for data in the test set:
+predicted_data = sv.use(dset='test')
+experimental_data = sv._sets.test_y
+
+# Pass the test data's experimental values and its predicted values:
+my_plot.add_series(experimental_data, predicted_data)
+
+# We can also assign a name to the series, and change its color:
+my_plot.add_series(
+    experimental_data,
+    predicted_data,
+    name='Test Set',
+    color='red'
+)
+```
+
+Multiple data series can be added to your plot, allowing you to visualize different data sets together.
+
+If we want to visualize how well given data points perform with respect to an error metric, we can add error bars to the plot. These error bars are placed on the positive and negative side of the 1:1 parity line:
+
+```python
+'''Let's assume you've trained your model using a Server, `sv`'''
+
+# Obtain the test set's RMSE:
+errors = sv.errors('rmse', dset='test')
+
+# Add the error bars:
+my_plot.add_error_bars(errors['rmse'])
+
+# We can show the value of the error by supplying:
+my_plot.add_error_bars(errors['rmse'], label='Test Set RMSE')
+```
+
+Once the plot is complete, it can be saved:
+
+```python
+# Save the plot to `my_plot.png`:
+my_plot.save('my_plot.png')
+```
+
+Or, we can view it without saving:
+```python
+# View the plot in a pop-up window:
+my_plot.show()
+```
+
+Here is what plotting cetane number training and test data looks like:
+
+![](./img/cn_parity_plot.png)