Merge branch 'release-0.3a1'

DistrictDataLabs · Oct 9, 2016 · b3e4c8a · b3e4c8a
2 parents 056067c + 0c8aef4
commit b3e4c8a
Show file tree

Hide file tree

Showing 45 changed files with 5,346 additions and 1,175 deletions.
diff --git a/.gitignore b/.gitignore
@@ -61,9 +61,10 @@ target/
 #Ipython Notebook
 .ipynb_checkpoints
 
-# Annoying stuff from contributors
+# Making sure the team plays well together
 venv
 .DS_Store
+spad.py 
 
 # Created by https://www.gitignore.io/api/pycharm
 

diff --git a/Makefile b/Makefile
@@ -13,6 +13,7 @@ PYTHON_BIN := $(VIRTUAL_ENV)/bin
 # Clean build files
 clean:
 	find . -name "*.pyc" -print0 | xargs -0 rm -rf
+	find . -name "__pycache__" -print0 | xargs -0 rm -rf
 	-rm -rf htmlcov
 	-rm -rf .coverage
 	-rm -rf build

diff --git a/README.md b/README.md
@@ -17,30 +17,66 @@ Image by [Quatro Cinco](https://flic.kr/p/2Yj9mj), used with permission, Flickr
 Yellowbrick is a suite of visual analysis and diagnostic tools to facilitate feature selection, model selection, and parameter tuning for machine learning. All visualizations are generated in Matplotlib. Custom `yellowbrick` visualization tools include:
 
 ## Tools for feature analysis and selection
-- boxplots (box-and-whisker plots)    
-- violinplots    
-- histograms    
-- scatter plot matrices (sploms)    
-- radial visualizations (radviz)    
-- parallel coordinates    
-- jointplots    
-- diagonal correlation matrix    
+ - Boxplots (box-and-whisker plots)    
+ - Violinplots    
+ - Histograms    
+ - Scatter plot matrices (sploms)    
+ - Radial visualizations (radviz)    
+ - Parallel coordinates    
+ - Jointplots    
+ - Rank 1D    
+ - Rank 2D        
 
 ## Tools for model evaluation
 ### Classification
-- ROC curves    
-- classification heatmaps    
+ - ROC-AUC curves    
+ - Classification heatmaps
+ - Class balance chart     
 
 ### Regression
-- prediction error plots     
-- residual plots     
+ - Prediction error plots     
+ - Residual plots     
+ - Most informative features    
+
+### Clustering
+ - Silhouettes  
+ - Density measures     
 
 ## Tools for parameter tuning
-- validation curves    
-- gridsearch heatmap    
+ - Validation curves    
+ - Gridsearch heatmaps    
 
 ## Using Yellowbrick
-For information on getting started with Yellowbrick, check out our [quick start guide](https://github.com/DistrictDataLabs/yellowbrick/blob/develop/docs/setup.md).
+The Yellowbrick API is specifically designed to play nicely with Scikit-Learn. Here is an example of a typical workflow sequence with Scikit-Learn and Yellowbrick:
+
+### Feature Visualization
+In this example, we see how Rank2D performs pairwise comparisons of each feature in the data set with a specific metric or algorithm, then returns them ranked as a lower left triangle diagram.
+```python
+from yellowbrick.features import Rank2D
+
+visualizer = Rank2D(features=features, algorithm='covariance')
+visualizer.fit(X, y)                # Fit the data to the visualizer
+visualizer.transform(X)             # Transform the data
+visualizer.poof()                   # Draw/show/poof the data
+```
+
+### Model Visualization
+In this example, we instantiate a Scikit-Learn classifier, and then we use Yellowbrick's ROCAUC class to visualize the tradeoff between the classifier's sensitivity and specificity.
+```python
+from sklearn.svm import LinearSVC
+from yellowbrick import ROCAUC
+
+model = LinearSVC()
+model.fit(X,y)
+y_pred = model.predict(X)
+visualizer = ROCAUC(model)
+visualizer.score(y,y_pred)
+visualizer.poof()
+```
+
+For additional information on getting started with Yellowbrick, check out our [examples notebook](https://github.com/DistrictDataLabs/yellowbrick/blob/develop/examples/examples.ipynb).
+
+We also have a [quick start guide](https://github.com/DistrictDataLabs/yellowbrick/blob/master/docs/setup.rst).
 
 ## Contributing to Yellowbrick
 
@@ -65,5 +101,5 @@ This repository is set up in a typical production/release/development cycle as d
     ~$ git branch -d feature-myfeature
     ~$ git push origin develop
     ```
-       
+
 4. Repeat. Releases will be routinely pushed into master via release branches, then deployed to the server.
diff --git a/docs/api/yellowbrick_classhierarchy.png b/docs/api/yellowbrick_classhierarchy.png
diff --git a/examples/examples.ipynb b/examples/examples.ipynb
diff --git a/examples/examples.py b/examples/examples.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+# Ben's scratchpad for testing
+
+## Imports
+import os
+import pandas as pd
+import yellowbrick as yb
+import matplotlib.pyplot as plt
+
+from pandas.tools.plotting import radviz, parallel_coordinates
+from yellowbrick.features import ParallelCoordinates, RadViz, Rank2D
+
+## Module Constants - the path to the test data sets
+FIXTURES = os.path.join(os.path.dirname(__file__), "examples", "data")
+
+## Dataset loading mechanisms
+datasets = {
+    "credit": os.path.join(FIXTURES, "credit.xls"),
+    "concrete": os.path.join(FIXTURES, "concrete.xls"),
+    "occupancy": os.path.join(FIXTURES, 'occupancy', 'datatraining.txt'),
+}
+
+## Human readable column names
+columns  = {
+    "credit": [
+        'id', 'limit', 'sex', 'edu', 'married', 'age', 'apr_delay', 'may_delay',
+        'jun_delay', 'jul_delay', 'aug_delay', 'sep_delay', 'apr_bill', 'may_bill',
+        'jun_bill', 'jul_bill', 'aug_bill', 'sep_bill', 'apr_pay', 'may_pay', 'jun_pay',
+        'jul_pay', 'aug_pay', 'sep_pay', 'default'
+    ],
+    "concrete": [
+        'cement', 'slag', 'ash', 'water', 'splast',
+        'coarse', 'fine', 'age', 'strength'
+    ],
+    "occupancy": [
+        'date', 'temp', 'humid', 'light', 'co2', 'hratio', 'occupied'
+    ],
+}
+
+
+def load_data(name):
+    """
+    Loads and wrangles the passed-in dataset.
+    """
+
+    path = datasets[name]
+    data = {
+        'credit': lambda p: pd.read_excel(p, header=1),
+        'concrete': lambda p: pd.read_excel(p),
+        'occupancy': lambda p: pd.read_csv(p),
+    }[name](path)
+
+    data.columns = columns[name]
+    return data
+
+
+def test_parallel_coords(pandas=False, outpath=None):
+    """
+    Runs the parallel coordinates visualizer on the dataset.
+
+    Parameters
+    ----------
+    pandas : bool
+        Run the pandas version of the function
+    outpath : path or None
+        Where to save the figure on disk; if None, the figure is shown instead
+    """
+    data = load_data('occupancy')       # Load the data
+    features = ['temp', 'humid', 'light', 'co2', 'hratio']
+    classes = ['unoccupied', 'occupied']
+    X = data[features].as_matrix()
+    y = data.occupied.as_matrix()
+
+    if pandas:
+        parallel_coordinates(data[features + ['occupied']], 'occupied')
+        if outpath:
+            plt.savefig(outpath)
+        else:
+            plt.show()
+
+    else:
+        visualizer = ParallelCoordinates(   # Instantiate the visualizer
+            classes=classes, features=features
+        )
+        visualizer.fit(X, y)                # Fit the data to the visualizer
+        visualizer.transform(X)             # Transform the data
+        visualizer.poof(outpath=outpath)    # Draw/show/poof the data
+
+
+def test_radviz(pandas=False, outpath=None):
+    """
+    Runs the radviz visualizer on the dataset.
+
+    Parameters
+    ----------
+    pandas : bool
+        Run the pandas version of the function
+    outpath : path or None
+        Where to save the figure on disk; if None, the figure is shown instead
+    """
+    data = load_data('occupancy')       # Load the data
+    features = ['temp', 'humid', 'light', 'co2', 'hratio']
+    classes = ['unoccupied', 'occupied']
+    X = data[features].as_matrix()
+    y = data.occupied.as_matrix()
+
+    if pandas:
+        radviz(data[features + ['occupied']], 'occupied')
+        if outpath:
+            plt.savefig(outpath)
+        else:
+            plt.show()
+
+    else:
+        visualizer = RadViz(   # Instantiate the visualizer
+            classes=classes, features=features
+        )
+        visualizer.fit(X, y)                # Fit the data to the visualizer
+        visualizer.transform(X)             # Transform the data
+        visualizer.poof(outpath=outpath)    # Draw/show/poof the data
+
+
+def test_rank2d(seaborn=False, outpath=None):
+    """
+    Runs the Rank2D visualizer on the dataset.
+
+    Parameters
+    ----------
+    seaborn : bool
+        Run the seaborn version of the function (not yet implemented)
+    outpath : path or None
+        Where to save the figure on disk; if None, the figure is shown instead
+    """
+    data = load_data('occupancy')       # Load the data
+    features = ['temp', 'humid', 'light', 'co2', 'hratio']
+    classes = ['unoccupied', 'occupied']
+    X = data[features].as_matrix()
+    y = data.occupied.as_matrix()
+
+    if seaborn:
+        raise NotImplementedError("Not yet!")
+
+    else:
+        visualizer = Rank2D(features=features, algorithm='covariance')
+        visualizer.fit(X, y)                # Fit the data to the visualizer
+        visualizer.transform(X)             # Transform the data
+        visualizer.poof(outpath=outpath)    # Draw/show/poof the data
+
+
+if __name__ == '__main__':
+    # test_parallel_coords(pandas=True)
+    # test_radviz(pandas=False, outpath='/Users/benjamin/Desktop/yb_radviz.png')
+    test_rank2d(outpath='/Users/benjamin/Desktop/yb_rank2d_covariance.png')