
Preprocessing and pipelines

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [ ]:
# First, list the files available in the input directory.
from subprocess import check_output
print(check_output(["ls", "../input/"]).decode("utf8"))
In [ ]:
df = pd.read_csv('../input/automobile/auto.csv')
df.head()
In [ ]:
df.boxplot(column='mpg', by='origin', figsize=(10,10), fontsize=10);
In [ ]:
df.info()
In [ ]:
# Read 'gapminder.csv' into a DataFrame: df
df = pd.read_csv('../input/gapminder/gapminder.csv')

# Create a boxplot of life expectancy per region
df.boxplot('life', 'Region', rot=60, figsize=(5,5));
In [ ]:
df.head()

Creating dummy variables

In [ ]:
# Create dummy variables: df_region
df_region = pd.get_dummies(df)

# Print the columns of df_region
print(df_region.columns)

# Create dummy variables with drop_first=True: df_region2
df_region2 = pd.get_dummies(df, drop_first=True)

# Print the new columns of df_region2
print(df_region2.columns)
In [ ]:
df_region2.shape
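
pd.get_dummies encodes only the non-numeric columns, and drop_first=True drops the first level of each encoded column to avoid perfectly collinear dummies. A quick check (a minimal sketch using the two DataFrames created above) shows exactly which column was dropped:

In [ ]:
# The dropped dummy is the set difference between the two column indexes
print(set(df_region.columns) - set(df_region2.columns))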

Regression with categorical features

In [ ]:
y = df_region2.life.values
X = df_region2.drop('life', axis=1).values
In [ ]:
# Import necessary modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Instantiate a ridge regressor: ridge
# (normalize was deprecated and later removed in scikit-learn; on newer
# versions, scale the features with StandardScaler in a pipeline instead)
ridge = Ridge(alpha=.5, normalize=True)

# Perform 5-fold cross-validation: ridge_cv
ridge_cv = cross_val_score(ridge, X, y, cv=5)

# Print the cross-validated scores
print(ridge_cv)
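
cross_val_score returns one R² score per fold rather than a single number; averaging the folds gives one summary figure. A minimal follow-up:

In [ ]:
# Average the five per-fold R^2 scores into one cross-validated estimate
print(np.mean(ridge_cv))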

Dropping missing data

In [ ]:
# Read the CSV file into a DataFrame: df
df = pd.read_csv('../input/house-votes-non-index/house-votes-non-index.csv')
df.head()
In [ ]:
# Convert '?' to NaN
df[df == '?'] = np.nan

# Print the number of NaNs
print(df.isnull().sum())

# Print shape of original DataFrame
print("Shape of Original DataFrame: {}".format(df.shape))

# Drop missing values and print shape of new DataFrame
df = df.dropna(axis=0)

# Print shape of new DataFrame
print("Shape of DataFrame After Dropping All Rows with Missing Values: {}".format(df.shape))

When a dataset has many missing values, dropping every affected row can throw away valuable information along with the missing data.
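
A quick way to judge whether dropping is safe is to look at the fraction of missing values per column; here is a minimal sketch (it assumes it is run before the dropna cell above, while the NaNs are still present):

In [ ]:
# Fraction of missing values per column: high fractions argue for
# imputation rather than dropping (run before the dropna above)
print(df.isnull().mean().sort_values(ascending=False))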

In [ ]:
df.shape

Imputing missing data in an ML Pipeline I

In [ ]:
# Import the Imputer module (removed in scikit-learn 0.22; on newer
# versions, use the commented SimpleImputer import instead)
from sklearn.preprocessing import Imputer
#from sklearn.impute import SimpleImputer as Imputer
from sklearn.svm import SVC

# Setup the imputation transformer: imp
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
# With SimpleImputer, drop axis and pass np.nan instead of the 'NaN' string:
#imp = Imputer(missing_values=np.nan, strategy='most_frequent')

# Instantiate the SVC classifier: clf
clf = SVC()

# Setup the pipeline with the required steps: steps
steps = [('imputation', imp),
        ('SVM', clf)]

Imputing missing data in an ML Pipeline II

Practice this for yourself now and generate a classification report of your predictions.

In [ ]:
y = df.party

X = df.drop('party', axis=1)
In [ ]:
X.shape
In [ ]:
df.info()
In [ ]:
# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y.values, test_size=.3, random_state=42)

# Fit the pipeline to the train set
pipeline.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = pipeline.predict(X_test)

# Compute metrics
print(classification_report(y_test, y_pred))

Your pipeline has performed imputation as well as classification!
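
Once fitted, the pipeline's steps remain accessible through named_steps, so you can inspect what the imputer actually learned. A minimal sketch (statistics_ holds the per-column fill values in scikit-learn's imputers):

In [ ]:
# Inspect the fitted imputation step: one learned fill value per feature
print(pipeline.named_steps['imputation'].statistics_)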

Centering and scaling your data

In [ ]:
w = pd.read_csv('../input/white-wine/white-wine.csv')
w.head()
In [ ]:
X = w.drop('quality', axis=1).values
In [ ]:
X.shape
In [ ]:
# Import scale
from sklearn.preprocessing import scale

# Scale the features: X_scaled
X_scaled = scale(X)

# Print the mean and standard deviation of the unscaled features
print("Mean of Unscaled Features: {}".format(np.mean(X))) 
print("Standard Deviation of Unscaled Features: {}".format(np.std(X)))

# Print the mean and standard deviation of the scaled features
print("Mean of Scaled Features: {}".format(np.mean(X_scaled))) 
print("Standard Deviation of Scaled Features: {}".format(np.std(X_scaled)))

Centering and scaling in a pipeline

In [ ]:
# Binarize the target: True where quality is below 6
y = w.quality < 6
In [ ]:
y.shape
In [ ]:
y[:20]
In [ ]:
# Import the necessary modules
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

# Setup the pipeline steps: steps
steps = [('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier())]
        
# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

# Fit the pipeline to the training set: knn_scaled
knn_scaled = pipeline.fit(X_train, y_train)

# Instantiate and fit a k-NN classifier to the unscaled data
knn_unscaled = KNeighborsClassifier().fit(X_train, y_train)

# Compute and print metrics
print('Accuracy with Scaling: {}'.format(pipeline.score(X_test, y_test)))
print('Accuracy without Scaling: {}'.format(knn_unscaled.score(X_test, y_test)))
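
A single train/test split can flatter either model, so it is worth cross-validating the whole pipeline as well; because the scaler sits inside the pipeline, it is refit on each training fold and no information leaks from the held-out fold. A minimal sketch reusing cross_val_score from earlier:

In [ ]:
# Cross-validate the scaled pipeline; the scaler is refit inside each fold
print(cross_val_score(pipeline, X, y, cv=5))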

Bringing it all together I: Pipeline for classification

In [ ]:
from sklearn.model_selection import GridSearchCV

# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('SVM', SVC())]

pipeline = Pipeline(steps)

# Specify the hyperparameter space
parameters = {'SVM__C':[1, 10, 100],
              'SVM__gamma':[0.1, 0.01]}

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=21)

# Instantiate the GridSearchCV object: cv
cv = GridSearchCV(pipeline, parameters, cv=3)

# Fit to the training set
cv.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = cv.predict(X_test)

# Compute and print metrics
print("Accuracy: {}".format(cv.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
print("Tuned Model Parameters: {}".format(cv.best_params_))
In [ ]:
from sklearn.metrics import recall_score

recall_score(y_test, y_pred)
In [ ]:
y_pred[:10]
In [ ]:
y_test[:10].values
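
A confusion matrix complements the recall score by showing exactly where the misclassifications fall. A minimal sketch using sklearn.metrics.confusion_matrix:

In [ ]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes
print(confusion_matrix(y_test, y_pred))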

Bringing it all together II: Pipeline for regression

In [ ]:
df = pd.read_csv('../input/gapminder/gapminder.csv')

# Create arrays for features and target variable
y = df.life
X = df.drop(['life', 'Region'], axis=1)
In [ ]:
df.head()
In [ ]:
X.shape
In [ ]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
In [ ]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Setup the pipeline steps: steps
# (max_iter is raised from the default so ElasticNet converges on this data;
# as above, swap Imputer for SimpleImputer on newer scikit-learn)
steps = [('imputation', Imputer(missing_values='NaN', strategy='mean', axis=0)),
         ('scaler', StandardScaler()),
         ('elasticnet', ElasticNet(max_iter=10000))]

# Create the pipeline: pipeline 
pipeline = Pipeline(steps)

# Specify the hyperparameter space
parameters = {'elasticnet__l1_ratio':np.linspace(0,1,30)}

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)

# Create the GridSearchCV object: gm_cv
gm_cv = GridSearchCV(pipeline, parameters, cv=3)

# Fit to the training set
gm_cv.fit(X_train,y_train)

# Compute and print the metrics
r2 = gm_cv.score(X_test, y_test)
print("Tuned ElasticNet Alpha: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))