Binary Search Multiple Variables

Python 3.6.4
15 November, 2019
[2]
%load_ext autoreload
%autoreload 1

import pandas as pd
import os
import sys

scriptpath = "binarygridsearch.py"
# sys.path entries must be directories (or archives), not module files, so
# register the folder that contains the script rather than the script itself.
sys.path.append(os.path.dirname(os.path.abspath(scriptpath)))

# Do the import
import binarygridsearch as bgs
%aimport binarygridsearch
[11]
from scipy.stats import uniform, randint
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
[4]
def getForestAccuracy(X, y, metric, kwargs):
    """Score a random forest on its own out-of-bag (OOB) predictions.

    Despite the name, the returned value is whatever ``metric`` computes
    (e.g. ROC AUC), not necessarily accuracy.

    Args:
        X: Training features, passed straight to ``RandomForestClassifier.fit``.
        y: Training labels. Only column 1 of the OOB decision function is
            scored, so a binary target is assumed.
        metric: Callable invoked as ``metric(y_true, y_score)``.
        kwargs: Dict of ``RandomForestClassifier`` constructor arguments. It is
            not modified; ``oob_score`` is forced on in a copy because the OOB
            decision function only exists when it is enabled.

    Returns:
        The value returned by ``metric``.
    """
    import numpy as np

    clf = RandomForestClassifier(**{**kwargs, "oob_score": True})
    clf.fit(X, y)
    y_pred = clf.oob_decision_function_[:, 1]

    # With few trees a sample may never be out-of-bag; scikit-learn then
    # reports NaN for it, which most metrics reject. Score only the rows
    # that actually received an OOB prediction.
    has_oob = ~np.isnan(y_pred)
    if not has_oob.all():
        return metric(np.asarray(y)[has_oob], y_pred[has_oob])
    return metric(y, y_pred)
[5]
from sklearn.datasets import load_breast_cancer

# Breast-cancer dataset bundled with scikit-learn: feature matrix and labels.
data = load_breast_cancer()
X = data.data
y = data.target
[6]
# Fixed RandomForestClassifier settings shared by every search in this notebook.
# oob_score=True makes the fitted forest expose oob_decision_function_, which
# getForestAccuracy scores.
rfArgs = dict(
    random_state=0,
    n_jobs=-1,
    class_weight="balanced",
    n_estimators=18,
    oob_score=True,
)
[8]
# One row per hyperparameter to search: [name, <second field>, lower, upper].
# NOTE(review): the second field looks like a precision setting (0 for the
# integer max_depth, 2 for the fractional split size) -- confirm against
# binarygridsearch.
hyperparameters = [
    ["max_depth", 0, 1, 32],
    ["min_samples_split", 2, 0.01, 0.1],
]
[9]
%%time
# Search the ranges in `hyperparameters`; each candidate forest is built from
# rfArgs and scored by getForestAccuracy with roc_auc_score as the metric.
dct = bgs.binarySearchParamsParallel(
    X, y, getForestAccuracy, rfArgs, roc_auc_score, hyperparameters
)

# Chosen hyperparameter values, followed by the score they obtained.
print(dct['values'])
print(dct['score'])
{'max_depth': 32, 'min_samples_split': 0.1} 0.9837019713545796 CPU times: user 97.1 ms, sys: 48.3 ms, total: 145 ms Wall time: 1 s
[14]
# Display the search result's "n_iterations" entry. The recorded value (10)
# equals the n_iter budget given to RandomizedSearchCV in this notebook.
dct["n_iterations"]
10
[12]
# Same search space as the binary search: max_depth in 1..32 and
# min_samples_split in [0.01, 0.1].
# scipy's randint excludes its upper bound, so 33 is required for 32 to be
# reachable; uniform(loc, scale) covers [loc, loc + scale].
param_dist = {
    "max_depth": randint(1, 33),
    "min_samples_split": uniform(loc=0.01, scale=0.09),
}
[15]
%%time
# Baseline for comparison: random search over param_dist, scored by 3-fold
# cross-validated ROC AUC, with the same forest settings (rfArgs).
RF = RandomForestClassifier(**rfArgs)
clf = RandomizedSearchCV(
    estimator=RF,
    param_distributions=param_dist,
    n_iter=10,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=0,
    random_state=0,
)

# fit() returns the search object itself, so best_model is the same object as clf.
best_model = clf.fit(X, y)
print(best_model.best_params_)
print(best_model.best_score_)
{'max_depth': 20, 'min_samples_split': 0.06813047017599905} 0.9893966403611371 CPU times: user 267 ms, sys: 42.3 ms, total: 309 ms Wall time: 3.2 s
[20]
# Variant of the two-parameter search: min_samples_split now uses 3 as its
# second field and starts at 0.03 instead of 0.01.
# Row layout: [name, <second field>, lower, upper].
hyperparameters = [
    ["max_depth", 0, 1, 32],
    ["min_samples_split", 3, 0.03, 0.1],
]

[21]
%%time
# Re-run the search with the current `hyperparameters`; forests are built from
# rfArgs and scored by getForestAccuracy with roc_auc_score as the metric.
dct = bgs.binarySearchParamsParallel(
    X, y, getForestAccuracy, rfArgs, roc_auc_score, hyperparameters
)

# Chosen hyperparameter values, followed by the score they obtained.
print(dct['values'])
print(dct['score'])
{'max_depth': 32, 'min_samples_split': 0.1} 0.9837019713545796 CPU times: user 84.8 ms, sys: 54.9 ms, total: 140 ms Wall time: 1.4 s
[22]
# Three-parameter search: a wider max_depth range (1..100) plus the split and
# leaf size fractions. Row layout: [name, <second field>, lower, upper].
hyperparameters = [
    ['max_depth', 0, 1, 100],
    ["min_samples_split", 2, 0.01, 0.1],
    ["min_samples_leaf", 2, 0.01, 0.1],
]
[23]
# Run the search over the three-parameter space; forests are built from rfArgs
# and scored by getForestAccuracy with roc_auc_score as the metric.
dct = bgs.binarySearchParamsParallel(
    X, y, getForestAccuracy, rfArgs, roc_auc_score, hyperparameters
)

# Chosen hyperparameter values, followed by the score they obtained.
print(dct['values'])
print(dct['score'])
{'max_depth': 100, 'min_samples_split': 0.1, 'min_samples_leaf': 0.030000000000000002} 0.9860208234237091
[24]
# Same three-parameter space, with 3 instead of 2 as the second field of the
# fractional parameters. Row layout: [name, <second field>, lower, upper].
hyperparameters = [
    ['max_depth', 0, 1, 100],
    ["min_samples_split", 3, 0.01, 0.1],
    ["min_samples_leaf", 3, 0.01, 0.1],
]

# Run the search with the current `hyperparameters`; forests are built from
# rfArgs and scored by getForestAccuracy with roc_auc_score as the metric.
dct = bgs.binarySearchParamsParallel(
    X, y, getForestAccuracy, rfArgs, roc_auc_score, hyperparameters
)

# Chosen hyperparameter values, followed by the score they obtained.
print(dct['values'])
print(dct['score'])
{'max_depth': 100, 'min_samples_split': 0.1, 'min_samples_leaf': 0.01} 0.9853535753924211
[ ]