Skip to content

Instantly share code, notes, and snippets.

@CaptainAshis
Created June 21, 2021 10:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save CaptainAshis/d813b46fe69bb753b49c2a41057bb042 to your computer and use it in GitHub Desktop.
Save CaptainAshis/d813b46fe69bb753b49c2a41057bb042 to your computer and use it in GitHub Desktop.
Python code- Abzooba
#!/usr/bin/env python
# coding: utf-8
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve
import logging
import pickle
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import seaborn as sns
import pandas as pd
import numpy as np
import os
# Change the process working directory as soon as this file is executed, so
# every relative path used after this line is resolved starting from
# './abzooba' (itself relative to the directory the script was launched from).
os.chdir('./abzooba')
class Classification_Model():
    """Pipeline helper that prepares data and trains an 'Adherence' classifier.

    The methods chain into one another: ``train_test_split`` calls
    ``data_preprocessing``, which calls ``drop_columns``, which calls
    ``read_data``.  Each of them therefore takes the paths of the train and
    test CSV files rather than already-loaded frames.

    Attributes set by the methods:
        clf_and_params: dict mapping estimator instance -> search space.
        train_data / test_data: frames loaded by ``read_data``.
        X_train, X_test, y_train, y_test: produced by ``train_test_split``.
        Y_pred, Y_pred_prob, results, current_clf_name: set by ``model_train``.
    """

    def __init__(self):
        """Configure file logging ('data.log') and create empty containers."""
        logging.basicConfig(
            level=logging.DEBUG,
            filename='data.log',
            filemode='w',
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%d/%m/%Y %H:%M:%S')
        print("Classifciation model initialized")
        print()
        self.clf_and_params = dict()
        self.train_data = pd.DataFrame()
        self.test_data = pd.DataFrame()

    def read_data(self, train_data, test_data):
        """Load both CSV files and report the training columns.

        Args:
            train_data: path (or buffer) of the training CSV.
            test_data: path (or buffer) of the test CSV.

        Returns:
            Tuple ``(train_frame, test_frame)``; also stored on ``self``.

        Raises:
            Exception: whatever ``pandas.read_csv`` raises; it is logged with
                a traceback and re-raised instead of returning stale frames.
        """
        try:
            self.train_data = pd.read_csv(train_data)
            self.test_data = pd.read_csv(test_data)
        except Exception:
            logging.exception('Exception occured while reading data')
            raise
        print(f"Column names are {self.train_data.columns.tolist()} \n")
        print(f"# of Columns are {len(self.train_data.columns)}")
        # Explicit check instead of `assert` (asserts vanish under `python -O`).
        # A mismatch is only logged, never fatal, as before.
        # NOTE(review): drop_columns removes 'Adherence' from the training
        # frame only, which suggests train has one column MORE than test, so
        # this comparison may be reversed - confirm against the real data.
        if len(self.train_data.columns) + 1 != len(self.test_data.columns):
            logging.error('Exception occured while reading data')
        return self.train_data, self.test_data

    def exploratory_data(self, train_data, test_data):
        """Print dtypes, missing-value counts and summary stats of train data.

        Best-effort: any failure is logged (with traceback) and not raised.
        """
        try:
            self.train_data, self.test_data = self.read_data(
                train_data, test_data)
            df = pd.DataFrame(self.train_data.dtypes)
            df['Missing_values'] = self.train_data.isna().sum().values
            df.rename(columns={0: 'data types'}, inplace=True)
            print("=" * 60)
            print(df)
            print("=" * 60)
            print(
                " ========================== Data Stats ======================================")
            print(self.train_data.describe())
            print(
                "==============================================================================")
        except Exception:
            logging.exception('Exception occured in exploratory data format')

    def drop_columns(self, train_data, test_data):
        """Reload the data and strip the id / target columns.

        Returns:
            Tuple ``(train_features, test_features, train_original)`` where
            ``train_original`` is a copy taken before any column was dropped
            (it still holds 'patient_id' and 'Adherence').

        Raises:
            Exception: logged and re-raised; previously the method returned
                ``None`` and the caller failed on tuple unpacking.
        """
        try:
            train_data, test_data = self.read_data(train_data, test_data)
            train_data1 = train_data.copy()
            # In-place on purpose: self.train_data / self.test_data are the
            # same objects and are meant to lose these columns too.
            train_data.drop(['patient_id', 'Adherence'], axis=1, inplace=True)
            test_data.drop(['patient_id'], axis=1, inplace=True)
            print(
                f"Number of columns in training data after dropping columns {len(train_data.columns)}")
            return train_data, test_data, train_data1
        except Exception:
            logging.exception('Exception occured in dropping columns method')
            raise

    def visualisation(self, train_data, test_data):
        """Draw one bar chart of 'Adherence' counts per selected column.

        The columns are picked purely by position (indices 2..8 of the
        training frame, minus the second of those).
        """
        train_data, test_data = self.read_data(train_data, test_data)
        list_of_cols = train_data.columns.tolist()[2:9]
        list_of_cols.pop(1)
        for names in list_of_cols:
            print()
            plot = train_data.groupby([names, 'Adherence'])[
                'patient_id'].count().reset_index()
            plot.rename(columns={'patient_id': 'counts'}, inplace=True)
            sns.catplot(
                y='counts',
                x='Adherence',
                hue=names,
                data=plot,
                kind='bar')
            plt.title(f"Distribution of Adherence wrt to {names}")

    def data_preprocessing(self, train_data, test_data):
        """One-hot encode the features and keep those with VIF below 5.

        Returns:
            Tuple ``(selected_features, train_original)``.

        Raises:
            Exception: logged and re-raised; previously the trailing return
                hit unbound locals and surfaced as a misleading NameError.
        """
        try:
            train_data, test_data, train_data1 = self.drop_columns(
                train_data, test_data)
            df_train1 = pd.get_dummies(train_data, drop_first=True)
            print(df_train1.columns)
            correln_matrix = df_train1.corr()
            print(correln_matrix)
            # Calculating VIF.  Cast to float first: dummy columns may be
            # bool/uint8, which would otherwise yield a non-float matrix.
            design = df_train1.astype(float).values
            vif = pd.DataFrame()
            vif["variables"] = df_train1.columns
            vif["VIF"] = [
                variance_inflation_factor(design, i)
                for i in range(design.shape[1])]
            valid_variables = vif.loc[vif['VIF'] < 5]['variables'].tolist()
        except Exception:
            logging.exception('Exception occured in data_preprocessing')
            raise
        # .copy() so callers can add columns without a chained-assignment
        # warning on a slice of df_train1.
        return df_train1[valid_variables].copy(), train_data1

    def train_test_split(self, train_data, test_data):
        """Build features/target and split them 80/20.

        The target is 0 where 'Adherence' equals 'No' and 1 otherwise.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``; also stored on self.

        Raises:
            Exception: logged and re-raised.
        """
        try:
            df_train1, train_data1 = self.data_preprocessing(
                train_data, test_data)
            target = train_data1['Adherence'].apply(
                lambda x: 0 if x == 'No' else 1).rename('Adherence_new')
            # Bug fix: the target used to be appended to the feature frame
            # and then passed as X, leaking the label into the model inputs.
            # The bare name below is the module-level scikit-learn function,
            # not this method (methods are not in scope as plain names).
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
                df_train1, target, test_size=0.2)
        except Exception:
            logging.exception('Exception occured in train_test_split')
            raise
        return self.X_train, self.X_test, self.y_train, self.y_test

    def algorithm(self, clf):
        """Return ``(estimator, search_space)`` for the requested classifier.

        Args:
            clf: an estimator instance whose type selects the search space.
                If its type is not one of the registered ones (or it is
                ``None``) the RandomForest entry is returned, which is what
                this method always returned before.

        Raises:
            Exception: logged and re-raised.
        """
        try:
            candidates = [
                (KNeighborsClassifier(),
                 {'n_neighbors': [5, 7, 9, 11, 13, 15],
                  'leaf_size': [1, 2, 3, 5],
                  'weights': ['uniform', 'distance']}),
                (LogisticRegression(),
                 {'penalty': ['l1', 'l2'],
                  'C': np.logspace(0, 4, 10),
                  # The default solver rejects 'l1'; liblinear handles both.
                  'solver': ['liblinear']}),
                (SVC(),
                 [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                  {'C': [1, 10, 100, 1000],
                   'gamma': [0.001, 0.0001],
                   'kernel': ['rbf']}]),
                (DecisionTreeClassifier(),
                 {'max_features': ['auto', 'sqrt', 'log2'],
                  'min_samples_split': list(range(2, 16)),
                  'min_samples_leaf': [1],
                  'random_state': [123]}),
                (RandomForestClassifier(),
                 {'max_depth': [10, 15, 20],
                  'n_estimators': [2, 4],
                  'max_features': ['sqrt', 'auto']}),
            ]
            # Rebuilt on every call so repeated calls do not pile up fresh
            # estimator instances as extra keys.
            registry = {}
            chosen = None
            for estimator, grid in candidates:
                if chosen is None and clf is not None and isinstance(
                        clf, type(estimator)):
                    # Keep the caller's own (possibly configured) instance.
                    estimator = clf
                    chosen = estimator
                registry[estimator] = grid
            if chosen is None:
                chosen = candidates[-1][0]
            self.clf_and_params = registry
        except Exception:
            logging.exception('Exception occured in algorithm method')
            raise
        return chosen, self.clf_and_params[chosen]

    def model_train(self, clf, params):
        """Run a 5-fold randomized search, predict on the test split, report.

        Requires ``train_test_split`` to have populated the X/y attributes.
        Best-effort: failures are logged (with traceback) and not raised.
        """
        try:
            self.results = {}
            self.current_clf_name = clf.__class__.__name__
            search = RandomizedSearchCV(clf, params, cv=5)
            search.fit(self.X_train, self.y_train)
            self.Y_pred = search.predict(self.X_test)
            # Not every estimator exposes probabilities (e.g. SVC without
            # probability=True); do not abort the whole run because of that.
            if hasattr(search, 'predict_proba'):
                self.Y_pred_prob = search.predict_proba(self.X_test)
            else:
                self.Y_pred_prob = None
            clf_train_acc = round(
                search.score(self.X_train, self.y_train) * 100, 2)
            print(
                self.current_clf_name,
                " trained and used for prediction on test data...")
            self.results[self.current_clf_name] = clf_train_acc
            self.show_result()
            self.save_model(search)
            self.output_stats(search)
        except Exception:
            logging.exception('Exception occured in model train method')

    def show_result(self):
        """Print the training accuracy recorded for each trained model."""
        try:
            for clf_name, train_acc in self.results.items():
                print(
                    "{} train accuracy is {:.3f}".format(
                        clf_name, train_acc))
        except Exception:
            logging.exception('Exception occured in show result method')

    def save_model(self, grid_search_clf):
        """Pickle the fitted search object to '<name>_finalized_model.pickle'."""
        try:
            # save the model to disk
            filename = f'{self.current_clf_name}_finalized_model.pickle'
            with open(filename, 'wb') as handle:
                pickle.dump(grid_search_clf, handle)
            print(f"Pickling completed for {filename}")
        except Exception:
            logging.exception('Exception occured in save model method')

    def output_stats(self, grid_search_clf):
        """Print confusion matrix / report and draw the diagnostic plots."""
        try:
            print("=== Confusion Matrix ===")
            print(confusion_matrix(self.y_test, self.Y_pred))
            print('\n')
            print("=== Classification Report ===")
            print(classification_report(self.y_test, self.Y_pred))
            print()
            plot_confusion_matrix(grid_search_clf, self.X_test, self.y_test)
            plt.show()
            plot_precision_recall_curve(
                grid_search_clf, self.X_test, self.y_test)
            plot_roc_curve(grid_search_clf, self.X_test, self.y_test)
            # Without this the last two figures were created but never shown.
            plt.show()
        except Exception:
            logging.exception('Exception occured in output_stats method')
# Driver: read the dataset paths from the config file, then split the data,
# pick the RandomForest search space and train.
# NOTE: this path is relative to the working directory already set by the
# earlier os.chdir call in this file.
os.chdir('./abz')
filename = 'external.config'
# Context manager so the config file handle is closed deterministically.
with open(filename) as config_file:
    contents = config_file.read()
# SECURITY NOTE(review): eval() executes whatever code the config file holds.
# If the file only ever contains a literal dict, ast.literal_eval (or a JSON
# file) would be the safe replacement - left as-is to keep accepting any
# expression-style config that may already exist.
config = eval(contents)
train_data = config['train_data']
test_data = config['test_data']
cm = Classification_Model()
X_train, X_test, y_train, y_test = cm.train_test_split(train_data, test_data)
clf, params = cm.algorithm(RandomForestClassifier())
cm.model_train(clf, params)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment