In [1]:
#
# Data analysis for the diabetes exercise
# Exercise inspired by https://www.kaggle.com/code/annelieseneo/diabetes-prediction
#
import numpy as np
import pandas as pd

# Load the dataset and print a quick structural overview of it.
data = pd.read_csv('./diabetes_data.csv')
print(data.head())
print('Dataset based on [rows, columns]:', data.shape)
print(data.columns)
print()
print('General info:')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
data.info()

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
Dataset based on [rows, columns]: (768, 9)
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

General info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns

In [3]:
# Required libraries

#Installing seaborn library:
%pip install seaborn

import warnings

import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import pyplot
from sklearn.cluster import KMeans
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
warnings.filterwarnings('ignore')


In [None]:
#
# Values like Glucose, BloodPressure or BMI cannot be 0, so zeros in those
# columns have to be treated as missing measurements and fixed.

# Look for missing data: explicit nulls and zero placeholders.

# Only the last expression of a cell is displayed automatically, so the null
# counts are printed explicitly; as a bare expression they were discarded.
print(); print('Null values:')
print(data.isna().sum())
print(); print('Empty values:')
data.eq(0).sum()

In [None]:
#
# What to do with the missing data? Two main options:
# 1. Impute the missing values with a linear regression on the other columns
# 2. Replace the zeros with the column mean
#

In [None]:
# Missing Data Imputation Using Regression


def ImputeZeroValuesWithRegression(dataset):
    """Replace zero placeholders in selected columns with regression estimates.

    For each of BloodPressure, Glucose, Insulin, SkinThickness and BMI, a
    LinearRegression is fitted on the rows where the column is non-zero, using
    every other column of ``dataset`` as predictors, and its predictions
    overwrite the rows where the column equals 0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the five columns listed above. It is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, after imputation.

    Raises
    ------
    KeyError
        If one of the five columns is absent from ``dataset``.

    Notes
    -----
    Columns are processed in order, so values imputed for an earlier column
    serve as predictors for later ones, while zeros still present in
    not-yet-processed columns are used as-is.
    NOTE(review): every other column is a predictor, including any label
    column present in ``dataset``; confirm that is acceptable before the
    imputed features are used to train a classifier on that label.
    """
    columnsToBeImputed = ['BloodPressure', 'Glucose', 'Insulin', 'SkinThickness', 'BMI']
    for column in columnsToBeImputed:
        missing_mask = dataset[column] == 0

        # Nothing to impute, or no observed value to learn from: leave the
        # column untouched instead of fitting/predicting on an empty frame.
        if not missing_mask.any() or missing_mask.all():
            continue

        predictors = dataset.drop(column, axis=1)

        # Train only on rows with an observed value. Including the zero rows
        # would teach the model the very placeholder it is meant to replace.
        lr = LinearRegression()
        lr.fit(predictors[~missing_mask], dataset.loc[~missing_mask, column])
        predictions = lr.predict(predictors[missing_mask])

        # Predictions are floats; cast first so integer columns accept them
        # without relying on implicit upcasting during assignment.
        dataset[column] = dataset[column].astype(float)
        dataset.loc[missing_mask, column] = predictions

    return dataset

# Impute the zero placeholders in the columns where an empty value makes no
# sense, then list how many zeros remain in every column.
df = ImputeZeroValuesWithRegression(data)
print()
print('Empty values:')
df.eq(0).sum()


In [None]:
# Mean imputation: mark the zero placeholders as NaN, then fill every NaN with
# the mean of its column.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
zero_as_missing_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin",
                        "BMI", "DiabetesPedigreeFunction", "Age"]
data[zero_as_missing_cols] = data[zero_as_missing_cols].replace(0, np.nan)

# DataFrame.mean() skips NaN by default, so each gap is filled with the mean
# of the values that were actually observed in that column.
data = data.fillna(data.mean())

# Preview the table after filling the gaps with the column averages.
data.head()

In [None]:
# Box plot of every column to inspect the spread of the data.
data.boxplot(fontsize=20, figsize=(50, 25))

In [None]:
# Frequency of each age in the dataset (bars ordered by descending count,
# which is the order value_counts() returns).
age_counts = data['Age'].value_counts()
age_counts.plot(kind="bar", figsize=(40,20), fontsize=20)
# Labels and title use the same size as the tick labels so they stay readable
# on the large figure, matching the Pregnancies plot.
plt.xlabel("Age", size=20)
plt.ylabel("Frequency", size=20)
plt.title('Frequency of ages', size=20)
plt.show()


In [None]:
# Frequency of each number of pregnancies (bars ordered by descending count,
# which is the order value_counts() returns).
pregnancy_counts = data['Pregnancies'].value_counts()
pregnancy_counts.plot(kind="bar", figsize=(40,20), fontsize=20)
plt.xlabel("Pregnancies", size=20)
plt.ylabel("Frequency", size=20)
plt.title('Frequency of Pregnancies', size=20)
plt.show()


In [None]:
#
# Order of importance
#
# Rank the selected features by the importances of an extra-trees ensemble
# fitted on the whole dataset.
x=data[['Glucose', 'BMI', 'Age', 'Pregnancies', 'SkinThickness',
       'Insulin', 'DiabetesPedigreeFunction']]
# Select the target by name rather than by position, so the cell does not
# depend on the column order of the CSV.
y=data['Outcome']

# Fixed seed: the ensemble is randomized, so without it the ranking can change
# from one run to the next.
model = ExtraTreesClassifier(random_state=0)
model.fit(x,y)
print(model.feature_importances_)
# Plot the importances, largest first, for easier comparison.
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
feat_importances.nlargest(20).plot(kind='bar')
plt.show()

In [None]:
#
# Classic correlation
#
# Pairwise correlation between all columns: printed as a table, then drawn as
# an annotated heatmap. The matrix is computed once and reused for both.
corr_matrix = data.corr()
print(corr_matrix)
sns.heatmap(corr_matrix, annot=True)

In [None]:
##################################################
# Clustering data by means of k-means
##################################################

x=data[['Glucose', 'BMI', 'Age', 'Pregnancies', 'SkinThickness',
       'Insulin', 'DiabetesPedigreeFunction']]
y=data['Outcome']
# NOTE: the clustering below is fitted on the full feature matrix `x`; this
# split is not used in this cell and is only kept so the names stay defined.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.10,random_state=0)

#
# Identifying the number of valid clusters (elbow method)
#
inertia = []
K = range(1,10)
for k in K:
    # One fit per k is enough (the model used to be fitted twice), and a fixed
    # seed makes the elbow curve reproducible between runs.
    kmeanModel = KMeans(n_clusters=k, random_state=0).fit(x)
    inertia.append(kmeanModel.inertia_)

# Plot the elbow
plt.plot(K, inertia, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.show()

#
# Once the number of clusters is chosen (=3), the final model can be fitted
#

kmeans = KMeans(n_clusters=3, random_state=0).fit(x)
# y_kmeans = kmeans.predict(x) -> Used to assess predictions

# Attach the cluster label of each row as a new 'labels' column. assign() works
# by position, so it does not depend on `data` having a default 0..n-1 index.
labeled_data = data.assign(labels=kmeans.labels_)
print(); print('Labelled information')
print(labeled_data.head())

# Plotting data in clusters
sns.lmplot(x='Glucose',y='Insulin',data=labeled_data,hue='labels',fit_reg=False)
ax = plt.gca()
ax.set_title('Clusters Glucose vs. Insulin')

sns.lmplot(x='Insulin',y='Outcome',data=labeled_data,hue='labels',fit_reg=False)
ax = plt.gca()
ax.set_title('Clusters Insulin vs. Outcome')

sns.pairplot(labeled_data,hue='labels')



In [None]:
##################################################
## Classification Task: Logistic Regression
##################################################

# Feature matrix and target used by the classifiers.
x=data[['Glucose', 'BMI', 'Age', 'Pregnancies', 'SkinThickness',
       'Insulin', 'DiabetesPedigreeFunction']]
y=data['Outcome']

# Hold out 10% of the rows as a test set; the seed keeps the split reproducible.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.10,random_state=0)

# Fit the logistic regression on the training split.
log_reg = LogisticRegression(random_state=1, max_iter=1000)
log_reg.fit(x_train,y_train)
# Class predictions for the held-out rows.
y_pred=log_reg.predict(x_test)
# The score is computed on the held-out test split, so it is reported as test
# accuracy (it was previously mislabelled as training accuracy).
print("Logistic Regression Test Accuracy: ",log_reg.score(x_test,y_test))


In [None]:
##################################################
## Classification Task: Random Forest
##################################################

# Uses x_train/x_test/y_train/y_test as they are currently defined in the
# notebook session; this cell does not create the split itself.
forest=RandomForestClassifier(n_estimators=20,criterion="entropy",random_state=0)
forest.fit(x_train,y_train)

# Class predictions for the held-out rows.
y_pred=forest.predict(x_test)
# The score is computed on the held-out test split, so it is reported as test
# accuracy (it was previously mislabelled as training accuracy).
print("Random Forest Classifier Test Accuracy: ",forest.score(x_test,y_test))

In [None]:
##################################################
## Classification Task: KNN Classifier
##################################################

# Uses x_train/x_test/y_train/y_test as they are currently defined in the
# notebook session; this cell does not create the split itself.
KNN=KNeighborsClassifier(n_neighbors=1)
KNN.fit(x_train,y_train)

# Class predictions for the held-out rows.
y_pred=KNN.predict(x_test)

# Confusion matrix and per-class precision/recall on the test split.
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

# The score is computed on the held-out test split, so it is reported as test
# accuracy (it was previously mislabelled as training accuracy).
print("KNN classifier test accuracy: ",KNN.score(x_test,y_test))

In [None]:
##################################################
## Comparing performances
##################################################

# Refit the three classifiers on the training split (fit() returns the
# estimator itself, so model1..3 are the same objects as KNN/log_reg/forest).
model1=KNN.fit(x_train,y_train)
model2=log_reg.fit(x_train,y_train)
model3=forest.fit(x_train,y_train)

# Keep only the second predict_proba column (the probability of classes_[1]),
# which is the score passed to the ROC functions below.
KNN_predict=model1.predict_proba(x_test)[:,1]
log_reg_predict=model2.predict_proba(x_test)[:,1]
forest_predict=model3.predict_proba(x_test)[:,1]

# Area under the ROC curve of each model on the test split.
KNN_auc=roc_auc_score(y_test,KNN_predict)
log_reg_auc=roc_auc_score(y_test,log_reg_predict)
forest_auc=roc_auc_score(y_test,forest_predict)

print("KNN:ROC AUC=%.3f"%(KNN_auc))
print("log_reg:ROC AUC=%.3f"%(log_reg_auc))
print("forest:ROC AUC=%.3f"%(forest_auc))

# False/true positive rates at every score threshold (thresholds not needed).
KNN_fpr,KNN_tpr,_=roc_curve(y_test,KNN_predict)
log_reg_fpr,log_reg_tpr,_=roc_curve(y_test,log_reg_predict)
forest_fpr,forest_tpr,_=roc_curve(y_test,forest_predict)

# ROC curves of the three models, with the diagonal as a visual reference.
pyplot.plot(KNN_fpr,KNN_tpr,linestyle="--",label="KNN")
pyplot.plot(log_reg_fpr,log_reg_tpr,marker=".",label="log_reg")
pyplot.plot(forest_fpr,forest_tpr,marker=".",label="forest")
pyplot.plot([0,1],[0,1],linestyle=":",color="grey",label="chance")
pyplot.xlabel("False positive rate")
pyplot.ylabel("True positive rate")
pyplot.title("ROC curves on the test split")
pyplot.legend()
pyplot.show()
