# ============================================================
# Data analysis from diabetes exercise
# Exercise inspired by https://www.kaggle.com/code/annelieseneo/diabetes-prediction
# ============================================================
import numpy as np
import pandas as pd

# Load the dataset (expects the CSV next to the notebook).
data = pd.read_csv('./diabetes_data.csv')

print(data.head())
print('Dataset based on [rows, columns]:', data.shape)
print(data.columns)

print()
print('General info:')
# BUG FIX: the original `print(data.info())` printed a stray "None"
# (visible in the saved cell output) because DataFrame.info() writes
# its report to stdout itself and returns None. Call it directly.
data.info()
# ------------------------------------------------------------
# Required libraries
# ------------------------------------------------------------
# NOTE(review): the original cell ran `%pip install seaborn` inline in
# the import cell. Installation belongs in its own top cell, version
# pinned, e.g. `%pip install -q seaborn`. Run that once before this
# cell if seaborn is missing from the kernel environment.

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
# `pyplot` and `plt` are the same module under two names; both aliases
# are kept because later cells reference both.
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.ensemble import ExtraTreesClassifier
import scipy.stats

import warnings
warnings.filterwarnings('ignore')  # silence library deprecation noise

# ------------------------------------------------------------
# Missing / impossible values
# ------------------------------------------------------------
# Values like Glucose, BloodPressure or BMI cannot physically be 0,
# so zeros in those columns are really missing data to be repaired.
print()
print('Null values:')
# BUG FIX: the original cell evaluated `data.isna().sum()` in the
# middle of the cell, so its result was silently discarded (Jupyter
# only displays the last expression). Print both summaries explicitly.
print(data.isna().sum())

print()
print('Empty values:')
print(data.eq(0).sum())
# ------------------------------------------------------------
# What to do with missing data? 2 main options:
#   1. Interpolate missing data with linear regression
#   2. Replace 0 by the mean
# NOTE(review): the notebook applies BOTH strategies in sequence —
# after the regression imputation below no zeros remain, so the
# mean-fill step becomes a no-op. Keep only one strategy per run.
# ------------------------------------------------------------


def ImputeZeroValuesWithRegression(dataset):
    """Impute impossible zero values using per-column linear regression.

    For each column in which 0 encodes "missing", fit a LinearRegression
    on the rows whose value is known (non-zero) and predict the missing
    entries from the remaining columns.

    Mutates `dataset` in place and also returns it.
    """
    columnsToBeImputed = ['BloodPressure', 'Glucose', 'Insulin',
                          'SkinThickness', 'BMI']
    for column in columnsToBeImputed:
        missing_mask = dataset[column] == 0
        if not missing_mask.any():
            continue  # nothing to impute for this column

        # BUG FIX: the original fitted the regression on ALL rows,
        # including the very rows whose target value is the placeholder
        # 0 — teaching the model that 0 is a valid value and biasing
        # predictions toward it. Train only on rows with a known value.
        train_df = dataset[~missing_mask]

        y_train = train_df[column]
        x_train = train_df.drop(column, axis=1)
        x_missing = dataset.loc[missing_mask].drop(column, axis=1)

        lr = LinearRegression()
        lr.fit(x_train, y_train)
        dataset.loc[missing_mask, column] = lr.predict(x_missing)

    return dataset


# Interpolating blank values for columns that should never be zero.
df = ImputeZeroValuesWithRegression(dataset=data)
print()
print('Empty values:')
print(df.eq(0).sum())

# ------------------------------------------------------------
# Alternative: replace zeros with the column mean.
# ------------------------------------------------------------
# First mark the impossible zeros as NaN ...
cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin",
        "BMI", "DiabetesPedigreeFunction", "Age"]
# BUG FIX: `np.NaN` was removed in NumPy 2.0 — use `np.nan`.
data[cols] = data[cols].replace(0, np.nan)

# ... then fill the missing values with each column's mean.
data.fillna(data.mean(), inplace=True)
data.head()

# ------------------------------------------------------------
# Looking at the data distribution
# ------------------------------------------------------------
data.boxplot(figsize=(50, 25), fontsize=20)
plt.show()
# ------------------------------------------------------------
# Looking at the Age data
# ------------------------------------------------------------
age_counts = data['Age'].value_counts()
# TYPO FIX: 'Frequnce' -> 'Frequency' in the chart title.
age_counts.plot(kind="bar", figsize=(40, 20), fontsize=20,
                title='Frequency of ages')
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.show()

# ------------------------------------------------------------
# Looking at the Pregnancies data
# ------------------------------------------------------------
# Renamed from `age_counts` — the original reused the Age variable
# name for pregnancy counts, which is misleading.
pregnancy_counts = data['Pregnancies'].value_counts()
pregnancy_counts.plot(kind="bar", figsize=(40, 20), fontsize=20)
plt.xlabel("Pregnancies", size=20)
plt.ylabel("Frequency", size=20)
plt.title('Frequency of Pregnancies', size=20)
plt.show()

# ------------------------------------------------------------
# Order of importance (feature importance)
# ------------------------------------------------------------
# NOTE(review): 'BloodPressure' is absent from the feature list —
# presumably dropped on purpose; confirm with the author.
x = data[['Glucose', 'BMI', 'Age', 'Pregnancies', 'SkinThickness',
          'Insulin', 'DiabetesPedigreeFunction']]
y = data['Outcome']  # clearer than positional data.iloc[:, 8]

# BUG FIX: seeded the stochastic model so results are reproducible
# under Restart & Run All (the original had no random_state anywhere).
model = ExtraTreesClassifier(random_state=0)
model.fit(x, y)
print(model.feature_importances_)

# Plot feature importances for better visualization (only 7 features
# exist, so nlargest(20) simply returns all of them, largest first).
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
feat_importances.nlargest(20).plot(kind='bar')
plt.show()

# ------------------------------------------------------------
# Classic correlation
# ------------------------------------------------------------
# The measure of the relationship between variables.
print(data.corr())
sns.heatmap(data.corr(), annot=True)
plt.show()
##################################################
# Clustering data by means of k-means
##################################################

x = data[['Glucose', 'BMI', 'Age', 'Pregnancies', 'SkinThickness',
          'Insulin', 'DiabetesPedigreeFunction']]
y = data['Outcome']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.10, random_state=0)

#
# Identifying the number of valid clusters (elbow method)
#
inertia = []
K = range(1, 10)
for k in K:
    # BUG FIX: the original fitted each KMeans model TWICE
    # (`KMeans(...).fit(x)` followed by `kmeanModel.fit(x)`), doubling
    # the work for no benefit. Also seeded for reproducibility.
    kmeanModel = KMeans(n_clusters=k, random_state=0).fit(x)
    inertia.append(kmeanModel.inertia_)

# Plot the elbow
plt.plot(K, inertia, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.show()

#
# Once the number of clusters is chosen (k = 3 from the elbow above),
# the model can be fitted and the cluster labels inspected.
#
kmeans = KMeans(n_clusters=3, random_state=0).fit(x)
# y_kmeans = kmeans.predict(x) -> Used to assess predictions

# kmeans.labels_ holds each row's cluster id; wrap it in a DataFrame
# so it can be concatenated back onto the original data.
labels = pd.DataFrame(kmeans.labels_)
labeled_data = pd.concat((data, labels), axis=1)
labeled_data = labeled_data.rename({0: 'labels'}, axis=1)
print()
print('Labelled information')
print(labeled_data.head())

# Plotting data in clusters
sns.lmplot(x='Glucose', y='Insulin', data=labeled_data,
           hue='labels', fit_reg=False)
ax = plt.gca()
ax.set_title('Clusters Glucose vs. Insulin')

sns.lmplot(x='Insulin', y='Outcome', data=labeled_data,
           hue='labels', fit_reg=False)
ax = plt.gca()
ax.set_title('Clusters Insulin vs. Outcome')

sns.pairplot(labeled_data, hue='labels')
plt.show()

##################################################
# Classification Task: Logistic Regression
##################################################

x = data[['Glucose', 'BMI', 'Age', 'Pregnancies', 'SkinThickness',
          'Insulin', 'DiabetesPedigreeFunction']]
y = data['Outcome']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.10, random_state=0)

log_reg = LogisticRegression(random_state=1, max_iter=1000)
log_reg.fit(x_train, y_train)
y_pred = log_reg.predict(x_test)
# (removed a bare `y_pred` expression: mid-cell expressions are never
# displayed by Jupyter, so it was dead code)
# BUG FIX: the message said "Training Accuracy" but the score is
# computed on the held-out test split.
print("Logistic Regression Test Accuracy: ", log_reg.score(x_test, y_test))
##################################################
# Classification Task: Random Forest
##################################################
# NOTE(review): this cell relies on x_train/x_test/y_train/y_test
# created in the Logistic Regression cell above — hidden cross-cell
# state; the cells must be run in order.

forest = RandomForestClassifier(n_estimators=20, criterion="entropy",
                                random_state=0)
forest.fit(x_train, y_train)

# Predict with the forest classifier.
y_pred = forest.predict(x_test)
# BUG FIX: mislabelled message — the score is on the test split,
# not the training split.
print("Random Forest Classifier Test Accuracy: ", forest.score(x_test, y_test))

##################################################
# Classification Task: KNN Classifier
##################################################
# NOTE(review): n_neighbors=1 memorises the training set and is prone
# to overfitting; consider tuning k with cross-validation.

KNN = KNeighborsClassifier(n_neighbors=1)
KNN.fit(x_train, y_train)

# Predict with KNN.
y_pred = KNN.predict(x_test)

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# BUG FIX: mislabelled message — the score is on the test split.
print("KNN classifier test accuracy: ", KNN.score(x_test, y_test))
##################################################
# Comparing performances (ROC / AUC)
##################################################
# DEAD CODE REMOVED: the original pre-filled KNN_predict /
# log_reg_predict / forest_predict with zero-lists that were
# immediately overwritten by predict_proba below.

# Re-fitting is redundant (the models were already trained above) but
# is kept so this cell is self-contained w.r.t. model state; sklearn's
# .fit returns the estimator itself.
model1 = KNN.fit(x_train, y_train)
model2 = log_reg.fit(x_train, y_train)
model3 = forest.fit(x_train, y_train)

# Probability of the positive class (column 1) for each model.
KNN_predict = model1.predict_proba(x_test)[:, 1]
log_reg_predict = model2.predict_proba(x_test)[:, 1]
forest_predict = model3.predict_proba(x_test)[:, 1]

KNN_auc = roc_auc_score(y_test, KNN_predict)
log_reg_auc = roc_auc_score(y_test, log_reg_predict)
forest_auc = roc_auc_score(y_test, forest_predict)

print("KNN:ROC AUC=%.3f" % (KNN_auc))
print("log_reg:ROC AUC=%.3f" % (log_reg_auc))
print("forest:ROC AUC=%.3f" % (forest_auc))

KNN_fpr, KNN_tpr, _ = roc_curve(y_test, KNN_predict)
log_reg_fpr, log_reg_tpr, _ = roc_curve(y_test, log_reg_predict)
forest_fpr, forest_tpr, _ = roc_curve(y_test, forest_predict)

# Use the `plt` alias consistently (`pyplot` and `plt` name the same
# module) and label the axes so the ROC figure stands on its own.
plt.plot(KNN_fpr, KNN_tpr, linestyle="--", label="KNN")
plt.plot(log_reg_fpr, log_reg_tpr, marker=".", label="log_reg")
plt.plot(forest_fpr, forest_tpr, marker=".", label="forest")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend()
plt.show()