Notebooks/Customer_loan_repayment_problem/loan-prediction-problem.ipynb
The dataset for this project was retrieved from Kaggle.
The major aim of this project is to predict whether the customers will have their loan paid or not. Therefore, this is a supervised classification problem to be trained.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import plotly.express as px
# Load the Kaggle loan-prediction training set and take a first look.
df = pd.read_csv('../input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv')
df.head()
df.shape
# Rename every column to snake_case in a single assignment.
# (The original also called df.columns.str.lower() first, but that result
# was immediately overwritten by this list, so the call is dropped.)
df.columns = ['loan_id', 'gender', 'married', 'dependents', 'education',
              'self_employed', 'applicant_income', 'co-applicant_income',
              'loan_amount', 'loan_amount_term', 'credit_history',
              'property_area', 'loan_status']
df.isnull().sum()
We impute the missing values in "loan_amount" and "credit_history". For the other null values, we either delete a row that has a null value for a particular feature, or drop a column entirely if more than 70-75% of its values are missing. This method is advised only when there are enough samples in the data set.
# Impute the two numeric columns: mean for loan_amount, median for
# credit_history (the median preserves the 0/1 coding).
df['loan_amount'] = df['loan_amount'].fillna(df['loan_amount'].mean())
df['credit_history'] = df['credit_history'].fillna(df['credit_history'].median())
# Drop any rows that still contain nulls, then re-inspect the frame.
df.dropna(axis=0, inplace=True)
df.isnull().sum()
df.head()
df.shape
df.info()
df.describe()
# 'dependents' is stored as strings (e.g. '0', '1', '2', '3+');
# inspect it, then label-encode it to integer codes.
type(df['dependents'].iloc[0])
df['dependents'].unique()
dependents_encoder = LabelEncoder()
df['dependents'] = dependents_encoder.fit_transform(df['dependents'])
# Class balance of the target variable.
approved = df[df['loan_status'] == 'Y'].count()['loan_status']
rejected = df[df['loan_status'] == 'N'].count()['loan_status']
plt.figure(figsize=(8, 8))
# Feed the computed counts to the pie chart instead of the hard-coded
# [376, 166], so the figure stays correct if preprocessing changes.
plt.pie(x=[approved, rejected], labels=['Yes', 'No'], autopct='%1.0f%%',
        pctdistance=0.5, labeldistance=0.7, colors=['g', 'r'])
plt.title('Distribution of Loan Status')
69% of applicants repay the loan and 31% do not repay the loan.
# 2x3 grid of count plots: each categorical feature split by loan_status.
plt.figure(figsize=(15, 10))
count_panels = [
    ('gender', 'plasma'),
    ('married', 'viridis'),
    ('education', 'copper'),
    ('credit_history', 'summer'),
    ('self_employed', 'autumn'),
    ('property_area', 'PuBuGn'),
]
for panel, (feature, cmap) in enumerate(count_panels, start=1):
    plt.subplot(2, 3, panel)
    sns.countplot(x=feature, hue='loan_status', data=df, palette=cmap)
    # Only the left-hand column (panels 1 and 4) keeps its y axis.
    if panel not in (1, 4):
        plt.ylabel(' ')
        plt.yticks([])
Comparison between Genders in getting the Loan shows that a Male Individual has more chance of repaying the Loan.
Comparison between Married Status in getting the Loan shows that a Married Individual has more chance of repaying the Loan.
Comparison between Education Status of an Individual in getting the Loan shows that a Graduate Individual has more chance of repaying the Loan.
Comparison between Self-Employed or Not in getting the Loan shows that Not Self-Employed has more chance of repaying the Loan.
Comparison between Credit History for getting the Loan shows that an individual with a credit history has more chance of repaying the Loan.
Comparison between Property Area for getting the Loan shows that People living in Semi-Urban Area have more chance to repay the Loan.
# Interactive sunburst: loan amounts within gender -> loan_status.
px.sunburst(data_frame=df, path=['gender', 'loan_status'], color='loan_amount')
# 2x3 grid of violin plots: loan_amount distribution per categorical
# feature, split by loan_status.
plt.figure(figsize=(15, 10))
violin_panels = [
    ('gender', 'plasma'),
    ('married', 'viridis'),
    ('education', 'copper'),
    ('credit_history', 'summer'),
    ('self_employed', 'autumn'),
    ('property_area', 'PuBuGn'),
]
for panel, (feature, cmap) in enumerate(violin_panels, start=1):
    plt.subplot(2, 3, panel)
    sns.violinplot(x=feature, y='loan_amount', hue='loan_status',
                   data=df, palette=cmap)
    # Only the left-hand column (panels 1 and 4) keeps its y axis.
    if panel not in (1, 4):
        plt.ylabel(' ')
        plt.yticks([])
# Distributions of the three numeric features.
# sns.distplot is deprecated (removed in seaborn >= 0.14); histplot with
# kde=True and stat='density' reproduces the same density histogram
# plus KDE overlay, and takes edgecolor directly.
plt.figure(figsize=(18, 5))
plt.subplot(1, 3, 1)
sns.histplot(df['applicant_income'], bins=30, color='r',
             kde=True, stat='density', edgecolor='white')
plt.ylabel('frequency')
plt.subplot(1, 3, 2)
sns.histplot(df['co-applicant_income'], bins=30, color='blue',
             kde=True, stat='density', edgecolor='white')
plt.subplot(1, 3, 3)
sns.histplot(df['loan_amount'], bins=30, color='black',
             kde=True, stat='density', edgecolor='white')
# 3-D view of the two incomes vs. loan amount, colored by outcome.
px.scatter_3d(data_frame=df, x='applicant_income', y='co-applicant_income',
              z='loan_amount', color='loan_status')
# Label-encode the remaining categorical columns. One fitted encoder is
# kept per column (in a dict) so the integer codes can be inverted later.
# This replaces six copy-pasted fit/transform stanzas and fixes the
# 'model6' name collision with the encoder already used for 'dependents'.
encoders = {}
for col in ['gender', 'married', 'education', 'self_employed',
            'property_area', 'loan_status']:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])
df.head()
plt.figure(figsize=(12, 8))
# loan_id is a non-numeric identifier; exclude it so corr() works on
# modern pandas (>= 2.0 raises on object columns) and the matrix only
# contains meaningful features.
corr = df.drop('loan_id', axis=1).corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True  # hide the redundant upper triangle
with sns.axes_style("white"):
    ax = sns.heatmap(corr, mask=mask, square=True, annot=True,
                     linewidths=2, cmap='viridis')
plt.title('Correlation Matrix for Loan Status')
From the above figure, we can see that Credit_History (Independent Variable) has the maximum correlation with Loan_Status (Dependent Variable). Which denotes that the Loan_Status is heavily dependent on the Credit_History.
# Features/target split: loan_id is an identifier, not a predictor.
X = df.drop(['loan_id', 'loan_status'], axis=1)
y = df['loan_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)
# Raise max_iter: the unscaled income features make the default 100
# lbfgs iterations insufficient, producing a ConvergenceWarning.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
lr_prediction = lr.predict(X_test)
print(confusion_matrix(y_test, lr_prediction))
print('\n')
print(classification_report(y_test, lr_prediction))
print('\n')
print('Logistic Regression accuracy: ', accuracy_score(y_test, lr_prediction))
# Decision tree. random_state is fixed so tie-breaking during splits
# (and therefore the reported accuracy) is reproducible across runs,
# consistent with random_state=0 used for the train/test split.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
dt_prediction = dt.predict(X_test)
print(confusion_matrix(y_test, dt_prediction))
print('\n')
print(classification_report(y_test, dt_prediction))
print('\n')
print('Decision Tree Accuracy: ', accuracy_score(y_test, dt_prediction))
# Random forest (200 trees). random_state is fixed so the bootstrap
# sampling — and the reported accuracy — is reproducible across runs.
rf = RandomForestClassifier(n_estimators=200, random_state=0)
rf.fit(X_train, y_train)
rf_prediction = rf.predict(X_test)
print(confusion_matrix(y_test, rf_prediction))
print('\n')
print(classification_report(y_test, rf_prediction))
print('\n')
print('Random Forest Accuracy: ', accuracy_score(y_test, rf_prediction))
# Elbow search: test-set error rate of KNN for k = 1..39.
# (The loop body lost its indentation in the notebook export; restored here.)
error_rate = []
for n in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y_train)
    knn_prediction = knn.predict(X_test)
    error_rate.append(np.mean(knn_prediction != y_test))
print(error_rate)
plt.figure(figsize=(8, 6))
sns.set_style('whitegrid')
plt.plot(list(range(1, 40)), error_rate, color='b', marker='o', linewidth=2,
         markersize=12, markerfacecolor='r', markeredgecolor='r')
plt.xlabel('Number of Neighbors')
plt.ylabel('Error Rate')
plt.title('Elbow Method')
# NOTE(review): picking k from test-set error leaks the test set into
# model selection; a validation split or cross-validation would be sounder.
# Final KNN model with k chosen from the elbow plot above.
knn = KNeighborsClassifier(n_neighbors=23)
knn.fit(X_train, y_train)
knn_prediction = knn.predict(X_test)
print(confusion_matrix(y_test, knn_prediction))
print('\n')
print(classification_report(y_test, knn_prediction))
print('\n')
# Fixed label: was 'KNN accuracy Accuracy' (duplicated word); now matches
# the summary printout style used for the other models.
print('KNN Accuracy: ', accuracy_score(y_test, knn_prediction))
# Support-vector classifier with default RBF kernel.
svc = SVC()
svc.fit(X_train, y_train)
svc_prediction = svc.predict(X_test)
print(confusion_matrix(y_test, svc_prediction))
print('\n')
print(classification_report(y_test, svc_prediction))
print('\n')
# Fixed label: the original string contained a stray Arabic diacritic
# character before 'Accuracy'.
print('SVC Accuracy: ', accuracy_score(y_test, svc_prediction))
# Side-by-side accuracy summary for all five classifiers.
results = [
    ('Logistic Regression', lr_prediction),
    ('Decision Tree', dt_prediction),
    ('Random Forest', rf_prediction),
    ('KNN', knn_prediction),
    ('SVC', svc_prediction),
]
for label, preds in results:
    print(f'{label} Accuracy: ', accuracy_score(y_test, preds))
The Loan Status is heavily dependent on the Credit History for Predictions.
The Logistic Regression algorithm gives us the maximum Accuracy (80%) compared to the other 4 Machine Learning Classification Algorithms.