This is an implementation of a decision tree classifier run on the GapMinder dataset.
Here’s the code used to run the Decision Tree classifier:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from scipy.stats import pearsonr
import pandas as pd
from seaborn import regplot
import matplotlib.pyplot as plt
# check for missing data
def check_missing(dataframe, cols):
    for col in cols:
        print("Column {} is missing:".format(col))
        print((dataframe[col].values == ' ').sum())
        print()
# convert to numeric
def to_numeric(dataframe, cols):
    for col in cols:
        # convert_objects() has been removed from pandas; pd.to_numeric with
        # errors='coerce' turns non-numeric entries into NaN instead
        dataframe[col] = pd.to_numeric(dataframe[col], errors='coerce')
# check frequency distribution
def freq_dist(dataframe, cols, norm_cols):
    for col in cols:
        print("Freq dist for: {}".format(col))
        count = dataframe[col].value_counts(sort=False, dropna=False)
        print(count)
    for col in norm_cols:
        print("Freq dist for: {}".format(col))
        count = dataframe[col].value_counts(sort=False, dropna=False, normalize=True)
        print(count)
df = pd.read_csv("gapminder.csv")
cols = ['lifeexpectancy', 'breastcancerper100th', 'suicideper100th']
norm_cols = ['internetuserate', 'employrate', 'incomeperperson']
df2 = df.copy()
to_numeric(df2, cols)
to_numeric(df2, norm_cols)
df_clean = df2.dropna()
def plot_regression(x, y, data, label_1, label_2):
    reg_plot = regplot(x=x, y=y, fit_reg=True, data=data)
    plt.xlabel(label_1)
    plt.ylabel(label_2)
    plt.show()
print('Association between life expectancy and internet use rate')
print(pearsonr(df_clean['lifeexpectancy'], df_clean['internetuserate']))
print('Association between employment rate and internet use rate')
print(pearsonr(df_clean['employrate'], df_clean['internetuserate']))
def group_incomes(row):
    if row['incomeperperson'] <= 744.23:
        return 1
    elif row['incomeperperson'] <= 942.32:
        return 2
    else:
        return 3
df_clean['income_group'] = df_clean.apply(lambda row: group_incomes(row), axis=1)
print(df_clean.head())
X = df_clean[['alcconsumption', 'breastcancerper100th', 'employrate', 'internetuserate', 'lifeexpectancy', 'urbanrate']]
X = X.astype(float)  # astype() returns a new object, so the result must be assigned back
Y = df_clean['income_group']
Y = Y.astype(float)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
DT_clf = DecisionTreeClassifier()
DT_clf.fit(X_train, y_train)
preds = DT_clf.predict(X_test)
print(accuracy_score(y_test, preds))
print(confusion_matrix(y_test, preds))
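Once the classifier is fitted, the learned splits can be inspected directly. Below is a minimal sketch using scikit-learn’s export_text helper (this is my addition for illustration, not part of the assignment code, and it assumes a scikit-learn version that provides export_text):
from sklearn.tree import export_text

# print the fitted tree's decision rules as plain text;
# feature_names must match the columns used to build X
print(export_text(DT_clf, feature_names=list(X.columns)))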
The features used by the Decision Tree classifier are:
'alcconsumption','breastcancerper100th','employrate', 'internetuserate','lifeexpectancy','urbanrate'
Meanwhile, the target variable is the income group created above by binning incomeperperson, the same grouping introduced in the lesson on confounding variable comparisons.
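Because the target is a constructed grouping, it is worth checking how many countries fall into each income group before training, since a heavily skewed target makes the rare classes hard to learn. A quick check (my addition, not part of the original code):
# class balance of the constructed income_group target
print(df_clean['income_group'].value_counts(sort=False))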
Here’s the shape of the training and testing datasets:
(127, 6)
(127,)
(32, 6)
(32,)
Here’s the accuracy for the decision tree:
0.8125
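One caveat: train_test_split is called without a random_state, so the exact accuracy will vary from run to run. A more stable estimate can come from k-fold cross-validation; here is a minimal sketch (cross_val_score and the 5-fold setting are my additions, not part of the original run):
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy over the full cleaned dataset
cv_scores = cross_val_score(DecisionTreeClassifier(random_state=0), X, Y, cv=5)
print(cv_scores.mean(), cv_scores.std())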
And here’s the confusion matrix:
[[ 8 0 2]
[ 1 0 0]
[ 3 0 18]]
The correct predictions run along the diagonal from the top left to the bottom right. The model classified 8 Group 1 instances and 18 Group 3 instances correctly, but none in Group 2. It is weakest on Group 2, which is not surprising: the second row of the matrix shows that only one Group 2 country landed in this test split, since the income band defining that group (incomeperperson between 744.23 and 942.32) is narrow. Adding explanatory variables beyond the ones I selected would likely help improve the model’s predictive power.
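To put numbers on the per-class weakness, the classification_report helper that is already imported can be printed alongside the confusion matrix, and the fitted tree’s feature_importances_ attribute shows which of the selected explanatory variables the splits actually use. A short sketch (these calls are my addition and were not run as part of the results above):
# per-class precision, recall and F1 for the three income groups
print(classification_report(y_test, preds))

# relative importance of each explanatory variable in the fitted tree
for name, importance in zip(X.columns, DT_clf.feature_importances_):
    print(name, round(importance, 3))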