Running A Random Forest Classifier On The Gapminder Dataset

We’ll be running a Random Forest classifier on the Gapminder dataset, using a handful of variables to predict income level, which has been grouped into three classes.

We’ll start by loading the data, preprocessing it, and creating a new feature: the income level divided into three groups.

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from scipy.stats import pearsonr
import pandas as pd
from seaborn import regplot
import matplotlib.pyplot as plt

# check for missing data
def check_missing(dataframe, cols):
    """Print, for each column, how many cells hold the string ``' '``.

    A single space is presumably the placeholder the CSV uses for a
    missing value -- confirm against the data file.

    Args:
        dataframe: pandas DataFrame to inspect.
        cols: iterable of column names to report on.

    Returns:
        None. The counts are printed, one column at a time, each followed
        by a blank line.
    """
    for col in cols:
        print("Column {} is missing:".format(col))
        # Compare the Series, not the raw ndarray: on some NumPy versions
        # ``ndarray == str`` on a numeric column collapses to a scalar, which
        # has no ``.sum()``. The Series comparison is always element-wise.
        print((dataframe[col] == ' ').sum())
        print()

# convert to numeric
def to_numeric(dataframe, cols):
    """Convert the listed columns of ``dataframe`` to numeric dtype in place.

    Values that cannot be parsed as numbers (for example a blank ``' '``
    cell) become NaN, so they can be removed afterwards with ``dropna``.

    Args:
        dataframe: pandas DataFrame to modify; it is mutated, not copied.
        cols: iterable of column names to convert.

    Returns:
        None.
    """
    for col in cols:
        # ``Series.convert_objects(convert_numeric=True)`` no longer exists in
        # current pandas; ``pd.to_numeric(..., errors='coerce')`` is the
        # replacement and likewise maps unparseable values to NaN.
        dataframe[col] = pd.to_numeric(dataframe[col], errors='coerce')

# check frequency distribution
def freq_dist(dataframe, cols, norm_cols):
    """Print a frequency distribution for every listed column.

    Args:
        dataframe: pandas DataFrame holding the columns.
        cols: column names reported as raw counts.
        norm_cols: column names reported as proportions
            (``value_counts(normalize=True)``).

    Returns:
        None. Each distribution is printed under a ``Freq dist for: <col>``
        header; missing values are included (``dropna=False``) and the
        result is not sorted by frequency (``sort=False``).
    """
    # One loop serves both groups; only the ``normalize`` flag differs.
    for columns, normalize in ((cols, False), (norm_cols, True)):
        for col in columns:
            print("Freq dist for: {}".format(col))
            count = dataframe[col].value_counts(
                sort=False, dropna=False, normalize=normalize
            )
            print(count)


# Load the raw Gapminder data from the working directory.
df = pd.read_csv("gapminder.csv")

# Optional diagnostics:
#print(df.head())
#print(df.isnull().values.any())

# Columns inspected as raw counts / as proportions by freq_dist.
cols = ['lifeexpectancy', 'breastcancerper100th', 'suicideper100th']
norm_cols = ['internetuserate', 'employrate', 'incomeperperson']
# Columns that are used as model features further down but appear in neither
# list above. They need the same numeric conversion, otherwise their
# unparseable cells never become NaN and dropna() cannot remove them.
extra_feature_cols = ['alcconsumption', 'urbanrate']

# Work on a copy so the raw frame stays available for inspection.
df2 = df.copy()

#check_missing(df, cols)
#check_missing(df, norm_cols)

to_numeric(df2, cols)
to_numeric(df2, norm_cols)
to_numeric(df2, extra_feature_cols)

#freq_dist(df2, cols, norm_cols)

# Drop every row with a missing value. The explicit copy makes df_clean an
# independent frame, so adding a column to it later does not trigger
# pandas' SettingWithCopyWarning.
df_clean = df2.dropna().copy()


def plot_regression(x, y, data, label_1, label_2):
    """Show a scatter plot of ``y`` against ``x`` with a fitted regression line.

    Args:
        x: forwarded to ``seaborn.regplot`` as its ``x`` argument.
        y: forwarded to ``seaborn.regplot`` as its ``y`` argument.
        data: forwarded to ``seaborn.regplot`` as its ``data`` argument.
        label_1: text for the x-axis label.
        label_2: text for the y-axis label.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # The Axes object returned by regplot is not needed; labels are set
    # through the pyplot state machine instead.
    regplot(x=x, y=y, fit_reg=True, data=data)
    plt.xlabel(label_1)
    plt.ylabel(label_2)
    plt.show()

# Report the Pearson correlation of each variable with the internet use rate.
for message, column in (
    ('Association between life expectancy and internet use rate', 'lifeexpectancy'),
    ('Association between employment rate and internet use rate', 'employrate'),
):
    print(message)
    print(pearsonr(df_clean[column], df_clean['internetuserate']))

def group_incomes(row, low_cutoff=744.23, mid_cutoff=942.32):
    """Return the income group (1, 2 or 3) for a single row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with an
            ``'incomeperperson'`` entry.
        low_cutoff: incomes less than or equal to this value are group 1.
        mid_cutoff: incomes above ``low_cutoff`` and less than or equal to
            this value are group 2.

    Returns:
        1, 2 or 3. Anything above ``mid_cutoff`` is group 3. A NaN income
        compares False against both cutoffs and therefore also lands in
        group 3.
    """
    # NOTE(review): the default band for group 2 (744.23 - 942.32) is very
    # narrow compared with the other two groups -- confirm these cutoffs are
    # the intended ones.
    income = row['incomeperperson']
    if income <= low_cutoff:
        return 1
    if income <= mid_cutoff:
        return 2
    return 3

# Label every row with its income group by applying group_incomes row-wise.
income_groups = df_clean.apply(group_incomes, axis=1)
df_clean['income_group'] = income_groups

We’ll now use scikit-learn’s train_test_split function to divide the dataset into training and testing sets. After this, we’ll fit the classifier, get predictions, and print some statistics about the performance of the model.

# Fixed seed so the split and the forest, and therefore the reported
# accuracy and confusion matrix, are the same on every run.
RANDOM_STATE = 42

feature_cols = ['alcconsumption', 'breastcancerper100th', 'employrate',
                'internetuserate', 'lifeexpectancy', 'urbanrate']

# astype returns a new object rather than converting in place, so the result
# has to be assigned for the cast to have any effect.
X = df_clean[feature_cols].astype(float)
# The target holds class labels (1, 2, 3), so it is used as-is; the original
# float cast was discarded and never took effect.
Y = df_clean['income_group']

# Hold out 25% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=RANDOM_STATE
)

RF_clf = RandomForestClassifier(
    n_estimators=300,
    bootstrap=True,
    max_features='sqrt',
    random_state=RANDOM_STATE,
)
RF_clf.fit(X_train, y_train)
preds = RF_clf.predict(X_test)

# Overall accuracy, then the per-class confusion matrix
# (rows: true class, columns: predicted class).
print(accuracy_score(y_test, preds))
print(confusion_matrix(y_test, preds))

Here’s the accuracy and the confusion matrix:

0.85

[[ 8 0 1]
[ 2 0 0]
[ 3 0 26]]

Share this:

Related

Leave a comment Cancel reply