Testing Potential Moderators

We’re going to check whether income level moderates the relationship between internet usage rates and life expectancy — that is, whether the strength of that relationship differs between low-, medium-, and high-income countries.

Here’s the code used to carry out the check for moderators.

from scipy.stats import pearsonr
import pandas as pd
from seaborn import regplot
import matplotlib.pyplot as plt
import numpy as np

# check for missing data
def check_missing(dataframe, cols):
    """Print, for each column in ``cols``, how many cells hold the blank placeholder.

    A cell is counted when its value is exactly a single space (' ').
    NaN cells are not counted by this check.

    Args:
        dataframe: pandas DataFrame to inspect.
        cols: iterable of column labels to check.

    Returns:
        None. The counts are printed, one column at a time, each followed
        by an empty line.
    """
    for col in cols:
        print("Column {} is missing:".format(col))
        # Series.eq compares element-wise for every dtype, so a numeric column
        # reports 0 instead of depending on how NumPy compares an array
        # with a string scalar.
        print(int(dataframe[col].eq(' ').sum()))
        print()

# convert to numeric
def to_numeric(dataframe, cols):
    """Convert the listed columns of ``dataframe`` to a numeric dtype, in place.

    Values that cannot be parsed as numbers (for example the blank ' '
    placeholder) become NaN.

    Args:
        dataframe: pandas DataFrame whose columns are overwritten.
        cols: iterable of column labels to convert.

    Returns:
        None. ``dataframe`` is modified in place.
    """
    for col in cols:
        # Series.convert_objects() no longer exists in current pandas;
        # pd.to_numeric with errors='coerce' gives the same
        # "unparseable -> NaN" result.
        dataframe[col] = pd.to_numeric(dataframe[col], errors='coerce')

# check frequency distribution
def freq_dist(dataframe, cols, norm_cols):
    """Print frequency distributions for the given columns.

    Columns in ``cols`` are reported as raw counts; columns in ``norm_cols``
    are reported as proportions. NaN is included as its own category and
    the categories are not sorted by frequency.

    Args:
        dataframe: pandas DataFrame to summarise.
        cols: iterable of column labels to report as counts.
        norm_cols: iterable of column labels to report as proportions.

    Returns:
        None. Each distribution is printed under a heading naming the column.
    """
    # One pass per reporting mode: counts first, then normalized proportions.
    for normalize, columns in ((False, cols), (True, norm_cols)):
        for col in columns:
            print("Freq dist for: {}".format(col))
            count = dataframe[col].value_counts(
                sort=False, dropna=False, normalize=normalize
            )
            print(count)


# Load the raw Gapminder data set.
df = pd.read_csv("gapminder.csv")

#print(df.head())
#print(df.isnull().values.any())

# Columns of interest; the two lists are only treated differently by freq_dist
# (raw counts for `cols`, proportions for `norm_cols`).
cols = ['lifeexpectancy', 'breastcancerper100th', 'suicideper100th']
norm_cols = ['internetuserate', 'employrate', 'incomeperperson']

# Work on a copy so the raw frame stays untouched.
df2 = df.copy()

#check_missing(df, cols)
#check_missing(df, norm_cols)

to_numeric(df2, cols)
to_numeric(df2, norm_cols)

#freq_dist(df2, cols, norm_cols)

# Drop every row that has a missing value in any column. The explicit copy
# makes df_clean an independent frame, so the column assignments made to it
# afterwards do not raise SettingWithCopyWarning.
df_clean = df2.dropna().copy()
# NOTE(review): incomeperperson has already been passed through to_numeric
# above, so this replace is presumably a no-op; kept to leave the cleaning
# steps unchanged.
df_clean['incomeperperson'] = df_clean['incomeperperson'].replace('', np.nan)

def plot_regression(x, y, data, label_1, label_2):
    """Show a scatter plot of ``y`` against ``x`` with a fitted regression line.

    Args:
        x: name of the column in ``data`` plotted on the x axis.
        y: name of the column in ``data`` plotted on the y axis.
        data: DataFrame holding both columns.
        label_1: text used for the x-axis label.
        label_2: text used for the y-axis label.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # The object returned by regplot is not needed; the labels are set
    # through pyplot.
    regplot(x=x, y=y, fit_reg=True, data=data)
    plt.xlabel(label_1)
    plt.ylabel(label_2)
    plt.show()

#plot_regression('lifeexpectancy', 'internetuserate', df_clean, 'Life Expectancy', 'Internet Use Rate')
#plot_regression('employrate', 'internetuserate', df_clean, 'Employment Rate', 'Internet Use Rate')

# Pearson correlation (r, p-value) of internet use rate with each predictor,
# computed over the whole cleaned data set.
internet_use = df_clean['internetuserate']
for heading, predictor in (
    ('Association between life expectancy and internet use rate', 'lifeexpectancy'),
    ('Association between employment rate and internet use rate', 'employrate'),
):
    print(heading)
    print(pearsonr(df_clean[predictor], internet_use))

def group_incomes(row, low_cutoff=744.23, mid_cutoff=942.32):
    """Classify a row into an income group from its 'incomeperperson' value.

    Args:
        row: mapping or pandas Series with an 'incomeperperson' entry.
        low_cutoff: highest income still classified as group 1 (low).
        mid_cutoff: highest income still classified as group 2 (medium).

    Returns:
        1 when income <= low_cutoff, 2 when low_cutoff < income <= mid_cutoff,
        3 when income > mid_cutoff, and NaN when the income is missing.
    """
    income = row['incomeperperson']
    # Comparisons against NaN are always False, so without this guard a
    # missing income would fall through to the high-income group.
    if pd.isna(income):
        return np.nan
    if income <= low_cutoff:
        return 1
    if income <= mid_cutoff:
        return 2
    return 3

# Label every country with its income group (1 = low, 2 = medium, 3 = high).
df_clean['income_group'] = df_clean.apply(lambda row: group_incomes(row), axis=1)

# One sub-frame per income group.
subframe_1, subframe_2, subframe_3 = (
    df_clean[df_clean['income_group'] == group_id] for group_id in (1, 2, 3)
)

# Pearson correlation (r, p-value) between life expectancy and internet use
# rate, computed separately inside each income group.
for heading, subframe in (
    ('Association between life expectancy and internet use rate for low income countries', subframe_1),
    ('Association between life expectancy and internet use rate for medium income countries', subframe_2),
    ('Association between life expectancy and internet use rate for high income countries', subframe_3),
):
    print(heading)
    print(pearsonr(subframe['lifeexpectancy'], subframe['internetuserate']))

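Three different income groups are created using a lambda function and the “group_incomes” function: countries with an income per person of at most 744.23 are labelled low income (group 1), those above 744.23 and up to 942.32 are labelled medium income (group 2), and all remaining countries are labelled high income (group 3).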

Here are the results of the code:

Association between life expectancy and internet use rate for low income countries
(0.38386370068495235, 0.010101223355274047)
Association between life expectancy and internet use rate for medium income countries
(0.9966009508278395, 0.05250454954743393)
Association between life expectancy and internet use rate for high income countries
(0.7019997488251704, 6.526819886007788e-18)

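For low income countries, there is a weak-to-moderate positive correlation between life expectancy and internet use rate (r ≈ 0.38), with a P-value (≈ 0.01) small enough to be significant at the 0.05 level. For medium income countries, the correlation is very strong (r ≈ 0.997), but the P-value (≈ 0.053) is slightly above 0.05, so it cannot be considered significant; a correlation this high with a P-value this large indicates that the group contains only a handful of countries, which is consistent with the narrow income band (744.23 to 942.32) that defines it. For high income countries, the correlation is strong (r ≈ 0.70) with a very small P-value, suggesting a significant correlation. Because the strength of the association differs between the income groups, income level may moderate the relationship between internet use rate and life expectancy.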

Leave a comment

Design a site like this with WordPress.com
Get started