# Program for Data Analysis Tools, Module 4: Moderating variable
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 8 11:25:51 2023
@author: ANA4MD
"""
# ANOVA
import numpy
import pandas
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
import seaborn
import matplotlib.pyplot as plt
# Load the Gapminder extract. low_memory=False makes pandas parse the file in
# one pass rather than in chunks, which avoids mixed-type column inference.
data = pandas.read_csv('gapminder.csv', low_memory=False)
# Lower-case all DataFrame column names so the lookups below do not depend on
# the capitalisation used in the CSV header.
data.columns = list(map(str.lower, data.columns))
# Print floats in fixed-point notation instead of scientific notation.
pandas.set_option('display.float_format', lambda x: '%f' % x)
# Turn empty or whitespace-only cells into missing values.
# numpy.nan is used because the numpy.NaN alias was removed in NumPy 2.0.
data = data.replace(r'^\s*$', numpy.nan, regex=True)
# Columns the analysis needs as numbers.
numeric_columns = ['femaleemployrate', 'lifeexpectancy', 'incomeperperson']
# Inspect each column's dtype before converting. These are bare expressions:
# they print nothing when the file runs as a script and only echo a value
# when executed in an interactive console.
for column in numeric_columns:
    data[column].dtype
# Convert to numeric; errors='coerce' turns values that cannot be parsed into
# NaN instead of raising.
for column in numeric_columns:
    data[column] = pandas.to_numeric(data[column], errors='coerce', downcast=None)
# The same two steps for urbanrate are currently disabled:
#data['urbanrate'].dtype
#data['urbanrate'] = pandas.to_numeric(data['urbanrate'], errors='coerce', downcast=None)
# Disabled alternative: a two-level urban/rural explanatory variable.
#print('The explantory variable is the urban rate in 2 levels (rural, urban)')
#print('distribution for urban rate in 2 groups (rural, urban) and creating a new variable urban as categorical variable')
#data['urbanclass'] = pandas.cut(data['urbanrate'], [0, 20, 100], labels=['rural', 'urban'])
#c1 = data['urbanclass'].value_counts(sort=False, dropna=False)
#print (c1)

# Explanatory variable: income per person binned into three ordered classes.
print('The explantory variable is the income per person in 3 levels (low class, midle class, upper class)')
print('distribution for income per person splits into 3 groups and creating a new variable income as categorical variable')
# pandas.cut builds right-closed intervals by default, i.e.
# (0, 2000], (2000, 24000], (24000, 120000]; values outside become NaN.
income_edges = [0, 2000, 24000, 120000]
income_labels = ['low class', 'middle class', 'upper class']
data['income'] = pandas.cut(data['incomeperperson'], income_edges, labels=income_labels)
# Frequency table in category order, with missing values counted as well.
c2 = data['income'].value_counts(sort=False, dropna=False)
print(c2)
# Life expectancy binned into two levels; the resulting 'life' column is what
# the script later uses to split the data into the sub2 / sub3 subsets.
print('The explantory variable is the life expectancy in 2 levels ( low, high)')
# Message fixed: the variable created here is 'life', not 'urban' (the old
# text was carried over from the disabled urban-rate code).
print('distribution for life expectancy in 2 groups (low, high) and creating a new variable life as categorical variable')
# Right-closed bins: (0, 70] -> 'low', (70, 100] -> 'high'; missing or
# out-of-range values become NaN.
data['life'] = pandas.cut(data['lifeexpectancy'], [0, 70, 100], labels=['low', 'high'])
# Frequency table in category order, with missing values counted as well.
c3 = data['life'].value_counts(sort=False, dropna=False)
print(c3)
# One-way ANOVA on the full data set, fitted as an OLS model with income
# class as a categorical predictor of the female employment rate.
anova_formula = 'femaleemployrate ~ C(income)'
model1 = smf.ols(formula=anova_formula, data=data).fit()
print(model1.summary())

# Descriptive statistics per income class, using only rows where both the
# response and the income class are present.
analysis_columns = ['femaleemployrate', 'income']
sub1 = data[analysis_columns].dropna()

print("means for femaleemployrate by income (low class, middle class, upper class)")
m1 = sub1.groupby('income').mean()
print(m1)

print("standard deviation for mean femaleemployrate by income (low class, middle class, upper class)")
st1 = sub1.groupby('income').std()
print(st1)
# Bivariate bar graph: mean female employment rate per income class, all rows.
# seaborn.factorplot was renamed to catplot in seaborn 0.9 and later removed,
# so catplot is needed on current releases. ci=None turns the error bars off
# (seaborn >= 0.12 spells this errorbar=None and warns about ci).
seaborn.catplot(x="income", y="femaleemployrate", data=data, kind="bar", ci=None)
plt.xlabel('income level')
plt.ylabel('Mean female employ %')
# Split the data by the moderator (life expectancy level). Rows whose 'life'
# value is missing match neither mask and end up in neither subset.
low_life_mask = data['life'] == 'low'
high_life_mask = data['life'] == 'high'
sub2 = data[low_life_mask]
sub3 = data[high_life_mask]
# ANOVA of female employment rate on income class, restricted to the rows
# with low life expectancy (sub2).
print ('association between income and femaleemploy rate for those with less life expectancy (Low)')
model2 = smf.ols(formula='femaleemployrate ~ C(income)', data=sub2).fit()
print(model2.summary())
print ("means for femaleemployrate by income (low class, middle class, upper class,) for low life expectancy")
# Average only the response column. sub2 still carries every column of the
# data set, including text columns, and pandas >= 2.0 raises TypeError when
# groupby().mean() meets non-numeric data. observed=False keeps income
# classes with no rows in the table (shown as NaN), as older pandas did.
m2 = sub2[['femaleemployrate', 'income']].groupby('income', observed=False).mean()
print(m2)
# Bivariate bar graph: mean female employment rate per income class for the
# low life expectancy subset (sub2).
# seaborn.factorplot was renamed to catplot in seaborn 0.9 and later removed,
# so catplot is needed on current releases. ci=None turns the error bars off
# (seaborn >= 0.12 spells this errorbar=None and warns about ci).
seaborn.catplot(x="income", y="femaleemployrate", data=sub2, kind="bar", ci=None)
plt.xlabel('income level')
plt.ylabel('Mean female employ %')
# ANOVA of female employment rate on income class, restricted to the rows
# with high life expectancy (sub3).
print ('association between income and femaleemploy ratefor those with more life expectancy (high)')
model3 = smf.ols(formula='femaleemployrate ~ C(income)', data=sub3).fit()
print(model3.summary())
print ("means for femaleemployrate by income (low class, middle class, upper class) for high life expectancy")
# Average only the response column. sub3 still carries every column of the
# data set, including text columns, and pandas >= 2.0 raises TypeError when
# groupby().mean() meets non-numeric data. observed=False keeps income
# classes with no rows in the table (shown as NaN), as older pandas did.
m3 = sub3[['femaleemployrate', 'income']].groupby('income', observed=False).mean()
print(m3)
# Bivariate bar graph: mean female employment rate per income class for the
# high life expectancy subset (sub3).
# seaborn.factorplot was renamed to catplot in seaborn 0.9 and later removed,
# so catplot is needed on current releases. ci=None turns the error bars off
# (seaborn >= 0.12 spells this errorbar=None and warns about ci).
seaborn.catplot(x="income", y="femaleemployrate", data=sub3, kind="bar", ci=None)
plt.xlabel('income level')
plt.ylabel('Mean female employ %')
0 notes