Data Management & Visualization: Week 4
Output / Figures:
Code
Import Libraries
import pandas
import numpy
import seaborn
import matplotlib.pyplot as plt
bug fix for display formats to avoid run time errors
# Print floats in plain fixed-point notation with six decimal places.
pandas.set_option('display.float_format', lambda value: '%f' % value)
Set pandas to show all columns and rows in DataFrames
# Lift both display limits so printed DataFrames are never truncated.
for _option in ('display.max_columns', 'display.max_rows'):
    pandas.set_option(_option, None)
Import gapminder.csv
# Load the dataset from the working directory into a DataFrame.
data = pandas.read_csv(
    'gapminder.csv',
    low_memory=False,  # read in one pass rather than in chunks
)
Replace all empty (blank or whitespace-only) entries with NaN
# Mark every cell that is empty or contains only whitespace as missing (NaN).
# `numpy.nan` is used instead of the `numpy.NaN` alias, which was removed in
# NumPy 2.0 and raises AttributeError there.
data = data.replace(r'^\s*$', numpy.nan, regex=True)
Extract the relevant variables from the original dataset and save them in a subset
print('List of extracted variables in subset')
# Keep only the three variables analysed below, in this column order.
subdata = data.loc[:, ['incomeperperson', 'lifeexpectancy', 'suicideper100th']]
Save a working copy of the reduced dataset
# Work on an independent (deep) copy so `subdata` itself stays unmodified.
subdata2 = subdata.copy(deep=True)
Convert all entries to numeric format
# Ensure each of the three analysis columns holds numeric values.
for _column in ('incomeperperson', 'lifeexpectancy', 'suicideper100th'):
    subdata2[_column] = pandas.to_numeric(subdata2[_column])
All rows containing NaN (i.e. rows that previously had an empty entry) are deleted from the subdata set
# Keep only complete rows: a row with a missing value in any column is
# discarded. The cleaned table is then printed in full.
subdata2 = subdata2.dropna(axis=0, how='any')
print(subdata2)
Describe statistical distribution of variable values
# Summary statistics for each of the three variables.
desc_income = subdata2['incomeperperson'].describe()
desc_lifeexp = subdata2['lifeexpectancy'].describe()
desc_suicide = subdata2['suicideper100th'].describe()

# Print each summary under its own heading, in the same order as before.
for _heading, _summary in (
    ('Statistics on "Income per Person"', desc_income),
    ('Statistics on "Life Expectancy"', desc_lifeexp),
    ('Statistics on "Suicide Rate per 100th"', desc_suicide),
):
    print(_heading)
    print(_summary)
Identify min & max values within each column
# Report the smallest and largest value of each variable.
#
# Series.min()/Series.max() are used instead of the built-in min()/max():
# they are vectorised rather than iterating the column element by element,
# and they skip NaN by default, whereas the built-ins give order-dependent
# results when a NaN is present.
print('Minimum & Maximum Income')
min_income = subdata2['incomeperperson'].min()
print(min_income)
max_income = subdata2['incomeperperson'].max()
print(max_income)
print('')

print('Minimum & Maximum Life Expectancy')
min_lifeexp = subdata2['lifeexpectancy'].min()
print(min_lifeexp)
max_lifeexp = subdata2['lifeexpectancy'].max()
print(max_lifeexp)
print('')

print('Minimum & Maximum Suicide Rate')
min_srate = subdata2['suicideper100th'].min()
print(min_srate)
max_srate = subdata2['suicideper100th'].max()
print(max_srate)
print('')
Split up income into percentiles
# Split income into deciles (10 equally populated groups).
#
# The previous call asked qcut for 11 quantile groups while labelling them in
# 10-percent steps ("0%tile" ... "100%tile"). Each group then covered about
# 9.1% of the rows, so the labels named the wrong percentile bands. Using 10
# groups, each labelled with the upper percentile bound of its band, matches
# the column name and makes the labels accurate.
subdata2['INCGROUPS10'] = pandas.qcut(
    subdata2.incomeperperson,
    10,
    labels=[
        "1=10%tile",
        "2=20%tile",
        "3=30%tile",
        "4=40%tile",
        "5=50%tile",
        "6=60%tile",
        "7=70%tile",
        "8=80%tile",
        "9=90%tile",
        "10=100%tile",
    ],
)

# qcut already yields a categorical column, so no extra cast is required.
# Share of rows per decile, listed in category order rather than by frequency.
inc_dist_percent = subdata2['INCGROUPS10'].value_counts(sort=False, normalize=True, dropna=True)
print(inc_dist_percent)
print(subdata2)
# Group income into fixed 5,000-wide bands from 0 up to 60,000. Each band is
# labelled with its midpoint (e.g. "2.5k" for the band ending at 5,000).
# Incomes that fall outside the bin edges receive no band (NaN).
subdata2['INCGROUPS5K'] = pandas.cut(
    subdata2.incomeperperson,
    list(range(0, 65000, 5000)),
    labels=["2.5k", "7.5k", "12.5k", "17.5k", "22.5k", "27.5k", "32.5k", "37.5k", "42.5k", "47.5k", "52.5k", "57.5k"],
)

# Relative frequency of each band, in band order; rows without a band are ignored.
inc_dist_dollar = subdata2['INCGROUPS5K'].value_counts(sort=False, normalize=True, dropna=True)
print(inc_dist_dollar)
# Group life expectancy into 5-year bands with edges at 45, 50, ..., 90.
# Values outside those edges receive no band (NaN).
subdata2['LIFEEXPGROUPS5Y'] = pandas.cut(
    subdata2.lifeexpectancy,
    list(range(45, 95, 5)),
)

# Relative frequency of each band, in band order; rows without a band are ignored.
lifeexp_dist = subdata2['LIFEEXPGROUPS5Y'].value_counts(sort=False, normalize=True, dropna=True)
print(lifeexp_dist)
The following cross table compares income and life expectancy of different groups
print('First, simplified comparison of income and life expectancy')
# Contingency table of row counts: income bands as rows, life-expectancy
# bands as columns.
comparison = pandas.crosstab(
    index=subdata2['INCGROUPS5K'],
    columns=subdata2['LIFEEXPGROUPS5Y'],
)
print(comparison)
Univariate plots
# Bar chart of how many rows fall into each 5k income band.
seaborn.countplot(data=subdata2, x='INCGROUPS5K')
plt.title('Count distribution of Income per Person')
plt.xlabel('Average Income of Income Group')
Bivariate plots
# Bar chart of life expectancy per income band, drawn without error bars
# (ci=None).
seaborn.catplot(
    data=subdata2,
    x='INCGROUPS5K',
    y='lifeexpectancy',
    kind="bar",
    ci=None,
)
plt.ylabel('Life Expectancy')
plt.xlabel('Income Group')
# Bar chart of the suicide rate per income band, drawn without error bars
# (ci=None).
# NOTE(review): newer seaborn releases deprecate `ci` in favour of
# `errorbar=None`; kept as-is so the script still runs on older versions.
seaborn.catplot(x='INCGROUPS5K', y='suicideper100th', data=subdata2, kind="bar", ci=None)
plt.xlabel('Income Group')
# `suicideper100th` is a rate per 100,000 people ("100th" = 100 thousand),
# so the axis label must not say "per 100 persons".
plt.ylabel('Suicide Rate per 100,000 persons')
0 notes