Mod 4 Week 1
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from io import StringIO
from IPython.display import Image
import pydotplus
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
%matplotlib inline
rnd_state = 23468
data = pd.read_csv('Data/breast_cancer.csv')
data.info()
predictors = data.iloc[:, 2:]
target = data.diagnosis
To train a decision tree, the dataset was split into train and test samples in a 70/30 proportion.
(predictors_train, predictors_test,
 target_train, target_test) = train_test_split(predictors, target,
                                               test_size=.3, random_state=rnd_state)
print('predictors_train:', predictors_train.shape)
print('predictors_test:', predictors_test.shape)
print('target_train:', target_train.shape)
print('target_test:', target_test.shape)

predictors_train: (398, 30)
predictors_test: (171, 30)
target_train: (398,)
target_test: (171,)
print(np.sum(target_train == 0))
print(np.sum(target_train == 1))
classifier = DecisionTreeClassifier(random_state = rnd_state).fit(predictors_train, target_train)
prediction = classifier.predict(predictors_test)
print('Confusion matrix:\n', pd.crosstab(target_test, prediction,
                                         rownames=['Actual'], colnames=['Predicted'],
                                         margins=True))
print('\nAccuracy: ', accuracy_score(target_test, prediction))
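Aside: the StringIO, Image and pydotplus imports above are never used in the post. A plausible sketch of the tree visualization they suggest (standard sklearn/pydotplus usage, my addition rather than the author's code):

out = StringIO()
tree.export_graphviz(classifier, out_file=out,
                     feature_names=predictors.columns,
                     filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(out.getvalue())
Image(graph.create_png())  # renders the tree inline in a Jupyter notebook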
Mod 3 Week 4
import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats import outliers_influence
pandas.set_option('display.float_format', lambda x:'%.3f'%x)
fires = pandas.read_csv('forestfires.csv')
fires = fires.dropna()
months_table = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
days_table = ['sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat']
fires['month'] = [months_table.index(month) for month in fires['month'] ]
fires['day'] = [days_table.index(day) for day in fires['day']]
fires_attributes = list(fires.columns.values)
number_of_columns = len(fires_attributes)
fires['X'] -= min(fires['X'])
fires['Y'] -= min(fires['Y'])
model = smf.ols(formula="area ~ C(X) + C(Y) + C(month) + C(day) + FFMC + DMC + "
                        "DC + ISI + temp + RH + wind + rain", data=fires).fit()
print(model.summary())
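Aside: the outliers_influence import above is never used. A minimal sketch of the multicollinearity check it suggests, computing variance inflation factors on the fitted design matrix (my addition, not from the original post):

from statsmodels.stats.outliers_influence import variance_inflation_factor
exog = model.model.exog  # design matrix of the fitted OLS model
for i, name in enumerate(model.model.exog_names):
    print('%-12s VIF = %.2f' % (name, variance_inflation_factor(exog, i)))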
fires = pandas.read_csv('forestfires.csv')
# Delete rows where any or all of the data are missing
fires = fires.dropna()
# Convert categorical variables (months and days) into dummy variables
fires = pandas.get_dummies(fires, prefix_sep='_')

# Shift (X, Y) coordinates to origin
fires['X'] -= min(fires['X'])
fires['Y'] -= min(fires['Y'])

# X and Y are categorical variables, numerically coded
# -> Convert them into corresponding indicator variables: X_0, X_1, ... Y_0, Y_1, ...
for x in range(min(fires['X']), max(fires['X']) + 1):
    fires["X_{}".format(x)] = 1 * (fires['X'] == x)
fires.drop('X', axis=1, inplace=True)
for y in range(min(fires['Y']), max(fires['Y']) + 1):
    fires["Y_{}".format(y)] = 1 * (fires['Y'] == y)
fires.drop('Y', axis=1, inplace=True)

fires_attributes = list(fires.columns.values)
number_of_columns = len(fires_attributes)

# Logistic regression: the binary logistic model is used to estimate the probability
# of a binary response based on one or more predictor (or independent) variables
# (features). Reference: https://en.wikipedia.org/wiki/Logistic_regression

# Convert the target variable (burned area) into a categorical (binary) variable:
# 0 = no burned area; 1 = some extension of the forest was burned
index_list = fires[fires['area'] > 0.].index.tolist()
fires['area'] = 0.
fires.loc[index_list, 'area'] = 1.

# Center each explanatory variable (exclude the binary target)
#to_be_centered = fires_attributes[fires_attributes.index('FFMC') :
#                                  fires_attributes.index('rain') + 1]
to_be_centered = [attr for attr in fires_attributes if attr != 'area']
for attr in to_be_centered:
    fires[attr] = fires[attr] - fires[attr].mean()

# Display general info about the adjusted dataset
fires.describe().T
Mod 3 Week 3
get_ipython().magic('matplotlib inline')
import pandas
import matplotlib.pyplot as plt
import seaborn
import statsmodels.api as sm
import statsmodels.formula.api as smf
from pandas.plotting import scatter_matrix  # pandas.tools.plotting in older pandas
from math import ceil

pandas.set_option('display.float_format', lambda x: '%.3f' % x)
#pandas.set_option('display.mpl_style', 'default')  # --deprecated
plt.style.use('ggplot')  # Make the graphs a bit prettier
plt.rcParams['figure.figsize'] = (15, 5)
fires = pandas.read_csv('forestfires.csv')
fires.head()
fires_attributes = fires.columns.values.tolist()
number_of_columns = len(fires_attributes)
fires.describe()
attributes = [0, 1] + list(range(4, number_of_columns - 1))
n_cols = 3
n_rows = int(ceil(len(attributes) / n_cols))
fig = plt.figure()
idx = 1
for attr in attributes:
    plt.subplot(n_rows, n_cols, idx)
    plt.plot(fires['area'], fires[fires_attributes[attr]], 'b.')
    # seaborn.regplot(x = fires['area'], y = fires[fires_attributes[attr]],
    #                 scatter = True, color = 'b', data = fires)
    plt.xlabel('area')
    plt.ylabel(fires_attributes[attr])
    idx += 1
plt.show()
print(fires[fires['area'] > 250])
# Plot some other variables
scatter_matrix(fires, figsize=(15, 10))
plt.show()
# Strong skew is apparent in the FFMC, DC, ISI, wind and area variables

# Plot temperature, relative humidity, wind and rain graphs
fires[['temp', 'RH', 'wind', 'rain']].plot()

# Show correlation between variables
print(fires.corr())
def plot_corr(df, size=10):
    '''Function plots a graphical correlation matrix for each pair of columns
    in the dataframe, including the names of the attributes.

    Input:
        df: pandas DataFrame
        size: vertical and horizontal size of the plot

    Code taken from:
    http://stackoverflow.com/questions/29432629/correlation-matrix-using-pandas
    '''
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr, cmap='YlGnBu')
    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)

#plt.matshow(fires.corr())
plot_corr(fires, size=6)
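For comparison, seaborn (already imported above) can draw the same matrix as a heatmap in one call; a quick sketch, not from the original post:

seaborn.heatmap(fires.corr(), cmap='YlGnBu', square=True)
plt.show()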
Mod 3 Week 2
get_ipython().magic('matplotlib inline')
import pandas
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from math import ceil
pandas.set_option('display.float_format', lambda x: '%.3f' % x)
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 5)
fires = pandas.read_csv('forestfires.csv')
print(fires.head())
fires_attributes = fires.columns.values.tolist()
number_of_columns = len(fires_attributes)
statistics = pandas.DataFrame(index=range(0, number_of_columns - 2), columns=('min', 'max', 'mean', 'median', 'std'))
idx = 0
for attr in [0, 1] + list(range(4, number_of_columns)):
    statistics.loc[idx] = {'min': min(fires[fires_attributes[attr]]),
                           'max': max(fires[fires_attributes[attr]]),
                           'mean': fires[fires_attributes[attr]].mean(),
                           'median': fires[fires_attributes[attr]].median(),
                           'std': fires[fires_attributes[attr]].std()}
    idx += 1
statistics.index = [fires_attributes[attr]
                    for attr in [0, 1] + list(range(4, number_of_columns))]
print(statistics.T)
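As a cross-check, pandas can produce nearly the same table directly (describe() reports the median as the 50% percentile); a sketch:

cols = [fires_attributes[attr] for attr in [0, 1] + list(range(4, number_of_columns))]
print(fires[cols].describe().T)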
attributes = [0, 1] + list(range(4, number_of_columns - 1))
n_cols = 3
n_rows = int(ceil(len(attributes) / n_cols))
fig = plt.figure()
idx = 1
for attr in attributes:
    plt.subplot(n_rows, n_cols, idx)
    plt.plot(fires['area'], fires[fires_attributes[attr]], 'b.')
    plt.xlabel('area')
    plt.ylabel(fires_attributes[attr])
    idx += 1
plt.show()
Mod 3 Week 1
Week 1. Introduction to Regression
Assignment: Writing About Your Data
Sample
The sample is from Cortez and Morais's study on predicting forest fires using meteorological data [Cortez and Morais, 2007]. The study covers 517 forest fires in the Montesinho Natural Park (Trás-os-Montes, in northeastern Portugal) from January 2000 to December 2003. For each fire it records meteorological data, the type of vegetation involved (which determines the six components of the Canadian Forest Fire Weather Index (FWI) system; see below) and the total burned area. The goal was to build a model capable of predicting the burned area of small fires, which are the most frequent.
Procedure
This data was built from two sources. The first database was collected by the inspector responsible for the Montesinho fire occurrences. On a daily basis, every time a forest fire occurred, several features were registered, such as the time, date, spatial location within a 9×9 grid (x and y spatial coordinates within the Montesinho park map), the type of vegetation involved, the six components of the FWI system and the total burned area. The second database was collected by the Bragança Polytechnic Institute and contains several weather observations (e.g. wind speed) recorded at 30-minute intervals by a meteorological station located in the center of the Montesinho park. The two databases were stored in tens of individual spreadsheets in distinct formats, and a substantial manual effort was required to integrate them into a single dataset with a total of 517 entries.
Measures
The data contains:
- the location of the fire (x, y spatial coordinates within the Montesinho park map: 1 to 9);
- the month and day of the week the fire occurred (January to December, Monday to Sunday);
- the FWI system components: Fine Fuel Moisture Code (numeric rating of the moisture content of litter and other cured fine fuels: 18.7 to 96.2), Duff Moisture Code (numeric rating of the average moisture content of loosely compacted organic layers of moderate depth: 1.1 to 291.3), Drought Code (numeric rating of the average moisture content of deep, compact organic layers: 7.9 to 860.6) and Initial Spread Index (numeric rating of the expected rate of fire spread: 0.0 to 56.1);
- meteorological variables: temperature (2.2 to 33.3 °C), relative humidity (15 to 100%), wind speed (0.4 to 9.4 km/h) and outside rain (0.0 to 6.4 mm/m²);
- the burned area of the forest as the response variable (0.0 to 1090.84 ha).
References
[Cortez and Morais, 2007] P. Cortez and A. Morais. A Data Mining Approach to Predict Forest Fires using Meteorological Data. In J. Neves, M. F. Santos and J. Machado (Eds.), New Trends in Artificial Intelligence, Proceedings of the 13th EPIA 2007 - Portuguese Conference on Artificial Intelligence, December, Guimarães, Portugal, pp. 512-523, 2007. APPIA, ISBN-13 978-989-95618-0-9. Available at: http://www.dsi.uminho.pt/~pcortez/fires.pdf
Dataset extracted from https://archive.ics.uci.edu/ml/datasets/Forest+Fires
Week 4
import pandas
import numpy
import seaborn
import scipy
import matplotlib.pyplot as plt
nesarc = pandas.read_csv ('nesarc_pds.csv', low_memory=False)
pandas.set_option('display.max_columns', None)
pandas.set_option('display.max_rows', None)
nesarc.columns = map(str.upper, nesarc.columns)
pandas.set_option('display.float_format', lambda x: '%f' % x)

# Change my variables to numeric (to_numeric replaces the removed convert_objects)
nesarc['AGE'] = pandas.to_numeric(nesarc['AGE'], errors='coerce')
nesarc['MAJORDEP12'] = pandas.to_numeric(nesarc['MAJORDEP12'], errors='coerce')
nesarc['S1Q231'] = pandas.to_numeric(nesarc['S1Q231'], errors='coerce')
nesarc['S3BQ1A5'] = pandas.to_numeric(nesarc['S3BQ1A5'], errors='coerce')
nesarc['S3BD5Q2E'] = pandas.to_numeric(nesarc['S3BD5Q2E'], errors='coerce')
subset1 = nesarc[(nesarc['AGE']>=18) & (nesarc['AGE']<=30) & (nesarc['S3BQ1A5']==1)]
subsetc1 = subset1.copy()
subsetc1['S1Q231'] = subsetc1['S1Q231'].replace(9, numpy.nan)
subsetc1['S3BQ1A5']=subsetc1['S3BQ1A5'].replace(9, numpy.nan)
subsetc1['S3BD5Q2E']=subsetc1['S3BD5Q2E'].replace(99, numpy.nan)
subsetc1['S3BD5Q2E']=subsetc1['S3BD5Q2E'].replace('BL', numpy.nan)
recode1 = {1: 9, 2: 8, 3: 7, 4: 6, 5: 5, 6: 4, 7: 3, 8: 2, 9: 1}
subsetc1['CUFREQ'] = subsetc1['S3BD5Q2E'].map(recode1)
subsetc1['CUFREQ'] = subsetc1['CUFREQ'].astype('category')
subsetc1['CUFREQ'] = subsetc1['CUFREQ'].cat.rename_categories(
    ["2 times/year", "3-6 times/year", "7-11 times/year", "Once a month",
     "2-3 times/month", "1-2 times/week", "3-4 times/week",
     "Nearly every day", "Every day"])
contab1 = pandas.crosstab(subsetc1['MAJORDEP12'], subsetc1['CUFREQ'])
print(contab1)
colsum = contab1.sum(axis=0)
colpcontab = contab1 / colsum
print(colpcontab)
print ('Chi-square value, p value, expected counts, for major depression within cannabis use status')
chsq1= scipy.stats.chi2_contingency(contab1)
print (chsq1)
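scipy.stats.chi2_contingency returns a (statistic, p-value, degrees of freedom, expected counts) tuple, so the print above dumps all four at once; unpacking them reads better (the variable names below are mine):

chi2_stat, p_value, dof, expected = chsq1
print('chi2 = %.2f, p = %.4f, dof = %d' % (chi2_stat, p_value, dof))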
plt.figure(figsize=(12,4))
ax1 = seaborn.factorplot(x="CUFREQ", y="MAJORDEP12", data=subsetc1, kind="bar", ci=None)
ax1.set_xticklabels(rotation=40, ha="right")
plt.xlabel('Frequency of cannabis use')
plt.ylabel('Proportion of Major Depression')
plt.show()
recode2 = {1: 10, 2: 9, 3: 8, 4: 7, 5: 6, 6: 5, 7: 4, 8: 3, 9: 2, 10: 1}
subsetc1['CUFREQ2'] = subsetc1['S3BD5Q2E'].map(recode2)
sub1 = subsetc1[(subsetc1['S1Q231'] == 1)]
sub2=subsetc1[(subsetc1['S1Q231']== 2)]
print ('Association between cannabis use status and major depression for those who lost a family member or a close friend in the last 12 months')
contab2 = pandas.crosstab(sub1['MAJORDEP12'], sub1['CUFREQ2'])
print(contab2)
colsum2 = contab2.sum(axis=0)
colpcontab2=contab2/colsum2
print(colpcontab2)
print ('Chi-square value, p value, expected counts')
chsq2= scipy.stats.chi2_contingency(contab2)
print (chsq2)
plt.figure(figsize=(12,4))
ax2 = seaborn.factorplot(x="CUFREQ", y="MAJORDEP12", data=sub1, kind="point", ci=None)
ax2.set_xticklabels(rotation=40, ha="right")
plt.xlabel('Frequency of cannabis use')
plt.ylabel('Proportion of Major Depression')
plt.title('Association between cannabis use status and major depression for those who lost a family member or a close friend in the last 12 months')
plt.show()
Week 3
import pandas
import numpy
import scipy.stats
import seaborn
import matplotlib.pyplot as plt
nesarc = pandas.read_csv ('nesarc_pds.csv' , low_memory=False)
pandas.set_option('display.max_columns', None)
pandas.set_option('display.max_rows', None)
nesarc.columns = map(str.upper, nesarc.columns)
pandas.set_option('display.float_format', lambda x: '%f' % x)
nesarc['AGE'] = pandas.to_numeric(nesarc['AGE'], errors='coerce')
nesarc['S3BQ4'] = pandas.to_numeric(nesarc['S3BQ4'], errors='coerce')
nesarc['S3BQ1A5'] = pandas.to_numeric(nesarc['S3BQ1A5'], errors='coerce')
nesarc['S3BD5Q2B'] = pandas.to_numeric(nesarc['S3BD5Q2B'], errors='coerce')
nesarc['S3BD5Q2E'] = pandas.to_numeric(nesarc['S3BD5Q2E'], errors='coerce')
nesarc['MAJORDEP12'] = pandas.to_numeric(nesarc['MAJORDEP12'], errors='coerce')
nesarc['GENAXDX12'] = pandas.to_numeric(nesarc['GENAXDX12'], errors='coerce')
subset1 = nesarc[(nesarc['AGE']>=18) & (nesarc['AGE']<=30)]
subsetc1 = subset1.copy()
subset2 = nesarc[(nesarc['AGE']>=18) & (nesarc['AGE']<=30) & (nesarc['S3BQ1A5']==1)]
subsetc2 = subset2.copy()
subsetc1['S3BQ1A5']=subsetc1['S3BQ1A5'].replace(9, numpy.nan)
subsetc2['S3BD5Q2E']=subsetc2['S3BD5Q2E'].replace('BL', numpy.nan)
subsetc2['S3BD5Q2E']=subsetc2['S3BD5Q2E'].replace(99, numpy.nan)
contab1=pandas.crosstab(subsetc1['MAJORDEP12'], subsetc1['S3BQ1A5'])
print (contab1)
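The post stops after printing the table; the usual next steps (mirroring the Week 4 post above) would be column percentages and the chi-square test itself. A sketch:

colsum = contab1.sum(axis=0)
colpcontab = contab1 / colsum
print(colpcontab)
print(scipy.stats.chi2_contingency(contab1))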
Week 1
import pandas
import numpy
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
nesarc = pandas.read_csv ('nesarc_pds.csv' , low_memory=False)
pandas.set_option('display.max_columns', None)
pandas.set_option('display.max_rows', None)
nesarc.columns = map(str.upper, nesarc.columns)
pandas.set_option('display.float_format', lambda x: '%f' % x)
nesarc['AGE'] = pandas.to_numeric(nesarc['AGE'], errors='coerce')
nesarc['S3BQ4'] = pandas.to_numeric(nesarc['S3BQ4'], errors='coerce')
nesarc['S3BQ1A5'] = pandas.to_numeric(nesarc['S3BQ1A5'], errors='coerce')
nesarc['S3BD5Q2B'] = pandas.to_numeric(nesarc['S3BD5Q2B'], errors='coerce')
nesarc['S3BD5Q2E'] = pandas.to_numeric(nesarc['S3BD5Q2E'], errors='coerce')
nesarc['MAJORDEP12'] = pandas.to_numeric(nesarc['MAJORDEP12'], errors='coerce')
nesarc['GENAXDX12'] = pandas.to_numeric(nesarc['GENAXDX12'], errors='coerce')
subset5 = nesarc[(nesarc['AGE']>=18) & (nesarc['AGE']<=30) & (nesarc['S3BQ1A5']==1)]
subsetc5 = subset5.copy()
subsetc5['S3BQ4']=subsetc5['S3BQ4'].replace(99, numpy.nan)
subsetc5['S3BQ4']=subsetc5['S3BQ4'].replace('BL', numpy.nan)
sub1 = subsetc5[['S3BQ4', 'MAJORDEP12']].dropna()
model1 = smf.ols(formula='S3BQ4 ~ C(MAJORDEP12)', data=sub1)
results1 = model1.fit()
print (results1.summary())
print ('Means for joints quantity by major depression status')
m1= sub1.groupby('MAJORDEP12').mean()
print (m1)
print ('Standard deviations for joints quantity by major depression status')
sd1 = sub1.groupby('MAJORDEP12').std()
print (sd1)
sub3 = subsetc5[['S3BQ4', 'S3BD5Q2E']].dropna()
mc1 = multi.MultiComparison(sub3['S3BQ4'], sub3['S3BD5Q2E'])
res1 = mc1.tukeyhsd()
print(res1.summary())
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

# load gapminder dataset
data = pd.read_csv('gapminder.csv',low_memory=False)
# lower-case all DataFrame column names
data.columns = map(str.lower, data.columns)
# bug fix for display formats to avoid run time errors
pd.set_option('display.float_format', lambda x: '%f' % x)

# setting variables to be numeric
data['suicideper100th'] = pd.to_numeric(data['suicideper100th'], errors='coerce')
data['breastcancerper100th'] = pd.to_numeric(data['breastcancerper100th'], errors='coerce')
data['hivrate'] = pd.to_numeric(data['hivrate'], errors='coerce')
data['employrate'] = pd.to_numeric(data['employrate'], errors='coerce')

# display summary statistics about the data
# print("Statistics for a Suicide Rate")
# print(data['suicideper100th'].describe())

# subset data for a high suicide rate based on summary statistics
sub = data[(data['suicideper100th']>12)]
# make a copy of my new subsetted data
sub_copy = sub.copy()

# Univariate graph for breast cancer rate for people with a high suicide rate
plt.figure(1)
sb.distplot(sub_copy["breastcancerper100th"].dropna(),kde=False)
plt.xlabel('Breast Cancer Rate')
plt.ylabel('Frequency')
plt.title('Breast Cancer Rate for People with a High Suicide Rate')

# Univariate graph for HIV rate for people with a high suicide rate
plt.figure(2)
sb.distplot(sub_copy["hivrate"].dropna(), kde=False)
plt.xlabel('HIV Rate')
plt.ylabel('Frequency')
plt.title('HIV Rate for People with a High Suicide Rate')

# Univariate graph for employment rate for people with a high suicide rate
plt.figure(3)
sb.distplot(sub_copy["employrate"].dropna(), kde=False)
plt.xlabel('Employment Rate')
plt.ylabel('Frequency')
plt.title('Employment Rate for People with a High Suicide Rate')

# Bivariate graph for association of breast cancer rate with HIV rate
# for people with a high suicide rate
plt.figure(4)
sb.regplot(x="hivrate", y="breastcancerper100th", fit_reg=False, data=sub_copy)
plt.xlabel('HIV Rate')
plt.ylabel('Breast Cancer Rate')
plt.title('Breast Cancer Rate vs. HIV Rate for People with a High Suicide Rate')
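A quick numeric check of the association plotted above; scipy is not imported in the original post, so this is my addition:

import scipy.stats
clean = sub_copy[['hivrate', 'breastcancerper100th']].dropna()
print(scipy.stats.pearsonr(clean['hivrate'], clean['breastcancerper100th']))  # (r, p-value)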
import pandas as pd
# load gapminder dataset
data = pd.read_csv('gapminder.csv', low_memory=False)
# lower-case all DataFrame column names
data.columns = map(str.lower, data.columns)
# bug fix for display formats to avoid run time errors
pd.set_option('display.float_format', lambda x: '%f' % x)

# setting variables to be numeric
data['suicideper100th'] = pd.to_numeric(data['suicideper100th'], errors='coerce')
data['breastcancerper100th'] = pd.to_numeric(data['breastcancerper100th'], errors='coerce')
data['hivrate'] = pd.to_numeric(data['hivrate'], errors='coerce')
data['employrate'] = pd.to_numeric(data['employrate'], errors='coerce')
# display summary statistics about the data
print("Statistics for a Suicide Rate")
print(data['suicideper100th'].describe())

# subset data for a high suicide rate based on summary statistics
sub = data[(data['suicideper100th'] > 12)]
# make a copy of my new subsetted data
sub_copy = sub.copy()

# BREAST CANCER RATE
# group the data in 4 groups by quartile function and record it into new variable bcgroup4
sub_copy['bcgroup4'] = pd.qcut(sub_copy.breastcancerper100th, 4,
                               labels=["0% tile", "25% tile", "50% tile", "75% tile"])
# frequency and percentage for 4 groups of breast cancer cases with a high suicide rate
bc = sub_copy['bcgroup4'].value_counts(sort=False, dropna=False)
pbc = sub_copy['bcgroup4'].value_counts(sort=False, dropna=False, normalize=True) * 100

# cumulative frequency and cumulative percentage for 4 groups of breast cancer cases
bc1 = []   # Cumulative Frequency
pbc1 = []  # Cumulative Percentage
cf = 0
for freq in bc:
    cf = cf + freq
    bc1.append(cf)
    pf = cf * 100 / len(sub_copy)
    pbc1.append(pf)

print('Number of Breast Cancer Cases with a High Suicide Rate')
fmt1 = '%10s %9s %9s %12s %13s'
fmt2 = '%9s %9.d %10.2f %9.d %13.2f'
print(fmt1 % ('# of Cases', 'Freq.', 'Percent', 'Cum. Freq.', 'Cum. Percent'))
for i, (key, var1, var2, var3, var4) in enumerate(zip(bc.keys(), bc, pbc, bc1, pbc1)):
    print(fmt2 % (key, var1, var2, var3, var4))

# HIV RATE
# frequency and percentage distributions for HIV rate with a high suicide rate;
# include the count of missing data and group the variables in 4 groups by quartile function
# group the data in 4 groups and record it into new variable hcgroup4
sub_copy['hcgroup4'] = pd.qcut(sub_copy.hivrate, 4,
                               labels=["0% tile", "25% tile", "50% tile", "75% tile"])
# frequency for 4 groups of HIV rate with a high suicide rate
hc = sub_copy['hcgroup4'].value_counts(sort=False, dropna=False)
# percentage for 4 groups of HIV rate with a high suicide rate
phc = sub_copy['hcgroup4'].value_counts(sort=False, dropna=False, normalize=True) * 100

# cumulative frequency and cumulative percentage for 4 groups of HIV rate
hc1 = []   # Cumulative Frequency
phc1 = []  # Cumulative Percentage
cf = 0
for freq in hc:
    cf = cf + freq
    hc1.append(cf)
    pf = cf * 100 / len(sub_copy)
    phc1.append(pf)

print('HIV Rate with a High Suicide Rate')
print(fmt1 % ('Rate', 'Freq.', 'Percent', 'Cum. Freq.', 'Cum. Percent'))
for i, (key, var1, var2, var3, var4) in enumerate(zip(hc.keys(), hc, phc, hc1, phc1)):
    print(fmt2 % (key, var1, var2, var3, var4))

# EMPLOYMENT RATE
# frequency and percentage distributions for employment rate with a high suicide rate;
# group the data in 5 groups and record it into new variable ecgroup4
def ecgroup4(row):
    if row['employrate'] >= 32 and row['employrate'] < 51:
        return 1
    elif row['employrate'] >= 51 and row['employrate'] < 59:
        return 2
    elif row['employrate'] >= 59 and row['employrate'] < 65:
        return 3
    elif row['employrate'] >= 65 and row['employrate'] < 84:
        return 4
    else:
        return 5  # record for NAN values

sub_copy['ecgroup4'] = sub_copy.apply(lambda row: ecgroup4(row), axis=1)

# frequency for 5 groups of employment rate with a high suicide rate
ec = sub_copy['ecgroup4'].value_counts(sort=False, dropna=False)
# percentage for 5 groups of employment rate with a high suicide rate
pec = sub_copy['ecgroup4'].value_counts(sort=False, dropna=False, normalize=True) * 100

# cumulative frequency and cumulative percentage for 5 groups of employment rate
ec1 = []   # Cumulative Frequency
pec1 = []  # Cumulative Percentage
cf = 0
for freq in ec:
    cf = cf + freq
    ec1.append(cf)
    pf = cf * 100 / len(sub_copy)
    pec1.append(pf)

print('Employment Rate with a High Suicide Rate')
print(fmt1 % ('Rate', 'Freq.', 'Percent', 'Cum. Freq.', 'Cum. Percent'))
for i, (key, var1, var2, var3, var4) in enumerate(zip(ec.keys(), ec, pec, ec1, pec1)):
    print(fmt2 % (key, var1, var2, var3, var4))
import pandas as pd
import numpy as np
# load gapminder dataset
data = pd.read_csv('gapminder.csv',low_memory=False)
# lower-case all DataFrame column names
data.columns = map(str.lower, data.columns)

# bug fix for display formats to avoid run time errors
pd.set_option('display.float_format', lambda x: '%f' % x)

# setting variables to be numeric
data['suicideper100th'] = pd.to_numeric(data['suicideper100th'], errors='coerce')
data['breastcancerper100th'] = pd.to_numeric(data['breastcancerper100th'], errors='coerce')
data['hivrate'] = pd.to_numeric(data['hivrate'], errors='coerce')
data['employrate'] = pd.to_numeric(data['employrate'], errors='coerce')
# display summary statistics about the data
print("Statistics for a Suicide Rate")
print(data['suicideper100th'].describe())

# subset data for a high suicide rate based on summary statistics
sub = data[(data['suicideper100th']>12)]
# make a copy of my new subsetted data
sub_copy = sub.copy()

# BREAST CANCER RATE
# frequency and percentage distributions for the number of breast cancer cases
# with a high suicide rate
#print('frequency for a number of breast cancer cases with a high suicide rate')
bc = sub_copy['breastcancerper100th'].value_counts(sort=False, bins=10)
#print(bc)

#print('percentage for a number of breast cancer cases with a high suicide rate')
pbc = sub_copy['breastcancerper100th'].value_counts(sort=False, bins=10, normalize=True) * 100
#print(pbc)
# cumulative frequency and cumulative percentage for a number of breast cancer cases with a high suicide rate
bc1 = []   # Cumulative Frequency
pbc1 = []  # Cumulative Percentage
cf = 0
for freq in bc:
    cf = cf + freq
    bc1.append(cf)
    pf = cf * 100 / len(sub_copy)
    pbc1.append(pf)
#print('cumulative frequency for a number of breast cancer cases with a high suicide rate')
#print(bc1)
#print('cumulative percentage for a number of breast cancer cases with a high suicide rate')
#print(pbc1)
print('Number of Breast Cancer Cases with a High Suicide Rate')
fmt1 = '%s %7s %9s %12s %12s'
fmt2 = '%5.2f %10.d %10.2f %10.d %12.2f'
print(fmt1 % ('# of Cases','Freq.','Percent','Cum. Freq.','Cum. Percent'))
for i, (key, var1, var2, var3, var4) in enumerate(zip(bc.keys(), bc, pbc, bc1, pbc1)):
    print(fmt2 % (key, var1, var2, var3, var4))

fmt3 = '%5s %10s %10s %10s %12s'
print(fmt3 % ('NA', '2', '3.77', '53', '100.00'))
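Aside: the cumulative columns built by hand above can also be produced directly with pandas; a sketch:

summary = pd.DataFrame({'Freq.': bc, 'Percent': pbc})
summary['Cum. Freq.'] = summary['Freq.'].cumsum()
summary['Cum. Percent'] = summary['Cum. Freq.'] * 100 / len(sub_copy)
print(summary)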
Analysing fertility rate data
Data set: GapMinder Data.

Research question: Is fertility rate associated with the number of breast cancer cases?

Items included in the CodeBook:

For fertility rate:
- Children per woman (total fertility)
- Children per woman (total fertility), with projections

For breast cancer:
- Breast cancer, deaths per 100,000 women
- Breast cancer, new cases per 100,000 women
- Breast cancer, number of female deaths
- Breast cancer, number of new female cases

Literature review: From the original source (http://ww5.komen.org/KomenPerspectives/Does-pregnancy-affect-breast-cancer-risk-and-survival-.html): the more children a woman has given birth to, the lower her risk of breast cancer tends to be. Women who have never given birth have a slightly higher risk of breast cancer compared to women who have had more than one child.

The hypothesis to explore using the GapMinder data set: the higher the fertility rate, the lower the risk of breast cancer.
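A minimal sketch of how this hypothesis could be checked in pandas; the fertility column name below is hypothetical, since the exact naming in the GapMinder extract is not given in the post:

import pandas as pd
df = pd.read_csv('gapminder.csv', low_memory=False)
fert = pd.to_numeric(df['fertilityrate'], errors='coerce')  # hypothetical column name
bc_new = pd.to_numeric(df['breastcancerper100th'], errors='coerce')
print(fert.corr(bc_new))  # the hypothesis predicts a negative correlation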