Mod 4 Week 1
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from io import StringIO
from IPython.display import Image
import pydotplus
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
%matplotlib inline
rnd_state = 23468
data = pd.read_csv('Data/breast_cancer.csv')
data.info()
predictors = data.iloc[:, 2:]
target = data.diagnosis
To train a decision tree, the dataset was split into train and test samples in a 70/30 proportion.
(predictors_train, predictors_test,
 target_train, target_test) = train_test_split(predictors, target,
                                               test_size=.3, random_state=rnd_state)
print('predictors_train:', predictors_train.shape)
print('predictors_test:', predictors_test.shape)
print('target_train:', target_train.shape)
print('target_test:', target_test.shape)

predictors_train: (398, 30)
predictors_test: (171, 30)
target_train: (398,)
target_test: (171,)
print(np.sum(target_train == 0))
print(np.sum(target_train == 1))
classifier = DecisionTreeClassifier(random_state = rnd_state).fit(predictors_train, target_train)
prediction = classifier.predict(predictors_test)
print('Confusion matrix:\n', pd.crosstab(target_test, prediction,
                                         rownames=['Actual'], colnames=['Predicted'],
                                         margins=True))
print('\nAccuracy: ', accuracy_score(target_test, prediction))
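Aside: the StringIO, Image and pydotplus imports above are never used in the post. A plausible sketch of the tree visualization they suggest (standard sklearn/pydotplus usage, my addition rather than the author's code):

out = StringIO()
tree.export_graphviz(classifier, out_file=out,
                     feature_names=predictors.columns,
                     filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(out.getvalue())
Image(graph.create_png())  # renders the tree inline in a Jupyter notebook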
Mod 3 Week 4
import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats import outliers_influence
pandas.set_option('display.float_format', lambda x:'%.3f'%x)
fires = pandas.read_csv('forestfires.csv')
fires = fires.dropna()
months_table = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
days_table = ['sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat']
fires['month'] = [months_table.index(month) for month in fires['month'] ]
fires['day'] = [days_table.index(day) for day in fires['day']]
fires_attributes = list(fires.columns.values)
number_of_columns = len(fires_attributes)
fires['X'] -= min(fires['X'])
fires['Y'] -= min(fires['Y'])
model = smf.ols(formula="area ~ C(X) + C(Y) + C(month) + C(day) + FFMC + DMC + "
                        "DC + ISI + temp + RH + wind + rain", data=fires).fit()
print(model.summary())
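Aside: the outliers_influence import above is never used. A minimal sketch of the multicollinearity check it suggests, computing variance inflation factors on the fitted design matrix (my addition, not from the original post):

from statsmodels.stats.outliers_influence import variance_inflation_factor
exog = model.model.exog  # design matrix of the fitted OLS model
for i, name in enumerate(model.model.exog_names):
    print('%-12s VIF = %.2f' % (name, variance_inflation_factor(exog, i)))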
fires = pandas.read_csv('forestfires.csv')
# Delete rows where any or all of the data are missing
fires = fires.dropna()
# Convert categorical variables (months and days) into dummy variables
fires = pandas.get_dummies(fires, prefix_sep='_')

# Shift (X, Y) coordinates to origin
fires['X'] -= min(fires['X'])
fires['Y'] -= min(fires['Y'])

# X and Y are categorical variables, numerically coded
# -> Convert them into corresponding indicator variables: X_0, X_1, ... Y_0, Y_1, ...
for x in range(min(fires['X']), max(fires['X']) + 1):
    fires["X_{}".format(x)] = 1 * (fires['X'] == x)
fires.drop('X', axis=1, inplace=True)
for y in range(min(fires['Y']), max(fires['Y']) + 1):
    fires["Y_{}".format(y)] = 1 * (fires['Y'] == y)
fires.drop('Y', axis=1, inplace=True)

fires_attributes = list(fires.columns.values)
number_of_columns = len(fires_attributes)

# Logistic regression: the binary logistic model is used to estimate the probability
# of a binary response based on one or more predictor (or independent) variables
# (features). Reference: https://en.wikipedia.org/wiki/Logistic_regression

# Convert the target variable (burned area) into a categorical (binary) variable:
# 0 = no burned area; 1 = some extension of the forest was burned
index_list = fires[fires['area'] > 0.].index.tolist()
fires['area'] = 0.
fires.loc[index_list, 'area'] = 1.

# Center each explanatory variable (exclude the binary target)
#to_be_centered = fires_attributes[fires_attributes.index('FFMC') :
#                                  fires_attributes.index('rain') + 1]
to_be_centered = [attr for attr in fires_attributes if attr != 'area']
for attr in to_be_centered:
    fires[attr] = fires[attr] - fires[attr].mean()

# Display general info about the adjusted dataset
fires.describe().T
Mod 3 Week 3
get_ipython().magic('matplotlib inline')
import pandas
import matplotlib.pyplot as plt
import seaborn
import statsmodels.api as sm
import statsmodels.formula.api as smf
from pandas.plotting import scatter_matrix  # pandas.tools.plotting in older pandas
from math import ceil

pandas.set_option('display.float_format', lambda x: '%.3f' % x)
#pandas.set_option('display.mpl_style', 'default')  # --deprecated
plt.style.use('ggplot')  # Make the graphs a bit prettier
plt.rcParams['figure.figsize'] = (15, 5)
fires = pandas.read_csv('forestfires.csv')
fires.head()
fires_attributes = fires.columns.values.tolist()
number_of_columns = len(fires_attributes)
fires.describe()
attributes = [0, 1] + list(range(4, number_of_columns - 1))
n_cols = 3
n_rows = int(ceil(len(attributes) / n_cols))
fig = plt.figure()
idx = 1
for attr in attributes:
    plt.subplot(n_rows, n_cols, idx)
    plt.plot(fires['area'], fires[fires_attributes[attr]], 'b.')
    # seaborn.regplot(x = fires['area'], y = fires[fires_attributes[attr]],
    #                 scatter = True, color = 'b', data = fires)
    plt.xlabel('area')
    plt.ylabel(fires_attributes[attr])
    idx += 1
plt.show()
print(fires[fires['area'] > 250])
# Plot some other variables
scatter_matrix(fires, figsize=(15, 10))
plt.show()
# Strong skew is apparent in the FFMC, DC, ISI, wind and area variables

# Plot temperature, relative humidity, wind and rain graphs
fires[['temp', 'RH', 'wind', 'rain']].plot()

# Show correlation between variables
print(fires.corr())
def plot_corr(df, size=10):
    '''Function plots a graphical correlation matrix for each pair of columns
    in the dataframe, including the names of the attributes.

    Input:
        df: pandas DataFrame
        size: vertical and horizontal size of the plot

    Code taken from:
    http://stackoverflow.com/questions/29432629/correlation-matrix-using-pandas
    '''
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr, cmap='YlGnBu')
    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)

#plt.matshow(fires.corr())
plot_corr(fires, size=6)
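For comparison, seaborn (already imported above) can draw the same matrix as a heatmap in one call; a quick sketch, not from the original post:

seaborn.heatmap(fires.corr(), cmap='YlGnBu', square=True)
plt.show()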
Mod 3 Week 2
get_ipython().magic('matplotlib inline')
import pandas
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from math import ceil
pandas.set_option('display.float_format', lambda x: '%.3f' % x)
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 5)
fires = pandas.read_csv('forestfires.csv')
print(fires.head())
fires_attributes = fires.columns.values.tolist()
number_of_columns = len(fires_attributes)
statistics = pandas.DataFrame(index=range(0, number_of_columns - 2), columns=('min', 'max', 'mean', 'median', 'std'))
idx = 0
for attr in [0, 1] + list(range(4, number_of_columns)):
    statistics.loc[idx] = {'min': min(fires[fires_attributes[attr]]),
                           'max': max(fires[fires_attributes[attr]]),
                           'mean': fires[fires_attributes[attr]].mean(),
                           'median': fires[fires_attributes[attr]].median(),
                           'std': fires[fires_attributes[attr]].std()}
    idx += 1
statistics.index = [fires_attributes[attr]
                    for attr in [0, 1] + list(range(4, number_of_columns))]
print(statistics.T)
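As a cross-check, pandas can produce nearly the same table directly (describe() reports the median as the 50% percentile); a sketch:

cols = [fires_attributes[attr] for attr in [0, 1] + list(range(4, number_of_columns))]
print(fires[cols].describe().T)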
attributes = [0, 1] + list(range(4, number_of_columns - 1))
n_cols = 3
n_rows = int(ceil(len(attributes) / n_cols))
fig = plt.figure()
idx = 1
for attr in attributes:
    plt.subplot(n_rows, n_cols, idx)
    plt.plot(fires['area'], fires[fires_attributes[attr]], 'b.')
    plt.xlabel('area')
    plt.ylabel(fires_attributes[attr])
    idx += 1
plt.show()
Mod 3 Week 1
Week 1. Introduction to Regression
Assignment: Writing About Your Data
Sample
The sample is from Cortez and Morais's study on predicting forest fires using meteorological data [Cortez and Morais, 2007]. The study covers 517 forest fires in the Montesinho Natural Park (Trás-os-Montes, in northeastern Portugal) from January 2000 to December 2003. For each fire it records meteorological data, the type of vegetation involved (which determines the six components of the Canadian Forest Fire Weather Index (FWI) system; see below) and the total burned area. The goal was to build a model capable of predicting the burned area of small fires, which are the most frequent.
Procedure
This data was built from two sources. The first database was collected by the inspector responsible for the Montesinho fire occurrences. On a daily basis, every time a forest fire occurred, several features were registered, such as the time, date, spatial location within a 9×9 grid (x and y spatial coordinates within the Montesinho park map), the type of vegetation involved, the six components of the FWI system and the total burned area. The second database was collected by the Bragança Polytechnic Institute and contains several weather observations (e.g. wind speed) recorded at 30-minute intervals by a meteorological station located in the center of the Montesinho park. The two databases were stored in tens of individual spreadsheets in distinct formats, and a substantial manual effort was required to integrate them into a single dataset with a total of 517 entries.
Measures
The data contains:
- the location of the fire (x, y spatial coordinates within the Montesinho park map: 1 to 9);
- the month and day of the week the fire occurred (January to December, Monday to Sunday);
- the FWI system components: Fine Fuel Moisture Code (numeric rating of the moisture content of litter and other cured fine fuels: 18.7 to 96.2), Duff Moisture Code (numeric rating of the average moisture content of loosely compacted organic layers of moderate depth: 1.1 to 291.3), Drought Code (numeric rating of the average moisture content of deep, compact organic layers: 7.9 to 860.6) and Initial Spread Index (numeric rating of the expected rate of fire spread: 0.0 to 56.1);
- meteorological variables: temperature (2.2 to 33.3 °C), relative humidity (15 to 100%), wind speed (0.4 to 9.4 km/h) and outside rain (0.0 to 6.4 mm/m²);
- the burned area of the forest as the response variable (0.0 to 1090.84 ha).
References
[Cortez and Morais, 2007] P. Cortez and A. Morais. A Data Mining Approach to Predict Forest Fires using Meteorological Data. In J. Neves, M. F. Santos and J. Machado (Eds.), New Trends in Artificial Intelligence, Proceedings of the 13th EPIA 2007 - Portuguese Conference on Artificial Intelligence, December, Guimarães, Portugal, pp. 512-523, 2007. APPIA, ISBN-13 978-989-95618-0-9. Available at: http://www.dsi.uminho.pt/~pcortez/fires.pdf
Dataset extracted from https://archive.ics.uci.edu/ml/datasets/Forest+Fires
Week 4
import pandas
import numpy
import seaborn
import scipy
import matplotlib.pyplot as plt
nesarc = pandas.read_csv ('nesarc_pds.csv', low_memory=False)
pandas.set_option('display.max_columns', None)
pandas.set_option('display.max_rows', None)
nesarc.columns = map(str.upper, nesarc.columns)
pandas.set_option('display.float_format', lambda x: '%f' % x)

# Change my variables to numeric (to_numeric replaces the removed convert_objects)
nesarc['AGE'] = pandas.to_numeric(nesarc['AGE'], errors='coerce')
nesarc['MAJORDEP12'] = pandas.to_numeric(nesarc['MAJORDEP12'], errors='coerce')
nesarc['S1Q231'] = pandas.to_numeric(nesarc['S1Q231'], errors='coerce')
nesarc['S3BQ1A5'] = pandas.to_numeric(nesarc['S3BQ1A5'], errors='coerce')
nesarc['S3BD5Q2E'] = pandas.to_numeric(nesarc['S3BD5Q2E'], errors='coerce')
subset1 = nesarc[(nesarc['AGE']>=18) & (nesarc['AGE']<=30) & (nesarc['S3BQ1A5']==1)]
subsetc1 = subset1.copy()
subsetc1['S1Q231'] = subsetc1['S1Q231'].replace(9, numpy.nan)
subsetc1['S3BQ1A5']=subsetc1['S3BQ1A5'].replace(9, numpy.nan)
subsetc1['S3BD5Q2E']=subsetc1['S3BD5Q2E'].replace(99, numpy.nan)
subsetc1['S3BD5Q2E']=subsetc1['S3BD5Q2E'].replace('BL', numpy.nan)
recode1 = {1: 9, 2: 8, 3: 7, 4: 6, 5: 5, 6: 4, 7: 3, 8: 2, 9: 1}
subsetc1['CUFREQ'] = subsetc1['S3BD5Q2E'].map(recode1)
subsetc1['CUFREQ'] = subsetc1['CUFREQ'].astype('category')
subsetc1['CUFREQ'] = subsetc1['CUFREQ'].cat.rename_categories(
    ["2 times/year", "3-6 times/year", "7-11 times/year", "Once a month",
     "2-3 times/month", "1-2 times/week", "3-4 times/week",
     "Nearly every day", "Every day"])
contab1 = pandas.crosstab(subsetc1['MAJORDEP12'], subsetc1['CUFREQ'])
print(contab1)
colsum = contab1.sum(axis=0)
colpcontab = contab1 / colsum
print(colpcontab)
print ('Chi-square value, p value, expected counts, for major depression within cannabis use status')
chsq1= scipy.stats.chi2_contingency(contab1)
print (chsq1)
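scipy.stats.chi2_contingency returns a (statistic, p-value, degrees of freedom, expected counts) tuple, so the print above dumps all four at once; unpacking them reads better (the variable names below are mine):

chi2_stat, p_value, dof, expected = chsq1
print('chi2 = %.2f, p = %.4f, dof = %d' % (chi2_stat, p_value, dof))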
plt.figure(figsize=(12,4))
ax1 = seaborn.factorplot(x="CUFREQ", y="MAJORDEP12", data=subsetc1, kind="bar", ci=None)
ax1.set_xticklabels(rotation=40, ha="right")
plt.xlabel('Frequency of cannabis use')
plt.ylabel('Proportion of Major Depression')
plt.show()
recode2 = {1: 10, 2: 9, 3: 8, 4: 7, 5: 6, 6: 5, 7: 4, 8: 3, 9: 2, 10: 1}
subsetc1['CUFREQ2'] = subsetc1['S3BD5Q2E'].map(recode2)
sub1 = subsetc1[(subsetc1['S1Q231'] == 1)]
sub2=subsetc1[(subsetc1['S1Q231']== 2)]
print ('Association between cannabis use status and major depression for those who lost a family member or a close friend in the last 12 months')
contab2 = pandas.crosstab(sub1['MAJORDEP12'], sub1['CUFREQ2'])
print(contab2)
colsum2 = contab2.sum(axis=0)
colpcontab2=contab2/colsum2
print(colpcontab2)
print ('Chi-square value, p value, expected counts')
chsq2= scipy.stats.chi2_contingency(contab2)
print (chsq2)
plt.figure(figsize=(12,4))
ax2 = seaborn.factorplot(x="CUFREQ", y="MAJORDEP12", data=sub1, kind="point", ci=None)
ax2.set_xticklabels(rotation=40, ha="right")
plt.xlabel('Frequency of cannabis use')
plt.ylabel('Proportion of Major Depression')
plt.title('Association between cannabis use status and major depression for those who lost a family member or a close friend in the last 12 months')
plt.show()
Week 3
import pandas
import numpy
import scipy.stats
import seaborn
import matplotlib.pyplot as plt
nesarc = pandas.read_csv ('nesarc_pds.csv' , low_memory=False)
pandas.set_option('display.max_columns', None)
pandas.set_option('display.max_rows', None)
nesarc.columns = map(str.upper, nesarc.columns)
pandas.set_option('display.float_format', lambda x: '%f' % x)
nesarc['AGE'] = pandas.to_numeric(nesarc['AGE'], errors='coerce')
nesarc['S3BQ4'] = pandas.to_numeric(nesarc['S3BQ4'], errors='coerce')
nesarc['S3BQ1A5'] = pandas.to_numeric(nesarc['S3BQ1A5'], errors='coerce')
nesarc['S3BD5Q2B'] = pandas.to_numeric(nesarc['S3BD5Q2B'], errors='coerce')
nesarc['S3BD5Q2E'] = pandas.to_numeric(nesarc['S3BD5Q2E'], errors='coerce')
nesarc['MAJORDEP12'] = pandas.to_numeric(nesarc['MAJORDEP12'], errors='coerce')
nesarc['GENAXDX12'] = pandas.to_numeric(nesarc['GENAXDX12'], errors='coerce')
subset1 = nesarc[(nesarc['AGE']>=18) & (nesarc['AGE']<=30)]
subsetc1 = subset1.copy()
subset2 = nesarc[(nesarc['AGE']>=18) & (nesarc['AGE']<=30) & (nesarc['S3BQ1A5']==1)]
subsetc2 = subset2.copy()
subsetc1['S3BQ1A5']=subsetc1['S3BQ1A5'].replace(9, numpy.nan)
subsetc2['S3BD5Q2E']=subsetc2['S3BD5Q2E'].replace('BL', numpy.nan)
subsetc2['S3BD5Q2E']=subsetc2['S3BD5Q2E'].replace(99, numpy.nan)
contab1=pandas.crosstab(subsetc1['MAJORDEP12'], subsetc1['S3BQ1A5'])
print (contab1)
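The post stops after printing the table; the usual next steps (mirroring the Week 4 post above) would be column percentages and the chi-square test itself. A sketch:

colsum = contab1.sum(axis=0)
colpcontab = contab1 / colsum
print(colpcontab)
print(scipy.stats.chi2_contingency(contab1))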
Week 1
import pandas
import numpy
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
nesarc = pandas.read_csv ('nesarc_pds.csv' , low_memory=False)
pandas.set_option('display.max_columns', None)
pandas.set_option('display.max_rows', None)
nesarc.columns = map(str.upper, nesarc.columns)
pandas.set_option('display.float_format', lambda x: '%f' % x)
nesarc['AGE'] = pandas.to_numeric(nesarc['AGE'], errors='coerce')
nesarc['S3BQ4'] = pandas.to_numeric(nesarc['S3BQ4'], errors='coerce')
nesarc['S3BQ1A5'] = pandas.to_numeric(nesarc['S3BQ1A5'], errors='coerce')
nesarc['S3BD5Q2B'] = pandas.to_numeric(nesarc['S3BD5Q2B'], errors='coerce')
nesarc['S3BD5Q2E'] = pandas.to_numeric(nesarc['S3BD5Q2E'], errors='coerce')
nesarc['MAJORDEP12'] = pandas.to_numeric(nesarc['MAJORDEP12'], errors='coerce')
nesarc['GENAXDX12'] = pandas.to_numeric(nesarc['GENAXDX12'], errors='coerce')
subset5 = nesarc[(nesarc['AGE']>=18) & (nesarc['AGE']<=30) & (nesarc['S3BQ1A5']==1)]
subsetc5 = subset5.copy()
subsetc5['S3BQ4']=subsetc5['S3BQ4'].replace(99, numpy.nan)
subsetc5['S3BQ4']=subsetc5['S3BQ4'].replace('BL', numpy.nan)
sub1 = subsetc5[['S3BQ4', 'MAJORDEP12']].dropna()
model1 = smf.ols(formula='S3BQ4 ~ C(MAJORDEP12)', data=sub1)
results1 = model1.fit()
print (results1.summary())
print ('Means for joints quantity by major depression status')
m1= sub1.groupby('MAJORDEP12').mean()
print (m1)
print ('Standard deviations for joints quantity by major depression status')
sd1 = sub1.groupby('MAJORDEP12').std()
print (sd1)
sub3 = subsetc5[['S3BQ4', 'S3BD5Q2E']].dropna()
mc1 = multi.MultiComparison(sub3['S3BQ4'], sub3['S3BD5Q2E'])
res1 = mc1.tukeyhsd()
print(res1.summary())
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

# load gapminder dataset
data = pd.read_csv('gapminder.csv',low_memory=False)
# lower-case all DataFrame column names
data.columns = map(str.lower, data.columns)
# bug fix for display formats to avoid run time errors
pd.set_option('display.float_format', lambda x: '%f' % x)

# setting variables to be numeric
data['suicideper100th'] = pd.to_numeric(data['suicideper100th'], errors='coerce')
data['breastcancerper100th'] = pd.to_numeric(data['breastcancerper100th'], errors='coerce')
data['hivrate'] = pd.to_numeric(data['hivrate'], errors='coerce')
data['employrate'] = pd.to_numeric(data['employrate'], errors='coerce')

# display summary statistics about the data
# print("Statistics for a Suicide Rate")
# print(data['suicideper100th'].describe())

# subset data for a high suicide rate based on summary statistics
sub = data[(data['suicideper100th']>12)]
# make a copy of my new subsetted data
sub_copy = sub.copy()

# Univariate graph for breast cancer rate for people with a high suicide rate
plt.figure(1)
sb.distplot(sub_copy["breastcancerper100th"].dropna(),kde=False)
plt.xlabel('Breast Cancer Rate')
plt.ylabel('Frequency')
plt.title('Breast Cancer Rate for People with a High Suicide Rate')

# Univariate graph for HIV rate for people with a high suicide rate
plt.figure(2)
sb.distplot(sub_copy["hivrate"].dropna(), kde=False)
plt.xlabel('HIV Rate')
plt.ylabel('Frequency')
plt.title('HIV Rate for People with a High Suicide Rate')

# Univariate graph for employment rate for people with a high suicide rate
plt.figure(3)
sb.distplot(sub_copy["employrate"].dropna(), kde=False)
plt.xlabel('Employment Rate')
plt.ylabel('Frequency')
plt.title('Employment Rate for People with a High Suicide Rate')

# Bivariate graph for association of breast cancer rate with HIV rate
# for people with a high suicide rate
plt.figure(4)
sb.regplot(x="hivrate", y="breastcancerper100th", fit_reg=False, data=sub_copy)
plt.xlabel('HIV Rate')
plt.ylabel('Breast Cancer Rate')
plt.title('Breast Cancer Rate vs. HIV Rate for People with a High Suicide Rate')
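A quick numeric check of the association plotted above; scipy is not imported in the original post, so this is my addition:

import scipy.stats
clean = sub_copy[['hivrate', 'breastcancerper100th']].dropna()
print(scipy.stats.pearsonr(clean['hivrate'], clean['breastcancerper100th']))  # (r, p-value)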
import pandas as pd
# load gapminder dataset
data = pd.read_csv('gapminder.csv', low_memory=False)
# lower-case all DataFrame column names
data.columns = map(str.lower, data.columns)
# bug fix for display formats to avoid run time errors
pd.set_option('display.float_format', lambda x: '%f' % x)

# setting variables to be numeric
data['suicideper100th'] = pd.to_numeric(data['suicideper100th'], errors='coerce')
data['breastcancerper100th'] = pd.to_numeric(data['breastcancerper100th'], errors='coerce')
data['hivrate'] = pd.to_numeric(data['hivrate'], errors='coerce')
data['employrate'] = pd.to_numeric(data['employrate'], errors='coerce')
# display summary statistics about the data
print("Statistics for a Suicide Rate")
print(data['suicideper100th'].describe())

# subset data for a high suicide rate based on summary statistics
sub = data[(data['suicideper100th'] > 12)]
# make a copy of my new subsetted data
sub_copy = sub.copy()

# BREAST CANCER RATE
# group the data in 4 groups by quartile function and record it into new variable bcgroup4
sub_copy['bcgroup4'] = pd.qcut(sub_copy.breastcancerper100th, 4,
                               labels=["0% tile", "25% tile", "50% tile", "75% tile"])
# frequency and percentage for 4 groups of breast cancer cases with a high suicide rate
bc = sub_copy['bcgroup4'].value_counts(sort=False, dropna=False)
pbc = sub_copy['bcgroup4'].value_counts(sort=False, dropna=False, normalize=True) * 100

# cumulative frequency and cumulative percentage for 4 groups of breast cancer cases
bc1 = []   # Cumulative Frequency
pbc1 = []  # Cumulative Percentage
cf = 0
for freq in bc:
    cf = cf + freq
    bc1.append(cf)
    pf = cf * 100 / len(sub_copy)
    pbc1.append(pf)

print('Number of Breast Cancer Cases with a High Suicide Rate')
fmt1 = '%10s %9s %9s %12s %13s'
fmt2 = '%9s %9.d %10.2f %9.d %13.2f'
print(fmt1 % ('# of Cases', 'Freq.', 'Percent', 'Cum. Freq.', 'Cum. Percent'))
for i, (key, var1, var2, var3, var4) in enumerate(zip(bc.keys(), bc, pbc, bc1, pbc1)):
    print(fmt2 % (key, var1, var2, var3, var4))

# HIV RATE
# frequency and percentage distributions for HIV rate with a high suicide rate;
# include the count of missing data and group the variables in 4 groups by quartile function
# group the data in 4 groups and record it into new variable hcgroup4
sub_copy['hcgroup4'] = pd.qcut(sub_copy.hivrate, 4,
                               labels=["0% tile", "25% tile", "50% tile", "75% tile"])
# frequency for 4 groups of HIV rate with a high suicide rate
hc = sub_copy['hcgroup4'].value_counts(sort=False, dropna=False)
# percentage for 4 groups of HIV rate with a high suicide rate
phc = sub_copy['hcgroup4'].value_counts(sort=False, dropna=False, normalize=True) * 100

# cumulative frequency and cumulative percentage for 4 groups of HIV rate
hc1 = []   # Cumulative Frequency
phc1 = []  # Cumulative Percentage
cf = 0
for freq in hc:
    cf = cf + freq
    hc1.append(cf)
    pf = cf * 100 / len(sub_copy)
    phc1.append(pf)

print('HIV Rate with a High Suicide Rate')
print(fmt1 % ('Rate', 'Freq.', 'Percent', 'Cum. Freq.', 'Cum. Percent'))
for i, (key, var1, var2, var3, var4) in enumerate(zip(hc.keys(), hc, phc, hc1, phc1)):
    print(fmt2 % (key, var1, var2, var3, var4))

# EMPLOYMENT RATE
# frequency and percentage distributions for employment rate with a high suicide rate;
# group the data in 5 groups and record it into new variable ecgroup4
def ecgroup4(row):
    if row['employrate'] >= 32 and row['employrate'] < 51:
        return 1
    elif row['employrate'] >= 51 and row['employrate'] < 59:
        return 2
    elif row['employrate'] >= 59 and row['employrate'] < 65:
        return 3
    elif row['employrate'] >= 65 and row['employrate'] < 84:
        return 4
    else:
        return 5  # record for NAN values

sub_copy['ecgroup4'] = sub_copy.apply(lambda row: ecgroup4(row), axis=1)

# frequency for 5 groups of employment rate with a high suicide rate
ec = sub_copy['ecgroup4'].value_counts(sort=False, dropna=False)
# percentage for 5 groups of employment rate with a high suicide rate
pec = sub_copy['ecgroup4'].value_counts(sort=False, dropna=False, normalize=True) * 100

# cumulative frequency and cumulative percentage for 5 groups of employment rate
ec1 = []   # Cumulative Frequency
pec1 = []  # Cumulative Percentage
cf = 0
for freq in ec:
    cf = cf + freq
    ec1.append(cf)
    pf = cf * 100 / len(sub_copy)
    pec1.append(pf)

print('Employment Rate with a High Suicide Rate')
print(fmt1 % ('Rate', 'Freq.', 'Percent', 'Cum. Freq.', 'Cum. Percent'))
for i, (key, var1, var2, var3, var4) in enumerate(zip(ec.keys(), ec, pec, ec1, pec1)):
    print(fmt2 % (key, var1, var2, var3, var4))
import pandas as pd
import numpy as np
# load gapminder dataset
data = pd.read_csv('gapminder.csv',low_memory=False)
# lower-case all DataFrame column names
data.columns = map(str.lower, data.columns)

# bug fix for display formats to avoid run time errors
pd.set_option('display.float_format', lambda x: '%f' % x)

# setting variables to be numeric
data['suicideper100th'] = pd.to_numeric(data['suicideper100th'], errors='coerce')
data['breastcancerper100th'] = pd.to_numeric(data['breastcancerper100th'], errors='coerce')
data['hivrate'] = pd.to_numeric(data['hivrate'], errors='coerce')
data['employrate'] = pd.to_numeric(data['employrate'], errors='coerce')
# display summary statistics about the data
print("Statistics for a Suicide Rate")
print(data['suicideper100th'].describe())

# subset data for a high suicide rate based on summary statistics
sub = data[(data['suicideper100th']>12)]
# make a copy of my new subsetted data
sub_copy = sub.copy()

# BREAST CANCER RATE
# frequency and percentage distributions for the number of breast cancer cases
# with a high suicide rate
#print('frequency for a number of breast cancer cases with a high suicide rate')
bc = sub_copy['breastcancerper100th'].value_counts(sort=False, bins=10)
#print(bc)

#print('percentage for a number of breast cancer cases with a high suicide rate')
pbc = sub_copy['breastcancerper100th'].value_counts(sort=False, bins=10, normalize=True) * 100
#print(pbc)
# cumulative frequency and cumulative percentage for a number of breast cancer cases with a high suicide rate
bc1 = []   # Cumulative Frequency
pbc1 = []  # Cumulative Percentage
cf = 0
for freq in bc:
    cf = cf + freq
    bc1.append(cf)
    pf = cf * 100 / len(sub_copy)
    pbc1.append(pf)
#print('cumulative frequency for a number of breast cancer cases with a high suicide rate')
#print(bc1)
#print('cumulative percentage for a number of breast cancer cases with a high suicide rate')
#print(pbc1)
print('Number of Breast Cancer Cases with a High Suicide Rate')
fmt1 = '%s %7s %9s %12s %12s'
fmt2 = '%5.2f %10.d %10.2f %10.d %12.2f'
print(fmt1 % ('# of Cases','Freq.','Percent','Cum. Freq.','Cum. Percent'))
for i, (key, var1, var2, var3, var4) in enumerate(zip(bc.keys(), bc, pbc, bc1, pbc1)):
    print(fmt2 % (key, var1, var2, var3, var4))

fmt3 = '%5s %10s %10s %10s %12s'
print(fmt3 % ('NA', '2', '3.77', '53', '100.00'))
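Aside: the cumulative columns built by hand above can also be produced directly with pandas; a sketch:

summary = pd.DataFrame({'Freq.': bc, 'Percent': pbc})
summary['Cum. Freq.'] = summary['Freq.'].cumsum()
summary['Cum. Percent'] = summary['Cum. Freq.'] * 100 / len(sub_copy)
print(summary)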
Analysing fertility rate data
Data set: GapMinder Data.

Research question: Is fertility rate associated with the number of breast cancer cases?

Items included in the CodeBook:

For fertility rate:
- Children per woman (total fertility)
- Children per woman (total fertility), with projections

For breast cancer:
- Breast cancer, deaths per 100,000 women
- Breast cancer, new cases per 100,000 women
- Breast cancer, number of female deaths
- Breast cancer, number of new female cases

Literature review: From the original source (http://ww5.komen.org/KomenPerspectives/Does-pregnancy-affect-breast-cancer-risk-and-survival-.html): the more children a woman has given birth to, the lower her risk of breast cancer tends to be. Women who have never given birth have a slightly higher risk of breast cancer compared to women who have had more than one child.

The hypothesis to explore using the GapMinder data set: the higher the fertility rate, the lower the risk of breast cancer.
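A minimal sketch of how this hypothesis could be checked in pandas; the fertility column name below is hypothetical, since the exact naming in the GapMinder extract is not given in the post:

import pandas as pd
df = pd.read_csv('gapminder.csv', low_memory=False)
fert = pd.to_numeric(df['fertilityrate'], errors='coerce')  # hypothetical column name
bc_new = pd.to_numeric(df['breastcancerper100th'], errors='coerce')
print(fert.corr(bc_new))  # the hypothesis predicts a negative correlation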