course4ml - Tumblr blog

course4ml · 6 years ago

Text

Assignment 4 Program

# Imports for the k-means cluster analysis.
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn import preprocessing
from sklearn.cluster import KMeans

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. The fallback
# keeps the script working on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Load the NESARC file and keep the raw frame in `origin`; all recoding below
# is done on the copy `data`.
origin = pd.read_csv("nesarc_pds.csv", low_memory=False)
data = origin.copy()

# Coerce every analysis column to a numeric dtype; entries that cannot be
# parsed as numbers become NaN (errors="coerce").
numeric_columns = (
    "S13Q6A10",
    "CHECK321",
    "SMOKER",
    "S3AQ3B1",
    "S3AQ3C1",
    "S3AQ3D1R",
    "S13Q2",
    "S3AQ2A1",
    "AGE",
    "S2AQ5D",
)
for column in numeric_columns:
    data[column] = pd.to_numeric(data[column], errors="coerce")

# cleaning data
# np.nan is used throughout: the np.NaN alias was removed in NumPy 2.0.

# Binary recode of S13Q6A10 into GAS: 1 -> 1, 2 -> 0; code 9 is treated as
# missing.
data["S13Q6A10"] = data["S13Q6A10"].replace(9, np.nan)
rcd = {1: 1, 2: 0}
data["GAS"] = data["S13Q6A10"].map(rcd)

data["CHECK321"] = data["CHECK321"].replace(9, np.nan)

# Recode the S3AQ3B1 categories 1-6 into FREQ. Any code that is not a key of
# rcd2 (including 9) already becomes NaN through .map(), so no extra
# replace(9, ...) is needed afterwards.
rcd2 = {1: 30, 2: 24, 3: 12, 4: 4, 5: 2, 6: 1}
data["FREQ"] = data["S3AQ3B1"].map(rcd2)
# Rows with SMOKER == 3 get zero smoking frequency / quantity
# (presumably never-smokers -- confirm against the codebook).
data.loc[data["SMOKER"] == 3, "FREQ"] = 0

data.loc[data["SMOKER"] == 3, "S3AQ3C1"] = 0
data["S3AQ3C1"] = data["S3AQ3C1"].replace(99, np.nan)

data.loc[data["SMOKER"] == 3, "S3AQ3D1R"] = 0
data["S3AQ3D1R"] = data["S3AQ3D1R"].replace(99999, np.nan)

# The original replaced 99 in S3AQ2A1 twice; once is enough.
data["S3AQ2A1"] = data["S3AQ2A1"].replace(99, np.nan)

# NOTE(review): confirm that 98 is a missing-value code for AGE and not a
# real (or top-coded) age before discarding it.
data["AGE"] = data["AGE"].replace(98, np.nan)

data["S2AQ5D"] = data["S2AQ5D"].replace(99, np.nan)

# Recode SEX to a 0/1 indicator: 1 stays 1, 2 becomes 0.
rcd3 = {2: 0, 1: 1}
data["SEX"] = data["SEX"].map(rcd3)

# Keep the clustering variables plus S2AQ5D (used later to validate the
# clusters) and drop every row with a missing value.
sub1 = data[["CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1", "S3AQ2A1",
             "AGE", "SEX", "S13Q2", "S2AQ5D"]]
cluster = sub1.dropna()

# standardize dataset
cluster_columns = ["CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1", "S3AQ2A1",
                   "AGE", "SEX", "S13Q2"]
# .copy() gives an independent frame: assigning columns on a plain selection
# of `cluster` triggers pandas' SettingWithCopyWarning and is not guaranteed
# to write where intended.
clustervar = cluster[cluster_columns].copy()
for column in cluster_columns:
    # preprocessing.scale centres each variable and scales it to unit variance.
    clustervar[column] = preprocessing.scale(clustervar[column].astype("float64"))

# split into train and test parts
clus_train, clus_test = train_test_split(clustervar, test_size=0.3, random_state=123)

# k-means analysis for 1-9 clusters
from scipy.spatial.distance import cdist

clusters = range(1, 10)
meandist = []

for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(clus_train)
    # Distance from every training observation to its nearest centroid. The
    # per-iteration predict() call of the original was dropped: its result
    # was never used.
    nearest = np.min(cdist(clus_train, model.cluster_centers_, "euclidean"), axis=1)
    meandist.append(nearest.mean())

# Elbow plot of the average distance between observations and their cluster
# centroid (Euclidean). It was disabled in the original (wrapped in a string
# literal) and is kept disabled here:
# plt.plot(clusters, meandist)
# plt.xlabel("Number of clusters")
# plt.ylabel("Average distance")
# plt.title("selecting k with the Elbow method")

# Interpret the chosen cluster solution.
# The original comment and plot title said "6 clusters" while the model was
# fitted with 3; the title is now derived from the value actually used. The
# variable name `model6` is kept because later code refers to it.
n_clusters = 3
model6 = KMeans(n_clusters=n_clusters)
model6.fit(clus_train)
clusassign = model6.predict(clus_train)

# Plot the clusters on the first two principal components of the training data.
from sklearn.decomposition import PCA

pca_2 = PCA(2)
plot_columns = pca_2.fit_transform(clus_train)
plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=model6.labels_)
plt.xlabel("Canonical variable 1")
plt.ylabel("Canonical variable 2")
plt.title("Scatter plot of canonical variables for {} clusters".format(n_clusters))
plt.show()

""" mutiple steps to merge cluster assignments with clustering variables to examine cluster variable means by cluster """ #create a unique identifier variable from index for the cluster training data to merge with cluster assignment varibale clus_train.reset_index(level=0,inplace=True) #creat a list that has the new variable cluslist=list(clus_train["index"]) #create a list of cluster assignments labels=list(model6.labels_) #combine index variable list with cluster assignment list into a dictionary newlist=dict(zip(cluslist,labels)) newlist #convert dict to dataframe newclus=DataFrame.from_dict(newlist,orient="index") newclus

#do the same to cluster assignment variables #rename the cluster assignment column newclus.columns=["cluster"] #creat a unique identifier vaiables from the index for the cluster assignment dataframe #to merge with training data newclus.reset_index(level=0,inplace=True) #merge the cluster assignment dataframe with cluster training variable dataframe by index variable merged_train=pd.merge(clus_train,newclus,on="index") merged_train.head(n=100) #cluster frequency merged_train.cluster.value_counts()

pd.set_option('display.max_columns', None) clustergrp=merged_train.groupby("cluster").mean() print("Clustering variable means by cluster") print(clustergrp)

# Validate the clusters on the training data by testing, with ANOVA, whether
# beer drinking (S2AQ5D) differs between clusters.

# S2AQ5D is taken from `cluster`, the same complete-case frame the clustering
# variables came from.
beer_data = cluster["S2AQ5D"]
# Split with the same test_size and random_state as the clustering variables.
beer_train, beer_test = train_test_split(beer_data, test_size=0.3, random_state=123)

# Expose the row index as an "index" column and merge on it with the
# clustering variables and cluster assignments.
beer_train1 = pd.DataFrame(beer_train)
beer_train1.reset_index(level=0, inplace=True)
merged_train_all = pd.merge(beer_train1, merged_train, on="index")
sub2 = merged_train_all[["S2AQ5D", "cluster"]].dropna()

import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi

# One-way ANOVA: S2AQ5D explained by the categorical cluster label.
beermod = smf.ols(formula="S2AQ5D~C(cluster)", data=sub2).fit()
print(beermod.summary())

by_cluster = sub2.groupby("cluster")

print("means for beer drinking by cluster")
m1 = by_cluster.mean()
print(m1)

print("standard deviations for beer by cluster")
m2 = by_cluster.std()
print(m2)

# Tukey HSD post-hoc comparison between the clusters.
mc1 = multi.MultiComparison(sub2["S2AQ5D"], sub2["cluster"])
res1 = mc1.tukeyhsd()
print(res1.summary())

0 notes

course4ml · 6 years ago

Text

Assignment 3 Results

In this study, I tested the relationship between smoking habits and how long you stay in hospital per year.

And variables I used including are you smoking, how long you smoked one cigarette, how many are you smoked in a day and the earliest age you smoked.

I used Lasso regression for the model; here is what it found:

{'CHECK321': 0.0, 'SMOKER': -2.137026383450022, 'S3AQ3B1': -1.2806242742542924, 'S3AQ3C1': 1.1613836341096238, 'S3AQ2A1': 0.0, 'AGE': 0.0, 'SEX': 0.8547639650347999}

The Lasso shrank three coefficients to zero, so the outcome does not appear to be related to smoking status, to the earliest smoking age, or to your current age.

Here is the plot of the coefficient and alphas relationship:

And here is MSE in each fold

And here are numbers calculated:

Clearly, in this regression the variables could not precisely predict how long you stayed in hospital. We need to find other variables and models to make this prediction.

0 notes

course4ml · 6 years ago

Text

Assignment 3 Program

# Imports for the Lasso regression analysis.
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.linear_model import LassoLarsCV

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. The fallback
# keeps the script working on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Load the NESARC file; `origin` keeps the raw data, `data` is recoded below.
origin = pd.read_csv("nesarc_pds.csv", low_memory=False)
data = origin.copy()

# Coerce the analysis columns to numeric (unparseable entries become NaN),
# matching the Assignment 4 program. This is a no-op for columns that were
# already read as numbers.
for column in ("S13Q6A10", "CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1",
               "S3AQ3D1R", "S13Q2", "S3AQ2A1", "AGE", "S2AQ5D"):
    data[column] = pd.to_numeric(data[column], errors="coerce")

# cleaning data
# np.nan is used throughout: the np.NaN alias was removed in NumPy 2.0.

# Binary recode of S13Q6A10 into GAS: 1 -> 1, 2 -> 0; 9 is treated as missing.
data["S13Q6A10"] = data["S13Q6A10"].replace(9, np.nan)
rcd = {1: 1, 2: 0}
data["GAS"] = data["S13Q6A10"].map(rcd)

data["CHECK321"] = data["CHECK321"].replace(9, np.nan)

# Recode the S3AQ3B1 categories 1-6 into FREQ. Codes that are not keys of
# rcd2 (including 9) already become NaN through .map().
rcd2 = {1: 30, 2: 24, 3: 12, 4: 4, 5: 2, 6: 1}
data["FREQ"] = data["S3AQ3B1"].map(rcd2)
# Rows with SMOKER == 3 get zero smoking frequency / quantity
# (presumably never-smokers -- confirm against the codebook).
data.loc[data["SMOKER"] == 3, "FREQ"] = 0

data.loc[data["SMOKER"] == 3, "S3AQ3C1"] = 0
data["S3AQ3C1"] = data["S3AQ3C1"].replace(99, np.nan)

data.loc[data["SMOKER"] == 3, "S3AQ3D1R"] = 0
data["S3AQ3D1R"] = data["S3AQ3D1R"].replace(99999, np.nan)

# The original replaced 99 in S3AQ2A1 twice; once is enough.
data["S3AQ2A1"] = data["S3AQ2A1"].replace(99, np.nan)

# NOTE(review): confirm that 98 is a missing-value code for AGE and not a
# real (or top-coded) age before discarding it.
data["AGE"] = data["AGE"].replace(98, np.nan)

data["S2AQ5D"] = data["S2AQ5D"].replace(99, np.nan)

# Recode SEX to a 0/1 indicator: 1 stays 1, 2 becomes 0.
rcd3 = {2: 0, 1: 1}
data["SEX"] = data["SEX"].map(rcd3)

# Analysis subset: the seven predictors plus the target S13Q2, restricted to
# rows without missing values.
analysis_columns = ["CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1", "S3AQ2A1",
                    "AGE", "SEX", "S13Q2"]
sub1 = data[analysis_columns].dropna()

predvar = sub1[["CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1", "S3AQ2A1", "AGE", "SEX"]]
target = sub1.S13Q2

# standardize all variables
from sklearn import preprocessing

# Work on a copy so `predvar` keeps the unscaled values. Only the columns of
# `predictors` are scaled; S13Q2 (the target), S3AQ3D1R and S2AQ5D are not
# part of it and stay untouched.
predictors = predvar.copy()
for name in predictors.columns:
    predictors[name] = preprocessing.scale(predictors[name].astype("float64"))

# split into train and test set
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predictors, target, test_size=.3, random_state=123
)

# using Lasso regression
# The keyword is lowercase `precompute`; the original `Precompute=False`
# is not a LassoLarsCV parameter and raises TypeError.
model = LassoLarsCV(cv=10, precompute=False).fit(pred_train, tar_train)

# print variable names paired with their regression coefficients
print(dict(zip(predictors.columns, model.coef_)))

# Position of the alpha selected by cross-validation on the -log10 axis;
# drawn as a dashed vertical line in both figures.
selected_alpha_pos = -np.log10(model.alpha_)

# plot coefficient progression along the Lasso path
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()
plt.plot(m_log_alphas, model.coef_path_.T)
plt.axvline(selected_alpha_pos, linestyle="--", color="k", label="alpha CV")
plt.ylabel("Regression Coefficiency")
plt.xlabel("-log(alpha)")
plt.title("Regression Coeefficiency Progression for Lasso Paths")

# plot of mean squared error for each fold, in a new figure
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
# one dotted line per cross-validation fold
plt.plot(m_log_alphascv, model.mse_path_, ":")
# solid black line: average over the folds
fold_average = model.mse_path_.mean(axis=-1)
plt.plot(m_log_alphascv, fold_average, "k", label="average across the folds", linewidth=2)
plt.axvline(selected_alpha_pos, linestyle="--", color="k", label="alpha CV")
plt.legend()
plt.xlabel("-log(alpha)")
plt.ylabel("mean squared error")
plt.title("mean squared errors in each fold")

# mean squared errors from training and test data
from sklearn.metrics import mean_squared_error

train_error = mean_squared_error(tar_train, model.predict(pred_train))
test_error = mean_squared_error(tar_test, model.predict(pred_test))
print("training data MSE", train_error)
# The original label read "MSA"; the value printed is the mean squared error.
print("test data MSE", test_error)

# R-squared from training and test data
rsquared_train = model.score(pred_train, tar_train)
rsquared_test = model.score(pred_test, tar_test)
print("training data R-squared", rsquared_train)
print("test data R-squared", rsquared_test)

0 notes

course4ml · 6 years ago

Text

Assignment 2 Results

We tested whether having gastritis is related to smoking, and how well smoking habits can predict gastritis.

We included 8 variables in the study, including your current smoking status, the frequency of smoking, the number of cigarettes smoked per day, age, etc. Our target is whether you currently have gastritis.

Here are the results:

relative importance:

This shows that the most important variable for predicting gastritis is age (score=0.264), and the least important variable is current smoking status (0.005). However, the age at which you smoked your first cigarette is also important (S3AQ2A1, score=0.207). This may indicate that how long you have smoked matters more for predicting gastritis.

Also, we draw the plot of how many trees is in high accuracy value,

In the plot, it seems that 2 trees already reach an accuracy score above 92%, and with more than 10 trees the accuracy score is stable.

0 notes

course4ml · 6 years ago

Text

Assignment 2 Program

# Imports for the random forest analysis.
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import sklearn.metrics
# feature importance
from sklearn.ensemble import ExtraTreesClassifier

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. The fallback
# keeps the script working on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Load the NESARC file; `origin` keeps the raw data, `data` is recoded below.
origin = pd.read_csv("nesarc_pds.csv", low_memory=False)
data = origin.copy()

# Coerce the analysis columns to numeric (unparseable entries become NaN),
# matching the Assignment 4 program. This is a no-op for columns that were
# already read as numbers.
for column in ("S13Q6A10", "CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1",
               "S3AQ3D1R", "S13Q2", "S3AQ2A1", "AGE", "S2AQ5D"):
    data[column] = pd.to_numeric(data[column], errors="coerce")

# cleaning data
# np.nan is used throughout: the np.NaN alias was removed in NumPy 2.0.

# Binary target GAS from S13Q6A10: 1 -> 1, 2 -> 0; 9 is treated as missing.
data["S13Q6A10"] = data["S13Q6A10"].replace(9, np.nan)
rcd = {1: 1, 2: 0}
data["GAS"] = data["S13Q6A10"].map(rcd)

data["CHECK321"] = data["CHECK321"].replace(9, np.nan)

# Recode the S3AQ3B1 categories 1-6 into FREQ. Codes that are not keys of
# rcd2 (including 9) already become NaN through .map().
rcd2 = {1: 30, 2: 24, 3: 12, 4: 4, 5: 2, 6: 1}
data["FREQ"] = data["S3AQ3B1"].map(rcd2)
# Rows with SMOKER == 3 get zero smoking frequency / quantity
# (presumably never-smokers -- confirm against the codebook).
data.loc[data["SMOKER"] == 3, "FREQ"] = 0

data.loc[data["SMOKER"] == 3, "S3AQ3C1"] = 0
data["S3AQ3C1"] = data["S3AQ3C1"].replace(99, np.nan)

data.loc[data["SMOKER"] == 3, "S3AQ3D1R"] = 0
data["S3AQ3D1R"] = data["S3AQ3D1R"].replace(99999, np.nan)

data["S3AQ2A1"] = data["S3AQ2A1"].replace(99, np.nan)

# NOTE(review): confirm that 98 is a missing-value code for AGE and not a
# real (or top-coded) age before discarding it.
data["AGE"] = data["AGE"].replace(98, np.nan)

data["S2AQ5D"] = data["S2AQ5D"].replace(99, np.nan)

# Analysis subset: target GAS plus eight predictors, complete cases only.
sub1 = data[["GAS", "CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1", "S3AQ3D1R",
             "S13Q2", "S3AQ2A1", "AGE"]]
sub1 = sub1.dropna()

# Optional inspection (the cleaned frame is named sub1 here):
# sub1.dtypes
# sub1.describe()

# model
predictors = sub1[["CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1", "S3AQ3D1R",
                   "S13Q2", "S3AQ2A1", "AGE"]]
targets = sub1.GAS

# random_state fixes the shuffle so the split, and every number derived from
# it, is reproducible. The original split was unseeded; 123 matches the seed
# the Assignment 3 and 4 programs use.
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predictors, targets, test_size=.4, random_state=123
)

print(pred_train.shape)
print(pred_test.shape)
print(tar_train.shape)
print(tar_test.shape)

# Fit a 25-tree random forest on the training data and evaluate it on the
# held-out test data.
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=25)
classifier = classifier.fit(pred_train, tar_train)
predictions = classifier.predict(pred_test)

confusion = sklearn.metrics.confusion_matrix(tar_test, predictions)
print(confusion)

# accuracy score
print("\n arruracy score")
test_accuracy = sklearn.metrics.accuracy_score(tar_test, predictions)
print(test_accuracy)

# Fit an Extra Trees model to the same training data and display the
# relative importance of each variable (same order as predictors' columns).
model = ExtraTreesClassifier()
model.fit(pred_train, tar_train)
print(model.feature_importances_)

""" impact of trees number in prediciton accurancy useing for to run number from 0-24 """ trees=range(25) accuracy=np.zeros(25)

for idx in range(len(trees)): classifier=RandomForestClassifier(n_estimators=idx+1) classifier=classifier.fit(pred_train,tar_train) predictions=classifier.predict(pred_test) accuracy[idx]=sklearn.metrics.accuracy_score(tar_test,predictions)

plt.cla() plt.plot(trees,accuracy)

0 notes

course4ml · 6 years ago

Text

Assignment 1 Results

In this study, we tested the relationship between gastritis and smoking status, and also its relationship with age and gender.

984 77 78 8

Accuracy Score 0.8648648648648649

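So in this model we have 984 true negatives and 8 true positives, with 77 false positives and 78 false negatives (scikit-learn's confusion matrix has the actual classes in rows and the predicted classes in columns, ordered 0 then 1). The accuracy score is 0.86, which means that over 86% of the cases are classified correctly.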

And then, we draw our decision trees, including 8 variables.

To see how a decision tree should be like, we used 2 variables, which is are you smoking. Here is the decision tree:

It indicates that gastritis is more likely to occur in those who smoke, and that for those who do not smoke there is a low probability that gastritis occurs.

0 notes

course4ml · 6 years ago

Text

Assignment 1 Program

# Imports for the decision tree analysis.
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import sklearn.metrics

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. The fallback
# keeps the script working on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Load the NESARC file; `origin` keeps the raw data, `data` is recoded below.
origin = pd.read_csv("nesarc_pds.csv", low_memory=False)
data = origin.copy()

# Coerce the analysis columns to numeric (unparseable entries become NaN),
# matching the Assignment 4 program. This is a no-op for columns that were
# already read as numbers.
for column in ("S13Q6A10", "CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1",
               "S3AQ3D1R", "S13Q2", "S3AQ2A1", "AGE", "S2AQ5D"):
    data[column] = pd.to_numeric(data[column], errors="coerce")

# cleaning data
# np.nan is used throughout: the np.NaN alias was removed in NumPy 2.0.

# Binary target GAS from S13Q6A10: 1 -> 1, 2 -> 0; 9 is treated as missing.
data["S13Q6A10"] = data["S13Q6A10"].replace(9, np.nan)
rcd = {1: 1, 2: 0}
data["GAS"] = data["S13Q6A10"].map(rcd)

data["CHECK321"] = data["CHECK321"].replace(9, np.nan)

# Recode the S3AQ3B1 categories 1-6 into FREQ. Codes that are not keys of
# rcd2 (including 9) already become NaN through .map().
rcd2 = {1: 30, 2: 24, 3: 12, 4: 4, 5: 2, 6: 1}
data["FREQ"] = data["S3AQ3B1"].map(rcd2)
# Rows with SMOKER == 3 get zero smoking frequency / quantity
# (presumably never-smokers -- confirm against the codebook).
data.loc[data["SMOKER"] == 3, "FREQ"] = 0

data.loc[data["SMOKER"] == 3, "S3AQ3C1"] = 0
data["S3AQ3C1"] = data["S3AQ3C1"].replace(99, np.nan)

data.loc[data["SMOKER"] == 3, "S3AQ3D1R"] = 0
data["S3AQ3D1R"] = data["S3AQ3D1R"].replace(99999, np.nan)

data["S3AQ2A1"] = data["S3AQ2A1"].replace(99, np.nan)

# NOTE(review): confirm that 98 is a missing-value code for AGE and not a
# real (or top-coded) age before discarding it.
data["AGE"] = data["AGE"].replace(98, np.nan)

data["S2AQ5D"] = data["S2AQ5D"].replace(99, np.nan)

# Analysis subset: target GAS plus eight predictors, complete cases only.
sub1 = data[["GAS", "CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1", "S3AQ3D1R",
             "S13Q2", "S3AQ2A1", "AGE"]]
sub1 = sub1.dropna()

# Optional inspection (the cleaned frame is named sub1 here):
# sub1.dtypes
# sub1.describe()

# model
predictors = sub1[["CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1", "S3AQ3D1R",
                   "S13Q2", "S3AQ2A1", "AGE"]]
targets = sub1.GAS

# random_state fixes the shuffle so the split, and the confusion matrix and
# accuracy derived from it, is reproducible. The original split was
# unseeded; 123 matches the seed the Assignment 3 and 4 programs use.
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predictors, targets, test_size=.4, random_state=123
)

print(pred_train.shape)
print(pred_test.shape)
print(tar_train.shape)
print(tar_test.shape)

# Fit a decision tree on the training data and evaluate it on the held-out
# test data.
classifier = DecisionTreeClassifier()
classifier = classifier.fit(pred_train, tar_train)
predictions = classifier.predict(pred_test)

confusion = sklearn.metrics.confusion_matrix(tar_test, predictions)
print(confusion)

# accuracy score
print("\n arruracy score")
test_accuracy = sklearn.metrics.accuracy_score(tar_test, predictions)
print(test_accuracy)

# Export the fitted tree in Graphviz format to "mlearning.txt" so it can be
# rendered as a diagram.
from sklearn import tree

tree.export_graphviz(classifier, out_file="mlearning.txt")

0 notes