Tumgik
course4ml · 6 years
Text
Assignment 4 Results
Here we looked at people who have smoked at some point in their life and tried to find clusters among them. We also tested whether those clusters differ in how much beer their members tend to drink. 
Tumblr media
It seems that using 3 clusters gives the best result. 
Tumblr media Tumblr media
In cluster 1, people are more likely to have smoked in their life (CHECK321, SMOKER) and they are the oldest (AGE). In cluster 2, people smoke most frequently (S3AQ3B1). Cluster 0 falls in the middle. 
Then, because smoking and drinking often seem to happen together, we used ANOVA to test whether these subgroups differ in beer consumption (number of beers drunk each day, S2AQ5D). 
Tumblr media Tumblr media Tumblr media
The results show that beer consumption differs significantly across the 3 clusters. Judging from the smoking and drinking habits, cluster 0 drinks the most beer of the 3 clusters, at over 3 beers each day, and cluster 1 drinks the least, at 2 beers each day. In conclusion, it seems that the more you have smoked recently, the less likely you are to drink more beer. This goes against our assumption that drinking and smoking happen together. We may need further experiments to test this theory. 
0 notes
course4ml · 6 years
Text
Assignment 4 Program
# Assignment 4, step 1: imports, loading the survey file and recoding the
# smoking / drinking variables that feed the cluster analysis.
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
try:
    # sklearn.cross_validation was deprecated and later removed.
    from sklearn.model_selection import train_test_split
except ImportError:  # fall back for very old scikit-learn installs
    from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.cluster import KMeans

origin = pd.read_csv("nesarc_pds.csv", low_memory=False)
data = origin.copy()

# Coerce every analysed column to numeric; entries that cannot be parsed
# become NaN. SEX is included so the integer-keyed recode below cannot miss
# because of a text dtype.
numeric_columns = [
    "S13Q6A10", "CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1", "S3AQ3D1R",
    "S13Q2", "S3AQ2A1", "AGE", "S2AQ5D", "SEX",
]
for column in numeric_columns:
    data[column] = pd.to_numeric(data[column], errors="coerce")

# cleaning data: the codes below are replaced by NaN (np.nan, because the
# np.NaN alias no longer exists in NumPy 2.x).
data["S13Q6A10"] = data["S13Q6A10"].replace(9, np.nan)
rcd = {1: 1, 2: 0}
data["GAS"] = data["S13Q6A10"].map(rcd)

data["CHECK321"] = data["CHECK321"].replace(9, np.nan)

# FREQ is a recode of S3AQ3B1; any value not listed in rcd2 (including 9)
# already maps to NaN, so no separate replace(9, ...) is needed.
# NOTE(review): S3AQ3B1 itself keeps code 9 and is used for clustering below;
# presumably 9 means "unknown" -- confirm against the codebook.
rcd2 = {1: 30, 2: 24, 3: 12, 4: 4, 5: 2, 6: 1}
data["FREQ"] = data["S3AQ3B1"].map(rcd2)

# Rows with SMOKER == 3 get 0 for the frequency / quantity variables
# (presumably these are never-smokers -- confirm against the codebook).
smoker_is_3 = data["SMOKER"] == 3
data.loc[smoker_is_3, "FREQ"] = 0
data.loc[smoker_is_3, "S3AQ3C1"] = 0
data.loc[smoker_is_3, "S3AQ3D1R"] = 0

data["S3AQ3C1"] = data["S3AQ3C1"].replace(99, np.nan)
data["S3AQ3D1R"] = data["S3AQ3D1R"].replace(99999, np.nan)
data["S3AQ2A1"] = data["S3AQ2A1"].replace(99, np.nan)
# NOTE(review): AGE 98 is dropped as missing here; verify that 98 is not a
# valid top-coded age in the codebook.
data["AGE"] = data["AGE"].replace(98, np.nan)
data["S2AQ5D"] = data["S2AQ5D"].replace(99, np.nan)

rcd3 = {2: 0, 1: 1}
data["SEX"] = data["SEX"].map(rcd3)

# Keep only complete cases for the clustering variables plus S2AQ5D.
sub1 = data[["CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1", "S3AQ2A1",
             "AGE", "SEX", "S13Q2", "S2AQ5D"]]
cluster = sub1.dropna()
# standardize dataset: every clustering variable is rescaled with
# preprocessing.scale so that all variables enter k-means on a common scale.
cluster_columns = ["CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1",
                   "S3AQ2A1", "AGE", "SEX", "S13Q2"]
# .copy() makes clustervar an independent frame; assigning into a plain
# slice of `cluster` triggers pandas' chained-assignment warning.
clustervar = cluster[cluster_columns].copy()
for column in cluster_columns:
    clustervar[column] = preprocessing.scale(clustervar[column].astype("float64"))

# split into train and test parts (seeded, 70 % / 30 %)
clus_train, clus_test = train_test_split(clustervar, test_size=0.3, random_state=123)
# k-means analysis for 1-9 clusters: for each k, record the average distance
# from every training observation to its nearest cluster centre.
from scipy.spatial.distance import cdist

clusters = range(1, 10)
meandist = []

for k in clusters:
    # Seeded so the elbow curve is reproducible, like the train/test split.
    model = KMeans(n_clusters=k, random_state=123)
    model.fit(clus_train)
    nearest = np.min(cdist(clus_train, model.cluster_centers_, "euclidean"), axis=1)
    meandist.append(nearest.sum() / clus_train.shape[0])

# Elbow plot, left disabled as in the original script (it was wrapped in a
# bare string literal there). Uncomment to draw it:
# plt.plot(clusters, meandist)
# plt.xlabel("Number of clusters")
# plt.ylabel("Average distance")
# plt.title("selecting k with the Elbow method")
# Fit the cluster solution that is interpreted in the rest of the script.
# The variable is called `model6` (and the old comment said "6 cluster
# solution"), but the model that is fitted has 3 clusters; the name is kept
# because the statements that follow refer to it.
# random_state is fixed so that cluster numbering is stable between runs.
model6 = KMeans(n_clusters=3, random_state=123)
model6.fit(clus_train)
clusassign = model6.predict(clus_train)
# Plot the fitted clusters on the first two principal components.
from sklearn.decomposition import PCA

pca_2 = PCA(2)
plot_columns = pca_2.fit_transform(clus_train)
plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=model6.labels_)
plt.xlabel("Canonical varaible 1")
plt.ylabel("Canonical variable 2")
# The title used to be hard-coded to "6 clusters"; report the real number.
plt.title("Scatter plot of canonical variables for %d clusters" % model6.n_clusters)
plt.show()

# Merge the cluster assignments with the clustering variables so that the
# variable means can be examined per cluster.
# Turn the row index of the training data into an "index" column; it is the
# key used to merge here and again when the beer variable is attached.
clus_train.reset_index(level=0, inplace=True)

# One row per training observation: its identifier and its cluster label
# (model6.labels_ is in the same row order as clus_train).
newclus = DataFrame(
    {"index": clus_train["index"].values, "cluster": model6.labels_},
    columns=["index", "cluster"],
)

# merge the cluster assignment dataframe with the training variables
merged_train = pd.merge(clus_train, newclus, on="index")

# cluster frequency (previously computed but never displayed)
print("Cluster sizes")
print(merged_train["cluster"].value_counts())

pd.set_option('display.max_columns', None)
# The "index" column is only a row identifier, so it is left out of the means.
clustergrp = merged_train.drop("index", axis=1).groupby("cluster").mean()
print("Clustering variable means by cluster")
print(clustergrp)
# Validate the clusters on the training data: does S2AQ5D (the beer variable)
# differ between clusters? One-way ANOVA followed by Tukey HSD comparisons.
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi

# Split the beer variable with the same seed and test size as the clustering
# variables, then attach it to the cluster assignments through "index".
beer_data = cluster["S2AQ5D"]
beer_train, beer_test = train_test_split(beer_data, test_size=0.3, random_state=123)
beer_train1 = beer_train.to_frame().reset_index(level=0)
merged_train_all = beer_train1.merge(merged_train, on="index")
sub2 = merged_train_all[["S2AQ5D", "cluster"]].dropna()

# ANOVA with cluster as a categorical explanatory variable
beermod = smf.ols(formula="S2AQ5D~C(cluster)", data=sub2).fit()
print(beermod.summary())

by_cluster = sub2.groupby("cluster")
print("means for beer drinking by cluster")
m1 = by_cluster.mean()
print(m1)

print("standard deviations for beer by cluster")
m2 = by_cluster.std()
print(m2)

# post hoc pairwise comparison of the clusters
mc1 = multi.MultiComparison(sub2["S2AQ5D"], sub2["cluster"])
res1 = mc1.tukeyhsd()
print(res1.summary())
0 notes
course4ml · 6 years
Text
Assignment 3 Results
In this study, I tested the relationship between smoking habits and how long you stay in hospital per year. 
The variables I used include whether you smoke, how often you smoke cigarettes, how many you smoke in a day, and the earliest age at which you smoked. 
I used Lasso regression for the model; here is what it found:
{'CHECK321': 0.0, 'SMOKER': -2.137026383450022, 'S3AQ3B1': -1.2806242742542924, 'S3AQ3C1': 1.1613836341096238, 'S3AQ2A1': 0.0, 'AGE': 0.0, 'SEX': 0.8547639650347999}
It seems that the length of hospital stay is not related to smoking status, to the age at which you first smoked, or to your current age. 
 Here is the plot of the coefficient and alphas relationship:
Tumblr media
And here is MSE in each fold
Tumblr media
And here are numbers calculated:
Tumblr media
Clearly, in this regression, the variables could not precisely predict how long you stayed in hospital. We need to find other variables and models to make this prediction. 
0 notes
course4ml · 6 years
Text
Assignment 3 Program
# Assignment 3, step 1: imports, loading the survey file, recoding the
# variables and selecting predictors / target for the lasso regression.
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
try:
    # sklearn.cross_validation was deprecated and later removed.
    from sklearn.model_selection import train_test_split
except ImportError:  # fall back for very old scikit-learn installs
    from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LassoLarsCV

origin = pd.read_csv("nesarc_pds.csv", low_memory=False)
data = origin.copy()

# Coerce every analysed column to numeric; entries that cannot be parsed
# become NaN. SEX is included so the integer-keyed recode below cannot miss
# because of a text dtype.
numeric_columns = [
    "S13Q6A10", "CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1", "S3AQ3D1R",
    "S13Q2", "S3AQ2A1", "AGE", "S2AQ5D", "SEX",
]
for column in numeric_columns:
    data[column] = pd.to_numeric(data[column], errors="coerce")

# cleaning data: the codes below are replaced by NaN (np.nan, because the
# np.NaN alias no longer exists in NumPy 2.x).
data["S13Q6A10"] = data["S13Q6A10"].replace(9, np.nan)
rcd = {1: 1, 2: 0}
data["GAS"] = data["S13Q6A10"].map(rcd)

data["CHECK321"] = data["CHECK321"].replace(9, np.nan)

# FREQ is a recode of S3AQ3B1; any value not listed in rcd2 (including 9)
# already maps to NaN, so no separate replace(9, ...) is needed.
# NOTE(review): S3AQ3B1 itself keeps code 9 and is used as a predictor below;
# presumably 9 means "unknown" -- confirm against the codebook.
rcd2 = {1: 30, 2: 24, 3: 12, 4: 4, 5: 2, 6: 1}
data["FREQ"] = data["S3AQ3B1"].map(rcd2)

# Rows with SMOKER == 3 get 0 for the frequency / quantity variables
# (presumably these are never-smokers -- confirm against the codebook).
smoker_is_3 = data["SMOKER"] == 3
data.loc[smoker_is_3, "FREQ"] = 0
data.loc[smoker_is_3, "S3AQ3C1"] = 0
data.loc[smoker_is_3, "S3AQ3D1R"] = 0

data["S3AQ3C1"] = data["S3AQ3C1"].replace(99, np.nan)
data["S3AQ3D1R"] = data["S3AQ3D1R"].replace(99999, np.nan)
data["S3AQ2A1"] = data["S3AQ2A1"].replace(99, np.nan)
# NOTE(review): AGE 98 is dropped as missing here; verify that 98 is not a
# valid top-coded age in the codebook.
data["AGE"] = data["AGE"].replace(98, np.nan)
data["S2AQ5D"] = data["S2AQ5D"].replace(99, np.nan)

rcd3 = {2: 0, 1: 1}
data["SEX"] = data["SEX"].map(rcd3)

# Complete cases only, then split into predictors and the S13Q2 target.
sub1 = data[["CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1", "S3AQ2A1",
             "AGE", "SEX", "S13Q2"]]
sub1 = sub1.dropna()

predvar = sub1[["CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1", "S3AQ2A1", "AGE", "SEX"]]
target = sub1.S13Q2
# Standardize all predictors, working on a copy so predvar stays untouched.
from sklearn import preprocessing

predictors = predvar.copy()
for column in ("CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1", "S3AQ2A1", "AGE", "SEX"):
    as_float = predictors[column].astype("float64")
    predictors[column] = preprocessing.scale(as_float)

# Seeded 70 % / 30 % split into training and test sets.
split_parts = train_test_split(predictors, target, test_size=.3, random_state=123)
pred_train, pred_test, tar_train, tar_test = split_parts
# Fit a lasso regression whose penalty is chosen by 10-fold cross-validation.
# The keyword is `precompute` (lower case); the original `Precompute=False`
# is not an accepted argument and makes the constructor raise TypeError.
model = LassoLarsCV(cv=10, precompute=False).fit(pred_train, tar_train)

# print variable names with their regression coefficients
print(dict(zip(predictors.columns, model.coef_)))
# Position of the cross-validated alpha on the -log10(alpha) axis; drawn as a
# dashed vertical line on both figures.
chosen_alpha_x = -np.log10(model.alpha_)

# Figure 1: coefficient progression along the lasso path.
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()
ax.plot(m_log_alphas, model.coef_path_.T)
ax.axvline(chosen_alpha_x, linestyle="--", color="k", label="alpha CV")
ax.set_ylabel("Regression Coefficiency")
ax.set_xlabel("-log(alpha)")
ax.set_title("Regression Coeefficiency Progression for Lasso Paths")

# Figure 2: mean squared error of every fold, plus the average across folds.
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
mse_ax = plt.gca()
mse_ax.plot(m_log_alphascv, model.mse_path_, ":")
mse_ax.plot(m_log_alphascv, model.mse_path_.mean(axis=-1), "k",
            label="average across the folds", linewidth=2)
mse_ax.axvline(chosen_alpha_x, linestyle="--", color="k", label="alpha CV")
mse_ax.legend()
mse_ax.set_xlabel("-log(alpha)")
mse_ax.set_ylabel("mean squared error")
mse_ax.set_title("mean squared errors in each fold")
# Mean squared error of the fitted model on the training and the test data.
from sklearn.metrics import mean_squared_error

train_error = mean_squared_error(tar_train, model.predict(pred_train))
test_error = mean_squared_error(tar_test, model.predict(pred_test))
print("training data MSE", train_error)
# The label used to read "MSA"; the value printed is the test-set MSE.
print("test data MSE", test_error)

# R-square on the training and the test data
rsquared_train = model.score(pred_train, tar_train)
rsquared_test = model.score(pred_test, tar_test)
print("traning data R-sqaure", rsquared_train)
print("test data R-squared", rsquared_test)
0 notes
course4ml · 6 years
Text
Assignment 2 Results
We tested whether having gastritis is related to smoking, and how well smoking habits can predict gastritis. 
We included 8 variables in the study, including your current smoking status, the frequency of smoking, the number of cigarettes smoked per day, age, etc. Our target is whether you currently have gastritis. 
Here are the results:
Tumblr media Tumblr media
relative importance:
Tumblr media Tumblr media
This shows that the most important variable for predicting gastritis is age (score=0.264), and the least important is current smoking status (0.005). However, the age at which you smoked your first cigarette is also important (S3AQ2A1, score=0.207). This may indicate that how long you have smoked matters more for predicting gastritis. 
We also plotted how the accuracy changes with the number of trees: 
Tumblr media
In the plot, it seems that 2 trees already give an accuracy score above 92%, and with more than 10 trees the accuracy score is stable. 
0 notes
course4ml · 6 years
Text
Assignment 2 Program
# Assignment 2, step 1: imports, loading the survey file, recoding the
# variables and selecting predictors / target for the random forest.
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
try:
    # sklearn.cross_validation was deprecated and later removed.
    from sklearn.model_selection import train_test_split
except ImportError:  # fall back for very old scikit-learn installs
    from sklearn.cross_validation import train_test_split
import sklearn.metrics
# feature importance
from sklearn.ensemble import ExtraTreesClassifier

origin = pd.read_csv("nesarc_pds.csv", low_memory=False)
data = origin.copy()

# Coerce every analysed column to numeric; entries that cannot be parsed
# become NaN.
numeric_columns = [
    "S13Q6A10", "CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1", "S3AQ3D1R",
    "S13Q2", "S3AQ2A1", "AGE", "S2AQ5D",
]
for column in numeric_columns:
    data[column] = pd.to_numeric(data[column], errors="coerce")

# cleaning data: the codes below are replaced by NaN (np.nan, because the
# np.NaN alias no longer exists in NumPy 2.x).
data["S13Q6A10"] = data["S13Q6A10"].replace(9, np.nan)
# GAS is the binary target derived from S13Q6A10 (1 stays 1, 2 becomes 0).
rcd = {1: 1, 2: 0}
data["GAS"] = data["S13Q6A10"].map(rcd)

data["CHECK321"] = data["CHECK321"].replace(9, np.nan)

# FREQ is a recode of S3AQ3B1; any value not listed in rcd2 (including 9)
# already maps to NaN, so no separate replace(9, ...) is needed.
# NOTE(review): S3AQ3B1 itself keeps code 9 and is used as a predictor below;
# presumably 9 means "unknown" -- confirm against the codebook.
rcd2 = {1: 30, 2: 24, 3: 12, 4: 4, 5: 2, 6: 1}
data["FREQ"] = data["S3AQ3B1"].map(rcd2)

# Rows with SMOKER == 3 get 0 for the frequency / quantity variables
# (presumably these are never-smokers -- confirm against the codebook).
smoker_is_3 = data["SMOKER"] == 3
data.loc[smoker_is_3, "FREQ"] = 0
data.loc[smoker_is_3, "S3AQ3C1"] = 0
data.loc[smoker_is_3, "S3AQ3D1R"] = 0

data["S3AQ3C1"] = data["S3AQ3C1"].replace(99, np.nan)
data["S3AQ3D1R"] = data["S3AQ3D1R"].replace(99999, np.nan)
data["S3AQ2A1"] = data["S3AQ2A1"].replace(99, np.nan)
# NOTE(review): AGE 98 is dropped as missing here; verify that 98 is not a
# valid top-coded age in the codebook.
data["AGE"] = data["AGE"].replace(98, np.nan)
data["S2AQ5D"] = data["S2AQ5D"].replace(99, np.nan)

# Complete cases only.
sub1 = data[["GAS", "CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1",
             "S3AQ3D1R", "S13Q2", "S3AQ2A1", "AGE"]]
sub1 = sub1.dropna()

# model: eight predictors, GAS as the target
predictors = sub1[["CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1",
                   "S3AQ3D1R", "S13Q2", "S3AQ2A1", "AGE"]]
targets = sub1.GAS
# Unseeded 60 % / 40 % split of predictors and target into train and test.
split_parts = train_test_split(predictors, targets, test_size=.4)
pred_train, pred_test, tar_train, tar_test = split_parts

# Report the shape of each of the four pieces, in the same order.
for piece in (pred_train, pred_test, tar_train, tar_test):
    print(piece.shape)
# Train a 25-tree random forest on the training data and score it on the
# held-out test data.
from sklearn.ensemble import RandomForestClassifier

# fit() returns the estimator itself, so construction and fitting are chained.
classifier = RandomForestClassifier(n_estimators=25).fit(pred_train, tar_train)
predictions = classifier.predict(pred_test)

test_confusion = sklearn.metrics.confusion_matrix(tar_test, predictions)
print(test_confusion)

# accuracy score
test_accuracy = sklearn.metrics.accuracy_score(tar_test, predictions)
print("\n arruracy score")
print(test_accuracy)
# fit an Extra Trees model to the data
model = ExtraTreesClassifier()
model.fit(pred_train, tar_train)
# display relative importance in each variable
print(model.feature_importances_)

# Impact of the number of trees on prediction accuracy: fit one random forest
# for each forest size from 1 to 25 and record its accuracy on the test data.
# `trees` holds the real forest sizes (1..25) so the x-axis of the plot shows
# the number of trees instead of the loop position 0..24.
trees = range(1, 26)
accuracy = np.zeros(len(trees))

for idx, n_trees in enumerate(trees):
    classifier = RandomForestClassifier(n_estimators=n_trees)
    classifier = classifier.fit(pred_train, tar_train)
    predictions = classifier.predict(pred_test)
    accuracy[idx] = sklearn.metrics.accuracy_score(tar_test, predictions)

plt.cla()
plt.plot(trees, accuracy)
0 notes
course4ml · 6 years
Text
Assignment 1 Results
In this study, we tried to test the relationship between gastritis and smoking status, as well as its relationship with age and gender. 
984  77  78   8
Accuracy Score 0.8648648648648649
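So in this model, we have 984 true negatives and 8 true positives, and 77 false positives and 78 false negatives (scikit-learn's confusion matrix puts the true classes in rows and the predicted classes in columns, with class 0 first). The accuracy score is 0.86, which means over 86% of the cases are classified correctly. 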
We then drew our decision tree, which includes 8 variables. 
Tumblr media
To see what a decision tree should look like, we also built one with 2 variables, including whether you smoke. Here is the decision tree:
Tumblr media
It indicates that gastritis is more likely to occur in those who smoke, while for those who do not smoke the probability of gastritis is low. 
0 notes
course4ml · 6 years
Text
Assignment 1 Program
# Assignment 1, step 1: imports, loading the survey file, recoding the
# variables and selecting predictors / target for the decision tree.
import pandas as pd
import numpy as np
try:
    # sklearn.cross_validation was deprecated and later removed.
    from sklearn.model_selection import train_test_split
except ImportError:  # fall back for very old scikit-learn installs
    from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
import sklearn.metrics

origin = pd.read_csv("nesarc_pds.csv", low_memory=False)
data = origin.copy()

# Coerce every analysed column to numeric; entries that cannot be parsed
# become NaN.
numeric_columns = [
    "S13Q6A10", "CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1", "S3AQ3D1R",
    "S13Q2", "S3AQ2A1", "AGE", "S2AQ5D",
]
for column in numeric_columns:
    data[column] = pd.to_numeric(data[column], errors="coerce")

# cleaning data: the codes below are replaced by NaN (np.nan, because the
# np.NaN alias no longer exists in NumPy 2.x).
data["S13Q6A10"] = data["S13Q6A10"].replace(9, np.nan)
# GAS is the binary target derived from S13Q6A10 (1 stays 1, 2 becomes 0).
rcd = {1: 1, 2: 0}
data["GAS"] = data["S13Q6A10"].map(rcd)

data["CHECK321"] = data["CHECK321"].replace(9, np.nan)

# FREQ is a recode of S3AQ3B1; any value not listed in rcd2 (including 9)
# already maps to NaN, so no separate replace(9, ...) is needed.
# NOTE(review): S3AQ3B1 itself keeps code 9 and is used as a predictor below;
# presumably 9 means "unknown" -- confirm against the codebook.
rcd2 = {1: 30, 2: 24, 3: 12, 4: 4, 5: 2, 6: 1}
data["FREQ"] = data["S3AQ3B1"].map(rcd2)

# Rows with SMOKER == 3 get 0 for the frequency / quantity variables
# (presumably these are never-smokers -- confirm against the codebook).
smoker_is_3 = data["SMOKER"] == 3
data.loc[smoker_is_3, "FREQ"] = 0
data.loc[smoker_is_3, "S3AQ3C1"] = 0
data.loc[smoker_is_3, "S3AQ3D1R"] = 0

data["S3AQ3C1"] = data["S3AQ3C1"].replace(99, np.nan)
data["S3AQ3D1R"] = data["S3AQ3D1R"].replace(99999, np.nan)
data["S3AQ2A1"] = data["S3AQ2A1"].replace(99, np.nan)
# NOTE(review): AGE 98 is dropped as missing here; verify that 98 is not a
# valid top-coded age in the codebook.
data["AGE"] = data["AGE"].replace(98, np.nan)
data["S2AQ5D"] = data["S2AQ5D"].replace(99, np.nan)

# Complete cases only.
sub1 = data[["GAS", "CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1",
             "S3AQ3D1R", "S13Q2", "S3AQ2A1", "AGE"]]
sub1 = sub1.dropna()

# model: eight predictors, GAS as the target
predictors = sub1[["CHECK321", "SMOKER", "S3AQ3B1", "S3AQ3C1",
                   "S3AQ3D1R", "S13Q2", "S3AQ2A1", "AGE"]]
targets = sub1.GAS
# Unseeded 60 % / 40 % split of predictors and target into train and test.
split_parts = train_test_split(predictors, targets, test_size=.4)
pred_train, pred_test, tar_train, tar_test = split_parts

# Report the shape of each of the four pieces, in the same order.
for piece in (pred_train, pred_test, tar_train, tar_test):
    print(piece.shape)
# Train a decision tree on the training data and score it on the test data.
from sklearn import tree

# fit() returns the estimator itself, so construction and fitting are chained.
classifier = DecisionTreeClassifier().fit(pred_train, tar_train)
predictions = classifier.predict(pred_test)

test_confusion = sklearn.metrics.confusion_matrix(tar_test, predictions)
print(test_confusion)

# accuracy score
test_accuracy = sklearn.metrics.accuracy_score(tar_test, predictions)
print("\n arruracy score")
print(test_accuracy)

# Write the fitted tree in Graphviz format to mlearning.txt for rendering.
tree.export_graphviz(classifier, out_file="mlearning.txt")
0 notes