Week 4: Running a k-means Cluster Analysis
import pandas
import statistics
import numpy as np
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split  # the old sklearn.cross_validation module no longer exists
from sklearn import preprocessing
from sklearn.cluster import KMeans

# bug fix for display formats to avoid run time errors
pandas.set_option('display.float_format', lambda x: '%.2f' % x)

# load the data
data = pandas.read_csv('../separatedData.csv')

# convert to numeric format
data["breastCancer100th"] = pandas.to_numeric(data["breastCancer100th"], errors='coerce')
data["meanSugarPerson"] = pandas.to_numeric(data["meanSugarPerson"], errors='coerce')
data["meanFoodPerson"] = pandas.to_numeric(data["meanFoodPerson"], errors='coerce')
data["meanCholesterol"] = pandas.to_numeric(data["meanCholesterol"], errors='coerce')

# listwise deletion of missing values
sub1 = data[['breastCancer100th', 'meanFoodPerson', 'meanCholesterol', 'meanSugarPerson']].dropna()

# keep only the clustering variables
cluster = sub1[['meanSugarPerson', 'meanFoodPerson', 'meanCholesterol']]

# standardize predictors to have mean=0 and sd=1
clustervar = cluster.copy()
clustervar['meanSugarPerson'] = preprocessing.scale(clustervar['meanSugarPerson'].astype('float64'))
clustervar['meanFoodPerson'] = preprocessing.scale(clustervar['meanFoodPerson'].astype('float64'))
clustervar['meanCholesterol'] = preprocessing.scale(clustervar['meanCholesterol'].astype('float64'))

# split data into train and test sets - Train = 70%, Test = 30%
clus_train, clus_test = train_test_split(clustervar, test_size=.3, random_state=123)
To run the k-means cluster analysis we must standardize the predictors to have mean = 0 and standard deviation = 1. After that, we run nine analyses, starting with one cluster and adding one cluster per experiment.

# k-means cluster analysis for 1-9 clusters
from scipy.spatial.distance import cdist

clusters = range(1, 10)
meandist = []

for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(clus_train)
    clusassign = model.predict(clus_train)
    # average distance of each observation to its closest cluster centroid
    meandist.append(sum(np.min(cdist(clus_train, model.cluster_centers_, 'euclidean'), axis=1))
                    / clus_train.shape[0])

"""
Plot the average distance from the observations to the cluster centroids
to use the Elbow Method to identify the number of clusters to choose
"""
plt.plot(clusters, meandist)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method')
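The elbow plot only suggests a candidate k; to go one step further, a minimal sketch of fitting the chosen solution and inspecting the cluster means might look like this (k = 3 is an assumption for illustration, the actual value depends on the plot):

# fit the final k-means solution (k = 3 assumed for illustration)
model3 = KMeans(n_clusters=3)
model3.fit(clus_train)

# attach the cluster assignment to the training data
clus_train_labeled = clus_train.copy()
clus_train_labeled['cluster'] = model3.predict(clus_train)

# mean of each standardized variable per cluster
print(clus_train_labeled.groupby('cluster').mean())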
Assignment: Running a Lasso Regression Analysis
import pandas
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pylab as plt

CSV_PATH = 'gapminder.csv'
data = pandas.read_csv(CSV_PATH)
print('Total number of countries: {0}'.format(len(data)))
PREDICTORS = [
    'incomeperperson', 'alcconsumption', 'armedforcesrate', 'breastcancerper100th',
    'co2emissions', 'femaleemployrate', 'hivrate', 'internetuserate', 'polityscore',
    'relectricperperson', 'suicideper100th', 'employrate', 'urbanrate'
]

clean = data.copy()
for key in PREDICTORS + ['lifeexpectancy']:
    clean[key] = pandas.to_numeric(clean[key], errors='coerce')
clean = clean.dropna()

print('Countries remaining:', len(clean))
clean.head()
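The code above only cleans the data and never reaches the lasso itself. A minimal sketch of the lasso step, assuming lifeexpectancy as the response and scikit-learn's LassoLarsCV with standardized predictors, might look like this:

from sklearn.preprocessing import scale
from sklearn.linear_model import LassoLarsCV

# standardize predictors to mean=0 and sd=1 before fitting the lasso
predvar = clean[PREDICTORS].copy()
for key in PREDICTORS:
    predvar[key] = scale(predvar[key].astype('float64'))
target = clean['lifeexpectancy']

# 70/30 train/test split
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predvar, target, test_size=.3, random_state=123)

# lasso regression with 10-fold cross-validation to pick the penalty
model = LassoLarsCV(cv=10, precompute=False).fit(pred_train, tar_train)

# regression coefficients (predictors shrunk to 0 drop out of the model)
print(dict(zip(PREDICTORS, model.coef_)))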
Week 2: Running a Random Forest
import pandas
import sklearn.metrics
import statistics
import numpy as np
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split  # the old sklearn.cross_validation module no longer exists
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# bug fix for display formats to avoid run time errors
pandas.set_option('display.float_format', lambda x: '%.2f' % x)

# load the data
data = pandas.read_csv('separatedData.csv')

# convert to numeric format
data["breastCancer100th"] = pandas.to_numeric(data["breastCancer100th"], errors='coerce')
data["meanSugarPerson"] = pandas.to_numeric(data["meanSugarPerson"], errors='coerce')
data["meanFoodPerson"] = pandas.to_numeric(data["meanFoodPerson"], errors='coerce')
data["meanCholesterol"] = pandas.to_numeric(data["meanCholesterol"], errors='coerce')

# listwise deletion of missing values
sub1 = data[['breastCancer100th', 'meanFoodPerson', 'meanCholesterol', 'meanSugarPerson']].dropna()

# create a new binary variable, incidence_cancer, that splits countries at the mean incidence
meanIncidence = statistics.mean(sub1['breastCancer100th'])

def incidence_cancer(row):
    if row['breastCancer100th'] <= meanIncidence:
        return 0  # incidence of breast cancer is below the average of all countries
    if row['breastCancer100th'] > meanIncidence:
        return 1  # incidence of breast cancer is above the average of all countries

# add the new variable incidence_cancer to sub1
sub1['incidence_cancer'] = sub1.apply(lambda row: incidence_cancer(row), axis=1)
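A quick sanity check of the new response variable is worth doing before modelling; the same flag can also be built in one line with numpy. A small sketch:

# equivalent one-line construction of the binary response
sub1['incidence_cancer'] = np.where(sub1['breastCancer100th'] > meanIncidence, 1, 0)

# check how many countries fall in each class
print(sub1['incidence_cancer'].value_counts())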
The code below splits the data, fits the random forest, and evaluates it on the test set.
# split into training and testing sets
predictors = sub1[['meanSugarPerson', 'meanFoodPerson', 'meanCholesterol']]
targets = sub1['incidence_cancer']

# Train = 60%, Test = 40%
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.4)

# build model on training data
classifier = RandomForestClassifier(n_estimators=25)
classifier = classifier.fit(pred_train, tar_train)

predictions = classifier.predict(pred_test)

confusion_matrix = sklearn.metrics.confusion_matrix(tar_test, predictions)
accuracy_score = sklearn.metrics.accuracy_score(tar_test, predictions)
print(confusion_matrix)
print(accuracy_score)

# fit an Extra Trees model to the data
model = ExtraTreesClassifier()
model.fit(pred_train, tar_train)

# display the relative importance of each attribute
print(model.feature_importances_)
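To see how the accuracy depends on the number of trees, a sketch like the following could retrain the forest for 1 to 25 trees and plot the test accuracy:

# accuracy as a function of the number of trees (1 to 25)
trees = range(1, 26)
accuracy = np.zeros(len(trees))

for idx, n in enumerate(trees):
    classifier = RandomForestClassifier(n_estimators=n)
    classifier = classifier.fit(pred_train, tar_train)
    predictions = classifier.predict(pred_test)
    accuracy[idx] = sklearn.metrics.accuracy_score(tar_test, predictions)

plt.cla()
plt.plot(trees, accuracy)
plt.xlabel('Number of trees')
plt.ylabel('Test accuracy')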
Week 1 - Lab: Running a Classification Tree
Week 1: decision trees in Python. I am working on this as part of the peer-review assignment for the Coursera course “Machine Learning for Data Analysis” by Wesleyan University.
Installation on Ubuntu Linux.
sudo chmod +x Anaconda3-2022.10-Linux-x86_64.sh
./Anaconda3-2022.10-Linux-x86_64.sh
conda install scikit-learn
conda install -n my_environment scikit-learn
pip install scikit-learn   # the PyPI package is scikit-learn, not sklearn
pip install -U scikit-learn scipy matplotlib
sudo apt-get install graphviz
pip install pydotplus   # pydotplus is a Python package, not an apt package
conda create -c conda-forge -n spyder-env spyder numpy scipy pandas matplotlib sympy cython
conda create -c conda-forge -n spyder-env spyder
conda activate spyder-env
conda config --env --add channels conda-forge
conda config --env --set channel_priority strict
python -m pip install pydotplus
dot -Tpng tree.dot -o tree5.png
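To confirm the environment works before starting, a quick check from a Python prompt might be:

# verify that the key packages import cleanly
import sklearn
import pydotplus
print(sklearn.__version__)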
I have to perform a decision tree analysis to test nonlinear relationships among a series of explanatory variables and a binary, categorical response variable. The data set comes from the National Longitudinal Study of Adolescent Health (AddHealth).
I will not complicate things here, so I am focusing on regular smoking (the TREG1 variable).
I chose a few variables to determine whether they can predict regular smoking:

predictor = dc[['HISPANIC','WHITE','BLACK','NAMERICAN','ASIAN']]
[Image: decision tree produced from the race/ethnicity predictors]
I therefore changed the predictor variables to just two, gender and age, and got this tree.
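The full script below still uses the race/ethnicity predictors; the two-variable run mentioned above is not shown, so here is a minimal sketch of it, assuming the gender and age columns in tree_addhealth.csv are named BIO_SEX and age (the column names are my assumption):

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

data = pd.read_csv("tree_addhealth.csv")
dc = data.dropna()

# two-predictor variant: gender and age (column names assumed, not confirmed by the post)
predictor = dc[['BIO_SEX', 'age']]
target = dc.TREG1

pr_train, pr_test, t_train, t_test = train_test_split(predictor, target, test_size=0.4)
classif = DecisionTreeClassifier().fit(pr_train, t_train)
tree.export_graphviz(classif, out_file='tree_gender_age.dot')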
My Python code is added below.
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 28 11:06:15 2022

@author: rfernandez
"""
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from sklearn.model_selection import train_test_split  # the old sklearn.cross_validation module no longer exists
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import tree
import pydotplus
import sklearn.metrics
#Load the dataset
data = pd.read_csv("tree_addhealth.csv")
dc = data.dropna()
dc.dtypes
dc.describe()

""" Modeling and Prediction """
#Split into training and testing sets
predictor = dc[['HISPANIC','WHITE','BLACK','NAMERICAN','ASIAN']]
target = dc.TREG1

pr_train, pr_test, t_train, t_test = train_test_split(predictor, target, test_size=0.4)
pr_train.shape
pr_test.shape
t_train.shape
t_test.shape
#Build model on training data
classif = DecisionTreeClassifier()
classif = classif.fit(pr_train, t_train)
pred = classif.predict(pr_test)

sklearn.metrics.confusion_matrix(t_test, pred)
sklearn.metrics.accuracy_score(t_test, pred)
#Export the classification tree
tree.export_graphviz(classif, out_file='tree_race.dot')
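The script imports pydotplus but converts the .dot file with the dot command from the installation notes instead. A minimal sketch of rendering the tree directly from Python, assuming the graphviz binaries are installed, might be:

from io import StringIO

# export the tree into an in-memory buffer and render it to PNG with pydotplus
out = StringIO()
tree.export_graphviz(classif, out_file=out)
graph = pydotplus.graph_from_dot_data(out.getvalue())
graph.write_png('tree_race.png')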