Week 4: Running a k-means Cluster Analysis
import pandas
import statistics
import numpy as np
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split  # the old sklearn.cross_validation module no longer exists
from sklearn import preprocessing
from sklearn.cluster import KMeans

# bug fix for display formats to avoid run time errors
pandas.set_option('display.float_format', lambda x: '%.2f' % x)

# load the data
data = pandas.read_csv('../separatedData.csv')

# convert to numeric format
data["breastCancer100th"] = pandas.to_numeric(data["breastCancer100th"], errors='coerce')
data["meanSugarPerson"] = pandas.to_numeric(data["meanSugarPerson"], errors='coerce')
data["meanFoodPerson"] = pandas.to_numeric(data["meanFoodPerson"], errors='coerce')
data["meanCholesterol"] = pandas.to_numeric(data["meanCholesterol"], errors='coerce')

# listwise deletion of missing values
sub1 = data[['breastCancer100th', 'meanFoodPerson', 'meanCholesterol', 'meanSugarPerson']].dropna()

# keep only the clustering variables
cluster = sub1[['meanSugarPerson', 'meanFoodPerson', 'meanCholesterol']]

# standardize predictors to have mean=0 and sd=1
clustervar = cluster.copy()
clustervar['meanSugarPerson'] = preprocessing.scale(clustervar['meanSugarPerson'].astype('float64'))
clustervar['meanFoodPerson'] = preprocessing.scale(clustervar['meanFoodPerson'].astype('float64'))
clustervar['meanCholesterol'] = preprocessing.scale(clustervar['meanCholesterol'].astype('float64'))

# split data into train and test sets - Train = 70%, Test = 30%
clus_train, clus_test = train_test_split(clustervar, test_size=.3, random_state=123)
To run the k-means cluster analysis we must standardize the predictors to have mean = 0 and standard deviation = 1. After that, we run nine analyses, starting with one cluster and adding one cluster per experiment.

# k-means cluster analysis for 1-9 clusters
from scipy.spatial.distance import cdist

clusters = range(1, 10)
meandist = []

for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(clus_train)
    clusassign = model.predict(clus_train)
    # average distance of each observation to its closest cluster centroid
    meandist.append(sum(np.min(cdist(clus_train, model.cluster_centers_, 'euclidean'), axis=1))
                    / clus_train.shape[0])

"""
Plot the average distance from the observations to the cluster centroids
to use the Elbow Method to identify the number of clusters to choose
"""
plt.plot(clusters, meandist)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method')
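The elbow plot only suggests a candidate k; to go one step further, a minimal sketch of fitting the chosen solution and inspecting the cluster means might look like this (k = 3 is an assumption for illustration, the actual value depends on the plot):

# fit the final k-means solution (k = 3 assumed for illustration)
model3 = KMeans(n_clusters=3)
model3.fit(clus_train)

# attach the cluster assignment to the training data
clus_train_labeled = clus_train.copy()
clus_train_labeled['cluster'] = model3.predict(clus_train)

# mean of each standardized variable per cluster
print(clus_train_labeled.groupby('cluster').mean())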
Assignment: Running a Lasso Regression Analysis
import pandas
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pylab as plt

CSV_PATH = 'gapminder.csv'
data = pandas.read_csv(CSV_PATH)
print('Total number of countries: {0}'.format(len(data)))
PREDICTORS = [
    'incomeperperson', 'alcconsumption', 'armedforcesrate', 'breastcancerper100th',
    'co2emissions', 'femaleemployrate', 'hivrate', 'internetuserate', 'polityscore',
    'relectricperperson', 'suicideper100th', 'employrate', 'urbanrate'
]

clean = data.copy()
for key in PREDICTORS + ['lifeexpectancy']:
    clean[key] = pandas.to_numeric(clean[key], errors='coerce')
clean = clean.dropna()

print('Countries remaining:', len(clean))
clean.head()
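The code above only cleans the data and never reaches the lasso itself. A minimal sketch of the lasso step, assuming lifeexpectancy as the response and scikit-learn's LassoLarsCV with standardized predictors, might look like this:

from sklearn.preprocessing import scale
from sklearn.linear_model import LassoLarsCV

# standardize predictors to mean=0 and sd=1 before fitting the lasso
predvar = clean[PREDICTORS].copy()
for key in PREDICTORS:
    predvar[key] = scale(predvar[key].astype('float64'))
target = clean['lifeexpectancy']

# 70/30 train/test split
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predvar, target, test_size=.3, random_state=123)

# lasso regression with 10-fold cross-validation to pick the penalty
model = LassoLarsCV(cv=10, precompute=False).fit(pred_train, tar_train)

# regression coefficients (predictors shrunk to 0 drop out of the model)
print(dict(zip(PREDICTORS, model.coef_)))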
Week 2: Running a Random Forest
import pandas
import sklearn.metrics
import statistics
import numpy as np
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split  # the old sklearn.cross_validation module no longer exists
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# bug fix for display formats to avoid run time errors
pandas.set_option('display.float_format', lambda x: '%.2f' % x)

# load the data
data = pandas.read_csv('separatedData.csv')

# convert to numeric format
data["breastCancer100th"] = pandas.to_numeric(data["breastCancer100th"], errors='coerce')
data["meanSugarPerson"] = pandas.to_numeric(data["meanSugarPerson"], errors='coerce')
data["meanFoodPerson"] = pandas.to_numeric(data["meanFoodPerson"], errors='coerce')
data["meanCholesterol"] = pandas.to_numeric(data["meanCholesterol"], errors='coerce')

# listwise deletion of missing values
sub1 = data[['breastCancer100th', 'meanFoodPerson', 'meanCholesterol', 'meanSugarPerson']].dropna()

# create a new binary variable, incidence_cancer, that splits countries at the mean incidence
meanIncidence = statistics.mean(sub1['breastCancer100th'])

def incidence_cancer(row):
    if row['breastCancer100th'] <= meanIncidence:
        return 0  # incidence of breast cancer is below the average of all countries
    if row['breastCancer100th'] > meanIncidence:
        return 1  # incidence of breast cancer is above the average of all countries

# add the new variable incidence_cancer to sub1
sub1['incidence_cancer'] = sub1.apply(lambda row: incidence_cancer(row), axis=1)
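A quick sanity check of the new response variable is worth doing before modelling; the same flag can also be built in one line with numpy. A small sketch:

# equivalent one-line construction of the binary response
sub1['incidence_cancer'] = np.where(sub1['breastCancer100th'] > meanIncidence, 1, 0)

# check how many countries fall in each class
print(sub1['incidence_cancer'].value_counts())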
The code below splits the data, fits the random forest, and evaluates it on the test set.
# split into training and testing sets
predictors = sub1[['meanSugarPerson', 'meanFoodPerson', 'meanCholesterol']]
targets = sub1['incidence_cancer']

# Train = 60%, Test = 40%
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.4)

# build model on training data
classifier = RandomForestClassifier(n_estimators=25)
classifier = classifier.fit(pred_train, tar_train)

predictions = classifier.predict(pred_test)

confusion_matrix = sklearn.metrics.confusion_matrix(tar_test, predictions)
accuracy_score = sklearn.metrics.accuracy_score(tar_test, predictions)
print(confusion_matrix)
print(accuracy_score)

# fit an Extra Trees model to the data
model = ExtraTreesClassifier()
model.fit(pred_train, tar_train)

# display the relative importance of each attribute
print(model.feature_importances_)
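To see how the accuracy depends on the number of trees, a sketch like the following could retrain the forest for 1 to 25 trees and plot the test accuracy:

# accuracy as a function of the number of trees (1 to 25)
trees = range(1, 26)
accuracy = np.zeros(len(trees))

for idx, n in enumerate(trees):
    classifier = RandomForestClassifier(n_estimators=n)
    classifier = classifier.fit(pred_train, tar_train)
    predictions = classifier.predict(pred_test)
    accuracy[idx] = sklearn.metrics.accuracy_score(tar_test, predictions)

plt.cla()
plt.plot(trees, accuracy)
plt.xlabel('Number of trees')
plt.ylabel('Test accuracy')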
Week 1 - Lab: Running a Classification Tree
Week 1: decision trees in Python. I am working on this as part of the peer-review assignment for the Coursera course “Machine Learning for Data Analysis” by Wesleyan University.
Installation on Ubuntu Linux.
sudo chmod +x Anaconda3-2022.10-Linux-x86_64.sh
./Anaconda3-2022.10-Linux-x86_64.sh
conda install scikit-learn
conda install -n my_environment scikit-learn
pip install scikit-learn   # the PyPI package is scikit-learn, not sklearn
pip install -U scikit-learn scipy matplotlib
sudo apt-get install graphviz
pip install pydotplus   # pydotplus is a Python package, not an apt package
conda create -c conda-forge -n spyder-env spyder numpy scipy pandas matplotlib sympy cython
conda create -c conda-forge -n spyder-env spyder
conda activate spyder-env
conda config --env --add channels conda-forge
conda config --env --set channel_priority strict
python -m pip install pydotplus
dot -Tpng tree.dot -o tree5.png
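To confirm the environment works before starting, a quick check from a Python prompt might be:

# verify that the key packages import cleanly
import sklearn
import pydotplus
print(sklearn.__version__)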
I have to perform a decision tree analysis to test nonlinear relationships among a series of explanatory variables and a binary, categorical response variable. The data set comes from the National Longitudinal Study of Adolescent Health (AddHealth).
I will not complicate things here, so I am focusing on regular smoking (the TREG1 variable).
I chose a few variables to determine whether they can predict regular smoking:

predictor = dc[['HISPANIC','WHITE','BLACK','NAMERICAN','ASIAN']]
[Image: decision tree produced from the race/ethnicity predictors]
I therefore changed the predictor variables to just two, gender and age, and got this tree.
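The full script below still uses the race/ethnicity predictors; the two-variable run mentioned above is not shown, so here is a minimal sketch of it, assuming the gender and age columns in tree_addhealth.csv are named BIO_SEX and age (the column names are my assumption):

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

data = pd.read_csv("tree_addhealth.csv")
dc = data.dropna()

# two-predictor variant: gender and age (column names assumed, not confirmed by the post)
predictor = dc[['BIO_SEX', 'age']]
target = dc.TREG1

pr_train, pr_test, t_train, t_test = train_test_split(predictor, target, test_size=0.4)
classif = DecisionTreeClassifier().fit(pr_train, t_train)
tree.export_graphviz(classif, out_file='tree_gender_age.dot')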
My Python code is added below.
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 28 11:06:15 2022

@author: rfernandez
"""
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from sklearn.model_selection import train_test_split  # the old sklearn.cross_validation module no longer exists
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import tree
import pydotplus
import sklearn.metrics
#Load the dataset
data = pd.read_csv("tree_addhealth.csv")
dc = data.dropna()
dc.dtypes
dc.describe()

""" Modeling and Prediction """
#Split into training and testing sets
predictor = dc[['HISPANIC','WHITE','BLACK','NAMERICAN','ASIAN']]
target = dc.TREG1

pr_train, pr_test, t_train, t_test = train_test_split(predictor, target, test_size=0.4)
pr_train.shape
pr_test.shape
t_train.shape
t_test.shape
#Build model on training data
classif = DecisionTreeClassifier()
classif = classif.fit(pr_train, t_train)
pred = classif.predict(pr_test)

sklearn.metrics.confusion_matrix(t_test, pred)
sklearn.metrics.accuracy_score(t_test, pred)
#Export the classification tree
tree.export_graphviz(classif, out_file='tree_race.dot')
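The script imports pydotplus but converts the .dot file with the dot command from the installation notes instead. A minimal sketch of rendering the tree directly from Python, assuming the graphviz binaries are installed, might be:

from io import StringIO

# export the tree into an in-memory buffer and render it to PNG with pydotplus
out = StringIO()
tree.export_graphviz(classif, out_file=out)
graph = pydotplus.graph_from_dot_data(out.getvalue())
graph.write_png('tree_race.png')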