decision-tree-for-coursera
Dwaipayan Mitra: k-means clustering

import numpy as np
import random as rd

class Kmeans:
    def __init__(self, X, K):
        self.X = X
        self.Output = {}
        self.Centroids = np.array([]).reshape(self.X.shape[1], 0)
        self.K = K
        self.m = self.X.shape[0]

    def kmeanspp(self, X, K):
        # k-means++ seeding: pick the first centroid at random, then pick each
        # following centroid with probability proportional to its squared
        # distance to the nearest centroid chosen so far.
        i = rd.randint(0, X.shape[0] - 1)   # randint is inclusive, so use shape[0]-1
        Centroid_temp = np.array([X[i]])
        for k in range(1, K):
            D = np.array([])
            for x in X:
                # squared distance to the closest of the centroids chosen so far
                D = np.append(D, np.min(np.sum((x - Centroid_temp) ** 2, axis=1)))
            prob = D / np.sum(D)
            cummulative_prob = np.cumsum(prob)
            r = rd.random()
            i = 0
            for j, p in enumerate(cummulative_prob):
                if r < p:
                    i = j
                    break
            Centroid_temp = np.append(Centroid_temp, [X[i]], axis=0)
        return Centroid_temp.T

    def fit(self, n_iter):
        # initialize the centroids with k-means++
        self.Centroids = self.kmeanspp(self.X, self.K)
        """for i in range(self.K):
            rand = rd.randint(0, self.m - 1)
            self.Centroids = np.c_[self.Centroids, self.X[rand]]"""
        # compute Euclidean distances and assign clusters
        for n in range(n_iter):
            EuclidianDistance = np.array([]).reshape(self.m, 0)
            for k in range(self.K):
                tempDist = np.sum((self.X - self.Centroids[:, k]) ** 2, axis=1)
                EuclidianDistance = np.c_[EuclidianDistance, tempDist]
            C = np.argmin(EuclidianDistance, axis=1) + 1
            # adjust the centroids
            Y = {}
            for k in range(self.K):
                Y[k + 1] = np.array([]).reshape(self.X.shape[1], 0)
            for i in range(self.m):
                Y[C[i]] = np.c_[Y[C[i]], self.X[i]]
            for k in range(self.K):
                Y[k + 1] = Y[k + 1].T
            for k in range(self.K):
                self.Centroids[:, k] = np.mean(Y[k + 1], axis=0)
        self.Output = Y

    def predict(self):
        return self.Output, self.Centroids.T

    def WCSS(self):
        wcss = 0
        for k in range(self.K):
            wcss += np.sum((self.Output[k + 1] - self.Centroids[:, k]) ** 2)
        return wcss
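A minimal usage sketch for the class above. The two-dimensional synthetic blobs, the choice of K=3 and the 10 iterations are illustrative assumptions, not part of the original post.

# Illustrative usage of the Kmeans class defined above (synthetic 2-D data).
import numpy as np

np.random.seed(0)
X_demo = np.vstack([np.random.randn(100, 2) + [0, 0],
                    np.random.randn(100, 2) + [5, 5],
                    np.random.randn(100, 2) + [0, 5]])

km = Kmeans(X_demo, K=3)          # three clusters
km.fit(n_iter=10)                 # ten assignment/update iterations
clusters, centroids = km.predict()
print("Centroids:\n", centroids)
print("Within-cluster sum of squares:", km.WCSS())

Running WCSS() for several values of K and looking for the "elbow" is the usual way to pick the number of clusters with this kind of implementation.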
Dwaipayan Mitra: Running a Lasso Regression Analysis
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict, KFold
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
plt.style.use('seaborn')
file = pd.read_csv('Final.csv', sep=';')
df = pd.DataFrame(file)
def preprocessing(data):
    # First-stage pre-processing: exclude non-numeric prices and convert prices to numeric variables
    data = data[data.price_new.str.contains("aanvraag") == False].dropna()
    data[['price_new']] = data[['price_new']].apply(pd.to_numeric)
    # Exclude everything with a price above or below 3 standard deviations (i.e. outliers)
    data = data[np.abs(data["price_new"] - data["price_new"].mean()) <= (3 * data["price_new"].std())]
    # Set X and y (dropping zipcode and rooms, as latitude, longitude and surface pretty much capture the former)
    y = data.price_new
    X = data.drop('price_new', axis=1).drop('zipcode_new', axis=1).drop('rooms_new', axis=1)
    return X, y
def model(pipeline, parameters, X_train, y_train, X, y):
    grid_obj = GridSearchCV(estimator=pipeline,
                            param_grid=parameters,
                            cv=3,
                            scoring='r2',
                            verbose=2,
                            n_jobs=1,
                            refit=True)
    grid_obj.fit(X_train, y_train)

    '''Results'''
    results = pd.DataFrame(grid_obj.cv_results_)
    results_sorted = results.sort_values(by=['mean_test_score'], ascending=False)
    print("##### Results")
    print(results_sorted)
    print("best_index", grid_obj.best_index_)
    print("best_score", grid_obj.best_score_)
    print("best_params", grid_obj.best_params_)

    '''Cross Validation'''
    estimator = grid_obj.best_estimator_
    '''
    if estimator.named_steps['scl'] == True:
        X = (X - X.mean()) / (X.std())
        y = (y - y.mean()) / (y.std())
    '''
    shuffle = KFold(n_splits=5, shuffle=True, random_state=0)
    cv_scores = cross_val_score(estimator, X, y.values.ravel(), cv=shuffle, scoring='r2')
    print("##### CV Results")
    print("mean_score", cv_scores.mean())

    '''Show model coefficients or feature importances'''
    try:
        print("Model coefficients: ", list(zip(list(X), estimator.named_steps['clf'].coef_)))
    except:
        print("Model does not support model coefficients")
    try:
        print("Feature importances: ", list(zip(list(X), estimator.named_steps['clf'].feature_importances_)))
    except:
        print("Model does not support feature importances")

    '''Predict along CV and plot y vs. y_predicted in scatter'''
    y_pred = cross_val_predict(estimator, X, y, cv=shuffle)
    plt.scatter(y, y_pred)
    xmin, xmax = plt.xlim()
    ymin, ymax = plt.ylim()
    plt.plot([xmin, xmax], [ymin, ymax], "g--", lw=1, alpha=0.4)
    plt.xlabel("True prices")
    plt.ylabel("Predicted prices")
    plt.annotate(' R-squared CV = {}'.format(round(float(cv_scores.mean()), 3)),
                 size=9, xy=(xmin, ymax), xytext=(10, -15), textcoords='offset points')
    plt.annotate(grid_obj.best_params_, size=9, xy=(xmin, ymax),
                 xytext=(10, -35), textcoords='offset points', wrap=True)
    plt.title('Predicted prices (EUR) vs. True prices (EUR)')
    plt.show()
Pipeline and Parameters - Linear Regression
pipe_ols = Pipeline([('scl', StandardScaler()), ('clf', LinearRegression())])
param_ols = {}
Pipeline and Parameters - XGBoost
pipe_xgb = Pipeline([('clf', xgb.XGBRegressor())])
param_xgb = {'clf__max_depth':[5], 'clf__min_child_weight':[6], 'clf__gamma':[0.01], 'clf__subsample':[0.7], 'clf__colsample_bytree':[1]}
Pipeline and Parameters - KNN
pipe_knn = Pipeline([('clf', KNeighborsRegressor())])
param_knn = {'clf__n_neighbors':[5, 10, 15, 25, 30]}
Pipeline and Parameters - Lasso
pipe_lasso = Pipeline([('scl', StandardScaler()), ('clf', Lasso(max_iter=1500))])
param_lasso = {'clf__alpha': [0.01, 0.1, 1, 10]}
Pipeline and Parameters - Ridge
pipe_ridge = Pipeline([('scl', StandardScaler()), ('clf', Ridge())])
param_ridge = {'clf__alpha': [0.01, 0.1, 1, 10]}
Pipeline and Parameters - Polynomial Regression
pipe_poly = Pipeline([('scl', StandardScaler()), ('polynomial', PolynomialFeatures()), ('clf', LinearRegression())])
param_poly = {'polynomial__degree': [2, 4, 6]}
Pipeline and Parameters - Decision Tree Regression
pipe_tree = Pipeline([('clf', DecisionTreeRegressor())])
param_tree = {'clf__max_depth': [2, 5, 10], 'clf__min_samples_leaf': [5,10,50,100]}
Pipeline and Parameters - Random Forest
pipe_forest = Pipeline([('clf', RandomForestRegressor())])
param_forest = {'clf__n_estimators': [10, 20, 50], 'clf__max_features': [None, 1, 2], 'clf__max_depth': [1, 2, 5]}
Pipeline and Parameters - MLP Regression
pipe_neural = Pipeline([('scl', StandardScaler()), ('clf', MLPRegressor())])
param_neural = {'clf__alpha': [0.001, 0.01, 0.1, 1, 10, 100], 'clf__hidden_layer_sizes': [(5),(10,10),(7,7,7)], 'clf__solver': ['lbfgs'], 'clf__activation': ['relu', 'tanh'], 'clf__learning_rate' : ['constant', 'invscaling']}
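A note on the parameter grids above: the double-underscore names such as 'clf__alpha' are scikit-learn's way of addressing a parameter of a named pipeline step from GridSearchCV. A minimal sketch of the convention, using toy data that is purely illustrative and not part of this analysis:

# Sketch of the 'step__parameter' convention used in the grids above (toy data).
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([('scl', StandardScaler()), ('clf', Ridge())])
# 'clf__alpha' reaches the 'alpha' parameter of the step named 'clf'.
grid = GridSearchCV(pipe, {'clf__alpha': [0.1, 1, 10]}, cv=3, scoring='r2')

X_toy = np.random.rand(60, 3)                                  # illustrative data
y_toy = X_toy @ np.array([1.0, -2.0, 0.5]) + 0.1 * np.random.randn(60)
grid.fit(X_toy, y_toy)
print(grid.best_params_)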
Execute preprocessing & train/test split
X, y = preprocessing(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
Execute model hyperparameter tuning and cross-validation
model(pipe_ols, param_ols, X_train, y_train, X, y)
model(pipe_xgb, param_xgb, X_train, y_train, X, y)
model(pipe_knn, param_knn, X_train, y_train, X, y)
model(pipe_lasso, param_lasso, X_train, y_train, X, y)
model(pipe_ridge, param_ridge, X_train, y_train, X, y)
model(pipe_poly, param_poly, X_train, y_train, X, y)
model(pipe_tree, param_tree, X_train, y_train, X, y)
model(pipe_forest, param_forest, X_train, y_train, X, y)
model(pipe_neural, param_neural, X_train, y_train, X, y)
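Since the assignment focuses on the Lasso, one optional follow-up (a sketch, not part of the original analysis) is to refit the tuned Lasso pipeline and list which coefficients were shrunk to exactly zero. It assumes pipe_lasso, param_lasso and X_train exist exactly as defined above:

# Optional sketch: inspect Lasso sparsity after the grid search above.
from sklearn.model_selection import GridSearchCV

lasso_grid = GridSearchCV(pipe_lasso, param_lasso, cv=3, scoring='r2').fit(X_train, y_train)
coefs = lasso_grid.best_estimator_.named_steps['clf'].coef_
for name, coef in zip(X_train.columns, coefs):
    flag = ' (dropped by Lasso)' if coef == 0 else ''
    print('{}: {:.4f}{}'.format(name, coef, flag))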
Running a Random Forest (Dwaipayan Mitra)
Task
The second assignment deals with random forests. Random forests are predictive models that allow for a data-driven exploration of many explanatory variables in predicting a response or target variable. Random forests provide importance scores for each explanatory variable and also allow you to evaluate how correct classification changes as smaller or larger numbers of trees are grown.
Run a Random Forest.
You will need to perform a random forest analysis to evaluate the importance of a series of explanatory variables in predicting a binary, categorical response variable.
Data
The dataset is related to red variants of the Portuguese "Vinho Verde" wine. Due to privacy and logistic issues, only physicochemical (inputs) and sensory (the output) variables are available (e.g. there is no data about grape types, wine brand, wine selling price, etc.).
The classes are ordered and not balanced (e.g. there are many more normal wines than excellent or poor ones). Outlier detection algorithms could be used to detect the few excellent or poor wines. Also, we are not sure that all input variables are relevant, so it could be interesting to test feature selection methods, as sketched after the classifier results below.
The dataset can be found at the UCI Machine Learning Repository.
Attribute Information (For more information, read [Cortez et al., 2009]): Input variables (based on physicochemical tests):
1 - fixed acidity
2 - volatile acidity
3 - citric acid
4 - residual sugar
5 - chlorides
6 - free sulfur dioxide
7 - total sulfur dioxide
8 - density
9 - pH
10 - sulphates
11 - alcohol
Output variable (based on sensory data):
12 - quality (score between 0 and 10)
Results
Random forest and ExtraTrees classifier were deployed to evaluate the importance of a series of explanatory variables in predicting a categorical response variable - red wine quality (score between 0 and 10). The following explanatory variables were included: fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates and alcohol.
The explanatory variables with the highest importance scores (according to both classifiers) are alcohol, volatile acidity and sulphates. The accuracy of the Random Forest and ExtraTrees classifiers is about 67%, which is quite good for classes that are highly unbalanced and hard to distinguish from each other. Growing multiple trees rather than a single tree adds a lot to the overall score of the model. For the Random Forest the number of estimators is 20, while for the ExtraTrees classifier it is 12, because the second classifier reaches its best accuracy with fewer trees.
Code
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import accuracy_score
import seaborn as sns
%matplotlib inline

rnd_state = 4536
In [2]:
data = pd.read_csv('Data\winequality-red.csv', sep=';')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity           1599 non-null float64
volatile acidity        1599 non-null float64
citric acid             1599 non-null float64
residual sugar          1599 non-null float64
chlorides               1599 non-null float64
free sulfur dioxide     1599 non-null float64
total sulfur dioxide    1599 non-null float64
density                 1599 non-null float64
pH                      1599 non-null float64
sulphates               1599 non-null float64
alcohol                 1599 non-null float64
quality                 1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
In [3]:data.head()
Out[3]:
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  alcohol  quality
0            7.4              0.70         0.00             1.9      0.076                 11.0                  34.0   0.9978  3.51       0.56      9.4        5
1            7.8              0.88         0.00             2.6      0.098                 25.0                  67.0   0.9968  3.20       0.68      9.8        5
2            7.8              0.76         0.04             2.3      0.092                 15.0                  54.0   0.9970  3.26       0.65      9.8        5
3           11.2              0.28         0.56             1.9      0.075                 17.0                  60.0   0.9980  3.16       0.58      9.8        6
4            7.4              0.70         0.00             1.9      0.076                 11.0                  34.0   0.9978  3.51       0.56      9.4        5
In [4]:data.describe()
Out[4]:
                       count       mean        std      min     25%     50%       75%      max
fixed acidity         1599.0   8.319637   1.741096  4.60000  7.1000  7.9000  9.200000   15.900
volatile acidity      1599.0   0.527821   0.179060  0.12000  0.3900  0.5200  0.640000    1.580
citric acid           1599.0   0.270976   0.194801  0.00000  0.0900  0.2600  0.420000    1.000
residual sugar        1599.0   2.538806   1.409928  0.90000  1.9000  2.2000  2.600000   15.500
chlorides             1599.0   0.087467   0.047065  0.01200  0.0700  0.0790  0.090000    0.611
free sulfur dioxide   1599.0  15.874922  10.460157  1.00000  7.0000 14.0000 21.000000   72.000
total sulfur dioxide  1599.0  46.467792  32.895324  6.00000 22.0000 38.0000 62.000000  289.000
density               1599.0   0.996747   0.001887  0.99007  0.9956  0.99675  0.997835    1.00369
pH                    1599.0   3.311113   0.154386  2.74000  3.2100  3.3100  3.400000    4.010
sulphates             1599.0   0.658149   0.169507  0.33000  0.5500  0.6200  0.730000    2.000
alcohol               1599.0  10.422983   1.065668  8.40000  9.5000 10.2000 11.100000   14.900
quality               1599.0   5.636023   0.807569  3.00000  5.0000  6.0000  6.000000    8.000
Plots
For visualization purposes, the number of dimensions was reduced to two by applying the MDS method with cosine distance. The plot illustrates that our classes are not clearly separated into distinct groups.
In [5]:
model = MDS(random_state=rnd_state, n_components=2, dissimilarity='precomputed')
%time representation = model.fit_transform(pairwise_distances(data.iloc[:, :11], metric='cosine'))

Wall time: 38.7 s
In [6]:
colors = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
plt.figure(figsize=(12, 4))
plt.subplot(121)
# Map each wine's quality score (3-8) to one of the six colours
plt.scatter(representation[:, 0], representation[:, 1], c=[colors[q - 3] for q in data.quality])
plt.subplot(122)
sns.countplot(x='quality', data=data, palette=sns.color_palette(colors));
Moreover, our classes are highly unbalanced, so we should add the parameter class_weight='balanced' to our classifiers.
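What class_weight='balanced' does can be sketched explicitly: each class receives the weight n_samples / (n_classes * count(class)), so the rare quality scores contribute more to the split criterion. A small illustration, assuming the `data` frame loaded above:

# Illustration of the weights implied by class_weight='balanced'.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(data.quality)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=data.quality)
print(dict(zip(classes, np.round(weights, 2))))   # rare scores (3, 4, 8) get large weights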
In [7]:
predictors = data.iloc[:, :11]
target = data.quality
In [8]:(predictors_train, predictors_test, target_train, target_test) = train_test_split(predictors, target, test_size = .3, random_state = rnd_state)
RandomForest classifier
In [9]:
list_estimators = list(range(1, 50, 5))
rf_scoring = []
for n_estimators in list_estimators:
    classifier = RandomForestClassifier(random_state=rnd_state, n_jobs=-1,
                                        class_weight='balanced', n_estimators=n_estimators)
    score = cross_val_score(classifier, predictors_train, target_train, cv=5, n_jobs=-1, scoring='accuracy')
    rf_scoring.append(score.mean())
In [10]:
plt.plot(list_estimators, rf_scoring)
plt.title('Accuracy VS trees number');
In [11]:
classifier = RandomForestClassifier(random_state=rnd_state, n_jobs=-1,
                                    class_weight='balanced', n_estimators=20)
classifier.fit(predictors_train, target_train)
Out[11]:RandomForestClassifier(bootstrap=True, class_weight='balanced', criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1, oob_score=False, random_state=4536, verbose=0, warm_start=False)
In [12]:prediction = classifier.predict(predictors_test)
In [13]:
print('Confusion matrix:\n', pd.crosstab(target_test, prediction, colnames=['Predicted'], rownames=['Actual'], margins=True))
print('\nAccuracy: ', accuracy_score(target_test, prediction))

Confusion matrix:
Predicted  3  4    5    6   7  All
Actual
3          0  0    3    0   0    3
4          0  1    9    6   0   16
5          2  1  166   41   3  213
6          0  0   46  131  14  191
7          0  0    5   25  23   53
8          0  0    0    3   1    4
All        2  2  229  206  41  480

Accuracy:  0.66875
In [14]:
feature_importance = pd.Series(classifier.feature_importances_,
                               index=data.columns.values[:11]).sort_values(ascending=False)
feature_importance
Out[14]:
volatile acidity        0.133023
alcohol                 0.130114
sulphates               0.129498
citric acid             0.106427
total sulfur dioxide    0.094647
chlorides               0.086298
density                 0.079843
pH                      0.066566
residual sugar          0.061344
fixed acidity           0.058251
free sulfur dioxide     0.053990
dtype: float64
In [15]:
et_scoring = []
for n_estimators in list_estimators:
    classifier = ExtraTreesClassifier(random_state=rnd_state, n_jobs=-1,
                                      class_weight='balanced', n_estimators=n_estimators)
    score = cross_val_score(classifier, predictors_train, target_train, cv=5, n_jobs=-1, scoring='accuracy')
    et_scoring.append(score.mean())
In [16]:
plt.plot(list_estimators, et_scoring)
plt.title('Accuracy VS trees number');
In [17]:
classifier = ExtraTreesClassifier(random_state=rnd_state, n_jobs=-1,
                                  class_weight='balanced', n_estimators=12)
classifier.fit(predictors_train, target_train)
Out[17]:ExtraTreesClassifier(bootstrap=False, class_weight='balanced', criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=12, n_jobs=-1, oob_score=False, random_state=4536, verbose=0, warm_start=False)
In [18]:prediction = classifier.predict(predictors_test)
In [19]:
print('Confusion matrix:\n', pd.crosstab(target_test, prediction, colnames=['Predicted'], rownames=['Actual'], margins=True))
print('\nAccuracy: ', accuracy_score(target_test, prediction))

Confusion matrix:
Predicted  3  4    5    6   7  8  All
Actual
3          0  1    2    0   0  0    3
4          0  0    9    7   0  0   16
5          2  2  168   39   2  0  213
6          0  0   49  130  11  1  191
7          0  0    2   27  24  0   53
8          0  0    0    3   1  0    4
All        2  3  230  206  38  1  480

Accuracy:  0.6708333333333333
In [20]:
feature_importance = pd.Series(classifier.feature_importances_,
                               index=data.columns.values[:11]).sort_values(ascending=False)
feature_importance
Out[20]:
alcohol                 0.157267
volatile acidity        0.132768
sulphates               0.100874
citric acid             0.095077
density                 0.082334
chlorides               0.079283
total sulfur dioxide    0.076803
pH                      0.074638
fixed acidity           0.069826
residual sugar          0.066551
free sulfur dioxide     0.064579
dtype: float64
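As noted in the Data section, feature selection methods could also be tested. A hedged sketch of one such test (not part of the original assignment) that reuses the fitted ExtraTrees classifier from In [17] and the train/test splits defined above:

# Optional sketch: simple feature selection using the fitted ExtraTrees importances.
from sklearn.feature_selection import SelectFromModel

selector = SelectFromModel(classifier, prefit=True, threshold='median')
selected = predictors_train.columns[selector.get_support()]
print('Selected features:', list(selected))

reduced_train = selector.transform(predictors_train)   # keep only the selected columns
reduced_test = selector.transform(predictors_test)
print('Reduced shape:', reduced_train.shape)

The reduced feature matrices could then be fed back into the same cross-validation loop to see whether dropping the weakest variables hurts accuracy.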
Decision Tree
Task
This week's assignment involves decision trees, and more specifically, classification trees. Decision trees are predictive models that allow for a data-driven exploration of nonlinear relationships and interactions among many explanatory variables in predicting a response or target variable. When the response variable is categorical (two levels), the model is called a classification tree. Explanatory variables can be quantitative, categorical or both. Decision trees create segmentations or subgroups in the data by applying a series of simple rules or criteria over and over again, choosing the variable constellations that best predict the response (i.e. target) variable.
Run a Classification Tree.
You will need to perform a decision tree analysis to test nonlinear relationships among a series of explanatory variables and a binary, categorical response variable.
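The "simple rules" mentioned above are threshold tests chosen to minimise an impurity criterion (Gini impurity by default in scikit-learn). A hand-rolled sketch of scoring one candidate split on toy, purely illustrative data, just to make the idea concrete:

# Illustrative only: how a single candidate split "feature <= threshold" is scored.
import numpy as np

def gini(labels):
    # Gini impurity = 1 - sum(p_k^2) over the class proportions p_k
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

def split_score(feature, labels, threshold):
    # Weighted impurity of the two child nodes produced by the split
    left, right = labels[feature <= threshold], labels[feature > threshold]
    n = len(labels)
    return (len(left) / n) * gini(left) + (len(right) / n) * gini(right)

feature = np.array([0.1, 0.2, 0.3, 0.8, 0.9, 1.0])   # toy feature values
labels  = np.array([0,   0,   0,   1,   1,   0])     # toy binary labels
for t in [0.25, 0.55, 0.85]:
    print('threshold', t, '-> weighted Gini =', round(split_score(feature, labels, t), 3))

The tree-growing algorithm evaluates many such thresholds across all features and keeps the split with the lowest weighted impurity, then repeats the process in each child node.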
Data
Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass [K. P. Bennett and O. L. Mangasarian: "Robust Linear Programming Discrimination of Two Linearly Inseparable Sets", Optimization Methods and Software 1, 1992, 23-34].
The dataset can be found at the UCI Machine Learning Repository.
In this assignment a decision tree is applied to the classification task of breast cancer detection.
Attribute Information:
id - ID number
diagnosis (M = malignant, B = benign)
3-32 - thirty real-valued features (the mean, standard error and "worst" value of each of the ten cell-nucleus measurements below)
Ten real-valued features are computed for each cell nucleus:
a) radius (mean of distances from center to points on the perimeter)
b) texture (standard deviation of gray-scale values)
c) perimeter
d) area
e) smoothness (local variation in radius lengths)
f) compactness (perimeter^2 / area - 1.0)
g) concavity (severity of concave portions of the contour)
h) concave points (number of concave portions of the contour)
i) symmetry
j) fractal dimension ("coastline approximation" - 1)
All feature values are recoded with four significant digits. Missing attribute values: none. Class distribution: 357 benign, 212 malignant.
Results
The generated decision tree can be found below:
[In [17]: img - rendering of the fitted tree from output.png; image not reproduced here]
Decision tree analysis was performed to test nonlinear relationships among a series of explanatory variables and a binary, categorical response variable (breast cancer diagnosis: malignant or benign).
The dataset was split into train and test samples in a 70/30 ratio.
After fitting the classifier, the key metrics were calculated: the confusion matrix and accuracy = 0.924. This is a good result for a model trained on a small dataset.
From the decision tree we can observe:
Malignant tumors tend to have much larger affected areas, texture and concave points, while the benign tumors' values for these characteristics are significantly lower.
The most important features are:
concave points_worst = 0.707688
area_worst = 0.114771
concave points_mean = 0.034234
fractal_dimension_se = 0.026301
texture_worst = 0.026300
area_se = 0.025201
concavity_se = 0.024540
texture_mean = 0.023671
perimeter_mean = 0.010415
concavity_mean = 0.006880
Code
In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from io import StringIO
from IPython.display import Image
import pydotplus
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
%matplotlib inline

rnd_state = 23468
Load data
In [2]:
data = pd.read_csv('Data/breast_cancer.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
id                         569 non-null int64
diagnosis                  569 non-null object
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non-null float64
concave points_se          569 non-null float64
symmetry_se                569 non-null float64
fractal_dimension_se       569 non-null float64
radius_worst               569 non-null float64
texture_worst              569 non-null float64
perimeter_worst            569 non-null float64
area_worst                 569 non-null float64
smoothness_worst           569 non-null float64
compactness_worst          569 non-null float64
concavity_worst            569 non-null float64
concave points_worst       569 non-null float64
symmetry_worst             569 non-null float64
fractal_dimension_worst    569 non-null float64
Unnamed: 32                0 non-null float64
dtypes: float64(31), int64(1), object(1)
memory usage: 146.8+ KB
In the output above there is an empty column 'Unnamed: 32', which should be dropped next.
In [3]:
data.drop('Unnamed: 32', axis=1, inplace=True)
data.diagnosis = np.where(data.diagnosis == 'M', 1, 0)  # Encode diagnosis as binary (M=1, B=0)
data.describe()
Out[3]:
                         count          mean           std        min         25%        50%         75%          max
id                       569.0  3.037183e+07  1.250206e+08  8670.0000  869218.000  906024.00  8813129.00  911320500.0
diagnosis                569.0      0.372583      0.483918     0.0000       0.0000     0.0000      1.0000       1.0000
radius_mean              569.0     14.127292      3.524049     6.9810      11.7000    13.3700     15.7800      28.1100
texture_mean             569.0     19.289649      4.301036     9.7100      16.1700    18.8400     21.8000      39.2800
perimeter_mean           569.0     91.969033     24.298981    43.7900      75.1700    86.2400    104.1000     188.5000
area_mean                569.0    654.889104    351.914129   143.5000     420.3000   551.1000    782.7000    2501.0000
smoothness_mean          569.0      0.096360      0.014064     0.0526       0.0864     0.0959      0.1053       0.1634
compactness_mean         569.0      0.104341      0.052813     0.0194       0.0649     0.0926      0.1304       0.3454
concavity_mean           569.0      0.088799      0.079720     0.0000       0.0296     0.0615      0.1307       0.4268
concave points_mean      569.0      0.048919      0.038803     0.0000       0.0203     0.0335      0.0740       0.2012
...                        ...           ...           ...        ...         ...        ...         ...          ...
radius_worst             569.0     16.269190      4.833242     7.9300      13.0100    14.9700     18.7900      36.0400
texture_worst            569.0     25.677223      6.146258    12.0200      21.0800    25.4100     29.7200      49.5400
perimeter_worst          569.0    107.261213     33.602542    50.4100      84.1100    97.6600    125.4000     251.2000
area_worst               569.0    880.583128    569.356993   185.2000     515.3000   686.5000   1084.0000    4254.0000
smoothness_worst         569.0      0.132369      0.022832     0.0712       0.1166     0.1313      0.1460       0.2226
compactness_worst        569.0      0.254265      0.157336     0.0273       0.1472     0.2119      0.3391       1.0580
concavity_worst          569.0      0.272188      0.208624     0.0000       0.1145     0.2267      0.3829       1.2520
concave points_worst     569.0      0.114606      0.065732     0.0000       0.0649     0.0999      0.1614       0.2910
symmetry_worst           569.0      0.290076      0.061867     0.1565       0.2504     0.2822      0.3179       0.6638
fractal_dimension_worst  569.0      0.083946      0.018061     0.0550       0.0715     0.0800      0.0921       0.2075
8 rows × 32 columns
In [4]:data.head()
Out[4]:
                               0          1          2          3          4
id                        842302     842517   84300903   84348301   84358402
diagnosis                      1          1          1          1          1
radius_mean                17.99      20.57      19.69      11.42      20.29
texture_mean               10.38      17.77      21.25      20.38      14.34
perimeter_mean            122.80     132.90     130.00      77.58     135.10
area_mean                 1001.0     1326.0     1203.0      386.1     1297.0
smoothness_mean          0.11840    0.08474    0.10960    0.14250    0.10030
compactness_mean         0.27760    0.07864    0.15990    0.28390    0.13280
concavity_mean            0.3001     0.0869     0.1974     0.2414     0.1980
concave points_mean      0.14710    0.07017    0.12790    0.10520    0.10430
...                          ...        ...        ...        ...        ...
radius_worst               25.38      24.99      23.57      14.91      22.54
texture_worst              17.33      23.41      25.53      26.50      16.67
perimeter_worst           184.60     158.80     152.50      98.87     152.20
area_worst                2019.0     1956.0     1709.0      567.7     1575.0
smoothness_worst          0.1622     0.1238     0.1444     0.2098     0.1374
compactness_worst         0.6656     0.1866     0.4245     0.8663     0.2050
concavity_worst           0.7119     0.2416     0.4504     0.6869     0.4000
concave points_worst      0.2654     0.1860     0.2430     0.2575     0.1625
symmetry_worst            0.4601     0.2750     0.3613     0.6638     0.2364
fractal_dimension_worst   0.11890    0.08902    0.08758    0.17300    0.07678
5 rows × 32 columns
Plots
For visualization purposes, the number of dimensions was reduced to two by applying the t-SNE method. The plot illustrates that our classes are not clearly divided into two parts, so a nonlinear method (like a decision tree) may handle this problem well.
In [15]:
model = TSNE(random_state=rnd_state, n_components=2)
representation = model.fit_transform(data.iloc[:, 2:])
In [16]:
plt.scatter(representation[:, 0], representation[:, 1], c=data.diagnosis,
            alpha=0.5, cmap=plt.cm.get_cmap('Set1', 2))
plt.colorbar(ticks=range(2));
Decision tree
In [6]:
predictors = data.iloc[:, 2:]
target = data.diagnosis
To train a decision tree, the dataset was split into train and test samples in a 70/30 proportion.
In [7]:(predictors_train, predictors_test, target_train, target_test) = train_test_split(predictors, target, test_size = .3, random_state = rnd_state)
In [8]:
print('predictors_train:', predictors_train.shape)
print('predictors_test:', predictors_test.shape)
print('target_train:', target_train.shape)
print('target_test:', target_test.shape)

predictors_train: (398, 30)
predictors_test: (171, 30)
target_train: (398,)
target_test: (171,)
In [9]:
print(np.sum(target_train == 0))
print(np.sum(target_train == 1))

253
145
Our train sample is quite balanced, so there is no need to balance it.
In [10]:classifier = DecisionTreeClassifier(random_state = rnd_state).fit(predictors_train, target_train)
In [11]:prediction = classifier.predict(predictors_test)
In [12]:
print('Confusion matrix:\n', pd.crosstab(target_test, prediction, rownames=['Actual'], colnames=['Predicted'], margins=True))
print('\nAccuracy: ', accuracy_score(target_test, prediction))

Confusion matrix:
Predicted    0   1  All
Actual
0           96   8  104
1            5  62   67
All        101  70  171

Accuracy:  0.9239766081871345
In [13]:
out = StringIO()
tree.export_graphviz(classifier, out_file=out,
                     feature_names=predictors_train.columns.values,
                     proportion=True, filled=True)
graph = pydotplus.graph_from_dot_data(out.getvalue())
img = Image(data=graph.create_png())
with open('output.png', 'wb') as f:
    f.write(img.data)
In [14]:
feature_importance = pd.Series(classifier.feature_importances_,
                               index=data.columns.values[2:]).sort_values(ascending=False)
feature_importance
Out[14]:
concave points_worst       0.707688
area_worst                 0.114771
concave points_mean        0.034234
fractal_dimension_se       0.026301
texture_worst              0.026300
area_se                    0.025201
concavity_se               0.024540
texture_mean               0.023671
perimeter_mean             0.010415
concavity_mean             0.006880
fractal_dimension_worst    0.000000
fractal_dimension_mean     0.000000
symmetry_mean              0.000000
compactness_mean           0.000000
texture_se                 0.000000
smoothness_mean            0.000000
area_mean                  0.000000
radius_se                  0.000000
smoothness_se              0.000000
perimeter_se               0.000000
symmetry_worst             0.000000
compactness_se             0.000000
concave points_se          0.000000
symmetry_se                0.000000
radius_worst               0.000000
perimeter_worst            0.000000
smoothness_worst           0.000000
compactness_worst          0.000000
concavity_worst            0.000000
radius_mean                0.000000
dtype: float64