To showcase and practice machine learning techniques and visualizations.
library(data.table) #fread()
library(ggplot2)
library(dplyr) #The usual
library(caret) #train()
library(rsample) #initial_split() vfold_cv()
library(purrr) #map()
library(Metrics) #recall() mae()
library(ranger) #ranger()
library(glmnet) #glmnet()
library(broom) #tidy()
library(ggthemes) #theme_
Source: The data comes from Kaggle, where a user subset the data from the UCI Machine Learning Repository.
heart <- fread("heart.csv", header = TRUE)
heart.c <- fread("heart.csv", header = TRUE)
glimpse(heart)
Observations: 1,025
Variables: 14
$ age <int> 52, 53, 70, 61, 62, 58, 58, 55, 46, 54, 71, 43, 34, 5...
$ sex <int> 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1,...
$ cp <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0,...
$ trestbps <int> 125, 140, 145, 148, 138, 100, 114, 160, 120, 122, 112...
$ chol <int> 212, 203, 174, 203, 294, 248, 318, 289, 249, 286, 149...
$ fbs <int> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,...
$ restecg <int> 1, 0, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,...
$ thalach <int> 168, 155, 125, 161, 106, 122, 140, 145, 144, 116, 125...
$ exang <int> 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,...
$ oldpeak <dbl> 1.0, 3.1, 2.6, 0.0, 1.9, 1.0, 4.4, 0.8, 0.8, 3.2, 1.6...
$ slope <int> 2, 0, 0, 2, 1, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1,...
$ ca <int> 2, 0, 0, 1, 3, 0, 3, 1, 0, 2, 0, 0, 0, 3, 0, 0, 1, 1,...
$ thal <int> 3, 3, 3, 3, 2, 2, 1, 3, 3, 2, 2, 3, 2, 3, 0, 2, 2, 3,...
$ target <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,...
##### Rename the variables for a little more clarity
list_names <- c("Age" , "Gender", "ChestPain", "RestingBloodPressure", "Cholestrol", "FastingBloodSugar", "RestingECG", "MaxHeartRateAchivied", "ExerciseIndusedAngina", "Oldpeak", "Slope", "MajorVessels", "Thalassemia", "Target")
colnames(heart) <- list_names
colnames(heart.c) <- list_names
heart$Gender <- as.factor(heart$Gender)
heart$ChestPain <- as.factor(heart$ChestPain)
heart$ExerciseIndusedAngina <- as.factor(heart$ExerciseIndusedAngina)
heart$Thalassemia <- as.factor(heart$Thalassemia)
heart$Target <- as.factor(heart$Target)
# levels(heart$ChestPain)
heart <- transform(heart,
ChestPain=plyr::revalue(ChestPain,c("0"="Typical Angina", "1"="Atypical Angina", "2"="Non-Anginal", "3"="Asymptomatic")))
heart <- transform(heart,
Gender=plyr::revalue(Gender, c("0"="Female", "1"="Male")))
heart <- transform(heart,
Target=plyr::revalue(Target, c("0"="Healthy Heart", "1"="Heart Disease")))
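As a side note, the three transform() + plyr::revalue() calls above can be collapsed into a single dplyr step. A sketch for reference only; it assumes the factors still carry the original "0"/"1"/... codes, so it would replace (rather than follow) the revalue calls:
heart <- heart %>%
  mutate(ChestPain = recode(ChestPain, "0" = "Typical Angina", "1" = "Atypical Angina",
                            "2" = "Non-Anginal", "3" = "Asymptomatic"),
         Gender = recode(Gender, "0" = "Female", "1" = "Male"),
         Target = recode(Target, "0" = "Healthy Heart", "1" = "Heart Disease"))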
glimpse(heart)
Observations: 1,025
Variables: 14
$ Age <int> 52, 53, 70, 61, 62, 58, 58, 55, 46, 54, ...
$ Gender <fct> Male, Male, Male, Male, Female, Female, ...
$ ChestPain <fct> Typical Angina, Typical Angina, Typical ...
$ RestingBloodPressure <int> 125, 140, 145, 148, 138, 100, 114, 160, ...
$ Cholestrol <int> 212, 203, 174, 203, 294, 248, 318, 289, ...
$ FastingBloodSugar <int> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0...
$ RestingECG <int> 1, 0, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 1, 1...
$ MaxHeartRateAchivied <int> 168, 155, 125, 161, 106, 122, 140, 145, ...
$ ExerciseIndusedAngina <fct> 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1...
$ Oldpeak <dbl> 1.0, 3.1, 2.6, 0.0, 1.9, 1.0, 4.4, 0.8, ...
$ Slope <int> 2, 0, 0, 2, 1, 1, 0, 1, 2, 1, 1, 1, 2, 1...
$ MajorVessels <int> 2, 0, 0, 1, 3, 0, 3, 1, 0, 2, 0, 0, 0, 3...
$ Thalassemia <fct> 3, 3, 3, 3, 2, 2, 1, 3, 3, 2, 2, 3, 2, 3...
$ Target <fct> Healthy Heart, Healthy Heart, Healthy He...
Minimal & custom colour
posn_d <- position_dodge(width = 0.4)
heart %>%
ggplot( aes(x = factor(ChestPain), fill = factor(Target))) +
geom_bar(position = posn_d ,alpha = 0.8) +
labs(x = "Chest Pain", y = "Count",title = "Heart Health", subtitle = "By type of chest pain", caption = "Heart Disease UCI from Kaggle") +
geom_text(aes(label=..count..),stat="count",position=posn_d, vjust = -0.3) +
scale_fill_manual("legend", values = c("Healthy Heart" = "goldenrod1", "Heart Disease" = "mediumorchid1")) +
theme_minimal() +
theme(legend.title = element_blank(),
panel.grid.major = element_blank())
Fivethirtyeight & custom colour
heart %>%
ggplot( aes(x = factor(Gender), fill = factor(Target)) ) +
geom_bar(position = posn_d ,alpha = 0.8) +
labs(x = "Sex",title = "Heart Health", subtitle = "By sex", caption = "Heart Disease UCI from Kaggle") +
geom_text(aes(label=..count..),stat="count",position=posn_d, vjust = -0.3) +
scale_color_fivethirtyeight() +
theme_fivethirtyeight() +
theme(legend.title = element_blank()) +
scale_fill_manual("legend", values = c("Healthy Heart" = "brown1", "Heart Disease" = "forestgreen"))
Stata theme
heart %>%
ggplot( aes(x = factor(Gender), fill = factor(ChestPain) )) +
geom_bar(position = "dodge" ,alpha = 0.6) +
labs(x = "Sex", y = "Count",title = "Type of Chest Pain", subtitle = "By sex", caption = "Heart Disease UCI from Kaggle") +
geom_text(aes(label=..count..),stat="count",position=position_dodge(width = 0.9), vjust = -0.3) +
theme_stata() + scale_fill_stata() +
theme(legend.title = element_blank())
Stata blue
heart[, .N, by = .(Age, Target, Gender)] %>%
ggplot( aes(x = Age, y = N) ) +
geom_col( fill = "dodgerblue") +
facet_grid(Gender ~ Target, scales = "free") +
labs( y = "Count",title = "Age Distribution", subtitle = "By heart health and sex", caption = "Heart Disease UCI from Kaggle") +
theme_stata()
Scatterplot and histogram
heart %>%
ggplot(aes(x=Age, y=Cholestrol, col = Gender)) +
geom_point() +
facet_wrap(~Target, scales = "free") +
geom_smooth(method=lm) + ylim(120,420) +
theme_grey() +
labs(title = "Cholestrol by Age", subtitle = "By heart health and sex", caption = "Heart Disease UCI from Kaggle") +
theme(legend.title = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
heart %>%
ggplot(aes(x=Age)) +
geom_histogram( fill = "purple", bins = 10) +
facet_wrap(~Target) +
theme_grey() +
labs(title = "Age Distribution", subtitle = "By Heart Health", caption = "Heart Disease UCI from Kaggle") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
# facet_grid(Gender ~ Target, scales = "free")
# caret needs syntactically valid factor levels (e.g. "Heart.Disease") when classProbs = TRUE
levels(heart$Target) <- make.names(levels(factor(heart$Target)))
set.seed(1337)
# Prepare the initial split object
data_split <- initial_split(heart, prop = 0.65)
# Extract the training dataframe
training_data <- training(data_split)
# Extract the testing dataframe
testing_data <- testing(data_split)
set.seed(1337)
cv_split <- vfold_cv(training_data, v = 5)
cv_data <- cv_split %>%
mutate(train = map(splits, ~training(.x)),validate = map(splits, ~testing(.x)))
########################################################
myControl <- trainControl(
method = "cv",
number = 10,
summaryFunction = twoClassSummary, #twoClassSummary for logistic
classProbs = TRUE, #Classification probabilities
verboseIter = TRUE
)
########################################################
cv_tune <- cv_data %>%
  expand_grid(mtry = 2:13) # mtry can range from 1 to the number of predictors (13)
# NOTE: tidyr now provides expand_grid(), used here in place of the older crossing()
cv_models_rf <- cv_tune %>%
mutate(model = map2(train, mtry, ~ranger(formula = Target~.,
data = .x, mtry = .y,
num.trees = 2000, seed = 1337)))
cv_prep_rf <- cv_models_rf %>%
mutate( validate_actual = map(validate, ~.x$Target == "Heart.Disease"),
validate_predicted = map2(.x = model, .y = validate,
~predict(.x, .y, type = "response")$predictions == "Heart.Disease")
)
# Recall on Random Forest models at different levels of mtry
cv_perf_recall <- cv_prep_rf %>%
mutate(recall = map2_dbl(.x = validate_actual, .y = validate_predicted, ~recall(actual = .x, predicted = .y)))
temp <- cv_perf_recall %>%
select(mtry, recall)
j <- matrix(nrow = 12, ncol = 2)
for(i in 2:13){
  j[i - 1, 1] <- i
  j[i - 1, 2] <- temp %>%
    filter(mtry == i) %>%
    summarize(mean_recall = mean(recall)) %>%
    as.numeric()
}
colnames(j) <- c("mtry","mean_recall")
knitr::kable(as.data.frame(j))
mtry | mean_recall |
---|---|
2 | 0.9779728 |
3 | 0.9753061 |
4 | 0.9753061 |
5 | 0.9753061 |
6 | 0.9753061 |
7 | 0.9753061 |
8 | 0.9753061 |
9 | 0.9785848 |
10 | 0.9814017 |
11 | 0.9814017 |
12 | 0.9814017 |
13 | 0.9840683 |
# Tuning mtry
cv_eval_tune <- cv_prep_rf %>%
mutate(validate_mae = map2_dbl(.x = validate_actual, .y = validate_predicted, ~mae(actual = .x, predicted = .y)))
#################################################################
# Mean validate MAE for each fold and mtry combination
# cv_eval_tune %>%
# group_by(mtry) %>%
# summarise(mean_validate_mae = mean(validate_mae))
# Not working now; it did before
# cv_eval_tune %>%
# select(mtry, validate_mae) %>%
# group_nest(mtry) %>%
# unnest()
#Kind of gets there
#################################################################
temp <- cv_eval_tune %>%
select(mtry, validate_mae)
k <- matrix(nrow = 12, ncol = 2)
for(i in 2:13){
  k[i - 1, 1] <- i
  k[i - 1, 2] <- temp %>%
    filter(mtry == i) %>%
    summarize(mean_validate_mae = mean(validate_mae)) %>%
    as.numeric()
}
colnames(k) <- c("mtry","mean_validate_mae")
knitr::kable(as.data.frame(k))
mtry | mean_validate_mae |
---|---|
2 | 0.0285265 |
3 | 0.0270228 |
4 | 0.0270228 |
5 | 0.0270228 |
6 | 0.0270228 |
7 | 0.0270228 |
8 | 0.0270228 |
9 | 0.0255190 |
10 | 0.0240153 |
11 | 0.0240153 |
12 | 0.0225115 |
13 | 0.0240265 |
The Mean Absolute Error (MAE) measures how much, on average, the predicted values differ from the actual values. Taking the mean of the MAE across the folds, the hyperparameter mtry performs best at 12.
On average, the true positive rate (sensitivity/recall) tends to increase as mtry increases. These predictive models perform quite well.
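As an aside, the per-mtry means produced by the for-loops above can usually be obtained more directly once the tuning object is coerced to a plain tibble (which may be why the commented-out group_by() attempt earlier misbehaved). A sketch, assuming cv_eval_tune as built above:
cv_eval_tune %>%
  as_tibble() %>% # guard against method dispatch surprises from the resample object
  group_by(mtry) %>%
  summarise(mean_validate_mae = mean(validate_mae))
# The same pattern works for cv_perf_recall and its recall column.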
Mean Absolute Error
best_model <- ranger(formula = Target~., data = training_data , mtry = 12, num.trees = 2000, seed = 1337)
test_actual <- testing_data$Target == "Heart.Disease"
test_predicted <- predict(best_model, testing_data, type = "response")$predictions == "Heart.Disease"
mae(test_actual, test_predicted)
[1] 0.008379888
Accuracy measures how well the model predicts both the TRUE and FALSE classes.
Metrics::accuracy(test_actual,test_predicted)
[1] 0.9916201
Precision measures how often the model is correct when it predicts the TRUE class.
Metrics::precision(test_actual,test_predicted)
[1] 0.9836066
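The same numbers can be reproduced by hand from the 2x2 confusion matrix, which is a quick sanity check on what accuracy, precision, and recall mean here (assumes test_actual and test_predicted from above):
cm <- table(actual = test_actual, predicted = test_predicted)
sum(diag(cm)) / sum(cm)                # accuracy: (TP + TN) / total
cm["TRUE", "TRUE"] / sum(cm[, "TRUE"]) # precision: TP / (TP + FP)
cm["TRUE", "TRUE"] / sum(cm["TRUE", ]) # recall: TP / (TP + FN)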
cv_models_lr <- cv_data %>%
mutate(model = map(train, ~glm(formula = Target~., data = .x, family = "binomial")))
# Examine the first model and validate
model <- cv_models_lr$model[[1]]
validate <- cv_models_lr$validate[[1]]
# Prepare binary vector of actual Heart Disease values in validate
validate_actual <- validate$Target == "Heart.Disease"
# Predict the probabilities for the observations in validate
validate_prob <- predict(model, validate, type = "response")
# Prepare binary vector of predicted Heart Disease values for validate
validate_predicted <- validate_prob > 0.5
# Compare the actual & predicted performance visually using a table
# table(validate_actual, validate_predicted)
# Accuracy and precision for each fold's logistic regression model
ap.list <- matrix(nrow = 5, ncol = 2)
for(i in 1:5){
  m <- cv_models_lr$model[[i]]
  v <- cv_models_lr$validate[[i]]
  v_actual <- v$Target == "Heart.Disease"
  v_prob <- predict(m, v, type = "response")
  v_predicted <- v_prob > 0.5
  print(table(v_actual, v_predicted))
  ap.list[i, 1] <- Metrics::accuracy(v_actual, v_predicted)
  ap.list[i, 2] <- Metrics::precision(v_actual, v_predicted)
}
ap <- apply(ap.list, 2, mean)
print("Mean Accuracy")
print(ap[1])
print("Mean Precision")
print(ap[2])
v_predicted
v_actual FALSE TRUE
FALSE 43 16
TRUE 3 72
v_predicted
v_actual FALSE TRUE
FALSE 48 12
TRUE 17 57
v_predicted
v_actual FALSE TRUE
FALSE 51 11
TRUE 9 62
v_predicted
v_actual FALSE TRUE
FALSE 58 10
TRUE 5 60
v_predicted
v_actual FALSE TRUE
FALSE 58 14
TRUE 10 51
[1] "Mean Accuracy"
[1] 0.8396364
[1] "Mean Precision"
[1] 0.8270684
Confusion matrices for each of the logistic regression folds
Using a logistic regression model to predict heart disease, the metrics of interest are accuracy and precision. We would like to know who does and does not have heart disease, as well as how often the positive (heart disease) classification is correct.
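For a single fold, caret::confusionMatrix() reports accuracy, precision (Pos Pred Value), and recall (Sensitivity) in one call. A minimal sketch using the last fold's v_actual and v_predicted, which remain in scope after the loop above:
caret::confusionMatrix(
  data = factor(v_predicted, levels = c(FALSE, TRUE)),
  reference = factor(v_actual, levels = c(FALSE, TRUE)),
  positive = "TRUE"
)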
Mean recall across the cross-validation folds
cv_prep_lr <- cv_models_lr %>%
mutate(validate_actual = map(validate, ~.x$Target == "Heart.Disease"),
validate_predicted = map2(.x = model, .y = validate,
~predict(.x, .y, type = "response") > 0.5)
)
# Validate recall for each cross validation fold
cv_perf_recall <- cv_prep_lr %>%
mutate(validate_recall = map2_dbl(validate_actual, validate_predicted,
~recall(actual = .x, predicted = .y)))
# cv_perf_recall$validate_recall
mean(cv_perf_recall$validate_recall)
[1] 0.8725304
# Custom tuning grid for RF-modeling
# tune.grid <- data.frame(
# .mtry = 2:length(heart),
# .splitrule = "variance",
# .min.node.size = 5
# )
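# If the random forest were tuned through caret instead of by hand, the call might
# look roughly like the sketch below (not run here). Note that for a classification
# target ranger's splitrule should be "gini"; "variance" applies to regression.
# rf.grid <- expand.grid(
#   mtry = 2:13,
#   splitrule = "gini",
#   min.node.size = 5
# )
# model.rf <- train(
#   Target ~ .,
#   heart,
#   metric = "ROC",
#   tuneGrid = rf.grid,
#   method = "ranger",
#   trControl = myControl
# )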
# Custom tuning grid for Lasso or Ridge regression
tune.grid <- expand.grid(
alpha = 0:1,
lambda = seq(0.0001, 1, length = 20)
)
# glmnet fits a GLM by penalized maximum likelihood, constraining the coefficients to prevent overfitting
# Ridge (0) or Lasso (1)
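# The penalty glmnet adds to the negative log-likelihood is the elastic net:
#   lambda * [ (1 - alpha)/2 * ||beta||_2^2 + alpha * ||beta||_1 ]
# so alpha = 0 is pure ridge, alpha = 1 is pure lasso, and lambda sets the strength.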
model.glmnet <- train(
Target ~.,
heart,
metric = "ROC",
tuneGrid = tune.grid,
method = "glmnet",
trControl = myControl,
preProcess = c("medianImpute", "center", "scale")
)
+ Fold01: alpha=0, lambda=1
- Fold01: alpha=0, lambda=1
+ Fold01: alpha=1, lambda=1
- Fold01: alpha=1, lambda=1
+ Fold02: alpha=0, lambda=1
- Fold02: alpha=0, lambda=1
+ Fold02: alpha=1, lambda=1
- Fold02: alpha=1, lambda=1
+ Fold03: alpha=0, lambda=1
- Fold03: alpha=0, lambda=1
+ Fold03: alpha=1, lambda=1
- Fold03: alpha=1, lambda=1
+ Fold04: alpha=0, lambda=1
- Fold04: alpha=0, lambda=1
+ Fold04: alpha=1, lambda=1
- Fold04: alpha=1, lambda=1
+ Fold05: alpha=0, lambda=1
- Fold05: alpha=0, lambda=1
+ Fold05: alpha=1, lambda=1
- Fold05: alpha=1, lambda=1
+ Fold06: alpha=0, lambda=1
- Fold06: alpha=0, lambda=1
+ Fold06: alpha=1, lambda=1
- Fold06: alpha=1, lambda=1
+ Fold07: alpha=0, lambda=1
- Fold07: alpha=0, lambda=1
+ Fold07: alpha=1, lambda=1
- Fold07: alpha=1, lambda=1
+ Fold08: alpha=0, lambda=1
- Fold08: alpha=0, lambda=1
+ Fold08: alpha=1, lambda=1
- Fold08: alpha=1, lambda=1
+ Fold09: alpha=0, lambda=1
- Fold09: alpha=0, lambda=1
+ Fold09: alpha=1, lambda=1
- Fold09: alpha=1, lambda=1
+ Fold10: alpha=0, lambda=1
- Fold10: alpha=0, lambda=1
+ Fold10: alpha=1, lambda=1
- Fold10: alpha=1, lambda=1
Aggregating results
Selecting tuning parameters
Fitting alpha = 1, lambda = 1e-04 on full training set
plot(model.glmnet)
model.glmnet
glmnet
1025 samples
13 predictor
2 classes: 'Healthy.Heart', 'Heart.Disease'
Pre-processing: median imputation (17), centered (17), scaled (17)
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 922, 923, 922, 923, 922, 923, ...
Resampling results across tuning parameters:
alpha lambda ROC Sens Spec
0 0.00010000 0.9220681 0.8116327 0.8879173
0 0.05272632 0.9205358 0.8096327 0.8822206
0 0.10535263 0.9181369 0.8076327 0.8727504
0 0.15797895 0.9169969 0.8156327 0.8727504
0 0.21060526 0.9160041 0.8176327 0.8574383
0 0.26323158 0.9152798 0.8216327 0.8574746
0 0.31585789 0.9150876 0.8236327 0.8537010
0 0.36848421 0.9146720 0.8256327 0.8537010
0 0.42111053 0.9139078 0.8256327 0.8537010
0 0.47373684 0.9134100 0.8256327 0.8537010
0 0.52636316 0.9134099 0.8256327 0.8537010
0 0.57898947 0.9127240 0.8256327 0.8537010
0 0.63161579 0.9125709 0.8256327 0.8537010
0 0.68424211 0.9121145 0.8256327 0.8537010
0 0.73686842 0.9122655 0.8256327 0.8537010
0 0.78949474 0.9122263 0.8256327 0.8537010
0 0.84212105 0.9121116 0.8176327 0.8537010
0 0.89474737 0.9119570 0.8156327 0.8537010
0 0.94737368 0.9117284 0.8156327 0.8537010
0 1.00000000 0.9115390 0.8136327 0.8537010
1 0.00010000 0.9226709 0.7955510 0.8955007
1 0.05272632 0.9028522 0.7975918 0.8556241
1 0.10535263 0.8879042 0.7735510 0.8309869
1 0.15797895 0.8690479 0.7554694 0.7834180
1 0.21060526 0.8189731 0.7354694 0.7834180
1 0.26323158 0.5385007 0.0000000 1.0000000
1 0.31585789 0.5000000 0.0000000 1.0000000
1 0.36848421 0.5000000 0.0000000 1.0000000
1 0.42111053 0.5000000 0.0000000 1.0000000
1 0.47373684 0.5000000 0.0000000 1.0000000
1 0.52636316 0.5000000 0.0000000 1.0000000
1 0.57898947 0.5000000 0.0000000 1.0000000
1 0.63161579 0.5000000 0.0000000 1.0000000
1 0.68424211 0.5000000 0.0000000 1.0000000
1 0.73686842 0.5000000 0.0000000 1.0000000
1 0.78949474 0.5000000 0.0000000 1.0000000
1 0.84212105 0.5000000 0.0000000 1.0000000
1 0.89474737 0.5000000 0.0000000 1.0000000
1 0.94737368 0.5000000 0.0000000 1.0000000
1 1.00000000 0.5000000 0.0000000 1.0000000
ROC was used to select the optimal model using the largest value.
The final values used for the model were alpha = 1 and lambda = 1e-04.
#max(model.glmnet[["results"]][["ROC"]])
plot(model.glmnet$finalModel)
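The coefficient path plot can be paired with the coefficients actually retained at the selected penalty. A minimal sketch; bestTune holds the chosen alpha and lambda:
coef(model.glmnet$finalModel, s = model.glmnet$bestTune$lambda)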
model.0 <- glm(Target ~ 1, data = heart, family = "binomial")          # intercept-only (null) model
model.1 <- glm(Target ~ . - Slope, data = heart, family = "binomial")  # full model, excluding Slope
step(model.0, scope = formula(model.1), direction = "forward", k = 2)  # forward selection by AIC
Start: AIC=1422.24
Target ~ 1
Df Deviance AIC
+ ChestPain 3 1124.0 1132.0
+ Thalassemia 3 1125.0 1133.0
+ Oldpeak 1 1195.3 1199.3
+ ExerciseIndusedAngina 1 1214.1 1218.1
+ MaxHeartRateAchivied 1 1219.6 1223.6
+ MajorVessels 1 1256.6 1260.6
+ Gender 1 1337.8 1341.8
+ Age 1 1365.1 1369.1
+ RestingBloodPressure 1 1400.3 1404.3
+ RestingECG 1 1401.6 1405.6
+ Cholestrol 1 1409.9 1413.9
<none> 1420.2 1422.2
+ FastingBloodSugar 1 1418.5 1422.5
Step: AIC=1131.99
Target ~ ChestPain
Df Deviance AIC
+ Thalassemia 3 948.04 962.04
+ Oldpeak 1 993.33 1003.33
+ MajorVessels 1 1015.13 1025.13
+ MaxHeartRateAchivied 1 1040.05 1050.05
+ Gender 1 1043.17 1053.17
+ ExerciseIndusedAngina 1 1063.34 1073.34
+ Age 1 1092.20 1102.20
+ RestingBloodPressure 1 1106.02 1116.02
+ RestingECG 1 1114.53 1124.53
+ Cholestrol 1 1118.39 1128.39
+ FastingBloodSugar 1 1118.81 1128.81
<none> 1123.99 1131.99
Step: AIC=962.04
Target ~ ChestPain + Thalassemia
Df Deviance AIC
+ MajorVessels 1 856.91 872.91
+ Oldpeak 1 866.97 882.97
+ MaxHeartRateAchivied 1 890.18 906.18
+ ExerciseIndusedAngina 1 913.62 929.62
+ Age 1 920.92 936.92
+ Gender 1 924.70 940.70
+ RestingECG 1 935.15 951.15
+ RestingBloodPressure 1 940.36 956.36
+ Cholestrol 1 941.63 957.63
<none> 948.04 962.04
+ FastingBloodSugar 1 946.14 962.14
Step: AIC=872.91
Target ~ ChestPain + Thalassemia + MajorVessels
Df Deviance AIC
+ Oldpeak 1 789.42 807.42
+ MaxHeartRateAchivied 1 814.56 832.56
+ ExerciseIndusedAngina 1 821.17 839.17
+ Gender 1 842.18 860.18
+ RestingECG 1 844.80 862.80
+ Age 1 846.63 864.63
+ Cholestrol 1 849.28 867.28
+ RestingBloodPressure 1 850.78 868.78
<none> 856.91 872.91
+ FastingBloodSugar 1 856.31 874.31
Step: AIC=807.42
Target ~ ChestPain + Thalassemia + MajorVessels + Oldpeak
Df Deviance AIC
+ ExerciseIndusedAngina 1 767.93 787.93
+ MaxHeartRateAchivied 1 767.95 787.95
+ Gender 1 772.91 792.91
+ RestingECG 1 773.07 793.07
+ Age 1 783.40 803.40
+ Cholestrol 1 784.28 804.28
+ RestingBloodPressure 1 785.65 805.65
<none> 789.42 807.42
+ FastingBloodSugar 1 788.74 808.74
Step: AIC=787.93
Target ~ ChestPain + Thalassemia + MajorVessels + Oldpeak + ExerciseIndusedAngina
Df Deviance AIC
+ Gender 1 751.35 773.35
+ RestingECG 1 753.30 775.30
+ MaxHeartRateAchivied 1 753.41 775.41
+ Age 1 762.89 784.89
+ Cholestrol 1 763.84 785.84
+ RestingBloodPressure 1 764.68 786.68
<none> 767.93 787.93
+ FastingBloodSugar 1 767.61 789.61
Step: AIC=773.35
Target ~ ChestPain + Thalassemia + MajorVessels + Oldpeak + ExerciseIndusedAngina +
Gender
Df Deviance AIC
+ MaxHeartRateAchivied 1 731.44 755.44
+ RestingECG 1 738.70 762.70
+ Age 1 739.13 763.13
+ Cholestrol 1 740.39 764.39
+ RestingBloodPressure 1 745.47 769.47
<none> 751.35 773.35
+ FastingBloodSugar 1 751.10 775.10
Step: AIC=755.44
Target ~ ChestPain + Thalassemia + MajorVessels + Oldpeak + ExerciseIndusedAngina +
Gender + MaxHeartRateAchivied
Df Deviance AIC
+ Cholestrol 1 717.71 743.71
+ RestingECG 1 719.39 745.39
+ RestingBloodPressure 1 721.61 747.61
+ Age 1 727.26 753.26
<none> 731.44 755.44
+ FastingBloodSugar 1 730.86 756.86
Step: AIC=743.71
Target ~ ChestPain + Thalassemia + MajorVessels + Oldpeak + ExerciseIndusedAngina +
Gender + MaxHeartRateAchivied + Cholestrol
Df Deviance AIC
+ RestingBloodPressure 1 708.85 736.85
+ RestingECG 1 709.86 737.86
<none> 717.71 743.71
+ Age 1 715.82 743.82
+ FastingBloodSugar 1 717.32 745.32
Step: AIC=736.85
Target ~ ChestPain + Thalassemia + MajorVessels + Oldpeak + ExerciseIndusedAngina +
Gender + MaxHeartRateAchivied + Cholestrol + RestingBloodPressure
Df Deviance AIC
+ RestingECG 1 701.69 731.69
<none> 708.85 736.85
+ Age 1 708.34 738.34
+ FastingBloodSugar 1 708.81 738.81
Step: AIC=731.69
Target ~ ChestPain + Thalassemia + MajorVessels + Oldpeak + ExerciseIndusedAngina +
Gender + MaxHeartRateAchivied + Cholestrol + RestingBloodPressure +
RestingECG
Df Deviance AIC
<none> 701.69 731.69
+ Age 1 701.51 733.51
+ FastingBloodSugar 1 701.68 733.68
Call: glm(formula = Target ~ ChestPain + Thalassemia + MajorVessels +
Oldpeak + ExerciseIndusedAngina + Gender + MaxHeartRateAchivied +
Cholestrol + RestingBloodPressure + RestingECG, family = "binomial",
data = heart)
Coefficients:
(Intercept) ChestPainAtypical Angina
0.000998 1.130884
ChestPainNon-Anginal ChestPainAsymptomatic
1.911585 1.873335
Thalassemia1 Thalassemia2
1.717835 1.756457
Thalassemia3 MajorVessels
0.335443 -0.761845
Oldpeak ExerciseIndusedAngina1
-0.640488 -0.812629
GenderMale MaxHeartRateAchivied
-1.449652 0.024614
Cholestrol RestingBloodPressure
-0.006052 -0.015617
RestingECG
0.513108
Degrees of Freedom: 1024 Total (i.e. Null); 1010 Residual
Null Deviance: 1420
Residual Deviance: 701.7 AIC: 731.7
# formula = Target ~ ChestPain + Thalassemia + MajorVessels +
# Oldpeak + ExerciseIndusedAngina + Gender + MaxHeartRateAchivied +
# Cholestrol + RestingBloodPressure + RestingECG
model.heart <- glm(formula = Target ~ ChestPain + Thalassemia + MajorVessels +
Oldpeak + ExerciseIndusedAngina + Gender + MaxHeartRateAchivied +
Cholestrol + RestingBloodPressure + RestingECG, family = "binomial",
data = heart)
summary(model.heart)
Call:
glm(formula = Target ~ ChestPain + Thalassemia + MajorVessels +
Oldpeak + ExerciseIndusedAngina + Gender + MaxHeartRateAchivied +
Cholestrol + RestingBloodPressure + RestingECG, family = "binomial",
data = heart)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.5893 -0.4198 0.1205 0.5466 2.5864
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.000998 1.448018 0.001 0.999450
ChestPainAtypical Angina 1.130884 0.295093 3.832 0.000127 ***
ChestPainNon-Anginal 1.911585 0.246880 7.743 9.71e-15 ***
ChestPainAsymptomatic 1.873335 0.339647 5.516 3.48e-08 ***
Thalassemia1 1.717835 1.103014 1.557 0.119375
Thalassemia2 1.756457 1.044426 1.682 0.092619 .
Thalassemia3 0.335443 1.048298 0.320 0.748977
MajorVessels -0.761845 0.102930 -7.402 1.35e-13 ***
Oldpeak -0.640487 0.110271 -5.808 6.31e-09 ***
ExerciseIndusedAngina1 -0.812629 0.230142 -3.531 0.000414 ***
GenderMale -1.449652 0.268969 -5.390 7.06e-08 ***
MaxHeartRateAchivied 0.024614 0.005171 4.760 1.94e-06 ***
Cholestrol -0.006052 0.002048 -2.955 0.003131 **
RestingBloodPressure -0.015617 0.005525 -2.827 0.004704 **
RestingECG 0.513108 0.192277 2.669 0.007617 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1420.24 on 1024 degrees of freedom
Residual deviance: 701.69 on 1010 degrees of freedom
AIC: 731.69
Number of Fisher Scoring iterations: 6
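The coefficients above are on the log-odds scale; exponentiating them gives odds ratios, which are easier to read. A quick sketch using the fitted model.heart (broom is already loaded):
exp(coef(model.heart))
# or, with confidence intervals in a tidy data frame
tidy(model.heart, exponentiate = TRUE, conf.int = TRUE)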