-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCancer-Prediction-Knn.R
92 lines (57 loc) · 2.25 KB
/
Cancer-Prediction-Knn.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
getwd()
setwd('D:/DATA science/datasets/Machine-Learning')
raw_data <- read.csv('wisc_bc_data.csv',stringsAsFactors = FALSE)
View(raw_data)
str(raw_data)
table(raw_data$diagnosis)
#Shuffle the rows of the data
set.seed(42)
rows <- sample(nrow(raw_data))
raw_data <- raw_data[rows,]
raw_data[1,]
rownames(raw_data) <- seq(length = nrow(raw_data))
raw_data$diagnosis <- factor(raw_data$diagnosis,levels = c('B','M'),labels = c('Benign','Malignant'))
round(prop.table(table(raw_data$diagnosis)) * 100)
#63% people were diagnosed as benign and 37% as malignant.
data <- raw_data[ ,-1]
View(data)
normalize <- function(x) {
return ((x - min(x))/(max(x) - min(x)))
}
data_normalized <- as.data.frame(lapply(data[ ,2:31], normalize))
View(data_normalized)
data_normalized['diagnosis'] <- data[ ,'diagnosis']
library(caret)
train_index <- createDataPartition(raw_data$diagnosis,p = 0.7,list = FALSE,times = 1)
cancer_train <- data_normalized[train_index, ]
nrow(cancer_train)
View(cancer_train)
cancer_train_labels <- cancer_train[ ,'diagnosis']
View(cancer_train_labels)
#cancer_train_labels <- as.data.frame(cancer_train_labels)
#colnames(cancer_train_labels) <- c("diagnosis")
class(cancer_train_labels)
#remove the classifying label from the train data
cancer_train <- cancer_train[ ,-31]
#Create test data and remove the classifying label from the data.
cancer_test <- data_normalized[-train_index, ]
cancer_test_labels <- cancer_test[ ,'diagnosis']
View(cancer_test_labels)
cancer_test <- cancer_test[ ,-31]
#KNN
library(class)
cancer_predicton <- knn(train = cancer_train,test = cancer_test,cl = cancer_train_labels,k = 21)
summary(cancer_predicton)
#Verifying the classification
library(gmodels)
CrossTable(cancer_test_labels,cancer_predicton,prop.chisq = FALSE)
#KNN with k=21 has given an accuracy of 95.3%, TPR of 89% and TNR of 99%.
# The Recall would be 89% and Precision would be 98%.
#In this scenario, We need to reduce the False Negative.
#Recall is more critical in this scenario.
#Trying k=12
cancer_prediction_12 <- knn(cancer_train,cancer_test,cancer_train_labels,12)
summary(cancer_prediction_12)
CrossTable(cancer_test_labels,cancer_prediction_12,prop.chisq = FALSE)
#Slightly beter result with k=12.
#The Recall would be 90.4% and Precision would be 98%.