wine.csv
0.01MB

 

 

 

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Classify wine type (grade) with the KNN algorithm
wine <- read.csv('c:/data/wine.csv', header = TRUE)
 
# NOTE: before R 4.0, read.csv defaulted to stringsAsFactors = TRUE;
# from R 4.0 onward the default is FALSE.
 
# Shuffle the data (the value to predict is column [1], type, so all rows are shuffled)
# For the breast-cancer data we dropped [-1] because it held a useless patient code;
# the wine data has no such column, so nothing needs to be removed here.
 
# Why shuffle?
# The rows may arrive grouped by class (as in the wine data); without shuffling,
# the train split could contain only type 1/2 rows and the test split only type 3.
set.seed(15)
wine_shuffle <- wine[sample(nrow(wine)), ]
wine_shuffle
 
# wine2 holds the shuffled data; inspect its structure
wine2 <- wine_shuffle
str(wine2)
 
# Min-max normalization: rescale a numeric vector linearly onto [0, 1].
normalize <- function(x) {
  rng <- range(x)
  (x - rng[1]) / (rng[2] - rng[1])
}
 
# Check how many columns the wine data has
ncol(wine)
 
# Build wine_n: normalize every feature column, excluding the prediction
# target `type` in column [1]. Using 2:ncol(wine2) instead of the
# hard-coded 2:13 keeps this correct if the feature count ever changes.
# Normalization puts the differently-scaled features on a common [0, 1]
# scale so the distances used by KNN are comparable.
wine_n <- as.data.frame(lapply(wine2[2:ncol(wine2)], normalize))
summary(wine_n)
 
 
# Split the data into train (90%) and test (10%) sets
train_num <- round(nrow(wine_n) * 0.9)
wine_train <- wine_n[seq_len(train_num), ]
wine_test <- wine_n[-seq_len(train_num), ]
 
# Split the labels (column 1, type) the same way
wine_train_label <- wine2[seq_len(train_num), 1]
wine_test_label <- wine2[-seq_len(train_num), 1]
 
# Inspect the splits
wine_train_label
wine_test_label
 
wine_train
wine_test
 
# Run the KNN algorithm
library(class)
# k is chosen freely here (usually odd, to avoid ties)
# (the scrape had dropped the closing paren before the inline comment)
result1 <- knn(train = wine_train, test = wine_test, cl = wine_train_label, k = 21)
result1
wine_test_label
 
# Put the predictions next to the true labels
# (the scrape had dropped the `x` on the left of the assignment)
x <- data.frame('실제' = wine_test_label, "예측" = result1)
x
table(x)
 
# Evaluate prediction accuracy
library(gmodels)
g2 <- CrossTable(x = wine_test_label, y = result1, prop.chisq = FALSE)
g2
 
# The wine data has 3 classes, so accuracy is the sum of the 3 diagonal
# cells of the 3x3 proportion table (linear positions 1, 5, 9).
# (the scrape had dropped the closing brackets on this line)
g2$prop.tbl[1] + g2$prop.tbl[5] + g2$prop.tbl[9]  # 1
 
 
# Which k gives the best accuracy?
# Try every odd k from 1 to 199. Preallocating the result vector avoids
# the O(n^2) cost of growing it with append() inside the loop.
k_num <- seq(1, 200, by = 2)
temp <- numeric(length(k_num))
for (j in seq_along(k_num)) {
  wine_test_pred <- knn(train = wine_train, test = wine_test,
                        cl = wine_train_label, k = k_num[j])
  g2 <- CrossTable(x = wine_test_label, y = wine_test_pred, chisq = FALSE)
  # accuracy = sum of the diagonal of the 3x3 proportion table
  # NOTE(review): assumes all 3 classes appear in the predictions so the
  # table stays 3x3 -- confirm for very large k.
  temp[j] <- g2$prop.tbl[1] + g2$prop.tbl[5] + g2$prop.tbl[9]
}
 
result <- data.frame("k" = k_num, "정확도" = temp)
 
library(plotly)
plot_ly(x = ~result[, "k"], y = ~result[, "정확도"], type = 'scatter', mode = 'lines') %>%
  layout(xaxis = list(title = "k값"), yaxis = list(title = "정확도"))
cs
 

 

이 그래프와 동일하지 않아도 됩니다. K값이 증가할수록 대체로 언더피팅 되는 것만 확인하세요!

 

 

 

 

 

 

wisc_bc_data.csv
0.12MB

 

 

 

 

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Classify breast cancer (benign/malignant) with the KNN algorithm
wbcd <- read.csv('c:/data/wisc_bc_data.csv', header = TRUE, stringsAsFactors = FALSE)
 
# data.frame()/read.csv used to convert character columns to factors
# automatically (the default before R 4.0). Factor variables cannot be used
# in arithmetic, so stringsAsFactors = FALSE keeps strings as plain characters.
 
# Why not convert to factor here:
# left as character, the values are treated merely as strings; converted to
# factor, each value becomes a category level. We keep stringsAsFactors = FALSE
# for now and convert only the diagnosis target to a factor later.
 
# Check the diagnosis column
table(wbcd$diagnosis)
 
# Outlier detection in R using Grubbs' test (source: https://sosal.kr/945)
library(outliers)
# Repeatedly apply Grubbs' test to x: while the test's p-value is below 0.05,
# record the most extreme value as an outlier, drop all flagged values, and
# re-test the remainder. Returns a data frame with the original values (X)
# and a logical Outlier column marking which entries were flagged.
grubbs.flag <- function(x) {
  outliers <- NULL
  test <- x
  grubbs.result <- grubbs.test(test)
  pv <- grubbs.result$p.value
  while(pv < 0.05) {
    # grubbs.test embeds the suspected value in its "alternative" text
    # (e.g. "highest value 123 is an outlier"); word [3] is that value.
    # NOTE(review): this parsing depends on outliers::grubbs.test's exact
    # message format -- confirm against the installed package version.
    outliers <- c(outliers,as.numeric(strsplit(grubbs.result$alternative," ")[[1]][3]))
    # Remove every flagged value and re-run the test on what remains
    test <- x[!x %in% outliers]
    grubbs.result <- grubbs.test(test)
    pv <- grubbs.result$p.value
  }
  return(data.frame(X=x,Outlier=(x %in% outliers)))
}
 
# Count flagged outliers per numeric column.
# Start at column 3 because columns [1] (id) and [2] (diagnosis) are not numeric.
# ncol(wbcd) is the idiomatic form of length(colnames(wbcd)).
for (i in 3:ncol(wbcd)) {
  col_name <- colnames(wbcd)[i]
  flagged <- grubbs.flag(wbcd[, col_name])
  # sum of a logical vector = number of TRUE entries (flagged outliers)
  print(paste(col_name, '--->', sum(flagged$Outlier)))
}
 
# Check for missing (NA) values
colSums(is.na(wbcd))  # the breast-cancer data has no missing values
 
# factor is for categorical data; convert the diagnosis target to a factor.
# Fixes the misspelled label "Maliganant" -> "Malignant" (display label only;
# no later code references the label text).
wbcd$diagnosis <- factor(wbcd$diagnosis,
                         levels = c("B", "M"),
                         labels = c("Benign", "Malignant"))
 
# Check the class proportions
prop.table(table(wbcd$diagnosis))
 
 
# Fix the seed so the shuffle is always reproducible
set.seed(1)
wbcd_shuffle <- wbcd[sample(nrow(wbcd)), ]
# Column [1] is the useless patient id, so drop it after shuffling
# (the scrape had dropped the closing bracket on this line)
wbcd2 <- wbcd_shuffle[-1]
 
# Inspect the shuffled data
str(wbcd2)
 
# Min-max scaling: map x linearly onto the [0, 1] interval.
normalize <- function(x) {
  lo <- min(x)
  (x - lo) / (max(x) - lo)
}
 
ncol(wbcd2)  # 31
# Normalize every feature column, excluding the diagnosis target in [1]
wbcd_n <- as.data.frame(lapply(wbcd2[2:31], normalize))
 
# (the scrape had dropped the closing paren before the inline comment)
train_num <- round(0.9 * nrow(wbcd_n), 0)  # 512 (90%)
 
# Split the data into train/test
wbcd_train <- wbcd_n[1:train_num, ]                   # 512 rows
wbcd_test <- wbcd_n[(train_num + 1):nrow(wbcd_n), ]   # 57 rows
 
# Split the labels into train/test
wbcd_train_label <- wbcd2[1:train_num, 1]
wbcd_test_label <- wbcd2[(train_num + 1):nrow(wbcd_n), 1]
 
# Run the KNN algorithm
library(class)
# k chosen freely (usually odd, to avoid ties)
# (the scrape had dropped the closing paren before the inline comment)
result1 <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_label, k = 21)
 
# Put the predictions next to the true labels
# (the scrape had dropped the `x` on the left of the assignment)
x <- data.frame('실제' = wbcd_test_label, "예측" = result1)
table(x)
 
# Evaluate accuracy: sum of the diagonal of the 2x2 proportion table
# (linear positions 1 and 4; closing brackets restored)
library(gmodels)
g2 <- CrossTable(x = wbcd_test_label, y = result1, chisq = FALSE)
g2$prop.tbl[1] + g2$prop.tbl[4]  # 0.9298246
 
 
# Which k gives the best accuracy?
# Try every odd k from 1 to 199. Preallocating the result vector avoids
# the O(n^2) cost of growing it with append() inside the loop.
k_num <- seq(1, 200, by = 2)
temp <- numeric(length(k_num))
for (j in seq_along(k_num)) {
  wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test,
                        cl = wbcd_train_label, k = k_num[j])
  g2 <- CrossTable(x = wbcd_test_label, y = wbcd_test_pred, chisq = FALSE)
  # accuracy = sum of the diagonal of the 2x2 proportion table
  # (the scrape had dropped a closing bracket on this line)
  temp[j] <- g2$prop.tbl[1] + g2$prop.tbl[4]
}
result <- data.frame("k" = k_num, "정확도" = temp)
library(plotly)
plot_ly(x = ~result[, "k"], y = ~result[, "정확도"], type = 'scatter', mode = 'lines') %>%
  layout(xaxis = list(title = "k값"), yaxis = list(title = "정확도"))
 
cs

 

 

 

좋은 데이터는.... 좋은 결과물을 낳는다... 

 

 

 

+ Recent posts