mushrooms.csv
1.17MB

 

 

 

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# Classify edible vs. poisonous mushrooms with rule-based learners (OneR)
mushroom <- read.csv("c:/data/mushrooms.csv", header = TRUE,
                     stringsAsFactors = TRUE)
train_cnt <- round(0.75 * nrow(mushroom)) # 6093

# Shuffle: draw the training row indices at random (seeded for reproducibility)
set.seed(11)
train_index <- sample(seq_len(nrow(mushroom)), train_cnt, replace = FALSE)

# train (75%) / test (25%)
mushroom_train <- mushroom[train_index, ]
mushroom_test  <- mushroom[-train_index, ]

# Build the OneR model (install the package only when it is missing, so that
# re-running the script does not reinstall it every time)
if (!requireNamespace("OneR", quietly = TRUE)) {
  install.packages("OneR")
}
library(OneR)

model1 <- OneR(type ~ ., data = mushroom_train)
model1
summary(model1)

# Predict on the test rows; column 1 is dropped from the predictors
# (assumed to be the `type` label used in the formula above -- confirm)
result1 <- predict(model1, mushroom_test[, -1])

# Accuracy: sum of the two diagonal cell proportions of the cross table.
# The CrossTable result must be stored in `x` for the next line to work.
library(gmodels)
x <- CrossTable(mushroom_test[, 1], result1)
x$prop.tbl[1] + x$prop.tbl[4] # 0.9862137


##########################################################
# Classify edible vs. poisonous mushrooms with rule-based learners (JRip)
# Build the JRip model (install RWeka only when it is missing)
if (!requireNamespace("RWeka", quietly = TRUE)) {
  install.packages("RWeka")
}
library(RWeka)

model2 <- JRip(type ~ ., data = mushroom_train)
model2

summary(model2) # shows a small two-way confusion table

# Predict
result2 <- predict(model2, mushroom_test[, -1])

# Accuracy (same diagonal-sum computation as for OneR)
library(gmodels)
x <- CrossTable(mushroom_test[, 1], result2)
x$prop.tbl[1] + x$prop.tbl[4] # 1
 
cs

 

 

 

flu.csv
0.00MB

 

 

 

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Interactively estimate the probability of flu with a naive Bayes model.
#
# Reads the training data from c:/data/flu.csv, asks four symptom questions on
# the console, and returns a single character string reporting the predicted
# probability (in percent) of the second class of the `flue` label.
# Takes no arguments.
flu_func <- function() {
  library(e1071)

  flu_insert <- read.csv("c:/data/flu.csv", header = TRUE,
                         stringsAsFactors = TRUE)

  # Upper-case the headache values. toupper() returns character, so wrap the
  # result in factor() again to keep the column categorical like the others.
  flu_insert$headache <- factor(toupper(flu_insert$headache))

  # Drop the first column (patient number per the original note)
  flu <- flu_insert[-1]

  # Train on every row of the file (8 rows in the tutorial data) instead of a
  # hard-coded 1:8, which would create NA rows for a shorter file.
  train <- flu

  model <- naiveBayes(flue ~ ., data = train, laplace = 0)

  # Ask for the symptoms. Answers are trimmed and upper-cased so that "y" and
  # "Y" are treated the same; expected answers are shown on the right.
  a <- toupper(trimws(readline(prompt = '오한이 있습니까?'))) # Y
  b <- toupper(trimws(readline(prompt = '콧물이 있습니까?'))) # N
  c <- toupper(trimws(readline(prompt = '두통이 있습니까?'))) # MILD
  d <- toupper(trimws(readline(prompt = '열이 있습니까?')))   # N

  # Build the one-row test frame. Each answer is encoded with the level set of
  # the matching training column so the factor codes line up with the model's
  # conditional tables even though the frame has a single row.
  test <- data.frame(
    chills     = factor(a, levels = levels(train$chills)),
    runny_nose = factor(b, levels = levels(train$runny_nose)),
    headache   = factor(c, levels = levels(train$headache)),
    fever      = factor(d, levels = levels(train$fever))
  )

  # type = "raw" returns the per-class probabilities instead of the class label
  # (without `type`, predict() returns the most probable label).
  # Per the original note, result[1] is P(N) and result[2] is P(Y).
  result <- predict(model, test, type = "raw")

  paste('독감일 확률이', round(result[2] * 100, digits = 1), '% 입니다.')
  # "독감일 확률이 24.5 % 입니다."
}

flu_func()
cs

 

 

 

 

 

 

 

 

movie.csv
0.00MB
movie_test.csv
0.00MB

 

 

 

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# 1. Set the working directory (the CSV below is read relative to it)
setwd("c:\\data")

# 2. Load the library
library(e1071)

# 3. Load the data
movie <- read.csv("movie.csv", header = TRUE, stringsAsFactors = TRUE)
str(movie)  # check that the columns were converted to factors
nrow(movie) # 39

# 4. Rename the columns to English
View(movie)
colnames(movie) <- c("age", "gender", "job", "marry", "friend", "m_type")

View(movie)

# 5. train (all rows but the last: 38) / test (the last row: 1)
n_movie <- nrow(movie)
train <- movie[seq_len(n_movie - 1), ]
test <- movie[n_movie, ]

train
test

# 6. Train naive Bayes
#                     predictors      label
model <- naiveBayes(train[, 1:5], train$m_type, laplace = 0)
# The formula interface is equivalent:
# model <- naiveBayes(m_type ~ ., data = train, laplace = 0) # m_type is the label

# 7. Prediction
result <- predict(model, test[, 1:5])
# Passing the whole `test` row would also work: m_type is not one of the
# model's predictors, so it is not used for prediction.
result

test2 <- data.frame(age = '20대', gender = '여', job = 'IT', marry = 'NO',
                    friend = 'NO')
result <- predict(model, test2)
result # 로맨틱

test3 <- data.frame(age = '20대', gender = '남', job = '학생', marry = 'NO',
                    friend = 'NO')
result <- predict(model, test3)
result # 코미디

# Predict for a file chosen by the user
fname <- file.choose()
test4 <- read.csv(fname, header = TRUE, stringsAsFactors = TRUE)
# The column names must match the training names exactly, because predict()
# looks predictors up by name (so column order does not matter). The last name
# was misspelled 'frined', which left the `friend` predictor unmatched.
names(test4) <- c("age", "gender", "job", "marry", "friend")

result <- predict(model, test4)
result # 스릴러
cs

 

 

 

 

 

 

mushrooms.csv
1.17MB

 

 

 

자료형태

범주형

명목형

순서형

수치형

이산형

연속형

 

 

 

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# Classify edible vs. poisonous mushrooms with naive Bayes
mushroom <- read.csv("c:/data/mushrooms.csv", header = TRUE,
                     stringsAsFactors = TRUE)

# Why convert to factor:
# without the conversion the values are plain strings; as factors each distinct
# value is treated as a category, which is what naive Bayes needs for nominal
# data.

# The data can also be inspected as a table
View(mushroom)

# Check for missing values
colSums(is.na(mushroom))

dim(mushroom) # 8124 (rows) 23 (columns)

# Shuffle the data
set.seed(1)
train_cnt <- round(0.75 * nrow(mushroom))
train_cnt # 6093

# Draw train_cnt (6093) random row numbers out of 1..8124
train_index <- sample(seq_len(nrow(mushroom)), train_cnt, replace = FALSE)
train_index

# Split into train / test using the sampled indices
mushroom_train <- mushroom[train_index, ]
mushroom_test <- mushroom[-train_index, ]

nrow(mushroom_train) # 6093
nrow(mushroom_test)  # 2031

# Inspect mushroom_train
str(mushroom_train)

# Install e1071 first if it is missing:
# install.packages('e1071')
# Fit naive Bayes
library(e1071)
model1 <- naiveBayes(type ~ ., data = mushroom_train) # type ~ . : column to predict
model1
result1 <- predict(model1, mushroom_test[, -1])
result1

library(gmodels)
CrossTable(mushroom_test[, 1], result1)

# Which laplace value gives the best accuracy?
# The candidate values are built in one step and the accuracies are collected
# with vapply(), instead of growing two vectors with append() inside a loop.
# Accuracy is the share of matching labels, which equals the diagonal sum of
# the cross table's cell proportions without printing a table per iteration.
laplace_num <- seq_len(10) * 0.001
temp <- vapply(laplace_num, function(lp) {
  mushroom_test_pred <- naiveBayes(type ~ ., data = mushroom_train,
                                   laplace = lp)
  result2 <- predict(mushroom_test_pred, mushroom_test[, -1])
  # compare as character so differing factor level sets cannot raise an error
  mean(as.character(result2) == as.character(mushroom_test[, 1]))
}, numeric(1))

result <- data.frame("laplace" = laplace_num, "정확도" = temp)
library(plotly)
plot_ly(x = ~result[, "laplace"], y = ~result[, "정확도"],
        type = 'scatter', mode = 'lines') %>%
  layout(xaxis = list(title = "laplace값"), yaxis = list(title = "정확도"))
cs

 

 

 

 

 

 

 

 

 

wine.csv
0.01MB

 

 

 

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Classify wine grades with the KNN algorithm
wine <- read.csv('c:/data/wine.csv', header = TRUE)

# Note: since R 4.0.0 read.csv() defaults to stringsAsFactors = FALSE (the
# default was TRUE in earlier versions), so pass it explicitly when factors
# are needed.

# Shuffle every column together: the value to predict is column 1 (type), and
# unlike the breast-cancer data (where a useless patient-code column was
# dropped with [-1]) every wine column is useful, so nothing is removed here.

# Why shuffle:
# the data may already be mixed, but when it is grouped by type, as the wine
# data is, a plain head/tail split could put only type 1 and type 2 rows in
# train and only type 3 rows in test.
set.seed(15)
wine_shuffle <- wine[sample(nrow(wine)), ]
wine_shuffle

# wine2 is the shuffled wine data
wine2 <- wine_shuffle
str(wine2)

# Min-max normalisation: rescales a numeric vector to [0, 1]
normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}

# Number of columns in the wine data
ncol(wine)

# Build wine_n from every column except the label in column 1.
# The original hard-coded wine2[2:13]; wine2[-1] follows the stated intent
# ("all columns except type") for any number of feature columns.
# Normalising puts features measured in different units on a common scale so
# that distances are comparable.
wine_n <- as.data.frame(lapply(wine2[-1], normalize))
summary(wine_n)


# Split the data: train (90%) / test (10%)
train_num <- round(0.9 * nrow(wine_n), 0)
wine_train <- wine_n[1:train_num, ]
wine_test <- wine_n[(train_num + 1):nrow(wine_n), ]

# Split the labels the same way
wine_train_label <- wine2[1:train_num, 1]
wine_test_label <- wine2[(train_num + 1):nrow(wine_n), 1]

# Check
wine_train_label
wine_test_label

wine_train
wine_test

# Run KNN
library(class)
result1 <- knn(train = wine_train, test = wine_test, cl = wine_train_label,
               k = 21) # k is a free choice (usually odd)
result1
wine_test_label

# Put the true and predicted labels side by side
x <- data.frame('실제' = wine_test_label, "예측" = result1)
x
table(x)

# Evaluate the prediction rate (accuracy)
library(gmodels)
g2 <- CrossTable(x = wine_test_label, y = result1, prop.chisq = FALSE)
g2

# Accuracy = share of test rows whose predicted type equals the true type.
# For the full 3x3 table this equals prop.tbl[1] + prop.tbl[5] + prop.tbl[9],
# but it stays correct if a type is absent from the test labels.
mean(as.character(result1) == as.character(wine_test_label)) # 1


# Which k gives the best accuracy?
# Try every odd k from 1 to 199 and collect the accuracies with vapply(),
# instead of growing vectors with append() and printing a table per k.
k_num <- seq(1L, 199L, by = 2L)
temp <- vapply(k_num, function(k) {
  wine_test_pred <- knn(train = wine_train, test = wine_test,
                        cl = wine_train_label, k = k)
  mean(as.character(wine_test_pred) == as.character(wine_test_label))
}, numeric(1))

result <- data.frame("k" = k_num, "정확도" = temp)

library(plotly)
plot_ly(x = ~result[, "k"], y = ~result[, "정확도"],
        type = 'scatter', mode = 'lines') %>%
  layout(xaxis = list(title = "k값"), yaxis = list(title = "정확도"))
cs
 

 

이 그래프와 동일하지 않아도 됩니다. K값이 증가할수록 대체로 언더피팅 되는 것만 확인하세요!

 

 

 

+ Recent posts