mean_err <- mean(as.numeric(pred > 0.5) != test$label)
mean_err # 0.02172564
# 9. View feature importance/influence from the learnt model
# 학습된 model의 변수(feature) 중요도/영향력 보기
importance_matrix <- xgb.importance(colnames(train$data),
model = xgb_model)
importance_matrix
xgb.plot.importance(importance_matrix = importance_matrix)
library(rpart)
library(rpart.plot)
formula <- Species ~ .
iris.df <- rpart(formula, data=iris)
iris.df
plot(iris.df ) # 트리 프레임 보임
text(iris.df, use.n=T, cex=0.6) # 텍스트 추가
post(iris.df, file="")
rpart.plot(iris.df) # rpart 모델 tree 출력(rpart.plot 패키지 제공)
fancyRpartPlot(iris.df) # node 번호 출력(rattle 패키지 제공)
library('rattle')
formula <- Species ~ .
iris.df <- rpart(formula, data=iris)
iris.df
rpart.plot(iris.df) # rpart 모델 tree 출력(rpart.plot 패키지 제공)
install.packages('rattle')
rpart_model <- rpart(formula, data=iris)
rpart_model
rpart.plot(rpart_model) # rpart 모델 tree 출력(rpart.plot 패키지 제공)
# 1) 데이터 가져오기
# c:/Rwork/Part-IV/weather.csv 파일 선택
weather = read.csv("c:/Rwork/Part-IV/weather.csv", header=TRUE)
# 2) 데이터 특성 보기
str(weather) # data.frame':  366 obs. of  15 variables:
names(weather) # 15개 변수명
head(weather)
# 3) 분류분석 - 의사결정트리 생성
weather.df <- rpart(RainTomorrow ~ .,
data=weather[, c(-1,-14)], cp=0.01)
weather.df
# 4) 분류분석 시각화
X11()
# 4) 분류분석 시각화
rpart.plot(rpart_model)
# 4) 분류분석 시각화
rpart.plot(weather.df)
# 1. data set 생성
idx <- sample(nrow(iris), 0.7*nrow(iris))
train <- iris[idx, ]
test <- iris[-idx, ]
# 2. y 변수 binary(number)
Species2 <- ifelse(train$Species == "setosa", 0, 1)
train$Species2 <- Species2
Species2 <- ifelse(test$Species == "setosa", 0, 1)
test$Species2 <- Species2
head(train)
head(test)
tail(test)
# 3. xgb.DMatrix 생성
train_mat <- as.matrix(train[-c(5:6)]) # data : matrix(1~4)
dim(train_mat) # 105 4
train_lab <- train$Species2 # label : vector
length(train_lab) # 105
dtrain <- xgb.DMatrix(data = train_mat, label = train_lab)
dtrain
# 4. model 생성 : xgboost matrix 객체 이용
xgb_model_iris <- xgboost(data = dtrain, max_depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
xgb_model_iris # object info
# 5. prediction
# x(data) : matrix, y(label) : vector
test_mat <- as.matrix(test[-c(5:6)]) # matrix
dim(test_mat) # 45  4
test_lab <- test$Species2 # vector
length(test_lab) # 45
pred_iris <- predict(xgb_model_iris, test_mat)
range(pred_iris) # 0.06015496 0.94841605
# 6. cut off 적용
cpred_iris <- ifelse(pred_iris >= 0.5, 1, 0)
# 7. confusion matrix
table(cpred_iris, test_lab)
# 8. 모델 성능평가1 : Accuracy
(14+31) / length(test_lab)
# 8. 모델 성능평가2 : 평균 오차(잔차의 평균)
mean_err <- mean(as.numeric(pred_iris > 0.5) != test_lab)
cat("test-error =", mean_err)
# 9. View feature importance/influence from the learnt model
# 학습된 model의 변수(feature) 중요도/영향력 보기
importance_matrix <- xgb.importance(colnames(train_mat),
model = xgb_model_iris)
importance_matrix
# Petal.Length
# 1. package install
install.packages("xgboost")
library(xgboost)
# 1. data set 생성
idx <- sample(nrow(iris), 0.7*nrow(iris))
train <- iris[idx, ]
test <- iris[-idx, ]
# 2. y 변수 binary(number)
Species2 <- ifelse(train$Species == "setosa", 0, 1)
train$Species2 <- Species2
Species2 <- ifelse(test$Species == "setosa", 0, 1)
test$Species2 <- Species2
head(train)
head(test)
tail(test)
# 3. xgb.DMatrix 생성
train_mat <- as.matrix(train[-c(5:6)]) # data : matrix(1~4)
dim(train_mat) # 105 4
train_lab <- train$Species2 # label : vector
length(train_lab) # 105
dtrain <- xgb.DMatrix(data = train_mat, label = train_lab)
dtrain
# 4. model 생성 : xgboost matrix 객체 이용
xgb_model_iris <- xgboost(data = dtrain, max_depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
xgb_model_iris # object info
# 5. prediction
# x(data) : matrix, y(label) : vector
test_mat <- as.matrix(test[-c(5:6)]) # matrix
dim(test_mat) # 45  4
test_lab <- test$Species2 # vector
length(test_lab) # 45
pred_iris <- predict(xgb_model_iris, test_mat)
range(pred_iris) # 0.06015496 0.94841605
# 6. cut off 적용
cpred_iris <- ifelse(pred_iris >= 0.5, 1, 0)
# 7. confusion matrix
table(cpred_iris, test_lab)
# 8. 모델 성능평가1 : Accuracy
(14+31) / length(test_lab)
# 8. 모델 성능평가2 : 평균 오차(잔차의 평균)
mean_err <- mean(as.numeric(pred_iris > 0.5) != test_lab)
cat("test-error =", mean_err)
# 9. View feature importance/influence from the learnt model
# 학습된 model의 변수(feature) 중요도/영향력 보기
importance_matrix <- xgb.importance(colnames(train_mat),
model = xgb_model_iris)
importance_matrix
# Petal.Length
# 1. y변수 생성
table(iris$Species)
iris_label <- ifelse(iris$Species == 'setosa', 0,
ifelse(iris$Species == 'versicolor', 1, 2))
table(iris_label)
#  0  1  2
# 50 50 50
# 칼럼 추가
iris$label <- iris_label
head(iris)
tail(iris)
# 2. data set 생성
idx <- sample(nrow(iris), 0.7*nrow(iris))
train <- iris[idx, ]
test <- iris[-idx, ]
# 3. xgb.DMatrix 생성
train_mat <- as.matrix(train[-c(5:6)]) # data : matrix(1~4)
dim(train_mat) # 105 4
train_lab <- train$label # label : vector
length(train_lab) # 105
dtrain <- xgb.DMatrix(data = train_mat, label = train_lab)
dtrain
# 4. model 생성 : xgboost matrix 객체 이용
xgb_model2 <- xgboost(data = dtrain, max_depth = 2, eta = 1,
nthread = 2, nrounds = 2,
objective = "multi:softmax",
num_class = 3,
verbose = 0)
xgb_model2 # object info
# 5. prediction
# x(data) : matrix, y(label) : vector
test_mat <- as.matrix(test[-c(5:6)]) # matrix
dim(test_mat) # 45  4
test_lab <- test$label # vector
length(test_lab) # 45
pred_iris <- predict(xgb_model2, test_mat)
pred_iris # class : 0 1 2
# 6. cut off 적용(x)
# 7. confusion matrix
table(pred_iris, test_lab)
# 8. 모델 성능평가1 : Accuracy
(17+15+12) / length(test_lab) # 0.9777778
# 9. View feature importance/influence from the learnt model
# 학습된 model의 변수(feature) 중요도/영향력 보기
importance_matrix <- xgb.importance(colnames(train_mat),
model = xgb_model2)
importance_matrix
xgb.plot.importance(importance_matrix)
# Petal.Length > Petal.Width
?xgboost
table(iris_label)
xgb_model2 # object info
# 4. model 생성 : xgboost matrix 객체 이용
xgb_model <- xgboost(data = dtrain, max_depth = 2, eta = 1,
nthread = 2, nrounds = 2,
objective = "multi:softmax",
num_class = 3,
verbose = 0)
xgb_model # object info
# 5. prediction
# x(data) : matrix, y(label) : vector
test_mat <- as.matrix(test[-c(5:6)]) # matrix
dim(test_mat) # 45  4
# 5. prediction
# x(data) : matrix, y(label) : vector
test_mat <- as.matrix(test[-c(5:6)]) # matrix
# 5. prediction
# x(data) : matrix, y(label) : vector
test_mat <- as.matrix(test[-c(5:6)]) # matrix
dim(test_mat) # 45  4
test_lab <- test$label # vector
length(test_lab) # 45
pred_iris <- predict(xgb_model, test_mat)
pred_iris # class : 0 1 2
# 6. cut off 적용(x)
# 7. confusion matrix
table(pred_iris, test_lab)
# 8. 모델 성능평가1 : Accuracy
(17+15+12) / length(test_lab) # 0.9777778
# 9. View feature importance/influence from the learnt model
# 학습된 model의 변수(feature) 중요도/영향력 보기
importance_matrix <- xgb.importance(colnames(train_mat),
model = xgb_model)
importance_matrix
# 7. 모델 성능평가1 : Accuracy
(19+14+12) / length(test_lab) # 0.9777778
# 7. 모델 성능평가1 : Accuracy
(18+14+12) / length(test_lab) # 0.9777778
# 7. 모델 성능평가1 : Accuracy
(19+13+12) / length(test_lab) # 0.9777778
# 8. View feature importance/influence from the learnt model
# 학습된 model의 변수(feature) 중요도/영향력 보기
importance_matrix <- xgb.importance(colnames(train_mat),
model = xgb_model)
importance_matrix
xgb.plot.importance(importance_matrix)
library(rpart) # rpart() : 분류모델 생성
library(rpart.plot) # prp(), rpart.plot() : rpart 시각화
library('rattle') # fancyRpartPlot() : node 번호 시각화
# 단계1. 실습데이터 생성
data(iris)
set.seed(415)
idx = sample(1:nrow(iris), 0.7*nrow(iris))
train = iris[idx, ]
test = iris[-idx, ]
dim(train) # 105 5
dim(test) # 45  5
table(train$Species)
# 단계2. 분류모델 생성
# rpart(y변수 ~ x변수, data)
model = rpart(Species~., data=train) # iris의 꽃의 종류(Species) 분류
model
# 분류모델 시각화 - rpart.plot 패키지 제공
prp(model) # 간단한 시각화
rpart.plot(model) # rpart 모델 tree 출력
# http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets
titanic = read.csv("D:/ITWILL/2_Rwork/Part-IV/titanic3.csv")
str(titanic)
#단계1: K겹 교차검정을 위한 샘플링
library(cvTools)
cross <- cvFolds(nrow(iris), K=3, R=1)
#단계2: K겹 교차검정 데이터 보기
str(cross) # 구조 보기
cross # 5겹 교차검정 데이터 보기
cross$which # 1 2 3 1 2 3
table(cross$which)
# 1set -> d1,d2,d3
d1 <- cross$subsets[cross$which==1,1]
# 2set -> d1,d2,d3
d2 <- cross$subsets[cross$which==1,2]
#단계3: K겹 교차검정 수행
R=1:2 # 2회 반복
K=1:3 # 3겹
ACC <- numeric() # 분류정확도
cnt <- 1
for(i in R){ # 2set(1set, 2set)
cat("###", i, "셋트###\n")
for(j in K){ # 3회(d1,d2,d3)
cat("###", j, "회###\n")
idx <- cross$subsets[cross$which==j,1]#d1,d2,d3
test <- iris[idx, ] # d1->test : 검정
train <- iris[-idx, ] # d2,d3 -> train : model
model <- rpart( Species ~ ., data = train)
# 예측치
p <- predict(model, test, type = 'class')
# 혼돈 matrix
t <- table(p, test$Species)
# 분류정확도
ACC[j] <- (t[1,1]+t[2,2]+t[3,3]) / sum(t)
cnt <- cnt + 1 # 카운터 변수
}
}
ACC # 0.94 0.90 0.98 0.86 0.96 0.96
mean(ACC) # 0.9333333
mtry = sqrt(19)
mtry = 4
mtry
library(rpart) # rpart() : 분류모델 생성
library(rpart.plot) # prp(), rpart.plot() : rpart 시각화
library('rattle') # fancyRpartPlot() : node 번호 시각화
# 단계1. 실습데이터 생성
data(iris)
set.seed(415)
idx = sample(1:nrow(iris), 0.7*nrow(iris))
train = iris[idx, ]
test = iris[-idx, ]
dim(train) # 105 5
dim(test) # 45  5
table(train$Species)
# 단계2. 분류모델 생성
# rpart(y변수 ~ x변수, data)
model = rpart(Species~., data=train) # iris의 꽃의 종류(Species) 분류
model
?rpart
# 가지치기(cp)
# 범위 : 0 ~ 1, default = 0.05
# 0으로 갈수록 트리 커짐, 오류율 감소, 과적합 증가
model$cptable
############################
## iris 적용 : y 다항분류
############################
?xgboost
############################
## iris 적용 : y 다항분류
############################
?xgboost
# 2. data set 생성
idx <- sample(nrow(iris), 0.7*nrow(iris))
train <- iris[idx, ]
test <- iris[-idx, ]
# 1. y변수 생성
table(iris$Species)
iris_label <- ifelse(iris$Species == 'setosa', 0,
ifelse(iris$Species == 'versicolor', 1, 2))
table(iris_label)
# 칼럼 추가
iris$label <- iris_label
head(iris)
tail(iris)
# 2. data set 생성
idx <- sample(nrow(iris), 0.7*nrow(iris))
train <- iris[idx, ]
test <- iris[-idx, ]
# 3. xgb.DMatrix 생성
train_mat <- as.matrix(train[-c(5:6)]) # data : matrix(1~4)
dim(train_mat) # 105 4
# 3. xgb.DMatrix 생성
train_mat <- as.matrix(train[-c(1,5:6)]) # data : matrix(1~4)
dim(train_mat) # 105 4
train_lab <- train$Sepal.Length # label : vector
length(train_lab) # 105
dtrain <- xgb.DMatrix(data = train_mat, label = train_lab)
library(xgboost)
dtrain <- xgb.DMatrix(data = train_mat, label = train_lab)
dtrain
# 4. model 생성 : xgboost matrix 객체 이용
xgb_model <- xgboost(data = dtrain, max_depth = 2, eta = 1,
nthread = 2, nrounds = 2,
num_class = 3,
verbose = 0)
# 4. model 생성 : xgboost matrix 객체 이용
xgb_model <- xgboost(data = dtrain, max_depth = 2, eta = 1,
nthread = 2, nrounds = 2,
objective = "reg:linear",
num_class = 3,
verbose = 0)
dtrain <- xgb.DMatrix(data = train_mat, label = train_lab)
dtrain
# 4. model 생성 : xgboost matrix 객체 이용
xgb_model <- xgboost(data = dtrain, max_depth = 2, eta = 1,
nthread = 2, nrounds = 2,
verbose = 0)
xgb_model # object info
# 5. prediction
# x(data) : matrix, y(label) : vector
test_mat <- as.matrix(test[-c(1,5:6)]) # matrix
dim(test_mat) # 45  4
test_lab <- test$label # vector
length(test_lab) # 45
pred_iris <- predict(xgb_model, test_mat)
pred_iris # class : 0 1 2
# 6. confusion matrix
mean( (pred_iris- test_lab)**)
# 6. confusion matrix
mean( (pred_iris- test_lab)**2)
pred_iris # class : 0 1 2
test_lab <- test$Sepal.Length # vector
length(test_lab) # 45
pred_iris <- predict(xgb_model, test_mat)
pred_iris # class : 0 1 2
# 6. confusion matrix
mean( (pred_iris- test_lab)**2)
# 2. data set 생성
idx <- sample(nrow(iris), 0.7*nrow(iris))
train <- iris[idx, ]
test <- iris[-idx, ]
# 3. xgb.DMatrix 생성
train_mat <- as.matrix(train[-c(1,5)]) # data : matrix(1~4)
dim(train_mat) # 105 4
# 2. data set 생성
idx <- sample(nrow(iris), 0.7*nrow(iris))
train <- iris[idx, ]
test <- iris[-idx, ]
# 3. xgb.DMatrix 생성
train_mat <- as.matrix(train[-c(1,5)]) # data : matrix(1~4)
dim(train_mat) # 105 4
train_mat
# 2. data set 생성
data("iris")
idx <- sample(nrow(iris), 0.7*nrow(iris))
train <- iris[idx, ]
test <- iris[-idx, ]
# 3. xgb.DMatrix 생성
train_mat <- as.matrix(train[-c(1,5)]) # data : matrix(1~4)
dim(train_mat) # 105 4
train_lab <- train$Sepal.Length # label : vector
length(train_lab) # 105
dtrain <- xgb.DMatrix(data = train_mat, label = train_lab)
dtrain
# 4. model 생성 : xgboost matrix 객체 이용 : reg -> default
xgb_model <- xgboost(data = dtrain, max_depth = 2, eta = 1,
nthread = 2, nrounds = 2,
verbose = 0)
xgb_model # object info
# 5. prediction
# x(data) : matrix, y(label) : vector
test_mat <- as.matrix(test[-c(1,5)]) # matrix
dim(test_mat) # 45  4
test_lab <- test$Sepal.Length # vector
length(test_lab) # 45
pred_iris <- predict(xgb_model, test_mat)
pred_iris # class : 0 1 2
# 6. confusion matrix
mean( (pred_iris- test_lab)**2) #  0.2033847
# 대출 수락 or 거절 prediction
setwd("c:/ITWILL/2_Rwork/Part-IV")
# 실습 데이터 가져오기(TM에서 전처리한 데이터)
setwd("C:/Rwork/Part-IV")
sms_data <- read.csv('data/sms_spam_tm.csv')
# 실습 데이터 가져오기(TM에서 전처리한 데이터)
setwd("C:/Rwork/Part-IV")
sms_data <- read.csv('sms_spam_tm.csv')
dim(sms_data) # [1] 5558(row) 6824(word) - 6157
# 1. 파일 가져오기
weatherAUS = read.csv(file.choose()) #weatherAUS.csv
weatherAUS = weatherAUS[ ,c(-1,-2, -22, -23)] # 칼럼 제외
str(weatherAUS)
# 2. 데이터 셋 생성
set.seed(415)
idx = sample(1:nrow(weatherAUS), 0.7*nrow(weatherAUS))
training_w = weatherAUS[idx, ]
testing_w  = weatherAUS[-idx, ]
# 3. NA 제거
training_w = na.omit(training_w)
testing_w = na.omit(testing_w)
model_svm1 <- svm(RainTomorrow~MinTemp+MaxTemp+Rainfall+Sunshine+Evaporation, data=training_w, kernel='radial')
model_svm2 <- svm(RainTomorrow~MinTemp+MaxTemp+Rainfall+Sunshine+Evaporation, data=training_w, kernel='linear')
model_svm1
model_svm2
model_svm1 <- svm(RainTomorrow~MinTemp+MaxTemp+Rainfall+Sunshine+Evaporation, data=training_w, kernel='radial')
library(e1071)
install.packages("e1071") # naiveBayes(), svm() 제공
model_svm1 <- svm(RainTomorrow~MinTemp+MaxTemp+Rainfall+Sunshine+Evaporation, data=training_w, kernel='radial')
library(e1071)
# 1. SVM 기본 개념 익히기 - Support Vector Margin
df = data.frame(
x1 = c(1,2,1,2,4,5,6),
x2 = c(8,7,5,6,1,3,2),
y=factor(c(1,1,1,1,0,0,0))
)
# 2. svm 모델 생성
# 형식) svm(y ~ x, data, type, kernel) : method, kernel 속성 생략 가능
model_svm = svm(y ~ ., data = df)
model_svm
summary(model_svm)
pred <- predict(model_svm, df)
pred # 1 1 1 1 0 0 0
model_svm1 <- svm(RainTomorrow~MinTemp+MaxTemp+Rainfall+Sunshine+Evaporation, data=training_w, kernel='radial')
model_svm2 <- svm(RainTomorrow~MinTemp+MaxTemp+Rainfall+Sunshine+Evaporation, data=training_w, kernel='linear')
model_svm1
model_svm2
# 5. 분류모델 평가
pred1 <- predict(model_svm1, testing_w)
pred2 <- predict(model_svm2, testing_w)
# kernel=radial 방식
table(pred1, testing_w$RainTomorrow)
# kernel=linear 방식
table(pred2, testing_w$RainTomorrow)
# 2. 데이터 셋 생성
set.seed(415)
idx = sample(1:nrow(weatherAUS), 0.7*nrow(weatherAUS))
training_w = weatherAUS[idx, ]
testing_w  = weatherAUS[-idx, ]
training_w
testing_w
# 3. NA 제거
training_w2 = na.omit(training_w)
testing_w2 = na.omit(testing_w)
# formula
fo<-RainTomorrow~MinTemp+MaxTemp+Rainfall+Sunshine+Evaporation
# formula
form <-RainTomorrow~.
#1)NB
library(e1071)
weather_NB<-naiveBayes(form, data = training_w2)
pred_NB<-predict(weather_NB, testing_w2)
table(pred_NB, testing_w2$RainTomorrow)
###1) kernel="radial"
weather_svm_ra<-svm(fo, data=training_w2)
pred_svm_ra<-predict(weather_svm_ra, testing_w2)
table(pred_svm_ra, testing_w2$RainTomorrow)
# pred_svm   No  Yes
# No  3925  801
# Yes  177  388
(3890+423)/nrow(testing_w2) #0.8151578
