R

[빅분기 실기대비] 작업형 제2유형 for R (glm분석 이용)

한번해보즈아 2021. 6. 3. 18:17
# Load the training features and labels.
# The dataset folder is kept in a variable and joined with file.path()
# instead of calling setwd(), so running this script no longer changes the
# session's working directory as a side effect. Edit data_dir to point at
# the folder that holds X_train.csv and Y_train.csv.
data_dir <- "C:/Users/jinuk/Downloads/[Dataset] 작업형 제2유형"
x_train <- read.csv(file.path(data_dir, "X_train.csv"))
y_train <- read.csv(file.path(data_dir, "Y_train.csv"))

# Preprocessing
# Bind features and labels side by side, then drop the 11th column of the
# combined frame.
# NOTE(review): column 11 is presumably the ID column repeated in
# Y_train.csv -- confirm against the file; later code indexes the target
# by position (column 11 after this drop).
raw_df <- cbind(x_train, y_train)[, -11]
class(raw_df$gender)

# Convert the target and the two categorical predictors to factors.
factor_cols <- c("gender", "주구매상품", "주구매지점")
raw_df[factor_cols] <- lapply(raw_df[factor_cols], as.factor)

# Treat a missing refund amount as zero.
refund <- raw_df[["환불금액"]]
raw_df[["환불금액"]] <- ifelse(is.na(refund), 0, refund)
raw_df

# Fit the logistic regression (GLM)
set.seed(123)  # fixed seed so the 70/30 split below is reproducible
str(raw_df)

# Randomly assign 70% of the rows to training; the rest is held out.
idx <- sample(
  seq_len(nrow(raw_df)),
  floor(nrow(raw_df) * 0.7),
  replace = FALSE
)
train <- raw_df[idx, ]
test <- raw_df[-idx, ]

# Fit on the training split only. Fitting on raw_df would let the model see
# the held-out rows and make the evaluation on `test` optimistic.
logistic <- glm(
  gender ~ 총구매액 + 최대구매액 + 환불금액 + 주구매지점 + 주구매상품 +
    내점일수 + 내점당구매건수 + 주말방문비율 + 구매주기,
  data = train,
  family = "binomial"
)
summary(logistic)

# Refine the model with stepwise selection (terms may be added or dropped).
step.logistic <- step(object = logistic, direction = "both")
summary(step.logistic)

# Score the hold-out rows; column 11 (the target) is excluded from the
# predictors passed to predict(). type = "response" returns probabilities.
pred <- predict(step.logistic, newdata = test[, -11], type = "response")
pred1 <- as.data.frame(pred)

# Classify at a 0.5 cutoff: below 0.5 -> 0, otherwise 1.
pred1[["grade"]] <- as.numeric(pred1[["pred"]] >= 0.5)

# Evaluate on the hold-out set
# confusionMatrix() comes from caret; prediction() and performance() come
# from ROCR. Attach them here so this section runs in a fresh session.
library(caret)
library(ROCR)

# Give the predicted classes the same levels as the reference factor, so the
# confusion matrix still works when every prediction falls in one class.
pred_class <- factor(pred1$grade, levels = levels(test[, 11]))
confusionMatrix(pred_class, test[, 11])

# Build the ROC curve from the predicted probabilities. Using the 0/1
# classes instead collapses the curve to a single operating point and
# understates the AUC.
pred.logistic.roc <- prediction(pred1$pred, test[, 11])
plot(performance(pred.logistic.roc, "tpr", "fpr"))
performance(pred.logistic.roc, "auc")@y.values

첫 시도라 ROC-AUC는 58%밖에 안되지만 초안이라 생각하고 봐주시면 감사하겠습니다.