2. 判別分析 >
2. 判別分析
判別分析はクラスター分析と同じく、お互い似かよった個体をグループ化する方法ですが、 どのグループに属するかが明確なサンプルデータ(教師データ、学習データ、訓練データとも言う)を与えて、それを元にグループ分けする点が異なります。判別分析にもさまざまな手法があり、それらの実例を紹介していきます。
2.1. 線形判別
線形判別は、直線でグループ分けを行う方法です。
まず、分析したいデータを Tab 区切りファイルで保存します。
Tab 区切りファイル
id name time touch keep send recv get contact 03U10後 優 7218 23.35827 9.39318 3.15877 2.41064 5.32003 3.07564 04U10後 星 6417 15.42777 6.07761 2.15054 2.43104 1.87003 1.87003 05U10後 磨 9439 14.04810 4.13179 2.41551 2.41551 2.47908 1.20776 09U10後 光 6264 20.01916 8.62069 2.29885 3.83142 2.87356 2.29885 10U10後 夢 10407 22.54252 8.12914 4.03575 2.13318 5.70770 1.96022 ...
作成したファイルを読み込み、どのグループに分類するか明確なデータから判別モデルを作成します。
R コード
# データファイルのあるフォルダを指定して setwd("e:/data") # データ読み込み (header=T は、先頭行が項目名であることを示す) d <- read.table("stats.txt", header=T) # パスを出した回数が多いグループ sortlist <- order(d$send) d_send <- d[sortlist,] d_train_s <- tail(d_send, n=7) d_train_s$group <- "s" # パスを受けた回数が多いグループ sortlist <- order(d$recv) d_recv <- d[sortlist,] d_train_r <- tail(d_recv, n=7) d_train_r$group <- "r" # どちらも少ないグループ sortlist <- order(d$send + d$recv) d_other <- d[sortlist,] d_train_o <- head(d_other, n=7) d_train_o$group <- "o" # 一つにまとめる d_train <- rbind(d_train_s, d_train_r, d_train_o) d_train <- d_train[,c("send","recv", "group","name")]
判別モデルを使用して元のデータをグループ分けします。
R コード
# 線形判別 library(MASS) d_train.lda <- lda(d_train[,1:2], d_train$group) # 元データに判別関数を適用 d_test <- d[,c("send","recv")] d_test.lda.predict <- predict(d_train.lda, d_test) d_bind <- cbind(d[,c("name","send","recv")], d_test.lda.predict$class) colnames(d_bind) <- c("name","send","recv","class") # 描画 plot( x=d_bind$send, y=d_bind$recv, type="n", xlim=c(0,5), ylim=c(0,5) ) text( x=d_bind$send, y=d_bind$recv, label=d_bind$name, col=ifelse(d_bind$class=="s","dodgerblue3", ifelse(d_bind$class=="r","deeppink", "darkviolet" )) )
実行結果
> d_train.lda Call: lda(d_train[, 1:2], d_train$group) Prior probabilities of groups: o r s 0.3333333 0.3333333 0.3333333 Group means: send recv o 1.778850 1.527601 r 2.230550 4.277090 s 3.536741 2.316207 Coefficients of linear discriminants: LD1 LD2 send 0.6554019 1.8339279 recv 2.7432618 -0.1921227 Proportion of trace: LD1 LD2 0.8437 0.1563 > d_test.lda.predict $class [1] s s s o r s r r s r o s s r r s o s o r o r s s o o r s o o o o s r o s [37] o s s s s r s s o r o s o s o s Levels: o r s $posterior o r s [1,] 3.133125e-04 5.850039e-06 9.996808e-01 [2,] 4.085686e-01 3.818548e-05 5.913933e-01 [3,] 9.058603e-02 2.738961e-05 9.093866e-01 [4,] 9.989232e-01 1.369978e-13 1.076782e-03 [5,] 4.169616e-10 9.997142e-01 2.858292e-04 ...省略... [48,] 1.082731e-02 2.825061e-03 9.863476e-01 [49,] 9.997539e-01 3.813770e-17 2.461237e-04 [50,] 3.419141e-02 4.071498e-07 9.658082e-01 [51,] 9.999650e-01 4.478852e-13 3.502893e-05 [52,] 7.890500e-03 9.025709e-03 9.830838e-01 $x LD1 LD2 [1,] -0.391221585 1.2368610 [2,] -0.996054943 -0.6160794 [3,] -0.864995947 -0.1271599 [4,] -3.609080636 -0.1405764 [5,] 2.942756723 -0.6131344 ...省略... [48,] 0.005947659 -0.3491820 [49,] -4.679391896 0.5337843 [50,] -1.308855565 0.6885472 [51,] -3.407038990 -1.4739148 [52,] 0.198552731 -0.4551210
図 2-1-1
グループの分割線を描画します。
R コード
# 格子状のデータを作成 len <- 100 cont.send <- seq(0, 5, length=len) cont.recv <- seq(0, 5, length=len) cont.grid <- expand.grid(send=cont.send, recv=cont.recv) # 格子状データに判別関数を適用 cont.lda.predict <- predict(d_train.lda, cont.grid) pred <- cont.lda.predict$posterior # 分割線を描画 cont1 <- pred[,"s"] - pmax(pred[,"r"], pred[,"o"]) contour( cont.send, cont.recv, matrix(cont1, len), add=T, levels=0 ) cont2 <- pred[,"o"] - pmax(pred[,"r"], pred[,"s"]) contour( cont.send, cont.recv, matrix(cont2, len), add=T, levels=0 )
図 2-1-2
predictで返された判別得点を使って、散布図を描画することもできます。
R コード
d_bind <- data.frame( cbind( d_test.lda.predict$x[,1], d_test.lda.predict$x[,2], as.character(d_test.lda.predict$class) ) ) colnames(d_bind) <- c("ld1","ld2","class") d_bind$name <- d$name # 描画 plot( x=d_bind$ld1, y=d_bind$ld2, type="n", xlim=c(-6,6.5), ylim=c(-4,3.5) ) text( x=d_bind$ld1, y=d_bind$ld2, label=d_bind$name, col=ifelse(d_bind$class=="s","dodgerblue3", ifelse(d_bind$class=="r","deeppink", "darkviolet" )) )
図 2-1-3
2.2. 二次判別
二次判別は、二次曲線でグループ分けを行う方法です。
R コード
# 2次判別 library(MASS) d_train.qda <- qda(d_train[,1:2], d_train$group) # 元データに判別関数を適用 d_test.qda.predict <- predict(d_train.qda, d_test) d_bind <- cbind(d[,c("name","send","recv")], d_test.qda.predict$class) colnames(d_bind) <- c("name","send","recv","class") # 描画 (線形判別と共通) plot( x=d_bind$send, y=d_bind$recv, type="n", xlim=c(0,5), ylim=c(0,5) ) text( x=d_bind$send, y=d_bind$recv, label=d_bind$name, col=ifelse(d_bind$class=="s","dodgerblue3", ifelse(d_bind$class=="r","deeppink", "darkviolet" )) ) # 格子状データに判別関数を適用 cont.qda.predict <- predict(d_train.qda, cont.grid) pred <- cont.qda.predict$posterior # 分割線を描画 (線形判別と共通) cont1 <- pred[,"s"] - pmax(pred[,"r"], pred[,"o"]) contour( cont.send, cont.recv, matrix(cont1, len), add=T, levels=0 ) cont2 <- pred[,"o"] - pmax(pred[,"r"], pred[,"s"]) contour( cont.send, cont.recv, matrix(cont2, len), add=T, levels=0 )
実行結果
> d_train.qda Call: qda(d_train[, 1:2], d_train$group) Prior probabilities of groups: o r s 0.3333333 0.3333333 0.3333333 Group means: send recv o 1.778850 1.527601 r 2.230550 4.277090 s 3.536741 2.316207 > d_test.qda.predict $class [1] s s s o r s r r s r o s s r r s o r o r o r s s o o r s o s o o s r o s [37] s s s s r r s r o r o s o s o s Levels: o r s $posterior o r s [1,] 1.246474e-11 3.146942e-08 1.000000e+00 [2,] 6.473634e-02 3.473031e-05 9.352289e-01 [3,] 3.029493e-04 3.075149e-06 9.996940e-01 [4,] 9.987960e-01 3.885414e-17 1.204041e-03 [5,] 5.475073e-21 9.999925e-01 7.485244e-06 ...省略... [48,] 5.460533e-06 2.686523e-03 9.973080e-01 [49,] 9.994857e-01 6.741299e-23 5.142916e-04 [50,] 4.604164e-06 1.346329e-09 9.999954e-01 [51,] 9.999990e-01 4.255540e-15 9.895930e-07 [52,] 3.995490e-06 1.669196e-02 9.833040e-01
図 2-2-1
2.3. k最近傍法(k最近隣法)
所属の不明な個体の周辺のk個の中で多数決を取り、グループ分けを行う方法です。
R コード
# k 最近傍法 library(class) d_test.knn.class <- knn(d_train[,1:2], d_test, cl=d_train$group, k=5) d_bind <- cbind(d[,c("name","send","recv")], d_test.knn.class) colnames(d_bind) <- c("name","send","recv","class") # 描画 (線形判別と共通) plot( x=d_bind$send, y=d_bind$recv, type="n", xlim=c(0,5), ylim=c(0,5) ) text( x=d_bind$send, y=d_bind$recv, label=d_bind$name, col=ifelse(d_bind$class=="s","dodgerblue3", ifelse(d_bind$class=="r","deeppink", "darkviolet" )) )
実行結果
> d_test.knn.class [1] s o s o r s r r s r o s o r r s o r o r o r s s o o r s o o o o s r o s [37] o s s s o r s r o r o s o s o r Levels: o r s
図 2-3-1
2.4. ナイーブベイズ法(単純ベイズ法)
判別関数でグループを分割するのではなく、ベイズの定理を使った確率モデルでグループ分けを行う方法です。
R コード
# ナイーブベイズ library(e1071) d_train.bay <- naiveBayes(d_train[,1:2], factor(d_train$group)) # 元データに適用 d_test.bay.class <- predict(d_train.bay, d_test, type="class") d_bind <- cbind(d[,c("name","send","recv")], d_test.bay.class) colnames(d_bind) <- c("name","send","recv","class") # 描画 (線形判別と共通) plot( x=d_bind$send, y=d_bind$recv, type="n", xlim=c(0,5), ylim=c(0,5) ) text( x=d_bind$send, y=d_bind$recv, label=d_bind$name, col=ifelse(d_bind$class=="s","dodgerblue3", ifelse(d_bind$class=="r","deeppink", "darkviolet" )) ) # 格子状データに適用 cont.bay.predict <- predict(d_train.bay, cont.grid, type="raw") pred <- cont.bay.predict # 分割線を描画 (線形判別と共通) cont1 <- pred[,"s"] - pmax(pred[,"r"], pred[,"o"]) contour( cont.send, cont.recv, matrix(cont1, len), add=T, levels=0 ) cont2 <- pred[,"o"] - pmax(pred[,"r"], pred[,"s"]) contour( cont.send, cont.recv, matrix(cont2, len), add=T, levels=0 )
実行結果
> d_train.bay Naive Bayes Classifier for Discrete Predictors Call: naiveBayes.default(x = d_train[, 1:2], y = d_train$group) A-priori probabilities: d_train$group o r s 0.3333333 0.3333333 0.3333333 Conditional probabilities: send d_train$group [,1] [,2] o 1.778850 0.4923629 r 2.230550 0.6789304 s 3.536741 0.3869773 recv d_train$group [,1] [,2] o 1.527601 0.4226392 r 4.277090 0.3239211 s 2.316207 0.3799006 > d_test.bay.class [1] s o o o r s r r s r o s o r r s o o o r o r s s o o r s o o o o s r o o o o s s o r s o o r o o o s o o Levels: o r s
図 2-4-1
klaR パッケージの NaiveBayes 関数を使えば、確率密度の推定を行うオプションを追加できます。データによっては結果が大きく変わるそうです。
R コード
# ナイーブベイズ library(klaR) d_train.bay2 <- klaR::NaiveBayes(factor(group) ~ send + recv, data=d_train, usekernel=TRUE) # 元データに適用 d_test.bay2.predict <- data.frame(predict(d_train.bay2, d_test) d_bind <- cbind(d[,c("name","send","recv")], d_test.bay2.predict$class) colnames(d_bind) <- c("name","send","recv","class") # 描画 (線形判別と共通) plot( x=d_bind$send, y=d_bind$recv, type="n", xlim=c(0,5), ylim=c(0,5) ) text( x=d_bind$send, y=d_bind$recv, label=d_bind$name, col=ifelse(d_bind$class=="s","dodgerblue3", ifelse(d_bind$class=="r","deeppink", "darkviolet" )) ) # 格子状データに適用 cont.bay2.predict <- predict(d_train.bay2, cont.grid) pred <- cont.bay2.predict$posterior # 分割線を描画 (線形判別と共通) cont1 <- pred[,"s"] - pmax(pred[,"r"], pred[,"o"]) contour( cont.send, cont.recv, matrix(cont1, len), add=T, levels=0 ) cont2 <- pred[,"o"] - pmax(pred[,"r"], pred[,"s"]) contour( cont.send, cont.recv, matrix(cont2, len), add=T, levels=0 )
実行結果
> d_train.bay2 $apriori grouping o r s 0.3333333 0.3333333 0.3333333 $tables $tables$send $tables$send$o Call: density.default(x = xx) Data: xx (7 obs.); Bandwidth 'bw' = 0.2269 x y Min. :0.4326 Min. :0.002819 1st Qu.:1.1509 1st Qu.:0.095387 Median :1.8691 Median :0.277531 Mean :1.8691 Mean :0.347606 3rd Qu.:2.5873 3rd Qu.:0.589346 Max. :3.3056 Max. :0.816327 $tables$send$r Call: density.default(x = xx) Data: xx (7 obs.); Bandwidth 'bw' = 0.1763 x y Min. :0.2431 Min. :0.0001824 1st Qu.:1.0053 1st Qu.:0.0191214 Median :1.7675 Median :0.1627825 Mean :1.7675 Mean :0.3275238 3rd Qu.:2.5297 3rd Qu.:0.6007392 Max. :3.2919 Max. :1.0783976 $tables$send$s Call: density.default(x = xx) Data: xx (7 obs.); Bandwidth 'bw' = 0.236 x y Min. :2.365 Min. :0.003043 1st Qu.:2.959 1st Qu.:0.082165 Median :3.554 Median :0.477296 Mean :3.554 Mean :0.419708 3rd Qu.:4.149 3rd Qu.:0.704372 Max. :4.744 Max. :0.836525 $tables$recv $tables$recv$o Call: density.default(x = xx) Data: xx (7 obs.); Bandwidth 'bw' = 0.2577 x y Min. :0.2018 Min. :0.003011 1st Qu.:0.8548 1st Qu.:0.083017 Median :1.5079 Median :0.414838 Mean :1.5079 Mean :0.382203 3rd Qu.:2.1610 3rd Qu.:0.606359 Max. :2.8141 Max. :0.832611 $tables$recv$r Call: density.default(x = xx) Data: xx (7 obs.); Bandwidth 'bw' = 0.1696 x y Min. :3.384 Min. :0.003772 1st Qu.:3.878 1st Qu.:0.115070 Median :4.371 Median :0.416523 Mean :4.371 Mean :0.506138 3rd Qu.:4.864 3rd Qu.:0.904138 Max. :5.357 Max. :1.167379 $tables$recv$s Call: density.default(x = xx) Data: xx (7 obs.); Bandwidth 'bw' = 0.2271 x y Min. :1.115 Min. :0.003027 1st Qu.:1.720 1st Qu.:0.077045 Median :2.326 Median :0.417373 Mean :2.326 Mean :0.412173 3rd Qu.:2.932 3rd Qu.:0.729518 Max. :3.537 Max. :0.842907 $levels [1] "o" "r" "s" $call NaiveBayes.default(x = X, grouping = Y, usekernel = TRUE) $x send recv 23 3.07260 1.79595 1 3.15877 2.41064 39 3.19793 2.69514 24 3.67846 2.85622 12 3.73464 1.97464 ...省略... 51 1.60438 1.68265 31 1.39843 1.91364 47 2.09476 1.43641 19 1.90495 1.66683 49 2.62500 0.97500 $usekernel [1] TRUE $varnames [1] "send" "recv" attr(,"class") [1] "NaiveBayes" > d_test.bay2.predict $class [1] s o o o r s r r s r o s o o r s o o o r o r s s o o r s o o o o s r o o [37] o o s s o r s o o r o r o s o r Levels: o r s $posterior o r s [1,] 3.186609e-03 2.795202e-04 9.965339e-01 [2,] 8.835537e-01 5.856259e-02 5.788372e-02 [3,] 7.568864e-01 9.822284e-02 1.448908e-01 [4,] 9.776722e-01 2.057698e-02 1.750806e-03 [5,] 1.448097e-03 9.985334e-01 1.847129e-05 ...省略... [48,] 2.144650e-01 5.063836e-01 2.791514e-01 [49,] 9.700680e-01 2.835598e-02 1.575982e-03 [50,] 2.118898e-01 8.162553e-03 7.799476e-01 [51,] 9.976996e-01 7.582940e-06 2.292845e-03 [52,] 1.345788e-01 5.816646e-01 2.837566e-01
図 2-4-2
2.5. サポートベクターマシン
直線で区切ることができない二次元のデータを、三次元の空間に投射し平面で区切ることでグループ分けする方法です。
R コード
# サポートベクターマシン group <- factor(d_train$group) send <- d_train$send recv <- d_train$recv d_train.svm <- svm(group ~ send + recv, probability=TRUE) # 元データに適用 d_test.svm.class <- predict(d_train.svm, d_test) d_bind <- cbind(d[,c("name","send","recv")], d_test.svm.class) colnames(d_bind) <- c("name","send","recv","class") # 描画 (線形判別と共通) plot( x=d_bind$send, y=d_bind$recv, type="n", xlim=c(0,5), ylim=c(0,5) ) text( x=d_bind$send, y=d_bind$recv, label=d_bind$name, col=ifelse(d_bind$class=="s","dodgerblue3", ifelse(d_bind$class=="r","deeppink", "darkviolet" )) ) # 格子状データに適用 cont.svm.predict <- predict(d_train.svm, cont.grid, decision.values=TRUE, probability=TRUE) pred <- attr(cont.svm.predict, "probabilities") # 分割線を描画 (線形判別と共通) cont1 <- pred[,"s"] - pmax(pred[,"r"], pred[,"o"]) contour( cont.send, cont.recv, matrix(cont1, len), add=T, levels=0 ) cont2 <- pred[,"o"] - pmax(pred[,"r"], pred[,"s"]) contour( cont.send, cont.recv, matrix(cont2, len), add=T, levels=0 )
実行結果
> d_train.svm Call: svm(formula = factor(d_train$group) ~ d_train$send + d_train$recv, probability = TRUE) Parameters: SVM-Type: C-classification SVM-Kernel: radial cost: 1 gamma: 0.5 Number of Support Vectors: 13 > d_test.svm.class 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 s o s o r s r r s r o s o r r s o r o r o r s s o 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 o r s o s o o s r o o o o s s o r s r o r o s o s 51 52 o r Levels: o r s
図 2-5-1
2.6. ニューラルネットワーク
神経回路をモデル化したもので、結合の重みをランダムな値から開始し、結果が目標値に近づくよう重みを変えて計算を繰り返す方法です。
R コード
# ニューラルネットワーク library(nnet) d_train.net <- nnet(factor(group) ~ send + recv, data=d_train, size=3, decay=0.1) # 元データに適用 d_test.net.class <- predict(d_train.net, d_test, type="class") d_bind <- cbind(d[,c("name","send","recv")], d_test.net.class) colnames(d_bind) <- c("name","send","recv","class") # 描画 (線形判別と共通) plot( x=d_bind$send, y=d_bind$recv, type="n", xlim=c(0,5), ylim=c(0,5) ) text( x=d_bind$send, y=d_bind$recv, label=d_bind$name, col=ifelse(d_bind$class=="s","dodgerblue3", ifelse(d_bind$class=="r","deeppink", "darkviolet" )) ) # 格子状データに適用 cont.net.predict <- predict(d_train.net, cont.grid) pred <- cont.net.predict # 分割線を描画 (線形判別と共通) cont1 <- pred[,"s"] - pmax(pred[,"r"], pred[,"o"]) contour( cont.send, cont.recv, matrix(cont1, len), add=T, levels=0 ) cont2 <- pred[,"o"] - pmax(pred[,"r"], pred[,"s"]) contour( cont.send, cont.recv, matrix(cont2, len), add=T, levels=0 )
実行結果
> d_test.net.predict.class [1] "s" "o" "o" "o" "r" "s" "r" "r" "s" "r" "o" "s" "o" "r" "r" "s" "o" "r" [19] "o" "r" "o" "r" "s" "s" "o" "o" "r" "s" "r" "s" "o" "o" "s" "r" "o" "o" [37] "s" "o" "s" "s" "r" "r" "s" "r" "o" "r" "o" "r" "o" "s" "o" "r" > predict(d_train.net, d_test) o r s 1 0.19936545 0.07025025 0.73038430 2 0.50784596 0.25255608 0.23959796 3 0.44070918 0.18905113 0.37023968 4 0.64105302 0.03610868 0.32283830 5 0.09178498 0.83780720 0.07040782 ...省略... 48 0.36376197 0.36870579 0.26753224 49 0.57634266 0.01831401 0.40534333 50 0.31926722 0.07239756 0.60833522 51 0.82513372 0.07892821 0.09593807 52 0.33924601 0.43197509 0.22877890
図 2-6-1
2.7. ランダムフォレスト
単独では精度の低い方法を複数組み合わせることによって精度を高める方法です。
R コード
# ランダムフォレスト library(randomForest) d_train.rnd <- rpart(d_train[,1:2], d_train.group) # 元データに適用 d_test.rnd.class <- predict(d_train.rnd, d_test) d_bind <- cbind(d[,c("name","send","recv")], d_test.rnd.class) colnames(d_bind) <- c("name","send","recv","class") # 描画 (線形判別と共通) plot( x=d_bind$send, y=d_bind$recv, type="n", xlim=c(0,5), ylim=c(0,5) ) text( x=d_bind$send, y=d_bind$recv, label=d_bind$name, col=ifelse(d_bind$class=="s","dodgerblue3", ifelse(d_bind$class=="r","deeppink", "darkviolet" )) ) # 格子状データに適用 cont.rnd.predict <- predict(d_train.rnd, cont.grid, type="prob") pred <- cont.rnd.predict # 分割線を描画 (線形判別と共通) cont1 <- pred[,"s"] - pmax(pred[,"r"], pred[,"o"]) contour( cont.send, cont.recv, matrix(cont1, len), add=T, levels=0 ) cont2 <- pred[,"o"] - pmax(pred[,"r"], pred[,"s"]) contour( cont.send, cont.recv, matrix(cont2, len), add=T, levels=0 )
実行結果
> d_train.rnd Call: randomForest(x = d_train[, 1:2], y = factor(d_train$group)) Type of random forest: classification Number of trees: 500 No. of variables tried at each split: 1 OOB estimate of error rate: 0% Confusion matrix: o r s class.error o 7 0 0 0 r 0 7 0 0 s 0 0 7 0 > d_test.rnd.class 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 s o o o r s r r s r o s o r r s o o o r o r s s o 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 o r s o o o o s r o o o o s s o r s o o r o o o o 51 52 o o Levels: o r s