少年サッカーデータ分析

3. 正準判別分析

３群以上へグループ分けすることを、重判別分析または正準判別分析と言います。前の章では２変量を使って３群にグループ分けしましたが、この章では多変量を使って３群にグループ分けしてみます。例として主成分分析の結果をクラスター分析でグループ分けしたデータを使って正準判別分析を行います。

判別得点によって描画された結果が、主成分分析の描画とほぼ同じ結果になることを確認します。

R コード

# データファイルのあるフォルダを指定して
setwd("e:/data")
# データ読み込み (header=T は、先頭行が項目名であることを示す)
d <- read.table("stats.txt", header=T)

# 主成分分析
d_prcomp <- prcomp(d[,5:10], scale=TRUE)
d_scores <- data.frame(d_prcomp$x[,1:2])
# 5個のクラスタに分類
d_kmeans <- kmeans(d_scores, centers=5, algorithm="Hartigan-Wong")

# データフレームにまとめる
d_lda <- data.frame(cbind(d$touch, d$keep, d$send, d$recv, d$get, d$contact, d_kmeans$cluster))
colnames(d_lda) <- c("touch", "keep", "send", "recv", "get", "contact", "class") 
d_lda$name <- d$name

# 線形判別
library(MASS)
d_lda.lda <- lda(class ~ touch + keep + send + recv + get + contact, data=d_lda)
d_lda.pred <- predict(d_lda.lda)

# 度数法を弧度法に変換
rad <- 140 * pi / 180
cos.angle <- cos(rad)
sin.angle <- sin(rad)
# 空ベクトルを用意
d_lda$x = numeric(length(d_lda.pred$x[,1]))
d_lda$y = numeric(length(d_lda.pred$x[,2]))
# 判別得点を回転
for (i in 1:length(d_lda.pred$x[,1])) {
    wk <- (
        matrix(c(
            cos.angle,
            sin.angle, 
            -sin.angle, 
            cos.angle
        ), 2,2) 
        %*% 
        c(
            d_lda.pred$x[i,1], 
            d_lda.pred$x[i,2]
        )
    )
    d_lda$x[i] <- -wk[1,1]
    d_lda$y[i] <- wk[2,1]
}

# フォントを準備
windowsFonts(HGKAI=windowsFont("HG正楷書体-PRO"))
windowsFonts(COURIER=windowsFont("Courier New"))

# 色を準備
RED_5    <- "#ff6b6b" # ■
PINK_5   <- "#f06595" # ■
GRAPE_5  <- "#cc5de8" # ■
VIOLET_5 <- "#845ef7" # ■
INDIGO_5 <- "#5c7cfa" # ■
BLUE_5   <- "#339af0" # ■
CYAN_5   <- "#22b8cf" # ■
TEAL_5   <- "#20c997" # ■
GREEN_5  <- "#51cf66" # ■
LIME_5   <- "#94d82d" # ■
YELLOW_5 <- "#fcc419" # ■
ORANGE_5 <- "#ff922b" # ■

# ggplot2 パッケージを使用
library("ggplot2")
# ggrepel パッケージを使用
library("ggrepel")

# 散布図を描画
g <- ggplot(
    d_scores, 
    aes(
        x=d_lda$x, 
        y=d_lda$y, 
        label=d_lda$name
    )
)
g <- g + geom_point(
    size=3, 
    alpha=0.6, 
    aes(
        colour=factor(d_lda$class)
    )
)
g <- g + stat_ellipse(
    aes(
        colour=factor(d_lda$class)
    ), 
    type="t", 
    linetype=2
)
g <- g + xlim(-5, 7.5)
g <- g + ylim(-5, 7.5)
g <- g + geom_text_repel(
    family="HGKAI"
)
# タイトルを変更
g <- g + labs(title="正準判別分析")
g <- g + xlab("")
g <- g + ylab("")
#グループごとの色指定
g <- g + scale_colour_manual(values=c(VIOLET_5, INDIGO_5, BLUE_5, CYAN_5, TEAL_5))
# フォントを指定
g <- g + theme_bw(
    base_size=12, 
    base_family="HGKAI"
)
g <- g + theme(
    legend.position="none",
    plot.title=element_text(
        hjust=0.5
    ),
    axis.text.x=element_text(
        family="COURIER", 
        size=10
    ),
    axis.text.y=element_text(
        family="COURIER", 
        size=10
    )
)
print(g)

図 3-1-1

主成分分析との比較

R コード

# 度数法を弧度法に変換
rad <- -120 * pi / 180
cos.angle <- cos(rad)
sin.angle <- sin(rad)
# 空ベクトルを用意
d_scores$PC1_rot = numeric(length(d_scores$PC1))
d_scores$PC2_rot = numeric(length(d_scores$PC2))
# 主成分得点を回転
for (i in 1:length(d_scores$PC1)) {
    wk <- (
        matrix(c(
            cos.angle,
            sin.angle, 
            -sin.angle, 
            cos.angle
        ), 2,2) 
        %*% 
        c(
            d_scores$PC1[i], 
            d_scores$PC2[i]
        )
    )
    d_scores$PC1_rot[i] <- wk[1,1]
    d_scores$PC2_rot[i] <- wk[2,1]
}

# データフレームにまとめる
d_pca <- data.frame(cbind(d_scores$PC1_rot,d_scores$PC2_rot,d_kmeans$cluster))
colnames(d_pca) <- c("x","y","class") 
d_pca$name <- d$name

# 散布図を描画
g <- ggplot(
    d_pca, 
    aes(
        x=d_pca$x, 
        y=d_pca$y, 
        label=d_pca$name
    )
)
g <- g + geom_point(
    size=3, 
    alpha=0.6, 
    aes(
        colour=factor(d_pca$class)
    )
)
g <- g + stat_ellipse(
    aes(
        colour=factor(d_pca$class)
    ), 
    type="t", 
    linetype=2
)
g <- g + xlim(-3, 4)
g <- g + ylim(-3.5, 5.5)
g <- g + geom_text_repel(
    family="HGKAI"
)
# タイトルを変更
g <- g + labs(title="主成分分析")
g <- g + xlab("")
g <- g + ylab("")
#グループごとの色指定
g <- g + scale_colour_manual(values=c(VIOLET_5, INDIGO_5, BLUE_5, CYAN_5, TEAL_5))
# フォントを指定
g <- g + theme_bw(
    base_size=12, 
    base_family="HGKAI"
)
g <- g + theme(
    legend.position="none",
    plot.title=element_text(
        hjust=0.5
    ),
    axis.text.x=element_text(
        family="COURIER", 
        size=10
    ),
    axis.text.y=element_text(
        family="COURIER", 
        size=10
    )
)
print(g)

図 3-2-1

前へ | 目次 | 次へ