B1: 机器学习实战 (Machine Learning in Action), Peter Harrington
library(tidyverse)
library(dplyr)
library(mlr3verse)
library(mclust)
library(GGally)
library(factoextra)
library(Rtsne)
library(umap)
library(kohonen)
library(lle)
library(pacman)
library(purrr)
library(ggplot2)
B1P287
Clustering algorithms cover a range of techniques for identifying clusters of samples in a dataset, where a cluster is a group of samples that are more similar to one another than they are to the samples in other clusters.
Clustering vs. classification:
Classification uses labelled samples to learn patterns in the data that discriminate between classes. When we have no prior knowledge of class membership, or do not even know whether distinct classes exist in the data, clustering is the more appropriate tool. Clustering therefore describes a family of algorithms that try to identify grouping structure within a dataset.
B1P288
The k-means algorithm requires us to define up front how many clusters the dataset contains; that is what the k refers to. The algorithm will then try to learn that many groups, whether or not they correspond to real grouping structure in the data. This is a weakness of k-means, because we usually have no prior information about how many clusters to search for.
Once the number of clusters k has been chosen, k-means initializes k centers, or centroids, in the dataset (usually at random). Each centroid represents one cluster, and every sample is assigned to the cluster whose centroid is nearest. Over repeated iterations the centroids move around the feature space so as to minimize the within-cluster variance and maximize the between-cluster variance; at each iteration, samples are reassigned to the cluster with the nearest centroid.
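As a minimal illustration of this procedure (my own toy example, not from the book), base R's stats::kmeans() can be run on a small simulated dataset; the data and the choice of k = 2 below are made up purely for demonstration.
set.seed(42)
X <- rbind(matrix(rnorm(100, mean = 0), ncol = 2),   # two well-separated Gaussian blobs
           matrix(rnorm(100, mean = 4), ncol = 2))
km <- kmeans(X, centers = 2)      # k = 2 clusters
# km$cluster      : cluster assignment of each sample
# km$centers      : final centroid positions
# km$tot.withinss : total within-cluster sum of squares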
Some commonly used k-means variants are listed below (a quick comparison of the three follows their descriptions):
Lloyd's algorithm (also known as the Lloyd-Forgy algorithm)
MacQueen's algorithm
The Hartigan-Wong algorithm
B1P290
How Lloyd's algorithm proceeds:
1. Choose a value for k.
2. Randomly initialize k centroids in the feature space.
3. For each sample: compute the distance between the sample and every centroid, and assign the sample to the cluster represented by the nearest centroid.
4. Place each centroid at the mean of the samples in its cluster.
5. Repeat steps 3 and 4 until no sample changes cluster or the maximum number of iterations is reached.
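The steps above translate almost directly into code. Below is a rough from-scratch sketch of Lloyd's algorithm (illustrative only; the function name and the initialization details are my own choices), which could be applied to the toy matrix X from the previous snippet.
lloyd_kmeans <- function(X, k, max_iter = 100) {
  # Step 2: use k randomly chosen samples as the initial centroids
  centroids <- X[sample(nrow(X), k), , drop = FALSE]
  assignment <- rep(0L, nrow(X))
  for (iter in seq_len(max_iter)) {
    # Step 3: squared Euclidean distance from every sample to every centroid
    d <- sapply(seq_len(k), function(j) colSums((t(X) - centroids[j, ])^2))
    new_assignment <- apply(d, 1, which.min)
    # Step 5: stop when no sample changes cluster
    if (all(new_assignment == assignment)) break
    assignment <- new_assignment
    # Step 4: move each centroid to the mean of the samples in its cluster
    for (j in seq_len(k)) {
      members <- X[assignment == j, , drop = FALSE]
      if (nrow(members) > 0) centroids[j, ] <- colMeans(members)
    }
  }
  list(assignment = assignment, centroids = centroids)
}
# Example: res <- lloyd_kmeans(X, k = 2)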
B1P291
How MacQueen's algorithm proceeds:
1. Choose a value for k.
2. Randomly initialize k centroids in the feature space.
3. Assign each sample to the cluster represented by the nearest centroid.
4. Place each centroid at the mean of the samples in its cluster.
5. For each sample: compute the distance between the sample and every centroid and assign the sample to the cluster represented by the nearest centroid; if the sample changes cluster, update the positions of the old and new centroids immediately.
6. Once all samples have been considered, update the positions of all centroids.
7. If no sample changed cluster, MacQueen's algorithm stops; otherwise repeat from step 5.
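What distinguishes MacQueen's variant is the immediate centroid update in step 5. The following is a rough sketch of a single MacQueen pass (my own illustrative code; it assumes X plus initial centroids and assignment such as those returned by the Lloyd sketch above):
macqueen_pass <- function(X, centroids, assignment) {
  changed <- FALSE
  for (i in seq_len(nrow(X))) {
    # Step 5: distance from sample i to every centroid
    d <- colSums((t(centroids) - X[i, ])^2)
    new_cl <- which.min(d)
    old_cl <- assignment[i]
    if (new_cl != old_cl) {
      assignment[i] <- new_cl
      # Update the two affected centroids straight away
      for (j in c(old_cl, new_cl)) {
        members <- X[assignment == j, , drop = FALSE]
        if (nrow(members) > 0) centroids[j, ] <- colMeans(members)
      }
      changed <- TRUE
    }
  }
  # Step 6: recompute all centroids after the full pass
  for (j in seq_len(nrow(centroids))) {
    members <- X[assignment == j, , drop = FALSE]
    if (nrow(members) > 0) centroids[j, ] <- colMeans(members)
  }
  # Step 7 would repeat this pass until changed is FALSE
  list(centroids = centroids, assignment = assignment, changed = changed)
}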
B1P291
How the Hartigan-Wong algorithm proceeds:
1. Choose a value for k.
2. Randomly initialize k centroids in the feature space.
3. Assign each sample to the cluster represented by the nearest centroid.
4. Place each centroid at the mean of the samples in its cluster.
5. For each sample: compute the sum of squared error of its own cluster with the sample under consideration left out, compute the sum of squared error of each of the other clusters with that sample included, and assign the sample to the cluster with the smallest sum of squared error; if the sample changes cluster, update the centroids of the old and new clusters.
6. If no sample changed cluster, the Hartigan-Wong algorithm stops; otherwise repeat step 5.
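All three variants can be selected through the algorithm argument of base R's stats::kmeans(), which the clust.kmeans learner used later in this script appears to wrap (its parameters centers, iter.max, algorithm, nstart and trace match). A quick comparison on simulated toy data (data and k chosen only for illustration):
set.seed(123)
X <- rbind(matrix(rnorm(200, mean = 0), ncol = 2),
           matrix(rnorm(200, mean = 4), ncol = 2))
res <- sapply(c("Lloyd", "MacQueen", "Hartigan-Wong"), function(a) {
  km <- kmeans(X, centers = 2, algorithm = a, nstart = 5)
  c(tot_withinss = km$tot.withinss, iterations = km$iter)
})
# res holds the total within-cluster sum of squares and iteration count per variant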
The R6 package provides R with an implementation of encapsulated object-oriented programming (sometimes called classical OOP).
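As a toy illustration of encapsulated OOP with R6 (my own example; it assumes the R6 package is installed, which it normally is as a dependency of mlr3):
library(R6)
Counter <- R6Class("Counter",
  public = list(
    count = 0,
    add = function(x = 1) {
      self$count <- self$count + x   # state lives inside the object
      invisible(self)                # returning self invisibly enables chaining
    }
  )
)
cnt <- Counter$new()                 # $new() constructs an instance
cnt$add()$add(2)                     # method chaining; cnt$count is now 3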
# mlr_learners is an R6 dictionary object
# (note: str("mlr_learners") below inspects only the quoted character string, not the object itself)
str("mlr_learners")
## chr "mlr_learners"
class(mlr_learners)
## [1] "DictionaryLearner" "Dictionary" "R6"
names(mlr_learners)
## [1] ".__enclos_env__" "keys" "items" "remove"
## [5] "required_args" "mget" "clone" "initialize"
## [9] "has" "print" "format" "add"
## [13] "get"
# Keys of the commonly available machine learning learners
mlr_learners$keys()
## [1] "classif.cv_glmnet" "classif.debug" "classif.featureless"
## [4] "classif.glmnet" "classif.kknn" "classif.lda"
## [7] "classif.log_reg" "classif.multinom" "classif.naive_bayes"
## [10] "classif.nnet" "classif.qda" "classif.ranger"
## [13] "classif.rpart" "classif.svm" "classif.xgboost"
## [16] "clust.agnes" "clust.ap" "clust.cmeans"
## [19] "clust.cobweb" "clust.dbscan" "clust.diana"
## [22] "clust.em" "clust.fanny" "clust.featureless"
## [25] "clust.ff" "clust.hclust" "clust.kkmeans"
## [28] "clust.kmeans" "clust.MBatchKMeans" "clust.meanshift"
## [31] "clust.pam" "clust.SimpleKMeans" "clust.xmeans"
## [34] "dens.hist" "dens.kde" "regr.cv_glmnet"
## [37] "regr.debug" "regr.featureless" "regr.glmnet"
## [40] "regr.kknn" "regr.km" "regr.lm"
## [43] "regr.ranger" "regr.rpart" "regr.svm"
## [46] "regr.xgboost" "surv.coxph" "surv.cv_glmnet"
## [49] "surv.glmnet" "surv.kaplan" "surv.ranger"
## [52] "surv.rpart" "surv.xgboost"
str(mlr_learners$keys())
## chr [1:53] "classif.cv_glmnet" "classif.debug" "classif.featureless" ...
# List the clustering learners among them
mlr_learners$keys("clust")
## [1] "clust.agnes" "clust.ap" "clust.cmeans"
## [4] "clust.cobweb" "clust.dbscan" "clust.diana"
## [7] "clust.em" "clust.fanny" "clust.featureless"
## [10] "clust.ff" "clust.hclust" "clust.kkmeans"
## [13] "clust.kmeans" "clust.MBatchKMeans" "clust.meanshift"
## [16] "clust.pam" "clust.SimpleKMeans" "clust.xmeans"
# mlr_tasks is also an R6 dictionary object
# mlr_tasks
# keys() lists which tasks can be retrieved
mlr_tasks$keys() %>%
str()
## chr [1:28] "actg" "bike_sharing" "boston_housing" "breast_cancer" ...
# bike_sharing: the bike-sharing dataset
mlr_tasks$keys("bike_sharing")
## [1] "bike_sharing"
# Inspect what the task object contains
tskBike <- mlr_tasks$get("bike_sharing")
names(tskBike)
## [1] ".__enclos_env__" "col_hashes" "labels" "weights"
## [5] "order" "groups" "strata" "data_formats"
## [9] "feature_types" "ncol" "nrow" "col_roles"
## [13] "row_roles" "properties" "target_names" "feature_names"
## [17] "row_names" "row_ids" "hash" "extra_args"
## [21] "man" "col_info" "backend" "task_type"
## [25] "label" "id" "clone" "truth"
## [29] "initialize" "add_strata" "droplevels" "set_levels"
## [33] "set_col_roles" "set_row_roles" "rename" "cbind"
## [37] "rbind" "select" "filter" "missings"
## [41] "levels" "head" "formula" "data"
## [45] "print" "format" "help"
# Extract the data (tskBike$data() with no row argument would return all rows)
Bike <- tskBike$data(rows = 1:17379)
str(Bike)
## Classes 'data.table' and 'data.frame': 17379 obs. of 14 variables:
## $ count : int 16 40 32 13 1 1 2 3 8 14 ...
## $ apparent_temperature: num 0.288 0.273 0.273 0.288 0.288 ...
## $ date : chr "2011-01-01" "2011-01-01" "2011-01-01" "2011-01-01" ...
## $ holiday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ hour : int 0 1 2 3 4 5 6 7 8 9 ...
## $ humidity : num 0.81 0.8 0.8 0.75 0.75 0.75 0.8 0.86 0.75 0.76 ...
## $ month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ season : Factor w/ 4 levels "winter","spring",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ temperature : num 0.24 0.22 0.22 0.24 0.24 0.24 0.22 0.2 0.24 0.32 ...
## $ weather : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 2 1 1 1 1 ...
## $ weekday : int 6 6 6 6 6 6 6 6 6 6 ...
## $ windspeed : num 0 0 0 0 0 0.0896 0 0 0 0 ...
## $ working_day : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ year : int 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# view(Bike)
class(tskBike)
## [1] "TaskRegr" "TaskSupervised" "Task" "R6"
# Too many observations and too many features for a pairs plot
# autoplot(tskBike, type = "pairs")
# Plot smaller built-in tasks instead
mlr_tasks$keys()
## [1] "actg" "bike_sharing" "boston_housing" "breast_cancer"
## [5] "faithful" "gbcs" "german_credit" "grace"
## [9] "ilpd" "iris" "kc_housing" "lung"
## [13] "moneyball" "mtcars" "optdigits" "penguins"
## [17] "penguins_simple" "pima" "precip" "rats"
## [21] "sonar" "spam" "titanic" "unemployment"
## [25] "usarrests" "whas" "wine" "zoo"
tskIris <- mlr_tasks$get("iris")
autoplot(tskIris, type = "pairs") +
labs(title = "test")
tskP <- mlr_tasks$get(key = "penguins")
class(tskP)
## [1] "TaskClassif" "TaskSupervised" "Task" "R6"
autoplot(tskP)
# Build a task
data(GvHD, package="mclust")
str(GvHD.control)
## 'data.frame': 6809 obs. of 4 variables:
## $ CD4 : num 199 294 85 19 35 376 97 200 422 391 ...
## $ CD8b: num 420 311 79 1 29 346 329 342 433 390 ...
## $ CD3 : num 132 241 14 141 6 138 527 145 163 147 ...
## $ CD8 : num 226 164 218 130 135 176 406 189 47 190 ...
names(mlr_tasks)
## [1] ".__enclos_env__" "keys" "items" "remove"
## [5] "required_args" "mget" "clone" "initialize"
## [9] "has" "print" "format" "add"
## [13] "get"
class(TaskClust)
## [1] "R6ClassGenerator"
gvhdCDT <- as.data.table(scale(GvHD.control))
task <- TaskClust$new("gvhdC",backend = gvhdCDT)
class(task)
## [1] "TaskClust" "TaskUnsupervised" "Task" "R6"
names(task)
## [1] ".__enclos_env__" "col_hashes" "labels" "weights"
## [5] "order" "groups" "strata" "data_formats"
## [9] "feature_types" "ncol" "nrow" "col_roles"
## [13] "row_roles" "properties" "target_names" "feature_names"
## [17] "row_names" "row_ids" "hash" "extra_args"
## [21] "man" "col_info" "backend" "task_type"
## [25] "label" "id" "clone" "initialize"
## [29] "add_strata" "droplevels" "set_levels" "set_col_roles"
## [33] "set_row_roles" "rename" "cbind" "rbind"
## [37] "select" "filter" "missings" "levels"
## [41] "head" "formula" "data" "print"
## [45] "format" "help"
task$id
## [1] "gvhdC"
autoplot(task)
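# mlr3cluster also provides a shortcut constructor; if I recall its interface
# correctly, the task above could equivalently be built with:
task2 <- as_task_clust(gvhdCDT, id = "gvhdC")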
# Get a learner
learner <- mlr_learners$get("clust.kmeans") # any other clustering learner could be substituted here
class(learner)
## [1] "LearnerClustKMeans" "LearnerClust" "Learner"
## [4] "R6"
names(learner)
## [1] ".__enclos_env__" "hotstart_stack" "fallback" "encapsulate"
## [5] "param_set" "predict_type" "phash" "hash"
## [9] "errors" "warnings" "log" "timings"
## [13] "model" "save_assignments" "assignments" "man"
## [17] "timeout" "parallel_predict" "predict_sets" "packages"
## [21] "data_formats" "properties" "feature_types" "predict_types"
## [25] "task_type" "state" "label" "id"
## [29] "clone" "initialize" "reset" "base_learner"
## [33] "predict_newdata" "predict" "train" "help"
## [37] "print" "format"
# Parameter set
class(learner$param_set)
## [1] "ParamSet" "R6"
# learner$param_set
# learner$param_set$get_values()
# learner$param_set$values |> class()
learner$param_set |> names()
## [1] ".__enclos_env__" "has_deps" "values" "has_trafo"
## [5] "trafo" "all_categorical" "all_numeric" "is_categ"
## [9] "is_number" "storage_type" "tags" "default"
## [13] "special_vals" "is_bounded" "nlevels" "levels"
## [17] "upper" "lower" "class" "is_empty"
## [21] "length" "set_id" "deps" "params_unid"
## [25] "params" "assert_values" "clone" "print"
## [29] "format" "add_dep" "assert_dt" "test_dt"
## [33] "check_dt" "assert" "test" "check"
## [37] "search_space" "subset" "get_values" "ids"
## [41] "add" "initialize"
learner$param_set$values$centers <- 4
# learner$param_set$values$algorithm <- "Lloyd"
learner$param_set$values$algorithm <- "Hartigan-Wong"
learner$param_set$values$iter.max <- 200
learner$param_set
## <ParamSet>
## id class lower upper nlevels default value
## 1: centers ParamUty NA NA Inf 2 4
## 2: iter.max ParamInt 1 Inf Inf 10 200
## 3: algorithm ParamFct NA NA 4 Hartigan-Wong Hartigan-Wong
## 4: nstart ParamInt 1 Inf Inf 1
## 5: trace ParamInt 0 Inf Inf 0
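# k-means is sensitive to the random initialization of the centroids; if this learner
# passes nstart (listed in the table above) through to stats::kmeans, raising it runs
# several random starts and keeps the best solution, e.g.:
# learner$param_set$values$nstart <- 10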
# Train
learner$train(task)
predicted <- learner$predict(task)
# predicted$partition
autoplot(predicted, task, type = "pca")
autoplot(predicted, task, type = "scatter")
## Warning in GGally::ggscatmat(data, color = "partition", ...): Factor variables
## are omitted in plot
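# The partition can also be scored with internal validation measures; a sketch,
# assuming mlr3cluster registers measures under ids such as "clust.silhouette" and "clust.wss":
# predicted$score(msrs(c("clust.silhouette", "clust.wss")), task = task)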
# names(learner)
# The same workflow works with other clustering algorithms, e.g. hierarchical clustering
ln1 <- lrn("clust.hclust")
ln1$param_set$values$k <- 4
ln1$train(task)
pln <- ln1$predict(task)
## Warning: Learner 'clust.hclust' doesn't predict on new data and predictions may
## not make sense on new data
autoplot(pln, task, type = "pca")
autoplot(pln, task, type = "scatter")
## Warning in GGally::ggscatmat(data, color = "partition", ...): Factor variables
## are omitted in plot
autoplot(ln1, type = "dend")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
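# Another example of swapping in a different learner: PAM (k-medoids), assuming the
# "clust.pam" learner listed earlier accepts a k parameter. Note that PAM builds a full
# dissimilarity matrix, so it may be slow on the ~6800 rows of this task.
ln2 <- lrn("clust.pam", k = 4)
ln2$train(task)
ppam <- ln2$predict(task)
autoplot(ppam, task, type = "pca")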