mlr3随机森林

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# Run this script in a fresh R session instead of clearing the workspace with
# `rm(list = ls())`: that call deletes the caller's objects but does not detach
# packages or reset options, so it never produced a clean state anyway.

library(mlr3verse)
library(tidyverse)
library(paradox)

# load() restores the objects saved in this image into the current environment.
# NOTE(review): nothing later in this script appears to use an object that the
# script does not create itself — confirm whether this load is still needed.
load("/home/lixiang/temp_file/all.data.RData")

# Import and preprocess the data ----

# Sample metadata. A synthetic id ("s1", "s2", ...) is added per row; it is
# derived from the actual row count rather than a fixed 1:119, so the script
# no longer fails when the metadata file has a different number of rows.
df_sample <- data.table::fread("matedata.txt", header = TRUE) %>%
  dplyr::mutate(sample = paste0("s", dplyr::row_number()))

# OTU table: move the "#OTU ID" column into the row names and transpose, so
# that the former columns become rows. Those row names are then matched
# against `Name` in the metadata to attach `Type` and the synthetic `sample`
# id, which replaces `Name` as the row name.
df_otu <- data.table::fread("otu_taxon_F.txt", header = TRUE) %>%
  tibble::column_to_rownames(var = "#OTU ID") %>%
  t() %>%
  as.data.frame() %>%
  dplyr::mutate(Name = rownames(.)) %>%
  dplyr::left_join(df_sample[, c("Name", "Type", "sample")], by = "Name") %>%
  dplyr::select(-Name) %>%
  tibble::column_to_rownames(var = "sample") %>%
  # Drop the "Ck" samples before modelling.
  dplyr::filter(Type != "Ck") %>%
  # Rebuild the factor from the remaining values so "Ck" is not kept as a level.
  dplyr::mutate(Type = factor(Type, levels = unique(Type)))

# Lookup table between the original column names (`Feature`) and the short
# aliases used for modelling (`temp`): "M<i>" for the column at position i,
# while the target column keeps the name "Type".
# NOTE(review): presumably the aliases exist because the raw OTU ids are not
# convenient feature names for mlr3 — confirm.
df_colnames <- data.frame(Feature = colnames(df_otu)) %>%
  dplyr::mutate(temp = dplyr::if_else(
    Feature == "Type",
    "Type",
    paste0("M", dplyr::row_number())
  ))

# Rename from the lookup table itself so the table and the data can never
# disagree (previously the rename assumed that `Type` is the last column).
colnames(df_otu) <- df_colnames$temp

# Classification task ----
# Wrap the OTU table as a classification task with `Type` as the target.
task <- mlr3::as_task_classif(x = df_otu, target = "Type")
task

# Visual overview of the task.
ggplot2::autoplot(task)

# Learner ----
# Random forest via ranger; permutation importance is requested here so that
# variable importances can be extracted from the fitted model later.
ranger <- mlr3::lrn("classif.ranger", importance = "permutation")
ranger

# Train/test split ----
# Fixed seed so the 70/30 partition is reproducible.
set.seed(707)
split <- mlr3::partition(task, ratio = 0.7)

# Hyperparameter tuning ----
# Print the learner's parameter set to see what can be tuned.
ranger$param_set

# Search space: `num.trees` is drawn as an integer in [1, 50] and multiplied
# by 20 through the trafo (i.e. 20, 40, ..., 1000 trees); `min.node.size` is
# drawn directly from [3, 30].
search_space <- ps(
  num.trees = p_int(lower = 1, upper = 50, trafo = function(x) 20 * x),
  min.node.size = p_int(lower = 3, upper = 30)
)

# Random search with 10 evaluations, each scored by accuracy under 10-fold CV.
# The tuner is given as a Tuner object in the first position: that argument
# was called `method` in older mlr3tuning releases and is called `tuner` in
# current ones, where the string form `method = "random_search"` no longer
# works.
at <- auto_tuner(
  tnr("random_search"),
  learner = ranger,
  resampling = rsmp("cv", folds = 10),
  measure = msr("classif.acc"),
  search_space = search_space,
  term_evals = 10
)

# Run the tuning on the training rows only ----
set.seed(1102)
at$train(task, row_ids = split$train)

# Best configuration found by the tuner.
at$tuning_result

# Refit the plain learner on the training rows with the winning values.
ranger$param_set$values <- at$tuning_result[["learner_param_vals"]][[1]]
ranger$train(task, row_ids = split$train)

# Evaluate on the held-out test rows ----
predictions <- ranger$predict(task, row_ids = split$test)
predictions
predictions$confusion

# Predictions as a table.
data.table::as.data.table(predictions)

# Test-set accuracy.
predictions$score(msr("classif.acc"))

# Probability predictions ----
# Switch the learner to "prob", retrain it on the same training rows and
# predict the test rows again.
ranger$predict_type <- "prob"
ranger$train(task, row_ids = split$train)
predictions <- ranger$predict(task, row_ids = split$test)
predictions

predictions$score(msr("classif.acc"))

# NOTE(review): ROC and precision-recall curves presumably require a
# two-class target — confirm against the levels of `Type`.
autoplot(predictions, type = "roc")
autoplot(predictions, type = "prc")

# Variable importance ----
# Turn the named importance vector into a two-column table, map the short
# aliases back to the original feature names via `df_colnames`, keep the ten
# largest values and draw them as a horizontal bar chart.
ranger$importance() %>%
  tibble::enframe(name = "temp", value = "Importance") %>%
  dplyr::left_join(df_colnames, by = "temp") %>%
  dplyr::select(-temp) %>%
  dplyr::arrange(dplyr::desc(Importance)) %>%
  dplyr::slice_head(n = 10) %>%
  ggplot(aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_col() +
  scale_y_continuous(expand = c(0, 0)) +
  coord_flip() +
  labs(x = "") +
  theme_bw()

💌lixiang117423@foxmail.com
💌lixiang117423@gmail.com


mlr3随机森林
https://lixiang117423.github.io/article/rfinmlr3/
作者
小蓝哥
发布于
2022年3月23日
许可协议