全基因组DNA甲基化分析流程

测序原理

甲基化的CBisulfite处理后不会发生变化;未发生甲基化的C在处理后会变为U,PCR扩增后会变为T. 因此甲基化比对时需要特定的基因组,所以需要对基因组进行转换:C->TG->A.

img

上游分析

DNA甲基化比对使用最广泛的比对软件是Bowtie2,因此选择Bowtie2作为比对器(不同的比对软件需要不同的index文件)。

基因组准备

需要基因组序列文件,把gff3文件也准备了,后续用得上:

1
2
-rw-rw-r-- 1 lixiang lixiang 3.0G  8月 27 15:14 genome.fa
-rw-rw-r-- 1 lixiang lixiang 308M 8月 27 15:02 genome.gff3

使用如下的命令生成Bismark比对需要的基因组:

1
bismark_genome_preparation --path_to_aligner ~/mambaforge/envs/bismark/bin/ --bowtie2 --parallel 60 --verbose 02.genome

运行完成会生成如下的文件:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
02.genome
├── Bisulfite_Genome
│ ├── CT_conversion
│ │ ├── BS_CT.1.bt2
│ │ ├── BS_CT.2.bt2
│ │ ├── BS_CT.3.bt2
│ │ ├── BS_CT.4.bt2
│ │ ├── BS_CT.rev.1.bt2
│ │ ├── BS_CT.rev.2.bt2
│ │ └── genome_mfa.CT_conversion.fa
│ └── GA_conversion
│ ├── BS_GA.1.bt2
│ ├── BS_GA.2.bt2
│ ├── BS_GA.3.bt2
│ ├── BS_GA.4.bt2
│ ├── BS_GA.rev.1.bt2
│ ├── BS_GA.rev.2.bt2
│ └── genome_mfa.GA_conversion.fa
├── genome.fa
├── genome.fa.fai
└── genome.gff3

3 directories, 17 files

合并测序数据

如果分成很多批次测序的话,需要先将原始文件合并,使用R语言生成合并代码。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
rm(list = ls())

readr::read_delim("data/其他项目/xx/data/all.rawdata.txt", col_names = FALSE, delim = "\t") %>%
magrittr::set_names("file") %>%
dplyr::mutate(path = stringr::str_split(file, ":") %>% sapply("[", 2) %>% stringr::str_split(" ") %>% sapply("[", 2)) %>%
dplyr::select(path) %>%
dplyr::mutate(
sample = stringr::str_split(path, "/SOL") %>% sapply("[", 1) %>% stringr::str_replace("01.rawdata/", "") %>% stringr::str_replace("C88", ""),
type = case_when(stringr::str_detect(path, "_1.clean.fq.gz") ~ "_1", TRUE ~ "_2")
) %>%
dplyr::mutate(group = paste0(sample, type)) -> df.file

all.comm <- NULL

for (i in unique(df.file$group)) {
df.file %>%
dplyr::filter(group == i) -> df.file.sub

for (j in unique(df.file.sub$path)) {
sprintf("cat %s >> 03.cleandata/%s.fq.gz", j, i) %>%
as.data.frame() %>%
magrittr::set_names("file") %>%
dplyr::bind_rows(all.comm) -> all.comm
}
}

all.comm %>%
write.table("data/其他项目/xx/data/merge.data.sh", row.names = FALSE, col.names = FALSE, quote = FALSE)

批量比对

使用R生成批量比对代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
rm(list = ls())

readr::read_delim("data/其他项目/张兴/data/all.rawdata.txt", col_names = FALSE, delim = "\t") %>%
magrittr::set_names("file") %>%
dplyr::mutate(path = stringr::str_split(file, ":") %>% sapply("[", 2) %>% stringr::str_split(" ") %>% sapply("[", 2)) %>%
dplyr::select(path) %>%
dplyr::mutate(
sample = stringr::str_split(path, "/SOL") %>% sapply("[", 1) %>% stringr::str_replace("01.rawdata/", "") %>% stringr::str_replace("C88", ""),
type = case_when(stringr::str_detect(path, "_1.clean.fq.gz") ~ "_1", TRUE ~ "_2")
) %>%
dplyr::mutate(group = paste0(sample, type)) %>%
dplyr::select(sample) %>%
dplyr::distinct_all() -> df.file

all.comm <- NULL

for (i in unique(df.file$sample)) {
sprintf("bismark -p 60 --bowtie2 --genome 02.genome -1 03.cleandata/%s_1.fq.gz -2 03.cleandata/%s_2.fq.gz -o 04.mapping", i, i) %>%
as.data.frame() %>%
magrittr::set_names("file") %>%
dplyr::bind_rows(all.comm) -> all.comm
}

all.comm %>%
write.table("data/其他项目/张兴/data/mapping.sh", row.names = FALSE, col.names = FALSE, quote = FALSE)

具体的代码如下:

1
2
3
4
5
6
7
8
9
bismark -p 60 --bowtie2 --genome 02.genome -1 03.cleandata/YS3_1.fq.gz -2 03.cleandata/YS3_2.fq.gz -o 04.mapping
bismark -p 60 --bowtie2 --genome 02.genome -1 03.cleandata/YS2_1.fq.gz -2 03.cleandata/YS2_2.fq.gz -o 04.mapping
bismark -p 60 --bowtie2 --genome 02.genome -1 03.cleandata/YS1_1.fq.gz -2 03.cleandata/YS1_2.fq.gz -o 04.mapping
bismark -p 60 --bowtie2 --genome 02.genome -1 03.cleandata/WT3_1.fq.gz -2 03.cleandata/WT3_2.fq.gz -o 04.mapping
bismark -p 60 --bowtie2 --genome 02.genome -1 03.cleandata/WT2_1.fq.gz -2 03.cleandata/WT2_2.fq.gz -o 04.mapping
bismark -p 60 --bowtie2 --genome 02.genome -1 03.cleandata/WT1_1.fq.gz -2 03.cleandata/WT1_2.fq.gz -o 04.mapping
bismark -p 60 --bowtie2 --genome 02.genome -1 03.cleandata/RS3_1.fq.gz -2 03.cleandata/RS3_2.fq.gz -o 04.mapping
bismark -p 60 --bowtie2 --genome 02.genome -1 03.cleandata/RS2_1.fq.gz -2 03.cleandata/RS2_2.fq.gz -o 04.mapping
bismark -p 60 --bowtie2 --genome 02.genome -1 03.cleandata/RS1_1.fq.gz -2 03.cleandata/RS1_2.fq.gz -o 04.mapping

结果去重

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
rm(list = ls())

readr::read_delim("data/其他项目/张兴/data/all.rawdata.txt", col_names = FALSE, delim = "\t") %>%
magrittr::set_names("file") %>%
dplyr::mutate(path = stringr::str_split(file, ":") %>% sapply("[", 2) %>% stringr::str_split(" ") %>% sapply("[", 2)) %>%
dplyr::select(path) %>%
dplyr::mutate(
sample = stringr::str_split(path, "/SOL") %>% sapply("[", 1) %>% stringr::str_replace("01.rawdata/", "") %>% stringr::str_replace("C88", ""),
type = case_when(stringr::str_detect(path, "_1.clean.fq.gz") ~ "_1", TRUE ~ "_2")
) %>%
dplyr::mutate(group = paste0(sample, type)) %>%
dplyr::select(sample) %>%
dplyr::distinct_all() -> df.file

all.comm <- NULL

for (i in unique(df.file$sample)) {
sprintf("deduplicate_bismark --bam 04.mapping/%s_1_bismark_bt2_pe.bam --output_dir 05.deduplicate", i) %>%
as.data.frame() %>%
magrittr::set_names("file") %>%
dplyr::bind_rows(all.comm) -> all.comm
}

all.comm %>%
write.table("data/其他项目/张兴/data/deduplicate.sh", row.names = FALSE, col.names = FALSE, quote = FALSE)

生成的代码如下:

1
2
3
4
5
6
7
8
9
deduplicate_bismark --bam 04.mapping/YS3_1_bismark_bt2_pe.bam --output_dir 05.deduplicate
deduplicate_bismark --bam 04.mapping/YS2_1_bismark_bt2_pe.bam --output_dir 05.deduplicate
deduplicate_bismark --bam 04.mapping/YS1_1_bismark_bt2_pe.bam --output_dir 05.deduplicate
deduplicate_bismark --bam 04.mapping/WT3_1_bismark_bt2_pe.bam --output_dir 05.deduplicate
deduplicate_bismark --bam 04.mapping/WT2_1_bismark_bt2_pe.bam --output_dir 05.deduplicate
deduplicate_bismark --bam 04.mapping/WT1_1_bismark_bt2_pe.bam --output_dir 05.deduplicate
deduplicate_bismark --bam 04.mapping/RS3_1_bismark_bt2_pe.bam --output_dir 05.deduplicate
deduplicate_bismark --bam 04.mapping/RS2_1_bismark_bt2_pe.bam --output_dir 05.deduplicate
deduplicate_bismark --bam 04.mapping/RS1_1_bismark_bt2_pe.bam --output_dir 05.deduplicate

提取甲基化结果

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
rm(list = ls())

readr::read_delim("data/其他项目/张兴/data/all.rawdata.txt", col_names = FALSE, delim = "\t") %>%
magrittr::set_names("file") %>%
dplyr::mutate(path = stringr::str_split(file, ":") %>% sapply("[", 2) %>% stringr::str_split(" ") %>% sapply("[", 2)) %>%
dplyr::select(path) %>%
dplyr::mutate(
sample = stringr::str_split(path, "/SOL") %>% sapply("[", 1) %>% stringr::str_replace("01.rawdata/", "") %>% stringr::str_replace("C88", ""),
type = case_when(stringr::str_detect(path, "_1.clean.fq.gz") ~ "_1", TRUE ~ "_2")
) %>%
dplyr::mutate(group = paste0(sample, type)) %>%
dplyr::select(sample) %>%
dplyr::distinct_all() -> df.file

all.comm <- NULL

for (i in unique(df.file$sample)) {
sprintf("bismark_methylation_extractor --parallel 60 --scaffolds --gzip --bedGraph 05.deduplicate/%s.bam -o 08.methylation/ ", i) %>%
as.data.frame() %>%
magrittr::set_names("file") %>%
dplyr::bind_rows(all.comm) -> all.comm
}

all.comm %>%
dplyr::slice(nrow(.):1) %>%
write.table("data/其他项目/张兴/data/methylation_extractor.sh", row.names = FALSE, col.names = FALSE, quote = FALSE)

具体的代码:

1
2
3
4
5
6
7
8
9
bismark_methylation_extractor --parallel 60 --scaffolds --gzip --bedGraph 05.deduplicate/RS1.bam -o 08.methylation/ 
bismark_methylation_extractor --parallel 60 --scaffolds --gzip --bedGraph 05.deduplicate/RS2.bam -o 08.methylation/
bismark_methylation_extractor --parallel 60 --scaffolds --gzip --bedGraph 05.deduplicate/RS3.bam -o 08.methylation/
bismark_methylation_extractor --parallel 60 --scaffolds --gzip --bedGraph 05.deduplicate/WT1.bam -o 08.methylation/
bismark_methylation_extractor --parallel 60 --scaffolds --gzip --bedGraph 05.deduplicate/WT2.bam -o 08.methylation/
bismark_methylation_extractor --parallel 60 --scaffolds --gzip --bedGraph 05.deduplicate/WT3.bam -o 08.methylation/
bismark_methylation_extractor --parallel 60 --scaffolds --gzip --bedGraph 05.deduplicate/YS1.bam -o 08.methylation/
bismark_methylation_extractor --parallel 60 --scaffolds --gzip --bedGraph 05.deduplicate/YS2.bam -o 08.methylation/
bismark_methylation_extractor --parallel 60 --scaffolds --gzip --bedGraph 05.deduplicate/YS3.bam -o 08.methylation/

下游分析

下游分析主要使用methylKit这个R包完成。

Akalin A, Kormaksson M, Li S, et al. methylKit: a comprehensive R package for the analysis of genome-wide DNA methylation profiles[J]. Genome biology, 2012, 13: 1-9.

bam转sam

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
rm(list = ls())

readr::read_delim("data/其他项目/张兴/data/all.rawdata.txt", col_names = FALSE, delim = "\t") %>%
magrittr::set_names("file") %>%
dplyr::mutate(path = stringr::str_split(file, ":") %>% sapply("[", 2) %>% stringr::str_split(" ") %>% sapply("[", 2)) %>%
dplyr::select(path) %>%
dplyr::mutate(
sample = stringr::str_split(path, "/SOL") %>% sapply("[", 1) %>% stringr::str_replace("01.rawdata/", "") %>% stringr::str_replace("C88", ""),
type = case_when(stringr::str_detect(path, "_1.clean.fq.gz") ~ "_1", TRUE ~ "_2")
) %>%
dplyr::mutate(group = paste0(sample, type)) %>%
dplyr::select(sample) %>%
dplyr::distinct_all() -> df.file

all.comm <- NULL

for (i in unique(df.file$sample)) {
sprintf("samtools view -@ 70 -h 07.sort.bam/%s.sorted.bam | samtools sort -@ 70 -o 13.sam4methkit/%s.sorted.sam", i, i) %>%
as.data.frame() %>%
magrittr::set_names("file") %>%
dplyr::bind_rows(all.comm) -> all.comm
}

all.comm %>%
write.table("data/其他项目/张兴/data/bam2sam.sh", row.names = FALSE, col.names = FALSE, quote = FALSE)

生成的代码:

1
2
3
4
5
6
7
8
9
samtools view -@ 70 -h 07.sort.bam/YS3.sorted.bam | samtools sort -@ 70 -o 13.sam4methkit/YS3.sorted.sam
samtools view -@ 70 -h 07.sort.bam/YS2.sorted.bam | samtools sort -@ 70 -o 13.sam4methkit/YS2.sorted.sam
samtools view -@ 70 -h 07.sort.bam/YS1.sorted.bam | samtools sort -@ 70 -o 13.sam4methkit/YS1.sorted.sam
samtools view -@ 70 -h 07.sort.bam/WT3.sorted.bam | samtools sort -@ 70 -o 13.sam4methkit/WT3.sorted.sam
samtools view -@ 70 -h 07.sort.bam/WT2.sorted.bam | samtools sort -@ 70 -o 13.sam4methkit/WT2.sorted.sam
samtools view -@ 70 -h 07.sort.bam/WT1.sorted.bam | samtools sort -@ 70 -o 13.sam4methkit/WT1.sorted.sam
samtools view -@ 70 -h 07.sort.bam/RS3.sorted.bam | samtools sort -@ 70 -o 13.sam4methkit/RS3.sorted.sam
samtools view -@ 70 -h 07.sort.bam/RS2.sorted.bam | samtools sort -@ 70 -o 13.sam4methkit/RS2.sorted.sam
samtools view -@ 70 -h 07.sort.bam/RS1.sorted.bam | samtools sort -@ 70 -o 13.sam4methkit/RS1.sorted.sam

提取甲基化结果

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# 合并CpG结果
files_list <- list(
"13.sam4methkit/WT1.sorted.sam", "13.sam4methkit/WT2.sorted.sam", "13.sam4methkit/WT3.sorted.sam",
"13.sam4methkit/YS1.sorted.sam", "13.sam4methkit/YS2.sorted.sam", "13.sam4methkit/YS3.sorted.sam",
"13.sam4methkit/RS1.sorted.sam", "13.sam4methkit/RS2.sorted.sam", "13.sam4methkit/RS3.sorted.sam"
)

methylKit::processBismarkAln(
location = files_list,
sample.id = list("WT1", "WT2", "WT3", "YS1", "YS2", "YS3", "RS1", "RS2", "RS3"),
assembly = "C88",
save.folder = "14.methykit",
read.context = "CpG",
save.context = "CpG",
treatment = c(0, 0, 0, 1, 1, 1, 2, 2, 2)
) -> obj.cpg

save(obj.cpg, file = "14.methykit/CpG.RData")

会生成每个样品的甲基化结果:

1
2
3
4
5
6
14.methykit
├── WT1_CHG.txt
├── WT1_CHH.txt
└── WT1_CpG.txt

0 directories, 3 files

输出的结果就是methylKit需要的标准化的输入数据:

1
2
3
4
5
6
7
8
9
10
chrBase chr     base    strand  coverage        freqC   freqT
chr1_1.10277 chr1_1 10277 F 10 100.00 0.00
chr1_1.1032 chr1_1 1032 F 17 94.12 5.88
chr1_1.10356 chr1_1 10356 F 10 80.00 20.00
chr1_1.1065 chr1_1 1065 F 14 92.86 7.14
chr1_1.10717 chr1_1 10717 F 11 90.91 9.09
chr1_1.10730 chr1_1 10730 F 11 81.82 18.18
chr1_1.10747 chr1_1 10747 F 12 83.33 16.67
chr1_1.10780 chr1_1 10780 F 12 100.00 0.00
chr1_1.10787 chr1_1 10787 F 10 100.00 0.00

每一列依次是:

  • 染色体编号+碱基位置
  • 染色体编号
  • 碱基位置
  • 正负链
  • 比对的覆盖度(methylKit要求最低是10)
  • C碱基的频率
  • T碱基的频率

读取数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
library(methylKit)

rm(list = ls())

file.list=list( system.file("extdata",
"test1.myCpG.txt", package = "methylKit"),
system.file("extdata",
"test2.myCpG.txt", package = "methylKit"),
system.file("extdata",
"control1.myCpG.txt", package = "methylKit"),
system.file("extdata",
"control2.myCpG.txt", package = "methylKit") )


# read the files to a methylRawList object: myobj
myobj=methRead(file.list,
sample.id=list("test1","test2","ctrl1","ctrl2"),
assembly="hg18",
treatment=c(1,1,0,0),
context="CpG",
mincov = 10
)

样品描述性统计

1
2
3
4
5
6
# 描述性统计, 第一个样品
myobj[[1]] %>%
methylKit::getMethylationStats(plot = TRUE, both.strands = FALSE, labels = TRUE)

myobj[[1]] %>%
methylKit::getCoverageStats(plot = TRUE, both.strands = FALSE)

比较分析

合并样本

CpG的结果可以把参数设置为destrand=TRUE,但是CpH的结果只能是destrand=FALSE.

1
2
3
4
# 合并样品
meth=unite(myobj, destrand=FALSE)

head(meth)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
methylBase object with 6 rows
--------------
chr start end strand coverage1 numCs1 numTs1 coverage2 numCs2 numTs2 coverage3 numCs3 numTs3 coverage4 numCs4 numTs4
1 chr21 9853296 9853296 + 17 10 7 333 268 65 18 16 2 395 341 54
2 chr21 9853326 9853326 + 17 12 5 329 249 79 16 14 2 379 284 95
3 chr21 9860126 9860126 + 39 38 1 83 78 5 83 83 0 41 40 1
4 chr21 9906604 9906604 + 68 42 26 111 97 14 23 18 5 37 33 4
5 chr21 9906616 9906616 + 68 52 16 111 104 7 23 14 9 37 27 10
6 chr21 9906619 9906619 + 68 59 9 111 109 2 22 18 4 37 29 8
--------------
sample.ids: test1 test2 ctrl1 ctrl2
destranded FALSE
assembly: hg18
context: CpG
treament: 1 1 0 0
resolution: base

coverage2后面的123啥的就是样品的分组信息编号。

样品相关性

1
2
# 样品间的相关性
methylKit::getCorrelation(meth, plot = TRUE)

image-20241010103420362

1
2
3
4
5
          test1     test2     ctrl1     ctrl2
test1 1.0000000 0.9252530 0.8767865 0.8737509
test2 0.9252530 1.0000000 0.8791864 0.8801669
ctrl1 0.8767865 0.8791864 1.0000000 0.9465369
ctrl2 0.8737509 0.8801669 0.9465369 1.0000000

样品聚类

1
2
clusterSamples(meth, dist="correlation", method="ward", plot=FALSE) %>% 
ggtree::ggtree()

image-20241010104510976

PCA

1
PCASamples(meth)

image-20241010104810186

DMR鉴定

1
2
3
4
5
6
7
8
9
10
11
12
13
myDiff=calculateDiffMeth(meth, mc.cores=10)

# get hyper methylated bases
myDiff25p.hyper=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hyper")
#
# get hypo methylated bases
myDiff25p.hypo=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hypo")
#
#
# get all differentially methylated bases
myDiff25p=getMethylDiff(myDiff,difference=25,qvalue=0.01)

diffMethPerChr(myDiff,plot=FALSE,qvalue.cutoff=0.01, meth.cutoff=25) -> df.chr

DMR注释

使用基因进行注释。这个注释操作将告诉我们启动子/intro/ex ONS/inter 与 IC 区域上的四个差异甲基化区域的百分比。

1
2
3
4
5
library(genomation)

gene.obj=readTranscriptFeatures(system.file("extdata", "refseq.hg18.bed.txt", package = "methylKit"))

annotateWithGeneParts(as(myDiff25p,"GRanges"),gene.obj)

输出结果:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
Summary of target set annotation with genic parts
Rows in target set: 133
-----------------------
percentage of target features overlapping with annotation:
promoter exon intron intergenic
27.82 15.04 34.59 57.14

percentage of target features overlapping with annotation:
(with promoter > exon > intron precedence):
promoter exon intron intergenic
27.82 0.00 15.04 57.14

percentage of annotation boundaries with feature overlap:
promoter exon intron
0.29 0.03 0.17

summary of distances to the nearest TSS:
Min. 1st Qu. Median Mean 3rd Qu. Max.
5 828 45158 52034 94644 313528

全基因组DNA甲基化分析流程
https://lixiang117423.github.io/article/dnamethylation/
作者
李详【Xiang LI】
发布于
2024年10月10日
许可协议