从二代测序数据中鉴定是否含有特定基因

基因组下载

下载参考基因组数据。

1
2
3
4
# Download the R498 reference genome, core-set annotation, and core-set
# protein sequences. FIX: "get" is not a command -- restored "wget".
wget https://www.mbkbase.org/R498/R498_Chr.soft.fasta.gz
wget https://www.mbkbase.org/R498/R498_IGDBv3_coreset.tar.gz
wget https://www.mbkbase.org/R498/R498_CoreSet.pro.tar.gz
# NOTE(review): the line below repeats the previous download verbatim --
# possibly a different archive (e.g. the CDS set) was intended; confirm.
wget https://www.mbkbase.org/R498/R498_CoreSet.pro.tar.gz

Blast比对

用核酸序列(-dbtype nucl)建库后运行 blastp 疯狂报错——blastp 需要的是蛋白库:

1
BLAST Database error: No alias or index file found for protein database [./blastdb/piapik] in search path [/sas16t/lixiang/yyt.reseq/02.blast::]

换蛋白序列建库比对:

1
2
# Build a protein BLAST database from the Pia/Pik protein sequences, then
# align the R498 core-set proteins against it: tabular output (-outfmt 6),
# E-value cutoff 1e-5, 60 threads.
makeblastdb -in piapik.fa -dbtype prot -out blastdb/piapik
blastp -query R498_CoreSet.pros.format.fasta -db blastdb/piapik -evalue 1e-5 -num_threads 60 -outfmt 6 -out blast.out.txt

提取gff文件和基因序列

提取gff文件:

1
2
# Pull the annotation lines for the two candidate gene models (identified by
# the BLAST step) out of the core-set GFF, one file per gene.
grep "OsR498G1119642600.01" R498_IGDBv3_coreset/R498_IGDBv3_coreset.gff > pia.gff
grep "OsR498G1120737800.01" R498_IGDBv3_coreset/R498_IGDBv3_coreset.gff > pik.gff

提取基因序列:

1
2
# Convert "gene" features of the combined GFF to 6-column BED.
# sed replaces double quotes with tabs so the quoted attribute value lands in
# awk field 14 (assumes the attribute column uses quoted values -- confirm
# against the actual GFF). awk keeps rows whose feature type ($3) matches
# $type and prints: chrom, 0-based start ($4-1), end, name, "." score, strand.
type="gene"
sed 's/"/\t/g' piapik.gff | awk -v type="${type}" 'BEGIN{OFS=FS="\t"}{if($3==type) {print $1,$4-1,$5,$14,".",$7}}' > piapik.bed

提取蛋白序列和CDS序列用于后续的SnpEff:

1
2
# Subset the genome-wide CDS and protein FASTA files down to the Pia/Pik
# transcripts listed in piapik.cds.id.txt (one ID per line).
seqkit grep -f piapik.cds.id.txt genome.cds.fa > piapik.cds.fa
seqkit grep -f piapik.cds.id.txt genome.pro.fa > piapik.pro.fa

将这些文件拷贝到SnpEff文件夹:

1
2
3
4
5
# Stage the inputs under the names SnpEff expects inside its data directory:
# sequences.fa (genome), genes.gff (annotation), cds.fa, protein.fa, and the
# genome FASTA under genomes/<GENOME_NAME>.fa for the "PiaPik" database.
cp ../../../01.data/genome/piapik.fa sequences.fa
cp ../../../01.data/genome/piapik.gff genes.gff
cp ../../../01.data/genome/R498_IGDBv3_coreset/piapik.cds.fa cds.fa
cp ../../../01.data/genome/R498_IGDBv3_coreset/piapik.pro.fa ./protein.fa
cp PiaPik/sequences.fa ./genomes/PiaPik.fa

构建索引

1
2
3
# Index the two-gene reference for the mapping/calling pipeline:
# BWA index for alignment, samtools .fai for random access, and the
# GATK sequence dictionary (.dict) required by HaplotypeCaller.
bwa index 01.data/genome/piapik.fa
samtools faidx 01.data/genome/piapik.fa
gatk CreateSequenceDictionary -R 01.data/genome/piapik.fa

SnpEff索引

1
java -jar /home/lixiang/mambaforge/envs/gatk4/share/snpeff-5.1-2/snpEff.jar build -c snpEff.config -gff3 -v PiaPik -noCheckCds -noCheckProtein

批量运行

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
rm(list = ls())

library(tidyverse)

# List the cleaned fastq files and derive each sample name: everything before
# the first "." minus the trailing two characters (the "_1"/"_2" mate tag).
dir("./01.data/clean.data/") %>%
  tibble::as_tibble_col(column_name = "file") %>%
  dplyr::mutate(sample = stringr::str_split(file, "\\.") %>%
                  sapply("[", 1) %>%
                  stringr::str_sub(1, stringr::str_length(.) - 2)) -> samples

# Build the batch shell script: one pipeline (mapping -> dedup -> calling ->
# annotation) per sample, appended in execution order.
genome <- "01.data/genome/piapik.fa"

all.run <- character(0)

for (i in unique(samples$sample)) {
  # Resume support: skip samples whose mapped BAM already exists.
  # FIX: the original tested ./03.mapping/, but the mapped BAM is written to
  # ./04.sorted.bam/, so the skip never fired.
  if (file.exists(sprintf("./04.sorted.bam/%s.sorted.mapped.bam", i))) {
    next
  }

  f.read <- sprintf("./01.data/clean.data/%s_1.clean.fq.gz", i)
  r.read <- sprintf("./01.data/clean.data/%s_2.clean.fq.gz", i)
  title  <- sprintf("\"@RG\\tID:%s\\tSM:%s\\tPL:illumina\"", i, i)
  bam    <- sprintf("./03.mapping/%s.sorted.bam", i)

  all.run <- c(
    all.run,
    # bwa mem: map the pair, attach a read group, sort on the fly
    sprintf("bwa mem -M -t 75 -R %s %s %s %s | samtools sort -@ 50 -m 8G -o %s",
            title, genome, f.read, r.read, bam),
    # keep only pairs where at least one mate mapped (-F 12)
    sprintf("samtools view -bF 12 -@ 75 03.mapping/%s.sorted.bam -o 04.sorted.bam/%s.sorted.mapped.bam", i, i),
    # index the mapped BAM
    sprintf("samtools index -@ 75 -bc 04.sorted.bam/%s.sorted.mapped.bam", i),
    # per-reference coverage summary (directory name "converage" matches the
    # on-disk layout used throughout this project -- do not "fix" it)
    sprintf("samtools coverage ./04.sorted.bam/%s.sorted.mapped.bam > 05.converage/%s.coverage.txt", i, i),
    # mark and remove duplicates.
    # FIX: REMOVE_DUPLICATES=trueu -> true (picard rejects "trueu")
    sprintf("picard -Xmx400g MarkDuplicates I=04.sorted.bam/%s.sorted.mapped.bam O=06.picard/%s.picard.bam CREATE_INDEX=true REMOVE_DUPLICATES=true M=06.picard/%s.txt", i, i, i),
    # per-sample GVCF
    sprintf("gatk --java-options \"-Xmx20G -XX:ParallelGCThreads=8\" HaplotypeCaller -R 01.data/genome/piapik.fa -I 06.picard/%s.picard.bam -O 07.gatk/%s.gvcf -ERC GVCF", i, i),
    # GVCF -> VCF
    sprintf("gatk GenotypeGVCFs -R 01.data/genome/piapik.fa -V 07.gatk/%s.gvcf -O 07.gatk/%s.vcf", i, i),
    # SnpEff annotation against the custom PiaPik database
    sprintf("java -Xmx10G -jar ~/mambaforge/envs/gatk4/share/snpeff-5.1-2/snpEff.jar ann -c 08.snpeff/snpEff.config PiaPik 07.gatk/%s.vcf > 08.snpeff/%s.ann.vcf -csvStats 08.snpeff/%s.positive.csv -stats 08.snpeff/%s.positive.html", i, i, i, i)
  )
  # (Optional per-gene BAM extraction / bam2fastq / cleanup steps that were
  # commented out in the original have been dropped here for clarity.)
}

# Commands are already in execution order, so no reverse-slice is needed
# (the original prepended each command and reversed at the end).
writeLines(all.run, "./run.all.sh")

合并覆盖度

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Merge the per-sample samtools-coverage tables into one data frame.
# FIX: as_data_frame() is deprecated -- build the tibble directly.
dplyr::tibble(file = dir("./05.converage/")) -> coverage

all.converage.final = NULL

for (i in coverage$file) {
  sprintf("./05.converage/%s", i) %>%
    readr::read_delim() %>%
    # tag every row with the sample name (file name minus ".coverage.txt")
    dplyr::mutate(sample = stringr::str_replace(i, ".coverage.txt", "")) %>%
    # move the sample column to the front
    dplyr::select(ncol(.), 1:(ncol(.) - 1)) %>%
    # prepend so later files end up first (preserves the original row order)
    rbind(all.converage.final) -> all.converage.final
}

all.converage.final %>%
  write.table("./all.coverage.txt", sep = "\t", quote = FALSE, row.names = FALSE)

判断突变类型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
rm(list = ls())

# Extract the k-th "|"-separated field of the SnpEff ANN entry.
# NOTE(review): assumes ANN sits at the 12th ";"-separated slot of INFO --
# this is specific to the GATK+SnpEff output produced here; confirm if the
# upstream tools or their options change.
get.ann.field <- function(info, k) {
  stringr::str_split(info, ";") %>%
    sapply("[", 12) %>%
    stringr::str_split("\\|") %>%
    sapply("[", k)
}

# Exon intervals from the two-gene GFF; "." in the sequence name is rewritten
# to "_" to match the chromosome names used in the VCFs.
readr::read_delim("./01.data/genome/piapik.gff", col_names = FALSE) %>%
  dplyr::select(1, 3:5) %>%
  magrittr::set_names(c("HROM", "gene.region", "start", "end")) %>%
  dplyr::filter(gene.region == "exon") %>%
  dplyr::mutate(HROM = stringr::str_replace(HROM, "\\.", "_")) -> df.gff

# All SnpEff-annotated VCFs.
# FIX: as_data_frame() is deprecated -- build the tibble directly.
dplyr::tibble(vcf = dir("./08.snpeff/")) %>%
  dplyr::filter(stringr::str_ends(vcf, "ann.vcf")) -> vcf

all.vcf = NULL

for (i in vcf$vcf) {
  sprintf("./08.snpeff/%s", i) %>%
    data.table::fread() %>%
    magrittr::set_names(c("HROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "sample")) %>%
    dplyr::select(1:8) %>%
    # ANN fields 2 / 6 / 11: effect, feature region, protein-level change
    dplyr::mutate(mutation.type = get.ann.field(INFO, 2),
                  mutation.region = get.ann.field(INFO, 6),
                  mutation.protein = get.ann.field(INFO, 11)) %>%
    dplyr::select(-INFO) %>%
    # join exon intervals by chromosome, then keep variants inside an exon
    dplyr::left_join(df.gff) %>%
    dplyr::mutate(group = case_when(POS >= start & POS <= end ~ "Exon", TRUE ~ "Non-exon")) %>%
    dplyr::filter(group == "Exon") %>%
    dplyr::select(1:5, 8:10) %>%
    magrittr::set_names(c("chrom", "position", "id", "ref", "alt", "mutation.type", "mutation.region", "mutation.protein")) %>%
    dplyr::mutate(sample = stringr::str_replace(i, ".ann.vcf", "")) %>%
    dplyr::mutate(rgene = case_when(chrom == "OsR498G1119642600_01" ~ "Pia", TRUE ~ "Pik")) %>%
    dplyr::select(9:10, 2:8) %>%
    # prepend so later files end up first (preserves the original row order)
    rbind(all.vcf) -> all.vcf
}

all.vcf %>%
  write.table("./all.vcf.txt", sep = "\t", quote = FALSE, row.names = FALSE)

从二代测序数据中鉴定是否含有特定基因
https://lixiang117423.github.io/article/findseqinfastq/
作者
李详【Xiang LI】
发布于
2023年10月8日
许可协议