前处理:
gff
转换成gtf
:
1
| gffread Oryza_sativa.IRGSP-1.0.51.gff3 -T -o rice.gtf
|
提取外显子和可变剪切:
1 2 3
| hisat2_extract_exons.py IRGSP-1.0_representative_transcript_exon_2021-05-10.gtf >IRGSP-1.0.exon
hisat2_extract_splice_sites.py IRGSP-1.0_representative_transcript_exon_2021-05-10.gtf >IRGSP-1.0.ss
|
构建基因组索引:
1
| hisat2-build -p 10 IRGSP-1.0_genome.fasta --ss rice.ss --exon rice.exon ./index/rice.index
|
在分析目录下创建文件夹
data
:存放原始的fastq
数据;
fastqc
:用于存放原始文件质控的结果;
mapping
:用于存放原始数据mapping到参考基因组的结果;
bam
:用于存放sam
转换后的bam
文件;
bam.stat
:用于存放bam
统计结果;
stringtie
:用于存放转录本拼接结果;
ballgown
:用于存放ballgown的结果。
批量比对并将sam
文件转换成bam
文件后输出比对的统计结果:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
| import functools import os import subprocess
files = os.listdir('data/')
samples = []
for i in files: if str.endswith(i,'fastq'): samples.append(str.split(i,'.')[0])
samples = list(set(samples))
f = open('step4.hisat2.alignment.stringtie.sh','w')
down = 0
for i in samples:
down += 1
print_info ='echo 开始运行:' + i + '\n' f.write(print_info)
par_0 = 'hisat2 -p 20 -x /mnt/d/Docker/genefamily/sanqi_WRKY/results/RNA-Seq/data/index/sanqi.index' par_1 = ' -U data/' + i + '.fastq' par_2 = ' -S mapping/' + i + '.sam\n'
hisat2 = par_0 + par_1 + par_2 f.write(hisat2)
par_4 = 'samtools sort -@ 20 -o bam/' + i + '.sorted.bam mapping/' + i + '.sam\n'
f.write(par_4)
del_sam = 'rm ' + 'mapping/' + i + '.sam\n' f.write(del_sam)
flagstat = 'samtools flagstat -@ 20 ' + 'bam/' + i + '.sorted.bam > bam.stat/' + i + '.bam.stat.txt\n'
f.write(flagstat)
par_8 = 'stringtie -p 20 -G /mnt/d/Docker/genefamily/sanqi_WRKY/results/RNA-Seq/data/raw/sanqi.gtf' par_9 = ' -B -o stringtie/' + i + '.gtf' par_10 = ' -l ' + i + ' bam/' + i + '.sorted.bam\n'
stringtie = par_8 + par_9 + par_10 f.write(stringtie) print_info ='echo 已完成:' + i + '\n' f.write(print_info) print_info ='echo 已完成' + str(down) + '/' + str(len(samples)) + '\n' f.write(print_info) print_info ='echo =============================================\n' f.write(print_info) f.close()
subprocess.call('sh step4.hisat2.alignment.stringtie.sh',shell=True)
|
合并转录本:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
| import functools import os import subprocess import pandas as pd
files = os.listdir('stringtie/')
samples = []
for i in files: if str.endswith(i,'.gtf'): samples.append(str.split(i,'.')[0])
samples = list(set(samples))
f_1 = open('stringtie.gtf.merge.list.txt','w')
for i in samples: gtf = i + '.gtf\n' f_1.write(gtf) f_1.close()
f_2= open('merge.gtf.sh')
par = 'stringtie --merge -p 20 -G /mnt/d/Docker/genefamily/sanqi_WRKY/results/RNA-Seq/data/index/raw/sanqi.gtf -o merged.gtf stringtie.gtf.merge.list.txt' f_2.write(par) f_2.close()
subprocess.call('sh merge.gtf.sh',shell=True)
|
ballgown
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
| import functools import os import subprocess import pandas as pd
files = pd.read_table('stringtie.gtf.merge.list.txt',header=None)
samples = []
num = 0
for i in files[0]: if str.endswith(i,'.gtf'): samples.append(str.split(i,'.')[0])
samples = list(set(samples))
f = open('step6.ballgown.sh','w')
for i in samples:
num += 1
print_info ='echo 开始运行:' + i + '\n' f.write(print_info)
par_0 = 'mkdir ballgown/' + i + '\n' par_1 = 'stringtie -e -B -p 20 -G merged.gtf' par_2 = ' -o ballgown/' + i + '/' + i + '.gtf' par_3 = ' bam/' + i + '.sorted.bam' + '\n'
ballgown = par_0 + par_1 + par_2 + par_3
f.write(ballgown)
print_info ='echo 已完成:' + i + '\n' f.write(print_info) print_info ='echo 已完成' + str(num) + '/' + str(len(samples)) + '\n' f.write(print_info) print_info ='echo =============================================\n' f.write(print_info) f.close()
subprocess.call('sh step6.ballgown.sh',shell=True)
|
提取FPKM
值
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
| rm(list = ls())
library(ballgown)
bg = ballgown::ballgown(dataDir = 'ballgown/', samplePattern = 'SRR')
df.fpkm = bg@expr[["trans"]] %>% dplyr::select(gene_id, starts_with('FPKM')) %>% dplyr::mutate(temp = stringr::str_sub(gene_id,1,3)) %>% dplyr::filter(temp == 'Pno') %>% dplyr::select(-temp)
colnames(df.fpkm) = stringr::str_replace(colnames(df.fpkm),'FPKM.','')
rownames(df.fpkm) = df.fpkm$gene_id
|
💌lixiang117423@foxmail.com
💌lixiang117423@gmail.com