转录组Htseq流程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import functools
import os
import subprocess

#os.chdir('G:\\33Pan-genome-RNA-Seq\\fastqc')

files = os.listdir('data/fastq/')

samples = []

for i in files:
if str.endswith(i,'fastq'):
samples.append(str.split(i,'_')[0])

samples = list(set(samples))

f = open('run-this-code.sh','w')

# 创建文件夹
mkdir = 'mkdir fastqc\n'
f.write(mkdir)
mkdir = 'mkdir mapping\n'
f.write(mkdir)
mkdir = 'mkdir counts\n'
f.write(mkdir)
mkdir = 'mkdir bam\n'
f.write(mkdir)
mkdir = 'mkdir bam.stat\n'
f.write(mkdir)

# 构建基因组索引等
# gff2gtf = 'gffread data/genome/*.gff* -T -o data/genome/genome.gtf'
# f.write(gff2gtf)
# extract_exons = 'hisat2_extract_exons.py data/genome/genome.gtf > data/genome/exon.txt'
# f.write(extract_exons)
# extract_splice_sites = 'hisat2_extract_splice_sites.py data/genome/genome.gtf > data/genome/ss.txt'
# f.write(extract_splice_sites)
# mkdir = 'mkdir data/index'
# f.write(mkdir)
# build_index = 'hisat2-build -p 10 data/genome/*.fasta --ss data/genome/ss.txt --exon data/genome/exon.txt data/index/index'
# f.write(build_index)

fastqc = 'fastqc -t 20 data/fastq/*.fastq -o fastqc/\n'
f.write(fastqc)

down = 0

for i in samples:
# alignment
down += 1

print_info ='echo 开始运行:' + i + '\n'
f.write(print_info)

par_0 = 'hisat2 -p 10 -x data/index/sanqi.index'
par_1 = ' -1 data/fastq/' + i + '_1.fastq'
par_2 = ' -2 data/fastq/' + i + '_2.fastq'
par_3 = ' -S mapping/' + i + '.sam\n'

hisat2 = par_0 + par_1 + par_2 + par_3
f.write(hisat2)

par_4 = 'samtools sort -@ 10 -o bam/' + i + '.sorted.bam mapping/' + i + '.sam\n'

f.write(par_4)

del_sam = 'rm ' + 'mapping/' + i + '.sam\n'
f.write(del_sam)

# bam stat
flagstat = 'samtools flagstat -@ 20 ' + 'bam/' + i + '.sorted.bam > bam.stat/' + i + '.bam.stat.txt\n'

f.write(flagstat)

par_8 = 'samtools index bam/' + i + '.sorted.bam\n'

f.write(par_8)

par_1 = 'htseq-count -f bam -r name -i gene_id -s yes -t exon -m union '
par_2 = 'bam/' + i + '.sorted.bam' + ' data/genome/genome.gtf'
par_3 = ' > ' + 'counts/' + i + '.count.txt\n'
htseq = par_1 + par_2 + par_3
f.write(htseq)


print_info ='echo 已完成:' + i + '\n'
f.write(print_info)
print_info ='echo 已完成' + str(down) + '/' + str(len(samples)) + '\n'
f.write(print_info)
print_info ='echo =============================================\n'
f.write(print_info)

if down == len(samples):
merge_counts = 'python merge.counts.py\n'
f.write(merge_counts)
print_info ='echo Counts矩阵合并完成!' + '\n'
f.write(print_info)
getTPMandFPKM = 'Rscript getTPMandFPKM.R\n'
f.write(getTPMandFPKM)
print_info ='echo TPM与FPKM提取完成!' + '\n'
f.write(print_info)

f.close()

subprocess.call('sh run-this-code.sh',shell=True)

💌lixiang117423@foxmail.com
💌lixiang117423@gmail.com


转录组Htseq流程
https://lixiang117423.github.io/article/1589gh89/
作者
小蓝哥
发布于
2021年11月23日
许可协议