1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
| from Bio import SeqIO import argparse
parser = argparse.ArgumentParser(description='从GFF文件中提取每个转录本的信息,并在此基础上提取指定长度的启动子序列。') parser.add_argument( '-i', '--input', dest='gff_file', type=str, required=True, help='输入GFF文件路径。' ) parser.add_argument( '-g', '--genome', dest='genome_file', type=str, required=True, help='输入基因组序列文件路径。' ) parser.add_argument( '-o', '--output', dest='output_file', type=str, required=True, help='输出文件路径。' ) parser.add_argument( '-l', '--length', dest='promoter_length', type=int, default=1500, help='指定启动子序列的长度,默认为1500bp。' )
args = parser.parse_args()
genome_dict = SeqIO.to_dict(SeqIO.parse(args.genome_file, 'fasta'))
with open(args.output_file, 'w') as out_file, open(args.gff_file) as in_file: for line in in_file: fields = line.strip().split('\t') if fields[2] == 'mRNA': transcript_id = fields[8].split(';')[0].split('=')[1] gene_id = fields[8].split(';')[1].split('=')[1] chr_num = fields[0] start = int(fields[3]) end = int(fields[4]) strand = fields[6] if strand == '+': promoter_start = max(0, start - args.promoter_length) promoter_end = start else: promoter_start = end promoter_end = min(len(genome_dict[chr_num]), end + args.promoter_length) promoter_seq = str(genome_dict[chr_num].seq[promoter_start:promoter_end]).upper() out_file.write(f'>{transcript_id}\n{promoter_seq}\n')
|