生物信息学脚本合集

一些在AI协助下写的生物信息学脚本,自用。

基因组

hifiasm使用HiFi和Hi-C数据进行基因组组装

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
#!/bin/bash
#########################################################################
# 🧬 HiFi + Hi-C genome assembly script
# Software: hifiasm
# Sample: OV53_1
# Estimated genome size: 1.4-1.5 Gb
#########################################################################

set -e          # exit immediately on any error
set -u          # treat use of unset variables as an error
set -o pipefail # fail a pipeline if ANY stage fails (needed for `hifiasm ... | tee` below)

# # One-off preprocessing: merge Hi-C lane files (kept for the record)
# zcat /home/project/68.三代数据组装和注释/01.data/hic/raw/ov53-1-HIC1/*_1.fq.gz >> /home/project/68.三代数据组装和注释/01.data/hic/OV53_1-hic_R1.fastq
# zcat /home/project/68.三代数据组装和注释/01.data/hic/raw/ov53-1-HIC1/*_2.fq.gz >> /home/project/68.三代数据组装和注释/01.data/hic/OV53_1-hic_R2.fastq

# gzip /home/project/68.三代数据组装和注释/01.data/hic/OV53_1-hic_R1.fastq
# gzip /home/project/68.三代数据组装和注释/01.data/hic/OV53_1-hic_R2.fastq


# 📁 Input data paths (readonly: nothing below should reassign them)
readonly HIFI_DATA="/home/project/68.三代数据组装和注释/01.data/hifi/OV53_1-hifi.fq"
readonly HIC_R1="/home/project/68.三代数据组装和注释/01.data/hic/OV53_1-hic_R1.fastq.gz"
readonly HIC_R2="/home/project/68.三代数据组装和注释/01.data/hic/OV53_1-hic_R2.fastq.gz"
readonly BASE_DIR="/home/project/68.三代数据组装和注释/02.assembly"

# ⚙️ Assembly parameters
readonly PREFIX="OV53_1"
readonly THREADS=88
readonly GENOME_SIZE="1.45g" # midpoint of the 1.4-1.5G estimate
readonly N_HAP=2             # diploid

# 📂 Output directory layout
readonly WORK_DIR="${BASE_DIR}/${PREFIX}"
readonly RAW_DIR="${WORK_DIR}/01.raw_output"  # raw hifiasm output (GFA etc.)
readonly FASTA_DIR="${WORK_DIR}/02.fasta"     # FASTA-format files
readonly LOG_DIR="${WORK_DIR}/03.logs"        # log files
readonly STAT_DIR="${WORK_DIR}/04.statistics" # statistics reports

# 📋 Print the run configuration so the log records exactly what was used
echo "=========================================="
echo "🧬 开始基因组组装"
echo "=========================================="
echo "📌 样本名称: ${PREFIX}"
echo "📌 基因组大小: ${GENOME_SIZE}"
echo "📌 线程数: ${THREADS}"
echo "📌 倍性: ${N_HAP}"
echo "📌 HiFi数据: ${HIFI_DATA}"
echo "📌 Hi-C R1: ${HIC_R1}"
echo "📌 Hi-C R2: ${HIC_R2}"
echo "📌 工作目录: ${WORK_DIR}"
echo "=========================================="
echo ""

# 🔍 Fail fast: verify every input file exists before starting a multi-hour run
echo "🔍 检查输入文件..."
if [ ! -f "${HIFI_DATA}" ]; then
    echo "❌ 错误: HiFi文件不存在: ${HIFI_DATA}"
    exit 1
fi

if [ ! -f "${HIC_R1}" ]; then
    echo "❌ 错误: Hi-C R1文件不存在: ${HIC_R1}"
    exit 1
fi

if [ ! -f "${HIC_R2}" ]; then
    echo "❌ 错误: Hi-C R2文件不存在: ${HIC_R2}"
    exit 1
fi
echo "✅ 所有输入文件检查通过"
echo ""

# 📁 Create the output directory layout (expansions quoted: safe for
# paths containing spaces or glob characters)
echo "📁 创建目录结构..."
mkdir -p "${RAW_DIR}" "${FASTA_DIR}" "${LOG_DIR}" "${STAT_DIR}"
echo "✅ 目录结构已创建:"
echo " 📂 ${WORK_DIR}/"
echo " ├── 📂 01.raw_output/ (原始GFA等文件)"
echo " ├── 📂 02.fasta/ (FASTA格式文件)"
echo " ├── 📂 03.logs/ (日志文件)"
echo " └── 📂 04.statistics/ (统计信息)"
echo ""

# ⏰ Record start time in epoch seconds for the final elapsed-time report
START_TIME=$(date +%s)
echo "⏰ 开始时间: $(date '+%Y-%m-%d %H:%M:%S')"
echo ""

# 🚀 Run hifiasm assembly (HiFi + Hi-C integrated mode)
echo "=========================================="
echo "🚀 运行hifiasm组装 (HiFi + Hi-C模式)"
echo "=========================================="

cd "${RAW_DIR}"

# `tee` mirrors hifiasm's combined stdout+stderr to the terminal and the log.
hifiasm \
    -o "${PREFIX}" \
    -t "${THREADS}" \
    --hg-size "${GENOME_SIZE}" \
    --n-hap "${N_HAP}" \
    --h1 "${HIC_R1}" \
    --h2 "${HIC_R2}" \
    --primary \
    -l 3 \
    "${HIFI_DATA}" 2>&1 | tee "${LOG_DIR}/hifiasm_assembly.log"

# ✅ BUG FIX: check hifiasm's own exit status, not `$?`.
# After a pipeline, `$?` is the status of the LAST stage (tee), which is
# almost always 0 — so the original check could never detect a hifiasm
# failure. PIPESTATUS[0] holds the first stage's (hifiasm's) real status
# and must be read immediately after the pipeline.
HIFIASM_STATUS=${PIPESTATUS[0]}
if [ "${HIFIASM_STATUS}" -eq 0 ]; then
    echo ""
    echo "=========================================="
    echo "✅ 组装完成!"
    echo "=========================================="
else
    echo ""
    echo "=========================================="
    echo "❌ 组装失败,请检查错误信息"
    echo "=========================================="
    exit 1
fi

# 🔄 Convert hifiasm's GFA outputs to FASTA
echo ""
echo "=========================================="
echo "🔄 转换GFA为FASTA格式"
echo "=========================================="

cd ${RAW_DIR}

# Lookup table: hifiasm GFA output name -> destination FASTA path.
declare -A GFA_FILES
GFA_FILES["${PREFIX}.hic.hap1.p_ctg.gfa"]="${FASTA_DIR}/${PREFIX}.hap1.primary.fa"
GFA_FILES["${PREFIX}.hic.hap2.p_ctg.gfa"]="${FASTA_DIR}/${PREFIX}.hap2.primary.fa"
GFA_FILES["${PREFIX}.hic.p_ctg.gfa"]="${FASTA_DIR}/${PREFIX}.primary.fa"
GFA_FILES["${PREFIX}.hic.a_ctg.gfa"]="${FASTA_DIR}/${PREFIX}.alternate.fa"

# Convert one GFA file to FASTA.
#   $1 - input GFA file
#   $2 - output FASTA file
# Every segment ("S") record becomes one FASTA entry: column 2 is the
# sequence name, column 3 the sequence. Prints a short summary per file.
convert_gfa_to_fasta() {
    local gfa_file=$1
    local fasta_file=$2

    if [ -f "${gfa_file}" ]; then
        echo "🔹 转换: $(basename "${gfa_file}") -> $(basename "${fasta_file}")"
        # Match the record-type field exactly (safer than the prefix regex /^S/)
        awk '$1 == "S" {print ">"$2; print $3}' "${gfa_file}" > "${fasta_file}"

        # Quick sanity check: sequence count and total length.
        # grep -c exits 1 when there are zero matches, which would abort the
        # whole script under `set -e` — default to 0 in that case.
        local num_seqs
        num_seqs=$(grep -c "^>" "${fasta_file}") || num_seqs=0
        local total_len
        # sum+0 forces numeric output ("0" instead of blank for empty files)
        total_len=$(awk '/^>/{next}{sum+=length($0)}END{print sum+0}' "${fasta_file}")
        echo " ✓ 序列数: ${num_seqs}"
        echo " ✓ 总长度: ${total_len} bp"
        echo ""
    else
        echo "⚠️ 警告: 文件不存在 - ${gfa_file}"
        echo ""
    fi
}

# Run the conversion for every expected hifiasm GFA output
for gfa in "${!GFA_FILES[@]}"; do
    convert_gfa_to_fasta "${gfa}" "${GFA_FILES[${gfa}]}"
done

echo "✅ FASTA文件转换完成"
echo ""

# 📊 Generate the assembly statistics report (printed AND saved via tee)
echo "=========================================="
echo "📊 生成组装统计信息"
echo "=========================================="

STAT_FILE="${STAT_DIR}/${PREFIX}_assembly_statistics.txt"

{
    echo "=========================================="
    echo "基因组组装统计报告"
    echo "样本: ${PREFIX}"
    echo "日期: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "=========================================="
    echo ""

    for fasta_file in "${GFA_FILES[@]}"; do
        if [ -f "${fasta_file}" ]; then
            echo "📄 文件: $(basename "${fasta_file}")"
            echo "----------------------------------------"

            # Sequence count; grep -c exits 1 on zero matches, so guard it
            # (otherwise `set -e` would abort the whole script here).
            num_seqs=$(grep -c "^>" "${fasta_file}") || num_seqs=0
            echo "序列数量: ${num_seqs}"

            # Total assembled length
            total_len=$(awk '/^>/{next}{sum+=length($0)}END{print sum}' "${fasta_file}")
            echo "总长度: ${total_len} bp ($(echo "scale=2; ${total_len}/1000000000" | bc) Gb)"

            # Longest sequence.
            # BUG FIX: the original echoed the literal "$31,662 bp" (a stray
            # "$3" positional parameter) instead of the computed ${max_len}.
            max_len=$(awk '/^>/{if(seq){print length(seq)}; seq=""; next}{seq=seq$0}END{if(seq){print length(seq)}}' "${fasta_file}" | sort -rn | head -1)
            echo "最长序列: ${max_len} bp"

            # N50: the contig length at which the cumulative sum of the
            # descending-sorted lengths first reaches half the total length.
            awk '/^>/{if(seq){print length(seq)}; seq=""; next}{seq=seq$0}END{if(seq){print length(seq)}}' "${fasta_file}" | \
                sort -rn | \
                awk '{len[NR]=$1; sum+=$1} END {
                    target=sum/2;
                    cumsum=0;
                    for(i=1; i<=NR; i++){
                        cumsum+=len[i];
                        if(cumsum>=target){
                            print "N50: "len[i]" bp";
                            break
                        }
                    }
                }'

            echo ""
        fi
    done

    echo "=========================================="
    echo "文件位置"
    echo "=========================================="
    echo "原始输出: ${RAW_DIR}"
    echo "FASTA文件: ${FASTA_DIR}"
    echo "日志文件: ${LOG_DIR}"
    echo "统计文件: ${STAT_DIR}"
    echo ""

} | tee "${STAT_FILE}"

echo "✅ 统计信息已保存至: ${STAT_FILE}"
echo ""

# ⏰ Record end time and report elapsed wall-clock time
END_TIME=$(date +%s)
ELAPSED=$((END_TIME - START_TIME))
HOURS=$((ELAPSED / 3600))
MINUTES=$(((ELAPSED % 3600) / 60))
# FIX: renamed from SECONDS — SECONDS is a special bash variable (an
# auto-incrementing timer); assigning to it shadows that behavior and the
# value silently drifts afterwards.
SECS=$((ELAPSED % 60))

echo ""
echo "=========================================="
echo "⏰ 运行时间统计"
echo "=========================================="
# NOTE: `date -d @epoch` is GNU date; on BSD/macOS use `date -r epoch` instead.
echo "开始时间: $(date -d @${START_TIME} '+%Y-%m-%d %H:%M:%S')"
echo "结束时间: $(date -d @${END_TIME} '+%Y-%m-%d %H:%M:%S')"
echo "总耗时: ${HOURS}小时 ${MINUTES}分钟 ${SECS}秒"
echo ""

# 📊 Final list of output files
echo "=========================================="
echo "📊 输出文件清单"
echo "=========================================="
echo ""
echo "📂 ${WORK_DIR}/"
echo " │"
echo " ├── 📂 01.raw_output/ (原始输出文件)"
# FIX: in Hi-C integrated mode hifiasm names its outputs *.hic.*, not *.bp.*
# (consistent with the GFA_FILES table used for conversion above).
echo " │ ├── 🔸 ${PREFIX}.hic.hap1.p_ctg.gfa"
echo " │ ├── 🔸 ${PREFIX}.hic.hap2.p_ctg.gfa"
echo " │ ├── 🔸 ${PREFIX}.hic.p_ctg.gfa"
echo " │ ├── 🔸 ${PREFIX}.hic.a_ctg.gfa"
echo " │ └── 🔸 其他中间文件 (.bin, .ovlp等)"
echo " │"
echo " ├── 📂 02.fasta/ (FASTA格式)"
echo " │ ├── 🧬 ${PREFIX}.hap1.primary.fa (单倍型1主要序列)"
echo " │ ├── 🧬 ${PREFIX}.hap2.primary.fa (单倍型2主要序列)"
echo " │ ├── 🧬 ${PREFIX}.primary.fa (主要组装)"
echo " │ └── 🧬 ${PREFIX}.alternate.fa (备选组装)"
echo " │"
echo " ├── 📂 03.logs/ (日志文件)"
echo " │ └── 📄 hifiasm_assembly.log"
echo " │"
echo " └── 📂 04.statistics/ (统计信息)"
echo " └── 📄 ${PREFIX}_assembly_statistics.txt"
echo ""
echo "=========================================="
echo "🎉 所有任务完成!"
echo "=========================================="
echo ""
echo "💡 下一步建议:"
echo " 1. 查看统计报告: cat ${STAT_FILE}"
echo " 2. 检查组装质量: 使用QUAST或BUSCO"
echo " 3. 可视化组装: 使用Bandage查看GFA文件"
echo ""

其他脚本

BAM文件比对结果统计

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
BAM 文件批量统计报告生成器 📊 (并行修复版 + Excel输出)
"""
import argparse
import subprocess
import re
import sys
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from typing import Dict, List
from multiprocessing import Pool, cpu_count

def check_dependencies():
    """Verify that samtools and the required Python libraries are available.

    Returns:
        bool: True when every dependency is present, False otherwise
        (a human-readable hint is printed to stderr on failure).
    """
    # --- Python libraries ---
    needed = []
    for lib in ('pandas', 'openpyxl', 'tqdm'):
        try:
            __import__(lib)
        except ImportError:
            needed.append(lib)

    if needed:
        print(f"❌ 错误: 缺少必要的Python库。请先安装: {' '.join(needed)}", file=sys.stderr)
        print(f" 运行: pip install {' '.join(needed)}", file=sys.stderr)
        return False

    # --- samtools (must be on PATH) ---
    try:
        subprocess.run(['samtools', '--version'], check=True, capture_output=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("❌ 错误: 未找到 'samtools' 命令。", file=sys.stderr)
        print(" 请确保 samtools 已安装并且在您的系统 PATH 环境变量中。", file=sys.stderr)
        print(" 您可以通过 'conda install -c bioconda samtools' 进行安装。", file=sys.stderr)
        return False
    print("✅ samtools 已找到,准备开始分析...")
    return True

def parse_flagstat(flagstat_output: str) -> Dict:
    """Extract the key metrics from `samtools flagstat` text output.

    Args:
        flagstat_output: raw stdout of `samtools flagstat`.

    Returns:
        Dict with read counts and percentage rates; rates are 0.0 when
        the file contains no reads.
    """
    def _first_count(label: str) -> int:
        # flagstat lines look like "123 + 0 <label> ..."; take the
        # QC-passed count of the first matching line.
        match = re.search(r'(\d+) \+ \d+ ' + label, flagstat_output)
        return int(match.group(1)) if match else 0

    total = _first_count('in total')
    mapped = _first_count('mapped')
    paired = _first_count('properly paired')

    stats = {
        'Total_Reads': total,
        'Mapped_Reads': mapped,
        'Properly_Paired_Reads': paired,
    }

    if total > 0:
        stats['Overall_Mapping_Rate(%)'] = round((mapped / total) * 100, 2)
        stats['Properly_Paired_Rate(%)'] = round((paired / total) * 100, 2)
    else:
        stats['Overall_Mapping_Rate(%)'] = 0.0
        stats['Properly_Paired_Rate(%)'] = 0.0

    return stats

def analyze_bam(bam_file: Path) -> Dict:
    """Run all samtools analyses for one BAM file (worker for the process pool).

    Runs `samtools index` (if needed), `flagstat`, `coverage` and `idxstats`
    as subprocesses, and flattens everything into a single dict keyed by
    metric name (per-chromosome keys are prefixed with the chromosome name).

    Returns:
        Dict of statistics on success, or {'Sample': ..., 'Error': ...}
        on any failure (errors are returned, not raised, so the pool
        keeps processing the remaining files).
    """
    try:
        sample_name = bam_file.name.replace('.sorted.bam', '').replace('.bam', '')
        results = {'Sample': sample_name}

        # Create the .bai index if absent (idxstats requires it).
        # Done inside the worker so each process handles its own file.
        bai_file = bam_file.with_suffix('.bam.bai')
        if not bai_file.exists():
            subprocess.run(['samtools', 'index', str(bam_file)], check=True, capture_output=True)

        # 1. samtools flagstat — read counts / mapping rates
        flagstat_proc = subprocess.run(['samtools', 'flagstat', str(bam_file)], capture_output=True, text=True, check=True)
        results.update(parse_flagstat(flagstat_proc.stdout))

        # 2. samtools coverage — per-region depth and breadth
        coverage_proc = subprocess.run(['samtools', 'coverage', str(bam_file)], capture_output=True, text=True, check=True)

        # 3. samtools idxstats — per-chromosome mapped-read counts
        idxstats_proc = subprocess.run(['samtools', 'idxstats', str(bam_file)], capture_output=True, text=True, check=True)

        # --- Parsing ---
        # "Overall" coverage: take the last non-comment line of the
        # coverage table.
        # NOTE(review): `samtools coverage` emits one row PER REGION with no
        # genome-wide summary row, so this "overall" value appears to be the
        # last chromosome's stats, not a true whole-genome figure — confirm
        # against the samtools version in use.
        coverage_lines = coverage_proc.stdout.strip().splitlines()
        summary_line = None
        for line in reversed(coverage_lines):
            if not line.startswith('#'):
                summary_line = line
                break

        if summary_line:
            overall_cov_parts = summary_line.strip().split('\t')
            if len(overall_cov_parts) > 6:
                # Columns 5/6 of `samtools coverage`: coverage %, mean depth
                results['Overall_Covered_Bases_Rate(%)'] = float(overall_cov_parts[5])
                results['Overall_Coverage_Depth'] = float(overall_cov_parts[6])

        # Per-chromosome mapped-read counts from idxstats
        # (default of 1 avoids division by zero below).
        total_mapped = results.get('Mapped_Reads', 1)
        for line in idxstats_proc.stdout.strip().splitlines():
            parts = line.split('\t')
            if len(parts) < 3: continue
            chrom, _, mapped = parts[0], parts[1], parts[2]
            if chrom == '*': continue  # '*' = unmapped reads bucket
            results[f'{chrom}_Mapped_Reads'] = int(mapped)
            if total_mapped > 0:
                results[f'{chrom}_Mapped_Reads_Percentage(%)'] = round((int(mapped) / total_mapped) * 100, 2)

        # Per-chromosome coverage rows (the row used as the "overall"
        # summary above is deliberately excluded here).
        for line in coverage_lines:
            if line.startswith('#'): continue
            parts = line.strip().split('\t')
            chrom = parts[0]
            if len(parts) > 6 and chrom != "genome" and chrom != summary_line.split('\t')[0]:
                results[f'{chrom}_Coverage_Depth'] = float(parts[6])
                results[f'{chrom}_Covered_Bases_Rate(%)'] = float(parts[5])

        return results

    except subprocess.CalledProcessError as e:
        error_msg = e.stderr.strip() if e.stderr else str(e)
        return {'Sample': bam_file.name, 'Error': error_msg}
    except Exception as e:
        return {'Sample': bam_file.name, 'Error': str(e)}

def main():
    """Parse CLI arguments, analyze all BAM files in parallel, write a report.

    Reads every *.bam in --input_dir, fans them out over a process pool,
    and writes a transposed metrics table (samples as columns) to
    --output_file as Excel (.xlsx) or CSV.
    """
    parser = argparse.ArgumentParser(
        description="BAM 文件批量统计报告生成器 📊 (并行修复版 + Excel输出)",
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument('-i', '--input_dir', type=str, required=True, help="包含 BAM 文件的输入文件夹路径。")
    parser.add_argument('-o', '--output_file', type=str, required=True, help="输出的报告文件名 (推荐使用 .xlsx 后缀)。")
    parser.add_argument('-p', '--processes', type=int, default=cpu_count(), help=f"使用的并行进程数 (默认为全部可用核心: {cpu_count()})。")

    args = parser.parse_args()

    # Abort early if samtools or a required library is missing.
    if not check_dependencies():
        sys.exit(1)

    input_path = Path(args.input_dir)
    if not input_path.is_dir():
        print(f"❌ 错误: 输入路径 '{input_path}' 不是一个有效的文件夹。", file=sys.stderr)
        sys.exit(1)

    # sorted() accepts the glob iterator directly; no intermediate list needed
    bam_files = sorted(input_path.glob('*.bam'))

    if not bam_files:
        print(f"⚠️ 警告: 在文件夹 '{input_path}' 中没有找到任何 .bam 文件。", file=sys.stderr)
        sys.exit(0)

    print(f"📁 在 '{input_path}' 中找到 {len(bam_files)} 个 BAM 文件。")
    print(f"🚀 使用 {args.processes} 个进程并行处理...")

    # Fan out over a process pool; imap_unordered lets tqdm show progress
    # as results come back (result order does not matter here).
    with Pool(processes=args.processes) as pool:
        results_iterator = pool.imap_unordered(analyze_bam, bam_files)
        all_results = list(tqdm(results_iterator, total=len(bam_files), desc="⚙️ 处理BAM文件"))

    successful_results = [res for res in all_results if 'Error' not in res]
    failed_samples = [res for res in all_results if 'Error' in res]

    if failed_samples:
        print("\n❌ 以下样本处理失败:")
        for res in failed_samples:
            print(f" - {res['Sample']}: {res['Error']}")

    if not successful_results:
        print("\n❌ 未能成功处理任何文件,程序退出。", file=sys.stderr)
        sys.exit(1)

    print("\n🔄 正在汇总所有成功的结果并生成报告...")
    df = pd.DataFrame(successful_results)

    # Transpose: samples become columns, metrics become rows.
    df = df.set_index('Sample').T

    # Put the whole-sample core metrics first, then per-chromosome rows
    # in alphabetical order.
    core_indices = [
        'Total_Reads', 'Mapped_Reads', 'Overall_Mapping_Rate(%)',
        'Properly_Paired_Reads', 'Properly_Paired_Rate(%)',
        'Overall_Coverage_Depth', 'Overall_Covered_Bases_Rate(%)'
    ]
    chrom_indices = sorted([idx for idx in df.index if idx not in core_indices])
    final_indices = core_indices + chrom_indices
    df = df.reindex(index=final_indices)

    # Excel when the extension asks for it, CSV otherwise.
    if Path(args.output_file).suffix.lower() == '.xlsx':
        df.to_excel(args.output_file, index=True)
    else:
        df.to_csv(args.output_file, index=True)

    print(f"\n🎉 报告生成完毕!结果已保存至: {args.output_file}")

if __name__ == "__main__":
    main()

生物信息学脚本合集
https://lixiang117423.github.io/article/bioinf-scripts/
作者
李详【Xiang LI】
发布于
2025年12月5日
许可协议