生物信息学脚本合集

一些在AI协助下写的生物信息学脚本,自用。

基因组

hifiasm使用HiFi和Hi-C数据进行基因组组装

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
#!/bin/bash
#########################################################################
# 🧬 HiFi + Hi-C genome assembly script
# Software: hifiasm
# Sample: OV53_1
# Estimated genome size: 1.4-1.5 Gb
#########################################################################

set -e          # exit immediately on any error
set -u          # treat use of unset variables as an error
set -o pipefail # fail a pipeline if ANY stage fails (needed for `hifiasm ... | tee` below)

# # One-off preprocessing: merge Hi-C lane files (kept for the record)
# zcat /home/project/68.三代数据组装和注释/01.data/hic/raw/ov53-1-HIC1/*_1.fq.gz >> /home/project/68.三代数据组装和注释/01.data/hic/OV53_1-hic_R1.fastq
# zcat /home/project/68.三代数据组装和注释/01.data/hic/raw/ov53-1-HIC1/*_2.fq.gz >> /home/project/68.三代数据组装和注释/01.data/hic/OV53_1-hic_R2.fastq

# gzip /home/project/68.三代数据组装和注释/01.data/hic/OV53_1-hic_R1.fastq
# gzip /home/project/68.三代数据组装和注释/01.data/hic/OV53_1-hic_R2.fastq


# 📁 Input data paths (readonly: nothing below should reassign them)
readonly HIFI_DATA="/home/project/68.三代数据组装和注释/01.data/hifi/OV53_1-hifi.fq"
readonly HIC_R1="/home/project/68.三代数据组装和注释/01.data/hic/OV53_1-hic_R1.fastq.gz"
readonly HIC_R2="/home/project/68.三代数据组装和注释/01.data/hic/OV53_1-hic_R2.fastq.gz"
readonly BASE_DIR="/home/project/68.三代数据组装和注释/02.assembly"

# ⚙️ Assembly parameters
readonly PREFIX="OV53_1"
readonly THREADS=88
readonly GENOME_SIZE="1.45g" # midpoint of the 1.4-1.5G estimate
readonly N_HAP=2             # diploid

# 📂 Output directory layout
readonly WORK_DIR="${BASE_DIR}/${PREFIX}"
readonly RAW_DIR="${WORK_DIR}/01.raw_output"  # raw hifiasm output (GFA etc.)
readonly FASTA_DIR="${WORK_DIR}/02.fasta"     # FASTA-format files
readonly LOG_DIR="${WORK_DIR}/03.logs"        # log files
readonly STAT_DIR="${WORK_DIR}/04.statistics" # statistics reports

# 📋 Print the run configuration so the log records exactly what was used
echo "=========================================="
echo "🧬 开始基因组组装"
echo "=========================================="
echo "📌 样本名称: ${PREFIX}"
echo "📌 基因组大小: ${GENOME_SIZE}"
echo "📌 线程数: ${THREADS}"
echo "📌 倍性: ${N_HAP}"
echo "📌 HiFi数据: ${HIFI_DATA}"
echo "📌 Hi-C R1: ${HIC_R1}"
echo "📌 Hi-C R2: ${HIC_R2}"
echo "📌 工作目录: ${WORK_DIR}"
echo "=========================================="
echo ""

# 🔍 Fail fast: verify every input file exists before starting a multi-hour run
echo "🔍 检查输入文件..."
if [ ! -f "${HIFI_DATA}" ]; then
    echo "❌ 错误: HiFi文件不存在: ${HIFI_DATA}"
    exit 1
fi

if [ ! -f "${HIC_R1}" ]; then
    echo "❌ 错误: Hi-C R1文件不存在: ${HIC_R1}"
    exit 1
fi

if [ ! -f "${HIC_R2}" ]; then
    echo "❌ 错误: Hi-C R2文件不存在: ${HIC_R2}"
    exit 1
fi
echo "✅ 所有输入文件检查通过"
echo ""

# 📁 Create the output directory layout (expansions quoted: safe for
# paths containing spaces or glob characters)
echo "📁 创建目录结构..."
mkdir -p "${RAW_DIR}" "${FASTA_DIR}" "${LOG_DIR}" "${STAT_DIR}"
echo "✅ 目录结构已创建:"
echo " 📂 ${WORK_DIR}/"
echo " ├── 📂 01.raw_output/ (原始GFA等文件)"
echo " ├── 📂 02.fasta/ (FASTA格式文件)"
echo " ├── 📂 03.logs/ (日志文件)"
echo " └── 📂 04.statistics/ (统计信息)"
echo ""

# ⏰ Record start time in epoch seconds for the final elapsed-time report
START_TIME=$(date +%s)
echo "⏰ 开始时间: $(date '+%Y-%m-%d %H:%M:%S')"
echo ""

# 🚀 Run hifiasm assembly (HiFi + Hi-C integrated mode)
echo "=========================================="
echo "🚀 运行hifiasm组装 (HiFi + Hi-C模式)"
echo "=========================================="

cd "${RAW_DIR}"

# `tee` mirrors hifiasm's combined stdout+stderr to the terminal and the log.
hifiasm \
    -o "${PREFIX}" \
    -t "${THREADS}" \
    --hg-size "${GENOME_SIZE}" \
    --n-hap "${N_HAP}" \
    --h1 "${HIC_R1}" \
    --h2 "${HIC_R2}" \
    --primary \
    -l 3 \
    "${HIFI_DATA}" 2>&1 | tee "${LOG_DIR}/hifiasm_assembly.log"

# ✅ BUG FIX: check hifiasm's own exit status, not `$?`.
# After a pipeline, `$?` is the status of the LAST stage (tee), which is
# almost always 0 — so the original check could never detect a hifiasm
# failure. PIPESTATUS[0] holds the first stage's (hifiasm's) real status
# and must be read immediately after the pipeline.
HIFIASM_STATUS=${PIPESTATUS[0]}
if [ "${HIFIASM_STATUS}" -eq 0 ]; then
    echo ""
    echo "=========================================="
    echo "✅ 组装完成!"
    echo "=========================================="
else
    echo ""
    echo "=========================================="
    echo "❌ 组装失败,请检查错误信息"
    echo "=========================================="
    exit 1
fi

# 🔄 Convert hifiasm's GFA outputs to FASTA
echo ""
echo "=========================================="
echo "🔄 转换GFA为FASTA格式"
echo "=========================================="

cd ${RAW_DIR}

# Lookup table: hifiasm GFA output name -> destination FASTA path.
declare -A GFA_FILES
GFA_FILES["${PREFIX}.hic.hap1.p_ctg.gfa"]="${FASTA_DIR}/${PREFIX}.hap1.primary.fa"
GFA_FILES["${PREFIX}.hic.hap2.p_ctg.gfa"]="${FASTA_DIR}/${PREFIX}.hap2.primary.fa"
GFA_FILES["${PREFIX}.hic.p_ctg.gfa"]="${FASTA_DIR}/${PREFIX}.primary.fa"
GFA_FILES["${PREFIX}.hic.a_ctg.gfa"]="${FASTA_DIR}/${PREFIX}.alternate.fa"

# Convert one GFA file to FASTA.
#   $1 - input GFA file
#   $2 - output FASTA file
# Every segment ("S") record becomes one FASTA entry: column 2 is the
# sequence name, column 3 the sequence. Prints a short summary per file.
convert_gfa_to_fasta() {
    local gfa_file=$1
    local fasta_file=$2

    if [ -f "${gfa_file}" ]; then
        echo "🔹 转换: $(basename "${gfa_file}") -> $(basename "${fasta_file}")"
        # Match the record-type field exactly (safer than the prefix regex /^S/)
        awk '$1 == "S" {print ">"$2; print $3}' "${gfa_file}" > "${fasta_file}"

        # Quick sanity check: sequence count and total length.
        # grep -c exits 1 when there are zero matches, which would abort the
        # whole script under `set -e` — default to 0 in that case.
        local num_seqs
        num_seqs=$(grep -c "^>" "${fasta_file}") || num_seqs=0
        local total_len
        # sum+0 forces numeric output ("0" instead of blank for empty files)
        total_len=$(awk '/^>/{next}{sum+=length($0)}END{print sum+0}' "${fasta_file}")
        echo " ✓ 序列数: ${num_seqs}"
        echo " ✓ 总长度: ${total_len} bp"
        echo ""
    else
        echo "⚠️ 警告: 文件不存在 - ${gfa_file}"
        echo ""
    fi
}

# Run the conversion for every expected hifiasm GFA output
for gfa in "${!GFA_FILES[@]}"; do
    convert_gfa_to_fasta "${gfa}" "${GFA_FILES[${gfa}]}"
done

echo "✅ FASTA文件转换完成"
echo ""

# 📊 Generate the assembly statistics report (printed AND saved via tee)
echo "=========================================="
echo "📊 生成组装统计信息"
echo "=========================================="

STAT_FILE="${STAT_DIR}/${PREFIX}_assembly_statistics.txt"

{
    echo "=========================================="
    echo "基因组组装统计报告"
    echo "样本: ${PREFIX}"
    echo "日期: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "=========================================="
    echo ""

    for fasta_file in "${GFA_FILES[@]}"; do
        if [ -f "${fasta_file}" ]; then
            echo "📄 文件: $(basename "${fasta_file}")"
            echo "----------------------------------------"

            # Sequence count; grep -c exits 1 on zero matches, so guard it
            # (otherwise `set -e` would abort the whole script here).
            num_seqs=$(grep -c "^>" "${fasta_file}") || num_seqs=0
            echo "序列数量: ${num_seqs}"

            # Total assembled length
            total_len=$(awk '/^>/{next}{sum+=length($0)}END{print sum}' "${fasta_file}")
            echo "总长度: ${total_len} bp ($(echo "scale=2; ${total_len}/1000000000" | bc) Gb)"

            # Longest sequence.
            # BUG FIX: the original echoed the literal "$31,662 bp" (a stray
            # "$3" positional parameter) instead of the computed ${max_len}.
            max_len=$(awk '/^>/{if(seq){print length(seq)}; seq=""; next}{seq=seq$0}END{if(seq){print length(seq)}}' "${fasta_file}" | sort -rn | head -1)
            echo "最长序列: ${max_len} bp"

            # N50: the contig length at which the cumulative sum of the
            # descending-sorted lengths first reaches half the total length.
            awk '/^>/{if(seq){print length(seq)}; seq=""; next}{seq=seq$0}END{if(seq){print length(seq)}}' "${fasta_file}" | \
                sort -rn | \
                awk '{len[NR]=$1; sum+=$1} END {
                    target=sum/2;
                    cumsum=0;
                    for(i=1; i<=NR; i++){
                        cumsum+=len[i];
                        if(cumsum>=target){
                            print "N50: "len[i]" bp";
                            break
                        }
                    }
                }'

            echo ""
        fi
    done

    echo "=========================================="
    echo "文件位置"
    echo "=========================================="
    echo "原始输出: ${RAW_DIR}"
    echo "FASTA文件: ${FASTA_DIR}"
    echo "日志文件: ${LOG_DIR}"
    echo "统计文件: ${STAT_DIR}"
    echo ""

} | tee "${STAT_FILE}"

echo "✅ 统计信息已保存至: ${STAT_FILE}"
echo ""

# ⏰ Record end time and report elapsed wall-clock time
END_TIME=$(date +%s)
ELAPSED=$((END_TIME - START_TIME))
HOURS=$((ELAPSED / 3600))
MINUTES=$(((ELAPSED % 3600) / 60))
# FIX: renamed from SECONDS — SECONDS is a special bash variable (an
# auto-incrementing timer); assigning to it shadows that behavior and the
# value silently drifts afterwards.
SECS=$((ELAPSED % 60))

echo ""
echo "=========================================="
echo "⏰ 运行时间统计"
echo "=========================================="
# NOTE: `date -d @epoch` is GNU date; on BSD/macOS use `date -r epoch` instead.
echo "开始时间: $(date -d @${START_TIME} '+%Y-%m-%d %H:%M:%S')"
echo "结束时间: $(date -d @${END_TIME} '+%Y-%m-%d %H:%M:%S')"
echo "总耗时: ${HOURS}小时 ${MINUTES}分钟 ${SECS}秒"
echo ""

# 📊 Final list of output files
echo "=========================================="
echo "📊 输出文件清单"
echo "=========================================="
echo ""
echo "📂 ${WORK_DIR}/"
echo " │"
echo " ├── 📂 01.raw_output/ (原始输出文件)"
# FIX: in Hi-C integrated mode hifiasm names its outputs *.hic.*, not *.bp.*
# (consistent with the GFA_FILES table used for conversion above).
echo " │ ├── 🔸 ${PREFIX}.hic.hap1.p_ctg.gfa"
echo " │ ├── 🔸 ${PREFIX}.hic.hap2.p_ctg.gfa"
echo " │ ├── 🔸 ${PREFIX}.hic.p_ctg.gfa"
echo " │ ├── 🔸 ${PREFIX}.hic.a_ctg.gfa"
echo " │ └── 🔸 其他中间文件 (.bin, .ovlp等)"
echo " │"
echo " ├── 📂 02.fasta/ (FASTA格式)"
echo " │ ├── 🧬 ${PREFIX}.hap1.primary.fa (单倍型1主要序列)"
echo " │ ├── 🧬 ${PREFIX}.hap2.primary.fa (单倍型2主要序列)"
echo " │ ├── 🧬 ${PREFIX}.primary.fa (主要组装)"
echo " │ └── 🧬 ${PREFIX}.alternate.fa (备选组装)"
echo " │"
echo " ├── 📂 03.logs/ (日志文件)"
echo " │ └── 📄 hifiasm_assembly.log"
echo " │"
echo " └── 📂 04.statistics/ (统计信息)"
echo " └── 📄 ${PREFIX}_assembly_statistics.txt"
echo ""
echo "=========================================="
echo "🎉 所有任务完成!"
echo "=========================================="
echo ""
echo "💡 下一步建议:"
echo " 1. 查看统计报告: cat ${STAT_FILE}"
echo " 2. 检查组装质量: 使用QUAST或BUSCO"
echo " 3. 可视化组装: 使用Bandage查看GFA文件"
echo ""

其他脚本

BAM文件比对结果统计

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
BAM 文件批量统计报告生成器 📊 (并行修复版 + Excel输出)
"""
import argparse
import subprocess
import re
import sys
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from typing import Dict, List
from multiprocessing import Pool, cpu_count

def check_dependencies():
    """Verify that samtools and the required Python libraries are available.

    Returns:
        bool: True when every dependency is present, False otherwise
        (a human-readable hint is printed to stderr on failure).
    """
    # --- Python libraries ---
    needed = []
    for lib in ('pandas', 'openpyxl', 'tqdm'):
        try:
            __import__(lib)
        except ImportError:
            needed.append(lib)

    if needed:
        print(f"❌ 错误: 缺少必要的Python库。请先安装: {' '.join(needed)}", file=sys.stderr)
        print(f" 运行: pip install {' '.join(needed)}", file=sys.stderr)
        return False

    # --- samtools (must be on PATH) ---
    try:
        subprocess.run(['samtools', '--version'], check=True, capture_output=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("❌ 错误: 未找到 'samtools' 命令。", file=sys.stderr)
        print(" 请确保 samtools 已安装并且在您的系统 PATH 环境变量中。", file=sys.stderr)
        print(" 您可以通过 'conda install -c bioconda samtools' 进行安装。", file=sys.stderr)
        return False
    print("✅ samtools 已找到,准备开始分析...")
    return True

def parse_flagstat(flagstat_output: str) -> Dict:
    """Extract the key metrics from `samtools flagstat` text output.

    Args:
        flagstat_output: raw stdout of `samtools flagstat`.

    Returns:
        Dict with read counts and percentage rates; rates are 0.0 when
        the file contains no reads.
    """
    def _first_count(label: str) -> int:
        # flagstat lines look like "123 + 0 <label> ..."; take the
        # QC-passed count of the first matching line.
        match = re.search(r'(\d+) \+ \d+ ' + label, flagstat_output)
        return int(match.group(1)) if match else 0

    total = _first_count('in total')
    mapped = _first_count('mapped')
    paired = _first_count('properly paired')

    stats = {
        'Total_Reads': total,
        'Mapped_Reads': mapped,
        'Properly_Paired_Reads': paired,
    }

    if total > 0:
        stats['Overall_Mapping_Rate(%)'] = round((mapped / total) * 100, 2)
        stats['Properly_Paired_Rate(%)'] = round((paired / total) * 100, 2)
    else:
        stats['Overall_Mapping_Rate(%)'] = 0.0
        stats['Properly_Paired_Rate(%)'] = 0.0

    return stats

def analyze_bam(bam_file: Path) -> Dict:
    """Run all samtools analyses for one BAM file (worker for the process pool).

    Runs `samtools index` (if needed), `flagstat`, `coverage` and `idxstats`
    as subprocesses, and flattens everything into a single dict keyed by
    metric name (per-chromosome keys are prefixed with the chromosome name).

    Returns:
        Dict of statistics on success, or {'Sample': ..., 'Error': ...}
        on any failure (errors are returned, not raised, so the pool
        keeps processing the remaining files).
    """
    try:
        sample_name = bam_file.name.replace('.sorted.bam', '').replace('.bam', '')
        results = {'Sample': sample_name}

        # Create the .bai index if absent (idxstats requires it).
        # Done inside the worker so each process handles its own file.
        bai_file = bam_file.with_suffix('.bam.bai')
        if not bai_file.exists():
            subprocess.run(['samtools', 'index', str(bam_file)], check=True, capture_output=True)

        # 1. samtools flagstat — read counts / mapping rates
        flagstat_proc = subprocess.run(['samtools', 'flagstat', str(bam_file)], capture_output=True, text=True, check=True)
        results.update(parse_flagstat(flagstat_proc.stdout))

        # 2. samtools coverage — per-region depth and breadth
        coverage_proc = subprocess.run(['samtools', 'coverage', str(bam_file)], capture_output=True, text=True, check=True)

        # 3. samtools idxstats — per-chromosome mapped-read counts
        idxstats_proc = subprocess.run(['samtools', 'idxstats', str(bam_file)], capture_output=True, text=True, check=True)

        # --- Parsing ---
        # "Overall" coverage: take the last non-comment line of the
        # coverage table.
        # NOTE(review): `samtools coverage` emits one row PER REGION with no
        # genome-wide summary row, so this "overall" value appears to be the
        # last chromosome's stats, not a true whole-genome figure — confirm
        # against the samtools version in use.
        coverage_lines = coverage_proc.stdout.strip().splitlines()
        summary_line = None
        for line in reversed(coverage_lines):
            if not line.startswith('#'):
                summary_line = line
                break

        if summary_line:
            overall_cov_parts = summary_line.strip().split('\t')
            if len(overall_cov_parts) > 6:
                # Columns 5/6 of `samtools coverage`: coverage %, mean depth
                results['Overall_Covered_Bases_Rate(%)'] = float(overall_cov_parts[5])
                results['Overall_Coverage_Depth'] = float(overall_cov_parts[6])

        # Per-chromosome mapped-read counts from idxstats
        # (default of 1 avoids division by zero below).
        total_mapped = results.get('Mapped_Reads', 1)
        for line in idxstats_proc.stdout.strip().splitlines():
            parts = line.split('\t')
            if len(parts) < 3: continue
            chrom, _, mapped = parts[0], parts[1], parts[2]
            if chrom == '*': continue  # '*' = unmapped reads bucket
            results[f'{chrom}_Mapped_Reads'] = int(mapped)
            if total_mapped > 0:
                results[f'{chrom}_Mapped_Reads_Percentage(%)'] = round((int(mapped) / total_mapped) * 100, 2)

        # Per-chromosome coverage rows (the row used as the "overall"
        # summary above is deliberately excluded here).
        for line in coverage_lines:
            if line.startswith('#'): continue
            parts = line.strip().split('\t')
            chrom = parts[0]
            if len(parts) > 6 and chrom != "genome" and chrom != summary_line.split('\t')[0]:
                results[f'{chrom}_Coverage_Depth'] = float(parts[6])
                results[f'{chrom}_Covered_Bases_Rate(%)'] = float(parts[5])

        return results

    except subprocess.CalledProcessError as e:
        error_msg = e.stderr.strip() if e.stderr else str(e)
        return {'Sample': bam_file.name, 'Error': error_msg}
    except Exception as e:
        return {'Sample': bam_file.name, 'Error': str(e)}

def main():
    """Parse CLI arguments, analyze all BAM files in parallel, write a report.

    Reads every *.bam in --input_dir, fans them out over a process pool,
    and writes a transposed metrics table (samples as columns) to
    --output_file as Excel (.xlsx) or CSV.
    """
    parser = argparse.ArgumentParser(
        description="BAM 文件批量统计报告生成器 📊 (并行修复版 + Excel输出)",
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument('-i', '--input_dir', type=str, required=True, help="包含 BAM 文件的输入文件夹路径。")
    parser.add_argument('-o', '--output_file', type=str, required=True, help="输出的报告文件名 (推荐使用 .xlsx 后缀)。")
    parser.add_argument('-p', '--processes', type=int, default=cpu_count(), help=f"使用的并行进程数 (默认为全部可用核心: {cpu_count()})。")

    args = parser.parse_args()

    # Abort early if samtools or a required library is missing.
    if not check_dependencies():
        sys.exit(1)

    input_path = Path(args.input_dir)
    if not input_path.is_dir():
        print(f"❌ 错误: 输入路径 '{input_path}' 不是一个有效的文件夹。", file=sys.stderr)
        sys.exit(1)

    # sorted() accepts the glob iterator directly; no intermediate list needed
    bam_files = sorted(input_path.glob('*.bam'))

    if not bam_files:
        print(f"⚠️ 警告: 在文件夹 '{input_path}' 中没有找到任何 .bam 文件。", file=sys.stderr)
        sys.exit(0)

    print(f"📁 在 '{input_path}' 中找到 {len(bam_files)} 个 BAM 文件。")
    print(f"🚀 使用 {args.processes} 个进程并行处理...")

    # Fan out over a process pool; imap_unordered lets tqdm show progress
    # as results come back (result order does not matter here).
    with Pool(processes=args.processes) as pool:
        results_iterator = pool.imap_unordered(analyze_bam, bam_files)
        all_results = list(tqdm(results_iterator, total=len(bam_files), desc="⚙️ 处理BAM文件"))

    successful_results = [res for res in all_results if 'Error' not in res]
    failed_samples = [res for res in all_results if 'Error' in res]

    if failed_samples:
        print("\n❌ 以下样本处理失败:")
        for res in failed_samples:
            print(f" - {res['Sample']}: {res['Error']}")

    if not successful_results:
        print("\n❌ 未能成功处理任何文件,程序退出。", file=sys.stderr)
        sys.exit(1)

    print("\n🔄 正在汇总所有成功的结果并生成报告...")
    df = pd.DataFrame(successful_results)

    # Transpose: samples become columns, metrics become rows.
    df = df.set_index('Sample').T

    # Put the whole-sample core metrics first, then per-chromosome rows
    # in alphabetical order.
    core_indices = [
        'Total_Reads', 'Mapped_Reads', 'Overall_Mapping_Rate(%)',
        'Properly_Paired_Reads', 'Properly_Paired_Rate(%)',
        'Overall_Coverage_Depth', 'Overall_Covered_Bases_Rate(%)'
    ]
    chrom_indices = sorted([idx for idx in df.index if idx not in core_indices])
    final_indices = core_indices + chrom_indices
    df = df.reindex(index=final_indices)

    # Excel when the extension asks for it, CSV otherwise.
    if Path(args.output_file).suffix.lower() == '.xlsx':
        df.to_excel(args.output_file, index=True)
    else:
        df.to_csv(args.output_file, index=True)

    print(f"\n🎉 报告生成完毕!结果已保存至: {args.output_file}")

if __name__ == "__main__":
    main()

生物信息学脚本合集
https://lixiang117423.github.io/article/bioinf-scripts/
作者
李详【Xiang LI】
发布于
2025年12月5日
许可协议