一些Hi-C的脚本

Puzzle-Hi-C

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
#!/bin/bash
set -e # 遇到错误立即停止
set -u # 使用未定义变量时报错
set -o pipefail # 管道命令中任一失败则失败

# ==============================================================================
# 🛠️ 用户配置区域 (请修改这里!)
# ==============================================================================

# 1. 输入文件 (请使用绝对路径)
REF_FASTA="project/68.三代数据组装和注释/05.allhic/OV53_1.primary.fa"
HIC_R1="project/68.三代数据组装和注释/04.YaHS/fastq/OV53_1-hic_R1.fastq.gz"
HIC_R2="project/68.三代数据组装和注释/04.YaHS/fastq/OV53_1-hic_R2.fastq.gz"

# 2. 项目参数
PROJECT_NAME="OV53-1" # 项目/物种名称 (不要包含空格)
CHROM_NUM=12 # 染色体数量 (必须准确)
ENZYME="MboI" # 限制性内切酶 (如 DpnII, HindIII, MboI)
THREADS=64
THREADS_PUZZLE=12 # 使用的线程数

# 3. 软件路径
PUZZLE_DIR="/share/org/YZWL/yzwl_lixg/software/puzzle-hic"
JUICER_DIR="/share/org/YZWL/yzwl_lixg/software/juicer"
PUZZLE_CONDA_ENV="/share/org/YZWL/yzwl_lixg/miniforge3/envs/puzzle-hi-c"

# 4. 可选参数
PUZZLE_BINSIZE=50000 # Puzzle binsize (默认10k)
PUZZLE_CUTOFF=0.35 # Puzzle cutoff (默认0.35)
PUZZLE_INIT_TRIANGLE=6 # Puzzle init triangle (默认6)

# 5. 断点续传控制
SKIP_JUICER=true # 跳过 Juicer(如果 merged_nodups.txt 已存在)
SKIP_PUZZLE=false # 跳过 Puzzle Hi-C

# ==============================================================================
# 🔧 辅助函数
# ==============================================================================

# 日志函数
log_info() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] ℹ️ $*" | tee -a ${LOG_FILE}
}

log_success() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] ✅ $*" | tee -a ${LOG_FILE}
}

log_warning() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] ⚠️ $*" | tee -a ${LOG_FILE}
}

log_error() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] ❌ $*" | tee -a ${LOG_FILE}
}

# 错误处理函数
error_exit() {
log_error "$1"
log_error "流程失败,请查看日志: ${LOG_FILE}"
exit 1
}

# 检查文件是否存在
check_file() {
if [ ! -f "$1" ]; then
error_exit "文件不存在: $1"
fi
}

# 检查目录是否存在
check_dir() {
if [ ! -d "$1" ]; then
error_exit "目录不存在: $1"
fi
}

# ==============================================================================
# 🏁 流程开始
# ==============================================================================

echo "╔══════════════════════════════════════════════════════════════════════╗"
echo "║ 🧬 Puzzle Hi-C 基因组组装流程 v3.0 ║"
echo "╚══════════════════════════════════════════════════════════════════════╝"
echo ""

# 初始化工作目录
# WORK_DIR=$(pwd)/${PROJECT_NAME}_analysis
WORK_DIR=$(pwd)/puzzle_hic_output
mkdir -p ${WORK_DIR}
LOG_FILE="${WORK_DIR}/pipeline_$(date +%Y%m%d_%H%M%S).log"

log_info "Pipeline started"
log_info "Project: ${PROJECT_NAME}"
log_info "Working directory: ${WORK_DIR}"
log_info "Log file: ${LOG_FILE}"

# ==============================================================================
# Step 0: 环境检查
# ==============================================================================

log_info "[Step 0] 检查环境和输入文件..."

# 检查输入文件
check_file "${REF_FASTA}"
check_file "${HIC_R1}"
check_file "${HIC_R2}"
log_success "输入文件检查通过"

# 检查软件目录
check_dir "${PUZZLE_DIR}"
check_dir "${JUICER_DIR}"
check_dir "${PUZZLE_CONDA_ENV}"
log_success "软件目录检查通过"

# 设置软件路径
JUICER_SH="${JUICER_DIR}/scripts/juicer.sh"
JUICER_TOOLS="${JUICER_DIR}/scripts/juicer_tools"

if [ ! -f "${JUICER_SH}" ]; then
error_exit "Juicer脚本未找到: ${JUICER_SH}"
fi

# 创建目录结构
mkdir -p ${WORK_DIR}/{references,restriction_sites,fastq,logs,backup}
log_success "目录结构创建完成"

# ==============================================================================
# Step 1: 准备参考基因组和必需文件(参照您的成功脚本)
# ==============================================================================

log_info "[Step 1] 准备参考基因组和必需文件..."
cd ${WORK_DIR}

# 1.1 链接基因组文件(Juicer 需要特定文件名)
GENOME_FA="${WORK_DIR}/genome.fa"
if [ ! -f "${GENOME_FA}" ]; then
ln -s $(readlink -f ${REF_FASTA}) ${GENOME_FA}
log_success "基因组文件链接创建: ${GENOME_FA}"
fi

# 1.2 构建 BWA 索引
if [ ! -f "${GENOME_FA}.bwt" ]; then
log_info "构建 BWA 索引..."
bwa index ${GENOME_FA} 2>&1 | tee -a ${LOG_FILE}
log_success "BWA 索引构建完成"
else
log_warning "BWA 索引已存在,跳过"
fi

# 1.3 生成 chrom.sizes 文件
CHROM_SIZES="${WORK_DIR}/references/${PROJECT_NAME}.chrom.sizes"
if [ ! -f "${CHROM_SIZES}" ]; then
log_info "生成 chrom.sizes 文件..."
samtools faidx ${GENOME_FA}
awk -v OFS='\t' '{print $1, $2}' ${GENOME_FA}.fai > ${CHROM_SIZES}

NUM_CHROMS=$(wc -l < ${CHROM_SIZES})
log_info "检测到 ${NUM_CHROMS} 个序列"
log_success "chrom.sizes 生成完成"
else
log_warning "chrom.sizes 已存在,跳过"
fi

# 1.4 生成酶切位点文件(参照您的成功方法)
SITE_FILE="${WORK_DIR}/restriction_sites/${PROJECT_NAME}_${ENZYME}.txt"
SITE_FILE_TEMP="${WORK_DIR}/${PROJECT_NAME}_${ENZYME}.txt"

if [ -f "${SITE_FILE}" ] && [ -s "${SITE_FILE}" ]; then
log_warning "酶切位点文件已存在且非空,跳过"
else
log_info "生成限制性内切酶位点文件 (${ENZYME})..."

GEN_SITE_SCRIPT="${JUICER_DIR}/misc/generate_site_positions.py"

if [ ! -f "${GEN_SITE_SCRIPT}" ]; then
log_warning "generate_site_positions.py 未找到,跳过位点文件生成"
SITE_FILE=""
else
# 关键:先在当前目录生成,然后移动
python2 ${GEN_SITE_SCRIPT} ${ENZYME} ${PROJECT_NAME} ${GENOME_FA}

if [ ! -s "${SITE_FILE_TEMP}" ]; then
log_error "generate_site_positions.py 未能正确生成酶切位点文件"
SITE_FILE=""
else
# 排序并移动到目标位置
log_info "排序并移动酶切位点文件..."
sort -k1,1V -k2,2n "${SITE_FILE_TEMP}" > "${SITE_FILE}"
rm -f "${SITE_FILE_TEMP}"
log_success "酶切位点文件生成完成: ${SITE_FILE}"
fi
fi
fi

# ==============================================================================
# Step 2: 准备 Hi-C 数据
# ==============================================================================

log_info "[Step 2] 准备 Hi-C 测序数据..."
cd ${WORK_DIR}/fastq

# 创建符合 Juicer 要求的文件链接
ln -sf $(readlink -f ${HIC_R1}) ${PROJECT_NAME}_R1.fastq.gz
ln -sf $(readlink -f ${HIC_R2}) ${PROJECT_NAME}_R2.fastq.gz

log_success "Hi-C 数据链接创建完成"

# ==============================================================================
# Step 3: 运行 Juicer(使用您成功的参数格式)
# ==============================================================================

if [ "${SKIP_JUICER}" = "false" ]; then
log_info "[Step 3] 运行 Juicer 流程..."
cd ${WORK_DIR}

# 检查是否有旧的输出目录
MERGED_NODUPS="${WORK_DIR}/aligned/merged_nodups.txt"

if [ -f "${MERGED_NODUPS}" ] && [ -s "${MERGED_NODUPS}" ]; then
log_warning "检测到 merged_nodups.txt 已存在"
read -p "是否跳过 Juicer 重新运行? (y/n): " -n 1 -r
echo
if [[ $REPLY =~ ^[Yy]$ ]]; then
log_info "跳过 Juicer,使用现有文件"
SKIP_JUICER=true
else
log_warning "清理旧的 aligned 和 splits 目录..."
rm -rf aligned splits
fi
fi

if [ "${SKIP_JUICER}" = "false" ]; then
log_info "启动 Juicer 主流程..."
log_warning "注意: Juicer 运行时间可能很长(数小时到数天),请耐心等待..."

# 使用您成功脚本的参数格式
JUICER_CMD="bash ${JUICER_SH} \
-g ${PROJECT_NAME} \
-d ${WORK_DIR} \
-s ${ENZYME} \
-p ${CHROM_SIZES} \
-z ${GENOME_FA} \
-D ${JUICER_DIR} \
-t ${THREADS} \
--assembly"

# 如果有酶切位点文件,添加 -y 参数
if [ ! -z "${SITE_FILE}" ] && [ -f "${SITE_FILE}" ]; then
JUICER_CMD="${JUICER_CMD} -y ${SITE_FILE}"
log_info "使用酶切位点文件: ${SITE_FILE}"
fi

log_info "执行命令: ${JUICER_CMD}"

# 运行 Juicer
eval ${JUICER_CMD} 2>&1 | tee -a ${LOG_FILE}

JUICER_EXIT_CODE=${PIPESTATUS[0]}
if [ ${JUICER_EXIT_CODE} -ne 0 ]; then
error_exit "Juicer 运行失败,退出码: ${JUICER_EXIT_CODE}"
fi

# 检查输出
if [ ! -f "${MERGED_NODUPS}" ] || [ ! -s "${MERGED_NODUPS}" ]; then
error_exit "Juicer 未生成有效的 merged_nodups.txt"
fi

VALID_PAIRS=$(wc -l < ${MERGED_NODUPS})
log_success "Juicer 完成,有效 reads 对数: ${VALID_PAIRS}"
fi
else
log_warning "[Step 3] 跳过 Juicer (SKIP_JUICER=true)"
MERGED_NODUPS="${WORK_DIR}/aligned/merged_nodups.txt"
check_file "${MERGED_NODUPS}"
VALID_PAIRS=$(wc -l < ${MERGED_NODUPS})
log_info "使用现有 merged_nodups.txt,有效 reads 对数: ${VALID_PAIRS}"
fi

# ==============================================================================
# Step 4: 运行 Puzzle Hi-C
# ==============================================================================

if [ "${SKIP_PUZZLE}" = "false" ]; then
log_info "[Step 4] 运行 Puzzle Hi-C (染色体级别组装)..."

# 激活 conda 环境
log_info "激活 Puzzle Hi-C conda 环境..."
eval "$(conda shell.bash hook)"
conda activate ${PUZZLE_CONDA_ENV}

if [ $? -ne 0 ]; then
error_exit "无法激活 conda 环境: ${PUZZLE_CONDA_ENV}"
fi
log_success "Conda 环境激活成功"

# 创建输出目录
PUZZLE_OUTPUT="${WORK_DIR}/Puzzle_Output"
mkdir -p ${PUZZLE_OUTPUT}
cd ${PUZZLE_OUTPUT}

# 链接必要文件
ln -sf ${MERGED_NODUPS} ./merged_nodups.txt

# 运行 Puzzle Hi-C
log_info "运行 Puzzle Hi-C main.py..."
log_info "参数: 染色体数=${CHROM_NUM}, binsize=${PUZZLE_BINSIZE}, cutoff=${PUZZLE_CUTOFF}"

python3 ${PUZZLE_DIR}/main.py \
-c ${CHROM_NUM} \
-p ${PROJECT_NAME} \
-s ${PUZZLE_BINSIZE} \
-t ${PUZZLE_CUTOFF} \
-i ${PUZZLE_INIT_TRIANGLE} \
-m merged_nodups.txt \
-f ${GENOME_FA} \
-j ${JUICER_TOOLS} \
-n ${THREADS_PUZZLE} 2>&1 | tee -a ${LOG_FILE}

if [ $? -ne 0 ]; then
conda deactivate
error_exit "Puzzle Hi-C 运行失败"
fi

# 反激活 conda 环境
conda deactivate
log_success "Puzzle Hi-C 运行完成"
else
log_warning "[Step 4] 跳过 Puzzle Hi-C (SKIP_PUZZLE=true)"
PUZZLE_OUTPUT="${WORK_DIR}/Puzzle_Output"
fi

# ==============================================================================
# Step 5: 结果验证与汇总
# ==============================================================================

log_info "[Step 5] 验证结果文件..."

# 查找输出文件
FINAL_FASTA=$(find ${PUZZLE_OUTPUT} -name "*.fasta" -o -name "*.fa" 2>/dev/null | head -n 1)
FINAL_AGP=$(find ${PUZZLE_OUTPUT} -name "*.agp" 2>/dev/null | head -n 1)

if [ -f "${FINAL_FASTA}" ]; then
log_success "找到组装结果: ${FINAL_FASTA}"

# 统计scaffold信息
NUM_SCAFFOLDS=$(grep -c "^>" ${FINAL_FASTA})
TOTAL_LENGTH=$(awk '/^>/ {next} {len+=length($0)} END {print len}' ${FINAL_FASTA})

log_info "Scaffold 数量: ${NUM_SCAFFOLDS}"
log_info "总长度: ${TOTAL_LENGTH} bp"
else
log_warning "未找到 FASTA 输出文件"
NUM_SCAFFOLDS="N/A"
TOTAL_LENGTH="N/A"
fi

if [ -f "${FINAL_AGP}" ]; then
log_success "找到 AGP 文件: ${FINAL_AGP}"
else
log_warning "未找到 AGP 文件"
fi

# ==============================================================================
# 流程完成
# ==============================================================================

echo ""
echo "╔══════════════════════════════════════════════════════════════════════╗"
echo "║ 🎉 流程成功完成! ║"
echo "╚══════════════════════════════════════════════════════════════════════╝"
echo ""

log_success "Pipeline completed successfully"
log_info "结果目录: ${WORK_DIR}"
log_info " - Juicer 输出: ${WORK_DIR}/aligned/"
log_info " - Puzzle 输出: ${PUZZLE_OUTPUT}/"
log_info " - 日志文件: ${LOG_FILE}"

# 生成结果摘要文件
SUMMARY_FILE="${WORK_DIR}/RESULTS_SUMMARY.txt"
cat > ${SUMMARY_FILE} << EOF
================================================================================
Puzzle Hi-C 流程运行摘要
================================================================================
运行时间: $(date)
项目名称: ${PROJECT_NAME}
工作目录: ${WORK_DIR}

输入文件:
- 参考基因组: ${REF_FASTA}
- Hi-C R1: ${HIC_R1}
- Hi-C R2: ${HIC_R2}

参数设置:
- 染色体数量: ${CHROM_NUM}
- 限制性内切酶: ${ENZYME}
- 线程数: ${THREADS}
- Puzzle binsize: ${PUZZLE_BINSIZE}
- Puzzle cutoff: ${PUZZLE_CUTOFF}

结果文件:
- 组装结果: ${FINAL_FASTA}
- AGP 文件: ${FINAL_AGP}
- 日志文件: ${LOG_FILE}

统计信息:
- Scaffold 数量: ${NUM_SCAFFOLDS}
- 总长度: ${TOTAL_LENGTH} bp
- 有效 reads 对数: ${VALID_PAIRS}

================================================================================
EOF

log_info "结果摘要已保存到: ${SUMMARY_FILE}"
cat ${SUMMARY_FILE}

echo ""
echo "✨ 所有任务完成!请查看上述路径中的结果文件。"

ALLHiC

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
#!/bin/bash

# ==============================================================================
# 🧬 ALLHiC Complete Pipeline with Juicebox Visualization
# 📅 Version: 3.0 (Integrated)
# 🎯 Purpose: Chromosome scaffolding + Juicebox visualization in one workflow
# ==============================================================================

set -euo pipefail

source /share/org/YZWL/yzwl_lixg/miniforge3/etc/profile.d/conda.sh
conda activate biopytools

# ---------------------- 📋 默认参数配置 (Default Configuration) ----------------------

# 软件路径
ALLHIC_SOFTWARE_PATH="/share/org/YZWL/yzwl_lixg/software/ALLHiC"
SCRIPT_VISUALIZER="/share/org/YZWL/yzwl_lixg/software/3d-dna/visualize/run-asm-visualizer.sh"
SCRIPT_AGP2ASM="/share/org/YZWL/yzwl_lixg/software/3d-dna/utils/agp2assembly.py"
JUICER_TOOLS="/share/org/YZWL/yzwl_lixg/software/juicer/CPU/common/juicer_tools.jar"

# 工作目录
WORK_DIR="/share/org/YZWL/yzwl_lixg/project/06.longliuxing_BSA/68.三代数据组装和注释/05.allhic"

# 输入文件
REF_FA_RAW=""
R1_FQ_RAW=""
R2_FQ_RAW=""

# 基因组参数
CHROMOSOME_K=12
RE_MOTIF="GATC"

# 系统资源
THREADS=64
MEMORY="500G"
JAVA_MEM="-Xmx500g"

# 可选参数
ALLELE_TABLE=""
SAMPLE_NAME="sample"

# 绘图参数
BIN_SIZE="500k"
MIN_BIN_SIZE="50k"

# 跳过步骤标志
SKIP_MAPPING=false
SKIP_ALLELE=false
SKIP_PRUNE=false
SKIP_PARTITION=false
SKIP_EXTRACT=false
SKIP_RESCUE=false
SKIP_OPTIMIZE=false
SKIP_BUILD=false
SKIP_PLOT=false
SKIP_JUICEBOX=false

# ==============================================================================
# 📚 帮助信息
# ==============================================================================

show_help() {
echo "🧬 ALLHiC Complete Pipeline v3.0 - Scaffolding + Juicebox Visualization"
echo ""
echo "Usage: $0 [OPTIONS]"
echo ""
echo "Required Arguments:"
echo " -r, --reference FILE 📄 Reference genome assembly (FASTA)"
echo " -1, --read1 FILE 📄 Hi-C read 1 (FASTQ/FASTQ.GZ)"
echo " -2, --read2 FILE 📄 Hi-C read 2 (FASTQ/FASTQ.GZ)"
echo ""
echo "Basic Options:"
echo " -k, --chr-num INT 🔢 Number of chromosomes (default: 12)"
echo " -e, --enzyme STR ✂️ Restriction enzyme motif (default: GATC)"
echo " -t, --threads INT ⚡ CPU threads (default: 64)"
echo " -w, --workdir DIR 📂 Working directory (default: current)"
echo " -n, --name STR 📛 Sample name prefix (default: sample)"
echo ""
echo "Advanced Options:"
echo " -a, --allele-table FILE 📋 Allele table (Optional. If not provided, auto-generated)"
echo " -s, --software-path DIR 🛠️ ALLHiC installation path"
echo " -b, --bin-size STR 📊 Heatmap bin size (default: 500k)"
echo " -m, --min-bin STR 📊 Minimum bin size (default: 50k)"
echo " --memory STR 💾 Memory for sorting (default: 800G)"
echo ""
echo "Juicebox Options:"
echo " --visualizer PATH 🎨 Path to run-asm-visualizer.sh"
echo " --agp2asm PATH 🔄 Path to agp2assembly.py"
echo " --juicer-tools PATH 🛠️ Path to juicer_tools.jar"
echo ""
echo "Pipeline Control:"
echo " --skip-mapping ⏭️ Skip mapping step"
echo " --skip-allele ⏭️ Skip allele table generation"
echo " --skip-prune ⏭️ Skip pruning step"
echo " --skip-partition ⏭️ Skip partition step"
echo " --skip-extract ⏭️ Skip extract step"
echo " --skip-rescue ⏭️ Skip rescue step"
echo " --skip-optimize ⏭️ Skip optimize step"
echo " --skip-build ⏭️ Skip build step"
echo " --skip-plot ⏭️ Skip plot step"
echo " --skip-juicebox ⏭️ Skip Juicebox visualization"
echo ""
echo "Examples:"
echo " # Basic run"
echo " $0 -r genome.fa -1 R1.fq.gz -2 R2.fq.gz -k 12 -t 64"
echo ""
echo " # Skip ALLHiC steps, only run Juicebox"
echo " $0 -r genome.fa -1 R1.fq.gz -2 R2.fq.gz -k 12 -t 64 \\"
echo " --skip-mapping --skip-allele --skip-prune --skip-partition \\"
echo " --skip-extract --skip-rescue --skip-optimize --skip-build --skip-plot"
echo ""
}

show_version() {
echo "🧬 ALLHiC Complete Pipeline v3.0"
echo "📅 Last updated: $(date +%Y-%m-%d)"
}

# ==============================================================================
# 🔧 参数解析
# ==============================================================================

parse_arguments() {
if [ $# -eq 0 ]; then
show_help
exit 0
fi

while [ $# -gt 0 ]; do
case "$1" in
-h|--help) show_help; exit 0 ;;
-v|--version) show_version; exit 0 ;;
-r|--reference) REF_FA_RAW="$2"; shift 2 ;;
-1|--read1) R1_FQ_RAW="$2"; shift 2 ;;
-2|--read2) R2_FQ_RAW="$2"; shift 2 ;;
-k|--chr-num) CHROMOSOME_K="$2"; shift 2 ;;
-e|--enzyme) RE_MOTIF="$2"; shift 2 ;;
-t|--threads) THREADS="$2"; shift 2 ;;
-w|--workdir) WORK_DIR="$2"; shift 2 ;;
-n|--name) SAMPLE_NAME="$2"; shift 2 ;;
-a|--allele-table) ALLELE_TABLE="$2"; shift 2 ;;
-s|--software-path) ALLHIC_SOFTWARE_PATH="$2"; shift 2 ;;
-b|--bin-size) BIN_SIZE="$2"; shift 2 ;;
-m|--min-bin) MIN_BIN_SIZE="$2"; shift 2 ;;
--memory) MEMORY="$2"; shift 2 ;;
--visualizer) SCRIPT_VISUALIZER="$2"; shift 2 ;;
--agp2asm) SCRIPT_AGP2ASM="$2"; shift 2 ;;
--juicer-tools) JUICER_TOOLS="$2"; shift 2 ;;
--skip-mapping) SKIP_MAPPING=true; shift ;;
--skip-allele) SKIP_ALLELE=true; shift ;;
--skip-prune) SKIP_PRUNE=true; shift ;;
--skip-partition) SKIP_PARTITION=true; shift ;;
--skip-extract) SKIP_EXTRACT=true; shift ;;
--skip-rescue) SKIP_RESCUE=true; shift ;;
--skip-optimize) SKIP_OPTIMIZE=true; shift ;;
--skip-build) SKIP_BUILD=true; shift ;;
--skip-plot) SKIP_PLOT=true; shift ;;
--skip-juicebox) SKIP_JUICEBOX=true; shift ;;
*) echo "❌ Unknown option: $1"; exit 1 ;;
esac
done
}

# ==============================================================================
# 🔍 参数验证
# ==============================================================================

validate_parameters() {
local errors=0
echo "🔍 Validating parameters..."

[[ -z "$REF_FA_RAW" ]] && { echo "❌ Error: Reference genome (-r) required"; ((errors++)); }
[[ -z "$R1_FQ_RAW" ]] && { echo "❌ Error: Read1 (-1) required"; ((errors++)); }
[[ -z "$R2_FQ_RAW" ]] && { echo "❌ Error: Read2 (-2) required"; ((errors++)); }

if [ -n "$ALLELE_TABLE" ] && [ ! -f "$ALLELE_TABLE" ]; then
echo "⚠️ Warning: Specified Allele table not found. It will be re-generated."
ALLELE_TABLE=""
fi

if [ $errors -gt 0 ]; then exit 1; fi
echo "✅ Parameters validated"
}

# ==============================================================================
# 📝 日志系统
# ==============================================================================

LOG_FILE=""
setup_logging() {
mkdir -p "${WORK_DIR}/logs"
LOG_FILE="${WORK_DIR}/logs/allhic_complete_$(date +%Y%m%d_%H%M%S).log"
echo "📝 Log file: $LOG_FILE"
}

log() {
local timestamp=$(date +'%Y-%m-%d %H:%M:%S')
echo -e "[$timestamp] $*" | tee -a "$LOG_FILE"
}

log_section() {
echo "" | tee -a "$LOG_FILE"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" | tee -a "$LOG_FILE"
echo "$1" | tee -a "$LOG_FILE"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" | tee -a "$LOG_FILE"
}

run_cmd() {
log "▶️ Running: $1"
if eval "$1" >> "$LOG_FILE" 2>&1; then
log "✅ Command completed successfully"
return 0
else
log "❌ Command failed with exit code $?"
exit 1
fi
}

# ==============================================================================
# 🔧 环境设置
# ==============================================================================

setup_environment() {
log_section "🔧 Setting up environment"

export PATH="${ALLHIC_SOFTWARE_PATH}/scripts:${ALLHIC_SOFTWARE_PATH}/bin:$PATH"

local missing_tools=()
for tool in ALLHiC_partition ALLHiC_prune ALLHiC_rescue ALLHiC_build ALLHiC_plot allhic bwa samtools minimap2; do
if ! command -v "$tool" >/dev/null 2>&1; then
missing_tools+=("$tool")
fi
done

if [ ${#missing_tools[@]} -gt 0 ]; then
log "❌ Missing required tools: ${missing_tools[*]}"
log "💡 Ensure minimap2 and ALLHiC are in PATH."
exit 1
fi

mkdir -p "$WORK_DIR"
if ! cd "$WORK_DIR"; then
log "❌ Failed to change to work directory: $WORK_DIR"
exit 1
fi

export INITIAL_WORK_DIR="$WORK_DIR"

log "📂 Work Dir: $WORK_DIR"
log "⚡ Threads: $THREADS | 🔢 K: $CHROMOSOME_K | ✂️ Enzyme: $RE_MOTIF"
}

# ==============================================================================
# 🔗 Step 0: 数据准备
# ==============================================================================

prepare_data() {
log_section "🔗 [Step 0] Preparing input data"
ln -sf "$REF_FA_RAW" draft.asm.fasta
ln -sf "$R1_FQ_RAW" reads_R1.fastq.gz
ln -sf "$R2_FQ_RAW" reads_R2.fastq.gz
log "✅ Input files linked"
}

# ==============================================================================
# 🔍 Step 1: 比对 (Mapping)
# ==============================================================================

run_mapping() {
if [ "$SKIP_MAPPING" = true ]; then return 0; fi
log_section "🔍 [Step 1] Mapping Hi-C reads to reference"

local CLEAN_BAM="sample.clean.bam"
if [ -f "$CLEAN_BAM" ]; then log "⚠️ $CLEAN_BAM exists. Skipping."; return 0; fi

[ ! -f "draft.asm.fasta.bwt" ] && run_cmd "bwa index draft.asm.fasta"
[ ! -f "draft.asm.fasta.fai" ] && run_cmd "samtools faidx draft.asm.fasta"

run_cmd "bwa mem -t $THREADS -5SP draft.asm.fasta reads_R1.fastq.gz reads_R2.fastq.gz | \
samtools view -@ $THREADS -hF 2316 -q 30 - | \
samtools sort -@ $THREADS -n -o $CLEAN_BAM -"
log "✅ Mapping done: $CLEAN_BAM"
}

# ==============================================================================
# 🧬 Step 1.5: 自动等位基因表生成
# ==============================================================================

run_allele_detection() {
if [ "$SKIP_ALLELE" = true ]; then return 0; fi
log_section "🧬 [Step 1.5] Auto-detecting Allelic Contigs"

if [ -n "$ALLELE_TABLE" ] && [ -f "$ALLELE_TABLE" ]; then
log "📋 Using provided allele table: $ALLELE_TABLE"
return 0
fi

local PAF_FILE="draft.asm.paf"
local COUNTS_FILE="sample.clean.counts_${RE_MOTIF}.txt"
local GEN_TABLE="alleles.table"

if [ -f "$GEN_TABLE" ]; then
log "⚠️ $GEN_TABLE already exists. Using it."
ALLELE_TABLE="$GEN_TABLE"
return 0
fi

if [ ! -f "$COUNTS_FILE" ]; then
log "📊 Generating RE counts from clean BAM..."
run_cmd "allhic extract sample.clean.bam draft.asm.fasta --RE $RE_MOTIF"
fi

if [ ! -f "$PAF_FILE" ]; then
log "🔍 Running minimap2 for self-alignment..."
run_cmd "minimap2 -DP -k19 -w19 -m200 -t $THREADS draft.asm.fasta draft.asm.fasta > $PAF_FILE"
fi

log "📝 Generating alleles.table..."
run_cmd "allhic alleles $PAF_FILE $COUNTS_FILE > $GEN_TABLE"

if [ -f "$GEN_TABLE" ] && [ -s "$GEN_TABLE" ]; then
ALLELE_TABLE="$GEN_TABLE"
local line_count=$(wc -l < "$GEN_TABLE")
log "✅ Allele table generated: $ALLELE_TABLE ($line_count allelic pairs detected)"
else
log "⚠️ Warning: allhic alleles produced no output."
log "ℹ️ This may be normal for haploid genomes or low heterozygosity."
log "ℹ️ Continuing without allele table..."
ALLELE_TABLE=""
fi

cd "$WORK_DIR" || exit 1
}

# ==============================================================================
# ✂️ Step 2: 修剪 (Pruning)
# ==============================================================================

run_pruning() {
if [ "$SKIP_PRUNE" = true ]; then return 0; fi
log_section "✂️ [Step 2] Pruning allelic/weak signals"

local CLEAN_BAM="sample.clean.bam"
local PRUNED_BAM="prunning.bam"

if [ -f "$PRUNED_BAM" ]; then log "⚠️ $PRUNED_BAM exists. Skipping."; return 0; fi

if [ -n "$ALLELE_TABLE" ] && [ -f "$ALLELE_TABLE" ]; then
log "📋 Pruning using allele table: $ALLELE_TABLE"
run_cmd "ALLHiC_prune -i $ALLELE_TABLE -b $CLEAN_BAM -r draft.asm.fasta"
else
log "⚠️ WARNING: No allele table found! Pruning will be ineffective."
log "ℹ️ Linking clean BAM as pruned BAM."
run_cmd "ln -sf $CLEAN_BAM $PRUNED_BAM"
fi

log "✅ Pruning completed: $PRUNED_BAM"
}

# ==============================================================================
# 📦 Step 3: 分组 (Partition)
# ==============================================================================

run_partition() {
if [ "$SKIP_PARTITION" = true ]; then return 0; fi
log_section "📦 [Step 3] Partitioning into $CHROMOSOME_K groups"

local PRUNED_BAM="prunning.bam"
local CLUSTERS_FILE="prunning.clusters.txt"

if [ ! -f "$PRUNED_BAM" ]; then log "❌ Pruned BAM missing"; exit 1; fi
if [ -f "$CLUSTERS_FILE" ]; then log "⚠️ Partition exists. Skipping."; return 0; fi

run_cmd "ALLHiC_partition -b $PRUNED_BAM -r draft.asm.fasta -e $RE_MOTIF -k $CHROMOSOME_K"

log "✅ Partition completed"
}

# ==============================================================================
# 🧬 Step 3.5: 提取信号 (Extract)
# ==============================================================================

run_extract() {
if [ "$SKIP_EXTRACT" = true ]; then return 0; fi
log_section "🧬 [Step 3.5] Extracting CLM (for Optimize)"

local CLEAN_BAM="sample.clean.bam"
local CLM_FILE="sample.clean.clm"

if [ -f "$CLM_FILE" ]; then log "⚠️ CLM exists. Skipping."; return 0; fi

log "📊 Extracting Hi-C matrix from CLEAN BAM..."
run_cmd "allhic extract $CLEAN_BAM draft.asm.fasta --RE $RE_MOTIF"

log "✅ Extract completed"
}

# ==============================================================================
# 🚑 Step 4: 挽救 (Rescue)
# ==============================================================================

run_rescue() {
if [ "$SKIP_RESCUE" = true ]; then return 0; fi
log_section "🚑 [Step 4] Rescuing unplaced contigs"

local CLEAN_BAM="sample.clean.bam"
local CLUSTERS_FILE="prunning.clusters.txt"
local COUNTS_FILE="sample.clean.counts_${RE_MOTIF}.txt"

if [ ! -f "$COUNTS_FILE" ]; then
log "⚠️ Counts file missing, generating..."
run_cmd "allhic extract $CLEAN_BAM draft.asm.fasta --RE $RE_MOTIF"
fi

run_cmd "ALLHiC_rescue -b $CLEAN_BAM -r draft.asm.fasta -c $CLUSTERS_FILE -i $COUNTS_FILE"
log "✅ Rescue completed"
}

# ==============================================================================
# ⚙️ Step 5: 优化 (Optimize)
# ==============================================================================

run_optimize() {
if [ "$SKIP_OPTIMIZE" = true ]; then return 0; fi
log_section "⚙️ [Step 5] Optimizing ordering and orientation"

local CLM_FILE="sample.clean.clm"
local PARTITION_FILES=$(ls prunning.counts_${RE_MOTIF}.${CHROMOSOME_K}g*.txt 2>/dev/null || true)

if [ -z "$PARTITION_FILES" ]; then log "❌ No group files found!"; exit 1; fi

> optimize_cmds.sh
local COUNT=1
for GFILE in $PARTITION_FILES; do
local NEW_NAME="group${COUNT}.txt"
cp "$GFILE" "$NEW_NAME"

if [ ! -f "group${COUNT}.tour" ]; then
echo "allhic optimize $NEW_NAME $CLM_FILE" >> optimize_cmds.sh
fi
((COUNT++))
done

if [ -s "optimize_cmds.sh" ]; then
log "🔄 Running optimization on $(wc -l < optimize_cmds.sh) groups..."
if command -v ParaFly >/dev/null 2>&1; then
ParaFly -c optimize_cmds.sh -CPU "$THREADS" -failed_cmds optimize_failed.cmds
else
cat optimize_cmds.sh | xargs -L 1 -I CMD -P "$THREADS" bash -c "CMD"
fi
else
log "⚠️ All groups already optimized."
fi
log "✅ Optimization completed"
}

# ==============================================================================
# 🏗️ Step 6 & 7: 构建与绘图 (Build & Plot)
# ==============================================================================

run_build() {
if [ "$SKIP_BUILD" = true ]; then return 0; fi
log_section "🏗️ [Step 6] Building Assembly"
[ ! -f "groups.asm.fasta" ] && run_cmd "ALLHiC_build draft.asm.fasta"
log "✅ Build completed"
}

run_plot() {
if [ "$SKIP_PLOT" = true ]; then return 0; fi
log_section "📊 [Step 7] Generating Heatmap"

local CLEAN_BAM="sample.clean.bam"
local AGP_FILE="groups.agp"
local CHR_LIST="chrn.list"

if [ ! -f "$AGP_FILE" ]; then log "❌ AGP missing"; exit 1; fi

if [ ! -f "$CHR_LIST" ]; then
run_cmd "samtools faidx groups.asm.fasta"
cut -f1,2 groups.asm.fasta.fai > "$CHR_LIST"
fi

log "🎨 Drawing heatmap..."
run_cmd "ALLHiC_plot -b $CLEAN_BAM -a $AGP_FILE -l $CHR_LIST -m $MIN_BIN_SIZE -s $BIN_SIZE -o ./"
log "✅ Plot completed"
}

# ==============================================================================
# 🎨 Step 8: Juicebox 可视化准备
# ==============================================================================

run_juicebox_prep() {
if [ "$SKIP_JUICEBOX" = true ]; then return 0; fi
log_section "🎨 [Step 8] Preparing Juicebox Visualization"

# 检查必需文件
if [ ! -f "groups.asm.fasta" ]; then
log "❌ groups.asm.fasta not found. Run build step first."
exit 1
fi

if [ ! -f "groups.agp" ]; then
log "❌ groups.agp not found. Run build step first."
exit 1
fi

# 创建 Juicebox 工作目录
local JB_DIR="${WORK_DIR}/juicebox_visualization"
mkdir -p "$JB_DIR"
cd "$JB_DIR" || exit 1

log "📂 Juicebox working directory: $JB_DIR"

# 链接必要文件
log "🔗 Linking reference files..."
ln -sf "${WORK_DIR}/groups.asm.fasta" ref.fasta
ln -sf "${WORK_DIR}/groups.agp" groups.agp

# 索引 reference(如果还没索引)
if [ ! -f "ref.fasta.bwt" ]; then
log "🧬 Indexing reference for Juicebox..."
run_cmd "bwa index ref.fasta"
run_cmd "samtools faidx ref.fasta"
else
log "⚠️ Reference index exists, skipping."
fi

# 检查是否已有 mapped BAM
if [ -f "mapped.namesorted.bam" ]; then
log "⚠️ mapped.namesorted.bam exists. Skipping mapping."
else
log "⚔️ Aligning Hi-C reads to scaffolded assembly..."
run_cmd "bwa mem -SP5 -t $THREADS ref.fasta $R1_FQ_RAW $R2_FQ_RAW | \
samtools view -@ 10 -Shb -F 2316 - | \
samtools sort -@ 20 -n -o mapped.namesorted.bam -"
fi

# 转换为 MND 格式
if [ -f "mapped.mnd.txt" ]; then
log "⚠️ mapped.mnd.txt exists. Skipping."
else
log "🔄 Converting BAM to Merged_Nodups.txt format..."
samtools view mapped.namesorted.bam | \
awk 'function get_strand(flag) { return and(flag, 16) ? 1 : 0 }
NR%2==1 {
r1_ref=$3; r1_pos=$4; r1_str=get_strand($2); r1_mapq=$5
}
NR%2==0 {
r2_ref=$3; r2_pos=$4; r2_str=get_strand($2); r2_mapq=$5;
if (r1_ref != "*" && r2_ref != "*") {
if (r1_ref > r2_ref || (r1_ref == r2_ref && r1_pos > r2_pos)) {
print r2_str, r2_ref, r2_pos, 0, r1_str, r1_ref, r1_pos, 1, r2_mapq, r1_mapq
} else {
print r1_str, r1_ref, r1_pos, 0, r2_str, r2_ref, r2_pos, 1, r1_mapq, r2_mapq
}
}
}' | \
sort -k2,2 -k3,3n -k6,6 -k7,7n --parallel=$THREADS -S $MEMORY > mapped.mnd.txt 2>> "$LOG_FILE"

if [ $? -eq 0 ]; then
log "✅ MND file generated: mapped.mnd.txt"
else
log "❌ Failed to generate MND file"
exit 1
fi
fi

# # 运行 3d-dna visualizer
# if [ -f "ref.hic" ]; then
# log "⚠️ ref.hic exists. Skipping visualization."
# else
# log "📊 Running 3d-dna Assembly Visualizer..."
# if [ ! -f "$SCRIPT_VISUALIZER" ]; then
# log "❌ Visualizer script not found: $SCRIPT_VISUALIZER"
# exit 1
# fi

# run_cmd "bash $SCRIPT_VISUALIZER -q 10 -j $JUICER_TOOLS ref.fasta mapped.mnd.txt"
# fi

# # 转换 AGP 为 Assembly 格式
# if [ -f "allhic_groups.assembly" ]; then
# log "⚠️ allhic_groups.assembly exists. Skipping."
# else
# log "📑 Converting ALLHiC AGP to Juicebox Assembly format..."
# if [ -f "$SCRIPT_AGP2ASM" ]; then
# run_cmd "python3 $SCRIPT_AGP2ASM groups.agp allhic_groups.assembly"
# else
# log "⚠️ Warning: agp2assembly.py not found at $SCRIPT_AGP2ASM"
# fi
# fi

# 3d-dna 主流程默认输出文件名格式为 <输入名前缀>.final.hic
EXPECTED_OUTPUT="ref.final.hic"

if [ -f "$EXPECTED_OUTPUT" ]; then
log "⚠️ $EXPECTED_OUTPUT exists. Skipping visualization."
else
log "📊 Running 3d-dna Pipeline (Heatmap Only Mode)..."

# # 检查主流程脚本是否存在
# if [ ! -f "$SCRIPT_PIPELINE" ]; then
# log "❌ Pipeline script not found: $SCRIPT_PIPELINE"
# exit 1
# fi

# 运行命令解析:
# -r 0 : 0轮组装迭代(即不进行组装,只生成热图)
# -q 10 : 过滤 mapping quality < 10 的 reads
# ref.fasta : 参考序列
# mapped.mnd.txt : 比对文件

run_cmd "bash /share/org/YZWL/yzwl_lixg/software/3d-dna/run-asm-pipeline.sh -r 0 -q 10 ref.fasta mapped.mnd.txt"

# 检查是否成功生成
if [ -f "$EXPECTED_OUTPUT" ]; then
log "✅ Successfully generated: $EXPECTED_OUTPUT"
else
log "❌ Failed to generate $EXPECTED_OUTPUT. Check logs."
exit 1
fi
fi

# 返回主工作目录
cd "$WORK_DIR" || exit 1

log "✅ Juicebox visualization files ready in: $JB_DIR"
log "📋 Files generated:"
log " - ref.hic (Load in Juicebox)"
log " - ref.assembly (Assembly structure)"
log " - allhic_groups.assembly (From AGP)"
}

# ==============================================================================
# 🚀 主流程
# ==============================================================================

main() {
parse_arguments "$@"
validate_parameters
setup_logging
setup_environment

local start_time=$(date +%s)
log_section "🚀 Starting ALLHiC Complete Pipeline v3.0"

# ALLHiC 核心流程
prepare_data
run_mapping
run_allele_detection
run_pruning
run_partition
run_extract
run_rescue
run_optimize
run_build
run_plot

# Juicebox 可视化流程
run_juicebox_prep

local end_time=$(date +%s)
local elapsed=$((end_time - start_time))
log_section "🎉 Pipeline Completed in $((elapsed/3600))h $(((elapsed%3600)/60))m $((elapsed%60))s"

log ""
log "📁 Output Summary:"
log " Main directory: $WORK_DIR"
log " - groups.asm.fasta (Final scaffolded assembly)"
log " - groups.agp (AGP file)"
log " - *.pdf (Contact maps)"
log ""
log " Juicebox directory: ${WORK_DIR}/juicebox_visualization"
log " - ref.hic (Load this in Juicebox Desktop)"
log " - ref.assembly (Assembly structure file)"
log ""
log "🎯 Next Steps:"
log " 1. Open Juicebox Desktop"
log " 2. Load ref.hic file"
log " 3. Load ref.assembly or allhic_groups.assembly"
log " 4. Manually curate misassemblies"
log " 5. Export new assembly structure"
}

main "$@"

一些Hi-C的脚本
https://lixiang117423.github.io/article/hic-scripts/
作者
李详【Xiang LI】
发布于
2025年11月30日
许可协议