linux运行tassel做GWAS的脚本

下载软件

1
git clone https://bitbucket.org/tasseladmin/tassel-5-standalone.git

主脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
#!/bin/bash

# ==============================================================================
# 🌾 TASSEL GWAS Automated Pipeline Script (v3.0) 🧬
# 功能:VCF排序 -> 智能计算Kinship -> 运行 GLM/MLM 模型 -> 结果提取
# 更新:增强错误处理、日志记录、结果验证、统计报告
# ==============================================================================

set -o pipefail # 管道中任何命令失败都会导致整个管道失败

# ----------------- 默认配置 -----------------
TASSEL_HOME="/share/org/YZWL/yzwl_lixg/software/tassel-5-standalone"
TASSEL_CMD="$TASSEL_HOME/run_pipeline.pl"
MEMORY_MIN="10g"
MEMORY_MAX="800g"
OUT_PREFIX="GWAS_Result"
SKIP_SORT=false
MODEL="MLM"
THREADS=64 # 线程数
KEEP_TEMP=false # 是否保留中间文件
LOG_LEVEL="INFO" # 日志级别: DEBUG, INFO, WARN, ERROR

# ----------------- 视觉样式 -----------------
GREEN='\033[0;32m'
BLUE='\033[0;34m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
MAGENTA='\033[0;35m'
NC='\033[0m'

ICON_DNA="🧬"
ICON_RICE="🌾"
ICON_RUN="🚀"
ICON_CHECK="✅"
ICON_WARN="⚠️"
ICON_STAT="📊"
ICON_GEAR="⚙️"
ICON_TIME="⏱️"
ICON_FIRE="🔥"
ICON_CLEAN="🧹"

# ----------------- 全局变量 -----------------
LOG_FILE=""
START_TIME=$(date +%s)
TEMP_FILES=()

# ==============================================================================
# 日志函数
# ==============================================================================
log() {
local level=$1
shift
local message="$@"
local timestamp=$(date '+%Y-%m-%d %H:%M:%S')

# 根据级别决定是否输出
case $level in
DEBUG) [ "$LOG_LEVEL" = "DEBUG" ] && echo -e "${CYAN}[DEBUG]${NC} $message" | tee -a "$LOG_FILE" ;;
INFO) echo -e "${GREEN}[INFO]${NC} $message" | tee -a "$LOG_FILE" ;;
WARN) echo -e "${YELLOW}[WARN]${NC} $message" | tee -a "$LOG_FILE" ;;
ERROR) echo -e "${RED}[ERROR]${NC} $message" | tee -a "$LOG_FILE" ;;
esac
}

# ==============================================================================
# 错误处理函数
# ==============================================================================
error_exit() {
log ERROR "$1"
log ERROR "Pipeline failed! Check log: $LOG_FILE"
cleanup_on_error
exit 1
}

cleanup_on_error() {
if [ "$KEEP_TEMP" = false ]; then
log WARN "Cleaning up temporary files..."
for file in "${TEMP_FILES[@]}"; do
[ -f "$file" ] && rm -f "$file"
done
fi
}

# ==============================================================================
# 信号捕获 (Ctrl+C 等)
# ==============================================================================
trap 'error_exit "Pipeline interrupted by user"' INT TERM

# ==============================================================================
# 帮助信息函数
# ==============================================================================
usage() {
echo -e "${BLUE}============================================================${NC}"
echo -e "${ICON_RICE} TASSEL GWAS Pipeline v3.0"
echo -e "${BLUE}============================================================${NC}"
echo ""
echo -e "用法: $0 -i <vcf> -p <pheno> [options]"
echo ""
echo -e "${YELLOW}必选参数:${NC}"
echo -e " -i, --input 输入基因型文件 (VCF/VCF.gz格式)"
echo -e " -p, --pheno 输入表型文件 (txt格式, 第一列为样本ID)"
echo ""
echo -e "${YELLOW}模型参数:${NC}"
echo -e " -M, --model 选择模型: GLM, MLM, BOTH (默认: MLM)"
echo -e " • GLM: 速度快,适合初筛 (仅需 Q矩阵)"
echo -e " • MLM: 精度高,控制伪关联 (需 Q+K 矩阵)"
echo -e " • BOTH: 同时运行两种模型进行比较"
echo ""
echo -e "${YELLOW}可选参数:${NC}"
echo -e " -q, --qmatrix 群体结构 Q 矩阵文件 (推荐使用)"
echo -e " -k, --kinship 亲缘关系 K 矩阵文件 (若不提供则自动计算)"
echo -e " -o, --out 输出文件前缀 (默认: GWAS_Result)"
echo -e " -m, --memory Java最大内存 (默认: 100g)"
echo -e " -t, --threads 并行线程数 (默认: 4)"
echo -e " --maf 最小等位基因频率过滤 (默认: 不过滤)"
echo -e " --miss 最大缺失率过滤 (默认: 不过滤)"
echo -e " --skip-sort 跳过 VCF 排序步骤"
echo -e " --keep-temp 保留所有中间文件"
echo -e " --log-level 日志级别: DEBUG/INFO/WARN/ERROR (默认: INFO)"
echo -e " -h, --help 显示帮助信息"
echo ""
echo -e "${YELLOW}示例:${NC}"
echo -e " ${GREEN}# 基础用法 (默认跑 MLM)${NC}"
echo -e " $0 -i sample.vcf.gz -p traits.txt"
echo ""
echo -e " ${GREEN}# 只跑 GLM 并提供 Q 矩阵${NC}"
echo -e " $0 -i sample.vcf -p traits.txt -M GLM -q Q.txt"
echo ""
echo -e " ${GREEN}# 跑完整流程,保留中间文件${NC}"
echo -e " $0 -i sample.vcf -p traits.txt -M BOTH -q Q.txt --keep-temp"
echo ""
echo -e " ${GREEN}# 添加质控过滤${NC}"
echo -e " $0 -i sample.vcf -p traits.txt --maf 0.05 --miss 0.1"
echo ""
echo -e "${YELLOW}输出文件:${NC}"
echo -e " • ${OUT_PREFIX}.glm.manht_input - GLM 结果 (用于曼哈顿图)"
echo -e " • ${OUT_PREFIX}.mlm.manht_input - MLM 结果 (用于曼哈顿图)"
echo -e " • ${OUT_PREFIX}.pipeline.log - 详细运行日志"
echo -e " • ${OUT_PREFIX}.stats.txt - 统计摘要报告"
echo ""
exit 1
}

# ==============================================================================
# 解析命令行参数
# ==============================================================================
if [ $# -eq 0 ]; then usage; fi

ARGS=$(getopt -o i:p:q:k:o:m:M:t:h --long input:,pheno:,qmatrix:,kinship:,out:,memory:,model:,threads:,maf:,miss:,skip-sort,keep-temp,log-level:,help -n "$0" -- "$@")
if [ $? != 0 ]; then
echo -e "${RED}参数解析错误${NC}" >&2
exit 1
fi
eval set -- "$ARGS"

MAF_FILTER=""
MISS_FILTER=""

while true; do
case "$1" in
-i|--input) VCF_FILE="$2"; shift 2 ;;
-p|--pheno) PHENO_FILE="$2"; shift 2 ;;
-q|--qmatrix) Q_FILE="$2"; shift 2 ;;
-k|--kinship) KINSHIP_FILE="$2"; shift 2 ;;
-o|--out) OUT_PREFIX="$2"; shift 2 ;;
-m|--memory) MEMORY_MAX="$2"; shift 2 ;;
-M|--model) MODEL=$(echo "$2" | tr '[:lower:]' '[:upper:]'); shift 2 ;;
-t|--threads) THREADS="$2"; shift 2 ;;
--maf) MAF_FILTER="$2"; shift 2 ;;
--miss) MISS_FILTER="$2"; shift 2 ;;
--skip-sort) SKIP_SORT=true; shift ;;
--keep-temp) KEEP_TEMP=true; shift ;;
--log-level) LOG_LEVEL=$(echo "$2" | tr '[:lower:]' '[:upper:]'); shift 2 ;;
-h|--help) usage ;;
--) shift; break ;;
*) echo "Internal error!"; exit 1 ;;
esac
done

# ==============================================================================
# 初始化
# ==============================================================================
LOG_FILE="${OUT_PREFIX}.pipeline.log"
> "$LOG_FILE" # 清空日志文件

echo -e "${ICON_RUN} ${BLUE}╔════════════════════════════════════════════════════════╗${NC}"
echo -e "${ICON_RUN} ${BLUE}║ TASSEL GWAS Pipeline v3.0 Started ║${NC}"
echo -e "${ICON_RUN} ${BLUE}╚════════════════════════════════════════════════════════╝${NC}"

log INFO "${ICON_TIME} Start time: $(date '+%Y-%m-%d %H:%M:%S')"
log INFO "${ICON_GEAR} Working directory: $(pwd)"

# ==============================================================================
# 环境与配置检查
# ==============================================================================
log INFO "${ICON_GEAR} Checking environment..."

# 1. 检查模型参数
if [[ "$MODEL" != "GLM" && "$MODEL" != "MLM" && "$MODEL" != "BOTH" ]]; then
log WARN "Invalid model: $MODEL. Using default: MLM"
MODEL="MLM"
fi
log INFO "Selected Model: ${YELLOW}$MODEL${NC}"

# 2. 检查 TASSEL
if [ ! -f "$TASSEL_CMD" ]; then
error_exit "TASSEL not found at $TASSEL_CMD"
fi
log INFO "${ICON_CHECK} TASSEL found: $TASSEL_CMD"

# 3. 检查输入文件
if [ ! -f "$VCF_FILE" ]; then
error_exit "VCF file not found: $VCF_FILE"
fi
if [ ! -f "$PHENO_FILE" ]; then
error_exit "Phenotype file not found: $PHENO_FILE"
fi

# 统计输入文件信息
VCF_VARIANTS=$(zcat -f "$VCF_FILE" 2>/dev/null | grep -v "^#" | wc -l)
VCF_SAMPLES=$(zcat -f "$VCF_FILE" 2>/dev/null | grep "^#CHROM" | awk '{print NF-9}')
PHENO_SAMPLES=$(tail -n +2 "$PHENO_FILE" | wc -l)
PHENO_TRAITS=$(head -1 "$PHENO_FILE" | awk '{print NF-1}')

log INFO "${ICON_DNA} Input VCF: $VCF_FILE"
log INFO " • Variants: $VCF_VARIANTS"
log INFO " • Samples: $VCF_SAMPLES"
log INFO "${ICON_STAT} Phenotype: $PHENO_FILE"
log INFO " • Samples: $PHENO_SAMPLES"
log INFO " • Traits: $PHENO_TRAITS"

# 4. 检查 Q 和 K 矩阵
if [ -n "$Q_FILE" ] && [ -f "$Q_FILE" ]; then
Q_DIMS=$(awk 'NR==1 {print NF-1; exit}' "$Q_FILE")
log INFO "${ICON_CHECK} Q-matrix provided: $Q_FILE (K=$Q_DIMS)"
else
log WARN "No Q-matrix provided"
fi

if [ -n "$KINSHIP_FILE" ] && [ -f "$KINSHIP_FILE" ]; then
log INFO "${ICON_CHECK} Kinship matrix provided: $KINSHIP_FILE"
fi

# ==============================================================================
# 步骤 0: 表型文件预处理
# ==============================================================================
log INFO "${ICON_GEAR} Preparing phenotype file..."
PHENO_READY="${OUT_PREFIX}.pheno_ready.txt"
TEMP_FILES+=("$PHENO_READY")

# 检查表型文件格式
FIRST_LINE=$(head -1 "$PHENO_FILE")
log DEBUG "Original phenotype header: $FIRST_LINE"

# 创建符合TASSEL格式的表型文件
# TASSEL要求:
# 1. 第一列必须是 <Trait> 或 <Taxa>
# 2. 第二行开始是数据行,第二列开始是 <Data> 类型
# 3. 必须是制表符分隔
# 4. 不能有空行或注释行

awk 'BEGIN {OFS="\t"}
NR==1 {
# 替换第一列为 <Trait>
$1="<Trait>"
print
# 添加数据类型行(第一列是taxa,其余是data)
printf "<Trait>"
for(i=2; i<=NF; i++) {
printf "\t<Data>"
}
printf "\n"
next
}
NR>1 {
# 跳过空行
if(NF==0) next
# 跳过注释行
if($0 ~ /^#/) next
# 输出数据行
print
}' "$PHENO_FILE" > "$PHENO_READY"

# 验证生成的文件
if [ ! -s "$PHENO_READY" ]; then
error_exit "Failed to create phenotype file"
fi

PHENO_LINES=$(wc -l < "$PHENO_READY")
log INFO "${ICON_CHECK} Phenotype file prepared ($PHENO_LINES lines)"
log DEBUG "First 3 lines of prepared phenotype:"
head -3 "$PHENO_READY" | while read line; do
log DEBUG " $line"
done

# ==============================================================================
# 步骤 1: VCF 排序与质控
# ==============================================================================
if [ "$SKIP_SORT" = true ]; then
VCF_SORTED="$VCF_FILE"
log INFO "${ICON_DNA} Skipping VCF sort (user requested)"
else
VCF_SORTED="${OUT_PREFIX}.sorted.vcf"
TEMP_FILES+=("$VCF_SORTED")

if command -v bcftools &> /dev/null; then
log INFO "${ICON_DNA} Sorting and filtering VCF with bcftools..."

FILTER_CMD="bcftools view"
[ -n "$MAF_FILTER" ] && FILTER_CMD="$FILTER_CMD -q $MAF_FILTER"
[ -n "$MISS_FILTER" ] && FILTER_CMD="$FILTER_CMD -i 'F_MISSING<$MISS_FILTER'"

$FILTER_CMD "$VCF_FILE" | bcftools sort -O v -T ./ -o "$VCF_SORTED" || error_exit "VCF sorting failed"

FILTERED_VARIANTS=$(grep -v "^#" "$VCF_SORTED" | wc -l)
log INFO "${ICON_CHECK} VCF sorted. Variants after filter: $FILTERED_VARIANTS"
else
log WARN "bcftools not found. Using original VCF without sorting"
VCF_SORTED="$VCF_FILE"
fi
fi

# ==============================================================================
# 步骤 2: Kinship 矩阵准备
# ==============================================================================
REAL_KINSHIP=""
NEED_KINSHIP=false
if [[ "$MODEL" == "MLM" || "$MODEL" == "BOTH" ]]; then
NEED_KINSHIP=true
fi

if [ -n "$KINSHIP_FILE" ] && [ -f "$KINSHIP_FILE" ]; then
log INFO "${ICON_DNA} Using provided Kinship: $KINSHIP_FILE"
REAL_KINSHIP="$KINSHIP_FILE"
elif [ "$NEED_KINSHIP" = true ]; then
log INFO "${ICON_DNA} Calculating Kinship matrix (Centered_IBS)..."
REAL_KINSHIP="${OUT_PREFIX}.kinship.txt"
[ "$KEEP_TEMP" = false ] && TEMP_FILES+=("$REAL_KINSHIP")

K_START=$(date +%s)
perl "$TASSEL_CMD" -Xms${MEMORY_MIN} -Xmx${MEMORY_MAX} \
-importGuess "$VCF_SORTED" \
-KinshipPlugin -method Centered_IBS -endPlugin \
-export "$REAL_KINSHIP" -exportType SqrMatrix > "${OUT_PREFIX}.kinship.log" 2>&1

K_END=$(date +%s)
K_TIME=$((K_END - K_START))

if [ -f "$REAL_KINSHIP" ]; then
K_SIZE=$(wc -l < "$REAL_KINSHIP")
log INFO "${ICON_CHECK} Kinship calculated in ${K_TIME}s (${K_SIZE}x${K_SIZE})"
else
error_exit "Kinship calculation failed! Check ${OUT_PREFIX}.kinship.log"
fi
else
log INFO "${ICON_DNA} Skipping Kinship (GLM mode)"
fi

# ==============================================================================
# 步骤 3: 运行 GLM 模型
# ==============================================================================
if [[ "$MODEL" == "GLM" || "$MODEL" == "BOTH" ]]; then
echo ""
log INFO "${ICON_STAT} ${GREEN}═══════════════════════════════════════${NC}"
log INFO "${ICON_STAT} ${GREEN}Running GLM (General Linear Model)...${NC}"
log INFO "${ICON_STAT} ${GREEN}═══════════════════════════════════════${NC}"

GLM_OUT="${OUT_PREFIX}_GLM"
GLM_START=$(date +%s)

# 构建命令
CMD="perl $TASSEL_CMD -Xms${MEMORY_MIN} -Xmx${MEMORY_MAX} \
-fork1 -vcf \"$VCF_SORTED\" \
-fork2 -t \"$PHENO_READY\""

if [ -n "$Q_FILE" ] && [ -f "$Q_FILE" ]; then
log INFO " -> Using Q-matrix for population structure correction"
CMD="$CMD -fork3 -q \"$Q_FILE\" -combine4 -input1 -input2 -input3 -intersect"
else
log WARN " -> Running without Q-matrix (may increase false positives)"
CMD="$CMD -combine4 -input1 -input2 -intersect"
fi

CMD="$CMD -FixedEffectLMPlugin -endPlugin -export \"$GLM_OUT\""

log DEBUG "GLM command: $CMD"
eval $CMD > "${OUT_PREFIX}.glm.log" 2>&1

GLM_END=$(date +%s)
GLM_TIME=$((GLM_END - GLM_START))

# 提取结果
if [ -f "${GLM_OUT}1.txt" ]; then
awk 'BEGIN{OFS="\t"} {print $2,$3,$4,$6}' "${GLM_OUT}1.txt" | \
awk '$NF ~ /[0-9.]+/ || NR == 1' > "${OUT_PREFIX}.glm.manht_input"

GLM_SNPS=$(tail -n +2 "${OUT_PREFIX}.glm.manht_input" | wc -l)
GLM_SIG=$(tail -n +2 "${OUT_PREFIX}.glm.manht_input" | awk '$4 < 1e-5' | wc -l)

log INFO "${ICON_CHECK} GLM completed in ${GLM_TIME}s"
log INFO " • Total SNPs: $GLM_SNPS"
log INFO " • Significant (P<1e-5): $GLM_SIG"
log INFO " • Output: ${OUT_PREFIX}.glm.manht_input"
else
log ERROR "GLM output not found! Check ${OUT_PREFIX}.glm.log"
fi
fi

# ==============================================================================
# 步骤 4: 运行 MLM 模型
# ==============================================================================
if [[ "$MODEL" == "MLM" || "$MODEL" == "BOTH" ]]; then
echo ""
log INFO "${ICON_STAT} ${GREEN}═══════════════════════════════════════${NC}"
log INFO "${ICON_STAT} ${GREEN}Running MLM (Mixed Linear Model)...${NC}"
log INFO "${ICON_STAT} ${GREEN}═══════════════════════════════════════${NC}"

MLM_OUT="${OUT_PREFIX}_MLM"
MLM_START=$(date +%s)

# 构建命令
CMD="perl $TASSEL_CMD -Xms${MEMORY_MIN} -Xmx${MEMORY_MAX} \
-fork1 -vcf \"$VCF_SORTED\" \
-fork2 -t \"$PHENO_READY\""

if [ -n "$Q_FILE" ] && [ -f "$Q_FILE" ]; then
log INFO " -> Using Q-matrix + Kinship (Q+K model)"
CMD="$CMD -fork3 -q \"$Q_FILE\" -fork4 -k \"$REAL_KINSHIP\" \
-combine5 -input1 -input2 -input3 -intersect \
-combine6 -input5 -input4 -mlm"
else
log INFO " -> Using Kinship only (K model)"
CMD="$CMD -fork4 -k \"$REAL_KINSHIP\" \
-combine5 -input1 -input2 -intersect \
-combine6 -input5 -input4 -mlm"
fi

CMD="$CMD -mlmVarCompEst P3D -mlmCompressionLevel None -export \"$MLM_OUT\""

log DEBUG "MLM command: $CMD"
eval $CMD > "${OUT_PREFIX}.mlm.log" 2>&1

MLM_END=$(date +%s)
MLM_TIME=$((MLM_END - MLM_START))

# 查找结果文件
MLM_RES="${MLM_OUT}2.txt"
if [ ! -f "$MLM_RES" ]; then
MLM_RES=$(find . -name "${MLM_OUT}*stats*" 2>/dev/null | head -n 1)
fi

if [ -f "$MLM_RES" ]; then
awk 'BEGIN{OFS="\t"} NR == 1 || $7 ~ /[0-9]+/ {print $2, $3, $4, $7}' "$MLM_RES" \
> "${OUT_PREFIX}.mlm.manht_input"

MLM_SNPS=$(tail -n +2 "${OUT_PREFIX}.mlm.manht_input" | wc -l)
MLM_SIG=$(tail -n +2 "${OUT_PREFIX}.mlm.manht_input" | awk '$4 < 1e-5' | wc -l)

log INFO "${ICON_CHECK} MLM completed in ${MLM_TIME}s"
log INFO " • Total SNPs: $MLM_SNPS"
log INFO " • Significant (P<1e-5): $MLM_SIG"
log INFO " • Output: ${OUT_PREFIX}.mlm.manht_input"
else
log ERROR "MLM output not found! Check ${OUT_PREFIX}.mlm.log"
fi
fi

# ==============================================================================
# 生成统计报告
# ==============================================================================
STATS_FILE="${OUT_PREFIX}.stats.txt"
log INFO "${ICON_STAT} Generating statistics report..."

cat > "$STATS_FILE" << EOF
================================================================================
TASSEL GWAS Pipeline Summary Report
================================================================================
Pipeline Version: v3.0
Run Date: $(date '+%Y-%m-%d %H:%M:%S')
Working Directory: $(pwd)

INPUT FILES
--------------------------------------------------------------------------------
VCF File: $VCF_FILE
• Total Variants: $VCF_VARIANTS
• Total Samples: $VCF_SAMPLES

Phenotype File: $PHENO_FILE
• Total Samples: $PHENO_SAMPLES
• Total Traits: $PHENO_TRAITS

Q-Matrix: ${Q_FILE:-Not provided}
Kinship Matrix: ${KINSHIP_FILE:-Auto-calculated}

ANALYSIS SETTINGS
--------------------------------------------------------------------------------
Model: $MODEL
Memory: $MEMORY_MAX
MAF Filter: ${MAF_FILTER:-None}
Missing Filter: ${MISS_FILTER:-None}

RESULTS
--------------------------------------------------------------------------------
EOF

if [[ "$MODEL" == "GLM" || "$MODEL" == "BOTH" ]]; then
cat >> "$STATS_FILE" << EOF
GLM Analysis:
• Runtime: ${GLM_TIME}s
• SNPs Tested: ${GLM_SNPS:-N/A}
• Significant: ${GLM_SIG:-N/A} (P < 1e-5)
• Output File: ${OUT_PREFIX}.glm.manht_input

EOF
fi

if [[ "$MODEL" == "MLM" || "$MODEL" == "BOTH" ]]; then
cat >> "$STATS_FILE" << EOF
MLM Analysis:
• Runtime: ${MLM_TIME}s
• SNPs Tested: ${MLM_SNPS:-N/A}
• Significant: ${MLM_SIG:-N/A} (P < 1e-5)
• Output File: ${OUT_PREFIX}.mlm.manht_input

EOF
fi

END_TIME=$(date +%s)
TOTAL_TIME=$((END_TIME - START_TIME))

cat >> "$STATS_FILE" << EOF
PERFORMANCE
--------------------------------------------------------------------------------
Total Runtime: ${TOTAL_TIME}s ($(printf '%02d:%02d:%02d' $((TOTAL_TIME/3600)) $((TOTAL_TIME%3600/60)) $((TOTAL_TIME%60))))

OUTPUT FILES
--------------------------------------------------------------------------------
Log File: $LOG_FILE
Statistics: $STATS_FILE
$([ -f "${OUT_PREFIX}.glm.manht_input" ] && echo "GLM Results: ${OUT_PREFIX}.glm.manht_input")
$([ -f "${OUT_PREFIX}.mlm.manht_input" ] && echo "MLM Results: ${OUT_PREFIX}.mlm.manht_input")

================================================================================
EOF

log INFO "${ICON_CHECK} Statistics saved to: $STATS_FILE"

# ==============================================================================
# 清理临时文件
# ==============================================================================
if [ "$KEEP_TEMP" = false ] && [ ${#TEMP_FILES[@]} -gt 0 ]; then
log INFO "${ICON_CLEAN} Cleaning up temporary files..."
for file in "${TEMP_FILES[@]}"; do
if [ -f "$file" ]; then
rm -f "$file"
log DEBUG "Removed: $file"
fi
done
fi

# ==============================================================================
# 完成
# ==============================================================================
echo ""
log INFO "${ICON_RICE} ${GREEN}╔════════════════════════════════════════════════════════╗${NC}"
log INFO "${ICON_RICE} ${GREEN}║ Pipeline Completed Successfully! ${ICON_CHECK}${NC}"
log INFO "${ICON_RICE} ${GREEN}╚════════════════════════════════════════════════════════╝${NC}"
log INFO "${ICON_TIME} Total runtime: ${TOTAL_TIME}s"
log INFO "${ICON_STAT} Statistics: $STATS_FILE"
log INFO "${ICON_GEAR} Log file: $LOG_FILE"

echo ""
echo -e "${CYAN}Next steps:${NC}"
echo -e " 1. Review results: cat $STATS_FILE"
echo -e " 2. Visualize results with Manhattan plots"
echo -e " 3. Extract significant SNPs: awk '\$4 < 1e-5' ${OUT_PREFIX}.*.manht_input"

exit 0

批量处理脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/bin/bash

# ================= 配置区域 =================
# 工作目录
WORK_DIR="/share/org/YZWL/yzwl_lixg/project/94.rice_gas/11.tassel_gwas"

# 绝对路径的输入文件
VCF_FILE="${WORK_DIR}/variation.filtered.snp.maf5.vcf.gz"
PHENO_SOURCE="${WORK_DIR}/17.微生物和甲烷菌用于GWAS的数据.txt"

# TASSEL 运行脚本路径 (使用 $HOME 代表 ~)
RUN_SCRIPT="$HOME/software/scripts/44.run_tassel.sh"

# 错误日志文件 (记录失败的表型名称)
FAIL_LOG="${WORK_DIR}/failed_phenotypes.txt"

# ================= 脚本逻辑 =================

# 1. 进入工作目录
if ! cd "${WORK_DIR}"; then
echo "严重错误: 无法进入工作目录 ${WORK_DIR},脚本终止。"
exit 1
fi

# 初始化/清空错误日志
: > "${FAIL_LOG}"

# 2. 获取表型文件的总列数 (基于第一行表头)
TOTAL_COLS=$(head -n 1 "${PHENO_SOURCE}" | awk '{print NF}')

echo "检测到总列数: ${TOTAL_COLS}"
echo "开始遍历处理第 2 列到第 ${TOTAL_COLS} 列..."
echo "运行失败的表型将被记录在: ${FAIL_LOG}"

# 3. 循环遍历从第2列到最后一列
for (( i=2; i<=TOTAL_COLS; i++ )); do

# 获取当前列的表头名称(作为表型ID和文件夹名)
PHENO_NAME=$(head -n 1 "${PHENO_SOURCE}" | awk -v col=$i '{print $col}')

# 去除名称中可能存在的特殊字符,例如 / 替换为 _
DIR_NAME=${PHENO_NAME//\//_}

echo "------------------------------------------------------"
echo "正在处理表型: ${PHENO_NAME} (列索引: $i)"

# 创建子文件夹
if [ ! -d "${DIR_NAME}" ]; then
mkdir -p "${DIR_NAME}"
fi

# 尝试进入子文件夹 (修改点:失败则跳过本次循环,不退出脚本)
if ! cd "${DIR_NAME}"; then
echo "警告: 无法进入目录 ${DIR_NAME},跳过该表型。"
echo "${PHENO_NAME} (Directory Error)" >> "${FAIL_LOG}"
continue
fi

# 定义提取出来的单个表型文件名
SINGLE_PHENO_FILE="${PHENO_NAME}.pheno.txt"

# 提取第1列和第i列
# 使用 awk 判断是否成功执行
awk -v col=$i 'BEGIN{OFS="\t"} {print $1, $col}' "${PHENO_SOURCE}" > "${SINGLE_PHENO_FILE}"

if [ $? -ne 0 ]; then
echo "错误: 提取表型数据失败,跳过。"
echo "${PHENO_NAME} (Extraction Error)" >> "${FAIL_LOG}"
cd .. # 返回上一级
continue
fi

# 运行 TASSEL 脚本 (修改点:捕获运行结果)
echo "执行 TASSEL 脚本..."

# 这里的 bash 命令被放在 if 条件中
if bash "${RUN_SCRIPT}" -i "${VCF_FILE}" -p "${SINGLE_PHENO_FILE}" --keep-temp; then
echo ">>> 表型 ${PHENO_NAME} 运行成功。"
else
echo ">>> 警告: 表型 ${PHENO_NAME} 运行失败!已记录到日志。"
# 将失败的表型名称追加写入日志文件
echo "${PHENO_NAME}" >> "${FAIL_LOG}"
# 注意:这里没有 exit,脚本会继续向下运行
fi

# 返回上一级目录,准备处理下一个
cd ..

done

echo "======================================================"
echo "所有表型处理流程结束。"
if [ -s "${FAIL_LOG}" ]; then
echo "以下表型运行失败,请检查 ${FAIL_LOG} :"
cat "${FAIL_LOG}"
else
echo "完美!所有表型均成功运行。"
fi
echo "======================================================"

linux运行tassel做GWAS的脚本
https://lixiang117423.github.io/article/linux-tassel/
作者
李详【Xiang LI】
发布于
2025年12月3日
许可协议