葡萄Science文章基因组学和群体遗传学代码

参考文献

Dong Y, Duan S, Xia Q, et al. Dual domestications and origin of traits in grapevine evolution[J]. Science, 2023, 379(6635): 892-901.

基因组组装和注释

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
1.NextDenovo
1.1 seq_stat calculated length cutoff of reads
~/NextDenovo/bin/seq_stat -g 500Mb input.fofn
#45x reads were used to correct reads
1.2 run nextDenovo(v2.1-beta.0)
~/NextDenovo/nextDenovo ./run.cfg -l log.txt

# config content
job_type = local
job_prefix = nextDenovo
task = all
rewrite = yes
deltmp = yes
rerun = 3
parallel_jobs = 20
input_type = raw
input_fofn = /data/nextdenovo/input.fofn
workdir = /data/nextdenovo
#cluster_options = -q gxyy -l nodes1:ppn=10 -j oe
#nodelist = avanode.list.fofn

[correct_option]
read_cuoff = 1k
seed_cutoff = 19703
seed_cutfiles = 40
blocksize = 10g

pa_correction = 40
minimap2_options_raw = -x ava-pb -t 16
sort_options = -m 50g -t 16 -k 50
correction_options = -p 32

[assemble_option]
random_round = 100
minimap2_options_cns = -x ava-pb -t 32 -k17 -w17
nextgraph_options = -a 1

2. purge haplotigs

minimap2 -t 10 -ax map-pb VS1.fasta total.fasta.gz --secondary=no | samtools sort -m 1G -o aligned.bam -T tmp.ali

purge_haplotigs hist -b aligned.bam -g VS1.fasta -t 20
purge_haplotigs cov -i aligned.bam.gencov -o coverage_stats.csv -l 10 -m 68 -h 140
purge_haplotigs purge -g VS1.fasta -c coverage_stats.csv -t 4 -a 60

#get curated.fasta file

3. pilon polish
bwa mem -t 20 ./curated.fasta SV-1-1_TKD181000271_1.clean.fq.gz.clean.dup.clean SV-1-1_TKD181000271_2.clean.fq.gz.clean.dup.clean |samtools sort -@ 20 -O BAM -o VS1-1.pe.sort.bam
samtools index VS1-1.pe.sort.bam

java -Xmx300g -jar pilon-1.21.jar --genome ./curated.fasta --frags ./round1_bam/VS1-1.pe.sort.bam --frags ./round1_bam/VS1-2.pe.sort.bam --frags ./round1_bam/VS1-3.pe.sort.bam --frags ./round1_bam/VS1-4.pe.sort.bam --fix snps,indels --output VS1.round1.pilon --changes --threads 50

4. ccs extend
4.1 CCS bam to fasta
~/bin/bam2fastx -AQa -o ABFC20191093-01.ccs.fasta.gz ABFC20191093-01.ccs.bam
4.2 Canu assembly ccs reads
~/bin/canu -p vs1 -d vs1 gridEngine=PBS genomeSize=500m \
batOptions="-dg 3 -db 3 -dr 1 -ca 500 -cp 50 -M 250" \
correctedErrorRate=0.050 \
gridOptions="-q gxyy " \
-pacbio-hifi ABFC20191093-01.ccs.fasta.gz
4.3 Mummer align result of pilon polished and CCS assembly
4.4 Manual correction and links.

5. Nextpolish
nextPolish run.cfg

#run.cfg
[General]
job_type = local
job_prefix = nextPolish
task = best
rewrite = yes
rerun = 3
parallel_jobs = 10
multithread_jobs = 8
genome = ./genome.fasta
genome_size = auto
workdir = ./01_rundir
polish_options = -p {multithread_jobs}

[lgs_option]
//lgs_fofn = ./lgs.fofn
lgs_options = -min_read_len 1k -max_depth 60
lgs_minimap2_options = -x map-pb

#lgs.fofn
ABFC20191093-01.ccs.fasta.gz

6. 3ddna assembly
6.1 Juicer alignment
~/juicer/scripts/juicer.sh \
-z /data/references/VS1.pilon.merge.polish.fasta \
-d /data/NextDenovo2/VS1/hic_nextdenovo \
-s DpnII -t 50 \
-p /data/restriction_sites/VS1.pilon.merge.polish.chrom.sizes \
-y /data/restriction_sites/VS1.pilon.merge.polish_DpnII.txt \
-D ~/juicer

6.2 3ddna assembly
~/3d-dna/run-asm-pipeline.sh VS1.pilon.merge.polish.fasta merged_nodups.txt

6.3 3ddna-review
~/3d-dna/run-asm-pipeline-post-review.sh -s finalize --sort-output --build-gapped-map -r VS1.curated.FINAL.review.assembly VS1.curated.fa merged_nodups.txt

7. Mummber align the assembly to Pinot Noir genome
7.1 nucmer -c 100 -p out/1_1 VS1.FINAL.fa.cut//VS1.FINAL.fa.1 Vitis_vinifera.IGGP_12x.31.dna.genome.fa.cut//Vitis_vinifera.IGGP_12x.31.dna.genome.fa.1
7.2 python deal_delta.py out/ref_query.delta out/ref_query.deal.delta
7.3 delta-filter -i 89 -l 1000 -1 ref_query.deal.delta >ref_query.deal.delta.identy_0.9.len1k.filter
7.4 show-coords -c ref_query.deal.delta.identy_0.9.len1k.filter >ref_query.deal.delta.identy_0.9.len1k.filter.coords
7.5 ~/dotPlotly/mummerCoordsDotPlotly.R -i ref_query.deal.delta.identy_0.9.len1k.filter.coords -o ref_query.deal.delta.identy_0.9.len1k.filter.coords -s -t -m 5000 -q 5000 -k 25 -l

8. Repeat annotation
8.1 Ltr_finder:
LTR_FINDER_parallel -seq VS1.FINAL.fa -threads 40 -harvest_out -size 1000000 -time 300 -finder ltr_finder
8.2 Ltrharvest:
gt suffixerator -db VS1.FINAL.fa -indexname VS1.FINAL.fa -tis -suf -lcp -des -ssp -sds -dna
gt ltrharvest -index VS1.FINAL.fa -minlenltr 100 -maxlenltr 7000 -mintsd 4 -maxtsd 6 -motif TGCA -motifmis 1 -similar 85 -vic 10 -seed 20 -seqids yes > VS1.genome.split.harvest.scn
8.3 LTR_retriever:
LTR_retriever -genome VS1.FINAL.fa -inharvest VS1.FINAL.fa.rawLTR.scn -threads 40
8.4 RepeatModeler
~/RepeatModeler-2.0/BuildDatabase -name mydb VS1.FINAL.fa >log
~/RepeatModeler-2.0/RepeatModeler -pa 15 -database mydb > run.out
8.5 Repeatmasker
perl RepeatMasker -nolow -no_is -norna -pa 1 -species Viridiplantae VS1.FINAL.fa > VS1.FINAL.fa.log 2> VS1.FINAL.fa.log2
8.6 Proteinmasker
perl RepeatProteinMask -engine ncbi -noLowSimple -pvalue 0.0001 VS1.FINAL.fa.masked
8.7repeatmasker_de
perl RepeatMasker -nolow -no_is -norna -pa 1 -lib denovo.library.fa VS1.FINAL.fa. masked.masked >VS1.FINAL.fa. masked.masked.log 2>VS1.FINAL.fa.masked.masked.log2
8.8 Trf
trf VS1.FINAL.fa 2 7 7 80 10 50 2000 -d -h

9. Gene annotation
9.1 Homolog: Arabidopsis thaliana (Ensembl release 43), V. riparia, V. vinifera cv. Chardonnay, 12x.v2 (Ensembl, NCBI, and vitviv2)
perl ~/Annotation_pipeline/01.gene_finding/protein-map-genome/bin/protein_map_genome.pl --cpu 100 --align_rate 0.25 --step 1234 --tophit 10 --lines 1000 --verbose --extend_len 2000 --queue clu /data/Arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.43.gff.pep ./VS1.FINAL.fa

9.2 Denovo
9.2.1 Augustus (2.5.5) db: BUSCO results
~/augustus.2.5.5/bin/augustus --species=BUSCO --AUGUSTUS_CONFIG_PATH=~/augustus_busco/ --uniqueGeneId=true --noInFrameStop=true --gff3=on --strand=both VS1.FINAL.fa > VS1.FINAL.fa.augustus
9.2.2 glimmerHMM (3.0.2),db: Arabidopsis thaliana
~/GlimmerHMM/bin/glimmerhmm VS1.FINAL.fa -d ~/GlimmerHMM/trained_dir/arabidopsis -f -g > VS1.FINAL.fa.gff
9.2.3 Genescan (2015-10-31)
perl /nfs/nfs1/soft/Annotation_pipeline/01.gene_finding/denovo-predict/bin/split_seq.pl -len 1000000 /nfs/nfs3/dongx/wild_vitis/VS1/VS1.FINAL.fa > /nfs/nfs3/dongx/wild_vitis/VS1/2.gene/denovo/genescan/./VS1.FINAL.fa.cut1Mb; perl /nfs/nfs1/soft/Annotation_pipeline/01.gene_finding/denovo-predict/bin/predict_run.pl /nfs/nfs1/soft/soft_annotation/genscan/genscan /nfs/nfs1/soft/soft_annotation/genscan/Arabidopsis.smat /nfs/nfs3/dongx/wild_vitis/VS1/2.gene/denovo/genescan/./VS1.FINAL.fa.cut1Mb > /nfs/nfs3/dongx/wild_vitis/VS1/2.gene/denovo/genescan/./VS1.FINAL.fa.genscan; perl /nfs/nfs1/soft/Annotation_pipeline/01.gene_finding/denovo-predict/bin/predict_convert_new.pl --predict genscan --backcoor --final G -outdir /nfs/nfs3/dongx/wild_vitis/VS1/2.gene/denovo/genescan/./ /nfs/nfs3/dongx/wild_vitis/VS1/2.gene/denovo/genescan/./VS1.FINAL.fa.genscan /nfs/nfs3/dongx/wild_vitis/VS1/VS1.FINAL.fa; perl /nfs/nfs1/soft/Annotation_pipeline/01.gene_finding/denovo-predict/bin/Check_GFF.pl -perfect -check_cds -mini_cds 150 -cds_ns 10 -outdir /nfs/nfs3/dongx/wild_vitis/VS1/2.gene/denovo/genescan/./ /nfs/nfs3/dongx/wild_vitis/VS1/2.gene/denovo/genescan/./VS1.FINAL.fa.genscan.gff /nfs/nfs3/dongx/wild_vitis/VS1/VS1.FINAL.fa
9.2.4 SNAP (Semi-HMM-based Nucleic Acid Parser, version 2006-07-28)
/nfs/nfs1/soft/soft_annotation/snap/snap -gff /nfs/nfs1/soft/soft_annotation/snap/HMM/A.thaliana.hmm /nfs/nfs3/dongx/wild_vitis/VS1/2.gene/denovo/snap/./VS1.FINAL.fa.cut/VS1.FINAL.fa.1 > /nfs/nfs3/dongx/wild_vitis/VS1/2.gene/denovo/snap/./VS1.FINAL.fa.cut/VS1.FINAL.fa.1.snap

9.3 Transcriptome
9.3.1 Build index
hisat2-build -p 20 VS1.FINAL.fa VS1.FINAL.fa

9.3.2 Hisat alignment
hisat2 -p 10 -x VS1.FINAL.fa -U ${i}.fq.gz.qtrim -S ${i}.sam
samtools view -bS ${i}.sam > ${i}.bam
samtools sort -o ${i}.sorted.bam ${i}.bam
stringtie ${i}.sorted.bam -o ${i}.gtf -p 10

9.3.3 Stringtie merge gtf
stringtie --merge -o merged.gtf gtflist

9.4 EVM
perl ~/EVM_r2012-06-25/EvmUtils/partition_EVM_inputs.pl --genome VS1.FINAL.fa --gene_predictions denovo.gff3 \
--protein_alignments homolog.gff3 --transcript_alignments merged.gff3 \
--segmentSize 5000000 --overlapSize 10000 --partition_listing partitions_list.out
perl ~/EVM_r2012-06-25/EvmUtils/write_EVM_commands.pl \
--genome VS1.FINAL.fa --weights weight.txt \
--gene_predictions denovo.gff3 --protein_alignments homolog.gff3 --transcript_alignments merged.gff3 \
--output_file_name evm.out \
--partitions partition/partitions_list.out > commands.list
sh commands.list
#weight
PROTEIN A. thaliana 3, V. riparia 5, Chardonnay 6, 12x (3 versions) 8
AB initio AUGUSTUS 5, Glimmer, GeneScan, SNAP 1
Transcript 8

perl ~/EVM_r2012-06-25/EvmUtils/recombine_EVM_partial_outputs.pl \
--partitions partition/partitions_list.out --output_file_name evm.out

perl ~/EVM_r2012-06-25/EvmUtils/convert_EVM_outputs_to_GFF3.pl \
--partitions partition/partitions_list.out --output_file_name evm.out --genome VS1.FINAL.fa

for i in partition/*; do for j in $i/*out.gff3;do cat $j; done;done >EVM.out.gff3

9.5 PASA update
Transcriptome for PASA
Trinity --seqType fq --left SRRxx.1.fastq.gz --right SRRxx.2.fastq.gz --CPU 24 --max_memory 20G

perl ~/pasa/PASApipeline/misc_utilities/accession_extractor.pl <no_re_full-length.fasta>tdn.accs
perl ~/pasa/PASApipeline/bin/seqclean all_transcripts.fasta -c 16 -v ~/pasa/UniVec/UniVec.fasta
perl ~/pasa/PASApipeline/Launch_PASA_pipeline.pl \
--TDN tdn.accs \
-c alignAssembly.config -C -R -g VS1.FINAL.fa \
-t all_transcripts.fasta.clean -T -u all_transcripts.fasta \
--ALIGNERS blat --CPU 60
perl ~/pasa/PASApipeline/misc_utilities/pasa_gff3_validator.pl EVM.out.gff3
perl ~/pasa/PASApipeline/scripts/Load_Current_Gene_Annotations.dbi \
-c alignAssembly.config -g VS1.FINAL.fa \
-P EVM.out.gff3
perl ~/pasa/PASApipeline/Launch_PASA_pipeline.pl \
-c annotCompare.config -A \
-g VS1.FINAL.fa \
-t all_transcripts.fasta.clean --CPU 40

9.6 Gene filter
python filter_one_exon_CDS_length.py VS1.sqlite.gene_structures_post_PASA_updates.108840.gff3 VS1.exon_CDS_stat.gene.txt VS1.exon_CDS_filter.gene.txt 300 0
python filter_gene_base_repeat.py VS1.sqlite.gene_structures_post_PASA_updates.108840.single_iso.sorted.1.bed all_without_trf.sorted.merge.strand.1.bed VS1.repeat_percent.gene.1.txt VS1.repeat_filter.gene.1.txt 0.5

10 ncRNA annotation
10.1 rRNA
/nfs/nfs1/soft/soft_annotation/blast-2.2.26/bin/formatdb -p F -o T -i ./VS1.FINAL.fa.cut/VS1.FINAL.fa.1; /nfs/nfs1/soft/soft_annotation/blast-2.2.26/bin/blastall -p blastn -e 1e-5 -v 10000 -b 10000 -d ./VS1.FINAL.fa.cut/VS1.FINAL.fa.1 -i /nfs/nfs1/soft/Annotation_pipeline/04.ncRNA-finding/rRNA-tRNA-Rfam/dat/ref-rRNA/plant-rRNA/plant_rRNA.fa -o ./VS1.FINAL.fa.cut/VS1.FINAL.fa.1.rRNA.blast; perl /nfs/nfs1/soft/Annotation_pipeline/common_bin/blast_parser.pl -nohead ./VS1.FINAL.fa.cut/VS1.FINAL.fa.1.rRNA.blast > ./VS1.FINAL.fa.cut/VS1.FINAL.fa.1.rRNA.blast.tab;

10.2 tRNA tRNAscan-SE (version 1.3.1)
/nfs/nfs1/soft/soft_annotation/tRNAscan-SE-1.3/bin/tRNAscan-SE -o ./VS1.FINAL.fa.60.cut/VS1.FINAL.fa.60.1.tRNA -f ./VS1.FINAL.fa.60.cut/VS1.FINAL.fa.60.1.tRNA.structure ./VS1.FINAL.fa.60.cut/VS1.FINAL.fa.60.1

10.3 miRNA
/nfs/nfs1/soft/soft_annotation/blast-2.2.26/bin/blastall -p blastn -W 7 -e 1 -v 10000 -b 10000 -m8 -d ./Rfam.fasta.miRNA -i ./VS1.FINAL.fa.cut/VS1.FINAL.fa.1 -o ./VS1.FINAL.fa.cut/VS1.FINAL.fa.1.miRNA.blast.m8;
/nfs/nfs1/soft/soft_annotation/infernal-0.81/bin/cmsearch /nfs/nfs1/soft/Annotation_pipeline/04.ncRNA-finding/rRNA-tRNA-Rfam/dat/Rfam/Rfam/RF01043.cm ./VS1.FINAL.fa.miRNA.cmsearch/1001_2000/10__18670843_18671776.RF01043 > ./VS1.FINAL.fa.miRNA.cmsearch/1001_2000/10__18670843_18671776.RF01043.cmsearch

10.4 snRNA
/nfs/nfs1/soft/soft_annotation/blast-2.2.26/bin/blastall -p blastn -W 7 -e 1 -v 10000 -b 10000 -m8 -d ./Rfam.fasta.snRNA -i ./VS1.FINAL.fa.cut/VS1.FINAL.fa.1 -o ./VS1.FINAL.fa.cut/VS1.FINAL.fa.1.snRNA.blast.m8;
/nfs/nfs1/soft/soft_annotation/infernal-0.81/bin/cmsearch /nfs/nfs1/soft/Annotation_pipeline/04.ncRNA-finding/rRNA-tRNA-Rfam/dat/Rfam/Rfam/RF01217.cm ./VS1.FINAL.fa.snRNA.cmsearch/1001_2000/10__10502775_10503210.RF01217 > ./VS1.FINAL.fa.snRNA.cmsearch/1001_2000/10__10502775_10503210.RF01217.cmsearch

11 function
#SwissProt, TrEMBL, KEGG
~/blast-2.2.26/bin/blastall -b 100 -v 100 -p blastp -e 1e-05 -F F -d ${database} -i VS1.pep.fa -o VS1.pep.fa.${database}.blast;

#InterProScan
interproscan.sh -dp -f tsv -iprlookup -goterms -i VS1.pep.fa -o VS1.pep.fa.iprscan -T ./tmp;

filter_gene_base_repeat.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/python
import sys
import commands
import os
import random
import math
if len(sys.argv) <> 6:
print "Usage: " + sys.argv[0] + " gene_bed repeat_bed out_all out_filtered set_repeat_percent(0.5) \nwrited by dsc"
sys.exit(-1)
file1=open(sys.argv[1],'r')
file2=open(sys.argv[2],'r')
file3=open(sys.argv[3],'w')
file4=open(sys.argv[4],'w')
set_repeat_percent=float(sys.argv[5])
file3.write('Chr\tStart\tEnd\tStrand\tGene\tRepeat_Percent\n')
file4.write('Chr\tStart\tEnd\tStrand\tGene\tRepeat_Percent\n')
Line1=file1.readlines()
Line2=file2.readlines()
for line1 in Line1:
lin1=line1.strip('\n').split('\t')
gene_length=int(int(lin1[2])-int(lin1[1])+1)
repeat_length=0
for line2 in Line2:
lin2=line2.strip('\n').split('\t')
if lin2[3]==lin1[3]:
if lin2[0]==lin1[0]:
if int(lin2[2]) < int(lin1[1]):
continue
else:
if int(lin2[1])>int(lin1[2]):
break
else:
if int(lin2[1])<=int(lin1[1]) and int(lin2[2])>=int(lin1[2]): #repeat cover
repeat_length=int(int(lin1[2])-int(lin1[1])+1)
break
else:
if int(lin2[1])<=int(lin1[1]) and int(lin2[2])>int(lin1[1]) and int(lin2[2])<=int(lin1[2]): #left overlap
repeat_length+=int(int(lin2[2])-int(lin1[1])+1)
else:
if int(lin2[1])>=int(lin1[1]) and int(lin2[2])<=int(lin1[2]): #cover repeat
repeat_length+=int(int(lin2[2])-int(lin2[1])+1)
else:
if int(lin2[1])>=int(lin1[1]) and int(lin2[1])<int(lin1[2]) and int(lin2[2])>int(lin1[2]): #right overlap
repeat_length+=int(int(lin1[2])-int(lin2[1])+1)
break
repeat_percent='%.3f' % float(float(repeat_length)/float(gene_length))
file3.write('\t'.join(lin1)+'\t'+str(repeat_percent)+'\n')
if float(set_repeat_percent) == 0:
if float(repeat_percent) == 0:
file4.write('\t'.join(lin1)+'\t'+str(repeat_percent)+'\n')
else:
if float(repeat_percent) < float(set_repeat_percent):
file4.write('\t'.join(lin1)+'\t'+str(repeat_percent)+'\n')
file1.close()
file2.close()
file3.close()
file4.close()

filter_one_exon_CDS_length.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/python
import sys
import commands
import os
import random
import math
if len(sys.argv) <> 6:
print "Usage: " + sys.argv[0] + " gff3 all_out filtered_out set_CDS_lengt(300) set_exon_nuber(0) \nwrited by dsc"
sys.exit(-1)
file1=open(sys.argv[1],'r')
file2=open(sys.argv[2],'w')
file3=open(sys.argv[3],'w')
set_CDS_lengt=sys.argv[4]
set_exon_nuber=sys.argv[5]
file2.write('Gene\texon_number\tCDS_length\n')
file3.write('Gene\texon_number\tCDS_length\n')
new_file=[]
Line=file1.readlines()
for line in Line:
if len(line)>1:
if '#' not in line:
new_file.append(line)
line1=new_file[0]
lin1=line1.split('\t')
exon_num=0
cds_len=0
for i in range(1,len(new_file)):
line2=new_file[i]
lin2=line2.split('\t')
if lin2[2]=='CDS':
exon_num+=1
length=int(int(lin2[4])-int(lin2[3])+1)
cds_len+=length
else:
if lin2[2]=='gene':
file2.write(lin1[8].split(';')[0].split('=')[1]+'\t'+str(exon_num)+'\t'+str(cds_len)+'\n')
if int(exon_num) > int(set_exon_nuber) and int(cds_len)>=int(set_CDS_lengt):
file3.write(lin1[8].split(';')[0].split('=')[1]+'\t'+str(exon_num)+'\t'+str(cds_len)+'\n')
line1=line2
lin1=lin2
exon_num=0
cds_len=0
file2.write(lin1[8].split(';')[0].split('=')[1]+'\t'+str(exon_num)+'\t'+str(cds_len)+'\n')
if int(exon_num) >int(set_exon_nuber) and int(cds_len)>=int(set_CDS_lengt):
file3.write(lin1[8].split(';')[0].split('=')[1]+'\t'+str(exon_num)+'\t'+str(cds_len)+'\n') #last line
file1.close()
file2.close()
file3.close()

变异检测和注释

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
Variant calling, validation, and annotation
1. raw reads filtering
~/bin/fastp -i sample_1.clean.fq.gz -o sample_1.filtered.fq.gz -I sample_2.clean.fq.gz -O sample_2.filtered.fq.gz -q 20 -u 40 -l 70

2. Reads mapping
~/bin/bwa-mem2 index VS1.fa

~/bin/bwa-mem2 mem -R '@RG\tID:B44\tLB:B44\tSM:B44\tPL:ILLUMINA' -t 30 VS1.final.fa sample_1.filtered.fq.gz sample_2.filtered.fq.gz |samtools view -Sb - >sample.pe.bam

~/bin/gatk SortSam --INPUT sample.pe.bam --OUTPUT sample.sort.bam --SORT_ORDER coordinate

~/bin/gatk MarkDuplicates -I sample.sort.bam -O sample.sort.markdup.bam -M sample.sort.markdup_metrics.txt

~/bin/samtools index sample.sort.markdup.bam

3. Sequencing statistics
~/bin/bamdst -p VS1.final.fa.bed -o ~/sample sample.sort.markdup.bam

4. Variant calling
time java -Xmx12g -Djava.io.tmpdir=./java_tmp -jar ~/bin/gatk3.8.0/GenomeAnalysisTK.jar -T HaplotypeCaller --num_cpu_threads_per_data_thread 40 -R VS1.final.fa -I sample.sort.markdup.bam -ERC GVCF -variant_index_type LINEAR -variant_index_parameter 128000 -A StrandOddsRatio -A Coverage -A QualByDepth -A FisherStrand -A MappingQualityRankSumTest -A ReadPosRankSumTest -A RMSMappingQuality -o sample.g.vcf.gz

#joint-genotyping analysis of the gVCFs (-L 1..19)
time java -Xmx12g -Djava.io.tmpdir=./java_tmp -jar ~/bin/gatk3.8.0/GenomeAnalysisTK.jar -T CombineGVCFs -R VS1.final.fa -V sample1.g.vcf.gz -V sample2.g.vcf.gz -L 1 -o chr.1.g.vcf.gz

time java -Xmx12g -Djava.io.tmpdir=./java_tmp -jar ~/bin/gatk3.8.0/GenomeAnalysisTK.jar -T GenotypeGVCFs -R VS1.final.fa -V chr.1.g.vcf.gz -L 1 -o chr.1.raw.vcf.gz

~/bin/gatk SelectVariants -select-type SNP -V Grape.1.raw.vcf.gz -O Grape.1.snp.vcf.gz

~/bin/gatk SelectVariants -select-type INDEL -V Grape.1.raw.vcf.gz -O Grape.1.indel.vcf.gz

5. SNP and INDEL filtering
~/bin/gatk -V Grape.1.snp.vcf.gz -filter "QD < 2.0" --filter-name "QD2" -filter "QUAL < 30.0" --filter-name "QUAL30" -filter "SOR > 3.0" --filter-name "SOR3" -filter "FS > 60.0" --filter-name "FS60" -filter "MQ <40.0" --filter-name "MQ40" -filter "MQRankSum < -10.0" --filter-name "MQRankSum-10" -filter "ReadPosRankSum < -8.0" --filter-name ReadPosRankSum-8 -O Grape.1.fliter.snp.vcf.gz

~/bin/vcftools --gzvcf Grape.1.fliter.snp.vcf.gz --remove-filtered-all --recode --stdout | gzip -c Grape.1.pass.snp.vcf.gz

~/bin/vcftools --gzvcf Grape.1.pass.snp.vcf.gz --min-alleles 2 --max-alleles 2 --recode --stdout | gzip -c Grape.1.bi.snp.vcf.gz

~/bin/vcftools --gzvcf Grape.1.bi.snp.vcf.gz --max-missing 0.4 --maf 0.005 --recode --stdout | gzip -c >Grape.1.bi.maf0.005mis0.4.snp.vcf.gz

~/bin/vcftools --gzvcf Grape.1.bi.snp.vcf.gz --TsTv-summary --out Grape.1.bisnp

~/bin/gatk -V Grape.1.max40.indel.vcf.gz -filter "QD < 2.0" --filter-name "QD2" -filter "QUAL < 30.0" --filter-name "QUAL30" -filter "SOR > 5.0" --filter-name "SOR5" -filter "FS > 100.0" --filter-name "FS100" -filter "InbreedingCoeff <-0.8" --filter-name "InbreedingCoeff-0.8" -O Grape.1.filter.indel.vcf.gz

~/bin/vcftools --gzvcf --remove-filtered-all --recode --stdout | gzip -c >Grape.1.pass.indel.vcf.gz


6. SNP Validation
## CHIP and previous 472 SNP validation
perl get_snpfa.pl PN40024.fa chip.info 60
~/bin/makeblastdb -in VS1.final.fa -dbtype nucl -out VS1.final.fa

~/bin/blastn -task megablast -use_index true -db VS1.final.fa -query chip.snp.fa -outfmt 6 -out chip.snp.megablast.out

## 59 Chasselas clones validation
~/bin/gatk Mutect2 -R VS1.final.fa -I sample.sort.markdup.bam --independent-mates true -tumor sample -I 229.sort.markdup.bam -normal 229 -O sample.vcf.gz

~/bin/gatk FilterMutectCalls -R VS1.final.fa -V sample.vcf.gz -O sample.filter.vcf.gz

~/bin/gatk SelectVariants -select-type SNP -V sample.filter.vcf.gz -O sample.filter.snp.vcf.gz
~/bin/vcftools --gzvcf sample.filter.snp.vcf.gz --remove-filtered-all --recode --stdout | gzip -c > sample.pass.snp.vcf.gz

7. SNP annotation
## ANNOVAR annotation
perl ~/bin/annovar/convert2annovar.pl -format vcf4 -allsample --withfreq Grape.filter.snp.vcf.gz >Grape.filter.snp.annovar

perl ~/bin/annovar/annotate_variation.pl --buildver VS1 --geneanno Grape.filter.snp.annovar -outfile Grape.snp annovar/vvdb/

## Function annotation by Provean
~/bin/provean-1.1.5/provean.sh -q variation.fasta -v variation.var --tmp_dir tmp_dir -V --num_threads 472

get_snpfa.pl

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!usr/bin/perl -w
use strict;
use FileHandle;
die "\n Usage perl $0 <fa><snp><length>\n" unless (@ARGV==3);
$/="\>";
open (AF,"$ARGV[0]")||die "$!";
<AF>;
my %hash;
while (<AF>)
{chomp;
my @aa=split /\n/,$_;
my $chr=shift @aa;
$chr=(split /\s+/,$chr)[0];
#print "$chr\n";
my $seq=join"",@aa;
$hash{$chr}=$seq;
}
close AF;

$/="\n";
open (BF,"$ARGV[1]")||die "$!";
#open (OUT,">$ARGV[2]")||die "$!";
my %fh;
<BF>;
my $le=$ARGV[2];
while (<BF>)
{chomp;
my @aa=split;
my $start;
my $sub1=0;
my $sub2=0;
if ($aa[1]<$le+1)
{$sub1=$aa[1]-1; $start=0;
$sub2=100;}
elsif ((length $hash{$aa[0]})-$aa[1]<$le)
{$sub2=(length $hash{$aa[0]})-$aa[1];
$sub1=$le;
$start=$aa[1]-$le-1;}
else{$sub1=$le;$sub2=$le;$start=$aa[1]-$le-1;}

my $seq1=substr $hash{$aa[0]},$start,$sub1;
my $mid=substr $hash{$aa[0]},$aa[1]-1,1;
my $seq2=substr $hash{$aa[0]},$aa[1],$sub2;
my $alle=(split /,/,$aa[3])[0];
my $name=$aa[0]."_".$aa[1]."_".$aa[2]."_".$alle;
$alle="[".$aa[2]."/".$alle."]";
if (!exists $fh{$aa[0]})
{$fh{$aa[0]}=FileHandle -> new(">$aa[0].info");}
if ($mid ne $aa[2])
{print "$name error\n";}
else
{$fh{$aa[0]} ->print("$name\t$seq1$alle$seq2\n");}
}
close BF;
foreach (keys %fh)
{$fh{$_} -> close();}

Genetic clonal accessions

1
2
3
4
5
6
7

Genetic clonal accessions
1. prepare data
~/bin/plink --allow-extra-chr --recode --chr-set 19 --vcf filter.mindepth7.vcf.gz --make-bed --out mindepth7

2. run snpduo
~/bin/snpduo/snpduo --file mindepth7 --calculated --counts --out mindepth7

系统发育树

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
1. Filter snp by SNPhylo
~/bin/snphylo.sh -v Grape.vcf -r -A -b

2. Run RAxML-NG
~/bin/raxml-ng_v0.9.0_linux_x86_64_MPI/bin/raxml-ng --msa snphylo.output.phylip.txt --parse --model GTR+G

~/bin/raxml-ng_v0.9.0_linux_x86_64_MPI/bin/raxml-ng --search --msa Grape_raxml.raxml.rba --tree pars{1} --prefix CT$i --seed $i --threads 13

~/bin/raxml-ng_v0.9.0_linux_x86_64_MPI/bin/raxml-ng --search --msa Grape_raxml.raxml.rba --tree rand{1} --prefix CT$i --seed $i --threads 13

grep "Final LogLikelihood" CT*.raxml.log | sort -k 3

## bootstrap
~/bin/raxml-ng_v0.9.0_linux_x86_64_MPI/bin/raxml-ng --bootstrap --msa Grape_raxml.raxml.rba --bs-trees 100 --prefix CB$i --seed $i --threads 2

~/bin/raxml-ng_v0.9.0_linux_x86_64_MPI/binraxml-ng --bsconverge --bs-trees allbootstraps --prefix CS --seed 2 --threads 1

~/bin/raxml-ng_v0.9.0_linux_x86_64_MPI/binraxml-ng --support --tree best.tre --bs-trees allbootstraps --prefix CS --threads 1

SplitsTree

1
2
3
4
5
6
7
8
9
10
11
1. prepare data
vcftools --gzvcf grape.all.bi.snp.vcf.gz --keep grape.core_2448.txt --max-missing 0.8 --maf 0.05 --recode --stdout | gzip -c > grape.core.vcf.gz
plink --vcf grape.core.vcf.gz --set-missing-var-ids @:# --make-bed --out grape.core
plink --bfile grape.core --indep-pairwise 50 1 0.1 -out grape.core.pruning
plink --bfile grape.core --extract grape.core.pruning.prune.in --recode vcf-iid --out grape.core.pruned

# vcf to phylip, download from https://github.com/edgardomortiz/vcf2phylip
python vcf2phylip.py -i grape.core.pruned.vcf --nexus

2. run splitstree
SplitsTree -g -c cmd.txt

CMD

1
2
3
4
5
6
begin SplitsTree;
LOAD FILE=grape.core.pruned.min4.nex;
EXECUTE FILE=grape.core.pruned.min4.nex;
SAVE FILE=grape.core.pruned.min4.splitstree.nex REPLACE=yes;
QUIT;
end

vcf2phylip.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
#!/usr/bin/env python2
# -*- coding: utf-8 -*-



'''
The script converts a collection of SNPs in VCF format into a PHYLIP, FASTA,
NEXUS, or binary NEXUS file for phylogenetic analysis. The code is optimized
to process VCF files with sizes >1GB. For small VCF files the algorithm slows
down as the number of taxa increases (but is still fast).

Any ploidy is allowed, but binary NEXUS is produced only for diploid VCFs.
'''



__author__ = "Edgardo M. Ortiz"
__credits__ = "Juan D. Palacio-Mejía"
__version__ = "2.0"
__email__ = "e.ortiz.v@gmail.com"
__date__ = "2019-01-15"



import sys
import os
import argparse
import gzip



def main():
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("-i", "--input", action="store", dest="filename", required=True,
help="Name of the input VCF file, can be gzipped")
parser.add_argument("-m", "--min-samples-locus", action="store", dest="min_samples_locus", type=int, default=4,
help="Minimum of samples required to be present at a locus, default=4 since is the minimum for phylogenetics.")
parser.add_argument("-o", "--outgroup", action="store", dest="outgroup", default="",
help="Name of the outgroup in the matrix. Sequence will be written as first taxon in the alignment.")
parser.add_argument("-p", "--phylip-disable", action="store_true", dest="phylipdisable", default=False,
help="A PHYLIP matrix is written by default unless you enable this flag")
parser.add_argument("-f", "--fasta", action="store_true", dest="fasta", default=False,
help="Write a FASTA matrix, disabled by default")
parser.add_argument("-n", "--nexus", action="store_true", dest="nexus", default=False,
help="Write a NEXUS matrix, disabled by default")
parser.add_argument("-b", "--nexus-binary", action="store_true", dest="nexusbin", default=False,
help="Write a binary NEXUS matrix for analysis of biallelic SNPs in SNAPP, disabled by default")
args = parser.parse_args()



filename = args.filename

# Prepare the opener if the SAM file is compressed
if filename.endswith(".gz"):
opener = gzip.open
else:
opener = open

min_samples_locus = args.min_samples_locus
outgroup = args.outgroup
phylipdisable = args.phylipdisable
fasta = args.fasta
nexus = args.nexus
nexusbin = args.nexusbin



# Dictionary of IUPAC ambiguities for nucleotides
# '*' means deletion for GATK (and other software?)
# Deletions are ignored when making the consensus
# Dictionary to translate IUPAC ambiguities, lowercase letters are used when "*" or "N" were present for a position,
# however, software like Genious for example are case insensitive and will imply ignore capitalization
amb = {"*":"-", "A":"A", "C":"C", "G":"G", "N":"N", "T":"T",
"*A":"a", "*C":"c", "*G":"g", "*N":"n", "*T":"t",
"AC":"M", "AG":"R", "AN":"a", "AT":"W", "CG":"S",
"CN":"c", "CT":"Y", "GN":"g", "GT":"K", "NT":"t",
"*AC":"m", "*AG":"r", "*AN":"a", "*AT":"w", "*CG":"s",
"*CN":"c", "*CT":"y", "*GN":"g", "*GT":"k", "*NT":"t",
"ACG":"V", "ACN":"m", "ACT":"H", "AGN":"r", "AGT":"D",
"ANT":"w", "CGN":"s", "CGT":"B", "CNT":"y", "GNT":"k",
"*ACG":"v", "*ACN":"m", "*ACT":"h", "*AGN":"r", "*AGT":"d",
"*ANT":"w", "*CGN":"s", "*CGT":"b", "*CNT":"y", "*GNT":"k",
"ACGN":"v", "ACGT":"N", "ACNT":"h", "AGNT":"d", "CGNT":"b",
"*ACGN":"v", "*ACGT":"N", "*ACNT":"h", "*AGNT":"d", "*CGNT":"b",
"*ACGNT":"N"}



# Dictionary for translating biallelic SNPs into SNAPP, only for diploid VCF
# 0 is homozygous reference
# 1 is heterozygous
# 2 is homozygous alternative
gen_bin = {"./.":"?",
".|.":"?",
"0/0":"0",
"0|0":"0",
"0/1":"1",
"0|1":"1",
"1/0":"1",
"1|0":"1",
"1/1":"2",
"1|1":"2"}



############################
# Process header of VCF file
ploidy = 0
gt_idx = []
missing = ""

with opener(filename) as vcf:

# Create a list to store sample names
sample_names = []

# Keep track of longest sequence name for padding with spaces in the output file
len_longest_name = 0

# Look for the line in the VCF header with the sample names
for line in vcf:
if line.startswith("#CHROM"):

# Split line into fields
broken = line.strip("\n").split("\t")

# If the minimum-samples-per-locus parameter is larger than the number of
# species in the alignment make it the same as the number of species
if min_samples_locus > len(broken[9:]):
min_samples_locus = len(broken[9:])

# Create a list of sample names and the keep track of the longest name length
for i in range(9, len(broken)):
name_sample = broken[i].replace("./","") # GATK adds "./" to sample names sometimes
sample_names.append(name_sample)
len_longest_name = max(len_longest_name, len(name_sample))

# Find out the ploidy of the genotypes, just distinguishes if sample is not haploid vs n-ploid
elif not line.startswith("#") and line.strip("\n") != "":
while ploidy == 0 and missing == "":
broken = line.strip("\n").split("\t")
for j in range(9, len(broken)):
if ploidy == 0:
if broken[j].split(":")[0][0] != ".":
ploidy = (len(broken[j].split(":")[0]) // 2) + 1
gt_idx = [i for i in range(0, len(broken[j].split(":")[0]), 2)]
missing = "/".join(["." for i in range(len(gt_idx))])
# print missing
# print broken[j]
# print gt_idx
# print ploidy
break

vcf.close()

print("\nConverting file " + filename + ":\n")
print("Number of samples in VCF: " + str(len(sample_names)))


####################
# SETUP OUTPUT FILES
# Output filename will be the same as input file, indicating the minimum of samples specified
if filename.endswith(".gz"):
outfile = filename.replace(".vcf.gz",".min"+str(min_samples_locus))
else:
outfile = filename.replace(".vcf",".min"+str(min_samples_locus))

# We need to create an intermediate file to hold the sequence data
# vertically and then transpose it to create the matrices
if fasta or nexus or not phylipdisable:
temporal = open(outfile+".tmp", "w")

# if binary NEXUS is selected also create a separate temporal
if nexusbin:
if ploidy == 2:
temporalbin = open(outfile+".bin.tmp", "w")
else:
print("Binary NEXUS not available for "+str(ploidy)+"-ploid VCF.")
nexusbin = False



##################
# PROCESS VCF FILE
index_last_sample = len(sample_names)+9

# Start processing SNPs of VCF file
with opener(filename) as vcf:

# Initialize line counter
snp_num = 0
snp_accepted = 0
snp_shallow = 0
snp_multinuc = 0
snp_biallelic = 0
while 1:

# Load large chunks of file into memory
vcf_chunk = vcf.readlines(50000)
if not vcf_chunk:
break

# Now process the SNPs one by one
for line in vcf_chunk:
if not line.startswith("#") and line.strip("\n") != "": # pyrad sometimes produces an empty line after the #CHROM line

# Split line into columns
broken = line.strip("\n").split("\t")
for g in range(9,len(broken)):
if broken[g].split(":")[0][0] == ".":
broken[g] = missing

# Keep track of number of genotypes processed
snp_num += 1

# Print progress every 500000 lines
if snp_num % 500000 == 0:
print(str(snp_num)+" genotypes processed.")

# Check if the SNP has the minimum of samples required
if (len(broken[9:]) - ''.join(broken[9:]).count(missing)) >= min_samples_locus:

# Check that ref genotype is a single nucleotide and alternative genotypes are single nucleotides
if len(broken[3]) == 1 and (len(broken[4])-broken[4].count(",")) == (broken[4].count(",")+1):

# Add to running sum of accepted SNPs
snp_accepted += 1

# If nucleotide matrices are requested
if fasta or nexus or not phylipdisable:

# Create a dictionary for genotype to nucleotide translation
# each SNP may code the nucleotides in a different manner
nuc = {str(0):broken[3], ".":"N"}
for n in range(len(broken[4].split(","))):
nuc[str(n+1)] = broken[4].split(",")[n]

# Translate genotypes into nucleotides and the obtain the IUPAC ambiguity
# for heterozygous SNPs, and append to DNA sequence of each sample
site_tmp = ''.join([amb[''.join(sorted(set([nuc[broken[i][j]] for j in gt_idx])))] for i in range(9, index_last_sample)])

# Write entire row of single nucleotide genotypes to temporary file
temporal.write(site_tmp+"\n")

# print nuc
# print [i.split(":")[0] for i in broken[9:]]
# print site_tmp

# Write binary NEXUS for SNAPP if requested
if nexusbin:

# Check taht the SNP only has two alleles
if len(broken[4]) == 1:

# Add to running sum of biallelic SNPs
snp_biallelic += 1

# Translate genotype into 0 for homozygous ref, 1 for heterozygous, and 2 for homozygous alt
binsite_tmp = ''.join([(gen_bin[broken[i][0:3]]) for i in range(9, index_last_sample)])

# Write entire row to temporary file
temporalbin.write(binsite_tmp+"\n")

else:
# Keep track of loci rejected due to multinucleotide genotypes
snp_multinuc += 1
# Keep track of loci rejected due to exceeded missing data
snp_shallow += 1

else:
# Keep track of loci rejected due to exceeded missing data
snp_shallow += 1

# Print useful information about filtering of SNPs
print("Total of genotypes processed: " + str(snp_num))
print("Genotypes excluded because they exceeded the amount of missing data allowed: " + str(snp_shallow))
print("Genotypes that passed missing data filter but were excluded for not being SNPs: " + str(snp_multinuc))
print("SNPs that passed the filters: " + str(snp_accepted))
if nexusbin:
print("Biallelic SNPs selected for binary NEXUS: " + str(snp_biallelic))
print("")

vcf.close()
if fasta or nexus or not phylipdisable:
temporal.close()
if nexusbin:
temporalbin.close()



#######################
# WRITE OUTPUT MATRICES

if not phylipdisable:
output_phy = open(outfile+".phy", "w")
header_phy = str(len(sample_names))+" "+str(snp_accepted)+"\n"
output_phy.write(header_phy)

if fasta:
output_fas = open(outfile+".fasta", "w")

if nexus:
output_nex = open(outfile+".nexus", "w")
header_nex = "#NEXUS\n\nBEGIN DATA;\n\tDIMENSIONS NTAX=" + str(len(sample_names)) + " NCHAR=" + str(snp_accepted) + ";\n\tFORMAT DATATYPE=DNA" + " MISSING=N" + " GAP=- ;\nMATRIX\n"
output_nex.write(header_nex)

if nexusbin:
output_nexbin = open(outfile+".bin.nexus", "w")
header_nexbin = "#NEXUS\n\nBEGIN DATA;\n\tDIMENSIONS NTAX=" + str(len(sample_names)) + " NCHAR=" + str(snp_biallelic) + ";\n\tFORMAT DATATYPE=SNP" + " MISSING=?" + " GAP=- ;\nMATRIX\n"
output_nexbin.write(header_nexbin)


# Store index of outgroup in list of sample names
idx_outgroup = "NA"

# Write outgroup as first sequence in alignment if the name is specified
if outgroup in sample_names:
idx_outgroup = sample_names.index(outgroup)

if fasta or nexus or not phylipdisable:
with open(outfile+".tmp") as tmp_seq:
seqout = ""

# This is where the transposing happens
for line in tmp_seq:
seqout += line[idx_outgroup]

# Write FASTA line
if fasta:
output_fas.write(">"+sample_names[idx_outgroup]+"\n"+seqout+"\n")

# Pad sequences names and write PHYLIP or NEXUS lines
padding = (len_longest_name + 3 - len(sample_names[idx_outgroup])) * " "
if not phylipdisable:
output_phy.write(sample_names[idx_outgroup]+padding+seqout+"\n")
if nexus:
output_nex.write(sample_names[idx_outgroup]+padding+seqout+"\n")

# Print current progress
print("Outgroup, "+outgroup+", added to the matrix(ces).")

if nexusbin:
with open(outfile+".bin.tmp") as bin_tmp_seq:
seqout = ""

# This is where the transposing happens
for line in bin_tmp_seq:
seqout += line[idx_outgroup]

# Write line of binary SNPs to NEXUS
padding = (len_longest_name + 3 - len(sample_names[idx_outgroup])) * " "
output_nexbin.write(sample_names[idx_outgroup]+padding+seqout+"\n")

# Print current progress
print("Outgroup, "+outgroup+", added to the binary matrix.")


# Write sequences of the ingroup
for s in range(0, len(sample_names)):
if s != idx_outgroup:
if fasta or nexus or not phylipdisable:
with open(outfile+".tmp") as tmp_seq:
seqout = ""

# This is where the transposing happens
for line in tmp_seq:
seqout += line[s]

# Write FASTA line
if fasta:
output_fas.write(">"+sample_names[s]+"\n"+seqout+"\n")

# Pad sequences names and write PHYLIP or NEXUS lines
padding = (len_longest_name + 3 - len(sample_names[s])) * " "
if not phylipdisable:
output_phy.write(sample_names[s]+padding+seqout+"\n")
if nexus:
output_nex.write(sample_names[s]+padding+seqout+"\n")

# Print current progress
print("Sample "+str(s+1)+" of "+str(len(sample_names))+", "+sample_names[s]+", added to the nucleotide matrix(ces).")

if nexusbin:
with open(outfile+".bin.tmp") as bin_tmp_seq:
seqout = ""

# This is where the transposing happens
for line in bin_tmp_seq:
seqout += line[s]

# Write line of binary SNPs to NEXUS
padding = (len_longest_name + 3 - len(sample_names[s])) * " "
output_nexbin.write(sample_names[s]+padding+seqout+"\n")

# Print current progress
print("Sample "+str(s+1)+" of "+str(len(sample_names))+", "+sample_names[s]+", added to the binary matrix.")


if not phylipdisable:
output_phy.close()
if fasta:
output_fas.close()
if nexus:
output_nex.write(";\nEND;\n")
output_nex.close()
if nexusbin:
output_nexbin.write(";\nEND;\n")
output_nexbin.close()

if fasta or nexus or not phylipdisable:
os.remove(outfile+".tmp")
if nexusbin:
os.remove(outfile+".bin.tmp")


print( "\nDone!\n")


if __name__ == "__main__":
main()

PCA和群体结构

1
2
3
4
5
6
7
8
9
10
Principal Component Analysis and ADMIXTURE
1. PCA
~/bin/plink --allow-extra-chr --recode --chr-set 19 --vcf total.snp.vcf.gz --make-bed --out total.snp

~/bin/gcta64 --bfile total.snp --make-grm --autosome --out total.snp.grim

~/bin/gcta64 --grm --pca 3 --out total.snp.pca

2. ADMIXTURE
~/bin/admixture --cv=10 total.snp.bed 2 |tee log2.out

Archetypal Analysis

1
2
3
4
5
6
7
8
9
10
11
12
1. prepare data
vcftools --gzvcf grape.all.bi.snp.vcf.gz --keep grape.core_2448.txt --max-missing 0.4 --maf 0.05 --recode --stdout | gzip -c > grape.core.vcf.gz
plink --vcf grape.core.vcf.gz --set-missing-var-ids @:# --make-bed --out grape.core
plink --bfile grape.core --indep-pairwise 50 5 0.5 -out grape.core.pruning
plink --bfile grape.core --extract grape.core.pruning.prune.in --make-bed --out grape.core.pruned

2. run archetypal-analysis
for ((i=2;i<13;i++))
do mkdir /public/archetypal-analysis/k${i}
archetypal-analysis -i /public/archetypal-analysis/k${i}/grape.core.pruned.bed -o /public/archetypal-analysis/k${i} -k ${i} --tolerance 0.0001 --max_iter 400;
done

主要亚群鉴定

1
2
3
4
5
6
7
8
9
10
11
1. Linkage disequilibrium
~/bin/PopLDdecay -InVCF total.bi.snp.vcf.gz -MaxDist 1000 -OutStat total.ld

2. Nucleotide diversity
~/bin/vcftools --gzvcf Grape.snp.vcf.gz --window-pi 100000 --out Grape.pi

3. pairwise population fixation index
~/bin/vcftools --gzvcf Grape.snp.vcf.gz --weir-fst-pop pop1.txt --weir-fst-pop pop2.txt --fst-window-size 100000 --out grape.pop1_po2

4. individual heterozygosity
~/bin/vcftools --gzvcf Grape.snp.vcf.gz --het --out grape

Isolation-by-distance analysis

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
1. calculated fst
vcftools --gzvcf grape.all.bi.snp.vcf.gz --weir-fst-pop pop1.txt --weir-fst-pop pop2.txt --fst-window-size 100000 --out grape.pop1_po2

2. calculated haversine distances
library(geosphere)
distHaversine(c(longitude1,latitude1),c(longitude2,latitude2))

3. obtain linear regressions
library(ggplot2)
library(ggpmisc)
library(dplyr)
a=read.table("distance_fst.txt",header=TRUE,sep='\t')
p1<-ggplot(a,aes(x= Distance,y= Fst))+geom_point(shape=21,size=2,color="gray")+
theme_classic()+xlim(0,20000)+ylim(0,0.4)+
stat_smooth(method="lm",formula = y ~ x)

model.lm <- lm(Fst ~ Distance, data = a)
summary(model.lm)
group_l <- list(a = format(coef(model.lm)[1], digits = 4), b = format(abs(coef(model.lm)[2]), digits = 4), r2 = format(summary(model.lm)$r.squared, digits = 4), p = format(summary(model.lm)$coefficients[2,4], digits = 4))

#distance_fst.txt
#Country Distance Fst
#Afghanistan.Albania 4218.852 0.0904883
#Afghanistan.Algeria 6230.262 0.0478774


4. mantel test
library(ade4)
fst <- read.table("fst.matrix.txt",header=TRUE)
distance <- read.table("distance.matrix.txt",header=TRUE)
fst_dist=as.dist(fst)
distance_dist=as.dist(distance)
mantel.rtest(distance_dist, fst_dist, nrepet = 9999)

生态位建模

1
2
3
4
5
1. ENMeval test 
Rscript enmevaluate.R

2. run MaxEnt
According to the result of ENMeval test, MaxEnt runs with the suggested parameters.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
setwd("/public/paleoclimate_data/LH_v1_2_5m")
#Don't forget to set your working directory!

#only need to run this code the first time you install these packages on your machine/user account.

#install.packages("rJava")
#install.packages("ENMeval")

##installing rJava can be tricky. If you're experiencing problems installing either rJava or
##ENMeval, it's the installation of rJava that is very likely your problem. Keeping in mind
##that these instructions are for a PC, visit this website for troubleshooting:
##https://cimentadaj.github.io/blog/2018-05-25-installing-rjava-on-windows-10/installing-rjava-on-windows-10/
options(java.parameters = "-Xmx100g" )
library(rJava)
library(ENMeval)
library(raster)

#Find where the java directory is for the dismo package using the command below. Then,
#you need to put the file maxent.jar into the indicated directory. You'll need to do
#this second step by hand (not using r)

system.file("java", package="dismo")

#put here the names of your environmental layers, following the pattern below:
bio1 <- raster("bio_1.asc")
bio2 <- raster("bio_2.asc")
bio3 <- raster("bio_3.asc")
bio4 <- raster("bio_4.asc")
bio5 <- raster("bio_5.asc")
bio6 <- raster("bio_6.asc")
bio7 <- raster("bio_7.asc")
bio8 <- raster("bio_8.asc")
bio9 <- raster("bio_9.asc")
bio10 <- raster("bio_10.asc")
bio11 <- raster("bio_11.asc")
bio12 <- raster("bio_12.asc")
bio13 <- raster("bio_13.asc")
bio14 <- raster("bio_14.asc")
bio15 <- raster("bio_15.asc")
bio16 <- raster("bio_16.asc")
bio17 <- raster("bio_17.asc")
bio18 <- raster("bio_18.asc")
bio19 <- raster("bio_19.asc")


#Do what's called "stacking" the rasters together into a single r object

env <- stack(bio1, bio2, bio3, bio4, bio5, bio6, bio7, bio8, bio9, bio10, bio11, bio12, bio13, bio14, bio15, bio16, bio17, bio18, bio19)

#Display the stacked environment layer. Make a note of the position in the list
#of any categorical variables (do that by hand)

env

#in this example, the categorical variables are #s 9 and 10 in the list. But know your own data!

#load in your occurrence points

occ <- read.csv("grape.csv")[,-1]

#check how many potential background points you have available

length(which(!is.na(values(subset(env, 1)))))

#If this number is far in excess of 10,000, then use 10,000 background points.
#If this number is comprable to, or smaller than 10,000, then use 5,000, 1,000, 500,
#or even 100 background points. The number of available non-NA spaces should
#be well in excess of the number of background points used.

#For the evalution below, we need to convert the bias object into another format.
#The code is set up to sample 5,000 background points. It would be better if we
#could sample 10,000 background points, but there are not enough places available.
#If we could change it to 10,000 background points we would change the ", 5000," to ",10000,"

#run the evaluation

##This run uses the "randomkfold" method of cross-validation and 10 cross-validation folds.
##There are two categorical variables: they are numbers 9 and 10 in the list of environmental
##variables from the stacked raster object.

enmeval_results <- ENMevaluate(occ, env, method="jackknife", n.bg=10000, algorithm='maxent.jar')

enmeval_results@results

write.csv(enmeval_results@results, "grape_enmeval_results.csv")

#If you were to use the block method, you would replace:
#method="randomkfold", kfolds = 10
#with:
#method = "block"

#############################
##########IMPORTANT##########
#############################
#If you have fewer than 50 occurrence points, you will need to use the "jackknife" method of
#model validation instread. To do that, you would replace:
#method="randomkfold", kfolds = 10
#with:
#method = "jackknife"

#If there are no categorical variables in your dataset, you would get rid of:
#categoricals=c(9,10)
#In general, be very careful that the categoricals argument points to the right variable(s).

MSMC2

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
1. prepare input data
1.1 generate the mask for SNPable Regions
~/msmc-tools/seqbility-20091110/splitfa VS1.final.chr.fa 35 >VS1.splitfa.35.fa
bwa aln -t 50 -R 1000000 -O 3 -E 3 VS1.final.chr.fa VS1.splitfa.35.fa >VS1.splitfa.35.sai
bwa samse -f VS1.splitfa.35.sam VS1.final.chr.fa VS1.splitfa.35.sai VS1.splitfa.35.fa
perl ~/msmc-tools/seqbility-20091110/gen_raw_mask.pl VS1.splitfa.35.sam >VS1.rawMask_35.fa
~/msmc-tools/seqbility-20091110/gen_mask -l 35 -r 0.5 VS1.rawMask_35.fa >VS1.mask_35_50.fa
python makeMappabilityMask.py

1.2 estimate the mean coverage
for i in `cat sample.txt`;
do for ((j=1;j<20;j++));
samtools depth -r ${j} ${i}.bam |awk '{sum += $3} END {print sum / NR}' >>${i}.depth.txt;
done
done

1.3 calling vcf
for i in `cat sample.txt`;
do for ((j=1;j<20;j++));
do ~/samtools-0.1.19/samtools mpileup -q 20 -Q 20 -C 50 -u -r ${j} -f /public/Genome/VS1.final.fa ${i}.bam | ~/samtools-0.1.19/bcftools/bcftools view -cgI - | ~/msmc-tools/bamCaller.py <mean_cov> ${i}.chr${j}.mask.bed.gz |gzip -c > ${i}.chr${j}.vcf.gz
done
done

1.4 phase vcf
for ((i=1;i<20;i++));
do java -Xmx100g -jar ~/beagle/beagle_4.1.jar gtgl=/public/grape.chr${i}.vcf.gz out=grape.chr$i.imp gprobs=true nthreads=20;
~/shapeit/bin/shapeit -T 10 -V grape.chr$i.imp.vcf.gz --window 0.5 -O grape.chr$i.imp.phased;
~/shapeit/bin/shapeit -convert --input-haps grape.chr$i.imp.phased --output-ref grape.chr$i.phased.haplotypes grape.chr$i.phased.legend grape.chr$i.phased.sample --output-vcf grape.chr$i.phased.vcf
done

1.5 phase a single-sample vcf file against a reference panel.
for i in `cat sample.txt`;
do for ((j=1;j<20;j++));
do ~/msmc-tools/run_shapeit.sh ${i}.chr${j}.vcf.gz tmp ${j}; #need change the location of dir of reference panel in file run_shapeit.sh.
done
done

1.6 generate input files for msmc for one chromosome.
for ((i=1;i<20;i++));
do ~/msmc-tools/generate_multihetsep.py --mask=sample1.chr${i}.mask.bed.gz --mask=sample2.chr${i}.mask.bed.gz --mask=sample3.chr${i}.mask.bed.gz --mask=sample4.chr${i}.mask.bed.gz --mask=/public/SNPable/VS1.chr${i}.mask.bed sample1.chr${i}.phased.vcf.gz sample2.chr${i}.phased.vcf.gz sample3.chr${i}.phased.vcf.gz sample4.chr${i}.phased.vcf.gz >multihetsep.sample1.sample2.sample3.sample4.chr${i}.txt
done

2. population size estimation
~/msmc2/build/release/msmc2 -t 19 -o sample1.sample2.sample3.sample4.msmc2.out {multihetsep.sample1.sample2.sample3.sample4.chr1.txt to multihetsep.sample1.sample2.sample3.sample4.chr19.txt}

3. cross-population analysis
~/msmc2/build/release/msmc2 -t 19 -s -I 0,1,2,3 -o sample1.sample2.sample3.sample4.pop1.msmc2.out {multihetsep.sample1.sample2.sample3.sample4.chr1.txt to multihetsep.sample1.sample2.sample3.sample4.chr19.txt}
~/msmc2/build/release/msmc2 -t 19 -s -I 4,5,6,7 -o sample1.sample2.sample3.sample4.pop2.msmc2.out {multihetsep.sample1.sample2.sample3.sample4.chr1.txt to multihetsep.sample1.sample2.sample3.sample4.chr19.txt}
~/msmc2/build/release/msmc2 -t 19 -s -I 0-4,0-5,0-6,0-7,1-4,1-5,1-6,1-7,2-4,2-5,2-6,2-7,3-4,3-5,3-6,3-7 -o sample1.sample2.sample3.sample4.across.msmc2.out {multihetsep.sample1.sample2.sample3.sample4.chr1.txt to multihetsep.sample1.sample2.sample3.sample4.chr19.txt}
~/msmc-tools/combineCrossCoal.py sample1.sample2.sample3.sample4.across.msmc2.out sample1.sample2.sample3.sample4.pop1.msmc2.out sample1.sample2.sample3.sample4.pop2.msmc2.out >sample1.sample2.sample3.sample4.combined.msmc2.final.txt

makeMappabilityMask.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/usr/bin/env python

import gzip
import sys

class MaskGenerator:
def __init__(self, filename, chr):
self.lastCalledPos = -1
self.lastStartPos = -1
sys.stderr.write("making mask {}\n".format(filename))
# self.file = gzip.open(filename, "w")
self.file = open(filename, "w")
self.chr = chr

# assume 1-based coordinate, output in bed format
# shouldd be 0-based ##dsc
def addCalledPosition(self, pos):
if self.lastCalledPos == -1:
self.lastCalledPos = pos
self.lastStartPos = pos
elif pos == self.lastCalledPos + 1:
self.lastCalledPos = pos
else:
self.file.write("{}\t{}\t{}\n".format(self.chr, self.lastStartPos - 1, self.lastCalledPos))
self.lastStartPos = pos
self.lastCalledPos = pos

with open("/public/SNPable/VS1.mask_35_50.fa", "r") as f:
for line in f:
if line.startswith('>'):
chr = line.split()[0][1:]
mask = MaskGenerator("/public/SNPable/VS1.chr{}.mask.bed".format(chr), chr)
pos = 0
continue
for c in line.strip():
pos += 1
if pos % 1000000 == 0:
sys.stderr.write("processing pos:{}\n".format(pos))
if c == "3":
mask.addCalledPosition(pos)

Stairway plot

1
2
3
4
5
6
7
8
9
1. filter SNPs
vcftools --gzvcf grape.snp.vcf.gz --keep group.txt --max-missing 1 --mac 1 --bed VS1.SNPable.bed.exclude_cds.bed --recode --stdout | gzip -c > grape.group.vcf.gz

2. construction of the site frequency spectrum
~/easySFS/easySFS.py -i grape.group.vcf.gz -p pop.group.txt --proj={2*sample size} -a -o output_group --prefix group

3. run stairway_plot
java -cp stairway_plot_es Stairbuilder group.blueprint
sh group.blueprint.sh

group.blueprint

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#example blueprint file
#input setting
popid: group # id of the population (no white space)
nseq: 102 # number of sequences
L: 257128778 # total number of observed nucleic sites, including polymorphic and monomorphic #278161037 SNPable
whether_folded: true # whethr the SFS is folded (true or false)
SFS: 689610 445144 318665 270834 237351 208996 181977 177559 163511 152628 138020 136121 126102 119257 112212 104448 99962 97062 92585 86347 82904 78861 75324 73939 72378 69395 66293 63918 64790 62330 59988 58041 56322 56471 55612 53506 52584 53598 53513 51691 51295 50252 50349 51070 51566 50365 50424 51334 51405 51300 30261
smallest_size_of_SFS_bin_used_for_estimation: 2 # default is 1; to ignore singletons, uncomment this line and change this number to 2
largest_size_of_SFS_bin_used_for_estimation: 51 # default is nseq/2 for folded SFS
pct_training: 0.67 # percentage of sites for training
nrand: 25 50 75 100 # number of random break points for each try (separated by white space)
project_dir: group # project directory
stairway_plot_dir: stairway_plot_es # directory to the stairway plot files
ninput: 200 # number of input files to be created for each estimation
#random_seed: 6
#output setting
mu: 5.4e-9 # assumed mutation rate per site per generation
year_per_generation: 3 # assumed generation time (in years)
#plot setting
plot_title: group # title of the plot
xrange: 0,0 # Time (1k year) range; format: xmin,xmax; "0,0" for default
yrange: 0,0 # Ne (1k individual) range; format: xmin,xmax; "0,0" for default
xspacing: 2 # X axis spacing
yspacing: 2 # Y axis spacing
fontsize: 12 # Font size

Momi2

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
1. generate input data
for ((i=1;i<20;i++));
do vcftools --gzvcf grape.snp.chr${i}.vcf.gz --keep group.txt --max-missing 1 --mac 1 --bed VS1.SNPable.bed.exclude_cds.bed.chr${i} --recode --stdout > grape.group.chr${i}.vcf;
bgzip grape.group.chr${i}.vcf;
tabix grape.group.chr${i}.vcf.gz;
python -m momi.read_vcf grape.group.chr${i}.vcf.gz pop.txt grape.group.chr${i}.snpAlleleCounts.gz --bed VS1.SNPable.bed.exclude_cds.bed.chr${i} --no_aa
done

python -m momi.extract_sfs sfs.gz 100 *.snpAlleleCounts.gz

2. run momi2, repeat 20 times.
python momi2.py

3. run bootstrap, repeat 100 times.
python momi2.bootstrap.py

pop.txt

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
IS97	WEUE
IS103 WEUE
IS95 WEUE
IS135 WEUE
IS99 WEUE
Fre61 CEUG
IS52 CEUG
IS65 CEUG
IS39 CEUG
198-A CEUG
Spa10 CEUA
Spa27 CEUA
Spa7 CEUA
PO120 CEUA
PO113 CEUA
539 WEUG
559 WEUG
581 WEUG
554 WEUG
Fre305 WEUG

momi2.bootstrap.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import matplotlib.pyplot as plt
import momi
import logging
import pickle
import dill

logging.basicConfig(level=logging.INFO, filename="tutorial.bootstrap.log")

sfs = momi.Sfs.load("sfs.gz")

add_pulse_model = dill.load(open('best_run/add_pulse_model.txt', 'rb'))
results = pickle.load(open('best_run/results.txt', 'rb'))
best_result = pickle.load(open('best_run/best_result.txt', 'rb'))

add_pulse_model.set_params(best_result.parameters)

n_bootstraps = 1
# make copies of the original models to avoid changing them
#no_pulse_copy = no_pulse_model.copy()
add_pulse_copy = add_pulse_model.copy()

bootstrap_results = []
for i in range(n_bootstraps):
print(f"Fitting {i+1}-th bootstrap out of {n_bootstraps}")
# resample the data
resampled_sfs = sfs.resample()
# tell models to use the new dataset
#no_pulse_copy.set_data(resampled_sfs)
add_pulse_copy.set_data(resampled_sfs)
# choose new random parameters for submodel, optimize
#no_pulse_copy.set_params(randomize=True)
#no_pulse_copy.optimize()
# initialize parameters from submodel, randomizing the new parameters
add_pulse_copy.set_params(randomize=True) #no_pulse_copy.get_params(),
add_pulse_copy.optimize()
bootstrap_results.append(add_pulse_copy.get_params())

pickle.dump(bootstrap_results, open('bootstrap_results.txt', 'wb'))

momi2.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import matplotlib.pyplot as plt
import momi
import logging
import pickle
import dill
logging.basicConfig(level=logging.INFO, filename="tutorial.log")

sfs = momi.Sfs.load("sfs.gz")
add_pulse_model = momi.DemographicModel(N_e=1e3, gen_time=3, muts_per_gen=5.4e-9)
add_pulse_model.set_data(sfs)

add_pulse_model.add_size_param("n_weug",lower=500,upper=1e6)
add_pulse_model.add_size_param("n_weue",lower=500,upper=1e6)
add_pulse_model.add_size_param("n_ceug",lower=500,upper=1e6)
add_pulse_model.add_size_param("n_ceua",lower=500,upper=1e6)

add_pulse_model.add_time_param("t_weue_ceug", lower=8.95e3,upper=1.54e4)
add_pulse_model.add_time_param("t_ceug_ceua", lower=7.74e3,upper=1.14e4,upper_constraints=["t_weue_ceug"])
add_pulse_model.add_time_param("t_weue_weug", lower=3.69e5,upper=3.98e5)

add_pulse_model.add_leaf("CEUA", N="n_ceua")
add_pulse_model.add_leaf("CEUG", N="n_ceug")
add_pulse_model.add_leaf("WEUE", N="n_weue")
add_pulse_model.add_leaf("WEUG", N="n_weug")


add_pulse_model.move_lineages("CEUA", "CEUG", t="t_ceug_ceua",p=1)
add_pulse_model.move_lineages("CEUG", "WEUE", t="t_weue_ceug",p=1)
add_pulse_model.move_lineages("WEUG", "WEUE", t="t_weue_weug",p=1)

#adding a WEUG->CEUA migration arrow.
add_pulse_model.add_pulse_param("weug_ceua_pulse", upper=.5)
add_pulse_model.add_time_param("t_weug_ceua_pulse", upper_constraints=["t_ceug_ceua"])
add_pulse_model.move_lineages("CEUA", "WEUG", t="t_weug_ceua_pulse", p="weug_ceua_pulse")

results = []
n_runs = 1
for i in range(n_runs):
print(f"Starting run {i+1} out of {n_runs}...")
add_pulse_model.set_params(
# parameters inherited from no_pulse_model are set to their previous values
# no_pulse_model.get_params(),
# other parmaeters are set to random initial values
randomize=True)
# results.append(add_pulse_model.optimize(method="L-BFGS-B",options={"maxiter":200}))
results.append(add_pulse_model.optimize(method="TNC",options={"maxiter":200}))

dill.dump(add_pulse_model, open('add_pulse_model.txt', 'wb'))
pickle.dump(results, open('results.txt', 'wb'))
best_result = results[0]
print(best_result)
pickle.dump(best_result, open('best_result.txt', 'wb'))

add_pulse_model.set_params(best_result.parameters)

yticks = [1e3,1e4, 2.5e4, 5e4, 7.5e4, 1e5, 2.5e5, 5e5]
fig = momi.DemographyPlot(
add_pulse_model, ["WEUE", "CEUG", "CEUA","WEUG"],
figsize=(8,10),
major_yticks=yticks,
linthreshy=1e3, pulse_color_bounds=(0,.5))
plt.savefig("WEUE_CEUG_CEUA_WEUG.pdf")

#fig.draw_N_legend(loc="upper right")

选择扫描信号

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
1. calculated Fst
vcftools --gzvcf grape.all.vcf.gz --weir-fst-pop Syl-E1.txt --weir-fst-pop CG1.txt --fst-window-size 50000 --fst-window-step 10000 --out grape.Syl-E1.CG1
vcftools --gzvcf grape.all.vcf.gz --weir-fst-pop Syl-E2.txt --weir-fst-pop CG2.txt --fst-window-size 50000 --fst-window-step 10000 --out grape.Syl-E2.CG2

2. calculated pi
for i in {Syl-E1,Syl-E2,CG1,CG2};
do vcftools --gzvcf grape.${i}.vcf.gz --window-pi 50000 --window-pi-step 10000 --out grape.${i}
done

3. get selective regions.
python extract_signification.py grape.Syl-E1.CG1.fst_pi grape.Syl-E1.CG1.fst_pi.txt grape.Syl-E1.CG1.fst_pi.plot 0.05
python extract_signification.py grape.Syl-E2.CG2.fst_pi grape.Syl-E2.CG2.fst_pi.txt grape.Syl-E2.CG2.fst_pi.plot 0.05

# head -3 grape.Syl-E1.CG1.fst_pi
#CHROM BIN_START BIN_END WEIGHTED_FST Syl-E1 CG1
#1 1 50000 0.0475312 0.00365458 0.0043379
#1 10001 60000 0.050045 0.00370561 0.00445387

extract_signification.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/python
import sys
import commands
import os
import math
import numpy as np
if len(sys.argv) <> 5:
print "Usage: " + sys.argv[0] + " input output_signification output_to_plot signification_cutoff (0.05)"
sys.exit(-1)
file1=open(sys.argv[1],'r')
file2=open(sys.argv[2],'w')
file3=open(sys.argv[3],'w')
top=float(sys.argv[4])
file2.write('Chr\tBin_start\tBin_end\tFst\tPi_log2\n')
file3.write('Fst\tPi_log2\n')
head=file1.readline()
fst_list=[]
pi_list=[]
Line1=file1.readlines()
for line1 in Line1:
lin1=line1.strip('\n').split('\t')
fst_list.append(float(lin1[3]))
if float(lin1[4])==0 or float(lin1[5])==0:
pi_list.append(float(0))
else:
pi=np.log2(float(lin1[4])/float(lin1[5]))
pi_list.append(float(pi))
# file2.write('\t'.join(lin1[:4])+'\t'+str(pi)+'\n')
fst_list_sorted=sorted(fst_list,reverse=True)
fst_index=int(math.ceil(float(len(fst_list))*top))
fst_cut_off=fst_list_sorted[fst_index]
pi_list_sorted=sorted(pi_list,reverse=True)
pi_index=int(math.ceil(float(len(pi_list))*top))
pi_cut_off=pi_list_sorted[pi_index]
file_name=sys.argv[1]+'.threshold.txt'
file=open(file_name,'w')
file.write('Fst_cutoff\tPi_cutoff\n')
file.write(str(fst_cut_off)+'\t'+str(pi_cut_off)+'\n')
file.close()
for line1 in Line1:
lin1=line1.strip('\n').split('\t')
if float(lin1[4]) !=0 and float(lin1[5]) !=0:
pi=np.log2(float(lin1[4])/float(lin1[5]))
if float(lin1[3]) > fst_cut_off and float(pi) > pi_cut_off:
file2.write('\t'.join(lin1[:4])+'\t'+str(pi)+'\n')
for i in range(0,len(fst_list)):
file3.write(str(fst_list[i])+'\t'+str(pi_list[i])+'\n')
file1.close()
file2.close()
file3.close()

Treemix

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
1. generate input data
vcftools --gzvcf grape.snp.vcf.gz --keep group.txt --max-missing 1 --mac 1 --recode --stdout | gzip -c > grape.group.vcf.gz
file=grape.group
vcftools --gzvcf $file.vcf.gz --plink-tped --out $file
plink --tfile $file --make-bed --out $file
plink --bfile $file --indep-pairwise 50 1 0.5 -out $file.pruning
plink --bfile $file --extract $file.pruning.prune.in --make-bed --out $file.pruned
plink --bfile $file.pruned --recode tab transpose --out $file.pruned.subset
mv $file.pruned.subset.tfam $file.pruned.subset.tfam.old
cat group*.txt >pop.cov
python change_FID.py pop.cov $file.pruned.subset.tfam.old $file.pruned.subset.tfam
plink --tfile $file.pruned.subset --freq --noweb --missing --within pop.cov --out $file.pruned.subset
gzip $file.pruned.subset.frq.strat
python plink2treemix.py $file.pruned.subset.frq.strat.gz $file.pruned.subset.treemix.gz

# head -1 pop.cov
#group1 sample1 group1

2.run treemix (m=0 to m=10), 10 iterations per m:
for ((i=0;i<11;i++));
do for j in {1..10};
do s=$RANDOM;
~/treemix-1.12/bin/treemix -i grape.group.pruned.subset.treemix.gz -o grape.group.pruned.subset.stem.m${i}.${j} -global -m ${i} -k 500 -seed ${s} -bootstrap 1000 -root Syl-E1;
done
done

3. Estimating the optimal number of migration edges from Treemix
R:
library("OptM")
folder <- "treemix_Syl-E1"
test.optM = optM(folder)
plot_optM(test.optM, method = "Evanno",pdf="treemix.OptM.pdf")

change_FID.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#!/usr/bin/python
import sys
import commands
import os
if len(sys.argv) <> 4:
print "Usage: " + sys.argv[0] + " file1 file2"
sys.exit(-1)
file1=open(sys.argv[1],'r')
file2=open(sys.argv[2],'r')
file3=open(sys.argv[3],'w')
file1_dict={}
Line1=file1.readlines()
for line1 in Line1:
lin1=line1.split('\t')
file1_dict[lin1[1]]=lin1[0]
Line2=file2.readlines()
for line2 in Line2:
lin2=line2.split('\t')
group=file1_dict[lin2[0]]
file3.write(group+'\t'+'\t'.join(lin2[1:]))
file1.close()
file2.close()
file3.close()

Dsuite_admixr

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
1. filter SNPs
vcftools --gzvcf grape.snp.vcf.gz --keep group.txt --max-missing 1 --mac 1 --recode --stdout | gzip -c > grape.vcf.gz

2. run Dtrios
~/Dsuite/Build/Dsuite Dtrios -n grape grape.vcf.gz SETS.txt

#head -2 SETS.txt
ZZ01 Outgroup
TA-6228 WEUA

3. df and fdM statistics
~/Dsuite/Build/Dsuite Dinvestigate -w 50,5 grape.vcf.gz SETS.txt need_group.txt

4. f3 statistics
#convert vcf to eigenstrat format for ADMIXTOOLS
sh convertVCFtoEigenstrat.sh grape.vcf.gz

R:
library(admixr)
library(readr)
data_prefix <- "grape"
snps <- eigenstrat(data_prefix)
pops <- c("group1","group2","group3".......)
result <- f3(A = pops, B = pops, C = "rotundifolia", data = snps)
write.table(result,file="grape.f3statistics.txt",row.names=FALSE,sep="\t",quote=FALSE)

convertVCFtoEigenstrat.sh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/bin/bash

# Script to convert vcf to eigenstrat format for ADMIXTOOLS
# Written by Joana Meier
# It takes a single argument: the vcf file (can be gzipped) and
# optionally you can specify --renameScaff if you have scaffold names (not chr1, chr2...)

# Here, you can change the recombination rate which is currently set to 2 cM/Mb
rec=2

# It requires vcftools and admixtools

# for some clusters, it is needed to load these modules:
# module load gcc/4.8.2 vcftools openblas/0.2.13_seq perl/5.18.4 admixtools

renameScaff="FALSE"

# If help is requested
if [[ $1 == "-h" ]]
then
echo "Please provide the vcf file to parse, and optionally add --renameScaff if you have scaffolds instead of chromosomes"
echo "Usage: convertVCFtoEigenstrat.sh <vcf file> --renameScaff (note, the second argument is optional)"
exit 1

# If the second argument renameScaff is given, set it to True
elif [[ $2 == "--renameScaff" ]]
then
renameScaff="TRUE"

# If no argument is given or the second one is not -removeChr, give information and quit
elif [ $# -ne 1 ]
then
echo "Please provide the vcf file to parse, and optionally add --renameScaff if you have scaffolds instead of chromosomes"
echo "Usage: ./convertVCFtoEigenstrat.sh <vcf file> --renameScaff (note, the second argument is optional)"
exit 1
fi

# Set the first argument to the file name
file=$1
file=${file%.gz}
file=${file%.vcf}


# if the vcf file is gzipped:

if [ -s $file.vcf.gz ]
then

# If renaming of scaffolds is requested, set all chromosome/scaffold names to 1
if [ $renameScaff == "TRUE" ]
then
echo "setting scaffold names to 1 and positions to additive numbers"
zcat $file".vcf.gz" | awk 'BEGIN {OFS = "\t";add=0;lastPos=0;scaff=""}{
if($1!~/^#/){
if($1!=scaff){add=lastPos;scaff=$1}
$1="1"
$2=$2+add
lastPos=$2
}
print $0}' | gzip > $file.renamedScaff.vcf.gz

# Get a .map and .ped file (remove multiallelic SNPs, monomorphic sites and indels)
vcftools --gzvcf $file".renamedScaff.vcf.gz" \
--plink --mac 1.0 --remove-indels --max-alleles 2 --out $file

else
# Get a .map and .ped file (remove multiallelic SNPs, monomorphic sites and indels)
vcftools --gzvcf $file".vcf.gz" \
--plink --mac 1.0 --remove-indels --max-alleles 2 --out $file
fi

# if the file is not gzipped
else
# If renaming of scaffolds is requested, set all chromosome/scaffold names to 1
if [ $renameScaff == "TRUE" ]
then
echo "setting scaffold names to 1 and positions to additive numbers"
awk 'BEGIN {OFS = "\t";add=0;lastPos=0;scaff=""}{
if($1!~/^#/){
if($1!=scaff){add=lastPos;scaff=$1}
$1="1"
$2=$2+add
lastPos=$2
}
print $0}' $file.vcf | gzip > $file.renamedScaff.vcf.gz

# Get a .map and .ped file (remove multiallelic SNPs, monomorphic sites and indels)
vcftools --gzvcf $file".renamedScaff.vcf.gz" \
--plink --mac 1.0 --remove-indels --max-alleles 2 --out $file
else
vcftools --vcf $file".vcf" --plink --mac 1.0 --remove-indels --max-alleles 2 --out $file
fi
fi


# Change the .map file to match the requirements of ADMIXTOOLS by adding fake Morgan positions (assuming a recombination rate of 2 cM/Mbp)
awk -F"\t" -v rec=$rec 'BEGIN{scaff="";add=0}{
split($2,newScaff,":")
if(!match(newScaff[1],scaff)){
scaff=newScaff[1]
add=lastPos
}
pos=add+$4
count+=0.00000001*rec*(pos-lastPos)
print newScaff[1]"\t"$2"\t"count"\t"pos
lastPos=pos
}' ${file}.map | sed 's/^chr//' > better.map
mv better.map ${file}.map

# Change the .ped file to match the ADMIXTOOLS requirements
awk 'BEGIN{ind=1}{printf ind"\t"$2"\t0\t0\t0\t1\t";
for(i=7;i<=NF;++i) printf $i"\t";ind++;printf "\n"}' ${file}.ped > tmp.ped
mv tmp.ped ${file}.ped


# create an inputfile for convertf
echo "genotypename: ${file}.ped" > par.PED.EIGENSTRAT.${file}
echo "snpname: ${file}.map" >> par.PED.EIGENSTRAT.${file}
echo "indivname: ${file}.ped" >> par.PED.EIGENSTRAT.${file}
echo "outputformat: EIGENSTRAT" >> par.PED.EIGENSTRAT.${file}
echo "genotypeoutname: ${file}.eigenstratgeno" >> par.PED.EIGENSTRAT.${file}
echo "snpoutname: ${file}.snp" >> par.PED.EIGENSTRAT.${file}
echo "indivoutname: ${file}.ind" >> par.PED.EIGENSTRAT.${file}
echo "familynames: NO" >> par.PED.EIGENSTRAT.${file}


# Use CONVERTF to parse PED to eigenstrat
convertf -p par.PED.EIGENSTRAT.${file}

# change the snp file for ADMIXTOOLS:
awk 'BEGIN{i=0}{i=i+1; print $1"\t"$2"\t"$3"\t"i"\t"$5"\t"$6}' $file.snp > $file.snp.tmp
mv $file.snp.tmp $file.snp

GWAS

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
1. gwas for muscat and non-muscat grapevines
1.1 filter SNPs
vcftools --gzvcf grape.all.vcf.gz --keep muscat_nonmuscat.txt --max-missing 0.8 --maf 0.01 --recode --stdout |gzip -c > grape.muscat.vcf.gz

1.2 run gwas
file=grape.muscat
plink --vcf ${file}.vcf.gz --set-missing-var-ids @:# --make-bed --out ${file}
~/gcta_1.93.3beta2/gcta64 --bfile ${file} --make-grm --thread-num 20 --out geno_grm
~/gcta_1.93.3beta2/gcta64 --grm geno_grm --make-bK-sparse 0.05 --out sp_grm
~/gcta_1.93.3beta2/gcta64 --bfile ${file} --grm-sparse sp_grm --fastGWA-mlm-binary --pheno phenotype.txt --threads 20 --out geno_assoc

2. gwas for berry skin color
2.1 filter SNPs
vcftools --gzvcf grape.all.vcf.gz --keep OIV_225.txt --max-missing 0.7 --maf 0.01 --recode --stdout |gzip -c > grape.OIV_225.vcf.gz

2.2 run gwas
file=grape.OIV_225
plink --vcf ${file}.vcf.gz --set-missing-var-ids @:# --make-bed --out ${file}
~/gcta_1.93.3beta2/gcta64 --mlma-loco --bfile ${file} --pheno phenotype.txt --threads 20 --out OIV_225.geno_assoc

葡萄Science文章基因组学和群体遗传学代码
https://lixiang117423.github.io/article/genomecode/
作者
李详【Xiang LI】
发布于
2025年1月7日
许可协议