# Collect exon-located variants from the snpEff-annotated VCFs in ./08.snpeff/ into one tab-separated table (./all.vcf.txt).
# Remove every (non-hidden) object from the current environment before the
# analysis starts.
# NOTE(review): this does not detach packages or reset options; restarting the
# R session is the more reliable way to get a clean state. Kept as-is so the
# script's side effects do not change.
rm(list = ls())
# Exon intervals taken from the GFF annotation.
# Columns 1, 3, 4 and 5 of the file are kept and named HROM, gene.region,
# start and end; only rows whose gene.region is "exon" survive. The first "."
# in each HROM value is replaced by "_" (presumably so the names line up with
# the HROM values used in the VCF files -- confirm against the data).
df.gff <- readr::read_delim(
  "./01.data/genome/piapik.gff",
  col_names = FALSE
) %>%
  dplyr::select(HROM = 1, gene.region = 3, start = 4, end = 5) %>%
  dplyr::filter(gene.region == "exon") %>%
  dplyr::mutate(HROM = stringr::str_replace(HROM, "\\.", "_"))
# Names of the files in ./08.snpeff/ that end in "ann.vcf", stored as a
# one-column tibble (column `vcf`).
# The suffix is matched with an escaped dot and an end anchor so that "." is
# taken literally instead of as a regex wildcard, and the table is built
# directly with tibble() rather than the deprecated as_data_frame().
vcf <- dplyr::tibble(
  vcf = dir("./08.snpeff/", pattern = "ann\\.vcf$")
)
# Build one long table of exon-located variants across all annotated VCFs.
#
# For every file name in `vcf$vcf`:
#   1. read the file, name its ten columns and keep the first eight;
#   2. take the 12th ";"-separated entry of INFO, split it on "|" and keep
#      pieces 2, 6 and 11 as mutation.type / mutation.region /
#      mutation.protein;
#   3. join with the exon intervals in `df.gff` on HROM and keep only rows
#      whose POS lies inside [start, end];
#   4. add the sample name (file name without ".ann.vcf") and the rgene label,
#      then keep the nine output columns.
#
# NOTE(review): the INFO entry is addressed by position (12) -- presumably the
# snpEff ANN field; confirm that every input file has it at that position.
# NOTE(review): a variant falling inside several exon rows of `df.gff` yields
# one output row per matching interval.
all.vcf <- lapply(vcf$vcf, function(i) {
  # n-th element of each split result (NA when an entry has fewer pieces).
  # vapply() guarantees a character vector even for zero-row input.
  piece_at <- function(pieces, n) vapply(pieces, `[`, character(1), n)

  calls <- sprintf("./08.snpeff/%s", i) %>%
    data.table::fread() %>%
    magrittr::set_names(c(
      "HROM", "POS", "ID", "REF", "ALT",
      "QUAL", "FILTER", "INFO", "FORMAT", "sample"
    )) %>%
    dplyr::select(1:8)

  # Split the annotation entry once and reuse it for all three columns.
  ann <- calls$INFO %>%
    stringr::str_split(";") %>%
    piece_at(12) %>%
    stringr::str_split("\\|")

  calls %>%
    dplyr::mutate(
      mutation.type = piece_at(ann, 2),
      mutation.region = piece_at(ann, 6),
      mutation.protein = piece_at(ann, 11)
    ) %>%
    # Rows without a matching HROM could never pass the interval filter, so an
    # inner join gives the same result as left join + filter.
    dplyr::inner_join(df.gff, by = "HROM") %>%
    dplyr::filter(POS >= start & POS <= end) %>%
    dplyr::mutate(
      # fixed(): treat ".ann.vcf" literally, not as a regular expression.
      sample = stringr::str_replace(i, stringr::fixed(".ann.vcf"), ""),
      rgene = dplyr::case_when(
        HROM == "OsR498G1119642600_01" ~ "Pia",
        TRUE ~ "Pik"
      )
    ) %>%
    dplyr::select(
      sample, rgene,
      position = POS, id = ID, ref = REF, alt = ALT,
      mutation.type, mutation.region, mutation.protein
    )
})

# Bind once instead of growing the table inside the loop. rev() keeps the
# previous row order, where each newly read file was placed on top. With no
# input files the result is NULL, as before.
all.vcf <- do.call(rbind, rev(all.vcf))
# Write the combined variant table to ./all.vcf.txt as tab-separated text,
# without quoting and without row names.
write.table(
  all.vcf,
  file = "./all.vcf.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)