Following includes codes used to generate, run, and analyze phyloacc results. It uses phyloacc program (see installation link here), bash and R codes interchangeably along with other instructions to run. Please make sure you are running the codes in the correct interface.
Once you have a bed of conserved elements, we need to get alignments of the elements from the hal alignments. Steps to get the hal alignment are here. First we need to get maf alignments of the elements for conserved regions, separated by chromosomes.
#bash
#loop over bed file by chromosome and create mafs for each
#$1 is an input file that contains a list of chromosomes for a category in $2 (Z, macro, or micro)
while IFS= read -r line; do
mkdir -p final_dataset/maf_${2}_sort
mkdir -p final_dataset/beds/
grep -P $line final_dataset/final_working_conserved_filt.bed > final_dataset/beds/${line}.bed
hal2mafMP.py --numProc 6 --refGenome Gallus_gallus7 --noDupes --noAncestors --refTargets final_dataset/beds/${line}.bed all_birds_new.hal final_dataset/maf_${2}_sort/all_birds_${line}.maf
done < "$1"
Next we go from maf to fasta. Here is my github page for a script to convert maf files to fasta files.
#bash
#example of loop for macrochromosomes
while IFS= read -r line; do
do
mkdir -p fasta/beds/
grep -P ${line} cnee_macro.bed > fasta/beds/${line}.bed
python phast_scripts/maf2fasta.py --maf maf_macro/all_birds_${line}.maf \
--bed fasta/beds/${line}.bed \
--out_folder fasta/maf_macro/${line} \
--ref_species Gallus_gallus7
done < "$1"
Phyloacc takes a folder with elements. However, we concatenated the fasta files into a single file and removed alignments with less than 10% total taxa (so removes alignments with less than 8 species) using the ConcatinaTOR script.
#bash
#example for macrochromosomes
python ConcatinaTOR.py --fasta_folder fasta_macro \
--bed_file cnee_macro.txt \
--write_interval 10000 \
--output_folder fasta_concat/fasta_macro_concat/ \
--n_taxa 79 \
--per_taxa 0.10 \
--recursive
cat fasta_macro/*.fa > cnee_macro.fa
Also to get some stats using the get_fasta_stat.py script
#python
python get_fasta_stat.py --input_folder /n/holylfs05/LABS/informatics/Users/sshakya/genomes_phyloacc/final_dataset/fasta_macro/ --output macro_stats
python get_fasta_stat.py --input_folder /n/holylfs05/LABS/informatics/Users/sshakya/genomes_phyloacc/final_dataset/fasta_micro/ --output micro_stats
python get_fasta_stat.py --input_folder /n/holylfs05/LABS/informatics/Users/sshakya/genomes_phyloacc/final_dataset/fasta_Z/ --output Z_stats
Plot the stats from the stats generated above
#R
library(tidyr)
align_stats <- xlsx::read.xlsx("alignment_stats.xlsx", sheetIndex = 2)
align_stats <- pivot_longer(align_stats, c("Z", "macro", "micro"), names_to = "Chromosomes", values_to = "Count")
ggplot(align_stats, aes(x=Number, y=Count, fill = Chromosomes))+
geom_bar(stat="identity", position=position_dodge())+
scale_x_reverse()+
theme_bw()+
facet_grid(row = vars(Chromosomes), scales = "free")
We ran phyloacc (phyloacc man here). We ran elements in 3 batches, with separate rates and trees for macro, micro, and Z chromosomes respectively.
#bash
#see phyloacc manual to see what each of these flags to. charset.txt is generated by ConcatinaTOR.py,
#charset_id.txt is the fourth column of charset.txt
phyloacc.py --overwrite -a cnee_macro.fa \
-b charset.txt \
-i charset_id.txt \
-m all_birds_4d_macro_non-conserved.mod \
-o output \
-t "Aptenodytes_patagonicus;Aptenodytes_forsteri;Pygoscelis_adeliae;Pygoscelis_papua;Pygoscelis_antarcticus;Megadyptes_antipodes;Eudyptula_novaehollandiae;Eudyptula_albosignata;Eudyptula_minor;Spheniscus_demersus;Spheniscus_humboldti;Spheniscus_magellanicus;Eudyptes_pachyrhynchus;Eudyptes_sclateri;Eudyptes_chrysolophus;Eudyptes_schlegeli;Eudyptes_chrysocome;Eudyptes_filholi;Eudyptes_moseleyi;Halcyon_senegalensis;Chloroceryle_aenea;Actenoides_hombroni;Progne_subis;Riparia_riparia;Hirundo_rustica;Brachypodius_melanocephalos;Pycnonotus_jocosus" \
-g "Gavialis_gangeticus" \
-n 4 \
-batch 100 \
-j 20 \
-part "shared"
Plot for log of Bayes factor1 (comparison between the null model (with no acceleration allowed), and the target-lineage model (where acceleration is allowed in only the focal/target lineages)) and log of Bayes factor 2 (comparison between the target-lineage model (where acceleration is only allowed in only the focal/target lineages) and the unrestricted model (where acceleration can occur anywhere))
#R
library(plyr)
library(phangorn)
library(phytools)
library(data.table)
#read main result file
phyloacc <- fread("elem_lik_filt.txt", sep = "\t", header = T)
#find location of shifts on tree aka where acceleration occurred since phyloacc reports
#all daughter branches of a shift as being accelerated
shift_calc <- function(tree, post_phylo, subtree = ""){
tree_quant <- data.frame(node = tree$tip.label, id = seq(1, length(tree$tip.label)), type = "tip")
tree_quant <- rbind(tree_quant, data.frame(node = tree$node.label,
id = seq(nrow(tree_quant) + 1 , length(tree$node.label) + nrow(tree_quant)),
type = "internal"))
tree_quant <- merge(tree_quant, data.frame(edge = tree$edge[,2], edge_len = tree$edge.length, anc = tree$edge[,1]),
by.x = "id", by.y = "edge", all.x = T)
if (length(subtree) <= 1){
all_daughts <- tree_quant$node[-which(tree_quant$node %in% "root")]
}
else{
mrca <- getMRCA(tree, subtree)
start_node <- tree_quant$anc[which(tree_quant$id %in% mrca)]
all_daughts <- tree_quant$node[which(tree_quant$id %in% Descendants(tree, mrca, "all"))]
all_daughts <- c(tree_quant$node[which(tree_quant$id %in% start_node)],
tree_quant$node[which(tree_quant$id %in% mrca)],
all_daughts)
}
tips <- paste(all_daughts[which(all_daughts %in% tree_quant$node[which(tree_quant$type %in% "tip")])], 3, sep = "_")
internals <- paste(all_daughts[which(all_daughts %in% tree_quant$node[which(tree_quant$type %in% "internal")])], 3, sep = "_")
shifts <- rowSums(post_phylo[, ..tips]) - rowSums(post_phylo[,..internals])
return(shifts)
}
#get dataframe of shifts
run_shifts <- function(results){
tree <- read.newick("macro_tree.tree")
taxa_list <- fread("taxa_list.txt", header = F, sep = "\t")
basic_taxa <- paste(taxa_list$V1[taxa_list$V2 == "basic"], 3, sep = "_")
target_taxa <- paste(taxa_list$V1[taxa_list$V2 == "target"], 3, sep = "_")
results$basic <- rowSums(results[,..basic_taxa] >= 0.9)
results$target <- rowSums(results[,..target_taxa] >= 0.9)
bulbul_taxa <- target_taxa[c(26,27,28)]
results$bulbul <- rowSums(results[,..bulbul_taxa] >= 0.9)
swallow_taxa <- target_taxa[c(23,24,25,29,38)]
results$swallow <- rowSums(results[,..swallow_taxa] >= 0.9)
kingfisher_taxa <- target_taxa[c(20,21,22,30,39)]
results$kingfisher <- rowSums(results[,..kingfisher_taxa] >= 0.9)
penguin_taxa <- target_taxa[c(1:19,31:37,40:50)]
results$penguin <- rowSums(results[,..penguin_taxa] >= 0.9)
shifts <- results[,c("Locus ID", "basic", "target", "bulbul", "swallow", "kingfisher", "penguin")]
colnames(shifts)[1] <- "Locus"
shifts$all_shifts <- shift_calc(tree, results)
shifts$bulbul_shifts <- shift_calc(tree, results, subtree = c("Brachypodius_melanocephalos", "Pycnonotus_jocosus"))
shifts$swallow_shifts <- shift_calc(tree, results, subtree = c("Hirundo_rustica", "Progne_subis", "Riparia_riparia"))
shifts$kingfisher_shifts <- shift_calc(tree, results, subtree = c("Chloroceryle_aenea", "Actenoides_hombroni", "Halcyon_senegalensis"))
shifts$penguin_shifts <- shift_calc(tree, results, subtree = c("Aptenodytes_patagonicus", "Aptenodytes_forsteri", "Pygoscelis_adeliae", "Pygoscelis_papua", "Pygoscelis_antarcticus", "Megadyptes_antipodes", "Eudyptula_novaehollandiae", "Eudyptula_albosignata", "Eudyptula_minor", "Spheniscus_demersus", "Spheniscus_humboldti", "Spheniscus_magellanicus", "Eudyptes_pachyrhynchus", "Eudyptes_sclateri", "Eudyptes_chrysolophus", "Eudyptes_schlegeli", "Eudyptes_chrysocome", "Eudyptes_filholi", "Eudyptes_moseleyi"))
shifts$targets_with_shifts <- rowSums(shifts[,c("bulbul_shifts", "kingfisher_shifts", "swallow_shifts", "penguin_shifts")] >= 0.9)
shifts$in_lin <- round(shifts$bulbul_shifts + shifts$kingfisher_shifts + shifts$swallow_shifts + shifts$penguin_shifts)
shifts$out_lin <- round(shifts$all_shifts - shifts$bulbul_shifts - shifts$kingfisher_shifts - shifts$swallow_shifts - shifts$penguin_shifts)
return(shifts)
}
results_m2 <- fread("rate_postZ_M2.txt", header = T, sep = "\t")
shifts_m2 <- run_shifts(results_m2)
We filtered the dataset for logbf1 >= 10 to get list of
accelerated elements. We also filtered sites where there was
acceleration in at least all bulbuls, or all kingifishers, or all
swallows, or at least 3 penguins. This ensures that any element where an
acceleration was not present in at least one of the three target clades
or more than 3 species of penguins are omitted.
We also created a second highly filtered dataset where only elements with logbf1 >= 10 and logbf2 >=5 were kept. This would keep only elements where there acceleration was limited to the target clade.
#filter for all sites with evidence for acceleration (logbf1 > 10)
phyloacc_subset <- subset(phyloacc, logbf1 > 10)
phyloacc_subset <- merge(phyloacc_subset, shifts_m2, by.x = "phyloacc.id", by.y = "Locus", all.x = T, all.y = F)
phyloacc_subset <- subset(phyloacc_subset, targets_with_shifts > 0 & (bulbul > 1 | kingfisher > 2 | swallow > 2 | penguin > 2))
fwrite(phyloacc_subset, "accelerated_elements_all_m2.txt", sep = "\t")
phyloacc_sub_cat1 <- subset(phyloacc_subset, logbf2 >= 5)
fwrite(phyloacc_sub_cat1, "accelerated_elements_filtbf2_m2.txt", sep = "\t")
#read main result file
library(data.table)
phyloacc <- fread("elem_lik_filt.txt", sep = "\t", header = T)
phyloacc_subset <- fread("accelerated_elements_all_m2.txt", sep = "\t")
phyloacc_subset_cat1 <- fread("accelerated_elements_filtbf2_m2.txt", sep = "\t")
phyloacc$col <- "CNEE"
phyloacc$col[which(phyloacc$phyloacc.id %in% phyloacc_subset$phyloacc.id)] <- "short-tarsi broad"
phyloacc$col[which(phyloacc$phyloacc.id %in% phyloacc_subset_cat1$phyloacc.id)] <- "short-tarsi specific"
#plot logbf1 vs logbf2
library(ggplot2)
ggplot(phyloacc, aes(x=logbf1, y=logbf2, col=col))+
geom_point(shape = 19, alpha = 0.1)+
theme_classic()+
scale_color_manual(values = c("grey", "red", "blue")) +
geom_vline(xintercept=10, linetype="dashed", color = "red")+
geom_hline(yintercept=5, linetype="dashed", color = "blue")
We have two datasets, one large dataset with very little filtering with 7948 elements and a smaller highly filtered dataset with 403 elements.
Distribution of conserved rate vs accelerated rate for elements
library(gridExtra)
phyloacc_subset <- fread("accelerated_elements_all.txt", sep = "\t")
#rate0 <- fread("rate_postZ_M0.txt", sep = "\t", header = T, select = 1:6)
rate1 <- fread("rate_postZ_M1.txt", sep = "\t", header = T, select = 1:6)
rate1 <- subset(rate1, (`Locus ID` %in% phyloacc_subset$phyloacc.id) & (phyloacc_subset$best.fit.model == "M1"))
rate2 <- fread("rate_postZ_M2.txt", sep = "\t", header = T, select = 1:6)
rate2 <- subset(rate2, (`Locus ID` %in% phyloacc_subset$phyloacc.id) & (phyloacc_subset$best.fit.model == "M2"))
rates <- rbind(rate1, rate2)
p1 <- ggplot(rates)+
geom_histogram(aes(x=n_rate), bins = 30, fill='red', alpha=0.5, position = 'identity')+
geom_vline(xintercept = mean(rates$n_rate), col = "red", linetype = "dashed")+
theme_classic()+
xlab("Accelerated Rate")
#mean accelerated rate: 1.91
p2 <- ggplot(rates)+
geom_histogram(aes(x=c_rate), bins = 30, fill='blue', alpha=0.5, position = 'identity')+
geom_vline(xintercept = mean(rates$c_rate), col = "blue", linetype = "dashed")+
theme_classic()+
xlab("Conserved Rate")
grid.arrange(p1, p2, ncol=1)
#mean conserved rate: 0.26
#get stats for elements
phyloacc_subset <- fread("accelerated_elements_all_m2.txt", sep = "\t")
phyloacc_subset_cat1 <- fread("accelerated_elements_filtbf2_m2.txt", sep = "\t")
final_data_filtered <- fread("final_data_filtered.txt", sep = "\t", header = T)
final_data_accl <- subset(final_data_filtered, id %in% phyloacc_subset$original.id)
final_data_accl_cat1 <- subset(final_data_filtered, id %in% phyloacc_subset_cat1$original.id)
#atac accl elements
table(final_data_accl$atac)
##
## FALSE TRUE
## 3797 4151
table(final_data_accl_cat1$atac)
##
## FALSE TRUE
## 187 216
fisher.test(matrix(c(4151, 3597, 343313, 589154), nrow = 2, byrow = T),
alternative = "greater", simulate.p.value = T)
##
## Fisher's Exact Test for Count Data
##
## data: matrix(c(4151, 3597, 343313, 589154), nrow = 2, byrow = T)
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 1.906846 Inf
## sample estimates:
## odds ratio
## 1.980437
#write.table(final_data_accl[,1:4], "accl_elements.bed", sep = "\t", quote = F, row.names = F, col.names = F)
#get overlap of atac-seq data using
#bedtools intersect -wao -a accl_elements.bed -b atac_all.bed > accl_atac_overlap.txt
atac_data <- fread("atac_all_annot.txt")
atac_overlap <- read.table("accl_atac_overlap.txt", sep="\t")
colnames(atac_overlap)[5:45] <- colnames(atac_data)
atac_overlap_true <- atac_overlap[-which(atac_overlap$atac_id %in% "."),]
length(unique(atac_overlap_true$V4)) #4151
length(unique(atac_overlap_true$V4[which(atac_overlap_true$hindlimb_HH18 == 1)])) #1575
atac_overlap_ingroup <- subset(atac_overlap_true, V4 %in% phyloacc_subset_cat1$original.id)
length(unique(atac_overlap_ingroup$V4)) #216
length(unique(atac_overlap_ingroup$V4[which(atac_overlap_true$hindlimb_HH18 == 1)])) #102
library(ggvenn)
phyloacc_subset <- fread("accelerated_elements_all_m2.txt", sep = "\t")
venn_list <- list(penguin = phyloacc_subset$original.id[which(phyloacc_subset$penguin >= 1)],
bulbul = phyloacc_subset$original.id[which(phyloacc_subset$bulbul >= 1)],
swallow = phyloacc_subset$original.id[which(phyloacc_subset$swallow >= 1)],
kingfisher = phyloacc_subset$original.id[which(phyloacc_subset$kingfisher >= 1)])
g1 <- ggvenn(
venn_list,
fill_color = c("#ff6600", "#9999cc", "#cccc00", "#cc66ff"),
fill_alpha = 0.8,
stroke_size = 0.5, set_name_size = 5,
show_elements = F,
show_outside = "none")
g1
getphyper <- function(overlap, list1, list2, pop){
test_val <- phyper(overlap-1,list1,pop-list1,list2)
return(test_val)
}
#For broad
PopSize=nrow(phyloacc_subset)
listB=length(which(phyloacc_subset$bulbul_shifts>0.9))
listP=length(which(phyloacc_subset$penguin_shifts>0.9))
listS=length(which(phyloacc_subset$swallow_shifts>0.9))
listK=length(which(phyloacc_subset$kingfisher_shifts>0.9))
#calculate whether any of these pairwise comparisons are significant using a hypergeometric test
pvals = c()
#BP
pvals = c(pvals, getphyper(length(which(phyloacc_subset$bulbul_shifts>0.9 & phyloacc_subset$penguin_shifts>0.9)), listB, listP, PopSize))
#BS
pvals = c(pvals, getphyper(length(which(phyloacc_subset$bulbul_shifts>0.9 & phyloacc_subset$swallow_shifts>0.9)), listB, listS, PopSize))
#BK
pvals = c(pvals, getphyper(length(which(phyloacc_subset$bulbul_shifts>0.9 & phyloacc_subset$kingfisher_shifts>0.9)), listB, listK, PopSize))
#SK
pvals = c(pvals, getphyper(length(which(phyloacc_subset$swallow_shifts>0.9 & phyloacc_subset$kingfisher_shifts>0.9)), listS, listK, PopSize))
#PK
pvals = c(pvals, getphyper(length(which(phyloacc_subset$penguin_shifts>0.9 & phyloacc_subset$kingfisher_shifts>0.9)), listP, listK, PopSize))
#PS
pvals = c(pvals, getphyper(length(which(phyloacc_subset$penguin_shifts>0.9 & phyloacc_subset$swallow_shifts>0.9)), listP, listS, PopSize))
p.adjust(pvals, method = "bonferroni")
#For specific
phyloacc_subset_cat1 <- fread("accelerated_elements_filtbf2_m2.txt", sep = "\t")
PopSize=nrow(phyloacc_subset_cat1)
listB=length(which(phyloacc_subset_cat1$bulbul_shifts>0.9))
listP=length(which(phyloacc_subset_cat1$penguin_shifts>0.9))
listS=length(which(phyloacc_subset_cat1$swallow_shifts>0.9))
listK=length(which(phyloacc_subset_cat1$kingfisher_shifts>0.9))
#calculate whether any of these pairwise comparisons are significant using a hypergeometric test
pvals = c()
#BP
pvals = c(pvals, getphyper(length(which(phyloacc_subset_cat1$bulbul_shifts > 0.9 & phyloacc_subset_cat1$penguin_shifts > 0.9)), listB, listP, PopSize))
#BS
pvals = c(pvals, getphyper(length(which(phyloacc_subset_cat1$bulbul_shifts > 0.9 & phyloacc_subset_cat1$swallow_shifts > 0.9)), listB, listS, PopSize))
#BK
pvals = c(pvals, getphyper(length(which(phyloacc_subset_cat1$bulbul_shifts > 0.9 & phyloacc_subset_cat1$kingfisher_shifts > 0.9)), listB, listK, PopSize))
#SK
pvals = c(pvals, getphyper(length(which(phyloacc_subset_cat1$swallow_shifts > 0.9 & phyloacc_subset_cat1$kingfisher_shifts > 0.9)), listS, listK, PopSize))
#PK
pvals = c(pvals, getphyper(length(which(phyloacc_subset_cat1$penguin_shifts > 0.9 & phyloacc_subset_cat1$kingfisher_shifts > 0.9)), listP, listK, PopSize))
#PS
pvals = c(pvals, getphyper(length(which(phyloacc_subset_cat1$penguin_shifts > 0.9 & phyloacc_subset_cat1$swallow_shifts > 0.9)), listP, listS, PopSize))
p.adjust(pvals, method = "bonferroni")
phyloacc_subset$targets_with_shifts <- rowSums(phyloacc_subset[,c("bulbul", "kingfisher", "swallow", "penguin")] >= 1)
ggplot(phyloacc_subset, aes(x=targets_with_shifts))+
geom_histogram(fill="black", col="grey", bins = 4)+
theme_classic()
#R
library(plyr)
library(phangorn)
library(phytools)
library(data.table)
#read main result file
phyloacc <- fread("elem_lik_filt.txt", sep = "\t", header = T)
#find location of shifts on tree aka where acceleration occurred since phyloacc reports
#all daughter branches of a shift as being accelerated
shift_calc <- function(tree, post_phylo, subtree = ""){
tree_quant <- data.frame(node = tree$tip.label, id = seq(1, length(tree$tip.label)), type = "tip")
tree_quant <- rbind(tree_quant, data.frame(node = tree$node.label,
id = seq(nrow(tree_quant) + 1 , length(tree$node.label) + nrow(tree_quant)),
type = "internal"))
tree_quant <- merge(tree_quant, data.frame(edge = tree$edge[,2], edge_len = tree$edge.length, anc = tree$edge[,1]),
by.x = "id", by.y = "edge", all.x = T)
if (length(subtree) <= 1){
all_daughts <- tree_quant$node[-which(tree_quant$node %in% "root")]
}
else{
mrca <- getMRCA(tree, subtree)
start_node <- tree_quant$anc[which(tree_quant$id %in% mrca)]
all_daughts <- tree_quant$node[which(tree_quant$id %in% Descendants(tree, mrca, "all"))]
all_daughts <- c(tree_quant$node[which(tree_quant$id %in% start_node)],
tree_quant$node[which(tree_quant$id %in% mrca)],
all_daughts)
}
tips <- paste(all_daughts[which(all_daughts %in% tree_quant$node[which(tree_quant$type %in% "tip")])], 3, sep = "_")
internals <- paste(all_daughts[which(all_daughts %in% tree_quant$node[which(tree_quant$type %in% "internal")])], 3, sep = "_")
shifts <- rowSums(post_phylo[, ..tips]) - rowSums(post_phylo[,..internals])
return(shifts)
}
#get dataframe of shifts
run_shifts <- function(results){
tree <- read.newick("macro_tree.tree")
taxa_list <- fread("taxa_list_out.txt", header = F, sep = "\t")
basic_taxa <- paste(taxa_list$V1[taxa_list$V2 == "basic"], 3, sep = "_")
target_taxa <- paste(taxa_list$V1[taxa_list$V2 == "target"], 3, sep = "_")
results$basic <- rowSums(results[,..basic_taxa] >= 0.9)
results$target <- rowSums(results[,..target_taxa] >= 0.9)
bulbul_taxa <- target_taxa[c(5,6,11)]
results$abulbul <- rowSums(results[,..bulbul_taxa] >= 0.9)
swallow_taxa <- target_taxa[c(7,8,9,10,12,15,16)]
results$aswallow <- rowSums(results[,..swallow_taxa] >= 0.9)
kingfisher_taxa <- target_taxa[c(3,4,13)]
results$akingfisher <- rowSums(results[,..kingfisher_taxa] >= 0.9)
penguin_taxa <- target_taxa[c(1,2,14)]
results$apenguin <- rowSums(results[,..penguin_taxa] >= 0.9)
shifts <- results[,c("Locus ID", "basic", "target", "abulbul", "aswallow", "akingfisher", "apenguin")]
colnames(shifts)[1] <- "Locus"
shifts$all_shifts <- shift_calc(tree, results)
shifts$abulbul_shifts <- shift_calc(tree, results, subtree = c("Sturnus_vulgaris", "Ficedula_albicollis"))
shifts$aswallow_shifts <- shift_calc(tree, results, subtree = c("Emberiza_elegans", "Molothrus_ater", "Setophaga_coronata", "Diglossa_brunneiventris"))
shifts$akingfisher_shifts <- shift_calc(tree, results, subtree = c("Pogoniulus_pusillus", "Colaptes_auratus"))
shifts$apenguin_shifts <- shift_calc(tree, results, subtree = c("Pluvialis_apricaria", "Sterna_hirundo"))
shifts$targets_with_shifts <- rowSums(shifts[,c("abulbul_shifts", "akingfisher_shifts", "aswallow_shifts", "apenguin_shifts")] >= 0.9)
shifts$in_lin <- round(shifts$abulbul_shifts + shifts$akingfisher_shifts + shifts$aswallow_shifts + shifts$apenguin_shifts)
shifts$out_lin <- round(shifts$all_shifts - shifts$abulbul_shifts - shifts$akingfisher_shifts - shifts$aswallow_shifts - shifts$apenguin_shifts)
return(shifts)
}
#calculate shift location for elements where m2 is best model
results_m2 <- fread("rate_postZ_M2.txt", header = T, sep = "\t")
shifts_m2 <- run_shifts(results_m2)
phyloacc_subset_a <- subset(phyloacc, logbf3 > 10)
phyloacc_subset_a <- merge(phyloacc_subset_a, shifts_m2, by.x = "phyloacc.id", by.y = "Locus", all.x = T, all.y = F)
phyloacc_subset_a <- subset(phyloacc_subset_a, targets_with_shifts > 0 & (abulbul > 1 | akingfisher > 1 | aswallow > 2 | apenguin > 1))
write.table(phyloacc_subset_a, "accelerated_elements_outgroups.txt", sep = "\t", row.names = F, col.names = T, quote = F)
phyloacc_subset <- fread("accelerated_elements_all_m2.txt", sep = "\t")
phyloacc_subset_a <- fread("accelerated_elements_outgroups.txt", sep = "\t")
#for the four group
conv_4 <- fisher.test(matrix(c(length(which(phyloacc_subset$targets_with_shifts == 4)),
length(which(phyloacc_subset_a$targets_with_shifts == 4)),
nrow(phyloacc_subset)-length(which(phyloacc_subset$targets_with_shifts == 4)),
nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$targets_with_shifts == 4))),nrow = 2),
alternative = "greater")
#for the three groups
conv_3_bks <- fisher.test(matrix(c(length(which(phyloacc_subset$bulbul_shifts > 0.9 &
phyloacc_subset$kingfisher_shifts > 0.9 &
phyloacc_subset$swallow_shifts > 0.9)),
length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
phyloacc_subset_a$akingfisher_shifts > 0.9 &
phyloacc_subset_a$aswallow_shifts > 0.9)),
nrow(phyloacc_subset)-length(which(phyloacc_subset$bulbul_shifts > 0.9 &
phyloacc_subset$kingfisher_shifts > 0.9 &
phyloacc_subset$swallow_shifts > 0.9)),
nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
phyloacc_subset_a$akingfisher_shifts > 0.9 &
phyloacc_subset_a$aswallow_shifts > 0.9))),nrow = 2, byrow = T),
alternative = "greater")
conv_3_bsp <- fisher.test(matrix(c(length(which(phyloacc_subset$bulbul_shifts > 0.9 &
phyloacc_subset$penguin_shifts > 0.9 &
phyloacc_subset$swallow_shifts > 0.9)),
length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
phyloacc_subset_a$apenguin_shifts > 0.9 &
phyloacc_subset_a$aswallow_shifts > 0.9)),
nrow(phyloacc_subset)-length(which(phyloacc_subset$bulbul_shifts > 0.9 &
phyloacc_subset$penguin_shifts > 0.9 &
phyloacc_subset$swallow_shifts > 0.9)),
nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
phyloacc_subset_a$apenguin_shifts > 0.9 &
phyloacc_subset_a$aswallow_shifts > 0.9))),nrow = 2, byrow = T),
alternative = "greater")
conv_3_bkp <- fisher.test(matrix(c(length(which(phyloacc_subset$bulbul_shifts > 0.9 &
phyloacc_subset$kingfisher_shifts > 0.9 &
phyloacc_subset$penguin_shifts > 0.9)),
length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
phyloacc_subset_a$akingfisher_shifts > 0.9 &
phyloacc_subset_a$apenguin_shifts > 0.9)),
nrow(phyloacc_subset)-length(which(phyloacc_subset$bulbul_shifts > 0.9 &
phyloacc_subset$kingfisher_shifts > 0.9 &
phyloacc_subset$penguin_shifts > 0.9)),
nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
phyloacc_subset_a$akingfisher_shifts > 0.9 &
phyloacc_subset_a$apenguin_shifts > 0.9))),nrow = 2, byrow = T),
alternative = "greater")
conv_3_skp <- fisher.test(matrix(c(length(which(phyloacc_subset$swallow_shifts > 0.9 &
phyloacc_subset$kingfisher_shifts > 0.9 &
phyloacc_subset$penguin_shifts > 0.9)),
length(which(phyloacc_subset_a$aswallow_shifts > 0.9 &
phyloacc_subset_a$akingfisher_shifts > 0.9 &
phyloacc_subset_a$apenguin_shifts > 0.9)),
nrow(phyloacc_subset)-length(which(phyloacc_subset$swallow_shifts > 0.9 &
phyloacc_subset$kingfisher_shifts > 0.9 &
phyloacc_subset$penguin_shifts > 0.9)),
nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$aswallow_shifts > 0.9 &
phyloacc_subset_a$akingfisher_shifts > 0.9 &
phyloacc_subset_a$apenguin_shifts > 0.9))),nrow = 2, byrow = T),
alternative = "greater")
# for the two groups
conv_2_bk <- fisher.test(matrix(c(length(which(phyloacc_subset$bulbul_shifts > 0.9 &
phyloacc_subset$kingfisher_shifts > 0.9)),
length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
phyloacc_subset_a$akingfisher_shifts > 0.9)),
nrow(phyloacc_subset)-length(which(phyloacc_subset$bulbul_shifts > 0.9 &
phyloacc_subset$kingfisher_shifts > 0.9)),
nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
phyloacc_subset_a$akingfisher_shifts > 0.9))),nrow = 2, byrow = T),
alternative = "greater")
conv_2_bs <- fisher.test(matrix(c(length(which(phyloacc_subset$bulbul_shifts > 0.9 &
phyloacc_subset$swallow_shifts > 0.9)),
length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
phyloacc_subset_a$aswallow_shifts > 0.9)),
nrow(phyloacc_subset)-length(which(phyloacc_subset$bulbul_shifts > 0.9 &
phyloacc_subset$swallow_shifts > 0.9)),
nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
phyloacc_subset_a$aswallow_shifts > 0.9))),nrow = 2, byrow = T),
alternative = "greater")
conv_2_bp <- fisher.test(matrix(c(length(which(phyloacc_subset$bulbul_shifts > 0.9 &
phyloacc_subset$penguin_shifts > 0.9)),
length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
phyloacc_subset_a$apenguin_shifts > 0.9)),
nrow(phyloacc_subset)-length(which(phyloacc_subset$bulbul_shifts > 0.9 &
phyloacc_subset$penguin_shifts > 0.9)),
nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
phyloacc_subset_a$apenguin_shifts > 0.9))),nrow = 2, byrow = T),
alternative = "greater")
conv_2_sk <- fisher.test(matrix(c(length(which(phyloacc_subset$swallow_shifts > 0.9 &
phyloacc_subset$kingfisher_shifts > 0.9)),
length(which(phyloacc_subset_a$aswallow_shifts > 0.9 &
phyloacc_subset_a$akingfisher_shifts > 0.9)),
nrow(phyloacc_subset)-length(which(phyloacc_subset$swallow_shifts > 0.9 &
phyloacc_subset$kingfisher_shifts > 0.9)),
nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$aswallow_shifts > 0.9 &
phyloacc_subset_a$akingfisher_shifts > 0.9))),nrow = 2, byrow = T),
alternative = "greater")
conv_2_sp <- fisher.test(matrix(c(length(which(phyloacc_subset$swallow_shifts > 0.9 &
phyloacc_subset$penguin_shifts > 0.9)),
length(which(phyloacc_subset_a$aswallow_shifts > 0.9 &
phyloacc_subset_a$apenguin_shifts > 0.9)),
nrow(phyloacc_subset)-length(which(phyloacc_subset$swallow_shifts > 0.9 &
phyloacc_subset$penguin_shifts > 0.9)),
nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$aswallow_shifts > 0.9 &
phyloacc_subset_a$apenguin_shifts > 0.9))),nrow = 2, byrow = T),
alternative = "greater")
conv_2_pk <- fisher.test(matrix(c(length(which(phyloacc_subset$penguin_shifts > 0.9 &
phyloacc_subset$kingfisher_shifts > 0.9)),
length(which(phyloacc_subset_a$apenguin_shifts > 0.9 &
phyloacc_subset_a$akingfisher_shifts > 0.9)),
nrow(phyloacc_subset)-length(which(phyloacc_subset$penguin_shifts > 0.9 &
phyloacc_subset$kingfisher_shifts > 0.9)),
nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$apenguin_shifts > 0.9 &
phyloacc_subset_a$akingfisher_shifts > 0.9))),nrow = 2, byrow = T),
alternative = "greater")
conv_df <- data.frame(groups=c("BK", "BP", "BS", "PK", "SK", "SP", "BKP", "BKS", "BSP", "SKP", "BKSP"),
prob=c(conv_2_bk$p.value, conv_2_bp$p.value, conv_2_bs$p.value,
conv_2_pk$p.value, conv_2_sk$p.value, conv_2_sp$p.value,
conv_3_bkp$p.value, conv_3_bks$p.value, conv_3_bsp$p.value, conv_3_skp$p.value,
conv_4$p.value),
odds=c(conv_2_bk$estimate, conv_2_bp$estimate, conv_2_bs$estimate,
conv_2_pk$estimate, conv_2_sk$estimate, conv_2_sp$estimate,
conv_3_bkp$estimate, conv_3_bks$estimate, conv_3_bsp$estimate, conv_3_skp$estimate,
conv_4$estimate))
conv_df$groups <- factor(conv_df$groups, levels = conv_df$groups)
ggplot(conv_df, aes(x=groups, y=odds))+
geom_point(size=2)+
theme_bw()+
annotate("rect", xmin = "BK", xmax = "BKSP", ymin = 0, ymax = 1, col="grey", alpha=0.3)
To understand the functions of genes we have identified as accelerated, we will calculate a PANTHER gene enrichment score. Because conserved elements are not randomly distributed across the genome, we cannot use the default enrichment and p-value score reported by PANTHER pipeline. We will need to calculate these values independently.
#R
all_data <- fread("conserved_elements_all.txt")
final_data_filtered <- subset(all_data, id %in% phyloacc$original.id)
#Get loci
loci_subset <- subset(all_data, id %in% phyloacc_subset$original.id)
loci_subset_cat1 <- subset(all_data, id %in% phyloacc_subset_cat1$original.id)
#write loci
fwrite(final_data_filtered[,1:4], "loci_all_m2.txt", sep = "\t", col.names = F, row.names = F, quote = F)
fwrite(loci_subset[,1:4], "loci_accl_m2.txt", sep = "\t", col.names = F, row.names = F, quote = F)
fwrite(loci_subset_cat1[,1:4], "loci_accl_cat1_m2.txt", sep = "\t", col.names = F, row.names = F, quote = F)
Get genes 10000 bp upstream and downstream of elements
#bash
#add 10000 bp filler in front and behind loci
awk 'BEGIN{ FS="\t"; OFS=FS }{ if ($2-10000>0) print $1,$2-10000,$3+10000,$4; else print $1,0,$3+10000,$4 }' loci_all_m2.txt > loci_all_m2_10000.txt
awk 'BEGIN{ FS="\t"; OFS=FS }{ if ($2-10000>0) print $1,$2-10000,$3+10000,$4; else print $1,0,$3+10000,$4 }' loci_accl_m2.txt > loci_accl_m2_10000.txt
awk 'BEGIN{ FS="\t"; OFS=FS }{ if ($2-10000>0) print $1,$2-10000,$3+10000,$4; else print $1,0,$3+10000,$4 }' loci_accl_cat1_m2.txt > loci_accl_cat1_m2_10000.txt
#using bedtools intersect get genes in this window
bedtools intersect -wao -a loci_all_m2_10000.txt -b galgal7b_allgene.gff > loci_all_genes_m2.txt
bedtools intersect -wao -a loci_accl_m2_10000.txt -b galgal7b_allgene.gff > loci_accl_genes_m2.txt
bedtools intersect -wao -a loci_accl_cat1_m2_10000.txt -b galgal7b_allgene.gff > loci_accl_cat1_genes_m2.txt
Read the gene sets and run PANTHER.
#R
library(stringr)
#all genes near all conserved elements
closest_all <- fread("loci_all_genes_m2.txt", header = F)
closest_all$gene <- str_extract(closest_all$V13, "gene=[a-zA-Z0-9_]+")
closest_all$gene <- gsub("^[A-Za-z_]+=", "", closest_all$gene)
write.csv(unique(closest_all$gene),"closest_all_gene_m2.csv", row.names = F, col.names = F, quote = F, eol = ",")
#all genes near accelerated cnees in full dataset
closest <- fread("loci_accl_genes_m2.txt", header = F)
closest$gene <- str_extract(closest$V13, "gene=[a-zA-Z0-9_]+")
closest$gene <- gsub("^[A-Za-z_]+=", "", closest$gene)
write.csv(unique(closest$gene),"closest_gene_m2.csv", row.names = F, col.names = F, quote = F, eol = ",")
#get a few stats
#elements with no genes near 10kb window (1720 elements)
length(which(is.na(closest$gene)))
#number of unique genes (5008 unique genes)
length(unique(closest$gene))-1
#average number of genes per element (1.5 genes/element)
genes_perelem <- data.frame(table(table(closest$V4[-(which(is.na(closest$gene)))])), stringsAsFactors = F)
genes_perelem$Var1 <- as.integer(unlist(lapply(genes_perelem$Var1, as.character)))
genes_perelem$tot <- genes_perelem$Var1 * genes_perelem$Freq
sum(genes_perelem$tot)/sum(genes_perelem$Freq)
#all genes near accelerated cnees in ingorup dataset
closest_cat1 <- fread("loci_accl_cat1_genes_m2.txt", header = F)
closest_cat1$gene <- str_extract(closest_cat1$V13, "gene=[a-zA-Z0-9_]+")
closest_cat1$gene <- gsub("^[A-Za-z_]+=", "", closest_cat1$gene)
write.csv(unique(closest_cat1$gene),"closest_cat1_gene_m2.csv", row.names = F, col.names = F, quote = F, eol = ",")
closest2 <- merge(closest, phyloacc_subset, by.x = "V4", by.y="original.id", all.x = T)
gene_taxa <- unique(closest2[!is.na(closest2$gene),c("gene", "bulbul_shifts", "swallow_shifts", "kingfisher_shifts", "penguin_shifts", "targets_with_shifts")])
gene_taxa <- gene_taxa %>% group_by(gene) %>%
summarise(across(everything(), sum),
.groups = 'drop') %>%
as.data.frame()
gene_taxa$targets_with_shifts <- rowSums(gene_taxa[,c("bulbul_shifts", "kingfisher_shifts", "swallow_shifts", "penguin_shifts")] >= 0.9)
gene_taxa_cat1 <- subset(gene_taxa, gene %in% closest_cat1$gene)
#write.table(gene_taxa, "gene_taxa_m2.txt", sep = "\t", col.names = T, row.names = F, quote = F)
#write.table(gene_taxa_cat1, "gene_taxa_cat1_m2.txt", sep = "\t", col.names = T, row.names = F, quote = F)
getphyper <- function(overlap, list1, list2, pop){
test_val <- phyper(overlap-1,list1,pop-list1,list2)
return(test_val)
}
PopSize=nrow(gene_taxa)
listB=length(which(gene_taxa$bulbul_shifts>0.9))
listP=length(which(gene_taxa$penguin_shifts>0.9))
listS=length(which(gene_taxa$swallow_shifts>0.9))
listK=length(which(gene_taxa$kingfisher_shifts>0.9))
#calculate whether any of these pairwise comparisons are significant using a hypergeometric test
pvals = c()
#BP
pvals = c(pvals, getphyper(length(which(gene_taxa$bulbul_shifts > 0.9 & gene_taxa$penguin_shifts > 0.9)), listB, listP, PopSize))
#BS
pvals = c(pvals, getphyper(length(which(gene_taxa$bulbul_shifts > 0.9 & gene_taxa$swallow_shifts > 0.9)), listB, listS, PopSize))
#BK
pvals = c(pvals, getphyper(length(which(gene_taxa$bulbul_shifts > 0.9 & gene_taxa$kingfisher_shifts > 0.9)), listB, listK, PopSize))
#SK
pvals = c(pvals, getphyper(length(which(gene_taxa$swallow_shifts > 0.9 & gene_taxa$kingfisher_shifts > 0.9)), listS, listK, PopSize))
#PK
pvals = c(pvals, getphyper(length(which(gene_taxa$penguin_shifts > 0.9 & gene_taxa$kingfisher_shifts > 0.9)), listP, listK, PopSize))
#PS
pvals = c(pvals, getphyper(length(which(gene_taxa$penguin_shifts > 0.9 & gene_taxa$swallow_shifts > 0.9)), listP, listS, PopSize))
p.adjust(pvals, method = "bonferroni")
venn_list <- list(penguin = gene_taxa$gene[which(gene_taxa$penguin >= 1)],
bulbul = gene_taxa$gene[which(gene_taxa$bulbul >= 1)],
swallow = gene_taxa$gene[which(gene_taxa$swallow >= 1)],
kingfisher = gene_taxa$gene[which(gene_taxa$kingfisher >= 1)])
g1 <- ggvenn(
venn_list,
fill_color = c("#ff6600", "#9999cc", "#cccc00", "#cc66ff"),
fill_alpha = 0.8,
stroke_size = 0.5, set_name_size = 5,
show_elements = F,
show_outside = "none")
ggplot(gene_taxa, aes(x=targets_with_shifts))+
geom_histogram(fill="#4d4d4d", col="grey", bins = 4)+
theme_classic()
#R
#Run PANTHER
library(rbioapi)
# all_closest <- read.csv("closest_all_gene.csv", header = F)
#Organism 9031 is Gallus gallus
#Dataset GO:0008150 is GO terms in biological processes group
all_enrich_main <- rba_panther_enrich(genes = unique(closest$gene),
organism = 9031,
annot_dataset = "GO:0008150")
cat1_enrich_main <- rba_panther_enrich(genes = unique(closest_cat1$gene),
organism = 9031,
annot_dataset = "GO:0008150")
Next we need to calculate a baseline value of genes in GO categories if you randomly sampled genes close to conserved elements. We repeated this sampling and calculation 10000 times. Below is the R code to run to get this distribution fo each element for a supplied input element number (here 7948 and 403).
#R
library(data.table)
library(stringr)
library(rbioapi)
args = commandArgs(trailingOnly=TRUE)
# test if there is at least one argument: if not, return an error
if (length(args)==0) {
stop("At least one argument must be supplied", call.=FALSE)
} else if (!is.numeric(as.numeric(args[1]))){
stop("Argument must be a number", call.=FALSE)
}
all_data <- fread("final_data_filtered.bed", header = T)
nums <- args[1]
enrich_out <- data.frame(term.id=character())
gene_out <- data.frame(term.id=character())
for (i in seq(1,1000)){
rand_ces <- sample(all_data$id, nums)
loci_subset <- subset(all_data[,c(1:4)], id %in% rand_ces)
loci_subset$start <- loci_subset$start - 10000
loci_subset$end <- loci_subset$start + 10000
loci_subset$start[which(loci_subset$start < 0)] <- 0
fwrite(loci_subset, paste("loci_sub_", nums, ".txt", sep=""), sep = "\t", col.names = F, row.names = F, quote = F)
x <- paste('bedtools intersect -wao -a loci_sub_', nums, '.txt -b galgal7b_allgene.gff > loci_sub_genes_', nums, '.txt', sep="")
system(x)
closest <- fread(paste("loci_sub_genes_", nums, ".txt", sep = ""), header = F)
closest$gene <- sub(".*gene=([a-zA-Z0-9_]+).*", "\\1", closest$V13)
enrich <- rba_panther_enrich(genes = unique(closest$gene),
organism = 9031,
annot_dataset = "GO:0008150")
try({
print(paste("Run", i, sep=" "))
colnames(enrich$result)[1] <- paste(colnames(enrich$result)[1], i, sep = "_")
colnames(enrich$result)[2] <- paste(colnames(enrich$result)[2], i, sep = "_")
enrich_out <- merge(enrich_out, enrich$result[,c(paste("fold_enrichment",i, sep = "_"), "term.id")], by="term.id", all=T)
gene_out <- merge(gene_out, enrich$result[,c(paste("number_in_list",i, sep = "_"), "term.id")], by="term.id", all=T)
}, silent = T)
}
fwrite(enrich_out, paste("enrich_out_", nums, ".txt", sep=""), sep = "\t", col.names = T, row.names = F, quote = F)
fwrite(gene_out, paste("genes_out_", nums, ".txt", sep=""), sep = "\t", col.names = T, row.names = F, quote = F)
Read these distributions and calculate an effective p-value:
\[
pvalue = \frac{({\sf Number\ of\ genes\ in\ GO\ category\ for\
replicates
} >= \\{\sf Calculated\ number\ of\ genes\ in\ GO\ category\ for\
accelerated\ elements
})+1}{{\sf Total\ number\ of\ replicates+1}}
\] and enrichment score.
\[
enrichment = \frac{\frac{\sf Calculated\ number\ of\ genes\ in\ GO\
category\ for\ accelerated\ elements
}{\sf Total\ number\ of\ input\ genes}}{\frac{\sf Median\ number\ of\
genes\ in\ GO\ category\ for\ replicates
}{\sf Median\ number\ of\ input\ genes\ for\ replicates}}
\]
#R
enrich_dist <- fread("genes_out_7948.txt", header = T)
enrich_dist[is.na(enrich_dist)] <- 0
enrich_dist_cat1 <- fread("genes_out_403.txt", header = T)
enrich_dist_cat1[is.na(enrich_dist_cat1)] <- 0
get_pgo <- function(GO, value, distr){
if (GO %in% distr$term.id){
all_vals <- length(which(t(distr[which(distr$term.id %in% GO),2:ncol(distr)]) >= value))
p <- (all_vals+1)/(ncol(distr))
return(p)
}else{return(0)}
}
#calculate effective p
all_enrich <- all_enrich_main$result
all_enrich$pgo <- mapply(get_pgo, all_enrich$term.id, all_enrich$number_in_list, MoreArgs=list(dist=enrich_dist))
cat1_enrich <- cat1_enrich_main$result
cat1_enrich$pgo <- mapply(get_pgo, cat1_enrich$term.id, cat1_enrich$number_in_list, MoreArgs=list(dist=enrich_dist_cat1))
#calculate enrichment score
enrich_dist$median <- MatrixGenerics::rowMedians(as.matrix(enrich_dist[,2:ncol(enrich_dist)]), useNames = F)
cols_sub <- c(1, (ncol(enrich_dist)))
all_enrich <- merge(all_enrich, enrich_dist[,..cols_sub], by = "term.id", all.x = T, all.y = F)
tot <- read.table("total_out_7948.txt")
#total unique genes for this set 5009
all_enrich$median[which(all_enrich$median == 0)] <- 0.00001
all_enrich$enrich <- (all_enrich$number_in_list/5008)/(all_enrich$median/median(tot$V1))
enrich_dist_cat1$median <- MatrixGenerics::rowMedians(as.matrix(enrich_dist_cat1[,2:(ncol(enrich_dist_cat1))]), useNames = F)
cols_sub <- c(1, (ncol(enrich_dist_cat1)))
cat1_enrich <- merge(cat1_enrich, enrich_dist_cat1[,..cols_sub], by = "term.id", all.x = T, all.y = F)
tot_cat1 <- read.table("total_out_533.txt")
#total unique genes for this set 380
cat1_enrich$median[which(cat1_enrich$median == 0)] <- 0.00001
cat1_enrich$enrich <- (cat1_enrich$number_in_list/380)/(cat1_enrich$median/median(tot_cat1$V1))
#Gather significant p-values
#subset out GO categories with less than 5 genes
all_sigs <- all_enrich[(all_enrich$pgo < 0.05),]
cat1_sigs <- cat1_enrich[(cat1_enrich$pgo < 0.05),]
all_sigs <- subset(all_sigs, (number_in_reference >= 5))
all_sigs <- subset(all_sigs, (median >= 1))
cat1_sigs <- subset(cat1_sigs, (number_in_reference >= 5))
cat1_sigs <- subset(cat1_sigs, (median >= 1))
write.table(all_enrich, "all_enrich_m2.txt", sep = "\t", row.names = F, col.names = T, quote = F)
#all_enrich <- fread("all_enrich_m2.txt", sep = "\t", header = T)
write.table(cat1_enrich, "cat1_enrich_m2.txt", sep = "\t", row.names = F, col.names = T, quote = F)
#cat1_enrich <- fread("cat1_enrich_m2.txt", sep = "\t", header = T)
Next we will do a GO term clustering to figure out common groups of GO terms
#R
library(stringr)
#do GO similarity clustering for each group
panther_out <- fread("all_enrich_m2.txt", sep = "\t", header = T)
panther_cat1_out <- fread("cat1_enrich_m2.txt", sep = "\t", header = T)
panther_merge <- merge(panther_out, panther_cat1_out, by = "term.id", all.y=T, all.x=F)
panther_merge$enrich_diff <- (panther_merge$enrich.y - panther_merge$enrich.x)
panther_merge <- subset(panther_merge, (number_in_reference.y >= 5) & (median.y >= 1) & (pgo.y < 0.05))
library(jsonlite)
sub_json <- read_json("panther_results_cat1.json")
go_output <- data.frame(gene=character(), GO=character())
for (i in seq(1,length(sub_json$overrepresentation$group))){
if (is.null(names(sub_json$overrepresentation$group[[i]]$result))){
for (j in seq(1, length(sub_json$overrepresentation$group[[i]]$result))){
if (sub_json$overrepresentation$group[[i]]$result[[j]]$input_list$number_in_list > 0){
go_output <- rbind(go_output, data.frame(gene = unlist(sub_json$overrepresentation$group[[i]]$result[[j]]$input_list$mapped_id_list$mapped_id),
GO = sub_json$overrepresentation$group[[i]]$result[[j]]$term$id))
}
}
}
else{
if (sub_json$overrepresentation$group[[i]]$result$input_list$number_in_list > 0){
if (is.null(sub_json$overrepresentation$group[[i]]$result$term$id)){
go_output <- rbind(go_output,
data.frame(gene = unlist(sub_json$overrepresentation$group[[i]]$result$input_list$mapped_id_list$mapped_id),
GO = NA))
}
else{
go_output <- rbind(go_output,
data.frame(gene = unlist(sub_json$overrepresentation$group[[i]]$result$input_list$mapped_id_list$mapped_id),
GO = sub_json$overrepresentation$group[[i]]$result$term$id))
}
}
}
}
#calculate per gene score for each GO
gene_taxa <- read.table("gene_taxa_m2.txt", sep = "\t", header = T)
gene_taxa_cat1 <- read.table("gene_taxa_cat1_m2.txt", sep = "\t", header = T)
go_scores <- data.frame(GO=panther_merge$term.id, gene=NA, element=NA)
for (GO_term in panther_merge$term.id){
go_scores$gene[which(go_scores$GO %in% GO_term)] <- (mean(gene_taxa$targets_with_shifts[gene_taxa$gene %in% go_output$gene[go_output$GO %in% GO_term]]))
}
panther_merge <- merge(panther_merge, go_scores, by.x="term.id", by.y="GO")
panther_merge_max <- panther_merge[which(panther_merge$enrich_diff > 5),]
panther_merge_max$label <- paste(panther_merge_max$term.id, panther_merge_max$term.label.x, sep = "-")
library(ggrepel)
ggplot(panther_merge, aes(x=enrich.x, y=enrich.y, color=gene, fill=gene, segment.color=gene))+
geom_point()+
scale_color_gradient(low = "blue", high = "red") +
xlab("Enrichment:Short-tarsi Broad Dataset") +
ylab("Enrichment:Short-tarsi Specific Dataset") +
geom_label_repel(data = panther_merge_max, aes(x=enrich.x, y=enrich.y, label=label),
color = "white",
nudge_x = .15,
box.padding = 0.1,
nudge_y = 1,
segment.curvature = -0.1,
segment.ncp = 3,
segment.angle = 20)+
scale_fill_gradient(low = "blue", high = "red")+
theme_classic()
Plot heat maps of GO similarity
library(simplifyEnrichment)
pdf("go_sub_heatmap_m2.pdf", height = 5, width = 7)
mat_all <- GO_similarity(all_sigs$term.id, ont = "BP", db = 'org.Gg.eg.db')
go_df <- simplifyGO(mat_all)
dev.off()
pdf("go_cat1_heatmap_m2.pdf", height = 5, width = 7)
mat_cat1 <- GO_similarity(cat1_sigs$term.id, ont = "BP", db = 'org.Gg.eg.db')
go_df_cat1 <- simplifyGO(mat_cat1)
dev.off()
We looked for motifs potentially associated with limb development. To do this we first generated a list of CEs that were closest to known limb development genes.
limb_genes <- c("GLI3R", "TBX2", "SMAD2", "BMP4",
"GREM1", "GLIA", "SHH", "GDF11",
"HOXA9", "HOXA10", "HOXA11", "HOXA12", "HOXA13",
"HOXB9", "HOXB10", "HOXB11", "HOXB12", "HOXB13",
"HOXC9", "HOXC10", "HOXC11", "HOXC12", "HOXC13",
"HOXD9", "HOXD10", "HOXD11", "HOXD12", "HOXD13",
"TBX4", "PITX1", "FGF10", "PIZF", "HOXB5",
"IRX3", "IRX5", "GLI3", "SALL4", "PBX1",
"PBX2", "HAND2", "TBX3", "TBX18", "GSC",
"ISL1", "ETS1", "ETS2", "LMX1B", "WNT7A",
"EN1", "SP6", "SP8", "P63", "DIX5",
"DIX6", "FGF8", "FGF4", "FGF7", "FGF9",
"FGF17", "GREM1", "MEIS1", "MEIS2",
"CYB26B1", "TBX5", "VEGF", "MMP9", "CBFA1",
"OSX", "IHH", "PTHRP", "CNP", "FOS",
"SOX5", "SOX6", "SOX9", "FGFR3", "GDF5")
limb_genes_ce <- subset(closest, gene %in% limb_genes)
macro <- c("NC_052532.1", "NC_052533.1", "NC_052534.1", "NC_052535.1","NC_052536.1",
"NC_052537.1", "NC_052538.1", "NC_052539.1", "NC_052542.1")
micro <- c("NC_052544.1", "NC_052546.1", "NC_052548.1", "NC_052550.1", "NC_052555.1", "NC_052558.1","NC_052565.1")
Z <- "NC_052572.1"
limb_genes_ce$file <- paste(limb_genes_ce$V1, "/", limb_genes_ce$V4, ".fa", sep = "")
limb_genes_ce[which(limb_genes_ce$V1 %in% macro), "file"] <- paste("fasta_macro", unname(unlist(limb_genes_ce[which(limb_genes_ce$V1 %in% macro), "file"])), sep = "/")
limb_genes_ce[which(limb_genes_ce$V1 %in% micro), "file"] <- paste("fasta_micro", unname(unlist(limb_genes_ce[which(limb_genes_ce$V1 %in% micro), "file"])), sep = "/")
limb_genes_ce[which(limb_genes_ce$V1 %in% Z), "file"] <- gsub("NC_052572.1", "fasta_Z", unname(unlist(limb_genes_ce[which(limb_genes_ce$V1 %in% Z), "file"])))
write.table(limb_genes_ce$file, "limb_ces_m2.txt", quote = F, row.names = F, col.names = F)
Next we compiled list of motifs for these genes from JASPAR. We used the vertebrates CORE non-redundant motif file. We used MEME to search for motifs.
#bash
while IFS= read -r line; do
fbname=$(basename "$line" .fa)
echo $fbname
sed 's/-//g' ${line} > fastas/${fbname}.fa
~/meme/bin/mast limb.meme fastas/${fbname}.fa -oc $fbname -sep
done < "limb_ces_m2.txt"
To analyze MEME output and then correlate acceleration to gain of motif
#read memes compiled from JASPAR
read_memes <- read.table("limb_motifs.txt", header = T)
limb_genes_motifs <- as.data.frame(limb_genes_ce[,c("V4", "V1", "V2", "V3", "V14", "gene")])
colnames(limb_genes_motifs) <- c("ce_id", "scaffold", "start", "end", "length", "gene")
limb_genes_motifs[,read_memes$ALT_ID] <- "NA"
for (x in limb_genes_motifs$ce_id) {
tryCatch(
{
meme_hit <- read.table(paste("./meme/", x, "_hits.txt", sep = ""))
},
error=function(e) {
return(data.frame(V1=x, V4=NA))
}
)
unique_memes <- unique(meme_hit$V4)
for (unique_meme in unique_memes){
species <- paste(meme_hit$V1[which(meme_hit$V4 %in% unique_meme)], collapse = ",")
limb_genes_motifs[which(limb_genes_motifs$ce_id %in% x),unique_meme] <- species
}
}
#get accl species
limb_genes_accl <- subset(phyloacc_subset, phyloacc_subset$original.id %in% limb_genes_motifs$ce_id)
rate_data_M2 <- fread("M2_elem_Z.txt", header = T)
rate_data_M2 <- subset(rate_data_M2, rate_data_M2$`Locus ID` %in% limb_genes_accl$phyloacc.id)
rate_data <- rate_data_M2[,1:80]
results_m2 <- fread("rate_postZ_M2.txt", header = T, sep = "\t")
results_m2 <- subset(results_m2, results_m2$`Locus ID` %in% limb_genes_accl$phyloacc.id)
results <- results_m2[,c(1:6,seq(10,322,4))]
colnames(results) <- gsub("_3", "", colnames(results))
for (x in limb_genes_accl$phyloacc.id){
limb_genes_accl[which(limb_genes_accl$phyloacc.id %in% x),"accl_taxa"] <-
paste(intersect(colnames(rate_data)[rate_data[which(rate_data$`Locus ID` %in% x),] == 2],
colnames(results)[results[which(results$`Locus ID` %in% x),] >= 0.9]),
collapse = ",")
}
#calculate intersect
limb_genes_motifs_accl <- merge(limb_genes_motifs, limb_genes_accl, by.x = "ce_id", by.y = "original.id")
limb_genes_motifs_accl[limb_genes_motifs_accl == "NA"] <- ""
targets <- c("Brachypodius_melanocephalos", "Pycnonotus_jocosus",
"Hirundo_rustica", "Progne_subis", "Riparia_riparia",
"Chloroceryle_aenea", "Actenoides_hombroni", "Halcyon_senegalensis",
"Aptenodytes_patagonicus", "Aptenodytes_forsteri", "Pygoscelis_adeliae",
"Pygoscelis_papua", "Pygoscelis_antarcticus", "Megadyptes_antipodes",
"Eudyptula_novaehollandiae", "Eudyptula_albosignata", "Eudyptula_minor",
"Spheniscus_demersus", "Spheniscus_humboldti", "Spheniscus_magellanicus",
"Eudyptes_pachyrhynchus", "Eudyptes_sclateri", "Eudyptes_chrysolophus",
"Eudyptes_schlegeli", "Eudyptes_chrysocome", "Eudyptes_filholi",
"Eudyptes_moseleyi")
for (x in 1:(nrow(limb_genes_motifs_accl))){
for (y in 7:43){
limb_genes_motifs_accl[x,y] <- paste(length(unique(strsplit(limb_genes_motifs_accl[x,y], ",")[[1]])),
length(intersect(strsplit(limb_genes_motifs_accl[x,"accl_taxa"], ",")[[1]],
targets)),
length(intersect(strsplit(limb_genes_motifs_accl[x,y], ",")[[1]],
intersect(strsplit(limb_genes_motifs_accl[x,"accl_taxa"], ",")[[1]],
targets))),
length(intersect(strsplit(limb_genes_motifs_accl[x,y], ",")[[1]],
targets)),
sep=";")
}
}
colnames(limb_genes_motifs) <- gsub("\\:\\:", "-", colnames(limb_genes_motifs))
interesting <- limb_genes_motifs_accl[,c(1:43,66)]
interesting <- as.data.frame(lapply(interesting, function(y) gsub("^0;[0-9]*;[0-9]*;[0-9]*", "", y)))
interesting <- as.data.frame(lapply(interesting, function(y) gsub("^[0-9]*;[0-9]*;[0-9]*;0", "", y)))
interesting <- as.data.frame(lapply(interesting, function(y) gsub("^[0-9]*;0;0;[0-9]*", "", y)))
#colnames(interesting) <- gsub("\\.\\.", "-", colnames(interesting))
#format: all with motif; accelerated taxa; target accl with motifs; target taxa with motifs
get_accl <- function(motif_data, interesting_data, element, motif){
accl_taxa <- strsplit(interesting_data[which(interesting_data$ce_id %in% element),"accl_taxa"], ",")[[1]]
bulbul_accl <- intersect(accl_taxa, c("Brachypodius_melanocephalos", "Pycnonotus_jocosus"))
king_accl <- intersect(accl_taxa, c("Chloroceryle_aenea", "Actenoides_hombroni", "Halcyon_senegalensis"))
penguin_accl <- intersect(accl_taxa, c("Aptenodytes_patagonicus", "Aptenodytes_forsteri", "Pygoscelis_adeliae",
"Pygoscelis_papua", "Pygoscelis_antarcticus", "Megadyptes_antipodes",
"Eudyptula_novaehollandiae", "Eudyptula_albosignata", "Eudyptula_minor",
"Spheniscus_demersus", "Spheniscus_humboldti", "Spheniscus_magellanicus",
"Eudyptes_pachyrhynchus", "Eudyptes_sclateri", "Eudyptes_chrysolophus",
"Eudyptes_schlegeli", "Eudyptes_chrysocome", "Eudyptes_filholi",
"Eudyptes_moseleyi"))
swallow_accl <- intersect(accl_taxa, c("Hirundo_rustica", "Progne_subis", "Riparia_riparia"))
motif_taxa <- strsplit(motif_data[which(motif_data$ce_id %in% element), motif], ",")[[1]]
bulbul_motif <- intersect(motif_taxa, c("Brachypodius_melanocephalos", "Pycnonotus_jocosus"))
king_motif <- intersect(motif_taxa, c("Chloroceryle_aenea", "Actenoides_hombroni", "Halcyon_senegalensis"))
penguin_motif <- intersect(motif_taxa, c("Aptenodytes_patagonicus", "Aptenodytes_forsteri", "Pygoscelis_adeliae",
"Pygoscelis_papua", "Pygoscelis_antarcticus", "Megadyptes_antipodes",
"Eudyptula_novaehollandiae", "Eudyptula_albosignata", "Eudyptula_minor",
"Spheniscus_demersus", "Spheniscus_humboldti", "Spheniscus_magellanicus",
"Eudyptes_pachyrhynchus", "Eudyptes_sclateri", "Eudyptes_chrysolophus",
"Eudyptes_schlegeli", "Eudyptes_chrysocome", "Eudyptes_filholi",
"Eudyptes_moseleyi"))
swallow_motif <- intersect(motif_taxa, c("Hirundo_rustica", "Progne_subis", "Riparia_riparia"))
print(paste("Bulbul accl:", paste(bulbul_accl, collapse = ",")))
print(paste("Bulbul motif:", paste(bulbul_motif, collapse = ",")))
print("")
print(paste("Kingfisher accl:", paste(king_accl, collapse = ",")))
print(paste("Kingfisher motif:", paste(king_motif, collapse = ",")))
print("")
print(paste("Swallow accl:", paste(swallow_accl, collapse = ",")))
print(paste("Swallow motif:", paste(swallow_motif, collapse = ",")))
print("")
print(paste("Penguin accl:", paste(penguin_accl, collapse = ",")))
print(paste("Penguin motif:", paste(penguin_motif, collapse = ",")))
print("")
}
#get motifs where there is a potential gain in motif through acceleration
for (x in 7:43){
for (y in 1:(nrow(interesting))){
split_vars <- strsplit(interesting[y,x], ";")[[1]]
if (length(split_vars) == 4){
if ((split_vars[2] == split_vars[3]) & (split_vars[4] != 27)){
print(interesting[y, 1])
print(colnames(interesting)[x])
get_accl(limb_genes_motifs, limb_genes_motifs_accl, interesting[y, 1], colnames(interesting)[x])
}
}
}
}