Note

Following includes codes used to generate, run, and analyze phyloacc results. It uses phyloacc program (see installation link here), bash and R codes interchangeably along with other instructions to run. Please make sure you are running the codes in the correct interface.

1 Input files

1.1 Conserved elements fasta

Once you have a bed of conserved elements, we need to get alignments of the elements from the hal alignments. Steps to get the hal alignment are here. First we need to get maf alignments of the elements for conserved regions, separated by chromosomes.

#bash
#loop over bed file by chromosome and create mafs for each
#$1 is an input file that contains a list of chromosomes for a category in $2 (Z, macro, or micro)

while IFS= read -r line; do
  mkdir -p final_dataset/maf_${2}_sort
  mkdir -p final_dataset/beds/
  grep -P $line final_dataset/final_working_conserved_filt.bed > final_dataset/beds/${line}.bed
  hal2mafMP.py --numProc 6 --refGenome Gallus_gallus7 --noDupes --noAncestors --refTargets final_dataset/beds/${line}.bed all_birds_new.hal final_dataset/maf_${2}_sort/all_birds_${line}.maf
done < "$1"

Next we go from maf to fasta. Here is my github page for a script to convert maf files to fasta files.

#bash
#example of loop for macrochromosomes
while IFS= read -r line; do
do
       mkdir -p fasta/beds/
       grep -P ${line} cnee_macro.bed > fasta/beds/${line}.bed
       python phast_scripts/maf2fasta.py --maf maf_macro/all_birds_${line}.maf \
                                          --bed fasta/beds/${line}.bed \
                                          --out_folder fasta/maf_macro/${line} \
                                          --ref_species Gallus_gallus7
done < "$1"

Phyloacc takes a folder with elements. However, we concatenated the fasta files into a single file and removed alignments with less than 10% total taxa (so removes alignments with less than 8 species) using the ConcatinaTOR script.

#bash
#example for macrochromosomes
python ConcatinaTOR.py --fasta_folder fasta_macro \
                        --bed_file cnee_macro.txt \
                        --write_interval 10000 \
                        --output_folder fasta_concat/fasta_macro_concat/ \
                        --n_taxa 79 \
                        --per_taxa 0.10 \
                        --recursive
                        
cat fasta_macro/*.fa > cnee_macro.fa

Also to get some stats using the get_fasta_stat.py script

#python
python get_fasta_stat.py --input_folder /n/holylfs05/LABS/informatics/Users/sshakya/genomes_phyloacc/final_dataset/fasta_macro/ --output macro_stats
python get_fasta_stat.py --input_folder /n/holylfs05/LABS/informatics/Users/sshakya/genomes_phyloacc/final_dataset/fasta_micro/ --output micro_stats
python get_fasta_stat.py --input_folder /n/holylfs05/LABS/informatics/Users/sshakya/genomes_phyloacc/final_dataset/fasta_Z/ --output Z_stats

Plot the stats from the stats generated above

#R
library(tidyr)
align_stats <- xlsx::read.xlsx("alignment_stats.xlsx", sheetIndex = 2)
align_stats <- pivot_longer(align_stats, c("Z", "macro", "micro"), names_to = "Chromosomes", values_to = "Count")
ggplot(align_stats, aes(x=Number, y=Count, fill = Chromosomes))+
  geom_bar(stat="identity", position=position_dodge())+
  scale_x_reverse()+
  theme_bw()+
  facet_grid(row = vars(Chromosomes), scales = "free")

2 Running phyloacc

We ran phyloacc (phyloacc man here). We ran elements in 3 batches, with separate rates and trees for macro, micro, and Z chromosomes respectively.

#bash
#see phyloacc manual to see what each of these flags to. charset.txt is generated by ConcatinaTOR.py, 
#charset_id.txt is the fourth column of charset.txt
phyloacc.py --overwrite -a cnee_macro.fa \
                        -b charset.txt \
                        -i charset_id.txt \
                        -m all_birds_4d_macro_non-conserved.mod \
                        -o output \
                        -t "Aptenodytes_patagonicus;Aptenodytes_forsteri;Pygoscelis_adeliae;Pygoscelis_papua;Pygoscelis_antarcticus;Megadyptes_antipodes;Eudyptula_novaehollandiae;Eudyptula_albosignata;Eudyptula_minor;Spheniscus_demersus;Spheniscus_humboldti;Spheniscus_magellanicus;Eudyptes_pachyrhynchus;Eudyptes_sclateri;Eudyptes_chrysolophus;Eudyptes_schlegeli;Eudyptes_chrysocome;Eudyptes_filholi;Eudyptes_moseleyi;Halcyon_senegalensis;Chloroceryle_aenea;Actenoides_hombroni;Progne_subis;Riparia_riparia;Hirundo_rustica;Brachypodius_melanocephalos;Pycnonotus_jocosus" \
                        -g "Gavialis_gangeticus" \
                        -n 4 \
                        -batch 100 \
                        -j 20 \
                        -part "shared"

3 Analyzing phyloacc results

Plot for log of Bayes factor1 (comparison between the null model (with no acceleration allowed), and the target-lineage model (where acceleration is allowed in only the focal/target lineages)) and log of Bayes factor 2 (comparison between the target-lineage model (where acceleration is only allowed in only the focal/target lineages) and the unrestricted model (where acceleration can occur anywhere))

#R
library(plyr)
library(phangorn)
library(phytools)
library(data.table)

#read main result file
phyloacc <- fread("elem_lik_filt.txt", sep = "\t", header = T)

#find location of shifts on tree aka where acceleration occurred since phyloacc reports 
#all daughter branches of a shift as being accelerated
shift_calc <- function(tree, post_phylo, subtree = ""){
  tree_quant <- data.frame(node = tree$tip.label, id = seq(1, length(tree$tip.label)), type = "tip")
  tree_quant <- rbind(tree_quant, data.frame(node = tree$node.label, 
                                             id = seq(nrow(tree_quant) + 1 , length(tree$node.label) + nrow(tree_quant)), 
                                             type = "internal"))
  tree_quant <- merge(tree_quant, data.frame(edge = tree$edge[,2], edge_len = tree$edge.length, anc = tree$edge[,1]), 
                      by.x = "id", by.y = "edge", all.x = T)
  
  if (length(subtree) <= 1){
    all_daughts <- tree_quant$node[-which(tree_quant$node %in% "root")]
  }
  else{
    mrca <- getMRCA(tree, subtree)
    start_node <- tree_quant$anc[which(tree_quant$id %in% mrca)]
    all_daughts <- tree_quant$node[which(tree_quant$id %in% Descendants(tree, mrca, "all"))]
    all_daughts <- c(tree_quant$node[which(tree_quant$id %in% start_node)],
                     tree_quant$node[which(tree_quant$id %in% mrca)],
                     all_daughts)
  }
  
  tips <- paste(all_daughts[which(all_daughts %in% tree_quant$node[which(tree_quant$type %in% "tip")])], 3, sep = "_")
  internals <- paste(all_daughts[which(all_daughts %in% tree_quant$node[which(tree_quant$type %in% "internal")])], 3, sep = "_")
  
  shifts <- rowSums(post_phylo[, ..tips]) - rowSums(post_phylo[,..internals])
  return(shifts)
}

#get dataframe of shifts
run_shifts <- function(results){
  tree <- read.newick("macro_tree.tree")
  
  taxa_list <- fread("taxa_list.txt", header = F, sep = "\t")
  basic_taxa <- paste(taxa_list$V1[taxa_list$V2 == "basic"], 3, sep = "_")
  target_taxa <- paste(taxa_list$V1[taxa_list$V2 == "target"], 3, sep = "_")
  results$basic <- rowSums(results[,..basic_taxa] >= 0.9)
  results$target <- rowSums(results[,..target_taxa] >= 0.9)
  bulbul_taxa <- target_taxa[c(26,27,28)]
  results$bulbul <- rowSums(results[,..bulbul_taxa] >= 0.9)
  swallow_taxa <- target_taxa[c(23,24,25,29,38)]
  results$swallow <- rowSums(results[,..swallow_taxa] >= 0.9)
  kingfisher_taxa <- target_taxa[c(20,21,22,30,39)]
  results$kingfisher <- rowSums(results[,..kingfisher_taxa] >= 0.9)
  penguin_taxa <- target_taxa[c(1:19,31:37,40:50)]
  results$penguin <- rowSums(results[,..penguin_taxa] >= 0.9)
  
  shifts <- results[,c("Locus ID", "basic", "target", "bulbul", "swallow", "kingfisher", "penguin")]
  colnames(shifts)[1] <- "Locus"
  shifts$all_shifts <- shift_calc(tree, results)
  shifts$bulbul_shifts <- shift_calc(tree, results, subtree = c("Brachypodius_melanocephalos", "Pycnonotus_jocosus"))
  shifts$swallow_shifts <- shift_calc(tree, results, subtree = c("Hirundo_rustica", "Progne_subis", "Riparia_riparia"))
  shifts$kingfisher_shifts <- shift_calc(tree, results, subtree = c("Chloroceryle_aenea", "Actenoides_hombroni", "Halcyon_senegalensis"))
  shifts$penguin_shifts <- shift_calc(tree, results, subtree = c("Aptenodytes_patagonicus", "Aptenodytes_forsteri", "Pygoscelis_adeliae", "Pygoscelis_papua", "Pygoscelis_antarcticus", "Megadyptes_antipodes", "Eudyptula_novaehollandiae", "Eudyptula_albosignata", "Eudyptula_minor", "Spheniscus_demersus", "Spheniscus_humboldti", "Spheniscus_magellanicus", "Eudyptes_pachyrhynchus", "Eudyptes_sclateri", "Eudyptes_chrysolophus", "Eudyptes_schlegeli", "Eudyptes_chrysocome", "Eudyptes_filholi", "Eudyptes_moseleyi"))
  
  shifts$targets_with_shifts <- rowSums(shifts[,c("bulbul_shifts", "kingfisher_shifts", "swallow_shifts", "penguin_shifts")] >= 0.9)
  shifts$in_lin <- round(shifts$bulbul_shifts + shifts$kingfisher_shifts + shifts$swallow_shifts + shifts$penguin_shifts)
  shifts$out_lin <- round(shifts$all_shifts - shifts$bulbul_shifts - shifts$kingfisher_shifts - shifts$swallow_shifts - shifts$penguin_shifts)
  return(shifts)
}

results_m2 <- fread("rate_postZ_M2.txt", header = T, sep = "\t")
shifts_m2 <- run_shifts(results_m2)

We filtered the dataset for logbf1 >= 10 to get list of accelerated elements. We also filtered sites where there was acceleration in at least all bulbuls, or all kingifishers, or all swallows, or at least 3 penguins. This ensures that any element where an acceleration was not present in at least one of the three target clades or more than 3 species of penguins are omitted.

We also created a second highly filtered dataset where only elements with logbf1 >= 10 and logbf2 >=5 were kept. This would keep only elements where there acceleration was limited to the target clade.

#filter for all sites with evidence for acceleration (logbf1 > 10)
phyloacc_subset <- subset(phyloacc, logbf1 > 10)
phyloacc_subset <- merge(phyloacc_subset, shifts_m2, by.x = "phyloacc.id", by.y = "Locus", all.x = T, all.y = F)
phyloacc_subset <- subset(phyloacc_subset, targets_with_shifts > 0 & (bulbul > 1 | kingfisher > 2 | swallow > 2 | penguin > 2)) 
fwrite(phyloacc_subset, "accelerated_elements_all_m2.txt", sep = "\t")

phyloacc_sub_cat1 <- subset(phyloacc_subset, logbf2 >= 5)
fwrite(phyloacc_sub_cat1, "accelerated_elements_filtbf2_m2.txt", sep = "\t")
#read main result file
library(data.table)
phyloacc <- fread("elem_lik_filt.txt", sep = "\t", header = T)
phyloacc_subset <- fread("accelerated_elements_all_m2.txt", sep = "\t")
phyloacc_subset_cat1 <- fread("accelerated_elements_filtbf2_m2.txt", sep = "\t")
phyloacc$col <- "CNEE"
phyloacc$col[which(phyloacc$phyloacc.id %in% phyloacc_subset$phyloacc.id)] <- "short-tarsi broad"
phyloacc$col[which(phyloacc$phyloacc.id %in% phyloacc_subset_cat1$phyloacc.id)] <- "short-tarsi specific"
#plot logbf1 vs logbf2
library(ggplot2)
ggplot(phyloacc, aes(x=logbf1, y=logbf2, col=col))+
  geom_point(shape = 19, alpha = 0.1)+
  theme_classic()+
  scale_color_manual(values = c("grey", "red", "blue")) +
  geom_vline(xintercept=10, linetype="dashed", color = "red")+
  geom_hline(yintercept=5, linetype="dashed", color = "blue")

We have two datasets, one large dataset with very little filtering with 7948 elements and a smaller highly filtered dataset with 403 elements.

Distribution of conserved rate vs accelerated rate for elements

library(gridExtra)
phyloacc_subset <- fread("accelerated_elements_all.txt", sep = "\t")
#rate0 <- fread("rate_postZ_M0.txt", sep = "\t", header = T, select = 1:6)
rate1 <- fread("rate_postZ_M1.txt", sep = "\t", header = T, select = 1:6)
rate1 <- subset(rate1, (`Locus ID` %in% phyloacc_subset$phyloacc.id) & (phyloacc_subset$best.fit.model == "M1"))
rate2 <- fread("rate_postZ_M2.txt", sep = "\t", header = T, select = 1:6)
rate2 <- subset(rate2, (`Locus ID` %in% phyloacc_subset$phyloacc.id) & (phyloacc_subset$best.fit.model == "M2"))
rates <- rbind(rate1, rate2)

p1 <- ggplot(rates)+
  geom_histogram(aes(x=n_rate), bins = 30, fill='red', alpha=0.5, position = 'identity')+
  geom_vline(xintercept = mean(rates$n_rate), col = "red", linetype = "dashed")+
  theme_classic()+
  xlab("Accelerated Rate")
#mean accelerated rate: 1.91
p2 <- ggplot(rates)+
  geom_histogram(aes(x=c_rate), bins = 30, fill='blue', alpha=0.5, position = 'identity')+
  geom_vline(xintercept = mean(rates$c_rate), col = "blue", linetype = "dashed")+
  theme_classic()+
  xlab("Conserved Rate")
grid.arrange(p1, p2, ncol=1)

#mean conserved rate: 0.26
#get stats for elements
phyloacc_subset <- fread("accelerated_elements_all_m2.txt", sep = "\t")
phyloacc_subset_cat1 <- fread("accelerated_elements_filtbf2_m2.txt", sep = "\t")
final_data_filtered <- fread("final_data_filtered.txt", sep = "\t", header = T)

final_data_accl <- subset(final_data_filtered, id %in% phyloacc_subset$original.id)
final_data_accl_cat1 <- subset(final_data_filtered, id %in% phyloacc_subset_cat1$original.id)

#atac accl elements
table(final_data_accl$atac)
## 
## FALSE  TRUE 
##  3797  4151
table(final_data_accl_cat1$atac)
## 
## FALSE  TRUE 
##   187   216
fisher.test(matrix(c(4151, 3597, 343313, 589154), nrow = 2, byrow = T),
                      alternative = "greater", simulate.p.value = T)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(c(4151, 3597, 343313, 589154), nrow = 2, byrow = T)
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  1.906846      Inf
## sample estimates:
## odds ratio 
##   1.980437
#write.table(final_data_accl[,1:4], "accl_elements.bed", sep = "\t", quote = F, row.names = F, col.names = F)
#get overlap of atac-seq data using 
#bedtools intersect -wao -a accl_elements.bed -b atac_all.bed > accl_atac_overlap.txt

atac_data <- fread("atac_all_annot.txt")
atac_overlap <- read.table("accl_atac_overlap.txt", sep="\t")
colnames(atac_overlap)[5:45] <- colnames(atac_data)
atac_overlap_true <- atac_overlap[-which(atac_overlap$atac_id %in% "."),]
length(unique(atac_overlap_true$V4)) #4151
length(unique(atac_overlap_true$V4[which(atac_overlap_true$hindlimb_HH18 == 1)])) #1575

atac_overlap_ingroup <- subset(atac_overlap_true, V4 %in% phyloacc_subset_cat1$original.id)
length(unique(atac_overlap_ingroup$V4)) #216
length(unique(atac_overlap_ingroup$V4[which(atac_overlap_true$hindlimb_HH18 == 1)])) #102
library(ggvenn)
phyloacc_subset <- fread("accelerated_elements_all_m2.txt", sep = "\t")
venn_list <- list(penguin = phyloacc_subset$original.id[which(phyloacc_subset$penguin >= 1)],
                  bulbul = phyloacc_subset$original.id[which(phyloacc_subset$bulbul >= 1)],
                  swallow = phyloacc_subset$original.id[which(phyloacc_subset$swallow >= 1)],
                  kingfisher = phyloacc_subset$original.id[which(phyloacc_subset$kingfisher >= 1)])

g1 <- ggvenn(
  venn_list, 
  fill_color = c("#ff6600", "#9999cc", "#cccc00", "#cc66ff"),
       fill_alpha = 0.8,
  stroke_size = 0.5, set_name_size = 5,
  show_elements = F, 
  show_outside = "none")

g1

getphyper <- function(overlap, list1, list2, pop){
  test_val <- phyper(overlap-1,list1,pop-list1,list2)
  return(test_val)
}

#For broad

PopSize=nrow(phyloacc_subset)
listB=length(which(phyloacc_subset$bulbul_shifts>0.9))
listP=length(which(phyloacc_subset$penguin_shifts>0.9))
listS=length(which(phyloacc_subset$swallow_shifts>0.9))
listK=length(which(phyloacc_subset$kingfisher_shifts>0.9))
#calculate whether any of these pairwise comparisons are significant using a hypergeometric test
pvals = c()
#BP
pvals = c(pvals, getphyper(length(which(phyloacc_subset$bulbul_shifts>0.9 & phyloacc_subset$penguin_shifts>0.9)), listB, listP, PopSize))
#BS
pvals = c(pvals, getphyper(length(which(phyloacc_subset$bulbul_shifts>0.9 & phyloacc_subset$swallow_shifts>0.9)), listB, listS, PopSize))
#BK
pvals = c(pvals, getphyper(length(which(phyloacc_subset$bulbul_shifts>0.9 & phyloacc_subset$kingfisher_shifts>0.9)), listB, listK, PopSize))
#SK
pvals = c(pvals, getphyper(length(which(phyloacc_subset$swallow_shifts>0.9 & phyloacc_subset$kingfisher_shifts>0.9)), listS, listK, PopSize))
#PK
pvals = c(pvals, getphyper(length(which(phyloacc_subset$penguin_shifts>0.9 & phyloacc_subset$kingfisher_shifts>0.9)), listP, listK, PopSize))
#PS
pvals = c(pvals, getphyper(length(which(phyloacc_subset$penguin_shifts>0.9 & phyloacc_subset$swallow_shifts>0.9)), listP, listS, PopSize))

p.adjust(pvals, method = "bonferroni")

#For specific
phyloacc_subset_cat1 <- fread("accelerated_elements_filtbf2_m2.txt", sep = "\t")

PopSize=nrow(phyloacc_subset_cat1)
listB=length(which(phyloacc_subset_cat1$bulbul_shifts>0.9))
listP=length(which(phyloacc_subset_cat1$penguin_shifts>0.9))
listS=length(which(phyloacc_subset_cat1$swallow_shifts>0.9))
listK=length(which(phyloacc_subset_cat1$kingfisher_shifts>0.9))
#calculate whether any of these pairwise comparisons are significant using a hypergeometric test
pvals = c()
#BP
pvals = c(pvals, getphyper(length(which(phyloacc_subset_cat1$bulbul_shifts > 0.9 & phyloacc_subset_cat1$penguin_shifts > 0.9)), listB, listP, PopSize))
#BS
pvals = c(pvals, getphyper(length(which(phyloacc_subset_cat1$bulbul_shifts > 0.9 & phyloacc_subset_cat1$swallow_shifts > 0.9)), listB, listS, PopSize))
#BK
pvals = c(pvals, getphyper(length(which(phyloacc_subset_cat1$bulbul_shifts > 0.9 & phyloacc_subset_cat1$kingfisher_shifts > 0.9)), listB, listK, PopSize))
#SK
pvals = c(pvals, getphyper(length(which(phyloacc_subset_cat1$swallow_shifts > 0.9 & phyloacc_subset_cat1$kingfisher_shifts > 0.9)), listS, listK, PopSize))
#PK
pvals = c(pvals, getphyper(length(which(phyloacc_subset_cat1$penguin_shifts > 0.9 & phyloacc_subset_cat1$kingfisher_shifts > 0.9)), listP, listK, PopSize))
#PS
pvals = c(pvals, getphyper(length(which(phyloacc_subset_cat1$penguin_shifts > 0.9 & phyloacc_subset_cat1$swallow_shifts > 0.9)), listP, listS, PopSize))

p.adjust(pvals, method = "bonferroni")
phyloacc_subset$targets_with_shifts <- rowSums(phyloacc_subset[,c("bulbul", "kingfisher", "swallow", "penguin")] >= 1)

ggplot(phyloacc_subset, aes(x=targets_with_shifts))+
  geom_histogram(fill="black", col="grey", bins = 4)+
  theme_classic()

3.1 Convergence test

#R
library(plyr)
library(phangorn)
library(phytools)
library(data.table)

#read main result file
phyloacc <- fread("elem_lik_filt.txt", sep = "\t", header = T)

#find location of shifts on tree aka where acceleration occurred since phyloacc reports 
#all daughter branches of a shift as being accelerated
shift_calc <- function(tree, post_phylo, subtree = ""){
  tree_quant <- data.frame(node = tree$tip.label, id = seq(1, length(tree$tip.label)), type = "tip")
  tree_quant <- rbind(tree_quant, data.frame(node = tree$node.label, 
                                             id = seq(nrow(tree_quant) + 1 , length(tree$node.label) + nrow(tree_quant)), 
                                             type = "internal"))
  tree_quant <- merge(tree_quant, data.frame(edge = tree$edge[,2], edge_len = tree$edge.length, anc = tree$edge[,1]), 
                      by.x = "id", by.y = "edge", all.x = T)
  
  if (length(subtree) <= 1){
    all_daughts <- tree_quant$node[-which(tree_quant$node %in% "root")]
  }
  else{
    mrca <- getMRCA(tree, subtree)
    start_node <- tree_quant$anc[which(tree_quant$id %in% mrca)]
    all_daughts <- tree_quant$node[which(tree_quant$id %in% Descendants(tree, mrca, "all"))]
    all_daughts <- c(tree_quant$node[which(tree_quant$id %in% start_node)],
                     tree_quant$node[which(tree_quant$id %in% mrca)],
                     all_daughts)
  }
  
  tips <- paste(all_daughts[which(all_daughts %in% tree_quant$node[which(tree_quant$type %in% "tip")])], 3, sep = "_")
  internals <- paste(all_daughts[which(all_daughts %in% tree_quant$node[which(tree_quant$type %in% "internal")])], 3, sep = "_")
  
  shifts <- rowSums(post_phylo[, ..tips]) - rowSums(post_phylo[,..internals])
  return(shifts)
}

#get dataframe of shifts
run_shifts <- function(results){
  tree <- read.newick("macro_tree.tree")
  
  taxa_list <- fread("taxa_list_out.txt", header = F, sep = "\t")
  basic_taxa <- paste(taxa_list$V1[taxa_list$V2 == "basic"], 3, sep = "_")
  target_taxa <- paste(taxa_list$V1[taxa_list$V2 == "target"], 3, sep = "_")
  results$basic <- rowSums(results[,..basic_taxa] >= 0.9)
  results$target <- rowSums(results[,..target_taxa] >= 0.9)
  bulbul_taxa <- target_taxa[c(5,6,11)]
  results$abulbul <- rowSums(results[,..bulbul_taxa] >= 0.9)
  swallow_taxa <- target_taxa[c(7,8,9,10,12,15,16)]
  results$aswallow <- rowSums(results[,..swallow_taxa] >= 0.9)
  kingfisher_taxa <- target_taxa[c(3,4,13)]
  results$akingfisher <- rowSums(results[,..kingfisher_taxa] >= 0.9)
  penguin_taxa <- target_taxa[c(1,2,14)]
  results$apenguin <- rowSums(results[,..penguin_taxa] >= 0.9)
  
  shifts <- results[,c("Locus ID", "basic", "target", "abulbul", "aswallow", "akingfisher", "apenguin")]
  colnames(shifts)[1] <- "Locus"
  shifts$all_shifts <- shift_calc(tree, results)
  shifts$abulbul_shifts <- shift_calc(tree, results, subtree = c("Sturnus_vulgaris", "Ficedula_albicollis"))
  shifts$aswallow_shifts <- shift_calc(tree, results, subtree = c("Emberiza_elegans", "Molothrus_ater", "Setophaga_coronata", "Diglossa_brunneiventris"))
  shifts$akingfisher_shifts <- shift_calc(tree, results, subtree = c("Pogoniulus_pusillus", "Colaptes_auratus"))
  shifts$apenguin_shifts <- shift_calc(tree, results, subtree = c("Pluvialis_apricaria", "Sterna_hirundo"))
  shifts$targets_with_shifts <- rowSums(shifts[,c("abulbul_shifts", "akingfisher_shifts", "aswallow_shifts", "apenguin_shifts")] >= 0.9)
  shifts$in_lin <- round(shifts$abulbul_shifts + shifts$akingfisher_shifts + shifts$aswallow_shifts + shifts$apenguin_shifts)
  shifts$out_lin <- round(shifts$all_shifts - shifts$abulbul_shifts - shifts$akingfisher_shifts - shifts$aswallow_shifts - shifts$apenguin_shifts)
  return(shifts)
}

#calculate shift location for elements where m2 is best model
results_m2 <- fread("rate_postZ_M2.txt", header = T, sep = "\t")
shifts_m2 <- run_shifts(results_m2)

phyloacc_subset_a <- subset(phyloacc, logbf3 > 10)
phyloacc_subset_a <- merge(phyloacc_subset_a, shifts_m2, by.x = "phyloacc.id", by.y = "Locus", all.x = T, all.y = F)
phyloacc_subset_a <- subset(phyloacc_subset_a, targets_with_shifts > 0 & (abulbul > 1 | akingfisher > 1 | aswallow > 2 | apenguin > 1)) 
write.table(phyloacc_subset_a, "accelerated_elements_outgroups.txt", sep = "\t", row.names = F, col.names = T, quote = F)
phyloacc_subset <- fread("accelerated_elements_all_m2.txt", sep = "\t")
phyloacc_subset_a <- fread("accelerated_elements_outgroups.txt", sep = "\t")

#for the four group
conv_4 <- fisher.test(matrix(c(length(which(phyloacc_subset$targets_with_shifts == 4)),
                             length(which(phyloacc_subset_a$targets_with_shifts == 4)),
                             nrow(phyloacc_subset)-length(which(phyloacc_subset$targets_with_shifts == 4)),
                             nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$targets_with_shifts == 4))),nrow = 2),
                      alternative = "greater")

#for the three groups
conv_3_bks <- fisher.test(matrix(c(length(which(phyloacc_subset$bulbul_shifts > 0.9 &
                                                  phyloacc_subset$kingfisher_shifts > 0.9 &
                                                  phyloacc_subset$swallow_shifts > 0.9)),
                             length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
                                                  phyloacc_subset_a$akingfisher_shifts > 0.9 &
                                                  phyloacc_subset_a$aswallow_shifts > 0.9)),
                             nrow(phyloacc_subset)-length(which(phyloacc_subset$bulbul_shifts > 0.9 &
                                                  phyloacc_subset$kingfisher_shifts > 0.9 &
                                                  phyloacc_subset$swallow_shifts > 0.9)),
                             nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
                                                  phyloacc_subset_a$akingfisher_shifts > 0.9 &
                                                  phyloacc_subset_a$aswallow_shifts > 0.9))),nrow = 2, byrow = T),
                      alternative = "greater")
conv_3_bsp <- fisher.test(matrix(c(length(which(phyloacc_subset$bulbul_shifts > 0.9 &
                                                  phyloacc_subset$penguin_shifts > 0.9 &
                                                  phyloacc_subset$swallow_shifts > 0.9)),
                             length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
                                                  phyloacc_subset_a$apenguin_shifts > 0.9 &
                                                  phyloacc_subset_a$aswallow_shifts > 0.9)),
                             nrow(phyloacc_subset)-length(which(phyloacc_subset$bulbul_shifts > 0.9 &
                                                  phyloacc_subset$penguin_shifts > 0.9 &
                                                  phyloacc_subset$swallow_shifts > 0.9)),
                             nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
                                                  phyloacc_subset_a$apenguin_shifts > 0.9 &
                                                  phyloacc_subset_a$aswallow_shifts > 0.9))),nrow = 2, byrow = T),
                      alternative = "greater")
conv_3_bkp <- fisher.test(matrix(c(length(which(phyloacc_subset$bulbul_shifts > 0.9 &
                                                  phyloacc_subset$kingfisher_shifts > 0.9 &
                                                  phyloacc_subset$penguin_shifts > 0.9)),
                             length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
                                                  phyloacc_subset_a$akingfisher_shifts > 0.9 &
                                                  phyloacc_subset_a$apenguin_shifts > 0.9)),
                             nrow(phyloacc_subset)-length(which(phyloacc_subset$bulbul_shifts > 0.9 &
                                                  phyloacc_subset$kingfisher_shifts > 0.9 &
                                                  phyloacc_subset$penguin_shifts > 0.9)),
                             nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
                                                  phyloacc_subset_a$akingfisher_shifts > 0.9 &
                                                  phyloacc_subset_a$apenguin_shifts > 0.9))),nrow = 2, byrow = T),
                      alternative = "greater")
conv_3_skp <- fisher.test(matrix(c(length(which(phyloacc_subset$swallow_shifts > 0.9 &
                                                  phyloacc_subset$kingfisher_shifts > 0.9 &
                                                  phyloacc_subset$penguin_shifts > 0.9)),
                             length(which(phyloacc_subset_a$aswallow_shifts > 0.9 &
                                                  phyloacc_subset_a$akingfisher_shifts > 0.9 &
                                                  phyloacc_subset_a$apenguin_shifts > 0.9)),
                             nrow(phyloacc_subset)-length(which(phyloacc_subset$swallow_shifts > 0.9 &
                                                  phyloacc_subset$kingfisher_shifts > 0.9 &
                                                  phyloacc_subset$penguin_shifts > 0.9)),
                             nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$aswallow_shifts > 0.9 &
                                                  phyloacc_subset_a$akingfisher_shifts > 0.9 &
                                                  phyloacc_subset_a$apenguin_shifts > 0.9))),nrow = 2, byrow = T),
                      alternative = "greater")

# for the two groups
conv_2_bk <- fisher.test(matrix(c(length(which(phyloacc_subset$bulbul_shifts > 0.9 &
                                                  phyloacc_subset$kingfisher_shifts > 0.9)),
                             length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
                                                  phyloacc_subset_a$akingfisher_shifts > 0.9)),
                             nrow(phyloacc_subset)-length(which(phyloacc_subset$bulbul_shifts > 0.9 &
                                                  phyloacc_subset$kingfisher_shifts > 0.9)),
                             nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
                                                  phyloacc_subset_a$akingfisher_shifts > 0.9))),nrow = 2, byrow = T),
                      alternative = "greater")
conv_2_bs <- fisher.test(matrix(c(length(which(phyloacc_subset$bulbul_shifts > 0.9 &
                                                  phyloacc_subset$swallow_shifts > 0.9)),
                             length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
                                                  phyloacc_subset_a$aswallow_shifts > 0.9)),
                             nrow(phyloacc_subset)-length(which(phyloacc_subset$bulbul_shifts > 0.9 &
                                                  phyloacc_subset$swallow_shifts > 0.9)),
                             nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
                                                  phyloacc_subset_a$aswallow_shifts > 0.9))),nrow = 2, byrow = T),
                      alternative = "greater")
conv_2_bp <- fisher.test(matrix(c(length(which(phyloacc_subset$bulbul_shifts > 0.9 &
                                                  phyloacc_subset$penguin_shifts > 0.9)),
                             length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
                                                  phyloacc_subset_a$apenguin_shifts > 0.9)),
                             nrow(phyloacc_subset)-length(which(phyloacc_subset$bulbul_shifts > 0.9 &
                                                  phyloacc_subset$penguin_shifts > 0.9)),
                             nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$abulbul_shifts > 0.9 &
                                                  phyloacc_subset_a$apenguin_shifts > 0.9))),nrow = 2, byrow = T),
                      alternative = "greater")
conv_2_sk <- fisher.test(matrix(c(length(which(phyloacc_subset$swallow_shifts > 0.9 &
                                                  phyloacc_subset$kingfisher_shifts > 0.9)),
                             length(which(phyloacc_subset_a$aswallow_shifts > 0.9 &
                                                  phyloacc_subset_a$akingfisher_shifts > 0.9)),
                             nrow(phyloacc_subset)-length(which(phyloacc_subset$swallow_shifts > 0.9 &
                                                  phyloacc_subset$kingfisher_shifts > 0.9)),
                             nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$aswallow_shifts > 0.9 &
                                                  phyloacc_subset_a$akingfisher_shifts > 0.9))),nrow = 2, byrow = T),
                      alternative = "greater")
conv_2_sp <- fisher.test(matrix(c(length(which(phyloacc_subset$swallow_shifts > 0.9 &
                                                  phyloacc_subset$penguin_shifts > 0.9)),
                             length(which(phyloacc_subset_a$aswallow_shifts > 0.9 &
                                                  phyloacc_subset_a$apenguin_shifts > 0.9)),
                             nrow(phyloacc_subset)-length(which(phyloacc_subset$swallow_shifts > 0.9 &
                                                  phyloacc_subset$penguin_shifts > 0.9)),
                             nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$aswallow_shifts > 0.9 &
                                                  phyloacc_subset_a$apenguin_shifts > 0.9))),nrow = 2, byrow = T),
                      alternative = "greater")
conv_2_pk <- fisher.test(matrix(c(length(which(phyloacc_subset$penguin_shifts > 0.9 &
                                                  phyloacc_subset$kingfisher_shifts > 0.9)),
                             length(which(phyloacc_subset_a$apenguin_shifts > 0.9 &
                                                  phyloacc_subset_a$akingfisher_shifts > 0.9)),
                             nrow(phyloacc_subset)-length(which(phyloacc_subset$penguin_shifts > 0.9 &
                                                  phyloacc_subset$kingfisher_shifts > 0.9)),
                             nrow(phyloacc_subset_a)-length(which(phyloacc_subset_a$apenguin_shifts > 0.9 &
                                                  phyloacc_subset_a$akingfisher_shifts > 0.9))),nrow = 2, byrow = T),
                      alternative = "greater")

conv_df <- data.frame(groups=c("BK", "BP", "BS", "PK", "SK", "SP", "BKP", "BKS", "BSP", "SKP", "BKSP"),
                      prob=c(conv_2_bk$p.value, conv_2_bp$p.value, conv_2_bs$p.value, 
                             conv_2_pk$p.value, conv_2_sk$p.value, conv_2_sp$p.value,
                             conv_3_bkp$p.value, conv_3_bks$p.value, conv_3_bsp$p.value, conv_3_skp$p.value,
                             conv_4$p.value),
                      odds=c(conv_2_bk$estimate, conv_2_bp$estimate, conv_2_bs$estimate, 
                             conv_2_pk$estimate, conv_2_sk$estimate, conv_2_sp$estimate,
                             conv_3_bkp$estimate, conv_3_bks$estimate, conv_3_bsp$estimate, conv_3_skp$estimate,
                             conv_4$estimate))
conv_df$groups <- factor(conv_df$groups, levels = conv_df$groups)
ggplot(conv_df, aes(x=groups, y=odds))+
  geom_point(size=2)+
  theme_bw()+
  annotate("rect", xmin = "BK", xmax = "BKSP", ymin = 0, ymax = 1, col="grey", alpha=0.3)

3.2 PANTHER gene enrichment

3.2.1 Setup data

To understand the functions of genes we have identified as accelerated, we will calculate a PANTHER gene enrichment score. Because conserved elements are not randomly distributed across the genome, we cannot use the default enrichment and p-value score reported by PANTHER pipeline. We will need to calculate these values independently.

#R
all_data <- fread("conserved_elements_all.txt")
final_data_filtered <- subset(all_data, id %in% phyloacc$original.id)
#Get loci
loci_subset <- subset(all_data, id %in% phyloacc_subset$original.id)
loci_subset_cat1 <- subset(all_data, id %in% phyloacc_subset_cat1$original.id)

#write loci
fwrite(final_data_filtered[,1:4], "loci_all_m2.txt", sep = "\t", col.names = F, row.names = F, quote = F)
fwrite(loci_subset[,1:4], "loci_accl_m2.txt", sep = "\t", col.names = F, row.names = F, quote = F)
fwrite(loci_subset_cat1[,1:4], "loci_accl_cat1_m2.txt", sep = "\t", col.names = F, row.names = F, quote = F)

Get genes 10000 bp upstream and downstream of elements

#bash
#add 10000 bp filler in front and behind loci
awk 'BEGIN{ FS="\t"; OFS=FS }{ if ($2-10000>0) print $1,$2-10000,$3+10000,$4; else print $1,0,$3+10000,$4 }' loci_all_m2.txt > loci_all_m2_10000.txt
awk 'BEGIN{ FS="\t"; OFS=FS }{ if ($2-10000>0) print $1,$2-10000,$3+10000,$4; else print $1,0,$3+10000,$4 }' loci_accl_m2.txt > loci_accl_m2_10000.txt
awk 'BEGIN{ FS="\t"; OFS=FS }{ if ($2-10000>0) print $1,$2-10000,$3+10000,$4; else print $1,0,$3+10000,$4 }' loci_accl_cat1_m2.txt > loci_accl_cat1_m2_10000.txt
#using bedtools intersect get genes in this window
bedtools intersect -wao -a loci_all_m2_10000.txt -b galgal7b_allgene.gff > loci_all_genes_m2.txt
bedtools intersect -wao -a loci_accl_m2_10000.txt -b galgal7b_allgene.gff > loci_accl_genes_m2.txt
bedtools intersect -wao -a loci_accl_cat1_m2_10000.txt -b galgal7b_allgene.gff > loci_accl_cat1_genes_m2.txt

3.2.2 Run PANTHER

Read the gene sets and run PANTHER.

#R
library(stringr)
#all genes near all conserved elements
closest_all <- fread("loci_all_genes_m2.txt", header = F)
closest_all$gene <- str_extract(closest_all$V13, "gene=[a-zA-Z0-9_]+")
closest_all$gene <- gsub("^[A-Za-z_]+=", "", closest_all$gene)
write.csv(unique(closest_all$gene),"closest_all_gene_m2.csv", row.names = F, col.names = F, quote = F, eol = ",")

#all genes near accelerated cnees in full dataset
closest <- fread("loci_accl_genes_m2.txt", header = F)
closest$gene <- str_extract(closest$V13, "gene=[a-zA-Z0-9_]+")
closest$gene <- gsub("^[A-Za-z_]+=", "", closest$gene)
write.csv(unique(closest$gene),"closest_gene_m2.csv", row.names = F, col.names = F, quote = F, eol = ",")

#get a few stats
#elements with no genes near 10kb window (1720 elements)
length(which(is.na(closest$gene)))

#number of unique genes (5008 unique genes)
length(unique(closest$gene))-1

#average number of genes per element (1.5 genes/element)
genes_perelem <- data.frame(table(table(closest$V4[-(which(is.na(closest$gene)))])), stringsAsFactors = F)
genes_perelem$Var1 <- as.integer(unlist(lapply(genes_perelem$Var1, as.character)))
genes_perelem$tot <- genes_perelem$Var1 * genes_perelem$Freq
sum(genes_perelem$tot)/sum(genes_perelem$Freq)

#all genes near accelerated cnees in ingorup dataset
closest_cat1 <- fread("loci_accl_cat1_genes_m2.txt", header = F)
closest_cat1$gene <- str_extract(closest_cat1$V13, "gene=[a-zA-Z0-9_]+")
closest_cat1$gene <- gsub("^[A-Za-z_]+=", "", closest_cat1$gene)
write.csv(unique(closest_cat1$gene),"closest_cat1_gene_m2.csv", row.names = F, col.names = F, quote = F, eol = ",")

closest2 <- merge(closest, phyloacc_subset, by.x = "V4", by.y="original.id", all.x = T)
gene_taxa <- unique(closest2[!is.na(closest2$gene),c("gene", "bulbul_shifts", "swallow_shifts", "kingfisher_shifts", "penguin_shifts", "targets_with_shifts")])
gene_taxa <- gene_taxa %>% group_by(gene) %>% 
  summarise(across(everything(), sum),
            .groups = 'drop')  %>%
  as.data.frame()
gene_taxa$targets_with_shifts <- rowSums(gene_taxa[,c("bulbul_shifts", "kingfisher_shifts", "swallow_shifts", "penguin_shifts")] >= 0.9)
gene_taxa_cat1 <- subset(gene_taxa, gene %in% closest_cat1$gene)
#write.table(gene_taxa, "gene_taxa_m2.txt", sep = "\t", col.names = T, row.names = F, quote = F)
#write.table(gene_taxa_cat1, "gene_taxa_cat1_m2.txt", sep = "\t", col.names = T, row.names = F, quote = F)

getphyper <- function(overlap, list1, list2, pop){
  test_val <- phyper(overlap-1,list1,pop-list1,list2)
  return(test_val)
}

PopSize=nrow(gene_taxa)
listB=length(which(gene_taxa$bulbul_shifts>0.9))
listP=length(which(gene_taxa$penguin_shifts>0.9))
listS=length(which(gene_taxa$swallow_shifts>0.9))
listK=length(which(gene_taxa$kingfisher_shifts>0.9))
#calculate whether any of these pairwise comparisons are significant using a hypergeometric test
pvals = c()
#BP
pvals = c(pvals, getphyper(length(which(gene_taxa$bulbul_shifts > 0.9 & gene_taxa$penguin_shifts > 0.9)), listB, listP, PopSize))
#BS
pvals = c(pvals, getphyper(length(which(gene_taxa$bulbul_shifts > 0.9 & gene_taxa$swallow_shifts > 0.9)), listB, listS, PopSize))
#BK
pvals = c(pvals, getphyper(length(which(gene_taxa$bulbul_shifts > 0.9 & gene_taxa$kingfisher_shifts > 0.9)), listB, listK, PopSize))
#SK
pvals = c(pvals, getphyper(length(which(gene_taxa$swallow_shifts > 0.9 & gene_taxa$kingfisher_shifts > 0.9)), listS, listK, PopSize))
#PK
pvals = c(pvals, getphyper(length(which(gene_taxa$penguin_shifts > 0.9 & gene_taxa$kingfisher_shifts > 0.9)), listP, listK, PopSize))
#PS
pvals = c(pvals, getphyper(length(which(gene_taxa$penguin_shifts > 0.9 & gene_taxa$swallow_shifts > 0.9)), listP, listS, PopSize))

p.adjust(pvals, method = "bonferroni")

venn_list <- list(penguin = gene_taxa$gene[which(gene_taxa$penguin >= 1)],
                  bulbul = gene_taxa$gene[which(gene_taxa$bulbul >= 1)],
                  swallow = gene_taxa$gene[which(gene_taxa$swallow >= 1)],
                  kingfisher = gene_taxa$gene[which(gene_taxa$kingfisher >= 1)])
g1 <- ggvenn(
  venn_list, 
  fill_color = c("#ff6600", "#9999cc", "#cccc00", "#cc66ff"),
       fill_alpha = 0.8,
  stroke_size = 0.5, set_name_size = 5,
  show_elements = F, 
  show_outside = "none")

ggplot(gene_taxa, aes(x=targets_with_shifts))+
  geom_histogram(fill="#4d4d4d", col="grey", bins = 4)+
  theme_classic()
#R
#Run PANTHER
library(rbioapi)
# all_closest <- read.csv("closest_all_gene.csv", header = F)
#Organism 9031 is Gallus gallus
#Dataset GO:0008150 is GO terms in biological processes group
all_enrich_main <- rba_panther_enrich(genes = unique(closest$gene), 
                                 organism = 9031, 
                                 annot_dataset = "GO:0008150")
cat1_enrich_main <- rba_panther_enrich(genes = unique(closest_cat1$gene), 
                                organism = 9031, 
                                 annot_dataset = "GO:0008150")

3.2.3 Calculate distribution of conserved elements in GO categories

Next we need to calculate a baseline value of genes in GO categories if you randomly sampled genes close to conserved elements. We repeated this sampling and calculation 10000 times. Below is the R code to run to get this distribution fo each element for a supplied input element number (here 7948 and 403).

#R
library(data.table)
library(stringr)
library(rbioapi)

args = commandArgs(trailingOnly=TRUE)
# test if there is at least one argument: if not, return an error
if (length(args)==0) {
  stop("At least one argument must be supplied", call.=FALSE)
} else if (!is.numeric(as.numeric(args[1]))){
  stop("Argument must be a number", call.=FALSE)
}

all_data <- fread("final_data_filtered.bed", header = T)
nums <- args[1]

enrich_out <- data.frame(term.id=character())
gene_out <- data.frame(term.id=character())
for (i in seq(1,1000)){
  rand_ces <- sample(all_data$id, nums)
  loci_subset <- subset(all_data[,c(1:4)], id %in% rand_ces)
  loci_subset$start <- loci_subset$start - 10000
  loci_subset$end <- loci_subset$start + 10000
  loci_subset$start[which(loci_subset$start < 0)] <- 0
  fwrite(loci_subset, paste("loci_sub_", nums, ".txt", sep=""), sep = "\t", col.names = F, row.names = F, quote = F)
  x <- paste('bedtools intersect -wao -a loci_sub_', nums, '.txt -b galgal7b_allgene.gff > loci_sub_genes_', nums, '.txt', sep="")
  system(x)
  closest <- fread(paste("loci_sub_genes_", nums, ".txt", sep = ""), header = F)
  closest$gene <- sub(".*gene=([a-zA-Z0-9_]+).*", "\\1", closest$V13)
  enrich <- rba_panther_enrich(genes = unique(closest$gene),
                                   organism = 9031,
                                   annot_dataset = "GO:0008150")
  try({
    print(paste("Run", i, sep=" "))
    colnames(enrich$result)[1] <- paste(colnames(enrich$result)[1], i, sep = "_")
    colnames(enrich$result)[2] <- paste(colnames(enrich$result)[2], i, sep = "_")
    enrich_out <- merge(enrich_out, enrich$result[,c(paste("fold_enrichment",i, sep = "_"), "term.id")], by="term.id", all=T)
    gene_out <- merge(gene_out, enrich$result[,c(paste("number_in_list",i, sep = "_"), "term.id")], by="term.id", all=T)
  }, silent = T)
}

fwrite(enrich_out, paste("enrich_out_", nums, ".txt", sep=""), sep = "\t", col.names = T, row.names = F, quote = F)
fwrite(gene_out, paste("genes_out_", nums, ".txt", sep=""), sep = "\t", col.names = T, row.names = F, quote = F)

3.2.4 Calculate effective p-values and enrichment scores

Read these distributions and calculate an effective p-value:

\[ pvalue = \frac{({\sf Number\ of\ genes\ in\ GO\ category\ for\ replicates } >= \\{\sf Calculated\ number\ of\ genes\ in\ GO\ category\ for\ accelerated\ elements })+1}{{\sf Total\ number\ of\ replicates+1}} \] and enrichment score.
\[ enrichment = \frac{\frac{\sf Calculated\ number\ of\ genes\ in\ GO\ category\ for\ accelerated\ elements }{\sf Total\ number\ of\ input\ genes}}{\frac{\sf Median\ number\ of\ genes\ in\ GO\ category\ for\ replicates }{\sf Median\ number\ of\ input\ genes\ for\ replicates}} \]

#R
enrich_dist <- fread("genes_out_7948.txt", header = T)
enrich_dist[is.na(enrich_dist)] <- 0

enrich_dist_cat1 <- fread("genes_out_403.txt", header = T)
enrich_dist_cat1[is.na(enrich_dist_cat1)] <- 0

get_pgo <- function(GO, value, distr){
  if (GO %in% distr$term.id){
    all_vals <- length(which(t(distr[which(distr$term.id %in% GO),2:ncol(distr)]) >= value))
    p <- (all_vals+1)/(ncol(distr))
    return(p)
  }else{return(0)}
}

#calculate effective p
all_enrich <- all_enrich_main$result
all_enrich$pgo <- mapply(get_pgo, all_enrich$term.id, all_enrich$number_in_list, MoreArgs=list(dist=enrich_dist))
cat1_enrich <- cat1_enrich_main$result
cat1_enrich$pgo <- mapply(get_pgo, cat1_enrich$term.id, cat1_enrich$number_in_list, MoreArgs=list(dist=enrich_dist_cat1))

#calculate enrichment score
enrich_dist$median <- MatrixGenerics::rowMedians(as.matrix(enrich_dist[,2:ncol(enrich_dist)]), useNames = F)
cols_sub <- c(1, (ncol(enrich_dist)))
all_enrich <- merge(all_enrich, enrich_dist[,..cols_sub], by = "term.id", all.x = T, all.y = F)
tot <- read.table("total_out_7948.txt")
#total unique genes for this set 5009
all_enrich$median[which(all_enrich$median == 0)] <- 0.00001
all_enrich$enrich <- (all_enrich$number_in_list/5008)/(all_enrich$median/median(tot$V1))

enrich_dist_cat1$median <- MatrixGenerics::rowMedians(as.matrix(enrich_dist_cat1[,2:(ncol(enrich_dist_cat1))]), useNames = F)
cols_sub <- c(1, (ncol(enrich_dist_cat1)))
cat1_enrich <- merge(cat1_enrich, enrich_dist_cat1[,..cols_sub], by = "term.id", all.x = T, all.y = F)
tot_cat1 <- read.table("total_out_533.txt")
#total unique genes for this set 380
cat1_enrich$median[which(cat1_enrich$median == 0)] <- 0.00001
cat1_enrich$enrich <- (cat1_enrich$number_in_list/380)/(cat1_enrich$median/median(tot_cat1$V1))

#Gather significant p-values
#subset out GO categories with less than 5 genes
all_sigs <- all_enrich[(all_enrich$pgo < 0.05),]
cat1_sigs <- cat1_enrich[(cat1_enrich$pgo < 0.05),]
all_sigs <- subset(all_sigs, (number_in_reference >= 5))
all_sigs <- subset(all_sigs, (median >= 1))
cat1_sigs <- subset(cat1_sigs, (number_in_reference >= 5))
cat1_sigs <- subset(cat1_sigs, (median >= 1))

write.table(all_enrich, "all_enrich_m2.txt", sep = "\t", row.names = F, col.names = T, quote = F)
#all_enrich <- fread("all_enrich_m2.txt", sep = "\t", header = T)
write.table(cat1_enrich, "cat1_enrich_m2.txt", sep = "\t", row.names = F, col.names = T, quote = F)
#cat1_enrich <- fread("cat1_enrich_m2.txt", sep = "\t", header = T)

3.2.5 GO similarity clustering

Next we will do a GO term clustering to figure out common groups of GO terms

#R
library(stringr)
#do GO similarity clustering for each group
panther_out <- fread("all_enrich_m2.txt", sep = "\t", header = T)
panther_cat1_out <- fread("cat1_enrich_m2.txt", sep = "\t", header = T)
panther_merge <- merge(panther_out, panther_cat1_out, by = "term.id", all.y=T, all.x=F)
panther_merge$enrich_diff <- (panther_merge$enrich.y - panther_merge$enrich.x)
panther_merge <- subset(panther_merge, (number_in_reference.y >= 5) & (median.y >= 1) & (pgo.y < 0.05))

library(jsonlite)
sub_json <- read_json("panther_results_cat1.json")

go_output <- data.frame(gene=character(), GO=character())
for (i in seq(1,length(sub_json$overrepresentation$group))){
  if (is.null(names(sub_json$overrepresentation$group[[i]]$result))){
    for (j in seq(1, length(sub_json$overrepresentation$group[[i]]$result))){
      if (sub_json$overrepresentation$group[[i]]$result[[j]]$input_list$number_in_list > 0){
        go_output <- rbind(go_output, data.frame(gene = unlist(sub_json$overrepresentation$group[[i]]$result[[j]]$input_list$mapped_id_list$mapped_id),
                                                 GO = sub_json$overrepresentation$group[[i]]$result[[j]]$term$id))
      }
    }
  }
  else{
    if (sub_json$overrepresentation$group[[i]]$result$input_list$number_in_list > 0){
      if (is.null(sub_json$overrepresentation$group[[i]]$result$term$id)){
        go_output <- rbind(go_output, 
                           data.frame(gene = unlist(sub_json$overrepresentation$group[[i]]$result$input_list$mapped_id_list$mapped_id),
                                                   GO = NA))
      }
      else{
        go_output <- rbind(go_output, 
                           data.frame(gene = unlist(sub_json$overrepresentation$group[[i]]$result$input_list$mapped_id_list$mapped_id),
                                                   GO = sub_json$overrepresentation$group[[i]]$result$term$id))
      }
    }
  }
}

#calculate per gene score for each GO
gene_taxa <- read.table("gene_taxa_m2.txt", sep = "\t", header = T)
gene_taxa_cat1 <- read.table("gene_taxa_cat1_m2.txt", sep = "\t", header = T)
go_scores <- data.frame(GO=panther_merge$term.id, gene=NA, element=NA)
for (GO_term in panther_merge$term.id){
  go_scores$gene[which(go_scores$GO %in% GO_term)] <- (mean(gene_taxa$targets_with_shifts[gene_taxa$gene %in% go_output$gene[go_output$GO %in% GO_term]]))
}

panther_merge <- merge(panther_merge, go_scores, by.x="term.id", by.y="GO")
panther_merge_max <- panther_merge[which(panther_merge$enrich_diff > 5),]
panther_merge_max$label <- paste(panther_merge_max$term.id, panther_merge_max$term.label.x, sep = "-")

library(ggrepel)

ggplot(panther_merge, aes(x=enrich.x, y=enrich.y, color=gene, fill=gene, segment.color=gene))+
  geom_point()+
  scale_color_gradient(low = "blue", high = "red") +
  xlab("Enrichment:Short-tarsi Broad Dataset") +
  ylab("Enrichment:Short-tarsi Specific Dataset") +
  geom_label_repel(data = panther_merge_max, aes(x=enrich.x, y=enrich.y, label=label),
                  color = "white",
                  nudge_x = .15,
                  box.padding = 0.1,
                  nudge_y = 1,
                  segment.curvature = -0.1,
                  segment.ncp = 3,
                  segment.angle = 20)+
  scale_fill_gradient(low = "blue", high = "red")+
  theme_classic()

Plot heat maps of GO similarity

library(simplifyEnrichment)
pdf("go_sub_heatmap_m2.pdf", height = 5, width = 7)
mat_all <- GO_similarity(all_sigs$term.id, ont = "BP", db = 'org.Gg.eg.db')
go_df <- simplifyGO(mat_all)
dev.off()

pdf("go_cat1_heatmap_m2.pdf", height = 5, width = 7)
mat_cat1 <- GO_similarity(cat1_sigs$term.id, ont = "BP", db = 'org.Gg.eg.db')  
go_df_cat1 <- simplifyGO(mat_cat1)
dev.off()