The following documents the code used to generate alignments of coding sequences and to run selection analyses.
This markdown file documents the selection tests used to identify genes under selection in short-tarsi taxa. We used two approaches: first, we used TOGA to generate coding sequence alignments for the 78 species in our study and then ran aBSREL to look for evidence of positive selection; second, we used population resequencing data to perform an MK test.
We used the HAL file generated by Cactus: see the Cactus doc. For the reference species we used the galgal7b RefSeq GFF file: here
We used TOGA (see Kirilenko et al. (2023)) to generate orthologous sequences for the 78 species. To prepare the data for TOGA, we followed the steps outlined here.
#bash
#Generate file containing chromosome lengths for galgal7 assembly
python GenomeAnnotation-TOGA/utilities/WriteChromLengthBedFromFasta.py galgal7 Gallus_gallus_GCF016699485.fna
#Gather only CDS
gffread galgal7b.gff -g Gallus_gallus_GCF016699485.fna -F -C -o galgal7b_cds.gff
#Get isoform information file
awk -F"\t" '$3=="mRNA"{print $9}' galgal7b_cds.gff | \
awk -F";" '{print $3"\t"$1}' | \
sed 's/gene_name=//g' | \
sed 's/ID=//g' > galgal7b_isoform.tsv
#convert gff to GenePred
gff3ToGenePred galgal7b.gff galgal7b.GenePred
#convert GenePred to bed
genePredToBed galgal7b.GenePred galgal7b.bed
#Keep only CDS in bed
python GenomeAnnotation-TOGA/utilities/FilterAnnotationBedForCDS.py galgal7b_isoform.tsv galgal7b.bed
#Final file is CDS_galgal7b.bed
#bash
#Create array of all species
files=( $(ls /n/holylfs05/LABS/informatics/Users/sshakya/genomes_cactus/snakemake_out/*.fna) )
#Get scaffold length files for each species
#Convert fasta to 2bit
for i in ${files[@]};
do
fbname=$(basename "$i" .fna)
python GenomeAnnotation-TOGA/utilities/WriteChromLengthBedFromFasta.py chrom_length_bed/${fbname} $i
faToTwoBit $i 2bit/${fbname}.2bit
done
#Manually removed the two Gallus gallus genomes
#Use halLiftover to convert hal to psl
#Convert psl to positive strand
#use axtChain to create chain files from PSL
files=( $(ls chrom_length_bed/*.bed) )
for i in ${files[@]};
do
fbname=$(basename $i _chromlengths.bed)
species=$(echo $fbname | awk -F_ '{print $1"_"$2}')
halLiftover --outPSL ../all_birds_new.hal $species $i Gallus_gallus7 psl/${species}_galgal7.psl
pslPosTarget psl/${species}_galgal7.psl pos_psl/${species}_galgal7.psl
axtChain -psl -verbose=0 -linearGap=loose pos_psl/${species}_galgal7.psl 2bit/${fbname}.2bit \
Gallus_gallus_GCF016699485.2bit chain/${species}_galgal7.chain
done
TOGA was run with 500 parallel processes per species.
#bash
files=( $(ls chrom_length_bed/*.bed) )
for i in ${files[@]};
do
fbname=$(basename $i _chromlengths.bed)
species=$(echo $fbname | awk -F_ '{print $1"_"$2}')
~/TOGA/toga.py chain/${species}_galgal7.chain \
galgal7b_new.bed \
Gallus_gallus_GCF016699485.2bit \
2bit/$fbname.2bit \
--kt \
--pn toga_run/$fbname \
-i galgal7b_isoform.tsv \
--nc ~/TOGA/nextflow_config_files/ \
--cb 10,100 \
--cjn 500
done
First, we gathered all output files. TOGA failed for two species, Eudyptes filholi and Eudyptes moseleyi, which were subsequently omitted from the selection analysis.
#bash
#The main toga output is in the orthology_classification.tsv file
files=( $(ls chrom_length_bed/*.bed) )
for i in ${files[@]};
do
fbname=$(basename $i _chromlengths.bed)
cp toga_run/${fbname}/orthology_classification.tsv orthology_files/${fbname}.tsv
done
Next we compiled a dataframe identifying only single copy orthologs.

* We included all genes labeled one2one with galgal7b
* We also included all genes labeled one2zero
* For genes labeled one2many we dropped the species with the multiple hits and only kept single copy genes
* We kept only one transcript (largest) per gene
* We dropped any gene with less than 15 species
#R
chicken <- read.table("galgal7b_isoform.tsv",
sep = "\t", header = F, col.names = c("gene", "transcript"))
chicken_bed <- read.table("galgal7b.bed", sep = "\t", header = F)
chicken_bed$length <- unlist(lapply(lapply(strsplit(chicken_bed$V11,split = ","), as.integer), sum))
chicken <- merge(chicken, chicken_bed[,c("V4", "length")], by.x="transcript", by.y="V4", all.x = T, all.y = F)
species <- read.table("species_list.txt", sep = "\t", header = F)
for (specie in species$V1){
print(specie)
try({
transcripts <- read.table(paste("orthology_files/", specie, ".tsv", sep = ""), sep = "\t", header = T)
#keep only one to one
transcripts <- subset(transcripts, orthology_class %in% c("one2one","one2zero", "one2many"))
transcripts <- transcripts[!duplicated(transcripts[,1:2]),]
transcripts$q_gene[transcripts$orthology_class %in% "one2many"] <- "None"
transcripts$q_transcript[transcripts$orthology_class %in% "one2many"] <- "None"
colnames(transcripts)[3:4] <- paste(colnames(transcripts)[3:4], specie, sep = ".")
chicken <- merge(chicken, transcripts[,2:4], by.x = "transcript", by.y="t_transcript", all.x = F, all.y = F)
})
}
chicken <- chicken[order(chicken$gene, -chicken$length), ]
chicken <- chicken[!duplicated(chicken$gene),]
chicken$none_count <- rowSums(chicken == "None")
#We filtered genes with less than 15 species represented
chicken2 <- subset(chicken, none_count < 30)
chicken2 <- chicken2[!duplicated(chicken2$gene),]
write.table(chicken2[,c(1:153)], "one2one_orthos.txt", sep = "\t", row.names = F, col.names = T, quote = F)
We ended up with 10435 genes.
#!/usr/bin/env python
from Bio import SeqIO
import pandas as pd
import os
class orthologues:
def __init__(self):
self.seqs = []
def main():
one2one_df = pd.read_table("one2one_orthos.txt")
species = [one2one_df.columns[x].split(".")[1] for x in range(3,len(one2one_df.columns),2)]
genes = {}
#Create list for each gene as we parse across species
for row in range(len(one2one_df)):
genes[one2one_df.loc[row][1]+"_"+one2one_df.loc[row][0]] = orthologues()
for specie in species:
print(specie)
for record in SeqIO.parse("toga_run/"+specie+"/nucleotide.fasta", "fasta"):
if ("ref" not in record.id):
try:
geneidx = one2one_df[one2one_df["q_transcript."+specie]==record.id].index.values.astype(int)[0]
geneid = one2one_df.loc[geneidx][1]+"_"+one2one_df.loc[geneidx][0]
record.id = specie
genes[geneid].seqs.append(record)
except:
pass
# Finally work on Gallus gallus
print("Gallus_gallus")
for record in SeqIO.parse("toga_run/"+species[0]+"/nucleotide.fasta", "fasta"):
if ("ref" in record.id):
try:
geneidx = one2one_df[one2one_df["transcript"]==record.id[4:]].index.values.astype(int)[0]
geneid = one2one_df.loc[geneidx][1]+"_"+one2one_df.loc[geneidx][0]
record.id = "Gallus_gallus"
genes[geneid].seqs.append(record)
except:
pass
for gene in genes.keys():
SeqIO.write(genes[gene].seqs, "prot_align/"+gene+".fasta", "fasta")
if __name__=="__main__":
main()
We used MACSE v2 (Ranwez et al. 2018) to align genes.
files=( $(ls *.fasta) )
#Submits array jobs each with batches of 100 genes
for ((i=${SLURM_ARRAY_TASK_ID}*100; i<=(${SLURM_ARRAY_TASK_ID}*100)+99; i++)); do
echo ${files[${i}]}
macse -prog alignSequences -seq ${files[${i}]} -out_NT ../prot_align_macse/${files[${i}]} -out_AA ../prot_align_aa/${files[${i}]}
sed -i 's/\(>[A-Za-z]*_[A-Za-z]*\).*/\1/g' ../prot_align_macse/${files[${i}]}
done
We used the filtering code (see here) to remove misaligned and problematic codons from the alignments. We ended up with 9854 genes.
#bash
python aln_filter.py -i prot_align_macse/ -n 10 -o prot_align_macse_filt --overwrite
We used HMMCleaner (Franco et al. 2019) to mask codons that are potentially the result of assembly errors.
#bash
files=( $(ls ../prot_align_macse_filt-spec10-seq20-site50/cds/*.fasta) )
for ((i=${SLURM_ARRAY_TASK_ID}*100; i<=(${SLURM_ARRAY_TASK_ID}*100)+99; i++)); do
echo ${files[${i}]}
fbname=$(basename "$1" .fasta)
HmmCleaner.pl -costs -0.25 -0.20 0.15 0.45 ${files[${i}]}
mv ../prot_align_macse_filt-spec10-seq20-site50/cds/${fbname}_hmm.fasta ./
done
We ran aBSREL (Smith et al. 2015) for each gene alignment using a synonymous rate variation model. A custom R script was used to prune the input tree to match the taxa present in each alignment.
files=( $(ls ../prot_align_hmm/*.fasta) )
for ((i=${SLURM_ARRAY_TASK_ID}*10; i<=(${SLURM_ARRAY_TASK_ID}*10)+9; i++)); do
echo ${files[${i}]}
fbname=$(basename ${files[$i]} .fasta)
Rscript tree_format.Rscript macro_tree_node.txt prot_align_hmm/${fbname}.fasta trees/$fbname
hyphy absrel --alignment prot_align_hmm/${fbname}.fasta --tree trees/${fbname}.txt \
--output output_hmm/${fbname}_SRV.JSON --srv Yes
done
#R script to subset taxa in tree to match fasta file
args <- commandArgs(trailingOnly = TRUE)
library(ape)
tree <- read.tree(args[1])
fasta <- read.FASTA(args[2])
tree2 <- keep.tip(tree, intersect(names(fasta), tree$tip.label))
write.tree(tree2, paste(args[3], ".txt", sep=""))
Parse out aBSREL results and find branches selected among the four focal groups.
#!/usr/bin/env python
import json
import argparse
import pandas as pd
from functools import reduce
def count_targets(species):
bulbul = set(["Pycnonotus_jocosus", "Brachypodius_atriceps", "Anc33"])
penguin = set(["Aptenodytes_patagonicus", "Aptenodytes_forsteri", "Pygoscelis_adeliae",
"Pygoscelis_papua", "Pygoscelis_antarcticus", "Megadyptes_antipodes",
"Eudyptula_novaehollandiae", "Eudyptula_albosignata", "Eudyptula_minor",
"Spheniscus_demersus", "Spheniscus_humboldti", "Spheniscus_magellanicus",
"Eudyptes_pachyrhynchus", "Eudyptes_sclateri", "Eudyptes_chrysolophus",
"Eudyptes_schlegeli", "Eudyptes_chrysocome", "Eudyptes_filholi",
"Eudyptes_moseleyi", "Anc56", "Anc58", "Anc60", "Anc63", "Anc65",
"Anc68", "Anc69", "Anc48", "Anc59", "Anc62", "Anc64", "Anc67",
"Anc57", "Anc61", "Anc66", "Anc55", "Anc54", "Anc53", "Anc52"])
kingfisher = set(["Chloroceryle_aenea", "Actenoides_hombroni", "Halcyon_senegalensis",
"Anc46", "Anc45"])
swallow = set(["Hirundo_rustica", "Progne_subis", "Riparia_riparia",
"Anc34", "Anc35"])
input_list = set(species)
return(len(input_list & bulbul), len(input_list & penguin), len(input_list & kingfisher), len(input_list & swallow))
def absrel_out(args):
genes = pd.read_table(args.gene_list, header=None, names=["gene_name"])
genes[["AIC_SRV", "nsel_SRV", "bulbul", "penguin", "kingfisher", "swallow", "targets", "taxa"]] = ''
    for idx in range(len(genes)):
        jfile = json.load(open(args.input_folder + genes.loc[idx, "gene_name"] + "_SRV" + ".JSON", 'r'))
        genes.loc[idx, "AIC_SRV"] = round(jfile["fits"]["Full adaptive model"]["AIC-c"], 1)
        genes.loc[idx, "nsel_SRV"] = jfile["test results"]["positive test results"]
        if jfile["test results"]["positive test results"] > 0:
            selected = set()
            #branches with a corrected p-value < 0.05 are counted as positively selected
            for k in jfile["branch attributes"]["0"].keys():
                if jfile["branch attributes"]["0"][k]["Corrected P-value"] < 0.05:
                    selected.add(k)
            genes.loc[idx, "taxa"] = reduce(lambda x, y: str(x) + "," + str(y), selected)
            genes.loc[idx, "bulbul"], genes.loc[idx, "penguin"], genes.loc[idx, "kingfisher"], genes.loc[idx, "swallow"] = count_targets(selected)
            genes.loc[idx, "targets"] = genes.loc[idx, "bulbul"] + genes.loc[idx, "penguin"] + genes.loc[idx, "kingfisher"] + genes.loc[idx, "swallow"]
return genes
def main():
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_folder", type=str, help="Input folder with JSON outputs")
parser.add_argument("-g", "--gene_list", type=str, help="Names of JSON files")
parser.add_argument("-o", "--output", type=str, help="Output file name")
args = parser.parse_args()
output = absrel_out(args)
output.to_csv(args.output+".txt", sep="\t", index=False)
if __name__=="__main__":
main()
To analyze the summarized data from above:
#R
library(stringr)
#Selection results
sel_out <- read.table("output_sele_hmm.txt", header = T, sep = "\t")
sel_out$gene <- str_split_fixed(sel_out$gene_name, "_", 3)[,1]
#Save list of all genes used in selection analysis
#write.table(sel_out$gene, "sel_all.txt", quote = F, sep = "", row.names = F, col.names = F)
write.table(sel_out$gene[sel_out$bulbul > 0], "sel_bul.txt", quote = F, sep = "", row.names = F, col.names = F)
write.table(sel_out$gene[sel_out$kingfisher > 0], "sel_king.txt", quote = F, sep = "", row.names = F, col.names = F)
write.table(sel_out$gene[sel_out$swallow > 0], "sel_swa.txt", quote = F, sep = "", row.names = F, col.names = F)
write.table(sel_out$gene[sel_out$penguin > 0], "sel_pen.txt", quote = F, sep = "", row.names = F, col.names = F)
sel_out <- subset(sel_out, nsel_SRV > 0)
sel_out$non_target <- sel_out$nsel_SRV - sel_out$targets
sel_out$frac_target <- sel_out$targets/sel_out$nsel_SRV
all_taxa <- unlist(strsplit(sel_out$taxa,","))
plot(sort(table(all_taxa)), main="Number of genes showing positive selection", xlab="Species", ylab="Number of genes")
Since Malurus cyaneus had far more positively selected genes than any other species, we removed it from further selection analyses.
sel_out$nsel_SRV_corr <- sel_out$nsel_SRV
#Malurus cyaneus is an outlier
for (x in seq(1, nrow(sel_out))){
if ("Malurus_cyaneus" %in% unlist(strsplit(sel_out[x,"taxa"], ","))){
sel_out[x, "nsel_SRV_corr"] <- sel_out[x, "nsel_SRV_corr"] - 1
}
}
sel_out$frac_target_corr <- sel_out$targets/sel_out$nsel_SRV
sel_out_targets <- subset(sel_out, frac_target_corr > 0.5)
getphyper <- function(overlap, list1, list2, pop){
test_val <- phyper(overlap-1,list1,pop-list1,list2)
return(test_val)
}
PopSize=nrow(sel_out_targets)
listB=length(which(sel_out_targets$bulbul>0))
listP=length(which(sel_out_targets$penguin>0))
listS=length(which(sel_out_targets$swallow>0))
listK=length(which(sel_out_targets$kingfisher>0))
#calculate whether any of these pairwise comparisons are significant using a hypergeometric test
pvals = c()
#BP
pvals = c(pvals, getphyper(length(which(sel_out_targets$bulbul > 0 & sel_out_targets$penguin > 0)), listB, listP, PopSize))
#BS
pvals = c(pvals, getphyper(length(which(sel_out_targets$bulbul > 0 & sel_out_targets$swallow > 0)), listB, listS, PopSize))
#BK
pvals = c(pvals, getphyper(length(which(sel_out_targets$bulbul > 0 & sel_out_targets$kingfisher > 0)), listB, listK, PopSize))
#SK
pvals = c(pvals, getphyper(length(which(sel_out_targets$swallow > 0 & sel_out_targets$kingfisher > 0)), listS, listK, PopSize))
#PK
pvals = c(pvals, getphyper(length(which(sel_out_targets$penguin > 0 & sel_out_targets$kingfisher > 0)), listP, listK, PopSize))
#PS
pvals = c(pvals, getphyper(length(which(sel_out_targets$penguin > 0 & sel_out_targets$swallow > 0)), listP, listS, PopSize))
p.adjust(pvals, method = "bonferroni")
gene_taxa <- read.table("gene_taxa.txt", sep = "\t", header = T)
gene_taxa_cat1 <- read.table("gene_taxa_cat1.txt", sep = "\t", header = T)
overlap <- intersect(gene_taxa$gene, sel_out_targets$gene)
overlap_cat1 <- intersect(gene_taxa_cat1$gene, sel_out_targets$gene)
#Save list of genes selected only in target species
write.table(sel_out_targets$gene, "sel_all_target.txt", quote = F, sep = "", eol = ",", row.names = F, col.names = F)
#outgroups
sel_out_og <- read.table("output_sele_hmm_outgroup.txt", header = T, sep = "\t")
sel_out_og$gene <- str_split_fixed(sel_out_og$gene_name, "_", 3)[,1]
sel_out_og <- subset(sel_out_og, nsel_SRV > 0)
sel_out_og$non_target <- sel_out_og$nsel_SRV - sel_out_og$targets
sel_out_og$frac_target <- sel_out_og$targets/sel_out_og$nsel_SRV
sel_out_og$nsel_SRV_corr <- sel_out_og$nsel_SRV
for (x in seq(1, nrow(sel_out_og))){
if ("Malurus_cyaneus" %in% unlist(strsplit(sel_out_og[x,"taxa"], ","))){
sel_out_og[x, "nsel_SRV_corr"] <- sel_out_og[x, "nsel_SRV_corr"] - 1
}
}
sel_out_og$frac_target_corr <- sel_out_og$targets/sel_out_og$nsel_SRV
sel_out_targets_og <- subset(sel_out_og, frac_target_corr > 0.5)
conv_gene_4 <- fisher.test(matrix(c(length(which(sel_out_targets$bulbul > 0 &
sel_out_targets$kingfisher > 0 &
sel_out_targets$swallow > 0 &
sel_out_targets$penguin > 0)),
length(which(sel_out_targets_og$abulbul > 0 &
sel_out_targets_og$akingfisher > 0 &
sel_out_targets_og$aswallow > 0 &
sel_out_targets_og$apenguin > 0)),
nrow(sel_out_targets)-length(which(sel_out_targets$bulbul > 0 &
sel_out_targets$kingfisher > 0 &
sel_out_targets$swallow > 0 &
sel_out_targets$penguin > 0)),
nrow(sel_out_targets_og)-length(which(sel_out_targets_og$abulbul > 0 &
sel_out_targets_og$akingfisher > 0 &
sel_out_targets_og$aswallow > 0 &
sel_out_targets_og$apenguin > 0))),nrow = 2),
alternative = "greater")
conv_gene_3_bsk <- fisher.test(matrix(c(length(which(sel_out_targets$bulbul > 0 &
sel_out_targets$kingfisher > 0 &
sel_out_targets$swallow > 0)),
length(which(sel_out_targets_og$abulbul > 0 &
sel_out_targets_og$akingfisher > 0 &
sel_out_targets_og$aswallow > 0)),
nrow(sel_out_targets)-length(which(sel_out_targets$bulbul > 0 &
sel_out_targets$kingfisher > 0 &
sel_out_targets$swallow > 0)),
nrow(sel_out_targets_og)-length(which(sel_out_targets_og$abulbul > 0 &
sel_out_targets_og$akingfisher > 0 &
sel_out_targets_og$aswallow > 0))),nrow = 2),
alternative = "greater")
conv_gene_3_bsp <- fisher.test(matrix(c(length(which(sel_out_targets$bulbul > 0 &
sel_out_targets$swallow > 0 &
sel_out_targets$penguin > 0)),
length(which(sel_out_targets_og$abulbul > 0 &
sel_out_targets_og$aswallow > 0 &
sel_out_targets_og$apenguin > 0)),
nrow(sel_out_targets)-length(which(sel_out_targets$bulbul > 0 &
sel_out_targets$swallow > 0 &
sel_out_targets$penguin > 0)),
nrow(sel_out_targets_og)-length(which(sel_out_targets_og$abulbul > 0 &
sel_out_targets_og$aswallow > 0 &
sel_out_targets_og$apenguin > 0))),nrow = 2),
alternative = "greater")
conv_gene_3_bpk <- fisher.test(matrix(c(length(which(sel_out_targets$bulbul > 0 &
sel_out_targets$kingfisher > 0 &
sel_out_targets$penguin > 0)),
length(which(sel_out_targets_og$abulbul > 0 &
sel_out_targets_og$akingfisher > 0 &
sel_out_targets_og$apenguin > 0)),
nrow(sel_out_targets)-length(which(sel_out_targets$bulbul > 0 &
sel_out_targets$kingfisher > 0 &
sel_out_targets$penguin > 0)),
nrow(sel_out_targets_og)-length(which(sel_out_targets_og$abulbul > 0 &
sel_out_targets_og$akingfisher > 0 &
sel_out_targets_og$apenguin > 0))),nrow = 2),
alternative = "greater")
conv_gene_3_psk <- fisher.test(matrix(c(length(which(sel_out_targets$penguin > 0 &
sel_out_targets$kingfisher > 0 &
sel_out_targets$swallow > 0)),
length(which(sel_out_targets_og$apenguin > 0 &
sel_out_targets_og$akingfisher > 0 &
sel_out_targets_og$aswallow > 0)),
nrow(sel_out_targets)-length(which(sel_out_targets$penguin > 0 &
sel_out_targets$kingfisher > 0 &
sel_out_targets$swallow > 0)),
nrow(sel_out_targets_og)-length(which(sel_out_targets_og$apenguin > 0 &
sel_out_targets_og$akingfisher > 0 &
sel_out_targets_og$aswallow > 0))),nrow = 2),
alternative = "greater")
conv_gene_2_bk <- fisher.test(matrix(c(length(which(sel_out_targets$bulbul > 0 &
sel_out_targets$kingfisher > 0)),
length(which(sel_out_targets_og$abulbul > 0 &
sel_out_targets_og$akingfisher > 0)),
nrow(sel_out_targets)-length(which(sel_out_targets$bulbul > 0 &
sel_out_targets$kingfisher > 0)),
nrow(sel_out_targets_og)-length(which(sel_out_targets_og$abulbul > 0 &
sel_out_targets_og$akingfisher > 0))),nrow = 2),
alternative = "greater")
conv_gene_2_bs <- fisher.test(matrix(c(length(which(sel_out_targets$bulbul > 0 &
sel_out_targets$swallow > 0)),
length(which(sel_out_targets_og$abulbul > 0 &
sel_out_targets_og$aswallow > 0)),
nrow(sel_out_targets)-length(which(sel_out_targets$bulbul > 0 &
sel_out_targets$swallow > 0)),
nrow(sel_out_targets_og)-length(which(sel_out_targets_og$abulbul > 0 &
sel_out_targets_og$aswallow > 0))),nrow = 2),
alternative = "greater")
conv_gene_2_bp <- fisher.test(matrix(c(length(which(sel_out_targets$bulbul > 0 &
sel_out_targets$penguin > 0)),
length(which(sel_out_targets_og$abulbul > 0 &
sel_out_targets_og$apenguin > 0)),
nrow(sel_out_targets)-length(which(sel_out_targets$bulbul > 0 &
sel_out_targets$penguin > 0)),
                                       nrow(sel_out_targets_og)-length(which(sel_out_targets_og$abulbul > 0 &
                                                                               sel_out_targets_og$apenguin > 0))),nrow = 2),
alternative = "greater")
conv_gene_2_ks <- fisher.test(matrix(c(length(which(sel_out_targets$swallow > 0 &
sel_out_targets$kingfisher > 0)),
length(which(sel_out_targets_og$aswallow > 0 &
sel_out_targets_og$akingfisher > 0)),
nrow(sel_out_targets)-length(which(sel_out_targets$swallow > 0 &
sel_out_targets$kingfisher > 0)),
nrow(sel_out_targets_og)-length(which(sel_out_targets_og$aswallow > 0 &
sel_out_targets_og$akingfisher > 0))),nrow = 2),
alternative = "greater")
conv_gene_2_kp <- fisher.test(matrix(c(length(which(sel_out_targets$penguin > 0 &
sel_out_targets$kingfisher > 0)),
length(which(sel_out_targets_og$apenguin > 0 &
sel_out_targets_og$akingfisher > 0)),
nrow(sel_out_targets)-length(which(sel_out_targets$penguin > 0 &
sel_out_targets$kingfisher > 0)),
nrow(sel_out_targets_og)-length(which(sel_out_targets_og$apenguin > 0 &
sel_out_targets_og$akingfisher > 0))),nrow = 2),
alternative = "greater")
conv_gene_2_sp <- fisher.test(matrix(c(length(which(sel_out_targets$swallow > 0 &
sel_out_targets$penguin > 0)),
length(which(sel_out_targets_og$aswallow > 0 &
sel_out_targets_og$apenguin > 0)),
nrow(sel_out_targets)-length(which(sel_out_targets$swallow > 0 &
sel_out_targets$penguin > 0)),
nrow(sel_out_targets_og)-length(which(sel_out_targets_og$aswallow > 0 &
sel_out_targets_og$apenguin > 0))),nrow = 2),
alternative = "greater")
For GO enrichment analysis we used PANTHER.
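A minimal sketch of one such enrichment call is shown below (assuming the rbioapi package, the per-lineage gene lists written above, and the aBSREL-tested genes as the background; the object and output file names are illustrative). The same rba_panther_enrich() call is used for the MK results later in this document.

#R
library(rbioapi)
#genes with positively selected bulbul branches (written above as sel_bul.txt)
bulbul_sel <- readLines("sel_bul.txt")
#background: genes tested with aBSREL (e.g. the sel_out$gene vector from the block above)
background_genes <- unique(sel_out$gene)
#GO biological process enrichment against the chicken (taxon 9031) annotation
bulbul_go <- rba_panther_enrich(genes = unique(bulbul_sel),
                                organism = 9031,
                                annot_dataset = "GO:0008150",
                                ref_genes = background_genes,
                                ref_organism = 9031)
write.table(bulbul_go$result, "bulbul_absrel_enrichment.txt",
            quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)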
For the MK test we used population resequencing data for a bulbul species and a swallow species.
Population resequencing data for bulbuls come from Shakya et al. (2021) for Brachypodius melanocephalos. We included only one individual from Maratua Island (a small island off the coast of Borneo).
| BioSample | Run | BioProject | lat | long |
|---|---|---|---|---|
| LSUMZ_198531_Mentawai | SRR14761645 | PRJNA736057 | -1.6040 | 99.1590 |
| KU_98891_Borneo | SRR14761646 | PRJNA736057 | 9.8372 | 118.6412 |
| LSUMZ_198032_Borneo | SRR14761648 | PRJNA736057 | 2.1080 | 117.4020 |
| LSUMZ_191390_Borneo | SRR14761649 | PRJNA736057 | 1.1150 | 110.2170 |
| LSUMZ_216191_Borneo | SRR14761651 | PRJNA736057 | 5.8455 | 117.1825 |
| LSUMZ_176475_Borneo | SRR14761652 | PRJNA736057 | 2.9416 | 113.0333 |
| LSUMZ_198033_Bawean | SRR14761656 | PRJNA736057 | -5.8100 | 112.6140 |
| LSUMZ_198030_Maratua | SRR14761657 | PRJNA736057 | 2.2650 | 118.5630 |
| LSUMZ_179296_Borneo | SRR14761663 | PRJNA736057 | 5.3261 | 115.6736 |
| LSUMZ_215800_Borneo | SRR14761664 | PRJNA736057 | 5.3997 | 116.1022 |
| LSUMZ_198530_Sumatra | SRR14761647 | PRJNA736057 | 0.1000 | 99.9370 |
| LSUMZ_186231_Borneo | SRR14761650 | PRJNA736057 | 1.8011 | 109.7122 |
Population resequencing data for swallows come from Safran et al. (2016) for Hirundo rustica.
| BioSample | LibraryName | Run | BioProject |
|---|---|---|---|
| SAMN18196679 | 1040_WGS | SRR14127698 | PRJNA323498 |
| SAMN18196678 | 1036_WGS | SRR14127699 | PRJNA323498 |
| SAMN18196677 | 1021_WGS | SRR14127700 | PRJNA323498 |
| SAMN18196712 | 1763_WGS | SRR14127701 | PRJNA323498 |
| SAMN18196676 | 1019_WGS | SRR14127702 | PRJNA323498 |
| SAMN18196711 | 1760_WGS | SRR14127703 | PRJNA323498 |
| SAMN18196709 | 1747_WGS | SRR14127705 | PRJNA323498 |
| SAMN18196708 | 1744_WGS | SRR14127706 | PRJNA323498 |
| SAMN18196706 | 1723_WGS | SRR14127708 | PRJNA323498 |
| SAMN18196704 | 1838_WGS | SRR14127710 | PRJNA323498 |
| SAMN18196675 | 1015_WGS | SRR14127713 | PRJNA323498 |
| SAMN18196701 | 1798_WGS | SRR14127714 | PRJNA323498 |
| SAMN18196700 | 1791_WGS | SRR14127715 | PRJNA323498 |
| SAMN18196699 | 1777_WGS | SRR14127716 | PRJNA323498 |
| SAMN18196698 | 1311_WGS | SRR14127717 | PRJNA323498 |
| SAMN18196696 | 1303_WGS | SRR14127719 | PRJNA323498 |
| SAMN18196695 | 1289_WGS | SRR14127720 | PRJNA323498 |
| SAMN18196692 | 1223_WGS | SRR14127723 | PRJNA323498 |
| SAMN18196690 | 1177_WGS | SRR14127726 | PRJNA323498 |
| SAMN18196689 | 1171_WGS | SRR14127727 | PRJNA323498 |
| SAMN18196687 | 1158_WGS | SRR14127729 | PRJNA323498 |
| SAMN18196686 | 1107_WGS | SRR14127730 | PRJNA323498 |
| SAMN18196684 | 1091_WGS | SRR14127732 | PRJNA323498 |
| SAMN18196683 | 1075_WGS | SRR14127733 | PRJNA323498 |
| SAMN18196682 | 1058_WGS | SRR14127734 | PRJNA323498 |
| SAMN18196673 | 479_WGS | SRR14127735 | PRJNA323498 |
| SAMN18196672 | 464_WGS | SRR14127736 | PRJNA323498 |
| SAMN18196722 | 2057_WGS | SRR14127737 | PRJNA323498 |
| SAMN18196721 | 2049_WGS | SRR14127738 | PRJNA323498 |
| SAMN18196718 | 2028_WGS | SRR14127741 | PRJNA323498 |
| SAMN18196753 | 1473_WGS | SRR14127742 | PRJNA323498 |
| SAMN18196717 | 2025_WGS | SRR14127743 | PRJNA323498 |
| SAMN18196752 | 1240_WGS | SRR14127744 | PRJNA323498 |
| SAMN18196748 | 100_WGS | SRR14127748 | PRJNA323498 |
| SAMN18196746 | 43_WGS | SRR14127750 | PRJNA323498 |
| SAMN18196745 | 27_WGS | SRR14127751 | PRJNA323498 |
| SAMN18196744 | 15_WGS | SRR14127752 | PRJNA323498 |
| SAMN18196742 | 278_WGS | SRR14127755 | PRJNA323498 |
| SAMN18196740 | 271_WGS | SRR14127757 | PRJNA323498 |
| SAMN18196739 | 265_WGS | SRR14127758 | PRJNA323498 |
| SAMN18196738 | 251_WGS | SRR14127759 | PRJNA323498 |
| SAMN18196734 | 208_WGS | SRR14127763 | PRJNA323498 |
| SAMN18196733 | 199_WGS | SRR14127764 | PRJNA323498 |
| SAMN18196715 | 2005_WGS | SRR14127765 | PRJNA323498 |
| SAMN18196731 | 187_WGS | SRR14127767 | PRJNA323498 |
| SAMN18196728 | 170_WGS | SRR14127770 | PRJNA323498 |
| SAMN18196727 | 151_WGS | SRR14127771 | PRJNA323498 |
| SAMN18196726 | 2071_WGS | SRR14127772 | PRJNA323498 |
| SAMN18196725 | 2067_WGS | SRR14127773 | PRJNA323498 |
| SAMN18196724 | 2065_WGS | SRR14127774 | PRJNA323498 |
| SAMN18196723 | 2061_WGS | SRR14127775 | PRJNA323498 |
| SAMN18196713 | 1661_WGS | SRR14127777 | PRJNA323498 |
| SAMN18196803 | x242035_WGS | SRR14129407 | PRJNA323498 |
| SAMN18196802 | x242025_WGS | SRR14129408 | PRJNA323498 |
| SAMN18196801 | x242024_WGS | SRR14129409 | PRJNA323498 |
| SAMN18196799 | x242007_WGS | SRR14129411 | PRJNA323498 |
| SAMN18196814 | 327_WGS | SRR14129415 | PRJNA323498 |
| SAMN18196813 | 326_WGS | SRR14129416 | PRJNA323498 |
| SAMN18196812 | 324_WGS | SRR14129417 | PRJNA323498 |
| SAMN18196810 | 318_WGS | SRR14129419 | PRJNA323498 |
| SAMN18196809 | 316_WGS | SRR14129420 | PRJNA323498 |
| SAMN18196806 | 303_WGS | SRR14129423 | PRJNA323498 |
| SAMN18196796 | 1512_WGS | SRR14129425 | PRJNA323498 |
| SAMN18196795 | 1471_WGS | SRR14129426 | PRJNA323498 |
| SAMN18196763 | 1921_WGS | SRR14131088 | PRJNA323498 |
| SAMN18196760 | 1646_WGS | SRR14131091 | PRJNA323498 |
| SAMN18196759 | 1550_WGS | SRR14131092 | PRJNA323498 |
| SAMN18196794 | 1470_WGS | SRR14131093 | PRJNA323498 |
| SAMN18196793 | 1457_WGS | SRR14131095 | PRJNA323498 |
| SAMN18196792 | 1433_WGS | SRR14131096 | PRJNA323498 |
| SAMN18196791 | 1425_WGS | SRR14131097 | PRJNA323498 |
| SAMN18196790 | 1424_WGS | SRR14131098 | PRJNA323498 |
| SAMN18196789 | 1411_WGS | SRR14131099 | PRJNA323498 |
| SAMN18196788 | 1410_WGS | SRR14131100 | PRJNA323498 |
| SAMN18196787 | 1392_WGS | SRR14131101 | PRJNA323498 |
| SAMN18196786 | 1379_WGS | SRR14131102 | PRJNA323498 |
| SAMN18196785 | 1362_WGS | SRR14131103 | PRJNA323498 |
| SAMN18196784 | 1361_WGS | SRR14131104 | PRJNA323498 |
| SAMN18196757 | 1487_WGS | SRR14131105 | PRJNA323498 |
| SAMN18196783 | 1350_WGS | SRR14131106 | PRJNA323498 |
| SAMN18196782 | 1343_WGS | SRR14131107 | PRJNA323498 |
| SAMN18196781 | 1341_WGS | SRR14131108 | PRJNA323498 |
| SAMN18196780 | 1332_WGS | SRR14131109 | PRJNA323498 |
| SAMN18196779 | 1331_WGS | SRR14131110 | PRJNA323498 |
| SAMN18196778 | 1330_WGS | SRR14131111 | PRJNA323498 |
| SAMN18196777 | 1324_WGS | SRR14131112 | PRJNA323498 |
| SAMN18196776 | 1319_WGS | SRR14131113 | PRJNA323498 |
| SAMN18196775 | 450_WGS | SRR14131114 | PRJNA323498 |
| SAMN18196774 | 432_WGS | SRR14131115 | PRJNA323498 |
| SAMN18196756 | 1485_WGS | SRR14131116 | PRJNA323498 |
| SAMN18196773 | 382_WGS | SRR14131117 | PRJNA323498 |
| SAMN18196772 | 379_WGS | SRR14131118 | PRJNA323498 |
| SAMN18196771 | 364_WGS | SRR14131119 | PRJNA323498 |
| SAMN18196770 | 361_WGS | SRR14131120 | PRJNA323498 |
| SAMN18196769 | 358_WGS | SRR14131121 | PRJNA323498 |
| SAMN18196768 | 347_WGS | SRR14131122 | PRJNA323498 |
| SAMN18196766 | 1992_WGS | SRR14131124 | PRJNA323498 |
| SAMN18196765 | 1971_WGS | SRR14131125 | PRJNA323498 |
| SAMN18196764 | 1966_WGS | SRR14131126 | PRJNA323498 |
| SAMN18196755 | 1483_WGS | SRR14131127 | PRJNA323498 |
| SAMN18196754 | 1475_WGS | SRR14131128 | PRJNA323498 |
| SAMN18196804 | x242036_WGS | SRR14129406 | PRJNA323498 |
| SAMN18196800 | x242008_WGS | SRR14129410 | PRJNA323498 |
| SAMN18196798 | x242006_WGS | SRR14129412 | PRJNA323498 |
| SAMN18196797 | x242002_WGS | SRR14129414 | PRJNA323498 |
| SAMN18196811 | 319_WGS | SRR14129418 | PRJNA323498 |
| SAMN18196808 | 309_WGS | SRR14129421 | PRJNA323498 |
| SAMN18196807 | 307_WGS | SRR14129422 | PRJNA323498 |
| SAMN18196805 | 302_WGS | SRR14129424 | PRJNA323498 |
| SAMN18196762 | 1905_WGS | SRR14131089 | PRJNA323498 |
| SAMN18196761 | 1648_WGS | SRR14131090 | PRJNA323498 |
| SAMN18196758 | 1534_WGS | SRR14131094 | PRJNA323498 |
| SAMN18196767 | 1995_WGS | SRR14131123 | PRJNA323498 |
| SAMN18196681 | 1051_WGS | SRR14127696 | PRJNA323498 |
| SAMN18196680 | 1041_WGS | SRR14127697 | PRJNA323498 |
| SAMN18196710 | 1758_WGS | SRR14127704 | PRJNA323498 |
| SAMN18196707 | 1736_WGS | SRR14127707 | PRJNA323498 |
| SAMN18196705 | 1848_WGS | SRR14127709 | PRJNA323498 |
| SAMN18196703 | 1822_WGS | SRR14127711 | PRJNA323498 |
| SAMN18196702 | 1808_WGS | SRR14127712 | PRJNA323498 |
| SAMN18196697 | 1307_WGS | SRR14127718 | PRJNA323498 |
| SAMN18196694 | 1279_WGS | SRR14127721 | PRJNA323498 |
| SAMN18196693 | 1224_WGS | SRR14127722 | PRJNA323498 |
| SAMN18196674 | 1003_WGS | SRR14127724 | PRJNA323498 |
| SAMN18196691 | 1198_WGS | SRR14127725 | PRJNA323498 |
| SAMN18196688 | 1162_WGS | SRR14127728 | PRJNA323498 |
| SAMN18196685 | 1095_WGS | SRR14127731 | PRJNA323498 |
| SAMN18196720 | 2036_WGS | SRR14127739 | PRJNA323498 |
| SAMN18196719 | 2035_WGS | SRR14127740 | PRJNA323498 |
| SAMN18196751 | 131_WGS | SRR14127745 | PRJNA323498 |
| SAMN18196750 | 130_WGS | SRR14127746 | PRJNA323498 |
| SAMN18196749 | 113_WGS | SRR14127747 | PRJNA323498 |
| SAMN18196747 | 48_WGS | SRR14127749 | PRJNA323498 |
| SAMN18196743 | 3_WGS | SRR14127753 | PRJNA323498 |
| SAMN18196716 | 2013_WGS | SRR14127754 | PRJNA323498 |
| SAMN18196741 | 275_WGS | SRR14127756 | PRJNA323498 |
| SAMN18196737 | 239_WGS | SRR14127760 | PRJNA323498 |
| SAMN18196736 | 223_WGS | SRR14127761 | PRJNA323498 |
| SAMN18196735 | 211_WGS | SRR14127762 | PRJNA323498 |
| SAMN18196732 | 192_WGS | SRR14127766 | PRJNA323498 |
| SAMN18196730 | 184_WGS | SRR14127768 | PRJNA323498 |
| SAMN18196729 | 176_WGS | SRR14127769 | PRJNA323498 |
| SAMN18196714 | 1712_WGS | SRR14127776 | PRJNA323498 |
To generate VCF files for each species, we remapped the raw fastq files to the respective reference genomes and called SNPs using snpArcher. See the snpArcher GitHub repository or the paper (Mirchandani et al. 2023) for more information on snpArcher.
QC files for bulbul
QC files for swallow
We used degenotate to run an imputed MK test. First, we filtered each of the two VCF files to remove low-quality sites.
#!/bin/bash
vcftools --gzvcf B_melanocephalos_final.vcf.gz --remove-indels --max-missing 0.25 --minDP 2 --recode --recode-INFO-all --min-alleles 2 --max-alleles 2 --out B_melanocephalos_bi_filt_mac2_75p_noindel
bgzip B_melanocephalos_bi_filt_mac2_75p_noindel.recode.vcf
tabix B_melanocephalos_bi_filt_mac2_75p_noindel.recode.vcf.gz
#removed individuals with high missing data based on the QC file.
vcftools --gzvcf H_rustica_final.vcf.gz --remove-indels --max-missing 0.25 --minDP 2 --recode --recode-INFO-all --min-alleles 2 --max-alleles 2 --out H_rustica_bi_filt_mac2_75p_noindel --remove-indv SAMN18196751 --remove-indv SAMN18196742
bgzip H_rustica_bi_filt_mac2_75p_noindel.recode.vcf
tabix H_rustica_bi_filt_mac2_75p_noindel.recode.vcf.gz
To perform an imputed MK test, we need a polarized VCF. We used an outgroup species to polarize the VCFs. For the bulbul, we used Sylvia atricapilla (GCA_009819655.1) and for the swallow, we used Acrocephalus scirpaceus (GCA_910950805.1) to polarize the respective VCF files.
# Get maf file for the outgroup with ingroup species as reference
~/hal/bin/hal2maf --noAncestors --noDupes --refGenome Brachypodius_melanocephalos --targetGenomes Brachypodius_melanocephalos,Sylvia_atricapilla all_birds_new.hal B_mel-S_atr.maf
~/hal/bin/hal2maf --noAncestors --noDupes --refGenome Brachypodius_melanocephalos --targetGenomes Brachypodius_melanocephalos,Phylloscopus_trochilus all_birds_new.hal B_mel-P_tro.maf
To convert the MAF to a VCF we used the custom Python script below:
import argparse
from Bio import AlignIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.SeqFeature import SeqFeature, FeatureLocation
def main(params):
# Open MAF file for reading
maf_file = open(params.maf, "r")
# Parse MAF alignment
alignment = AlignIO.parse(maf_file, "maf")
# Open VCF file for writing
vcf_file = open(params.vcf, "w")
from_sp = params.ref_sp
to_sp = params.map_sp
# Write VCF header
vcf_file.write("##fileformat=VCFv4.2\n")
vcf_file.write(f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{to_sp}\n")
# Iterate through each alignment block in MAF file
for block in alignment:
for record in block:
# Assuming the first sequence is the reference
record_split = record.id.split('.')
record_sp = record_split[0]
if record_sp == from_sp:
ref_seq = record.seq
ref_start = record.annotations['start']
ref_chrom = '.'.join(record_split[-(len(record_split)-1):])
continue
# Compare non-reference sequences with the reference
alt_seq = record.seq
for i, (ref_base, alt_base) in enumerate(zip(ref_seq, alt_seq)):
pos = ref_start + i + 1 # 1-based position
if ref_base.upper() != alt_base.upper():
# Write VCF record for variant
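                    # the outgroup difference is recorded as a fixed homozygous (1/1) genotype for the mapped species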
vcf_file.write(f"{ref_chrom}\t{pos}\t.\t{ref_base.upper()}\t{alt_base.upper()}\t.\tPASS\t.\tGT\t1/1\n")
# Close files
maf_file.close()
vcf_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--maf", type=str, help="Maf File")
parser.add_argument("--ref_sp", type=str, help="Reference species")
parser.add_argument("--map_sp", type=str, help="Species to map to")
parser.add_argument("--vcf", type=str, help="Output Vcf filename")
options, args = parser.parse_known_args()
main(options)
sp1="B_mel"
sp2="S_atr"
## filter VCFs for degenotate
bcftools view ${sp1}-${sp2}.vcf -v snps -U -m2 -M2 -e 'F_MISSING > 0.75 | ref="N" | ALT="." | ALT="n" | ALT="-"| TYPE~"indel"' -O z -o ${sp1}-${sp2}_filt.vcf.gz
tabix ${sp1}-${sp2}_filt.vcf.gz
## make ancestral fasta from outgroup vcf
bcftools consensus ${sp1}-${sp2}_filt.vcf.gz -f Brachypodius_melanocephalos_GCA013398615.fna -o ${sp1}-${sp2}_filt.fa
samtools faidx ${sp1}-${sp2}_filt.fa
## make VCF with ONE outgroup
bcftools merge --missing-to-ref ../B_melanocephalos_bi_filt_mac2_75p_noindel.recode.vcf.gz ${sp1}-${sp2}_filt.vcf.gz -O v -o ${sp1}-${sp2}_comb.vcf
sed -i -e "s/sample/$sp2/" ${sp1}-${sp2}_comb.vcf
## infer ancestral allele
vcfdo polarize -i ${sp1}-${sp2}_comb.vcf -f ${sp1}-${sp2}_filt.fa > ${sp1}-${sp2}_pol.vcf
bgzip ${sp1}-${sp2}_pol.vcf
tabix ${sp1}-${sp2}_pol.vcf.gz
We ran the MK test with two outgroup species for each run. For bulbul, we included Phylloscopus trochilus (GCA_016584745.1) as the second outgroup and for swallow, we included Eremophila alpestris (GCA_009792885.1) as the second outgroup.
sp1="B_mel"
sp2="P_tro"
## filter VCFs for degenotate
bcftools view ${sp1}-${sp2}.vcf -v snps -U -m2 -M2 -e 'F_MISSING > 0.75 | ref="N" | ALT="." | ALT="n" | ALT="-" | TYPE~"indel"' -O z -o ${sp1}-${sp2}_filt.vcf.gz
tabix ${sp1}-${sp2}_filt.vcf.gz
## merge to polarized vcf file
bcftools merge --missing-to-ref B_mel-S_atr_pol.vcf.gz ${sp1}-${sp2}_filt.vcf.gz -O z -o B_mel_polarized.vcf.gz
tabix B_mel_polarized.vcf.gz
vcftools --gzvcf B_mel_polarized.vcf.gz --remove rem_island.txt --recode --recode-INFO-all --out B_mel_polarized_noisle
vcftools --gzvcf H_rus_polarized.vcf.gz --keep rustica_samples.txt --recode --recode-INFO-all --out H_rus_polarized_noisle
To run degenotate we also need a gtf/gff file with annotations of coding sequences in the target species. For this we used the TOGA predictions from above.
bedToGenePred Brachypodius_melanocephalos_GCA013398615/query_annotation.bed B_mel.genepred
genePredToGtf file B_mel.genepred B_mel.gtf
degenotate.py -a B_mel.gtf -g Brachypodius_melanocephalos_GCA013398615.fna -u S_atr,sample -v B_mel_polarized_noisle.vcf.gz
#R
#Identify genes under positive selection from the degenotate MK output (significant test, positive imputed DoS) and test for GO enrichment with PANTHER via rbioapi
bulbul_selected <- subset(trans_largest, transcript %in% mk_table_largest$transcript[which(mk_table_largest$sig %in% "red" & mk_table_largest$imp.dos > 0)])
library(rbioapi)
bulbul_enrich <- rba_panther_enrich(genes = unique(bulbul_selected$gene),
organism = 9031,
annot_dataset = "GO:0008150",
ref_genes = unique(mk_table_largest$gene),
ref_organism = 9031)
write.table(unique(bulbul_selected$gene), "bulbul_selected_mk.txt", quote = F, sep = "", eol = ",", row.names = F, col.names = F)
write.table(unique(trans$gene), "bulbul_all_genes_mk.txt", quote = F, sep = "", row.names = F, col.names = F)
write.table(bulbul_enrich$result, "bulbul_all_enrichment_mk.txt", quote = F, sep = "\t", row.names = F, col.names = F)
swallow_selected <- subset(trans_largest, transcript %in% mk_table_largest$transcript[which(mk_table_largest$sig %in% "red" & mk_table_largest$imp.dos > 0)])
swallow_enrich <- rba_panther_enrich(genes = unique(swallow_selected$gene),
organism = 9031,
annot_dataset = "GO:0008150",
ref_genes = unique(mk_table_largest$gene),
ref_organism = 9031)
write.table(unique(swallow_selected$gene), "swallow_selected_mk.txt", quote = F, sep = "", eol = ",", row.names = F, col.names = F)
write.table(unique(trans$gene), "swallow_all_genes_mk.txt", quote = F, sep = "", row.names = F, col.names = F)
write.table(swallow_enrich$result, "swallow_all_enrichment_mk.txt", quote = F, sep = "\t", row.names = F, col.names = F)
bs_overlap <- unique(intersect(bulbul_selected$gene, swallow_selected$gene))
phyper(length(bs_overlap)-1,1069,15259-2584,2584)
absrel_bulbul_overlap <- unique(intersect(bulbul_selected$gene, sel_out_targets$gene[which(sel_out_targets$bulbul>0)]))
absrel_swallow_overlap <- unique(intersect(swallow_selected$gene, sel_out_targets$gene[which(sel_out_targets$swallow>0)]))
We used a modified version of the MK test comparing fixed and polymorphic sites in each CNEE to synonymous sites in the gene. The same vcf files from above were used in the analysis.
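For each element this amounts to a 2x2 Fisher's exact test of element sites (fixed vs. polymorphic) against synonymous sites of the associated gene (dS vs. pS), together with a direction-of-selection (DoS) style statistic for the element. A minimal sketch of the per-element test is shown below (the helper name is hypothetical; the full implementation appears in the R code further down):

#R
#2x2 table: element sites vs. gene synonymous sites, split into fixed and polymorphic counts
cnee_mk_test <- function(fixed, poly, dS, pS){
  p <- fisher.test(matrix(c(poly, pS, fixed, dS), nrow = 2))$p.value
  #DoS analogue: excess of fixed differences in the element relative to synonymous sites
  dos <- fixed/(fixed + dS) - poly/(poly + pS)
  c(p.value = p, cnee_dos = dos)
}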
We used a custom Python script to count the fixed and polymorphic sites in all elements, using the VCF files for bulbuls and swallows, respectively.
import pandas as pd
import pysam
import argparse,sys,os
import glob
def get_fix_poly(vcf, scaffold, start, stop):
samples = list((vcf.header.samples))
fix = 0
poly = 0
for rec in vcf.fetch(scaffold, start, stop):
out = False
alt = 0
tot = 0
#Uses same filters as those implemented in degenotate for outgroups
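        #the last two samples in the merged VCF are the two outgroups; a site is treated as a candidate fixed difference only when both are homozygous for the alternate allele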
if (rec.samples[samples[-2]]['GT'] == (1,1) and rec.samples[samples[-1]]['GT'] == (1,1)):
out = True
for sample in samples[:-2]:
for haplotype in rec.samples[sample]['GT']:
if haplotype == 1:
alt += 1
tot += 1
elif haplotype == 0:
tot += 1
        #Using an allele frequency cutoff of 1/number of samples.
        #skip sites with no called ingroup genotypes to avoid division by zero
        if tot == 0:
            continue
        af = alt/tot
if ((af < (1/len(samples))) and out):
fix += 1
elif ((af < (1/len(samples))) and out is False):
pass
else:
poly += 1
return fix, poly
def parse_vcf(vcfs, bed):
vcf_reader = pysam.VariantFile(vcfs)
vcf_bed = pd.read_table(bed, header=None)
# Also get fixed and polymorphic sites upstream and downstream of each element
vcf_pd = pd.DataFrame(columns=['element', 'len', 'fixed', 'poly', 'u_fixed', 'u_poly', 'd_fixed', 'd_poly'])
for idx in range(len(vcf_bed)):
buff = 100
length = vcf_bed.loc[idx][2]-vcf_bed.loc[idx][1]
fix_main, fix_poly = get_fix_poly(vcf_reader, vcf_bed.loc[idx][0], vcf_bed.loc[idx][1], vcf_bed.loc[idx][2])
try:
u_main, u_poly = get_fix_poly(vcf_reader, vcf_bed.loc[idx][0], vcf_bed.loc[idx][1]-buff-length, vcf_bed.loc[idx][1]-buff)
except:
u_main, u_poly = "NA", "NA"
try:
d_main, d_poly = get_fix_poly(vcf_reader, vcf_bed.loc[idx][0], vcf_bed.loc[idx][2]+buff, vcf_bed.loc[idx][2]+buff+length)
except:
d_main, d_poly = "NA", "NA"
vcf_pd.loc[idx] = [vcf_bed.loc[idx][3], length, fix_main, fix_poly, u_main, u_poly, d_main, d_poly]
return(vcf_pd)
def main(options):
out_pd = parse_vcf(options.vcf, options.bed)
out_pd.to_csv(options.outname+".txt", sep='\t', index=False)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--vcf", type=str, help="VCF File")
parser.add_argument("--bed", type=str, help="BED File")
parser.add_argument("--outname", type=str, help="output file name", default = "output")
options,args = parser.parse_known_args()
main(options)
library(stringr)
library(plyr)
library(data.table)
library(dplyr)
#get fixed and polymorphic sites from code above
bulbul <- read.table("all_accl_bulbul_noisle_stats.txt", sep="\t", header = T)
#load phyloacc results
phyloacc <- fread("elem_lik_filt.txt", sep = "\t", header = T)
bulbul <- merge(bulbul, phyloacc, by.x = "element", by.y="original.id", all.x=F, all.y=F)
#get closest genes for each element (using bedtools)
bulbul_closest <- read.table("B_mel_closest3.bed", sep="\t", header = F)
#get id of genes (from TOGA)
ids <- read.table("Brachypodius_melanocephalos_GCA013398615.tsv", sep = "\t", header = T)
bulbul_closest$id <- str_extract(bulbul_closest$V13, "gene_id [A-Za-z0-9-_.]*")
bulbul_closest$id <- gsub("gene_id ", "", bulbul_closest$id)
bulbul_closest$id2 <- gsub(".[0-9]*$", "", bulbul_closest$id)
bulbul_closest$gene <- mapvalues(bulbul_closest$id2, from=ids$t_transcript, to=ids$t_gene, warn_missing = F)
bulbul <- merge(bulbul, bulbul_closest[,c("V4", "id", "gene", "V14")], by.x = "element", by.y="V4")
#get gene mk table (from degenotate)
mk_table <- read.table("bulbul_noisle_mk.tsv", sep = "\t", header = T)
bulbul <- merge(bulbul, mk_table_largest, by = "gene", all.x = T, all.y = F)
#function to do a fisher's exact test
do_fisher <- function(row){
p <- tryCatch({
fish <- fisher.test(matrix(data=c(row[1], row[2], row[3], row[4]), nrow = 2))
fish$p.value
}, error = function(e) {
NA
})
return(p)
}
bulbul$fisher <- apply(bulbul[, c("poly", "pS", "fixed", "dS")], 1, do_fisher)
bulbul$cnee_dos <- bulbul$fixed / (bulbul$fixed + bulbul$dS) - bulbul$poly / (bulbul$poly + bulbul$pS)
library(parallel)
cl <- makeCluster(4)
bulbul$fisher_u <- parApply(cl, bulbul[, c("poly", "u_poly", "fixed", "u_fixed")], 1, do_fisher)
bulbul$fisher_d <- parApply(cl, bulbul[, c("poly", "d_poly", "fixed", "d_fixed")], 1, do_fisher)
stopCluster(cl)
bulbul$sig <- ifelse(bulbul$fisher < 0.05 & (bulbul$fisher_d < 0.05 | bulbul$fisher_u < 0.05), "red", "black")
bulbul$sig[is.na(bulbul$sig)] <- "white"
fwrite(bulbul, "bulbul_noisle_psuedomk_3_flank.txt", sep = "\t", row.names = F, col.names = T, quote = F)
The same steps were repeated for the swallow.
We repeated the above approach for each gene by counting the fixed and polymorphic sites for all conserved elements within 10,000 bp of each gene (including those in introns).
#Get list of elements within 10000 bp of each gene using bedtools
bulbul_gene_10000 <- read.table("Bmel_CDS_elements_intersect.txt", sep = "\t", header = F)
bulbul_gene_10000$id <- str_extract(bulbul_gene_10000$V9, "gene_id [A-Za-z0-9-_.]*")
bulbul_gene_10000$id <- gsub("gene_id ", "", bulbul_gene_10000$id)
bulbul_gene_10000$id2 <- gsub(".[0-9]*$", "", bulbul_gene_10000$id)
bulbul_gene_10000$gene <- mapvalues(bulbul_gene_10000$id2, from=ids$t_transcript, to=ids$t_gene, warn_missing = F)
bulbul_mk_table <- read.table("bulbul_noisle_mk.tsv", sep = "\t", header = T)
bulbul_mk_table <- merge(bulbul_mk_table, bulbul_gene_10000[,c("gene", "V13")], by="gene", all.x = F, all.y = F)
bulbul <- read.table("all_accl_bulbul_noisle_stats.txt", sep="\t", header = T)
bulbul_mk_table <- merge(bulbul_mk_table, bulbul, by.x = "V13", by.y="element", all.x = F, all.y = F)
bulbul_mk_table <- unique(bulbul_mk_table)
#remove elements accelerated in sylvia
rate2 <- fread("M2_elem_Z.txt", sep = "\t", header = T, select = c(1,13))
sylv_accl <- phyloacc$original.id[which(phyloacc$best.fit.model %in% c("M1","M2") & phyloacc$phyloacc.id %in% rate2$`Locus ID`[which(rate2$Sylvia_atricapilla == 2)])]
bulbul_mk_table <- subset(bulbul_mk_table, !(V13 %in% sylv_accl))
#get sum of all elements within 10000 bp of each gene
bulbul_gene <- bulbul_mk_table %>% group_by(gene) %>% summarize(fixed = sum(fixed),
poly = sum(poly),
len = sum(len),
dS = first(dS),
pS = first(pS),
cds_length = first(cds_length),
f0 = first(f0),
f2 = first(f2),
f3 = first(f3),
f4 = first(f4))
bulbul_gene$cnee_dos <- bulbul_gene$fixed / (bulbul_gene$fixed + bulbul_gene$dS) - bulbul_gene$poly / (bulbul_gene$poly + bulbul_gene$pS)
bulbul_gene$alpha <- 1-((bulbul_gene$dS/bulbul_gene$fixed)/(bulbul_gene$pS/bulbul_gene$poly))
bulbul_gene$fisher <- apply(bulbul_gene[, c("pS", "poly", "dS", "fixed")], 1, do_fisher)
bulbul_gene$sig <- ifelse(bulbul_gene$fisher < 0.05, "red", "black")
write.table(bulbul_gene, "bulbul_gene_mk.txt", sep = "\t", row.names = F, col.names = T, quote = F)
The above methods were repeated for swallows, with elements accelerated in Acrocephalus removed.