Variant Effect Predictor Plugins

run

perl INSTALL.pl -a p -g list
tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg38.tsv.gz

tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg19.tsv.gz

wget https://ftp.ensembl.org/pub/current_fasta/ancestral_alleles/homo_sapiens_ancestor_GRCh38.tar.gz
tar xfz homo_sapiens_ancestor_GRCh38.tar.gz
cat homo_sapiens_ancestor_GRCh38/*.fa | bgzip -c > homo_sapiens_ancestor_GRCh38.fa.gz
rm -rf homo_sapiens_ancestor_GRCh38/ homo_sapiens_ancestor_GRCh38.tar.gz
./vep -i variations.vcf --plugin AncestralAllele,homo_sapiens_ancestor_GRCh38.fa.gz

# Prepare the BayesDel score file for the VEP BayesDel plugin.
# Unpack the archive, remove the *.gz.tbi files and decompress every *.gz file.
tar zxvf BayesDel_170824_addAF.tgz
rm *.gz.tbi
gunzip *.gz

# Collect the data rows (lines not starting with '#') of every per-chromosome
# file into one file; without this step the sort below has no input.
for f in BayesDel_170824_addAF_chr*; do grep -v "^#" "$f" >> BayesDel_170824_addAF.txt; done

# Sort by column 1, then numerically by column 2 (the columns tabix indexes on).
sort -k1,1 -k2,2n BayesDel_170824_addAF.txt > BayesDel_170824_addAF_sorted.txt

# Start the final file with the '#' header lines of the chr1 file, then append
# the sorted rows.
grep "^#" BayesDel_170824_addAF_chr1 > BayesDel_170824_addAF_all_scores.txt
cat BayesDel_170824_addAF_sorted.txt >> BayesDel_170824_addAF_all_scores.txt

# Compress and index (sequence name in column 1, position in column 2).
bgzip BayesDel_170824_addAF_all_scores.txt
tabix -s 1 -b 2 -e 2 BayesDel_170824_addAF_all_scores.txt.gz

gzip -d ClinPred.txt.gz # to unzip the text file 
cat ClinPred.txt | tr " " "\t" > ClinPred_tabbed.tsv # to change the file to a tabbed delimited file 
sed '1s/.*/#&/'  ClinPred_tabbed.tsv > tabbed_ClinPred.tsv  # to add a # in the first line of the file 
sed '1s/C/c/' tabbed_ClinPred.tsv > ClinPred_tabbed.tsv # to convert the Chr to chr
bgzip ClinPred_tabbed.tsv
tabix -f -s 1 -b 2 -e 2 ClinPred_tabbed.tsv.gz

version=4.4a
wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFP${version}.zip
unzip dbNSFP${version}.zip
zcat dbNSFP${version}_variant.chr1.gz | head -n1 > h

zgrep -h -v ^#chr dbNSFP${version}_variant.chr* | sort -k1,1 -k2,2n - | cat h - | bgzip -c > dbNSFP${version}_grch38.gz
tabix -s 1 -b 2 -e 2 dbNSFP${version}_grch38.gz

zgrep -h -v ^#chr dbNSFP${version}_variant.chr* | awk '$8 != "." ' | sort -k8,8 -k9,9n - | cat h - | bgzip -c > dbNSFP${version}_grch37.gz
tabix -s 8 -b 9 -e 9 dbNSFP${version}_grch37.gz

--plugin dbNSFP,/path/to/dbNSFP.gz,LRT_score,GERP++_RS

--plugin dbNSFP,/path/to/dbNSFP.gz,ALL

--plugin dbNSFP,http://my.files.com/dbNSFP.gz,col1,col2

--plugin dbNSFP,/path/to/dbNSFP.gz,/path/to/dbNSFP_replacement_logic,LRT_score,GERP++_RS

--plugin dbNSFP,/path/to/dbNSFP.gz,pep_match=0,col1,col2

--plugin dbNSFP,/path/to/dbNSFP.gz,transcript_match=1,col1,col2

wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbscSNV1.1.zip
unzip dbscSNV1.1.zip
head -n1 dbscSNV1.1.chr1 > h

cat dbscSNV1.1.chr* | grep -v ^chr | sort -k5,5 -k6,6n | cat h - | awk '$5 != "."' | bgzip -c > dbscSNV1.1_GRCh38.txt.gz
tabix -s 5 -b 6 -e 6 -c c dbscSNV1.1_GRCh38.txt.gz

cat dbscSNV1.1.chr* | grep -v ^chr | cat h - | bgzip -c > dbscSNV1.1_GRCh37.txt.gz
tabix -s 1 -b 2 -e 2 -c c dbscSNV1.1_GRCh37.txt.gz

--plugin dbscSNV,http://my.files.com/dbscSNV.txt.gz

gunzip all_variant_disease_pmid_associations.tsv.gz
awk '($1 ~ /^snpId/ || $2 ~ /NA/) {next} {print $0}' all_variant_disease_pmid_associations.tsv > all_variant_disease_pmid_associations_clean.tsv
sort -t $'\t' -k2,2 -k3,3n all_variant_disease_pmid_associations_clean.tsv > all_variant_disease_pmid_associations_sorted.tsv
awk '{ gsub (/\t +/, "\t", $0); print}' all_variant_disease_pmid_associations_sorted.tsv > all_variant_disease_pmid_associations_final.tsv
bgzip all_variant_disease_pmid_associations_final.tsv
tabix -s 2 -b 3 -e 3 all_variant_disease_pmid_associations_final.tsv.gz

./vep -i variations.vcf --plugin DisGeNET,file=all_variant_disease_pmid_associations_final.tsv.gz

./vep -i variations.vcf --plugin DisGeNET,file=all_variant_disease_pmid_associations_final.tsv.gz,disease=1
./vep -i variations.vcf --plugin DisGeNET,file=all_variant_disease_pmid_associations_final.tsv.gz,disease=1,filter_source='GWASDB&GWASCAT'

./vep -i variations.vcf --plugin Draw,myimg,2000,100

./vep -i variations.vcf --assembly GRCh38 --plugin Enformer,file=/path/to/Enformer/data.vcf.gz

./vep -i variations.vcf --assembly GRCh38 --plugin Enformer,file=/path/to/Enformer/data.vcf.gz,SAD=1 

./vep -i variations.vcf --assembly GRCh38 --plugin Enformer,file=/path/to/Enformer/data.vcf.gz,SAR=1 

./vep -i variations.vcf --assembly GRCh38 --plugin Enformer,file=/path/to/Enformer/data.vcf.gz,PC=1 

###################################################
# Bash script to merge all VCFs from EVE dataset. #
###################################################
### BEGIN
# EVE input file can be downloaded from https://evemodel.org/api/proteins/bulk/download/
# Input: VCF files by protein (vcf_files_missense_mutations inside zip folder)
# Output: Compressed Merged VCF file (vcf.gz) + index file (.tbi)
DATA_FOLDER='//vcf_files_missense_mutations' # Fill this line
OUTPUT_FOLDER='//eve_plugin' # Fill this line
OUTPUT_NAME='eve_merged.vcf' # Default output name

#######################################
# Merge every *vcf file of a folder into one sorted VCF, then bgzip and
# tabix-index it.
# Arguments:
#   $1 - folder containing the input *vcf files
#   $2 - existing folder that receives the merged file
#   $3 - name of the merged (uncompressed) VCF; bgzip appends ".gz"
# Outputs:
#   $2/$3.gz written by bgzip, plus the index written by tabix
# Returns:
#   0 on success; non-zero if no input VCF is found or a step fails
#######################################
merge_eve_vcfs() {
  local data_folder=$1
  local output_folder=$2
  local output_name=$3
  local merged="${output_folder}/${output_name}"
  local -a vcfs=()
  local vcf

  # Glob directly instead of parsing the output of ls.
  for vcf in "${data_folder}"/*vcf; do
    [[ -f "${vcf}" ]] && vcfs+=("${vcf}")
  done
  if (( ${#vcfs[@]} == 0 )); then
    printf 'No VCF files found in %s\n' "${data_folder}" >&2
    return 1
  fi

  {
    # Header: only the '#' lines of the first VCF. Copying the whole first file
    # here would merge its variants twice.
    grep '^#' "${vcfs[0]}"
    # Variants: non-header lines of every VCF, sorted by chromosome
    # (version sort) and then numerically by position.
    for vcf in "${vcfs[@]}"; do
      grep -v '^#' "${vcf}"
    done | sort -k1,1V -k2,2n
  } > "${merged}" || return 1

  # Compress and index
  bgzip "${merged}" || return 1
  # If not installed, use: sudo apt install tabix
  tabix "${merged}.gz"
}

merge_eve_vcfs "${DATA_FOLDER}" "${OUTPUT_FOLDER}" "${OUTPUT_NAME}"
### END

wget https://raw.github.com/HAShihab/fathmm/master/cgi-bin/fathmm.py
wget http://fathmm.biocompute.org.uk/database/fathmm.v2.3.SQL.gz
gunzip fathmm.v2.3.SQL.gz
mysql -h[host] -P[port] -u[user] -p[pass] -e"CREATE DATABASE fathmm"
mysql -h[host] -P[port] -u[user] -p[pass] -Dfathmm < fathmm.v2.3.SQL
echo -e "[DATABASE]\nHOST = [host]\nPORT = [port]\nUSER = [user]\nPASSWD = [pass]\nDB = fathmm\n" > config.ini

wget https://ftp.ebi.ac.uk/pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt

--plugin G2P,file=G2P.csv,af_monoallelic=0.05,types=stop_gained&frameshift_variant
--plugin G2P,file=G2P.csv,af_monoallelic=0.05,af_from_vcf=1
--plugin G2P,file=G2P.csv,af_from_vcf=1,af_from_vcf_keys='topmed&gnomADe_r2.1.1'
--plugin G2P,file=G2P.csv,af_from_vcf=1,af_from_vcf_keys='topmed&gnomADe_r2.1.1',confidence_levels='confirmed&probable&both RD and IF' 
--plugin G2P,file=G2P.csv

cd $GS/sources
make
cd -
./vep [options] --plugin GeneSplicer,$GS/sources/genesplicer,$GS/human

cd $GS/sources
perl -pi -e "s/^main  /int main  /" genesplicer.cpp
make

wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1/coverage/genomes/gnomad.genomes.coverage.summary.tsv.bgz --no-check-certificate

wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1/coverage/exomes/gnomad.exomes.coverage.summary.tsv.bgz --no-check-certificate

wget https://storage.googleapis.com/gcp-public-data--gnomad/release/3.0.1/coverage/genomes/gnomad.genomes.r3.0.1.coverage.summary.tsv.bgz --no-check-certificate 

gunzip -c gnomad.genomes.coverage.summary.tsv.bgz | sed '1s/.*/#&/' > gnomad.genomes.tabbed.tsv
bgzip gnomad.genomes.tabbed.tsv
tabix -s 1 -b 2 -e 2 gnomad.genomes.tabbed.tsv.gz

gunzip -c gnomad.exomes.coverage.summary.tsv.bgz | sed '1s/.*/#&/' > gnomad.exomes.tabbed.tsv
bgzip gnomad.exomes.tabbed.tsv
tabix -s 1 -b 2 -e 2 gnomad.exomes.tabbed.tsv.gz

# Prepare the gnomAD v3.0.1 genomes coverage file for tabix.
# Decompress and prefix the first (header) line with '#'.
gunzip -c gnomad.genomes.r3.0.1.coverage.summary.tsv.bgz | sed '1s/.*/#&/' > gnomad.genomesv3.tabbed.tsv
# Rename the "locus" header to two columns (chr, pos) and turn every ':' into a
# tab, so that columns 1 and 2 can be used as sequence name and position.
sed "1s/locus/chr\tpos/; s/:/\t/g" gnomad.genomesv3.tabbed.tsv > gnomad.ch.genomesv3.tabbed.tsv
bgzip gnomad.ch.genomesv3.tabbed.tsv
# bgzip replaces its input with <name>.gz, so the index must be built on the .gz file.
tabix -s 1 -b 2 -e 2 gnomad.ch.genomesv3.tabbed.tsv.gz

--plugin GO,dir=${HOME}/go_terms

--plugin GO,file=${HOME}/custom_go_terms.gff.gz

--plugin GO,match=gene
--plugin GO,match=gene_symbol

--plugin GO,remote

tabix -s 1 -b 2 -e 3 -f mutation_gc_map.txt.gz

./vep -i input.vcf -cache -vcf -o input_vep.vcf
./filter_vep -i input_vep.vcf -filter "Consequence is missense_variant" > input_vep_filtered.vcf
./vep -i input_vep_filtered.vcf -cache -plugin LD

cd supplement
zcat supplementary_dataset_11_full_constraint_metrics.tsv.gz | (head -n 1 && tail -n +2  | sort -t$'\t' -k 76,76 -k 77,77n ) > loeuf_temp.tsv
sed '1s/.*/#&/' loeuf_temp.tsv > loeuf_dataset.tsv
bgzip loeuf_dataset.tsv
tabix -f -s 76 -b 77 -e 78 loeuf_dataset.tsv.gz

cat supplementary_dataset_11_full_constraint_metrics_grch38.tsv | (head -n 1 && tail -n +2  | sort -t$'\t' -k 76,76 -k 77,77n ) > loeuf_grch38_temp.tsv
sed '1s/.*/#&/' loeuf_grch38_temp.tsv > loeuf_dataset_grch38.tsv
bgzip loeuf_dataset_grch38.tsv
tabix -f -s 76 -b 77 -e 78 loeuf_dataset_grch38.tsv.gz

./vep -i variants.vcf --plugin LoFtool,scores_file.txt

# GRCh37: unpack the download, bgzip the extracted VCF and build its tabix index.
# NOTE(review): confirm the exact name of the file extracted from the zip and
# adjust the bgzip/tabix arguments if it differs.
unzip mastermind_cited_variants_reference-XXXX.XX.XX-grch37-vcf.zip
bgzip mastermind_cited_variants_reference-XXXX.XX.XX-GRCh37-vcf
# bgzip appends ".gz" to its input name; index exactly that file.
tabix -p vcf mastermind_cited_variants_reference-XXXX.XX.XX-GRCh37-vcf.gz

# GRCh38: same steps for the GRCh38 download.
unzip mastermind_cited_variants_reference-XXXX.XX.XX-grch38-vcf.zip
bgzip mastermind_cited_variants_reference-XXXX.XX.XX-GRCh38-vcf
tabix -p vcf mastermind_cited_variants_reference-XXXX.XX.XX-GRCh38-vcf.gz

./vep -i variations.vcf --plugin Mastermind,file=/path/to/mastermind_cited_variants_reference-XXXX.XX.XX.GRChXX-vcf.gz

./vep -i variations.vcf --plugin Mastermind,file=/path/to/mastermind_cited_variants_reference-XXXX.XX.XX.GRChXX-vcf.gz,mutations=1 

./vep -i variations.vcf --plugin Mastermind,file=/path/to/mastermind_cited_variants_reference-XXXX.XX.XX.GRChXX-vcf.gz,mutations=0,var_iden=1

./vep -i variations.vcf --plugin Mastermind,file=/path/to/mastermind_cited_variants_reference-XXXX.XX.XX.GRChXX-vcf.gz,mutations=0,var_iden=0,url=1

gzip -d mtrflatfile_2.0.txt.gz # to unzip the text file 
cat mtrflatfile_2.0.txt | tr " " "\t" > mtrflatfile_2.00.tsv # to change the file to a tabbed delimited file 
sed '1s/.*/#&/'  mtrflatfile_2.00.tsv > mtrflatfile_2.0.tsv # to add # to the first line of the file 
bgzip mtrflatfile_2.0.tsv
tabix -f -s 1 -b 2 -e 2 mtrflatfile_2.0.tsv.gz

--plugin NearestExonJB,max_range=50000

--plugin NearestGene,limit=3,max_range=50000

./vep -i variations.vcf --plugin neXtProt

./vep -i variations.vcf --plugin neXtProt,return_values='Domain&InteractingRegion'

                                        vvvv
      ES...EE..I.ES...EE.I.ES....EE.I.ES....EE 

                               vvv
      ES...EE..I.ES...EE.I.ES....EE.I.ES....EE 

         vvv
      ..ES...EE..I.ES...EE.I.ES....EE.I.ES....EE 

--plugin Phenotypes,file=${HOME}/phenotypes.gff.gz,include_types=Gene
--plugin Phenotypes,dir=${HOME},include_types=Gene

awk '{print $2, $20 }'  fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt > plI_gene.txt 

awk '{print $1, $20 }'  fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt > plI_transcript.txt 

./vep -i variants.vcf --plugin pLI,values_file.txt
./vep -i variants.vcf --plugin pLI,values_file.txt,transcript # to check for the transcript score.

--plugin PON_P2,pyscript=/path/to/python/script/ponp2.py,hg=hg37

sed -i 's/Chromosome/#Chromosome/' cosmic_sorted.txt
bgzip cosmic_sorted.txt
tabix -s 1 -b 2 -e 2 cosmic_sorted.txt.gz

--plugin PON_P2,file=path/to/cosmic_sorted.txt.gz

wget https://storage.googleapis.com/postgap-data/postgap.txt.gz
gunzip postgap.txt.gz

(grep ^"ld_snp_rsID" postgap.txt; grep -v ^"ld_snp_rsID" postgap.txt | sort -k4,4 -k5,5n ) | bgzip > postgap_GRCh38.txt.gz
tabix -s 4 -b 5 -e 5 -c l postgap_GRCh38.txt.gz

(grep ^"ld_snp_rsID" postgap.txt; grep -v ^"ld_snp_rsID" postgap.txt | sort -k2,2 -k3,3n ) | bgzip > postgap_GRCh37.txt.gz
tabix -s 2 -b 3 -e 3 -c l postgap_GRCh37.txt.gz

--plugin POSTGAP,/path/to/PostGap.gz

--plugin POSTGAP,/path/to/PostGap.gz,ALL

--plugin POSTGAP,/path/to/PostGap.gz,gwas_pmid,gwas_size

--plugin PostGAP,http://my.files.com/postgap.txt.gz

gunzip -cf PrimateAI_scores_v0.2.tsv.gz | sed '12s/.*/#&/' | sed '/^$/d' | awk 'NR<12{print $0;next}{print $0 | "sort -k1,1 -k 2,2n -V"}' | bgzip > PrimateAI_scores_v0.2_GRCh37_sorted.tsv.bgz
tabix -s 1 -b 2 -e 2 PrimateAI_scores_v0.2_GRCh37_sorted.tsv.bgz

gunzip -cf PrimateAI_scores_v0.2_hg38.tsv.gz | sed '12s/.*/#&/' | sed '/^$/d' | awk 'NR<12{print $0;next}{print $0 | "sort -k1,1 -k 2,2n -V"}' | bgzip > PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz
tabix -s 1 -b 2 -e 2 PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz

wget https://ftp.ncbi.nlm.nih.gov/pub/grc/human/GRC/GRCh38/MISC/annotated_clone_assembly_problems_GCF_000001405.38.gff3
wget https://ftp.ncbi.nlm.nih.gov/pub/grc/human/GRC/Issue_Mapping/GRCh38.p12_issues.gff3
cat annotated_clone_assembly_problems_GCF_000001405.38.gff3 GRCh38.p12_issues.gff3 > GRCh38_quality_mergedfile.gff3
sort -k 1,1 -k 4,4n -k 5,5n GRCh38_quality_mergedfile.gff3 > sorted_GRCh38_quality_mergedfile.gff3
bgzip sorted_GRCh38_quality_mergedfile.gff3
tabix -p gff sorted_GRCh38_quality_mergedfile.gff3.gz

./vep -i variations.vcf --plugin ReferenceQuality,sorted_GRCh38_quality_mergedfile.gff3.gz

wget https://ftp.ncbi.nlm.nih.gov/pub/grc/human/GRC/GRCh37/MISC/annotated_clone_assembly_problems_GCF_000001405.25.gff3
wget https://ftp.ncbi.nlm.nih.gov/pub/grc/human/GRC/Issue_Mapping/GRCh37.p13_issues.gff3
cat annotated_clone_assembly_problems_GCF_000001405.25.gff3 GRCh37.p13_issues.gff3 > GRCh37_quality_mergedfile.gff3
sort -k 1,1 -k 4,4n -k 5,5n GRCh37_quality_mergedfile.gff3 > sorted_GRCh37_quality_mergedfile.gff3
bgzip sorted_GRCh37_quality_mergedfile.gff3
tabix -p gff sorted_GRCh37_quality_mergedfile.gff3.gz

./vep -i variations.vcf --plugin ReferenceQuality,sorted_GRCh37_quality_mergedfile.gff3.gz

unzip revel-v1.3_all_chromosomes.zip
cat revel_with_transcript_ids | tr "," "\t" > tabbed_revel.tsv
sed '1s/.*/#&/' tabbed_revel.tsv > new_tabbed_revel.tsv
bgzip new_tabbed_revel.tsv

tabix -f -s 1 -b 2 -e 2 new_tabbed_revel.tsv.gz

zcat new_tabbed_revel.tsv.gz | head -n1 > h
zgrep -h -v ^#chr new_tabbed_revel.tsv.gz | awk '$3 != "." ' | sort -k1,1 -k3,3n - | cat h - | bgzip -c > new_tabbed_revel_grch38.tsv.gz
tabix -f -s 1 -b 3 -e 3 new_tabbed_revel_grch38.tsv.gz

./vep -i variations.vcf --assembly GRCh38 --plugin REVEL,file=/path/to/revel/data.tsv.gz

./vep -i variations.vcf --assembly GRCh38 --plugin REVEL,file=/path/to/revel/data.tsv.gz,no_match=1

(grep ^Chr GRCh38_ALL.tsv; grep -v ^Chr GRCh38_ALL.tsv | sort -k1,1 -k2,2n ) | bgzip > satMutMPRA_GRCh38_ALL.gz
tabix -s 1 -b 2 -e 2 -c C satMutMPRA_GRCh38_ALL.gz

(grep ^Chr GRCh37_ALL.tsv; grep -v ^Chr GRCh37_ALL.tsv | sort -k1,1 -k2,2n ) | bgzip > satMutMPRA_GRCh37_ALL.gz
tabix -s 1 -b 2 -e 2 -c C satMutMPRA_GRCh37_ALL.gz

--plugin satMutMPRA,file=/path/to/satMutMPRA_GRCh38_ALL.gz

--plugin satMutMPRA,file=/path/to/satMutMPRA_GRCh38_ALL.gz,cols=ALL

--plugin satMutMPRA,file=/path/to/satMutMPRA_GRCh38_ALL.gz,cols=Tags:DNA

--plugin satMutMPRA,file=http://my.files.com/satMutMPRA.gz

tabix -p vcf spliceai_scores.raw.snv.hg37.vcf.gz
tabix -p vcf spliceai_scores.raw.indel.hg37.vcf.gz

tabix -p vcf spliceai_scores.raw.snv.hg38.vcf.gz
tabix -p vcf spliceai_scores.raw.indel.hg38.vcf.gz

./vep -i variations.vcf --plugin SpliceAI,snv=/path/to/spliceai_scores.raw.snv.hg38.vcf.gz,indel=/path/to/spliceai_scores.raw.indel.hg38.vcf.gz
./vep -i variations.vcf --plugin SpliceAI,snv=/path/to/spliceai_scores.raw.snv.hg38.vcf.gz,indel=/path/to/spliceai_scores.raw.indel.hg38.vcf.gz,cutoff=0.5

             v
 ...EEEEEIIIIIIIIII...

           vv vvv
 ...EEEEEIIIIIIIIII...

      vvvvvvvvvvvvvvv
 ...IIIIIIIIIIIIIIIIIIIIEEEEE...

./vep -i structvariants.vcf --plugin StructuralVariantOverlap,file=gnomad_v2_sv.sites.vcf.gz

--plugin TranscriptAnnotator,file=${HOME}/file.tsv.gz

bgzip file.txt
tabix -b2 -e2 file.txt.gz

tar -xzvf varity_all_predictions.tar.gz 
cat varity_all_predictions.txt | (head -n 1 && tail -n +2  | sort -t$'\t' -k 1,1 -k 2,2n) > varity_all_predictions_sorted.tsv
sed '1s/.*/#&/'  varity_all_predictions_sorted.tsv > varity_all_predictions.tsv  # to add a # in the first line of the file 
bgzip varity_all_predictions.tsv
tabix -f -s 1 -b 2 -e 2 varity_all_predictions.tsv.gz

./vep -i input.vcf --plugin MyPlugin
./vep -i input.vcf --plugin MyPlugin,1,FOO
# Returns a reference to an array naming two feature types:
# 'Transcript' and 'Intergenic'.
sub feature_types {
    my @wanted = ('Transcript', 'Intergenic');
    return \@wanted;
}
# Returns a reference to an array naming two feature types:
# 'Feature' and 'Intergenic'.
sub feature_types {
    my @wanted = ('Feature', 'Intergenic');
    return \@wanted;
}

Plugin	Description	Category	External libraries	Developer
AlphaMissense	This plugin for the Ensembl Variant Effect Predictor (VEP) annotates missense variants with the pre-computed AlphaMissense pathogenicity scores. AlphaMissense is a deep learning model developed by Google DeepMind that predicts the pathogenicity of single nucleotide missense variants. more This plugin will add two annotations per missense variant: 'am_pathogenicity', a continuous score between 0 and 1 which can be interpreted as the predicted probability of the variant being pathogenic. 'am_class' is the classification of the variant into one of three discrete categories: 'Likely pathogenic', 'Likely benign', or 'ambiguous'. These are derived using the following thresholds of am_pathogenicity: 'Likely benign' if am_pathogenicity < 0.34; 'Likely pathogenic' if am_pathogenicity > 0.564; 'ambiguous' otherwise. These thresholds were chosen to achieve 90% precision for both pathogenic and benign ClinVar variants. Note that AlphaMissense was not trained on ClinVar variants. Variants labeled as 'ambiguous' should be treated as 'unknown' or 'uncertain' according to AlphaMissense. This plugin is available for both GRCh37 (hg19) and GRCh38 (hg38) genome builds. The prediction scores of AlphaMissense can be downloaded from https://console.cloud.google.com/storage/browser/dm_alphamissense (AlphaMissense Database Copyright (2023) DeepMind Technologies Limited). Data contained within the AlphaMissense Database is licensed under the Creative Commons Attribution 4.0 International License (CC-BY) (the “License”). You may obtain a copy of the License at: https://creativecommons.org/licenses/by/4.0/legalcode. 
Use of the AlphaMissense Database is subject to Google Cloud Platform Terms of Service Please cite the AlphaMissense publication alongside Ensembl VEP if you use this resource: https://doi.org/10.1126/science.adg7492 Disclaimer: The AlphaMissense Database and other information provided on or linked to this site is for theoretical modelling only, caution should be exercised in use. It is provided "as-is" without any warranty of any kind, whether express or implied. For clarity, no warranty is given that use of the information shall not infringe the rights of any third party (and this disclaimer takes precedence over any contrary provisions in the Google Cloud Platform Terms of Service). The information provided is not intended to be a substitute for professional medical advice, diagnosis, or treatment, and does not constitute medical or other professional advice. Before running the plugin for the first time, you need to create a tabix index (requires tabix to be installed). tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg38.tsv.gz tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg19.tsv.gz Options are passed to the plugin as key=value pairs: file : (mandatory) Tabix-indexed AlphaMissense data cols : (optional) Colon-separated columns to print from AlphaMissense data; if set to 'all', all columns are printed (default: Missense_pathogenicity:Missense_class) transcript_match : Only print data if transcript identifiers match those from AlphaMissense data (default: 0) AlphaMissense predictions are matched to input data by genomic location and protein change by default.	Pathogenicity predictions	-	Ensembl
AncestralAllele	A VEP plugin that retrieves ancestral allele sequences from a FASTA file. more Ensembl produces FASTA file dumps of the ancestral sequences of key species. Data files for GRCh37 are available from https://ftp.ensembl.org/pub/release-75/fasta/ancestral_alleles/ Data files for GRCh38 are available from https://ftp.ensembl.org/pub/current_fasta/ancestral_alleles/ For optimal retrieval speed, you should pre-process the FASTA files into a single bgzipped file that can be accessed via Bio::DB::HTS::Faidx (installed by VEP's INSTALL.pl): wget https://ftp.ensembl.org/pub/current_fasta/ancestral_alleles/homo_sapiens_ancestor_GRCh38.tar.gz tar xfz homo_sapiens_ancestor_GRCh38.tar.gz cat homo_sapiens_ancestor_GRCh38/*.fa \| bgzip -c > homo_sapiens_ancestor_GRCh38.fa.gz rm -rf homo_sapiens_ancestor_GRCh38/ homo_sapiens_ancestor_GRCh38.tar.gz ./vep -i variations.vcf --plugin AncestralAllele,homo_sapiens_ancestor_GRCh38.fa.gz Data file is only available for GRCh38. The plugin is also compatible with Bio::DB::Fasta and an uncompressed FASTA file. Note the first time you run the plugin with a newly generated FASTA file it will spend some time indexing the file. DO NOT INTERRUPT THIS PROCESS, particularly if you do not have Bio::DB::HTS installed. Special cases: "-" represents an insertion "?" indicates the chromosome could not be looked up in the FASTA	Conservation	-	Ensembl
BayesDel	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that adds the BayesDel scores to VEP output. more BayesDel is a deleteriousness meta-score combining multiple deleteriousness predictors to create an overall score. It works for coding and non-coding variants, single nucleotide variants and small insertions/deletions. The range of the score is from -1.29334 to 0.75731. The higher the score, the more likely the variant is pathogenic. For more information please visit: https://fenglab.chpc.utah.edu/BayesDel/BayesDel.html Please cite the BayesDel publication alongside the Ensembl VEP if you use this resource: https://onlinelibrary.wiley.com/doi/full/10.1002/humu.23158 BayesDel pre-computed scores can be downloaded from https://drive.google.com/drive/folders/1K4LI6ZSsUGBhHoChUtegC8bgCt7hbQlA Note: These files only contain pre-computed BayesDel scores for missense variants for assembly GRCh37. For GRCh37: tar zxvf BayesDel_170824_addAF.tgz rm *.gz.tbi gunzip *.gz for f in BayesDel_170824_addAF_chr*; do grep -v "^#" $f >> BayesDel_170824_addAF.txt; done cat BayesDel_170824_addAF.txt \| sort -k1,1 -k2,2n > BayesDel_170824_addAF_sorted.txt grep "^#" BayesDel_170824_addAF_chr1 > BayesDel_170824_addAF_all_scores.txt cat BayesDel_170824_addAF_sorted.txt >> BayesDel_170824_addAF_all_scores.txt bgzip BayesDel_170824_addAF_all_scores.txt tabix -s 1 -b 2 -e 2 BayesDel_170824_addAF_all_scores.txt.gz For GRCh38: Remap the GRCh37 file. The tabix utility must be installed in your path to use this plugin.	Pathogenicity predictions	-	Ensembl
Blosum62	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that looks up the BLOSUM 62 substitution matrix score for the reference and alternative amino acids predicted for a missense mutation. It adds one new entry to the VEP's Extra column, BLOSUM62 which is the associated score.	Conservation	-	Ensembl
CADD Combined Annotation Dependent Depletion	A VEP plugin that retrieves CADD scores for variants from one or more tabix-indexed CADD data files. more Please cite the CADD publication alongside the VEP if you use this resource: https://www.ncbi.nlm.nih.gov/pubmed/24487276 The tabix utility must be installed in your path to use this plugin. The CADD SNV and indels data files (and respective Tabix index files) can be downloaded from - http://cadd.gs.washington.edu/download The CADD SV data files (and respective Tabix index files) can be downloaded from - https://kircherlab.bihealth.org/download/CADD-SV/v1.1/ By default, the plugin does not annotate SV variants if an SNV and/or indel CADD annotation file is provided, because doing so can result in too many lines being matched from the annotation files and increase run time exponentially. You can override this behavior by providing force_annotate=1, which will force the plugin to annotate at the expense of increased run time. The plugin works with all versions of available CADD files. The plugin only reports scores and does not consider any additional annotations from a CADD file. It is therefore sufficient to use CADD files without the additional annotations.	Pathogenicity predictions	-	Ensembl
CAPICE	A VEP plugin that retrieves CAPICE scores for variants from one or more tabix-indexed CAPICE data files, in order to predict their pathogenicity. more Please cite the CAPICE publication alongside the VEP if you use this resource: https://pubmed.ncbi.nlm.nih.gov/32831124/ The tabix utility must be installed in your path to use this plugin. The CAPICE SNVs, InDels and respective index (TBI) files for GRCh37 can be downloaded from https://zenodo.org/record/3928295 To filter results, please use filter_vep with the output file or standard output. Documentation on filter_vep is available at: https://www.ensembl.org/info/docs/tools/vep/script/vep_filter.html For recommendations on threshold selection, please read the CAPICE publication.	Pathogenicity predictions	-	Ensembl
Carol	A VEP plugin that calculates the Combined Annotation scoRing toOL (CAROL) score (1) for a missense mutation based on the pre-calculated SIFT (2) and PolyPhen-2 (3) scores from the Ensembl API (4). more It adds one new entry class to the VEP's Extra column, CAROL which is the calculated CAROL score. Note that this module is a perl reimplementation of the original R script, available at: https://sanger.ac.uk/tool/carol/ I believe that both versions implement the same algorithm, but if there are any discrepancies the R version should be treated as the reference implementation. Bug reports are welcome. References: (1) Lopes MC, Joyce C, Ritchie GRS, John SL, Cunningham F, Asimit J, Zeggini E. A combined functional annotation score for non-synonymous variants Human Heredity 73(1):47-51 (2012) doi:10.1159/000334984 (2) Kumar P, Henikoff S, Ng PC. Predicting the effects of coding non-synonymous variants on protein function using the SIFT algorithm Nature Protocols 4(8):1073-1081 (2009) doi:10.1038/nprot.2009.86 (3) Adzhubei IA, Schmidt S, Peshkin L, Ramensky VE, Gerasimova A, Bork P, Kondrashov AS, Sunyaev SR. A method and server for predicting damaging missense mutations Nature Methods 7(4):248-249 (2010) doi:10.1038/nmeth0410-248 (4) Flicek P, et al. Ensembl 2012 Nucleic Acids Research 40(D1):D84-D90 (2011) doi: 10.1093/nar/gkr991	Pathogenicity predictions	Math::CDF qw(pnorm qnorm)	Ensembl
ClinPred	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that adds pre-calculated scores from ClinPred. ClinPred is a prediction tool to identify disease-relevant nonsynonymous variants. more This plugin is only available for GRCh37. Please cite the ClinPred publication alongside the VEP if you use this resource: https://www.sciencedirect.com/science/article/pii/S0002929718302714 ClinPred scores can be downloaded from https://sites.google.com/site/clinpred/download The following steps are necessary to tabix the ClinPred.txt.gz file before running the plugin: gzip -d ClinPred.txt.gz # to unzip the text file cat ClinPred.txt \| tr " " "\t" > ClinPred_tabbed.tsv # to change the file to a tabbed delimited file sed '1s/.*/#&/' ClinPred_tabbed.tsv > tabbed_ClinPred.tsv # to add a # in the first line of the file sed '1s/C/c/' tabbed_ClinPred.tsv > ClinPred_tabbed.tsv # to convert the Chr to chr bgzip ClinPred_tabbed.tsv tabix -f -s 1 -b 2 -e 2 ClinPred_tabbed.tsv.gz The tabix utility must be installed in your path to use this plugin. Check https://github.com/samtools/htslib.git for instructions.	Pathogenicity predictions	-	Ensembl
Condel	A VEP plugin that calculates the Consensus Deleteriousness (Condel) score (1) for a missense mutation based on the pre-calculated SIFT (2) and PolyPhen-2 (3) scores from the Ensembl API (4). more It adds one new entry class to the VEP's Extra column, Condel which is the calculated Condel score. This version of Condel was developed by the Biomedical Genomics Group of the Universitat Pompeu Fabra, at the Barcelona Biomedical Research Park and available at https://bg.upf.edu/condel. The code in this plugin is based on a script provided by this group and slightly reformatted to fit into the Ensembl API. The plugin takes 3 command line arguments, the first is the path to a Condel configuration directory which contains cutoffs and the distribution files etc., the second is either "s", "p", or "b" to output the Condel score, prediction or both (the default is both), and the third argument is either 1 or 2 to use the original version of Condel (1), or the newer version (2) - 2 is the default and is recommended to avoid false positive predictions from Condel in some circumstances. An example Condel configuration file and a set of distribution files can be found in the config/Condel directory in this repository. You should edit the config/Condel/config/condel_SP.conf file and set the 'condel.dir' parameter to the full path to the location of the config/Condel directory on your system. References: (1) Gonzalez-Perez A, Lopez-Bigas N. Improving the assessment of the outcome of non-synonymous SNVs with a Consensus deleteriousness score (Condel) Am J Hum Genet 88(4):440-449 (2011) doi:10.1016/j.ajhg.2011.03.004 (2) Kumar P, Henikoff S, Ng PC. Predicting the effects of coding non-synonymous variants on protein function using the SIFT algorithm Nature Protocols 4(8):1073-1081 (2009) doi:10.1038/nprot.2009.86 (3) Adzhubei IA, Schmidt S, Peshkin L, Ramensky VE, Gerasimova A, Bork P, Kondrashov AS, Sunyaev SR. 
A method and server for predicting damaging missense mutations Nature Methods 7(4):248-249 (2010) doi:10.1038/nmeth0410-248 (4) Flicek P, et al. Ensembl 2012 Nucleic Acids Research (2011) doi: 10.1093/nar/gkr991	Pathogenicity predictions	-	Ensembl
Conservation	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that retrieves a conservation score from the Ensembl Compara databases for variant positions. You can specify the method link type and species sets as command line options, the default is to fetch GERP scores from the EPO 35 way mammalian alignment (please refer to the Compara documentation for more details of available analyses). more If a variant affects multiple nucleotides the average score for the position will be returned, and for insertions the average score of the 2 flanking bases will be returned. The plugin uses the ensembl-compara API module (optional, see http://www.ensembl.org/info/docs/api/index.html) or obtains data directly from BigWig files (optional, see https://ftp.ensembl.org/pub/current_compara/conservation_scores/)	Conservation	Net::FTP	Ensembl
dbNSFP	A VEP plugin that retrieves data for missense variants from a tabix-indexed dbNSFP file. more Please cite the dbNSFP publications alongside the VEP if you use this resource: dbNSFP https://www.ncbi.nlm.nih.gov/pubmed/21520341 dbNSFP v2.0 https://www.ncbi.nlm.nih.gov/pubmed/23843252 dbNSFP v3.0 https://www.ncbi.nlm.nih.gov/pubmed/26555599 dbNSFP v4 https://www.ncbi.nlm.nih.gov/pubmed/33261662 You must have the Bio::DB::HTS module or the tabix utility must be installed in your path to use this plugin. The dbNSFP data file can be downloaded from https://sites.google.com/site/jpopgen/dbNSFP The file must be processed and indexed with tabix before use by this plugin. The file must be processed according to the dbNSFP release version and the assembly you use. It is recommended to use the -T option with the sort command to specify a temporary directory with sufficient space. For release 4.4a: version=4.4a wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFP${version}.zip unzip dbNSFP${version}.zip zcat dbNSFP${version}_variant.chr1.gz \| head -n1 > h # GRCh38/hg38 data zgrep -h -v ^#chr dbNSFP${version}_variant.chr* \| sort -k1,1 -k2,2n - \| cat h - \| bgzip -c > dbNSFP${version}_grch38.gz tabix -s 1 -b 2 -e 2 dbNSFP${version}_grch38.gz # GRCh37/hg19 data zgrep -h -v ^#chr dbNSFP${version}_variant.chr* \| awk '$8 != "." ' \| sort -k8,8 -k9,9n - \| cat h - \| bgzip -c > dbNSFP${version}_grch37.gz tabix -s 8 -b 9 -e 9 dbNSFP${version}_grch37.gz When running the plugin you must list at least one column to retrieve from the dbNSFP file, specified as parameters to the plugin e.g. --plugin dbNSFP,/path/to/dbNSFP.gz,LRT_score,GERP++_RS You may include all columns with ALL; this fetches a large amount of data per variant!: --plugin dbNSFP,/path/to/dbNSFP.gz,ALL Tabix also allows the data file to be hosted on a remote server. 
This plugin is fully compatible with such a setup - simply use the URL of the remote file: --plugin dbNSFP,http://my.files.com/dbNSFP.gz,col1,col2 The plugin replaces occurrences of ';' with ',' and '\|' with '&'. However, some data field columns, e.g. Interpro_domain, use the replacement characters. We added a file with replacement logic for customising the required replacement of ';' and '\|' in dbNSFP data columns. In addition to the default replacements (; to , and \| to &) users can add customised replacements. Users can either modify the file dbNSFP_replacement_logic in the VEP_plugins directory or provide their own file as second argument when calling the plugin: --plugin dbNSFP,/path/to/dbNSFP.gz,/path/to/dbNSFP_replacement_logic,LRT_score,GERP++_RS Note that transcript sequences referred to in dbNSFP may be out of sync with those in the latest release of Ensembl; this may lead to discrepancies with scores retrieved from other sources. If the dbNSFP README file is found in the same directory as the data file, column descriptions will be read from this and incorporated into the VEP output file header. The plugin matches rows in the tabix-indexed dbNSFP file on: genomic position alt allele aaref - reference amino acid aaalt - alternative amino acid To match only on the genomic position and the alt allele use pep_match=0 --plugin dbNSFP,/path/to/dbNSFP.gz,pep_match=0,col1,col2 Some fields contain multiple values, one per Ensembl transcript ID. By default all values are returned, separated by ";" in the default VEP output format. To return values only for the matched Ensembl transcript ID use transcript_match=1. This behaviour only affects transcript-specific fields; non-transcript-specific fields are unaffected. --plugin dbNSFP,/path/to/dbNSFP.gz,transcript_match=1,col1,col2 NB 1: Using this flag may cause no value to return if the version of the Ensembl transcript set differs between VEP and dbNSFP. 
NB 2: MutationTaster entries are keyed on a different set of transcript IDs. Using the transcript_match flag with any MutationTaster field selected will have no effect i.e. all entries are returned. Information on corresponding transcript(s) for MutationTaster fields can be found using http://www.mutationtaster.org/ChrPos.html	Pathogenicity predictions	File::Basename qw(basename)	Ensembl
dbscSNV	A VEP plugin that retrieves data for splicing variants from a tabix-indexed dbscSNV file. more Please cite the dbscSNV publication alongside the VEP if you use this resource: http://nar.oxfordjournals.org/content/42/22/13534 The Bio::DB::HTS perl library or tabix utility must be installed in your path to use this plugin. The dbscSNV data file can be downloaded from https://sites.google.com/site/jpopgen/dbNSFP. The file must be processed and indexed by tabix before use by this plugin. dbscSNV1.1 has coordinates for both GRCh38 and GRCh37; the file must be processed differently according to the assembly you use. wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbscSNV1.1.zip unzip dbscSNV1.1.zip head -n1 dbscSNV1.1.chr1 > h # GRCh38 cat dbscSNV1.1.chr* \| grep -v ^chr \| sort -k5,5 -k6,6n \| cat h - \| awk '$5 != "."' \| bgzip -c > dbscSNV1.1_GRCh38.txt.gz tabix -s 5 -b 6 -e 6 -c c dbscSNV1.1_GRCh38.txt.gz # GRCh37 cat dbscSNV1.1.chr* \| grep -v ^chr \| cat h - \| bgzip -c > dbscSNV1.1_GRCh37.txt.gz tabix -s 1 -b 2 -e 2 -c c dbscSNV1.1_GRCh37.txt.gz Note that in the last command we tell tabix that the header line starts with "c"; this may change to the default of "#" in future versions of dbscSNV. Tabix also allows the data file to be hosted on a remote server. This plugin is fully compatible with such a setup - simply use the URL of the remote file: --plugin dbscSNV,http://my.files.com/dbscSNV.txt.gz Note that transcript sequences referred to in dbscSNV may be out of sync with those in the latest release of Ensembl; this may lead to discrepancies with scores retrieved from other sources.	Splicing predictions	-	Ensembl
DisGeNET	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that adds Variant-Disease-PMID associations from the DisGeNET database. It is available for GRCh38. more Please cite the DisGeNET publication alongside the VEP if you use this resource: https://academic.oup.com/nar/article/48/D1/D845/5611674 Options are passed to the plugin as key=value pairs: file : Path to DisGeNET data file (mandatory). disease : Set value to 1 to include the diseases/phenotype names reporting the Variant-PMID association (optional). rsid : Set value to 1 to include the dbSNP variant Identifier (optional). filter_score : Only reports citations with a score greater than or equal to the input value (optional). filter_source : Only reports citations from input sources (optional). Accepted sources are: UNIPROT, CLINVAR, GWASDB, GWASCAT, BEFREE Separate multiple values with '&'. Output: Each element of the output includes: - PMID of the publication reporting the Variant-Disease association (default) - DisGeNET score for the Variant-Disease association (default) - diseases/phenotype names (optional) - dbSNP variant Identifier (optional) The following steps are necessary before running this plugin (tested with DisGeNET export date 2020-05-26): This plugin uses file 'all_variant_disease_pmid_associations.tsv.gz' File can be downloaded from: https://www.disgenet.org/downloads gunzip all_variant_disease_pmid_associations.tsv.gz awk '($1 ~ /^snpId/ \|\| $2 ~ /NA/) {next} {print $0}' all_variant_disease_pmid_associations.tsv > all_variant_disease_pmid_associations_clean.tsv sort -t $'\t' -k2,2 -k3,3n all_variant_disease_pmid_associations_clean.tsv > all_variant_disease_pmid_associations_sorted.tsv awk '{ gsub (/\t +/, "\t", $0); print}' all_variant_disease_pmid_associations_sorted.tsv > all_variant_disease_pmid_associations_final.tsv bgzip all_variant_disease_pmid_associations_final.tsv tabix -s 2 -b 3 -e 3 all_variant_disease_pmid_associations_final.tsv.gz The plugin can then be run as 
default: ./vep -i variations.vcf --plugin DisGeNET,file=all_variant_disease_pmid_associations_final.tsv.gz or with an option to include optional data and/or filters: ./vep -i variations.vcf --plugin DisGeNET,file=all_variant_disease_pmid_associations_final.tsv.gz,disease=1 ./vep -i variations.vcf --plugin DisGeNET,file=all_variant_disease_pmid_associations_final.tsv.gz,disease=1,filter_source='GWASDB&GWASCAT' Of note: this plugin only matches the chromosome and the position in the chromosome; the alleles are not taken into account when appending the DisGeNET data. The rsid is optionally provided in the output to help filter the relevant data.	Phenotype data and citations	List::MoreUtils qw(uniq)	Ensembl
DosageSensitivity	A VEP plugin that retrieves haploinsufficiency and triplosensitivity probability scores for affected genes from a dosage sensitivity catalogue published in the paper - https://www.sciencedirect.com/science/article/pii/S0092867422007887 more Please cite the above publication alongside the VEP if you use this resource. This plugin returns two scores: - pHaplo score gives the probability of a gene being haploinsufficient (deletion intolerant) - pTriplo score gives the probability of a gene being triplosensitive (duplication intolerant) Pre-requisites: You need the compressed tsv file containing the dosage sensitivity score. The file Collins_rCNV_2022.dosage_sensitivity_scores.tsv.gz can be downloaded from here - https://zenodo.org/record/6347673/files/Collins_rCNV_2022.dosage_sensitivity_scores.tsv.gz Options are passed to the plugin as key=value pairs: file : (mandatory) compressed tsv file containing dosage sensitivity scores cover : set value to 1 (0 by default) to report scores only if the variant covers the affected feature completely (e.g. - a CNV that duplicates the gene). The feature is a gene if using --database otherwise it is a transcript.	Functional effect	-	Ensembl
Downstream	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that predicts the downstream effects of a frameshift variant on the protein sequence of a transcript. It provides the predicted downstream protein sequence (including any amino acids overlapped by the variant itself), and the change in length relative to the reference protein. more Note that changes in splicing are not predicted - only the existing translatable (i.e. spliced) sequence is used as a source of translation. Any variants with a splice site consequence type are ignored. If VEP is run in offline mode using the flag --offline, a FASTA file is required. See: https://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html#fasta Sequence may be incomplete without a FASTA file or database connection.	Nearby features	-	Ensembl
Draw	A VEP plugin that draws pictures of the transcript model showing the variant location. Can take five optional parameters: more 1) File name stem for images 2) Image width in pixels (default: 1000px) 3) Image height in pixels (default: 100px) 4) Transcript ID - only draw images for this transcript 5) Variant ID - only draw images for this variant e.g. ./vep -i variations.vcf --plugin Draw,myimg,2000,100 Images are written to [file_stem]_[transcript_id]_[variant_id].png Requires GD library installed to run.	Visualisation	GD::Polygon GD	Ensembl
Enformer	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that adds pre-calculated Enformer predictions of variant impact on chromatin and gene expression. more The predictions have been aggregated across all 896 spatial bins to generate 5313 features corresponding to track prediction changes in different assays and cell types. This plugin is available for GRCh37 and GRCh38 Please cite the Enformer publication alongside the VEP if you use this resource: https://www.nature.com/articles/s41592-021-01252-x GRCh38 scores were lifted over using CrossMap from the Enformer scores available here - https://console.cloud.google.com/storage/browser/dm-enformer/variant-scores/1000-genomes/enformer Enformer scores can be downloaded from https://ftp.ensembl.org/pub/current_variation/Enformer for GRCh37 and GRCh38. The plugin can then be run as default to retrieve SAD (SNP Activity Difference) and SAR (same as SAD, but computed as np.log2(1 + model(alternate_sequence)) - np.log2(1 + model(reference_sequence))) scores from Enformer: ./vep -i variations.vcf --assembly GRCh38 --plugin Enformer,file=/path/to/Enformer/data.vcf.gz or run with option to only retrieve the SAD (SNP Activity Difference) score - the main variant effect score, computed as model(alternate_sequence) - model(reference_sequence): ./vep -i variations.vcf --assembly GRCh38 --plugin Enformer,file=/path/to/Enformer/data.vcf.gz,SAD=1 or run with option to only retrieve the SAR (same as SAD, but computed as np.log2(1 + model(alternate_sequence)) - np.log2(1 + model(reference_sequence))) score: ./vep -i variations.vcf --assembly GRCh38 --plugin Enformer,file=/path/to/Enformer/data.vcf.gz,SAR=1 or run with option to also retrieve the principal component scores which are a reduced representation of a much bigger vector of the SAD and SAR after using principal component analysis (PCA) ./vep -i variations.vcf --assembly GRCh38 --plugin Enformer,file=/path/to/Enformer/data.vcf.gz,PC=1 The tabix utility must 
be installed in your path to use this plugin. Check https://github.com/samtools/htslib.git for instructions.	Regulatory impact	-	Ensembl
EVE	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that adds information from EVE (evolutionary model of variant effect). This plugin only reports EVE scores for input variants and does not merge input lines to report on adjacent variants. It is only available for GRCh38. more Please cite the EVE publication alongside the VEP if you use this resource: https://www.nature.com/articles/s41586-021-04043-8 ################################################### # Bash script to merge all VCFs from EVE dataset. # ################################################### ### BEGIN # EVE input file can be downloaded from https://evemodel.org/api/proteins/bulk/download/ # Input: VCF files by protein (vcf_files_missense_mutations inside zip folder) # Output: Compressed Merged VCF file (vcf.gz) + index file (.tbi) DATA_FOLDER='//vcf_files_missense_mutations' # Fill this line OUTPUT_FOLDER='//eve_plugin' # Fill this line OUTPUT_NAME='eve_merged.vcf' # Default output name # Get header from first VCF cat `ls ${DATA_FOLDER}/vcf \| head -n1` > header # Get variants from all VCFs and add to a single-file ls ${DATA_FOLDER}/vcf \| while read VCF; do grep -v '^#' ${VCF} >> variants; done # Merge Header + Variants in a single file cat header variants \| \ awk '$1 ~ /^#/ {print $0;next} {print $0 \| "sort -k1,1V -k2,2n"}' > ${OUTPUT_FOLDER}/${OUTPUT_NAME}; # Remove temporary files rm header variants # Compress and index bgzip ${OUTPUT_FOLDER}/${OUTPUT_NAME}; # If not installed, use: sudo apt install tabix tabix ${OUTPUT_FOLDER}/${OUTPUT_NAME}.gz; ### END	Pathogenicity predictions	-	Ensembl
FATHMM	A VEP plugin that gets FATHMM scores and predictions for missense variants. more You will need the fathmm.py script and its dependencies (Python, Python MySQLdb). You should create a "config.ini" file in the same directory as the fathmm.py script with the database connection options. More information about how to set up FATHMM can be found on the FATHMM website at https://github.com/HAShihab/fathmm A typical installation could consist of: wget https://raw.github.com/HAShihab/fathmm/master/cgi-bin/fathmm.py wget http://fathmm.biocompute.org.uk/database/fathmm.v2.3.SQL.gz gunzip fathmm.v2.3.SQL.gz mysql -h[host] -P[port] -u[user] -p[pass] -e"CREATE DATABASE fathmm" mysql -h[host] -P[port] -u[user] -p[pass] -Dfathmm < fathmm.v2.3.SQL echo -e "[DATABASE]\nHOST = [host]\nPORT = [port]\nUSER = [user]\nPASSWD = [pass]\nDB = fathmm\n" > config.ini	Pathogenicity predictions	-	Ensembl
FATHMM_MKL	A VEP plugin that retrieves FATHMM-MKL scores for variants from a tabix-indexed FATHMM-MKL data file. more See https://github.com/HAShihab/fathmm-MKL for details. NB: The currently available data file is for GRCh37 only.	Pathogenicity predictions	-	Ensembl
FlagLRG	A VEP plugin that retrieves the LRG ID matching either the RefSeq or Ensembl transcript IDs. You can obtain the 'list_LRGs_transcripts_xrefs.txt' using: more wget https://ftp.ebi.ac.uk/pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt	External ID	Text::CSV	Stephen Kazakoff
FunMotifs	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that adds tissue-specific transcription factor motifs from FunMotifs to VEP output. more Please cite the FunMotifs publication alongside the VEP if you use this resource. The preprint can be found at: https://www.biorxiv.org/content/10.1101/683722v1 FunMotifs files can be downloaded from: http://bioinf.icm.uu.se:3838/funmotifs/ At the time of writing, all BED files found through this link support GRCh37, however other assemblies are supported by the plugin if an appropriate BED file is supplied. The tabix utility must be installed in your path to use this plugin.	Motif	-	Ensembl
G2P gene2phenotype	A VEP plugin that uses G2P allelic requirements to assess variants in genes for potential phenotype involvement. more The plugin has multiple configuration options, though minimally requires only the CSV file of G2P data. For further information see: Thormann A, Halachev M, McLaren W, et al. Flexible and scalable diagnostic filtering of genomic variants using G2P with Ensembl VEP. Nature Communications. 2019 May;10(1):2373. DOI: 10.1038/s41467-019-10016-3. PMID: 31147538; PMCID: PMC6542828. Options are passed to the plugin as key=value pairs, (defaults in parentheses): file : Path to G2P data file. The file needs to be uncompressed. - Download from https://www.ebi.ac.uk/gene2phenotype/downloads - Download from PanelApp variant_include_list : A list of variants to include even if variants do not pass allele frequency filtering. The include list needs to be a sorted, bgzipped and tabixed VCF file. af_monoallelic : maximum allele frequency for inclusion for monoallelic genes (0.0001) af_biallelic : maximum allele frequency for inclusion for biallelic genes (0.005) confidence_levels : Confidence levels include: definitive, strong, moderate, limited Former confidence terms are still supported: confirmed, probable, possible, both RD and IF. Separate multiple values with '&'. https://www.ebi.ac.uk/gene2phenotype/terminology Default levels are confirmed and probable. all_confidence_levels : Set to 1 to include all confidence levels Setting the value to 1 will overwrite any confidence levels provided with the confidence_levels option. af_from_vcf : set value to 1 to include allele frequencies from VCF file. Specify the list of reference populations to include with --af_from_vcf_keys af_from_vcf_keys : VCF collections used for annotating variant alleles with observed allele frequencies. Allele frequencies are retrieved from VCF files. 
If af_from_vcf is set to 1 but no VCF collections are specified with --af_from_vcf_keys all available VCF collections are included. Available VCF collections: topmed, uk10k, gnomADe, gnomADe_r2.1.1, gnomADg, gnomADg_v3.1.2. Separate multiple values with '&'. VCF collections contain the following populations: topmed : TOPMed (available for GRCh37 and GRCh38). uk10k : ALSPAC, TWINSUK (available for GRCh37 and GRCh38). gnomADe & gnomADe_r2.1.1 - gnomADe:AFR, gnomADe:ALL, gnomADe:AMR, gnomADe:ASJ, gnomADe:EAS, gnomADe:FIN, gnomADe:NFE, gnomADe:OTH, gnomADe:SAS (for GRCh37 and GRCh38 respectively). gnomADg & gnomADg_v3.1.2 - gnomADg:AFR, gnomADg:ALL, gnomADg:AMR, gnomADg:ASJ, gnomADg:EAS, gnomADg:FIN, gnomADg:NFE, gnomADg:OTH (for GRCh37 and GRCh38 respectively). The af_from_vcf parameter must be set to use this option. default_af : default frequency of the input variant if no frequency data is found (0). This determines whether such variants are included; the value of 0 forces variants with no frequency data to be included as this is considered equivalent to having a frequency of 0. Set to 1 (or any value higher than af) to exclude them. types : SO consequence types to include. Separate multiple values with '&' (splice_donor_variant,splice_acceptor_variant,stop_gained, frameshift_variant,stop_lost,initiator_codon_variant, inframe_insertion,inframe_deletion,missense_variant, coding_sequence_variant,start_lost,transcript_ablation, transcript_amplification,protein_altering_variant) log_dir : write stats to log files in log_dir txt_report : write all G2P complete genes and attributes to txt file html_report : write all G2P complete genes and attributes to html file filter_by_gene_symbol: set to 1 if filtering by gene symbol. Do not set if filtering by HGNC_id. This option is set to 1 when using PanelApp files. only_mane : set to 1 to ignore transcripts that are not MANE N/B - Information may be lost if this option is used. 
For more information - https://www.ebi.ac.uk/gene2phenotype/g2p_vep_plugin Example: --plugin G2P,file=G2P.csv,af_monoallelic=0.05,types=stop_gained&frameshift_variant --plugin G2P,file=G2P.csv,af_monoallelic=0.05,af_from_vcf=1 --plugin G2P,file=G2P.csv,af_from_vcf=1,af_from_vcf_keys='topmed&gnomADe_r2.1.1' --plugin G2P,file=G2P.csv,af_from_vcf=1,af_from_vcf_keys='topmed&gnomADe_r2.1.1',confidence_levels='confirmed&probable&both RD and IF' --plugin G2P,file=G2P.csv	Phenotype data and citations	List::Util qw(any) Text::CSV Scalar::Util qw(looks_like_number) FileHandle Cwd	Ensembl
GeneSplicer	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that runs GeneSplicer (https://ccb.jhu.edu/software/genesplicer/) to get splice site predictions. more It evaluates a tract of sequence either side of and including the variant, both in reference and alternate states. The amount of sequence included either side defaults to 100bp, but can be modified by passing e.g. "context=50" as a parameter to the plugin. Any predicted splicing regions that overlap the variant are reported in the output with one of four states: no_change, diff, gain, loss There follows a "/"-separated string consisting of the following data: 1) type (donor, acceptor) 2) coordinates (start-end) 3) confidence (Low, Medium, High) 4) score Example: loss/acceptor/727006-727007/High/16.231924 If multiple sites are predicted, their reports are separated by ",". For diff, the confidence and score for both the reference and alternate sequences is reported as REF-ALT. Example: diff/donor/621915-621914/Medium-Medium/7.020731-6.988368 Several parameters can be modified by passing them to the plugin string: context : change the amount of sequence added either side of the variant (default: 100bp) tmpdir : change the temporary directory used (default: /tmp) cache_size : change how many sequences' scores are cached in memory (default: 50) Example: --plugin GeneSplicer,$GS/bin/linux/genesplicer,$GS/human,context=200,tmpdir=/mytmp On some systems the binaries provided will not execute, but can be compiled from source: cd $GS/sources make cd - ./vep [options] --plugin GeneSplicer,$GS/sources/genesplicer,$GS/human On Mac OSX the make step is known to fail; the genesplicer.cpp file requires modification: cd $GS/sources perl -pi -e "s/^main /int main /" genesplicer.cpp make	Splicing predictions	Digest::MD5 qw(md5_hex)	Ensembl
Geno2MP	A VEP plugin that adds information from Geno2MP, a web-accessible database of rare variant genotypes linked to phenotypic information. more Parameters can be set using a key=value system: file: VCF file containing Geno2MP data cols: colon-delimited list of Geno2MP columns to return from INFO fields (by default it only returns the column HPO_CT) url: build and return URL to Geno2MP variant page (boolean; 0 by default); the variant location in Geno2MP website is based on GRCh37 coordinates Please cite Geno2MP alongside the VEP if you use this resource: Geno2MP, NHGRI/NHLBI University of Washington-Center for Mendelian Genomics (UW-CMG), Seattle, WA (URL: http://geno2mp.gs.washington.edu [date (month, yr) accessed]).	Phenotype data and citations	-	Ensembl
gnomADc	A VEP plugin that retrieves gnomAD annotation from either the genome or exome coverage files, available here: https://gnomad.broadinstitute.org/downloads more To download the gnomad coverage file in TSV format: for Assembly GRCh37: gnomad genomes: wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1/coverage/genomes/gnomad.genomes.coverage.summary.tsv.bgz --no-check-certificate gnomad exomes: wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1/coverage/exomes/gnomad.exomes.coverage.summary.tsv.bgz --no-check-certificate for Assembly GRCh38: gnomad genomes: wget https://storage.googleapis.com/gcp-public-data--gnomad/release/3.0.1/coverage/genomes/gnomad.genomes.r3.0.1.coverage.summary.tsv.bgz --no-check-certificate Necessary before using the plugin for Assembly GRCh37: The following steps are necessary to tabix the gnomad genomes coverage file : gunzip -c gnomad.genomes.coverage.summary.tsv.bgz \| sed '1s/./#&/' > gnomad.genomes.tabbed.tsv bgzip gnomad.genomes.tabbed.tsv tabix -s 1 -b 2 -e 2 gnomad.genomes.tabbed.tsv.gz The following steps are necessary to tabix the gnomad exomes coverage file : gunzip -c gnomad.exomes.coverage.summary.tsv.bgz \| sed '1s/./#&/' > gnomad.exomes.tabbed.tsv bgzip gnomad.exomes.tabbed.tsv tabix -s 1 -b 2 -e 2 gnomad.exomes.tabbed.tsv.gz for Assembly GRCh38: The following steps are necessary to tabix the gnomad genomes coverage file : gunzip -c gnomad.genomes.r3.0.1.coverage.summary.tsv.bgz \| sed '1s/.*/#&/' > gnomad.genomesv3.tabbed.tsv sed "1s/locus/chr\tpos/; s/:/\t/g" gnomad.genomesv3.tabbed.tsv > gnomad.ch.genomesv3.tabbed.tsv bgzip gnomad.ch.genomesv3.tabbed.tsv tabix -s 1 -b 2 -e 2 gnomad.ch.genomesv3.tabbed.tsv.gz This plugin also tries to be backwards compatible with older versions of the coverage summary files, including releases 2.0.1 and 2.0.2. These releases provide one coverage file per chromosome and these can be used "as-is" without requiring any preprocessing. 
If you use this plugin, please see the terms and data information: https://gnomad.broadinstitute.org/terms You must have the Bio::DB::HTS module or the tabix utility must be installed in your path to use this plugin.	Frequency data	File::Spec File::Basename	Stephen Kazakoff
GO Gene Ontology	A VEP plugin that retrieves Gene Ontology (GO) terms associated with transcripts (e.g. GRCh38) or their translations (e.g. GRCh37) using custom GFF annotation containing GO terms. more The custom GFF files are automatically created if the input file does not exist by querying the Ensembl core database, according to database version, species and assembly used in VEP. Note that automatic retrieval fails if using the --offline option. The GFF files containing the GO terms are saved to and loaded from the working directory by default. To change this, provide a directory path as an argument: --plugin GO,dir=${HOME}/go_terms If your GFF file has a custom name, please provide the filename directly: --plugin GO,file=${HOME}/custom_go_terms.gff.gz The GO terms can also be fetched by gene match (either gene Ensembl ID or gene symbol) instead: --plugin GO,match=gene --plugin GO,match=gene_symbol To create/use a custom GFF file, these programs must be installed in your path: * The GNU zgrep and GNU sort commands to create the GFF file. * The tabix and bgzip utilities to create and read the GFF file: check https://github.com/samtools/htslib.git for installation instructions. Alternatively, for compatibility purposes, the plugin allows the use of a remote connection to the Ensembl API by using "remote" as a parameter. This method retrieves GO terms one by one at both the transcript and translation level. This is not compatible with any other parameters: --plugin GO,remote	Phenotype data and citations	-	Ensembl
GWAS	A VEP plugin that retrieves relevant NHGRI-EBI GWAS Catalog data given the file. more This plugin supports both the curated data that is found in the download section of the NHGRI-EBI GWAS Catalog website and the summary statistics file. By default the plugin assumes the file provided is the curated file but you can pass "type=sstate" to say you want to annotate with a summary statistics file. Please cite the following publication alongside the VEP if you use this resource: https://pubmed.ncbi.nlm.nih.gov/30445434/ Pre-requisites: For curated NHGRI-EBI GWAS Catalog file - GWAS files can be downloaded from - https://www.ebi.ac.uk/gwas/api/search/downloads/alternative For summary statistics file - The plugin can process the harmonised version of the summary statistics file, which can be downloaded from the FTP site - http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics They are organised in directories related to their specific GCST id. For example - http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST000001-GCST001000/GCST000028/harmonised/17463246-GCST000028-EFO_0001360.h.tsv.gz Please keep the filename format as it is because the filename is parsed to get information. When run for the first time with either type of file, the plugin will create an indexed, processed file that has genomic locations and put it under the --dir location determined by Ensembl VEP. It might take 1~2 hour(s) to create the processed file depending on the file size. But subsequent runs will be faster as the plugin will be using the already generated processed file. Options are passed to the plugin as key=value pairs: file : (mandatory) Path to GWAS curated or summary statistics file type : type of the file. Valid values are "curated" and "sstate".	Phenotype data and citations	Storable qw(dclone) File::Basename	Ensembl
HGVSIntronOffset	A VEP plugin for the Ensembl Variant Effect Predictor (VEP) that returns HGVS intron start and end offsets. To be used with --hgvs option.	HGVS	-	Stephen Kazakoff
IntAct	A VEP plugin that retrieves molecular interaction data for variants as reported by the IntAct database. more Please cite the IntAct publication alongside the VEP if you use this resource: https://pubmed.ncbi.nlm.nih.gov/24234451/ Pre-requisites: 1) IntAct files can be downloaded from - https://ftp.ebi.ac.uk/pub/databases/intact/current/various 2) The genomic location mapped file needs to be tabix indexed. You can do this using the following commands - a) filter, sort and then zip grep -v -e '^$' -e '^[#\-]' mutation_gc_map.txt \| sed '1s/.*/#&/' \| awk -F "\t" 'BEGIN { OFS="\t"} {if ($2 > $3) {a=$2; $2=$3; $3=a}; print $0 }' \| sort -k1,1 -k2,2n -k3,3n \| bgzip > mutation_gc_map.txt.gz b) create tabix indexed file - tabix -s 1 -b 2 -e 3 -f mutation_gc_map.txt.gz 3) As you have already noticed, the tabix utility must be installed in your path to use this plugin. Options are passed to the plugin as key=value pairs: mapping_file : (mandatory) Path to tabix-indexed genomic location mapped file mutation_file : (mandatory) Path to IntAct data file By default the output will always contain feature_type and interaction_ac from the IntAct data file. 
You can also add more fields using the following options - feature_ac : Set value to 1 to include Feature AC in the output feature_short_label : Set value to 1 to include Feature short label in the output feature_annotation : Set value to 1 to include Feature annotation in the output ap_ac : Set value to 1 to include Affected protein AC in the output interaction_participants : Set value to 1 to include Interaction participants in the output pmid : Set value to 1 to include PubMedID in the output There are also two other options for customizing the output - all : Set value to 1 to include all the fields minimal : Set value to 1 to overwrite default behavior and include only interaction_ac in the output by default See what these options mean - https://www.ebi.ac.uk/intact/download/datasets#mutations Note that the interaction accession can be used to link to full details on the interaction website. For example, where the VEP output reports an interaction_ac of EBI-12501485, the URL would be : https://www.ebi.ac.uk/intact/details/interaction/EBI-12501485	Functional effect	-	Ensembl
LD Linkage Disequilibrium	A VEP plugin that finds variants in linkage disequilibrium with any overlapping existing variants from the Ensembl variation databases. more You can configure the population used to calculate the r2 value, and the r2 cutoff used by passing arguments to the plugin via the VEP command line (separated by commas). This plugin adds a single new entry to the Extra column with a comma-separated list of linked variant IDs and the associated r2 values: LinkedVariants=rs123:0.879,rs234:0.943 If no arguments are supplied, the default population used is the CEU sample from the 1000 Genomes Project phase 3, and the default r2 cutoff used is 0.8. WARNING: Calculating LD is a relatively slow procedure, so this will slow VEP down considerably when running on large numbers of variants. Consider running vep followed by filter_vep to get a smaller input set: ./vep -i input.vcf -cache -vcf -o input_vep.vcf ./filter_vep -i input_vep.vcf -filter "Consequence is missense_variant" > input_vep_filtered.vcf ./vep -i input_vep_filtered.vcf -cache -plugin LD	Variant data	-	Ensembl
LocalID	The LocalID plugin allows you to use variant IDs as input without making a database connection. more Requires sqlite3. A local sqlite3 database is used to look up variant IDs; this is generated either from Ensembl's public database (very slow, but includes synonyms), or from a VEP cache file (faster, excludes synonyms). NB this plugin is NOT compatible with the ensembl-tools variant_effect_predictor.pl version of VEP.	Look up	-	Ensembl
LOEUF	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that adds the LOEUF scores to VEP output. LOEUF stands for the "loss-of-function observed/expected upper bound fraction." more The score can be added matching by either transcript or gene. When matched by gene: If multiple transcripts are available for a gene, the most severe score is reported. NB: The plugin currently does not add the score for downstream_gene_variant and upstream_gene_variant Please cite the LOEUF publication alongside the VEP if you use this resource: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7334197/ LOEUF scores can be downloaded from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7334197/bin/41586_2020_2308_MOESM4_ESM.zip For GRCh37: These files can be tabix-processed by: Unzip 41586_2020_2308_MOESM4_ESM.zip cd supplement zcat supplementary_dataset_11_full_constraint_metrics.tsv.gz \| (head -n 1 && tail -n +2 \| sort -t$'\t' -k 76,76 -k 77,77n ) > loeuf_temp.tsv sed '1s/./#&/' loeuf_temp.tsv > loeuf_dataset.tsv bgzip loeuf_dataset.tsv tabix -f -s 76 -b 77 -e 78 loeuf_dataset.tsv.gz For GRCh38: The LOEUF scores are available for assembly GRCh37, to be able to run the plugin for GRCh38 please remap the regions from file supplementary_dataset_11_full_constraint_metrics.tsv After the remapping the file can be tabix-processed by: cat supplementary_dataset_11_full_constraint_metrics_grch38.tsv \| (head -n 1 && tail -n +2 \| sort -t$'\t' -k 76,76 -k 77,77n ) > loeuf_grch38_temp.tsv sed '1s/./#&/' loeuf_grch38_temp.tsv > loeuf_dataset_grch38.tsv bgzip loeuf_dataset_grch38.tsv tabix -f -s 76 -b 77 -e 78 loeuf_dataset_grch38.tsv.gz The tabix utility must be installed in your path to use this plugin.	Pathogenicity predictions	Scalar::Util qw(looks_like_number)	Ensembl
LoFtool Loss-of-function	Add LoFtool scores to the VEP output. more LoFtool provides a rank of genic intolerance and consequent susceptibility to disease based on the ratio of Loss-of-function (LoF) to synonymous mutations for each gene in 60,706 individuals from ExAC, adjusting for the gene de novo mutation rate and evolutionary protein conservation. The lower the LoFtool gene score percentile, the more intolerant the gene is to functional variation. For more details please see (Fadista J et al. 2017), PMID:27563026. The authors would like to thank the Exome Aggregation Consortium and the groups that provided exome variant data for comparison. A full list of contributing groups can be found at http://exac.broadinstitute.org/about. The LoFtool_scores.txt file is found alongside the plugin in the VEP_plugins GitHub repo. To use another scores file, add it as a parameter i.e. ./vep -i variants.vcf --plugin LoFtool,scores_file.txt	Pathogenicity predictions	DBI	Ensembl
LOVD Leiden Open Variation Database	A VEP plugin that retrieves LOVD variation data from http://www.lovd.nl/. more Please be aware that LOVD is a public resource of curated variants, therefore please respect this resource and avoid intensive querying of their databases using this plugin, as it will impact the availability of this resource for others.	Variant data	LWP::UserAgent	Ensembl
Mastermind	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that uses the Mastermind Genomic Search Engine (https://www.genomenon.com/mastermind) to report variants that have clinical evidence cited in the medical literature. It is available for both GRCh37 and GRCh38. more Please cite the Mastermind publication alongside the VEP if you use this resource: https://www.frontiersin.org/article/10.3389/fgene.2020.577152 Running options: The plugin has multiple parameters, the first one is expected to be the file name path which can be followed by 3 optional flags. Default: the plugin matches the citation data with the specific mutation. Using first flag '1': returns the citations for all mutations/transcripts. Using the second flag '1': only returns the Mastermind variant identifier(s). Using the third flag '1': also returns the Mastermind URL. Output: The output includes three unique counts 'MMCNT1, MMCNT2, MMCNT3' and one identifier 'MMID3' to be used to build an URL which shows all articles from MMCNT3. 'MMCNT1' is the count of Mastermind articles with cDNA matches for a specific variant; 'MMCNT2' is the count of Mastermind articles with variants either explicitly matching at the cDNA level or given only at protein level; 'MMCNT3' is the count of Mastermind articles including other DNA-level variants resulting in the same amino acid change; 'MMID3' is the Mastermind variant identifier(s), as gene:key. Link to the Genomenon Mastermind Genomic Search Engine; To build the URL, substitute the 'gene:key' in the following link with the value from MMID3: https://mastermind.genomenon.com/detail?mutation=gene:key If the third flag is used then the built URL is returned and it's identified by 'URL'. 
More information can be found at: https://www.genomenon.com/cvr/ The following steps are necessary before running this plugin: Download and Registry (free): https://www.genomenon.com/cvr/ GRCh37 VCF: unzip mastermind_cited_variants_reference-XXXX.XX.XX-grch37-vcf.zip bgzip mastermind_cited_variants_reference-XXXX.XX.XX-GRCh37-vcf tabix -p vcf mastermind_cited_variants_reference-XXXX.XX.XX.GRCh37-vcf.gz GRCh38 VCF: unzip mastermind_cited_variants_reference-XXXX.XX.XX-grch38-vcf.zip bgzip mastermind_cited_variants_reference-XXXX.XX.XX-GRCh38-vcf tabix -p vcf mastermind_cited_variants_reference-XXXX.XX.XX.GRCh38-vcf.gz The plugin can then be run as default: ./vep -i variations.vcf --plugin Mastermind,file=/path/to/mastermind_cited_variants_reference-XXXX.XX.XX.GRChXX-vcf.gz or with an option to not filter by mutations (first flag): ./vep -i variations.vcf --plugin Mastermind,file=/path/to/mastermind_cited_variants_reference-XXXX.XX.XX.GRChXX-vcf.gz,mutations=1 or with an option to only return 'MMID3' e.g. the Mastermind variant identifier as gene:key (second flag): ./vep -i variations.vcf --plugin Mastermind,file=/path/to/mastermind_cited_variants_reference-XXXX.XX.XX.GRChXX-vcf.gz,mutations=0,var_iden=1 or with an option to also return the Mastermind URL (third flag): ./vep -i variations.vcf --plugin Mastermind,file=/path/to/mastermind_cited_variants_reference-XXXX.XX.XX.GRChXX-vcf.gz,mutations=0,var_iden=0,url=1 Note: when running VEP in offline mode Mastermind requires a fasta file (--fasta)	Phenotype data and citations	-	Ensembl
MaveDB	A VEP plugin that retrieves data from MaveDB (https://www.mavedb.org), a database that contains multiplex assays of variant effect, including deep mutational scans and massively parallel reporter assays. more To run the MaveDB plugin, please download the following file with MaveDB data mapped to variants: https://ftp.ensembl.org/pub/current_variation/MaveDB/MaveDB_variants.tsv.gz Options are passed to the plugin as key=value pairs: file : (mandatory) Tabix-indexed MaveDB file cols : Colon-separated columns to print from MaveDB files; if set to 'all', all columns are printed (default: 'urn:score:nt:pro') single_aminoacid_changes : Return matches for single aminoacid changes only; if disabled, return all matches associated with a genetic variant (default: 1) transcript_match : Return results only if (Ensembl or RefSeq) transcript identifiers match (default: 1) Please cite the MaveDB publication alongside the VEP if you use this resource: https://doi.org/10.1186/s13059-019-1845-6 The tabix utility must be installed in your path to use this plugin.	Functional effect	Bio::SeqUtils File::Basename	Ensembl
MaxEntScan	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that runs MaxEntScan (http://hollywood.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html) to get splice site predictions. more The plugin copies most of the code verbatim from the score5.pl and score3.pl scripts provided in the MaxEntScan download. To run the plugin you must get and unpack the archive from http://hollywood.mit.edu/burgelab/maxent/download/; the path to this unpacked directory is then the param you pass to the --plugin flag. The plugin executes the logic from one of the scripts depending on which splice region the variant overlaps: score5.pl : last 3 bases of exon --> first 6 bases of intron score3.pl : last 20 bases of intron --> first 3 bases of exon The plugin reports the reference, alternate and difference (REF - ALT) maximum entropy scores. If 'SWA' is specified as a command-line argument, a sliding window algorithm is applied to subsequences containing the reference and alternate alleles to identify k-mers with the highest donor and acceptor splice site scores. To assess the impact of variants, reference comparison scores are also provided. For SNVs, the comparison scores are derived from sequence in the same frame as the highest scoring k-mers containing the alternate allele. For all other variants, the comparison scores are derived from the highest scoring k-mers containing the reference allele. The difference between the reference comparison and alternate scores (SWA_REF_COMP - SWA_ALT) are also provided. If 'NCSS' is specified as a command-line argument, scores for the nearest upstream and downstream canonical splice sites are also included. By default, only scores are reported. Add 'verbose' to the list of command-line arguments to include the sequence output associated with those scores.	Splicing predictions	Digest::MD5 qw(md5_hex)	Ensembl
MPC missense deleteriousness metric	A VEP plugin that retrieves MPC scores for variants from a tabix-indexed MPC data file. more MPC is a missense deleteriousness metric based on the analysis of genic regions depleted of missense mutations in the Exome Aggregation Consortium (ExAC) data. The MPC score is the product of work by Kaitlin Samocha (ks20@sanger.ac.uk). Publication currently in pre-print: Samocha et al bioRxiv 2017 (TBD) The MPC score file is available to download from: https://ftp.broadinstitute.org/pub/ExAC_release/release1/regional_missense_constraint/ The data are currently mapped to GRCh37 only. Not all transcripts are included; see README in the above directory for exclusion criteria.	Pathogenicity predictions	-	Ensembl
MTR Missense Tolerance Ratio	A VEP plugin that retrieves Missense Tolerance Ratio (MTR) scores for variants from a tabix-indexed flat file. more MTR scores quantify the amount of purifying selection acting specifically on missense variants in a given window of protein-coding sequence. It is estimated across a sliding window of 31 codons and uses observed standing variation data from the WES component of the Exome Aggregation Consortium Database (ExAC), version 2.0 (http://gnomad.broadinstitute.org). Please cite the MTR publication alongside the VEP if you use this resource: http://genome.cshlp.org/content/27/10/1715 The Bio::DB::HTS perl library or tabix utility must be installed in your path to use this plugin. MTR flat files can be downloaded from http://biosig.unimelb.edu.au/mtr-viewer/downloads The following steps are necessary before running the plugin gzip -d mtrflatfile_2.0.txt.gz # to unzip the text file cat mtrflatfile_2.0.txt \| tr " " "\t" > mtrflatfile_2.00.tsv # to change the file to a tabbed delimited file sed '1s/.*/#&/' mtrflatfile_2.00.tsv > mtrflatfile_2.0.tsv # to add # to the first line of the file bgzip mtrflatfile_2.0.tsv tabix -f -s 1 -b 2 -e 2 mtrflatfile_2.0.tsv.gz NB: Data are available for GRCh37 only	Pathogenicity predictions	-	Slave Petrovski Michael Silk
mutfunc	A VEP plugin that retrieves data from mutfunc db predicting destabilization of protein structure, interaction interface, and motif. more Please cite the mutfunc publication alongside the VEP if you use this resource: http://msb.embopress.org/content/14/12/e8430 Pre-requisites: 1) The data file. mutfunc SQLite db can be downloaded from - https://ftp.ensembl.org/pub/current_variation/mutfunc/mutfunc_data.db Options are passed to the plugin as key=value pairs: By default all the fields (motif, int, mod, and exp) are added in the output. But if you want to have some selected fields and not all of them just select the relevant options. The default behavior will then go away outputting only the selected fields. db : (mandatory) Path to SQLite database containing data for other analysis. motif : Select this option to have mutfunc motif analysis in the output int : Select this option to have mutfunc protein interaction analysis in the output mod : Select this option to have mutfunc protein structure analysis in the output exp : Select this option to have mutfunc protein structure (experimental) analysis in the output extended : By default mutfunc outputs the most significant field for any analysis. Select this option to get more verbose output.	Protein annotation	List::MoreUtils qw(first_index) Compress::Zlib Digest::MD5 qw(md5_hex) DBI	Ensembl
NearestExonJB	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that finds the nearest exon junction boundary to a coding sequence variant. More than one boundary may be reported if the boundaries are equidistant. more The plugin will report the Ensembl identifier of the exon, the distance to the exon boundary, the boundary type (start or end of exon) and the total length in nucleotides of the exon. Various parameters can be altered by passing them to the plugin command: - max_range : maximum search range in bp (default: 10000) Parameters are passed e.g.: --plugin NearestExonJB,max_range=50000	Nearby features	-	Ensembl
NearestGene	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that finds the nearest gene(s) to a non-genic variant. More than one gene may be reported if the genes overlap the variant or if genes are equidistant. more Various parameters can be altered by passing them to the plugin command: - limit : limit the number of genes returned (default: 1) - range : initial search range in bp (default: 1000) - max_range : maximum search range in bp (default: 10000) Parameters are passed e.g.: --plugin NearestGene,limit=3,max_range=50000 This plugin requires a database connection. It cannot be run with VEP in offline mode i.e. using the --offline flag.	Nearby features	-	Ensembl
neXtProt	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that retrieves data for missense and stop gain variants from neXtProt, which is a comprehensive human-centric discovery platform that offers integration of and navigation through protein-related data for example, variant information, localization and interactions (https://www.nextprot.org/). more Please cite the neXtProt publication alongside the VEP if you use this resource: https://doi.org/10.1093/nar/gkz995 This plugin is only suitable for small sets of variants as an additional individual remote API query is run for each variant. The neXtProt_headers.txt file is a requirement for running this plugin and is found alongside the plugin in the VEP_plugins GitHub repository. The file contains the RDF entities extracted from https://snorql.nextprot.org/ Running options: (Default) the data retrieved by default is the MatureProtein, NucleotidePhosphateBindingRegion, Variant, MiscellaneousRegion, TopologicalDomain and InteractingRegion. The plugin can also be run with other options to retrieve other data than the default. Options are passed to the plugin as key=value pairs: max_set : Set value to 1 to return all available protein-related data (includes the default data) return_values : The set of data to be returned with different data separated by '&'. Use file 'neXtProt_headers.txt' to check which data (labels) are available. Example: --plugin neXtProt,return_values='Domain&InteractingRegion' url : Set value to 1 to include the URL to link to the neXtProt entry. all_labels : Set value to 1 to include all labels, even if data is not available. position : Set value to 1 to include the start and end position in the protein. * note: 'max_set' and 'return_values' cannot be used simultaneously. Output: By default, the plugin only returns data that is available. 
Example (default behaviour): neXtProt_MatureProtein=Rho guanine nucleotide exchange factor 10 The option 'all_labels' returns a consistent set of the requested fields, using "-" where values are not available. Same example as above: neXtProt_MatureProtein=Rho guanine nucleotide exchange factor 10; neXtProt_InteractingRegion=-;neXtProt_NucleotidePhosphateBindingRegion=-;neXtProt_Variant=-; neXtProt_MiscellaneousRegion=-;neXtProt_TopologicalDomain=-; Of note, multiple values can be returned for the same label. In this case, the values will be separated by '\|' for tab and txt format, and '&' for VCF format. The plugin can then be run as default: ./vep -i variations.vcf --plugin neXtProt or to return only the data specified by the user: ./vep -i variations.vcf --plugin neXtProt,return_values='Domain&InteractingRegion'	Protein data	JSON::XS	Ensembl
NMD	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that predicts if a variant allows the transcript to escape nonsense-mediated mRNA decay based on certain rules. more The rules are : 1. The variant location falls in the last exon of the transcript. vvvv ES...EE..I.ES...EE.I.ES....EE.I.ES....EE (ES= exon_start,EE = exon_end, I = intron, v = variant location) 2. The variant location falls 50 bases upstream of the penultimate (second to the last ) exon. vvv ES...EE..I.ES...EE.I.ES....EE.I.ES....EE (ES= exon_start,EE = exon_end, I = intron, v = variant location) 3. The variant falls in the first 100 coding bases in the transcript. vvv ..ES...EE..I.ES...EE.I.ES....EE.I.ES....EE (ES= exon_start,EE = exon_end, I = intron, v = variant location) 4. If the variant is in an intronless transcript, meaning only one exon exists in the transcript. The additional term NMD-escaping variant (nonsense-mediated mRNA decay escaping variants) will be added if the variant matches any of the rules. REFERENCES : Identifying Genes Whose Mutant Transcripts Cause Dominant Disease Traits by Potential Gain-of-Function Alleles (Coban-Akdemir, 2018) The rules and impact of nonsense-mediated mRNA decay in human cancers (Lindeboom, 2016)	Transcript annotation	-	Ensembl
OpenTargets	A VEP plugin that integrates data from Open Targets Genetics (https://genetics.opentargets.org), a tool that highlights variant-centric statistical evidence to allow both prioritisation of candidate causal variants at trait-associated loci and identification of potential drug targets. more Data from Open Targets Genetics includes locus-to-gene (L2G) scores to predict causal genes at GWAS loci. The tabix utility must be installed in your path to use this plugin. The Open Targets Genetics file and respective index (TBI) file can be downloaded from: https://ftp.ebi.ac.uk/pub/databases/opentargets/genetics/latest/OTGenetics_VEP Options are passed to the plugin as key=value pairs: file : (mandatory) Tabix-indexed file from Open Targets Genetics cols : (optional) Colon-separated list of columns to return from the plugin file (default: "l2g:geneId"); use 'all' to print all data Please cite the Open Targets Genetics publication alongside the VEP if you use this resource: https://doi.org/10.1093/nar/gkaa840	Variant data	Bio::SeqUtils File::Basename	Ensembl
Phenotypes	A VEP plugin that retrieves overlapping phenotype information. more On the first run for each new version/species/assembly will download a GFF-format dump to ~/.vep/Plugins/ Ensembl provides phenotype annotations mapped to a number of genomic feature types, including genes, variants and QTLs. This plugin is best used with JSON output format; the output will be more verbose and include all available phenotype annotation data and metadata. For other output formats, only a concatenated list of phenotype description strings is returned. Several parameters can be set using a key=value system: dir : provide a dir path, where either to create anew the species specific file from the download or to look for an existing file file : provide a file path, either to create anew from the download or to point to an existing file exclude_sources: exclude sources of phenotype information. By default HGMD and COSMIC annotations are excluded. See http://www.ensembl.org/info/genome/variation/phenotype/sources_phenotype_documentation.html Separate multiple values with '&' include_sources: force include sources, as exclude_sources exclude_types : exclude types of features. By default StructuralVariation and SupportingStructuralVariation annotations are excluded due to their size. Separate multiple values with '&'. Valid types: Gene, Variation, QTL, StructuralVariation, SupportingStructuralVariation, RegulatoryFeature include_types : force include types, as exclude_types expand_right : sets cache size in bp. By default annotations 100000bp (100kb) downstream of the initial lookup are cached phenotype_feature : report the specific gene or variation the phenotype is linked to, this can be an overlapping gene or structural variation, and the source of the annotation (default 0) Example: --plugin Phenotypes,file=${HOME}/phenotypes.gff.gz,include_types=Gene --plugin Phenotypes,dir=${HOME},include_types=Gene	Phenotype data and citations	-	Ensembl
pLI	A VEP plugin that adds the probability of a gene being loss-of-function intolerant (pLI) to the VEP output. more Lek et al. (2016) estimated pLI using the expectation-maximization (EM) algorithm and data from 60,706 individuals from ExAC (http://exac.broadinstitute.org). The closer pLI is to 1, the more likely the gene is loss-of-function (LoF) intolerant. Note: the pLI was calculated using a representative transcript and is reported by gene in the plugin. The data for the plugin is provided by Kaitlin Samocha and Daniel MacArthur. See https://www.ncbi.nlm.nih.gov/pubmed/27535533 for a description of the dataset and analysis. The pLI_values.txt file is found alongside the plugin in the VEP_plugins GitHub repository. The file contains the fields gene and pLI extracted from the file at https://ftp.broadinstitute.org/pub/ExAC_release/release0.3/functional_gene_constraint/ fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt From this file, extract gene or transcript pLI scores: To extract gene scores : awk '{print $2, $20 }' fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt > plI_gene.txt NB: The gene scores file can also be found in the VEP_plugins directory. To extract transcript scores: awk '{print $1, $20 }' fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt > plI_transcript.txt NB: Using this file, No transcript score will be returned. To use another values file, add it as a parameter i.e. ./vep -i variants.vcf --plugin pLI,values_file.txt ./vep -i variants.vcf --plugin pLI,values_file.txt,transcript # to check for the transcript score.	Pathogenicity predictions	List::MoreUtils qw/zip/ DBI	Ensembl
PON_P2	This plugin for Ensembl Variant Effect Predictor (VEP) computes the predictions of PON-P2 for amino acid substitutions in human proteins. PON-P2 is developed and maintained by Protein Structure and Bioinformatics Group at Lund University and is available at http://structure.bmc.lu.se/PON-P2/. more There are two ways to run the plugin: 1) To compute the predictions from the PON-P2 API please use the following options: Option 1: python script 'ponp2.py'* Option 2: the reference genome. Acceptable values are: hg37, hg38 --plugin PON_P2,pyscript=/path/to/python/script/ponp2.py,hg=hg37 * To run this mode, you will require a python script and its dependencies (Python, python suds). The python file can be downloaded from http://structure.bmc.lu.se/PON-P2/vep.html/ and the complete path to this file must be supplied while using this plugin. 2) To fetch the predictions from a file containing pre-calculated predictions for somatic variations please use the following option (only available for GRCh37): file: COSMIC text file with pre-calculated predictions downloaded from http://structure.bmc.lu.se/PON-P2/cancer30.html/ The following steps are necessary before using the file: (head -n 1 COSMIC.txt && tail -n +2 COSMIC.txt \| sort -t $'\t' -k1,1 -k2,2n) > cosmic_sorted.txt sed -i 's/Chromosome/#Chromosome/' cosmic_sorted.txt bgzip cosmic_sorted.txt tabix -s 1 -b 2 -e 2 cosmic_sorted.txt.gz --plugin PON_P2,file=path/to/cosmic_sorted.txt.gz If you use this data, please cite the following publication Niroula, A., Vihinen, M. Harmful somatic amino acid substitutions affect key pathways in cancers. BMC Med Genomics 8, 53 (2015). https://doi.org/10.1186/s12920-015-0125-x	Pathogenicity predictions	-	Abhishek Niroula Mauno Vihinen
PostGAP	A VEP plugin that retrieves data for variants from a tabix-indexed PostGAP file (1-based file). more Please refer to the PostGAP github and wiki for more information: https://github.com/Ensembl/postgap https://github.com/Ensembl/postgap/wiki https://github.com/Ensembl/postgap/wiki/algorithm-pseudo-code The Bio::DB::HTS perl library or tabix utility must be installed in your path to use this plugin. The PostGAP data file can be downloaded from https://storage.googleapis.com/postgap-data. The file must be processed and indexed by tabix before use by this plugin. PostGAP has coordinates for both GRCh38 and GRCh37; the file must be processed differently according to the assembly you use. wget https://storage.googleapis.com/postgap-data/postgap.txt.gz gunzip postgap.txt.gz # GRCh38 (grep ^"ld_snp_rsID" postgap.txt; grep -v ^"ld_snp_rsID" postgap.txt \| sort -k4,4 -k5,5n ) \| bgzip > postgap_GRCh38.txt.gz tabix -s 4 -b 5 -e 5 -c l postgap_GRCh38.txt.gz # GRCh37 (grep ^"ld_snp_rsID" postgap.txt; grep -v ^"ld_snp_rsID" postgap.txt \| sort -k2,2 -k3,3n ) \| bgzip > postgap_GRCh37.txt.gz tabix -s 2 -b 3 -e 3 -c l postgap_GRCh37.txt.gz Note that in the last command we tell tabix that the header line starts with "l"; this may change to the default of "#" in future versions of PostGAP. When running the plugin by default 'disease_efo_id', 'disease_name', 'gene_id' and 'score' information is returned e.g. --plugin POSTGAP,/path/to/PostGap.gz You may include all columns with ALL; this fetches a large amount of data per variant!: --plugin POSTGAP,/path/to/PostGap.gz,ALL You may want to select only a specific subset of additional information to be reported, you can do this by specifying the columns as parameters to the plugin e.g. --plugin POSTGAP,/path/to/PostGap.gz,gwas_pmid,gwas_size If a requested column is not found, the error message will report the complete list of available columns in the POSTGAP file. 
For a brief description of the available information please refer to the 'How do I use POSTGAP output?' section in the POSTGAP wiki. Tabix also allows the data file to be hosted on a remote server. This plugin is fully compatible with such a setup - simply use the URL of the remote file: --plugin PostGAP,http://my.files.com/postgap.txt.gz Note that gene sequences referred to in PostGAP may be out of sync with those in the latest release of Ensembl; this may lead to discrepancies with scores retrieved from other sources.	Phenotype data and citations	-	Ensembl
PrimateAI	The PrimateAI VEP plugin is designed to retrieve clinical impact scores of variants, as described in https://www.nature.com/articles/s41588-018-0167-z. Please consider citing the paper if using this plugin. more In brief, common missense mutations in non-human primate species are usually benign in humans. Thousands of common variants from six non-human primate species were used to train a deep neural network to identify pathogenic mutations in humans with a rare disease. This plugin uses files generated by the PrimateAI software, which is available from https://github.com/Illumina/PrimateAI. The files containing predicted pathogenicity scores can be downloaded from https://basespace.illumina.com/s/yYGFdGih1rXL (a free BaseSpace account may be required): PrimateAI_scores_v0.2.tsv.gz (for GRCh37/hg19) PrimateAI_scores_v0.2_hg38.tsv.gz (for GRCh38/hg38) Before running the plugin for the first time, the following steps must be taken to format the downloaded files: 1. Unzip the score files 2. Add '#' in front of the column description line 3. Remove any empty lines. 4. Sort the file by chromosome and position 5. Compress the file in .bgz format 6. Create tabix index (requires tabix to be installed). Command line examples for formatting input files: gunzip -cf PrimateAI_scores_v0.2.tsv.gz \| sed '12s/./#&/' \| sed '/^$/d' \| awk 'NR<12{print $0;next}{print $0 \| "sort -k1,1 -k 2,2n -V"}' \| bgzip > PrimateAI_scores_v0.2_GRCh37_sorted.tsv.bgz tabix -s 1 -b 2 -e 2 PrimateAI_scores_v0.2_GRCh37_sorted.tsv.bgz gunzip -cf PrimateAI_scores_v0.2_hg38.tsv.gz \| sed '12s/./#&/' \| sed '/^$/d' \| awk 'NR<12{print $0;next}{print $0 \| "sort -k1,1 -k 2,2n -V"}' \| bgzip > PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz tabix -s 1 -b 2 -e 2 PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz	Pathogenicity predictions	-	Ensembl
ProteinSeqs	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that prints out the reference and mutated protein sequences of any proteins found with non-synonymous mutations in the input file. more You should supply the name of file where you want to store the reference protein sequences as the first argument, and a file to store the mutated sequences as the second argument. Note that, for simplicity, where stop codons are gained the plugin simply substitutes a '*' into the sequence and does not truncate the protein. Where a stop codon is lost any new amino acids encoded by the mutation are appended to the sequence, but the plugin does not attempt to translate until the next downstream stop codon. Also, the protein sequence resulting from each mutation is printed separately, no attempt is made to apply multiple mutations to the same protein.	Sequence	-	Ensembl
ReferenceQuality	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that reports on the quality of the reference genome using GRC data at the location of your variants. More information can be found at: https://www.ncbi.nlm.nih.gov/grc/human/issues more The following steps are necessary before running this plugin: GRCh38: wget https://ftp.ncbi.nlm.nih.gov/pub/grc/human/GRC/GRCh38/MISC/annotated_clone_assembly_problems_GCF_000001405.38.gff3 wget https://ftp.ncbi.nlm.nih.gov/pub/grc/human/GRC/Issue_Mapping/GRCh38.p12_issues.gff3 cat annotated_clone_assembly_problems_GCF_000001405.38.gff3 GRCh38.p12_issues.gff3 > GRCh38_quality_mergedfile.gff3 sort -k 1,1 -k 4,4n -k 5,5n GRCh38_quality_mergedfile.gff3 > sorted_GRCh38_quality_mergedfile.gff3 bgzip sorted_GRCh38_quality_mergedfile.gff3 tabix -p gff sorted_GRCh38_quality_mergedfile.gff3.gz The plugin can then be run with: ./vep -i variations.vcf --plugin ReferenceQuality,sorted_GRCh38_quality_mergedfile.gff3.gz GRCh37: wget https://ftp.ncbi.nlm.nih.gov/pub/grc/human/GRC/GRCh37/MISC/annotated_clone_assembly_problems_GCF_000001405.25.gff3 wget https://ftp.ncbi.nlm.nih.gov/pub/grc/human/GRC/Issue_Mapping/GRCh37.p13_issues.gff3 cat annotated_clone_assembly_problems_GCF_000001405.25.gff3 GRCh37.p13_issues.gff3 > GRCh37_quality_mergedfile.gff3 sort -k 1,1 -k 4,4n -k 5,5n GRCh37_quality_mergedfile.gff3 > sorted_GRCh37_quality_mergedfile.gff3 bgzip sorted_GRCh37_quality_mergedfile.gff3 tabix -p gff sorted_GRCh37_quality_mergedfile.gff3.gz The plugin can then be run with: ./vep -i variations.vcf --plugin ReferenceQuality,sorted_GRCh37_quality_mergedfile.gff3.gz The tabix utility must be installed in your path to use this plugin.	Sequence	-	Ensembl
REVEL	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that adds the REVEL score for missense variants to VEP output. more Please cite the REVEL publication alongside the VEP if you use this resource: https://www.ncbi.nlm.nih.gov/pubmed/27666373 Running options: If available, the plugin will match the scores by transcript id (default). Using the flag '1' the plugin will not try to match by transcript id. REVEL scores can be downloaded from: https://sites.google.com/site/revelgenomics/downloads The plugin supports several REVEL file versions: - REVEL file version Dec 2017, which has 7 columns and only GRCh37 coordinates - REVEL file version Feb 2020, which has 8 columns with GRCh37 and GRCh38 coordinates - REVEL file version May 2021, which has 9 columns with GRCh37 and GRCh38 coordinates and a new column with transcript ids These files can be tabix-processed by: unzip revel-v1.3_all_chromosomes.zip cat revel_with_transcript_ids \| tr "," "\t" > tabbed_revel.tsv sed '1s/.*/#&/' tabbed_revel.tsv > new_tabbed_revel.tsv bgzip new_tabbed_revel.tsv for GRCh37: tabix -f -s 1 -b 2 -e 2 new_tabbed_revel.tsv.gz for GRCh38: zcat new_tabbed_revel.tsv.gz \| head -n1 > h zgrep -h -v ^#chr new_tabbed_revel.tsv.gz \| awk '$3 != "." ' \| sort -k1,1 -k3,3n - \| cat h - \| bgzip -c > new_tabbed_revel_grch38.tsv.gz tabix -f -s 1 -b 3 -e 3 new_tabbed_revel_grch38.tsv.gz The plugin can then be run as default: ./vep -i variations.vcf --assembly GRCh38 --plugin REVEL,file=/path/to/revel/data.tsv.gz or with the option to not match by transcript id: ./vep -i variations.vcf --assembly GRCh38 --plugin REVEL,file=/path/to/revel/data.tsv.gz,no_match=1 Requirements: The tabix utility must be installed in your path to use this plugin. The --assembly flag is required to use this plugin.	Pathogenicity predictions	-	Ensembl
SameCodon	A VEP plugin that reports existing variants that fall in the same codon. This plugin requires a database connection and cannot be run in offline mode.	Variant data	-	Ensembl
satMutMPRA	A VEP plugin that retrieves data for variants from a tabix-indexed satMutMPRA file (1-based file). The saturation mutagenesis-based massively parallel reporter assays (satMutMPRA) measures variant effects on gene RNA expression for 21 regulatory elements (11 enhancers, 10 promoters). more The 20 disease-associated regulatory elements and one ultraconserved enhancer analysed in different cell lines are the following: - ten promoters (of TERT, LDLR, HBB, HBG, HNF4A, MSMB, PKLR, F9, FOXE1 and GP1BB) and - ten enhancers (of SORT1, ZRS, BCL11A, IRF4, IRF6, MYC (2x), RET, TCF7L2 and ZFAND3) and - one ultraconserved enhancer (UC88). Please refer to the satMutMPRA web server and Kircher M et al. (2019) paper for more information: https://mpra.gs.washington.edu/satMutMPRA/ https://www.ncbi.nlm.nih.gov/pubmed/31395865 Parameters can be set using a key=value system: file : required - a tabix indexed file of the satMutMPRA data corresponding to desired assembly. pvalue : p-value threshold (default: 0.00001) cols : colon delimited list of data types to be returned from the satMutMPRA data (default: 'Value', 'P-Value', and 'Element') incl_repl : include replicates (default: off): - full replicate for LDLR promoter (LDLR.2) and SORT1 enhancer (SORT1.2) - a reversed sequence orientation for SORT1 (SORT1-flip) - other conditions: PKLR-48h, ZRSh-13h2, TERT-GAa, TERT-GBM, TERG-GSc The Bio::DB::HTS perl library or tabix utility must be installed in your path to use this plugin. The satMutMPRA data file can be downloaded from https://mpra.gs.washington.edu/satMutMPRA/ satMutMPRA data can be downloaded for both GRCh38 and GRCh37 from the web server (https://mpra.gs.washington.edu/satMutMPRA/): 'Download' section, select 'GRCh37' or 'GRCh38' for 'Genome release' and 'Download All Elements'. The file must be processed and indexed by tabix before use by this plugin. 
# GRCh38 (grep ^Chr GRCh38_ALL.tsv; grep -v ^Chr GRCh38_ALL.tsv \| sort -k1,1 -k2,2n ) \| bgzip > satMutMPRA_GRCh38_ALL.gz tabix -s 1 -b 2 -e 2 -c C satMutMPRA_GRCh38_ALL.gz # GRCh37 (grep ^Chr GRCh37_ALL.tsv; grep -v ^Chr GRCh37_ALL.tsv \| sort -k1,1 -k2,2n ) \| bgzip > satMutMPRA_GRCh37_ALL.gz tabix -s 1 -b 2 -e 2 -c C satMutMPRA_GRCh37_ALL.gz When running the plugin by default 'Value', 'P-Value', and 'Element' information is returned e.g. --plugin satMutMPRA,file=/path/to/satMutMPRA_GRCh38_ALL.gz You may include all columns with ALL; this fetches all data per variant (e.g. Tags, DNA, RNA, Value, P-Value, Element): --plugin satMutMPRA,file=/path/to/satMutMPRA_GRCh38_ALL.gz,cols=ALL You may want to select only a specific subset of information to be reported, you can do this by specifying the specific columns as parameters to the plugin e.g. --plugin satMutMPRA,file=/path/to/satMutMPRA_GRCh38_ALL.gz,cols=Tags:DNA If a requested column is not found, the error message will report the complete list of available columns in the satMutMPRA file. For a detailed description of the available information please refer to the manuscript or online web server. Tabix also allows the data file to be hosted on a remote server. This plugin is fully compatible with such a setup - simply use the URL of the remote file: --plugin satMutMPRA,file=http://my.files.com/satMutMPRA.gz Note that gene locations referred to in satMutMPRA may be out of sync with those in the latest release of Ensembl; this may lead to discrepancies with information retrieved from other sources.	Phenotype data and citations	-	Ensembl
SingleLetterAA	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that returns a HGVSp string with single amino acid letter codes	HGVS	-	Ensembl
SpliceAI	A VEP plugin that retrieves pre-calculated annotations from SpliceAI. SpliceAI is a deep neural network, developed by Illumina, Inc., that predicts splice junctions from an arbitrary pre-mRNA transcript sequence. more Delta score of a variant, defined as the maximum of (DS_AG, DS_AL, DS_DG, DS_DL), ranges from 0 to 1 and can be interpreted as the probability of the variant being splice-altering. The author-suggested cutoffs are: * 0.2 (high recall) * 0.5 (recommended) * 0.8 (high precision) This plugin is available for both GRCh37 and GRCh38. More information can be found at: https://pypi.org/project/spliceai/ Please cite the SpliceAI publication alongside VEP if you use this resource: https://www.ncbi.nlm.nih.gov/pubmed/30661751 Running options: (Option 1) By default, this plugin appends all scores from SpliceAI files. (Option 2) In addition to the pre-calculated scores, a score cutoff between 0 and 1 can also be specified. Output: The output includes the gene symbol, delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). For tab the output contains one header 'SpliceAI_pred' with all the delta scores and positions. The format is: SYMBOL\|DS_AG\|DS_AL\|DS_DG\|DS_DL\|DP_AG\|DP_AL\|DP_DG\|DP_DL For JSON the output is a hash with the following format: "spliceai": {"DP_DL":0,"DS_AL":0,"DP_AG":0,"DS_DL":0,"SYMBOL":"X","DS_AG":0,"DP_AL":0,"DP_DG":0,"DS_DG":0} For VCF output the delta scores and positions are stored in different headers. The headers are named 'SpliceAI_pred_xx', where 'xx' is the score/position. Example: 'SpliceAI_pred_DS_AG' is the delta score for acceptor gain. Gene matching: SpliceAI can contain scores for multiple genes that overlap a variant, and VEP can also predict consequences on multiple genes for a given variant. The plugin only returns SpliceAI scores for the gene symbols that match (if any). 
If the plugin is run with option 2, the output also contains a flag: 'PASS' if delta score passes the cutoff, 'FAIL' otherwise. The following steps are necessary before running this plugin: The files with the annotations for all possible substitutions (snv), 1 base insertions and 1-4 base deletions (indel) within genes are available here: https://basespace.illumina.com/s/otSPW8hnhaZR GRCh37: tabix -p vcf spliceai_scores.raw.snv.hg37.vcf.gz tabix -p vcf spliceai_scores.raw.indel.hg37.vcf.gz GRCh38: tabix -p vcf spliceai_scores.raw.snv.hg38.vcf.gz tabix -p vcf spliceai_scores.raw.indel.hg38.vcf.gz The plugin can then be run: ./vep -i variations.vcf --plugin SpliceAI,snv=/path/to/spliceai_scores.raw.snv.hg38.vcf.gz,indel=/path/to/spliceai_scores.raw.indel.hg38.vcf.gz ./vep -i variations.vcf --plugin SpliceAI,snv=/path/to/spliceai_scores.raw.snv.hg38.vcf.gz,indel=/path/to/spliceai_scores.raw.indel.hg38.vcf.gz,cutoff=0.5	Splicing predictions	List::Util qw(max)	Ensembl
SpliceRegion	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that provides more granular predictions of splicing effects. more Three additional terms may be added: # splice_donor_5th_base_variant : variant falls in the 5th base after the splice donor junction (5' end of intron) v ...EEEEEIIIIIIIIII... (E = exon, I = intron, v = variant location) # splice_donor_region_variant : variant falls in region between 3rd and 6th base after splice junction (5' end of intron) vv vvv ...EEEEEIIIIIIIIII... # splice_polypyrimidine_tract_variant : variant falls in polypyrimidine tract at 3' end of intron, between 17 and 3 bases from the end vvvvvvvvvvvvvvv ...IIIIIIIIIIIIIIIIIIIIEEEEE...	Splicing predictions	-	Ensembl
StructuralVariantOverlap	A VEP plugin that retrieves information from overlapping structural variants. more Parameters can be set using a key=value system: file : required - a VCF file of reference data. percentage : percentage overlap between SVs (default: 80) reciprocal : calculate reciprocal overlap, options: 0 or 1. (default: 0) (overlap is expressed as % of input SV by default) cols : colon delimited list of data types to return from the INFO fields (only AF by default) same_type : 1/0 only report SV of the same type (eg deletions for deletions, off by default) distance : the distance the ends of the overlapping SVs should be within. match_type : only report reference SV which lie within or completely surround the input SV options: within, surrounding label : annotation label that will appear in the output (default: "SV_overlap") Example- input: label=mydata, output: mydata_name=refSV,mydata_PC=80,mydata_AF=0.05 Example reference data 1000 Genomes Project: https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/integrated_sv_map/ALL.wgs.mergedSV.v8.20130502.svs.genotypes.vcf.gz gnomAD: https://storage.googleapis.com/gcp-public-data--gnomad/papers/2019-sv/gnomad_v2.1_sv.sites.vcf.gz Example: ./vep -i structvariants.vcf --plugin StructuralVariantOverlap,file=gnomad_v2_sv.sites.vcf.gz	Structural variant data	-	Ensembl
SubsetVCF	A VEP plugin to retrieve overlapping records from a given VCF file. Values for POS, ID, and ALT are retrieved as well as values for any requested INFO field. Additionally, the allele number of the matching ALT is returned. more Though similar to using '--custom', this plugin returns all ALTs for a given POS, as well as all associated INFO values. By default, only VCF records with a filter value of "PASS" are returned, however this behaviour can be changed via the 'filter' option. Parameters: name: short name used as a prefix (required) file: path to tabix-indexed VCF file (required) filter: only consider variants marked as 'PASS', 1 or 0 (default, 1) fields: info fields to be returned (default, not used) '%' can delimit multiple fields '*' can be used as a wildcard Returns: <name>_POS: POS field from VCF <name>_REF: REF field from VCF (minimised) <name>_ALT: ALT field from VCF (minimised) <name>_alt_index: Index of matching variant (zero-based) <name>_<field>: List of requested info values	Variant data	Data::Dumper Storable qw(dclone)	Joseph A. Prinz
TranscriptAnnotator	A VEP plugin that annotates variant-transcript pairs based on a given file: more --plugin TranscriptAnnotator,file=${HOME}/file.tsv.gz Example of a valid tab-separated annotation file: #Chrom Pos Ref Alt Transcript SIFT_score SIFT_pred Comment 11 436154 A G NM_001347882.2 0.03 Deleterious Bad 11 1887471 C T ENST00000421485 0.86 Tolerated Good Please bgzip and tabix the file with commands such as: bgzip file.txt tabix -b2 -e2 file.txt.gz Options are passed to the plugin as key=value pairs: file: (mandatory) Tabix-indexed file to parse. Must contain variant location (chromosome, position, reference allele, alternative allele) and transcript ID as the first 5 columns. Accepted transcript IDs include those from Ensembl and RefSeq. cols: Colon-delimited list with names of the columns to append. Column names are based on the last header line. By default, all columns (except the first 5) are appended. prefix: String to prefix the name of appended columns (default: basename of the filename without extensions). Set to 0 to avoid any prefix. trim: Trim whitespaces from both ends of each column (default: 1). The tabix and bgzip utilities must be installed in your path to read the tabix-indexed annotation file: check https://github.com/samtools/htslib.git for installation instructions.	Transcript annotation	File::Basename	Ensembl
TSSDistance	A VEP plugin that calculates the distance from the transcription start site for upstream variants.	Nearby features	-	Ensembl
UTRAnnotator	A VEP plugin that annotates the effect of 5' UTR variants, especially variants creating/disrupting upstream ORFs. Available for both GRCh37 and GRCh38. more Options are passed to the plugin as key=value pairs, (defaults in parentheses): file : Path to UTRAnnotator data file. - Download from https://github.com/Ensembl/UTRannotator - Download from http://www.sorfs.org/ Citation About the role of 5'UTR variants in human genetic disease: Whiffin, N., Karczewski, K.J., Zhang, X. et al. Characterising the loss-of-function impact of 5’ untranslated region variants in 15,708 individuals. Nat Commun 11, 2523 (2020). https://doi.org/10.1038/s41467-019-10717-9 About UTRAnnotator: The original UTRAnnotator plugin was written by Xiaolei Zhang et al. and later adopted by Ensembl VEP plugins with some changes. You can find the original plugin here - https://github.com/ImperialCardioGenetics/UTRannotator Please cite the UTRannotator publication alongside the Ensembl VEP if you use this resource - Annotating high-impact 5'untranslated region variants with the UTRannotator Zhang, X., Wakeling, M.N., Ware, J.S, Whiffin, N. Bioinformatics; doi: https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btaa783/5905476	Transcript annotation	Scalar::Util qw(looks_like_number)	Xiaolei Zhang Ensembl
VARITY	This is a plugin for the Ensembl Variant Effect Predictor (VEP) that adds the pre-computed VARITY scores to predict pathogenicity of rare missense variants to VEP output. more Please cite the VARITY publication alongside the VEP if you use this resource: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8715197/ Running options : VARITY scores can be downloaded using wget http://varity.varianteffect.org/downloads/varity_all_predictions.tar.gz The files can be tabix processed by : tar -xzvf varity_all_predictions.tar.gz cat varity_all_predictions.txt \| (head -n 1 && tail -n +2 \| sort -t$'\t' -k 1,1 -k 2,2n) > varity_all_predictions_sorted.tsv sed '1s/.*/#&/' varity_all_predictions_sorted.tsv > varity_all_predictions.tsv # to add a # in the first line of the file bgzip varity_all_predictions.tsv tabix -f -s 1 -b 2 -e 2 varity_all_predictions.tsv.gz Requirements: The tabix utility must be installed in your path to use this plugin. The --assembly flag is required to use this plugin.	Pathogenicity predictions	-	Ensembl

Variant Effect Predictor Plugins

Existing plugins

How it works

Functionality

Filtering using plugins

Using plugins

Intergenic variants