Ensembl Core API Tutorial

Xref

# Ask the registry for the Slice adaptor of the human core database.
# NOTE(review): $registry is not created in this excerpt; it is assumed to be
# an already configured registry object -- confirm.
my $slice_adaptor = $registry->get_adaptor( 'Human', 'Core', 'Slice' );

# Look up the slice adaptor that serves the human core database.
my $slice_adaptor = $registry->get_adaptor( 'Human', 'Core', 'Slice' );

# Whole of chromosome X.
my $slice = $slice_adaptor->fetch_by_region( 'chromosome', 'X' );

# Whole of clone AL359765.6.
$slice = $slice_adaptor->fetch_by_region( 'clone', 'AL359765.6' );

# Whole of one scaffold.
$slice = $slice_adaptor->fetch_by_region( 'scaffold', 'KI270510.1' );

# Chromosome 20, restricted to the stretch from 1MB to 2MB (both ends
# included).
my ( $region_start, $region_end ) = ( 1e6, 2e6 );
$slice = $slice_adaptor->fetch_by_region( 'chromosome', '20',
    $region_start, $region_end );

# Fetch a slice by the stable ID of a gene (here ENSG00000099889).
# NOTE(review): the second argument (5e3) is presumably the amount of flanking
# sequence, in base pairs, added around the gene -- confirm against the
# SliceAdaptor documentation.
my $slice = $slice_adaptor->fetch_by_gene_stable_id( 'ENSG00000099889', 5e3 );

# Retrieve slices of every chromosome in the database.  @slices is declared
# here, at its first use, so that the snippet also compiles under
# "use strict".
my @slices = @{ $slice_adaptor->fetch_all('chromosome') };

# Retrieve slices of every BAC clone in the database (reuses the same array)
@slices = @{ $slice_adaptor->fetch_all('clone') };

use Bio::EnsEMBL::Utils::Slice qw(split_Slices);

# ...

# Upper bound on the size of each returned piece, and the number of base
# pairs that neighbouring pieces share.
my $max_size = 100_000;
my $overlap  = 0;

# Start from one slice per chromosome ...
my $slices = $slice_adaptor->fetch_all('chromosome');

# ... and break them into smaller 100k component slices.
$slices = split_Slices( $slices, $max_size, $overlap );

# Fetch the complete sequence of the slice and print it on a line of its own.
my $sequence = $slice->seq();
print $sequence, "\n";

# Replace it with the sub-sequence requested for positions 100 to 200.
$sequence = $slice->subseq( 100, 200 );

# coord_system() hands back a Bio::EnsEMBL::CoordSystem object; only its
# name is needed for the summary line below.
my $coord_system = $slice->coord_system();
my $coord_sys    = $coord_system->name();

# Where the slice sits: sequence region, start, end and strand.
my $seq_region = $slice->seq_region_name();
my $start      = $slice->start();
my $end        = $slice->end();
my $strand     = $slice->strand();

print "Slice: $coord_sys $seq_region $start-$end ($strand)\n";

# The Gene adaptor is obtained from the registry in the same way as the other
# adaptors in this tutorial, so that the snippet does not rely on an
# undeclared $gene_adaptor.
my $gene_adaptor = $registry->get_adaptor( 'Human', 'Core', 'Gene' );

# Ask the adaptor for all genes on the slice.  @genes is declared at first
# use so that the snippet compiles under "use strict".
my @genes = @{ $gene_adaptor->fetch_all_by_Slice($slice) };

# Another way of doing the same thing:
@genes = @{ $slice->get_all_Genes() };

# Obtain the slice adaptor, then a 20kb window on chromosome 10
my $slice_adaptor = $registry->get_adaptor( 'Human', 'Core', 'Slice' );
my $slice =
  $slice_adaptor->fetch_by_region( 'chromosome', '10', 3_770_000, 3_790_000 );

# Collect every gene found in that window
my @genes = @{ $slice->get_all_Genes() };

for my $gene (@genes) {

    # Coordinates as reported by start()/end()/strand()
    my @on_slice = ( $gene->start(), $gene->end(), $gene->strand() );
    printf( "In terms of slice: %d-%d (%+d)\n", @on_slice );

    # Coordinates as reported by the seq_region_*() accessors
    my @on_seq_region = (
        $gene->seq_region_start(),
        $gene->seq_region_end(),
        $gene->seq_region_strand()
    );
    printf( "In terms of seq_region: %d-%d (%+d)\n", @on_seq_region );
}

# Ask the feature for a slice of its own region.
# NOTE(review): $feature is assumed to have been fetched earlier -- it is not
# created in this snippet.  The variables below are declared at first use so
# that the snippet compiles under "use strict".
my $feature_slice = $feature->feature_Slice();

# Display the sequence of the feature region
print $feature_slice->seq(), "\n";

# Display the sequence of the feature region + 5000bp flanking sequence
print $feature_slice->expand( 5000, 5000 )->seq(), "\n";

# Get all genes which overlap the feature (returned as an array reference)
my $genes = $feature_slice->get_all_Genes();

my $slice_adaptor = $registry->get_adaptor( 'Human', 'Core', 'Slice' );
my $slice =
  $slice_adaptor->fetch_by_region( 'chromosome', '10', 3_770_000, 3_790_000 );

# Walk the gene -> transcript -> exon hierarchy, printing one stable ID per
# line and indenting by one tab per level.
my $genes_on_slice = $slice->get_all_Genes();
for my $gene ( @{$genes_on_slice} ) {
    print "Gene ID: ", $gene->stable_id(), "\n";

    my $transcripts = $gene->get_all_Transcripts();
    for my $transcript ( @{$transcripts} ) {
        print "\tTranscript ID: ", $transcript->stable_id(), "\n";

        my $exons = $transcript->get_all_Exons();
        for my $exon ( @{$exons} ) {
            print "\t\tExon ID: ", $exon->stable_id(), "\n";
        }
    }
}

# cDNA of the transcript: spliced_seq() returns the exon sequences joined
# together.
print "cDNA: ", $transcript->spliced_seq(), "\n";

# Coding sequence only: translateable_seq() returns just the CDS.
print "CDS: ", $transcript->translateable_seq(), "\n";

# The UTRs come from five_prime_utr() and three_prime_utr(); either may be
# undefined, in which case "None" is printed instead of a sequence.
my $fiv_utr = $transcript->five_prime_utr();
my $thr_utr = $transcript->three_prime_utr();

if ( defined $fiv_utr ) {
    print "5' UTR: ", $fiv_utr->seq(), "\n";
}
else {
    print "5' UTR: ", "None", "\n";
}

if ( defined $thr_utr ) {
    print "3' UTR: ", $thr_utr->seq(), "\n";
}
else {
    print "3' UTR: ", "None", "\n";
}

# translate() gives the protein sequence object, or undef when the
# transcript is non-coding.
my $protein = $transcript->translate();

if ( defined $protein ) {
    print "Translation: ", $protein->seq(), "\n";
}
else {
    print "Translation: ", "None", "\n";
}

# Print the display ID of the gene on a line of its own.
# NOTE(review): $gene is assumed to have been fetched earlier -- it is not
# created in this snippet.
print $gene->display_id(), "\n";

my $stable_id = 'ENST00000528762';

# Fetch the transcript with that stable ID through the Transcript adaptor.
my $transcript_adaptor =
  $registry->get_adaptor( 'Human', 'Core', 'Transcript' );
my $transcript = $transcript_adaptor->fetch_by_stable_id($stable_id);

# translation() gives the translation object (which has its own stable ID),
# translate() gives the object holding the protein sequence.
print $transcript->translation()->stable_id(), "\n";
print $transcript->translate()->seq(),         "\n";

# A translation can be followed back to its transcript.
print $transcript->translation()->transcript()->stable_id(), "\n";

# Declared at first use so that the snippet compiles under "use strict".
my $translation = $transcript->translation();

# Print one line per protein feature.  shift() removes each feature from the
# returned list as it is processed, so the list is empty when the loop ends.
my $pfeatures = $translation->get_all_ProteinFeatures();
while ( my $pfeature = shift @{$pfeatures} ) {
    my $logic_name = $pfeature->analysis()->logic_name();

    printf(
        "%d-%d %s %s %s\n",
        $pfeature->start(), $pfeature->end(), $logic_name,
        $pfeature->interpro_ac(),
        $pfeature->idesc()
    );
}

# NOTE(review): the 'Seg' argument presumably restricts the result to
# features produced by the analysis with that logic name -- confirm.
my $seg_features    = $translation->get_all_ProteinFeatures('Seg');
my $domain_features = $translation->get_all_DomainFeatures();

my $gene_adaptor = $registry->get_adaptor( 'Human', 'Core', 'Gene' );

# Look up the human gene whose display label is 'COG6'
my $gene = $gene_adaptor->fetch_by_display_label('COG6');

print "GENE ", $gene->stable_id(), "\n";

# One indented line per database entry attached to the gene: its display ID
# followed by the name of the database it comes from.
my @dbentries = @{ $gene->get_all_DBEntries() };

for my $dbe (@dbentries) {
    my @xref = ( $dbe->display_id(), $dbe->dbname() );
    printf "\tXREF %s (%s)\n", @xref;
}

my $gene_adaptor = $registry->get_adaptor( 'Human', 'Core', 'Gene' );

# Look up the human gene whose display label is 'COG6'
my $gene = $gene_adaptor->fetch_by_display_label('COG6');

print "GENE ", $gene->stable_id(), "\n";

# Same report as for the DB entries, but built from get_all_DBLinks().
my @dblinks = @{ $gene->get_all_DBLinks() };

for my $dbl (@dblinks) {
    my @xref = ( $dbl->display_id(), $dbl->dbname() );
    printf "\tXREF %s (%s)\n", @xref;
}

# Fetch the DB links of the gene, passing 'Uniprot%' as a filter.
# NOTE(review): the trailing '%' looks like an SQL-style wildcard applied to
# the external database name -- confirm against the get_all_DBLinks docs.
my @dblinks = @{ $gene->get_all_DBLinks('Uniprot%') };

# Fetch the 'Vertrna' dna-dna alignment features of the slice region
my @features = @{ $slice->get_all_DnaAlignFeatures('Vertrna') };

# Shared layout: hit name, hit start-end (hit strand), then the
# start-end (strand) of the feature itself.
my $pair_format = "%s %d-%d (%+d)\t=> %d-%d (%+d)\n";

for my $feature (@features) {

    printf(
        $pair_format,
        $feature->hseqname(), $feature->hstart(), $feature->hend(),
        $feature->hstrand(),  $feature->start(),  $feature->end(),
        $feature->strand()
    );

    print "Percent identity: ", $feature->percent_id(),   "\n";
    print "Cigar string: ",     $feature->cigar_string(), "\n";

    # Each ungapped piece of the alignment is reported in the same layout.
    my @ungapped = $feature->ungapped_features();

    print "ungapped:\n";
    for my $feature_pair (@ungapped) {
        printf(
            $pair_format,
            $feature_pair->hseqname(), $feature_pair->hstart(),
            $feature_pair->hend(),     $feature_pair->hstrand(),
            $feature_pair->start(),    $feature_pair->end(),
            $feature_pair->strand()
        );
    }
    print "\n";
}

# List every repeat feature on the slice: display ID and start-end.
my @repeats = @{ $slice->get_all_RepeatFeatures() };

for my $repeat (@repeats) {
    my @repeat_info =
      ( $repeat->display_id(), $repeat->start(), $repeat->end() );
    printf( "%s %d-%d\n", @repeat_info );
}

# Plain sequence, followed by two repeat-masked variants.
# NOTE(review): judging by the variable names, a true second argument to
# get_repeatmasked_seq() selects soft-masking -- confirm.
my $unmasked_seq = $slice->seq();

my $hardmasked_slice = $slice->get_repeatmasked_seq();
my $hardmasked_seq   = $hardmasked_slice->seq();

my $softmasked_slice = $slice->get_repeatmasked_seq( undef, 1 );
my $softmasked_seq   = $softmasked_slice->seq();

# Soft-mask sequence using TRF results only
my $tandem_masked_slice = $slice->get_repeatmasked_seq( ['TRF'], 1 );
my $tandem_masked_seq   = $tandem_masked_slice->seq();

my $marker_adaptor = $registry->get_adaptor( 'Human', 'Core', 'Marker' );

# Obtain marker by synonym (this returns a list, but there's seldom more
# than one marker in the list)
my $marker = $marker_adaptor->fetch_all_by_synonym('D9S1038E')->[0];

# An empty result list leaves $marker undefined.  Stop with a clear message
# instead of failing on the first method call below.
die "No marker found for synonym 'D9S1038E'\n" unless defined $marker;

# Display the various names associated with the same marker, each one
# prefixed with its source when a source is defined
foreach my $synonym ( @{ $marker->get_all_MarkerSynonyms() } ) {
    if ( defined $synonym->source() ) {
        print $synonym->source(), ':';
    }
    print $synonym->name(), ' ';
}
print "\n";

# Display the primer info
printf( "left primer : %s\n", $marker->left_primer() );
printf( "right primer: %s\n", $marker->right_primer() );
printf(
    "product size: %d-%d\n",
    $marker->min_primer_dist(),
    $marker->max_primer_dist()
);

# Display genetic/RH/FISH map information
print "Map locations:\n";
foreach my $map_loc ( @{ $marker->get_all_MapLocations() } ) {
    printf( "\t%s %s %s\n",
        $map_loc->map_name(), $map_loc->chromosome_name(),
        $map_loc->position() );
}

# Report where an already retrieved marker is placed: one line per marker
# feature with its sequence region and start-end.
for my $mf ( @{ $marker->get_all_MarkerFeatures() } ) {
    my @location = ( $mf->seq_region_name(), $mf->start(), $mf->end() );
    printf( "%s %d-%d\n", @location );
}

# Report all marker features found in a given region.  The list is emptied
# as it is read: every pass takes the feature at the front off the list.
my $marker_features = $slice->get_all_MarkerFeatures();
while ( my $mf = shift @{$marker_features} ) {
    my @location = (
        $mf->display_id(), $mf->seq_region_name(),
        $mf->start(),      $mf->end()
    );
    printf( "%s %s %d-%d\n", @location );
}

# Misc features of the 'encode' set on the slice; each one is taken off the
# list as it is handled and its attributes are printed as name:value.
my $encode_regions = $slice->get_all_MiscFeatures('encode');

while ( my $encode_region = shift @{$encode_regions} ) {
    for my $attribute ( @{ $encode_region->get_all_Attributes() } ) {
        printf "%s:%s\n", $attribute->name(), $attribute->value();
    }
}

my $mf_adaptor = $registry->get_adaptor( 'Human', 'Core', 'MiscFeature' );

# Misc features whose 'clone_name' attribute has the value 'RP5-60P11'.
my $clones =
  $mf_adaptor->fetch_all_by_attribute_type_value( 'clone_name', 'RP5-60P11' );

while ( my $clone = shift @{$clones} ) {

    # Location line: coordinate system and sequence region of the slice the
    # clone sits on, then the start-end of the clone itself.
    my $clone_slice = $clone->slice();
    my @location    = (
        $clone_slice->coord_system->name(),
        $clone_slice->seq_region_name(),
        $clone->start(), $clone->end()
    );
    printf( "%s %s %d-%d\n", @location );

    # Followed by its attributes, indented by one tab.
    for my $attribute ( @{ $clone->get_all_Attributes() } ) {
        printf "\t%s:%s\n", $attribute->name(), $attribute->value();
    }
}

# Look up the 'chromosome' coordinate system and report its name and version.
my $cs_adaptor = $registry->get_adaptor( 'Human', 'Core', 'CoordSystem' );
my $cs = $cs_adaptor->fetch_by_name('chromosome');

printf "Coordinate system: %s %s\n", $cs->name(), $cs->version();

# Fetch a region, passing two extra arguments ('1' and 'NCBI36') after the
# coordinates.
# NOTE(review): these are presumably the strand and the coordinate system
# version -- confirm against the fetch_by_region documentation.
$slice =
  $slice_adaptor->fetch_by_region( 'chromosome', 'X', 1e6, 10e6, '1', 'NCBI36' );

my @chromosomes = @{ $slice_adaptor->fetch_all('chromosome') };
my @clones      = @{ $slice_adaptor->fetch_all('clone') };

# List all coordinate systems in this database.  The loop declares its own
# iterator instead of reusing $cs from above, so it also works when run
# without the preceding lines.
my @coord_systems = @{ $cs_adaptor->fetch_all() };
foreach my $coord_system (@coord_systems) {
    printf "Coordinate system: %s %s\n",
      $coord_system->name(), $coord_system->version();
}

# Get all slices on the highest coordinate system:
my @slices = @{ $slice_adaptor->fetch_all('toplevel') };

# Try to express the feature in the 'clone' coordinate system.  A false
# result from transform() means there is nothing to report, which is stated
# explicitly.
if ( my $clone_feature = $feature->transform('clone') ) {
    my @position = (
        $clone_feature->slice->seq_region_name(),
        $clone_feature->start(), $clone_feature->end(),
        $clone_feature->strand()
    );
    printf( "Feature's clonal position is: %s %d-%d (%+d)\n", @position );
}
else {
    print "Feature is not defined in clonal coordinate system\n";
}

                  |~~~~~~~| (Feature A) |~~~~| (Feature B)

 (ctg 1) [=============]
         (ctg 2) (------==========] (ctg 2)
                      (ctg 3)   (--============] (ctg 3)

# Express the feature in the 'toplevel' coordinate system.
my $new_feature = $feature->transform('toplevel');

# transform() may not return a feature (the clone example in this tutorial
# checks for the same thing), so test the result before calling methods on it.
if ( defined $new_feature ) {
    printf(
        "Feature's toplevel position is: %s %s %d-%d (%+d)\n",
        $new_feature->slice->coord_system->name(),
        $new_feature->slice->seq_region_name(),
        $new_feature->start(),
        $new_feature->end(),
        $new_feature->strand()
    );
}
else {
    print "Feature is not defined in toplevel coordinate system\n";
}

# Source slice: chromosome 2 from 1e6 to 2e6.
my $slice = $slice_adaptor->fetch_by_region( 'chromosome', '2', 1e6, 2e6 );

# Target slice: same chromosome, starting at 1.5e6 instead.
my $new_slice =
  $slice_adaptor->fetch_by_region( 'chromosome', '2', 1.5e6, 2e6 );

# For every 'Eponine' simple feature on the source slice, show its
# coordinates before and after transferring it to the target slice.
my $eponine_features = $slice->get_all_SimpleFeatures('Eponine');
for my $simple_feature ( @{$eponine_features} ) {
    printf( "Before:\t%6d - %6d\n",
        $simple_feature->start(), $simple_feature->end() );

    # transfer() gives undef when the feature cannot be moved.
    my $new_feature = $simple_feature->transfer($new_slice);
    if ( defined $new_feature ) {
        printf( "After:\t%6d - %6d\n",
            $new_feature->start(), $new_feature->end() );
    }
    else {
        print "Could not transfer feature\n";
    }
}

# Header line: where the feature currently is.
my @origin = (
    $feature->seq_region_name(), $feature->start(),
    $feature->end(),             $feature->strand()
);
printf( "Feature at: %s %d-%d (%+d) projects to\n", @origin );

# Project the feature onto the 'clone' coordinate system and list, one
# indented line per segment, the slice that each segment maps to.
my $projection = $feature->project('clone');

for my $segment ( @{$projection} ) {
    my $to_slice = $segment->to_Slice();
    my @target   = (
        $to_slice->seq_region_name(), $to_slice->start(),
        $to_slice->end(),             $to_slice->strand()
    );

    printf( "\t%s %d-%d (%+d)\n", @target );
}

Method	Converts from	Converts to
Transform	Feature in slice or coordinate system	Feature in a different coordinate system
Transfer	Feature in slice or coordinate system	Feature in a different slice
Project	Feature in slice or coordinate system	Feature spanning multiple seq_regions in a coordinate system

Ensembl Core API Tutorial

Introduction

Slices: genomic regions

Registry

Fetch by region

Fetch by gene

Fetch all

Split slices

Get information about the slice

Get features in a slice

Features

Slice and seq-region location

Slices from features

Genes, Transcripts, and Exons

Sequences

Names

Translations and ProteinFeatures

External References

Alignment Features

Repeats

Markers

MiscFeatures

Coordinate Systems

Transform

Transfer

Project

Further help