#!/usr/local/bin/perl
# parsehumap.pl
# parse human ensembl features -- genbank format
#
# Reads the Ensembl FPC contig and clone map tables, then every GenBank-format
# annotation file found in the feature directory, and writes one tab-separated
# feature table with the columns
#   feature, gene symbol, map, range, id, db_xref list[, notes], CHR_csome:seqid

use strict;
use warnings;

use Getopt::Std;
use POSIX;
use flybase::sym2id;

# All settings stay package globals (our) so that code outside this file that
# reads $main::... keeps working.
our $debug = 0;

## paths for iubio/kalo
our $meowpub     = '/c6/iubio/meow-pub/meow/server/';
our $genomeshome = '/c7/eugenes/genomew/';
our $humanens    = '/c7/eugenes/human/ensembl/';
our $mapdir      = $humanens . 'maps/';
our $gbfeatdir   = $humanens . 'genbank/';

our $org      = 'man';
our $orgacode = "$meowpub/$org/acode";
our $isMeowId = 1;    ## for getSym2Id - acode data

our $gnomapvers    = '1';
our $kMissingValue = -999999999;
our $doprot        = 0;
our $wantdna       = 0;
our $wantfeat      = 1;
our $wantnotes     = 0;     # never switched on in this file
our $sourename     = 'EnsEMBL human annotation data';

our @chrs = qw( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y Un );
                            # or is it 01 or CHR_01 ?
our $nchr = scalar(@chrs);
our $scaf = 0;

# NOTE(review): $gname is never assigned in this file, so the output file is
# literally "features-.tsv"; kept that way so existing consumers find it.
our $gname = '';
our $chromosome;            # last /chromosome= value seen by parseFeat
our $csomeFile;             # never assigned in this file; read by printFeat
our (%contigs, %clones);    # filled by readcontigs / readclones
our (%allfeats, %allnotes); # tallies of feature kinds and qualifier names
our $gzcat = 'gzcat';       # decompressor used for *.gz inputs

our %opt = ();
Getopt::Std::getopts('o:xW:DA:', \%opt);

our $mework  = $opt{W} || $genomeshome;
our $outpath = $opt{o} || "$mework/$org/";
$orgacode = $opt{A} if $opt{A};
$debug    = 1 if $opt{D};

our $featout = "$outpath/features.tsv";

unless ($opt{x} || scalar(@ARGV)) {
    print "parsehumap [-o output] [-Aacode] [-x] ensembl-gb-files\n";
    print " Usage: \n -D == debug\n -o output path == [$outpath]\n -A acode == man.acode file\n -x = use default files \n";
    exit;
}

# ? read @ARGV for input files?

flybase::sym2id::set4meow($isMeowId);
flybase::sym2id::readSym2Id($orgacode, $org);

readcontigs();
readclones();
readfeats();

exit();

#--------

# readensgb - foreach locus(seqid)
#   foreach feature
#     featkind, featname, cmap?, bpmap, egID, dbxref, notes
#     featname = ? from egID->sym map || from /gene="ENSG00000089311"
#     egID = ? from /dbxref=LOCUSLINK
#        also have REFSEQ,EMBL,SPTREMBL,HUGO,MIM,
#     bpmap = reformat of location ->
#        flip?? join(complement(102579..102620),complement(102434..102574),...)
#          to complement(...,join(c..d),join(a..b))
#        add csome offset:
#          ctgid= clones(seqid).ctgid
#          clones(seqid).start,len should == feat.location.start,len
#          + contigs(ctgid).start,len
#
#     cmap? == CytogeneticMap

# Open $fn for reading; names ending in .gz are read through $gzcat.
# The list form of the pipe open keeps the file name away from the shell.
# Returns a file handle, or nothing (after a warning) when the open fails.
sub _open_input {
    my ($fn) = @_;
    my $fh;
    my $ok = ($fn =~ /\.gz\z/)
        ? open($fh, '-|', $gzcat, $fn)
        : open($fh, '<', $fn);
    unless ($ok) {
        warn "can't read $fn: $!\n";
        return;
    }
    return $fh;
}

# Split one map-table row into fields.  The tables are tab separated and may
# hold empty fields (e.g. a clone without a name), so split on tabs whenever
# the line has any; fall back to whitespace for space-aligned input.
sub _split_row {
    my ($line) = @_;
    return ($line =~ /\t/) ? split(/\t/, $line, -1) : split(' ', $line);
}

# Parse every file in $dir whose name contains "ensembl" and write the
# feature table to $fout.
#   $dir   input directory (default $gbfeatdir); file names are appended to it
#          as is, so it must end in '/'
#   $fout  output table (default "$outpath/features-$gname.tsv")
# Returns 1 on success, the string "cant open $dir" when the directory cannot
# be read, and -1 when the output file cannot be written.
sub readfeats {
    my $dir  = shift || $gbfeatdir;
    my $fout = shift || "$outpath/features-$gname.tsv";

    my $dh;
    unless (opendir($dh, $dir)) {
        warn "can't open $dir: $!\n";
        return "cant open $dir";
    }
    my @fn = grep { /ensembl/ } readdir($dh);    #ensembl.1000.dat.gz
    closedir($dh);
    @fn = sort @fn;

    my $out;
    if ($wantfeat) {
        unless (open($out, '>', $fout)) { warn "can't write $fout"; return -1; }
        my $first = @fn ? $fn[0] : '';
        print $out feattabheader("$org/features-$gname.tsv", $org, $dir . $first);
    }

    # if ($doprot) {
    #   $fout= "$outpath/cds-$gname.fasta";
    #   unless (open(FPROT,">$fout")) { warn "can't write $fout"; return -1; }
    # }
    # if ($wantdna) {
    #   $fout= "$outpath/dna-$gname.raw";
    #   unless (open(FDNA,">$fout")) { warn "can't write $fout"; return -1; }
    # }

    foreach my $name (@fn) {
        my $path = $dir . $name;
        print STDERR "readfeat($path)\n" if $debug;
        my $in = _open_input($path) or next;
        readfeat($in, $out, undef);
        close($in);
    }

    if ($out) {
        close($out) or warn "error closing $fout: $!\n";
    }
    return 1;
}

# from genomefeat.pl
# Build the comment header written at the top of the feature table.
#   $fname       name of the table (not used in the reconstructed text)
#   $org         organism code
#   $sourcefile  input file whose modification date stamps the header
# The date is the file's real mtime; the earlier "$^T + 86400 * -M" arithmetic
# produced a date as far in the future as the file was old.
sub feattabheader {
    my ($fname, $org, $sourcefile) = @_;

    my $mtime = (stat $sourcefile)[9];
    $mtime = $^T unless defined $mtime;
    my $date = POSIX::strftime("%d-%B-%Y", localtime($mtime));

    $sourcefile =~ s/^\Q$genomeshome\E//;

    # NOTE(review): the original here-document text was lost from this copy.
    # It is rebuilt from the otherwise unused $sourename / $gnomapvers settings
    # and the column list in the notes below -- confirm against genomefeat.pl.
    return <<"EOT";
# Features for ${org} from ${sourename} [${sourcefile}, ${date}]
# gnomap-version ${gnomapvers}
#
# Feature\tgene\tmap\trange\tid\tdb_xref\tnotes
EOT
}

# Read one GenBank-format stream and print each feature with printFeat.
#   $fh    input handle
#   $fout  feature table handle (used only when $wantfeat)
#   $fdna  raw sequence handle (unused: sequence output is disabled)
# Returns the number of features seen.
# NOTE(review): the sub header and read loop were lost from this copy and are
# rebuilt from the call readfeat(*FH, *FOUT, *FDNA) and the loop body.
sub readfeat {
    my ($fh, $fout, $fdna) = @_;
    my ($seqid, $ftname) = ('', '');
    my ($infeat, $inseq, $nfeat, $ndna) = (0, 0, 0, 0);
    my @feat = ();

    # Emit the feature collected so far, if any.
    my $flush = sub {
        return unless @feat;
        printFeat($fout, $org, $seqid, @feat) if ($wantfeat);   ## $species{$org}
        $nfeat++;
        # $ncds += CDS2Fasta($fprot,@feat) if ($ftname eq 'CDS' && $doprot);
        @feat = ();
    };

    while (my $line = <$fh>) {
        if ($line =~ /^LOCUS\s+(\S+)/) {
            my $newid = $1;
            # A record without ORIGIN/CONTIG leaves a feature pending; print it
            # under the id it belongs to before switching to the new one.
            $flush->();
            $seqid  = $newid;
            $infeat = 0;
            $inseq  = 0;
        }
        elsif ($line =~ /^FEATURES/) {
            $infeat = 1;
        }
        elsif ($inseq && $wantdna) {
            (my $dna = $line) =~ s/\s+//g;    ## no spaces
            $dna =~ s/\d+//g;                 ## eat line numbers
            $ndna += length($dna);
            # print $fdna $dna if ($dna);
        }
        elsif ($line =~ /^(?:ORIGIN|CONTIG)/) {    #? other end-of-feature fields?
            $flush->();
            $infeat = 0;
            ## opt pull sequence data into single raw file !
            $inseq = 1 if ($wantdna);
            # else { last; } ## return $ncds -- NOT last, many LOCUS per file;
        }
        elsif ($infeat) {
            # Feature keys start after a short indent (5 spaces in GenBank
            # files); qualifier and continuation lines are indented 21 spaces.
            if ($line =~ /^ {1,8}(\S+)\s/) {
                my $newftname = $1;
                $flush->();
                $ftname = $newftname;
                push(@feat, $line);
            }
            elsif ($line =~ /^ /) {
                push(@feat, $line);
            }
        }
    }
    $flush->();    # last feature of a stream that ends without ORIGIN/CONTIG

    return $nfeat;
}

# Return (first number, last number) of a location string, widened by one
# where the end carries a '<' or '>' marker.  Both values are $kMissingValue
# when the string holds no number.
sub maxrange {
    my ($range) = @_;
    my ($start, $stop) = ($kMissingValue, $kMissingValue);
    return ($start, $stop) unless defined $range;

    $range =~ s/^[^\d<>-]*//;    # drop leading operators such as "join("
    $range =~ s/\D*$//;          # drop trailing ")" and the like

    if ($range =~ m/^([<>]*)([\d-]+)/) {
        $start = $2;
        $start-- if ($1 eq '<');
    }
    if ($range =~ m/([<>]*)([\d-]+)$/) {
        $stop = $2;
        $stop++ if ($1 eq '>');
    }
    return ($start, $stop);
}

# Parse the lines of one feature (key line first).
# Returns ($featname, $srange, $gsym, $map, $id, \@dbx, \%fnotes):
#   $featname  feature key ('repeat_region' is recoded to 'repeat')
#   $srange    location, continuation lines joined
#   $gsym      /gene value            $map  /map or "Chr <chromosome>"
#   $id        primary id taken from /protein_id or /db_xref
#   @dbx       db_xref values         %fnotes  count per qualifier name
# Also sets the global $chromosome from a /chromosome qualifier.
sub parseFeat {
    my (@feat) = @_;
    my ($featname, $srange) = ('', '');
    my ($map, $id, $gsym);
    my $inrange = 0;
    my @dbx    = ();
    my %fnotes = ();

    my $fline = shift(@feat);
    if (defined $fline && $fline =~ /^\s+(\S+)\s+(\S+)/) {    ## 1st line only !?
        $featname = $1;
        $srange   = $2;
        $inrange  = 1;
        # recode some featnames
        $featname = 'repeat' if ($featname eq 'repeat_region');
    }

    foreach my $fl (@feat) {
        if ($fl =~ m|^\s+/(\w+)|) {
            # Any qualifier ends the location.  Unquoted or empty ones such as
            # /start_phase=0 used to be glued onto the range.
            my $fnote = $1;
            $inrange = 0;
            $fnotes{$fnote}++;
            next unless $fl =~ m|^\s+/\w+\s*=\s*"([^"]+)|;
            my $val = $1;

            # ens notes include
            #   /db_xref="EMBL:X07448"
            #   /transcript="ENST00000004493"
            #   /cds="ENSP00000004493"
            #   /gene="ENSG00000003981"
            #   /translation="MDWTWRILFLVAAATGAHSQVQLVQSGAEVKKPGASVKVSCKASG

            if ($fnote eq 'gene') { $gsym = $val; }    #! this is ENSG0000 if ensembl
            elsif ($fnote eq 'map') { $map = $val; }
            elsif ($fnote eq 'chromosome') {    #? only in source feature? not in worm csome
                $map        = "Chr $val";
                $chromosome = $val;
            }
            elsif ($fnote eq 'protein_id') {
                if ($featname eq 'CDS') { $id = $val; }
                else { push(@dbx, "PROTID:$val"); }    ##? leave out if $id
            }
            elsif ($fnote eq 'db_xref') {
                ## FIXME parse out prime ID from here
                my $id1;
                if ($org eq 'man' && $featname eq 'CDS' && $val =~ /LOCUSLINK/) {
                    $id1 = $val;
                    $id1 =~ s/LOCUSLINK://;
                    $id1 = 'MEOW:HUgn' . sprintf("%07d", $id1);
                }
                elsif ($org eq 'man' && $featname eq 'CDS' && $val =~ /gene/) { $id1 = $val; }
                # elsif ($org eq 'fly' && $featname eq 'gene' && $val =~ /FBgn/) { $id1= $val; }
                # elsif ($org eq 'fly' && $featname eq 'mRNA' && $val =~ /FBan/) { $id1= $val; }
                # elsif ($org eq 'yeast' && $featname eq 'gene' && $val =~ /SGD:/) { $id1= $val; }
                # elsif (!$id && $featname eq 'CDS' && $val =~ /PID:/) { $id1= $val; }
                # ## for yeast, need to move $id from CDS to gene (which hasnt a db_xref in ncbi genomes)

                if ($id1) { $id = $id1; }
                # else  ##? leave out if $id
                push(@dbx, $val);    # the xref is listed even when it became the id
            }
        }
        elsif ($inrange && $fl =~ m|^\s+([^"]+)|) {    ## " continuation line
            $srange .= $1;
            $srange =~ s/\s+\z//;
        }
        else {
            ## nothing else?
        }
    }

    return ($featname, $srange, $gsym, $map, $id, \@dbx, \%fnotes);
}

# Write one feature row to $fh.
#   $fh     output handle        $org   organism code
#   $seqid  LOCUS id of the record the feature belongs to
#   @feat   raw feature lines, handed to parseFeat
# The chromosome in the last column comes from %clones (seqid -> contig) and
# %contigs (contig -> chromosome); it is empty when either lookup fails.
sub printFeat {
    my ($fh, $org, $seqid, @feat) = @_;

    my $srec = defined $clones{$seqid} ? $clones{$seqid} : '';
    my ($ctgid, $bpstart, $bplen) = split(/\t/, $srec);
    my $ctrec = (defined $ctgid && defined $contigs{$ctgid}) ? $contigs{$ctgid} : '';
    my ($csome, $cbpstart, $cbplen) = split(/\t/, $ctrec);
    $csome = '' unless defined $csome;
    #? save $seqid, $ctgid for each record?

    my ($featname, $srange, $gsym, $map, $id, $rdbx, $rfnotes) = parseFeat(@feat);

    # offset $map by $bpstart/$cbpstart !??
    # write to $fh->csome ?

    # ##
    # ## need special fixes for genome/fly/scaffolds to weave into csome ordered data
    # ## & join w/ more updated flybase data
    # ##
    # my $isscaf= 0;
    # if ($scaf) { ## && $org eq 'fly'
    #   my $arange= scaffold2arm($scaf,$srange);
    #   ## print STDERR "$srange => $arange\n" if $debug;
    #   $srange= $arange;
    #   if ($featname eq 'source') {
    #     ##! and need to build new 'source' from scaffolds
    #     $featname= 'scaffold';
    #     $gsym= $scaf->{name};
    #     $isscaf= 1;
    #   }
    # }
    # if ($org eq 'fly') {
    #   if ($chromosome && $cytomap ne $chromosome) {
    #     readFlyCytomap($chromosome);
    #     $cytomap= $chromosome;
    #   }
    #   my $fbid;
    #   if ($id =~ /(FBgn\d+)/) { $fbid= $1; }
    #   else { foreach (@{$rdbx}) { $fbid= $1 if (/(FBgn\d+)/); } }
    #   if ($fbid) {
    #     my $cmap= flyfeatures::getCytomap($fbid); $map= $cmap if ($cmap);
    #     my $fsym= flyfeatures::getSym($fbid); $gsym= $fsym if ($fsym);
    #   }
    # }

    unless ($gsym) {
        $gsym = ($featname =~ /^source/) ? $org : '-';
    }
    unless ($map) {
        $map = ($featname =~ /^(source|scaffold)/ && $csomeFile) ? $csomeFile : '-';
    }
    unless ($id) {
        $id = flybase::sym2id::symbol2id($gsym);    ## $sym2id{ symcase($gsym) };
    }
    my $idsym = flybase::sym2id::id2symbol($id);
    $gsym = $idsym if ($idsym);    # make sure is proper symbol
    $id = '' unless defined $id;

    $allfeats{$featname}++;

    # Each db_xref keeps its trailing comma, as in earlier tables.
    my $dbx = join('', map { "$_," } sort @{$rdbx});
    print $fh join("\t", $featname, $gsym, $map, $srange, $id, $dbx);
    if ($wantnotes) {
        print $fh "\t";
        foreach my $note (sort keys %{$rfnotes}) {
            print $fh "$note,";
            $allnotes{$note} += ${$rfnotes}{$note};
        }
    }
    print $fh "\tCHR_${csome}:${seqid}";
    print $fh "\n";
}

# Hook for normalising symbol case; currently returns the symbol unchanged.
sub symcase {
    my ($sym) = shift;
    ## return uc($sym); ## if genbank doesnt keep proper case!
    return $sym;
}

#------> Fpc_Contig.txt.table.gz <------ >> 1396 lines
#cID    contigName  bpstart   bplen    csome
#871    ctg13815    25854336  2186457  18

# Load the contig table into %contigs: contig id -> "csome\tbpstart\tbplen".
#   $fn  table file (default $mapdir . 'Fpc_Contig.txt.table.gz')
# Returns the number of contigs read, or nothing when the file cannot be opened.
sub readcontigs {
    my $fn = shift || ($mapdir . 'Fpc_Contig.txt.table.gz');
    %contigs = ();
    print STDERR "readcontigs($fn)\n" if $debug;

    my $fh = _open_input($fn) or return;
    while (my $line = <$fh>) {
        chomp($line);
        next if $line =~ /^\s*(?:#|\z)/;    # header comment or blank line
        my ($ctgid, $ctgname, $bpstart, $bplen, $csome) = _split_row($line);
        next unless defined $csome;         # short row: nothing usable
        $contigs{$ctgid} = "$csome\t$bpstart\t$bplen";
    }
    close($fh);
    return scalar(keys %contigs);
}

#------> Fpc_Clone.txt.table.gz <------ >> 32253 lines
# Embl/Gb ID  Clone name   Orgzn.  ContigID  band  fpcSize  seqSize  seqStart?
#AP001105     RP11-759D19  JAPAN   871       0     0        164321   0

# Load the clone table into %clones: sequence id -> "ctgid\tbpstart\tbplen".
#   $fn  table file (default $mapdir . 'Fpc_Clone.txt.table.gz')
# Returns the number of clones read, or nothing when the file cannot be opened.
sub readclones {
    my $fn = shift || ($mapdir . 'Fpc_Clone.txt.table.gz');
    print STDERR "readclones($fn)\n" if $debug;
    %clones = ();

    my $fh = _open_input($fn) or return;
    while (my $line = <$fh>) {
        chomp($line);
        next if $line =~ /^\s*(?:#|\z)/;    # header comment or blank line
        my ($seqid, $cname, $og, $ctgid, $bd, $fsz, $bplen, $bpstart) = _split_row($line);
        next unless defined $ctgid;         # short row: nothing usable
        foreach my $field ($bplen, $bpstart) {
            $field = '' unless defined $field;
        }
        $clones{$seqid} = "$ctgid\t$bpstart\t$bplen";
    }
    close($fh);
    return scalar(keys %clones);
}

__END__

ensembl maps/
CHECKSUMS.gz                    MarkChr.txt.table.gz
ChromosomeBands.txt.table.gz    Marker.txt.table.gz
CytogeneticMap.txt.table.gz     MarkerSynonym.txt.table.gz
Fpc_Clone.txt.table.gz          RHMaps.txt.table.gz
Fpc_Contig.txt.table.gz         help.txt.table.gz
LandmarkMarker.txt.table.gz     maps075.sql.gz
Map.txt.table.gz

build features/csome from ensembl:
#Feature gene cmap bprange id db_xref notes

contigs(ctgid,rec)= readcontigs(ctgid, Fpc_Contig(ctgid,bpstart,bplen, csome))
clones(seqid,rec)= readclones(seqid, Fpc_Clone(seqid, seqsize, seqstart))
readensgb - foreach locus(seqid)
  foreach feature
    featkind, featname, cmap?, bpmap, egID, dbxref, notes
    featname = ? from egID->sym map || from /gene="ENSG00000089311"
    egID = ? from /dbxref=LOCUSLINK
      also have REFSEQ,EMBL,SPTREMBL,HUGO,MIM,
    bpmap = reformat of location ->
      flip?? join(complement(102579..102620),complement(102434..102574),...)
        to complement(...,join(c..d),join(a..b))
      add csome offset:
        ctgid= clones(seqid).ctgid
        clones(seqid).start,len should == feat.location.start,len
        + contigs(ctgid).start,len

    cmap? == CytogeneticMap

build dna/csome from ensembl:
read2hash(ctgid, Fpc_Contig(ctgid,bpstart,bplen, csome))
read2hash(seqid, Fpc_Clone(seqid, seqsize, seqstart))

------> ens075.dna.fa.gz <------ >> these are seqid's
>AB000381.00001
GCGGCCGGAATTAACCCTCACTAAAGGGATCCCTCGATCATACACTATGTGGCCTCTGTGTCTGGCTTCTGT

>AB000381.00001
>AB012723.00001
>AB015355.00001
..
>AB020858.00001
>AB020859.00001
>AB020860.00001
>AB020861.00001
>AB020862.00001
...
>AC000003.00001 >AC000004.00001 >AC000007.00001 >AC000015.00001 >AC000021.00001 >AC000022.00001 >L47234.00001 ------> Fpc_Contig.txt.table.gz <------ >> 1396 lines #cID contigName bpstart bplen csome 871 ctg13815 25854336 2186457 18 1125 ctg15274 141041432 339582 X 652 ctg13492 72819205 3551438 12 943 ctg17057 54560273 214790 19 685 ctg16166 23449087 4607383 14 566 ctg13655 30582683 17914764 10 184 ctg14708 254158329 172797 2 125 ctg14546 7906646 2065334 2 986 ctg16167 -1 183481 21 491 ctg15438 0 952452 8 ------> Fpc_Clone.txt.table.gz <------ >> 32253 lines # Embl/Gb ID Clone name Orgzn. ContigID band fpcSize seqSize seqStart? AP001105 RP11-759D19 JAPAN 871 0 0 164321 0 AC015933 CTD-2023G8 WIBR 871 0 0 228390 12000 AP001145 RP11-687D24 JAPAN 871 0 0 183937 96000 AP001900 RP11-705A9 RIKEN 871 0 0 145550 112000 ------> CytogeneticMap.txt.table.gz <------ RH2337 p36 p36 3 2 1 0 RH2124 p36 p36 1 1 1 0 RH31559 p36 p35 1 1 0 ------> ChromosomeBands.txt.table.gz <------ 1 1 w p36 3 7800.63 1 2 b p36 2 3719.51 1 3 w p36 1 10590.26 ------> LandmarkMarker.txt.table.gz <------ 105129 D12S1957 106654 D4S3169 106926 D6S1886 ------> MarkChr.txt.table.gz <------ RH1 17 RH2 10 RH3 1 ------> Marker.txt.table.gz <------ RH1 CTGGTGAGGAAGCTCCAGTC AACCCGATTAGCAAGGCC cdna RH2 GTGCTGGCCCTCATAGTGTT AGGTTACAGCTGCTCCTGGA cdna RH3 AGGTAGCTCCCAGCACTCAA TTTATTGGCTGGAAAGGGC RH4 GCCTCCTCCTACTGTTTCCC GAGGTCAGGCTTTAGCGATG cdna ------> MarkerSynonym.txt.table.gz <------ RH1 RH1 RHdb RH1 stSG1 Sanger_STS RH1 SC91 RHalloc ------> RHMaps.txt.table.gz <------ GB4 RH50853 0.00 1 GB4 RH104552 0.58 1 GB4 RH75913 0.58 1 ------> help.txt.table.gz <------ NotMapped RH3 1 NotMapped RH10 22 ------> maps075.sql.gz <------ # MySQL dump 8.10 # # Host: localhost Database: maps075 #-------------------------------------------------------- # Server version 3.23.25-beta # # Table structure for table 'ChromosomeBands' # CREATE TABLE ChromosomeBands ( chromosome char(2) DEFAULT 'un' NOT NULL, tag int(11) DEFAULT '0' 
NOT NULL, colour char(2), major_band char(3), minor_band char(10), length float(10,2), PRIMARY KEY (chromosome,tag) ); # # Table structure for table 'CytogeneticMap' # CREATE TABLE CytogeneticMap ( marker varchar(40) DEFAULT '' NOT NULL, start_band varchar(40) DEFAULT '' NOT NULL, end_band varchar(40), start_sub_band varchar(40), end_sub_band varchar(40), chromosome varchar(10) DEFAULT '' NOT NULL, position_tag int(11) DEFAULT '0' NOT NULL, PRIMARY KEY (marker), KEY BandIdx (start_band), KEY ChromosomeIdx (chromosome), KEY posIdx (position_tag) ); # # Table structure for table 'Fpc_Clone' # CREATE TABLE Fpc_Clone ( embl_id varchar(15) DEFAULT '' NOT NULL, clone_name varchar(20) DEFAULT '' NOT NULL, organisation varchar(15) DEFAULT '' NOT NULL, contig_id int(11) DEFAULT '0' NOT NULL, band int(11) DEFAULT '0' NOT NULL, fpc_size int(11) DEFAULT '0' NOT NULL, seq_size int(11), start_guess int(11), KEY contigIdx (contig_id), PRIMARY KEY (clone_name,embl_id), KEY embl_id (embl_id) ); # # Table structure for table 'Fpc_Contig' # CREATE TABLE Fpc_Contig ( contig_id int(11) NOT NULL auto_increment, contig_name varchar(15) DEFAULT '' NOT NULL, start int(11) DEFAULT '0' NOT NULL, length int(11) DEFAULT '0' NOT NULL, chromosome char(3) DEFAULT '' NOT NULL, PRIMARY KEY (contig_id), KEY beginIdx (chromosome,start), KEY NameIdx (contig_name) ); # # Table structure for table 'LandmarkMarker' # CREATE TABLE LandmarkMarker ( marker varchar(40) DEFAULT '' NOT NULL, name varchar(40) DEFAULT '' NOT NULL, KEY synname (name), PRIMARY KEY (marker,name) ); # # Table structure for table 'Map' # CREATE TABLE Map ( id varchar(40) DEFAULT '' NOT NULL, name varchar(40), tablename varchar(40), origin varchar(40), version int(11), last_updated timestamp(14), PRIMARY KEY (id) ); # # Table structure for table 'MarkChr' # CREATE TABLE MarkChr ( marker varchar(40) DEFAULT '' NOT NULL, chr varchar(5), PRIMARY KEY (marker) ); # # Table structure for table 'Marker' # CREATE TABLE Marker ( marker 
varchar(40) DEFAULT '' NOT NULL, seq_left varchar(40) DEFAULT '' NOT NULL, seq_right varchar(40) DEFAULT '' NOT NULL, marker_type varchar(5) DEFAULT '' NOT NULL, PRIMARY KEY (marker), KEY leftIdx (seq_left), KEY rightIdx (seq_right) ); # # Table structure for table 'MarkerSynonym' # CREATE TABLE MarkerSynonym ( marker varchar(40) DEFAULT '' NOT NULL, name varchar(40) DEFAULT '' NOT NULL, db varchar(15) DEFAULT '' NOT NULL, KEY synname (name), PRIMARY KEY (marker,name,db) ); # # Table structure for table 'RHMaps' # CREATE TABLE RHMaps ( map varchar(40) DEFAULT '' NOT NULL, marker varchar(40) DEFAULT '' NOT NULL, map_position float(10,2) DEFAULT '-1.00' NOT NULL, chromosome varchar(10) DEFAULT '' NOT NULL, PRIMARY KEY (marker,map), KEY chrIdx (chromosome) ); # # Table structure for table 'help' # CREATE TABLE help ( map varchar(40), marker varchar(40), chromosome varchar(10) ); #---------------------- while() { if (/^component/) { @v= split(); $l= $v[5] - $v[4]; print $_; print "length= $l\n"; } } __DATA__ >> genbank length= 157089 Ensembl LOCUS feature IDs are in Fpc_Clone.txt.table == embl_id and in genbank genome maps as 'component' /c7/eugenes/human/ensembl/maps oat% gzgrep -e 'AB019441|AC026279|AB019440' *gz Fpc_Clone.txt.table.gz:AC026279 RP11-321G12 WUGSC 718 0 0 157159 1392000 Fpc_Clone.txt.table.gz:AB019441 JAPAN 702 0 0 157090 0 Fpc_Clone.txt.table.gz:AB019440 KYOTO 702 0 0 200000 200000 /c7/eugenes/human/ncbi/genomes/maps/chromosome_order oat% gzgrep -e 'AB019441|AC026279|AB019440' *gz chr14_sequence.gz:component 4512311 AB019441.1 14 104253954 104411043 - chr14_sequence.gz:component 4512300 AB019440.1 14 104461044 104661043 + chr15_sequence.gz:component 8954310 AC026279.4 15 66275784 66432873 - # loclink gene count = 19256 # grep -c 'RSQ|REFSEQ' acode = 10786 # grep -c 'RPA|REFPROT' acode = 10799 # # Ens_gene=44835 << many predicted # Ens_LOCUSLINK=7844 # Ens_REFSEQ=17442 # # NCBI_gene=8889 Ensembl gene annots. have # LOCUSLINK=7844 # REFSEQ=17442 ?? 
many dups? NCBI genomes has features: variation == /db_xref="dbSNP:689125" STS /db_xref="UniSTS:184132" hs_chr11.gbs: CDS 574 LOCUS 297 STS 6200 gene 545 mRNA 574 misc_feature 307 source 1891 variation 102980 LOCUS AB019440 ENSEMBL; DNA; HTG; 200001 BP. 15-MAY-1999 source 1..200001 LOCUS AB019441 ENSEMBL; DNA; HTG; 157091 BP. 15-MAY-1999 AC060783 source 1..153156 AC026279 LOCUS AC026279 ENSEMBL; DNA; HTG; 157160 BP. 24-AUG-2000 source 1..157160 CDS join(complement(5466..5550),complement(46721..46900), complement(45668..45810),AC023591:432..527, AC023591:1746..1809,AC023591:2084..2241,28540..28666, 39355..39387,40252..40336,41128..41197) /db_xref="EMBL:AF037335" /db_xref="EMBL:AF051882" /db_xref="HUGO:CA12" /db_xref="LOCUSLINK:771" /db_xref="MIM:603263" /db_xref="REFSEQ:NP_001209" /db_xref="SPTREMBL:O43570" /transcript="ENST00000178638" /cds="ENSP00000178638" /gene="ENSG00000074410" NCBI genomes maps chr15_sequence.gz: component 8954310 AC026279.4 15 66275784 66432873 - LOCUS AB019441 ENSEMBL; DNA; HTG; 157091 BP. 15-MAY-1999 DEFINITION Reannotated sequence via Ensembl ACCESSION AB019441 VERSION AB019441.1 KEYWORDS HTG; HTGS_PHASE. SOURCE Homo sapiens. ORGANISM Homo sapiens Eukaryota; Metazoa; Chordata; Vertebrata; Mammalia; Eutheria; Primates; Catarrhini; Hominidae; Homo. COMMENT This sequence was reannotated via the Ensembl system. Please visit the Ensembl web site, http://www.ensembl.org/ for more information. COMMENT The reference, comment, description and feature table of the original entry can be found in the DDBJ/EMBL/GenBank database with the identical accession number. COMMENT The /gene indicates a unique id for a gene, /cds a unique id for a translation and a /exon a unique id for an exon. These ids are maintained wherever possible between versions. For more information on how to interpret the feature table, please visit http://www.ensembl.org/Docs/embl.html. 
COMMENT All the exons and transcripts in Ensembl are confirmed by similarity to either protein or cDNA sequences. COMMENT In unfinished, rough draft DNA sequence gene structures can cross fragments and, in these cases, the order and orientation of the fragments is likely to be different from the order in the the International Nucleotide Sequence Databases DDBJ/EMBL/GenBank. FEATURES Location/Qualifiers source 1..157091 /organism="Homo sapiens" CDS join(16680..16736) /transcript="ENST00000203141" /cds="ENSP00000203141" /gene="ENSG00000089318" /translation="TVCVCVCVCVCVCVCVCPL" CDS join(17860..17905,36566..36870) /db_xref="EMBL:X07448" /db_xref="SPTREMBL:P23083" /transcript="ENST00000004493" /cds="ENSP00000004493" /gene="ENSG00000003981" /translation="MDWTWRILFLVAAATGAHSQVQLVQSGAEVKKPGASVKVSCKASG YTFTGYYMHWVRQAPGQGLEWMGWINPNSGGTNYAQKFQGRVTMTRDTSISTAYMELSR LRSDDTAVYYCAR" CDS join(55380..55427) /transcript="ENST00000203134" /cds="ENSP00000203134" /gene="ENSG00000089311" /translation="APVVLTCPPVCPNHSQ" CDS join(79678..79770) /transcript="ENST00000203129" /cds="ENSP00000203129" /gene="ENSG00000089306" /translation="QWLTPVIPALWEAEAGGSLEVRTLRPAWPTW" CDS join(95775..95831) /transcript="ENST00000203127" /cds="ENSP00000203127" /gene="ENSG00000089304" /translation="PREQTLRPGPEDGGGAESS" CDS join(complement(102579..102620),complement(102434..102574), complement(101459..101479),complement(101422..101454), complement(101338..101419),complement(101038..101141)) /db_xref="EMBL:AF161538" /db_xref="REFSEQ:NP_054870" /db_xref="SPTREMBL:Q9NZY2" /transcript="ENST00000203137" /cds="ENSP00000203137" /gene="ENSG00000089314" /translation="EPGDTGPILPNGDTGAALRSRERPSWPQETHGHRERTEEGCAVAA FSADALRTGGQELEQTGLRPKAGPLCQTSWVTGYTDIGKGWRMDGGRTCSCSSFCRCPE RGARRSSPDAPGLALDFPLLLDLLWHLCSWTSQPLEL" CDS join(complement(109919..109993)) /transcript="ENST00000203126" /cds="ENSP00000203126" /gene="ENSG00000089303" /translation="SLSLPVGGDSSVGFPGEVWKGAGLR" exon 16680..16736 /start_phase=0 /exon_id="ENSE00000567233" 
/end_phase=0 exon 17860..17905 /start_phase=0 /exon_id="ENSE00000567229" /end_phase=1 exon 36566..36870 /start_phase=1 /exon_id="ENSE00000372030" /end_phase=0 .. repeat_region complement(380..773) /note="LTR52: matches 4 to 404 of consensus" repeat_region 1054..2080 /note="LTR5: matches 3 to 968 of consensus" repeat_region complement(2140..3003) /note="MER49: matches 3 to 916 of consensus"