#!/usr/local/bin/perl
# Meow/Man.pm

=head1 NAME

Meow::Man - manage Human Genome data for Meow

=head1 DESCRIPTION

data source: ftp://ncbi.nlm.nih.gov/refseq/LocusLink/LL_tmpl

=cut

package Meow::Man;

use strict;
use warnings;
# Globals owned by other packages ($Meow::Data::kFromdata, $Acodes::debug, ...)
# are mentioned only once in this file; that is intentional, not a typo.
no warnings 'once';

our @ISA = qw( Meow::Data );

# Lookup tables, filled in the BEGIN block below.
our (%urldbs, %fieldKeys);

# Per-run state shared by toMeow(), putifrec() and addGoldPredicted().
# These stay package globals (rather than lexicals) so that any outside code
# reading e.g. $Meow::Man::acode keeps working.
our (%hgtable, %tableEntryFields);
our ($species, $sourcedb, $acode, @outfiles);
our ($total, $npred, @orgless);

# Per-record state, cleared by _reset_record_state() after each record.
our ($mainrec, $atfld, $skiprec, $ispredicted);
our ($id, $idn, $did, $gsym, $org, $unigene, $prefprod, $otherprod);
our ($nrefs, $nale);    # set per record; not read anywhere in this file

#-------------
# new( %overrides )
#   Class constructor.  Passes the human/LocusLink configuration to the
#   parent constructor; any caller-supplied key/value pairs come last and
#   therefore override the defaults listed here.
#   Returns whatever SUPER::new returns.
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new(
        tag      => 'HUgn',
        name     => 'Human Genes',
        species  => 'Homo sapiens',
        sourcedb => 'LocusLink:',
        orgpath  => 'man',    # $SERVER_PATH/$orgpath/
        source   => {
            comment   => 'LocusLink from NCBI',
            url       => 'ftp://ftp.ncbi.nih.gov/refseq/LocusLink/',
            web       => 'http://www.ncbi.nlm.nih.gov/LocusLink/',
            get_patt  => '(LL_tmpl|README)',
            recursive => 'false',
        },
        idurl   => 'http://www.ncbi.nlm.nih.gov/LocusLink/LocRpt.cgi?l=%s',
        srcurls => [
            'ftp://ncbi.nlm.nih.gov/refseq/LocusLink/',
            'http://genome.cse.ucsc.edu/goldenPath/07oct2000/bigZips/',
        ],
        webs => {
            'LocusLink, NCBI, USA' => 'http://www.ncbi.nlm.nih.gov/LocusLink/',
            'GoldenPath Human Genome Annotation, UC Santa Cruz, USA' =>
                'http://genome.ucsc.edu/',
            'GDB - Human Genome Database, Toronto, Canada' =>
                'http://www.gdb.org/',
            'OMIM - OnLine Mendelian Inheritance in Man, NCBI, USA' =>
                'http://www3.ncbi.nlm.nih.gov/Omim/',
        },
        makeflags  => $Meow::Data::kFromdata | $Meow::Data::kDumpxml,
        sourcepath => '/bio/biomir-pub/biomirror/eugenes/human/',
        sourcedata => [
            'LL_tmpl.gz',    ## was .Z - fix soft to handle both?
            '$gnomap/man/HUgn-gold.acode',
            #'$workpath/genomes/man/HUgn-gold.acode',
            #'/bio/data/locuslink/LL_tmpl' , ## local path - ? need variable
        ],
        targetdata   => 'HUgn.acode',
        doc          => ['README'],
        needIdDb     => 0,
        goass        => "gene_association.ll",    # added nov01
        haspredicted => 'true',
        csomes       => [
            qw/ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y /
        ],
        # golinks => 'GNH', #?
        @_
    );
    return $self;
}

#-------------

BEGIN {
    # URL pattern => database key.  url2dbs() rewrites a matching URL as
    # "KEY:<captured id>", or "KEY:link" when the pattern has no capture.
    %urldbs = (
        'gdbwww.gdb.org/gdb-bin/genera/accno\?GDB:(.+)'            => 'GDB',
        'bioinformatics.weizmann.ac.il/cards-bin/carddisp\?(.+)'   => 'CARDS',
        'www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi\?locusId=(.+)'       => 'SNP',
        'www-lecb.ncifcrf.gov/cgi-bin/dbEngine\?mitoDat,getDataByID,(.+)' => 'MITO',
        'www.ncbi.nlm.nih.gov/ncicgap/cgaptsorec.cgi\?rec=(.+)'    => 'NCICGAP',
        'www.uwcm.ac.uk/uwcm/mg/search/(.+)'                       => 'UWCM',
        'ixdb.mpimg-berlin-dahlem.mpg.de/bin/ixdbcmd.cgi\?cmd=showObj\&id=(.+)' => 'IXDB',
        'www.geneclinics.org/profiles/(.+)'                        => 'GCLINIC',
        '.hcuge.ch/cgi-bin/get-enzyme-entry\?(.+)'                 => 'ENZ',
        '.hcuge.ch/cgi-bin/get-prodoc-entry\?(.+)'                 => 'PROSITEDOC',
        'www.gene.ucl.ac.uk/nomenclature/(.+)'                     => 'GUNOMEN',
        'www.gene.ucl.ac.uk/users/hester/(.+)'                     => 'GUHESTER',
        'www.gene.ucl.ac.uk/(.+)'                                  => 'GENEUCL',
        'drnelson.utmem.edu/(.+)'                                  => 'DRNELSON',
        'genome.nhgri.nih.gov/(.+)'                                => 'NHGRI',
        'www.ebi.ac.uk/imgt/(.+)'                                  => 'IMGT',
        'imgt.cnusc.fr:8104/'                                      => 'IMGTFR',
        'zearth.kazusa.or.jp/huge/gfpage/([^/]+)/'                 => 'HUGE',
        'www.ncbi.nlm.nih.gov/prow/guide/([0-9]+)_g.htm'           => 'PROW',
    );

    ## not used?
    # LL_tmpl tag => acode field key; handed to $acode->registerKeys().
    %fieldKeys = (
        'OFFICIAL_SYMBOL'     => 'SYM',
        'PREFERRED_SYMBOL'    => 'SYM',
        'OFFICIAL_GENE_NAME'  => 'NAM',
        'PREFERRED_GENE_NAME' => 'NAM',
        'LOCUSID'             => 'ID',
        'ORGANISM'            => 'ORG',
        'SUMMARY'             => 'REAB',   ## do we have abstract/summary field yet?
        'ALIAS_SYMBOL'        => 'SYN',
        'ALIAS_PROT'          => 'PRD',    ##? what is right acode?
        'PREFERRED_PRODUCT'   => 'FNC',    ##? what is right acode?
        'ECNUM'               => 'ENZ',    ##? is this right acode?
        'PHENOTYPE'           => 'PHP',    ##? is this right acode?
        'CHR'                 => 'CHR',    ## chromosome ??
        'MAP'                 => 'MAP',    ## not quite a match - fix Map position?
        #'mACCNUM' => 'DBA', ## don't want seq acc nums in this summary??
        #'gACCNUM' => 'DBA',
        #'eACCNUM' => 'DBA',
        'UNIGENE' => 'DBL',    ## -- need dblink acode
        'OMIM'    => 'DBL',    ## -- need dblink acode
        'LINK'    => 'URL',    ## -- need dblink acode
        'DB_LINK' => 'URL',    ## -- need dblink acode
    );
}

#-------------
# _reset_record_state()
#   Clear everything that belongs to a single LL_tmpl record so nothing
#   (symbol, id, last field handle, ...) leaks into the next record.
sub _reset_record_state {
    $mainrec     = '';
    $atfld       = undef;
    $skiprec     = 0;
    $ispredicted = 0;
    $org         = '';
    $unigene     = '';
    $otherprod   = '';
    $prefprod    = '';
    $id          = '';
    $idn         = '';
    $did         = '';
    $gsym        = '';
    return;
}

# _extannot_key( $section )
#   Map the category column of an EXTANNOT line to an acode field key.
sub _extannot_key {
    my ($sect) = @_;
    $sect = '' unless defined $sect;
    return 'ENZ' if $sect =~ /function/;
    return 'FNC' if $sect =~ /organismal|cellular role|process/;
    return 'CEL' if $sect =~ /subcellular|molecular/;
    return 'PHP';    #??
}

# _go_key_and_aspect( $section )
#   Map the category column of a GO line to (acode field key, aspect code).
#   For an unrecognised category the key is 'PHP' and the section text is
#   handed back unchanged in place of an aspect code.
sub _go_key_and_aspect {
    my ($sect) = @_;
    $sect = '' unless defined $sect;
    return ('ENZ', 'F') if $sect =~ /function/;
    return ('FNC', 'P') if $sect =~ /process/;
    return ('CEL', 'C') if $sect =~ /component/;
    return ('PHP', $sect);    #??
}

# _is_unigene_dup( $url, $cluster )
#   True when $url is a UniGene link for the very cluster ("Hs.155644")
#   already recorded from the UNIGENE tag.  The ids are quoted and
#   word-bounded so that CID=155644 does not also match CID=1556449.
sub _is_unigene_dup {
    my ($url, $cluster) = @_;
    return 0 unless $cluster && $url =~ m/UniGene/;
    my ($udb, $uid) = split(/\./, $cluster);
    return 0 unless defined $udb && defined $uid;
    return ($url =~ m/ORG=\Q$udb\E\b/ && $url =~ m/CID=\Q$uid\E\b/) ? 1 : 0;
}

# _add_link_field( $url )
#   Store a LINK / DB_LINK value on the current record: as a DBL field when
#   url2dbs() recognises the URL, otherwise as a plain URL field.
#   Returns the value returned by $acode->addFieldHash.
sub _add_link_field {
    my ($url) = @_;
    my $dbl = url2dbs($url);
    return $dbl
        ? $acode->addFieldHash($mainrec, 'DBL', $dbl)
        : $acode->addFieldHash($mainrec, 'URL', $url);
}

# _print_report( \%sppcount, \%unused )
#   Write the end-of-run statistics to STDERR.
sub _print_report {
    my ($sppcount, $unused) = @_;

    print STDERR "Data update for: $species\n";    ##? put on stderr or stdout
    print STDERR "gene count (LL): $total\n";      # if ( $Meow::debug);
    print STDERR "predicted gene count (Gold): $npred\n";    # if ( $Meow::debug);
    print STDERR "species count (only $species saved)\n";
    foreach my $name (sort keys %{$sppcount}) {
        print STDERR "$name\t$sppcount->{$name}\n";
    }

    my $norgless = scalar(@orgless);
    print STDERR "org-less records\t$norgless\n";
    # foreach (@orgless) { print STDERR "$_ "; } print STDERR "\n";

    if ($Meow::debug) {
        print STDERR "Unused keys & example values\n";
        foreach my $tag (sort keys %{$unused}) {
            print STDERR "$tag \t$unused->{$tag}\n";
        }
    }
    return;
}

#-------------
# toMeow( $fromdir, $todir )
#   Convert the LocusLink LL_tmpl source into the acode target file
#   ($todir . targetdata) and write the GO association list named by
#   $self->goass.  Only 'Homo sapiens' records that are not marked obsolete
#   (CURRENT_LOCUSID) are stored; GO lines of every species go into the
#   association list.  $fromdir is accepted but not used here.
#   Returns 0 on success, 1 when the LL_tmpl source cannot be opened.
sub toMeow {
    my $self = shift;
    my ($fromdir, $todir) = @_;

    $species  = $self->species;
    $sourcedb = $self->sourcedb;
    my $outfile = $todir . $self->targetdata;

    my $reftab = $self->readHgtable();
    %hgtable = $reftab ? %{$reftab} : ();

    require Acodes;
    $Acodes::debug = $Meow::debug;

    %tableEntryFields = (
        ID  => $Acodes::RETE_ID,
        SYM => $Acodes::RETE_VALUE,
        NAM => $Acodes::RETE_VALUE,
        CHR => $Acodes::RETE_VALUE,
        MAP => $Acodes::RETE_VALUE,    #?
        ORG => $Acodes::RETE_VALUE,
        DID => $Acodes::RETE_VALUE,
    );

    $acode    = Acodes->new();
    @outfiles = $acode->create($outfile);
    $acode->registerKeys(\%fieldKeys);
    $acode->addTableEntryKeys(\%tableEntryFields);

    # Start every run from a clean slate so counts do not accumulate when
    # toMeow() is called more than once in a process.
    _reset_record_state();
    $total   = 0;
    $npred   = 0;
    @orgless = ();

    my $savego   = 1;    #? save to produce goass.org list for other uses?
    my @savego   = ();
    my %unused   = ();
    my %sppcount = ();

    my $fh = $self->openSource('LL_tmpl');
    # NOTE(review): $acode was already created above and is not closed on
    # this early return - confirm whether Acodes needs an explicit close.
    return 1 unless ($fh);

    my $datadate = $self->getFileDate('LL_tmpl', '%Y%m%d');

    ## also add predicted genes from GoldenPath annots ... from acode dump
    ## non LL ids are HUgn01xxxxxx
    ## are there any other flds from Gold to add to LL records? - maploc?

  LINE:
    while (my $line = <$fh>) {
        chomp($line);

        #!? redo as a hash of flds/record and process all at end-of-rec?

        if ($line =~ /^>>/) {
            # record separator: flush the previous record, start a new one
            putifrec();
            $mainrec = $acode->newRecord('GENR');
            $nrefs   = 0;
            $nale    = 0;
        }
        elsif ($line =~ /^LOCUSID:\s*(.+)/) {
            $did = $idn = $1;
            $id = $self->tag . sprintf("%07d", $idn);
            $acode->setId($id, $mainrec);
            $atfld = $acode->addField($mainrec, 'ID',  $id);
            $atfld = $acode->addField($mainrec, 'DID', $sourcedb . $did);    ## ref db id
        }
        elsif ($line =~ /^CURRENT_LOCUSID:\s*(.+)/) {
            # indicates obsolete
            $skiprec = 1;
        }
        elsif ($line =~ /^(?:OFFICIAL|PREFERRED)_SYMBOL:\s*(.+)/) {
            $gsym = $1;
            if ($gsym eq 'na') {
                $gsym        = 'LOC' . $idn;
                $ispredicted = 1;    # this is not a good call - see STATUS
                ## predicted flag
                ## also: LOCUS_CONFIRMED: no
            }
            $atfld = $acode->addField($mainrec, 'SYM', $gsym);
        }
        elsif ($line =~ /^(?:OFFICIAL|PREFERRED)_GENE_NAME:\s*(.+)/) {
            $atfld = $acode->addField($mainrec, 'NAM', $1);
        }
        elsif ($line =~ /^ORGANISM:\s*(.+)/) {
            $org = $1;
            $sppcount{$org}++ unless ($skiprec);
            $skiprec = 1 unless ($org =~ m/Homo sapiens/);
            $atfld = $acode->addFieldHash($mainrec, 'ORG', $org);    ##? add even if no ORG fld?
            ## Homo sapiens -- collect only these? skip Mus musculus - same as Jax data
        }
        elsif ($line =~ /^ALIAS_SYMBOL:\s*(.+)/) {
            $atfld = $acode->addFieldHash($mainrec, 'SYN', $1);
        }
        elsif ($line =~ /^MAP:\s*(.+)/) {
            ## syntax is 'position|source/url|kind', with kind = (G = genetic, C = cytogenetic)
            ## MAP: 13q12|HUGO|C|
            my $map = $1;
            $map =~ s/\|.+//;    # keep only the position
            $atfld = $acode->addFieldHash($mainrec, 'MAP', $map);
        }
        elsif ($line =~ /^CHR:\s*(.+)/) {
            $atfld = $acode->addFieldHash($mainrec, 'CHR', $1);
        }
        elsif ($line =~ /^SUMMARY:\s*(.+)/) {
            my $sum = $1;
            $sum =~ s/^Summary:\s*//;
            $sum = splitLongLine($sum);
            $atfld = $acode->addFieldHash($mainrec, 'REAB', $sum);
        }
        elsif ($line =~ /^ALIAS_PROT:\s*(.+)/) {
            $atfld = $acode->addFieldHash($mainrec, 'PRD', $1);    ##? what fld key - like FNC
        }
        elsif ($line =~ /^PREFERRED_PRODUCT:\s*(.+)/) {
            $prefprod = $1;
            $atfld = $acode->addFieldHash($mainrec, 'FNC', $prefprod);    ##? what fld key
        }
        elsif ($line =~ /^PRODUCT:\s*(.+)/) {
            $otherprod = $1;    # add only if no PREFERRED_PRODUCT (see putifrec)
        }
        elsif ($line =~ /^ECNUM:\s*(.+)/) {
            $atfld = $acode->addFieldHash($mainrec, 'ENZ', 'EC:' . $1);
        }
        elsif ($line =~ /^PHENOTYPE:\s*(.+)/) {
            $atfld = $acode->addFieldHash($mainrec, 'PHP', $1);
        }
        ## need some more function et al fields to handle these variants:
        ## PHENOTYPE, PRODUCT, EXTANNOT GO: ...
        elsif ($line =~ /^SUMFUNC:\s*(.+)/) {
            my $sf = $1;
            $sf =~ s/\|Proteome//;
            $atfld = $acode->addFieldHash($mainrec, 'PHP', $sf);
        }
        ## EXTANNOT: Proteome's BioKnowledge Library terms and other annotation from external sources
        # EXTANNOT: subcellular localization|Nuclear|E|Proteome|9070310
        # EXTANNOT: biochemical function|Inhibitor or repressor|NR|Proteome|2404007
        # EXTANNOT: cellular role|Other metabolism|P|Proteome|8063807
        # EXTANNOT: subcellular localization|Microsomal fraction|NR|Proteome|8063807
        # EXTANNOT: organismal role|Cell migration/motility|P|Proteome|7743515
        # EXTANNOT: cellular role|RNA processing/modification|E|Proteome|7654687
        # EXTANNOT: biochemical function|tRNA synthetase|E|Proteome|7654687
        # EXTANNOT: molecular localization|Soluble|P|Proteome|7654687
        elsif ($line =~ /^EXTANNOT:\s*(.+)/) {
            # columns: section|term|evidence code|source|pubmed id
            my ($sect, $term) = split(/\|/, $1);
            $atfld = $acode->addFieldHash($mainrec, _extannot_key($sect), $term);
        }
        # GO: molecular function|tRNA binding|E|GO:0000049|Proteome|7654687
        # GO: biological process|alanyl-tRNA biosynthesis|P|GO:0006419|Proteome|7761427
        # GO: cellular component|soluble fraction|P|GO:0005625|Proteome|7654687
        # GO: molecular function|heparin binding|P|GO:0008201|Proteome|7743515
        # GO: biological process|cell motility|NR|GO:0006928|Proteome|na
        elsif ($line =~ /^GO:\s*(.+)/) {
            # Anchored at line start: every GO value itself contains "GO:nnn",
            # and so may values of other tags handled further down.
            my $gol = $1;
            my ($sect, $term, undef, $goid) = split(/\|/, $gol);
            $sect = '' unless defined $sect;
            $term = '' unless defined $term;
            $goid = '' unless defined $goid;

            my ($key, $aspect) = _go_key_and_aspect($sect);
            if ($key eq 'PHP') {
                print STDERR "Odd GO: $line\n" if ($Meow::debug);
            }
            push(@savego, "$gol|$org|$did|$gsym|$aspect") if ($savego);
            $atfld = $acode->addFieldHash($mainrec, $key, "$term ; $goid");
        }
        elsif ($line =~ /^ACCNUM:\s*(.+)/) {
            my $na = $1;
            $na =~ s/\|.+//;
            $atfld = $acode->addFieldHash($mainrec, 'DBA', "NA:$na");    ##? don't want seq accs ?
        }
        elsif ($line =~ /^UNIGENE:\s*(.+)/) {
            $unigene = $1;
            $atfld = $acode->addFieldHash($mainrec, 'DBL', "UNIGENE:$unigene");
        }
        elsif ($line =~ /^OMIM:\s*(.+)/) {
            $atfld = $acode->addFieldHash($mainrec, 'DBL', "OMIM:$1");
        }
        elsif ($line =~ /^NM:\s*(.+)/) {
            my ($acc) = split(/\|/, $1);    # accession|gi|strain
            $atfld = $acode->addFieldHash($mainrec, 'RSQ', "REFSEQ:$acc");
        }
        elsif ($line =~ /^NP:\s*(.+)/) {
            my ($acc) = split(/\|/, $1);    # accession|pid|structure
            $atfld = $acode->addFieldHash($mainrec, 'RPA', "REFPROT:$acc");
        }
        # jun02 add these to link w/ ncbigenomes proteins
        # XM: XM_045113|14727714|na
        # XP: XP_045113|14727715|na ***
        elsif ($line =~ /^XM:\s*(.+)/) {
            my ($acc) = split(/\|/, $1);
            $atfld = $acode->addFieldHash($mainrec, 'DBA', "XM:$acc");    ##? don't want seq accs ?
        }
        elsif ($line =~ /^XP:\s*(.+)/) {
            my ($acc) = split(/\|/, $1);
            $atfld = $acode->addFieldHash($mainrec, 'PAC', "XP:$acc");
        }
        #? add these as gene CLA ss ?
        # LOCUS_TYPE: pseudogene
        # LOCUS_TYPE: gene with protein product, function known or inferred
        elsif ($line =~ /^LOCUS_TYPE:\s*(.+)/) {
            my $loctype = $1;
            $acode->addField($mainrec, 'CLA', 'Pseudogene')
                if ($loctype =~ /pseudogene/i);
        }
        elsif ($line =~ /^STATUS:\s*(.+)/) {
            # LL_tmpl documents the values in upper case (PREDICTED), so the
            # match has to ignore case.
            my $status = $1;
            $ispredicted = 1 if ($status =~ /predicted/i);
            # $acode->addField( $mainrec, 'CLA', 'Predicted') if ($status =~ /predicted/i);
        }
        #Values in LOCUSLINK:STATUS - jun02
        #  model       74294 ** these also include genesym == 'na'; ncbi-model, supported by mRNA alignments
        #  predicted    9600 ** -- for human: 3398 entries
        #  provisional 20785
        #  reviewed     5341
        #  withdrawn     117
        elsif ($line =~ /^LINK:\s*(.+)/) {
            my $url = $1;
            # if have UNIGENE and link == same UniGene id, skip
            next LINE if _is_unigene_dup($url, $unigene);
            # elsif ($org eq 'Mus musculus' && $url =~ m/(MGI:\d+)/) {
            #   $mgid= $1; # save to compare w/ mouse MGI data
            #   $mgilist{$mgid}++;
            # }
            $atfld = _add_link_field($url);
        }
        elsif ($line =~ /^DB_LINK:\s*(.+)/) {
            $atfld = _add_link_field($1);
        }
        elsif ($line =~ /^(\w+):\s*(.+)/) {
            ## unused tag... debug output
            $unused{$1} = $2;
            $atfld = undef;
        }
        else {
            $acode->appendField($atfld, $line) if defined($atfld);    ## continuation line ..?
        }
    }
    close($fh);
    putifrec();    # flush the last record

    if ($savego) {
        my $fn = $self->goass;
        # Only write when the file really opened; the lexical handle is
        # passed straight to saveGoAssoc().
        if (open(my $gaf, '>', $fn)) {
            saveGoAssoc($gaf, \@savego, $datadate);
            close($gaf) or warn "cant close $fn: $!";
        }
        else {
            warn "cant write $fn: $!";
        }
    }

    # FIXME:
    # $self->addGoldPredicted(); # 01feb

    $acode->close();

    _print_report(\%sppcount, \%unused);

    # open(MGI,">$todir/LocusLink-MGI.list");
    # foreach $mgid (sort keys %mgilist) { print MGI "$mgid\n"; }
    # close(MGI);

    return 0;
}

#-------------
# addGoldPredicted()
#   Append the predicted-gene records (ids HUgn01xxxxxx) from the
#   'HUgn-gold.acode' source to the currently open acode library, writing
#   an (offset, size) index entry for each, and count them in $npred.
#   Uses the handles in $acode->{alib} / $acode->{aidx}.
#   Returns 1 when the source cannot be opened, otherwise the result of
#   closing it.  Currently not called (see FIXME in toMeow).
sub addGoldPredicted {
    my $self = shift;
    my $fh   = $self->openSource('HUgn-gold.acode');
    return 1 unless ($fh);

    my $alib = $acode->{alib};
    my $aidx = $acode->{aidx};
    $npred = 0;

    # Records end with "# EOR"; 'local' restores the separator on exit even
    # if something below dies.
    local $/ = "# EOR\n";
    while (my $data = <$fh>) {
        next unless $data =~ /ID\|HUgn(01\d+)/;
        my $idnum = $1;

        my $at = tell($alib);
        print {$alib} $data;
        my $size = tell($alib) - $at;

        # fixed-width index entry, stored at slot number $idnum
        my $record = pack("LL", $at, $size);
        seek($aidx, $idnum * length($record), 0);
        print {$aidx} $record;
        $npred++;
    }
    return close($fh);
}

# addGoldFields()
#   Placeholder, intentionally empty.
sub addGoldFields {
    ## HERE? add GoldenPath extra flds of any:
    ## check these flds for extra data: DBL CHR MAP BLOC SYN
    ## CLA|Predicted
    # my $gprec= $gpacode->readId($id, 'HUgn');
    # my $gpar= new AcodeRecord(); $gpar->fromtext( $gprec);
    # my %fkeys= $gpar->getFieldKeys();
    # foreach my $fk (keys %fkeys) {
    #   @fvals= $gpar->getAllFields($fk);
    #   if ($mainrec->haskey($fk)) { xxx; }
    #   else { $acode->addField($mainrec, $fk, $fvals[$i]); }
    # }
}

#-------------
# org2taxa( $organism )
#   Return the "taxon:<id>" string for an organism name.  Names that are
#   not in the table come back as "taxon:<name>".
sub org2taxa {
    my $org = shift;
    $org = '' unless defined $org;
    return 'taxon:7227'  if ($org =~ m/Drosophila melanogaster/);
    return 'taxon:9606'  if ($org =~ m/Homo sapiens/);
    return 'taxon:7955'  if ($org =~ m/Danio rerio/);    # was 27702 == Danio frankei
    return 'taxon:10090' if ($org =~ m/Mus musculus/);
    return 'taxon:10116' if ($org =~ m/Rattus norvegicus/);
    return "taxon:$org";    #???
}

#-------------
# saveGoAssoc( $gaf, \@goa, $date )
#   Write one tab-delimited gene-association line per entry of @goa to the
#   already-open handle $gaf (glob or lexical), then print a per-organism
#   count to STDERR.
#   Each entry is the GO value from LL_tmpl with four fields appended:
#     section|term|evidence|GO id|source|pubmed id|organism|locus id|symbol|aspect
#
# 14 fields:
# DB,DB_Object_ID,DB_Object_Symbol,[NOT],GOid,DB:Reference(|DB:Reference),Evidence,
# With,Aspect,DB_Object_Name(|Name),DB_Object_Synonym(|Synonym),DB_Object_Type,taxon(|taxon),Date
# SGD,S0000296,PHO3,,GO:0015888,SGD:8789|PMID:2676709,IMP,,P,,YBR092C,gene,taxon:4932,20001122
# GO: molecular function|tRNA binding|E|GO:0000049|Proteome|7654687
# GO: biological process|alanyl-tRNA biosynthesis|P|GO:0006419|Proteome|7761427
sub saveGoAssoc {
    my ($gaf, $rgoa, $date) = @_;
    $date = '' unless defined $date;

    # LocusLink evidence code => association-file evidence code  #??
    my %evidence_for = (NR => 'ND', E => 'IEA', P => 'ISS');
    my %count_of_org;

    foreach my $entry (@{ $rgoa || [] }) {
        my @fld = split(/\|/, $entry);
        push(@fld, '') while (@fld < 10);    # short lines: pad, don't warn
        my (undef, undef, $scode, $goid, $ptm, $pid, $org, $did, $gsym, $asp)
            = @fld;    ## is $pid proper accnum here?

        my $evid = exists $evidence_for{$scode} ? $evidence_for{$scode} : $scode;
        my $db_xref = ($pid eq 'na') ? '' : "$ptm:$pid";
        my $taxa = org2taxa($org);

        # NOT, With, Object_Name and Synonym columns are always empty here.
        print {$gaf} join("\t",
            'LL', $did, $gsym, '', $goid, $db_xref, $evid,
            '', $asp, '', '', 'gene', $taxa, $date), "\n";
        $count_of_org{$org}++;
    }

    print STDERR "GO associations: \n";
    foreach my $name (sort keys %count_of_org) {
        print STDERR " $name = $count_of_org{$name}\n";
    }
    return;
}

#-------------
# putifrec()
#   Finish the record being built: store it unless it is flagged to be
#   skipped or is not human, then clear the per-record state.  Safe to call
#   when no record is open (first separator line, empty input).
sub putifrec {
    if ($mainrec) {
        ## these seem to be obsolete (2ndary ID) records
        push(@orgless, $id) unless ($org);
        $skiprec = 1 unless ($org =~ m/Homo sapiens/);
    }

    if ($mainrec && !$skiprec) {
        # PRODUCT is only a fallback for a missing PREFERRED_PRODUCT
        if ($otherprod && !$prefprod) {
            $atfld = $acode->addFieldHash($mainrec, 'FNC', $otherprod);
        }
        my $hg = $hgtable{$id};
        $acode->addField($mainrec, 'HG',  $hg)         if ($hg);
        $acode->addField($mainrec, 'CLA', 'Predicted') if ($ispredicted);

        ## HERE? add GoldenPath extra flds of any:

        $acode->packFields($mainrec);
        $acode->putRec($mainrec);
        $total++;
    }

    _reset_record_state();
    return;
}

#-------------
# url2dbs( $url )
#   Recode a known database URL as "DBKEY:id" using %urldbs, or as
#   "DBKEY:link" when the matching pattern captures nothing.
#   Returns '' for an unrecognised (or undefined) URL.
sub url2dbs {
    my ($url) = @_;
    return '' unless defined $url;

    # Longest pattern first, so a specific entry such as
    # www.gene.ucl.ac.uk/nomenclature/ is tried before the generic
    # www.gene.ucl.ac.uk/ one; ties broken by name for a stable result.
    my @patterns =
        sort { length($b) <=> length($a) || $a cmp $b } keys %urldbs;

    foreach my $dbpat (@patterns) {
        next unless $url =~ m{$dbpat};
        my $dbkey = $urldbs{$dbpat};
        # test definedness/length, not truth: an id of "0" is still an id
        return (defined $1 && length $1) ? "$dbkey:$1" : "$dbkey:link";
    }
    return '';
}

#-------------
# splitLongLine( $text )
#   Soft-wrap long text.  Text of 150 characters or fewer is returned
#   untouched.  Otherwise every line longer than 100 characters gets the
#   first space at or after column 80 turned into a newline, repeating 80
#   columns after each break.  No characters are added or removed.
sub splitLongLine {
    my ($s) = @_;
    return $s unless (defined $s && length($s) > 150);

    my @lines = split(/\n/, $s);
    foreach my $text (@lines) {    # $text aliases the array element
        next unless length($text) > 100;
        my $len = length($text);
        my $k   = 80;
        while ($k < $len) {
            my $at = index($text, ' ', $k);
            last unless $at > 0;    # no more spaces: leave the rest alone
            substr($text, $at, 1) = "\n";
            $k = $at + 80;
        }
    }
    return join("\n", @lines);
}

1;

__END__

## these are recoded as DBNAME:id
## and report software inserts html link from runtime changeable list
# URL|http://gdbwww.gdb.org/gdb-bin/genera/accno?GDB:118953
# URL|http://bioinformatics.weizmann.ac.il/cards-bin/carddisp?A12M4
# URL|http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?locusId=10
# URL|http://www-lecb.ncifcrf.gov/cgi-bin/dbEngine?mitoDat,getDataByID,MD500177
# URL|http://www.ncbi.nlm.nih.gov/ncicgap/cgaptsorec.cgi?rec=25
# URL|http://www.uwcm.ac.uk/uwcm/mg/search/118958.html
# URL|http://ixdb.mpimg-berlin-dahlem.mpg.de/bin/ixdbcmd.cgi?cmd=showObj&id=533401
# URL|http://www.geneclinics.org/profiles/fap/
# URL|http://www.geneclinics.org/profiles/ataxia-telangiectasia
# URL|http://expasy.hcuge.ch/cgi-bin/get-enzyme-entry?3.5.1.23 << change to ENZ|
# URL|http://www.gene.ucl.ac.uk/nomenclature/KCN.shtml
# URL|http://genome.nhgri.nih.gov/histones/
# URL|http://drnelson.utmem.edu/CytochromeP450.html
# URL|http://www.gene.ucl.ac.uk/users/hester/tnftop.html
# URL|http://www.ebi.ac.uk/imgt/hla/
# URL|http://expasy.hcuge.ch/cgi-bin/get-prodoc-entry?PDOC00550
#
URL|http://www.expasy.ch/cgi-bin/get-prodoc-entry?PDOC00550 # URL|http://imgt.cnusc.fr:8104/ ## keep as URL?? # URL|http://www.albumin.org # URL|http://bio.bu.edu/~vfunari/ # |http://perso.curie.fr/Thierry.Soussi/APC.html # |http://www.mcgill.ca/androgendb/ # URL|http://www.med.ic.ac.uk/dd/ddhc/default.htm ### add these url codes, jan'2000 #'www.ncbi.nlm.nih.gov/UniGene/clust.cgi?ORG=Hs&CID=(.+)' => UNIGENEHS or 'UNIGENE.Hs' # ^^ these probably are all dups of UNIGENE field, redundant #'zearth.kazusa.or.jp/huge/gfpage/([^/]+)/' => HUGE # A Database of Human Unidentified Gene-Encoded Large Proteins Analyzed # by Kazusa cDNA Project #'www.ncbi.nlm.nih.gov/prow/guide/([0-9]+)_g.htm' => PROW # An International WWW Resource/Journal on Human Proteins add parsing of ref seq - can be many / gene LOCUS_STRING: [SET][alphanumeric] [optional] (only if a reference sequence exists) NM: the value in the LOCUS field of the RefSeq record [alphanumeric] [optional] (only if a reference sequence exists) NP: the RefSeq accession number for an mRNA record [alphanumeric] [optional] (only if a reference sequence exists) the RefSeq accession number for a protein record, followed by the PID for that protein and either MMDB or na (values separated by |. If MMDB is present, it indicates there are structure data available for a protein related to the protein referenced by the PID. 
LOCUS_STRING: PAX2b NM: NM_000278 NP: NP_000269|4557821|MMDB LOCUS_STRING: PAX2e NM: NM_003990 NP: NP_003981|4557829|MMDB perl -e 'while(<>){if (m/^LINK/ && m/(MGI:\d+)/){print "$1\n";}}' ## locuslink LL species counts as of 4 April 2000 724 ORGANISM: Danio rerio 19497 ORGANISM: Drosophila melanogaster 14539 ORGANISM: Homo sapiens 13295 ORGANISM: Mus musculus 2327 ORGANISM: Rattus norvegicus =FIXME 17 July 2000: ACCNUM: accession|gi|strain ^added PROT: accession|gi|structure information (no longer printed if there are no data) 13 Aug 2000: added tag LOCUS_TYPE added loc2cit file: (locus_id and citations) tab-delimited: LocusID, PubMed id, MedLine uid updated daily 2 November 2000: modified PROT reporting: PROT: accession|gi (structure link discontinued) One line per protein accession added structures to support NCBI's model transcripts from genomic analysis (MODEL, LOCUS_STRING, XM, XP) added domains predicted on proteins: CDD added links to the predicted mouse-human comparative map: COMP 5 December: added homol_seq_pairs a tab-delimited file of related mRNA sequences associated with current locus_ids 01 February 2001 modified representation of STS so each marker has one line in LL_templ and so that the method of assigning the marker to the gene is explicit added functional representation ** SUMFUNC: brief summary of the function ** GO: Gene Ontology terms ** EXTANNOT: Proteome's BioKnowledge Library terms and other annotation from external sources 12 February 2001 loc2ref and loc2acc: added column 5 to provide protein accessions =cut === nov01 LocusLink !! GO: has been removed !? 
why added functional representation SUMFUNC: brief summary of the function GO: Gene Ontology terms EXTANNOT: Proteome's BioKnowledge Library terms and other annotation from external sources 12-14 February 2001 loc2ref and loc2acc: added column 5 to provide protein accession added section to represent related loci RELL: block to describe other genes (LocusID) related to the one being reported 12 September 2001: (first documented) GRIF: block to report text supplied by the public, describing the function of a gene or other critical aspect of a gene, and the PubMed id of the paper in which this was reported RELL: additional qualifier added to the block to support reporting mRNA accessions that are related to the LocusID, but may not necessarily instantiate that gene. This latter category is used only in those records generated through the genome annotation pipeline. October, 2001: COMP: additional values to support linking to more than one comparative map XR: a new type of RefSeq from the NCBI Annotation Project RNA only (no protein translation product) The LL_tmpl file uses the following conventions: [x|y|z] x or y or z at that position [required] the tag is required, but a value may be null [unique] tag appears only once per record [multiple] tag may appear more than once per record [SET] indicates the beginning of a possible repeat unit for hierarchical tag:value pairs [/SET] end of repeat unit [optional] the tag may not appear in all records Description of LL_tmpl: >>[numeric] record separator; the number equals the LocusID LOCUSID [numeric] [unique] [required] the unique integer id for a locus CURRENT_LOCUSID: [numeric] [unique] [optional] If a LocusID has been merged with another, the current LOCUSID corresponding to the value on the previous LOCUSID line, is provided here. LOCUS_CONFIRMED: [alphanumeric][yes|no] The LOCUSID has been assigned to a confirmed locus and can be treated as an identifier that will be tracked. 
LOCUS_TYPE: [alphanumeric] description of the type of locus ORGANISM: [alphanumeric] [unique] [required]source species (Homo sapiens , Rattus norvegicus, etc.), based on NCBI's Taxonomy RELL: [set][optional][alphanumeric][multiple] description|id|id type|print representation[/set] brief text summarizing the relationship, the other id, the type of id, and the display for that second id. At present these id types of are 2 classes: l for locus_id, n for nucleotide accession official/default symbol for the other locus being described STATUS: [alphanumeric] [optional] (only if a reference sequence exists) [REVIEWED|PROVISIONAL|PREDICTED|MODEL] type of reference sequence record PROVISIONAL: generated automatically from an existing GenBank record and information stored in the LocusLink database; no curation REVIEWED: generated from the most representative, complete GenBank sequence or merge of GenBank sequences and from information stored in the LocusLink database PREDICTED: mRNA from a large-scale sequencing project the CDS has been predicted from the nucleotide sequence, but usually has not been verified MODEL: a model based on NCBI's genomic sequence assembly NG: the RefSeq accession for genomic region (nucleotide) records [SET] NM: the RefSeq accession for a mRNA record [alphanumeric] [optional] (only if a mRNA reference sequence exists) the accession for the mRNA, followed by the gi and the strain, if applicable NC: the accession for chromosome RefSeq records [alphanumeric] [optional] (only if a reference sequence exists) the RefSeq accession for a genomic record, followed by the gi and strain, if applicable. NP: the RefSeq accession for a protein record [alphanumeric] [optional] (only if a reference sequence exists) the RefSeq accession number for a protein record, followed by the PID for that protein and either MMDB or CBLASTP or na (values separated by |). MMDB indicates structure data are available for a protein related to the protein referenced by the PID. 
CBLASTP indicates that related proteins identified by BLASTP can be reviewed from the WWW site. PRODUCT: [alphanumeric] [optional] (only if a reference sequence exists) the name of the product of this transcript TRANSVAR: [alphanumeric] [optional] (only if a reference sequence exists) a variant-specific description ASSEMBLY: [alphanumeric] [optional][multiple] (only if a reference sequence exists)[/SET] CONTIG: [alphanumeric][optional][multiple] the accession of the RefSeq contig XR: [alphanumeric][optional] (only if a model exists) the RefSeq accession of a model RNA, not associated with a protein product [SET] EVID: [alphanumeric] [optional] (only if a model exists) test summary of the evidence for this model XM: [alphanumeric] [optional] (only if a model exists) the accession for the mRNA, followed by the gi and the strain, if applicable XP: the RefSeq accession for a model protein record [alphanumeric] [optional] (only if a model exists) the RefSeq accession of a model protein, followed by the PID for that protein and either MMDB or CBLASTP or na (values separated by |. MMDB indicates structure data are available for a protein related to the protein referenced by the PID. CBLASTP indicates that related proteins identified by BLASTP can be reviewed from the WWW site. CDD: [alphanumeric][multiple][optional] name|key|score|e_value|bit_score --More-- [/SET] ACCNUM: GenBank accession used to assemble the RefSeq record [SET][alphanumeric] [optional] [multiple] nucleotide sequence accession number (no version) one accession number per line, with |strain if available TYPE: [e|m|g] refers to type of nucleotide sequence: e=EST m=mRNA g=genomic PROT: [SET][multiple][optional]A potentially repeating set of two values: accession and identifier (PID value) for the coding region or regions annotated on the associated nucleotide record, one line for each accession If no data are available, na is supplied. The delimiter is |. 
[/SET][/SET] [OFFICIAL|PREFERRED]_SYMBOL: [alphanumeric] [unique] [required] the symbol used for gene reports OFFICIAL: validated by the appropriate nomenclature committee PREFERRED: interim option selected for display na is used for models without evidence [OFFICIAL|PREFERRED]_GENE_NAME: [alphanumeric] [unique] [required (but may be null)] the gene description used for gene reports OFFICIAL: validated by the appropriate nomenclature committee PREFERRED: interim selected for display [NOTES--If the symbol is official, the gene_name will be official. No record will have both official AND interim nomenclature. PREFERRED_PRODUCT: [alphanumeric] [unique] [optional] the name of the product used in the RefSeq record ALIAS_SYMBOL: [alphanumeric][multiple] other symbols associated with this gene ALIAS_PROT: [alphanumeric][multiple] other protein names associated with this gene PHENOTYPE: [SET][alphanumeric][multiple] a phenotype associated with a mutation in this gene PHENOTYPE_ID: [/SET] an ID used for this phenotype. 
For humans, this is the MIM number SUMMARY: [alphanumeric][optional] a summary description of the gene, its products, its significance, and mutant phenotypes UNIGENE: [alphanumeric][multiple] UniGene cluster id(s) associated with this gene OMIM: [numeric][optional][multiple] MIM number CHR: [alphanumeric][optional][multiple] the chromosome assignment MAP: [alphanumeric][optional][multiple] One line, consisting of a repeating set of 3 data elements, each element separated by | the first element is the location; the second is the source (as a URL when appropriate), and the third element is the type of map information (G = genetic, C=cytogenetc) STS: set of STS markers [SET][alphanumeric][optional][multiple] multiline set, one marker per line marker name|chromosome|sts_id|D segment|seq_known|evidence[/SET] evidence types are currently either epcr, or PubMed id(s) COMP: set of comparative map links [alphanumeric][optional][multiple] c_tax_id|c_symbol|c_chromosome|c_position|c_locus_id| q_chromosome|map_name[/SET] the tax_id of the homolog, the symbol of the homolog, the homologous chromosome, the homologous position, the locus_id of the homolog, the chromosome of the source record, the map name ECNUM: [alphanumeric][optional][multiple] BUTTON: [SET][alphanumeric][optional] an web resource accessed by a button, as well as or in addition to text LINK: [/SET][alphanumeric the url underlying the button (note: if there are variation data for this locus at NCBI, the line "BUTTON: snp.gif" will be present) DB_DESCR: [SET][alphanumeric][optional][multiple] The name of an external web site with more information about this locus DB_LINK: [/SET][alphanumeric] the URL PMID: [numeric][multiple] a subset of publications associated with this locus with the link being the PubMed unique identifier comma separated GRIF: [SET][alphanumeric][optional][multiple][/SET] PubMed unique identifier|comment SUMFUNC: [alphanumeric][optional] a brief summary of the function of the products of this 
locus GO: [SET][alphanumeric][optional][/SET] category of term|the term itself|evidence code|GO identifier| source of annotation|PubMed id(s) EXTANNOT: [SET][alphanumeric][optional][/SET] category of term|the term itself|evidence code| source of annotation|PubMed id(s) EXAMPLE (hypothetical only) >>5076 LOCUSID: 5076 LOCUS_CONFIRMED: yes LOCUS_TYPE: gene with protein product, function known or inferred ORGANISM: Homo sapiens STATUS: REVIEWED NM: NM_000278|4557820|na NP: NP_000269|4557821 CDD: Paired Box domain|PAX|517|na|205.797 CDD: 'Paired box' domain|pfam00292|540|na|214.756 PRODUCT: paired box protein 2 isoform b TRANSVAR: Transcript Variant: This splice variant (b) does not contain the alter nate exons (6 and 10), and utilizes the normal exon 12 splice junction. ASSEMBLY: M89470 NM: NM_003987|4557822|na NP: NP_003978|4557823 CDD: Paired Box domain|PAX|527|na|209.692 CDD: 'Paired box' domain|pfam00292|540|na|214.756 PRODUCT: paired box protein 2, isoform a TRANSVAR: Transcript Variant: This splice variant (a) includes the alternate exo n 6 but not exon 10, and utilizes the normal exon 12 splice junction. ASSEMBLY: AH006910,M89470 NM: NM_003988|4557824|na NP: NP_003979|4557825 CDD: Paired Box domain|PAX|517|na|205.797 CDD: 'Paired box' domain|pfam00292|540|na|214.756 PRODUCT: paired box protein 2, isoform c TRANSVAR: Transcript Variant: This splice variant (c) includes alternate exon 10 but not exon 6, and utilizes the normal exon 12 splice junction. ASSEMBLY: L25597,M89470 NM: NM_003989|4557826|na NP: NP_003980|4557827 CDD: Paired Box domain|PAX|527|na|209.692 CDD: 'Paired box' domain|pfam00292|540|na|214.756 PRODUCT: paired box protein 2, isoform d TRANSVAR: Transcript Variant: This splice variant (d) includes the alternate exo n 6 but not exon 10, and also utilizes an alternate exon 12 splice junct ion that results in a different COOH-terminus. 
ASSEMBLY: AH006910,M89470 NM: NM_003990|4557828|na NP: NP_003981|4557829 CDD: Paired Box domain|PAX|527|na|209.692 CDD: 'Paired box' domain|pfam00292|540|na|214.756 PRODUCT: paired box protein 2, isoform e TRANSVAR: Transcript Variant: This splice variant (e) includes the alternate exo n 6, lacks alternate exon 10, and uses an alternate exon 12 splice junct ion that results in a different COOH-terminus. ASSEMBLY: M89470 CONTIG: NT_008874 EVID: supported by alignment with mRNA XM: XM_005943|11432394|na XP: XP_005943|11432395|na EVID: supported by alignment with mRNA XM: XM_005944|11432408|na XP: XP_005944|11432409|na EVID: supported by alignment with mRNA XM: XM_005945|11432403|na XP: XP_005945|11432404|na EVID: supported by alignment with mRNA XM: XM_005946|11432412|na XP: XP_005946|11432413|na EVID: supported by alignment with mRNA XM: XM_005947|11432398|na XP: XP_005947|11432399|na ACCNUM: L09747|292380|na TYPE: g ACCNUM: U45245|3649601|na TYPE: g PROT: AAC63385|1469415 ACCNUM: U45247|1469405|na TYPE: g PROT: AAC63385|1469415 ACCNUM: U45248|1469406|na TYPE: g PROT: AAC63385|1469415 ACCNUM: U45249|1469407|na TYPE: g PROT: AAC63385|1469415 ACCNUM: U45250|1469408|na TYPE: g PROT: AAC63385|1469415 ACCNUM: U45251|1469409|na TYPE: g PROT: AAC63385|1469415 ACCNUM: U45252|1469410|na TYPE: g PROT: AAC63385|1469415 ACCNUM: U45253|1469411|na TYPE: g PROT: AAC63385|1469415 ACCNUM: U45254|1469412|na TYPE: g PROT: AAC63385|1469415 ACCNUM: U45255|1469413|na TYPE: g PROT: AAC63385|1469415 ACCNUM: L25597|438649|na TYPE: m PROT: AAA36417|438650 ACCNUM: M89470|409138|na TYPE: m PROT: AAA60024|409139 OFFICIAL_SYMBOL: PAX2 OFFICIAL_GENE_NAME: paired box gene 2 PREFERRED_PRODUCT: paired box protein 2, isoform d PREFERRED_PRODUCT: paired box protein 2, isoform e PREFERRED_PRODUCT: paired box protein 2 isoform b PREFERRED_PRODUCT: paired box protein 2, isoform a PREFERRED_PRODUCT: paired box protein 2, isoform c SUMMARY: Summary: PAX2 encodes paired box gene 2, one of many human 
homologues of the Drosophila melanogaster gene prd. The central feature of this transcription factor gene family is the conserved DNA-binding paired box domain. PAX2 is believed to be a target of transcriptional suppression by the tumor suppressor gene WT1. Mutations within PAX2 have been shown to result in optic nerve colobomas and renal hypoplasia. PAX2 undergoes alternative splicing that results in 5 transcripts, splice variants a-e. CHR: 10 RELL: gene|51441|l|HGRG8 RELL: related mRNA|BC002559|n|XM_033717--BC002559 RELL: related mRNA|NM_016258|n|XM_001812--NM_016258 STS: CHLC.UTR_04354_M89470|10|74159|D10S2478|seq_map|epcr STS: sts-M89470|10|88437|na|seq_map|epcr COMP: Pax2|10|19|19 43.0 cM|18504 ALIAS_PROT: paired box homeotic gene 2 UNIGENE: Hs.155644 BUTTON: unigene.gif LINK: http://www.ncbi.nlm.nih.gov/UniGene/clust.cgi?ORG=Hs&CID=155644 OMIM: 167409 MAP: 10q22.1-q24.3|RefSeq|C| MAPLINK: http://www.ncbi.nlm.nih.gov/cgi-bin/Entrez/maps.cgi?ORG=hum&chr=10&maps=morbid,gene,loc&query=PAX2&VERBOSE=ON&ZOOM=1 PHENOTYPE: Optic nerve coloboma with renal disease PHENOTYPE_ID: 120330 BUTTON: snp.gif LINK: http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?locusId=5076 BUTTON: homol.gif LINK: http://www.ncbi.nlm.nih.gov/HomoloGene/homolquery.cgi?TEXT=5076[loc] BUTTON: gdb.gif LINK: http://gdbwww.gdb.org/gdb-bin/genera/accno?GDB:138771 DB_DESCR: GeneCard for PAX2 DB_LINK: http://bioinformatics.weizmann.ac.il/cards-bin/carddisp?PAX2 DB_DESCR: Human PAX2 Allelic Variant Database DB_LINK: http://www.hgu.mrc.ac.uk/Softdata/PAX2/ PMID: 9439670,9297966,8661132,8431641,8241771,7981748,7819127,7795640,1378753 GRIF: 10958699|Murine orthologue of the human retinal glycoprotein IPM 150 (IMPG1), involved in retinal adhesion and photoreceptor cell survival. Analyses of IPM 150 and IPM 200 core proteins reveals the presence of multiple conserved domain of unknown function.
SUMFUNC: Member of the paired domain family of nuclear transcription activators; stimulates transcription of Wilms tumor suppressor gene (WT1)|Proteome GO: molecular function|transcription activating factor|E|GO:0003710|Proteome|8760285 GO: molecular function|DNA binding|P|GO:0003677|Proteome|9106533 GO: biological process|transcription from Pol II promoter|E|GO:0006366|Proteome|8760285 GO: biological process|axonogenesis|P|GO:0007409|Proteome|9106533 GO: biological process|vision|P|GO:0007601|Proteome|9106533 GO: biological process|histogenesis and organogenesis|NR|GO:0007397|Proteome|na EXTANNOT: cellular role|Pol II transcription|NR|Proteome|8760285 EXTANNOT: biochemical function|DNA-binding protein|NR|Proteome|9106533 EXTANNOT: biochemical function|Activator|NR|Proteome|8760285 EXTANNOT: organismal role|Osmoregulation and Excretion|NR|Proteome|9106533 EXTANNOT: organismal role|Photoreception|NR|Proteome|9106533 EXTANNOT: molecular localization|DNA-associated (direct or indirect)|NR|Proteome|9106533 go evid: P == NR == ND E == IEA >>124809 LOCUSID: 124809 LOCUS_CONFIRMED: no LOCUS_TYPE: model, supported by EST alignments ORGANISM: Homo sapiens STATUS: MODEL CONTIG: NT_010765 EVID: supported by alignment with ESTs (1) XM: XM_071927|17479431|na XP: XP_071927|17479432|na PREFERRED_SYMBOL: na PREFERRED_GENE_NAME: hypothetical gene supported by XM_071927 CHR: 17 MAP: 17q21.31|RefSeq|C| MAPLINK: default_human_gene|LOC124809 >>124810 LOCUSID: 124810 LOCUS_CONFIRMED: no LOCUS_TYPE: model, supported by EST alignments ORGANISM: Homo sapiens STATUS: MODEL CONTIG: NT_010765 EVID: supported by alignment with ESTs (1) XM: XM_071926|17479435|na XP: XP_071926|17479436|na PREFERRED_SYMBOL: na PREFERRED_GENE_NAME: hypothetical gene supported by XM_071926 CHR: 17 MAP: 17q21.31|RefSeq|C| MAPLINK: default_human_gene|LOC124810 >>124811 LOCUSID: 124811 LOCUS_CONFIRMED: no LOCUS_TYPE: model, ab initio, with EST support ORGANISM: Homo sapiens STATUS: MODEL CONTIG: NT_010765 EVID:
supported by alignment with ESTs (1) XM: XM_064322|17479439|na XP: XP_064322|17479440|na PREFERRED_SYMBOL: na PREFERRED_GENE_NAME: similar to DBF4/ASK (H. sapiens) CHR: 17 MAP: 17q21.31|RefSeq|C| MAPLINK: default_human_gene|LOC124811