#!/usr/local/bin/perl # pullrefseq.pl =head NOTES collect reference sequence/protein for eugenes gene set updated 1jul01 taken out of meowrefseq.pl (separated from blast/fasta methods) paths for iubio computer this should be automated (1,2 times/month ?) 2002 - updates jun02 (j02) - see NCBI UNIGENE for essentially same data/computes - add more species (rat?, rice, microbes? other RefSeq orgs?) - compare to NCBI HomoloGene, using Megablast on nuc. acids (mail from nat goodman - odd graph clusters in homologene). 2001 - ncbi's genbank/genomes data is mostly stale, but for yeast/SGD which updates it to NCBI alternates: ncbi refseq (man,mouse,rat,? yeast, ?fly, ?others) organism genome projects: wormbase/acedb, tigr-weed, fly-gadfly yeast acode: RPA|PROTID:NP_009395.1 -- yeast has 6,349 NP entries in refseq (jun01), 7,230 entries in eugenes worm acode: RPA|WormBase:W09B12.1 weed: use tigr data set mouse acode - uses locuslink/refseq ids RPA|REFPROT:NP_082129 -- get missing mouse prots from UniGene? man also? fly genome computed prot. aa_gadfly data; use as alternate to RPA? weed genome computed prot data; alternate/only aa data? 3jul01 pull: fish - 939 new refp; 751 old refp; 1,220 eugenes fly - 13492 new; 13432 old refp; 23,649 eugenes; ( incorporates aa_gadfly2 - 147 not found elsewhere ), weed - 23799 new refp (TIGR); 6160 old refp (TAIR); 26,819 eugenes; using tigr pep data now worm - 19609 new refp; old refp 19592; 21,686 eugenes; acedb fasta had 4215 seqs w/o eugenes id match ! man - 11813 new refp ; old refp 10367; 37,049 eugenes; add gnomap mRNA/dna translation for missing ones (2/3 of total)? mouse - 6883 new refprot ; old refprot 5052; 28,280 eugenes yeast - 6174 refprots from refseq; 7,230 eugenes; 6281 old refprots pulled from genbank/genomes CDS worm - 5jul01: 19758 refp (5037 missing eugenes id !?); 21,881 eugenes -? change to use NCBI fasta Defline format: >gb|accession|locus ; genbank >pir||entry ; pir >sp|accession|entry name ; swissprot >ref|accession|locus ; ncbi refseq >gnl|PID|e1632 ; any old db - add extra orgs for comparison $a/genbank/genomes/Bacteria/Escherichia_coli_K12/U00096.faa.Z -- 4289 seqs $a/genbank/genomes/Bacteria/Bacillus_subtilis/AL009126.faa.Z -- 4100 seqs - TIGR has rice data in same format as weed (sequences, gene locs (computed) xml) -- but data is locked now (jul01; was open apr01) ! - rice refprot from swissprot/trembl - 6296 seqs =cut use euGenes::GenomeFeatures; $debug= 0; $makelib= 0; $dofast= 0; $dodna= 0; $doprot= 1; $nofilter= 1; $isMeowId= 0; $isCaselessSymbol= 1; # $fixsyms= 1; ## till get new .fasta data $force= 0; @sys= `uname -X`; %sys= map { chomp; split(' = '); } @sys; if ($sys{Node} eq 'oat') { $TMPD= "/c7/eugenes/tmp/" ; } # elsif ($sys{Node} eq 'kalo') { $TMPD= "/c5/tmp"; } #?? $SRS5Root= '/bio/mb/srs/'; ## kalo/oat path - srs5 $SRS6Root= '/bio/mb/srs61/'; ## kalo/oat path $meowpub= "/bio/meow-pub/eugenes/server/"; $workpath= "/c7/eugenes/refseq02/"; # $genomes= "/c7/eugenes/genomes/ncbi"; ## GONE j02 # $genomeshome= "/bio/biomir-pub/biomirror/genbank/genomes/"; # $wormcds= "CDS.fasta.gz"; # ## $flygad= "/fly/gadfly.CDS.fasta"; # has FBgn ids # $flycds= "CDS.fasta.Z"; # == "/c7/tmp/flybase/bdgpseqs/aa_gadfly.dros.RELEASE2.Z"; # $weedcds= "CDS.fasta.gz"; #== '/c7/eugenes/weed/tigr/SEQUENCES/ATH1.pep.gz'; # # weed/tigr/SEQUENCES now has 13991584 Jun 13 14:59 ATH1.pep - does this include all? # $mosqcds= "cds*.fasta"; $faprefix= 'gnl|euG|'; # for NCBI-fasta format; 'euG' short for euGenes %srsdbs = ( GENBANK => "getz6 '[GENBANK-acc:%val]'", GENBANK_CDS => "getz6 '[GENBANK-acc:%val]>GENBANK_features'", NA => "getz6 '[GENBANK-acc:%val]'", NA_CDS => "getz6 '[GENBANK-acc:%val]>GENBANK_features'", # special for srs6 NUCID => "getz6 '[GENBANK-id:%val]'", REFSEQ => "getz6 '[REFSEQ-acc:%val]'", REFSEQ_CDS => "getz6 '[REFSEQ-acc:%val]>REFSEQ_features'", REFPROT => "getz6 '[REFSEQ-acc:%val]'", PROTID => "getz6 '[REFSEQ-acc:%val]'", #? for yeast only? need trim .1 vers# from NP_009395.1 SPTREMBL => "getz6 '[SWALL-acc:%val]'", # sw, swnew, tr, trnew SWISSPROT => "getz6 '[SWALL-acc:%val]'", #! for fish! SWP => "getz6 '[SWISSPROT-acc:%val]'", # sw & swnew PIR => "getz6 '[PIR-acc:%val]'", ); # SPTREMBL => "getz5 '[libs={swissprot swissnew trembl tremblnew}-acc:%val]'", # SWISSPROT => "getz5 '[libs={swissprot swissnew trembl tremblnew}-acc:%val]'", #! for fish! # SWP => "getz5 '[libs={swissprot swissnew}-acc:%val]'", # ## SWP => "getz6 '[SWISSPROT-acc:%val]'", # PIR => "getz5 '[PIR-acc:%val]'", #... %protdbids = ( worm => 'WormBase', fly => 'Gadfly2', weed => 'TIGR_ATH1', mosquito => 'Ensembl', man => 'NCBI_XP', ); ## ordered to build dna/prot lib @fullorglist= qw(fly man mouse worm weed yeast fish mosquito); # %fullorghash = map { $_ => 1; } @fullorglist; ## TEST # $workpath=''; # parseblastout('meowprot.fa.blastout','meow.hg1.tsv'); # exit; require Meow::SRS; use Getopt::Long; use flybase::sym2id; $ENV{'SRSROOT'}= $SRS6Root; # dang srs::init Meow::SRS::init($SRS6Root); $getz6= "$ENV{SRSEXE}/getz"; $ENV{'SRSROOT'}= $SRS5Root; # dang srs::init Meow::SRS::init($SRS5Root); ## need to do this again before each changed version $getz5= "$ENV{SRSEXE}/getz"; $getz= $getz6; # default now #------- @orgs= (); # %opt=(); # Getopt::Std::getopts('dmpDO:',\%opt); Getopt::Long::GetOptions( 'dna!' => \$dodna, 'protein!' => \$doprot, 'makelib!' => \$makelib, 'Organism=s' => \@orgs, 'Workpath=s' => \$workpath, 'Pubpath=s' => \$meowpub, 'debug!' => \$debug, 'force!' => \$force, 'readmissing=s' => \$readMissingLog, ); if (@orgs) { @orgs= map{ split(/\,/,$_); } @orgs; } else { @orgs= @fullorglist; } $usage= <1) { die "readmissing for one org only - set -Org=xxx"; } #if ($opt{O}) { @wantorg= split(/\,/,$opt{O}); } #unless (@orgs) { @orgs= @fullorglist; } if ($dodna && $makelib) { makeDnaLib(); } if ($doprot && $makelib) { makeProtLib(); } exit(); #---------- sub readLogMissing { my ($runlog,$org) = @_; my ($missed,$nmiss); my $genomefeats= new euGenes::GenomeFeatures; my $orgIdTag= $genomefeats->getOrgIdTag($org); open(F,$runlog) or die $runlog; while() { if (/missing data for $org/) { $missed= 1; warn "For $orgIdTag - $_"; } next unless $missed; last if (/done man/); my @v= grep(/$orgIdTag\d+/, split " "); foreach my $id (@v) { $nmiss++; $missid{$id}++; # print STDERR '.'; # if (($nmiss % 50) == 0) { print STDERR "\n"; } # $xp= $id2db{$v}; # dang, case is changed on xp # if ($xp) { # print "$id : $xp\n" if ($nshow++ < 10); # $ngot++; # } } } close(F); warn "read $nmiss missing IDs from $runlog\n"; } sub makeProtLib { ## drop this ncbi-genomes ; stale data # foreach $org (@orgs) { # next unless($ncbi_genome_paths{$org}); # print STDERR "makelib for $org\n" if $debug; # my $acode= "$workpath/$org/acode"; # my $refseq= "$workpath/$org/refprot.fasta"; # if (isOldTarget($acode,$refseq)) { # rename ($refseq, "$refseq.old") if (-r $refseq); # if ($org eq 'worm') { # fasta2refseq($org, $acode, $refseq, "$workpath/$org/$wormcds", 'WormBase'); } # else { # readgenomes( $org, $acode, $refseq); } # } # } foreach $org (@orgs) { # next if($ncbi_genome_paths{$org}); print STDERR "makelib for $org\n" if $debug; my $acode= "$workpath/$org/acode"; unless (-f $acode) { $acode= "$meowpub/$org/acode"; } my $refseq= "$workpath/$org/refprot.fasta"; if ($force || isOldTarget($acode,$refseq)) { if (!$readMissingLog && -r $refseq ) { rename ($refseq, "$refseq.old"); } %missid= (); %didid= (); if ($readMissingLog) { readLogMissing($readMissingLog,$org); die "no missing IDs" unless scalar(%missid); $nfound += fasta2refseq($org, $acode, $refseq, 1); # "$workpath/$org/", "cds", $protdbids{$org}); } elsif ($org =~ m/(worm|weed|mosquito)/) { $nfound= fasta2refseq($org, $acode, $refseq, 0); # "$workpath/$org/", "cds", $protdbids{$org}); } else { $nfound= readrefs($acode, $refseq, $doprot); if (scalar(%missid)) { if ($org eq 'fly' || $org eq 'man' || $org eq 'mouse') { $nfound += fasta2refseq($org, $acode, $refseq, 1); # "$workpath/$org/", "cds", $protdbids{$org}); } } } if ($debug && scalar(%missid)) { my $nid; my @ids= sort keys %missid; print STDERR "missing data for $org ids n=".scalar(@ids)." :\n"; foreach my $id (@ids) { print STDERR "$id "; print STDERR "\n" if ((++$nid % 10)==0); } } print STDERR "\n------ done $org, pulled $nfound prots\n" if $debug; } } } sub makeDnaLib { warn "FIXME! pull dna needs updating - july01 !"; foreach $org (@orgs) { next unless($org =~ /\w/); my $acode= "$workpath/$org/acode"; unless (-f $acode) { $acode= "$meowpub/$org/acode"; } my $refseq= "$workpath/$org/refseq.fasta"; if ($force || isOldTarget($acode,$refseq)) { rename ($refseq, "$refseq.old") if (-r $refseq); readrefs($acode,$refseq, 0); } } # rename ($dnalib, "$dnalib.old") if (-r $dnalib); # foreach $org (@orgs) { # ## next unless($org =~ /\w/); # my $refseq= "$workpath/$org/refseq.fasta"; # system("/bin/cat $refseq >> $dnalib") if (-r $refseq); # } } # # step thru CDS looking for /gene=$sym # # CDS join(124392..124403,124468..125109) # # /gene="Ccp84Ab" # # /translation="MAFKFVFALAFVAVASAGYAPIAAPQVYHAAPAVATYAHAPVAV" # # getz '[GENBANK-id:AF035197]' -f ftd # # /gene="zyg-9" # # some of these CDS have no /gene, only 1 CDS - use it ! sub parseCDS { my(@cds)= @_; my ($gsym,$tr,$dbid) = ('','',''); my $intr=0; my $range= shift @cds; $range= chomp($range); $range =~ s/^\s*CDS\s+//; foreach (@cds) { if (m|/gene="([^"]+)|) { $gsym= $1; # print STDERR "/gene=$gsym\n" if $debug; } elsif (m|/protein_id="([^"]+)|) { $dbid= "PROTID:$1"; ## is this always best one? } elsif (m|/db_xref="([^"]+)|) { my $dbx= $1; $dbid= $dbx if ($dbx =~ /^GI:/ && $dbid =~ /^PID:/); $dbid= $dbx if ($dbx =~ /^PID:/ && $dbid eq ''); ## PROTID > GI > PID /db_xref="GI:6598620" ## worst: /db_xref="PID:g6322237" -- want only this one? ## no: /db_xref="SGD:S0003760" } elsif ( m|/translation="([^"]+)|) { $tr= $1; $intr=1 unless( m/\"$/ ); #" # print STDERR "/translation=$tr" if $debug; # return ($gsym,$tr,$dbid,$range) unless($intr); } elsif ($intr) { s/^\s+//; if (/\"/) { $intr= 0; s/\".+$//; } #" fixed to chop after " for garbage bug source ... $tr .= $_; # return ($gsym,$tr,$dbid,$range) unless($intr); } } return ($gsym,$tr,$dbid,$range); } sub getsrsq { my($db,$val,$defq)= @_; my $srsq= $srsdbs{$db} || $defq; # NA => "getz6 '[GENBANK-acc:%val]'", unless ($srsq =~ s/getz6/$getz6/) { unless ($srsq =~ s/getz5/$getz5/) { $srsq= "$getz $srsq"; } } if (ref($val) =~ /ARRAY/) { my $vals= join('|',@$val); $srsq =~ s/\%val/$vals/; } else { $srsq =~ s/\%val/$val/; } if ($srsq =~ /$getz6/ && $ENV{'SRSROOT'} ne $SRS6Root) { $ENV{'SRSROOT'}= $SRS6Root; # dang srs::init Meow::SRS::init($SRS6Root); } elsif ($srsq =~ /$getz5/ && $ENV{'SRSROOT'} ne $SRS5Root) { $ENV{'SRSROOT'}= $SRS5Root; # dang srs::init Meow::SRS::init($SRS5Root); } return $srsq; } sub getGeneTrans { my($db, $rsq, $sym, $nosymcheck)= @_; # jul01 - use caseless sym match - too many db mistakes to know if case is preserved # jul01 - bug in getz6 -f ftd : duplicate entries w/ source 1..3931 appended # at end of CDS line !! # jun02 -- URK - got fish NA: accession w/ 38 entries in fish gene recs and 13 CDS ! # RSQ|NA:AC024175 $sym= lc($sym); my $srsq= getsrsq($db."_CDS", $rsq, " '[$db-acc:%val]'"); # print STDERR "$srsq -f ftd\n" if $debug; $/= "\n"; ## "# EOR\n"; #?? is this the bozo? yes my @ftd= `$srsq -f ftq`; ## for srs6, ftk, ftq both seem to pull features # warn "$srsq -f ftq\n" if ($debug); # fish problems missing CDS # (with >GENBANK_features) ## dang, >GENBANK_features shifts all fields to left margin ! no leading '^ ' #CDS <1..1314 # /note="Tbr" #1234567890123456789012 my ($tr,$incds,$intr,$ncds,$asym,$dbid); my @cds; foreach (@ftd) { if (/^\s{0,5}CDS /) { # srs5: /^ CDS / $ncds++; $mysym=0; $incds= 1; $intr=0; @cds= (); push(@cds,$_); # print STDERR $_ if $debug; } elsif (/^\s{0,5}\S/) { # srs5: /^ \S/ $incds= 0; if (scalar(@cds)) { ($asym,$tr,$dbid) = parseCDS(@cds); # lots of valid fish CDS are missed due to no sym match - use other data? # if ($nosymcheck && $debug) { # warn "0. ncds=$ncds,n=".scalar(@cds)." trlen=".length($tr)."\n"; } return $tr if ($tr && ($nosymcheck || lc($asym) eq $sym)); ## was $intr } @cds= (); } elsif (/^\s{10,}/ && $incds) { #srs5: /^ / s/\s+source\s.+$//; #jul01 - bug in getz6 -f ftd patch push(@cds,$_); } } if (scalar(@cds)) { ($asym,$tr,$dbid) = parseCDS(@cds); # if ($nosymcheck && $debug) { # warn "1. ncds=$ncds,n=".scalar(@cds)." trlen=".length($tr)."\n"; } return $tr if ($tr && ($nosymcheck || lc($asym) eq $sym)); ## was $intr } # if ($nosymcheck && $debug) { # warn "2. ncds=$ncds,n=".scalar(@cds)." trlen=".length($tr)."\n"; } return $tr if ($ncds<2); return ''; ##? or $tr } sub getzseq { my($outh,$raccs)= @_; my $nseq= 0; my @accs= @$raccs; my @rsq= (); my $db; foreach my $a (@accs) { $db= $a->[3]; push(@rsq, $a->[0]); } my $srsq= getsrsq($db, \@rsq, " '[$db-acc:%val]'"); # my $fas= `$srsq -d -sf fasta`; ## need -f 'acc' -d -sf fasta when ID != acc my $fas= `$srsq -f 'acc' -d -sf fasta`; foreach my $a (@accs) { my $rsq= $a->[0]; my $id= $a->[1]; my $sym= $a->[2]; my $b= index($fas,$rsq); if ($b<0) { warn "$srsdbs{$db} ($rsq for $id) - no match\n" ; next; } my $e= $b; # scan for blank line or end if ($b>0 && substr($fas,$b-1,1) eq '>') { $b--; } my $l= length($fas); my $fa= ''; while ($e<$l && !$fa) { my $e1= index($fas,"\n",$e); if ($e1>0) { $e= $e1+1; if (substr($fas,$e,1) =~ /[\n\r\s]/) { $fa= substr($fas,$b,$e-$b); $e= $l; } } else { $e= $l; $fa= substr($fas,$b,$e-$b); } } $b= index($fa,">"); if ($b>=0) { $b= index($fa,"\n",$b); if ($b>0) { $b++; $fa= substr($fa,$b); } } next if ($didid{$id}); #? shouldnt be here but fish is getting dups print $outh ">$faprefix$id ; $sym $db:$rsq\n"; print $outh $fa,"\n"; $nseq++; $didid{$id}++; delete $missid{$id}; } return $nseq; # my $ok= `$srsq`; #? need to do 2 lookups ? # if (!$ok) { print STDERR "$srsq - no match\n"; next; } #my $idc= "\">$faprefix$id ; $sym $db\\c\""; # NCBI fasta ? # system("/bin/echo $idc >> $outfile"); # system("$srsq -d -sf fasta >> $outfile"); } sub readrefs { my($file,$outfile,$doprot)= @_; my $nseq= 0; local(*R,*T); open(R,$file) || return -1; print STDERR "readrefs($file,$outfile)\n" if $debug; open(T,">>$outfile") || die $outfile; $outh= *T; my $maxacc= 10; #? 20 # for batch query my @accs= (); my $dbfld, $ldbfld; $/= "# EOR\n"; while () { my($id,$sym,$rsq,$rpa,$db)= ('','',''); $id= $1 if (/\nID\|(.+)/); next unless $id; next if ($didid{$id}); $missid{$id}++; # here for next; delete below when found $sym= $1 if (/\nSYM\|(.+)/); ## check for multiple RSQ/RPA lines? in human data $rsq= $1 if (/\nRSQ\|([\w:-]+)/); $rpa= $1 if (/\nRPA\|([\w:-]+)/); my $srsq= ''; ## ## Need to catch entries w/o RSQ/RPA fields and check alternate sources ? ## at least list missing? my $racc = ($doprot && $rpa) ? $rpa : $rsq; my $docds= ($doprot && !$rpa); $db= "GENBANK"; ($db,$racc)= split(/:/,$racc) if ($racc =~ /:/); next unless ($racc =~ /\w/); ## some bad records? only REFSEQ: $racc =~ s/\.\d+$//; # drop any version# $dbfld= $srsdbs{$db} || "$db-acc"; if (scalar(@accs) > $maxacc || ($dbfld ne $ldbfld && scalar(@accs))) { $nseq += getzseq($outh,\@accs); @accs= (); } $ldbfld= $dbfld; # $srsq= getsrsq($db, $racc, " '[$db-acc:%val]'"); if (!$docds) { push(@accs, [$racc,$id,$sym,$db]); } else { next if ($org eq 'fly' && $db eq 'GENBANK'); # skip this CDS check; use aa_gadfly ? $/= "\n"; my $tr= getGeneTrans($db, $racc, $sym, ($org eq 'fish')); $/= "# EOR\n"; if ($tr) { chomp($tr); ## make sure...## note: $tr from getGeneTrans ends with \n print $outh ">$faprefix$id ; $sym $db:$racc CDS\n$tr\n"; $nseq++; $didid{$id}++; delete $missid{$id}; } else { my $sq= getsrsq($db, $racc, " '[$db-acc:%val]'"); print STDERR "$sq : no CDS for '$sym'\n"; } # next; } # last if ($debug && $nseq >= 100); } if (scalar(@accs)) { $nseq += getzseq($outh,\@accs); @accs= (); } $/= "\n"; close(T); close(R); return $nseq; } sub readrefs0 { my($file,$outfile,$doprot)= @_; my $nseq= 0; local(*R,*T); open(R,$file) || return -1; print STDERR "readrefs($file,$outfile)\n" if $debug; $/= "# EOR\n"; while () { my($id,$sym,$rsq,$rpa,$db)= ('','',''); $id= $1 if (/\nID\|(.+)/); next unless $id; $missid{$id}++; # here for next; delete below when found $sym= $1 if (/\nSYM\|(.+)/); ## check for multiple RSQ/RPA lines? in human data $rsq= $1 if (/\nRSQ\|([\w:-]+)/); $rpa= $1 if (/\nRPA\|([\w:-]+)/); my $srsq= ''; ## ## Need to catch entries w/o RSQ/RPA fields and check alternate sources ? ## at least list missing? ## if (!$doprot && $rsq) { $db= "GENBANK"; ($db,$rsq)= split(/:/,$rsq) if ($rsq =~ /:/); next unless ($rsq =~ /\w/); ## some bad records? only REFSEQ: $rsq =~ s/\.\d+$//; # drop any version# $dbfld= $srsdbs{$db} || "$db-acc"; $srsq= getsrsq($db, $rsq, " '[$db-acc:%val]'"); } if ($doprot && ($rpa||$rsq)) { $db= "GENBANK"; if ($rpa) { ($db,$rpa)= split(/:/,$rpa) if ($rpa =~ /:/); next unless ($rpa =~ /\w/); ## some bad records? only REFSEQ: $rsq =~ s/\.\d+$//; # drop any version# $dbfld= $srsdbs{$db} || "$db-acc"; $srsq= getsrsq($db, $rpa, " '[$db-acc:%val]'"); } else { ($db,$rsq)= split(/:/,$rsq) if ($rsq =~ /:/); next unless ($rsq =~ /\w/); ## some bad records? only REFSEQ: $rsq =~ s/\.\d+$//; # drop any version# $dbfld= $srsdbs{$db} || "$db-acc"; next if ($org eq 'fly' && $db eq 'GENBANK'); # skip this CDS check; use aa_gadfly ? $/= "\n"; my $tr= getGeneTrans($db, $dbfld, $rsq, $sym, ($org eq 'fish')); $/= "# EOR\n"; if ($tr) { chomp($tr); ## make sure...## note: $tr from getGeneTrans ends with \n if (open(T,">>$outfile")) { print T ">$faprefix$id ; $sym $db:$rsq CDS\n$tr\n"; close(T); } $nseq++; $didid{$id}++; delete $missid{$id}; } else { my $sq= getsrsq($db, $rsq, " '[$db-acc:%val]'"); print STDERR "$sq : no CDS for '$sym'\n"; } next; } } if ($srsq) { my $ok= `$srsq`; #? need to do 2 lookups ? if (!$ok) { print STDERR "$srsq - no match\n"; next; } # $idc= "\">$id; $sym $db\\c\""; $idc= "\">$faprefix$id ; $sym $db\\c\""; # NCBI fasta ? ## ? change to use >db|acc|dbid instead of eugenes ID as 1st part of header? ## as in >ref|NP_001321 ; gnl|euG|$egid ; for a refprot system("/bin/echo $idc >> $outfile"); system("$srsq -d -sf fasta >> $outfile"); $nseq++; $didid{$id}++; delete $missid{$id}; } # last if ($debug && $nseq >= 20); } $/= "\n"; close(R); return $nseq; } # # fasta2refseq # read fasta dump of CDS that has synonym Identifiers # (e.g., acedb dump from 'Find sequence;query CDS;Peptides dump.fasta') # and rewrite > line to add meow IDs # sub fasta2refseq { my($org, $orgacode, $outfile, $usemiss)= @_; # my($org, $orgacode, $outfile, $inpath, $inpatt, $dbname)= @_; # constant: # "$workpath/$org/", "cds", $protdbids{$org} my $inpath= "$workpath/$org/"; my $inpatt= "cds"; #? always? my $dbname= $protdbids{$org}; # getSym2Id($org, $orgacode); flybase::sym2id::set4meow($isMeowId); flybase::sym2id::setcaseless($isCaselessSymbol); flybase::sym2id::setDbId(1); # ($org =~ /weed/); flybase::sym2id::readSym2Id($orgacode, $org) if (-f $orgacode); my $ncds= 0; local(*OUTF,*INF,*D); # unlink $outfile; ## NO unless (open(OUTF,">>$outfile")) { warn "can't write $outfile"; return -1; } opendir(D,$inpath) || die $inpath; my @incds= grep( /$inpatt/, readdir(D)); closedir(D); foreach my $infile (sort @incds) { $infile = "$inpath/$infile"; if ($infile =~ /\.(gz|Z)$/) { $ok= open(INF,"gzcat $infile|"); } # else { $ok= open(INF,"cat $infile|"); } # for multiple files else { $ok= open(INF,"$infile"); } unless ($ok) { warn "can't read $infile"; return -1; } print STDERR "fasta2refseq of $org [$infile]\n" if $debug; my $keepseq= 0; while () { if (/^>\s*(\S+)/) { my $sym= $1; my $dbid= ($sym =~ /:/) ? $sym : "$dbname:$sym" ; # acedb CDS have EMBL:xxx my $id; # $faprefix= 'gnl|euG|$id ; $sym $dbid CDS'; # for NCBI-fasta format; 'euG' short for euGenes ## print OUTF ">$faprefix$id ; $sym $dbid CDS\n"; ## if (m/^\>${faprefix}(\w+)[\s\;]+(\S+)\s+(\S+)/) { if (m/^>gnl\|euG\|(\w+)[\s\;]+(\S+)\s+(\S+)/) { $id= $1; $sym= $2; $dbid= $3; } elsif ($org eq 'weed') { # weed fadoc # >68198.m00003#T25K16_3#At1g01030 DNA-binding protein, putative similar # $idval= $tufeatid.'#'.$bacloc.'#'.$publoc; ## urgh! j02 - TIGR_ATH1:At1g01310-68300.m00038 -- changed ids again ## RPA|AGI:51784.m00035#T25K16.1#At1g01010 ## fasta2refseq: missing id for TIGR_ATH1:At1g01010-68300.m00001 ## RPA|AGI:51784.m00035#T25K16.1#At1g01010 # oat% gzgrep At1g01010 weed/cds* |more # >At1g01010-68300.m00001 T25K16.1 NAC domain protein, putative similar to NAC do # main protein NAM GB: AAD17313 GI:4325282 from [Arabidopsis thaliana]; supported # by cDNA: gi_16612276_gb_AF439834.1_AF439834 my ($tufeatid,$bacloc,$publoc); if (m/>(At\dg\d+)\-(\d+.\w+)\s+(\S+)/i) { ($publoc,$tufeatid,$bacloc)= ($1,$2,$3); ## we use which? } elsif ($sym =~ m/#/) { ($tufeatid,$bacloc,$publoc)= split(/#/,$sym); ## we use which? } else { # what now? warn "new/unknown weed cds ID: $sym\n"; if (++$newweedsym > 10) { die "failed: too many bad weed symbols"; } } $publoc =~ s/At(\d)g/At$1g/i; # ! standardize case ! $id= flybase::sym2id::dbid2id('AGI:'.$publoc); unless($id) { $id= flybase::sym2id::symbol2id($publoc); } if ($id) { $sym= $publoc; } else { $id= flybase::sym2id::symbol2id($bacloc); if ($id) { $sym= $bacloc; } } #? need weeobj here? # $weedobj= Meow::Weed->new(); $weedobj->openIdDb('ATgn', 'r'); unless ($id) { my $agiId= $publoc; $agiId =~ s/^\w*://; $agiId =~ s/TIGR_AT/At/i; my $asym= flybase::sym2id::id2symbol($agiId); if ($asym) { $sym= $asym; $id= $agiId; } } # $dbid= "$dbname:$tufeatid"; #? } elsif ($org eq 'fly') { # gadfly fadoc #>CG1009|FBgn0035226|CT1016|FBan0001009 "aminopeptidase" mol_weight=99328 located on: 3L 62A5-62A5; my ($cgsym,$fbid,$ctsym,$anid)= split(/\|/,$sym); ## we use which? my $asym= flybase::sym2id::id2symbol($fbid); if ($asym) { $id= $fbid; $sym= $asym; } else { $id= flybase::sym2id::symbol2id($cgsym); $sym= $cgsym if ($id); } #? $dbid= "$dbname:$id" if ($id); } elsif ($org eq 'man' || $org eq 'mouse') { # ncbi genomes XP proteins - XP_ can match LocusLink/acode PAC|XP: field #>gi|17433806|ref|XP_032749.2| KIAA1634 protein [Homo sapiens] $id= flybase::sym2id::symbol2id($sym); unless($id) { my ($gi,$ginum,$ref,$xpacc)= split(/\|/,$sym); $xpacc =~ s/\..+$//; # drop version# $id= flybase::sym2id::dbid2id($xpacc); $dbid= "$dbname:$xpacc" if ($id); } } else { $id= flybase::sym2id::symbol2id($sym); } unless ($id) { print STDERR "fasta2refseq: missing id for $dbid\n"; $keepseq= 0; } elsif (!$didid{$id}) { if ($usemiss && !$missid{$id}) { $didid{$id}++; $keepseq= 0; } else { $sym= flybase::sym2id::id2symbol($id) || $sym; # print OUTF ">$id; $sym $dbid CDS\n"; print OUTF ">$faprefix$id ; $sym $dbid CDS\n"; $ncds++; $keepseq= 1; $didid{$id}++; delete $missid{$id}; } } else { $keepseq= 0; } } elsif (/^>/) { $keepseq= 0; } elsif ($keepseq) { ## 05jul01 - filter out error lines from worm acedb dump - always after seq/before next print OUTF $_ if(/^\s*[a-zA-Z]/); ## is this safe? } } close(INF); } close(OUTF); return $ncds; } sub isOldTarget( $$) { # usage: if (&isOldTarget( $sourcefile, $targetfile)) { blah; } local($source,$target) = @_; my $res= 0; return $res unless( -f $source); if (! -f $target) { $res= 1; } else { my $targtime= -M $target; ## -M is file age in days.hrs before now $res= (-M $source) < $targtime; } return $res; } __END__ #---------------------------------------------------- ## DROP THIS scan of ncbi genbank/genomes data for now ## read gene sym from gbseq, add as refseq/refprot? to acode??? - ref full csome? or wait for gene sized ref ## only $doprot now ? ## See also (j02): euGenes/GenomeFeatures.pm for same Genbank readgenomes() #sub readgenomes { # my($org, $orgacode, $outfile)= @_; # # my $genomedir= $ncbi_genome_paths{$org}; # unless(-d $genomedir) { warn "can't find genome path for $org"; return 1; } # # # getSym2Id($org, $orgacode); # flybase::sym2id::set4meow($isMeowId); # flybase::sym2id::setcaseless($isCaselessSymbol); # flybase::sym2id::readSym2Id($orgacode, $org) if (-f $orgacode); # # # local(*FOUT); # # unless (open(FOUT,">$outfile")) { warn "can't write $outfile"; return 1; } # unlink $outfile; # # print STDERR "readgenome of $org [$genomedir]\n" if $debug; # %didid= (); # # local(*D); # opendir(D,$genomedir); # @chrs= grep(/\w/,readdir(D)); # closedir(D); # foreach my $chrd (@chrs) { # $chrd= "$genomedir/$chrd"; # next unless (-d $chrd); # $csomeFile= ($fileIsChromosome{$org} ? $chrd : ''); # # opendir(D,$chrd); # my @gbk= grep(/gbk\.(Z|gz)$/,readdir(D)); # closedir(D); # foreach my $gbkf (sort @gbk) { # next if ($org eq 'yeast' && $gbkf =~ /yst_/); ## skip dup data ! # my $gbk= "$chrd/$gbkf"; ##! don't do 1st only - D_m has many in Scaffold/LARGE/ # if (-f $gbk) { # print STDERR "readcsome $gbk\n" if $debug; # ($nfeat,$ncds)= readcsome ($gbk, $org, $outfile, $genomedir); # print STDERR " done, feats = $nfeat, cds= $ncds\n" if $debug; # } # } # # } # # close(FOUT); # # undef %sym2id; #} ## DROP THIS scan of ncbi genbank/genomes data for now #sub readcsome { # my($gbk, $org, $outfile, $genomedir)= @_; # my ($fout, $ftname,$ncds,$nfeat, $infeat); # my @feat= (); # # local(*GB,*FOUT,*FPROT); # $genomeDID= $gbk; # $genomeDID= $1 if ($gbk =~ m=/(\w+)\.gbk\.(Z|gz)=); # # if ($doprot) { # $fout= $outfile; ## "$outpath/$genomeDID-cds.fasta"; # unless (open(FPROT,">>$fout")) { warn "can't write $fout"; return -1; } # } # # unless (open(GB,"gzcat $gbk|")) { warn "can't read $gbk"; return -1; } # while () { # my $line= $_; # if (/^FEATURES /) { $infeat= 1; } # elsif (/^ORIGIN /) { $infeat= -1; last; } # elsif ($infeat) { # if (/^ (\S+) /) { # my $newftname= $1; # if (scalar(@feat)) { # $nfeat++; # ## printFeat(*FOUT,$org,@feat) if ($wantfeat); # # ## need to parse gene feature from D_melanogaster to get /db_xref="FLYBASE:FBgn0005427" # ## otherwise can't match many CDS to ID by gene sym (genbank sym not same as flybase sym!) # ## ditto for SGD / yeast in CDS feature # my $getfeat= ( ($ftname eq 'gene' && $org eq 'fly') ); # ## ! FIXME: ( || ($ftname eq 'CDS' && $org eq 'yeast')); # # if ($getfeat) { # my ($fname,$srange,$gsym,$map,$rdbx,$rfnotes)= parseFeat(@feat); # if ($gsym) { # my $db; # foreach $db (@$rdbx) { # my $did= ''; # # dang this genbank ID is DatabaseID not meow ID ... except for fly # if ($org eq 'fly' && ($db =~ m/(FBgn\d+)/)) { $did= $1; } # ##FIXME: elsif ($org eq 'yeast' && ($db =~ m/SGD:(\w+)/)) { $did= $1; } # if ($did) { # flybase::sym2id::addsyn($did,$gsym); # # $sym2id{ symcase($gsym)}= $did; # } # } # } # } # # $ncds += CDS2Fasta(*FPROT,@feat) if ($ftname eq 'CDS' && $doprot); # } # $ftname= $newftname; # @feat= (); push(@feat,$line); # } # elsif (/^ /) { # push(@feat,$line); # } # } # } # close(GB); # close(FPROT) if ($doprot); # return ($nfeat++,$ncds); #} #sub parseFeat { # my( @feat)= @_; # my ($fname, $srange, $fnote, $val, $map, $gsym,$inrange); # my @dbx= (); # my %fnotes= (); # # my $fline= shift(@feat); # if ($fline =~ /^ (\S+)\s+(\S+)/) { ## 1st line only !? # $fname= $1; # $srange= $2; # $inrange= 1; # } # foreach (@feat) { # if ( m|^\s+/(\w+)\s*=\s*"([^"]+)|) { # $fnote= $1; $val= $2; # $fnotes{$fnote}++; $inrange= 0; # # if ($fnote eq 'gene') { $gsym= $val;} # elsif ($fnote eq 'map') { $map= $val;} # elsif ($fnote eq 'chromosome') { $map= "Chr $val";} #? only in source feature? # ## ^^ not in worm csome files ! # elsif ($fnote eq 'protein_id') { # push(@dbx, "PROTID:$val"); # } # elsif ($fnote eq 'db_xref') { # push(@dbx, $val); # } # } # elsif ( m|^\s+([^"]+)|) { ## " continuation line # if ($inrange) { $srange .= $1; chomp($srange); } # } # else { ## nothing else? # } # } # return ($fname,$srange,$gsym,$map,\@dbx,\%fnotes); #} #sub CDS2Fasta { # my($fhfasta,@cds)= @_; # my $ncds= 0; # my ($sym,$tr,$dbid,$range) = parseCDS(@cds); # if ($sym && $tr) { # $dbid= "GBGENOME:?" unless($dbid); ## messup - need something # $dbid= "$genomeDID:$range"; # ## ^ use gbgenome file-name:337..2799 (feature-range) # # my $id= $sym2id{ symcase($sym) }; # my $id= flybase::sym2id::symbol2id($sym); # if (!$id) { # print STDERR "CDS2Fasta: missing id for $sym\n"; # } # elsif (!$didid{$id}) { # # $sym= $idsym{$id} if($idsym{$id}); # make sure is proper symbol # $sym= flybase::sym2id::id2symbol($id) || $sym; # print $fhfasta ">$id; $sym $dbid CDS\n$tr\n"; # $didid{$id}++; # $ncds++; # } # } # return $ncds; #} ## removed j02 ## use euGenes::GenomeFeatures.pm or other package for these #%spptag = ( # FBgn => 'Fruitfly', # 'Drosophila melanogaster', # HUgn => 'Human', # 'Homo sapiens', # SGgn => 'Yeast', # 'Saccharomyces cerevisiae', # MGgn => 'Mouse', # 'Mus musculus', # CEgn => 'Worm', # 'Caenorhabditis elegans', # ATgn => 'Weed', # 'Arabidopsis thaliana', # ZFgn => 'Zebrafish', # 'Danio rerio', # ); # #%orgtag = ( # FBgn => 'fly', # HUgn => 'man', # SGgn => 'yeast', # MGgn => 'mouse', # CEgn => 'worm', # ATgn => 'weed', # ZFgn => 'fish', # ); # # drop this #%ncbi_genome_paths = ( # yeast => "$genomeshome/S_cerevisiae/", #? or drop this and use REFSEQ ? ## weed => "$genomeshome/A_thaliana/", ## worm => "$genomeshome/C_elegans/", ## fly => "$genomeshome/D_melanogaster/Scaffolds/", ## man => "$genomeshome/H_sapiens/", ## mouse => "$genomeshome/M_musculus/", # ); # drop this #%fileIsChromosome = ( # fly => 0, # man => 1, # mouse => 1, # weed => 1, # worm => 1, # yeast => 1, # ); __END__ ---------- ## add parsing for % identity from align output -- use that instead of E for reports ! #>MGgn0000009; Aanat REFPROT>6752938 # Length = 205 # Score = 432 bits (1098), Expect = e-122 # Identities = 205/205 (100%), Positives = 205/205 (100%) # moved parseblastout to blastout.pl sub parseblastout sub readGenesyms # moved to flybase::sym2id sub symcase sub getSym2Id ############ fasta libs MEOW seqs$1M/c3/tmp/meow/work/refseq/meow.seq fasta -H my.seq /c3/tmp/meow/work/refseq/meow.seq 6 ID|FBgn0020238 RSQ|U84897 RPA|SWP:P92177 # EOR ID|HUgn0000034 RSQ|REFSEQ:NM_000016 RPA|REFPROT:NP_000007 # EOR ## have 9583 total dna entries for fly(3397) + human ## fly RSQ = 3418, RPA = 1877 ## man RSQ , RPA = 6196 creating prot lib: getz '[TREMBL-acc:Q27587]' - no match << in SWISSPROT getz '[TREMBL-acc:P91927]' - no match ?? getz '[TREMBL-acc:Q23708]' - no match ?? getz '[TREMBL-acc:O01666]' - no match -- swissnew getz '[TREMBL-acc:Q94516]' - no match -- swissnew getz '[TREMBL-acc:Q24270]' - no match -- swissprot getz '[TREMBL-acc:Q94514]' - no match -- swissnew getz '[TREMBL-acc:O61305]' - no match -- swissprot getz '[TREMBL-acc:Q94524]' - no match -- swissnew getz '[TREMBL-acc:O46046]' - no match ?? getz '[SWISSPROT-acc:P81829]' - no match -- swissnew getz '[TREMBL-acc:Q94519]' - no match getz '[TREMBL-acc:O01713]' - no match getz '[TREMBL-acc:P91929]' - no match getz '[SWISSPROT-acc:Q27415]' - no match -- swissnew getz '[TREMBL-acc:O61613]' - no match getz '[TREMBL-acc:Q24439]' - no match getz '[TREMBL-acc:Q26463]' - no match getz '[TREMBL-acc:Q94522]' - no match getz '[TREMBL-acc:Q94523]' - no match getz '[TREMBL-acc:O18373]' - no match getz '[TREMBL-acc:P91682]' - no match getz '[TREMBL-acc:O18405]' - no match getz '[TREMBL-acc:O16867]' - no match ---- read genomes for: worm, yeast, weed ... ------> worm_X.ptt.Z <------ NOTE: PID => GI in NCBI.newspeak No definition found 2625 proteins Location Length PID Gene Product 31496..33425 + 231 1825688 R57.2 37015..42479 + 752 1825687 R57.1 similar to human prostrate-specific membrane antigen (PSM) (SP:Q04609) 60210..67935 + 1071 1519654 M02E1.1 78375..80537 - 519 1086673 C04E7.3 C04E7.3 87868..91969 + 535 1086674 C04E7.2 C04E7.2 92921..94446 + 114 1086675 C04E7.1 C04E7.1 106503..110900 + 1058 1118046 R04A9.2 coded for by C. elegans cDNA y ------> (yeast) X.ptt.Z <------ Saccharomyces cerevisiae chromosome X, complete chromosome sequence. 386 proteins Location Length PID Gene Synonym 466..6130 - 1759 6322237 YJL225C 8776..9138 - 121 6322238 PAU1 YJL223C 11475..16124 + 1550 6322239 VTH2 YJL222W 16767..18536 - 590 6322241 FSP2 YJL221C 18243..18695 + 151 6322240 YJL220W 19497..21200 + 568 6322242 HXT9 YJL219W ------> worm_X.gbk.Z <------ LOCUS chr_X 17624844 bp INV 26-MAY-1999 DEFINITION Caenorhabditis elegans Chromosome X. ACCESSION chr_X KEYWORDS . SOURCE Caenorhabditis elegans. ORGANISM Caenorhabditis elegans Eukaryotae; mitochondrial eukaryotes; Metazoa; Nematoda; Secernentea; Rhabditia; Rhabditida; Rhabditina; Rhabditoidea; Rhabditidae; Peloderinae; Caenorhabditis. REFERENCE 1 (bases 1 to 17624844) AUTHORS Sulston,J., Du,Z., Thomas,K., Wilson,R., Hillier,L., Staden,R., Halloran,N., Green,P., Thierry-Mieg,J., Qiu,L., Dear,S., Coulson,A., Craxton,M., Durbin,R., Berks,M., Metzstein,M., Hawkins,T., Ainscough,R. and Waterston,R. TITLE The C. elegans genome sequencing project: a beginning [see comments] JOURNAL Nature 356 (6364), 37-41 (1992) MEDLINE 92168156 REFERENCE 2 (bases 1 to 17624844) AUTHORS The C. elegans Sequencing Consortium. TITLE Genome sequence of the nematode C. elegans: a platform for investigating biology. The C. elegans Sequencing Consortium JOURNAL Science 282 (5396), 2012-2018 (1998) MEDLINE 99069613 FEATURES Location/Qualifiers source 1..17624844 /organism="unknown" gene 31496..33425 /gene="R57.2" CDS join(31496..31594,32045..32141,32187..32239,32424..32603, 32689..32817,33291..33425) /gene="R57.2" /codon_start=1 /evidence=not_experimental /db_xref="PID:g1825688" /translation="MRLLSSSLLVLFLTQFVSSEKKCPAVKKGNLNELCSMLIHAYKK HWFSGACCTAPITPGCRLPRQFFSRKVGNDRRSAPRKPLGKCGTSEVNYQPCTSKGIA NKLFLSCCQLYVPEECHHMCVYETDQSVTRNMVSVQSKFKILTNMRKDSQCRIKHLSS ILYCASQNRDNRKCCLDLDLNAPQLQVGSRCLRMCDPSGTSIDRITKEDVTCLYNWNV IMYCHHAGIREM" gene 37015..42479 /gene="R57.1" CDS join(37015..37132,37904..38218,38434..38688,38770..38906, 39224..39335,39382..39475,39917..40098,40425..40569, 40907..41275,41863..42147,42192..42318,42363..42479) /gene="R57.1" /note="similar to human prostrate-specific membrane antigen (PSM) (SP:Q04609); coded for by C. elegans cDNA yk160f6.3; coded for by C. elegans cDNA CEMSB14F; coded for by C. elegans cDNA yk190d9.3; coded for by C. elegans cDNA yk160f6.5" /codon_start=1 /db_xref="PID:g1825687" /translation="MVKAYIAIAASLIFVFCIAALGVHHSERKFNKFNKVSIDDIHKS DAGVIQDNIKTENIKKYLRIFTKDPHVAGTEANKKVAYEIANAWSEAGLEDVHTLPYE VLLSYPDFENPNSVIIKSSAGKEVFKSKGVSPVIIPDEQSGKYAGHQWLAYAGNGSAS ADVVYINHGTANDFKNLKLMGVDIKGKIALMRYGHGFRGDKIHKAQQAGAIGAILFSD ...." CDS complement(join(78375..79523,80130..80537)) /gene="C04E7.3" /codon_start=1 /evidence=not_experimental /db_xref="PID:g1086673" /translation="MSSHDFTHPVRYTNKNQPGRNSLSRHESGDSSRSTQGHASLPRN TVGPPDPNALSPEGLVSRPLNTGVGPNQQVQEQYNAFSNQNENDRKMMEDPRESAVPN KRAAAAPPWIGYTQTYQGQQKYRNPQKLQQSTIKKRPKPETTSGPKKASKPTSLSEPK KTECSISEDELLSASLAERVQAEVSENEQIENLTEWAQAYKQKQEKSSETKRRSRKTA HREEEQKSKILTVPIGSLSEDQAVKQLKELTKERRRDDALVAEPSAKKKWQQKEVESK ..." ---- test $id= "CEgn0000312"; $_= < 0) { if (substr($_,$x+1,1) eq '|') { $e = $x+1; $at= $e if ($at<0); } else { $e= $x; last; } } if ($at>=0 && $e>$at) { my $s= substr($_, $at, $e-$at); foreach $s (split(/\n/, $s)) { $s =~ s/^\w*\|//; $sym2id{$s}= $id; } } } foreach (keys %sym2id) { print "$_ => $sym2id{$_}\n"; } ==== use gnomap mRNA/CDS features and dna.raw to make pep for missing refseqs? -- human - use gnomap ? -- fly, use gadfly translations ? -- weed, use TIGR transl. ? -- worm - uses wormbase/acedb cds dump -- mouse - only refseq -- fish -- only refseq -- yeast -- refseq has all? jul01 pull: fish/ 939 fly/ 13492 man/ 11813 mouse/ 6883 weed/ 23799 worm/ 19609 yeast/ 6174 nov00 pull: fish/ 751 fly/ 13432 man/ 10367 mouse/ 5052 weed/ 6160 worm/ 19592 yeast/ 6281 === 05jul01 == lots of error lines in worm acedb CDS dump: what is cause? filter out here anyway >CB005O21.c MHKPHKPSILPSLTIFSKSNHKFFAYINPSRSFIFYTSSMLSQSSILLFL LFSVNGALFRFMGSSNEDPLKIVNNHFVEIKKACKDLNQTAIRQLIVATD HDKINITMTMLIFGSYEYYPTEVGYFEEGHIHTTVDMIKPGRKMYMYFFL RPARPSPIFTYPSDDFSLEALGDDESIRVKRWGGWGGRGGWGGGRGYYGG RGGGWGRGGGWGRGGGWGRGYGRGWGK // ERROR: Subsequence coords missing for GAPRA_236 in SUPERLINK_RWRA // ERROR: Subsequence coords missing for GAPRA_238 in SUPERLINK_RWRA // ERROR: Subsequence coords missing for GAPRA_239 in SUPERLINK_RWRA // ERROR: Subsequence coords missing for GAPRA_240 in SUPERLINK_RWRA // ERROR: Subsequence coords missing for GAPRA_241 in SUPERLINK_RWRA // ERROR: Subsequence coords missing for GAPRA_242 in SUPERLINK_RWRA /c7/eugenes/refseq/worm oat% /bio/mb/blast2/formatdb -p T -o T -i refprot.fasta [formatdb] WARNING: Sequence number 4283 (lcl|CEgn0007232;), 24 illegal characte rs were removed: 24 Os >CEgn0007232; C56G7.3 WormBase:C56G7.3 CDS MEAEPLETEQDEWRKERMRVLDEWCEIELKGADALKMDVTREPGSPTVAV APETSLQDLYPLLIATSSSNAPPSATPTDDVTDPEVQVVSRESIDNRSTF NRIIDVCLCRPVKSQMGPKAYTDKTLILKLAQIPYDHENGTHWLLLSDYF NNVSRSLMTSSEYSHVTNPSRVGAHWVTVGFQSATPHTDFRGCGVLGLLQ MHTFTQRVPANLLRAIVLLATTEPNDFPLAVVSINITSILLTQLKKGALD NFGNEIEGLYPFFSALHASAMCRFCSIYKSQKCTLANTQTIFSEITRQLE KSPLSLIMLLNPTNDELINTLL // ERROR: Subsequence coords missing for GAPRA_236 in SUPERLINK_RWRA // ERROR: Subsequence coords missing for GAPRA_238 in SUPERLINK_RWRA // ERROR: Subsequence coords missing for GAPRA_239 in SUPERLINK_RWRA // ERROR: Subsequence coords missing for GAPRA_240 in SUPERLINK_RWRA // ERROR: Subsequence coords missing for GAPRA_241 in SUPERLINK_RWRA // ERROR: Subsequence coords missing for GAPRA_242 in SUPERLINK_RWRA ^^^^^^ lots of these in acedb CDS dump !?? what from? >CEgn0007253; CC4.2 WormBase:CC4.2 CDS MSTVESAAVRLRPVGSLFFLNRPHEKRAFDSLAGSGFDNGFNKRAFDSLA $b/fastacmd -d refprot.fasta -s 'lcl|CEgn0007232;' >lcl|CEgn0007232; C56G7.3 WormBase:C56G7.3 CDS MEAEPLETEQDEWRKERMRVLDEWCEIELKGADALKMDVTREPGSPTVAVAPETSLQDLYPLLIATSSSNAPPSATPTDD VTDPEVQVVSRESIDNRSTFNRIIDVCLCRPVKSQMGPKAYTDKTLILKLAQIPYDHENGTHWLLLSDYFNNVSRSLMTS SEYSHVTNPSRVGAHWVTVGFQSATPHTDFRGCGVLGLLQMHTFTQRVPANLLRAIVLLATTEPNDFPLAVVSINITSIL LTQLKKGALDNFGNEIEGLYPFFSALHASAMCRFCSIYKSQKCTLANTQTIFSEITRQLEKSPLSLIMLLNPTNDELINT LL >>> rest is garbage message: ERRRSUBSEQUENCECRDSMISSINGFRGAPRAINSUPERLINKRWRAERRRSUBSEQUENCECRDSMISSINGFRGA PRAINSUPERLINKRWRAERRRSUBSEQUENCECRDSMISSINGFRGAPRAINSUPERLINKRWRAERRRSUBSEQUENC ECRDSMISSINGFRGAPRAINSUPERLINKRWRAERRRSUBSEQUENCECRDSMISSINGFRGAPRAINSUPERLINKRW RAERRRSUBSEQUENCECRDSMISSINGFRGAPRAINSUPERLINKRWRA === weed data error -- empty peptide entry >ATgn0019611; AT4g15210 AGI:68169.m00130#dl3650c#AT4g15210 CDS >ATgn0019612; AT4g15220 AGI:68169.m00131#dl3655w#AT4g15220 CDS MIQTGEEDEEKATSLEVEFASGNGVDDEEELRLQWATVERLPTFKRVTTALLARDEVSGK error is in TIGR file ftp.tigr.org:/pub/data/a_thaliana/ath1/SEQUENCES/ATH1_chr4.pep.gz Jun 13 15:02 ATH1_chr4.pep.gz ------> ATH1_chr4.pep.gz <------ >68169.m00130#dl3650c#AT4g15210 beta-amylase >68169.m00131#dl3655w#AT4g15220 ABC transporter homolog MIQTGEEDEEKATSLEVEFASGNGVDDEEELRLQWATVERLPTFKRVTTALLARDEVSGK and cds also Jun 13 15:02 ATH1_chr4.cds.gz ------> ATH1_chr4.cds.gz <------ >68169.m00130#dl3650c#AT4g15210 beta-amylase >68169.m00131#dl3655w#AT4g15220 ABC transporter homolog ATGATCCAAACAGGTGAAGAAGATGAAGAAAAGGCAACGTCCCTTGAAGTTGAGTTTGCT okay in seq: ------> ATH1_chr4.cds.gz <------ >68169.m00130#dl3650c#AT4g15210 beta-amylase ATGGCTACCAATTACAACGAGAAGCTTCTTCTTAATTATGTTCCCGTTTACGTTATGCTT also CDS feature locs seem okay in xml file ------> chr4v06112001.xml.gz <------ ==== pull extra human cds from goldenpath?? (genieAltPep is empty table) Human: Genes include 22052 experimentally determined loci and 14997 predicted loci. Refseq NP_ in srs found 11813 of the 22,052 goldenpath mysql has 8243 in genieKnownPep - not worth checking and using? what is source? predicted CDS would be useful -- use instead gnomap feature tables and dna.raw to pull CDS, translate ? gene FLJ20542 - 505753..508068 MEOW:HUgn0054973 mRNA FLJ20542 - join(505753..505933,506072..506156,506540..50663 0,506715..506878,506964..507072,507163..507225,507296..507439,507517..507647,507 740..508068) MEOW:HUgn0054973 Gold:RS.ctg17658-000000.1.0 mysql> select * from genieKnownPep limit 2 -> \G *************************** 1. row *************************** name: RS.ctg101-000000.0.0 seq: MVVLGMQTEEGHCIMLRGLAHSLGGTQVICKVVGLPSSIGFNTSSHLLFPATLQGAPTHFPCRWRQGGSTDNLPA *************************** 2. row *************************** name: RS.ctg13284-000000.3.0 seq: MAQVLIVGAGMTGSLCAALLRRQTSGPLYLAVWDKAEDSGGRMTTACSPHNPQCTADLGAQYITCTPHYAKKHQR FYDELLAYGVLRPLSSPIEGMVMKEGDCNFVAPQGISSIIKHYLKESGAEVYFRHRVTQINLRDDKWEVSKQTGSPEQFD LIVLTMPVPEILQLQGDITTLISECQRQQLEAVSYSSRYALGLFYEAGTKIDVPWAGQYITSNPCIRFVSIDNKKRNIES SEIGPSLVIHTTVPFGVTYLEHSIEDVQELVFQQLENILPGLPQPIATKCQKWRHSQVPSAGVILGCAKSPWMMAIGFPI -- test batch getzseq >gnl|euG|HUgn0000001 ; A1BG REFPROT>NP_570602 >gnl|euG|HUgn0000002 ; A2M REFPROT>NP_000005 >gnl|euG|HUgn0000009 ; NAT1 REFPROT>NP_000653 >gnl|euG|HUgn0000010 ; NAT2 REFPROT>NP_000006 /bio/mb/srs61//bin/solaris/getz '[REFSEQ-acc:NP_060480]' - no match /bio/mb/srs61//bin/solaris/getz '[REFSEQ-acc:NP_060632]' - no match /bio/mb/srs61//bin/solaris/getz '[REFSEQ-acc:NP_060692]' - no match set rvals='NP_570602|NP_060480|NP_000005|NP_060632|NP_000653' getz "[REFSEQ-acc:$rvals]" -d -sf fasta -- ok for refseq >NP_570602 == acc vals getz "[REFSEQ-acc:$rvals]" -f 'acc' -d -sf fasta SPTREMBL => "getz6 '[SWALL-acc:%val]'", # sw, swnew, tr, trnew SWISSPROT => "getz6 '[SWALL-acc:%val]'", #! for fish! SWP => "getz6 '[SWISSPROT-acc:%val]'", # sw & swnew PIR => "getz6 '[PIR-acc:%val]'", set svals='O62621|Q9NF44|P32234|Q9NF45|P29310' getz "[SWALL-acc:$svals]" -d -sf fasta -- cant tell from locid which acc is in result getz "[SWALL-acc:$svals]" -f 'acc' -d -sf fasta -- this works, but need to parse AC line set pvals='T13852|Q9NF44|T13927|Q9NF45|S52154' getz "[PIR-acc:$pvals]" -d -sf fasta getz "[PIR-acc:$pvals]" -f 'acc' -d -sf fasta >gnl|euG|FBgn0025724 ; &bgr;'Cop SWP>COPP_DROME -- SWP:O62621 >gnl|euG|FBgn0010347 ; 1.28 SPTREMBL>Q24300 >gnl|euG|FBgn0010339 ; 128up SWP>128U_DROME -- SWP:P32234 >gnl|euG|FBgn0020238 ; 14-3-3&egr; SWP>143E_DROME >gnl|euG|FBgn0004907 ; 14-3-3&zgr; SWP>143Z_DROME -- SWP:P29310 >gnl|euG|FBgn0010340 ; 140up SPTREMBL>P81928 >gnl|euG|FBgn0004364 ; 18w PIR>T13852 >gnl|euG|FBgn0004364 ; 18w PIR>T13852 -- PIR:T13852 >gnl|euG|FBgn0023416 ; Ac3 PIR>T13927 -- PIR:T13927 >gnl|euG|FBgn0004852 ; Ac76E PIR>T13158 >gnl|euG|FBgn0012034 ; AcCoAS PIR>S52154 /bio/mb/srs61//bin/solaris/getz '[SWALL-acc:Q9NF44]' - no match /bio/mb/srs61//bin/solaris/getz '[SWALL-acc:Q9NF45]' - no match /bio/mb/srs61//bin/solaris/getz '[SWALL-acc:Q9NF52]' - no match refprot: ACCESSION NP_000653 >NP --- ??? why not all of id . another srs parser bug mdieaylerigykksrnkldletltdilqhqiravpfenlnihcgdamdlgleaifdqvv rrnrggwclqvnhllywalttigfettmlggyvystpakkystgmihlllqvtidgrnyi vdagfgrsyqmwqplelisgkdqpqvpcvfrlteengfwyldqirreqyipneeflhsdl ledskyrkiysftlkprtiedfesmntylqtspssvftsksfcslqtpdgvhclvgftlt hrrfnykdntdliefktlseeeiekvlknifnislqrklvpkhgdrffti swissprot: AC P29310; O01665; Q9V5G6; >143Z_DROME MSTVDKEELVQKAKLAEQSERYDDMAQAMKSVTETGVELSNEERNLLSVAYKNVVGARRS SWRVISSIEQKTEASARKQQLAREYRERVEKELREICYEVLGLLDKYLIPKASNPESKVF YLKMKGDYYRYLAEVATGDARNTVVDDSQTAYQDAFDISKGKMQPTHPIRLGLALNFSVF YYEILNSPDKACQLAKQAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDTQGDEA EPQEGGDN