#!/usr/local/bin/perl # blastout.pl =head1 NOTES blastout.pl parser for eugenes species comparison blasts - jun02 - revise to parse ncbi bl -m 8,9 table outputs add parsing for % identity from align output -- use that instead of E for reports ! paths for iubio computer problem - hgtables are missing all non-matches (unless we do self x self blasts - screws up summary statistics # need some blast params in header: blast -b 30 -a 4 -e 1e-30 -p blastp -d /b1/b/work//meow/work/refseq//meowprot.fa -i /b1/b/work//meow/work/refseq//meowprot.fa minprob (-e) in *blastout as # before start Database: /b1/b/work//meow/work/refseq//meowprot.fa 61,635 sequences; 28,932,922 total letters # at end of each out Number of sequences better than 1.0e-30: 11 length of database: 28,932,922 Posted date: Nov 23, 2000 9:31 PM Number of letters in database: 28,932,922 Number of sequences in database: 61,635 =cut use Getopt::Long; use POSIX; $debug= 0; $makelib= 0; $dofast= 0; $doprot= 1; $doselfscore= 0; $blfmt= 9; # read from .cmdline ? $blastvers= 'BLASTP 2.2.2 [Jan-08-2002]'; # get from blast output ! $blastpmaxp= '1e-30'; ## for blastp, higher for blastn. $blastnmaxp= '1e-10'; ## blastn $blastmaxp= $blastpmaxp; $faprefix= 'gnl\|euG\|'; # for NCBI-fasta format; 'euG' short for euGenes $kPctRange= 0; ## was 4; ## don't get too many low scores - or leave in hgtable, but keep from acode/gene record # $meowpub= "/c6/iubio/meow-pub/eugenes/server/"; # $mework= "/c7/eugenes/refseq/"; $refpath= "/c7/eugenes/refseq02/"; $pubpath= "/bio/meow-pub/eugenes/server"; $dnalib= 'refdna.fasta'; # "$refpath/refdna.fa"; $protlib= 'refprot.fasta'; # "$refpath/refprot.fa"; # was meowprot.fa # $hgtabdef= "hgtable"; ##? split into each spp/ folder? # $maxfactor= 1e30; # $seqlib= $protlib; # $blastout= "$seqlib.blastout" unless($blastout); ## ordered to build dna/prot lib ## add non eug: bsub ecoli rat rice @fullorglist= qw(fly worm yeast man mouse mosquito fish weed ); %spptag = ( FBgn => 'Ffly', # 'Drosophila melanogaster', HUgn => 'Human', # 'Homo sapiens', SGgn => 'Yeast', # 'Saccharomyces cerevisiae', MGgn => 'Mouse', # 'Mus musculus', CEgn => 'Worm', # 'Caenorhabditis elegans', ATgn => 'Weed', # 'Arabidopsis thaliana', ZFgn => 'Zfish', # 'Danio rerio', AGgn => 'Mosquito', ); %orgtag = ( FBgn => 'fly', HUgn => 'man', SGgn => 'yeast', MGgn => 'mouse', CEgn => 'worm', ATgn => 'weed', ZFgn => 'fish', AGgn => 'mosquito', ecoli => 'ecoli', bsub => 'bsub', rice => 'rice', # Oryza sativa rat => 'rat', # RNgn ); ## TEST # $mework=''; # parseblastout2('testc.blastout','testc.hgtab',$doprot); # exit; $optok= Getopt::Long::GetOptions( 'dna!' => \$dodna, 'protein!' => \$doprot, 'run!' => \$dorun, 'selfscore!' => \$doselfscore, 'splitall!' => \$dosplit, 'otherorgs!' => \$otherorg, 'blformat=n' => \$blfmt, 'Organism=s' => \@orgs, 'path=s' => \$refpath, 'pubpath=s' => \$pubpath, 'alltogether!' => \$doalltogether, 'debug!' => \$debug, ); #'Workpath=s' => \$workpath, #'Pubpath=s' => \$meowpub, #'input=s' => \$blastout, #'output=s' => \$hgtab, if (@orgs) { @orgs= map{ split(/\,/,$_); } @orgs; $allorgs= 0; } else { @orgs= @fullorglist; $allorgs= 1; } %wantorg = map { $_, 1; } @orgs; # $blopts= getBlastOpts($blastopts,$doalltogether); ($blastout,$hgtab,$cmdlf)= getInOutFiles($orgs[0],$orgs[1]); #$blastout= $blastout1 unless($blastout); #$hgtab= $hgtab1 unless($hgtab); $usage= < $blastout, hgtable output => $hgtab, EOF unless($optok && ($dorun || $dosplit)) { warn $usage; exit; } # -i[nput]=blastout - blast results as input [ $blastout ] # -output=hgtable - homolog table output [ $hgtab ] #------------- %idinfo= (); # hash{id} of hashes {sym},{dbid},{refprot},{xx} if ($dorun) { # do this once here in case we change mind about $hgtab loc/contents foreach $qorg (@orgs) { next unless($allorgs || $wantorg{$qorg}); foreach $sorg (@orgs) { next unless($allorgs || $wantorg{$sorg}); ($blastout,$hgtab,$cmdlf)= getInOutFiles($sorg,$qorg); rename($hgtab,"$hgtab.old") if (-e $hgtab); } } if ($doalltogether) { ($blastout,$hgtab,$cmdlf)= getInOutFiles('all','all'); ## ^^ not right - need hgtab per $org ## but one blastout file if (-r $cmdlf) { open(D,$cmdlf); @blastcmd=; close(D); } warn "\nblastout all x all\n" if $debug; if ($blfmt == 8 || $blfmt == 9) { parseblastout9($blastout,$hgtab,$doprot,$doselfscore); } else { # check $blfmt ? parseblastout2($blastout,$hgtab,$doprot,$doselfscore); } } else { # foreach my $sorg (@orgs) { # #? read all org syms, etc here? or as needed below # } my $norg= scalar(@orgs); for ($iq= 0; $iq < $norg; $iq++) { my $qorg= $orgs[$iq]; readGenesyms($qorg); for ($js= 0; $js < $norg; $js++) { my $sorg= $orgs[$js]; # next if ($sorg eq $qorg); #? or do these - YES (for stats and paralogs) ($blastout,$hgtab,$cmdlf)= getInOutFiles($sorg,$qorg); if (-r $cmdlf) { open(D,$cmdlf); @blastcmd=; close(D); } warn "\nblastout $qorg x $sorg\n" if $debug; readGenesyms($sorg); if ($blfmt == 8 || $blfmt == 9) { parseblastout9($qorg,$blastout,$hgtab,$doprot,$doselfscore); } else { # check $blfmt ? parseblastout2($blastout,$hgtab,$doprot,$doselfscore); } } } } } if ($hgtabAllInOne && ($dosplit || $dorun)) { if ($doselfscore) { readGenesyms(@orgs); } # ok if read already sortNsplit($hgtab, $refpath, $doselfscore); } exit(); #---------- # sort hgtable and split by $org ? col1 = query spp, col 5 = bitscore # open(IN, "sort -k 1,1 -k 5,5rn $path/$featf|") or die $featf; # open(F,">$path/$featf.new") or die "$path/$featf.new"; sub sortNsplit { my ($hgall,$refpath,$doselfscore)= @_; my $head; open(IN, "$hgall") or die $hgall; while() { if (/^#/) { $head .= $_; } else { last; } } close(IN); my ($lorg, $lid); # sort by 1st org, bit_score open(IN, "sort -k 1,1 -k 5,5rn $hgall|") or die $hgall; while() { if (/^\w/) { #my($sid,$x)= split "\t",$_,2; my ($qid, $sid, $qorg, $qsym, $score, $prob, $pctident, $refsq, $dbid ) = split "\t"; (my $qtag= $qid) =~ s/\d+//; my $qorg= $orgtag{$qtag}; # (my $stag= $sid) =~ s/\d+//; # my $sorg= $orgtag{$stag}; unless($qorg) { warn "bad data:$_"; next; } if ($qorg ne $lorg) { close(O) if ($lorg); open(O,">$refpath/$qorg/hgtable") or die "$qorg/hgtable"; warn "writing $refpath/$qorg/hgtable\n" if $debug; print O $head; ## diff by org ! FIXME $lorg= $qorg; } my $selfid= ($sid eq $qid); # ! selfid *should* be at top of list from sort # $doselfscore= 0 if ($selfid); #? turn off? if (!$selfid && $doselfscore && $qid ne $lid) { my $idflds= $idinfo{$qid}; # skip if no $idflds ? my $qsym= $idflds->{sym} || '--'; my $dbid= $idflds->{dbid} || '--'; my $refsq= $idflds->{refprot} || '--'; my $score= '--'; my $prob= 0.0; my $pctident= 100; my @oline= ($qid, $qid, $qorg, $qsym, $score, $prob, $pctident, $refsq, $dbid ); my $oline= join("\t", @oline)."\n"; print O "\n",$oline; } $lid= $qid; ## remove extra low score entries here? # if ($kPctRange && $gottag{$qtag.$stag}) { # my $minp= $gottag{$qtag.$stag} - $kPctRange; # next if($pctident < $minp); ## skip low scores unless 1st of spp. # } print O $_; } } close(IN); close(O); } # from blastrefseq.pl ... sub getInOutFiles { my($org1,$org2)= @_; #globals? my($sourcelib, $querylib, $blastout, $hgtab, $seqfile); if ($doprot) { $seqfile= $protlib; } else { $seqfile= $dnalib; } if ($org1 eq 'all') { $sourcelib= "$refpath/$seqfile"; } else { $sourcelib= "$refpath/$org1/$seqfile"; } if ($org2 eq 'all') { $querylib= "$refpath/$seqfile"; $blastout= "$refpath/blastout.$org1-$org2"; # $hgtab= "$refpath/hgtable.$org1-$org2"; #?? want all source (org1) orgs in one table # $hgtabAllInOne= 1; } else { $querylib= "$refpath/$org2/$seqfile"; $blastout= "$refpath/$org2/blastout.$org1-$org2"; # $hgtab= "$refpath/$org2/hgtable.$org1-$org2"; # $hgtabAllInOne= 0; } $cmdlf= "$blastout.cmdline"; #moved this info to top of $blastout unless( -r $blastout) { if (-r "$blastout.gz") { $blastout .= ".gz"; } } #?? do one table for all orgs then sort & split $hgtab= "$refpath/hgtable.all"; #?? want all source (org1) orgs in one table $hgtabAllInOne= 1; return ($blastout, $hgtab, $cmdlf); # unless (-r $sourcelib) { # $sourcelib= "$sourcelib.gz" if (-r "$sourcelib.gz"); # } # unless (-r $querylib) { # $querylib= "$querylib.gz" if (-r "$querylib.gz"); # } # return ($sourcelib, $querylib, $blastout, $hgtab, $seqfile); } sub hgtabheader { my ($fname,$org,$date,$blstats)= @_; my $o= <) { my($id,$sym,$rsq,$rpa,$db)= ('','',''); $id= $1 if (/\nID\|(.+)/); next unless $id; my %idflds= (); $idflds{sym}= $1 if (/\nSYM\|(.+)/); # $idsym{$id}= $idflds{dbid}= $1 if (/\nDID\|(.+)/); # $dbid{$id} $idflds{refprot}= $1 if (/\nRPA\|(.+)/); ## else { $idflds{refprot}= $fastadefs{$id} } ## FIXME ! # $idflds{refseq}= $1 if (/\nRNA\|(.+)/); $idinfo{$id}= \%idflds; } local $/= "\n"; close(ALIB); $idinfo{$org}= 1; } return %idinfo; } #sub initSymbols { # my( $org, $orgacode)= @_; # # $myorg= $org if ($org); # flybase::sym2id::set4meow($isMeowId); # # flybase::sym2id::setcaseless(1) if ($self->{org} =~ m/weed/); # flybase::sym2id::readSym2Id($orgacode, $org) if (-f $orgacode); #} ## version 8/9 of blast out formats sub putline { my($outh, $sid, $qid, $score, $prob, $pctident) = @_; # $pct= $1; (my $stag= $sid) =~ s/\d+//; (my $qtag= $qid) =~ s/\d+//; my $qorg= $spptag{$qtag} || '--'; ## not here? do after sort by $sid, $score # if ($kPctRange && $gottag{$qtag.$stag}) { # my $minp= $gottag{$qtag.$stag} - $kPctRange; # return () if($pctident < $minp); ## skip low scores unless 1st of spp. # } my $idflds= $idinfo{$qid}; # load info needed/id for all orgs processed? from acode? # need to get refprot, dbid? from fasta deflines my $qsym= $idflds->{sym}; #? my $dbid= $idflds->{dbid}; # my $dbid= $dbid{$sid}; my $refsq= $idflds->{refprot}; ## print HERE $gottag{$qtag.$stag}= $pctident; $qsym= '--' unless($qsym); $score= '--' unless($score); $prob= '--' unless($prob); $refsq= '--' unless($refsq); $dbid= '--' unless($dbid); # add refdb:sid for others' uses #?? which is first $source_id or $query_id ? my @oline= ($sid, $qid, $qorg, $qsym, $score, $prob, $pctident, $refsq, $dbid ); my $oline= join("\t", @oline)."\n"; if ($needstats) { $saveout .= $oline; } else { print $outh $oline; } # $nt++; # $sid= $qsym= $score= $prob= $pctident= ''; return @oline; ## \@oline ? } sub parseblastout9 { my($qorg, $file,$outfile,$doprot,$doselfscore)= @_; local(*R,*O); unless (open(R,$file)) { warn "cannot read $file"; return -1; } warn "\nparsing $file to $outfile\n" if $debug; unless (open(O,">>$outfile")) { warn "cannot write $outfile"; return -1; } my $outh= *O; my @tm= localtime( $^T - 24*60*60*(-M $file) ); my $fdate= POSIX::strftime("%d-%B-%Y",@tm); print $outh &hgtabheader($outfile,$qorg, $fdate, $blstats); my $nl= 0; while () { chomp; $nl++; if (/^#/) { # parse some of this if ($nl == 1) { push(@blastcmd,$_); } elsif ($nl < 3 && /date:/) { push(@blastcmd,$_); } } else { my @v= split "\t"; my ($qid, $sid, $pctident, $alignment_length, $mismatches, $gap_openings, $q_start, $q_end, $s_start, $s_end, $prob, $bit_score ) = @v; # not saved: alignment_length mismatches gap_openings q_start/end s_start/end #? should use pctident * (alignlength / min(srclen,targlen) ) ?? unless( $qid =~ s/$faprefix//) { $qid =~ s/^gi\|\d+\|(\S+)/$1/; } unless( $sid =~ s/$faprefix//) { $sid =~ s/^gi\|\d+\|(\S+)/$1/; } $pctident= int($pctident); $bit_score= int($bit_score); # $refsq =~ s/>/:/; $selfid= ($sid eq $qid); # ($tag= $qid) =~ s/\d+//; # $spp= $spptag{$tag}; # $gotid{$qid}++; putline($outh, $qid, $sid, $bit_score, $prob, $pctident); #? do reciprocal here for tables? putline($outh, $sid, $qid, $bit_score, $prob, $pctident) unless($selfid); $nput++; print STDERR "." if ($debug && ($nput % 100)==0); print STDERR "\n" if ($debug && ($nput % 7000)==0); # $sym= $idsym{$qid} || '--'; # unless($sym); # # $sym= flybase::sym2id::id2symbol($qid) || '--'; # if ($qtag ne $lasttag) { # close(O) if ($lasttag); # if ($otherorg) { $org= $wantorg[0]; } # from $file or @wantorg ?? # # if (!$otherorg) { # $org= $orgtag{$qtag}; # unless( $org && ($allorgs || $wantorg{$org})) { # $validorg= 0; $validquery= 0; next; # } # readGenesyms($org); # } # open(O,">>$refpath/$org/hgtable") || warn "cannot write $refpath/$org/hgtable"; # # print O &hgtabheader("$org/hgtable",$org,$fdate, $blstats) unless($needstats); # $validorg= 1; # $lasttag= $qtag; # } } } close(O); close(R); } sub parseblastout2 { my($file,$outfile,$doprot,$doselfscore)= @_; local(*R,*O); unless (open(R,$file)) { warn "cannot read $file"; return -1; } my @tm= localtime( $^T - 24*60*60*(-M $file) ); my $fdate= POSIX::strftime("%d-%B-%Y",@tm); ## open(O,">$outfile") || return -1; $lasttag= ''; my ($validorg,$validquery)= (0,0); my %donequery= (); my $saveout; my $oline; my $needstats= 1; # my $blstats; my $blstats= {}; my $queryline; my( $nseqs, $nletters, $matrix, $gapo, $gape ); while () { if (/^Reference:/) { $qid= ''; $indat= 0; $getscore= 0; $insum= 0; $nt=0; $minp= 0; $havedb= 0; $maxp= $blastmaxp; %gottag=(); %gotid= (); } elsif (/^BLAST/) { chomp(); $blastvers= $_; } #bad# elsif (/^Query=\s*>?(\w+)/) #elsif (/^Query=\s*(\w\wgn\d+)/) { ## ok elsif (/^Query=\s*(\w+)/) { ## ok $qid= $1; $queryline= $_; if ($otherorg) { $qid= (/^Query=\s*(\S+)/) ? $1 : $qid; $qid =~ s/^gi\|\d+\|(\S+)/$1/; } $donequery{$qid}++; ## to skip dupped sequences ## fixme for $otherorg ## Query= gi|1786183|gb|AAC73113.1| aspartokinase I, homoserine dehydrogenase I [Escherichia coli K12] # ($qtag= $qid) =~ s/\d+$//; if ($otherorg) { $qtag= $file; } elsif (/^Query=\s*(\w\wgn)\d+/) { $qtag= $1; } elsif (m/^Query=\s*\w+\s*;\s+\S+\s+(\w+):\S+/) { $qtag= $1; } if ($qtag ne $lasttag) { close(O) if ($lasttag); if ($otherorg) { $org= $wantorg[0]; } # from $file or @wantorg ?? if (!$otherorg) { $org= $orgtag{$qtag}; unless( $org && ($allorgs || $wantorg{$org})) { $validorg= 0; $validquery= 0; next; } readGenesyms($org); } open(O,">>$refpath/$org/hgtable") || warn "cannot write $refpath/$org/hgtable"; print O &hgtabheader("$org/hgtable",$org,$fdate, $blstats) unless($needstats); $validorg= 1; $lasttag= $qtag; } if ($donequery{$qid}>1) { $validquery= 0; next; } else { $validquery= $validorg; } if ($validquery && $doselfscore) { ## special case, query not in database - write Query= value # Query= yaaI; yaaI ecoli:1 CDS $queryline =~ m/^Query=\s*(\w+)\s*;\s+(\S+)\s+(\S+)/; my $id= $qid; ## KEEP SAME ; $oherseq also my $sym= $2; my $refsq= $3; ## ^^ format change, jul01, to NCBI defline: gnl|euG|egid ; sym org:refprot ?? ## or maybe ref|refacc ; gnl|euG|egid|sym|org my $spp= ($refsq =~ /^(\w+)\:/ ) ? $1 : '--'; my $score= '--'; my $prob= 0.0; my $pct= 100; my $dbid= '--'; $oline= "$qid\t$id\t$spp\t$sym\t$score\t$prob\t$pct\t$refsq\t$dbid\n"; if ($needstats) { $saveout .= $oline; } else { print O $oline; } } } elsif (!$validquery) { next; } elsif (/significant alignments:/) { $indat=1; ## $insum=1; } elsif ($indat && /\s*Database:/) { print O "\n" if ($nt>0); $indat= 0; $getscore= 0; $insum= 0; $nt= 0; } elsif ($needstats) { if (/\s+(\S+) sequences;\s+(\S+) total letters/) { $blstats->{nseqs}= $1; $blstats->{nletters}= $2; } elsif (/Matrix: (\S+)/) { $blstats->{matrix}= $1; } elsif (/Gap Penalties: Existence: (\d+), Extension: (\d+)/) { $blstats->{gapo}= $1; $blstats->{gape}= $2; } # this is after all scores! elsif (/Number of sequences better than ([\w\.\-\+]+):/) { # 1.0e-30 $blstats->{blastmaxprob}= $1; # $blstats = "\# No. sequences=$nseqs; No. letters=$nletters; Expectation cutoff=$blastmaxprob \n"; # $blstats .= "\# Matrix=$matrix; Gap Existence=$gapo; Gap Extension=$gape \n"; print O &hgtabheader("$org/hgtable",$org, $fdate, $blstats); print O $saveout; $saveout= ''; $needstats= 0; } } # Score = 1219 bits (3119), Expect = 0.0 elsif ($getscore && /^\s*Score =\s+(\d+) bits [^E]+Expect =\s+([e\d\.-]+)/) { $score= $1; $prob= $2; ## $getscore= 0; ## $getpct= 1; } # Identities = 611/666 (91%), Positives = 611/666 (91%) elsif ($getscore && m|^\s*Identities = [\d/]+\s+.(\d+)\%|) { $pct= $1; $getscore= 0; if ($kPctRange && $gottag{$tag}) { next if($pct < ($gottag{$tag} - $kPctRange)); ## skip low scores unless 1st of spp. } ## print HERE $gottag{$tag}= $pct; $sym= '--' unless($sym); $score= '--' unless($score); $prob= '--' unless($prob); $refsq= '--' unless($refsq); # add refdb:id for others' uses my $dbid= $dbid{$id}; $dbid= '--' unless($dbid); if ($id) { $oline= "$qid\t$id\t$spp\t$sym\t$score\t$prob\t$pct\t$refsq\t$dbid\n"; if ($needstats) { $saveout .= $oline; } else { print O $oline; } } $nt++; $id= $sym= $score= $prob= $pct= ''; } ## start of align info #>MGgn0000019; Abc8 REFPROT>6752940 # Length = 666 elsif ($indat && /^>(\w+)/) { $id= $1; $insum= 0; $refsq= ''; $sym= ''; if (/^>\w+\s*;\s+([^\s]+)\s+(\w+[>:]\S+)/) { $sym= $1; $refsq= $2; } elsif (/^>\w+\s*;\s+(SGD>\S+)\s+(\w+)/) { $refsq= $1; $sym= $2; } elsif (/^>\w+\s*;\s+(\w+[>:]\S+)/) { $refsq= $1; } $refsq =~ s/>/:/; $selfid= ($id eq $qid); $sym= $idsym{$id} unless($sym); ($tag= $id) =~ s/\d+//; $gotid{$id}++; $spp= $spptag{$tag}; $getscore= 1 unless($gotid{$id}>1); } } close(R); close(O); } __END__ === jun02 blastouts - separated by spp x spp tables - need more work to fill in blast stats (not in -m 9 table out) and to get each query/source refseq ID - need to use .fasta > deflines ./fly/blastout.worm-fly ./fly/blastout.yeast-fly ./fly/blastout.man-fly ./fly/blastout.mouse-fly ./fly/blastout.mosquito-fly ./fly/blastout.fish-fly ./fly/blastout.weed-fly ./worm/blastout.yeast-worm ./worm/blastout.man-worm ./worm/blastout.mouse-worm ./worm/blastout.mosquito-worm ./worm/blastout.fish-worm ./worm/blastout.weed-worm ./yeast/blastout.man-yeast ./yeast/blastout.mouse-yeast ./yeast/blastout.mosquito-yeast ./yeast/blastout.fish-yeast ./yeast/blastout.weed-yeast === 7jul01 blastout oat% find . -name hgtable -exec wc {} \; 61511 438562 3750348 ./fly/hgtable 63713 468973 3936918 ./man/hgtable 38868 288973 2382853 ./mouse/hgtable 29590 184816 1745909 ./weed/hgtable 81648 561754 4872493 ./worm/hgtable 29233 207550 1812848 ./yeast/hgtable (weed, fish not finished yet...) 4894 37774 306646 ./00nov/fish/hgtable 56700 395504 3297658 ./00nov/fly/hgtable 53497 389877 3165053 ./00nov/man/hgtable 28713 213488 1692640 ./00nov/mouse/hgtable 20810 133010 1125062 ./00nov/weed/hgtable 75376 506698 4304445 ./00nov/worm/hgtable 26688 184452 1565009 ./00nov/yeast/hgtable === jul01 -- add more blastout params to top text # fly/hgtable # Homologous genes for fly computed 07-July-2001 with BLASTP 2.1.3 [Apr-1-2001] # BLAST command: blastall -b 30 -a 4 -e 1e-30 -p blastp -d 'refprot.fa' -i refprot.fa # No. sequences=82,808; No. letters=37,984,691; Expectation cutoff=1.0e-30 # ID lookup at http://iubio.bio.indiana.edu/meow/.bin/moidq.html?id # Includes source-id == homolog-id to document source # # tab-separated-values # source homolog organ- gene blast blast pct homolog ref. db # id id ism sym. score prob ident refseq id == jul01 - read some more blastout params and put into methods text Database: /c7/eugenes/refseq//refprot.fa Posted date: Jul 5, 2001 1:12 PM Number of letters in database: 37,984,691 Number of sequences in database: 82,808 Lambda K H 0.319 0.135 0.404 Gapped Lambda K H 0.267 0.0410 0.140 Matrix: BLOSUM62 <<< Gap Penalties: Existence: 11, Extension: 1 <<< Number of Hits to DB: 34300114 Number of Sequences: 82808 Number of extensions: 1474329 Number of successful extensions: 4574 Number of sequences better than 1.0e-30: 84 << get cutoff prob Number of HSP's better than 0.0 without gapping: 4 Number of HSP's successfully gapped in prelim test: 80 Number of HSP's that attempted gapping in prelim test: 4242 Number of HSP's gapped (non-prelim): 96 length of query: 449 length of database: 37,984,691 effective HSP length: 51 effective length of query: 398 effective length of database: 33,761,483 effective search space: 13437070234 effective search space used: 13437070234 T: 11 A: 40 X1: 16 ( 7.4 bits) X2: 38 (14.6 bits) X3: 64 (24.7 bits) S1: 41 (21.7 bits) S2: 335 (133.7 bits) BLASTP 2.1.3 [Apr-1-2001] <<< Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402. Query= ATgn0008890; At2g43810 AGI:5256.m00084#F18O19.8#At2g43810 CDS (91 letters) Database: /c7/eugenes/refseq//refprot.fa 82,808 sequences; 37,984,691 total letters ================== ## this parses summary score table -- drop # elsif ($insum && /^(\w+)/) { # $id= $1; # $refsq= ''; $sym= ''; # if (/^\w+;\s+([^\s]+)\s+(\w+>\w+)/) { # $sym= $1; $refsq= $2; # } # elsif (/^\w+;\s+(SGD>\S+)\s+(\w+)/) { # $refsq= $1; $sym= $2; # } # elsif (/^\w+;\s+(\w+>\w+)/) { # $refsq= $1; # } # $refsq =~ s/>/:/; # # ## if ($id eq $qid) { $qrefsq= $refsq; next; } ##? no, write data below for ref, but $qid eq $id # $selfid= ($id eq $qid); # $sym= $idsym{$id} unless($sym); # # ## need to decide how many to keep, based on score... # $score= substr($_,67); # $score =~ /(\d+)\s+(\S+)/; $prob= $2; $score= $1; # $prob= '1'.$prob if ($prob =~ /^e/); ##? fix for 'e-140' parsing of real # # unless($selfid) { # if ($minp==0 && $prob > 1e-100) { ##$prob>0 # $minp= $prob; $p= $minp * $maxfactor; $maxp= $p if ($p<$maxp); # } # if ($prob > $blastmaxp || ($nt>3 && $prob > $maxp)) { # $insum= 0; next; # } # } # # ($tag= $id) =~ s/\d+//; # $nt++; # $gottag{$tag}++; # $spp= $spptag{$tag}; # # ##! print O "$qid\t$id\t$spp\t$sym\t$score\t$prob\t$refsq\n"; # } ## blast output - default pair align w/ %identities Query= MGgn0000019; Abc8 REFPROT>6752940 (666 letters) Database: refprot.fasta 3545 sequences; 1,827,919 total letters Searching..................................................done Score E Sequences producing significant alignments: (bits) Value MGgn0000019; Abc8 REFPROT>6752940 1219 0.0 MGgn0000024; Abcg2 REFPROT>6752944 183 7e-47 MGgn0000016; Abc10 REFPROT>6671495 88 3e-18 MGgn0008968; Pgy2 REFPROT>6679297 84 4e-17 MGgn0008967; Pgy1 REFPROT>6755046 81 5e-16 MGgn0008969; Pgy3 REFPROT>6755048 79 2e-15 MGgn0011394; Sur2 REFPROT>6755696 57 5e-09 MGgn0007487; Mdrap REFPROT>6678848 50 9e-07 MGgn0000023; Abcd4 REFPROT>6680614 45 3e-05 MGgn0000020; Abcd1 REFPROT>6671497 35 0.027 MGgn0000022; Abcd3 REFPROT>6680612 34 0.060 MGgn0000021; Abcd2 REFPROT>6752942 34 0.078 MGgn0004708; Gem REFPROT>6753972 31 0.52 MGgn0009068; Pkdrej REFPROT>6755086 30 0.89 MGgn0006117; Ifi47 REFPROT>6680359 29 1.5 MGgn0012296; Tgtp REFPROT>6755777 28 4.5 MGgn0008665; Orc4 REFPROT>6754946 28 4.5 MGgn0007705; Mpa2 REFPROT>6678922 27 5.9 MGgn0005068; Guk1 REFPROT>6680137 27 5.9 MGgn0004512; Gab2 REFPROT>6753932 27 5.9 >MGgn0000019; Abc8 REFPROT>6752940 Length = 666 Score = 1219 bits (3119), Expect = 0.0 Identities = 611/666 (91%), Positives = 611/666 (91%) Query: 1 MACLMAAFSVGTAMNASSYSAAMTEPKXXXXXXXXXXXXXXXXXXXXLLNGHLKKVDNNF 60 MACLMAAFSVGTAMNASSYSAAMTEPK LLNGHLKKVDNNF Sbjct: 1 MACLMAAFSVGTAMNASSYSAAMTEPKSVCVSVDEVVSSNVDEVETDLLNGHLKKVDNNF 60 Query: 61 TEAQRFSSLPRRAAVNIEFKDLSYSVPEGPWWKKKGYKTLLKGISGKFNSGELVAIMGPS 120 TEAQRFSSLPRRAAVNIEFKDLSYSVPEGPWWKKKGYKTLLKGISGKFNSGELVAIMGPS Sbjct: 61 TEAQRFSSLPRRAAVNIEFKDLSYSVPEGPWWKKKGYKTLLKGISGKFNSGELVAIMGPS 120 Query: 121 GAGKSTLMNILAGYRETGMKGAVLINGMPRDLRCFRKVSCYIMQDDMLLPHLTVQEAMMV 180 GAGKSTLMNILAGYRETGMKGAVLINGMPRDLRCFRKVSCYIMQDDMLLPHLTVQEAMMV Sbjct: 121 GAGKSTLMNILAGYRETGMKGAVLINGMPRDLRCFRKVSCYIMQDDMLLPHLTVQEAMMV 180 Query: 181 SAHLKLQEKDEGRREMVKEILTALGLLPCANTRTGSLSGGQRKRLAIALELVNNPPVMFF 240 SAHLKLQEKDEGRREMVKEILTALGLLPCANTRTGSLSGGQRKRLAIALELVNNPPVMFF Sbjct: 181 SAHLKLQEKDEGRREMVKEILTALGLLPCANTRTGSLSGGQRKRLAIALELVNNPPVMFF 240 Query: 241 DEPTSGLDSASCFQVVSLMKGLAQGGRSIVCTIHQPSAKLFELFDQLYVLSQGQCVYRGK 300 DEPTSGLDSASCFQVVSLMKGLAQGGRSIVCTIHQPSAKLFELFDQLYVLSQGQCVYRGK Sbjct: 241 DEPTSGLDSASCFQVVSLMKGLAQGGRSIVCTIHQPSAKLFELFDQLYVLSQGQCVYRGK 300 Query: 301 VSNLVPYLRDLGLNCPTYHNPADFVMEVASGEYGDQNSRLVRAVREGMCDADYKRDLGGD 360 VSNLVPYLRDLGLNCPTYHNPADFVMEVASGEYGDQNSRLVRAVREGMCDADYKRDLGGD Sbjct: 301 VSNLVPYLRDLGLNCPTYHNPADFVMEVASGEYGDQNSRLVRAVREGMCDADYKRDLGGD 360 Query: 361 TDVNPFLWHRPAEEDSASMEGCHSFSASCLTQFCILFKRTFLSIMRDSVLTHLRITSHXX 420 TDVNPFLWHRPAEEDSASMEGCHSFSASCLTQFCILFKRTFLSIMRDSVLTHLRITSH Sbjct: 361 TDVNPFLWHRPAEEDSASMEGCHSFSASCLTQFCILFKRTFLSIMRDSVLTHLRITSHIG 420 Query: 421 XXXXXXXXXXXXXNEAKKVXXXXXXXXXXXXXXXXXXXXPTVLTFPLEMSVFLREHLNYW 480 NEAKKV PTVLTFPLEMSVFLREHLNYW Sbjct: 421 IGLLIGLLYLGIGNEAKKVLSNSGFLFFSMLFLMFAALMPTVLTFPLEMSVFLREHLNYW 480 Query: 481 YSLKAYYLAKTMADVPFQIMFPVAYCSIVYWMTSQPSDAVRFVLFAALGTMTSLVAQSLG 540 YSLKAYYLAKTMADVPFQIMFPVAYCSIVYWMTSQPSDAVRFVLFAALGTMTSLVAQSLG Sbjct: 481 YSLKAYYLAKTMADVPFQIMFPVAYCSIVYWMTSQPSDAVRFVLFAALGTMTSLVAQSLG 540 Query: 541 LLIGAASTSLQVATFVGPVTAIPVLLFSGFFVSFDTIPAYLQWMSYISYVRYGFEGVILS 600 LLIGAASTSLQVATFVGPVTAIPVLLFSGFFVSFDTIPAYLQWMSYISYVRYGFEGVILS Sbjct: 541 LLIGAASTSLQVATFVGPVTAIPVLLFSGFFVSFDTIPAYLQWMSYISYVRYGFEGVILS 600 Query: 601 IYGLDREDLHCDIAETCHFQKSEAILRELDVENAKLYLDFIVLGIFFISLRLIAYFVLRY 660 IYGLDREDLHCDIAETCHFQKSEAILRELDVENAKLYLDFIVLGIFFISLRLIAYFVLRY Sbjct: 601 IYGLDREDLHCDIAETCHFQKSEAILRELDVENAKLYLDFIVLGIFFISLRLIAYFVLRY 660 Query: 661 KIRAER 666 KIRAER Sbjct: 661 KIRAER 666 >MGgn0000024; Abcg2 REFPROT>6752944 Length = 657 Score = 183 bits (459), Expect = 7e-47 Identities = 101/266 (37%), Positives = 166/266 (61%), Gaps = 12/266 (4%) Query: 77 IEFKDLSY--SVPEGPWWKKKGYKTLLKGISGKFNSGELVAIMGPSGAGKSTLMNILAGY 134 + F ++Y V G +K K +L I+G G L AI+GP+G GKS+L+++LA Sbjct: 37 LSFHHITYRVKVKSGFLVRKTVEKEILSDINGIMKPG-LNAILGPTGGGKSSLLDVLAAR 95 Query: 135 RET-GMKGAVLINGMPRDLRCFRKVSCYIMQDDMLLPHLTVQEAMMVSAHLKLQE--KDE 191 ++ G+ G VLING P+ F+ S Y++QDD+++ LTV+E + SA L+L K+ Sbjct: 96 KDPKGLSGDVLINGAPQPAH-FKCCSGYVVQDDVVMGTLTVRENLQFSAALRLPTTMKNH 154 Query: 192 GRREMVKEILTALGLLPCANTRTGS-----LSGGQRKRLAIALELVNNPPVMFFDEPTSG 246 + E + I+ LGL A+++ G+ +SGG+RKR +I +EL+ +P ++F DEPT+G Sbjct: 155 EKNERINTIIKELGLEKVADSKVGTQFIRGISGGERKRTSIGMELITDPSILFLDEPTTG 214 Query: 247 LDSASCFQVVSLMKGLAQGGRSIVCTIHQPSAKLFELFDQLYVLSQGQCVYRGKVSNLVP 306