#!/usr/local/bin/perl # blastrefseq.pl =head NOTES compute similar/homologous genes among euGenes reference gene sequences paths for iubio computer see blastout.pl for parseblastout2 output table jun02 updates - dont use common protlib/dnalib of all org seqs but blast for each org against each other org ? - how does this affect scores ? can we do single pair seq blasts for updates? - output with -m 8 or 9 table (9 adds comments/query) - for org x org processing (std now), creates refprot.fasta.blastout.org1-org2 files and looks for xxx.blastout.org1-org2.skip and .org1-org2.stop to control processing jul01 udpates - separated from meowrefseq.pl -- generating seqs now in pullrefseq.pl blastrefseq.pl only does comparision of data - change to NCBI BL 2.1.3 (apr01) - new parameters: use filter default (SEG) - try '-m 8' table output - try BLASTCLUST ? -? try org x org - how many pairwise tests needed? are they reciprocal? probly not -- no obvious advantage to all x all - produces same matches, scores =cut $debug= 0; $makelib= 0; $dofast= 0; $doprot= 1; $nofilter= 0; # dont use by default !? jul01 $isMeowId= 0; $doparalogs= 1; @sys= `uname -X`; %sys= map { chomp; split(' = '); } @sys; if ($sys{Node} eq 'oat') { $TMPD= "/c7/eugenes/tmp/" ; } # elsif ($sys{Node} eq 'kalo') { $TMPD= "/c5/tmp"; } #?? $blasthome= '/bio/mb/blast2'; $blast= "$blasthome/blastall"; $blastvers= 'BLASTP 2.2.2 [Jan-08-2002]'; # get from blast output ! $formatdb= "$blasthome/formatdb"; $formatdbopts= '-o T '; # index >uniqid on header $fasta= "/bio/mb/fasta/fasta33_t"; ## set -b -v low if per species runs, or higher if all spp batch ! #$blastopts= '-v 30 -b 30 -a '.$sys{NumCPU}; # -a #cpus -v 100 limit #matches below 500 default? $blastopts = "-v 10 -b 10 -m 9 -a $sys{NumCPU}"; ## add -F F no query filter by default $blastpmaxp= '1e-30'; ## for blastp, higher for blastn. $blastnmaxp= '1e-10'; ## blastn $blastmaxp= $blastpmaxp; $faprefix= 'gnl|euG|'; # for NCBI-fasta format; 'euG' short for euGenes $tmpseq= "/tmp/egseq.fa"; $refsq= "/c7/eugenes/refseq02/"; $dnalib= 'refdna.fasta'; # "$refsq/refdna.fa"; $protlib= 'refprot.fasta'; # "$refsq/refprot.fa"; # was meowprot.fa #!? change to per-org blast db? do orgA x orgB instead of all x all? # $maxfactor= 1e30; ## ordered to build dna/prot lib ## add non eug: bsub ecoli rat rice @fullorglist= qw(fly worm yeast man mouse mosquito fish weed ); @orgs= (); #------- use Getopt::Long; # use flybase::sym2id; Getopt::Long::GetOptions( 'dna!' => \$dodna, 'protein!' => \$doprot, 'blast!' => \$doblast, 'fasta!' => \$dofasta, 'formatdb!' => \$dopressdb, 'blhome!' => \$blasthome, 'Organism=s' => \@orgs, 'qorganisms=s' => \@qorgs, 'paralogs!' => \$doparalogs, 'path=s' => \$refsq, 'nofilter' => \$nofilter, 'alltogether!' => \$doalltogether, 'debug!' => \$debug, ); #'Workpath=s' => \$workpath, #'Pubpath=s' => \$meowpub, if (@orgs) { @orgs= map{ split(/\,/,$_); } @orgs; } else { @orgs= @fullorglist; } if (@qorgs) { @qorgs= map{ split(/\,/,$_); } @qorgs; } else { @qorgs= @orgs; } $blopts= getBlastOpts($blastopts,$doalltogether); ($sourcelib,$querylib,$outfile,$seqfile)= getInOutFiles($orgs[0],$orgs[1]); $blast= "$blasthome/blastall"; $formatdb= "$blasthome/formatdb"; $usage= < $blopts, sourcelib => $sourcelib, querylib => $querylib, output => $outfile, org1 x org2 processing is standard now, creating org2/blastout.org1-org2 outputs org2/blastout.org1-org2.skip file will skip that comparison org2/blastout.org1-org2.stop file will halt program at that point Use to compute similar genes among euGenes reference gene sequences See pullrefseq.pl for pulling reference sequences from organism gene data See blastout.pl for parsed blast output EOF unless($doblast||$dofasta||$dopressdb) { warn $usage; exit; } if ($doalltogether) { ($sourcelib,$querylib,$outfile,$seqfile)= getInOutFiles('all','all'); makeAllOrgLib($sourcelib,$seqfile); unless($dopressdb) { $dopressdb= isOldTarget($sourcelib,"$sourcelib.phr"); } if ($dopressdb) { formatdb($sourcelib,$doprot); } if ($doblast) { warn "\nblast all x all\n" if $debug; rename ($outfile, "$outfile.old") if (-r $outfile); blastall($sourcelib,$sourcelib,$outfile,$blopts); # $doprot); } } else { foreach my $sorg (@orgs) { my $needfmt= $dopressdb; ($sourcelib,$querylib,$outfile,$seqfile)= getInOutFiles($sorg,$sorg); unless($needfmt) { $needfmt= isOldTarget($sourcelib,"$sourcelib.phr"); } if ($needfmt) { formatdb($sourcelib,$doprot); } } # my $norg= scalar(@orgs); # for ($iq= 0; $iq < $norg; $iq++) { # my $qorg= $orgs[$iq]; # for ($js= $iq; $js < $norg; $js++) { # my $sorg= $orgs[$js]; foreach my $qorg (@qorgs) { foreach my $sorg (@orgs) { next if (!$doparalogs && $sorg eq $qorg); #? or do these - NEED option for paralogs ($sourcelib,$querylib,$outfile,$seqfile)= getInOutFiles($sorg,$qorg); if (-e "$outfile.skip") { # flag to skip some pairs warn "skipping $sourcelib,$querylib,$outfile\n"; next; } if (-e "$outfile.stop") { die "stopping before $sourcelib,$querylib,$outfile\n"; } if ($doblast) { warn "\nblast $qorg x $sorg\n" if $debug; rename ($outfile, "$outfile.old") if (-r $outfile); blastall($sourcelib,$querylib,$outfile,$blopts); # $doprot); } } } } exit(); #---------- #### old methods #if ($doprot) { # $seqlib= $protlib; $blastmaxp= $blastpmaxp; # $blastopts .= " -e $blastpmaxp "; # $seqfile= 'refprot.fasta'; # } #else { # $seqlib= $dnalib; $blastmaxp= $blastnmaxp; # $blastopts .= " -e $blastnmaxp "; # $seqfile= 'refdna.fasta'; # } #unless (-r $seqlib) { $needmakelib=1; } #else { #foreach $org (@orgs) { # my $refseq= "$refsq/$org/$seqfile"; # if (isOldTarget($refseq,$seqlib)) { $needmakelib= 1; last; } # } #} # #if ($needmakelib) { # rename ($seqlib, "$seqlib.old") if (-r $seqlib); #? # system("touch $seqlib"); # foreach $org (@orgs) { # my $refseq= "$refsq/$org/$seqfile"; # system ("cat $refseq >> $seqlib") if (-r $refseq) ; # } # $dopressdb= 1; #} #unless($dopressdb) { $dopressdb= isOldTarget($seqlib,"$seqlib.phr"); } #if ($dopressdb) { formatdb($seqlib,$doprot); } # #if ($doblast) { # $blastout= "$seqlib.blastout"; # rename ($blastout, "$blastout.old") if (-r $blastout); # blastall($seqlib,$seqlib,$blastout,$doprot); ## if ($blastall) { blastall($seqlib,$blastout,$doprot); } ## else { blast($seqlib,$blastout,$doprot); } #} #if ($dofasta) { # $fastout= "$seqlib.fastaout"; # ## unlink $fastout; # rename ($fastout, "$fastout.old") if (-r $fastout); # runfasta($seqlib,$seqlib,$fastout,$doprot); #} sub makeAllOrgLib { my($sourcelib,$seqfile)= @_; my $needmakelib= 0; # ($sourcelib,$querylib,$outfile,$seqfile)= getInOutFiles('all','all'); unless (-r $sourcelib) { $needmakelib=1; } else { foreach $org (@orgs) { my $refseq= "$refsq/$org/$seqfile"; if (isOldTarget($refseq,$sourcelib)) { $needmakelib= 1; last; } } } if ($needmakelib) { rename ($sourcelib, "$sourcelib.old") if (-r $sourcelib); #? system("touch $sourcelib"); foreach $org (@orgs) { my $refseq= "$refsq/$org/$seqfile"; system ("cat $refseq >> $sourcelib") if (-r $refseq) ; } $dopressdb= 1; } } sub getInOutFiles { my($org1,$org2)= @_; if ($doprot) { $seqfile= $protlib; } else { $seqfile= $dnalib; } if ($org1 eq 'all') { $sourcelib= "$refsq/$seqfile"; } else { $sourcelib= "$refsq/$org1/$seqfile"; } if ($org2 eq 'all') { $querylib= "$refsq/$seqfile"; $blastout= "$refsq/blastout.$org1-$org2"; } else { $querylib= "$refsq/$org2/$seqfile"; $blastout= "$refsq/$org2/blastout.$org1-$org2"; } ## ??leave the .gz addition to formatdb ? or drop for blastall() ## cant do - need libs to be -r readable unless (-r $sourcelib) { $sourcelib= "$sourcelib.gz" if (-r "$sourcelib.gz"); } unless (-r $querylib) { $querylib= "$querylib.gz" if (-r "$querylib.gz"); } return ($sourcelib,$querylib,$blastout,$seqfile); } sub getBlastOpts { my($opts,$doalltogether)= @_; $opts .= " -F F" if ($nofilter); if ($doalltogether) { $opts =~ s/\-v (\d+)/-v 30/; $opts =~ s/\-b (\d+)/-b 30/; } if ($doprot) { $opts .= " -p blastp -e $blastpmaxp "; } else { $opts .= " -p blastn -e $blastnmaxp "; } return $opts; } # formatdb -i stdin -o T -p T -n "nt" sub formatdb { my($seqlib,$doprot)= @_; print STDERR "formatdb($seqlib)\n" if $debug; my $opts= $formatdbopts; # = '-n meow'; if ($doprot) { $opts .= ' -p T ';} else { $opts .= ' -p F '; } my $seqpath= $seqlib; $seqpath =~ s,/([^/]+$),,; my $sfile= $1 || $seqlib; chdir($seqpath); # unless (-r $seqlib) { # if (-r "$seqlib.gz") { $seqlib= "$seqlib.gz"; } # elsif (-r "$seqlib.Z") { $seqlib= "$seqlib.Z"; } # } if ($seqlib =~ /\.(gz|Z)$/) { system("gzcat $seqlib | $formatdb $opts -i stdin "); opendir(D,'.'); my @f= grep(/stdin/,readdir(D)); closedir(D); foreach $f (@f) { (my $t= $f) =~ s/stdin/$sfile/; rename($f,$t); } } else { system("$formatdb $opts -i $seqlib "); } } sub blastall { my($seqlib,$querylib,$outfile,$opts)= @_; my $date= "# date=".`date`; print STDERR "# blastall($seqlib,$querylib,$outfile)\n" if $debug; print STDERR $date if $debug; my $cmd; ## querylib must be fasta or stdin of fasta if ($querylib =~ /\.(gz|Z)$/) { $cmd= "gzcat $querylib | $blast $opts -d '$seqlib' -i stdin"; } else { $cmd= "$blast $opts -d '$seqlib' -i $querylib"; } my $cmdl= "# cmd=".$cmd; $cmdl =~ s,$refsq/,,g; print STDERR $cmdl,"\n",$date if $debug; # ## don't really need separate file - put in $outfile # local(*D); open(D,">>$outfile.cmdline"); print D $cmdl,"\n",$date; close(D); system("echo '$cmdl' >> $outfile"); system("echo '$date' >> $outfile"); system("$cmd >> $outfile"); } #sub blast { # my($seqlib,$outfile,$doprot)= @_; # print STDERR "blast($seqlib,$outfile)\n" if $debug; # ## NCBI blast v2 Mon Jan 24 17:29:31 EST 2000 # $opts= $blastopts; # if ($doprot) { $opts .= '-p blastp ';} else { $opts .= '-p blastn '; } # # my $cmdl= "$blast $opts -d $seqlib -i $tmpseq\n"; # $cmdl =~ s=/[\S\/]+\/==g; # my $cmdlf= $outfile; $cmdlf =~ s=\.\w+$=.cmdline=; # print STDERR $cmdl if $debug; # local(*D); open(D,">$cmdlf"); print D $cmdl; close(D); # # local(*S,*TMP); # open(S,$seqlib) or return -1; # $/= "\n>"; # my $n= 0; # while () { # $n++; next if ($debug && ($n % 100 != 1)); # /([^\n]+)/; $id= $1; # s/>$//; ## trunc trailing > in $_ # open(TMP,">$tmpseq"); print TMP ">$_\n"; close(TMP); # system("$blast $opts -d $seqlib -i $tmpseq >> $outfile"); # $nblast++; # last if ($debug && $nblast >= 100); # } # $/= "\n"; # close(S); # return $nblast; #} sub runfasta { my($seqlib,$querylib,$outfile,$doprot)= @_; if ($doprot) { $ktup= 2; } else { $ktup= 6; } ## $opts= '-m 9 -q -H'; $opts= '-m 9 -E 0.01 -q -H'; $falib= "'$seqlib 12'"; ## BLAST2 format from formatdb print STDERR "fasta($seqlib,$querylib,$outfile)\n" if $debug; local(*S,*TMP); open(S,$querylib) || return -1; $/= "\n>"; ## revise to do upper-triangle of matrix tests? - don't need to do complete matrix ! my $n= 0; while () { $n++; next if ($debug && ($n % 200 != 1)); /([^\n]+)/; $id= $1; s/>$//; ## trunc trailing > in $_ open(TMP,">$tmpseq"); print TMP ">$_\n"; close(TMP); print STDERR "fasta $opts $tmpseq $falib $ktup [$id]\n" if $debug; system("$fasta $opts $tmpseq $falib $ktup >> $outfile"); $nfasta++; last if ($debug && $nfasta >= 20); } $/= "\n"; close(S); return $nfasta; } sub isOldTarget( $$) { # usage: if (&isOldTarget( $sourcefile, $targetfile)) { blah; } local($source,$target) = @_; my $res= 0; return $res unless( -f $source); if (! -f $target) { $res= 1; } else { my $targtime= -M $target; ## -M is file age in days.hrs before now $res= (-M $source) < $targtime; } return $res; } __END__ # $cmdl= "/bio/mb/blast2/blastall -b 30 -a 4 -e 1e-30 -p blastp -d /b1/b/work//meow/work/refseq//meowprot.fa -i /b1/b/work//meow/work/refseq//meowprot.fa"; # $cmdl =~ s=/[\S\/]+\/==g; # print $cmdl; ############ jun02 pull of ref prots fish - 1069 fly - 13559 man - 46465 mouse - 9339 mosquito- 12664 weed - 25007 worm - 19448 yeast - 6031 rice/grep: can't open rice//refprot.fasta rat/grep: can't open rat//refprot.fasta yeast/grep: can't open yeast//refprot.fasta ############ 3jul01 pull of euGenes reference proteins: fish - 889 new refp; 751 old refp; 1,220 eugenes fly - 13492 new; 13432 old refp; 23,649 eugenes; ( incorporates aa_gadfly2 - 147 not found elsewhere ), weed - 23799 new refp (TIGR); 6160 old refp (TAIR); 26,819 eugenes; using tigr pep data now worm - 19609 new refp; old refp 19592; 21,686 eugenes; acedb fasta had 4215 seqs w/o eugenes id match ! man - 11813 new refp ; old refp 10367; 37,049 eugenes; add gnomap mRNA/dna translation for missing ones (2/3 of total)? mouse - 6883 new refprot ; old refprot 5052; 28,280 eugenes yeast - 6174 refprots from refseq; 7,230 eugenes; 6281 old refprots pulled from genbank/genomes CDS ############ - try BLASTCLUST automatically and systematically clusters protein or DNA sequences /bio/mb/blast2/blastclust -a 2 -i fish.fa -o fish.clust >& log.fishblc & /bio/mb/blast2/blastclust -a 2 -i fishmouse.fa -o fishmouse.clust -s fishmouse.nb >& log.blc & - try formatdb w/ parsed header id - okay if not NCBI's >db|acc|id as long as >uniqid ? /bio/mb/blast2/formatdb -p T -o T -i refprot.fasta /c7/eugenes/refseq/worm oat% /bio/mb/blast2/formatdb -p T -o T -i refprot.fasta [formatdb] WARNING: Sequence number 4283 (lcl|CEgn0007232;), 24 illegal characte rs were removed: 24 Os >CEgn0007232; C56G7.3 WormBase:C56G7.3 CDS MEAEPLETEQDEWRKERMRVLDEWCEIELKGADALKMDVTREPGSPTVAV APETSLQDLYPLLIATSSSNAPPSATPTDDVTDPEVQVVSRESIDNRSTF NRIIDVCLCRPVKSQMGPKAYTDKTLILKLAQIPYDHENGTHWLLLSDYF NNVSRSLMTSSEYSHVTNPSRVGAHWVTVGFQSATPHTDFRGCGVLGLLQ MHTFTQRVPANLLRAIVLLATTEPNDFPLAVVSINITSILLTQLKKGALD NFGNEIEGLYPFFSALHASAMCRFCSIYKSQKCTLANTQTIFSEITRQLE KSPLSLIMLLNPTNDELINTLL // ERROR: Subsequence coords missing for GAPRA_236 in SUPERLINK_RWRA // ERROR: Subsequence coords missing for GAPRA_238 in SUPERLINK_RWRA // ERROR: Subsequence coords missing for GAPRA_239 in SUPERLINK_RWRA // ERROR: Subsequence coords missing for GAPRA_240 in SUPERLINK_RWRA // ERROR: Subsequence coords missing for GAPRA_241 in SUPERLINK_RWRA // ERROR: Subsequence coords missing for GAPRA_242 in SUPERLINK_RWRA ^^^^^^ lots of these in acedb CDS dump !?? what from? >CEgn0007253; CC4.2 WormBase:CC4.2 CDS MSTVESAAVRLRPVGSLFFLNRPHEKRAFDSLAGSGFDNGFNKRAFDSLA ############ -? try org x org - how many pairwise tests needed? are they reciprocal? probly not fish x fly, fish x mouse, fish x man, fish x worm, fish x weed, fish x yeast fly x mouse, fly x man, fly x worm, fly x weed, fly x yeast - so far, doesn't seem to be any advantage to splitting by organism - likely slower; all x all seems to produce same matches, values /bio/mb/blast2/formatdb -p T -i mouse.fa /bio/mb/blast2/formatdb -p T -i fish.fa /bio/mb/blast2/formatdb -p T -i fishmouse.fa /usr/bin/time /bio/mb/blast2/blastall -b 30 -G 13 -a 1 -e 1e-30 -p blastp -d mouse.fa -i fish2.fa -o test.blout real 1.7 user 1.6 /usr/bin/time /bio/mb/blast2/blastall -b 30 -G 10 -a 1 -e 1e-30 -p blastp -d mouse.fa -i fish2.fa -o test.blout real 2.3 user 2.2 /usr/bin/time /bio/mb/blast2/blastall -b 30 -G 11 -a 1 -e 1e-30 -p blastp -d mouse.fa -i fish2.fa -o test.blout real 2.1 user 1.9 /usr/bin/time /bio/mb/blast2/blastall -b 30 -G 13 -a 1 -e 1e-03 -p blastp -d mouse.fa -i fish2.fa -o test.blout real 2.0 user 1.9 /usr/bin/time /bio/mb/blast2/blastall -b 30 -G 13 -F F -a 1 -e 1e-30 -p blastp -d mouse.fa -i fish2.fa -o test.blout real 2.0 user 1.9 /usr/bin/time /bio/mb/blast2/blastall -b 30 -G 13 -a 1 -e 1e-30 -p blastp -d mouse.fa -i fish.fa -o mouse-fish.blout >& log.mouse-fish & /usr/bin/time /bio/mb/blast2/blastall -b 30 -G 13 -a 1 -e 1e-30 -p blastp -d fish.fa -i mouse.fa -o fish-mouse.blout >& log.fish-mouse & /usr/bin/time /bio/mb/blast2/blastall -b 30 -G 13 -a 1 -e 1e-30 -p blastp -d fishmouse.fa -i fishmouse.fa -o fishmouse.blout >& log.fishmouse & egrep '^Query=|^MGgn|^ZFgn' mouse-fish.blout | more egrep '^Query=|^MGgn|^ZFgn' fishmouse.blout | more egrep '^Query=|^MGgn|^ZFgn' fish-mouse.blout | more /bio/mb/blast2/blastall -b 30 -G 13 -a 1 -e 1e-30 -p blastp -d fish.fa -i fish2.fa -o test.blout /bio/mb/blast2/blastall -m 8 -b 30 -G 13 -a 1 -e 1e-30 -p blastp -d fish.fa -i fish2.fa -o test.bltsv oat% more test.bltsv - looks useful, but we want to preserve align output for users to see? qry db % Ident ? ? ? ? ? ZFgn0000687; ZFgn0000687; 87.58 322 40 0 1 322 1 322 1e-177 613.8 ZFgn0000687; ZFgn0000686; 57.85 325 123 5 1 322 1 314 4e-100 356.4 ZFgn0000690; ZFgn0000690; 95.32 619 29 0 15 633 15 633 0.0 1321.1 ZFgn0000690; ZFgn0000689; 42.33 619 332 7 20 626 22 627 3e-157 547.7 egrep '^Query=|^FBgn|^HUgn|^MGgn|^CEgn|^ATgn|^SGgn|^ZFgn' refprot.fa.blastout | more ############ fasta libs MEOW seqs$1M/c3/tmp/meow/work/refseq/meow.seq fasta -H my.seq /c3/tmp/meow/work/refseq/meow.seq 6 ############ BLAST options - optimize? see http://www.ncbi.nlm.nih.gov/Education/BLASTinfo/Blast_setup.html#Choose for some param options $blastpmaxp= '1e-30'; ## for blastp, higher for blastn. -? change to 1e-03 $blastopts= '-b 30'; # default align shown is 250 $blastopts .= ' -a '.$sys{NumCPU}; $blastopts .= " -F F" if ($nofilter); # SEG is default - best for prot. compares? $blastopts .= " -m 8" if ($tableout); # new, test $blastopts .= " -e $blastpmaxp "; $opts .= '-p blastp '; $blast $opts -d $seqlib -i $seqlib >> $outfile what is default -G (open gap) 11 what is default -E (extend gap) 1 ^^^ try -G 13 to speed up w/o much sacrifice in sensitivity? -M BLOSUM62 is default -W Word size ?? what is default T: 11 YPD blast opts: BLAST details Program: blastp Expectation value:1e-03 ##? Filter: seg Cost to open a gap:10 Cost to extend a gap: 1 Max alignments:250 Substitution Matrix: BLOSUM62 ############ old -- fixed BUG in weed/other readgenomes - dup gene id sequences - need to filter out kalo% grep '^>' $r/weed/refprot.fasta | sort | uniq -d |wc 6158 dups for 12998 total (each is dupped?) kalo% grep '^>' $r/worm/refprot.fasta | sort | uniq -d |wc 889 of 18174 total kalo% grep '^>' $r/yeast/refprot.fasta | sort | uniq -d |wc 387 of 7340 total for 50881 in total meowprot.fa (15% are dups now) ############ # moved to blastout.pl sub parseblastout sub readGenesyms # moved to flybase::sym2id sub symcase sub getSym2Id