#!/usr/local/bin/perl # epdfeats.pl # compute Eukaryotic Promoter locations in euGenes genomes using EPD data # paths for iubio computer =head1 epdfeats.pl -- locate Eukaryotic Promoter features in euGenes genomes -- using EPD data set (~ 1000 expt. determined euk. promoters) -- scan for TF site data also ? -- revise to use patscan instead/plus fasta ? =cut $debug= 0; @sys= `uname -X`; %sys= map { chomp; split(' = '); } @sys; if ($sys{Node} eq 'oat') { $TMPD= "/b1/b/work/" ; } elsif ($sys{Node} eq 'kalo') { $TMPD= "/c5/tmp"; } $fasta= "/bio/mb/fasta/fasta33"; $fastopts= '-m 10 -E 0.10 -q -H '; #? -m 9 or -m 10 ? -E 0.01 ; default is -E 10 $ktup= 6; # # ktup=1 for <20 nuc epd/oligos $genomes= '/c7/eugenes/genomes/'; $tmpseq= "/tmp/epd.fa"; $outpath= ''; #? use /c7/eugenes/genomes/$org or genomew/$org ? $epdlib= '/c7/eugenes/epd/epd66.fa'; # use /bio/data/epd/ ? $transfac= '/bio/data/transfac/'; # getz '[epd-id:*]' -f seq -sf fasta > epd66.fa ## note -sf puts epd ID in >, want ACC instead, as well ? # >XL_ALB1 # GATCTCTCTGAGCAATAGTATAAAACAAGAGGTATCACTCATTTCAGATCAGGCTTCTCA # >GG_ALBU # AAGCAGTCAGTAAAAGGTATATAAGAAAATGATTTCCCTCAATCATCCTAGCATTTTTGA @fullorglist= qw(fly); # @fullorglist= qw(fly man mouse worm yeast weed fish); #? can we use dna.raw files from $genomes/ %orgdna = ( fly => '/c7/tmp/flybase/bdgpseqs/na_arms.dros.RELEASE1', # weed => "$genomeshome/A_thaliana/", # worm => "$genomeshome/C_elegans/", # yeast => "$genomeshome/S_cerevisiae/", # man => "$genomeshome/H_sapiens/", # mouse => "$genomeshome/M_musculus/", ); use Getopt::Std; # require Meow::SRS; # use flybase::sym2id; # $SRSRoot= '/bio/mb/srs61/'; ## - use srs61 # Meow::SRS::init($SRSRoot); # $getz= "$ENV{SRSEXE}/getz"; #------- %opt=(); Getopt::Std::getopts('e:frDO:',\%opt); $usage= <$outfile") ) { warn "cant write $outfile"; return -1; } my $featname= 'promoter'; my ($nout ); $/= ">>><<<\n"; while () { # next if (/\! No library sequences/); next unless (/best scores are/); $nout++; my $frame= (/fa_frame: (\w+)/) ? $1 : 'f'; #? my $at= index($_,"fa_overlap:"); $at= index($_,">",$at); my $e= index($_,">", $at+10); my $qs= substr($_,$at,$e-$at); $at= index($_,">",$e); # $e= index($_,">", $at+10); # my $libs= substr($_,$at); # ,$e-$at my $sym= ($qs =~ m/\>(\S+)/) ? $1 : '-'; ## include EPD acc and ID here? my $libid= ($libs =~ m/\>(\S+)/) ? $1 : '-'; my $start= ($libs =~ m/ al_start:\s+(\d+)/) ? $1 : ''; my $stop= ($libs =~ m/ al_stop:\s+(\d+)/) ? $1 : ''; my $range= "$start..$stop"; $range= 'complement('.$range.')' if ($frame eq 'r'); my $map= '-'; if ($libid =~ /Chromosome_arm_(\w+)/) { # fly - others? $map= 'Chr '.$1; } my $ids= 'EPD:'.$sym; # want EPD:accnum instead # need chr from $libid #?? save any of fa_expect, fa_ident, fa_overlap printFeat( *O, $org, '', $featname, $range, $sym, $map, $ids, '', '' ); } $/= "\n"; close(S); close(O); return $nout; } sub epdfasta { my($querylib, $seqlib, $outfile)= @_; # $fastopts= '-m 9 -E 0.01 -q -H'; $falib= "'$seqlib 0'"; ## 12= BLAST2 format from formatdb; 0 fasta print STDERR "fasta($querylib, $seqlib, $outfile)\n" if $debug; local(*S,*TMP); open(S,$querylib) || return -1; $/= "\n>"; ## revise to do upper-triangle of matrix tests? - don't need to do complete matrix ! my $n= 0; while () { $n++; # next if ($debug && ($n % 200 != 1)); /([^\n]+)/; $id= $1; s/>$//; ## trunc trailing > in $_ open(TMP,">$tmpseq"); print TMP ">$_\n"; close(TMP); print STDERR "fasta $fastopts $tmpseq $falib $ktup [$id]\n" if $debug; system("$fasta $fastopts $tmpseq $falib $ktup >> $outfile"); $nfasta++; # last if ($debug && $nfasta >= 20); } $/= "\n"; close(S); return $nfasta; } # cut from genomefeat.pl sub printFeat { my ($fh,$org,$scaf, $featname,$srange,$gsym,$map,$id,$rdbx,$rfnotes)= @_; # if ($org eq 'fly') {} # unless($gsym) { # if ($featname =~ /^source/) { $gsym= $org; } # else { $gsym= '-'; } # } # unless($map) { # if ($featname =~ /^(source|scaffold)/ && $csomeFile) { $map= $csomeFile; } # else { $map= '-'; } # } # unless ($id) { # $id= $sym2id{ symcase($gsym) }; # } # $gsym= $idsym{$id} if($idsym{$id}); # make sure is proper symbol # # $allfeats{$featname}++; print $fh "$featname\t$gsym\t$map\t$srange\t$id\t"; if ($rdbx =~ /ARRAY/) { foreach (sort @{$rdbx}) { print $fh "$_,";} } # if ($wantnotes) { # print $fh "\t"; # foreach (sort keys %{$rfnotes}) { # print $fh "$_,"; # $allnotes{$_} += ${$rfnotes}{$_}; # } # } print $fh "\n"; } sub getzEPD { #? get list of epd acc nums or IDs, then for each of 1390, make fasta entry? my $ok= `$getz '$srsq'`; if (!$ok) { print STDERR "getz '$srsq' - no match\n"; next; } # print STDERR "getz '$srsq' -d -sf fasta\n" if $debug; $idc= "\">$id; $sym $db\\c\""; system("/bin/echo $idc >> $outfile"); system("$getz '$srsq' -d -sf fasta >> $outfile"); ## need to know if rsq is not found - ~ 20 fly ones are missed - in genbanknew? $nseq++; } 1; __END__ FASTA can read the following formats: 0 Pearson/FASTA (>SEQID - comment/sequence) 1 Uncompressed Genbank (LOCUS/DEFINITION/ORIGIN) 2 NBRF CODATA (ENTRY/SEQUENCE) 3 EMBL/SWISS-PROT (ID/DE/SQ) 4 Intelligenetics (;comment/SEQID/sequence) 5 NBRF/PIR VMS (>P1;SEQID/comment/sequence) 6 GCG (version 8.0) Unix Protein and DNA (compressed) 11 NCBI Blast1.3.2 format (unix only) 12 NCBI Blast2.0 format (unix only, fasta32t08 or later) http://transfac.gbf.de/ ftp://transfac.gbf.de/ release 3.4 (last public w/o restrictions) TF SITE gives information on individual (putatively) regulatory protein binding sites, 4637 sequences with 78625 nucleotides. Format (embl/sw): AC R00001 ID HS$6-16_01 ... SQ gGGAAAaTGAAACT. // AC R00007 ID HS$4F2H_03 SQ GTGACTCA. // AC R00009 ID CAMV$35SR_01 SQ cTGACGtaagggaTGACGcac. // FastA out (-m 10 format) 60 residues in 1 query sequences 116117226 residues in 6 library sequences Scomplib [version 3.3t04 January 25, 2000] start: Mon May 7 02:05:01 2001 done: Mon May 7 02:05:32 2001 Scan time: 27.460 Display time: 0.000 Function used was FASTA FASTA searches a protein or DNA sequence data bank version 3.3t04 January 25, 2000 Please cite: W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 /tmp/epd.fa: 60 nt >ATH_U25 vs /c7/tmp/flybase/bdgpseqs/na_arms.dros.RELEASE1 library searching /c7/tmp/flybase/bdgpseqs/na_arms.dros.RELEASE1 0 library 116117226 residues in 6 sequences MLE_cen statistics: Lambda= 0.1525; K=0.004193 (cen=72) FASTA (3.34 January 2000) function [optimized, +5/-4 matrix (5:-4)] ktup: 6 join: 45, opt: 30, gap-pen: -16/ -4, width: 16 Scan time: 27.800 !! No library sequences with E() < 0.10 >>><<< 60 residues in 1 query sequences 116117226 residues in 6 library sequences Scomplib [version 3.3t04 January 25, 2000] start: Mon May 7 02:05:32 2001 done: Mon May 7 02:06:03 2001 Scan time: 27.800 Display time: 0.000 Function used was FASTA FASTA searches a protein or DNA sequence data bank version 3.3t04 January 25, 2000 Please cite: W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 /tmp/epd.fa: 60 nt >ATH_U5 vs /c7/tmp/flybase/bdgpseqs/na_arms.dros.RELEASE1 library searching /c7/tmp/flybase/bdgpseqs/na_arms.dros.RELEASE1 0 library 116117226 residues in 6 sequences MLE_cen statistics: Lambda= 0.1586; K=0.007116 (cen=72) FASTA (3.34 January 2000) function [optimized, +5/-4 matrix (5:-4)] ktup: 6 join: 45, opt: 30, gap-pen: -16/ -4, width: 16 Scan time: 27.660 The best scores are: opt bits E(2913) Chromosome_arm_3L 12X coverage (03/23/2000) (79866) [r] 127 36 0.089 >>>/tmp/epd.fa, 60 nt vs /c7/tmp/flybase/bdgpseqs/na_arms.dros.RELEASE1 library ; mp_name: /bio/mb/fasta/fasta33 ; mp_ver: 33t04 ; mp_argv: /bio/mb/fasta/fasta33 -m 10 -E 0.10 -q -H /tmp/epd.fa /c7/tmp/flybase /bdgpseqs/na_arms.dros.RELEASE1 0 6 ; pg_name: FASTA ; pg_ver: 3.34 January 2000 ; pg_matrix: +5/-4 (5:-4) ; pg_gap-pen: -16 -4 ; pg_ktup: 6 ; pg_optcut: 30 ; pg_cgap: 45 ; mp_extrap: 60000 2913 ; mp_stats: MLE_cen statistics: Lambda= 0.1586; K=0.007116 (cen=72) ; mp_KS: -0.0000 (N=0) at 20 >>Chromosome_arm_3L 12X coverage (03/23/2000) ; fa_frame: r ; fa_initn: 129 ; fa_init1: 104 ; fa_opt: 127 ; fa_z-score: 121.1 ; fa_bits: 36.2 ; fa_expect: 0.089 ; fa_ident: 0.795 ; fa_overlap: 44 >ATH_U5 .. ; sq_len: 60 ; sq_offset: 1 ; sq_type: D ; al_start: 49 ; al_stop: 6 ; al_display_start: 60 -------------------CATGGCTGCGTGATTTTTACTCCCAGGAGAG TGAAGTTTATTTATACTATCATTTCCTCT >Chromosome_arm_3L .. ; sq_len: 79866 ; sq_type: D ; al_start: 15966889 ; al_stop: 15966931 ; al_display_start: 15966859 GAGCTAACCCCGCTAAAAAGCTGCTAACTCGATTTTTGCTCCCAAGAAAG CGAAGTTT-TTTGTAGCATGATTTTATGAGATTGACTTTGAACTTTCCGA TCGATTGATTAACCGTGTGT >>><<<