#!/usr/local/bin/perl # orgsums.pl # paths for iubio computer $debug= 1; $minpct= 0; $nofilter= 0; # read from blast out ?? not in output $dofilterLowPercent= 0; #? to make consistant w/ nov00 data set? do we want these or not ## one big change w/o above is large # intra-org matches $nosameorg= 1; ## do this 1 ! $dohgsummary= 0; $dofeatsummary= 0; $egtitle = 'euGenes:'; $protdna= 'protein'; $format= 'html'; #$hgurl= 'http://iubio.bio.indiana.edu/eugenes/.bin/moquery?libs=ATgn&homology=Human'; $hgurl= '/.bin/moquery'; $mapurl= '/.bin/gnomap'; #$baseurl= 'http://iubio.bio.indiana.edu:8089/'; $baseurl= 'http://iubio.bio.indiana.edu/eugenes/'; # use flybase::sym2id; #@sys= `uname -X`; #%sys= map { chomp; split(' = '); } @sys; $meowpub= "/bio/meow-pub/eugenes/server"; $mework= '/c7/eugenes/'; $genomefeatd= "$mework/genomes"; # $genomes= '/c5/tmp/genomes/ncbi'; ## '/c3/iubio/biomir-pub/biomirror/genbank/genomes' $hgoutfile= "all/hgsummary"; $featoutfile= "all/feature-summary"; # $blastvers= 'BLASTP 2.0'; $blastpmaxp= '1e-30'; ## for blastp, higher for blastn. $blastnmaxp= '1e-10'; ## blastn ## ^ read from blastout: # Number of sequences better than 1.0e-30 ## FIXME - need blastout correction for this missing cmd $defaultBlastCmd= "blastall -v 10 -b 10 -m 9 -a 4 -p blastp -e 1e-30 -d org1/refprot.fasta -i org2/refprot.fasta"; ## ordered to build dna/prot lib @fullorglist= qw( fly man mouse mosquito weed worm yeast fish ); #original: Fruitfly Human Mouse Worm Yeast Weed Zebrafish | Any N genes #better for output: # @fullorglist= qw(man mouse rat fish fly mosquito worm weed rice yeast ); ## need this dang fix for name in data: $fporg = ($porg eq 'Ffly') ? 'Fruitfly' : $porg; ## fix to add other orgs in hgtable form: ecoli bsub @otherorgs= (); ## qw(ecoli bsub); %spptag = ( FBgn => 'Ffly', # 'Drosophila melanogaster', HUgn => 'Human', # 'Homo sapiens', SGgn => 'Yeast', # 'Saccharomyces cerevisiae', MGgn => 'Mouse', # 'Mus musculus', CEgn => 'Worm', # 'Caenorhabditis elegans', ATgn => 'Weed', # 'Arabidopsis thaliana', ZFgn => 'Zfish', # 'Danio rerio', AGgn => 'Mosquito', ); %porgs = ( fly => 'Ffly', # 'Drosophila melanogaster', man => 'Human', # 'Homo sapiens', yeast => 'Yeast', # 'Saccharomyces cerevisiae', mouse => 'Mouse', # 'Mus musculus', worm => 'Worm', # 'Caenorhabditis elegans', weed => 'Weed', # 'Arabidopsis thaliana', fish => 'Zfish', # 'Danio rerio', mosquito => 'Mosquito', ecoli => 'ecoli', bsub => 'bsub', rice => 'rice', # Oryza sativa ); #?? what is thid full porg/aorg distinction? %fullporgs = ( Ffly => 'Fruitfly', Zfish => 'Zebrafish', ecoli => 'E. coli (bacteria)', bsub => 'B. subtilis (bacteria)', rice => 'Rice', ); %fullaorgs = ( fly => 'Fruitfly', # 'Drosophila melanogaster', man => 'Human', # 'Homo sapiens', yeast => 'Yeast', # 'Saccharomyces cerevisiae', mouse => 'Mouse', # 'Mus musculus', worm => 'Worm', # 'Caenorhabditis elegans', weed => 'Weed', # 'Arabidopsis thaliana', fish => 'Zebrafish', # 'Danio rerio', mosquito => 'Mosquito', ecoli => 'E. coli (bacteria)', bsub => 'B. subtilis (bacteria)', rice => 'rice', # Oryza sativa ); %commonFeats = ( 'insertion' => 'insertions', 'repeat' => 'repeats', ); # 'cDNA' => 'CDS', %CsomeSort = ( fly => 'string', mosquito => 'string', man => 'numeric', # 1..22,X,Y weed => 'numeric', # 1..5 worm => 'roman', # I..V, X yeast => 'roman', # I..XVI ); my %roman2arab= ( I => 1, II => 2, III => 3, IV => 4, V => 5, VI => 6, VII => 7, VIII => 8, IX => 9, X => 10, XI => 11, XII => 12, XIII => 13, XIV => 14, XV => 15, XVI => 16, XVII => 17, XVIII => 18, XIX => 19, XX => 20, XXI => 21, XXII => 22, XXIII => 23, XXIV => 24, XXV => 25, ); use POSIX; use Getopt::Long; #use Getopt::Std; # %opt=(); # Getopt::Std::getopts('m:d:o:p:xsB:DFHO:',\%opt); $optok= Getopt::Long::GetOptions( 'featsummary!' => \$dofeatsummary, 'hgsummary!' => \$dohgsummary, 'paralogs!' => \$nosameorg, 'splitall!' => \$dosplit, 'Organism=s' => \@wantorg, 'otherorgs=s' => \@otherorgs, 'outfile=s' => \$outfile, 'minpct=n' => \$minpct, 'format=s' => \$format, 'pubpath=s' => \$meowpub, 'featpath=s' => \$genomefeatd, 'usedefs!' => \$usedefs, 'debug!' => \$debug, 'blastcmd=s' => \$defaultBlastCmd, ); #'path=s' => \$refpath, # $debug= $opt{D} if ($opt{D}); # $meowpub= $opt{d} if ($opt{d}); # $usedefs= $opt{x}; # $minpct= $opt{p} if ($opt{p}); # $format= $opt{m} if $opt{m}; # # $dofeatsummary= 1 if $opt{F}; # $dohgsummary= 1 if $opt{H}; # $nosameorg= 1 if $opt{s}; # }} if ($usedefs) { $dohgsummary= 1 unless($dofeatsummary);} #if ($opt{B}) { @otherorgs= split(/\,/,$opt{B}); } #if ($opt{O}) { @wantorg= split(/\,/,$opt{O}); } # unless (scalar(@wantorg)) { @wantorg= @fullorglist; } if (@wantorg) { @wantorg= map{ split(/\,/,$_); } @wantorg; } else { @wantorg= @fullorglist; } if (@otherorgs) { @otherorgs= map{ split(/\,/,$_); } @otherorgs; } $usage= < Source (rows): source $protdna for BLAST query
Homologs (columns): sequences in BLAST database match with probability <= $blastpmaxp
Any column: Source organism $protdna have one or more homologs in any organism
N genes column: Number of $protdna sequences available for the organism

Methods:
This is a computed gene homology or similarity using reference $protdna sequences identified by the source databases. A BLAST of all these sequences is computed, each sequence against all others, using specific parameters given below.

The sequences used in these calculations are found in the "Reference proteins" data files in each organism's folder. This summary is determined from the "Homologous genes table" data files also in each folder.

The counts in this table are of the number of available genes for an organism, and those which have one or more significant homologs in the other organisms. Percentages are the count of genes with any homolog (one or several) in another organism, divided by the total available genes in that organism (x 100).

You can read the homology summary table best this way. To answer the question what percent of all fruitfly genes have any homologs in C.elegans, look at the Fruitfly row, and find the Worm column. To answer the reverse, the Worm row and Fruitfly column says the percent of all worm genes that have any fruitfly homologs.

Note the asymmetry in these counts and percents. Fruitfly genome may have 5600 of its 13000 genes (43%) showing some homology to Human genes, whereas the Human genome may have 8700 of its 47000 genes (18%) showing some homology to Fruitfly genes. These include the same gene-pairs.

Please note that the summary you find here is based on a specific set of data and homology/similarity (BLAST) calculation parameters. Someone else may publish a different set of values, which would be equally valid, using a different set of data and options for determining "homology". The euGenes summary tries to stay up-to-date by using reference sequences identified by genome databases, and by matching current gene information from these databases with their current sequences.

These values are all moving targets; more data that is coming from genome sequencing projects will change these values. Especially in the euGenes summary, the fruitfly, worm (C.elegans) and yeast data is based on full genome sequences, while man, mouse, weed, and fish are based only on partial sequencing, which may skew values like percent homologies higher than in fact these organisms have.

TEOF =head1 note for asym of table

The counts in this table are of the number of available genes for an organism, and those which have one or more significant homologs in the other organisms. Percentages are the count of genes with any homolog (one or several) in another organism, divided by the total available genes in that organism (x 100).

You can read the homology summary table best this way. To answer the question what percent of all fruitfly genes have any homologs in C.elegans, look at the Fruitfly row, and find the Worm column. To answer the reverse, the Worm row and Fruitfly column says the percent of all worm genes that have any fruitfly homologs.

Note the asymmetry in these counts and percents. Fruitfly genome may have 5600 of its 13000 genes (43%) showing some homology to Human genes, whereas the Human genome may have 8700 of its 47000 genes (18%) showing some homology to Fruitfly genes. These include the same gene-pairs. ========== Note that there is asymmetry in the percent homologies. This is due to differences in total number of genes per organism, as well as number of significant matches depending on direction of comparison. -- why this later !? esp. when we do only one direction blast and copy for reciprocal ! - is it a counting error here? fly x fish = 826 homologs fish x fly = 548 homologs -- there are 3075 ZFgn lines in blastout.fish-fly, no .fly-fish fly x yeast = 1772 - there are 5143 SGgn in blastout.yeast-fly, no .fly-yeast yeast x fly = 1508 - 10130 SGgn x FBgn lines in hgtable.all (not quite 2x above) - 5065 SGgn in fly/hgtable (?) - 5065 FBgn in yeast/hgtable (good) Logic now ($KEEPALL= 0) is for each fly gene, does one or more fish/yeast/... homologs exist percent of total fly genes with one or more orgX homologs differs from for each fish gene, does one more more fly/... homolog exist percent of total fish genes with one or more orgX homologs With $KEEPALL= 1, then above fly x yeast, yeast x fly have same homolog counts, 5065 as in hgtables, but need to adjust $T sums so % is not > 100% ! (and even so, get for instance 28859 human homologs of 13046 fly genes) - paralogs are even worse - get many more than no. of genes and for instance 25585 fly x mosquito homologs even though each spp has only 13,000 or 12,000 genes to count =cut #--- homology summary if ($dohgsummary) { $hgoutfile= $outfile if ($outfile); %T= (); %S= (); ## fixme for BLAST command $hgnotes{CMDLINE} = $defaultBlastCmd."\n"; ## separate out org-self matches from table and counts - do as separate line ?! foreach my $org (@wantorg,@otherorgs) { calcHgSummary($org); } $nofilter=1 if ($hgnotes{CMDLINE} =~ m/\-F F/); if ($format =~ /html/) { $outpath= "$meowpub/$hgoutfile.$format"; if (-e $outpath) { rename ($outpath,"$outpath.old"); } print STDERR "printing $outpath\n" if $debug; if ($hgoutfile && open(OUT,">$outpath")) { $outh= *OUT; } else { $outh= *STDOUT; warn "bad hgoutfile: $outpath"; } htmlSummary($outh); close $outh if ($hgoutfile); } if ($format =~ /tsv|table|plain|txt/) { $outpath= "$meowpub/$hgoutfile.text"; if (-e $outpath) { rename ($outpath,"$outpath.old"); } if ($hgoutfile && open(OUT,">$outpath")) { $outh= *OUT; } else { $outh= *STDOUT; warn "bad hgoutfile: $outpath"; } printSummary($outh); close $outh if ($hgoutfile); } } #--- seq features summary if ($dofeatsummary) { $featoutfile= $outfile if ($outfile); %FT= (); %FTC= (); sub indexfeatdir { my $org= shift; my $dir= shift; local(*D); opendir(D, $dir) || warn "can't open $dir"; my @files= grep( /^features-\w+\.tsv$/, readdir(D)); closedir(D); foreach my $file (sort @files) { my $sfile= $dir . $file; if ($file =~ /^features-(\w+)/) { $csome= $1; } else { $csome= 'Other'; } getFeatureSummary($sfile, $org, $csome); } } foreach $org (@wantorg) { $dir= "$genomefeatd/$org/"; indexfeatdir($org, $dir); } $outpath= "$meowpub/$featoutfile.$format"; if (-e $outpath) { rename ($outpath,"$outpath.old"); } print STDERR "printing $outpath\n" if $debug; if ($featoutfile && open(OUT,">$outpath")) { $outh= *OUT; } else { $outh= *STDOUT; warn "bad hgoutfile: $outpath"; } printFeatSummary($outh, $format); close $outh if ($featoutfile); } #exit; #---------- ## feature fixes (for common feats) - move to genomefeat.pl ?? # fly cDNA -> CDS ?? no, not same # fly p_insertion -> insertion # worm *repeat* > repeat(s) or repeat_region sub getFeatureSummary { my($file, $org, $csome )= @_; local(*F,$nf); print STDERR "read $org, $csome feat. in $file\n" if $debug; unless (open(F,$file)) { warn "Can't read $file"; return; } while () { # Features for fly from BDGP/Celera/FlyBase data [ X.chunk9950000-10000000.xml, 18-June-2000] # if (/^# Features for $org/) { s/\s*\[[^,]+(,\s*[\w\-]+).*\]//; $ftnotes{$org}= $_ . $1; } if (/^# Features for $org/) { s/\s*\[[^,]+(,[^\]]+)\]//; $ftnotes{$org}= $_ . $1; } next unless (/^\w/); chomp(); my ($class,$sym,$map,$range,$ids,$dbx)= split(/\t/); next if ($org eq 'fly' && $class =~ /^cyto/); # skip the fly cytofeatures ## do something w/ range = calc. % bases covered by each feature class / csome,all ? my $size= maxrange($range); $FT{$org}{$csome}{$class}++; $FT{$org}{all}{$class}++; $FTS{$org}{$csome}{$class} += $size; $FTS{$org}{all}{$class} += $size; my $ftc= $class; foreach my $c (keys %commonFeats) { if ($ftc =~ m/$c/) { $ftc= $commonFeats{$c}; last; } } $FTC{$org}{$ftc}++; $FTCS{$org}{$ftc} += $size; # use {source} for total $nf++; } close(F); print STDERR "read $nf features\n" if $debug; } sub maxrange { my($range)= @_; my ($pre, $suf,$start,$stop, $b, $u); $start= -999999; $stop= $start; $range =~ s/^([^\d<>-]*)//; $pre= $1; $range =~ s/(\D*)$//; $suf= $1; if ($range =~ m/^([<>]*)([\d-]+)/) { $u= $1; $start= $2; $start-- if ($u eq '<'); } if ($range =~ m/([<>]*)([\d-]+)$/) { $u= $1; $stop= $2; $stop++ if ($u eq '>'); } if ($start == -999999) { return 0; } elsif ($stop <= $start) { return 1; } else { return $stop - $start; } # return ($start,$stop); } sub intformat { my $n= shift || 0; my $sn; while ($n =~ s/(\d)(\d\d\d)$/$1/) { $sn = ",$2$sn"; } return $n.$sn; } sub pctformat { my $n= shift || 0; $n= sprintf("%.3g", $n); # $n =~ s/0+$//; $n = substr($n,0,4); return $n; } sub _numericCsomeSort { if ($a =~ /\D/ || $b =~ /\D/) { return ($a cmp $b); } else { return $a <=> $b; } } sub _romanCsomeSort { my $aa= $roman2arab{$a} || $a; my $ba= $roman2arab{$b} || $b; if ($aa =~ /\D/ || $ba =~ /\D/) { return ($aa cmp $ba); } else { return $aa <=> $ba; } } sub sortedCsomes { my $org= shift; my @chs= keys %{ $FT{$org} }; if ($CsomeSort{$org} eq 'numeric') { @chs = sort _numericCsomeSort @chs;} elsif ($CsomeSort{$org} eq 'roman') { @chs = sort _romanCsomeSort @chs;} else { @chs = sort @chs; } return @chs; } sub printFeatSummary { my ($fh, $format)= @_; my %allfeats= (); my @forgs= sort keys %FT; foreach my $org (@forgs) { while ( my($ft,$v)= each(%{ $FTC{$org} }) ) { $allfeats{$ft}++; } # while ( my($ft,$v)= each(%{ $FT{$org}{all} }) ) { $allfeats{$ft}++; } } # my @wantporgs= map { $porgs{$_} } @forgs; # my @fullorgs= map { $fullporgs{$_} || $_ } @wantporgs; print $fh "\n"; print $fh ""; print $fh "$egtitle Genome Features\n"; print $fh "

$egtitle Genome Sequence Features Summary Table

\n"; print $fh "

Common Features for all chromosomes

\n"; print $fh <"; print $fh ""; print $fh '  Organism >
Feature'; foreach $org (@forgs) { print $fh "$fullaorgs{$org}"; } print $fh "All\n"; foreach my $ft (sort keys %allfeats) { next unless ($allfeats{$ft}>1); print $fh ""; print $fh $ft; print $fh ""; $nft= 0; foreach my $org (@forgs) { my $nf= $FTC{$org}{$ft}; # {all} my $bp= $FTCS{$org}{$ft}; # {all} $nft += $nf; print $fh ""; print $fh intformat($nf || 0), ' n'; #if ($ft eq 'source') { my $sbp= intformat($bp); print $fh "
$sbp bp"; # } #else { my $bpa= $FTCS{$org}{source}; $bpa= 100.0 / $bpa if ($bpa > 0); $pct= pctformat( $bp * $bpa); print $fh "
$pct %"; # } print $fh ""; } print $fh ""; print $fh intformat($nft || 0), ' n'; print $fh "\n"; } print $fh "\n"; print $fh "
\n"; print $fh < common-name):
TEOF foreach my $c (sort keys %commonFeats) { print $fh " *$c* => $commonFeats{$c}
\n"; } print $fh "
\n"; print $fh "

All Features for each organism, per chromosome


\n"; foreach my $org (@forgs) { print $fh "

Organism: $fullaorgs{$org}

\n"; my $ftnote= $ftnotes{$org}; print $fh $ftnote,"
\n" if ($ftnote); # fix sort for human - need 20 to sort after 19, 10 after 9 # rather than alpha sort 1 10 .. 20 .. 3 .. 9 # likewise yeast roman# sort by arabic numbering # my @chs= sort keys %{ $FT{$org} }; my @chs= sortedCsomes($org); my %fts= %{ $FT{$org}{all} }; print $fh ""; print $fh "\n"; foreach my $ft (sort keys %fts) { print $fh ""; $nft= 0; foreach my $chr (@chs) { my $nf= $FT{$org}{$chr}{$ft}; print $fh ""; } print $fh "\n"; } print $fh "
", join( "",'  Chromosome >
Feature', @chs), "
"; print $fh $ft; print $fh ""; print $fh "" if ($nf && $chr ne 'all'); print $fh intformat($nf || 0), ' n'; print $fh "" if ($nf && $chr ne 'all'); my $bp= $FTS{$org}{$chr}{$ft}; # if ($ft eq 'source') { my $sbp= intformat($bp); print $fh "
$sbp bp"; # } # else { my $bpa= $FTS{$org}{$chr}{source}; $bpa= 100.0 / $bpa if ($bpa > 0); $pct= pctformat( $bp * $bpa); print $fh "
$pct %"; # } print $fh "
\n"; print $fh "
\n"; } } sub htmlSummary { my ($fh)= @_; my $aporg; my @wantporgs= map { $porgs{$_} } @wantorg; my @fullorgs= map { $fullporgs{$_} || $_ } @wantporgs; my %orglib= (); while ( my ($l,$o)= each (%spptag) ) { $orglib{$o}= $l; }; print $fh "\n"; print $fh ""; print $fh "$egtitle Homologous Genes\n"; print $fh "

$egtitle Homologous Genes Summary Table

\n"; print $fh "Genes with one or more similar genes or homologs in other organisms.
\n"; print $fh "Number and percent with homologs, of total $protdna sequences available per organism.
\n"; print $fh "Minimum percent identity: $minpct%
\n" if ($minpct>0); print $fh ""; print $fh "\n"; # add otherorgs to this loop foreach my $porg (@wantporgs, @otherorgs) { my $fullorg= $fullporgs{$porg} || $porg; my $allorg; my $orglib= $orglib{$porg}; print $fh ""; $pctseqs= $T{$porg}{nseqs}; $pctseqs= 100.0 / $pctseqs if ($pctseqs>0); foreach my $aporg (@wantporgs) { $afporg = ($aporg eq 'Ffly') ? 'Fruitfly' : $aporg; $allorg .= $afporg.'|'; print $fh ""; } $allorg =~ s/\|$//; print $fh ''; print $fh ""; print $fh "\n"; } print $fh "
", join( "",'  Homologs >
Source', @fullorgs,'|','Any','N genes'), "
$fullorg"; ## FIXME for $nosameorg && ($porg eq $myporg)); if ($nosameorg && ($aporg eq $porg)) { print $fh " -- "; } else { print $fh "" if ($orglib); print $fh $S{$porg}{$aporg} || 0; print $fh "" if ($orglib); print $fh "
"; $pct= int ($pctseqs * $S{$porg}{$aporg} ); print $fh $pct,'%'; } print $fh "
|"; print $fh "" if ($orglib); print $fh $S{$porg}{nhomologs} || 0; print $fh "" if ($orglib); print $fh "
"; $pct= int ($pctseqs * $S{$porg}{nhomologs} ); print $fh $pct,'%'; print $fh "
"; print $fh $T{$porg}{nseqs} || 0; print $fh "
\n"; if ($nosameorg) { print $fh "
Genes with one or more similar genes, or paralogs, in same organism.
\n"; print $fh ""; print $fh "\n"; print $fh ""; foreach my $porg (@wantporgs) { $afporg = ($porg eq 'Ffly') ? 'Fruitfly' : $porg; my $orglib= $orglib{$porg}; # $allorg .= $afporg.'|'; $pctseqs= $T{$porg}{nseqs}; $pctseqs= 100.0 / $pctseqs if ($pctseqs>0); print $fh ""; } # print $fh "\n"; print $fh "
", join( "",'  ', @fullorgs), # Paralogs > ,'|','N genes' "
Paralogs"; print $fh "" if ($orglib); print $fh $S{$porg}{nparalogs} || 0; print $fh "" if ($orglib); print $fh "
"; $pct= int ($pctseqs * $S{$porg}{nparalogs} ); print $fh $pct,'%'; print $fh "
"; # print $fh $T{$porg}{nseqs} || 0; # print $fh "
\n"; } if (scalar(@otherorgs)) { print $fh "Gene homologies of "; foreach my $porg (@otherorgs) { my $fullorg= $fullporgs{$porg} || $porg; print $fh "$fullorg, "; } print $fh "are provided for comparison. They are not part of the euGenes data set.
\n"; } print $fh $hgtablenotes; ## this isn't so - asymmetry in % homologs is due to total # genes / org differing # print $fh "Asymmetry is due to blast query filter.
\n" # unless($nofilter); my $hgtitles; foreach my $n (sort keys %hgnotes) { $hgtitles .= $hgnotes{$n}; } $hgtitles =~ s/\n/
\n/g; print $fh "
Homology computations:
\n"; print $fh $hgtitles if $hgtitles; print $fh "\n"; } sub printSummary { my ($fh)= @_; my $aporg; my @wantporgs= map { $porgs{$_} } @wantorg; print $fh "Minimum percent identity: $minpct%\n" if ($minpct>0); print $fh "Total number of homologous $protdna sequences\n"; print $fh "\t",join("\t",@wantporgs),"\tAll\tNseqs\n"; foreach my $porg (@wantporgs) { print $fh "$porg\t"; foreach my $aporg (@wantporgs) { print $fh $T{$porg}{$aporg} || 0,"\t"; } print $fh $T{$porg}{nhomologs} || 0,"\t"; print $fh $T{$porg}{nseqs} || 0,"\n"; } print $fh "\n"; print $fh "Number of $protdna sequences with one or more homologs\n"; print $fh " (asymmetrical because blast query seq. is filtered)\n" unless($nofilter); print $fh "\t",join("\t",@wantporgs),"\tAny\tNseqs\n"; foreach my $porg (@wantporgs) { print $fh "$porg\t"; foreach my $aporg (@wantporgs) { print $fh $S{$porg}{$aporg} || 0,"\t"; } print $fh $S{$porg}{nhomologs} || 0,"\t"; print $fh $T{$porg}{nseqs} || 0; print $fh "\n"; } ## these are bogus values # print $fh "Any\t"; # foreach my $aporg (@wantporgs) { # print $fh $S{nhomologs}{$aporg}|| 0,"\t"; # } # print $fh "\n"; print $fh "\n"; print $fh "Percent of sequences with one or more homologs\n"; print $fh "\t",join("\t",@wantporgs),"\tAny\n"; foreach my $porg (@wantporgs) { my $pct; print $fh "$porg\t"; $pctseqs= $T{$porg}{nseqs}; $pctseqs= 100.0 / $pctseqs if ($pctseqs>0); foreach my $aporg (@wantporgs) { $pct= int ($pctseqs * $S{$porg}{$aporg} ); print $fh $pct,"\t"; } $pct= int ($pctseqs * $S{$porg}{nhomologs} ); print $fh $pct,"\n"; } ## these are bogus values # print $fh "Any\t"; # foreach my $aporg (@wantporgs) { # $pctseqs= $T{nseqs}{$aporg}; # $pctseqs= 100.0 / $pctseqs if ($pctseqs>0); # $pct= int ($pctseqs * $S{nhomologs}{$aporg} ); # print $fh $pct,"\t"; # } # print $fh "\n"; # add average triangle matrix ? } ## fixme for BLAST command # $hgnotes{CMDLINE} = "/bio/mb/blast2/blastall -v 10 -b 10 -m 9 -a 4 -p blastp -e 1e-30 -d $org1/refprot.fasta -i $org2/refprot.fasta" sub calcHgSummary { my $myorg= shift; my $orgpath = "$meowpub/$myorg"; my %hgtable= (); my %hg1table= (); my %pgtable= (); my $myporg= $porgs{$myorg}; my %gottag= (); my $kPctRange= 4; ## don't get too many low scores my $hg; $KEEPALL= 0; # debug test - below counting logic is off local(*H); if (open(H,"$orgpath/hgtable")) { print STDERR "reading $orgpath/hgtable\n" if $debug; while() { if (/^#\s*/) { s/^#\s*//; if (/Homologous genes/) { s/\[\S+\]//; $hgnotes{$myorg}= $_; } elsif (/No. sequences/) { $hgnotes{NSEQS} = $_; } elsif (/Matrix/) { $hgnotes{MATRIX} = $_; } elsif (/BLAST command/) { $hgnotes{CMDLINE} = $_; } } next unless (/^\w/); ## skip blanks, comments ## as of 29feb2000: my($id,$hgid,$porg,$sym,$score,$prob,$pct,$hgref)= split(/\t/); next unless ($id && $hgid); if ($dofilterLowPercent) { ##? filter out low percent matches as per gene records ?? ## moved this limit from blastout.pl - limit no. low scores %gottag= () if ($id eq $hgid); if ($gottag{$porg}) { next if($pct < ($gottag{$porg} - $kPctRange)); ## skip low scores unless 1st of spp. } $gottag{$porg}= $pct; } unless ($hg1table{$hgid}{any}) { $T{nseqs}{$porg}++; $hg1table{$hgid}{any} = 1; } if ($id eq $hgid) { $T{$myporg}{nseqs}++; # $T{nseqs}{$myporg}++; $T{nhomologs}{nseqs}++; %gottag= (); next; } next if ($pct < $minpct); if ($nosameorg && ($porg eq $myporg)) { #? save nums for separate table? # $T{$myporg}{self}++; unless ($pgtable{$id}{$porg}) { $S{$myporg}{nparalogs}++; $pgtable{$id}{$porg}= 1; # unless($KEEPALL); } next; } $T{$myporg}{$porg}++; $T{$myporg}{nhomologs}++; #?? this is limiting homologs per gene to 1 per other org ! why? # idea is that per gene $id, count only if $porg has any homolog, rather than total !? unless ($hgtable{$id}{$porg}) { $S{$myporg}{$porg}++; $hgtable{$id}{$porg} = 1; # unless($KEEPALL); # $S{nhomologs}{$porg}++; } unless ($hg1table{$hgid}{$porg}) { $S{nhomologs}{$porg}++; $hg1table{$hgid}{$porg} = 1; # unless($KEEPALL); } unless ($hgtable{$id}{any}) { $S{$myporg}{nhomologs}++; $hgtable{$id}{any} = 1; # unless($KEEPALL); } } close(H); } } 1; __END__ ============== test.pl perl -e 'open(D,"hgtable"); while() { next unless (/^\w/); \ ($id,$hgid,$porg,$sym,$score,$prob,$pct,$hgref)= split(/\t/);\ next unless ($id && $hgid); $ids{$id}++ if ($id eq $hgid); }\ close(D); open(D,"acode"); while(){if (/^ID\|(.+)/){$id=$1; delete $ids{$id};}}\ close(D); print "Missing in acode\n"; foreach(keys %ids){print "$_\n";}' | more # #$bp= "1234567890987"; # $bp= "12345"; # # $bp= sprintf("%'20d",$bp); # while ($bp =~ s/(\d)(\d\d\d)$/$1/) { # $sbp = ",$2$sbp"; # } # $sbp= $bp.$sbp; # # sub pctformat1 { # my $n= shift || 0; # $n= sprintf("%.3g", $n); # # $n =~ s/0+$//; $n = substr($n,0,4); # return $n; # } # $sbp= pctformat1(199.1234); print $sbp,"\n"; # $sbp= pctformat1(99.1234); print $sbp,"\n"; # $sbp= pctformat1(9.1234); print $sbp,"\n"; # $sbp= pctformat1(0.1234); print $sbp,"\n"; # exit; $s= < Fly Weed Worm Yeast Feature gene mRNA xxx. ------------- Organism: Fly Chromosome: X 2L 2R 3L 3R 4 Feature gene mRNA xxx. ------------- Organism: Worm ... ------------- HG Table: Org / Fish Fly Human .. Total Homologs / Nseqs Fish 10 20 .... Fly Human ... Totals kalo% date Tue Nov 21 22:02:56 EST 2000 ./orgsums.pl -x reading /c3/iubio/meow-pub/meow/server/fly/hgtable reading /c3/iubio/meow-pub/meow/server/man/hgtable reading /c3/iubio/meow-pub/meow/server/mouse/hgtable reading /c3/iubio/meow-pub/meow/server/worm/hgtable reading /c3/iubio/meow-pub/meow/server/yeast/hgtable reading /c3/iubio/meow-pub/meow/server/weed/hgtable Total number of homologous proteins Ffly Human Mouse Worm Yeast Weed Zfish All Nseqs Ffly 264 10190 6146 7305 2516 2077 0 28498 12471 Human 7413 935 7863 6925 2446 2082 0 27664 8236 Mouse 4132 5503 256 3452 1143 1148 0 15634 4459 Worm 7877 9582 5527 6653 2748 2344 0 34731 18720 Yeast 2526 2802 1456 2517 2976 1531 0 13808 6190 Weed 1739 1992 1114 1865 1330 379 0 8419 6028 Zfish 0 0 0 0 0 0 0 0 0 Number of proteins with one or more homologs (asymmetrical because blast query seq. is filtered) Ffly Human Mouse Worm Yeast Weed Zfish Any Nseqs Ffly 191 4143 2752 3926 1689 1154 0 5314 12471 Human 4363 438 5109 3635 1575 1100 0 6619 8236 Mouse 2307 3949 163 1878 739 531 0 4070 4459 Worm 4326 3739 2550 1659 1718 1214 0 6070 18720 Yeast 1509 1354 850 1370 397 821 0 2084 6190 Weed 958 873 586 869 747 169 0 1342 6028 Zfish 0 0 0 0 0 0 0 0 0 Any 11635 14390 9141 12755 5650 4028 0 Percent of proteins with one or more homologs Ffly Human Mouse Worm Yeast Weed Zfish Any Ffly 1 33 22 31 13 9 0 42 Human 52 5 62 44 19 13 0 80 Mouse 51 88 3 42 16 11 0 91 Worm 23 19 13 8 9 6 0 32 Yeast 24 21 13 22 6 13 0 33 Weed 15 14 9 14 12 2 0 22 Zfish 0 0 0 0 0 0 0 0 Any 48 64 68 42 49 40 0 #--- partial blast.out/hgtable (23,000 / 61635 seqs; in middle of mouse) ========================[ Nov 23, 2000 9:31 PM ]======================== Version 2.1.2 [Oct-19-2000] Started database file "/b1/b/work//meow/work/refseq//meowprot.fa" Formated 61635 sequences oat% date Fri Nov 24 12:18:54 EST 2000 ./orgsums.pl -p$r/ -D reading /export/home/gilbertd/meow/work/refseq/fly/hgtable reading /export/home/gilbertd/meow/work/refseq/man/hgtable reading /export/home/gilbertd/meow/work/refseq/mouse/hgtable Total number of homologous proteins Ffly Human Mouse Worm Yeast Weed Zfish All Nseqs Ffly 430 11335 6456 7434 2585 2077 858 31178 12750 Human 8298 1171 9050 7763 2695 2261 1896 33138 10174 Mouse 419 329 20 397 143 144 47 1499 281 Worm 0 0 0 0 0 0 0 0 0 Yeast 0 0 0 0 0 0 0 0 0 Weed 0 0 0 0 0 0 0 0 0 Zfish 0 0 0 0 0 0 0 0 0 Number of proteins with one or more homologs (asymmetrical because blast query seq. is filtered) Ffly Human Mouse Worm Yeast Weed Zfish Any Nseqs Ffly 307 4685 2969 4097 1734 1168 457 5699 12750 Human 5079 459 6007 4153 1756 1208 1143 7812 10174 Mouse 191 271 10 170 83 70 39 275 281 Worm 0 0 0 0 0 0 0 0 0 Yeast 0 0 0 0 0 0 0 0 0 Weed 0 0 0 0 0 0 0 0 0 Zfish 0 0 0 0 0 0 0 0 0 Any 4440 5590 6767 7466 2808 1889 941 Percent of proteins with one or more homologs Ffly Human Mouse Worm Yeast Weed Zfish Any Ffly 2 36 23 32 13 9 3 44 Human 49 4 59 40 17 11 11 76 Mouse 67 96 3 60 29 24 13 97 Worm 0 0 0 0 0 0 0 0 Yeast 0 0 0 0 0 0 0 0 Weed 0 0 0 0 0 0 0 0 Zfish 0 0 0 0 0 0 0 0 Any 26 36 96 100 100 100 100