#!/usr/local/bin/perl # gbgenomecds.pl # compute similar/homologous genes among meow set of organisms # paths for iubio computer $debug= 1; $doprot= 1; @sys= `uname -X`; %sys= map { chomp; split(' = '); } @sys; if ($sys{Node} eq 'oat') { $TMPD= "/b1/b/work/" ; } elsif ($sys{Node} eq 'kalo') { $TMPD= "/c5/tmp"; } $genomeshome= "$TMPD/genomes/ncbi"; ## '/c3/iubio/biomir-pub/biomirror/genbank/genomes' $mework= "$TMPD/meow/work/refseq/"; %genomepaths = ( weed => "$genomeshome/A_thaliana/", worm => "$genomeshome/C_elegans/", yeast => "$genomeshome/S_cerevisiae/", # fly => "$genomeshome/D_melanogaster/Scaffolds/", # man => "$genomeshome/H_sapiens/", # mouse => "$genomeshome/M_musculus/", ecoli => "$genomeshome/bacteria/Ecoli/", bsub => "$genomeshome/bacteria/Bsub/", ); @bactorgs= qw(ecoli bsub); foreach $org (@bactorgs) { mkdir("$mework/$org/",0777) unless (-d "$mework/$org/"); my $outfile= "$mework/$org/refprot.fasta"; my $genomedir= $genomepaths{$org}; opendir(D,$genomedir); my @gbk= grep(/gbk\.(Z|gz)$/,readdir(D)); closedir(D); foreach my $gbkf (sort @gbk) { my $gbk= "$genomedir/$gbkf"; ##! don't do 1st only - D_m has many in Scaffold/LARGE/ print STDERR "readcsome $gbk\n" if $debug; ($nfeat,$ncds)= readcsome ($gbk, $org, $outfile, $genomedir); print STDERR " done, feats = $nfeat, cds= $ncds\n" if $debug; } } #--------- sub readcsome { my($gbk, $org, $outfile, $genomedir)= @_; my ($fout, $ftname,$ncds,$nfeat, $infeat); my @feat= (); local(*GB,*FOUT,*FPROT); $genomeDID= $gbk; $genomeDID= $1 if ($gbk =~ m=/(\w+)\.gbk\.(Z|gz)=); if ($doprot) { $fout= $outfile; ## "$outpath/$genomeDID-cds.fasta"; unless (open(FPROT,">>$fout")) { warn "can't write $fout"; return -1; } } unless (open(GB,"gzcat $gbk|")) { warn "can't read $gbk"; return -1; } while () { my $line= $_; if (/^FEATURES /) { $infeat= 1; } elsif (/^ORIGIN /) { $infeat= -1; last; } elsif ($infeat) { if (/^ (\S+) /) { my $newftname= $1; if (scalar(@feat)) { $nfeat++; $ncds += CDS2Fasta(*FPROT,@feat) if ($ftname eq 'CDS' && $doprot); } $ftname= $newftname; @feat= (); push(@feat,$line); } elsif (/^ /) { push(@feat,$line); } } } close(GB); close(FPROT) if ($doprot); return ($nfeat++,$ncds); } sub CDS2Fasta { my($fhfasta,@cds)= @_; my $ncds= 0; my ($sym,$tr,$dbid,$range) = parseCDS(@cds); if ($sym && $tr) { $dbid= "GBGENOME:?" unless($dbid); ## messup - need something ## ^ use gbgenome file-name:337..2799 (feature-range) $dbid= "$genomeDID:$range"; # my $id= flybase::sym2id::symbol2id($sym); my $id= $sym; # if (!$id) { # $id= $sym; # # print STDERR "CDS2Fasta: missing id for $sym\n"; # } if (!$didid{$id}) { # $sym= flybase::sym2id::id2symbol($id) || $sym; print $fhfasta ">$id; $sym $dbid CDS\n$tr\n"; $didid{$id}++; $ncds++; } } return $ncds; } sub parseCDS { my(@cds)= @_; my ($gsym,$tr,$dbid) = ('','',''); my $intr=0; my $range= shift @cds; $range= chomp($range); $range =~ s/^\s+CDS\s+//; foreach (@cds) { if (m|/gene="([^"]+)|) { $gsym= $1; # print STDERR "/gene=$gsym\n" if $debug; } elsif (m|/protein_id="([^"]+)|) { $dbid= "PROTID:$1"; ## is this always best one? } elsif (m|/db_xref="([^"]+)|) { my $dbx= $1; $dbid= $dbx if ($dbx =~ /^GI:/ && $dbid =~ /^PID:/); $dbid= $dbx if ($dbx =~ /^PID:/ && $dbid eq ''); ## PROTID > GI > PID /db_xref="GI:6598620" ## worst: /db_xref="PID:g6322237" -- want only this one? ## no: /db_xref="SGD:S0003760" } elsif ( m|/translation="([^"]+)|) { $tr= $1; $intr=1 unless( m/\"$/ ); #" # print STDERR "/translation=$tr" if $debug; return ($gsym,$tr,$dbid,$range) unless($intr); } elsif ($intr) { s/^\s+//; if (/\"/) { $intr= 0; s/\"//; } #" $tr .= $_; return ($gsym,$tr,$dbid,$range) unless($intr); } } return ($gsym,$tr,$dbid,$range); }