#!/usr/local/bin/perl # genomefeat.pl # for meow & flybase =head1 NOTES scan ncbi genome .gbk , ? .ptt for feature info combine parts with meowrefseq.pl need also to capture /chromosome tag - split feature.tsv into csomes? for flybase data, add cytoloc map pos from fly.acode or cytodb, ... add other cytoloc mapped data (cytodb set) using cyto->base mapping of sorsa.txt june 2000 add some way to correlate gene locations for homologs among spp. -- use hgtable per species, add 'feature' == hg genes in same range, -- iff "several" genes match in a range (? opt - 5-10?, over what range?) --> write new multiorg/features.tsv, don't put in org features 2001 - ncbi genome data is mostly obsolete, not best current data - instead of pulling features from these data, use data sets from genome projects, parsers for each data set 2002 - ncbigenomes/ section is curated and more parseable that some others Jun01 - parsers in progress - fly - use parseflyxml.pl and/or gadb2feats.pl, gadb2acode2.pl -- change to bdgp gff parser ? (jun2002) worm - use parsewormgff.pl human - use parsegolden.pl -- 2002 needs update for new goldenpath sql weed - use parseweedxml.pl (or use this, j2002) yeast - use this parser? (ncbi yeast genome dataset is kept current from SGD) apr01 -- feature range offset off-by-1 errs: should change to common base (1), but till get this figured, use a keyval in gnomap fly celera/BDGP xml is 0-based, location 1 base past feature ! jun01 - revised fly to 1-based, location inclusive for feature -- use gadb2feats.pl or parseflyxml.pl worm (sanger) GFF seems to be 1-based: ##gff-version 2 ##date 2001-03-20 ##sequence-region CHROMOSOME_I 1 14910158 - use parsewormgff.pl human goldenpath - 0 based? - use parsegolden.pl weed genome (TIGR) - use parseweedxml.pl ## yeast NCBI Genbank/Genomes - 1 based # FEATURES Location/Qualifiers # source 1..230203 -------------------------------------- =cut $debug= 1; use Getopt::Std; use POSIX; use flybase::sym2id; use flyfeatures; # move to separate perl? ## paths for iubio/kalo $meowpub= '/bio/meow-pub/eugenes/server/'; $mework= '/c7/eugenes/genomes/'; $genbankgenomes= '/bio/biomir-pub/biomirror/genbank/genomes'; $ncbigenomes= '/bio/biomir-pub/biomirror/ncbigenomes'; # $sourceName= 'NCBI Genomes'; # ^^ this data mostly is not current... $isMeowId= 1; ## for getSym2Id - acode data $gnomapvers= '1'; $kMissingValue= -999999999; $kMaxValue= 9999999; ## ordered to build dna/prot lib @fullorglist= qw(fly man mouse worm yeast weed fish); ## those now w/ refseq acode entries @refseqorglist= qw(fly man mouse worm); #? add weed, yeast %spptag = ( FBgn => 'Fruitfly', # 'Drosophila melanogaster', HUgn => 'Human', # 'Homo sapiens', SGgn => 'Yeast', # 'Saccharomyces cerevisiae', MGgn => 'Mouse', # 'Mus musculus', CEgn => 'Worm', # 'Caenorhabditis elegans', ATgn => 'Weed', # 'Arabidopsis thaliana', ZFgn => 'Zebrafish', # 'Danio rerio', ANgn => 'Mosquito', # 'Anopheles gambiae', OSgn => 'Rice', # 'Oryza sativa', # japonica or xxx ); %orgtag = ( FBgn => 'fly', HUgn => 'man', SGgn => 'yeast', MGgn => 'mouse', CEgn => 'worm', ATgn => 'weed', ZFgn => 'fish', ANgn => 'mosquito', OSgn => 'rice', ); # currency of genbank genomes as of June 2002: %genomepaths = ( # not usable fly => "$genbankgenomes/D_melanogaster/Scaffolds/", ## 2000 data ! # usable if can join segments man => "$ncbigenomes/H_sapiens/", # ncbigenomes - May2002; - each chr file is set of many LOCUS segments # do we have any scaffold/assembly map for it? # ! see maps/mapview/chromosome../ for chr*.sequence.gz == all feature table # genbankgenomes obsolete for features # usable if can join segments mouse => "$ncbigenomes/M_musculus/", # May 2002 data ## CHR_01 .. CHR_19, X, Y - *gbk* has many LOCUS segments # usable weed => "$ncbigenomes/A_thaliana/", ## current Feb 2002 # usable worm => "$ncbigenomes/C_elegans/", ## ncbigenomes - Jan2002 (older than wormbase) # genbankgenomes - 1999 last update ! # usable yeast => "$genbankgenomes/S_cerevisiae/", # current 2002 - kept current from SGD # usable if can join segments mosquito => "$genbankgenomes/Anopheles_gambiae/Assembly_scaffolds/", # 2002 data # see also $ncbigenomes/Anopheles_gambiae/maps/mapview ); %fileIsChromosome = ( fly => 0, man => 1, mouse => 1, weed => 1, worm => 1, yeast => 1, mosquito => 0, # scaffs like fly ); %species = ( fly => "D. melanogaster", man => "H. sapiens", mouse => "M. musculus", weed => "A. thaliana", worm => "C. elegans", yeast => "S. cerevisiae", mosquito => "Anopheles gambiae", ); %roman2arab= ( I => 1, II => 2, III => 3, IV => 4, V => 5, VI => 6, VII => 7, VIII => 8, IX => 9, X => 10, XI => 11, XII => 12, XIII => 13, XIV => 14, XV => 15, XVI => 16, XVII => 17, XVIII => 18, XIX => 19, XX => 20, XXI => 21, XXII => 22, XXIII => 23, XXIV => 24, XXV => 25, ); #------- if ($macHome) { &flyfeatures::initFlydata(); } ## reset paths main(); exit; #------- sub main { @orgs=(); foreach (sort keys %genomepaths) { push(@orgs,$_); } $orgs= join(',',@orgs); @flycsomes= flyfeatures::getCsomes(); #? not exported right? $flycsomes= join(',', @flycsomes); %opt=(); Getopt::Std::getopts('dfijmprADSC:O:cuwsxR:F:W:',\%opt); $usage= <$dir/idmap.tsv"); open(IMAPX,">$dir/idmap.tsv.idx"); foreach my $file (sort @files) { my $sfile= $dir . $file; $csome= $1 if ($file =~ /^features-(\w+)/); print &indexFeatures($sfile, 'index'); print &indexIds($sfile, 'idindex', $csome, *IMAP, *IMAPX); } close(IMAP); close(IMAPX); } foreach $org (@orgs) { if ($indexfeat && $org =~ /fly/) { readflysorsa(); $dir= "$mework/$org/"; #?? $dir= flyfeatures::getDatapath(); } else { $dir= "$mework/$org/"; } indexfeatdir($dir); if ($org =~ /fly/) { $dir .= '/eugenes/'; indexfeatdir($dir) if (-d $dir) ; } } } if ($rewritefeats) { foreach $org (@orgs) { next unless($org =~ /\w/); if ( $org =~ /fly/) { readflysorsa(); } print STDERR "rewriteFeats for $org\n" if $debug; my $acode= "$meowpub/$org/acode"; rewriteFeats( $org, $acode, "$mework/$org/", $dosort); } } if ($markergenes) { # test foreach $org (@orgs) { next unless($org =~ /\w/); print STDERR "markegenes for $org\n" if $debug; my $acode= "$meowpub/$org/acode"; markergenes( $org, $acode, "$mework/$org/"); } } if ($wantfeat || $wantdna || $doprot) { foreach $org (@orgs) { next unless($org =~ /\w/); print STDERR "makelib for $org\n" if $debug; my $acode= "$meowpub/$org/acode"; readgenomes( $org, $acode, "$mework/$org/"); } } } sub dumpdna { ## hack for testing dna.xxx.raw indexing in gnomap my ($dnafile, $srange)= @_; $dnafile= "$mework/$org/".$dnafile unless (-r $dnafile); local(*F); die "Can't open $dnafile" unless(open(F,$dnafile)); my ($start, $stop)= maxrange($srange); my ($len, $data); $len= $stop - $start; seek(F, $start, 0); read(F, $data, $len); close(F); print ">$dnafile $start..$stop\n$data\n"; } sub fasta2raw { my ($fasta, $chr, $splitonfaline)= @_; if ($fasta =~ /\.(gz|Z)$/) { $ok= (open(FT,"gzcat $fasta|")); } else { $ok= (open(FT,$fasta)); } unless($ok) { warn "Cant read $fasta"; return; } my $rawfile; unless($splitonfaline) { unless($chr) { $chr= $1 if ($fasta =~ m/(\d+)/); # ATH1_pseudo_(\d) } $rawfile= "dna-$chr.raw"; $rawfile= "$mework/$org/".$rawfile; unless(open(OT,">$rawfile")) { warn "Cant write $rawfile"; return; } } my ($nline, $nb); while() { $nline++; if (/^>/) { unless($splitonfaline) { warn "extra > $_" if $nline>1; } else { #>Chromosome_arm_2L 12X coverage (Oct 2000) if (/arm_(\w+)/) { $chr= $1; } else { $chr++; } if ($rawfile) { close(OT); warn "wrote $nb bases ($nline lines) to $rawfile from $fasta\n"; $nline= $nb= 0; } $rawfile= "dna-$chr.raw"; $rawfile= "$mework/$org/".$rawfile; unless(open(OT,">$rawfile")) { warn "Cant write $rawfile"; return; } $isopen= 1; } next; } chomp(); print OT $_; $nb += length($_); } close(FT); close(OT); warn "wrote $nb bases ($nline lines) to $rawfile from $fasta\n"; } sub byidsize { my $as= $idsize{$a}; my $bs= $idsize{$b}; return ($bs <=> $as); ## big to small } sub byidsizerange { my ($as,$ax)= split(/\t/, $idsteph{$a}); my ($bs,$bx)= split(/\t/, $idsteph{$b}); return ($bs <=> $as); ## big to small } sub markergenes { my( $org, $orgacode, $dir)= @_; $markmax= 100; # %sym2id= getSym2Id($org, $orgacode, 'idsize'); flybase::sym2id::set4meow($isMeowId); flybase::sym2id::setcaseless(1) if ($org =~ m/weed/); flybase::sym2id::doMarker('idsize'); flybase::sym2id::readSym2Id($orgacode, $org) if (-f $orgacode); # %idsize= flybase::sym2id::getMarkerVals(); my @tm= localtime( $^T - 24*60*60*(-M $orgacode) ); my $adate= POSIX::strftime("%d-%B-%Y",@tm); ## should also read features - pick some set of best, spaced ~ evenly thru each csome local(*D,*F,*O); opendir(D, $dir) || warn "can't open $dir"; my @files= grep( /^features-\w+\.tsv$/, readdir(D)); closedir(D); foreach my $file (sort @files) { my $sfile= $dir . $file; ## my $idx= $dir . "idmap.idx"; $csome= $1 if ($file =~ /^features-(\w+)/); my $mfile= $dir . "markers-$csome.tsv"; die "Can't read $file" unless (open(F,$sfile)); my $recsize = length(pack("LL", 1, 500)); ## a constant -- 2 long integers my %didid= (); $allstart= ''; my $at= 0; @idstep= (); $stepsize= ''; while () { if (/^\w/) { $nl++; chomp(); my ($class,$sym,$map,$range,$ids,$dbx)= split(/\t/); if ($class eq 'source' && !$stepsize) { ($allstart, $allstop)= maxrange($range); $stepsize= int(($allstop - $allstart)/20); } $ids= '' unless($class =~ /gene/); ## FIXME - need another index method to mix FBgn/FBan/FBxx if ($ids =~ s/([^:]+)://) { my $db= $1; ## skip not-our-id ids $ids= '' unless ($db =~ m/FlyBase|MEOW|euGenes/i); } if ($ids =~ m/[A-Za-z]*(\d+)/) { my $idnum= $1; if (!$didid{$idnum}) { if ( $range && $range ne '-') { $didid{$idnum}++; my($start, $stop)= maxrange($range); my $mid= int( ($start + $stop)/2); my $step= int( ($mid - $allstart)/ $stepsize); my $size= $idsize{"MEOW:$ids"}; $idstep[$step]{$ids}= "$size\t$start\t$stop" if ($size); } } } } } close(F); open(O, ">$mfile") || die "can't write $mfile"; print O "# marker genes, chr $csome for $org\n"; print O "# computed from best populated MEOW records, $adate\n"; print O "# id\t sym\t range\t marker-score\n"; for (my $i= 0; $i<= $#idstep; $i++) { %idsteph= % {$idstep[$i]}; next unless (scalar(%idsteph)); # bad [$step] ?? @markid= sort byidsizerange keys %idsteph; # @markid= sort byidsize keys %idsize; $ids= $markid[0]; next unless ($ids); # bad [$step] ?? # $sym= $idsym{"MEOW:$ids"}; $sym= flybase::sym2id::id2symbol("MEOW:$ids") || $sym; ($size,$start,$stop)= split(/\t/, $idsteph{$ids}); print O "$ids\t$sym\t$start..$stop\t$size\n"; } close(O); } ## close(IMAP); close(IMAPX); } sub rewriteFeats { my( $org, $orgacode, $dir, $dosort)= @_; # update features*.tsv w/ current cmap, gsym, id, etc. # jun01 - revise this to also sort by location, other fixes? # %sym2id= getSym2Id($org, $orgacode ); flybase::sym2id::set4meow($isMeowId); flybase::sym2id::setcaseless(1) if ($org =~ m/weed/); flybase::sym2id::readSym2Id($orgacode, $org) if (-f $orgacode); local(*D,*F,*O); opendir(D, $dir) || warn "can't open $dir"; my @files= grep( /^features-\w+\.tsv$/, readdir(D)); closedir(D); foreach my $file (sort @files) { my $sfile= $dir . $file; ## my $idx= $dir . "idmap.idx"; $csome= $1 if ($file =~ /^features-(\w+)/); $chromosome = $csome; my $mfile= $dir . "features-$csome.tsv.rewrite"; die "Can't read $file" unless (open(F,$sfile)); die "can't write $mfile" unless open(O, ">$mfile") ; my @featlist= (); while () { if (/^\w/) { $nl++; chomp(); my @feats= split(/\t/);; # ^^ FIXME - sym/range swap below !! # my ($featname,$sym,$map,$range,$ids,$dbx)= split(/\t/); # my @dbx= ($dbx eq '-') ? () : split(/,/,$dbx); if ($dosort) { push( @featlist, \@feats); } else { printFeat( *O, $org, '', @feats ); # $featname, $range, $sym, $map, $ids, $dbx, '' } } else { print O $_; } } if ($dosort) { @featlist= sort _by_startAndGroup @featlist; foreach my $ft (@featlist) { printFeat( *O, $org, '', @$ft ); } } close(F); close(O); } } sub _by_startAndGroup { # jun01 added cytomap as [2] ! my($afeat,$bfeat,$arange,$brange,$astart,$bstart,$agroup,$bgroup); $afeat= $a->[0]; $arange= $a->[3]; # $astart= $1 if ($arange =~ /^\D*(\d+)/); $astart= ($arange =~ m/^\D*([-]?\d+)/) ? $1 : $kMaxValue; #?? $kMissingValue if ($afeat eq 'source') { $agroup= ' '; } else { $agroup= $a->[1]; $agroup= "xxx" if (!$agroup || $agroup eq '-'); } if ($afeat =~ /gene/) { $seqstart{$agroup}= $astart; $arange= 1; } elsif ($afeat =~ /CDS|mRNA/) { $astart= $seqstart{$agroup} || $astart; $arange= 2; } else { $arange= 3; } $bfeat= $b->[0]; $brange= $b->[3]; $bstart= ($brange =~ m/^\D*([-]?\d+)/) ? $1 : $kMaxValue; #?? $kMissingValue if ($bfeat eq 'source') { $bgroup= ' '; } else { $bgroup= $b->[1]; $bgroup= "xxx" if (!$bgroup || $bgroup eq '-'); } if ($bfeat =~ /gene/) { $seqstart{$bgroup}= $bstart; $brange= 1; } elsif ($bfeat =~ /CDS|mRNA/) { $bstart= $seqstart{$bgroup} || $bstart; $brange= 2; } else { $brange= 3; } # Larry Wall sez: (0 cmp/<=> fall down to next 'or' case) return ($astart <=> $bstart) or ($agroup cmp $bgroup) or ($arange <=> $brange); # $byr= ( $astart <=> $bstart ); # if ($byr == 0) { # $byr= ( $agroup cmp $bgroup ); # if ($byr == 0) { $byr = ($arange <=> $brange); } # } # return $byr; } # ##### # ## # ## fly data fixups --> moved to flyfeatures.pl # ## # ##### ## read gene sym info from genbank genomes, add as refseq/refprot? to acode? sub readgenomes { my($org, $orgacode, $outpath, $refseq)= @_; my $genomedir= $genomepaths{$org}; unless(-d $genomedir) { warn "can't find genome path for $org"; return 1; } %allfeats= (); %allnotes= (); %didid= (); # %sym2id= getSym2Id($org, $orgacode); flybase::sym2id::set4meow($isMeowId); flybase::sym2id::setcaseless(1) if ($org =~ m/weed/); flybase::sym2id::readSym2Id($orgacode, $org) if (-f $orgacode); # local(*FOUT); # unless (open(FOUT,">$featureout")) { warn "can't write $featureout"; return 1; } print STDERR "readgenome of $org [$genomedir]\n" if $debug; my $first= 1; local(*D); opendir(D,$genomedir); @chrs= grep(/\w/,readdir(D)); closedir(D); foreach my $chrd (sort @chrs) { $csomeFile= ($fileIsChromosome{$org} ? $chrd : ''); if ($csomeFile =~ m/^CHR.(\w+)/i) { # fix for worm data $csomeFile= 'Chr '.$1; } $chrd= "$genomedir/$chrd"; next unless (-d $chrd); opendir(D,$chrd); my @gbk= grep(/gbk\.(Z|gz)$/,readdir(D)); closedir(D); foreach my $gbkf (sort @gbk) { next if ($org eq 'yeast' && $gbkf =~ /yst_/); ## skip dup data ! next if ($gbkf =~ /mrna/); ## also need to drop 'worm_' on fname for 'worm' org my $gbk= "$chrd/$gbkf"; ##! dont do 1st only - D_m has many in Scaffold/LARGE/ if (-f $gbk) { print STDERR "readcsome $gbk\n" if $debug; ($nfeat,$ncds,$ndna)= readcsome ($gbk, $org, $outpath, $genomedir); print STDERR " done, feats = $nfeat, dna= $ndna, cds= $ncds\n" if $debug; } } } # undef %sym2id; # undef %idsym; } sub featsum { my ($org, $genomedir)= @_; my $fs = "#\n"; $fs .= "# Genome feature summary of $org [$genomedir]\n"; ## $species{$org} $fs .= "#\n"; $fs .= "# Feature count\n"; foreach (sort keys %allfeats) { $fs .= "# $_\t$allfeats{$_}\n"; } $fs .= "#\n"; $fs .= "# Feature notes count\n"; foreach (sort keys %allnotes) { $fs .= "# $_\t$allnotes{$_}\n"; } return $fs; } #sub readcsome0 { # my($fhgenome, $fhout, $refsym2id, $doprot)= @_; # my %sym2id= %{$refsym2id}; # my ($tr,$incds,$ftname,$ncds,$infeat); # my @feat= (); #} sub readcsome { my($gbk, $org, $outpath, $genomedir)= @_; my ($fout, $ftname,$ncds,$nfeat, $infeat,$ndna, $inseq); my @feat= (); local(*GB,*FOUT,*FPROT,*FDNA); my $gname= $gbk; $gname =~ s/worm_// if ($org eq 'worm'); if ($org eq 'weed') { $gname= $1 if ($gname =~ m,/CHR_(\w+)/,); ## back to 1..5 from I..V $gname= $roman2arab{$gname} || $gname; } $gname= $1 if ($gname =~ m=/([\w\_]+)\.gbk\.(Z|gz)=); $chromosome= ''; my $scaf= ''; if ($org =~ /fly/) { $scaf= flyfeatures::getScaffold($gname); } if ($wantfeat) { $fout= "$outpath/features-$gname.tsv"; unless (open(FOUT,">$fout")) { warn "can't write $fout"; return -1; } print FOUT &feattabheader("$org/features-$gname.tsv",$org,$gbk); } if ($doprot) { $fout= "$outpath/cds-$gname.fasta"; unless (open(FPROT,">$fout")) { warn "can't write $fout"; return -1; } } if ($wantdna) { $fout= "$outpath/dna-$gname.raw"; unless (open(FDNA,">$fout")) { warn "can't write $fout"; return -1; } } ## FIXME - add dump of document above FEATURES to sep. file for regenerating seqs unless (open(GB,"gzcat $gbk|")) { warn "can't read $gbk"; return -1; } while () { my $line= $_; if (/^FEATURES/) { $infeat= 1; } elsif ($inseq && $wantdna) { s/\s+//g; ## no spaces s/\d+//g; ## eat line numbers $ndna += length($_); print FDNA $_ if ($_); } elsif (/^ORIGIN/) { if (scalar(@feat)) { my @parsedfeat= parseFeat(@feat); printFeat(*FOUT,$org,$scaf,@parsedfeat) if ($wantfeat); $nfeat++; $ncds += CDS2Fasta(*FPROT,@feat) if ($ftname eq 'CDS' && $doprot); } $infeat= -1; ## opt pull sequence data into single raw file ! if ($wantdna) { $inseq= 1; } else { last; } ## return $ncds; } elsif ($infeat) { if (/^ (\S+) /) { my $newftname= $1; if (scalar(@feat)) { $nfeat++; my @parsedfeat= parseFeat(@feat); printFeat(*FOUT,$org,$scaf,@parsedfeat) if ($wantfeat); ## $species{$org} $ncds += CDS2Fasta(*FPROT,@feat) if ($ftname eq 'CDS' && $doprot); } $ftname= $newftname; @feat= (); push(@feat,$line); } elsif (/^ /) { push(@feat,$line); } } } close(GB); close(FPROT) if ($doprot); close(FDNA) if ($wantdna); if ($wantfeat) { my $fsum= featsum($org,$genomedir); %allfeats= (); %allnotes= (); print FOUT $fsum; ## print STDOUT $fsum; close(FOUT); } return ($nfeat++,$ncds,$ndna); } sub feattabheader { my ($fname,$org,$sourcefile)= @_; my @tm= localtime( $^T - 24*60*60*(-M $sourcefile) ); my $date= POSIX::strftime("%d-%B-%Y",@tm); $sourcefile =~ s/^($genbankgenomes|$ncbigenomes)//; ## $sourcefile =~ s/\w+\.gbk\./\*.gbk./; ## $species{$org} return <-]*)//; $pre= $1; $range =~ s/(\D*)$//; $suf= $1; if ($range =~ m/^([<>]*)([\d-]+)/) { $u= $1; $start= $2; $start-- if ($u eq '<'); } if ($range =~ m/([<>]*)([\d-]+)$/) { $u= $1; $stop= $2; $stop++ if ($u eq '>'); } return ($start,$stop); } sub scaffold2arm { my($scaf, $range)= @_; my ($nr, $end, $b, $u); $range =~ s/^([^\d<>-]*)//; $nr= $1; $range =~ s/(\D*)$//; $end= $1; if ($scaf->{reversed}) { my @nr= (); while ($range =~ s/^([<>]?)([\d-]+)([^\d<>-]*)//) { $u= $1; $b= $2; $nd= $3; $b= $scaf->{baseb} + ($scaf->{bsize} - $b); if ($u eq '>') { $u= '<'; } elsif ($u eq '<') { $u= '>'; } push(@nr, $u . $b); push(@nr, $nd) if ($nd); } while ($b= pop(@nr)) {$nr .= $b; } $nr .= $end; unless ($nr =~ s/^complement\((.+)\)$/$1/) { $nr= 'complement('.$nr.')'; } } else { while ($range =~ s/^([<>]?)([\d-]+)([^\d<>-]*)//) { $u= $1; $b= $2; $nd= $3; $b= $scaf->{baseb} + $b; $nr .= $u . $b . $nd; } $nr .= $end; } return $nr; } ## ## index features*.tsv by ID field for lookup by id ## sub indexIds { my($file, $kind, $csome, $idmapf, $idmapx)= @_; local(*F,*O,*IDMAP,*IDMAPX,$n, $nl); $kind= 'idindex' unless($kind); my $idx= $file . ".idx"; die "Can't read $file" unless (open(F,$file)); die "Can't write $idx" unless (open(O,">$idx")); my $recsize = length(pack("LL", 1, 500)); ## a constant -- 2 long integers my %didid= (); my $at= 0; while () { my $e= tell(F); if (/^\w/) { $nl++; chomp(); my ($class,$sym,$map,$range,$ids,$dbx)= split(/\t/); $ids= '' if ($class =~ /cytowalk/); $ids= '' unless($class =~ /gene|tRNA/ || $ids =~ /gn\d+/); ## FIXME - need another index method to mix FBgn/FBan/FBxx if ($ids =~ s/([^:]+)://) { my $db= $1; ## skip not-our-id ids # $ids= '' if ($db =~ m/PID|SGD/i); $ids= '' unless ($db =~ m/FlyBase|MEOW|euGenes/i); } if ($ids =~ m/[A-Za-z]*(\d+)/) { my $idnum= $1; if (!$didid{$ids}) { ## {$idnum}) { $didid{$ids}++; ## {$idnum}++; my $size= $e - $at; my $record= pack("LL", $at, $size); my $idloc = $idnum * $recsize; seek(O, $idloc, 0); ## check if already did id == e.g., several feats have same ID, pick 1st? always print O $record; $n++; # ?? also write to single $org id-map.tsv ? and do .idx for it? if ($idmapf && $range && $range ne '-') { my($start, $stop)= maxrange($range); $at= tell($idmapf); ## my $id= $ids; $id =~ s/[^:]+://; print $idmapf "$ids\t$csome\t$start..$stop\n"; $e= tell($idmapf); $size= $e - $at; $record= pack("LL", $at, $size); seek($idmapx, $idloc, 0); print $idmapx $record; } } } } $at = $e; } close(O); close(F); return "indexIds $file = $n / $nl\n"; } ## ## index features*.tsv by base range ## sub indexFeatures { my($file, $kind)= @_; local(*F,*O,$n); $kind= 'index' unless($kind); ##return if ($kind ne 'index'); my $idx= $file . ".ranges"; die "Can't read $file" unless (open(F,$file)); die "Can't write $idx" unless (open(O,">$idx")); print O "# $idx\n"; print O "# base range -> file index, and source, scaffold ranges\n"; print O "# tab-separated-values of: \n"; print O "# base-start | file-index OR class-name | file-index | location\n"; my $bindex= 0; ##? off by one? was 0; my $nextstep= -666; my $stepsize= 100000; while () { # my $blength= length($_); if (/^\w/) { chomp(); my ($class,$sym,$map,$range,$ids,$dbx)= split(/\t/); my ($start,$stop)= ($kMissingValue, $kMissingValue); if ($range eq '-') { ($start, $stop)= flyfeatures::flyCytomap2Bases($map) if ($org =~ /fly/); } else { ($start, $stop)= maxrange($range); } if ($start!=$kMissingValue && $stop!=$kMissingValue) { if ($class eq 'source') { @csomerange= ($start, $stop); $stepsize= int( ($stop - $start) / 100 ); } if ($class eq 'source' || $class eq 'scaffold') { print O "$class\t$bindex\t$range\n"; } elsif ($nextstep == -666 || $start >= $nextstep) { $nextstep= $start if ($nextstep == -666); $nextstep= 0 unless($nextstep); ## dang perl print O "$nextstep\t$bindex\n"; $nextstep += $stepsize; } $n++; } } # $bindex += $blength; $bindex= tell(F); ##? off by one ?? } print O "$csomerange[1]\t$bindex\n"; close(F); close(O); return "indexFeatures $file = $n\n"; } sub printFeat { my ($fh,$org,$scaf, $featname,$gsym,$map,$srange,$id,$rdbx,$rfnotes)= @_; #Was# out of order##! $featname,$srange,$gsym,$map,$id,$rdbx,$rfnotes)= @_; # my ($featname,$srange,$gsym,$map,$id,$rdbx,$rfnotes)= parseFeat(@feat); # @parsedfeat == ($featname,$srange,$gsym,$map,$id,$rdbx,$rfnotes) unless (ref($rdbx) =~ /ARRAY/) { my @dbx= ($rdbx eq '-') ? () : split(/,/,$rdbx); $rdbx= \@dbx; } ## ## need special fixes for genome/fly/scaffolds to weave into csome ordered data ## & join w/ more updated flybase data ## my $isscaf= 0; if ($scaf) { ## && $org =~ /fly/ my $arange= scaffold2arm($scaf,$srange); ## print STDERR "$srange => $arange\n" if $debug; $srange= $arange; if ($featname eq 'source') { ##! and need to build new 'source' from scaffolds $featname= 'scaffold'; $gsym= $scaf->{name}; $isscaf= 1; } } if ($org =~ /fly/) { if ($map !~ /^\d/ && $chromosome && $cytomap ne $chromosome) { readFlyCytomap($chromosome); $cytomap= $chromosome; } my $fbid; if ($id =~ /(FBgn\d+)/) { $fbid= $1; } else { foreach (@{$rdbx}) { $fbid= $1 if (/(FBgn\d+)/); } } if ($fbid) { my $fsym= flyfeatures::getSym($fbid); $gsym= $fsym if ($fsym); if ($map !~ /^\d/) { my $cmap= flyfeatures::getCytomap($fbid); $map= $cmap if ($cmap); } } } unless($gsym) { if ($featname =~ /^source/) { $gsym= $org; } else { $gsym= '-'; } } unless($map) { if ($featname =~ /^(source|scaffold)/ && $csomeFile) { $map= $csomeFile; } else { $map= '-'; } } unless ($id) { # $id= $sym2id{ symcase($gsym) }; $id= flybase::sym2id::symbol2id( $gsym ); } # $gsym= $idsym{$id} if($idsym{$id}); # make sure is proper symbol $gsym= flybase::sym2id::id2symbol($id) || $gsym; $allfeats{$featname}++; print $fh "$featname\t$gsym\t$map\t$srange\t$id\t"; foreach (sort @{$rdbx}) { print $fh "$_,";} if ($wantnotes) { print $fh "\t"; foreach (sort keys %{$rfnotes}) { print $fh "$_,"; $allnotes{$_} += ${$rfnotes}{$_}; } } print $fh "\n"; # push(@newsource,$line) if ($isscaf); } sub parseFeat { my( @feat)= @_; my ($featname, $srange, $fnote, $val, $map, $id, $gsym,$inrange); my @dbx= (); my %fnotes= (); my $fline= shift(@feat); if ($fline =~ /^ (\S+)\s+(\S+)/) { ## 1st line only !? $featname= $1; $srange= $2; $inrange= 1; } foreach (@feat) { if ( m|^\s+/(\w+)\s*=\s*"([^"]+)|) { $fnote= $1; $val= $2; $fnotes{$fnote}++; $inrange= 0; if ($fnote eq 'gene') { $gsym= $val;} elsif ($fnote eq 'map') { $map= $val;} elsif ($fnote eq 'chromosome') { #? only in source feature? not in worm csome $map= "Chr $val"; $chromosome= $val; } elsif ($fnote eq 'protein_id') { if ($featname eq 'CDS') { $id= $val; } else { push(@dbx, "PROTID:$val"); } ##? leave out if $id } elsif ($fnote eq 'db_xref') { ## FIXME parse out prime ID from here my $id1; if ($org =~ /fly/ && $featname eq 'gene' && $val =~ /FBgn/) { $id1= $val; } elsif ($org =~ /fly/ && $featname eq 'mRNA' && $val =~ /FBan/) { $id1= $val; } elsif ($org eq 'yeast' && $featname eq 'gene' && $val =~ /SGD:/) { $id1= $val; } elsif (!$id && $featname eq 'CDS' && $val =~ /PID:/) { $id1= $val; } ## for yeast, need to move $id from CDS to gene (which hasnt a db_xref in ncbi genomes) if ($id1) { $id= $id1; } else { push(@dbx, $val); } ##? leave out if $id } } elsif ( m|^\s+([^"]+)|) { ## " continuation line if ($inrange) { $srange .= $1; chomp($srange); } } else { ## nothing else? } } ## return ($featname,$srange,$gsym,$map,$id,\@dbx,\%fnotes); # ^^ wrong order return ($featname,$gsym,$map,$srange,$id,\@dbx,\%fnotes); } sub CDS2Fasta { my($fhfasta,@cds)= @_; my $ncds= 0; my ($sym,$tr,$dbid) = parseCDS(@cds); if ($sym && $tr) { $dbid= "GBGENOME:?" unless($dbid); ## messup - need something # my $id= $sym2id{ symcase($sym) }; my $id= flybase::sym2id::symbol2id( $sym ); if (!$id) { print STDERR "readcsome: missing id for $sym\n"; } elsif (!$didid{$id}) { # $sym= $idsym{$id} if($idsym{$id}); # make sure is proper symbol $sym= flybase::sym2id::id2symbol($id) || $sym; print $fhfasta ">$id; $sym $dbid CDS\n$tr\n"; $didid{$id}++; $ncds++; } } return $ncds; } sub symcase { my($sym)= shift; ## return uc($sym); ## if genbank doesnt keep proper case! return $sym; } ## step thru CDS looking for /gene=$sym # CDS join(124392..124403,124468..125109) # /gene="Ccp84Ab" # /translation="MAFKFVFALAFVAVASAGYAPIAAPQVYHAAPAVATYAHAPVAV" ## getz '[GENBANK-id:AF035197]' -f ftd ## /gene="zyg-9" ## some of these CDS have no /gene, only 1 CDS - use it ! sub parseCDS { my(@cds)= @_; my ($gsym,$tr,$dbid) = ('','',''); my $intr=0; foreach (@cds) { if (m|/gene="([^"]+)|) { $gsym= $1; # print STDERR "/gene=$gsym\n" if $debug; } elsif (m|/protein_id="([^"]+)|) { $dbid= "PROTID:$1"; ## is this always best one? } elsif (m|/db_xref="([^"]+)|) { my $dbx= $1; $dbid= $dbx if ($dbx =~ /^GI:/ && $dbid =~ /^PID:/); $dbid= $dbx if ($dbx =~ /^PID:/ && $dbid eq ''); } elsif ( m|/translation="([^"]+)|) { $tr= $1; $intr=1 unless( m/\"$/ ); #" # print STDERR "/translation=$tr" if $debug; return ($gsym,$tr,$dbid) unless($intr); } elsif ($intr) { s/^\s+//; if (/\"/) { $intr= 0; s/\"//; } #" $tr .= $_; return ($gsym,$tr,$dbid) unless($intr); } } return ($gsym,$tr,$dbid); } sub isgoodmarker { local($_)= @_; my $nq= 0; my ($nf,$tag,$lat,$at,$subs,$ltag); study; $nq -= 20 if (/\nCLA\|Predicted/); while (/\n(\w+)\|/g) { $tag= $1; $at= pos; $nq += 10 if ($tag eq 'MAP'); if ($ltag =~ /CEL|FNC|PDOM|DBL|ENZ|HG/) { $subs= substr($_,$lat, $at-$lat); $nq += $subs =~ tr/\n/\n/; } $ltag= $tag; $lat= $at; } return $nq; } ## flybase::sym2id --- ## need to add idsize, domarker, isgoodmarker to sym2id ... ## #sub getSym2Id { # my ($org, $acode, $domarker)= @_; # %sym2id= (); # %idsym= (); # %idsize= (); # local(*ALIB); # open(ALIB,$acode); # $/= "# EOR\n"; # my $insyn; # while () { # my($id,$sym,$syn)= ('','',''); # $id= $1 if (/\nID\|(.+)/); # $id= 'MEOW:'.$id if ($isMeowId); # # $sym= $1 if (/\nSYM\|(.+)/); # next unless ($sym && $id); # if ($domarker) { # # $idsize{$id}= length($_); ## not so good for marker genes, try field specific # $idsize{$id}= isgoodmarker($_); # } # $sym2id{symcase($sym)}= $id; # $idsym{$id}= $sym; # # if (/\nSYN\|(.+)/) { # $syn= $1; # $sym2id{symcase($syn)}= $id; # my ($at, $e, $x); ## get any more SYN|s ... # $at= index($_,"\nSYN\|"); $e= $at+5; $at= -1; # while (($x= index($_,"\n", $e)) > 0) { # if (substr($_,$x+1,1) eq '|') { $e = $x+1; $at= $e if ($at<0); } # else { $e= $x; last; } # } # if ($at>=0 && $e>$at) { # my $s= substr($_, $at, $e-$at); # foreach $s (split(/\n/, $s)) { # $s =~ s/^\w*\|//; # # $s= uc($s); # $sym2id{symcase($s)}= $id; # } # } # } # # if ($org eq 'worm') { # ##?? and add DBL|WP:B0379.7 since some worm PRED gene syms dont match GB sym - ACEPRED:B0379.7a # if (/\nDBL\|WP:(.+)/) { # $sym2id{symcase($1)}= $id; # } # } # } # $/= "\n"; # close(ALIB); # return %sym2id; ## ?? also return %idsym #} sub isOldTarget( $$) { # usage: if (&isOldTarget( $sourcefile, $targetfile)) { blah; } local($source,$target) = @_; my $res= 0; if (! -f $target) { $res= 1; } else { my $targtime= -M $target; ## -M is file age in days.hrs before now $res= (-M $source) < $targtime; } return $res; } __END__ ./genomefeat.pl -DOfly01junr2 -W$gw -F$fd/na_arms.dros.RELEASE2.Z wrote 22097677 bases (368297 lines) to /c7/eugenes/genomew/fly01junr2/dna-2L.raw from /c7/tmp/flybase/bdgpseqs/na_arms.dros.RELEASE2.Z wrote 20220687 bases (337013 lines) to /c7/eugenes/genomew/fly01junr2/dna-2R.raw from /c7/tmp/flybase/bdgpseqs/na_arms.dros.RELEASE2.Z wrote 23190497 bases (386510 lines) to /c7/eugenes/genomew/fly01junr2/dna-3L.raw from /c7/tmp/flybase/bdgpseqs/na_arms.dros.RELEASE2.Z wrote 27778053 bases (462969 lines) to /c7/eugenes/genomew/fly01junr2/dna-3R.raw from /c7/tmp/flybase/bdgpseqs/na_arms.dros.RELEASE2.Z wrote 1155939 bases (19267 lines) to /c7/eugenes/genomew/fly01junr2/dna-4.raw fr om /c7/tmp/flybase/bdgpseqs/na_arms.dros.RELEASE2.Z wrote 7513406 bases (125225 lines) to /c7/eugenes/genomew/fly01junr2/dna-U.raw f rom /c7/tmp/flybase/bdgpseqs/na_arms.dros.RELEASE2.Z wrote 21666217 bases (361104 lines) to /c7/eugenes/genomew/fly01junr2/dna-X.raw from /c7/tmp/flybase/bdgpseqs/na_arms.dros.RELEASE2.Z -rw-r--r-- 1 gilbertd staff 22097677 Jun 23 20:40 dna-2L.raw -rw-r--r-- 1 gilbertd staff 20220687 Jun 23 20:40 dna-2R.raw -rw-r--r-- 1 gilbertd staff 23190497 Jun 23 20:40 dna-3L.raw -rw-r--r-- 1 gilbertd staff 27778053 Jun 23 20:40 dna-3R.raw -rw-r--r-- 1 gilbertd staff 1155939 Jun 23 20:40 dna-4.raw -rw-r--r-- 1 gilbertd staff 7513406 Jun 23 20:40 dna-U.raw -rw-r--r-- 1 gilbertd staff 21666217 Jun 23 20:40 dna-X.raw ## TEST testr( 'complement(join(1185..1445,1712..1898,2658..3132,3425..>3949))'); testr( '<2143150..>2143536'); testr( 'join(1081294..1081401,1088571..1089515)'); testr( 'complement(join(2149727..2150059,2150849..2151217,2151345..2151715,2153536..2153575,2155159..2155191,2156201..2156236,2156815..2156837,2157586..2158384))'); exit; sub testr { my ($srange)= @_; my $arange= scaffold2arm( $flyscaffolds{AE002681}, $srange); print "$srange =>\n $arange\n"; } /// #!/usr/local/bin/perl # split bdgp na_arms.dros (fasta per csome dna) split_na_arms(); sub split_na_arms { local(*F,*O, $outf, $csome, $ndna); open(F,"na_arms.dros") || die "can't open na_arms.dros"; while() { chomp(); if (/^>/) { if (/Chromosome_arm_(\S+)/) { $csome= $1; if ($outf) { close(O); my $fsize= (-s $outf); print STDERR "$outf size: $fsize, dnalen= $ndna\n"; } $outf= "dna-$csome.raw"; open(O,">$outf") || die "can't write $outf"; $ndna= 0; } } else { # s/\s+//g; ## no spaces # s/\d+//g; ## eat line numbers $ndna += length($_); print O $_ if ($_); } } close(F); close(O) if ($outf); }