#!/usr/local/bin/perl # mergeflyfeats.pl =head1 NOTES ## ? sep01 - need to check current FBgn.acode symbols and fix old gadb ones =cut #use strict; use POSIX; use Getopt::Std; # use flyfeatures; #? use flybase::sym2id; my $rel= 2; my $OUTH= *STDOUT; my $outpath= '/c7/eugenes/genomew/fly02jun/'; my $sorsapath= '/c7/eugenes/genomew/fly/'; my $org= 'fly'; my $doindex= 1; my $isMeowId= 0; my $orgacode= $sorsapath.'acode'; my $dogenemerge= 1; my $dosplitcytos= 1; my $dosum= 1; my %opt= (); Getopt::Std::getopts('Do:i:p:r:a:',\%opt); die "usage: $0 -o output-path [ $outpath ] -p path-to-sorsa-maps [ ${sorsapath} sorsa.txt ] -a path-to-FBgn.acode [ $orgacode ] -r gadfly-release-num [ $rel ] -i 0/1 -- index features [ $doindex ] -D -- debug " unless($opt{r}); my $debug= $opt{D}; $outpath= $opt{o} || $outpath; $rel= $opt{r} || $rel; $doindex= $opt{i} || $doindex; $sorsapath= $opt{p} || $sorsapath; $orgacode= $opt{a} || $orgacode; flybase::sym2id::set4meow($isMeowId); ## ! fix sym2id for ID2 2ndary ids -> prime id/sym flybase::sym2id::readSym2Id($orgacode, $org) if (-f $orgacode); readflysorsa("${sorsapath}sorsa.txt"); if ($rel == 2) { %fh= %feat2data; } else { %fh= %feat1data; } foreach $csome (@csomes) { @featlist= (); @doclist= (); @srclist= (); %allfeats= (); @sources= (); %cytogene= (); %seqgene= (); warn "merge $org-$csome ...\n" if $debug; foreach $fset (sort keys %fh) { warn "merge $fh{$fset}->{name}\n" if $debug; ($nread, $nkept)= addFeats( $fh{$fset}, $csome ); push( @srclist, '# source= '.$fh{$fset}->{name}.', '.$fh{$fset}->{date}."\n"); $fh{$fset}->{featsread}= $nread; $fh{$fset}->{featskept}= $nkept; warn " read=$nread , kept=$nkept\n" if $debug; } @featlist= sort _by_startAndGroup @featlist; my $outf= $outpath . 'features-' . $csome . '.tsv'; open(OH, ">$outf") || die "cant write $outf\n"; $OUTH= *OH; if (@doclist) { splice( @doclist, 2, 0, @srclist); print $OUTH @doclist; print $OUTH "\n"; } else { my $header= feattabheader($org,$csome,\%fh,join('',@srclist)); print $OUTH $header; } foreach my $ft (@featlist) { printFeat( $OUTH, $org, '', @$ft ); } close(OH); # indexFeatures - write features-.tsv.ranges here ? - no, need .tsv file to get byte indices if ($dosum) { my $fs = featsum($org,$csome); ## global @sources %allfeats %allnotes %fh my $outf= $outpath . 'features-' . $csome . '.summary'; if (open(OH, ">$outf")) { print OH $fs; close(OH); warn " Summary in $outf\n" if $debug; } else { print STDERR $fs; } ##? } } indexfeatdir($outpath) if ($doindex); #---------------------- ### add this if missing # fly/features-2L.tsv # Features for fly from BDGP/Celera/FlyBase Release 2 data [ gadfly2, Jul 10 2001] # source: FlyBase cytologic features, 20010924 # source: BDGP/Celera/FlyBase Release 2 gadfly2 genes & transcripts, 20010710 # source: BDGP/Celera/FlyBase Release 2 gadfly2 misc. features, 20010710 # gnomap-version 1 # # tab-separated-values # Feature gene map range id db_xref notes sub feattabheader { my ($org, $csome, $rfth, $moresrc)= @_; # my @tm= localtime( $^T - 24*60*60*(-M $sourcefile) ); # my $date= POSIX::strftime("%d-%B-%Y",@tm); # $sourcefile =~ s/^($genbankgenomes|$ncbigenomes)//; my %fh= %$rfth; my $gnomapvers= '1'; my ($date,$sourceName,$sourcefile); foreach $fset (sort keys %fh) { $sourceName= $fh{$fset}->{title} if $fh{$fset}->{title}; $sourcefile .= $fh{$fset}->{file} if $fh{$fset}->{file}; my $adate= $fh{$fset}->{date}; $date= $adate if ($adate>$date); } my $fthead = <{name}."\n"; $fs .= "date\t".$fh{$fset}->{date}."\n"; $fs .= "read\t".$fh{$fset}->{featsread}."\n"; $fs .= "kept\t".$fh{$fset}->{featskept}."\n"; } return $fs; } sub addFeats { my( $pathref, $csome )= @_; my ($nf, $nd); local(*D,*F,*O); my $file= $pathref->{path} . $csome . '.tsv'; unless (open(F,$file)) { warn "cant read $file\n"; return; } my @keepset= @ {$pathref->{keep}}; my @dropset= @ {$pathref->{drop}}; while () { if (/^\w/) { chomp(); $nf++; my @feats= split(/\t/); my $ok= 1; if (scalar(@keepset)) { $ok= 0; map { $ok=1 if( $_ eq $feats[0]) } @keepset; } if ($ok && scalar(@dropset)) { map { $ok=0 if( $_ eq $feats[0]) } @dropset; } next unless $ok; $feats[2] =~ s/\s+//g; # squish '3B4-5; 3C1-2' if ($feats[0] =~ /^cyto/ && $feats[3] !~ /\d/) { ## oct01 - separate cytofeats at ; make dup recs w/ each base loc my $cmap= $feats[2]; if ($dosplitcytos && $cmap =~ m/;/) { my @mp= split(/;/, $cmap); foreach my $i (0..$#mp) { # my $sr= getMap2Bases( $mp, $csome); my ($sb, $se)= getMap2Bases( $mp[$i], $csome); next if ($sb == $kMissingValue); ## dont dup if $sb near or == last sb?? my @ft= @feats; #? perl dups # my $mpx= $cmap; $mpx =~ s/\;?$mp\;?//; $mpx = "$mp;$mpx"; # my $mpx= join(";", ($mp[$i], @mp[0 .. ($i-1)], @mp[($i+1) .. $#mp]) ); # "$mp;$mpx"; my $mpx= join(";", @mp[$i,0..$i-1,$i+1..$#mp] ); # "$mp;$mpx"; $ft[3]= "$sb..$se"; $ft[2]= $mpx; push( @featlist, \@ft); $nd++; } @feats= (); } else { my ($sb, $se)= getMap2Bases( $cmap, $csome); if ($sb == $kMissingValue) { @feats= (); } else { $feats[3]= "$sb..$se"; } } } elsif ($feats[0] =~ /^gene/ && $feats[2] !~ /\d/) { my ($astart, $astop)= maxrange( $feats[3]); $feats[2]= getCytolocFromSeqloc( $csome, $astart, $astop); } elsif ($feats[0] =~ /^tRNA/ && $feats[1] =~ /tRNAscan/) { $feats[1]= '-'; # should be fixed in cset, feature extractor } ## merge cytogene and seq gene more - match ids, assume cytogene sym/map are correct if ($dogenemerge && $feats[0] =~ /gene/) { if ($feats[4] =~ /(FBgn\d+)/) { my $gid= $1; if ($feats[0] =~ /^cyto/) { $cytogene{$gid}= \@feats; } else { $seqgene{$gid}= \@feats; } } } if (scalar(@feats)) { push( @featlist, \@feats); $nd++; } } else { if (/^#/ && ! $pathref->{skipdocs}) { push (@doclist, $_); } } } close(F); return ($nf, $nd); } ## need better sort for cytoxxx which ignores ranges outside of csome arm ## fix in getMap2Bases sub _by_startAndGroup { # jun01 added cytomap as [2] ! my($afeat,$bfeat,$arange,$brange,$astart,$bstart,$agroup,$bgroup); $afeat= $a->[0]; $arange= $a->[3]; ## $astart= ($arange =~ m/^\D*([-]?\d+)/) ? $1 : $kMaxValue; #?? $kMissingValue $astart= ($arange =~ m/([-]?\d+)/) ? $1 : $kMaxValue; #?? $kMissingValue if ($afeat eq 'source') { $agroup= ' '; $astart= $kMinValue if ($astart == 1); } else { $agroup= $a->[1]; $agroup= "xxx" if (!$agroup || $agroup eq '-'); } if ($afeat =~ /gene/) { $seqstart{$agroup}= $astart; $arange= 1; } elsif ($afeat =~ /CDS|mRNA/) { $astart= $seqstart{$agroup} || $astart; $arange= 2; } else { $arange= 3; } $bfeat= $b->[0]; $brange= $b->[3]; $bstart= ($brange =~ m/([-]?\d+)/) ? $1 : $kMaxValue; #?? $kMissingValue if ($bfeat eq 'source') { $bgroup= ' '; $bstart= $kMinValue if ($bstart == 1); } else { $bgroup= $b->[1]; $bgroup= "xxx" if (!$bgroup || $bgroup eq '-'); } if ($bfeat =~ /gene/) { $seqstart{$bgroup}= $bstart; $brange= 1; } elsif ($bfeat =~ /CDS|mRNA/) { $bstart= $seqstart{$bgroup} || $bstart; $brange= 2; } else { $brange= 3; } # Larry Wall sez: (0 cmp/<=> fall down to next 'or' case) return ($astart <=> $bstart) or ($agroup cmp $bgroup) or ($arange <=> $brange); # $byr= ( $astart <=> $bstart ); # if ($byr == 0) { # $byr= ( $agroup cmp $bgroup ); # if ($byr == 0) { $byr = ($arange <=> $brange); } # } # return $byr; } sub printFeat { my ($fh,$org,$scaf, $featname,$gsym,$map,$srange,$id,$rdbx,$rfnotes)= @_; # my $isscaf= 0; # if ($scaf) { ## && $org eq 'fly' # my $arange= scaffold2arm($scaf,$srange); # ## print STDERR "$srange => $arange\n" if $debug; # $srange= $arange; # if ($featname eq 'source') { # ##! and need to build new 'source' from scaffolds # $featname= 'scaffold'; # $gsym= $scaf->{name}; # $isscaf= 1; # } # } if ($org eq 'fly') { # if ($map !~ /^\d/ && $chromosome && $cytomap ne $chromosome) { # readFlyCytomap($chromosome); # $cytomap= $chromosome; # } ## leave in for debugging sort ! $debug && if ( $featname =~ /^cyto/) { $srange= '-'; } # dont print it, may be bad range like -9999999 my $fbid; if ($id =~ /(FBgn\d+)/) { $fbid= $1; } else { my @dbx; if (ref($rdbx) =~ /ARRAY/) { @dbx= @$rdbx; } else { @dbx= ($rdbx eq '-') ? () : split(/,/,$rdbx); } foreach (@dbx) { $fbid= $1 if (/(FBgn\d+)/); } } if ($fbid) { if ($dogenemerge) { if ($featname =~ /^gene/ && $cytogene{$fbid}) { my @cft= @{ $cytogene{$fbid} }; # trust cytogene data >> seqgene data if ($cft[1] && $cft[1] ne '-') { ## note- these cytogene syms can be WRONG - from cytodb - SYMCHANGE2 catches warn "SYMCHANGE1: id=$fbid seqsym=$gsym cytosym=$cft[1]\n" if ($gsym ne $cft[1] && $debug); $gsym= $cft[1]; } if ($cft[2] && $cft[2] ne '-') { $map= $cft[2]; } } elsif ($featname =~ /^cytogene/ && $seqgene{$fbid}) { return; } } ## use use flybase::sym2id -- from acode not from fly features !? my $fsym= flybase::sym2id::id2symbol($fbid); if ($fsym && $fsym ne $gsym) { warn "SYMCHANGE2: id=$fbid seqsym=$gsym fbsym=$fsym\n" if $debug; $gsym= $fsym if ($fsym); } # # my $fsym= flyfeatures::getSym($fbid); $gsym= $fsym if ($fsym); # if ($map !~ /^\d/) { # my $cmap= flyfeatures::getCytomap($fbid); # $map= $cmap if ($cmap); # } } } # unless($gsym) { # if ($featname =~ /^source/) { $gsym= $org; } # else { $gsym= '-'; } # } # unless($map) { # if ($featname =~ /^(source|scaffold)/ && $csomeFile) { $map= $csomeFile; } # else { $map= '-'; } # } # unless ($id) { # $id= $sym2id{ symcase($gsym) }; # } # $gsym= $idsym{$id} if($idsym{$id}); # make sure is proper symbol if (ref($rdbx) =~ /ARRAY/) { $rdbx= join(',',@$rdbx); } $allfeats{$featname}++; my $ftline= "$featname\t$gsym\t$map\t$srange\t$id\t$rdbx\t$rfnotes"; print $fh $ftline; push(@sources, $ftline) if ($featname eq 'source'); # foreach (sort @{$rdbx}) { print $fh "$_,";} # if ($wantnotes) { # print $fh "\t"; # foreach (sort keys %{$rfnotes}) { # print $fh "$_,"; # $allnotes{$_} += ${$rfnotes}{$_}; # } # } print $fh "\n"; # push(@newsource,$line) if ($isscaf); } # # from flyfeatures.pm !!! fixme # need initFlydata($flydatapath); # !! Copy back to flyfeatures.pm - used by genomefeat.pl indexer sub readflysorsa { my ($sorsa)= @_; local(*F,*O,$n); return if (scalar(%cytobases)); %cytobases= (); @sorsalist= (); # %cytoarms= ( 'X' => '1A1', '2L' => '21A1', '2R' => '41A1', # '3L' => '61A1', '3R' => '81F1', '4' => '101A1' ); # %flycsomebands = ( 'X' => [ '1A1','20F4'], '2L' => ['21A1','40F7'], '2R' => ['41A1','60F5'], '3L' => ['61A1','80F9'], '3R' => ['81F1','100F5'], '4' => ['101A1','102F8'], ); unless (open(F,$sorsa)) { warn "Can't read $sorsa" ; return; } warn "reading $sorsa\n" if $debug; while () { next unless(/^\d/); chomp(); my($cyto,$bb,$be)= split(); ### /\t/ ## (' ') was, now \t separated! $cytobases{$cyto}= [ $bb, $be ]; push( @sorsalist, $cyto); # sorted } close(F); } sub getCytolocFromSeqloc { my ($arm, $bstart, $bend)= @_; # $chr my ($cstart, $cend); my $ca= $flycsomebands{$arm}->[0]; my $ina= 0; foreach $cb (@sorsalist) { if ($cb eq $ca) { $ina= 1; } if ($ina) { my ($bb, $be)= @ {$cytobases{$cb}}; if ($bstart >= $bb && $bstart <= $be) { $cstart= $cb; } if ($bend >= $bb && $bend <= $be) { $cend= $cb; last; } } } if ($cstart && !$cend) { $cend= $cstart; } elsif ($cend && !$cstart) { $cstart= $cend; } if (wantarray) { return ($cstart, $cend); } else { return ($cstart eq $cend) ? $cstart : "$cstart--$cend"; } } sub maxrange { my( $range)= @_; my ($pre, $suf,$start,$stop, $b, $u); $start= $kMissingValue; $stop= $start; $range =~ s/^([^\d<>-]*)//; $pre= $1; $range =~ s/(\D*)$//; $suf= $1; if ($range =~ m/^([<>]*)([\d-]+)/) { $u= $1; $start= $2; $start-- if ($u eq '<'); } if ($range =~ m/([<>]*)([\d-]+)$/) { $u= $1; $stop= $2; $stop++ if ($u eq '>'); } return ($start,$stop); } sub getCytorange { my($ca,$cx)= @_; $ca =~ s/^\s*//; $ca =~ s/[\s+-]*$//; return () unless ($ca =~ /^\d/); #? don't have conversion info for hXXX my $offs= 0; ## need patch for 1Lt; 1Rt; 1Cen? h, ... if ($ca !~ /[A-G]/) { if ($cx) { $cx =~ s/\d*$//; $ca= $cx . $ca; } else { $ca .= 'A1'; $offs= -1; } } elsif ($ca !~ /\d$/) { $ca .= '1'; $offs= -1; } my ($start1,$stop1)= @ {$cytobases{$ca}}; # split(/\t/, $cytobases{$ca}); return ($start1,$stop1,$ca); # +$offs } # sub getMap2Bases { return flyCytomap2Bases(@_); } ## ignore ranges outside of csome arm ## same as sub flyCytomap2Bases() sub getMap2Bases { my ($map, $arm)= @_; my($stop,$start)= ($kMissingValue,$kMissingValue); my $carm= $flycsomebands{$arm}->[0]; # $cytoarms{$arm}; my $darm= ($carm =~ m/^(\d+)/) ? $1 : 0; my ($armb, $x)= @ {$cytobases{$carm}}; my $carme= $flycsomebands{$arm}->[1]; my $darme= ($carme =~ m/^(\d+)/) ? $1 : 0; my ($y, $arme)= @ {$cytobases{$carme}}; $map =~ s/\s+//g; foreach my $mp (split(/;/, $map)) { next if ($mp eq '*'); next if ($mp =~ /^h/); # cant handle these yet my($ca, $cb)= split(/-/, $mp); my $da= ($ca =~ m/^(\d+)/) ? $1 : 0; next if ($da < $darm || $da > $darme); my($start1,$stop1,$bstart,$bstop); ($start1,$stop1,$ca)= getCytorange($ca); # next unless($stop1 >= $armb && $start1 <= $arme); if ($cb) { ($bstart,$bstop,$cb)= getCytorange($cb,$ca); $stop1= $bstop if ($bstop); } ##? skip/ignore if both $ca,$cb are outside of $arm ? next if ($stop1 < $armb || $start1 > $arme); $start= $start1 if ($start==$kMissingValue && $start1 >= $armb && $start1 <= $arme); $stop = $stop1 if ($stop1 >= $armb && $stop1 <= $arme); } # $start= $armb if ($start==$kMissingValue && $stop != $kMissingValue); $stop = $start if ($stop==$kMissingValue); ## was = $arme ##?? need band range here, for e.g. '41' => 41A1-41F29 # debug - getting missing when should get real range !?? return (wantarray) ? ($start, $stop) : "$start..$stop"; } ## indexing parts from genomefeat.pl - fixed for changed sorsa.txt, other flyfeat parts ## ## index features*.tsv by ID field for lookup by id ## sub indexfeatdir { my $dir= shift; local(*D); opendir(D, $dir) || warn "can't open $dir"; my @files= grep( /^features-\w+\.tsv$/, readdir(D)); closedir(D); local(*IMAP,*IMAPX); open(IMAP,">$dir/idmap.tsv"); open(IMAPX,">$dir/idmap.tsv.idx"); foreach my $file (sort @files) { my $sfile= $dir . $file; $csome= $1 if ($file =~ /^features-(\w+)/); ## warn "indexing chr-$csome, $sfile\n" if $debug; print &indexFeatures( $sfile, 'index', $csome); print &indexIds( $sfile, 'idindex', $csome, *IMAP, *IMAPX); } close(IMAP); close(IMAPX); } sub indexIds { my($file, $kind, $csome, $idmapf, $idmapx)= @_; local(*F,*O,*IDMAP,*IDMAPX,$n, $nl); $kind= 'idindex' unless($kind); my $idx= $file . ".idx"; die "Can't read $file" unless (open(F,$file)); die "Can't write $idx" unless (open(O,">$idx")); my $recsize = length(pack("LL", 1, 500)); ## a constant -- 2 long integers my %didid= (); my $at= 0; while () { my $e= tell(F); if (/^\w/) { $nl++; chomp(); my ($class,$sym,$map,$range,$ids,$dbx)= split(/\t/); $ids= '' if ($class =~ /cytowalk/); $ids= '' unless($class =~ /gene|tRNA/ || $ids =~ /gn\d+/); ## FIXME - need another index method to mix FBgn/FBan/FBxx if ($ids =~ s/([^:]+)://) { my $db= $1; ## skip not-our-id ids # $ids= '' if ($db =~ m/PID|SGD/i); $ids= '' unless ($db =~ m/FlyBase|MEOW|euGenes/i); } if ($ids =~ m/[A-Za-z]*(\d+)/) { my $idnum= $1; if (!$didid{$ids}) { ## {$idnum}) { $didid{$ids}++; ## {$idnum}++; my $size= $e - $at; my $record= pack("LL", $at, $size); my $idloc = $idnum * $recsize; seek(O, $idloc, 0); ## check if already did id == e.g., several feats have same ID, pick 1st? always print O $record; $n++; # ?? also write to single $org id-map.tsv ? and do .idx for it? if ($idmapf && $range && $range ne '-') { my($start, $stop)= maxrange($range); my $idat= tell($idmapf); ## my $id= $ids; $id =~ s/[^:]+://; print $idmapf "$ids\t$csome\t$start..$stop\n"; $e= tell($idmapf); $size= $e - $idat; $record= pack("LL", $idat, $size); seek($idmapx, $idloc, 0); print $idmapx $record; } } } } $at = $e; } close(O); close(F); return "indexIds $file = $n / $nl\n"; } ## ## index features*.tsv by base range ## sub indexFeatures { my($file, $kind, $csome)= @_; local(*F,*O,$n); $kind= 'index' unless($kind); ##return if ($kind ne 'index'); my $idx= $file . ".ranges"; die "Can't read $file" unless (open(F,$file)); die "Can't write $idx" unless (open(O,">$idx")); print O "# $idx\n"; print O "# base range -> file index, and source, scaffold ranges\n"; print O "# tab-separated-values of: \n"; print O "# base-start | file-index OR class-name | file-index | location\n"; my $bindex= 0; ##? off by one? was 0; my $nextstep= -666; my $stepsize= 100000; while () { # my $blength= length($_); if (/^\w/) { chomp(); my ($class,$sym,$map,$range,$ids,$dbx)= split(/\t/); my ($start,$stop)= ($kMissingValue, $kMissingValue); if ($range eq '-') { # ($start, $stop)= flyfeatures::flyCytomap2Bases($map) if ($org =~ /fly/); ($start, $stop)= getMap2Bases( $map, $csome) ; # if ($org =~ /fly/); } else { ($start, $stop)= maxrange($range); } if ($start!=$kMissingValue && $stop!=$kMissingValue) { if ($class eq 'source') { @csomerange= ($start, $stop); $stepsize= int( ($stop - $start) / 100 ); } if ($class eq 'source' || $class eq 'scaffold') { print O "$class\t$bindex\t$range\n"; } elsif ($nextstep == -666 || $start >= $nextstep) { $nextstep= $start if ($nextstep == -666); $nextstep= 0 unless($nextstep); ## dang perl print O "$nextstep\t$bindex\n"; $nextstep += $stepsize; } $n++; } } # $bindex += $blength; $bindex= tell(F); ##? off by one ?? } print O "$csomerange[1]\t$bindex\n"; close(F); close(O); return "indexFeatures $file = $n\n"; } BEGIN { @csomes = qw( X 2L 2R 3L 3R 4 ); %armnum= ( 'X' => 1, '2L' => 2, '2R' => 3, '3L' => 4, '3R' => 5, '4' => 6); $kMissingValue= -99999999; $kMaxValue= 999999999; $kMinValue= $kMissingValue+1; #? /c7/eugenes/genomew/fly/cyto/ %feat1data = ( aset => { path => '/c7/eugenes/genomew/fly/features-', # keep => [ ], drop => [ qw/mRNA gene cytoclone/ ], name => 'FlyBase cytologic features', # & BDGP/Celera/FlyBase Release 1 xml-chunks # date => '20000902', date => '20010924', #'20010620', }, bset => { path => '/export/home/gilbertd/fbl/egad/01jun/gad1/features-gn-', keep => [ qw/ mRNA CDS gene/ ], # drop => [], name => 'BDGP/Celera/FlyBase Release 1 gadfly1 genes & transcripts', date => '20010611', }, cset => { path => '/export/home/gilbertd/fbl/egad/01jun/feat1/features-', keep => [ qw/segment/ ], # drop => [], name => 'BDGP/Celera/FlyBase Release 1 gadfly1 misc. features', date => '20010611', skipdocs => 1, } ); #? /c7/eugenes/genomew/fly/cyto/ %feat2data = ( aset => { path => '/c7/eugenes/genomew/fly/features-cyto-', keep => [ qw/cytoabs cytodef cytodef cytodup cytogene cytoins cytoinv cytotloc cytowalk/ ], # drop source - should be in cset # drop => [ qw/source mRNA gene cytoclone gap cDNA p_insertion repeat tRNA/ ], name => 'FlyBase cytologic features', title => 'FlyBase cytologic and sequence rel. 2', file => 'cytodb ', date => '20020615', #'20010620', skipdocs => 1, # skipdoc => [ # '# Features for fly', # '# gnomap-version', # '# tab-separated-values', # "# Feature\t", # ], }, bset => { # path => '/export/home/gilbertd/fbl/egad/01sep/gad2/features-gn-', path => '/c7/eugenes/genomew/fly/features-gd-', # keep => [ qw/ mRNA CDS gene/ ], drop => [ qw/blastn blastx alignment HSP/, 'result set' ], name => 'BDGP/Celera/FlyBase Release 2 (GadFly) features', title => 'FlyBase cytologic and sequence rel. 2', file => 'gadfly-gff ', # date => '20010611', date => '20020308', }, #cset => { # # path => '/export/home/gilbertd/fbl/egad/01jun/feat2/features-', # path => '/export/home/gilbertd/fbl/egad/01sep/feat2/features-', # # keep => [ ], # drop => [ qw/gene/ ], # name => 'BDGP/Celera/FlyBase Release 2 gadfly2 misc. features', # # date => '20010611', # date => '20010710', # }, ); } __END__ #| name | path | date | #+---------+-------------------------------------------------+--------------+ #| gadfly1 | /c7/tmp/flybase/fbl/01jun/gadfly1-lite-20010611 | Jun 17 23:58 | #| gadfly2 | /c7/tmp/flybase/fbl/01jun/gadfly2-lite-20010611 | Jun 11 18:17 | #--------- =begin ## sort/cut docs for this output ## drop aset 'Features for' was # fly/features-2R.tsv # Features for fly from BDGP/Celera/FlyBase Release1 data [ 2R.chunk9950000-10000000.xml, 22-June-2001] # gnomap-version 1 # # tab-separated-values # Feature gene map range id db_xref notes # source: FlyBase cytologic features, 20010620 # source: BDGP/Celera/FlyBase Release 2 gadfly2 genes & transcripts, 20010611 # fly/features-2R.tsv # Features for fly from BDGP/Celera/FlyBase Release 2 data [ gadfly2, Jun 11 2001] # gnomap-version 1 # # tab-separated-values # Feature gene map range id db_xref notes # source: BDGP/Celera/FlyBase Release 2 gadfly2 misc. features, 20010611 to: # fly/features-2R.tsv # Features for fly from BDGP/Celera/FlyBase Release 2 data [ gadfly2, Jun 11 2001] # gnomap-version 1 # source: FlyBase cytologic features, 20010620 # source: BDGP/Celera/FlyBase Release 2 gadfly2 genes & transcripts, 20010611 # source: BDGP/Celera/FlyBase Release 2 gadfly2 misc. features, 20010611 # # tab-separated-values # Feature gene map range id db_xref notes =cut @doc = split(/\n/,<){next unless(/^\w/); ($n,$x)=split(/\t/); $h{$n}++;} \ foreach (sort keys %h) {print "$_\t$h{$_}\n";}' end features-2L.tsv cDNA 1998 cytoabs 422 cytoclone 1317 cytodef 626 cytodup 159 cytogene 794 cytoins 1736 cytoinv 561 cytotloc 731 cytowalk 89 gap 114 gene 2391 mRNA 2511 p_insertion 646 repeat 586 source 1 features-3L.tsv cDNA 2160 cytoabs 130 cytoclone 1268 cytodef 371 cytodup 78 cytogene 845 cytoins 1061 cytoinv 412 cytotloc 578 cytowalk 55 gap 206 gene 2596 mRNA 2737 p_insertion 508 repeat 5578 source 1 tRNA 50 features-4.tsv cDNA 106 cytoabs 3 cytoclone 27 cytodef 16 cytodup 1 cytogene 41 cytoins 26 cytoinv 1 cytotloc 188 cytowalk 4 gap 32 gene 84 mRNA 96 repeat 586 source 1 features-2R.tsv cDNA 2408 cytoabs 141 cytoclone 1361 cytodef 522 cytodup 83 cytogene 883 cytoins 1836 cytoinv 533 cytotloc 597 cytowalk 60 gap 117 gene 2631 mRNA 2772 p_insertion 823 repeat 5550 source 1 tRNA 96 features-3R.tsv cDNA 2745 cytoabs 140 cytoclone 1610 cytodef 647 cytodup 185 cytogene 1169 cytoins 1430 cytoinv 551 cytotloc 761 cytowalk 114 gap 188 gene 3363 mRNA 3538 p_insertion 716 repeat 6526 source 1 tRNA 81 features-X.tsv cDNA 1822 cytoabs 1014 cytoclone 1298 cytodef 1353 cytodup 166 cytogene 1627 cytoins 1484 cytoinv 464 cytotloc 822 cytowalk 106 gap 654 gene 2293 mRNA 2403 p_insertion 357 repeat 11366 source 1 tRNA 26 # my $cmap= '1A1;30B5-31E1;44A;88F1-88A1'; # print "source: $cmap\n"; # my @mp= split(/;/, $cmap); # my $n= $#mp; # foreach my $i (0..$n) { # # my @mpx= ($mp[$i]); # push(@mpx,@mp[0 .. ($i-1)]) if ($i>0); # push(@mpx,@mp[($i+1) .. $n]) if ($i<$n); # $mpx= join(";", @mpx); # print "$i a) $mpx\n"; # # $mpx= join(";", (@mp[$i], @mp[0 .. ($i-1)], @mp[($i+1) .. $n]) ); # "$mp;$mpx"; # print "$i b) $mpx\n"; # $mpx= join(";", @mp[$i,0..$i-1,$i+1..$n] ); # "$mp;$mpx"; # print "$i c> $mpx\n"; # } # exit; #__END__