#!/usr/local/bin/perl # parseflyxml.pl # parse fly xml sequence features files ## need to combine other fly features -> genomefeat.pl & flyfeatures.pl =head1 NOTES can't trust the fb gene sym/id's here -> need to go to FBgn.acode to match against synonym, ID2 ! jun01 -- add scaffold table to use xml now provided by bdgp only as scaffolds to chromosomes ... apr01 -- off-by-1 err - ranges from xml data are likely 0-based, while gnomap/SeqRange assumes 1-based -- check other org csome start# FIXME #2 -- end values are all +1 (after end base), due to celera counting system >>>> this code was built around the data we received from celera. the celera coordinate system is: start - the coordinate of the first base of the feature on the source sequence, counting from zero end - the coordinate *one beyond* the last base of the feature on the source sequence. i.e. the end coordinate is non-inclusive. <<<<< sep00 -- add 2 new features: Unknown values feature_span=translate offset 15111 ##? what is this result_span=cdna 72409 call: set gw=/c7/eugenes/genomew ./parseflyxml.pl -DA$gw/fly01jun/acode -o$gw/fly01jun/ -x >& log.flyx & =cut # $debug= 1; my $gr2maphome= 'MacHome:bio:flybase:fbjava:fbgrmap2:'; ## $macHome= 1; push(@ARGV,"-W:","-oflyxml2.tsv","${gr2maphome}csomes:xml-chunks:"); use Getopt::Std; use POSIX; use flybase::sym2id; my $org= 'fly'; my $sourceName= 'BDGP/Celera/FlyBase Release1 data'; # fixme for rel.2 ## paths for iubio/kalo my $genomeshome= '/c7/eugenes/genomew/'; # '/c5/tmp/genomew/'; my $meowpub= '/bio/meow-pub/meow/server/'; #'/c3/iubio/meow-pub/meow/server/'; my $mework= '/c7/eugenes/genomes/'; # '/c5/tmp/meow/work/genomes/'; my $flyxmlchunks= '/c7/eugenes/genomew/flyoth/xml-chunks/'; #'/c5/tmp/genomew/fly/xml-chunks/'; my $flyxmlscafs= '/c7/eugenes/genomew/flyoth/01feb/RELEASE1/xml/'; # $gnomappath= '/c5/tmp/meow/work/genomes/flyx/'; my $sorsapath= '/c7/eugenes/genomew/fly/'; # my $sorsa= '/c7/eugenes/genomew/fly/sorsa.txt'; & scaffolds.txt my $orgacode= "$meowpub/$org/acode"; my $isMeowId= 1; ## for getSym2Id - acode data my @arms= ( 'X', '2L', '2R', '3L', '3R', '4' ); my $nchr= scalar(@arms); my %armlengths= (); my %armsh= map{ $armlengths{$_}= 0; $_ => 1; } @arms; my $chr= ''; my %scaffoldsh= (); my %cytobases= (); my @sorsalist= (); my %cytoarms=(); my $isscaff= 0; my $forGnomap= 1; my $gnomapvers= '1'; my $kMissingValue= -999999999; my $activescaf= ''; my $scafoffset= 0; my %opt=(); Getopt::Std::getopts('o:p:sxW:DA:',\%opt); $mework= $opt{W} || $genomeshome; ##!!!! test, fix $outpath= "$mework/$org/"; $outpath= $gr2maphome if ($macHome); $outpath= $opt{o} || $outpath; $sorsapath= $opt{p} || $sorsapath; $debug= 1 if $opt{D}; $orgacode= $opt{A} if $opt{A}; push(@ARGV, $flyxmlchunks) if $opt{x} ; push(@ARGV, $flyxmlscafs) if $opt{s} ; # }} unless (scalar(@ARGV)) { die "$0 [-o output] [-Aacode] [-p mappath] [-x|-s] fly-xml-paths-or-files Usage: -D == debug -o output path == [$outpath] -A acode == fly.acode file [$orgacode] -p path-to-sorsa-and-scaffold-maps == [${sorsapath}sorsa.txt, ${sorsapath}scaffolds.txt] -s default xml scafs [$flyxmlscafs] -x default xml chunks [$flyxmlchunks] "; } # %sym2id= getSym2Id($org, $orgacode) if (-f $orgacode); flybase::sym2id::set4meow($isMeowId); flybase::sym2id::readSym2Id($orgacode, $org) if (-f $orgacode); readflysorsa("${sorsapath}sorsa.txt"); readscaffoldmap("${sorsapath}scaffolds.txt"); ## $OUTH= \*STDOUT; foreach $infile (@ARGV) { $achr= 'null1'; if (-d $infile) { $dir= $infile; local(*D); opendir(D, $dir) || warn "can't opendir $infile"; my @files= grep( /\.xml$/, readdir(D)); closedir(D); foreach $fn (sort @files) { if ($fn =~ /^(AE\d+)/) { $isscaff= 1; $ascaf= $1; } #? join csomes after or here? elsif ($fn =~ /^([X234RL]+)\./) { $isscaff= 0; $achr= $1; } if (!$isscaff && $achr ne $lastchr) { dumpFeatures( $lastchr, $lastinfile, $outpath); $lastchr= $achr; } warn "parse $fn\n" if ($isscaff && $debug); parseXML("$dir$fn"); # should read $chr $lastinfile= $fn; } } else { dumpFeatures($lastchr, $lastinfile, $outpath) if ($achr ne $lastchr); parseXML($infile); ## reads $chr always? $lastchr= $achr; $lastinfile= $infile; } } dumpFeatures($lastchr, $lastinfile, $outpath); if (1) { print "Counts:\n N gene=$ngene\n N mrna=$nmrna\n N annot=$nannot\n N span=$nspan\n"; print "Unknown values\n"; foreach (sort keys %unknowns) { print "$_\t$unknowns{$_}\n"; } } exit; #--------------------- sub readflysorsa { my ($sorsa)= @_; local(*F,*O,$n); return if (scalar(%cytobases)); %cytobases= (); @sorsalist= (); %cytoarms= ( 'X' => '1A1', '2L' => '21A1', '2R' => '41A1', '3L' => '61A1', '3R' => '81F1', '4' => '101A1' ); unless (open(F,$sorsa)) { warn "Can't read $sorsa" ; return; } warn "reading $sorsa\n" if $debug; while () { next unless(/^\d/); chomp(); my($cyto,$bb,$be)= split(); ### /\t/ ## (' ') was, now \t separated! $cytobases{$cyto}= [ $bb, $be ]; push( @sorsalist, $cyto); # sorted } close(F); } sub getcytoloc { my ($arm, $bstart, $bend)= @_; # $chr my ($cstart, $cend); my $ca= $cytoarms{$arm}; my $ina= 0; foreach $cb (@sorsalist) { if ($cb eq $ca) { $ina= 1; } if ($ina) { my ($bb, $be)= @ {$cytobases{$cb}}; if ($bstart >= $bb && $bstart <= $be) { $cstart= $cb; } if ($bend >= $bb && $bend <= $be) { $cend= $cb; last; } } } if ($cstart && !$cend) { $cend= $cstart; } elsif ($cend && !$cstart) { $cstart= $cend; } if (wantarray) { return ($cstart, $cend); } else { return ($cstart eq $cend) ? $cstart : "$cstart--$cend"; } } sub readscaffoldmap { my ($mapfile)= @_; local(*F); unless(open(F,$mapfile)) { warn "cant read $mapfile"; return; } warn "reading $mapfile\n" if $debug; %scaffoldsh= (); while() { next unless(/^\w/); chomp(); ($name,$arm,$start,$end)= split(); if ($arm && ($end > $armlengths{$arm})) { $armlengths{$arm}= $end; } $start--; $end--; # save effort when adding $scaffoldsh{$name}= { name => $name, arm => $arm, start => $start, end => $end }; } close(F); } sub dumpFeatures { my( $chr, $infile, $outpath) = @_; return unless(scalar(%fts)); unless ($chr) { $chr= $infile; } $fout= "${outpath}features-$chr.tsv"; print STDERR "dumpFeatures $fout\n" if ($debug); my ($ok, $append); if (-r $fout && $isscaff) { # dang, may need to append w/ scaffold data $ok= open(FOUT,">>$fout"); $append= 1; warn "appending scaffold - need to sort $fout\n"; } else { $ok= open(FOUT,">$fout"); } unless ($ok) { warn "cant write $fout"; return -1; } $OUTH= *FOUT; unless($append) { print $OUTH &feattabheader("$org/features-$chr.tsv",$org,$infile); my $clen= ($chrsize) ? $chrsize : 0; print $OUTH "source\t$org\tChr $chr\t1..$clen\t-\n"; } ## pack together CDS for same gene, exon/intron sets per gene %seqstart= (); @fts= sort _by_startAndGroup values %fts; ## @fts; undef %fts; while ($gf = shift(@fts)) { # print STDERR "sym: ".$gf->[1]."\t range: ".$gf->[2]."\n" if ($debug); @ft= @ $gf; printpart( @ft ); } close(FOUT); } sub _by_startAndGroup { # jun01 added cytomap as [2] ! $arange= $a->[3]; $astart= $1 if ($arange =~ /^\D*(\d+)/); $afeat= $a->[0]; $agroup= $a->[1] || "xxx"; if ($afeat =~ /gene/) { $seqstart{$agroup}= $astart; $arange= 1; } elsif ($afeat =~ /CDS|mRNA/) { $astart= $seqstart{$agroup} || $astart; $arange= 2; } else { $arange= 3; } $brange= $b->[3]; $bstart= $1 if ($brange =~ /^\D*(\d+)/); $bfeat= $b->[0]; $bgroup= $b->[1] || "xxx"; if ($bfeat =~ /gene/) { $seqstart{$bgroup}= $bstart; $brange= 1; } elsif ($bfeat =~ /CDS|mRNA/) { $bstart= $seqstart{$bgroup} || $bstart; $brange= 2; } else { $brange= 3; } $byr= ( $astart <=> $bstart ); if ($byr == 0) { $byr= ( $agroup cmp $bgroup ); if ($byr == 0) { $byr = ($arange <=> $brange); } } return $byr; } sub feattabheader { my ($fname,$org,$sourcefile)= @_; my @tm= localtime( $^T - 24*60*60*(-M $sourcefile) ); my $date= POSIX::strftime("%d-%B-%Y",@tm); $sourcefile =~ s|^$genomeshome||; return <) { if (m/<([^>]+)>/) { dotag($1); } elsif (m/^\s*(.+)\s*$/) { dotext($1); } } close(F); } sub dotag { local $_= shift; if (/^\!/) { # comment - pull date? } elsif (m|^/(\w+)|) { # end tag $tag= $1; $blev= scalar(@tags); $btag= pop(@tags); if ($btag ne $tag) { print STDERR "level $blev; pop $btag ne $tag\n"; } if ($tag eq 'annotation' && ($getgene || $getpred)) { ## ($debug || $gsym || $fbid || $fban) $npred++ if $getpred; ## are there some genes w/o sym or id ? ## need some fix for 89 CG genes that don't show up - changed real gene assignment? $range= makerange( \@ranges, $strand, 1); my $cytomap= ''; my($astart,$astop)= maxrange($range); $cytomap= getcytoloc( $chr, $astart, $astop); my @ft= ('gene',$gsym, $cytomap,$range,$fbid,"$dbxref,$fban",'-'); ## push( @fts, \@ft); #? $fh= join("\t",@ft); $fts{$fh}= \@ft unless($fts{$fh}); } elsif ($tag eq 'feature_set' && ($debug || $ctsym || $fban)) { ## are there some mRNA w/o sym or id ? $range= makerange( \@ranges, $strand, 0); # but keep @ranges for gene? my @ft= ('mRNA',$ctsym,'-',$range,$fban,$fbid||$dbxref,'-'); ## push( @fts, \@ft); #? $fh= join("\t",@ft); $fts{$fh}= \@ft unless($fts{$fh}); } elsif ($tag eq 'result_set' && $misfeat && scalar(@ranges)>0) { $range= makerange( \@ranges, $strand, 0); my @ft= ($misfeat,$missym||'-','-',$range,'-','-','-'); ## push( @fts, \@ft); #? $fh= join("\t",@ft); $fts{$fh}= \@ft unless($fts{$fh}); $getcdna= 0; } elsif ($tag =~ /feature_span|result_span/ && $getrange && $rstop) { # $rstart && unless($getcdna>1) { # if ($rstart>$rstop) { $rstop++; #FIXME#2 # $rng= "$rstop..$rstart"; $strand= -1; # } # else { $rstart++; #FIXME#2 # $rng= "$rstart..$rstop"; $strand= 0; # } $rng= makeFeatureLocation( $rstart, $rstop, $scafoffset); push(@ranges, $rng) ; ## all will be same $strand } $getcdna= 0; } elsif ($getcdna > 0 && $tag eq 'seq_relationship' && $rstop) { # $getrange && $rstart && # result_span type="subject" is getting here $getcdna++; if ($getrange) { # if ($rstart>$rstop) { $rstop++; # FIXME#2 - note rstop is always one beyond, even when rstart>rstop # $rng= "$rstop..$rstart"; $strand= -1; } # else { $rstart++; #FIXME#2 # $rng= "$rstart..$rstop"; $strand= 0; } $rng= makeFeatureLocation( $rstart, $rstop, $scafoffset); push(@ranges, $rng); } } } elsif (s/^(\w+)//) { ## pull params from tag line - store in text{} ? $tag= $1; ##? screen out/in some tags here, or at dotext point? push(@tags, $tag); $lev= scalar(@tags); %parm= (); while (m/\s*(\w+)="([^"]+)"/g) { #" ## my $key= $1; my $val= $2; ## $parm{"$tag.$lev"}{$key}= $val; $parm{$1}= $2; } if ($tag eq 'annotation') { $dbxref= 'FlyBase:' . $parm{id}; ## FBan always ? # @ft= ($feat,$gsym,$range,$fbid,$dbxref,$note); @ranges= undef; $fbid= ''; $gsym= ''; $fban= ''; $ctsym= ''; $getgene= $getpred= 0; $nannot++; } elsif ($tag eq 'gene') { $ngene++; $getgene= 1; } elsif ($tag eq 'result_set') { $misfeat= ''; $missym= ''; $getcdna= 0; @ranges= undef; } elsif ($getcdna > 0 && $tag eq 'seq_relationship') { if ($parm{type} eq 'query') { $getrange= 1; } # result_span type="subject" is getting here else { $getrange= 0; } ## $rstart= ''; $rstop= ''; } elsif ($tag eq 'feature_set') { $ctsym= ''; # $fban= ''; # can be >1 CT per gene my $ctid= $parm{id}; $nmrna++; if ($ctid =~ /(FBan\d+)/) { $fban .= ',' if ($fban); $fban .= 'FlyBase:'.$1; } elsif ($ctid =~ /(CT\w+)/) { $fban .= ',' if ($fban); $fban .= 'GadFly:'.$1; } } elsif ($tag =~ /feature_span|result_span/ ) { $getrange= 0; $rstart= ''; $rstop= ''; $nspan++; } elsif ($tag eq 'seq' && $lev == 2) { # # # -- ignore these # - ditto - LP01188.3prime my $seqid= $parm{id}; ## jun01 - now parsing also scaffolds my $seqlen= $parm{length}; # $scafoffset= 0; $activescaf= ''; # $chr= 'null'; # $chrsize= 0; if ($seqid =~ m/\.(5prime|3prime|complete)/) { } elsif ($armsh{$seqid}) { $chr= $seqid; $chrsize= $seqlen; $scafoffset= 0; $activescaf= ''; } elsif ($scaffoldsh{$seqid}) { my $arm= $scaffoldsh{$seqid}->{arm}; #?? dumpFeatures if $chr ne $lastchr if ($arm ne $lastchr) { dumpFeatures( $lastchr, $lastinfile, $outpath) ; $lastchr= $arm; } $chr= $arm; $chrsize= $armlengths{$chr}; $activescaf= $scaffoldsh{$seqid}; $scafoffset= $activescaf->{start}; warn "got scaffold $seqid, len=$seqlen, arm=$chr armofs=$scafoffset armlen=$chrsize\n" if $debug; } else { $chr= $seqid; #?? $chrsize= $seqlen; #?? $scafoffset= 0; $activescaf= ''; warn "Cant locate arm for $seqid\n"; } } undef %parm; } else { print STDERR "Bad tag? '$_'\n"; } } sub dotext { local $_= shift; ## $text{"$tag.$lev"} .= "$_\n"; if ($tag eq 'name' && $tags[$#tags-1] eq 'gene') { ## there are cases here where $gsym is bogus ! not in flybase sym/syn set ## need to use CGxxx or FBsym instead $gsym1= $_; $feat= 'gene'; my $id= flybase::sym2id::symbol2id($gsym1); ### or '-'? ## $gsym= $gsym1 if ($id); # $gsym= $idsym{$id} if ($id); # which is best? $gsym= flybase::sym2id::id2symbol($id) if ($id); # which is best? ## need fix for gadfly's english of greek: sym err: FB: 'CG17566' ; GadFly: 'gammaTub37C' if ($debug && $gsym ne $gsym1) { print STDERR "$xmlfile $cgsym / $id sym err: FB: '$gsym' ; GadFly: '$gsym1'\n"; } ## ? set $fbid here? } elsif ($tag eq 'name' && $tags[$#tags-1] eq 'annotation' && $_ =~ /^CG/) { $gsym= $cgsym= $_; ## get here before tag, if it exists $feat= 'gene'; $getpred= 1; } elsif ($tag eq 'name' && $tags[$#tags-1] eq 'feature_set') { $ctsym= $_; $feat= 'mRNA'; ## 'transcript'; } elsif ($tag eq 'start' && $tags[$#tags-1] eq 'span') { $rstart= $_; # FIXME #1, see above for change, depends on +/- strand; note this is 0-based origin } elsif ($tag eq 'end' && $tags[$#tags-1] eq 'span') { $rstop= $_ ; ## - 1, FIXME #2 see above change; for celera odd coord system - but rstop < rstart for -strand } elsif ($tag eq 'db_xref_id' && $tags[$#tags-2] eq 'gene' ## && $text{"xref_db.$lev"} =~ /flybase/i && ($_ =~ /FB/) ) { if ($isMeowId) { $fbid= 'MEOW:'.$_; } else { $fbid= 'FlyBase:'.$_; } } elsif ($tag eq 'type' && $tags[$#tags-1] eq 'feature_span') { ##? start range if ($_ eq 'exon') { $getrange= 1; } else { $unknowns{"feature_span=$_"}++; } } # if ($tag eq 'name' && $prog =~ /Repeat/i && $tags[$#tags-1] eq 'result_set') { # $misfeat= $_; # $misfeat= lc($1) if ($misfeat =~ m/#(.+)/); # } elsif ($tag eq 'name' && $prog =~ /^(sim|blastn)/i && $tags[$#tags-1] eq 'result_set') { $missym= $_; # cdna names: SD01823.5prime-hit, LD22117.complete-hit # p_insertions: EP(3)0935 l(3)j2B10 ->FBti0004920:P{lacW}14-3-3ej2B10 EP(3)3112 -> FBti0008027:P{EP}EP3112 # # p_insertion name is BFD| field in FBti.acode - link~ /.bin/asksrs?[fbti-bfd:$name] or [fbti-syn:$name] ##? also collect name of repeat kind? (TA)n#Simple_repeat , (CA)n#Simple_repeat } elsif ($tag eq 'type' && $tags[$#tags-1] eq 'result_span') { if ($_ =~ m/REPEAT|GAP|INSERTION/ ) { $getrange= 1; $misfeat= lc($_); $misfeat =~ s/^p element /p_/; ## insertions have names like 'EP(3)3350' - want data link ! } elsif ($_ eq 'tRNA' ) { $getrange= 1; $misfeat= $_; } elsif ($_ eq 'cdna' ) { $getrange= 1; $getcdna= 1; $misfeat= 'cDNA'; #? $_; ## getrange only for: } else { $unknowns{"result_span=$_"}++; } } elsif ($tag eq 'program' && $tags[$#tags-1] eq 'computational_analysis') { $prog= $_; } } sub makeFeatureLocation { my($bb,$be, $offset)= @_; if ($bb > $be) { $be += 1; # fix weird celera coord. numbers; see HasSeqRange::coords_origin1() $strand= -1; my $ee= $bb; $bb= $be; $be= $ee; } else { $bb += 1; $strand= 0; } # $scaffoldsh{$name}= { name => $name, arm => $arm, start => $start, end => $end }; if ($offset) { $bb += $offset; $be += $offset; } return (wantarray) ? ($bb, $be, $strand) : "$bb..$be"; } sub makerange { my ($aref, $strand, $domax)= @_; my @ra= @$aref; my $rng; #? need to sort ranges numerically? - YES, for $strand<0 at least # given order of spans is from start of gene/mRNA to end, even if from top to bottom? if ($strand<0) { for (my $i= $#ra; $i>=0; $i--) { $_= $ra[$i]; next unless($_); ##? if ($rng) { $rng .= ",$_"; } else { $rng= $_; } } } else { foreach (@ra) { next unless($_); ##? if ($rng) { $rng .= ",$_"; } else { $rng= $_; } } } if ($domax) { my($start,$stop)= maxrange($rng); $rng= "$start..$stop"; } $rng= 'join('.$rng.')' if ($rng =~ /,/); $rng= 'complement('.$rng.')' if ($strand<0); ##? return $rng; } sub maxrange { my( $range)= @_; my ($pre, $suf,$start,$stop, $b, $u); $start= $kMissingValue; $stop= $start; $range =~ s/^([^\d<>-]*)//; $pre= $1; $range =~ s/(\D*)$//; $suf= $1; if ($range =~ m/^([<>]*)([\d-]+)/) { $u= $1; $start= $2; $start-- if ($u eq '<'); } if ($range =~ m/([<>]*)([\d-]+)$/) { $u= $1; $stop= $2; $stop++ if ($u eq '>'); } return ($start,$stop); } # ## from genomefeat.pl - move to some package/pm # %updown= ( # up => '[', '/up' => ']', # down => '[[', '/down' => ']]', # ); # # %greektrans =( # agr => 'alpha', Agr => 'Alpha', # bgr => 'beta', Bgr => 'Beta', # ggr => 'gamma', Ggr => 'Gamma', # dgr => 'delta', Dgr => 'Delta', # egr => 'epsilon', Egr => 'Epsilon', # zgr => 'zeta', Zgr => 'Zeta', # eegr => 'eta', EEgr => 'Eta', # thgr => 'theta', THgr => 'Theta', # igr => 'iota', Igr => 'Iota', # kgr => 'kappa', Kgr => 'Kappa', # lgr => 'lambda', Lgr => 'Lambda', # mgr => 'mu', Mgr => 'Mu', # ngr => 'nu', Ngr => 'Nu', # xgr => 'xi', Xgr => 'Xi', # ogr => 'omicron', Ogr => 'Omicron', # pgr => 'pi', Pgr => 'Pi', # rgr => 'rho', Rgr => 'Rho', # sgr => 'sigma', Sgr => 'Sigma', # sfgr => 's', # tgr => 'tau', Tgr => 'Tau', # ugr => 'upsilon', Ugr => 'Upsilon', # phgr => 'phi', PHgr => 'Phi', # khgr => 'chi', KHgr => 'Chi', # psgr => 'psi', PSgr => 'Psi', # ohgr => 'omega', OHgr => 'Omega', # ); # # # sub symcase { # return shift; # ## my($sym)= shift; # ## return uc($sym); ## if data doesn't keep proper case! # ## return $sym; # } # # # sub greek2en { # my($sym)=@_; # while ( $sym =~ m|&([A-Za-z][A-Za-z]?gr);| ) { # my $so= $1; # my $sn= $greektrans{$so} || $so; # $sym =~ s|&$so;|$sn|g; # } # while ( $sym =~ m|\<(/?[a-z]+)\>|) { # $tg= $1; # $tt= $updown{$tg} || $tg; # $sym =~ s|<$tg>|$tt|g; # } # return $sym; # } # # sub getSym2Id { # my ($org, $acode, $domarker)= @_; # %sym2id= (); # %idsym= (); # %idsize= (); # local(*ALIB); # open(ALIB,$acode); # $/= "# EOR\n"; # my $insyn; # while () { # my($id,$sym,$syn)= ('','',''); # $id= $1 if (/\nID\|(.+)/); # $id= 'MEOW:'.$id if ($isMeowId); # # $sym= $1 if (/\nSYM\|(.+)/); # next unless ($sym && $id); # # if ($domarker) { # # $idsize{$id}= isgoodmarker($_); # # } # $sym2id{symcase($sym)}= $id; # $idsym{$id}= $sym; # $sym2id{symcase(greek2en($sym))}= $id if ($sym =~ /&| 0) { # if (substr($_,$x+1,1) eq '|') { $e = $x+1; $at= $e if ($at<0); } # else { $e= $x; last; } # } # if ($at>=0 && $e>$at) { # my $s= substr($_, $at, $e-$at); # foreach $s (split(/\n/, $s)) { # $s =~ s/^\w*\|//; # $sym2id{symcase($s)}= $id; # $sym2id{symcase(greek2en($s))}= $id if ($s =~ /&| 2L chromosome arm 2L_12X_3p /// perl -e 'while(<>){next unless(/^\w/);($f,$x)=split(/\t/);$h{$f}++;}\ foreach(sort keys %h){print "$_\t$h{$_}\n";} ' features-*.tsv fly features: cytoabs 1845 cytoclone 6881 cytodef 3163 cytodup 672 cytogene 5806 cytoins 9220 cytoinv 2518 cytotloc 3680 cytowalk 428 gap 1311 gene 13277 mRNA 14979 p_insertion 3208 pelement insertion 1 <MacHome:defaults1.cc"); while () { ## $in =~ s/\"\n([^\`]+)\`\"/X_X_X/) { # # " if (s/,\s*\"\n/,\n\"/){ #" $st=1; $nc++; } elsif (s/^\`\"/\"/) { #" $st=0; } elsif ($st) { s/\n/\\\n/; } print OUT $_; } close(IN); close(OUT); print "nc= $nc\n"; exit; #---- open(IN,"MacHome:defaults.cc"); $\=""; $in=; close(IN); ## $\="\n"; open(OUT,">MacHome:defaults1.cc"); while ($in =~ s/\"\n([^\`]+)\`\"/X_X_X/) { # # " $t= $1; $nc++; $t =~ s/\n/\\\n/g; $in =~ s/X_X_X/\n"$t"/; } print OUT $in; close(OUT); print "nc= $nc\n"; exit; #-------------- scaffold xml: start/end releative to this scaffold only... need to do math to get csome-based numbers AE003585 E003585|Drosophila melanogaster genomic scaffold 142000013386046 section 11 of 16, complete sequence.|AE003585.1 GI:7296031 ... CG14350 .. translate offset <<< start/end releative to this scaffold only 290519 290516 ..