#!/usr/local/bin/perl # parse worm gff features =head NOTES use GFF is slow and mem. piggy - avoid? may02 - revised to drop GFF.pm cut unwanted from worm.gff: -- may later want some of these egrep -v 'assembly_tag|BLAST|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch' x.gff > n.gff keep or drop LINK, SUPERLINK ? what are these ? - 2-4 per csome.gff: CHROMOSOME_I.gff: Sequence 2137 6396950 Sequence "SUPERLINK_RW1" ? new gff feats to strip mar01: curated Sequence | provisional Sequence | Genomic_canonical ? join long lists of repeats that cluster at same range ? - likely a computation exess kind ==? repeat_region (already a few of these in worm.gff) or repeat_tandem_region, repeat_inverted_region, repeat_family_region jul01 FIXME: experimental - - 173894..174992 - - RNAi "JA:F56C11.6" ^^ should be RNAi structural - - 180299..181965 - - PCR_product "sjj_F56C11.5" ^^ should be PCR_product ? jul01 new oat% wc worm/features-I.tsv 56407 484930 3790824 worm/features-I.tsv mar01 old: oat% wc ../genomes/worm/features-I.tsv 22483 175322 1992298 ../genomes/worm/features-I.tsv ^^^^^ is big diff due to junk of some kind? perl -e 'while (<>){next unless(/\t/); @f=split(/\t/); $a{"$f[1]_$f[2]"}++;} foreach(sort keys %a){print "$_\t$a{$_}\n"; }' CHROMOSOME_V.gff CHROMOSOME_I.gff:##sequence-region CHROMOSOME_I 1 14435323 CHROMOSOME_II.gff:##sequence-region CHROMOSOME_II 1 15205541 CHROMOSOME_III.gff:##sequence-region CHROMOSOME_III 1 13093964 CHROMOSOME_IV.gff:##sequence-region CHROMOSOME_IV 1 17047437 CHROMOSOME_V.gff:##sequence-region CHROMOSOME_V 1 21085422 CHROMOSOME_X.gff:##sequence-region CHROMOSOME_X 1 17408926 /c7/eugenes/worm/current_release/CHROMOSOMES/ ... .dna == .fasta CHROMOSOME_I.dna.gz CHROMOSOME_IV.dna.gz chrom_dump.log.Z CHROMOSOME_I.gff.gz CHROMOSOME_IV.gff.gz composition.all.Z CHROMOSOME_II.dna.gz CHROMOSOME_V.dna.gz dna_dump.log.Z CHROMOSOME_II.gff.gz CHROMOSOME_V.gff.gz gff_dump.log.Z CHROMOSOME_III.dna.gz CHROMOSOME_X.dna.gz totals.Z CHROMOSOME_III.gff.gz CHROMOSOME_X.gff.gz dec01: drop these feats: WABA_briggsae_cosmid_similarity 254896 / 408525 (I) BLAT_EST_BEST_similarity 104070 / 408525 (I) BLATX_ 4000/ set epat=" -v 'qrxrqx|assembly_tag|BLAST|BLAT_|BLATX_|WABA_|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch'" gzegrep $epat $wc/*_I.gff.gz | readseq -pipe -inform=gff -f=fff -a -o $gw/worm/worm_I.fff Exception in thread "main" java.lang.OutOfMemoryError set rs='/usr/java1.3/bin/java -Xms80M -Xmx256M -cp /usr/local/bin/readseq.jar run' gzegrep $epat $wc/*_I.gff.gz | $rs -pipe -inform=gff -f=fff -a -o $gw/worm/worm_I.fff =cut # may02 - add wublastx.similarity $egreppatt= " -v 'qrxrqx|assembly_tag|wublastx|BLAST|BLAT_|BLATX_|waba_|WABA_|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch'"; $egrep= 'egrep '. $egreppatt; $gzgrep= 'gzegrep ' . $egreppatt; $debug= 0; $progname= $0; # 'parsewormgff2'; use Getopt::Std; use POSIX; use flybase::sym2id; ## use GFF; # use GFF::GeneFeature; # use GFF::GeneFeatureSet; # use GFF::Analysis; ## buggers ! ## paths for iubio/kalo $meowpub= '/c6/iubio/meow-pub/meow/server'; # /c3/iubio/meow-pub/meow/server/ $genomeshome= '/c7/eugenes/genomew/'; $wormchrdata= '/c7/eugenes/worm/current_release/CHROMOSOMES/'; $org= 'worm'; $orgacode= "$meowpub/$org/acode"; $isMeowId= 1; ## for getSym2Id - acode data $orgaw= "$genomeshome/$org/acode"; if (-r $orgaw && -M _ < -M $orgacode) { $orgacode= $orgaw; } $sourename= 'WormBase GFF data'; @chrs= qw( I II III IV V X ); $nchr= scalar(@chrs); ## FIXME this hardcode sizes are bad; get from CHROMOSOME_I.fasta... %chrsizes = ( I => 14435323, II => 15205541, III => 13093964, IV => 17047437, V => 21085422, X => 17408926, ); $FeatByChr= 1; $useGFFpm= 0; $forGnomap= 1; $gnomapvers= '1'; %opt=(); Getopt::Std::getopts('o:c:dxW:DA:',\%opt); $mework= $opt{W} || $genomeshome; ##!!!! test, fix $outpath= "$mework/$org/"; $outpath= $opt{o} || $outpath; $orgacode= $opt{A} if $opt{A}; $debug= 1 if $opt{D}; $usedefdata= 1 if $opt{x}; $nodna= 1 if $opt{d}; if ($opt{c}) { @chrset= split(/[,;:]/,$opt{c}); } else { @chrset= @chrs; } %chrset= map { $_ => 1; } @chrset; if($usedefdata) { opendir(D, $wormchrdata); @fs= readdir(D); closedir(D); @gffset= map { $wormchrdata.$_; } grep( /CHROMOSOME.+\.gff/, @fs); @dnaset= map { $wormchrdata.$_; } grep( /CHROMOSOME.+\.dna/, @fs); } else { @gffset= grep( /gff/, @ARGV); @dnaset= grep( /dna/, @ARGV); } $okay= scalar(@gffset); if ($okay) { foreach $f (@gffset) { $okay= 0 unless (-r $f); } } unless ($okay) { print "$progname [-o output] [-Aacode] [-x] worm-gff-files\n"; print " Usage: \n -D == debug\n -o output path == [ $outpath ]\n"; print " -d NO DNA output\n -A acode == worm.acode [ $orgacode ]\n "; print " -c I,II,.. == chrs to read [ ".join(",",@chrs)." ] \n"; print " -x use default data [ $wormchrdata/CHROMOSOME*gff ] \n"; print "Data chosen: ".join("\n",@gffset)."\n"; print "Dna chosen: ".join("\n",@dnaset)."\n"; exit; } unless($nodna) { foreach $f (@dnaset) { if ($f =~ /([IVX]+)\./) { $chr= $1; } else { warn "What chromosome is $f ?"; next; } $outf= "$outpath/dna-$chr.raw"; fasta2raw( $f, $outf, $chr) if (-r $f); ## set %chrsizes } } else { foreach $chr (@chrs) { $outf= "$outpath/dna-$chr.raw"; getChrSizes( $outf, $chr) if (-r $outf); } } $totalsize= 0; $autosize= 0; foreach (@chrs) { $totalsize += $chrsizes{$_}; $autosize += $chrsizes{$_} unless($_ eq 'X'); } if (!$useGFFpm) { flybase::sym2id::set4meow($isMeowId); flybase::sym2id::readSym2Id($orgacode, $org) if (-f $orgacode); foreach $infile (@gffset) { $chr= 'null'; $chr= $1 if ($infile =~ m/_([IVX]+)/i); next unless ($chrset{$chr}); #?? $outname= "$outpath/features-$chr.tsv"; my @tm= localtime( $^T - 24*60*60*(-M $infile) ); my $fdate= POSIX::strftime("%d-%B-%Y",@tm); open(OUTF,">$outname") or die "cant write $outname"; if ($infile =~ /\.(gz|Z)$/) { $cmd= $gzgrep; } else { $cmd= $egrep; } open(INF,"$cmd $infile |") or die "cant $cmd $infile"; parseGFFdirect("[$infile, $fdate]", *INF, *OUTF); close(INF); close(OUTF); system("touch -r $infile $outname"); } } if ($useGFFpm) { # %sym2id= getSym2Id($org, $orgacode); flybase::sym2id::set4meow($isMeowId); flybase::sym2id::readSym2Id($orgacode, $org) if (-f $orgacode); ## $outf= \*STDOUT; foreach $infile (@gffset) { $chr= 'null'; $chr= $1 if ($infile =~ m/_([IVX]+)/i); next unless ($chrset{$chr}); #?? # $chrsize= $chrsizes{$chr}; # BAD, get from gff or from dna $outname= "$outpath/features-$chr.tsv"; my @tm= localtime( $^T - 24*60*60*(-M $infile) ); my $fdate= POSIX::strftime("%d-%B-%Y",@tm); open(OUTF,">$outname") or die "cant write $outname"; if ($infile =~ /\.(gz|Z)$/) { $cmd= $gzgrep; } else { $cmd= $egrep; } open(INF,"$cmd $infile |") or die "cant $cmd $infile"; parseGFF("[$infile, $fdate]", *INF, *OUTF); close(INF); close(OUTF); system("touch -r $infile $outname"); } } if ($summarize) { summarizeChrGFF(); } exit; #------------------------------------- sub fasta2raw { my ($dnafile, $outfile, $chr)= @_; local(*F,*O, $ok); warn "fasta2raw $dnafile -> $outfile\n" if ($debug); if ($dnafile =~ /\.(gz|Z)$/) { $ok= open(F,"gzcat $dnafile|"); } else { $ok= open(F,$dnafile); } if ($ok) { $ok= open(O,">$outfile"); } unless($ok) { warn "Cant read $dnafile"; return; } $headline= ; # skip single fasta >title line unless($headline =~ /^>/) { warn "Bad fasta title: $headline"; } while () { chomp(); print O $_; } close(F); close(O); system("touch -r $dnafile $outfile"); $chrsizes{$chr}= -s $outfile; warn "chr $chr size= $chrsizes{$chr}\n" if $debug; } sub getChrSizes { my ($dnafile, $chr)= @_; if (-s $dnafile) { $chrsizes{$chr}= -s $dnafile; warn "chr $chr size= $chrsizes{$chr}\n" if $debug; } } sub summarizeChrGFF { while (<>) { # if (/sequence-region\s+CHROMOSOME_(\w+)\s+(\d+)\s+(\d+)/) { # $chr= $1; $b= $2; $c= 3; # $len= $e - $b + 1; # $chrsizes{$chr}= $len; # $totalsize += $len; # $a{Length} = $totalsize; # $ac{$chr}{Length}= $len; # } next if(/^#/); next unless(/\t/); @f= split(/\t/); $chr= $f[0]; $chr =~ s/CHROMOSOME_//i; $key= "$f[1]_$f[2]"; $a{$key}++; $ac{$chr}{$key}++; } ## end input.. $a{Length}= $totalsize; foreach $chr (@chrs) { $ac{$chr}{Length} = $chrsizes{$chr}; } @fldlabs= (sort keys %a); if ($FeatByChr) { ## features (row) x csomes (col) + chi values print "Feature\tObs.",join("\tObs.",@chrs); print "\tDev.(o-e)\tDev.",join("\tDev.",@chrs); print "\tDev.auto\tDev.X"; print "\tChi2(all)\tChi2(X-auto)"; print "\n"; foreach $key (@fldlabs) { print "$key"; $chisum= 0.0; $tv= 0; $autov= 0; foreach $c (@chrs) { %ch= %{ $ac{$c} }; $v= $ch{$key} || 0; $tv += $v; $autov += $v unless($c eq 'X'); print "\t$v"; } print "\td"; foreach $c (@chrs) { %ch= %{ $ac{$c} }; $v= $ch{$key} || 0; $chrsizefac = $chrsizes{$c} / $totalsize; $expect = $tv * $chrsizefac; # null hypothesis 1 - evenly distributed per base ## need also X vs autosome test H2, and maybe inter-autosome test? ## ? combine Chis over features, test of all? $dev = $v - $expect; $chi = ($expect == 0) ? 0 : (($dev * $dev) / $expect); $chisum += $chi; printf "\t%0.1f",$dev; if ($c eq 'X') { $chiX= $chi; $devX= $dev; } # else { } } # print "\te"; foreach $c (@chrs) { print "\t$chie{$c}"; } # print "\td"; foreach $c (@chrs) { print "\t$chid{$c}"; } $expect= $tv * $autosize / $totalsize; $dev= $autov - $expect; $chisum2= $chiX + (($dev * $dev) / $expect); printf "\t%0.1f",$dev; printf "\t%0.1f",$devX; printf "\t%0.1f",$chisum; # df = 5 printf "\t%0.1f",$chisum2; # df = 1 print "\n"; } print "\n"; } else { ## csomes (row) x features (col) print "CHR"; foreach $key (@fldlabs){ print "\t$key"; } print "\n"; foreach $c (@chrs) { $cv= $ac{$c}; %ch= % $cv; print $c; foreach $key (@fldlabs){ $v= $ch{$key} || 0; print "\t$v"; } print "\n"; } } } ## from bioperls1 - bulk_load_gff.pl sub parseGFFdirect { my($srcinfo, $inh, $outh) = @_; my $FID = 1; my $GID = 1; my $FTYPEID = 1; my $ATTRIBUTEID = 1; my %GROUPID = (); my %FTYPEID = (); my %ATTRIBUTEID = (); my %DONE = (); my $FEATURES = 0; my $header; my @feats= (); while (<$inh>) { ##gff-version 2 ##date 2000-03-01 ##sequence-region CHROMOSOME_II 1 15205541 if (/^\#/) { unless($FEATURES) { $header .= $_; if (/date\s+(\S+)/) { $datadate= $1; } if (/sequence-region\s+(\S+)\s+(\S+)\s+(\S+)/) { $chrname= $1; $chrstart= $2; $chrsize= $3; warn "source\t$org\tChr $chr\t$chrstart..$chrsize\t-\n" if ($debug); } } next; } elsif (!$FEATURES) { if ($forGnomap) { print $outh "# $org/features-$chr.tsv\n"; print $outh "# Features for $org from $sourename $srcinfo\n"; print $outh $header; print $outh "#\n"; print $outh "#gnomap-version $gnomapvers\n"; print $outh "#Feature\tgene\tmap\trange\tid\tdb_xref\tnotes\n"; print $outh "source\t$org\tChr $chr\t$chrstart..$chrsize\t-\n" if $chrsize; } else { print $outh $header; } } chomp; my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t"; $FEATURES++; my $range= "$start..$stop"; my $gsym= '-'; my $note; ## $ref == chromosome name, CHROMOSOME_X # handle group parsing $group =~ s/\\;/$;/g; # protect embedded semicolons in the group $group =~ s/( \"[^\"]*);([^\"]*\")/$1$;$2/g; my @groups = split(/\s*;\s*/,$group); foreach (@groups) { s/$;/;/g } my ($group_class,$group_name,$target_start,$target_stop,$attributes) = (); ## Bio::DB::GFF->_split_group(@groups); for (@groups) { my ($tag,$value) = /^(\S+)(?:\s+(.+))?/; if ($value =~ /^\"(.+)\"$/) { $value = $1; } $value =~ s/\\t/\t/g; $value =~ s/\\r/\r/g; if ($tag eq 'Note' or ($group_class && $group_name)) { $note= $value; } elsif ($tag eq 'Target' && /\"([^:\"]+):([^\"]+)\"/) { ($group_class,$group_name) = ($1,$2); ($target_start,$target_stop) = /(\d+) (\d+)/; } elsif (!$value) { $note= $tag; } else { ($group_class,$group_name) = ($tag,$value); } if ($tag =~ m/Sequence/) { $gsym= $value; } ## always Sequence/gene name? - no # else { $note= $group_class.$group_name; } ## save it allways? see below summary: Note, Allele, etc. } $gsym= $group_name if ($gsym eq '-'); my $fid = $FID++; my $gid = $GROUPID{$group_class,$group_name} ||= $GID++; my $ftypeid = $FTYPEID{$source,$method} ||= $FTYPEID++; my $da= [$method,$gsym,$start,$stop,$range,$strand,$source,$note]; ## tognomap1 needs this form: ## [$feat,$gsym,$gfstart,$gfend,$range,$compl,$source,$note]; ## method != feat always ## need to group exon/introns, others by groups = Sequence/Locus "name" push( @feats, $da); if ( $fid % 1000 == 0) { print STDERR "$fid features parsed..."; print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n"; } } %seqstart= (); @feats = sort feats_by_startAndGroup @feats ; foreach my $ft (@feats) { tognomap1($ft, $outh); # printGFF } endgnomap($outh); # printGFF } sub feats_by_startAndGroup { ## my ($ar, $br, $astart, $afeat, $agroup, $bstart, $bfeat, $r); ##? save mem w/o my? ## array is [$method,$gsym,$start,$stop,$range,$strand,$source,$note]; $astart= $a->[2]; $afeat= $a->[0]; $agroup= $a->[1]; ## $a[7] || "xxx"; if ($afeat =~ /Sequence/) { $seqstart{$agroup}= $astart; $ar= 1; } elsif ($afeat =~ /CDS|exon|intron/) { $ar= 2; } #bad? $astart= $seqstart{$agroup} || $astart; else { $ar= 3; } $bstart= $b->[2]; $bfeat= $b->[0]; $bgroup= $b->[1]; ##$b[7] || "xxx"; if ($bfeat =~ /Sequence/) { $seqstart{$bgroup}= $bstart; $br= 1; } elsif ($bfeat =~ /CDS|exon|intron/) { $br= 2; } # ? $bstart= $seqstart{$bgroup} || $bstart; else { $br= 3; } return ($astart <=> $bstart) or ($agroup cmp $bgroup) or ($ar <=> $br); } sub parseGFF { die "use of GFF.pm retired may02 ..."; } ## Methods using GFF.pm ## very slow... # #sub parseGFF { # my($srcinfo, $inh, $outh) = @_; # # ##open(F, $infile) || die "can't open $infile"; # ## =item read( \*INPUT, \&filter, \&converter ) ## use filter to exclude assembly_tag items, others ? # $gfs= new GFF::GeneFeatureSet; # $gfs->read_header($inh); #? get chr# from header comments? # $gfs->read($inh); ## ! big memory pig, and slowish - can we bypass use? # ## but need sorted out - use unix sort in 2nd pass? # # my @chrreg= $gfs->region(); # #$self->{'region'}->[0] = $seqname ; # #$self->{'region'}->[1] = $start ; # #$self->{'region'}->[2] = $end ; # $chrstart= $chrreg[1]; # $chrsize= $chrreg[2]; # warn "source\t$org\tChr $chr\t$chrstart..$chrsize\t-\n" if ($debug); # # if ($forGnomap) { # print $outh "# $org/features-$chr.tsv\n"; # print $outh "# Features for $org from $sourename $srcinfo\n"; # $gfs->dump_header($outh); ##? # print $outh "#\n"; # print $outh "#gnomap-version $gnomapvers\n"; # print $outh "#Feature\tgene\tmap\trange\tid\tdb_xref\tnotes\n"; # print $outh "source\t$org\tChr $chr\t$chrstart..$chrsize\t-\n" if $chrsize; # } # else { # $gfs->dump_header($outh); ##? # } # # # ## don't need this - same info as in current list ### print $outh "# GFF makeGenes list\n"; ### $gfgenes= GFF::Analysis::makeGenes($gfs); ## need this to make gene, mRNA join()s - TEST ### foreach $gf ( $gfgenes->order_gf() ) { ### ## tognomap($gf, $outh); ### $gf->dump($outh); ### last if ($ntest++ > 20); ### } ### print $outh "# GFF input list\n"; # # ## need to pack together CDS for same gene, exon/intron sets per gene # ## $gfs->order_gf() doesn't ensure Sequence/gene item is before its parts # ## doesn't keep all parts of gene together either ! # %seqstart= (); # my @sfeats = sort _by_startAndGroup @{$gfs->{'feature'}}; # foreach $gf (@ sfeats ) { # tognomap( $gf, $outh); # printGFF # } # endgnomap($outh); # printGFF # # # foreach $gf ( @{$gfs->{'feature'}} ) { } # # while ($gf= $gfs->nextGeneFeature()) { printGFF($gf, $outh); } # # ##close(F); #} # # # #sub _by_startAndGroup { # ## my ($ar, $br, $astart, $afeat, $agroup, $bstart, $bfeat, $r); ##? save mem w/o my? # $astart= $a->start; # $afeat= $a->feature(); # $agroup= $a->group() || "xxx"; # if ($afeat =~ /Sequence/) { $seqstart{$agroup}= $astart; $ar= 1; } # elsif ($afeat =~ /CDS|exon|intron/) { $astart= $seqstart{$agroup} || $astart; $ar= 2; } # else { $ar= 3; } # # $bstart= $b->start; # $bfeat= $b->feature(); # $bgroup= $b->group() || "xxx"; # if ($bfeat =~ /Sequence/) { $seqstart{$bgroup}= $bstart; $br= 1; } # elsif ($bfeat =~ /CDS|exon|intron/) { $bstart= $seqstart{$bgroup} || $bstart; $br= 2; } # else { $br= 3; } # # # Larry Wall sez: (0 cmp/<=> fall down to next 'or' case) # return ($astart <=> $bstart) # or ($agroup cmp $bgroup) # or ($ar <=> $br); ## $rsort= ( $astart <=> $bstart ); ## if ($rsort == 0) { ## $rsort= ( $agroup cmp $bgroup ); ## if ($rsort == 0) { $rsort = ($ar <=> $br); } ## } ## return $rsort; # #} # #sub gff2fff { # my($gf)= @_; # # my $gsym= '-'; # my $note= '-'; # my $dbxref= '-'; # # my $group= $gf->dump_group(); ## this is Note: for misc features - never gsym if Note !? # if ($group =~ m/Sequence\s+"([^"]+)/) { $gsym= $1; } ## always Sequence/gene name? - no # ## elsif ($group =~ m/Note\s+"([^"]+)/) { $note= $1; } # else { $note= $group; } ## save it allways? see below summary: Note, Allele, etc. # # my $gfstart= $gf->start()+0; # my $gfend= $gf->end()+0; # my $range= "$gfstart..$gfend"; # my $compl= $gf->strand(); # # # $gf->score(); # version 2 GFF can have '.' # # $gf->frame(); # # my $feat= $gf->feature(); # my $source= $gf->source(); ## curated, etc. # # return [$feat,$gsym,$gfstart,$gfend,$range,$compl,$source,$note]; #} sub endgnomap { ## EndOfData my($file)= @_; $OUTH= $file; ## hack - fix callees my $glist; foreach $ghkey (keys %ghash) { $glist= $gparts{$ghkey}; joinparts1($glist) if ($glist); delete $gparts{$ghkey}; delete $ghash{$ghkey}; } if (scalar(@tandlast)>5) { &printpart( @tandlast ); @tandlast= undef; } if (scalar(@invlast)>5) { &printpart( @invlast ); @invlast= undef; } if (scalar(@rfamlast)>5) { &printpart( @rfamlast ); @rfamlast= undef; } } #sub tognomap { # my($gf,$file)= @_; # # my $gsym= '-'; # my $note= '-'; # my $dbxref= '-'; # my $group= $gf->dump_group(); ## this is Note: for misc features - never gsym if Note !? # if ($group =~ m/Sequence\s+"([^"]+)/) { $gsym= $1; } ## always Sequence/gene name? - no # ## elsif ($group =~ m/Note\s+"([^"]+)/) { $note= $1; } # else { $note= $group; } ## save it allways? see below summary: Note, Allele, etc. # # ## my $chr1= $gf->seqname(); ## always CHROMOSOME_z when from chromo_z.gff # # my $gfstart= $gf->start()+0; # my $gfend= $gf->end()+0; # my $range= "$gfstart..$gfend"; # my $compl= $gf->strand(); # # # $gf->score(); # version 2 GFF can have '.' # # $gf->frame(); # # my $feat= $gf->feature(); # my $source= $gf->source(); ## curated, etc. # # tognomap1( [$feat,$gsym,$gfstart,$gfend,$range,$compl,$source,$note], $file); #} sub tognomap1 { my($refdata,$file)= @_; $OUTH= $file; ## hack - fix callees my $glist; my ($feat,$gsym,$gfstart,$gfend,$range,$compl,$source,$note)= @ $refdata; ## add/edit $feat w/ $source ## need various feature/flavor renames for gnomap data ## odd source/features: ## curated Sequence/intron/exon/CDS (most seq/genes), hand_built S/i/e/C ?, stl S/i/e/C ? ## provisional S/i/e/C ? ## hmmfs.3_similarity == repeat family 2 ## hmmfs_similarity == repeat fam. 1 ## hold all gene components and print if current $gsym != last $gsym ## Sequence/intron/exon/CDS foreach $ghkey (keys %ghash) { my $end= $ghash{$ghkey}; if ($gfstart > $end) { $glist= $gparts{$ghkey}; ## print STDERR "got glist for $ghkey: $glist\n" if $debug; if ($glist) { joinparts1($glist); delete $gparts{$ghkey}; delete $ghash{$ghkey}; } } } if (scalar(@tandlast)>5 && $tandlast[6] < $gfstart && $source !~ /tandem|inverted|hmmfs|scan/) { ## print STDERR "Dump tand before $gsym/$gfstart: ",join(',',@tandlast),"\n" if $debug; &printpart( @tandlast ); @tandlast= undef; } if (scalar(@invlast)>5 && $invlast[6] < $gfstart && $source !~ /tandem|inverted|hmmfs|scan/ ) { ## print STDERR "Dump inv before $gsym/$gfstart: ",join(',',@invlast),"\n" if $debug; &printpart( @invlast ); @invlast= undef; } if (scalar(@rfamlast)>5 && $rfamlast[6] < $gfstart && $source !~ /tandem|inverted|hmmfs|scan/ ) { ## print STDERR "Dump rfam before $gsym/$gfstart: ",join(',',@rfamlast),"\n" if $debug; &printpart( @rfamlast ); @rfamlast= undef; } if ($source =~ /tRNA/) { ## has CDS/intron/exon parts if ($feat =~ /Sequence/i) { $feat= 'tRNA'; } elsif ($feat =~ /intron|exon/) { return; } ##? keep only Sequence variant here? } # this caching of repeats leads to out-of-sort ordering - need to fix or resort table elsif ( $source =~ /tandem/ && $feat =~ /repeat|misc_feature/) { ## jul01 tandem repeat ; was tandem misc_feature $feat= 'tandem_repeat'; ## 'repeat_tandem'; ## ??need to condense overlapping items to repeat_tandem_region @tand= ($feat,$gsym,$range,$compl,$dbxref,$note,$gfend); if (scalar(@tandlast)<6) { @tandlast= @tand; } elsif ($tandlast[6]+500 < $gfstart) { &printpart( @tandlast); @tandlast= @tand; } else { ## $tandlast[0]= 'repeat_tandem_region'; $tandlast[6]= $tand[6]; $tandlast[2] =~ s/\.\.\d+/..$tand[6]/; } return; } elsif ($source =~ /inverted/ && $feat =~ /repeat|misc_feature/) { ## jul01 inverted repeat; was inverted_misc_feature $feat= 'inverted_repeat'; ## 'repeat_inverted'; ## ??need to condense overlapping items to repeat_inverted_region ## @inv= ($feat,$gsym,$range,$compl,$dbxref,$note,$gfend); if (scalar(@invlast)<6) { @invlast= ($feat,$gsym,$range,$compl,$dbxref,$note,$gfend); } elsif ($invlast[6]+500 < $gfstart) { ## dup of above?! &printpart( @invlast ); @invlast= ($feat,$gsym,$range,$compl,$dbxref,$note,$gfend); } else { ## $invlast[0]= 'repeat_inverted_region'; $invlast[6]= $gfend; $invlast[2] =~ s/\.\.\d+/..$gfend/; ## print STDERR "Inv region extended to $gfend: ",join(',',@invlast),"\n" if $debug; } return; } elsif ($source =~ /hmmfs|scan/ && $feat =~ /similarity/) { ## hmmfs_similarity $note= $source.'_'.$feat; $feat= 'repeat_family'; $dbxref= $note; $note= '-'; ## ??need to condense overlapping items to repeat_family_region @rfam= ($feat,$gsym,$range,$compl,$dbxref,$note,$gfend); if (scalar(@rfamlast)<6) { @rfamlast= @rfam; } elsif ($rfamlast[6]+500 < $gfstart) { &printpart( @rfamlast ); @rfamlast= @rfam; } else { ## $rfamlast[0]= 'repeat_family_region'; $rfamlast[6]= $rfam[6]; $rfamlast[2] =~ s/\.\.\d+/..$rfam[6]/; } return; } elsif ($feat =~ /similarity/) { $feat= $source.'_'.$feat; } elsif ($source =~ /Transposon/) { $feat= $source.'_'.$feat; } elsif ($feat eq 'structural') { $feat= $source; } # PCR_product so far - others? elsif ($feat =~ /experimental/) { $feat= $source; } # jul01 elsif ($feat eq 'ALLELE') { $feat= 'allele'; } # standard name is lc elsif ($feat eq 'OLIGO') { $feat= 'oligo'; } #? elsif ($feat =~ /^d+/) { return; } #? what are these odd '411144' features (1 per csome?) # cDNA_for_RNAi yk201d11 - 343219..348348 - - - # RNAi - - 343219..348348 - - RNAi "SA:yk201d11" #?? skip cDNA_for_RNAi if ($feat =~ /^Sequence/i) { return if ($gsym =~ m/CHROMOSOME/i) ; # return if ($gsym =~ m/LINK/) ; ## ? SUPERLINK_RWXL & LINK_B0272 =~ fly scaffolds ? if ($gsym =~ m/LINK/) { $feat= lc($feat); ## keep these or not? } else { $feat= 'gene'; $ghash{$gsym}= $gfend; } } elsif ($feat =~ /CDS|intron|exon/) { $ftab= "$feat\t$gsym\t$range\t$compl\n"; $gparts{$gsym} .= $ftab; ##? is $gsym valid here? # $cparts{$feat}++; ## debug check # warn "part: $ftab" if $debug; return; } printpart($feat,$gsym,$range,$compl,$dbxref,$note); } sub join2 { my $a= shift; my $b= shift; return "$a,$b" if ($a && $b); return $b if ($b); return $a; } sub joinparts1 { my $tnlist= shift; @join1flist= split(/\n/,$tnlist); my ($ft, $feat,$gsym,$range,$compl, $ref); ## $feat == intron, exon and CDS here - drop intron my %feats= (); foreach $ft (@join1flist) { ## ($feat,$gsym,$range,$compl)= @{$ref}; ($feat,$gsym,$range,$compl)= split(/\t/,$ft); next if ($feat eq '' || $feat =~ /intron/); # was getting only introns !? # warn "join: $feat,$gsym,$range,$compl\n" if ($debug && $debugnj++ < 20); $feat =~ s/exon/mRNA/; $feats{$feat}= join2( $feats{$feat}, $range); } # warn "joinparts1: ",join(' ',sort keys %feats),"\n" if ($debug && $debugnj < 20); foreach $ft (sort keys %feats) { $range= $feats{$ft}; $range= 'join('.$range.')' if ($range =~ /,/); printpart( $ft, $gsym, $range, $compl); } } sub joinparts0 { my ($feat,$gsym,$range,$compl, $ref); ## $feat == intron, exon and CDS here - drop intron my %feats= (); while ($ref= shift @_) { ($feat,$gsym,$range,$compl)= @{$ref}; warn "join: $feat,$gsym,$range,$compl\n" if ($debug && $debugnj++ < 20); ## ^ got nulls here ! next if ($feat eq '' || $feat =~ /intron/); $feat =~ s/exon/mRNA/; $feats{$feat}= join2( $feats{$feat}, $range); } foreach my $ft (sort keys %feats) { $range= $feats{$ft}; $range= 'join('.$range.')' if ($range =~ /,/); printpart( $ft, $gsym, $range, $compl); } } sub printpart { my ($feat,$gsym,$range,$compl,$dbxref,$note)= @_; # my ($ref) = @_; # my ($feat,$gsym,$range,$compl)= @$ref; $range= 'complement('.$range.')' if ($compl eq '-'); my $id= '-'; unless ($gsym eq '-') { # $id= $sym2id{ symcase($gsym) } || '-'; # $gsym= $idsym{$id} || $gsym; # make sure is proper symbol $id= flybase::sym2id::symbol2id($gsym) || '-'; $gsym= flybase::sym2id::id2symbol($id) || $gsym; # make sure is proper symbol } print $OUTH "$feat\t$gsym\t-\t$range\t$id\t$dbxref\t$note\n"; } sub symcase { my($sym)= shift; ## return uc($sym); ## if data doesn't keep proper case! return $sym; } sub printGFF { my($gf,$file)= @_; my $groupval= $gf->dump_group(); my $string = sprintf("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s", #%s $gf->seqname(), ## this is always CHROMOSOME_z - change to $groupval name $gf->source(), $gf->feature(), $gf->start()+0, $gf->end()+0, $gf->score(), # version 2 GFF can have '.' $gf->strand(), $gf->frame(), $groupval ); print $file $string; print $file "\n"; } __END__ ## from genomefeat.pl - move to some package/pm #sub getSym2Id { # my ($org, $acode, $domarker)= @_; # %sym2id= (); # %idsym= (); # %idsize= (); # local(*ALIB); # open(ALIB,$acode); # $/= "# EOR\n"; # my $insyn; # while () { # my($id,$sym,$syn)= ('','',''); # $id= $1 if (/\nID\|(.+)/); # $id= 'MEOW:'.$id if ($isMeowId); # # $sym= $1 if (/\nSYM\|(.+)/); # next unless ($sym && $id); ## if ($domarker) { ## $idsize{$id}= isgoodmarker($_); ## } # $sym2id{symcase($sym)}= $id; # $idsym{$id}= $sym; # # if (/\nSYN\|(.+)/) { # $syn= $1; # $sym2id{symcase($syn)}= $id; # my ($at, $e, $x); ## get any more SYN|s ... # $at= index($_,"\nSYN\|"); $e= $at+5; $at= -1; # while (($x= index($_,"\n", $e)) > 0) { # if (substr($_,$x+1,1) eq '|') { $e = $x+1; $at= $e if ($at<0); } # else { $e= $x; last; } # } # if ($at>=0 && $e>$at) { # my $s= substr($_, $at, $e-$at); # foreach $s (split(/\n/, $s)) { # $s =~ s/^\w*\|//; # $sym2id{symcase($s)}= $id; # } # } # } # # if ($org eq 'worm') { # ##?? and add DBL|WP:B0379.7 since some worm PRED gene syms don't match GB sym - ACEPRED:B0379.7a # if (/\nDBL\|WP:(.+)/) { # $sym2id{symcase($1)}= $id; # } # } # # } # $/= "\n"; # close(ALIB); # return %sym2id; ## ?? also return %idsym #} ######## ## do some G/chi stats here - can't seem to find stats package that can do? ## -- do Chi instead - ? need multi feature Chi test? C = chromosome F = feature count G = Sum( freq * ln(freq) ) df = No. levels or values per class ? adjust counts by # bases/csome? - treat as yes/no items per base position? e.g., has_trna vs. no_trna ? sub Gtest { my($tabref)= @_; my %t= % $tabref; foreach my $tc (sort keys %t) { my $tv= $t{$tc}; if (ref($tv) =~ /HASH/) { # $tsum{$tc} = logsumref($tv, $tc); } } sub logsumref { my($tabref, $tabkey)= @_; my %t= % $tabref; my $tsum= 0; foreach my $tc (sort keys %t) { my $tv= $t{$tc}; if (ref($tv) =~ /HASH/) { $tsum{$tabkey}{$tc} = logsumref($tv,$tc); } else { # $tsum += $tv; $tsum{$tabkey} += $tv * log($tv); } } # return $tsum; } Feature counts differ over all csomes (true) Chromosomes differ in any feature counts Which csomes differ from others in which feature counts? X differs from autosomes (which features)? Hypothesis df G C x F C x F[1] C x F[2] ... C[x]F[1] vs. C[others]F[1] ??? ... @n= ( 1, 1.234567, 123456.7890, -1.2345, -123445.67890); foreach (@n) { printf " [%8.1f]",$_; printf " [%-8.1f]",$_; printf " [%8.1g]",$_; printf " [%0.1f]",$_; print "\n"; } exit; ### # GFF input list -- fix to join gene parts gene R04A9 - 93008..145953 - - ## one gene -> one CDS, one mRNA (exon/intron...) gene R04A9.2 - 93592..97989 MEOW:CEgn0014353 - - CDS R04A9.2 - 93592..93850 MEOW:CEgn0014353 - - exon R04A9.2 - 93592..93850 MEOW:CEgn0014353 - - intron R04A9.2 - 93851..93896 MEOW:CEgn0014353 - - exon R04A9.2 - 93897..94074 MEOW:CEgn0014353 - - CDS R04A9.2 - 93897..94074 MEOW:CEgn0014353 - - intron R04A9.2 - 94075..94123 MEOW:CEgn0014353 - - CDS R04A9.2 - 94124..94403 MEOW:CEgn0014353 - - exon R04A9.2 - 94124..94403 MEOW:CEgn0014353 - - intron R04A9.2 - 94404..94751 MEOW:CEgn0014353 - - CDS R04A9.2 - 94752..95362 MEOW:CEgn0014353 - - exon R04A9.2 - 94752..95362 MEOW:CEgn0014353 - - intron R04A9.2 - 95363..95414 MEOW:CEgn0014353 - - exon R04A9.2 - 95415..95625 MEOW:CEgn0014353 - - CDS R04A9.2 - 95415..95625 MEOW:CEgn0014353 - - intron R04A9.2 - 95626..95833 MEOW:CEgn0014353 - - CDS R04A9.2 - 95834..96162 MEOW:CEgn0014353 - - exon R04A9.2 - 95834..96162 MEOW:CEgn0014353 - - intron R04A9.2 - 96163..96209 MEOW:CEgn0014353 - - CDS R04A9.2 - 96210..96403 MEOW:CEgn0014353 - - exon R04A9.2 - 96210..96403 MEOW:CEgn0014353 - - intron R04A9.2 - 96404..96727 MEOW:CEgn0014353 - - CDS R04A9.2 - 96728..97133 MEOW:CEgn0014353 - - exon R04A9.2 - 96728..97133 MEOW:CEgn0014353 - - intron R04A9.2 - 97134..97179 MEOW:CEgn0014353 - - CDS R04A9.2 - 97180..97531 MEOW:CEgn0014353 - - exon R04A9.2 - 97180..97531 MEOW:CEgn0014353 - - intron R04A9.2 - 97532..97587 MEOW:CEgn0014353 - - CDS R04A9.2 - 97588..97778 MEOW:CEgn0014353 - - exon R04A9.2 - 97588..97778 MEOW:CEgn0014353 - - intron R04A9.2 - 97779..97826 MEOW:CEgn0014353 - - exon R04A9.2 - 97827..97989 MEOW:CEgn0014353 - - CDS R04A9.2 - 97827..97989 MEOW:CEgn0014353 - - ## .. gene R04A9.3 - complement(98728..101906) MEOW:CEgn0014354 - - CDS R04A9.3 - complement(98728..98783) MEOW:CEgn0014354 ## perl -e 'while(<>){@f=split(/\t/); $k{$1}++ if ($f[8]=~/^(\w+)\s+/);}\ foreach (sort keys %k){print "$_\t$k{$_}\n";}' CHROMOSOME_X.gff Allele 114 >> insertion Allele "pkP5006" ; Insert "Tc1" Clone 710 Confirmed_by_EST 29 > curated intron Sequence "F13C5.2" ; Confirmed_by_EST Confirmed_by_cDNA 1 Confirmed_in_UTR 26 Note 11132 Sequence 60575 Target 440687 >> BLASTX Target "Sequence:G42E22" 28113 28331 ## for worm/acode, sstrome - look at tRNA kinds per csome perl -e ' $/="\n# EOR"; \ while(<>){ if(/\nFNC\|(tRNA\-\w+)/){$t= $1; \ if(/\nCHR\|(\w+)/){$c= $1; $h{$c}++; $h{"$c,$t"}++;}} else {$h{"unlocated,$t"}++;} }\ foreach (sort keys %h) { print "$_\t$h{$_}\n"; }' acode ###### kalo% wc c*gff 91135 992580 6778691 csome_I.gff 95938 1038845 7171225 csome_II.gff 85902 943874 6612285 csome_III.gff 86786 935123 6484390 csome_IV.gff 124483 1335074 9160464 csome_V.gff 74809 780435 5345909 csome_X.gff 559053 6025931 41552964 total kalo% head -5 c*gff ==> csome_I.gff <== ##gff-version 2 ##date 2000-03-01 ##sequence-region CHROMOSOME_I 1 14435323 CHROMOSOME_I Sequence 1 2036 . + . Sequence "cTel33B" CHROMOSOME_I Sequence 1 14435323 . + . Sequence "CHROMOSOME_I" ==> csome_II.gff <== ##gff-version 2 ##date 2000-03-01 ##sequence-region CHROMOSOME_II 1 15205541 CHROMOSOME_II Sequence 1 15205541 . + . Sequence "CHROMOSOME_II" CHROMOSOME_II curated Sequence 1711 4390 . + . Sequence "2L52.1" ==> csome_III.gff <== ##gff-version 2 ##date 2000-03-01 ##sequence-region CHROMOSOME_III 1 13093964 CHROMOSOME_III Sequence 1 13093964 . + . Sequence "CHROMOSOME_III" CHROMOSOME_III Sequence 3907 35782 . + . Sequence "H10E21" ==> csome_IV.gff <== ##gff-version 2 ##date 2000-03-01 ##sequence-region CHROMOSOME_IV 1 17047437 CHROMOSOME_IV Sequence 1 1243 . + . Sequence "cTel4X" ==> csome_V.gff <== ##gff-version 2 ##date 2000-03-01 ##sequence-region CHROMOSOME_V 1 21085422 CHROMOSOME_V Sequence 1 4957 . + . Sequence "cTel3X" ==> csome_X.gff <== ##gff-version 2 ##date 2000-03-01 ##sequence-region CHROMOSOME_X 1 17408926 CHROMOSOME_X Sequence 1 2649 . + . Sequence "CTEL7X" =========== jul01 check of .gff features set epat="'assembly_tag|BLAST|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch'" set epat="'BLAST|assembly_tag|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch'" ## gzegrep -v $epat is missing 1st patt ! set epat="'qrxrqx|BLAST|assembly_tag|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch'" set ws=/c7/eugenes/worm/current_release/CHROMOSOMES/ gzegrep -v $epat $ws/CHROMOSOME_V.gff.gz |\ perl -e 'while (<>){next unless(/\t/); @f=split(/\t/); $a{"$f[1].$f[2]"}++;} foreach(sort keys %a){print "$_\t$a{$_}\n"; }' oat% echo $epat 'qrxrqx|BLAST|assembly_tag|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch' gzegrep -v $epat $ws/CHROMOSOME_V.gff.gz |\ perl -e 'while (<>){next unless(/\t/); @f=split(/\t/); $a{"$f[1].$f[2]"}++;} \ foreach(sort keys %a){print "$_\t$a{$_}\n"; }' .ALLELE 44 .OLIGO 9296 .Sequence 2 .insertion 80 .intron 175 .misc_feature 37 .misc_signal 5 .misc_structure 1 .repeat_region 39 .variation 3 GenePair_STS.structural 4748 Genomic_canonical.Sequence 730 Link.Sequence 4 NDB_CDS.CDS 860 NDB_CDS.Sequence 124 NDB_CDS.exon 1004 NDB_CDS.intron 880 ^^^ ? what are these? PSsearch.similarity 164 Pseudogene.CDS 3 Pseudogene.Sequence 197 Pseudogene.exon 456 Pseudogene.intron 259 RNAi.experimental 453 SL1.trans-splice_acceptor 21 SL2.trans-splice_acceptor 16 Transposon.repeat 2 cDNA_for_RNAi.experimental 451 curated.CDS 27440 curated.Sequence 4854 curated.exon 27450 curated.intron 22596 hmmfs.3.similarity 13120 inverted.repeat 17681 mRNA.transcription 34 mRNA_GENOME.similarity 2021 misc_feature.miscellaneous 7 operon.transcription 3 possible_error.miscellaneous 3 provisional.CDS 250 provisional.Sequence 42 provisional.exon 250 provisional.intron 208 scan.similarity 707 tRNAscan-SE-1.11.Sequence 153 tRNAscan-SE-1.11.exon 160 tRNAscan-SE-1.11.intron 7 tandem.repeat 11823 ==== may02 check of .gff features set epat="'qrxrqx|assembly_tag|BLAST|BLAT_|BLATX_|WABA_|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch'" set ws=/c7/eugenes/worm/current_release/CHROMOSOMES/ gzegrep -v $epat $ws/CHROMOSOME_V.gff.gz |\ perl -e 'while (<>){next unless(/\t/); @f=split(/\t/); $a{"$f[1].$f[2]"}++;} \ foreach(sort keys %a){print "$_\t$a{$_}\n"; }' .ALLELE 340 .Feature_data 633 .Homol_data 5067 .LTR 18 .OLIGO 6 .PCR_product 79 .Sequence 1 .deletion 139 .insertion 44 .intron 899 .misc_feature 3 .misc_structure 1 .repeat_region 39 .variation 1332 Expr_profile.Expression 4367 GenePair_STS.structural 4978 Genomic_canonical.Sequence 727 Link.Sequence 3 Pseudogene.CDS 3 Pseudogene.Sequence 175 Pseudogene.exon 438 Pseudogene.intron 263 RNA.Sequence 16 RNA.exon 16 RNAi.experimental 427 RepeatMasker.repeat 5718 SL1.trans-splice_acceptor 99 SL2.trans-splice_acceptor 27 Transposon.repeat 36 WTP.partial_gene 1833 cDNA_for_RNAi.experimental 421 curated.CDS 28087 curated.Sequence 4961 curated.exon 28087 curated.intron 23126 hmmfs.3.similarity 12943 inverted.repeat 16992 misc_feature.miscellaneous 7 operon.transcription 3 possible_error.miscellaneous 9 scan.similarity 299 tRNAscan-SE-1.11.Sequence 153 tRNAscan-SE-1.11.exon 160 tRNAscan-SE-1.11.intron 7 tandem.repeat 10560 waba_coding.similarity 70103 ## what are these waba's ?? waba_strong.similarity 72808 waba_weak.similarity 159979 wublastx.similarity 351003 ## filter out ? >>> output of 1st try parse2: ??? Where are the CDS ??? cat $wt/features-V.tsv | perl -e 'while(<>){next unless(/^(\w+)/);$a{$1}++;} \ foreach(sort keys %a){print "$_\t$a{$_}\n"; }' --- features-V.tsv Expression 4367 Feature_data 633 GenePair_STS 4978 Homol_data 705 LTR 18 PCR_product 79 RNAi 427 Transposon_repeat 36 allele 340 cDNA_for_RNAi 421 gene 5879 insertion 44 inverted_repeat 9652 mRNA 238 misc_feature 3 misc_structure 1 miscellaneous 16 oligo 6 partial_gene 1833 repeat 5718 repeat_family 6057 repeat_region 39 sequence 3 source 1 tRNA 153 tandem_repeat 2620 trans 126 transcription 3 variation 1332 --- features-I.tsv CDS 3006 Expression 2478 Feature_data 465 GenePair_STS 3057 Homol_data 463 LTR 6 PCR_product 70 RNAi 2657 Transposon_repeat 11 allele 212 cDNA_for_RNAi 358 gene 3531 insertion 31 inverted_repeat 5825 mRNA 3060 misc_feature 8 misc_signal 5 miscellaneous 9 oligo 40 partial_gene 1794 polyA_site 2 repeat 5045 repeat_family 4248 repeat_region 20 repeat_unit 8 sequence 6 source 1 tRNA 71 tandem_repeat 2766 trans 203 -- FIXME == trans.splice... transcription 30 variation 863 --- features-II.tsv CDS 3524 Expression 3068 Feature_data 459 GenePair_STS 3652 Homol_data 493 LTR 6 PCR_product 57 RNAi 350 Transposon_repeat 14 allele 320 cDNA_for_RNAi 338 gene 4110 insertion 34 inverted_repeat 4914 mRNA 3601 misc_feature 13 misc_signal 1 miscellaneous 20 oligo 20 partial_gene 1785 repeat 5143 repeat_family 3594 repeat_region 67 sequence 4 source 1 tRNA 64 tandem_repeat 2713 trans 121 variation 983 --- features-III.tsv CDS 2778 Expression 2205 Feature_data 426 GenePair_STS 6439 Homol_data 397 LTR 2 PCR_product 55 RNAi 2890 Transposon_repeat 9 allele 191 cDNA_for_RNAi 356 gene 3216 insertion 13 inverted_repeat 5581 mRNA 2814 misc_feature 31 misc_signal 1 miscellaneous 20 oligo 4274 partial_gene 1691 polyA_site 2 repeat 6439 repeat_family 4496 repeat_region 16 repeat_unit 3 sequence 6 source 1 tRNA 71 tandem_repeat 2554 trans 212 transcription 8 variation 770 === dec01 check === oat% echo $epat qrxrqx|assembly_tag|BLAST|BLAT_|BLATX_|WABA_|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch gzegrep -v $epat $ws/CHROMOSOME_V.gff.gz |\ perl -e 'while (<>){next unless(/\t/); @f=split(/\t/); $a{"$f[1].$f[2]"}++;} \ foreach(sort keys %a){print "$_\t$a{$_}\n"; }' .ALLELE 336 .Feature_data 422 .Homol_data 707 .OLIGO 9224 .PCR_product 79 .Sequence 1 .deletion 139 .insertion 54 .intron 705 .misc_feature 76 .misc_signal 4 .misc_structure 1 .repeat_region 39 .variation 1332 Expr_profile.Expression 4394 GenePair_STS.structural 5037 Genomic_canonical.Sequence 729 Link.Sequence 3 NDB_CDS.CDS 649 NDB_CDS.Sequence 99 NDB_CDS.exon 697 NDB_CDS.intron 598 Pseudogene.CDS 3 Pseudogene.Sequence 198 Pseudogene.exon 461 Pseudogene.intron 263 RNA.Sequence 7 RNA.exon 7 RNAi.experimental 448 SL1.trans-splice_acceptor 22 SL2.trans-splice_acceptor 16 Transposon.repeat 5 WTP.partial_gene 1833 cDNA_for_RNAi.experimental 445 curated.CDS 27985 curated.Sequence 4948 curated.exon 27985 curated.intron 23037 history.Sequence 29 history.exon 52 history.intron 42 hmmfs.3.similarity 6929 inverted.repeat 17000 misc_feature.miscellaneous 7 operon.transcription 3 possible_error.miscellaneous 9 scan.similarity 299 tRNAscan-SE-1.11.Sequence 153 tRNAscan-SE-1.11.exon 160 tRNAscan-SE-1.11.intron 7 tandem.repeat 10560 === check features*tsv kinds cat worm/features-I.tsv |\ perl -e 'while(<>){next unless(/^\w/);($f,$x)=split(/\t/);$h{$f}++;}foreach(sort keys %h){print "$_\t$h{$_}\n";}' jul01 ----------- : 2nd run chr I 135550 1 CDS 3040 EMBL_mRNA_similarity 1 RNAi 2628 Transposon_repeat 2 allele 24 cDNA_for_RNAi 360 gene 3550 insertion 44 inverted_repeat 5486 mRNA 3079 mRNA_GENOME_similarity 2450 misc_feature 66 misc_signal 1 miscellaneous 4 oligo 5359 polyA_site 2 repeat_family 3775 repeat_region 20 repeat_unit 8 sequence 5 source 1 structural 2681 << all are PCR_product tRNA 71 tandem_repeat 2802 trans-splice_acceptor 39 variation 1 -------- chr II 111217 1 CDS 3551 RNAi 353 allele 23 cDNA_for_RNAi 351 gene 4115 insertion 90 inverted_repeat 4472 mRNA 3611 mRNA_GENOME_similarity 1380 misc_feature 35 misc_signal 2 misc_structure 1 miscellaneous 14 oligo 6562 repeat_family 3185 repeat_region 67 sequence 2 source 1 structural 3322 tRNA 64 tandem_repeat 2633 trans-splice_acceptor 22 variation 7 --------- chr III CDS 2809 RNAi 764 Transposon_repeat 5 allele 34 cDNA_for_RNAi 372 gene 3244 insertion 39 inverted_repeat 4903 mRNA 2837 mRNA_GENOME_similarity 2119 misc_feature 69 misc_signal 3 miscellaneous 12 oligo 4696 polyA_site 2 repeat_family 3662 repeat_region 16 repeat_unit 3 sequence 5 source 1 structural 2425 tRNA 71 tandem_repeat 2420 trans-splice_acceptor 33 transcription 8 variation 1 ---------- chr IV 112132 1 CDS 3344 RNAi 403 Transposon_repeat 4 allele 26 cDNA_for_RNAi 401 gene 3928 insertion 39 inverted_repeat 5544 mRNA 3421 mRNA_GENOME_similarity 2063 misc_feature 43 misc_signal 8 miscellaneous 2 oligo 5812 repeat_family 2957 repeat_region 32 sequence 4 source 1 structural 2963 tRNA 83 tandem_repeat 2246 trans-splice_acceptor 23 ----- chr V CDS 5002 RNAi 451 Transposon_repeat 2 allele 44 cDNA_for_RNAi 449 gene 5918 insertion 80 inverted_repeat 5946 mRNA 5197 mRNA_GENOME_similarity 2021 misc_feature 37 misc_signal 5 misc_structure 1 miscellaneous 10 oligo 9296 repeat_family 3999 repeat_region 39 sequence 3 source 1 structural 4718 tRNA 153 tandem_repeat 3001 trans-splice_acceptor 37 transcription 37 variation 3 ------- chr X 411144 1 CDS 2895 RNAi 428 Transposon_repeat 8 allele 45 cDNA_for_RNAi 425 gene 3555 insertion 116 inverted_repeat 3029 mRNA 2921 mRNA_GENOME_similarity 2676 misc_feature 20 misc_signal 2 miscellaneous 6 oligo 5282 polyA_signal 1 polyA_site 5 repeat_family 2208 repeat_region 21 repeat_unit 1 sequence 4 source 1 structural 2616 tRNA 305 tandem_repeat 1204 trans-splice_acceptor 17 variation 4 jul01 ----------- : 135550 1 << which is this? ALLELE 24 <