#!/usr/local/bin/perl # parse worm gff features =head NOTES use GFF is slow and mem. piggy - avoid? cut unwanted from worm.gff: -- may later want some of these egrep -v 'assembly_tag|BLAST|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch' x.gff > n.gff keep or drop LINK, SUPERLINK ? what are these ? - 2-4 per csome.gff: CHROMOSOME_I.gff: Sequence 2137 6396950 Sequence "SUPERLINK_RW1" ? new gff feats to strip mar01: curated Sequence | provisional Sequence | Genomic_canonical ? join long lists of repeats that cluster at same range ? - likely a computation exess kind ==? repeat_region (already a few of these in worm.gff) or repeat_tandem_region, repeat_inverted_region, repeat_family_region jul01 FIXME: experimental - - 173894..174992 - - RNAi "JA:F56C11.6" ^^ should be RNAi structural - - 180299..181965 - - PCR_product "sjj_F56C11.5" ^^ should be PCR_product ? jul01 new oat% wc worm/features-I.tsv 56407 484930 3790824 worm/features-I.tsv mar01 old: oat% wc ../genomes/worm/features-I.tsv 22483 175322 1992298 ../genomes/worm/features-I.tsv ^^^^^ is big diff due to junk of some kind? perl -e 'while (<>){next unless(/\t/); @f=split(/\t/); $a{"$f[1]_$f[2]"}++;} foreach(sort keys %a){print "$_\t$a{$_}\n"; }' CHROMOSOME_V.gff CHROMOSOME_I.gff:##sequence-region CHROMOSOME_I 1 14435323 CHROMOSOME_II.gff:##sequence-region CHROMOSOME_II 1 15205541 CHROMOSOME_III.gff:##sequence-region CHROMOSOME_III 1 13093964 CHROMOSOME_IV.gff:##sequence-region CHROMOSOME_IV 1 17047437 CHROMOSOME_V.gff:##sequence-region CHROMOSOME_V 1 21085422 CHROMOSOME_X.gff:##sequence-region CHROMOSOME_X 1 17408926 /c7/eugenes/worm/current_release/CHROMOSOMES/ ... .dna == .fasta CHROMOSOME_I.dna.gz CHROMOSOME_IV.dna.gz chrom_dump.log.Z CHROMOSOME_I.gff.gz CHROMOSOME_IV.gff.gz composition.all.Z CHROMOSOME_II.dna.gz CHROMOSOME_V.dna.gz dna_dump.log.Z CHROMOSOME_II.gff.gz CHROMOSOME_V.gff.gz gff_dump.log.Z CHROMOSOME_III.dna.gz CHROMOSOME_X.dna.gz totals.Z CHROMOSOME_III.gff.gz CHROMOSOME_X.gff.gz dec01: drop these feats: WABA_briggsae_cosmid_similarity 254896 / 408525 (I) BLAT_EST_BEST_similarity 104070 / 408525 (I) BLATX_ 4000/ set epat=" -v 'qrxrqx|assembly_tag|BLAST|BLAT_|BLATX_|WABA_|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch'" gzegrep $epat $wc/*_I.gff.gz | readseq -pipe -inform=gff -f=fff -a -o $gw/worm/worm_I.fff Exception in thread "main" java.lang.OutOfMemoryError set rs='/usr/java1.3/bin/java -Xms80M -Xmx256M -cp /usr/local/bin/readseq.jar run' gzegrep $epat $wc/*_I.gff.gz | $rs -pipe -inform=gff -f=fff -a -o $gw/worm/worm_I.fff =cut $egreppatt= " -v 'qrxrqx|assembly_tag|BLAST|BLAT_|BLATX_|WABA_|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch'"; $egrep= 'egrep '. $egreppatt; $gzgrep= 'gzegrep ' . $egreppatt; $debug= 0; use Getopt::Std; use POSIX; use flybase::sym2id; # use lib ("GFF"); use GFF; # or require ""; import ""; # use GFF::GeneFeature; # use GFF::GeneFeatureSet; use GFF::Analysis; ## buggers ! ## paths for iubio/kalo $meowpub= '/c6/iubio/meow-pub/meow/server'; # /c3/iubio/meow-pub/meow/server/ $genomeshome= '/c7/eugenes/genomew/'; $wormchrdata= '/c7/eugenes/worm/current_release/CHROMOSOMES/'; $org= 'worm'; $orgacode= "$meowpub/$org/acode"; $isMeowId= 1; ## for getSym2Id - acode data $orgaw= "$genomeshome/$org/acode"; if (-r $orgaw && -M _ < -M $orgacode) { $orgacode= $orgaw; } $sourename= 'WormBase GFF data'; @chrs= qw( I II III IV V X ); $nchr= scalar(@chrs); ## FIXME this hardcode sizes are bad; get from CHROMOSOME_I.fasta... %chrsizes = ( I => 14435323, II => 15205541, III => 13093964, IV => 17047437, V => 21085422, X => 17408926, ); $FeatByChr= 1; $useGFFpm= 1; $forGnomap= 1; $gnomapvers= '1'; %opt=(); Getopt::Std::getopts('o:c:dxW:DA:',\%opt); $mework= $opt{W} || $genomeshome; ##!!!! test, fix $outpath= "$mework/$org/"; $outpath= $opt{o} || $outpath; $orgacode= $opt{A} if $opt{A}; $debug= 1 if $opt{D}; $usedefdata= 1 if $opt{x}; $nodna= 1 if $opt{d}; if ($opt{c}) { @chrset= split(/[,;:]/,$opt{c}); } else { @chrset= @chrs; } %chrset= map { $_ => 1; } @chrset; if($usedefdata) { opendir(D, $wormchrdata); @fs= readdir(D); closedir(D); @gffset= map { $wormchrdata.$_; } grep( /CHROMOSOME.+\.gff/, @fs); @dnaset= map { $wormchrdata.$_; } grep( /CHROMOSOME.+\.dna/, @fs); } else { @gffset= grep( /gff/, @ARGV); @dnaset= grep( /dna/, @ARGV); } $okay= scalar(@gffset); if ($okay) { foreach $f (@gffset) { $okay= 0 unless (-r $f); } } unless ($okay) { print "parsewormgff [-o output] [-Aacode] [-x] worm-gff-files\n"; print " Usage: \n -D == debug\n -o output path == [ $outpath ]\n"; print " -d NO DNA output\n -A acode == worm.acode [ $orgacode ]\n "; print " -c I,II,.. == chrs to read [ ".join(",",@chrs)." ] \n"; print " -x use default data [ $wormchrdata/CHROMOSOME*gff ] \n"; print "Data chosen: ".join("\n",@gffset)."\n"; print "Dna chosen: ".join("\n",@dnaset)."\n"; exit; } unless($nodna) { foreach $f (@dnaset) { if ($f =~ /([IVX]+)\./) { $chr= $1; } else { warn "What chromosome is $f ?"; next; } $outf= "$outpath/dna-$chr.raw"; fasta2raw( $f, $outf, $chr) if (-r $f); ## set %chrsizes } } else { foreach $chr (@chrs) { $outf= "$outpath/dna-$chr.raw"; getChrSizes( $outf, $chr) if (-r $outf); } } $totalsize= 0; $autosize= 0; foreach (@chrs) { $totalsize += $chrsizes{$_}; $autosize += $chrsizes{$_} unless($_ eq 'X'); } if ($useGFFpm) { # %sym2id= getSym2Id($org, $orgacode); flybase::sym2id::set4meow($isMeowId); flybase::sym2id::readSym2Id($orgacode, $org) if (-f $orgacode); ## $outf= \*STDOUT; foreach $infile (@gffset) { $chr= 'null'; $chr= $1 if ($infile =~ m/_([IVX]+)/i); next unless ($chrset{$chr}); #?? # $chrsize= $chrsizes{$chr}; # BAD, get from gff or from dna $outname= "$outpath/features-$chr.tsv"; my @tm= localtime( $^T - 24*60*60*(-M $infile) ); my $fdate= POSIX::strftime("%d-%B-%Y",@tm); open(OUTF,">$outname") or die "cant write $outname"; if ($infile =~ /\.(gz|Z)$/) { $cmd= $gzgrep; } else { $cmd= $egrep; } open(INF,"$cmd $infile |") or die "cant $cmd $infile"; parseGFF("[$infile, $fdate]", *INF, *OUTF); close(INF); close(OUTF); system("touch -r $infile $outname"); } } else { summarizeChrGFF(); } exit; #------------------------------------- sub fasta2raw { my ($dnafile, $outfile, $chr)= @_; local(*F,*O, $ok); warn "fasta2raw $dnafile -> $outfile\n" if ($debug); if ($dnafile =~ /\.(gz|Z)$/) { $ok= open(F,"gzcat $dnafile|"); } else { $ok= open(F,$dnafile); } if ($ok) { $ok= open(O,">$outfile"); } unless($ok) { warn "Cant read $dnafile"; return; } $headline= ; # skip single fasta >title line unless($headline =~ /^>/) { warn "Bad fasta title: $headline"; } while () { chomp(); print O $_; } close(F); close(O); system("touch -r $dnafile $outfile"); $chrsizes{$chr}= -s $outfile; warn "chr $chr size= $chrsizes{$chr}\n" if $debug; } sub getChrSizes { my ($dnafile, $chr)= @_; if (-s $dnafile) { $chrsizes{$chr}= -s $dnafile; warn "chr $chr size= $chrsizes{$chr}\n" if $debug; } } sub summarizeChrGFF { while (<>) { # if (/sequence-region\s+CHROMOSOME_(\w+)\s+(\d+)\s+(\d+)/) { # $chr= $1; $b= $2; $c= 3; # $len= $e - $b + 1; # $chrsizes{$chr}= $len; # $totalsize += $len; # $a{Length} = $totalsize; # $ac{$chr}{Length}= $len; # } next if(/^#/); next unless(/\t/); @f= split(/\t/); $chr= $f[0]; $chr =~ s/CHROMOSOME_//i; $key= "$f[1]_$f[2]"; $a{$key}++; $ac{$chr}{$key}++; } ## end input.. $a{Length}= $totalsize; foreach $chr (@chrs) { $ac{$chr}{Length} = $chrsizes{$chr}; } @fldlabs= (sort keys %a); if ($FeatByChr) { ## features (row) x csomes (col) + chi values print "Feature\tObs.",join("\tObs.",@chrs); print "\tDev.(o-e)\tDev.",join("\tDev.",@chrs); print "\tDev.auto\tDev.X"; print "\tChi2(all)\tChi2(X-auto)"; print "\n"; foreach $key (@fldlabs) { print "$key"; $chisum= 0.0; $tv= 0; $autov= 0; foreach $c (@chrs) { %ch= %{ $ac{$c} }; $v= $ch{$key} || 0; $tv += $v; $autov += $v unless($c eq 'X'); print "\t$v"; } print "\td"; foreach $c (@chrs) { %ch= %{ $ac{$c} }; $v= $ch{$key} || 0; $chrsizefac = $chrsizes{$c} / $totalsize; $expect = $tv * $chrsizefac; # null hypothesis 1 - evenly distributed per base ## need also X vs autosome test H2, and maybe inter-autosome test? ## ? combine Chis over features, test of all? $dev = $v - $expect; $chi = ($expect == 0) ? 0 : (($dev * $dev) / $expect); $chisum += $chi; printf "\t%0.1f",$dev; if ($c eq 'X') { $chiX= $chi; $devX= $dev; } # else { } } # print "\te"; foreach $c (@chrs) { print "\t$chie{$c}"; } # print "\td"; foreach $c (@chrs) { print "\t$chid{$c}"; } $expect= $tv * $autosize / $totalsize; $dev= $autov - $expect; $chisum2= $chiX + (($dev * $dev) / $expect); printf "\t%0.1f",$dev; printf "\t%0.1f",$devX; printf "\t%0.1f",$chisum; # df = 5 printf "\t%0.1f",$chisum2; # df = 1 print "\n"; } print "\n"; } else { ## csomes (row) x features (col) print "CHR"; foreach $key (@fldlabs){ print "\t$key"; } print "\n"; foreach $c (@chrs) { $cv= $ac{$c}; %ch= % $cv; print $c; foreach $key (@fldlabs){ $v= $ch{$key} || 0; print "\t$v"; } print "\n"; } } } sub parseGFF { my($srcinfo, $inh, $outh) = @_; ##open(F, $infile) || die "can't open $infile"; # =item read( \*INPUT, \&filter, \&converter ) # use filter to exclude assembly_tag items, others ? $gfs= new GFF::GeneFeatureSet; $gfs->read_header($inh); #? get chr# from header comments? $gfs->read($inh); ## ! big memory pig, and slowish - can we bypass use? ## but need sorted out - use unix sort in 2nd pass? my @chrreg= $gfs->region(); #$self->{'region'}->[0] = $seqname ; #$self->{'region'}->[1] = $start ; #$self->{'region'}->[2] = $end ; $chrstart= $chrreg[1]; $chrsize= $chrreg[2]; warn "source\t$org\tChr $chr\t$chrstart..$chrsize\t-\n" if ($debug); if ($forGnomap) { print $outh "# $org/features-$chr.tsv\n"; print $outh "# Features for $org from $sourename $srcinfo\n"; $gfs->dump_header($outh); ##? print $outh "#\n"; print $outh "#gnomap-version $gnomapvers\n"; print $outh "#Feature\tgene\tmap\trange\tid\tdb_xref\tnotes\n"; print $outh "source\t$org\tChr $chr\t$chrstart..$chrsize\t-\n" if $chrsize; } else { $gfs->dump_header($outh); ##? } ## don't need this - same info as in current list ## print $outh "# GFF makeGenes list\n"; ## $gfgenes= GFF::Analysis::makeGenes($gfs); ## need this to make gene, mRNA join()s - TEST ## foreach $gf ( $gfgenes->order_gf() ) { ## ## tognomap($gf, $outh); ## $gf->dump($outh); ## last if ($ntest++ > 20); ## } ## print $outh "# GFF input list\n"; ## need to pack together CDS for same gene, exon/intron sets per gene ## $gfs->order_gf() doesn't ensure Sequence/gene item is before its parts ## doesn't keep all parts of gene together either ! %seqstart= (); foreach $gf ( order_byStartAndGroup($gfs) ) { # $gf->dump($outh,$tab,$newline,$flen,$tag); tognomap($gf, $outh); # printGFF } tognomap('EndOfData', $outh); # printGFF # foreach $gf ( @{$gfs->{'feature'}} ) { } # while ($gf= $gfs->nextGeneFeature()) { printGFF($gf, $outh); } ##close(F); } sub _by_startAndGroup { ## my ($ar, $br, $astart, $afeat, $agroup, $bstart, $bfeat, $r); ##? save mem w/o my? $astart= $a->start; $afeat= $a->feature(); $agroup= $a->group() || "xxx"; if ($afeat =~ /Sequence/) { $seqstart{$agroup}= $astart; $ar= 1; } elsif ($afeat =~ /CDS|exon|intron/) { $astart= $seqstart{$agroup} || $astart; $ar= 2; } else { $ar= 3; } $bstart= $b->start; $bfeat= $b->feature(); $bgroup= $b->group() || "xxx"; if ($bfeat =~ /Sequence/) { $seqstart{$bgroup}= $bstart; $br= 1; } elsif ($bfeat =~ /CDS|exon|intron/) { $bstart= $seqstart{$bgroup} || $bstart; $br= 2; } else { $br= 3; } # Larry Wall sez: (0 cmp/<=> fall down to next 'or' case) return ($astart <=> $bstart) or ($agroup cmp $bgroup) or ($ar <=> $br); # $rsort= ( $astart <=> $bstart ); # if ($rsort == 0) { # $rsort= ( $agroup cmp $bgroup ); # if ($rsort == 0) { $rsort = ($ar <=> $br); } # } # return $rsort; } sub order_byStartAndGroup { my $gfs = shift; @sort_array = sort _by_startAndGroup @{$gfs->{'feature'}} ; return @sort_array; } sub tognomap { my($gf,$file)= @_; $OUTH= $file; ## hack - fix callees my $glist; if ($gf eq 'EndOfData') { foreach $ghkey (keys %ghash) { $glist= $gparts{$ghkey}; joinparts1($glist) if ($glist); delete $gparts{$ghkey}; delete $ghash{$ghkey}; } if (scalar(@tandlast)>5) { &printpart( @tandlast ); @tandlast= undef; } if (scalar(@invlast)>5) { &printpart( @invlast ); @invlast= undef; } if (scalar(@rfamlast)>5) { &printpart( @rfamlast ); @rfamlast= undef; } return; } my $gsym= '-'; my $note= '-'; my $dbxref= '-'; my $group= $gf->dump_group(); ## this is Note: for misc features - never gsym if Note !? if ($group =~ m/Sequence\s+"([^"]+)/) { $gsym= $1; } ## always Sequence/gene name? - no ## elsif ($group =~ m/Note\s+"([^"]+)/) { $note= $1; } else { $note= $group; } ## save it allways? see below summary: Note, Allele, etc. ## my $chr1= $gf->seqname(); ## always CHROMOSOME_z when from chromo_z.gff my $gfstart= $gf->start()+0; my $gfend= $gf->end()+0; my $range= "$gfstart..$gfend"; my $compl= $gf->strand(); # $gf->score(); # version 2 GFF can have '.' # $gf->frame(); my $feat= $gf->feature(); my $source= $gf->source(); ## curated, etc. ## add/edit $feat w/ $source ## need various feature/flavor renames for gnomap data ## odd source/features: ## curated Sequence/intron/exon/CDS (most seq/genes), hand_built S/i/e/C ?, stl S/i/e/C ? ## provisional S/i/e/C ? ## hmmfs.3_similarity == repeat family 2 ## hmmfs_similarity == repeat fam. 1 ## hold all gene components and print if current $gsym != last $gsym ## Sequence/intron/exon/CDS foreach $ghkey (keys %ghash) { my $end= $ghash{$ghkey}; if ($gfstart > $end) { $glist= $gparts{$ghkey}; ## print STDERR "got glist for $ghkey: $glist\n" if $debug; joinparts1($glist) if ($glist); delete $gparts{$ghkey}; delete $ghash{$ghkey}; } } if (scalar(@tandlast)>5 && $tandlast[6] < $gfstart && $source !~ /tandem|inverted|hmmfs|scan/) { ## print STDERR "Dump tand before $gsym/$gfstart: ",join(',',@tandlast),"\n" if $debug; &printpart( @tandlast ); @tandlast= undef; } if (scalar(@invlast)>5 && $invlast[6] < $gfstart && $source !~ /tandem|inverted|hmmfs|scan/ ) { ## print STDERR "Dump inv before $gsym/$gfstart: ",join(',',@invlast),"\n" if $debug; &printpart( @invlast ); @invlast= undef; } if (scalar(@rfamlast)>5 && $rfamlast[6] < $gfstart && $source !~ /tandem|inverted|hmmfs|scan/ ) { ## print STDERR "Dump rfam before $gsym/$gfstart: ",join(',',@rfamlast),"\n" if $debug; &printpart( @rfamlast ); @rfamlast= undef; } if ($source =~ /tRNA/) { ## has CDS/intron/exon parts if ($feat =~ /Sequence/i) { $feat= 'tRNA'; } elsif ($feat =~ /intron|exon/) { return; } ##? keep only Sequence variant here? } # this caching of repeats leads to out-of-sort ordering - need to fix or resort table elsif ( $source =~ /tandem/ && $feat =~ /repeat|misc_feature/) { ## jul01 tandem repeat ; was tandem misc_feature $feat= 'tandem_repeat'; ## 'repeat_tandem'; ## ??need to condense overlapping items to repeat_tandem_region @tand= ($feat,$gsym,$range,$compl,$dbxref,$note,$gfend); if (scalar(@tandlast)<6) { @tandlast= @tand; } elsif ($tandlast[6]+500 < $gfstart) { &printpart( @tandlast); @tandlast= @tand; } else { ## $tandlast[0]= 'repeat_tandem_region'; $tandlast[6]= $tand[6]; $tandlast[2] =~ s/\.\.\d+/..$tand[6]/; } return; } elsif ($source =~ /inverted/ && $feat =~ /repeat|misc_feature/) { ## jul01 inverted repeat; was inverted_misc_feature $feat= 'inverted_repeat'; ## 'repeat_inverted'; ## ??need to condense overlapping items to repeat_inverted_region ## @inv= ($feat,$gsym,$range,$compl,$dbxref,$note,$gfend); if (scalar(@invlast)<6) { @invlast= ($feat,$gsym,$range,$compl,$dbxref,$note,$gfend); } elsif ($invlast[6]+500 < $gfstart) { ## dup of above?! &printpart( @invlast ); @invlast= ($feat,$gsym,$range,$compl,$dbxref,$note,$gfend); } else { ## $invlast[0]= 'repeat_inverted_region'; $invlast[6]= $gfend; $invlast[2] =~ s/\.\.\d+/..$gfend/; ## print STDERR "Inv region extended to $gfend: ",join(',',@invlast),"\n" if $debug; } return; } elsif ($source =~ /hmmfs|scan/ && $feat =~ /similarity/) { ## hmmfs_similarity $note= $source.'_'.$feat; $feat= 'repeat_family'; $dbxref= $note; $note= '-'; ## ??need to condense overlapping items to repeat_family_region @rfam= ($feat,$gsym,$range,$compl,$dbxref,$note,$gfend); if (scalar(@rfamlast)<6) { @rfamlast= @rfam; } elsif ($rfamlast[6]+500 < $gfstart) { &printpart( @rfamlast ); @rfamlast= @rfam; } else { ## $rfamlast[0]= 'repeat_family_region'; $rfamlast[6]= $rfam[6]; $rfamlast[2] =~ s/\.\.\d+/..$rfam[6]/; } return; } elsif ($feat =~ /similarity/) { $feat= $source.'_'.$feat; } elsif ($source =~ /Transposon/) { $feat= $source.'_'.$feat; } elsif ($feat eq 'structural') { $feat= $source; } # PCR_product so far - others? elsif ($feat =~ /experimental/) { $feat= $source; } # jul01 elsif ($feat eq 'ALLELE') { $feat= 'allele'; } # standard name is lc elsif ($feat eq 'OLIGO') { $feat= 'oligo'; } #? elsif ($feat =~ /^d+/) { return; } #? what are these odd '411144' features (1 per csome?) # cDNA_for_RNAi yk201d11 - 343219..348348 - - - # RNAi - - 343219..348348 - - RNAi "SA:yk201d11" #?? skip cDNA_for_RNAi if ($feat =~ /^Sequence/i) { return if ($gsym =~ m/CHROMOSOME/i) ; # return if ($gsym =~ m/LINK/) ; ## ? SUPERLINK_RWXL & LINK_B0272 =~ fly scaffolds ? if ($gsym =~ m/LINK/) { $feat= lc($feat); ## keep these or not? } else { $feat= 'gene'; $ghash{$gsym}= $gfend; } } elsif ($feat =~ /CDS|intron|exon/) { $ftab= "$feat\t$gsym\t$range\t$compl\n"; $gparts{$gsym} .= $ftab; return; } printpart($feat,$gsym,$range,$compl,$dbxref,$note); } sub join2 { my $a= shift; my $b= shift; return "$a,$b" if ($a && $b); return $b if ($b); return $a; } sub joinparts1 { my $tnlist= shift; @join1flist= split(/\n/,$tnlist); my ($ft, $feat,$gsym,$range,$compl, $ref); ## $feat == intron, exon and CDS here - drop intron my %feats= (); foreach $ft (@join1flist) { ## ($feat,$gsym,$range,$compl)= @{$ref}; ($feat,$gsym,$range,$compl)= split(/\t/,$ft); ## print STDERR "join: $feat,$gsym,$range,$compl\n" if ($debug && $debugnj++ < 20); next if ($feat eq '' || $feat =~ /intron/); $feat =~ s/exon/mRNA/; $feats{$feat}= join2( $feats{$feat}, $range); } foreach $ft (sort keys %feats) { $range= $feats{$ft}; $range= 'join('.$range.')' if ($range =~ /,/); printpart( $ft, $gsym, $range, $compl); } } sub joinparts0 { my ($feat,$gsym,$range,$compl, $ref); ## $feat == intron, exon and CDS here - drop intron my %feats= (); while ($ref= shift @_) { ($feat,$gsym,$range,$compl)= @{$ref}; warn "join: $feat,$gsym,$range,$compl\n" if ($debug && $debugnj++ < 20); ## ^ got nulls here ! next if ($feat eq '' || $feat =~ /intron/); $feat =~ s/exon/mRNA/; $feats{$feat}= join2( $feats{$feat}, $range); } foreach my $ft (sort keys %feats) { $range= $feats{$ft}; $range= 'join('.$range.')' if ($range =~ /,/); printpart( $ft, $gsym, $range, $compl); } } sub printpart { my ($feat,$gsym,$range,$compl,$dbxref,$note)= @_; # my ($ref) = @_; # my ($feat,$gsym,$range,$compl)= @$ref; $range= 'complement('.$range.')' if ($compl eq '-'); my $id= '-'; unless ($gsym eq '-') { # $id= $sym2id{ symcase($gsym) } || '-'; # $gsym= $idsym{$id} || $gsym; # make sure is proper symbol $id= flybase::sym2id::symbol2id($gsym) || '-'; $gsym= flybase::sym2id::id2symbol($id) || $gsym; # make sure is proper symbol } print $OUTH "$feat\t$gsym\t-\t$range\t$id\t$dbxref\t$note\n"; } sub symcase { my($sym)= shift; ## return uc($sym); ## if data doesn't keep proper case! return $sym; } sub printGFF { my($gf,$file)= @_; my $groupval= $gf->dump_group(); my $string = sprintf("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s", #%s $gf->seqname(), ## this is always CHROMOSOME_z - change to $groupval name $gf->source(), $gf->feature(), $gf->start()+0, $gf->end()+0, $gf->score(), # version 2 GFF can have '.' $gf->strand(), $gf->frame(), $groupval ); print $file $string; print $file "\n"; } __END__ ## from genomefeat.pl - move to some package/pm #sub getSym2Id { # my ($org, $acode, $domarker)= @_; # %sym2id= (); # %idsym= (); # %idsize= (); # local(*ALIB); # open(ALIB,$acode); # $/= "# EOR\n"; # my $insyn; # while () { # my($id,$sym,$syn)= ('','',''); # $id= $1 if (/\nID\|(.+)/); # $id= 'MEOW:'.$id if ($isMeowId); # # $sym= $1 if (/\nSYM\|(.+)/); # next unless ($sym && $id); ## if ($domarker) { ## $idsize{$id}= isgoodmarker($_); ## } # $sym2id{symcase($sym)}= $id; # $idsym{$id}= $sym; # # if (/\nSYN\|(.+)/) { # $syn= $1; # $sym2id{symcase($syn)}= $id; # my ($at, $e, $x); ## get any more SYN|s ... # $at= index($_,"\nSYN\|"); $e= $at+5; $at= -1; # while (($x= index($_,"\n", $e)) > 0) { # if (substr($_,$x+1,1) eq '|') { $e = $x+1; $at= $e if ($at<0); } # else { $e= $x; last; } # } # if ($at>=0 && $e>$at) { # my $s= substr($_, $at, $e-$at); # foreach $s (split(/\n/, $s)) { # $s =~ s/^\w*\|//; # $sym2id{symcase($s)}= $id; # } # } # } # # if ($org eq 'worm') { # ##?? and add DBL|WP:B0379.7 since some worm PRED gene syms don't match GB sym - ACEPRED:B0379.7a # if (/\nDBL\|WP:(.+)/) { # $sym2id{symcase($1)}= $id; # } # } # # } # $/= "\n"; # close(ALIB); # return %sym2id; ## ?? also return %idsym #} ######## ## do some G/chi stats here - can't seem to find stats package that can do? ## -- do Chi instead - ? need multi feature Chi test? C = chromosome F = feature count G = Sum( freq * ln(freq) ) df = No. levels or values per class ? adjust counts by # bases/csome? - treat as yes/no items per base position? e.g., has_trna vs. no_trna ? sub Gtest { my($tabref)= @_; my %t= % $tabref; foreach my $tc (sort keys %t) { my $tv= $t{$tc}; if (ref($tv) =~ /HASH/) { # $tsum{$tc} = logsumref($tv, $tc); } } sub logsumref { my($tabref, $tabkey)= @_; my %t= % $tabref; my $tsum= 0; foreach my $tc (sort keys %t) { my $tv= $t{$tc}; if (ref($tv) =~ /HASH/) { $tsum{$tabkey}{$tc} = logsumref($tv,$tc); } else { # $tsum += $tv; $tsum{$tabkey} += $tv * log($tv); } } # return $tsum; } Feature counts differ over all csomes (true) Chromosomes differ in any feature counts Which csomes differ from others in which feature counts? X differs from autosomes (which features)? Hypothesis df G C x F C x F[1] C x F[2] ... C[x]F[1] vs. C[others]F[1] ??? ... @n= ( 1, 1.234567, 123456.7890, -1.2345, -123445.67890); foreach (@n) { printf " [%8.1f]",$_; printf " [%-8.1f]",$_; printf " [%8.1g]",$_; printf " [%0.1f]",$_; print "\n"; } exit; ### # GFF input list -- fix to join gene parts gene R04A9 - 93008..145953 - - ## one gene -> one CDS, one mRNA (exon/intron...) gene R04A9.2 - 93592..97989 MEOW:CEgn0014353 - - CDS R04A9.2 - 93592..93850 MEOW:CEgn0014353 - - exon R04A9.2 - 93592..93850 MEOW:CEgn0014353 - - intron R04A9.2 - 93851..93896 MEOW:CEgn0014353 - - exon R04A9.2 - 93897..94074 MEOW:CEgn0014353 - - CDS R04A9.2 - 93897..94074 MEOW:CEgn0014353 - - intron R04A9.2 - 94075..94123 MEOW:CEgn0014353 - - CDS R04A9.2 - 94124..94403 MEOW:CEgn0014353 - - exon R04A9.2 - 94124..94403 MEOW:CEgn0014353 - - intron R04A9.2 - 94404..94751 MEOW:CEgn0014353 - - CDS R04A9.2 - 94752..95362 MEOW:CEgn0014353 - - exon R04A9.2 - 94752..95362 MEOW:CEgn0014353 - - intron R04A9.2 - 95363..95414 MEOW:CEgn0014353 - - exon R04A9.2 - 95415..95625 MEOW:CEgn0014353 - - CDS R04A9.2 - 95415..95625 MEOW:CEgn0014353 - - intron R04A9.2 - 95626..95833 MEOW:CEgn0014353 - - CDS R04A9.2 - 95834..96162 MEOW:CEgn0014353 - - exon R04A9.2 - 95834..96162 MEOW:CEgn0014353 - - intron R04A9.2 - 96163..96209 MEOW:CEgn0014353 - - CDS R04A9.2 - 96210..96403 MEOW:CEgn0014353 - - exon R04A9.2 - 96210..96403 MEOW:CEgn0014353 - - intron R04A9.2 - 96404..96727 MEOW:CEgn0014353 - - CDS R04A9.2 - 96728..97133 MEOW:CEgn0014353 - - exon R04A9.2 - 96728..97133 MEOW:CEgn0014353 - - intron R04A9.2 - 97134..97179 MEOW:CEgn0014353 - - CDS R04A9.2 - 97180..97531 MEOW:CEgn0014353 - - exon R04A9.2 - 97180..97531 MEOW:CEgn0014353 - - intron R04A9.2 - 97532..97587 MEOW:CEgn0014353 - - CDS R04A9.2 - 97588..97778 MEOW:CEgn0014353 - - exon R04A9.2 - 97588..97778 MEOW:CEgn0014353 - - intron R04A9.2 - 97779..97826 MEOW:CEgn0014353 - - exon R04A9.2 - 97827..97989 MEOW:CEgn0014353 - - CDS R04A9.2 - 97827..97989 MEOW:CEgn0014353 - - ## .. gene R04A9.3 - complement(98728..101906) MEOW:CEgn0014354 - - CDS R04A9.3 - complement(98728..98783) MEOW:CEgn0014354 ## perl -e 'while(<>){@f=split(/\t/); $k{$1}++ if ($f[8]=~/^(\w+)\s+/);}\ foreach (sort keys %k){print "$_\t$k{$_}\n";}' CHROMOSOME_X.gff Allele 114 >> insertion Allele "pkP5006" ; Insert "Tc1" Clone 710 Confirmed_by_EST 29 > curated intron Sequence "F13C5.2" ; Confirmed_by_EST Confirmed_by_cDNA 1 Confirmed_in_UTR 26 Note 11132 Sequence 60575 Target 440687 >> BLASTX Target "Sequence:G42E22" 28113 28331 ## for worm/acode, sstrome - look at tRNA kinds per csome perl -e ' $/="\n# EOR"; \ while(<>){ if(/\nFNC\|(tRNA\-\w+)/){$t= $1; \ if(/\nCHR\|(\w+)/){$c= $1; $h{$c}++; $h{"$c,$t"}++;}} else {$h{"unlocated,$t"}++;} }\ foreach (sort keys %h) { print "$_\t$h{$_}\n"; }' acode ###### kalo% wc c*gff 91135 992580 6778691 csome_I.gff 95938 1038845 7171225 csome_II.gff 85902 943874 6612285 csome_III.gff 86786 935123 6484390 csome_IV.gff 124483 1335074 9160464 csome_V.gff 74809 780435 5345909 csome_X.gff 559053 6025931 41552964 total kalo% head -5 c*gff ==> csome_I.gff <== ##gff-version 2 ##date 2000-03-01 ##sequence-region CHROMOSOME_I 1 14435323 CHROMOSOME_I Sequence 1 2036 . + . Sequence "cTel33B" CHROMOSOME_I Sequence 1 14435323 . + . Sequence "CHROMOSOME_I" ==> csome_II.gff <== ##gff-version 2 ##date 2000-03-01 ##sequence-region CHROMOSOME_II 1 15205541 CHROMOSOME_II Sequence 1 15205541 . + . Sequence "CHROMOSOME_II" CHROMOSOME_II curated Sequence 1711 4390 . + . Sequence "2L52.1" ==> csome_III.gff <== ##gff-version 2 ##date 2000-03-01 ##sequence-region CHROMOSOME_III 1 13093964 CHROMOSOME_III Sequence 1 13093964 . + . Sequence "CHROMOSOME_III" CHROMOSOME_III Sequence 3907 35782 . + . Sequence "H10E21" ==> csome_IV.gff <== ##gff-version 2 ##date 2000-03-01 ##sequence-region CHROMOSOME_IV 1 17047437 CHROMOSOME_IV Sequence 1 1243 . + . Sequence "cTel4X" ==> csome_V.gff <== ##gff-version 2 ##date 2000-03-01 ##sequence-region CHROMOSOME_V 1 21085422 CHROMOSOME_V Sequence 1 4957 . + . Sequence "cTel3X" ==> csome_X.gff <== ##gff-version 2 ##date 2000-03-01 ##sequence-region CHROMOSOME_X 1 17408926 CHROMOSOME_X Sequence 1 2649 . + . Sequence "CTEL7X" =========== jul01 check of .gff features set epat="'assembly_tag|BLAST|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch'" set epat="'BLAST|assembly_tag|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch'" ## gzegrep -v $epat is missing 1st patt ! set epat="'qrxrqx|BLAST|assembly_tag|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch'" set ws=/c7/eugenes/worm/current_release/CHROMOSOMES/ gzegrep -v $epat $ws/CHROMOSOME_V.gff.gz |\ perl -e 'while (<>){next unless(/\t/); @f=split(/\t/); $a{"$f[1].$f[2]"}++;} foreach(sort keys %a){print "$_\t$a{$_}\n"; }' oat% echo $epat 'qrxrqx|BLAST|assembly_tag|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch' gzegrep -v $epat $ws/CHROMOSOME_V.gff.gz |\ perl -e 'while (<>){next unless(/\t/); @f=split(/\t/); $a{"$f[1].$f[2]"}++;} \ foreach(sort keys %a){print "$_\t$a{$_}\n"; }' .ALLELE 44 .OLIGO 9296 .Sequence 2 .insertion 80 .intron 175 .misc_feature 37 .misc_signal 5 .misc_structure 1 .repeat_region 39 .variation 3 GenePair_STS.structural 4748 Genomic_canonical.Sequence 730 Link.Sequence 4 NDB_CDS.CDS 860 NDB_CDS.Sequence 124 NDB_CDS.exon 1004 NDB_CDS.intron 880 ^^^ ? what are these? PSsearch.similarity 164 Pseudogene.CDS 3 Pseudogene.Sequence 197 Pseudogene.exon 456 Pseudogene.intron 259 RNAi.experimental 453 SL1.trans-splice_acceptor 21 SL2.trans-splice_acceptor 16 Transposon.repeat 2 cDNA_for_RNAi.experimental 451 curated.CDS 27440 curated.Sequence 4854 curated.exon 27450 curated.intron 22596 hmmfs.3.similarity 13120 inverted.repeat 17681 mRNA.transcription 34 mRNA_GENOME.similarity 2021 misc_feature.miscellaneous 7 operon.transcription 3 possible_error.miscellaneous 3 provisional.CDS 250 provisional.Sequence 42 provisional.exon 250 provisional.intron 208 scan.similarity 707 tRNAscan-SE-1.11.Sequence 153 tRNAscan-SE-1.11.exon 160 tRNAscan-SE-1.11.intron 7 tandem.repeat 11823 === dec01 check === oat% echo $epat qrxrqx|assembly_tag|BLAST|BLAT_|BLATX_|WABA_|EST_GENOME|Queryprosite|OLIGO_PAIR|Clone_|VISIBLE|PSsearch oat% gzegrep -v $epat $ws/CHROMOSOME_V.gff.gz |\ perl -e 'while (<>){next unless(/\t/); @f=split(/\t/); $a{"$f[1].$f[2]"}++;} \ foreach(sort keys %a){print "$_\t$a{$_}\n"; }' .ALLELE 336 .Feature_data 422 .Homol_data 707 .OLIGO 9224 .PCR_product 79 .Sequence 1 .deletion 139 .insertion 54 .intron 705 .misc_feature 76 .misc_signal 4 .misc_structure 1 .repeat_region 39 .variation 1332 Expr_profile.Expression 4394 GenePair_STS.structural 5037 Genomic_canonical.Sequence 729 Link.Sequence 3 NDB_CDS.CDS 649 NDB_CDS.Sequence 99 NDB_CDS.exon 697 NDB_CDS.intron 598 Pseudogene.CDS 3 Pseudogene.Sequence 198 Pseudogene.exon 461 Pseudogene.intron 263 RNA.Sequence 7 RNA.exon 7 RNAi.experimental 448 SL1.trans-splice_acceptor 22 SL2.trans-splice_acceptor 16 Transposon.repeat 5 WTP.partial_gene 1833 cDNA_for_RNAi.experimental 445 curated.CDS 27985 curated.Sequence 4948 curated.exon 27985 curated.intron 23037 history.Sequence 29 history.exon 52 history.intron 42 hmmfs.3.similarity 6929 inverted.repeat 17000 misc_feature.miscellaneous 7 operon.transcription 3 possible_error.miscellaneous 9 scan.similarity 299 tRNAscan-SE-1.11.Sequence 153 tRNAscan-SE-1.11.exon 160 tRNAscan-SE-1.11.intron 7 tandem.repeat 10560 === check features*tsv kinds cat worm/features-I.tsv |\ perl -e 'while(<>){next unless(/^\w/);($f,$x)=split(/\t/);$h{$f}++;}foreach(sort keys %h){print "$_\t$h{$_}\n";}' jul01 ----------- : 2nd run chr I 135550 1 CDS 3040 EMBL_mRNA_similarity 1 RNAi 2628 Transposon_repeat 2 allele 24 cDNA_for_RNAi 360 gene 3550 insertion 44 inverted_repeat 5486 mRNA 3079 mRNA_GENOME_similarity 2450 misc_feature 66 misc_signal 1 miscellaneous 4 oligo 5359 polyA_site 2 repeat_family 3775 repeat_region 20 repeat_unit 8 sequence 5 source 1 structural 2681 << all are PCR_product tRNA 71 tandem_repeat 2802 trans-splice_acceptor 39 variation 1 -------- chr II 111217 1 CDS 3551 RNAi 353 allele 23 cDNA_for_RNAi 351 gene 4115 insertion 90 inverted_repeat 4472 mRNA 3611 mRNA_GENOME_similarity 1380 misc_feature 35 misc_signal 2 misc_structure 1 miscellaneous 14 oligo 6562 repeat_family 3185 repeat_region 67 sequence 2 source 1 structural 3322 tRNA 64 tandem_repeat 2633 trans-splice_acceptor 22 variation 7 --------- chr III CDS 2809 RNAi 764 Transposon_repeat 5 allele 34 cDNA_for_RNAi 372 gene 3244 insertion 39 inverted_repeat 4903 mRNA 2837 mRNA_GENOME_similarity 2119 misc_feature 69 misc_signal 3 miscellaneous 12 oligo 4696 polyA_site 2 repeat_family 3662 repeat_region 16 repeat_unit 3 sequence 5 source 1 structural 2425 tRNA 71 tandem_repeat 2420 trans-splice_acceptor 33 transcription 8 variation 1 ---------- chr IV 112132 1 CDS 3344 RNAi 403 Transposon_repeat 4 allele 26 cDNA_for_RNAi 401 gene 3928 insertion 39 inverted_repeat 5544 mRNA 3421 mRNA_GENOME_similarity 2063 misc_feature 43 misc_signal 8 miscellaneous 2 oligo 5812 repeat_family 2957 repeat_region 32 sequence 4 source 1 structural 2963 tRNA 83 tandem_repeat 2246 trans-splice_acceptor 23 ----- chr V CDS 5002 RNAi 451 Transposon_repeat 2 allele 44 cDNA_for_RNAi 449 gene 5918 insertion 80 inverted_repeat 5946 mRNA 5197 mRNA_GENOME_similarity 2021 misc_feature 37 misc_signal 5 misc_structure 1 miscellaneous 10 oligo 9296 repeat_family 3999 repeat_region 39 sequence 3 source 1 structural 4718 tRNA 153 tandem_repeat 3001 trans-splice_acceptor 37 transcription 37 variation 3 ------- chr X 411144 1 CDS 2895 RNAi 428 Transposon_repeat 8 allele 45 cDNA_for_RNAi 425 gene 3555 insertion 116 inverted_repeat 3029 mRNA 2921 mRNA_GENOME_similarity 2676 misc_feature 20 misc_signal 2 miscellaneous 6 oligo 5282 polyA_signal 1 polyA_site 5 repeat_family 2208 repeat_region 21 repeat_unit 1 sequence 4 source 1 structural 2616 tRNA 305 tandem_repeat 1204 trans-splice_acceptor 17 variation 4 jul01 ----------- : 135550 1 << which is this? ALLELE 24 <