#!/usr/local/bin/perl # parsemoen.pl =head NOTES parse-mosquito-ensembl.pl for gnomap features.tsv, postprocess as | sort -k 1,1 -k 2,2n | perl -e 'while(<>){next unless(/^\w/);\ ($c,$b,$r)=split "\t",$_,3; \ unless($c eq $lc) { close(F); open(F,">features-$c.tsv"); $lc=$c; }\ print F $r; } close(F); ' # mosquito/features-2L.tsv # Features for mosquito from Ensembl Anopheles gambiae data [Anopheles_gambiae.0.gb, 11-June-2002] # gnomap-version 1 # # tab-separated-values # Feature gene map range id db_xref notes =cut $debug= 0; use euGenes::GenomeFeatures; use Getopt::Long; my $sourceName= 'Ensembl Anopheles gambiae data'; my $org= 'mosquito'; my $workpath= '/c7/eugenes/genomew/'; my $datapath= '/c7/eugenes/mosquito/ensembl/'; my $sourcePath= $datapath.'Anopheles_gambiae.0.gb.gz'; my $geneprot= $datapath.'anopheles_gambiae_lite_6_1/gene_prot.txt.table.gz'; my $url= 'ftp://ftp.ensembl.org/pub/current_mosquito/data/flatfiles/genbank/Anopheles_gambiae.0.dat.gz'; # my $outpath= '/c7/eugenes/genomew/mosquito/'; #? or rely on package to set my $outpath= $workpath.$org; my $csome= 'all'; # bogus GetOptions( 'debug!' => \$debug, 'data=s' => \$sourcePath, 'outpath=s' => \$outpath, 'features!' => \$wantfeat, 'dna!' => \$wantdna, 'protein!' => \$wantprot, 'acode!' => \$wantacode, 'splitfeats!' => \$dosplitfeats, ); $usage= < $debug, 'data=s' => $sourcePath, 'outpath=s' => $outpath, 'features!' => $wantfeat, 'dna!' => $wantdna, 'protein!' => $wantprot, 'acode!' => $wantacode, 'splitfeats!' => $dosplitfeats, USAGE die $usage unless( $dosplitfeats || $wantacode || $wantfeat || $wantdna || $wantprot); my $enspars= euGenes::EnsemGenbankFeatures->new( org => $org, workpath => $workpath, chromosome => $csome, # bogus doChrOffset => 1, # for $source files specifically sourceName => $sourceName, skipdbx => { Celera_Pep => 1, Celera_Trans => 1 }, skipft => { exon => 1 , misc => 1 }, # makeft 'gene' from CDS !? wantfeat => $wantfeat, wantdna => $wantdna, wantprot => $wantprot, ); ## or $ens->init( ... ); if ($wantfeat || $wantdna || $wantprot || $wantacode) { $gpsyms= load_gene_prot_table($geneprot); $enspars->setProtNames($gpsyms); } if ($wantfeat || $wantdna || $wantprot) { $enspars->readFeatures($sourcePath,$outpath); # put these in params ? } # $featfile= "$enspars->{outpath}/features-$csome.tsv"; if ($dosplitfeats) { splitfeats($enspars->{outpath},"features-$csome.tsv"); } # quick and dirty - use Meow/Mosquito.pm for real acode if ($wantacode) { feat2acode( $enspars->{outpath}, 'features-[2,3,X]*.tsv', $gpsyms ); } exit; #---------- sub feat2acode { my $path= shift; my $feats= shift; my $rsyms= shift; my %psyms= %$rsyms; opendir(D,$path); my @ff= grep( /features-[23X].*.tsv$/, readdir(D)); closedir(D); open(O,">$path/acode") or die "$path/acode"; foreach $ff (sort @ff) { $chr = $1 if ($ff =~ m,features-(\w+),); warn "feat2acode $path/$ff\n" if $debug; open(IN, "egrep '^(CDS|source|#)' $path/$ff|") or die $feats; while() { # my @t= split "\t"; if (/^# Features for/) { $date= $1 if (/(\S+)\]/); next; } elsif (/^#/) { next; } my ($class,$sym,$map,$range,$ids,$dbx)= split "\t"; if ($class eq 'source') { $chr= $map; $chr =~ s/Chr//i; } elsif ($class eq 'CDS') { my $pid= $1 if ($dbx =~ /(ENSANGP0\d+)/); my($psym,$ipro,$pname)= @{ $psyms{$pid} }; my $did= 'Ensembl:'.$1 if ($dbx =~ /(ENSANGG0\d+)/); my ($start, $stop)= euGenes::GenomeFeatures::maxrange('self',$range); my $maploc= "$start..$stop"; $maploc= 'complement('.$maploc.')' if ($range =~ /complement/); my $namepro= ''; chomp($pname); $namepro = "PDOM|INTERPRO:$ipro" if ($ipro); $namepro .= " == $pname" if ($pname); $ids =~ s/MEOW://; print O <) { if (/^#/) { $head .= $_; } # skip !? else { last; } } close(IN); open(IN, "sort -k 1,1 -k 2,2n $path/$feats|") or die $feats; while() { # if (/^#/) { $head .= $_; } # skip !? if (/^\w/) { my($c,$b,$r)= split "\t",$_,3; unless($c eq $lc) { close(F); open(F,">$path/features-$c.tsv"); $lc=$c; push(@cs,$c); } next if (/CRA_x9P1GAV5DB8/); # a few bad data print F $r; if ($r =~ m/^source/) { my @t= split "\t", $r; $t[3] =~ m/(\d+)\.\.(\d+)/; my($b,$e)= ($1,$2); $chr{$c}= $e if ($e>$chr{$c}); } } } close(F); close(IN); foreach my $c (@cs) { my $fn= "features-$c.tsv"; my $f= $path.$fn; my $o= $f.'.tmp'; unless(open(F,$f) && open(O,">$o")) { die "$f or $o"; } my $head1= $head; $head1 =~ s,/c7/eugenes/mosquito/,,; $head1 =~ s,$feats,$fn,; # s,$feats,$f, print O $head1; print O "\n"; print O join("\t","source","mosquito","Chr".$c,"1..".$chr{$c}),"\n"; while () { next if (/^source/); # also may need to patch ID field: AGgn0 > MEOW:AGgn0 and # for stupid gnomap2 bug, add extra \t before \n -- or is it some other stupid bug if (m/\t(AGgn0\d+)/) { my $aid= $1; if (m/\t(AGgn0\d+)\n/) { $aid .= "\t"; } s/\t(AGgn0\d+)/\tMEOW:$aid/; } print O $_; } close(F); close(O); rename ($o,$f); } } sub load_gene_prot_table { my $file= shift; my $ok; if ($file =~ /\.(gz|Z)$/) { $ok= (open(FT,"gzcat $file|")); } else { $ok= (open(FT,$file)); } return undef unless ($ok); my %psyms= (); while() { my @v= split "\t"; $pid= $v[2]; $psym= $v[11]; next if ($psym eq '\N'); $ipro= $v[10]; $pname= $v[12]; # use for acode $psyms{$pid}= [ $psym, $ipro, $pname ]; } close(FT); return \%psyms; } #12754 15243 ENSANGP00000015243 \N PS01200 PF01167 \N Seg #\N \N IPR000007 Tubby Tubby #12754 15243 ENSANGP00000015243 \N PS01201 PF01167 \N Seg #\N \N IPR000007 Tubby Tubby #CREATE TABLE gene_prot ( # gene_id int(10) unsigned NOT NULL default '0', # translation_id int(10) unsigned NOT NULL default '0', # translation_stable_id char(40) NOT NULL default '', # prints char(40) default NULL, # prosite char(40) default NULL, # pfam char(40) default NULL, # coil char(40) default NULL, # low_complexity char(40) default NULL, # signal_peptide char(40) default NULL, # transmembrane char(40) default NULL, # interpro char(40) default '', # short_description char(40) default '', # description char(255) default NULL, # KEY gene_id (gene_id), # KEY prosite (prosite,gene_id), # KEY prints (prints,gene_id), # KEY pfam (pfam,gene_id), # KEY interpro (interpro,gene_id), # KEY gene_id_2 (gene_id) #) TYPE=MyISAM;