#!/usr/local/bin/perl # parseweedxml.pl =head1 DESCRIPTION parseweedxml.pl - parse tigr weed genome annotations data at ftp://ftp.tigr.org/pub/data/a_thaliana/ath1/PUBLICATION_RELEASE/ notes: features.tsv not yet sorted by location - add all useful data frm annot. xml to .tsv, then copy to weed.acode? - ? use revised gnomap features.tsv? with top-line == field names =cut $debug= 1; use lib('/bio/work/meow/'); use POSIX; use Getopt::Std; use XML::Parser; ## uses expat.c use Meow::Data; use Meow::Weed; # use flybase::sym2id; $org= 'weed'; $sourceName= 'TIGR a_thaliana/ath1/'; $meowpub= '/bio/meow-pub/meow/server/'; #'/c3/iubio/meow-pub/meow/server/'; $genomeshome= '/c7/eugenes/genomew/'; # '/c5/tmp/genomew/'; $tairpath= '/c7/eugenes/weed/tair/'; # oat.bio $tigrpath= '/c7/eugenes/weed/tigr/'; # oat.bio $outpath = '/c7/eugenes/weed/tigr/local/'; #test $append= 1; $outname= 'features'; # $chr.tsv $weedids= 'ATgn'; #? $tigrpath$name or $meowpub/.etc/jdata/fbobs/$name ? $tairGeneLocTable= $tairpath.'Gene_orf.*'; $infile= 'T9E19.xml'; ## test $chr= '0'; # 66414454 May 10 01:55 CHR5v03172001.xml == all Chr 5 features # goes w/ Chr5 dna in 26718118 May 10 22:42 ATH1_pseudo_5.1con $gnomapvers= '1'; $kMissingValue= -999999999; $orgacode= "$meowpub/$org/acode"; $isMeowId= 1; ## for getSym2Id - acode data @chrs= qw( 1 2 3 4 5 ); $nchr= scalar(@chrs); %opt=(); Getopt::Std::getopts('afio:p:sxW:DA:',\%opt); $debug= 1 if $opt{D}; $useinlist= 1 if $opt{i}; $append= 0 if $opt{a}; $mework= $opt{W} || $genomeshome; ##!!!! test, fix $dosort= 1 if $opt{s}; # { } { } $dorawdna= 1 if $opt{f}; # $outpath= "$mework/$org/"; $outpath= $opt{p} || $outpath; $outname= $opt{o} || $outname; unless ($useinlist || scalar(@ARGV)) { print "parseweedxml [-a] [-o output] [-Aacode] xml-paths-or-files\n"; print " converts $sourceName XML annotations to gnomap tables\n"; print " Usage: \n -D == debug\n -a == DONT append output\n"; print " -i == read xml file list from stdin\n"; print " -o outname == [$outname]\n -p /output/path == [$outpath]\n"; print " -s == sort input features.tsv by location\n"; print " -f == convert input csome fasta seq to dna.raw \n"; # print " -A acode == $org.acode file\n \n"; exit; } # flybase::sym2id::set4meow($isMeowId); # flybase::sym2id::readSym2Id($orgacode, $org) if (-f $orgacode); # ?? use tair Gene_orf.032101 for ~ 1000 matches b/n LOCUS/publocus sym and gene sym ?? unless($dosort||$dorawdna) { readTairGenes($tairGeneLocTable); $weedobj= Meow::Weed->new(); # to get id $weedobj->openIdDb($weedids, 'w'); #? should be 'w' but need to create } if ($useinlist) { while ($infile=<>) { chomp($infile); die "cant read $infile" unless(-r $infile); if ($dosort) { sortTable($infile); } elsif ($dorawdna) { fasta2raw($infile); } else { parseNprint( $infile); } } } else { foreach $infile (@ARGV) { die "cant read $infile" unless(-r $infile); if ($dosort) { sortTable($infile); } elsif ($dorawdna) { fasta2raw($infile); } else { parseNprint( $infile); } } } $weedobj->closeIdDb() unless($dosort||$dorawdna); exit; #------- sub fasta2raw { my ($fasta, $chr)= @_; if ($fasta =~ /\.(gz|Z)$/) { $ok= (open(FT,"gzcat $fasta|")); } else { $ok= (open(FT,$fasta)); } unless($ok) { warn "Cant read $fasta"; return; } unless($chr) { # ATH1_pseudo_4.1con $chr= $1 if ($fasta =~ m/ATH1_pseudo_(\d)/); } my $rawfile= "dna-$chr.raw"; unless(open(OT,">$rawfile")) { warn "Cant write $rawfile"; return; } my ($nline, $nb); while() { $nline++; if (/^>/) { warn "extra > $_" if $nline>1; next; } chomp(); print OT $_; $nb += length($_); } close(FT); close(OT); warn "wrote $nb bases ($nline lines) to $rawfile from $fasta\n"; } sub sortTable { my ($feattab)= @_; #feat sym map range id dbx notes ... my @fts= (); unless(open(FT,$feattab)) { warn "Cant read $feattab"; return; } unless(open(OT,">$feattab.sort")) { warn "Cant write $feattab.sort"; return; } $OUTH= *OT; while() { unless(/^\w/) { print OT $_; next; } my @cols= split(/\t/); # do here or in _by_ ? push(@fts,\@cols); } close(FT); my @sfts= sort _by_startAndGroup @fts; while ($gf = shift(@sfts)) { @ft= @ $gf; print OT join("\t",@ft); # printpart( @ft ); } close(OT); } sub _by_startAndGroup { my ($arange, $astart, $afeat, $agroup); $arange= $a->[3]; $astart= $1 if ($arange =~ /^\D*(\d+)/); $afeat= $a->[0]; $agroup= $a->[1] || "xxx"; # ($afeat, $agroup, $amap, $arange)= split(/\t/, $a); # if ($afeat =~ /gene/) { $seqstart{$agroup}= $astart; $arange= 1; } # elsif ($afeat =~ /CDS|mRNA/) { $astart= $seqstart{$agroup} || $astart; $arange= 2; } # else { $arange= 3; } if ($afeat =~ /gene/) { $arange= 1; } elsif ($afeat =~ /CDS|mRNA/) { $arange= 2; } else { $arange= 3; } my ($brange, $bstart, $bfeat, $bgroup); $brange= $b->[3]; $bstart= $1 if ($brange =~ /^\D*(\d+)/); $bfeat= $b->[0]; $bgroup= $b->[1] || "xxx"; if ($bfeat =~ /gene/) { $brange= 1; } elsif ($bfeat =~ /CDS|mRNA/) { $brange= 2; } else { $brange= 3; } $byr= ( $astart <=> $bstart ); if ($byr == 0) { $byr= ( $agroup cmp $bgroup ); if ($byr == 0) { $byr = ($arange <=> $brange); } } return $byr; } sub readTairGenes { my ($genetab)= @_; my $fn= $genetab; unless(-r $fn) { $fn= `ls -1 $fn`; } # wild-card name? unless(open(TG,$fn)) { warn "Cant read $genetab"; return; } %loc2gsym= (); my($gsym, $atloc); while() { chomp(); ($gsym, $atloc)= split(/\t/); $atloc= uc($atloc); # make sure of case $loc2gsym{$atloc}= $gsym; } close(TG); } sub parseNprint { my ( $inxml ) = @_; %els= (); %assemblyvals= (); $p2 = new XML::Parser(Handlers => {Start => \&handle_start, End => \&handle_end, Char => \&handle_char, # Default => \&handle_default, } ); if ($inxml =~ /\.(gz|Z)$/) { if (open(XFILE,"gzcat $inxml|")) { $p2->parse(*XFILE); close(XFILE); } } else { $p2->parsefile($inxml); } close($OUTH) if ($OUTH); $OUTH= undef; # while ($gf = shift(@fts)) { # print as feat is collected ? # # print STDERR "sym: ".$gf->[1]."\t range: ".$gf->[2]."\n" if ($debug); # @ft= @ $gf; # printpart( @ft ); # } if (1) { print "Weed XML tags for $inxml\n"; foreach (sort keys %els) { print "$_\t$els{$_}\n"; } # print "Counts:\n N gene=$ngene\n N mrna=$nmrna\n N annot=$nannot\n N span=$nspan\n"; # print "Unknown values\n"; # foreach (sort keys %unknowns) { print "$_\t$unknowns{$_}\n"; } } } sub openout { my ($infile, $chr, $chrsize, $date) = @_; unless($OUTH) { my $fname= "$outname-$chr.tsv"; $fout= "${outpath}$fname"; # print "dumpFeatures $fout\n" if ($debug); my $ok; if ($append && -r $fout) { $ok= open(FOUT,">>$fout"); } else { $ok= open(FOUT,">$fout"); } unless ($ok) { warn "can't write $fout"; return -1; } $OUTH= *FOUT; $append= 1; # for remaining files of same csome print $OUTH &feattabheader("$org/$fname",$org,$infile,$date); print $OUTH "source\t$org\tChr $chr\t1..$chrsize\t-\n" if $chrsize; #? get from .xml } } sub pushFeat { my @ft= @_; # printpart wants: ($feat,$gsym,$range,$id,$dbxref,$note) if ($OUTH) { printpart( @ft ); } else { push( @fts, \@ft); } } sub pushGene { my ($tutype)= @_; ## ? COM_NAME only for RNAs ? || $vals{COM_NAME} ## save COM_NAME , PUB_COMMENT for acode genes ? ## $vals{GENE_SYM} not seen in CHR4/5 ## ? PUB_COMMENT my ($gsym, $notes, $egid, $dbxref, $aval, $publoc, $ispred, $idval, $public, $bacloc); ## Argh ! {PUB_LOCUS} is not a uniq id - 145 dups are found ! $vals{LOCUS} has more dups ## !? need to use TU-FEAT_NAME id? do as tigr chr.pep fasta: id=feat-id#bacloc#publoc ## Double Argh! different cases for PUB_LOCUS - need same case always - AT#g##### if (scalar(@ranges)<1) { warn "pushGene( $vals{PUB_LOCUS} ) - null range\n"; return; } my $range= makerange( \@ranges, 0); @ranges= (); if ($tutype eq 'mRNA' || $tutype eq 'gene') { # $gsym= $vals{GENE_SYM} || $vals{PUB_LOCUS} || $vals{LOCUS} ; #? which is best $publoc= $vals{PUB_LOCUS}; $publoc =~ s/At(\d)g/At$1g/i; # ! standardize case ! $bacloc= $vals{LOCUS}; if ($vals{GENE_SYM}) { $gsym= $vals{GENE_SYM}; } elsif ($publoc) { my $ucloc= uc($publoc); if ($loc2gsym{$ucloc}) { $gsym= $loc2gsym{$ucloc}; } else { $gsym= $publoc; $ispred= 1; } } else { $gsym= $bacloc; # this is BAC_Locus, always there $ispred= 1; } # !! using old weed data this LOCUS is prefered symbol over PUB_LOCUS (change mup24_50 to mup24.50 tho) # $notes .= 'BAC_Locus='.$vals{LOCUS}.' ; '; # BAC_Locus: mup24_50 $idval= $tufeatid.'#'.$bacloc.'#'.$publoc; # $idval= $gsym; # -- was, but NOT UNIQUE to a gene loc } if ($tutype eq 'mRNA') { # $gsym= $vals{GENE_SYM} || $vals{PUB_LOCUS} || $vals{LOCUS} ; if ($weedobj) { # my $cksum= $publoc; # or FEAT_NAME ? $weedobj->checksum($_); $egid= $weedobj->getId($idval, $datadate, $publoc); ## reuse same id/sym } } elsif ($tutype =~ /RNA/) { $gsym= $vals{COM_NAME}; $notes .= 'comment='.$vals{PUB_COMMENT}.' ; ' if ($vals{PUB_COMMENT}); } else { # if ($tutype eq 'gene') $dbxref .= 'EC:'.$vals{EC_NUM}.' , ' if ($vals{EC_NUM}) ; # both LOCUS == BAC_Locus and PUB_LOCUS work TIGRLOC url # $publoc= $vals{PUB_LOCUS} || $vals{LOCUS}; $dbxref .= 'AGILOC:'.$publoc.' , ' if ($publoc); $dbxref .= 'AGIBAC:'.$bacloc.' , ' if ($bacloc); #! this is in AGR acedb Locus = BAC_Locus $dbxref .= 'TIGRFT:'.$tufeatid.' , '; $aval= $vals{COM_NAME}; $notes .= 'name='.$aval.' ; ' if ($aval =~ m/\w/); $notes .= 'pseudogene=1 ; ' if ($vals{IS_PSEUDOGENE} eq '1'); $aval= $vals{PUB_COMMENT}; $notes .= 'comment='.$aval.' ; ' if ($aval =~ m/\w/); $notes .= 'predicted=1 ; ' if ($ispred); # $egid= 'ATgn'.sprintf("%07d",0); # FIXME -- dummy id, need euGenes generated one if ($weedobj) { # my $cksum= $publoc; # or FEAT_NAME ? $weedobj->checksum($_); $egid= $weedobj->getId($idval, $datadate, $publoc); ## reuse same id/sym } } pushFeat( $tutype, $gsym, $range, $egid, $dbxref, $notes); #! need RNA options for this feat. } #-- start xml handler subs # change to hash all of a TU (see TIGR parser.pl example) # and print at end of TU (or related RNA forms) sub handle_start { #(Expat, Element [, Attr, Val [,...]]) my ($expat, $elem, @atvals)= @_; push(@els, $inel); local $_= $elem; $inel= $_; $els{$_}++; $skipchar= 0; $vals{$_}= ''; SWITCH: { /^ASSEMBLY$/ && do { %assemblyvals= @atvals; # $database= $assemblyvals{DATABASE}; # $datadate= $assemblyvals{CURRENT_DATE}; # $chr= $assemblyvals{CHROMOSOME}; last SWITCH; }; /^GENE_LIST$/ && do { %vals= (); last SWITCH; }; # collect all in hash{}{}{} from here? /^PROTEIN_CODING$/ && do { $tutype= 'gene'; $tufeatid= ''; %vals= (); last SWITCH; }; /^RNA_GENES$/ && do { $tutype= 'RNA'; %vals= (); last SWITCH; }; /^RRNA$/ && do { $tutype= 'rRNA'; %vals= (); last SWITCH; }; /^SNRNA$/ && do { $tutype= 'snRNA'; %vals= (); last SWITCH; }; /^SNORNA$/ && do { $tutype= 'snoRNA'; %vals= (); last SWITCH; }; /^PRE-TRNA$/ && do { $tutype= 'tRNA'; $tufeatid= ''; %vals= (); last SWITCH; }; /^TU$/ && do { %vals= (); $tufeatid= ''; @ranges= (); last SWITCH; }; /^MODEL$/ && do { # |TRNA pushGene($tutype); # $tutype TU end or MODEL start? # push(@rangeset, \@ranges); @ranges= (); # save gene @ranges last SWITCH; }; /^END5$/ && do { $start= ''; last SWITCH; }; /^END3$/ && do { $stop= ''; last SWITCH; }; # /^EXON$/ && do { last SWITCH; }; # /^GENE_INFO$/ && do { last SWITCH; }; # /^COORDSET$/ && do { last SWITCH; }; # /^FEAT_NAME$/ && do { last SWITCH; }; # /^LOCUS$/ && do { last SWITCH; }; # /^PUB_LOCUS$/ && do { last SWITCH; }; # /^GENE_SYM$/ && do { last SWITCH; }; # /^EC_NUM$/ && do { last SWITCH; }; # /^IS_PSEUDOGENE$/ && do { last SWITCH; }; # /^REPEAT$/ && do { last SWITCH; }; # /^REPEAT_TYPE$/ && do { last SWITCH; }; $nada= 1; } # if ($addxpath) { # # $ar= $xpath->{$inel}; # # if (ref($ar) =~ /ARRAY/) $na= $ar-> # } } sub handle_end { #(Expat, Element) my ($expat, $elem)= @_; warn "end: $elem <> $inel inel\n" if ($elem ne $inel); $inel= pop(@els); #== one above current $elem local $_= $elem; SWITCH: { /^HEADER$/ && do { $database= $assemblyvals{DATABASE}; $datadate= $assemblyvals{CURRENT_DATE}; $chr= $assemblyvals{CHROMOSOME}; $chrsize= $sourcerange[1]; openout( $infile, $chr, $chrsize, $datadate); last SWITCH; }; # /^GENE_LIST$/ && do { last SWITCH; }; # /^RNA_GENES$/ && do { last SWITCH; }; # /^PROTEIN_CODING$/ && do { last SWITCH; }; /^TU$/ && do { # pushGene($tutype); # TU end or MODEL start? last SWITCH; }; /^MODEL$/ && do { pushGene('mRNA'); # $range= makerange( \@ranges, 0); $rref= pop(@rangeset); @ranges= @$rref; # $gsym= $vals{GENE_SYM} || $vals{PUB_LOCUS} || $vals{LOCUS} ; #? which is best # # $id= ??? # pushFeat('mRNA',$gsym,$range); last SWITCH; }; /^(SNORNA|RRNA|SNRNA)$/ && do { pushGene($tutype); last SWITCH; }; /^TRNA$/ && do { pushGene('tRNA'); last SWITCH; }; # /^EXON$/ && do { last SWITCH; }; # /^GENE_INFO$/ && do { last SWITCH; }; /^END5$/ && do { $rstart= $vals{$_}; last SWITCH; }; /^END3$/ && do { $rstop = $vals{$_}; last SWITCH; }; /^COORDSET$/ && do { $getrange= ($inel =~ /^(EXON|TU|REPEAT|TRNA|SNORNA|SNRNA|RRNA|ASSEMBLY)$/); # |RNA-EXON |PRE-TRNA == mRNA or gene ? if ($getrange) { if ($rstart>$rstop) { $rng= "$rstop..$rstart"; $compl= 1; } else { $rng= "$rstart..$rstop"; $compl= 0; } if ($inel =~ /ASSEMBLY/) { @sourcerange= ($rstart, $rstop); } else { push(@ranges, $rng) ; } } last SWITCH; }; /^FEAT_NAME$/ && do { if ($inel =~ /^(TU|PRE-TRNA)$/) { $tufeatid= $vals{$_}; } last SWITCH; }; # /^LOCUS$/ && do { last SWITCH; }; # /^PUB_LOCUS$/ && do { last SWITCH; }; # /^GENE_SYM$/ && do { last SWITCH; }; # /^EC_NUM$/ && do { last SWITCH; }; # /^IS_PSEUDOGENE$/ && do { last SWITCH; }; # /^REPEAT_TYPE$/ && do { last SWITCH; }; /^REPEAT$/ && do { $range= makerange( \@ranges, 0); @ranges= (); $featname= $vals{REPEAT_TYPE}; pushFeat('repeat',$vals{REPEAT_TYPE},$range); last SWITCH; }; $nada= 1; } } sub handle_char { #(Expat, String) return if ($skipchar); my ($expat, $val)= @_; $vals{$inel} .= $val; # $xpath->{$inel} .= $val if ($xpath); # $xpath->{$inel}->[0]->{content} .= $val if ($xpath); } sub handle_default { #(Expat, String) my ($expat, $val)= @_; } #-- end xml parse sub feattabheader { my ($fname,$org,$sourcefile, $date)= @_; unless($date) { my @tm= localtime( $^T - 24*60*60*(-M $sourcefile) ); $date= POSIX::strftime("%d-%B-%Y",@tm); } $sourcefile =~ s|^$genomeshome||; $sourcefile =~ s|^$tigrpath||; $sourcefile =~ s|^$tairpath||; return <=0; $i--) { $_= $ra[$i]; next unless($_); ##? if ($rng) { $rng .= ",$_"; } else { $rng= $_; } } } else { foreach (@ra) { next unless($_); ##? if ($rng) { $rng .= ",$_"; } else { $rng= $_; } } } if ($domax) { my($start,$stop)= maxrange($rng); $rng= "$start..$stop"; } $rng= 'join('.$rng.')' if ($rng =~ /,/); $rng= 'complement('.$rng.')' if ($compl); return $rng; } sub maxrange { my( $range)= @_; my ($pre, $suf,$start,$stop, $b, $u); $start= $kMissingValue; $stop= $start; $range =~ s/^([^\d<>-]*)//; $pre= $1; $range =~ s/(\D*)$//; $suf= $1; if ($range =~ m/^([<>]*)([\d-]+)/) { $u= $1; $start= $2; $start-- if ($u eq '<'); } if ($range =~ m/([<>]*)([\d-]+)$/) { $u= $1; $stop= $2; $stop++ if ($u eq '>'); } return ($start,$stop); } __END__ tigrxml.dtd tags we want 68197 1 26280090
-- LOCUS or PUB_LOCUS # this is 1-based, if END5 > END3, the negative strand -- BAC id -- publication/true name -- gene symbol -- {1=true|0=false} -- PRE-TRNA ~ TU , TRNA ~ MODEL , RNA-EXON ~ EXON -- function info, (gene ontology), etc ?? .. 67977.repeat00001 Feb 7 2001 7:42AM 17839 17814 AT_rich 68071.pre-tRNA-Ile& #x2D;1 Sep 21 2000 1:14AM 105712 105639 68071.tRNA-Ile&# x2D;1 Sep 21 2000 1:14AM 105712 105639 tRNA-Ile 68071.x00001< /FEAT_NAME> Sep 21 2000 1:14A M 105712 105639 >> ?? NO repeats in the pseudochromosomes ?? oat% ./parsewe* CHR5v03172001.xml dumpFeatures /c7/eugenes/weed/tigr/features-5.tsv Weed XML tags ASMBL_ID 409 ASMBL_LEFT_COORD 408 ASMBL_RIGHT_COORD 408 ASSEMBLY 1 ASSEMBLY_SEQUENCE 1 AUTHOR_LIST 1 CDS 31063 CDS_SEQUENCE 5849 CHR_LEFT_COORD 408 CHR_RIGHT_COORD 408 CLONE_NAME 1 COM_NAME 5851 COORDSET 73838 DATE 80094 EC_NUM 661 END3 73838 END5 73838 EXON 31070 FEAT_NAME 73835 GB_ACCESSION 1 GENE_INFO 5851 GENE_LIST 1 HEADER 1 IS_PSEUDOGENE 5851 LEFT_UTR 1 LINEAGE 1 LOCUS 5851 MODEL 5851 ORGANISM 1 ORIENTATION 408 PROTEIN_CODING 1 PROTEIN_SEQUENCE 5849 PSEUDOCHROMOSOME 1 PUB_COMMENT 5738 PUB_LOCUS 5839 RIGHT_UTR 1 RNA_GENES 1 SCAFFOLD 1 SCAFFOLD_COMPONENT 408 SEQ_GROUP 1 TIGR 1 TRANSCRIPT_SEQUENCE 5851 TU 5851 UTRS 1 # weed/features-5.tsv # Features for weed from TIGR a_thaliana/ath1/ [ CHR5v03172001.xml, Wed May 2 09:16:00 EDT 2001] # gnomap-version 1 # # tab-separated-values # Feature gene map range id db_xref notes source weed Chr 5 1..26280090 - gene AT5g60640 - complement(23675728..23678580) ATgn0000000 EC:5.3.4.1 - mRNA AT5g60640 - complement(join(23675728..23675937,23676027..236 oat% more ATH1_pseudo_5.1con >68197 , , , 26280090 bp TATACCATGTACCCTCAACCTTAAAACCCTAAAACCTATACTATAAATCTTTAAAACCTA TACTCTAAACCATAGGGTTTGTGAGTTTGCATAAAGCGTCACGTATAAGTGTTTCTAACA