#!/usr/bin/perl # parseflygff.pl =head NOTES from gmod:gbrowser/bin/process_gadfly.pl changes for eugenes gnomap feature tables, dgg, jun02 TODO: split by csome/arm to features-$arm files sort on $start add .tsv header fix BAD CDS $start (like -16076450 which always sets csome start == 1 ) for gnomap features.tsv, postprocess as | sort -k 1,1 -k 2,2n | perl -e 'while(<>){($c,$b,$r)=split "\t",$_,3; \ unless($c eq $lc) { close(F); open(F,">features-gd-$c.tsv"); $lc=$c; }\ print F $r; } close(F); ' >> dgg note: the BDGP/Gadfly GFF lacks important data for determining what >> features are (cDNAs, transposons, others) >> XML format has some more. GadFly mysql db may be best choice, though >> software to parse it keeps changing. header for features.tsv ... # fly/features-2L.tsv # Features for fly from BDGP/Celera/FlyBase Release 2 data [ gadfly2, Jul 10 2001] # source: FlyBase cytologic features, 20010924 # source: BDGP/Celera/FlyBase Release 2 gadfly2 genes & transcripts, 20010710 # source: BDGP/Celera/FlyBase Release 2 gadfly2 misc. features, 20010710 # gnomap-version 1 # # tab-separated-values # Feature gene map range id db_xref notes =cut if ($ARGV[0]=~/^-?-h/i) { die < ???? #The resulting database will have the following feature types #(represented as "method:source"): # # Component:arm A chromosome arm # Component:scaffold A chromosome scaffold (accession #) # Component:gap A gap in the assembly # clone:clonelocator A BAC clone # gene:gadfly A gene accession number # transcript:gadfly A transcript accession number # translation:gadfly A translation # codon:gadfly Significance unknown # exon:gadfly An exon # symbol:gadfly A classical gene symbol # similarity:blastn A BLASTN hit # similarity:blastx A BLASTX hit # similarity:sim4 EST->genome using SIM4 # similarity:groupest EST->genome using GROUPEST # similarity:repeatmasker A repeat USAGE ; } use strict; my $dir = shift || './RELEASE2'; $dir .= "/gff"; my $map = read_map($dir) || die; my %bacs= (); for my $scaffold (sort { $map->{$a}[0] cmp $map->{$b}[0] || $map->{$a}[1] <=> $map->{$b}[1] } keys %$map) { convert_scaffold($dir,$scaffold,$map->{$scaffold}); } sub read_map { my $dir = shift; my ($segments) = <$dir/*/SEGMENTS*.gff>; #/ my %arms; $segments or die "Can't find SEGMENTS file"; open (F,$segments) or die "Can't open $segments: $!"; my %position; while () { chomp; my ($ref,$source,$method,$start,$stop,undef,$strand,undef,$group) = split "\t"; $group =~ /name=(\w+)/ or next; $position{$1}=[$ref,$start,$stop,$strand]; $arms{$ref}{min} = $start if !defined($arms{$ref}{min}) || $arms{$ref}{min} > $start; $arms{$ref}{max} = $stop if !defined($arms{$ref}{max}) || $arms{$ref}{max} < $stop; } for my $ref (keys %arms) { # print join("\t",$ref,'arm','Component',$arms{$ref}{min},$arms{$ref}{max},'.','+','.',qq(Sequence "$ref")),"\n"; my $loc= "$arms{$ref}{min}..$arms{$ref}{max}"; my $astart= $arms{$ref}{min}; # Feature gene map range id db_xref notes print join("\t",$ref,$astart,'source','fly','Chr'.$ref,$loc),"\n"; } return \%position; close F; } sub convert_scaffold { my ($dir,$scaffold,$pos) = @_; my ($ref,$rstart,$rstop,$rstrand) = @$pos; my ($file) = <$dir/*/$scaffold*.gff>; #/ unless ($file && -r $file) { warn "$scaffold: Can't find corresponding GFF file. Skipping.\n"; return; } # print join("\t",$ref,'scaffold','Component',$rstart,$rstop,'.',$rstrand,'.',qq(Sequence "$scaffold")),"\n"; # Feature gene map range id db_xref notes my $loc= "$rstart..$rstop"; $loc= 'complement('.$loc.')' if $rstrand eq '-'; print join("\t",$ref,$rstart,'segment','',$scaffold,$loc),"\n"; # or scaffold ? open (F,$file) or die "Can't open $file: $!"; my @resultset = (); my %gexon= (); my $oldresultset = ''; while () { next if /^#/; chomp; my ($cref,$csource,$cmethod,$cstart,$cstop,$cscore,$cstrand,$cphase,$cgroup) = split "\t"; ## warn "$cref,$csource,$cmethod $cstart > $cstop" if $cstart > $cstop; next if $cstart > $cstop; # something wrong. Don't bother fixing it. if ($cgroup =~ /resultset_id=([^ ;]+)/) { # put aside to deal with later my $resultset = $1; if ($resultset ne $oldresultset && @resultset) { dump_resultset($scaffold,$pos,\@resultset); @resultset = (); } push @resultset,[$cref,$csource,$cmethod,$cstart,$cstop,$cscore,$cstrand,$cphase,$cgroup]; $oldresultset = $resultset; next; } else { (my $scref = $cref) =~ s/\.\d+$//; # get rid of version number, if there is one unless ($scref eq $scaffold) { warn "$scaffold: feature uses $cref as reference. Skipping"; next; } } # if ($cmethod eq 'translation' && $cstart<0) { $cstart= 1; } # fix for bad CDS starts # ^^ no, use exon start my ($start,$stop,$strand) = fix_coordinates($rstart,$rstop,$rstrand,$cstart,$cstop,$cstrand); my $fixed_group = fix_group($csource,$cmethod,$cgroup); # print join("\t",$ref,$csource,$cmethod,$start,$stop,$cscore,$strand,$cphase,$fixed_group),"\n"; my $gname= $1 if $fixed_group =~ m/^\w+\s+(\S+)/; $gname =~ s,",,g; my $loc= "$start..$stop"; if ($cmethod eq 'exon') { @{$gexon{$gname}}= () unless($gexon{$gname}); push( @{$gexon{$gname}}, $loc); next; } ## can we safely next? will translation/transcript follow? my $ftname= $cmethod; my $dbxref= ($gname =~ /^(CG|CT)/) ? "GadFly:$gname" : ''; my $id; if ($fixed_group =~ m/FlyBase (\w+)/) { $id= "MEOW:$1"; } elsif ($fixed_group =~ m/GadFly (\w+)/) { $id= "GadFly:$1"; } ## these are CGs want CTs for mRNA, CDS # $cmethod == translation > CDS, and need to use $rstart,$rstop for loc beg/end rather than exons # $cmethod == transcript > mRNA (longer than CDS) $ftname= 'CDS' if $cmethod eq 'translation'; $ftname= 'mRNA' if $cmethod eq 'transcript'; if ($cmethod eq 'translation' || $cmethod eq 'transcript') { my @lds= sort{$a <=>$b} @{$gexon{$gname}}; ## .. this is bad .. # my($lb,$lm,$le)= ($1,$2,$3) if ($loc =~ m,^(\d+)(.+)(\d+)$,); # my @lds= split '..',$loc; if ($cstart < 0) { my $lb= $lds[0]; $start= $1 if ($lb =~ m/^(\d+)/); # $start= $rstart; ## bad CDS start ! } else { my $lb= shift @lds; $lb =~ s/^\d+/$start/; if (@lds) { my $le= pop @lds; $le =~ s/\d+$/$stop/; push(@lds,$le); } else { $lb =~ s/\d+$/$stop/; } unshift(@lds, $lb); } # if ($le || @lds) { $loc= join ",", ($lb,@lds,$le); } # else { $loc= $lb; } $loc= join ",",@lds; # if ($start ne $lb) { $lb= $start; } # if ($stop ne $le) { $le= $stop; } # $loc= join "..", ($lb,@lds,$le); if ($id =~ m/:CG/ && $gname =~ m/CT/) { $id= "GadFly:$gname"; } } $loc= 'join('.$loc.')' if ($loc =~ m/,/); $loc= 'complement('.$loc.')' if $strand eq '-'; my $sym= $gname; # ($ftname eq 'gene') ? $gname : '-'; if ($fixed_group =~ m/Symbol (\S+)/) { $sym= $1; $sym =~ s,",,g; } # Feature gene map range id db_xref notes print join("\t",$ref,$start,$ftname,$sym,'-',$loc,$id,$dbxref),"\n"; ## just a dup of gene feature, called 'symbol'.. ## dump_symbol($ref,$csource,$cmethod,$start,$stop,$cscore,$strand,$cphase,$cgroup),"\n" ## if $cgroup =~ /symbol/i; } close F; dump_resultset($scaffold,$pos,\@resultset); } # called when a full resultset is found sub dump_resultset { my ($scaffold,$pos,$results) = @_; return unless @$results; local $^W = 0; my ($rref,$rstart,$rstop,$rstrand) = @$pos; my $genomic; for my $d (@$results,[]) { if (index($d->[0],$scaffold) >= 0) { # bit of the genomic sequence $genomic = $d; next; } # if we get here, we're in the target # warn join(",","resultset nongenomic:",@$d) unless !scalar(@$d) || $genomic; # what is it? next unless $genomic; # don't know what to do with the thing my ($tref,$tsource,$tmethod,$tstart,$tstop,$tscore,$tstrand,$tphase,$tgroup) = @$d; my ($qref,$qsource,$qmethod,$qstart,$qstop,$qscore,$qstrand,$qphase,$qgroup) = @$genomic; # Comments in the reference field are a no-no $tref =~ s/\s+\(\d+ total\)$//; # While we're at it, might as well change the EST nomenclature so that the # 5', 3' pairs will automatically match in the viewer... $tstrand = '-' if $tref =~ /revcomp/; $tref =~ s/prime(_revcomp)?$//; my ($method,$group,$ftname); ($tstart,$tstop) = ($tstop,$tstart) if $tstrand eq '-'; $ftname= $qmethod; # got some 'cdna' also $ftname= 'cDNA' if ($ftname =~ m/cdna/i); if ($qmethod eq 'alignment') { $method = 'similarity'; $ftname= 'cDNA' if ($tref =~ m,(\.5prime|\.3prime|\.complete)$, && $qsource eq 'sim4'); ## $ftname= 'EST' if ($qsource eq 'groupest'); #?? ## ! also 'transposon' features are here (blastn alignment) but ## GFF lacks info enough to determine - need to know if blast db is ## na_te.dros-blastn $group = qq(Target "Sequence:$tref" $tstart $tstop); } elsif ($qmethod eq 'HSP' && $qsource eq 'blastx') { $method = 'similarity'; $ftname= $qsource; $group = qq(Target "Protein:$tref" $tstart $tstop); } elsif ($qmethod eq 'HSP' && $qsource eq 'blastn') { $method = 'similarity'; $ftname= $qsource; $group = qq(Target "Sequence:$tref" $tstart $tstop); } elsif ($qsource eq 'clonelocator') { $ftname= 'bac'; ##? got lots more of these than from gadfly mysql !? ## need to assemble parts of bacs to one location, like exons? ## OR is data all dup entries (same start/stop)? next if ($bacs{$tref}); $bacs{$tref}++; # only if ($gname =~ /^BAC/); $method = 'clone'; $group = qq(Target "Clone:$tref" $tstart $tstop); } elsif ($qsource eq 'gap') { $ftname= 'gap'; $method = 'Component'; if ($qgroup =~ /span_id=(:\w+)/) { $group = "Gap $1"; } } elsif ($qsource eq 'tRNAscan-SE') { $ftname= 'tRNA'; $method = 'tRNA'; $group = qq(Target "tRNA:$tref" $tstart $tstop); } elsif (defined $tstart && defined $tstop) { # p_insertion == EP( and similarity ## dang gff lacks info to make these calls w/o looking at name !! ## p_insertion l(2)k16213 ## p_insertion EP(2)2469 ## missing tRNA ; got some other HSP ?? $ftname= 'p_insertion' if ($tref =~ m,^(EP|l|ms|fs)\(, || $tref =~ m,^BG0,); $method = 'similarity'; $group = qq(Target "Sequence:$tref" $tstart $tstop); } $method ||= $qmethod; # warn "resultset skip: $method $qref $qsource $qstart" # unless !$method || $group || $qsource eq 'genscan'; next unless $group; my ($start,$stop,$strand) = fix_coordinates($rstart,$rstop,$rstrand,$qstart,$qstop,$qstrand); # print join("\t",$rref,$qsource,$method,$start,$stop,$qscore,$qstrand,$qphase,$group),"\n"; my $gname= $1 if $group =~ m/^\w+\s+(\S+)/; $gname =~ s,",,g; $gname =~ s,^\w+\:,,; #?? $gname = '' if ($gname =~ m/^\d/); # just a number my $loc= "$start..$stop"; $loc= 'complement('.$loc.')' if ($strand eq '-'); # Feature gene map range id db_xref notes print join("\t",$rref,$start,$ftname,$gname,'-',$loc),"\n"; undef $genomic; } } sub fix_group { my ($source,$method,$group) = @_; my (@group,$gene); push @group,"Transcript $1" if $group =~ /transgrp=([^; ]+)/; ## dgg ^ convert this group to 'mRNA' feature adding all exons push @group,"Gene $1" if $method eq 'gene' && $group =~ /genegrp=([^; ]+)/; $gene ||= qq(Note "FlyBase $1") if $group =~ /dbxref=FlyBase:(\w+)/; $gene ||= qq(Note "GadFly $1") if $group =~ /genegrp=([^; ]+)/; push @group,qq(Note "Symbol $1") if $group =~ /symbol=([^; ]+)/ && "Gene $1" ne $group[0]; push @group,$gene; return join ' ; ',@group; } sub fix_coordinates { my ($rstart,$rstop,$rstrand,$cstart,$cstop,$cstrand) = @_; # Fix the coordinates. We are going to accomodate (-) strand scaffolds, even though they # don't seem to occur if ($rstrand eq '+') { $cstart += ($rstart - 1); $cstop += ($rstart - 1); } else { $cstart += ($rstop - $cstart + 1); $cstop += ($rstop - $cstart + 1); $cstrand = $cstrand eq '+' ? '-' : '+'; } return ($cstart,$cstop,$cstrand); } # called when we encounter a gene symbol sub dump_symbol { my ($ref,$csource,$cmethod,$start,$stop,$cscore,$strand,$cphase,$cgroup) = @_; my ($symbol) = $cgroup=~/symbol=([^;]+)/; my ($gene) = $cgroup=~/genegrp=([^;]+)/; return if $symbol eq $gene; $cmethod = 'symbol'; print join("\t",$ref,$csource,$cmethod,$start,$stop,$cscore,$strand,$cphase,qq(Symbol "$symbol")),"\n"; } __END__ =head1 NAME parseflygff.pl - Massage Gadfly/FlyBase GFF files into a version suitable for the eugenes gnomap browser =head1 SYNOPSIS % parseflygff.pl ./RELEASE2 > gadfly.gff =head1 DESCRIPTION This script massages the Flybase/Gadfly GFF files located at ftp://ftp.fruitfly.org/pub/genomic/gadfly/ into gnomap feature tables. To use this script, get the Gadfly GFF distribution archive which is organized by GenBank accession unit (e.g. "RELEASE2GFF.tar.gz"). Unpacking it will yield a directory named after the release, e.g. RELEASE2. Give that directory as the argument to this script, and capture the script's output to a file: % parseflygff.pl ./RELEASE2 > xxxx #The resulting database will have the following feature types #(represented as "method:source"): # # Component:arm A chromosome arm # Component:scaffold A chromosome scaffold (accession #) # Component:gap A gap in the assembly # clone:clonelocator A BAC clone # gene:gadfly A gene accession number # transcript:gadfly A transcript accession number # translation:gadfly A translation # codon:gadfly Significance unknown # exon:gadfly An exon # symbol:gadfly A classical gene symbol # similarity:blastn A BLASTN hit # similarity:blastx A BLASTX hit # similarity:sim4 EST->genome using SIM4 # similarity:groupest EST->genome using GROUPEST # similarity:repeatmasker A repeat =head1 AUTHOR (stolen from) Lincoln Stein . Copyright (c) 2002 Cold Spring Harbor Laboratory This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See DISCLAIMER.txt for disclaimers of warranty. =cut