#!/usr/bin/perl -w =head1 NAME tandemgenes.pl, a.k.a. tandy =head1 SYNOPSIS tandemgenes.pl - functions to locate tandem duplicate genes in genomes using seed set of predicted exons (GFF input) and genome fasta. The rational for tandy is that gene predictors are often in error modelling tandem gene regions, although they will usually predict at least one set of duplicate exons accurately. =head1 USAGE This is tested and used per chromosome/scaffold/contig units of genomes. A preliminary genome splitting step, such as EVidenceModeller, can be used to partition a genome + gene predictions into per scaffold parts. -- extract coding exons from genome + gene predictions ./tandemgenes.pl -debug -act extract \ -gff scaffold_4/dpulex1_predict.gff -genome scaffold_4/dpulex1.fa \ jex//CDS:JGI gex//CDS:NCBI_GNO dex//CDS:DGIL_SNO > scaffold_4/dpulex1_exons.fa -- reduce duplicate exons to non-redundant set (with blat), adding alternate IDs ./tandemgenes.pl -debug -act reduce \ -fasta scaffold_4/dpulex1_exons.fa > scaffold_4/dpulex1_exons.nr -- use blat/blast to locate all exons on genome, and convert marked up table of matches (find) -- this is an intermediate step to final tandy output, for debugging. ./tandemgenes.pl -debug -act find \ -genome scaffold_4/dpulex1.fa -query scaffold_4/dpulex1_exons.nr > scaffold_4.find -- final output GFF table of duplicate regions, genes and exons (HSPs) -- can be rerun and will re-use blat/blast intermediate output if available ./tandemgenes.pl -debug -act tandy \ -genome scaffold_4/dpulex1.fa -query scaffold_4/dpulex1_exons.nr > scaffold_4_tandy.gff -- options looking at location subrange, with alternate significance, alignment, and -- ignoring of partial exon matches ./tandemgenes.pl -debug -act tandy \ -genome scaffold_4/dpulex1.fa -query scaffold_4/dpulex1_exons.nr \ -mineval 1e-30 -minalign 0.50 -loc=scaffold_4:830000-1030000 -igpartial > cyp934_tandy.gff =head2 BATCH USAGE Script to handle a full genome partitioned by scaffolds/chromosomes, the recommeded use (probably won't work on multiple scaffolds at once). EVidenceModeler (http://EVidenceModeler.sourceforge.net/) is a suggested genome partitioner #!/bin/tcsh # tandyrun.sh source `/bio/argos/ROOT/bin/argos-env` set em=/bio/bio-grid/mb/EVidenceModeler set pa=/bio/bio-grid/mb/PASA/ # locate blat in PASA/bin, partitioned genome in EVidenceModeler setenv EVM_HOME $em setenv PASA_HOME $pa set path= ( $path $PASA_HOME/bin ) set td=/bio/bio-grid/dpulex/prots/ set tdata=/bio/bio-grid/mb/EVidenceModeler/daphd cd $tdata foreach sc (scaffold_*) echo "# tandemfind $sc" pushd $sc $td/tandemgenes.pl -debug -act extract -gff dpulex1_predict.gff -genome dpulex1.fa cex//CDS > dpulex1_exons.fa $td/tandemgenes.pl -debug -act reduce -fa dpulex1_exons.fa > dpulex1_exons.nr $td/tandemgenes.pl -debug -act tandy -genome dpulex1.fa -query dpulex1_exons.nr > dpulex1_exons.nr.gff popd end =head1 AUTHOR don gilbert, may-june 2007, gilbertd@indiana.edu =head1 Daphnia tandem genes problem: Rationale for Tandy Tandemgenes, or 'tandy', software has been developed to address a problem of gene predictions for tandem duplicate genes. Tandem duplicates can be nearly identical (95+% identity), and very close (within intron distances of each other), and very interesting biology. Nature rarely gives us clear and regular signals, and tandem genes are one case. These are, barring genome mis-assembly, clearly biological signals, yet many genome informatics tools have problems adequately modelling them. Any software that relies on alignment with gapping, from BLAST and BLAT thru GeneWise, Exonerate and like tools, can get confused in regions with repeated high-identity exons. These gapped alignments, as with genes and introns, will often skip over one exon to link its identical twin into a gene model. The Daphnia pulex genome (2007 release; http://wfleabase.org/) appears rich in tandem duplicate genes, perhaps as many as 20% of its genes. This is on the order of 50% or twice as many as another gene duplicate rich genome, C. elegans. Early estimates with protein homology to Daphnia suggested a high rate of tandem genes, however this evidence was not enough to accurately measure duplicate genes. Early gene predictions were not as promising in finding duplicates. A break through of evidence came with application of the PASA EST analysis pipeline (http://pasa.sourceforge.net/), which identified many problem areas with the initial gene predictions (from genewise, fgenesh, snap), but also produced reports of some very confused EST assemblies, spanning large regions with many interconnected EST-exons. A notable PASA confusion case turned out to be the Daphnia Hemoglobin gene cluster of 8. These errors lead to a catch-22 where one can't well assess gene duplications without good gene models, and but getting good gene models in tandem duplicate rich genomes is a problem. To address this, the tandy approach is to work with exons, not genes or proteins, as gene predictors typically call exons at much higher success rate than genes. Having one good exon set among a cluster of tandem genes can be as useful as having many duplicate sets, if one can locate the others. BLAT, BLAST or like tools, at exon level without gapping, will not run into the same problem of mis-aligning as gene-level alignments. The input data is (a) a genome sequence, and (b) a set of gene predictions, which can be from one or several predictors that may overlap. Exons from the predictions are consolidated in a non-redudant set (BLAT) and then matched with BLAT or BLAST to the genome. After genome scanning for all matches to all predicted exons, the core of tandy's algorithm is to mark runs of duplicate exons, then combine and split into new gene models based on a heuristic method that uses (a) intergene versus intron distances, (b) runs of exon sets (e.g. exons 1,2,3 of a gene model that are repeated), and (c) gene start/stop exons and strand inversions. The final output of tandy is a GFF feature file identifying duplication regions, the gene model (matches) and the exon matches (match_part or HSP) per gene model. Duplicates are classed as near (<15Kb) or far, every other duplicate on a scaffold/chromosome, the set of gene predictions included, and several quality measures. =head1 Tandy Status There are several inadequacies of this current release. A hoped for clean set of duplicate regions and models is a ways off, but the results are very useful as additional evidence. It seems to find duplicate regions well, however it now can produce several overlapping regions, based on different exon duplicate sets. Some of this is real biology (different sets of tandem genes in a region), and some artifact. It is successful in (a) splitting spuriously joined gene predictions in tandem regions, and (b) finding extra duplicate genes (or pseudogenes) that predictors miss. It produces too many partial gene sets, and is weak in what it calls a gene, in that it doesn't like most predictors, measure start,stop and donor/acceptor values of the gene models. Tandy has been tested on the well-studied genomes of C. elegans, Dros. melanogaster, and some other new Drosophila genomes, along with Daphnia pulex. This software is alpha release level. The documentation is not clear enough, and there is much debugging code. However, it should be usable, as its inputs are basic to genome informatics (sequence and gene predictions), and its requirements (BLAT and some BioPerl) are minimal. Its output can be used as an adjuct in genome annotation, and can be a useful guide to other annotation programs and curators. =head1 NOTES =item Efficiency recoding -- convert that gawdawful big string of altids on each exon to (a) shared array and (b) integer ids attached to each exon -- add better structure to basic exon[...] array, add field for gene marks -- struct per match-exon with array[0..altids/match] of [exonidnum,exoneq,other flags] -- (a) structure can be array[0..nexonids] of [idnum,geneid,exnum,exloc,fullxid], -- replace repeated calls to split_exonid -- sync with %idmap{geneid1}{geneid2}, {geneid}{exons},.. and %altids{xid1} = xid2 -- can generate (a) in exon_fasta_ids -- move more of these subs to a second .pm ; maybe split out parts of tandem5.pm =item FIXME: duplicate/overlap exons of a predictor class are a problem GeneWise, esp., others predict alternate transcripts or anything that may be transcript model w/o regard to overlapping exons are causing some problems. Overlaps across gene-predictor class are ok (expected), but w/in class needs some filtering. Don't filter by biggest gene span, as this is part of GeneWise result problem w/ tandem genes: biggest is one that mixes up tandem exons; -- find all overlapped predicted exons (or genes?), resort by (a) gene ids, (b) largest coding sequence ? (not span). Should we remove all exons from gene/tr models with some overlap, or keep any non-overlap exons (partial gene call can be problematic further on)? =item FIXME: blat_reduce has already found some good duplicates ** may have fixed, in exon_id_crossref see pairscores use those results where full/most gene exons are paired (or more) as alt. exon ids but w/ diff. locations. =item FIXME: predictors classes need argument to say if gene IDs can be parsed as predictor/type classes add to output counts of gene-methods (predictors from ids) =item FIXME: need to remove alternate transcript exons these are now called as near/near_overlap due to diff 'gene' ids, giving spurious tandem gene scores. need to consolidate (a) all exons from input gene-method (e.g. one predictor line GeneWise) where there is overlap among some exons of the gene models, or (b) pick only one gene model from a group with overlapping exons. fasta_extract would be right place, but would need to modify it for using mRNA and/or gene gff input?, or construct gene/mRNA models from input exon locations and ParentIDs TEMP SOLUTION: prefilter input GFF data to remove any overlapping gene/mRNA models. =item FIXME: tandy IDs in GFF see $IDPREFIX tandy GFF needs scaffoldID in tandy id, e.g. td_s4c452,td_s13c138 perl -pi.old -e'($sc)=/^scaffold_(\d+)/; s/(ID|Parent)=td_(c\d+)/$1=td_s$sc$2/g;' \ gffs/dpulex1_tandy6eg.gff =item FIXME: single exon genes are a problem ** do something in exon_finder to mark single-exon genes with good duplicates. Need some data flag on #n exons/gene (probably have it already). many of the checks, grouping filters are looking at multi-exon runs, so single exon genes are not handled well. =item FIXME: transposon repeats vs gene duplications should find way to score if dupl. exons are from transposon like events. One way may be to count # different matching geneids : high count and if they include far genes is likely clue to transposon duplicates. Also can mark in exon_id_crossref likely over-duplicated exons. Also may want to use masked-fasta. Add skip exons that are all NNNs in fasta_extract =item --igpartialexon, ignore-partial exon matches looking further, -igpartial may not be improvement; --igpartialexon seems to improve exon_finder for some true tandems, while expanding number of smaller gene groupings. Partial matches are segregated in blat_reduce output from (near) identical exons. This option may well improve gene-grouping, maybe should be default. Should test more. =item other flags --minEval default 1e-30 removes ~2/3 of input blat-found exon matches (adjust EXON_MATCH_CMD?) a weaker level would find more distant duplicates, may be warranted. --minalign 0.50 seems sensible default, drops only ~ 5% # exon_finder: input=47881; kept=10765; skip-eval: 31859; skip-align: 2461; The various --ignore-match-class (-igequal, -ignear, -igfar ...) are useful for tests, but not reliable yet. =item data bug dang, looks like DP_DGIL_SNO_ CDS exons are in source dpulex1_predict.gff twice, and have diff exon ids for same location ... this throws off calcs here; shouldnt expect this problem from "good" input data. However such could be screened at fasta_extract or blat_reduce (drop as alt exon if same-location) =head1 METHODS =cut use strict; use warnings; use FindBin; use lib ("$FindBin::Bin"); # see below require; #"$FindBin::Bin/../PerlLib" # use lib '/bio/argos/common/perl/lib','/bio/argos/common/system-local/perl/lib';#debug use Bio::DB::GFF; use Bio::DB::Fasta; use File::Basename; use Getopt::Long; $| = 1; # print unbuffer use constant SAMEBASE => 10; # for _sameloc, slop allowed in loca == locb use constant FILTER_DATA_DUPLICATES => 1; # short term data bug fix (dupl. exons) our ($debug); our ($NAME)= "tandemgenes"; # tandy our ($VERSION)= 1; # reset in tandemN.pm our %EQlegend; our $GENEID_NUM= 0; our $COMMONID_NUM= 0; our %commonids=(); our %commonlist=(); our $IDmap={}; # useful package global our $IDPREFIX="td_"; #? useful default? our $NOGENEGROUP=0; # GROUPOK=1; # by default on my $FILTER_GENEGROUP_OVERLAPS=1; # does this want a cli option? # #?? BINSIZE 5000 is it right size to get all exons in gene region? # # BINSIZE 10000 would cover most genes; but would include 2+ genes often our $BINSIZE = 1000 ; #was# 5000; our $GENEBINSIZE = 100000; #was 150000# use only where also checking gene ID our $NEARDIST = 15000; #? # our $COMMON_MERGE=1 ; ## need to balance this with possible true tandem genes with high copy number ** ## really should use some other tool to repeat mask input genome. ## or possibly count/precount #matches from all blat/blast input use constant REPEAT_CHECK_MIN => 50; our $REPEATS_MIN = 50; our $REPEATSKIP = 0; # user boolean our $EXON_MATCH_CMD ="blat -oneOff=1 -minIdentity=80 "; our $BLAT_REDUCE_CMD ="blat -fastMap "; # gff SO types our $genegrouptype; # region our $genetype; # match ? our $exontype; # match_part or HSP for ggb ?? our $sourcetag; # modify with dataset tag ** our $NOTEOUT= *STDERR; # cand to STDOUT for some actions: fasta_extract, blat_reduce, ... our $MINEVAL = 1e-5; our $MINALIGN= 0.5; #?? sub prnote { print $NOTEOUT "#n ",@_; } ## NEED Parent in aggregator to distinguish separate matches by same prot my @IDGROUPS = qw(Parent Target ID); # with this cant get target()->name !!! my @aggregators = qw( alignment coding match processed_transcript ); #matchparts my $MINEXON_GENEMARK=2; # means we need 2 exons to mark for further checking; # really need way to assess 1-exon genes my $ADDLOC2ID=1; my $ID_ADD_SOURCE=0; # option; e.g. wormbase gff uses ~ same ids among mRNA predictors =item min Eval, Align Warning: some perfect self-matches, exon parts we *want* have eval > 1e-20 Don't try for << eval; maybe best to use Blat's idea of good matches Eval is sensitive to length of exon; Example: 1159 / 6547 exons fail eval > 1e-30 test at self-location 246 / 7461 " for eval > 1e-10 cat scaffold_6/dpulex1_exons.nr.blatf8 | perl -ne '@v=split; ($b,$e)=$v[0]=~m/:(\d+)\-(\d+)/; $w=$ e-$b; ($tb,$te)=@v[-4,-3]; ($tb,$te)=($te,$tb) if($tb>$te); $p=$v[-2]; $a=$v[3]; $pa=$a/$w; print if($pa>0.5 && ($tb==$b && $te==$e) && $p > 1e-30 );' | wc =cut my ($useversion, $MINSIZE, $MAXSIZE, $REDUCE_GROUPONLY, $SEQ_SOFTMASK)= (0) x 10; my ($igpartialexon, $igequal, $igaltequal, $igunequal, $ignear, $igclass, $iginsideof, $iginnear, @ignoreclass, $ignoreclass); $iginsideof=1; # is $eq == 3 ; match is inside other known matches # maybe default: $REPEATSKIP=1; $useversion=6; my $dump_altids= 0; # $debug; my ($force, $fasta, $queryfa, $gff, $location, $action, $fasta_db, $database, $user, $password, ) =('')x30; my $groupok= 1; my @saveargs= @ARGV; my $optok= GetOptions( "database=s", \$database, "user=s", \$user, "password=s", \$password, "gff=s", \$gff, "fasta|genome=s",\$fasta, "queryfasta=s", \$queryfa, "location=s", \$location, "action=s", \$action, "debug|verbose=i", \$debug, "force!", \$force, "ignoreclass=s", \@ignoreclass, "igequal!", \$igequal, "igaltequal!", \$igaltequal, "igunequal|igfar!", \$igunequal, "iginside!", \$iginsideof, "igclass!", \$igclass, "ignear!", \$ignear, "iginnear!", \$iginnear, "igpartialexon!", \$igpartialexon, "REPEATSKIP=i", \$REPEATSKIP, "minrepeat=i", \$REPEATS_MIN, "MINALIGN=s", \$MINALIGN, "MINEVAL=s", \$MINEVAL, "MINSIZE=i", \$MINSIZE, "MAXSIZE=i", \$MAXSIZE, "NEARDIST=s", \$NEARDIST, "groupok!", \$groupok, "reduce_grouponly!", \$REDUCE_GROUPONLY, # is this devel. flag we dont need now? "dupfilter!", \$FILTER_GENEGROUP_OVERLAPS, "IDPREFIX=s", \$IDPREFIX, "ID_ADD_SOURCE!", \$ID_ADD_SOURCE, "softmask!", \$SEQ_SOFTMASK, "version=i", \$useversion, ); $NOGENEGROUP= defined $groupok ? !$groupok : 0; $FILTER_GENEGROUP_OVERLAPS= 0 if($NOGENEGROUP); # $EXON_MATCH_CMD ="blat -oneOff=1 -minIdentity=80 "; $EXON_MATCH_CMD .= " -mask=lower " if($SEQ_SOFTMASK); die < exons.fa script -act reduce -fa my.fa > exons.nr script -act find -genome genome.fa -query exons.nr > exon-find.table script -act tandy -genome genome.fa -query exons.nr > exon-find.gff -action : fasta-extract, reduce|nonredundant-fasta, find-tandem, tandy, overlap-count, ... -gff file.gff|- -fasta|genome file.fa [genome.fa] ; -queryfasta query.fa -location= scaffold:start-end for gff -database= mysql-dsn, lucene /path/to/index; for gff ; -user= , -pass= for mysql -debug 1,2 : level of verbose output find act: -igequal, -igaltequal, -igunequal/igfar, -iginside, -ignear, -iginnear : ignore these exon classes -igpartialexon : ignore partial alternate exons -ignoreclass=ID_pattern : dont compare IDs matching this regex pattern, use "-" prefix to skip entirely [e.g. -NCBI|DGIL ] -minEval $MINEVAL : skip low scores -minAlign $MINALIGN : skip low % alignment -nearDist $NEARDIST : distance to mark as near region unknown match (possible tandem) -idprefix $IDPREFIX : prefix for IDs in gff output -repeatskip 1 : mark(1) or skip(2) excessive duplications (transposon repeated ?) -minrepeat $REPEATS_MIN : repeat count (from blat matches) to trigger skips primary call: $EXON_MATCH_CMD tandy act: find, followed by reduction to common-gene grouped GFF output tandy5, tandy6 : versions reduce act: -reduce_grouponly : dont remove redundant exons, mark group only primary call: $BLAT_REDUCE_CMD extract act: -minsize $MINSIZE , -maxsize $MAXSIZE -ID_ADD_SOURCE $ID_ADD_SOURCE : add gff source prefix to ID (used in grouping) -softmask : ignore all-lowercased exons if true (will ignore all NNN exons) -dupfilter $FILTER_GENEGROUP_OVERLAPS : remove overlapping exons of same gene class (e.g. GeneWise) -version : use this version ($useversion) requirements: blat (reduce, find), Bioperl/DB/GFF,Fasta Not yet: exonerate (tandy final tandem validate?) USAGE my @typeset; my %typeset = map { my ($name, $rest) = split("//", $_); push @typeset, $name; ($name => [ split("/", $rest) ]) } @ARGV; my $skipc=0; $ignoreclass= join '|', map{ $skipc=1 if(s/^-//); split ",",$_; } @ignoreclass; $ignoreclass= "-".$ignoreclass if($skipc); # should print to some outputs? problems printing to same output of some results # bad here for some functions! # prnote "tandy args: ",join(" ",@saveargs),"\n" if $debug; if($action =~ /fasta|extract/) { set_NOTEOUT(*STDERR); # should allow $queryfa alias here my $seg = openGFFdb($location, $gff, $fasta, $database); # $gff, $database, user, password fasta_extract($seg); } elsif($action =~ /reduce|nonredundant/) { set_NOTEOUT(*STDERR); blat_reduce($fasta); # should allow $queryfa alias here } elsif($action =~ /find|tandy|tandx/) { set_NOTEOUT(*STDOUT); my $flags=""; $flags.="noprint" if($action =~ /tandy|tandx/ && $action !~ /print/); $dump_altids= $debug && ($action =~ /dump/); my ($exon_table, $idmap)= exon_finder( $fasta, $queryfa, $location, $flags); ## find tandems strategy 1: find -2 near dupls, show all in region ? if($action =~ /tandy/ && @$exon_table > 0) { if($action =~ m/(\d+)/) { $useversion=$1; } if($useversion =~ /6/) { require "tandem6.pm"; tandem_grouper( $exon_table, $fasta, $queryfa, $location, $idmap); } elsif($useversion =~ /5/) { require "tandem5.pm"; tandem_grouper( $exon_table, $fasta, $queryfa, $location, $idmap); } else { prnote "unavailable algorithm version: $useversion ; try 5 or 6\n"; } # require "tandem1.pm"; # tandem_gff( $exon_table, $fasta, $queryfa, $location, $idmap); } elsif($action =~ /tandx/ && @$exon_table > 0) { tandem_exonerate( $exon_table, $fasta, $queryfa, $location); } } elsif($action =~ /overlap/) { patch_feature_extend(); # for overlap_count Bio::DB::GFF; need before use statement? set_NOTEOUT(*STDERR); my $seg = openGFFdb($location, $gff, $fasta, $database); overlap_count($seg); } elsif($action) { set_NOTEOUT(*STDERR); prnote "unknown action: $action\n"; } exit; #-------------------------------- sub set_NOTEOUT { my ($outhandle)= @_; $NOTEOUT= $outhandle; prnote "tandy args: ",join(" ",@saveargs),"\n" if $debug; } sub openGFFdb { my($location, $gff, $fasta, $database)=@_; #return unless($gff || $database); prnote "openGFFdb($location, $gff, $fasta, $database)\n" if $debug; ## there is a damn bug in some version of BioPerl GFF (mem db) ## where ID/Parent/disid are missing ## is this related to refseq range entry missing? # globals: database, gff, fasta, user/password, @aggregators, @IDGROUPS my $db = ""; if($gff) { $db= Bio::DB::GFF->new( -adaptor => "memory", -gff => $gff, -fasta => $fasta, -aggregators => \@aggregators, -preferred_groups => \@IDGROUPS, ); } elsif($database =~ /lucene/) { $db= Bio::DB::GFF->new( -adaptor => "lucene", -dsn => $database, -aggregators => \@aggregators, -preferred_groups => \@IDGROUPS, ); } elsif($database) { my $dsn = ($database =~ /dbi:/) ? $database : "dbi:mysql:$database"; $db= Bio::DB::GFF->new( #-adaptor => "mysql", -dsn => $dsn, -user => $user, -password => $password, -aggregators => \@aggregators, -preferred_groups => \@IDGROUPS, ); } else { prnote "No GFF database\n"; return; } my $seg= $db; if($location) { my($ref,$start,$stop)= split(/[:,\-\.]+/, $location); ($seg)= $db->segment(-ref=>$ref,-start=>$start,-stop=>$stop); prnote "Location: $location\n"; } return $seg; } sub get_dna { my($fasta, $ref, $start, $stop)= @_; #, $fasta_db_ref unless(-f $fasta && $ref && $stop>0) { prnote "need fasta file, ref-ID, start, stop location\n"; return; } ## my $fasta_db= $$fasta_db_ref if(ref $fasta_db_ref); # ?? my $havedb= ref $fasta_db; if($havedb) { $havedb= $fasta_db->index_name() eq $fasta_db->index_name($fasta,-d $fasta); } unless($havedb) { my $db = eval {Bio::DB::Fasta->new($fasta);} or prnote "$@\nCan't open sequence file(s). \n" and return; $fasta_db= $db; ## $fasta_db_ref= \$fasta_db; } #? return $fasta_db unless($ref && $stop); my $seq = $fasta_db->seq($ref, $start => $stop) or prnote "cant locate seq of $ref:$start-$stop\n" and return; $seq= $seq->seq if(ref $seq); # is this weird bioperl change here or not return $seq; } =item fasta_extract a primary function, this creates exon query fasta file from input of GFF features and genome fasta. Common input GFF would be CDS exon predictions, but any exon-like non-gapped feature type (e.g. protein HSP) can be used. See also blat_reduce as next step. =cut sub fasta_extract { my($seg)= @_; my($nex,$skipoverlap)=(0)x10; for my $reference (@typeset) { my %didid=(); my %didloc=(); my %didtypeloc=(); my $infeat=0; my $typeref= $typeset{$reference}; ## should use iterator instead of @feat array for big data set # my @feat = $seg->features(-type => $typeref); # == any feature type # warn "# Found $reference//", @$typeref , " n=",scalar(@feat), " features\n"; # for my $feat (@feat) # FIXME: duplicate/overlap exons from same predictor class are a problem # esp. GeneWise and like w/ multiple calls/gene-span # should pre-filter input gff at this point; see above item my $iterator= $seg->get_feature_stream(-type => $typeref); while (my $feat = $iterator->next_seq) # Can't call method "get_feature_stream" on an undefined value at /bio/bio-grid/dpulex/prots/tandemgenes.pl line 379. # sigh; bioperl problems again { $infeat++; my($ab,$ae)= ($feat->abs_start, $feat->abs_end); ($ab,$ae)=($ae,$ab) if($ab>$ae); my $size= $ae-$ab+1; next if(( $MINSIZE>0 and $size < $MINSIZE) || ( $MAXSIZE>0 and $size > $MAXSIZE)); my $seq= $feat->seq; next unless($seq); $seq= $seq->seq if(ref $seq); #bioperl version change # add skip exons that are all NNN or optionally lowercase soft-masked. my $goodseq= ($seq =~ m/[^Nn]/); if($goodseq && $SEQ_SOFTMASK) { $goodseq= ($seq =~ m/[A-Z]/); } next unless($goodseq); my ($id, $ispar, $ftype); ($ftype)= $feat->type(); $ftype="$ftype"; #defeat hidden bioperlisms; my($fmeth,$fsource)=split /:/,$ftype,2; unless($ftype =~ /exon|CDS/) { ($id)= $feat->attributes("ID") and $ispar=0; } ($id)= $feat->attributes("Parent") and $ispar=1 unless($id); # ^^ using ID from exon/CDS is a problem;need Parent ($id)= $feat->display_id and $ispar=0 unless($id); ($id)= $feat->id and $ispar=0 unless($id); ## there is a damn bug in some version of BioPerl GFF (mem db) ## where ID/Parent/disid are missing #warn "#dbg ft=",$feat->gff3_string(0) if $debug; ## need option to add gff source to ID to provide for class grouping; e.g. wormbase data if($ID_ADD_SOURCE && $fsource) { $id= $fsource."0_".$id; } $id =~ s/\W/_/g; # CLEAN INPUT ID ## assume original id can have any dingbat (e.g WormBase: XYZ123.4.5; MGI:12345) ## need to ensure it won't clash with our dingbats [:.-] #my $loc= $feat->ref() .':'. $feat->abs_start .'-'. $feat->abs_end .':'. $feat->strand; my $ref= $feat->ref(); my $ori= $feat->strand(); my $loc= "$ab-$ae"; #? screen here for odd data bug: duplicate exons = same source.type.id.location next if( $didloc{$id.$ref.$loc}++); #? is this a problem for anything # for GeneWise and like that call many transcripts/exons at same location # ftype includes type:source, e.g. CDS:GeneWise # ** FIXME: ERROR here; this is filtering each exon; want to filter whole gene (?) # otherwise have non-comparable exons strung together, spurious gene parts if($FILTER_GENEGROUP_OVERLAPS) { # next if $didtypeloc{$ftype.$ref}{$loc}++; # need also overlap test my $isolap=0; my @bins= (int($ab/$BINSIZE) .. int($ae/$BINSIZE)); LBIN: foreach my $ib (@bins) { $didtypeloc{$ftype.$ref}{$ib} or next; my @locs= @{$didtypeloc{$ftype.$ref}{$ib}}; foreach my $lc (@locs) { my ($lb,$le)= split "-",$lc; if(_isoverlap($ab,$ae,$lb,$le,"XXlocXX")) { $isolap=1; last LBIN; } } } $skipoverlap++ and next if($isolap); foreach my $ib (@bins) { push @{$didtypeloc{$ftype.$ref}{$ib}}, $loc; } } $didid{$id}++; $id.=".".$didid{$id} if($ispar || $didid{$id}>1); # ** this is EXON NUMBER $id.=":$loc" if($ADDLOC2ID); # always need now my $doc= join("; ","type=$ftype", "loc=$ref:$loc:$ori"); #$id, $seq =~ s/(\w{60})/$1\n/g; print ">$id $doc\n$seq\n"; $nex++; } my $skip= $infeat - $nex; prnote "fasta_extract n=$nex (skip:$skip; overlapped:$skipoverlap) of $reference//", join(",", @$typeref),"\n"; $nex=0; } # prnote "fasta_extract $nex\n"; } sub dosystem { prnote join(" ",@_),"\n" if $debug; return system(@_) if(@_); } =item blat find alternate/tandem/lost exons blat -oneOff=1 -minIdentity=80 scaffold_4.fa jgifm4_exons.fa jgifm4_exons.blatf8 -out=blast8 # list non-same exons (maybe known, unknown) cat gnomon4_exons.blatf8 | perl -ne 'chomp; @v=split"\t"; ($q,$b,$e,$p,$s)=@v[0,8,9,10,11]; ($q,$qb,$qe)=sp lit(/[:-]/,$q); \ $eq=($qb==$b and $qe==$e); \ print join("\t",$q,$qb,$qe,$b,$e,$p),"\n" unless($eq or $p>1e-50);' | sort -k1,1 | more # list located exons, same, known-alt and unknown cat gnomon4_exons.blatf8 jgifm4_exons.blatf8 | \ perl -ne '@v=split"\t"; ($b,$e)=@v[8,9]; @v[8,9]=($e,$b) if($b>$e); \ print join("\t",@v);' | sort -k9,9n -k10,10nr | \ perl -ne 'BEGIN{print join("\t",qw(GeneExonID______ OrigB OrigE AtB AtE EQ Eval)),"\n";}\ chomp; @v=split"\t"; ($q,$b,$e,$p,$s)=@v[0,8,9,10,11]; ($q,$qb,$qe)=split(/[:-]/,$q); \ ($b,$e)=($e,$b) if($b>$e); ($qb,$qe)=($qe,$qb) if($qb>$qe); \ $eq=($qb==$b and $qe==$e)?1:0; \ print join("\t",$q,$qb,$qe,$b,$e,$eq,$p),"\n" unless( $p>1e-50);' | more # file format blatf8 == blast f8; genome locs at 8,9 NCBI_GNO_1182044.1:3009787-3009874 scaffold_4 100.00 88 0 0 1 88 3009787 3009874 1.1e-42 171.0 NCBI_GNO_1182044.2:3009959-3010051 scaffold_4 100.00 93 0 0 1 93 3009959 3010051 8.3e-46 181.0 =cut sub _isoverlap { # my($gb,$ge, $qb,$qe, $qid)= @_; if ($gb <= $qe && $ge >= $qb) { return ($ignoreclass && $qid =~ /$ignoreclass/) ? -1 : 1; } else { return 0; } } sub _isnear { # my($gb,$ge, $qb,$qe)= @_; my $gm= int(($gb+$ge)/2); my $qm= int(($qb+$qe)/2); return (abs($gm - $qm) < $NEARDIST) ? 1 : 0; } sub _sameloc { # ref1,b1,e1 vs ref2,b2,e2; equal location with some +/- 10b slop my($ar,$ab,$ae,$aor, $br,$bb,$be,$bor)= @_; $aor ||= 0; $bor ||= 0; $ab ||=0; $ae ||=0; $bb ||=0; $be ||=0; #? need return # $ar eq $br && #ref ?? is this bad? $aor eq $bor && # strand/orient (abs($ab-$bb) < SAMEBASE) && (abs($ae-$be) < SAMEBASE) ; } # format is NCBI_GNO_1182044.1:3009787-3009874 sub split_exonid { local $_= shift; my $dropnum= shift; $dropnum ||= 0; my ($exnum, $exloc, $exeq)=(0,"",-1); #s/[\+\*\@]//g; # drop marks at ^front s/^[\+\*\@\%\#\&]*//; # drop marks at ^front; or s/^\W+//; s/\=([\d\-]+)$// and $exeq=$1; # added; testing s/\:([\d\-]+)$// and $exloc=$1; if($dropnum && s/\.(\d+)$//) { $exnum= $1; }; return (wantarray) ? ($_,$exnum,$exloc,$exeq) : $_; } =item exon_fasta_ids This builds hash lists of cross-referenced exon primary and alternate ids, and gene ids with common-gene id reduction, from input exon query fasta headers. Used by exon_finder even after blat/blast matching is done (to collect all IDs). =cut sub exon_fasta_ids { my($queryfa, $idmap, $altids, $skipclass)= @_; my( @exonids, %sizemap, %exonidmap, %filterexon, %didloc, %exonstart, %exonstop, %reploc); ## my $GENEID_NUM= 0; # package wide global now $idmap->{sizemap}= \%sizemap; # hack, ok? $idmap->{exonidmap}= \%exonidmap; # hack, ok? $idmap->{altids}= $altids; # hack, ok? $idmap->{filtertest}= \%filterexon; # hack, ok? $idmap->{exonstrand}= {}; # also get exon size for % align/ident checks from loc=ref:b-e:+ prnote "grep >ids $queryfa\n" if $debug; open(GREP, "grep '^>' $queryfa|") or die "grep $queryfa"; while(){ my($xid)=m/>(\S+)/ or next; # next if($skipclass && $xid =~ m/$skipclass/); # was m/^($skipclass)/) # #urk, we need to keep skipclass till altids are processed if($skipclass && $_ =~ m/$skipclass/) { # test full line, then check each my @allids= ($xid); if(m/altids=(\S+)/) { push @allids, split/,/, $1; } @allids = grep { $_ !~ m/$skipclass/ } @allids; next unless(@allids); if($xid =~ m/$skipclass/) { $xid= $allids[0]; } } ## $xid =~ s/\W/_/g; ## CLEAN INPUT ID, BUT NOT HERE ; do in fasta_extract; ## $xid =~ s/\:[\d\-]+$//; # ?? cant do, need below; blast8tab uses same; drop location tag; .exnum enough # my($geneid,$exnum,$xid1,$xlen)= ($xid,1,$xid,0); my ($geneid,$exnum,$xloc)= split_exonid($xid,1); my $xid1= "$geneid.$exnum"; my ($xb,$xe, $xlen)= (0) x 9; if($xloc) { ($xb,$xe)= split(/[-\.]+/,$xloc); $xlen= abs($xe-$xb+1); } #? screen out odd bug with dupl DGIL_SNO exons: 2 each at same id/loc #? also need to filter altids .. if(FILTER_DATA_DUPLICATES && $xloc && $didloc{$geneid.$xloc}++) { $filterexon{$xid}= $filterexon{$xid1}= 1; # these would be SNO with high exon numbers? # warn "# Filter out: $xid\n"; next; } push(@exonids,$xid); # not using this? or should we save all exonnums/ids per geneid? $sizemap{$xid1}= $xlen; $exonidmap{$xid1}= $xid; $exonidmap{$xid} = $xid1; # $exonidmap{$geneid} = $xid1; # cant really do this 1 to n ## need to do {exons} also in crossref exists $idmap->{$geneid}{num} or $idmap->{$geneid}{num}= ++$GENEID_NUM; exists $idmap->{$geneid}{exons} or $idmap->{$geneid}{exons}=[]; push( @{$idmap->{$geneid}{exons}}, $exnum); # $xid or exnum ? # want to flag start,stop exons !?? need strand to know # strand: check for 'loc=scaffold_4:2080286-2080654:-1;' string in header my $strand=0; if(m/loc=([^;\s]+)/) { my($ref,$be,$o)= split(/:/,$1); $strand= ($o eq '+') ? 1 : ($o eq '-') ? -1 : $o; } $idmap->{exonstrand}->{$xid1}= $strand; ## guessing last exon here from inputs is problematic; last may be instead in ## alternate list (w/o sure strand info); should instead store :-+ with exonid:length string? if($strand>0) { $exonstart{$geneid}= $exnum if($exnum==1); $exonstop {$geneid}= $exnum if(!$exonstop{$geneid} || $exnum>$exonstop{$geneid}); } elsif($strand<0) { $exonstop {$geneid}= $exnum if($exnum==1); $exonstart{$geneid}= $exnum if(!$exonstart{$geneid} || $exnum>$exonstart{$geneid}); } $altids->{$xid}=""; # add entry for singletons if(m/altids=(\S+)/) { my $aid= $1; if( FILTER_DATA_DUPLICATES ) { my @aid1 = grep { ! $filterexon{$_} } split(/,/, $aid); $aid= join ",", @aid1; } if($skipclass && $aid =~ m/$skipclass/) { my @aid1 = grep { $_ !~ m/$skipclass/ } split/,/, $aid; $aid= join ",", @aid1; } $igpartialexon && $aid =~ s/pm,.*$//; $altids->{$xid}= $aid; ## FIXME: add $REPEATSKIP checking based on over-duplicated geneids/exonids ## here? from n altids w/ diff locations, or below in exon crossref? # my $aid1= split_exonid($aid,0); # $exonidmap{$aid1}= $aid; $exonidmap{$aid} = $aid1; #FIXME: need below# $altids->{$xid}=~ s/pm,//; # FIXME: pm, = partial match ## fixme?? need for gene/exon start,stop my @altids= split(/,/, $aid); # my $checkreps= ($REPEATSKIP && scalar(@altids) > REPEAT_CHECK_MIN); # if($checkreps) { # foreach my $axid (@altids) { # my($agid,$axnum,$xaloc) = split_exonid($axid,1); # my $aid1= "$agid.$axnum"; # if($xaloc && $xloc) { #?? need to do this for pm partial matches? or not # my($xab,$xae)= split(/[-\.]+/,$xaloc); # unless( _isoverlap($xb,$xe,$xab,$xae,"XXlocXX") ) { # $reploc{$aid1}++; $reploc{$xid1}++; #? score location or id? # $reploc{$agid}++; $reploc{$geneid}++; # } # } # } # } foreach my $axid (@altids) { last if($axid =~ /^pm/); # next if($axid eq $xid); #?? or keep my($agid,$axnum,$xaloc) = split_exonid($axid,1); my $aid1= "$agid.$axnum"; $exonidmap{$aid1}= $axid; $exonidmap{$axid} = $aid1; # if($checkreps && $xaloc && $xloc) { #?? need to do this for pm partial matches? or not # my($xab,$xae)= split(/[-\.]+/,$xaloc); # unless( _isoverlap($xb,$xe,$xab,$xae,"XXlocXX") ) { # $reploc{$aid1}++; $reploc{$xid1}++; #? score location or id? # $reploc{$agid}++; $reploc{$geneid}++; # } # } ## strand guessing here is off because blat_reduce lost strand info for alternate exons if($strand>0) { $exonstart{$agid}= $axnum if($axnum==1); $exonstop {$agid}= $axnum if(!$exonstop{$agid} || $axnum>$exonstop{$agid}); } elsif($strand<0) { $exonstop {$agid}= $axnum if($axnum==1); $exonstart{$agid}= $axnum if(!$exonstart{$agid} || $axnum>$exonstart{$agid}); } } use constant USE_exon_id_crossref => 1; # unless(USE_exon_id_crossref) { # foreach my $agene (split(/,/, $aid)) { # last if($agene =~ /^pm/); # next if($agene eq $xid); #?? or keep # $agene = split_exonid($agene,1); # $idmap->{$geneid}{$agene}= $idmap->{$agene}{$geneid}= 1; # exists $idmap->{$agene}{num} or $idmap->{$agene}{num}= ++$GENEID_NUM; # # this can get confused with chains .. # my $commonid= $idmap->{$geneid}{common} || $idmap->{$agene}{common} || $geneid; # $idmap->{$agene}{common}= $idmap->{$geneid}{common}= $commonid; # } # } #........ } # unless(USE_exon_id_crossref) { # exists $idmap->{$geneid}{common} or $idmap->{$geneid}{common} = $geneid; # } #........ } close(GREP); foreach my $gn (sort keys %exonstart) { my $xn= $exonstart{$gn}; $idmap->{exonstart}{"$gn.$xn"}= 1; $idmap->{exonstart}{"$gn"}= $xn; } foreach my $gn (sort keys %exonstop) { my $xn= $exonstop{$gn}; $idmap->{exonstop}{"$gn.$xn"}= 1; $idmap->{exonstop}{"$gn"}= $xn; } my $nrepeats=0; # foreach my $gn (sort keys %reploc) { # my $nrep= $reploc{$gn}; $nrepeats++ if($gn =~ m/\.\d/); # $idmap->{numrepeats}{"$gn"}= $nrep if ($nrep >= $REPEATS_MIN); # } my $nexon= scalar(@exonids); prnote "gene-ids=$GENEID_NUM; nexons=$nexon; \n" if($debug); #nrepeats=$nrepeats dump_altids("$queryfa", $idmap, $altids) if($dump_altids); # test only.. if(USE_exon_id_crossref) { exon_id_crossref($queryfa, $idmap, $altids); dump_altids("exon_id_crossref", $idmap, $altids) if($dump_altids); # test only.. } ## try compressing down idmap to smallest common set? ## WARNING: turning common exons into common genes is problematic; ## this includes compressing possibly separate genes into one if a model is overly inclusive of tandems use constant REDUCE_COMMONS => 0; # for gene {common} ids; problematic if(REDUCE_COMMONS) { foreach my $irep (0..3) { my ($ncom,$nred)=(0,0); foreach my $ad (sort keys %$idmap) { next if($ad =~ /sizemap|altids|exon|test/); # exonidmap and others $ncom++; foreach my $bd (sort keys %{$idmap->{$ad}}) { next if($bd =~ /^(common|num|exons)$/ or $bd eq $ad); my $acom= $idmap->{$ad}{common}; my $bcom= $idmap->{$bd}{common}; if($bcom ne $acom) { if( $idmap->{$acom}{common} eq $idmap->{$bcom}{common}) { $idmap->{$bd}{common}= $idmap->{$ad}{common} = $idmap->{$acom}{common}; } else { $idmap->{$bd}{common}= $acom; $nred++; } } } } prnote "comm-ids[$irep]=$ncom reduced=$nred\n" if($debug); } } } sub dump_altids { my($queryfa, $idmap, $altids, $extest)= @_; print "\n\n# ",("-") x 50,"\n"; print "# dump_altids from $queryfa\n"; my $nxin= 0; foreach my $xid (sort keys %{$altids}) { next if($xid =~ m/:\d/); # dump only long/short? named entries print "$xid"; $nxin++; $altids->{$xid} and print "\t",$altids->{$xid}; print "\n"; } print "# dump_altids exon in=$nxin\n\n"; print "# ",("-") x 50,"\n"; } =item exon_id_crossref This cross-references all exon primary and alternate ids, and creates common gene id map. It re-check all altids and adds in missing full-match cross-exon ids, from tertiary links: a = b and b = c implies a = c; Current input (from blat_reduce) has spotty reciprocal cross-refs .. why? Updates both altids{exonlongid} and idmap{geneid} Used by exon_fasta_ids ; *should* update blat_reduce to use this. =cut sub exon_id_crossref { my($queryfa, $idmap, $altids)= @_; prnote "exon_id_crossref for $queryfa\n" if $debug; ## be sure to keep xloc in exon id key for now; double hash keys; see below # 1. crossref the exonids from alt lists my %xaltid=(); my $nxin= 0; foreach my $xid (sort keys %{$altids}) { $xaltid{$xid}{$xid}=1; $nxin++; my $alist= $altids->{$xid} or next; foreach my $axid (split(/,/, $alist)) { last if($axid =~ /^pm/); # skip all partials $xaltid{$xid}{$axid}= 1; $xaltid{$axid}{$xid}= 1; $xaltid{$axid}{$axid}= 1; # ensure self match } } # 1b. update alt lists with all xrefs; must keep xloc as part of xid ## ? we are missing self/xid in altlist ?? why? my $nxout= 0; foreach my $xid (sort keys %xaltid) { my @blist = sort keys %{$xaltid{$xid}}; $nxout++; if(@blist) { my $alist = $altids->{$xid}||""; unless( $alist =~ s/^.*\b(pm,)/$1/) { $alist=""; } # no partials if($alist) { # no partials in @blist *** THIS IS A PROBLEM (?) *** @blist = grep { $alist !~ m/$_/ } @blist; } push(@blist, $alist) if $alist; $altids->{$xid}= join(",", @blist ); } my $xid1 = split_exonid($xid); # short id; $altids->{$xid1}= $altids->{$xid}; # suspenders & belt here } # 2. update/create idmap crossrefs ## WARNING: turning common exons into common genes is problematic; ## this includes compressing possibly separate genes into one if a model is overly inclusive of tandems ## FIXME: cross ref the full gene matches (~all exons) that are at alternate locations ## add this then to exon_finder marked items ## FIXME: add $REPEATSKIP checking based on over-duplicated geneids/exonids ## better than skipping, mark these on output as repetitive regions ? ## also add eq code for this? my %pairscores=(); my ($ngene,$ngenepair)=( 0,0); foreach my $exoni (sort keys %xaltid) { my ($genei,$numi,$loci)= split_exonid($exoni,1); # $ngene++; my $geneidnum= $idmap->{$genei}{num} || 0; # should exist my @loci= ("ref", split(/[-\.]+/,$loci), 0 ); exists $idmap->{$genei}{exons} or $idmap->{$genei}{exons}=[]; my $havei = grep { $numi == $_ } @{$idmap->{$genei}{exons}}; push( @{$idmap->{$genei}{exons}}, $numi) unless($havei); foreach my $exonj (sort keys %{$xaltid{$exoni}}) { my ($genej,$numj,$locj)= split_exonid($exonj,1); if($loci && $locj) { my @locj= ("ref", split(/[-\.]+/,$locj), 0 ); my $pairscore= _sameloc( @loci, @locj) ? 1 : -1 ; $pairscores{$genei}{$pairscore}{$genej}++; } exists $idmap->{$genei}{$genej} or $ngenepair++; $idmap->{$genei}{$genej}= $idmap->{$genej}{$genei} = 1; # $pairscore; # flag value here if these are same-loc or paralogs ## pairscore: should sum for all exons but cant have -1,+1 combined ## should count when full gene is matched, versus single exon exists $idmap->{$genej}{num} or $idmap->{$genej}{num}= $geneidnum; # same as above my $commonid= $idmap->{$genei}{common} || $idmap->{$genej}{common} || $genei; $idmap->{$genej}{common}= $idmap->{$genei}{common}= $commonid; } exists $idmap->{$genei}{common} or $idmap->{$genei}{common} = $genei; } foreach my $genei (sort keys %pairscores) { my $iexon= scalar( @{$idmap->{$genei}{exons}} ) or next; my @genej= sort keys %{$pairscores{$genei}{-1}}; # dont care about +1 same loc #?? use count of pairscores to score for REPEATSKIP ? #?? need to worry about how many predictor duplicate calls also; really want to # count diff-locations to score transposon repeats foreach my $genej (@genej) { my $jexon= $pairscores{$genei}{-1}{$genej}; if( $jexon/$iexon > 0.8 ) { #? minalign? $idmap->{$genei}{$genej}= $idmap->{$genej}{$genei} = -1; # flag diff loc, gene same } } } $ngene = scalar(keys %$idmap); prnote "exon_id_crossref exon in=$nxin; out=$nxout; ngene=$ngene; ngenepair=$ngenepair\n" if $debug; } sub scan_for_repeats { my($outblat)= @_; # blast8 table format expected my $sizemap = $IDmap->{sizemap}; my (%xrepeats, %grepeats, %repeatlist); open(F, $outblat) or die "processing $outblat"; while(){ # my ($qid)=split "\t"; chomp; my @v=split "\t"; my($qid,$ref,$pctid,$alen,$tb,$te,$eval,$bits)=@v[0,1,2,3,8,9,10,11]; ## use same score filters as exon_finder here if($eval > $MINEVAL) { next; } my $palign= ($sizemap && $sizemap->{$qid}) || $alen; $palign= $alen / $palign; if( $palign < $MINALIGN) { next; } my($geneid,$exnum)= split_exonid($qid, 1); my $qid1="$geneid.$exnum"; $xrepeats{$qid1}++; $grepeats{$geneid}{$exnum}++; } close(F); my @xids= sort{$xrepeats{$b} <=> $xrepeats{$a}} keys %xrepeats; foreach my $qid (@xids) { my $nrep= $xrepeats{$qid}; last if ($nrep < $REPEATS_MIN); # sorted by this my($geneid,$exnum,$xloc)= split_exonid($qid, 1); my @ex= sort keys %{$grepeats{$geneid}}; my $minrep= $nrep; foreach my $ex (@ex) { my $xrep= $grepeats{$geneid}{$ex}; $minrep= $xrep if($xrep < $minrep); } next if($minrep < $REPEATS_MIN); $repeatlist{$geneid}= $repeatlist{$qid}= $minrep; prnote "repeat $geneid nexon=".scalar(@ex)."; nrep=$nrep; minrep=$minrep; \n" if $debug>1; } return \%repeatlist; } =item exon_finder This is main algorithm in this package. It reads blat/blast results (in blast 8 table format) of exon matching to genome. It categorizes matches as equal/ontop of query exon, near to query exon, or far from Currently it handles only same-scaffold/chromosome equal/near/far. Input exons have known locations (as part of ID, from fasta_extract) for this. It combines in all alternate exon ids (from different gene models), and also identifies alt-equal, alt-near type matches. Results are printed as a table (v), or returned for further reduction (to gff). Genome___ GeneExonID______ OrigB OrigE AtB AtE Or EQ Eval AltIDs Genome = scaffold/genome name OrigB,E = original exon begin,end; AtB,E = found exon begin,end EQ = match location code (Orig equals At; near; far; alt-ID eq/near/far) Or = strand; Eval = e-value; GeneExonID/AltIDs = primary and alternate exon IDs (geneid.exonnum:begin-end format) Common usage will run blat/blast once, and reprocess results often (debugging at least). This should be usable with NCBI-Megablast (discontiguous) as well as Blat or like. (could separate out the blat/blast system call). =cut sub exon_finder { my($genomefa,$queryfa,$location,$flags)=@_; unless($genomefa && $queryfa && -f $genomefa && -f $queryfa) { prnote "Missing: genome $genomefa or query $queryfa\n"; return; } my $outblat= "$queryfa.blatf8"; my $outfind= "$queryfa.find"; # not used; note we could make 1nce and reuse but for idmap{alltable} my( %ab, @table, @alltable, %altids, %idmap, @groups, %ingroup); my( $doprint, $dosave)= (1,1); #?? $flags ||=""; $doprint=0 if $flags =~ /noprint/; $dosave=0 if $flags =~ /nosave/; prnote "exon_finder\n"; my $skipclass=""; if($ignoreclass && $ignoreclass =~ s/^-//) { $skipclass=$ignoreclass; prnote "Ignore class completely: $ignoreclass\n"; } elsif($ignoreclass) { prnote "Ignore class: $ignoreclass\n"; } %idmap=(); %altids=(); exon_fasta_ids($queryfa, \%idmap, \%altids, $skipclass); $IDmap= \%idmap; # global access my $sizemap= $idmap{sizemap}; #? my $filterexon= $idmap{filtertest}; $idmap{alltable}= \@alltable; # want this for missing primary exons ?? my (%extest); # debug check # $idmap{extest}= \%extest; #? ## FIXME: option for megablast, parameter choices my $cmd="$EXON_MATCH_CMD $genomefa $queryfa -out=blast8 $outblat"; if (!$force && -f $outblat) { prnote "reusing old $outblat\n"; prnote "$cmd\n"; } else { my $ok= dosystem($cmd); } unless(-f $outblat) { prnote "No blat result found: $outblat\n"; return; } ## find tandems strategy 1: find -2 near dupls, show all in region ? ## FIXME: some of these eq==0 matches to 1mary exon are == alt exon with error ## i.e. alt exon wasn't perfect match to 1st exon, show here as eq==0 to alt, w/ overlap ## fixed: add $ref/scaffold column to sort: -k2,2 ## fixed: add >bitscore column to sort : -k12,12nr my (%hits, %genehits); my ($naskip, $neskip, $nkept, $ninput, $nrepskip)= (0)x10; my $fixlocperl= '@v=split"\t"; ($b,$e)=@v[8,9]; if($b>$e){ @v[8,9]=($e,$b); $v[0].="-";} print join("\t",@v);'; #'@v=split"\t"; ($b,$e)=@v[8,9]; @v[8,9]=($e,$b) if($b>$e); print join("\t",@v);'; my $sortblast8= 'sort -k2,2 -k9,9n -k10,10nr -k12,12nr'; # genome.ref,b,e ; bitscore my($locref,$locstart,$locstop)=(0)x10; if($location) { ($locref,$locstart,$locstop)= split(/[:,\-\.]+/, $location); prnote "Location: $locref:$locstart-$locstop\n"; } prnote "minEval: $MINEVAL ; minAlign:$MINALIGN\n"; #? print in gff #?? cache this sorted output and re-read as available? #?? REPEATSKIP: prescan outblat for over-abundant matches == transposon repeat like? my $repeatlist= $REPEATSKIP ? scan_for_repeats($outblat) : {}; $idmap{repeatlist}= $repeatlist if(%$repeatlist); # want this ?? print "# EQ_legend: ", join( ", ", map{ $_."=".$EQlegend{$_}} sort{$a<=>$b}keys %EQlegend),"\n" . "#".join("\t",qw(Genome___ GeneExonID______ OrigB OrigE AtB AtE Or EQ Eval AltIDs)),"\n" if($doprint); prnote "cat $outblat | perl -ne '$fixlocperl' | $sortblast8 | exon_finder\n" if $debug; open(F, "cat $outblat | perl -ne '$fixlocperl' | $sortblast8 |") or die "processing $outblat"; while(){ chomp; my @v=split"\t"; my($qid,$ref,$pctid,$alen,$tb,$te,$eval,$bits)=@v[0,1,2,3,8,9,10,11]; my $qid0= $qid; my $strand= ($qid =~ s/-$//) ? "-" : "+"; my $wantloc=1; if($locstop>0) { ## be careful here; may want some below checks done before skipping location next unless($ref eq $locref); # Ok always? $wantloc= _isoverlap($tb,$te,$locstart,$locstop,"XXlocXX"); # next if($wantloc==0); # ** defer to later; after _isnear() checks } next if( FILTER_DATA_DUPLICATES && $filterexon->{$qid} ); next if( $skipclass && $qid =~ m/$skipclass/ ); $ninput++ if ($wantloc); my $alt= $altids{$qid}; # || "."; # NOTE: qid and altids both use 'geneid.xnum:location' $alt =~ s/pm,// if($alt); # FIXME: drop partial match pointer?? my @alt= ($alt) ? split(",",$alt) : (); $alt ||= ""; my($qb,$qe); ($qid,$qb,$qe)=split(/[:-]/,$qid); # split geneid.exum from :location b-e ($qb,$qe)=($qe,$qb) if($qb>$qe); ($tb,$te)=($te,$tb) if($tb>$te); ### already done in sort ** NEED for strand info ** ?????? my $eq= _isoverlap($tb,$te,$qb,$qe,$qid); # ($qb<=$te && $qe>=$tb)? 1 : 0; #Wait# $eq= 9 if($REPEATSKIP > 1 && exists $repeatlist->{$qid}); ## $idmap{numrepeats}{$qid} my $isnear=0; #? always test ## can,should we adjust neardist if exons are in cluster of tandems? $isnear= ($eq == 0) ? _isnear($tb,$te,$qb,$qe,$qid) : 0; my $tablerow= [$ref,$qid,$qb,$qe,$tb,$te,$strand,$eq,$eval,$alt]; ## eq changes; alt can change push(@alltable, $tablerow); # $alltable[$ninput]= $tablerow; if($eval > $MINEVAL) { $neskip++ if($wantloc); next; } # FIXME: MINALIGN is removing valid exon matches broken into HSPs # e.g. 1-exon gene matches nearby over 90% of size, but in 7 HSPs # need to look at all close exon HSPs, qb,qe range as per prothsp mapping my $palign= ($sizemap && $sizemap->{$qid}) || $alen; $palign= $alen / $palign; if( $palign < $MINALIGN) { $naskip++ if($wantloc); next; } =item clean up eq grouping: ** how to handle the many possible alt id values? ** maybe better return list of target eq values for each qid,altid ( 1/at, 0/far, -2/near, -1/skip ) ** question: are there any quality exons (eval,pctalign) that do not have their own input row, but are listed as alt. ids with diff locations? -- check input exon.fasta, blat result 1a. main query qid exon is at (overlap) target (eq == 1; but for 2b??) 1b. query qid is near target (aside from any alternate exon ids) (eq == -2 for 2c,2b ; eq == 2 for 2a) 1c. query qid is far from target (eq == 0 for 2c; eq == 6? for 2a; eq == -3 for 2b) 2a. alternate exon id(s) is at target : ignore if 1a.(same exon), mark otherwise, but also need to know 1b (near-main) vs 1c (far-main) (eq == 2 for 1b; eq == 6 for 1c ?) 2b. alt id is near target : mark unless 1b? == same location (same exon) (eq == 5 for 1a ; eq == -3 for 1c; ?) 2c. alt is far : ignore for any 1. value ? (may be same exon as 1, or other far match) 3. add +9 == transposon-like overabundant repeats =cut # check altids for eq >> known exon locs my ($altnear_id,$alteq_id)=('') x 9; my $altnear=0; my %alteq=(); my %alteqc=(); foreach my $aid0 (@alt) { next if($aid0 =~ /^pm$/); my($aid,$ab,$ae)= split(/[:-]/,$aid0); next unless(defined $ae); ($ab,$ae)=($ae,$ab) if($ab>$ae); #? save $astrand ? my $aeq= _isoverlap($tb,$te,$ab,$ae,$aid); if($REPEATSKIP > 1 && exists $repeatlist->{$aid}) { $aeq= 9; } #Wait: $eq= 9; $idmap{numrepeats}{$aid}) ## as below, need to know if alt aid is near/far from main qid ## can we ever have alt-is-near-target and alt-is-near-main but at 3rd location between? ## yes, in tandem regions .. but alt can be at/near target and far from main also if($aeq != 0) { #Wait# $eq= 2 unless($eq>0 || $aeq <= 0 || $aeq == 9); $alteq_id=$aid; $alt =~ s/$aid0/\*$aid0/; # mark all matches #? boost value of eq==2 in calcs .. this can be tandem key } else { # update: want to mark all is-near alt-ids for collation # but see below +5, +6 tests we rejected before # BUT distinguish target tb,te near-to-alt FROM qid-main-near-to-alt # i.e. alt match needs to be marked near-qid or far-qid if( _isnear($tb,$te,$ab,$ae,$aid)) { $altnear= 1; $altnear_id= $aid; $aeq= -2; #Wait# $eq= ($eq == 1) ? 5 #? drop this change to eq? # # : ($eq == 2) ? 6 # : -3; $alt =~ s/$aid0/\@$aid0/; } } $alteq{$aid0}= $aeq; $alteqc{$aeq}++; $alt =~ s/$aid0/$aid0\=$aeq/; } # do we want to add this into long list? unless($alt =~ m/$qid0/) { my $eq2= $eq == 1 ? $eq : ($isnear) ? -2 : $eq; $alt.= ",$qid0\=$eq2"; } $tablerow->[9]= $alt; # update $alt changes here? ## also, keep list of $eq, $alteq_id and screen out ! $eq inside these matches ## should be sorted to allow if ($eq > 0) { my $loc= [$qb,$qe,$qid0,$eq]; ## need to add $alt locs from above ?? my @bins= (int($qb/$BINSIZE) .. int($qe/$BINSIZE)); foreach my $ib (@bins) { push @{$hits{$ib}}, $loc; } } else { #FIXME; overlap with a following Better exon not captured; sort by >bitscore? my $overlap=0; my $overlap_eq=0; my $overlap_id=""; my @bins= (int($tb/$BINSIZE) .. int($te/$BINSIZE)); BINS: foreach my $ib (@bins) { if($hits{$ib}) { foreach my $be (@{$hits{$ib}}) { my @be=@$be; ## {$hits{$ib}}; # fixme: list of @be my $heq= _isoverlap($tb,$te,$be[0],$be[1],"XXignoreXX"); if($heq>0) { $overlap= 1; my $oid= $be[2]; #?? $alt =~ s/$oid/\%$oid/; # mark all matches ?? # this may be lower qual, partial exon match # ignore if prior is much better: ($be1-be0 >> $te-te) # BUT this match passed above qual check, # keep as potential run of exons in related gene? my $blen= abs($be[1]-$be[0]); my $tlen= abs($te-$tb); #? if($blen>1 && $tlen/$blen > 0.80) { $overlap_id= $be[2]; $overlap_eq= $be[3]; last BINS; #? } } } } } if($overlap && $eq > -3) { #Cant Wait??# $eq= 3 if($eq == 0); # iginsideof = 1 ; eq == 3 low-qual overlap ignored by default unless(!$overlap_id || $overlap_id eq $qid || $overlap_id eq $altnear_id) { # done# $isnear= _isnear($tb,$te,$qb,$qe,$qid); #Cant Wait??# if($isnear||$altnear) { $eq= ($overlap_eq < 1) ? -2 : 4; } #? or $eq= -2; # done# if($altnear) { $alt =~ s/$altnear_id/\@$altnear_id/; } #? need to add overlap_id to this exons alt list ?? unless($alt =~ m/$overlap_id/) { $alt.= ",$overlap_id\=$overlap_eq"; } } } } ## ?? is this right?? if($eq <= 0 && @alt) { ## add to hits so we don't show all alt matches also my $loc= [$tb,$te,$qid0,$eq]; my @bins= (int($tb/$BINSIZE) .. int($te/$BINSIZE)); foreach my $ib (@bins) { push @{$hits{$ib}}, $loc; } # FIXME: should push(hits{ib},$loc) } ## if $eq == 0, want to check if near: need to look thru bins? ## if $eq == 3 (inside known), also check for _isnear and flag if these are known but tandems? ## Recode $eq now using values %alteqc ? ## eq here is -1/skip, 0/far, 1/same, 3/insideof # add -2/near, 4/"near+overlap", 5/"near+equal", NOT: 6/"near+alt+far",?? ## alteqc has -2/near, -1/skip, 0/far, 1/same if(1) { #........... revised eq codes ............. ## $altnear always true if any $alteqc{-2} ## add eq level for main exon eq = 1,2 but altid exon eq -2/near, 0/far, ?? $eq= 9 if($REPEATSKIP > 1 && (exists $repeatlist->{$qid} || exists $alteqc{9}) ); # ?? if($eq == 0) { if($isnear||$altnear) { $eq= -2; } elsif ($alteqc{1}) { $eq= 2; } } elsif($eq == 1) { $eq= 5 if($isnear||$altnear); } elsif($eq == 2) { # have we any altequal here? when aeq == 1, eq == 0 $eq= -2 if($isnear||$altnear); } elsif($eq == 3) { # insideof $eq= 4 if($isnear||$altnear); } elsif($eq == 4) { # near+overlap ## $eq= 4 if($isnear||$altnear); } elsif($eq == -1) { # skipid $eq= -2 if($altnear); #? } } else { #........... old eq codes ............. if($eq == 0) { # done # $isnear= _isnear($tb,$te,$qb,$qe,$qid); $eq= -2 if($isnear||$altnear); ## * distinguish these (isnear from altnear but alt-far-from qid) } elsif($eq == -1) { $isnear= 0; $eq= -2 if($altnear); #?? } ## FIXME: add here? score if repetitive; maybe cant do from exon altids $eq= 9 if($REPEATSKIP > 1 && (exists $repeatlist->{$qid} || exists $alteqc{9}) ); # ?? # # not what we want; _isnear same as _isoverlap where eq == 1 # } elsif ($eq == 1) { # $isnear= _isnear($tb,$te,$qb,$qe,$qid); # $eq= 5 if($isnear||$altnear); #?? # } elsif ($eq == 2) { # $isnear= _isnear($tb,$te,$qb,$qe,$qid); # $eq= 6 if($isnear||$altnear); #?? } # ................. eq codes ............... ## for $eq <= 0; mark regions of multi-exon matches (same geneid) ## need to change this to include eq == 1,2 if already have marked for eq -2,0,4 ... ## otherwise we miss 2,3 exon tandem genes that change from eq -2 to +1 as find 2nd my $didmark= 0; my $markt= ($eq <= -2 || $eq == 0 || $eq == 4 || $eq == 5) ? '+' : ''; $didmark += gene_mark( $qid, \%genehits, $markt, \$alt, $tb,$te); foreach my $aid0 (@alt) { next if($aid0 =~ /^pm$/); my $aeq= $alteq{$aid0} || -1; $markt = ($aeq <= -2 || $aeq == 0 || $aeq == 4 || $aeq == 5) ? '+' : ''; $didmark += gene_mark( $aid0, \%genehits, $markt, \$alt, $tb, $te, $qid); } $tablerow->[7]= $eq; $tablerow->[9]= $alt; # [$ref,$qid,$qb,$qe,$tb,$te,$strand,$eq,$eval,$alt] next if($wantloc==0); # defered skip from above; need _isnear() check before BEGIN { # define here so we can update easily %EQlegend= ( 0,"far", #was# 0,"unknown", 1,"equal", 2,"altequal", 3,"insideof", 4,"near+overlap", 5,"near+equal", 6,"near+alt+far", ## -9,"error", -1,"ignoredmatch", 9, "repeats", # fixme +4 scoring -2, "near", -3, "near+alt", -4, "far+alt", #< same as above 5,6 : use those? ); }; if($REPEATSKIP > 1 && $eq == 9) { $nrepskip++; # prnote "repeat: ",join("\t",@$tablerow),"\n" if($debug>2); #? next; } unless( $eval >= $MINEVAL || ($eq == 0 && $igunequal) || ($eq == -4 && $igunequal) || ($eq == 1 && $igequal) || ($eq == 2 && $igaltequal) || ($eq == 3 && $iginsideof) || ($eq == 4 && $iginnear) || ($eq == -2 && $ignear) || ($eq == -3 && $ignear) || ($eq == 5 && $ignear) || ($eq == -1 && $igclass) ) { push(@table, $tablerow) if $dosave; print join("\t",@$tablerow),"\n" if $doprint; $nkept++; # my $pfmt= sprintf("%8.1e",$eval); #? same as before '7.8e-63' '2.8e-106' # push(@table, [$ref,$qid,$qb,$qe,$tb,$te,$strand,$eq,$eval,$alt]) if($dosave); # print join("\t",$ref,$qid,$qb,$qe,$tb,$te,$strand,$eq,sprintf("%8.1e",$eval),$alt),"\n" if($doprint); } } close(F); prnote "exon_finder: input=$ninput; kept=$nkept; skip-eval: $neskip; skip-align: $naskip; skip-repeats: $nrepskip \n"; # dump_altids("exon_finder", \%idmap, \%altids, \%extest) # if($debug && $action =~ /test/); # test only.. return (\@table, \%idmap); } sub gene_mark { my($gid, $genehits, $markt, $altref, $gb, $ge, $gmainid)= @_; my $didmark= 0; my $minexonmark= $MINEXON_GENEMARK; my($geneid,$exnum,$xloc)= split_exonid($gid, 1); if($xloc) { ($gb,$ge)= split /[-]/, $xloc; } $exnum ||= 1; if($gmainid) { ## markalways= $IDmap->{$genei}{$genej} and not sameloc(genei,genej) my($genei,$mexnum,$mxloc)= split_exonid($gmainid, 1); if( $genei ne $geneid && exists $IDmap->{$genei}{$geneid} && $IDmap->{$genei}{$geneid} < 0 # flag for diff. loc ) { $markt= '+'; # we always want this one marked $minexonmark= 0; } } # adjust minexonmark for geneid == 1-exon gene: if($minexonmark>1) { my $exonids = $IDmap->{$geneid}{exons} || []; $minexonmark=1 if(@$exonids == 1); } unless( $markt ) { $markt= $genehits->{$geneid}{mark} || '#'; } $genehits->{$geneid}{mark}= $markt; my $nexons= 0; my @bins= (int($gb/$GENEBINSIZE) .. int($ge/$GENEBINSIZE)); foreach my $ib (@bins) { $genehits->{$geneid}{$ib}{$exnum}++; $nexons += scalar(keys %{$genehits->{$geneid}{$ib}}); # mark all eq types as % or ^, then change to + if find eq -2,0,4 among them if($nexons >= $minexonmark) { ## pre-mark exons 1,2 if get here $nexons=3 if($nexons > 3); # dont need more than 3 marks.. my $mark = ($markt) x $nexons; unless($$altref =~ s/[\+\#]*$gid/$mark$gid/){ $$altref =~ s/$/,$mark$gid/; } $didmark++; } } return $didmark; # also $geneid ? also $mark? } sub tandy_gffhead { my($exon_table, $genomefa, $queryfa, $location, $idmap)=@_; print "##gff-version 3\n"; print "# $NAME-version $VERSION\n"; print "# find unlocated genes in genome with exon-collection\n"; print "# query: $queryfa; genome: $genomefa\n"; print "# Location: $location\n" if($location); print "# Eval-min: $MINEVAL; Align-min: $MINALIGN\n"; print "# source-legend: ", join( ", ", map{ my $s= $EQlegend{$_}; $s =~ s/\W/_/g; "tandy.".$s; } sort{$a<=>$b} keys %EQlegend),"\n\n"; } =item tandem algo2 see tandem1.pm sub printgenes_gff sub printgenes2_gff sub _exlocsort sub tandem_gff -- not quite good enough; foreach my $ib (@bins) { store $genes{$geneid}{$ib} , $geneloc{$geneid}{$ib} } my %bins= map{ $_,1 } @bins; ## need to track back before current @bins foreach my $ib ( sort{$a<=>$b} keys %bin_save ) { next if( exists $bins{$ib} || $ib > $b - $GENEBINSIZE ); printgenes2_gff( \%genes, \%geneloc, $ib); delete $bin_save{$ib}; } map { $bin_save{$_}++; } @bins; ## finally: foreach my $ib ( sort{$a<=>$b} keys %bin_save ) { printgenes2_gff( \%genes, \%geneloc, $ib); } =cut =item tandem algo3 1. find interesting ++ marked exon/gene 2. collect all exons of ++marked in region i - 100 .. i + 200 or such save as geneid->@exons; this should include exons of tandem dupls. 3. add in same/almost same exon locations for more geneids to add common id redo step 2. to collect new geneid exons in region 4. segregate exons to gene models in tandem region (how??) - using exon.nums from ids, break at change in exon.num sequence? 1,2,3,4,5 | 2,3,4,5 - using max intron size? max/ave of step b/n base exons? 5. mark as done all gene/exons in region and move on (step 1) =cut =item tandem_exonerate for input table like this, create 'cdna' and use exonerate to locate fully on genome? #table = # Genome___ GeneExonID______ OrigB OrigE AtB AtE EQ Eval AltIDs scaffold_4 Dappu1_FM5_234437.8 663878 664019 651718 651855 -2 5.0e-53 . ** putative rev-tandem; 6 of 8 exon matches scaffold_4 Dappu1_FM5_234437.4 662308 662457 653140 653225 -2 3.0e-38 . scaffold_4 Dappu1_FM5_234437.3 661775 662013 653520 653745 -2 2.3e-103 . scaffold_4 Dappu1_FM5_234437.2 661649 661752 653776 653879 -2 4.1e-51 . scaffold_4 Dappu1_FM5_234437.1 661211 661538 653988 654116 -2 1.5e-58 . scaffold_4 Dappu1_FM5_234437.1 661211 661538 654118 654319 -2 3.6e-85 . =cut sub tandem_exonerate { my($exon_table, $genomefa, $queryfa, $location)=@_; my $regionfile= "$queryfa.generegion"; #?? my $cdnafile= "$queryfa.cdnas"; prnote "tandem_exonerate: making inputs $regionfile, $cdnafile\n"; # if $debug my (%genes,%geneloc); foreach my $ex (@$exon_table) { my($ref,$qid,$qb,$qe,$b,$e,$strand,$eq,$p,$alt)= @$ex; #next unless($eq == -2 || $eq == 0); #?? screen here or not? do we want $eq == 0 also? next unless($eq <= 0); #?? screen here or not? do we want $eq == 0 also? # arg; putative gene may have exons from 2+ geneid's # need to separate by contiguous location .. ## xid may have location: ID=Dappu1_FM5_96539.2:169647-169896 ;; drop it my($geneid,$exnum)= $qid =~ m/^(.+)\.(\d+)/; $exnum ||= 1; #?? ## arg; need to worry about exon order, etc. not just random set of exons/gene my $exdna= get_dna( $genomefa, $ref, $qb, $qe); push(@{ $genes{$geneid}}, [$exnum, $exdna,$qid,$ref,$b,$e]); unless(exists $geneloc{$geneid}) { $geneloc{$geneid}=[$ref,$b,$e]; } else { $geneloc{$geneid}[1]= $b if( $b<$geneloc{$geneid}[1]); $geneloc{$geneid}[2]= $e if( $e>$geneloc{$geneid}[2]); } } open(GENOME,">$regionfile") or die "> $regionfile"; open(CDNA,">$cdnafile") or die "> $cdnafile"; foreach my $geneid (sort keys %genes) { my($gref,$gb,$ge)= @{ $geneloc{$geneid} }; ($gb,$ge)= ($gb-1000, $ge+1000); # expand some .. $NEARDIST ?? my $generegion= get_dna($genomefa, $gref, $gb, $ge); print GENOME ">${geneid}_region loc=$gref:$gb-$ge\n$generegion\n"; my @cdna=(); my $chead=""; my $cdna=""; my $exmiss="nnnnnn"; my @exons= @{ $genes{$geneid} }; foreach my $ex (@exons) { my($exnum,$exdna,$qid,$ref,$b,$e)= @$ex; $chead .= "loc.$exnum=$ref:$b-$e"; $cdna .= $exdna ."\n"; ## sample exnum is 8,4,3,2,1,1, ... 8 ## this is wrong ; input order is sorted right, but spacers are unknown # while( $cdna[$exnum-1] ) { $exnum++; } # $cdna[$exnum-1]= $exdna; # print CDNA ">${geneid}_exon.$exnum loc=$ref:$b-$e\n$exdna\n"; #?? want to try this way? ; exonerate wont stitch these into one alignment tho. } # do we write separate EST-like .fa instead of one (partial) cDNA ?? # foreach (@cdna) { $_= $exmiss unless($_); } # my $cdna= join("",@cdna); print CDNA ">${geneid}_cdna\n$cdna\n"; } close(GENOME); close(CDNA); prnote "tandem_exonerate: exonerate --bestn 1 --model cdna2genome $cdnafile $regionfile \n"; } =item blat reduce redundant exons This is a primary function, used following fasta_extract on a file of exons, to create a non-redundant set with listing of alternate exon IDs. Results distinguish near-perfect identities (a-size == b-size == match +/- 2 bases) from partial matches (e.g. short exon matching inside longer one). Output: fasta file $infasta.nr with alternate exon IDs appended to >header The current call is (BLAT_REDUCE_CMD): blat -fastMap $infasta $infasta -out=psl stdout test: blat -fastMap all_exons.fa all_exons.fa stdout | perl -ne '@v=split"\t";($a,$b)= sort @v[9,13]; $a{$a}++; $ab{$a}{$b}++; $ab{$b}{$a}++; END{ @a= reverse sort keys %ab; foreach $a (@a) { @bb= sort keys %{$ab{$a}}; @b=(); foreach (@bb) { push(@b,$_) if($ab{$a}{$_});} print join(" ",@b),"\n" if(@b); foreach $b (@b) { $ab{$a}{$b}=0; $ab{$b}{$a}=0; } } }' > ! common_exons.idlist2 # blat table cols (psl) match mismat xxx nnn qg qg tg tg ori Qname Qsize Qb Qe Tname Tsize Tb Te blkn blkxxx 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18.. =cut sub blat_reduce { my($infasta)=@_; my($outfasta)= "$infasta.nr"; my(%ab, %abpart, @allids, @pgroups, %inpgroup, @groups, %ingroup); prnote "grep >ids $infasta\n" if $debug; open(GREP, "grep '^>' $infasta|") or die "grep $infasta"; while(){ my($id)=m/>(\S+)/; push(@allids,$id) if($id); } close(GREP); my $RMIS=2; # should this be option? prnote "$BLAT_REDUCE_CMD $infasta $infasta -out=psl stdout\n" if $debug; open(BLAT, "$BLAT_REDUCE_CMD $infasta $infasta -out=psl stdout |") or die "$BLAT_REDUCE_CMD $infasta"; # psLayout version 3 while(){ my @v=split"\t"; next unless(/^\d+/ && @v > 13); my($a,$b)= sort @v[9,13]; # ids my($mat, $mis, $at,$ab,$ae, $bt,$bb,$be)= @v[0, 1, 10,11,12, 14,15,16]; # sizes #$abmat{$a}{$b}=$mat; $ablen{$a}=$at; $ablen{$b}= $bt; ## need to handle case where $at <<< $bt or vs; short vs long if($at >= $bt-$RMIS && $bt >= $at-$RMIS && $mat >= $at-$RMIS) { #? should use some %align, like 90% $ab{$a}{$b}= $mat; $ab{$b}{$a}= $mat; } else { ## keep all others ; if ($at >= $bt-$RMIS && $mat >= $bt-$RMIS) { # here $b is ~full in $a but may be <0 && $ab{$b}{$a}>0 ); $ab{$a}{$b}= $ab{$b}{$a} = 0; } if(@b>1) { push(@groups, \@b); $ngroup= @groups; @ingroup{@b}= ($ngroup) x scalar(@b); } } my $npgroup=0; foreach my $a (sort keys %abpart) { my @b=(); foreach my $b (sort keys %{$abpart{$a}}) { push(@b,$b) if( $abpart{$a}{$b} >0 ); $abpart{$a}{$b}= 0; } if(@b>1) { push(@pgroups, \@b); $npgroup= @pgroups; @inpgroup{@b}= ($npgroup) x scalar(@b); } } # my %singles= map{ $_ => 1 } grep {!exists $ingroup{$_}} @allids; my %singles= map{ $_ => 1 } grep {! $ingroup{$_}} @allids; my $ninput = scalar(@allids); my $ngrouped = scalar (keys %ingroup); my $nsingle = scalar (keys %singles); my ($nprint, $print)=(0)x10; my $pgrouped = scalar (keys %inpgroup); prnote "reduce $infasta n=$ninput to $ngroup [w/$ngrouped, part=$pgrouped] + $nsingle [x1] \n" if $debug; open(FA,"$infasta"); # open(OUT,">$outfasta") or die ">$outfasta"; # ? write to stdout instead while(){ if(/^>(\S+)/){ my $id=$1; if(my $ig= $ingroup{$id}) { my $h= join ",", @{$groups[$ig-1]}; #? do want $id in main alt list? # $h =~ s/$id[,]?//; $print=0; if($h) { s/$/; altids=$h/ ; $print=1; } unless($REDUCE_GROUPONLY) { $groups[$ig-1]= []; } # done group } else { $print= exists $singles{$id}; # should be always true here } ## add in $inpgroup ; $pgroups[] here for partial/subset matches if(my $ig= $inpgroup{$id}) { my $h= join ",", @{$pgroups[$ig-1]}; $h =~ s/$id[,]?//; if($h) { unless(s/(altids=[^\s;]+)/$1,pm,$h/) { s/$/; altids=pm,$h/ ; } } } $nprint++ if $print; } print $_ if($print); } # close(OUT); close(FA); prnote "blat_reduce $infasta\n"; prnote "blat_reduce in ids=$ninput; out ids=$nprint; grouped=$ngrouped [$ngroup]; partial=$pgrouped; single=$nsingle\n"; # return ($outfasta); } sub overlap_count { my($seg)= @_; print join("\t", qw(ref type gene_id comp overlaps equals distinct)),"\n"; for my $reference (@typeset) { my @feat = $seg->features(-type => $typeset{$reference}); for my $feat (@feat) { for my $typeset (@typeset) { ##next if ($typeset eq $reference); # dgg my @overlaps = $feat->overlapping_features(-type => $typeset{$typeset}); my @equals; # dgg my @distinct; if (@overlaps) { @distinct = $overlaps[0]; for (my $i = 1 ; $i < @overlaps ; $i++) { my $b = $overlaps[$i]; ## print "b=",$b->display_id,",",$b->type,"\n"; #DEBUG my $overlapped = 0; for my $a (@distinct) { #?? should this be exons of $feat ### for my $a (map{$_->sub_SeqFeature()}@distinct) { #?? exons of $feat ### print "a=",$a->display_id,",",$a->type,"\n"; #DEBUG if ($a->equals($b)) { push @equals, $b; $overlapped = 1; last; } elsif ($a->overlaps($b)) { $a->extend($b); # dgg what is this? sub union()? overlap_extent ? $overlapped = 1; last; } } unless ($overlapped) { push @distinct, $b; } } } my ($gname)= ($feat->attributes("Name"),$feat->display_id); my ($gtype)= $feat->type(); print join("\t", $reference, $gtype, $gname, #was: id, $typeset, scalar(@overlaps), scalar(@equals), scalar(@distinct)), "\n"; } } } } sub patch_feature_extend { local $^W = 0; eval <<'END'; use Bio::DB::GFF::RelSegment; sub Bio::DB::GFF::RelSegment::extend { my $self = shift; my($other,$so) = @_; return unless $other->isa('Bio::DB::GFF::RelSegment'); return if $self->abs_ref ne $other->abs_ref; my ($low,$high); foreach ($self,$other) { $low = $_->abs_start if !defined($low) or $low > $_->abs_start; $high = $_->abs_end if !defined($high) or $high < $_->abs_end; } $self->{start}= $self->_abs2rel($low); $self->{stop}= $self->_abs2rel($high); } END warn $@ if $@; } 1; __END__