#!/usr/bin/perl # overlapfilter.perl =item usage perl overlapfilter -act keep|drop|mark -mark=terepeat -overlaps terepeats.gff -itype=gff|blast -input stdin|tandy.gff > tandyfilt.gff apply identified/predicted TE repeat GFF to tandy or blast results generalized to filter or mark GFF for any GFF overlap data (protein & EST matches, repeats) gzcat *exons_tandy6j.gff.gz | perl $td/overlapfilter.perl \ -act mark -mark=proteinhsp -over *prot9-hsp.gff.gz -in stdin > dere_exons_tandy6jm.gff =cut use strict; use warnings; use Getopt::Long; use constant SAMEBASE => 0; # for _sameloc, slop allowed in loca == locb use constant { ACT_DROP=>1, ACT_KEEP=>2, ACT_MARK=>3, ACT_MARK_WITH_ID=>4, ACT_MARK_OVERBASE=> 5}; use constant { kOVERLAP=>1, kSAMELOC=>2, kNEARLOC=>3, kCUTOVER=>4 }; our $BINSIZE = 1000 ; #was# 5000; our $NEARDIST = 500; # was 15k; needs to be < BINSIZE our $debug=1; my ($overlaps,$overlaplist,$markidtype,$input,$itype,$action,$actid,$ok,$mark,$baseover); my ($overtype,$typeover,$sametypes,$pctover,$passtypes)= (kOVERLAP,"",1,0,""); my ($save_pctover,$save_baseover)=(0,0); my ($save_miss,$n_overlaps)=(0,0); my $optok= GetOptions( "overlaps=s", \$overlaps, "typeover=s", \$typeover, "pctover=i", \$pctover, "NEARDIST=i", \$NEARDIST, "BINSIZE=i", \$BINSIZE, "mark=s", \$mark, "input=s", \$input, "itype=s", \$itype, "midtype=s", \$markidtype, # return not ID= but other attribute or score "action=s", \$action, "passtypes=s", \$passtypes, "debug!", \$debug, "baseover!", \$baseover, # show base,pct overlap counts per item and total ); die " usage: perl overlapfilter -act keep|drop|mark|markid|markbase -mark=terepeat -overlaps terepeats.gff -typeover overlap|sameloc|samefeat|near -pctover=0 (for overlap type, min % overlap) -baseover : show overlap base count -neardist=$NEARDIST (for near type, base distance) -passtypes='CDS' | 'mRNA,CDS' | 'gene.Gnomon,gene.Chainer' : act on these types only -midtype=ID|Name|score|source|... (for markid, attribute or score to mark, ID default) -itype=gff|blast -input stdin|tandy.gff > tandyfilt.gff " unless($optok and $action and (-f $overlaps or -f $input)); if($action =~ /^cut/) { $typeover= "cut"; $action= "mark"; } ## for matching IDs b/n updates; need also match type: gene/mRNA/exon/CDS/... $mark ||= "terepeat"; $itype ||= "gff"; $actid= ($action =~ /keep/) ? ACT_KEEP : ($action =~ /mark/) ? ACT_MARK : ACT_DROP; $actid= ACT_MARK_WITH_ID if($actid == ACT_MARK && $action =~ /id/i); $actid= ACT_MARK_OVERBASE if($actid == ACT_MARK && $action =~ /base/i); $overtype= ($typeover =~ /same/) ? kSAMELOC : ($typeover =~ /near/) ? kNEARLOC : ($typeover =~ /cut/) ? kCUTOVER : kOVERLAP; $sametypes= ($typeover =~ /feat/) ? 1 : 0; #?? $pctover= $pctover/100.0 if($pctover); $passtypes =~ s/[,]/\|/g; $BINSIZE= int($NEARDIST*2) if($overtype == kNEARLOC and $NEARDIST > $BINSIZE); # allow -overlap gff == stdin if -input is a file? ##local *R; my $ovh; ##= *OVR; #$ok = ($overlaps =~ /.gz$/) ? open(R,"gunzip -c $overlaps |") : open(R,$overlaps); if($overlaps =~ /.gz$/) { $ok= open(OVR,"gunzip -c $overlaps |"); $ovh= *OVR; } elsif($overlaps =~ /^(stdin|-)/) { $ovh= *STDIN; $ok=1; } else { $ok= open(OVR,$overlaps); $ovh= *OVR; } die "bad -overlaps=$overlaps" unless($ok); $overlaplist= collect_overlaps($ovh); close($ovh); my $inh= *STDIN; $ok = ($input =~ /.gz$/) ? open($inh,"gunzip -c $input |") : ($input =~ /^(stdin|-)/) ? $inh= *STDIN : open($inh,$input); die "bad -input=$input" unless($ok); my($sum_baseover,$sum_pctover,$sum_basetotal,$sum_pcttotal,$sum_miss)=(0) x 9; my $nin=0; my $nr=0; if($itype =~ /blast/i) { $nr= filter_blast($inh); } else { $nr= filter_gff($inh); } warn"#overlaps found=$nr\n" if $debug; $sum_basetotal||=1; $sum_pcttotal||=1; warn "# base statistics: overlaps n=$nr , input n=$nin , overset n=$n_overlaps\n" if ($baseover || $debug); if ($baseover) { $nr||= 1; my $ave=sprintf("ave_baseover=%.3f, ave_pctover=%.3f, ave_miss=%.3f", $sum_baseover/$sum_basetotal,$sum_pctover/$sum_pcttotal, $sum_miss/$nr); warn "# $ave # sum_baseover=$sum_baseover, sum_pctover=$sum_pctover, sum_miss=$sum_miss # sum_basetotal=$sum_basetotal, sum_pcttotal=$sum_pcttotal\n" } ; #.................. sub filter_gff { my($inh)= @_; my $nr=0; while(<$inh>){ unless(/^\w/){ next if(/^(#n |$)/); print and next; } my $line=$_; my($ref,$src,$typ,$tb,$te,$tp,$to,$tx,$tattr)= split"\t"; if($passtypes and "$typ.$src" !~ m/$passtypes/) { print; next; } # pass other types # ^ do in both filter_gff and in collect_overlaps **?? $nin++; $save_baseover= $save_pctover= 0; my $isover= ($overtype == kSAMELOC) ? sameloc($ref,$tb,$te,$to,$typ) : ($overtype == kNEARLOC) ? nearloc($ref,$tb,$te) : ($overtype == kCUTOVER) ? cut_overlaps($ref,$tb,$te) : overlaps($ref,$tb,$te); if($isover) { # now $isover == ID of match $nr++; if($overtype == kCUTOVER) { # $isover == \@nonoverlap locs my $ci=1; my $cn= ($tattr=~/\S/) ? ";ci" : "ci"; foreach my $be (@$isover) { my($nb,$ne)= @$be; my $ns= 1+$ne-$nb; (my $nline= $line) =~ s/\t$tb\t$te/\t$nb\t$ne/; $nline =~ s/$/$cn=$nr.$ci;clen=$ns/; $ci++; print $nline; } next; } next if($actid == ACT_DROP); if($actid == ACT_MARK) { s/$/;$mark=1/; } elsif($actid == ACT_MARK_WITH_ID) { s/$/;$mark=$isover/ ; } elsif($actid == ACT_MARK_OVERBASE) { s/$/;$mark=$save_baseover/ ; } #? save_pctover $sum_baseover += $save_baseover; $sum_pctover += $save_pctover; $sum_miss += $save_miss; } else { next if($actid == ACT_KEEP); } print; } return $nr; } sub filter_blast # ncbi format=8,9 blast table { my($inh)= @_; my $nr=0; while(<$inh>){ unless(/^\w/){ print and next; } my($qid,$ref,$pid,$align,$xa,$xb,$qb,$qe,$tb,$te,@bmore)= split"\t"; # fix blast loc swap for orient my $to='+'; ($tb,$te,$to)= ($te,$tb,'-') if($tb>$te); my $typ="HSP"; $nin++; $save_baseover= $save_pctover= 0; my $isover= ($overtype == kSAMELOC) ? sameloc($ref,$tb,$te,$to,$typ) #($ref,$tb,$te,$to,$typ) : ($overtype == kNEARLOC) ? nearloc($ref,$tb,$te) : overlaps($ref,$tb,$te); if($isover) { $nr++; next if($actid == ACT_DROP); if($actid == ACT_MARK) { s/$/\t$mark=1/; } elsif($actid == ACT_MARK_WITH_ID) { s/$/\t$mark=$isover/;} elsif($actid == ACT_MARK_OVERBASE) { s/$/\t$mark=$save_baseover/ ; } $sum_baseover += $save_baseover; $sum_pctover += $save_pctover; $sum_miss += $save_miss; } else { next if($actid == ACT_KEEP); } print; } return $nr; } my $warns=0; # convert to array handling? _max(@list) sub _min { return ($_[1] < $_[0]) ? $_[1] : $_[0]; } sub _max { return ($_[1] > $_[0]) ? $_[1] : $_[0]; } sub overlaps { my($ref,$tb,$te)= @_; my @lid; $save_baseover= $save_pctover= 0; $sum_basetotal += $te - $tb + 1; $sum_pcttotal += 1; $save_miss = 0; my($missb, $misse)=(undef,undef); my %didid=(); return 0 unless($overlaplist->{$ref}); # warn join",",("overlap",$ref,$tb,$te),"\n" if $debug and $warns++<10; my @bins= (int($tb/$BINSIZE) .. int($te/$BINSIZE)); foreach my $ib (@bins) { $overlaplist->{$ref}{$ib} or next; my @locs= @{$overlaplist->{$ref}{$ib}}; foreach my $rloc (@locs) { my ($lb,$le,$lid,$oid)= @{$rloc}[0,1,3,6]; next if($didid{$oid.$lb.$le}++); #* bad# next if($didid{$lid}++); # .. this didid is bad as we have some features w/ same ids.. # fix in collect overlaps: separate fields for true id, markid my $over= ($tb <= $le && $te >= $lb) ? 1 : 0; # add option to screen out trival 5% UTR overlaps .. pctoverlap=i if($over and $pctover) { # my $maxo= _max( $le - $tb, $te - $lb); # wrong my ($bb,$be)= ( _max($tb,$lb), _min($te,$le) ); my $maxo= abs($be - $bb); my $leno= _min( abs($le - $lb), abs($te - $tb)) || 1; my $pover= $maxo/$leno; $over = 0 if $pover < $pctover; #?? add save_basemiss_in, save_basemiss_over # $save_basemiss = _max($save_basemiss, $bb - $tb, $bb - $lb, $te - $be, $le - $be); if($over) { $missb= (defined $missb) ? _min($missb, abs($tb - $lb)) : abs($tb - $lb); $misse= (defined $misse) ? _min($misse, abs($te - $le)) : abs($te - $le); # $save_baseover = $maxo if($maxo > $save_baseover); # FIX, really want all non-duplic overlaps # $save_pctover = $pover if($pover > $save_pctover); # FIX # which? $save_baseover += $maxo; # FIX, really want all non-duplic overlaps $save_pctover += $pover; # FIX } } push @lid, $lid if($over); # ^^ collect *all* overlap ids as list to return } } $missb ||= 0; $misse ||= 0; $save_miss = $missb + $misse; if(@lid) { my %lid= map{$_,1}@lid; return join",", sort keys %lid; } return 0; } sub _sort_over { # @[b,e] min-b, max-e return ($a->[0] <=> $b->[0]) || ($b->[1] <=> $a->[1]); } sub cut_overlaps { my($ref,$tb,$te)= @_; my @lid; $save_baseover= $save_pctover= 0; # $sum_basetotal += $te - $tb + 1; # $sum_pcttotal += 1; return 0 unless($overlaplist->{$ref}); my @bins= (int($tb/$BINSIZE) .. int($te/$BINSIZE)); my @overs=(); foreach my $ib (@bins) { $overlaplist->{$ref}{$ib} or next; my @locs= @{$overlaplist->{$ref}{$ib}}; foreach my $rloc (@locs) { my ($lb,$le,$lid)= @{$rloc}[0,1,3]; my $over= ($tb <= $le && $te >= $lb) ? 1 : 0; # add option to screen out trival 5% UTR overlaps .. pctoverlap=i if($over and $pctover) { # my $maxo= _max( $le - $tb, $te - $lb); # wrong my ($bb,$be)= ( _max($tb,$lb), _min($te,$le) ); my $maxo= abs($be - $bb); $save_baseover = $maxo if($maxo > $save_baseover); # FIX, really want all non-duplic overlaps my $leno= _min( abs($le - $lb), abs($te - $tb)) || 1; my $pover= $maxo/$leno; $save_pctover = $pover if($pover > $save_pctover); # FIX $over = 0 if $pover < $pctover; } push @overs, [$lb,$le] if ($over); } } my @opens=(); if(@overs) { my($bb,$be)= ($tb,$te); @overs= sort _sort_over @overs; foreach my $ab (@overs) { my($lb,$le)= @$ab; if($le < $bb) { next; } elsif($lb <= $bb && $le > $bb) { $bb= $le+1; } elsif($lb < $be) { # && $le > $be my($b1,$e1)= ($bb,$lb-1); push @opens, [$b1,$e1]; $bb= $le+1; } elsif($lb > $te) { last; } #? last if($bb >= $te); } return \@opens; } else { return 0; ## [[$tb,$te]]; } } sub nearloc { my($ref,$tb,$te)= @_; my @lid; return 0 unless($overlaplist->{$ref}); # warn join",",("nearloc",$ref,$tb,$te),"\n" if $debug and $warns++<10; my $tm= int(($tb+$te)/2); # change to min-distance from ends my @bins= (int($tb/$BINSIZE) .. int($te/$BINSIZE)); foreach my $ib (@bins) { $overlaplist->{$ref}{$ib} or next; my @locs= @{$overlaplist->{$ref}{$ib}}; foreach my $rloc (@locs) { my ($lb,$le,$lid)= @{$rloc}[0,1,3]; # my $lm= int(($lb+$le)/2); # push @lid, $lid if (abs($lm - $tm) < $NEARDIST); my $over= ($tb <= $le && $te >= $lb) ? 1 : 0; if($over and $pctover) { my ($bb,$be)= ( _max($tb,$lb), _min($te,$le) ); my $maxo= abs($be - $bb); my $leno= _min( abs($le - $lb), abs($te - $tb)) || 1; $over = 0 if $maxo/$leno < $pctover; } next if($over); # skip any overlaps ?? pctover? my $bd= abs($tb - $le); my $ed= abs($lb - $te); my $mind= ($ed < $bd) ? $ed : $bd; push @lid, $lid if ($mind < $NEARDIST); } } if(@lid) { my %lid= map{$_,1}@lid; return join",", sort keys %lid; } return 0; } sub sameloc { my($ref,$tb,$te,$to,$typ)= @_; my @lid; return 0 unless($overlaplist->{$ref}); # warn join",",("sameloc",$ref,$tb,$te,$to,$typ),"\n" if $debug and $warns++<10; $to ||= '+'; ($tb,$te,$to)= ($te,$tb,'-') if($tb>$te); my @bins= (int($tb/$BINSIZE) .. int($te/$BINSIZE)); foreach my $ib (@bins) { $overlaplist->{$ref}{$ib} or next; my @locs= @{$overlaplist->{$ref}{$ib}}; foreach my $rloc (@locs) { my ($lb,$le,$lid,$lo,$ltyp)= @{$rloc}[0,1,3,4,5]; # [$tb,$te,$ref,$gid,$to,$typ] # also need match ft types?, orient, next if($sametypes and not($typ eq $ltyp and $to eq $lo)); #? push @lid, $lid if(abs($tb-$lb) <= SAMEBASE and abs($te-$le) <= SAMEBASE); } } if(@lid) { my %lid= map{$_,1}@lid; return join",", sort keys %lid; } return 0; } sub collect_overlaps { my($gff)= @_; my %overlaps=(); my $nr=0; while(<$gff>){ next unless(/^\w/); chomp; my($ref,$src,$typ,$tb,$te,$tp,$to,@gffmore)= split"\t"; if($passtypes and "$typ.$src" !~ m/$passtypes/) { next; } # pass other types # ^ do in both filter_gff and in collect_overlaps **?? $nr++; my $oid= "N".$nr; # save always for uniq feature test my($gid); # fixme: separate $gid from markid : want two fields, one for id tests if($markidtype) { if($markidtype =~ /^score/i) { $gid=$tp; } elsif($markidtype =~ /^source/i) { $gid=$src; } elsif($markidtype =~ /^type/i) { $gid=$typ; } elsif($gffmore[-1] =~ m/\b$markidtype=([^;]+)/) { $gid=$1; } } else { if($gffmore[-1] =~ m/\bID=([^;]+)/) { $gid=$1; } elsif($gffmore[-1] =~ m/\bParent=([^;]+)/) { $gid=$1; } } unless(defined $gid) { $gid = $oid; } my $rloc= [$tb,$te,$ref,$gid,$to,$typ,$oid]; # change to string; save mem my @bins= (int($tb/$BINSIZE) .. int($te/$BINSIZE)); foreach my $ib (@bins) { push( @{$overlaps{$ref}{$ib}}, $rloc); } } $n_overlaps=$nr; # global warn"#collect_overlaps=$nr\n" if $debug; return \%overlaps; } # sub _isoverlap { # my($gb,$ge, $qb,$qe)= @_; # return ($gb <= $qe && $ge >= $qb) ? 1 : 0; # }