#!/usr/bin/perl # slimblastxml.pl -- cut verbosity down for NCBI BLAST -m 7 xml output # slimblast7(); bl7to9(); sub slimblast7 { print "\n"; # header print "\n"; # header while(<>){ next if (m,^<\?xml, || m,^<\!DOCTYPE,); if (m,<(BlastOutput_reference|BlastOutput_param|Iteration_stat),) { $skipto= $1; } if ($skipto) { $skipto='' if (m,,); next; } print; } print "\n"; # trailer } #---- also convert to -m 9 table # BLASTX 2.2.10 [Oct-19-2004] # Query: contig_0 # Database: dmel-blast/dmel-translation # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, # q. start, q. end, s. start, s. end, e-value, bit score # BlastOutput_version # BlastOutput_db = Database: # BlastOutput_query-def = Query sub bl7to9 { my ($blver,$db,$qfull,$query,$subj,$ident,$align,$mis,$gap,$qstart,$qend,$sstart,$send,$eval,$bits); while(<>){ next if (m,^<\?xml, || m,^<\!DOCTYPE,); if (m,([^<]+),) { $blver=$1; } elsif (m,([^<]+),) { $db=$1; } elsif (m,([^<]+),) { $query=$1; $qfull= $query; $query =~ s/\s+.*//; # chomp to ID } elsif (m,([^<]+),) { $subj=$1; $subj =~ s/\s+.*//; # chomp to ID } elsif (m,([^<]+),) { $bits=$1; } elsif (m,([^<]+),) { $eval=$1; } elsif (m,([^<]+),) { $qstart=$1; } elsif (m,([^<]+),) { $qend=$1; } elsif (m,([^<]+),) { $sstart=$1; } elsif (m,([^<]+),) { $send=$1; } elsif (m,([^<]+),) { $ident=$1; } elsif (m,([^<]+),) { $align=$1; } # dang have to count gaps as '---' in seqs if (m,<(Hsp_qseq|Hsp_hseq)>,) { $collect= $1; $colbuf=''; } if ($collect) { if (m,>?(.+),) { my @b= ($colbuf =~ m/[^-][-]/g); $gap += scalar(@b); $collect=''; } } if (m,,){ if (defined $bits) { $mis = $align - $ident; my $pident= sprintf("%6.2f",(100.0 * $ident/$align)); $eval= sprintf("%6.0g", $eval); $bits= sprintf("%6.3g", $bits); print join("\t", ($query,$subj,$pident,$align,$mis,$gap,$qstart,$qend,$sstart,$send,$eval,$bits)),"\n"; } ($ident,$align,$mis,$gap,$qstart,$qend,$sstart,$send,$eval,$bits)=(undef) x 10; } if (m,,){ $subj= undef; } } } =item $s="HRAS--LKMSLPTQPPDL--GGILAERLIG---DPKE"; @b= ($s =~ m/[^-][-]/g); # yes $n= ($s =~ tr/\-/-/c); # no counts all print "b=",scalar(@b),"; n=$n \n"; pct= ident/align; mis = ? gap = ? contig_0 CG4082-PA 85.94 448 63 2 59969 61312 1 428 0.0 771 contig_0 CG4082-PA 93.07 274 19 0 61441 62262 422 695 1e-158 504 contig_0 CG4082-PA 100.00 38 0 0 62327 62440 696 733 1e-158 81.3 contig_0 CG13654-PA 76.23 366 87 0 7796 8893 601 966 2e-166 588 contig_0 CG13654-PA 44.68 470 254 10 5765 7156 190 597 6e-95 350 contig_0 CG13654-PA 54.76 84 32 1 3945 4178 24 107 8e-33 93.6 contig_0 CG13654-PA 70.83 48 14 0 4951 5094 143 190 1e-13 80.5 contig_0 CG13654-PA 81.58 38 7 0 4234 4347 106 143 8e-33 71.6 contig_0 CG6554-PA 88.24 306 36 1 58252 57335 93 376 4e-147 523 contig_0 CG6554-PA 64.42 104 37 2 58811 58500 2 93 7e-27 124 CG4082-PA type=protein; loc=3R:join(6614387..6614538,6614593..6615703,66157 62..6616583,6616647..6616760); ID=CG4082-PA; name=Mcm5-PA; db_xref=FlyBase:FBpp0081756,GB_prot ein:AAF54557.1,FlyBase:FBgn0017577,Gadfly:CG4082-PA; species=dmel; len=733 1 771.541 1991 0 59969 61312 1 428 2 385 410 448 =cut