#!/usr/bin/perl
# slimblastxml.pl -- cut verbosity down for NCBI BLAST -m 7 xml output
# slimblast7();
bl7to9();
sub slimblast7
{
print "\n"; # header
print "\n"; # header
while(<>){
next if (m,^<\?xml, || m,^<\!DOCTYPE,);
if (m,<(BlastOutput_reference|BlastOutput_param|Iteration_stat),) { $skipto= $1; }
if ($skipto) {
$skipto='' if (m,$skipto>,);
next;
}
print;
}
print "\n"; # trailer
}
#---- also convert to -m 9 table
# BLASTX 2.2.10 [Oct-19-2004]
# Query: contig_0
# Database: dmel-blast/dmel-translation
# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings,
# q. start, q. end, s. start, s. end, e-value, bit score
# BlastOutput_version
# BlastOutput_db = Database:
# BlastOutput_query-def = Query
sub bl7to9
{
my ($blver,$db,$qfull,$query,$subj,$ident,$align,$mis,$gap,$qstart,$qend,$sstart,$send,$eval,$bits);
while(<>){
next if (m,^<\?xml, || m,^<\!DOCTYPE,);
if (m,([^<]+),) { $blver=$1; }
elsif (m,([^<]+),) { $db=$1; }
elsif (m,([^<]+),) {
$query=$1; $qfull= $query; $query =~ s/\s+.*//; # chomp to ID
}
elsif (m,([^<]+),) {
$subj=$1; $subj =~ s/\s+.*//; # chomp to ID
}
elsif (m,([^<]+),) { $bits=$1; }
elsif (m,([^<]+),) { $eval=$1; }
elsif (m,([^<]+),) { $qstart=$1; }
elsif (m,([^<]+),) { $qend=$1; }
elsif (m,([^<]+),) { $sstart=$1; }
elsif (m,([^<]+),) { $send=$1; }
elsif (m,([^<]+),) { $ident=$1; }
elsif (m,([^<]+),) { $align=$1; }
# dang have to count gaps as '---' in seqs
if (m,<(Hsp_qseq|Hsp_hseq)>,) { $collect= $1; $colbuf=''; }
if ($collect) {
if (m,>?(.+),) { $colbuf.=$1; }
if (m,$collect>,) {
my @b= ($colbuf =~ m/[^-][-]/g);
$gap += scalar(@b);
$collect='';
}
}
if (m,,){
if (defined $bits) {
$mis = $align - $ident;
my $pident= sprintf("%6.2f",(100.0 * $ident/$align));
$eval= sprintf("%6.0g", $eval);
$bits= sprintf("%6.3g", $bits);
print join("\t", ($query,$subj,$pident,$align,$mis,$gap,$qstart,$qend,$sstart,$send,$eval,$bits)),"\n";
}
($ident,$align,$mis,$gap,$qstart,$qend,$sstart,$send,$eval,$bits)=(undef) x 10;
}
if (m,,){
$subj= undef;
}
}
}
=item
$s="HRAS--LKMSLPTQPPDL--GGILAERLIG---DPKE";
@b= ($s =~ m/[^-][-]/g); # yes
$n= ($s =~ tr/\-/-/c); # no counts all
print "b=",scalar(@b),"; n=$n \n";
pct= ident/align; mis = ? gap = ?
contig_0 CG4082-PA 85.94 448 63 2 59969 61312 1 428
0.0 771
contig_0 CG4082-PA 93.07 274 19 0 61441 62262 422 695
1e-158 504
contig_0 CG4082-PA 100.00 38 0 0 62327 62440 696 733
1e-158 81.3
contig_0 CG13654-PA 76.23 366 87 0 7796 8893 601 966
2e-166 588
contig_0 CG13654-PA 44.68 470 254 10 5765 7156 190 597
6e-95 350
contig_0 CG13654-PA 54.76 84 32 1 3945 4178 24 107
8e-33 93.6
contig_0 CG13654-PA 70.83 48 14 0 4951 5094 143 190
1e-13 80.5
contig_0 CG13654-PA 81.58 38 7 0 4234 4347 106 143
8e-33 71.6
contig_0 CG6554-PA 88.24 306 36 1 58252 57335 93 376
4e-147 523
contig_0 CG6554-PA 64.42 104 37 2 58811 58500 2 93
7e-27 124
CG4082-PA type=protein; loc=3R:join(6614387..6614538,6614593..6615703,66157
62..6616583,6616647..6616760); ID=CG4082-PA; name=Mcm5-PA; db_xref=FlyBase:FBpp0081756,GB_prot
ein:AAF54557.1,FlyBase:FBgn0017577,Gadfly:CG4082-PA; species=dmel; len=733
1
771.541
1991
0
59969
61312
1
428
2
385
410
448
=cut