#!/usr/bin/perl # cksumfa.pl =head1 DESCRIPTION cksumfa.pl preprocess genome assembly fasta, adding MD5 / SWISS CRC64 checksums and base counts to headers =head1 AUTHOR Don Gilbert, gilbertd@indiana.edu, 2005/2006. for drosophila species and daphnia genome blast annotations =cut use strict; use Getopt::Long; use POSIX; use Digest::MD5; my $debug= 1; =item calc and add >>was>> POSIX-standard String::CRC::Cksum >>BEST>> SWISSPROT CRC64 standard checksum of sequence to fasta header -- dbers use md5 checksum - do also?? (common in sql soft) grep '^>' ensAG.fa | perl -n -e '($cs)=/(CRC64=\w+; size=\d+)/; ($id)=/^(>\S+)/; print "$id $cs\n";' > ensAG.h zcat $up/uniprot_*.gz | egrep -e'^(ID|AC|SQ)' | perl -ne\ 'if(eof() || /^ID\s+(\S+)/){print ">$id $ac CRC64=$crc; size=$sz\n"; $id=$1;}\ elsif(/^AC\s+(.+)$/){$ac=$1;}\ elsif(/(\d+) AA;[^;]+;\s+(\S+) CRC64/){$sz=$1;$crc=$2;}' > uniprot.h ** this is dying * out of mem * for chromosomes .. .. is it MD5, swiss crc or more likely sucking all of data into mem 1st =item md5? # Functional style use Digest::MD5 qw(md5 md5_hex md5_base64); $digest = md5($data); == string will be 16 bytes $digest = md5_hex($data); == string will be 32 hex bytes $digest = md5_base64($data); == base64 encoded # OO style use Digest::MD5; $ctx = Digest::MD5->new; $ctx->add($data); $ctx->addfile(*FILE); $digest = $ctx->digest; $digest = $ctx->hexdigest; $digest = $ctx->b64digest; =cut my ($input,$output,$nodata)= ("",""); my ($qdbin,$chrin,$debug,$minlen)= ("","",1,50000); my ($countbases,$regex,$regexheader,$regexto)= (0,"",""); my $noNcheck= 0; my $optok= Getopt::Long::GetOptions( 'infile=s' => \$input, # @infile ? 'outfile=s' => \$output, 'nodata' => \$nodata, 'noncheck' => \$noNcheck, 'countbases' => \$countbases, 'reheader=s' => \$regex, #'qdb=s' => \$qdbin, #'chringff=s' => \$chrin, #'minlen=s' => \$minlen, #'chrgff!' => \$tochrgff, #'chrtype=s' => \$chrtype, #'debug!' => \$debug, #'splitquery=s' => \$splitquery, ); die "usage: cksumfa -input=data.fasta -output=out.fasta -nodata : dont output data, only header -countbases: add base count (dna) to header -reheader=s/super_/scaffold_/ : regex conversion for headers " unless($input); if ($regex =~ m/^s(\W)/) { my $del= $1; my($b,$c)= (2,index($regex,$del,2)); my($e)= (index($regex,$del,$c+1)); if($e > $c && $c > $b) { $regexto= substr($regex,$c+1,$e-$c-1); $regexheader= substr($regex,$b,$c-$b); } } cksumfa($input,$output); =item edit/fix eugenes prot22 deflines ==> protfa/sanPF.fa <== >gnl|sanPF|Pfa3D7|pfal_chr1|PFA0005w|Annotation|Sanger (protein coding) erythrocyte membrane protein 1 (PfEMP1) Location=join(29733..34985,36111..37349) ^ cut excess || blast-formatdb problem ==> protfa/modDM.fa <== >gnl|modDM|CG11023-PA type=protein; loc=2L:join(7680..8116,8229..8589,8668..9273); ID=CG11023-PA; name=CG11023-PA; db_xref=FlyBase:FBpp0088316,GB_protein:AAO41164.1,FlyBase:FBgn0031208, Gadfly:CG11023-PA; species=dmel; source=FlyBase; version=r4.0; len=468 ... shorten long header (lost in blast out): drop loc=...; other? =cut sub cleanEukprot22Defline { my($h)= @_; # fixhead ... other regex changes ? if ($h =~ m,(\>gnl\|sanPF\|\w+)\|(.+),) { my($id,$def)= ($1,$2); $def =~ s,\|, ,g; $def =~ s,Annotation,,; $h= $id . " ". $def; } elsif ($h =~ s,loc=(\w+):(\w+)\(([^\)]+)\),XLOCX,) { my($ref,$cmpl,$loc)= ($1,$2,$3); my $b= $1 if ($loc =~ /^(\d+)/); my $e= $1 if ($loc =~ /(\d+)$/); $h =~ s/XLOCX/ref=$ref:$b..$e/; } elsif ($regexheader && $regexto) { $h =~ s/$regexheader/$regexto/e; } # if ($h =~ s,location=(\w+)\(([^\)]+)\),XLOCX,i) { # my($ref,$cmpl,$loc)= ($1,$2,$3); # my $b= $1 if ($loc =~ /^(\d+)/); # my $e= $1 if ($loc =~ /(\d+)$/); # $h =~ s/XLOCX/loc=$b..$e/; # } return $h; } sub cksumfa { my($infile,$outfile,$chrs)= @_; my($inh,$outh); if (!$infile || $infile =~ m/^(stdin|\-)$/i) { $inh= *STDIN; } else { unless (open(R,$infile)) { warn "cannot read $infile"; return undef; } $inh= *R; } unless($outfile) { $outfile= $infile; unless($outfile =~ s/\.fa/-mz.fa/) { $outfile.="-mz"; } } warn "\nparsing $infile to $outfile\n" if $debug; if (!$outfile || $outfile =~ m/^(stdout|\-)$/i) { $outh= *STDOUT; } else { # overwrite or append out? unless (open(O,">$outfile")) { warn "cannot write $outfile"; return undef; } $outh= *O; } #my $sum = String::CRC::Cksum->new; my @entry; my ($md5sum,$crc64sum); my($id,$size,$head,$defmore,$doc,$sdb,$skipit); my($na,$nc,$ng,$nt,$nn,$nx,$no) = (0)x10; while(<$inh>){ chomp; if(/^#/) { } elsif(/^>/) { if ($size) { # my $cdata= uc(join '',@entry); # $cdata =~ tr/A-Z/ /cs; # $cdata =~ s/\s+//g if($cdata =~/\s/); # squeeze out spc, other? # my $csize= length($cdata); #$size; # my $crc= SWISS::CRC64::crc64($cdata); # my $md5= Digest::MD5::md5_hex($cdata); my $crc= $crc64sum->hexsum(); my $md5= $md5sum->hexdigest(); my $csize= $crc64sum->size(); my $oldsizefld=$1 if $head =~ s/[; ]*(\w+)=$csize/;/; my $addh=" CRC64=$crc; MD5=$md5; size=$csize;"; $addh .= " counts=${na}A,${nc}C,${ng}G,${nt}T,${nn}N,${nx}X,${no}o;" if $countbases; print $outh $head,$addh,"\n" if $head; foreach (@entry) { print $outh $_,"\n"; } } $head= $_; @entry=(); $size= 0; ($na,$nc,$ng,$nt,$nn,$nx,$no) = (0)x10; $head= cleanEukprot22Defline($head); $crc64sum= SWISS::CRC64->new; $md5sum= Digest::MD5->new; #m/^>(\w+)(\S*)\s*(.*)/ and ($id,$defmore,$doc,$size)= ($1,$2,$3,0); } elsif ($head) { ##my $nc= tr/A-z/A-z/; $size += $nc; my $c= uc($_); # $c =~ tr/A-Z/ /cs; $c =~ s/\s+//g; $size += length($c); # add counts of bases? esp dna: ACGT/U N/X other if ($countbases) { $na += $c =~ tr/A/A/; $nc += $c =~ tr/C/C/; $ng += $c =~ tr/G/G/; $nt += $c =~ tr/TU/TU/; $nn += $c =~ tr/N/N/; $nx += $c =~ tr/X/X/; # convert to N?? $no += $c =~ tr/ACGTUNX/ACGTUNX/c; } if($noNcheck) { $c =~ s/N|X//g; } $crc64sum->add($c); $md5sum->add($c); push @entry,$_ unless ($nodata); ## optionally write now } } if ($size) { my $crc= $crc64sum->hexsum(); my $md5= $md5sum->hexdigest(); my $csize= $crc64sum->size(); my $oldsizefld=$1 if $head =~ s/[; ]*(\w+)=$csize/;/; my $addh=" CRC64=$crc; MD5=$md5; size=$csize;"; $addh .= " counts=${na}A,${nc}C,${ng}G,${nt}T,${nn}N,${nx}X,${no}o;" if $countbases; print $outh $head,$addh,"\n" if $head; foreach (@entry) { print $outh $_,"\n"; } } close($inh); close($outh); } =head1 CRC64 perl module documentation =head2 NAME CRC64 - Calculate the cyclic redundancy check. =head2 SYNOPSIS ** rewrite for line-add so dont need to suck all into mem ** use SWISS::CRC64; $crc = SWISS::CRC64::crc64("IHATEMATH"); #returns the string "E3DCADD69B01ADD1" ($crc_low, $crc_high) = SWISS::CRC64::crc64("IHATEMATH"); #returns two 32-bit unsigned integers, 3822890454 and 2600578513 =head2 DESCRIPTION SWISS-PROT + TREMBL use a 64-bit Cyclic Redundancy Check for the amino acid sequences. The algorithm to compute the CRC is described in the ISO 3309 standard. The generator polynomial is x64 + x4 + x3 + x + 1. Reference: W. H. Press, S. A. Teukolsky, W. T. Vetterling, and B. P. Flannery, "Numerical recipes in C", 2nd ed., Cambridge University Press. Pages 896ff. =head2 Functions =over =item crc64 string Calculate the CRC64 (cyclic redundancy checksum) for B. In array context, returns two integers equal to the higher and lower 32 bits of the CRC64. In scalar context, returns a 16-character string containing the CRC64 in hexadecimal format. =back =head1 AUTHOR Alexandre Gattiker, gattiker@isb-sib.ch =head1 ACKNOWLEDGEMENTS Based on SPcrc, a C implementation by Christian Iseli, available at ftp://ftp.ebi.ac.uk/pub/software/swissprot/Swissknife/old/SPcrc.tar.gz =cut package SWISS::CRC64; # ** Initialisation #32 first bits of generator polynomial for CRC64 #the 32 lower bits are assumed to be zero # my $POLY64REVh = 0xd8000000; # my @CRCTableh = 256; # my @CRCTablel = 256; # my $initialized; use strict; use vars qw/$POLY64REVh @CRCTableh @CRCTablel $initialized/; BEGIN{ $POLY64REVh = 0xd8000000; @CRCTableh = 256; @CRCTablel = 256; $initialized= 0; } sub new { my $pkg= shift; init(); my $self= { crcl => 0, crch => 0, size => 0 }; return bless $self; # , $pkg ?? } sub size { return shift->{size}; } sub add { my $self= shift; my $sequence= shift; my $crcl = $self->{crcl}; my $crch = $self->{crch}; my $size = $self->{size}; foreach (split '', $sequence ) { my $shr = ($crch & 0xFF) << 24; my $temp1h = $crch >> 8; my $temp1l = ($crcl >> 8) | $shr; my $tableindex = ($crcl ^ (unpack "C", $_)) & 0xFF; $crch = $temp1h ^ $CRCTableh[$tableindex]; $crcl = $temp1l ^ $CRCTablel[$tableindex]; $size++; } $self->{crcl}= $crcl; $self->{crch}= $crch; $self->{size}= $size; } sub hexsum { my $self= shift; my $crcl = $self->{crcl}; my $crch = $self->{crch}; return wantarray ? ($crch, $crcl) : sprintf("%08X%08X", $crch, $crcl); } sub init { if (!$initialized) { $initialized = 1; for (my $i=0; $i<256; $i++) { my $partl = $i; my $parth = 0; for (my $j=0; $j<8; $j++) { my $rflag = $partl & 1; $partl >>= 1; $partl |= (1 << 31) if $parth & 1; $parth >>= 1; $parth ^= $POLY64REVh if $rflag; } $CRCTableh[$i] = $parth; $CRCTablel[$i] = $partl; } } } sub crc64 { my $sequence = shift; my $crcl = 0; my $crch = 0; init(); foreach (split '', $sequence ) { my $shr = ($crch & 0xFF) << 24; my $temp1h = $crch >> 8; my $temp1l = ($crcl >> 8) | $shr; my $tableindex = ($crcl ^ (unpack "C", $_)) & 0xFF; $crch = $temp1h ^ $CRCTableh[$tableindex]; $crcl = $temp1l ^ $CRCTablel[$tableindex]; } # my($c, $shr, $temp1h, $temp1l); # while(@_) { # my $n = length $_[0]; # for(my $i = 0; $i < $n; ++$i) { # $c = unpack 'C', substr $_[0], $i, 1; # $shr = ($crch & 0xFF) << 24; # $temp1h = $crch >> 8; # $temp1l = ($crcl >> 8) | $shr; # $c= ($crcl ^ $c) & 0xFF; # $crch = $temp1h ^ $CRCTableh[$c]; # $crcl = $temp1l ^ $CRCTablel[$c]; # #++$size; # } # } # continue { shift } #?? return wantarray ? ($crch, $crcl) : sprintf("%08X%08X", $crch, $crcl); } 1; __END__ #------ switch to UniProt CRC64 for compat. package String::CRC::Cksum;