#!/usr/bin/perl

=head1 NAME

dgfasplit.pl

=head1 DESCRIPTION

Generates biosequence fasta data subsets from arguments.
Used for data grid parallelization in Grid/cluster computing.

=head1 SYNOPSIS

  dgfasplit.pl -[npart|count] 10 [-infasta] infile
     [-indexfasta infile [-queryfasta '+docid:modSC'] ]
     [-splitfasta infile]
     [-JAVALIB path   $BLAST_LOCATION/lib/java]

  dgfasplit.pl -count 10 -in $bg/shared/prots/protfa2/modSC578.fa

=head1 OPTIONS

These are the options the script actually parses.  C<-indexfasta> and
C<-splitfasta> from the synopsis are not declared to Getopt::Long and
are rejected as unknown options.

=over 4

=item -count N | -npart N

Number of parts to produce (required, must be a positive integer).

=item -infasta FILE

Input fasta file.  If the option is not given, the first remaining
command line argument is used as the input file.

=item -queryfa QUERY | -queryfasta QUERY

Optional query sent as a C<find> command ahead of the part request.

=item -JAVALIB PATH

Directory holding the java libraries (default C<lib/java>).

=item -TMPDIR PATH, -URLCOPY COMMAND, -session ID

Override the corresponding settings kept by the script.

=item -debug

Print the commands that are run, and extra diagnostics.

=back

=head1 AUTHOR

Don Gilbert, gilbertd@indiana.edu, 2005/2006.

=cut

use strict;
use warnings;
use Getopt::Long;

my %opts;
my $ckopt = GetOptions(
  \%opts,
  'count|npart=n',
  'infasta=s',
  'queryfa|queryfasta=s',  # -queryfasta is the spelling used in the synopsis
  'JAVALIB=s',
  'TMPDIR=s',              # read below; previously never declared, so never set
  'URLCOPY=s',             # ditto
  'session=s',             # ditto
  'debug',
  );

my $usage = "\nUsage: dgfasplit.pl -count 10 input.fasta ";
die "$usage" unless($ckopt);

# The usage line shows the input file as a plain argument: honour that
# when -infasta was not given explicitly.
$opts{infasta} = shift(@ARGV) if (!defined $opts{infasta} && @ARGV);

my $error;
my $JAVALIB= 'lib/java';
my $JAVALIB_URL= 'ftp://iubio.bio.indiana.edu/biogrid/gridserver/lucegene/lib/java/';

my $URLGETDIR= "wget -nv -r -N -nH -nd --directory-prefix=";
my $URLCOPY= "globus-url-copy -dbg -vb"; ## -dbg -vb
my $TMPDIR='$PWD'; ### '$TG_NODE_SCRATCH'; # use TG_NODE_SCRATCH or in/out dir ??
my $JAVA= "java"; # ($ENV{JAVA_HOME}) ? "$ENV{JAVA_HOME}/bin/java" : "java";
my $GREP= "grep"; # dont really need

my $debug = $opts{debug};
my $count = $opts{count};
# A zero, missing or negative count cannot describe a set of parts.
die "missing arg: -count=#parts $usage" unless($count && $count > 0);
my $sessionid = $opts{session};
my $queryfa = $opts{queryfa};

$JAVALIB= $opts{'JAVALIB'} || $JAVALIB;
$TMPDIR = $opts{'TMPDIR'} || $TMPDIR;
$URLCOPY= $opts{'URLCOPY'} || $URLCOPY;

# Host name is only used to label debug output; fall back to an empty
# label if the hostname command produced nothing.
my $host= `hostname`; # globus-hostname ?
$host= '' unless(defined $host);
chomp($host);

# $error= indexFasta($opts{'infasta'}) if($defaultrun || $opts{'indexfasta'}); # force or do if needed?

## problem with all NNNN fasta parts -- blast segfaults ?? is this blast err or not?
my @dataparts= getQueryParts($count, $opts{'infasta'}, $sessionid);
print join("\n", @dataparts),"\n";

#exit;

#-------------------

# Quote one string for safe use as a single word in a /bin/sh command line.
sub _shellQuote {
  my($s)= @_;
  $s= '' unless(defined $s);
  $s =~ s/'/'\\''/g;
  return "'$s'";
}

# Count fasta records (lines starting with '>') in a file; 0 if unreadable.
sub _countFastaRecords {
  my($file)= @_;
  my $n= 0;
  open(my $fh, '<', $file) or return $n;
  while (my $line= <$fh>) {
    $n++ if ($line =~ /^>/);
  }
  close($fh);
  return $n;
}

=item fetchJavaLib($javalib)

  Download the java libraries from $JAVALIB_URL into $javalib
  (default: $JAVALIB with environment variables expanded).
  If $javalib does not exist, its parent (the path minus a trailing
  /lib/java) must exist, else this dies; lib/java is then created
  below that parent.
  Returns the status of the download command (see callSystem).

=cut

sub fetchJavaLib {
  my($javalib)= @_;
  $javalib ||= evalEnv( $JAVALIB);
  unless(-d $javalib) {
    (my $topdir = $javalib) =~ s,/lib/java$,,;
    unless(-d $topdir) { die "Error: missing java lib topdir: $topdir"; }
    # mkdirPath warns by itself on failure; the download is still attempted.
    mkdirPath("lib/java",$topdir);
  }

  my $url= evalEnv( $JAVALIB_URL);
  # $URLGETDIR ends in "--directory-prefix=", so the quoted directory
  # attaches directly to it.
  my $cmd= $URLGETDIR . _shellQuote($javalib) . " " . _shellQuote($url);
  ##warn "# fetchJavaLib: ",$cmd,"\n" if($debug);
  return callSystem($cmd);
}

=item getJavaCp($fetchIfNeeded)

  Return the java classpath "<javalib>/lusimple.jar:<javalib>/lucene.jar".
  When lusimple.jar is missing and $fetchIfNeeded is true, the libraries
  are fetched first; dies if lusimple.jar is still missing afterwards.

=cut

sub getJavaCp {
  my($fetchIfNeeded)= @_;
  my $javalib= evalEnv( $JAVALIB);
  my $cp="${javalib}/lusimple.jar";
  unless(-f $cp) {
    # No "my $err = ... if (...)" here: a conditional my-declaration has
    # undefined behaviour, and the -f test below is the real check.
    fetchJavaLib() if($fetchIfNeeded);
    die "Error: Missing javalib $cp" unless(-f $cp);
  }
  $cp .=":${javalib}/lucene.jar";
  return $cp;
}

=item indexFasta($infile)

  create lucene index of fasta:
  java LuceneIndexerFasta -create -index $infile.luindex $infile

  Returns -1 (with a warning) when $infile is missing or the index
  directory already exists; otherwise the status of the java command.

=cut

sub indexFasta {
  my($infile)= @_;
  unless(defined $infile && -f $infile) {
    my $label= (defined $infile) ? $infile : '';
    warn "# Error: missing fasta $label";
    return -1;
  }
  if(-d "$infile.luindex") {
    warn "# Warning: index exists: $infile.luindex";
    return -1;
  }

  my $javalib= getJavaCp(1);
  my $deb=($debug) ? "-debug" : "";
  my $cmd="$JAVA -cp " . _shellQuote($javalib)
    ." LuceneIndexerFasta $deb -create -index "
    . _shellQuote("$infile.luindex") . " " . _shellQuote($infile);

  my $err= callSystem($cmd);
  warn "# indexFasta() = $err\n" if($err || $debug);
  return $err;
}

=item getFastaPart($infile,$partfile,$nparts,$partnum)

  create part file with lucene indexed fasta:
  ( echo 'set parts $nparts' ; echo 'part $partnum' ) | java LuceneGetter -index $infile.luindex -out $partfile

  When a query was given on the command line, "find <query>" is sent
  before the part request.  Returns the status of the command pipeline.

=cut

sub getFastaPart {
  my($infile,$partfile,$nparts,$partnum)= @_;

  my $javalib= getJavaCp();
  my $deb=($debug) ? "-debug" : "";
  ## need option for query here: echo 'find $queryfa '
  my $fq=($queryfa) ? "echo " . _shellQuote("find $queryfa") . " ;" : "";

  my $cmd="( echo 'set parts $nparts' ; $fq echo 'part $partnum' ) "
    ."| $JAVA -cp " . _shellQuote($javalib)
    ." LuceneGetter $deb -index " . _shellQuote("$infile.luindex")
    ." -out " . _shellQuote($partfile);

  my $err= callSystem($cmd);
  if($debug || $err) {
    # Counted in perl rather than via an external grep on an unquoted path.
    my $ngot= _countFastaRecords($partfile);
    my $query= (defined $queryfa) ? $queryfa : '';
    warn "# getFastaPart($partnum:$nparts,$query,$infile) got:$ngot err:$err\n";
    warn "# -> $partfile\n";
  }
  return $err;
}

=item getQueryParts($nparts,$infile,$sessionid)

  Split $infile into $nparts part files named "$infile-NN", where NN is
  the part number zero-padded to the width of $nparts.  The lucene index
  is built first when "$infile.luindex" does not exist.  Part files that
  already exist are reused.  $sessionid is accepted but not used.
  Returns the list of part file names; an empty list (with a warning)
  when $infile is missing.

=cut

sub getQueryParts {
  my($nparts, $infile, $sessionid)= @_;
  my @parts=();
  unless(defined $infile && -f $infile) {
    my $label= (defined $infile) ? $infile : '';
    warn "# Error: missing fasta $label";
    return @parts;
  }
  $nparts= 0 unless(defined $nparts);

  my $dig= length($nparts);
  # my $tmpdir= getTMPDIR($sessionid);
  my $partfile= "$infile-"; ## "$tmpdir/$partfile-";

  indexFasta($infile) unless (-d "$infile.luindex");

  foreach my $partnum (1..$nparts) {
    my $ipart= $partfile . sprintf("%0".$dig."d",$partnum);
    push(@parts, $ipart);
    next if(-f $ipart); # reuse an existing part file
    # getFastaPart reports failures itself; a failed part stays in the list.
    my $err= getFastaPart($infile,$ipart,$nparts,$partnum);
    ## die if ($err);
  }

  return @parts;
}

=item evalEnv($s,$env)

  Expand $NAME and ${NAME} in $s from the hash ref $env, or from %ENV
  when $env is not a hash ref.  Variables without a value expand to the
  empty string.  Braces are only consumed as a matching ${...} pair.

=cut

sub evalEnv {
  my($s,$env)= @_;
  return $s unless(defined $s);
  my $vars= (defined $env && ref($env) =~ /HASH/) ? $env : \%ENV;
  my $lookup= sub {
    my($name)= @_;
    return (defined $vars->{$name}) ? $vars->{$name} : '';
  };
  # Two alternatives instead of optional braces, so that "{$FOO}" keeps
  # its closing brace.
  $s =~ s/\$(?:\{(\w+)\}|(\w+))/$lookup->(defined $1 ? $1 : $2)/ge;
  return $s;
}

=item callSystem(@args)

  Run @args with system(), echoing the command to STDERR first when
  debugging.  Returns the value of system().

=cut

sub callSystem {
  my(@args)= @_;
  warn("# DEBUG $host: ",join(" ",@args),"\n") if($debug);
  return system(@args);
}

=item mkdirPath($dir,$topdir)

  Create each missing directory of $dir below $topdir (default "/").
  A leading $topdir on $dir is stripped first, so $dir may be given
  relative to $topdir or as a full path.
  Returns the full path created, or 0 (with a warning) on failure.

=cut

sub mkdirPath {
  my($dir,$topdir)= @_;
  $topdir= "/" unless($topdir);
  # \Q..\E: $topdir is a literal path prefix, not a pattern.
  $dir =~ s,^\Q$topdir\E,,;
  $dir =~ s,^/,,;
  $dir =~ s,/$,,;
  $topdir =~ s,/$,,;
  foreach my $d (split("/",$dir)) {
    $topdir = "$topdir/$d";
    next if(-d $topdir);
    unless(mkdir($topdir)) {
      warn "Error: mkdir $topdir: $!";
      return 0;
    }
  }
  return $topdir;
}

1;

__END__