#!/usr/local/bin/perl
# fff2ldif.pl
#
# Converts per-chromosome feature tables (features-<csome>.tsv) found under
# <workdir>/<org>/ into LDIF files (features-<csome>.ldif) in the same
# directory, and writes one species.ldif per organism.
#
# NOTE(review): this copy of the file appears to have lost text between
# angle brackets (readline operators and here-documents). The affected
# spots are marked below; the code tokens are left exactly as found.

=head1 NOTES

fff2ldif.pl - convert gnomap feature format (flat-feat-format) to ldap interchange format (ldif)
dec01, d.gilbert
-- add LDAP interface (bypass ldif generation) ?

=cut

use POSIX;
use Getopt::Std;

# File-level settings. $kMissingValue, $kMaxValue, $dropMeowId and $slapadd
# are not referenced anywhere in the visible code.
my $debug= 0;
my $kMissingValue= -999999999;
my $kMaxValue= 9999999;
my $mework= '/c7/eugenes/genomes/';
# NOTE(review): `my $org` is only declared a few lines further down, so the
# $org used here is the package variable, which nothing visible has assigned
# yet; $outpath therefore looks like it ends up equal to $mework. Confirm
# the intended default.
my $outpath= $mework.$org;
my $dropMeowId= 1;
my @orglist= qw(fly man mouse worm yeast weed fish);
my @orgs= @orglist;
my $orgs= join(',',@orglist);
my $org= 'fly';

my $slapath= '/bio/grid/bio/slapd';
my $slapadd= "/usr/local/sbin/slapadd -b 'o=euGenes' -f $slapath/slapd-datagrid.conf -l ";
# ^^ need to add species.ldif 1st, then each features-csome.ldif

main();

# main: parse the command line, then run fff2ldif() once per organism.
# Dies with the usage text unless -r is given. -D sets $debug, -o overrides
# $outpath, -W overrides $mework, -O replaces the organism list with a
# comma-separated one. -i is accepted by getopts but never read here.
sub main() {
  my %opt= ();
  Getopt::Std::getopts('rDO:W:o:i:',\%opt);
  die "usage: $0 -r -- write features to ldif -D -- debug -O fly,man... -- org list [ $orgs ] -o outdir -- output path [ $outpath ] -W workdir -- location of feature data [ $mework ] " unless($opt{r});
  # -o output-path [ $mework$org ]
  # -p path-to-sorsa-maps [ ${sorsapath} sorsa.txt ]
  # -a path-to-FBgn.acode [ $orgacode ]
  # -r gadfly-release-num [ $rel ]
  # -i 0/1 -- index features [ $doindex ]

  $debug= $opt{D};
  $outpath= $opt{o} || $outpath;
  # $rewritefeats and %wantorg are assigned here but not read in the visible code.
  $rewritefeats= $opt{r};
  $mework= $opt{W} || $mework;
  if ($opt{O}) {
    @orgs= split(/\,/,$opt{O});
    foreach (@orgs) { $wantorg{$_}= 1; }
  }

  # Each organism's data is looked up in <workdir>/<org>/.
  foreach $org (@orgs) {
    $dir= "$mework/$org/";
    fff2ldif($org, $dir);
  }
  exit;
}

#----------------------

# fff2ldif($org, $dir): convert every features-<csome>.tsv in $dir.
#
# For each input file (processed in sorted name order):
#   - opens features-<csome>.ldif (handle O) and features-<csome>.ldif.tmp
#     (handle T) for writing;
#   - '#' lines seen before the first feature line are copied to O, later
#     ones to T; '#' lines containing "tab-separated-values" are dropped;
#   - lines starting with a word character are split on tabs and handed to
#     printFeat(), which writes to T;
#   - afterwards printFeatSum() writes to O, the temp file is appended to O
#     and then deleted.
# Finally species.ldif is written in $dir via printOrgSum().
#
# Dies if an input file cannot be read or an output file cannot be opened;
# a failed opendir only warns. printFeat() and printFeatSum() are not
# defined anywhere in the visible part of this file.
sub fff2ldif {
  my( $org, $dir)= @_;
  local(*D,*F,*O,*T);

  # A failed opendir only warns; readdir(D) is still called afterwards.
  opendir(D, $dir) || warn "can't open $dir";
  my @files= grep( /^features-\w+\.tsv$/, readdir(D));
  closedir(D);

  # %ftsum= (); # Feature-summary hash / org / chr
  # %chrsum= (); # chromosome hash / org
  my $csome= 'unknown';
  warn "fff2ldif org=$org, dir=$dir\n" if $debug;

  my @csomes= ();
  %idhash= ();

  foreach my $file (sort @files) {
    my $sfile= $dir . $file;
    # The chromosome name is whatever follows "features-" in the file name.
    $csome= $1 if ($file =~ /^features-(\w+)/);
    my $mfile= $dir . "features-$csome.ldif";
    die "Can't read $file" unless open(F,$sfile);
    die "Can't write $mfile" unless open(O, ">$mfile") ;
    push(@csomes, $csome);

    # Package-level summary hashes, reset for every chromosome.
    %chrsum= ();
    %ftsum= (); # Feature-summary hash / org / chr

    # The chromosome length is taken from the size of dna-<csome>.raw, when readable.
    my $dnaf= $dir . "dna-$csome.raw";
    if (-r $dnaf) { $chrsum{bases}= -s $dnaf; }

    warn "# $org/features-$csome.ldif\n" if $debug;
    print O "# $org/features-$csome.ldif\n";
    print O "# format: ldap \n";

    # Feature entries go to a temp file so the summary can be written ahead of them.
    die "Can't write $mfile.tmp" unless open(T, ">$mfile.tmp") ;
    my $nl= 0;
    # NOTE(review): the loop condition is empty as found. The body tests $_
    # and F is the handle opened on the input file, so this was presumably
    # `while (<F>)` -- confirm against a pristine copy.
    while () {
      if (/^\#/) {
        if (/tab-separated-values/) {}
        elsif ($nl == 0) { print O $_; }
        else { print T $_; }
      }
      elsif (/^\w/) {
        $nl++;
        chomp();
        my @feats= split(/\t/);
        printFeat( *T, $org, $csome, @feats );
      }
    }
    close(T);
    close(F);

    ## dang -- need to print this first - and OrgSum before this !
    # NOTE(review): $chrsum{loc} is never assigned in the visible code;
    # presumably printFeat() fills it in -- verify.
    printFeatSum( *O, $org, $csome, $chrsum{bases}, $chrsum{loc}, \%ftsum);

    # Append the buffered feature entries after the summary. This open is
    # not checked for failure.
    open(T,"$mfile.tmp");
    # NOTE(review): empty condition as found; presumably `while(<T>)`.
    while() { print O ; }
    close(T);
    unlink("$mfile.tmp");
    close(O);
  }

  # `my $mfile` above is scoped to the loop body, so this assignment goes to
  # the package variable $mfile.
  $mfile= $dir."species.ldif";
  die "Can't write $mfile" unless open(O, ">$mfile") ;
  printOrgSum( *O, $org, \@csomes);
  close(O);
}

# printOrgSum($fh, $org, $chrs): takes an output handle, the organism id and
# a reference to the list of chromosome names; with $debug set it warns the
# species dn line.
#
# NOTE(review): the text after `print $fh <` is damaged. It looks like a
# here-document, the remainder of this sub and the header(s) of the
# following sub(s) were lost, up to what was presumably a
# `length($srange)>90` test. Everything from here to the closing brace after
# `return ($start,$stop);` is left token-for-token as found and will not
# parse in this state.
sub printOrgSum {
  my ($fh, $org, $chrs)= @_;
  my @chrs= @$chrs;
  warn "dn: spp=$org,srv=Genome map,o=euGenes\n" if $debug;
  print $fh <90) {
    # Re-wraps a comma-separated $srange: pieces are joined with ',' until a
    # chunk reaches 70 characters, and the chunks are then joined with
    # ",\n " into $s2, which replaces $srange.
    my ($sr,$s1,$s2,$s3);
    my @sr= split(/,/,$srange);
    # foreach my $s1 (@sr) {
    #   if (length($s2)>70) {
    #     if ($s3) { $s3 .= ",\n "; }
    #     $s3 .= $s2; $s2= '';
    #   }
    #   if (!$s2) { $s2= $s1; }
    #   elsif (length($s2)<70) { $s2 .= ',' . $s1 }
    # }
    # $srange= $s3;
    while (scalar(@sr)) {
      $sr= shift @sr;
      while (scalar(@sr) && length($sr)<70) { $sr .= ',' . shift @sr; }
      if ($s2) { $s2 .= ",\n "; }
      $s2 .= $sr;
    }
    $srange= $s2;
  }

  # NOTE(review): second damaged span. A here-document and the header of a
  # range-parsing sub appear to be missing between `print $fh <` and the
  # tail of a substitution on $range.
  print $fh <-]*)//;
  # What remains of the range parser: the previous capture is kept in $pre,
  # trailing non-digits are stripped into $suf, then the first and last
  # numbers become $start and $stop. A '<' in front of the first number
  # lowers $start by one; a '>' in front of the last number raises $stop by
  # one.
  $pre= $1;
  $range =~ s/(\D*)$//;
  $suf= $1;
  if ($range =~ m/^([<>]*)([\d-]+)/) { $u= $1; $start= $2; $start-- if ($u eq '<'); }
  if ($range =~ m/([<>]*)([\d-]+)$/) { $u= $1; $stop= $2; $stop++ if ($u eq '>'); }
  return ($start,$stop);
}

__END__

oat% set g=/c7/eugenes/work/genomes
set slapth=/bio/grid/bio/slapd

oat% find $g -name species.ldif
/c7/eugenes/work/genomes/fly/species.ldif
/c7/eugenes/work/genomes/yeast/species.ldif

foreach f (`find $g -name species.ldif`)
echo slapadd $f
/usr/local/sbin/slapadd -b "o=euGenes" -f $slapth/slapd-datagrid.conf -l $f
end

set n=0
foreach f (`ls $g/fly/feature*.ldif`)
@ n++
echo slapadd $f $n
/usr/local/sbin/slapadd -b "o=euGenes" -f $slapth/slapd-datagrid.conf \
-l $f >& log.fly$n &
end

/usr/local/sbin/slapindex -b "o=euGenes" -f $slapth/slapd-datagrid.conf

# /usr/local/sbin/slapcat -l eugenes-ldbm.ldif -b "o=euGenes" -f $slapth/slapd-datagrid.conf

ldapsearch -x -H "ldap://oat.bio.indiana.edu:3891/" -b "o=euGenes" '(objectClass=Species)'
ldapsearch -x -H "ldap://oat.bio.indiana.edu:3891/" -b "o=euGenes" '(objectClass=Feature-summary)'
ldapsearch -x -H "ldap://eugenes.org:3891/" -b "o=euGenes" '(objectClass=Feature-summary)'

slapadd: could not add entry dn="id=258091-258465-1,ft=transposon,chr=2R,spp=fly
,srv=Genome map,o=euGenes" (line=1053)
slapadd: could not add entry dn="id=GadFly:CT9981,ft=CDS,chr=2R,spp=fly,srv=Geno
me map,o=euGenes" (line=1288)
^^^ dup IDs for CT9981

# gnomap1.ldif
# test gnomap data for ldap service ?
# ?? can ldap handle 100,000s of entries well?
dn: o=euGenes objectClass: top objectClass: extensibleObject objectClass: organization objectClass: organizationalUnit objectclass: Bioinformatics-Service name: euGenes o: euGenes.org description: eukaryote genome information service web: http://iubio.bio.indiana.edu/eugenes/ web: ftp://iubio.bio.indiana.edu/eugenes/ web: http://eugenes.org/ mail: eugenes@eugenes.org srv: genome annotation srv: genome map srv: genome data # this should be ref/seeAlso to other place? ou: Center for Genomics and Bioinformatics ou: Biology Department o: Indiana University st: Indiana c: US dn: srv=Genome map,o=euGenes objectClass: top objectClass: Genome-map id: euGenes name: euGenes Genome map description: eukaryote genome information service spp: fly spp: man spp: worm spp: weed spp: yeast content: nucleic acid sequence content: genome annotation content: genome map format: biosequence/genbank format: biosequence/embl format: biosequence/fasta format: biosequence/fff format: biosequence/gff format: biosequence/xml format: image/gif format: image/pdf url: http://iubio.bio.indiana.edu:8089/.bin/gnomap? web: http://iubio.bio.indiana.edu/eugenes/ dn: spp=fly,srv=Genome map,o=euGenes objectClass: Species id: fly chr: X chr: 2L chr: 2R chr: 3L chr: 3R chr: 4 urc: spp={id} # url= parent.url + urc ... ? 
dn: chr=X,spp=fly,srv=Genome map,o=euGenes objectClass: Chromosome id: X loc: -259770..22617716 bases: 22617716 urc: chr={id} dn: ft=gene,chr=X,spp=fly,srv=Genome map,o=euGenes objectClass: Feature-summary id: gene count: 2314 urc: cl={id} dn: ft=mRNA,chr=X,spp=fly,srv=Genome map,o=euGenes objectClass: Feature-summary id: mRNA count: 2450 urc: cl={id} dn: ft=CDS,chr=X,spp=fly,srv=Genome map,o=euGenes objectClass: Feature-summary id: CDS count: 2450 urc: cl={id} dn: id=FBgn0004446,ft=gene,chr=X,spp=fly,srv=Genome map,o=euGenes objectClass: Feature ft: gene id: FBgn0004446 loc: 12041365..12063040 name: Ten-a chr: X map: 11A10-12 dbxref: FlyBase:FBgn0004446 dbxref: FlyBase:FBan0011270 urc: id={id}&cl={ft}&bp={loc} url: http://iubio.bio.indiana.edu:8089/.bin/gnomap?spp=fly&cl=gene&chr=X&id=FBgn0004446 &bp=12041365..12063040 # urc: id=FBgn0004446&bp=12041365..12063040 dn: id=12041365-12063040-8,ft=mRNA,chr=X,spp=fly,srv=Genome map,o=euGenes objectClass: Feature ft: mRNA id: 12041365-12063040-8 loc: join(12041365..12041670,12043942..12044048,12052931..12054136,12054290..12054484, 12056396..12056604,12059011..12059197,12062542..12062688,12062829..12063040) chr: X name: CT31455 dbxref: GadFly:CT31455 dbxref: FlyBase:FBan0011270 urc: cl={ft}&bp={loc} url: http://iubio.bio.indiana.edu:8089/.bin/gnomap?spp=fly&cl=mRNA&chr=X &bp=join(12041365..12041670,12043942..12044048,12052931..12054136,12054290..12054484,12056396..12056604,12059011..12059197,12062542..12062688,12062829..12063040) # id based on hash of loc ??; simple id: start-end-sublocCount =============