#!/usr/local/bin/perl # parsencbiman.pl =head NOTES parse NCBI's human genome man => "$ncbigenomes/H_sapiens/", # ncbigenomes - May2002; - each chr file is set of many LOCUS segments # do we have any scaffold/assembly map for it? # ! see maps/mapview/chromosome../ for chr*.sequence.gz == all feature table # genbankgenomes obsolete for features ?? make this replace old genomefeat.pl -- handle all orgs using NCBi genbank chromosomes? - man, yeast, (EnsemblGenbank mosquito) -? A_thaliana, C_elegans, M_musculus?? for gnomap features.tsv, postprocess as foreach f (`ls features*tsv`) cat $f | sort -k 1,1 -k 2,2n | perl -e 'while(<>){next unless(/^\w/);\ ($c,$b,$r)=split "\t",$_,3; \ unless($c eq $lc) { close(F); open(F,">featnew-$c.tsv"); $lc=$c; }\ $r=~s/Derived by automated computational analysis using gene prediction method:/predicted:/;\ $r=~s/Supporting evidence includes similarity to: ([^ \"]+)/Similar to: $1/;\ $r=~s/Supporting evidence includes similarity to://;\ print F $r; } close(F); ' end sed -e 's/Derived by automated computational analysis using gene prediction method:/predicted: /' \ -e 's/Supporting evidence includes similarity to://' patch for InterimID: locusID: db_xref == LocusLink/HUgn ID foreach ff (`ls features*tsv`) perl -pi.old \ -e 'if(m/^gene|mRNA/ && s/\t(InterimID|LocusID):(\d+)/\tLocusLink:$2/) {\ $id= sprintf("MEOW:HUgn%07d",$2); s/\tLocusLink:/$id\tLocusLink:/; }' \ $ff end # man/features-1.tsv # Features for mosquito from Ensembl Anopheles gambiae data [Anopheles_gambiae.0.gb, 11-June-2002] # gnomap-version 1 # # tab-separated-values # Feature gene map range id db_xref notes =cut $debug= 0; use euGenes::GenomeFeatures; use Getopt::Long; my $sourceName= 'NCBI H_sapiens data'; my $org= 'man'; my $workpath= '/c7/eugenes/genomew/'; # my $genomepack= euGenes::NcbiHumanGenomeFeatures->new; # my $ncbigenomes= $genomepack->{ncbigenomes}; # '/bio/biomir-pub/biomirror/ncbigenomes'; # my $ncbigenomes= '/bio/biomir-pub/biomirror/ncbigenomes'; # ## euGenes::GenomeFeatures::init() knows this - $genompaths{man} ! my $datapath= "$ncbigenomes/H_sapiens/"; ## CHR_01 .._22, CHR_Un, CHR_X / CHR_Y my $contigmap="$ncbigenomes/H_sapiens/seq_contig.md.gz"; my $outpath= $workpath.$org; my $csome= 'all'; # bogus my $wantnotes= 1; GetOptions( 'debug!' => \$debug, 'data=s' => \$datapath, 'outpath=s' => \$outpath, 'features!' => \$wantfeat, 'dna!' => \$wantdna, 'protein!' => \$wantprot, 'acode!' => \$wantacode, 'splitfeats!' => \$dosplitfeats, 'notes!' => \$wantnotes, 'csomes=s' => \@wantcsomes, ); $wantcsomes=''; if (@wantcsomes) { %wantcsomes= map { map { $_, 1; } split ",",$_; } @wantcsomes; $wantcsomes= \%wantcsomes; @wantcsomes= sort keys %wantcsomes; } $usage= < $debug, 'data=s' => $datapath, 'outpath=s' => $outpath, 'features!' => $wantfeat, 'dna!' => $wantdna, 'protein!' => $wantprot, 'acode!' => $wantacode, 'splitfeats!' => $dosplitfeats, 'notes!' => $wantnotes, 'csomes=s' => @wantcsomes, USAGE die $usage unless( $dosplitfeats || $wantacode || $wantfeat || $wantdna || $wantprot); $contigmap= readContigmap($contigmap); my $huparse= euGenes::NcbiHumanGenomeFeatures->new( org => $org, workpath => $workpath, chromosome => $csome, # bogus doChrOffset => 1, # for $source files specifically contigmap => $contigmap, sourceName => $sourceName, skipdbx => { taxon => 1, }, skipft => { variation => 1, source => 1, }, # variation = SNP's, have a dbx wantfeat => $wantfeat, wantdna => $wantdna, wantprot => $wantprot, wantnotes => $wantnotes, wantcsomes => $wantcsomes, debug => $debug, ); ## or $ens->init( ... ); if ($wantfeat || $wantdna || $wantprot) { $huparse->readChromosomes($datapath,$outpath); # put these in params ? } if ($dosplitfeats) { splitfeats($huparse->{outpath},"features-"); } # quick and dirty - use Meow/Mosquito.pm for real acode #if ($wantacode) { # feat2acode( $huparse->{outpath}, 'features-[2,3,X]*.tsv', $gpsyms ); #} exit; #---------- # taxon chr cstart cend str acc.vers lid # 9606 1 110996448 117328737 + NT_019273.11 19273 contig 1 sub readContigmap { my $contigmap= shift; my %contigmap= (); warn "readContigmap $contigmap\n" if $debug; my $ok; if ($contigmap =~ /\.(gz|Z)$/) { $ok= (open(M,"gzcat $contigmap|")); } else { $ok= (open(M,$contigmap)); } unless($ok) { warn "Cant read $contigmap"; return \%contigmap; } my @unlocs; while(){ #my @v= split "\t"; my ($t,$chr,$cstart,$cend,$strand,$accver,$locid,$contg,$xid) = split "\t"; my $acc= $accver; $acc =~ s/\.\d+$//; next if (%wantcsomes && !$wantcsomes{$chr}); ## this drops |NT variants ! if ($acc eq 'end' || $acc eq 'start') { $acc= "Chr$chr-$acc"; } warn "chr $chr $acc start=$cstart size=".($cend-$cstart+1)." orient=$strand\n" if $debug; $contigmap{$acc}= [$acc,$chr,$cstart,$cend,$strand]; if ($debug && $chr =~ m,\|NT,) { # warn ("skiplocus: Unlocated contig: $chr\n"); push(@unlocs, $chr); } } if ($debug && @unlocs) { warn "Unlocated contigs: n=".scalar(@unlocs)."\n"; ## warn "Unlocated contigs: ".join(", ",@unlocs)."\n"; } return \%contigmap; } sub splitfeats { my $path= shift; my $featpatt= shift; warn "split $path/$featpatt\n" if $debug; opendir(D,$path); my @feats= grep(/$featpatt/,readdir(D)); ## not good enough - skip non CHR dirs closedir(D); foreach $featf (sort @feats) { my ($lc, $head, $chre); my @cs=(); open(IN, "$path/$featf") or die $featf; while() { if (/^#/) { $head .= $_; } # skip !? else { last; } } close(IN); open(IN, "sort -k 1,1 -k 2,2n $path/$featf|") or die $featf; open(F,">$path/$featf.new") or die "$path/$featf.new"; print F $head; print F "\n"; $needsource= 1; while() { # if (/^#/) { $head .= $_; } # skip !? if (/^\w/) { my($c,$b,$r)= split "\t",$_,3; # unless($c eq $lc) { # close(F); open(F,">$path/featnew-$c.tsv"); # $lc=$c; push(@cs,$c); # } if ($needsource) { my $cmap= $contigmap->{"Chr$c-end"}; if (ref $cmap) { my $csize= $cmap->[2]; print F join("\t","source",$org,"Chr$c","1..$csize"),"\n"; } $needsource= 0; } $r=~s/Derived by automated computational analysis using gene prediction method:/predicted:/; $r=~s/Supporting evidence includes similarity to: ([^ \"]+)/Similar to: $1/; $r=~s/Supporting evidence includes similarity to://; print F $r; if ($r =~ m/^segment/) { my @t= split "\t", $r; $t[3] =~ m/(\d+)\.\.(\d+)/; my($b,$e)= ($1,$2); $chr{$c}= $chre= $e if ($e>$chre); } } } close(F); close(IN); } } # foreach my $c (@cs) { # my $fn= "features-$c.tsv"; # my $f= $path.$fn; my $o= $f.'.tmp'; # unless(open(F,$f) && open(O,">$o")) { die "$f or $o"; } # my $head1= $head; # $head1 =~ s,/c7/eugenes/mosquito/,,; # $head1 =~ s,$feats,$fn,; # s,$feats,$f, # print O $head1; # print O "\n"; # print O join("\t","source","mosquito","Chr".$c,"1..".$chr{$c}),"\n"; # while () { # next if (/^source/); # # also may need to patch ID field: AGgn0 > MEOW:AGgn0 and # # for stupid gnomap2 bug, add extra \t before \n -- or is it some other stupid bug # if (m/\t(AGgn0\d+)/) { # my $aid= $1; # if (m/\t(AGgn0\d+)\n/) { $aid .= "\t"; } # s/\t(AGgn0\d+)/\tMEOW:$aid/; # } # # print O $_; # } # close(F); close(O); # rename ($o,$f); # } #sub feat2acode { # my $path= shift; # my $feats= shift; # my $rsyms= shift; # my %psyms= %$rsyms; # # opendir(D,$path); # my @ff= grep( /features-[23X].*.tsv$/, readdir(D)); # closedir(D); # open(O,">$path/acode") or die "$path/acode"; # foreach $ff (sort @ff) { # $chr = $1 if ($ff =~ m,features-(\w+),); # warn "feat2acode $path/$ff\n" if $debug; # open(IN, "egrep '^(CDS|source|#)' $path/$ff|") or die $feats; # while() { # # # my @t= split "\t"; # if (/^# Features for/) { # $date= $1 if (/(\S+)\]/); # next; # } # elsif (/^#/) { next; } # # my ($class,$sym,$map,$range,$ids,$dbx)= split "\t"; # if ($class eq 'source') { # $chr= $map; $chr =~ s/Chr//i; # } # # elsif ($class eq 'CDS') { # my $pid= $1 if ($dbx =~ /(ENSANGP0\d+)/); # my($psym,$ipro,$pname)= @{ $psyms{$pid} }; # my $did= 'Ensembl:'.$1 if ($dbx =~ /(ENSANGG0\d+)/); # my ($start, $stop)= euGenes::GenomeFeatures::maxrange('self',$range); # my $maploc= "$start..$stop"; # $maploc= 'complement('.$maploc.')' if ($range =~ /complement/); # my $namepro= ''; chomp($pname); # $namepro = "PDOM|INTERPRO:$ipro" if ($ipro); # $namepro .= " == $pname" if ($pname); # $ids =~ s/MEOW://; # # print O < hs_chr1.gbk.gz <------ LOCUS NT_019273 6332290 bp DNA linear CON 13-MAY-2002 DEFINITION Homo sapiens chromosome 1 working draft sequence segment. ACCESSION NT_019273 VERSION NT_019273.11 GI:20532581 KEYWORDS . SOURCE human. ORGANISM Homo sapiens Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Primates; Catarrhini; Hominidae; Homo. REFERENCE 1 (bases 1 to 6332290) AUTHORS NCBI Annotation Project. TITLE Direct Submission JOURNAL Submitted (09-MAY-2002) National Center for Biotechnology Information, NIH, Bethesda, MD 20894, USA COMMENT GENOME ANNOTATION REFSEQ: NCBI contigs are derived from assembled genomic sequence data. They may include both draft and finished sequence. On May 13, 2002 this sequence version replaced gi:20503326. COMPLETENESS: not full length. FEATURES Location/Qualifiers source 1..6332290 /organism="Homo sapiens" /db_xref="taxon:9606" /chromosome="1" source 1..1240 /note="Accession AC019225 sequenced by Washington University, Genome Sequencing Center" /organism="Homo sapiens" /db_xref="taxon:9606" /clone="RP11-550G16" misc_feature 15486..199069 /standard_name="RP11-180N18" /note="FISH-mapped clone" source 17494..96930 /note="Accession AL360171 sequenced by The Sanger Centre" /organism="Homo sapiens" /db_xref="taxon:9606" /clone="RP11-180N18" variation 17598 /allele="G" /allele="A" /db_xref="dbSNP:333083" variation 26108 /allele="G" /allele="A" /db_xref="dbSNP:333079" mRNA join(27786..28272,52134..52245,54314..54457,55462..55562, 55997..56099,57863..57957,58525..58631,59444..59560, 59767..59830,60595..60683,61046..61116,61473..61567, 61652..61750,62151..62219,62648..62726,63831..63951, 64774..66842) /gene="AHCYL1" /product="S-adenosylhomocysteine hydrolase-like 1" /transcript_id="NM_006621.2" /db_xref="LocusID:10768" /note="Derived by automated computational analysis using gene prediction method: BLAST. Supporting evidence includes similarity to: 9 mRNAs" gene 27786..66842 /gene="AHCYL1" /note="XPVKONA" /db_xref="LocusID:10768" CDS join(28243..28272,52134..52245,54314..54457,55462..55562, 55997..56099,57863..57957,58525..58631,59444..59560, 59767..59830,60595..60683,61046..61116,61473..61567, 61652..61750,62151..62219,62648..62726,63831..63951, 64774..64780) /gene="AHCYL1" /note="S-adenosyl homocysteine hydrolase homolog" /codon_start=1 /product="S-adenosylhomocysteine hydrolase-like 1" /protein_id="NP_006612.1" /db_xref="GI:5729724" ------ md entry for above LOCUS: taxon chr cstart cend str acc.vers lid 9606 1 110996448 117328737 + NT_019273.11 19273 contig 1 len= 117328737 - 110996448 + 1 = 6332290 Contig layout: seq_contig.md file ===================================== The seq_contig.md file provides information on the order and orientation of the contigs along the chromosome. columns: tax_id: 9606 is Homo sapiens chromosome: 1-22, X, Y, value|contig accession or Un|contig accession where value|contig indicates contig is associated with a chromosome but not localized on the chromosome, and Un|contig indicates the contig is not placed on any chromosome from: chromosome coordinate to: chromosome coordinate orientation: +, -, ? where ? indicates uncertainty in orientation accession: NT_######.version format id: internal ID object: contig, start, stop - object type contig rows are placement of contigs along chromosome start - indicates beginning location of chromosome end - indicates calculated end position of chromosome weight: internal id ---- oat% more s*md 9606 1 1 621109 + NT_032954.1 32954 contig 1 9606 1 671110 2152144 - NT_004350.11 4350 contig 1 9606 1 2202145 2591724 0 NT_022058.10 22058 contig 1 9606 1 2641725 3769244 + NT_004321.11 4321 contig 1 9606 1 3819245 4764451 + NT_004547.11 4547 contig 1 9606 1 4814452 5119909 - NT_021917.11 21917 contig 1 9606 1 5169910 8481245 - NT_028054.8 28054 contig 1 9606 1 8531246 8790813 + NT_032970.1 32970 contig 1 9606 1 8840814 8920978 0 NT_032967.1 32967 contig 1 9606 1 8970979 11063290 + NT_021937.11 21937 contig 1 9606 1 11113291 11670357 + NT_004488.10 4488 contig 1 9606 1 11720358 12239113 - NT_022041.9 22041 contig 1 9606 1 12289114 15794273 - NT_004873.11 4873 contig 1 9606 1 15844274 17004410 + NT_030584.5 30584 contig 1 =cut