#!/usr/local/bin/perl # parsegolden.pl # parse human goldenPath features -- mysql .sql/.txt dumps of annotations =TODO now have genieKnown & knownMore tables -- add other features / tables -- will need to sort on bp loc: use perl/unix or mysql new/tmp tables? Main annot tables: x* genieKnown x* genieAlt -- same format as genieKnown, no knownMore; 3000+ dups in genieKnown - knownInfo (don't need, use knownMore) x* knownMore -- refSeq, other dbxref x* rnaGene ? snp* ? stsMarker x cpgIsland mouseSyn ? gcPercent ? exoFish ? chrN_est ?* chrN_mrna -- include productName, tissue, other dblink info? ~ chrN_gap ~ chrN_rmsk extras: cytoBand -- use for gnomap cytoband axis =cut $debug= 1; $dofeat= 0; $doacode= 0; $tabout= 0; $wantfeat= $dofeat; # $doprot= 0; # $wantdna= 0; $maxr= 999999; $isMeowId= 1; $gnomapvers= '1'; $kMissingValue= -999999999; $sourceName= 'GoldenPath annotated human genome'; $org= 'man'; @chrs= qw( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y ); ## these only have _random sets: UL NA # local data paths $meowpub= '/c6/iubio/meow-pub/meow/server/'; $genomeshome= '/c7/eugenes/work/genomes/'; $genomeswork= '/c7/eugenes/genomew/'; $orgacode= "$meowpub/$org/acode"; $goldenpath= '/c7/eugenes/human/ucsc/mirror/goldenPath/07oct2000/'; $seqpath= $goldenpath . 'chromosomes/'; $annotpath= $goldenpath . 'database/'; use lib ("/bio/work/meow/"); use Getopt::Std; use POSIX; use DBI; use flybase::sym2id; %opt=(); Getopt::Std::getopts('o:C:aftDA:W:',\%opt); unless(scalar(keys %opt)) { print STDERR <connect($dbspec,$opt_user,$opt_password) || die( "Can't connect to '$dbspec' : $DBI::errstr\n"); # genieKnown.name -> name to work for genieAlt. # ! name here == transId @geflds= qw/ name strand txStart txEnd cdsStart cdsEnd exonCount exonStarts exonEnds /; # main seq feature queries $geksql= 'SELECT '.join(',',@geflds).' FROM genieKnown WHERE chrom = ?'; $geasql= 'SELECT '.join(',',@geflds).' FROM genieAlt WHERE chrom = ?'; # ^^ FROM genieAlt,genieKnown where (genieAlt.name != genieKnown.name) ?? $geknownsql= 'SELECT name FROM genieKnown WHERE name = ?'; # TABLE knownInfo @kiflds= qw/ transId productName proteinId/; # TABLE knownMore @kmflds= qw/ name gbProductName gbProteinAcc omimId omimName hugoId hugoSymbol hugoName hugoMap refSeqAcc locusLinkId gdbId aliases /; # knownMore.transId knownMore.name # these are table ids: # gbGeneName: 4459461 # gbProductName: 4459462 ** get this one # TABLE rnaGene @rnaflds= qw/ name type strand chromStart chromEnd /; $rnasql= 'SELECT '.join(',',@rnaflds).' FROM rnaGene WHERE chrom = ?'; # TABLE snpNih and snpTsc - don't care for name, just a # @snpflds= qw/ chromStart chromEnd/; $snpnsql= 'SELECT '.join(',',@snpflds).' FROM snpNih WHERE chrom = ?'; $snptsql= 'SELECT '.join(',',@snpflds).' FROM snpTsc WHERE chrom = ?'; # TABLE stsMarker @stsflds= qw/ chromStart chromEnd name /; $stssql= 'SELECT '.join(',',@stsflds).' FROM stsMarker WHERE chrom = ?'; # TABLE cpgIsland #name== CpG: 120, use cpgNum instead? @cpgflds= qw/ chromStart chromEnd name /; $cpgsql= 'SELECT '.join(',',@cpgflds).' FROM cpgIsland WHERE chrom = ?'; # TABLE mouseSyn ; name == mouse 4 @mouseflds= qw/ chromStart chromEnd name /; $mousesql= 'SELECT '.join(',',@mouseflds).' FROM mouseSyn WHERE chrom = ?'; @featflds= qw/ featname egid goldid chrom bpStart bpEnd featrange sym map dbx notes /; ## can have nulls in egid, goldid, sym, map, dbx, notes ... $featcreateSql= <do($sql) or die($sql); $dbh->do($featcreateSql) or die( $featcreateSql ); } flybase::sym2id::set4meow($isMeowId); flybase::sym2id::readSym2Id($orgacode, $org); if ($doacode) { open(AOUT,">$outpath/HUgn-gold.acode"); $aout= *AOUT; } # $chr= 'chr22'; $chrname= $chr; $chrname =~ s/chr//; my $predid= 0; # special case to avoid LocusLinkID clash: HUgn0100001 == predicted foreach $chrname (@chrs) { $chr= 'chr'.$chrname; print STDERR "Processing $chr ...\n" if $debug; $fout= undef; ##? # my $foutname; # if ($dofeat) { $foutname= "$outpath/features-$chrname.tsv"; } # # elsif ($doacode) { $foutname= "$outpath/HUgn-$chrname.acode";} # if ($outpath) { open(FOUT,">$foutname"); $fout= *FOUT; } # else { $fout= *STDOUT; } if ($dofeat) { # my $datafile= $mework . "$org/dna-$chrname.fasta"; # print $fout &feattabheader("$org/features-$chrname.tsv", $org, $datafile); # ($csize)= $dbh->selectrow_array("$cisql '$chr'"); ($csize)= dbiSelectNFetch( $dbh, $cisql, [$chr]); insertFeat( 'source', undef, undef, $chr, 1, $csize, "1..$csize", $org, "Chr $chrname", undef, undef); # printFeat($fout, 'source', "1..$csize", $org, "Chr $chrname", '-'); } # $sql= $geksql . "'$chr'"; # print STDERR "$sql\n---------------------\n" if $debug; # $sth= $dbh->prepare($sql) or die(" prepare($sql) - " . $sth->errstr); # $sth->execute() or die(" execute - " . $sth->errstr); # print STDERR "nvals= ".$sth->rows."\n" if ($debug); # known genes: genieKnown table $sth= dbiSelect( $dbh, $geksql, [$chr]); my @rw=(); my @kmrw=(); my @pnrw= (); my $ir= 0; my $row; while (($row = $sth->fetchrow_arrayref)) { %fld= (); # global !? @rw= @$row; my $geid= $rw[0]; # genieKnown.name my ($i,$k); for $i (0..$#rw) { $k= $geflds[$i]; $k='geID' if ($k eq 'name'); $fld{$k}= $rw[$i]; } # @kmrw= $dbh->selectrow_array("$kmsql '$geid'"); @kmrw= dbiSelectNFetch( $dbh, $kmsql, [$geid]); for $i (0..$#kmrw) { $k= $kmflds[$i]; $fld{$k}= $kmrw[$i]; } my $pnid= $kmrw[1]; # gbProductName my($prodname)= ($pnid == 0) ? ('') : dbiSelectNFetch( $dbh, $pnsql, [$pnid]); # : $dbh->selectrow_array("$pnsql '$pnid'"); $fld{gbProductName}= $prodname; $gsym = $fld{name}; $id= $fld{locusLinkId}; ##?? missing for 432 cases ? if ($id > 0) { $id = 'MEOW:HUgn' . sprintf("%07d",$id); my $idsym= flybase::sym2id::id2symbol($id); if ($idsym && $idsym ne $gsym) { # make sure is proper symbol print STDERR "Sym. mismatch - $id: $idsym <> $gsym\n"; $gsym= $idsym; } } else { $id= flybase::sym2id::symbol2id($gsym); ## $sym2id{ symcase($gsym) }; print STDERR "NO ID for $fld{geID} " unless($id); print STDERR "symbol2id($gsym)= $id\n" if ($debug || !$id); # unless($id) { $id = 'MEOW:HUgn' . sprintf("02%05d",++$predid); } #?? } parseGeneFeat( $fout, $id, $fld{geID}, $gsym) if ($dofeat); # uses %fld printGeneAcode( $aout, $id, $fld{geID}, $gsym) if ($id && $doacode); # uses %fld } # predicted genes: genieAlt table $sth= dbiSelect( $dbh, $geasql, [$chr]); while (($row = $sth->fetchrow_hashref)) { %fld= %$row; $goldid= $fld{name}; #! need to check that goldid not already in knownInfo/genieKnown # ($known)= $dbh->selectrow_array("SELECT * FROM genieKnown WHERE name = '$goldid'"); ($known)= dbiSelectNFetch( $dbh, $geknownsql, [$goldid]); next if ($known); ## need HUgnXXXXX id for these predicted genes $id = 'MEOW:HUgn' . sprintf("01%05d",++$predid); $gsym = $goldid; parseGeneFeat( $fout, $id, $goldid, $gsym); # uses %fld printGeneAcode( $aout, $id, $goldid, $gsym, 1) if ($doacode); # uses %fld } # rnas: rnaGene table $sth= dbiSelect( $dbh, $rnasql, [$chr]); while (($row = $sth->fetchrow_hashref)) { %fld= %$row; # rnaGene: @rnaflds= qw/ name type strand chromStart chromEnd /; $featname= $fld{type}; # any fixup? == snRNA, rRNA, tRNA ... $srange= "$fld{chromStart}..$fld{chromEnd}"; if ($fld{strand} eq '-') { $srange = "complement($srange)"; } insertFeat( $featname, undef, undef, $chr, $fld{chromStart}, $fld{chromEnd}, $srange, $fld{name}, undef, undef, undef ); } # more features: repeats, snps, ... $sth= dbiSelect( $dbh, $snpnsql, [$chr]); while (($row = $sth->fetchrow_hashref)) { %fld= %$row; insertFeat( 'SNPn', undef, undef, $chr, $fld{chromStart}, $fld{chromEnd}, "$fld{chromStart}..$fld{chromEnd}", undef, undef, undef, undef ); } $sth= dbiSelect( $dbh, $snptsql, [$chr]); while (($row = $sth->fetchrow_hashref)) { %fld= %$row; insertFeat( 'SNPt', undef, undef, $chr, $fld{chromStart}, $fld{chromEnd}, "$fld{chromStart}..$fld{chromEnd}", undef, undef, undef, undef ); } $sth= dbiSelect( $dbh, $stssql, [$chr]); while (($row = $sth->fetchrow_hashref)) { %fld= %$row; insertFeat( 'STS', undef, undef, $chr, $fld{chromStart}, $fld{chromEnd}, "$fld{chromStart}..$fld{chromEnd}", $fld{name}, undef, undef, undef ); } $sth= dbiSelect( $dbh, $cpgsql, [$chr]); while (($row = $sth->fetchrow_hashref)) { %fld= %$row; insertFeat( 'CpG', undef, undef, $chr, $fld{chromStart}, $fld{chromEnd}, "$fld{chromStart}..$fld{chromEnd}", $fld{name}, undef, undef, undef ); } $sth= dbiSelect( $dbh, $mousesql, [$chr]); while (($row = $sth->fetchrow_hashref)) { %fld= %$row; insertFeat( 'MouseSyn', undef, undef, $chr, $fld{chromStart}, $fld{chromEnd}, "$fld{chromStart}..$fld{chromEnd}", $fld{name}, undef, undef, undef ); } # close($fout) if ($outpath); } close($aout) if ($aout); printFeatureTables() if ($dofeat); $rc = $dbh->disconnect; exit(0); #------------------ sub dbiSelect { my($dbh,$sql,$bindvalsref)= @_; print STDERR "$sql\n" if $debug; my $sth= $dbh->prepare($sql) or die(" prepare($sql) - " . $sth->errstr); $sth->execute(@$bindvalsref) or die(" execute - " . $sth->errstr); print STDERR "nrows= ".$sth->rows."\n" if ($debug); print STDERR "---------------------\n" if $debug; return $sth; } sub dbiSelectNFetch { my($dbh,$sql,$bindvalsref)= @_; my $dbg= $debug; $debug= 0; my $sth= dbiSelect($dbh, $sql, $bindvalsref); $debug= $dbg; return $sth->fetchrow_array; } sub printFeatureTables { foreach $chrname (@chrs) { $chr= 'chr'.$chrname; my $foutname= "$outpath/features-$chrname.tsv"; open(FOUT,">$foutname"); my $datafile= $mework . "$org/dna-$chrname.fasta"; print FOUT &feattabheader("$org/features-$chrname.tsv", $org, $datafile); printOrderedFeatsFromSql(*FOUT, $chr); close(FOUT); } } sub parseGeneFeat { my ($fh, $egid, $goldid, $gsym)= @_; @dbx= (); @notes= (); ## don't need all/any these in features table ! # if ($fld{omimId}) { push(@dbx,"OMIM:$fld{omimId}"); } # if ($fld{gdbId}) { push(@dbx,"$fld{gdbId}"); } # if ($fld{hugoId}) { push(@dbx,"HUGO:$fld{hugoId}"); } # if ($fld{locusLinkId}) { push(@dbx,"LOCUSLINK:$fld{locusLinkId}"); } # if ($fld{refSeqAcc}) { push(@dbx,"REFSEQ:$fld{refSeqAcc}"); } $srange= "$fld{txStart}..$fld{txEnd}"; if ($fld{strand} eq '-') { $srange = "complement($srange)"; } insertFeat( 'gene', $egid, $goldid, $chr, $fld{txStart}, $fld{txEnd}, $srange, $gsym, $fld{hugoMap}, join(',',@dbx), join(',',@notes) ); # printFeat($fh, 'gene', $srange, $gsym, # $fld{hugoMap}, $egid, join(',',@dbx), join(',',@notes), # $chr, $goldid ); # both mRNA and CDS - same? # $srange= "$fld{cdsStart}..$fld{cdsEnd}"; @dbx= (); push(@dbx,"Gold:$goldid"); ##?? GOLDENPATH:, GOPATH: AUPATH: GOLD: @eb= split(',', $fld{exonStarts}); @ee= split(',', $fld{exonEnds}); $srange= ($#eb == 0) ? '' : 'join('; for my $i (0..$#eb) { $srange .= ',' if ($i>0); $srange .= "$eb[$i]..$ee[$i]"; } $srange .= ($#eb == 0) ? '' : ')'; if ($fld{strand} eq '-') { $srange = "complement($srange)"; } insertFeat( 'mRNA', $egid, $goldid, $chr, $fld{txStart}, $fld{txEnd}, $srange, $gsym, undef, join(',',@dbx), undef ); # printFeat($fh, 'mRNA', $srange, $gsym, # '', $egid, join(',',@dbx), '', # $chr, $goldid ); } sub insertFeat { # my(featname,egid,goldid,chrom,bpStart,bpEnd,featrange,sym,map,dbx,notes ) = @_; # my $v= join(',',@_); my $v= join(',', map{ $dbh->quote($_);} @_ ); my $sql= 'INSERT INTO features ('.join(',',@featflds).') VALUES ('.$v.' )'; $dbh->do($sql) or die($sql); } sub printOrderedFeatsFromSql { my ($fh, $chr)= @_; my $sth= dbiSelect( $dbh, $featSelectSql, [$chr]); my $row; my %rh; while (($row = $sth->fetchrow_hashref)) { %rh= %$row; printFeat( $fh, $rh{featname}, $rh{featrange}, $rh{sym}, $rh{map}, $rh{egid}, $rh{dbx}, $rh{notes}, # these should be array refs !? $rh{chrom}, $rh{goldid} ); } } sub printFeat { my ($fh, $featname, $srange, $gsym, $map,$id,$rdbx,$rfnotes,$chr,$seqid)= @_; unless($gsym) { if ($featname =~ /^source/) { $gsym= $org; } else { $gsym= '-'; } } unless($map) { if ($featname =~ /^(source|scaffold)/ && $csomeFile) { $map= $csomeFile; } else { $map= '-'; } } print $fh "$featname\t$gsym\t$map\t$srange\t$id\t$rdbx"; # foreach (sort @{$rdbx}) { print $fh "$_,";} if ($wantnotes) { print $fh "\t$rfnotes"; # foreach (sort @{$rfnotes}) { print $fh "$_,";} # foreach (sort keys %{$rfnotes}) { # print $fh "$_,"; # # $allnotes{$_} += ${$rfnotes}{$_}; # } } # print $fh "\t$chr:$seqid"; print $fh "\n"; } sub feattabheader { my ($fname,$org,$sourcefile)= @_; my $filedate= $^T - 24*60*60 * (-M $sourcefile); my @tm= localtime( $filedate ); ## bad!? my $date= POSIX::strftime("%d-%B-%Y",@tm); $sourcefile =~ s/^$genomeshome//; return <prepare($sql) or die(" prepare($sql) - " . $sth->errstr); # $sth->execute() or die(" execute - " . $sth->errstr); # print STDERR "nvals= ".$sth->rows."\n" if ($debug); $sth= dbiSelect( $dbh, $geasql . "'$chr'"); while (($row = $sth->fetchrow_hashref)) { %fld= %$row; $goldid= $fld{name}; #! need to check that goldid not already in knownInfo/genieKnown ($known)= $dbh->selectrow_array("SELECT * FROM genieKnown WHERE name = '$goldid'"); next if ($known); ## need HUgnXXXXX id for these predicted genes $id = 'MEOW:HUgn' . sprintf("01%05d",++$predid); $gsym = $goldid; parseGeneFeat( $fout, $id, $goldid, $gsym); printGeneAcode( $aout, $id, $goldid, $gsym, 1) if ($doacode); } # $sql= $rnasql . "'$chr'"; # print STDERR "$sql\n---------------------\n" if $debug; # $sth= $dbh->prepare($sql) or die(" prepare($sql) - " . $sth->errstr); # $sth->execute() or die(" execute - " . $sth->errstr); # print STDERR "nvals= ".$sth->rows."\n" if ($debug); $sth= dbiSelect( $dbh, $rnasql . "'$chr'"); while (($row = $sth->fetchrow_hashref)) { %fld= %$row; # rnaGene: @rnaflds= qw/ name type strand chromStart chromEnd /; $featname= $fld{type}; # any fixup? == snRNA, rRNA, tRNA ... $srange= "$fld{chromStart}..$fld{chromEnd}"; if ($fld{strand} eq '-') { $srange = "complement($srange)"; } insertFeat( $featname, undef, undef, $chr, $fld{chromStart}, $fld{chromEnd}, $srange, $fld{name}, undef, undef, undef ); } } } # $table= 'genieKnown'; # genieKnown, genieAlt # $getflds= 'name,strand,txStart,txEnd,cdsStart,cdsEnd'; # $where= "where chrom = 'chr22'"; # # $sql= "select $getflds from $table $wbere"; # $sth=$dbh->prepare($sql) or die("select failed"); # $sth->execute() or die(" execute - " . $sth->errstr); # $rv = $sth->rows; # print "nvals= $rv for $sql\n"; # # $table= 'genieKnown,knownInfo'; # $getflds= 'genieKnown.name,genieKnown.strand,genieKnown.txStart,genieKnown.txEnd' # .',knownInfo.name, knownInfo.transId, knownInfo.geneId, knownInfo.geneName'; # $where= "where (chrom = 'chr22' && genieKnown.name = knownInfo.transId )"; # #?? is where name=transId needed? yes These get all known gene info (or most): mysql> select count(knownInfo.transId) -> from genieKnown, knownInfo , knownMore -> where (chrom = 'chr22' && genieKnown.name = knownInfo.transId -> && knownInfo.transId = knownMore.transId); +--------------------------+ | count(knownInfo.transId) | +--------------------------+ | 266 | +--------------------------+ 1 row in set (0.17 sec) mysql> select count(knownInfo.transId) -> from genieKnown, knownInfo , knownMore -> where (genieKnown.name = knownInfo.transId -> && knownInfo.transId = knownMore.transId); +--------------------------+ | count(knownInfo.transId) | +--------------------------+ | 9550 | +--------------------------+ 1 row in set (2.79 sec) SELECT genieKnown.name,strand,txStart,txEnd,cdsStart,cdsEnd FROM genieKnown WHER E chrom = 'chr22' --------------------- nvals= 266 ge.name strand txStart txEnd cdsStart cdsEnd km.name gbProductName gbProteinAcc omimId omimName hugoId hugoSymbol hugoName hugoMap refSeqAcc locusLinkId gdbId aliases gbProductName -------------- RS.ctg22fin3-000000.0.0 - 14015958 14017919 14016002 14017676 KCNMB3L 4467933 NP_055221.1 0 0 NM_014406 27093 potassium large conducta nce calcium-activatedchannel, subfamily M, beta member 3-like RS.ctg22fin3-000000.1.0 + 14484177 14485187 14484498 14485078 CYCL 4467935 NP_066359.1 0 0 NM_021031 25835 cytochrome c-like antige n RS.ctg22fin3-000000.2.0 + 14510176 14535424 14510208 14534937 IL17R 4467937 NP_055154.1 0 0 NM_014339 23765 interleukin 17 receptor RS.ctg22fin3-000000.3.0 - 14562636 14584084 14563137 14584058 FLJ20454 4467939 NP_060299.1 0 0 NM_017829 55649 hypothetical pro tein FLJ20454 mysql> select name,strand,txStart,txEnd,cdsStart,cdsEnd from genieAlt where chrom = 'chr22' limit 10; +---------------------+--------+---------+---------+----------+---------+ | name | strand | txStart | txEnd | cdsStart | cdsEnd | +---------------------+--------+---------+---------+----------+---------+ | A.ctgC22-000000.1.0 | + | 43884 | 47845 | 43884 | 44022 | | A.ctgC22-000000.0.0 | - | 33565 | 74884 | 33565 | 33634 | | A.ctgC22-000000.3.0 | + | 114491 | 125653 | 122586 | 125653 | | A.ctgC22-000000.2.8 | - | 134031 | 139119 | 134031 | 134385 | | A.ctgC22-000000.4.0 | - | 357885 | 366345 | 363222 | 366345 | | A.ctgC22-000001.0.0 | + | 973520 | 980193 | 976202 | 980193 | | A.ctgC22-000001.1.0 | + | 1027022 | 1123748 | 1122699 | 1122945 | mysql> select name,strand,txStart,txEnd,cdsStart,cdsEnd from genieKnown -> where chrom = 'chr22' limit 10; +-------------------------+--------+----------+----------+----------+----------+ | name | strand | txStart | txEnd | cdsStart | cdsEnd | +-------------------------+--------+----------+----------+----------+----------+ | RS.ctg22fin3-000000.0.0 | - | 14015958 | 14017919 | 14016002 | 14017676 | | RS.ctg22fin3-000000.1.0 | + | 14484177 | 14485187 | 14484498 | 14485078 | | RS.ctg22fin3-000000.2.0 | + | 14510176 | 14535424 | 14510208 | 14534937 | | RS.ctg22fin3-000000.3.0 | - | 14562636 | 14584084 | 14563137 | 14584058 | | RS.ctg22fin3-000000.4.0 | - | 14604112 | 14634675 | 14606289 | 14634463 | | RS.ctg22fin3-000000.5.0 | - | 15015094 | 15051636 | 15015599 | 15051561 | select name,strand,txStart,txEnd from genieKnown, knownInfo where chrom = 'chr22' limit 10; select genieKnown.name,genieKnown.strand,genieKnown.txStart,genieKnown.txEnd , knownInfo.name, knownInfo.transId, knownInfo.geneId, knownInfo.geneName from genieKnown, knownInfo where (chrom = 'chr22' && genieKnown.name = knownInfo.transId ) limit 10; select genieKnown.name,genieKnown.strand,genieKnown.txStart,genieKnown.txEnd , knownInfo.name, knownInfo.transId, knownInfo.geneId, knownInfo.geneName , geneName.id, geneName.name from genieKnown, knownInfo, geneName where (genieKnown.chrom = 'chr22' && genieKnown.name = knownInfo.transId && knownInfo.geneName = geneName.id) limit 10; SELECT genieKnown.name,genieKnown.strand,genieKnown.txStart,genieKnown.txEnd , knownInfo.name, knownInfo.transId, knownInfo.geneId, knownInfo.geneName SELECT COUNT(*) FROM genieKnown LEFT JOIN knownInfo ON genieKnown.name = knownInfo.transId WHERE ( chrom = 'chr22' ); SELECT COUNT(*) FROM genieAlt LEFT JOIN knownInfo ON genieAlt.name = knownInfo.transId WHERE ( chrom = 'chr22' ); SELECT * from genieAlt, knownInfo WHERE ( chrom = 'chr22' && genieAlt.name = knownInfo.transId ) limit 10; SELECT count(*) from genieAlt, knownInfo WHERE (genieAlt.name = knownInfo.transId ) +----------+ | count(*) | +----------+ | 3714 | ??? why are these in knownInfo ? -- aha, these where copied or moved into genieKnown table (same .name/transId in both) LEFT JOIN genieKnown,knownInfo ON genieKnown.name = knownInfo.transId # knownInfo.name == geneName.name # knownInfo.geneName = geneName.id # genieKnown.name = knownInfo.transId mysql> SELECT COUNT(*) -> FROM genieKnown -> LEFT JOIN knownInfo ON genieKnown.name = knownInfo.transId -> WHERE (chrom = 'chr22' ); +----------+ | COUNT(*) | +----------+ | 266 | == same as # chr22 in genieKnown, 466 in genieAlt mysql> select count(*) from genieAlt -> left join genieKnown on genieAlt.name = genieKnown.name -> ; +----------+ | count(*) | +----------+ | 19168 | mysql> select count(*) from genieAlt -> left join genieKnown on genieAlt.name = genieKnown.name -> where ( isnull(genieKnown.name)); +----------+ | count(*) | +----------+ | 15454 | mysql> select count(*) from genieAlt, genieKnown -> where (genieAlt.name = genieKnown.name) ; +----------+ | count(*) | +----------+ | 3714 | oat% grep -c '^ID|' $mm/HU*acode 24456 oat% grep -c 'CLA|Predicted' $mm/HU*acode 14997 perl -e 'print 15454 - 14997;' = 457 acode.known= 9459 (versus 9550 in sql -- diff due to chr..random data? errors? dups?) 91 genieKnown are in random chrs == 9550 - 9459 457 in genieAlt, not in genieKnown, are in random chrs mysql> select count(*) from genieAlt where chrom like '%random%' ; mysql> select count(*) from genieAlt -> left join genieKnown on genieAlt.name = genieKnown.name -> where ( isnull(genieKnown.name) && genieAlt.chrom like '%random%' ); | 457 | select * from knownMore where (locusLinkId > 0 && locusLinkId < 10) *************************** 1. row *************************** name: A2M transId: RS.ctg17396-000001.4.0 geneId: RS.ctg17396-000001.4 gbGeneName: 178538 gbProductName: 4462222 gbProteinAcc: NP_000005.1 gbNgi: 6226959 gbPgi: 4557225 omimId: 103950 omimName: A2M hugoId: 7 hugoSymbol: A2M hugoName: alpha-2-macroglobulin hugoMap: 12p13.3-p12.3 pmId1: 4294967248 pmId2: 2581640270 refSeqAcc: NM_000014 aliases: locusLinkId: 2 gdbId: GDB:119639 *************************** 2. row *************************** name: NAT1 transId: RS.ctg45-000013.0.0 geneId: RS.ctg45-000013.0 gbGeneName: 200006 gbProductName: 4471823 gbProteinAcc: NP_000653.1 gbNgi: 4505334 gbPgi: 4505335 omimId: 108345 omimName: NAT1 hugoId: 7645 hugoSymbol: NAT1 hugoName: N-acetyltransferase 1 (arylamine N-acetyltransferase) hugoMap: 8p23.1-p21.3 pmId1: 7773298 pmId2: 2581640918 refSeqAcc: NM_000662 aliases: locusLinkId: 9 gdbId: GDB:125364 mysql> select count(*) from knownMore where (locusLinkId > 0); +----------+ | count(*) | +----------+ | 8836 | +----------+ 1 row in set (0.15 sec) mysql> select count(*) from knownMore where (locusLinkId = 0); +----------+ | count(*) | +----------+ | 714 | +----------+ mysql> select name,refSeqAcc from knownMore where (locusLinkId = 0) -> limit 10; +-----------------------+-----------+ | name | refSeqAcc | +-----------------------+-----------+ -| HBp15/L22 | | *| VAMP3 | | -| C.ctg13980-000000.6.0 | | -| C.ctg13980-000000.9.0 | | -| CD30v | | -| PRO2222 | | *| hPAD-colony10 | | -| PAD-H19 | | *| PRO1489 | | *| HS6ST | | ^^^ some now have locusLink record, same name mysql> select count(*) from genieKnown where (chrom like '%random'); +----------+ | count(*) | +----------+ | 91 | mysql> select chrom from genieKnown -> where (chrom like '%random') -> GROUP BY chrom -> limit 30; +--------------+ | chrom | +--------------+ | chr14_random | | chr15_random | | chr16_random | | chr17_random | | chr19_random | | chr1_random | | chr2_random | | chr7_random | | chrNA_random | | chrUL_random | | chrX_random | #-- new table for generating/using feature maps #? id == primary/unique key #? sym, map, dbx, notes can be null - but need NOT NULL to index CREATE TABLE features ( featname varchar(255) DEFAULT '' NOT NULL, id varchar(255) DEFAULT '' NOT NULL, chrom varchar(255) DEFAULT '' NOT NULL, bpStart int(10) unsigned DEFAULT '0' NOT NULL, bpEnd int(10) unsigned DEFAULT '0' NOT NULL, featrange longblob NOT NULL, sym varchar(255) DEFAULT '' NOT NULL, map varchar(255) DEFAULT '' NOT NULL, dbx longblob NOT NULL, notes longblob NOT NULL, KEY featname (featname), KEY id (id), KEY chrom (chrom(12),bpStart), KEY chrom_2 (chrom(12),bpEnd) ); --- problem w/ knownMore - no LL id, no symbol oat% grep '^ID|' HU*acode | grep -v MEOW | wc 432 == no ID assignable mysql> select count(*) from knownMore where (locusLinkId = 0 -> && name = transId); +----------+ | count(*) | +----------+ | 208 | +----------+ 1 row in set (0.18 sec) mysql> select name from knownMore where (locusLinkId = 0 -> && name = transId) limit 50; +-------------------------+ | name | +-------------------------+ | C.ctg13980-000000.6.0 | | C.ctg13980-000000.9.0 | name: C.ctg15142-000000.0.0 transId: C.ctg15142-000000.0.0 geneId: C.ctg15142-000000.0 gbGeneName: 0 gbProductName: 39307 gbProteinAcc: AAD51876.1 >> FAF1 (HUgn0011124) gbNgi: 5805195 gbPgi: 5805196 omimId: 0 omimName: hugoId: 0 hugoSymbol: hugoName: hugoMap: pmId1: 0 pmId2: 0 refSeqAcc: aliases: locusLinkId: 0 gdbId: name: C.ctg17731-000000.8.0 transId: C.ctg17731-000000.8.0 geneId: C.ctg17731-000000.8 gbGeneName: 0 gbProductName: 0 gbProteinAcc: AAA20871.1 >> NSEP1 == LL:4904 name: C.ctg13701-000000.3.1 transId: C.ctg13701-000000.3.1 geneId: C.ctg13701-000000.3 gbGeneName: 0 gbProductName: 0 gbProteinAcc: AAA35793.1 transId: C.ctg13980-000000.9.0 geneId: C.ctg13980-000000.9 gbGeneName: 0 gbProductName: 0 gbProteinAcc: AAA35669.1