#!/bin/env icarus
# meow.is
# for acode format for MEOW Genes data
# d.gilbert, dec'99, from flybase versions
#
# Layout of this file:
#   $fldinfo   per-field-code properties (see below)
#   $recode    field-code renaming, applied in the 'fields' production
#   $dbrecode  database-name renaming, applied in the 'accessions' production
#   $rules     the record/field parsing, indexing and html productions
#
# NOTE: '#' comments run to end of line, so every production and every
# commented-out alternative must stay on its own line.  Joining lines makes
# the first '#' swallow all code that follows it.

## $fldinfo is hash of code => { vl:"visible label" accession:1 - parse accessions fld, ... }
##
## Flags that are read by productions in this file:
##   accession:1   field is handled by 'preacc'   (dbrecoder: names the recoder production)
##   function:1    field is handled by 'prefunc'  (funcrecoder: names the recoder production)
##   phenotype:1   field is handled by 'phenotypes'
##   symbol:1      field is handled by 'symbolsplit'
##   noindex:1     field is rejected by 'allwords'
##   noview:1      field is blanked by 'h_field'
##   vl:           label text used by 'h_field'
## The remaining flags (words, date, id, phrase, comment) are not read by any
## production in this file.
$fldinfo={
  GENR: { words:1 vl:"MEOW Gene Record" }
  CHR: { words:1 vl:"Chromosome location" }
  CLA: { words:1 vl:"Class of gene" }
  DID: { accession:1 vl:"Databank ID" dbrecoder:DBLrecoder }
  DBL: { accession:1 vl:"Databank link" dbrecoder:DBLrecoder }
  DT: { date:1 vl:"Date" }
  ENZ: { function:1 vl:"Enzyme name/number" funcrecoder:ENZrecoder }
  FNC: { function:1 vl:"Function of product" funcrecoder:FNCrecoder }
  HG: { accession:1 vl:"Homologues" dbrecoder:HGrecoder }
  ID: { id:1 vl:"ID" }
  IFL: { words:1 vl:"Interactive Fly" }
  MAP: { words:1 vl:"Map position" }
  NAM: { words:1 vl:"Full name" }
  ORG: { phrase:1 vl:"Organism" }
  PDOM: { accession:1 vl:"Protein domains" dbrecoder:PDOMrecoder }
  PHI: { phenotype:1 vl:"Phenotypic info." }
  PHP: { phenotype:1 vl:"Phenotypic info.(2)" }
  PRD: { function:1 vl:"Protein data" funcrecoder:FNCrecoder }
  REAB: { words:1 vl:"Abstract" }
  RETE: { noindex:1 vl:"Table Entry" noview:1}
  SYM: { symbol:1 vl:"Gene Symbol" }
  SYN: { symbol:1 vl:"Synonyms" }
  URL: { words:1 vl:"Url" }
  eor: { noindex:1 comment:"End of record" }
  sor: { noindex:1 comment:"Start of record" }
  skip: { noindex:1 comment:"Skip entry" noview:1 }
}

## Field codes found in the data (left) are rewritten to the code on the right
## by the 'fields' production before the token is written.
$recode={ #? need this?
  GSYM:SYM
}
# RETE:skip
# IFL:IFL ## recode IFL to DBL in FBgn for meow

## Database names found in DBL lines (left) are rewritten to the name on the
## right by the 'accessions' production.
$dbrecode={
  NUCACCNO:EMBL
  NA:EMBL
  PA:EMBL
  SWP:SWISSPROT
}

$rules={

# ---------- general definitions -----------------
# Small reusable patterns referenced by the productions below.
word: ~ /[0-9A-Za-z_]+/ ~
nonword: ~ (/[^0-9A-Za-z_\n]+/ | /\n +/) ~
phrase: ~ /[^\n]*/ ~
ln: ~ /[^\n]*\n/ ~
nl: ~ / *\n/ ~
notnl: ~ /[^\n]/ ~
# key: ~ /[0-9A-Za-z_]*[\|]/ ~
# lrec: ~ /([A-Za-z_]*)[\.]/ {$lrec=$1} ~ #? stop using this prefix?
# fbid: ~ /[A-Za-z][A-Za-z][A-Za-z][A-Za-z][0-9]+/ ~
# fbidsymbol: ~ fbid? '=='? phrase {$Uniq} ~
# ----------

#---------- record and field parsing

# entry: one record of the text file.  Lines are passed over until one starts
# with 'GENR'; the record is that line plus the following lines that do not
# start with 'GENR'.  init{} sets $mainrec; pre{} resets the per-record state
# ($code, $klass, $reclev, $lrec, $reclist) used by later productions.
entry: ~ { $In:[file:text] $Out
    init { $mainrec='GENR' }
    pre {
      $Skip:0
      $recclass='gene'
      $code='skip'
      $klass='determined'
      $reclev=1
      $lrec=$mainrec
      $reclist[$reclev]=$lrec
    }
  }
  ( 'GENR' { $Not } ln )*
  ( 'GENR' { $entryFip=$Fip $Wrt } ln { $App }
    ( 'GENR' { $Not } ln { $Wrt } )+
  )?
  ~

# entry_coded: copy of 'entry' with FlyBase markup decoded.
#   &Xgr; / &XXgr;      -> english name looked up in $greekcodes
#   <up> ... </up>      -> [ ... ]
#   <down> ... </down>  -> [[ ... ]]
# The four markup literals had been reduced to empty strings ('').  Empty
# alternatives inside ( ... )* consume no input, so they are restored here.
# The first alternative stops at '<' and '&', and the last alternative passes
# through any '<', '&' or newline that is not part of recognised markup.
entry_coded: ~ { $In:entry $Out $end=$Fip-1 }
  /./ {$Wrt}
  ( /[^<\&\n]+/ {$App}
    # decode FB greek markup to english
  | /\&([A-Za-z][A-Za-z]?)gr;/ {$App:[s:$greekcodes.$1]}
    # decode FB super/subscript markup to brackets
  | '<up>' {$App:[s:"["]}
  | '</up>' {$App:[s:"]"]}
  | '<down>' {$App:[s:"[["]}
  | '</down>' {$App:[s:"]]"]}
  | /[<\&\n]/ {$App}
  )*
  ~

# keynew: a field code, optionally followed by '|'; the code is capture $1.
keynew: ~ /([0-9A-Za-z_]+)[\|]?/ ~ ##? take out '?' after |

# fields: splits a decoded entry into one token per field.  Each line starts
# with one of:
#   CODE|  new field; CODE is mapped through $recode and used as token code
#   |      continuation line; no new token, the rest of the line is appended
#   {      start of a sub-record: writes a 'sor' token and pushes $code
#          onto $reclist as the current record name ($lrec)
#   }      end of a sub-record: writes an 'eor' token and pops $reclist
#   other  written as a 'skip' token
# The rest of the line, including the newline, is then appended.
fields: ~ { $In:entry_coded $Out $Skip:1 }
  (
    ( keynew { ## field key
        $code=$1
        if:$recode.$code!="" $code=$recode.$code
        $Wrt:[$code s:""] # s:"$code|"
      }
    | '|' ## continuation of field
      # {$App:[s:"\n"] }
      # {$Wrt:[$code s:""]} # s:"|"
    | '{' { ## start (sub)record # /[^\n]*/
        $Wrt:[sor]
        $lrec=$code
        $reclev=$reclev+1
        $reclist[$reclev]=$lrec
      }
    | '}' { ## end (sub)record # /[^\n]*/
        $Wrt:[eor]
        $reclev=$reclev-1
        $lrec=$reclist[$reclev]
      }
    # | '#' /[^\n]*/ ## comments
    | /[^\n]*/ {$Wrt:[skip]} ## shouldn't be any other line starts
    )
    ln {$App } # append rest, including newline - icarus needs newline!
  )+
  ~

#---------- indexing methods
# Building blocks used by the field-indexing productions further down.
i_word: ~ /[0-9A-Za-z_]+/ {$Uniq} ~ #{$Uniq:$Itc}
i_words: ~ /[0-9A-Za-z_]+/ {$Uniq} | /[^0-9A-Za-z_\n]+/ ~ #{$Uniq:$Itc}
i_phrase: ~ /[^\n]+/ {$Uniq} ~ ##{$Uniq:$Itc}
i_blackspace: ~ /[^ \t\n]+/ {$Uniq} ~ # {$Uniq:$Itc}

## controlled-vocab: index both as words and as phrase (' ' -> '_')
# i_cvterm: each word is indexed on its own while the whole text is
# collected in $cv; at the end $cv is trimmed, blanks are turned into '_',
# and the result is indexed as one term when it is not empty.
i_cvterm: ~ ## this one always gives -- no more input error !???
  x{$cv=''}
  ( /[a-z0-9A-Z_-]+/ {$Uniq $StrApp:[$cv s:$Ct]} ##$Uniq:$Itc
  | /[^\na-z0-9A-Z_-]+/ {$StrApp:[$cv s:$Ct]}
  )*
  x{
    $cv= $StrTrans:[s:$Trim:$cv from:' ' to:'_']
    if:$cv!='' $Uniq:[ s:$cv]
  } ##$Uniq:[$Itc
  ~

# i_cvterms: multi-line form of i_cvterm; the collected phrase is emitted and
# $cv is reset at every newline.
i_cvterms: ~
  x{$cv=''}
  ( /[a-z0-9A-Z_-]+/ {$Uniq $StrApp:[$cv s:$Ct]} ##$Uniq:$Itc
  | /[^\na-z0-9A-Z_-]+/ {$StrApp:[$cv s:$Ct]}
  | /[\n]/ {
      $cv= $StrTrans:[s:$Trim:$cv from:' ' to:'_']
      if:$cv!='' $Uniq:[ s:$cv]
      $cv=''
    } #$Uniq:[$Itc
  )*
  ~

# i_date: matches "<day> <Mon...> <year>" and indexes it as the number
# year*10000 + month*100 + day, with the month number taken from $months.
# Years below 100 are treated as 19xx.
i_date: ~
  /[^0-9]*([0-9]+) *([A-Z][a-z][a-z])[a-z]* *([0-9]+)/ {
    $yr=$3
    if:$yr<100 $yr= 1900 + $yr
    $Uniq:[ s:($yr*10000 + $months.$2*100 + $1)] ##$Uniq:[$Itc
  }
  ~

#---------- field indexing
# dbname: database name followed by '/' or ':' (name is capture $1).
dbname: ~ /([A-Za-z_]+)[\/:]/ ~
dbaccession: ~ /([0-9A-Za-z_\.-]+)/ ~

# preacc: accepts only fields flagged accession:1 in $fldinfo and runs the
# recoder production named by that field's dbrecoder (default 'DBLrecoder').
# The recoders emit normalised lines such as "DBL|db:acc" and "WORD|w".
preacc: ~ {$In:[fields] $Out pre{$Skip:1} }
  x{ if:$Itc == "" || $fldinfo.($Itc).accession!=1 $Fail}
  x{
    $dbrecoder=$fldinfo.($Itc).dbrecoder
    if:$dbrecoder=="" $dbrecoder= 'DBLrecoder'
  }
  ( x{$Prod:$dbrecoder} )*
  ~

# accessions: reads the "DBL|db:acc" lines produced by preacc.  The db name
# is mapped through $dbrecode, then the accession is indexed under the db
# name and "db:acc" is indexed under DBL.  Other lines are consumed unindexed.
accessions: ~ {$In:[preacc] $Out }
  ( 'DBL|' dbname {
      $db=$1
      if:$dbrecode.$db!="" $db= $dbrecode.$db
    }
    ## dbaccession ==? too restrictive a patt? may have to deal w/ symbols as ids
    /[^\n]+/ {
      $acc=$Ct
      $Uniq:[$db s:"$acc"]
      $Uniq:[DBL s:"$db:$acc"]
    }
    ln?
  | ln
  )+
  ~

DBLrecoder: ~ ## recode lines for easier parsing by several productions
  ( ( dbname { $db=$1 }
      dbaccession {
        $acc=$1
        $Uniq:[$Itc s:{"DBL|%s:%s\n" $db $acc}]
      }
    )
  | word { $Uniq:[$Itc s:{"WORD|%s\n" $Ct}] }
  | /./
  )*
  ~

## PDOM string: |PS00905 == GTP1/OBG family signature. |
## PDOM string: |Leucine-rich repeat protein. ## PFAM:PF00560 ## Transmembrane domain protein.|
# Same as DBLrecoder, plus bare PROSITE ids (PS...) are emitted as
# "DBL|PROSITE:id" and '==' is consumed without output.
PDOMrecoder: ~ ## recode a line for easier parsing by several productions
  ( ( dbname {$db= $1}
      dbaccession {
        $acc=$1
        $Uniq:[$Itc s:{"DBL|%s:%s\n" $db $acc}]
      }
    )
  | /(PS[0-9A-Za-z_\.]+)/ {
      $acc=$1
      $Uniq:[$Itc s:{"DBL|PROSITE:%s\n" $acc}]
    }
  | '==' #? do phrase? should these words go into functions?
  | word { $Uniq:[$Itc s:{"WORD|%s\n" $Ct}] }
  | /./
  )*
  ~

# HG|species == Homo sapiens; gene == ABL1; GDB:119640; OMIM:189980
# HG|species == Phoneutria nigriventer; gene == 'neurotoxin TX2-6'; SWP:P29425
# HG|species == Drosophila melangaster; gene == fz (frizzled)
# ?? drop "'" on genes ?
# Emits "ORG|species", "SYM|gene" (a leading quote is dropped by the
# capture), "DBL|db:acc" and "WORD|w" lines.
HGrecoder: ~
  ( 'species == ' /[^\;\=\n\,]+/ { $Uniq:[$Itc s:{"ORG|%s\n" $Ct}] }
  | 'gene == ' /'?([^\;\=\n\,\']+)/ { $Uniq:[$Itc s:{"SYM|%s\n" $1}] }
  | /[;']/
  | ( dbname {$db= $1}
      dbaccession {
        $acc=$1
        $Uniq:[$Itc s:{"DBL|%s:%s\n" $db $acc}]
      }
    )
  | word { $Uniq:[$Itc s:{"WORD|%s\n" $Ct}] }
  | /./
  )*
  ~

preHG: ~ {$In:[fields c:HG] $Out } HGrecoder ~

##? add 'homolog' to index stream? - many data sets use this term in name,other flds
## but don't have HG fld, e.g. NAME= 'xxx (Mouse) homolog...'
## ? or pull data from other fields (in extraction methods) and put into HG fld?
## Query: [libs={FBgn HUgn MGgn ATgn CEgn SGgn ZFgn}-all:homolog*] , No. matches= 2312
# HG: indexes the recoded homologue lines from preHG by their prefix.
HG: ~ {$In:[preHG] $Out pre{$Skip:0}}
  ( 'DBL|' /[^\n]*/ {$Uniq:DBL}
  | 'ORG|' /[^\n]*/ {$Uniq:ORG}
  | 'SYM|' /[^\n]*/ {$Uniq:SYM}
  | 'WORD|' /[^\n]*/ {$Uniq}
  | /./
  )+
  ~

# FNCrecoder: emits the matched text as a single "TERM|..." line.
FNCrecoder: ~
  # ( /[^\n]+/ { $Uniq:[$Itc s:{"TERM|%s\n" $Ct}] } | /./ )*
  /.+/ {$Uniq:[$Itc s:{"TERM|%s\n" $Ct}] }
  ~

# prefunc: accepts only fields flagged function:1 in $fldinfo and runs the
# recoder production named by that field's funcrecoder (default 'FNCrecoder').
prefunc: ~ {$In:[fields] $Out pre{$Skip:0} }
  x{ if:$Itc == "" || $fldinfo.($Itc).function!=1 $Fail}
  x{
    $funcrecoder=$fldinfo.($Itc).funcrecoder
    if:$funcrecoder=="" $funcrecoder= 'FNCrecoder'
  }
  ( x{$Prod:$funcrecoder} )*
  ~

# functions: indexes the recoded lines from prefunc.  EC numbers are indexed
# both bare and with an "EC" prefix; TERM lines go through i_cvterm.
# NOTE(review): in the collapsed source it is ambiguous whether
# '| keynew i_cvterm' was live or commented out; it is kept as a comment,
# exactly as it followed the '#??' marker.  Confirm against the original.
functions: ~ {$In:[prefunc] $Out }
  ( 'ECNUM|' /[^\n]*/ {
      $Uniq:[$Itc s:"$Ct"]
      $Uniq:[$Itc s:"EC$Ct"]
    }
  | 'TERM|' i_cvterm #?? | keynew i_cvterm
  | /./
  )+
  ~

# phenotypes: word-indexes fields flagged phenotype:1 in $fldinfo.
# NOTE(review): this uses (i_words nl)+ while 'allwords' uses (i_words+ nl)+;
# check whether one i_words per line is really intended here.
phenotypes: ~ {$In:[fields] $Out pre{$Skip:1}} # c:{PHP PHI}
  x{if:$Itc == "" || $fldinfo.($Itc).phenotype!=1 $Fail}
  (i_words nl)+
  ~

ecnum: ~ /[0-9-]+\\.[0-9-]+\\.[0-9\\.-]+/ ~

# Enzyme - CV field + EC num field
# 'ENZ|arylalkylamine N-acetyltransferase == EC 2.3.1.87'
# 'ENZ|glycerol-3-phosphate dehydrogenase (NAD+) == EC 1.1.1.8'
# Emits "ECNUM|n.n.n.n" for EC numbers and "TERM|name" (trimmed) for the
# text that contains no '='.
ENZrecoder: ~
  ( '=='
  | /EC[ :-]*/? ecnum { $Uniq:[$Itc s:{"ECNUM|%s\n" $Ct}] }
  | /[^=\n]+/ { $Uniq:[$Itc s:{"TERM|%s\n" $Trim:$Ct}] }
  | /./
  )*
  ~

preENZ: ~ {$In:[fields c:ENZ] $Out pre{$Skip:1}} ENZrecoder ~

ENZ: ~ {$In:[preENZ] $Out } ## ? do we need this one? handle all in functions?
  ( 'ECNUM|' /[^\n]*/ {$Uniq:ECNUM}
  | 'TERM|' /[^\n]*/ {$Uniq:TERM}
  | /./
  )+
  ~

# idpatt: four letters followed by digits.
idpatt: ~ /[A-Za-z][A-Za-z][A-Za-z][A-Za-z][0-9]+/ ~

# ID: indexes ids only while in the main record ($lrec == $mainrec) and
# stores the matched id in $entryName.
ID: ~ { $In:[fields c:ID] $Out }
  ( idpatt {if:$lrec == $mainrec {$Uniq $entryName=$Ct}}
  | /./
  )+
  ~

# allids: indexes every id found in ID and ID2 fields, at any record level.
allids: ~ { $In:[fields c:{ID ID2}] $Out }
  ( idpatt {$Uniq }
  | /./
  )+
  ~

# symbolsplit: for fields flagged symbol:1 in $fldinfo, splits the text into
# separate tokens at '::' and ': '.  $inapp is 0 when the next piece starts
# a new token and 1 while pieces are appended to the current one.  A ':'
# directly followed by a character other than blank, ':' or newline is kept
# as part of the symbol.
symbolsplit: ~ {$In:[fields] $Out pre{$Skip:0 $inapp=0}}
  x{if:$Itc == "" || $fldinfo.($Itc).symbol!=1 $Fail}
  ( /[^:\n]+/ { if:$inapp==0 {$Wrt:$Itc $inapp=1} else $App:$Itc}
  | '::' { $inapp=0 }
  | ': ' { $inapp=0 }
  | /:[^ :\n]/ { if:$inapp==0 {$Wrt:$Itc $inapp=1} else $App:$Itc }
  | /./ {$App:$Itc}
  )+
  ~

SYM_CS: ~ {$In:[symbolsplit c:SYM] $Out } # _CS == case-sensitive field, requires srs c changes
  x{if:$lrec!=$mainrec $Fail} #? do in pre{}
  ## Species\Symbol is funky drosophila symbol syntax, and \\\\ is icarus match for '\'
  ( /([^ \\]+)\\\\/? { $Uniq:[s:$1]} i_phrase nl? )+
  ~

synonyms_CS: ~ {$In:[symbolsplit c:{SYM SYN}] $Out }
  x{if:$lrec!=$mainrec $Fail} #? do in pre{}
  ( /([^ \\]+)\\\\/? { $Uniq:[s:$1]} i_phrase nl? )+
  ~

# SYM / synonyms: re-index the tokens of the corresponding _CS production.
SYM: ~ {$In:SYM_CS $Out } (/.+/ {$Uniq})+ ~
synonyms: ~ {$In:synonyms_CS $Out } (/.+/ {$Uniq})+ ~

# NAM: full name of the main record, indexed as controlled-vocabulary terms.
NAM: ~ {$In:[fields c:NAM] $Out pre{$Skip:0}}
  x{if:$lrec != $mainrec $Fail} #? do in pre{}
  i_cvterms
  # ( i_cvterm /./ )*
  # ((i_cvterm /[\n]/ ) | /./)*
  # i_cvterm+
  ~

DT: ~ {$In:[fields c:DT] $Out } ( i_date | /./ )+ ~
ORG: ~ {$In:[fields c:ORG] $Out } ( i_phrase | /./ )+ ~

## these should be numeric, but vary so widely among orgs will be a big task
## is only first ~word valid?
MAP: ~ {$In:[fields c:MAP] $Out } i_blackspace ~
CHR: ~ {$In:[fields c:CHR] $Out } i_blackspace ~

## CLA should be a cvocab - Predicted, ORF, determined
## default = determined, Experimentally determined or Confirmed
# preCLA: reads CLA and ID fields and writes the first non-blank run of each;
# when the field is CLA that text is also stored in $klass.  $klass is reset
# to 'determined' for every record in the pre{} block of 'entry'.
preCLA: ~ {$In:[fields c:{CLA ID}] $Out }
  /[^ \t\n]+/ {$Wrt if:$Itc==CLA $klass=$Ct }
  ~

# CLA: indexes the current value of $klass for each preCLA token.
CLA: ~ {$In:preCLA $Out}
  /.+/ {$Uniq:[s:$klass]}
  ~

# allwords: word-indexes every field that is not flagged noindex in $fldinfo.
allwords: ~ {$In:[fields] $Out }
  x{ if:$fldinfo.($Itc).noindex $Fail}
  (i_words+ nl)+
  ~
## x{if:$Itc == "" || $fldinfo.($Itc).noindex $Fail}
## i_words+
## ! THIS IS BAD - puts word in many fld indices: {$Uniq:$Itc}
##((/[0-9A-Za-z_]+/ {$Uniq:$Itc} | /[^0-9A-Za-z_\n]+/) nl)+
##((/[0-9A-Za-z_]+/ {$Uniq} | /[^0-9A-Za-z_\n]+/) nl)+

#---------- html output
# NOTE(review): the $Rep strings in h_sor, h_eor and h_field look as if HTML
# tags were stripped from them (h_sor replaces with "", and both $reclev
# branches of h_field emit the same "$vl").  They are reproduced exactly as
# found, including the line break inside the h_eor string; restore the
# markup from the original file if it is available.
h_sor: ~ { $In:[fields c:sor t:html] $Out }
  x{$Rep:""}
  /.*/
  ~

h_eor: ~ { $In:[fields c:eor t:html] $Out }
  x{$Rep:"
\n"}
  /.*/
  ~

# h_field: puts the field's visible label ($fldinfo vl) in front of the field
# text; fields flagged noview are replaced by the empty string instead.
h_field: ~ { $In:[fields t:html] $Out }
  x{
    $vl= $fldinfo.($Itc).vl
    if:$fldinfo.($Itc).noview { $Rep:'' }
    elif:$reclev>0 {$Rep:"$vl" }
    else {$Rep:"$vl"}
  }
  /.+/ {if:$fldinfo.($Itc).noview $Rep:''}
  x{
    if:$fldinfo.($Itc).noview { $Rep:'' }
    elif:$reclev>0 { $Rep:"\n"}
  }
  ~

# h_ENZ: replaces an EC number with the string built by $ParStr from the
# 'enzymeR' template, passing the number twice.
h_ENZ: ~ {$In:[ENZ c:ECNUM t:html] $Out }
  /.*/ {$Rep:{$ParStr:enzymeR $Ct $Ct}}
  ~

# h_accessions: for "db:acc" tokens, looks the db name up in $dblinks and,
# when an entry is found, replaces the accession with the string built by
# $ParStr from that entry, passing the accession twice.
h_accessions: ~ {$In:[accessions c:DBL t:html] $Out }
  ( dbname { $db=$1 }
    /[^\n]+/ {
      $acc=$Ct
      $dbr=$dblinks.($db)[1]
      if:$dbr!="" $Rep:{$ParStr:$dbr $acc $acc}
    }
  | /./
  )+
  ~

alllinks: ~ {$In:[fields c:DBL t:html] $Out}
  /.*/ {$Uniq}
  ~
# ^^ not working, need? to call html parser for each [fields c:code] ?

} # end $rules

## ? can we use a 'global' var like this in rules, pre initialized properly?
# $mainrec is overwritten with 'GENR' in the init{} block of 'entry', and
# $recclass with 'gene' in its pre{} block.
$mainrec= 'MAINREC'
$recclass='CLASS'
$reclist={lev1 lev2 lev3 lev4 lev5 lev6 lev7 lev8 lev9} ## need to init array size !

# Month abbreviation -> month number, used by i_date.
$months={
  Jan:1 Feb:2 Mar:3 Apr:4 May:5 Jun:6
  Jul:7 Aug:8 Sep:9 Oct:10 Nov:11 Dec:12
}

# Letter code from "&<code>gr;" markup -> english name, used by entry_coded.
$greekcodes={
  'a':'alpha' 'A':'Alpha'
  'b':'beta' 'B':'Beta'
  'g':'gamma' 'G':'Gamma'
  'd':'delta' 'D':'Delta'
  'e':'epsilon' 'E':'Epsilon'
  'z':'zeta' 'Z':'Zeta'
  'ee':'eta' 'EE':'Eta'
  'th':'theta' 'TH':'Theta'
  'i':'iota' 'I':'Iota'
  'k':'kappa' 'K':'Kappa'
  'l':'lambda' 'L':'Lambda'
  'm':'mu' 'M':'Mu'
  'n':'nu' 'N':'Nu'
  'x':'xi' 'X':'Xi'
  'o':'omicron' 'O':'Omicron'
  'p':'pi' 'P':'Pi'
  'r':'rho' 'R':'Rho'
  's':'sigma' 'S':'Sigma'
  't':'tau' 'T':'Tau'
  'u':'upsilon' 'U':'Upsilon'
  'ph':'phi' 'PH':'Phi'
  'kh':'chi' 'KH':'Chi'
  'ps':'psi' 'PS':'Psi'
  'oh':'omega' 'OH':'Omega'
  'sf':'s'
}

# Database name -> name of the link template handed to $ParStr in
# h_accessions.
$dblinks={
  EMBL: emblR
  PROSITE: prositeR
  SWISSPROT: swissR
  UNIGENE: unigeneR
  LOCUSLINK: locuslinkR
  MEDLINE: medlineR
  ECNUM: enzymeR
  FLYBASE: flygeneR
  GDB: gdbR
  PFAM: pfamR
}

# Testing.......................
# Ad-hoc test driver, executed only when $TestMode is set.  It creates a job
# over $rules for one hard-coded data file and, for every entry in that file,
# prints the tokens of the productions that are not commented out below.
# Swap the fileName line to test another data set.
if:$TestMode {
  $job = $JobNew:[prod:$rules skip:" "
    fileName:'/bio/data/meow/FBgn.acode']
    # fileName:'SRSROOT:data/meow/FBgn.acode']
    # fileName:'SRSROOT:data/FBgn.acode']
    # fileName:'SRSROOT:data/ATgn.acode']
    # fileName:'SRSROOT:data/MGgn.acode']
    # fileName:'SRSROOT:data/SGgn.acode']
    # fileName:'SRSROOT:data/ZFgn.acode']
    # fileName:'SRSROOT:data/CEgn.acode']
    # fileName:'SRSROOT:data/HUgn.acode']

  while:$JobHasInput:$job {
    $JobTokens:[$job name:fields print:1]
    ## $JobTokens:[$job task:html name:h_fields print:1] # no good
    # $JobTokens:[$job name:ID print:1]
    # $JobTokens:[$job name:SYM print:1]
    # $JobTokens:[$job name:SYN print:1]
    # $JobTokens:[$job name:NAM print:1]
    # $JobTokens:[$job name:ORG print:1]
    # $JobTokens:[$job name:CHR print:1]
    # $JobTokens:[$job name:MAP print:1]
    $JobTokens:[$job name:ENZ print:1]
    $JobTokens:[$job name:HG print:1]
    $JobTokens:[$job name:accessions print:1]
    $JobTokens:[$job name:functions print:1]
    $JobTokens:[$job name:phenotypes print:1]
    # $JobTokens:[$job name:DID print:1]
    # $JobTokens:[$job name:DBL print:1]
    # $JobTokens:[$job name:PDOM print:1]
    # $JobTokens:[$job name:DT print:1]
    # $JobTokens:[$job name:allwords print:1]
    $JobNext:$job
  }
}