#!/bin/env icarus
# meow.is
# Icarus parser for the acode format of MEOW Genes data
# d.gilbert, dec'99, from flybase versions
## $fldinfo is hash of code => { vl:"visible label" accession:1 - parse accessions fld, ... }
## Attributes that the rules in this file read:
##   vl                      - label used by h_field for the html view
##   accession, dbrecoder    - field is accepted by preacc, which runs the named recoder
##   function, funcrecoder   - field is accepted by prefunc, which runs the named recoder
##   phenotype               - field is accepted by the phenotypes production
##   symbol                  - field is accepted by the symbolsplit production
##   noindex                 - field is rejected by the allwords production
##   noview                  - field is blanked by h_field
## The words/date/id/phrase flags and the comment:"..." text are not read by any
## rule visible in this file - presumably informational or used elsewhere (TODO confirm).
$fldinfo={
GENR: { words:1 vl:"MEOW Gene Record" }
CHR: { words:1 vl:"Chromosome location" }
CLA: { words:1 vl:"Class of gene" }
DID: { accession:1 vl:"Databank ID" dbrecoder:DBLrecoder }
DBL: { accession:1 vl:"Databank link" dbrecoder:DBLrecoder }
DT: { date:1 vl:"Date" }
ENZ: { function:1 vl:"Enzyme name/number" funcrecoder:ENZrecoder }
FNC: { function:1 vl:"Function of product" funcrecoder:FNCrecoder }
HG: { accession:1 vl:"Homologues" dbrecoder:HGrecoder }
ID: { id:1 vl:"ID" }
IFL: { words:1 vl:"Interactive Fly" }
MAP: { words:1 vl:"Map position" }
NAM: { words:1 vl:"Full name" }
ORG: { phrase:1 vl:"Organism" }
PDOM: { accession:1 vl:"Protein domains" dbrecoder:PDOMrecoder }
PHI: { phenotype:1 vl:"Phenotypic info." }
PHP: { phenotype:1 vl:"Phenotypic info.(2)" }
PRD: { function:1 vl:"Protein data" funcrecoder:FNCrecoder }
REAB: { words:1 vl:"Abstract" }
RETE: { noindex:1 vl:"Table Entry" noview:1}
SYM: { symbol:1 vl:"Gene Symbol" }
SYN: { symbol:1 vl:"Synonyms" }
URL: { words:1 vl:"Url" }
eor: { noindex:1 comment:"End of record" }
sor: { noindex:1 comment:"Start of record" }
skip: { noindex:1 comment:"Skip entry" noview:1 }
}
## $recode: field key as read from the data => field code to use instead.
## The 'fields' production looks every key up here and, when an entry exists,
## writes the token under the mapped code.
$recode={ #? need this?
GSYM:SYM
}
# RETE:skip
# IFL:IFL ## recode IFL to DBL in FBgn for meow
## $dbrecode: databank name as written in the data => canonical databank name.
## Consulted by the 'accessions' production before an accession is indexed.
$dbrecode={
  NUCACCNO: EMBL
  NA:       EMBL
  PA:       EMBL
  SWP:      SWISSPROT
}
$rules={
# ---------- general definitions -----------------
# word: one or more letters, digits or underscores
word: ~ /[0-9A-Za-z_]+/ ~
# nonword: a run of characters that are neither word characters nor newline,
# or a newline followed by at least one space
nonword: ~ (/[^0-9A-Za-z_\n]+/ | /\n +/) ~
# phrase: everything up to (not including) the next newline; may be empty
phrase: ~ /[^\n]*/ ~
# ln: everything up to and including the next newline
ln: ~ /[^\n]*\n/ ~
# nl: optional spaces followed by a newline
nl: ~ / *\n/ ~
# notnl: exactly one character that is not a newline
notnl: ~ /[^\n]/ ~
# key: ~ /[0-9A-Za-z_]*[\|]/ ~
# lrec: ~ /([A-Za-z_]*)[\.]/ {$lrec=$1} ~ #? stop using this prefix?
# fbid: ~ /[A-Za-z][A-Za-z][A-Za-z][A-Za-z][0-9]+/ ~
# fbidsymbol: ~ fbid? '=='? phrase {$Uniq} ~
# ----------
#---------- record and field parsing
# entry: cuts the text file into one token per gene record.
# init sets $mainrec to 'GENR'; pre resets the per-entry state ($code, $klass,
# $reclev, $lrec, $reclist[1]) before each entry is parsed.
# Shape of the body (the reading of $Not as "match only when the preceding
# literal does NOT match" is an assumption - TODO confirm):
#   1. skip any leading lines that do not start with 'GENR'
#   2. a line starting with 'GENR' opens the entry; its file position is kept in $entryFip
#   3. following lines that do not start with 'GENR' belong to the same entry
entry: ~ { $In:[file:text] $Out init { $mainrec='GENR' }
pre { $Skip:0
$recclass='gene' $code='skip' $klass='determined'
$reclev=1 $lrec=$mainrec
$reclist[$reclev]=$lrec
}
}
( 'GENR' { $Not } ln )*
( 'GENR' { $entryFip=$Fip $Wrt } ln { $App }
( 'GENR' { $Not } ln { $Wrt } )+
)?
~
# entry_coded: re-emits each entry with FlyBase inline markup turned into plain text.
#   &Xgr; / &XXgr;      -> the name stored under X / XX in $greekcodes
#   <up> ... </up>      -> [ ... ]     (superscript)
#   <down> ... </down>  -> [[ ... ]]   (subscript)
# The first character starts the output token ($Wrt); everything after it is
# appended ($App). The tag literals have to be spelled out: an empty ''
# alternative matches at every position without consuming any input.
entry_coded: ~ { $In:entry $Out $end=$Fip-1 }
/./ {$Wrt}
( /[^<\&\n]+/ {$App}
# decode FB greek markup to english
| /\&([A-Za-z][A-Za-z]?)gr;/ {$App:[s:$greekcodes.$1]}
# decode FB super/subscript markup to brackets
| '<up>' {$App:[s:"["]}
| '</up>' {$App:[s:"]"]}
| '<down>' {$App:[s:"[["]}
| '</down>' {$App:[s:"]]"]}
# any other '<', '&' or newline is copied through unchanged
| /[<\&\n]/ {$App}
)*
~
# keynew: a field key - word characters captured in $1, optionally followed by '|'
keynew: ~ /([0-9A-Za-z_]+)[\|]?/ ~ ##? take out '?' after |
# fields: splits a decoded entry into one token per line, coded by field key.
# Each line starts with one of:
#   key   - starts a new field; the key is passed through $recode first
#   '|'   - continuation of the current field (no new token is written)
#   '{'   - opens a sub-record: writes an 'sor' token and pushes $code onto $reclist
#   '}'   - closes a sub-record: writes an 'eor' token and pops $reclist
#   other - written under the 'skip' code
# and the rest of the line (with its newline) is appended to the current token.
# NOTE(review): $reclev is incremented/decremented without any bounds check
# against the size of $reclist - confirm the data never nests deeper than the
# preallocated list or closes more records than it opens.
fields: ~ { $In:entry_coded $Out $Skip:1 }
(
( keynew { ## field key
$code=$1 if:$recode.$code!="" $code=$recode.$code
$Wrt:[$code s:""] # s:"$code|"
}
| '|' ## continuation of field
# {$App:[s:"\n"] }
# {$Wrt:[$code s:""]} # s:"|"
| '{' { ## start (sub)record # /[^\n]*/
$Wrt:[sor]
$lrec=$code
$reclev=$reclev+1
$reclist[$reclev]=$lrec
}
| '}' { ## end (sub)record # /[^\n]*/
$Wrt:[eor]
$reclev=$reclev-1
$lrec=$reclist[$reclev]
}
# | '#' /[^\n]*/ ## comments
| /[^\n]*/ {$Wrt:[skip]} ## shouldn't be any other line starts
)
ln {$App } # append rest, including newline - icarus needs newline!
)+
~
#---------- indexing methods
# Shared building blocks for the field productions; each $Uniq adds the
# matched text to the index of the production that uses the rule.
# i_word: index a single word
i_word: ~ /[0-9A-Za-z_]+/ {$Uniq} ~ #{$Uniq:$Itc}
# i_words: index one word, OR consume one run of non-word characters without
# indexing it (a single token per call - repeat it to cover a whole line)
i_words: ~ /[0-9A-Za-z_]+/ {$Uniq} | /[^0-9A-Za-z_\n]+/ ~ #{$Uniq:$Itc}
# i_phrase: index the rest of the line (at least one character) as one term
i_phrase: ~ /[^\n]+/ {$Uniq} ~ ##{$Uniq:$Itc}
# i_blackspace: index one run of characters containing no space, tab or newline
i_blackspace: ~ /[^ \t\n]+/ {$Uniq} ~ # {$Uniq:$Itc}
## controlled-vocab: index both as words and as phrase (' ' -> '_')
## i_cvterm handles ONE line: every word is indexed on its own while the whole
## text is accumulated in $cv; at the end $cv is trimmed, its spaces are turned
## into underscores, and the result is indexed as a single term if non-empty.
## The x{...} blocks appear to run commands without consuming input (TODO confirm).
i_cvterm: ~
## this one always gives -- no more input error !???
x{$cv=''}
( /[a-z0-9A-Z_-]+/ {$Uniq $StrApp:[$cv s:$Ct]} ##$Uniq:$Itc
| /[^\na-z0-9A-Z_-]+/ {$StrApp:[$cv s:$Ct]}
)*
x{ $cv= $StrTrans:[s:$Trim:$cv from:' ' to:'_']
if:$cv!='' $Uniq:[ s:$cv] } ##$Uniq:[$Itc
~
## i_cvterms: multi-line form of the controlled-vocabulary indexer. Words are
## indexed individually and accumulated in $cv; each newline flushes $cv
## (trimmed, ' ' -> '_') as one underscore-joined term and resets it.
## Text after the last newline is accumulated but not flushed by this rule.
i_cvterms: ~
x{$cv=''}
( /[a-z0-9A-Z_-]+/ {$Uniq $StrApp:[$cv s:$Ct]} ##$Uniq:$Itc
| /[^\na-z0-9A-Z_-]+/ {$StrApp:[$cv s:$Ct]}
| /[\n]/ { $cv= $StrTrans:[s:$Trim:$cv from:' ' to:'_']
if:$cv!='' $Uniq:[ s:$cv] $cv='' } #$Uniq:[$Itc
)*
~
## i_date: index a "DD Mon YYYY" style date as the number YYYYMMDD.
##   $1 = day, $2 = first three letters of the month (looked up in $months),
##   $3 = year.
## Two-digit years are expanded with the POSIX %y pivot, 00-68 -> 20xx and
## 69-99 -> 19xx, so that dates written after 1999 are not indexed as 19xx.
## Years of 100 or more are used as written.
i_date: ~ /[^0-9]*([0-9]+) *([A-Z][a-z][a-z])[a-z]* *([0-9]+)/ {
$yr=$3
if:$yr<69 { $yr= 2000 + $yr }
elif:$yr<100 { $yr= 1900 + $yr }
$Uniq:[ s:($yr*10000 + $months.$2*100 + $1)] ##$Uniq:[$Itc
}
~
#---------- field indexing
# dbname: a databank name (letters/underscore, captured in $1) followed by '/' or ':'
dbname: ~ /([A-Za-z_]+)[\/:]/ ~
# dbaccession: an accession (letters, digits, '_', '.', '-'), captured in $1
dbaccession: ~ /([0-9A-Za-z_\.-]+)/ ~
# preacc: first pass over accession-type fields.
# Fails for any field whose $fldinfo entry is not flagged accession:1, then
# runs the recoder production named by that field's dbrecoder attribute
# (DBLrecoder when none is given). $Prod is assumed to invoke the production
# whose name is held in the variable - TODO confirm.
preacc: ~ {$In:[fields] $Out pre{$Skip:1} }
x{ if:$Itc == "" || $fldinfo.($Itc).accession!=1 $Fail}
x{ $dbrecoder=$fldinfo.($Itc).dbrecoder
if:$dbrecoder=="" $dbrecoder= 'DBLrecoder'
}
( x{$Prod:$dbrecoder} )*
~
# accessions: indexes the 'DBL|db:acc' lines produced by the recoders.
# The databank name is normalised through $dbrecode, then the accession is
# written twice: as "acc" under the databank's own code and as "db:acc" under
# DBL. Lines that do not start with 'DBL|' are consumed and ignored.
accessions: ~ {$In:[preacc] $Out }
( 'DBL|'
dbname { $db=$1 if:$dbrecode.$db!="" $db= $dbrecode.$db }
## dbaccession ==? too restrictive a patt? may have to deal w/ symbols as ids
/[^\n]+/ { $acc=$Ct $Uniq:[$db s:"$acc"] $Uniq:[DBL s:"$db:$acc"] }
ln?
| ln )+
~
# DBLrecoder: default recoder for accession fields. Rewrites the field text as
# one line per item, under the input field's own code ($Itc):
#   "DBL|db:acc"  for each databank-name + accession pair
#   "WORD|w"      for each remaining word
# Any other single character is dropped.
DBLrecoder: ~ ## recode lines for easier parsing by several productions
(
( dbname { $db=$1 } dbaccession { $acc=$1
$Uniq:[$Itc s:{"DBL|%s:%s\n" $db $acc}]
} )
| word { $Uniq:[$Itc s:{"WORD|%s\n" $Ct}] }
| /./
)*
~
## PDOM string: |PS00905 == GTP1/OBG family signature. |
## PDOM string: |Leucine-rich repeat protein.
## PFAM:PF00560
## Transmembrane domain protein.|
## PDOMrecoder: like DBLrecoder, plus one extra case - a bare token starting
## with "PS" is emitted as a PROSITE accession ("DBL|PROSITE:PSnnnnn").
## '==' separators are consumed silently.
## NOTE(review): the PS pattern accepts any run of word characters after "PS",
## so an ordinary word beginning with "PS" would also be recorded as a PROSITE
## accession - confirm whether the data can contain such words.
PDOMrecoder: ~ ## recode a line for easier parsing by several productions
(
(dbname {$db= $1} dbaccession { $acc=$1
$Uniq:[$Itc s:{"DBL|%s:%s\n" $db $acc}]
} )
| /(PS[0-9A-Za-z_\.]+)/ { $acc=$1
$Uniq:[$Itc s:{"DBL|PROSITE:%s\n" $acc}]
}
| '==' #? do phrase? should these words go into functions?
| word { $Uniq:[$Itc s:{"WORD|%s\n" $Ct}] }
| /./
)*
~
# HG|species == Homo sapiens; gene == ABL1; GDB:119640; OMIM:189980
# HG|species == Phoneutria nigriventer; gene == 'neurotoxin TX2-6'; SWP:P29425
# HG|species == Drosophila melangaster; gene == fz (frizzled)
# ?? drop "'" on genes ?
# HGrecoder: rewrites a homologue field as one line per item under $Itc:
#   "ORG|name"    from 'species == name'
#   "SYM|symbol"  from 'gene == symbol' (a leading quote is not captured)
#   "DBL|db:acc"  from each databank-name + accession pair
#   "WORD|w"      for each remaining word
# ';' and quote characters, and any other single character, are dropped.
HGrecoder: ~
( 'species == ' /[^\;\=\n\,]+/ { $Uniq:[$Itc s:{"ORG|%s\n" $Ct}] }
| 'gene == ' /'?([^\;\=\n\,\']+)/ { $Uniq:[$Itc s:{"SYM|%s\n" $1}] }
| /[;']/
| (dbname {$db= $1} dbaccession { $acc=$1
$Uniq:[$Itc s:{"DBL|%s:%s\n" $db $acc}] } )
| word { $Uniq:[$Itc s:{"WORD|%s\n" $Ct}] }
| /./
)*
~
# preHG: applies HGrecoder to the HG field tokens; its output feeds the HG production
preHG: ~ {$In:[fields c:HG] $Out } HGrecoder ~
##? add 'homolog' to index stream? - many data sets use this term in name,other flds
## but don't have HG fld, e.g. NAME= 'xxx (Mouse) homolog...'
## ? or pull data from other fields (in extraction methods) and put into HG fld?
## Query: [libs={FBgn HUgn MGgn ATgn CEgn SGgn ZFgn}-all:homolog*] , No. matches= 2312
## HG: indexes the recoded homologue lines from preHG, dispatching on the line
## prefix: 'DBL|', 'ORG|' and 'SYM|' lines are written with the matching code
## name given to $Uniq, 'WORD|' lines with a plain $Uniq. Other characters are skipped.
HG: ~ {$In:[preHG] $Out pre{$Skip:0}}
( 'DBL|' /[^\n]*/ {$Uniq:DBL}
| 'ORG|' /[^\n]*/ {$Uniq:ORG}
| 'SYM|' /[^\n]*/ {$Uniq:SYM}
| 'WORD|' /[^\n]*/ {$Uniq}
| /./
)+
~
# FNCrecoder: default recoder for function fields - wraps the matched text as
# a single "TERM|text" line under the input field's code ($Itc).
FNCrecoder: ~
# ( /[^\n]+/ { $Uniq:[$Itc s:{"TERM|%s\n" $Ct}] } | /./ )*
/.+/ {$Uniq:[$Itc s:{"TERM|%s\n" $Ct}] }
~
# prefunc: first pass over function-type fields.
# Fails for any field whose $fldinfo entry is not flagged function:1, then
# runs the recoder production named by that field's funcrecoder attribute
# (FNCrecoder when none is given). $Prod is assumed to invoke the production
# whose name is held in the variable - TODO confirm.
prefunc: ~ {$In:[fields] $Out pre{$Skip:0} }
x{ if:$Itc == "" || $fldinfo.($Itc).function!=1 $Fail}
x{ $funcrecoder=$fldinfo.($Itc).funcrecoder
if:$funcrecoder=="" $funcrecoder= 'FNCrecoder'
}
( x{$Prod:$funcrecoder} )*
~
# functions: indexes the recoded function lines from prefunc.
#   'ECNUM|n' - the number is written twice under $Itc: as "n" and as "ECn"
#   'TERM|t'  - handed to i_cvterm (words plus underscore-joined phrase)
# Any other character is skipped.
functions: ~ {$In:[prefunc] $Out }
( 'ECNUM|' /[^\n]*/ { $Uniq:[$Itc s:"$Ct"] $Uniq:[$Itc s:"EC$Ct"] }
| 'TERM|' i_cvterm
#?? | keynew i_cvterm
| /./
)+
~
## phenotypes: index every word of the fields flagged phenotype:1 in $fldinfo.
## Fails for fields without that flag.
## i_words consumes a single token (one word or one run of non-word
## characters), so it is repeated up to the end of each line - the same shape
## allwords uses. A single i_words before nl would only accept lines made of
## exactly one token.
phenotypes: ~ {$In:[fields] $Out pre{$Skip:1}} # c:{PHP PHI}
x{if:$Itc == "" || $fldinfo.($Itc).phenotype!=1 $Fail}
(i_words+ nl)+
~
# ecnum: an EC number - three or more dot-separated parts made of digits or '-'.
# The dots are written '\\.' ; presumably the doubled backslash is needed for
# the escape to survive Icarus string processing (TODO confirm).
ecnum: ~ /[0-9-]+\\.[0-9-]+\\.[0-9\\.-]+/ ~
# Enzyme - CV field + EC num field
# 'ENZ|arylalkylamine N-acetyltransferase == EC 2.3.1.87'
# 'ENZ|glycerol-3-phosphate dehydrogenase (NAD+) == EC 1.1.1.8'
# ENZrecoder: rewrites an enzyme field as one line per item under $Itc:
#   "ECNUM|n"  for an EC number (an optional "EC", "EC:" or "EC-" prefix is consumed)
#   "TERM|t"   for each run of text containing no '=' or newline, trimmed
# '==' separators and any other single character are dropped.
ENZrecoder: ~
( '=='
| /EC[ :-]*/? ecnum { $Uniq:[$Itc s:{"ECNUM|%s\n" $Ct}] }
| /[^=\n]+/ { $Uniq:[$Itc s:{"TERM|%s\n" $Trim:$Ct}] }
| /./
)*
~
# preENZ: applies ENZrecoder to the ENZ field tokens; its output feeds the ENZ production
preENZ: ~ {$In:[fields c:ENZ] $Out pre{$Skip:1}} ENZrecoder ~
# ENZ: indexes the recoded enzyme lines from preENZ, dispatching on the line
# prefix: 'ECNUM|' lines go to $Uniq:ECNUM and 'TERM|' lines to $Uniq:TERM.
# Any other character is skipped.
ENZ: ~ {$In:[preENZ] $Out } ## ? do we need this one? handle all in functions?
( 'ECNUM|' /[^\n]*/ {$Uniq:ECNUM}
| 'TERM|' /[^\n]*/ {$Uniq:TERM}
| /./
)+
~
# idpatt: a record identifier - exactly four letters followed by one or more digits
idpatt: ~ /[A-Za-z][A-Za-z][A-Za-z][A-Za-z][0-9]+/ ~
# ID: indexes identifiers found in ID fields, but only while the current
# record ($lrec) is the main record; the identifier is also kept in $entryName.
# Characters that are not part of an identifier are skipped one at a time.
ID: ~ { $In:[fields c:ID] $Out }
( idpatt {if:$lrec == $mainrec {$Uniq $entryName=$Ct}} | /./ )+
~
# allids: indexes every identifier found in ID and ID2 fields, at any record level.
# NOTE(review): ID2 has no entry in the $fldinfo table of this file - confirm
# that the data actually carries such a field.
allids: ~ { $In:[fields c:{ID ID2}] $Out }
( idpatt {$Uniq } | /./ )+
~
# symbolsplit: splits symbol-type fields (symbol:1 in $fldinfo) into one token
# per symbol. '::' and ': ' act as separators: they reset $inapp so the next
# piece of text starts a new token ($Wrt); text seen while $inapp is 1 is
# appended to the current token ($App). A ':' followed by any character other
# than space, ':' or newline is treated as part of the symbol.
symbolsplit: ~ {$In:[fields] $Out pre{$Skip:0 $inapp=0}}
x{if:$Itc == "" || $fldinfo.($Itc).symbol!=1 $Fail}
( /[^:\n]+/ { if:$inapp==0 {$Wrt:$Itc $inapp=1} else $App:$Itc}
| '::' { $inapp=0 }
| ': ' { $inapp=0 }
| /:[^ :\n]/ { if:$inapp==0 {$Wrt:$Itc $inapp=1} else $App:$Itc }
| /./ {$App:$Itc}
)+
~
# SYM_CS: indexes the split SYM tokens of the main record only (fails inside
# sub-records). An optional leading "Species\" prefix is matched and its
# species part ($1) indexed, then the remainder of the line is indexed as a phrase.
# NOTE(review): the prefix pattern is optional but $Uniq:[s:$1] runs either
# way - confirm what $1 holds when the prefix is absent.
SYM_CS: ~ {$In:[symbolsplit c:SYM] $Out } # _CS == case-sensitive field, requires srs c changes
x{if:$lrec!=$mainrec $Fail} #? do in pre{}
## Species\Symbol is funky drosophila symbol syntax, and \\\\ is icarus match for '\'
( /([^ \\]+)\\\\/? { $Uniq:[s:$1]} i_phrase nl? )+
~
# synonyms_CS: same indexing as SYM_CS, but over both SYM and SYN tokens of
# the main record (fails inside sub-records).
synonyms_CS: ~ {$In:[symbolsplit c:{SYM SYN}] $Out }
x{if:$lrec!=$mainrec $Fail} #? do in pre{}
( /([^ \\]+)\\\\/? { $Uniq:[s:$1]} i_phrase nl? )+
~
# SYM / synonyms: re-index the terms produced by SYM_CS / synonyms_CS.
# Judging by the "_CS == case-sensitive" note on SYM_CS these are presumably
# the case-insensitive counterparts - TODO confirm.
SYM: ~ {$In:SYM_CS $Out } (/.+/ {$Uniq})+ ~
synonyms: ~ {$In:synonyms_CS $Out } (/.+/ {$Uniq})+ ~
# NAM: indexes the full-name field of the main record (fails inside
# sub-records) through i_cvterms, i.e. as single words and as one
# underscore-joined term per line.
NAM: ~ {$In:[fields c:NAM] $Out pre{$Skip:0}}
x{if:$lrec != $mainrec $Fail} #? do in pre{}
i_cvterms
# ( i_cvterm /./ )*
# ((i_cvterm /[\n]/ ) | /./)*
# i_cvterm+
~
# DT: indexes dates in DT fields via i_date; characters that do not start a date are skipped
DT: ~ {$In:[fields c:DT] $Out } ( i_date | /./ )+ ~
# ORG: indexes each line of ORG fields as one phrase via i_phrase
ORG: ~ {$In:[fields c:ORG] $Out } ( i_phrase | /./ )+ ~
## these should be numeric, but vary so widely among orgs will be a big task
## is only first ~word valid?
## MAP / CHR: index only the first whitespace-free token of the field
## (i_blackspace is applied once, not repeated).
MAP: ~ {$In:[fields c:MAP] $Out } i_blackspace ~
CHR: ~ {$In:[fields c:CHR] $Out } i_blackspace ~
## CLA should be a cvocab - Predicted, ORF, determined
## default = determined, Experimentally determined or Confirmed
## preCLA: reads CLA and ID fields, writes the first whitespace-free token of
## each, and remembers the CLA value in $klass ($klass starts as 'determined',
## set in the pre block of 'entry').
## CLA: indexes the current value of $klass for each token coming from preCLA.
## Including ID in preCLA presumably guarantees a token - and therefore a
## default class - for entries without a CLA field (TODO confirm).
preCLA: ~ {$In:[fields c:{CLA ID}] $Out }
/[^ \t\n]+/ {$Wrt if:$Itc==CLA $klass=$Ct }
~
CLA: ~ {$In:preCLA $Out} /.+/ {$Uniq:[s:$klass]} ~
# allwords: free-text index over every field not flagged noindex in $fldinfo.
# Each line is consumed as a sequence of i_words tokens followed by nl.
# Unlike the other field-filtering productions, the guard here does not test
# for an empty $Itc (see the commented-out variant below).
allwords: ~ {$In:[fields] $Out }
x{ if:$fldinfo.($Itc).noindex $Fail}
(i_words+ nl)+
~
## x{if:$Itc == "" || $fldinfo.($Itc).noindex $Fail}
## i_words+
## ! THIS IS BAD - puts word in many fld indices: {$Uniq:$Itc}
##((/[0-9A-Za-z_]+/ {$Uniq:$Itc} | /[^0-9A-Za-z_\n]+/) nl)+
##((/[0-9A-Za-z_]+/ {$Uniq} | /[^0-9A-Za-z_\n]+/) nl)+
#---------- html output
# h_sor / h_eor: html-view replacements for the start-of-record and
# end-of-record tokens written by 'fields'.
# NOTE(review): both replacement strings currently contain nothing but
# whitespace. Given that h_field emits "| $vl | " cells inside sub-records,
# these presumably once held opening/closing table markup that has been
# stripped from this copy of the file - restore from the original source.
h_sor: ~ { $In:[fields c:sor t:html] $Out }
x{$Rep:"
"}
/.*/ ~
h_eor: ~ { $In:[fields c:eor t:html] $Out } x{$Rep:"
\n"} /.*/ ~
# h_field: html view of a field token.
#   - fields flagged noview in $fldinfo are replaced by '' (label, body and trailer)
#   - inside a sub-record ($reclev>0) the visible label $vl is emitted in a
#     "| $vl | " prefix and a " |" trailer is added after the field text
#   - otherwise only the label is emitted before the text
# NOTE(review): the '|' separators look like the remains of html table markup
# that was lost from this copy of the file - verify against the original.
# NOTE(review): 'entry' starts every record with $reclev=1, so the $reclev>0
# test may always be true here - confirm the intended threshold.
h_field: ~ { $In:[fields t:html] $Out }
x{ $vl= $fldinfo.($Itc).vl
if:$fldinfo.($Itc).noview { $Rep:'' }
elif:$reclev>0 {$Rep:"| $vl | " }
else {$Rep:"$vl"}
}
/.+/ {if:$fldinfo.($Itc).noview $Rep:''}
x{ if:$fldinfo.($Itc).noview { $Rep:'' }
elif:$reclev>0 { $Rep:" |
\n"}
}
~
# h_ENZ: html view of EC numbers - replaces each ECNUM token with the result
# of $ParStr applied to 'enzymeR', passing the number twice. Presumably
# enzymeR is a link template taking (target, label) - TODO confirm.
h_ENZ: ~ {$In:[ENZ c:ECNUM t:html] $Out }
/.*/ {$Rep:{$ParStr:enzymeR $Ct $Ct}}
~
# h_accessions: html view of databank links. For each "db:acc" token the
# databank name is looked up in $dblinks; when an entry is found the accession
# is replaced by $ParStr of that entry, passing the accession twice.
# Tokens for unknown databanks are left unchanged.
# NOTE(review): the lookup indexes the $dblinks value with [1] although the
# table in this file stores plain names, not lists - confirm this resolves as intended.
h_accessions: ~ {$In:[accessions c:DBL t:html] $Out }
(
dbname { $db=$1 }
/[^\n]+/ { $acc=$Ct $dbr=$dblinks.($db)[1]
if:$dbr!="" $Rep:{$ParStr:$dbr $acc $acc}
}
| /./ )+
~
# alllinks: indexes the whole text of each DBL field token in the html task.
# Marked by the author as not working (see note below).
alllinks: ~ {$In:[fields c:DBL t:html] $Out} /.*/ {$Uniq} ~
# ^^ not working, need? to call html parser for each [fields c:code] ?
} # end $rules
## ? can we use a 'global' var like this in rules, pre initialized properly?
## Placeholder values: the 'entry' production assigns $mainrec in its init
## block and $recclass in its pre block.
$mainrec= 'MAINREC'
$recclass='CLASS'
## $reclist holds the record code for each nesting level and is written by
## 'entry' and 'fields' as $reclist[$reclev]; nine slots are preallocated here.
$reclist={lev1 lev2 lev3 lev4 lev5 lev6 lev7 lev8 lev9} ## need to init array size !
## $months: three-letter month abbreviation => month number (read by i_date).
$months={
  Jan:1  Feb:2  Mar:3
  Apr:4  May:5  Jun:6
  Jul:7  Aug:8  Sep:9
  Oct:10 Nov:11 Dec:12
}
## $greekcodes: letter code found inside an "&..gr;" entity => spelled-out
## name appended by entry_coded. One lower/upper-case pair per line.
$greekcodes={
  'a':'alpha'     'A':'Alpha'
  'b':'beta'      'B':'Beta'
  'g':'gamma'     'G':'Gamma'
  'd':'delta'     'D':'Delta'
  'e':'epsilon'   'E':'Epsilon'
  'z':'zeta'      'Z':'Zeta'
  'ee':'eta'      'EE':'Eta'
  'th':'theta'    'TH':'Theta'
  'i':'iota'      'I':'Iota'
  'k':'kappa'     'K':'Kappa'
  'l':'lambda'    'L':'Lambda'
  'm':'mu'        'M':'Mu'
  'n':'nu'        'N':'Nu'
  'x':'xi'        'X':'Xi'
  'o':'omicron'   'O':'Omicron'
  'p':'pi'        'P':'Pi'
  'r':'rho'       'R':'Rho'
  's':'sigma'     'S':'Sigma'
  't':'tau'       'T':'Tau'
  'u':'upsilon'   'U':'Upsilon'
  'ph':'phi'      'PH':'Phi'
  'kh':'chi'      'KH':'Chi'
  'ps':'psi'      'PS':'Psi'
  'oh':'omega'    'OH':'Omega'
  'sf':'s'
}
## $dblinks: canonical databank name => name passed to $ParStr by
## h_accessions when it rewrites an accession for the html view.
$dblinks={
  EMBL:      emblR
  PROSITE:   prositeR
  SWISSPROT: swissR
  UNIGENE:   unigeneR
  LOCUSLINK: locuslinkR
  MEDLINE:   medlineR
  ECNUM:     enzymeR
  FLYBASE:   flygeneR
  GDB:       gdbR
  PFAM:      pfamR
}
# Testing.......................
# Runs only when $TestMode is set: builds a job over one acode file using
# $rules, then for every entry prints the tokens of the selected productions.
# Switch data sets by moving the '#' between the fileName lines; enable or
# disable individual productions by (un)commenting their $JobTokens lines.
# NOTE(review): the active fileName is an absolute, site-specific path.
if:$TestMode {
$job = $JobNew:[prod:$rules skip:" "
fileName:'/bio/data/meow/FBgn.acode']
# fileName:'SRSROOT:data/meow/FBgn.acode']
# fileName:'SRSROOT:data/FBgn.acode']
# fileName:'SRSROOT:data/ATgn.acode']
# fileName:'SRSROOT:data/MGgn.acode']
# fileName:'SRSROOT:data/SGgn.acode']
# fileName:'SRSROOT:data/ZFgn.acode']
# fileName:'SRSROOT:data/CEgn.acode']
# fileName:'SRSROOT:data/HUgn.acode']
while:$JobHasInput:$job {
$JobTokens:[$job name:fields print:1]
## $JobTokens:[$job task:html name:h_fields print:1] # no good
# $JobTokens:[$job name:ID print:1]
# $JobTokens:[$job name:SYM print:1]
# $JobTokens:[$job name:SYN print:1]
# $JobTokens:[$job name:NAM print:1]
# $JobTokens:[$job name:ORG print:1]
# $JobTokens:[$job name:CHR print:1]
# $JobTokens:[$job name:MAP print:1]
$JobTokens:[$job name:ENZ print:1]
$JobTokens:[$job name:HG print:1]
$JobTokens:[$job name:accessions print:1]
$JobTokens:[$job name:functions print:1]
$JobTokens:[$job name:phenotypes print:1]
# $JobTokens:[$job name:DID print:1]
# $JobTokens:[$job name:DBL print:1]
# $JobTokens:[$job name:PDOM print:1]
# $JobTokens:[$job name:DT print:1]
# $JobTokens:[$job name:allwords print:1]
$JobNext:$job
}
}