# dbs/lucegene/fbgn.properties
# Derived from LuceneAcodeIndexer.properties.
# Version 3, Nov 2004 -- uses a 'flat' view of the record (all fields indexed
#   at top), except for certain main record fields (ID, CLAss);
#   uses the revised Biodata filters and the fbacode.java class parsers.
## Open question: rename these special fields: .cs, .cv -> _cs, _cv ?
##   And what of the others: .start, .stop, .strand, .chr

LIB_NAME=fbgn
title = FlyBase Genes

# Absolute paths can be used here; otherwise ARGOS_SERVICE_ROOT is assumed to be set.
DATA_ROOT=web/data/fbobs/
## The index path is ignored here; the caller shell sets it to base-path/libname.
INDEX_PATH=indices/lucene/fbgn/
MIME_TYPE=text/acode

## --------- search/report keys -------------------

### Common keys; not needed here.
# docurl=/cgi-bin/fbidq.html?
# batchurl=/cgi-bin/fbidq.html?
# batchproc=cgi-bin/fbidq.html
# batch.forward=

# Fields that the user can sort on.
# genes.sort=
sortFields = GSYM

# Index fields [ , ABA, ACM, AFC, AFS, ALC, AM, AMIS, AMSO, ANRB, APC,
# APR, APRB, ARB, ARG2, ARS, ASQ, ASYM, BMD, BMDD, BMDP, BMND, BMPD, CEL,
# CLA, CLOC, CNS, CYA, CYC, DBA, DHO, DIS, DT, ENZ, FNC, FSQ, GIA, GIA2,
# GIC, GIC2, GII, GLC, GLOC, GPD, GSNA, GSYM, ID, ID2, IFL, KLOC, LOI,
# MAIN_ID, MD, MU, NAF, NAM, OTH, PAC, PDOM, PEVD, PEVN, PEVR, PHC, PHI,
# PHM, PHP, PPC, PPS, PRG, RDID, REF, REFM, REV, SK, SKC, SYN, TE, TRN,
# UAB, WT, WTI, docclass, docid, modified, title, uid, url]

##displayFields = GSYM, NAM, CLOC, ALESR, REF, DBA, DT, RESZ
outfields= GSYM, NAM, CLOC, ALESR, REF, DBA, DT, iscore

## acode RETE field >> title
storefields=title url

searchfield = all

#### searchallfield needs to be flipped together with queryboost:
###   queryboost=1; searchallfield = all       -- for best relevance
###   queryboost=0; searchallfield = contents  -- for best speed & minimum
###       memory use on complex queries, or when relevance is not useful
# searchallfield = contents
searchallfield = all

# Turn query boosting (relevance by field) on or off.
queryboost = 1
## See below for the active boost.* settings.
# boost.GSYM=10
# boost.NAM=5

resultxsl = conf/fbgn_result.xsl
resultspage = resultxsl.jsp
header.native=# FlyBase Gene Acode Record
footer.native=\n

# TEST: inter-library linking; predefined valid links.
# Note: for these to work, the fieldtype must be a STORED one for this index
# (Text or UnIndexed).
# NOTE(review): ID2 is used as a link source below, but no fieldtype.ID2 is set
# in this section, so it falls back to the default fieldtype unless it is set
# elsewhere -- confirm the "docid,ID2>fban-ID" link actually resolves.
#
# The last active entry deliberately has NO trailing "\": a continuation line is
# never treated as a comment by java.util.Properties, so a dangling "\" could
# pull the following "##" comment text into the linkto value.
linkto=\
docid>go-GN_FB\n\
docid>anatomy-GN\n\
docid,ID2>fban-ID\n\
docid>fbal-ID\n\
docid>fbrf-DBX\n\
docid>seqs-db_xref\n\
docid>bindxml-DBX\n

## These may be useful too; their fields need to be STORED in the index.
## When enabling one, put the "\" back on the entry that precedes it.
# ASYM.cs>fbal-ASYM.cs\n\
# RDID>fbrf-ID\n\
# ABA>fbab-ID\n\

# Some of these are not stored in the Lucene doc; they need title=RETE parsed!
fieldlabel.docid=ID
fieldlabel.docclass=Class
fieldlabel.GSYM=Symbol
fieldlabel.NAM=Name
fieldlabel.CLOC=Map
fieldlabel.DT=Date
fieldlabel.REF=Refs
fieldlabel.DBA=DNA acc.
fieldlabel.SK=Stocks
fieldlabel.ALESR=Alleles
fieldlabel.RESZ=Rept. size

# Done in the base properties:
# include.fieldlabel=acode-labels.properties

## Share field configs with an include file.
# formatter.CLOC=fbacodefmt$CLOC_Format
# formatter.style.CLOC=url=/cgi-bin/gbrowse_fb/dmel?cytomap=;format=html;;

## For search only?
searchskip.MAIN_CLA=1
searchskip.docclass=1
#searchskip.CLA=1
searchskip.count=1
searchskip.field=1
searchskip.title=1
searchskip.uid=1
searchskip.url=1
searchskip.modified=1
searchskip.lastModified=1
# BMPD has an odd CV list; partially duplicated in (original note ends here).
searchskip.BMPD=1

## ---------- indexing values --------------
# analyzer= and tokenizer;tokenfilter values

# Locate the data with regex file and folder patterns.
regex_folder=
# Can we index multiple files with the same docid? Best done as separate indices?
#regex_file=^(FBgn.acode|FBgn.(expat|ourl|rci|summary|xdba).acode)$
# NOTE(review): the "." in the pattern below is unescaped, so it matches any
# single character, not only a literal dot.
regex_file=^(FBgn.acode)$
regex_skipfile=
regex_skipfolder=(.*)

INDEX_CLASS=org.eugenes.index.LuceneAcodeIndexer
INDEX_APPEND=false
INDEX_TAGS=false
INDEX_LEVEL=0
INDEX_BLANKS=false
## Use fieldname xpath; full top.middle.last field name?
INDEX_XPATH=false

## merge=10 is the default; 4 == less memory usage; 2 is the minimum.
merge_factor=4
max_field_length=1000000
MAX_FIELDS=30000
## MAX_FIELDS 30000 is enough, except for the bizarre lacZ record FBgn0014447:
# . MAX_FIELDS overflow=109889 i=28820 id=FBgn0014447 <<<
# . MAX_FIELDS overflow=27913 i=44356 id=FBgn0003996 << next biggest record

## field indexing parameters

## The sumfields list needs to match the field.xxx common summary fields.
sumfields=docid,docclass,title,summary
# Special summary fields -- replace with fieldalias.TAG=newtag
#field.docid=ID
#field.docclass=CLA
field.title=RETE
field.summary=SUMX

## NOTE: using fieldalias.ID=docid makes ID go away in the index;
## using only field.docid=ID keeps both the ID and docid fields.
fieldalias.RETE=title
fieldalias.SUMX=summary

## Default field type for fields not listed below.
## UnStored = index but don't store the text.
## (An older note here read "default - Text"; the active default is UnStored.)
fieldtype=UnStored
fieldtype.RETE=UnIndexed
fieldtype.ID=Text
fieldtype.GSYM=Text
fieldtype.SYM=Text
fieldtype.title=UnIndexed
fieldtype.summary=UnIndexed
## Add RETE parts as stored values.
fieldtype.NAM=Text
fieldtype.RSQ=Text
fieldtype.RPA=Text
# For links.
fieldtype.ASYM=Text
fieldtype.RDID=Text
fieldtype.ABA=Text
fieldtype.DBX=Text
## These must always be stored: Text or UnIndexed.
fieldtype.docid=Text
fieldtype.docclass=Text
fieldtype.url=UnIndexed
fieldtype.modified=Keyword

analyzer=org.eugenes.index.BiodataAnalyzer2
# Defaults for all fields.
tokenizer=fbacode$LowerWordTokenizer
#tokenfilter=fbacode$DebugFilter
tokenfilter.EOR=fbacode$DebugEndOfRecordFilter
# Add default recoders? -- but skip for fields without greeks?
fieldrecoder=fbacode$Greek_Recoder

# To create a "contents" field of all text -- may want to use this!
#
# Timing notes comparing the two searchallfield settings:
## >> using searchallfield = all
# Query: "+(+(+(+(all:head all:kinase) -NAM:kinase -GSYM:cg*) -NAM:fork*)
#   -NAM:head) -NAM:lethal"
# No. matches = 1032 of 44664 documents, in 2.555 sec.
# Query: "+(+(+(+(all:head all:kinase) -NAM:kinase -GSYM:cg*) -all:fork*)
#   -NAM:head) -all:lethal"
# No. matches = 415 of 44664 documents, in 2.336 sec.
## >> using searchallfield = contents
# Query: "+(+(+(+(all:head all:kinase) -NAM:kinase -GSYM:cg*) -NAM:fork*)
#   -NAM:head) -NAM:lethal"
# No. matches = 1032 of 44664 documents, in 0.734 sec.  <<< faster
# Query: "+(+(+(+(all:head all:kinase) -NAM:kinase -GSYM:cg*) -all:fork*)
#   -NAM:head) -all:lethal"
# No. matches = 415 of 44664 documents, in 2.051 sec.
## BUT with field boost, searching all fields gives more relevance:
# LGQueryParser.multiparse =(+(ABA:dpp ACM:dpp AFC:dpp AFS:dpp ALC:dpp^3.0
# ALC.cv:dpp AM:dpp AMIS:dpp AMSO:dpp ANRB:dpp APC:dpp APR:dpp APRB:dpp
# ARB:dpp ARG2:dpp ARS:dpp ASQ:dpp ASYM:dpp BMD:dpp BMDD:dpp BMDP:dpp
# BMND:dpp BMPD:dpp CEL:dpp CEL.cv:dpp CLA:dpp CLOC:dpp CNS:dpp CYA:dpp
# CYC:dpp DBA:dpp DBX:dpp DHO:dpp DIS:dpp DT:dpp ENZ:dpp ENZ.cv:dpp
# FNC:dpp FNC.cv:dpp FSQ:dpp GIA:dpp GIA.cv:dpp GIA2:dpp GIA2.cv:dpp
# GIC:dpp GIC.cv:dpp GIC2:dpp GIC2.cv:dpp GII:dpp GLC:dpp GLOC:dpp GPD:dpp
# GSNA:dpp GSYM:dpp^11.0 GSYM.cs:dpp ID:dpp ID2:dpp IFL:dpp LOI:dpp MD:dpp
# MU:dpp^2.0 NAF:dpp NAM:dpp^6.0 OTH:dpp PAC:dpp PDOM:dpp PEVD:dpp
# PEVN:dpp PEVR:dpp PHC:dpp PHC.cv:dpp PHI:dpp PHM:dpp PHM.cv:dpp PHP:dpp
# PPC:dpp PPS:dpp PRG:dpp RDID:dpp REF:dpp REFM:dpp REV:dpp SK:dpp SKC:dpp
# SYN:dpp TE:dpp TRN:dpp UAB:dpp WT:dpp WTI:dpp contents:dpp docid:dpp^6.0
# species:dpp symbols:dpp) +species:dmel)
# -docclass:existence-uncertain*^6.0
# Query: "(fbgn-(+all:dpp +species:dmel) -docclass:existence-uncertain*^6.0) "
# No. matches = 487 of 44518 documents, in 1.654 sec.
#   << vs 0.026 sec for search contents
# .. add a "use relevance ranks" switch?
#    or use a smart switch -- if searching "all" only?
# --- "contents" catch-all field ---
indexall=true
tokenizer.contents=fbacode$LowerWordTokenizer
fieldrecoder.contents=fbacode$Greek_Recoder,fbacode$Contents_Recoder
fieldtype.contents=UnStored

# --- ID / ID2 / MAIN_ID / docid ---
tokenizer.ID=fbacode$IDTokens
tokenizer.ID2=fbacode$IDTokens
boost.ID=2

#field.docid=MAIN_ID
fieldtype.MAIN_ID=Text
fieldalias.MAIN_ID=docid
##fieldrecoder.ID=fbacode$FBMainID_Recoder
fieldrecoder.ID=fbacode$FBMainField_Recoder
tokenizer.MAIN_ID=fbacode$IDTokens
tokenizer.docid=fbacode$IDTokens
boost.docid=10

# --- CLA / MAIN_CLA / docclass ---
#field.docclass=MAIN_CLA
fieldtype.MAIN_CLA=Text
fieldalias.MAIN_CLA=docclass
fieldrecoder.CLA=fbacode$FBMainField_Recoder,fbacode$ALC_Recoder
#tokenizer.MAIN_CLA=fbacode$cvterms
tokenizer.CLA=fbacode$cvterms
tokenizer.docclass=fbacode$cvterms
boost.docclass=2

## EOR: special-case field sent at the end of each record; it is not meant to
## be indexed.
# NOTE(review): the fieldtype here is UnStored rather than skip -- presumably
# FB_EOR_Recoder keeps it out of the index; confirm.
fieldtype.EOR=UnStored
fieldrecoder.EOR=fbacode$FB_EOR_Recoder

## A tokenfilter is used only if fieldtype=Text or UnStored.

# --- NAM ---
boost.NAM=5

# --- GSYM (.cs is a separate sub-field with its own tokenizer) ---
fieldrecoder.GSYM=fbacode$Greek_Recoder,fbacode$SYM_Recoder
tokenizer.GSYM=fbacode$LowerSymTokens
tokenizer.GSYM.cs=fbacode$SymTokens
# NOTE(review): DebugFilter is still enabled for GSYM.cs -- confirm it is wanted.
tokenfilter.GSYM.cs=fbacode$DebugFilter
boost.GSYM=10

# --- SYM ---
fieldrecoder.SYM=fbacode$Greek_Recoder,fbacode$SYM_Recoder
tokenizer.SYM=fbacode$LowerSymTokens
tokenizer.SYM.cs=fbacode$SymTokens
boost.SYM=5

# --- SYN (the .cs sub-field is skipped) ---
tokenizer.SYN=fbacode$LowerSymTokens
# tokenizer.SYN.cs=fbacode$SymTokens
fieldtype.SYN.cs=skip

# --- DT ---
tokenizer.DT=fbacode$DateTokens
# NOTE(review): DebugFilter is also chained after DateFilter here -- confirm.
tokenfilter.DT=fbacode$DateFilter, fbacode$DebugFilter

# --- ENZ / FNC / CEL: all three use GO_Recoder, each with a .cv sub-field ---
tokenizer.ENZ=fbacode$words
tokenizer.ENZ.cv=fbacode$cvterms
fieldrecoder.ENZ=fbacode$GO_Recoder
boost.ENZ.cv=4
boost.ENZ=2

tokenizer.FNC=fbacode$words
tokenizer.FNC.cv=fbacode$cvterms
fieldrecoder.FNC=fbacode$GO_Recoder
boost.FNC.cv=4
boost.FNC=2

tokenizer.CEL=fbacode$words
tokenizer.CEL.cv=fbacode$cvterms
fieldrecoder.CEL=fbacode$GO_Recoder
boost.CEL.cv=4
boost.CEL=2

# Recode db_xref/id fields into one, like with srs?
# --- DBX ---
tokenizer.DBX=fbacode$dbxrefTokens
boost.DBX=2

## ---- allele parts; see also fbal.props -----------------------
## Each field below pairs a recoder on the base field with the cvterms
## tokenizer on its .cv sub-field; the DebugFilter lines are left disabled.

## ALC + CLA for alleles == class; CLA == wild-type generic if no ALC?
fieldtype.ALC=Text
tokenizer.ALC.cv=fbacode$cvterms
fieldrecoder.ALC=fbacode$ALC_Recoder
#tokenfilter.ALC.cv=fbacode$DebugFilter
fieldvalues.ALC.cv=200
searchskip.ALC=1
boost.ALC=2
boost.ALC.cv=4

# --- GIC ---
tokenizer.GIC.cv=fbacode$cvterms
#tokenfilter.GIC.cv=fbacode$DebugFilter
fieldrecoder.GIC=fbacode$Greek_Recoder,fbacode$PhenotypeCV_Recoder
fieldvalues.GIC.cv=200
boost.GIC.cv=4
boost.GIC=2

# --- GIC2 ---
tokenizer.GIC2.cv=fbacode$cvterms
#tokenfilter.GIC2.cv=fbacode$DebugFilter
fieldrecoder.GIC2=fbacode$Greek_Recoder,fbacode$PhenotypeCV_Recoder
fieldvalues.GIC2.cv=200
boost.GIC2.cv=4
boost.GIC2=2

# --- GIA ---
tokenizer.GIA.cv=fbacode$cvterms
#tokenfilter.GIA.cv=fbacode$DebugFilter
fieldrecoder.GIA=fbacode$Greek_Recoder,fbacode$PhenotypeCV_Recoder
fieldvalues.GIA.cv=200
boost.GIA.cv=4
boost.GIA=2

# --- GIA2 ---
tokenizer.GIA2.cv=fbacode$cvterms
#tokenfilter.GIA2.cv=fbacode$DebugFilter
fieldrecoder.GIA2=fbacode$Greek_Recoder,fbacode$PhenotypeCV_Recoder
fieldvalues.GIA2.cv=200
boost.GIA2.cv=4
boost.GIA2=2

# --- PHI ---
## This one is mainly text, so the cvterms tokenizer stays disabled.
#tokenizer.PHI=fbacode$cvterms

# --- PHM ---
tokenizer.PHM.cv=fbacode$cvterms
#tokenfilter.PHM.cv=fbacode$DebugFilter
fieldrecoder.PHM=fbacode$Greek_Recoder,fbacode$PhenotypeCV_Recoder
fieldvalues.PHM.cv=1200
boost.PHM.cv=4
boost.PHM=2

# --- PHC ---
tokenizer.PHC.cv=fbacode$cvterms
#tokenfilter.PHC.cv=fbacode$DebugFilter
fieldrecoder.PHC=fbacode$Greek_Recoder,fbacode$PhenotypeCV_Recoder
fieldvalues.PHC.cv=1200
boost.PHC.cv=4
boost.PHC=2

# --- MU (uses MU_Recoder; excluded from search via searchskip) ---
tokenizer.MU.cv=fbacode$cvterms
#tokenfilter.MU.cv=fbacode$DebugFilter
fieldrecoder.MU=fbacode$Greek_Recoder,fbacode$MU_Recoder
fieldvalues.MU.cv=300
searchskip.MU=1
boost.MU=1