# dbs/lucegene/bindxml.properties
# d.gilbert, fall 2004
# Library name and display title for this index.
LIB_NAME = bindxml
title = BIND Protein Interactions
# Paths below may be absolute; the relative forms used here assume
# ARGOS_SERVICE_ROOT is set (presumably as their base directory -- confirm).
DATA_ROOT=web/data/extdb/bind/
INDEX_PATH=indices/lucene/bindxml/
MIME_TYPE=text/xml
## --------- search/report keys -------------------
# docid is now the interaction id; "gi:" is used for the Geninfo-id.
# docurl is a URL prefix ending in "bindid:" (presumably the docid is
# appended to it -- confirm against the consuming code).
docurl=http://bind.ca/Action?idsearch=bindid:
batch.forward=lookup.jsp
## The parser used to chop values at '-'.
## NOTE(review): confirm the hyphenated values below now load intact.
linkto= Object-id_str>fbgn-docid
linkto.fbgn=Object-id_str>docid
searchfield=all
# Fields to display when XML is chosen on the search result page.
outfields=docid,Molecule_a,MolID_a,Molecule_b,MolID_b,BIND-descr_simple-descr,url
# storefields names the same seven fields as outfields, in a different order.
storefields=docid,BIND-descr_simple-descr,Molecule_a,Molecule_b,MolID_a,MolID_b,url
## title == BIND-descr_simple-descr, but title is not yet stored
## XSLT stylesheet and JSP page used to produce the result page tables
resultxsl = conf/bindxml_result.xsl
resultspage = resultxsl.jsp
# Header and footer values for the "native" output format.
# The trailing backslashes continue header.native across three lines, so its
# value is three "\n" escapes; do not insert anything between those lines.
header.native=\n\
\n\
\n
footer.native=\n
#
# Known problem when parsing the data:
#   SAXException Relative URI "BIND.dtd"; can not be resolved without a base URI.
# >> How can this be turned off in the parser without editing the data?
# Possible workaround (edits the data: comments out the DOCTYPE line):
#   perl -pi -e's/^<\!(DOCTYPE.*)>/<\!-- $1 -->/'
# Header and footer values for the "xml" output format.
header.xml=\n
footer.xml=\n
# ? this returns flybase FBrf reports now ; change
# NOTE(review): the remark above mentions FlyBase FBrf reports, not BIND;
# it may be left over from another library's config -- confirm or remove.
# Batch output formats; batchformat is one of the entries in batchformats.
batchformat = text/xml
batchformats = text/xml, text/plain, text/csv, text/tsv
title.text/xml=BIND XML
nativeformat = text/xml
# batchheadlines=1
# NOTE(review): the disabled batch.outfields below lists ArticleTitle, PMID,
# LastName and Year, which do not appear among this file's BIND fields --
# replace them before enabling.
# batch.outfields= docid, docclass, ArticleTitle, PMID, LastName, Year, all
# Display labels for fields.
fieldlabel.docid=ID
fieldlabel.docclass=Class
## for search only?
## searchskip.FIELD flags (0 or 1), one per summary field.
## NOTE(review): the meaning of 0 vs 1 is not stated here -- confirm.
searchskip.docclass=0
searchskip.title=0
searchskip.docid=1
searchskip.summary=1
# ---- indexing values; NOTE: need these for proper (cased) searches
# Locate data files with regex patterns for file and folder names.
regex_folder=
# Matches names that start with "bind" and end with ".xml".
regex_file=^bind.*\.xml$
regex_skipfile=
# ".*" matches any name (presumably so that all sub-folders are skipped --
# confirm against the indexer).
regex_skipfolder=.*
INDEX_CLASS=org.eugenes.index.LuceneXmlIndexer
## append to the existing index (true) or create a new one (false)
INDEX_APPEND=false
## index names as values (as well as field names)?
INDEX_TAGS=false
## index values ?
## none of medline attributes look interesting
## NOTE(review): the remark above is about medline, yet attributes are
## enabled here for BIND -- confirm this setting is intended.
INDEX_ATTRIBUTES=true
## use fieldname xpath; full top.middle.last field name?
#? INDEX_XPATH=true -- Medline.xml leaf names are unique enough we can drop the Xpath mess
INDEX_XPATH=false
## INDEX_LEVEL=0 means index main records one level below xml tag
INDEX_LEVEL=3
INDEX_BLANKS=false
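# Outline of record nesting levels (see INDEX_LEVEL above).
# NOTE: the XML element names appear to have been lost from this outline;
# only the level markers remain.
# level=0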
#
#
#
#
#
#
#
#
# level=1
# level=2
#
#
#
# level=2
#
#
#
# level=2
#
#
#
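# NOTE: the commands below prepare FlyBase medline data, not BIND data; they
# appear to be left over from the medline config. The substitution pattern in
# the perl one-liner also appears to be incomplete.
# zcat fbmedline*.gz |\
# perl -pe'if(m,ID:(FBrf\d+),){$id=$1} s,,$id\n,;'\
# > medline.xml
# grep -c FlyBaseID med*xml = 32640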
## IndexWriter opts
## merge_factor: 10 is the default; 4 uses less memory; 2 is the minimum
merge_factor=4
## max_field_length is the maximum number of terms per field
max_field_length=1000000
## NOTE(review): MAX_FIELDS is not explained in this file; presumably an
## upper bound on the number of fields -- confirm against the indexer.
MAX_FIELDS=10000
## field indexing parameters
## sumfields list needs to match field.xxx common summary fields
sumfields=docid,docclass,title
# special summary fields -- replace w/ fieldalias.TAG=newtag
# these must always have fieldtype Text or UnIndexed to be useful
# Bare "fieldtype" is presumably the default for fields without their own
# fieldtype.NAME entry -- confirm against the indexer.
fieldtype=UnStored
fieldtype.docid=Text
fieldtype.Geninfo-id=Text
fieldtype.Interaction-id=Text
# docid is taken from Interaction-id.
field.docid=Interaction-id
# this is BIND GI id
# NOTE(review): "this" above presumably refers to Geninfo-id (gi:), not to
# Interaction-id -- confirm.
# Interaction-id is interaction id; bindid:
fieldtype.BIND-descr_simple-descr=Text
# title is taken from BIND-descr_simple-descr.
field.title=BIND-descr_simple-descr
fieldtype.title=Text
fieldtype.Pub_muid=Text
fieldtype.PubMedId=Text
fieldtype.BIND-object_short-label=Text
fieldtype.BIND-object_other-names_E=Text
fieldtype.BIND-object_descr=Text
fieldtype.Object-id_id=Text
fieldtype.Object-id_str=Text
# ^^ FBgn ids are here
# Disabled alternatives that would use Object-id_str / FlyBaseID as docid:
#field.docid=Object-id_str
#fieldalias.FlyBaseID=docid
# Analyzer class used for this index.
analyzer=org.eugenes.index.BiodataAnalyzer2
# all field defaults
# NOTE(review): the default token filter is named DebugFilter -- confirm it
# is meant to stay enabled.
tokenizer=fbacode$LowerWordTokenizer
tokenfilter=fbacode$DebugFilter
## make DBX FB id field instead of relying on Object-id_str ?
## want to separately index BIND-Interaction_a and _b fields ??
## esp want FBid and BIND-object_short-label from both for results views
## this won't work with INDEX_XPATH=false
#fieldalias.BIND-Interaction_a.BIND-object.BIND-object_short-label=Molecule_a
#fieldalias.BIND-Interaction_b.BIND-object.BIND-object_short-label=Molecule_b
# Recoders applied to the short-label and Object-id_str fields; the aliases
# below rename the resulting _a/_b fields to the Molecule_*/MolID_* names
# listed in outfields and storefields (presumably the recoder produces the
# _a/_b suffixes -- confirm).
fieldrecoder.BIND-object_short-label=fbacode$BINDInter_Recoder
fieldrecoder.Object-id_str=fbacode$BINDInter_Recoder,fbacode$FBID_Recoder
fieldalias.BIND-object_short-label_a=Molecule_a
fieldalias.BIND-object_short-label_b=Molecule_b
fieldalias.Object-id_str_a=MolID_a
fieldalias.Object-id_str_b=MolID_b
fieldtype.Molecule_a=Text
fieldtype.Molecule_b=Text
fieldtype.MolID_a=Text
fieldtype.MolID_b=Text
# Earlier single-recoder form, superseded by the two-recoder entry above:
# fieldrecoder.Object-id_str=fbacode$FBID_Recoder
fieldtype.DBX=Text
## this full xpath is hopeless..
# fieldalias.BIND-Interaction_b.BIND-object.BIND-object_id\
# .BIND-object-type-id.BIND-object-type-id_protein.BIND-id.BIND-id_other\
# .Seq-id.Seq-id_general.Dbtag.Dbtag_tag.Object-id.Object-id_str
# =FBID_B
# fieldtype.FBID_B=Text
# Token filter override for the docid field only.
tokenfilter.docid=fbacode$DebugEndOfRecordFilter
#tokenfilter.EOR=fbacode$DebugEndOfRecordFilter
# to create "contents" field of all text
indexall=false
# # AddCommonField_FieldRecoder
# # generates base-fieldname fields when INDEX_XPATH=true
# # this doesn't prevent default indexing of the full xpath
# # using a property such as 'fieldalias.att_timestamp=timestamp' will
# # collapse all of them for the given field.
#
# #fieldrecoder.name=LucegeneIndexers$AddCommonField_FieldRecoder
# #fieldrecoder.att_id=LucegeneIndexers$AddCommonField_FieldRecoder
# #fieldrecoder.synonym=LucegeneIndexers$AddCommonField_FieldRecoder
# #fieldrecoder.description=LucegeneIndexers$AddCommonField_FieldRecoder
#
# # tokenfilter.att_length=org.eugenes.index.BiodataAnalyzer$NumberFilter
#
# ## these look useless:
# fieldtype.QualifierName.att_MajorTopicYN=skip
# fieldtype.DescriptorName.att_MajorTopicYN=skip
# fieldtype.RegistryNumber=skip
# fieldtype.GrantList.att_CompleteYN=skip
# #----------------- sample debug output
# bin/lucegene-search.sh -verbose=1 -lib medline