# dbs/lucegene/bindxml.properties
# d.gilbert, fall 2004
#
# Search/report and indexing settings for the "bindxml" library
# (BIND Protein Interactions, XML data).
# One property per line; full-line '#' comments only.

LIB_NAME = bindxml
title = BIND Protein Interactions

# can use absolute paths here, or assume ARGOS_SERVICE_ROOT is set
DATA_ROOT=web/data/extdb/bind/
INDEX_PATH=indices/lucene/bindxml/
MIME_TYPE=text/xml

## --------- search/report keys -------------------

# docid is now interaction id; gi: for Geninfo-id
docurl=http://bind.ca/Action?idsearch=bindid:
batch.forward=lookup.jsp

## parser chopped at '-' ; ok now?
# NOTE(review): confirm that the un-suffixed linkto entry is meant to be active
# alongside linkto.fbgn.
linkto= Object-id_str>fbgn-docid
linkto.fbgn=Object-id_str>docid

searchfield=all

# Fields to display when xml chosen on search result page.
outfields=docid,Molecule_a,MolID_a,Molecule_b,MolID_b,BIND-descr_simple-descr,url
storefields=docid,BIND-descr_simple-descr,Molecule_a,Molecule_b,MolID_a,MolID_b,url
## title == BIND-descr_simple-descr but not yet stored

## for xslt to produce the result page tables
resultxsl = conf/bindxml_result.xsl
resultspage = resultxsl.jsp

# NOTE(review): the header/footer values below contain only newline escapes;
# presumably they once wrapped XML markup -- verify against a pristine copy.
header.native=\n\
\n\
\n
footer.native=\n

#
# SAXException Relative URI "BIND.dtd"; can not be resolved without a base URI.
# >> dang these parsers, how do you turn off this w/o editing data?
# perl -pi -e's/^<\!(DOCTYPE.*)>/<\!-- $1 -->/'
header.xml=\n
footer.xml=\n

# ? this returns flybase FBrf reports now ; change
batchformat = text/xml
# comma-separated list of MIME types
batchformats = text/xml, text/plain, text/csv, text/tsv
title.text/xml=BIND XML
nativeformat = text/xml
# batchheadlines=1
# batch.outfields= docid, docclass, ArticleTitle, PMID, LastName, Year, all

fieldlabel.docid=ID
fieldlabel.docclass=Class

## for search only?
searchskip.docclass=0
searchskip.title=0
searchskip.docid=1
searchskip.summary=1

# ---- indexing values; NOTE: need these for proper (cased) searches

# locate data with regex file, folder patterns
regex_folder=
# NOTE(review): if this file is loaded via java.util.Properties, '\.' is read
# as a bare '.' (the backslash of an unrecognized escape is dropped); use '\\.'
# if a literal dot is required -- confirm which loader reads this file.
regex_file=^bind.*\.xml$
regex_skipfile=
regex_skipfolder=.*

INDEX_CLASS=org.eugenes.index.LuceneXmlIndexer
## append existing index or create new
INDEX_APPEND=false
## index names as values (as well as field names)?
INDEX_TAGS=false
## index values ?
## none of medline attributes look interesting
INDEX_ATTRIBUTES=true
## use fieldname xpath; full top.middle.last field name?
#? INDEX_XPATH=true -- Medline.xml leaf names are unique enough we can drop the Xpath mess
INDEX_XPATH=false
## INDEX_LEVEL=0 means index main records one level below xml tag
INDEX_LEVEL=3
INDEX_BLANKS=false

# NOTE(review): the outline below looks like an XML nesting sketch whose
# element names are missing; kept verbatim -- restore from a pristine copy.
# level=0 # # # # # # # # # level=1 # level=2 # # # # level=2 # # # # level=2 # # # #

# zcat fbmedline*.gz |\
# perl -pe'if(m,ID:(FBrf\d+),){$id=$1} s,,$id\n,;'\
# > medline.xml
# grep -c FlyBaseID med*xml = 32640

## IndexWriter opts
## merge=10 is default; 4 == less mem usage ; 2 minimum
merge_factor=4
## max_field_length is max # terms/field
max_field_length=1000000
MAX_FIELDS=10000

## field indexing parameters
## sumfields list needs to match field.xxx common summary fields
sumfields=docid,docclass,title

# special summary fields -- replace w/ fieldalias.TAG=newtag
# these must always have fieldtype Text or UnIndexed to be useful
# un-suffixed fieldtype; the fieldtype.<name> entries below set it per field
fieldtype=UnStored
fieldtype.docid=Text
fieldtype.Geninfo-id=Text
fieldtype.Interaction-id=Text
field.docid=Interaction-id
# this is BIND GI id
# Interaction-id is interaction id; bindid:

fieldtype.BIND-descr_simple-descr=Text
field.title=BIND-descr_simple-descr
fieldtype.title=Text

fieldtype.Pub_muid=Text
fieldtype.PubMedId=Text
fieldtype.BIND-object_short-label=Text
fieldtype.BIND-object_other-names_E=Text
fieldtype.BIND-object_descr=Text
fieldtype.Object-id_id=Text
fieldtype.Object-id_str=Text
# ^^ FBgn ids are here
#field.docid=Object-id_str
#fieldalias.FlyBaseID=docid

analyzer=org.eugenes.index.BiodataAnalyzer2
# all field defaults
tokenizer=fbacode$LowerWordTokenizer
tokenfilter=fbacode$DebugFilter

## make DBX FB id field instead of relying on Object-id_str ?
## want to separately index BIND-Interaction_a and _b fields ??
## esp want FBid and BIND-object_short-label from both for results views
## this wont work with INDEX_XPATH=false
#fieldalias.BIND-Interaction_a.BIND-object.BIND-object_short-label=Molecule_a
#fieldalias.BIND-Interaction_b.BIND-object.BIND-object_short-label=Molecule_b

# Recoders for the short-label and Object-id_str fields; the fieldalias
# entries map the resulting *_a / *_b field names to the Molecule_* / MolID_*
# names listed in outfields/storefields.
fieldrecoder.BIND-object_short-label=fbacode$BINDInter_Recoder
fieldrecoder.Object-id_str=fbacode$BINDInter_Recoder,fbacode$FBID_Recoder
fieldalias.BIND-object_short-label_a=Molecule_a
fieldalias.BIND-object_short-label_b=Molecule_b
fieldalias.Object-id_str_a=MolID_a
fieldalias.Object-id_str_b=MolID_b
fieldtype.Molecule_a=Text
fieldtype.Molecule_b=Text
fieldtype.MolID_a=Text
fieldtype.MolID_b=Text
# fieldrecoder.Object-id_str=fbacode$FBID_Recoder
fieldtype.DBX=Text

## this full xpath is hopeless..
# fieldalias.BIND-Interaction_b.BIND-object.BIND-object_id\
# .BIND-object-type-id.BIND-object-type-id_protein.BIND-id.BIND-id_other\
# .Seq-id.Seq-id_general.Dbtag.Dbtag_tag.Object-id.Object-id_str
# =FBID_B
# fieldtype.FBID_B=Text

tokenfilter.docid=fbacode$DebugEndOfRecordFilter
#tokenfilter.EOR=fbacode$DebugEndOfRecordFilter

# to create "contents" field of all text
indexall=false

# # AddCommonField_FieldRecoder
# # generate base fieldname fields when INDEX_XPATH=true
# # this doesnt prevent default indexing of full xpath
# # using property 'fieldalias.att_timestamp=timestamp' will collapse all
# # for given field.
#
# #fieldrecoder.name=LucegeneIndexers$AddCommonField_FieldRecoder
# #fieldrecoder.att_id=LucegeneIndexers$AddCommonField_FieldRecoder
# #fieldrecoder.synonym=LucegeneIndexers$AddCommonField_FieldRecoder
# #fieldrecoder.description=LucegeneIndexers$AddCommonField_FieldRecoder
# #
# tokenfilter.att_length=org.eugenes.index.BiodataAnalyzer$NumberFilter
# #
## these look useless:
# fieldtype.QualifierName.att_MajorTopicYN=skip
# fieldtype.DescriptorName.att_MajorTopicYN=skip
# fieldtype.RegistryNumber=skip
# fieldtype.GrantList.att_CompleteYN=skip
#
#----------------- sample debug output
# bin/lucegene-search.sh -verbose=1 -lib medline