# dbs/lucegene/webdocs.properties
# index mostly web docs, html and txt
# would like to add pdf indexer ...
# see also LuceneHtmlIndexer.properties
#
# Layout note: keep one property per line and put every comment on its own
# line. In the .properties format a '#' only starts a comment at the beginning
# of a line; anywhere else it becomes part of the value.

LIB_NAME=webdocs
title = FlyBase Web Documents
DATA_ROOT=web/
INDEX_PATH=indices/lucene/webdocs/
## -- fixme - get from url.suffix
MIME_TYPE=text/html

## --------- search/report keys -------------------

## reportExtension = /cgi-bin/fbidq.html?
# NOTE(review): docurl is restored as a live setting; confirm it was not part
# of the commented-out reportExtension line above.
docurl=lookup.jsp?lid=

## Fields that can be searched/displayed
outfields=title,url
storefields=docid,title,url

# Fields to display when xml chosen on search result page.
# displayFields = title, url

## default search field
searchfield=all
# default search field (if one not chosen from list on web)
defaultSearchField = all

# format=html
# NOTE(review): the xml header/footer below carry no XML wrapper text, only a
# newline; confirm that is intended.
header.native=
footer.native=\n
header.xml=\n
footer.xml=
xsl = webdocs.xsl

# fieldList is a flat list of "field name, display label" pairs; a trailing
# backslash continues the value onto the next line.
fieldList =\
  all, All Fields, \
  docid, Document ID,\
  title, Title, \
  summary, Summary, \
  url, URL

# fields that user can sort on
sortFields = title

## ---- indexing values;
# locate data with regex file, folder patterns
# (backslashes are doubled because the properties format treats '\' as an
# escape character, so '\\w' reaches the regex engine as '\w')
regex_folder=^\\w.+$
regex_file=^\\w.+\\.(txt|html|shtml|htm)$

## skip extended till type=gene gets changed to gene_extended
# The dots in genes.txt / aberrations.txt are escaped so they match a literal
# '.', consistent with the '.*\\.old' alternative in the same pattern.
regex_skipfile=genes\\.txt|aberrations\\.txt|.*\\.old|tmp|(index-main|robots).*

## probably want to skip big data files, but maybe index small docs in data/, data2/ ?
## note also indexer gets symlinks which may duplicate other folders
## web/docs/flydocs/ seems a symlink/folder duplicating many other paths
## also skip these - make separate index for them (very many doc files)
## web/allied-data/lk/interactive-fly/
## web/anatomy/Drosophilidae/
## web/docs/working-papers/flybase-usage-stats/
## web/genes/by-map
## web/genes/by-symbol
## web/maps/by-cytoloc/
## web/maps/lk/sean/gifs
## web/maps/lk/kevin/gifs
## web/genes/lk/function/ -- more data
## ? web/allied-data/lk/phylogeny/Drosophilidae-Tree/
## this is old: web/docs/lk/redbook/
# ('common' used to appear twice in this alternation; the redundant second
# occurrence was dropped, which does not change what the pattern matches)
regex_skipfolder=common|by-cytoloc|by-map|by-symbol|interactive-fly|redbook|Drosophilidae|phylogeny|sean|kevin|flybase-usage-stats|flydocs|annotfb|data|data2|images|seqs|templates|stocks/stock-centers/lk/bloomington|tmp|old.*|.*\\.old

INDEX_CLASS=org.eugenes.index.LuceneHtmlIndexer
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer

## append existing index or create new
INDEX_APPEND=false
## index names as values (as well as field names)?
INDEX_TAGS=false
## use fieldname xpath; full top.middle.last field name?
INDEX_XPATH=false
## INDEX_LEVEL=0 means index main records one level below xml tag
INDEX_LEVEL=0
INDEX_BLANKS=false

merge_factor=10
max_field_length=1000000
## memory crash cure:
# NOTE(review): MAX_FIELDS is restored as a live setting, following the
# "comment line, then setting" pattern used elsewhere in this file; confirm it
# was not merely part of the comment above.
MAX_FIELDS=10000

# to create "contents" field of all text
indexall=false

## field indexing parameters
# special summary fields -- replace w/ fieldalias.TAG=newtag
# field.title=RETE
# field.summary=SUMX
# field.docid=ID

## fieldalias.OLDTAG=NEWTAG
## fieldalias precedes fieldtype.tag processing
# fieldalias.ID=docid

## subrecdoc.TAG == say whether to index subrecs as separate docs
## overrides INDEX_LEVEL
# subrecdoc.ALESR=true

## default - UnStored = index but don't store text
fieldtype=UnStored
fieldtype.docid=Text
fieldtype.title=Text
fieldtype.contents=UnStored
fieldtype.summary=UnIndexed

#fieldtype.XXX=Text
#fieldtype.XXX=Keyword
#fieldtype.XXX=UnIndexed
#fieldtype.XXX=UnStored
#fieldtype.XXX=ignore

## HtmlIndexer uses apache.lucene.demo.html indexer classes
# analyzer=org.eugenes.index.BiodataAnalyzer
#
# ## field filters
# tokenfilter.GSYM=org.eugenes.index.BiodataAnalyzer$DataFilter
# tokenfilter.SYM=org.eugenes.index.BiodataAnalyzer$DataFilter
# tokenfilter.ID=org.eugenes.index.BiodataAnalyzer$LowerDataFilter
# tokenfilter.contents=org.eugenes.index.BiodataAnalyzer$LowerWordFilter
#
# ##tokenfilter.YR=org.eugenes.index.BiodataAnalyzer$DateFilter
# tokenfilter.DT=org.eugenes.index.BiodataAnalyzer$DateFilter
# tokenfilter.BLOC=org.eugenes.index.BiodataAnalyzer$NumberFilter
# tokenfilter.CDS=org.eugenes.index.BiodataAnalyzer$NumberFilter
# tokenfilter.SQLEN=org.eugenes.index.BiodataAnalyzer$NumberFilter
# tokenfilter.AALEN=org.eugenes.index.BiodataAnalyzer$NumberFilter
#
#
# ## field tokenizers -- replace with filters
# tokenizer.GSYM=org.eugenes.index.LuceneTableIndexer$DataTokenizer
# tokenizer.SYM=org.eugenes.index.LuceneTableIndexer$DataTokenizer
# tokenizer.ID=org.eugenes.index.LuceneTableIndexer$LowerDataTokenizer
# tokenizer.contents=org.eugenes.index.LuceneTableIndexer$LowerWordTokenizer
# ## also $WordTokenizer
# ## tokenizer.RETE=org.eugenes.index.LuceneTableIndexer$NullTokenizer
#
# ## fieldtyper.RETE=org.eugenes.index.LuceneTableIndexer$ReteHandler