# dbs/lucegene/webdocs.properties
# Indexes mostly web documents (HTML and plain text).
# TODO: add a PDF indexer.
# see also LuceneHtmlIndexer.properties
# Identity and locations for the "webdocs" library/index.
# NOTE(review): DATA_ROOT and INDEX_PATH are relative paths; the base directory
# they are resolved against is not defined in this file -- confirm with the loader.
LIB_NAME=webdocs
title = FlyBase Web Documents
DATA_ROOT=web/
INDEX_PATH=indices/lucene/webdocs/
## FIXME: the MIME type is hard-coded here; it should be derived from the url suffix
MIME_TYPE=text/html
## --------- search/report keys -------------------
## reportExtension = /cgi-bin/fbidq.html?
# URL prefix for document links; presumably a document id is appended after
# "lid=" -- confirm against the code that reads this key.
docurl=lookup.jsp?lid=
## Fields that can be searched/displayed
outfields=title,url
# NOTE(review): url is listed here but this file has no fieldtype.url entry,
# so only the default fieldtype would apply to it -- confirm url is really stored.
storefields=docid,title,url
# Fields to display when xml is chosen on the search result page.
# displayFields = title, url
## default search field
searchfield=all
# default search field (if one is not chosen from the list on the web page)
# NOTE(review): searchfield and defaultSearchField both describe a default
# search field and both are "all"; keep them in sync unless one is confirmed unused.
defaultSearchField = all
# format=html
# Header/footer text emitted for the "native" and "xml" output formats.
header.native=
footer.native=\n
header.xml=\n
footer.xml=
xsl = webdocs.xsl
# Field list; the entries appear to alternate: field name, display label.
# Do not put comment lines inside the continued value below -- they would
# become part of the value.
fieldList =\
all, All Fields, \
docid, Document ID,\
title, Title, \
summary, Summary, \
url, URL
# fields that the user can sort on
sortFields = title
## ---- indexing values;
# Locate data with regex file and folder patterns.
# Backslashes are doubled because the properties format consumes one level
# of escaping before the value reaches the regex engine.
regex_folder=^\\w.+$
regex_file=^\\w.+\\.(txt|html|shtml|htm)$
## skip extended till type=gene gets changed to gene_extended
## The dots in the literal file names are escaped so that "." matches only a
## real dot (consistent with the existing .*\\.old alternative).
regex_skipfile=genes\\.txt|aberrations\\.txt|.*\\.old|tmp|(index-main|robots).*
## Probably want to skip big data files, but maybe index small docs in data/, data2/ ?
## Note: the indexer also picks up symlinks, which may duplicate other folders;
## web/docs/flydocs/ seems to be a symlink/folder duplicating many other paths.
## Also skip these - make a separate index for them (very many doc files):
##   web/allied-data/lk/interactive-fly/
##   web/anatomy/Drosophilidae/
##   web/docs/working-papers/flybase-usage-stats/
##   web/genes/by-map
##   web/genes/by-symbol
##   web/maps/by-cytoloc/
##   web/maps/lk/sean/gifs
##   web/maps/lk/kevin/gifs
##   web/genes/lk/function/ -- more data
##   ? web/allied-data/lk/phylogeny/Drosophilidae-Tree/
## This is old: web/docs/lk/redbook/
## NOTE(review): the stocks/stock-centers/lk/bloomington alternative contains "/";
## it can only match if the pattern is tested against a path rather than a single
## folder name -- confirm against the indexer.
regex_skipfolder=common|by-cytoloc|by-map|by-symbol|interactive-fly|redbook|Drosophilidae|phylogeny|sean|kevin|flybase-usage-stats|flydocs|annotfb|data|data2|images|seqs|templates|stocks/stock-centers/lk/bloomington|tmp|old.*|.*\\.old
# Indexer implementation class and the Lucene analyzer used for this index.
INDEX_CLASS=org.eugenes.index.LuceneHtmlIndexer
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
## append to an existing index, or create a new one
INDEX_APPEND=false
## index tag names as values (as well as using them as field names)?
INDEX_TAGS=false
## use the xpath-style field name, i.e. the full top.middle.last name?
INDEX_XPATH=false
## INDEX_LEVEL=0 means index main records one level below the xml tag
INDEX_LEVEL=0
INDEX_BLANKS=false
# NOTE(review): merge_factor and max_field_length look like Lucene IndexWriter
# tuning values (mergeFactor / maxFieldLength) -- confirm how the indexer applies them.
merge_factor=10
max_field_length=1000000
## workaround for memory crashes:
MAX_FIELDS=10000
# whether to create a "contents" field of all text
indexall=false
## field indexing parameters
# special summary fields -- replace with fieldalias.TAG=newtag
# field.title=RETE
# field.summary=SUMX
# field.docid=ID
## fieldalias.OLDTAG=NEWTAG
## fieldalias is processed before fieldtype.tag
# fieldalias.ID=docid
## subrecdoc.TAG == say whether to index subrecords as separate docs
## overrides INDEX_LEVEL
# subrecdoc.ALESR=true
## default - UnStored = index but don't store text
fieldtype=UnStored
fieldtype.docid=Text
fieldtype.title=Text
fieldtype.contents=UnStored
# NOTE(review): summary is offered as a search field in fieldList but is typed
# UnIndexed here; if this follows Lucene's Field.UnIndexed (stored, not indexed),
# searches restricted to summary would match nothing -- confirm this is intended.
fieldtype.summary=UnIndexed
# Commented-out examples of fieldtype values:
#fieldtype.XXX=Text
#fieldtype.XXX=Keyword
#fieldtype.XXX=UnIndexed
#fieldtype.XXX=UnStored
#fieldtype.XXX=ignore
## HtmlIndexer uses apache.lucene.demo.html indexer classes
# analyzer=org.eugenes.index.BiodataAnalyzer
#
# ## field filters
# tokenfilter.GSYM=org.eugenes.index.BiodataAnalyzer$DataFilter
# tokenfilter.SYM=org.eugenes.index.BiodataAnalyzer$DataFilter
# tokenfilter.ID=org.eugenes.index.BiodataAnalyzer$LowerDataFilter
# tokenfilter.contents=org.eugenes.index.BiodataAnalyzer$LowerWordFilter
#
# ##tokenfilter.YR=org.eugenes.index.BiodataAnalyzer$DateFilter
# tokenfilter.DT=org.eugenes.index.BiodataAnalyzer$DateFilter
# tokenfilter.BLOC=org.eugenes.index.BiodataAnalyzer$NumberFilter
# tokenfilter.CDS=org.eugenes.index.BiodataAnalyzer$NumberFilter
# tokenfilter.SQLEN=org.eugenes.index.BiodataAnalyzer$NumberFilter
# tokenfilter.AALEN=org.eugenes.index.BiodataAnalyzer$NumberFilter
#
#
# ## field tokenizers -- replace with filters
# tokenizer.GSYM=org.eugenes.index.LuceneTableIndexer$DataTokenizer
# tokenizer.SYM=org.eugenes.index.LuceneTableIndexer$DataTokenizer
# tokenizer.ID=org.eugenes.index.LuceneTableIndexer$LowerDataTokenizer
# tokenizer.contents=org.eugenes.index.LuceneTableIndexer$LowerWordTokenizer
# ## also $WordTokenizer
# ## tokenizer.RETE=org.eugenes.index.LuceneTableIndexer$NullTokenizer
#
# ## fieldtyper.RETE=org.eugenes.index.LuceneTableIndexer$ReteHandler