# dbs/lucegene/paperspdf.properties
# d.gilbert, 2004
LIB_NAME=paperspdf
title = Drosophila OA Publications (PDF)
DATA_ROOT=web/data/papers/
INDEX_PATH=indices/lucene/paperspdf/
MIME_TYPE=application/pdf
#-------- search ------------
searchfield=all
format=table
outfields=docid,summary
storefields=docid,summary,url
# BMC PDFs carry bogus embedded metadata (author, title, ...).
# TODO: is native.binary needed in addition to MIME_TYPE?
native.binary=1
batchformat = application/pdf
batchformats = application/pdf, text/plain, text/xml, text/csv, text/tsv
# nativeformat = application/pdf
header.native=
footer.native=
header.xml=\n
footer.xml=
docurl=lookup.jsp?id=
batch.forward=lookup.jsp
## Fields that can be searched/displayed
# linkto.fbrf=docid>docid
# BMC PDFs have bogus author, title, ... metadata; the searchskip.* flags below presumably exclude those fields from search.
searchskip.title=1
searchskip.author=1
searchskip.subject=1
## these are for search system; map generic field names to lib-specific
# searchfieldalias.title=title
# searchfieldalias.year=date
# searchfieldalias.author=author
# searchfieldalias.abstract=contents
# searchfieldalias.pubtype=docclass
# searchfieldalias.symbol=contents
# ------ index --------------
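# Locate data files using regex patterns for file and folder names.
# NOTE(review): if this file is loaded with java.util.Properties, a single-backslash escape such as "\." in regex_file is read as "." (any character); a doubled backslash would be needed for a literal dot. Confirm against the loader.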
regex_folder=
regex_file=^\\w*.*\.pdf$
regex_skipfile=
regex_skipfolder=.*
INDEX_CLASS=org.eugenes.index.LucenePdfIndexer
INDEX_APPEND=false
## index names as values (as well as field names)?
INDEX_TAGS=false
## use fieldname xpath; full top.middle.last field name?
INDEX_XPATH=false
## INDEX_LEVEL=0 means index main records one level below xml tag
INDEX_LEVEL=0
INDEX_BLANKS=false
merge_factor=10
max_field_length=1000000
## workaround for out-of-memory crashes during indexing:
MAX_FIELDS=10000
#? analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
analyzer=org.eugenes.index.BiodataAnalyzer2
tokenizer=fbacode$LowerWordTokenizer
#tokenfilter=fbacode$DebugFilter
#tokenfilter.docid=fbacode$DebugEndOfRecordFilter
#tokenfilter.EOR=fbacode$DebugEndOfRecordFilter
# to create "contents" field of all text
indexall=true
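# Special parsing of the docid from the file name, for LucenePdfIndexer.
# NOTE(review): the "." before "pdf" in regex_docid is unescaped, so it matches any character, not only a literal dot. Confirm whether this is intended.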
regex_docid=^(\\w+).pdf$
tokenizer.docid=org.eugenes.index.BiodataFilters$LowerDataTokenizer
## default field type; UnStored = index the text but don't store it
fieldtype=Text
fieldtype.docid=Text
fieldtype.title=Text
fieldtype.contents=UnStored
fieldtype.summary=Text