# dbs/lucegene/paperspdf.properties
# d.gilbert, 2004

# Library identity: short name and display title.
# NOTE(review): how these keys are consumed is not visible in this file — confirm against the loader.
LIB_NAME=paperspdf
title = Drosophila OA Publications (PDF)

# Relative locations of the source documents and of the Lucene index.
# NOTE(review): the base directory these are resolved against is not shown here — confirm.
DATA_ROOT=web/data/papers/
INDEX_PATH=indices/lucene/paperspdf/

# MIME type of the documents in this library.
MIME_TYPE=application/pdf

#-------- search ------------

# Search field and result format settings.
# NOTE(review): presumably these are defaults that a request can override — confirm.
searchfield=all
format=table

# Field lists for output and for storage.
# NOTE(review): presumably outfields = fields shown in results, storefields = fields stored in the index — confirm.
outfields=docid,summary
storefields=docid,summary,url
# BMC PDFs carry bogus author, title, ... metadata.

# Is this needed in addition to MIME_TYPE?
native.binary=1

# Batch output formats; batchformats is a comma-separated list of MIME types.
batchformat = application/pdf
batchformats = application/pdf, text/plain, text/xml, text/csv,  text/tsv
# nativeformat = application/pdf

# Header/footer text per output format: empty for native output,
# an XML declaration plus a <LuceGene> root element for XML output.
header.native=
footer.native=
header.xml=<?xml version="1.0"?>\n<LuceGene>
footer.xml=</LuceGene>

# Page settings for document lookup and batch forwarding.
# NOTE(review): presumably a document id is appended to docurl — confirm.
docurl=lookup.jsp?id=
batch.forward=lookup.jsp

## Fields that can be searched/displayed

# linkto.fbrf=docid>docid

# BMC PDFs carry bogus author, title, ... metadata.
# NOTE(review): presumably searchskip.<field>=1 excludes that field from searching — confirm.
searchskip.title=1
searchskip.author=1
searchskip.subject=1

## These are for the search system: they map generic field names to library-specific ones.
## All aliases are currently commented out for this library.
# searchfieldalias.title=title
# searchfieldalias.year=date
# searchfieldalias.author=author
# searchfieldalias.abstract=contents
# searchfieldalias.pubtype=docclass
# searchfieldalias.symbol=contents


# ------ index --------------

# Locate data files with regex patterns on file and folder names.
# Backslashes are doubled because the properties loader unescapes "\\" to "\";
# a single "\." would lose its backslash and leave "." as a match-anything wildcard.
# NOTE(review): this assumes java.util.Properties-style escape handling, as the
# existing "\\w" already implies — confirm against the loader.
regex_folder=
# Select only files whose names end in a literal ".pdf".
regex_file=^\\w*.*\\.pdf$
regex_skipfile=
# ".*" matches every folder name.
# NOTE(review): presumably this keeps the indexer out of subfolders — confirm.
regex_skipfolder=.*

# Indexer class used to build this library's index.
INDEX_CLASS=org.eugenes.index.LucenePdfIndexer
# NOTE(review): presumably false means the index is rebuilt rather than appended to — confirm.
INDEX_APPEND=false

## Index <tag> names as values (as well as field names)?
INDEX_TAGS=false
## Use field-name xpath, i.e. the full top.middle.last field name?
INDEX_XPATH=false
## INDEX_LEVEL=0 means index main records one level below the XML <top> tag.
INDEX_LEVEL=0
INDEX_BLANKS=false

# Index-writer tuning values.
# NOTE(review): presumably passed through to the Lucene index writer — confirm.
merge_factor=10
max_field_length=1000000
## Cure for memory crashes:
MAX_FIELDS=10000

# Analyzer and tokenizer classes; the Lucene StandardAnalyzer line below is a
# commented-out alternative.
# NOTE(review): how the "fbacode$..." names resolve to classes is not shown here — confirm.
#? analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
analyzer=org.eugenes.index.BiodataAnalyzer2
tokenizer=fbacode$LowerWordTokenizer
# Debug token filters, currently disabled.
#tokenfilter=fbacode$DebugFilter
#tokenfilter.docid=fbacode$DebugEndOfRecordFilter
#tokenfilter.EOR=fbacode$DebugEndOfRecordFilter

# Create a "contents" field containing all text.
indexall=true

# Special filename-to-docid parsing for LucenePdfIndexer: the pattern's capture
# group is the word-character name before a literal ".pdf" extension.
# The dot is escaped with a doubled backslash so it survives properties
# unescaping and matches only a real "." rather than any character.
# NOTE(review): presumably group 1 becomes the docid — confirm against LucenePdfIndexer.
regex_docid=^(\\w+)\\.pdf$
tokenizer.docid=org.eugenes.index.BiodataFilters$LowerDataTokenizer

## Field storage types. UnStored = index but don't store the text.
## NOTE(review): the bare "fieldtype" key is presumably the default for fields
## without their own fieldtype.<name> entry — confirm against the indexer.
fieldtype=Text
fieldtype.docid=Text
fieldtype.title=Text
fieldtype.contents=UnStored
fieldtype.summary=Text