# dbs/lucegene/paperspdf.properties
# d.gilbert, 2004
#
# Library definition for the "paperspdf" collection: PDF documents found
# under DATA_ROOT are indexed into the Lucene index at INDEX_PATH.
# Format notes (java.util.Properties-style syntax is assumed -- confirm
# against the loader):
#   * comments must be on their own line; there are no inline comments
#   * a backslash is an escape character, so a regex backslash is written
#     doubled (\\w, \\.)

LIB_NAME=paperspdf
title = Drosophila OA Publications (PDF)
DATA_ROOT=web/data/papers/
INDEX_PATH=indices/lucene/paperspdf/
MIME_TYPE=application/pdf

#-------- search ------------
searchfield=all
format=table
outfields=docid,summary
storefields=docid,summary,url
# bmc pdfs have bogus author,title,...

# need this in addition to MIME_TYPE ?
native.binary=1
batchformat = application/pdf
batchformats = application/pdf, text/plain, text/xml, text/csv, text/tsv
# nativeformat = application/pdf
header.native=
footer.native=
header.xml=\n
footer.xml=
docurl=lookup.jsp?id=
batch.forward=lookup.jsp

## Fields that can be searched/displayed
# linkto.fbrf=docid>docid

# bmc pdfs have bogus author,title,... so these fields are skipped in search
searchskip.title=1
searchskip.author=1
searchskip.subject=1

## these are for the search system; map generic field names to lib-specific ones
# searchfieldalias.title=title
# searchfieldalias.year=date
# searchfieldalias.author=author
# searchfieldalias.abstract=contents
# searchfieldalias.pubtype=docclass
# searchfieldalias.symbol=contents

# ------ index --------------
# locate data with regex file, folder patterns
regex_folder=
# The dot before "pdf" is escaped as \\. so that, after the properties loader
# removes one level of backslashes, the regex sees a literal "." rather than
# "any character". (A single "\." would be reduced to a bare ".".)
regex_file=^\\w*.*\\.pdf$
regex_skipfile=
regex_skipfolder=.*

INDEX_CLASS=org.eugenes.index.LucenePdfIndexer
INDEX_APPEND=false
## index names as values (as well as field names)?
INDEX_TAGS=false
## use fieldname xpath; full top.middle.last field name?
INDEX_XPATH=false
## INDEX_LEVEL=0 means index main records one level below xml tag
INDEX_LEVEL=0
INDEX_BLANKS=false
merge_factor=10
max_field_length=1000000
## memory crash cure:
MAX_FIELDS=10000

# Alternative analyzer, kept for reference but disabled; only one analyzer=
# assignment should be live so the result does not depend on how the loader
# resolves duplicate keys.
#? analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
analyzer=org.eugenes.index.BiodataAnalyzer2
tokenizer=fbacode$LowerWordTokenizer
#tokenfilter=fbacode$DebugFilter
#tokenfilter.docid=fbacode$DebugEndOfRecordFilter
#tokenfilter.EOR=fbacode$DebugEndOfRecordFilter

# to create "contents" field of all text
indexall=true

# special filename docid parsing for LucenePdfIndexer:
# capture group 1 (the file name without its ".pdf" extension) is the docid.
# The dot is escaped (\\.) for the same reason as in regex_file.
regex_docid=^(\\w+)\\.pdf$
tokenizer.docid=org.eugenes.index.BiodataFilters$LowerDataTokenizer

## default field type, followed by per-field overrides
## (UnStored = index but don't store text)
fieldtype=Text
fieldtype.docid=Text
fieldtype.title=Text
fieldtype.contents=UnStored
fieldtype.summary=Text