# dbs/lucegene/uniprot.properties
# LuceGene index and search settings for the Swiss-Prot/UniProt sequence library.

# http://www.pir.uniprot.org/support/docs/rel_notes/relnote3.0.html
#
# Database -- Entries
# -------------------------------------------------------------
# UniProt -- 1,612,609 (UniProt/Swiss-Prot 45.0: 163,235; UniProt/TrEMBL 28.0: 1,449,374)


# Library identity: short library name and the human-readable title.
LIB_NAME = uniprot
title = UniProt sequences

# Data and index locations. Relative paths are used here; absolute paths
# can be used instead, otherwise ARGOS_SERVICE_ROOT is assumed to be set
# (presumably the base directory these are resolved against -- confirm).
DATA_ROOT=web/data/uniprot/
INDEX_PATH=indices/lucene/uniprot/

# MIME type for records of this library. It needs "text" in it to get
# browser output via lookup (open question from the original note: ok?).
# Alternative: biosequence/embl;text -- that would also need batchformat
# (in the search/report keys) changed to match.
MIME_TYPE=text/embl

## --------- search/report keys -------------------

# Page targets: docurl ends in "id=", so presumably the record id is
# appended to it; batch.forward names the page used for batch requests
# (NOTE(review): confirm against the search servlet).
docurl=lookup.jsp?id=
batch.forward=lookup.jsp

# batchformat is a single MIME type, batchformats a comma-separated list
# of MIME types (presumably default vs. offered formats -- confirm).
# title.<mime type> gives the display title for that format.
batchformat = text/embl
batchformats = text/embl, text/xml, text/csv,  text/tsv
title.text/embl=EMBL Sequence

# Fields to display when xml chosen on search result page.
outfields= GN, DE, AC, OS
# Same list as outfields plus docid.
# NOTE(review): presumably the fields whose text is kept in the index -- confirm.
storefields= docid, GN, DE, AC, OS

# Allow batch fetch of headline fields.
batchheadlines=1
# Batch output field list: the storefields list plus "contents".
batch.outfields= docid, GN, DE,  AC, OS, contents

# Display titles for the two-letter EMBL/Swiss-Prot line codes
# (title.<line code> = <title>).
# Values deliberately carry no trailing padding: java.util.Properties-style
# loaders keep whitespace after the value as part of the value, so padded
# entries would yield titles such as "ID             ".
title.ID = ID
title.AC = accession
title.SV = seq_version
title.NI = nuc_identifier
title.DT = date
title.DE = description
title.KW = keyword
title.OS = species
title.OC = classification
title.OG = organelle
title.RN = ref_number
title.RC = ref_comment
title.RP = ref_positions
title.RX = ref_crossref
title.RA = ref_author
title.RT = ref_title
title.RL = ref_location
title.DR = database_crossref
title.FH = feature_header
title.FT = feature_table
title.CC = comments
title.GN = gene
title.SQ = sequence

## TODO: add FT.xxxx fields

# Line codes that have no title.<code> entry:
# XX - spacer line
# bb - (blanks) sequence data
# // - termination line


selectfields = false


# "contents" field: display title, and searchskip flag set to 1
# (presumably excludes it from field searches -- confirm).
title.contents=EMBL Sequence
searchskip.contents=1

# Disabled cross-library link definitions, kept for reference.
# linkto=\
#   db_xref>fban-ID\n\
#   docid>blasttab-QueryID\n

# Per-format header/footer text. native: empty header, newline footer.
# xml: XML declaration plus opening <LuceGene> element as header, closing
# </LuceGene> as footer. "\n" is the properties escape for a newline.
header.native=
footer.native=\n
header.xml=<?xml version="1.0" encoding="ISO-8859-1"?>\n<LuceGene>
footer.xml=\n</LuceGene>

# Search field names; both are set to "all".
searchfield = all
searchallfield = all

# Report format.
format=table

## ---- indexing values -----------


# File/folder selection patterns (regular expressions).
# regex_file: "\\" is the properties escape for one backslash, so under
# java.util.Properties-style loading the pattern is ^\w+.*$ (names that
# start with a word character).
regex_folder=
regex_file=^\\w+.*$
 
# regex_skipfile is empty; regex_skipfolder=(.*) matches any folder name
# (presumably so that no sub-folders are descended into -- confirm).
regex_skipfile=
regex_skipfolder=(.*)

# Indexer implementation; the Readseq-based indexer is the disabled alternative.
# INDEX_CLASS=org.eugenes.index.LuceneReadseqIndexer
INDEX_CLASS=org.eugenes.index.LuceneEMBLIndexer

## append existing index or create new
INDEX_APPEND=false
## index <tag> names as values (as well as field names)?
INDEX_TAGS=false
## use fieldname xpath; full top.middle.last field name?
INDEX_XPATH=false
## INDEX_LEVEL=0 means index main records one level below xml <top> tag
INDEX_LEVEL=0
INDEX_BLANKS=false

## IndexWriter opts ; merge=10 is default; 4 == less mem usage ; 2 minimum
merge_factor=6
max_field_length=1000000
MAX_FIELDS=10000

analyzer=org.eugenes.index.BiodataAnalyzer2

# Active tokenizer; LowerDataTokenizer is the disabled alternative.
tokenizer=BioIndexers$LowerWordTokenizer
#tokenizer=BioIndexers$LowerDataTokenizer

# Debug token filters, disabled.
# tokenfilter=BioIndexers$DebugFilter
# tokenfilter.EOR=BioIndexers$DebugEndOfRecordFilter

# indexall: create a "contents" field of all text (== defline for fasta).
# contents has become the full sequence record - test, see if useful;
# i.e. whether the data files can then be dropped.
indexall=false

# Disabled settings for the "contents" field.
# fieldtype.contents=UnIndexed
# tokenizer.contents=org.eugenes.index.biodata.DataTokenizer
# # for retrieval, native format
# contents.native=contents

## field indexing parameters
sumfields=docid

## Default field type, used where no fieldtype.<name> entry is given
## (presumably -- confirm). Type names appear to follow Lucene's Field
## factory methods (Text, UnStored, UnIndexed, Keyword):
## UnStored = index but don't store the text.
fieldtype=UnStored

## special EMBL field = ' ' (blank line code) = sequence data; not indexed
fieldtype.sequence=skip

# Names for the ';'-separated parts of the ID line.
# idfield.keys= dataclass; molecule; division; sequencelength
idfield.keys= docclass; molecule; sequencelength
# ID   ENTRY_NAME DATA_CLASS; MOLECULE_TYPE; SEQUENCE_LENGTH. == swissprot
# ID   entryname  dataclass; molecule; division; sequencelength BP. == embl


# Open question: is the best docid ID or AC?
## EMBL parser gets docid from field.docid=ID
# These fields override the default type with Text; they include all of
# the fields listed in storefields.
fieldtype.docid=Text
fieldtype.ID=Text
fieldtype.AC=Text
fieldtype.DE=Text
fieldtype.GN=Text
fieldtype.OS=Text

# swissprot IDs have '_' at least
tokenizer.docid=org.eugenes.index.biodata.DataTokenizer

## these must always be stored; Text or UnIndexed
#field.docclass=type
#fieldtype.docclass=Text

# Numeric handling for the sequencelength part of the ID line.
tokenfilter.sequencelength=BioIndexers$NumberFilter

# DT (date) lines: stored as Text, with date-specific tokenizer and filter.
fieldtype.DT=Text
tokenizer.DT=BioIndexers$DateTokens
tokenfilter.DT=BioIndexers$DateFilter

# RX (reference cross-reference) lines use the dbxref tokenizer.
tokenizer.RX=BioIndexers$dbxrefTokens
#fieldtype.RX=Text

# DR (database cross-reference) lines use the dbxref tokenizer.
tokenizer.DR=BioIndexers$dbxrefTokens
#fieldtype.DR=Text

tokenizer.url=org.eugenes.index.biodata.DataTokenizer

fieldtype.url=Text
fieldtype.modified=Keyword


# Example SQ line:
# SQ   SEQUENCE   262 AA;  28969 MW;  DA87363A0D92BAF4 CRC64;
# The recoder handles the SQ line; its AA and MW sub-fields get numeric
# filters (NOTE(review): presumably the recoder creates SQ.AA / SQ.MW -- confirm).
fieldrecoder.SQ=BioIndexers$Swiss_SQ_FieldRecoder
tokenfilter.SQ.AA=BioIndexers$NumberFilter
tokenfilter.SQ.MW=BioIndexers$NumberFilter

## debug filter off
# tokenfilter.CC=org.eugenes.index.biodata.DataFilter
# tokenfilter.RA=org.eugenes.index.biodata.DataFilter
# tokenfilter.RT=org.eugenes.index.biodata.DataFilter
# tokenfilter.OC=org.eugenes.index.biodata.DataFilter