# dbs/lucegene/uniprot.properties
# index swissprot/uniprot ...
# http://www.pir.uniprot.org/support/docs/rel_notes/relnote3.0.html
#
# Database -- Entries
# -------------------------------------------------------------
# UniProt -- 1,612,609 (UniProt/Swiss-Prot 45.0: 163,235; UniProt/TrEMBL 28.0: 1,449,374)
## --------- library identity and locations -------------------
# Short library name and the human-readable title shown for this data set.
LIB_NAME = uniprot
title = UniProt sequences
# Data and index locations. Absolute paths may be used here; the relative
# values below assume ARGOS_SERVICE_ROOT is set.
# NOTE(review): presumably resolved relative to ARGOS_SERVICE_ROOT - confirm.
DATA_ROOT=web/data/uniprot/
INDEX_PATH=indices/lucene/uniprot/
# MIME type of the records. A text/* type is needed to get browser output via
# lookup (open question from the original author: is that acceptable?).
# Alternative considered: biosequence/embl;text
# If this value changes, batchformat (search/report keys below) needs the same change.
MIME_TYPE=text/embl
## --------- search/report keys -------------------
# Record lookup page: docurl ends in "id=" so a value can be appended to it;
# batch.forward names the same page for batch requests.
# NOTE(review): presumably the record's docid is what gets appended - confirm.
docurl=lookup.jsp?id=
batch.forward=lookup.jsp
# batchformat is a single MIME type; batchformats is a comma-separated list of
# MIME types. batchformat uses the same value as MIME_TYPE.
# NOTE(review): presumably default vs. selectable batch output formats - confirm.
batchformat = text/embl
batchformats = text/embl, text/xml, text/csv, text/tsv
# Display label for the text/embl format.
title.text/embl=EMBL Sequence
# Fields to display when xml chosen on search result page.
outfields= GN, DE, AC, OS
# Comma-separated field list; same as outfields plus docid.
# NOTE(review): presumably the fields kept as stored values in the index - confirm.
storefields= docid, GN, DE, AC, OS
# allow batch fetch of headlinefields
batchheadlines=1
# Comma-separated field list for batch output; adds docid and contents to outfields.
batch.outfields= docid, GN, DE, AC, OS, contents
# title.<code> = display label for each two-letter line code of the flat-file record.
title.ID = ID
title.AC = accession
title.SV = seq_version
title.NI = nuc_identifier
title.DT = date
title.DE = description
title.KW = keyword
title.OS = species
title.OC = classification
title.OG = organelle
title.RN = ref_number
title.RC = ref_comment
title.RP = ref_positions
title.RX = ref_crossref
title.RA = ref_author
title.RT = ref_title
title.RL = ref_location
title.DR = database_crossref
title.FH = feature_header
title.FT = feature_table
title.CC = comments
title.GN = gene
title.SQ = sequence
## TODO: add FT.xxxx fields
# Line codes that have no title entry above:
# XX - spacer line
# bb - (blanks) sequence data
# // - termination line
selectfields = false
# Display label for the "contents" field; searchskip.contents=1 is set for the
# same field.
# NOTE(review): presumably this excludes "contents" from searching - confirm.
title.contents=EMBL Sequence
searchskip.contents=1
# Disabled cross-link definitions, kept for reference:
# linkto=\
# db_xref>fban-ID\n\
# docid>blasttab-QueryID\n
# header.<format> / footer.<format> values for the native and xml formats.
# header.native is empty; the others are "\n".
# NOTE(review): presumably text emitted before/after the output, with "\n"
# read as a newline escape by the properties loader - confirm.
header.native=
footer.native=\n
header.xml=\n
footer.xml=\n
# Search field settings, both set to "all".
searchfield = all
searchallfield = all
# Result layout name.
format=table
## ---- indexing values -----------
# Regular expressions selecting which files and folders are indexed or skipped.
# An empty value means no pattern is set for that key.
# NOTE(review): if this file is loaded as Java properties, "\\w" is read as "\w",
# so regex_file is ^\w+.*$ (names starting with a word character) - confirm.
# NOTE(review): regex_skipfolder=(.*) matches every folder name, so presumably
# all subfolders are skipped - confirm this is intended.
regex_folder=
regex_file=^\\w+.*$
regex_skipfile=
regex_skipfolder=(.*)
# Indexer implementation class; the Readseq indexer is the disabled alternative.
# INDEX_CLASS=org.eugenes.index.LuceneReadseqIndexer
INDEX_CLASS=org.eugenes.index.LuceneEMBLIndexer
## append existing index or create new
INDEX_APPEND=false
## index names as values (as well as field names)?
INDEX_TAGS=false
## use fieldname xpath; full top.middle.last field name?
INDEX_XPATH=false
## INDEX_LEVEL=0 means index main records one level below xml tag
INDEX_LEVEL=0
# NOTE(review): presumably controls whether blank/empty values are indexed - confirm.
INDEX_BLANKS=false
## IndexWriter opts ; merge=10 is default; 4 == less mem usage ; 2 minimum
merge_factor=6
# NOTE(review): units and exact meaning of the two limits below are not stated
# in this file - confirm against the indexer.
max_field_length=1000000
MAX_FIELDS=10000
# Default analyzer and tokenizer classes; disabled alternatives and debug
# filters are kept below for reference.
analyzer=org.eugenes.index.BiodataAnalyzer2
tokenizer=BioIndexers$LowerWordTokenizer
#tokenizer=BioIndexers$LowerDataTokenizer
# tokenfilter=BioIndexers$DebugFilter
# tokenfilter.EOR=BioIndexers$DebugEndOfRecordFilter
# indexall: create a "contents" field of all text (== defline for fasta).
# "contents" has become the full sequence record - experimental; test whether
# it is useful, i.e. whether the source files can then be dropped.
indexall=false
# Disabled settings for the "contents" field:
# fieldtype.contents=UnIndexed
# tokenizer.contents=org.eugenes.index.biodata.DataTokenizer
# # for retrieval, native format
# contents.native=contents
## field indexing parameters
# Keys in this group have the form <setting>.<field>; the bare key (for example
# "fieldtype" with no suffix) carries the default value.
# NOTE(review): fallback to the bare key for unlisted fields is inferred from
# the "default" note below - confirm.
sumfields=docid
## Default field type. UnStored = index but don't store the text.
## NOTE(review): the original note read "default - Text // not UnStored", which
## may mean the built-in default is Text when this key is absent - confirm.
fieldtype=UnStored
## special EMBL field = '  ' (blank line code) = sequence data
fieldtype.sequence=skip
# Names given to the parts of the ID line. Earlier variant kept for reference:
# idfield.keys= dataclass; molecule; division; sequencelength
idfield.keys= docclass; molecule; sequencelength
# ID line layouts:
# ID ENTRY_NAME DATA_CLASS; MOLECULE_TYPE; SEQUENCE_LENGTH. == swissprot
# ID entryname dataclass; molecule; division; sequencelength BP. == embl
# Open question: is the best docid ID or AC?
## EMBL parser gets docid from field.docid=ID
# Fields overridden to type Text. These are docid, ID and the fields listed
# in storefields.
fieldtype.docid=Text
fieldtype.ID=Text
fieldtype.AC=Text
fieldtype.DE=Text
fieldtype.GN=Text
fieldtype.OS=Text
# swissprot IDs have '_' at least, so docid gets its own tokenizer
tokenizer.docid=org.eugenes.index.biodata.DataTokenizer
## these must always be stored; Text or UnIndexed
#field.docclass=type
#fieldtype.docclass=Text
# Token filter for the sequencelength part of the ID line.
tokenfilter.sequencelength=BioIndexers$NumberFilter
# DT (date) lines: stored as Text with a date-specific tokenizer and filter.
fieldtype.DT=Text
tokenizer.DT=BioIndexers$DateTokens
tokenfilter.DT=BioIndexers$DateFilter
# RX and DR (cross-reference) lines share the dbxref tokenizer; the Text
# overrides are disabled.
tokenizer.RX=BioIndexers$dbxrefTokens
#fieldtype.RX=Text
tokenizer.DR=BioIndexers$dbxrefTokens
#fieldtype.DR=Text
# Settings for the url and modified fields.
# NOTE(review): these are not EMBL line codes; presumably added by the indexer - confirm.
tokenizer.url=org.eugenes.index.biodata.DataTokenizer
fieldtype.url=Text
fieldtype.modified=Keyword
# Example SQ line:
# SQ SEQUENCE 262 AA; 28969 MW; DA87363A0D92BAF4 CRC64;
# NOTE(review): presumably the recoder splits the SQ line into the SQ.AA and
# SQ.MW sub-fields filtered below - confirm.
fieldrecoder.SQ=BioIndexers$Swiss_SQ_FieldRecoder
tokenfilter.SQ.AA=BioIndexers$NumberFilter
tokenfilter.SQ.MW=BioIndexers$NumberFilter
## debug filter off
# tokenfilter.CC=org.eugenes.index.biodata.DataFilter
# tokenfilter.RA=org.eugenes.index.biodata.DataFilter
# tokenfilter.RT=org.eugenes.index.biodata.DataFilter
# tokenfilter.OC=org.eugenes.index.biodata.DataFilter