# dbs/lucegene/seqs.properties
# index fasta seq files for retrieval by ids/...
## flybase deflines, r3.2.0, look like:
# >FBgn0031208 type=gene; loc=2L:7529..9491; ID=FBgn0031208; name=CG11023; map=21A2-21A2; db_xref='CG11023,FlyBase:FBan0011023'; len=11963
## db_xref='CG11023,FlyBase:FBan0011023' needs a special regex splitter for the '...' quoting;
## drop [:] from regex_fasta_keyval
## add a fieldrecoder.loc parser for chr,start,end
## --------- library identity and locations -------------------
# Short library name and the title shown for it.
LIB_NAME=seqs
title=FlyBase Sequences
# Absolute paths are allowed here; the relative ones below assume
# ARGOS_SERVICE_ROOT is set.
# Narrower data root (one species, current release), currently disabled:
#DATA_ROOT=web/data/genomes/Drosophila_melanogaster/current/fasta/
DATA_ROOT=web/data/genomes/
INDEX_PATH=indices/lucene/seqs/
MIME_TYPE=text/fasta
## --------- search/report keys -------------------
# Single-document lookup URL prefix (this form works).
docurl=lookup.jsp?id=
# Earlier batch mechanisms, kept for reference. The gnoseqbatch ones were
# really a call back into the lucene seq db by way of a perl cgi.
# batchurl=/cgi-bin/gnoseqbatch?idlist=
# batchproc=cgi-bin/gnoseqbatch idlist=
# batch.idfield=docid; url; id field works also for seqs
# batchurl=lookup.jsp?field=url&list={idlist}
#batch.url=lookup.jsp
batch.forward=lookup.jsp
# Batch output format. Open question: text/plain or text/fasta?
# (this was biosequence/fasta before)
batchformat=text/fasta
batchformats=text/fasta, text/xml, text/csv, text/tsv
title.text/fasta=FastA Sequence
#title.text/plain=Document Text
# allow batch fetch of headlinefields
batchheadlines=1
# Field lists (comma-separated).
# batch.outfields: fields written in batch output.
batch.outfields=docid, type, name, loc.chr, loc.start, len, contents
# outfields: fields to display when xml is chosen on the search result page.
# Previous, longer list (disabled):
#outfields= docid, type, name, loc.chr, loc.start, len, contents
outfields=name, type, loc.chr, loc.start, len
storefields=docid, type, name, loc.chr, loc.start, len
# Display titles, keyed by field name.
title.type=Feature type
title.name=Symbol
title.len=Length
title.db_xref=Database xref
title.contents=Fasta Sequence
title.loc.chr=Chromosome
title.loc.start=Seq start
title.loc.stop=Seq end
title.loc.strand=Strand
selectfields=false
searchskip.contents=1
# Cross-links: one "field>target" entry per \n-terminated line; the trailing
# backslashes continue the value onto the next physical line.
# NOTE(review): target looks like <library>-<field>; confirm against the consumer.
linkto=\
db_xref>fban-ID\n\
docid>blasttab-QueryID\n
# Text emitted before/after records, per output format.
header.native=
footer.native=\n
header.xml=\n
footer.xml=\n
# Search field used when none is chosen from the list on the web page.
searchfield=all
searchallfield=all
format=table
## ---- indexing values; NOTE: need these for proper (cased) searches
## Data files are located by matching file and folder names against regexes.
## Open question: index all fasta in the genomes/ folder? That requires
## deflines with enough distinguishing fields, like species name.
## Escaping: a regex backslash must be written doubled (\\w, \\s, \\.) because
## properties-style loading consumes one level of backslashes; a single
## backslash before an ordinary character is silently dropped.
# folder-name pattern (left empty)
regex_folder=
# Earlier dmel-only file pattern (disabled):
#regex_file=^dmel-(all|sample).+\\.(fasta|fa)$
# Matches <word>-all... or <word>-sample... files ending in .fasta or .fa.
# The extension dot is written \\. so it is still a literal dot once loaded;
# written as \. it would load as a bare '.' and match any character.
regex_file=^\\w+-(all|sample).+\\.(fasta|fa)$
## Files to skip. Notes on the current file set:
##  - "extended" is skipped till its type=gene gets changed to gene_extended
##  - _mRNA_ and _transcript_ files are both type=mRNA; the former is dropped
##  - the _all_ and per-chromosome files overlap; which to drop is undecided
regex_skipfile=^.*(syntenic_region|intergenic|CDS|chromosome|extended|mRNA).*$
# Folders to skip; the skip-everything form is disabled.
#regex_skipfolder=(.*)
regex_skipfolder=(more.*)
## Indexer implementation class; the Readseq-based alternative is disabled.
INDEX_CLASS=org.eugenes.index.LuceneFastaIndexer
# INDEX_CLASS=org.eugenes.index.LuceneReadseqIndexer
## append to the existing index (true) or create a new one (false)
INDEX_APPEND=false
## index names as values (as well as field names)?
INDEX_TAGS=false
## use fieldname xpath, i.e. the full top.middle.last field name?
INDEX_XPATH=false
## INDEX_LEVEL=0 means index main records one level below xml tag
INDEX_LEVEL=0
INDEX_BLANKS=false
## IndexWriter opts
## merge=10 is default; 4 == less mem usage ; 2 minimum
merge_factor=6
## NOTE(review): presumably caps on indexed terms per field and on fields per
## record -- confirm against the indexer before changing.
max_field_length=1000000
MAX_FIELDS=10000
## Regexes used to index fasta deflines; each can be set here.
## Active values double every regex backslash (\\s, \\w) because loading
## consumes one level of backslashes. The commented-out lines show the plain
## regex form.
# regex_fasta_id=^>(\S+)
# regex_fasta_vals=[;,\|\s]+([^;,\|\s]+)
## vals need to get db_xref='CG00000,FlyBase:FBgn00000' in one piece, so ','
## is not a separator here.
## First alternative: a whole '...' quoted value of any length (possibly
## empty). It was '([^'])' before, which could only match a one-character
## quoted string. Second alternative: a run of non-separator characters.
regex_fasta_vals=[;|\\s]+('([^']*)'|([^;|\\s]+))
## include odd '/gene=xxxx' ; should drop / from deflines
regex_fasta_keyval=^/?(\\w+)[=](.*)$
# regex_fasta_ncbi=^>(\w+)\|([^|\s]+)\|?([^|\s]*)\|?([^|\s]*)\|?([^|\s]*)\|?([^|\s]*)
## make this empty for speedier index checks
regex_fasta_ncbi=
## Default analyzer, tokenizer and token filters.
analyzer=org.eugenes.index.BiodataAnalyzer2
tokenizer=org.eugenes.index.BiodataFilters$LowerDataTokenizer
## NOTE(review): both filters below are named Debug* and carry no package
## prefix, unlike the org.eugenes.index classes above -- confirm they are
## intended as the defaults and resolve on the indexer's classpath.
tokenfilter=fbacode$DebugFilter
tokenfilter.EOR=fbacode$DebugEndOfRecordFilter
# Disabled alternative: create a "contents" field of all text (== the defline
# for fasta). contents has become the full sequence record -- a test to see
# whether it is useful, i.e. whether the files can be dropped.
# indexall=true
# fieldtype.contents=UnIndexed
# tokenizer.contents=org.eugenes.index.biodata.DataTokenizer
# # for retrieval, native format
# contents.native=contents
# Active setting: no all-text field; the contents field is skipped.
indexall=false
fieldtype.contents=skip
## field indexing parameters
## sumfields list needs to match field.xxx common summary fields
## NOTE(review): docid and docclass have field.xxx entries below, but no
## field.title appears in this part of the file -- confirm that is intended.
sumfields=docid,docclass,title
# special summary fields -- replace w/ fieldalias.TAG=newtag
## default - Text // not UnStored = index but dont store text
fieldtype=Text
## field.NAME=SOURCE: presumably fills summary field NAME from SOURCE -- confirm
field.docid=accession
fieldtype.docid=Text
## NOTE(review): the value here is a field type, not a field name; this may
## have been meant as fieldtype.accession=Text -- confirm before changing.
field.accession=Text
## these must always be stored; Text or UnIndexed
field.docclass=type
fieldtype.docclass=Text
fieldtype.url=Text
fieldtype.modified=Keyword
## field filters, keyed as tokenfilter.FIELD
## these only are used if fieldtype=Text or UnStored
tokenfilter.DT=org.eugenes.index.BiodataFilters$DateFilter
## length and location coordinates all use the NumberFilter
tokenfilter.len=org.eugenes.index.BiodataFilters$NumberFilter
tokenfilter.loc.start=org.eugenes.index.BiodataFilters$NumberFilter
tokenfilter.loc.stop=org.eugenes.index.BiodataFilters$NumberFilter
## field tokenizers - work before Filters
## default lucene StandardTokenizer breaks on all symbols ..
## DataTokenizer breaks tokens on whitespace, not symbols
## both spellings of the xref field (dbxref, db_xref) get the same tokenizer
tokenizer.dbxref=org.eugenes.index.BiodataFilters$LowerWordTokenizer
tokenizer.db_xref=org.eugenes.index.BiodataFilters$LowerWordTokenizer
## fieldrecoder recodes data, possibly making new index fields
## this one parses loc into .chr, numeric .start, .stop, .strand
fieldrecoder.loc=LucegeneIndexers$Location_FieldRecoder
## this one is temp fix to add FBan ID when missing but have CG/CR id;
## registered for both spellings of the xref field
fieldrecoder.db_xref=LucegeneIndexers$SeqDbxref_FieldRecoder
fieldrecoder.dbxref=LucegeneIndexers$SeqDbxref_FieldRecoder