# dbs/lucegene/blasttab.properties
# d.gilbert, jan 05
# NCBI BLAST output table -m 8,9

# BLASTX 2.2.10 [Oct-19-2004]
# Query: contig_0
# Database: dmel-blast/dmel-translation
# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, 
#  q. start, q. end, s. start, s. end, e-value, bit score


## --------- library identity and data locations ---------
# LIB_NAME matches this file's base name (blasttab.properties) and the last
# path component of INDEX_PATH below.
LIB_NAME=blasttab
title = BLAST output table

# dmel-dvir-tblastn.xml
# NOTE(review): the .xml name above looks like a leftover example; regex_file
# in this file selects *.tsv data files -- confirm and remove if stale.
# Both paths are relative; presumably resolved against the application root
# -- confirm.
DATA_ROOT=web/data/blast/
INDEX_PATH=indices/lucene/blasttab/

# NOTE(review): the IANA-registered media type for tab-separated data is
# text/tab-separated-values; confirm that whatever consumes MIME_TYPE really
# expects the short form text/tsv.
MIME_TYPE=text/tsv
# text/table ? tsv ?
# text/table ? tsv ?

## --------- search/report keys -------------------

# Field named for searching.  NOTE(review): indexall=false further down in this
# file says the all-text "contents" field is not created -- confirm that a
# field called "all" is actually available for this library.
searchfield=all

# Column lists, TAB-separated (keep the literal tab characters).  Both lists
# are identical to header.native and fieldnames in this file: the twelve
# columns of the BLAST -m 8/-m 9 table, in file order.
# Presumably outfields = columns reported, storefields = columns stored in the
# index -- confirm against the indexer/search code.
outfields=QueryID	SubjectID	PctIdent	AlignLen	Mismatch	Gaps	Qstart	Qend	Sstart	Send	Evalue	Bitscore
storefields=QueryID	SubjectID	PctIdent	AlignLen	Mismatch	Gaps	Qstart	Qend	Sstart	Send	Evalue	Bitscore

# Document lookup targets.  docurl ends with "id=", so presumably a document id
# is appended by the caller -- confirm against lookup.jsp.
docurl=lookup.jsp?id=
batch.forward=lookup.jsp

# Cross-library link list.  The trailing backslash continues the value on the
# next line, so do NOT insert a comment or blank line between the two lines
# below.  "\n" inside the value is an escape for a newline character.
# NOTE(review): entries look like FIELD>target, one per "\n"-terminated line
# -- confirm the exact format expected by the reader.
linkto=\
  QueryID>seqs-docid\n

## for xslt to produce the result page tables
# resultxsl = conf/blast_result.xsl
# resultspage = resultxsl.jsp

# Header/footer for "native" output.  The header is the TAB-separated column
# name row (same twelve names as fieldnames in this file); the footer is empty.
header.native=QueryID	SubjectID	PctIdent	AlignLen	Mismatch	Gaps	Qstart	Qend	Sstart	Send	Evalue	Bitscore
footer.native=

## difference in header. footer. is for native file data versus lucegene index fields
# Header/footer for XML output: an XML declaration plus an opening <LuceGene>
# element, closed by the footer.  "\n" is an escape for a newline character.
header.xml=<?xml version="1.0"?>\n<LuceGene>
footer.xml=\n</LuceGene>

# ---- indexing values -----------------------

# locate data with regex file, folder patterns
# Backslashes in regex values are doubled, as in regex_comment and regex_keyval
# in this file: the properties loader consumes one level of escaping, so "\\."
# reaches the regex engine as "\." (a literal dot).  A single "\." would be
# loaded as "." and match ANY character in front of "tsv".
regex_folder=
regex_file=^.*blast.*\\.tsv$

regex_skipfile=
# NOTE(review): ".*" matches every folder name; presumably this keeps the
# indexer from descending into any sub-folder of DATA_ROOT -- confirm.
regex_skipfolder=.*
# alternative: skip only temporary / old folders
# (tmp|.*\\.old)


# Indexer implementation used for this library (a table indexer, per its name).
INDEX_CLASS=org.eugenes.index.LuceneTableIndexer

# Column names are given explicitly by "fieldnames" below; both options for
# picking names up from the data file are switched off.
# NOTE(review): presumably fieldnames_lastcomment would read names from the
# last comment line before the data (e.g. the "# Fields:" line of BLAST -m 9
# output) and fieldnames_firstline from the first data line -- confirm.
fieldnames_lastcomment=false
fieldnames_firstline=false
#fieldnames=chr start feature gene map range id db_xref notes
# TAB-separated; same twelve names, in the same order, as outfields/storefields.
fieldnames=QueryID	SubjectID	PctIdent	AlignLen	Mismatch	Gaps	Qstart	Qend	Sstart	Send	Evalue	Bitscore


# Pattern for comment lines in the data: optional leading whitespace followed by
# "!" or "#".  Backslashes are doubled in this file; assuming it is loaded with
# java.util.Properties, "\\s" reaches the regex engine as "\s".
regex_comment=^\\s*[!#]

## table field separator regex
# Exactly one regex_keyval should be active; the alternatives are kept
# commented out for reference.  Group 1 captures one field value.

## any whitespace
# regex_keyval=\\s*(\\S+)

## tabs only
# Active setting: skip any leading tabs, then capture a run of non-tab
# characters.  NOTE(review): "\\t*" followed by "[^\\t]+" cannot match an empty
# field between two adjacent tabs, so empty columns would presumably shift the
# following values left -- confirm (BLAST -m 8 rows normally have no empty
# columns).
regex_keyval=\\t*([^\\t]+)
## commas
# regex_keyval=,*([^,]+)
## commas, optional quotes
# regex_keyval=,*([^,]+)|,*"([^"]+)"


## append existing index or create new
# false: presumably the index at INDEX_PATH is rebuilt on each run -- confirm.
INDEX_APPEND=false
# NOTE(review): the next three options are described in terms of XML tags and
# attributes; presumably they have no effect on tabular input -- confirm.
## index <tag> names as values (as well as field names)?
INDEX_TAGS=false
## index <tag attr="val"> values ?
INDEX_ATTRIBUTES=false
## use fieldname xpath; full top.middle.last field name?
INDEX_XPATH=false

## INDEX_LEVEL=0 means index main records one level below xml <top> tag
INDEX_LEVEL=0
# NOTE(review): undocumented; presumably controls whether empty values are
# indexed -- confirm.
INDEX_BLANKS=false

## IndexWriter opts
## merge=10 is default; 4 == less mem usage; 2 minimum
merge_factor=6
## max_field_length is max # terms/field
max_field_length=1000000
# NOTE(review): undocumented; presumably an upper bound on the number of
# fields -- confirm what is being limited (per record or per index).
MAX_FIELDS=50000

# to create "contents" field of all text
# false: no such field is created for this library.
indexall=false

## field indexing parameters
## sumfields list needs to match field.xxx common summary fields
# Only "docid" is listed here, and only field.docid is mapped below.
sumfields=docid
# special summary fields -- replace w/ fieldalias.TAG=newtag
# field.docid=ID
# field.docclass=CLA
# field.title=RETE
# field.summary=GeneSummary.SUMMARY.text

## default - Text or UnStored = index but don't store text
# fieldtype is the default for every column without its own fieldtype.NAME.
fieldtype=Text
# The summary field "docid" is taken from the QueryID column.
field.docid=QueryID
# docid = SubjectID or QueryID ??
# NOTE(review): one query normally has many hit rows, so QueryID is presumably
# not unique per indexed row -- confirm that a non-unique docid is acceptable.

# Summary.text
# fieldtype.text=UnStored

## these must always be stored; Text or UnIndexed
## Keyword is problem as Search wants to lc() all 1st, Keyword is casefull
# NOTE(review): fieldtype.modified below is nevertheless set to Keyword --
# confirm that case-sensitive matching is intended for that field.
fieldtype.docid=Text
fieldtype.docclass=Text
fieldtype.url=UnIndexed
fieldtype.modified=Keyword
fieldtype.title=UnIndexed
fieldtype.summary=UnIndexed

# Analyzer class used for this library.
analyzer=org.eugenes.index.BiodataAnalyzer2

# all field defaults
# "$" separates an outer class from its nested class in these class names.
# NOTE(review): the default token filter and the EOR filter are Debug* classes;
# presumably development settings -- confirm before using in production.
tokenizer=org.eugenes.index.BiodataFilters$LowerDataTokenizer
tokenfilter=fbacode$DebugFilter
tokenfilter.EOR=fbacode$DebugEndOfRecordFilter

fieldrecoder=fbacode$FBID_Recoder

# query,subject are cv/symbol terms; rest are numbers (some floats)
# this won't accept floats .. e.g. eval
# Per-column filters (comma-separated list of classes).  QueryID, SubjectID and
# Evalue have no entry here, so presumably they use the default tokenfilter.
# NOTE(review): BLAST reports PctIdent with decimals and Bitscore may be
# fractional; given the float caveat above, confirm NumberFilter handles them.
# NOTE(review): DebugFilter is chained on some columns but not others --
# presumably leftover debugging; confirm.
tokenfilter.Bitscore=fbacode$NumberFilter,fbacode$DebugFilter
tokenfilter.Send=fbacode$NumberFilter
tokenfilter.Sstart=fbacode$NumberFilter,fbacode$DebugFilter
tokenfilter.Qend=fbacode$NumberFilter
tokenfilter.Qstart=fbacode$NumberFilter,fbacode$DebugFilter
tokenfilter.Gaps=fbacode$NumberFilter
tokenfilter.Mismatch=fbacode$NumberFilter
tokenfilter.AlignLen=fbacode$NumberFilter,fbacode$DebugFilter
tokenfilter.PctIdent=fbacode$NumberFilter,fbacode$DebugFilter


#----- fix blast.xml output ----
# perlfix= \
# #!/usr/bin/perl
# # slimblastxml.pl -- cut verbosity down for NCBI BLAST -m 7 xml output
# print "<?xml version=\"1.0\"?>\n<BlastOutputs>\n";  
# while(<>){
#    next if (m,^<\?xml, || m,^<\!DOCTYPE,);
#    if (m,<(BlastOutput_reference|BlastOutput_param|Iteration_stat),) { $skipto= $1; }
#    if ($skipto) {
#     $skipto='' if (m,</$skipto>,);
#     next; 
#     }
#   print;
# }
# print "</BlastOutputs>\n"; # trailer