# dbs/lucegene/seqs.properties
# Index fasta seq files for retrieval by ids/...
#
## FlyBase deflines, r3.2.0, look like:
# >FBgn0031208 type=gene; loc=2L:7529..9491; ID=FBgn0031208; name=CG11023; map=21A2-21A
# 2; db_xref='CG11023,FlyBase:FBan0011023'; len=11963
## dbxref='CG11023,FlyBase:FBan0011023' needs a special regex splitter for '',
##   drop [:] from regex_fasta_keyval
##   add fieldrecoder.loc=chr,start,end parser

LIB_NAME=seqs
title=FlyBase Sequences

# Absolute paths can be used here; otherwise ARGOS_SERVICE_ROOT is assumed to be set.
#DATA_ROOT=web/data/genomes/Drosophila_melanogaster/current/fasta/
DATA_ROOT=web/data/genomes/
INDEX_PATH=indices/lucene/seqs/
MIME_TYPE=text/fasta

## --------- search/report keys -------------------
# this works
docurl=lookup.jsp?id=

# Earlier batch settings, kept for reference:
# batchurl=/cgi-bin/gnoseqbatch?idlist=
# batchproc=cgi-bin/gnoseqbatch idlist=
# ^^ this is really a call back to lucene seq db; via perl cgi
# batch.idfield=docid; url; id field works also for seqs
# batchurl=lookup.jsp?field=url&list={idlist}
#batch.url=lookup.jsp
batch.forward=lookup.jsp

#? text/plain or text/fasta ? was biosequence/fasta
batchformat=text/fasta
batchformats=text/fasta, text/xml, text/csv, text/tsv
title.text/fasta=FastA Sequence
#title.text/plain=Document Text

# Allow batch fetch of headlinefields.
batchheadlines=1
batch.outfields=docid, type, name, loc.chr, loc.start, len, contents

# Display titles for fields.
title.loc.chr=Chromosome
title.loc.start=Seq start
title.loc.stop=Seq end
title.loc.strand=Strand
title.len=Length
title.type=Feature type
title.name=Symbol
title.db_xref=Database xref

selectfields=false

# Fields to display when xml chosen on search result page.
#outfields= docid, type, name, loc.chr, loc.start, len, contents
outfields=name, type, loc.chr, loc.start, len
storefields=docid, type, name, loc.chr, loc.start, len

title.contents=Fasta Sequence
searchskip.contents=1

# One "field>target" pair per \n-terminated entry; the trailing backslash
# continues the value onto the next physical line.
linkto=\
 db_xref>fban-ID\n\
 docid>blasttab-QueryID\n

header.native=
footer.native=\n
header.xml=\n
footer.xml=\n

# Default search field (if one not chosen from list on web).
searchfield=all
searchallfield=all
format=table

## ---- indexing values; NOTE: need these for proper (cased) searches
## look for all fasta in genomes/ folder?
## but need to be sure headers have enough distinguishing fields, like species name

# Locate data with regex file, folder patterns.
regex_folder=
#regex_file=^dmel-(all|sample).+\.(fasta|fa)$
## Backslashes in regex values are doubled, on the assumption that this file is
## loaded with java.util.Properties (as the \\w here, and the \n escapes and line
## continuations in linkto, suggest). A single backslash before '.' would be
## dropped on load, leaving a bare '.' that matches any character, so the
## extension dot is written with a doubled backslash like the other escapes.
regex_file=^\\w+-(all|sample).+\\.(fasta|fa)$

## skip extended till type=gene gets changed to gene_extended
## current file set has _mRNA_ and _transcript_; both type=mRNA; drop former
## the _all_ and per-csome files overlap - which to drop? all?
regex_skipfile=^.*(syntenic_region|intergenic|CDS|chromosome|extended|mRNA).*$
#regex_skipfolder=(.*)
regex_skipfolder=(more.*)

INDEX_CLASS=org.eugenes.index.LuceneFastaIndexer
# INDEX_CLASS=org.eugenes.index.LuceneReadseqIndexer

## append existing index or create new
INDEX_APPEND=false
## index names as values (as well as field names)?
INDEX_TAGS=false
## use fieldname xpath; full top.middle.last field name?
INDEX_XPATH=false
## INDEX_LEVEL=0 means index main records one level below xml tag
INDEX_LEVEL=0
INDEX_BLANKS=false

## IndexWriter opts
## merge=10 is default; 4 == less mem usage; 2 minimum
merge_factor=6
max_field_length=1000000
MAX_FIELDS=10000

## These fasta defline index regexes can be set.
# regex_fasta_id=^>(\S+)
# regex_fasta_vals=[;,\|\s]+([^;,\|\s]+)
## vals need to get db_xref='CG00000,FlyBase:FBgn00000'
## NOTE(review): the quoted branch '([^'])' accepts exactly one character between
## the quotes; if multi-character quoted values are meant to match that branch, a
## quantifier may be missing - confirm against the indexer before changing.
regex_fasta_vals=[;|\\s]+('([^'])'|([^;|\\s]+))
## include odd '/gene=xxxx' ; should drop / from deflines
regex_fasta_keyval=^/?(\\w+)[=](.*)$
# regex_fasta_ncbi=^>(\w+)\|([^|\s]+)\|?([^|\s]*)\|?([^|\s]*)\|?([^|\s]*)\|?([^|\s]*)
## make this empty for speedier index checks
regex_fasta_ncbi=

# Analyzer, default tokenizer and token filters.
analyzer=org.eugenes.index.BiodataAnalyzer2
tokenizer=org.eugenes.index.BiodataFilters$LowerDataTokenizer
tokenfilter=fbacode$DebugFilter
tokenfilter.EOR=fbacode$DebugEndOfRecordFilter

# to create "contents" field of all text == defline for fasta
# contents has become full sequence record - test, see if useful; i.e.
# can drop files
# indexall=true
# fieldtype.contents=UnIndexed
# tokenizer.contents=org.eugenes.index.biodata.DataTokenizer
#
# for retrieval, native format
# contents.native=contents
indexall=false
fieldtype.contents=skip

## field indexing parameters
## sumfields list needs to match field.xxx common summary fields
sumfields=docid,docclass,title
# special summary fields -- replace w/ fieldalias.TAG=newtag

## default - Text // not UnStored = index but dont store text
fieldtype=Text

field.docid=accession
fieldtype.docid=Text
## NOTE(review): neighbouring entries pair field.X=<tag> with fieldtype.X=<type>;
## "field.accession=Text" may have been meant as fieldtype.accession - confirm.
field.accession=Text

## these must always be stored; Text or UnIndexed
field.docclass=type
fieldtype.docclass=Text
fieldtype.url=Text
fieldtype.modified=Keyword

## field filters
## these are only used if fieldtype=Text or UnStored
tokenfilter.DT=org.eugenes.index.BiodataFilters$DateFilter
tokenfilter.len=org.eugenes.index.BiodataFilters$NumberFilter
tokenfilter.loc.start=org.eugenes.index.BiodataFilters$NumberFilter
tokenfilter.loc.stop=org.eugenes.index.BiodataFilters$NumberFilter

## field tokenizers - work before Filters
## default lucene StandardTokenizer breaks on all symbols ..
## DataTokenizer breaks tokens on whitespace, not symbols
tokenizer.dbxref=org.eugenes.index.BiodataFilters$LowerWordTokenizer
tokenizer.db_xref=org.eugenes.index.BiodataFilters$LowerWordTokenizer

## fieldrecoder recodes data, possibly making new index fields
## this one parses loc into .chr, numeric .start, .stop, .strand
fieldrecoder.loc=LucegeneIndexers$Location_FieldRecoder
## this one is temp fix to add FBan ID when missing but have CG/CR id
fieldrecoder.db_xref=LucegeneIndexers$SeqDbxref_FieldRecoder
fieldrecoder.dbxref=LucegeneIndexers$SeqDbxref_FieldRecoder