# dbs/lucegene/uniprot.properties
# index swissprot/uniprot ...
# http://www.pir.uniprot.org/support/docs/rel_notes/relnote3.0.html
#
# Database -- Entries
# -------------------------------------------------------------
# UniProt -- 1,612,609 (UniProt/Swiss-Prot 45.0: 163,235; UniProt/TrEMBL 28.0: 1,449,374)
#
# Layout: keep ONE key=value pair per physical line. A line that starts
# with '#' is a comment in its entirety, so a setting appended to a comment
# line is silently ignored (assumes a java.util.Properties-style loader).

LIB_NAME = uniprot
title = UniProt sequences

# can use absolute paths here, or assume ARGOS_SERVICE_ROOT is set
DATA_ROOT=web/data/uniprot/
INDEX_PATH=indices/lucene/uniprot/

# need text in this MIME to get browser output via lookup - ok?
# or biosequence/embl;text ; need also below batchformat
MIME_TYPE=text/embl

## --------- search/report keys -------------------
docurl=lookup.jsp?id=
batch.forward=lookup.jsp
batchformat = text/embl
batchformats = text/embl, text/xml, text/csv, text/tsv
title.text/embl=EMBL Sequence

# Fields to display when xml chosen on search result page.
outfields= GN, DE, AC, OS
storefields= docid, GN, DE, AC, OS

# allow batch fetch of headlinefields
batchheadlines=1
batch.outfields= docid, GN, DE, AC, OS, contents

# Display titles for the two-letter EMBL/Swiss-Prot line codes.
title.ID = ID
title.AC = accession
title.SV = seq_version
title.NI = nuc_identifier
title.DT = date
title.DE = description
title.KW = keyword
title.OS = species
title.OC = classification
title.OG = organelle
title.RN = ref_number
title.RC = ref_comment
title.RP = ref_positions
title.RX = ref_crossref
title.RA = ref_author
title.RT = ref_title
title.RL = ref_location
title.DR = database_crossref
title.FH = feature_header
title.FT = feature_table
title.CC = comments
title.GN = gene
title.SQ = sequence
## add FT.xxxx fields
# XX - spacer line
# bb - (blanks) sequence data
# // - termination line

selectfields = false

title.contents=EMBL Sequence
searchskip.contents=1

# linkto=\
#  db_xref>fban-ID\n\
#  docid>blasttab-QueryID\n

header.native=
footer.native=\n
header.xml=\n
footer.xml=\n

searchfield = all
searchallfield = all
format=table

## ---- indexing values -----------
regex_folder=
regex_file=^\\w+.*$
regex_skipfile=
# Each setting below sits on its own physical line; when they share a line,
# the first key absorbs the rest of the line as its value.
regex_skipfolder=(.*)

# Indexer implementation; the Readseq-based alternative is kept for reference.
# INDEX_CLASS=org.eugenes.index.LuceneReadseqIndexer
INDEX_CLASS=org.eugenes.index.LuceneEMBLIndexer

## append existing index or create new
INDEX_APPEND=false
## index names as values (as well as field names)?
INDEX_TAGS=false
## use fieldname xpath; full top.middle.last field name?
INDEX_XPATH=false
## INDEX_LEVEL=0 means index main records one level below xml tag
INDEX_LEVEL=0
INDEX_BLANKS=false

## IndexWriter opts ; merge=10 is default; 4 == less mem usage ; 2 minimum
merge_factor=6
max_field_length=1000000
MAX_FIELDS=10000

analyzer=org.eugenes.index.BiodataAnalyzer2
tokenizer=BioIndexers$LowerWordTokenizer
#tokenizer=BioIndexers$LowerDataTokenizer
# tokenfilter=BioIndexers$DebugFilter
# tokenfilter.EOR=BioIndexers$DebugEndOfRecordFilter

# to create "contents" field of all text == defline for fasta
# contents has become full sequence record - test, see if useful; i.e. can drop files
indexall=false
# fieldtype.contents=UnIndexed
# tokenizer.contents=org.eugenes.index.biodata.DataTokenizer
#
# for retrieval, native format
# contents.native=contents

## field indexing parameters
sumfields=docid

## default - Text // not UnStored = index but don't store text
fieldtype=UnStored

## special EMBL field = ' ' = sequence data
fieldtype.sequence=skip

# Names given to the parts of the ID line; the commented variant is the
# EMBL layout, the active one the Swiss-Prot layout (see examples below).
# idfield.keys= dataclass; molecule; division; sequencelength
idfield.keys= docclass; molecule; sequencelength
# ID ENTRY_NAME DATA_CLASS; MOLECULE_TYPE; SEQUENCE_LENGTH. == swissprot
# ID entryname dataclass; molecule; division; sequencelength BP. == embl

# is best docid = ID or AC ??
## EMBL parser gets docid from
# NOTE(review): restored as an active setting on its own line; confirm it was
# not meant to be part of the comment above.
field.docid=ID

# Fields that are both indexed and stored (override the file-wide default).
fieldtype.docid=Text
fieldtype.ID=Text
fieldtype.AC=Text
fieldtype.DE=Text
fieldtype.GN=Text
fieldtype.OS=Text

# swissprot IDs have '_' at least
tokenizer.docid=org.eugenes.index.biodata.DataTokenizer

## these must always be stored; Text or UnIndexed
#field.docclass=type
#fieldtype.docclass=Text

tokenfilter.sequencelength=BioIndexers$NumberFilter

fieldtype.DT=Text
tokenizer.DT=BioIndexers$DateTokens
tokenfilter.DT=BioIndexers$DateFilter

tokenizer.RX=BioIndexers$dbxrefTokens
#fieldtype.RX=Text
tokenizer.DR=BioIndexers$dbxrefTokens
#fieldtype.DR=Text

tokenizer.url=org.eugenes.index.biodata.DataTokenizer
fieldtype.url=Text
fieldtype.modified=Keyword

# Example SQ line the recoder below is applied to:
# SQ SEQUENCE 262 AA; 28969 MW; DA87363A0D92BAF4 CRC64;
fieldrecoder.SQ=BioIndexers$Swiss_SQ_FieldRecoder
tokenfilter.SQ.AA=BioIndexers$NumberFilter
tokenfilter.SQ.MW=BioIndexers$NumberFilter

## debug filter off
# tokenfilter.CC=org.eugenes.index.biodata.DataFilter
# tokenfilter.RA=org.eugenes.index.biodata.DataFilter
# tokenfilter.RT=org.eugenes.index.biodata.DataFilter
# tokenfilter.OC=org.eugenes.index.biodata.DataFilter