# Index/search configuration for the "geo" library
# (NCBI Gene Expression Omnibus, GDS dataset files in SOFT format).
#
# Layout: one key=value pair per line; lines starting with '#' are comments.
# NOTE(review): backslashes in the regex values are doubled, presumably because
# the loader unescapes them (java.util.Properties style) -- confirm before
# editing any pattern.

LIB_NAME=geo
title = NCBI Gene Expression Omnibus

# can use absolute paths here, or assume ARGOS_SERVICE_ROOT is set
DATA_ROOT=web/data/extdb/geo/
INDEX_PATH=indices/lucene/geo/
MIME_TYPE=text/table

## --------- search/report keys -------------------
# Earlier docurl candidates, kept for reference:
# docurl=lookup.jsp?id=
# id = GDS732 ; the id in the gds_browse url below must be given without the GDS prefix
# docurl=http://www.ncbi.nlm.nih.gov/geo/gds/gds_browse.cgi?gds=732
docurl=http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gds&cmd=search&term=
batch.forward=lookup.jsp
linkto= DBX>fbgn-docid
linkto.fbgn=DBX>docid

## ---- indexing values -------------------------
regex_folder=^\\w.+$
# regex_file=^\\w.+\\.(tsv|csv|txt)$
regex_file=^\\w.+\\.(soft)$
regex_skipfile=.*\\.old
# no subdirs
regex_skipfolder=(.*)

# INDEX_CLASS=org.eugenes.index.LuceneTableIndexer
INDEX_CLASS=org.eugenes.index.LuceneBaseIndexer
analyzer=org.eugenes.index.BiodataAnalyzer2
tokenizer=fbacode$LowerWordTokenizer
tokenfilter=fbacode$DebugFilter
tokenfilter.EOR=fbacode$DebugEndOfRecordFilter

# fieldnames_firstline=false
# fieldnames_lastcomment=true
# fieldnames_firstline=false
# fieldnames=field1 field2 field3 field4

# regex_comment=^\\s*[!#]
# ? no comment lines -- the pattern is deliberately left empty
regex_comment =

## GSM files have header key=value; then tabbed table; need two patterns;
# regex_keyval = ^\\W?(\\w+)\\s*[\\W]\\s*(.*)$
# regex_keyval = ^[!#\\^](\\S+)\\s*[=\\t]\\s*(.*)$
regex_keyval = ^[!#\\^](GSM|\\w\\S+)\\S*\\s*[=\\t]\\s*(.*)$
#regex_continue = ^\\s*(.*)$
regex_continue = ^[^!#\\^](.*)$
# each file is 1 record ? -- the end-of-record pattern is left empty
regex_endrec =

fieldalias.GSM=sample_data
tokenfilter.sample_data=org.eugenes.index.biodata.DataFilter

# ---- excerpt of a GDS .soft input file, for reference ----
# ^database
# !database_name = Gene Expression Omnibus (GEO)
# !database_institute = NCBI NLM NIH
# !database_web_link = http://www.ncbi.nlm.nih.gov/geo
# !database_email = geo@ncbi.nlm.nih.gov
# !database_ref = Nucleic Acids Res. 2002 Jan 1;30(1):207-10
# ^dataset = GDS196    <<< docid value
# !dataset_completeness = all
# !dataset_description = Gene expression in Drosophila (third instar wandering larvae) eye p
# ...
# ^dataset = GDS196
# #ID_REF = Platform reference identifier
# #IDENTIFIER = probe identifier
# #GSM3698 = Value for GSM3698: Eye Discs iso4 lmC44; src: Drosophila wandering larvae eye i
# ...
# ID_REF IDENTIFIER GSM3698 GSM3699 GSM3706 GSM3707 GSM3708 GSM3700 GSM3701 GSM3702 GSM3703 GSM3704 GSM3705 GSM3709
# AFFX-MurIL2_at M16762 -56.2 -29.8 20.6 -8.2 -36.4 29 -49.7 -46.3 -70.5 -7.6 -42.4 -68.4
# AFFX-MurIL10_at M37897 45.9 -10.3 -1.8 -16.8 12.6 40.9 79.3 5 79
#
# Field-mapping notes:
# dataset_organism: == species
# subset_type = development stage
# ^ docclass ?

## append existing index or create new
INDEX_APPEND=false
## index names as values (as well as field names)?
INDEX_TAGS=false
## index values ?
INDEX_ATTRIBUTES=false
## use fieldname xpath; full top.middle.last field name?
INDEX_XPATH=false
## INDEX_LEVEL=0 means index main records one level below xml tag
INDEX_LEVEL=0
INDEX_BLANKS=false

merge_factor=10
max_field_length=1000000
## memory crash cure:
MAX_FIELDS=50000

# to create "contents" field of all text
indexall=false

## field indexing parameters
# special summary fields -- replace w/ fieldalias.TAG=newtag
sumfields=docid,docclass,title,summary
field.docid=dataset
field.docclass=subset_type
field.title=dataset_title
field.summary=dataset_description

# this one puts FBxxnnnn ids into DBX: field
fieldrecoder=fbacode$FBID_Recoder
tokenfilter.DBX=org.eugenes.index.biodata.DataFilter,fbacode$DebugFilter
fieldtype.DBX=Text

## default - UnStored = index but don't store text
fieldtype=UnStored
fieldtype.docid=Text
fieldtype.docclass=Text
fieldtype.title=Text
fieldtype.dataset_title=Text
fieldtype.dataset_description=Text
fieldtype.contents=UnStored
fieldtype.summary=UnIndexed
fieldtype.ancestors=ignore

# Templates for the fieldtype values used in this file:
#fieldtype.XXX=Text
#fieldtype.XXX=Keyword
#fieldtype.XXX=UnIndexed
#fieldtype.XXX=UnStored
#fieldtype.XXX=ignore

#---- data source -----
# ftp://ftp.ncbi.nih.gov/pub/geo/data/gds/soft_gz/
# csh snippet used to fetch the dataset files:
# set gs=(GDS1 GDS191 GDS192 GDS195 GDS196 GDS209 GDS210 GDS211 GDS212 GDS23 GDS24)
# set gs=(GDS374 GDS375 GDS376 GDS377 GDS438 GDS439 GDS440 GDS441 GDS443 GDS444)
# set gs=(GDS445 GDS483 GDS490 GDS516 GDS582 GDS602 GDS653 GDS664 GDS667 GDS732)
#
# foreach g ($gs)
#   curl --disable-epsv -R -O ftp://ftp.ncbi.nih.gov/pub/geo/data/gds/soft_gz/$g.soft.gz
# end