# dbs/lucegene/medline.properties
# d.gilbert, summer 2004
# Library identity: short library name and the title text for this library.
LIB_NAME=medline
title = Medline Abstracts (Drosophila)
# Data and index locations.
# can use absolute paths here, or assume ARGOS_SERVICE_ROOT is set
# (presumably the relative paths below are resolved against it -- TODO confirm)
DATA_ROOT=web/data/refs/
INDEX_PATH=indices/lucene/medline/
# MIME type associated with this library (presumably of the source data -- TODO confirm)
MIME_TYPE=text/xml
## --------- search/report keys -------------------
# bin/lucegene-search.sh -verbose=1 -lib medline
# Index fields [ , AbstractText, AccessionNumber, Acronym, Affiliation,
# Agency, ArticleTitle, CitationSubset, CollectiveName,
# CopyrightInformation, Country, DataBankName, Day, DescriptorName,
# FlyBaseID, ForeName, GeneralNote, GrantID, ISSN, Initials, Issue,
# Keyword, Language, LastName, MedlineDate, MedlineID, MedlinePgn,
# MedlineTA, Month, NameOfSubstance, NlmUniqueID, Note,
# NumberOfReferences, OtherID, PMID, PublicationType, QualifierName,
# RefSource, SpaceFlightMission, Suffix, VernacularTitle, Volume, Year,
# docclass, docid, url] Total documents=12960
# Search field settings; both keys name the field "all".
searchfield=all
searchallfield = all
# Fields to display when xml chosen on search result page.
outfields=LastName,ArticleTitle,Year,docclass,url
# storefields lists the outfields above plus docid and PMID
# (presumably the fields kept as stored values in the index -- TODO confirm).
storefields=docid,docclass,ArticleTitle,PMID,LastName,Year,url
#? diff in PMID and MedlineID ?
## XSLT stylesheet and JSP page used to produce the result page tables
resultxsl = conf/medline_result.xsl
resultspage = resultxsl.jsp
# reportxsl = conf/medline_report.xsl
# reporthtmlxsl=conf/medline_report.xsl
# NOTE(review): the value of linkto is continued over the following lines by
# the trailing backslashes. Because the "FlyBaseID>fbrf-docid" line also ends
# in a backslash, a java.util.Properties-style loader folds the
# "linkto.fbrf=..." line into the linkto value instead of defining a separate
# linkto.fbrf key. Confirm whether that is intended before changing it.
linkto=\
FlyBaseID>fbrf-docid\n\
linkto.fbrf=FlyBaseID>docid
## these are for search system; map generic field names to lib-specific
## field names (generic alias in the key, library field name(s) in the value)
searchfieldalias.title=ArticleTitle
searchfieldalias.year=Year
searchfieldalias.author=LastName
searchfieldalias.abstract=AbstractText
searchfieldalias.pubtype=docclass
# NOTE(review): GeneSymbol does not appear in the "Index fields" list in the
# comment near the top of this file -- confirm the field exists in this index.
searchfieldalias.symbol=GeneSymbol,docid
## difference in header. footer. is for native file data versus lucegene index fields
# NOTE(review): as written, every header./footer. value below consists only of
# newline escapes (\n), with trailing backslashes continuing the value onto the
# next line. Confirm that no markup text is missing from these values.
header.native=\n\
\n\
\n
footer.native=\n
header.xml-medline=\n\
\n
footer.xml-medline=\n
# outcharset to preserve special chars; test case FBrf0151864 author: Hombría
# outcharset=ISO-8859-1
outcharset=UTF-8
header.xml=\n
footer.xml=\n
# xsl = medline.xsl
# ? this returns flybase FBrf reports now ; change
# Batch output: batchformat names a single format, batchformats the full list.
batchformat = text/xml
batchformats = text/xml, text/xml-medline, text/plain, text/csv, text/tsv
# Titles for the two XML formats.
title.text/xml=Medline Hypertext
title.text/xml-medline=Medline XML
# NOTE(review): nativeformat is space-separated while batchformats above is
# comma-separated -- confirm the reader accepts both delimiters.
nativeformat = text/xml text/xml-medline
batchheadlines=1
batch.outfields= docid, docclass, ArticleTitle, PMID, LastName, Year, all
docurl=http://flybase.net/cgi-bin/fbidq.html?
# batchurl=/cgi-bin/fbidq.html?
# batchproc=cgi-bin/fbidq.html
batch.forward=lookup.jsp
# ^^ batch.forward=lookup.jsp works well
# Labels for the docid and docclass fields.
fieldlabel.docid=ID
fieldlabel.docclass=Class
## for search only?
# NOTE(review): the meaning of the 0/1 values is not stated in this file;
# presumably 1 means the field is skipped -- TODO confirm.
# searchskip.summary refers to a "summary" field whose field.summary mapping
# is commented out further down in this file.
searchskip.docclass=0
searchskip.title=0
searchskip.docid=1
searchskip.summary=1
# ---- indexing values; NOTE: need these for proper (cased) searches
# locate data with regex file, folder patterns
# Backslashes in regex values must be doubled: a properties loader consumes
# one level of backslash escaping, and a single backslash before a character
# that is not a recognized escape (such as ".") is silently dropped, which
# would turn a literal-dot match into "any character".
regex_folder=
# File names made of optional word characters, then "medline", then anything,
# ending in the literal extension ".xml".
regex_file=^\\w*medline.*\\.xml$
regex_skipfile=
regex_skipfolder=.*
# Indexer class for this library.
INDEX_CLASS=org.eugenes.index.LuceneXmlIndexer
## append existing index or create new
INDEX_APPEND=false
## index tag names as values (as well as field names)?
INDEX_TAGS=true
## index attribute values ?
## none of medline attributes look interesting
INDEX_ATTRIBUTES=false
## use fieldname xpath; full top.middle.last field name?
#? INDEX_XPATH=true -- Medline.xml leaf names are unique enough we can drop the Xpath mess
INDEX_XPATH=false
## INDEX_LEVEL=0 means index main records one level below xml tag
INDEX_LEVEL=0
# presumably controls whether empty values are indexed -- TODO confirm
INDEX_BLANKS=false
# some medline xpaths
# MedlineCitationSet.MedlineCitation. .MedlineID, .ArticleTitle, .Abstract.AbstractText
# special added field .FlyBaseID -- srs dump script adds this comment; fix to be xml tag
#
#
# 22380106
# ---- Change to
#
# FBrf0155939
# 22380106
# -------
# zcat fbmedline*.gz |\
# perl -pe'if(m,ID:(FBrf\d+),){$id=$1} s,,$id\n,;'\
# > medline.xml
# grep -c FlyBaseID med*xml = 32640
## IndexWriter opts
## merge=10 is default; 4 == less mem usage ; 2 minimum
merge_factor=6
## max_field_length is max # terms/field
max_field_length=1000000
# NOTE(review): MAX_FIELDS is not explained in this file; presumably an upper
# bound on the number of fields -- TODO confirm against the indexer.
MAX_FIELDS=100000
## field indexing parameters
## sumfields list needs to match field.xxx common summary fields
## (field.docid and field.docclass are both defined in this file)
sumfields=docid,docclass
# special summary fields -- replace w/ fieldalias.TAG=newtag
# these must always have fieldtype Text or UnIndexed to be useful
# Bare "fieldtype" key: presumably the default type for fields that have no
# fieldtype.NAME entry of their own -- TODO confirm.
fieldtype=UnStored
# field.docid=MedlineCitationSet.MedlineCitation.FlyBaseID
# fieldalias.MedlineCitationSet.MedlineCitation.FlyBaseID=docid
# docid summary field is taken from the FlyBaseID field (leaf name, no xpath).
field.docid=FlyBaseID
#fieldalias.FlyBaseID=docid
# Identifier, date and author fields are typed Text.
fieldtype.docid=Text
fieldtype.FlyBaseID=Text
fieldtype.MedlineID=Text
fieldtype.PMID=Text
fieldtype.MedlineDate=Text
fieldtype.LastName=Text
## want only PubDate.Year
fieldtype.Year=Text
# Skip the Year/Month/Day parts of DateCreated, DateCompleted and DateRevised.
# NOTE(review): these keys use full xpath field names while INDEX_XPATH=false
# is set in this file, and the sample query output below still shows several
# Year values per record -- confirm whether these skip rules take effect.
fieldtype.MedlineCitationSet.MedlineCitation.DateCreated.Year=skip
fieldtype.MedlineCitationSet.MedlineCitation.DateCompleted.Year=skip
fieldtype.MedlineCitationSet.MedlineCitation.DateRevised.Year=skip
fieldtype.MedlineCitationSet.MedlineCitation.DateCreated.Month=skip
fieldtype.MedlineCitationSet.MedlineCitation.DateCompleted.Month=skip
fieldtype.MedlineCitationSet.MedlineCitation.DateRevised.Month=skip
fieldtype.MedlineCitationSet.MedlineCitation.DateCreated.Day=skip
fieldtype.MedlineCitationSet.MedlineCitation.DateCompleted.Day=skip
fieldtype.MedlineCitationSet.MedlineCitation.DateRevised.Day=skip
## Note: this Year is in several fields PubDate, DateCreated, DateCompleted, others?
# - want only PubDate year... problem with flattening xpath; back to full xpath fields?
# Query: find +all:dpp +(Year:2000 Year:2002) +LastName:Gelbart
# docid LastName Year docclass url
# FBrf0049566 Gelbart 1990;1990;2000;1989 Review, Academic medline.xml,109688880-109693311
# FBrf0038632 Gelbart 1982;1982;2000;1982 Journal Article medline.xml,129968726-129972614
#field.title=MedlineCitationSet.MedlineCitation.Article.ArticleTitle
#fieldalias.MedlineCitationSet.MedlineCitation.Article.ArticleTitle=title
# title summary field is taken from ArticleTitle (leaf name, no xpath).
field.title=ArticleTitle
fieldtype.title=Text
##bad##fieldtype.title=UnIndexed
fieldtype.ArticleTitle=Text
## PublicationType is a list of values - probably need to recode
# field.docclass=MedlineCitationSet.MedlineCitation.Article.PublicationTypeList.PublicationType
# fieldalias.MedlineCitationSet.MedlineCitation.Article.PublicationTypeList.PublicationType=docclass
# docclass summary field is taken from PublicationType.
field.docclass=PublicationType
fieldtype.docclass=Text
# The summary field mapping is disabled; AbstractText is typed under its own name.
# field.summary=MedlineCitationSet.MedlineCitation.Article.Abstract.AbstractText
# fieldalias.MedlineCitationSet.MedlineCitation.Article.Abstract.AbstractText=summary
# field.summary=AbstractText
# fieldtype.summary=Text
##bad##fieldtype.summary=UnIndexed
fieldtype.AbstractText=Text
fieldtype.url=UnIndexed
fieldtype.modified=Keyword
# MedlineCitationSet.MedlineCitation.MedlineID
# field.ID=MedlineCitationSet.MedlineCitation.MedlineID
# fieldalias.MedlineCitationSet.MedlineCitation.MedlineID=ID
# fieldtype.ID=Text
# document links fields:
# doclink is created by LucegeneIndexers$GameSeqRelationLink_FieldRecoder
# search.linkfrom=doclink
# search.linkto=docid
# fieldtype.doclink=Text
## default - Text or UnStored = index but dont store text; only types to tokenizer
#fieldtype.att_id=Text
#fieldtype.Symbol=Text
#fieldtype.Date=Text
#fieldtype.Source=Text
# Analyzer class used for this library.
analyzer=org.eugenes.index.BiodataAnalyzer2
# all field defaults
# NOTE(review): the default token filter and the MedlineCitation filter both
# name Debug* classes -- confirm these are intended outside of debugging.
tokenizer=fbacode$LowerWordTokenizer
tokenfilter=fbacode$DebugFilter
tokenfilter.MedlineCitation=fbacode$DebugEndOfRecordFilter
# fieldrecoder=fbacode$DebugMedlineRecoder
#tokenfilter.Year=fbacode$DebugFilter
#? convert .Day,.Month,.Year to DT/Date field ?
# to create "contents" field of all text
indexall=false
# AddCommonField_FieldRecoder
# generate base fieldname fields when INDEX_XPATH=true
# this doesnt prevent default indexing of full xpath
# using property 'fieldalias.att_timestamp=timestamp' will collapse all
# for given field.
#fieldrecoder.name=LucegeneIndexers$AddCommonField_FieldRecoder
#fieldrecoder.att_id=LucegeneIndexers$AddCommonField_FieldRecoder
#fieldrecoder.synonym=LucegeneIndexers$AddCommonField_FieldRecoder
#fieldrecoder.description=LucegeneIndexers$AddCommonField_FieldRecoder
# tokenfilter.att_length=org.eugenes.index.BiodataAnalyzer$NumberFilter
## these look useless, so they are given fieldtype skip:
fieldtype.QualifierName.att_MajorTopicYN=skip
fieldtype.DescriptorName.att_MajorTopicYN=skip
fieldtype.RegistryNumber=skip
fieldtype.GrantList.att_CompleteYN=skip
# #----------------- sample debug output
#
# docclass:journal
# docclass:article
# MedlineCitation.att_Owner:nlm
# MedlineCitation.att_Status:completed
# FlyBaseID:fbrf0158699
# MedlineID:22480891
# PMID:12593455
# Year:2003
# Month:02
# Day:20
# Year:2003
# Month:05
# Day:13
# Year:2003
# Month:09
# Day:09
# ISSN:0065
# ISSN:2660
# Volume:48
# Year:2003
# title:genetics
# title:and
# title:molecular
# title:biology
# ...
# MedlinePgn:1
# MedlinePgn:280
# summary:application
# summary:of
# summary:generic
# summary:variants
# summary:sections
# ...
# DescriptorName:genetics
# NumberOfReferences:500
# docid:fbrf0158699
# -----------------
# bin/lucegene-search.sh -verbose=1 -lib medline
# index in progress