# dbs/lucegene/medline.properties
# d.gilbert, summer 2004
#
# Lucegene library settings for the Medline abstracts index:
# library identity, data/index locations, and search/report keys.
# One key=value entry per line; comments must sit on their own line.

LIB_NAME=medline
title = Medline Abstracts (Drosophila)

# can use absolute paths here, or assume ARGOS_SERVICE_ROOT is set
DATA_ROOT=web/data/refs/
INDEX_PATH=indices/lucene/medline/
MIME_TYPE=text/xml

## --------- search/report keys -------------------
# bin/lucegene-search.sh -verbose=1 -lib medline
# Index fields [ , AbstractText, AccessionNumber, Acronym, Affiliation,
# Agency, ArticleTitle, CitationSubset, CollectiveName,
# CopyrightInformation, Country, DataBankName, Day, DescriptorName,
# FlyBaseID, ForeName, GeneralNote, GrantID, ISSN, Initials, Issue,
# Keyword, Language, LastName, MedlineDate, MedlineID, MedlinePgn,
# MedlineTA, Month, NameOfSubstance, NlmUniqueID, Note,
# NumberOfReferences, OtherID, PMID, PublicationType, QualifierName,
# RefSource, SpaceFlightMission, Suffix, VernacularTitle, Volume, Year,
# docclass, docid, url] Total documents=12960

searchfield=all
searchallfield = all

# Fields to display when xml chosen on search result page.
outfields=LastName,ArticleTitle,Year,docclass,url
storefields=docid,docclass,ArticleTitle,PMID,LastName,Year,url
#? diff in PMID and MedlineID ?

## for xslt to produce the result page tables
resultxsl = conf/medline_result.xsl
resultspage = resultxsl.jsp
# reportxsl = conf/medline_report.xsl
# reporthtmlxsl=conf/medline_report.xsl

# linkto is a continued value; each entry ends with an escaped newline.
# NOTE(review): the blank line after the last entry ends the continuation so
# that linkto.fbrf is read as its own key instead of being appended to the
# linkto value -- confirm this matches the intended layout.
linkto=\
FlyBaseID>fbrf-docid\n\

linkto.fbrf=FlyBaseID>docid

## these are for search system; map generic field names to lib-specific
searchfieldalias.title=ArticleTitle
searchfieldalias.year=Year
searchfieldalias.author=LastName
searchfieldalias.abstract=AbstractText
searchfieldalias.pubtype=docclass
# NOTE(review): GeneSymbol is not in the "Index fields" list above -- confirm
searchfieldalias.symbol=GeneSymbol,docid

## difference in header. footer.
# is for native file data versus lucegene index fields
# Output wrappers per format. Values are continued with a trailing backslash.
# NOTE(review): these header/footer values hold only newline escapes here;
# presumably they also carried XML markup -- verify against the deployed copy.
header.native=\n\
\n\
\n
footer.native=\n
header.xml-medline=\n\
\n
footer.xml-medline=\n

# outcharset to preserve special chars; test case FBrf0151864 author: Hombría
# outcharset=ISO-8859-1
outcharset=UTF-8

header.xml=\n
footer.xml=\n
# xsl = medline.xsl

# ? this returns flybase FBrf reports now ; change
# NOTE(review): batchformat is kept as an active key; confirm it was not
# part of the comment above.
batchformat = text/xml
batchformats = text/xml, text/xml-medline, text/plain, text/csv, text/tsv
title.text/xml=Medline Hypertext
title.text/xml-medline=Medline XML
nativeformat = text/xml text/xml-medline
batchheadlines=1
batch.outfields= docid, docclass, ArticleTitle, PMID, LastName, Year, all

docurl=http://flybase.net/cgi-bin/fbidq.html?
# batchurl=/cgi-bin/fbidq.html?
# batchproc=cgi-bin/fbidq.html
batch.forward=lookup.jsp
# ^^ this works well

fieldlabel.docid=ID
fieldlabel.docclass=Class

## for search only?
searchskip.docclass=0
searchskip.title=0
searchskip.docid=1
searchskip.summary=1

# ---- indexing values; NOTE: need these for proper (cased) searches
# locate data with regex file, folder patterns
regex_folder=
# Backslashes must be doubled: a single backslash before '.' is dropped when
# the file is loaded, which would leave '.' matching any character.
regex_file=^\\w*medline.*\\.xml$
regex_skipfile=
regex_skipfolder=.*

INDEX_CLASS=org.eugenes.index.LuceneXmlIndexer
## append existing index or create new
INDEX_APPEND=false
## index names as values (as well as field names)?
INDEX_TAGS=true
## index values ?
## none of medline attributes look interesting
INDEX_ATTRIBUTES=false
## use fieldname xpath; full top.middle.last field name?
#? INDEX_XPATH=true -- Medline.xml leaf names are unique enough we can drop the Xpath mess
INDEX_XPATH=false
## INDEX_LEVEL=0 means index main records one level below xml tag
INDEX_LEVEL=0
INDEX_BLANKS=false

# some medline xpaths
# MedlineCitationSet.MedlineCitation.
#   .MedlineID, .ArticleTitle, .Abstract.AbstractText
# special added field .FlyBaseID -- srs dump script adds this comment; fix to be xml tag
# NOTE(review): the before/after example below shows only the ID values;
# the surrounding XML markup is not present in this copy.
#
#
# 22380106
# ---- Change to
#
# FBrf0155939
# 22380106
# -------
# zcat fbmedline*.gz |\
# perl -pe'if(m,ID:(FBrf\d+),){$id=$1} s,,$id\n,;'\
# > medline.xml
# grep -c FlyBaseID med*xml = 32640

## IndexWriter opts
## merge=10 is default; 4 == less mem usage ; 2 minimum
merge_factor=6
## max_field_length is max # terms/field
max_field_length=1000000
MAX_FIELDS=100000

## field indexing parameters
## sumfields list needs to match field.xxx common summary fields
sumfields=docid,docclass
# special summary fields -- replace w/ fieldalias.TAG=newtag
# these must always have fieldtype Text or UnIndexed to be useful
fieldtype=UnStored

# field.docid=MedlineCitationSet.MedlineCitation.FlyBaseID
# fieldalias.MedlineCitationSet.MedlineCitation.FlyBaseID=docid
field.docid=FlyBaseID
#fieldalias.FlyBaseID=docid
fieldtype.docid=Text
fieldtype.FlyBaseID=Text
fieldtype.MedlineID=Text
fieldtype.PMID=Text
fieldtype.MedlineDate=Text
fieldtype.LastName=Text

## want only PubDate.Year
fieldtype.Year=Text
# Skip the non-publication dates, listed by full xpath.
fieldtype.MedlineCitationSet.MedlineCitation.DateCreated.Year=skip
fieldtype.MedlineCitationSet.MedlineCitation.DateCompleted.Year=skip
fieldtype.MedlineCitationSet.MedlineCitation.DateRevised.Year=skip
fieldtype.MedlineCitationSet.MedlineCitation.DateCreated.Month=skip
fieldtype.MedlineCitationSet.MedlineCitation.DateCompleted.Month=skip
fieldtype.MedlineCitationSet.MedlineCitation.DateRevised.Month=skip
fieldtype.MedlineCitationSet.MedlineCitation.DateCreated.Day=skip
fieldtype.MedlineCitationSet.MedlineCitation.DateCompleted.Day=skip
fieldtype.MedlineCitationSet.MedlineCitation.DateRevised.Day=skip
## Note: this Year is in several fields PubDate, DateCreated, DateCompleted, others?
# - want only PubDate year... problem with flattening xpath; back to full xpath fields?
# Query: find +all:dpp +(Year:2000 Year:2002) +LastName:Gelbart
# docid LastName Year docclass url
# FBrf0049566 Gelbart 1990;1990;2000;1989 Review, Academic medline.xml,109688880-109693311
# FBrf0038632 Gelbart 1982;1982;2000;1982 Journal Article medline.xml,129968726-129972614

# ---- title: common field "title" taken from ArticleTitle
#field.title=MedlineCitationSet.MedlineCitation.Article.ArticleTitle
#fieldalias.MedlineCitationSet.MedlineCitation.Article.ArticleTitle=title
field.title=ArticleTitle
fieldtype.title=Text
##bad##fieldtype.title=UnIndexed
fieldtype.ArticleTitle=Text

# ---- docclass: common field "docclass" taken from PublicationType
## PublicationType is dang list - probably need to recode
# field.docclass=MedlineCitationSet.MedlineCitation.Article.PublicationTypeList.PublicationType
# fieldalias.MedlineCitationSet.MedlineCitation.Article.PublicationTypeList.PublicationType=docclass
field.docclass=PublicationType
fieldtype.docclass=Text

# ---- summary: mapping is disabled; AbstractText is typed under its own name
# field.summary=MedlineCitationSet.MedlineCitation.Article.Abstract.AbstractText
# fieldalias.MedlineCitationSet.MedlineCitation.Article.Abstract.AbstractText=summary
# field.summary=AbstractText
# fieldtype.summary=Text
##bad##fieldtype.summary=UnIndexed
fieldtype.AbstractText=Text

fieldtype.url=UnIndexed
fieldtype.modified=Keyword

# MedlineCitationSet.MedlineCitation.MedlineID
# field.ID=MedlineCitationSet.MedlineCitation.MedlineID
# fieldalias.MedlineCitationSet.MedlineCitation.MedlineID=ID
# fieldtype.ID=Text

# document links fields:
# doclink is created by LucegeneIndexers$GameSeqRelationLink_FieldRecoder
# search.linkfrom=doclink
# search.linkto=docid
# fieldtype.doclink=Text

## default - Text or UnStored = index but dont store text; only types to tokenizer
#fieldtype.att_id=Text
#fieldtype.Symbol=Text
#fieldtype.Date=Text
#fieldtype.Source=Text

analyzer=org.eugenes.index.BiodataAnalyzer2
# all field defaults
tokenizer=fbacode$LowerWordTokenizer
# NOTE(review): the active token filters are the Debug* classes -- confirm
# this is intended outside of debugging runs.
tokenfilter=fbacode$DebugFilter
tokenfilter.MedlineCitation=fbacode$DebugEndOfRecordFilter
# fieldrecoder=fbacode$DebugMedlineRecoder
#tokenfilter.Year=fbacode$DebugFilter
#?
#  convert .Day,.Month,.Year to DT/Date field ?

# to create "contents" field of all text
indexall=false

# AddCommonField_FieldRecoder
# generate base fieldname fields when INDEX_XPATH=true
# this doesnt prevent default indexing of full xpath
# using property 'fieldalias.att_timestamp=timestamp' will collapse all
# for given field.
#fieldrecoder.name=LucegeneIndexers$AddCommonField_FieldRecoder
#fieldrecoder.att_id=LucegeneIndexers$AddCommonField_FieldRecoder
#fieldrecoder.synonym=LucegeneIndexers$AddCommonField_FieldRecoder
#fieldrecoder.description=LucegeneIndexers$AddCommonField_FieldRecoder
# tokenfilter.att_length=org.eugenes.index.BiodataAnalyzer$NumberFilter

## these look useless:
fieldtype.QualifierName.att_MajorTopicYN=skip
fieldtype.DescriptorName.att_MajorTopicYN=skip
fieldtype.RegistryNumber=skip
fieldtype.GrantList.att_CompleteYN=skip

#
#----------------- sample debug output
#
# docclass:journal
# docclass:article
# MedlineCitation.att_Owner:nlm
# MedlineCitation.att_Status:completed
# FlyBaseID:fbrf0158699
# MedlineID:22480891
# PMID:12593455
# Year:2003
# Month:02
# Day:20
# Year:2003
# Month:05
# Day:13
# Year:2003
# Month:09
# Day:09
# ISSN:0065
# ISSN:2660
# Volume:48
# Year:2003
# title:genetics
# title:and
# title:molecular
# title:biology
# ...
# MedlinePgn:1
# MedlinePgn:280
# summary:application
# summary:of
# summary:generic
# summary:variants
# summary:sections
# ...
# DescriptorName:genetics
# NumberOfReferences:500
# docid:fbrf0158699 -----------------
# bin/lucegene-search.sh -verbose=1 -lib medline
# index in progres