// LucegeneIndexers.java /** * LucegeneIndexers Data class and field specific parsers to handle flybase, other biology data for LuceGene indexing lucene-indexer.sh script will recompile and use these $PROP_ROOT/dataclass.properties should add these as fieldrecoder.FIELD=classname tokenfilter.FIELD=classname tokenizer.FIELD=classname E.g., go.properties: fieldrecoder.LNK=LucegeneIndexers$GOID_FieldRecoder fban.properties: fieldrecoder.BLOC=LucegeneIndexers$Location_FieldRecoder note apr04 - split out the Game XML specific ones to other main class? */ import java.util.regex.*; import org.eugenes.index.LuceneBaseIndexer; import org.eugenes.index.BiodataAnalyzer; import org.apache.lucene.document.Field; import org.apache.lucene.document.Document; public class LucegeneIndexers { final static boolean nodups=true, withdups=false; // add >1 field value to doc? /** * GOID_FieldRecoder Parse flybase FBgo.acode field LNK|GO:000000 for GO:id (data should be changed) -- create new lucene index field "GOID" to be used for docid (need GOID intermediate?) */ public static class GOID_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { //final public int kNoChange= 0, kValChanged=1, kFieldAdded=2, kSkipField=4; // | OR these flags? final static String regex_goid="\\bGO:\\d+"; static Pattern regexGOID= Pattern.compile(regex_goid,Pattern.CASE_INSENSITIVE); public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { // CONFUSING: ma.matches() == "^patt$" lookingAt() == "^patt" find() == next "patt" Matcher ma = regexGOID.matcher(val); if (ma.find()) { String id = ma.group(); // check for all matches? idx.addField("GOID", id, doc, nodups); return kFieldAdded; } return kNoChange; } } /** small field recoder to add new main ID field for acode with multiple nested ID fields - really need to have main acode indexer handle this. */ public static class FBMainID_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { if (fieldPath.indexOf(LuceneBaseIndexer.xpathDelim)>1) return kNoChange; // skip subrec locations always ? idx.addField("MAIN_ID", val.toString(), doc, nodups); // or docid? return kFieldAdded; } } /** * Location_FieldRecoder Parse flybase sequence location fields = genbank standard locations create three numeric index fields: fieldName.start, fieldName.stop, fieldName.strand (-1,1) BLOC|join(100..200,300..400) BLOC|complement(2000..3000) BLOC|1..2 -- also handle loc=2L:100..200,300..400 ; loc=2L:complement(2000..3000) create field fieldName.chr=2L if (\w+): */ public static class Location_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { final static String regex_chr="(\\w+):"; final static String regex_loc="(\\d+)"; //? final static String regex_loc="(\\d+)\\.\\.(\\d+)"; static Pattern regexLoc= Pattern.compile(regex_loc); static Pattern regexChr= Pattern.compile(regex_chr); public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { if (fieldPath.indexOf(LuceneBaseIndexer.xpathDelim)>0) return kNoChange; // skip subrec locations always ? //^ instead use addField(... doc, nodup) ? String chr=null; String sval= val.toString(); Matcher ma; ma= regexChr.matcher(sval); if (ma.lookingAt()) { chr= ma.group(1); int e= ma.end(); sval= sval.substring(e); // dont delete - caller wants full buf ?? } ma= regexLoc.matcher(sval); if (ma.find()) { String start = ma.group(); //(1) ? String stop = start; while( ma.find() ) stop = ma.group(); //(2) ? //(ma.groupCount()>1) ? ma.group(ma.groupCount()) : start; // ^^ not good, need to .find() last if (chr!=null) idx.addField(fieldName+".chr", chr, doc, withdups); String strand= (sval.indexOf("complement(")>=0) ? "-1" : "1"; idx.addField(fieldName+".start", start, doc, withdups); idx.addField(fieldName+".stop", stop, doc, withdups); idx.addField(fieldName+".strand", strand, doc, withdups); return kFieldAdded; } return kNoChange; } } /** * GFFAttribute_FieldRecoder */ public static class GFFAttribute_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { final static String regex_attr="(\\w+)=([^;\\s]+);?"; static Pattern regexAttr= Pattern.compile(regex_attr); static int debugc=0; public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { int ret= kNoChange; //if (fieldPath.indexOf(LuceneBaseIndexer.xpathDelim)>0) // return kNoChange; // skip subrec locations always ? Matcher ma= regexAttr.matcher(val.toString()); while (ma.find()) { String akey = ma.group(1); String aval = ma.group(2); idx.addField(akey, aval, doc, withdups); ret= kFieldAdded; if (LuceneBaseIndexer.debug && debugc++ < 30) LuceneBaseIndexer.logp.println("GFFAttribute."+fieldName+": "+akey+"="+aval); } return ret; } } /** * SeqDbxref_FieldRecoder recode this: dbxref='CG11023,FlyBase:FBan0011023' to CG11023 FBan0011023 -- this thing will strip out '' and other symbols tokenizer.db_xref=org.eugenes.index.BiodataAnalyzer$LowerWordTokenizer -- temp fixer to add FBan ID when missing but have CG/CR id -- current parser is not seeing separate ids? need to wordbreak at ,: */ public static class SeqDbxref_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { static Pattern regexCG = Pattern.compile("\\bC[GR](\\d+)"); public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { int ret= kNoChange; int len= val.length(); if (len>1) { if (val.charAt(len-1) == '\'') val.deleteCharAt(len-1); if (val.charAt(0) == '\'') val.deleteCharAt(0); ret= kValChanged; } // -- dont need this, just index property for word breaks //val.replace(0, 9999999, val.toString().replace(',',' ')); ret= kValChanged; int ian= val.indexOf("FBan"); if (ian<0) { Matcher ma= regexCG.matcher(val); if (ma.find()) { String idnum= ma.group(1); while(idnum.length()<7) idnum="0"+idnum; val.append(",FBan"+idnum); ret= kValChanged; } } return ret; } } /** * Swiss_SQ_FieldRecoder parser for swissprot/uniprot SQ line SQ SEQUENCE 262 AA; 28969 MW; DA87363A0D92BAF4 CRC64; */ public static class Swiss_SQ_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { static Pattern regexSQ = Pattern.compile("\\s+(\\S+)\\s+(\\S+)\\s*[;]"); public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { int ret= kNoChange; Matcher ma= regexSQ.matcher(val); while (ma.find()) { String aval= ma.group(1); String akey= ma.group(2); idx.addField( fieldName+LuceneBaseIndexer.xpathDelim+akey, aval, doc, withdups); ret= kSkipField | kFieldAdded; } return ret; } } /** * AddCommonField_FieldRecoder generate base fieldname fields when INDEX_XPATH=true this doesnt prevent default indexing of full xpath using property 'fieldalias.att_timestamp=timestamp' will collapse all for given field. addField( indexFieldName( currentFieldName), val, storeDoc, false); ^^^^ bypass this for some fields can we do part? - add last part of fieldPath to get span.start ? E.g. want common span.start,end fields for these field=game.annotation.feature_set.feature_span.seq_relationship.span.end field=game.annotation.feature_set.feature_span.seq_relationship.span.startfield=game.computational_analysis.result_set.result_span.seq_relationship.span.end field=game.computational_analysis.result_set.result_span.seq_relationship.span.start field=game.computational_analysis.result_set.seq_relationship.span.end field=game.computational_analysis.result_set.seq_relationship.span.start */ public static class AddCommonField_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { idx.addField( fieldName, val.toString(), doc, withdups); return kFieldAdded; } } /** * GameSpan_FieldRecoder -- need to check enclosing xml seq_relationship.type="query" or "subject" -- skip indexing subject data, query == genome locations // need to look at prior doc fields ## double urk: all scaffold query span.start,end are ## RELATIVE TO scaffold span.tile_start,tile_end ## are any of them reversed ?? ## need also to recode spans for output ... */ public static class GameSpan_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { String tp= fieldPath; int ti= tp.lastIndexOf( LuceneBaseIndexer.xpathDelim); if (ti>0) tp= tp.substring(0,ti+1); String stype= idx.getLastField(tp+"att_type",0); // good only for INDEX_XPATH=true if (stype==null) stype= idx.getLastField("att_type",0); // ^? try also tp.lastElement + att_type ? //if (LuceneBaseIndexer.debug) //LuceneBaseIndexer.logp.println("GameSpan."+fieldPath+"."+fieldName+" type="+stype+" val="+val); if ("query".equals(stype)) { String tilestart= idx.getLastField("game.map_position.span.tile_start",0); String tileend = idx.getLastField("game.map_position.span.tile_end",0); int istart=-1, iend=-1; if (tilestart!=null) istart= Integer.parseInt(tilestart); if (tileend!=null) iend= Integer.parseInt(tileend); if (iend >= 0 && iend < istart) { int iswp= iend; iend= istart; istart= iswp; } String sval= val.toString(); int ival= Integer.parseInt(sval); if (istart > 0) ival += istart; idx.addField( fieldName, String.valueOf(ival), doc, withdups); String arm= idx.getLastField("arm",0); if (arm!=null) idx.addField( "arm", arm, doc, nodups); // add so can do start:[100 200] AND arm:x -- need only one arm value/object return kFieldAdded; } else { if (stype == null) stype="untyped"; String fn= fieldPath + LuceneBaseIndexer.xpathDelim + stype + "_" + fieldName; idx.addField( fn, val.toString(), doc, withdups); val.setLength(0); return kFieldAdded + kSkipField; } //return kNoChange; } } /** * GameProperty_FieldRecoder ## -- recode all property.type,value pairs as 'property.type=value' fields? # # cyto_range # 40F7-40F7 # # # gbunit # AE002603 # */ public static class GameProperty_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { String del=LuceneBaseIndexer.xpathDelim; String tp= fieldPath + del + fieldName + del; String stype= idx.getLastField(tp+"type",0); // good only for INDEX_XPATH=true String value= idx.getLastField(tp+"value",0); //if (LuceneBaseIndexer.debug) //LuceneBaseIndexer.logp.println("GameProperty."+tp+stype+"="+value); if (stype != null && value != null) { idx.addField( tp+stype, value, doc, withdups); return kFieldAdded; } return kNoChange; } } /** * GameArm_FieldRecoder -- fix for bad data # # # 2L # --as this chado2game field is bad # # $5 << should be 2L # # 22109491 # 22217931 # # */ public static class GameArm_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { if (val.toString().startsWith("$")) { String del=LuceneBaseIndexer.xpathDelim; String tp= fieldPath; //int ti= tp.lastIndexOf( del); if (ti>0) tp= tp.substring(0,ti+1); String arm= idx.getLastField(tp+del+"att_seq",0); if (arm==null) arm= idx.getLastField("att_seq",0); String fn= fieldPath+del+fieldName; if (LuceneBaseIndexer.debug) LuceneBaseIndexer.logp.println("GameArm."+fn+"="+arm); if ( arm != null ) { idx.addField( "arm", arm, doc, nodups); //idx.addField( fn, arm, doc, false); return kFieldAdded + kSkipField; } return kNoChange; } else { // got valid arm field; jun04 String arm= val.toString(); idx.addField( "arm", arm, doc, nodups); if (LuceneBaseIndexer.debug) { String del=LuceneBaseIndexer.xpathDelim; String fn= fieldPath+del+fieldName; LuceneBaseIndexer.logp.println("GameArm."+fn+"="+arm); } return kFieldAdded; } //return kNoChange; } } /** * GameAddmap_position_FieldRecoder ## these are top-level records in game scaffold files; some need ## arm:start-end of scaf added for retrieval by range # game.seq < none have start,end ? == sequence dbxref,name # game.map_position == 1 record/scaffold == scaf range ## game.map_position.span.tile_start ## game.map_position.span.tile_end ## arm -- all recs now have arm, see above # game.computational_analysis << these all have span.start,end # game.annotation << these all should have start,end ## double urk: all game scaffold query span.start,end are ## RELATIVE TO scaffold span.tile_start,tile_end */ public static class GameAddmap_position_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { String del=LuceneBaseIndexer.xpathDelim; // silly if we are not using '.' as below String tilestart= idx.getLastField("game.map_position.span.tile_start",0); String tileend = idx.getLastField("game.map_position.span.tile_end",0); String fn = fieldPath+del+fieldName; //if (LuceneBaseIndexer.debug) //LuceneBaseIndexer.logp.println("GameAddmap."+fn+" start="+start+" end="+end); if ( tilestart != null && tileend != null ) { // do we want/need both? idx.addField( "start", tilestart, doc, nodups); idx.addField( "end", tileend, doc, nodups); //idx.addField( fn+".span.start", tilestart, doc, nodups); //idx.addField( fn+".span.end", tileend, doc, nodups); } String arm= idx.getLastField("arm",0); if (arm!=null) idx.addField( "arm", arm, doc, nodups); // see GameClass_FieldRecoder idx.addField( "docclass", fieldName, doc, nodups); return kFieldAdded; } } /** * GameClass_FieldRecoder -- add tag name == docclass for top-level (doc) objects fieldrecoder.game.computational_analysis=LucegeneIndexers$GameClass_FieldRecoder fieldrecoder.game.annotation=LucegeneIndexers$GameClass_FieldRecoder fieldrecoder.game.map_position=LucegeneIndexers$GameClass_FieldRecoder fieldrecoder.game.seq=LucegeneIndexers$GameClass_FieldRecoder fieldrecoder.game.seq=LucegeneIndexers$GameAddmap_position_FieldRecoder ^^ this one has two recoders, do in one ? */ public static class GameClass_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { idx.addField( "docclass", fieldName, doc, nodups); return kFieldAdded; } } /** * GameSeqRelationLink_FieldRecoder -- add "doclink" for these fields in companal (only? none such in fieldrecoder.game.computational_analysis.result_set.seq_relationship=GameSeqRelationLink_FieldRecoder */ public static class GameSeqRelationLink_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { String del=LuceneBaseIndexer.xpathDelim; String tp= fieldPath + del + fieldName ; // att field name is always elname + xpathDelim + attNamePrefix + atkey String type= idx.getLastField(tp+del+"att_type",0); if (type==null) type= idx.getLastField(fieldName+del+"att_type",0); if ("subject".equals(type)) { String doclink= idx.getLastField(tp+del+"att_seq",0); if (doclink==null) doclink= idx.getLastField(fieldName+del+"att_seq",0); //? can we weed out duplicate doc/doclink values here? // doclink=RE54557.5prime doclink=RE54557.5prime doclink=RE54557.5prime if ( doclink != null ) { //if (LuceneBaseIndexer.debug) //LuceneBaseIndexer.logp.println("GameSeqRelationLink."+fieldPath+"."+fieldName+" doclink="+doclink); idx.addField( "doclink", doclink, doc, withdups); return kFieldAdded; } } return kNoChange; } } /** * FeatureSourceChr_FieldRecoder Parse gnomap feature 'source' line for chromosome value; add to each feature doc */ public static class FeatureSourceChr_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { static String chr="unknown"; static boolean insource; public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { if ("feature".equals(fieldName)) { insource= ("source".equals(val.toString())); } else if ("map".equals(fieldName)) { if (insource) chr= val.toString(); idx.addField("chr", chr, doc, nodups); // or docid? return kFieldAdded; } return kNoChange; } } }