// BioIndexers.java /** * BioIndexers Data class and field specific parsers to handle flybase, other biology data for LuceGene indexing lucene-indexer.sh script will recompile and use these $PROP_ROOT/dataclass.properties should add these as fieldrecoder.FIELD=classname tokenfilter.FIELD=classname tokenizer.FIELD=classname E.g., go.properties: fieldrecoder.LNK=BioIndexers$GOID_FieldRecoder fban.properties: fieldrecoder.BLOC=BioIndexers$Location_FieldRecoder note apr04 - split out the Game XML specific ones to other main class? */ import java.io.*; import java.util.*; import java.util.regex.*; import java.text.SimpleDateFormat; import org.eugenes.index.LuceneBaseIndexer; import org.eugenes.index.LuceneBaseIndexer.FieldRecoder; import org.eugenes.index.biodata.DataFilter; import org.eugenes.index.biodata.DataTokenizer; import org.eugenes.index.BiodataFilters.NumberField; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Token; import org.apache.lucene.document.Field; import org.apache.lucene.document.Document; import org.apache.lucene.document.DateField; public class BioIndexers { final static boolean nodups=true, withdups=false; // add >1 field value to doc? static boolean debug = LuceneBaseIndexer.debug; static PrintStream logp = LuceneBaseIndexer.logp; /** * GOID_FieldRecoder Parse flybase FBgo.acode field LNK|GO:000000 for GO:id (data should be changed) -- create new lucene index field "GOID" to be used for docid (need GOID intermediate?) */ public static class GOID_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { //final public int kNoChange= 0, kValChanged=1, kFieldAdded=2, kSkipField=4; // | OR these flags? final static String regex_goid="\\bGO:\\d+"; static Pattern regexGOID= Pattern.compile(regex_goid,Pattern.CASE_INSENSITIVE); public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { // CONFUSING: ma.matches() == "^patt$" lookingAt() == "^patt" find() == next "patt" Matcher ma = regexGOID.matcher(val); if (ma.find()) { String id = ma.group(); // check for all matches? idx.addField("GOID", id, doc, nodups); return kFieldAdded; } return kNoChange; } } /** * Location_FieldRecoder Parse flybase sequence location fields = genbank standard locations create three numeric index fields: fieldName.start, fieldName.stop, fieldName.strand (-1,1) BLOC|join(100..200,300..400) BLOC|complement(2000..3000) BLOC|1..2 -- also handle loc=2L:100..200,300..400 ; loc=2L:complement(2000..3000) create field fieldName.chr=2L if (\w+): */ public static class Location_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { final static String regex_chr="(\\w+):"; final static String regex_loc="(\\d+)"; //? final static String regex_loc="(\\d+)\\.\\.(\\d+)"; static Pattern regexLoc= Pattern.compile(regex_loc); static Pattern regexChr= Pattern.compile(regex_chr); public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { if (fieldPath.indexOf(LuceneBaseIndexer.xpathDelim)>0) return kNoChange; // skip subrec locations always ? //^ instead use addField(... doc, nodup) ? String chr=null; String sval= val.toString(); Matcher ma; ma= regexChr.matcher(sval); if (ma.lookingAt()) { chr= ma.group(1); int e= ma.end(); sval= sval.substring(e); // dont delete - caller wants full buf ?? } ma= regexLoc.matcher(sval); if (ma.find()) { String start = ma.group(); //(1) ? String stop = start; while( ma.find() ) stop = ma.group(); //(2) ? //(ma.groupCount()>1) ? ma.group(ma.groupCount()) : start; // ^^ not good, need to .find() last if (chr!=null) idx.addField(fieldName+".chr", chr, doc, withdups); String strand= (sval.indexOf("complement(")>=0) ? "-1" : "1"; idx.addField(fieldName+".start", start, doc, withdups); idx.addField(fieldName+".stop", stop, doc, withdups); idx.addField(fieldName+".strand", strand, doc, withdups); return kFieldAdded; } return kNoChange; } } /** * AddCommonField_FieldRecoder generate base fieldname fields when INDEX_XPATH=true this doesnt prevent default indexing of full xpath using property 'fieldalias.att_timestamp=timestamp' will collapse all for given field. addField( indexFieldName( currentFieldName), val, storeDoc, false); ^^^^ bypass this for some fields can we do part? - add last part of fieldPath to get span.start ? */ public static class AddCommonField_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { idx.addField( fieldName, val.toString(), doc, withdups); return kFieldAdded; } } /** * GFFAttribute_FieldRecoder */ public static class GFFAttribute_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { final static String regex_attr="(\\w+)=([^;\\s]+);?"; static Pattern regexAttr= Pattern.compile(regex_attr); static int debugc=0; public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { int ret= kNoChange; //if (fieldPath.indexOf(LuceneBaseIndexer.xpathDelim)>0) // return kNoChange; // skip subrec locations always ? Matcher ma= regexAttr.matcher(val.toString()); while (ma.find()) { String akey = ma.group(1); String aval = ma.group(2); idx.addField(akey, aval, doc, withdups); ret= kFieldAdded; if (debug && debugc++ < 30) logp.println("GFFAttribute."+fieldName+": "+akey+"="+aval); } return ret; } } /** * SeqDbxref_FieldRecoder recode this: dbxref='CG11023,FlyBase:FBan0011023' to CG11023 FBan0011023 -- this thing will strip out '' and other symbols tokenizer.db_xref=org.eugenes.index.BiodataAnalyzer$LowerWordTokenizer -- temp fixer to add FBan ID when missing but have CG/CR id -- current parser is not seeing separate ids? need to wordbreak at ,: */ public static class SeqDbxref_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { static Pattern regexCG = Pattern.compile("\\bC[GR](\\d+)"); public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { int ret= kNoChange; int len= val.length(); if (len>1) { if (val.charAt(len-1) == '\'') val.deleteCharAt(len-1); if (val.charAt(0) == '\'') val.deleteCharAt(0); ret= kValChanged; } // -- dont need this, just index property for word breaks //val.replace(0, 9999999, val.toString().replace(',',' ')); ret= kValChanged; int ian= val.indexOf("FBan"); if (ian<0) { Matcher ma= regexCG.matcher(val); if (ma.find()) { String idnum= ma.group(1); while(idnum.length()<7) idnum="0"+idnum; val.append(",FBan"+idnum); ret= kValChanged; } } return ret; } } /** * Swiss_SQ_FieldRecoder parser for swissprot/uniprot SQ line SQ SEQUENCE 262 AA; 28969 MW; DA87363A0D92BAF4 CRC64; */ public static class Swiss_SQ_FieldRecoder implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { static Pattern regexSQ = Pattern.compile("\\s+(\\S+)\\s+(\\S+)\\s*[;]"); public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { int ret= kNoChange; Matcher ma= regexSQ.matcher(val); while (ma.find()) { String aval= ma.group(1); String akey= ma.group(2); idx.addField( fieldName+LuceneBaseIndexer.xpathDelim+akey, aval, doc, withdups); ret= kSkipField | kFieldAdded; } return ret; } } /** BINDInter_Recoder for bindxml with Interaction_a and _b sections separately index fieldPath subfields under _a and _b */ public static class BINDInter_Recoder implements FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { String mol=null; if (fieldPath.indexOf("Interaction_a")>=0) mol="_a"; else if (fieldPath.indexOf("Interaction_b")>=0) mol="_b"; else return kNoChange; String sval= val.toString(); idx.addField( fieldName+mol, sval, doc, withdups); // add _mol field return kFieldAdded; } } public static class mainrecsave_Recoder implements FieldRecoder { public static String theVal=""; //?? public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { theVal= val.toString(); // or savedfields.put(fieldName, val.toString()); return kNoChange; } } public static class defline_Recoder implements FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { String sval= val.toString(); int at= sval.indexOf(' '); if (at>0) sval= sval.substring(0,at); if (sval.startsWith(">")) sval= sval.substring(1); idx.addField( "ID", sval, doc, nodups); if (mainrecsave_Recoder.theVal!=null) idx.addField( "ID2", mainrecsave_Recoder.theVal, doc, nodups); return kFieldAdded; } } public static class deflineTokens extends DataTokenizer { public deflineTokens(Reader in) { super(in); } public deflineTokens() { super(); } protected boolean isTokenChar(char c) { return !(c<=' ' || ",;:=()".indexOf(c)>=0); } protected char normalize(char c) { return Character.toLowerCase(c); } //usually case insense } public static class SymTokens extends DataTokenizer { public SymTokens(Reader in) { super(in); } public SymTokens() { super(); } protected boolean isTokenChar(char c) { return !Character.isWhitespace(c); } protected char normalize(char c) { return c; } } public static class LowerSymTokens extends DataTokenizer { public LowerSymTokens(Reader in) { super(in); } public LowerSymTokens() { super(); } protected boolean isTokenChar(char c) { return !Character.isWhitespace(c); } protected char normalize(char c) { return Character.toLowerCase(c); } } public static class dbxrefTokens extends DataTokenizer { public dbxrefTokens(Reader in) { super(in); } public dbxrefTokens() { super(); } protected boolean isTokenChar(char c) { return !(c<=' ' || ",;(){}[]<>|/".indexOf(c)>=0); } protected char normalize(char c) { return Character.toLowerCase(c); } //usually case insense } public static class words extends DataTokenizer { public words(Reader in) { super(in); } public words() { super(); } protected boolean isTokenChar(char c) { return Character.isLetterOrDigit(c); } protected char normalize(char c) { return Character.toLowerCase(c); } } public static class cvterms extends DataTokenizer { public cvterms(Reader in) { super(in); } public cvterms() { super(); } protected boolean isTokenChar(char c) { return !(c<' ' || c== ';' || c==','); }// allow whitespace in cvterm; but not newlines protected char normalize(char c) { return Character.toLowerCase(c); } } // ??change .cv suffix to _cv public static class CV_Recoder implements FieldRecoder { static Pattern cvterm= Pattern.compile("\\s*([^;]+)\\s*;?",Pattern.CASE_INSENSITIVE); public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { String sval= val.toString(); Matcher ma= cvterm.matcher(sval); // get all in field? or just .lookingAt() while(ma.find()) idx.addField(fieldName+".cv", ma.group(1), doc, withdups); return kFieldAdded; //? kSkipField or cut indexed parts and put rest in generic WORDs/XXX field ? } } /** * Location_FieldRecoder Parse flybase sequence location fields = genbank standard locations create three numeric index fields: fieldName.start, fieldName.stop, fieldName.strand (-1,1) BLOC|join(100..200,300..400) BLOC|complement(2000..3000) BLOC|1..2 -- also handle loc=2L:100..200,300..400 ; loc=2L:complement(2000..3000) create field fieldName.chr=2L if (\w+): */ public static class Location_Recoder implements FieldRecoder { //? final static String regex_loc="(\\d+)\\.\\.(\\d+)"; static Pattern regexLoc= Pattern.compile("(\\d+)"); static Pattern regexChr= Pattern.compile("(\\w+):"); public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { if (fieldPath.indexOf(LuceneBaseIndexer.xpathDelim)>0) return kNoChange; // skip subrec locations always ? //^ instead use addField(... doc, nodup) ? String chr=null; String sval= val.toString(); Matcher ma; ma= regexChr.matcher(sval); if (ma.lookingAt()) { chr= ma.group(1); int e= ma.end(); sval= sval.substring(e); // dont delete - caller wants full buf ?? } ma= regexLoc.matcher(sval); if (ma.find()) { String start = ma.group(); //(1) ? String stop = start; while( ma.find() ) stop = ma.group(); //(2) ? //(ma.groupCount()>1) ? ma.group(ma.groupCount()) : start; // ^^ not good, need to .find() last if (chr!=null) idx.addField(fieldName+".chr", chr, doc, withdups); String strand= (sval.indexOf("complement(")>=0) ? "-1" : "1"; idx.addField(fieldName+".start", start, doc, withdups); idx.addField(fieldName+".stop", stop, doc, withdups); idx.addField(fieldName+".strand", strand, doc, withdups); return kFieldAdded; } return kNoChange; } } public static class LocationTokens extends DataTokenizer { public LocationTokens(Reader in) { super(in); } public LocationTokens() { super(); } // need to keep "Chr:complement(1111..22222,33333..44444)" chars protected boolean isTokenChar(char c) { return !(c==';'||c<=' '); } protected char normalize(char c) { return c; } } public static class IDTokens extends DataTokenizer { public IDTokens(Reader in) { super(in); } public IDTokens() { super(); } protected boolean isTokenChar(char c) { return Character.isLetterOrDigit(c); } protected char normalize(char c) { return Character.toLowerCase(c); } } //## dateword: ~ /[^0-9]*([0-9]+)[ \/\-]*([A-Z][a-z][a-z])[a-z]*[ \/\-]*([0-9]+)/ ~ // use java date parser? public static class DateTokens extends DataTokenizer { public DateTokens(Reader in) { super(in); } public DateTokens() { super(); } protected boolean isTokenChar(char c) { return !(c<' ' || c==';'); } protected char normalize(char c) { return c; } } public static class DateFilter extends DataFilter { static SimpleDateFormat df1, df2, df3, df4, todf; static { df1 = new SimpleDateFormat("dd MMM yy"); df1.setLenient(true); df2 = new SimpleDateFormat("yyyy.MM.dd"); df2.setLenient(true); df3 = new SimpleDateFormat("yyyy"); df3.setLenient(true); df4 = new SimpleDateFormat("MM/dd/yy"); df4.setLenient(true); // what of dd/mm/yy or mm/dd/yy ? todf = new SimpleDateFormat("yyyyMMdd"); } // fbformat is "dd MMM yy" 28 Feb 04 public Token next() throws IOException { Token t = input.next(); if (t != null) try { int c; Date dt= null; String text = t.termText(); //text= text.trim(); // allows whitespace.. c= text.indexOf('/'); if (c>0 && (dt==null)) try { dt= df4.parse(text); } catch (Exception dx) {} c= text.indexOf('-'); if (c>0) text= text.replace('-','.'); c= text.indexOf('.'); if (dt==null) try { dt= df1.parse(text); } catch (Exception dx) {} if (c>0 && dt==null) try { dt= df2.parse(text); } catch (Exception dx) {} if (dt==null) { if (c>0) text= text.substring(0,c); try { dt= df3.parse(text); } catch (Exception dx) {} } //if (debug) logp.println("# datefilter in="+t.termText()+" out="+dt); if (dt==null) return null; // t or null? text = todf.format(dt); //DateField.dateToString(dt); return new Token( text, t.startOffset(), t.endOffset(), t.type()); } catch (Exception ex) { } //? eat it; this is mostly failing .. need to handle messy date formats better // 2004.5.17 -- in refs > drop '.', '-', .. return t; } } public static class LowerDataTokenizer extends DataTokenizer { public LowerDataTokenizer(Reader in) { super(in); } public LowerDataTokenizer() { super(); } protected char normalize(char c) { return Character.toLowerCase(c); } } public static class LowerWordTokenizer extends DataTokenizer { public LowerWordTokenizer(Reader in) { super(in); } public LowerWordTokenizer() { super(); } protected boolean isTokenChar(char c) { return Character.isLetterOrDigit(c); } protected char normalize(char c) { return Character.toLowerCase(c); } } public static class DebugFilter extends DataFilter { public Token next() throws IOException { Token t = input.next(); if (t == null) ; else if (debug) logp.println(this.getField()+":"+t.termText()); return t; } } public static class DebugEndOfRecordFilter extends DataFilter { public Token next() throws IOException { Token t = input.next(); if (t == null) ; else if (debug) { logp.println(this.getField()+":"+t.termText()); logp.println("-----------------"); logp.println(); } return t; } } public static class NumberFilter // should return null unless is numeric string extends DataFilter { public Token next() throws IOException { Token t = input.next(); if (t == null) return null; try { String text = t.termText(); // need word tokenizer + num parts: .+- int ival= 0; int c= text.indexOf('.'); if (c>0) text= text.substring(0,c); ival= Integer.parseInt(text); String nums = NumberField.numToString( ival ); // can except t= new Token( nums, t.startOffset(), t.endOffset(), t.type()); return t; } catch (Exception e) { return null; } //? eat it or throw IOException } } } // class