// fbacode.java /** * fbacode Data class and field specific parsers to handle flybase reference acode data. See also fbacode.properties which links these classes to general acode parser. .. some of these are not for Acode data; move out ? E.g., fbgnslim2.properties: field.docid=MAIN_ID fieldrecoder.ID=fbacode$FBMainID_Recoder -- decide if ID is for main record tokenizer.MAIN_ID=fbacode$IDTokens fieldrecoder.GSYM=fbacode$SYM_Recoder -- parses species field; create lowercase field tokenizer.GSYM=fbacode$SymTokens tokenizer.GSYM.lc=fbacode$LowerSymTokens tokenizer.ENZ=fbacode$words -- parse as words tokenizer.ENZ.cv=fbacode$cvterms -- parse as cv phrases fieldrecoder.ENZ=fbacode$GO_Recoder -- creates .cv subfield w/ cv term string, GO/EC ids Equivalent to these flybase srs/icarus/db/fbgn.is, fbrules.is parsers ## take FBgn's only pri_acc: ~ { $In:[fields c:{ID ID2}] $Out } # ( lrec? key ( /FBgn[0-9]+/ {if:$tp=='ID' $Uniq:ID else $Uniq} ( lrec? key ( /FBgn[0-9]+/ { if:$tp=='ID' $Uniq:ID else $Uniq} | /PFgn([0-9]+)/ { $Uniq:[s:"FBgn$1"] if:$tp=='ID' $Uniq:ID } | word | nonword )+ nl )+ ~ ## take FBgn's only ID: ~ { $In:[pri_acc c:ID] $Out } (/(FBgn|PFgn)[0-9]+/ { $Uniq $entryName=$Ct } | word | nonword)+ ~ SYN: ~ { $In:[preSYMlike c:{GSYM SYN}] $Out} ( /([0-9A-Za-z.]+)\\\\(.+)/ {$Uniq $Uniq:[s:$2]} | /.+/ {$Uniq} )* ~ SYM: ~ { $In:[preSYMlike c:GSYM] $Out } ( /([0-9A-Za-z.]+)\\\\(.+)/ {$Uniq $Uniq:[s:$2] $species=$1} | /.+/ {$Uniq} )* ~ # Enzyme - CV field + GO field + EC num field # 'serine-tRNA ligase ; GO:0004828 ; EC:6.1.1.11 .....' - new May20 # 'ENZ|arylalkylamine N-acetyltransferase == EC 2.3.1.87' - old enzfuncRecoder: ~ x{ $cv='' } ( word {$Uniq $StrApp:[$cv s:$Ct]} | / +/ {$StrApp:[$cv s:'_']} | /[^ ;\n]/ {$StrApp:[$cv s:$Ct]} )* x{ $Uniq:[s:$cv] $Uniq:[TERM s:$cv] $StrApp:[$cv s:'_'] $Uniq:[s:$cv] $Uniq:[TERM s:$cv] } ( /EC[ =:-]* / /[0-9-]+\\.[0-9-]+\\.[0-9\\.-]+/ { $Uniq $Uniq:[ECNUM s:$Ct] $Uniq:[s:"EC:$Ct"] } | /GO[ =:-]* / /[0-9]+/ { $Uniq $Uniq:[GO s:$Ct] $Uniq:[s:"GO:$Ct"] } | / *;/ | (/ *[|][^\n]+/) # skip comment about homology in new format | /[^\n]/ )* ~ $fbrules.ENZ -- enzfuncRecoder $fbrules.FNC -- enzfuncRecoder $fbrules.CEL -- enzfuncRecoder Usage, given location in dbs/lucegene/ that index script finds: bin/lucegene-index.sh -debug -l fbrftest >& tmp/log.fbrft Some SRS icarus fbgn.is rules ## fbid: ~ /[FP][BF][A-Za-z][A-Za-z][0-9]+/ ~ ## fbidpatt: ~ /(FB|PF)[a-z][a-z][A-Za-z0-9-]+/ ~ ## this one includes FBst-BL0123 stocks ## word: ~ /[0-9A-Za-z_]+/ ~ ## nonword: ~ (/[^0-9A-Za-z_\n]+/ | /\n +/) ~ ## symword: ~ /[^ \/|\n]+/ ~ # new style - sure delimiters ## glocword: ~ /[0-9A-Za-z_.\[\]\-]+/ ~ ## clocword: ~ /[<>*?-]*[0-9h][0-9A-Za-z_<>?+*-] / ~ */ import java.io.*; import java.util.*; import java.util.regex.*; import java.text.SimpleDateFormat; import org.eugenes.index.LuceneBaseIndexer; import org.eugenes.index.LuceneBaseIndexer.FieldRecoder; import org.eugenes.index.biodata.DataFilter; import org.eugenes.index.biodata.DataTokenizer; import org.eugenes.index.BiodataFilters.NumberField; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Token; import org.apache.lucene.document.Field; import org.apache.lucene.document.Document; import org.apache.lucene.document.DateField; public class fbacode { final static boolean nodups=true, withdups=false; // add >1 field value to doc? final static int kSubrecLevel = 1; // for fieldpath count static boolean debug = LuceneBaseIndexer.debug; static PrintStream logp = LuceneBaseIndexer.logp; /** small field recoder to add new main ID field for acode with multiple nested ID fields - really need to have main acode indexer handle this. */ public static class FBMainID_Recoder implements FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { if (fieldPath.indexOf(LuceneBaseIndexer.xpathDelim)>kSubrecLevel) return kNoChange; idx.addField("MAIN_ID", val.toString(), doc, nodups); return kFieldAdded; } } /** Greek_Recoder converts any SGML &agr; or to plain text */ public static class Greek_Recoder implements FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { String s= val.toString(); if (isgreek(s)) { val.replace(0, val.length(), greek2text(s)); //if (debug && !"contents".equals(fieldName)) //logp.println("# greek2text "+fieldName+":{"+s+"}={"+val+"}"); return kValChanged; } return kNoChange; } } /** replace FBMainID_Recoder for any field; adds "MAIN_"+fieldName -- use other syntax? */ public static class FBMainField_Recoder implements FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { if (fieldPath.indexOf(LuceneBaseIndexer.xpathDelim)>kSubrecLevel) return kNoChange; idx.addField("MAIN_"+fieldName, val.toString(), doc, nodups); return kFieldAdded; } } public static class FB_ALESR_Field_Recoder implements FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { boolean skip= !(fieldPath.endsWith("ALESR") || fieldPath.indexOf(LuceneBaseIndexer.xpathDelim)<=kSubrecLevel) ; //if (debug) logp.println("# ALESR "+fieldPath+"."+fieldName+" skip="+skip); if(skip) return kNoChange; idx.addField("MAIN_"+fieldName, val.toString(), doc, nodups); return kFieldAdded; } } /** add default gene class value if not already added EOR is a special case field sent at end of each record; dont' realy want to index it */ // public static class FBGN_EOR_Recoder implements FieldRecoder { // static String kDefaultClass="gene"; // static String kDefaultSpecies="Dmel"; // // public int recodeField(LuceneBaseIndexer idx, Document doc, // String fieldName, String fieldPath, StringBuffer val) // { // // should be able to use one class here and get FBGN/FBAN from fieldPath: GENR.* = FBgn; // //if (fieldPath.indexOf(LuceneBaseIndexer.xpathDelim)>1) return kNoChange; // boolean added= idx.addField("MAIN_CLA", kDefaultClass, doc, nodups); // added = added || idx.addField("species", kDefaultSpecies, doc, nodups); // if (debug) logp.println("# EOR "+fieldPath); // return kSkipField + kFieldAdded; // dont index or leave this to props ? // } // } //? need FB_EOSUBR_Recoder for FBgn/FBal ? public static class FB_EOR_Recoder implements FieldRecoder { static String kFBanClass="annotation"; // gene? now use annotation static String kFBgnClass="gene"; static String kFBalClass="allele"; static String kFBrfClass="paper"; static String kDefaultSpecies="Dmel"; public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { //if (fieldPath.indexOf(LuceneBaseIndexer.xpathDelim)>1) return kNoChange; String mclass=""; if (fieldPath.indexOf("ALESR")>=0) mclass=kFBalClass; else if (fieldPath.startsWith("GENR")) mclass=kFBgnClass; else if (fieldPath.startsWith("GADR")) mclass=kFBanClass; else if (fieldPath.startsWith("REFR")) mclass=kFBrfClass; boolean added= idx.addField("MAIN_CLA", mclass, doc, nodups); if ( mclass == kFBrfClass ) ; else { boolean addsp= idx.addField("species", kDefaultSpecies, doc, nodups); //? missing this added = added || addsp; } if (debug) logp.println("# EOR "+fieldPath); return kSkipField + kFieldAdded; } } // should rename this: FBidDBX_Recoder public static class Contents_Recoder implements FieldRecoder { // look for things in all fields text/word stream static Pattern fbid = Pattern.compile("\\b((FB|PF)[a-zA-Z]+\\d+)"); //static Pattern ecnum = Pattern.compile("EC:[0-9-]+\\.[0-9-]+\\.[0-9\\.-]+",Pattern.CASE_INSENSITIVE); public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { Matcher ma; int n=0; String sval= val.toString(); ma= fbid.matcher(sval); while(ma.find()) { idx.addField("DBX", ma.group(1), doc, withdups); n++; } //return kSkipField + kFieldAdded; return (n>0 ? kFieldAdded : kNoChange); } } public static class FBID_Recoder implements FieldRecoder { // look for things in all fields text/word stream //static Pattern fbid = Pattern.compile("\\b((FB|PF)[a-zA-Z]+\\d+)"); //static Pattern fbid = Pattern.compile("\\b([FP][BF][A-Za-z]+\\d+)"); static Pattern fbid = Pattern.compile("\\b(FB|PF)[A-Za-z]{2,5}[0-9]+"); public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { int n=0; String sval= val.toString(); Matcher ma= fbid.matcher(sval); while( ma.find() ) { idx.addField("DBX", ma.group(), doc, withdups); n++; } return (n>0 ? kFieldAdded : kNoChange); } } /** BINDInter_Recoder for bindxml with Interaction_a and _b sections separately index fieldPath subfields under _a and _b */ public static class BINDInter_Recoder implements FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { String mol=null; if (fieldPath.indexOf("Interaction_a")>=0) mol="_a"; else if (fieldPath.indexOf("Interaction_b")>=0) mol="_b"; else return kNoChange; String sval= val.toString(); idx.addField( fieldName+mol, sval, doc, withdups); // add _mol field return kFieldAdded; } } public static class defline_Recoder implements FieldRecoder { public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { String sval= val.toString(); int at= sval.indexOf(' '); if (at>0) sval= sval.substring(0,at); if (sval.startsWith(">")) sval= sval.substring(1); idx.addField( "ID", sval, doc, nodups); if (mainrecsave_Recoder.theVal!=null) idx.addField( "ID2", mainrecsave_Recoder.theVal, doc, nodups); return kFieldAdded; } } public static class mainrecsave_Recoder implements FieldRecoder { public static String theVal=""; //?? public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { theVal= val.toString(); // or savedfields.put(fieldName, val.toString()); return kNoChange; } } public static class deflineTokens extends DataTokenizer { public deflineTokens(Reader in) { super(in); } public deflineTokens() { super(); } protected boolean isTokenChar(char c) { return !(c<=' ' || ",;:=()".indexOf(c)>=0); } protected char normalize(char c) { return Character.toLowerCase(c); } //usually case insense } /** get species from symbol */ public static class SYM_Recoder implements FieldRecoder { static Pattern regex= Pattern.compile("(\\w[^\\\\]+)\\\\"); // awful patt for \ escape char static String kDefaultSpecies="Dmel"; public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { String sval= val.toString(); Matcher ma= regex.matcher(sval); int ret= 0; if (ma.lookingAt()) { if (fieldPath.indexOf(LuceneBaseIndexer.xpathDelim)<=kSubrecLevel) { // ==1 ? idx.addField("species", ma.group(1), doc, nodups); // do this only for MAIN record ret |= kFieldAdded; } int e= ma.end(); val.append(' ').append(sval.substring(e)); //e+1? and add non-species to main field ret |= kValChanged; } else { //? move this into FBGN_EOR_Recoder, nodups ? //idx.addField("species", kDefaultSpecies, doc, withdups); //ret |= kFieldAdded; } // changed .lc to .cs (case-sense); make lowercase default field search // ?? change .cs to _cs for backward compat; more sensible idx.addField(fieldName+".cs", val.toString(), doc, withdups); // 2nd copy for case-insens? ret |= kFieldAdded; return ret; } } public static class SymTokens extends DataTokenizer { public SymTokens(Reader in) { super(in); } public SymTokens() { super(); } protected boolean isTokenChar(char c) { return !Character.isWhitespace(c); } protected char normalize(char c) { return c; } } public static class LowerSymTokens extends DataTokenizer { public LowerSymTokens(Reader in) { super(in); } public LowerSymTokens() { super(); } protected boolean isTokenChar(char c) { return !Character.isWhitespace(c); } protected char normalize(char c) { return Character.toLowerCase(c); } } public static class dbxrefTokens extends DataTokenizer { public dbxrefTokens(Reader in) { super(in); } public dbxrefTokens() { super(); } protected boolean isTokenChar(char c) { return !(c<=' ' || ",;(){}[]<>|/".indexOf(c)>=0); } protected char normalize(char c) { return Character.toLowerCase(c); } //usually case insense } public static class words extends DataTokenizer { public words(Reader in) { super(in); } public words() { super(); } protected boolean isTokenChar(char c) { return Character.isLetterOrDigit(c); } protected char normalize(char c) { return Character.toLowerCase(c); } } public static class cvterms extends DataTokenizer { public cvterms(Reader in) { super(in); } public cvterms() { super(); } protected boolean isTokenChar(char c) { return !(c<' ' || c== ';' || c==','); }// allow whitespace in cvterm; but not newlines protected char normalize(char c) { return Character.toLowerCase(c); } } // ??change .cv suffix to _cv public static class CV_Recoder implements FieldRecoder { static Pattern cvterm= Pattern.compile("\\s*([^;]+)\\s*;?",Pattern.CASE_INSENSITIVE); public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { String sval= val.toString(); Matcher ma= cvterm.matcher(sval); // get all in field? or just .lookingAt() while(ma.find()) idx.addField(fieldName+".cv", ma.group(1), doc, withdups); return kFieldAdded; //? kSkipField or cut indexed parts and put rest in generic WORDs/XXX field ? } } /** Allele class recoder -- put direct to 'docclass'? or ALC.cv ## ALC + CLA for alleles == class; CLA == wild-type generic if no ALC? fieldrecoder.CLA=fbacode$FB_ALESR_Field_Recoder,fbacode$ALC_Recoder fieldrecoder.ALC=fbacode$ALC_Recoder */ public static class ALC_Recoder implements FieldRecoder { //ALC|antimorph with @Scer\GAL4GMR.PF@ static String ALCfield = "ALC.cv"; // not docclass? static Pattern withx = Pattern.compile("\\s*(with)\\s*"); static Pattern symbol = PhenotypeCV_Recoder.symbol; //Pattern.compile("@([^@]+)@"); static Pattern cvterm = Pattern.compile("\\s*(\\w[^;,&(|\\\\]+)\\s*"); // require leading char so '/+' is skipped public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { boolean skip= ( "CLA".equals(fieldName) && fieldPath.indexOf(LuceneBaseIndexer.xpathDelim)<=kSubrecLevel) ; if (skip) return kNoChange; String sval= val.toString(); // this is full field with newlines ! do by line? sval= symbol.matcher(sval).replaceAll(";"); sval= withx.matcher(sval).replaceAll(";"); Matcher ma= cvterm.matcher(sval); while(ma.find()) { String cv= ma.group(1).trim(); if (cv.length()>1) idx.addField( ALCfield, cv, doc, withdups); //? no dups } return kFieldAdded; } } public static class MU_Recoder extends PhenotypeCV_Recoder { static Pattern escaped = Pattern.compile("\\\\[^\n]*"); //what are all these escapes for? ' MU:spontaneous \mutator' //MU:recombination \between transposable elements public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { String sval= val.toString(); sval= escaped.matcher(sval).replaceAll("\n"); val.replace(0, val.length(), sval); return super.recodeField(idx, doc, fieldName, fieldPath, val); } } public static class PhenotypeCV_Recoder implements FieldRecoder { static Pattern cvterm = Pattern.compile("\\s*(\\w[^;,&(|\\\\]+)\\s*[;,&(|\\\\]?"); // require leading char so '/+' is skipped static Pattern prewith= Pattern.compile("\\(with\\s+(.+)\\) "); // need trailing space; problems static Pattern symbol = Pattern.compile("@([^@]+)@"); static Pattern squiggles = Pattern.compile("\\{[^}]*\\}"); static Pattern parens = Pattern.compile("\\([^)]*\\)"); static Pattern withx = Pattern.compile("\\s*(of|by|with)\\s*"); static Pattern alesym = Pattern.compile("\\w+\\[.+\\]\\S*"); // still get this occasional symbol GIC.cv:sxl[-] // still get some symbols as cv: scer\gal4[1] public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { Matcher ma; String sval= val.toString(); // this is full field with newlines ! do by line? //if (debug) logp.println("#"+fieldName+"="+sval); StringBuffer cvbuf= new StringBuffer(); ma= prewith.matcher(sval); while(ma.find()) { idx.addField("symbols", ma.group(1), doc, withdups); ma.appendReplacement(cvbuf,";"); } ma.appendTail(cvbuf); sval= cvbuf.toString(); cvbuf.setLength(0); ma= symbol.matcher(sval); while(ma.find()) { idx.addField("symbols", ma.group(1), doc, withdups); ma.appendReplacement(cvbuf,";"); } ma.appendTail(cvbuf); sval= cvbuf.toString(); cvbuf.setLength(0); sval= withx.matcher(sval).replaceAll(";"); sval= squiggles.matcher(sval).replaceAll(";"); sval= parens.matcher(sval).replaceAll(";"); ma= cvterm.matcher(sval); while(ma.find()) { String cv= ma.group(1).trim(); if (cv.length()>1 && !alesym.matcher(cv).matches()) idx.addField(fieldName+".cv", cv, doc, withdups); } return kFieldAdded; //? kSkipField or cut indexed parts and put rest in generic WORDs/XXX field ? } } public static class GO_Recoder implements FieldRecoder { static Pattern goterm= Pattern.compile("\\s*([^;]+)\\s*;?",Pattern.CASE_INSENSITIVE); static Pattern goid = Pattern.compile("\\bGO:[0-9-]+",Pattern.CASE_INSENSITIVE); static Pattern ecnum = Pattern.compile("EC:[0-9-]+\\.[0-9-]+\\.[0-9\\.-]+",Pattern.CASE_INSENSITIVE); public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { String sval= val.toString(); Matcher ma; ma= goterm.matcher(sval); if(ma.lookingAt()) idx.addField(fieldName+".cv", ma.group(1), doc, withdups); //? put the ids in separate field? .go .ec ... //? or separate toplevel field: GOID, ECID, or generic DBXREF ? - ACC/AccNumber now in SRS index ma= goid.matcher(sval); while(ma.find()) idx.addField("DBX", ma.group(), doc, withdups); //fieldName+".cv" ma= ecnum.matcher(sval); while(ma.find()) idx.addField("DBX", ma.group(), doc, withdups); //fieldName+".cv" return kFieldAdded; //? kSkipField or cut indexed parts and put rest in generic WORDs/XXX field ? } } public static class FBbtId_Recoder // apply to FBcv.acode/LNK field now implements FieldRecoder { static Pattern btid = Pattern.compile("\\bFBbt:[0-9-]+",Pattern.CASE_INSENSITIVE); public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { String sval= val.toString(); Matcher ma= btid.matcher(sval); if(ma.find()) { idx.addField("docid", ma.group(), doc, nodups); return kFieldAdded; } return kNoChange; } } /** * Location_FieldRecoder Parse flybase sequence location fields = genbank standard locations create three numeric index fields: fieldName.start, fieldName.stop, fieldName.strand (-1,1) BLOC|join(100..200,300..400) BLOC|complement(2000..3000) BLOC|1..2 -- also handle loc=2L:100..200,300..400 ; loc=2L:complement(2000..3000) create field fieldName.chr=2L if (\w+): */ public static class Location_Recoder implements FieldRecoder { //? final static String regex_loc="(\\d+)\\.\\.(\\d+)"; static Pattern regexLoc= Pattern.compile("(\\d+)"); static Pattern regexChr= Pattern.compile("(\\w+):"); public int recodeField(LuceneBaseIndexer idx, Document doc, String fieldName, String fieldPath, StringBuffer val) { if (fieldPath.indexOf(LuceneBaseIndexer.xpathDelim)>0) return kNoChange; // skip subrec locations always ? //^ instead use addField(... doc, nodup) ? String chr=null; String sval= val.toString(); Matcher ma; ma= regexChr.matcher(sval); if (ma.lookingAt()) { chr= ma.group(1); int e= ma.end(); sval= sval.substring(e); // dont delete - caller wants full buf ?? } ma= regexLoc.matcher(sval); if (ma.find()) { String start = ma.group(); //(1) ? String stop = start; while( ma.find() ) stop = ma.group(); //(2) ? //(ma.groupCount()>1) ? ma.group(ma.groupCount()) : start; // ^^ not good, need to .find() last if (chr!=null) idx.addField(fieldName+".chr", chr, doc, withdups); String strand= (sval.indexOf("complement(")>=0) ? "-1" : "1"; idx.addField(fieldName+".start", start, doc, withdups); idx.addField(fieldName+".stop", stop, doc, withdups); idx.addField(fieldName+".strand", strand, doc, withdups); return kFieldAdded; } return kNoChange; } } public static class LocationTokens extends DataTokenizer { public LocationTokens(Reader in) { super(in); } public LocationTokens() { super(); } // need to keep "Chr:complement(1111..22222,33333..44444)" chars protected boolean isTokenChar(char c) { return !(c==';'||c<=' '); } protected char normalize(char c) { return c; } } public static class IDTokens extends DataTokenizer { public IDTokens(Reader in) { super(in); } public IDTokens() { super(); } protected boolean isTokenChar(char c) { return Character.isLetterOrDigit(c); } protected char normalize(char c) { return Character.toLowerCase(c); } } //## dateword: ~ /[^0-9]*([0-9]+)[ \/\-]*([A-Z][a-z][a-z])[a-z]*[ \/\-]*([0-9]+)/ ~ // use java date parser? public static class DateTokens extends DataTokenizer { public DateTokens(Reader in) { super(in); } public DateTokens() { super(); } protected boolean isTokenChar(char c) { return !(c<' ' || c==';'); } protected char normalize(char c) { return c; } } public static class DateFilter extends DataFilter { static SimpleDateFormat df1, df2, df3, df4, todf; static { df1 = new SimpleDateFormat("dd MMM yy"); df1.setLenient(true); df2 = new SimpleDateFormat("yyyy.MM.dd"); df2.setLenient(true); df3 = new SimpleDateFormat("yyyy"); df3.setLenient(true); df4 = new SimpleDateFormat("MM/dd/yy"); df4.setLenient(true); // what of dd/mm/yy or mm/dd/yy ? todf = new SimpleDateFormat("yyyyMMdd"); } // fbformat is "dd MMM yy" 28 Feb 04 public Token next() throws IOException { Token t = input.next(); if (t != null) try { int c; Date dt= null; String text = t.termText(); //text= text.trim(); // allows whitespace.. c= text.indexOf('/'); if (c>0 && (dt==null)) try { dt= df4.parse(text); } catch (Exception dx) {} c= text.indexOf('-'); if (c>0) text= text.replace('-','.'); c= text.indexOf('.'); if (dt==null) try { dt= df1.parse(text); } catch (Exception dx) {} if (c>0 && dt==null) try { dt= df2.parse(text); } catch (Exception dx) {} if (dt==null) { if (c>0) text= text.substring(0,c); try { dt= df3.parse(text); } catch (Exception dx) {} } //if (debug) logp.println("# datefilter in="+t.termText()+" out="+dt); if (dt==null) return null; // t or null? text = todf.format(dt); //DateField.dateToString(dt); return new Token( text, t.startOffset(), t.endOffset(), t.type()); } catch (Exception ex) { } //? eat it; this is mostly failing .. need to handle messy date formats better // 2004.5.17 -- in refs > drop '.', '-', .. return t; } } public static class LowerDataTokenizer extends DataTokenizer { public LowerDataTokenizer(Reader in) { super(in); } public LowerDataTokenizer() { super(); } protected char normalize(char c) { return Character.toLowerCase(c); } } public static class LowerWordTokenizer extends DataTokenizer { public LowerWordTokenizer(Reader in) { super(in); } public LowerWordTokenizer() { super(); } protected boolean isTokenChar(char c) { return Character.isLetterOrDigit(c); } protected char normalize(char c) { return Character.toLowerCase(c); } } public static class DebugFilter extends DataFilter { public Token next() throws IOException { Token t = input.next(); if (t == null) ; else if (debug) logp.println(this.getField()+":"+t.termText()); return t; } } public static class DebugEndOfRecordFilter extends DataFilter { public Token next() throws IOException { Token t = input.next(); if (t == null) ; else if (debug) { logp.println(this.getField()+":"+t.termText()); logp.println("-----------------"); logp.println(); } return t; } } public static class NumberFilter // should return null unless is numeric string extends DataFilter { public Token next() throws IOException { Token t = input.next(); if (t == null) return null; try { String text = t.termText(); // need word tokenizer + num parts: .+- int ival= 0; int c= text.indexOf('.'); if (c>0) text= text.substring(0,c); ival= Integer.parseInt(text); String nums = NumberField.numToString( ival ); // can except t= new Token( nums, t.startOffset(), t.endOffset(), t.type()); return t; } catch (Exception e) { return null; } //? eat it or throw IOException } } // not much use as filter given that most tokenizers skip ';' -- use in recoder chain public static class GreekFilter extends DataFilter { public Token next() throws IOException { Token t = input.next(); if (t == null) return null; String term= t.termText(); if (isgreek(term)) { String text= greek2text(term); return new Token( text, t.startOffset(), t.endOffset(), t.type()); } else return t; } } public final static String[] updownSGML= { "", "", "", "" }; public final static String[] updownTEXT= { "[", "]", "[[", "]]" }; public static String[] greekSGML = { "&agr;", "&Agr;", "&bgr;", "&Bgr;", "&dgr;", "&Dgr;", "&egr;", "&Egr;", "&ggr;", "&Ggr;", "&kgr;", "&Kgr;", "&lgr;", "&Lgr;", "&ngr;", "&Ngr;", "&pgr;", "&Pgr;", "&ohgr;", "&OHgr;", "&zgr;", "&Zgr;", "&psgr;", "&PSgr;", "&eegr;", "&EEgr;", "&thgr;", "&THgr;", "&igr;", "&Igr;", "&mgr;", "&Mgr;", "&xgr;", "&Xgr;", "&ogr;", "&Ogr;", "&rgr;", "&Rgr;", "&sgr;", "&Sgr;", "&tgr;", "&Tgr;", "&ugr;", "&Ugr;", "&phgr;", "&PHgr;", "&khgr;", "&KHgr;" }; public static String[] greekTEXT = { "alpha", "Alpha", "beta", "Beta", "delta", "Delta", "epsilon", "Epsilon", "gamma", "Gamma", "kappa","Kappa", "lambda", "Lambda", "nu", "Nu", "pi", "Pi", "omega", "Omega", "zeta", "Zeta", "psi", "Psi", "eta", "Eta", "theta", "Theta", "iota", "Iota", "mu", "Mu", "xi", "Xi", "omicron","Omicron", "rho", "Rho", "sigma", "Sigma", "tau", "Tau", "upsilon", "Upsilon", "phi", "Phi", "chi", "Chi" }; static Pattern anyxPattern = Pattern.compile("(&\\w{1,3}gr;||[&<])"); static Pattern greekPattern = Pattern.compile("(&\\w{1,3}gr;)"); // agr, pgr... static Pattern updownPattern = Pattern.compile("()"); static Properties greektextmap = makemap(greekSGML, greekTEXT ); static Properties updntextmap = makemap(updownSGML, updownTEXT ); static Properties makemap(String[] alist, String[] tolist) { Properties p= new Properties(); for (int i=0; i