// LucegeneIndexers.java
/**
* LucegeneIndexers
Data-class and field-specific parsers that handle
FlyBase and other biology data for LuceGene indexing.
The lucene-indexer.sh script will recompile and use these.
$PROP_ROOT/dataclass.properties should register them as
fieldrecoder.FIELD=classname
tokenfilter.FIELD=classname
tokenizer.FIELD=classname
E.g.,
go.properties:
fieldrecoder.LNK=LucegeneIndexers$GOID_FieldRecoder
fban.properties:
fieldrecoder.BLOC=LucegeneIndexers$Location_FieldRecoder
Note (Apr 2004): split out the Game XML specific ones to another main class?
*/
import java.util.regex.*;
import org.eugenes.index.LuceneBaseIndexer;
import org.eugenes.index.BiodataAnalyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
public class LucegeneIndexers
{
// Named values for the boolean passed as the last argument of idx.addField(...) by the
// recoders in this class.
// NOTE(review): presumably 'nodups' asks addField to skip a value the field already holds and
// 'withdups' allows repeated values -- confirm against LuceneBaseIndexer.addField.
final static boolean nodups=true, withdups=false; // add >1 field value to doc?
/**
 * GOID_FieldRecoder
 * Parses a FlyBase FBgo.acode field value such as LNK|GO:000000 and, when it contains a
 * GO id, adds that id to the document as a separate "GOID" index field.
 * (The source data should really be changed instead.)
 * The "GOID" field is intended to be used for the docid (is a GOID intermediate needed?).
 * Matching is case-insensitive and only the first GO id in the value is used.
 */
public static class GOID_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder {

  //final public int kNoChange= 0, kValChanged=1, kFieldAdded=2, kSkipField=4; // | OR these flags?

  /** Pattern text: "GO:" at a word boundary followed by one or more digits. */
  final static String regex_goid="\\bGO:\\d+";
  static Pattern regexGOID= Pattern.compile(regex_goid,Pattern.CASE_INSENSITIVE);

  /**
   * Adds a "GOID" field holding the first GO id found in the value.
   *
   * @param idx       indexer through which the new field is added
   * @param doc       document handed on to idx.addField
   * @param fieldName name of the field being recoded (not used here)
   * @param fieldPath path of the field being recoded (not used here)
   * @param val       field value to scan; it is not modified
   * @return kFieldAdded when a GO id was found and added, otherwise kNoChange
   */
  public int recodeField(LuceneBaseIndexer idx, Document doc,
      String fieldName, String fieldPath, StringBuffer val)
  {
    // find() locates the next occurrence anywhere in the value;
    // matches() would require the whole value, lookingAt() only a prefix.
    Matcher goMatch = regexGOID.matcher(val);
    if (!goMatch.find()) {
      return kNoChange;
    }
    // Later GO ids in the same value are ignored.
    idx.addField("GOID", goMatch.group(), doc, nodups);
    return kFieldAdded;
  }
}
/**
 * FBMainID_FieldRecoder
 * Small field recoder that adds a "MAIN_ID" field for acode records that carry several
 * nested ID fields. Really the main acode indexer ought to handle this.
 */
public static class FBMainID_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder {

  /**
   * Copies the value into a "MAIN_ID" field unless the field path looks nested.
   *
   * @param idx       indexer through which the new field is added
   * @param doc       document handed on to idx.addField
   * @param fieldName name of the field being recoded (not used here)
   * @param fieldPath path of the field; it counts as nested when
   *                  LuceneBaseIndexer.xpathDelim occurs at index 2 or later
   * @param val       field value; it is not modified
   * @return kNoChange for nested paths, otherwise kFieldAdded
   */
  public int recodeField(LuceneBaseIndexer idx, Document doc,
      String fieldName, String fieldPath, StringBuffer val)
  {
    // NOTE(review): the threshold here is "> 1" while Location_FieldRecoder uses "> 0"
    // for the same kind of sub-record check -- confirm which one is intended.
    int delimAt = fieldPath.indexOf(LuceneBaseIndexer.xpathDelim);
    boolean nested = delimAt > 1;
    if (nested) {
      return kNoChange; // skip subrec locations always ?
    }
    String mainId = val.toString();
    idx.addField("MAIN_ID", mainId, doc, nodups); // or docid?
    return kFieldAdded;
  }
}
/**
 * Location_FieldRecoder
 * Parses FlyBase sequence location fields (GenBank standard locations) and creates three
 * index fields: fieldName.start, fieldName.stop and fieldName.strand (-1 or 1).
 * Examples of handled values:
 *   BLOC|join(100..200,300..400)
 *   BLOC|complement(2000..3000)
 *   BLOC|1..2
 * Also handles a leading chromosome prefix, e.g.
 *   loc=2L:100..200,300..400 ; loc=2L:complement(2000..3000)
 * in which case fieldName.chr (2L in these examples) is added as well.
 */
public static class Location_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder {

  /** Word characters followed by ':'; only tried at the start of the value. */
  final static String regex_chr="(\\w+):";
  /** A single run of digits; start and stop are the first and last runs found. */
  final static String regex_loc="(\\d+)";
  //? final static String regex_loc="(\\d+)\\.\\.(\\d+)";
  static Pattern regexLoc= Pattern.compile(regex_loc);
  static Pattern regexChr= Pattern.compile(regex_chr);

  /**
   * Adds .chr (optional), .start, .stop and .strand fields derived from a location value.
   *
   * @param idx       indexer through which the new fields are added
   * @param doc       document handed on to idx.addField
   * @param fieldName base name for the generated fields
   * @param fieldPath path of the field; when LuceneBaseIndexer.xpathDelim occurs after
   *                  index 0 the field is treated as a sub-record location and skipped
   * @param val       location text; it is not modified
   * @return kFieldAdded when at least one number was found, otherwise kNoChange
   */
  public int recodeField(LuceneBaseIndexer idx, Document doc,
      String fieldName, String fieldPath, StringBuffer val)
  {
    if (fieldPath.indexOf(LuceneBaseIndexer.xpathDelim) > 0) {
      return kNoChange; // skip subrec locations always ?
      //^ instead use addField(... doc, nodup) ?
    }

    String chromosome = null;
    String locText = val.toString();

    // Peel off a leading "chr:" prefix from the local copy only;
    // the caller's buffer keeps the full text.
    Matcher chrMatch = regexChr.matcher(locText);
    if (chrMatch.lookingAt()) {
      chromosome = chrMatch.group(1);
      locText = locText.substring(chrMatch.end());
    }

    Matcher numMatch = regexLoc.matcher(locText);
    if (!numMatch.find()) {
      return kNoChange;
    }

    // First number is the start; keep scanning so the last number becomes the stop.
    String first = numMatch.group();
    String last = first;
    while (numMatch.find()) {
      last = numMatch.group();
    }

    String strand;
    if (locText.indexOf("complement(") < 0) {
      strand = "1";
    } else {
      strand = "-1";
    }

    if (chromosome != null) {
      idx.addField(fieldName+".chr", chromosome, doc, withdups);
    }
    idx.addField(fieldName+".start", first, doc, withdups);
    idx.addField(fieldName+".stop", last, doc, withdups);
    idx.addField(fieldName+".strand", strand, doc, withdups);
    return kFieldAdded;
  }
}
/**
 * GFFAttribute_FieldRecoder
 * Splits an attribute string of key=value pairs (optionally ';'-terminated) and adds each
 * pair to the document as its own field, named after the key.
 */
public static class GFFAttribute_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder {

  /** key = word characters; value = everything up to the next ';' or whitespace. */
  final static String regex_attr="(\\w+)=([^;\\s]+);?";
  static Pattern regexAttr= Pattern.compile(regex_attr);
  /** Number of debug lines attempted so far; static, so shared by all instances. */
  static int debugc=0;

  /**
   * Adds one field per key=value pair found in the value.
   *
   * @param idx       indexer through which the new fields are added
   * @param doc       document handed on to idx.addField
   * @param fieldName name of the field being recoded (used only in the debug message)
   * @param fieldPath path of the field being recoded (not used here)
   * @param val       attribute text; it is not modified
   * @return kFieldAdded when at least one pair was added, otherwise kNoChange
   */
  public int recodeField(LuceneBaseIndexer idx, Document doc,
      String fieldName, String fieldPath, StringBuffer val)
  {
    //if (fieldPath.indexOf(LuceneBaseIndexer.xpathDelim)>0)
    // return kNoChange; // skip subrec locations always ?
    Matcher attrs = regexAttr.matcher(val.toString());
    int result = kNoChange;
    for (boolean more = attrs.find(); more; more = attrs.find()) {
      String key = attrs.group(1);
      String value = attrs.group(2);
      idx.addField(key, value, doc, withdups);
      result = kFieldAdded;
      if (LuceneBaseIndexer.debug) {
        // Only the first 30 pairs seen while debugging are logged; the counter keeps rising.
        int seen = debugc++;
        if (seen < 30) {
          LuceneBaseIndexer.logp.println("GFFAttribute."+fieldName+": "+key+"="+value);
        }
      }
    }
    return result;
  }
}
/**
 * SeqDbxref_FieldRecoder
 * Recodes a value like dbxref='CG11023,FlyBase:FBan0011023'
 * so that it indexes as CG11023 FBan0011023.
 * -- this recoder only strips the surrounding '' quotes; other symbols are handled by
 *    tokenizer.db_xref=org.eugenes.index.BiodataAnalyzer$LowerWordTokenizer
 * -- temp fixer: adds an FBan ID when it is missing but a CG/CR id is present
 * -- current parser is not seeing separate ids? need to wordbreak at ,:
 */
public static class SeqDbxref_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder {

  /** CG or CR at a word boundary followed by digits; group 1 is the number. */
  static Pattern regexCG = Pattern.compile("\\bC[GR](\\d+)");

  /**
   * Edits the value in place: removes one trailing and one leading single quote (only for
   * values longer than one character) and, when the value has no "FBan" id but does have a
   * CG/CR id, appends ",FBan" plus the CG/CR number left-padded with zeros to 7 digits.
   *
   * @param idx       indexer (not used here)
   * @param doc       document (not used here)
   * @param fieldName name of the field being recoded (not used here)
   * @param fieldPath path of the field being recoded (not used here)
   * @param val       field value; modified in place
   * @return kValChanged when the buffer was actually modified, otherwise kNoChange
   */
  public int recodeField(LuceneBaseIndexer idx, Document doc,
      String fieldName, String fieldPath, StringBuffer val)
  {
    int ret= kNoChange;
    int len= val.length();
    if (len>1) {
      // Report a change only when a quote is really removed; previously every value
      // longer than one character was flagged as changed.
      if (val.charAt(len-1) == '\'') {
        val.deleteCharAt(len-1);
        ret= kValChanged;
      }
      // len>1 guarantees at least one character is left at this point.
      if (val.charAt(0) == '\'') {
        val.deleteCharAt(0);
        ret= kValChanged;
      }
    }
    // -- dont need this, just index property for word breaks
    //val.replace(0, 9999999, val.toString().replace(',',' ')); ret= kValChanged;
    if (val.indexOf("FBan") < 0) {
      Matcher ma= regexCG.matcher(val);
      if (ma.find()) {
        // Copy the number out before the buffer under the matcher is appended to.
        String idnum= ma.group(1);
        while (idnum.length()<7) idnum= "0"+idnum;
        val.append(",FBan"+idnum);
        ret= kValChanged;
      }
    }
    return ret;
  }
}
/**
 * Swiss_SQ_FieldRecoder
 * Parser for the swissprot/uniprot SQ line, e.g.
 *   SQ SEQUENCE 262 AA; 28969 MW; DA87363A0D92BAF4 CRC64;
 * Each "value key;" pair becomes its own field named fieldName + xpathDelim + key,
 * so "262 AA;" is added as value 262 under the key AA.
 */
public static class Swiss_SQ_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder {

  /** whitespace, value (group 1), whitespace, key (group 2), optional whitespace, ';' */
  static Pattern regexSQ = Pattern.compile("\\s+(\\S+)\\s+(\\S+)\\s*[;]");

  /**
   * Adds one field per "value key;" pair found in the SQ line.
   *
   * @param idx       indexer through which the new fields are added
   * @param doc       document handed on to idx.addField
   * @param fieldName prefix for the generated field names
   * @param fieldPath path of the field being recoded (not used here)
   * @param val       SQ line text; it is not modified
   * @return kSkipField | kFieldAdded when at least one pair was added, otherwise kNoChange
   */
  public int recodeField(LuceneBaseIndexer idx, Document doc,
      String fieldName, String fieldPath, StringBuffer val)
  {
    Matcher pairs = regexSQ.matcher(val);
    int result = kNoChange;
    while (pairs.find()) {
      // Note the order in the data: the value comes first, its key second.
      String value = pairs.group(1);
      String key = pairs.group(2);
      String target = fieldName+LuceneBaseIndexer.xpathDelim+key;
      idx.addField(target, value, doc, withdups);
      result = kSkipField | kFieldAdded;
    }
    return result;
  }
}
/**
 * AddCommonField_FieldRecoder
 * Generates fields under the base field name when INDEX_XPATH=true.
 * This does not prevent the default indexing of the full xpath;
 * using the property 'fieldalias.att_timestamp=timestamp' will collapse all
 * of them for a given field.
 *   addField( indexFieldName( currentFieldName), val, storeDoc, false);
 *   ^^^^ bypass this for some fields
 * Can we do part? - add the last part of fieldPath to get span.start ?
 * E.g. want common span.start,end fields for these:
 *   field=game.annotation.feature_set.feature_span.seq_relationship.span.end
 *   field=game.annotation.feature_set.feature_span.seq_relationship.span.start
 *   field=game.computational_analysis.result_set.result_span.seq_relationship.span.end
 *   field=game.computational_analysis.result_set.result_span.seq_relationship.span.start
 *   field=game.computational_analysis.result_set.seq_relationship.span.end
 *   field=game.computational_analysis.result_set.seq_relationship.span.start
 */
public static class AddCommonField_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder {

  /**
   * Adds the value to the document under the plain field name.
   *
   * @param idx       indexer through which the field is added
   * @param doc       document handed on to idx.addField
   * @param fieldName name the value is added under
   * @param fieldPath path of the field being recoded (not used here)
   * @param val       field value; it is not modified
   * @return always kFieldAdded
   */
  public int recodeField(LuceneBaseIndexer idx, Document doc,
      String fieldName, String fieldPath, StringBuffer val)
  {
    String text = val.toString();
    idx.addField(fieldName, text, doc, withdups);
    return kFieldAdded;
  }
}
/**
 * GameSpan_FieldRecoder
 * -- need to check the enclosing xml seq_relationship.type = "query" or "subject"
 * -- skip indexing subject data; query == genome locations
 * -- need to look at prior doc fields
 * ## double urk: all scaffold query span.start,end are
 * ## RELATIVE TO scaffold span.tile_start,tile_end
 * ## are any of them reversed ??
 * ## need also to recode spans for output ...
 */
public static class GameSpan_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder {

  /**
   * For "query" spans, adds the numeric value shifted by the scaffold tile offset under
   * fieldName (plus the current "arm" value, if any). For every other type, files the raw
   * value under fieldPath + xpathDelim + type + "_" + fieldName and empties the buffer.
   *
   * @param idx       indexer used to look up earlier fields and to add the new ones
   * @param doc       document handed on to idx.addField
   * @param fieldName name of the span field being recoded
   * @param fieldPath path of the field; the part up to and including its last delimiter
   *                  is used as prefix when looking up the "att_type" value
   * @param val       span value; cleared (length set to 0) in the non-query case
   * @return kFieldAdded for query spans, kFieldAdded + kSkipField otherwise
   * @throws NumberFormatException if the value or a tile_start/tile_end value found for a
   *         query span is not a parsable integer
   */
  public int recodeField(LuceneBaseIndexer idx, Document doc,
      String fieldName, String fieldPath, StringBuffer val)
  {
    // Parent prefix = fieldPath up to and including its last delimiter (if any past index 0).
    String parent = fieldPath;
    int cut = parent.lastIndexOf(LuceneBaseIndexer.xpathDelim);
    if (cut > 0) {
      parent = parent.substring(0, cut+1);
    }

    String relType = idx.getLastField(parent+"att_type",0); // good only for INDEX_XPATH=true
    if (relType == null) {
      relType = idx.getLastField("att_type",0);
    }
    // ^? try also parent.lastElement + att_type ?
    //if (LuceneBaseIndexer.debug)
    //LuceneBaseIndexer.logp.println("GameSpan."+fieldPath+"."+fieldName+" type="+stype+" val="+val);

    if (!"query".equals(relType)) {
      String label = (relType == null) ? "untyped" : relType;
      String target = fieldPath + LuceneBaseIndexer.xpathDelim + label + "_" + fieldName;
      idx.addField(target, val.toString(), doc, withdups);
      val.setLength(0);
      return kFieldAdded + kSkipField;
    }

    String tileStartText = idx.getLastField("game.map_position.span.tile_start",0);
    String tileEndText = idx.getLastField("game.map_position.span.tile_end",0);
    int tileStart = (tileStartText == null) ? -1 : Integer.parseInt(tileStartText);
    int tileEnd = (tileEndText == null) ? -1 : Integer.parseInt(tileEndText);

    // When the tile is reversed (end below start) the smaller coordinate is the offset.
    int offset = tileStart;
    if (tileEnd >= 0 && tileEnd < tileStart) {
      offset = tileEnd;
    }

    int position = Integer.parseInt(val.toString());
    if (offset > 0) {
      position += offset;
    }
    idx.addField(fieldName, String.valueOf(position), doc, withdups);

    // add so can do start:[100 200] AND arm:x -- need only one arm value/object
    String arm = idx.getLastField("arm",0);
    if (arm != null) {
      idx.addField("arm", arm, doc, nodups);
    }
    return kFieldAdded;
  }
}
/**
 * GameProperty_FieldRecoder
 * ## -- recode all property.type,value pairs as 'property.type=value' fields?
 * Example pairs from the data (type, then value):
 * #   cyto_range
 * #   40F7-40F7
 * #
 * #   gbunit
 * #   AE002603
 */
public static class GameProperty_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder {

  /**
   * Looks up the "type" and "value" entries below this field and, when both exist,
   * adds the value under the field name prefix + type.
   *
   * @param idx       indexer used for the lookups and to add the new field
   * @param doc       document handed on to idx.addField
   * @param fieldName name of the property field
   * @param fieldPath path of the property field
   * @param val       field value (not used here)
   * @return kFieldAdded when both type and value were found, otherwise kNoChange
   */
  public int recodeField(LuceneBaseIndexer idx, Document doc,
      String fieldName, String fieldPath, StringBuffer val)
  {
    // prefix = fieldPath + delim + fieldName + delim
    String prefix = fieldPath + LuceneBaseIndexer.xpathDelim
        + fieldName + LuceneBaseIndexer.xpathDelim;
    String propType = idx.getLastField(prefix+"type",0); // good only for INDEX_XPATH=true
    String propValue = idx.getLastField(prefix+"value",0);
    //if (LuceneBaseIndexer.debug)
    //LuceneBaseIndexer.logp.println("GameProperty."+tp+stype+"="+value);
    if (propType == null || propValue == null) {
      return kNoChange;
    }
    idx.addField(prefix+propType, propValue, doc, withdups);
    return kFieldAdded;
  }
}
/**
 * GameArm_FieldRecoder
 * -- fix for bad data: the chado2game arm field is sometimes a "$..." placeholder
 * #   2L
 * #   --as this chado2game field is bad
 * #
 * #   $5 << should be 2L
 * #
 * #   22109491
 * #   22217931
 * When the value starts with '$' the arm is taken from the "att_seq" field instead.
 */
public static class GameArm_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder {

  /**
   * Adds an "arm" field, either from the value itself or, for "$..." placeholders,
   * from the "att_seq" field looked up through the indexer.
   *
   * @param idx       indexer used for the lookup and to add the "arm" field
   * @param doc       document handed on to idx.addField
   * @param fieldName name of the arm field (used in the lookup-free debug message)
   * @param fieldPath path of the arm field; prefix for the "att_seq" lookup
   * @param val       arm value; it is not modified
   * @return kFieldAdded for a valid arm value; kFieldAdded + kSkipField when a placeholder
   *         was replaced; kNoChange when a placeholder had no replacement
   */
  public int recodeField(LuceneBaseIndexer idx, Document doc,
      String fieldName, String fieldPath, StringBuffer val)
  {
    String text = val.toString();
    String delim = LuceneBaseIndexer.xpathDelim;

    if (!text.startsWith("$")) {
      // got valid arm field; jun04
      idx.addField("arm", text, doc, nodups);
      if (LuceneBaseIndexer.debug) {
        LuceneBaseIndexer.logp.println("GameArm."+fieldPath+delim+fieldName+"="+text);
      }
      return kFieldAdded;
    }

    // Placeholder value: try the path-qualified att_seq first, then the bare name.
    //int ti= tp.lastIndexOf( del); if (ti>0) tp= tp.substring(0,ti+1);
    String arm = idx.getLastField(fieldPath+delim+"att_seq",0);
    if (arm == null) {
      arm = idx.getLastField("att_seq",0);
    }
    if (LuceneBaseIndexer.debug) {
      LuceneBaseIndexer.logp.println("GameArm."+fieldPath+delim+fieldName+"="+arm);
    }
    if (arm == null) {
      return kNoChange;
    }
    idx.addField("arm", arm, doc, nodups);
    //idx.addField( fn, arm, doc, false);
    return kFieldAdded + kSkipField;
  }
}
/**
 * GameAddmap_position_FieldRecoder
 * ## these are top-level records in game scaffold files; some need
 * ## arm:start-end of the scaffold added for retrieval by range
 * # game.seq < none have start,end ? == sequence dbxref,name
 * # game.map_position == 1 record/scaffold == scaf range
 * ##   game.map_position.span.tile_start
 * ##   game.map_position.span.tile_end
 * ##   arm -- all recs now have arm, see GameArm_FieldRecoder
 * # game.computational_analysis << these all have span.start,end
 * # game.annotation << these all should have start,end
 * ## double urk: all game scaffold query span.start,end are
 * ## RELATIVE TO scaffold span.tile_start,tile_end
 */
public static class GameAddmap_position_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder {

  /**
   * Adds "start"/"end" (from the scaffold tile_start/tile_end fields, when both are found),
   * "arm" (when found) and "docclass" (= fieldName) to the document.
   *
   * @param idx       indexer used for the lookups and to add the new fields
   * @param doc       document handed on to idx.addField
   * @param fieldName name of the field; stored as the "docclass" value
   * @param fieldPath path of the field being recoded (not used here)
   * @param val       field value (not used here)
   * @return always kFieldAdded
   */
  public int recodeField(LuceneBaseIndexer idx, Document doc,
      String fieldName, String fieldPath, StringBuffer val)
  {
    // The lookup names are spelled with '.', independent of LuceneBaseIndexer.xpathDelim.
    String scafStart = idx.getLastField("game.map_position.span.tile_start",0);
    String scafEnd = idx.getLastField("game.map_position.span.tile_end",0);
    //if (LuceneBaseIndexer.debug)
    //LuceneBaseIndexer.logp.println("GameAddmap."+fn+" start="+start+" end="+end);
    boolean haveRange = (scafStart != null) && (scafEnd != null);
    if (haveRange) {
      // do we want/need both?
      idx.addField("start", scafStart, doc, nodups);
      idx.addField("end", scafEnd, doc, nodups);
      //idx.addField( fn+".span.start", tilestart, doc, nodups);
      //idx.addField( fn+".span.end", tileend, doc, nodups);
    }

    String arm = idx.getLastField("arm",0);
    if (arm != null) {
      idx.addField("arm", arm, doc, nodups);
    }

    // see GameClass_FieldRecoder
    idx.addField("docclass", fieldName, doc, nodups);
    return kFieldAdded;
  }
}
/**
 * GameClass_FieldRecoder
 * -- adds the tag name as "docclass" for top-level (doc) objects
 *   fieldrecoder.game.computational_analysis=LucegeneIndexers$GameClass_FieldRecoder
 *   fieldrecoder.game.annotation=LucegeneIndexers$GameClass_FieldRecoder
 *   fieldrecoder.game.map_position=LucegeneIndexers$GameClass_FieldRecoder
 *   fieldrecoder.game.seq=LucegeneIndexers$GameClass_FieldRecoder
 *   fieldrecoder.game.seq=LucegeneIndexers$GameAddmap_position_FieldRecoder
 *   ^^ this one has two recoders, do in one ?
 */
public static class GameClass_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder {

  /**
   * Adds a "docclass" field whose value is the field name.
   *
   * @param idx       indexer through which the field is added
   * @param doc       document handed on to idx.addField
   * @param fieldName name of the field; stored as the "docclass" value
   * @param fieldPath path of the field being recoded (not used here)
   * @param val       field value (not used here)
   * @return always kFieldAdded
   */
  public int recodeField(
      LuceneBaseIndexer idx,
      Document doc,
      String fieldName,
      String fieldPath,
      StringBuffer val)
  {
    final String docClass = fieldName;
    idx.addField("docclass", docClass, doc, nodups);
    return kFieldAdded;
  }
}
/**
 * GameSeqRelationLink_FieldRecoder
 * -- adds a "doclink" field for these fields in computational analysis records
 *    (the original note breaks off here: "only? none such in ...")
 *   fieldrecoder.game.computational_analysis.result_set.seq_relationship=GameSeqRelationLink_FieldRecoder
 */
public static class GameSeqRelationLink_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder {

  /**
   * When the relationship's "att_type" is "subject", adds its "att_seq" value as "doclink".
   * Both lookups try the full path (fieldPath + delim + fieldName) first and then the
   * shorter fieldName-based name.
   *
   * @param idx       indexer used for the lookups and to add the "doclink" field
   * @param doc       document handed on to idx.addField
   * @param fieldName name of the relationship field
   * @param fieldPath path of the relationship field
   * @param val       field value (not used here)
   * @return kFieldAdded when a doclink was added, otherwise kNoChange
   */
  public int recodeField(LuceneBaseIndexer idx, Document doc,
      String fieldName, String fieldPath, StringBuffer val)
  {
    String delim = LuceneBaseIndexer.xpathDelim;
    // att field name is always elname + xpathDelim + attNamePrefix + atkey
    String fullPrefix = fieldPath + delim + fieldName + delim;
    String shortPrefix = fieldName + delim;

    String relType = idx.getLastField(fullPrefix+"att_type",0);
    if (relType == null) {
      relType = idx.getLastField(shortPrefix+"att_type",0);
    }
    if (!"subject".equals(relType)) {
      return kNoChange;
    }

    String link = idx.getLastField(fullPrefix+"att_seq",0);
    if (link == null) {
      link = idx.getLastField(shortPrefix+"att_seq",0);
    }
    //? can we weed out duplicate doc/doclink values here?
    // doclink=RE54557.5prime doclink=RE54557.5prime doclink=RE54557.5prime
    if (link == null) {
      return kNoChange;
    }
    //if (LuceneBaseIndexer.debug)
    //LuceneBaseIndexer.logp.println("GameSeqRelationLink."+fieldPath+"."+fieldName+" doclink="+doclink);
    idx.addField("doclink", link, doc, withdups);
    return kFieldAdded;
  }
}
/**
 * FeatureSourceChr_FieldRecoder
 * Parses the gnomap feature 'source' line for the chromosome value and adds that value
 * to each feature doc.
 * The remembered chromosome and the "inside source" flag are static fields, so they are
 * shared by all instances and carry over from one call to the next.
 */
public static class FeatureSourceChr_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder {

  /** Last chromosome value seen on a 'source' feature; "unknown" until one is seen. */
  static String chr="unknown";
  /** True while the most recent "feature" field had the value "source". */
  static boolean insource;

  /**
   * Tracks "feature" fields to know when a 'source' feature is current, and on each "map"
   * field adds a "chr" field (updating the remembered chromosome first when in 'source').
   *
   * @param idx       indexer through which the "chr" field is added
   * @param doc       document handed on to idx.addField
   * @param fieldName name of the field; only "feature" and "map" are acted on
   * @param fieldPath path of the field being recoded (not used here)
   * @param val       field value; it is not modified
   * @return kFieldAdded for "map" fields, otherwise kNoChange
   */
  public int recodeField(LuceneBaseIndexer idx, Document doc,
      String fieldName, String fieldPath, StringBuffer val)
  {
    if ("map".equals(fieldName)) {
      if (insource) {
        chr = val.toString();
      }
      idx.addField("chr", chr, doc, nodups); // or docid?
      return kFieldAdded;
    }
    if ("feature".equals(fieldName)) {
      insource = "source".equals(val.toString());
    }
    return kNoChange;
  }
}
}