// LucegeneIndexers.java

/**
  * LucegeneIndexers
  Data class and field specific parsers to handle
  flybase, other biology data for LuceGene indexing
  
  lucene-indexer.sh script will recompile and use these  
  $PROP_ROOT/dataclass.properties should add these as
    fieldrecoder.FIELD=classname
    tokenfilter.FIELD=classname
    tokenizer.FIELD=classname
  E.g.,
  go.properties:
    fieldrecoder.LNK=LucegeneIndexers$GOID_FieldRecoder
  fban.properties:
    fieldrecoder.BLOC=LucegeneIndexers$Location_FieldRecoder 

  note apr04 - split out the Game XML specific ones to other main class?
  
*/

import java.util.regex.*;

import org.eugenes.index.LuceneBaseIndexer;
import org.eugenes.index.BiodataAnalyzer;

import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;

public class LucegeneIndexers
{

  // Named values for the boolean last argument of idx.addField(...), as used by
  // every recoder in this class.
  // NOTE(review): the flag's exact effect is decided by LuceneBaseIndexer.addField,
  // which is not visible here; the names suggest true = keep a single value,
  // false = allow repeated values -- confirm against that method.
  final static boolean nodups=true, withdups=false; // add >1 field value to doc?

  /**
    * GOID_FieldRecoder
    Parse flybase FBgo.acode field LNK|GO:000000 for GO:id
    (data should be changed)
    -- create new lucene index field "GOID" to be used for docid (need GOID intermediate?)
  */
  
  public static class GOID_FieldRecoder 
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { 

    /** Pattern text for a GO accession: "GO:" at a word boundary, followed by digits. */
    final static String regex_goid="\\bGO:\\d+";
    /** Compiled, case-insensitive form of {@link #regex_goid}. */
    static Pattern regexGOID= Pattern.compile(regex_goid,Pattern.CASE_INSENSITIVE);

    /**
     * Scans the field value for a GO accession and, when one is present,
     * adds the first one found to the document as a "GOID" field.
     *
     * @param idx        indexer through which the new field is added
     * @param doc        document receiving the "GOID" field
     * @param fieldName  not used by this recoder
     * @param fieldPath  not used by this recoder
     * @param val        field value to scan; left unmodified
     * @return kFieldAdded when an accession was found and added, else kNoChange
     *         (both constants are inherited from the FieldRecoder interface)
     */
    public int recodeField(LuceneBaseIndexer idx, Document doc, 
      String fieldName, String fieldPath, StringBuffer val)
    {
      // find() searches anywhere in the value; matches() would demand the whole
      // value and lookingAt() only its start. Only the first hit is used.
      Matcher goMatch = regexGOID.matcher(val);
      if (!goMatch.find()) {
        return kNoChange;
      }
      idx.addField("GOID", goMatch.group(), doc, nodups);
      return kFieldAdded;
    }
  }

  /**
   small field recoder to add new main ID field for acode with multiple nested
   ID fields - really need to have main acode indexer handle this.
  */
  public static class FBMainID_FieldRecoder 
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { 

    /**
     * Copies the field value into a "MAIN_ID" field, unless the field path
     * contains the xpath delimiter at index 2 or later.
     *
     * @param idx        indexer through which the new field is added
     * @param doc        document receiving the "MAIN_ID" field
     * @param fieldName  not used by this recoder
     * @param fieldPath  path of the field; used only for the delimiter test
     * @param val        field value to copy; left unmodified
     * @return kFieldAdded when "MAIN_ID" was added, kNoChange when skipped
     */
    public int recodeField(LuceneBaseIndexer idx, Document doc, 
      String fieldName, String fieldPath, StringBuffer val)
    {
      // NOTE(review): Location_FieldRecoder makes the same sub-record test
      // with "> 0"; confirm that "> 1" is intended here.
      boolean inSubRecord = fieldPath.indexOf(LuceneBaseIndexer.xpathDelim) > 1;
      if (!inSubRecord) {
        idx.addField("MAIN_ID", val.toString(), doc, nodups);
        return kFieldAdded;
      }
      return kNoChange;
    }
  }


  /**
    * Location_FieldRecoder
    Parse flybase sequence location fields = genbank standard locations
    create three numeric index fields: fieldName.start, fieldName.stop, fieldName.strand (-1,1)
    BLOC|join(100..200,300..400) 
    BLOC|complement(2000..3000)
    BLOC|1..2
    
    -- also handle loc=2L:100..200,300..400 ; loc=2L:complement(2000..3000)
    create field fieldName.chr=2L if (\w+):
  */

  public static class Location_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { 
    
    /** Leading "name:" prefix; group 1 is the name (word characters). */
    final static String regex_chr="(\\w+):";
    /** Any run of digits. Ranges are not matched as a pair; each number is a separate hit. */
    final static String regex_loc="(\\d+)";
    static Pattern regexLoc= Pattern.compile(regex_loc);
    static Pattern regexChr= Pattern.compile(regex_chr);
    
    /**
     * Derives start/stop/strand (and optionally chr) fields from a location value.
     *
     * A leading "name:" prefix, if present, is removed from a working copy and
     * stored as fieldName+".chr". The first digit run in the remainder becomes
     * fieldName+".start" and the last digit run becomes fieldName+".stop"
     * (textual order, not min/max). fieldName+".strand" is "-1" when the
     * remainder contains "complement(", otherwise "1".
     *
     * @param idx        indexer through which the new fields are added
     * @param doc        document receiving the fields
     * @param fieldName  base name for the generated fields
     * @param fieldPath  path of the field; values whose path contains the
     *                   xpath delimiter past index 0 are skipped
     * @param val        location text; left unmodified
     * @return kFieldAdded when at least one number was found, else kNoChange
     */
    public int recodeField(LuceneBaseIndexer idx, Document doc, 
        String fieldName, String fieldPath, StringBuffer val)
    {
      // locations nested in sub-records are never recoded
      if (fieldPath.indexOf(LuceneBaseIndexer.xpathDelim) > 0) {
        return kNoChange;
      }

      // work on a copy so the caller's buffer keeps the full text
      String location = val.toString();
      String chr = null;
      Matcher chrMatch = regexChr.matcher(location);
      if (chrMatch.lookingAt()) {
        chr = chrMatch.group(1);
        location = location.substring(chrMatch.end());
      }

      Matcher numberMatch = regexLoc.matcher(location);
      if (!numberMatch.find()) {
        return kNoChange;
      }
      String first = numberMatch.group();
      String last = first;
      // keep scanning: the stop coordinate is whichever number appears last
      while (numberMatch.find()) {
        last = numberMatch.group();
      }

      if (chr != null) {
        idx.addField(fieldName+".chr", chr, doc, withdups);
      }
      String strand = "1";
      if (location.indexOf("complement(") >= 0) {
        strand = "-1";
      }
      idx.addField(fieldName+".start", first, doc, withdups);
      idx.addField(fieldName+".stop", last, doc, withdups);
      idx.addField(fieldName+".strand", strand, doc, withdups);
      return kFieldAdded;
    }
  }
  

  /**
    * GFFAttribute_FieldRecoder
  */

  public static class GFFAttribute_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { 
    
    /**
     * key=value pair: group 1 is the key (word characters), group 2 the value
     * (everything up to the next ';' or whitespace); a trailing ';' is consumed.
     */
    final static String regex_attr="(\\w+)=([^;\\s]+);?";
    static Pattern regexAttr= Pattern.compile(regex_attr);
    /** Count of debug lines printed so far; output stops once it reaches 30. */
    static int debugc=0;
    
    /**
     * Adds one document field per key=value pair found in the field value,
     * using the key as the field name.
     *
     * @param idx        indexer through which the new fields are added
     * @param doc        document receiving the fields
     * @param fieldName  used only in debug output
     * @param fieldPath  not used by this recoder
     * @param val        attribute text; left unmodified
     * @return kFieldAdded when at least one pair was found, else kNoChange
     */
    public int recodeField(LuceneBaseIndexer idx, Document doc, 
        String fieldName, String fieldPath, StringBuffer val)
    {
      boolean addedAny = false;
      for (Matcher pair = regexAttr.matcher(val.toString()); pair.find(); ) {
        String key = pair.group(1);
        String value = pair.group(2);
        idx.addField(key, value, doc, withdups);
        addedAny = true;
        // the counter only advances while debugging is on (short-circuit &&)
        if (LuceneBaseIndexer.debug && debugc++ < 30) {
          LuceneBaseIndexer.logp.println("GFFAttribute."+fieldName+": "+key+"="+value);
        }
      }
      return addedAny ? kFieldAdded : kNoChange;
    }
  }
  
  
  /**
   * SeqDbxref_FieldRecoder
     recode this: dbxref='CG11023,FlyBase:FBan0011023'
     to CG11023 FBan0011023
     -- this thing will strip out '' and other symbols
     tokenizer.db_xref=org.eugenes.index.BiodataAnalyzer$LowerWordTokenizer
     -- temp fixer to add FBan ID when missing but have CG/CR id

     -- current parser is not seeing separate ids? need to wordbreak at ,:

  */

  public static class SeqDbxref_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { 
    
    /** CG or CR identifier at a word boundary; group 1 is its numeric part. */
    static Pattern regexCG = Pattern.compile("\\bC[GR](\\d+)");
    
    /**
     * Cleans a dbxref value in place: strips one enclosing single quote from
     * each end and, when the value holds no "FBan" id but does hold a CG/CR id,
     * appends ",FBan" plus that id's number left-padded with zeros to 7 digits.
     *
     * @param idx        not used by this recoder
     * @param doc        not used by this recoder
     * @param fieldName  not used by this recoder
     * @param fieldPath  not used by this recoder
     * @param val        dbxref text; modified in place
     * @return kValChanged when the buffer was actually modified, else kNoChange
     */
    public int recodeField(LuceneBaseIndexer idx, Document doc, 
        String fieldName, String fieldPath, StringBuffer val)
    {
      int ret= kNoChange;
      int len= val.length();
      if (len>1) {
        // report a change only when a quote is really removed; previously
        // every value longer than one character was flagged as changed
        if (val.charAt(len-1) == '\'') {
          val.deleteCharAt(len-1);
          ret= kValChanged;
          }
        // len>1 guarantees at least one character is left for this test
        if (val.charAt(0) == '\'') {
          val.deleteCharAt(0);
          ret= kValChanged;
          }
        }

      // word breaking at ',' and ':' is left to the index tokenizer property
      if (val.indexOf("FBan") < 0) {
        Matcher ma= regexCG.matcher(val);
        if (ma.find()) {
          // only the first CG/CR id is used
          String idnum= ma.group(1);
          while (idnum.length()<7) idnum= "0"+idnum;
          val.append(",FBan"+idnum);
          ret= kValChanged;
          }
        }
      
      return ret;
    }
  }


 /**
   * Swiss_SQ_FieldRecoder
    parser for swissprot/uniprot  SQ line
    SQ   SEQUENCE   262 AA;  28969 MW;  DA87363A0D92BAF4 CRC64;
  */

  public static class Swiss_SQ_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { 
    
    /**
     * Whitespace, token (group 1), whitespace, token (group 2), optional
     * whitespace, then ';' -- e.g. " 262 AA;" gives group 1 "262", group 2 "AA".
     */
    static Pattern regexSQ = Pattern.compile("\\s+(\\S+)\\s+(\\S+)\\s*[;]");
    
    /**
     * Splits the value into "value label;" pairs and adds each as a field
     * named fieldName + xpathDelim + label, holding the value.
     *
     * @param idx        indexer through which the new fields are added
     * @param doc        document receiving the fields
     * @param fieldName  prefix for the generated field names
     * @param fieldPath  not used by this recoder
     * @param val        SQ line text; left unmodified
     * @return kSkipField | kFieldAdded when at least one pair was found,
     *         else kNoChange
     */
    public int recodeField(LuceneBaseIndexer idx, Document doc, 
        String fieldName, String fieldPath, StringBuffer val)
    {
      int status = kNoChange;
      for (Matcher pair = regexSQ.matcher(val); pair.find(); ) {
        String amount = pair.group(1);
        String label = pair.group(2);
        String target = fieldName + LuceneBaseIndexer.xpathDelim + label;
        idx.addField(target, amount, doc, withdups);
        status = kSkipField | kFieldAdded;
      }
      return status;
    }
  }


  /**
    * AddCommonField_FieldRecoder
    
      generate base fieldname fields when INDEX_XPATH=true
      this doesnt prevent default indexing of full xpath
      using property 'fieldalias.att_timestamp=timestamp' will collapse all
        for given field.

        addField( indexFieldName( currentFieldName), val, storeDoc, false);
                  ^^^^ bypass this for some fields
        can we do part? - add last part of fieldPath to get span.start ?
    E.g. want common span.start,end fields for these
field=game.annotation.feature_set.feature_span.seq_relationship.span.end
field=game.annotation.feature_set.feature_span.seq_relationship.span.start
field=game.computational_analysis.result_set.result_span.seq_relationship.span.end
field=game.computational_analysis.result_set.result_span.seq_relationship.span.start
field=game.computational_analysis.result_set.seq_relationship.span.end
field=game.computational_analysis.result_set.seq_relationship.span.start
     
  */
  public static class AddCommonField_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { 
    
    /**
     * Adds the value to the document under the bare field name.
     *
     * @param idx        indexer through which the field is added
     * @param doc        document receiving the field
     * @param fieldName  name of the field to add
     * @param fieldPath  not used by this recoder
     * @param val        field value; left unmodified
     * @return always kFieldAdded
     */
    public int recodeField(LuceneBaseIndexer idx, Document doc, 
        String fieldName, String fieldPath, StringBuffer val)
    {
      String text = val.toString();
      idx.addField(fieldName, text, doc, withdups);
      return kFieldAdded;
    }
  }


  /**
    * GameSpan_FieldRecoder
      -- need to check enclosing xml seq_relationship.type="query"  or "subject"
      -- skip indexing subject data, query == genome locations
      // need to look at prior doc fields

    ## double urk: all  scaffold query span.start,end  are 
    ## RELATIVE TO scaffold span.tile_start,tile_end
    ## are any of them reversed ??
    ## need also to recode spans for output ...
    */  
  
  public static class GameSpan_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { 
    
    /**
     * Recodes a span coordinate according to the "att_type" last seen for it.
     *
     * For type "query" the value is parsed as an integer, shifted by the tile
     * offset when that offset is positive, and added under the bare field name;
     * the last seen "arm" value, if any, is added as well. For any other type
     * (or none, labelled "untyped") the raw value is added under
     * fieldPath + xpathDelim + type + "_" + fieldName and the buffer is emptied.
     *
     * @param idx        indexer used for field lookups and additions
     * @param doc        document receiving the fields
     * @param fieldName  name of the span field
     * @param fieldPath  path of the span field
     * @param val        coordinate text; emptied in the non-query case
     * @return kFieldAdded for query spans, kFieldAdded + kSkipField otherwise
     * @throws NumberFormatException for a query span when the value or a
     *         present tile coordinate is not a parsable integer
     */
    public int recodeField(LuceneBaseIndexer idx, Document doc, 
        String fieldName, String fieldPath, StringBuffer val)
    {
      // look up att_type under the path with its last element removed
      // (original note: good only for INDEX_XPATH=true), then fall back to
      // the unqualified name
      String typePrefix = fieldPath;
      int lastDelim = fieldPath.lastIndexOf(LuceneBaseIndexer.xpathDelim);
      if (lastDelim > 0) {
        typePrefix = fieldPath.substring(0, lastDelim + 1);
      }
      String relType = idx.getLastField(typePrefix + "att_type", 0);
      if (relType == null) {
        relType = idx.getLastField("att_type", 0);
      }

      if (!"query".equals(relType)) {
        String label = (relType == null) ? "untyped" : relType;
        String typedName = fieldPath + LuceneBaseIndexer.xpathDelim + label + "_" + fieldName;
        idx.addField(typedName, val.toString(), doc, withdups);
        val.setLength(0);
        return kFieldAdded + kSkipField;
      }

      String tileStartText = idx.getLastField("game.map_position.span.tile_start", 0);
      String tileEndText = idx.getLastField("game.map_position.span.tile_end", 0);
      int tileStart = (tileStartText == null) ? -1 : Integer.parseInt(tileStartText);
      int tileEnd = (tileEndText == null) ? -1 : Integer.parseInt(tileEndText);
      // use the tile end as offset when it is known and lies below the start
      int offset = (tileEnd >= 0 && tileEnd < tileStart) ? tileEnd : tileStart;

      int position = Integer.parseInt(val.toString());
      if (offset > 0) {
        position += offset;
      }
      idx.addField(fieldName, String.valueOf(position), doc, withdups);

      // repeat the arm so range queries can be combined with arm:x
      String arm = idx.getLastField("arm", 0);
      if (arm != null) {
        idx.addField("arm", arm, doc, nodups);
      }
      return kFieldAdded;
    }
  }


  /**
    * GameProperty_FieldRecoder
  ## -- recode all property.type,value pairs as 'property.type=value' fields?
  #     <property>
  #       <type>cyto_range</type>
  #       <value>40F7-40F7</value>
  #     </property>
  #     <property>
  #       <type>gbunit</type>
  #       <value>AE002603</value>
  #     </property>

  */
  
  public static class GameProperty_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { 
    
    /**
     * Combines the last seen "type" and "value" sub-fields of this field into
     * a single field named fieldPath + delim + fieldName + delim + type,
     * holding the value.
     *
     * @param idx        indexer used for field lookups and the addition
     * @param doc        document receiving the field
     * @param fieldName  name of the property element
     * @param fieldPath  path of the property element
     * @param val        not used by this recoder
     * @return kFieldAdded when both sub-fields were found, else kNoChange
     */
    public int recodeField(LuceneBaseIndexer idx, Document doc, 
        String fieldName, String fieldPath, StringBuffer val)
    {
      String delim = LuceneBaseIndexer.xpathDelim;
      String prefix = fieldPath + delim + fieldName + delim;
      // lookups use the full path (original note: good only for INDEX_XPATH=true)
      String propType = idx.getLastField(prefix + "type", 0);
      String propValue = idx.getLastField(prefix + "value", 0);
      if (propType == null || propValue == null) {
        return kNoChange;
      }
      idx.addField(prefix + propType, propValue, doc, withdups);
      return kFieldAdded;
    }
  }

  /**
    * GameArm_FieldRecoder
    -- fix for bad data 
#<game>
#  <seq id="2L" length="108441" focus="true">
#    <name>2L</name>
# --as this chado2game field is bad
#   <map_position type="tile" seq="2L">
#     <arm>$5</arm>   << should be 2L
#     <span>
#       <start>22109491</start>
#       <end>22217931</end>
#     </span>
#   </map_position>

    */
    
  public static class GameArm_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { 
    
    /**
     * Adds an "arm" field. A value that does not start with "$" is used as is.
     * A value starting with "$" is replaced by the last seen "att_seq" value
     * (looked up under fieldPath first, then unqualified).
     *
     * @param idx        indexer used for field lookups and the addition
     * @param doc        document receiving the "arm" field
     * @param fieldName  used only in debug output
     * @param fieldPath  path of the arm field
     * @param val        arm text; left unmodified
     * @return kFieldAdded for a usable value; kFieldAdded + kSkipField when a
     *         "$" value was replaced from att_seq; kNoChange when a "$" value
     *         had no att_seq to replace it
     */
    public int recodeField(LuceneBaseIndexer idx, Document doc, 
        String fieldName, String fieldPath, StringBuffer val)
    {
      String delim = LuceneBaseIndexer.xpathDelim;
      String raw = val.toString();

      if (!raw.startsWith("$")) {
        // value is usable as it stands
        idx.addField("arm", raw, doc, nodups);
        if (LuceneBaseIndexer.debug) {
          LuceneBaseIndexer.logp.println("GameArm."+fieldPath+delim+fieldName+"="+raw);
        }
        return kFieldAdded;
      }

      // "$..." value: recover the arm from the seq attribute
      String arm = idx.getLastField(fieldPath+delim+"att_seq", 0);
      if (arm == null) {
        arm = idx.getLastField("att_seq", 0);
      }
      if (LuceneBaseIndexer.debug) {
        LuceneBaseIndexer.logp.println("GameArm."+fieldPath+delim+fieldName+"="+arm);
      }
      if (arm == null) {
        return kNoChange;
      }
      idx.addField("arm", arm, doc, nodups);
      return kFieldAdded + kSkipField;
    }
  }

  /**
    * GameAddmap_position_FieldRecoder

## these are top-level records in game scaffold files; some need
## arm:start-end of scaf added for retrieval by range
# game.seq < none have start,end ? == sequence dbxref,name
# game.map_position == 1 record/scaffold == scaf range
##   game.map_position.span.tile_start
##   game.map_position.span.tile_end
##   arm -- all recs now have arm, see above
# game.computational_analysis << these all have span.start,end
# game.annotation << these all should have start,end

## double urk: all game scaffold query span.start,end  are 
## RELATIVE TO scaffold span.tile_start,tile_end

    */

  public static class GameAddmap_position_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { 
    
    /**
     * Adds range and class fields to a top-level record: "start"/"end" from
     * the last seen tile_start/tile_end (only when both are present), "arm"
     * from the last seen arm (when present), and "docclass" set to the field
     * name.
     *
     * @param idx        indexer used for field lookups and additions
     * @param doc        document receiving the fields
     * @param fieldName  stored as the "docclass" value
     * @param fieldPath  not used by this recoder
     * @param val        not used by this recoder
     * @return always kFieldAdded, since "docclass" is always added
     */
    public int recodeField(LuceneBaseIndexer idx, Document doc, 
        String fieldName, String fieldPath, StringBuffer val)
    {
      // lookup keys are fixed, '.'-separated names; they do not go through xpathDelim
      String tilestart= idx.getLastField("game.map_position.span.tile_start",0);  
      String tileend  = idx.getLastField("game.map_position.span.tile_end",0);  
      if ( tilestart != null && tileend != null ) {
        idx.addField( "start", tilestart, doc, nodups);
        idx.addField( "end", tileend, doc, nodups);
        }
      String arm= idx.getLastField("arm",0);  
      if (arm!=null) idx.addField( "arm", arm, doc, nodups); 
      // same docclass tagging as GameClass_FieldRecoder
      idx.addField( "docclass", fieldName, doc, nodups);
      return  kFieldAdded;
    }
  }

  /**
    * GameClass_FieldRecoder
   -- add tag name == docclass for top-level (doc) objects
  fieldrecoder.game.computational_analysis=LucegeneIndexers$GameClass_FieldRecoder
  fieldrecoder.game.annotation=LucegeneIndexers$GameClass_FieldRecoder
  fieldrecoder.game.map_position=LucegeneIndexers$GameClass_FieldRecoder
  
  fieldrecoder.game.seq=LucegeneIndexers$GameClass_FieldRecoder
  fieldrecoder.game.seq=LucegeneIndexers$GameAddmap_position_FieldRecoder
      ^^ this one has two recoders, do in one ?
    */

  public static class GameClass_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { 
    
    /**
     * Tags the document with a "docclass" field whose value is the field name.
     *
     * @param idx        indexer through which the field is added
     * @param doc        document receiving the "docclass" field
     * @param fieldName  stored as the "docclass" value
     * @param fieldPath  not used by this recoder
     * @param val        not used by this recoder
     * @return always kFieldAdded
     */
    public int recodeField(LuceneBaseIndexer idx, Document doc, 
        String fieldName, String fieldPath, StringBuffer val)
    {
      String docClass = fieldName;
      idx.addField("docclass", docClass, doc, nodups);
      return kFieldAdded;
    }
  }

  /**
    * GameSeqRelationLink_FieldRecoder
   -- add "doclink" for these fields in companal (only? none such in <annotation ?
   <seq_relationship type="subject" seq="AAK97883">
   fieldrecoder.game.computational_analysis.result_set.seq_relationship=GameSeqRelationLink_FieldRecoder
    */

  public static class GameSeqRelationLink_FieldRecoder
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { 
    
    /**
     * Adds a "doclink" field holding the "att_seq" value of this element when
     * its "att_type" is "subject". Each attribute is looked up under
     * fieldPath + delim + fieldName first, then under fieldName alone.
     *
     * @param idx        indexer used for field lookups and the addition
     * @param doc        document receiving the "doclink" field
     * @param fieldName  name of the element
     * @param fieldPath  path of the element
     * @param val        not used by this recoder
     * @return kFieldAdded when a doclink was added, else kNoChange
     */
    public int recodeField(LuceneBaseIndexer idx, Document doc, 
        String fieldName, String fieldPath, StringBuffer val)
    {
      String delim = LuceneBaseIndexer.xpathDelim;
      // attribute field names are element name + delimiter + "att_" + key
      String fullAttr = fieldPath + delim + fieldName + delim;
      String shortAttr = fieldName + delim;

      String relType = idx.getLastField(fullAttr + "att_type", 0);
      if (relType == null) {
        relType = idx.getLastField(shortAttr + "att_type", 0);
      }
      if (!"subject".equals(relType)) {
        return kNoChange;
      }

      String target = idx.getLastField(fullAttr + "att_seq", 0);
      if (target == null) {
        target = idx.getLastField(shortAttr + "att_seq", 0);
      }
      if (target == null) {
        return kNoChange;
      }
      // repeated doclink values are not filtered out here
      idx.addField("doclink", target, doc, withdups);
      return kFieldAdded;
    }
  }

  /**
    * FeatureSourceChr_FieldRecoder
    Parse gnomap feature 'source' line for chromosome value; add to each feature doc
  */
  public static class FeatureSourceChr_FieldRecoder 
    implements org.eugenes.index.LuceneBaseIndexer.FieldRecoder { 
    /** Most recent chromosome taken from a source feature; static, so shared by all instances. */
    static String chr="unknown";
    /** True while the most recent "feature" field had the value "source". */
    static boolean insource;
    
    /**
     * Tracks the chromosome across calls and stamps it on every "map" field.
     *
     * A "feature" field only records whether its value is "source". A "map"
     * field updates the remembered chromosome when inside a source feature,
     * then adds the remembered chromosome as a "chr" field. Other fields are
     * ignored.
     *
     * @param idx        indexer through which the "chr" field is added
     * @param doc        document receiving the "chr" field
     * @param fieldName  "feature" and "map" are handled; anything else is ignored
     * @param fieldPath  not used by this recoder
     * @param val        field value; left unmodified
     * @return kFieldAdded for a "map" field, otherwise kNoChange
     */
    public int recodeField(LuceneBaseIndexer idx, Document doc, 
      String fieldName, String fieldPath, StringBuffer val)
    {
      if ("map".equals(fieldName)) {
        if (insource) {
          chr = val.toString();
        }
        idx.addField("chr", chr, doc, nodups);
        return kFieldAdded;
      }
      if ("feature".equals(fieldName)) {
        insource = "source".equals(val.toString());
      }
      return kNoChange;
    }
  }

}