Replaced some GNU REs with the much faster Jakarta ORO
[mir.git] / source / mircoders / localizer / basic / MirBasicProducerAssistantLocalizer.java
index 84afe0d..e408c09 100755 (executable)
  */
 package mircoders.localizer.basic;
 
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.StringWriter;
-import java.util.GregorianCalendar;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
 import gnu.regexp.RE;
-
 import mir.config.MirPropertiesConfiguration;
 import mir.entity.adapter.EntityAdapter;
 import mir.entity.adapter.EntityIteratorAdapter;
@@ -47,16 +37,15 @@ import mir.generator.Generator;
 import mir.generator.GeneratorExc;
 import mir.generator.GeneratorFailure;
 import mir.log.LoggerWrapper;
-import mir.misc.StringUtil;
 import mir.util.GeneratorDateTimeFunctions;
 import mir.util.GeneratorFormatAdapters;
-import mir.util.generator.ReflectionGeneratorFunctionsAdapter;
+import mir.util.HTMLStripper;
 import mir.util.StringRoutines;
+import mir.util.generator.ReflectionGeneratorFunctionsAdapter;
 import mircoders.global.MirGlobal;
 import mircoders.localizer.MirLocalizerExc;
 import mircoders.localizer.MirLocalizerFailure;
 import mircoders.localizer.MirProducerAssistantLocalizer;
-
 import org.w3c.dom.Document;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
@@ -64,15 +53,28 @@ import org.w3c.dom.NodeList;
 import org.w3c.tidy.Configuration;
 import org.w3c.tidy.Tidy;
 
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.StringWriter;
+import java.util.GregorianCalendar;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
 public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantLocalizer {
   protected LoggerWrapper logger;
 
+  private HTMLStripper stripper;
   private RE regularExpressionLT;
   private RE regularExpressionGT;
   private RE regularExpressionWhitespace;
 
+
   public MirBasicProducerAssistantLocalizer() throws MirLocalizerFailure {
-    try{
+    try {
+      stripper = new HTMLStripper();
+
       regularExpressionLT = new RE("<");
       regularExpressionGT = new RE(">");
       regularExpressionWhitespace = new RE("\\s+");
@@ -82,7 +84,7 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
     }
   }
 
-  public void initializeGenerationValueSet(Map aValueSet) throws MirLocalizerExc, MirLocalizerFailure  {
+  public void initializeGenerationValueSet(Map aValueSet) throws MirLocalizerExc, MirLocalizerFailure {
     try {
       Iterator i;
 
@@ -110,13 +112,13 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
 
       aValueSet.put("config", configMap);
 
-      aValueSet.put("utility", new Utility()); 
+      aValueSet.put("utility", new Utility());
 
       aValueSet.put("languages",
-        new EntityIteratorAdapter("", "", 20, MirGlobal.localizer().dataModel().adapterModel(), "language"));
+          new EntityIteratorAdapter("", "", 20, MirGlobal.localizer().dataModel().adapterModel(), "language"));
 
       aValueSet.put("topics",
-        new EntityIteratorAdapter("", "", 20, MirGlobal.localizer().dataModel().adapterModel(), "topic"));
+          new EntityIteratorAdapter("", "", 20, MirGlobal.localizer().dataModel().adapterModel(), "topic"));
 
       Map articleTypeMap = new HashMap();
       articleTypeMap.put("openposting", "0");
@@ -150,7 +152,7 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
     }
 
   }
-  
+
   public static class getLanguageIdFunction implements Generator.Function {
     private Map languageCodeToId;
     private String otherLanguageId;
@@ -164,8 +166,9 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
         Iterator i = new EntityIteratorAdapter("", "", 20, MirGlobal.localizer().dataModel().adapterModel(), "language");
         while (i.hasNext()) {
           EntityAdapter language = (EntityAdapter) i.next();
-          if (language.get("code").equals("ot"))
+          if (language.get("code").equals("ot")) {
             otherLanguageId = (String) language.get("id");
+          }
 
           languageCodeToId.put(language.get("code"), language.get("id"));
         }
@@ -179,12 +182,14 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
 
     public Object perform(List aParameters) throws GeneratorExc, GeneratorFailure {
       try {
-        if (aParameters.size() != 1)
+        if (aParameters.size() != 1) {
           throw new GeneratorExc("getLanguageIdFunction: 1 parameter expected: language-code");
+        }
 
         String result = (String) languageCodeToId.get(aParameters.get(0));
-        if (result == null)
+        if (result == null) {
           result = otherLanguageId;
+        }
 
         return result;
       }
@@ -203,13 +208,12 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
     logger.debug("about to filter non HTML Text of length " + aText.length());
     try {
       String result =
-          StringUtil.createHTML(
-          StringUtil.removeHTMLTags(aText),
-          MirGlobal.config().getString("Producer.ImageRoot"),
-          MirGlobal.config().getString("Producer.MailLinkName"),
-          MirGlobal.config().getString("Producer.ExtLinkName"),
-          MirGlobal.config().getString("Producer.IntLinkName")
-          );
+          stripper.createHTML(
+              stripper.removeHTMLTags(aText),
+              MirGlobal.config().getString("Producer.ImageRoot"),
+              MirGlobal.config().getString("Producer.MailLinkName"),
+              MirGlobal.config().getString("Producer.ExtLinkName"),
+              MirGlobal.config().getString("Producer.IntLinkName"));
       logger.debug("done filtering non-HTML text ");
       return result;
     }
@@ -219,6 +223,60 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
       throw new RuntimeException(t.toString());
     }
   }
+
+  public Generator.Interceptor createGenerationInterceptor() throws MirLocalizerExc, MirLocalizerFailure {
+
+    if (MirGlobal.config().getBoolean("Mir.Producer.UseInterceptor", true)) {
+      return new Generator.Interceptor() {
+
+        public Object intercept(Object anObject) {
+          if (anObject instanceof EntityAdapter) {
+            return new InterceptedEntityAdapter((EntityAdapter) anObject);
+          }
+
+          return anObject;
+        }
+      };
+    }
+    else {
+      return null;
+    }
+  }
+
+  public class InterceptedEntityAdapter {
+    private EntityAdapter adapter;
+
+    InterceptedEntityAdapter(EntityAdapter anEntityAdapter) {
+      adapter = anEntityAdapter;
+    }
+
+    public Object get(String aField) {
+      Object result = adapter.get(aField);
+      if (result instanceof String) {
+        return filterHTMLText((String) result);
+      }
+      else {
+        return result;
+      }
+    }
+
+    public Object getRaw() {
+      return new RawEntityAdapter(adapter);
+    }
+  }
+
+  public class RawEntityAdapter {
+    private EntityAdapter adapter;
+
+    RawEntityAdapter(EntityAdapter anEntityAdapter) {
+      adapter = anEntityAdapter;
+    }
+
+    public Object get(String aField) {
+      return adapter.get(aField);
+    }
+  }
+
   public String filterHTMLText(String aText) {
     try {
       StringWriter out = new StringWriter();
@@ -228,7 +286,7 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
       tidy.setCharEncoding(Configuration.UTF8);
       tidy.setErrout(logger.asPrintWriter(LoggerWrapper.DEBUG_MESSAGE));
       print(tidy.parseDOM(in, null), out);
-      
+
       return out.toString();
     }
     catch (IOException e) {
@@ -237,53 +295,81 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
   }
 
 
-  private String[] badAttributeValuePrefixes= {"javascript","vbscript","about","wysiwyg","data","view-source","ms-its","mhtml","shell","lynxexec","lynxcgi","hcp","ms-help","help","disk","vnd.ms.radio","opera","res","resource","chrome","mocha","livescript"};
-
-  private String[] badAttributes = {"onabort", "onblur",  "onchange", "onclick", "ondblclick", "onerror", "onfocus", "onkeydown", "onKeypress", "onkeyup", "onload", "onmousedown", "onmousemove", "onmouseout", "onmouseover", "onmouseup", "onreset", "onselect", "onsubmit", "onunload","onload","onclick","onfocus","onblur","FSCommand","onAbort","onActivate","onAfterPrint","onAfterUpdate","onBeforeActivate","onBeforeCopy","onBeforeCut","onBeforeDeactivate","onBeforeEditFocus","onBeforePaste","onBeforePrint","onBeforeUnload","onBegin","onBlur","onBounce","onCellChange","onChange","onClick","onContextMenu","onControlSelect","onCopy","onCut","onDataAvailible","onDataSetChanged","onDataSetComplete","onDblClick","onDeactivate","onDrag","onDragEnd","onDragLeave","onDragEnter","onDragOver","onDragDrop","onDrop","onEnd","onError","onErrorUpdate","onExit","onFilterChange","onFinish","onFocus","onFocusIn","onFocusOut","onHelp","onKeyDown","onKeyPress","onKeyUp","onLayoutComplete","onLoad","onLoseCapture","onMediaComplete","onMediaError","onMouseDown","onMouseEnter","onMouseLeave","onMouseMove","onMouseOut","onMouseOver","onMouseUp","onMouseWheel","onMove","onMoveEnd","onMoveStart","onOutOfSync","onPaste","onPause","onProgress","onPropertyChange","onReadyStateChange","onRepeat","onReset","onResize","onResizeEnd","onResizeStart","onResume","onReverse","onRowEnter","onRowExit","onRowDelete","onRowInserted","onScroll","onSeek","onSelect","onSelectionChange","onSelectStart","onStart","onStop","onSynchRestored","onSubmit","onTimeError","onTrackChange","onUnload","onURLFlip","seekSegmentTime","style","height","width"};
-  
-  private boolean isBadAttr(String attrName){
-    for (int i=0;i<badAttributes.length;i++){
-      if (badAttributes[i].toLowerCase().equals(attrName.toLowerCase()))
-       return true;
+  private String[] badAttributeValuePrefixes = {
+      "javascript", "vbscript", "about", "wysiwyg", "data", "view-source",
+      "ms-its", "mhtml", "shell", "lynxexec", "lynxcgi", "hcp", "ms-help",
+      "help", "disk", "vnd.ms.radio", "opera", "res", "resource", "chrome",
+      "mocha", "livescript"};
+
+
+  private String[] badAttributes = {
+      "onabort", "onblur", "onchange", "onclick", "ondblclick", "onerror",
+      "onfocus", "onkeydown", "onKeypress", "onkeyup", "onload", "onmousedown",
+      "onmousemove", "onmouseout", "onmouseover", "onmouseup", "onreset",
+      "onselect", "onsubmit", "onunload", "onload", "onclick", "onfocus",
+      "onblur", "FSCommand", "onAbort", "onActivate", "onAfterPrint",
+      "onAfterUpdate", "onBeforeActivate", "onBeforeCopy", "onBeforeCut",
+      "onBeforeDeactivate", "onBeforeEditFocus", "onBeforePaste",
+      "onBeforePrint", "onBeforeUnload", "onBegin", "onBlur", "onBounce",
+      "onCellChange", "onChange", "onClick", "onContextMenu", "onControlSelect",
+      "onCopy", "onCut", "onDataAvailible", "onDataSetChanged", "onDataSetComplete",
+      "onDblClick", "onDeactivate", "onDrag", "onDragEnd", "onDragLeave", "onDragEnter",
+      "onDragOver", "onDragDrop", "onDrop", "onEnd", "onError", "onErrorUpdate", "onExit",
+      "onFilterChange", "onFinish", "onFocus", "onFocusIn", "onFocusOut", "onHelp",
+      "onKeyDown", "onKeyPress", "onKeyUp", "onLayoutComplete", "onLoad", "onLoseCapture",
+      "onMediaComplete", "onMediaError", "onMouseDown", "onMouseEnter", "onMouseLeave",
+      "onMouseMove", "onMouseOut", "onMouseOver", "onMouseUp", "onMouseWheel", "onMove",
+      "onMoveEnd", "onMoveStart", "onOutOfSync", "onPaste", "onPause", "onProgress",
+      "onPropertyChange", "onReadyStateChange", "onRepeat", "onReset", "onResize",
+      "onResizeEnd", "onResizeStart", "onResume", "onReverse", "onRowEnter", "onRowExit",
+      "onRowDelete", "onRowInserted", "onScroll", "onSeek", "onSelect", "onSelectionChange",
+      "onSelectStart", "onStart", "onStop", "onSynchRestored", "onSubmit", "onTimeError",
+      "onTrackChange", "onUnload", "onURLFlip", "seekSegmentTime", "style", "height", "width"};
+
+  private boolean isBadAttr(String attrName) {
+    for (int i = 0; i < badAttributes.length; i++) {
+      if (badAttributes[i].toLowerCase().equals(attrName.toLowerCase())) {
+        return true;
       }
+    }
     return false;
   }
 
-  private String stripWhitespace(String aString){
-    try{
+  private String stripWhitespace(String aString) {
+    try {
       return regularExpressionWhitespace.substituteAll(aString, "");
-     }
-    catch (Throwable t){
+    }
+    catch (Throwable t) {
       return "";
     }
   }
 
   private boolean checkAttr(String attrName) {
-    if (isBadAttr(attrName)){
-       return false;
+    if (isBadAttr(attrName)) {
+      return false;
     }
     return true;
 
   }
 
   private boolean checkAttrValue(String attrValue) {
-    for (int i=0;i<badAttributeValuePrefixes.length;i++){
-      if ((stripWhitespace(attrValue.toLowerCase())).startsWith(badAttributeValuePrefixes[i].toLowerCase()+":")){
-       return false;
-      } 
+    for (int i = 0; i < badAttributeValuePrefixes.length; i++) {
+      if ((stripWhitespace(attrValue.toLowerCase())).startsWith(badAttributeValuePrefixes[i].toLowerCase() + ":")) {
+        return false;
+      }
     }
     return true;
   }
 
 
   private boolean checkNode(String nodeName) {
-    List languages =  StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.Whitelist"), ";");
-    
+    List languages = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.Whitelist"), ";");
+
     Iterator i = languages.iterator();
     while (i.hasNext()) {
-      if (nodeName.equals(i.next()))
+      if (nodeName.equals(i.next())) {
         return true;
+      }
     }
     return false;
   }
@@ -317,13 +403,13 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
               out.write(' ');
               out.write(attrs.item(i).getNodeName());
               out.write("=\"");
-             
+
               out.write(attrs.item(i).getNodeValue());
               out.write('"');
             }
           }
 
-          if (node.getChildNodes()==null || node.getChildNodes().getLength()==0) {
+          if (node.getChildNodes() == null || node.getChildNodes().getLength() == 0) {
             out.write("/");
           }
           out.write('>');
@@ -338,21 +424,21 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
         break;
 
       case Node.TEXT_NODE:
-       String value=node.getNodeValue();
-       try{
-         value=regularExpressionLT.substituteAll(value, "&lt;");
-         value=regularExpressionGT.substituteAll(value, "&gt;");
-       }
-       catch (Throwable t){
-         value="";
-       }
-       out.write(value);
+        String value = node.getNodeValue();
+        try {
+          value = regularExpressionLT.substituteAll(value, "&lt;");
+          value = regularExpressionGT.substituteAll(value, "&gt;");
+        }
+        catch (Throwable t) {
+          value = "";
+        }
+        out.write(value);
 
         break;
 
     }
 
-    if (type == Node.ELEMENT_NODE && canOutput && node.getChildNodes()!=null && node.getChildNodes().getLength()>0) {
+    if (type == Node.ELEMENT_NODE && canOutput && node.getChildNodes() != null && node.getChildNodes().getLength() > 0) {
       out.write("</");
       out.write(node.getNodeName());
       out.write('>');
@@ -362,13 +448,13 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
   }
 
   public static class Utility extends ReflectionGeneratorFunctionsAdapter {
-    public Utility () {
+    public Utility() {
       super(new MirBasicUtilityFunctions());
     }
 
     public Object getDatetime() {
       return new GeneratorDateTimeFunctions.DateTimeFunctions(
-        MirPropertiesConfiguration.instance().getString("Mir.DefaultTimezone"));
+          MirPropertiesConfiguration.instance().getString("Mir.DefaultTimezone"));
     }
 
     public Object getCompressWhitespace() {