further refining the filter...
[mir.git] / source / mircoders / localizer / basic / MirBasicProducerAssistantLocalizer.java
index 3c9166f..6697d4c 100755 (executable)
@@ -38,6 +38,8 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 
+import gnu.regexp.RE;
+
 import mir.config.MirPropertiesConfiguration;
 import mir.entity.adapter.EntityAdapter;
 import mir.entity.adapter.EntityIteratorAdapter;
@@ -65,6 +67,21 @@ import org.w3c.tidy.Tidy;
 public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantLocalizer {
   protected LoggerWrapper logger;
 
+  // Pre-compiled patterns, built once in the constructor and reused by the
+  // filtering code of this class.
+  private RE regularExpressionLT;          // matches a literal "<"
+  private RE regularExpressionGT;          // matches a literal ">"
+  private RE regularExpressionWhitespace;  // matches a run of whitespace
+
+  /**
+   * Compiles the regular expressions used by this localizer.
+   *
+   * @throws MirLocalizerFailure wrapping whatever was thrown while the
+   *         expressions were being compiled
+   */
+  public MirBasicProducerAssistantLocalizer() throws MirLocalizerFailure {
+    try {
+      regularExpressionWhitespace = new RE("\\s+");
+      regularExpressionGT = new RE(">");
+      regularExpressionLT = new RE("<");
+    }
+    catch (Throwable failure) {
+      throw new MirLocalizerFailure(failure);
+    }
+  }
+
   public void initializeGenerationValueSet(Map aValueSet) throws MirLocalizerExc, MirLocalizerFailure  {
     try {
       Iterator i;
@@ -219,15 +236,50 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
     }
   }
 
+
+  /**
+   * Prefixes that make an attribute value unacceptable: a value is rejected
+   * by checkAttrValue when, after lower-casing and whitespace removal, it
+   * starts with one of these followed by a colon.
+   */
+  private static final String[] badAttributeValuePrefixes = {
+    "javascript", "vbscript", "about", "wysiwyg", "data", "view-source",
+    "ms-its", "mhtml", "shell", "lynxexec", "lynxcgi", "hcp", "ms-help",
+    "help", "disk", "vnd.ms.radio", "opera", "res", "resource", "chrome",
+    "mocha", "livescript"
+  };
+
+  /**
+   * Attribute names that isBadAttr reports as bad (compared without regard
+   * to case). Each name is listed exactly once.
+   */
+  private static final String[] badAttributes = {
+    "onabort", "onblur", "onchange", "onclick", "ondblclick", "onerror",
+    "onfocus", "onkeydown", "onKeypress", "onkeyup", "onload", "onmousedown",
+    "onmousemove", "onmouseout", "onmouseover", "onmouseup", "onreset",
+    "onselect", "onsubmit", "onunload", "style", "height", "width"
+  };
+  
+  /**
+   * Tells whether an attribute name appears in the badAttributes list.
+   *
+   * The comparison ignores case and is deliberately locale-independent:
+   * lower-casing with the default locale would turn "ONCLICK" into
+   * "onclıck" (dotless i) under a Turkish locale, and the name would then
+   * no longer match its blacklist entry.
+   *
+   * @param attrName the attribute name to test; null is reported as not bad
+   * @return true if the name equals, ignoring case, an entry of badAttributes
+   */
+  private boolean isBadAttr(String attrName){
+    for (int i=0;i<badAttributes.length;i++){
+      if (badAttributes[i].equalsIgnoreCase(attrName)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Removes every match of regularExpressionWhitespace from a string.
+   *
+   * @param aString the text to strip
+   * @return the text without whitespace runs, or the empty string if the
+   *         substitution throws anything at all
+   */
+  private String stripWhitespace(String aString){
+    String stripped;
+    try {
+      stripped = regularExpressionWhitespace.substituteAll(aString, "");
+    }
+    catch (Throwable failure) {
+      // NOTE(review): a failure yields "", which checkAttrValue then treats
+      // as a harmless value -- confirm this fallback is intended.
+      stripped = "";
+    }
+    return stripped;
+  }
+
+  /**
+   * Decides whether an attribute may be kept, judged by its name alone.
+   *
+   * @param attrName the attribute name
+   * @return false if isBadAttr reports the name as bad, true otherwise
+   */
   private boolean checkAttr(String attrName) {
-    if (attrName.equals("onLoad") || attrName.equals("onClick") || attrName.equals("onFocus") || attrName.equals("onBlur") || attrName.equals("onMouseOver") || attrName.equals("onMouseOut") || attrName.equals("style") || attrName.equals("STYLE") || attrName.equals("height") || attrName.equals("width") || attrName.equals("HEIGHT") || attrName.equals("WIDTH"))
-      return false;
-               return true;
+    return !isBadAttr(attrName);
 
   }
 
+  /**
+   * Decides whether an attribute value may be kept.
+   *
+   * The value is lower-cased, stripped of whitespace via stripWhitespace and
+   * rejected if the result starts with one of badAttributeValuePrefixes
+   * followed by a colon.
+   *
+   * Lower-casing uses a fixed locale on purpose: with the default locale a
+   * Turkish environment maps "JAVASCRIPT:" to "javascrıpt:" (dotless i),
+   * which would not be recognised as a bad prefix.
+   *
+   * @param attrValue the attribute value; null is accepted since it cannot
+   *        carry any prefix
+   * @return false if the value starts with a bad prefix, true otherwise
+   */
+  private boolean checkAttrValue(String attrValue) {
+    if (attrValue == null) {
+      return true;
+    }
+
+    // Normalise once instead of on every loop iteration.
+    String normalizedValue =
+        stripWhitespace(attrValue.toLowerCase(java.util.Locale.ENGLISH));
+
+    for (int i=0;i<badAttributeValuePrefixes.length;i++){
+      String badPrefix =
+          badAttributeValuePrefixes[i].toLowerCase(java.util.Locale.ENGLISH) + ":";
+      if (normalizedValue.startsWith(badPrefix)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+
   private boolean checkNode(String nodeName) {
-    List languages =  StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.Whitelist"), ",");
+    List languages =  StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.Whitelist"), ";");
+    
     Iterator i = languages.iterator();
     while (i.hasNext()) {
       if (nodeName.equals(i.next()))
@@ -260,11 +312,12 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
 
           for (int i = 0; i < attrs.getLength(); i++) {
             String attrName = attrs.item(i).getNodeName();
-            if (checkAttr(attrName)) {
+            String attrValue = attrs.item(i).getNodeValue();
+            if (checkAttr(attrName) && checkAttrValue(attrValue)) {
               out.write(' ');
               out.write(attrs.item(i).getNodeName());
               out.write("=\"");
-
+             
               out.write(attrs.item(i).getNodeValue());
               out.write('"');
             }
@@ -285,7 +338,16 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
         break;
 
       case Node.TEXT_NODE:
-        out.write(node.getNodeValue());
+       String value=node.getNodeValue();
+       try{
+         value=regularExpressionLT.substituteAll(value, "&lt;");
+         value=regularExpressionGT.substituteAll(value, "&gt;");
+       }
+       catch (Throwable t){
+         value="";
+       }
+       out.write(value);
+
         break;
 
     }