support for CAPTCHAs

[mir.git] / source / mircoders / localizer / basic / MirBasicProducerAssistantLocalizer.java
diff --git a/source/mircoders/localizer/basic/MirBasicProducerAssistantLocalizer.java b/source/mircoders/localizer/basic/MirBasicProducerAssistantLocalizer.java

index 9fbaa04..f79f4d4 100755 (executable)
--- a/source/mircoders/localizer/basic/MirBasicProducerAssistantLocalizer.java
+++ b/source/mircoders/localizer/basic/MirBasicProducerAssistantLocalizer.java
@@ -29,7 +29,6 @@
   */
  package mircoders.localizer.basic;
  
-import gnu.regexp.RE;
  import mir.config.MirPropertiesConfiguration;
  import mir.entity.adapter.EntityAdapter;
  import mir.entity.adapter.EntityIteratorAdapter;
@@ -37,10 +36,7 @@ import mir.generator.Generator;
  import mir.generator.GeneratorExc;
  import mir.generator.GeneratorFailure;
  import mir.log.LoggerWrapper;
-import mir.util.GeneratorDateTimeFunctions;
-import mir.util.GeneratorFormatAdapters;
-import mir.util.HTMLStripper;
-import mir.util.StringRoutines;
+import mir.util.*;
  import mir.util.generator.ReflectionGeneratorFunctionsAdapter;
  import mircoders.global.MirGlobal;
  import mircoders.localizer.MirLocalizerExc;
@@ -52,34 +48,60 @@ import org.w3c.dom.Node;
  import org.w3c.dom.NodeList;
  import org.w3c.tidy.Configuration;
  import org.w3c.tidy.Tidy;
+import org.apache.oro.text.regex.*;
  
  import java.io.ByteArrayInputStream;
  import java.io.IOException;
  import java.io.StringWriter;
-import java.util.GregorianCalendar;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
  
  public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantLocalizer {
    protected LoggerWrapper logger;
  
    private HTMLStripper stripper;
-  private RE regularExpressionLT;
-  private RE regularExpressionGT;
-  private RE regularExpressionWhitespace;
-  private RE regularExpressionLeadingSlashes;
-    
+  private Pattern regularExpressionWhitespace;
+  private Pattern regularExpressionLeadingSlashes;
+  private Set disallowedAttributes = new HashSet();
+  private Set disallowedPrefixes = new HashSet();
+  private Set allowedNodes = new HashSet();
+  private Set externalPrefixes = new HashSet();
+  private Set allowedExternalPrefixes = new HashSet();
+
+
+
  
    public MirBasicProducerAssistantLocalizer() throws MirLocalizerFailure {
      try {
        stripper = new HTMLStripper();
+      Perl5Compiler compiler = new Perl5Compiler();
+
+      regularExpressionWhitespace = compiler.compile("\\s+|&#x0A;|&#x0D;", Perl5Compiler.READ_ONLY_MASK);
+      regularExpressionLeadingSlashes = compiler.compile("^//+", Perl5Compiler.READ_ONLY_MASK);
+
+      Iterator i = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.BadAttributes"), ";").iterator();
+      while (i.hasNext()) {
+        disallowedAttributes.add(((String) i.next()).toLowerCase());
+      }
+
+      i = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.BadAttributeValuePrefixes"), ";").iterator();
+      while (i.hasNext()) {
+        disallowedPrefixes.add(((String) i.next()).toLowerCase());
+      }
+
+      i = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.Whitelist"), ";").iterator();
+      while (i.hasNext()) {
+        allowedNodes.add(((String) i.next()).toLowerCase());
+      }
+
+      i = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.ExternalLocationAttributeValuePrefixes"), ";").iterator();
+      while (i.hasNext()) {
+        externalPrefixes.add(((String) i.next()).toLowerCase());
+      }
  
-      regularExpressionLT = new RE("<");
-      regularExpressionGT = new RE(">");
-      regularExpressionWhitespace = new RE("\\s+|&#x0A;|&#x0D;");
-      regularExpressionLeadingSlashes = new RE("^//+");
+      i = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.WhitelistedExternalLocationAttributeValuePrefixes"), ";").iterator();
+      while (i.hasNext()) {
+        allowedExternalPrefixes.add(((String) i.next()).toLowerCase());
+      }
      }
      catch (Throwable t) {
        throw new MirLocalizerFailure(t);
@@ -297,60 +319,40 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
    }
  
  
-
-  private boolean isBadAttr(String attrName) {
-    List badAttributes = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.BadAttributes"), ";");
-    Iterator i = badAttributes.iterator();
-    while (i.hasNext()) {
-      if (((String) i.next()).toLowerCase().equals(attrName.toLowerCase())) {
-        return true;
-      }
-    }
-    return false;
+  /**
+   * Test whether attributes of the given type are acceptable
+   *
+   * @param anAttibuteName
+   * @return <code>true</code> if the attribute is acceptable
+   */
+  private boolean testAttribueName(String anAttibuteName) {
+    return !disallowedAttributes.contains(anAttibuteName.toLowerCase());
    }
  
    private String stripWhitespace(String aString) {
-    try {
-      return regularExpressionWhitespace.substituteAll(aString, "");
-    }
-    catch (Throwable t) {
-      return "";
-    }
+    return Util.substitute(
+            new Perl5Matcher(), regularExpressionWhitespace, new Perl5Substitution(""), aString, Util.SUBSTITUTE_ALL);
    }
  
-  private boolean checkAttr(String attrName) {
-    if (isBadAttr(attrName)) {
-      return false;
-    }
-    return true;
+  private boolean testAttibuteValue(String anAttributeValue) {
+    Iterator i = disallowedPrefixes.iterator();
  
-  }
-
-  private boolean checkAttrValue(String attrValue) {
-      List badPrefixes = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.BadAttributeValuePrefixes"), ";");
-      Iterator i = badPrefixes.iterator();
-      while (i.hasNext()) {
-         if ((stripWhitespace(attrValue.toLowerCase())).startsWith(((String) i.next()).toLowerCase() + ":")) {
+    while (i.hasNext()) {
+      // todo: split the attribute value on : and use contains
+      if ((stripWhitespace(anAttributeValue.toLowerCase())).startsWith(((String) i.next()).toLowerCase() + ":")) {
          return false;
        }
      }
+
      return true;
    }
  
  
    private boolean checkNode(String nodeName) {
-    List acceptableNodes = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.Whitelist"), ";");
-
-    Iterator i = acceptableNodes.iterator();
-    while (i.hasNext()) {
-      if (nodeName.equals(i.next())) {
-        return true;
-      }
-    }
-    return false;
+    return allowedNodes.contains(nodeName.toLowerCase());
    }
  
-  private boolean checkAttrInContext(String nodeName,String attrName,String attrValue){
+  private boolean testAttributeInContext(String aTag, String anAttibute, String aValue){
      /* The intent here is to prevent external content from being loaded by the user's browser.
         It's extra paranoid, so will strip some legitimate stuff like an alt="http://www.indymedia.org"
      */
@@ -358,35 +360,43 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
        return true;
      }
      else {
-      if ((nodeName.toLowerCase()).equals("a") && (attrName.toLowerCase()).equals("href") || (nodeName.toLowerCase()).equals("form") && (attrName.toLowerCase()).equals("action")){
-       return true;  //because we still love the web, even if it doesn't return the favor
+      if (("a".equalsIgnoreCase(aTag) && "href".equalsIgnoreCase(anAttibute)) ||
+          ("form".equalsIgnoreCase(aTag) && "action".equalsIgnoreCase(anAttibute))) {
+        // because we still love the web, even if it doesn't return the favor
+
+        return true;
        }
        else {
-        List externalPrefixes = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.ExternalLocationAttributeValuePrefixes"), ";");
-       List whitelist = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.WhitelistedExternalLocationAttributeValuePrefixes"), ";");
-       Iterator i = externalPrefixes.iterator();
-       while (i.hasNext()) {
-         if ((stripWhitespace(attrValue.toLowerCase())).startsWith(((String) i.next()).toLowerCase())) {
-           // we have hit a bad prefix, but we need to check the whitelist
-           Iterator wl=whitelist.iterator();
-           while (wl.hasNext()){
-             if ((stripWhitespace(attrValue.toLowerCase())).startsWith(((String) wl.next()).toLowerCase())) {
-               return true;  //say, for example, something on a trusted server   
+        String value = stripWhitespace(aValue.toLowerCase());
+
+        Iterator i = externalPrefixes.iterator();
+             while (i.hasNext()) {
+          if (value.startsWith((String) i.next())) {
+            // we have hit a bad prefix, but we need to check the whitelist
+            Iterator wl = allowedExternalPrefixes.iterator();
+            while (wl.hasNext()) {
+              if (value.startsWith((String) wl.next())) {
+                return true;
+              }
+            }
+          }
+
+          return false;  //don't let this attribute through
               }
-           }
-           return false;  //don't let this attribute through
-         }
-       }
-       return true; //didn't seem to be an external prefix, so it's fine
+
+        return true; //didn't seem to be an external prefix, so it's fine
        }
      }
    }
+
    private void print(Node node, StringWriter out) throws IOException {
      if (node == null) {
        return;
      }
      int type = node.getNodeType();
-    boolean canOutput = checkNode(node.getNodeName());
+
+    // will this node be present in the output?
+    boolean keepNode = checkNode(node.getNodeName());
  
      switch (type) {
  
@@ -397,7 +407,7 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
          break;
  
        case Node.ELEMENT_NODE:
-        if (canOutput) {
+        if (keepNode) {
            out.write('<');
  
            out.write(node.getNodeName());
@@ -406,11 +416,14 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
            for (int i = 0; i < attrs.getLength(); i++) {
              String attrName = attrs.item(i).getNodeName();
              String attrValue = attrs.item(i).getNodeValue();
-           if (attrValue.startsWith("//")){
-             attrValue=regularExpressionLeadingSlashes.substitute(attrValue, "/");
-           }
+
+            // todo: what is this?
+            if (attrValue.startsWith("//")){
+              attrValue = Util.substitute(
+                      new Perl5Matcher(), regularExpressionLeadingSlashes, new Perl5Substitution("/"), attrValue);
+            }
                             
-            if (checkAttr(attrName) && checkAttrValue(attrValue) && checkAttrInContext(node.getNodeName(),attrName,attrValue)) {
+            if (testAttribueName(attrName) && testAttibuteValue(attrValue) && testAttributeInContext(node.getNodeName(), attrName, attrValue)) {
                out.write(' ');
                out.write(attrs.item(i).getNodeName());
                out.write("=\"");
@@ -420,11 +433,17 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
              }
            }
  
+          // nodes without children will use the shorthand form <br/>. Some browsers
+          //    treat <br></br> as a double linebreak
            if (node.getChildNodes() == null || node.getChildNodes().getLength() == 0) {
              out.write("/");
+            out.write('>');
+            break;
            }
            out.write('>');
          }
+
+
          NodeList children = node.getChildNodes();
          if (children != null) {
            int len = children.getLength();
@@ -432,27 +451,19 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL
              print(children.item(i), out);
            }
          }
-        break;
  
-      case Node.TEXT_NODE:
-        String value = node.getNodeValue();
-        try {
-          value = regularExpressionLT.substituteAll(value, "&lt;");
-          value = regularExpressionGT.substituteAll(value, "&gt;");
-        }
-        catch (Throwable t) {
-          value = "";
+        if (keepNode) {
+          out.write("</");
+          out.write(node.getNodeName());
+          out.write('>');
          }
-        out.write(value);
  
          break;
  
-    }
+      case Node.TEXT_NODE:
+        out.write(HTMLRoutines.encodeHTML(node.getNodeValue()));
  
-    if (type == Node.ELEMENT_NODE && canOutput && node.getChildNodes() != null && node.getChildNodes().getLength() > 0) {
-      out.write("</");
-      out.write(node.getNodeName());
-      out.write('>');
+        break;
      }
  
      out.flush();