*/
package mircoders.localizer.basic;
-import gnu.regexp.RE;
import mir.config.MirPropertiesConfiguration;
import mir.entity.adapter.EntityAdapter;
import mir.entity.adapter.EntityIteratorAdapter;
import mir.generator.GeneratorExc;
import mir.generator.GeneratorFailure;
import mir.log.LoggerWrapper;
-import mir.util.GeneratorDateTimeFunctions;
-import mir.util.GeneratorFormatAdapters;
-import mir.util.HTMLStripper;
-import mir.util.StringRoutines;
+import mir.util.*;
import mir.util.generator.ReflectionGeneratorFunctionsAdapter;
import mircoders.global.MirGlobal;
import mircoders.localizer.MirLocalizerExc;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Configuration;
import org.w3c.tidy.Tidy;
+import org.apache.oro.text.regex.*;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.StringWriter;
-import java.util.GregorianCalendar;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantLocalizer {
protected LoggerWrapper logger;
private HTMLStripper stripper;
- private RE regularExpressionLT;
- private RE regularExpressionGT;
- private RE regularExpressionWhitespace;
- private RE regularExpressionLeadingSlashes;
-
+ private Pattern regularExpressionWhitespace;
+ private Pattern regularExpressionLeadingSlashes;
+ private Set disallowedAttributes = new HashSet();
+ private Set disallowedPrefixes = new HashSet();
+ private Set allowedNodes = new HashSet();
+ private Set externalPrefixes = new HashSet();
+ private Set allowedExternalPrefixes = new HashSet();
+
+
+
public MirBasicProducerAssistantLocalizer() throws MirLocalizerFailure {
try {
stripper = new HTMLStripper();
+ Perl5Compiler compiler = new Perl5Compiler();
+
+ regularExpressionWhitespace = compiler.compile("\\s+|
|
", Perl5Compiler.READ_ONLY_MASK);
+ regularExpressionLeadingSlashes = compiler.compile("^//+", Perl5Compiler.READ_ONLY_MASK);
+
+ Iterator i = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.BadAttributes"), ";").iterator();
+ while (i.hasNext()) {
+ disallowedAttributes.add(((String) i.next()).toLowerCase());
+ }
+
+ i = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.BadAttributeValuePrefixes"), ";").iterator();
+ while (i.hasNext()) {
+ disallowedPrefixes.add(((String) i.next()).toLowerCase());
+ }
+
+ i = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.Whitelist"), ";").iterator();
+ while (i.hasNext()) {
+ allowedNodes.add(((String) i.next()).toLowerCase());
+ }
+
+ i = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.ExternalLocationAttributeValuePrefixes"), ";").iterator();
+ while (i.hasNext()) {
+ externalPrefixes.add(((String) i.next()).toLowerCase());
+ }
- regularExpressionLT = new RE("<");
- regularExpressionGT = new RE(">");
- regularExpressionWhitespace = new RE("\\s+|
|
");
- regularExpressionLeadingSlashes = new RE("^//+");
+ i = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.WhitelistedExternalLocationAttributeValuePrefixes"), ";").iterator();
+ while (i.hasNext()) {
+ allowedExternalPrefixes.add(((String) i.next()).toLowerCase());
+ }
}
catch (Throwable t) {
throw new MirLocalizerFailure(t);
}
-
- private boolean isBadAttr(String attrName) {
- List badAttributes = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.BadAttributes"), ";");
- Iterator i = badAttributes.iterator();
- while (i.hasNext()) {
- if (((String) i.next()).toLowerCase().equals(attrName.toLowerCase())) {
- return true;
- }
- }
- return false;
+ /**
+ * Tests whether an attribute with the given name may be kept in the output.
+ * The name is lower-cased and looked up in <code>disallowedAttributes</code>,
+ * so the comparison is case-insensitive.
+ *
+ * @param anAttibuteName the attribute name to test, in any case; must not be <code>null</code>
+ * @return <code>true</code> if the attribute is acceptable, i.e. its lower-cased
+ *         name is not contained in <code>disallowedAttributes</code>
+ */
+ private boolean testAttribueName(String anAttibuteName) {
+ return !disallowedAttributes.contains(anAttibuteName.toLowerCase());
}
private String stripWhitespace(String aString) {
- try {
- return regularExpressionWhitespace.substituteAll(aString, "");
- }
- catch (Throwable t) {
- return "";
- }
+ return Util.substitute( // replaces every match of regularExpressionWhitespace in aString with the empty string
+ new Perl5Matcher(), regularExpressionWhitespace, new Perl5Substitution(""), aString, Util.SUBSTITUTE_ALL); // a new matcher is created on each call; only the compiled pattern is reused
}
- private boolean checkAttr(String attrName) {
- if (isBadAttr(attrName)) {
- return false;
- }
- return true;
+ /**
+ * Tests whether an attribute value may be kept in the output.
+ * The value is lower-cased and stripped of whitespace, then rejected if it
+ * starts with one of <code>disallowedPrefixes</code> followed by a colon.
+ *
+ * @param anAttributeValue the attribute value to test; must not be <code>null</code>
+ * @return <code>false</code> if the normalised value starts with a disallowed
+ *         prefix plus <code>":"</code>, <code>true</code> otherwise
+ */
+ private boolean testAttibuteValue(String anAttributeValue) {
+ // normalise once, outside the loop: the result does not depend on the prefix being tested
+ String value = stripWhitespace(anAttributeValue.toLowerCase());
+ Iterator i = disallowedPrefixes.iterator();
- }
-
- private boolean checkAttrValue(String attrValue) {
- List badPrefixes = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.BadAttributeValuePrefixes"), ";");
- Iterator i = badPrefixes.iterator();
- while (i.hasNext()) {
- if ((stripWhitespace(attrValue.toLowerCase())).startsWith(((String) i.next()).toLowerCase() + ":")) {
+ while (i.hasNext()) {
+ // todo: split the attribute value on ':' and use disallowedPrefixes.contains() instead of scanning
+ // the entries of disallowedPrefixes are already lower-cased when they are loaded in the constructor
+ if (value.startsWith(((String) i.next()) + ":")) {
return false;
}
}
+
return true;
}
private boolean checkNode(String nodeName) {
- List acceptableNodes = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.Whitelist"), ";");
-
- Iterator i = acceptableNodes.iterator();
- while (i.hasNext()) {
- if (nodeName.equals(i.next())) {
- return true;
- }
- }
- return false;
+ return allowedNodes.contains(nodeName.toLowerCase()); // whitelist lookup; allowedNodes entries are lower-cased when loaded in the constructor, so the match is case-insensitive
}
- private boolean checkAttrInContext(String nodeName,String attrName,String attrValue){
+ /**
+ * Tests whether an attribute may be kept, given the tag it appears on.
+ * <code>href</code> on <code>a</code> and <code>action</code> on <code>form</code>
+ * are always kept. Unless an earlier branch has already accepted the attribute,
+ * the value is lower-cased and stripped of whitespace; if it starts with one of
+ * <code>externalPrefixes</code> it is rejected, unless it also starts with one of
+ * <code>allowedExternalPrefixes</code>. Values that match no external prefix are kept.
+ *
+ * @param aTag name of the element carrying the attribute (compared case-insensitively)
+ * @param anAttibute name of the attribute (compared case-insensitively)
+ * @param aValue value of the attribute
+ * @return <code>true</code> if the attribute may be written to the output
+ */
+ private boolean testAttributeInContext(String aTag, String anAttibute, String aValue){
/* The intent here is to prevent external content from being loaded by the user's browser.
It's extra paranoid, so will strip some legitimate stuff like an alt="http://www.indymedia.org"
*/
return true;
}
else {
- if ((nodeName.toLowerCase()).equals("a") && (attrName.toLowerCase()).equals("href") || (nodeName.toLowerCase()).equals("form") && (attrName.toLowerCase()).equals("action")){
- return true; //because we still love the web, even if it doesn't return the favor
+ if (("a".equalsIgnoreCase(aTag) && "href".equalsIgnoreCase(anAttibute)) ||
+ ("form".equalsIgnoreCase(aTag) && "action".equalsIgnoreCase(anAttibute))) {
+ // because we still love the web, even if it doesn't return the favor
+
+ return true;
}
else {
- List externalPrefixes = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.ExternalLocationAttributeValuePrefixes"), ";");
- List whitelist = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.WhitelistedExternalLocationAttributeValuePrefixes"), ";");
- Iterator i = externalPrefixes.iterator();
- while (i.hasNext()) {
- if ((stripWhitespace(attrValue.toLowerCase())).startsWith(((String) i.next()).toLowerCase())) {
- // we have hit a bad prefix, but we need to check the whitelist
- Iterator wl=whitelist.iterator();
- while (wl.hasNext()){
- if ((stripWhitespace(attrValue.toLowerCase())).startsWith(((String) wl.next()).toLowerCase())) {
- return true; //say, for example, something on a trusted server
+ // the prefix sets are lower-cased when loaded in the constructor, so only the value is normalised here
+ String value = stripWhitespace(aValue.toLowerCase());
+
+ Iterator i = externalPrefixes.iterator();
+ while (i.hasNext()) {
+ if (value.startsWith((String) i.next())) {
+ // we have hit a bad prefix, but we need to check the whitelist
+ Iterator wl = allowedExternalPrefixes.iterator();
+ while (wl.hasNext()) {
+ if (value.startsWith((String) wl.next())) {
+ return true;
+ }
+ }
+
+ // reject only when an external prefix actually matched and no whitelist entry applies;
+ // a non-matching prefix must fall through so the remaining prefixes are still checked
+ return false; //don't let this attribute through
+ }
}
- }
- return false; //don't let this attribute through
- }
- }
- return true; //didn't seem to be an external prefix, so it's fine
+
+ return true; //didn't seem to be an external prefix, so it's fine
}
}
}
+
private void print(Node node, StringWriter out) throws IOException {
if (node == null) {
return;
}
int type = node.getNodeType();
- boolean canOutput = checkNode(node.getNodeName());
+
+ // will this node be present in the output?
+ boolean keepNode = checkNode(node.getNodeName());
switch (type) {
break;
case Node.ELEMENT_NODE:
- if (canOutput) {
+ if (keepNode) {
out.write('<');
out.write(node.getNodeName());
for (int i = 0; i < attrs.getLength(); i++) {
String attrName = attrs.item(i).getNodeName();
String attrValue = attrs.item(i).getNodeValue();
- if (attrValue.startsWith("//")){
- attrValue=regularExpressionLeadingSlashes.substitute(attrValue, "/");
- }
+
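+ // collapses a leading run of two or more slashes ("//host/path") into a single "/";
+ // presumably this keeps protocol-relative URLs from pointing at an external host - todo: confirm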
+ if (attrValue.startsWith("//")){
+ attrValue = Util.substitute(
+ new Perl5Matcher(), regularExpressionLeadingSlashes, new Perl5Substitution("/"), attrValue);
+ }
- if (checkAttr(attrName) && checkAttrValue(attrValue) && checkAttrInContext(node.getNodeName(),attrName,attrValue)) {
+ if (testAttribueName(attrName) && testAttibuteValue(attrValue) && testAttributeInContext(node.getNodeName(), attrName, attrValue)) {
out.write(' ');
out.write(attrs.item(i).getNodeName());
out.write("=\"");
}
}
+ // nodes without children will use the shorthand form <br/>. Some browsers
+ // treat <br></br> as a double linebreak
if (node.getChildNodes() == null || node.getChildNodes().getLength() == 0) {
out.write("/");
+ out.write('>');
+ break;
}
out.write('>');
}
+
+
NodeList children = node.getChildNodes();
if (children != null) {
int len = children.getLength();
print(children.item(i), out);
}
}
- break;
- case Node.TEXT_NODE:
- String value = node.getNodeValue();
- try {
- value = regularExpressionLT.substituteAll(value, "<");
- value = regularExpressionGT.substituteAll(value, ">");
- }
- catch (Throwable t) {
- value = "";
+ if (keepNode) {
+ out.write("</");
+ out.write(node.getNodeName());
+ out.write('>');
}
- out.write(value);
break;
- }
+ case Node.TEXT_NODE:
+ out.write(HTMLRoutines.encodeHTML(node.getNodeValue()));
- if (type == Node.ELEMENT_NODE && canOutput && node.getChildNodes() != null && node.getChildNodes().getLength() > 0) {
- out.write("</");
- out.write(node.getNodeName());
- out.write('>');
+ break;
}
out.flush();