X-Git-Url: http://erislabs.net/gitweb/?a=blobdiff_plain;f=source%2Fmircoders%2Flocalizer%2Fbasic%2FMirBasicProducerAssistantLocalizer.java;h=f79f4d4e2461b5d7f2d136ecd1f3a459a02cfc72;hb=c26251faa299ed62d0e47c32636f440c554610ec;hp=5589145e45e4b3df7148fd5e1075395c371e37f0;hpb=855ecf8acedb12afbab7a621b2e2c0cf45b2f98f;p=mir.git diff --git a/source/mircoders/localizer/basic/MirBasicProducerAssistantLocalizer.java b/source/mircoders/localizer/basic/MirBasicProducerAssistantLocalizer.java index 5589145e..f79f4d4e 100755 --- a/source/mircoders/localizer/basic/MirBasicProducerAssistantLocalizer.java +++ b/source/mircoders/localizer/basic/MirBasicProducerAssistantLocalizer.java @@ -29,38 +29,90 @@ */ package mircoders.localizer.basic; -import java.util.GregorianCalendar; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; - import mir.config.MirPropertiesConfiguration; import mir.entity.adapter.EntityAdapter; import mir.entity.adapter.EntityIteratorAdapter; +import mir.generator.Generator; +import mir.generator.GeneratorExc; +import mir.generator.GeneratorFailure; import mir.log.LoggerWrapper; -import mir.misc.StringUtil; -import mir.util.GeneratorDateTimeFunctions; -import mir.util.GeneratorExpressionFunctions; -import mir.util.GeneratorFormatAdapters; -import mir.util.GeneratorHTMLFunctions; -import mir.util.GeneratorIntegerFunctions; -import mir.util.GeneratorListFunctions; -import mir.util.GeneratorRegularExpressionFunctions; -import mir.util.GeneratorStringFunctions; +import mir.util.*; +import mir.util.generator.ReflectionGeneratorFunctionsAdapter; import mircoders.global.MirGlobal; import mircoders.localizer.MirLocalizerExc; import mircoders.localizer.MirLocalizerFailure; import mircoders.localizer.MirProducerAssistantLocalizer; +import org.w3c.dom.Document; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.w3c.tidy.Configuration; +import org.w3c.tidy.Tidy; +import 
org.apache.oro.text.regex.*; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.StringWriter; +import java.util.*; public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantLocalizer { protected LoggerWrapper logger; - public void initializeGenerationValueSet(Map aValueSet) throws MirLocalizerExc, MirLocalizerFailure { + private HTMLStripper stripper; + private Pattern regularExpressionWhitespace; + private Pattern regularExpressionLeadingSlashes; + private Set disallowedAttributes = new HashSet(); + private Set disallowedPrefixes = new HashSet(); + private Set allowedNodes = new HashSet(); + private Set externalPrefixes = new HashSet(); + private Set allowedExternalPrefixes = new HashSet(); + + + + + public MirBasicProducerAssistantLocalizer() throws MirLocalizerFailure { + try { + stripper = new HTMLStripper(); + Perl5Compiler compiler = new Perl5Compiler(); + + regularExpressionWhitespace = compiler.compile("\\s+| | ", Perl5Compiler.READ_ONLY_MASK); + regularExpressionLeadingSlashes = compiler.compile("^//+", Perl5Compiler.READ_ONLY_MASK); + + Iterator i = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.BadAttributes"), ";").iterator(); + while (i.hasNext()) { + disallowedAttributes.add(((String) i.next()).toLowerCase()); + } + + i = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.BadAttributeValuePrefixes"), ";").iterator(); + while (i.hasNext()) { + disallowedPrefixes.add(((String) i.next()).toLowerCase()); + } + + i = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.Whitelist"), ";").iterator(); + while (i.hasNext()) { + allowedNodes.add(((String) i.next()).toLowerCase()); + } + + i = StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.ExternalLocationAttributeValuePrefixes"), ";").iterator(); + while (i.hasNext()) { + externalPrefixes.add(((String) i.next()).toLowerCase()); + } + + i = 
StringRoutines.splitString(MirGlobal.config().getString("Localizer.HTML.WhitelistedExternalLocationAttributeValuePrefixes"), ";").iterator(); + while (i.hasNext()) { + allowedExternalPrefixes.add(((String) i.next()).toLowerCase()); + } + } + catch (Throwable t) { + throw new MirLocalizerFailure(t); + } + } + + public void initializeGenerationValueSet(Map aValueSet) throws MirLocalizerExc, MirLocalizerFailure { try { Iterator i; Map configMap = new HashMap(); - Map utilityMap = new HashMap(); logger = new LoggerWrapper("Localizer.ProducerAssistant"); @@ -82,32 +134,15 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL // "new": configMap.putAll(MirPropertiesConfiguration.instance().allSettings()); - utilityMap.put("compressWhitespace", new freemarker.template.utility.CompressWhitespace()); - utilityMap.put("encodeHTML", new GeneratorHTMLFunctions.encodeHTMLGeneratorFunction()); - utilityMap.put("prettyEncodeHTML", new GeneratorHTMLFunctions.prettyEncodeHTMLGeneratorFunction()); - utilityMap.put("encodeXML", new GeneratorHTMLFunctions.encodeXMLGeneratorFunction()); - utilityMap.put("encodeURI", new GeneratorHTMLFunctions.encodeURIGeneratorFunction()); - utilityMap.put("subString", new GeneratorStringFunctions.subStringFunction()); - utilityMap.put("subList", new GeneratorListFunctions.subListFunction()); - utilityMap.put("isOdd", new GeneratorIntegerFunctions.isOddFunction()); - utilityMap.put("increment", new GeneratorIntegerFunctions.incrementFunction()); - utilityMap.put("evaluate", new GeneratorExpressionFunctions.evaluateExpressionFunction()); - utilityMap.put("constructString", new GeneratorStringFunctions.constructStructuredStringFunction()); - utilityMap.put("parseStructuredString", new GeneratorStringFunctions.structuredStringParserFunction()); - utilityMap.put("escapeJDBCString", new GeneratorStringFunctions.jdbcStringEscapeFunction()); - utilityMap.put("regexpreplace", new 
GeneratorRegularExpressionFunctions.regularExpressionReplaceFunction()); - utilityMap.put("regexpmatch", new GeneratorRegularExpressionFunctions.regularExpressionMatchFunction()); - utilityMap.put("datetime", new GeneratorDateTimeFunctions.DateTimeFunctions( - MirPropertiesConfiguration.instance().getString("Mir.DefaultTimezone"))); - aValueSet.put("config", configMap); - aValueSet.put("utility", utilityMap); + + aValueSet.put("utility", new Utility()); aValueSet.put("languages", - new EntityIteratorAdapter("", "", 20, MirGlobal.localizer().dataModel().adapterModel(), "language")); + new EntityIteratorAdapter("", "", 20, MirGlobal.localizer().dataModel().adapterModel(), "language")); aValueSet.put("topics", - new EntityIteratorAdapter("", "", 20, MirGlobal.localizer().dataModel().adapterModel(), "topic")); + new EntityIteratorAdapter("", "", 20, MirGlobal.localizer().dataModel().adapterModel(), "topic")); Map articleTypeMap = new HashMap(); articleTypeMap.put("openposting", "0"); @@ -132,26 +167,77 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL commentStatusMap.put(commentStatus.get("name"), commentStatus.get("id")); } aValueSet.put("commentstatus", commentStatusMap); + aValueSet.put("languageCodeToId", new getLanguageIdFunction()); } catch (Throwable t) { logger.error("initializeGenerationValueSet: Exception while collecting comment statuses" + t.getMessage()); - throw new RuntimeException(t.getMessage()); + + throw new MirLocalizerFailure(t); + } + + } + + public static class getLanguageIdFunction implements Generator.Function { + private Map languageCodeToId; + private String otherLanguageId; + private LoggerWrapper logger = new LoggerWrapper("Localizer.Earth.getLanguageIdFunction"); + + public getLanguageIdFunction() throws MirLocalizerFailure { + try { + otherLanguageId = ""; + languageCodeToId = new HashMap(); + + Iterator i = new EntityIteratorAdapter("", "", 20, MirGlobal.localizer().dataModel().adapterModel(), 
"language"); + while (i.hasNext()) { + EntityAdapter language = (EntityAdapter) i.next(); + if (language.get("code").equals("ot")) { + otherLanguageId = (String) language.get("id"); + } + + languageCodeToId.put(language.get("code"), language.get("id")); + } + } + catch (Throwable t) { + logger.error(t.toString()); + + throw new MirLocalizerFailure(t); + } + } + + public Object perform(List aParameters) throws GeneratorExc, GeneratorFailure { + try { + if (aParameters.size() != 1) { + throw new GeneratorExc("getLanguageIdFunction: 1 parameter expected: language-code"); + } + + String result = (String) languageCodeToId.get(aParameters.get(0)); + if (result == null) { + result = otherLanguageId; + } + + return result; + } + catch (GeneratorExc e) { + throw e; + } + catch (Throwable t) { + throw new GeneratorFailure("getLanguageIdFunction: " + t.getMessage(), t); + } } + } - }; public String filterNonHTMLText(String aText) { logger.debug("about to filter non HTML Text of length " + aText.length()); try { String result = - StringUtil.createHTML( - StringUtil.removeHTMLTags(aText), - MirGlobal.config().getString("Producer.ImageRoot"), - MirGlobal.config().getString("Producer.MailLinkName"), - MirGlobal.config().getString("Producer.ExtLinkName"), - MirGlobal.config().getString("Producer.IntLinkName") - ); + stripper.createHTML( + stripper.removeHTMLTags(aText), + MirGlobal.config().getString("Producer.ImageRoot"), + MirGlobal.config().getString("Producer.MailLinkName"), + MirGlobal.config().getString("Producer.ExtLinkName"), + MirGlobal.config().getString("Producer.IntLinkName")); logger.debug("done filtering non-HTML text "); return result; } @@ -162,7 +248,239 @@ public class MirBasicProducerAssistantLocalizer implements MirProducerAssistantL } } + public Generator.Interceptor createGenerationInterceptor() throws MirLocalizerExc, MirLocalizerFailure { + + if (MirGlobal.config().getBoolean("Mir.Producer.UseInterceptor", true)) { + return new Generator.Interceptor() { + + 
public Object intercept(Object anObject) { + if (anObject instanceof EntityAdapter) { + return new InterceptedEntityAdapter((EntityAdapter) anObject); + } + + return anObject; + } + }; + } + else { + return null; + } + } + + public class InterceptedEntityAdapter { + private EntityAdapter adapter; + + InterceptedEntityAdapter(EntityAdapter anEntityAdapter) { + adapter = anEntityAdapter; + } + + public Object get(String aField) { + Object result = adapter.get(aField); + if (result instanceof String) { + return filterHTMLText((String) result); + } + else { + return result; + } + } + + public Object getRaw() { + return new RawEntityAdapter(adapter); + } + } + + public class RawEntityAdapter { + private EntityAdapter adapter; + + RawEntityAdapter(EntityAdapter anEntityAdapter) { + adapter = anEntityAdapter; + } + + public Object get(String aField) { + return adapter.get(aField); + } + } + public String filterHTMLText(String aText) { - return StringUtil.deleteForbiddenTags(aText); + try { + StringWriter out = new StringWriter(); + Tidy tidy = new Tidy(); + ByteArrayInputStream in = new ByteArrayInputStream(aText.getBytes("UTF8")); + tidy.setMakeClean(true); + tidy.setCharEncoding(Configuration.UTF8); + tidy.setErrout(logger.asPrintWriter(LoggerWrapper.DEBUG_MESSAGE)); + print(tidy.parseDOM(in, null), out); + + return out.toString(); + } + catch (IOException e) { + return e.getMessage(); + } + } + + + /** + * Test whether attributes of the given type are acceptable + * + * @param anAttibuteName + * @return true if the attribute is acceptable + */ + private boolean testAttribueName(String anAttibuteName) { + return !disallowedAttributes.contains(anAttibuteName.toLowerCase()); + } + + private String stripWhitespace(String aString) { + return Util.substitute( + new Perl5Matcher(), regularExpressionWhitespace, new Perl5Substitution(""), aString, Util.SUBSTITUTE_ALL); + } + + private boolean testAttibuteValue(String anAttributeValue) { + Iterator i = 
disallowedPrefixes.iterator(); + + while (i.hasNext()) { + // todo: split the attribute value on : and use contains + if ((stripWhitespace(anAttributeValue.toLowerCase())).startsWith(((String) i.next()).toLowerCase() + ":")) { + return false; + } + } + + return true; + } + + + private boolean checkNode(String nodeName) { + return allowedNodes.contains(nodeName.toLowerCase()); + } + + private boolean testAttributeInContext(String aTag, String anAttibute, String aValue){ + /* The intent here is to prevent external content from being loaded by the user's browser. + It's extra paranoid, so will strip some legitimate stuff like an alt="http://www.indymedia.org" + */ + if (! MirGlobal.config().getBoolean("Localizer.HTML.KillWebBugs")) { + return true; + } + else { + if (("a".equalsIgnoreCase(aTag) && "href".equalsIgnoreCase(anAttibute)) || + ("form".equalsIgnoreCase(aTag) && "action".equalsIgnoreCase(anAttibute))) { + // because we still love the web, even if it doesn't return the favor + + return true; + } + else { + String value = stripWhitespace(aValue.toLowerCase()); + + Iterator i = externalPrefixes.iterator(); + while (i.hasNext()) { + if (value.startsWith((String) i.next())) { + // we have hit a bad prefix, but we need to check the whitelist + Iterator wl = allowedExternalPrefixes.iterator(); + while (wl.hasNext()) { + if (value.startsWith((String) wl.next())) { + return true; + } + } + } + + return false; //don't let this attribute through + } + + return true; //didn't seem to be an external prefix, so it's fine + } + } + } + + private void print(Node node, StringWriter out) throws IOException { + if (node == null) { + return; + } + int type = node.getNodeType(); + + // will this node be present in the output? 
+ boolean keepNode = checkNode(node.getNodeName()); + + switch (type) { + + case Node.DOCUMENT_NODE: + + print(((Document) node).getDocumentElement(), out); + out.flush(); + break; + + case Node.ELEMENT_NODE: + if (keepNode) { + out.write('<'); + + out.write(node.getNodeName()); + NamedNodeMap attrs = node.getAttributes(); + + for (int i = 0; i < attrs.getLength(); i++) { + String attrName = attrs.item(i).getNodeName(); + String attrValue = attrs.item(i).getNodeValue(); + + // todo: what is this? + if (attrValue.startsWith("//")){ + attrValue = Util.substitute( + new Perl5Matcher(), regularExpressionLeadingSlashes, new Perl5Substitution("/"), attrValue); + } + + if (testAttribueName(attrName) && testAttibuteValue(attrValue) && testAttributeInContext(node.getNodeName(), attrName, attrValue)) { + out.write(' '); + out.write(attrs.item(i).getNodeName()); + out.write("=\""); + + out.write(attrs.item(i).getNodeValue()); + out.write('"'); + } + } + + // nodes without children will use the shorthand form
<br/>. Some browsers + // treat <br></br>
as a double linebreak + if (node.getChildNodes() == null || node.getChildNodes().getLength() == 0) { + out.write("/"); + out.write('>'); + break; + } + out.write('>'); + } + + + NodeList children = node.getChildNodes(); + if (children != null) { + int len = children.getLength(); + for (int i = 0; i < len; i++) { + print(children.item(i), out); + } + } + + if (keepNode) { + out.write("</"); + out.write(node.getNodeName()); + out.write('>'); + } + + break; + + case Node.TEXT_NODE: + out.write(HTMLRoutines.encodeHTML(node.getNodeValue())); + + break; + } + + out.flush(); + } + + public static class Utility extends ReflectionGeneratorFunctionsAdapter { + public Utility() { + super(new MirBasicUtilityFunctions()); + } + + public Object getDatetime() { + return new GeneratorDateTimeFunctions.DateTimeFunctions( + MirPropertiesConfiguration.instance().getString("Mir.DefaultTimezone")); + } + + public Object getCompressWhitespace() { + return new freemarker.template.utility.CompressWhitespace(); + } } }