put some regular expressions in which turn
author: john <john>
Sat, 17 May 2003 16:01:12 +0000 (16:01 +0000)
committer: john <john>
Sat, 17 May 2003 16:01:12 +0000 (16:01 +0000)
html articles into pdf format (more or less; a lot is still left to be desired, but at least
it doesn't spit lots of angle brackets into the pdf!)

source/mircoders/pdf/PDFGenerator.java

index 2176f34..7679103 100755 (executable)
@@ -30,6 +30,8 @@
 package mircoders.pdf;
 
 import gnu.regexp.RE;
+import gnu.regexp.REMatch;
+import gnu.regexp.REMatchEnumeration;
 import gnu.regexp.REException;
 
 import java.io.ByteArrayOutputStream;
@@ -616,6 +618,7 @@ public class PDFGenerator{
     */
     
     EntityList images=DatabaseContentToMedia.getInstance().getImages(entityContent);
+    String isHTML  = entityContent.getValue("is_html");
     String theTitle = entityContent.getValue("title");
     String theCreator = entityContent.getValue("creator");
     String theDate = entityContent.getValue("webdb_create_formatted");
@@ -628,24 +631,87 @@ public class PDFGenerator{
     String theContent = "";
     String theDescription = "";
     
-    try { 
-      RE re1 = new RE("\r?\n\r?\n");
-      String theContent1 = re1.substituteAll(theContentRaw,"BREAKHERE");
-      String theDescription1 = re1.substituteAll(theDescriptionRaw,"BREAKHERE");
+    if (isHTML.equals("1")){
       
-      RE re2 = new RE("\r?\n");
-      String theContent2 = re2.substituteAll(theContent1," ");
-      String theDescription2 = re2.substituteAll(theDescription1," ");
       
-      RE re3 = new RE("BREAKHERE");
-      theContent = "    " + re3.substituteAll(theContent2,"\n    ");
-      theDescription = re3.substituteAll(theDescription2,"\n    ");
+            
+      try { 
+       RE nobackslashr = new RE("\r");
+       theContent= nobackslashr.substituteAll(theContentRaw,"");
+       
+       RE HxTag = new RE("</?h[1-6][^>]*>",RE.REG_ICASE);
+       theContent = HxTag.substituteAll(theContent,"\n\n");
+       
+       RE ListItemTag = new RE("<li[^>]*>",RE.REG_ICASE);
+       theContent = ListItemTag.substituteAll(theContent,"\n * ");
 
+       RE ListTag = new RE("<(u|o)l[^>]*>",RE.REG_ICASE);
+       theContent = ListTag.substituteAll(theContent,"\n");
+       
+       RE DivTag = new RE("</?div[^>]*>",RE.REG_ICASE);
+       theContent= DivTag.substituteAll(theContent,"\n");
+
+       RE PTag = new RE("<(p|P)([:space:]+[^>]*)?>");
+       theContent= PTag.substituteAll(theContent,"\n    ");
+
+       RE PTagClose = new RE("</(p|P)([:space:]+[^>]*)?>");
+       theContent= PTagClose.substituteAll(theContent,"\n");
+
+       RE BRTag = new RE("<(br|BR)([:space:]+[^>]*)?>");
+       theContent= BRTag.substituteAll(theContent,"\n");
+       
+       RE ATagAll = new RE("<a[^>]*href=(?:\"|\')([^#\"\'][^\'\"]+)(?:\"|\')[^>]*>(.*?)</a>",RE.REG_ICASE);
+       REMatchEnumeration atags= ATagAll.getMatchEnumeration(theContent);
+       String theContentCopy=theContent;
+       while (atags.hasMoreMatches()){
+         REMatch atag = atags.nextMatch();
+         String atagString=atag.toString();
+         String atagStringHref=atag.toString(1);
+         String atagStringText=atag.toString(2);
+         int begin=theContentCopy.indexOf(atagString);
+         theContentCopy=theContentCopy.substring(0,begin) + atagStringText + " ["+ atagStringHref + "] " + theContentCopy.substring(begin+atagString.length());
+       }
+       theContent=theContentCopy;
+       
+       RE noTags = new RE("<[^>]*>");
+       theContent= noTags.substituteAll(theContent," ");
+       
+       theContent=mir.util.Translate.decode(theContent);
+
+       RE re1 = new RE("\r?\n\r?\n");
+       String theDescription1 = re1.substituteAll(theDescriptionRaw,"BREAKHERE");
+       
+       RE re2 = new RE("\r?\n");
+       String theDescription2 = re2.substituteAll(theDescription1," ");
+       
+       RE re3 = new RE("BREAKHERE");
+       theDescription = re3.substituteAll(theDescription2,"\n    ");
+       
+
+      }
+      catch(REException ree){
+       logger.error(ree.getMessage());
+      }
     }
-    catch(REException ree){
-      logger.error(ree.getMessage());
-    }
+    else {
+      try { 
+       RE re1 = new RE("\r?\n\r?\n");
+       String theContent1 = re1.substituteAll(theContentRaw,"BREAKHERE");
+       String theDescription1 = re1.substituteAll(theDescriptionRaw,"BREAKHERE");
+       
+       RE re2 = new RE("\r?\n");
+       String theContent2 = re2.substituteAll(theContent1," ");
+       String theDescription2 = re2.substituteAll(theDescription1," ");
+       
+       RE re3 = new RE("BREAKHERE");
+       theContent = "    " + re3.substituteAll(theContent2,"\n    ");
+       theDescription = re3.substituteAll(theDescription2,"\n    ");
 
+      }
+      catch(REException ree){
+       logger.error(ree.getMessage());
+      }
+    }
 
     addArticleSeparator();