fix the text splitting routines
[mir.git] / source / mircoders / producer / PDFPreFormattingProducerNode.java
index 44f657a..43dd3c5 100755 (executable)
@@ -97,25 +97,28 @@ public class PDFPreFormattingProducerNode implements ProducerNode {
       
 
       EntityList images=DatabaseContentToMedia.getInstance().getImages((EntityContent)entity);
+      String theContent = ((EntityContent) entity).getValue("content_data"); 
       if (images == null){
          HashMap row = new HashMap();
-         row.put("text",((EntityContent) entity).getValue("content_data"));
+         row.put("text",theContent);
          row.put("hasImage","0");
          brokenUpContent.add(row);
       }
       if (images != null){
          //need to add checks for out of content!
          HashMap row0 = new HashMap();
-         if (numCharsInAnImagelessRow>(((EntityContent) entity).getValue("content_data")).length()){
-             row0.put("text",((EntityContent) entity).getValue("content_data"));
+         if (numCharsInAnImagelessRow>(theContent).length()){
+             row0.put("text",theContent);
              outOfText = true;
          }
          else {
-             row0.put("text",((EntityContent) entity).getValue("content_data").substring(0,numCharsInAnImagelessRow));
+             //break at the last space at or before the limit so we don't split html entities
+             int lastSpaceAt = theContent.lastIndexOf(" ",numCharsInAnImagelessRow);
+             row0.put("text",theContent.substring(0,lastSpaceAt));
+             currentPosition=lastSpaceAt;
          }
          row0.put("hasImage","0");
          brokenUpContent.add(row0);
-         currentPosition=numCharsInAnImagelessRow;
          aLogger.println("CP1 is "+ currentPosition);
          while(images.hasNext()){
              HashMap row1 = new HashMap();
@@ -150,34 +153,37 @@ public class PDFPreFormattingProducerNode implements ProducerNode {
              row1.put("hasImage","1");
              if (! outOfText){
                  try {
-                     row1.put("text",((EntityContent) entity).getValue("content_data").substring(currentPosition,currentPosition+text_amount));
+                     int lastSpaceAt = theContent.lastIndexOf(" ",currentPosition+text_amount);
+                     row1.put("text",theContent.substring(currentPosition,lastSpaceAt));
+                     currentPosition=lastSpaceAt;
                  }
                  catch (IndexOutOfBoundsException e){
-                     row1.put("text",((EntityContent) entity).getValue("content_data").substring(currentPosition));
+                     row1.put("text",theContent.substring(currentPosition));
                      outOfText = true;
                          }
              }
-             currentPosition=currentPosition+text_amount;
              aLogger.println("CP2 is "+ currentPosition);
              brokenUpContent.add(row1);
              
              if (! outOfText){
                  try {
-                     row2.put("text",((EntityContent) entity).getValue("content_data").substring(currentPosition,currentPosition+numCharsInAnImagelessRow));
+                     int lastSpaceAt = theContent.lastIndexOf(" ",currentPosition+numCharsInAnImagelessRow);
+                     row2.put("text",theContent.substring(currentPosition,lastSpaceAt));
+                     currentPosition=lastSpaceAt;
                  }
                  catch (IndexOutOfBoundsException e){
-                     row2.put("text",((EntityContent) entity).getValue("content_data").substring(currentPosition));
+                     row2.put("text",theContent.substring(currentPosition));
                      outOfText = true;
                          }
              }
              row2.put("hasImage","0");
              brokenUpContent.add(row2);
-             currentPosition=currentPosition+numCharsInAnImagelessRow;
+
              aLogger.println("CP3 is "+ currentPosition);
          }
          HashMap row3 = new HashMap();
          if (! outOfText){
-             row3.put("text",((EntityContent) entity).getValue("content_data").substring(currentPosition));
+             row3.put("text",theContent.substring(currentPosition));
              row3.put("hasImage","0");
              brokenUpContent.add(row3);
          }