From: john Date: Thu, 19 Apr 2007 18:08:11 +0000 (+0000) Subject: Use jakarta feedparser to parse RSS2 files. X-Git-Url: http://erislabs.net/gitweb/?p=mir.git;a=commitdiff_plain;h=0b28407d4cbb0b1fd2f99c998192292a4a486943 Use jakarta feedparser to parse RSS2 files. Theoretically this could be used to parse everything, including Atom. --- diff --git a/lib/commons-feedparser-0.5-SNAPSHOT.jar b/lib/commons-feedparser-0.5-SNAPSHOT.jar new file mode 100644 index 00000000..3a802e30 Binary files /dev/null and b/lib/commons-feedparser-0.5-SNAPSHOT.jar differ diff --git a/lib/jaxen-1.0-FCS-full.jar b/lib/jaxen-1.0-FCS-full.jar new file mode 100644 index 00000000..9f5227eb Binary files /dev/null and b/lib/jaxen-1.0-FCS-full.jar differ diff --git a/lib/jdom-b9.jar b/lib/jdom-b9.jar new file mode 100644 index 00000000..92fc140b Binary files /dev/null and b/lib/jdom-b9.jar differ diff --git a/lib/saxpath-1.0-FCS.jar b/lib/saxpath-1.0-FCS.jar new file mode 100644 index 00000000..43cccc63 Binary files /dev/null and b/lib/saxpath-1.0-FCS.jar differ diff --git a/source/mir/producer/RSSProducerNode.java b/source/mir/producer/RSSProducerNode.java index 2c6c0488..2d485a90 100755 --- a/source/mir/producer/RSSProducerNode.java +++ b/source/mir/producer/RSSProducerNode.java @@ -29,6 +29,7 @@ package mir.producer; import mir.log.LoggerWrapper; import mir.rss.RSS091Reader; +import mir.rss.RSS2Reader; import mir.rss.RSSData; import mir.rss.RSSReader; import mir.util.ParameterExpander; @@ -83,10 +84,20 @@ public class RSSProducerNode extends AbstractProducerNode { rssData = reader.parseUrl(expandedUrl); } } + else if (expandedVersion.equals("2")) { + RSS2Reader reader = new RSS2Reader(); + if (expandedEncoding!=null) { + rssData = reader.parseUrl(expandedUrl, expandedEncoding); + } + else { + rssData = reader.parseUrl(expandedUrl); + } + } + ParameterExpander.setValueForKey(aValueMap, expandedKey, rssData); } catch (Throwable t) { aLogger.warn("Error while processing RSS data", t); } } -} \ No 
newline at end of file +} diff --git a/source/mir/rss/RSS2Reader.java b/source/mir/rss/RSS2Reader.java new file mode 100755 index 00000000..f37149ca --- /dev/null +++ b/source/mir/rss/RSS2Reader.java @@ -0,0 +1,170 @@ +/*TK make sure the items go somewhere! */ + +/* + * Copyright (C) 2001-2006 The Mir-coders group + * + * This file is part of Mir. + * + * Mir is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Mir is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Mir; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * In addition, as a special exception, The Mir-coders gives permission to link + * the code of this program with any library licensed under the Apache Software License, + * and distribute linked combinations including the two. You must obey the + * GNU General Public License in all respects for all of the code used other than + * the above mentioned libraries. If you modify this file, you may extend this + * exception to your version of the file, but you are not obligated to do so. + * If you do not wish to do so, delete this exception statement from your version. 
+ */ +package mir.rss; + +import mir.log.LoggerWrapper; +import mir.util.HTTPClientHelper; + +import org.apache.commons.feedparser.*; +import org.apache.commons.feedparser.impl.*; +import org.apache.commons.feedparser.network.*; + + +import java.io.InputStream; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.LinkedList; +import java.util.Map; + +/** + * + *

Title:

+ *

Description:

+ *

Copyright: Copyright (c) 2007

+ *

Company:

+ * <p>
+ * Parses RSS 2 feeds with the commons-feedparser library. Every item the
+ * parser reports is turned into an <code>rss:item</code> resource; when the
+ * parser signals the end of the item, the resource is added to the
+ * <code>RSSData</code> object returned by the parse methods.
+ * </p>
+ * <p>
+ * Note: the result object and the item list are created once, in the
+ * constructor, so parsing several feeds with the same instance accumulates
+ * their items in one result.
+ * </p>
+ *
+ * @author not attributable
+ * @version 1.0
+ */
+
+public class RSS2Reader {
+  /**
+   * Layout of the value stored under <code>dc:date</code>, for example
+   * <code>2007-04-19T18:08:11+0000</code>. The literal <code>T</code> has to
+   * be quoted: unquoted it is an illegal pattern character and
+   * <code>SimpleDateFormat</code> refuses the whole pattern.
+   */
+  private static final String DATE_PATTERN = "yyyy-MM-dd'T'HH:mm:ssZ";
+
+  private FeedParser parser;
+  private FeedParserListener listener;
+  private RSSData result;
+  private List items;
+  /** Created in the constructor; not read anywhere else in this class. */
+  private RDFResource channel;
+  static final LoggerWrapper logger =
+    new LoggerWrapper("rss.rss2reader");
+
+
+  /**
+   * Returns the item most recently reported by the parser.
+   *
+   * @return the last collected item, or a placeholder resource when no item
+   *         has been collected yet (so callbacks never receive null)
+   */
+  private RDFResource currentItem(){
+    // Check explicitly instead of provoking an IndexOutOfBoundsException
+    if (items.isEmpty()) {
+      logger.debug( "Something screwy happened!");
+      return new RDFResource("nonexistent","entity");
+    }
+
+    return (RDFResource) items.get(items.size()-1);
+  }
+
+
+  /**
+   * Creates the feed parser and the listener that collects the parsed items.
+   *
+   * @throws org.apache.commons.feedparser.FeedParserException passed on from
+   *         <code>FeedParserFactory.newFeedParser()</code>
+   */
+  public RSS2Reader() throws org.apache.commons.feedparser.FeedParserException {
+    parser = FeedParserFactory.newFeedParser();
+
+    result = new RSSData();
+    items = new ArrayList();
+    channel = new RDFResource("rss:channel");
+
+
+    listener = new DefaultFeedParserListener() {
+
+      /** Starts a new item; later callbacks fill in the most recent item. */
+      public void onItem( FeedParserState state,
+                          String title,
+                          String link,
+                          String description,
+                          String permalink ) throws FeedParserException {
+        logger.debug( "Found a new published article: " + permalink );
+        RDFResource item = new RDFResource("rss:item", link);
+        item.set("rss:link",permalink);
+        item.set("rss:title",title);
+        item.set("rss:description",description);
+        items.add(item);
+      }
+
+      /** Stores the author name of the current item as <code>dc:creator</code>. */
+      public void onAuthor(FeedParserState state, java.lang.String name, java.lang.String email, java.lang.String resource){
+        currentItem().set("dc:creator",name);
+      }
+
+      /** Stores the content value of the current item as <code>content.encoded</code>. */
+      public void onContent(FeedParserState state, java.lang.String type, java.lang.String format, java.lang.String encoding, java.lang.String mode, java.lang.String value, boolean isSummary) {
+        currentItem().set("content.encoded",value);
+      }
+
+      /** Stores the creation date of the current item as <code>dc:date</code>. */
+      public void onCreated(FeedParserState state,java.util.Date date){
+        if (date == null) {
+          // nothing to format; leave dc:date unset
+          return;
+        }
+
+        // SimpleDateFormat is not thread safe, so a fresh instance is used per call
+        SimpleDateFormat formatter = new SimpleDateFormat(DATE_PATTERN);
+        currentItem().set("dc:date",formatter.format(date));
+      }
+
+      /** Adds the finished item to the result. */
+      public void onItemEnd(){
+        result.addResource(currentItem());
+        logger.debug( "Finished processing article" );
+      }
+    };
+  }
+
+  /**
+   * Parses a feed from a stream. The stream is not closed by this method.
+   *
+   * @param aStream the stream to read the feed from
+   * @return the data collected so far by this reader
+   * @throws RSSFailure wrapping anything thrown while parsing
+   */
+  public RSSData parseInputStream(InputStream aStream) throws RSSExc, RSSFailure {
+    try {
+      parser.parse( listener, aStream, "" );
+      return result;
+    }
+    catch (Throwable t) {
+      throw new RSSFailure(t);
+    }
+  }
+
+  /**
+   * Parses a feed from a stream.
+   *
+   * @param aStream the stream to read the feed from
+   * @param anEncoding currently ignored: the stream is handed to the parser
+   *        as is
+   * @return the data collected so far by this reader
+   * @throws RSSFailure wrapping anything thrown while parsing
+   */
+  public RSSData parseInputStream(InputStream aStream, String anEncoding) throws RSSExc, RSSFailure {
+    // The delegate already reports every problem as an RSSFailure,
+    // so there is nothing to gain from wrapping it a second time.
+    return parseInputStream(aStream);
+  }
+
+  /**
+   * Fetches and parses the feed at the given url. The stream opened for the
+   * request is closed before this method returns.
+   *
+   * @param anUrl the location of the feed
+   * @return the data collected so far by this reader
+   * @throws RSSFailure wrapping anything thrown while fetching or parsing
+   */
+  public RSSData parseUrl(String anUrl) throws RSSExc, RSSFailure {
+    InputStream is = null;
+
+    try {
+      ResourceRequest request = ResourceRequestFactory.getResourceRequest( anUrl );
+      is = request.getInputStream();
+      return parseInputStream(is);
+    }
+    catch (RSSFailure f) {
+      // already in the form callers expect: do not wrap it again
+      throw f;
+    }
+    catch (Throwable t) {
+      throw new RSSFailure(t);
+    }
+    finally {
+      if (is != null) {
+        try {
+          is.close();
+        }
+        catch (java.io.IOException e) {
+          // the feed has been processed (or has already failed) at this point
+          logger.debug( "Could not close feed stream: " + e.toString() );
+        }
+      }
+    }
+  }
+
+  /**
+   * Fetches and parses the feed at the given url.
+   *
+   * @param anUrl the location of the feed
+   * @param anEncoding currently ignored: see
+   *        <code>parseInputStream(InputStream, String)</code>
+   * @return the data collected so far by this reader
+   * @throws RSSFailure wrapping anything thrown while fetching or parsing
+   */
+  public RSSData parseUrl(String anUrl, String anEncoding) throws RSSExc, RSSFailure {
+    return parseUrl(anUrl);
+  }
+
+}