Showing posts with label feed. Show all posts
Showing posts with label feed. Show all posts

Thursday, June 23, 2011

RSS Parser (SAX)

RSS (Really Simple Syndication)
RSS is a way to publish frequently changing content such as blog posts, news updates and stock quotes. An RSS document, which is called a “feed,” “web feed,” or “channel,” contains either a summary of content from an associated web site or the full text. RSS formats are specified using XML, a generic specification for the creation of data formats.
I have attached a simple SAX parser for RSS. Please let me know if there is any flaw in the attached code. This code is provided for learning purposes, with less focus on coding standards and its efficiency. You are free to use and modify it.

Code
RssParser.java

package com.vaani.rss.parser;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Properties;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Minimal SAX-based reader for RSS feeds.
 *
 * <p>Usage: construct with the feed URL, call {@link #parse()}, then read the
 * result with {@link #getFeed()}. The instance is its own SAX handler; element
 * names are matched case-insensitively against the qualified name.</p>
 *
 * <p>The parser keeps mutable per-document state in its fields, so a single
 * instance must not be used for two parses at the same time.</p>
 */
public class RssParser extends DefaultHandler
{
    /*
     * Parser features switched off before parsing so that a remote feed cannot
     * make the parser fetch external entities or DTDs (XXE).
     */
    private static final String[] EXTERNAL_ENTITY_FEATURES = {
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities",
        "http://apache.org/xml/features/nonvalidating/load-external-dtd"
    };

    private String        urlString;
    private RssFeed       rssFeed;
    private StringBuilder text;            // character data of the element being read
    private Item          item;            // non-null while inside an <item>
    private boolean       imgStatus;       // true while inside an <image>
    private boolean       textInputStatus; // true while inside a <textInput>

    /**
     * Creates a parser for the given feed location.
     *
     * @param url the feed URL; it is only interpreted when {@link #parse()} runs
     */
    public RssParser(String url)
    {
        this.urlString = url;
        this.text = new StringBuilder();
    }

    /**
     * Downloads and parses the feed.
     *
     * <p>Any failure (malformed URL, I/O error, parser configuration error,
     * malformed XML) is printed to standard output together with its stack
     * trace and is not rethrown. Whatever was parsed before the failure
     * remains available through {@link #getFeed()}.</p>
     */
    public void parse()
    {
        InputStream urlInputStream = null;

        // Start from a clean slate so that a previous (possibly aborted) parse
        // cannot leak a stale item, flag, text fragment or feed into this one.
        resetState();

        try
        {
            URL url = new URL(this.urlString);
            _setProxy(); // Set the proxy if needed
            urlInputStream = url.openConnection().getInputStream();

            SAXParserFactory spf = SAXParserFactory.newInstance();
            disableExternalEntities(spf);
            SAXParser sp = spf.newSAXParser();
            sp.parse(urlInputStream, this);
        }

        /*
         * Exceptions need to be handled
         * MalformedURLException
         * ParserConfigurationException
         * IOException
         * SAXException
         */

        catch (Exception e)
        {
            System.out.println("Exception: " + e);
            e.printStackTrace();
        }
        finally
        {
            try
            {
                if (urlInputStream != null) urlInputStream.close();
            }
            catch (IOException ignored)
            {
                // Best-effort close: the stream has already been consumed or
                // has already failed, so there is nothing useful left to do.
            }
        }
    }

    /**
     * Returns the feed built by the most recent {@link #parse()} call.
     *
     * @return the feed, or {@code null} if no {@code <channel>} element has
     *         been seen (for example because parsing failed early)
     */
    public RssFeed getFeed()
    {
        return (this.rssFeed);
    }

    /**
     * SAX callback: tracks which container element (channel, item, image,
     * textInput) the parser is currently inside.
     */
    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes attributes)
    {
        // Text collected so far belongs to the parent element; drop it so it
        // is not glued onto the value of the child that is starting now.
        this.text.setLength(0);

        if (qName.equalsIgnoreCase("channel"))
            this.rssFeed = new RssFeed();
        else if (this.rssFeed != null)
        {
            if (qName.equalsIgnoreCase("item"))
            {
                this.item = new Item();
                this.rssFeed.addItem(this.item);
            }
            else if (qName.equalsIgnoreCase("image"))
                this.imgStatus = true;
            else if (qName.equalsIgnoreCase("textInput"))
                this.textInputStatus = true;
        }
    }

    /**
     * SAX callback: stores the text of the element that just ended into the
     * matching field of the current item or of the feed.
     */
    @Override
    public void endElement(String uri, String localName, String qName)
    {
        if (this.rssFeed != null)
            storeElement(qName, this.text.toString().trim());

        // Always clear the buffer, even outside a channel; otherwise it would
        // grow without bound on documents that never open a <channel>.
        this.text.setLength(0);
    }

    /** SAX callback: accumulates character data of the current element. */
    @Override
    public void characters(char[] ch, int start, int length)
    {
        this.text.append(ch, start, length);
    }

    /**
     * Sets the JVM-wide {@code proxyHost} and {@code proxyPort} system
     * properties.
     *
     * <p>The values are placeholders and are meant to be replaced with a real
     * proxy address and port. NOTE(review): {@link #parse()} calls this method
     * unconditionally; with the placeholders left in place the connection
     * presumably cannot succeed, so either fill them in or remove that call
     * when no proxy is needed.</p>
     *
     * @throws IOException declared for callers; the current body does not throw it
     */
    public static void _setProxy()
    throws IOException
    {
        System.setProperty("proxyHost", "<Proxy IP Address>");
        System.setProperty("proxyPort", "<Proxy Port Number>");
    }

    /** Clears all per-document state before a new parse begins. */
    private void resetState()
    {
        this.rssFeed = null;
        this.item = null;
        this.imgStatus = false;
        this.textInputStatus = false;
        this.text.setLength(0);
    }

    /**
     * Turns off external entity and external DTD resolution on the factory.
     * Best effort: a feature the underlying parser does not know is skipped so
     * that parsing still works on that implementation.
     */
    private static void disableExternalEntities(SAXParserFactory factory)
    {
        for (int i = 0; i < EXTERNAL_ENTITY_FEATURES.length; i++)
        {
            try
            {
                factory.setFeature(EXTERNAL_ENTITY_FEATURES[i], false);
            }
            catch (javax.xml.parsers.ParserConfigurationException ignored)
            {
                // Feature not available on this parser; continue with the rest.
            }
            catch (org.xml.sax.SAXException ignored)
            {
                // Feature not recognized/supported; continue with the rest.
            }
        }
    }

    /** Routes the trimmed text of a finished element to the field it belongs to. */
    private void storeElement(String qName, String value)
    {
        if (qName.equalsIgnoreCase("item"))
            this.item = null;

        else if (qName.equalsIgnoreCase("image"))
            this.imgStatus = false;

        else if (qName.equalsIgnoreCase("textInput"))
            this.textInputStatus = false;

        else if (this.textInputStatus)
        {
            // title/link/description inside <textInput> describe the input
            // form, not the channel; storing them would overwrite channel data.
            return;
        }

        else if (qName.equalsIgnoreCase("title"))
        {
            if (this.item != null) this.item.title = value;
            else if (this.imgStatus) this.rssFeed.imageTitle = value;
            else this.rssFeed.title = value;
        }

        else if (qName.equalsIgnoreCase("link"))
        {
            if (this.item != null) this.item.link = value;
            else if (this.imgStatus) this.rssFeed.imageLink = value;
            else this.rssFeed.link = value;
        }

        else if (qName.equalsIgnoreCase("description"))
        {
            if (this.item != null) this.item.description = value;
            else this.rssFeed.description = value;
        }

        else if (qName.equalsIgnoreCase("url") && this.imgStatus)
            this.rssFeed.imageUrl = value;

        else if (qName.equalsIgnoreCase("language"))
            this.rssFeed.language = value;

        else if (qName.equalsIgnoreCase("generator"))
            this.rssFeed.generator = value;

        else if (qName.equalsIgnoreCase("copyright"))
            this.rssFeed.copyright = value;

        else if (qName.equalsIgnoreCase("pubDate") && (this.item != null))
            this.item.pubDate = value;

        else if (qName.equalsIgnoreCase("category") && (this.item != null))
            this.rssFeed.addItem(value, this.item);
    }

    /**
     * Channel-level data of a feed plus its items.
     *
     * <p>{@link #items} and {@link #category} are created lazily and stay
     * {@code null} until the first item / first categorized item is added, so
     * callers must null-check them.</p>
     */
    public static class RssFeed
    {
        public  String title;
        public  String description;
        public  String link;
        public  String language;
        public  String generator;
        public  String copyright;
        public  String imageUrl;
        public  String imageTitle;
        public  String imageLink;

        /** All items in document order; {@code null} while the feed has no items. */
        public ArrayList <Item> items;
        /** Items grouped by category text; {@code null} while no item has a category. */
        public HashMap <String, ArrayList <Item>> category;

        /**
         * Appends an item to the feed.
         *
         * @param item the item to add
         */
        public void addItem(Item item)
        {
            if (this.items == null)
                this.items = new ArrayList<Item>();
            this.items.add(item);
        }

        /**
         * Registers an item under a category, creating the category on first use.
         *
         * @param category the category text
         * @param item     the item belonging to that category
         */
        public void addItem(String category, Item item)
        {
            if (this.category == null)
                this.category = new HashMap<String, ArrayList<Item>>();

            ArrayList<Item> bucket = this.category.get(category);
            if (bucket == null)
            {
                bucket = new ArrayList<Item>();
                this.category.put(category, bucket);
            }
            bucket.add(item);
        }
    }
}

Item.java

package com.vaani.rss.parser;

/**
 * A single entry ({@code <item>}) of an RSS feed.
 *
 * <p>Plain data holder: every field is public and stays {@code null} until
 * it is assigned.</p>
 */
public  class Item
{
    public  String title;
    public  String description;
    public  String link;
    public  String pubDate;

    /**
     * Returns a two-line summary: {@code "<title>: <pubDate>"} on the first
     * line and the description on the second.
     *
     * @return the summary text; unset fields appear as {@code "null"}
     */
    @Override
    public String toString()
    {
        // "\n" (newline), not the letter "n": the date and the description
        // are meant to be on separate lines.
        return (this.title + ": " +
             this.pubDate + "\n" + this.description);
    }
}

RssParserDemo.java - Ready with demo

package com.vaani.rss.main;

import java.util.ArrayList;

import com.vaani.rss.parser.RssParser;
import com.vaani.rss.parser.RssParser.RssFeed;
import com.vaani.rss.parser.Item;



/**
 * Command-line demo for {@link RssParser}: parses one feed and prints its
 * categories (with item counts) followed by the titles of all items.
 */
public class RssParserDemo {

    /**
     * Runs the demo.
     *
     * @param args optional; {@code args[0]} is used as the feed URL when
     *             present, otherwise the built-in placeholder is used
     */
    public static void main(String[] args){
        String feedUrl = (args.length > 0) ? args[0] : "<some rss feed>";
        RssParser rp = new RssParser(feedUrl);
        rp.parse();
        RssFeed feed = rp.getFeed();

        // getFeed() returns null when parsing failed or no <channel> was found;
        // parse() has already reported the underlying exception, if any.
        if (feed == null)
        {
            System.out.println("No feed could be read from: " + feedUrl);
            return;
        }

        // Listing all categories & the no. of elements in each category
        if (feed.category != null)
        {
            System.out.println("Category List: ");
            for (String category : feed.category.keySet())
            {
                System.out.println(category
                        + ": "
                        + feed.category.get(category).size());
            }
        }

        // Listing all items in the feed (items stays null for a feed without items)
        if (feed.items != null)
        {
            for (Item item : feed.items)
                System.out.println(item.title);
        }
    }
}

Friday, April 15, 2011

Reading/Parsing RSS feed using ROME

ROME is an open source tool to parse, generate and publish RSS and Atom feeds. Using ROME you can parse the available RSS and Atom feeds without bothering about the format and version of the feed. The core library depends on the JDOM XML parser.
Atom is another kind of feed, along similar lines to RSS, but it differs in some aspects such as protocol and payloads.
RSS is a method to share and publish content. The content may be anything from news to any small piece of information. The main component is XML. Using XML you can share your content on the web. At the same time you are free to get what you like from others.

Why use Rome instead of other available readers

The Rome project started with the motivation of ‘ESCAPE’ where each letter stands for:
E – Easy to use. Just give a URL and forget about its type and version; you will be given an output in the format you like.
S – Simple. Simple structure. The complications are all hidden from developers.
C – Complete. It handles all the versions of RSS and Atom feeds.
A – Abstract. It provides abstraction over various syndication specifications.
P – Powerful. Don’t worry about the format let Rome handle it.
E – Extensible. It needs a simple pluggable architecture to provide future extension of formats.

Dependency

Following are few dependencies:
J2SE 1.4+, JDOM 1.0, Jar files (rome-0.8.jar, purl-org-content-0.3.jar, jdom.jar)

Using Rome to read a Syndication Feed

Assuming you have all the required jar files, we will start with reading the RSS feed. ROME represents syndication feeds (RSS and Atom) as instances of the com.sun.syndication.feed.synd.SyndFeed interface.
ROME includes parsers to process syndication feeds into SyndFeed instances. The SyndFeedInput class handles the parsers using the correct one based on the syndication feed being processed. The developer does not need to worry about selecting the right parser for a syndication feed, the SyndFeedInput will take care of it by peeking at the syndication feed structure. All it takes to read a syndication feed using ROME are the following 2 lines of code:
SyndFeedInput input = new SyndFeedInput();
SyndFeed feed = input.build (new XmlReader (feedUrl));
Now it’s simple to get the details of Feed. You have the object.

The sample code is as follows.
import java.net.URL;
import java.util.Iterator;
 
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;
 
/**
 * Reads a syndication feed with ROME and prints the feed title followed by
 * the title of every entry.
 *
 * @author Hanumant Shikhare
 */
public class Reader {

  /**
   * Fetches the feed, prints its title and entry titles, and closes the reader.
   *
   * @param args not used
   * @throws Exception if the feed cannot be fetched or parsed
   */
  public static void main(String[] args) throws Exception {

    URL url  = new URL("http://viralpatel.net/blogs/feed");
    XmlReader reader = null;

    try {

      reader = new XmlReader(url);
      SyndFeed feed = new SyndFeedInput().build(reader);
      // The label says "Feed Title", so print the title rather than the author.
      System.out.println("Feed Title: " + feed.getTitle());

      for (Iterator i = feed.getEntries().iterator(); i.hasNext();) {
        SyndEntry entry = (SyndEntry) i.next();
        System.out.println(entry.getTitle());
      }
    } finally {
      // Close the reader whether or not building the feed succeeded.
      if (reader != null)
        reader.close();
    }
  }
}

Understanding the Program

Initialize the URL object with the RSS or Atom feed URL. Then create an XmlReader object, which takes the URL object as its constructor argument. Finally, obtain the SyndFeed object by calling the build(reader) method of SyndFeedInput, which takes the XmlReader object as its argument.

References

https://rome.dev.java.net/
http://www.intertwingly.net/wiki/pie/Rss20AndAtom10Compared
http://www.rss-specifications.com

Chitika