`

博客导入及新闻订阅的 Java 实现(解析 RSS)

 
阅读更多

为了把自己在其他网站上的博客或新闻导入到当前站点,需要实现 RSS 订阅。RSS 订阅主要就是解析 XML,本身很简单;但用户往往并不知道自己博客的 RSS 地址,所以要先根据博客地址抓取网页并解析出 RSS 地址,然后再解析 XML,并将内容导入自己的站点。在此不贴图了,直接发代码:
package com.jyeba.core.rss;

import java.util.ArrayList;
import java.util.List;

/**
 * Plain mutable holder for RSS data: a title, a link, a description, a date
 * string and a list of nested {@code RssBean} entries.
 */
public class RssBean {

    /** Channel title. */
    private String title;

    /** Channel link. */
    private String link;

    /** Channel description. */
    private String description;

    /** Date text, stored exactly as it was supplied. */
    private String date;

    /** Nested entries; starts out as an empty, mutable list. */
    private List<RssBean> items = new ArrayList<RssBean>();

    // ---- title ----

    /** @return the title, or {@code null} if none has been set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    // ---- link ----

    /** @return the link, or {@code null} if none has been set */
    public String getLink() {
        return this.link;
    }

    /** @param link the link to store */
    public void setLink(String link) {
        this.link = link;
    }

    // ---- description ----

    /** @return the description, or {@code null} if none has been set */
    public String getDescription() {
        return this.description;
    }

    /** @param description the description to store */
    public void setDescription(String description) {
        this.description = description;
    }

    // ---- date ----

    /** @return the date text, or {@code null} if none has been set */
    public String getDate() {
        return this.date;
    }

    /** @param date the date text to store */
    public void setDate(String date) {
        this.date = date;
    }

    // ---- items ----

    /** @return the live list of nested entries (not a copy) */
    public List<RssBean> getItems() {
        return this.items;
    }

    /** @param items the list that replaces the current nested entries */
    public void setItems(List<RssBean> items) {
        this.items = items;
    }

}



package com.jyeba.core.rss;


/**
 * Loads an RSS feed with dom4j and converts it into {@link RssBean} objects.
 *
 * <p>An instance remembers the most recently parsed {@link Document} in a
 * plain, unsynchronized field. The single-argument query methods
 * ({@link #getXmlInfo(String)}, {@link #getFirstNodeTitle(String)}) operate on
 * that document, so {@link #parse(URL)}, {@link #parseUrl(URL)} or
 * {@link #setDocument(Document)} must have been called first.
 *
 * @author hanfei
 */
public class Rss {
    /** XPath of the channel title (absolute, evaluated from the document root). */
    public static String RSS_DOM_ROOT_TITLE = "//channel/title";

    /** XPath of the channel link (absolute, evaluated from the document root). */
    public static String RSS_DOM_ROOT_LINK = "//channel/link";

    /** XPath of the channel description (absolute, evaluated from the document root). */
    public static String RSS_DOM_ROOT_DESCRIPTION = "//channel/description";

    /** XPath selecting every item element of a channel. */
    public static String RSS_DOM_CHILRDEN_ROOT = "//channel/item";

    /** Relative XPath of an item's title. */
    public static String RSS_DOM_CHILRDEN_ROOT_TITLE = "title";

    /** Relative XPath of an item's link. */
    public static String RSS_DOM_CHILRDEN_ROOT_LINK = "link";

    /** Relative XPath of an item's publication date. */
    public static String RSS_DOM_CHILRDEN_ROOT_PUBDATE = "pubDate";

    /** Relative XPath of an item's description. */
    public static String RSS_DOM_CHILRDEN_ROOT_DESCRIPTION = "description";

    /** Not referenced anywhere in this class. */
    public static String DATABASES_PATH = "#rss_databases.mdb";

    /** The document most recently parsed or injected; {@code null} until then. */
    private Document document;

    /**
     * Creates a reader that refuses to resolve external entities.
     *
     * <p>Feed URLs come from end users, so the XML must be treated as
     * untrusted; leaving external entities enabled would allow XXE attacks.
     * The method fails closed: if the parser cannot be hardened, nothing is
     * parsed.
     *
     * @return a new hardened reader
     * @throws DocumentException if the underlying parser rejects the features
     */
    private static SAXReader newReader() throws DocumentException {
        SAXReader reader = new SAXReader();
        try {
            reader.setFeature("http://xml.org/sax/features/external-general-entities", false);
            reader.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        } catch (org.xml.sax.SAXException e) {
            throw new DocumentException("Unable to disable external XML entities", e);
        }
        return reader;
    }

    /**
     * Returns the text of a node, tolerating elements that the feed omits.
     *
     * @param node a node returned by an XPath lookup, possibly {@code null}
     * @return the node text, or {@code null} when the node is absent
     */
    private static String textOf(Node node) {
        return node == null ? null : node.getText();
    }

    /**
     * Parses the XML at {@code url} and remembers it as the current document.
     *
     * @param url location of the XML to read
     * @return the parsed document
     * @throws DocumentException if the content cannot be read or parsed
     */
    public Document parse(URL url) throws DocumentException {
        document = newReader().read(url);
        return document;
    }

    /**
     * Parses the XML at {@code url}, remembers it as the current document and
     * prints the name of its root element to standard output.
     *
     * @param url location of the XML to read
     * @throws DocumentException if the content cannot be read or parsed
     */
    public void parseUrl(URL url) throws DocumentException {
        document = newReader().read(url);
        System.out.print("文档全文" + document.getRootElement().getName());
    }

    /**
     * Parses {@code url} and evaluates an XPath expression against the result.
     *
     * @param path XPath expression to evaluate
     * @param url location of the XML to read
     * @return the matching nodes; an empty list if parsing failed (the failure
     *         is printed to standard error)
     */
    public List getXmlInfo(String path, URL url) {
        try {
            return parse(url).selectNodes(path);
        } catch (DocumentException e) {
            e.printStackTrace();
            return new ArrayList();
        }
    }

    /**
     * Evaluates an XPath expression against the current document.
     *
     * @param path XPath expression to evaluate
     * @return the matching nodes
     * @throws IllegalStateException if no document has been parsed or set yet
     * @throws DocumentException declared for compatibility with existing callers
     */
    public List getXmlInfo(String path) throws DocumentException {
        if (document == null) {
            // Previously surfaced as an unexplained NullPointerException.
            throw new IllegalStateException(
                    "No document loaded; call parse(URL), parseUrl(URL) or setDocument(Document) first");
        }
        return document.selectNodes(path);
    }

    /**
     * Parses {@code url} and returns the first node matching {@code path}.
     *
     * @param path XPath expression to evaluate
     * @param url location of the XML to read
     * @return the first match
     * @throws IndexOutOfBoundsException if nothing matches or parsing failed
     * @throws ClassCastException if the first match is not an element
     */
    public Element getFirstNodeTitle(String path, URL url) {
        return (Element) getXmlInfo(path, url).get(0);
    }

    /**
     * Returns the first node of the current document matching {@code path}.
     *
     * @param path XPath expression to evaluate
     * @return the first match
     * @throws IllegalStateException if no document has been parsed or set yet
     * @throws IndexOutOfBoundsException if nothing matches
     * @throws ClassCastException if the first match is not an element
     * @throws DocumentException declared for compatibility with existing callers
     */
    public Element getFirstNodeTitle(String path) throws DocumentException {
        return (Element) getXmlInfo(path).get(0);
    }

    /**
     * Fetches the RSS feed at {@code uri} and converts it into a bean.
     *
     * <p>The returned bean carries the channel title, link and description,
     * and its {@link RssBean#getItems() items} hold one bean per feed item.
     * Any element the feed leaves out (for example {@code pubDate}) yields a
     * {@code null} field instead of aborting the whole import.
     *
     * @param uri address of the RSS document
     * @return the populated bean, or {@code null} if the address is malformed
     *         or the feed cannot be read or parsed (the cause is printed to
     *         standard error)
     */
    public RssBean getRssBean(String uri) {
        RssBean rss = new RssBean();
        try {
            parseUrl(new URL(uri));

            for (Object channelNode : getXmlInfo("//channel")) {
                Element channel = (Element) channelNode;
                Node title = channel.selectSingleNode(RSS_DOM_ROOT_TITLE);
                Node link = channel.selectSingleNode(RSS_DOM_ROOT_LINK);
                Node description = channel.selectSingleNode(RSS_DOM_ROOT_DESCRIPTION);
                if (description != null) {
                    // Console output kept as it was (the description is printed twice).
                    System.out.println(description.getText() + description.getText());
                }
                rss.setTitle(textOf(title));
                rss.setDescription(textOf(description));
                rss.setLink(textOf(link));
            }
            System.out.println("----------------------");

            // Collect the feed items.
            for (Object itemNode : getXmlInfo(RSS_DOM_CHILRDEN_ROOT)) {
                Element item = (Element) itemNode;
                RssBean entry = new RssBean();
                entry.setTitle(textOf(item.selectSingleNode(RSS_DOM_CHILRDEN_ROOT_TITLE)));
                entry.setDate(textOf(item.selectSingleNode(RSS_DOM_CHILRDEN_ROOT_PUBDATE)));
                entry.setDescription(
                        textOf(item.selectSingleNode(RSS_DOM_CHILRDEN_ROOT_DESCRIPTION)));
                entry.setLink(textOf(item.selectSingleNode(RSS_DOM_CHILRDEN_ROOT_LINK)));
                rss.getItems().add(entry);
            }
            return rss;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Resolves {@code url} to an RSS feed and loads it.
     *
     * <p>{@link RssUtil#isHtmlOrXml(String)} decides how to treat the address:
     * {@code "xml"} means it is itself a feed, {@code "eorr"} means no feed was
     * found, and any other value is taken to be the feed address discovered in
     * the HTML page.
     *
     * @param url address of a feed or of an HTML page advertising one
     * @return the loaded feed, or {@code null} if the page offers no feed or
     *         the feed could not be loaded
     * @throws IOException if the address cannot be fetched for inspection
     */
    public RssBean getRSSrsult(String url) throws IOException {
        String result = new RssUtil().isHtmlOrXml(url);

        if ("eorr".equals(result)) {
            System.out.print("不支持rss");
            return null;
        }

        String feedUrl = "xml".equals(result) ? url : result;
        RssBean rs = getRssBean(feedUrl);
        if (rs != null) {
            // getRssBean returns null on failure; do not dereference it blindly.
            System.out.print(rs.getTitle());
        }
        return rs;
    }

    /**
     * Manual smoke test: loads the feed behind a sample blog page and prints
     * its title.
     *
     * @param args ignored
     * @throws IOException if the sample page cannot be fetched
     */
    public static void main(String args[]) throws IOException {
        String url = "http://blog.sina.com.cn/s/articlelist_1914306010_1_1.html";
        RssBean rs = new Rss().getRSSrsult(url);
        if (rs != null) {
            System.out.println(rs.getTitle());
        }
    }

    /**
     * Replaces the current document.
     *
     * @param document the document subsequent queries should run against
     */
    public void setDocument(Document document) {
        this.document = document;
    }

    /**
     * @return the current document, or {@code null} if none has been loaded
     */
    public Document getDocument() {
        return document;
    }
}

package com.jyeba.core.rss;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Inspects a web address with jsoup to find out whether it is an RSS feed or
 * an HTML page that advertises one.
 */
public class RssUtil {
    /** The page fetched by the most recent {@link #isHtmlOrXml(String)} call. */
    private Document doc;

    /**
     * Fetches {@code url} and classifies what was found there.
     *
     * <p>The request is sent with a browser-like user agent and a 6000 ms
     * timeout. No extra query parameters or cookies are added, so the site
     * sees exactly the address that was passed in.
     *
     * @param url address of a feed or of an HTML page
     * @return {@code "xml"} if the content contains an {@code rss} element;
     *         {@code "eorr"} if it is a page without an
     *         {@code application/rss+xml} link; otherwise the address of the
     *         first advertised feed, resolved against the page address when
     *         the link is relative
     * @throws IOException if the address cannot be fetched
     */
    public String isHtmlOrXml(String url) throws IOException {
        doc = Jsoup.connect(url)
                .userAgent("Mozilla")
                .timeout(6000)
                .get();

        if (!doc.select("rss").isEmpty()) {
            return "xml";
        }

        Elements rssEl = doc.select("link[type=application/rss+xml]");
        if (rssEl.isEmpty()) {
            System.out.println("该地址不支持rss");
            return "eorr";
        }

        // Only the first advertised feed is used, even if the page lists several.
        Element firstLink = rssEl.get(0);
        // Pages often use relative hrefs; callers need an absolute address.
        String href = firstLink.absUrl("href");
        if (href.length() == 0) {
            // absUrl yields "" when it cannot build a URL; keep the raw value then.
            href = firstLink.attr("href");
        }
        System.out.println("this a html   RSS地址:" + href);
        return href;
    }

    /**
     * Manual smoke test: classifies a sample blog address.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        RssUtil r = new RssUtil();
        try {
            r.isHtmlOrXml("http://hi.baidu.com/ybhanxiao/home");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
 

0
0
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics