========================================================= 

由于附件无法上传,这里直接贴出 AcquisitionSvcImpl.java 类的完整代码:
//----------------------------------------------------------------------------
package com.jeecms.cms.service; import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.HttpResponseException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import com.jeecms.cms.entity.assist.CmsAcquisition;
import com.jeecms.cms.entity.main.Content;
import com.jeecms.cms.manager.assist.CmsAcquisitionMng; @Service
public class AcquisitionSvcImpl implements AcquisitionSvc {
private Logger log = LoggerFactory.getLogger(AcquisitionSvcImpl.class); public boolean start(Integer id) {
CmsAcquisition acqu = cmsAcquisitionMng.findById(id);
if (acqu == null || acqu.getStatus() == CmsAcquisition.START) {
return false;
}
Thread thread = new AcquisitionThread(acqu);
thread.start();
return true;
} private CmsAcquisitionMng cmsAcquisitionMng; @Autowired
public void setCmsAcquisitionMng(CmsAcquisitionMng cmsAcquisitionMng) {
this.cmsAcquisitionMng = cmsAcquisitionMng;
} private class AcquisitionThread extends Thread {
private CmsAcquisition acqu; public AcquisitionThread(CmsAcquisition acqu) {
super(acqu.getClass().getName() + "#" + acqu.getId());
this.acqu = acqu;
} @Override
public void run() {
if (acqu == null) {
return;
}
acqu = cmsAcquisitionMng.start(acqu.getId());
String[] plans = acqu.getAllPlans();
HttpClient client = new DefaultHttpClient();
CharsetHandler handler = new CharsetHandler(acqu.getPageEncoding());
List<String> contentList;
String url;
int currNum = acqu.getCurrNum();
int currItem = acqu.getCurrItem();
Integer acquId = acqu.getId(); for (int i = plans.length - currNum; i >= ; i--)
{
url = plans[i]; contentList = getContentList(client, handler, url, acqu.getLinksetStart(), acqu.getLinksetEnd(), acqu.getLinkStart(), acqu.getLinkEnd()); String link; if(contentList!=null)
{
for (int j = contentList.size() - currItem; j >= ; j--)
{
if (cmsAcquisitionMng.isNeedBreak(acqu.getId(), plans.length - i, contentList.size() - j, contentList.size()))
{
client.getConnectionManager().shutdown();
log.info("Acquisition#{} breaked", acqu.getId());
return;
}
if (acqu.getPauseTime() > )
{
try
{
Thread.sleep(acqu.getPauseTime());
}
catch (InterruptedException e)
{
log.warn("", e);
}
}
link = contentList.get(j);
saveContent(client, handler, acquId, link, acqu.getTitleStart(), acqu.getTitleEnd(), acqu.getContentStart(), acqu.getContentEnd());
}
}
currItem = ;
}
client.getConnectionManager().shutdown();
cmsAcquisitionMng.end(acqu.getId());
log.info("Acquisition#{} complete", acqu.getId());
} private List<String> getContentList(HttpClient client,
CharsetHandler handler, String url, String linksetStart,
String linksetEnd, String linkStart, String linkEnd) { List<String> list = new ArrayList<String>(); try
{
HttpGet httpget = new HttpGet(new URI(url));
String html = client.execute(httpget, handler); Pattern pt = Pattern.compile(linksetStart.trim());
Matcher m = pt.matcher(html); if(m.find())
{
html = m.group();
} if(html!=null)
{
list = getUrlsList(html,linkStart);
} }
catch (Exception e)
{
log.warn(null, e);
}
return list;
} /**
* 得到地址集
*
* @param html
* @param linkStart
* @return
*/
private List<String> getUrlsList(String html,String linkStart)
{
List<String> list = new ArrayList<String>(); Pattern pt = Pattern.compile(linkStart); Matcher m = pt.matcher(html); while(m.find())
{
String link = m.group(); if(null!=link && !"".equals(link))
{
//System.out.println("url : " + link);
list.add(link);
}
}
return list;
} private Content saveContent(HttpClient client, CharsetHandler handler,
Integer acquId, String url, String titleStart, String titleEnd,
String contentStart, String contentEnd) { try { HttpGet httpget = new HttpGet(new URI(url));
String html = client.execute(httpget, handler); String title = "";
Pattern pt = Pattern.compile(titleStart.trim());
Matcher mt = pt.matcher(html); if (mt.find())
{
title = mt.group();
//System.out.println("title : " + title);
} String txt = "";
pt = Pattern.compile(contentStart.trim());
mt = pt.matcher(html);
if(mt.find()){
txt = mt.group();
//System.out.println("txt : " + txt);
} return cmsAcquisitionMng.saveContent(title, txt, acquId); }
catch (Exception e)
{
log.warn(null, e);
e.printStackTrace();
return null;
}
}
} private class CharsetHandler implements ResponseHandler<String> {
private String charset; public CharsetHandler(String charset) {
this.charset = charset;
} public String handleResponse(HttpResponse response)
throws ClientProtocolException, IOException {
StatusLine statusLine = response.getStatusLine();
if (statusLine.getStatusCode() >= ) {
throw new HttpResponseException(statusLine.getStatusCode(),
statusLine.getReasonPhrase());
}
HttpEntity entity = response.getEntity();
if (entity != null) {
if (!StringUtils.isBlank(charset)) {
return EntityUtils.toString(entity, charset);
} else {
return EntityUtils.toString(entity);
}
} else {
return null;
}
}
}
}
//--------------------------------------------------------------------------------

1:用上面的 AcquisitionSvcImpl.java 替换原工程 com.jeecms.cms.service 包下的同名文件。

2:编译工程即可

3:登录后台,配置相关采集规则,参数示例如下:

==================================== 
*采集名称: 韩寒博客

*页面编码: UTF-8

动态地址: http://blog.sina.com.cn/s/articlelist_1191258123_0_[page].html

页码 从   1  到:  2

内容地址集:   <!-- 列表 START -->.*?<!-- 列表END -->

内容地址: target="_blank" href="(.*?)">(.*?)</a></span>

标题:         <title>(.*?)_韩寒_新浪博客</title>

内容:         <!-- 正文开始 -->(.*?)<!-- 正文结束 -->

最新文章

  1. Visual Studio 2015 正式版 官方下载地址
  2. CIB Training Scripts For TPC-H Benchmark
  3. c语言:printf系列的函数
  4. 如何检查oracle的归档空间是否满了?
  5. with check option(视图 )
  6. Web Scale IT 与 6 种 DevOps 工具
  7. 简单粗暴地理解 JavaScript 原型链
  8. java 计算器SWT/RAP(版本3)键盘鼠标兼容
  9. phpcms V9 联动菜单的调用
  10. APICloud |UIChatTools 模块demo
  11. 改变FileUpload文件上传控件的显示方式,选择文件后自动上传
  12. C语言数据结构基础学习笔记——基础线性表
  13. Python小项目四:实现简单的web服务器
  14. 题解——loj6277 数列分块入门1(分块)
  15. FaceAlignment blog
  16. 【数据分析】Superset 之四 直接安装
  17. sql server 2008数据库 降为 sql server 2005数据库 最终方案总结
  18. 算法导论第九章 第K顺序统计量
  19. python 序列结构-列表,元组,字典,字符串,集合
  20. 使用py-faster-rcnn训练VOC2007数据集时遇到问题

热门文章

  1. ie中onclick问题
  2. 深入理解Java虚拟机(类文件结构)
  3. 【9.14NOIP模拟pj】wtaxi 题解
  4. 【JZOJ3237】间谍派遣
  5. server端并发聊天
  6. LinkedHashMap笔记
  7. NSLayoutConstraint-代码实现自动布局的函数用法说明
  8. kafka集群搭建文档
  9. 图片转成base64 跨域等安全限制及解决方案
  10. vue后台管理项目中菜单栏切换的三种方法