copider:模仿 scrapy 的一些写法。当然,我这个是单进程的,不是异步的。

1.目录 copider/copider.py

#coding=utf-8

'''
Created on 2015-10-08 @author: snt1

A tiny single-process, synchronous imitation of a few scrapy idioms:
``Spider`` fetches one page and wraps it as a response object, and
``Request`` fetches a page and hands the response to a callback.
'''
import urllib2
import lxml.html
import StringIO  # not used by the code below; kept so the module's imports are unchanged


class Spider(object):
    '''Fetch a single URL on construction and expose it as a response.

    Attributes set by ``__init__``:
        URL      -- the url that was requested
        META     -- the caller-supplied ``meta`` value (``None`` by default)
        TEXTMARK -- the raw response body, or ``None`` when the fetch failed
        SEL      -- the lxml tree parsed from ``TEXTMARK``, or ``None`` when
                    there was nothing to parse
    '''

    def __init__(self, url, meta=None):
        self.URL = url
        self.META = meta
        self.TEXTMARK = self.get(url)
        self.SEL = self.selector(doc=self.TEXTMARK)

    def get(self, url):
        '''Download ``url`` and return the response body.

        Any failure is printed and ``None`` is returned so the caller can
        carry on with the next page.
        '''
        try:
            req = urllib2.Request(url)
            req.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36')
            conn = urllib2.urlopen(req, timeout=15)
            try:
                return conn.read()
            finally:
                # Always release the connection, even if read() fails.
                conn.close()
        except Exception as e:
            # Best-effort crawl: report the problem and move on. The original
            # fell through to an unbound variable here and crashed instead.
            print("%s ...next.." % (e,))
            return None

    # Return the raw html.
    @property
    def html(self):
        return self.TEXTMARK

    @property
    def url(self):
        return self.URL

    @property
    def meta(self):
        return self.META

    def selector(self, doc=None):
        '''Parse ``doc`` (or, when it is empty, the fetched page) with lxml.

        Returns ``None`` when there is no markup to parse, because
        ``lxml.html.fromstring`` rejects empty documents.
        '''
        markup = doc if doc else self.TEXTMARK
        if not markup or not markup.strip():
            return None
        return lxml.html.fromstring(markup)

    def xpath(self, rule):
        '''Evaluate the XPath expression ``rule`` against the parsed page.

        When the expression selects elements, a list with each element's
        ``attrib`` mapping is returned. When the results have no ``attrib``
        (e.g. ``@href`` or ``text()`` strings) or are not iterable, the raw
        XPath result is returned unchanged. A page that could not be
        fetched or parsed yields an empty list.
        '''
        if self.SEL is None:
            return []
        found = self.SEL.xpath(rule)
        try:
            return [ele.attrib for ele in found]
        except (AttributeError, TypeError):
            return found


def Request(url, func, **meta):
    '''Fetch ``url`` and call ``func`` with the resulting ``Spider``.

    An optional ``meta=...`` keyword is stored on the response and is
    available to the callback as ``response.meta``.
    '''
    # .get() keeps unrelated keyword arguments from raising KeyError.
    response = Spider(url, meta.get('meta'))
    func(response)

2.copider/aero.py

#coding=utf-8

'''
Created on 2015-10-08 @author: snt1

Example crawler built on copider: walks the category pages of a shop,
follows every product-family link and lists the product links per page.
'''
import re
import time
from copider import Spider, Request


class AeroCopider(object):
    '''Crawl category -> family listing -> product links, starting on construction.'''

    name = "aero"
    storeId = "554b14c97b010cc731e81b35"  # site ID
    allowed_domains = ["www.xxxx.com"]

    root_url = 'http://www.xxxx.com'
    category_url = root_url + '/category/index.jsp?numResultsPerPage=100&categoryId=%s'
    cap_category_url = root_url + '/family/index.jsp?categoryId=%s&page=%d&numResultsPerPage=100'
    # NOTE(review): both keys are the empty string, so the dict collapses to a
    # single entry ('' -> 'Guys'). The real category ids appear to have been
    # blanked out; fill them in before running.
    url_dicts = {'':'Girls', '':'Guys'}

    # Must match the numResultsPerPage value baked into the URL templates above.
    items_per_page = 100

    # Compiled once instead of on every loop iteration.
    _family_link = re.compile(r'family.jsp\?categoryId=(\d+)')
    _cp_suffix = re.compile('&cp=.*?$')

    def __init__(self):
        self.start_urls()

    def start_urls(self):
        '''Visit each top-level category and follow its product-family links.'''
        for fid in self.url_dicts:
            response = Spider(self.category_url % fid)
            hrefs = response.xpath('//*[@id="sidebar-left"]/div/dl[2]/dd//dt/a/@href')
            labels = response.xpath('//*[@id="sidebar-left"]/div/dl[2]/dd//dt/a/text()')

            for href, label in zip(hrefs, labels):
                matched = self._family_link.search(href)
                if matched is None:
                    continue
                # Drop the trailing "&cp=..." parameter. The original passed
                # regex flags as the third argument of sub(), which is the
                # replacement *count*, not flags.
                chd_url = self._cp_suffix.sub('', self.root_url + href)
                # Take the id from this link's own match. The original looked
                # it up in a matches-only list using the index over *all*
                # links, which breaks as soon as one link is skipped.
                cateid = matched.group(1)
                print(u'产品分类链接:%s -> %s' % (label, chd_url))
                Request(chd_url, self.parse_page, meta={'cateid': cateid})
            print("")

    @staticmethod
    def _page_count(total_items, per_page):
        '''Return the number of listing pages for total_items (at least 1).'''
        # Ceiling division; the original dropped the last, partially filled
        # page for totals between per_page + 1 and 2 * per_page - 1.
        return max(1, -(-total_items // per_page))

    def parse_page(self, response):
        '''Read the item count of a family listing and request each of its pages.'''
        counts = response.xpath('//*[@id="main-wrap"]//li[@class="count"]/span/text()')
        if not counts:
            # No item count on the page (layout change or failed fetch): skip
            # this category instead of aborting the whole crawl.
            print(u'未找到产品总数,跳过: %s' % response.url)
            return

        total_page = self._page_count(int(counts[0]), self.items_per_page)
        print(u'产品总分页数: %s -> %s' % (total_page, response.url))

        cateid = response.meta['cateid']
        for page in range(1, total_page + 1):
            url = self.cap_category_url % (cateid, page)
            Request(url, self.parse_product)

    def parse_product(self, response):
        '''Print the product links found on one listing page.'''
        product = response.xpath('//*[@id="products"]//h4/a/@href')
        print(u'以下来自哪个页面:%s' % response.url)
        print(u'产品:%s个 -> 路径:%s' % (len(product), product))


if __name__ == '__main__':
    AeroCopider()

最新文章

  1. 第三方GridView中文件下载的方法,以及js显示图片
  2. jsp页面动态显示时间
  3. distributed caching for .net applications
  4. Ubuntu用作Server时出现乱码的解决方法
  5. OLTP与OLAP的介绍
  6. 最新Burpsuite Pro v1.7.03 介绍和破解版下载
  7. 递归删除指定目录下的 .git 文件
  8. setblendstate & setdepthstencilstate
  9. C语言-创建链表及排序
  10. 设计模式------Adapter(适配器)
  11. Android Studio无法关联Api23源码-提示Souces for android api 23 platform not found
  12. 使用Nexus2打造企业maven仓库(三)
  13. hdu1881(贪心+dp)
  14. javaWeb+servlet+mysql实现简单的企业员工管理系统
  15. 《CLR Via C#》读书笔记:26.线程基础
  16. 【原创】使用golang访问windows telnet服务器
  17. HTTP请求时间参数设置
  18. mysql用户权限分配专栏
  19. AIO编程
  20. 第三次Sprint计划

热门文章

  1. spring-cloud-starter-ribbon提供客户端的软件负载均衡算法
  2. 11、Java并发性和多线程-Java内存模型
  3. PayPal加密证书.pem的生成
  4. Python基础--高速改造:字符串
  5. GRANT 授权
  6. VFL演示样例
  7. OFBiz组合模糊查询
  8. Gold Coins
  9. 理解JavaScript中的闭包
  10. Windows(7/8/10)搭建Elasticsearch 6.x版本